diff -Nru vc-0.7.4/.appveyor.yml vc-1.3.0/.appveyor.yml --- vc-0.7.4/.appveyor.yml 1969-12-31 18:00:00.000000000 -0600 +++ vc-1.3.0/.appveyor.yml 2016-10-27 02:05:02.000000000 -0500 @@ -0,0 +1,19 @@ +version: 1.0.{build} +os: Visual Studio 2015 +configuration: Release +platform: + - x64 + #- x86 + +clone_depth: 50 + +environment: + matrix: + - subset: sse + - subset: avx + +build_script: +- cmd: >- + CALL "C:\Program Files (x86)\Microsoft Visual Studio 14.0\VC\vcvarsall.bat" %PLATFORM% + + ctest -VV -S C:\projects\vc\test.cmake diff -Nru vc-0.7.4/avx/casts.h vc-1.3.0/avx/casts.h --- vc-0.7.4/avx/casts.h 2014-05-15 06:38:24.000000000 -0500 +++ vc-1.3.0/avx/casts.h 2016-10-27 02:05:02.000000000 -0500 @@ -1,60 +1,62 @@ -/* This file is part of the Vc library. +/* This file is part of the Vc library. {{{ +Copyright © 2009-2015 Matthias Kretz - Copyright (C) 2009-2012 Matthias Kretz +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the names of contributing organizations nor the + names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY +DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - Vc is free software: you can redistribute it and/or modify - it under the terms of the GNU Lesser General Public License as - published by the Free Software Foundation, either version 3 of - the License, or (at your option) any later version. +}}}*/ - Vc is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with Vc. If not, see . 
- -*/ - -#ifndef AVX_CASTS_H -#define AVX_CASTS_H +#ifndef VC_AVX_CASTS_H_ +#define VC_AVX_CASTS_H_ #include "intrinsics.h" #include "types.h" +#include "../sse/casts.h" +#include "shuffle.h" #include "macros.h" -/*OUTER_NAMESPACE_BEGIN*/ -namespace Vc +namespace Vc_VERSIONED_NAMESPACE { namespace AVX { - template static Vc_INTRINSIC_L T avx_cast(param128 v) Vc_INTRINSIC_R; - template static Vc_INTRINSIC_L T avx_cast(param128i v) Vc_INTRINSIC_R; - template static Vc_INTRINSIC_L T avx_cast(param128d v) Vc_INTRINSIC_R; - template static Vc_INTRINSIC_L T avx_cast(param256 v) Vc_INTRINSIC_R; - template static Vc_INTRINSIC_L T avx_cast(param256i v) Vc_INTRINSIC_R; - template static Vc_INTRINSIC_L T avx_cast(param256d v) Vc_INTRINSIC_R; - -#ifdef VC_UNCONDITIONAL_AVX2_INTRINSICS - template static Vc_INTRINSIC T avx_cast(__m128 v) { return avx_cast(param128 (v)); } - template static Vc_INTRINSIC T avx_cast(__m128i v) { return avx_cast(param128i(v)); } - template static Vc_INTRINSIC T avx_cast(__m128d v) { return avx_cast(param128d(v)); } - template static Vc_INTRINSIC T avx_cast(__m256 v) { return avx_cast(param256 (v)); } - template static Vc_INTRINSIC T avx_cast(__m256i v) { return avx_cast(param256i(v)); } - template static Vc_INTRINSIC T avx_cast(__m256d v) { return avx_cast(param256d(v)); } -#endif +namespace Casts +{ + template Vc_INTRINSIC_L T avx_cast(__m128 v) Vc_INTRINSIC_R; + template Vc_INTRINSIC_L T avx_cast(__m128i v) Vc_INTRINSIC_R; + template Vc_INTRINSIC_L T avx_cast(__m128d v) Vc_INTRINSIC_R; + template Vc_INTRINSIC_L T avx_cast(__m256 v) Vc_INTRINSIC_R; + template Vc_INTRINSIC_L T avx_cast(__m256i v) Vc_INTRINSIC_R; + template Vc_INTRINSIC_L T avx_cast(__m256d v) Vc_INTRINSIC_R; // 128 -> 128 - template<> Vc_INTRINSIC m128 avx_cast(param128 v) { return v; } - template<> Vc_INTRINSIC m128 avx_cast(param128i v) { return _mm_castsi128_ps(v); } - template<> Vc_INTRINSIC m128 avx_cast(param128d v) { return _mm_castpd_ps(v); } - template<> Vc_INTRINSIC m128i avx_cast(param128 v) { return _mm_castps_si128(v); } - template<> Vc_INTRINSIC m128i avx_cast(param128i v) { return v; } - template<> Vc_INTRINSIC m128i avx_cast(param128d v) { return _mm_castpd_si128(v); } - template<> Vc_INTRINSIC m128d avx_cast(param128 v) { return _mm_castps_pd(v); } - template<> Vc_INTRINSIC m128d avx_cast(param128i v) { return _mm_castsi128_pd(v); } - template<> Vc_INTRINSIC m128d avx_cast(param128d v) { return v; } + template<> Vc_INTRINSIC __m128 avx_cast(__m128 v) { return v; } + template<> Vc_INTRINSIC __m128 avx_cast(__m128i v) { return _mm_castsi128_ps(v); } + template<> Vc_INTRINSIC __m128 avx_cast(__m128d v) { return _mm_castpd_ps(v); } + template<> Vc_INTRINSIC __m128i avx_cast(__m128 v) { return _mm_castps_si128(v); } + template<> Vc_INTRINSIC __m128i avx_cast(__m128i v) { return v; } + template<> Vc_INTRINSIC __m128i avx_cast(__m128d v) { return _mm_castpd_si128(v); } + template<> Vc_INTRINSIC __m128d avx_cast(__m128 v) { return _mm_castps_pd(v); } + template<> Vc_INTRINSIC __m128d avx_cast(__m128i v) { return _mm_castsi128_pd(v); } + template<> Vc_INTRINSIC __m128d avx_cast(__m128d v) { return v; } // 128 -> 256 // FIXME: the following casts leave the upper 128bits undefined. With GCC and ICC I've never @@ -64,127 +66,238 @@ // + 256bit load. Thus the upper 128bits are really undefined. But there is no intrinsic to do // what I want (i.e. alias the register, disallowing the move to memory in-between). I'm stuck, // do we really want to rely on specific compiler behavior here? 
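For reference, the behavior this FIXME is concerned with can be reproduced with plain <immintrin.h> intrinsics, independent of the library's own wrappers. The sketch below is illustrative only (the helper names are not part of Vc): _mm256_castps128_ps256 reinterprets a 128-bit register as 256-bit and leaves the upper 128 bits unspecified, while the permute2f128 form, the same pattern the patch uses for its MSVC/clang zeroExtend, guarantees zeros in the upper lane.

    #include <immintrin.h>

    // Reinterpret 128-bit as 256-bit: usually free, but the upper 128 bits
    // of the result are undefined by specification.
    static inline __m256 widen_upper_undefined(__m128 v)
    {
        return _mm256_castps128_ps256(v);
    }

    // Zero-extend 128-bit into 256-bit: control byte 0x80 selects the low
    // lane of the first source for bits [127:0] and zeroes bits [255:128].
    static inline __m256 widen_upper_zeroed(__m128 v)
    {
        const __m256 t = _mm256_castps128_ps256(v);
        return _mm256_permute2f128_ps(t, t, 0x80);
    }

Only the second form is safe when the caller goes on to read the full 256-bit value; the first is the cheaper choice when the upper half is overwritten anyway (for example by a subsequent insertf128, as in the concat helpers below).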
- template<> Vc_INTRINSIC m256 avx_cast(param128 v) { return _mm256_castps128_ps256(v); } - template<> Vc_INTRINSIC m256 avx_cast(param128i v) { return _mm256_castps128_ps256(_mm_castsi128_ps(v)); } - template<> Vc_INTRINSIC m256 avx_cast(param128d v) { return _mm256_castps128_ps256(_mm_castpd_ps(v)); } - template<> Vc_INTRINSIC m256i avx_cast(param128 v) { return _mm256_castsi128_si256(_mm_castps_si128(v)); } - template<> Vc_INTRINSIC m256i avx_cast(param128i v) { return _mm256_castsi128_si256(v); } - template<> Vc_INTRINSIC m256i avx_cast(param128d v) { return _mm256_castsi128_si256(_mm_castpd_si128(v)); } - template<> Vc_INTRINSIC m256d avx_cast(param128 v) { return _mm256_castpd128_pd256(_mm_castps_pd(v)); } - template<> Vc_INTRINSIC m256d avx_cast(param128i v) { return _mm256_castpd128_pd256(_mm_castsi128_pd(v)); } - template<> Vc_INTRINSIC m256d avx_cast(param128d v) { return _mm256_castpd128_pd256(v); } - -#ifdef VC_MSVC - static Vc_INTRINSIC Vc_CONST m256 zeroExtend(param128 v) { return _mm256_permute2f128_ps (_mm256_castps128_ps256(v), _mm256_castps128_ps256(v), 0x80); } - static Vc_INTRINSIC Vc_CONST m256i zeroExtend(param128i v) { return _mm256_permute2f128_si256(_mm256_castsi128_si256(v), _mm256_castsi128_si256(v), 0x80); } - static Vc_INTRINSIC Vc_CONST m256d zeroExtend(param128d v) { return _mm256_permute2f128_pd (_mm256_castpd128_pd256(v), _mm256_castpd128_pd256(v), 0x80); } + template<> Vc_INTRINSIC __m256 avx_cast(__m128 v) { return _mm256_castps128_ps256(v); } + template<> Vc_INTRINSIC __m256 avx_cast(__m128i v) { return _mm256_castps128_ps256(_mm_castsi128_ps(v)); } + template<> Vc_INTRINSIC __m256 avx_cast(__m128d v) { return _mm256_castps128_ps256(_mm_castpd_ps(v)); } + template<> Vc_INTRINSIC __m256i avx_cast(__m128 v) { return _mm256_castsi128_si256(_mm_castps_si128(v)); } + template<> Vc_INTRINSIC __m256i avx_cast(__m128i v) { return _mm256_castsi128_si256(v); } + template<> Vc_INTRINSIC __m256i avx_cast(__m128d v) { return _mm256_castsi128_si256(_mm_castpd_si128(v)); } + template<> Vc_INTRINSIC __m256d avx_cast(__m128 v) { return _mm256_castpd128_pd256(_mm_castps_pd(v)); } + template<> Vc_INTRINSIC __m256d avx_cast(__m128i v) { return _mm256_castpd128_pd256(_mm_castsi128_pd(v)); } + template<> Vc_INTRINSIC __m256d avx_cast(__m128d v) { return _mm256_castpd128_pd256(v); } + +#if defined Vc_MSVC || defined Vc_CLANG || defined Vc_APPLECLANG + static Vc_INTRINSIC Vc_CONST __m256 zeroExtend(__m128 v) { return _mm256_permute2f128_ps (_mm256_castps128_ps256(v), _mm256_castps128_ps256(v), 0x80); } + static Vc_INTRINSIC Vc_CONST __m256i zeroExtend(__m128i v) { return _mm256_permute2f128_si256(_mm256_castsi128_si256(v), _mm256_castsi128_si256(v), 0x80); } + static Vc_INTRINSIC Vc_CONST __m256d zeroExtend(__m128d v) { return _mm256_permute2f128_pd (_mm256_castpd128_pd256(v), _mm256_castpd128_pd256(v), 0x80); } #else - static Vc_INTRINSIC Vc_CONST m256 zeroExtend(param128 v) { return _mm256_castps128_ps256(v); } - static Vc_INTRINSIC Vc_CONST m256i zeroExtend(param128i v) { return _mm256_castsi128_si256(v); } - static Vc_INTRINSIC Vc_CONST m256d zeroExtend(param128d v) { return _mm256_castpd128_pd256(v); } -#ifdef VC_ICC - static Vc_INTRINSIC Vc_CONST m256 zeroExtend(__m128 v) { return _mm256_castps128_ps256(v); } - static Vc_INTRINSIC Vc_CONST m256i zeroExtend(__m128i v) { return _mm256_castsi128_si256(v); } - static Vc_INTRINSIC Vc_CONST m256d zeroExtend(__m128d v) { return _mm256_castpd128_pd256(v); } -#endif + static Vc_INTRINSIC Vc_CONST __m256 zeroExtend(__m128 v) { 
return _mm256_castps128_ps256(v); } + static Vc_INTRINSIC Vc_CONST __m256i zeroExtend(__m128i v) { return _mm256_castsi128_si256(v); } + static Vc_INTRINSIC Vc_CONST __m256d zeroExtend(__m128d v) { return _mm256_castpd128_pd256(v); } #endif // 256 -> 128 - template<> Vc_INTRINSIC m128 avx_cast(param256 v) { return _mm256_castps256_ps128(v); } - template<> Vc_INTRINSIC m128 avx_cast(param256i v) { return _mm256_castps256_ps128(_mm256_castsi256_ps(v)); } - template<> Vc_INTRINSIC m128 avx_cast(param256d v) { return _mm256_castps256_ps128(_mm256_castpd_ps(v)); } - template<> Vc_INTRINSIC m128i avx_cast(param256 v) { return _mm256_castsi256_si128(_mm256_castps_si256(v)); } - template<> Vc_INTRINSIC m128i avx_cast(param256i v) { return _mm256_castsi256_si128(v); } - template<> Vc_INTRINSIC m128i avx_cast(param256d v) { return _mm256_castsi256_si128(_mm256_castpd_si256(v)); } - template<> Vc_INTRINSIC m128d avx_cast(param256 v) { return _mm256_castpd256_pd128(_mm256_castps_pd(v)); } - template<> Vc_INTRINSIC m128d avx_cast(param256i v) { return _mm256_castpd256_pd128(_mm256_castsi256_pd(v)); } - template<> Vc_INTRINSIC m128d avx_cast(param256d v) { return _mm256_castpd256_pd128(v); } + template<> Vc_INTRINSIC __m128 avx_cast(__m256 v) { return _mm256_castps256_ps128(v); } + template<> Vc_INTRINSIC __m128 avx_cast(__m256i v) { return _mm256_castps256_ps128(_mm256_castsi256_ps(v)); } + template<> Vc_INTRINSIC __m128 avx_cast(__m256d v) { return _mm256_castps256_ps128(_mm256_castpd_ps(v)); } + template<> Vc_INTRINSIC __m128i avx_cast(__m256 v) { return _mm256_castsi256_si128(_mm256_castps_si256(v)); } + template<> Vc_INTRINSIC __m128i avx_cast(__m256i v) { return _mm256_castsi256_si128(v); } + template<> Vc_INTRINSIC __m128i avx_cast(__m256d v) { return _mm256_castsi256_si128(_mm256_castpd_si256(v)); } + template<> Vc_INTRINSIC __m128d avx_cast(__m256 v) { return _mm256_castpd256_pd128(_mm256_castps_pd(v)); } + template<> Vc_INTRINSIC __m128d avx_cast(__m256i v) { return _mm256_castpd256_pd128(_mm256_castsi256_pd(v)); } + template<> Vc_INTRINSIC __m128d avx_cast(__m256d v) { return _mm256_castpd256_pd128(v); } // 256 -> 256 - template<> Vc_INTRINSIC m256 avx_cast(param256 v) { return v; } - template<> Vc_INTRINSIC m256 avx_cast(param256i v) { return _mm256_castsi256_ps(v); } - template<> Vc_INTRINSIC m256 avx_cast(param256d v) { return _mm256_castpd_ps(v); } - template<> Vc_INTRINSIC m256i avx_cast(param256 v) { return _mm256_castps_si256(v); } - template<> Vc_INTRINSIC m256i avx_cast(param256i v) { return v; } - template<> Vc_INTRINSIC m256i avx_cast(param256d v) { return _mm256_castpd_si256(v); } - template<> Vc_INTRINSIC m256d avx_cast(param256 v) { return _mm256_castps_pd(v); } - template<> Vc_INTRINSIC m256d avx_cast(param256i v) { return _mm256_castsi256_pd(v); } - template<> Vc_INTRINSIC m256d avx_cast(param256d v) { return v; } + template<> Vc_INTRINSIC __m256 avx_cast(__m256 v) { return v; } + template<> Vc_INTRINSIC __m256 avx_cast(__m256i v) { return _mm256_castsi256_ps(v); } + template<> Vc_INTRINSIC __m256 avx_cast(__m256d v) { return _mm256_castpd_ps(v); } + template<> Vc_INTRINSIC __m256i avx_cast(__m256 v) { return _mm256_castps_si256(v); } + template<> Vc_INTRINSIC __m256i avx_cast(__m256i v) { return v; } + template<> Vc_INTRINSIC __m256i avx_cast(__m256d v) { return _mm256_castpd_si256(v); } + template<> Vc_INTRINSIC __m256d avx_cast(__m256 v) { return _mm256_castps_pd(v); } + template<> Vc_INTRINSIC __m256d avx_cast(__m256i v) { return _mm256_castsi256_pd(v); } + template<> 
Vc_INTRINSIC __m256d avx_cast(__m256d v) { return v; } // simplify splitting 256-bit registers in 128-bit registers - Vc_INTRINSIC Vc_CONST m128 lo128(param256 v) { return avx_cast(v); } - Vc_INTRINSIC Vc_CONST m128d lo128(param256d v) { return avx_cast(v); } - Vc_INTRINSIC Vc_CONST m128i lo128(param256i v) { return avx_cast(v); } - Vc_INTRINSIC Vc_CONST m128 hi128(param256 v) { return _mm256_extractf128_ps(v, 1); } - Vc_INTRINSIC Vc_CONST m128d hi128(param256d v) { return _mm256_extractf128_pd(v, 1); } - Vc_INTRINSIC Vc_CONST m128i hi128(param256i v) { return _mm256_extractf128_si256(v, 1); } + Vc_INTRINSIC Vc_CONST __m128 lo128(__m256 v) { return avx_cast<__m128>(v); } + Vc_INTRINSIC Vc_CONST __m128d lo128(__m256d v) { return avx_cast<__m128d>(v); } + Vc_INTRINSIC Vc_CONST __m128i lo128(__m256i v) { return avx_cast<__m128i>(v); } + Vc_INTRINSIC Vc_CONST __m128 hi128(__m256 v) { return extract128<1>(v); } + Vc_INTRINSIC Vc_CONST __m128d hi128(__m256d v) { return extract128<1>(v); } + Vc_INTRINSIC Vc_CONST __m128i hi128(__m256i v) { return extract128<1>(v); } // simplify combining 128-bit registers in 256-bit registers - Vc_INTRINSIC Vc_CONST m256 concat(param128 a, param128 b) { return _mm256_insertf128_ps (avx_cast(a), b, 1); } - Vc_INTRINSIC Vc_CONST m256d concat(param128d a, param128d b) { return _mm256_insertf128_pd (avx_cast(a), b, 1); } - Vc_INTRINSIC Vc_CONST m256i concat(param128i a, param128i b) { return _mm256_insertf128_si256(avx_cast(a), b, 1); } -#ifdef VC_UNCONDITIONAL_AVX2_INTRINSICS - Vc_INTRINSIC Vc_CONST m256 concat(__m128 a, param128 b) { return _mm256_insertf128_ps (avx_cast(a), b, 1); } - Vc_INTRINSIC Vc_CONST m256d concat(__m128d a, param128d b) { return _mm256_insertf128_pd (avx_cast(a), b, 1); } - Vc_INTRINSIC Vc_CONST m256i concat(__m128i a, param128i b) { return _mm256_insertf128_si256(avx_cast(a), b, 1); } - Vc_INTRINSIC Vc_CONST m256 concat(param128 a, __m128 b) { return _mm256_insertf128_ps (avx_cast(a), b, 1); } - Vc_INTRINSIC Vc_CONST m256d concat(param128d a, __m128d b) { return _mm256_insertf128_pd (avx_cast(a), b, 1); } - Vc_INTRINSIC Vc_CONST m256i concat(param128i a, __m128i b) { return _mm256_insertf128_si256(avx_cast(a), b, 1); } - Vc_INTRINSIC Vc_CONST m256 concat(__m128 a, __m128 b) { return _mm256_insertf128_ps (avx_cast(a), b, 1); } - Vc_INTRINSIC Vc_CONST m256d concat(__m128d a, __m128d b) { return _mm256_insertf128_pd (avx_cast(a), b, 1); } - Vc_INTRINSIC Vc_CONST m256i concat(__m128i a, __m128i b) { return _mm256_insertf128_si256(avx_cast(a), b, 1); } + Vc_INTRINSIC Vc_CONST __m256 concat(__m128 a, __m128 b) { return insert128<1>(avx_cast<__m256 >(a), b); } + Vc_INTRINSIC Vc_CONST __m256d concat(__m128d a, __m128d b) { return insert128<1>(avx_cast<__m256d>(a), b); } + Vc_INTRINSIC Vc_CONST __m256i concat(__m128i a, __m128i b) { return insert128<1>(avx_cast<__m256i>(a), b); } + +} // namespace Casts +using namespace Casts; +} // namespace AVX + +namespace AVX2 +{ +using namespace AVX::Casts; +} // namespace AVX2 + +namespace AVX +{ +template struct ConvertTag {}; + +Vc_INTRINSIC __m256i convert(__m256 v, ConvertTag) { return _mm256_cvttps_epi32(v); } +Vc_INTRINSIC __m128i convert(__m256d v, ConvertTag) { return _mm256_cvttpd_epi32(v); } +Vc_INTRINSIC __m256i convert(__m256i v, ConvertTag) { return v; } +Vc_INTRINSIC __m256i convert(__m256i v, ConvertTag) { return v; } +Vc_INTRINSIC __m256i convert(__m128i v, ConvertTag) { +#ifdef Vc_IMPL_AVX2 + return _mm256_cvtepi16_epi32(v); +#else + return AVX::srai_epi32<16>( + concat(_mm_unpacklo_epi16(v, 
v), _mm_unpackhi_epi16(v, v))); +#endif +} +Vc_INTRINSIC __m256i convert(__m128i v, ConvertTag) { +#ifdef Vc_IMPL_AVX2 + return _mm256_cvtepu16_epi32(v); +#else + return AVX::srli_epi32<16>( + concat(_mm_unpacklo_epi16(v, v), _mm_unpackhi_epi16(v, v))); #endif +} - template struct StaticCastHelper {}; - template<> struct StaticCastHelper { static Vc_ALWAYS_INLINE Vc_CONST m256i cast(param256 v) { return _mm256_cvttps_epi32(v); } }; - template<> struct StaticCastHelper { static Vc_ALWAYS_INLINE Vc_CONST m256i cast(param256d v) { return avx_cast(_mm256_cvttpd_epi32(v)); } }; - template<> struct StaticCastHelper { static Vc_ALWAYS_INLINE Vc_CONST m256i cast(param256i v) { return v; } }; - template<> struct StaticCastHelper { static Vc_ALWAYS_INLINE Vc_CONST m256i cast(param256i v) { return v; } }; - template<> struct StaticCastHelper { static Vc_ALWAYS_INLINE Vc_CONST m256i cast(param128i v) { return concat(_mm_srai_epi32(_mm_unpacklo_epi16(v, v), 16), _mm_srai_epi32(_mm_unpackhi_epi16(v, v), 16)); } }; - template<> struct StaticCastHelper { static inline Vc_CONST m256i cast(param256 v) { - return _mm256_castps_si256(_mm256_blendv_ps( - _mm256_castsi256_ps(_mm256_cvttps_epi32(v)), - _mm256_castsi256_ps(_mm256_add_epi32(m256i(_mm256_cvttps_epi32(_mm256_sub_ps(v, _mm256_set2power31_ps()))), _mm256_set2power31_epu32())), - _mm256_cmpge_ps(v, _mm256_set2power31_ps()) - )); - - } }; - template<> struct StaticCastHelper { static Vc_ALWAYS_INLINE Vc_CONST m256i cast(param256d v) { return avx_cast(_mm256_cvttpd_epi32(v)); } }; - template<> struct StaticCastHelper { static Vc_ALWAYS_INLINE Vc_CONST m256i cast(param256i v) { return v; } }; - template<> struct StaticCastHelper { static Vc_ALWAYS_INLINE Vc_CONST m256i cast(param256i v) { return v; } }; - template<> struct StaticCastHelper { static Vc_ALWAYS_INLINE Vc_CONST m256i cast(param128i v) { return concat(_mm_srli_epi32(_mm_unpacklo_epi16(v, v), 16), _mm_srli_epi32(_mm_unpackhi_epi16(v, v), 16)); } }; - template<> struct StaticCastHelper { static Vc_ALWAYS_INLINE Vc_CONST m256 cast(param256 v) { return v; } }; - template<> struct StaticCastHelper { static Vc_ALWAYS_INLINE Vc_CONST m256 cast(param256d v) { return avx_cast(_mm256_cvtpd_ps(v)); } }; - template<> struct StaticCastHelper { static Vc_ALWAYS_INLINE Vc_CONST m256 cast(param256i v) { return _mm256_cvtepi32_ps(v); } }; - template<> struct StaticCastHelper { static inline Vc_CONST m256 cast(param256i v) { - return _mm256_blendv_ps( - _mm256_cvtepi32_ps(v), - _mm256_add_ps(_mm256_cvtepi32_ps(_mm256_sub_epi32(v, _mm256_set2power31_epu32())), _mm256_set2power31_ps()), - _mm256_castsi256_ps(_mm256_cmplt_epi32(v, _mm256_setzero_si256())) - ); - } }; - template<> struct StaticCastHelper { static Vc_ALWAYS_INLINE Vc_CONST m256 cast(param128i v) { return _mm256_cvtepi32_ps(StaticCastHelper::cast(v)); } }; - template<> struct StaticCastHelper { static Vc_ALWAYS_INLINE Vc_CONST m256 cast(param128i v) { return _mm256_cvtepi32_ps(StaticCastHelper::cast(v)); } }; - template<> struct StaticCastHelper { static Vc_ALWAYS_INLINE Vc_CONST m256d cast(param256 v) { return _mm256_cvtps_pd(avx_cast(v)); } }; - template<> struct StaticCastHelper { static Vc_ALWAYS_INLINE Vc_CONST m256d cast(param256d v) { return v; } }; - template<> struct StaticCastHelper { static Vc_ALWAYS_INLINE Vc_CONST m256d cast(param256i v) { return _mm256_cvtepi32_pd(avx_cast(v)); } }; - template<> struct StaticCastHelper { static Vc_ALWAYS_INLINE Vc_CONST m256d cast(param256i v) { return _mm256_cvtepi32_pd(avx_cast(v)); } }; - template<> 
struct StaticCastHelper { static Vc_ALWAYS_INLINE Vc_CONST m128i cast(param256i v) { return _mm_packs_epi32(lo128(v), hi128(v)); } }; - template<> struct StaticCastHelper { static Vc_ALWAYS_INLINE Vc_CONST m128i cast(param256 v) { return StaticCastHelper::cast(StaticCastHelper::cast(v)); } }; - template<> struct StaticCastHelper { static Vc_ALWAYS_INLINE Vc_CONST m128i cast(param128i v) { return v; } }; - template<> struct StaticCastHelper { static Vc_ALWAYS_INLINE Vc_CONST m128i cast(param128i v) { return v; } }; - template<> struct StaticCastHelper { static Vc_ALWAYS_INLINE Vc_CONST m128i cast(param256i v) { return _mm_packus_epi32(lo128(v), hi128(v)); } }; - template<> struct StaticCastHelper { static Vc_ALWAYS_INLINE Vc_CONST m128i cast(param256 v) { return StaticCastHelper::cast(StaticCastHelper::cast(v)); } }; - template<> struct StaticCastHelper { static Vc_ALWAYS_INLINE Vc_CONST m128i cast(param128i v) { return v; } }; - template<> struct StaticCastHelper { static Vc_ALWAYS_INLINE Vc_CONST m128i cast(param128i v) { return v; } }; - template<> struct StaticCastHelper { static Vc_ALWAYS_INLINE Vc_CONST m128i cast(param256 v) { return StaticCastHelper::cast(StaticCastHelper::cast(v)); } }; - template<> struct StaticCastHelper { static Vc_ALWAYS_INLINE Vc_CONST m128i cast(param256 v) { return StaticCastHelper::cast(StaticCastHelper::cast(v)); } }; - template<> struct StaticCastHelper { static Vc_ALWAYS_INLINE Vc_CONST m256 cast(param128i v) { return _mm256_cvtepi32_ps(StaticCastHelper::cast(v)); } }; - template<> struct StaticCastHelper { static Vc_ALWAYS_INLINE Vc_CONST m256 cast(param128i v) { return _mm256_cvtepi32_ps(StaticCastHelper::cast(v)); } }; -} // namespace AVX -} // namespace Vc -/*OUTER_NAMESPACE_END*/ +Vc_INTRINSIC __m256i convert(__m256 v, ConvertTag) { + using namespace AVX; + return _mm256_castps_si256(_mm256_blendv_ps( + _mm256_castsi256_ps(_mm256_cvttps_epi32(v)), + _mm256_castsi256_ps(add_epi32(_mm256_cvttps_epi32(_mm256_sub_ps(v, set2power31_ps())), + set2power31_epu32())), + cmpge_ps(v, set2power31_ps()))); +} +Vc_INTRINSIC __m128i convert(__m256d v, ConvertTag) { + using namespace AVX; + return _mm_xor_si128( + _mm256_cvttpd_epi32(_mm256_sub_pd(_mm256_floor_pd(v), set1_pd(0x80000000u))), + _mm_set2power31_epu32()); +} +Vc_INTRINSIC __m256i convert(__m256i v, ConvertTag) { return v; } +Vc_INTRINSIC __m256i convert(__m256i v, ConvertTag) { return v; } +Vc_INTRINSIC __m256i convert(__m128i v, ConvertTag) { +#ifdef Vc_IMPL_AVX2 + return _mm256_cvtepi16_epi32(v); +#else + return AVX::srai_epi32<16>( + concat(_mm_unpacklo_epi16(v, v), _mm_unpackhi_epi16(v, v))); +#endif +} +Vc_INTRINSIC __m256i convert(__m128i v, ConvertTag) { +#ifdef Vc_IMPL_AVX2 + return _mm256_cvtepu16_epi32(v); +#else + return AVX::srli_epi32<16>( + concat(_mm_unpacklo_epi16(v, v), _mm_unpackhi_epi16(v, v))); +#endif +} -#include "undomacros.h" +Vc_INTRINSIC __m256 convert(__m256 v, ConvertTag) { return v; } +Vc_INTRINSIC __m128 convert(__m256d v, ConvertTag) { return _mm256_cvtpd_ps(v); } +Vc_INTRINSIC __m256 convert(__m256i v, ConvertTag) { return _mm256_cvtepi32_ps(v); } +Vc_INTRINSIC __m256 convert(__m256i v, ConvertTag) { + // this is complicated because cvtepi32_ps only supports signed input. Thus, all + // input values with the MSB set would produce a negative result. We can reuse the + // cvtepi32_ps instruction if we unset the MSB. But then the rounding results can be + // different. 
Since float uses 24 bits for the mantissa (effectively), the 9-bit LSB + // determines the rounding direction. (Consider the bits ...8'7654'3210. The bits [0:7] + // need to be dropped and if > 0x80 round up, if < 0x80 round down. If [0:7] == 0x80 + // then the rounding direction is determined by bit [8] for round to even. That's why + // the 9th bit is relevant for the rounding decision.) + // If the MSB of the input is set to 0, the cvtepi32_ps instruction makes its rounding + // decision on the lowest 8 bits instead. A second rounding decision is made when + // float(0x8000'0000) is added. This will rarely fix the rounding issue. + // + // Here's what the standard rounding mode expects: + // 0xc0000080 should cvt to 0xc0000000 + // 0xc0000081 should cvt to 0xc0000100 + // -- should cvt to 0xc0000100 + // 0xc000017f should cvt to 0xc0000100 + // 0xc0000180 should cvt to 0xc0000200 + // + // However: using float(input ^ 0x8000'0000) + float(0x8000'0000) we get: + // 0xc0000081 would cvt to 0xc0000000 + // 0xc00000c0 would cvt to 0xc0000000 + // 0xc00000c1 would cvt to 0xc0000100 + // 0xc000013f would cvt to 0xc0000100 + // 0xc0000140 would cvt to 0xc0000200 + // + // Solution: float(input & 0x7fff'fe00) + (float(0x8000'0000) + float(input & 0x1ff)) + // This ensures the rounding decision is made on the 9-bit LSB when 0x8000'0000 is + // added to the float value of the low 8 bits of the input. + using namespace AVX; + return _mm256_blendv_ps( + _mm256_cvtepi32_ps(v), + _mm256_add_ps(_mm256_cvtepi32_ps(and_si256(v, set1_epi32(0x7ffffe00))), + _mm256_add_ps(set2power31_ps(), _mm256_cvtepi32_ps(and_si256( + v, set1_epi32(0x000001ff))))), + _mm256_castsi256_ps(cmplt_epi32(v, _mm256_setzero_si256()))); +} +Vc_INTRINSIC __m256 convert(__m128i v, ConvertTag) { return _mm256_cvtepi32_ps(convert(v, ConvertTag< short, int>())); } +Vc_INTRINSIC __m256 convert(__m128i v, ConvertTag) { return _mm256_cvtepi32_ps(convert(v, ConvertTag())); } + +Vc_INTRINSIC __m256d convert(__m128 v, ConvertTag) { return _mm256_cvtps_pd(v); } +Vc_INTRINSIC __m256d convert(__m256d v, ConvertTag) { return v; } +Vc_INTRINSIC __m256d convert(__m128i v, ConvertTag) { return _mm256_cvtepi32_pd(v); } +Vc_INTRINSIC __m256d convert(__m128i v, ConvertTag) { + using namespace AVX; + return _mm256_add_pd( + _mm256_cvtepi32_pd(_mm_xor_si128(v, _mm_setmin_epi32())), + set1_pd(1u << 31)); } +Vc_INTRINSIC __m256d convert(__m128i v, ConvertTag) { return convert(convert(v, SSE::ConvertTag< short, int>()), ConvertTag()); } +Vc_INTRINSIC __m256d convert(__m128i v, ConvertTag) { return convert(convert(v, SSE::ConvertTag()), ConvertTag()); } + +Vc_INTRINSIC __m128i convert(__m256i v, ConvertTag) { + const auto tmp0 = _mm_unpacklo_epi16(lo128(v), hi128(v)); + const auto tmp1 = _mm_unpackhi_epi16(lo128(v), hi128(v)); + const auto tmp2 = _mm_unpacklo_epi16(tmp0, tmp1); + const auto tmp3 = _mm_unpackhi_epi16(tmp0, tmp1); + return _mm_unpacklo_epi16(tmp2, tmp3); +} +Vc_INTRINSIC __m128i convert(__m256i v, ConvertTag) { + const auto tmp0 = _mm_unpacklo_epi16(lo128(v), hi128(v)); + const auto tmp1 = _mm_unpackhi_epi16(lo128(v), hi128(v)); + const auto tmp2 = _mm_unpacklo_epi16(tmp0, tmp1); + const auto tmp3 = _mm_unpackhi_epi16(tmp0, tmp1); + return _mm_unpacklo_epi16(tmp2, tmp3); +} +Vc_INTRINSIC __m128i convert(__m256 v, ConvertTag) { return convert(convert(v, ConvertTag()), ConvertTag()); } +Vc_INTRINSIC __m128i convert(__m256d v, ConvertTag) { return convert(convert(v, ConvertTag()), SSE::ConvertTag()); } +Vc_INTRINSIC __m256i convert(__m256i v, 
ConvertTag) { return v; } +Vc_INTRINSIC __m256i convert(__m256i v, ConvertTag) { return v; } + +Vc_INTRINSIC __m128i convert(__m256i v, ConvertTag) { + auto tmp0 = _mm_unpacklo_epi16(lo128(v), hi128(v)); + auto tmp1 = _mm_unpackhi_epi16(lo128(v), hi128(v)); + auto tmp2 = _mm_unpacklo_epi16(tmp0, tmp1); + auto tmp3 = _mm_unpackhi_epi16(tmp0, tmp1); + return _mm_unpacklo_epi16(tmp2, tmp3); +} +Vc_INTRINSIC __m128i convert(__m256i v, ConvertTag) { + auto tmp0 = _mm_unpacklo_epi16(lo128(v), hi128(v)); + auto tmp1 = _mm_unpackhi_epi16(lo128(v), hi128(v)); + auto tmp2 = _mm_unpacklo_epi16(tmp0, tmp1); + auto tmp3 = _mm_unpackhi_epi16(tmp0, tmp1); + return _mm_unpacklo_epi16(tmp2, tmp3); +} +Vc_INTRINSIC __m128i convert(__m256 v, ConvertTag) { return convert(convert(v, ConvertTag()), ConvertTag()); } +Vc_INTRINSIC __m128i convert(__m256d v, ConvertTag) { return convert(convert(v, ConvertTag()), SSE::ConvertTag()); } +Vc_INTRINSIC __m256i convert(__m256i v, ConvertTag) { return v; } +Vc_INTRINSIC __m256i convert(__m256i v, ConvertTag) { return v; } + +template +Vc_INTRINSIC auto convert( + typename std::conditional<(sizeof(From) < sizeof(To)), + typename SSE::VectorTraits::VectorType, + typename AVX::VectorTypeHelper::Type>::type v) + -> decltype(convert(v, ConvertTag())) +{ + return convert(v, ConvertTag()); +} + +template > +Vc_INTRINSIC auto convert(typename AVX::VectorTypeHelper::Type v) + -> decltype(convert(lo128(v), ConvertTag())) +{ + return convert(lo128(v), ConvertTag()); +} +} // namespace AVX +} // namespace Vc -#endif // AVX_CASTS_H +#endif // VC_AVX_CASTS_H_ diff -Nru vc-0.7.4/avx/const_data.h vc-1.3.0/avx/const_data.h --- vc-0.7.4/avx/const_data.h 2014-05-15 06:38:24.000000000 -0500 +++ vc-1.3.0/avx/const_data.h 2016-10-27 02:05:02.000000000 -0500 @@ -1,37 +1,46 @@ /* This file is part of the Vc library. {{{ +Copyright © 2012-2015 Matthias Kretz - Copyright (C) 2012 Matthias Kretz - - Vc is free software: you can redistribute it and/or modify - it under the terms of the GNU Lesser General Public License as - published by the Free Software Foundation, either version 3 of - the License, or (at your option) any later version. - - Vc is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with Vc. If not, see . +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the names of contributing organizations nor the + names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY +DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. }}}*/ -#ifndef VC_AVX_CONST_DATA_H -#define VC_AVX_CONST_DATA_H +#ifndef VC_AVX_CONST_DATA_H_ +#define VC_AVX_CONST_DATA_H_ +#include "../common/data.h" #include "macros.h" -/*OUTER_NAMESPACE_BEGIN*/ -namespace Vc + +namespace Vc_VERSIONED_NAMESPACE { namespace AVX { -ALIGN(64) extern const unsigned int _IndexesFromZero32[8]; -ALIGN(16) extern const unsigned short _IndexesFromZero16[8]; -ALIGN(16) extern const unsigned char _IndexesFromZero8[16]; +alignas(64) extern const unsigned int _IndexesFromZero32[ 8]; +alignas(16) extern const unsigned short _IndexesFromZero16[16]; +alignas(16) extern const unsigned char _IndexesFromZero8 [32]; -struct STRUCT_ALIGN1(64) c_general +struct alignas(64) c_general { static const float oneFloat; static const unsigned int absMaskFloat[2]; @@ -43,18 +52,18 @@ static const double oneDouble; static const unsigned long long frexpMask; static const unsigned long long highMaskDouble; -} STRUCT_ALIGN2(64); +}; template struct c_trig { - ALIGN(64) static const T data[]; + alignas(64) static const T data[]; }; template struct c_log { typedef float floatAlias Vc_MAY_ALIAS; static Vc_ALWAYS_INLINE float d(int i) { return *reinterpret_cast(&data[i]); } - ALIGN(64) static const unsigned int data[]; + alignas(64) static const unsigned int data[]; }; template<> struct c_log @@ -62,13 +71,23 @@ enum VectorSize { Size = 16 / sizeof(double) }; typedef double doubleAlias Vc_MAY_ALIAS; static Vc_ALWAYS_INLINE double d(int i) { return *reinterpret_cast(&data[i]); } - ALIGN(64) static const unsigned long long data[]; + alignas(64) static const unsigned long long data[]; }; -} // namespace AVX -} // namespace Vc -/*OUTER_NAMESPACE_END*/ +} // namespace AVX +} // namespace Vc -#include "undomacros.h" +namespace Vc_VERSIONED_NAMESPACE +{ +namespace AVX2 +{ + using AVX::_IndexesFromZero8; + using AVX::_IndexesFromZero16; + using AVX::_IndexesFromZero32; + using AVX::c_general; + using AVX::c_trig; + using AVX::c_log; +} // namespace AVX2 +} // namespace Vc -#endif // VC_AVX_CONST_DATA_H +#endif // VC_AVX_CONST_DATA_H_ diff -Nru vc-0.7.4/avx/const.h vc-1.3.0/avx/const.h --- vc-0.7.4/avx/const.h 2014-05-15 06:38:24.000000000 -0500 +++ vc-1.3.0/avx/const.h 2016-10-27 02:05:02.000000000 -0500 @@ -1,36 +1,42 @@ -/* This file is part of the Vc library. +/* This file is part of the Vc library. {{{ +Copyright © 2009-2015 Matthias Kretz - Copyright (C) 2009-2012 Matthias Kretz +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. 
+ * Neither the names of contributing organizations nor the + names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY +DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - Vc is free software: you can redistribute it and/or modify - it under the terms of the GNU Lesser General Public License as - published by the Free Software Foundation, either version 3 of - the License, or (at your option) any later version. +}}}*/ - Vc is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with Vc. If not, see . - -*/ - -#ifndef VC_AVX_CONST_H -#define VC_AVX_CONST_H +#ifndef VC_AVX_CONST_H_ +#define VC_AVX_CONST_H_ #include +#include "types.h" #include "const_data.h" #include "macros.h" -/*OUTER_NAMESPACE_BEGIN*/ -namespace Vc +namespace Vc_VERSIONED_NAMESPACE { namespace AVX { - template class Vector; - template struct IndexesFromZeroData; template<> struct IndexesFromZeroData { static Vc_ALWAYS_INLINE Vc_CONST const int *address() { return reinterpret_cast(&_IndexesFromZero32[0]); } @@ -102,11 +108,13 @@ template<> Vc_ALWAYS_INLINE Vc_CONST Vector Const::highMask() { return _mm256_broadcast_ss(reinterpret_cast(&c_general::highMaskFloat)); } template<> Vc_ALWAYS_INLINE Vc_CONST Vector Const::highMask() { return _mm256_broadcast_sd(reinterpret_cast(&c_general::highMaskDouble)); } - template<> Vc_ALWAYS_INLINE Vc_CONST Vector Const::highMask() { return _mm256_broadcast_ss(reinterpret_cast(&c_general::highMaskFloat)); } -} // namespace AVX -} // namespace Vc -/*OUTER_NAMESPACE_END*/ +} // namespace AVX -#include "undomacros.h" +namespace AVX2 +{ +using AVX::IndexesFromZeroData; +using AVX::Const; +} // namespace AVX2 +} // namespace Vc -#endif // VC_AVX_CONST_H +#endif // VC_AVX_CONST_H_ diff -Nru vc-0.7.4/avx/debug.h vc-1.3.0/avx/debug.h --- vc-0.7.4/avx/debug.h 2014-05-15 06:38:24.000000000 -0500 +++ vc-1.3.0/avx/debug.h 2016-10-27 02:05:02.000000000 -0500 @@ -1,36 +1,47 @@ -/* This file is part of the Vc library. +/* This file is part of the Vc library. {{{ +Copyright © 2011-2015 Matthias Kretz - Copyright (C) 2011-2012 Matthias Kretz +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. 
+ * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the names of contributing organizations nor the + names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY +DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - Vc is free software: you can redistribute it and/or modify - it under the terms of the GNU Lesser General Public License as - published by the Free Software Foundation, either version 3 of - the License, or (at your option) any later version. +}}}*/ - Vc is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with Vc. If not, see . - -*/ - -#ifndef VC_AVX_DEBUG_H -#define VC_AVX_DEBUG_H +#ifndef VC_AVX_DEBUG_H_ +#define VC_AVX_DEBUG_H_ #ifndef NDEBUG -#include "vectorbase.h" +#include "vector.h" #include #include #endif -/*OUTER_NAMESPACE_BEGIN*/ -namespace Vc +namespace Vc_VERSIONED_NAMESPACE { namespace AVX { +template struct AddType { + const U &d; +}; +template AddType addType(const U &x) { return {x}; } #ifdef NDEBUG class DebugStream @@ -61,6 +72,11 @@ template DebugStream &operator<<(const T &x) { std::cerr << x; return *this; } + template DebugStream &operator<<(AddType &&x) + { + printVector(x.d); + return *this; + } DebugStream &operator<<(__m128 x) { printVector(x); return *this; @@ -93,10 +109,16 @@ }; #endif -#define VC_DEBUG ::Vc::AVX::DebugStream(__PRETTY_FUNCTION__, __FILE__, __LINE__) +#ifdef Vc_DEBUG +#undef Vc_DEBUG +#endif +#ifdef Vc_MSVC +#define Vc_DEBUG Vc::AVX::DebugStream(__FUNCSIG__, __FILE__, __LINE__) +#else +#define Vc_DEBUG Vc::AVX::DebugStream(__PRETTY_FUNCTION__, __FILE__, __LINE__) +#endif -} // namespace AVX -} // namespace Vc -/*OUTER_NAMESPACE_END*/ +} // namespace AVX +} // namespace Vc -#endif // VC_AVX_DEBUG_H +#endif // VC_AVX_DEBUG_H_ diff -Nru vc-0.7.4/avx/deinterleave.tcc vc-1.3.0/avx/deinterleave.tcc --- vc-0.7.4/avx/deinterleave.tcc 2014-05-15 06:38:24.000000000 -0500 +++ vc-1.3.0/avx/deinterleave.tcc 2016-10-27 02:05:02.000000000 -0500 @@ -1,29 +1,36 @@ -/* This file is part of the Vc library. +/* This file is part of the Vc library. 
{{{ +Copyright © 2010-2015 Matthias Kretz - Copyright (C) 2010-2011 Matthias Kretz +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the names of contributing organizations nor the + names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY +DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - Vc is free software: you can redistribute it and/or modify - it under the terms of the GNU Lesser General Public License as - published by the Free Software Foundation, either version 3 of - the License, or (at your option) any later version. +}}}*/ - Vc is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with Vc. If not, see . 
- -*/ - -/*OUTER_NAMESPACE_BEGIN*/ -namespace Vc +namespace Vc_VERSIONED_NAMESPACE { -namespace AVX +namespace AVX2 { -inline void deinterleave(double_v &VC_RESTRICT a, double_v &VC_RESTRICT b, double_v &VC_RESTRICT c) +inline void deinterleave(double_v &Vc_RESTRICT a, double_v &Vc_RESTRICT b, double_v &Vc_RESTRICT c) { // estimated latency (AVX): 4.5 cycles const m256d tmp0 = Mem::shuffle128(a.data(), b.data()); const m256d tmp1 = Mem::shuffle128(a.data(), c.data()); @@ -33,7 +40,7 @@ c.data() = Mem::shuffle(tmp1, tmp2); } -inline void deinterleave(float_v &VC_RESTRICT a, float_v &VC_RESTRICT b, float_v &VC_RESTRICT c) +inline void deinterleave(float_v &Vc_RESTRICT a, float_v &Vc_RESTRICT b, float_v &Vc_RESTRICT c) { // abc abc abc // a = [a0 b0 c0 a1 b1 c1 a2 b2] 332 = 211+121 @@ -54,21 +61,23 @@ c.data() = Mem::permute(tmp2); } -inline void deinterleave(int_v &VC_RESTRICT a, int_v &VC_RESTRICT b, int_v &VC_RESTRICT c) +inline void deinterleave(int_v &Vc_RESTRICT a, int_v &Vc_RESTRICT b, int_v &Vc_RESTRICT c) { deinterleave(reinterpret_cast(a), reinterpret_cast(b), reinterpret_cast(c)); } -inline void deinterleave(uint_v &VC_RESTRICT a, uint_v &VC_RESTRICT b, uint_v &VC_RESTRICT c) +inline void deinterleave(uint_v &Vc_RESTRICT a, uint_v &Vc_RESTRICT b, uint_v &Vc_RESTRICT c) { deinterleave(reinterpret_cast(a), reinterpret_cast(b), reinterpret_cast(c)); } -inline void deinterleave(Vector &VC_RESTRICT a, Vector &VC_RESTRICT b, - Vector &VC_RESTRICT c) +inline void deinterleave(Vector &Vc_RESTRICT , Vector &Vc_RESTRICT , + Vector &Vc_RESTRICT ) { + return; + /* TODO: // abc abc abc // a = [a0 b0 c0 a1 b1 c1 a2 b2] 332 = 211+121 // b = [c2 a3 b3 c3 a4 b4 c4 a5] 323 = 112+211 @@ -86,10 +95,11 @@ a.data() = Mem::permuteHi(Mem::permuteLo(tmp0)); b.data() = Mem::permuteHi(Mem::permuteLo(tmp1)); c.data() = Mem::permuteHi(Mem::permuteLo(tmp2)); + */ } -inline void deinterleave(Vector &VC_RESTRICT a, Vector &VC_RESTRICT b, - Vector &VC_RESTRICT c) +inline void deinterleave(Vector &Vc_RESTRICT a, Vector &Vc_RESTRICT b, + Vector &Vc_RESTRICT c) { deinterleave(reinterpret_cast &>(a), reinterpret_cast &>(b), reinterpret_cast &>(c)); @@ -109,174 +119,172 @@ b.data() = _mm256_unpackhi_ps(tmp2, tmp3); // b7 b5 b3 b1 a7 a5 a3 a1 } -inline void deinterleave(Vector &a, Vector &b) +inline void deinterleave(Vector &a, // a0 b0 a1 b1 a2 b2 a3 b3 | a4 b4 a5 ... + Vector &b) // a8 b8 a9 ... { - m128i tmp0 = _mm_unpacklo_epi16(a.data(), b.data()); // a0 a4 b0 b4 a1 a5 b1 b5 - m128i tmp1 = _mm_unpackhi_epi16(a.data(), b.data()); // a2 a6 b2 b6 a3 a7 b3 b7 - m128i tmp2 = _mm_unpacklo_epi16(tmp0, tmp1); // a0 a2 a4 a6 b0 b2 b4 b6 - m128i tmp3 = _mm_unpackhi_epi16(tmp0, tmp1); // a1 a3 a5 a7 b1 b3 b5 b7 - a.data() = _mm_unpacklo_epi16(tmp2, tmp3); - b.data() = _mm_unpackhi_epi16(tmp2, tmp3); + auto v0 = Mem::shuffle128(a.data(), b.data()); + auto v1 = Mem::shuffle128(a.data(), b.data()); + auto v2 = AVX::unpacklo_epi16(v0, v1); // a0 a4 ... + auto v3 = AVX::unpackhi_epi16(v0, v1); // a2 a6 ... + v0 = AVX::unpacklo_epi16(v2, v3); // a0 a2 ... + v1 = AVX::unpackhi_epi16(v2, v3); // a1 a3 ... + a.data() = AVX::unpacklo_epi16(v0, v1); // a0 a1 ... + b.data() = AVX::unpackhi_epi16(v0, v1); // b0 b1 ... +} + +inline void deinterleave(Vector &a, Vector &b) +{ + auto v0 = Mem::shuffle128(a.data(), b.data()); + auto v1 = Mem::shuffle128(a.data(), b.data()); + auto v2 = AVX::unpacklo_epi16(v0, v1); // a0 a4 ... + auto v3 = AVX::unpackhi_epi16(v0, v1); // a2 a6 ... + v0 = AVX::unpacklo_epi16(v2, v3); // a0 a2 ... 
+ v1 = AVX::unpackhi_epi16(v2, v3); // a1 a3 ... + a.data() = AVX::unpacklo_epi16(v0, v1); // a0 a1 ... + b.data() = AVX::unpackhi_epi16(v0, v1); // b0 b1 ... } -inline void deinterleave(Vector &a, Vector &b) +} // namespace AVX2 +namespace Detail { - m128i tmp0 = _mm_unpacklo_epi16(a.data(), b.data()); // a0 a4 b0 b4 a1 a5 b1 b5 - m128i tmp1 = _mm_unpackhi_epi16(a.data(), b.data()); // a2 a6 b2 b6 a3 a7 b3 b7 - m128i tmp2 = _mm_unpacklo_epi16(tmp0, tmp1); // a0 a2 a4 a6 b0 b2 b4 b6 - m128i tmp3 = _mm_unpackhi_epi16(tmp0, tmp1); // a1 a3 a5 a7 b1 b3 b5 b7 - a.data() = _mm_unpacklo_epi16(tmp2, tmp3); - b.data() = _mm_unpackhi_epi16(tmp2, tmp3); -} - -} // namespace AVX - - -namespace Internal -{ - -template inline void HelperImpl::deinterleave( - float_v &a, float_v &b, const float *m, A align) +template +inline void deinterleave(AVX2::float_v &a, AVX2::float_v &b, const float *m, Flags align) { a.load(m, align); - b.load(m + float_v::Size, align); - Vc::AVX::deinterleave(a, b); + b.load(m + AVX2::float_v::Size, align); + Vc::AVX2::deinterleave(a, b); } -template inline void HelperImpl::deinterleave( - float_v &a, float_v &b, const short *m, A align) +template +inline void deinterleave(AVX2::float_v &a, AVX2::float_v &b, const short *m, Flags f) { - using Vc::AVX::m256i; - const m256i tmp = Vc::AVX::VectorHelper::load(m, align); - a.data() = _mm256_cvtepi32_ps(Vc::AVX::concat( - _mm_srai_epi32(_mm_slli_epi32(AVX::lo128(tmp), 16), 16), - _mm_srai_epi32(_mm_slli_epi32(AVX::hi128(tmp), 16), 16))); - b.data() = _mm256_cvtepi32_ps(Vc::AVX::concat( - _mm_srai_epi32(AVX::lo128(tmp), 16), - _mm_srai_epi32(AVX::hi128(tmp), 16))); + using namespace Vc::AVX2; + const auto tmp = Detail::load32(m, f); + a.data() = + _mm256_cvtepi32_ps(concat(_mm_srai_epi32(_mm_slli_epi32(lo128(tmp), 16), 16), + _mm_srai_epi32(_mm_slli_epi32(hi128(tmp), 16), 16))); + b.data() = _mm256_cvtepi32_ps( + concat(_mm_srai_epi32(lo128(tmp), 16), _mm_srai_epi32(hi128(tmp), 16))); } -template inline void HelperImpl::deinterleave( - float_v &a, float_v &b, const unsigned short *m, A align) +template +inline void deinterleave(AVX2::float_v &a, AVX2::float_v &b, const unsigned short *m, Flags f) { - using Vc::AVX::m256i; - const m256i tmp = Vc::AVX::VectorHelper::load(m, align); - a.data() = _mm256_cvtepi32_ps(Vc::AVX::concat( - _mm_blend_epi16(AVX::lo128(tmp), _mm_setzero_si128(), 0xaa), - _mm_blend_epi16(AVX::hi128(tmp), _mm_setzero_si128(), 0xaa))); - b.data() = _mm256_cvtepi32_ps(Vc::AVX::concat( - _mm_srli_epi32(AVX::lo128(tmp), 16), - _mm_srli_epi32(AVX::hi128(tmp), 16))); + using namespace Vc::AVX2; + const auto tmp = Detail::load32(m, f); + a.data() = _mm256_cvtepi32_ps( + concat(_mm_blend_epi16(lo128(tmp), _mm_setzero_si128(), 0xaa), + _mm_blend_epi16(hi128(tmp), _mm_setzero_si128(), 0xaa))); + b.data() = _mm256_cvtepi32_ps( + concat(_mm_srli_epi32(lo128(tmp), 16), _mm_srli_epi32(hi128(tmp), 16))); } -template inline void HelperImpl::deinterleave( - sfloat_v &_a, sfloat_v &_b, const MemT *m, A align) +template +inline void deinterleave(AVX2::double_v &a, AVX2::double_v &b, const double *m, Flags align) { - float_v &a = reinterpret_cast(_a); - float_v &b = reinterpret_cast(_b); - HelperImpl::deinterleave(a, b, m, align); -} + using namespace Vc::AVX2; -template inline void HelperImpl::deinterleave( - double_v &a, double_v &b, const double *m, A align) -{ a.load(m, align); - b.load(m + double_v::Size, align); + b.load(m + AVX2::double_v::Size, align); - m256d tmp0 = Mem::shuffle128(a.data(), b.data()); // b1 b0 a1 a0 - m256d 
tmp1 = Mem::shuffle128(a.data(), b.data()); // b3 b2 a3 a2 + m256d tmp0 = Mem::shuffle128(a.data(), b.data()); // b1 b0 a1 a0 + m256d tmp1 = Mem::shuffle128(a.data(), b.data()); // b3 b2 a3 a2 - a.data() = _mm256_unpacklo_pd(tmp0, tmp1); // b2 b0 a2 a0 - b.data() = _mm256_unpackhi_pd(tmp0, tmp1); // b3 b1 a3 a1 + a.data() = _mm256_unpacklo_pd(tmp0, tmp1); // b2 b0 a2 a0 + b.data() = _mm256_unpackhi_pd(tmp0, tmp1); // b3 b1 a3 a1 } -template inline void HelperImpl::deinterleave( - int_v &a, int_v &b, const int *m, A align) +template +inline void deinterleave(AVX2::int_v &a, AVX2::int_v &b, const int *m, Flags align) { - using Vc::AVX::m256; + using namespace AVX; a.load(m, align); - b.load(m + int_v::Size, align); + b.load(m + AVX2::int_v::Size, align); - const m256 tmp0 = AVX::avx_cast(Mem::shuffle128(a.data(), b.data())); - const m256 tmp1 = AVX::avx_cast(Mem::shuffle128(a.data(), b.data())); + const m256 tmp0 = avx_cast(Mem::shuffle128(a.data(), b.data())); + const m256 tmp1 = avx_cast(Mem::shuffle128(a.data(), b.data())); const m256 tmp2 = _mm256_unpacklo_ps(tmp0, tmp1); // b5 b1 b4 b0 a5 a1 a4 a0 const m256 tmp3 = _mm256_unpackhi_ps(tmp0, tmp1); // b7 b3 b6 b2 a7 a3 a6 a2 - a.data() = AVX::avx_cast(_mm256_unpacklo_ps(tmp2, tmp3)); // b6 b4 b2 b0 a6 a4 a2 a0 - b.data() = AVX::avx_cast(_mm256_unpackhi_ps(tmp2, tmp3)); // b7 b5 b3 b1 a7 a5 a3 a1 + a.data() = avx_cast(_mm256_unpacklo_ps(tmp2, tmp3)); // b6 b4 b2 b0 a6 a4 a2 a0 + b.data() = avx_cast(_mm256_unpackhi_ps(tmp2, tmp3)); // b7 b5 b3 b1 a7 a5 a3 a1 } -template inline void HelperImpl::deinterleave( - int_v &a, int_v &b, const short *m, A align) +template +inline void deinterleave(AVX2::int_v &a, AVX2::int_v &b, const short *m, Flags f) { - using Vc::AVX::m256i; - const m256i tmp = Vc::AVX::VectorHelper::load(m, align); - a.data() = Vc::AVX::concat( - _mm_srai_epi32(_mm_slli_epi32(AVX::lo128(tmp), 16), 16), - _mm_srai_epi32(_mm_slli_epi32(AVX::hi128(tmp), 16), 16)); - b.data() = Vc::AVX::concat( - _mm_srai_epi32(AVX::lo128(tmp), 16), - _mm_srai_epi32(AVX::hi128(tmp), 16)); + using namespace Vc::AVX; + const AVX2::short_v tmp0(m, f); + const m256i tmp = tmp0.data(); + a.data() = concat( + _mm_srai_epi32(_mm_slli_epi32(lo128(tmp), 16), 16), + _mm_srai_epi32(_mm_slli_epi32(hi128(tmp), 16), 16)); + b.data() = concat( + _mm_srai_epi32(lo128(tmp), 16), + _mm_srai_epi32(hi128(tmp), 16)); } -template inline void HelperImpl::deinterleave( - uint_v &a, uint_v &b, const unsigned int *m, A align) +template +inline void deinterleave(AVX2::uint_v &a, AVX2::uint_v &b, const unsigned int *m, Flags align) { - using Vc::AVX::m256; + using namespace AVX; a.load(m, align); - b.load(m + uint_v::Size, align); + b.load(m + AVX2::uint_v::Size, align); - const m256 tmp0 = AVX::avx_cast(Mem::shuffle128(a.data(), b.data())); - const m256 tmp1 = AVX::avx_cast(Mem::shuffle128(a.data(), b.data())); + const m256 tmp0 = avx_cast(Mem::shuffle128(a.data(), b.data())); + const m256 tmp1 = avx_cast(Mem::shuffle128(a.data(), b.data())); const m256 tmp2 = _mm256_unpacklo_ps(tmp0, tmp1); // b5 b1 b4 b0 a5 a1 a4 a0 const m256 tmp3 = _mm256_unpackhi_ps(tmp0, tmp1); // b7 b3 b6 b2 a7 a3 a6 a2 - a.data() = AVX::avx_cast(_mm256_unpacklo_ps(tmp2, tmp3)); // b6 b4 b2 b0 a6 a4 a2 a0 - b.data() = AVX::avx_cast(_mm256_unpackhi_ps(tmp2, tmp3)); // b7 b5 b3 b1 a7 a5 a3 a1 + a.data() = avx_cast(_mm256_unpacklo_ps(tmp2, tmp3)); // b6 b4 b2 b0 a6 a4 a2 a0 + b.data() = avx_cast(_mm256_unpackhi_ps(tmp2, tmp3)); // b7 b5 b3 b1 a7 a5 a3 a1 } -template inline void HelperImpl::deinterleave( 
- uint_v &a, uint_v &b, const unsigned short *m, A align) +template +inline void deinterleave(AVX2::uint_v &a, AVX2::uint_v &b, const unsigned short *m, Flags f) { - using Vc::AVX::m256i; - const m256i tmp = Vc::AVX::VectorHelper::load(m, align); - a.data() = Vc::AVX::concat( - _mm_srli_epi32(_mm_slli_epi32(AVX::lo128(tmp), 16), 16), - _mm_srli_epi32(_mm_slli_epi32(AVX::hi128(tmp), 16), 16)); - b.data() = Vc::AVX::concat( - _mm_srli_epi32(AVX::lo128(tmp), 16), - _mm_srli_epi32(AVX::hi128(tmp), 16)); + using namespace Vc::AVX; + const AVX2::ushort_v tmp0(m, f); + const m256i tmp = tmp0.data(); + a.data() = concat( + _mm_srai_epi32(_mm_slli_epi32(lo128(tmp), 16), 16), + _mm_srai_epi32(_mm_slli_epi32(hi128(tmp), 16), 16)); + b.data() = concat( + _mm_srai_epi32(lo128(tmp), 16), + _mm_srai_epi32(hi128(tmp), 16)); } -template inline void HelperImpl::deinterleave( - short_v &a, short_v &b, const short *m, A align) +template +inline void deinterleave(AVX2::short_v &a, AVX2::short_v &b, const short *m, Flags align) { a.load(m, align); - b.load(m + short_v::Size, align); - Vc::AVX::deinterleave(a, b); + b.load(m + AVX2::short_v::Size, align); + Vc::AVX2::deinterleave(a, b); } -template inline void HelperImpl::deinterleave( - ushort_v &a, ushort_v &b, const unsigned short *m, A align) +template +inline void deinterleave(AVX2::ushort_v &a, AVX2::ushort_v &b, const unsigned short *m, Flags align) { a.load(m, align); - b.load(m + ushort_v::Size, align); - Vc::AVX::deinterleave(a, b); + b.load(m + AVX2::ushort_v::Size, align); + Vc::AVX2::deinterleave(a, b); } // only support M == V::EntryType -> no specialization -template -inline Vc_FLATTEN void HelperImpl::deinterleave(V &VC_RESTRICT a, V &VC_RESTRICT b, - V &VC_RESTRICT c, const M *VC_RESTRICT memory, A align) +template +Vc_ALWAYS_INLINE void deinterleave(AVX2::Vector &Vc_RESTRICT a, + AVX2::Vector &Vc_RESTRICT b, + AVX2::Vector &Vc_RESTRICT c, + const M *Vc_RESTRICT memory, Flags align) { + using V = AVX2::Vector; a.load(&memory[0 * V::Size], align); b.load(&memory[1 * V::Size], align); c.load(&memory[2 * V::Size], align); - Vc::AVX::deinterleave(a, b, c); + Vc::AVX2::deinterleave(a, b, c); } -} // namespace Internal -} // namespace Vc -/*OUTER_NAMESPACE_END*/ +} // namespace Detail +} // namespace Vc diff -Nru vc-0.7.4/avx/detail.h vc-1.3.0/avx/detail.h --- vc-0.7.4/avx/detail.h 1969-12-31 18:00:00.000000000 -0600 +++ vc-1.3.0/avx/detail.h 2016-10-27 02:05:02.000000000 -0500 @@ -0,0 +1,2171 @@ +/* This file is part of the Vc library. {{{ +Copyright © 2015 Matthias Kretz + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the names of contributing organizations nor the + names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY +DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +}}}*/ + +#ifndef VC_AVX_DETAIL_H_ +#define VC_AVX_DETAIL_H_ + +#include "../sse/detail.h" +#include "macros.h" + +namespace Vc_VERSIONED_NAMESPACE +{ +namespace Detail +{ +// (converting) load functions {{{1 +template +Vc_INTRINSIC Vc_PURE __m256 load(const float *x, Flags, LoadTag<__m256, float>, + typename Flags::EnableIfAligned = nullptr) +{ + return _mm256_load_ps(x); +} +template +Vc_INTRINSIC Vc_PURE __m256 load(const float *x, Flags, LoadTag<__m256, float>, + typename Flags::EnableIfUnaligned = nullptr) +{ + return _mm256_loadu_ps(x); +} +template +Vc_INTRINSIC Vc_PURE __m256 load(const float *x, Flags, LoadTag<__m256, float>, + typename Flags::EnableIfStreaming = nullptr) +{ + return AvxIntrinsics::stream_load<__m256>(x); +} + +template +Vc_INTRINSIC Vc_PURE __m256d load(const double *x, Flags, LoadTag<__m256d, double>, + typename Flags::EnableIfAligned = nullptr) +{ + return _mm256_load_pd(x); +} +template +Vc_INTRINSIC Vc_PURE __m256d load(const double *x, Flags, LoadTag<__m256d, double>, + typename Flags::EnableIfUnaligned = nullptr) +{ + return _mm256_loadu_pd(x); +} +template +Vc_INTRINSIC Vc_PURE __m256d load(const double *x, Flags, LoadTag<__m256d, double>, + typename Flags::EnableIfStreaming = nullptr) +{ + return AvxIntrinsics::stream_load<__m256d>(x); +} + +template ::value>> +Vc_INTRINSIC Vc_PURE __m256i +load(const T *x, Flags, LoadTag<__m256i, T>, typename Flags::EnableIfAligned = nullptr) +{ + return _mm256_load_si256(reinterpret_cast(x)); +} +template ::value>> +Vc_INTRINSIC Vc_PURE __m256i +load(const T *x, Flags, LoadTag<__m256i, T>, typename Flags::EnableIfUnaligned = nullptr) +{ + return _mm256_loadu_si256(reinterpret_cast(x)); +} +template ::value>> +Vc_INTRINSIC Vc_PURE __m256i +load(const T *x, Flags, LoadTag<__m256i, T>, typename Flags::EnableIfStreaming = nullptr) +{ + return AvxIntrinsics::stream_load<__m256i>(x); +} + +// load32{{{2 +Vc_INTRINSIC __m256 load32(const float *mem, when_aligned) +{ + return _mm256_load_ps(mem); +} +Vc_INTRINSIC __m256 load32(const float *mem, when_unaligned) +{ + return _mm256_loadu_ps(mem); +} +Vc_INTRINSIC __m256 load32(const float *mem, when_streaming) +{ + return AvxIntrinsics::stream_load<__m256>(mem); +} +Vc_INTRINSIC __m256d load32(const double *mem, when_aligned) +{ + return _mm256_load_pd(mem); +} +Vc_INTRINSIC __m256d load32(const double *mem, when_unaligned) +{ + return _mm256_loadu_pd(mem); +} +Vc_INTRINSIC __m256d load32(const double *mem, when_streaming) +{ + return AvxIntrinsics::stream_load<__m256d>(mem); +} +template Vc_INTRINSIC __m256i load32(const T *mem, when_aligned) +{ + static_assert(std::is_integral::value, "load32 is only intended for integral T"); + return _mm256_load_si256(reinterpret_cast(mem)); +} +template Vc_INTRINSIC __m256i load32(const T *mem, when_unaligned) +{ + static_assert(std::is_integral::value, "load32 is only intended for integral T"); + return _mm256_loadu_si256(reinterpret_cast(mem)); +} +template Vc_INTRINSIC __m256i load32(const T *mem, when_streaming) +{ + 
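    // Descriptive note on the load32 overload set: dispatch happens on the
    // load-policy tag types (when_aligned / when_unaligned / when_streaming), so
    // the right intrinsic is picked at compile time with no runtime branch. This
    // streaming variant forwards to AvxIntrinsics::stream_load, which is expected
    // to map to a non-temporal (MOVNTDQA-style) load; the pointer still has to be
    // suitably aligned, and the intent is to minimize cache pollution for data
    // that is read once.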
static_assert(std::is_integral::value, "load32 is only intended for integral T"); + return AvxIntrinsics::stream_load<__m256i>(mem); +} + +// MSVC workarounds{{{2 +#ifdef Vc_MSVC +// work around: "fatal error C1001: An internal error has occurred in the compiler." +Vc_INTRINSIC __m256i load(const uint *mem, when_aligned, LoadTag<__m256i, int>) +{ + return _mm256_load_si256(reinterpret_cast(mem)); +} + +Vc_INTRINSIC __m256d load(const double *mem, when_unaligned, LoadTag<__m256d, double>) +{ + return _mm256_loadu_pd(mem); +} + +template +Vc_INTRINSIC __m256 load(const float *mem, when_aligned, + enable_if<(std::is_same::value && + std::is_same::value)> = nullarg) +{ + return _mm256_load_ps(mem); +} + +template +Vc_INTRINSIC __m256 load(const float *mem, when_unaligned, + enable_if<(std::is_same::value && + std::is_same::value)> = nullarg) +{ + return _mm256_loadu_ps(mem); +} + +template +Vc_INTRINSIC __m256 load(const float *mem, when_streaming, + enable_if<(std::is_same::value && + std::is_same::value)> = nullarg) +{ + return AvxIntrinsics::stream_load<__m256>(mem); +} + +template +Vc_INTRINSIC __m256d load(const double *mem, when_aligned, + enable_if<(std::is_same::value && + std::is_same::value)> = nullarg) +{ + return _mm256_load_pd(mem); +} + +template +Vc_INTRINSIC __m256d load(const double *mem, when_unaligned, + enable_if<(std::is_same::value && + std::is_same::value)> = nullarg) +{ + return _mm256_loadu_pd(mem); +} + +template +Vc_INTRINSIC __m256d load(const double *mem, when_streaming, + enable_if<(std::is_same::value && + std::is_same::value)> = nullarg) +{ + return AvxIntrinsics::stream_load<__m256d>(mem); +} + +template +Vc_INTRINSIC __m256i load(const uint *mem, when_aligned, + enable_if<(std::is_same::value && + std::is_same::value)> = nullarg) +{ + return _mm256_load_si256(reinterpret_cast(mem)); +} + +template +Vc_INTRINSIC __m256i load(const uint *mem, when_unaligned, + enable_if<(std::is_same::value && + std::is_same::value)> = nullarg) +{ + return _mm256_loadu_si256(reinterpret_cast(mem)); +} + +template +Vc_INTRINSIC __m256i load(const uint *mem, when_streaming, + enable_if<(std::is_same::value && + std::is_same::value)> = nullarg) +{ + return AvxIntrinsics::stream_load<__m256i>(mem); +} + +template +Vc_INTRINSIC __m256i load(const int *mem, when_unaligned, + enable_if<(std::is_same::value && + std::is_same::value)> = nullarg) +{ + return _mm256_loadu_si256(reinterpret_cast(mem)); +} + +template +Vc_INTRINSIC __m256i load(const int *mem, when_aligned, + enable_if<(std::is_same::value && + std::is_same::value)> = nullarg) +{ + return _mm256_load_si256(reinterpret_cast(mem)); +} + +template +Vc_INTRINSIC __m256i load(const int *mem, when_streaming, + enable_if<(std::is_same::value && + std::is_same::value)> = nullarg) +{ + return AvxIntrinsics::stream_load<__m256i>(mem); +} + +template +Vc_INTRINSIC __m256i load(const short *mem, when_unaligned, + enable_if<(std::is_same::value && + std::is_same::value)> = nullarg) +{ + return _mm256_loadu_si256(reinterpret_cast(mem)); +} + +template +Vc_INTRINSIC __m256i load(const short *mem, when_aligned, + enable_if<(std::is_same::value && + std::is_same::value)> = nullarg) +{ + return _mm256_load_si256(reinterpret_cast(mem)); +} + +template +Vc_INTRINSIC __m256i load(const short *mem, when_streaming, + enable_if<(std::is_same::value && + std::is_same::value)> = nullarg) +{ + return AvxIntrinsics::stream_load<__m256i>(mem); +} + +template +Vc_INTRINSIC __m256i load(const ushort *mem, when_unaligned, + enable_if<(std::is_same::value 
&& + std::is_same::value)> = nullarg) +{ + return _mm256_loadu_si256(reinterpret_cast(mem)); +} + +template +Vc_INTRINSIC __m256i load(const ushort *mem, when_aligned, + enable_if<(std::is_same::value && + std::is_same::value)> = nullarg) +{ + return _mm256_load_si256(reinterpret_cast(mem)); +} + +template +Vc_INTRINSIC __m256i load(const ushort *mem, when_streaming, + enable_if<(std::is_same::value && + std::is_same::value)> = nullarg) +{ + return AvxIntrinsics::stream_load<__m256i>(mem); +} + +#endif // Vc_MSVC + +// short {{{2 +template +Vc_INTRINSIC __m256i load(const ushort *mem, Flags f, LoadTag<__m256i, short>) +{ + return load32(mem, f); +} +template +Vc_INTRINSIC __m256i load(const uchar *mem, Flags f, LoadTag<__m256i, short>) +{ + return AVX::cvtepu8_epi16(load16(mem, f)); +} +template +Vc_INTRINSIC __m256i load(const schar *mem, Flags f, LoadTag<__m256i, short>) +{ + return AVX::cvtepi8_epi16(load16(mem, f)); +} + +// ushort {{{2 +template +Vc_INTRINSIC __m256i load(const uchar *mem, Flags f, LoadTag<__m256i, ushort>) +{ + return AVX::cvtepu8_epi16(load16(mem, f)); +} + +// int {{{2 +template +Vc_INTRINSIC __m256i load(const uint *mem, Flags f, LoadTag<__m256i, int>) +{ + return load32(mem, f); +} +template +Vc_INTRINSIC __m256i load(const ushort *mem, Flags f, LoadTag<__m256i, int>) +{ + return AVX::cvtepu16_epi32(load16(mem, f)); +} +template +Vc_INTRINSIC __m256i load(const short *mem, Flags f, LoadTag<__m256i, int>) +{ + return AVX::cvtepi16_epi32(load16(mem, f)); +} +template +Vc_INTRINSIC __m256i load(const uchar *mem, Flags, LoadTag<__m256i, int>) +{ + return AVX::cvtepu8_epi32(_mm_loadl_epi64(reinterpret_cast(mem))); +} +template +Vc_INTRINSIC __m256i load(const schar *mem, Flags, LoadTag<__m256i, int>) +{ + return AVX::cvtepi8_epi32(_mm_loadl_epi64(reinterpret_cast(mem))); +} + +// uint {{{2 +template +Vc_INTRINSIC __m256i load(const ushort *mem, Flags f, LoadTag<__m256i, uint>) +{ + return AVX::cvtepu16_epi32(load16(mem, f)); +} +template +Vc_INTRINSIC __m256i load(const uchar *mem, Flags, LoadTag<__m256i, uint>) +{ + return AVX::cvtepu8_epi32(_mm_loadl_epi64(reinterpret_cast(mem))); +} + +// double {{{2 +template +Vc_INTRINSIC __m256d load(const float *mem, Flags f, LoadTag<__m256d, double>) +{ + return AVX::convert(load16(mem, f)); +} +template +Vc_INTRINSIC __m256d load(const uint *mem, Flags f, LoadTag<__m256d, double>) +{ + return AVX::convert(load16(mem, f)); +} +template +Vc_INTRINSIC __m256d load(const int *mem, Flags f, LoadTag<__m256d, double>) +{ + return AVX::convert(load16(mem, f)); +} +template +Vc_INTRINSIC __m256d load(const ushort *mem, Flags f, LoadTag<__m256d, double>) +{ + return AVX::convert(load16(mem, f)); +} +template +Vc_INTRINSIC __m256d load(const short *mem, Flags f, LoadTag<__m256d, double>) +{ + return AVX::convert(load16(mem, f)); +} +template +Vc_INTRINSIC __m256d load(const uchar *mem, Flags f, LoadTag<__m256d, double>) +{ + return AVX::convert(load16(mem, f)); +} +template +Vc_INTRINSIC __m256d load(const schar *mem, Flags f, LoadTag<__m256d, double>) +{ + return AVX::convert(load16(mem, f)); +} + +// float {{{2 +template +Vc_INTRINSIC __m256 load(const double *mem, Flags f, LoadTag<__m256, float>) +{ + return AVX::concat(_mm256_cvtpd_ps(load32(&mem[0], f)), + _mm256_cvtpd_ps(load32(&mem[4], f))); +} +template +Vc_INTRINSIC __m256 load(const uint *mem, Flags f, LoadTag<__m256, float>) +{ + const auto v = load32(mem, f); + return _mm256_blendv_ps( + _mm256_cvtepi32_ps(v), + _mm256_add_ps(_mm256_cvtepi32_ps(AVX::sub_epi32(v, 
AVX::set2power31_epu32())), + AVX::set2power31_ps()), + _mm256_castsi256_ps(AVX::cmplt_epi32(v, _mm256_setzero_si256()))); +} +template +Vc_INTRINSIC __m256 load(const int *mem, Flags f, LoadTag<__m256, float>) +{ + return AVX::convert(load32(mem, f)); +} +template ::value>> +Vc_INTRINSIC __m256 load(const T *mem, Flags f, LoadTag<__m256, float>) +{ + return _mm256_cvtepi32_ps(load<__m256i, int>(mem, f)); +} +template +Vc_INTRINSIC __m256 load(const ushort *mem, Flags f, LoadTag<__m256, float>) +{ + return AVX::convert(load16(mem, f)); +} +template +Vc_INTRINSIC __m256 load(const short *mem, Flags f, LoadTag<__m256, float>) +{ + return AVX::convert(load16(mem, f)); +} +/* +template struct LoadHelper { + static __m256 load(const unsigned char *mem, Flags) + { + return _mm256_cvtepi32_ps( + cvtepu8_epi32(_mm_loadl_epi64(reinterpret_cast(mem)))); + } +}; +template struct LoadHelper { + static __m256 load(const signed char *mem, Flags) + { + return _mm256_cvtepi32_ps( + cvtepi8_epi32(_mm_loadl_epi64(reinterpret_cast(mem)))); + } +}; +*/ + +// shifted{{{1 +template +Vc_INTRINSIC Vc_CONST enable_if<(sizeof(T) == 32 && amount >= 16), T> shifted(T k) +{ + return AVX::avx_cast(AVX::zeroExtend( + _mm_srli_si128(AVX::hi128(AVX::avx_cast<__m256i>(k)), amount - 16))); +} +template +Vc_INTRINSIC Vc_CONST enable_if<(sizeof(T) == 32 && amount > 0 && amount < 16), T> +shifted(T k) +{ + return AVX::avx_cast( + AVX::alignr(Mem::permute128(AVX::avx_cast<__m256i>(k)), + Mem::permute128(AVX::avx_cast<__m256i>(k)))); +} +template +Vc_INTRINSIC Vc_CONST enable_if<(sizeof(T) == 32 && amount <= -16), T> shifted(T k) +{ + return AVX::avx_cast(Mem::permute128(AVX::avx_cast<__m256i>( + _mm_slli_si128(AVX::lo128(AVX::avx_cast<__m256i>(k)), -16 - amount)))); +} +template +Vc_INTRINSIC Vc_CONST enable_if<(sizeof(T) == 32 && amount > -16 && amount < 0), T> +shifted(T k) +{ + return AVX::avx_cast( + AVX::alignr<16 + amount>(AVX::avx_cast<__m256i>(k), + Mem::permute128(AVX::avx_cast<__m256i>(k)))); +} +// mask_cast{{{1 +template Vc_INTRINSIC Vc_CONST R mask_cast(__m256i k) +{ + static_assert(From == To, "Incorrect mask cast."); + static_assert(std::is_same::value, "Incorrect mask cast."); + return AVX::avx_cast<__m256>(k); +} + +// 4 -> 4 +template <> Vc_INTRINSIC Vc_CONST __m128 mask_cast<4, 4, __m128>(__m256i k) +{ + return AVX::avx_cast<__m128>(_mm_packs_epi32(AVX::lo128(k), AVX::hi128(k))); +} + +template <> Vc_INTRINSIC Vc_CONST __m256 mask_cast<4, 4, __m256>(__m128i k) +{ + const auto kk = _mm_castsi128_ps(k); + return AVX::concat(_mm_unpacklo_ps(kk, kk), _mm_unpackhi_ps(kk, kk)); +} + +// 4 -> 8 +template<> Vc_INTRINSIC Vc_CONST __m256 mask_cast<4, 8, __m256>(__m256i k) +{ + // aabb ccdd -> abcd 0000 + return AVX::avx_cast<__m256>(AVX::concat(_mm_packs_epi32(AVX::lo128(k), AVX::hi128(k)), + _mm_setzero_si128())); +} + +template<> Vc_INTRINSIC Vc_CONST __m128 mask_cast<4, 8, __m128>(__m256i k) +{ + // aaaa bbbb cccc dddd -> abcd 0000 + return AVX::avx_cast<__m128>(_mm_packs_epi16(_mm_packs_epi32(AVX::lo128(k), AVX::hi128(k)), _mm_setzero_si128())); +} + +template <> Vc_INTRINSIC Vc_CONST __m256 mask_cast<4, 8, __m256>(__m128i k) +{ + return AVX::zeroExtend(AVX::avx_cast<__m128>(k)); +} + +// 4 -> 16 +template<> Vc_INTRINSIC Vc_CONST __m256 mask_cast<4, 16, __m256>(__m256i k) +{ + // aaaa bbbb cccc dddd -> abcd 0000 0000 0000 + return AVX::zeroExtend(mask_cast<4, 8, __m128>(k)); +} + +// 8 -> 4 +template<> Vc_INTRINSIC Vc_CONST __m256 mask_cast<8, 4, __m256>(__m256i k) +{ + // aabb ccdd eeff gghh -> aaaa bbbb cccc dddd 
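    // Only the low 128-bit lane (mask entries a..d, one 32-bit word each) survives
    // the 8 -> 4 narrowing; unpacking that lane with itself duplicates every entry
    // to 64 bits, which is exactly the layout a 4-entry double-precision mask needs.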
+ const auto lo = AVX::lo128(AVX::avx_cast<__m256>(k)); + return AVX::concat(_mm_unpacklo_ps(lo, lo), + _mm_unpackhi_ps(lo, lo)); +} + +template<> Vc_INTRINSIC Vc_CONST __m128 mask_cast<8, 4, __m128>(__m256i k) +{ + return AVX::avx_cast<__m128>(AVX::lo128(k)); +} + +template<> Vc_INTRINSIC Vc_CONST __m256 mask_cast<8, 4, __m256>(__m128i k) +{ + // abcd efgh -> aaaa bbbb cccc dddd + const auto tmp = _mm_unpacklo_epi16(k, k); // aa bb cc dd + return AVX::avx_cast<__m256>(AVX::concat(_mm_unpacklo_epi32(tmp, tmp), // aaaa bbbb + _mm_unpackhi_epi32(tmp, tmp))); // cccc dddd +} + +// 8 -> 8 +template<> Vc_INTRINSIC Vc_CONST __m128 mask_cast<8, 8, __m128>(__m256i k) +{ + // aabb ccdd eeff gghh -> abcd efgh + return AVX::avx_cast<__m128>(_mm_packs_epi16(AVX::lo128(k), AVX::hi128(k))); +} + +template<> Vc_INTRINSIC Vc_CONST __m256 mask_cast<8, 8, __m256>(__m128i k) +{ + return AVX::avx_cast<__m256>(AVX::concat(_mm_unpacklo_epi16(k, k), + _mm_unpackhi_epi16(k, k))); +} + +// 8 -> 16 +template<> Vc_INTRINSIC Vc_CONST __m256 mask_cast<8, 16, __m256>(__m256i k) +{ + // aabb ccdd eeff gghh -> abcd efgh 0000 0000 + return AVX::zeroExtend(mask_cast<8, 8, __m128>(k)); +} + +// 16 -> 8 +#ifdef Vc_IMPL_AVX2 +template<> Vc_INTRINSIC Vc_CONST __m256 mask_cast<16, 8, __m256>(__m256i k) +{ + // abcd efgh ijkl mnop -> aabb ccdd eeff gghh + const auto flipped = Mem::permute4x64(k); + return _mm256_castsi256_ps(AVX::unpacklo_epi16(flipped, flipped)); +} +#endif + +// 16 -> 4 +template<> Vc_INTRINSIC Vc_CONST __m256 mask_cast<16, 4, __m256>(__m256i k) +{ + // abcd efgh ijkl mnop -> aaaa bbbb cccc dddd + const auto tmp = _mm_unpacklo_epi16(AVX::lo128(k), AVX::lo128(k)); // aabb ccdd + return _mm256_castsi256_ps(AVX::concat(_mm_unpacklo_epi32(tmp, tmp), _mm_unpackhi_epi32(tmp, tmp))); +} + +// allone{{{1 +template<> Vc_INTRINSIC Vc_CONST __m256 allone<__m256 >() { return AVX::setallone_ps(); } +template<> Vc_INTRINSIC Vc_CONST __m256i allone<__m256i>() { return AVX::setallone_si256(); } +template<> Vc_INTRINSIC Vc_CONST __m256d allone<__m256d>() { return AVX::setallone_pd(); } + +// zero{{{1 +template<> Vc_INTRINSIC Vc_CONST __m256 zero<__m256 >() { return _mm256_setzero_ps(); } +template<> Vc_INTRINSIC Vc_CONST __m256i zero<__m256i>() { return _mm256_setzero_si256(); } +template<> Vc_INTRINSIC Vc_CONST __m256d zero<__m256d>() { return _mm256_setzero_pd(); } + +// one{{{1 +Vc_INTRINSIC Vc_CONST __m256 one( float) { return AVX::setone_ps (); } +Vc_INTRINSIC Vc_CONST __m256d one(double) { return AVX::setone_pd (); } +Vc_INTRINSIC Vc_CONST __m256i one( int) { return AVX::setone_epi32(); } +Vc_INTRINSIC Vc_CONST __m256i one( uint) { return AVX::setone_epu32(); } +Vc_INTRINSIC Vc_CONST __m256i one( short) { return AVX::setone_epi16(); } +Vc_INTRINSIC Vc_CONST __m256i one(ushort) { return AVX::setone_epu16(); } +Vc_INTRINSIC Vc_CONST __m256i one( schar) { return AVX::setone_epi8 (); } +Vc_INTRINSIC Vc_CONST __m256i one( uchar) { return AVX::setone_epu8 (); } + +// negate{{{1 +Vc_ALWAYS_INLINE Vc_CONST __m256 negate(__m256 v, std::integral_constant) +{ + return _mm256_xor_ps(v, AVX::setsignmask_ps()); +} +Vc_ALWAYS_INLINE Vc_CONST __m256d negate(__m256d v, std::integral_constant) +{ + return _mm256_xor_pd(v, AVX::setsignmask_pd()); +} +Vc_ALWAYS_INLINE Vc_CONST __m256i negate(__m256i v, std::integral_constant) +{ + return AVX::sign_epi32(v, Detail::allone<__m256i>()); +} +Vc_ALWAYS_INLINE Vc_CONST __m256i negate(__m256i v, std::integral_constant) +{ + return AVX::sign_epi16(v, Detail::allone<__m256i>()); +} + +// xor_{{{1 
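// The bitwise helpers below have to cope with plain AVX (no AVX2): 256-bit integer
// logic instructions only exist with AVX2, so without it the operands are
// reinterpreted as packed floats, combined with the corresponding *_ps instruction,
// and reinterpreted back. The casts are free (they only retag the register); the
// worst case is a possible domain-crossing penalty. A minimal self-contained sketch
// of that fallback pattern follows; the function name is made up for illustration
// and is not part of the library.

#include <immintrin.h>

static inline __m256i xor_si256_avx1_sketch(__m256i a, __m256i b)
{
    // reinterpret the integer registers as packed floats (no instruction emitted),
    // XOR them bitwise with the AVX1 float instruction, then cast back
    return _mm256_castps_si256(
        _mm256_xor_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b)));
}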
+Vc_INTRINSIC __m256 xor_(__m256 a, __m256 b) { return _mm256_xor_ps(a, b); } +Vc_INTRINSIC __m256d xor_(__m256d a, __m256d b) { return _mm256_xor_pd(a, b); } +Vc_INTRINSIC __m256i xor_(__m256i a, __m256i b) +{ +#ifdef Vc_IMPL_AVX2 + return _mm256_xor_si256(a, b); +#else + return _mm256_castps_si256( + _mm256_xor_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b))); +#endif +} + +// or_{{{1 +Vc_INTRINSIC __m256 or_(__m256 a, __m256 b) { return _mm256_or_ps(a, b); } +Vc_INTRINSIC __m256d or_(__m256d a, __m256d b) { return _mm256_or_pd(a, b); } +Vc_INTRINSIC __m256i or_(__m256i a, __m256i b) +{ +#ifdef Vc_IMPL_AVX2 + return _mm256_or_si256(a, b); +#else + return _mm256_castps_si256( + _mm256_or_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b))); +#endif +} + +// and_{{{1 +Vc_INTRINSIC __m256 and_(__m256 a, __m256 b) { return _mm256_and_ps(a, b); } +Vc_INTRINSIC __m256d and_(__m256d a, __m256d b) { return _mm256_and_pd(a, b); } +Vc_INTRINSIC __m256i and_(__m256i a, __m256i b) { +#ifdef Vc_IMPL_AVX2 + return _mm256_and_si256(a, b); +#else + return _mm256_castps_si256( + _mm256_and_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b))); +#endif +} + +// andnot_{{{1 +Vc_INTRINSIC __m256 andnot_(__m256 a, __m256 b) { return _mm256_andnot_ps(a, b); } +Vc_INTRINSIC __m256d andnot_(__m256d a, __m256d b) { return _mm256_andnot_pd(a, b); } +Vc_INTRINSIC __m256i andnot_(__m256i a, __m256i b) +{ +#ifdef Vc_IMPL_AVX2 + return _mm256_andnot_si256(a, b); +#else + return _mm256_castps_si256( + _mm256_andnot_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b))); +#endif +} + +// not_{{{1 +Vc_INTRINSIC __m256 not_(__m256 a) { return andnot_(a, allone<__m256 >()); } +Vc_INTRINSIC __m256d not_(__m256d a) { return andnot_(a, allone<__m256d>()); } +Vc_INTRINSIC __m256i not_(__m256i a) { return andnot_(a, allone<__m256i>()); } + +// blend{{{1 +Vc_INTRINSIC __m256 blend(__m256 a, __m256 b, __m256 c) { return _mm256_blendv_ps(a, b, c); } +Vc_INTRINSIC __m256d blend(__m256d a, __m256d b, __m256d c) { return _mm256_blendv_pd(a, b, c); } +Vc_INTRINSIC __m256i blend(__m256i a, __m256i b, __m256i c) { return AVX::blendv_epi8(a, b, c); } + +// abs{{{1 +Vc_INTRINSIC __m256 abs(__m256 a, float) { return and_(a, AVX::setabsmask_ps()); } +Vc_INTRINSIC __m256d abs(__m256d a, double) { return and_(a, AVX::setabsmask_pd()); } +Vc_INTRINSIC __m256i abs(__m256i a, int) { return AVX::abs_epi32(a); } +Vc_INTRINSIC __m256i abs(__m256i a, uint) { return a; } +Vc_INTRINSIC __m256i abs(__m256i a, short) { return AVX::abs_epi16(a); } +Vc_INTRINSIC __m256i abs(__m256i a, ushort) { return a; } +Vc_INTRINSIC __m256i abs(__m256i a, schar) { return AVX::abs_epi8 (a); } +Vc_INTRINSIC __m256i abs(__m256i a, uchar) { return a; } + +// add{{{1 +Vc_INTRINSIC __m256 add(__m256 a, __m256 b, float) { return _mm256_add_ps(a, b); } +Vc_INTRINSIC __m256d add(__m256d a, __m256d b, double) { return _mm256_add_pd(a, b); } +Vc_INTRINSIC __m256i add(__m256i a, __m256i b, int) { return AVX::add_epi32(a, b); } +Vc_INTRINSIC __m256i add(__m256i a, __m256i b, uint) { return AVX::add_epi32(a, b); } +Vc_INTRINSIC __m256i add(__m256i a, __m256i b, short) { return AVX::add_epi16(a, b); } +Vc_INTRINSIC __m256i add(__m256i a, __m256i b, ushort) { return AVX::add_epi16(a, b); } + +// sub{{{1 +Vc_INTRINSIC __m256 sub(__m256 a, __m256 b, float) { return _mm256_sub_ps(a, b); } +Vc_INTRINSIC __m256d sub(__m256d a, __m256d b, double) { return _mm256_sub_pd(a, b); } +Vc_INTRINSIC __m256i sub(__m256i a, __m256i b, int) { return AVX::sub_epi32(a, b); } +Vc_INTRINSIC __m256i 
sub(__m256i a, __m256i b, uint) { return AVX::sub_epi32(a, b); } +Vc_INTRINSIC __m256i sub(__m256i a, __m256i b, short) { return AVX::sub_epi16(a, b); } +Vc_INTRINSIC __m256i sub(__m256i a, __m256i b, ushort) { return AVX::sub_epi16(a, b); } + +// mul{{{1 +Vc_INTRINSIC __m256 mul(__m256 a, __m256 b, float) { return _mm256_mul_ps(a, b); } +Vc_INTRINSIC __m256d mul(__m256d a, __m256d b, double) { return _mm256_mul_pd(a, b); } +Vc_INTRINSIC __m256i mul(__m256i a, __m256i b, int) { return AVX::mullo_epi32(a, b); } +Vc_INTRINSIC __m256i mul(__m256i a, __m256i b, uint) { return AVX::mullo_epi32(a, b); } +Vc_INTRINSIC __m256i mul(__m256i a, __m256i b, short) { return AVX::mullo_epi16(a, b); } +Vc_INTRINSIC __m256i mul(__m256i a, __m256i b, ushort) { return AVX::mullo_epi16(a, b); } + +// mul{{{1 +Vc_INTRINSIC __m256 div(__m256 a, __m256 b, float) { return _mm256_div_ps(a, b); } +Vc_INTRINSIC __m256d div(__m256d a, __m256d b, double) { return _mm256_div_pd(a, b); } +Vc_INTRINSIC __m256i div(__m256i a, __m256i b, int) { + using namespace AVX; + const __m256d lo1 = _mm256_cvtepi32_pd(lo128(a)); + const __m256d lo2 = _mm256_cvtepi32_pd(lo128(b)); + const __m256d hi1 = _mm256_cvtepi32_pd(hi128(a)); + const __m256d hi2 = _mm256_cvtepi32_pd(hi128(b)); + return concat(_mm256_cvttpd_epi32(_mm256_div_pd(lo1, lo2)), + _mm256_cvttpd_epi32(_mm256_div_pd(hi1, hi2))); +} +Vc_INTRINSIC __m256i div(__m256i a, __m256i b, uint) { + // SSE/AVX only has signed int conversion to doubles. Therefore we first adjust the input before + // conversion and take the adjustment back after the conversion. + // It could be argued that for b this is not really important because division by a b >= 2^31 is + // useless. But for full correctness it cannot be ignored. + using namespace AVX; + const __m256i aa = add_epi32(a, set1_epi32(-2147483648)); + const __m256i bb = add_epi32(b, set1_epi32(-2147483648)); + const __m256d loa = _mm256_add_pd(_mm256_cvtepi32_pd(lo128(aa)), set1_pd(2147483648.)); + const __m256d hia = _mm256_add_pd(_mm256_cvtepi32_pd(hi128(aa)), set1_pd(2147483648.)); + const __m256d lob = _mm256_add_pd(_mm256_cvtepi32_pd(lo128(bb)), set1_pd(2147483648.)); + const __m256d hib = _mm256_add_pd(_mm256_cvtepi32_pd(hi128(bb)), set1_pd(2147483648.)); + // there is one remaining problem: a >= 2^31 and b == 1 + // in that case the return value would be 2^31 + return avx_cast<__m256i>(_mm256_blendv_ps( + avx_cast<__m256>(concat(_mm256_cvttpd_epi32(_mm256_div_pd(loa, lob)), + _mm256_cvttpd_epi32(_mm256_div_pd(hia, hib)))), + avx_cast<__m256>(a), + avx_cast<__m256>(cmpeq_epi32(b, setone_epi32())))); +} +Vc_INTRINSIC __m256i div(__m256i a, __m256i b, short) { + using namespace AVX; + const __m256 lo = + _mm256_div_ps(convert(lo128(a)), convert(lo128(b))); + const __m256 hi = + _mm256_div_ps(convert(hi128(a)), convert(hi128(b))); + return concat(convert(lo), convert(hi)); +} + +// horizontal add{{{1 +template Vc_INTRINSIC T add(Common::IntrinsicType a, T) +{ + return {add(add(AVX::lo128(a), AVX::hi128(a), T()), T())}; +} + +// horizontal mul{{{1 +template Vc_INTRINSIC T mul(Common::IntrinsicType a, T) +{ + return {mul(mul(AVX::lo128(a), AVX::hi128(a), T()), T())}; +} + +// horizontal min{{{1 +template Vc_INTRINSIC T min(Common::IntrinsicType a, T) +{ + return {min(min(AVX::lo128(a), AVX::hi128(a), T()), T())}; +} + +// horizontal max{{{1 +template Vc_INTRINSIC T max(Common::IntrinsicType a, T) +{ + return {max(max(AVX::lo128(a), AVX::hi128(a), T()), T())}; +} +// cmpeq{{{1 +Vc_INTRINSIC __m256 cmpeq(__m256 a, __m256 b, float) { 
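    // Note on the comparison block that starts here: only cmpeq and cmpgt exist as
    // native integer compares, so cmpneq/cmpge/cmple further down are synthesized
    // as not_() of the opposite test, and the unsigned cmpgt_epu* wrappers
    // presumably bias both operands by the sign bit before using the signed compare.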
return AvxIntrinsics::cmpeq_ps(a, b); } +Vc_INTRINSIC __m256d cmpeq(__m256d a, __m256d b, double) { return AvxIntrinsics::cmpeq_pd(a, b); } +Vc_INTRINSIC __m256i cmpeq(__m256i a, __m256i b, int) { return AvxIntrinsics::cmpeq_epi32(a, b); } +Vc_INTRINSIC __m256i cmpeq(__m256i a, __m256i b, uint) { return AvxIntrinsics::cmpeq_epi32(a, b); } +Vc_INTRINSIC __m256i cmpeq(__m256i a, __m256i b, short) { return AvxIntrinsics::cmpeq_epi16(a, b); } +Vc_INTRINSIC __m256i cmpeq(__m256i a, __m256i b, ushort) { return AvxIntrinsics::cmpeq_epi16(a, b); } + +// cmpneq{{{1 +Vc_INTRINSIC __m256 cmpneq(__m256 a, __m256 b, float) { return AvxIntrinsics::cmpneq_ps(a, b); } +Vc_INTRINSIC __m256d cmpneq(__m256d a, __m256d b, double) { return AvxIntrinsics::cmpneq_pd(a, b); } +Vc_INTRINSIC __m256i cmpneq(__m256i a, __m256i b, int) { return not_(AvxIntrinsics::cmpeq_epi32(a, b)); } +Vc_INTRINSIC __m256i cmpneq(__m256i a, __m256i b, uint) { return not_(AvxIntrinsics::cmpeq_epi32(a, b)); } +Vc_INTRINSIC __m256i cmpneq(__m256i a, __m256i b, short) { return not_(AvxIntrinsics::cmpeq_epi16(a, b)); } +Vc_INTRINSIC __m256i cmpneq(__m256i a, __m256i b, ushort) { return not_(AvxIntrinsics::cmpeq_epi16(a, b)); } +Vc_INTRINSIC __m256i cmpneq(__m256i a, __m256i b, schar) { return not_(AvxIntrinsics::cmpeq_epi8 (a, b)); } +Vc_INTRINSIC __m256i cmpneq(__m256i a, __m256i b, uchar) { return not_(AvxIntrinsics::cmpeq_epi8 (a, b)); } + +// cmpgt{{{1 +Vc_INTRINSIC __m256 cmpgt(__m256 a, __m256 b, float) { return AVX::cmpgt_ps(a, b); } +Vc_INTRINSIC __m256d cmpgt(__m256d a, __m256d b, double) { return AVX::cmpgt_pd(a, b); } +Vc_INTRINSIC __m256i cmpgt(__m256i a, __m256i b, int) { return AVX::cmpgt_epi32(a, b); } +Vc_INTRINSIC __m256i cmpgt(__m256i a, __m256i b, uint) { return AVX::cmpgt_epu32(a, b); } +Vc_INTRINSIC __m256i cmpgt(__m256i a, __m256i b, short) { return AVX::cmpgt_epi16(a, b); } +Vc_INTRINSIC __m256i cmpgt(__m256i a, __m256i b, ushort) { return AVX::cmpgt_epu16(a, b); } +Vc_INTRINSIC __m256i cmpgt(__m256i a, __m256i b, schar) { return AVX::cmpgt_epi8 (a, b); } +Vc_INTRINSIC __m256i cmpgt(__m256i a, __m256i b, uchar) { return AVX::cmpgt_epu8 (a, b); } + +// cmpge{{{1 +Vc_INTRINSIC __m256 cmpge(__m256 a, __m256 b, float) { return AVX::cmpge_ps(a, b); } +Vc_INTRINSIC __m256d cmpge(__m256d a, __m256d b, double) { return AVX::cmpge_pd(a, b); } +Vc_INTRINSIC __m256i cmpge(__m256i a, __m256i b, int) { return not_(AVX::cmpgt_epi32(b, a)); } +Vc_INTRINSIC __m256i cmpge(__m256i a, __m256i b, uint) { return not_(AVX::cmpgt_epu32(b, a)); } +Vc_INTRINSIC __m256i cmpge(__m256i a, __m256i b, short) { return not_(AVX::cmpgt_epi16(b, a)); } +Vc_INTRINSIC __m256i cmpge(__m256i a, __m256i b, ushort) { return not_(AVX::cmpgt_epu16(b, a)); } +Vc_INTRINSIC __m256i cmpge(__m256i a, __m256i b, schar) { return not_(AVX::cmpgt_epi8 (b, a)); } +Vc_INTRINSIC __m256i cmpge(__m256i a, __m256i b, uchar) { return not_(AVX::cmpgt_epu8 (b, a)); } + +// cmple{{{1 +Vc_INTRINSIC __m256 cmple(__m256 a, __m256 b, float) { return AVX::cmple_ps(a, b); } +Vc_INTRINSIC __m256d cmple(__m256d a, __m256d b, double) { return AVX::cmple_pd(a, b); } +Vc_INTRINSIC __m256i cmple(__m256i a, __m256i b, int) { return not_(AVX::cmpgt_epi32(a, b)); } +Vc_INTRINSIC __m256i cmple(__m256i a, __m256i b, uint) { return not_(AVX::cmpgt_epu32(a, b)); } +Vc_INTRINSIC __m256i cmple(__m256i a, __m256i b, short) { return not_(AVX::cmpgt_epi16(a, b)); } +Vc_INTRINSIC __m256i cmple(__m256i a, __m256i b, ushort) { return not_(AVX::cmpgt_epu16(a, b)); } +Vc_INTRINSIC __m256i cmple(__m256i 
a, __m256i b, schar) { return not_(AVX::cmpgt_epi8 (a, b)); } +Vc_INTRINSIC __m256i cmple(__m256i a, __m256i b, uchar) { return not_(AVX::cmpgt_epu8 (a, b)); } + +// cmplt{{{1 +Vc_INTRINSIC __m256 cmplt(__m256 a, __m256 b, float) { return AVX::cmplt_ps(a, b); } +Vc_INTRINSIC __m256d cmplt(__m256d a, __m256d b, double) { return AVX::cmplt_pd(a, b); } +Vc_INTRINSIC __m256i cmplt(__m256i a, __m256i b, int) { return AVX::cmpgt_epi32(b, a); } +Vc_INTRINSIC __m256i cmplt(__m256i a, __m256i b, uint) { return AVX::cmpgt_epu32(b, a); } +Vc_INTRINSIC __m256i cmplt(__m256i a, __m256i b, short) { return AVX::cmpgt_epi16(b, a); } +Vc_INTRINSIC __m256i cmplt(__m256i a, __m256i b, ushort) { return AVX::cmpgt_epu16(b, a); } +Vc_INTRINSIC __m256i cmplt(__m256i a, __m256i b, schar) { return AVX::cmpgt_epi8 (b, a); } +Vc_INTRINSIC __m256i cmplt(__m256i a, __m256i b, uchar) { return AVX::cmpgt_epu8 (b, a); } + +// fma{{{1 +Vc_INTRINSIC __m256 fma(__m256 a, __m256 b, __m256 c, float) { +#ifdef Vc_IMPL_FMA4 + return _mm256_macc_ps(a, b, c); +#elif defined Vc_IMPL_FMA + return _mm256_fmadd_ps(a, b, c); +#else + using namespace AVX; + __m256d v1_0 = _mm256_cvtps_pd(lo128(a)); + __m256d v1_1 = _mm256_cvtps_pd(hi128(a)); + __m256d v2_0 = _mm256_cvtps_pd(lo128(b)); + __m256d v2_1 = _mm256_cvtps_pd(hi128(b)); + __m256d v3_0 = _mm256_cvtps_pd(lo128(c)); + __m256d v3_1 = _mm256_cvtps_pd(hi128(c)); + return concat(_mm256_cvtpd_ps(_mm256_add_pd(_mm256_mul_pd(v1_0, v2_0), v3_0)), + _mm256_cvtpd_ps(_mm256_add_pd(_mm256_mul_pd(v1_1, v2_1), v3_1))); +#endif +} +Vc_INTRINSIC __m256d fma(__m256d a, __m256d b, __m256d c, double) +{ +#ifdef Vc_IMPL_FMA4 + return _mm256_macc_pd(a, b, c); +#elif defined Vc_IMPL_FMA + return _mm256_fmadd_pd(a, b, c); +#else + using namespace AVX; + __m256d h1 = and_(a, _mm256_broadcast_sd(reinterpret_cast( + &c_general::highMaskDouble))); + __m256d h2 = and_(b, _mm256_broadcast_sd(reinterpret_cast( + &c_general::highMaskDouble))); + const __m256d l1 = _mm256_sub_pd(a, h1); + const __m256d l2 = _mm256_sub_pd(b, h2); + const __m256d ll = mul(l1, l2, double()); + const __m256d lh = add(mul(l1, h2, double()), mul(h1, l2, double()), double()); + const __m256d hh = mul(h1, h2, double()); + // ll < lh < hh for all entries is certain + const __m256d lh_lt_v3 = cmplt(abs(lh, double()), abs(c, double()), double()); // |lh| < |c| + const __m256d x = _mm256_blendv_pd(c, lh, lh_lt_v3); + const __m256d y = _mm256_blendv_pd(lh, c, lh_lt_v3); + return add(add(ll, x, double()), add(y, hh, double()), double()); +#endif +} +template Vc_INTRINSIC __m256i fma(__m256i a, __m256i b, __m256i c, T) +{ + return add(mul(a, b, T()), c, T()); +} + +// shiftRight{{{1 +template Vc_INTRINSIC __m256i shiftRight(__m256i a, int) { return AVX::srai_epi32(a); } +template Vc_INTRINSIC __m256i shiftRight(__m256i a, uint) { return AVX::srli_epi32(a); } +template Vc_INTRINSIC __m256i shiftRight(__m256i a, short) { return AVX::srai_epi16(a); } +template Vc_INTRINSIC __m256i shiftRight(__m256i a, ushort) { return AVX::srli_epi16(a); } +//template Vc_INTRINSIC __m256i shiftRight(__m256i a, schar) { return AVX::srai_epi8 (a); } +//template Vc_INTRINSIC __m256i shiftRight(__m256i a, uchar) { return AVX::srli_epi8 (a); } + +Vc_INTRINSIC __m256i shiftRight(__m256i a, int shift, int) { return AVX::sra_epi32(a, _mm_cvtsi32_si128(shift)); } +Vc_INTRINSIC __m256i shiftRight(__m256i a, int shift, uint) { return AVX::srl_epi32(a, _mm_cvtsi32_si128(shift)); } +Vc_INTRINSIC __m256i shiftRight(__m256i a, int shift, short) { return AVX::sra_epi16(a, 
_mm_cvtsi32_si128(shift)); } +Vc_INTRINSIC __m256i shiftRight(__m256i a, int shift, ushort) { return AVX::srl_epi16(a, _mm_cvtsi32_si128(shift)); } +//Vc_INTRINSIC __m256i shiftRight(__m256i a, int shift, schar) { return AVX::sra_epi8 (a, _mm_cvtsi32_si128(shift)); } +//Vc_INTRINSIC __m256i shiftRight(__m256i a, int shift, uchar) { return AVX::srl_epi8 (a, _mm_cvtsi32_si128(shift)); } + +// shiftLeft{{{1 +template Vc_INTRINSIC __m256i shiftLeft(__m256i a, int) { return AVX::slli_epi32(a); } +template Vc_INTRINSIC __m256i shiftLeft(__m256i a, uint) { return AVX::slli_epi32(a); } +template Vc_INTRINSIC __m256i shiftLeft(__m256i a, short) { return AVX::slli_epi16(a); } +template Vc_INTRINSIC __m256i shiftLeft(__m256i a, ushort) { return AVX::slli_epi16(a); } +//template Vc_INTRINSIC __m256i shiftLeft(__m256i a, schar) { return AVX::slli_epi8 (a); } +//template Vc_INTRINSIC __m256i shiftLeft(__m256i a, uchar) { return AVX::slli_epi8 (a); } + +Vc_INTRINSIC __m256i shiftLeft(__m256i a, int shift, int) { return AVX::sll_epi32(a, _mm_cvtsi32_si128(shift)); } +Vc_INTRINSIC __m256i shiftLeft(__m256i a, int shift, uint) { return AVX::sll_epi32(a, _mm_cvtsi32_si128(shift)); } +Vc_INTRINSIC __m256i shiftLeft(__m256i a, int shift, short) { return AVX::sll_epi16(a, _mm_cvtsi32_si128(shift)); } +Vc_INTRINSIC __m256i shiftLeft(__m256i a, int shift, ushort) { return AVX::sll_epi16(a, _mm_cvtsi32_si128(shift)); } +//Vc_INTRINSIC __m256i shiftLeft(__m256i a, int shift, schar) { return AVX::sll_epi8 (a, _mm_cvtsi32_si128(shift)); } +//Vc_INTRINSIC __m256i shiftLeft(__m256i a, int shift, uchar) { return AVX::sll_epi8 (a, _mm_cvtsi32_si128(shift)); } + +// zeroExtendIfNeeded{{{1 +Vc_INTRINSIC __m256 zeroExtendIfNeeded(__m256 x) { return x; } +Vc_INTRINSIC __m256d zeroExtendIfNeeded(__m256d x) { return x; } +Vc_INTRINSIC __m256i zeroExtendIfNeeded(__m256i x) { return x; } +Vc_INTRINSIC __m256 zeroExtendIfNeeded(__m128 x) { return AVX::zeroExtend(x); } +Vc_INTRINSIC __m256d zeroExtendIfNeeded(__m128d x) { return AVX::zeroExtend(x); } +Vc_INTRINSIC __m256i zeroExtendIfNeeded(__m128i x) { return AVX::zeroExtend(x); } + +// broadcast{{{1 +Vc_INTRINSIC __m256 avx_broadcast( float x) { return _mm256_set1_ps(x); } +Vc_INTRINSIC __m256d avx_broadcast(double x) { return _mm256_set1_pd(x); } +Vc_INTRINSIC __m256i avx_broadcast( int x) { return _mm256_set1_epi32(x); } +Vc_INTRINSIC __m256i avx_broadcast( uint x) { return _mm256_set1_epi32(x); } +Vc_INTRINSIC __m256i avx_broadcast( short x) { return _mm256_set1_epi16(x); } +Vc_INTRINSIC __m256i avx_broadcast(ushort x) { return _mm256_set1_epi16(x); } +Vc_INTRINSIC __m256i avx_broadcast( char x) { return _mm256_set1_epi8(x); } +Vc_INTRINSIC __m256i avx_broadcast( schar x) { return _mm256_set1_epi8(x); } +Vc_INTRINSIC __m256i avx_broadcast( uchar x) { return _mm256_set1_epi8(x); } + +// sorted{{{1 +template = AVXImpl && Impl <= AVX2Impl)>> +Vc_CONST_L AVX2::Vector sorted(AVX2::Vector x) Vc_CONST_R; +template Vc_INTRINSIC Vc_CONST AVX2::Vector sorted(AVX2::Vector x) +{ + return sorted(x); +} + +// shifted{{{1 +template +static Vc_INTRINSIC Vc_CONST enable_if<(sizeof(V) == 32), V> shifted(V v, int amount) +{ + using namespace AVX; + constexpr int S = sizeof(T); + switch (amount) { + case 0: return v; + case 1: return shifted( 1 * S)>(v); + case 2: return shifted( 2 * S)>(v); + case 3: return shifted( 3 * S)>(v); + case -1: return shifted(-1 * S)>(v); + case -2: return shifted(-2 * S)>(v); + case -3: return shifted(-3 * S)>(v); + } + if (sizeof(T) <= 4) { + switch (amount) { + 
case 4: return shifted( 4 * S)>(v); + case 5: return shifted( 5 * S)>(v); + case 6: return shifted( 6 * S)>(v); + case 7: return shifted( 7 * S)>(v); + case -4: return shifted(-4 * S)>(v); + case -5: return shifted(-5 * S)>(v); + case -6: return shifted(-6 * S)>(v); + case -7: return shifted(-7 * S)>(v); + } + if (sizeof(T) <= 2) { + switch (amount) { + case 8: return shifted( 8 * S)>(v); + case 9: return shifted( 9 * S)>(v); + case 10: return shifted( 10 * S)>(v); + case 11: return shifted( 11 * S)>(v); + case 12: return shifted( 12 * S)>(v); + case 13: return shifted( 13 * S)>(v); + case 14: return shifted( 14 * S)>(v); + case 15: return shifted( 15 * S)>(v); + case -8: return shifted(- 8 * S)>(v); + case -9: return shifted(- 9 * S)>(v); + case -10: return shifted(-10 * S)>(v); + case -11: return shifted(-11 * S)>(v); + case -12: return shifted(-12 * S)>(v); + case -13: return shifted(-13 * S)>(v); + case -14: return shifted(-14 * S)>(v); + case -15: return shifted(-15 * S)>(v); + } + if (sizeof(T) == 1) { + switch (amount) { + case 16: return shifted( 16)>(v); + case 17: return shifted( 17)>(v); + case 18: return shifted( 18)>(v); + case 19: return shifted( 19)>(v); + case 20: return shifted( 20)>(v); + case 21: return shifted( 21)>(v); + case 22: return shifted( 22)>(v); + case 23: return shifted( 23)>(v); + case 24: return shifted( 24)>(v); + case 25: return shifted( 25)>(v); + case 26: return shifted( 26)>(v); + case 27: return shifted( 27)>(v); + case 28: return shifted( 28)>(v); + case 29: return shifted( 29)>(v); + case 30: return shifted( 30)>(v); + case 31: return shifted( 31)>(v); + case -16: return shifted(-16)>(v); + case -17: return shifted(-17)>(v); + case -18: return shifted(-18)>(v); + case -19: return shifted(-19)>(v); + case -20: return shifted(-20)>(v); + case -21: return shifted(-21)>(v); + case -22: return shifted(-22)>(v); + case -23: return shifted(-23)>(v); + case -24: return shifted(-24)>(v); + case -25: return shifted(-25)>(v); + case -26: return shifted(-26)>(v); + case -27: return shifted(-27)>(v); + case -28: return shifted(-28)>(v); + case -29: return shifted(-29)>(v); + case -30: return shifted(-30)>(v); + case -31: return shifted(-31)>(v); + } + } + } + } + return avx_cast(_mm256_setzero_ps()); +} + +template +static Vc_INTRINSIC Vc_CONST enable_if<(sizeof(V) == 16), V> shifted(V v, int amount) +{ + using namespace AVX; + switch (amount) { + case 0: return v; + case 1: return avx_cast(_mm_srli_si128(avx_cast<__m128i>(v), sanitize(1 * sizeof(T)))); + case 2: return avx_cast(_mm_srli_si128(avx_cast<__m128i>(v), sanitize(2 * sizeof(T)))); + case 3: return avx_cast(_mm_srli_si128(avx_cast<__m128i>(v), sanitize(3 * sizeof(T)))); + case -1: return avx_cast(_mm_slli_si128(avx_cast<__m128i>(v), sanitize(1 * sizeof(T)))); + case -2: return avx_cast(_mm_slli_si128(avx_cast<__m128i>(v), sanitize(2 * sizeof(T)))); + case -3: return avx_cast(_mm_slli_si128(avx_cast<__m128i>(v), sanitize(3 * sizeof(T)))); + } + if (sizeof(T) <= 2) { + switch (amount) { + case 4: return avx_cast(_mm_srli_si128(avx_cast<__m128i>(v), sanitize(4 * sizeof(T)))); + case 5: return avx_cast(_mm_srli_si128(avx_cast<__m128i>(v), sanitize(5 * sizeof(T)))); + case 6: return avx_cast(_mm_srli_si128(avx_cast<__m128i>(v), sanitize(6 * sizeof(T)))); + case 7: return avx_cast(_mm_srli_si128(avx_cast<__m128i>(v), sanitize(7 * sizeof(T)))); + case -4: return avx_cast(_mm_slli_si128(avx_cast<__m128i>(v), sanitize(4 * sizeof(T)))); + case -5: return avx_cast(_mm_slli_si128(avx_cast<__m128i>(v), sanitize(5 * 
sizeof(T)))); + case -6: return avx_cast(_mm_slli_si128(avx_cast<__m128i>(v), sanitize(6 * sizeof(T)))); + case -7: return avx_cast(_mm_slli_si128(avx_cast<__m128i>(v), sanitize(7 * sizeof(T)))); + } + } + return avx_cast(_mm_setzero_ps()); +} +// rotated{{{1 +template +static Vc_INTRINSIC Vc_CONST enable_if<(sizeof(V) == 32 && N == 4), V> rotated(V v, + int amount) +{ + using namespace AVX; + const __m128i vLo = avx_cast<__m128i>(lo128(v)); + const __m128i vHi = avx_cast<__m128i>(hi128(v)); + switch (static_cast(amount) % N) { + case 0: + return v; + case 1: + return avx_cast(concat(SSE::alignr_epi8(vHi, vLo), + SSE::alignr_epi8(vLo, vHi))); + case 2: + return Mem::permute128(v); + case 3: + return avx_cast(concat(SSE::alignr_epi8(vLo, vHi), + SSE::alignr_epi8(vHi, vLo))); + } + return avx_cast(_mm256_setzero_ps()); +} + +template +static Vc_INTRINSIC Vc_CONST enable_if<(sizeof(V) == 32 && N == 8), V> rotated(V v, + int amount) +{ + using namespace AVX; + const __m128i vLo = avx_cast<__m128i>(lo128(v)); + const __m128i vHi = avx_cast<__m128i>(hi128(v)); + switch (static_cast(amount) % N) { + case 0: + return v; + case 1: + return avx_cast(concat(SSE::alignr_epi8<1 * sizeof(T)>(vHi, vLo), + SSE::alignr_epi8<1 * sizeof(T)>(vLo, vHi))); + case 2: + return avx_cast(concat(SSE::alignr_epi8<2 * sizeof(T)>(vHi, vLo), + SSE::alignr_epi8<2 * sizeof(T)>(vLo, vHi))); + case 3: + return avx_cast(concat(SSE::alignr_epi8<3 * sizeof(T)>(vHi, vLo), + SSE::alignr_epi8<3 * sizeof(T)>(vLo, vHi))); + case 4: + return Mem::permute128(v); + case 5: + return avx_cast(concat(SSE::alignr_epi8<1 * sizeof(T)>(vLo, vHi), + SSE::alignr_epi8<1 * sizeof(T)>(vHi, vLo))); + case 6: + return avx_cast(concat(SSE::alignr_epi8<2 * sizeof(T)>(vLo, vHi), + SSE::alignr_epi8<2 * sizeof(T)>(vHi, vLo))); + case 7: + return avx_cast(concat(SSE::alignr_epi8<3 * sizeof(T)>(vLo, vHi), + SSE::alignr_epi8<3 * sizeof(T)>(vHi, vLo))); + } + return avx_cast(_mm256_setzero_ps()); +} + +#ifdef Vc_IMPL_AVX2 +template +static Vc_INTRINSIC Vc_CONST enable_if<(sizeof(V) == 32 && N == 16), V> rotated( + V v, int amount) +{ + using namespace AVX; + const __m128i vLo = avx_cast<__m128i>(lo128(v)); + const __m128i vHi = avx_cast<__m128i>(hi128(v)); + switch (static_cast(amount) % N) { + case 0: + return v; + case 1: + return avx_cast(concat(SSE::alignr_epi8<1 * sizeof(T)>(vHi, vLo), + SSE::alignr_epi8<1 * sizeof(T)>(vLo, vHi))); + case 2: + return avx_cast(concat(SSE::alignr_epi8<2 * sizeof(T)>(vHi, vLo), + SSE::alignr_epi8<2 * sizeof(T)>(vLo, vHi))); + case 3: + return avx_cast(concat(SSE::alignr_epi8<3 * sizeof(T)>(vHi, vLo), + SSE::alignr_epi8<3 * sizeof(T)>(vLo, vHi))); + case 4: + return Mem::permute4x64(v); + case 5: + return avx_cast(concat(SSE::alignr_epi8<5 * sizeof(T)>(vHi, vLo), + SSE::alignr_epi8<5 * sizeof(T)>(vLo, vHi))); + case 6: + return avx_cast(concat(SSE::alignr_epi8<6 * sizeof(T)>(vHi, vLo), + SSE::alignr_epi8<6 * sizeof(T)>(vLo, vHi))); + case 7: + return avx_cast(concat(SSE::alignr_epi8<7 * sizeof(T)>(vHi, vLo), + SSE::alignr_epi8<7 * sizeof(T)>(vLo, vHi))); + case 8: + return Mem::permute128(v); + case 9: + return avx_cast(concat(SSE::alignr_epi8<1 * sizeof(T)>(vLo, vHi), + SSE::alignr_epi8<1 * sizeof(T)>(vHi, vLo))); + case 10: + return avx_cast(concat(SSE::alignr_epi8<2 * sizeof(T)>(vLo, vHi), + SSE::alignr_epi8<2 * sizeof(T)>(vHi, vLo))); + case 11: + return avx_cast(concat(SSE::alignr_epi8<3 * sizeof(T)>(vLo, vHi), + SSE::alignr_epi8<3 * sizeof(T)>(vHi, vLo))); + case 12: + return Mem::permute4x64(v); + case 13: + return 
avx_cast(concat(SSE::alignr_epi8<5 * sizeof(T)>(vLo, vHi), + SSE::alignr_epi8<5 * sizeof(T)>(vHi, vLo))); + case 14: + return avx_cast(concat(SSE::alignr_epi8<6 * sizeof(T)>(vLo, vHi), + SSE::alignr_epi8<6 * sizeof(T)>(vHi, vLo))); + case 15: + return avx_cast(concat(SSE::alignr_epi8<7 * sizeof(T)>(vLo, vHi), + SSE::alignr_epi8<7 * sizeof(T)>(vHi, vLo))); + } + return avx_cast(_mm256_setzero_ps()); +} +#endif // Vc_IMPL_AVX2 + +// testc{{{1 +Vc_INTRINSIC Vc_CONST int testc(__m128 a, __m128 b) { return _mm_testc_si128(_mm_castps_si128(a), _mm_castps_si128(b)); } +Vc_INTRINSIC Vc_CONST int testc(__m256 a, __m256 b) { return _mm256_testc_ps(a, b); } +Vc_INTRINSIC Vc_CONST int testc(__m256d a, __m256d b) { return _mm256_testc_pd(a, b); } +Vc_INTRINSIC Vc_CONST int testc(__m256i a, __m256i b) { return _mm256_testc_si256(a, b); } + +// testz{{{1 +Vc_INTRINSIC Vc_CONST int testz(__m128 a, __m128 b) { return _mm_testz_si128(_mm_castps_si128(a), _mm_castps_si128(b)); } +Vc_INTRINSIC Vc_CONST int testz(__m256 a, __m256 b) { return _mm256_testz_ps(a, b); } +Vc_INTRINSIC Vc_CONST int testz(__m256d a, __m256d b) { return _mm256_testz_pd(a, b); } +Vc_INTRINSIC Vc_CONST int testz(__m256i a, __m256i b) { return _mm256_testz_si256(a, b); } + +// testnzc{{{1 +Vc_INTRINSIC Vc_CONST int testnzc(__m128 a, __m128 b) { return _mm_testnzc_si128(_mm_castps_si128(a), _mm_castps_si128(b)); } +Vc_INTRINSIC Vc_CONST int testnzc(__m256 a, __m256 b) { return _mm256_testnzc_ps(a, b); } +Vc_INTRINSIC Vc_CONST int testnzc(__m256d a, __m256d b) { return _mm256_testnzc_pd(a, b); } +Vc_INTRINSIC Vc_CONST int testnzc(__m256i a, __m256i b) { return _mm256_testnzc_si256(a, b); } + +// movemask{{{1 +Vc_INTRINSIC Vc_CONST int movemask(__m256i a) { return AVX::movemask_epi8(a); } +Vc_INTRINSIC Vc_CONST int movemask(__m128i a) { return _mm_movemask_epi8(a); } +Vc_INTRINSIC Vc_CONST int movemask(__m256d a) { return _mm256_movemask_pd(a); } +Vc_INTRINSIC Vc_CONST int movemask(__m128d a) { return _mm_movemask_pd(a); } +Vc_INTRINSIC Vc_CONST int movemask(__m256 a) { return _mm256_movemask_ps(a); } +Vc_INTRINSIC Vc_CONST int movemask(__m128 a) { return _mm_movemask_ps(a); } + +// mask_store{{{1 +template +Vc_INTRINSIC void mask_store(__m256i k, bool *mem, Flags) +{ + static_assert( + N == 4 || N == 8 || N == 16, + "mask_store(__m256i, bool *) is only implemented for 4, 8, and 16 entries"); + switch (N) { + case 4: + *reinterpret_cast *>(mem) = + (_mm_movemask_epi8(AVX::lo128(k)) | + (_mm_movemask_epi8(AVX::hi128(k)) << 16)) & + 0x01010101; + break; + case 8: { + const auto k2 = _mm_srli_epi16(_mm_packs_epi16(AVX::lo128(k), AVX::hi128(k)), 15); + const auto k3 = _mm_packs_epi16(k2, _mm_setzero_si128()); +#ifdef __x86_64__ + *reinterpret_cast *>(mem) = _mm_cvtsi128_si64(k3); +#else + *reinterpret_cast *>(mem) = _mm_cvtsi128_si32(k3); + *reinterpret_cast *>(mem + 4) = _mm_extract_epi32(k3, 1); +#endif + } break; + case 16: { + const auto bools = Detail::and_(AVX::_mm_setone_epu8(), + _mm_packs_epi16(AVX::lo128(k), AVX::hi128(k))); + if (Flags::IsAligned) { + _mm_store_si128(reinterpret_cast<__m128i *>(mem), bools); + } else { + _mm_storeu_si128(reinterpret_cast<__m128i *>(mem), bools); + } + } break; + default: + Vc_UNREACHABLE(); + } +} + +// mask_load{{{1 +template +Vc_INTRINSIC R mask_load(const bool *mem, Flags, + enable_if::value> = nullarg) +{ + static_assert(N == 4 || N == 8, + "mask_load<__m128>(const bool *) is only implemented for 4, 8 entries"); + switch (N) { + case 4: { + __m128i k = _mm_cvtsi32_si128(*reinterpret_cast 
*>(mem)); + k = _mm_unpacklo_epi8(k, k); + k = _mm_unpacklo_epi16(k, k); + k = _mm_cmpgt_epi32(k, _mm_setzero_si128()); + return AVX::avx_cast<__m128>(k); + } + case 8: { +#ifdef __x86_64__ + __m128i k = _mm_cvtsi64_si128(*reinterpret_cast *>(mem)); +#else + __m128i k = _mm_castpd_si128( + _mm_load_sd(reinterpret_cast *>(mem))); +#endif + return AVX::avx_cast<__m128>( + _mm_cmpgt_epi16(_mm_unpacklo_epi8(k, k), _mm_setzero_si128())); + } + default: + Vc_UNREACHABLE(); + } +} + +template +Vc_INTRINSIC R mask_load(const bool *mem, Flags, + enable_if::value> = nullarg) +{ + static_assert( + N == 4 || N == 8 || N == 16, + "mask_load<__m256>(const bool *) is only implemented for 4, 8, and 16 entries"); + switch (N) { + case 4: { + __m128i k = AVX::avx_cast<__m128i>(_mm_and_ps( + _mm_set1_ps(*reinterpret_cast *>(mem)), + AVX::avx_cast<__m128>(_mm_setr_epi32(0x1, 0x100, 0x10000, 0x1000000)))); + k = _mm_cmpgt_epi32(k, _mm_setzero_si128()); + return AVX::avx_cast<__m256>( + AVX::concat(_mm_unpacklo_epi32(k, k), _mm_unpackhi_epi32(k, k))); + } + case 8: { +#ifdef __x86_64__ + __m128i k = _mm_cvtsi64_si128(*reinterpret_cast *>(mem)); +#else + __m128i k = _mm_castpd_si128( + _mm_load_sd(reinterpret_cast *>(mem))); +#endif + k = _mm_cmpgt_epi16(_mm_unpacklo_epi8(k, k), _mm_setzero_si128()); + return AVX::avx_cast<__m256>( + AVX::concat(_mm_unpacklo_epi16(k, k), _mm_unpackhi_epi16(k, k))); + } + case 16: { + const auto k128 = _mm_cmpgt_epi8( + Flags::IsAligned ? _mm_load_si128(reinterpret_cast(mem)) + : _mm_loadu_si128(reinterpret_cast(mem)), + _mm_setzero_si128()); + return AVX::avx_cast<__m256>( + AVX::concat(_mm_unpacklo_epi8(k128, k128), _mm_unpackhi_epi8(k128, k128))); + } + default: + Vc_UNREACHABLE(); + } +} + +// mask_to_int{{{1 +template +Vc_INTRINSIC_L Vc_CONST_L int mask_to_int(__m256i x) Vc_INTRINSIC_R Vc_CONST_R; +template <> Vc_INTRINSIC Vc_CONST int mask_to_int<4>(__m256i k) +{ + return movemask(AVX::avx_cast<__m256d>(k)); +} +template <> Vc_INTRINSIC Vc_CONST int mask_to_int<8>(__m256i k) +{ + return movemask(AVX::avx_cast<__m256>(k)); +} +#ifdef Vc_IMPL_BMI2 +template <> Vc_INTRINSIC Vc_CONST int mask_to_int<16>(__m256i k) +{ + return _pext_u32(movemask(k), 0x55555555u); +} +#endif +template <> Vc_INTRINSIC Vc_CONST int mask_to_int<32>(__m256i k) +{ + return movemask(k); +} + +//InterleaveImpl{{{1 +template struct InterleaveImpl { + template static inline void interleave(typename V::EntryType *const data, const I &i,/*{{{*/ + const typename V::AsArg v0, // a0 a1 a2 a3 a4 a5 a6 a7 | a8 a9 ... + const typename V::AsArg v1) // b0 b1 b2 b3 b4 b5 b6 b7 | b8 b9 ... + { + const __m256i tmp0 = AVX::unpacklo_epi16(v0.data(), v1.data()); // a0 b0 a1 b1 a2 b2 a3 b3 | a8 b8 a9 ... + const __m256i tmp1 = AVX::unpackhi_epi16(v0.data(), v1.data()); // a4 b4 a5 ... 
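        // After the two unpacks every 32-bit word of tmp0/tmp1 holds one interleaved
        // pair (a_n, b_n) of 16-bit values; the stores below pull those words out one
        // by one and scatter them, so each index i[n] receives its a/b pair as a
        // single 32-bit write.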
+ using namespace AVX; + *reinterpret_cast *>(&data[i[ 0]]) = _mm_cvtsi128_si32(lo128(tmp0)); + *reinterpret_cast *>(&data[i[ 1]]) = _mm_extract_epi32(lo128(tmp0), 1); + *reinterpret_cast *>(&data[i[ 2]]) = _mm_extract_epi32(lo128(tmp0), 2); + *reinterpret_cast *>(&data[i[ 3]]) = _mm_extract_epi32(lo128(tmp0), 3); + *reinterpret_cast *>(&data[i[ 4]]) = _mm_cvtsi128_si32(lo128(tmp1)); + *reinterpret_cast *>(&data[i[ 5]]) = _mm_extract_epi32(lo128(tmp1), 1); + *reinterpret_cast *>(&data[i[ 6]]) = _mm_extract_epi32(lo128(tmp1), 2); + *reinterpret_cast *>(&data[i[ 7]]) = _mm_extract_epi32(lo128(tmp1), 3); + *reinterpret_cast *>(&data[i[ 8]]) = _mm_cvtsi128_si32(hi128(tmp0)); + *reinterpret_cast *>(&data[i[ 9]]) = _mm_extract_epi32(hi128(tmp0), 1); + *reinterpret_cast *>(&data[i[10]]) = _mm_extract_epi32(hi128(tmp0), 2); + *reinterpret_cast *>(&data[i[11]]) = _mm_extract_epi32(hi128(tmp0), 3); + *reinterpret_cast *>(&data[i[12]]) = _mm_cvtsi128_si32(hi128(tmp1)); + *reinterpret_cast *>(&data[i[13]]) = _mm_extract_epi32(hi128(tmp1), 1); + *reinterpret_cast *>(&data[i[14]]) = _mm_extract_epi32(hi128(tmp1), 2); + *reinterpret_cast *>(&data[i[15]]) = _mm_extract_epi32(hi128(tmp1), 3); + }/*}}}*/ + static inline void interleave(typename V::EntryType *const data, const Common::SuccessiveEntries<2> &i,/*{{{*/ + const typename V::AsArg v0, const typename V::AsArg v1) + { + const __m256i tmp0 = AVX::unpacklo_epi16(v0.data(), v1.data()); // a0 b0 a1 b1 a2 b2 a3 b3 | a8 b8 a9 ... + const __m256i tmp1 = AVX::unpackhi_epi16(v0.data(), v1.data()); // a4 b4 a5 ... + V(Mem::shuffle128(tmp0, tmp1)).store(&data[i[0]], Vc::Unaligned); + V(Mem::shuffle128(tmp0, tmp1)).store(&data[i[8]], Vc::Unaligned); + }/*}}}*/ + template static inline void interleave(typename V::EntryType *const data, const I &i,/*{{{*/ + const typename V::AsArg v0, const typename V::AsArg v1, const typename V::AsArg v2) + { + interleave(data, i, v0, v1); + v2.scatter(data + 2, i); + }/*}}}*/ + template static inline void interleave(typename V::EntryType *const data, const I &i,/*{{{*/ + const typename V::AsArg v0, const typename V::AsArg v1, + const typename V::AsArg v2, const typename V::AsArg v3) + { + const __m256i tmp0 = AVX::unpacklo_epi16(v0.data(), v2.data()); // a0 c0 a1 c1 a2 c2 a3 c3 | a8 c8 a9 c9 ... + const __m256i tmp1 = AVX::unpackhi_epi16(v0.data(), v2.data()); // a4 c4 a5 c5 a6 c6 a7 c7 | a12 c12 ... + const __m256i tmp2 = AVX::unpacklo_epi16(v1.data(), v3.data()); // b0 d0 b1 d1 b2 d2 b3 d3 | b8 d8 b9 d9 ... + const __m256i tmp3 = AVX::unpackhi_epi16(v1.data(), v3.data()); // b4 d4 b5 ... + + const __m256i tmp4 = AVX::unpacklo_epi16(tmp0, tmp2); // a0 b0 c0 d0 a1 b1 c1 d1 | a8 b8 c8 d8 a9 b9 ... 
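        // Two rounds of 16-bit unpacking implement the 4-way interleave: the first
        // round pairs v0 with v2 and v1 with v3 (tmp0..tmp3), the second round merges
        // those pairs so that every 64-bit chunk of tmp4..tmp7 carries one complete
        // (a_n, b_n, c_n, d_n) group, ready to be written out per index below.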
+ const __m256i tmp5 = AVX::unpackhi_epi16(tmp0, tmp2); // [abcd]2 [abcd]3 | [abcd]10 [abcd]11 + const __m256i tmp6 = AVX::unpacklo_epi16(tmp1, tmp3); // [abcd]4 [abcd]5 | [abcd]12 [abcd]13 + const __m256i tmp7 = AVX::unpackhi_epi16(tmp1, tmp3); // [abcd]6 [abcd]7 | [abcd]14 [abcd]15 + + using namespace AVX; + auto &&store = [&](__m256i x, int offset) { + _mm_storel_epi64(reinterpret_cast<__m128i *>(&data[i[offset + 0]]), lo128(x)); + _mm_storel_epi64(reinterpret_cast<__m128i *>(&data[i[offset + 8]]), hi128(x)); + _mm_storeh_pi(reinterpret_cast<__m64 *>(&data[i[offset + 1]]), avx_cast<__m128>(x)); + _mm_storeh_pi(reinterpret_cast<__m64 *>(&data[i[offset + 9]]), avx_cast<__m128>(hi128(x))); + }; + store(tmp4, 0); + store(tmp5, 2); + store(tmp6, 4); + store(tmp7, 6); + }/*}}}*/ + static inline void interleave(typename V::EntryType *const data, const Common::SuccessiveEntries<4> &i,/*{{{*/ + const typename V::AsArg v0, const typename V::AsArg v1, + const typename V::AsArg v2, const typename V::AsArg v3) + { + const __m256i tmp0 = AVX::unpacklo_epi16(v0.data(), v2.data()); // a0 c0 a1 c1 a2 c2 a3 c3 | a8 c8 a9 c9 ... + const __m256i tmp1 = AVX::unpackhi_epi16(v0.data(), v2.data()); // a4 c4 a5 c5 a6 c6 a7 c7 | a12 c12 ... + const __m256i tmp2 = AVX::unpacklo_epi16(v1.data(), v3.data()); // b0 d0 b1 d1 b2 d2 b3 d3 | b8 d8 b9 d9 ... + const __m256i tmp3 = AVX::unpackhi_epi16(v1.data(), v3.data()); // b4 d4 b5 ... + + const __m256i tmp4 = AVX::unpacklo_epi16(tmp0, tmp2); // a0 b0 c0 d0 a1 b1 c1 d1 | a8 b8 c8 d8 a9 b9 ... + const __m256i tmp5 = AVX::unpackhi_epi16(tmp0, tmp2); // [abcd]2 [abcd]3 | [abcd]10 [abcd]11 + const __m256i tmp6 = AVX::unpacklo_epi16(tmp1, tmp3); // [abcd]4 [abcd]5 | [abcd]12 [abcd]13 + const __m256i tmp7 = AVX::unpackhi_epi16(tmp1, tmp3); // [abcd]6 [abcd]7 | [abcd]14 [abcd]15 + + V(Mem::shuffle128(tmp4, tmp5)).store(&data[i[0]], ::Vc::Unaligned); + V(Mem::shuffle128(tmp6, tmp7)).store(&data[i[4]], ::Vc::Unaligned); + V(Mem::shuffle128(tmp4, tmp5)).store(&data[i[8]], ::Vc::Unaligned); + V(Mem::shuffle128(tmp6, tmp7)).store(&data[i[12]], ::Vc::Unaligned); + }/*}}}*/ + template // interleave 5 args {{{2 + static inline void interleave(typename V::EntryType *const data, const I &i, + const typename V::AsArg v0, const typename V::AsArg v1, + const typename V::AsArg v2, const typename V::AsArg v3, + const typename V::AsArg v4) + { + interleave(data, i, v0, v1, v2, v3); + v4.scatter(data + 4, i); + } + template // interleave 6 args {{{2 + static inline void interleave(typename V::EntryType *const data, const I &i, + const typename V::AsArg v0, const typename V::AsArg v1, + const typename V::AsArg v2, const typename V::AsArg v3, + const typename V::AsArg v4, const typename V::AsArg v5) + { + interleave(data, i, v0, v1, v2, v3); + interleave(data + 4, i, v4, v5); + } + template // interleave 7 args {{{2 + static inline void interleave(typename V::EntryType *const data, const I &i, + const typename V::AsArg v0, const typename V::AsArg v1, + const typename V::AsArg v2, const typename V::AsArg v3, + const typename V::AsArg v4, const typename V::AsArg v5, + const typename V::AsArg v6) + { + interleave(data, i, v0, v1, v2, v3); + interleave(data + 4, i, v4, v5, v6); + } + template // interleave 8 args {{{2 + static inline void interleave(typename V::EntryType *const data, const I &i, + const typename V::AsArg v0, const typename V::AsArg v1, + const typename V::AsArg v2, const typename V::AsArg v3, + const typename V::AsArg v4, const typename V::AsArg v5, + const typename V::AsArg v6, 
const typename V::AsArg v7) + { + interleave(data, i, v0, v1, v2, v3); + interleave(data + 4, i, v4, v5, v6, v7); + } + //}}}2 + template static inline void deinterleave(typename V::EntryType const *const data,/*{{{*/ + const I &i, V &v0, V &v1) + { + const __m256i tmp4 = // a0 b0 a1 b1 a2 b2 a3 b3 | a8 b8 a9 b9 a10 b10 a11 b11 + _mm256_setr_epi32(*reinterpret_cast *>(&data[i[0]]), + *reinterpret_cast *>(&data[i[1]]), + *reinterpret_cast *>(&data[i[2]]), + *reinterpret_cast *>(&data[i[3]]), + *reinterpret_cast *>(&data[i[8]]), + *reinterpret_cast *>(&data[i[9]]), + *reinterpret_cast *>(&data[i[10]]), + *reinterpret_cast *>(&data[i[11]])); + const __m256i tmp5 = // a4 b4 a5 b5 a6 b6 a7 b7 | a12 b12 a13 b13 a14 b14 a15 b15 + _mm256_setr_epi32(*reinterpret_cast *>(&data[i[4]]), + *reinterpret_cast *>(&data[i[5]]), + *reinterpret_cast *>(&data[i[6]]), + *reinterpret_cast *>(&data[i[7]]), + *reinterpret_cast *>(&data[i[12]]), + *reinterpret_cast *>(&data[i[13]]), + *reinterpret_cast *>(&data[i[14]]), + *reinterpret_cast *>(&data[i[15]])); + + const __m256i tmp2 = AVX::unpacklo_epi16(tmp4, tmp5); // a0 a4 b0 b4 a1 a5 b1 b5 | a8 a12 b8 b12 a9 a13 b9 b13 + const __m256i tmp3 = AVX::unpackhi_epi16(tmp4, tmp5); // a2 a6 b2 b6 a3 a7 b3 b7 | a10 a14 b10 b14 a11 a15 b11 b15 + + const __m256i tmp0 = AVX::unpacklo_epi16(tmp2, tmp3); // a0 a2 a4 a6 b0 b2 b4 b6 | a8 a10 a12 a14 b8 ... + const __m256i tmp1 = AVX::unpackhi_epi16(tmp2, tmp3); // a1 a3 a5 a7 b1 b3 b5 b7 | a9 a11 a13 a15 b9 ... + + v0.data() = AVX::unpacklo_epi16(tmp0, tmp1); // a0 a1 a2 a3 a4 a5 a6 a7 | a8 a9 ... + v1.data() = AVX::unpackhi_epi16(tmp0, tmp1); // b0 b1 b2 b3 b4 b5 b6 b7 | b8 b9 ... + }/*}}}*/ + template static inline void deinterleave(typename V::EntryType const *const data,/*{{{*/ + const I &i, V &v0, V &v1, V &v2) + { + using namespace AVX; + const __m256i tmp0 = avx_cast<__m256i>(_mm256_setr_pd(*reinterpret_cast *>(&data[i[0]]), + *reinterpret_cast *>(&data[i[1]]), + *reinterpret_cast *>(&data[i[8]]), + *reinterpret_cast *>(&data[i[9]]))); + const __m256i tmp1 = avx_cast<__m256i>(_mm256_setr_pd(*reinterpret_cast *>(&data[i[2]]), + *reinterpret_cast *>(&data[i[3]]), + *reinterpret_cast *>(&data[i[10]]), + *reinterpret_cast *>(&data[i[11]]))); + const __m256i tmp2 = avx_cast<__m256i>(_mm256_setr_pd(*reinterpret_cast *>(&data[i[4]]), + *reinterpret_cast *>(&data[i[5]]), + *reinterpret_cast *>(&data[i[12]]), + *reinterpret_cast *>(&data[i[13]]))); + const __m256i tmp3 = avx_cast<__m256i>(_mm256_setr_pd(*reinterpret_cast *>(&data[i[6]]), + *reinterpret_cast *>(&data[i[7]]), + *reinterpret_cast *>(&data[i[14]]), + *reinterpret_cast *>(&data[i[15]]))); + const __m256i tmp4 = AVX::unpacklo_epi16(tmp0, tmp2); // a0 a4 b0 b4 c0 c4 XX XX | a8 a12 b8 ... + const __m256i tmp5 = AVX::unpackhi_epi16(tmp0, tmp2); // a1 a5 ... + const __m256i tmp6 = AVX::unpacklo_epi16(tmp1, tmp3); // a2 a6 ... + const __m256i tmp7 = AVX::unpackhi_epi16(tmp1, tmp3); // a3 a7 ... + + const __m256i tmp8 = AVX::unpacklo_epi16(tmp4, tmp6); // a0 a2 a4 a6 b0 ... + const __m256i tmp9 = AVX::unpackhi_epi16(tmp4, tmp6); // c0 c2 c4 c6 XX ... + const __m256i tmp10 = AVX::unpacklo_epi16(tmp5, tmp7); // a1 a3 a5 a7 b1 ... + const __m256i tmp11 = AVX::unpackhi_epi16(tmp5, tmp7); // c1 c3 c5 c7 XX ... + + v0.data() = AVX::unpacklo_epi16(tmp8, tmp10); // a0 a1 a2 a3 a4 a5 a6 a7 | a8 ... 
+ v1.data() = AVX::unpackhi_epi16(tmp8, tmp10); + v2.data() = AVX::unpacklo_epi16(tmp9, tmp11); + }/*}}}*/ + template static inline void deinterleave(typename V::EntryType const *const data,/*{{{*/ + const I &i, V &v0, V &v1, V &v2, V &v3) + { + using namespace AVX; + const __m256i tmp0 = avx_cast<__m256i>(_mm256_setr_pd(*reinterpret_cast *>(&data[i[0]]), + *reinterpret_cast *>(&data[i[1]]), + *reinterpret_cast *>(&data[i[8]]), + *reinterpret_cast *>(&data[i[9]]))); + const __m256i tmp1 = avx_cast<__m256i>(_mm256_setr_pd(*reinterpret_cast *>(&data[i[2]]), + *reinterpret_cast *>(&data[i[3]]), + *reinterpret_cast *>(&data[i[10]]), + *reinterpret_cast *>(&data[i[11]]))); + const __m256i tmp2 = avx_cast<__m256i>(_mm256_setr_pd(*reinterpret_cast *>(&data[i[4]]), + *reinterpret_cast *>(&data[i[5]]), + *reinterpret_cast *>(&data[i[12]]), + *reinterpret_cast *>(&data[i[13]]))); + const __m256i tmp3 = avx_cast<__m256i>(_mm256_setr_pd(*reinterpret_cast *>(&data[i[6]]), + *reinterpret_cast *>(&data[i[7]]), + *reinterpret_cast *>(&data[i[14]]), + *reinterpret_cast *>(&data[i[15]]))); + const __m256i tmp4 = AVX::unpacklo_epi16(tmp0, tmp2); // a0 a4 b0 b4 c0 c4 d0 d4 | a8 a12 b8 ... + const __m256i tmp5 = AVX::unpackhi_epi16(tmp0, tmp2); // a1 a5 ... + const __m256i tmp6 = AVX::unpacklo_epi16(tmp1, tmp3); // a2 a6 ... + const __m256i tmp7 = AVX::unpackhi_epi16(tmp1, tmp3); // a3 a7 ... + + const __m256i tmp8 = AVX::unpacklo_epi16(tmp4, tmp6); // a0 a2 a4 a6 b0 ... + const __m256i tmp9 = AVX::unpackhi_epi16(tmp4, tmp6); // c0 c2 c4 c6 d0 ... + const __m256i tmp10 = AVX::unpacklo_epi16(tmp5, tmp7); // a1 a3 a5 a7 b1 ... + const __m256i tmp11 = AVX::unpackhi_epi16(tmp5, tmp7); // c1 c3 c5 c7 d1 ... + + v0.data() = AVX::unpacklo_epi16(tmp8, tmp10); // a0 a1 a2 a3 a4 a5 a6 a7 | a8 ... + v1.data() = AVX::unpackhi_epi16(tmp8, tmp10); + v2.data() = AVX::unpacklo_epi16(tmp9, tmp11); + v3.data() = AVX::unpackhi_epi16(tmp9, tmp11); + }/*}}}*/ + template static inline void deinterleave(typename V::EntryType const *const data,/*{{{*/ + const I &i, V &v0, V &v1, V &v2, V &v3, V &v4) + { + using namespace AVX; + const __m256i a = concat(_mm_loadu_si128(reinterpret_cast(&data[i[0]])), + _mm_loadu_si128(reinterpret_cast(&data[i[8]]))); + const __m256i b = concat(_mm_loadu_si128(reinterpret_cast(&data[i[1]])), + _mm_loadu_si128(reinterpret_cast(&data[i[9]]))); + const __m256i c = concat(_mm_loadu_si128(reinterpret_cast(&data[i[2]])), + _mm_loadu_si128(reinterpret_cast(&data[i[10]]))); + const __m256i d = concat(_mm_loadu_si128(reinterpret_cast(&data[i[3]])), + _mm_loadu_si128(reinterpret_cast(&data[i[11]]))); + const __m256i e = concat(_mm_loadu_si128(reinterpret_cast(&data[i[4]])), + _mm_loadu_si128(reinterpret_cast(&data[i[12]]))); + const __m256i f = concat(_mm_loadu_si128(reinterpret_cast(&data[i[5]])), + _mm_loadu_si128(reinterpret_cast(&data[i[13]]))); + const __m256i g = concat(_mm_loadu_si128(reinterpret_cast(&data[i[6]])), + _mm_loadu_si128(reinterpret_cast(&data[i[14]]))); + const __m256i h = concat(_mm_loadu_si128(reinterpret_cast(&data[i[7]])), + _mm_loadu_si128(reinterpret_cast(&data[i[15]]))); + + const __m256i tmp2 = AVX::unpacklo_epi16(a, e); // a0 a4 b0 b4 c0 c4 d0 d4 | a8 ... 
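+        // The unpack cascade below is, in effect, an 8x8 transpose of 16-bit elements,
+        // performed independently in each 128-bit lane: the letters a..h in the comments
+        // name the eight interleaved members, the digits the index of the structure they
+        // were loaded from.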
+ const __m256i tmp4 = AVX::unpacklo_epi16(b, f); // a1 a5 b1 b5 c1 c5 d1 d5 + const __m256i tmp3 = AVX::unpacklo_epi16(c, g); // a2 a6 b2 b6 c2 c6 d2 d6 + const __m256i tmp5 = AVX::unpacklo_epi16(d, h); // a3 a7 b3 b7 c3 c7 d3 d7 + const __m256i tmp10 = AVX::unpackhi_epi16(a, e); // e0 e4 f0 f4 g0 g4 h0 h4 + const __m256i tmp11 = AVX::unpackhi_epi16(c, g); // e1 e5 f1 f5 g1 g5 h1 h5 + const __m256i tmp12 = AVX::unpackhi_epi16(b, f); // e2 e6 f2 f6 g2 g6 h2 h6 + const __m256i tmp13 = AVX::unpackhi_epi16(d, h); // e3 e7 f3 f7 g3 g7 h3 h7 + + const __m256i tmp0 = AVX::unpacklo_epi16(tmp2, tmp3); // a0 a2 a4 a6 b0 b2 b4 b6 | a8 ... + const __m256i tmp1 = AVX::unpacklo_epi16(tmp4, tmp5); // a1 a3 a5 a7 b1 b3 b5 b7 + const __m256i tmp6 = AVX::unpackhi_epi16(tmp2, tmp3); // c0 c2 c4 c6 d0 d2 d4 d6 + const __m256i tmp7 = AVX::unpackhi_epi16(tmp4, tmp5); // c1 c3 c5 c7 d1 d3 d5 d7 + const __m256i tmp8 = AVX::unpacklo_epi16(tmp10, tmp11); // e0 e2 e4 e6 f0 f2 f4 f6 + const __m256i tmp9 = AVX::unpacklo_epi16(tmp12, tmp13); // e1 e3 e5 e7 f1 f3 f5 f7 + + v0.data() = AVX::unpacklo_epi16(tmp0, tmp1); + v1.data() = AVX::unpackhi_epi16(tmp0, tmp1); + v2.data() = AVX::unpacklo_epi16(tmp6, tmp7); + v3.data() = AVX::unpackhi_epi16(tmp6, tmp7); + v4.data() = AVX::unpacklo_epi16(tmp8, tmp9); + }/*}}}*/ + template static inline void deinterleave(typename V::EntryType const *const data,/*{{{*/ + const I &i, V &v0, V &v1, V &v2, V &v3, V &v4, V &v5) + { + using namespace AVX; + const __m256i a = concat(_mm_loadu_si128(reinterpret_cast(&data[i[0]])), + _mm_loadu_si128(reinterpret_cast(&data[i[8]]))); + const __m256i b = concat(_mm_loadu_si128(reinterpret_cast(&data[i[1]])), + _mm_loadu_si128(reinterpret_cast(&data[i[9]]))); + const __m256i c = concat(_mm_loadu_si128(reinterpret_cast(&data[i[2]])), + _mm_loadu_si128(reinterpret_cast(&data[i[10]]))); + const __m256i d = concat(_mm_loadu_si128(reinterpret_cast(&data[i[3]])), + _mm_loadu_si128(reinterpret_cast(&data[i[11]]))); + const __m256i e = concat(_mm_loadu_si128(reinterpret_cast(&data[i[4]])), + _mm_loadu_si128(reinterpret_cast(&data[i[12]]))); + const __m256i f = concat(_mm_loadu_si128(reinterpret_cast(&data[i[5]])), + _mm_loadu_si128(reinterpret_cast(&data[i[13]]))); + const __m256i g = concat(_mm_loadu_si128(reinterpret_cast(&data[i[6]])), + _mm_loadu_si128(reinterpret_cast(&data[i[14]]))); + const __m256i h = concat(_mm_loadu_si128(reinterpret_cast(&data[i[7]])), + _mm_loadu_si128(reinterpret_cast(&data[i[15]]))); + + const __m256i tmp2 = AVX::unpacklo_epi16(a, e); // a0 a4 b0 b4 c0 c4 d0 d4 | a8 ... + const __m256i tmp4 = AVX::unpacklo_epi16(b, f); // a1 a5 b1 b5 c1 c5 d1 d5 + const __m256i tmp3 = AVX::unpacklo_epi16(c, g); // a2 a6 b2 b6 c2 c6 d2 d6 + const __m256i tmp5 = AVX::unpacklo_epi16(d, h); // a3 a7 b3 b7 c3 c7 d3 d7 + const __m256i tmp10 = AVX::unpackhi_epi16(a, e); // e0 e4 f0 f4 g0 g4 h0 h4 + const __m256i tmp11 = AVX::unpackhi_epi16(c, g); // e1 e5 f1 f5 g1 g5 h1 h5 + const __m256i tmp12 = AVX::unpackhi_epi16(b, f); // e2 e6 f2 f6 g2 g6 h2 h6 + const __m256i tmp13 = AVX::unpackhi_epi16(d, h); // e3 e7 f3 f7 g3 g7 h3 h7 + + const __m256i tmp0 = AVX::unpacklo_epi16(tmp2, tmp3); // a0 a2 a4 a6 b0 b2 b4 b6 | a8 ... 
+ const __m256i tmp1 = AVX::unpacklo_epi16(tmp4, tmp5); // a1 a3 a5 a7 b1 b3 b5 b7 + const __m256i tmp6 = AVX::unpackhi_epi16(tmp2, tmp3); // c0 c2 c4 c6 d0 d2 d4 d6 + const __m256i tmp7 = AVX::unpackhi_epi16(tmp4, tmp5); // c1 c3 c5 c7 d1 d3 d5 d7 + const __m256i tmp8 = AVX::unpacklo_epi16(tmp10, tmp11); // e0 e2 e4 e6 f0 f2 f4 f6 + const __m256i tmp9 = AVX::unpacklo_epi16(tmp12, tmp13); // e1 e3 e5 e7 f1 f3 f5 f7 + + v0.data() = AVX::unpacklo_epi16(tmp0, tmp1); + v1.data() = AVX::unpackhi_epi16(tmp0, tmp1); + v2.data() = AVX::unpacklo_epi16(tmp6, tmp7); + v3.data() = AVX::unpackhi_epi16(tmp6, tmp7); + v4.data() = AVX::unpacklo_epi16(tmp8, tmp9); + v5.data() = AVX::unpackhi_epi16(tmp8, tmp9); + }/*}}}*/ + template static inline void deinterleave(typename V::EntryType const *const data,/*{{{*/ + const I &i, V &v0, V &v1, V &v2, V &v3, V &v4, V &v5, V &v6) + { + using namespace AVX; + const __m256i a = concat(_mm_loadu_si128(reinterpret_cast(&data[i[0]])), + _mm_loadu_si128(reinterpret_cast(&data[i[8]]))); + const __m256i b = concat(_mm_loadu_si128(reinterpret_cast(&data[i[1]])), + _mm_loadu_si128(reinterpret_cast(&data[i[9]]))); + const __m256i c = concat(_mm_loadu_si128(reinterpret_cast(&data[i[2]])), + _mm_loadu_si128(reinterpret_cast(&data[i[10]]))); + const __m256i d = concat(_mm_loadu_si128(reinterpret_cast(&data[i[3]])), + _mm_loadu_si128(reinterpret_cast(&data[i[11]]))); + const __m256i e = concat(_mm_loadu_si128(reinterpret_cast(&data[i[4]])), + _mm_loadu_si128(reinterpret_cast(&data[i[12]]))); + const __m256i f = concat(_mm_loadu_si128(reinterpret_cast(&data[i[5]])), + _mm_loadu_si128(reinterpret_cast(&data[i[13]]))); + const __m256i g = concat(_mm_loadu_si128(reinterpret_cast(&data[i[6]])), + _mm_loadu_si128(reinterpret_cast(&data[i[14]]))); + const __m256i h = concat(_mm_loadu_si128(reinterpret_cast(&data[i[7]])), + _mm_loadu_si128(reinterpret_cast(&data[i[15]]))); + + const __m256i tmp2 = AVX::unpacklo_epi16(a, e); // a0 a4 b0 b4 c0 c4 d0 d4 | a8 ... + const __m256i tmp4 = AVX::unpacklo_epi16(b, f); // a1 a5 b1 b5 c1 c5 d1 d5 + const __m256i tmp3 = AVX::unpacklo_epi16(c, g); // a2 a6 b2 b6 c2 c6 d2 d6 + const __m256i tmp5 = AVX::unpacklo_epi16(d, h); // a3 a7 b3 b7 c3 c7 d3 d7 + const __m256i tmp10 = AVX::unpackhi_epi16(a, e); // e0 e4 f0 f4 g0 g4 h0 h4 + const __m256i tmp11 = AVX::unpackhi_epi16(c, g); // e1 e5 f1 f5 g1 g5 h1 h5 + const __m256i tmp12 = AVX::unpackhi_epi16(b, f); // e2 e6 f2 f6 g2 g6 h2 h6 + const __m256i tmp13 = AVX::unpackhi_epi16(d, h); // e3 e7 f3 f7 g3 g7 h3 h7 + + const __m256i tmp0 = AVX::unpacklo_epi16(tmp2, tmp3); // a0 a2 a4 a6 b0 b2 b4 b6 | a8 ... 
+ const __m256i tmp1 = AVX::unpacklo_epi16(tmp4, tmp5); // a1 a3 a5 a7 b1 b3 b5 b7 + const __m256i tmp6 = AVX::unpackhi_epi16(tmp2, tmp3); // c0 c2 c4 c6 d0 d2 d4 d6 + const __m256i tmp7 = AVX::unpackhi_epi16(tmp4, tmp5); // c1 c3 c5 c7 d1 d3 d5 d7 + const __m256i tmp8 = AVX::unpacklo_epi16(tmp10, tmp11); // e0 e2 e4 e6 f0 f2 f4 f6 + const __m256i tmp9 = AVX::unpacklo_epi16(tmp12, tmp13); // e1 e3 e5 e7 f1 f3 f5 f7 + const __m256i tmp14 = AVX::unpackhi_epi16(tmp10, tmp11); // g0 g2 g4 g6 h0 h2 h4 h6 + const __m256i tmp15 = AVX::unpackhi_epi16(tmp12, tmp13); // g1 g3 g5 g7 h1 h3 h5 h7 + + v0.data() = AVX::unpacklo_epi16(tmp0, tmp1); + v1.data() = AVX::unpackhi_epi16(tmp0, tmp1); + v2.data() = AVX::unpacklo_epi16(tmp6, tmp7); + v3.data() = AVX::unpackhi_epi16(tmp6, tmp7); + v4.data() = AVX::unpacklo_epi16(tmp8, tmp9); + v5.data() = AVX::unpackhi_epi16(tmp8, tmp9); + v6.data() = AVX::unpacklo_epi16(tmp14, tmp15); + }/*}}}*/ + template static inline void deinterleave(typename V::EntryType const *const data,/*{{{*/ + const I &i, V &v0, V &v1, V &v2, V &v3, V &v4, V &v5, V &v6, V &v7) + { + using namespace AVX; + const __m256i a = concat(_mm_loadu_si128(reinterpret_cast(&data[i[0]])), + _mm_loadu_si128(reinterpret_cast(&data[i[8]]))); + const __m256i b = concat(_mm_loadu_si128(reinterpret_cast(&data[i[1]])), + _mm_loadu_si128(reinterpret_cast(&data[i[9]]))); + const __m256i c = concat(_mm_loadu_si128(reinterpret_cast(&data[i[2]])), + _mm_loadu_si128(reinterpret_cast(&data[i[10]]))); + const __m256i d = concat(_mm_loadu_si128(reinterpret_cast(&data[i[3]])), + _mm_loadu_si128(reinterpret_cast(&data[i[11]]))); + const __m256i e = concat(_mm_loadu_si128(reinterpret_cast(&data[i[4]])), + _mm_loadu_si128(reinterpret_cast(&data[i[12]]))); + const __m256i f = concat(_mm_loadu_si128(reinterpret_cast(&data[i[5]])), + _mm_loadu_si128(reinterpret_cast(&data[i[13]]))); + const __m256i g = concat(_mm_loadu_si128(reinterpret_cast(&data[i[6]])), + _mm_loadu_si128(reinterpret_cast(&data[i[14]]))); + const __m256i h = concat(_mm_loadu_si128(reinterpret_cast(&data[i[7]])), + _mm_loadu_si128(reinterpret_cast(&data[i[15]]))); + + const __m256i tmp2 = AVX::unpacklo_epi16(a, e); // a0 a4 b0 b4 c0 c4 d0 d4 | a8 ... + const __m256i tmp4 = AVX::unpacklo_epi16(b, f); // a1 a5 b1 b5 c1 c5 d1 d5 + const __m256i tmp3 = AVX::unpacklo_epi16(c, g); // a2 a6 b2 b6 c2 c6 d2 d6 + const __m256i tmp5 = AVX::unpacklo_epi16(d, h); // a3 a7 b3 b7 c3 c7 d3 d7 + const __m256i tmp10 = AVX::unpackhi_epi16(a, e); // e0 e4 f0 f4 g0 g4 h0 h4 + const __m256i tmp11 = AVX::unpackhi_epi16(c, g); // e1 e5 f1 f5 g1 g5 h1 h5 + const __m256i tmp12 = AVX::unpackhi_epi16(b, f); // e2 e6 f2 f6 g2 g6 h2 h6 + const __m256i tmp13 = AVX::unpackhi_epi16(d, h); // e3 e7 f3 f7 g3 g7 h3 h7 + + const __m256i tmp0 = AVX::unpacklo_epi16(tmp2, tmp3); // a0 a2 a4 a6 b0 b2 b4 b6 | a8 ... 
+ const __m256i tmp1 = AVX::unpacklo_epi16(tmp4, tmp5); // a1 a3 a5 a7 b1 b3 b5 b7 + const __m256i tmp6 = AVX::unpackhi_epi16(tmp2, tmp3); // c0 c2 c4 c6 d0 d2 d4 d6 + const __m256i tmp7 = AVX::unpackhi_epi16(tmp4, tmp5); // c1 c3 c5 c7 d1 d3 d5 d7 + const __m256i tmp8 = AVX::unpacklo_epi16(tmp10, tmp11); // e0 e2 e4 e6 f0 f2 f4 f6 + const __m256i tmp9 = AVX::unpacklo_epi16(tmp12, tmp13); // e1 e3 e5 e7 f1 f3 f5 f7 + const __m256i tmp14 = AVX::unpackhi_epi16(tmp10, tmp11); // g0 g2 g4 g6 h0 h2 h4 h6 + const __m256i tmp15 = AVX::unpackhi_epi16(tmp12, tmp13); // g1 g3 g5 g7 h1 h3 h5 h7 + + v0.data() = AVX::unpacklo_epi16(tmp0, tmp1); + v1.data() = AVX::unpackhi_epi16(tmp0, tmp1); + v2.data() = AVX::unpacklo_epi16(tmp6, tmp7); + v3.data() = AVX::unpackhi_epi16(tmp6, tmp7); + v4.data() = AVX::unpacklo_epi16(tmp8, tmp9); + v5.data() = AVX::unpackhi_epi16(tmp8, tmp9); + v6.data() = AVX::unpacklo_epi16(tmp14, tmp15); + v7.data() = AVX::unpackhi_epi16(tmp14, tmp15); + }/*}}}*/ +}; +template struct InterleaveImpl { + template static inline void interleave(typename V::EntryType *const data, const I &i,/*{{{*/ + const typename V::AsArg v0, const typename V::AsArg v1) + { + using namespace AVX; + // [0a 1a 0b 1b 0e 1e 0f 1f]: + const m256 tmp0 = _mm256_unpacklo_ps(avx_cast(v0.data()), avx_cast(v1.data())); + // [0c 1c 0d 1d 0g 1g 0h 1h]: + const m256 tmp1 = _mm256_unpackhi_ps(avx_cast(v0.data()), avx_cast(v1.data())); + _mm_storel_pi(reinterpret_cast<__m64 *>(&data[i[0]]), lo128(tmp0)); + _mm_storeh_pi(reinterpret_cast<__m64 *>(&data[i[1]]), lo128(tmp0)); + _mm_storel_pi(reinterpret_cast<__m64 *>(&data[i[2]]), lo128(tmp1)); + _mm_storeh_pi(reinterpret_cast<__m64 *>(&data[i[3]]), lo128(tmp1)); + _mm_storel_pi(reinterpret_cast<__m64 *>(&data[i[4]]), hi128(tmp0)); + _mm_storeh_pi(reinterpret_cast<__m64 *>(&data[i[5]]), hi128(tmp0)); + _mm_storel_pi(reinterpret_cast<__m64 *>(&data[i[6]]), hi128(tmp1)); + _mm_storeh_pi(reinterpret_cast<__m64 *>(&data[i[7]]), hi128(tmp1)); + }/*}}}*/ + static inline void interleave(typename V::EntryType *const data, const Common::SuccessiveEntries<2> &i,/*{{{*/ + const typename V::AsArg v0, const typename V::AsArg v1) + { + using namespace AVX; + // [0a 1a 0b 1b 0e 1e 0f 1f]: + const m256 tmp0 = _mm256_unpacklo_ps(avx_cast(v0.data()), avx_cast(v1.data())); + // [0c 1c 0d 1d 0g 1g 0h 1h]: + const m256 tmp1 = _mm256_unpackhi_ps(avx_cast(v0.data()), avx_cast(v1.data())); + _mm_storeu_ps(reinterpret_cast *>(&data[i[0]]), lo128(tmp0)); + _mm_storeu_ps(reinterpret_cast *>(&data[i[2]]), lo128(tmp1)); + _mm_storeu_ps(reinterpret_cast *>(&data[i[4]]), hi128(tmp0)); + _mm_storeu_ps(reinterpret_cast *>(&data[i[6]]), hi128(tmp1)); + }/*}}}*/ + template static inline void interleave(typename V::EntryType *const data, const I &i,/*{{{*/ + const typename V::AsArg v0, const typename V::AsArg v1, const typename V::AsArg v2) + { + using namespace AVX; +#ifdef Vc_USE_MASKMOV_SCATTER + // [0a 2a 0b 2b 0e 2e 0f 2f]: + const m256 tmp0 = _mm256_unpacklo_ps(avx_cast(v0.data()), avx_cast(v2.data())); + // [0c 2c 0d 2d 0g 2g 0h 2h]: + const m256 tmp1 = _mm256_unpackhi_ps(avx_cast(v0.data()), avx_cast(v2.data())); + // [1a __ 1b __ 1e __ 1f __]: + const m256 tmp2 = _mm256_unpacklo_ps(avx_cast(v1.data()), avx_cast(v1.data())); + // [1c __ 1d __ 1g __ 1h __]: + const m256 tmp3 = _mm256_unpackhi_ps(avx_cast(v1.data()), avx_cast(v1.data())); + const m256 tmp4 = _mm256_unpacklo_ps(tmp0, tmp2); + const m256 tmp5 = _mm256_unpackhi_ps(tmp0, tmp2); + const m256 tmp6 = _mm256_unpacklo_ps(tmp1, tmp3); + const 
m256 tmp7 = _mm256_unpackhi_ps(tmp1, tmp3); + const m128i mask = _mm_set_epi32(0, -1, -1, -1); + _mm_maskstore_ps(reinterpret_cast *>(&data[i[0]]), mask, lo128(tmp4)); + _mm_maskstore_ps(reinterpret_cast *>(&data[i[1]]), mask, lo128(tmp5)); + _mm_maskstore_ps(reinterpret_cast *>(&data[i[2]]), mask, lo128(tmp6)); + _mm_maskstore_ps(reinterpret_cast *>(&data[i[3]]), mask, lo128(tmp7)); + _mm_maskstore_ps(reinterpret_cast *>(&data[i[4]]), mask, hi128(tmp4)); + _mm_maskstore_ps(reinterpret_cast *>(&data[i[5]]), mask, hi128(tmp5)); + _mm_maskstore_ps(reinterpret_cast *>(&data[i[6]]), mask, hi128(tmp6)); + _mm_maskstore_ps(reinterpret_cast *>(&data[i[7]]), mask, hi128(tmp7)); +#else + interleave(data, i, v0, v1); + v2.scatter(data + 2, i); +#endif + }/*}}}*/ + template static inline void interleave(typename V::EntryType *const data, const I &i,/*{{{*/ + const typename V::AsArg v0, const typename V::AsArg v1, + const typename V::AsArg v2, const typename V::AsArg v3) + { + using namespace AVX; + const m256 tmp0 = _mm256_unpacklo_ps(avx_cast(v0.data()), avx_cast(v2.data())); + const m256 tmp1 = _mm256_unpackhi_ps(avx_cast(v0.data()), avx_cast(v2.data())); + const m256 tmp2 = _mm256_unpacklo_ps(avx_cast(v1.data()), avx_cast(v3.data())); + const m256 tmp3 = _mm256_unpackhi_ps(avx_cast(v1.data()), avx_cast(v3.data())); + const m256 tmp4 = _mm256_unpacklo_ps(tmp0, tmp2); + const m256 tmp5 = _mm256_unpackhi_ps(tmp0, tmp2); + const m256 tmp6 = _mm256_unpacklo_ps(tmp1, tmp3); + const m256 tmp7 = _mm256_unpackhi_ps(tmp1, tmp3); + _mm_storeu_ps(reinterpret_cast *>(&data[i[0]]), lo128(tmp4)); + _mm_storeu_ps(reinterpret_cast *>(&data[i[1]]), lo128(tmp5)); + _mm_storeu_ps(reinterpret_cast *>(&data[i[2]]), lo128(tmp6)); + _mm_storeu_ps(reinterpret_cast *>(&data[i[3]]), lo128(tmp7)); + _mm_storeu_ps(reinterpret_cast *>(&data[i[4]]), hi128(tmp4)); + _mm_storeu_ps(reinterpret_cast *>(&data[i[5]]), hi128(tmp5)); + _mm_storeu_ps(reinterpret_cast *>(&data[i[6]]), hi128(tmp6)); + _mm_storeu_ps(reinterpret_cast *>(&data[i[7]]), hi128(tmp7)); + }/*}}}*/ + template // interleave 5 args {{{2 + static inline void interleave(typename V::EntryType *const data, const I &i, + const typename V::AsArg v0, const typename V::AsArg v1, + const typename V::AsArg v2, const typename V::AsArg v3, + const typename V::AsArg v4) + { + interleave(data, i, v0, v1, v2, v3); + v4.scatter(data + 4, i); + } + template // interleave 6 args {{{2 + static inline void interleave(typename V::EntryType *const data, const I &i, + const typename V::AsArg v0, const typename V::AsArg v1, + const typename V::AsArg v2, const typename V::AsArg v3, + const typename V::AsArg v4, const typename V::AsArg v5) + { + interleave(data, i, v0, v1, v2, v3); + interleave(data + 4, i, v4, v5); + } + template // interleave 7 args {{{2 + static inline void interleave(typename V::EntryType *const data, const I &i, + const typename V::AsArg v0, const typename V::AsArg v1, + const typename V::AsArg v2, const typename V::AsArg v3, + const typename V::AsArg v4, const typename V::AsArg v5, + const typename V::AsArg v6) + { + interleave(data, i, v0, v1, v2, v3); + interleave(data + 4, i, v4, v5, v6); + } + template // interleave 8 args {{{2 + static inline void interleave(typename V::EntryType *const data, const I &i, + const typename V::AsArg v0, const typename V::AsArg v1, + const typename V::AsArg v2, const typename V::AsArg v3, + const typename V::AsArg v4, const typename V::AsArg v5, + const typename V::AsArg v6, const typename V::AsArg v7) + { + interleave(data, i, v0, 
v1, v2, v3); + interleave(data + 4, i, v4, v5, v6, v7); + } + //}}}2 + template static inline void deinterleave(typename V::EntryType const *const data,/*{{{*/ + const I &i, V &v0, V &v1) + { + using namespace AVX; + const m128 il0 = _mm_loadl_pi(_mm_setzero_ps(), reinterpret_cast<__m64 const *>(&data[i[0]])); // a0 b0 + const m128 il2 = _mm_loadl_pi(_mm_setzero_ps(), reinterpret_cast<__m64 const *>(&data[i[2]])); // a2 b2 + const m128 il4 = _mm_loadl_pi(_mm_setzero_ps(), reinterpret_cast<__m64 const *>(&data[i[4]])); // a4 b4 + const m128 il6 = _mm_loadl_pi(_mm_setzero_ps(), reinterpret_cast<__m64 const *>(&data[i[6]])); // a6 b6 + const m128 il01 = _mm_loadh_pi( il0, reinterpret_cast<__m64 const *>(&data[i[1]])); // a0 b0 a1 b1 + const m128 il23 = _mm_loadh_pi( il2, reinterpret_cast<__m64 const *>(&data[i[3]])); // a2 b2 a3 b3 + const m128 il45 = _mm_loadh_pi( il4, reinterpret_cast<__m64 const *>(&data[i[5]])); // a4 b4 a5 b5 + const m128 il67 = _mm_loadh_pi( il6, reinterpret_cast<__m64 const *>(&data[i[7]])); // a6 b6 a7 b7 + + const m256 tmp2 = concat(il01, il45); + const m256 tmp3 = concat(il23, il67); + + const m256 tmp0 = _mm256_unpacklo_ps(tmp2, tmp3); + const m256 tmp1 = _mm256_unpackhi_ps(tmp2, tmp3); + + v0.data() = avx_cast(_mm256_unpacklo_ps(tmp0, tmp1)); + v1.data() = avx_cast(_mm256_unpackhi_ps(tmp0, tmp1)); + }/*}}}*/ + static inline void deinterleave(typename V::EntryType const *const data,/*{{{*/ + const Common::SuccessiveEntries<2> &i, V &v0, V &v1) + { + using namespace AVX; + const m256 il0123 = _mm256_loadu_ps(reinterpret_cast *>(&data[i[0]])); // a0 b0 a1 b1 a2 b2 a3 b3 + const m256 il4567 = _mm256_loadu_ps(reinterpret_cast *>(&data[i[4]])); // a4 b4 a5 b5 a6 b6 a7 b7 + + const m256 tmp2 = Mem::shuffle128(il0123, il4567); + const m256 tmp3 = Mem::shuffle128(il0123, il4567); + + const m256 tmp0 = _mm256_unpacklo_ps(tmp2, tmp3); + const m256 tmp1 = _mm256_unpackhi_ps(tmp2, tmp3); + + v0.data() = avx_cast(_mm256_unpacklo_ps(tmp0, tmp1)); + v1.data() = avx_cast(_mm256_unpackhi_ps(tmp0, tmp1)); + }/*}}}*/ + template static inline void deinterleave(typename V::EntryType const *const data,/*{{{*/ + const I &i, V &v0, V &v1, V &v2) + { + using namespace AVX; + const m128 il0 = _mm_loadu_ps(reinterpret_cast *>(&data[i[0]])); // a0 b0 c0 d0 + const m128 il1 = _mm_loadu_ps(reinterpret_cast *>(&data[i[1]])); // a1 b1 c1 d1 + const m128 il2 = _mm_loadu_ps(reinterpret_cast *>(&data[i[2]])); // a2 b2 c2 d2 + const m128 il3 = _mm_loadu_ps(reinterpret_cast *>(&data[i[3]])); // a3 b3 c3 d3 + const m128 il4 = _mm_loadu_ps(reinterpret_cast *>(&data[i[4]])); // a4 b4 c4 d4 + const m128 il5 = _mm_loadu_ps(reinterpret_cast *>(&data[i[5]])); // a5 b5 c5 d5 + const m128 il6 = _mm_loadu_ps(reinterpret_cast *>(&data[i[6]])); // a6 b6 c6 d6 + const m128 il7 = _mm_loadu_ps(reinterpret_cast *>(&data[i[7]])); // a7 b7 c7 d7 + + const m256 il04 = concat(il0, il4); + const m256 il15 = concat(il1, il5); + const m256 il26 = concat(il2, il6); + const m256 il37 = concat(il3, il7); + const m256 ab0246 = _mm256_unpacklo_ps(il04, il26); + const m256 ab1357 = _mm256_unpacklo_ps(il15, il37); + const m256 cd0246 = _mm256_unpackhi_ps(il04, il26); + const m256 cd1357 = _mm256_unpackhi_ps(il15, il37); + v0.data() = avx_cast(_mm256_unpacklo_ps(ab0246, ab1357)); + v1.data() = avx_cast(_mm256_unpackhi_ps(ab0246, ab1357)); + v2.data() = avx_cast(_mm256_unpacklo_ps(cd0246, cd1357)); + }/*}}}*/ + template static inline void deinterleave(typename V::EntryType const *const data,/*{{{*/ + const I &i, V &v0, V &v1, V &v2, 
V &v3) + { + using namespace AVX; + const m128 il0 = _mm_loadu_ps(reinterpret_cast *>(&data[i[0]])); // a0 b0 c0 d0 + const m128 il1 = _mm_loadu_ps(reinterpret_cast *>(&data[i[1]])); // a1 b1 c1 d1 + const m128 il2 = _mm_loadu_ps(reinterpret_cast *>(&data[i[2]])); // a2 b2 c2 d2 + const m128 il3 = _mm_loadu_ps(reinterpret_cast *>(&data[i[3]])); // a3 b3 c3 d3 + const m128 il4 = _mm_loadu_ps(reinterpret_cast *>(&data[i[4]])); // a4 b4 c4 d4 + const m128 il5 = _mm_loadu_ps(reinterpret_cast *>(&data[i[5]])); // a5 b5 c5 d5 + const m128 il6 = _mm_loadu_ps(reinterpret_cast *>(&data[i[6]])); // a6 b6 c6 d6 + const m128 il7 = _mm_loadu_ps(reinterpret_cast *>(&data[i[7]])); // a7 b7 c7 d7 + + const m256 il04 = concat(il0, il4); + const m256 il15 = concat(il1, il5); + const m256 il26 = concat(il2, il6); + const m256 il37 = concat(il3, il7); + const m256 ab0246 = _mm256_unpacklo_ps(il04, il26); + const m256 ab1357 = _mm256_unpacklo_ps(il15, il37); + const m256 cd0246 = _mm256_unpackhi_ps(il04, il26); + const m256 cd1357 = _mm256_unpackhi_ps(il15, il37); + v0.data() = avx_cast(_mm256_unpacklo_ps(ab0246, ab1357)); + v1.data() = avx_cast(_mm256_unpackhi_ps(ab0246, ab1357)); + v2.data() = avx_cast(_mm256_unpacklo_ps(cd0246, cd1357)); + v3.data() = avx_cast(_mm256_unpackhi_ps(cd0246, cd1357)); + }/*}}}*/ + template static inline void deinterleave(typename V::EntryType const *const data,/*{{{*/ + const I &i, V &v0, V &v1, V &v2, V &v3, V &v4) + { + v4.gather(data + 4, i); + deinterleave(data, i, v0, v1, v2, v3); + }/*}}}*/ + template static inline void deinterleave(typename V::EntryType const *const data,/*{{{*/ + const I &i, V &v0, V &v1, V &v2, V &v3, V &v4, V &v5) + { + deinterleave(data, i, v0, v1, v2, v3); + deinterleave(data + 4, i, v4, v5); + }/*}}}*/ + static inline void deinterleave(typename V::EntryType const *const data,/*{{{*/ + const Common::SuccessiveEntries<6> &i, V &v0, V &v1, V &v2, V &v3, V &v4, V &v5) + { + using namespace AVX; + const m256 a = _mm256_loadu_ps(reinterpret_cast *>(&data[i[0]])); + const m256 b = _mm256_loadu_ps(reinterpret_cast *>(&data[i[0] + 1 * V::Size])); + const m256 c = _mm256_loadu_ps(reinterpret_cast *>(&data[i[0] + 2 * V::Size])); + const m256 d = _mm256_loadu_ps(reinterpret_cast *>(&data[i[0] + 3 * V::Size])); + const m256 e = _mm256_loadu_ps(reinterpret_cast *>(&data[i[0] + 4 * V::Size])); + const m256 f = _mm256_loadu_ps(reinterpret_cast *>(&data[i[0] + 5 * V::Size])); + const __m256 tmp2 = Mem::shuffle128(a, d); + const __m256 tmp3 = Mem::shuffle128(b, e); + const __m256 tmp4 = Mem::shuffle128(a, d); + const __m256 tmp5 = Mem::shuffle128(c, f); + const __m256 tmp8 = Mem::shuffle128(b, e); + const __m256 tmp9 = Mem::shuffle128(c, f); + const __m256 tmp0 = _mm256_unpacklo_ps(tmp2, tmp3); + const __m256 tmp1 = _mm256_unpackhi_ps(tmp4, tmp5); + const __m256 tmp6 = _mm256_unpackhi_ps(tmp2, tmp3); + const __m256 tmp7 = _mm256_unpacklo_ps(tmp8, tmp9); + const __m256 tmp10 = _mm256_unpacklo_ps(tmp4, tmp5); + const __m256 tmp11 = _mm256_unpackhi_ps(tmp8, tmp9); + v0.data() = avx_cast(_mm256_unpacklo_ps(tmp0, tmp1)); + v1.data() = avx_cast(_mm256_unpackhi_ps(tmp0, tmp1)); + v2.data() = avx_cast(_mm256_unpacklo_ps(tmp6, tmp7)); + v3.data() = avx_cast(_mm256_unpackhi_ps(tmp6, tmp7)); + v4.data() = avx_cast(_mm256_unpacklo_ps(tmp10, tmp11)); + v5.data() = avx_cast(_mm256_unpackhi_ps(tmp10, tmp11)); + }/*}}}*/ + template static inline void deinterleave(typename V::EntryType const *const data,/*{{{*/ + const I &i, V &v0, V &v1, V &v2, V &v3, V &v4, V &v5, V &v6) + { + 
deinterleave(data, i, v0, v1, v2, v3); + deinterleave(data + 4, i, v4, v5, v6); + }/*}}}*/ + template static inline void deinterleave(typename V::EntryType const *const data,/*{{{*/ + const I &i, V &v0, V &v1, V &v2, V &v3, V &v4, V &v5, V &v6, V &v7) + { + deinterleave(data, i, v0, v1, v2, v3); + deinterleave(data + 4, i, v4, v5, v6, v7); + }/*}}}*/ +}; +template struct InterleaveImpl { + template // interleave 2 args{{{2 + static inline void interleave(typename V::EntryType *const data, const I &i, + const typename V::AsArg v0, const typename V::AsArg v1) + { + using namespace AVX; + const m256d tmp0 = _mm256_unpacklo_pd(v0.data(), v1.data()); + const m256d tmp1 = _mm256_unpackhi_pd(v0.data(), v1.data()); + _mm_storeu_pd(&data[i[0]], lo128(tmp0)); + _mm_storeu_pd(&data[i[1]], lo128(tmp1)); + _mm_storeu_pd(&data[i[2]], hi128(tmp0)); + _mm_storeu_pd(&data[i[3]], hi128(tmp1)); + } + template // interleave 3 args{{{2 + static inline void interleave(typename V::EntryType *const data, const I &i, + const typename V::AsArg v0, const typename V::AsArg v1, + const typename V::AsArg v2) + { + using namespace AVX; +#ifdef Vc_USE_MASKMOV_SCATTER + const m256d tmp0 = _mm256_unpacklo_pd(v0.data(), v1.data()); + const m256d tmp1 = _mm256_unpackhi_pd(v0.data(), v1.data()); + const m256d tmp2 = _mm256_unpacklo_pd(v2.data(), v2.data()); + const m256d tmp3 = _mm256_unpackhi_pd(v2.data(), v2.data()); + +#if defined(Vc_MSVC) && (Vc_MSVC < 170000000 || !defined(_WIN64)) + // MSVC needs to be at Version 2012 before _mm256_set_epi64x works + const m256i mask = concat(_mm_setallone_si128(), _mm_set_epi32(0, 0, -1, -1)); +#else + const m256i mask = _mm256_set_epi64x(0, -1, -1, -1); +#endif + _mm256_maskstore_pd(&data[i[0]], mask, Mem::shuffle128(tmp0, tmp2)); + _mm256_maskstore_pd(&data[i[1]], mask, Mem::shuffle128(tmp1, tmp3)); + _mm256_maskstore_pd(&data[i[2]], mask, Mem::shuffle128(tmp0, tmp2)); + _mm256_maskstore_pd(&data[i[3]], mask, Mem::shuffle128(tmp1, tmp3)); +#else + interleave(data, i, v0, v1); + v2.scatter(data + 2, i); +#endif + } + template // interleave 4 args{{{2 + static inline void interleave(typename V::EntryType *const data, const I &i, + const typename V::AsArg v0, const typename V::AsArg v1, + const typename V::AsArg v2, const typename V::AsArg v3) + { + using namespace AVX; + // 0a 1a 0c 1c: + const m256d tmp0 = _mm256_unpacklo_pd(v0.data(), v1.data()); + // 0b 1b 0b 1b: + const m256d tmp1 = _mm256_unpackhi_pd(v0.data(), v1.data()); + // 2a 3a 2c 3c: + const m256d tmp2 = _mm256_unpacklo_pd(v2.data(), v3.data()); + // 2b 3b 2b 3b: + const m256d tmp3 = _mm256_unpackhi_pd(v2.data(), v3.data()); + /* The following might be more efficient once 256-bit stores are not split internally into 2 + * 128-bit stores. 
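+         * (Presumably this refers to first-generation AVX hardware, which internally
+         * splits an unaligned 256-bit store into two 128-bit operations; the explicit
+         * 128-bit stores used below then avoid the extra 128-bit shuffles.)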
+ _mm256_storeu_pd(&data[i[0]], Mem::shuffle128(tmp0, tmp2)); + _mm256_storeu_pd(&data[i[1]], Mem::shuffle128(tmp1, tmp3)); + _mm256_storeu_pd(&data[i[2]], Mem::shuffle128(tmp0, tmp2)); + _mm256_storeu_pd(&data[i[3]], Mem::shuffle128(tmp1, tmp3)); + */ + _mm_storeu_pd(&data[i[0] ], lo128(tmp0)); + _mm_storeu_pd(&data[i[0]+2], lo128(tmp2)); + _mm_storeu_pd(&data[i[1] ], lo128(tmp1)); + _mm_storeu_pd(&data[i[1]+2], lo128(tmp3)); + _mm_storeu_pd(&data[i[2] ], hi128(tmp0)); + _mm_storeu_pd(&data[i[2]+2], hi128(tmp2)); + _mm_storeu_pd(&data[i[3] ], hi128(tmp1)); + _mm_storeu_pd(&data[i[3]+2], hi128(tmp3)); + } + template // interleave 5 args {{{2 + static inline void interleave(typename V::EntryType *const data, const I &i, + const typename V::AsArg v0, const typename V::AsArg v1, + const typename V::AsArg v2, const typename V::AsArg v3, + const typename V::AsArg v4) + { + interleave(data, i, v0, v1, v2, v3); + v4.scatter(data + 4, i); + } + template // interleave 6 args {{{2 + static inline void interleave(typename V::EntryType *const data, const I &i, + const typename V::AsArg v0, const typename V::AsArg v1, + const typename V::AsArg v2, const typename V::AsArg v3, + const typename V::AsArg v4, const typename V::AsArg v5) + { + interleave(data, i, v0, v1, v2, v3); + interleave(data + 4, i, v4, v5); + } + template // interleave 7 args {{{2 + static inline void interleave(typename V::EntryType *const data, const I &i, + const typename V::AsArg v0, const typename V::AsArg v1, + const typename V::AsArg v2, const typename V::AsArg v3, + const typename V::AsArg v4, const typename V::AsArg v5, + const typename V::AsArg v6) + { + interleave(data, i, v0, v1, v2, v3); + interleave(data + 4, i, v4, v5, v6); + } + template // interleave 8 args {{{2 + static inline void interleave(typename V::EntryType *const data, const I &i, + const typename V::AsArg v0, const typename V::AsArg v1, + const typename V::AsArg v2, const typename V::AsArg v3, + const typename V::AsArg v4, const typename V::AsArg v5, + const typename V::AsArg v6, const typename V::AsArg v7) + { + interleave(data, i, v0, v1, v2, v3); + interleave(data + 4, i, v4, v5, v6, v7); + } + //}}}2 + template static inline void deinterleave(typename V::EntryType const *const data,/*{{{*/ + const I &i, V &v0, V &v1) + { + using namespace Vc::AVX; + const m256d ab02 = concat(_mm_loadu_pd(&data[i[0]]), _mm_loadu_pd(&data[i[2]])); + const m256d ab13 = concat(_mm_loadu_pd(&data[i[1]]), _mm_loadu_pd(&data[i[3]])); + + v0.data() = _mm256_unpacklo_pd(ab02, ab13); + v1.data() = _mm256_unpackhi_pd(ab02, ab13); + }/*}}}*/ + template static inline void deinterleave(typename V::EntryType const *const data,/*{{{*/ + const I &i, V &v0, V &v1, V &v2) + { + v2.gather(data + 2, i); + deinterleave(data, i, v0, v1); + }/*}}}*/ + template static inline void deinterleave(typename V::EntryType const *const data,/*{{{*/ + const I &i, V &v0, V &v1, V &v2, V &v3) + { + deinterleave(data, i, v0, v1); + deinterleave(data + 2, i, v2, v3); + }/*}}}*/ + template static inline void deinterleave(typename V::EntryType const *const data,/*{{{*/ + const I &i, V &v0, V &v1, V &v2, V &v3, V &v4) + { + v4.gather(data + 4, i); + deinterleave(data, i, v0, v1); + deinterleave(data + 2, i, v2, v3); + }/*}}}*/ + template static inline void deinterleave(typename V::EntryType const *const data,/*{{{*/ + const I &i, V &v0, V &v1, V &v2, V &v3, V &v4, V &v5) + { + deinterleave(data, i, v0, v1); + deinterleave(data + 2, i, v2, v3); + deinterleave(data + 4, i, v4, v5); + }/*}}}*/ + template static 
inline void deinterleave(typename V::EntryType const *const data,/*{{{*/ + const I &i, V &v0, V &v1, V &v2, V &v3, V &v4, V &v5, V &v6) + { + v6.gather(data + 6, i); + deinterleave(data, i, v0, v1); + deinterleave(data + 2, i, v2, v3); + deinterleave(data + 4, i, v4, v5); + }/*}}}*/ + template static inline void deinterleave(typename V::EntryType const *const data,/*{{{*/ + const I &i, V &v0, V &v1, V &v2, V &v3, V &v4, V &v5, V &v6, V &v7) + { + deinterleave(data, i, v0, v1); + deinterleave(data + 2, i, v2, v3); + deinterleave(data + 4, i, v4, v5); + deinterleave(data + 6, i, v6, v7); + }/*}}}*/ +}; +//}}}1 +} // namespace Detail +} // namespace Vc + +#endif // VC_AVX_DETAIL_H_ + +// vim: foldmethod=marker diff -Nru vc-0.7.4/avx/forceToRegisters.tcc vc-1.3.0/avx/forceToRegisters.tcc --- vc-0.7.4/avx/forceToRegisters.tcc 2014-05-15 06:38:24.000000000 -0500 +++ vc-1.3.0/avx/forceToRegisters.tcc 1969-12-31 18:00:00.000000000 -0600 @@ -1,141 +0,0 @@ -#ifdef __GNUC__ -template -static Vc_ALWAYS_INLINE void forceToRegisters(const Vector &x1) { - __asm__ __volatile__(""::"x"(x1.data())); -} -template -static Vc_ALWAYS_INLINE void forceToRegistersDirty(Vector &x1) { - __asm__ __volatile__("":"+x"(x1.data())); -} -template -static Vc_ALWAYS_INLINE void forceToRegisters(const Vector &x2, const Vector &x1) { - __asm__ __volatile__(""::"x"(x2.data()), "x"(x1.data())); -} -template -static Vc_ALWAYS_INLINE void forceToRegistersDirty(Vector &x2, Vector &x1) { - __asm__ __volatile__("":"+x"(x2.data()), "+x"(x1.data())); -} -template -static Vc_ALWAYS_INLINE void forceToRegisters(const Vector &x3, const Vector &x2, const Vector &x1) { - __asm__ __volatile__(""::"x"(x3.data()), "x"(x2.data()), "x"(x1.data())); -} -template -static Vc_ALWAYS_INLINE void forceToRegistersDirty(Vector &x3, Vector &x2, Vector &x1) { - __asm__ __volatile__("":"+x"(x3.data()), "+x"(x2.data()), "+x"(x1.data())); -} -template -static Vc_ALWAYS_INLINE void forceToRegisters(const Vector &x4, const Vector &x3, const Vector &x2, const Vector &x1) { - __asm__ __volatile__(""::"x"(x4.data()), "x"(x3.data()), "x"(x2.data()), "x"(x1.data())); -} -template -static Vc_ALWAYS_INLINE void forceToRegistersDirty(Vector &x4, Vector &x3, Vector &x2, Vector &x1) { - __asm__ __volatile__("":"+x"(x4.data()), "+x"(x3.data()), "+x"(x2.data()), "+x"(x1.data())); -} -template -static Vc_ALWAYS_INLINE void forceToRegisters(const Vector &x5, const Vector &x4, const Vector &x3, const Vector &x2, const Vector &x1) { - __asm__ __volatile__(""::"x"(x5.data()), "x"(x4.data()), "x"(x3.data()), "x"(x2.data()), "x"(x1.data())); -} -template -static Vc_ALWAYS_INLINE void forceToRegistersDirty(Vector &x5, Vector &x4, Vector &x3, Vector &x2, Vector &x1) { - __asm__ __volatile__("":"+x"(x5.data()), "+x"(x4.data()), "+x"(x3.data()), "+x"(x2.data()), "+x"(x1.data())); -} -template -static Vc_ALWAYS_INLINE void forceToRegisters(const Vector &x6, const Vector &x5, const Vector &x4, const Vector &x3, const Vector &x2, const Vector &x1) { - __asm__ __volatile__(""::"x"(x6.data()), "x"(x5.data()), "x"(x4.data()), "x"(x3.data()), "x"(x2.data()), "x"(x1.data())); -} -template -static Vc_ALWAYS_INLINE void forceToRegistersDirty(Vector &x6, Vector &x5, Vector &x4, Vector &x3, Vector &x2, Vector &x1) { - __asm__ __volatile__("":"+x"(x6.data()), "+x"(x5.data()), "+x"(x4.data()), "+x"(x3.data()), "+x"(x2.data()), "+x"(x1.data())); -} -template -static Vc_ALWAYS_INLINE void forceToRegisters(const Vector &x7, const Vector &x6, const Vector &x5, const Vector &x4, const Vector &x3, 
const Vector &x2, const Vector &x1) { - __asm__ __volatile__(""::"x"(x7.data()), "x"(x6.data()), "x"(x5.data()), "x"(x4.data()), "x"(x3.data()), "x"(x2.data()), "x"(x1.data())); -} -template -static Vc_ALWAYS_INLINE void forceToRegistersDirty(Vector &x7, Vector &x6, Vector &x5, Vector &x4, Vector &x3, Vector &x2, Vector &x1) { - __asm__ __volatile__("":"+x"(x7.data()), "+x"(x6.data()), "+x"(x5.data()), "+x"(x4.data()), "+x"(x3.data()), "+x"(x2.data()), "+x"(x1.data())); -} -template -static Vc_ALWAYS_INLINE void forceToRegisters(const Vector &x8, const Vector &x7, const Vector &x6, const Vector &x5, const Vector &x4, const Vector &x3, const Vector &x2, const Vector &x1) { - __asm__ __volatile__(""::"x"(x8.data()), "x"(x7.data()), "x"(x6.data()), "x"(x5.data()), "x"(x4.data()), "x"(x3.data()), "x"(x2.data()), "x"(x1.data())); -} -template -static Vc_ALWAYS_INLINE void forceToRegistersDirty(Vector &x8, Vector &x7, Vector &x6, Vector &x5, Vector &x4, Vector &x3, Vector &x2, Vector &x1) { - __asm__ __volatile__("":"+x"(x8.data()), "+x"(x7.data()), "+x"(x6.data()), "+x"(x5.data()), "+x"(x4.data()), "+x"(x3.data()), "+x"(x2.data()), "+x"(x1.data())); -} -#elif defined(VC_MSVC) -#pragma optimize("g", off) -template -static Vc_ALWAYS_INLINE void forceToRegisters(const Vector &/*x1*/) { -} -#pragma optimize("g", off) -template -static Vc_ALWAYS_INLINE void forceToRegistersDirty(Vector &/*x1*/) { -} -#pragma optimize("g", on) -#pragma optimize("g", off) -template -static Vc_ALWAYS_INLINE void forceToRegisters(const Vector &/*x2*/, const Vector &/*x1*/) { -} -#pragma optimize("g", off) -template -static Vc_ALWAYS_INLINE void forceToRegistersDirty(Vector &/*x2*/, Vector &/*x1*/) { -} -#pragma optimize("g", on) -#pragma optimize("g", off) -template -static Vc_ALWAYS_INLINE void forceToRegisters(const Vector &/*x3*/, const Vector &/*x2*/, const Vector &/*x1*/) { -} -#pragma optimize("g", off) -template -static Vc_ALWAYS_INLINE void forceToRegistersDirty(Vector &/*x3*/, Vector &/*x2*/, Vector &/*x1*/) { -} -#pragma optimize("g", on) -#pragma optimize("g", off) -template -static Vc_ALWAYS_INLINE void forceToRegisters(const Vector &/*x4*/, const Vector &/*x3*/, const Vector &/*x2*/, const Vector &/*x1*/) { -} -#pragma optimize("g", off) -template -static Vc_ALWAYS_INLINE void forceToRegistersDirty(Vector &/*x4*/, Vector &/*x3*/, Vector &/*x2*/, Vector &/*x1*/) { -} -#pragma optimize("g", on) -#pragma optimize("g", off) -template -static Vc_ALWAYS_INLINE void forceToRegisters(const Vector &/*x5*/, const Vector &/*x4*/, const Vector &/*x3*/, const Vector &/*x2*/, const Vector &/*x1*/) { -} -#pragma optimize("g", off) -template -static Vc_ALWAYS_INLINE void forceToRegistersDirty(Vector &/*x5*/, Vector &/*x4*/, Vector &/*x3*/, Vector &/*x2*/, Vector &/*x1*/) { -} -#pragma optimize("g", on) -#pragma optimize("g", off) -template -static Vc_ALWAYS_INLINE void forceToRegisters(const Vector &/*x6*/, const Vector &/*x5*/, const Vector &/*x4*/, const Vector &/*x3*/, const Vector &/*x2*/, const Vector &/*x1*/) { -} -#pragma optimize("g", off) -template -static Vc_ALWAYS_INLINE void forceToRegistersDirty(Vector &/*x6*/, Vector &/*x5*/, Vector &/*x4*/, Vector &/*x3*/, Vector &/*x2*/, Vector &/*x1*/) { -} -#pragma optimize("g", on) -#pragma optimize("g", off) -template -static Vc_ALWAYS_INLINE void forceToRegisters(const Vector &/*x7*/, const Vector &/*x6*/, const Vector &/*x5*/, const Vector &/*x4*/, const Vector &/*x3*/, const Vector &/*x2*/, const Vector &/*x1*/) { -} -#pragma optimize("g", off) -template -static 
Vc_ALWAYS_INLINE void forceToRegistersDirty(Vector &/*x7*/, Vector &/*x6*/, Vector &/*x5*/, Vector &/*x4*/, Vector &/*x3*/, Vector &/*x2*/, Vector &/*x1*/) { -} -#pragma optimize("g", on) -#pragma optimize("g", off) -template -static Vc_ALWAYS_INLINE void forceToRegisters(const Vector &/*x8*/, const Vector &/*x7*/, const Vector &/*x6*/, const Vector &/*x5*/, const Vector &/*x4*/, const Vector &/*x3*/, const Vector &/*x2*/, const Vector &/*x1*/) { -} -#pragma optimize("g", off) -template -static Vc_ALWAYS_INLINE void forceToRegistersDirty(Vector &/*x8*/, Vector &/*x7*/, Vector &/*x6*/, Vector &/*x5*/, Vector &/*x4*/, Vector &/*x3*/, Vector &/*x2*/, Vector &/*x1*/) { -} -#pragma optimize("g", on) -#else -#error "forceToRegisters unsupported on this compiler" -#endif diff -Nru vc-0.7.4/avx/helperimpl.h vc-1.3.0/avx/helperimpl.h --- vc-0.7.4/avx/helperimpl.h 2014-05-15 06:38:24.000000000 -0500 +++ vc-1.3.0/avx/helperimpl.h 2016-10-27 02:05:02.000000000 -0500 @@ -1,104 +1,119 @@ -/* This file is part of the Vc library. +/* This file is part of the Vc library. {{{ +Copyright © 2011-2015 Matthias Kretz - Copyright (C) 2011-2012 Matthias Kretz +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the names of contributing organizations nor the + names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY +DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - Vc is free software: you can redistribute it and/or modify - it under the terms of the GNU Lesser General Public License as - published by the Free Software Foundation, either version 3 of - the License, or (at your option) any later version. +}}}*/ - Vc is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with Vc. If not, see . 
- -*/ - -#ifndef VC_AVX_HELPERIMPL_H -#define VC_AVX_HELPERIMPL_H +#ifndef VC_AVX_HELPERIMPL_H_ +#define VC_AVX_HELPERIMPL_H_ +#include "../sse/helperimpl.h" #include "macros.h" -/*OUTER_NAMESPACE_BEGIN*/ -namespace Vc +namespace Vc_VERSIONED_NAMESPACE { -namespace Internal +namespace Detail { +template +inline void deinterleave(AVX2::float_v &, AVX2::float_v &, const float *, A); +template +inline void deinterleave(AVX2::float_v &, AVX2::float_v &, const short *, A); +template +inline void deinterleave(AVX2::float_v &, AVX2::float_v &, const ushort *, A); +template +inline void deinterleave(AVX2::double_v &, AVX2::double_v &, const double *, A); +template +inline void deinterleave(AVX2::int_v &, AVX2::int_v &, const int *, A); +template +inline void deinterleave(AVX2::int_v &, AVX2::int_v &, const short *, A); +template +inline void deinterleave(AVX2::uint_v &, AVX2::uint_v &, const uint *, A); +template +inline void deinterleave(AVX2::uint_v &, AVX2::uint_v &, const ushort *, A); +template +inline void deinterleave(AVX2::short_v &, AVX2::short_v &, const short *, A); +template +inline void deinterleave(AVX2::ushort_v &, AVX2::ushort_v &, const ushort *, A); + +template +Vc_ALWAYS_INLINE_L void deinterleave(AVX2::Vector &Vc_RESTRICT a, + AVX2::Vector &Vc_RESTRICT b, + AVX2::Vector &Vc_RESTRICT c, + const M *Vc_RESTRICT memory, + A align) Vc_ALWAYS_INLINE_R; +template +Vc_ALWAYS_INLINE_L void deinterleave(AVX2::Vector &Vc_RESTRICT a, + AVX2::Vector &Vc_RESTRICT b, + AVX2::Vector &Vc_RESTRICT c, + AVX2::Vector &Vc_RESTRICT d, + const M *Vc_RESTRICT memory, + A align) Vc_ALWAYS_INLINE_R; +template +Vc_ALWAYS_INLINE_L void deinterleave(AVX2::Vector &Vc_RESTRICT a, + AVX2::Vector &Vc_RESTRICT b, + AVX2::Vector &Vc_RESTRICT c, + AVX2::Vector &Vc_RESTRICT d, + AVX2::Vector &Vc_RESTRICT e, + const M *Vc_RESTRICT memory, + A align) Vc_ALWAYS_INLINE_R; +template +Vc_ALWAYS_INLINE_L void deinterleave( + AVX2::Vector &Vc_RESTRICT a, AVX2::Vector &Vc_RESTRICT b, + AVX2::Vector &Vc_RESTRICT c, AVX2::Vector &Vc_RESTRICT d, + AVX2::Vector &Vc_RESTRICT e, AVX2::Vector &Vc_RESTRICT f, + const M *Vc_RESTRICT memory, A align) Vc_ALWAYS_INLINE_R; +template +Vc_ALWAYS_INLINE_L void deinterleave( + AVX2::Vector &Vc_RESTRICT a, AVX2::Vector &Vc_RESTRICT b, + AVX2::Vector &Vc_RESTRICT c, AVX2::Vector &Vc_RESTRICT d, + AVX2::Vector &Vc_RESTRICT e, AVX2::Vector &Vc_RESTRICT f, + AVX2::Vector &Vc_RESTRICT g, AVX2::Vector &Vc_RESTRICT h, + const M *Vc_RESTRICT memory, A align) Vc_ALWAYS_INLINE_R; -template<> struct HelperImpl +Vc_ALWAYS_INLINE void prefetchForOneRead(const void *addr, VectorAbi::Avx) { - typedef AVX::Vector float_v; - typedef AVX::Vector sfloat_v; - typedef AVX::Vector double_v; - typedef AVX::Vector int_v; - typedef AVX::Vector uint_v; - typedef AVX::Vector short_v; - typedef AVX::Vector ushort_v; - - template static void deinterleave(float_v &, float_v &, const float *, A); - template static void deinterleave(float_v &, float_v &, const short *, A); - template static void deinterleave(float_v &, float_v &, const unsigned short *, A); - - template static void deinterleave(sfloat_v &, sfloat_v &, const MemT *, A); - - template static void deinterleave(double_v &, double_v &, const double *, A); - - template static void deinterleave(int_v &, int_v &, const int *, A); - template static void deinterleave(int_v &, int_v &, const short *, A); - - template static void deinterleave(uint_v &, uint_v &, const unsigned int *, A); - template static void deinterleave(uint_v &, uint_v &, const unsigned short *, 
A); - - template static void deinterleave(short_v &, short_v &, const short *, A); - - template static void deinterleave(ushort_v &, ushort_v &, const unsigned short *, A); - - template - static Vc_ALWAYS_INLINE_L void deinterleave(V &VC_RESTRICT a, V &VC_RESTRICT b, - V &VC_RESTRICT c, const M *VC_RESTRICT memory, A align) Vc_ALWAYS_INLINE_R; - - template - static Vc_ALWAYS_INLINE_L void deinterleave(V &VC_RESTRICT a, V &VC_RESTRICT b, - V &VC_RESTRICT c, V &VC_RESTRICT d, - const M *VC_RESTRICT memory, A align) Vc_ALWAYS_INLINE_R; - - template - static Vc_ALWAYS_INLINE_L void deinterleave(V &VC_RESTRICT a, V &VC_RESTRICT b, - V &VC_RESTRICT c, V &VC_RESTRICT d, V &VC_RESTRICT e, - const M *VC_RESTRICT memory, A align) Vc_ALWAYS_INLINE_R; - - template - static Vc_ALWAYS_INLINE_L void deinterleave(V &VC_RESTRICT a, V &VC_RESTRICT b, - V &VC_RESTRICT c, V &VC_RESTRICT d, V &VC_RESTRICT e, - V &VC_RESTRICT f, const M *VC_RESTRICT memory, A align) Vc_ALWAYS_INLINE_R; - - template - static Vc_ALWAYS_INLINE_L void deinterleave(V &VC_RESTRICT a, V &VC_RESTRICT b, - V &VC_RESTRICT c, V &VC_RESTRICT d, V &VC_RESTRICT e, - V &VC_RESTRICT f, V &VC_RESTRICT g, V &VC_RESTRICT h, - const M *VC_RESTRICT memory, A align) Vc_ALWAYS_INLINE_R; - - static Vc_ALWAYS_INLINE_L void prefetchForOneRead(const void *addr) Vc_ALWAYS_INLINE_R; - static Vc_ALWAYS_INLINE_L void prefetchForModify(const void *addr) Vc_ALWAYS_INLINE_R; - static Vc_ALWAYS_INLINE_L void prefetchClose(const void *addr) Vc_ALWAYS_INLINE_R; - static Vc_ALWAYS_INLINE_L void prefetchMid(const void *addr) Vc_ALWAYS_INLINE_R; - static Vc_ALWAYS_INLINE_L void prefetchFar(const void *addr) Vc_ALWAYS_INLINE_R; - - template - static Vc_ALWAYS_INLINE_L void *malloc(size_t n) Vc_ALWAYS_INLINE_R; - static Vc_ALWAYS_INLINE_L void free(void *p) Vc_ALWAYS_INLINE_R; -}; - -} // namespace Internal -} // namespace Vc -/*OUTER_NAMESPACE_END*/ + prefetchForOneRead(addr, VectorAbi::Sse()); +} +Vc_ALWAYS_INLINE void prefetchForModify(const void *addr, VectorAbi::Avx) +{ + prefetchForModify(addr, VectorAbi::Sse()); +} +Vc_ALWAYS_INLINE void prefetchClose(const void *addr, VectorAbi::Avx) +{ + prefetchClose(addr, VectorAbi::Sse()); +} +Vc_ALWAYS_INLINE void prefetchMid(const void *addr, VectorAbi::Avx) +{ + prefetchMid(addr, VectorAbi::Sse()); +} +Vc_ALWAYS_INLINE void prefetchFar(const void *addr, VectorAbi::Avx) +{ + prefetchFar(addr, VectorAbi::Sse()); +} +} // namespace Detail +} // namespace Vc #include "deinterleave.tcc" -#include "prefetches.tcc" -#include "helperimpl.tcc" -#include "undomacros.h" -#endif // VC_AVX_HELPERIMPL_H +#endif // VC_AVX_HELPERIMPL_H_ diff -Nru vc-0.7.4/avx/helperimpl.tcc vc-1.3.0/avx/helperimpl.tcc --- vc-0.7.4/avx/helperimpl.tcc 2014-05-15 06:38:24.000000000 -0500 +++ vc-1.3.0/avx/helperimpl.tcc 1969-12-31 18:00:00.000000000 -0600 @@ -1,64 +0,0 @@ -/* This file is part of the Vc library. - - Copyright (C) 2011-2012 Matthias Kretz - - Vc is free software: you can redistribute it and/or modify - it under the terms of the GNU Lesser General Public License as - published by the Free Software Foundation, either version 3 of - the License, or (at your option) any later version. - - Vc is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with Vc. If not, see . 
- -*/ - -#ifndef VC_AVX_HELPERIMPL_TCC -#define VC_AVX_HELPERIMPL_TCC - -/*OUTER_NAMESPACE_BEGIN*/ -namespace Vc -{ -namespace Internal -{ - -template -static _VC_CONSTEXPR size_t nextMultipleOf(size_t value) -{ - return (value % X) > 0 ? value + X - (value % X) : value; -} - -template -Vc_ALWAYS_INLINE void *HelperImpl::malloc(size_t n) -{ - switch (A) { - case Vc::AlignOnVector: - return _mm_malloc(nextMultipleOf(n), Vc::AVX::VectorAlignment); - case Vc::AlignOnCacheline: - // TODO: hardcoding 64 is not such a great idea - return _mm_malloc(nextMultipleOf<64>(n), 64); - case Vc::AlignOnPage: - // TODO: hardcoding 4096 is not such a great idea - return _mm_malloc(nextMultipleOf<4096>(n), 4096); - default: -#ifndef NDEBUG - abort(); -#endif - return _mm_malloc(n, 8); - } -} - -Vc_ALWAYS_INLINE void HelperImpl::free(void *p) -{ - _mm_free(p); -} - -} // namespace Internal -} // namespace Vc -/*OUTER_NAMESPACE_END*/ - -#endif // VC_AVX_HELPERIMPL_TCC diff -Nru vc-0.7.4/avx/interleavedmemory.tcc vc-1.3.0/avx/interleavedmemory.tcc --- vc-0.7.4/avx/interleavedmemory.tcc 2014-05-15 06:38:24.000000000 -0500 +++ vc-1.3.0/avx/interleavedmemory.tcc 1969-12-31 18:00:00.000000000 -0600 @@ -1,890 +0,0 @@ -/* This file is part of the Vc library. {{{ - - Copyright (C) 2012 Matthias Kretz - - Vc is free software: you can redistribute it and/or modify - it under the terms of the GNU Lesser General Public License as - published by the Free Software Foundation, either version 3 of - the License, or (at your option) any later version. - - Vc is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with Vc. If not, see . 
- -}}}*/ - -#ifndef VC_AVX_INTERLEAVEDMEMORY_TCC -#define VC_AVX_INTERLEAVEDMEMORY_TCC - -#include "macros.h" - -/*OUTER_NAMESPACE_BEGIN*/ -namespace Vc -{ -namespace Common -{ - -namespace -{ -template struct InterleaveImpl; -template struct InterleaveImpl { - static inline void interleave(typename V::EntryType *const data, const typename V::IndexType &i,/*{{{*/ - const typename V::AsArg v0, const typename V::AsArg v1) - { - const m128i tmp0 = _mm_unpacklo_epi16(v0.data(), v1.data()); - const m128i tmp1 = _mm_unpackhi_epi16(v0.data(), v1.data()); -#ifdef __x86_64__ - const long long tmp00 = _mm_cvtsi128_si64(tmp0); - const long long tmp01 = _mm_cvtsi128_si64(_mm_unpackhi_epi64(tmp0, tmp0)); - const long long tmp10 = _mm_cvtsi128_si64(tmp1); - const long long tmp11 = _mm_cvtsi128_si64(_mm_unpackhi_epi64(tmp1, tmp1)); - *reinterpret_cast(&data[i[0]]) = tmp00; - *reinterpret_cast(&data[i[1]]) = tmp00 >> 32; - *reinterpret_cast(&data[i[2]]) = tmp01; - *reinterpret_cast(&data[i[3]]) = tmp01 >> 32; - *reinterpret_cast(&data[i[4]]) = tmp10; - *reinterpret_cast(&data[i[5]]) = tmp10 >> 32; - *reinterpret_cast(&data[i[6]]) = tmp11; - *reinterpret_cast(&data[i[7]]) = tmp11 >> 32; -#else - *reinterpret_cast(&data[i[0]]) = _mm_cvtsi128_si32(tmp0); - *reinterpret_cast(&data[i[1]]) = _mm_extract_epi32(tmp0, 1); - *reinterpret_cast(&data[i[2]]) = _mm_extract_epi32(tmp0, 2); - *reinterpret_cast(&data[i[3]]) = _mm_extract_epi32(tmp0, 3); - *reinterpret_cast(&data[i[4]]) = _mm_cvtsi128_si32(tmp1); - *reinterpret_cast(&data[i[5]]) = _mm_extract_epi32(tmp1, 1); - *reinterpret_cast(&data[i[6]]) = _mm_extract_epi32(tmp1, 2); - *reinterpret_cast(&data[i[7]]) = _mm_extract_epi32(tmp1, 3); -#endif - }/*}}}*/ - static inline void interleave(typename V::EntryType *const data, const typename V::IndexType &i,/*{{{*/ - const typename V::AsArg v0, const typename V::AsArg v1, const typename V::AsArg v2) - { -#ifdef VC_USE_MASKMOV_SCATTER - const m128i maskLo = _mm_set_epi16(0, 0, 0, 0, 0, -1, -1, -1); - const m128i maskHi = _mm_set_epi16(0, -1, -1, -1, 0, 0, 0, 0); - typename V::EntryType *const dataHi = data - 4; - const m128i tmp0 = _mm_unpacklo_epi16(v0.data(), v2.data()); - const m128i tmp1 = _mm_unpackhi_epi16(v0.data(), v2.data()); - const m128i tmp2 = _mm_unpacklo_epi16(v1.data(), v1.data()); - const m128i tmp3 = _mm_unpackhi_epi16(v1.data(), v1.data()); - - const m128i tmp4 = _mm_unpacklo_epi16(tmp0, tmp2); - const m128i tmp5 = _mm_unpackhi_epi16(tmp0, tmp2); - const m128i tmp6 = _mm_unpacklo_epi16(tmp1, tmp3); - const m128i tmp7 = _mm_unpackhi_epi16(tmp1, tmp3); - _mm_maskmoveu_si128(tmp4, maskLo, reinterpret_cast(&data[i[0]])); - _mm_maskmoveu_si128(tmp4, maskHi, reinterpret_cast(&dataHi[i[1]])); - _mm_maskmoveu_si128(tmp5, maskLo, reinterpret_cast(&data[i[2]])); - _mm_maskmoveu_si128(tmp5, maskHi, reinterpret_cast(&dataHi[i[3]])); - _mm_maskmoveu_si128(tmp6, maskLo, reinterpret_cast(&data[i[4]])); - _mm_maskmoveu_si128(tmp6, maskHi, reinterpret_cast(&dataHi[i[5]])); - _mm_maskmoveu_si128(tmp7, maskLo, reinterpret_cast(&data[i[6]])); - _mm_maskmoveu_si128(tmp7, maskHi, reinterpret_cast(&dataHi[i[7]])); -#else - interleave(data, i, v0, v1); - v2.scatter(data + 2, i); -#endif - }/*}}}*/ - static inline void interleave(typename V::EntryType *const data, const typename V::IndexType &i,/*{{{*/ - const typename V::AsArg v0, const typename V::AsArg v1, - const typename V::AsArg v2, const typename V::AsArg v3) - { - const m128i tmp0 = _mm_unpacklo_epi16(v0.data(), v2.data()); - const m128i tmp1 = 
_mm_unpackhi_epi16(v0.data(), v2.data()); - const m128i tmp2 = _mm_unpacklo_epi16(v1.data(), v3.data()); - const m128i tmp3 = _mm_unpackhi_epi16(v1.data(), v3.data()); - - const m128i tmp4 = _mm_unpacklo_epi16(tmp0, tmp2); - const m128i tmp5 = _mm_unpackhi_epi16(tmp0, tmp2); - const m128i tmp6 = _mm_unpacklo_epi16(tmp1, tmp3); - const m128i tmp7 = _mm_unpackhi_epi16(tmp1, tmp3); - - _mm_storel_epi64(reinterpret_cast<__m128i *>(&data[i[0]]), tmp4); - _mm_storel_epi64(reinterpret_cast<__m128i *>(&data[i[2]]), tmp5); - _mm_storel_epi64(reinterpret_cast<__m128i *>(&data[i[4]]), tmp6); - _mm_storel_epi64(reinterpret_cast<__m128i *>(&data[i[6]]), tmp7); - _mm_storeh_pi(reinterpret_cast<__m64 *>(&data[i[1]]), _mm_castsi128_ps(tmp4)); - _mm_storeh_pi(reinterpret_cast<__m64 *>(&data[i[3]]), _mm_castsi128_ps(tmp5)); - _mm_storeh_pi(reinterpret_cast<__m64 *>(&data[i[5]]), _mm_castsi128_ps(tmp6)); - _mm_storeh_pi(reinterpret_cast<__m64 *>(&data[i[7]]), _mm_castsi128_ps(tmp7)); - }/*}}}*/ -}; -template struct InterleaveImpl { - static inline void interleave(typename V::EntryType *const data, const typename V::IndexType &i,/*{{{*/ - const typename V::AsArg v0, const typename V::AsArg v1) - { - using namespace Vc::AVX; - // [0a 1a 0b 1b 0e 1e 0f 1f]: - const m256 tmp0 = _mm256_unpacklo_ps(AVX::avx_cast(v0.data()), AVX::avx_cast(v1.data())); - // [0c 1c 0d 1d 0g 1g 0h 1h]: - const m256 tmp1 = _mm256_unpackhi_ps(AVX::avx_cast(v0.data()), AVX::avx_cast(v1.data())); - _mm_storel_pi(reinterpret_cast<__m64 *>(&data[i[0]]), lo128(tmp0)); - _mm_storeh_pi(reinterpret_cast<__m64 *>(&data[i[1]]), lo128(tmp0)); - _mm_storel_pi(reinterpret_cast<__m64 *>(&data[i[2]]), lo128(tmp1)); - _mm_storeh_pi(reinterpret_cast<__m64 *>(&data[i[3]]), lo128(tmp1)); - _mm_storel_pi(reinterpret_cast<__m64 *>(&data[i[4]]), hi128(tmp0)); - _mm_storeh_pi(reinterpret_cast<__m64 *>(&data[i[5]]), hi128(tmp0)); - _mm_storel_pi(reinterpret_cast<__m64 *>(&data[i[6]]), hi128(tmp1)); - _mm_storeh_pi(reinterpret_cast<__m64 *>(&data[i[7]]), hi128(tmp1)); - }/*}}}*/ - static inline void interleave(typename V::EntryType *const data, const typename V::IndexType &i,/*{{{*/ - const typename V::AsArg v0, const typename V::AsArg v1, const typename V::AsArg v2) - { - using namespace Vc::AVX; -#ifdef VC_USE_MASKMOV_SCATTER - // [0a 2a 0b 2b 0e 2e 0f 2f]: - const m256 tmp0 = _mm256_unpacklo_ps(AVX::avx_cast(v0.data()), AVX::avx_cast(v2.data())); - // [0c 2c 0d 2d 0g 2g 0h 2h]: - const m256 tmp1 = _mm256_unpackhi_ps(AVX::avx_cast(v0.data()), AVX::avx_cast(v2.data())); - // [1a __ 1b __ 1e __ 1f __]: - const m256 tmp2 = _mm256_unpacklo_ps(AVX::avx_cast(v1.data()), AVX::avx_cast(v1.data())); - // [1c __ 1d __ 1g __ 1h __]: - const m256 tmp3 = _mm256_unpackhi_ps(AVX::avx_cast(v1.data()), AVX::avx_cast(v1.data())); - const m256 tmp4 = _mm256_unpacklo_ps(tmp0, tmp2); - const m256 tmp5 = _mm256_unpackhi_ps(tmp0, tmp2); - const m256 tmp6 = _mm256_unpacklo_ps(tmp1, tmp3); - const m256 tmp7 = _mm256_unpackhi_ps(tmp1, tmp3); - const m128i mask = _mm_set_epi32(0, -1, -1, -1); - _mm_maskstore_ps(reinterpret_cast(&data[i[0]]), mask, lo128(tmp4)); - _mm_maskstore_ps(reinterpret_cast(&data[i[1]]), mask, lo128(tmp5)); - _mm_maskstore_ps(reinterpret_cast(&data[i[2]]), mask, lo128(tmp6)); - _mm_maskstore_ps(reinterpret_cast(&data[i[3]]), mask, lo128(tmp7)); - _mm_maskstore_ps(reinterpret_cast(&data[i[4]]), mask, hi128(tmp4)); - _mm_maskstore_ps(reinterpret_cast(&data[i[5]]), mask, hi128(tmp5)); - _mm_maskstore_ps(reinterpret_cast(&data[i[6]]), mask, hi128(tmp6)); - 
_mm_maskstore_ps(reinterpret_cast(&data[i[7]]), mask, hi128(tmp7)); -#else - interleave(data, i, v0, v1); - v2.scatter(data + 2, i); -#endif - }/*}}}*/ - static inline void interleave(typename V::EntryType *const data, const typename V::IndexType &i,/*{{{*/ - const typename V::AsArg v0, const typename V::AsArg v1, - const typename V::AsArg v2, const typename V::AsArg v3) - { - using namespace Vc::AVX; - const m256 tmp0 = _mm256_unpacklo_ps(AVX::avx_cast(v0.data()), AVX::avx_cast(v2.data())); - const m256 tmp1 = _mm256_unpackhi_ps(AVX::avx_cast(v0.data()), AVX::avx_cast(v2.data())); - const m256 tmp2 = _mm256_unpacklo_ps(AVX::avx_cast(v1.data()), AVX::avx_cast(v3.data())); - const m256 tmp3 = _mm256_unpackhi_ps(AVX::avx_cast(v1.data()), AVX::avx_cast(v3.data())); - const m256 tmp4 = _mm256_unpacklo_ps(tmp0, tmp2); - const m256 tmp5 = _mm256_unpackhi_ps(tmp0, tmp2); - const m256 tmp6 = _mm256_unpacklo_ps(tmp1, tmp3); - const m256 tmp7 = _mm256_unpackhi_ps(tmp1, tmp3); - _mm_storeu_ps(reinterpret_cast(&data[i[0]]), lo128(tmp4)); - _mm_storeu_ps(reinterpret_cast(&data[i[1]]), lo128(tmp5)); - _mm_storeu_ps(reinterpret_cast(&data[i[2]]), lo128(tmp6)); - _mm_storeu_ps(reinterpret_cast(&data[i[3]]), lo128(tmp7)); - _mm_storeu_ps(reinterpret_cast(&data[i[4]]), hi128(tmp4)); - _mm_storeu_ps(reinterpret_cast(&data[i[5]]), hi128(tmp5)); - _mm_storeu_ps(reinterpret_cast(&data[i[6]]), hi128(tmp6)); - _mm_storeu_ps(reinterpret_cast(&data[i[7]]), hi128(tmp7)); - }/*}}}*/ -}; -template struct InterleaveImpl { - static inline void interleave(typename V::EntryType *const data, const typename V::IndexType &i,/*{{{*/ - const typename V::AsArg v0, const typename V::AsArg v1) - { - using namespace Vc::AVX; - const m256d tmp0 = _mm256_unpacklo_pd(v0.data(), v1.data()); - const m256d tmp1 = _mm256_unpackhi_pd(v0.data(), v1.data()); - _mm_storeu_pd(&data[i[0]], lo128(tmp0)); - _mm_storeu_pd(&data[i[1]], lo128(tmp1)); - _mm_storeu_pd(&data[i[2]], hi128(tmp0)); - _mm_storeu_pd(&data[i[3]], hi128(tmp1)); - }/*}}}*/ - static inline void interleave(typename V::EntryType *const data, const typename V::IndexType &i,/*{{{*/ - const typename V::AsArg v0, const typename V::AsArg v1, const typename V::AsArg v2) - { - using namespace Vc::AVX; -#ifdef VC_USE_MASKMOV_SCATTER - const m256d tmp0 = _mm256_unpacklo_pd(v0.data(), v1.data()); - const m256d tmp1 = _mm256_unpackhi_pd(v0.data(), v1.data()); - const m256d tmp2 = _mm256_unpacklo_pd(v2.data(), v2.data()); - const m256d tmp3 = _mm256_unpackhi_pd(v2.data(), v2.data()); - -#if defined(VC_MSVC) && (VC_MSVC < 170000000 || !defined(_WIN64)) - // MSVC needs to be at Version 2012 before _mm256_set_epi64x works - const m256i mask = AVX::concat(_mm_setallone_si128(), _mm_set_epi32(0, 0, -1, -1)); -#else - const m256i mask = _mm256_set_epi64x(0, -1, -1, -1); -#endif - _mm256_maskstore_pd(&data[i[0]], mask, Mem::shuffle128(tmp0, tmp2)); - _mm256_maskstore_pd(&data[i[1]], mask, Mem::shuffle128(tmp1, tmp3)); - _mm256_maskstore_pd(&data[i[2]], mask, Mem::shuffle128(tmp0, tmp2)); - _mm256_maskstore_pd(&data[i[3]], mask, Mem::shuffle128(tmp1, tmp3)); -#else - interleave(data, i, v0, v1); - v2.scatter(data + 2, i); -#endif - }/*}}}*/ - static inline void interleave(typename V::EntryType *const data, const typename V::IndexType &i,/*{{{*/ - const typename V::AsArg v0, const typename V::AsArg v1, - const typename V::AsArg v2, const typename V::AsArg v3) - { - using namespace Vc::AVX; - // 0a 1a 0c 1c: - const m256d tmp0 = _mm256_unpacklo_pd(v0.data(), v1.data()); - // 0b 1b 0b 1b: - const 
m256d tmp1 = _mm256_unpackhi_pd(v0.data(), v1.data()); - // 2a 3a 2c 3c: - const m256d tmp2 = _mm256_unpacklo_pd(v2.data(), v3.data()); - // 2b 3b 2b 3b: - const m256d tmp3 = _mm256_unpackhi_pd(v2.data(), v3.data()); - _mm256_storeu_pd(&data[i[0]], Mem::shuffle128(tmp0, tmp2)); - _mm256_storeu_pd(&data[i[1]], Mem::shuffle128(tmp1, tmp3)); - _mm256_storeu_pd(&data[i[2]], Mem::shuffle128(tmp0, tmp2)); - _mm256_storeu_pd(&data[i[3]], Mem::shuffle128(tmp1, tmp3)); - }/*}}}*/ -}; -} // anonymous namespace - -template Vc_ALWAYS_INLINE void InterleavedMemoryAccessBase::interleave(const typename V::AsArg v0,/*{{{*/ - const typename V::AsArg v1) -{ - InterleaveImpl::interleave(m_data, m_indexes, v0, v1); -}/*}}}*/ -template Vc_ALWAYS_INLINE void InterleavedMemoryAccessBase::interleave(const typename V::AsArg v0,/*{{{*/ - const typename V::AsArg v1, const typename V::AsArg v2) -{ - InterleaveImpl::interleave(m_data, m_indexes, v0, v1, v2); -}/*}}}*/ -template Vc_ALWAYS_INLINE void InterleavedMemoryAccessBase::interleave(const typename V::AsArg v0,/*{{{*/ - const typename V::AsArg v1, const typename V::AsArg v2, const typename V::AsArg v3) -{ - InterleaveImpl::interleave(m_data, m_indexes, v0, v1, v2, v3); -}/*}}}*/ -template Vc_ALWAYS_INLINE void InterleavedMemoryAccessBase::interleave(const typename V::AsArg v0,/*{{{*/ - const typename V::AsArg v1, const typename V::AsArg v2, const typename V::AsArg v3, const typename V::AsArg v4) -{ - InterleaveImpl::interleave(m_data, m_indexes, v0, v1, v2, v3); - v4.scatter(m_data + 4, m_indexes); -}/*}}}*/ -template Vc_ALWAYS_INLINE void InterleavedMemoryAccessBase::interleave(const typename V::AsArg v0,/*{{{*/ - const typename V::AsArg v1, const typename V::AsArg v2, const typename V::AsArg v3, const typename V::AsArg v4, - const typename V::AsArg v5) -{ - InterleaveImpl::interleave(m_data , m_indexes, v0, v1, v2, v3); - InterleaveImpl::interleave(m_data + 4, m_indexes, v4, v5); -}/*}}}*/ -template Vc_ALWAYS_INLINE void InterleavedMemoryAccessBase::interleave(const typename V::AsArg v0,/*{{{*/ - const typename V::AsArg v1, const typename V::AsArg v2, const typename V::AsArg v3, const typename V::AsArg v4, - const typename V::AsArg v5, const typename V::AsArg v6) -{ - InterleaveImpl::interleave(m_data + 0, m_indexes, v0, v1, v2, v3); - InterleaveImpl::interleave(m_data + 4, m_indexes, v4, v5, v6); -}/*}}}*/ -template Vc_ALWAYS_INLINE void InterleavedMemoryAccessBase::interleave(const typename V::AsArg v0,/*{{{*/ - const typename V::AsArg v1, const typename V::AsArg v2, const typename V::AsArg v3, const typename V::AsArg v4, - const typename V::AsArg v5, const typename V::AsArg v6, const typename V::AsArg v7) -{ - InterleaveImpl::interleave(m_data + 0, m_indexes, v0, v1, v2, v3); - InterleaveImpl::interleave(m_data + 4, m_indexes, v4, v5, v6, v7); -}/*}}}*/ - -template<> inline void InterleavedMemoryAccessBase::deinterleave(float_v &v0, float_v &v1) const/*{{{*/ -{ - const m128 il0 = _mm_loadl_pi(_mm_setzero_ps(), reinterpret_cast<__m64 const *>(&m_data[m_indexes[0]])); // a0 b0 - const m128 il2 = _mm_loadl_pi(_mm_setzero_ps(), reinterpret_cast<__m64 const *>(&m_data[m_indexes[2]])); // a2 b2 - const m128 il4 = _mm_loadl_pi(_mm_setzero_ps(), reinterpret_cast<__m64 const *>(&m_data[m_indexes[4]])); // a4 b4 - const m128 il6 = _mm_loadl_pi(_mm_setzero_ps(), reinterpret_cast<__m64 const *>(&m_data[m_indexes[6]])); // a6 b6 - const m128 il01 = _mm_loadh_pi( il0, reinterpret_cast<__m64 const *>(&m_data[m_indexes[1]])); // a0 b0 a1 b1 - const m128 il23 = _mm_loadh_pi( 
il2, reinterpret_cast<__m64 const *>(&m_data[m_indexes[3]])); // a2 b2 a3 b3 - const m128 il45 = _mm_loadh_pi( il4, reinterpret_cast<__m64 const *>(&m_data[m_indexes[5]])); // a4 b4 a5 b5 - const m128 il67 = _mm_loadh_pi( il6, reinterpret_cast<__m64 const *>(&m_data[m_indexes[7]])); // a6 b6 a7 b7 - - const m256 tmp2 = AVX::concat(il01, il45); - const m256 tmp3 = AVX::concat(il23, il67); - - const m256 tmp0 = _mm256_unpacklo_ps(tmp2, tmp3); - const m256 tmp1 = _mm256_unpackhi_ps(tmp2, tmp3); - - v0.data() = _mm256_unpacklo_ps(tmp0, tmp1); - v1.data() = _mm256_unpackhi_ps(tmp0, tmp1); -}/*}}}*/ -template<> inline void InterleavedMemoryAccessBase::deinterleave(float_v &v0, float_v &v1, float_v &v2) const/*{{{*/ -{ - const m128 il0 = _mm_loadu_ps(&m_data[m_indexes[0]]); // a0 b0 c0 d0 - const m128 il1 = _mm_loadu_ps(&m_data[m_indexes[1]]); // a1 b1 c1 d1 - const m128 il2 = _mm_loadu_ps(&m_data[m_indexes[2]]); // a2 b2 c2 d2 - const m128 il3 = _mm_loadu_ps(&m_data[m_indexes[3]]); // a3 b3 c3 d3 - const m128 il4 = _mm_loadu_ps(&m_data[m_indexes[4]]); // a4 b4 c4 d4 - const m128 il5 = _mm_loadu_ps(&m_data[m_indexes[5]]); // a5 b5 c5 d5 - const m128 il6 = _mm_loadu_ps(&m_data[m_indexes[6]]); // a6 b6 c6 d6 - const m128 il7 = _mm_loadu_ps(&m_data[m_indexes[7]]); // a7 b7 c7 d7 - - const m256 il04 = AVX::concat(il0, il4); - const m256 il15 = AVX::concat(il1, il5); - const m256 il26 = AVX::concat(il2, il6); - const m256 il37 = AVX::concat(il3, il7); - const m256 ab0246 = _mm256_unpacklo_ps(il04, il26); - const m256 ab1357 = _mm256_unpacklo_ps(il15, il37); - const m256 cd0246 = _mm256_unpackhi_ps(il04, il26); - const m256 cd1357 = _mm256_unpackhi_ps(il15, il37); - v0.data() = _mm256_unpacklo_ps(ab0246, ab1357); - v1.data() = _mm256_unpackhi_ps(ab0246, ab1357); - v2.data() = _mm256_unpacklo_ps(cd0246, cd1357); -}/*}}}*/ -template<> inline void InterleavedMemoryAccessBase::deinterleave(float_v &v0, float_v &v1, float_v &v2, float_v &v3) const/*{{{*/ -{ - const m128 il0 = _mm_loadu_ps(&m_data[m_indexes[0]]); // a0 b0 c0 d0 - const m128 il1 = _mm_loadu_ps(&m_data[m_indexes[1]]); // a1 b1 c1 d1 - const m128 il2 = _mm_loadu_ps(&m_data[m_indexes[2]]); // a2 b2 c2 d2 - const m128 il3 = _mm_loadu_ps(&m_data[m_indexes[3]]); // a3 b3 c3 d3 - const m128 il4 = _mm_loadu_ps(&m_data[m_indexes[4]]); // a4 b4 c4 d4 - const m128 il5 = _mm_loadu_ps(&m_data[m_indexes[5]]); // a5 b5 c5 d5 - const m128 il6 = _mm_loadu_ps(&m_data[m_indexes[6]]); // a6 b6 c6 d6 - const m128 il7 = _mm_loadu_ps(&m_data[m_indexes[7]]); // a7 b7 c7 d7 - - const m256 il04 = AVX::concat(il0, il4); - const m256 il15 = AVX::concat(il1, il5); - const m256 il26 = AVX::concat(il2, il6); - const m256 il37 = AVX::concat(il3, il7); - const m256 ab0246 = _mm256_unpacklo_ps(il04, il26); - const m256 ab1357 = _mm256_unpacklo_ps(il15, il37); - const m256 cd0246 = _mm256_unpackhi_ps(il04, il26); - const m256 cd1357 = _mm256_unpackhi_ps(il15, il37); - v0.data() = _mm256_unpacklo_ps(ab0246, ab1357); - v1.data() = _mm256_unpackhi_ps(ab0246, ab1357); - v2.data() = _mm256_unpacklo_ps(cd0246, cd1357); - v3.data() = _mm256_unpackhi_ps(cd0246, cd1357); -}/*}}}*/ -template<> inline void InterleavedMemoryAccessBase::deinterleave(float_v &v0, float_v &v1, float_v &v2, float_v &v3, float_v &v4) const/*{{{*/ -{ - v4.gather(m_data, m_indexes + I(4)); - deinterleave(v0, v1, v2, v3); -}/*}}}*/ -template<> inline void InterleavedMemoryAccessBase::deinterleave(float_v &v0, float_v &v1, float_v &v2, float_v &v3, float_v &v4, float_v &v5) const/*{{{*/ -{ - deinterleave(v0, v1, 
v2, v3); - const m128 il0 = _mm_loadl_pi(_mm_setzero_ps(), reinterpret_cast<__m64 const *>(&m_data[m_indexes[0] + 4])); // a0 b0 - const m128 il2 = _mm_loadl_pi(_mm_setzero_ps(), reinterpret_cast<__m64 const *>(&m_data[m_indexes[2] + 4])); // a2 b2 - const m128 il4 = _mm_loadl_pi(_mm_setzero_ps(), reinterpret_cast<__m64 const *>(&m_data[m_indexes[4] + 4])); // a4 b4 - const m128 il6 = _mm_loadl_pi(_mm_setzero_ps(), reinterpret_cast<__m64 const *>(&m_data[m_indexes[6] + 4])); // a6 b6 - const m128 il01 = _mm_loadh_pi( il0, reinterpret_cast<__m64 const *>(&m_data[m_indexes[1] + 4])); // a0 b0 a1 b1 - const m128 il23 = _mm_loadh_pi( il2, reinterpret_cast<__m64 const *>(&m_data[m_indexes[3] + 4])); // a2 b2 a3 b3 - const m128 il45 = _mm_loadh_pi( il4, reinterpret_cast<__m64 const *>(&m_data[m_indexes[5] + 4])); // a4 b4 a5 b5 - const m128 il67 = _mm_loadh_pi( il6, reinterpret_cast<__m64 const *>(&m_data[m_indexes[7] + 4])); // a6 b6 a7 b7 - - const m256 tmp2 = AVX::concat(il01, il45); - const m256 tmp3 = AVX::concat(il23, il67); - - const m256 tmp0 = _mm256_unpacklo_ps(tmp2, tmp3); - const m256 tmp1 = _mm256_unpackhi_ps(tmp2, tmp3); - - v4.data() = _mm256_unpacklo_ps(tmp0, tmp1); - v5.data() = _mm256_unpackhi_ps(tmp0, tmp1); -}/*}}}*/ -template<> inline void InterleavedMemoryAccessBase::deinterleave(float_v &v0, float_v &v1, float_v &v2, float_v &v3, float_v &v4, float_v &v5, float_v &v6) const/*{{{*/ -{ - deinterleave(v0, v1, v2, v3); - const m128 il0 = _mm_loadu_ps(&m_data[m_indexes[0] + 4]); // a0 b0 c0 d0 - const m128 il1 = _mm_loadu_ps(&m_data[m_indexes[1] + 4]); // a1 b1 c1 d1 - const m128 il2 = _mm_loadu_ps(&m_data[m_indexes[2] + 4]); // a2 b2 c2 d2 - const m128 il3 = _mm_loadu_ps(&m_data[m_indexes[3] + 4]); // a3 b3 c3 d3 - const m128 il4 = _mm_loadu_ps(&m_data[m_indexes[4] + 4]); // a4 b4 c4 d4 - const m128 il5 = _mm_loadu_ps(&m_data[m_indexes[5] + 4]); // a5 b5 c5 d5 - const m128 il6 = _mm_loadu_ps(&m_data[m_indexes[6] + 4]); // a6 b6 c6 d6 - const m128 il7 = _mm_loadu_ps(&m_data[m_indexes[7] + 4]); // a7 b7 c7 d7 - - const m256 il04 = AVX::concat(il0, il4); - const m256 il15 = AVX::concat(il1, il5); - const m256 il26 = AVX::concat(il2, il6); - const m256 il37 = AVX::concat(il3, il7); - const m256 ab0246 = _mm256_unpacklo_ps(il04, il26); - const m256 ab1357 = _mm256_unpacklo_ps(il15, il37); - const m256 cd0246 = _mm256_unpackhi_ps(il04, il26); - const m256 cd1357 = _mm256_unpackhi_ps(il15, il37); - v4.data() = _mm256_unpacklo_ps(ab0246, ab1357); - v5.data() = _mm256_unpackhi_ps(ab0246, ab1357); - v6.data() = _mm256_unpacklo_ps(cd0246, cd1357); -}/*}}}*/ -template<> inline void InterleavedMemoryAccessBase::deinterleave(float_v &v0, float_v &v1, float_v &v2, float_v &v3, float_v &v4, float_v &v5, float_v &v6, float_v &v7) const/*{{{*/ -{ - deinterleave(v0, v1, v2, v3); - const m128 il0 = _mm_loadu_ps(&m_data[m_indexes[0] + 4]); // a0 b0 c0 d0 - const m128 il1 = _mm_loadu_ps(&m_data[m_indexes[1] + 4]); // a1 b1 c1 d1 - const m128 il2 = _mm_loadu_ps(&m_data[m_indexes[2] + 4]); // a2 b2 c2 d2 - const m128 il3 = _mm_loadu_ps(&m_data[m_indexes[3] + 4]); // a3 b3 c3 d3 - const m128 il4 = _mm_loadu_ps(&m_data[m_indexes[4] + 4]); // a4 b4 c4 d4 - const m128 il5 = _mm_loadu_ps(&m_data[m_indexes[5] + 4]); // a5 b5 c5 d5 - const m128 il6 = _mm_loadu_ps(&m_data[m_indexes[6] + 4]); // a6 b6 c6 d6 - const m128 il7 = _mm_loadu_ps(&m_data[m_indexes[7] + 4]); // a7 b7 c7 d7 - - const m256 il04 = AVX::concat(il0, il4); - const m256 il15 = AVX::concat(il1, il5); - const m256 il26 = AVX::concat(il2, il6); 
- const m256 il37 = AVX::concat(il3, il7); - const m256 ab0246 = _mm256_unpacklo_ps(il04, il26); - const m256 ab1357 = _mm256_unpacklo_ps(il15, il37); - const m256 cd0246 = _mm256_unpackhi_ps(il04, il26); - const m256 cd1357 = _mm256_unpackhi_ps(il15, il37); - v4.data() = _mm256_unpacklo_ps(ab0246, ab1357); - v5.data() = _mm256_unpackhi_ps(ab0246, ab1357); - v6.data() = _mm256_unpacklo_ps(cd0246, cd1357); - v7.data() = _mm256_unpackhi_ps(cd0246, cd1357); -}/*}}}*/ - -template<> inline void InterleavedMemoryAccessBase::deinterleave(sfloat_v &v0, sfloat_v &v1) const/*{{{*/ -{ - const m128 il0 = _mm_loadl_pi(_mm_setzero_ps(), reinterpret_cast<__m64 const *>(&m_data[m_indexes[0]])); // a0 b0 - const m128 il2 = _mm_loadl_pi(_mm_setzero_ps(), reinterpret_cast<__m64 const *>(&m_data[m_indexes[2]])); // a2 b2 - const m128 il4 = _mm_loadl_pi(_mm_setzero_ps(), reinterpret_cast<__m64 const *>(&m_data[m_indexes[4]])); // a4 b4 - const m128 il6 = _mm_loadl_pi(_mm_setzero_ps(), reinterpret_cast<__m64 const *>(&m_data[m_indexes[6]])); // a6 b6 - const m128 il01 = _mm_loadh_pi( il0, reinterpret_cast<__m64 const *>(&m_data[m_indexes[1]])); // a0 b0 a1 b1 - const m128 il23 = _mm_loadh_pi( il2, reinterpret_cast<__m64 const *>(&m_data[m_indexes[3]])); // a2 b2 a3 b3 - const m128 il45 = _mm_loadh_pi( il4, reinterpret_cast<__m64 const *>(&m_data[m_indexes[5]])); // a4 b4 a5 b5 - const m128 il67 = _mm_loadh_pi( il6, reinterpret_cast<__m64 const *>(&m_data[m_indexes[7]])); // a6 b6 a7 b7 - - const m256 tmp2 = AVX::concat(il01, il45); - const m256 tmp3 = AVX::concat(il23, il67); - - const m256 tmp0 = _mm256_unpacklo_ps(tmp2, tmp3); - const m256 tmp1 = _mm256_unpackhi_ps(tmp2, tmp3); - - v0.data() = _mm256_unpacklo_ps(tmp0, tmp1); - v1.data() = _mm256_unpackhi_ps(tmp0, tmp1); -}/*}}}*/ -template<> inline void InterleavedMemoryAccessBase::deinterleave(sfloat_v &v0, sfloat_v &v1, sfloat_v &v2) const/*{{{*/ -{ - const m128 il0 = _mm_loadu_ps(&m_data[m_indexes[0]]); // a0 b0 c0 d0 - const m128 il1 = _mm_loadu_ps(&m_data[m_indexes[1]]); // a1 b1 c1 d1 - const m128 il2 = _mm_loadu_ps(&m_data[m_indexes[2]]); // a2 b2 c2 d2 - const m128 il3 = _mm_loadu_ps(&m_data[m_indexes[3]]); // a3 b3 c3 d3 - const m128 il4 = _mm_loadu_ps(&m_data[m_indexes[4]]); // a4 b4 c4 d4 - const m128 il5 = _mm_loadu_ps(&m_data[m_indexes[5]]); // a5 b5 c5 d5 - const m128 il6 = _mm_loadu_ps(&m_data[m_indexes[6]]); // a6 b6 c6 d6 - const m128 il7 = _mm_loadu_ps(&m_data[m_indexes[7]]); // a7 b7 c7 d7 - - const m256 il04 = AVX::concat(il0, il4); - const m256 il15 = AVX::concat(il1, il5); - const m256 il26 = AVX::concat(il2, il6); - const m256 il37 = AVX::concat(il3, il7); - const m256 ab0246 = _mm256_unpacklo_ps(il04, il26); - const m256 ab1357 = _mm256_unpacklo_ps(il15, il37); - const m256 cd0246 = _mm256_unpackhi_ps(il04, il26); - const m256 cd1357 = _mm256_unpackhi_ps(il15, il37); - v0.data() = _mm256_unpacklo_ps(ab0246, ab1357); - v1.data() = _mm256_unpackhi_ps(ab0246, ab1357); - v2.data() = _mm256_unpacklo_ps(cd0246, cd1357); -}/*}}}*/ -template<> inline void InterleavedMemoryAccessBase::deinterleave(sfloat_v &v0, sfloat_v &v1, sfloat_v &v2, sfloat_v &v3) const/*{{{*/ -{ - const m128 il0 = _mm_loadu_ps(&m_data[m_indexes[0]]); // a0 b0 c0 d0 - const m128 il1 = _mm_loadu_ps(&m_data[m_indexes[1]]); // a1 b1 c1 d1 - const m128 il2 = _mm_loadu_ps(&m_data[m_indexes[2]]); // a2 b2 c2 d2 - const m128 il3 = _mm_loadu_ps(&m_data[m_indexes[3]]); // a3 b3 c3 d3 - const m128 il4 = _mm_loadu_ps(&m_data[m_indexes[4]]); // a4 b4 c4 d4 - const m128 il5 = 
_mm_loadu_ps(&m_data[m_indexes[5]]); // a5 b5 c5 d5 - const m128 il6 = _mm_loadu_ps(&m_data[m_indexes[6]]); // a6 b6 c6 d6 - const m128 il7 = _mm_loadu_ps(&m_data[m_indexes[7]]); // a7 b7 c7 d7 - - const m256 il04 = AVX::concat(il0, il4); - const m256 il15 = AVX::concat(il1, il5); - const m256 il26 = AVX::concat(il2, il6); - const m256 il37 = AVX::concat(il3, il7); - const m256 ab0246 = _mm256_unpacklo_ps(il04, il26); - const m256 ab1357 = _mm256_unpacklo_ps(il15, il37); - const m256 cd0246 = _mm256_unpackhi_ps(il04, il26); - const m256 cd1357 = _mm256_unpackhi_ps(il15, il37); - v0.data() = _mm256_unpacklo_ps(ab0246, ab1357); - v1.data() = _mm256_unpackhi_ps(ab0246, ab1357); - v2.data() = _mm256_unpacklo_ps(cd0246, cd1357); - v3.data() = _mm256_unpackhi_ps(cd0246, cd1357); -}/*}}}*/ -template<> inline void InterleavedMemoryAccessBase::deinterleave(sfloat_v &v0, sfloat_v &v1, sfloat_v &v2, sfloat_v &v3, sfloat_v &v4) const/*{{{*/ -{ - v4.gather(m_data, m_indexes + I(4)); - deinterleave(v0, v1, v2, v3); -}/*}}}*/ -template<> inline void InterleavedMemoryAccessBase::deinterleave(sfloat_v &v0, sfloat_v &v1, sfloat_v &v2, sfloat_v &v3, sfloat_v &v4, sfloat_v &v5) const/*{{{*/ -{ - deinterleave(v0, v1, v2, v3); - const m128 il0 = _mm_loadl_pi(_mm_setzero_ps(), reinterpret_cast<__m64 const *>(&m_data[m_indexes[0] + 4])); // a0 b0 - const m128 il2 = _mm_loadl_pi(_mm_setzero_ps(), reinterpret_cast<__m64 const *>(&m_data[m_indexes[2] + 4])); // a2 b2 - const m128 il4 = _mm_loadl_pi(_mm_setzero_ps(), reinterpret_cast<__m64 const *>(&m_data[m_indexes[4] + 4])); // a4 b4 - const m128 il6 = _mm_loadl_pi(_mm_setzero_ps(), reinterpret_cast<__m64 const *>(&m_data[m_indexes[6] + 4])); // a6 b6 - const m128 il01 = _mm_loadh_pi( il0, reinterpret_cast<__m64 const *>(&m_data[m_indexes[1] + 4])); // a0 b0 a1 b1 - const m128 il23 = _mm_loadh_pi( il2, reinterpret_cast<__m64 const *>(&m_data[m_indexes[3] + 4])); // a2 b2 a3 b3 - const m128 il45 = _mm_loadh_pi( il4, reinterpret_cast<__m64 const *>(&m_data[m_indexes[5] + 4])); // a4 b4 a5 b5 - const m128 il67 = _mm_loadh_pi( il6, reinterpret_cast<__m64 const *>(&m_data[m_indexes[7] + 4])); // a6 b6 a7 b7 - - const m256 tmp2 = AVX::concat(il01, il45); - const m256 tmp3 = AVX::concat(il23, il67); - - const m256 tmp0 = _mm256_unpacklo_ps(tmp2, tmp3); - const m256 tmp1 = _mm256_unpackhi_ps(tmp2, tmp3); - - v4.data() = _mm256_unpacklo_ps(tmp0, tmp1); - v5.data() = _mm256_unpackhi_ps(tmp0, tmp1); -}/*}}}*/ -template<> inline void InterleavedMemoryAccessBase::deinterleave(sfloat_v &v0, sfloat_v &v1, sfloat_v &v2, sfloat_v &v3, sfloat_v &v4, sfloat_v &v5, sfloat_v &v6) const/*{{{*/ -{ - deinterleave(v0, v1, v2, v3); - const m128 il0 = _mm_loadu_ps(&m_data[m_indexes[0] + 4]); // a0 b0 c0 d0 - const m128 il1 = _mm_loadu_ps(&m_data[m_indexes[1] + 4]); // a1 b1 c1 d1 - const m128 il2 = _mm_loadu_ps(&m_data[m_indexes[2] + 4]); // a2 b2 c2 d2 - const m128 il3 = _mm_loadu_ps(&m_data[m_indexes[3] + 4]); // a3 b3 c3 d3 - const m128 il4 = _mm_loadu_ps(&m_data[m_indexes[4] + 4]); // a4 b4 c4 d4 - const m128 il5 = _mm_loadu_ps(&m_data[m_indexes[5] + 4]); // a5 b5 c5 d5 - const m128 il6 = _mm_loadu_ps(&m_data[m_indexes[6] + 4]); // a6 b6 c6 d6 - const m128 il7 = _mm_loadu_ps(&m_data[m_indexes[7] + 4]); // a7 b7 c7 d7 - - const m256 il04 = AVX::concat(il0, il4); - const m256 il15 = AVX::concat(il1, il5); - const m256 il26 = AVX::concat(il2, il6); - const m256 il37 = AVX::concat(il3, il7); - const m256 ab0246 = _mm256_unpacklo_ps(il04, il26); - const m256 ab1357 = _mm256_unpacklo_ps(il15, 
il37); - const m256 cd0246 = _mm256_unpackhi_ps(il04, il26); - const m256 cd1357 = _mm256_unpackhi_ps(il15, il37); - v4.data() = _mm256_unpacklo_ps(ab0246, ab1357); - v5.data() = _mm256_unpackhi_ps(ab0246, ab1357); - v6.data() = _mm256_unpacklo_ps(cd0246, cd1357); -}/*}}}*/ -template<> inline void InterleavedMemoryAccessBase::deinterleave(sfloat_v &v0, sfloat_v &v1, sfloat_v &v2, sfloat_v &v3, sfloat_v &v4, sfloat_v &v5, sfloat_v &v6, sfloat_v &v7) const/*{{{*/ -{ - deinterleave(v0, v1, v2, v3); - const m128 il0 = _mm_loadu_ps(&m_data[m_indexes[0] + 4]); // a0 b0 c0 d0 - const m128 il1 = _mm_loadu_ps(&m_data[m_indexes[1] + 4]); // a1 b1 c1 d1 - const m128 il2 = _mm_loadu_ps(&m_data[m_indexes[2] + 4]); // a2 b2 c2 d2 - const m128 il3 = _mm_loadu_ps(&m_data[m_indexes[3] + 4]); // a3 b3 c3 d3 - const m128 il4 = _mm_loadu_ps(&m_data[m_indexes[4] + 4]); // a4 b4 c4 d4 - const m128 il5 = _mm_loadu_ps(&m_data[m_indexes[5] + 4]); // a5 b5 c5 d5 - const m128 il6 = _mm_loadu_ps(&m_data[m_indexes[6] + 4]); // a6 b6 c6 d6 - const m128 il7 = _mm_loadu_ps(&m_data[m_indexes[7] + 4]); // a7 b7 c7 d7 - - const m256 il04 = AVX::concat(il0, il4); - const m256 il15 = AVX::concat(il1, il5); - const m256 il26 = AVX::concat(il2, il6); - const m256 il37 = AVX::concat(il3, il7); - const m256 ab0246 = _mm256_unpacklo_ps(il04, il26); - const m256 ab1357 = _mm256_unpacklo_ps(il15, il37); - const m256 cd0246 = _mm256_unpackhi_ps(il04, il26); - const m256 cd1357 = _mm256_unpackhi_ps(il15, il37); - v4.data() = _mm256_unpacklo_ps(ab0246, ab1357); - v5.data() = _mm256_unpackhi_ps(ab0246, ab1357); - v6.data() = _mm256_unpacklo_ps(cd0246, cd1357); - v7.data() = _mm256_unpackhi_ps(cd0246, cd1357); -}/*}}}*/ - -static Vc_ALWAYS_INLINE void _avx_deinterleave_double(const double *VC_RESTRICT data, const uint_v &indexes, double_v &v0, double_v &v1)/*{{{*/ -{ - const m256d ab02 = AVX::concat(_mm_loadu_pd(&data[indexes[0]]), _mm_loadu_pd(&data[indexes[2]])); - const m256d ab13 = AVX::concat(_mm_loadu_pd(&data[indexes[1]]), _mm_loadu_pd(&data[indexes[3]])); - - v0.data() = _mm256_unpacklo_pd(ab02, ab13); - v1.data() = _mm256_unpackhi_pd(ab02, ab13); -}/*}}}*/ -template<> inline void InterleavedMemoryAccessBase::deinterleave(double_v &v0, double_v &v1) const/*{{{*/ -{ - _avx_deinterleave_double(m_data , m_indexes, v0, v1); -}/*}}}*/ -template<> inline void InterleavedMemoryAccessBase::deinterleave(double_v &v0, double_v &v1, double_v &v2) const/*{{{*/ -{ - _avx_deinterleave_double(m_data , m_indexes, v0, v1); - v2.gather(m_data + 2, m_indexes); -}/*}}}*/ -template<> inline void InterleavedMemoryAccessBase::deinterleave(double_v &v0, double_v &v1, double_v &v2, double_v &v3) const/*{{{*/ -{ - _avx_deinterleave_double(m_data , m_indexes, v0, v1); - _avx_deinterleave_double(m_data + 2, m_indexes, v2, v3); -}/*}}}*/ -template<> inline void InterleavedMemoryAccessBase::deinterleave(double_v &v0, double_v &v1, double_v &v2, double_v &v3, double_v &v4) const/*{{{*/ -{ - _avx_deinterleave_double(m_data , m_indexes, v0, v1); - _avx_deinterleave_double(m_data + 2, m_indexes, v2, v3); - v4.gather(m_data + 4, m_indexes); -}/*}}}*/ -template<> inline void InterleavedMemoryAccessBase::deinterleave(double_v &v0, double_v &v1, double_v &v2, double_v &v3, double_v &v4, double_v &v5) const/*{{{*/ -{ - _avx_deinterleave_double(m_data , m_indexes, v0, v1); - _avx_deinterleave_double(m_data + 2, m_indexes, v2, v3); - _avx_deinterleave_double(m_data + 4, m_indexes, v4, v5); -}/*}}}*/ -template<> inline void 
InterleavedMemoryAccessBase::deinterleave(double_v &v0, double_v &v1, double_v &v2, double_v &v3, double_v &v4, double_v &v5, double_v &v6) const/*{{{*/ -{ - _avx_deinterleave_double(m_data , m_indexes, v0, v1); - _avx_deinterleave_double(m_data + 2, m_indexes, v2, v3); - _avx_deinterleave_double(m_data + 4, m_indexes, v4, v5); - v6.gather(m_data + 6, m_indexes); -}/*}}}*/ -template<> inline void InterleavedMemoryAccessBase::deinterleave(double_v &v0, double_v &v1, double_v &v2, double_v &v3, double_v &v4, double_v &v5, double_v &v6, double_v &v7) const/*{{{*/ -{ - _avx_deinterleave_double(m_data , m_indexes, v0, v1); - _avx_deinterleave_double(m_data + 2, m_indexes, v2, v3); - _avx_deinterleave_double(m_data + 4, m_indexes, v4, v5); - _avx_deinterleave_double(m_data + 6, m_indexes, v6, v7); -}/*}}}*/ - -template<> inline void InterleavedMemoryAccessBase::deinterleave(short_v &v0, short_v &v1) const {/*{{{*/ - const m128i a = _mm_cvtsi32_si128(*reinterpret_cast(&m_data[m_indexes[0]])); - const m128i b = _mm_cvtsi32_si128(*reinterpret_cast(&m_data[m_indexes[1]])); - const m128i c = _mm_cvtsi32_si128(*reinterpret_cast(&m_data[m_indexes[2]])); - const m128i d = _mm_cvtsi32_si128(*reinterpret_cast(&m_data[m_indexes[3]])); - const m128i e = _mm_cvtsi32_si128(*reinterpret_cast(&m_data[m_indexes[4]])); - const m128i f = _mm_cvtsi32_si128(*reinterpret_cast(&m_data[m_indexes[5]])); - const m128i g = _mm_cvtsi32_si128(*reinterpret_cast(&m_data[m_indexes[6]])); - const m128i h = _mm_cvtsi32_si128(*reinterpret_cast(&m_data[m_indexes[7]])); - - const m128i tmp2 = _mm_unpacklo_epi16(a, e); // a0 a4 b0 b4 c0 c4 d0 d4 - const m128i tmp3 = _mm_unpacklo_epi16(c, g); // a2 a6 b2 b6 c2 c6 d2 d6 - const m128i tmp4 = _mm_unpacklo_epi16(b, f); // a1 a5 b1 b5 c1 c5 d1 d5 - const m128i tmp5 = _mm_unpacklo_epi16(d, h); // a3 a7 b3 b7 c3 c7 d3 d7 - - const m128i tmp0 = _mm_unpacklo_epi16(tmp2, tmp3); // a0 a2 a4 a6 b0 b2 b4 b6 - const m128i tmp1 = _mm_unpacklo_epi16(tmp4, tmp5); // a1 a3 a5 a7 b1 b3 b5 b7 - - v0.data() = _mm_unpacklo_epi16(tmp0, tmp1); - v1.data() = _mm_unpackhi_epi16(tmp0, tmp1); -} -/*}}}*/ -template<> inline void InterleavedMemoryAccessBase::deinterleave(short_v &v0, short_v &v1,/*{{{*/ - short_v &v2) const { - const m128i a = _mm_loadl_epi64(reinterpret_cast(&m_data[m_indexes[0]])); - const m128i b = _mm_loadl_epi64(reinterpret_cast(&m_data[m_indexes[1]])); - const m128i c = _mm_loadl_epi64(reinterpret_cast(&m_data[m_indexes[2]])); - const m128i d = _mm_loadl_epi64(reinterpret_cast(&m_data[m_indexes[3]])); - const m128i e = _mm_loadl_epi64(reinterpret_cast(&m_data[m_indexes[4]])); - const m128i f = _mm_loadl_epi64(reinterpret_cast(&m_data[m_indexes[5]])); - const m128i g = _mm_loadl_epi64(reinterpret_cast(&m_data[m_indexes[6]])); - const m128i h = _mm_loadl_epi64(reinterpret_cast(&m_data[m_indexes[7]])); - - const m128i tmp2 = _mm_unpacklo_epi16(a, e); // a0 a4 b0 b4 c0 c4 d0 d4 - const m128i tmp4 = _mm_unpacklo_epi16(b, f); // a1 a5 b1 b5 c1 c5 d1 d5 - const m128i tmp3 = _mm_unpacklo_epi16(c, g); // a2 a6 b2 b6 c2 c6 d2 d6 - const m128i tmp5 = _mm_unpacklo_epi16(d, h); // a3 a7 b3 b7 c3 c7 d3 d7 - - const m128i tmp0 = _mm_unpacklo_epi16(tmp2, tmp3); // a0 a2 a4 a6 b0 b2 b4 b6 - const m128i tmp1 = _mm_unpacklo_epi16(tmp4, tmp5); // a1 a3 a5 a7 b1 b3 b5 b7 - const m128i tmp6 = _mm_unpackhi_epi16(tmp2, tmp3); // c0 c2 c4 c6 d0 d2 d4 d6 - const m128i tmp7 = _mm_unpackhi_epi16(tmp4, tmp5); // c1 c3 c5 c7 d1 d3 d5 d7 - - v0.data() = _mm_unpacklo_epi16(tmp0, tmp1); - v1.data() = 
_mm_unpackhi_epi16(tmp0, tmp1); - v2.data() = _mm_unpacklo_epi16(tmp6, tmp7); -}/*}}}*/ -template<> inline void InterleavedMemoryAccessBase::deinterleave(short_v &v0, short_v &v1,/*{{{*/ - short_v &v2, short_v &v3) const { - const m128i a = _mm_loadl_epi64(reinterpret_cast(&m_data[m_indexes[0]])); - const m128i b = _mm_loadl_epi64(reinterpret_cast(&m_data[m_indexes[1]])); - const m128i c = _mm_loadl_epi64(reinterpret_cast(&m_data[m_indexes[2]])); - const m128i d = _mm_loadl_epi64(reinterpret_cast(&m_data[m_indexes[3]])); - const m128i e = _mm_loadl_epi64(reinterpret_cast(&m_data[m_indexes[4]])); - const m128i f = _mm_loadl_epi64(reinterpret_cast(&m_data[m_indexes[5]])); - const m128i g = _mm_loadl_epi64(reinterpret_cast(&m_data[m_indexes[6]])); - const m128i h = _mm_loadl_epi64(reinterpret_cast(&m_data[m_indexes[7]])); - - const m128i tmp2 = _mm_unpacklo_epi16(a, e); // a0 a4 b0 b4 c0 c4 d0 d4 - const m128i tmp4 = _mm_unpacklo_epi16(b, f); // a1 a5 b1 b5 c1 c5 d1 d5 - const m128i tmp3 = _mm_unpacklo_epi16(c, g); // a2 a6 b2 b6 c2 c6 d2 d6 - const m128i tmp5 = _mm_unpacklo_epi16(d, h); // a3 a7 b3 b7 c3 c7 d3 d7 - - const m128i tmp0 = _mm_unpacklo_epi16(tmp2, tmp3); // a0 a2 a4 a6 b0 b2 b4 b6 - const m128i tmp1 = _mm_unpacklo_epi16(tmp4, tmp5); // a1 a3 a5 a7 b1 b3 b5 b7 - const m128i tmp6 = _mm_unpackhi_epi16(tmp2, tmp3); // c0 c2 c4 c6 d0 d2 d4 d6 - const m128i tmp7 = _mm_unpackhi_epi16(tmp4, tmp5); // c1 c3 c5 c7 d1 d3 d5 d7 - - v0.data() = _mm_unpacklo_epi16(tmp0, tmp1); - v1.data() = _mm_unpackhi_epi16(tmp0, tmp1); - v2.data() = _mm_unpacklo_epi16(tmp6, tmp7); - v3.data() = _mm_unpackhi_epi16(tmp6, tmp7); -}/*}}}*/ -template<> inline void InterleavedMemoryAccessBase::deinterleave(short_v &v0, short_v &v1,/*{{{*/ - short_v &v2, short_v &v3, short_v &v4) const { - const m128i a = _mm_loadu_si128(reinterpret_cast(&m_data[m_indexes[0]])); - const m128i b = _mm_loadu_si128(reinterpret_cast(&m_data[m_indexes[1]])); - const m128i c = _mm_loadu_si128(reinterpret_cast(&m_data[m_indexes[2]])); - const m128i d = _mm_loadu_si128(reinterpret_cast(&m_data[m_indexes[3]])); - const m128i e = _mm_loadu_si128(reinterpret_cast(&m_data[m_indexes[4]])); - const m128i f = _mm_loadu_si128(reinterpret_cast(&m_data[m_indexes[5]])); - const m128i g = _mm_loadu_si128(reinterpret_cast(&m_data[m_indexes[6]])); - const m128i h = _mm_loadu_si128(reinterpret_cast(&m_data[m_indexes[7]])); - - const m128i tmp2 = _mm_unpacklo_epi16(a, e); // a0 a4 b0 b4 c0 c4 d0 d4 - const m128i tmp4 = _mm_unpacklo_epi16(b, f); // a1 a5 b1 b5 c1 c5 d1 d5 - const m128i tmp3 = _mm_unpacklo_epi16(c, g); // a2 a6 b2 b6 c2 c6 d2 d6 - const m128i tmp5 = _mm_unpacklo_epi16(d, h); // a3 a7 b3 b7 c3 c7 d3 d7 - const m128i tmp10 = _mm_unpackhi_epi16(a, e); // e0 e4 f0 f4 g0 g4 h0 h4 - const m128i tmp11 = _mm_unpackhi_epi16(c, g); // e1 e5 f1 f5 g1 g5 h1 h5 - const m128i tmp12 = _mm_unpackhi_epi16(b, f); // e2 e6 f2 f6 g2 g6 h2 h6 - const m128i tmp13 = _mm_unpackhi_epi16(d, h); // e3 e7 f3 f7 g3 g7 h3 h7 - - const m128i tmp0 = _mm_unpacklo_epi16(tmp2, tmp3); // a0 a2 a4 a6 b0 b2 b4 b6 - const m128i tmp1 = _mm_unpacklo_epi16(tmp4, tmp5); // a1 a3 a5 a7 b1 b3 b5 b7 - const m128i tmp6 = _mm_unpackhi_epi16(tmp2, tmp3); // c0 c2 c4 c6 d0 d2 d4 d6 - const m128i tmp7 = _mm_unpackhi_epi16(tmp4, tmp5); // c1 c3 c5 c7 d1 d3 d5 d7 - const m128i tmp8 = _mm_unpacklo_epi16(tmp10, tmp11); // e0 e2 e4 e6 f0 f2 f4 f6 - const m128i tmp9 = _mm_unpacklo_epi16(tmp12, tmp13); // e1 e3 e5 e7 f1 f3 f5 f7 - - v0.data() = _mm_unpacklo_epi16(tmp0, tmp1); - v1.data() = 
_mm_unpackhi_epi16(tmp0, tmp1); - v2.data() = _mm_unpacklo_epi16(tmp6, tmp7); - v3.data() = _mm_unpackhi_epi16(tmp6, tmp7); - v4.data() = _mm_unpacklo_epi16(tmp8, tmp9); -}/*}}}*/ -template<> inline void InterleavedMemoryAccessBase::deinterleave(short_v &v0, short_v &v1,/*{{{*/ - short_v &v2, short_v &v3, short_v &v4, short_v &v5) const { - const m128i a = _mm_loadu_si128(reinterpret_cast(&m_data[m_indexes[0]])); - const m128i b = _mm_loadu_si128(reinterpret_cast(&m_data[m_indexes[1]])); - const m128i c = _mm_loadu_si128(reinterpret_cast(&m_data[m_indexes[2]])); - const m128i d = _mm_loadu_si128(reinterpret_cast(&m_data[m_indexes[3]])); - const m128i e = _mm_loadu_si128(reinterpret_cast(&m_data[m_indexes[4]])); - const m128i f = _mm_loadu_si128(reinterpret_cast(&m_data[m_indexes[5]])); - const m128i g = _mm_loadu_si128(reinterpret_cast(&m_data[m_indexes[6]])); - const m128i h = _mm_loadu_si128(reinterpret_cast(&m_data[m_indexes[7]])); - - const m128i tmp2 = _mm_unpacklo_epi16(a, e); // a0 a4 b0 b4 c0 c4 d0 d4 - const m128i tmp4 = _mm_unpacklo_epi16(b, f); // a1 a5 b1 b5 c1 c5 d1 d5 - const m128i tmp3 = _mm_unpacklo_epi16(c, g); // a2 a6 b2 b6 c2 c6 d2 d6 - const m128i tmp5 = _mm_unpacklo_epi16(d, h); // a3 a7 b3 b7 c3 c7 d3 d7 - const m128i tmp10 = _mm_unpackhi_epi16(a, e); // e0 e4 f0 f4 g0 g4 h0 h4 - const m128i tmp11 = _mm_unpackhi_epi16(c, g); // e1 e5 f1 f5 g1 g5 h1 h5 - const m128i tmp12 = _mm_unpackhi_epi16(b, f); // e2 e6 f2 f6 g2 g6 h2 h6 - const m128i tmp13 = _mm_unpackhi_epi16(d, h); // e3 e7 f3 f7 g3 g7 h3 h7 - - const m128i tmp0 = _mm_unpacklo_epi16(tmp2, tmp3); // a0 a2 a4 a6 b0 b2 b4 b6 - const m128i tmp1 = _mm_unpacklo_epi16(tmp4, tmp5); // a1 a3 a5 a7 b1 b3 b5 b7 - const m128i tmp6 = _mm_unpackhi_epi16(tmp2, tmp3); // c0 c2 c4 c6 d0 d2 d4 d6 - const m128i tmp7 = _mm_unpackhi_epi16(tmp4, tmp5); // c1 c3 c5 c7 d1 d3 d5 d7 - const m128i tmp8 = _mm_unpacklo_epi16(tmp10, tmp11); // e0 e2 e4 e6 f0 f2 f4 f6 - const m128i tmp9 = _mm_unpacklo_epi16(tmp12, tmp13); // e1 e3 e5 e7 f1 f3 f5 f7 - - v0.data() = _mm_unpacklo_epi16(tmp0, tmp1); - v1.data() = _mm_unpackhi_epi16(tmp0, tmp1); - v2.data() = _mm_unpacklo_epi16(tmp6, tmp7); - v3.data() = _mm_unpackhi_epi16(tmp6, tmp7); - v4.data() = _mm_unpacklo_epi16(tmp8, tmp9); - v5.data() = _mm_unpackhi_epi16(tmp8, tmp9); -}/*}}}*/ -template<> inline void InterleavedMemoryAccessBase::deinterleave(short_v &v0, short_v &v1,/*{{{*/ - short_v &v2, short_v &v3, short_v &v4, short_v &v5, short_v &v6) const { - const m128i a = _mm_loadu_si128(reinterpret_cast(&m_data[m_indexes[0]])); - const m128i b = _mm_loadu_si128(reinterpret_cast(&m_data[m_indexes[1]])); - const m128i c = _mm_loadu_si128(reinterpret_cast(&m_data[m_indexes[2]])); - const m128i d = _mm_loadu_si128(reinterpret_cast(&m_data[m_indexes[3]])); - const m128i e = _mm_loadu_si128(reinterpret_cast(&m_data[m_indexes[4]])); - const m128i f = _mm_loadu_si128(reinterpret_cast(&m_data[m_indexes[5]])); - const m128i g = _mm_loadu_si128(reinterpret_cast(&m_data[m_indexes[6]])); - const m128i h = _mm_loadu_si128(reinterpret_cast(&m_data[m_indexes[7]])); - - const m128i tmp2 = _mm_unpacklo_epi16(a, e); // a0 a4 b0 b4 c0 c4 d0 d4 - const m128i tmp4 = _mm_unpacklo_epi16(b, f); // a1 a5 b1 b5 c1 c5 d1 d5 - const m128i tmp3 = _mm_unpacklo_epi16(c, g); // a2 a6 b2 b6 c2 c6 d2 d6 - const m128i tmp5 = _mm_unpacklo_epi16(d, h); // a3 a7 b3 b7 c3 c7 d3 d7 - const m128i tmp10 = _mm_unpackhi_epi16(a, e); // e0 e4 f0 f4 g0 g4 h0 h4 - const m128i tmp11 = _mm_unpackhi_epi16(c, g); // e1 e5 f1 f5 g1 g5 h1 h5 - 
const m128i tmp12 = _mm_unpackhi_epi16(b, f); // e2 e6 f2 f6 g2 g6 h2 h6 - const m128i tmp13 = _mm_unpackhi_epi16(d, h); // e3 e7 f3 f7 g3 g7 h3 h7 - - const m128i tmp0 = _mm_unpacklo_epi16(tmp2, tmp3); // a0 a2 a4 a6 b0 b2 b4 b6 - const m128i tmp1 = _mm_unpacklo_epi16(tmp4, tmp5); // a1 a3 a5 a7 b1 b3 b5 b7 - const m128i tmp6 = _mm_unpackhi_epi16(tmp2, tmp3); // c0 c2 c4 c6 d0 d2 d4 d6 - const m128i tmp7 = _mm_unpackhi_epi16(tmp4, tmp5); // c1 c3 c5 c7 d1 d3 d5 d7 - const m128i tmp8 = _mm_unpacklo_epi16(tmp10, tmp11); // e0 e2 e4 e6 f0 f2 f4 f6 - const m128i tmp9 = _mm_unpacklo_epi16(tmp12, tmp13); // e1 e3 e5 e7 f1 f3 f5 f7 - const m128i tmp14 = _mm_unpackhi_epi16(tmp10, tmp11); // g0 g2 g4 g6 h0 h2 h4 h6 - const m128i tmp15 = _mm_unpackhi_epi16(tmp12, tmp13); // g1 g3 g5 g7 h1 h3 h5 h7 - - v0.data() = _mm_unpacklo_epi16(tmp0, tmp1); - v1.data() = _mm_unpackhi_epi16(tmp0, tmp1); - v2.data() = _mm_unpacklo_epi16(tmp6, tmp7); - v3.data() = _mm_unpackhi_epi16(tmp6, tmp7); - v4.data() = _mm_unpacklo_epi16(tmp8, tmp9); - v5.data() = _mm_unpackhi_epi16(tmp8, tmp9); - v6.data() = _mm_unpacklo_epi16(tmp14, tmp15); -}/*}}}*/ -template<> inline void InterleavedMemoryAccessBase::deinterleave(short_v &v0, short_v &v1,/*{{{*/ - short_v &v2, short_v &v3, short_v &v4, short_v &v5, short_v &v6, short_v &v7) const { - const m128i a = _mm_loadu_si128(reinterpret_cast(&m_data[m_indexes[0]])); - const m128i b = _mm_loadu_si128(reinterpret_cast(&m_data[m_indexes[1]])); - const m128i c = _mm_loadu_si128(reinterpret_cast(&m_data[m_indexes[2]])); - const m128i d = _mm_loadu_si128(reinterpret_cast(&m_data[m_indexes[3]])); - const m128i e = _mm_loadu_si128(reinterpret_cast(&m_data[m_indexes[4]])); - const m128i f = _mm_loadu_si128(reinterpret_cast(&m_data[m_indexes[5]])); - const m128i g = _mm_loadu_si128(reinterpret_cast(&m_data[m_indexes[6]])); - const m128i h = _mm_loadu_si128(reinterpret_cast(&m_data[m_indexes[7]])); - - const m128i tmp2 = _mm_unpacklo_epi16(a, e); // a0 a4 b0 b4 c0 c4 d0 d4 - const m128i tmp4 = _mm_unpacklo_epi16(b, f); // a1 a5 b1 b5 c1 c5 d1 d5 - const m128i tmp3 = _mm_unpacklo_epi16(c, g); // a2 a6 b2 b6 c2 c6 d2 d6 - const m128i tmp5 = _mm_unpacklo_epi16(d, h); // a3 a7 b3 b7 c3 c7 d3 d7 - const m128i tmp10 = _mm_unpackhi_epi16(a, e); // e0 e4 f0 f4 g0 g4 h0 h4 - const m128i tmp11 = _mm_unpackhi_epi16(c, g); // e1 e5 f1 f5 g1 g5 h1 h5 - const m128i tmp12 = _mm_unpackhi_epi16(b, f); // e2 e6 f2 f6 g2 g6 h2 h6 - const m128i tmp13 = _mm_unpackhi_epi16(d, h); // e3 e7 f3 f7 g3 g7 h3 h7 - - const m128i tmp0 = _mm_unpacklo_epi16(tmp2, tmp3); // a0 a2 a4 a6 b0 b2 b4 b6 - const m128i tmp1 = _mm_unpacklo_epi16(tmp4, tmp5); // a1 a3 a5 a7 b1 b3 b5 b7 - const m128i tmp6 = _mm_unpackhi_epi16(tmp2, tmp3); // c0 c2 c4 c6 d0 d2 d4 d6 - const m128i tmp7 = _mm_unpackhi_epi16(tmp4, tmp5); // c1 c3 c5 c7 d1 d3 d5 d7 - const m128i tmp8 = _mm_unpacklo_epi16(tmp10, tmp11); // e0 e2 e4 e6 f0 f2 f4 f6 - const m128i tmp9 = _mm_unpacklo_epi16(tmp12, tmp13); // e1 e3 e5 e7 f1 f3 f5 f7 - const m128i tmp14 = _mm_unpackhi_epi16(tmp10, tmp11); // g0 g2 g4 g6 h0 h2 h4 h6 - const m128i tmp15 = _mm_unpackhi_epi16(tmp12, tmp13); // g1 g3 g5 g7 h1 h3 h5 h7 - - v0.data() = _mm_unpacklo_epi16(tmp0, tmp1); - v1.data() = _mm_unpackhi_epi16(tmp0, tmp1); - v2.data() = _mm_unpacklo_epi16(tmp6, tmp7); - v3.data() = _mm_unpackhi_epi16(tmp6, tmp7); - v4.data() = _mm_unpacklo_epi16(tmp8, tmp9); - v5.data() = _mm_unpackhi_epi16(tmp8, tmp9); - v6.data() = _mm_unpacklo_epi16(tmp14, tmp15); - v7.data() = _mm_unpackhi_epi16(tmp14, tmp15); 
-}/*}}}*/ - -// forward types of equal size - ugly, but it works/*{{{*/ -#define _forward(V, V2) \ -template<> Vc_ALWAYS_INLINE void InterleavedMemoryAccessBase::deinterleave(V &v0, V &v1) const { \ - reinterpret_cast *>(this)->deinterleave(reinterpret_cast(v0), reinterpret_cast(v1)); \ -} \ -template<> Vc_ALWAYS_INLINE void InterleavedMemoryAccessBase::deinterleave(V &v0, V &v1, V &v2) const { \ - reinterpret_cast *>(this)->deinterleave(reinterpret_cast(v0), reinterpret_cast(v1), \ - reinterpret_cast(v2)); \ -} \ -template<> Vc_ALWAYS_INLINE void InterleavedMemoryAccessBase::deinterleave(V &v0, V &v1, V &v2, V &v3) const { \ - reinterpret_cast *>(this)->deinterleave(reinterpret_cast(v0), reinterpret_cast(v1), \ - reinterpret_cast(v2), reinterpret_cast(v3)); \ -} \ -template<> Vc_ALWAYS_INLINE void InterleavedMemoryAccessBase::deinterleave(V &v0, V &v1, V &v2, V &v3, \ - V &v4) const { \ - reinterpret_cast *>(this)->deinterleave(reinterpret_cast(v0), reinterpret_cast(v1), \ - reinterpret_cast(v2), reinterpret_cast(v3), reinterpret_cast(v4)); \ -} \ -template<> Vc_ALWAYS_INLINE void InterleavedMemoryAccessBase::deinterleave(V &v0, V &v1, V &v2, V &v3, \ - V &v4, V &v5) const { \ - reinterpret_cast *>(this)->deinterleave(reinterpret_cast(v0), reinterpret_cast(v1), \ - reinterpret_cast(v2), reinterpret_cast(v3), reinterpret_cast(v4), \ - reinterpret_cast(v5)); \ -} \ -template<> Vc_ALWAYS_INLINE void InterleavedMemoryAccessBase::deinterleave(V &v0, V &v1, V &v2, V &v3, \ - V &v4, V &v5, V &v6) const { \ - reinterpret_cast *>(this)->deinterleave(reinterpret_cast(v0), reinterpret_cast(v1), \ - reinterpret_cast(v2), reinterpret_cast(v3), reinterpret_cast(v4), \ - reinterpret_cast(v5), reinterpret_cast(v6)); \ -} \ -template<> Vc_ALWAYS_INLINE void InterleavedMemoryAccessBase::deinterleave(V &v0, V &v1, V &v2, V &v3, \ - V &v4, V &v5, V &v6, V &v7) const { \ - reinterpret_cast *>(this)->deinterleave(reinterpret_cast(v0), reinterpret_cast(v1), \ - reinterpret_cast(v2), reinterpret_cast(v3), reinterpret_cast(v4), \ - reinterpret_cast(v5), reinterpret_cast(v6), reinterpret_cast(v7)); \ -} -_forward( int_v, float_v) -_forward(uint_v, float_v) -_forward(ushort_v, short_v) -#undef _forward/*}}}*/ - -} // namespace Common -} // namespace Vc -/*OUTER_NAMESPACE_END*/ - -#include "undomacros.h" - -#endif // VC_AVX_INTERLEAVEDMEMORY_TCC - -// vim: foldmethod=marker diff -Nru vc-0.7.4/avx/intrinsics.h vc-1.3.0/avx/intrinsics.h --- vc-0.7.4/avx/intrinsics.h 2014-05-15 06:38:24.000000000 -0500 +++ vc-1.3.0/avx/intrinsics.h 2016-10-27 02:05:02.000000000 -0500 @@ -1,147 +1,77 @@ -/* This file is part of the Vc library. +/* This file is part of the Vc library. {{{ +Copyright © 2009-2015 Matthias Kretz - Copyright (C) 2009-2012 Matthias Kretz +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the names of contributing organizations nor the + names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. 
+ +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY +DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - Vc is free software: you can redistribute it and/or modify - it under the terms of the GNU Lesser General Public License as - published by the Free Software Foundation, either version 3 of - the License, or (at your option) any later version. +}}}*/ - Vc is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with Vc. If not, see . - -*/ - -#ifndef VC_AVX_INTRINSICS_H -#define VC_AVX_INTRINSICS_H - -#include "../common/windows_fix_intrin.h" +#ifndef VC_AVX_INTRINSICS_H_ +#define VC_AVX_INTRINSICS_H_ #include +#include "../traits/type_traits.h" // see comment in sse/intrinsics.h extern "C" { // AVX #include -#if (defined(VC_IMPL_XOP) || defined(VC_IMPL_FMA4)) && !defined(VC_MSVC) +#if (defined(Vc_IMPL_XOP) || defined(Vc_IMPL_FMA4)) && !defined(Vc_MSVC) #include #endif } #include "../common/fix_clang_emmintrin.h" -#if defined(VC_CLANG) && VC_CLANG < 0x30100 -// _mm_permute_ps is broken: http://llvm.org/bugs/show_bug.cgi?id=12401 -#undef _mm_permute_ps -#define _mm_permute_ps(A, C) __extension__ ({ \ - m128 __A = (A); \ - (m128)__builtin_shufflevector((__v4sf)__A, (__v4sf) _mm_setzero_ps(), \ - (C) & 0x3, ((C) & 0xc) >> 2, \ - ((C) & 0x30) >> 4, ((C) & 0xc0) >> 6); }) -#endif - #include "const_data.h" +#include "../common/types.h" #include "macros.h" #include -#if defined(VC_CLANG) || defined(VC_MSVC) || (defined(VC_GCC) && !defined(__OPTIMIZE__)) -#define VC_REQUIRES_MACRO_FOR_IMMEDIATE_ARGUMENT -#endif - -#if defined(VC_CLANG) && VC_CLANG <= 0x30000 -// _mm_alignr_epi8 doesn't specify its return type, thus breaking overload resolution -#undef _mm_alignr_epi8 -#define _mm_alignr_epi8(a, b, n) ((m128i)__builtin_ia32_palignr128((a), (b), (n))) -#endif - -/*OUTER_NAMESPACE_BEGIN*/ -namespace Vc +namespace Vc_VERSIONED_NAMESPACE { -namespace AVX +namespace AvxIntrinsics { - /* super evil hacking around C++ features: - * consider - * void fun(int); - * namespace X { void fun(int); } - * namespace X { void bar() { fun(0); } } // this will be a call to X::fun(int) - * - * void fun(m256); - * namespace X { void fun(m256); } - * namespace X { void bar() { fun(0); } } // this will be ambiguous because m256 is a - * non-fundamental type in the global namespace, thus - * adding ::fun(m256) to the candidates - * - * To make my own overloads of the intrinsics distinct I have to use a type that is inside the - * Vc::AVX namespace. To reduce porting effort and increase generality I want to use the same - * function names as used in the global namespace. 
The type name may not be the same, though - * because identifiers starting with two underscores are reserved by the standard. Thus using - * those would mean to depend on undefined behavior. - * Sadly a typedef is not enough. - * Public inheritance also does not work, because at least ICC considers the __m??? types to be - * some sort of fundamental types. - * Thus composition is the only solution. - */ -#ifdef VC_UNCONDITIONAL_AVX2_INTRINSICS - template struct Alias - { - typedef T Base; - T _d; - Vc_ALWAYS_INLINE operator T &() { return _d; } - Vc_ALWAYS_INLINE operator const T &() const { return _d; } - Vc_ALWAYS_INLINE Alias() {} - Vc_ALWAYS_INLINE Alias(T x) : _d(x) {} - Vc_ALWAYS_INLINE Alias(const Alias &x) : _d(x._d) {} - Vc_ALWAYS_INLINE Alias &operator=(T x) { _d = x; return *this; } - Vc_ALWAYS_INLINE Alias &operator=(const Alias &x) { _d = x._d; return *this; } - }; - typedef Alias<__m128 > m128 ; - typedef Alias<__m128d> m128d; - typedef Alias<__m128i> m128i; - typedef Alias<__m256 > m256 ; - typedef Alias<__m256d> m256d; - typedef Alias<__m256i> m256i; -#else + using AVX::c_general; + using AVX::_IndexesFromZero32; + using AVX::_IndexesFromZero16; + using AVX::_IndexesFromZero8; + typedef __m128 m128 ; typedef __m128d m128d; typedef __m128i m128i; typedef __m256 m256 ; typedef __m256d m256d; typedef __m256i m256i; -#endif -#if defined(VC_UNCONDITIONAL_AVX2_INTRINSICS) && defined(VC_PASSING_VECTOR_BY_VALUE_IS_BROKEN) - typedef const m128 & param128 ; - typedef const m128d & param128d; - typedef const m128i & param128i; - typedef const m256 & param256 ; - typedef const m256d & param256d; - typedef const m256i & param256i; -#else + typedef const m128 param128 ; typedef const m128d param128d; typedef const m128i param128i; typedef const m256 param256 ; typedef const m256d param256d; typedef const m256i param256i; -#endif - -#ifdef VC_UNCONDITIONAL_AVX2_INTRINSICS - // Make use of cast intrinsics easier. But if param256 == const __m256 then these would lead to - // ambiguities. - static Vc_INTRINSIC m256i Vc_CONST _mm256_castps_si256(param256 a) { return ::_mm256_castps_si256(a); } - static Vc_INTRINSIC m256d Vc_CONST _mm256_castps_pd (param256 a) { return ::_mm256_castps_pd (a); } - static Vc_INTRINSIC m256i Vc_CONST _mm256_castpd_si256(param256d a) { return ::_mm256_castpd_si256(a); } - static Vc_INTRINSIC m256 Vc_CONST _mm256_castpd_ps (param256d a) { return ::_mm256_castpd_ps (a); } - static Vc_INTRINSIC m256 Vc_CONST _mm256_castsi256_ps(param256i a) { return ::_mm256_castsi256_ps(a); } - static Vc_INTRINSIC m256d Vc_CONST _mm256_castsi256_pd(param256i a) { return ::_mm256_castsi256_pd(a); } -#endif -#ifdef VC_GCC +#ifdef Vc_GCC // Redefine the mul/add/sub intrinsics to use GCC-specific operators instead of builtin // functions. This way the fp-contraction optimization step kicks in and creates FMAs! 
:) static Vc_INTRINSIC Vc_CONST m256d _mm256_mul_pd(m256d a, m256d b) { return static_cast(static_cast<__v4df>(a) * static_cast<__v4df>(b)); } @@ -152,460 +82,627 @@ static Vc_INTRINSIC Vc_CONST m256 _mm256_sub_ps(m256 a, m256 b) { return static_cast(static_cast<__v8sf>(a) - static_cast<__v8sf>(b)); } #endif - static Vc_INTRINSIC m256 Vc_CONST _mm256_set1_ps (float a) { return ::_mm256_set1_ps (a); } - static Vc_INTRINSIC m256d Vc_CONST _mm256_set1_pd (double a) { return ::_mm256_set1_pd (a); } - static Vc_INTRINSIC m256i Vc_CONST _mm256_set1_epi32(int a) { return ::_mm256_set1_epi32(a); } + static Vc_INTRINSIC m256 Vc_CONST set1_ps (float a) { return _mm256_set1_ps (a); } + static Vc_INTRINSIC m256d Vc_CONST set1_pd (double a) { return _mm256_set1_pd (a); } + static Vc_INTRINSIC m256i Vc_CONST set1_epi32(int a) { return _mm256_set1_epi32(a); } //static Vc_INTRINSIC m256i Vc_CONST _mm256_set1_epu32(unsigned int a) { return ::_mm256_set1_epu32(a); } -#if defined(VC_GNU_ASM) && !defined(NVALGRIND) - static Vc_INTRINSIC m128i Vc_CONST _mm_setallone() { m128i r; __asm__("pcmpeqb %0,%0":"=x"(r)); return r; } -#else - static Vc_INTRINSIC m128i Vc_CONST _mm_setallone() { m128i r = _mm_setzero_si128(); return _mm_cmpeq_epi8(r, r); } -#endif - static Vc_INTRINSIC m128i Vc_CONST _mm_setallone_si128() { return _mm_setallone(); } - static Vc_INTRINSIC m128d Vc_CONST _mm_setallone_pd() { return _mm_castsi128_pd(_mm_setallone()); } - static Vc_INTRINSIC m128 Vc_CONST _mm_setallone_ps() { return _mm_castsi128_ps(_mm_setallone()); } + static Vc_INTRINSIC Vc_CONST m128i _mm_setallone_si128() { return _mm_load_si128(reinterpret_cast(Common::AllBitsSet)); } + static Vc_INTRINSIC Vc_CONST m128 _mm_setallone_ps() { return _mm_load_ps(reinterpret_cast(Common::AllBitsSet)); } + static Vc_INTRINSIC Vc_CONST m128d _mm_setallone_pd() { return _mm_load_pd(reinterpret_cast(Common::AllBitsSet)); } + + static Vc_INTRINSIC Vc_CONST m256i setallone_si256() { return _mm256_castps_si256(_mm256_load_ps(reinterpret_cast(Common::AllBitsSet))); } + static Vc_INTRINSIC Vc_CONST m256d setallone_pd() { return _mm256_load_pd(reinterpret_cast(Common::AllBitsSet)); } + static Vc_INTRINSIC Vc_CONST m256 setallone_ps() { return _mm256_load_ps(reinterpret_cast(Common::AllBitsSet)); } static Vc_INTRINSIC m128i Vc_CONST _mm_setone_epi8 () { return _mm_set1_epi8(1); } static Vc_INTRINSIC m128i Vc_CONST _mm_setone_epu8 () { return _mm_setone_epi8(); } static Vc_INTRINSIC m128i Vc_CONST _mm_setone_epi16() { return _mm_castps_si128(_mm_broadcast_ss(reinterpret_cast(c_general::one16))); } static Vc_INTRINSIC m128i Vc_CONST _mm_setone_epu16() { return _mm_setone_epi16(); } static Vc_INTRINSIC m128i Vc_CONST _mm_setone_epi32() { return _mm_castps_si128(_mm_broadcast_ss(reinterpret_cast(&_IndexesFromZero32[1]))); } + static Vc_INTRINSIC m128i Vc_CONST _mm_setone_epu32() { return _mm_setone_epi32(); } -#if defined(VC_GNU_ASM) && !defined(NVALGRIND) - static Vc_INTRINSIC m256 Vc_CONST _mm256_setallone() { __m256 r; __asm__("vcmpps $8,%0,%0,%0":"=x"(r)); return r; } -#elif defined(VC_MSVC) - // MSVC puts temporaries of this value on the stack, but sometimes at misaligned addresses, try - // some other generator instead... 
- static Vc_INTRINSIC m256 Vc_CONST _mm256_setallone() { return _mm256_castsi256_ps(_mm256_set1_epi32(-1)); } -#else - static Vc_INTRINSIC m256 Vc_CONST _mm256_setallone() { m256 r = _mm256_setzero_ps(); return _mm256_cmp_ps(r, r, _CMP_EQ_UQ); } -#endif - static Vc_INTRINSIC m256i Vc_CONST _mm256_setallone_si256() { return _mm256_castps_si256(_mm256_setallone()); } - static Vc_INTRINSIC m256d Vc_CONST _mm256_setallone_pd() { return _mm256_castps_pd(_mm256_setallone()); } - static Vc_INTRINSIC m256 Vc_CONST _mm256_setallone_ps() { return _mm256_setallone(); } - - static Vc_INTRINSIC m256i Vc_CONST _mm256_setone_epi8 () { return _mm256_set1_epi8(1); } - static Vc_INTRINSIC m256i Vc_CONST _mm256_setone_epu8 () { return _mm256_setone_epi8(); } - static Vc_INTRINSIC m256i Vc_CONST _mm256_setone_epi16() { return _mm256_castps_si256(_mm256_broadcast_ss(reinterpret_cast(c_general::one16))); } - static Vc_INTRINSIC m256i Vc_CONST _mm256_setone_epu16() { return _mm256_setone_epi16(); } - static Vc_INTRINSIC m256i Vc_CONST _mm256_setone_epi32() { return _mm256_castps_si256(_mm256_broadcast_ss(reinterpret_cast(&_IndexesFromZero32[1]))); } - static Vc_INTRINSIC m256i Vc_CONST _mm256_setone_epu32() { return _mm256_setone_epi32(); } - - static Vc_INTRINSIC m256 Vc_CONST _mm256_setone_ps() { return _mm256_broadcast_ss(&c_general::oneFloat); } - static Vc_INTRINSIC m256d Vc_CONST _mm256_setone_pd() { return _mm256_broadcast_sd(&c_general::oneDouble); } - - static Vc_INTRINSIC m256d Vc_CONST _mm256_setabsmask_pd() { return _mm256_broadcast_sd(reinterpret_cast(&c_general::absMaskFloat[0])); } - static Vc_INTRINSIC m256 Vc_CONST _mm256_setabsmask_ps() { return _mm256_broadcast_ss(reinterpret_cast(&c_general::absMaskFloat[1])); } - static Vc_INTRINSIC m256d Vc_CONST _mm256_setsignmask_pd(){ return _mm256_broadcast_sd(reinterpret_cast(&c_general::signMaskFloat[0])); } - static Vc_INTRINSIC m256 Vc_CONST _mm256_setsignmask_ps(){ return _mm256_broadcast_ss(reinterpret_cast(&c_general::signMaskFloat[1])); } + static Vc_INTRINSIC m256i Vc_CONST setone_epi8 () { return _mm256_set1_epi8(1); } + static Vc_INTRINSIC m256i Vc_CONST setone_epu8 () { return setone_epi8(); } + static Vc_INTRINSIC m256i Vc_CONST setone_epi16() { return _mm256_castps_si256(_mm256_broadcast_ss(reinterpret_cast(c_general::one16))); } + static Vc_INTRINSIC m256i Vc_CONST setone_epu16() { return setone_epi16(); } + static Vc_INTRINSIC m256i Vc_CONST setone_epi32() { return _mm256_castps_si256(_mm256_broadcast_ss(reinterpret_cast(&_IndexesFromZero32[1]))); } + static Vc_INTRINSIC m256i Vc_CONST setone_epu32() { return setone_epi32(); } + + static Vc_INTRINSIC m256 Vc_CONST setone_ps() { return _mm256_broadcast_ss(&c_general::oneFloat); } + static Vc_INTRINSIC m256d Vc_CONST setone_pd() { return _mm256_broadcast_sd(&c_general::oneDouble); } + + static Vc_INTRINSIC m256d Vc_CONST setabsmask_pd() { return _mm256_broadcast_sd(reinterpret_cast(&c_general::absMaskFloat[0])); } + static Vc_INTRINSIC m256 Vc_CONST setabsmask_ps() { return _mm256_broadcast_ss(reinterpret_cast(&c_general::absMaskFloat[1])); } + static Vc_INTRINSIC m256d Vc_CONST setsignmask_pd(){ return _mm256_broadcast_sd(reinterpret_cast(&c_general::signMaskFloat[0])); } + static Vc_INTRINSIC m256 Vc_CONST setsignmask_ps(){ return _mm256_broadcast_ss(reinterpret_cast(&c_general::signMaskFloat[1])); } + + static Vc_INTRINSIC m256 Vc_CONST set2power31_ps() { return _mm256_broadcast_ss(&c_general::_2power31); } + static Vc_INTRINSIC m128 Vc_CONST _mm_set2power31_ps() { return 
_mm_broadcast_ss(&c_general::_2power31); } + static Vc_INTRINSIC m256i Vc_CONST set2power31_epu32() { return _mm256_castps_si256(_mm256_broadcast_ss(reinterpret_cast(&c_general::signMaskFloat[1]))); } + static Vc_INTRINSIC m128i Vc_CONST _mm_set2power31_epu32() { return _mm_castps_si128(_mm_broadcast_ss(reinterpret_cast(&c_general::signMaskFloat[1]))); } - static Vc_INTRINSIC m256 Vc_CONST _mm256_set2power31_ps() { return _mm256_broadcast_ss(&c_general::_2power31); } - static Vc_INTRINSIC m256i Vc_CONST _mm256_set2power31_epu32() { return _mm256_castps_si256(_mm256_broadcast_ss(reinterpret_cast(&c_general::signMaskFloat[1]))); } - - //X static Vc_INTRINSIC m256i Vc_CONST _mm256_setmin_epi8 () { return _mm256_slli_epi8 (_mm256_setallone_si256(), 7); } + static Vc_INTRINSIC m256i Vc_CONST setmin_epi8 () { return _mm256_set1_epi8(-0x80); } static Vc_INTRINSIC m128i Vc_CONST _mm_setmin_epi16() { return _mm_castps_si128(_mm_broadcast_ss(reinterpret_cast(c_general::minShort))); } static Vc_INTRINSIC m128i Vc_CONST _mm_setmin_epi32() { return _mm_castps_si128(_mm_broadcast_ss(reinterpret_cast(&c_general::signMaskFloat[1]))); } - static Vc_INTRINSIC m256i Vc_CONST _mm256_setmin_epi16() { return _mm256_castps_si256(_mm256_broadcast_ss(reinterpret_cast(c_general::minShort))); } - static Vc_INTRINSIC m256i Vc_CONST _mm256_setmin_epi32() { return _mm256_castps_si256(_mm256_broadcast_ss(reinterpret_cast(&c_general::signMaskFloat[1]))); } + static Vc_INTRINSIC m256i Vc_CONST setmin_epi16() { return _mm256_castps_si256(_mm256_broadcast_ss(reinterpret_cast(c_general::minShort))); } + static Vc_INTRINSIC m256i Vc_CONST setmin_epi32() { return _mm256_castps_si256(_mm256_broadcast_ss(reinterpret_cast(&c_general::signMaskFloat[1]))); } -#ifdef VC_REQUIRES_MACRO_FOR_IMMEDIATE_ARGUMENT -#define _mm_extract_epu8 (x, i) (static_cast (_mm_extract_epi8 ((x), (i)))) -#define _mm_extract_epu16(x, i) (static_cast(_mm_extract_epi16((x), (i)))) -#define _mm_extract_epu32(x, i) (static_cast (_mm_extract_epi32((x), (i)))) + template + static Vc_INTRINSIC Vc_CONST unsigned char extract_epu8(__m128i x) + { + return _mm_extract_epi8(x, i); + } + template + static Vc_INTRINSIC Vc_CONST unsigned short extract_epu16(__m128i x) + { + return _mm_extract_epi16(x, i); + } + template + static Vc_INTRINSIC Vc_CONST unsigned int extract_epu32(__m128i x) + { + return _mm_extract_epi32(x, i); + } + + template Vc_INTRINSIC __m256 insert128(__m256 a, __m128 b) { return _mm256_insertf128_ps(a, b, offset); } + template Vc_INTRINSIC __m256d insert128(__m256d a, __m128d b) { return _mm256_insertf128_pd(a, b, offset); } + template Vc_INTRINSIC __m256i insert128(__m256i a, __m128i b) { +#ifdef Vc_IMPL_AVX2 + return _mm256_inserti128_si256(a, b, offset); #else - static Vc_INTRINSIC unsigned char Vc_CONST _mm_extract_epu8(param128i x, const int i) { return _mm_extract_epi8(x, i); } - static Vc_INTRINSIC unsigned short Vc_CONST _mm_extract_epu16(param128i x, const int i) { return _mm_extract_epi16(x, i); } - static Vc_INTRINSIC unsigned int Vc_CONST _mm_extract_epu32(param128i x, const int i) { return _mm_extract_epi32(x, i); } + return _mm256_insertf128_si256(a, b, offset); #endif + } - /////////////////////// COMPARE OPS /////////////////////// - static Vc_INTRINSIC m256d Vc_CONST _mm256_cmpeq_pd (param256d a, param256d b) { return _mm256_cmp_pd(a, b, _CMP_EQ_OQ); } - static Vc_INTRINSIC m256d Vc_CONST _mm256_cmpneq_pd (param256d a, param256d b) { return _mm256_cmp_pd(a, b, _CMP_NEQ_UQ); } - static Vc_INTRINSIC m256d Vc_CONST _mm256_cmplt_pd 
(param256d a, param256d b) { return _mm256_cmp_pd(a, b, _CMP_LT_OS); } - static Vc_INTRINSIC m256d Vc_CONST _mm256_cmpnlt_pd (param256d a, param256d b) { return _mm256_cmp_pd(a, b, _CMP_NLT_US); } - static Vc_INTRINSIC m256d Vc_CONST _mm256_cmple_pd (param256d a, param256d b) { return _mm256_cmp_pd(a, b, _CMP_LE_OS); } - static Vc_INTRINSIC m256d Vc_CONST _mm256_cmpnle_pd (param256d a, param256d b) { return _mm256_cmp_pd(a, b, _CMP_NLE_US); } - static Vc_INTRINSIC m256d Vc_CONST _mm256_cmpord_pd (param256d a, param256d b) { return _mm256_cmp_pd(a, b, _CMP_ORD_Q); } - static Vc_INTRINSIC m256d Vc_CONST _mm256_cmpunord_pd(param256d a, param256d b) { return _mm256_cmp_pd(a, b, _CMP_UNORD_Q); } - - static Vc_INTRINSIC m256 Vc_CONST _mm256_cmpeq_ps (param256 a, param256 b) { return _mm256_cmp_ps(a, b, _CMP_EQ_OQ); } - static Vc_INTRINSIC m256 Vc_CONST _mm256_cmpneq_ps (param256 a, param256 b) { return _mm256_cmp_ps(a, b, _CMP_NEQ_UQ); } - static Vc_INTRINSIC m256 Vc_CONST _mm256_cmplt_ps (param256 a, param256 b) { return _mm256_cmp_ps(a, b, _CMP_LT_OS); } - static Vc_INTRINSIC m256 Vc_CONST _mm256_cmpnlt_ps (param256 a, param256 b) { return _mm256_cmp_ps(a, b, _CMP_NLT_US); } - static Vc_INTRINSIC m256 Vc_CONST _mm256_cmpge_ps (param256 a, param256 b) { return _mm256_cmp_ps(a, b, _CMP_NLT_US); } - static Vc_INTRINSIC m256 Vc_CONST _mm256_cmple_ps (param256 a, param256 b) { return _mm256_cmp_ps(a, b, _CMP_LE_OS); } - static Vc_INTRINSIC m256 Vc_CONST _mm256_cmpnle_ps (param256 a, param256 b) { return _mm256_cmp_ps(a, b, _CMP_NLE_US); } - static Vc_INTRINSIC m256 Vc_CONST _mm256_cmpgt_ps (param256 a, param256 b) { return _mm256_cmp_ps(a, b, _CMP_NLE_US); } - static Vc_INTRINSIC m256 Vc_CONST _mm256_cmpord_ps (param256 a, param256 b) { return _mm256_cmp_ps(a, b, _CMP_ORD_Q); } - static Vc_INTRINSIC m256 Vc_CONST _mm256_cmpunord_ps(param256 a, param256 b) { return _mm256_cmp_ps(a, b, _CMP_UNORD_Q); } + template Vc_INTRINSIC __m128 extract128(__m256 a) { return _mm256_extractf128_ps(a, offset); } + template Vc_INTRINSIC __m128d extract128(__m256d a) { return _mm256_extractf128_pd(a, offset); } + template Vc_INTRINSIC __m128i extract128(__m256i a) { +#ifdef Vc_IMPL_AVX2 + return _mm256_extracti128_si256(a, offset); +#else + return _mm256_extractf128_si256(a, offset); +#endif + } - static Vc_INTRINSIC m128i _mm_cmplt_epu16(param128i a, param128i b) { + /////////////////////// COMPARE OPS /////////////////////// + static Vc_INTRINSIC m256d Vc_CONST cmpeq_pd (__m256d a, __m256d b) { return _mm256_cmp_pd(a, b, _CMP_EQ_OQ); } + static Vc_INTRINSIC m256d Vc_CONST cmpneq_pd (__m256d a, __m256d b) { return _mm256_cmp_pd(a, b, _CMP_NEQ_UQ); } + static Vc_INTRINSIC m256d Vc_CONST cmplt_pd (__m256d a, __m256d b) { return _mm256_cmp_pd(a, b, _CMP_LT_OS); } + static Vc_INTRINSIC m256d Vc_CONST cmpnlt_pd (__m256d a, __m256d b) { return _mm256_cmp_pd(a, b, _CMP_NLT_US); } + static Vc_INTRINSIC m256d Vc_CONST cmpge_pd (__m256d a, __m256d b) { return _mm256_cmp_pd(a, b, _CMP_NLT_US); } + static Vc_INTRINSIC m256d Vc_CONST cmple_pd (__m256d a, __m256d b) { return _mm256_cmp_pd(a, b, _CMP_LE_OS); } + static Vc_INTRINSIC m256d Vc_CONST cmpnle_pd (__m256d a, __m256d b) { return _mm256_cmp_pd(a, b, _CMP_NLE_US); } + static Vc_INTRINSIC m256d Vc_CONST cmpgt_pd (__m256d a, __m256d b) { return _mm256_cmp_pd(a, b, _CMP_NLE_US); } + static Vc_INTRINSIC m256d Vc_CONST cmpord_pd (__m256d a, __m256d b) { return _mm256_cmp_pd(a, b, _CMP_ORD_Q); } + static Vc_INTRINSIC m256d Vc_CONST cmpunord_pd(__m256d a, __m256d b) { return 
_mm256_cmp_pd(a, b, _CMP_UNORD_Q); } + + static Vc_INTRINSIC m256 Vc_CONST cmpeq_ps (__m256 a, __m256 b) { return _mm256_cmp_ps(a, b, _CMP_EQ_OQ); } + static Vc_INTRINSIC m256 Vc_CONST cmpneq_ps (__m256 a, __m256 b) { return _mm256_cmp_ps(a, b, _CMP_NEQ_UQ); } + static Vc_INTRINSIC m256 Vc_CONST cmplt_ps (__m256 a, __m256 b) { return _mm256_cmp_ps(a, b, _CMP_LT_OS); } + static Vc_INTRINSIC m256 Vc_CONST cmpnlt_ps (__m256 a, __m256 b) { return _mm256_cmp_ps(a, b, _CMP_NLT_US); } + static Vc_INTRINSIC m256 Vc_CONST cmpge_ps (__m256 a, __m256 b) { return _mm256_cmp_ps(a, b, _CMP_NLT_US); } + static Vc_INTRINSIC m256 Vc_CONST cmple_ps (__m256 a, __m256 b) { return _mm256_cmp_ps(a, b, _CMP_LE_OS); } + static Vc_INTRINSIC m256 Vc_CONST cmpnle_ps (__m256 a, __m256 b) { return _mm256_cmp_ps(a, b, _CMP_NLE_US); } + static Vc_INTRINSIC m256 Vc_CONST cmpgt_ps (__m256 a, __m256 b) { return _mm256_cmp_ps(a, b, _CMP_NLE_US); } + static Vc_INTRINSIC m256 Vc_CONST cmpord_ps (__m256 a, __m256 b) { return _mm256_cmp_ps(a, b, _CMP_ORD_Q); } + static Vc_INTRINSIC m256 Vc_CONST cmpunord_ps(__m256 a, __m256 b) { return _mm256_cmp_ps(a, b, _CMP_UNORD_Q); } + +#if defined(Vc_IMPL_XOP) + static Vc_INTRINSIC m128i cmplt_epu16(__m128i a, __m128i b) { + return _mm_comlt_epu16(a, b); + } + static Vc_INTRINSIC m128i cmpgt_epu16(__m128i a, __m128i b) { + return _mm_comgt_epu16(a, b); + } +#else + static Vc_INTRINSIC m128i cmplt_epu16(__m128i a, __m128i b) { return _mm_cmplt_epi16(_mm_xor_si128(a, _mm_setmin_epi16()), _mm_xor_si128(b, _mm_setmin_epi16())); } - static Vc_INTRINSIC m128i _mm_cmpgt_epu16(param128i a, param128i b) { + static Vc_INTRINSIC m128i cmpgt_epu16(__m128i a, __m128i b) { return _mm_cmpgt_epi16(_mm_xor_si128(a, _mm_setmin_epi16()), _mm_xor_si128(b, _mm_setmin_epi16())); } - - /////////////////////// INTEGER OPS /////////////////////// -#define AVX_TO_SSE_2(name) \ - static Vc_INTRINSIC m256i Vc_CONST _mm256_##name(param256i a0, param256i b0) { \ - m128i a1 = _mm256_extractf128_si256(a0, 1); \ - m128i b1 = _mm256_extractf128_si256(b0, 1); \ - m128i r0 = _mm_##name(_mm256_castsi256_si128(a0), _mm256_castsi256_si128(b0)); \ - m128i r1 = _mm_##name(a1, b1); \ - return _mm256_insertf128_si256(_mm256_castsi128_si256(r0), r1, 1); \ - } -#define AVX_TO_SSE_2_si128_si256(name) \ - static Vc_INTRINSIC m256i Vc_CONST _mm256_##name##_si256(param256i a0, param256i b0) { \ - m128i a1 = _mm256_extractf128_si256(a0, 1); \ - m128i b1 = _mm256_extractf128_si256(b0, 1); \ - m128i r0 = _mm_##name##_si128(_mm256_castsi256_si128(a0), _mm256_castsi256_si128(b0)); \ - m128i r1 = _mm_##name##_si128(a1, b1); \ - return _mm256_insertf128_si256(_mm256_castsi128_si256(r0), r1, 1); \ - } -#define AVX_TO_SSE_1(name) \ - static Vc_INTRINSIC m256i Vc_CONST _mm256_##name(param256i a0) { \ - m128i a1 = _mm256_extractf128_si256(a0, 1); \ - m128i r0 = _mm_##name(_mm256_castsi256_si128(a0)); \ - m128i r1 = _mm_##name(a1); \ - return _mm256_insertf128_si256(_mm256_castsi128_si256(r0), r1, 1); \ - } -#define AVX_TO_SSE_1i(name) \ - static Vc_INTRINSIC m256i Vc_CONST _mm256_##name(param256i a0, const int i) { \ - m128i a1 = _mm256_extractf128_si256(a0, 1); \ - m128i r0 = _mm_##name(_mm256_castsi256_si128(a0), i); \ - m128i r1 = _mm_##name(a1, i); \ - return _mm256_insertf128_si256(_mm256_castsi128_si256(r0), r1, 1); \ - } - - AVX_TO_SSE_2(cmplt_epi8) - AVX_TO_SSE_2(cmplt_epi16) - AVX_TO_SSE_2(cmplt_epi32) - AVX_TO_SSE_2(cmpeq_epi8) - AVX_TO_SSE_2(cmpeq_epi16) - AVX_TO_SSE_2(cmpeq_epi32) - AVX_TO_SSE_2(cmpgt_epi8) - AVX_TO_SSE_2(cmpgt_epi16) 
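[Editorial sketch, not part of the diff.] The cmplt_epu16/cmpgt_epu16 fallbacks added above rely on the usual sign-bit-flip trick: SSE2 only provides signed 16-bit compares, but XORing both operands with the minimum signed value (0x8000, which _mm_setmin_epi16() broadcasts) maps unsigned order onto signed order. A minimal standalone sketch of the same idea, outside the Vc namespaces, with a plain _mm_set1_epi16 standing in for _mm_setmin_epi16() and cmplt_epu16_sketch as an illustrative name:

// Standalone illustration: unsigned 16-bit "less than" built from the signed
// SSE2 compare by flipping the sign bit of both operands first.
#include <emmintrin.h>
#include <cstdint>
#include <cstdio>

static inline __m128i cmplt_epu16_sketch(__m128i a, __m128i b)
{
    // Flip the sign bit of every 16-bit lane; unsigned "<" then equals signed "<".
    const __m128i bias = _mm_set1_epi16(int16_t(0x8000)); // stand-in for Vc's _mm_setmin_epi16()
    return _mm_cmplt_epi16(_mm_xor_si128(a, bias), _mm_xor_si128(b, bias));
}

int main()
{
    // 65535 compares as -1 in a signed compare, so a naive signed "<" would claim 65535 < 1.
    const __m128i a = _mm_set1_epi16(int16_t(0xFFFF));
    const __m128i b = _mm_set1_epi16(1);
    uint16_t lt[8];
    _mm_storeu_si128(reinterpret_cast<__m128i *>(lt), cmplt_epu16_sketch(a, b));
    std::printf("65535 < 1 (unsigned): %s\n", lt[0] ? "true" : "false"); // prints "false"
    return 0;
}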
- AVX_TO_SSE_2(cmpgt_epi32) - - // This code is AVX only (without AVX2). We never asked for AVX2 intrinsics. So go away... :) -#if defined _mm256_srli_si256 -#undef _mm256_srli_si256 -#endif -#if defined _mm256_slli_si256 -#undef _mm256_slli_si256 #endif -#if defined _mm256_blend_epi16 -#undef _mm256_blend_epi16 + +#ifdef Vc_IMPL_AVX2 + template Vc_INTRINSIC Vc_CONST m256i alignr(__m256i s1, __m256i s2) + { + return _mm256_alignr_epi8(s1, s2, shift); + } +#else + template Vc_INTRINSIC Vc_CONST m256i alignr(__m256i s1, __m256i s2) + { + return insert128<1>( + _mm256_castsi128_si256(_mm_alignr_epi8(_mm256_castsi256_si128(s1), + _mm256_castsi256_si128(s2), shift)), + _mm_alignr_epi8(extract128<1>(s1), extract128<1>(s2), shift)); + } #endif - static Vc_INTRINSIC m256i Vc_CONST _mm256_srli_si256(param256i a0, const int i) { - const m128i vLo = _mm256_castsi256_si128(a0); - const m128i vHi = _mm256_extractf128_si256(a0, 1); - switch (i) { - case 0: return a0; - case 1: return _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_alignr_epi8(vHi, vLo, 1)), _mm_srli_si128(vHi, 1), 1); - case 2: return _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_alignr_epi8(vHi, vLo, 2)), _mm_srli_si128(vHi, 2), 1); - case 3: return _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_alignr_epi8(vHi, vLo, 3)), _mm_srli_si128(vHi, 3), 1); - case 4: return _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_alignr_epi8(vHi, vLo, 4)), _mm_srli_si128(vHi, 4), 1); - case 5: return _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_alignr_epi8(vHi, vLo, 5)), _mm_srli_si128(vHi, 5), 1); - case 6: return _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_alignr_epi8(vHi, vLo, 6)), _mm_srli_si128(vHi, 6), 1); - case 7: return _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_alignr_epi8(vHi, vLo, 7)), _mm_srli_si128(vHi, 7), 1); - case 8: return _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_alignr_epi8(vHi, vLo, 8)), _mm_srli_si128(vHi, 8), 1); - case 9: return _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_alignr_epi8(vHi, vLo, 9)), _mm_srli_si128(vHi, 9), 1); - case 10: return _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_alignr_epi8(vHi, vLo, 10)), _mm_srli_si128(vHi, 10), 1); - case 11: return _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_alignr_epi8(vHi, vLo, 11)), _mm_srli_si128(vHi, 11), 1); - case 12: return _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_alignr_epi8(vHi, vLo, 12)), _mm_srli_si128(vHi, 12), 1); - case 13: return _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_alignr_epi8(vHi, vLo, 13)), _mm_srli_si128(vHi, 13), 1); - case 14: return _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_alignr_epi8(vHi, vLo, 14)), _mm_srli_si128(vHi, 14), 1); - case 15: return _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_alignr_epi8(vHi, vLo, 15)), _mm_srli_si128(vHi, 15), 1); - case 16: return _mm256_permute2f128_si256(a0, a0, 0x81); - case 17: return _mm256_permute2f128_si256(_mm256_castsi128_si256(_mm_srli_si128(vHi, 1)), _mm256_castsi128_si256(_mm_srli_si128(vHi, 1)), 0x80); - case 18: return _mm256_permute2f128_si256(_mm256_castsi128_si256(_mm_srli_si128(vHi, 2)), _mm256_castsi128_si256(_mm_srli_si128(vHi, 2)), 0x80); - case 19: return _mm256_permute2f128_si256(_mm256_castsi128_si256(_mm_srli_si128(vHi, 3)), _mm256_castsi128_si256(_mm_srli_si128(vHi, 3)), 0x80); - case 20: return _mm256_permute2f128_si256(_mm256_castsi128_si256(_mm_srli_si128(vHi, 4)), _mm256_castsi128_si256(_mm_srli_si128(vHi, 4)), 0x80); - case 21: return 
_mm256_permute2f128_si256(_mm256_castsi128_si256(_mm_srli_si128(vHi, 5)), _mm256_castsi128_si256(_mm_srli_si128(vHi, 5)), 0x80); - case 22: return _mm256_permute2f128_si256(_mm256_castsi128_si256(_mm_srli_si128(vHi, 6)), _mm256_castsi128_si256(_mm_srli_si128(vHi, 6)), 0x80); - case 23: return _mm256_permute2f128_si256(_mm256_castsi128_si256(_mm_srli_si128(vHi, 7)), _mm256_castsi128_si256(_mm_srli_si128(vHi, 7)), 0x80); - case 24: return _mm256_permute2f128_si256(_mm256_castsi128_si256(_mm_srli_si128(vHi, 8)), _mm256_castsi128_si256(_mm_srli_si128(vHi, 8)), 0x80); - case 25: return _mm256_permute2f128_si256(_mm256_castsi128_si256(_mm_srli_si128(vHi, 9)), _mm256_castsi128_si256(_mm_srli_si128(vHi, 9)), 0x80); - case 26: return _mm256_permute2f128_si256(_mm256_castsi128_si256(_mm_srli_si128(vHi, 10)), _mm256_castsi128_si256(_mm_srli_si128(vHi, 10)), 0x80); - case 27: return _mm256_permute2f128_si256(_mm256_castsi128_si256(_mm_srli_si128(vHi, 11)), _mm256_castsi128_si256(_mm_srli_si128(vHi, 11)), 0x80); - case 28: return _mm256_permute2f128_si256(_mm256_castsi128_si256(_mm_srli_si128(vHi, 12)), _mm256_castsi128_si256(_mm_srli_si128(vHi, 12)), 0x80); - case 29: return _mm256_permute2f128_si256(_mm256_castsi128_si256(_mm_srli_si128(vHi, 13)), _mm256_castsi128_si256(_mm_srli_si128(vHi, 13)), 0x80); - case 30: return _mm256_permute2f128_si256(_mm256_castsi128_si256(_mm_srli_si128(vHi, 14)), _mm256_castsi128_si256(_mm_srli_si128(vHi, 14)), 0x80); - case 31: return _mm256_permute2f128_si256(_mm256_castsi128_si256(_mm_srli_si128(vHi, 15)), _mm256_castsi128_si256(_mm_srli_si128(vHi, 15)), 0x80); - } - return _mm256_setzero_si256(); - } - static Vc_INTRINSIC m256i Vc_CONST _mm256_slli_si256(param256i a0, const int i) { - const m128i vLo = _mm256_castsi256_si128(a0); - const m128i vHi = _mm256_extractf128_si256(a0, 1); - switch (i) { - case 0: return a0; - case 1: return _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_slli_si128(vLo, 1)), _mm_alignr_epi8(vHi, vLo, 15), 1); - case 2: return _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_slli_si128(vLo, 2)), _mm_alignr_epi8(vHi, vLo, 14), 1); - case 3: return _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_slli_si128(vLo, 3)), _mm_alignr_epi8(vHi, vLo, 13), 1); - case 4: return _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_slli_si128(vLo, 4)), _mm_alignr_epi8(vHi, vLo, 12), 1); - case 5: return _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_slli_si128(vLo, 5)), _mm_alignr_epi8(vHi, vLo, 11), 1); - case 6: return _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_slli_si128(vLo, 6)), _mm_alignr_epi8(vHi, vLo, 10), 1); - case 7: return _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_slli_si128(vLo, 7)), _mm_alignr_epi8(vHi, vLo, 9), 1); - case 8: return _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_slli_si128(vLo, 8)), _mm_alignr_epi8(vHi, vLo, 8), 1); - case 9: return _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_slli_si128(vLo, 9)), _mm_alignr_epi8(vHi, vLo, 7), 1); - case 10: return _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_slli_si128(vLo, 10)), _mm_alignr_epi8(vHi, vLo, 6), 1); - case 11: return _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_slli_si128(vLo, 11)), _mm_alignr_epi8(vHi, vLo, 5), 1); - case 12: return _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_slli_si128(vLo, 12)), _mm_alignr_epi8(vHi, vLo, 4), 1); - case 13: return _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_slli_si128(vLo, 13)), _mm_alignr_epi8(vHi, vLo, 3), 1); - case 14: return 
_mm256_insertf128_si256(_mm256_castsi128_si256(_mm_slli_si128(vLo, 14)), _mm_alignr_epi8(vHi, vLo, 2), 1); - case 15: return _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_slli_si128(vLo, 15)), _mm_alignr_epi8(vHi, vLo, 1), 1); - case 16: return _mm256_permute2f128_si256(a0, a0, 0x8); - case 17: return _mm256_permute2f128_si256(_mm256_castsi128_si256(_mm_slli_si128(vLo, 1)), _mm256_castsi128_si256(_mm_slli_si128(vLo, 1)), 0x8); - case 18: return _mm256_permute2f128_si256(_mm256_castsi128_si256(_mm_slli_si128(vLo, 2)), _mm256_castsi128_si256(_mm_slli_si128(vLo, 2)), 0x8); - case 19: return _mm256_permute2f128_si256(_mm256_castsi128_si256(_mm_slli_si128(vLo, 3)), _mm256_castsi128_si256(_mm_slli_si128(vLo, 3)), 0x8); - case 20: return _mm256_permute2f128_si256(_mm256_castsi128_si256(_mm_slli_si128(vLo, 4)), _mm256_castsi128_si256(_mm_slli_si128(vLo, 4)), 0x8); - case 21: return _mm256_permute2f128_si256(_mm256_castsi128_si256(_mm_slli_si128(vLo, 5)), _mm256_castsi128_si256(_mm_slli_si128(vLo, 5)), 0x8); - case 22: return _mm256_permute2f128_si256(_mm256_castsi128_si256(_mm_slli_si128(vLo, 6)), _mm256_castsi128_si256(_mm_slli_si128(vLo, 6)), 0x8); - case 23: return _mm256_permute2f128_si256(_mm256_castsi128_si256(_mm_slli_si128(vLo, 7)), _mm256_castsi128_si256(_mm_slli_si128(vLo, 7)), 0x8); - case 24: return _mm256_permute2f128_si256(_mm256_castsi128_si256(_mm_slli_si128(vLo, 8)), _mm256_castsi128_si256(_mm_slli_si128(vLo, 8)), 0x8); - case 25: return _mm256_permute2f128_si256(_mm256_castsi128_si256(_mm_slli_si128(vLo, 9)), _mm256_castsi128_si256(_mm_slli_si128(vLo, 9)), 0x8); - case 26: return _mm256_permute2f128_si256(_mm256_castsi128_si256(_mm_slli_si128(vLo, 10)), _mm256_castsi128_si256(_mm_slli_si128(vLo, 10)), 0x8); - case 27: return _mm256_permute2f128_si256(_mm256_castsi128_si256(_mm_slli_si128(vLo, 11)), _mm256_castsi128_si256(_mm_slli_si128(vLo, 11)), 0x8); - case 28: return _mm256_permute2f128_si256(_mm256_castsi128_si256(_mm_slli_si128(vLo, 12)), _mm256_castsi128_si256(_mm_slli_si128(vLo, 12)), 0x8); - case 29: return _mm256_permute2f128_si256(_mm256_castsi128_si256(_mm_slli_si128(vLo, 13)), _mm256_castsi128_si256(_mm_slli_si128(vLo, 13)), 0x8); - case 30: return _mm256_permute2f128_si256(_mm256_castsi128_si256(_mm_slli_si128(vLo, 14)), _mm256_castsi128_si256(_mm_slli_si128(vLo, 14)), 0x8); - case 31: return _mm256_permute2f128_si256(_mm256_castsi128_si256(_mm_slli_si128(vLo, 15)), _mm256_castsi128_si256(_mm_slli_si128(vLo, 15)), 0x8); - } - return _mm256_setzero_si256(); + +#ifdef Vc_IMPL_AVX2 +#define Vc_AVX_TO_SSE_2_NEW(name) \ + Vc_INTRINSIC Vc_CONST m256i name(__m256i a0, __m256i b0) \ + { \ + return _mm256_##name(a0, b0); \ + } +#define Vc_AVX_TO_SSE_256_128(name) \ + Vc_INTRINSIC Vc_CONST m256i name(__m256i a0, __m128i b0) \ + { \ + return _mm256_##name(a0, b0); \ + } +#define Vc_AVX_TO_SSE_1i(name) \ + template Vc_INTRINSIC Vc_CONST m256i name(__m256i a0) \ + { \ + return _mm256_##name(a0, i); \ + } +#define Vc_AVX_TO_SSE_1(name) \ + Vc_INTRINSIC Vc_CONST __m256i name(__m256i a0) { return _mm256_##name(a0); } +#define Vc_AVX_TO_SSE_1_128(name, shift__) \ + Vc_INTRINSIC Vc_CONST __m256i name(__m128i a0) { return _mm256_##name(a0); } +#else +/**\internal + * Defines the function \p name, which takes to __m256i arguments and calls `_mm_##name` on the low + * and high 128 bit halfs of the arguments. + * + * In case the AVX2 intrinsics are enabled, the arguments are directly passed to a single + * `_mm256_##name` call. 
+ */ +#define Vc_AVX_TO_SSE_1(name) \ + Vc_INTRINSIC Vc_CONST __m256i name(__m256i a0) \ + { \ + __m128i a1 = extract128<1>(a0); \ + __m128i r0 = _mm_##name(_mm256_castsi256_si128(a0)); \ + __m128i r1 = _mm_##name(a1); \ + return insert128<1>(_mm256_castsi128_si256(r0), r1); \ + } +#define Vc_AVX_TO_SSE_1_128(name, shift__) \ + Vc_INTRINSIC Vc_CONST __m256i name(__m128i a0) \ + { \ + __m128i r0 = _mm_##name(a0); \ + __m128i r1 = _mm_##name(_mm_srli_si128(a0, shift__)); \ + return insert128<1>(_mm256_castsi128_si256(r0), r1); \ + } +#define Vc_AVX_TO_SSE_2_NEW(name) \ + Vc_INTRINSIC Vc_CONST m256i name(__m256i a0, __m256i b0) \ + { \ + m128i a1 = extract128<1>(a0); \ + m128i b1 = extract128<1>(b0); \ + m128i r0 = _mm_##name(_mm256_castsi256_si128(a0), _mm256_castsi256_si128(b0)); \ + m128i r1 = _mm_##name(a1, b1); \ + return insert128<1>(_mm256_castsi128_si256(r0), r1); \ + } +#define Vc_AVX_TO_SSE_256_128(name) \ + Vc_INTRINSIC Vc_CONST m256i name(__m256i a0, __m128i b0) \ + { \ + m128i a1 = extract128<1>(a0); \ + m128i r0 = _mm_##name(_mm256_castsi256_si128(a0), b0); \ + m128i r1 = _mm_##name(a1, b0); \ + return insert128<1>(_mm256_castsi128_si256(r0), r1); \ + } +#define Vc_AVX_TO_SSE_1i(name) \ + template Vc_INTRINSIC Vc_CONST m256i name(__m256i a0) \ + { \ + m128i a1 = extract128<1>(a0); \ + m128i r0 = _mm_##name(_mm256_castsi256_si128(a0), i); \ + m128i r1 = _mm_##name(a1, i); \ + return insert128<1>(_mm256_castsi128_si256(r0), r1); \ + } +#endif + Vc_INTRINSIC Vc_CONST __m128i sll_epi16(__m128i a, __m128i b) { return _mm_sll_epi16(a, b); } + Vc_INTRINSIC Vc_CONST __m128i sll_epi32(__m128i a, __m128i b) { return _mm_sll_epi32(a, b); } + Vc_INTRINSIC Vc_CONST __m128i sll_epi64(__m128i a, __m128i b) { return _mm_sll_epi64(a, b); } + Vc_INTRINSIC Vc_CONST __m128i srl_epi16(__m128i a, __m128i b) { return _mm_srl_epi16(a, b); } + Vc_INTRINSIC Vc_CONST __m128i srl_epi32(__m128i a, __m128i b) { return _mm_srl_epi32(a, b); } + Vc_INTRINSIC Vc_CONST __m128i srl_epi64(__m128i a, __m128i b) { return _mm_srl_epi64(a, b); } + Vc_INTRINSIC Vc_CONST __m128i sra_epi16(__m128i a, __m128i b) { return _mm_sra_epi16(a, b); } + Vc_INTRINSIC Vc_CONST __m128i sra_epi32(__m128i a, __m128i b) { return _mm_sra_epi32(a, b); } + + Vc_AVX_TO_SSE_1i(slli_epi16) + Vc_AVX_TO_SSE_1i(slli_epi32) + Vc_AVX_TO_SSE_1i(slli_epi64) + Vc_AVX_TO_SSE_1i(srai_epi16) + Vc_AVX_TO_SSE_1i(srai_epi32) + Vc_AVX_TO_SSE_1i(srli_epi16) + Vc_AVX_TO_SSE_1i(srli_epi32) + Vc_AVX_TO_SSE_1i(srli_epi64) + + Vc_AVX_TO_SSE_256_128(sll_epi16) + Vc_AVX_TO_SSE_256_128(sll_epi32) + Vc_AVX_TO_SSE_256_128(sll_epi64) + Vc_AVX_TO_SSE_256_128(srl_epi16) + Vc_AVX_TO_SSE_256_128(srl_epi32) + Vc_AVX_TO_SSE_256_128(srl_epi64) + Vc_AVX_TO_SSE_256_128(sra_epi16) + Vc_AVX_TO_SSE_256_128(sra_epi32) + + Vc_AVX_TO_SSE_2_NEW(cmpeq_epi8) + Vc_AVX_TO_SSE_2_NEW(cmpeq_epi16) + Vc_AVX_TO_SSE_2_NEW(cmpeq_epi32) + Vc_AVX_TO_SSE_2_NEW(cmpeq_epi64) + Vc_AVX_TO_SSE_2_NEW(cmpgt_epi8) + Vc_AVX_TO_SSE_2_NEW(cmpgt_epi16) + Vc_AVX_TO_SSE_2_NEW(cmpgt_epi32) + Vc_AVX_TO_SSE_2_NEW(cmpgt_epi64) + Vc_AVX_TO_SSE_2_NEW(packs_epi16) + Vc_AVX_TO_SSE_2_NEW(packs_epi32) + Vc_AVX_TO_SSE_2_NEW(packus_epi16) + Vc_AVX_TO_SSE_2_NEW(unpackhi_epi8) + Vc_AVX_TO_SSE_2_NEW(unpackhi_epi16) + Vc_AVX_TO_SSE_2_NEW(unpackhi_epi32) + Vc_AVX_TO_SSE_2_NEW(unpackhi_epi64) + Vc_AVX_TO_SSE_2_NEW(unpacklo_epi8) + Vc_AVX_TO_SSE_2_NEW(unpacklo_epi16) + Vc_AVX_TO_SSE_2_NEW(unpacklo_epi32) + Vc_AVX_TO_SSE_2_NEW(unpacklo_epi64) + Vc_AVX_TO_SSE_2_NEW(add_epi8) + Vc_AVX_TO_SSE_2_NEW(add_epi16) + 
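[Editorial sketch, not part of the diff.] As the \internal comment above explains, every Vc_AVX_TO_SSE_2_NEW(name) line in this list defines a 256-bit integer helper called name: with AVX2 it forwards to the single _mm256_ intrinsic, without AVX2 it splits both operands into 128-bit halves and applies the _mm_ intrinsic twice. A rough standalone sketch of what Vc_AVX_TO_SSE_2_NEW(add_epi16) boils down to on an AVX-only target, with Vc's extract128/insert128 helpers spelled out as the underlying _mm256_extractf128_si256/_mm256_insertf128_si256 calls (add_epi16_avx_only is an illustrative name, not part of Vc):

// Standalone illustration: 256-bit 16-bit addition emulated with two 128-bit adds,
// the pattern the non-AVX2 branch of Vc_AVX_TO_SSE_2_NEW generates.
#include <immintrin.h>
#include <cstdio>

static inline __m256i add_epi16_avx_only(__m256i a0, __m256i b0)
{
    const __m128i a1 = _mm256_extractf128_si256(a0, 1);           // high 128-bit halves
    const __m128i b1 = _mm256_extractf128_si256(b0, 1);
    const __m128i r0 = _mm_add_epi16(_mm256_castsi256_si128(a0),  // low 128-bit halves
                                     _mm256_castsi256_si128(b0));
    const __m128i r1 = _mm_add_epi16(a1, b1);
    return _mm256_insertf128_si256(_mm256_castsi128_si256(r0), r1, 1);
}

int main()
{
    alignas(32) short out[16];
    const __m256i r = add_epi16_avx_only(_mm256_set1_epi16(40), _mm256_set1_epi16(2));
    _mm256_store_si256(reinterpret_cast<__m256i *>(out), r);
    std::printf("%d\n", out[0]); // 42 in every lane
    return 0;
}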
Vc_AVX_TO_SSE_2_NEW(add_epi32) + Vc_AVX_TO_SSE_2_NEW(add_epi64) + Vc_AVX_TO_SSE_2_NEW(adds_epi8) + Vc_AVX_TO_SSE_2_NEW(adds_epi16) + Vc_AVX_TO_SSE_2_NEW(adds_epu8) + Vc_AVX_TO_SSE_2_NEW(adds_epu16) + Vc_AVX_TO_SSE_2_NEW(sub_epi8) + Vc_AVX_TO_SSE_2_NEW(sub_epi16) + Vc_AVX_TO_SSE_2_NEW(sub_epi32) + Vc_AVX_TO_SSE_2_NEW(sub_epi64) + Vc_AVX_TO_SSE_2_NEW(subs_epi8) + Vc_AVX_TO_SSE_2_NEW(subs_epi16) + Vc_AVX_TO_SSE_2_NEW(subs_epu8) + Vc_AVX_TO_SSE_2_NEW(subs_epu16) + Vc_AVX_TO_SSE_2_NEW(madd_epi16) + Vc_AVX_TO_SSE_2_NEW(mulhi_epi16) + Vc_AVX_TO_SSE_2_NEW(mullo_epi16) + Vc_AVX_TO_SSE_2_NEW(mul_epu32) + Vc_AVX_TO_SSE_2_NEW(max_epi16) + Vc_AVX_TO_SSE_2_NEW(max_epu8) + Vc_AVX_TO_SSE_2_NEW(min_epi16) + Vc_AVX_TO_SSE_2_NEW(min_epu8) + Vc_AVX_TO_SSE_2_NEW(mulhi_epu16) + // shufflehi_epi16 + // shufflelo_epi16 (__m128i __A, const int __mask) + // shuffle_epi32 (__m128i __A, const int __mask) + // maskmoveu_si128 (__m128i __A, __m128i __B, char *__C) + Vc_AVX_TO_SSE_2_NEW(avg_epu8) + Vc_AVX_TO_SSE_2_NEW(avg_epu16) + Vc_AVX_TO_SSE_2_NEW(sad_epu8) + // stream_si32 (int *__A, int __B) + // stream_si128 (__m128i *__A, __m128i __B) + // cvtsi32_si128 (int __A) + // cvtsi64_si128 (long long __A) + // cvtsi64x_si128 (long long __A) + Vc_AVX_TO_SSE_2_NEW(hadd_epi16) + Vc_AVX_TO_SSE_2_NEW(hadd_epi32) + Vc_AVX_TO_SSE_2_NEW(hadds_epi16) + Vc_AVX_TO_SSE_2_NEW(hsub_epi16) + Vc_AVX_TO_SSE_2_NEW(hsub_epi32) + Vc_AVX_TO_SSE_2_NEW(hsubs_epi16) + Vc_AVX_TO_SSE_2_NEW(maddubs_epi16) + Vc_AVX_TO_SSE_2_NEW(mulhrs_epi16) + Vc_AVX_TO_SSE_2_NEW(shuffle_epi8) + Vc_AVX_TO_SSE_2_NEW(sign_epi8) + Vc_AVX_TO_SSE_2_NEW(sign_epi16) + Vc_AVX_TO_SSE_2_NEW(sign_epi32) + Vc_AVX_TO_SSE_2_NEW(min_epi8) + Vc_AVX_TO_SSE_2_NEW(max_epi8) + Vc_AVX_TO_SSE_2_NEW(min_epu16) + Vc_AVX_TO_SSE_2_NEW(max_epu16) + Vc_AVX_TO_SSE_2_NEW(min_epi32) + Vc_AVX_TO_SSE_2_NEW(max_epi32) + Vc_AVX_TO_SSE_2_NEW(min_epu32) + Vc_AVX_TO_SSE_2_NEW(max_epu32) + Vc_AVX_TO_SSE_2_NEW(mullo_epi32) + Vc_AVX_TO_SSE_2_NEW(mul_epi32) + + Vc_AVX_TO_SSE_1(abs_epi8) + Vc_AVX_TO_SSE_1(abs_epi16) + Vc_AVX_TO_SSE_1(abs_epi32) + Vc_AVX_TO_SSE_1_128(cvtepi8_epi16, 8) + Vc_AVX_TO_SSE_1_128(cvtepi8_epi32, 4) + Vc_AVX_TO_SSE_1_128(cvtepi8_epi64, 2) + Vc_AVX_TO_SSE_1_128(cvtepi16_epi32, 8) + Vc_AVX_TO_SSE_1_128(cvtepi16_epi64, 4) + Vc_AVX_TO_SSE_1_128(cvtepi32_epi64, 8) + Vc_AVX_TO_SSE_1_128(cvtepu8_epi16, 8) + Vc_AVX_TO_SSE_1_128(cvtepu8_epi32, 4) + Vc_AVX_TO_SSE_1_128(cvtepu8_epi64, 2) + Vc_AVX_TO_SSE_1_128(cvtepu16_epi32, 8) + Vc_AVX_TO_SSE_1_128(cvtepu16_epi64, 4) + Vc_AVX_TO_SSE_1_128(cvtepu32_epi64, 8) + + Vc_AVX_TO_SSE_2_NEW(packus_epi32) + +#ifndef Vc_IMPL_AVX2 + +///////////////////////////////////////////////////////////////////////// +// implementation of the intrinsics missing in AVX +///////////////////////////////////////////////////////////////////////// + + template Vc_INTRINSIC Vc_CONST __m256i srli_si256(__m256i a0) { + const __m128i vLo = _mm256_castsi256_si128(a0); + const __m128i vHi = extract128<1>(a0); + return insert128<1>(_mm256_castsi128_si256(_mm_srli_si128(vLo, i)), _mm_srli_si128(vHi, i)); + } + template Vc_INTRINSIC Vc_CONST __m256i slli_si256(__m256i a0) { + const __m128i vLo = _mm256_castsi256_si128(a0); + const __m128i vHi = extract128<1>(a0); + return insert128<1>(_mm256_castsi128_si256(_mm_slli_si128(vLo, i)), _mm_slli_si128(vHi, i)); } - static Vc_INTRINSIC m256i Vc_CONST _mm256_and_si256(param256i x, param256i y) { + static Vc_INTRINSIC m256i Vc_CONST and_si256(__m256i x, __m256i y) { return _mm256_castps_si256(_mm256_and_ps(_mm256_castsi256_ps(x), 
_mm256_castsi256_ps(y))); } - static Vc_INTRINSIC m256i Vc_CONST _mm256_andnot_si256(param256i x, param256i y) { + static Vc_INTRINSIC m256i Vc_CONST andnot_si256(__m256i x, __m256i y) { return _mm256_castps_si256(_mm256_andnot_ps(_mm256_castsi256_ps(x), _mm256_castsi256_ps(y))); } - static Vc_INTRINSIC m256i Vc_CONST _mm256_or_si256(param256i x, param256i y) { + static Vc_INTRINSIC m256i Vc_CONST or_si256(__m256i x, __m256i y) { return _mm256_castps_si256(_mm256_or_ps(_mm256_castsi256_ps(x), _mm256_castsi256_ps(y))); } - static Vc_INTRINSIC m256i Vc_CONST _mm256_xor_si256(param256i x, param256i y) { + static Vc_INTRINSIC m256i Vc_CONST xor_si256(__m256i x, __m256i y) { return _mm256_castps_si256(_mm256_xor_ps(_mm256_castsi256_ps(x), _mm256_castsi256_ps(y))); } - AVX_TO_SSE_2(packs_epi16) - AVX_TO_SSE_2(packs_epi32) - AVX_TO_SSE_2(packus_epi16) - AVX_TO_SSE_2(unpackhi_epi8) - AVX_TO_SSE_2(unpackhi_epi16) - AVX_TO_SSE_2(unpackhi_epi32) - AVX_TO_SSE_2(unpackhi_epi64) - AVX_TO_SSE_2(unpacklo_epi8) - AVX_TO_SSE_2(unpacklo_epi16) - AVX_TO_SSE_2(unpacklo_epi32) - AVX_TO_SSE_2(unpacklo_epi64) - AVX_TO_SSE_2(add_epi8) - AVX_TO_SSE_2(add_epi16) - AVX_TO_SSE_2(add_epi32) - AVX_TO_SSE_2(add_epi64) - AVX_TO_SSE_2(adds_epi8) - AVX_TO_SSE_2(adds_epi16) - AVX_TO_SSE_2(adds_epu8) - AVX_TO_SSE_2(adds_epu16) - AVX_TO_SSE_2(sub_epi8) - AVX_TO_SSE_2(sub_epi16) - AVX_TO_SSE_2(sub_epi32) - AVX_TO_SSE_2(sub_epi64) - AVX_TO_SSE_2(subs_epi8) - AVX_TO_SSE_2(subs_epi16) - AVX_TO_SSE_2(subs_epu8) - AVX_TO_SSE_2(subs_epu16) - AVX_TO_SSE_2(madd_epi16) - AVX_TO_SSE_2(mulhi_epi16) - AVX_TO_SSE_2(mullo_epi16) - AVX_TO_SSE_2(mul_epu32) - AVX_TO_SSE_1i(slli_epi16) - AVX_TO_SSE_1i(slli_epi32) - AVX_TO_SSE_1i(slli_epi64) - AVX_TO_SSE_1i(srai_epi16) - AVX_TO_SSE_1i(srai_epi32) - AVX_TO_SSE_1i(srli_epi16) - AVX_TO_SSE_1i(srli_epi32) - AVX_TO_SSE_1i(srli_epi64) - AVX_TO_SSE_2(sll_epi16) - AVX_TO_SSE_2(sll_epi32) - AVX_TO_SSE_2(sll_epi64) - AVX_TO_SSE_2(sra_epi16) - AVX_TO_SSE_2(sra_epi32) - AVX_TO_SSE_2(srl_epi16) - AVX_TO_SSE_2(srl_epi32) - AVX_TO_SSE_2(srl_epi64) - AVX_TO_SSE_2(max_epi16) - AVX_TO_SSE_2(max_epu8) - AVX_TO_SSE_2(min_epi16) - AVX_TO_SSE_2(min_epu8) - Vc_INTRINSIC int Vc_CONST _mm256_movemask_epi8(param256i a0) + Vc_INTRINSIC Vc_CONST int movemask_epi8(__m256i a0) { - m128i a1 = _mm256_extractf128_si256(a0, 1); + m128i a1 = extract128<1>(a0); return (_mm_movemask_epi8(a1) << 16) | _mm_movemask_epi8(_mm256_castsi256_si128(a0)); } - AVX_TO_SSE_2(mulhi_epu16) - // shufflehi_epi16 - // shufflelo_epi16 (param128i __A, const int __mask) - // shuffle_epi32 (param128i __A, const int __mask) - // maskmoveu_si128 (param128i __A, param128i __B, char *__C) - AVX_TO_SSE_2(avg_epu8) - AVX_TO_SSE_2(avg_epu16) - AVX_TO_SSE_2(sad_epu8) - // stream_si32 (int *__A, int __B) - // stream_si128 (param128i *__A, param128i __B) - // cvtsi32_si128 (int __A) - // cvtsi64_si128 (long long __A) - // cvtsi64x_si128 (long long __A) - AVX_TO_SSE_2(hadd_epi16) - AVX_TO_SSE_2(hadd_epi32) - AVX_TO_SSE_2(hadds_epi16) - AVX_TO_SSE_2(hsub_epi16) - AVX_TO_SSE_2(hsub_epi32) - AVX_TO_SSE_2(hsubs_epi16) - AVX_TO_SSE_2(maddubs_epi16) - AVX_TO_SSE_2(mulhrs_epi16) - AVX_TO_SSE_2(shuffle_epi8) - AVX_TO_SSE_2(sign_epi8) - AVX_TO_SSE_2(sign_epi16) - AVX_TO_SSE_2(sign_epi32) - // alignr_epi8(param128i __X, param128i __Y, const int __N) - AVX_TO_SSE_1(abs_epi8) - AVX_TO_SSE_1(abs_epi16) - AVX_TO_SSE_1(abs_epi32) -#if !defined(VC_REQUIRES_MACRO_FOR_IMMEDIATE_ARGUMENT) - m256i Vc_INTRINSIC Vc_CONST _mm256_blend_epi16(param256i a0, param256i b0, const int m) { - 
m128i a1 = _mm256_extractf128_si256(a0, 1); - m128i b1 = _mm256_extractf128_si256(b0, 1); + template Vc_INTRINSIC Vc_CONST m256i blend_epi16(param256i a0, param256i b0) + { + m128i a1 = extract128<1>(a0); + m128i b1 = extract128<1>(b0); m128i r0 = _mm_blend_epi16(_mm256_castsi256_si128(a0), _mm256_castsi256_si128(b0), m & 0xff); m128i r1 = _mm_blend_epi16(a1, b1, m >> 8); - return _mm256_insertf128_si256(_mm256_castsi128_si256(r0), r1, 1); + return insert128<1>(_mm256_castsi128_si256(r0), r1); } -#else -# define _mm256_blend_epi16(a0, b0, m) \ - _mm256_insertf128_si256( \ - _mm256_castsi128_si256( \ - _mm_blend_epi16( \ - _mm256_castsi256_si128(a0), _mm256_castsi256_si128(b0), m & 0xff)), \ - _mm_blend_epi16(_mm256_extractf128_si256(a0, 1), _mm256_extractf128_si256(b0, 1), m >> 8);, 1) -#endif - Vc_INTRINSIC m256i Vc_CONST _mm256_blendv_epi8(param256i a0, param256i b0, param256i m0) { - m128i a1 = _mm256_extractf128_si256(a0, 1); - m128i b1 = _mm256_extractf128_si256(b0, 1); - m128i m1 = _mm256_extractf128_si256(m0, 1); + Vc_INTRINSIC Vc_CONST m256i blendv_epi8(param256i a0, param256i b0, param256i m0) { + m128i a1 = extract128<1>(a0); + m128i b1 = extract128<1>(b0); + m128i m1 = extract128<1>(m0); m128i r0 = _mm_blendv_epi8(_mm256_castsi256_si128(a0), _mm256_castsi256_si128(b0), _mm256_castsi256_si128(m0)); m128i r1 = _mm_blendv_epi8(a1, b1, m1); - return _mm256_insertf128_si256(_mm256_castsi128_si256(r0), r1, 1); + return insert128<1>(_mm256_castsi128_si256(r0), r1); } - AVX_TO_SSE_2(cmpeq_epi64) - AVX_TO_SSE_2(min_epi8) - AVX_TO_SSE_2(max_epi8) - AVX_TO_SSE_2(min_epu16) - AVX_TO_SSE_2(max_epu16) - AVX_TO_SSE_2(min_epi32) - AVX_TO_SSE_2(max_epi32) - AVX_TO_SSE_2(min_epu32) - AVX_TO_SSE_2(max_epu32) - AVX_TO_SSE_2(mullo_epi32) - AVX_TO_SSE_2(mul_epi32) -#if !defined(VC_CLANG) || VC_CLANG > 0x30100 - // clang is missing _mm_minpos_epu16 from smmintrin.h - // http://llvm.org/bugs/show_bug.cgi?id=12399 - AVX_TO_SSE_1(minpos_epu16) -#endif - AVX_TO_SSE_1(cvtepi8_epi32) - AVX_TO_SSE_1(cvtepi16_epi32) - AVX_TO_SSE_1(cvtepi8_epi64) - AVX_TO_SSE_1(cvtepi32_epi64) - AVX_TO_SSE_1(cvtepi16_epi64) - AVX_TO_SSE_1(cvtepi8_epi16) - AVX_TO_SSE_1(cvtepu8_epi32) - AVX_TO_SSE_1(cvtepu16_epi32) - AVX_TO_SSE_1(cvtepu8_epi64) - AVX_TO_SSE_1(cvtepu32_epi64) - AVX_TO_SSE_1(cvtepu16_epi64) - AVX_TO_SSE_1(cvtepu8_epi16) - AVX_TO_SSE_2(packus_epi32) - // mpsadbw_epu8 (param128i __X, param128i __Y, const int __M) - // stream_load_si128 (param128i *__X) - AVX_TO_SSE_2(cmpgt_epi64) - -//X static Vc_INTRINSIC m256i _mm256_cmplt_epu8 (param256i a, param256i b) { return _mm256_cmplt_epi8 ( -//X _mm256_xor_si256(a, _mm256_setmin_epi8 ()), _mm256_xor_si256(b, _mm256_setmin_epi8 ())); } -//X static Vc_INTRINSIC m256i _mm256_cmpgt_epu8 (param256i a, param256i b) { return _mm256_cmpgt_epi8 ( -//X _mm256_xor_si256(a, _mm256_setmin_epi8 ()), _mm256_xor_si256(b, _mm256_setmin_epi8 ())); } - static Vc_INTRINSIC m256i Vc_CONST _mm256_cmplt_epu32(param256i _a, param256i _b) { - m256i a = _mm256_castps_si256(_mm256_xor_ps(_mm256_castsi256_ps(_a), _mm256_castsi256_ps(_mm256_setmin_epi32()))); - m256i b = _mm256_castps_si256(_mm256_xor_ps(_mm256_castsi256_ps(_b), _mm256_castsi256_ps(_mm256_setmin_epi32()))); - return _mm256_insertf128_si256(_mm256_castsi128_si256( - _mm_cmplt_epi32(_mm256_castsi256_si128(a), _mm256_castsi256_si128(b))), - _mm_cmplt_epi32(_mm256_extractf128_si256(a, 1), _mm256_extractf128_si256(b, 1)), 1); - } - static Vc_INTRINSIC m256i Vc_CONST _mm256_cmpgt_epu32(param256i _a, param256i _b) { - m256i a = 
_mm256_castps_si256(_mm256_xor_ps(_mm256_castsi256_ps(_a), _mm256_castsi256_ps(_mm256_setmin_epi32()))); - m256i b = _mm256_castps_si256(_mm256_xor_ps(_mm256_castsi256_ps(_b), _mm256_castsi256_ps(_mm256_setmin_epi32()))); - return _mm256_insertf128_si256(_mm256_castsi128_si256( - _mm_cmpgt_epi32(_mm256_castsi256_si128(a), _mm256_castsi256_si128(b))), - _mm_cmpgt_epi32(_mm256_extractf128_si256(a, 1), _mm256_extractf128_si256(b, 1)), 1); - } - - static Vc_INTRINSIC void _mm256_maskstore(float *mem, const param256 mask, const param256 v) { -#ifndef VC_MM256_MASKSTORE_WRONG_MASK_TYPE - _mm256_maskstore_ps(mem, _mm256_castps_si256(mask), v); -#else - _mm256_maskstore_ps(mem, mask, v); -#endif - } - static Vc_INTRINSIC void _mm256_maskstore(double *mem, const param256d mask, const param256d v) { -#ifndef VC_MM256_MASKSTORE_WRONG_MASK_TYPE - _mm256_maskstore_pd(mem, _mm256_castpd_si256(mask), v); + // mpsadbw_epu8 (__m128i __X, __m128i __Y, const int __M) + // stream_load_si128 (__m128i *__X) + +#else // Vc_IMPL_AVX2 + +static Vc_INTRINSIC Vc_CONST m256i xor_si256(__m256i x, __m256i y) { return _mm256_xor_si256(x, y); } +static Vc_INTRINSIC Vc_CONST m256i or_si256(__m256i x, __m256i y) { return _mm256_or_si256(x, y); } +static Vc_INTRINSIC Vc_CONST m256i and_si256(__m256i x, __m256i y) { return _mm256_and_si256(x, y); } +static Vc_INTRINSIC Vc_CONST m256i andnot_si256(__m256i x, __m256i y) { return _mm256_andnot_si256(x, y); } + +template Vc_INTRINSIC Vc_CONST __m256i srli_si256(__m256i a0) +{ + return _mm256_srli_si256(a0, i); +} +template Vc_INTRINSIC Vc_CONST __m256i slli_si256(__m256i a0) +{ + return _mm256_slli_si256(a0, i); +} + +///////////////////////////////////////////////////////////////////////// +// implementation of the intrinsics missing in AVX2 +///////////////////////////////////////////////////////////////////////// +Vc_INTRINSIC Vc_CONST m256i blendv_epi8(__m256i a0, __m256i b0, __m256i m0) +{ + return _mm256_blendv_epi8(a0, b0, m0); +} +Vc_INTRINSIC Vc_CONST int movemask_epi8(__m256i a0) +{ + return _mm256_movemask_epi8(a0); +} + +#endif // Vc_IMPL_AVX2 + +///////////////////////////////////////////////////////////////////////// +// implementation of intrinsics missing in AVX and AVX2 +///////////////////////////////////////////////////////////////////////// + +static Vc_INTRINSIC m256i cmplt_epi64(__m256i a, __m256i b) { + return cmpgt_epi64(b, a); +} +static Vc_INTRINSIC m256i cmplt_epi32(__m256i a, __m256i b) { + return cmpgt_epi32(b, a); +} +static Vc_INTRINSIC m256i cmplt_epi16(__m256i a, __m256i b) { + return cmpgt_epi16(b, a); +} +static Vc_INTRINSIC m256i cmplt_epi8(__m256i a, __m256i b) { + return cmpgt_epi8(b, a); +} + +static Vc_INTRINSIC m256i cmplt_epu8(__m256i a, __m256i b) { + return cmplt_epi8(xor_si256(a, setmin_epi8()), xor_si256(b, setmin_epi8())); +} +static Vc_INTRINSIC m256i cmpgt_epu8(__m256i a, __m256i b) { + return cmpgt_epi8(xor_si256(a, setmin_epi8()), xor_si256(b, setmin_epi8())); +} +#if defined(Vc_IMPL_XOP) + Vc_AVX_TO_SSE_2_NEW(comlt_epu32) + Vc_AVX_TO_SSE_2_NEW(comgt_epu32) + Vc_AVX_TO_SSE_2_NEW(comlt_epu16) + Vc_AVX_TO_SSE_2_NEW(comgt_epu16) + static Vc_INTRINSIC m256i Vc_CONST cmplt_epu32(__m256i a, __m256i b) { return comlt_epu32(a, b); } + static Vc_INTRINSIC m256i Vc_CONST cmpgt_epu32(__m256i a, __m256i b) { return comgt_epu32(a, b); } + static Vc_INTRINSIC m256i Vc_CONST cmplt_epu16(__m256i a, __m256i b) { return comlt_epu16(a, b); } + static Vc_INTRINSIC m256i Vc_CONST cmpgt_epu16(__m256i a, __m256i b) { return comgt_epu16(a, b); } #else 
- _mm256_maskstore_pd(mem, mask, v); + static Vc_INTRINSIC m256i Vc_CONST cmplt_epu32(__m256i _a, __m256i _b) { + m256i a = _mm256_castps_si256(_mm256_xor_ps(_mm256_castsi256_ps(_a), _mm256_castsi256_ps(setmin_epi32()))); + m256i b = _mm256_castps_si256(_mm256_xor_ps(_mm256_castsi256_ps(_b), _mm256_castsi256_ps(setmin_epi32()))); + return cmplt_epi32(a, b); + } + static Vc_INTRINSIC m256i Vc_CONST cmpgt_epu32(__m256i _a, __m256i _b) { + m256i a = _mm256_castps_si256(_mm256_xor_ps(_mm256_castsi256_ps(_a), _mm256_castsi256_ps(setmin_epi32()))); + m256i b = _mm256_castps_si256(_mm256_xor_ps(_mm256_castsi256_ps(_b), _mm256_castsi256_ps(setmin_epi32()))); + return cmpgt_epi32(a, b); + } + static Vc_INTRINSIC m256i Vc_CONST cmplt_epu16(__m256i _a, __m256i _b) { + m256i a = _mm256_castps_si256(_mm256_xor_ps(_mm256_castsi256_ps(_a), _mm256_castsi256_ps(setmin_epi16()))); + m256i b = _mm256_castps_si256(_mm256_xor_ps(_mm256_castsi256_ps(_b), _mm256_castsi256_ps(setmin_epi16()))); + return cmplt_epi16(a, b); + } + static Vc_INTRINSIC m256i Vc_CONST cmpgt_epu16(__m256i _a, __m256i _b) { + m256i a = _mm256_castps_si256(_mm256_xor_ps(_mm256_castsi256_ps(_a), _mm256_castsi256_ps(setmin_epi16()))); + m256i b = _mm256_castps_si256(_mm256_xor_ps(_mm256_castsi256_ps(_b), _mm256_castsi256_ps(setmin_epi16()))); + return cmpgt_epi16(a, b); + } #endif - } - static Vc_INTRINSIC void _mm256_maskstore(int *mem, const param256i mask, const param256i v) { -#ifndef VC_MM256_MASKSTORE_WRONG_MASK_TYPE - _mm256_maskstore_ps(reinterpret_cast(mem), mask, _mm256_castsi256_ps(v)); + +static Vc_INTRINSIC void _mm256_maskstore(float *mem, const __m256 mask, const __m256 v) { + _mm256_maskstore_ps(mem, _mm256_castps_si256(mask), v); +} +static Vc_INTRINSIC void _mm256_maskstore(double *mem, const __m256d mask, const __m256d v) { + _mm256_maskstore_pd(mem, _mm256_castpd_si256(mask), v); +} +static Vc_INTRINSIC void _mm256_maskstore(int *mem, const __m256i mask, const __m256i v) { +#ifdef Vc_IMPL_AVX2 + _mm256_maskstore_epi32(mem, mask, v); #else - _mm256_maskstore_ps(reinterpret_cast(mem), _mm256_castsi256_ps(mask), _mm256_castsi256_ps(v)); -#endif - } - static Vc_INTRINSIC void _mm256_maskstore(unsigned int *mem, const param256i mask, const param256i v) { - _mm256_maskstore(reinterpret_cast(mem), mask, v); - } - -#if defined(VC_IMPL_FMA4) && defined(VC_CLANG) && VC_CLANG < 0x30300 - // clang miscompiles _mm256_macc_ps: http://llvm.org/bugs/show_bug.cgi?id=15040 - static Vc_INTRINSIC __m256 my256_macc_ps(__m256 a, __m256 b, __m256 c) { - __m256 r; - // avoid loading c from memory as that would trigger the bug - asm("vfmaddps %[c], %[b], %[a], %[r]" : [r]"=x"(r) : [a]"x"(a), [b]"x"(b), [c]"x"(c)); - return r; - } -#ifdef _mm256_macc_ps -#undef _mm256_macc_ps + _mm256_maskstore_ps(reinterpret_cast(mem), mask, _mm256_castsi256_ps(v)); #endif -#define _mm256_macc_ps(a, b, c) Vc::AVX::my256_macc_ps(a, b, c) +} +static Vc_INTRINSIC void _mm256_maskstore(unsigned int *mem, const __m256i mask, const __m256i v) { + _mm256_maskstore(reinterpret_cast(mem), mask, v); +} +static Vc_INTRINSIC void _mm256_maskstore(short *mem, const __m256i mask, const __m256i v) { + using namespace AVX; + _mm_maskmoveu_si128(_mm256_castsi256_si128(v), _mm256_castsi256_si128(mask), reinterpret_cast(&mem[0])); + _mm_maskmoveu_si128(extract128<1>(v), extract128<1>(mask), reinterpret_cast(&mem[8])); +} +static Vc_INTRINSIC void _mm256_maskstore(unsigned short *mem, const __m256i mask, const __m256i v) { + _mm256_maskstore(reinterpret_cast(mem), mask, v); +} - 
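[Editorial sketch, not part of the diff.] The new _mm256_maskstore overloads for short/unsigned short above cannot use AVX's native masked stores (those exist only for 32- and 64-bit lanes), so they fall back to _mm_maskmoveu_si128 on the two 128-bit halves, which masks at byte granularity. A standalone sketch of that half-by-half masked store using raw intrinsics only (maskstore_epi16_sketch is an illustrative name, not part of Vc):

// Standalone illustration: masked store of 16 short lanes via _mm_maskmoveu_si128
// on the low and high 128-bit halves. Each byte is written only when the most
// significant bit of the corresponding mask byte is set, so a lane of -1 stores
// the whole short and a lane of 0 leaves memory untouched.
#include <immintrin.h>
#include <cstdio>

static inline void maskstore_epi16_sketch(short *mem, __m256i mask, __m256i v)
{
    _mm_maskmoveu_si128(_mm256_castsi256_si128(v), _mm256_castsi256_si128(mask),
                        reinterpret_cast<char *>(&mem[0]));   // lanes 0..7
    _mm_maskmoveu_si128(_mm256_extractf128_si256(v, 1), _mm256_extractf128_si256(mask, 1),
                        reinterpret_cast<char *>(&mem[8]));   // lanes 8..15
}

int main()
{
    short mem[16] = {0};
    // Select only the even lanes (arguments of _mm256_set_epi16 are given high lane first).
    const __m256i mask = _mm256_set_epi16(0, -1, 0, -1, 0, -1, 0, -1,
                                          0, -1, 0, -1, 0, -1, 0, -1);
    maskstore_epi16_sketch(mem, mask, _mm256_set1_epi16(7));
    std::printf("%d %d\n", mem[0], mem[1]); // prints "7 0"
    return 0;
}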
static Vc_INTRINSIC __m256d my256_macc_pd(__m256d a, __m256d b, __m256d c) { - __m256d r; - // avoid loading c from memory as that would trigger the bug - asm("vfmaddpd %[c], %[b], %[a], %[r]" : [r]"=x"(r) : [a]"x"(a), [b]"x"(b), [c]"x"(c)); - return r; - } -#ifdef _mm256_macc_pd -#undef _mm256_macc_pd -#endif -#define _mm256_macc_pd(a, b, c) Vc::AVX::my256_macc_pd(a, b, c) +#undef Vc_AVX_TO_SSE_1 +#undef Vc_AVX_TO_SSE_1_128 +#undef Vc_AVX_TO_SSE_2_NEW +#undef Vc_AVX_TO_SSE_256_128 +#undef Vc_AVX_TO_SSE_1i + +template Vc_INTRINSIC_L R stream_load(const float *mem) Vc_INTRINSIC_R; +template<> Vc_INTRINSIC m128 stream_load(const float *mem) +{ + return _mm_castsi128_ps(_mm_stream_load_si128(reinterpret_cast<__m128i *>(const_cast(mem)))); +} +template<> Vc_INTRINSIC m256 stream_load(const float *mem) +{ + return insert128<1>(_mm256_castps128_ps256(stream_load(mem)), + stream_load(mem + 4)); +} + +template Vc_INTRINSIC_L R stream_load(const double *mem) Vc_INTRINSIC_R; +template<> Vc_INTRINSIC m128d stream_load(const double *mem) +{ + return _mm_castsi128_pd(_mm_stream_load_si128(reinterpret_cast<__m128i *>(const_cast(mem)))); +} +template<> Vc_INTRINSIC m256d stream_load(const double *mem) +{ + return insert128<1>(_mm256_castpd128_pd256(stream_load(mem)), + stream_load(mem + 2)); +} + +template Vc_INTRINSIC_L R stream_load(const void *mem) Vc_INTRINSIC_R; +template<> Vc_INTRINSIC m128i stream_load(const void *mem) +{ + return _mm_stream_load_si128(reinterpret_cast<__m128i *>(const_cast(mem))); +} +template<> Vc_INTRINSIC m256i stream_load(const void *mem) +{ + return insert128<1>(_mm256_castsi128_si256(stream_load(mem)), + stream_load(static_cast(mem) + 1)); +} + +Vc_INTRINSIC void stream_store(float *mem, __m128 value, __m128 mask) +{ + _mm_maskmoveu_si128(_mm_castps_si128(value), _mm_castps_si128(mask), reinterpret_cast(mem)); +} +Vc_INTRINSIC void stream_store(float *mem, __m256 value, __m256 mask) +{ + stream_store(mem, _mm256_castps256_ps128(value), _mm256_castps256_ps128(mask)); + stream_store(mem + 4, extract128<1>(value), extract128<1>(mask)); +} +Vc_INTRINSIC void stream_store(double *mem, __m128d value, __m128d mask) +{ + _mm_maskmoveu_si128(_mm_castpd_si128(value), _mm_castpd_si128(mask), reinterpret_cast(mem)); +} +Vc_INTRINSIC void stream_store(double *mem, __m256d value, __m256d mask) +{ + stream_store(mem, _mm256_castpd256_pd128(value), _mm256_castpd256_pd128(mask)); + stream_store(mem + 2, extract128<1>(value), extract128<1>(mask)); +} +Vc_INTRINSIC void stream_store(void *mem, __m128i value, __m128i mask) +{ + _mm_maskmoveu_si128(value, mask, reinterpret_cast(mem)); +} +Vc_INTRINSIC void stream_store(void *mem, __m256i value, __m256i mask) +{ + stream_store(mem, _mm256_castsi256_si128(value), _mm256_castsi256_si128(mask)); + stream_store(static_cast<__m128i *>(mem) + 1, extract128<1>(value), extract128<1>(mask)); +} + +#ifndef __x86_64__ +Vc_INTRINSIC Vc_PURE __m128i _mm_cvtsi64_si128(int64_t x) { + return _mm_castpd_si128(_mm_load_sd(reinterpret_cast(&x))); +} #endif -} // namespace AVX -} // namespace Vc -/*OUTER_NAMESPACE_END*/ -#include "undomacros.h" -#include "shuffle.h" +} // namespace AvxIntrinsics +} // namespace Vc + +namespace Vc_VERSIONED_NAMESPACE +{ +namespace AVX +{ + using namespace AvxIntrinsics; +} // namespace AVX +namespace AVX2 +{ + using namespace AvxIntrinsics; +} // namespace AVX2 +namespace AVX +{ + template struct VectorTypeHelper; + template<> struct VectorTypeHelper< char > { typedef __m256i Type; }; + template<> struct VectorTypeHelper< signed 
char > { typedef __m256i Type; }; + template<> struct VectorTypeHelper { typedef __m256i Type; }; + template<> struct VectorTypeHelper< short> { typedef __m256i Type; }; + template<> struct VectorTypeHelper { typedef __m256i Type; }; + template<> struct VectorTypeHelper< int > { typedef __m256i Type; }; + template<> struct VectorTypeHelper { typedef __m256i Type; }; + template<> struct VectorTypeHelper< long > { typedef __m256i Type; }; + template<> struct VectorTypeHelper { typedef __m256i Type; }; + template<> struct VectorTypeHelper< long long> { typedef __m256i Type; }; + template<> struct VectorTypeHelper { typedef __m256i Type; }; + template<> struct VectorTypeHelper< float> { typedef __m256 Type; }; + template<> struct VectorTypeHelper< double> { typedef __m256d Type; }; + + template struct SseVectorType; + template<> struct SseVectorType<__m256 > { typedef __m128 Type; }; + template<> struct SseVectorType<__m256i> { typedef __m128i Type; }; + template<> struct SseVectorType<__m256d> { typedef __m128d Type; }; + template<> struct SseVectorType<__m128 > { typedef __m128 Type; }; + template<> struct SseVectorType<__m128i> { typedef __m128i Type; }; + template<> struct SseVectorType<__m128d> { typedef __m128d Type; }; + + template + using IntegerVectorType = + typename std::conditional::type; + template + using DoubleVectorType = + typename std::conditional::type; + template + using FloatVectorType = + typename std::conditional::type; + + template struct VectorHelper {}; + template struct GatherHelper; + template struct ScatterHelper; + + template struct HasVectorDivisionHelper { enum { Value = 1 }; }; + template struct VectorHelperSize; +} // namespace AVX +} // namespace Vc -#endif // VC_AVX_INTRINSICS_H +#endif // VC_AVX_INTRINSICS_H_ diff -Nru vc-0.7.4/avx/limits.h vc-1.3.0/avx/limits.h --- vc-0.7.4/avx/limits.h 2014-05-15 06:38:24.000000000 -0500 +++ vc-1.3.0/avx/limits.h 2016-10-27 02:05:02.000000000 -0500 @@ -1,55 +1,87 @@ -/* This file is part of the Vc library. +/* This file is part of the Vc library. {{{ +Copyright © 2009-2015 Matthias Kretz - Copyright (C) 2009-2011 Matthias Kretz +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the names of contributing organizations nor the + names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY +DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - Vc is free software: you can redistribute it and/or modify - it under the terms of the GNU Lesser General Public License as - published by the Free Software Foundation, either version 3 of - the License, or (at your option) any later version. +}}}*/ - Vc is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with Vc. If not, see . - -*/ - -#ifndef VC_AVX_LIMITS_H -#define VC_AVX_LIMITS_H +#ifndef VC_AVX_LIMITS_H_ +#define VC_AVX_LIMITS_H_ #include "intrinsics.h" #include "types.h" +#include "macros.h" namespace std { -#define _VC_NUM_LIM(T, _max, _min) \ -template<> struct numeric_limits< ::Vc::AVX::Vector > : public numeric_limits \ -{ \ - static Vc_INTRINSIC Vc_CONST ::Vc::AVX::Vector max() _VC_NOEXCEPT { return _max; } \ - static Vc_INTRINSIC Vc_CONST ::Vc::AVX::Vector min() _VC_NOEXCEPT { return _min; } \ - static Vc_INTRINSIC Vc_CONST ::Vc::AVX::Vector lowest() _VC_NOEXCEPT { return min(); } \ - static Vc_INTRINSIC Vc_CONST ::Vc::AVX::Vector epsilon() _VC_NOEXCEPT { return ::Vc::AVX::Vector::Zero(); } \ - static Vc_INTRINSIC Vc_CONST ::Vc::AVX::Vector round_error() _VC_NOEXCEPT { return ::Vc::AVX::Vector::Zero(); } \ - static Vc_INTRINSIC Vc_CONST ::Vc::AVX::Vector infinity() _VC_NOEXCEPT { return ::Vc::AVX::Vector::Zero(); } \ - static Vc_INTRINSIC Vc_CONST ::Vc::AVX::Vector quiet_NaN() _VC_NOEXCEPT { return ::Vc::AVX::Vector::Zero(); } \ - static Vc_INTRINSIC Vc_CONST ::Vc::AVX::Vector signaling_NaN() _VC_NOEXCEPT { return ::Vc::AVX::Vector::Zero(); } \ - static Vc_INTRINSIC Vc_CONST ::Vc::AVX::Vector denorm_min() _VC_NOEXCEPT { return ::Vc::AVX::Vector::Zero(); } \ -} - -#ifndef VC_IMPL_AVX2 -namespace { - using ::Vc::AVX::_mm256_srli_epi32; -} +#define Vc_NUM_LIM(T, _max, _min) \ + template <> struct numeric_limits> : public numeric_limits { \ + static Vc_INTRINSIC Vc_CONST Vc::AVX2::Vector max() Vc_NOEXCEPT \ + { \ + return _max; \ + } \ + static Vc_INTRINSIC Vc_CONST Vc::AVX2::Vector min() Vc_NOEXCEPT \ + { \ + return _min; \ + } \ + static Vc_INTRINSIC Vc_CONST Vc::AVX2::Vector lowest() Vc_NOEXCEPT \ + { \ + return min(); \ + } \ + static Vc_INTRINSIC Vc_CONST Vc::AVX2::Vector epsilon() Vc_NOEXCEPT \ + { \ + return Vc::AVX2::Vector::Zero(); \ + } \ + static Vc_INTRINSIC Vc_CONST Vc::AVX2::Vector round_error() Vc_NOEXCEPT \ + { \ + return Vc::AVX2::Vector::Zero(); \ + } \ + static Vc_INTRINSIC Vc_CONST Vc::AVX2::Vector infinity() Vc_NOEXCEPT \ + { \ + return Vc::AVX2::Vector::Zero(); \ + } \ + static Vc_INTRINSIC Vc_CONST Vc::AVX2::Vector quiet_NaN() Vc_NOEXCEPT \ + { \ + return Vc::AVX2::Vector::Zero(); \ + } \ + static Vc_INTRINSIC Vc_CONST Vc::AVX2::Vector signaling_NaN() Vc_NOEXCEPT \ + { \ + return Vc::AVX2::Vector::Zero(); \ + } \ + static Vc_INTRINSIC Vc_CONST Vc::AVX2::Vector denorm_min() Vc_NOEXCEPT \ + { \ + 
return Vc::AVX2::Vector::Zero(); \ + } \ + } + +#ifdef Vc_IMPL_AVX2 +Vc_NUM_LIM(unsigned short, Vc::Detail::allone<__m256i>(), Vc::Detail::zero<__m256i>()); +Vc_NUM_LIM( short, _mm256_srli_epi16(Vc::Detail::allone<__m256i>(), 1), Vc::AVX::setmin_epi16()); +Vc_NUM_LIM( unsigned int, Vc::Detail::allone<__m256i>(), Vc::Detail::zero<__m256i>()); +Vc_NUM_LIM( int, _mm256_srli_epi32(Vc::Detail::allone<__m256i>(), 1), Vc::AVX::setmin_epi32()); #endif -_VC_NUM_LIM(unsigned short, ::Vc::AVX::_mm_setallone_si128(), _mm_setzero_si128()); -_VC_NUM_LIM( short, _mm_srli_epi16(::Vc::AVX::_mm_setallone_si128(), 1), ::Vc::AVX::_mm_setmin_epi16()); -_VC_NUM_LIM( unsigned int, ::Vc::AVX::_mm256_setallone_si256(), _mm256_setzero_si256()); -_VC_NUM_LIM( int, _mm256_srli_epi32(::Vc::AVX::_mm256_setallone_si256(), 1), ::Vc::AVX::_mm256_setmin_epi32()); -#undef _VC_NUM_LIM +#undef Vc_NUM_LIM } // namespace std -#endif // VC_AVX_LIMITS_H +#endif // VC_AVX_LIMITS_H_ diff -Nru vc-0.7.4/avx/macros.h vc-1.3.0/avx/macros.h --- vc-0.7.4/avx/macros.h 2014-05-15 06:38:24.000000000 -0500 +++ vc-1.3.0/avx/macros.h 2016-10-27 02:05:02.000000000 -0500 @@ -1,26 +1,33 @@ -/* This file is part of the Vc library. +/* This file is part of the Vc library. {{{ +Copyright © 2009-2015 Matthias Kretz - Copyright (C) 2009-2011 Matthias Kretz +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the names of contributing organizations nor the + names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY +DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - Vc is free software: you can redistribute it and/or modify - it under the terms of the GNU Lesser General Public License as - published by the Free Software Foundation, either version 3 of - the License, or (at your option) any later version. - - Vc is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with Vc. If not, see . 
- -*/ +}}}*/ #include "../common/macros.h" -#ifndef VC_AVX_MACROS_H -#define VC_AVX_MACROS_H -#undef VC_AVX_UNDOMACROS_H +#ifndef VC_AVX_MACROS_H_ +#define VC_AVX_MACROS_H_ -#endif // VC_AVX_MACROS_H +#endif // VC_AVX_MACROS_H_ diff -Nru vc-0.7.4/avx/mask.h vc-1.3.0/avx/mask.h --- vc-0.7.4/avx/mask.h 2014-05-15 06:38:24.000000000 -0500 +++ vc-1.3.0/avx/mask.h 2016-10-27 02:05:02.000000000 -0500 @@ -1,246 +1,221 @@ -/* This file is part of the Vc library. +/* This file is part of the Vc library. {{{ +Copyright © 2009-2015 Matthias Kretz - Copyright (C) 2009-2012 Matthias Kretz +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the names of contributing organizations nor the + names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY +DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - Vc is free software: you can redistribute it and/or modify - it under the terms of the GNU Lesser General Public License as - published by the Free Software Foundation, either version 3 of - the License, or (at your option) any later version. +}}}*/ - Vc is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU Lesser General Public License for more details. +#ifndef VC_AVX_MASK_H_ +#define VC_AVX_MASK_H_ - You should have received a copy of the GNU Lesser General Public - License along with Vc. If not, see . 
- -*/ - -#ifndef VC_AVX_MASK_H -#define VC_AVX_MASK_H +#include #include "intrinsics.h" +#include "../common/storage.h" #include "../common/bitscanintrinsics.h" +#include "../common/maskbool.h" +#include "detail.h" #include "macros.h" -/*OUTER_NAMESPACE_BEGIN*/ -namespace Vc -{ -namespace AVX +namespace Vc_VERSIONED_NAMESPACE { - -template class Mask +template class Mask { - friend class Mask<4u, 32u>; // double_v - friend class Mask<8u, 32u>; // float_v, (u)int_v - friend class Mask<8u, 16u>; // (u)short_v - friend class Mask<16u, 16u>; // (u)char_v - public: - FREE_STORE_OPERATORS_ALIGNED(32) - - // abstracts the way Masks are passed to functions, it can easily be changed to const ref here -#if defined VC_MSVC && defined _WIN32 - typedef const Mask &AsArg; -#else - typedef Mask AsArg; -#endif - - Vc_ALWAYS_INLINE Mask() {} - Vc_ALWAYS_INLINE Mask(param256 x) : k(x) {} - Vc_ALWAYS_INLINE Mask(param256d x) : k(_mm256_castpd_ps(x)) {} - Vc_ALWAYS_INLINE Mask(param256i x) : k(_mm256_castsi256_ps(x)) {} -#ifdef VC_UNCONDITIONAL_AVX2_INTRINSICS - Vc_ALWAYS_INLINE Mask(__m256 x) : k(x) {} - Vc_ALWAYS_INLINE Mask(__m256d x) : k(_mm256_castpd_ps(x)) {} - Vc_ALWAYS_INLINE Mask(__m256i x) : k(_mm256_castsi256_ps(x)) {} -#endif - Vc_ALWAYS_INLINE explicit Mask(VectorSpecialInitializerZero::ZEnum) : k(_mm256_setzero_ps()) {} - Vc_ALWAYS_INLINE explicit Mask(VectorSpecialInitializerOne::OEnum) : k(_mm256_setallone_ps()) {} - Vc_ALWAYS_INLINE explicit Mask(bool b) : k(b ? _mm256_setallone_ps() : m256(_mm256_setzero_ps())) {} - Vc_ALWAYS_INLINE Mask(const Mask &rhs) : k(rhs.k) {} - Vc_ALWAYS_INLINE Mask(const Mask &rhs) : k(avx_cast(concat( - _mm_unpacklo_epi16(rhs.dataI(), rhs.dataI()), - _mm_unpackhi_epi16(rhs.dataI(), rhs.dataI())))) {} - Vc_ALWAYS_INLINE_L Mask(const Mask &m) Vc_ALWAYS_INLINE_R; - Vc_ALWAYS_INLINE_L Mask(const Mask &m) Vc_ALWAYS_INLINE_R; - - Vc_ALWAYS_INLINE bool operator==(const Mask &rhs) const { return 0 != _mm256_testc_ps(k, rhs.k); } - Vc_ALWAYS_INLINE bool operator!=(const Mask &rhs) const { return 0 == _mm256_testc_ps(k, rhs.k); } - - Vc_ALWAYS_INLINE Mask operator!() const { return _mm256_andnot_ps(data(), _mm256_setallone_ps()); } - - Vc_ALWAYS_INLINE Mask &operator&=(const Mask &rhs) { k = _mm256_and_ps(k, rhs.k); return *this; } - Vc_ALWAYS_INLINE Mask &operator|=(const Mask &rhs) { k = _mm256_or_ps (k, rhs.k); return *this; } - Vc_ALWAYS_INLINE Mask &operator^=(const Mask &rhs) { k = _mm256_xor_ps(k, rhs.k); return *this; } +public: + using abi = VectorAbi::Avx; - // no need for expression template optimizations because cmp(n)eq for floats are not bitwise - // compares - Vc_ALWAYS_INLINE bool isFull () const { return 0 != _mm256_testc_ps(k, _mm256_setallone_ps()); } - Vc_ALWAYS_INLINE bool isEmpty() const { return 0 != _mm256_testz_ps(k, k); } - Vc_ALWAYS_INLINE bool isMix () const { return 0 != _mm256_testnzc_ps(k, _mm256_setallone_ps()); } - -#ifndef VC_NO_AUTOMATIC_BOOL_FROM_MASK - Vc_ALWAYS_INLINE operator bool() const { return isFull(); } -#endif - - Vc_ALWAYS_INLINE_L Vc_PURE_L int shiftMask() const Vc_ALWAYS_INLINE_R Vc_PURE_R; - Vc_ALWAYS_INLINE_L Vc_PURE_L int toInt() const Vc_ALWAYS_INLINE_R Vc_PURE_R; - - Vc_ALWAYS_INLINE m256 data () const { return k; } - Vc_ALWAYS_INLINE m256i dataI() const { return _mm256_castps_si256(k); } - Vc_ALWAYS_INLINE m256d dataD() const { return _mm256_castps_pd(k); } + /** + * The \c EntryType of masks is always bool, independent of \c T. 
+ */ + typedef bool EntryType; + using value_type = EntryType; + + using MaskBool = Common::MaskBool; + /** + * The \c VectorEntryType, in contrast to \c EntryType, reveals information about the SIMD + * implementation. This type is useful for the \c sizeof operator in generic functions. + */ + using VectorEntryType = MaskBool; + + /** + * The associated Vector type. + */ + using Vector = AVX2::Vector; + + ///\internal + using VectorTypeF = AVX::FloatVectorType::Type>; + ///\internal + using VectorTypeD = AVX::DoubleVectorType; + ///\internal + using VectorTypeI = AVX::IntegerVectorType; + +private: + typedef const VectorTypeF VArg; + typedef const VectorTypeD VdArg; + typedef const VectorTypeI ViArg; + +public: + static constexpr size_t Size = sizeof(VectorTypeF) / sizeof(T); + static constexpr size_t MemoryAlignment = Size; + static constexpr std::size_t size() { return Size; } + Vc_FREE_STORE_OPERATORS_ALIGNED(alignof(VectorType)); + +private: + typedef Common::Storage Storage; + +public: + /** + * The \c VectorType reveals the implementation-specific internal type used for the + * SIMD type. + */ + using VectorType = typename Storage::VectorType; - Vc_ALWAYS_INLINE_L Vc_PURE_L bool operator[](int index) const Vc_ALWAYS_INLINE_R Vc_PURE_R; - - Vc_ALWAYS_INLINE_L Vc_PURE_L int count() const Vc_ALWAYS_INLINE_R Vc_PURE_R; - Vc_ALWAYS_INLINE_L Vc_PURE_L int firstOne() const Vc_ALWAYS_INLINE_R Vc_PURE_R; - - private: -#ifdef VC_COMPILE_BENCHMARKS - public: -#endif - m256 k; -}; - -template class Mask -{ - friend class Mask<4u, 32u>; // double_v - friend class Mask<8u, 32u>; // float_v, (u)int_v - friend class Mask<8u, 16u>; // (u)short_v - friend class Mask<16u, 16u>; // (u)char_v - public: - FREE_STORE_OPERATORS_ALIGNED(16) + using EntryReference = Vc::Detail::ElementReference; + using reference = EntryReference; // abstracts the way Masks are passed to functions, it can easily be changed to const ref here -#if defined VC_MSVC && defined _WIN32 - typedef const Mask &AsArg; +#if defined Vc_MSVC && defined _WIN32 + typedef const Mask &AsArg; #else - typedef Mask AsArg; + typedef const Mask AsArg; #endif - Vc_ALWAYS_INLINE Mask() {} - Vc_ALWAYS_INLINE Mask(param128 x) : k(x) {} - Vc_ALWAYS_INLINE Mask(param128d x) : k(_mm_castpd_ps(x)) {} - Vc_ALWAYS_INLINE Mask(param128i x) : k(_mm_castsi128_ps(x)) {} -#ifdef VC_UNCONDITIONAL_AVX2_INTRINSICS - Vc_ALWAYS_INLINE Mask(__m128 x) : k(x) {} - Vc_ALWAYS_INLINE Mask(__m128d x) : k(_mm_castpd_ps(x)) {} - Vc_ALWAYS_INLINE Mask(__m128i x) : k(_mm_castsi128_ps(x)) {} -#endif - Vc_ALWAYS_INLINE explicit Mask(VectorSpecialInitializerZero::ZEnum) : k(_mm_setzero_ps()) {} - Vc_ALWAYS_INLINE explicit Mask(VectorSpecialInitializerOne::OEnum) : k(_mm_setallone_ps()) {} - Vc_ALWAYS_INLINE explicit Mask(bool b) : k(b ? 
_mm_setallone_ps() : m128(_mm_setzero_ps())) {} - Vc_ALWAYS_INLINE Mask(const Mask &rhs) : k(rhs.k) {} - Vc_ALWAYS_INLINE Mask(const Mask &rhs) : k(avx_cast( - _mm_packs_epi32(avx_cast(rhs.data()), _mm256_extractf128_si256(rhs.dataI(), 1)))) {} - Vc_ALWAYS_INLINE Mask(const Mask *a) : k(avx_cast( - _mm_packs_epi16(a[0].dataI(), a[1].dataI()))) {} - - Vc_ALWAYS_INLINE bool operator==(const Mask &rhs) const { return 0 != _mm_testc_si128(dataI(), rhs.dataI()); } - Vc_ALWAYS_INLINE bool operator!=(const Mask &rhs) const { return 0 == _mm_testc_si128(dataI(), rhs.dataI()); } - - Vc_ALWAYS_INLINE Mask operator!() const { return _mm_andnot_ps(data(), _mm_setallone_ps()); } - - Vc_ALWAYS_INLINE Mask &operator&=(const Mask &rhs) { k = _mm_and_ps(k, rhs.k); return *this; } - Vc_ALWAYS_INLINE Mask &operator|=(const Mask &rhs) { k = _mm_or_ps (k, rhs.k); return *this; } - Vc_ALWAYS_INLINE Mask &operator^=(const Mask &rhs) { k = _mm_xor_ps(k, rhs.k); return *this; } - - // TODO: use expression templates to optimize (v1 == v2).isFull() and friends - Vc_ALWAYS_INLINE bool isFull () const { return 0 != _mm_testc_si128(dataI(), _mm_setallone_si128()); } - Vc_ALWAYS_INLINE bool isEmpty() const { return 0 != _mm_testz_si128(dataI(), dataI()); } - Vc_ALWAYS_INLINE bool isMix () const { return 0 != _mm_testnzc_si128(dataI(), _mm_setallone_si128()); } + Vc_INTRINSIC Mask() {} + Vc_INTRINSIC Mask(VArg x) : d(AVX::avx_cast(x)) {} + Vc_INTRINSIC Mask(VdArg x) : d(AVX::avx_cast(x)) {} + Vc_INTRINSIC Mask(ViArg x) : d(AVX::avx_cast(x)) {} + Vc_INTRINSIC explicit Mask(VectorSpecialInitializerZero) : d(Detail::zero()) {} + Vc_INTRINSIC explicit Mask(VectorSpecialInitializerOne) : d(Detail::allone()) {} + Vc_INTRINSIC explicit Mask(bool b) + : d(b ? Detail::allone() : Detail::zero()) + { + } + Vc_INTRINSIC static Mask Zero() { return Mask{Vc::Zero}; } + Vc_INTRINSIC static Mask One() { return Mask{Vc::One}; } + + // implicit cast + template + Vc_INTRINSIC Mask(U &&rhs, + Common::enable_if_mask_converts_implicitly = nullarg) + : d(AVX::avx_cast( + Detail::mask_cast::Size, Size, VectorTypeF>( + rhs.dataI()))) + { + } + +#if Vc_IS_VERSION_1 + // explicit cast, implemented via simd_cast (in avx/simd_cast_caller.h) + template + Vc_DEPRECATED("use simd_cast instead of explicit type casting to convert between " + "mask types") Vc_INTRINSIC + explicit Mask(U &&rhs, + Common::enable_if_mask_converts_explicitly = nullarg); +#endif + + template Vc_INTRINSIC explicit Mask(const bool *mem, Flags f = Flags()) { load(mem, f); } + + template Vc_INTRINSIC void load(const bool *mem, Flags = Flags()); + + template Vc_INTRINSIC void store(bool *mem, Flags = Flags()) const; + + Vc_INTRINSIC Mask &operator=(const Mask &) = default; + Vc_INTRINSIC_L Mask &operator=(const std::array &values) Vc_INTRINSIC_R; + Vc_INTRINSIC_L operator std::array() const Vc_INTRINSIC_R; + + // specializations in mask.tcc + Vc_INTRINSIC Vc_PURE bool operator==(const Mask &rhs) const + { return Detail::movemask(d.v()) == Detail::movemask(rhs.d.v()); } + + Vc_INTRINSIC Vc_PURE bool operator!=(const Mask &rhs) const + { return !operator==(rhs); } + + Vc_INTRINSIC Mask operator!() const { return Detail::andnot_(data(), Detail::allone()); } + + Vc_INTRINSIC Mask &operator&=(const Mask &rhs) { d.v() = AVX::avx_cast(Detail::and_(data(), rhs.data())); return *this; } + Vc_INTRINSIC Mask &operator|=(const Mask &rhs) { d.v() = AVX::avx_cast(Detail::or_ (data(), rhs.data())); return *this; } + Vc_INTRINSIC Mask &operator^=(const Mask &rhs) { d.v() = 
AVX::avx_cast(Detail::xor_(data(), rhs.data())); return *this; } + + Vc_INTRINSIC Vc_PURE Mask operator&(const Mask &rhs) const { return Detail::and_(data(), rhs.data()); } + Vc_INTRINSIC Vc_PURE Mask operator|(const Mask &rhs) const { return Detail::or_(data(), rhs.data()); } + Vc_INTRINSIC Vc_PURE Mask operator^(const Mask &rhs) const { return Detail::xor_(data(), rhs.data()); } -#ifndef VC_NO_AUTOMATIC_BOOL_FROM_MASK - Vc_ALWAYS_INLINE operator bool() const { return isFull(); } -#endif + Vc_INTRINSIC Vc_PURE Mask operator&&(const Mask &rhs) const { return Detail::and_(data(), rhs.data()); } + Vc_INTRINSIC Vc_PURE Mask operator||(const Mask &rhs) const { return Detail::or_(data(), rhs.data()); } - Vc_ALWAYS_INLINE_L Vc_PURE_L int shiftMask() const Vc_ALWAYS_INLINE_R Vc_PURE_R; - Vc_ALWAYS_INLINE_L Vc_PURE_L int toInt() const Vc_ALWAYS_INLINE_R Vc_PURE_R; + // no need for expression template optimizations because cmp(n)eq for floats are not bitwise + // compares + Vc_INTRINSIC_L bool isNotEmpty() const Vc_INTRINSIC_R; + Vc_INTRINSIC_L bool isEmpty() const Vc_INTRINSIC_R; + Vc_INTRINSIC_L bool isFull() const Vc_INTRINSIC_R; + Vc_INTRINSIC_L bool isMix() const Vc_INTRINSIC_R; + + Vc_INTRINSIC Vc_PURE int shiftMask() const { return Detail::movemask(dataI()); } + Vc_INTRINSIC Vc_PURE int toInt() const { return Detail::mask_to_int(dataI()); } + + Vc_INTRINSIC VectorTypeF data () const { return AVX::avx_cast(d.v()); } + Vc_INTRINSIC VectorTypeI dataI() const { return AVX::avx_cast(d.v()); } + Vc_INTRINSIC VectorTypeD dataD() const { return AVX::avx_cast(d.v()); } + +private: + friend reference; + static Vc_INTRINSIC Vc_PURE value_type get(const Mask &m, int i) noexcept + { + return m.toInt() & (1 << i); + } + template + static Vc_INTRINSIC void set(Mask &m, int i, + U &&v) noexcept(noexcept(MaskBool(std::declval()))) + { + m.d.set(i, MaskBool(std::forward(v))); + } - Vc_ALWAYS_INLINE m128 data () const { return k; } - Vc_ALWAYS_INLINE m128i dataI() const { return avx_cast(k); } - Vc_ALWAYS_INLINE m128d dataD() const { return avx_cast(k); } +public: + Vc_ALWAYS_INLINE reference operator[](size_t index) noexcept + { + return {*this, int(index)}; + } + Vc_ALWAYS_INLINE Vc_PURE value_type operator[](size_t index) const noexcept + { + return get(*this, index); + } - Vc_ALWAYS_INLINE_L Vc_PURE_L bool operator[](int index) const Vc_ALWAYS_INLINE_R Vc_PURE_R; + Vc_INTRINSIC Vc_PURE int count() const { return Detail::popcnt16(toInt()); } + Vc_INTRINSIC Vc_PURE int firstOne() const { return _bit_scan_forward(toInt()); } - Vc_ALWAYS_INLINE_L Vc_PURE_L int count() const Vc_ALWAYS_INLINE_R Vc_PURE_R; - Vc_ALWAYS_INLINE_L Vc_PURE_L int firstOne() const Vc_ALWAYS_INLINE_R Vc_PURE_R; + template static Vc_INTRINSIC_L Mask generate(G &&gen) Vc_INTRINSIC_R; + Vc_INTRINSIC_L Vc_PURE_L Mask shifted(int amount) const Vc_INTRINSIC_R Vc_PURE_R; private: -#ifdef VC_COMPILE_BENCHMARKS +#ifdef Vc_COMPILE_BENCHMARKS public: #endif - m128 k; -}; - -struct ForeachHelper -{ - size_t mask; - bool brk; - bool outerBreak; - Vc_ALWAYS_INLINE ForeachHelper(size_t _mask) : mask(_mask), brk(false), outerBreak(false) {} - Vc_ALWAYS_INLINE bool outer() const { return mask != 0 && !outerBreak; } - Vc_ALWAYS_INLINE bool inner() { return (brk = !brk); } - Vc_ALWAYS_INLINE void noBreak() { outerBreak = false; } - Vc_ALWAYS_INLINE size_t next() { - outerBreak = true; -#ifdef VC_GNU_ASM - const size_t bit = __builtin_ctzl(mask); - __asm__("btr %1,%0" : "+r"(mask) : "r"(bit)); -#else -#ifdef VC_MSVC -#pragma warning(suppress : 4267) // 
conversion from 'size_t' to 'unsigned long', possible loss of data -#endif - const size_t bit = _bit_scan_forward(mask); - mask &= ~(1 << bit); -#endif - return bit; - } + Storage d; }; +template constexpr size_t Mask::Size; +template constexpr size_t Mask::MemoryAlignment; -#define Vc_foreach_bit(_it_, _mask_) \ - for (Vc::AVX::ForeachHelper Vc__make_unique(foreach_bit_obj)((_mask_).toInt()); Vc__make_unique(foreach_bit_obj).outer(); ) \ - for (_it_ = Vc__make_unique(foreach_bit_obj).next(); Vc__make_unique(foreach_bit_obj).inner(); Vc__make_unique(foreach_bit_obj).noBreak()) - -// Operators -namespace Intrinsics -{ - static Vc_ALWAYS_INLINE Vc_PURE m256 and_(param256 a, param256 b) { return _mm256_and_ps(a, b); } - static Vc_ALWAYS_INLINE Vc_PURE m256 or_(param256 a, param256 b) { return _mm256_or_ps(a, b); } - static Vc_ALWAYS_INLINE Vc_PURE m256 xor_(param256 a, param256 b) { return _mm256_xor_ps(a, b); } - - static Vc_ALWAYS_INLINE Vc_PURE m128 and_(param128 a, param128 b) { return _mm_and_ps(a, b); } - static Vc_ALWAYS_INLINE Vc_PURE m128 or_(param128 a, param128 b) { return _mm_or_ps(a, b); } - static Vc_ALWAYS_INLINE Vc_PURE m128 xor_(param128 a, param128 b) { return _mm_xor_ps(a, b); } -} // namespace Intrinsics - -// binary and/or/xor cannot work with one operand larger than the other -template void operator&(const Mask &l, const Mask &r); -template void operator|(const Mask &l, const Mask &r); -template void operator^(const Mask &l, const Mask &r); - -// let binary and/or/xor work for any combination of masks (as long as they have the same sizeof) -template Vc_ALWAYS_INLINE Vc_PURE Mask operator&(const Mask &l, const Mask &r) { return Intrinsics::and_(l.data(), r.data()); } -template Vc_ALWAYS_INLINE Vc_PURE Mask operator|(const Mask &l, const Mask &r) { return Intrinsics:: or_(l.data(), r.data()); } -template Vc_ALWAYS_INLINE Vc_PURE Mask operator^(const Mask &l, const Mask &r) { return Intrinsics::xor_(l.data(), r.data()); } - -// disable logical and/or for incompatible masks -template void operator&&(const Mask &lhs, const Mask &rhs); -template void operator||(const Mask &lhs, const Mask &rhs); - -// logical and/or for compatible masks -template Vc_ALWAYS_INLINE Vc_PURE Mask operator&&(const Mask &lhs, const Mask &rhs) { return lhs && static_cast >(rhs); } -template Vc_ALWAYS_INLINE Vc_PURE Mask operator||(const Mask &lhs, const Mask &rhs) { return lhs || static_cast >(rhs); } - -template Vc_ALWAYS_INLINE Vc_PURE Mask operator&&(const Mask &lhs, const Mask &rhs) { return Intrinsics::and_(lhs.data(), rhs.data()); } -template Vc_ALWAYS_INLINE Vc_PURE Mask operator||(const Mask &lhs, const Mask &rhs) { return Intrinsics::or_ (lhs.data(), rhs.data()); } - -} // namespace AVX -} // namespace Vc -/*OUTER_NAMESPACE_END*/ +} // namespace Vc #include "mask.tcc" -#include "undomacros.h" -#endif // VC_AVX_MASK_H +#endif // VC_AVX_MASK_H_ diff -Nru vc-0.7.4/avx/mask.tcc vc-1.3.0/avx/mask.tcc --- vc-0.7.4/avx/mask.tcc 2014-05-15 06:38:24.000000000 -0500 +++ vc-1.3.0/avx/mask.tcc 2016-10-27 02:05:02.000000000 -0500 @@ -1,75 +1,292 @@ -/* This file is part of the Vc library. +/* This file is part of the Vc library. {{{ +Copyright © 2011-2015 Matthias Kretz - Copyright (C) 2011-2012 Matthias Kretz +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. 
+ * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the names of contributing organizations nor the + names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY +DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - Vc is free software: you can redistribute it and/or modify - it under the terms of the GNU Lesser General Public License as - published by the Free Software Foundation, either version 3 of - the License, or (at your option) any later version. - - Vc is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU Lesser General Public License for more details. +}}}*/ - You should have received a copy of the GNU Lesser General Public - License along with Vc. If not, see . - -*/ - -/*OUTER_NAMESPACE_BEGIN*/ -namespace Vc +namespace Vc_VERSIONED_NAMESPACE { -namespace AVX +// store {{{1 +template +template +Vc_INTRINSIC void Mask::store(bool *mem, Flags f) const { + Detail::mask_store(dataI(), mem, f); +} -template<> Vc_ALWAYS_INLINE Mask<4, 32>::Mask(const Mask<8, 32> &m) - : k(concat(_mm_unpacklo_ps(lo128(m.data()), lo128(m.data())), - _mm_unpackhi_ps(lo128(m.data()), lo128(m.data())))) +// load {{{1 +template +template +Vc_INTRINSIC void Mask::load(const bool *mem, Flags f) { + d.v() = AVX::avx_cast(Detail::mask_load(mem, f)); } -template<> Vc_ALWAYS_INLINE Mask<8, 32>::Mask(const Mask<4, 32> &m) - // aabb ccdd -> abcd 0000 - : k(concat(Mem::shuffle(lo128(m.data()), hi128(m.data())), - _mm_setzero_ps())) +// operator[] {{{1 +#ifdef Vc_IMPL_AVX2 +template <> +Vc_INTRINSIC Vc_PURE bool AVX2::Mask::get(const AVX2::Mask &m, + int index) noexcept { + return m.shiftMask() & (1 << 2 * index); } +template <> +Vc_INTRINSIC Vc_PURE bool AVX2::Mask::get(const AVX2::Mask &m, + int index) noexcept +{ + return m.shiftMask() & (1 << 2 * index); +} +#endif +// operator== {{{1 +template <> Vc_INTRINSIC Vc_PURE bool AVX2::double_m::operator==(const AVX2::double_m &rhs) const +{ return Detail::movemask(dataD()) == Detail::movemask(rhs.dataD()); } +#ifdef Vc_IMPL_AVX2 +template <> Vc_INTRINSIC Vc_PURE bool AVX2::short_m::operator==(const AVX2::short_m &rhs) const +{ return Detail::movemask(dataI()) == Detail::movemask(rhs.dataI()); } +template <> Vc_INTRINSIC Vc_PURE bool AVX2::ushort_m::operator==(const AVX2::ushort_m &rhs) const +{ return Detail::movemask(dataI()) == Detail::movemask(rhs.dataI()); } +#endif -template Vc_ALWAYS_INLINE Vc_PURE int Mask::shiftMask() const +// isFull, isNotEmpty, isEmpty, isMix specializations{{{1 +template 
Vc_INTRINSIC bool Mask::isFull() const { + if (sizeof(T) == 8) { + return 0 != Detail::testc(dataD(), Detail::allone()); + } else if (sizeof(T) == 4) { + return 0 != Detail::testc(data (), Detail::allone()); + } else { + return 0 != Detail::testc(dataI(), Detail::allone()); + } +} + +template Vc_INTRINSIC bool Mask::isNotEmpty() const { + if (sizeof(T) == 8) { + return 0 == Detail::testz(dataD(), dataD()); + } else if (sizeof(T) == 4) { + return 0 == Detail::testz(data (), data ()); + } else { + return 0 == Detail::testz(dataI(), dataI()); + } +} + +template Vc_INTRINSIC bool Mask::isEmpty() const { + if (sizeof(T) == 8) { + return 0 != Detail::testz(dataD(), dataD()); + } else if (sizeof(T) == 4) { + return 0 != Detail::testz(data (), data ()); + } else { + return 0 != Detail::testz(dataI(), dataI()); + } +} + +template Vc_INTRINSIC bool Mask::isMix() const { + if (sizeof(T) == 8) { + return 0 != Detail::testnzc(dataD(), Detail::allone()); + } else if (sizeof(T) == 4) { + return 0 != Detail::testnzc(data (), Detail::allone()); + } else { + return 0 != Detail::testnzc(dataI(), Detail::allone()); + } +} + +// generate {{{1 +template +Vc_INTRINSIC M generate_impl(G &&gen, std::integral_constant) { - return _mm256_movemask_epi8(dataI()); + return _mm256_setr_epi64x( + gen(0) ? 0xffffffffffffffffull : 0, gen(1) ? 0xffffffffffffffffull : 0, + gen(2) ? 0xffffffffffffffffull : 0, gen(3) ? 0xffffffffffffffffull : 0); } -template Vc_ALWAYS_INLINE Vc_PURE int Mask::shiftMask() const +template +Vc_INTRINSIC M generate_impl(G &&gen, std::integral_constant) { - return _mm_movemask_epi8(dataI()); + return _mm256_setr_epi32(gen(0) ? 0xfffffffful : 0, gen(1) ? 0xfffffffful : 0, + gen(2) ? 0xfffffffful : 0, gen(3) ? 0xfffffffful : 0, + gen(4) ? 0xfffffffful : 0, gen(5) ? 0xfffffffful : 0, + gen(6) ? 0xfffffffful : 0, gen(7) ? 0xfffffffful : 0); } +template +Vc_INTRINSIC M generate_impl(G &&gen, std::integral_constant) +{ + return _mm256_setr_epi16(gen(0) ? 0xfffful : 0, gen(1) ? 0xfffful : 0, + gen(2) ? 0xfffful : 0, gen(3) ? 0xfffful : 0, + gen(4) ? 0xfffful : 0, gen(5) ? 0xfffful : 0, + gen(6) ? 0xfffful : 0, gen(7) ? 0xfffful : 0, + gen(8) ? 0xfffful : 0, gen(9) ? 0xfffful : 0, + gen(10) ? 0xfffful : 0, gen(11) ? 0xfffful : 0, + gen(12) ? 0xfffful : 0, gen(13) ? 0xfffful : 0, + gen(14) ? 0xfffful : 0, gen(15) ? 
0xfffful : 0); +} +template +template +Vc_INTRINSIC AVX2::Mask Mask::generate(G &&gen) +{ + return generate_impl>(std::forward(gen), + std::integral_constant()); +} +// shifted {{{1 +template Vc_INTRINSIC Vc_PURE AVX2::Mask Mask::shifted(int amount) const +{ + switch (amount * int(sizeof(VectorEntryType))) { + case 0: return *this; + case 1: return Detail::shifted< 1>(dataI()); + case 2: return Detail::shifted< 2>(dataI()); + case 3: return Detail::shifted< 3>(dataI()); + case 4: return Detail::shifted< 4>(dataI()); + case 5: return Detail::shifted< 5>(dataI()); + case 6: return Detail::shifted< 6>(dataI()); + case 7: return Detail::shifted< 7>(dataI()); + case 8: return Detail::shifted< 8>(dataI()); + case 9: return Detail::shifted< 9>(dataI()); + case 10: return Detail::shifted< 10>(dataI()); + case 11: return Detail::shifted< 11>(dataI()); + case 12: return Detail::shifted< 12>(dataI()); + case 13: return Detail::shifted< 13>(dataI()); + case 14: return Detail::shifted< 14>(dataI()); + case 15: return Detail::shifted< 15>(dataI()); + case 16: return Detail::shifted< 16>(dataI()); + case 17: return Detail::shifted< 17>(dataI()); + case 18: return Detail::shifted< 18>(dataI()); + case 19: return Detail::shifted< 19>(dataI()); + case 20: return Detail::shifted< 20>(dataI()); + case 21: return Detail::shifted< 21>(dataI()); + case 22: return Detail::shifted< 22>(dataI()); + case 23: return Detail::shifted< 23>(dataI()); + case 24: return Detail::shifted< 24>(dataI()); + case 25: return Detail::shifted< 25>(dataI()); + case 26: return Detail::shifted< 26>(dataI()); + case 27: return Detail::shifted< 27>(dataI()); + case 28: return Detail::shifted< 28>(dataI()); + case 29: return Detail::shifted< 29>(dataI()); + case 30: return Detail::shifted< 30>(dataI()); + case 31: return Detail::shifted< 31>(dataI()); + case -1: return Detail::shifted< -1>(dataI()); + case -2: return Detail::shifted< -2>(dataI()); + case -3: return Detail::shifted< -3>(dataI()); + case -4: return Detail::shifted< -4>(dataI()); + case -5: return Detail::shifted< -5>(dataI()); + case -6: return Detail::shifted< -6>(dataI()); + case -7: return Detail::shifted< -7>(dataI()); + case -8: return Detail::shifted< -8>(dataI()); + case -9: return Detail::shifted< -9>(dataI()); + case -10: return Detail::shifted<-10>(dataI()); + case -11: return Detail::shifted<-11>(dataI()); + case -12: return Detail::shifted<-12>(dataI()); + case -13: return Detail::shifted<-13>(dataI()); + case -14: return Detail::shifted<-14>(dataI()); + case -15: return Detail::shifted<-15>(dataI()); + case -16: return Detail::shifted<-16>(dataI()); + case -17: return Detail::shifted<-17>(dataI()); + case -18: return Detail::shifted<-18>(dataI()); + case -19: return Detail::shifted<-19>(dataI()); + case -20: return Detail::shifted<-20>(dataI()); + case -21: return Detail::shifted<-21>(dataI()); + case -22: return Detail::shifted<-22>(dataI()); + case -23: return Detail::shifted<-23>(dataI()); + case -24: return Detail::shifted<-24>(dataI()); + case -25: return Detail::shifted<-25>(dataI()); + case -26: return Detail::shifted<-26>(dataI()); + case -27: return Detail::shifted<-27>(dataI()); + case -28: return Detail::shifted<-28>(dataI()); + case -29: return Detail::shifted<-29>(dataI()); + case -30: return Detail::shifted<-30>(dataI()); + case -31: return Detail::shifted<-31>(dataI()); + } + return Zero(); +} +// }}}1 -template<> Vc_ALWAYS_INLINE Vc_PURE int Mask< 4, 32>::toInt() const { return _mm256_movemask_pd(dataD()); } -template<> Vc_ALWAYS_INLINE Vc_PURE 
int Mask< 8, 32>::toInt() const { return _mm256_movemask_ps(data ()); } -template<> Vc_ALWAYS_INLINE Vc_PURE int Mask< 8, 16>::toInt() const { return _mm_movemask_epi8(_mm_packs_epi16(dataI(), _mm_setzero_si128())); } -template<> Vc_ALWAYS_INLINE Vc_PURE int Mask<16, 16>::toInt() const { return _mm_movemask_epi8(dataI()); } +/* +template<> Vc_INTRINSIC AVX2::Mask< 4, 32> &AVX2::Mask< 4, 32>::operator=(const std::array &values) { + static_assert(sizeof(bool) == 1, "Vc expects bool to have a sizeof 1 Byte"); + unsigned int x = *reinterpret_cast(values.data()); + x *= 0xffu; + __m128i y = _mm_cvtsi32_si128(x); // 4 Bytes + y = _mm_unpacklo_epi8(y, y); // 8 Bytes + y = _mm_unpacklo_epi16(y, y); // 16 Bytes + d.v() = AVX::avx_cast<__m256>(AVX::concat(_mm_unpacklo_epi32(y, y), _mm_unpackhi_epi32(y, y))); + return *this; +} +template<> Vc_INTRINSIC AVX2::Mask< 8, 32> &AVX2::Mask< 8, 32>::operator=(const std::array &values) { + static_assert(sizeof(bool) == 1, "Vc expects bool to have a sizeof 1 Byte"); + unsigned long long x = *reinterpret_cast(values.data()); + x *= 0xffull; + __m128i y = _mm_cvtsi64_si128(x); // 8 Bytes + y = _mm_unpacklo_epi8(y, y); // 16 Bytes + d.v() = AVX::avx_cast<__m256>(AVX::concat(_mm_unpacklo_epi16(y, y), _mm_unpackhi_epi16(y, y))); + return *this; +} +template<> Vc_INTRINSIC AVX2::Mask< 8, 16> &AVX2::Mask< 8, 16>::operator=(const std::array &values) { + static_assert(sizeof(bool) == 1, "Vc expects bool to have a sizeof 1 Byte"); + unsigned long long x = *reinterpret_cast(values.data()); + x *= 0xffull; + __m128i y = _mm_cvtsi64_si128(x); // 8 Bytes + d.v() = AVX::avx_cast<__m128>(_mm_unpacklo_epi8(y, y)); + return *this; +} +template<> Vc_INTRINSIC AVX2::Mask<16, 16> &AVX2::Mask<16, 16>::operator=(const std::array &values) { + static_assert(sizeof(bool) == 1, "Vc expects bool to have a sizeof 1 Byte"); + __m128i x = _mm_loadu_si128(reinterpret_cast(values.data())); + d.v() = _mm_andnot_ps(AVX::_mm_setallone_ps(), AVX::avx_cast<__m128>(_mm_sub_epi8(x, _mm_set1_epi8(1)))); + return *this; +} -template<> Vc_ALWAYS_INLINE Vc_PURE bool Mask< 4, 32>::operator[](int index) const { return toInt() & (1 << index); } -template<> Vc_ALWAYS_INLINE Vc_PURE bool Mask< 8, 32>::operator[](int index) const { return toInt() & (1 << index); } -template<> Vc_ALWAYS_INLINE Vc_PURE bool Mask< 8, 16>::operator[](int index) const { return shiftMask() & (1 << 2 * index); } -template<> Vc_ALWAYS_INLINE Vc_PURE bool Mask<16, 16>::operator[](int index) const { return toInt() & (1 << index); } +template<> Vc_INTRINSIC AVX2::Mask< 4, 32>::operator std::array() const { + static_assert(sizeof(bool) == 1, "Vc expects bool to have a sizeof 1 Byte"); + __m128i x = _mm_packs_epi32(AVX::lo128(dataI()), AVX::hi128(dataI())); // 64bit -> 32bit + x = _mm_packs_epi32(x, x); // 32bit -> 16bit + x = _mm_srli_epi16(x, 15); + x = _mm_packs_epi16(x, x); // 16bit -> 8bit + std::array r; + asm volatile("vmovd %1,%0" : "=m"(*r.data()) : "x"(x)); + return r; +} +template<> Vc_INTRINSIC AVX2::Mask< 8, 32>::operator std::array() const { + static_assert(sizeof(bool) == 1, "Vc expects bool to have a sizeof 1 Byte"); + __m128i x = _mm_packs_epi32(AVX::lo128(dataI()), AVX::hi128(dataI())); // 32bit -> 16bit + x = _mm_srli_epi16(x, 15); + x = _mm_packs_epi16(x, x); // 16bit -> 8bit + std::array r; + asm volatile("vmovq %1,%0" : "=m"(*r.data()) : "x"(x)); + return r; +} +template<> Vc_INTRINSIC AVX2::Mask< 8, 16>::operator std::array() const { + static_assert(sizeof(bool) == 1, "Vc expects bool to have a sizeof 1 Byte"); + 
__m128i x = _mm_srli_epi16(dataI(), 15); + x = _mm_packs_epi16(x, x); // 16bit -> 8bit + std::array r; + asm volatile("vmovq %1,%0" : "=m"(*r.data()) : "x"(x)); + return r; +} +template<> Vc_INTRINSIC AVX2::Mask<16, 16>::operator std::array() const { + static_assert(sizeof(bool) == 1, "Vc expects bool to have a sizeof 1 Byte"); + __m128 x = _mm_and_ps(d.v(), AVX::avx_cast<__m128>(_mm_set1_epi32(0x01010101))); + std::array r; + asm volatile("vmovups %1,%0" : "=m"(*r.data()) : "x"(x)); + return r; +} +*/ -#ifndef VC_IMPL_POPCNT -static Vc_ALWAYS_INLINE Vc_CONST unsigned int _mm_popcnt_u32(unsigned int n) { - n = (n & 0x55555555U) + ((n >> 1) & 0x55555555U); - n = (n & 0x33333333U) + ((n >> 2) & 0x33333333U); - n = (n & 0x0f0f0f0fU) + ((n >> 4) & 0x0f0f0f0fU); - //n = (n & 0x00ff00ffU) + ((n >> 8) & 0x00ff00ffU); - //n = (n & 0x0000ffffU) + ((n >>16) & 0x0000ffffU); - return n; } -#endif -template Vc_ALWAYS_INLINE Vc_PURE int Mask::count() const { return _mm_popcnt_u32(toInt()); } -template Vc_ALWAYS_INLINE Vc_PURE int Mask::count() const { return _mm_popcnt_u32(toInt()); } -template Vc_ALWAYS_INLINE Vc_PURE int Mask::firstOne() const { return _bit_scan_forward(toInt()); } -template Vc_ALWAYS_INLINE Vc_PURE int Mask::firstOne() const { return _bit_scan_forward(toInt()); } - -} // namespace AVX -} // namespace Vc -/*OUTER_NAMESPACE_END*/ + +// vim: foldmethod=marker diff -Nru vc-0.7.4/avx/math.h vc-1.3.0/avx/math.h --- vc-0.7.4/avx/math.h 2014-05-15 06:38:24.000000000 -0500 +++ vc-1.3.0/avx/math.h 2016-10-27 02:05:02.000000000 -0500 @@ -1,119 +1,316 @@ -/* This file is part of the Vc library. +/* This file is part of the Vc library. {{{ +Copyright © 2009-2015 Matthias Kretz - Copyright (C) 2009-2012 Matthias Kretz +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the names of contributing organizations nor the + names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY +DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - Vc is free software: you can redistribute it and/or modify - it under the terms of the GNU Lesser General Public License as - published by the Free Software Foundation, either version 3 of - the License, or (at your option) any later version. 
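// Editorial aside (not part of the patch): the _mm_popcnt_u32 fallback removed
// from the old mask.tcc above counts set bits with a SWAR reduction, first in
// pairs, then in nibbles.  Tracing n = 0b1101 (three bits set):
//   pairs:   (1101 & 0101) + ((1101 >> 1) & 0101) = 0101 + 0100 = 1001
//   nibbles: (1001 & 0011) + ((1001 >> 2) & 0011) = 0001 + 0010 = 0011 = 3
// Vc 1.x drops the fallback: Mask::count() now routes through Detail::popcnt16,
// as the new avx/mask.h above shows.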
+}}}*/ - Vc is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with Vc. If not, see . - -*/ - -#ifndef VC_AVX_MATH_H -#define VC_AVX_MATH_H +#ifndef VC_AVX_MATH_H_ +#define VC_AVX_MATH_H_ #include "const.h" #include "limits.h" #include "macros.h" -/*OUTER_NAMESPACE_BEGIN*/ -namespace Vc +namespace Vc_VERSIONED_NAMESPACE +{ +// min & max {{{1 +#ifdef Vc_IMPL_AVX2 +Vc_ALWAYS_INLINE AVX2::int_v min(const AVX2::int_v &x, const AVX2::int_v &y) { return _mm256_min_epi32(x.data(), y.data()); } +Vc_ALWAYS_INLINE AVX2::uint_v min(const AVX2::uint_v &x, const AVX2::uint_v &y) { return _mm256_min_epu32(x.data(), y.data()); } +Vc_ALWAYS_INLINE AVX2::short_v min(const AVX2::short_v &x, const AVX2::short_v &y) { return _mm256_min_epi16(x.data(), y.data()); } +Vc_ALWAYS_INLINE AVX2::ushort_v min(const AVX2::ushort_v &x, const AVX2::ushort_v &y) { return _mm256_min_epu16(x.data(), y.data()); } +Vc_ALWAYS_INLINE AVX2::int_v max(const AVX2::int_v &x, const AVX2::int_v &y) { return _mm256_max_epi32(x.data(), y.data()); } +Vc_ALWAYS_INLINE AVX2::uint_v max(const AVX2::uint_v &x, const AVX2::uint_v &y) { return _mm256_max_epu32(x.data(), y.data()); } +Vc_ALWAYS_INLINE AVX2::short_v max(const AVX2::short_v &x, const AVX2::short_v &y) { return _mm256_max_epi16(x.data(), y.data()); } +Vc_ALWAYS_INLINE AVX2::ushort_v max(const AVX2::ushort_v &x, const AVX2::ushort_v &y) { return _mm256_max_epu16(x.data(), y.data()); } +#endif +Vc_ALWAYS_INLINE AVX2::float_v min(const AVX2::float_v &x, const AVX2::float_v &y) { return _mm256_min_ps(x.data(), y.data()); } +Vc_ALWAYS_INLINE AVX2::double_v min(const AVX2::double_v &x, const AVX2::double_v &y) { return _mm256_min_pd(x.data(), y.data()); } +Vc_ALWAYS_INLINE AVX2::float_v max(const AVX2::float_v &x, const AVX2::float_v &y) { return _mm256_max_ps(x.data(), y.data()); } +Vc_ALWAYS_INLINE AVX2::double_v max(const AVX2::double_v &x, const AVX2::double_v &y) { return _mm256_max_pd(x.data(), y.data()); } + +// sqrt {{{1 +template +Vc_ALWAYS_INLINE Vc_PURE AVX2::Vector sqrt(const AVX2::Vector &x) +{ + return AVX::VectorHelper::sqrt(x.data()); +} + +// rsqrt {{{1 +template +Vc_ALWAYS_INLINE Vc_PURE AVX2::Vector rsqrt(const AVX2::Vector &x) +{ + return AVX::VectorHelper::rsqrt(x.data()); +} + +// reciprocal {{{1 +template +Vc_ALWAYS_INLINE Vc_PURE AVX2::Vector reciprocal(const AVX2::Vector &x) +{ + return AVX::VectorHelper::reciprocal(x.data()); +} + +// round {{{1 +template +Vc_ALWAYS_INLINE Vc_PURE AVX2::Vector round(const AVX2::Vector &x) +{ + return AVX::VectorHelper::round(x.data()); +} + +// abs {{{1 +Vc_INTRINSIC Vc_CONST AVX2::double_v abs(AVX2::double_v x) +{ + return Detail::and_(x.data(), AVX::setabsmask_pd()); +} +Vc_INTRINSIC Vc_CONST AVX2::float_v abs(AVX2::float_v x) +{ + return Detail::and_(x.data(), AVX::setabsmask_ps()); +} +#ifdef Vc_IMPL_AVX2 +Vc_INTRINSIC Vc_CONST AVX2::int_v abs(AVX2::int_v x) +{ + return _mm256_abs_epi32(x.data()); +} +Vc_INTRINSIC Vc_CONST AVX2::short_v abs(AVX2::short_v x) +{ + return _mm256_abs_epi16(x.data()); +} +#endif + +// isfinite {{{1 +Vc_ALWAYS_INLINE Vc_PURE AVX2::double_m isfinite(const AVX2::double_v &x) +{ + return AVX::cmpord_pd(x.data(), _mm256_mul_pd(Detail::zero<__m256d>(), x.data())); +} + +Vc_ALWAYS_INLINE Vc_PURE AVX2::float_m isfinite(const AVX2::float_v 
&x) +{ + return AVX::cmpord_ps(x.data(), _mm256_mul_ps(Detail::zero<__m256>(), x.data())); +} + +// isinf {{{1 +Vc_ALWAYS_INLINE Vc_PURE AVX2::double_m isinf(const AVX2::double_v &x) +{ + return _mm256_castsi256_pd(AVX::cmpeq_epi64( + _mm256_castpd_si256(abs(x).data()), + _mm256_castpd_si256(Detail::avx_broadcast(AVX::c_log::d(1))))); +} + +Vc_ALWAYS_INLINE Vc_PURE AVX2::float_m isinf(const AVX2::float_v &x) +{ + return _mm256_castsi256_ps( + AVX::cmpeq_epi32(_mm256_castps_si256(abs(x).data()), + _mm256_castps_si256(Detail::avx_broadcast(AVX::c_log::d(1))))); +} + +// isnan {{{1 +Vc_ALWAYS_INLINE Vc_PURE AVX2::double_m isnan(const AVX2::double_v &x) { -namespace AVX + return AVX::cmpunord_pd(x.data(), x.data()); +} + +Vc_ALWAYS_INLINE Vc_PURE AVX2::float_m isnan(const AVX2::float_v &x) +{ + return AVX::cmpunord_ps(x.data(), x.data()); +} + +// copysign {{{1 +Vc_INTRINSIC Vc_CONST AVX2::float_v copysign(AVX2::float_v mag, AVX2::float_v sign) +{ + return _mm256_or_ps(_mm256_and_ps(sign.data(), AVX::setsignmask_ps()), + _mm256_and_ps(mag.data(), AVX::setabsmask_ps())); +} +Vc_INTRINSIC Vc_CONST AVX2::double_v copysign(AVX2::double_v::AsArg mag, + AVX2::double_v::AsArg sign) { - /** - * splits \p v into exponent and mantissa, the sign is kept with the mantissa - * - * The return value will be in the range [0.5, 1.0[ - * The \p e value will be an integer defining the power-of-two exponent - */ - inline double_v frexp(double_v::AsArg v, int_v *e) { - const m256d exponentBits = Const::exponentMask().dataD(); - const m256d exponentPart = _mm256_and_pd(v.data(), exponentBits); - e->data() = _mm256_sub_epi32(_mm256_srli_epi64(avx_cast(exponentPart), 52), _mm256_set1_epi32(0x3fe)); - const m256d exponentMaximized = _mm256_or_pd(v.data(), exponentBits); - double_v ret = _mm256_and_pd(exponentMaximized, _mm256_broadcast_sd(reinterpret_cast(&c_general::frexpMask))); - double_m zeroMask = v == double_v::Zero(); - ret(isnan(v) || !isfinite(v) || zeroMask) = v; - e->setZero(zeroMask.data()); - return ret; - } - inline float_v frexp(float_v::AsArg v, int_v *e) { - const m256 exponentBits = Const::exponentMask().data(); - const m256 exponentPart = _mm256_and_ps(v.data(), exponentBits); - e->data() = _mm256_sub_epi32(_mm256_srli_epi32(avx_cast(exponentPart), 23), _mm256_set1_epi32(0x7e)); - const m256 exponentMaximized = _mm256_or_ps(v.data(), exponentBits); - float_v ret = _mm256_and_ps(exponentMaximized, avx_cast(_mm256_set1_epi32(0xbf7fffffu))); - ret(isnan(v) || !isfinite(v) || v == float_v::Zero()) = v; - e->setZero(v == float_v::Zero()); - return ret; - } - inline sfloat_v frexp(sfloat_v::AsArg v, short_v *e) { - const m256 exponentBits = Const::exponentMask().data(); - const m256 exponentPart = _mm256_and_ps(v.data(), exponentBits); - e->data() = _mm_sub_epi16(_mm_packs_epi32(_mm_srli_epi32(avx_cast(exponentPart), 23), - _mm_srli_epi32(avx_cast(hi128(exponentPart)), 23)), _mm_set1_epi16(0x7e)); - const m256 exponentMaximized = _mm256_or_ps(v.data(), exponentBits); - sfloat_v ret = _mm256_and_ps(exponentMaximized, avx_cast(_mm256_set1_epi32(0xbf7fffffu))); - ret(isnan(v) || !isfinite(v) || v == sfloat_v::Zero()) = v; - e->setZero(v == sfloat_v::Zero()); - return ret; - } - - /* -> x * 2^e - * x == NaN -> NaN - * x == (-)inf -> (-)inf - */ - inline double_v ldexp(double_v::AsArg v, int_v::AsArg _e) { - int_v e = _e; - e.setZero((v == double_v::Zero()).dataI()); - const m256i exponentBits = _mm256_slli_epi64(e.data(), 52); - return avx_cast(_mm256_add_epi64(avx_cast(v.data()), exponentBits)); - } - inline 
float_v ldexp(float_v::AsArg v, int_v::AsArg _e) { - int_v e = _e; - e.setZero(static_cast(v == float_v::Zero())); - return (v.reinterpretCast() + (e << 23)).reinterpretCast(); - } - inline sfloat_v ldexp(sfloat_v::AsArg v, short_v::AsArg _e) { - short_v e = _e; - e.setZero(static_cast(v == sfloat_v::Zero())); - e = e << (23 - 16); - const m256i exponentBits = concat(_mm_unpacklo_epi16(_mm_setzero_si128(), e.data()), - _mm_unpackhi_epi16(_mm_setzero_si128(), e.data())); - return (v.reinterpretCast() + int_v(exponentBits)).reinterpretCast(); - } - - static Vc_ALWAYS_INLINE float_v trunc( float_v::AsArg v) { return _mm256_round_ps(v.data(), 0x3); } - static Vc_ALWAYS_INLINE sfloat_v trunc(sfloat_v::AsArg v) { return _mm256_round_ps(v.data(), 0x3); } - static Vc_ALWAYS_INLINE double_v trunc(double_v::AsArg v) { return _mm256_round_pd(v.data(), 0x3); } - - static Vc_ALWAYS_INLINE float_v floor(float_v::AsArg v) { return _mm256_floor_ps(v.data()); } - static Vc_ALWAYS_INLINE sfloat_v floor(sfloat_v::AsArg v) { return _mm256_floor_ps(v.data()); } - static Vc_ALWAYS_INLINE double_v floor(double_v::AsArg v) { return _mm256_floor_pd(v.data()); } - - static Vc_ALWAYS_INLINE float_v ceil(float_v::AsArg v) { return _mm256_ceil_ps(v.data()); } - static Vc_ALWAYS_INLINE sfloat_v ceil(sfloat_v::AsArg v) { return _mm256_ceil_ps(v.data()); } - static Vc_ALWAYS_INLINE double_v ceil(double_v::AsArg v) { return _mm256_ceil_pd(v.data()); } -} // namespace AVX -} // namespace Vc -/*OUTER_NAMESPACE_END*/ - -#include "undomacros.h" -#define VC__USE_NAMESPACE AVX -#include "../common/trigonometric.h" -#define VC__USE_NAMESPACE AVX -#include "../common/logarithm.h" -#define VC__USE_NAMESPACE AVX -#include "../common/exponential.h" -#undef VC__USE_NAMESPACE + return _mm256_or_pd(_mm256_and_pd(sign.data(), AVX::setsignmask_pd()), + _mm256_and_pd(mag.data(), AVX::setabsmask_pd())); +} + +//}}}1 +// frexp {{{1 +/** + * splits \p v into exponent and mantissa, the sign is kept with the mantissa + * + * The return value will be in the range [0.5, 1.0[ + * The \p e value will be an integer defining the power-of-two exponent + */ +inline AVX2::double_v frexp(AVX2::double_v::AsArg v, SimdArray *e) +{ + const __m256d exponentBits = AVX::Const::exponentMask().dataD(); + const __m256d exponentPart = _mm256_and_pd(v.data(), exponentBits); + auto lo = AVX::avx_cast<__m128i>(AVX::lo128(exponentPart)); + auto hi = AVX::avx_cast<__m128i>(AVX::hi128(exponentPart)); + lo = _mm_sub_epi32(_mm_srli_epi64(lo, 52), _mm_set1_epi64x(0x3fe)); + hi = _mm_sub_epi32(_mm_srli_epi64(hi, 52), _mm_set1_epi64x(0x3fe)); + SSE::int_v exponent = Mem::shuffle(lo, hi); + const __m256d exponentMaximized = _mm256_or_pd(v.data(), exponentBits); + AVX2::double_v ret = + _mm256_and_pd(exponentMaximized, + _mm256_broadcast_sd(reinterpret_cast(&AVX::c_general::frexpMask))); + const double_m zeroMask = v == AVX2::double_v::Zero(); + ret(isnan(v) || !isfinite(v) || zeroMask) = v; + exponent.setZero(simd_cast(zeroMask)); + internal_data(*e) = exponent; + return ret; +} + +#ifdef Vc_IMPL_AVX2 +inline SimdArray frexp( + const SimdArray &v, + SimdArray *e) +{ + const __m256d exponentBits = AVX::Const::exponentMask().dataD(); + const __m256d w[2] = {internal_data(internal_data0(v)).data(), + internal_data(internal_data1(v)).data()}; + const __m256i exponentPart[2] = { + _mm256_castpd_si256(_mm256_and_pd(w[0], exponentBits)), + _mm256_castpd_si256(_mm256_and_pd(w[1], exponentBits))}; + const __m256i lo = _mm256_sub_epi32(_mm256_srli_epi64(exponentPart[0], 52), + 
_mm256_set1_epi32(0x3fe)); // 0.1. 2.3. + const __m256i hi = _mm256_sub_epi32(_mm256_srli_epi64(exponentPart[1], 52), + _mm256_set1_epi32(0x3fe)); // 4.5. 6.7. + const __m256i a = _mm256_unpacklo_epi32(lo, hi); // 04.. 26.. + const __m256i b = _mm256_unpackhi_epi32(lo, hi); // 15.. 37.. + const __m256i tmp = _mm256_unpacklo_epi32(a, b); // 0145 2367 + const __m256i exponent = + AVX::concat(_mm_unpacklo_epi64(AVX::lo128(tmp), AVX::hi128(tmp)), + _mm_unpackhi_epi64(AVX::lo128(tmp), AVX::hi128(tmp))); // 0123 4567 + const __m256d exponentMaximized[2] = {_mm256_or_pd(w[0], exponentBits), + _mm256_or_pd(w[1], exponentBits)}; + const auto frexpMask = + _mm256_broadcast_sd(reinterpret_cast(&AVX::c_general::frexpMask)); + SimdArray ret = { + SimdArray( + _mm256_and_pd(exponentMaximized[0], frexpMask)), + SimdArray( + _mm256_and_pd(exponentMaximized[1], frexpMask))}; + const auto zeroMask = v == v.Zero(); + ret(isnan(v) || !isfinite(v) || zeroMask) = v; + internal_data(*e) = + Detail::andnot_(simd_cast(zeroMask).dataI(), exponent); + return ret; +} +#endif // Vc_IMPL_AVX2 + +namespace Detail +{ +Vc_INTRINSIC AVX2::float_v::IndexType extractExponent(__m256 e) +{ + SimdArray exponentPart; + const auto ee = AVX::avx_cast<__m256i>(e); +#ifdef Vc_IMPL_AVX2 + exponentPart = AVX2::uint_v(ee); +#else + internal_data(internal_data0(exponentPart)) = AVX::lo128(ee); + internal_data(internal_data1(exponentPart)) = AVX::hi128(ee); +#endif + return (exponentPart >> 23) - 0x7e; +} +} // namespace Detail +inline AVX2::float_v frexp(AVX2::float_v::AsArg v, AVX2::float_v::IndexType *e) +{ + using namespace Detail; + using namespace AVX2; + const __m256 exponentBits = Const::exponentMask().data(); + *e = extractExponent(and_(v.data(), exponentBits)); + const __m256 exponentMaximized = or_(v.data(), exponentBits); + AVX2::float_v ret = _mm256_and_ps(exponentMaximized, avx_cast<__m256>(set1_epi32(0xbf7fffffu))); + ret(isnan(v) || !isfinite(v) || v == AVX2::float_v::Zero()) = v; + e->setZero(simd_cast(v == AVX2::float_v::Zero())); + return ret; +} + +// ldexp {{{1 +/* -> x * 2^e + * x == NaN -> NaN + * x == (-)inf -> (-)inf + */ +inline AVX2::double_v ldexp(AVX2::double_v::AsArg v, + const SimdArray &_e) +{ + SSE::int_v e = internal_data(_e); + e.setZero(simd_cast(v == AVX2::double_v::Zero())); + const __m256i exponentBits = + AVX::concat(_mm_slli_epi64(_mm_unpacklo_epi32(e.data(), e.data()), 52), + _mm_slli_epi64(_mm_unpackhi_epi32(e.data(), e.data()), 52)); + return AVX::avx_cast<__m256d>( + AVX::add_epi64(AVX::avx_cast<__m256i>(v.data()), exponentBits)); +} +inline AVX2::float_v ldexp(AVX2::float_v::AsArg v, SimdArray e) +{ + e.setZero(simd_cast(v == AVX2::float_v::Zero())); + e <<= 23; + return {AVX::avx_cast<__m256>( + AVX::concat(_mm_add_epi32(AVX::avx_cast<__m128i>(AVX::lo128(v.data())), + internal_data(internal_data0(e)).data()), + _mm_add_epi32(AVX::avx_cast<__m128i>(AVX::hi128(v.data())), + internal_data(internal_data1(e)).data())))}; +} + +// trunc {{{1 +Vc_ALWAYS_INLINE AVX2::float_v trunc(AVX2::float_v::AsArg v) +{ + return _mm256_round_ps(v.data(), 0x3); +} +Vc_ALWAYS_INLINE AVX2::double_v trunc(AVX2::double_v::AsArg v) +{ + return _mm256_round_pd(v.data(), 0x3); +} + +// floor {{{1 +Vc_ALWAYS_INLINE AVX2::float_v floor(AVX2::float_v::AsArg v) +{ + return _mm256_floor_ps(v.data()); +} +Vc_ALWAYS_INLINE AVX2::double_v floor(AVX2::double_v::AsArg v) +{ + return _mm256_floor_pd(v.data()); +} + +// ceil {{{1 +Vc_ALWAYS_INLINE AVX2::float_v ceil(AVX2::float_v::AsArg v) +{ + return _mm256_ceil_ps(v.data()); +} 
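// Editorial sketch (not part of the patch): intended use of the frexp/ldexp
// pair declared in this hunk, assuming Vc 1.x headers (<Vc/Vc>) and that
// SimdArray<int, double_v::Size> resolves to the SSE-backed type these
// overloads take.  For finite, non-zero x, frexp yields a mantissa in
// [0.5, 1) and an exponent e with x == mantissa * 2^e, so ldexp reconstructs x.
inline void frexp_ldexp_example()
{
    Vc::double_v x = 6.0;
    Vc::SimdArray<int, Vc::double_v::Size> e;
    Vc::double_v m = Vc::frexp(x, &e);  // every lane: m == 0.75, e == 3
    Vc::double_v y = Vc::ldexp(m, e);   // y == 6.0 again in every lane
    (void)y;
}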
+Vc_ALWAYS_INLINE AVX2::double_v ceil(AVX2::double_v::AsArg v) +{ + return _mm256_ceil_pd(v.data()); +} + +// fma {{{1 +template +Vc_ALWAYS_INLINE Vector fma(Vector a, + Vector b, + Vector c) +{ + return Detail::fma(a.data(), b.data(), c.data(), T()); +} + +// }}}1 +} // namespace Vc + +#endif // VC_AVX_MATH_H_ -#endif // VC_AVX_MATH_H +// vim: foldmethod=marker diff -Nru vc-0.7.4/avx/prefetches.tcc vc-1.3.0/avx/prefetches.tcc --- vc-0.7.4/avx/prefetches.tcc 2014-05-15 06:38:24.000000000 -0500 +++ vc-1.3.0/avx/prefetches.tcc 1969-12-31 18:00:00.000000000 -0600 @@ -1,58 +0,0 @@ -/* This file is part of the Vc library. - - Copyright (C) 2010, 2011-2012 Matthias Kretz - - Vc is free software: you can redistribute it and/or modify - it under the terms of the GNU Lesser General Public License as - published by the Free Software Foundation, either version 3 of - the License, or (at your option) any later version. - - Vc is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with Vc. If not, see . - -*/ - -#ifndef VC_AVX_PREFETCHES_TCC -#define VC_AVX_PREFETCHES_TCC - -/*OUTER_NAMESPACE_BEGIN*/ -namespace Vc -{ -namespace Internal -{ - -Vc_ALWAYS_INLINE void HelperImpl::prefetchForOneRead(const void *addr) -{ - _mm_prefetch(static_cast(const_cast(addr)), _MM_HINT_NTA); -} -Vc_ALWAYS_INLINE void HelperImpl::prefetchClose(const void *addr) -{ - _mm_prefetch(static_cast(const_cast(addr)), _MM_HINT_T0); -} -Vc_ALWAYS_INLINE void HelperImpl::prefetchMid(const void *addr) -{ - _mm_prefetch(static_cast(const_cast(addr)), _MM_HINT_T1); -} -Vc_ALWAYS_INLINE void HelperImpl::prefetchFar(const void *addr) -{ - _mm_prefetch(static_cast(const_cast(addr)), _MM_HINT_T2); -} -Vc_ALWAYS_INLINE void HelperImpl::prefetchForModify(const void *addr) -{ -#ifdef __3dNOW__ - _m_prefetchw(const_cast(addr)); -#else - _mm_prefetch(static_cast(const_cast(addr)), _MM_HINT_T0); -#endif -} - -} // namespace Internal -} // namespace Vc -/*OUTER_NAMESPACE_END*/ - -#endif // VC_AVX_PREFETCHES_TCC diff -Nru vc-0.7.4/avx/shuffle.h vc-1.3.0/avx/shuffle.h --- vc-0.7.4/avx/shuffle.h 2014-05-15 06:38:24.000000000 -0500 +++ vc-1.3.0/avx/shuffle.h 2016-10-27 02:05:02.000000000 -0500 @@ -1,108 +1,170 @@ -/* This file is part of the Vc library. +/* This file is part of the Vc library. {{{ +Copyright © 2011-2015 Matthias Kretz - Copyright (C) 2011-2012 Matthias Kretz +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the names of contributing organizations nor the + names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. 
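// Editorial aside (not part of the patch): the generic fma wrapper added to
// avx/math.h above computes a * b + c per lane, e.g. (sketch, Vc 1.x):
//     Vc::float_v r = Vc::fma(a, b, c);   // r[i] == a[i] * b[i] + c[i]
// Whether the multiply-add is fused into a single rounding depends on the
// Detail::fma implementation the build selects.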
+ +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY +DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - Vc is free software: you can redistribute it and/or modify - it under the terms of the GNU Lesser General Public License as - published by the Free Software Foundation, either version 3 of - the License, or (at your option) any later version. +}}}*/ - Vc is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with Vc. If not, see . - -*/ - -#ifndef VC_AVX_SHUFFLE_H -#define VC_AVX_SHUFFLE_H +#ifndef VC_AVX_SHUFFLE_H_ +#define VC_AVX_SHUFFLE_H_ #include "../sse/shuffle.h" #include "macros.h" -/*OUTER_NAMESPACE_BEGIN*/ -namespace Vc +namespace Vc_VERSIONED_NAMESPACE { - using AVX::m128; - using AVX::m128d; - using AVX::m128i; - using AVX::m256; - using AVX::m256d; - using AVX::m256i; - using AVX::param128; - using AVX::param128d; - using AVX::param128i; - using AVX::param256; - using AVX::param256d; - using AVX::param256i; - namespace Mem - { - template static Vc_ALWAYS_INLINE m256 Vc_CONST permute128(param256 x) { - VC_STATIC_ASSERT(L >= X0 && L <= X1, Incorrect_Range); - VC_STATIC_ASSERT(H >= X0 && H <= X1, Incorrect_Range); - return _mm256_permute2f128_ps(x, x, L + H * (1 << 4)); - } - template static Vc_ALWAYS_INLINE m256d Vc_CONST permute128(param256d x) { - VC_STATIC_ASSERT(L >= X0 && L <= X1, Incorrect_Range); - VC_STATIC_ASSERT(H >= X0 && H <= X1, Incorrect_Range); - return _mm256_permute2f128_pd(x, x, L + H * (1 << 4)); - } - template static Vc_ALWAYS_INLINE m256i Vc_CONST permute128(param256i x) { - VC_STATIC_ASSERT(L >= X0 && L <= X1, Incorrect_Range); - VC_STATIC_ASSERT(H >= X0 && H <= X1, Incorrect_Range); - return _mm256_permute2f128_si256(x, x, L + H * (1 << 4)); - } - template static Vc_ALWAYS_INLINE m256 Vc_CONST shuffle128(param256 x, param256 y) { - VC_STATIC_ASSERT(L >= X0 && H >= X0, Incorrect_Range); - VC_STATIC_ASSERT(L <= Y1 && H <= Y1, Incorrect_Range); +namespace Detail +{ +template struct Permutation {}; +template struct Mask {}; + +#ifdef Vc_IMPL_AVX2 +template +Vc_INTRINSIC Vc_CONST __m256i +blend(__m256i a, __m256i b, Mask) +{ + static_assert((Sel0 == 0 || Sel0 == 1) && (Sel1 == 0 || Sel1 == 1) && + (Sel2 == 0 || Sel2 == 1) && (Sel3 == 0 || Sel3 == 1) && + (Sel4 == 0 || Sel4 == 1) && (Sel5 == 0 || Sel5 == 1) && + (Sel6 == 0 || Sel6 == 1) && (Sel7 == 0 || Sel7 == 1) && + (Sel8 == 0 || Sel8 == 1) && (Sel9 == 0 || Sel9 == 1) && + (Sel10 == 0 || Sel10 == 1) && (Sel11 == 0 || Sel11 == 1) && + (Sel12 == 0 || Sel12 == 1) && (Sel13 == 0 || Sel13 == 1) && + (Sel14 == 0 || Sel14 == 1) && (Sel15 == 0 || Sel15 == 1), + "Selectors must be 0 or 1 to select the value from a or 
b"); + constexpr uint8_t mask = static_cast( + (Sel0 << 0 ) | (Sel1 << 1 ) | (Sel2 << 2 ) | (Sel3 << 3 ) | + (Sel4 << 4 ) | (Sel5 << 5 ) | (Sel6 << 6 ) | (Sel7 << 7 ) | + (Sel8 << 8 ) | (Sel9 << 9 ) | (Sel10 << 10) | (Sel11 << 11) | + (Sel12 << 12) | (Sel13 << 13) | (Sel14 << 14) | (Sel15 << 15)); + return _mm256_blend_epi16(a, b, mask); +} +#endif // Vc_IMPL_AVX2 +} // namespace Detail +namespace Mem +{ +#ifdef Vc_IMPL_AVX2 + template static Vc_ALWAYS_INLINE __m256i Vc_CONST permuteLo(__m256i x) { + static_assert(Dst0 >= X0 && Dst1 >= X0 && Dst2 >= X0 && Dst3 >= X0, "Incorrect_Range"); + static_assert(Dst0 <= X3 && Dst1 <= X3 && Dst2 <= X3 && Dst3 <= X3, "Incorrect_Range"); + return _mm256_shufflelo_epi16(x, Dst0 + Dst1 * 4 + Dst2 * 16 + Dst3 * 64); + } + + template static Vc_ALWAYS_INLINE __m256i Vc_CONST permuteHi(__m256i x) { + static_assert(Dst0 >= X4 && Dst1 >= X4 && Dst2 >= X4 && Dst3 >= X4, "Incorrect_Range"); + static_assert(Dst0 <= X7 && Dst1 <= X7 && Dst2 <= X7 && Dst3 <= X7, "Incorrect_Range"); + return _mm256_shufflehi_epi16(x, (Dst0 - X4) + (Dst1 - X4) * 4 + (Dst2 - X4) * 16 + (Dst3 - X4) * 64); + } +#endif // Vc_IMPL_AVX2 + + template static Vc_ALWAYS_INLINE __m256 Vc_CONST permute128(__m256 x) { + static_assert((L >= X0 && L <= X1) || L == Const0, "Incorrect_Range"); + static_assert((H >= X0 && H <= X1) || H == Const0, "Incorrect_Range"); + return _mm256_permute2f128_ps( + x, x, (L == Const0 ? 0x8 : L) + (H == Const0 ? 0x80 : H * (1 << 4))); + } + template static Vc_ALWAYS_INLINE __m256d Vc_CONST permute128(__m256d x) { + static_assert((L >= X0 && L <= X1) || L == Const0, "Incorrect_Range"); + static_assert((H >= X0 && H <= X1) || H == Const0, "Incorrect_Range"); + return _mm256_permute2f128_pd( + x, x, (L == Const0 ? 0x8 : L) + (H == Const0 ? 0x80 : H * (1 << 4))); + } + template static Vc_ALWAYS_INLINE __m256i Vc_CONST permute128(__m256i x) { + static_assert((L >= X0 && L <= X1) || L == Const0, "Incorrect_Range"); + static_assert((H >= X0 && H <= X1) || H == Const0, "Incorrect_Range"); +#ifdef Vc_IMPL_AVX2 + return _mm256_permute2x128_si256( + x, x, (L == Const0 ? 0x8 : L) + (H == Const0 ? 0x80 : H * (1 << 4))); +#else + return _mm256_permute2f128_si256( + x, x, (L == Const0 ? 0x8 : L) + (H == Const0 ? 0x80 : H * (1 << 4))); +#endif + } + template static Vc_ALWAYS_INLINE __m256 Vc_CONST shuffle128(__m256 x, __m256 y) { + static_assert(L >= X0 && H >= X0, "Incorrect_Range"); + static_assert(L <= Y1 && H <= Y1, "Incorrect_Range"); return _mm256_permute2f128_ps(x, y, (L < Y0 ? L : L - Y0 + 2) + (H < Y0 ? H : H - Y0 + 2) * (1 << 4)); } - template static Vc_ALWAYS_INLINE m256i Vc_CONST shuffle128(param256i x, param256i y) { - VC_STATIC_ASSERT(L >= X0 && H >= X0, Incorrect_Range); - VC_STATIC_ASSERT(L <= Y1 && H <= Y1, Incorrect_Range); - return _mm256_permute2f128_si256(x, y, (L < Y0 ? L : L - Y0 + 2) + (H < Y0 ? H : H - Y0 + 2) * (1 << 4)); - } - template static Vc_ALWAYS_INLINE m256d Vc_CONST shuffle128(param256d x, param256d y) { - VC_STATIC_ASSERT(L >= X0 && H >= X0, Incorrect_Range); - VC_STATIC_ASSERT(L <= Y1 && H <= Y1, Incorrect_Range); + template static Vc_ALWAYS_INLINE __m256i Vc_CONST shuffle128(__m256i x, __m256i y) { + static_assert(L >= X0 && H >= X0, "Incorrect_Range"); + static_assert(L <= Y1 && H <= Y1, "Incorrect_Range"); +#ifdef Vc_IMPL_AVX2 + return _mm256_permute2x128_si256( + x, y, (L < Y0 ? L : L - Y0 + 2) + (H < Y0 ? H : H - Y0 + 2) * (1 << 4)); +#else + return _mm256_permute2f128_si256( + x, y, (L < Y0 ? L : L - Y0 + 2) + (H < Y0 ? 
H : H - Y0 + 2) * (1 << 4)); +#endif + } + template static Vc_ALWAYS_INLINE __m256d Vc_CONST shuffle128(__m256d x, __m256d y) { + static_assert(L >= X0 && H >= X0, "Incorrect_Range"); + static_assert(L <= Y1 && H <= Y1, "Incorrect_Range"); return _mm256_permute2f128_pd(x, y, (L < Y0 ? L : L - Y0 + 2) + (H < Y0 ? H : H - Y0 + 2) * (1 << 4)); } - template static Vc_ALWAYS_INLINE m256d Vc_CONST permute(param256d x) { - VC_STATIC_ASSERT(Dst0 >= X0 && Dst1 >= X0 && Dst2 >= X2 && Dst3 >= X2, Incorrect_Range); - VC_STATIC_ASSERT(Dst0 <= X1 && Dst1 <= X1 && Dst2 <= X3 && Dst3 <= X3, Incorrect_Range); + template static Vc_ALWAYS_INLINE __m256d Vc_CONST permute(__m256d x) { + static_assert(Dst0 >= X0 && Dst1 >= X0 && Dst2 >= X2 && Dst3 >= X2, "Incorrect_Range"); + static_assert(Dst0 <= X1 && Dst1 <= X1 && Dst2 <= X3 && Dst3 <= X3, "Incorrect_Range"); return _mm256_permute_pd(x, Dst0 + Dst1 * 2 + (Dst2 - X2) * 4 + (Dst3 - X2) * 8); } - template static Vc_ALWAYS_INLINE m256 Vc_CONST permute(param256 x) { - VC_STATIC_ASSERT(Dst0 >= X0 && Dst1 >= X0 && Dst2 >= X0 && Dst3 >= X0, Incorrect_Range); - VC_STATIC_ASSERT(Dst0 <= X3 && Dst1 <= X3 && Dst2 <= X3 && Dst3 <= X3, Incorrect_Range); + template static Vc_ALWAYS_INLINE __m256 Vc_CONST permute(__m256 x) { + static_assert(Dst0 >= X0 && Dst1 >= X0 && Dst2 >= X0 && Dst3 >= X0, "Incorrect_Range"); + static_assert(Dst0 <= X3 && Dst1 <= X3 && Dst2 <= X3 && Dst3 <= X3, "Incorrect_Range"); return _mm256_permute_ps(x, Dst0 + Dst1 * 4 + Dst2 * 16 + Dst3 * 64); } - template static Vc_ALWAYS_INLINE m256i Vc_CONST permute(param256i x) { + template static Vc_ALWAYS_INLINE __m256i Vc_CONST permute(__m256i x) { return _mm256_castps_si256(permute(_mm256_castsi256_ps(x))); } - template static Vc_ALWAYS_INLINE m256d Vc_CONST shuffle(param256d x, param256d y) { - VC_STATIC_ASSERT(Dst0 >= X0 && Dst1 >= Y0 && Dst2 >= X2 && Dst3 >= Y2, Incorrect_Range); - VC_STATIC_ASSERT(Dst0 <= X1 && Dst1 <= Y1 && Dst2 <= X3 && Dst3 <= Y3, Incorrect_Range); +#ifdef Vc_IMPL_AVX2 + template static Vc_ALWAYS_INLINE __m256i Vc_CONST permute4x64(__m256i x) { + static_assert(Dst0 >= X0 && Dst1 >= X0 && Dst2 >= X0 && Dst3 >= X0, "Incorrect_Range"); + static_assert(Dst0 <= X3 && Dst1 <= X3 && Dst2 <= X3 && Dst3 <= X3, "Incorrect_Range"); + return _mm256_permute4x64_epi64(x, Dst0 + Dst1 * 4 + Dst2 * 16 + Dst3 * 64); + } +#endif // Vc_IMPL_AVX2 + template static Vc_ALWAYS_INLINE __m256d Vc_CONST shuffle(__m256d x, __m256d y) { + static_assert(Dst0 >= X0 && Dst1 >= Y0 && Dst2 >= X2 && Dst3 >= Y2, "Incorrect_Range"); + static_assert(Dst0 <= X1 && Dst1 <= Y1 && Dst2 <= X3 && Dst3 <= Y3, "Incorrect_Range"); return _mm256_shuffle_pd(x, y, Dst0 + (Dst1 - Y0) * 2 + (Dst2 - X2) * 4 + (Dst3 - Y2) * 8); } - template static Vc_ALWAYS_INLINE m256 Vc_CONST shuffle(param256 x, param256 y) { - VC_STATIC_ASSERT(Dst0 >= X0 && Dst1 >= X0 && Dst2 >= Y0 && Dst3 >= Y0, Incorrect_Range); - VC_STATIC_ASSERT(Dst0 <= X3 && Dst1 <= X3 && Dst2 <= Y3 && Dst3 <= Y3, Incorrect_Range); + template static Vc_ALWAYS_INLINE __m256 Vc_CONST shuffle(__m256 x, __m256 y) { + static_assert(Dst0 >= X0 && Dst1 >= X0 && Dst2 >= Y0 && Dst3 >= Y0, "Incorrect_Range"); + static_assert(Dst0 <= X3 && Dst1 <= X3 && Dst2 <= Y3 && Dst3 <= Y3, "Incorrect_Range"); return _mm256_shuffle_ps(x, y, Dst0 + Dst1 * 4 + (Dst2 - Y0) * 16 + (Dst3 - Y0) * 64); } template - static Vc_ALWAYS_INLINE m256 Vc_CONST blend(param256 x, param256 y) { - VC_STATIC_ASSERT(Dst0 == X0 || Dst0 == Y0, Incorrect_Range); - VC_STATIC_ASSERT(Dst1 == X1 || Dst1 == Y1, Incorrect_Range); 
- VC_STATIC_ASSERT(Dst2 == X2 || Dst2 == Y2, Incorrect_Range); - VC_STATIC_ASSERT(Dst3 == X3 || Dst3 == Y3, Incorrect_Range); - VC_STATIC_ASSERT(Dst4 == X4 || Dst4 == Y4, Incorrect_Range); - VC_STATIC_ASSERT(Dst5 == X5 || Dst5 == Y5, Incorrect_Range); - VC_STATIC_ASSERT(Dst6 == X6 || Dst6 == Y6, Incorrect_Range); - VC_STATIC_ASSERT(Dst7 == X7 || Dst7 == Y7, Incorrect_Range); + static Vc_ALWAYS_INLINE __m256 Vc_CONST blend(__m256 x, __m256 y) { + static_assert(Dst0 == X0 || Dst0 == Y0, "Incorrect_Range"); + static_assert(Dst1 == X1 || Dst1 == Y1, "Incorrect_Range"); + static_assert(Dst2 == X2 || Dst2 == Y2, "Incorrect_Range"); + static_assert(Dst3 == X3 || Dst3 == Y3, "Incorrect_Range"); + static_assert(Dst4 == X4 || Dst4 == Y4, "Incorrect_Range"); + static_assert(Dst5 == X5 || Dst5 == Y5, "Incorrect_Range"); + static_assert(Dst6 == X6 || Dst6 == Y6, "Incorrect_Range"); + static_assert(Dst7 == X7 || Dst7 == Y7, "Incorrect_Range"); return _mm256_blend_ps(x, y, (Dst0 / Y0) * 1 + (Dst1 / Y1) * 2 + (Dst2 / Y2) * 4 + (Dst3 / Y3) * 8 + @@ -111,26 +173,26 @@ ); } template - static Vc_ALWAYS_INLINE m256i Vc_CONST blend(param256i x, param256i y) { + static Vc_ALWAYS_INLINE __m256i Vc_CONST blend(__m256i x, __m256i y) { return _mm256_castps_si256(blend(_mm256_castsi256_ps(x), _mm256_castsi256_ps(y))); } template struct ScaleForBlend { enum { Value = Dst >= X4 ? Dst - X4 + Y0 : Dst }; }; template - static Vc_ALWAYS_INLINE m256 Vc_CONST permute(param256 x) { - VC_STATIC_ASSERT(Dst0 >= X0 && Dst0 <= X7, Incorrect_Range); - VC_STATIC_ASSERT(Dst1 >= X0 && Dst1 <= X7, Incorrect_Range); - VC_STATIC_ASSERT(Dst2 >= X0 && Dst2 <= X7, Incorrect_Range); - VC_STATIC_ASSERT(Dst3 >= X0 && Dst3 <= X7, Incorrect_Range); - VC_STATIC_ASSERT(Dst4 >= X0 && Dst4 <= X7, Incorrect_Range); - VC_STATIC_ASSERT(Dst5 >= X0 && Dst5 <= X7, Incorrect_Range); - VC_STATIC_ASSERT(Dst6 >= X0 && Dst6 <= X7, Incorrect_Range); - VC_STATIC_ASSERT(Dst7 >= X0 && Dst7 <= X7, Incorrect_Range); + static Vc_ALWAYS_INLINE __m256 Vc_CONST permute(__m256 x) { + static_assert(Dst0 >= X0 && Dst0 <= X7, "Incorrect_Range"); + static_assert(Dst1 >= X0 && Dst1 <= X7, "Incorrect_Range"); + static_assert(Dst2 >= X0 && Dst2 <= X7, "Incorrect_Range"); + static_assert(Dst3 >= X0 && Dst3 <= X7, "Incorrect_Range"); + static_assert(Dst4 >= X0 && Dst4 <= X7, "Incorrect_Range"); + static_assert(Dst5 >= X0 && Dst5 <= X7, "Incorrect_Range"); + static_assert(Dst6 >= X0 && Dst6 <= X7, "Incorrect_Range"); + static_assert(Dst7 >= X0 && Dst7 <= X7, "Incorrect_Range"); if (Dst0 + X4 == Dst4 && Dst1 + X4 == Dst5 && Dst2 + X4 == Dst6 && Dst3 + X4 == Dst7) { return permute(x); } - const m128 loIn = _mm256_castps256_ps128(x); - const m128 hiIn = _mm256_extractf128_ps(x, 1); - m128 lo, hi; + const __m128 loIn = _mm256_castps256_ps128(x); + const __m128 hiIn = _mm256_extractf128_ps(x, 1); + __m128 lo, hi; if (Dst0 < X4 && Dst1 < X4 && Dst2 < X4 && Dst3 < X4) { lo = _mm_permute_ps(loIn, Dst0 + Dst1 * 4 + Dst2 * 16 + Dst3 * 64); @@ -176,7 +238,8 @@ return _mm256_insertf128_ps(_mm256_castps128_ps256(lo), hi, 1); } - } // namespace Mem +} // namespace Mem +} // namespace Vc // little endian has the lo bits on the right and high bits on the left // with vectors this becomes greatly confusing: @@ -184,56 +247,62 @@ // Reg: dcba // // The shuffles and permutes above use memory ordering. 
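As a concrete illustration of that distinction (a standalone sketch using only Intel intrinsics, none of it Vc API): _mm256_setr_ps takes its arguments in memory order (element 0, i.e. lowest address, first), while _mm256_set_ps takes them in register order (most significant lane first), so the two calls below fill the register with identical contents.

#include <immintrin.h>
#include <cstdio>

int main()
{
    __m256 a = _mm256_setr_ps(0, 1, 2, 3, 4, 5, 6, 7); // memory order: element 0 first
    __m256 b = _mm256_set_ps(7, 6, 5, 4, 3, 2, 1, 0);  // register order: top lane first
    alignas(32) float out_a[8], out_b[8];
    _mm256_store_ps(out_a, a);
    _mm256_store_ps(out_b, b);
    for (int i = 0; i < 8; ++i) {
        std::printf("%g %g\n", out_a[i], out_b[i]); // pairs are equal: 0 0, 1 1, ...
    }
    return 0;
}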
The ones below use register ordering: - namespace Reg - { - template static Vc_ALWAYS_INLINE m256 Vc_CONST permute128(param256 x, param256 y) { - VC_STATIC_ASSERT(L >= X0 && H >= X0, Incorrect_Range); - VC_STATIC_ASSERT(L <= Y1 && H <= Y1, Incorrect_Range); +namespace Vc_VERSIONED_NAMESPACE +{ +namespace Reg +{ + template static Vc_ALWAYS_INLINE __m256 Vc_CONST permute128(__m256 x, __m256 y) { + static_assert(L >= X0 && H >= X0, "Incorrect_Range"); + static_assert(L <= Y1 && H <= Y1, "Incorrect_Range"); return _mm256_permute2f128_ps(x, y, (L < Y0 ? L : L - Y0 + 2) + (H < Y0 ? H : H - Y0 + 2) * (1 << 4)); } - template static Vc_ALWAYS_INLINE m256i Vc_CONST permute128(param256i x, param256i y) { - VC_STATIC_ASSERT(L >= X0 && H >= X0, Incorrect_Range); - VC_STATIC_ASSERT(L <= Y1 && H <= Y1, Incorrect_Range); - return _mm256_permute2f128_si256(x, y, (L < Y0 ? L : L - Y0 + 2) + (H < Y0 ? H : H - Y0 + 2) * (1 << 4)); - } - template static Vc_ALWAYS_INLINE m256d Vc_CONST permute128(param256d x, param256d y) { - VC_STATIC_ASSERT(L >= X0 && H >= X0, Incorrect_Range); - VC_STATIC_ASSERT(L <= Y1 && H <= Y1, Incorrect_Range); + template static Vc_ALWAYS_INLINE __m256i Vc_CONST permute128(__m256i x, __m256i y) { + static_assert(L >= X0 && H >= X0, "Incorrect_Range"); + static_assert(L <= Y1 && H <= Y1, "Incorrect_Range"); +#ifdef Vc_IMPL_AVX2 + return _mm256_permute2x128_si256( + x, y, (L < Y0 ? L : L - Y0 + 2) + (H < Y0 ? H : H - Y0 + 2) * (1 << 4)); +#else + return _mm256_permute2f128_si256( + x, y, (L < Y0 ? L : L - Y0 + 2) + (H < Y0 ? H : H - Y0 + 2) * (1 << 4)); +#endif + } + template static Vc_ALWAYS_INLINE __m256d Vc_CONST permute128(__m256d x, __m256d y) { + static_assert(L >= X0 && H >= X0, "Incorrect_Range"); + static_assert(L <= Y1 && H <= Y1, "Incorrect_Range"); return _mm256_permute2f128_pd(x, y, (L < Y0 ? L : L - Y0 + 2) + (H < Y0 ? 
H : H - Y0 + 2) * (1 << 4)); } - template static Vc_ALWAYS_INLINE m256d Vc_CONST permute(param256d x) { - VC_STATIC_ASSERT(Dst0 >= X0 && Dst1 >= X0 && Dst2 >= X2 && Dst3 >= X2, Incorrect_Range); - VC_STATIC_ASSERT(Dst0 <= X1 && Dst1 <= X1 && Dst2 <= X3 && Dst3 <= X3, Incorrect_Range); + template static Vc_ALWAYS_INLINE __m256d Vc_CONST permute(__m256d x) { + static_assert(Dst0 >= X0 && Dst1 >= X0 && Dst2 >= X2 && Dst3 >= X2, "Incorrect_Range"); + static_assert(Dst0 <= X1 && Dst1 <= X1 && Dst2 <= X3 && Dst3 <= X3, "Incorrect_Range"); return _mm256_permute_pd(x, Dst0 + Dst1 * 2 + (Dst2 - X2) * 4 + (Dst3 - X2) * 8); } - template static Vc_ALWAYS_INLINE m256 Vc_CONST permute(param256 x) { - VC_STATIC_ASSERT(Dst0 >= X0 && Dst1 >= X0 && Dst2 >= X0 && Dst3 >= X0, Incorrect_Range); - VC_STATIC_ASSERT(Dst0 <= X3 && Dst1 <= X3 && Dst2 <= X3 && Dst3 <= X3, Incorrect_Range); + template static Vc_ALWAYS_INLINE __m256 Vc_CONST permute(__m256 x) { + static_assert(Dst0 >= X0 && Dst1 >= X0 && Dst2 >= X0 && Dst3 >= X0, "Incorrect_Range"); + static_assert(Dst0 <= X3 && Dst1 <= X3 && Dst2 <= X3 && Dst3 <= X3, "Incorrect_Range"); return _mm256_permute_ps(x, Dst0 + Dst1 * 4 + Dst2 * 16 + Dst3 * 64); } - template static Vc_ALWAYS_INLINE m128d Vc_CONST permute(param128d x) { - VC_STATIC_ASSERT(Dst0 >= X0 && Dst1 >= X0, Incorrect_Range); - VC_STATIC_ASSERT(Dst0 <= X1 && Dst1 <= X1, Incorrect_Range); + template static Vc_ALWAYS_INLINE __m128d Vc_CONST permute(__m128d x) { + static_assert(Dst0 >= X0 && Dst1 >= X0, "Incorrect_Range"); + static_assert(Dst0 <= X1 && Dst1 <= X1, "Incorrect_Range"); return _mm_permute_pd(x, Dst0 + Dst1 * 2); } - template static Vc_ALWAYS_INLINE m128 Vc_CONST permute(param128 x) { - VC_STATIC_ASSERT(Dst0 >= X0 && Dst1 >= X0 && Dst2 >= X0 && Dst3 >= X0, Incorrect_Range); - VC_STATIC_ASSERT(Dst0 <= X3 && Dst1 <= X3 && Dst2 <= X3 && Dst3 <= X3, Incorrect_Range); + template static Vc_ALWAYS_INLINE __m128 Vc_CONST permute(__m128 x) { + static_assert(Dst0 >= X0 && Dst1 >= X0 && Dst2 >= X0 && Dst3 >= X0, "Incorrect_Range"); + static_assert(Dst0 <= X3 && Dst1 <= X3 && Dst2 <= X3 && Dst3 <= X3, "Incorrect_Range"); return _mm_permute_ps(x, Dst0 + Dst1 * 4 + Dst2 * 16 + Dst3 * 64); } - template static Vc_ALWAYS_INLINE m256d Vc_CONST shuffle(param256d x, param256d y) { - VC_STATIC_ASSERT(Dst0 >= X0 && Dst1 >= Y0 && Dst2 >= X2 && Dst3 >= Y2, Incorrect_Range); - VC_STATIC_ASSERT(Dst0 <= X1 && Dst1 <= Y1 && Dst2 <= X3 && Dst3 <= Y3, Incorrect_Range); + template static Vc_ALWAYS_INLINE __m256d Vc_CONST shuffle(__m256d x, __m256d y) { + static_assert(Dst0 >= X0 && Dst1 >= Y0 && Dst2 >= X2 && Dst3 >= Y2, "Incorrect_Range"); + static_assert(Dst0 <= X1 && Dst1 <= Y1 && Dst2 <= X3 && Dst3 <= Y3, "Incorrect_Range"); return _mm256_shuffle_pd(x, y, Dst0 + (Dst1 - Y0) * 2 + (Dst2 - X2) * 4 + (Dst3 - Y2) * 8); } - template static Vc_ALWAYS_INLINE m256 Vc_CONST shuffle(param256 x, param256 y) { - VC_STATIC_ASSERT(Dst0 >= X0 && Dst1 >= X0 && Dst2 >= Y0 && Dst3 >= Y0, Incorrect_Range); - VC_STATIC_ASSERT(Dst0 <= X3 && Dst1 <= X3 && Dst2 <= Y3 && Dst3 <= Y3, Incorrect_Range); + template static Vc_ALWAYS_INLINE __m256 Vc_CONST shuffle(__m256 x, __m256 y) { + static_assert(Dst0 >= X0 && Dst1 >= X0 && Dst2 >= Y0 && Dst3 >= Y0, "Incorrect_Range"); + static_assert(Dst0 <= X3 && Dst1 <= X3 && Dst2 <= Y3 && Dst3 <= Y3, "Incorrect_Range"); return _mm256_shuffle_ps(x, y, Dst0 + Dst1 * 4 + (Dst2 - Y0) * 16 + (Dst3 - Y0) * 64); } - } // namespace Reg -} // namespace Vc -/*OUTER_NAMESPACE_END*/ -#include "undomacros.h" +} // 
namespace Reg +} // namespace Vc -#endif // VC_AVX_SHUFFLE_H +#endif // VC_AVX_SHUFFLE_H_ diff -Nru vc-0.7.4/avx/simd_cast_caller.tcc vc-1.3.0/avx/simd_cast_caller.tcc --- vc-0.7.4/avx/simd_cast_caller.tcc 1969-12-31 18:00:00.000000000 -0600 +++ vc-1.3.0/avx/simd_cast_caller.tcc 2016-10-27 02:05:02.000000000 -0500 @@ -0,0 +1,55 @@ +/* This file is part of the Vc library. {{{ +Copyright © 2014-2015 Matthias Kretz + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the names of contributing organizations nor the + names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY +DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +}}}*/ + +#ifndef Vc_AVX_SIMD_CAST_CALLER_TCC_ +#define Vc_AVX_SIMD_CAST_CALLER_TCC_ + +#include "macros.h" + +namespace Vc_VERSIONED_NAMESPACE +{ +#if Vc_IS_VERSION_1 +template +template +Vc_INTRINSIC Vector::Vector(U &&x) + : d(simd_cast(std::forward(x)).data()) +{ +} + +template +template +Vc_INTRINSIC Mask::Mask(U &&rhs, + Common::enable_if_mask_converts_explicitly) + : Mask(simd_cast(std::forward(rhs))) +{ +} +#endif // Vc_IS_VERSION_1 +} + +#endif // Vc_AVX_SIMD_CAST_CALLER_TCC_ + +// vim: foldmethod=marker diff -Nru vc-0.7.4/avx/simd_cast.h vc-1.3.0/avx/simd_cast.h --- vc-0.7.4/avx/simd_cast.h 1969-12-31 18:00:00.000000000 -0600 +++ vc-1.3.0/avx/simd_cast.h 2016-10-27 02:05:02.000000000 -0500 @@ -0,0 +1,2735 @@ +/* This file is part of the Vc library. {{{ +Copyright © 2014-2015 Matthias Kretz + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the names of contributing organizations nor the + names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. 
+ +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY +DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +}}}*/ + +#ifndef VC_AVX_SIMD_CAST_H_ +#define VC_AVX_SIMD_CAST_H_ + +#ifndef VC_AVX_VECTOR_H_ +#error "Vc/avx/vector.h needs to be included before Vc/avx/simd_cast.h" +#endif +#include "macros.h" + +namespace Vc_VERSIONED_NAMESPACE +{ +// Declarations: helper macros Vc_SIMD_CAST_AVX_[124] & Vc_SIMD_CAST_[124] {{{1 +#define Vc_SIMD_CAST_AVX_1(from_, to_) \ + template \ + Vc_INTRINSIC Vc_CONST To simd_cast( \ + AVX2::from_ x, enable_if::value> = nullarg) + +#define Vc_SIMD_CAST_AVX_2(from_, to_) \ + static_assert(AVX2::from_::size() * 2 <= AVX2::to_::size(), \ + "this type combination is wrong"); \ + template \ + Vc_INTRINSIC Vc_CONST To simd_cast( \ + AVX2::from_ x0, AVX2::from_ x1, \ + enable_if::value> = nullarg) + +#define Vc_SIMD_CAST_AVX_3(from_, to_) \ + template \ + Vc_INTRINSIC Vc_CONST To simd_cast( \ + AVX2::from_ x0, AVX2::from_ x1, AVX2::from_ x2, \ + enable_if::value> = nullarg) + +#define Vc_SIMD_CAST_AVX_4(from_, to_) \ + template \ + Vc_INTRINSIC Vc_CONST To simd_cast( \ + AVX2::from_ x0, AVX2::from_ x1, AVX2::from_ x2, AVX2::from_ x3, \ + enable_if::value> = nullarg) + +#define Vc_SIMD_CAST_1(from_, to_) \ + template \ + Vc_INTRINSIC Vc_CONST To simd_cast( \ + from_ x, enable_if::value> = nullarg) + +#define Vc_SIMD_CAST_2(from_, to_) \ + template \ + Vc_INTRINSIC Vc_CONST To simd_cast( \ + from_ x0, from_ x1, enable_if::value> = nullarg) + +#define Vc_SIMD_CAST_3(from_, to_) \ + template \ + Vc_INTRINSIC Vc_CONST To simd_cast( \ + from_ x0, from_ x1, from_ x2, enable_if::value> = nullarg) + +#define Vc_SIMD_CAST_4(from_, to_) \ + template \ + Vc_INTRINSIC Vc_CONST To simd_cast( \ + from_ x0, from_ x1, from_ x2, from_ x3, \ + enable_if::value> = nullarg) + +#define Vc_SIMD_CAST_5(from_, to_) \ + template \ + Vc_INTRINSIC Vc_CONST To simd_cast( \ + from_ x0, from_ x1, from_ x2, from_ x3, from_ x4, \ + enable_if::value> = nullarg) + +#define Vc_SIMD_CAST_6(from_, to_) \ + template \ + Vc_INTRINSIC Vc_CONST To simd_cast( \ + from_ x0, from_ x1, from_ x2, from_ x3, from_ x4, from_ x5, \ + enable_if::value> = nullarg) + +#define Vc_SIMD_CAST_7(from_, to_) \ + template \ + Vc_INTRINSIC Vc_CONST To simd_cast( \ + from_ x0, from_ x1, from_ x2, from_ x3, from_ x4, from_ x5, from_ x6, \ + enable_if::value> = nullarg) + +#define Vc_SIMD_CAST_8(from_, to_) \ + template \ + Vc_INTRINSIC Vc_CONST To simd_cast( \ + from_ x0, from_ x1, from_ x2, from_ x3, from_ x4, from_ x5, from_ x6, from_ x7, \ + enable_if::value> = nullarg) + +#define Vc_SIMD_CAST_OFFSET(from_, to_, offset_) \ + static_assert(from_::size() >= to_::size() * (offset_ + 1), \ + "this offset cannot exist for this type combination"); \ + template \ + Vc_INTRINSIC Vc_CONST To simd_cast( \ + from_ x, \ + enable_if<(offset == offset_ && std::is_same::value)> = nullarg) + +// Declaration: 
SSE -> AVX where the AVX Vector is integral and thus of equal size() {{{1 +// as the equivalent SSE Vector +template +Vc_INTRINSIC Vc_CONST To +simd_cast(From x, enable_if<(AVX2::is_vector::value && SSE::is_vector::value && + SSE::Vector::Size == To::Size)> = + nullarg); +template +Vc_INTRINSIC Vc_CONST To simd_cast( + From x0, From x1, + enable_if<(AVX2::is_vector::value && SSE::is_vector::value && + SSE::Vector::Size == To::Size)> = nullarg); +template +Vc_INTRINSIC Vc_CONST To simd_cast( + From x0, From x1, From x2, + enable_if<(AVX2::is_vector::value && SSE::is_vector::value && + SSE::Vector::Size == To::Size)> = nullarg); +template +Vc_INTRINSIC Vc_CONST To simd_cast( + From x0, From x1, From x2, From x3, + enable_if<(AVX2::is_vector::value && SSE::is_vector::value && + SSE::Vector::Size == To::Size)> = nullarg); +template +Vc_INTRINSIC Vc_CONST To simd_cast( + From x0, From x1, From x2, From x3, From x4, From x5, From x6, From x7, + enable_if<(AVX2::is_vector::value && SSE::is_vector::value && + SSE::Vector::Size == To::Size)> = nullarg); + +// Declarations: Vector casts without offset {{{1 +// AVX2::Vector {{{2 +Vc_SIMD_CAST_AVX_1( float_v, double_v); + +Vc_SIMD_CAST_AVX_1(double_v, float_v); +Vc_SIMD_CAST_AVX_2(double_v, float_v); + +#ifdef Vc_IMPL_AVX2 +Vc_SIMD_CAST_AVX_1( int_v, double_v); +Vc_SIMD_CAST_AVX_1( uint_v, double_v); +Vc_SIMD_CAST_AVX_1( short_v, double_v); +Vc_SIMD_CAST_AVX_1(ushort_v, double_v); + +Vc_SIMD_CAST_AVX_1( int_v, float_v); +Vc_SIMD_CAST_AVX_1( uint_v, float_v); +Vc_SIMD_CAST_AVX_1( short_v, float_v); +Vc_SIMD_CAST_AVX_1(ushort_v, float_v); + +Vc_SIMD_CAST_AVX_1(double_v, int_v); +Vc_SIMD_CAST_AVX_1( float_v, int_v); +Vc_SIMD_CAST_AVX_1( uint_v, int_v); +Vc_SIMD_CAST_AVX_1( short_v, int_v); +Vc_SIMD_CAST_AVX_1(ushort_v, int_v); +Vc_SIMD_CAST_AVX_2(double_v, int_v); + +Vc_SIMD_CAST_AVX_1(double_v, uint_v); +Vc_SIMD_CAST_AVX_1( float_v, uint_v); +Vc_SIMD_CAST_AVX_1( int_v, uint_v); +Vc_SIMD_CAST_AVX_1( short_v, uint_v); +Vc_SIMD_CAST_AVX_1(ushort_v, uint_v); +Vc_SIMD_CAST_AVX_2(double_v, uint_v); + +Vc_SIMD_CAST_AVX_1(double_v, short_v); +Vc_SIMD_CAST_AVX_1( float_v, short_v); +Vc_SIMD_CAST_AVX_1( int_v, short_v); +Vc_SIMD_CAST_AVX_1( uint_v, short_v); +Vc_SIMD_CAST_AVX_1(ushort_v, short_v); +Vc_SIMD_CAST_AVX_2(double_v, short_v); +Vc_SIMD_CAST_AVX_2( float_v, short_v); +Vc_SIMD_CAST_AVX_2( int_v, short_v); +Vc_SIMD_CAST_AVX_2( uint_v, short_v); +Vc_SIMD_CAST_AVX_3(double_v, short_v); +Vc_SIMD_CAST_AVX_4(double_v, short_v); + +Vc_SIMD_CAST_AVX_1(double_v, ushort_v); +Vc_SIMD_CAST_AVX_1( float_v, ushort_v); +Vc_SIMD_CAST_AVX_1( int_v, ushort_v); +Vc_SIMD_CAST_AVX_1( uint_v, ushort_v); +Vc_SIMD_CAST_AVX_1( short_v, ushort_v); +Vc_SIMD_CAST_AVX_2(double_v, ushort_v); +Vc_SIMD_CAST_AVX_2( float_v, ushort_v); +Vc_SIMD_CAST_AVX_2( int_v, ushort_v); +Vc_SIMD_CAST_AVX_2( uint_v, ushort_v); +Vc_SIMD_CAST_AVX_3(double_v, ushort_v); +Vc_SIMD_CAST_AVX_4(double_v, ushort_v); +#endif + +// 1 SSE::Vector to 1 AVX2::Vector {{{2 +Vc_SIMD_CAST_1(SSE::double_v, AVX2::double_v); +Vc_SIMD_CAST_1(SSE:: float_v, AVX2::double_v); +Vc_SIMD_CAST_1(SSE:: int_v, AVX2::double_v); +Vc_SIMD_CAST_1(SSE:: uint_v, AVX2::double_v); +Vc_SIMD_CAST_1(SSE:: short_v, AVX2::double_v); +Vc_SIMD_CAST_1(SSE::ushort_v, AVX2::double_v); + +Vc_SIMD_CAST_1(SSE::double_v, AVX2:: float_v); +Vc_SIMD_CAST_1(SSE:: float_v, AVX2:: float_v); +Vc_SIMD_CAST_1(SSE:: int_v, AVX2:: float_v); +Vc_SIMD_CAST_1(SSE:: uint_v, AVX2:: float_v); +Vc_SIMD_CAST_1(SSE:: short_v, AVX2:: float_v); +Vc_SIMD_CAST_1(SSE::ushort_v, 
AVX2:: float_v); + +#ifdef Vc_IMPL_AVX2 +Vc_SIMD_CAST_1(SSE::double_v, AVX2:: int_v); +Vc_SIMD_CAST_1(SSE::double_v, AVX2:: uint_v); +Vc_SIMD_CAST_1(SSE::double_v, AVX2:: short_v); +Vc_SIMD_CAST_1(SSE::double_v, AVX2::ushort_v); + +Vc_SIMD_CAST_1(SSE:: float_v, AVX2:: int_v); +Vc_SIMD_CAST_1(SSE:: float_v, AVX2:: uint_v); +Vc_SIMD_CAST_1(SSE:: float_v, AVX2:: short_v); +Vc_SIMD_CAST_1(SSE:: float_v, AVX2::ushort_v); + +Vc_SIMD_CAST_1(SSE:: int_v, AVX2:: int_v); +Vc_SIMD_CAST_1(SSE:: uint_v, AVX2:: int_v); +Vc_SIMD_CAST_1(SSE:: short_v, AVX2:: int_v); +Vc_SIMD_CAST_1(SSE::ushort_v, AVX2:: int_v); + +Vc_SIMD_CAST_1(SSE:: int_v, AVX2:: uint_v); +Vc_SIMD_CAST_1(SSE:: uint_v, AVX2:: uint_v); +Vc_SIMD_CAST_1(SSE:: short_v, AVX2:: uint_v); +Vc_SIMD_CAST_1(SSE::ushort_v, AVX2:: uint_v); + +Vc_SIMD_CAST_1(SSE:: int_v, AVX2:: short_v); +Vc_SIMD_CAST_1(SSE:: uint_v, AVX2:: short_v); +Vc_SIMD_CAST_1(SSE:: short_v, AVX2:: short_v); +Vc_SIMD_CAST_1(SSE::ushort_v, AVX2:: short_v); + +Vc_SIMD_CAST_1(SSE:: int_v, AVX2::ushort_v); +Vc_SIMD_CAST_1(SSE:: uint_v, AVX2::ushort_v); +Vc_SIMD_CAST_1(SSE:: short_v, AVX2::ushort_v); +Vc_SIMD_CAST_1(SSE::ushort_v, AVX2::ushort_v); +#endif + +// 2 SSE::Vector to 1 AVX2::Vector {{{2 +Vc_SIMD_CAST_2(SSE::double_v, AVX2::double_v); + +Vc_SIMD_CAST_2(SSE::double_v, AVX2:: float_v); +Vc_SIMD_CAST_2(SSE:: float_v, AVX2:: float_v); +Vc_SIMD_CAST_2(SSE:: int_v, AVX2:: float_v); +Vc_SIMD_CAST_2(SSE:: uint_v, AVX2:: float_v); + +#ifdef Vc_IMPL_AVX2 +Vc_SIMD_CAST_2(SSE::double_v, AVX2:: int_v); +Vc_SIMD_CAST_2(SSE::double_v, AVX2:: uint_v); +Vc_SIMD_CAST_2(SSE::double_v, AVX2:: short_v); +Vc_SIMD_CAST_2(SSE::double_v, AVX2::ushort_v); + +Vc_SIMD_CAST_2(SSE:: float_v, AVX2:: int_v); +Vc_SIMD_CAST_2(SSE:: float_v, AVX2:: uint_v); +Vc_SIMD_CAST_2(SSE:: float_v, AVX2:: short_v); +Vc_SIMD_CAST_2(SSE:: float_v, AVX2::ushort_v); + +Vc_SIMD_CAST_2(SSE:: int_v, AVX2:: int_v); +Vc_SIMD_CAST_2(SSE:: uint_v, AVX2:: int_v); + +Vc_SIMD_CAST_2(SSE:: int_v, AVX2:: uint_v); +Vc_SIMD_CAST_2(SSE:: uint_v, AVX2:: uint_v); + +Vc_SIMD_CAST_2(SSE:: int_v, AVX2:: short_v); +Vc_SIMD_CAST_2(SSE:: uint_v, AVX2:: short_v); +Vc_SIMD_CAST_2(SSE:: short_v, AVX2:: short_v); +Vc_SIMD_CAST_2(SSE::ushort_v, AVX2:: short_v); + +Vc_SIMD_CAST_2(SSE:: int_v, AVX2::ushort_v); +Vc_SIMD_CAST_2(SSE:: uint_v, AVX2::ushort_v); +Vc_SIMD_CAST_2(SSE:: short_v, AVX2::ushort_v); +Vc_SIMD_CAST_2(SSE::ushort_v, AVX2::ushort_v); +#endif + +// 3 SSE::Vector to 1 AVX2::Vector {{{2 +Vc_SIMD_CAST_3(SSE::double_v, AVX2:: float_v); + +#ifdef Vc_IMPL_AVX2 +Vc_SIMD_CAST_3(SSE::double_v, AVX2:: int_v); +Vc_SIMD_CAST_3(SSE::double_v, AVX2:: uint_v); +Vc_SIMD_CAST_3(SSE::double_v, AVX2:: short_v); +Vc_SIMD_CAST_3(SSE::double_v, AVX2::ushort_v); + +Vc_SIMD_CAST_3(SSE:: float_v, AVX2:: short_v); +Vc_SIMD_CAST_3(SSE:: float_v, AVX2::ushort_v); + +Vc_SIMD_CAST_3(SSE:: int_v, AVX2:: short_v); +Vc_SIMD_CAST_3(SSE:: uint_v, AVX2:: short_v); + +Vc_SIMD_CAST_3(SSE:: int_v, AVX2::ushort_v); +Vc_SIMD_CAST_3(SSE:: uint_v, AVX2::ushort_v); +#endif + +// 4 SSE::Vector to 1 AVX2::Vector {{{2 +Vc_SIMD_CAST_4(SSE::double_v, AVX2:: float_v); + +#ifdef Vc_IMPL_AVX2 +Vc_SIMD_CAST_4(SSE::double_v, AVX2:: int_v); +Vc_SIMD_CAST_4(SSE::double_v, AVX2:: uint_v); +Vc_SIMD_CAST_4(SSE::double_v, AVX2:: short_v); +Vc_SIMD_CAST_4(SSE::double_v, AVX2::ushort_v); + +Vc_SIMD_CAST_4(SSE:: float_v, AVX2:: short_v); +Vc_SIMD_CAST_4(SSE:: float_v, AVX2::ushort_v); + +Vc_SIMD_CAST_4(SSE:: int_v, AVX2:: short_v); +Vc_SIMD_CAST_4(SSE:: uint_v, AVX2:: short_v); + 
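A rough sketch of what these N-into-1 declarations amount to, in plain intrinsics with a hypothetical helper name (this is not Vc's implementation): four 2-lane SSE double vectors supply exactly the 8 lanes of one AVX float vector, matching the Vc_SIMD_CAST_4(SSE::double_v, AVX2::float_v) declaration above.

#include <immintrin.h>
#include <cstdio>

static inline __m256 cast_4xdouble_to_float(__m128d x0, __m128d x1, __m128d x2, __m128d x3)
{
    // Each _mm_cvtpd_ps yields two floats in the low half of an __m128;
    // _mm_movelh_ps glues two such halves, _mm256_insertf128_ps stacks the quarters.
    const __m128 lo = _mm_movelh_ps(_mm_cvtpd_ps(x0), _mm_cvtpd_ps(x1)); // lanes 0..3
    const __m128 hi = _mm_movelh_ps(_mm_cvtpd_ps(x2), _mm_cvtpd_ps(x3)); // lanes 4..7
    return _mm256_insertf128_ps(_mm256_castps128_ps256(lo), hi, 1);
}

int main()
{
    const __m256 r = cast_4xdouble_to_float(_mm_set_pd(1, 0), _mm_set_pd(3, 2),
                                            _mm_set_pd(5, 4), _mm_set_pd(7, 6));
    alignas(32) float out[8];
    _mm256_store_ps(out, r);
    for (int i = 0; i < 8; ++i) std::printf("%g ", out[i]); // 0 1 2 3 4 5 6 7
    std::printf("\n");
    return 0;
}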
+Vc_SIMD_CAST_4(SSE:: int_v, AVX2::ushort_v); +Vc_SIMD_CAST_4(SSE:: uint_v, AVX2::ushort_v); +#endif + +// 5 SSE::Vector to 1 AVX2::Vector {{{2 +#ifdef Vc_IMPL_AVX2 +Vc_SIMD_CAST_5(SSE::double_v, AVX2:: short_v); +Vc_SIMD_CAST_5(SSE::double_v, AVX2::ushort_v); +#endif + +// 6 SSE::Vector to 1 AVX2::Vector {{{2 +#ifdef Vc_IMPL_AVX2 +Vc_SIMD_CAST_6(SSE::double_v, AVX2:: short_v); +Vc_SIMD_CAST_6(SSE::double_v, AVX2::ushort_v); +#endif + +// 7 SSE::Vector to 1 AVX2::Vector {{{2 +#ifdef Vc_IMPL_AVX2 +Vc_SIMD_CAST_7(SSE::double_v, AVX2:: short_v); +Vc_SIMD_CAST_7(SSE::double_v, AVX2::ushort_v); +#endif + +// 8 SSE::Vector to 1 AVX2::Vector {{{2 +#ifdef Vc_IMPL_AVX2 +Vc_SIMD_CAST_8(SSE::double_v, AVX2:: short_v); +Vc_SIMD_CAST_8(SSE::double_v, AVX2::ushort_v); +#endif + +// 1 AVX2::Vector to 1 SSE::Vector {{{2 +Vc_SIMD_CAST_1(AVX2::double_v, SSE::double_v); +Vc_SIMD_CAST_1(AVX2::double_v, SSE:: float_v); +Vc_SIMD_CAST_1(AVX2::double_v, SSE:: int_v); +Vc_SIMD_CAST_1(AVX2::double_v, SSE:: uint_v); +Vc_SIMD_CAST_1(AVX2::double_v, SSE:: short_v); +Vc_SIMD_CAST_1(AVX2::double_v, SSE::ushort_v); + +Vc_SIMD_CAST_1(AVX2:: float_v, SSE::double_v); +Vc_SIMD_CAST_1(AVX2:: float_v, SSE:: float_v); +Vc_SIMD_CAST_1(AVX2:: float_v, SSE:: int_v); +Vc_SIMD_CAST_1(AVX2:: float_v, SSE:: uint_v); +Vc_SIMD_CAST_1(AVX2:: float_v, SSE:: short_v); +Vc_SIMD_CAST_1(AVX2:: float_v, SSE::ushort_v); + +#ifdef Vc_IMPL_AVX2 +Vc_SIMD_CAST_1(AVX2:: int_v, SSE::double_v); +Vc_SIMD_CAST_1(AVX2:: int_v, SSE:: float_v); +Vc_SIMD_CAST_1(AVX2:: int_v, SSE:: uint_v); +Vc_SIMD_CAST_1(AVX2:: int_v, SSE:: int_v); +Vc_SIMD_CAST_1(AVX2:: int_v, SSE:: short_v); +Vc_SIMD_CAST_1(AVX2:: int_v, SSE::ushort_v); + +Vc_SIMD_CAST_1(AVX2:: uint_v, SSE::double_v); +Vc_SIMD_CAST_1(AVX2:: uint_v, SSE:: float_v); +Vc_SIMD_CAST_1(AVX2:: uint_v, SSE:: int_v); +Vc_SIMD_CAST_1(AVX2:: uint_v, SSE:: uint_v); +Vc_SIMD_CAST_1(AVX2:: uint_v, SSE:: short_v); +Vc_SIMD_CAST_1(AVX2:: uint_v, SSE::ushort_v); + +Vc_SIMD_CAST_1(AVX2:: short_v, SSE::double_v); +Vc_SIMD_CAST_1(AVX2:: short_v, SSE:: float_v); +Vc_SIMD_CAST_1(AVX2:: short_v, SSE:: int_v); +Vc_SIMD_CAST_1(AVX2:: short_v, SSE:: uint_v); +Vc_SIMD_CAST_1(AVX2:: short_v, SSE:: short_v); +Vc_SIMD_CAST_1(AVX2:: short_v, SSE::ushort_v); + +Vc_SIMD_CAST_1(AVX2::ushort_v, SSE::double_v); +Vc_SIMD_CAST_1(AVX2::ushort_v, SSE:: float_v); +Vc_SIMD_CAST_1(AVX2::ushort_v, SSE:: int_v); +Vc_SIMD_CAST_1(AVX2::ushort_v, SSE:: uint_v); +Vc_SIMD_CAST_1(AVX2::ushort_v, SSE:: short_v); +Vc_SIMD_CAST_1(AVX2::ushort_v, SSE::ushort_v); +#endif + +// 2 AVX2::Vector to 1 SSE::Vector {{{2 +Vc_SIMD_CAST_2(AVX2::double_v, SSE:: short_v); +Vc_SIMD_CAST_2(AVX2::double_v, SSE::ushort_v); + +// 1 Scalar::Vector to 1 AVX2::Vector {{{2 +template +Vc_INTRINSIC Vc_CONST Return +simd_cast(Scalar::Vector x, + enable_if::value> = nullarg); +template +Vc_INTRINSIC Vc_CONST Return +simd_cast(Scalar::Vector x, + enable_if::value> = nullarg); +#ifdef Vc_IMPL_AVX2 +template +Vc_INTRINSIC Vc_CONST Return +simd_cast(Scalar::Vector x, + enable_if::value> = nullarg); +template +Vc_INTRINSIC Vc_CONST Return +simd_cast(Scalar::Vector x, + enable_if::value> = nullarg); +template +Vc_INTRINSIC Vc_CONST Return +simd_cast(Scalar::Vector x, + enable_if::value> = nullarg); +template +Vc_INTRINSIC Vc_CONST Return +simd_cast(Scalar::Vector x, + enable_if::value> = nullarg); +#endif + +// 2 Scalar::Vector to 1 AVX2::Vector {{{2 +template +Vc_INTRINSIC Vc_CONST Return +simd_cast(Scalar::Vector x0, Scalar::Vector x1, + enable_if::value> = nullarg); +template 
+Vc_INTRINSIC Vc_CONST Return +simd_cast(Scalar::Vector x0, Scalar::Vector x1, + enable_if::value> = nullarg); +#ifdef Vc_IMPL_AVX2 +template +Vc_INTRINSIC Vc_CONST Return +simd_cast(Scalar::Vector x0, Scalar::Vector x1, + enable_if::value> = nullarg); +template +Vc_INTRINSIC Vc_CONST Return +simd_cast(Scalar::Vector x0, Scalar::Vector x1, + enable_if::value> = nullarg); +template +Vc_INTRINSIC Vc_CONST Return +simd_cast(Scalar::Vector x0, Scalar::Vector x1, + enable_if::value> = nullarg); +template +Vc_INTRINSIC Vc_CONST Return +simd_cast(Scalar::Vector x0, Scalar::Vector x1, + enable_if::value> = nullarg); +#endif + +// 3 Scalar::Vector to 1 AVX2::Vector {{{2 +template +Vc_INTRINSIC Vc_CONST Return +simd_cast(Scalar::Vector x0, Scalar::Vector x1, Scalar::Vector x2, + enable_if::value> = nullarg); +template +Vc_INTRINSIC Vc_CONST Return +simd_cast(Scalar::Vector x0, Scalar::Vector x1, Scalar::Vector x2, + enable_if::value> = nullarg); +#ifdef Vc_IMPL_AVX2 +template +Vc_INTRINSIC Vc_CONST Return +simd_cast(Scalar::Vector x0, Scalar::Vector x1, Scalar::Vector x2, + enable_if::value> = nullarg); +template +Vc_INTRINSIC Vc_CONST Return +simd_cast(Scalar::Vector x0, Scalar::Vector x1, Scalar::Vector x2, + enable_if::value> = nullarg); +template +Vc_INTRINSIC Vc_CONST Return +simd_cast(Scalar::Vector x0, Scalar::Vector x1, Scalar::Vector x2, + enable_if::value> = nullarg); +template +Vc_INTRINSIC Vc_CONST Return +simd_cast(Scalar::Vector x0, Scalar::Vector x1, Scalar::Vector x2, + enable_if::value> = nullarg); +#endif + +// 4 Scalar::Vector to 1 AVX2::Vector {{{2 +template +Vc_INTRINSIC Vc_CONST Return +simd_cast(Scalar::Vector x0, Scalar::Vector x1, Scalar::Vector x2, + Scalar::Vector x3, + enable_if::value> = nullarg); +template +Vc_INTRINSIC Vc_CONST Return +simd_cast(Scalar::Vector x0, Scalar::Vector x1, Scalar::Vector x2, + Scalar::Vector x3, + enable_if::value> = nullarg); +#ifdef Vc_IMPL_AVX2 +template +Vc_INTRINSIC Vc_CONST Return +simd_cast(Scalar::Vector x0, Scalar::Vector x1, Scalar::Vector x2, + Scalar::Vector x3, + enable_if::value> = nullarg); +template +Vc_INTRINSIC Vc_CONST Return +simd_cast(Scalar::Vector x0, Scalar::Vector x1, Scalar::Vector x2, + Scalar::Vector x3, + enable_if::value> = nullarg); +template +Vc_INTRINSIC Vc_CONST Return +simd_cast(Scalar::Vector x0, Scalar::Vector x1, Scalar::Vector x2, + Scalar::Vector x3, + enable_if::value> = nullarg); +template +Vc_INTRINSIC Vc_CONST Return +simd_cast(Scalar::Vector x0, Scalar::Vector x1, Scalar::Vector x2, + Scalar::Vector x3, + enable_if::value> = nullarg); +#endif + +// 5 Scalar::Vector to 1 AVX2::Vector {{{2 +template +Vc_INTRINSIC Vc_CONST Return +simd_cast(Scalar::Vector x0, Scalar::Vector x1, Scalar::Vector x2, + Scalar::Vector x3, Scalar::Vector x4, + enable_if::value> = nullarg); +#ifdef Vc_IMPL_AVX2 +template +Vc_INTRINSIC Vc_CONST Return +simd_cast(Scalar::Vector x0, Scalar::Vector x1, Scalar::Vector x2, + Scalar::Vector x3, Scalar::Vector x4, + enable_if::value> = nullarg); +template +Vc_INTRINSIC Vc_CONST Return +simd_cast(Scalar::Vector x0, Scalar::Vector x1, Scalar::Vector x2, + Scalar::Vector x3, Scalar::Vector x4, + enable_if::value> = nullarg); +template +Vc_INTRINSIC Vc_CONST Return +simd_cast(Scalar::Vector x0, Scalar::Vector x1, Scalar::Vector x2, + Scalar::Vector x3, Scalar::Vector x4, + enable_if::value> = nullarg); +template +Vc_INTRINSIC Vc_CONST Return +simd_cast(Scalar::Vector x0, Scalar::Vector x1, Scalar::Vector x2, + Scalar::Vector x3, Scalar::Vector x4, + enable_if::value> = nullarg); +#endif 
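All of the declarations above rely on one dispatch idiom: the destination type is passed explicitly as the first template argument and the right overload is selected through a defaulted enable_if parameter, with one overload per combination of argument count and destination type. A minimal self-contained sketch of that idiom, using made-up stand-in types rather than Vc's real vector classes:

#include <cstdio>
#include <type_traits>

struct nullarg_t {};
constexpr nullarg_t nullarg{};
template <bool C> using enable_if = typename std::enable_if<C, nullarg_t>::type;

struct float4 { float d[4]; };  // stand-in for a 4-lane SSE-style vector
struct float8 { float d[8]; };  // stand-in for an 8-lane AVX-style vector

// one 4-lane source fills the low half of the 8-lane destination
template <typename To>
To simd_cast(float4 x, enable_if<std::is_same<To, float8>::value> = nullarg)
{
    To r{};
    for (int i = 0; i < 4; ++i) r.d[i] = x.d[i];
    return r;
}

// two 4-lane sources fill all 8 lanes
template <typename To>
To simd_cast(float4 x0, float4 x1, enable_if<std::is_same<To, float8>::value> = nullarg)
{
    To r{};
    for (int i = 0; i < 4; ++i) { r.d[i] = x0.d[i]; r.d[4 + i] = x1.d[i]; }
    return r;
}

int main()
{
    const float4 a{{0, 1, 2, 3}}, b{{4, 5, 6, 7}};
    const float8 r = simd_cast<float8>(a, b);  // picks the two-argument overload
    std::printf("%g %g\n", r.d[0], r.d[7]);    // prints: 0 7
    return 0;
}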
+ +// 6 Scalar::Vector to 1 AVX2::Vector {{{2 +template +Vc_INTRINSIC Vc_CONST Return +simd_cast(Scalar::Vector x0, Scalar::Vector x1, Scalar::Vector x2, + Scalar::Vector x3, Scalar::Vector x4, Scalar::Vector x5, + enable_if::value> = nullarg); +#ifdef Vc_IMPL_AVX2 +template +Vc_INTRINSIC Vc_CONST Return +simd_cast(Scalar::Vector x0, Scalar::Vector x1, Scalar::Vector x2, + Scalar::Vector x3, Scalar::Vector x4, Scalar::Vector x5, + enable_if::value> = nullarg); +template +Vc_INTRINSIC Vc_CONST Return +simd_cast(Scalar::Vector x0, Scalar::Vector x1, Scalar::Vector x2, + Scalar::Vector x3, Scalar::Vector x4, Scalar::Vector x5, + enable_if::value> = nullarg); +template +Vc_INTRINSIC Vc_CONST Return +simd_cast(Scalar::Vector x0, Scalar::Vector x1, Scalar::Vector x2, + Scalar::Vector x3, Scalar::Vector x4, Scalar::Vector x5, + enable_if::value> = nullarg); +template +Vc_INTRINSIC Vc_CONST Return +simd_cast(Scalar::Vector x0, Scalar::Vector x1, Scalar::Vector x2, + Scalar::Vector x3, Scalar::Vector x4, Scalar::Vector x5, + enable_if::value> = nullarg); +#endif + +// 7 Scalar::Vector to 1 AVX2::Vector {{{2 +template +Vc_INTRINSIC Vc_CONST Return +simd_cast(Scalar::Vector x0, Scalar::Vector x1, Scalar::Vector x2, + Scalar::Vector x3, Scalar::Vector x4, Scalar::Vector x5, + Scalar::Vector x6, + enable_if::value> = nullarg); +#ifdef Vc_IMPL_AVX2 +template +Vc_INTRINSIC Vc_CONST Return +simd_cast(Scalar::Vector x0, Scalar::Vector x1, Scalar::Vector x2, + Scalar::Vector x3, Scalar::Vector x4, Scalar::Vector x5, + Scalar::Vector x6, + enable_if::value> = nullarg); +template +Vc_INTRINSIC Vc_CONST Return +simd_cast(Scalar::Vector x0, Scalar::Vector x1, Scalar::Vector x2, + Scalar::Vector x3, Scalar::Vector x4, Scalar::Vector x5, + Scalar::Vector x6, + enable_if::value> = nullarg); +template +Vc_INTRINSIC Vc_CONST Return +simd_cast(Scalar::Vector x0, Scalar::Vector x1, Scalar::Vector x2, + Scalar::Vector x3, Scalar::Vector x4, Scalar::Vector x5, + Scalar::Vector x6, + enable_if::value> = nullarg); +template +Vc_INTRINSIC Vc_CONST Return +simd_cast(Scalar::Vector x0, Scalar::Vector x1, Scalar::Vector x2, + Scalar::Vector x3, Scalar::Vector x4, Scalar::Vector x5, + Scalar::Vector x6, + enable_if::value> = nullarg); +#endif + +// 8 Scalar::Vector to 1 AVX2::Vector {{{2 +template +Vc_INTRINSIC Vc_CONST Return +simd_cast(Scalar::Vector x0, Scalar::Vector x1, Scalar::Vector x2, + Scalar::Vector x3, Scalar::Vector x4, Scalar::Vector x5, + Scalar::Vector x6, Scalar::Vector x7, + enable_if::value> = nullarg); +#ifdef Vc_IMPL_AVX2 +template +Vc_INTRINSIC Vc_CONST Return +simd_cast(Scalar::Vector x0, Scalar::Vector x1, Scalar::Vector x2, + Scalar::Vector x3, Scalar::Vector x4, Scalar::Vector x5, + Scalar::Vector x6, Scalar::Vector x7, + enable_if::value> = nullarg); +template +Vc_INTRINSIC Vc_CONST Return +simd_cast(Scalar::Vector x0, Scalar::Vector x1, Scalar::Vector x2, + Scalar::Vector x3, Scalar::Vector x4, Scalar::Vector x5, + Scalar::Vector x6, Scalar::Vector x7, + enable_if::value> = nullarg); +template +Vc_INTRINSIC Vc_CONST Return +simd_cast(Scalar::Vector x0, Scalar::Vector x1, Scalar::Vector x2, + Scalar::Vector x3, Scalar::Vector x4, Scalar::Vector x5, + Scalar::Vector x6, Scalar::Vector x7, + enable_if::value> = nullarg); +template +Vc_INTRINSIC Vc_CONST Return +simd_cast(Scalar::Vector x0, Scalar::Vector x1, Scalar::Vector x2, + Scalar::Vector x3, Scalar::Vector x4, Scalar::Vector x5, + Scalar::Vector x6, Scalar::Vector x7, + enable_if::value> = nullarg); +#endif + +// 9 Scalar::Vector to 1 
AVX2::Vector {{{2 +#ifdef Vc_IMPL_AVX2 +template +Vc_INTRINSIC Vc_CONST Return +simd_cast(Scalar::Vector x0, Scalar::Vector x1, Scalar::Vector x2, + Scalar::Vector x3, Scalar::Vector x4, Scalar::Vector x5, + Scalar::Vector x6, Scalar::Vector x7, Scalar::Vector x8, + enable_if::value> = nullarg); +template +Vc_INTRINSIC Vc_CONST Return +simd_cast(Scalar::Vector x0, Scalar::Vector x1, Scalar::Vector x2, + Scalar::Vector x3, Scalar::Vector x4, Scalar::Vector x5, + Scalar::Vector x6, Scalar::Vector x7, Scalar::Vector x8, + enable_if::value> = nullarg); +#endif + +// 10 Scalar::Vector to 1 AVX2::Vector {{{2 +#ifdef Vc_IMPL_AVX2 +template +Vc_INTRINSIC Vc_CONST Return +simd_cast(Scalar::Vector x0, Scalar::Vector x1, Scalar::Vector x2, + Scalar::Vector x3, Scalar::Vector x4, Scalar::Vector x5, + Scalar::Vector x6, Scalar::Vector x7, Scalar::Vector x8, + Scalar::Vector x9, + enable_if::value> = nullarg); +template +Vc_INTRINSIC Vc_CONST Return +simd_cast(Scalar::Vector x0, Scalar::Vector x1, Scalar::Vector x2, + Scalar::Vector x3, Scalar::Vector x4, Scalar::Vector x5, + Scalar::Vector x6, Scalar::Vector x7, Scalar::Vector x8, + Scalar::Vector x9, + enable_if::value> = nullarg); +#endif + +// 11 Scalar::Vector to 1 AVX2::Vector {{{2 +#ifdef Vc_IMPL_AVX2 +template +Vc_INTRINSIC Vc_CONST Return +simd_cast(Scalar::Vector x0, Scalar::Vector x1, Scalar::Vector x2, + Scalar::Vector x3, Scalar::Vector x4, Scalar::Vector x5, + Scalar::Vector x6, Scalar::Vector x7, Scalar::Vector x8, + Scalar::Vector x9, Scalar::Vector x10, + enable_if::value> = nullarg); +template +Vc_INTRINSIC Vc_CONST Return +simd_cast(Scalar::Vector x0, Scalar::Vector x1, Scalar::Vector x2, + Scalar::Vector x3, Scalar::Vector x4, Scalar::Vector x5, + Scalar::Vector x6, Scalar::Vector x7, Scalar::Vector x8, + Scalar::Vector x9, Scalar::Vector x10, + enable_if::value> = nullarg); +#endif + +// 12 Scalar::Vector to 1 AVX2::Vector {{{2 +#ifdef Vc_IMPL_AVX2 +template +Vc_INTRINSIC Vc_CONST Return +simd_cast(Scalar::Vector x0, Scalar::Vector x1, Scalar::Vector x2, + Scalar::Vector x3, Scalar::Vector x4, Scalar::Vector x5, + Scalar::Vector x6, Scalar::Vector x7, Scalar::Vector x8, + Scalar::Vector x9, Scalar::Vector x10, Scalar::Vector x11, + enable_if::value> = nullarg); +template +Vc_INTRINSIC Vc_CONST Return +simd_cast(Scalar::Vector x0, Scalar::Vector x1, Scalar::Vector x2, + Scalar::Vector x3, Scalar::Vector x4, Scalar::Vector x5, + Scalar::Vector x6, Scalar::Vector x7, Scalar::Vector x8, + Scalar::Vector x9, Scalar::Vector x10, Scalar::Vector x11, + enable_if::value> = nullarg); +#endif + +// 13 Scalar::Vector to 1 AVX2::Vector {{{2 +#ifdef Vc_IMPL_AVX2 +template +Vc_INTRINSIC Vc_CONST Return +simd_cast(Scalar::Vector x0, Scalar::Vector x1, Scalar::Vector x2, + Scalar::Vector x3, Scalar::Vector x4, Scalar::Vector x5, + Scalar::Vector x6, Scalar::Vector x7, Scalar::Vector x8, + Scalar::Vector x9, Scalar::Vector x10, Scalar::Vector x11, + Scalar::Vector x12, + enable_if::value> = nullarg); +template +Vc_INTRINSIC Vc_CONST Return +simd_cast(Scalar::Vector x0, Scalar::Vector x1, Scalar::Vector x2, + Scalar::Vector x3, Scalar::Vector x4, Scalar::Vector x5, + Scalar::Vector x6, Scalar::Vector x7, Scalar::Vector x8, + Scalar::Vector x9, Scalar::Vector x10, Scalar::Vector x11, + Scalar::Vector x12, + enable_if::value> = nullarg); +#endif + +// 14 Scalar::Vector to 1 AVX2::Vector {{{2 +#ifdef Vc_IMPL_AVX2 +template +Vc_INTRINSIC Vc_CONST Return +simd_cast(Scalar::Vector x0, Scalar::Vector x1, Scalar::Vector x2, + Scalar::Vector x3, Scalar::Vector 
x4, Scalar::Vector x5, + Scalar::Vector x6, Scalar::Vector x7, Scalar::Vector x8, + Scalar::Vector x9, Scalar::Vector x10, Scalar::Vector x11, + Scalar::Vector x12, Scalar::Vector x13, + enable_if::value> = nullarg); +template +Vc_INTRINSIC Vc_CONST Return +simd_cast(Scalar::Vector x0, Scalar::Vector x1, Scalar::Vector x2, + Scalar::Vector x3, Scalar::Vector x4, Scalar::Vector x5, + Scalar::Vector x6, Scalar::Vector x7, Scalar::Vector x8, + Scalar::Vector x9, Scalar::Vector x10, Scalar::Vector x11, + Scalar::Vector x12, Scalar::Vector x13, + enable_if::value> = nullarg); +#endif + +// 15 Scalar::Vector to 1 AVX2::Vector {{{2 +#ifdef Vc_IMPL_AVX2 +template +Vc_INTRINSIC Vc_CONST Return +simd_cast(Scalar::Vector x0, Scalar::Vector x1, Scalar::Vector x2, + Scalar::Vector x3, Scalar::Vector x4, Scalar::Vector x5, + Scalar::Vector x6, Scalar::Vector x7, Scalar::Vector x8, + Scalar::Vector x9, Scalar::Vector x10, Scalar::Vector x11, + Scalar::Vector x12, Scalar::Vector x13, Scalar::Vector x14, + enable_if::value> = nullarg); +template +Vc_INTRINSIC Vc_CONST Return +simd_cast(Scalar::Vector x0, Scalar::Vector x1, Scalar::Vector x2, + Scalar::Vector x3, Scalar::Vector x4, Scalar::Vector x5, + Scalar::Vector x6, Scalar::Vector x7, Scalar::Vector x8, + Scalar::Vector x9, Scalar::Vector x10, Scalar::Vector x11, + Scalar::Vector x12, Scalar::Vector x13, Scalar::Vector x14, + enable_if::value> = nullarg); +#endif + +// 16 Scalar::Vector to 1 AVX2::Vector {{{2 +#ifdef Vc_IMPL_AVX2 +template +Vc_INTRINSIC Vc_CONST Return +simd_cast(Scalar::Vector x0, Scalar::Vector x1, Scalar::Vector x2, + Scalar::Vector x3, Scalar::Vector x4, Scalar::Vector x5, + Scalar::Vector x6, Scalar::Vector x7, Scalar::Vector x8, + Scalar::Vector x9, Scalar::Vector x10, Scalar::Vector x11, + Scalar::Vector x12, Scalar::Vector x13, Scalar::Vector x14, + Scalar::Vector x15, + enable_if::value> = nullarg); +template +Vc_INTRINSIC Vc_CONST Return +simd_cast(Scalar::Vector x0, Scalar::Vector x1, Scalar::Vector x2, + Scalar::Vector x3, Scalar::Vector x4, Scalar::Vector x5, + Scalar::Vector x6, Scalar::Vector x7, Scalar::Vector x8, + Scalar::Vector x9, Scalar::Vector x10, Scalar::Vector x11, + Scalar::Vector x12, Scalar::Vector x13, Scalar::Vector x14, + Scalar::Vector x15, + enable_if::value> = nullarg); +#endif + +// 1 AVX2::Vector to 1 Scalar::Vector {{{2 +template +Vc_INTRINSIC Vc_CONST To simd_cast(AVX2::Vector x, + enable_if::value> = nullarg); + +// Declarations: Mask casts without offset {{{1 +// 1 AVX2::Mask to 1 AVX2::Mask {{{2 +template +Vc_INTRINSIC Vc_CONST Return + simd_cast(const AVX2::Mask &k, enable_if::value> = nullarg); + +// 2 AVX2::Mask to 1 AVX2::Mask {{{2 +Vc_SIMD_CAST_AVX_2(double_m, float_m); +#ifdef Vc_IMPL_AVX2 +Vc_SIMD_CAST_AVX_2(double_m, int_m); +Vc_SIMD_CAST_AVX_2(double_m, uint_m); +Vc_SIMD_CAST_AVX_2(double_m, short_m); +Vc_SIMD_CAST_AVX_2(double_m, ushort_m); + +Vc_SIMD_CAST_AVX_2( float_m, short_m); +Vc_SIMD_CAST_AVX_2( float_m, ushort_m); + +Vc_SIMD_CAST_AVX_2( int_m, short_m); +Vc_SIMD_CAST_AVX_2( int_m, ushort_m); + +Vc_SIMD_CAST_AVX_2( uint_m, short_m); +Vc_SIMD_CAST_AVX_2( uint_m, ushort_m); +#endif + +// 4 AVX2::Mask to 1 AVX2::Mask {{{2 +#ifdef Vc_IMPL_AVX2 +Vc_SIMD_CAST_AVX_4(double_m, short_m); +Vc_SIMD_CAST_AVX_4(double_m, ushort_m); +#endif + +// 1 SSE::Mask to 1 AVX2::Mask {{{2 +Vc_SIMD_CAST_1(SSE::double_m, AVX2::double_m); +Vc_SIMD_CAST_1(SSE::double_m, AVX2:: float_m); +#ifdef Vc_IMPL_AVX2 +Vc_SIMD_CAST_1(SSE::double_m, AVX2:: int_m); +Vc_SIMD_CAST_1(SSE::double_m, AVX2:: uint_m); 
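For the mask casts declared here the conversion is purely a lane-width change: every lane of a mask is all-ones or all-zeros, so for example a 2-lane double mask becomes the low two lanes of a 4-lane float mask by keeping one 32-bit word per 64-bit field. A small intrinsics-only sketch with a hypothetical helper name (not Vc's implementation):

#include <emmintrin.h>  // SSE2
#include <cstdio>

// _MM_SHUFFLE(0, 0, 2, 0) keeps the low 32-bit word of each 64-bit mask field
// and zero-fills the upper two result lanes.
static inline __m128 double_mask_to_float_mask(__m128d k)
{
    return _mm_shuffle_ps(_mm_castpd_ps(k), _mm_setzero_ps(), _MM_SHUFFLE(0, 0, 2, 0));
}

int main()
{
    const __m128d k = _mm_cmplt_pd(_mm_set_pd(2.0, -1.0), _mm_setzero_pd()); // {true, false}
    const int bits = _mm_movemask_ps(double_mask_to_float_mask(k));
    std::printf("%d\n", bits);  // prints 1: only float lane 0 is set
    return 0;
}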
+Vc_SIMD_CAST_1(SSE::double_m, AVX2:: short_m); +Vc_SIMD_CAST_1(SSE::double_m, AVX2::ushort_m); +#endif + +Vc_SIMD_CAST_1(SSE:: float_m, AVX2::double_m); +Vc_SIMD_CAST_1(SSE:: int_m, AVX2::double_m); +Vc_SIMD_CAST_1(SSE:: uint_m, AVX2::double_m); +Vc_SIMD_CAST_1(SSE:: short_m, AVX2::double_m); +Vc_SIMD_CAST_1(SSE::ushort_m, AVX2::double_m); + +Vc_SIMD_CAST_1(SSE:: float_m, AVX2:: float_m); +Vc_SIMD_CAST_1(SSE:: int_m, AVX2:: float_m); +Vc_SIMD_CAST_1(SSE:: uint_m, AVX2:: float_m); +Vc_SIMD_CAST_1(SSE:: short_m, AVX2:: float_m); +Vc_SIMD_CAST_1(SSE::ushort_m, AVX2:: float_m); +#ifdef Vc_IMPL_AVX2 +Vc_SIMD_CAST_1(SSE:: float_m, AVX2:: int_m); +Vc_SIMD_CAST_1(SSE:: float_m, AVX2:: uint_m); +Vc_SIMD_CAST_1(SSE:: int_m, AVX2:: int_m); +Vc_SIMD_CAST_1(SSE:: int_m, AVX2:: uint_m); +Vc_SIMD_CAST_1(SSE:: uint_m, AVX2:: int_m); +Vc_SIMD_CAST_1(SSE:: uint_m, AVX2:: uint_m); + +Vc_SIMD_CAST_1(SSE:: float_m, AVX2:: short_m); +Vc_SIMD_CAST_1(SSE:: int_m, AVX2:: short_m); +Vc_SIMD_CAST_1(SSE:: uint_m, AVX2:: short_m); +Vc_SIMD_CAST_1(SSE:: short_m, AVX2:: short_m); +Vc_SIMD_CAST_1(SSE::ushort_m, AVX2:: short_m); +Vc_SIMD_CAST_1(SSE:: float_m, AVX2::ushort_m); +Vc_SIMD_CAST_1(SSE:: int_m, AVX2::ushort_m); +Vc_SIMD_CAST_1(SSE:: uint_m, AVX2::ushort_m); +Vc_SIMD_CAST_1(SSE:: short_m, AVX2::ushort_m); +Vc_SIMD_CAST_1(SSE::ushort_m, AVX2::ushort_m); + +Vc_SIMD_CAST_1(SSE:: short_m, AVX2:: int_m); +Vc_SIMD_CAST_1(SSE:: short_m, AVX2:: uint_m); + +Vc_SIMD_CAST_1(SSE::ushort_m, AVX2:: int_m); +Vc_SIMD_CAST_1(SSE::ushort_m, AVX2:: uint_m); +#endif + +// 2 SSE::Mask to 1 AVX2::Mask {{{2 +Vc_SIMD_CAST_2(SSE::double_m, AVX2::double_m); +Vc_SIMD_CAST_2(SSE::double_m, AVX2:: float_m); +Vc_SIMD_CAST_2(SSE:: float_m, AVX2:: float_m); +Vc_SIMD_CAST_2(SSE:: int_m, AVX2:: float_m); +Vc_SIMD_CAST_2(SSE:: uint_m, AVX2:: float_m); +#ifdef Vc_IMPL_AVX2 +Vc_SIMD_CAST_2(SSE::double_m, AVX2:: int_m); +Vc_SIMD_CAST_2(SSE::double_m, AVX2:: uint_m); +Vc_SIMD_CAST_2(SSE::double_m, AVX2:: short_m); +Vc_SIMD_CAST_2(SSE::double_m, AVX2::ushort_m); + +Vc_SIMD_CAST_2(SSE:: float_m, AVX2:: int_m); +Vc_SIMD_CAST_2(SSE:: float_m, AVX2:: uint_m); +Vc_SIMD_CAST_2(SSE:: float_m, AVX2:: short_m); +Vc_SIMD_CAST_2(SSE:: float_m, AVX2::ushort_m); + +Vc_SIMD_CAST_2(SSE:: int_m, AVX2:: int_m); +Vc_SIMD_CAST_2(SSE:: int_m, AVX2:: uint_m); +Vc_SIMD_CAST_2(SSE:: int_m, AVX2:: short_m); +Vc_SIMD_CAST_2(SSE:: int_m, AVX2::ushort_m); + +Vc_SIMD_CAST_2(SSE:: uint_m, AVX2:: int_m); +Vc_SIMD_CAST_2(SSE:: uint_m, AVX2:: uint_m); +Vc_SIMD_CAST_2(SSE:: uint_m, AVX2:: short_m); +Vc_SIMD_CAST_2(SSE:: uint_m, AVX2::ushort_m); + +Vc_SIMD_CAST_2(SSE:: short_m, AVX2:: short_m); +Vc_SIMD_CAST_2(SSE:: short_m, AVX2::ushort_m); +Vc_SIMD_CAST_2(SSE::ushort_m, AVX2:: short_m); +Vc_SIMD_CAST_2(SSE::ushort_m, AVX2::ushort_m); +#endif + +// 4 SSE::Mask to 1 AVX2::Mask {{{2 +Vc_SIMD_CAST_4(SSE::double_m, AVX2:: float_m); +#ifdef Vc_IMPL_AVX2 +Vc_SIMD_CAST_4(SSE::double_m, AVX2:: int_m); +Vc_SIMD_CAST_4(SSE::double_m, AVX2:: uint_m); +Vc_SIMD_CAST_4(SSE::double_m, AVX2:: short_m); +Vc_SIMD_CAST_4(SSE::double_m, AVX2::ushort_m); +Vc_SIMD_CAST_4(SSE:: float_m, AVX2:: short_m); +Vc_SIMD_CAST_4(SSE:: float_m, AVX2::ushort_m); +Vc_SIMD_CAST_4(SSE:: int_m, AVX2:: short_m); +Vc_SIMD_CAST_4(SSE:: int_m, AVX2::ushort_m); +Vc_SIMD_CAST_4(SSE:: uint_m, AVX2:: short_m); +Vc_SIMD_CAST_4(SSE:: uint_m, AVX2::ushort_m); +#endif + +// 1 Scalar::Mask to 1 AVX2::Mask {{{2 +template +Vc_INTRINSIC Vc_CONST Return +simd_cast(Scalar::Mask k, + enable_if::value> = nullarg); + +// 2 Scalar::Mask to 1 
AVX2::Mask {{{2 +template +Vc_INTRINSIC Vc_CONST Return +simd_cast(Scalar::Mask k0, Scalar::Mask k1, + enable_if::value> = nullarg); + +// 4 Scalar::Mask to 1 AVX2::Mask {{{2 +template +Vc_INTRINSIC Vc_CONST Return simd_cast( + Scalar::Mask k0, Scalar::Mask k1, Scalar::Mask k2, Scalar::Mask k3, + enable_if<(AVX2::is_mask::value && Return::Size >= 4)> = nullarg); + +// 8 Scalar::Mask to 1 AVX2::Mask {{{2 +template +Vc_INTRINSIC Vc_CONST Return simd_cast( + Scalar::Mask k0, Scalar::Mask k1, Scalar::Mask k2, Scalar::Mask k3, + Scalar::Mask k4, Scalar::Mask k5, Scalar::Mask k6, Scalar::Mask k7, + enable_if<(AVX2::is_mask::value && Return::Size >= 8)> = nullarg); + +// 16 Scalar::Mask to 1 AVX2::Mask {{{2 +template +Vc_INTRINSIC Vc_CONST Return +simd_cast(Scalar::Mask k0, Scalar::Mask k1, Scalar::Mask k2, Scalar::Mask k3, + Scalar::Mask k4, Scalar::Mask k5, Scalar::Mask k6, Scalar::Mask k7, + Scalar::Mask k8, Scalar::Mask k9, Scalar::Mask k10, + Scalar::Mask k11, Scalar::Mask k12, Scalar::Mask k13, + Scalar::Mask k14, Scalar::Mask k15, + enable_if<(AVX2::is_mask::value && Return::Size >= 16)> = nullarg); + +// 1 AVX2::Mask to 1 SSE::Mask {{{2 +Vc_SIMD_CAST_1(AVX2::double_m, SSE::double_m); +Vc_SIMD_CAST_1(AVX2::double_m, SSE:: float_m); +Vc_SIMD_CAST_1(AVX2::double_m, SSE:: int_m); +Vc_SIMD_CAST_1(AVX2::double_m, SSE:: uint_m); +Vc_SIMD_CAST_1(AVX2::double_m, SSE:: short_m); +Vc_SIMD_CAST_1(AVX2::double_m, SSE::ushort_m); + +Vc_SIMD_CAST_1(AVX2:: float_m, SSE::double_m); +Vc_SIMD_CAST_1(AVX2:: float_m, SSE:: float_m); +Vc_SIMD_CAST_1(AVX2:: float_m, SSE:: int_m); +Vc_SIMD_CAST_1(AVX2:: float_m, SSE:: uint_m); +Vc_SIMD_CAST_1(AVX2:: float_m, SSE:: short_m); +Vc_SIMD_CAST_1(AVX2:: float_m, SSE::ushort_m); + +#ifdef Vc_IMPL_AVX2 +Vc_SIMD_CAST_1(AVX2:: int_m, SSE::double_m); +Vc_SIMD_CAST_1(AVX2:: int_m, SSE:: float_m); +Vc_SIMD_CAST_1(AVX2:: int_m, SSE:: int_m); +Vc_SIMD_CAST_1(AVX2:: int_m, SSE:: uint_m); +Vc_SIMD_CAST_1(AVX2:: int_m, SSE:: short_m); +Vc_SIMD_CAST_1(AVX2:: int_m, SSE::ushort_m); + +Vc_SIMD_CAST_1(AVX2:: uint_m, SSE::double_m); +Vc_SIMD_CAST_1(AVX2:: uint_m, SSE:: float_m); +Vc_SIMD_CAST_1(AVX2:: uint_m, SSE:: int_m); +Vc_SIMD_CAST_1(AVX2:: uint_m, SSE:: uint_m); +Vc_SIMD_CAST_1(AVX2:: uint_m, SSE:: short_m); +Vc_SIMD_CAST_1(AVX2:: uint_m, SSE::ushort_m); + +Vc_SIMD_CAST_1(AVX2:: short_m, SSE::double_m); +Vc_SIMD_CAST_1(AVX2:: short_m, SSE:: float_m); +Vc_SIMD_CAST_1(AVX2:: short_m, SSE:: int_m); +Vc_SIMD_CAST_1(AVX2:: short_m, SSE:: uint_m); +Vc_SIMD_CAST_1(AVX2:: short_m, SSE:: short_m); +Vc_SIMD_CAST_1(AVX2:: short_m, SSE::ushort_m); + +Vc_SIMD_CAST_1(AVX2::ushort_m, SSE::double_m); +Vc_SIMD_CAST_1(AVX2::ushort_m, SSE:: float_m); +Vc_SIMD_CAST_1(AVX2::ushort_m, SSE:: int_m); +Vc_SIMD_CAST_1(AVX2::ushort_m, SSE:: uint_m); +Vc_SIMD_CAST_1(AVX2::ushort_m, SSE:: short_m); +Vc_SIMD_CAST_1(AVX2::ushort_m, SSE::ushort_m); +#endif + +// 2 AVX2::Mask to 1 SSE::Mask {{{2 +Vc_SIMD_CAST_2(AVX2::double_m, SSE:: short_m); +Vc_SIMD_CAST_2(AVX2::double_m, SSE::ushort_m); + +// 1 AVX2::Mask to 1 Scalar::Mask {{{2 +template +Vc_INTRINSIC Vc_CONST To simd_cast(AVX2::Mask x, + enable_if::value> = nullarg); + +// Declaration: offset == 0 | convert from AVX2::Mask/Vector {{{1 +template +Vc_INTRINSIC Vc_CONST enable_if< + (offset == 0 && + ((AVX2::is_vector::value && !Scalar::is_vector::value && + Traits::is_simd_vector::value && !Traits::isSimdArray::value) || + (AVX2::is_mask::value && !Scalar::is_mask::value && + Traits::is_simd_mask::value && + !Traits::isSimdMaskArray::value))), + Return> 
+simd_cast(const From &x); +// Declaration: offset == 0 | convert from SSE::Mask/Vector to AVX2::Mask/Vector {{{1 +template +Vc_INTRINSIC Vc_CONST Return simd_cast( + const From &x, + enable_if::value && + AVX2::is_vector::value) || + (SSE::is_mask::value && + AVX2::is_mask::value))> = nullarg); + +// Declarations: Vector casts with offset {{{1 +// AVX2 to AVX2 {{{2 +template +Vc_INTRINSIC Vc_CONST enable_if<(AVX2::is_vector::value && offset != 0), + Return> +simd_cast(AVX2::Vector x); +// AVX2 to SSE (Vector) {{{2 +template +Vc_INTRINSIC Vc_CONST enable_if<(offset != 0 && SSE::is_vector::value && + sizeof(AVX2::Vector) == 32), + Return> +simd_cast(AVX2::Vector x); +template +Vc_INTRINSIC Vc_CONST enable_if<(offset != 0 && SSE::is_vector::value && + sizeof(AVX2::Vector) == 16), + Return> +simd_cast(AVX2::Vector x); +// SSE to AVX2 {{{2 +Vc_SIMD_CAST_OFFSET(SSE:: short_v, AVX2::double_v, 1); +Vc_SIMD_CAST_OFFSET(SSE::ushort_v, AVX2::double_v, 1); + +// Declarations: Mask casts with offset {{{1 +// 1 AVX2::Mask to N AVX2::Mask {{{2 +/* This declaration confuses GCC (4.9.2). If the declarations are there the definitions + * are ignored by the compiler. ;-( +template +Vc_INTRINSIC_L Vc_CONST_L Return +simd_cast(const AVX2::Mask &k, + enable_if::value> = nullarg) Vc_INTRINSIC_R Vc_CONST_R; +template +Vc_INTRINSIC_L Vc_CONST_L Return +simd_cast(const AVX2::Mask &k, + enable_if::value> = nullarg) Vc_INTRINSIC_R Vc_CONST_R; +template +Vc_INTRINSIC_L Vc_CONST_L Return +simd_cast(const AVX2::Mask &k, + enable_if::value> = nullarg) Vc_INTRINSIC_R Vc_CONST_R; +template +Vc_INTRINSIC_L Vc_CONST_L Return +simd_cast(const AVX2::Mask &k, + enable_if::value> = nullarg) Vc_INTRINSIC_R Vc_CONST_R; + */ + +// 1 SSE::Mask to N AVX2(2)::Mask {{{2 +Vc_SIMD_CAST_OFFSET(SSE:: short_m, AVX2::double_m, 1); +Vc_SIMD_CAST_OFFSET(SSE::ushort_m, AVX2::double_m, 1); + +// AVX2 to SSE (Mask) {{{2 +template +Vc_INTRINSIC Vc_CONST enable_if<(offset != 0 && SSE::is_mask::value && + sizeof(AVX2::Mask) == 32), + Return> +simd_cast(AVX2::Mask x); +template +Vc_INTRINSIC Vc_CONST enable_if<(offset != 0 && SSE::is_mask::value && + sizeof(AVX2::Mask) == 16), + Return> +simd_cast(AVX2::Mask x); + +// helper macros Vc_SIMD_CAST_AVX_[124] & Vc_SIMD_CAST_[124] {{{1 +#undef Vc_SIMD_CAST_AVX_1 +#define Vc_SIMD_CAST_AVX_1(from_, to_) \ + template \ + Vc_INTRINSIC Vc_CONST To simd_cast(AVX2::from_ x, \ + enable_if::value>) + +#undef Vc_SIMD_CAST_AVX_2 +#define Vc_SIMD_CAST_AVX_2(from_, to_) \ + static_assert(AVX2::from_::size() * 2 <= AVX2::to_::size(), \ + "this type combination is wrong"); \ + template \ + Vc_INTRINSIC Vc_CONST To simd_cast(AVX2::from_ x0, AVX2::from_ x1, \ + enable_if::value>) + +#undef Vc_SIMD_CAST_AVX_3 +#define Vc_SIMD_CAST_AVX_3(from_, to_) \ + template \ + Vc_INTRINSIC Vc_CONST To simd_cast(AVX2::from_ x0, AVX2::from_ x1, AVX2::from_ x2, \ + enable_if::value>) + +#undef Vc_SIMD_CAST_AVX_4 +#define Vc_SIMD_CAST_AVX_4(from_, to_) \ + template \ + Vc_INTRINSIC Vc_CONST To simd_cast(AVX2::from_ x0, AVX2::from_ x1, AVX2::from_ x2, \ + AVX2::from_ x3, \ + enable_if::value>) + +#undef Vc_SIMD_CAST_1 +#define Vc_SIMD_CAST_1(from_, to_) \ + template \ + Vc_INTRINSIC Vc_CONST To simd_cast(from_ x, enable_if::value>) + +#undef Vc_SIMD_CAST_2 +#define Vc_SIMD_CAST_2(from_, to_) \ + template \ + Vc_INTRINSIC Vc_CONST To simd_cast(from_ x0, from_ x1, \ + enable_if::value>) + +#undef Vc_SIMD_CAST_3 +#define Vc_SIMD_CAST_3(from_, to_) \ + template \ + Vc_INTRINSIC Vc_CONST To simd_cast(from_ x0, from_ x1, from_ x2, \ + 
enable_if::value>) + +#undef Vc_SIMD_CAST_4 +#define Vc_SIMD_CAST_4(from_, to_) \ + template \ + Vc_INTRINSIC Vc_CONST To simd_cast(from_ x0, from_ x1, from_ x2, from_ x3, \ + enable_if::value>) + +#undef Vc_SIMD_CAST_5 +#define Vc_SIMD_CAST_5(from_, to_) \ + template \ + Vc_INTRINSIC Vc_CONST To simd_cast(from_ x0, from_ x1, from_ x2, from_ x3, from_ x4, \ + enable_if::value>) + +#undef Vc_SIMD_CAST_6 +#define Vc_SIMD_CAST_6(from_, to_) \ + template \ + Vc_INTRINSIC Vc_CONST To simd_cast(from_ x0, from_ x1, from_ x2, from_ x3, from_ x4, \ + from_ x5, \ + enable_if::value>) + +#undef Vc_SIMD_CAST_7 +#define Vc_SIMD_CAST_7(from_, to_) \ + template \ + Vc_INTRINSIC Vc_CONST To simd_cast(from_ x0, from_ x1, from_ x2, from_ x3, from_ x4, \ + from_ x5, from_ x6, \ + enable_if::value>) + +#undef Vc_SIMD_CAST_8 +#define Vc_SIMD_CAST_8(from_, to_) \ + template \ + Vc_INTRINSIC Vc_CONST To simd_cast(from_ x0, from_ x1, from_ x2, from_ x3, from_ x4, \ + from_ x5, from_ x6, from_ x7, \ + enable_if::value>) + +#undef Vc_SIMD_CAST_OFFSET +#define Vc_SIMD_CAST_OFFSET(from_, to_, offset_) \ + static_assert(from_::size() >= to_::size() * (offset_ + 1), \ + "this offset cannot exist for this type combination"); \ + template \ + Vc_INTRINSIC Vc_CONST To simd_cast( \ + from_ x, enable_if<(offset == offset_ && std::is_same::value)>) + +// SSE -> AVX2 where the AVX2 Vector is integral and thus of equal size() as the {{{1 +// equivalent SSE Vector +template +Vc_INTRINSIC Vc_CONST To +simd_cast(From x, enable_if<(AVX2::is_vector::value && SSE::is_vector::value && + SSE::Vector::Size == To::Size)>) +{ + return simd_cast>(x).data(); +} +template +Vc_INTRINSIC Vc_CONST To +simd_cast(From x0, From x1, + enable_if<(AVX2::is_vector::value && SSE::is_vector::value && + SSE::Vector::Size == To::Size)>) +{ + return simd_cast>(x0, x1).data(); +} +template +Vc_INTRINSIC Vc_CONST To +simd_cast(From x0, From x1, From x2, + enable_if<(AVX2::is_vector::value && SSE::is_vector::value && + SSE::Vector::Size == To::Size)>) +{ + return simd_cast>(x0, x1, x2).data(); +} +template +Vc_INTRINSIC Vc_CONST To +simd_cast(From x0, From x1, From x2, From x3, + enable_if<(AVX2::is_vector::value && SSE::is_vector::value && + SSE::Vector::Size == To::Size)>) +{ + return simd_cast>(x0, x1, x2, x3).data(); +} +template +Vc_INTRINSIC Vc_CONST To +simd_cast(From x0, From x1, From x2, From x3, From x4, From x5, From x6, From x7, + enable_if<(AVX2::is_vector::value && SSE::is_vector::value && + SSE::Vector::Size == To::Size)>) +{ + return simd_cast>(x0, x1, x2, x3, x4, x5, x6, x7) + .data(); +} + +// Vector casts without offset {{{1 +// AVX2::Vector {{{2 +// 1: to double_v {{{3 +Vc_SIMD_CAST_AVX_1( float_v, double_v) { return _mm256_cvtps_pd(AVX::lo128(x.data())); } +#ifdef Vc_IMPL_AVX2 +Vc_SIMD_CAST_AVX_1( int_v, double_v) { return AVX::convert< int, double>(AVX::lo128(x.data())); } +Vc_SIMD_CAST_AVX_1( uint_v, double_v) { return AVX::convert< uint, double>(AVX::lo128(x.data())); } +Vc_SIMD_CAST_AVX_1( short_v, double_v) { return AVX::convert< short, double>(AVX::lo128(x.data())); } +Vc_SIMD_CAST_AVX_1(ushort_v, double_v) { return AVX::convert(AVX::lo128(x.data())); } +#endif + +// 1: to float_v {{{3 +Vc_SIMD_CAST_AVX_1(double_v, float_v) { return AVX::zeroExtend(_mm256_cvtpd_ps(x.data())); } +#ifdef Vc_IMPL_AVX2 +Vc_SIMD_CAST_AVX_1( int_v, float_v) { return AVX::convert< int, float>(x.data()); } +Vc_SIMD_CAST_AVX_1( uint_v, float_v) { return AVX::convert< uint, float>(x.data()); } +Vc_SIMD_CAST_AVX_1( short_v, float_v) { return AVX::convert< 
short, float>(AVX::lo128(x.data())); } +Vc_SIMD_CAST_AVX_1(ushort_v, float_v) { return AVX::convert(AVX::lo128(x.data())); } +#endif + +// 2: to float_v {{{3 +Vc_SIMD_CAST_AVX_2(double_v, float_v) { return AVX::concat(_mm256_cvtpd_ps(x0.data()), _mm256_cvtpd_ps(x1.data())); } + +// 1: to int_v {{{3 +#ifdef Vc_IMPL_AVX2 +Vc_SIMD_CAST_AVX_1(double_v, int_v) { return AVX::zeroExtend(_mm256_cvttpd_epi32(x.data())); } +Vc_SIMD_CAST_AVX_1( float_v, int_v) { return _mm256_cvttps_epi32(x.data()); } +Vc_SIMD_CAST_AVX_1( uint_v, int_v) { return x.data(); } +Vc_SIMD_CAST_AVX_1( short_v, int_v) { + const auto tmp = Mem::permute4x64(x.data()); + return _mm256_srai_epi32(_mm256_unpacklo_epi16(tmp, tmp), 16); +} +Vc_SIMD_CAST_AVX_1(ushort_v, int_v) { + const auto tmp = Mem::permute4x64(x.data()); + return _mm256_srli_epi32(_mm256_unpacklo_epi16(tmp, tmp), 16); +} +#endif + +// 2: to int_v {{{3 +#ifdef Vc_IMPL_AVX2 +Vc_SIMD_CAST_AVX_2(double_v, int_v) { return AVX::concat(_mm256_cvttpd_epi32(x0.data()), _mm256_cvttpd_epi32(x1.data())); } +#endif + +// 1: to uint_v {{{3 +#ifdef Vc_IMPL_AVX2 +Vc_SIMD_CAST_AVX_1(double_v, uint_v) { return AVX::zeroExtend(AVX::convert(x.data())); } +Vc_SIMD_CAST_AVX_1( float_v, uint_v) { + return _mm256_blendv_epi8( + _mm256_cvttps_epi32(x.data()), + _mm256_add_epi32( + _mm256_cvttps_epi32(_mm256_sub_ps(x.data(), AVX::set2power31_ps())), + AVX::set2power31_epu32()), + _mm256_castps_si256(AVX::cmpge_ps(x.data(), AVX::set2power31_ps()))); +} +Vc_SIMD_CAST_AVX_1( int_v, uint_v) { return x.data(); } +Vc_SIMD_CAST_AVX_1( short_v, uint_v) { + const auto tmp = Mem::permute4x64(x.data()); + return _mm256_srai_epi32(_mm256_unpacklo_epi16(tmp, tmp), 16); +} +Vc_SIMD_CAST_AVX_1(ushort_v, uint_v) { + const auto tmp = Mem::permute4x64(x.data()); + return _mm256_srli_epi32(_mm256_unpacklo_epi16(tmp, tmp), 16); +} +#endif + +// 2: to uint_v {{{3 +#ifdef Vc_IMPL_AVX2 +Vc_SIMD_CAST_AVX_2(double_v, uint_v) { return AVX::concat(AVX::convert(x0.data()), AVX::convert(x1.data())); } +#endif + +// 1: to short_v {{{3 +#ifdef Vc_IMPL_AVX2 +Vc_SIMD_CAST_AVX_1(double_v, short_v) { return AVX::zeroExtend(_mm_packs_epi32(_mm256_cvttpd_epi32(x.data()), _mm_setzero_si128())); } +Vc_SIMD_CAST_AVX_1( float_v, short_v) { + const auto tmp = _mm256_cvttps_epi32(x.data()); + return AVX::zeroExtend(_mm_packs_epi32(AVX::lo128(tmp), AVX::hi128(tmp))); +} +Vc_SIMD_CAST_AVX_1( int_v, short_v) { return AVX::zeroExtend(AVX::convert< int, short>(x.data())); } +Vc_SIMD_CAST_AVX_1( uint_v, short_v) { return AVX::zeroExtend(AVX::convert(x.data())); } +Vc_SIMD_CAST_AVX_1(ushort_v, short_v) { return x.data(); } +#endif + +// 2: to short_v {{{3 +#ifdef Vc_IMPL_AVX2 +Vc_SIMD_CAST_AVX_2(double_v, short_v) { + const auto tmp0 = _mm256_cvttpd_epi32(x0.data()); + const auto tmp1 = _mm256_cvttpd_epi32(x1.data()); + return AVX::zeroExtend(_mm_packs_epi32(tmp0, tmp1)); +} +Vc_SIMD_CAST_AVX_2( float_v, short_v) { + using AVX2::short_v; + using AVX2::int_v; + return simd_cast(simd_cast(x0), simd_cast(x1)); +} +Vc_SIMD_CAST_AVX_2( int_v, short_v) { + auto tmp0 = _mm256_unpacklo_epi16(x0.data(), x1.data()); + auto tmp1 = _mm256_unpackhi_epi16(x0.data(), x1.data()); + auto tmp2 = _mm256_unpacklo_epi16(tmp0, tmp1); + auto tmp3 = _mm256_unpackhi_epi16(tmp0, tmp1); + return Mem::permute4x64(_mm256_unpacklo_epi16(tmp2, tmp3)); +} +Vc_SIMD_CAST_AVX_2( uint_v, short_v) { + auto tmp0 = _mm256_unpacklo_epi16(x0.data(), x1.data()); + auto tmp1 = _mm256_unpackhi_epi16(x0.data(), x1.data()); + auto tmp2 = _mm256_unpacklo_epi16(tmp0, tmp1); + auto 
tmp3 = _mm256_unpackhi_epi16(tmp0, tmp1); + return Mem::permute4x64(_mm256_unpacklo_epi16(tmp2, tmp3)); +} +#endif + +// 3: to short_v {{{3 +#ifdef Vc_IMPL_AVX2 +Vc_SIMD_CAST_AVX_3(double_v, short_v) { + const auto tmp0 = _mm256_cvttpd_epi32(x0.data()); + const auto tmp1 = _mm256_cvttpd_epi32(x1.data()); + const auto tmp2 = _mm256_cvttpd_epi32(x2.data()); + return AVX::concat(_mm_packs_epi32(tmp0, tmp1), _mm_packs_epi32(tmp2, _mm_setzero_si128())); +} +#endif + +// 4: to short_v {{{3 +#ifdef Vc_IMPL_AVX2 +Vc_SIMD_CAST_AVX_4(double_v, short_v) { + const auto tmp0 = _mm256_cvttpd_epi32(x0.data()); + const auto tmp1 = _mm256_cvttpd_epi32(x1.data()); + const auto tmp2 = _mm256_cvttpd_epi32(x2.data()); + const auto tmp3 = _mm256_cvttpd_epi32(x3.data()); + return AVX::concat(_mm_packs_epi32(tmp0, tmp1), _mm_packs_epi32(tmp2, tmp3)); +} +#endif + +// 1: to ushort_v {{{3 +#ifdef Vc_IMPL_AVX2 +Vc_SIMD_CAST_AVX_1(double_v, ushort_v) { + const auto tmp = _mm256_cvttpd_epi32(x.data()); + return AVX::zeroExtend(_mm_packs_epi32(tmp, _mm_setzero_si128())); +} +Vc_SIMD_CAST_AVX_1( float_v, ushort_v) { + const auto tmp = _mm256_cvttps_epi32(x.data()); + return AVX::zeroExtend(_mm_packs_epi32(AVX::lo128(tmp), AVX::hi128(tmp))); +} +Vc_SIMD_CAST_AVX_1( int_v, ushort_v) { return AVX::zeroExtend(AVX::convert< int, ushort>(x.data())); } +Vc_SIMD_CAST_AVX_1( uint_v, ushort_v) { return AVX::zeroExtend(AVX::convert(x.data())); } +Vc_SIMD_CAST_AVX_1( short_v, ushort_v) { return x.data(); } +#endif + +// 2: to ushort_v {{{3 +#ifdef Vc_IMPL_AVX2 +Vc_SIMD_CAST_AVX_2(double_v, ushort_v) { + const auto tmp0 = _mm256_cvttpd_epi32(x0.data()); + const auto tmp1 = _mm256_cvttpd_epi32(x1.data()); + return AVX::zeroExtend(_mm_packs_epi32(tmp0, tmp1)); +} +Vc_SIMD_CAST_AVX_2( float_v, ushort_v) { + using AVX2::ushort_v; + using AVX2::int_v; + return simd_cast(simd_cast(x0), simd_cast(x1)); +} +Vc_SIMD_CAST_AVX_2( int_v, ushort_v) { + auto tmp0 = _mm256_unpacklo_epi16(x0.data(), x1.data()); + auto tmp1 = _mm256_unpackhi_epi16(x0.data(), x1.data()); + auto tmp2 = _mm256_unpacklo_epi16(tmp0, tmp1); + auto tmp3 = _mm256_unpackhi_epi16(tmp0, tmp1); + return Mem::permute4x64(_mm256_unpacklo_epi16(tmp2, tmp3)); +} +Vc_SIMD_CAST_AVX_2( uint_v, ushort_v) { + auto tmp0 = _mm256_unpacklo_epi16(x0.data(), x1.data()); + auto tmp1 = _mm256_unpackhi_epi16(x0.data(), x1.data()); + auto tmp2 = _mm256_unpacklo_epi16(tmp0, tmp1); + auto tmp3 = _mm256_unpackhi_epi16(tmp0, tmp1); + return Mem::permute4x64(_mm256_unpacklo_epi16(tmp2, tmp3)); +} +#endif + +// 3: to ushort_v {{{3 +#ifdef Vc_IMPL_AVX2 +Vc_SIMD_CAST_AVX_3(double_v, ushort_v) { + const auto tmp0 = _mm256_cvttpd_epi32(x0.data()); + const auto tmp1 = _mm256_cvttpd_epi32(x1.data()); + const auto tmp2 = _mm256_cvttpd_epi32(x2.data()); + return AVX::concat(_mm_packs_epi32(tmp0, tmp1), _mm_packs_epi32(tmp2, _mm_setzero_si128())); +} +#endif + +// 4: to ushort_v {{{3 +#ifdef Vc_IMPL_AVX2 +Vc_SIMD_CAST_AVX_4(double_v, ushort_v) { + const auto tmp0 = _mm256_cvttpd_epi32(x0.data()); + const auto tmp1 = _mm256_cvttpd_epi32(x1.data()); + const auto tmp2 = _mm256_cvttpd_epi32(x2.data()); + const auto tmp3 = _mm256_cvttpd_epi32(x3.data()); + return AVX::concat(_mm_packs_epi32(tmp0, tmp1), _mm_packs_epi32(tmp2, tmp3)); +} +#endif + +// 1 SSE::Vector to 1 AVX2::Vector {{{2 +Vc_SIMD_CAST_1(SSE::double_v, AVX2::double_v) { return AVX::zeroExtend(x.data()); } +Vc_SIMD_CAST_1(SSE:: float_v, AVX2::double_v) { return _mm256_cvtps_pd(x.data()); } +Vc_SIMD_CAST_1(SSE:: int_v, AVX2::double_v) { return 
_mm256_cvtepi32_pd(x.data()); } +Vc_SIMD_CAST_1(SSE:: uint_v, AVX2::double_v) { using namespace AvxIntrinsics; return _mm256_add_pd(_mm256_cvtepi32_pd(_mm_sub_epi32(x.data(), _mm_setmin_epi32())), set1_pd(1u << 31)); } +Vc_SIMD_CAST_1(SSE:: short_v, AVX2::double_v) { return simd_cast(simd_cast(x)); } +Vc_SIMD_CAST_1(SSE::ushort_v, AVX2::double_v) { return simd_cast(simd_cast(x)); } + +Vc_SIMD_CAST_1(SSE::double_v, AVX2:: float_v) { return AVX::zeroExtend(simd_cast(x).data()); } +Vc_SIMD_CAST_1(SSE:: float_v, AVX2:: float_v) { return AVX::zeroExtend(x.data()); } +Vc_SIMD_CAST_1(SSE:: int_v, AVX2:: float_v) { return AVX::zeroExtend(_mm_cvtepi32_ps(x.data())); } +Vc_SIMD_CAST_1(SSE:: uint_v, AVX2:: float_v) { return AVX::zeroExtend(simd_cast(x).data()); } +Vc_SIMD_CAST_1(SSE:: short_v, AVX2:: float_v) { return AVX::convert< short, float>(x.data()); } +Vc_SIMD_CAST_1(SSE::ushort_v, AVX2:: float_v) { return AVX::convert(x.data()); } + +#ifdef Vc_IMPL_AVX2 +Vc_SIMD_CAST_1(SSE::double_v, AVX2:: int_v) { return AVX::zeroExtend(simd_cast(x).data()); } +Vc_SIMD_CAST_1(SSE::double_v, AVX2:: uint_v) { return AVX::zeroExtend(simd_cast(x).data()); } +Vc_SIMD_CAST_1(SSE::double_v, AVX2:: short_v) { return AVX::zeroExtend(simd_cast(x).data()); } +Vc_SIMD_CAST_1(SSE::double_v, AVX2::ushort_v) { return AVX::zeroExtend(simd_cast(x).data()); } + +Vc_SIMD_CAST_1(SSE:: float_v, AVX2:: int_v) { return AVX::zeroExtend(simd_cast(x).data()); } +Vc_SIMD_CAST_1(SSE:: float_v, AVX2:: uint_v) { return AVX::zeroExtend(simd_cast(x).data()); } +Vc_SIMD_CAST_1(SSE:: float_v, AVX2:: short_v) { return AVX::zeroExtend(simd_cast(x).data()); } +Vc_SIMD_CAST_1(SSE:: float_v, AVX2::ushort_v) { return AVX::zeroExtend(simd_cast(x).data()); } + +Vc_SIMD_CAST_1(SSE:: int_v, AVX2:: int_v) { return AVX::zeroExtend(x.data()); } +Vc_SIMD_CAST_1(SSE:: uint_v, AVX2:: int_v) { return AVX::zeroExtend(x.data()); } +Vc_SIMD_CAST_1(SSE:: short_v, AVX2:: int_v) { return AVX::convert< short, int>(x.data()); } +Vc_SIMD_CAST_1(SSE::ushort_v, AVX2:: int_v) { return AVX::convert(x.data()); } + +Vc_SIMD_CAST_1(SSE:: int_v, AVX2:: uint_v) { return AVX::zeroExtend(x.data()); } +Vc_SIMD_CAST_1(SSE:: uint_v, AVX2:: uint_v) { return AVX::zeroExtend(x.data()); } +Vc_SIMD_CAST_1(SSE:: short_v, AVX2:: uint_v) { return AVX::convert< short, uint>(x.data()); } +Vc_SIMD_CAST_1(SSE::ushort_v, AVX2:: uint_v) { return AVX::convert(x.data()); } + +Vc_SIMD_CAST_1(SSE:: int_v, AVX2:: short_v) { return AVX::zeroExtend(simd_cast(x).data()); } +Vc_SIMD_CAST_1(SSE:: uint_v, AVX2:: short_v) { return AVX::zeroExtend(simd_cast(x).data()); } +Vc_SIMD_CAST_1(SSE:: short_v, AVX2:: short_v) { return AVX::zeroExtend(x.data()); } +Vc_SIMD_CAST_1(SSE::ushort_v, AVX2:: short_v) { return AVX::zeroExtend(x.data()); } + +Vc_SIMD_CAST_1(SSE:: int_v, AVX2::ushort_v) { return AVX::zeroExtend(simd_cast(x).data()); } +Vc_SIMD_CAST_1(SSE:: uint_v, AVX2::ushort_v) { return AVX::zeroExtend(simd_cast(x).data()); } +Vc_SIMD_CAST_1(SSE:: short_v, AVX2::ushort_v) { return AVX::zeroExtend(x.data()); } +Vc_SIMD_CAST_1(SSE::ushort_v, AVX2::ushort_v) { return AVX::zeroExtend(x.data()); } +#endif + +// 2 SSE::Vector to 1 AVX2::Vector {{{2 +Vc_SIMD_CAST_2(SSE::double_v, AVX2::double_v) { return AVX::concat(x0.data(), x1.data()); } + +Vc_SIMD_CAST_2(SSE::double_v, AVX2:: float_v) { return AVX::zeroExtend(simd_cast(x0, x1).data()); } +Vc_SIMD_CAST_2(SSE:: float_v, AVX2:: float_v) { return AVX::concat(x0.data(), x1.data()); } +Vc_SIMD_CAST_2(SSE:: int_v, AVX2:: float_v) { return AVX::convert< int, 
float>(AVX::concat(x0.data(), x1.data())); } +Vc_SIMD_CAST_2(SSE:: uint_v, AVX2:: float_v) { return AVX::convert(AVX::concat(x0.data(), x1.data())); } + +#ifdef Vc_IMPL_AVX2 +Vc_SIMD_CAST_2(SSE::double_v, AVX2:: int_v) { return AVX::zeroExtend(simd_cast(x0, x1).data()); } +Vc_SIMD_CAST_2(SSE::double_v, AVX2:: uint_v) { return AVX::zeroExtend(simd_cast(x0, x1).data()); } +Vc_SIMD_CAST_2(SSE::double_v, AVX2:: short_v) { return AVX::zeroExtend(simd_cast(x0, x1).data()); } +Vc_SIMD_CAST_2(SSE::double_v, AVX2::ushort_v) { return AVX::zeroExtend(simd_cast(x0, x1).data()); } + +Vc_SIMD_CAST_2(SSE:: float_v, AVX2:: int_v) { return simd_cast(simd_cast(x0, x1)); } +Vc_SIMD_CAST_2(SSE:: float_v, AVX2:: uint_v) { return simd_cast(simd_cast(x0, x1)); } +Vc_SIMD_CAST_2(SSE:: float_v, AVX2:: short_v) { return AVX::zeroExtend(simd_cast(x0, x1).data()); } +Vc_SIMD_CAST_2(SSE:: float_v, AVX2::ushort_v) { return AVX::zeroExtend(simd_cast(x0, x1).data()); } + +Vc_SIMD_CAST_2(SSE:: int_v, AVX2:: int_v) { return AVX::concat(x0.data(), x1.data()); } +Vc_SIMD_CAST_2(SSE:: uint_v, AVX2:: int_v) { return AVX::concat(x0.data(), x1.data()); } + +Vc_SIMD_CAST_2(SSE:: int_v, AVX2:: uint_v) { return AVX::concat(x0.data(), x1.data()); } +Vc_SIMD_CAST_2(SSE:: uint_v, AVX2:: uint_v) { return AVX::concat(x0.data(), x1.data()); } + +Vc_SIMD_CAST_2(SSE:: int_v, AVX2:: short_v) { return AVX::zeroExtend(simd_cast(x0, x1).data()); } +Vc_SIMD_CAST_2(SSE:: uint_v, AVX2:: short_v) { return AVX::zeroExtend(simd_cast(x0, x1).data()); } +Vc_SIMD_CAST_2(SSE:: short_v, AVX2:: short_v) { return AVX::concat(x0.data(), x1.data()); } +Vc_SIMD_CAST_2(SSE::ushort_v, AVX2:: short_v) { return AVX::concat(x0.data(), x1.data()); } + +Vc_SIMD_CAST_2(SSE:: int_v, AVX2::ushort_v) { return AVX::zeroExtend(simd_cast(x0, x1).data()); } +Vc_SIMD_CAST_2(SSE:: uint_v, AVX2::ushort_v) { return AVX::zeroExtend(simd_cast(x0, x1).data()); } +Vc_SIMD_CAST_2(SSE:: short_v, AVX2::ushort_v) { return AVX::concat(x0.data(), x1.data()); } +Vc_SIMD_CAST_2(SSE::ushort_v, AVX2::ushort_v) { return AVX::concat(x0.data(), x1.data()); } +#endif +// 3 SSE::Vector to 1 AVX2::Vector {{{2 +Vc_SIMD_CAST_3(SSE::double_v, AVX2:: float_v) { return simd_cast(simd_cast(x0, x1), simd_cast(x2)); } + +#ifdef Vc_IMPL_AVX2 +Vc_SIMD_CAST_3(SSE::double_v, AVX2:: int_v) { return simd_cast(simd_cast(x0, x1), simd_cast(x2)); } +Vc_SIMD_CAST_3(SSE::double_v, AVX2:: uint_v) { return simd_cast(simd_cast(x0, x1), simd_cast(x2)); } +Vc_SIMD_CAST_3(SSE::double_v, AVX2:: short_v) { return AVX::zeroExtend(simd_cast(x0, x1, x2).data()); } +Vc_SIMD_CAST_3(SSE::double_v, AVX2::ushort_v) { return AVX::zeroExtend(simd_cast(x0, x1, x2).data()); } + +Vc_SIMD_CAST_3(SSE:: float_v, AVX2:: short_v) { return simd_cast(simd_cast(x0, x1), simd_cast(x2)); } +Vc_SIMD_CAST_3(SSE:: float_v, AVX2::ushort_v) { return simd_cast(simd_cast(x0, x1), simd_cast(x2)); } + +Vc_SIMD_CAST_3(SSE:: int_v, AVX2:: short_v) { return simd_cast(simd_cast(x0, x1), simd_cast(x2)); } +Vc_SIMD_CAST_3(SSE:: uint_v, AVX2:: short_v) { return simd_cast(simd_cast(x0, x1), simd_cast(x2)); } + +Vc_SIMD_CAST_3(SSE:: int_v, AVX2::ushort_v) { return simd_cast(simd_cast(x0, x1), simd_cast(x2)); } +Vc_SIMD_CAST_3(SSE:: uint_v, AVX2::ushort_v) { return simd_cast(simd_cast(x0, x1), simd_cast(x2)); } +#endif + +// 4 SSE::Vector to 1 AVX2::Vector {{{2 +Vc_SIMD_CAST_4(SSE::double_v, AVX2:: float_v) { return simd_cast(simd_cast(x0, x1), simd_cast(x2, x3)); } + +#ifdef Vc_IMPL_AVX2 +Vc_SIMD_CAST_4(SSE::double_v, AVX2:: int_v) { return simd_cast(simd_cast(x0, 
x1), simd_cast(x2, x3)); } +Vc_SIMD_CAST_4(SSE::double_v, AVX2:: uint_v) { return simd_cast(simd_cast(x0, x1), simd_cast(x2, x3)); } +Vc_SIMD_CAST_4(SSE::double_v, AVX2:: short_v) { return AVX::zeroExtend(simd_cast(x0, x1, x2, x3).data()); } +Vc_SIMD_CAST_4(SSE::double_v, AVX2::ushort_v) { return AVX::zeroExtend(simd_cast(x0, x1, x2, x3).data()); } + +Vc_SIMD_CAST_4(SSE:: float_v, AVX2:: short_v) { return simd_cast(simd_cast(x0, x1), simd_cast(x2, x3)); } +Vc_SIMD_CAST_4(SSE:: float_v, AVX2::ushort_v) { return simd_cast(simd_cast(x0, x1), simd_cast(x2, x3)); } + +Vc_SIMD_CAST_4(SSE:: int_v, AVX2:: short_v) { return simd_cast(simd_cast(x0, x1), simd_cast(x2, x3)); } +Vc_SIMD_CAST_4(SSE:: uint_v, AVX2:: short_v) { return simd_cast(simd_cast(x0, x1), simd_cast(x2, x3)); } + +Vc_SIMD_CAST_4(SSE:: int_v, AVX2::ushort_v) { return simd_cast(simd_cast(x0, x1), simd_cast(x2, x3)); } +Vc_SIMD_CAST_4(SSE:: uint_v, AVX2::ushort_v) { return simd_cast(simd_cast(x0, x1), simd_cast(x2, x3)); } +#endif + +// 5 SSE::Vector to 1 AVX2::Vector {{{2 +#ifdef Vc_IMPL_AVX2 +Vc_SIMD_CAST_5(SSE::double_v, AVX2:: short_v) { return simd_cast(simd_cast(x0, x1), simd_cast(x2, x3), simd_cast(x4)); } +Vc_SIMD_CAST_5(SSE::double_v, AVX2::ushort_v) { return simd_cast(simd_cast(x0, x1), simd_cast(x2, x3), simd_cast(x4)); } +#endif + +// 6 SSE::Vector to 1 AVX2::Vector {{{2 +#ifdef Vc_IMPL_AVX2 +Vc_SIMD_CAST_6(SSE::double_v, AVX2:: short_v) { return simd_cast(simd_cast(x0, x1), simd_cast(x2, x3), simd_cast(x4, x5)); } +Vc_SIMD_CAST_6(SSE::double_v, AVX2::ushort_v) { return simd_cast(simd_cast(x0, x1), simd_cast(x2, x3), simd_cast(x4, x5)); } +#endif + +// 7 SSE::Vector to 1 AVX2::Vector {{{2 +#ifdef Vc_IMPL_AVX2 +Vc_SIMD_CAST_7(SSE::double_v, AVX2:: short_v) { return simd_cast(simd_cast(x0, x1), simd_cast(x2, x3), simd_cast(x4, x5), simd_cast(x6)); } +Vc_SIMD_CAST_7(SSE::double_v, AVX2::ushort_v) { return simd_cast(simd_cast(x0, x1), simd_cast(x2, x3), simd_cast(x4, x5), simd_cast(x6)); } +#endif + +// 8 SSE::Vector to 1 AVX2::Vector {{{2 +#ifdef Vc_IMPL_AVX2 +Vc_SIMD_CAST_8(SSE::double_v, AVX2:: short_v) { return simd_cast(simd_cast(x0, x1), simd_cast(x2, x3), simd_cast(x4, x5), simd_cast(x6, x7)); } +Vc_SIMD_CAST_8(SSE::double_v, AVX2::ushort_v) { return simd_cast(simd_cast(x0, x1), simd_cast(x2, x3), simd_cast(x4, x5), simd_cast(x6, x7)); } +#endif + +// 1 AVX2::Vector to 1 SSE::Vector {{{2 +Vc_SIMD_CAST_1(AVX2::double_v, SSE::double_v) { return AVX::lo128(x.data()); } +Vc_SIMD_CAST_1(AVX2:: float_v, SSE:: float_v) { return AVX::lo128(x.data()); } +#ifdef Vc_IMPL_AVX2 +Vc_SIMD_CAST_1(AVX2:: int_v, SSE:: int_v) { return AVX::lo128(x.data()); } +Vc_SIMD_CAST_1(AVX2:: uint_v, SSE:: uint_v) { return AVX::lo128(x.data()); } +Vc_SIMD_CAST_1(AVX2:: short_v, SSE:: short_v) { return AVX::lo128(x.data()); } +Vc_SIMD_CAST_1(AVX2::ushort_v, SSE::ushort_v) { return AVX::lo128(x.data()); } +#endif + +Vc_SIMD_CAST_1(AVX2::double_v, SSE:: float_v) { return simd_cast(simd_cast(x)); } +Vc_SIMD_CAST_1(AVX2::double_v, SSE:: int_v) { return AVX::convert(x.data()); } +Vc_SIMD_CAST_1(AVX2::double_v, SSE:: uint_v) { return AVX::convert(x.data()); } +Vc_SIMD_CAST_1(AVX2::double_v, SSE:: short_v) { return AVX::convert(x.data()); } +Vc_SIMD_CAST_1(AVX2::double_v, SSE::ushort_v) { return AVX::convert(x.data()); } + +Vc_SIMD_CAST_1(AVX2:: float_v, SSE::double_v) { return simd_cast(simd_cast(x)); } +Vc_SIMD_CAST_1(AVX2:: float_v, SSE:: int_v) { return simd_cast(simd_cast(x)); } +Vc_SIMD_CAST_1(AVX2:: float_v, SSE:: uint_v) { return 
simd_cast(simd_cast(x)); } +Vc_SIMD_CAST_1(AVX2:: float_v, SSE:: short_v) { return AVX::convert(x.data()); } +Vc_SIMD_CAST_1(AVX2:: float_v, SSE::ushort_v) { return AVX::convert(x.data()); } + +#ifdef Vc_IMPL_AVX2 +Vc_SIMD_CAST_1(AVX2:: int_v, SSE::double_v) { return SSE::convert(AVX::lo128(x.data())); } +Vc_SIMD_CAST_1(AVX2:: int_v, SSE:: float_v) { return SSE::convert(AVX::lo128(x.data())); } +Vc_SIMD_CAST_1(AVX2:: int_v, SSE:: uint_v) { return AVX::lo128(x.data()); } +Vc_SIMD_CAST_1(AVX2:: int_v, SSE:: short_v) { return AVX::convert(x.data()); } +Vc_SIMD_CAST_1(AVX2:: int_v, SSE::ushort_v) { return AVX::convert(x.data()); } + +Vc_SIMD_CAST_1(AVX2:: uint_v, SSE::double_v) { return SSE::convert(AVX::lo128(x.data())); } +Vc_SIMD_CAST_1(AVX2:: uint_v, SSE:: float_v) { return SSE::convert(AVX::lo128(x.data())); } +Vc_SIMD_CAST_1(AVX2:: uint_v, SSE:: int_v) { return AVX::lo128(x.data()); } +Vc_SIMD_CAST_1(AVX2:: uint_v, SSE:: short_v) { return AVX::convert(x.data()); } +Vc_SIMD_CAST_1(AVX2:: uint_v, SSE::ushort_v) { return AVX::convert(x.data()); } + +Vc_SIMD_CAST_1(AVX2:: short_v, SSE::double_v) { return simd_cast(simd_cast(x)); } +Vc_SIMD_CAST_1(AVX2:: short_v, SSE:: float_v) { return simd_cast(simd_cast(x)); } +Vc_SIMD_CAST_1(AVX2:: short_v, SSE:: int_v) { return simd_cast(simd_cast(x)); } +Vc_SIMD_CAST_1(AVX2:: short_v, SSE:: uint_v) { return simd_cast(simd_cast(x)); } +Vc_SIMD_CAST_1(AVX2:: short_v, SSE::ushort_v) { return simd_cast(simd_cast(x)); } + +Vc_SIMD_CAST_1(AVX2::ushort_v, SSE::double_v) { return simd_cast(simd_cast(x)); } +Vc_SIMD_CAST_1(AVX2::ushort_v, SSE:: float_v) { return simd_cast(simd_cast(x)); } +Vc_SIMD_CAST_1(AVX2::ushort_v, SSE:: int_v) { return simd_cast(simd_cast(x)); } +Vc_SIMD_CAST_1(AVX2::ushort_v, SSE:: uint_v) { return simd_cast(simd_cast(x)); } +Vc_SIMD_CAST_1(AVX2::ushort_v, SSE:: short_v) { return simd_cast(simd_cast(x)); } +#endif + +// 2 AVX2::Vector to 1 SSE::Vector {{{2 +Vc_SIMD_CAST_2(AVX2::double_v, SSE:: short_v) { + const auto tmp0 = _mm256_cvttpd_epi32(x0.data()); + const auto tmp1 = _mm256_cvttpd_epi32(x1.data()); + return _mm_packs_epi32(tmp0, tmp1); +} +Vc_SIMD_CAST_2(AVX2::double_v, SSE::ushort_v) { + const auto tmp0 = _mm256_cvttpd_epi32(x0.data()); + const auto tmp1 = _mm256_cvttpd_epi32(x1.data()); + return _mm_packs_epi32(tmp0, tmp1); +} + +// 1 Scalar::Vector to 1 AVX2::Vector {{{2 +template +Vc_INTRINSIC Vc_CONST Return +simd_cast(Scalar::Vector x, + enable_if::value>) +{ + return AVX::zeroExtend(_mm_setr_pd(x.data(), 0.)); +} +template +Vc_INTRINSIC Vc_CONST Return +simd_cast(Scalar::Vector x, + enable_if::value>) +{ + return AVX::zeroExtend(_mm_setr_ps(x.data(), 0.f, 0.f, 0.f)); +} +#ifdef Vc_IMPL_AVX2 +template +Vc_INTRINSIC Vc_CONST Return +simd_cast(Scalar::Vector x, + enable_if::value>) +{ + return _mm256_setr_epi32(x.data(), 0, 0, 0, 0, 0, 0, 0); +} +template +Vc_INTRINSIC Vc_CONST Return +simd_cast(Scalar::Vector x, + enable_if::value>) +{ + return _mm256_setr_epi32(uint(x.data()), 0, 0, 0, 0, 0, 0, 0); +} +template +Vc_INTRINSIC Vc_CONST Return +simd_cast(Scalar::Vector x, + enable_if::value>) +{ + return _mm256_setr_epi16(x.data(), 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); +} +template +Vc_INTRINSIC Vc_CONST Return +simd_cast(Scalar::Vector x, + enable_if::value>) +{ + return _mm256_setr_epi16(x.data(), 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); +} +#endif + +// 2 Scalar::Vector to 1 AVX2::Vector {{{2 +template +Vc_INTRINSIC Vc_CONST Return +simd_cast(Scalar::Vector x0, Scalar::Vector x1, + enable_if::value>) +{ + return 
AVX::zeroExtend(_mm_setr_pd(x0.data(), x1.data())); +} +template +Vc_INTRINSIC Vc_CONST Return +simd_cast(Scalar::Vector x0, Scalar::Vector x1, + enable_if::value>) +{ + return AVX::zeroExtend(_mm_setr_ps(x0.data(), x1.data(), 0.f, 0.f)); +} +#ifdef Vc_IMPL_AVX2 +template +Vc_INTRINSIC Vc_CONST Return +simd_cast(Scalar::Vector x0, Scalar::Vector x1, + enable_if::value>) +{ + return _mm256_setr_epi32(x0.data(), x1.data(), 0, 0, 0, 0, 0, 0); +} +template +Vc_INTRINSIC Vc_CONST Return +simd_cast(Scalar::Vector x0, Scalar::Vector x1, + enable_if::value>) +{ + return _mm256_setr_epi32(uint(x0.data()), uint(x1.data()), 0, 0, 0, 0, 0, 0); +} +template +Vc_INTRINSIC Vc_CONST Return +simd_cast(Scalar::Vector x0, Scalar::Vector x1, + enable_if::value>) +{ + return _mm256_setr_epi16(x0.data(), x1.data(), 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); +} +template +Vc_INTRINSIC Vc_CONST Return +simd_cast(Scalar::Vector x0, Scalar::Vector x1, + enable_if::value>) +{ + return _mm256_setr_epi16(x0.data(), x1.data(), 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); +} +#endif + +// 3 Scalar::Vector to 1 AVX2::Vector {{{2 +template +Vc_INTRINSIC Vc_CONST Return +simd_cast(Scalar::Vector x0, Scalar::Vector x1, Scalar::Vector x2, + enable_if::value>) +{ + return _mm256_setr_pd(x0.data(), x1.data(), x2.data(), 0); +} +template +Vc_INTRINSIC Vc_CONST Return +simd_cast(Scalar::Vector x0, Scalar::Vector x1, Scalar::Vector x2, + enable_if::value>) +{ + return AVX::zeroExtend(_mm_setr_ps(x0.data(), x1.data(), x2.data(), 0)); +} +#ifdef Vc_IMPL_AVX2 +template +Vc_INTRINSIC Vc_CONST Return +simd_cast(Scalar::Vector x0, Scalar::Vector x1, Scalar::Vector x2, + enable_if::value>) +{ + return _mm256_setr_epi32(x0.data(), x1.data(), x2.data(), 0, 0, 0, 0, 0); +} +template +Vc_INTRINSIC Vc_CONST Return +simd_cast(Scalar::Vector x0, Scalar::Vector x1, Scalar::Vector x2, + enable_if::value>) +{ + return _mm256_setr_epi32(uint(x0.data()), uint(x1.data()), uint(x2.data()), 0, 0, 0, + 0, 0); +} +template +Vc_INTRINSIC Vc_CONST Return +simd_cast(Scalar::Vector x0, Scalar::Vector x1, Scalar::Vector x2, + enable_if::value>) +{ + return _mm256_setr_epi16(x0.data(), x1.data(), x2.data(), 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); +} +template +Vc_INTRINSIC Vc_CONST Return +simd_cast(Scalar::Vector x0, Scalar::Vector x1, Scalar::Vector x2, + enable_if::value>) +{ + return _mm256_setr_epi16(x0.data(), x1.data(), x2.data(), 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); +} +#endif + +// 4 Scalar::Vector to 1 AVX2::Vector {{{2 +template +Vc_INTRINSIC Vc_CONST Return +simd_cast(Scalar::Vector x0, Scalar::Vector x1, Scalar::Vector x2, + Scalar::Vector x3, + enable_if::value>) +{ + return _mm256_setr_pd(x0.data(), x1.data(), x2.data(), x3.data()); +} +template +Vc_INTRINSIC Vc_CONST Return +simd_cast(Scalar::Vector x0, Scalar::Vector x1, Scalar::Vector x2, + Scalar::Vector x3, + enable_if::value>) +{ + return AVX::zeroExtend(_mm_setr_ps(x0.data(), x1.data(), x2.data(), x3.data())); +} +#ifdef Vc_IMPL_AVX2 +template +Vc_INTRINSIC Vc_CONST Return +simd_cast(Scalar::Vector x0, Scalar::Vector x1, Scalar::Vector x2, + Scalar::Vector x3, + enable_if::value>) +{ + return _mm256_setr_epi32(x0.data(), x1.data(), x2.data(), x3.data(), 0, 0, 0, 0); +} +template +Vc_INTRINSIC Vc_CONST Return +simd_cast(Scalar::Vector x0, Scalar::Vector x1, Scalar::Vector x2, + Scalar::Vector x3, + enable_if::value>) +{ + return _mm256_setr_epi32(uint(x0.data()), uint(x1.data()), uint(x2.data()), + uint(x3.data()), 0, 0, 0, 0); +} +template +Vc_INTRINSIC Vc_CONST Return 
+simd_cast(Scalar::Vector x0, Scalar::Vector x1, Scalar::Vector x2, + Scalar::Vector x3, + enable_if::value>) +{ + return _mm256_setr_epi16(x0.data(), x1.data(), x2.data(), x3.data(), 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); +} +template +Vc_INTRINSIC Vc_CONST Return +simd_cast(Scalar::Vector x0, Scalar::Vector x1, Scalar::Vector x2, + Scalar::Vector x3, + enable_if::value>) +{ + return _mm256_setr_epi16(x0.data(), x1.data(), x2.data(), x3.data(), 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); +} +#endif + +// 5 Scalar::Vector to 1 AVX2::Vector {{{2 +template +Vc_INTRINSIC Vc_CONST Return +simd_cast(Scalar::Vector x0, Scalar::Vector x1, Scalar::Vector x2, + Scalar::Vector x3, Scalar::Vector x4, + enable_if::value>) +{ + return _mm256_setr_ps(x0.data(), x1.data(), x2.data(), x3.data(), x4.data(), 0, 0, 0); +} +#ifdef Vc_IMPL_AVX2 +template +Vc_INTRINSIC Vc_CONST Return +simd_cast(Scalar::Vector x0, Scalar::Vector x1, Scalar::Vector x2, + Scalar::Vector x3, Scalar::Vector x4, + enable_if::value>) +{ + return _mm256_setr_epi32(x0.data(), x1.data(), x2.data(), x3.data(), x4.data(), 0, 0, 0); +} +template +Vc_INTRINSIC Vc_CONST Return +simd_cast(Scalar::Vector x0, Scalar::Vector x1, Scalar::Vector x2, + Scalar::Vector x3, Scalar::Vector x4, + enable_if::value>) +{ + return _mm256_setr_epi32(uint(x0.data()), uint(x1.data()), uint(x2.data()), + uint(x3.data()), uint(x4.data()), 0, 0, 0); +} +template +Vc_INTRINSIC Vc_CONST Return +simd_cast(Scalar::Vector x0, Scalar::Vector x1, Scalar::Vector x2, + Scalar::Vector x3, Scalar::Vector x4, + enable_if::value>) +{ + return _mm256_setr_epi16(x0.data(), x1.data(), x2.data(), x3.data(), x4.data(), 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); +} +template +Vc_INTRINSIC Vc_CONST Return +simd_cast(Scalar::Vector x0, Scalar::Vector x1, Scalar::Vector x2, + Scalar::Vector x3, Scalar::Vector x4, + enable_if::value>) +{ + return _mm256_setr_epi16(x0.data(), x1.data(), x2.data(), x3.data(), x4.data(), 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); +} +#endif + +// 6 Scalar::Vector to 1 AVX2::Vector {{{2 +template +Vc_INTRINSIC Vc_CONST Return +simd_cast(Scalar::Vector x0, Scalar::Vector x1, Scalar::Vector x2, + Scalar::Vector x3, Scalar::Vector x4, Scalar::Vector x5, + enable_if::value>) +{ + return _mm256_setr_ps(x0.data(), x1.data(), x2.data(), x3.data(), x4.data(), + x5.data(), 0, 0); +} +#ifdef Vc_IMPL_AVX2 +template +Vc_INTRINSIC Vc_CONST Return +simd_cast(Scalar::Vector x0, Scalar::Vector x1, Scalar::Vector x2, + Scalar::Vector x3, Scalar::Vector x4, Scalar::Vector x5, + enable_if::value>) +{ + return _mm256_setr_epi32(x0.data(), x1.data(), x2.data(), x3.data(), x4.data(), + x5.data(), 0, 0); +} +template +Vc_INTRINSIC Vc_CONST Return +simd_cast(Scalar::Vector x0, Scalar::Vector x1, Scalar::Vector x2, + Scalar::Vector x3, Scalar::Vector x4, Scalar::Vector x5, + enable_if::value>) +{ + return _mm256_setr_epi32(uint(x0.data()), uint(x1.data()), uint(x2.data()), + uint(x3.data()), uint(x4.data()), uint(x5.data()), 0, 0); +} +template +Vc_INTRINSIC Vc_CONST Return +simd_cast(Scalar::Vector x0, Scalar::Vector x1, Scalar::Vector x2, + Scalar::Vector x3, Scalar::Vector x4, Scalar::Vector x5, + enable_if::value>) +{ + return _mm256_setr_epi16(x0.data(), x1.data(), x2.data(), x3.data(), x4.data(), + x5.data(), 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); +} +template +Vc_INTRINSIC Vc_CONST Return +simd_cast(Scalar::Vector x0, Scalar::Vector x1, Scalar::Vector x2, + Scalar::Vector x3, Scalar::Vector x4, Scalar::Vector x5, + enable_if::value>) +{ + return _mm256_setr_epi16(x0.data(), x1.data(), x2.data(), x3.data(), 
x4.data(), + x5.data(), 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); +} +#endif + +// 7 Scalar::Vector to 1 AVX2::Vector {{{2 +template +Vc_INTRINSIC Vc_CONST Return +simd_cast(Scalar::Vector x0, Scalar::Vector x1, Scalar::Vector x2, + Scalar::Vector x3, Scalar::Vector x4, Scalar::Vector x5, + Scalar::Vector x6, + enable_if::value>) +{ + return _mm256_setr_ps(x0.data(), x1.data(), x2.data(), x3.data(), x4.data(), + x5.data(), x6.data(), 0); +} +#ifdef Vc_IMPL_AVX2 +template +Vc_INTRINSIC Vc_CONST Return +simd_cast(Scalar::Vector x0, Scalar::Vector x1, Scalar::Vector x2, + Scalar::Vector x3, Scalar::Vector x4, Scalar::Vector x5, + Scalar::Vector x6, + enable_if::value>) +{ + return _mm256_setr_epi32(x0.data(), x1.data(), x2.data(), x3.data(), x4.data(), + x5.data(), x6.data(), 0); +} +template +Vc_INTRINSIC Vc_CONST Return +simd_cast(Scalar::Vector x0, Scalar::Vector x1, Scalar::Vector x2, + Scalar::Vector x3, Scalar::Vector x4, Scalar::Vector x5, + Scalar::Vector x6, + enable_if::value>) +{ + return _mm256_setr_epi32(uint(x0.data()), uint(x1.data()), uint(x2.data()), + uint(x3.data()), uint(x4.data()), uint(x5.data()), + uint(x6.data()), 0); +} +template +Vc_INTRINSIC Vc_CONST Return +simd_cast(Scalar::Vector x0, Scalar::Vector x1, Scalar::Vector x2, + Scalar::Vector x3, Scalar::Vector x4, Scalar::Vector x5, + Scalar::Vector x6, + enable_if::value>) +{ + return _mm256_setr_epi16(x0.data(), x1.data(), x2.data(), x3.data(), x4.data(), + x5.data(), x6.data(), 0, 0, 0, 0, 0, 0, 0, 0, 0); +} +template +Vc_INTRINSIC Vc_CONST Return +simd_cast(Scalar::Vector x0, Scalar::Vector x1, Scalar::Vector x2, + Scalar::Vector x3, Scalar::Vector x4, Scalar::Vector x5, + Scalar::Vector x6, + enable_if::value>) +{ + return _mm256_setr_epi16(x0.data(), x1.data(), x2.data(), x3.data(), x4.data(), + x5.data(), x6.data(), 0, 0, 0, 0, 0, 0, 0, 0, 0); +} +#endif + +// 8 Scalar::Vector to 1 AVX2::Vector {{{2 +template +Vc_INTRINSIC Vc_CONST Return +simd_cast(Scalar::Vector x0, Scalar::Vector x1, Scalar::Vector x2, + Scalar::Vector x3, Scalar::Vector x4, Scalar::Vector x5, + Scalar::Vector x6, Scalar::Vector x7, + enable_if::value>) +{ + return _mm256_setr_ps(x0.data(), x1.data(), x2.data(), x3.data(), x4.data(), + x5.data(), x6.data(), x7.data()); +} +#ifdef Vc_IMPL_AVX2 +template +Vc_INTRINSIC Vc_CONST Return +simd_cast(Scalar::Vector x0, Scalar::Vector x1, Scalar::Vector x2, + Scalar::Vector x3, Scalar::Vector x4, Scalar::Vector x5, + Scalar::Vector x6, Scalar::Vector x7, + enable_if::value>) +{ + return _mm256_setr_epi32(x0.data(), x1.data(), x2.data(), x3.data(), x4.data(), + x5.data(), x6.data(), x7.data()); +} +template +Vc_INTRINSIC Vc_CONST Return +simd_cast(Scalar::Vector x0, Scalar::Vector x1, Scalar::Vector x2, + Scalar::Vector x3, Scalar::Vector x4, Scalar::Vector x5, + Scalar::Vector x6, Scalar::Vector x7, + enable_if::value>) +{ + return _mm256_setr_epi32(uint(x0.data()), uint(x1.data()), uint(x2.data()), + uint(x3.data()), uint(x4.data()), uint(x5.data()), + uint(x6.data()), uint(x7.data())); +} +template +Vc_INTRINSIC Vc_CONST Return +simd_cast(Scalar::Vector x0, Scalar::Vector x1, Scalar::Vector x2, + Scalar::Vector x3, Scalar::Vector x4, Scalar::Vector x5, + Scalar::Vector x6, Scalar::Vector x7, + enable_if::value>) +{ + return _mm256_setr_epi16(x0.data(), x1.data(), x2.data(), x3.data(), x4.data(), + x5.data(), x6.data(), x7.data(), 0, 0, 0, 0, 0, 0, 0, 0); +} +template +Vc_INTRINSIC Vc_CONST Return +simd_cast(Scalar::Vector x0, Scalar::Vector x1, Scalar::Vector x2, + Scalar::Vector x3, Scalar::Vector x4, 
Scalar::Vector x5, + Scalar::Vector x6, Scalar::Vector x7, + enable_if::value>) +{ + return _mm256_setr_epi16(x0.data(), x1.data(), x2.data(), x3.data(), x4.data(), + x5.data(), x6.data(), x7.data(), 0, 0, 0, 0, 0, 0, 0, 0); +} +#endif + +// 9 Scalar::Vector to 1 AVX2::Vector {{{2 +#ifdef Vc_IMPL_AVX2 +template +Vc_INTRINSIC Vc_CONST Return +simd_cast(Scalar::Vector x0, Scalar::Vector x1, Scalar::Vector x2, + Scalar::Vector x3, Scalar::Vector x4, Scalar::Vector x5, + Scalar::Vector x6, Scalar::Vector x7, Scalar::Vector x8, + enable_if::value>) +{ + return _mm256_setr_epi16(x0.data(), x1.data(), x2.data(), x3.data(), x4.data(), + x5.data(), x6.data(), x7.data(), x8.data(), 0, 0, 0, 0, 0, 0, + 0); +} +template +Vc_INTRINSIC Vc_CONST Return +simd_cast(Scalar::Vector x0, Scalar::Vector x1, Scalar::Vector x2, + Scalar::Vector x3, Scalar::Vector x4, Scalar::Vector x5, + Scalar::Vector x6, Scalar::Vector x7, Scalar::Vector x8, + enable_if::value>) +{ + return _mm256_setr_epi16(x0.data(), x1.data(), x2.data(), x3.data(), x4.data(), + x5.data(), x6.data(), x7.data(), x8.data(), 0, 0, 0, 0, 0, 0, + 0); +} +#endif + +// 10 Scalar::Vector to 1 AVX2::Vector {{{2 +#ifdef Vc_IMPL_AVX2 +template +Vc_INTRINSIC Vc_CONST Return +simd_cast(Scalar::Vector x0, Scalar::Vector x1, Scalar::Vector x2, + Scalar::Vector x3, Scalar::Vector x4, Scalar::Vector x5, + Scalar::Vector x6, Scalar::Vector x7, Scalar::Vector x8, + Scalar::Vector x9, enable_if::value>) +{ + return _mm256_setr_epi16(x0.data(), x1.data(), x2.data(), x3.data(), x4.data(), + x5.data(), x6.data(), x7.data(), x8.data(), x9.data(), 0, 0, + 0, 0, 0, 0); +} +template +Vc_INTRINSIC Vc_CONST Return +simd_cast(Scalar::Vector x0, Scalar::Vector x1, Scalar::Vector x2, + Scalar::Vector x3, Scalar::Vector x4, Scalar::Vector x5, + Scalar::Vector x6, Scalar::Vector x7, Scalar::Vector x8, + Scalar::Vector x9, enable_if::value>) +{ + return _mm256_setr_epi16(x0.data(), x1.data(), x2.data(), x3.data(), x4.data(), + x5.data(), x6.data(), x7.data(), x8.data(), x9.data(), 0, 0, + 0, 0, 0, 0); +} +#endif + +// 11 Scalar::Vector to 1 AVX2::Vector {{{2 +#ifdef Vc_IMPL_AVX2 +template +Vc_INTRINSIC Vc_CONST Return +simd_cast(Scalar::Vector x0, Scalar::Vector x1, Scalar::Vector x2, + Scalar::Vector x3, Scalar::Vector x4, Scalar::Vector x5, + Scalar::Vector x6, Scalar::Vector x7, Scalar::Vector x8, + Scalar::Vector x9, Scalar::Vector x10, + enable_if::value>) +{ + return _mm256_setr_epi16(x0.data(), x1.data(), x2.data(), x3.data(), x4.data(), + x5.data(), x6.data(), x7.data(), x8.data(), x9.data(), + x10.data(), 0, 0, 0, 0, 0); +} +template +Vc_INTRINSIC Vc_CONST Return +simd_cast(Scalar::Vector x0, Scalar::Vector x1, Scalar::Vector x2, + Scalar::Vector x3, Scalar::Vector x4, Scalar::Vector x5, + Scalar::Vector x6, Scalar::Vector x7, Scalar::Vector x8, + Scalar::Vector x9, Scalar::Vector x10, + enable_if::value>) +{ + return _mm256_setr_epi16(x0.data(), x1.data(), x2.data(), x3.data(), x4.data(), + x5.data(), x6.data(), x7.data(), x8.data(), x9.data(), + x10.data(), 0, 0, 0, 0, 0); +} +#endif + +// 12 Scalar::Vector to 1 AVX2::Vector {{{2 +#ifdef Vc_IMPL_AVX2 +template +Vc_INTRINSIC Vc_CONST Return +simd_cast(Scalar::Vector x0, Scalar::Vector x1, Scalar::Vector x2, + Scalar::Vector x3, Scalar::Vector x4, Scalar::Vector x5, + Scalar::Vector x6, Scalar::Vector x7, Scalar::Vector x8, + Scalar::Vector x9, Scalar::Vector x10, Scalar::Vector x11, + enable_if::value>) +{ + return _mm256_setr_epi16(x0.data(), x1.data(), x2.data(), x3.data(), x4.data(), + x5.data(), x6.data(), 
x7.data(), x8.data(), x9.data(), + x10.data(), x11.data(), 0, 0, 0, 0); +} +template +Vc_INTRINSIC Vc_CONST Return +simd_cast(Scalar::Vector x0, Scalar::Vector x1, Scalar::Vector x2, + Scalar::Vector x3, Scalar::Vector x4, Scalar::Vector x5, + Scalar::Vector x6, Scalar::Vector x7, Scalar::Vector x8, + Scalar::Vector x9, Scalar::Vector x10, Scalar::Vector x11, + enable_if::value>) +{ + return _mm256_setr_epi16(x0.data(), x1.data(), x2.data(), x3.data(), x4.data(), + x5.data(), x6.data(), x7.data(), x8.data(), x9.data(), + x10.data(), x11.data(), 0, 0, 0, 0); +} +#endif + +// 13 Scalar::Vector to 1 AVX2::Vector {{{2 +#ifdef Vc_IMPL_AVX2 +template +Vc_INTRINSIC Vc_CONST Return +simd_cast(Scalar::Vector x0, Scalar::Vector x1, Scalar::Vector x2, + Scalar::Vector x3, Scalar::Vector x4, Scalar::Vector x5, + Scalar::Vector x6, Scalar::Vector x7, Scalar::Vector x8, + Scalar::Vector x9, Scalar::Vector x10, Scalar::Vector x11, + Scalar::Vector x12, enable_if::value>) +{ + return _mm256_setr_epi16(x0.data(), x1.data(), x2.data(), x3.data(), x4.data(), + x5.data(), x6.data(), x7.data(), x8.data(), x9.data(), + x10.data(), x11.data(), x12.data(), 0, 0, 0); +} +template +Vc_INTRINSIC Vc_CONST Return +simd_cast(Scalar::Vector x0, Scalar::Vector x1, Scalar::Vector x2, + Scalar::Vector x3, Scalar::Vector x4, Scalar::Vector x5, + Scalar::Vector x6, Scalar::Vector x7, Scalar::Vector x8, + Scalar::Vector x9, Scalar::Vector x10, Scalar::Vector x11, + Scalar::Vector x12, enable_if::value>) +{ + return _mm256_setr_epi16(x0.data(), x1.data(), x2.data(), x3.data(), x4.data(), + x5.data(), x6.data(), x7.data(), x8.data(), x9.data(), + x10.data(), x11.data(), x12.data(), 0, 0, 0); +} +#endif + +// 14 Scalar::Vector to 1 AVX2::Vector {{{2 +#ifdef Vc_IMPL_AVX2 +template +Vc_INTRINSIC Vc_CONST Return +simd_cast(Scalar::Vector x0, Scalar::Vector x1, Scalar::Vector x2, + Scalar::Vector x3, Scalar::Vector x4, Scalar::Vector x5, + Scalar::Vector x6, Scalar::Vector x7, Scalar::Vector x8, + Scalar::Vector x9, Scalar::Vector x10, Scalar::Vector x11, + Scalar::Vector x12, Scalar::Vector x13, + enable_if::value>) +{ + return _mm256_setr_epi16(x0.data(), x1.data(), x2.data(), x3.data(), x4.data(), + x5.data(), x6.data(), x7.data(), x8.data(), x9.data(), + x10.data(), x11.data(), x12.data(), x13.data(), 0, 0); +} +template +Vc_INTRINSIC Vc_CONST Return +simd_cast(Scalar::Vector x0, Scalar::Vector x1, Scalar::Vector x2, + Scalar::Vector x3, Scalar::Vector x4, Scalar::Vector x5, + Scalar::Vector x6, Scalar::Vector x7, Scalar::Vector x8, + Scalar::Vector x9, Scalar::Vector x10, Scalar::Vector x11, + Scalar::Vector x12, Scalar::Vector x13, + enable_if::value>) +{ + return _mm256_setr_epi16(x0.data(), x1.data(), x2.data(), x3.data(), x4.data(), + x5.data(), x6.data(), x7.data(), x8.data(), x9.data(), + x10.data(), x11.data(), x12.data(), x13.data(), 0, 0); +} +#endif + +// 15 Scalar::Vector to 1 AVX2::Vector {{{2 +#ifdef Vc_IMPL_AVX2 +template +Vc_INTRINSIC Vc_CONST Return +simd_cast(Scalar::Vector x0, Scalar::Vector x1, Scalar::Vector x2, + Scalar::Vector x3, Scalar::Vector x4, Scalar::Vector x5, + Scalar::Vector x6, Scalar::Vector x7, Scalar::Vector x8, + Scalar::Vector x9, Scalar::Vector x10, Scalar::Vector x11, + Scalar::Vector x12, Scalar::Vector x13, Scalar::Vector x14, + enable_if::value>) +{ + return _mm256_setr_epi16(x0.data(), x1.data(), x2.data(), x3.data(), x4.data(), + x5.data(), x6.data(), x7.data(), x8.data(), x9.data(), + x10.data(), x11.data(), x12.data(), x13.data(), x14.data(), + 0); +} +template +Vc_INTRINSIC 
Vc_CONST Return +simd_cast(Scalar::Vector x0, Scalar::Vector x1, Scalar::Vector x2, + Scalar::Vector x3, Scalar::Vector x4, Scalar::Vector x5, + Scalar::Vector x6, Scalar::Vector x7, Scalar::Vector x8, + Scalar::Vector x9, Scalar::Vector x10, Scalar::Vector x11, + Scalar::Vector x12, Scalar::Vector x13, Scalar::Vector x14, + enable_if::value>) +{ + return _mm256_setr_epi16(x0.data(), x1.data(), x2.data(), x3.data(), x4.data(), + x5.data(), x6.data(), x7.data(), x8.data(), x9.data(), + x10.data(), x11.data(), x12.data(), x13.data(), x14.data(), + 0); +} +#endif + +// 16 Scalar::Vector to 1 AVX2::Vector {{{2 +#ifdef Vc_IMPL_AVX2 +template +Vc_INTRINSIC Vc_CONST Return +simd_cast(Scalar::Vector x0, Scalar::Vector x1, Scalar::Vector x2, + Scalar::Vector x3, Scalar::Vector x4, Scalar::Vector x5, + Scalar::Vector x6, Scalar::Vector x7, Scalar::Vector x8, + Scalar::Vector x9, Scalar::Vector x10, Scalar::Vector x11, + Scalar::Vector x12, Scalar::Vector x13, Scalar::Vector x14, + Scalar::Vector x15, enable_if::value>) +{ + return _mm256_setr_epi16(x0.data(), x1.data(), x2.data(), x3.data(), x4.data(), + x5.data(), x6.data(), x7.data(), x8.data(), x9.data(), + x10.data(), x11.data(), x12.data(), x13.data(), x14.data(), + x15.data()); +} +template +Vc_INTRINSIC Vc_CONST Return +simd_cast(Scalar::Vector x0, Scalar::Vector x1, Scalar::Vector x2, + Scalar::Vector x3, Scalar::Vector x4, Scalar::Vector x5, + Scalar::Vector x6, Scalar::Vector x7, Scalar::Vector x8, + Scalar::Vector x9, Scalar::Vector x10, Scalar::Vector x11, + Scalar::Vector x12, Scalar::Vector x13, Scalar::Vector x14, + Scalar::Vector x15, enable_if::value>) +{ + return _mm256_setr_epi16(x0.data(), x1.data(), x2.data(), x3.data(), x4.data(), + x5.data(), x6.data(), x7.data(), x8.data(), x9.data(), + x10.data(), x11.data(), x12.data(), x13.data(), x14.data(), + x15.data()); +} +#endif + +// 1 AVX2::Vector to 1 Scalar::Vector {{{2 +template +Vc_INTRINSIC Vc_CONST To +simd_cast(AVX2::Vector x, enable_if::value>) +{ + return static_cast(x[0]); +} + +// Mask casts without offset {{{1 +// 1 AVX2::Mask to 1 AVX2::Mask {{{2 +template +Vc_INTRINSIC Vc_CONST Return + simd_cast(const AVX2::Mask &k, enable_if::value>) +{ + return {Detail::mask_cast::Size, Return::Size, + typename Return::VectorTypeF>(k.dataI())}; +} + +// 2 AVX2::Mask to 1 AVX2::Mask {{{2 +Vc_SIMD_CAST_AVX_2(double_m, float_m) { return AVX::concat(_mm_packs_epi32(AVX::lo128(x0.dataI()), AVX::hi128(x0.dataI())), _mm_packs_epi32(AVX::lo128(x1.dataI()), AVX::hi128(x1.dataI()))); } +#ifdef Vc_IMPL_AVX2 +Vc_SIMD_CAST_AVX_2(double_m, int_m) { return Mem::permute4x64(_mm256_packs_epi32(x0.dataI(), x1.dataI())); } +Vc_SIMD_CAST_AVX_2(double_m, uint_m) { return Mem::permute4x64(_mm256_packs_epi32(x0.dataI(), x1.dataI())); } +Vc_SIMD_CAST_AVX_2(double_m, short_m) { return AVX::zeroExtend(_mm_packs_epi16(_mm_packs_epi32(AVX::lo128(x0.dataI()), AVX::hi128(x0.dataI())), _mm_packs_epi32(AVX::lo128(x1.dataI()), AVX::hi128(x1.dataI())))); } +Vc_SIMD_CAST_AVX_2(double_m, ushort_m) { return AVX::zeroExtend(_mm_packs_epi16(_mm_packs_epi32(AVX::lo128(x0.dataI()), AVX::hi128(x0.dataI())), _mm_packs_epi32(AVX::lo128(x1.dataI()), AVX::hi128(x1.dataI())))); } + +Vc_SIMD_CAST_AVX_2( float_m, short_m) { return Mem::permute4x64(_mm256_packs_epi16(x0.dataI(), x1.dataI())); } +Vc_SIMD_CAST_AVX_2( float_m, ushort_m) { return Mem::permute4x64(_mm256_packs_epi16(x0.dataI(), x1.dataI())); } + +Vc_SIMD_CAST_AVX_2( int_m, short_m) { return Mem::permute4x64(_mm256_packs_epi16(x0.dataI(), x1.dataI())); } 
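The two-mask narrowing casts in this block pack the 32-bit mask lanes of both inputs down to 16 bits and then permute the 64-bit quadrants. The permute is needed because the AVX2 pack instructions operate per 128-bit lane, so the packed halves of the two inputs come out interleaved. The library code can use _mm256_packs_epi16 here because mask lanes are always all-zeros or all-ones; the standalone sketch below uses _mm256_packs_epi32, the more general 32-to-16 narrowing, to make the data flow easier to follow. It is illustrative only and not part of the diff; the helper name and test values are made up.

// Sketch: narrow two 8x32-bit AVX2 masks into one 16x16-bit mask.
// _mm256_packs_epi32 packs within each 128-bit lane, producing
// a0..a3 b0..b3 | a4..a7 b4..b7; permuting the 64-bit quadrants with
// control 0xD8 (order 0, 2, 1, 3) restores a0..a7 b0..b7.
#include <immintrin.h>
#include <cstdint>
#include <cstdio>

static __m256i narrow_two_int_masks_to_short_mask(__m256i m0, __m256i m1)
{
    const __m256i packed = _mm256_packs_epi32(m0, m1); // lane-interleaved result
    return _mm256_permute4x64_epi64(packed, 0xD8);     // fix the quadrant order
}

int main()
{
    alignas(32) int32_t in0[8], in1[8];
    for (int i = 0; i < 8; ++i) {
        in0[i] = (i % 2) ? -1 : 0;  // mask lanes are all-ones or all-zeros
        in1[i] = (i % 3) ? -1 : 0;
    }
    const __m256i r = narrow_two_int_masks_to_short_mask(
        _mm256_load_si256(reinterpret_cast<const __m256i *>(in0)),
        _mm256_load_si256(reinterpret_cast<const __m256i *>(in1)));
    alignas(32) int16_t out[16];
    _mm256_store_si256(reinterpret_cast<__m256i *>(out), r);
    for (int i = 0; i < 16; ++i) {
        const int32_t expected = (i < 8) ? in0[i] : in1[i - 8];
        std::printf("%2d: got %d expected %d\n", i, out[i] != 0, expected != 0);
    }
}

The 0xD8 immediate selects the 64-bit quadrants in the order 0, 2, 1, 3, which appears to be what the Mem::permute4x64 calls above express inside Vc.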
+Vc_SIMD_CAST_AVX_2( int_m, ushort_m) { return Mem::permute4x64(_mm256_packs_epi16(x0.dataI(), x1.dataI())); } + +Vc_SIMD_CAST_AVX_2( uint_m, short_m) { return Mem::permute4x64(_mm256_packs_epi16(x0.dataI(), x1.dataI())); } +Vc_SIMD_CAST_AVX_2( uint_m, ushort_m) { return Mem::permute4x64(_mm256_packs_epi16(x0.dataI(), x1.dataI())); } +#endif + +// 4 AVX2::Mask to 1 AVX2::Mask {{{2 +#ifdef Vc_IMPL_AVX2 +Vc_SIMD_CAST_AVX_4(double_m, short_m) +{ + using namespace AVX; + const auto tmp = _mm256_packs_epi32( + _mm256_packs_epi32(x0.dataI(), x1.dataI()) // a0 a1 b0 b1 a2 a3 b2 b3 + , + _mm256_packs_epi32(x2.dataI(), x3.dataI()) // c0 c1 d0 d1 c2 c3 d2 d3 + ); // a0 a1 b0 b1 c0 c1 d0 d1 a2 a3 b2 b3 c2 c3 d2 d3 + return concat(_mm_unpacklo_epi32(lo128(tmp), hi128(tmp)), // a0 a1 a2 a3 b0 b1 b2 b3 + _mm_unpackhi_epi32(lo128(tmp), hi128(tmp))); // c0 c1 c2 c3 d0 d1 d2 d3 +} +Vc_SIMD_CAST_AVX_4(double_m, ushort_m) { return simd_cast(x0, x1, x2, x3).data(); } +#endif + +// 1 SSE::Mask to 1 AVX2::Mask {{{2 +Vc_SIMD_CAST_1(SSE::double_m, AVX2::double_m) { return AVX::zeroExtend(x.data()); } +Vc_SIMD_CAST_1(SSE::double_m, AVX2:: float_m) { return AVX::zeroExtend(simd_cast(x).data()); } +#ifdef Vc_IMPL_AVX2 +Vc_SIMD_CAST_1(SSE::double_m, AVX2:: int_m) { return AVX::zeroExtend(simd_cast(x).data()); } +Vc_SIMD_CAST_1(SSE::double_m, AVX2:: uint_m) { return AVX::zeroExtend(simd_cast(x).data()); } +Vc_SIMD_CAST_1(SSE::double_m, AVX2:: short_m) { return AVX::zeroExtend(simd_cast(x).data()); } +Vc_SIMD_CAST_1(SSE::double_m, AVX2::ushort_m) { return AVX::zeroExtend(simd_cast(x).data()); } +#endif + +Vc_SIMD_CAST_1(SSE:: float_m, AVX2::double_m) { return AVX::concat(_mm_unpacklo_ps(x.data(), x.data()), _mm_unpackhi_ps(x.data(), x.data())); } +Vc_SIMD_CAST_1(SSE:: int_m, AVX2::double_m) { return AVX::concat(_mm_unpacklo_ps(x.data(), x.data()), _mm_unpackhi_ps(x.data(), x.data())); } +Vc_SIMD_CAST_1(SSE:: uint_m, AVX2::double_m) { return AVX::concat(_mm_unpacklo_ps(x.data(), x.data()), _mm_unpackhi_ps(x.data(), x.data())); } +Vc_SIMD_CAST_1(SSE:: short_m, AVX2::double_m) { auto tmp = _mm_unpacklo_epi16(x.dataI(), x.dataI()); return AVX::concat(_mm_unpacklo_epi32(tmp, tmp), _mm_unpackhi_epi32(tmp, tmp)); } +Vc_SIMD_CAST_1(SSE::ushort_m, AVX2::double_m) { auto tmp = _mm_unpacklo_epi16(x.dataI(), x.dataI()); return AVX::concat(_mm_unpacklo_epi32(tmp, tmp), _mm_unpackhi_epi32(tmp, tmp)); } + +Vc_SIMD_CAST_1(SSE:: float_m, AVX2:: float_m) { return AVX::zeroExtend(x.data()); } +Vc_SIMD_CAST_1(SSE:: int_m, AVX2:: float_m) { return AVX::zeroExtend(x.data()); } +Vc_SIMD_CAST_1(SSE:: uint_m, AVX2:: float_m) { return AVX::zeroExtend(x.data()); } +Vc_SIMD_CAST_1(SSE:: short_m, AVX2:: float_m) { return AVX::concat(_mm_unpacklo_epi16(x.dataI(), x.dataI()), _mm_unpackhi_epi16(x.dataI(), x.dataI())); } +Vc_SIMD_CAST_1(SSE::ushort_m, AVX2:: float_m) { return AVX::concat(_mm_unpacklo_epi16(x.dataI(), x.dataI()), _mm_unpackhi_epi16(x.dataI(), x.dataI())); } + +#ifdef Vc_IMPL_AVX2 +Vc_SIMD_CAST_1(SSE:: float_m, AVX2:: int_m) { return AVX::zeroExtend(x.data()); } +Vc_SIMD_CAST_1(SSE:: float_m, AVX2:: uint_m) { return AVX::zeroExtend(x.data()); } +Vc_SIMD_CAST_1(SSE:: int_m, AVX2:: int_m) { return AVX::zeroExtend(x.data()); } +Vc_SIMD_CAST_1(SSE:: int_m, AVX2:: uint_m) { return AVX::zeroExtend(x.data()); } +Vc_SIMD_CAST_1(SSE:: uint_m, AVX2:: int_m) { return AVX::zeroExtend(x.data()); } +Vc_SIMD_CAST_1(SSE:: uint_m, AVX2:: uint_m) { return AVX::zeroExtend(x.data()); } + +Vc_SIMD_CAST_1(SSE:: float_m, AVX2:: short_m) { return 
AVX::zeroExtend(simd_cast(x).data()); } +Vc_SIMD_CAST_1(SSE:: int_m, AVX2:: short_m) { return AVX::zeroExtend(simd_cast(x).data()); } +Vc_SIMD_CAST_1(SSE:: uint_m, AVX2:: short_m) { return AVX::zeroExtend(simd_cast(x).data()); } +Vc_SIMD_CAST_1(SSE:: short_m, AVX2:: short_m) { return AVX::zeroExtend(simd_cast(x).data()); } +Vc_SIMD_CAST_1(SSE::ushort_m, AVX2:: short_m) { return AVX::zeroExtend(simd_cast(x).data()); } +Vc_SIMD_CAST_1(SSE:: float_m, AVX2::ushort_m) { return AVX::zeroExtend(simd_cast(x).data()); } +Vc_SIMD_CAST_1(SSE:: int_m, AVX2::ushort_m) { return AVX::zeroExtend(simd_cast(x).data()); } +Vc_SIMD_CAST_1(SSE:: uint_m, AVX2::ushort_m) { return AVX::zeroExtend(simd_cast(x).data()); } +Vc_SIMD_CAST_1(SSE:: short_m, AVX2::ushort_m) { return AVX::zeroExtend(simd_cast(x).data()); } +Vc_SIMD_CAST_1(SSE::ushort_m, AVX2::ushort_m) { return AVX::zeroExtend(simd_cast(x).data()); } + +Vc_SIMD_CAST_1(SSE:: short_m, AVX2:: int_m) { const auto v = Mem::permute4x64(AVX::avx_cast<__m256i>(x.data())); return _mm256_unpacklo_epi16(v, v); } +Vc_SIMD_CAST_1(SSE:: short_m, AVX2:: uint_m) { const auto v = Mem::permute4x64(AVX::avx_cast<__m256i>(x.data())); return _mm256_unpacklo_epi16(v, v); } + +Vc_SIMD_CAST_1(SSE::ushort_m, AVX2:: int_m) { const auto v = Mem::permute4x64(AVX::avx_cast<__m256i>(x.data())); return _mm256_unpacklo_epi16(v, v); } +Vc_SIMD_CAST_1(SSE::ushort_m, AVX2:: uint_m) { const auto v = Mem::permute4x64(AVX::avx_cast<__m256i>(x.data())); return _mm256_unpacklo_epi16(v, v); } +#endif + +// 2 SSE::Mask to 1 AVX2::Mask {{{2 +Vc_SIMD_CAST_2(SSE::double_m, AVX2::double_m) { return AVX::concat(x0.data(), x1.data()); } +Vc_SIMD_CAST_2(SSE::double_m, AVX2:: float_m) { return AVX::zeroExtend(_mm_packs_epi32(x0.dataI(), x1.dataI())); } +Vc_SIMD_CAST_2(SSE:: float_m, AVX2:: float_m) { return AVX::concat(x0.data(), x1.data()); } +Vc_SIMD_CAST_2(SSE:: int_m, AVX2:: float_m) { return AVX::concat(x0.data(), x1.data()); } +Vc_SIMD_CAST_2(SSE:: uint_m, AVX2:: float_m) { return AVX::concat(x0.data(), x1.data()); } + +#ifdef Vc_IMPL_AVX2 +Vc_SIMD_CAST_2(SSE::double_m, AVX2:: int_m) { return AVX::zeroExtend(_mm_packs_epi32(x0.dataI(), x1.dataI())); } +Vc_SIMD_CAST_2(SSE::double_m, AVX2:: uint_m) { return AVX::zeroExtend(_mm_packs_epi32(x0.dataI(), x1.dataI())); } +Vc_SIMD_CAST_2(SSE::double_m, AVX2:: short_m) { return AVX::zeroExtend(_mm_packs_epi16(_mm_packs_epi32(x0.dataI(), x1.dataI()), _mm_setzero_si128())); } +Vc_SIMD_CAST_2(SSE::double_m, AVX2::ushort_m) { return AVX::zeroExtend(_mm_packs_epi16(_mm_packs_epi32(x0.dataI(), x1.dataI()), _mm_setzero_si128())); } + +Vc_SIMD_CAST_2(SSE:: float_m, AVX2:: int_m) { return AVX::concat(x0.data(), x1.data()); } +Vc_SIMD_CAST_2(SSE:: float_m, AVX2:: uint_m) { return AVX::concat(x0.data(), x1.data()); } +Vc_SIMD_CAST_2(SSE:: float_m, AVX2:: short_m) { return AVX::zeroExtend(_mm_packs_epi16(x0.dataI(), x1.dataI())); } +Vc_SIMD_CAST_2(SSE:: float_m, AVX2::ushort_m) { return AVX::zeroExtend(_mm_packs_epi16(x0.dataI(), x1.dataI())); } + +Vc_SIMD_CAST_2(SSE:: int_m, AVX2:: int_m) { return AVX::concat(x0.data(), x1.data()); } +Vc_SIMD_CAST_2(SSE:: int_m, AVX2:: uint_m) { return AVX::concat(x0.data(), x1.data()); } +Vc_SIMD_CAST_2(SSE:: int_m, AVX2:: short_m) { return AVX::zeroExtend(_mm_packs_epi16(x0.dataI(), x1.dataI())); } +Vc_SIMD_CAST_2(SSE:: int_m, AVX2::ushort_m) { return AVX::zeroExtend(_mm_packs_epi16(x0.dataI(), x1.dataI())); } + +Vc_SIMD_CAST_2(SSE:: uint_m, AVX2:: int_m) { return AVX::concat(x0.data(), x1.data()); } +Vc_SIMD_CAST_2(SSE:: uint_m, 
AVX2:: uint_m) { return AVX::concat(x0.data(), x1.data()); } +Vc_SIMD_CAST_2(SSE:: uint_m, AVX2:: short_m) { return AVX::zeroExtend(_mm_packs_epi16(x0.dataI(), x1.dataI())); } +Vc_SIMD_CAST_2(SSE:: uint_m, AVX2::ushort_m) { return AVX::zeroExtend(_mm_packs_epi16(x0.dataI(), x1.dataI())); } + +Vc_SIMD_CAST_2(SSE:: short_m, AVX2:: short_m) { return AVX::concat(x0.data(), x1.data()); } +Vc_SIMD_CAST_2(SSE:: short_m, AVX2::ushort_m) { return AVX::concat(x0.data(), x1.data()); } +Vc_SIMD_CAST_2(SSE::ushort_m, AVX2:: short_m) { return AVX::concat(x0.data(), x1.data()); } +Vc_SIMD_CAST_2(SSE::ushort_m, AVX2::ushort_m) { return AVX::concat(x0.data(), x1.data()); } +#endif + +// 4 SSE::Mask to 1 AVX2::Mask {{{2 +Vc_SIMD_CAST_4(SSE::double_m, AVX2:: float_m) { return AVX::concat(_mm_packs_epi32(x0.dataI(), x1.dataI()), _mm_packs_epi32(x2.dataI(), x3.dataI())); } +#ifdef Vc_IMPL_AVX2 +Vc_SIMD_CAST_4(SSE::double_m, AVX2:: int_m) { return AVX::concat(_mm_packs_epi32(x0.dataI(), x1.dataI()), _mm_packs_epi32(x2.dataI(), x3.dataI())); } +Vc_SIMD_CAST_4(SSE::double_m, AVX2:: uint_m) { return AVX::concat(_mm_packs_epi32(x0.dataI(), x1.dataI()), _mm_packs_epi32(x2.dataI(), x3.dataI())); } +Vc_SIMD_CAST_4(SSE::double_m, AVX2:: short_m) { return AVX::zeroExtend(_mm_packs_epi16(_mm_packs_epi32(x0.dataI(), x1.dataI()), _mm_packs_epi32(x2.dataI(), x3.dataI()))); } +Vc_SIMD_CAST_4(SSE::double_m, AVX2::ushort_m) { return AVX::zeroExtend(_mm_packs_epi16(_mm_packs_epi32(x0.dataI(), x1.dataI()), _mm_packs_epi32(x2.dataI(), x3.dataI()))); } +Vc_SIMD_CAST_4(SSE:: float_m, AVX2:: short_m) { return AVX::concat(_mm_packs_epi16(x0.dataI(), x1.dataI()), _mm_packs_epi16(x2.dataI(), x3.dataI())); } +Vc_SIMD_CAST_4(SSE:: float_m, AVX2::ushort_m) { return AVX::concat(_mm_packs_epi16(x0.dataI(), x1.dataI()), _mm_packs_epi16(x2.dataI(), x3.dataI())); } +Vc_SIMD_CAST_4(SSE:: int_m, AVX2:: short_m) { return AVX::concat(_mm_packs_epi16(x0.dataI(), x1.dataI()), _mm_packs_epi16(x2.dataI(), x3.dataI())); } +Vc_SIMD_CAST_4(SSE:: int_m, AVX2::ushort_m) { return AVX::concat(_mm_packs_epi16(x0.dataI(), x1.dataI()), _mm_packs_epi16(x2.dataI(), x3.dataI())); } +Vc_SIMD_CAST_4(SSE:: uint_m, AVX2:: short_m) { return AVX::concat(_mm_packs_epi16(x0.dataI(), x1.dataI()), _mm_packs_epi16(x2.dataI(), x3.dataI())); } +Vc_SIMD_CAST_4(SSE:: uint_m, AVX2::ushort_m) { return AVX::concat(_mm_packs_epi16(x0.dataI(), x1.dataI()), _mm_packs_epi16(x2.dataI(), x3.dataI())); } +#endif + +// 1 Scalar::Mask to 1 AVX2::Mask {{{2 +template +Vc_INTRINSIC Vc_CONST Return +simd_cast(Scalar::Mask k, enable_if::value>) +{ + Return r{false}; + r[0] = k.data(); + return r; +} + +// 2 Scalar::Mask to 1 AVX2::Mask {{{2 +template +Vc_INTRINSIC Vc_CONST Return +simd_cast(Scalar::Mask k0, Scalar::Mask k1, + enable_if::value>) +{ + Return r{false}; + r[0] = k0.data(); + r[1] = k1.data(); + return r; +} + +// 4 Scalar::Mask to 1 AVX2::Mask {{{2 +template +Vc_INTRINSIC Vc_CONST Return +simd_cast(Scalar::Mask k0, Scalar::Mask k1, Scalar::Mask k2, Scalar::Mask k3, + enable_if<(AVX2::is_mask::value && Return::Size >= 4)>) +{ + Return r{false}; + r[0] = k0.data(); + r[1] = k1.data(); + r[2] = k2.data(); + r[3] = k3.data(); + return r; +} + +// 8 Scalar::Mask to 1 AVX2::Mask {{{2 +template +Vc_INTRINSIC Vc_CONST Return +simd_cast(Scalar::Mask k0, Scalar::Mask k1, Scalar::Mask k2, Scalar::Mask k3, + Scalar::Mask k4, Scalar::Mask k5, Scalar::Mask k6, Scalar::Mask k7, + enable_if<(AVX2::is_mask::value && Return::Size >= 8)>) +{ + Return r{false}; + r[0] = k0.data(); + r[1] = k1.data(); + 
r[2] = k2.data(); + r[3] = k3.data(); + r[4] = k4.data(); + r[5] = k5.data(); + r[6] = k6.data(); + r[7] = k7.data(); + return r; +} + +// 16 Scalar::Mask to 1 AVX2::Mask {{{2 +template +Vc_INTRINSIC Vc_CONST Return +simd_cast(Scalar::Mask k0, Scalar::Mask k1, Scalar::Mask k2, Scalar::Mask k3, + Scalar::Mask k4, Scalar::Mask k5, Scalar::Mask k6, Scalar::Mask k7, + Scalar::Mask k8, Scalar::Mask k9, Scalar::Mask k10, + Scalar::Mask k11, Scalar::Mask k12, Scalar::Mask k13, + Scalar::Mask k14, Scalar::Mask k15, + enable_if<(AVX2::is_mask::value && Return::Size >= 16)>) +{ + Return r{false}; + r[0] = k0.data(); + r[1] = k1.data(); + r[2] = k2.data(); + r[3] = k3.data(); + r[4] = k4.data(); + r[5] = k5.data(); + r[6] = k6.data(); + r[7] = k7.data(); + r[8] = k8.data(); + r[9] = k9.data(); + r[10] = k10.data(); + r[11] = k11.data(); + r[12] = k12.data(); + r[13] = k13.data(); + r[14] = k14.data(); + r[15] = k15.data(); + return r; +} + +// 1 AVX2::Mask to 1 SSE::Mask {{{2 +Vc_SIMD_CAST_1(AVX2::double_m, SSE::double_m) { return AVX::lo128(x.data()); } +Vc_SIMD_CAST_1(AVX2::double_m, SSE:: float_m) { return _mm_packs_epi32(AVX::lo128(x.dataI()), AVX::hi128(x.dataI())); } +Vc_SIMD_CAST_1(AVX2::double_m, SSE:: int_m) { return _mm_packs_epi32(AVX::lo128(x.dataI()), AVX::hi128(x.dataI())); } +Vc_SIMD_CAST_1(AVX2::double_m, SSE:: uint_m) { return _mm_packs_epi32(AVX::lo128(x.dataI()), AVX::hi128(x.dataI())); } +Vc_SIMD_CAST_1(AVX2::double_m, SSE:: short_m) { return _mm_packs_epi16(_mm_packs_epi32(AVX::lo128(x.dataI()), AVX::hi128(x.dataI())), _mm_setzero_si128()); } +Vc_SIMD_CAST_1(AVX2::double_m, SSE::ushort_m) { return _mm_packs_epi16(_mm_packs_epi32(AVX::lo128(x.dataI()), AVX::hi128(x.dataI())), _mm_setzero_si128()); } + +Vc_SIMD_CAST_1(AVX2:: float_m, SSE::double_m) { return _mm_unpacklo_ps(AVX::lo128(x.data()), AVX::lo128(x.data())); } +Vc_SIMD_CAST_1(AVX2:: float_m, SSE:: float_m) { return AVX::lo128(x.data()); } +Vc_SIMD_CAST_1(AVX2:: float_m, SSE:: int_m) { return AVX::lo128(x.data()); } +Vc_SIMD_CAST_1(AVX2:: float_m, SSE:: uint_m) { return AVX::lo128(x.data()); } +Vc_SIMD_CAST_1(AVX2:: float_m, SSE:: short_m) { return _mm_packs_epi16(AVX::lo128(x.dataI()), AVX::hi128(x.dataI())); } +Vc_SIMD_CAST_1(AVX2:: float_m, SSE::ushort_m) { return _mm_packs_epi16(AVX::lo128(x.dataI()), AVX::hi128(x.dataI())); } + +#ifdef Vc_IMPL_AVX2 +Vc_SIMD_CAST_1(AVX2:: int_m, SSE::double_m) { return _mm_unpacklo_epi32(AVX::lo128(x.dataI()), AVX::lo128(x.dataI())); } +Vc_SIMD_CAST_1(AVX2:: int_m, SSE:: float_m) { return AVX::lo128(x.dataI()); } +Vc_SIMD_CAST_1(AVX2:: int_m, SSE:: int_m) { return AVX::lo128(x.dataI()); } +Vc_SIMD_CAST_1(AVX2:: int_m, SSE:: uint_m) { return AVX::lo128(x.dataI()); } +Vc_SIMD_CAST_1(AVX2:: int_m, SSE:: short_m) { return _mm_packs_epi16(AVX::lo128(x.dataI()), AVX::hi128(x.dataI())); } +Vc_SIMD_CAST_1(AVX2:: int_m, SSE::ushort_m) { return _mm_packs_epi16(AVX::lo128(x.dataI()), AVX::hi128(x.dataI())); } + +Vc_SIMD_CAST_1(AVX2:: uint_m, SSE::double_m) { return _mm_unpacklo_epi32(AVX::lo128(x.dataI()), AVX::lo128(x.dataI())); } +Vc_SIMD_CAST_1(AVX2:: uint_m, SSE:: float_m) { return AVX::lo128(x.dataI()); } +Vc_SIMD_CAST_1(AVX2:: uint_m, SSE:: int_m) { return AVX::lo128(x.dataI()); } +Vc_SIMD_CAST_1(AVX2:: uint_m, SSE:: uint_m) { return AVX::lo128(x.dataI()); } +Vc_SIMD_CAST_1(AVX2:: uint_m, SSE:: short_m) { return _mm_packs_epi16(AVX::lo128(x.dataI()), AVX::hi128(x.dataI())); } +Vc_SIMD_CAST_1(AVX2:: uint_m, SSE::ushort_m) { return _mm_packs_epi16(AVX::lo128(x.dataI()), AVX::hi128(x.dataI())); 
} + +Vc_SIMD_CAST_1(AVX2:: short_m, SSE::double_m) { return simd_cast(SSE::short_m(AVX::lo128(x.data()))); } +Vc_SIMD_CAST_1(AVX2:: short_m, SSE:: float_m) { return simd_cast(SSE::short_m(AVX::lo128(x.data()))); } +Vc_SIMD_CAST_1(AVX2:: short_m, SSE:: int_m) { return simd_cast(SSE::short_m(AVX::lo128(x.data()))); } +Vc_SIMD_CAST_1(AVX2:: short_m, SSE:: uint_m) { return simd_cast(SSE::short_m(AVX::lo128(x.data()))); } +Vc_SIMD_CAST_1(AVX2:: short_m, SSE:: short_m) { return simd_cast(SSE::short_m(AVX::lo128(x.data()))); } +Vc_SIMD_CAST_1(AVX2:: short_m, SSE::ushort_m) { return simd_cast(SSE::short_m(AVX::lo128(x.data()))); } + +Vc_SIMD_CAST_1(AVX2::ushort_m, SSE::double_m) { return simd_cast(SSE::ushort_m(AVX::lo128(x.data()))); } +Vc_SIMD_CAST_1(AVX2::ushort_m, SSE:: float_m) { return simd_cast(SSE::ushort_m(AVX::lo128(x.data()))); } +Vc_SIMD_CAST_1(AVX2::ushort_m, SSE:: int_m) { return simd_cast(SSE::ushort_m(AVX::lo128(x.data()))); } +Vc_SIMD_CAST_1(AVX2::ushort_m, SSE:: uint_m) { return simd_cast(SSE::ushort_m(AVX::lo128(x.data()))); } +Vc_SIMD_CAST_1(AVX2::ushort_m, SSE:: short_m) { return simd_cast(SSE::ushort_m(AVX::lo128(x.data()))); } +Vc_SIMD_CAST_1(AVX2::ushort_m, SSE::ushort_m) { return simd_cast(SSE::ushort_m(AVX::lo128(x.data()))); } +#endif + +// 2 AVX2::Mask to 1 SSE::Mask {{{2 +Vc_SIMD_CAST_2(AVX2::double_m, SSE:: short_m) { return _mm_packs_epi16(_mm_packs_epi32(AVX::lo128(x0.dataI()), AVX::hi128(x0.dataI())), _mm_packs_epi32(AVX::lo128(x1.dataI()), AVX::hi128(x1.dataI()))); } +Vc_SIMD_CAST_2(AVX2::double_m, SSE::ushort_m) { return _mm_packs_epi16(_mm_packs_epi32(AVX::lo128(x0.dataI()), AVX::hi128(x0.dataI())), _mm_packs_epi32(AVX::lo128(x1.dataI()), AVX::hi128(x1.dataI()))); } + +// 1 AVX2::Mask to 1 Scalar::Mask {{{2 +template +Vc_INTRINSIC Vc_CONST To +simd_cast(AVX2::Mask x, enable_if::value>) +{ + return static_cast(x[0]); +} + +// offset == 0 | convert from AVX2::Mask/Vector {{{1 +template +Vc_INTRINSIC Vc_CONST enable_if< + (offset == 0 && + ((AVX2::is_vector::value && !Scalar::is_vector::value && + Traits::is_simd_vector::value && !Traits::isSimdArray::value) || + (AVX2::is_mask::value && !Scalar::is_mask::value && + Traits::is_simd_mask::value && + !Traits::isSimdMaskArray::value))), + Return> +simd_cast(const From &x) +{ + return simd_cast(x); +} + +// offset == 0 | convert from SSE::Mask/Vector to AVX2::Mask/Vector {{{1 +template +Vc_INTRINSIC Vc_CONST Return +simd_cast(const From &x, + enable_if::value && + AVX2::is_vector::value) || + (SSE::is_mask::value && + AVX2::is_mask::value))>) +{ + return simd_cast(x); +} + +// Vector casts with offset {{{1 +// AVX2 to AVX2 {{{2 +template +Vc_INTRINSIC Vc_CONST enable_if<(AVX2::is_vector::value && offset != 0), + Return> + simd_cast(AVX2::Vector x) +{ + // TODO: there certainly is potential for leaving out the shift/permute + // instruction at the cost of a lot more specializations + using V = AVX2::Vector; + constexpr int shift = sizeof(T) * offset * Return::Size; + static_assert(shift > 0 && shift < sizeof(x), ""); + if (shift < 16) { + return simd_cast(V{AVX::avx_cast( + _mm_srli_si128(AVX::avx_cast<__m128i>(AVX::lo128(x.data())), shift))}); + } else if (shift == 16) { + return simd_cast(V{Mem::permute128(x.data())}); + } else { +#ifdef Vc_MSVC +#pragma warning(push) +#pragma warning(disable : 4556) // value of intrinsic immediate argument '-8' is out of + // range '0 - 255' +#endif + return simd_cast(V{AVX::avx_cast( + _mm_srli_si128(AVX::avx_cast<__m128i>(AVX::hi128(x.data())), shift - 16))}); +#ifdef Vc_MSVC 
+#pragma warning(pop) +#endif + } +} +// AVX2 to SSE (Vector) {{{2 +template +Vc_INTRINSIC Vc_CONST enable_if<(offset != 0 && SSE::is_vector::value && + sizeof(AVX2::Vector) == 32), + Return> + simd_cast(AVX2::Vector x) +{ + using V = AVX2::Vector; + constexpr int shift = sizeof(V) / V::Size * offset * Return::Size; + static_assert(shift > 0, ""); + static_assert(shift < sizeof(V), ""); + using SseVector = SSE::Vector; + if (shift == 16) { + return simd_cast(SseVector{AVX::hi128(x.data())}); + } + using Intrin = typename SseVector::VectorType; + return simd_cast(SseVector{AVX::avx_cast( + _mm_alignr_epi8(AVX::avx_cast<__m128i>(AVX::hi128(x.data())), + AVX::avx_cast<__m128i>(AVX::lo128(x.data())), shift))}); +} +template +Vc_INTRINSIC Vc_CONST enable_if<(offset != 0 && SSE::is_vector::value && + sizeof(AVX2::Vector) == 16), + Return> + simd_cast(AVX2::Vector x) +{ + using V = AVX2::Vector; + constexpr int shift = sizeof(V) / V::Size * offset * Return::Size; + static_assert(shift > 0, ""); + static_assert(shift < sizeof(V), ""); + using SseVector = SSE::Vector; + return simd_cast(SseVector{_mm_srli_si128(x.data(), shift)}); +} +// SSE to AVX2 {{{2 +Vc_SIMD_CAST_OFFSET(SSE:: short_v, AVX2::double_v, 1) { return simd_cast(simd_cast(x)); } +Vc_SIMD_CAST_OFFSET(SSE::ushort_v, AVX2::double_v, 1) { return simd_cast(simd_cast(x)); } + +// Mask casts with offset {{{1 +// 1 AVX2::Mask to N AVX2::Mask {{{2 +// float_v and (u)int_v have size 8, double_v has size 4, and (u)short_v have size 16. Consequently, +// offset can 0, 1, 2, or 3. +// - offset == 0 is already done. +// - offset == 1 can be 16 -> 8, 16 -> 4, 8 -> 4, and 16 -> 4 +// - offset == 2 && offset == 3 can only be 16 -> 4 +template +Vc_INTRINSIC Vc_CONST Return +simd_cast(const AVX2::Mask &k, + enable_if<(AVX2::is_mask::value && offset == 1 && + AVX2::Mask::Size == Return::Size * 2)> = nullarg) +{ + const auto tmp = AVX::hi128(k.dataI()); + return AVX::concat(_mm_unpacklo_epi8(tmp, tmp), _mm_unpackhi_epi8(tmp, tmp)); +} +template +Vc_INTRINSIC Vc_CONST Return +simd_cast(const AVX2::Mask &k, + enable_if<(AVX2::is_mask::value && offset == 1 && + AVX2::Mask::Size == Return::Size * 4)> = nullarg) +{ + auto tmp = AVX::lo128(k.dataI()); + tmp = _mm_unpackhi_epi8(tmp, tmp); + return AVX::concat(_mm_unpacklo_epi16(tmp, tmp), _mm_unpackhi_epi16(tmp, tmp)); +} +template +Vc_INTRINSIC Vc_CONST Return +simd_cast(const AVX2::Mask &k, + enable_if<(AVX2::is_mask::value && offset == 2 && + AVX2::Mask::Size == Return::Size * 4)> = nullarg) +{ + auto tmp = AVX::hi128(k.dataI()); + tmp = _mm_unpacklo_epi8(tmp, tmp); + return AVX::concat(_mm_unpacklo_epi16(tmp, tmp), _mm_unpackhi_epi16(tmp, tmp)); +} +template +Vc_INTRINSIC Vc_CONST Return +simd_cast(const AVX2::Mask &k, + enable_if<(AVX2::is_mask::value && offset == 3 && + AVX2::Mask::Size == Return::Size * 4)> = nullarg) +{ + auto tmp = AVX::hi128(k.dataI()); + tmp = _mm_unpackhi_epi8(tmp, tmp); + return AVX::concat(_mm_unpacklo_epi16(tmp, tmp), _mm_unpackhi_epi16(tmp, tmp)); +} + +// 1 SSE::Mask to N AVX2::Mask {{{2 +Vc_SIMD_CAST_OFFSET(SSE:: short_m, AVX2::double_m, 1) { auto tmp = _mm_unpackhi_epi16(x.dataI(), x.dataI()); return AVX::concat(_mm_unpacklo_epi32(tmp, tmp), _mm_unpackhi_epi32(tmp, tmp)); } +Vc_SIMD_CAST_OFFSET(SSE::ushort_m, AVX2::double_m, 1) { auto tmp = _mm_unpackhi_epi16(x.dataI(), x.dataI()); return AVX::concat(_mm_unpacklo_epi32(tmp, tmp), _mm_unpackhi_epi32(tmp, tmp)); } + +// AVX2 to SSE (Mask) {{{2 +template +Vc_INTRINSIC Vc_CONST enable_if<(offset != 0 && SSE::is_mask::value && + 
sizeof(AVX2::Mask) == 32), + Return> + simd_cast(AVX2::Mask x) +{ + using M = AVX2::Mask; + constexpr int shift = sizeof(M) / M::Size * offset * Return::Size; + static_assert(shift > 0, ""); + static_assert(shift < sizeof(M), ""); + using SseVector = SSE::Mask>; + if (shift == 16) { + return simd_cast(SseVector{AVX::hi128(x.data())}); + } + using Intrin = typename SseVector::VectorType; + return simd_cast(SseVector{AVX::avx_cast( + _mm_alignr_epi8(AVX::hi128(x.dataI()), AVX::lo128(x.dataI()), shift))}); +} + +template +Vc_INTRINSIC Vc_CONST enable_if<(offset != 0 && SSE::is_mask::value && + sizeof(AVX2::Mask) == 16), + Return> + simd_cast(AVX2::Mask x) +{ + return simd_cast(simd_cast>(x)); +} + +// undef Vc_SIMD_CAST_AVX_[1234] & Vc_SIMD_CAST_[12345678] {{{1 +#undef Vc_SIMD_CAST_AVX_1 +#undef Vc_SIMD_CAST_AVX_2 +#undef Vc_SIMD_CAST_AVX_3 +#undef Vc_SIMD_CAST_AVX_4 + +#undef Vc_SIMD_CAST_1 +#undef Vc_SIMD_CAST_2 +#undef Vc_SIMD_CAST_3 +#undef Vc_SIMD_CAST_4 +#undef Vc_SIMD_CAST_5 +#undef Vc_SIMD_CAST_6 +#undef Vc_SIMD_CAST_7 +#undef Vc_SIMD_CAST_8 + +#undef Vc_SIMD_CAST_OFFSET +// }}}1 + +} // namespace Vc + +#endif // VC_AVX_SIMD_CAST_H_ + +// vim: foldmethod=marker diff -Nru vc-0.7.4/avx/sorthelper.h vc-1.3.0/avx/sorthelper.h --- vc-0.7.4/avx/sorthelper.h 2014-05-15 06:38:24.000000000 -0500 +++ vc-1.3.0/avx/sorthelper.h 1969-12-31 18:00:00.000000000 -0600 @@ -1,45 +0,0 @@ -/* This file is part of the Vc library. - - Copyright (C) 2011 Matthias Kretz - - Vc is free software: you can redistribute it and/or modify - it under the terms of the GNU Lesser General Public License as - published by the Free Software Foundation, either version 3 of - the License, or (at your option) any later version. - - Vc is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with Vc. If not, see . - -*/ - -#ifndef VC_AVX_SORTHELPER_H -#define VC_AVX_SORTHELPER_H - -#include "types.h" - -/*OUTER_NAMESPACE_BEGIN*/ -namespace Vc -{ -namespace AVX -{ -template struct SortHelper -{ - typedef typename VectorTypeHelper::Type VectorType; -#ifdef VC_PASSING_VECTOR_BY_VALUE_IS_BROKEN - typedef const VectorType & VTArg; -#else - typedef const VectorType VTArg; -#endif - static VectorType sort(VTArg); - static void sort(VectorType &, VectorType &); -}; -} // namespace AVX -} // namespace Vc -/*OUTER_NAMESPACE_END*/ - -#endif // VC_AVX_SORTHELPER_H diff -Nru vc-0.7.4/avx/types.h vc-1.3.0/avx/types.h --- vc-0.7.4/avx/types.h 2014-05-15 06:38:24.000000000 -0500 +++ vc-1.3.0/avx/types.h 2016-10-27 02:05:02.000000000 -0500 @@ -1,111 +1,116 @@ -/* This file is part of the Vc library. +/* This file is part of the Vc library. {{{ +Copyright © 2009-2015 Matthias Kretz - Copyright (C) 2009-2012 Matthias Kretz +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. 
+ * Neither the names of contributing organizations nor the + names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY +DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - Vc is free software: you can redistribute it and/or modify - it under the terms of the GNU Lesser General Public License as - published by the Free Software Foundation, either version 3 of - the License, or (at your option) any later version. +}}}*/ - Vc is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU Lesser General Public License for more details. +#ifndef VC_AVX_TYPES_H_ +#define VC_AVX_TYPES_H_ - You should have received a copy of the GNU Lesser General Public - License along with Vc. If not, see . - -*/ - -#ifndef AVX_TYPES_H -#define AVX_TYPES_H - -#include "intrinsics.h" -#include "../common/storage.h" +#include "../sse/types.h" +#include "../traits/type_traits.h" #include "macros.h" -#define VC_DOUBLE_V_SIZE 4 -#define VC_FLOAT_V_SIZE 8 -#define VC_SFLOAT_V_SIZE 8 -#define VC_INT_V_SIZE 8 -#define VC_UINT_V_SIZE 8 -#define VC_SHORT_V_SIZE 8 -#define VC_USHORT_V_SIZE 8 - -#include "../common/types.h" +#ifdef Vc_DEFAULT_IMPL_AVX2 +#define Vc_DOUBLE_V_SIZE 4 +#define Vc_FLOAT_V_SIZE 8 +#define Vc_INT_V_SIZE 8 +#define Vc_UINT_V_SIZE 8 +#define Vc_SHORT_V_SIZE 16 +#define Vc_USHORT_V_SIZE 16 +#elif defined Vc_DEFAULT_IMPL_AVX +#define Vc_DOUBLE_V_SIZE 4 +#define Vc_FLOAT_V_SIZE 8 +#define Vc_INT_V_SIZE 4 +#define Vc_UINT_V_SIZE 4 +#define Vc_SHORT_V_SIZE 8 +#define Vc_USHORT_V_SIZE 8 +#endif -/*OUTER_NAMESPACE_BEGIN*/ -namespace Vc +namespace Vc_VERSIONED_NAMESPACE { namespace AVX { - template class Vector; +template using Vector = Vc::Vector>; +typedef Vector double_v; +typedef Vector float_v; +typedef Vector int_v; +typedef Vector uint_v; +typedef Vector short_v; +typedef Vector ushort_v; + +template using Mask = Vc::Mask>; +typedef Mask double_m; +typedef Mask float_m; +typedef Mask int_m; +typedef Mask uint_m; +typedef Mask short_m; +typedef Mask ushort_m; + +template struct Const; + +template struct is_vector : public std::false_type {}; +template struct is_vector> : public std::true_type {}; +template struct is_mask : public std::false_type {}; +template struct is_mask> : public std::true_type {}; +} // namespace AVX - template class Mask; +namespace AVX2 +{ +template using Vector = Vc::Vector; +using double_v = Vector; +using float_v = Vector< float>; +using int_v = Vector< int>; +using uint_v = Vector< uint>; +using short_v = Vector< short>; +using ushort_v = Vector; + +template using Mask = Vc::Mask; +using double_m = Mask; +using float_m = Mask< float>; +using llong_m = Mask< llong>; +using ullong_m = 
Mask; +using long_m = Mask< long>; +using ulong_m = Mask< ulong>; +using int_m = Mask< int>; +using uint_m = Mask< uint>; +using short_m = Mask< short>; +using ushort_m = Mask; +using schar_m = Mask< schar>; +using uchar_m = Mask< uchar>; + +template struct is_vector : public std::false_type {}; +template struct is_vector> : public std::true_type {}; +template struct is_mask : public std::false_type {}; +template struct is_mask> : public std::true_type {}; +} // namespace AVX2 - template struct VectorHelper {}; - template struct GatherHelper; - template struct ScatterHelper; - - template struct IndexTypeHelper; - template<> struct IndexTypeHelper< char > { typedef unsigned char Type; }; - template<> struct IndexTypeHelper { typedef unsigned char Type; }; - template<> struct IndexTypeHelper< short> { typedef unsigned short Type; }; - template<> struct IndexTypeHelper { typedef unsigned short Type; }; - template<> struct IndexTypeHelper< int > { typedef unsigned int Type; }; - template<> struct IndexTypeHelper { typedef unsigned int Type; }; - template<> struct IndexTypeHelper< float> { typedef unsigned int Type; }; - template<> struct IndexTypeHelper< sfloat> { typedef unsigned short Type; }; - template<> struct IndexTypeHelper< double> { typedef unsigned int Type; }; // _M128I based int32 would be nice - - template struct VectorTypeHelper; - template<> struct VectorTypeHelper< char > { typedef m128i Type; }; - template<> struct VectorTypeHelper { typedef m128i Type; }; - template<> struct VectorTypeHelper< short> { typedef m128i Type; }; - template<> struct VectorTypeHelper { typedef m128i Type; }; - template<> struct VectorTypeHelper< int > { typedef m256i Type; }; - template<> struct VectorTypeHelper { typedef m256i Type; }; - template<> struct VectorTypeHelper< float> { typedef m256 Type; }; - template<> struct VectorTypeHelper< sfloat> { typedef m256 Type; }; - template<> struct VectorTypeHelper< double> { typedef m256d Type; }; - - template struct SseVectorType; - template<> struct SseVectorType { typedef m128 Type; }; - template<> struct SseVectorType { typedef m128i Type; }; - template<> struct SseVectorType { typedef m128d Type; }; - template<> struct SseVectorType { typedef m128 Type; }; - template<> struct SseVectorType { typedef m128i Type; }; - template<> struct SseVectorType { typedef m128d Type; }; - - template struct HasVectorDivisionHelper { enum { Value = 1 }; }; - //template<> struct HasVectorDivisionHelper { enum { Value = 0 }; }; - - template struct VectorHelperSize; - -#ifdef VC_MSVC - // MSVC's __declspec(align(#)) only works with numbers, no enums or sizeof allowed ;( - template class _VectorAlignedBaseHack; - template<> class STRUCT_ALIGN1( 8) _VectorAlignedBaseHack< 8> {} STRUCT_ALIGN2( 8); - template<> class STRUCT_ALIGN1(16) _VectorAlignedBaseHack<16> {} STRUCT_ALIGN2(16); - template<> class STRUCT_ALIGN1(32) _VectorAlignedBaseHack<32> {} STRUCT_ALIGN2(32); - template<> class STRUCT_ALIGN1(64) _VectorAlignedBaseHack<64> {} STRUCT_ALIGN2(64); - template > - class VectorAlignedBaseT : public _VectorAlignedBaseHack - { - public: - FREE_STORE_OPERATORS_ALIGNED(sizeof(V)) - }; -#else - template > - class STRUCT_ALIGN1(sizeof(V)) VectorAlignedBaseT - { - public: - FREE_STORE_OPERATORS_ALIGNED(sizeof(V)) - } STRUCT_ALIGN2(sizeof(V)); -#endif -} // namespace AVX -} // namespace Vc -/*OUTER_NAMESPACE_END*/ -#include "undomacros.h" +namespace Traits +{ +template struct is_simd_mask_internal> : public std::true_type {}; +template struct is_simd_vector_internal> : public 
std::true_type {}; +} // namespace Traits +} // namespace Vc -#endif // AVX_TYPES_H +#endif // VC_AVX_TYPES_H_ diff -Nru vc-0.7.4/avx/undomacros.h vc-1.3.0/avx/undomacros.h --- vc-0.7.4/avx/undomacros.h 2014-05-15 06:38:24.000000000 -0500 +++ vc-1.3.0/avx/undomacros.h 1969-12-31 18:00:00.000000000 -0600 @@ -1,26 +0,0 @@ -/* This file is part of the Vc library. - - Copyright (C) 2009-2011 Matthias Kretz - - Vc is free software: you can redistribute it and/or modify - it under the terms of the GNU Lesser General Public License as - published by the Free Software Foundation, either version 3 of - the License, or (at your option) any later version. - - Vc is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with Vc. If not, see . - -*/ - -#ifndef VC_AVX_UNDOMACROS_H -#define VC_AVX_UNDOMACROS_H -#undef VC_AVX_MACROS_H - -#endif // VC_AVX_UNDOMACROS_H - -#include "../common/undomacros.h" diff -Nru vc-0.7.4/avx/vector.h vc-1.3.0/avx/vector.h --- vc-0.7.4/avx/vector.h 2014-05-15 06:38:24.000000000 -0500 +++ vc-1.3.0/avx/vector.h 2016-10-27 02:05:02.000000000 -0500 @@ -1,34 +1,44 @@ -/* This file is part of the Vc library. +/* This file is part of the Vc library. {{{ +Copyright © 2009-2015 Matthias Kretz - Copyright (C) 2009-2012 Matthias Kretz +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the names of contributing organizations nor the + names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY +DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - Vc is free software: you can redistribute it and/or modify - it under the terms of the GNU Lesser General Public License as - published by the Free Software Foundation, either version 3 of - the License, or (at your option) any later version. +}}}*/ - Vc is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with Vc. 
If not, see . - -*/ - -#ifndef AVX_VECTOR_H -#define AVX_VECTOR_H +#ifndef VC_AVX_VECTOR_H_ +#define VC_AVX_VECTOR_H_ #include "intrinsics.h" +#include "casts.h" +#include "../sse/vector.h" +#include "shuffle.h" #include "vectorhelper.h" #include "mask.h" -#include "writemaskedvector.h" -#include "sorthelper.h" #include #include #include "../common/aliasingentryhelper.h" #include "../common/memoryfwd.h" +#include "../common/where.h" #include "macros.h" #ifdef isfinite @@ -38,307 +48,292 @@ #undef isnan #endif -/*OUTER_NAMESPACE_BEGIN*/ -namespace Vc +namespace Vc_VERSIONED_NAMESPACE +{ +namespace Detail { -namespace AVX +template struct VectorTraits { -enum VectorAlignmentEnum { VectorAlignment = 32 }; + using mask_type = Vc::Mask; + using vector_type = Vc::Vector; + using writemasked_vector_type = Common::WriteMaskedVector; + using intrinsic_type = typename AVX::VectorTypeHelper::Type; +}; +} // namespace Detail -template class Vector +#define Vc_CURRENT_CLASS_NAME Vector +template class Vector { - public: - FREE_STORE_OPERATORS_ALIGNED(32) +public: + using abi = VectorAbi::Avx; - typedef typename VectorTypeHelper::Type VectorType; - typedef typename DetermineEntryType::Type EntryType; +private: + using traits_type = Detail::VectorTraits; + static_assert( + std::is_arithmetic::value, + "Vector only accepts arithmetic builtin types as template parameter T."); + + using WriteMaskedVector = typename traits_type::writemasked_vector_type; + +public: + using VectorType = typename traits_type::intrinsic_type; + using vector_type = VectorType; + + using mask_type = typename traits_type::mask_type; + using Mask = mask_type; + using MaskType = mask_type; + using MaskArg Vc_DEPRECATED_ALIAS("Use MaskArgument instead.") = typename Mask::AsArg; + using MaskArgument = typename Mask::AsArg; + using reference = Detail::ElementReference; + + Vc_FREE_STORE_OPERATORS_ALIGNED(alignof(VectorType)); + + Vc_ALIGNED_TYPEDEF(sizeof(T), T, EntryType); + using value_type = EntryType; + typedef EntryType VectorEntryType; + static constexpr size_t Size = sizeof(VectorType) / sizeof(EntryType); + static constexpr size_t MemoryAlignment = alignof(VectorType); enum Constants { - Size = sizeof(VectorType) / sizeof(EntryType), - HasVectorDivision = HasVectorDivisionHelper::Value + HasVectorDivision = AVX::HasVectorDivisionHelper::Value }; - typedef Vector::Type> IndexType; - typedef typename Vc::AVX::Mask Mask; - typedef typename Mask::AsArg MaskArg; - typedef Vc::Memory, Size> Memory; -#ifdef VC_PASSING_VECTOR_BY_VALUE_IS_BROKEN - typedef const Vector &AsArg; - typedef const VectorType &VectorTypeArg; +#ifdef Vc_IMPL_AVX2 + typedef typename std::conditional< + (Size >= 8), SimdArray, + typename std::conditional<(Size >= 4), SimdArray, + SimdArray>::type>::type + IndexType; #else - typedef Vector AsArg; - typedef VectorType VectorTypeArg; + typedef typename std::conditional<(Size >= 4), + SimdArray, + SimdArray>::type IndexType; #endif + typedef Vector AsArg; + typedef VectorType VectorTypeArg; protected: + template using V = Vector; + // helper that specializes on VectorType - typedef VectorHelper HV; + typedef AVX::VectorHelper HV; // helper that specializes on T - typedef VectorHelper HT; + typedef AVX::VectorHelper HT; // cast any m256/m128 to VectorType - static Vc_INTRINSIC VectorType _cast(param128 v) { return avx_cast(v); } - static Vc_INTRINSIC VectorType _cast(param128i v) { return avx_cast(v); } - static Vc_INTRINSIC VectorType _cast(param128d v) { return avx_cast(v); } - static Vc_INTRINSIC VectorType 
_cast(param256 v) { return avx_cast(v); } - static Vc_INTRINSIC VectorType _cast(param256i v) { return avx_cast(v); } - static Vc_INTRINSIC VectorType _cast(param256d v) { return avx_cast(v); } + template static Vc_INTRINSIC VectorType _cast(V v) + { + return AVX::avx_cast(v); + } -#ifdef VC_UNCONDITIONAL_AVX2_INTRINSICS - typedef Common::VectorMemoryUnion StorageType; -#else typedef Common::VectorMemoryUnion StorageType; -#endif StorageType d; + using WidthT = Common::WidthT; + // ICC can't compile this: + // static constexpr WidthT Width = WidthT(); + public: - /////////////////////////////////////////////////////////////////////////////////////////// - // uninitialized - Vc_ALWAYS_INLINE Vector() {} +#include "../common/generalinterface.h" - /////////////////////////////////////////////////////////////////////////////////////////// - // constants - explicit Vc_ALWAYS_INLINE_L Vector(VectorSpecialInitializerZero::ZEnum) Vc_ALWAYS_INLINE_R; - explicit Vc_ALWAYS_INLINE_L Vector(VectorSpecialInitializerOne::OEnum) Vc_ALWAYS_INLINE_R; - explicit Vc_ALWAYS_INLINE_L Vector(VectorSpecialInitializerIndexesFromZero::IEnum) Vc_ALWAYS_INLINE_R; - static Vc_INTRINSIC_L Vc_CONST_L Vector Zero() Vc_INTRINSIC_R Vc_CONST_R; - static Vc_INTRINSIC_L Vc_CONST_L Vector One() Vc_INTRINSIC_R Vc_CONST_R; - static Vc_INTRINSIC_L Vc_CONST_L Vector IndexesFromZero() Vc_INTRINSIC_R Vc_CONST_R; static Vc_ALWAYS_INLINE_L Vector Random() Vc_ALWAYS_INLINE_R; /////////////////////////////////////////////////////////////////////////////////////////// // internal: required to enable returning objects of VectorType Vc_ALWAYS_INLINE Vector(VectorTypeArg x) : d(x) {} -#ifdef VC_UNCONDITIONAL_AVX2_INTRINSICS - Vc_ALWAYS_INLINE Vector(typename VectorType::Base x) : d(x) {} -#endif - - /////////////////////////////////////////////////////////////////////////////////////////// - // static_cast / copy ctor - template explicit Vector(VC_ALIGNED_PARAMETER(Vector) x); - - // implicit cast - template Vc_INTRINSIC_L Vector &operator=(const Vector &x) Vc_INTRINSIC_R; - // copy assignment - Vc_ALWAYS_INLINE Vector &operator=(AsArg v) { d.v() = v.d.v(); return *this; } + // implict conversion from compatible Vector + template + Vc_INTRINSIC Vector( + V x, typename std::enable_if::value, + void *>::type = nullptr) + : d(AVX::convert(x.data())) + { + } + +#if Vc_IS_VERSION_1 + // static_cast from the remaining Vector + template + Vc_DEPRECATED("use simd_cast instead of explicit type casting to convert between " + "vector types") Vc_INTRINSIC explicit Vector( + V x, + typename std::enable_if::value, + void *>::type = nullptr) + : d(Detail::zeroExtendIfNeeded(AVX::convert(x.data()))) + { + } + + // static_cast from other types, implemented via the non-member simd_cast function in + // simd_cast_caller.tcc + template ::value && + !std::is_same>::value>> + Vc_DEPRECATED("use simd_cast instead of explicit type casting to convert between " + "vector types") Vc_INTRINSIC_L + explicit Vector(U &&x) Vc_INTRINSIC_R; +#endif /////////////////////////////////////////////////////////////////////////////////////////// // broadcast - explicit Vc_ALWAYS_INLINE_L Vector(EntryType a) Vc_ALWAYS_INLINE_R; - template Vc_INTRINSIC Vector(TT x, VC_EXACT_TYPE(TT, EntryType, void *) = 0) : d(HT::set(x)) {} - Vc_ALWAYS_INLINE Vector &operator=(EntryType a) { d.v() = HT::set(a); return *this; } - - /////////////////////////////////////////////////////////////////////////////////////////// - // load ctors - explicit Vc_INTRINSIC_L - Vector(const EntryType *x) 
Vc_INTRINSIC_R; - template Vc_INTRINSIC_L - Vector(const EntryType *x, Alignment align) Vc_INTRINSIC_R; - template explicit Vc_INTRINSIC_L - Vector(const OtherT *x) Vc_INTRINSIC_R; - template Vc_INTRINSIC_L - Vector(const OtherT *x, Alignment align) Vc_INTRINSIC_R; - - /////////////////////////////////////////////////////////////////////////////////////////// - // load member functions - Vc_INTRINSIC_L - void load(const EntryType *mem) Vc_INTRINSIC_R; - template Vc_INTRINSIC_L - void load(const EntryType *mem, Alignment align) Vc_INTRINSIC_R; - template Vc_INTRINSIC_L - void load(const OtherT *mem) Vc_INTRINSIC_R; - template Vc_INTRINSIC_L - void load(const OtherT *mem, Alignment align) Vc_INTRINSIC_R; + Vc_INTRINSIC Vector(EntryType a) : d(Detail::avx_broadcast(a)) {} + template + Vc_INTRINSIC Vector(U a, + typename std::enable_if::value && + !std::is_same::value, + void *>::type = nullptr) + : Vector(static_cast(a)) + { + } + + //template + explicit Vector(std::initializer_list) + { + static_assert(std::is_same::value, + "A SIMD vector object cannot be initialized from an initializer list " + "because the number of entries in the vector is target-dependent."); + } - /////////////////////////////////////////////////////////////////////////////////////////// - // expand/merge 1 float_v <=> 2 double_v XXX rationale? remove it for release? XXX - explicit inline Vector(const Vector *a); - inline void expand(Vector *x) const; +#include "../common/loadinterface.h" +#include "../common/storeinterface.h" /////////////////////////////////////////////////////////////////////////////////////////// // zeroing Vc_INTRINSIC_L void setZero() Vc_INTRINSIC_R; Vc_INTRINSIC_L void setZero(const Mask &k) Vc_INTRINSIC_R; + Vc_INTRINSIC_L void setZeroInverted(const Mask &k) Vc_INTRINSIC_R; Vc_INTRINSIC_L void setQnan() Vc_INTRINSIC_R; - Vc_INTRINSIC_L void setQnan(MaskArg k) Vc_INTRINSIC_R; - - /////////////////////////////////////////////////////////////////////////////////////////// - // stores - Vc_INTRINSIC_L void store(EntryType *mem) const Vc_INTRINSIC_R; - Vc_INTRINSIC_L void store(EntryType *mem, const Mask &mask) const Vc_INTRINSIC_R; - template Vc_INTRINSIC_L void store(EntryType *mem, A align) const Vc_INTRINSIC_R; - template Vc_INTRINSIC_L void store(EntryType *mem, const Mask &mask, A align) const Vc_INTRINSIC_R; - - /////////////////////////////////////////////////////////////////////////////////////////// - // swizzles - Vc_INTRINSIC_L Vc_PURE_L const Vector &abcd() const Vc_INTRINSIC_R Vc_PURE_R; - Vc_INTRINSIC_L Vc_PURE_L const Vector cdab() const Vc_INTRINSIC_R Vc_PURE_R; - Vc_INTRINSIC_L Vc_PURE_L const Vector badc() const Vc_INTRINSIC_R Vc_PURE_R; - Vc_INTRINSIC_L Vc_PURE_L const Vector aaaa() const Vc_INTRINSIC_R Vc_PURE_R; - Vc_INTRINSIC_L Vc_PURE_L const Vector bbbb() const Vc_INTRINSIC_R Vc_PURE_R; - Vc_INTRINSIC_L Vc_PURE_L const Vector cccc() const Vc_INTRINSIC_R Vc_PURE_R; - Vc_INTRINSIC_L Vc_PURE_L const Vector dddd() const Vc_INTRINSIC_R Vc_PURE_R; - Vc_INTRINSIC_L Vc_PURE_L const Vector bcad() const Vc_INTRINSIC_R Vc_PURE_R; - Vc_INTRINSIC_L Vc_PURE_L const Vector bcda() const Vc_INTRINSIC_R Vc_PURE_R; - Vc_INTRINSIC_L Vc_PURE_L const Vector dabc() const Vc_INTRINSIC_R Vc_PURE_R; - Vc_INTRINSIC_L Vc_PURE_L const Vector acbd() const Vc_INTRINSIC_R Vc_PURE_R; - Vc_INTRINSIC_L Vc_PURE_L const Vector dbca() const Vc_INTRINSIC_R Vc_PURE_R; - Vc_INTRINSIC_L Vc_PURE_L const Vector dcba() const Vc_INTRINSIC_R Vc_PURE_R; + Vc_INTRINSIC_L void setQnan(MaskArgument k) Vc_INTRINSIC_R; - 
/////////////////////////////////////////////////////////////////////////////////////////// - // gathers - template Vector(const EntryType *mem, const IndexT *indexes); - template Vector(const EntryType *mem, VC_ALIGNED_PARAMETER(Vector) indexes); - template Vector(const EntryType *mem, const IndexT *indexes, MaskArg mask); - template Vector(const EntryType *mem, VC_ALIGNED_PARAMETER(Vector) indexes, MaskArg mask); - template Vector(const S1 *array, const EntryType S1::* member1, VC_ALIGNED_PARAMETER(IT) indexes); - template Vector(const S1 *array, const EntryType S1::* member1, VC_ALIGNED_PARAMETER(IT) indexes, MaskArg mask); - template Vector(const S1 *array, const S2 S1::* member1, const EntryType S2::* member2, VC_ALIGNED_PARAMETER(IT) indexes); - template Vector(const S1 *array, const S2 S1::* member1, const EntryType S2::* member2, VC_ALIGNED_PARAMETER(IT) indexes, MaskArg mask); - template Vector(const S1 *array, const EntryType *const S1::* ptrMember1, VC_ALIGNED_PARAMETER(IT1) outerIndexes, VC_ALIGNED_PARAMETER(IT2) innerIndexes); - template Vector(const S1 *array, const EntryType *const S1::* ptrMember1, VC_ALIGNED_PARAMETER(IT1) outerIndexes, VC_ALIGNED_PARAMETER(IT2) innerIndexes, MaskArg mask); - template void gather(const EntryType *mem, VC_ALIGNED_PARAMETER(Index) indexes); - template void gather(const EntryType *mem, VC_ALIGNED_PARAMETER(Index) indexes, MaskArg mask); -#ifdef VC_USE_SET_GATHERS - template void gather(const EntryType *mem, VC_ALIGNED_PARAMETER(Vector) indexes, MaskArg mask); -#endif - template void gather(const S1 *array, const EntryType S1::* member1, VC_ALIGNED_PARAMETER(IT) indexes); - template void gather(const S1 *array, const EntryType S1::* member1, VC_ALIGNED_PARAMETER(IT) indexes, MaskArg mask); - template void gather(const S1 *array, const S2 S1::* member1, const EntryType S2::* member2, VC_ALIGNED_PARAMETER(IT) indexes); - template void gather(const S1 *array, const S2 S1::* member1, const EntryType S2::* member2, VC_ALIGNED_PARAMETER(IT) indexes, MaskArg mask); - template void gather(const S1 *array, const EntryType *const S1::* ptrMember1, VC_ALIGNED_PARAMETER(IT1) outerIndexes, VC_ALIGNED_PARAMETER(IT2) innerIndexes); - template void gather(const S1 *array, const EntryType *const S1::* ptrMember1, VC_ALIGNED_PARAMETER(IT1) outerIndexes, VC_ALIGNED_PARAMETER(IT2) innerIndexes, MaskArg mask); - - /////////////////////////////////////////////////////////////////////////////////////////// - // scatters - template void scatter(EntryType *mem, VC_ALIGNED_PARAMETER(Index) indexes) const; - template void scatter(EntryType *mem, VC_ALIGNED_PARAMETER(Index) indexes, MaskArg mask) const; - template void scatter(S1 *array, EntryType S1::* member1, VC_ALIGNED_PARAMETER(IT) indexes) const; - template void scatter(S1 *array, EntryType S1::* member1, VC_ALIGNED_PARAMETER(IT) indexes, MaskArg mask) const; - template void scatter(S1 *array, S2 S1::* member1, EntryType S2::* member2, VC_ALIGNED_PARAMETER(IT) indexes) const; - template void scatter(S1 *array, S2 S1::* member1, EntryType S2::* member2, VC_ALIGNED_PARAMETER(IT) indexes, MaskArg mask) const; - template void scatter(S1 *array, EntryType *S1::* ptrMember1, VC_ALIGNED_PARAMETER(IT1) outerIndexes, VC_ALIGNED_PARAMETER(IT2) innerIndexes) const; - template void scatter(S1 *array, EntryType *S1::* ptrMember1, VC_ALIGNED_PARAMETER(IT1) outerIndexes, VC_ALIGNED_PARAMETER(IT2) innerIndexes, MaskArg mask) const; +#include "../common/gatherinterface.h" +#include "../common/scatterinterface.h" 
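+    // The two includes above supply the member gather()/scatter() interface with the
+    // same user-facing signatures as the inline declarations they replace in this hunk
+    // (gather(mem, indexes), gather(mem, indexes, mask), scatter(mem, indexes), ...).
+    // A minimal usage sketch, assuming a float buffer `mem` and an index vector `idx`
+    // of matching size (both names are illustrative only, not part of this header):
+    //
+    //   Vc::float_v v;
+    //   v.gather(mem, idx);           // v[i] = mem[idx[i]] for each lane i
+    //   v.scatter(mem, idx);          // mem[idx[i]] = v[i] for each lane i
+    //   v.gather(mem, idx, v > 1.f);  // masked variant: only lanes where the mask is true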
/////////////////////////////////////////////////////////////////////////////////////////// //prefix - Vc_ALWAYS_INLINE Vector &operator++() { data() = VectorHelper::add(data(), VectorHelper::one()); return *this; } - Vc_ALWAYS_INLINE Vector &operator--() { data() = VectorHelper::sub(data(), VectorHelper::one()); return *this; } + Vc_ALWAYS_INLINE Vector &operator++() { data() = Detail::add(data(), Detail::one(T()), T()); return *this; } + Vc_ALWAYS_INLINE Vector &operator--() { data() = Detail::sub(data(), Detail::one(T()), T()); return *this; } //postfix - Vc_ALWAYS_INLINE Vector operator++(int) { const Vector r = *this; data() = VectorHelper::add(data(), VectorHelper::one()); return r; } - Vc_ALWAYS_INLINE Vector operator--(int) { const Vector r = *this; data() = VectorHelper::sub(data(), VectorHelper::one()); return r; } + Vc_ALWAYS_INLINE Vector operator++(int) { const Vector r = *this; data() = Detail::add(data(), Detail::one(T()), T()); return r; } + Vc_ALWAYS_INLINE Vector operator--(int) { const Vector r = *this; data() = Detail::sub(data(), Detail::one(T()), T()); return r; } - Vc_INTRINSIC Common::AliasingEntryHelper operator[](size_t index) { -#if defined(VC_GCC) && VC_GCC >= 0x40300 && VC_GCC < 0x40400 - ::Vc::Warnings::_operator_bracket_warning(); -#endif - return d.m(index); + private: + friend reference; + Vc_INTRINSIC static value_type get(const Vector &o, int i) noexcept + { + return o.d.m(i); + } + template + Vc_INTRINSIC static void set(Vector &o, int i, U &&v) noexcept( + noexcept(std::declval() = v)) + { + return o.d.set(i, v); } - Vc_ALWAYS_INLINE EntryType operator[](size_t index) const { + + public: + Vc_ALWAYS_INLINE reference operator[](size_t index) noexcept + { + static_assert(noexcept(reference{std::declval(), int()}), ""); + return {*this, int(index)}; + } + Vc_ALWAYS_INLINE value_type operator[](size_t index) const noexcept + { return d.m(index); } - Vc_ALWAYS_INLINE Vector operator~() const { return VectorHelper::andnot_(data(), VectorHelper::allone()); } - Vc_ALWAYS_INLINE_L Vc_PURE_L Vector::Type> operator-() const Vc_ALWAYS_INLINE_R Vc_PURE_R; + Vc_INTRINSIC_L Vc_PURE_L Vector operator[](Permutation::ReversedTag) const Vc_INTRINSIC_R Vc_PURE_R; + Vc_INTRINSIC_L Vc_PURE_L Vector operator[](const IndexType &perm) const Vc_INTRINSIC_R Vc_PURE_R; + + Vc_INTRINSIC Vc_PURE Mask operator!() const + { + return *this == Zero(); + } + Vc_ALWAYS_INLINE Vector operator~() const + { +#ifndef Vc_ENABLE_FLOAT_BIT_OPERATORS + static_assert(std::is_integral::value, + "bit-complement can only be used with Vectors of integral type"); +#endif + return Detail::andnot_(data(), Detail::allone()); + } + Vc_ALWAYS_INLINE_L Vc_PURE_L Vector operator-() const Vc_ALWAYS_INLINE_R Vc_PURE_R; Vc_INTRINSIC Vc_PURE Vector operator+() const { return *this; } -#define OP1(fun) \ - Vc_ALWAYS_INLINE Vector fun() const { return Vector(VectorHelper::fun(data())); } \ - Vc_ALWAYS_INLINE Vector &fun##_eq() { data() = VectorHelper::fun(data()); return *this; } - OP1(sqrt) - OP1(abs) -#undef OP1 - -#define OP(symbol, fun) \ - Vc_ALWAYS_INLINE Vector &operator symbol##=(const Vector &x) { data() = VectorHelper::fun(data(), x.data()); return *this; } \ - Vc_ALWAYS_INLINE Vector &operator symbol##=(EntryType x) { return operator symbol##=(Vector(x)); } \ - Vc_ALWAYS_INLINE Vector operator symbol(const Vector &x) const { return Vector(VectorHelper::fun(data(), x.data())); } \ - template Vc_ALWAYS_INLINE VC_EXACT_TYPE(TT, EntryType, Vector) operator symbol(TT x) const { return operator 
symbol(Vector(x)); } - - OP(+, add) - OP(-, sub) - OP(*, mul) -#undef OP - inline Vector &operator/=(EntryType x); - template inline Vc_PURE_L VC_EXACT_TYPE(TT, EntryType, Vector) operator/(TT x) const Vc_PURE_R; - inline Vector &operator/=(const Vector &x); - inline Vc_PURE_L Vector operator/ (const Vector &x) const Vc_PURE_R; - - // bitwise ops -#define OP_VEC(op) \ - Vc_ALWAYS_INLINE_L Vector &operator op##=(AsArg x) Vc_ALWAYS_INLINE_R; \ - Vc_ALWAYS_INLINE_L Vc_PURE_L Vector operator op (AsArg x) const Vc_ALWAYS_INLINE_R Vc_PURE_R; -#define OP_ENTRY(op) \ - Vc_ALWAYS_INLINE Vector &operator op##=(EntryType x) { return operator op##=(Vector(x)); } \ - template Vc_ALWAYS_INLINE Vc_PURE VC_EXACT_TYPE(TT, EntryType, Vector) operator op(TT x) const { return operator op(Vector(x)); } - VC_ALL_BINARY(OP_VEC) - VC_ALL_BINARY(OP_ENTRY) - VC_ALL_SHIFTS(OP_VEC) -#undef OP_VEC -#undef OP_ENTRY - - Vc_ALWAYS_INLINE_L Vector &operator>>=(int x) Vc_ALWAYS_INLINE_R; - Vc_ALWAYS_INLINE_L Vector &operator<<=(int x) Vc_ALWAYS_INLINE_R; - Vc_ALWAYS_INLINE_L Vector operator>>(int x) const Vc_ALWAYS_INLINE_R; - Vc_ALWAYS_INLINE_L Vector operator<<(int x) const Vc_ALWAYS_INLINE_R; - -#define OPcmp(symbol, fun) \ - Vc_ALWAYS_INLINE Mask operator symbol(AsArg x) const { return VectorHelper::fun(data(), x.data()); } \ - template Vc_ALWAYS_INLINE VC_EXACT_TYPE(TT, EntryType, Mask) operator symbol(TT x) const { return operator symbol(Vector(x)); } - - OPcmp(==, cmpeq) - OPcmp(!=, cmpneq) - OPcmp(>=, cmpnlt) - OPcmp(>, cmpnle) - OPcmp(<, cmplt) - OPcmp(<=, cmple) -#undef OPcmp - Vc_INTRINSIC_L Vc_PURE_L Mask isNegative() const Vc_PURE_R Vc_INTRINSIC_R; - - Vc_ALWAYS_INLINE void fusedMultiplyAdd(const Vector &factor, const Vector &summand) { - VectorHelper::fma(data(), factor.data(), summand.data()); - } - - Vc_ALWAYS_INLINE void assign( const Vector &v, const Mask &mask ) { - const VectorType k = avx_cast(mask.data()); - data() = VectorHelper::blend(data(), v.data(), k); - } - - template Vc_ALWAYS_INLINE V2 staticCast() const { return V2(*this); } - template Vc_ALWAYS_INLINE V2 reinterpretCast() const { return avx_cast(data()); } - - Vc_ALWAYS_INLINE WriteMaskedVector operator()(const Mask &k) { return WriteMaskedVector(this, k); } - - /** - * \return \p true This vector was completely filled. m2 might be 0 or != 0. You still have - * to test this. - * \p false This vector was not completely filled. m2 is all 0. 
- */ - //inline bool pack(Mask &m1, Vector &v2, Mask &m2) { - //return VectorHelper::pack(data(), m1.data, v2.data(), m2.data); - //} + // shifts +#define Vc_OP_VEC(op) \ + Vc_INTRINSIC Vector &operator op##=(AsArg x); \ + Vc_INTRINSIC Vc_PURE Vector operator op(AsArg x) const \ + { \ + static_assert( \ + std::is_integral::value, \ + "bitwise-operators can only be used with Vectors of integral type"); \ + } + Vc_ALL_SHIFTS(Vc_OP_VEC); +#undef Vc_OP_VEC + + Vc_ALWAYS_INLINE_L Vector &operator>>=(int x) Vc_ALWAYS_INLINE_R; + Vc_ALWAYS_INLINE_L Vector &operator<<=(int x) Vc_ALWAYS_INLINE_R; + Vc_ALWAYS_INLINE_L Vector operator>>(int x) const Vc_ALWAYS_INLINE_R; + Vc_ALWAYS_INLINE_L Vector operator<<(int x) const Vc_ALWAYS_INLINE_R; + + Vc_DEPRECATED("use isnegative(x) instead") Vc_INTRINSIC Vc_PURE Mask + isNegative() const + { + return Vc::isnegative(*this); + } + + Vc_ALWAYS_INLINE void assign( const Vector &v, const Mask &mask ) { + const VectorType k = _cast(mask.data()); + data() = Detail::blend(data(), v.data(), k); + } + + template + Vc_DEPRECATED("Use simd_cast instead of Vector::staticCast") Vc_ALWAYS_INLINE V2 + staticCast() const + { + return V2(*this); + } + template + Vc_DEPRECATED("use reinterpret_components_cast instead") Vc_ALWAYS_INLINE V2 + reinterpretCast() const + { + return AVX::avx_cast(data()); + } + + Vc_ALWAYS_INLINE WriteMaskedVector operator()(const Mask &k) + { + return {*this, k}; + } Vc_ALWAYS_INLINE VectorType &data() { return d.v(); } - Vc_ALWAYS_INLINE const VectorType data() const { return d.v(); } + Vc_ALWAYS_INLINE const VectorType &data() const { return d.v(); } + + template + Vc_INTRINSIC_L Vector broadcast() const Vc_INTRINSIC_R; + + Vc_INTRINSIC_L std::pair minIndex() const Vc_INTRINSIC_R; + Vc_INTRINSIC_L std::pair maxIndex() const Vc_INTRINSIC_R; - Vc_ALWAYS_INLINE EntryType min() const { return VectorHelper::min(data()); } - Vc_ALWAYS_INLINE EntryType max() const { return VectorHelper::max(data()); } - Vc_ALWAYS_INLINE EntryType product() const { return VectorHelper::mul(data()); } - Vc_ALWAYS_INLINE EntryType sum() const { return VectorHelper::add(data()); } - Vc_ALWAYS_INLINE_L EntryType min(MaskArg m) const Vc_ALWAYS_INLINE_R; - Vc_ALWAYS_INLINE_L EntryType max(MaskArg m) const Vc_ALWAYS_INLINE_R; - Vc_ALWAYS_INLINE_L EntryType product(MaskArg m) const Vc_ALWAYS_INLINE_R; - Vc_ALWAYS_INLINE_L EntryType sum(MaskArg m) const Vc_ALWAYS_INLINE_R; + Vc_ALWAYS_INLINE EntryType min() const { return Detail::min(data(), T()); } + Vc_ALWAYS_INLINE EntryType max() const { return Detail::max(data(), T()); } + Vc_ALWAYS_INLINE EntryType product() const { return Detail::mul(data(), T()); } + Vc_ALWAYS_INLINE EntryType sum() const { return Detail::add(data(), T()); } + Vc_ALWAYS_INLINE_L Vector partialSum() const Vc_ALWAYS_INLINE_R; + //template Vc_ALWAYS_INLINE_L Vector partialSum(BinaryOperation op) const Vc_ALWAYS_INLINE_R; + Vc_ALWAYS_INLINE_L EntryType min(MaskArgument m) const Vc_ALWAYS_INLINE_R; + Vc_ALWAYS_INLINE_L EntryType max(MaskArgument m) const Vc_ALWAYS_INLINE_R; + Vc_ALWAYS_INLINE_L EntryType product(MaskArgument m) const Vc_ALWAYS_INLINE_R; + Vc_ALWAYS_INLINE_L EntryType sum(MaskArgument m) const Vc_ALWAYS_INLINE_R; + Vc_INTRINSIC_L Vector shifted(int amount, Vector shiftIn) const Vc_INTRINSIC_R; Vc_INTRINSIC_L Vector shifted(int amount) const Vc_INTRINSIC_R; Vc_INTRINSIC_L Vector rotated(int amount) const Vc_INTRINSIC_R; - Vc_ALWAYS_INLINE Vector sorted() const { return SortHelper::sort(data()); } + Vc_INTRINSIC_L Vc_PURE_L Vector reversed() 
const Vc_INTRINSIC_R Vc_PURE_R; + Vc_ALWAYS_INLINE_L Vc_PURE_L Vector sorted() const Vc_ALWAYS_INLINE_R Vc_PURE_R; - template void callWithValuesSorted(F &f) { + template void callWithValuesSorted(F &&f) + { EntryType value = d.m(0); f(value); - for (int i = 1; i < Size; ++i) { + for (size_t i = 1; i < Size; ++i) { if (d.m(i) != value) { value = d.m(i); f(value); @@ -346,121 +341,119 @@ } } - template Vc_INTRINSIC void call(const F &f) const { - for_all_vector_entries(i, - f(EntryType(d.m(i))); - ); - } - template Vc_INTRINSIC void call(F &f) const { - for_all_vector_entries(i, - f(EntryType(d.m(i))); - ); + template Vc_INTRINSIC void call(F &&f) const + { + Common::for_all_vector_entries([&](size_t i) { f(EntryType(d.m(i))); }); } - template Vc_INTRINSIC void call(const F &f, const Mask &mask) const { - Vc_foreach_bit(size_t i, mask) { - f(EntryType(d.m(i))); - } - } - template Vc_INTRINSIC void call(F &f, const Mask &mask) const { - Vc_foreach_bit(size_t i, mask) { + template Vc_INTRINSIC void call(F &&f, const Mask &mask) const + { + for (size_t i : where(mask)) { f(EntryType(d.m(i))); } } - template Vc_INTRINSIC Vector apply(const F &f) const { - Vector r; - for_all_vector_entries(i, - r.d.m(i) = f(EntryType(d.m(i))); - ); - return r; - } - template Vc_INTRINSIC Vector apply(F &f) const { - Vector r; - for_all_vector_entries(i, - r.d.m(i) = f(EntryType(d.m(i))); - ); + template Vc_INTRINSIC Vector apply(F &&f) const + { + Vector r; + Common::for_all_vector_entries( + [&](size_t i) { r.d.set(i, f(EntryType(d.m(i)))); }); return r; } - template Vc_INTRINSIC Vector apply(const F &f, const Mask &mask) const { - Vector r(*this); - Vc_foreach_bit (size_t i, mask) { - r.d.m(i) = f(EntryType(r.d.m(i))); - } - return r; - } - template Vc_INTRINSIC Vector apply(F &f, const Mask &mask) const { - Vector r(*this); - Vc_foreach_bit (size_t i, mask) { - r.d.m(i) = f(EntryType(r.d.m(i))); + template Vc_INTRINSIC Vector apply(F &&f, const Mask &mask) const + { + Vector r(*this); + for (size_t i : where(mask)) { + r.d.set(i, f(EntryType(r.d.m(i)))); } return r; } template Vc_INTRINSIC void fill(EntryType (&f)(IndexT)) { - for_all_vector_entries(i, - d.m(i) = f(i); - ); + Common::for_all_vector_entries([&](size_t i) { d.set(i, f(i)); }); } Vc_INTRINSIC void fill(EntryType (&f)()) { - for_all_vector_entries(i, - d.m(i) = f(); - ); + Common::for_all_vector_entries([&](size_t i) { d.set(i, f()); }); } - Vc_INTRINSIC_L Vector copySign(AsArg reference) const Vc_INTRINSIC_R; - Vc_INTRINSIC_L Vector exponent() const Vc_INTRINSIC_R; + template static Vc_INTRINSIC_L Vector generate(G gen) Vc_INTRINSIC_R; + + Vc_DEPRECATED("use copysign(x, y) instead") Vc_INTRINSIC Vector + copySign(AsArg reference) const + { + return Vc::copysign(*this, reference); + } + + Vc_DEPRECATED("use exponent(x) instead") Vc_INTRINSIC Vector exponent() const + { + Vc::exponent(*this); + } + + Vc_INTRINSIC_L Vector interleaveLow(Vector x) const Vc_INTRINSIC_R; + Vc_INTRINSIC_L Vector interleaveHigh(Vector x) const Vc_INTRINSIC_R; }; +#undef Vc_CURRENT_CLASS_NAME +template constexpr size_t Vector::Size; +template constexpr size_t Vector::MemoryAlignment; + +static_assert(Traits::is_simd_vector::value, "is_simd_vector::value"); +static_assert(Traits::is_simd_vector::value, "is_simd_vector< float_v>::value"); +static_assert(Traits::is_simd_vector::value, "is_simd_vector< int_v>::value"); +static_assert(Traits::is_simd_vector::value, "is_simd_vector< uint_v>::value"); +static_assert(Traits::is_simd_vector::value, "is_simd_vector< 
short_v>::value"); +static_assert(Traits::is_simd_vector::value, "is_simd_vector::value"); +static_assert(Traits::is_simd_mask ::value, "is_simd_mask ::value"); +static_assert(Traits::is_simd_mask ::value, "is_simd_mask < float_m>::value"); +static_assert(Traits::is_simd_mask ::value, "is_simd_mask < int_m>::value"); +static_assert(Traits::is_simd_mask ::value, "is_simd_mask < uint_m>::value"); +static_assert(Traits::is_simd_mask ::value, "is_simd_mask < short_m>::value"); +static_assert(Traits::is_simd_mask ::value, "is_simd_mask ::value"); + +#ifdef Vc_IMPL_AVX2 +static_assert(!std::is_convertible::value, "A float* should never implicitly convert to short_v. Something is broken."); +static_assert(!std::is_convertible::value, "An int* should never implicitly convert to short_v. Something is broken."); +static_assert(!std::is_convertible::value, "A short* should never implicitly convert to short_v. Something is broken."); +#endif + +#define Vc_CONDITIONAL_ASSIGN(name_, op_) \ + template \ + Vc_INTRINSIC enable_if conditional_assign( \ + AVX2::Vector &lhs, M &&mask, U &&rhs) \ + { \ + lhs(mask) op_ rhs; \ + } \ + Vc_NOTHING_EXPECTING_SEMICOLON +Vc_CONDITIONAL_ASSIGN( Assign, =); +Vc_CONDITIONAL_ASSIGN( PlusAssign, +=); +Vc_CONDITIONAL_ASSIGN( MinusAssign, -=); +Vc_CONDITIONAL_ASSIGN( MultiplyAssign, *=); +Vc_CONDITIONAL_ASSIGN( DivideAssign, /=); +Vc_CONDITIONAL_ASSIGN( RemainderAssign, %=); +Vc_CONDITIONAL_ASSIGN( XorAssign, ^=); +Vc_CONDITIONAL_ASSIGN( AndAssign, &=); +Vc_CONDITIONAL_ASSIGN( OrAssign, |=); +Vc_CONDITIONAL_ASSIGN( LeftShiftAssign,<<=); +Vc_CONDITIONAL_ASSIGN(RightShiftAssign,>>=); +#undef Vc_CONDITIONAL_ASSIGN + +#define Vc_CONDITIONAL_ASSIGN(name_, expr_) \ + template \ + Vc_INTRINSIC enable_if> conditional_assign( \ + AVX2::Vector &lhs, M &&mask) \ + { \ + return expr_; \ + } \ + Vc_NOTHING_EXPECTING_SEMICOLON +Vc_CONDITIONAL_ASSIGN(PostIncrement, lhs(mask)++); +Vc_CONDITIONAL_ASSIGN( PreIncrement, ++lhs(mask)); +Vc_CONDITIONAL_ASSIGN(PostDecrement, lhs(mask)--); +Vc_CONDITIONAL_ASSIGN( PreDecrement, --lhs(mask)); +#undef Vc_CONDITIONAL_ASSIGN -typedef Vector double_v; -typedef Vector float_v; -typedef Vector sfloat_v; -typedef Vector int_v; -typedef Vector uint_v; -typedef Vector short_v; -typedef Vector ushort_v; -typedef double_v::Mask double_m; -typedef float_v::Mask float_m; -typedef sfloat_v::Mask sfloat_m; -typedef int_v::Mask int_m; -typedef uint_v::Mask uint_m; -typedef short_v::Mask short_m; -typedef ushort_v::Mask ushort_m; - -template class SwizzledVector : public Vector {}; - -static Vc_ALWAYS_INLINE int_v min(const int_v &x, const int_v &y) { return _mm256_min_epi32(x.data(), y.data()); } -static Vc_ALWAYS_INLINE uint_v min(const uint_v &x, const uint_v &y) { return _mm256_min_epu32(x.data(), y.data()); } -static Vc_ALWAYS_INLINE short_v min(const short_v &x, const short_v &y) { return _mm_min_epi16(x.data(), y.data()); } -static Vc_ALWAYS_INLINE ushort_v min(const ushort_v &x, const ushort_v &y) { return _mm_min_epu16(x.data(), y.data()); } -static Vc_ALWAYS_INLINE float_v min(const float_v &x, const float_v &y) { return _mm256_min_ps(x.data(), y.data()); } -static Vc_ALWAYS_INLINE sfloat_v min(const sfloat_v &x, const sfloat_v &y) { return _mm256_min_ps(x.data(), y.data()); } -static Vc_ALWAYS_INLINE double_v min(const double_v &x, const double_v &y) { return _mm256_min_pd(x.data(), y.data()); } -static Vc_ALWAYS_INLINE int_v max(const int_v &x, const int_v &y) { return _mm256_max_epi32(x.data(), y.data()); } -static Vc_ALWAYS_INLINE uint_v max(const 
uint_v &x, const uint_v &y) { return _mm256_max_epu32(x.data(), y.data()); } -static Vc_ALWAYS_INLINE short_v max(const short_v &x, const short_v &y) { return _mm_max_epi16(x.data(), y.data()); } -static Vc_ALWAYS_INLINE ushort_v max(const ushort_v &x, const ushort_v &y) { return _mm_max_epu16(x.data(), y.data()); } -static Vc_ALWAYS_INLINE float_v max(const float_v &x, const float_v &y) { return _mm256_max_ps(x.data(), y.data()); } -static Vc_ALWAYS_INLINE sfloat_v max(const sfloat_v &x, const sfloat_v &y) { return _mm256_max_ps(x.data(), y.data()); } -static Vc_ALWAYS_INLINE double_v max(const double_v &x, const double_v &y) { return _mm256_max_pd(x.data(), y.data()); } - - template static Vc_ALWAYS_INLINE Vector sqrt (const Vector &x) { return VectorHelper::sqrt(x.data()); } - template static Vc_ALWAYS_INLINE Vector rsqrt(const Vector &x) { return VectorHelper::rsqrt(x.data()); } - template static Vc_ALWAYS_INLINE Vector abs (const Vector &x) { return VectorHelper::abs(x.data()); } - template static Vc_ALWAYS_INLINE Vector reciprocal(const Vector &x) { return VectorHelper::reciprocal(x.data()); } - template static Vc_ALWAYS_INLINE Vector round(const Vector &x) { return VectorHelper::round(x.data()); } - - template static Vc_ALWAYS_INLINE typename Vector::Mask isfinite(const Vector &x) { return VectorHelper::isFinite(x.data()); } - template static Vc_ALWAYS_INLINE typename Vector::Mask isnan(const Vector &x) { return VectorHelper::isNaN(x.data()); } - -#include "forceToRegisters.tcc" -} // namespace AVX -} // namespace Vc -/*OUTER_NAMESPACE_END*/ +} // namespace Vc #include "vector.tcc" -#include "math.h" -#include "undomacros.h" +#include "simd_cast.h" -#endif // AVX_VECTOR_H +#endif // VC_AVX_VECTOR_H_ diff -Nru vc-0.7.4/avx/vectorhelper.h vc-1.3.0/avx/vectorhelper.h --- vc-0.7.4/avx/vectorhelper.h 2014-05-15 06:38:24.000000000 -0500 +++ vc-1.3.0/avx/vectorhelper.h 2016-10-27 02:05:02.000000000 -0500 @@ -1,258 +1,116 @@ -/* This file is part of the Vc library. +/* This file is part of the Vc library. {{{ +Copyright © 2009-2015 Matthias Kretz - Copyright (C) 2009-2012 Matthias Kretz +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the names of contributing organizations nor the + names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY +DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - Vc is free software: you can redistribute it and/or modify - it under the terms of the GNU Lesser General Public License as - published by the Free Software Foundation, either version 3 of - the License, or (at your option) any later version. +}}}*/ - Vc is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with Vc. If not, see . - -*/ - -#ifndef AVX_VECTORHELPER_H -#define AVX_VECTORHELPER_H +#ifndef VC_AVX_VECTORHELPER_H_ +#define VC_AVX_VECTORHELPER_H_ #include #include "types.h" #include "intrinsics.h" #include "casts.h" +#include "../common/loadstoreflags.h" #include "macros.h" -/*OUTER_NAMESPACE_BEGIN*/ -namespace Vc +namespace Vc_VERSIONED_NAMESPACE { namespace AVX { - -namespace Internal -{ -Vc_INTRINSIC Vc_CONST m256 exponent(param256 v) -{ - m128i tmp0 = _mm_srli_epi32(avx_cast(v), 23); - m128i tmp1 = _mm_srli_epi32(avx_cast(hi128(v)), 23); - tmp0 = _mm_sub_epi32(tmp0, _mm_set1_epi32(0x7f)); - tmp1 = _mm_sub_epi32(tmp1, _mm_set1_epi32(0x7f)); - return _mm256_cvtepi32_ps(concat(tmp0, tmp1)); -} -Vc_INTRINSIC Vc_CONST m256d exponent(param256d v) -{ - m128i tmp0 = _mm_srli_epi64(avx_cast(v), 52); - m128i tmp1 = _mm_srli_epi64(avx_cast(hi128(v)), 52); - tmp0 = _mm_sub_epi32(tmp0, _mm_set1_epi32(0x3ff)); - tmp1 = _mm_sub_epi32(tmp1, _mm_set1_epi32(0x3ff)); - return _mm256_cvtepi32_pd(avx_cast(Mem::shuffle(avx_cast(tmp0), avx_cast(tmp1)))); -} -} // namespace Internal - -#define OP0(name, code) static Vc_ALWAYS_INLINE Vc_CONST VectorType name() { return code; } -#define OP1(name, code) static Vc_ALWAYS_INLINE Vc_CONST VectorType name(VTArg a) { return code; } -#define OP2(name, code) static Vc_ALWAYS_INLINE Vc_CONST VectorType name(VTArg a, VTArg b) { return code; } -#define OP3(name, code) static Vc_ALWAYS_INLINE Vc_CONST VectorType name(VTArg a, VTArg b, VTArg c) { return code; } - - template<> struct VectorHelper + template<> struct VectorHelper<__m256> { - typedef m256 VectorType; -#ifdef VC_PASSING_VECTOR_BY_VALUE_IS_BROKEN - typedef const VectorType & VTArg; -#else + typedef __m256 VectorType; typedef const VectorType VTArg; -#endif - template static Vc_ALWAYS_INLINE_L Vc_PURE_L VectorType load(const float *x, A) Vc_ALWAYS_INLINE_R Vc_PURE_R; - static Vc_ALWAYS_INLINE_L void store(float *mem, VTArg x, AlignedFlag) Vc_ALWAYS_INLINE_R; - static Vc_ALWAYS_INLINE_L void store(float *mem, VTArg x, UnalignedFlag) Vc_ALWAYS_INLINE_R; - static Vc_ALWAYS_INLINE_L void store(float *mem, VTArg x, StreamingAndAlignedFlag) Vc_ALWAYS_INLINE_R; - static Vc_ALWAYS_INLINE_L void store(float *mem, VTArg x, StreamingAndUnalignedFlag) Vc_ALWAYS_INLINE_R; - static Vc_ALWAYS_INLINE_L void store(float *mem, VTArg x, VTArg m, AlignedFlag) Vc_ALWAYS_INLINE_R; - static Vc_ALWAYS_INLINE_L void store(float *mem, VTArg x, VTArg m, UnalignedFlag) 
Vc_ALWAYS_INLINE_R; - static Vc_ALWAYS_INLINE_L void store(float *mem, VTArg x, VTArg m, StreamingAndAlignedFlag) Vc_ALWAYS_INLINE_R; - static Vc_ALWAYS_INLINE_L void store(float *mem, VTArg x, VTArg m, StreamingAndUnalignedFlag) Vc_ALWAYS_INLINE_R; - - static Vc_ALWAYS_INLINE Vc_CONST VectorType cdab(VTArg x) { return _mm256_permute_ps(x, _MM_SHUFFLE(2, 3, 0, 1)); } - static Vc_ALWAYS_INLINE Vc_CONST VectorType badc(VTArg x) { return _mm256_permute_ps(x, _MM_SHUFFLE(1, 0, 3, 2)); } - static Vc_ALWAYS_INLINE Vc_CONST VectorType aaaa(VTArg x) { return _mm256_permute_ps(x, _MM_SHUFFLE(0, 0, 0, 0)); } - static Vc_ALWAYS_INLINE Vc_CONST VectorType bbbb(VTArg x) { return _mm256_permute_ps(x, _MM_SHUFFLE(1, 1, 1, 1)); } - static Vc_ALWAYS_INLINE Vc_CONST VectorType cccc(VTArg x) { return _mm256_permute_ps(x, _MM_SHUFFLE(2, 2, 2, 2)); } - static Vc_ALWAYS_INLINE Vc_CONST VectorType dddd(VTArg x) { return _mm256_permute_ps(x, _MM_SHUFFLE(3, 3, 3, 3)); } - static Vc_ALWAYS_INLINE Vc_CONST VectorType dacb(VTArg x) { return _mm256_permute_ps(x, _MM_SHUFFLE(3, 0, 2, 1)); } - - OP0(allone, _mm256_setallone_ps()) - OP0(zero, _mm256_setzero_ps()) - OP2(or_, _mm256_or_ps(a, b)) - OP2(xor_, _mm256_xor_ps(a, b)) - OP2(and_, _mm256_and_ps(a, b)) - OP2(andnot_, _mm256_andnot_ps(a, b)) - OP3(blend, _mm256_blendv_ps(a, b, c)) - }; - template<> struct VectorHelper - { - typedef m256d VectorType; -#ifdef VC_PASSING_VECTOR_BY_VALUE_IS_BROKEN - typedef const VectorType & VTArg; -#else - typedef const VectorType VTArg; -#endif - template static Vc_ALWAYS_INLINE_L Vc_PURE_L VectorType load(const double *x, A) Vc_ALWAYS_INLINE_R Vc_PURE_R; - static Vc_ALWAYS_INLINE_L void store(double *mem, VTArg x, AlignedFlag) Vc_ALWAYS_INLINE_R; - static Vc_ALWAYS_INLINE_L void store(double *mem, VTArg x, UnalignedFlag) Vc_ALWAYS_INLINE_R; - static Vc_ALWAYS_INLINE_L void store(double *mem, VTArg x, StreamingAndAlignedFlag) Vc_ALWAYS_INLINE_R; - static Vc_ALWAYS_INLINE_L void store(double *mem, VTArg x, StreamingAndUnalignedFlag) Vc_ALWAYS_INLINE_R; - static Vc_ALWAYS_INLINE_L void store(double *mem, VTArg x, VTArg m, AlignedFlag) Vc_ALWAYS_INLINE_R; - static Vc_ALWAYS_INLINE_L void store(double *mem, VTArg x, VTArg m, UnalignedFlag) Vc_ALWAYS_INLINE_R; - static Vc_ALWAYS_INLINE_L void store(double *mem, VTArg x, VTArg m, StreamingAndAlignedFlag) Vc_ALWAYS_INLINE_R; - static Vc_ALWAYS_INLINE_L void store(double *mem, VTArg x, VTArg m, StreamingAndUnalignedFlag) Vc_ALWAYS_INLINE_R; - - static VectorType cdab(VTArg x) { return _mm256_permute_pd(x, 5); } - static VectorType badc(VTArg x) { return _mm256_permute2f128_pd(x, x, 1); } - // aaaa bbbb cccc dddd specialized in vector.tcc - static VectorType dacb(VTArg x) { - const m128d cb = avx_cast(_mm_alignr_epi8(avx_cast(lo128(x)), - avx_cast(hi128(x)), sizeof(double))); // XXX: lo and hi swapped? - const m128d da = _mm_blend_pd(lo128(x), hi128(x), 0 + 2); // XXX: lo and hi swapped? 
- return concat(cb, da); - } - - OP0(allone, _mm256_setallone_pd()) - OP0(zero, _mm256_setzero_pd()) - OP2(or_, _mm256_or_pd(a, b)) - OP2(xor_, _mm256_xor_pd(a, b)) - OP2(and_, _mm256_and_pd(a, b)) - OP2(andnot_, _mm256_andnot_pd(a, b)) - OP3(blend, _mm256_blendv_pd(a, b, c)) + template static Vc_ALWAYS_INLINE void store(float *mem, VTArg x, typename Flags::EnableIfAligned = nullptr) { _mm256_store_ps(mem, x); } + template static Vc_ALWAYS_INLINE void store(float *mem, VTArg x, typename Flags::EnableIfUnalignedNotStreaming = nullptr) { _mm256_storeu_ps(mem, x); } + template static Vc_ALWAYS_INLINE void store(float *mem, VTArg x, typename Flags::EnableIfStreaming = nullptr) { _mm256_stream_ps(mem, x); } + template static Vc_ALWAYS_INLINE void store(float *mem, VTArg x, typename Flags::EnableIfUnalignedAndStreaming = nullptr) { AvxIntrinsics::stream_store(mem, x, setallone_ps()); } + + template static Vc_ALWAYS_INLINE void store(float *mem, VTArg x, VTArg m, typename std::enable_if::type = nullptr) { _mm256_maskstore(mem, m, x); } + template static Vc_ALWAYS_INLINE void store(float *mem, VTArg x, VTArg m, typename std::enable_if< Flags::IsStreaming, void *>::type = nullptr) { AvxIntrinsics::stream_store(mem, x, m); } }; - template<> struct VectorHelper + template<> struct VectorHelper<__m256d> { - typedef m256i VectorType; -#ifdef VC_PASSING_VECTOR_BY_VALUE_IS_BROKEN - typedef const VectorType & VTArg; -#else + typedef __m256d VectorType; typedef const VectorType VTArg; -#endif - template static VectorType load(const T *x, AlignedFlag) Vc_PURE; - template static VectorType load(const T *x, UnalignedFlag) Vc_PURE; - template static VectorType load(const T *x, StreamingAndAlignedFlag) Vc_PURE; - template static VectorType load(const T *x, StreamingAndUnalignedFlag) Vc_PURE; - template static void store(T *mem, VTArg x, AlignedFlag); - template static void store(T *mem, VTArg x, UnalignedFlag); - template static void store(T *mem, VTArg x, StreamingAndAlignedFlag); - template static void store(T *mem, VTArg x, StreamingAndUnalignedFlag); - template static void store(T *mem, VTArg x, VTArg m, AlignedFlag); - template static void store(T *mem, VTArg x, VTArg m, UnalignedFlag); - template static void store(T *mem, VTArg x, VTArg m, StreamingAndAlignedFlag); - template static void store(T *mem, VTArg x, VTArg m, StreamingAndUnalignedFlag); - - static VectorType cdab(VTArg x) { return avx_cast(_mm256_permute_ps(avx_cast(x), _MM_SHUFFLE(2, 3, 0, 1))); } - static VectorType badc(VTArg x) { return avx_cast(_mm256_permute_ps(avx_cast(x), _MM_SHUFFLE(1, 0, 3, 2))); } - static VectorType aaaa(VTArg x) { return avx_cast(_mm256_permute_ps(avx_cast(x), _MM_SHUFFLE(0, 0, 0, 0))); } - static VectorType bbbb(VTArg x) { return avx_cast(_mm256_permute_ps(avx_cast(x), _MM_SHUFFLE(1, 1, 1, 1))); } - static VectorType cccc(VTArg x) { return avx_cast(_mm256_permute_ps(avx_cast(x), _MM_SHUFFLE(2, 2, 2, 2))); } - static VectorType dddd(VTArg x) { return avx_cast(_mm256_permute_ps(avx_cast(x), _MM_SHUFFLE(3, 3, 3, 3))); } - static VectorType dacb(VTArg x) { return avx_cast(_mm256_permute_ps(avx_cast(x), _MM_SHUFFLE(3, 0, 2, 1))); } - - OP0(allone, _mm256_setallone_si256()) - OP0(zero, _mm256_setzero_si256()) - OP2(or_, _mm256_or_si256(a, b)) - OP2(xor_, _mm256_xor_si256(a, b)) - OP2(and_, _mm256_and_si256(a, b)) - OP2(andnot_, _mm256_andnot_si256(a, b)) - OP3(blend, _mm256_blendv_epi8(a, b, c)) + + template static Vc_ALWAYS_INLINE void store(double *mem, VTArg x, typename Flags::EnableIfAligned = nullptr) { 
_mm256_store_pd(mem, x); } + template static Vc_ALWAYS_INLINE void store(double *mem, VTArg x, typename Flags::EnableIfUnalignedNotStreaming = nullptr) { _mm256_storeu_pd(mem, x); } + template static Vc_ALWAYS_INLINE void store(double *mem, VTArg x, typename Flags::EnableIfStreaming = nullptr) { _mm256_stream_pd(mem, x); } + template static Vc_ALWAYS_INLINE void store(double *mem, VTArg x, typename Flags::EnableIfUnalignedAndStreaming = nullptr) { AvxIntrinsics::stream_store(mem, x, setallone_pd()); } + + template static Vc_ALWAYS_INLINE void store(double *mem, VTArg x, VTArg m, typename std::enable_if::type = nullptr) { _mm256_maskstore(mem, m, x); } + template static Vc_ALWAYS_INLINE void store(double *mem, VTArg x, VTArg m, typename std::enable_if< Flags::IsStreaming, void *>::type = nullptr) { AvxIntrinsics::stream_store(mem, x, m); } }; - template<> struct VectorHelper + template<> struct VectorHelper<__m256i> { - typedef m128i VectorType; -#ifdef VC_PASSING_VECTOR_BY_VALUE_IS_BROKEN - typedef const VectorType & VTArg; -#else + typedef __m256i VectorType; typedef const VectorType VTArg; -#endif - template static VectorType load(const T *x, AlignedFlag) Vc_PURE; - template static VectorType load(const T *x, UnalignedFlag) Vc_PURE; - template static VectorType load(const T *x, StreamingAndAlignedFlag) Vc_PURE; - template static VectorType load(const T *x, StreamingAndUnalignedFlag) Vc_PURE; - template static void store(T *mem, VTArg x, AlignedFlag); - template static void store(T *mem, VTArg x, UnalignedFlag); - template static void store(T *mem, VTArg x, StreamingAndAlignedFlag); - template static void store(T *mem, VTArg x, StreamingAndUnalignedFlag); - template static void store(T *mem, VTArg x, VTArg m, AlignedFlag); - template static void store(T *mem, VTArg x, VTArg m, UnalignedFlag); - template static void store(T *mem, VTArg x, VTArg m, StreamingAndAlignedFlag); - template static void store(T *mem, VTArg x, VTArg m, StreamingAndUnalignedFlag); - - static VectorType cdab(VTArg x) { const __m128i tmp = _mm_shufflelo_epi16(x, _MM_SHUFFLE(2, 3, 0, 1)); return _mm_shufflehi_epi16(tmp, _MM_SHUFFLE(2, 3, 0, 1)); } - static VectorType badc(VTArg x) { const __m128i tmp = _mm_shufflelo_epi16(x, _MM_SHUFFLE(1, 0, 3, 2)); return _mm_shufflehi_epi16(tmp, _MM_SHUFFLE(1, 0, 3, 2)); } - static VectorType aaaa(VTArg x) { const __m128i tmp = _mm_shufflelo_epi16(x, _MM_SHUFFLE(0, 0, 0, 0)); return _mm_shufflehi_epi16(tmp, _MM_SHUFFLE(0, 0, 0, 0)); } - static VectorType bbbb(VTArg x) { const __m128i tmp = _mm_shufflelo_epi16(x, _MM_SHUFFLE(1, 1, 1, 1)); return _mm_shufflehi_epi16(tmp, _MM_SHUFFLE(1, 1, 1, 1)); } - static VectorType cccc(VTArg x) { const __m128i tmp = _mm_shufflelo_epi16(x, _MM_SHUFFLE(2, 2, 2, 2)); return _mm_shufflehi_epi16(tmp, _MM_SHUFFLE(2, 2, 2, 2)); } - static VectorType dddd(VTArg x) { const __m128i tmp = _mm_shufflelo_epi16(x, _MM_SHUFFLE(3, 3, 3, 3)); return _mm_shufflehi_epi16(tmp, _MM_SHUFFLE(3, 3, 3, 3)); } - static VectorType dacb(VTArg x) { const __m128i tmp = _mm_shufflelo_epi16(x, _MM_SHUFFLE(3, 0, 2, 1)); return _mm_shufflehi_epi16(tmp, _MM_SHUFFLE(3, 0, 2, 1)); } - - OP0(allone, _mm_setallone_si128()) - OP0(zero, _mm_setzero_si128()) - OP2(or_, _mm_or_si128(a, b)) - OP2(xor_, _mm_xor_si128(a, b)) - OP2(and_, _mm_and_si128(a, b)) - OP2(andnot_, _mm_andnot_si128(a, b)) - OP3(blend, _mm_blendv_epi8(a, b, c)) + + template static Vc_ALWAYS_INLINE void store(T *mem, VTArg x, typename Flags::EnableIfAligned = nullptr) { _mm256_store_si256(reinterpret_cast<__m256i 
*>(mem), x); } + template static Vc_ALWAYS_INLINE void store(T *mem, VTArg x, typename Flags::EnableIfUnalignedNotStreaming = nullptr) { _mm256_storeu_si256(reinterpret_cast<__m256i *>(mem), x); } + template static Vc_ALWAYS_INLINE void store(T *mem, VTArg x, typename Flags::EnableIfStreaming = nullptr) { _mm256_stream_si256(reinterpret_cast<__m256i *>(mem), x); } + template static Vc_ALWAYS_INLINE void store(T *mem, VTArg x, typename Flags::EnableIfUnalignedAndStreaming = nullptr) { AvxIntrinsics::stream_store(mem, x, setallone_si256()); } + + template static Vc_ALWAYS_INLINE void store(T *mem, VTArg x, VTArg m, typename std::enable_if::type = nullptr) { _mm256_maskstore(mem, m, x); } + template static Vc_ALWAYS_INLINE void store(T *mem, VTArg x, VTArg m, typename std::enable_if< Flags::IsStreaming, void *>::type = nullptr) { AvxIntrinsics::stream_store(mem, x, m); } }; -#undef OP1 -#undef OP2 -#undef OP3 - -#define OP1(op) \ - static Vc_INTRINSIC VectorType Vc_CONST op(VTArg a) { return CAT(_mm256_##op##_, SUFFIX)(a); } -#define OP(op) \ - static Vc_INTRINSIC VectorType Vc_CONST op(VTArg a, VTArg b) { return CAT(_mm256_##op##_ , SUFFIX)(a, b); } -#define OP_(op) \ - static Vc_INTRINSIC VectorType Vc_CONST op(VTArg a, VTArg b) { return CAT(_mm256_##op , SUFFIX)(a, b); } -#define OPx(op, op2) \ - static Vc_INTRINSIC VectorType Vc_CONST op(VTArg a, VTArg b) { return CAT(_mm256_##op2##_, SUFFIX)(a, b); } -#define OPcmp(op) \ - static Vc_INTRINSIC VectorType Vc_CONST cmp##op(VTArg a, VTArg b) { return CAT(_mm256_cmp##op##_, SUFFIX)(a, b); } -#define OP_CAST_(op) \ - static Vc_INTRINSIC VectorType Vc_CONST op(VTArg a, VTArg b) { return CAT(_mm256_castps_, SUFFIX)( \ - _mm256_##op##ps(CAT(CAT(_mm256_cast, SUFFIX), _ps)(a), \ - CAT(CAT(_mm256_cast, SUFFIX), _ps)(b))); \ - } -#define MINMAX \ - static Vc_INTRINSIC VectorType Vc_CONST min(VTArg a, VTArg b) { return CAT(_mm256_min_, SUFFIX)(a, b); } \ - static Vc_INTRINSIC VectorType Vc_CONST max(VTArg a, VTArg b) { return CAT(_mm256_max_, SUFFIX)(a, b); } + +#define Vc_OP1(op) \ + static Vc_INTRINSIC VectorType Vc_CONST op(VTArg a) { return Vc_CAT2(_mm256_##op##_, Vc_SUFFIX)(a); } +#define Vc_OP(op) \ + static Vc_INTRINSIC VectorType Vc_CONST op(VTArg a, VTArg b) { return Vc_CAT2(op##_ , Vc_SUFFIX)(a, b); } +#define Vc_OP_(op) \ + static Vc_INTRINSIC VectorType Vc_CONST op(VTArg a, VTArg b) { return Vc_CAT2(_mm256_##op , Vc_SUFFIX)(a, b); } +#define Vc_OPx(op, op2) \ + static Vc_INTRINSIC VectorType Vc_CONST op(VTArg a, VTArg b) { return Vc_CAT2(_mm256_##op2##_, Vc_SUFFIX)(a, b); } template<> struct VectorHelper { - typedef m256d VectorType; -#ifdef VC_PASSING_VECTOR_BY_VALUE_IS_BROKEN - typedef const VectorType & VTArg; -#else + typedef __m256d VectorType; typedef const VectorType VTArg; -#endif typedef double EntryType; - typedef double ConcatType; -#define SUFFIX pd +#define Vc_SUFFIX pd - static Vc_ALWAYS_INLINE VectorType notMaskedToZero(VTArg a, param256 mask) { return CAT(_mm256_and_, SUFFIX)(_mm256_castps_pd(mask), a); } - static Vc_ALWAYS_INLINE VectorType set(const double a) { return CAT(_mm256_set1_, SUFFIX)(a); } + static Vc_ALWAYS_INLINE VectorType notMaskedToZero(VTArg a, __m256 mask) { return Vc_CAT2(_mm256_and_, Vc_SUFFIX)(_mm256_castps_pd(mask), a); } + static Vc_ALWAYS_INLINE VectorType set(const double a) { return Vc_CAT2(_mm256_set1_, Vc_SUFFIX)(a); } static Vc_ALWAYS_INLINE VectorType set(const double a, const double b, const double c, const double d) { - return CAT(_mm256_set_, SUFFIX)(a, b, c, d); + return 
Vc_CAT2(_mm256_set_, Vc_SUFFIX)(a, b, c, d); } - static Vc_ALWAYS_INLINE VectorType zero() { return CAT(_mm256_setzero_, SUFFIX)(); } - static Vc_ALWAYS_INLINE VectorType one() { return CAT(_mm256_setone_, SUFFIX)(); }// set(1.); } + static Vc_ALWAYS_INLINE VectorType zero() { return Vc_CAT2(_mm256_setzero_, Vc_SUFFIX)(); } + static Vc_ALWAYS_INLINE VectorType one() { return Vc_CAT2(setone_, Vc_SUFFIX)(); }// set(1.); } static inline void fma(VectorType &v1, VTArg v2, VTArg v3) { -#ifdef VC_IMPL_FMA4 +#ifdef Vc_IMPL_FMA4 v1 = _mm256_macc_pd(v1, v2, v3); #else VectorType h1 = _mm256_and_pd(v1, _mm256_broadcast_sd(reinterpret_cast(&c_general::highMaskDouble))); VectorType h2 = _mm256_and_pd(v2, _mm256_broadcast_sd(reinterpret_cast(&c_general::highMaskDouble))); -#if defined(VC_GCC) && VC_GCC < 0x40703 +#if defined(Vc_GCC) && Vc_GCC < 0x40703 // GCC before 4.7.3 uses an incorrect optimization where it replaces the subtraction with an andnot // http://gcc.gnu.org/bugzilla/show_bug.cgi?id=54703 asm("":"+x"(h1), "+x"(h2)); @@ -263,57 +121,51 @@ const VectorType lh = add(mul(l1, h2), mul(h1, l2)); const VectorType hh = mul(h1, h2); // ll < lh < hh for all entries is certain - const VectorType lh_lt_v3 = cmplt(abs(lh), abs(v3)); // |lh| < |v3| + const VectorType lh_lt_v3 = cmplt_pd(abs(lh), abs(v3)); // |lh| < |v3| const VectorType b = _mm256_blendv_pd(v3, lh, lh_lt_v3); const VectorType c = _mm256_blendv_pd(lh, v3, lh_lt_v3); v1 = add(add(ll, b), add(c, hh)); #endif } - OP(add) OP(sub) OP(mul) - OPcmp(eq) OPcmp(neq) - OPcmp(lt) OPcmp(nlt) - OPcmp(le) OPcmp(nle) + static Vc_INTRINSIC VectorType Vc_CONST add(VTArg a, VTArg b) { return _mm256_add_pd(a,b); } + static Vc_INTRINSIC VectorType Vc_CONST sub(VTArg a, VTArg b) { return _mm256_sub_pd(a,b); } + static Vc_INTRINSIC VectorType Vc_CONST mul(VTArg a, VTArg b) { return _mm256_mul_pd(a,b); } - OP1(sqrt) + Vc_OP1(sqrt) static Vc_ALWAYS_INLINE Vc_CONST VectorType rsqrt(VTArg x) { return _mm256_div_pd(one(), sqrt(x)); } static Vc_ALWAYS_INLINE Vc_CONST VectorType reciprocal(VTArg x) { return _mm256_div_pd(one(), x); } - static Vc_ALWAYS_INLINE Vc_CONST VectorType isNaN(VTArg x) { - return _mm256_cmpunord_pd(x, x); - } - static Vc_ALWAYS_INLINE Vc_CONST VectorType isFinite(VTArg x) { - return _mm256_cmpord_pd(x, _mm256_mul_pd(zero(), x)); - } static Vc_ALWAYS_INLINE Vc_CONST VectorType abs(VTArg a) { - return CAT(_mm256_and_, SUFFIX)(a, _mm256_setabsmask_pd()); + return Vc_CAT2(_mm256_and_, Vc_SUFFIX)(a, setabsmask_pd()); } - MINMAX + static Vc_INTRINSIC VectorType Vc_CONST min(VTArg a, VTArg b) { return _mm256_min_pd(a, b); } + static Vc_INTRINSIC VectorType Vc_CONST max(VTArg a, VTArg b) { return _mm256_max_pd(a, b); } static Vc_ALWAYS_INLINE Vc_CONST EntryType min(VTArg a) { - m128d b = _mm_min_pd(avx_cast(a), _mm256_extractf128_pd(a, 1)); + __m128d b = _mm_min_pd(avx_cast<__m128d>(a), _mm256_extractf128_pd(a, 1)); b = _mm_min_sd(b, _mm_unpackhi_pd(b, b)); return _mm_cvtsd_f64(b); } static Vc_ALWAYS_INLINE Vc_CONST EntryType max(VTArg a) { - m128d b = _mm_max_pd(avx_cast(a), _mm256_extractf128_pd(a, 1)); + __m128d b = _mm_max_pd(avx_cast<__m128d>(a), _mm256_extractf128_pd(a, 1)); b = _mm_max_sd(b, _mm_unpackhi_pd(b, b)); return _mm_cvtsd_f64(b); } static Vc_ALWAYS_INLINE Vc_CONST EntryType mul(VTArg a) { - m128d b = _mm_mul_pd(avx_cast(a), _mm256_extractf128_pd(a, 1)); + __m128d b = _mm_mul_pd(avx_cast<__m128d>(a), _mm256_extractf128_pd(a, 1)); b = _mm_mul_sd(b, _mm_shuffle_pd(b, b, _MM_SHUFFLE2(0, 1))); return _mm_cvtsd_f64(b); } static 
Vc_ALWAYS_INLINE Vc_CONST EntryType add(VTArg a) { - m128d b = _mm_add_pd(avx_cast(a), _mm256_extractf128_pd(a, 1)); + __m128d b = _mm_add_pd(avx_cast<__m128d>(a), _mm256_extractf128_pd(a, 1)); b = _mm_hadd_pd(b, b); // or: b = _mm_add_sd(b, _mm256_shuffle_pd(b, b, _MM_SHUFFLE2(0, 1))); return _mm_cvtsd_f64(b); } -#undef SUFFIX +#undef Vc_SUFFIX static Vc_ALWAYS_INLINE Vc_CONST VectorType round(VTArg a) { return _mm256_round_pd(a, _MM_FROUND_NINT); } @@ -321,445 +173,85 @@ template<> struct VectorHelper { typedef float EntryType; - typedef m256 VectorType; -#ifdef VC_PASSING_VECTOR_BY_VALUE_IS_BROKEN - typedef const VectorType & VTArg; -#else + typedef __m256 VectorType; typedef const VectorType VTArg; -#endif - typedef double ConcatType; -#define SUFFIX ps +#define Vc_SUFFIX ps - static Vc_ALWAYS_INLINE Vc_CONST VectorType notMaskedToZero(VTArg a, param256 mask) { return CAT(_mm256_and_, SUFFIX)(mask, a); } - static Vc_ALWAYS_INLINE Vc_CONST VectorType set(const float a) { return CAT(_mm256_set1_, SUFFIX)(a); } + static Vc_ALWAYS_INLINE Vc_CONST VectorType notMaskedToZero(VTArg a, __m256 mask) { return Vc_CAT2(_mm256_and_, Vc_SUFFIX)(mask, a); } + static Vc_ALWAYS_INLINE Vc_CONST VectorType set(const float a) { return Vc_CAT2(_mm256_set1_, Vc_SUFFIX)(a); } static Vc_ALWAYS_INLINE Vc_CONST VectorType set(const float a, const float b, const float c, const float d, const float e, const float f, const float g, const float h) { - return CAT(_mm256_set_, SUFFIX)(a, b, c, d, e, f, g, h); } - static Vc_ALWAYS_INLINE Vc_CONST VectorType zero() { return CAT(_mm256_setzero_, SUFFIX)(); } - static Vc_ALWAYS_INLINE Vc_CONST VectorType one() { return CAT(_mm256_setone_, SUFFIX)(); }// set(1.f); } - static Vc_ALWAYS_INLINE Vc_CONST m256 concat(param256d a, param256d b) { return _mm256_insertf128_ps(avx_cast(_mm256_cvtpd_ps(a)), _mm256_cvtpd_ps(b), 1); } + return Vc_CAT2(_mm256_set_, Vc_SUFFIX)(a, b, c, d, e, f, g, h); } + static Vc_ALWAYS_INLINE Vc_CONST VectorType zero() { return Vc_CAT2(_mm256_setzero_, Vc_SUFFIX)(); } + static Vc_ALWAYS_INLINE Vc_CONST VectorType one() { return Vc_CAT2(setone_, Vc_SUFFIX)(); }// set(1.f); } + static Vc_ALWAYS_INLINE Vc_CONST __m256 concat(__m256d a, __m256d b) { return _mm256_insertf128_ps(avx_cast<__m256>(_mm256_cvtpd_ps(a)), _mm256_cvtpd_ps(b), 1); } static inline void fma(VectorType &v1, VTArg v2, VTArg v3) { -#ifdef VC_IMPL_FMA4 +#ifdef Vc_IMPL_FMA4 v1 = _mm256_macc_ps(v1, v2, v3); #else - m256d v1_0 = _mm256_cvtps_pd(lo128(v1)); - m256d v1_1 = _mm256_cvtps_pd(hi128(v1)); - m256d v2_0 = _mm256_cvtps_pd(lo128(v2)); - m256d v2_1 = _mm256_cvtps_pd(hi128(v2)); - m256d v3_0 = _mm256_cvtps_pd(lo128(v3)); - m256d v3_1 = _mm256_cvtps_pd(hi128(v3)); + __m256d v1_0 = _mm256_cvtps_pd(lo128(v1)); + __m256d v1_1 = _mm256_cvtps_pd(hi128(v1)); + __m256d v2_0 = _mm256_cvtps_pd(lo128(v2)); + __m256d v2_1 = _mm256_cvtps_pd(hi128(v2)); + __m256d v3_0 = _mm256_cvtps_pd(lo128(v3)); + __m256d v3_1 = _mm256_cvtps_pd(hi128(v3)); v1 = AVX::concat( _mm256_cvtpd_ps(_mm256_add_pd(_mm256_mul_pd(v1_0, v2_0), v3_0)), _mm256_cvtpd_ps(_mm256_add_pd(_mm256_mul_pd(v1_1, v2_1), v3_1))); #endif } - OP(add) OP(sub) OP(mul) - OPcmp(eq) OPcmp(neq) - OPcmp(lt) OPcmp(nlt) - OPcmp(le) OPcmp(nle) - - OP1(sqrt) OP1(rsqrt) - static Vc_ALWAYS_INLINE Vc_CONST VectorType isNaN(VTArg x) { - return _mm256_cmpunord_ps(x, x); - } - static Vc_ALWAYS_INLINE Vc_CONST VectorType isFinite(VTArg x) { - return _mm256_cmpord_ps(x, _mm256_mul_ps(zero(), x)); - } + static Vc_INTRINSIC VectorType Vc_CONST add(VTArg a, VTArg 
b) { return _mm256_add_ps(a, b); } + static Vc_INTRINSIC VectorType Vc_CONST sub(VTArg a, VTArg b) { return _mm256_sub_ps(a, b); } + static Vc_INTRINSIC VectorType Vc_CONST mul(VTArg a, VTArg b) { return _mm256_mul_ps(a, b); } + + Vc_OP1(sqrt) Vc_OP1(rsqrt) static Vc_ALWAYS_INLINE Vc_CONST VectorType reciprocal(VTArg x) { return _mm256_rcp_ps(x); } static Vc_ALWAYS_INLINE Vc_CONST VectorType abs(VTArg a) { - return CAT(_mm256_and_, SUFFIX)(a, _mm256_setabsmask_ps()); + return Vc_CAT2(_mm256_and_, Vc_SUFFIX)(a, setabsmask_ps()); } - MINMAX + static Vc_INTRINSIC VectorType Vc_CONST min(VTArg a, VTArg b) { return _mm256_min_ps(a, b); } + static Vc_INTRINSIC VectorType Vc_CONST max(VTArg a, VTArg b) { return _mm256_max_ps(a, b); } static Vc_ALWAYS_INLINE Vc_CONST EntryType min(VTArg a) { - m128 b = _mm_min_ps(avx_cast(a), _mm256_extractf128_ps(a, 1)); + __m128 b = _mm_min_ps(lo128(a), hi128(a)); b = _mm_min_ps(b, _mm_movehl_ps(b, b)); // b = min(a0, a2), min(a1, a3), min(a2, a2), min(a3, a3) b = _mm_min_ss(b, _mm_shuffle_ps(b, b, _MM_SHUFFLE(1, 1, 1, 1))); // b = min(a0, a1), a1, a2, a3 return _mm_cvtss_f32(b); } static Vc_ALWAYS_INLINE Vc_CONST EntryType max(VTArg a) { - m128 b = _mm_max_ps(avx_cast(a), _mm256_extractf128_ps(a, 1)); + __m128 b = _mm_max_ps(avx_cast<__m128>(a), _mm256_extractf128_ps(a, 1)); b = _mm_max_ps(b, _mm_movehl_ps(b, b)); // b = max(a0, a2), max(a1, a3), max(a2, a2), max(a3, a3) b = _mm_max_ss(b, _mm_shuffle_ps(b, b, _MM_SHUFFLE(1, 1, 1, 1))); // b = max(a0, a1), a1, a2, a3 return _mm_cvtss_f32(b); } static Vc_ALWAYS_INLINE Vc_CONST EntryType mul(VTArg a) { - m128 b = _mm_mul_ps(avx_cast(a), _mm256_extractf128_ps(a, 1)); + __m128 b = _mm_mul_ps(avx_cast<__m128>(a), _mm256_extractf128_ps(a, 1)); b = _mm_mul_ps(b, _mm_shuffle_ps(b, b, _MM_SHUFFLE(0, 1, 2, 3))); b = _mm_mul_ss(b, _mm_shuffle_ps(b, b, _MM_SHUFFLE(3, 2, 0, 1))); return _mm_cvtss_f32(b); } static Vc_ALWAYS_INLINE Vc_CONST EntryType add(VTArg a) { - m128 b = _mm_add_ps(avx_cast(a), _mm256_extractf128_ps(a, 1)); + __m128 b = _mm_add_ps(avx_cast<__m128>(a), _mm256_extractf128_ps(a, 1)); b = _mm_add_ps(b, _mm_shuffle_ps(b, b, _MM_SHUFFLE(0, 1, 2, 3))); b = _mm_add_ss(b, _mm_shuffle_ps(b, b, _MM_SHUFFLE(3, 2, 0, 1))); return _mm_cvtss_f32(b); } -#undef SUFFIX +#undef Vc_SUFFIX static Vc_ALWAYS_INLINE Vc_CONST VectorType round(VTArg a) { return _mm256_round_ps(a, _MM_FROUND_NINT); } }; - template<> struct VectorHelper : public VectorHelper {}; - - template<> struct VectorHelper { - typedef int EntryType; - typedef m256i VectorType; -#ifdef VC_PASSING_VECTOR_BY_VALUE_IS_BROKEN - typedef const VectorType & VTArg; -#else - typedef const VectorType VTArg; -#endif - typedef long long ConcatType; -#define SUFFIX si256 - - OP_(or_) OP_(and_) OP_(xor_) - static Vc_INTRINSIC VectorType Vc_CONST zero() { return CAT(_mm256_setzero_, SUFFIX)(); } - static Vc_INTRINSIC VectorType Vc_CONST notMaskedToZero(VTArg a, param256 mask) { return CAT(_mm256_and_, SUFFIX)(_mm256_castps_si256(mask), a); } -#undef SUFFIX -#define SUFFIX epi32 - static Vc_INTRINSIC VectorType Vc_CONST one() { return CAT(_mm256_setone_, SUFFIX)(); } - - static Vc_INTRINSIC VectorType Vc_CONST set(const int a) { return CAT(_mm256_set1_, SUFFIX)(a); } - static Vc_INTRINSIC VectorType Vc_CONST set(const int a, const int b, const int c, const int d, - const int e, const int f, const int g, const int h) { - return CAT(_mm256_set_, SUFFIX)(a, b, c, d, e, f, g, h); } - - static Vc_INTRINSIC void fma(VectorType &v1, VTArg v2, VTArg v3) { v1 = add(mul(v1, v2), v3); } 
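The scalar min()/max()/mul()/add() overloads above all reduce a 256-bit vector with the same folding scheme: combine the high and low 128-bit halves, then keep halving inside the 128-bit register until one element remains. A sketch of the add case with plain intrinsics; hadd_avx is an illustrative name, not part of Vc:

    #include <immintrin.h>

    // Sketch of the horizontal-add pattern used by the reductions above.
    static inline float hadd_avx(__m256 a)
    {
        __m128 b = _mm_add_ps(_mm256_castps256_ps128(a),   // a0+a4, a1+a5, a2+a6, a3+a7
                              _mm256_extractf128_ps(a, 1));
        b = _mm_add_ps(b, _mm_movehl_ps(b, b));            // fold upper pair onto lower pair
        b = _mm_add_ss(b, _mm_shuffle_ps(b, b, _MM_SHUFFLE(1, 1, 1, 1))); // add element 1
        return _mm_cvtss_f32(b);                           // total sits in element 0
    }

Three combining steps instead of seven scalar adds; the same shape works for min, max and mul by swapping the combining instruction.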
- - static Vc_ALWAYS_INLINE Vc_CONST VectorType shiftLeft(VTArg a, int shift) { - return CAT(_mm256_slli_, SUFFIX)(a, shift); - } - static Vc_ALWAYS_INLINE Vc_CONST VectorType shiftRight(VTArg a, int shift) { - return CAT(_mm256_srai_, SUFFIX)(a, shift); - } - OP1(abs) - - MINMAX - static Vc_INTRINSIC EntryType Vc_CONST min(VTArg a) { - m128i b = _mm_min_epi32(avx_cast(a), _mm256_extractf128_si256(a, 1)); - b = _mm_min_epi32(b, _mm_shuffle_epi32(b, _MM_SHUFFLE(1, 0, 3, 2))); - b = _mm_min_epi32(b, _mm_shufflelo_epi16(b, _MM_SHUFFLE(1, 0, 3, 2))); // using lo_epi16 for speed here - return _mm_cvtsi128_si32(b); - } - static Vc_INTRINSIC EntryType Vc_CONST max(VTArg a) { - m128i b = _mm_max_epi32(avx_cast(a), _mm256_extractf128_si256(a, 1)); - b = _mm_max_epi32(b, _mm_shuffle_epi32(b, _MM_SHUFFLE(1, 0, 3, 2))); - b = _mm_max_epi32(b, _mm_shufflelo_epi16(b, _MM_SHUFFLE(1, 0, 3, 2))); // using lo_epi16 for speed here - return _mm_cvtsi128_si32(b); - } - static Vc_INTRINSIC EntryType Vc_CONST add(VTArg a) { - m128i b = _mm_add_epi32(avx_cast(a), _mm256_extractf128_si256(a, 1)); - b = _mm_add_epi32(b, _mm_shuffle_epi32(b, _MM_SHUFFLE(1, 0, 3, 2))); - b = _mm_add_epi32(b, _mm_shufflelo_epi16(b, _MM_SHUFFLE(1, 0, 3, 2))); - return _mm_cvtsi128_si32(b); - } - static Vc_INTRINSIC EntryType Vc_CONST mul(VTArg a) { - m128i b = _mm_mullo_epi32(avx_cast(a), _mm256_extractf128_si256(a, 1)); - b = _mm_mullo_epi32(b, _mm_shuffle_epi32(b, _MM_SHUFFLE(1, 0, 3, 2))); - b = _mm_mullo_epi32(b, _mm_shufflelo_epi16(b, _MM_SHUFFLE(1, 0, 3, 2))); - return _mm_cvtsi128_si32(b); - } - - static Vc_INTRINSIC VectorType Vc_CONST mul(VTArg a, VTArg b) { return _mm256_mullo_epi32(a, b); } - - OP(add) OP(sub) - OPcmp(eq) - OPcmp(lt) - OPcmp(gt) - static Vc_INTRINSIC VectorType Vc_CONST cmpneq(VTArg a, VTArg b) { m256i x = cmpeq(a, b); return _mm256_andnot_si256(x, _mm256_setallone_si256()); } - static Vc_INTRINSIC VectorType Vc_CONST cmpnlt(VTArg a, VTArg b) { m256i x = cmplt(a, b); return _mm256_andnot_si256(x, _mm256_setallone_si256()); } - static Vc_INTRINSIC VectorType Vc_CONST cmple (VTArg a, VTArg b) { m256i x = cmpgt(a, b); return _mm256_andnot_si256(x, _mm256_setallone_si256()); } - static Vc_INTRINSIC VectorType Vc_CONST cmpnle(VTArg a, VTArg b) { return cmpgt(a, b); } -#undef SUFFIX - static Vc_INTRINSIC VectorType Vc_CONST round(VTArg a) { return a; } - }; - - template<> struct VectorHelper { - typedef unsigned int EntryType; - typedef m256i VectorType; -#ifdef VC_PASSING_VECTOR_BY_VALUE_IS_BROKEN - typedef const VectorType & VTArg; -#else - typedef const VectorType VTArg; -#endif - typedef unsigned long long ConcatType; -#define SUFFIX si256 - OP_CAST_(or_) OP_CAST_(and_) OP_CAST_(xor_) - static Vc_INTRINSIC VectorType Vc_CONST zero() { return CAT(_mm256_setzero_, SUFFIX)(); } - static Vc_INTRINSIC VectorType Vc_CONST notMaskedToZero(VTArg a, param256 mask) { return CAT(_mm256_and_, SUFFIX)(_mm256_castps_si256(mask), a); } - -#undef SUFFIX -#define SUFFIX epu32 - static Vc_INTRINSIC VectorType Vc_CONST one() { return CAT(_mm256_setone_, SUFFIX)(); } - - MINMAX - static Vc_INTRINSIC EntryType Vc_CONST min(VTArg a) { - m128i b = _mm_min_epu32(avx_cast(a), _mm256_extractf128_si256(a, 1)); - b = _mm_min_epu32(b, _mm_shuffle_epi32(b, _MM_SHUFFLE(1, 0, 3, 2))); - b = _mm_min_epu32(b, _mm_shufflelo_epi16(b, _MM_SHUFFLE(1, 0, 3, 2))); // using lo_epi16 for speed here - return _mm_cvtsi128_si32(b); - } - static Vc_INTRINSIC EntryType Vc_CONST max(VTArg a) { - m128i b = _mm_max_epu32(avx_cast(a), 
_mm256_extractf128_si256(a, 1)); - b = _mm_max_epu32(b, _mm_shuffle_epi32(b, _MM_SHUFFLE(1, 0, 3, 2))); - b = _mm_max_epu32(b, _mm_shufflelo_epi16(b, _MM_SHUFFLE(1, 0, 3, 2))); // using lo_epi16 for speed here - return _mm_cvtsi128_si32(b); - } - static Vc_INTRINSIC EntryType Vc_CONST add(VTArg a) { - m128i b = _mm_add_epi32(avx_cast(a), _mm256_extractf128_si256(a, 1)); - b = _mm_add_epi32(b, _mm_shuffle_epi32(b, _MM_SHUFFLE(1, 0, 3, 2))); - b = _mm_add_epi32(b, _mm_shufflelo_epi16(b, _MM_SHUFFLE(1, 0, 3, 2))); - return _mm_cvtsi128_si32(b); - } - static Vc_INTRINSIC EntryType Vc_CONST mul(VTArg a) { - m128i b = _mm_mullo_epi32(avx_cast(a), _mm256_extractf128_si256(a, 1)); - b = _mm_mullo_epi32(b, _mm_shuffle_epi32(b, _MM_SHUFFLE(1, 0, 3, 2))); - b = _mm_mullo_epi32(b, _mm_shufflelo_epi16(b, _MM_SHUFFLE(1, 0, 3, 2))); - return _mm_cvtsi128_si32(b); - } - - static Vc_INTRINSIC VectorType Vc_CONST mul(VTArg a, VTArg b) { return _mm256_mullo_epi32(a, b); } - static Vc_INTRINSIC void fma(VectorType &v1, VTArg v2, VTArg v3) { v1 = add(mul(v1, v2), v3); } - -#undef SUFFIX -#define SUFFIX epi32 - static Vc_ALWAYS_INLINE Vc_CONST VectorType shiftLeft(VTArg a, int shift) { - return CAT(_mm256_slli_, SUFFIX)(a, shift); - } - static Vc_ALWAYS_INLINE Vc_CONST VectorType shiftRight(VTArg a, int shift) { - return CAT(_mm256_srli_, SUFFIX)(a, shift); - } - static Vc_INTRINSIC VectorType Vc_CONST set(const unsigned int a) { return CAT(_mm256_set1_, SUFFIX)(a); } - static Vc_INTRINSIC VectorType Vc_CONST set(const unsigned int a, const unsigned int b, const unsigned int c, const unsigned int d, - const unsigned int e, const unsigned int f, const unsigned int g, const unsigned int h) { - return CAT(_mm256_set_, SUFFIX)(a, b, c, d, e, f, g, h); } - - OP(add) OP(sub) - OPcmp(eq) - static Vc_INTRINSIC VectorType Vc_CONST cmpneq(VTArg a, VTArg b) { return _mm256_andnot_si256(cmpeq(a, b), _mm256_setallone_si256()); } - -#ifndef USE_INCORRECT_UNSIGNED_COMPARE - static Vc_INTRINSIC VectorType Vc_CONST cmplt(VTArg a, VTArg b) { - return _mm256_cmplt_epu32(a, b); - } - static Vc_INTRINSIC VectorType Vc_CONST cmpgt(VTArg a, VTArg b) { - return _mm256_cmpgt_epu32(a, b); - } -#else - OPcmp(lt) - OPcmp(gt) -#endif - static Vc_INTRINSIC VectorType Vc_CONST cmpnlt(VTArg a, VTArg b) { return _mm256_andnot_si256(cmplt(a, b), _mm256_setallone_si256()); } - static Vc_INTRINSIC VectorType Vc_CONST cmple (VTArg a, VTArg b) { return _mm256_andnot_si256(cmpgt(a, b), _mm256_setallone_si256()); } - static Vc_INTRINSIC VectorType Vc_CONST cmpnle(VTArg a, VTArg b) { return cmpgt(a, b); } - -#undef SUFFIX - static Vc_INTRINSIC VectorType Vc_CONST round(VTArg a) { return a; } - }; - - template<> struct VectorHelper { - typedef VectorTypeHelper::Type VectorType; -#ifdef VC_PASSING_VECTOR_BY_VALUE_IS_BROKEN - typedef const VectorType & VTArg; -#else - typedef const VectorType VTArg; -#endif - typedef signed short EntryType; - typedef int ConcatType; - - static Vc_INTRINSIC VectorType Vc_CONST or_(VTArg a, VTArg b) { return _mm_or_si128(a, b); } - static Vc_INTRINSIC VectorType Vc_CONST and_(VTArg a, VTArg b) { return _mm_and_si128(a, b); } - static Vc_INTRINSIC VectorType Vc_CONST xor_(VTArg a, VTArg b) { return _mm_xor_si128(a, b); } - static Vc_INTRINSIC VectorType Vc_CONST zero() { return _mm_setzero_si128(); } - static Vc_INTRINSIC VectorType Vc_CONST notMaskedToZero(VTArg a, param128 mask) { return _mm_and_si128(_mm_castps_si128(mask), a); } - -#define SUFFIX epi16 - static Vc_INTRINSIC VectorType Vc_CONST one() { return 
CAT(_mm_setone_, SUFFIX)(); } - - static Vc_ALWAYS_INLINE Vc_CONST VectorType shiftLeft(VTArg a, int shift) { - return CAT(_mm_slli_, SUFFIX)(a, shift); - } - static Vc_ALWAYS_INLINE Vc_CONST VectorType shiftRight(VTArg a, int shift) { - return CAT(_mm_srai_, SUFFIX)(a, shift); - } - static Vc_INTRINSIC VectorType Vc_CONST set(const EntryType a) { return CAT(_mm_set1_, SUFFIX)(a); } - static Vc_INTRINSIC VectorType Vc_CONST set(const EntryType a, const EntryType b, const EntryType c, const EntryType d, - const EntryType e, const EntryType f, const EntryType g, const EntryType h) { - return CAT(_mm_set_, SUFFIX)(a, b, c, d, e, f, g, h); - } - - static Vc_INTRINSIC void fma(VectorType &v1, VTArg v2, VTArg v3) { - v1 = add(mul(v1, v2), v3); - } - - static Vc_INTRINSIC VectorType Vc_CONST abs(VTArg a) { return _mm_abs_epi16(a); } - static Vc_INTRINSIC VectorType Vc_CONST mul(VTArg a, VTArg b) { return _mm_mullo_epi16(a, b); } - static Vc_INTRINSIC VectorType Vc_CONST min(VTArg a, VTArg b) { return _mm_min_epi16(a, b); } - static Vc_INTRINSIC VectorType Vc_CONST max(VTArg a, VTArg b) { return _mm_max_epi16(a, b); } - - static Vc_INTRINSIC EntryType Vc_CONST min(VTArg _a) { - // reminder: _MM_SHUFFLE(3, 2, 1, 0) means "no change" - VectorType a = min(_a, _mm_shuffle_epi32(_a, _MM_SHUFFLE(1, 0, 3, 2))); - a = min(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2))); - a = min(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 1, 1, 1))); - return _mm_cvtsi128_si32(a); // & 0xffff is implicit - } - static Vc_INTRINSIC EntryType Vc_CONST max(VTArg _a) { - // reminder: _MM_SHUFFLE(3, 2, 1, 0) means "no change" - VectorType a = max(_a, _mm_shuffle_epi32(_a, _MM_SHUFFLE(1, 0, 3, 2))); - a = max(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2))); - a = max(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 1, 1, 1))); - return _mm_cvtsi128_si32(a); // & 0xffff is implicit - } - static Vc_INTRINSIC EntryType Vc_CONST mul(VTArg _a) { - VectorType a = mul(_a, _mm_shuffle_epi32(_a, _MM_SHUFFLE(1, 0, 3, 2))); - a = mul(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2))); - a = mul(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 1, 1, 1))); - return _mm_cvtsi128_si32(a); // & 0xffff is implicit - } - static Vc_INTRINSIC EntryType Vc_CONST add(VTArg _a) { - VectorType a = add(_a, _mm_shuffle_epi32(_a, _MM_SHUFFLE(1, 0, 3, 2))); - a = add(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2))); - a = add(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 1, 1, 1))); - return _mm_cvtsi128_si32(a); // & 0xffff is implicit - } - - static Vc_INTRINSIC VectorType Vc_CONST add(VTArg a, VTArg b) { return _mm_add_epi16(a, b); } - static Vc_INTRINSIC VectorType Vc_CONST sub(VTArg a, VTArg b) { return _mm_sub_epi16(a, b); } - static Vc_INTRINSIC VectorType Vc_CONST cmpeq(VTArg a, VTArg b) { return _mm_cmpeq_epi16(a, b); } - static Vc_INTRINSIC VectorType Vc_CONST cmplt(VTArg a, VTArg b) { return _mm_cmplt_epi16(a, b); } - static Vc_INTRINSIC VectorType Vc_CONST cmpgt(VTArg a, VTArg b) { return _mm_cmpgt_epi16(a, b); } - static Vc_ALWAYS_INLINE Vc_CONST VectorType cmpneq(VTArg a, VTArg b) { m128i x = cmpeq(a, b); return _mm_andnot_si128(x, _mm_setallone_si128()); } - static Vc_ALWAYS_INLINE Vc_CONST VectorType cmpnlt(VTArg a, VTArg b) { m128i x = cmplt(a, b); return _mm_andnot_si128(x, _mm_setallone_si128()); } - static Vc_ALWAYS_INLINE Vc_CONST VectorType cmple (VTArg a, VTArg b) { m128i x = cmpgt(a, b); return _mm_andnot_si128(x, _mm_setallone_si128()); } - static Vc_ALWAYS_INLINE Vc_CONST VectorType cmpnle(VTArg a, VTArg b) { return cmpgt(a, b); } -#undef 
SUFFIX - static Vc_ALWAYS_INLINE Vc_CONST VectorType round(VTArg a) { return a; } - }; - - template<> struct VectorHelper { - typedef VectorTypeHelper::Type VectorType; -#ifdef VC_PASSING_VECTOR_BY_VALUE_IS_BROKEN - typedef const VectorType & VTArg; -#else - typedef const VectorType VTArg; -#endif - typedef unsigned short EntryType; - typedef unsigned int ConcatType; - - static Vc_INTRINSIC VectorType Vc_CONST or_(VTArg a, VTArg b) { return _mm_or_si128(a, b); } - static Vc_INTRINSIC VectorType Vc_CONST and_(VTArg a, VTArg b) { return _mm_and_si128(a, b); } - static Vc_INTRINSIC VectorType Vc_CONST xor_(VTArg a, VTArg b) { return _mm_xor_si128(a, b); } - static Vc_INTRINSIC VectorType Vc_CONST zero() { return _mm_setzero_si128(); } - static Vc_INTRINSIC VectorType Vc_CONST notMaskedToZero(VTArg a, param128 mask) { return _mm_and_si128(_mm_castps_si128(mask), a); } - static Vc_INTRINSIC VectorType Vc_CONST one() { return _mm_setone_epu16(); } - - static Vc_INTRINSIC VectorType Vc_CONST mul(VTArg a, VTArg b) { return _mm_mullo_epi16(a, b); } - static Vc_INTRINSIC VectorType Vc_CONST min(VTArg a, VTArg b) { return _mm_min_epu16(a, b); } - static Vc_INTRINSIC VectorType Vc_CONST max(VTArg a, VTArg b) { return _mm_max_epu16(a, b); } - -#define SUFFIX epi16 - static Vc_ALWAYS_INLINE Vc_CONST VectorType shiftLeft(VTArg a, int shift) { - return CAT(_mm_slli_, SUFFIX)(a, shift); - } - static Vc_ALWAYS_INLINE Vc_CONST VectorType shiftRight(VTArg a, int shift) { - return CAT(_mm_srli_, SUFFIX)(a, shift); - } - static Vc_INTRINSIC EntryType Vc_CONST min(VTArg _a) { - // reminder: _MM_SHUFFLE(3, 2, 1, 0) means "no change" - VectorType a = min(_a, _mm_shuffle_epi32(_a, _MM_SHUFFLE(1, 0, 3, 2))); - a = min(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2))); - a = min(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 1, 1, 1))); - return _mm_cvtsi128_si32(a); // & 0xffff is implicit - } - static Vc_INTRINSIC EntryType Vc_CONST max(VTArg _a) { - // reminder: _MM_SHUFFLE(3, 2, 1, 0) means "no change" - VectorType a = max(_a, _mm_shuffle_epi32(_a, _MM_SHUFFLE(1, 0, 3, 2))); - a = max(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2))); - a = max(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 1, 1, 1))); - return _mm_cvtsi128_si32(a); // & 0xffff is implicit - } - static Vc_INTRINSIC EntryType Vc_CONST mul(VTArg _a) { - // reminder: _MM_SHUFFLE(3, 2, 1, 0) means "no change" - VectorType a = mul(_a, _mm_shuffle_epi32(_a, _MM_SHUFFLE(1, 0, 3, 2))); - a = mul(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2))); - a = mul(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 1, 1, 1))); - return _mm_cvtsi128_si32(a); // & 0xffff is implicit - } - static Vc_INTRINSIC EntryType Vc_CONST add(VTArg _a) { - // reminder: _MM_SHUFFLE(3, 2, 1, 0) means "no change" - VectorType a = add(_a, _mm_shuffle_epi32(_a, _MM_SHUFFLE(1, 0, 3, 2))); - a = add(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2))); - a = add(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 1, 1, 1))); - return _mm_cvtsi128_si32(a); // & 0xffff is implicit - } - static Vc_INTRINSIC VectorType Vc_CONST set(const EntryType a) { return CAT(_mm_set1_, SUFFIX)(a); } - static Vc_INTRINSIC VectorType Vc_CONST set(const EntryType a, const EntryType b, const EntryType c, - const EntryType d, const EntryType e, const EntryType f, - const EntryType g, const EntryType h) { - return CAT(_mm_set_, SUFFIX)(a, b, c, d, e, f, g, h); - } - static Vc_INTRINSIC void fma(VectorType &v1, VTArg v2, VTArg v3) { v1 = add(mul(v1, v2), v3); } - - static Vc_INTRINSIC VectorType Vc_CONST add(VTArg a, VTArg b) { return 
_mm_add_epi16(a, b); } - static Vc_INTRINSIC VectorType Vc_CONST sub(VTArg a, VTArg b) { return _mm_sub_epi16(a, b); } - static Vc_INTRINSIC VectorType Vc_CONST cmpeq(VTArg a, VTArg b) { return _mm_cmpeq_epi16(a, b); } - static Vc_ALWAYS_INLINE Vc_CONST VectorType cmpneq(VTArg a, VTArg b) { return _mm_andnot_si128(cmpeq(a, b), _mm_setallone_si128()); } - -#ifndef USE_INCORRECT_UNSIGNED_COMPARE - static Vc_INTRINSIC VectorType Vc_CONST cmplt(VTArg a, VTArg b) { return _mm_cmplt_epu16(a, b); } - static Vc_INTRINSIC VectorType Vc_CONST cmpgt(VTArg a, VTArg b) { return _mm_cmpgt_epu16(a, b); } -#else - static Vc_INTRINSIC VectorType Vc_CONST cmplt(VTArg a, VTArg b) { return _mm_cmplt_epi16(a, b); } - static Vc_INTRINSIC VectorType Vc_CONST cmpgt(VTArg a, VTArg b) { return _mm_cmpgt_epi16(a, b); } -#endif - static Vc_ALWAYS_INLINE Vc_CONST VectorType cmpnlt(VTArg a, VTArg b) { return _mm_andnot_si128(cmplt(a, b), _mm_setallone_si128()); } - static Vc_ALWAYS_INLINE Vc_CONST VectorType cmple (VTArg a, VTArg b) { return _mm_andnot_si128(cmpgt(a, b), _mm_setallone_si128()); } - static Vc_ALWAYS_INLINE Vc_CONST VectorType cmpnle(VTArg a, VTArg b) { return cmpgt(a, b); } -#undef SUFFIX - static Vc_ALWAYS_INLINE Vc_CONST VectorType round(VTArg a) { return a; } - }; -#undef OP1 -#undef OP -#undef OP_ -#undef OPx -#undef OPcmp - -template<> struct VectorHelper -{ - typedef VectorTypeHelper::Type VectorType; -#ifdef VC_PASSING_VECTOR_BY_VALUE_IS_BROKEN - typedef const VectorType & VTArg; -#else - typedef const VectorType VTArg; -#endif - typedef char EntryType; - typedef short ConcatType; -}; - -template<> struct VectorHelper -{ - typedef VectorTypeHelper::Type VectorType; -#ifdef VC_PASSING_VECTOR_BY_VALUE_IS_BROKEN - typedef const VectorType & VTArg; -#else - typedef const VectorType VTArg; -#endif - typedef unsigned char EntryType; - typedef unsigned short ConcatType; -}; - -} // namespace AVX -} // namespace Vc -/*OUTER_NAMESPACE_END*/ +#undef Vc_OP1 +#undef Vc_OP +#undef Vc_OP_ +#undef Vc_OPx -#include "vectorhelper.tcc" -#include "undomacros.h" +} // namespace AVX(2) +} // namespace Vc -#endif // AVX_VECTORHELPER_H +#endif // VC_AVX_VECTORHELPER_H_ diff -Nru vc-0.7.4/avx/vectorhelper.tcc vc-1.3.0/avx/vectorhelper.tcc --- vc-0.7.4/avx/vectorhelper.tcc 2014-05-15 06:38:24.000000000 -0500 +++ vc-1.3.0/avx/vectorhelper.tcc 1969-12-31 18:00:00.000000000 -0600 @@ -1,270 +0,0 @@ -/* This file is part of the Vc library. - - Copyright (C) 2009-2011 Matthias Kretz - - Vc is free software: you can redistribute it and/or modify - it under the terms of the GNU Lesser General Public License as - published by the Free Software Foundation, either version 3 of - the License, or (at your option) any later version. - - Vc is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with Vc. If not, see . 
- -*/ - -#include "casts.h" -#include - -/*OUTER_NAMESPACE_BEGIN*/ -namespace Vc -{ -namespace AVX -{ - -//////////////////////////////////////////////////////////////////////////////////////////////////// -// float_v -//////////////////////////////////////////////////////////////////////////////////////////////////// -//// loads -template<> Vc_ALWAYS_INLINE Vc_PURE m256 VectorHelper::load(const float *m, AlignedFlag) -{ - return _mm256_load_ps(m); -} -template<> Vc_ALWAYS_INLINE Vc_PURE m256 VectorHelper::load(const float *m, UnalignedFlag) -{ - return _mm256_loadu_ps(m); -} -template<> Vc_ALWAYS_INLINE Vc_PURE m256 VectorHelper::load(const float *m, StreamingAndAlignedFlag) -{ - return avx_cast(concat(_mm_stream_load_si128(reinterpret_cast<__m128i *>(const_cast(m))), - _mm_stream_load_si128(reinterpret_cast<__m128i *>(const_cast(&m[4]))))); -} -template<> Vc_ALWAYS_INLINE Vc_PURE m256 - VC_WARN("AVX does not support streaming unaligned loads. Will use non-streaming unaligned load instead.") -VectorHelper::load(const float *m, StreamingAndUnalignedFlag) -{ - return _mm256_loadu_ps(m); -} -//////////////////////////////////////////////////////////////////////////////////////////////////// -//// stores -Vc_ALWAYS_INLINE void VectorHelper::store(float *mem, VTArg x, AlignedFlag) -{ - _mm256_store_ps(mem, x); -} -Vc_ALWAYS_INLINE void VectorHelper::store(float *mem, VTArg x, UnalignedFlag) -{ - _mm256_storeu_ps(mem, x); -} -Vc_ALWAYS_INLINE void VectorHelper::store(float *mem, VTArg x, StreamingAndAlignedFlag) -{ - _mm256_stream_ps(mem, x); -} -Vc_ALWAYS_INLINE void VectorHelper::store(float *mem, VTArg x, StreamingAndUnalignedFlag) -{ - _mm_maskmoveu_si128(avx_cast(x), _mm_setallone_si128(), reinterpret_cast(mem)); - _mm_maskmoveu_si128(_mm256_extractf128_si256(avx_cast(x), 1), _mm_setallone_si128(), reinterpret_cast(mem + 4)); -} -Vc_ALWAYS_INLINE void VectorHelper::store(float *mem, VTArg x, VTArg m, AlignedFlag) -{ - _mm256_maskstore(mem, m, x); -} -Vc_ALWAYS_INLINE void VectorHelper::store(float *mem, VTArg x, VTArg m, UnalignedFlag) -{ - _mm256_maskstore(mem, m, x); -} -Vc_ALWAYS_INLINE void VectorHelper::store(float *mem, VTArg x, VTArg m, StreamingAndAlignedFlag) -{ - _mm_maskmoveu_si128(avx_cast(x), avx_cast(m), reinterpret_cast(mem)); - _mm_maskmoveu_si128(_mm256_extractf128_si256(avx_cast(x), 1), _mm256_extractf128_si256(avx_cast(m), 1), reinterpret_cast(mem + 4)); -} -Vc_ALWAYS_INLINE void VectorHelper::store(float *mem, VTArg x, VTArg m, StreamingAndUnalignedFlag) -{ - _mm_maskmoveu_si128(avx_cast(x), avx_cast(m), reinterpret_cast(mem)); - _mm_maskmoveu_si128(_mm256_extractf128_si256(avx_cast(x), 1), _mm256_extractf128_si256(avx_cast(m), 1), reinterpret_cast(mem + 4)); -} - -//////////////////////////////////////////////////////////////////////////////////////////////////// -// double_v -//////////////////////////////////////////////////////////////////////////////////////////////////// -//// loads -template<> Vc_ALWAYS_INLINE Vc_PURE m256d VectorHelper::load(const double *m, AlignedFlag) -{ - return _mm256_load_pd(m); -} -template<> Vc_ALWAYS_INLINE Vc_PURE m256d VectorHelper::load(const double *m, UnalignedFlag) -{ - return _mm256_loadu_pd(m); -} -template<> Vc_ALWAYS_INLINE Vc_PURE m256d VectorHelper::load(const double *m, StreamingAndAlignedFlag) -{ - return avx_cast(concat( - _mm_stream_load_si128(reinterpret_cast<__m128i *>(const_cast(m))), - _mm_stream_load_si128(reinterpret_cast<__m128i *>(const_cast(&m[2]))))); -} -template<> Vc_ALWAYS_INLINE Vc_PURE m256d - VC_WARN("AVX 
does not support streaming unaligned loads. Will use non-streaming unaligned load instead.") -VectorHelper::load(const double *m, StreamingAndUnalignedFlag) -{ - return _mm256_loadu_pd(m); -} -//////////////////////////////////////////////////////////////////////////////////////////////////// -//// stores -Vc_ALWAYS_INLINE void VectorHelper::store(double *mem, VTArg x, AlignedFlag) -{ - _mm256_store_pd(mem, x); -} -Vc_ALWAYS_INLINE void VectorHelper::store(double *mem, VTArg x, UnalignedFlag) -{ - _mm256_storeu_pd(mem, x); -} -Vc_ALWAYS_INLINE void VectorHelper::store(double *mem, VTArg x, StreamingAndAlignedFlag) -{ - _mm256_stream_pd(mem, x); -} -Vc_ALWAYS_INLINE void VectorHelper::store(double *mem, VTArg x, StreamingAndUnalignedFlag) -{ - _mm_maskmoveu_si128(avx_cast(x), _mm_setallone_si128(), reinterpret_cast(mem)); - _mm_maskmoveu_si128(avx_cast(_mm256_extractf128_pd(x, 1)), _mm_setallone_si128(), reinterpret_cast(mem + 2)); -} -Vc_ALWAYS_INLINE void VectorHelper::store(double *mem, VTArg x, VTArg m, AlignedFlag) -{ - _mm256_maskstore(mem, m, x); -} -Vc_ALWAYS_INLINE void VectorHelper::store(double *mem, VTArg x, VTArg m, UnalignedFlag) -{ - _mm256_maskstore(mem, m, x); -} -Vc_ALWAYS_INLINE void VectorHelper::store(double *mem, VTArg x, VTArg m, StreamingAndAlignedFlag) -{ - _mm_maskmoveu_si128(avx_cast(x), avx_cast(m), reinterpret_cast(mem)); - _mm_maskmoveu_si128(avx_cast(_mm256_extractf128_pd(x, 1)), avx_cast(_mm256_extractf128_pd(m, 1)), reinterpret_cast(mem + 2)); -} -Vc_ALWAYS_INLINE void VectorHelper::store(double *mem, VTArg x, VTArg m, StreamingAndUnalignedFlag) -{ - _mm_maskmoveu_si128(avx_cast(x), avx_cast(m), reinterpret_cast(mem)); - _mm_maskmoveu_si128(avx_cast(_mm256_extractf128_pd(x, 1)), avx_cast(_mm256_extractf128_pd(m, 1)), reinterpret_cast(mem + 2)); -} -//////////////////////////////////////////////////////////////////////////////////////////////////// -// (u)int_v -//////////////////////////////////////////////////////////////////////////////////////////////////// -//// loads -template Vc_ALWAYS_INLINE Vc_PURE m256i VectorHelper::load(const T *m, AlignedFlag) -{ - return _mm256_load_si256(reinterpret_cast(m)); -} -template Vc_ALWAYS_INLINE Vc_PURE m256i VectorHelper::load(const T *m, UnalignedFlag) -{ - return _mm256_loadu_si256(reinterpret_cast(m)); -} -template Vc_ALWAYS_INLINE Vc_PURE m256i VectorHelper::load(const T *m, StreamingAndAlignedFlag) -{ - return concat(_mm_stream_load_si128(reinterpret_cast<__m128i *>(const_cast(m))), - _mm_stream_load_si128(reinterpret_cast<__m128i *>(const_cast(&m[4])))); -} -template Vc_ALWAYS_INLINE Vc_PURE m256i - VC_WARN("AVX does not support streaming unaligned loads. 
Will use non-streaming unaligned load instead.") -VectorHelper::load(const T *m, StreamingAndUnalignedFlag) -{ - return _mm256_loadu_si256(reinterpret_cast(m)); -} -//////////////////////////////////////////////////////////////////////////////////////////////////// -//// stores -template Vc_ALWAYS_INLINE void VectorHelper::store(T *mem, VTArg x, AlignedFlag) -{ - _mm256_store_si256(reinterpret_cast<__m256i *>(mem), x); -} -template Vc_ALWAYS_INLINE void VectorHelper::store(T *mem, VTArg x, UnalignedFlag) -{ - _mm256_storeu_si256(reinterpret_cast<__m256i *>(mem), x); -} -template Vc_ALWAYS_INLINE void VectorHelper::store(T *mem, VTArg x, StreamingAndAlignedFlag) -{ - _mm256_stream_si256(reinterpret_cast<__m256i *>(mem), x); -} -template Vc_ALWAYS_INLINE void VectorHelper::store(T *mem, VTArg x, StreamingAndUnalignedFlag) -{ - _mm_maskmoveu_si128(avx_cast(x), _mm_setallone_si128(), reinterpret_cast(mem)); - _mm_maskmoveu_si128(_mm256_extractf128_si256(x, 1), _mm_setallone_si128(), reinterpret_cast(mem + 4)); -} -template Vc_ALWAYS_INLINE void VectorHelper::store(T *mem, VTArg x, VTArg m, AlignedFlag) -{ - _mm256_maskstore(mem, m, x); -} -template Vc_ALWAYS_INLINE void VectorHelper::store(T *mem, VTArg x, VTArg m, UnalignedFlag) -{ - _mm256_maskstore(mem, m, x); -} -template Vc_ALWAYS_INLINE void VectorHelper::store(T *mem, VTArg x, VTArg m, StreamingAndAlignedFlag) -{ - _mm_maskmoveu_si128(lo128(x), lo128(m), reinterpret_cast(mem)); - _mm_maskmoveu_si128(hi128(x), hi128(m), reinterpret_cast(mem + 4)); -} -template Vc_ALWAYS_INLINE void VectorHelper::store(T *mem, VTArg x, VTArg m, StreamingAndUnalignedFlag) -{ - _mm_maskmoveu_si128(lo128(x), lo128(m), reinterpret_cast(mem)); - _mm_maskmoveu_si128(hi128(x), hi128(m), reinterpret_cast(mem + 4)); -} -//////////////////////////////////////////////////////////////////////////////////////////////////// -// (u)short_v -//////////////////////////////////////////////////////////////////////////////////////////////////// -//// loads -template Vc_ALWAYS_INLINE Vc_PURE m128i VectorHelper::load(const T *m, AlignedFlag) -{ - return _mm_load_si128(reinterpret_cast(m)); -} -template Vc_ALWAYS_INLINE Vc_PURE m128i VectorHelper::load(const T *m, UnalignedFlag) -{ - return _mm_loadu_si128(reinterpret_cast(m)); -} -template Vc_ALWAYS_INLINE Vc_PURE m128i VectorHelper::load(const T *m, StreamingAndAlignedFlag) -{ - return _mm_stream_load_si128(reinterpret_cast<__m128i *>(const_cast(m))); -} -template Vc_ALWAYS_INLINE Vc_PURE m128i - VC_WARN("AVX does not support streaming unaligned loads. 
Will use non-streaming unaligned load instead.") -VectorHelper::load(const T *m, StreamingAndUnalignedFlag) -{ - return _mm_loadu_si128(reinterpret_cast(m)); -} -//////////////////////////////////////////////////////////////////////////////////////////////////// -//// stores -template Vc_ALWAYS_INLINE void VectorHelper::store(T *mem, VTArg x, AlignedFlag) -{ - _mm_store_si128(reinterpret_cast<__m128i *>(mem), x); -} -template Vc_ALWAYS_INLINE void VectorHelper::store(T *mem, VTArg x, UnalignedFlag) -{ - _mm_storeu_si128(reinterpret_cast<__m128i *>(mem), x); -} -template Vc_ALWAYS_INLINE void VectorHelper::store(T *mem, VTArg x, StreamingAndAlignedFlag) -{ - _mm_stream_si128(reinterpret_cast<__m128i *>(mem), x); -} -template Vc_ALWAYS_INLINE void VectorHelper::store(T *mem, VTArg x, StreamingAndUnalignedFlag) -{ - _mm_maskmoveu_si128(x, _mm_setallone_si128(), reinterpret_cast(mem)); -} -template Vc_ALWAYS_INLINE void VectorHelper::store(T *mem, VTArg x, VTArg m, AlignedFlag align) -{ - store(mem, _mm_blendv_epi8(load(mem, align), x, m), align); -} -template Vc_ALWAYS_INLINE void VectorHelper::store(T *mem, VTArg x, VTArg m, UnalignedFlag align) -{ - store(mem, _mm_blendv_epi8(load(mem, align), x, m), align); -} -template Vc_ALWAYS_INLINE void VectorHelper::store(T *mem, VTArg x, VTArg m, StreamingAndAlignedFlag) -{ - _mm_maskmoveu_si128(x, m, reinterpret_cast(mem)); -} -template Vc_ALWAYS_INLINE void VectorHelper::store(T *mem, VTArg x, VTArg m, StreamingAndUnalignedFlag) -{ - _mm_maskmoveu_si128(x, m, reinterpret_cast(mem)); -} - -} // namespace AVX -} // namespace Vc -/*OUTER_NAMESPACE_END*/ diff -Nru vc-0.7.4/avx/vector.tcc vc-1.3.0/avx/vector.tcc --- vc-0.7.4/avx/vector.tcc 2014-05-15 06:38:24.000000000 -0500 +++ vc-1.3.0/avx/vector.tcc 2016-10-27 02:05:02.000000000 -0500 @@ -1,1406 +1,918 @@ -/* This file is part of the Vc library. - - Copyright (C) 2011-2012 Matthias Kretz - - Vc is free software: you can redistribute it and/or modify - it under the terms of the GNU Lesser General Public License as - published by the Free Software Foundation, either version 3 of - the License, or (at your option) any later version. - - Vc is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with Vc. If not, see . - -*/ +/* This file is part of the Vc library. {{{ +Copyright © 2011-2015 Matthias Kretz +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the names of contributing organizations nor the + names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY +DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +}}}*/ + +#include "../common/x86_prefetches.h" +#include "../common/gatherimplementation.h" +#include "../common/scatterimplementation.h" #include "limits.h" #include "const.h" +#include "../common/set.h" #include "macros.h" -/*OUTER_NAMESPACE_BEGIN*/ -namespace Vc +namespace Vc_VERSIONED_NAMESPACE { -ALIGN(64) extern unsigned int RandomState[16]; - -namespace AVX +namespace Detail { - +// compare operators {{{1 +Vc_INTRINSIC AVX2::double_m operator==(AVX2::double_v a, AVX2::double_v b) { return AVX::cmpeq_pd(a.data(), b.data()); } +Vc_INTRINSIC AVX2:: float_m operator==(AVX2:: float_v a, AVX2:: float_v b) { return AVX::cmpeq_ps(a.data(), b.data()); } +Vc_INTRINSIC AVX2:: int_m operator==(AVX2:: int_v a, AVX2:: int_v b) { return AVX::cmpeq_epi32(a.data(), b.data()); } +Vc_INTRINSIC AVX2:: uint_m operator==(AVX2:: uint_v a, AVX2:: uint_v b) { return AVX::cmpeq_epi32(a.data(), b.data()); } +Vc_INTRINSIC AVX2:: short_m operator==(AVX2:: short_v a, AVX2:: short_v b) { return AVX::cmpeq_epi16(a.data(), b.data()); } +Vc_INTRINSIC AVX2::ushort_m operator==(AVX2::ushort_v a, AVX2::ushort_v b) { return AVX::cmpeq_epi16(a.data(), b.data()); } + +Vc_INTRINSIC AVX2::double_m operator!=(AVX2::double_v a, AVX2::double_v b) { return AVX::cmpneq_pd(a.data(), b.data()); } +Vc_INTRINSIC AVX2:: float_m operator!=(AVX2:: float_v a, AVX2:: float_v b) { return AVX::cmpneq_ps(a.data(), b.data()); } +Vc_INTRINSIC AVX2:: int_m operator!=(AVX2:: int_v a, AVX2:: int_v b) { return not_(AVX::cmpeq_epi32(a.data(), b.data())); } +Vc_INTRINSIC AVX2:: uint_m operator!=(AVX2:: uint_v a, AVX2:: uint_v b) { return not_(AVX::cmpeq_epi32(a.data(), b.data())); } +Vc_INTRINSIC AVX2:: short_m operator!=(AVX2:: short_v a, AVX2:: short_v b) { return not_(AVX::cmpeq_epi16(a.data(), b.data())); } +Vc_INTRINSIC AVX2::ushort_m operator!=(AVX2::ushort_v a, AVX2::ushort_v b) { return not_(AVX::cmpeq_epi16(a.data(), b.data())); } + +Vc_INTRINSIC AVX2::double_m operator>=(AVX2::double_v a, AVX2::double_v b) { return AVX::cmpnlt_pd(a.data(), b.data()); } +Vc_INTRINSIC AVX2:: float_m operator>=(AVX2:: float_v a, AVX2:: float_v b) { return AVX::cmpnlt_ps(a.data(), b.data()); } +Vc_INTRINSIC AVX2:: int_m operator>=(AVX2:: int_v a, AVX2:: int_v b) { return not_(AVX::cmplt_epi32(a.data(), b.data())); } +Vc_INTRINSIC AVX2:: uint_m operator>=(AVX2:: uint_v a, AVX2:: uint_v b) { return not_(AVX::cmplt_epu32(a.data(), b.data())); } +Vc_INTRINSIC AVX2:: short_m operator>=(AVX2:: short_v a, AVX2:: short_v b) { return not_(AVX::cmplt_epi16(a.data(), b.data())); } +Vc_INTRINSIC AVX2::ushort_m operator>=(AVX2::ushort_v a, AVX2::ushort_v b) { return not_(AVX::cmplt_epu16(a.data(), b.data())); } + +Vc_INTRINSIC AVX2::double_m operator<=(AVX2::double_v a, AVX2::double_v b) { return AVX::cmple_pd(a.data(), b.data()); } +Vc_INTRINSIC AVX2:: float_m operator<=(AVX2:: float_v a, AVX2:: float_v b) { return AVX::cmple_ps(a.data(), b.data()); } +Vc_INTRINSIC AVX2:: int_m operator<=(AVX2:: int_v a, AVX2:: int_v b) { return not_(AVX::cmpgt_epi32(a.data(), b.data())); 
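Two conventions carry this operator block: for the integer types the derived operators (!=, >=, <=) are just the negated primitive compare (not_ of cmpeq/cmplt/cmpgt), and the unsigned compares (cmpgt_epu32 and friends) have no direct AVX2 intrinsic, so they are conventionally emulated by flipping the sign bit of both operands and reusing the signed compare. A hedged sketch of that idea, not necessarily Vc's exact helpers; the _sketch names are illustrative:

    #include <immintrin.h>
    #include <climits>

    // Sketch: unsigned 32-bit "greater than" via the signed compare.
    // (a > b) unsigned  <=>  (a ^ 0x80000000) > (b ^ 0x80000000) signed.
    static inline __m256i cmpgt_epu32_sketch(__m256i a, __m256i b)
    {
        const __m256i bias = _mm256_set1_epi32(INT_MIN);   // flip the sign bit
        return _mm256_cmpgt_epi32(_mm256_xor_si256(a, bias),
                                  _mm256_xor_si256(b, bias));
    }

    // Sketch: a >= b derived as NOT(a < b), i.e. NOT(b > a), matching the
    // not_(cmplt...) pattern used for operator>= above.
    static inline __m256i cmpge_epu32_sketch(__m256i a, __m256i b)
    {
        return _mm256_xor_si256(cmpgt_epu32_sketch(b, a),
                                _mm256_set1_epi32(-1));    // XOR with all-ones = bitwise NOT
    }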
} +Vc_INTRINSIC AVX2:: uint_m operator<=(AVX2:: uint_v a, AVX2:: uint_v b) { return not_(AVX::cmpgt_epu32(a.data(), b.data())); } +Vc_INTRINSIC AVX2:: short_m operator<=(AVX2:: short_v a, AVX2:: short_v b) { return not_(AVX::cmpgt_epi16(a.data(), b.data())); } +Vc_INTRINSIC AVX2::ushort_m operator<=(AVX2::ushort_v a, AVX2::ushort_v b) { return not_(AVX::cmpgt_epu16(a.data(), b.data())); } + +Vc_INTRINSIC AVX2::double_m operator> (AVX2::double_v a, AVX2::double_v b) { return AVX::cmpgt_pd(a.data(), b.data()); } +Vc_INTRINSIC AVX2:: float_m operator> (AVX2:: float_v a, AVX2:: float_v b) { return AVX::cmpgt_ps(a.data(), b.data()); } +Vc_INTRINSIC AVX2:: int_m operator> (AVX2:: int_v a, AVX2:: int_v b) { return AVX::cmpgt_epi32(a.data(), b.data()); } +Vc_INTRINSIC AVX2:: uint_m operator> (AVX2:: uint_v a, AVX2:: uint_v b) { return AVX::cmpgt_epu32(a.data(), b.data()); } +Vc_INTRINSIC AVX2:: short_m operator> (AVX2:: short_v a, AVX2:: short_v b) { return AVX::cmpgt_epi16(a.data(), b.data()); } +Vc_INTRINSIC AVX2::ushort_m operator> (AVX2::ushort_v a, AVX2::ushort_v b) { return AVX::cmpgt_epu16(a.data(), b.data()); } + +Vc_INTRINSIC AVX2::double_m operator< (AVX2::double_v a, AVX2::double_v b) { return AVX::cmplt_pd(a.data(), b.data()); } +Vc_INTRINSIC AVX2:: float_m operator< (AVX2:: float_v a, AVX2:: float_v b) { return AVX::cmplt_ps(a.data(), b.data()); } +Vc_INTRINSIC AVX2:: int_m operator< (AVX2:: int_v a, AVX2:: int_v b) { return AVX::cmplt_epi32(a.data(), b.data()); } +Vc_INTRINSIC AVX2:: uint_m operator< (AVX2:: uint_v a, AVX2:: uint_v b) { return AVX::cmplt_epu32(a.data(), b.data()); } +Vc_INTRINSIC AVX2:: short_m operator< (AVX2:: short_v a, AVX2:: short_v b) { return AVX::cmplt_epi16(a.data(), b.data()); } +Vc_INTRINSIC AVX2::ushort_m operator< (AVX2::ushort_v a, AVX2::ushort_v b) { return AVX::cmplt_epu16(a.data(), b.data()); } + +// bitwise operators {{{1 +template +Vc_INTRINSIC AVX2::Vector operator^(AVX2::Vector a, AVX2::Vector b) +{ + return xor_(a.data(), b.data()); +} +template +Vc_INTRINSIC AVX2::Vector operator&(AVX2::Vector a, AVX2::Vector b) +{ + return and_(a.data(), b.data()); +} +template +Vc_INTRINSIC AVX2::Vector operator|(AVX2::Vector a, AVX2::Vector b) +{ + return or_(a.data(), b.data()); +} +// }}}1 +// arithmetic operators {{{1 +template +Vc_INTRINSIC AVX2::Vector operator+(AVX2::Vector a, AVX2::Vector b) +{ + return add(a.data(), b.data(), T()); +} +template +Vc_INTRINSIC AVX2::Vector operator-(AVX2::Vector a, AVX2::Vector b) +{ + return sub(a.data(), b.data(), T()); +} +template +Vc_INTRINSIC AVX2::Vector operator*(AVX2::Vector a, AVX2::Vector b) +{ + return mul(a.data(), b.data(), T()); +} +template +Vc_INTRINSIC AVX2::Vector operator/(AVX2::Vector a, AVX2::Vector b) +{ + return div(a.data(), b.data(), T()); +} +Vc_INTRINSIC AVX2::Vector operator/(AVX2::Vector a, + AVX2::Vector b) +{ + using namespace AVX; + const __m256 lo = _mm256_div_ps(convert(lo128(a.data())), + convert(lo128(b.data()))); + const __m256 hi = _mm256_div_ps(convert(hi128(a.data())), + convert(hi128(b.data()))); + const float_v threshold = 32767.f; + using Detail::operator>; + const __m128i loShort = (Vc_IS_UNLIKELY((float_v(lo) > threshold).isNotEmpty())) + ? convert(lo) + : convert(lo); + const __m128i hiShort = (Vc_IS_UNLIKELY((float_v(hi) > threshold).isNotEmpty())) + ? 
convert(hi) + : convert(hi); + return concat(loShort, hiShort); +} +template +Vc_INTRINSIC enable_if::value, AVX2::Vector> operator%( + AVX2::Vector a, AVX2::Vector b) +{ + return a - a / b * b; +} +// }}}1 +} // namespace Detail /////////////////////////////////////////////////////////////////////////////////////////// -// constants {{{1 -template Vc_ALWAYS_INLINE Vector::Vector(VectorSpecialInitializerZero::ZEnum) : d(HT::zero()) {} -template Vc_ALWAYS_INLINE Vector::Vector(VectorSpecialInitializerOne::OEnum) : d(HT::one()) {} -template Vc_ALWAYS_INLINE Vector::Vector(VectorSpecialInitializerIndexesFromZero::IEnum) - : d(HV::load(IndexesFromZeroData::address(), Aligned)) {} - -template Vc_INTRINSIC Vector Vc_CONST Vector::Zero() { return HT::zero(); } -template Vc_INTRINSIC Vector Vc_CONST Vector::One() { return HT::one(); } -template Vc_INTRINSIC Vector Vc_CONST Vector::IndexesFromZero() { return HV::load(IndexesFromZeroData::address(), Aligned); } - -template template Vc_ALWAYS_INLINE Vector::Vector(VC_ALIGNED_PARAMETER(Vector) x) - : d(StaticCastHelper::cast(x.data())) {} - -template Vc_ALWAYS_INLINE Vector::Vector(EntryType x) : d(HT::set(x)) {} -template<> Vc_ALWAYS_INLINE Vector::Vector(EntryType x) : d(_mm256_set1_pd(x)) {} +// generate {{{1 +template <> template Vc_INTRINSIC AVX2::double_v AVX2::double_v::generate(G gen) +{ + const auto tmp0 = gen(0); + const auto tmp1 = gen(1); + const auto tmp2 = gen(2); + const auto tmp3 = gen(3); + return _mm256_setr_pd(tmp0, tmp1, tmp2, tmp3); +} +template <> template Vc_INTRINSIC AVX2::float_v AVX2::float_v::generate(G gen) +{ + const auto tmp0 = gen(0); + const auto tmp1 = gen(1); + const auto tmp2 = gen(2); + const auto tmp3 = gen(3); + const auto tmp4 = gen(4); + const auto tmp5 = gen(5); + const auto tmp6 = gen(6); + const auto tmp7 = gen(7); + return _mm256_setr_ps(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7); +} +#ifdef Vc_IMPL_AVX2 +template <> template Vc_INTRINSIC AVX2::int_v AVX2::int_v::generate(G gen) +{ + const auto tmp0 = gen(0); + const auto tmp1 = gen(1); + const auto tmp2 = gen(2); + const auto tmp3 = gen(3); + const auto tmp4 = gen(4); + const auto tmp5 = gen(5); + const auto tmp6 = gen(6); + const auto tmp7 = gen(7); + return _mm256_setr_epi32(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7); +} +template <> template Vc_INTRINSIC AVX2::uint_v AVX2::uint_v::generate(G gen) +{ + const auto tmp0 = gen(0); + const auto tmp1 = gen(1); + const auto tmp2 = gen(2); + const auto tmp3 = gen(3); + const auto tmp4 = gen(4); + const auto tmp5 = gen(5); + const auto tmp6 = gen(6); + const auto tmp7 = gen(7); + return _mm256_setr_epi32(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7); +} +template <> template Vc_INTRINSIC AVX2::short_v AVX2::short_v::generate(G gen) +{ + const auto tmp0 = gen(0); + const auto tmp1 = gen(1); + const auto tmp2 = gen(2); + const auto tmp3 = gen(3); + const auto tmp4 = gen(4); + const auto tmp5 = gen(5); + const auto tmp6 = gen(6); + const auto tmp7 = gen(7); + const auto tmp8 = gen(8); + const auto tmp9 = gen(9); + const auto tmp10 = gen(10); + const auto tmp11 = gen(11); + const auto tmp12 = gen(12); + const auto tmp13 = gen(13); + const auto tmp14 = gen(14); + const auto tmp15 = gen(15); + return _mm256_setr_epi16(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15); +} +template <> template Vc_INTRINSIC AVX2::ushort_v AVX2::ushort_v::generate(G gen) +{ + const auto tmp0 = gen(0); + const auto tmp1 = gen(1); + const auto tmp2 = gen(2); + const auto tmp3 = 
gen(3); + const auto tmp4 = gen(4); + const auto tmp5 = gen(5); + const auto tmp6 = gen(6); + const auto tmp7 = gen(7); + const auto tmp8 = gen(8); + const auto tmp9 = gen(9); + const auto tmp10 = gen(10); + const auto tmp11 = gen(11); + const auto tmp12 = gen(12); + const auto tmp13 = gen(13); + const auto tmp14 = gen(14); + const auto tmp15 = gen(15); + return _mm256_setr_epi16(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15); +} +#endif +// constants {{{1 +template Vc_INTRINSIC Vector::Vector(VectorSpecialInitializerZero) : d{} {} -/////////////////////////////////////////////////////////////////////////////////////////// -// load ctors {{{1 -template Vc_ALWAYS_INLINE Vector::Vector(const EntryType *x) { load(x); } -template template Vc_ALWAYS_INLINE Vector::Vector(const EntryType *x, A a) { load(x, a); } -template template Vc_ALWAYS_INLINE Vector::Vector(const OtherT *x) { load(x); } -template template Vc_ALWAYS_INLINE Vector::Vector(const OtherT *x, A a) { load(x, a); } +template <> Vc_INTRINSIC Vector::Vector(VectorSpecialInitializerOne) : d(AVX::setone_pd()) {} +template <> Vc_INTRINSIC Vector< float, VectorAbi::Avx>::Vector(VectorSpecialInitializerOne) : d(AVX::setone_ps()) {} +#ifdef Vc_IMPL_AVX2 +template <> Vc_INTRINSIC Vector< int, VectorAbi::Avx>::Vector(VectorSpecialInitializerOne) : d(AVX::setone_epi32()) {} +template <> Vc_INTRINSIC Vector< uint, VectorAbi::Avx>::Vector(VectorSpecialInitializerOne) : d(AVX::setone_epu32()) {} +template <> Vc_INTRINSIC Vector< short, VectorAbi::Avx>::Vector(VectorSpecialInitializerOne) : d(AVX::setone_epi16()) {} +template <> Vc_INTRINSIC Vector::Vector(VectorSpecialInitializerOne) : d(AVX::setone_epu16()) {} +template <> Vc_INTRINSIC Vector< schar, VectorAbi::Avx>::Vector(VectorSpecialInitializerOne) : d(AVX::setone_epi8()) {} +template <> Vc_INTRINSIC Vector< uchar, VectorAbi::Avx>::Vector(VectorSpecialInitializerOne) : d(AVX::setone_epu8()) {} +#endif -/////////////////////////////////////////////////////////////////////////////////////////// -// load member functions {{{1 -template Vc_INTRINSIC void Vector::load(const EntryType *mem) +template +Vc_ALWAYS_INLINE Vector::Vector( + VectorSpecialInitializerIndexesFromZero) + : Vector(AVX::IndexesFromZeroData::address(), Vc::Aligned) { - load(mem, Aligned); } -template template Vc_INTRINSIC void Vector::load(const EntryType *mem, A align) +template <> +Vc_ALWAYS_INLINE Vector::Vector(VectorSpecialInitializerIndexesFromZero) + : Vector(AVX::IndexesFromZeroData::address(), Vc::Aligned) { - d.v() = HV::load(mem, align); } - -template template Vc_INTRINSIC void Vector::load(const OtherT *mem) +template <> +Vc_ALWAYS_INLINE Vector::Vector(VectorSpecialInitializerIndexesFromZero) + : Vector(AVX::IndexesFromZeroData::address(), Vc::Aligned) { - load(mem, Aligned); } -// LoadHelper {{{2 -template struct LoadHelper; - -// float {{{2 -template struct LoadHelper { - static m256 load(const double *mem, Flags f) - { - return concat(_mm256_cvtpd_ps(VectorHelper::load(&mem[0], f)), - _mm256_cvtpd_ps(VectorHelper::load(&mem[4], f))); - } -}; -template struct LoadHelper { - static m256 load(const unsigned int *mem, Flags f) - { - return StaticCastHelper::cast(VectorHelper::load(mem, f)); - } -}; -template struct LoadHelper { - static m256 load(const int *mem, Flags f) - { - return StaticCastHelper::cast(VectorHelper::load(mem, f)); - } -}; -template struct LoadHelper { - static m256 load(const unsigned short *mem, Flags f) - { - return 
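The special-initializer constructors above give Vc::Zero, Vc::One and the IndexesFromZero table their meaning. A short usage sketch, assuming a Vc 1.x installation; the stream output comes from Vc's IO header:

#include <Vc/Vc>
#include <Vc/IO>
#include <iostream>

int main()
{
    Vc::float_v zero(Vc::Zero);                         // every lane 0.0f
    Vc::float_v one(Vc::One);                           // every lane 1.0f
    Vc::float_v iota = Vc::float_v::IndexesFromZero();  // 0, 1, ..., float_v::Size - 1
    std::cout << zero << '\n' << one << '\n' << iota << '\n';
}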
StaticCastHelper::cast(VectorHelper::load(mem, f)); - } -}; -template struct LoadHelper { - static m256 load(const short *mem, Flags f) - { - return StaticCastHelper::cast(VectorHelper::load(mem, f)); - } -}; -template struct LoadHelper { - static m256 load(const unsigned char *mem, Flags f) - { - return StaticCastHelper::cast(LoadHelper::load(mem, f)); - } -}; -template struct LoadHelper { - static m256 load(const signed char *mem, Flags f) - { - return StaticCastHelper::cast(LoadHelper::load(mem, f)); - } -}; - -template struct LoadHelper : public LoadHelper {}; - -// int {{{2 -template struct LoadHelper { - static m256i load(const unsigned int *mem, Flags f) - { - return VectorHelper::load(mem, f); - } -}; -template struct LoadHelper { - static m256i load(const unsigned short *mem, Flags f) - { - return StaticCastHelper::cast(VectorHelper::load(mem, f)); - } -}; -template struct LoadHelper { - static m256i load(const short *mem, Flags f) - { - return StaticCastHelper::cast(VectorHelper::load(mem, f)); - } -}; -template struct LoadHelper { - static m256i load(const unsigned char *mem, Flags) - { - // the only available streaming load loads 16 bytes - twice as much as we need => can't use - // it, or we risk an out-of-bounds read and an unaligned load exception - const m128i epu8 = _mm_loadl_epi64(reinterpret_cast(mem)); - const m128i epu16 = _mm_cvtepu8_epi16(epu8); - return StaticCastHelper::cast(epu16); - } -}; -template struct LoadHelper { - static m256i load(const signed char *mem, Flags) - { - // the only available streaming load loads 16 bytes - twice as much as we need => can't use - // it, or we risk an out-of-bounds read and an unaligned load exception - const m128i epi8 = _mm_loadl_epi64(reinterpret_cast(mem)); - const m128i epi16 = _mm_cvtepi8_epi16(epi8); - return StaticCastHelper::cast(epi16); - } -}; - -// unsigned int {{{2 -template struct LoadHelper { - static m256i load(const unsigned short *mem, Flags f) - { - return StaticCastHelper::cast(VectorHelper::load(mem, f)); - } -}; -template struct LoadHelper { - static m256i load(const unsigned char *mem, Flags) - { - // the only available streaming load loads 16 bytes - twice as much as we need => can't use - // it, or we risk an out-of-bounds read and an unaligned load exception - const m128i epu8 = _mm_loadl_epi64(reinterpret_cast(mem)); - const m128i epu16 = _mm_cvtepu8_epi16(epu8); - return StaticCastHelper::cast(epu16); - } -}; - -// short {{{2 -template struct LoadHelper { - static m128i load(const unsigned short *mem, Flags f) - { - return StaticCastHelper::cast(VectorHelper::load(mem, f)); - } -}; -template struct LoadHelper { - static m128i load(const unsigned char *mem, Flags) - { - // the only available streaming load loads 16 bytes - twice as much as we need => can't use - // it, or we risk an out-of-bounds read and an unaligned load exception - const m128i epu8 = _mm_loadl_epi64(reinterpret_cast(mem)); - return _mm_cvtepu8_epi16(epu8); - } -}; -template struct LoadHelper { - static m128i load(const signed char *mem, Flags) - { - // the only available streaming load loads 16 bytes - twice as much as we need => can't use - // it, or we risk an out-of-bounds read and an unaligned load exception - const m128i epi8 = _mm_loadl_epi64(reinterpret_cast(mem)); - return _mm_cvtepi8_epi16(epi8); - } -}; - -// unsigned short {{{2 -template struct LoadHelper { - static m128i load(const unsigned char *mem, Flags) - { - // the only available streaming load loads 16 bytes - twice as much as we need => can't use - // it, or we 
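The repeated comment about the 16-byte load explains why these helpers widen from an 8-byte _mm_loadl_epi64 instead: reading a full 16 bytes when only 8 source bytes exist risks an out-of-bounds access. A standalone SSE4.1 sketch of that widening load, outside of Vc's helper classes:

#include <smmintrin.h>   // SSE4.1: _mm_cvtepu8_epi16
#include <cstdint>
#include <cstdio>

// Widen 8 unsigned bytes to 8 unsigned 16-bit lanes without reading
// past the end of the 8-byte source buffer.
static __m128i load8_epu8_to_epu16(const uint8_t *mem)
{
    const __m128i epu8 = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(mem)); // reads 8 bytes only
    return _mm_cvtepu8_epi16(epu8);                                               // zero-extend to 16 bit
}

int main()
{
    const uint8_t src[8] = {0, 1, 2, 250, 251, 252, 253, 255};
    alignas(16) uint16_t dst[8];
    _mm_store_si128(reinterpret_cast<__m128i *>(dst), load8_epu8_to_epu16(src));
    for (int i = 0; i < 8; ++i) std::printf("%u ", dst[i]);  // 0 1 2 250 251 252 253 255
}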
risk an out-of-bounds read and an unaligned load exception - const m128i epu8 = _mm_loadl_epi64(reinterpret_cast(mem)); - return _mm_cvtepu8_epi16(epu8); - } -}; - +/////////////////////////////////////////////////////////////////////////////////////////// +// load member functions {{{1 // general load, implemented via LoadHelper {{{2 -template template Vc_INTRINSIC void Vector::load(const SrcT *x, Flags f) +template +template +Vc_INTRINSIC typename Vector:: +#ifndef Vc_MSVC +template +#endif +load_concept::type Vector::load(const SrcT *mem, Flags flags) { - d.v() = LoadHelper::load(x, f); + Common::handleLoadPrefetches(mem, flags); + d.v() = Detail::load(mem, flags); } /////////////////////////////////////////////////////////////////////////////////////////// // zeroing {{{1 -template Vc_INTRINSIC void Vector::setZero() +template Vc_INTRINSIC void Vector::setZero() { - data() = HV::zero(); + data() = Detail::zero(); } -template Vc_INTRINSIC void Vector::setZero(const Mask &k) +template Vc_INTRINSIC void Vector::setZero(const Mask &k) { - data() = HV::andnot_(avx_cast(k.data()), data()); + data() = Detail::andnot_(AVX::avx_cast(k.data()), data()); } - -template<> Vc_INTRINSIC void Vector::setQnan() +template Vc_INTRINSIC void Vector::setZeroInverted(const Mask &k) { - data() = _mm256_setallone_pd(); + data() = Detail::and_(AVX::avx_cast(k.data()), data()); } -template<> Vc_INTRINSIC void Vector::setQnan(MaskArg k) -{ - data() = _mm256_or_pd(data(), k.dataD()); -} -template<> Vc_INTRINSIC void Vector::setQnan() + +template<> Vc_INTRINSIC void Vector::setQnan() { - data() = _mm256_setallone_ps(); + data() = Detail::allone(); } -template<> Vc_INTRINSIC void Vector::setQnan(MaskArg k) +template<> Vc_INTRINSIC void Vector::setQnan(MaskArgument k) { - data() = _mm256_or_ps(data(), k.data()); + data() = _mm256_or_pd(data(), k.dataD()); } -template<> Vc_INTRINSIC void Vector::setQnan() +template<> Vc_INTRINSIC void Vector::setQnan() { - data() = _mm256_setallone_ps(); + data() = Detail::allone(); } -template<> Vc_INTRINSIC void Vector::setQnan(MaskArg k) +template<> Vc_INTRINSIC void Vector::setQnan(MaskArgument k) { data() = _mm256_or_ps(data(), k.data()); } /////////////////////////////////////////////////////////////////////////////////////////// // stores {{{1 -template Vc_INTRINSIC void Vector::store(EntryType *mem) const -{ - HV::store(mem, data(), Aligned); -} -template Vc_INTRINSIC void Vector::store(EntryType *mem, const Mask &mask) const -{ - HV::store(mem, data(), avx_cast(mask.data()), Aligned); -} -template template Vc_INTRINSIC void Vector::store(EntryType *mem, A align) const -{ - HV::store(mem, data(), align); -} -template template Vc_INTRINSIC void Vector::store(EntryType *mem, const Mask &mask, A align) const +template +template +Vc_INTRINSIC void Vector::store(U *mem, Flags flags) const +{ + Common::handleStorePrefetches(mem, flags); + HV::template store(mem, data()); +} + +template +template +Vc_INTRINSIC void Vector::store(U *mem, Mask mask, Flags flags) const { - HV::store(mem, data(), avx_cast(mask.data()), align); + Common::handleStorePrefetches(mem, flags); + HV::template store(mem, data(), AVX::avx_cast(mask.data())); } /////////////////////////////////////////////////////////////////////////////////////////// -// expand/merge 1 float_v <=> 2 double_v XXX rationale? remove it for release? 
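setZero(Mask), setZeroInverted(Mask) and the masked store above all implement the same per-lane selection. A plain C++ sketch of those semantics; std::array stands in for the vector and mask types:

#include <array>
#include <cstdio>

constexpr int N = 8;
using vec  = std::array<float, N>;
using mask = std::array<bool, N>;

// setZero(k): zero exactly the lanes where k is true (the andnot_(k, data) above).
void setZero(vec &v, const mask &k) {
    for (int i = 0; i < N; ++i) if (k[i]) v[i] = 0.0f;
}

// store(mem, k): write only the lanes where k is true, leave the rest of mem untouched.
void maskedStore(float *mem, const vec &v, const mask &k) {
    for (int i = 0; i < N; ++i) if (k[i]) mem[i] = v[i];
}

int main() {
    vec v{1, 2, 3, 4, 5, 6, 7, 8};
    mask k{true, false, true, false, true, false, true, false};
    float out[N] = {-1, -1, -1, -1, -1, -1, -1, -1};
    maskedStore(out, v, k);
    setZero(v, k);
    for (int i = 0; i < N; ++i) std::printf("%g/%g ", v[i], out[i]);  // 0/1 2/-1 0/3 4/-1 ...
}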
XXX {{{1 -template Vc_ALWAYS_INLINE Vc_FLATTEN Vector::Vector(const Vector *a) - : d(a[0]) -{ +// integer ops {{{1 +#ifdef Vc_IMPL_AVX2 +template <> Vc_ALWAYS_INLINE AVX2::Vector< int> Vector< int, VectorAbi::Avx>::operator<<(AsArg x) const { return _mm256_sllv_epi32(d.v(), x.d.v()); } +template <> Vc_ALWAYS_INLINE AVX2::Vector< uint> Vector< uint, VectorAbi::Avx>::operator<<(AsArg x) const { return _mm256_sllv_epi32(d.v(), x.d.v()); } +template <> Vc_ALWAYS_INLINE AVX2::Vector< int> Vector< int, VectorAbi::Avx>::operator>>(AsArg x) const { return _mm256_srav_epi32(d.v(), x.d.v()); } +template <> Vc_ALWAYS_INLINE AVX2::Vector< uint> Vector< uint, VectorAbi::Avx>::operator>>(AsArg x) const { return _mm256_srlv_epi32(d.v(), x.d.v()); } +template <> Vc_ALWAYS_INLINE AVX2::Vector< short> Vector< short, VectorAbi::Avx>::operator<<(AsArg x) const { return generate([&](int i) { return get(*this, i) << get(x, i); }); } +template <> Vc_ALWAYS_INLINE AVX2::Vector Vector::operator<<(AsArg x) const { return generate([&](int i) { return get(*this, i) << get(x, i); }); } +template <> Vc_ALWAYS_INLINE AVX2::Vector< short> Vector< short, VectorAbi::Avx>::operator>>(AsArg x) const { return generate([&](int i) { return get(*this, i) >> get(x, i); }); } +template <> Vc_ALWAYS_INLINE AVX2::Vector Vector::operator>>(AsArg x) const { return generate([&](int i) { return get(*this, i) >> get(x, i); }); } +template +Vc_ALWAYS_INLINE AVX2::Vector &Vector::operator<<=(AsArg x) +{ + static_assert(std::is_integral::value, + "bitwise-operators can only be used with Vectors of integral type"); + return *this = *this << x; +} +template +Vc_ALWAYS_INLINE AVX2::Vector &Vector::operator>>=(AsArg x) +{ + static_assert(std::is_integral::value, + "bitwise-operators can only be used with Vectors of integral type"); + return *this = *this >> x; } -template<> Vc_ALWAYS_INLINE Vc_FLATTEN Vector::Vector(const Vector *a) - : d(concat(_mm256_cvtpd_ps(a[0].data()), _mm256_cvtpd_ps(a[1].data()))) -{ +#endif + +template Vc_ALWAYS_INLINE AVX2::Vector &Vector::operator>>=(int shift) { + d.v() = Detail::shiftRight(d.v(), shift, T()); + return *static_cast *>(this); } -template<> Vc_ALWAYS_INLINE Vc_FLATTEN Vector::Vector(const Vector *a) - : d(_mm_packs_epi32(lo128(a->data()), hi128(a->data()))) -{ +template Vc_ALWAYS_INLINE Vc_PURE AVX2::Vector Vector::operator>>(int shift) const { + return Detail::shiftRight(d.v(), shift, T()); } -template<> Vc_ALWAYS_INLINE Vc_FLATTEN Vector::Vector(const Vector *a) - : d(_mm_packus_epi32(lo128(a->data()), hi128(a->data()))) -{ +template Vc_ALWAYS_INLINE AVX2::Vector &Vector::operator<<=(int shift) { + d.v() = Detail::shiftLeft(d.v(), shift, T()); + return *static_cast *>(this); } -template Vc_ALWAYS_INLINE void Vc_FLATTEN Vector::expand(Vector *x) const -{ - x[0] = *this; +template Vc_ALWAYS_INLINE Vc_PURE AVX2::Vector Vector::operator<<(int shift) const { + return Detail::shiftLeft(d.v(), shift, T()); } -template<> Vc_ALWAYS_INLINE void Vc_FLATTEN Vector::expand(Vector *x) const + +// isnegative {{{1 +Vc_INTRINSIC Vc_CONST AVX2::float_m isnegative(AVX2::float_v x) { - x[0].data() = _mm256_cvtps_pd(lo128(d.v())); - x[1].data() = _mm256_cvtps_pd(hi128(d.v())); + return AVX::avx_cast<__m256>(AVX::srai_epi32<31>( + AVX::avx_cast<__m256i>(_mm256_and_ps(AVX::setsignmask_ps(), x.data())))); } -template<> Vc_ALWAYS_INLINE void Vc_FLATTEN Vector::expand(Vector *x) const +Vc_INTRINSIC Vc_CONST AVX2::double_m isnegative(AVX2::double_v x) { - x[0].data() = concat(_mm_cvtepi16_epi32(d.v()), - 
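isnegative() isolates the IEEE 754 sign bit and broadcasts it across the lane with an arithmetic right shift by 31, so -0.0f counts as negative. A scalar sketch of the same bit trick:

#include <cstdint>
#include <cstring>
#include <cstdio>

// Per-lane equivalent of the srai_epi32<31>(x & signmask) trick above:
// all-ones if the sign bit of f is set, all-zeros otherwise.
static uint32_t isnegative_bits(float f)
{
    uint32_t bits;
    std::memcpy(&bits, &f, sizeof bits);       // reinterpret the float's bits
    const uint32_t sign = bits & 0x80000000u;  // keep only the sign bit
    // arithmetic shift (as on the usual targets): 0 stays 0, the sign bit becomes ~0
    return static_cast<uint32_t>(static_cast<int32_t>(sign) >> 31);
}

int main()
{
    const float values[] = {1.5f, -1.5f, 0.0f, -0.0f};
    for (float v : values)
        std::printf("%g -> %s\n", v, isnegative_bits(v) ? "negative" : "non-negative");
    // -0.0f reports negative, exactly like the sign-bit based vector version.
}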
_mm_cvtepi16_epi32(_mm_unpackhi_epi64(d.v(), d.v()))); + return Mem::permute(AVX::avx_cast<__m256>(AVX::srai_epi32<31>( + AVX::avx_cast<__m256i>(_mm256_and_pd(AVX::setsignmask_pd(), x.data()))))); } -template<> Vc_ALWAYS_INLINE void Vc_FLATTEN Vector::expand(Vector *x) const +// gathers {{{1 +template <> +template +inline void AVX2::double_v::gatherImplementation(const MT *mem, IT &&indexes) { - x[0].data() = concat(_mm_cvtepu16_epi32(d.v()), - _mm_cvtepu16_epi32(_mm_unpackhi_epi64(d.v(), d.v()))); + d.v() = _mm256_setr_pd(mem[indexes[0]], mem[indexes[1]], mem[indexes[2]], mem[indexes[3]]); } -/////////////////////////////////////////////////////////////////////////////////////////// -// swizzles {{{1 -template Vc_INTRINSIC const Vector Vc_PURE &Vector::abcd() const { return *this; } -template Vc_INTRINSIC const Vector Vc_PURE Vector::cdab() const { return Mem::permute(data()); } -template Vc_INTRINSIC const Vector Vc_PURE Vector::badc() const { return Mem::permute(data()); } -template Vc_INTRINSIC const Vector Vc_PURE Vector::aaaa() const { return Mem::permute(data()); } -template Vc_INTRINSIC const Vector Vc_PURE Vector::bbbb() const { return Mem::permute(data()); } -template Vc_INTRINSIC const Vector Vc_PURE Vector::cccc() const { return Mem::permute(data()); } -template Vc_INTRINSIC const Vector Vc_PURE Vector::dddd() const { return Mem::permute(data()); } -template Vc_INTRINSIC const Vector Vc_PURE Vector::bcad() const { return Mem::permute(data()); } -template Vc_INTRINSIC const Vector Vc_PURE Vector::bcda() const { return Mem::permute(data()); } -template Vc_INTRINSIC const Vector Vc_PURE Vector::dabc() const { return Mem::permute(data()); } -template Vc_INTRINSIC const Vector Vc_PURE Vector::acbd() const { return Mem::permute(data()); } -template Vc_INTRINSIC const Vector Vc_PURE Vector::dbca() const { return Mem::permute(data()); } -template Vc_INTRINSIC const Vector Vc_PURE Vector::dcba() const { return Mem::permute(data()); } - -template<> Vc_INTRINSIC const double_v Vc_PURE Vector::cdab() const { return Mem::shuffle128(data(), data()); } -template<> Vc_INTRINSIC const double_v Vc_PURE Vector::badc() const { return Mem::permute(data()); } -template<> Vc_INTRINSIC const double_v Vc_PURE Vector::aaaa() const { const double &tmp = d.m(0); return _mm256_broadcast_sd(&tmp); } -template<> Vc_INTRINSIC const double_v Vc_PURE Vector::bbbb() const { const double &tmp = d.m(1); return _mm256_broadcast_sd(&tmp); } -template<> Vc_INTRINSIC const double_v Vc_PURE Vector::cccc() const { const double &tmp = d.m(2); return _mm256_broadcast_sd(&tmp); } -template<> Vc_INTRINSIC const double_v Vc_PURE Vector::dddd() const { const double &tmp = d.m(3); return _mm256_broadcast_sd(&tmp); } -template<> Vc_INTRINSIC const double_v Vc_PURE Vector::bcad() const { return Mem::shuffle(Mem::shuffle128(data(), data()), Mem::shuffle128(data(), data())); } -template<> Vc_INTRINSIC const double_v Vc_PURE Vector::bcda() const { return Mem::shuffle(data(), Mem::shuffle128(data(), data())); } -template<> Vc_INTRINSIC const double_v Vc_PURE Vector::dabc() const { return Mem::shuffle(Mem::shuffle128(data(), data()), data()); } -template<> Vc_INTRINSIC const double_v Vc_PURE Vector::acbd() const { return Mem::shuffle(Mem::shuffle128(data(), data()), Mem::shuffle128(data(), data())); } -template<> Vc_INTRINSIC const double_v Vc_PURE Vector::dbca() const { return Mem::shuffle(Mem::shuffle128(data(), data()), Mem::shuffle128(data(), data())); } -template<> Vc_INTRINSIC const double_v Vc_PURE Vector::dcba() const { 
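The unmasked gatherImplementation specializations unroll mem[indexes[i]] for every lane and pack the results with _mm256_setr_*. A scalar sketch of the contract they implement; gather() here is an illustrative helper, not Vc's API:

#include <array>
#include <cstddef>
#include <cstdio>

// Generic gather contract: out[i] = mem[indexes[i]] for every lane.
template <typename T, typename IT, std::size_t N>
std::array<T, N> gather(const T *mem, const std::array<IT, N> &indexes)
{
    std::array<T, N> out;
    for (std::size_t i = 0; i < N; ++i) out[i] = mem[indexes[i]];
    return out;
}

int main()
{
    const float table[10] = {0, 10, 20, 30, 40, 50, 60, 70, 80, 90};
    const std::array<int, 8> idx = {9, 7, 5, 3, 1, 0, 2, 4};
    for (float x : gather(table, idx)) std::printf("%g ", x);  // 90 70 50 30 10 0 20 40
}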
return cdab().badc(); } - -#define VC_SWIZZLES_16BIT_IMPL(T) \ -template<> Vc_INTRINSIC const Vector Vc_PURE Vector::cdab() const { return Mem::permute(data()); } \ -template<> Vc_INTRINSIC const Vector Vc_PURE Vector::badc() const { return Mem::permute(data()); } \ -template<> Vc_INTRINSIC const Vector Vc_PURE Vector::aaaa() const { return Mem::permute(data()); } \ -template<> Vc_INTRINSIC const Vector Vc_PURE Vector::bbbb() const { return Mem::permute(data()); } \ -template<> Vc_INTRINSIC const Vector Vc_PURE Vector::cccc() const { return Mem::permute(data()); } \ -template<> Vc_INTRINSIC const Vector Vc_PURE Vector::dddd() const { return Mem::permute(data()); } \ -template<> Vc_INTRINSIC const Vector Vc_PURE Vector::bcad() const { return Mem::permute(data()); } \ -template<> Vc_INTRINSIC const Vector Vc_PURE Vector::bcda() const { return Mem::permute(data()); } \ -template<> Vc_INTRINSIC const Vector Vc_PURE Vector::dabc() const { return Mem::permute(data()); } \ -template<> Vc_INTRINSIC const Vector Vc_PURE Vector::acbd() const { return Mem::permute(data()); } \ -template<> Vc_INTRINSIC const Vector Vc_PURE Vector::dbca() const { return Mem::permute(data()); } \ -template<> Vc_INTRINSIC const Vector Vc_PURE Vector::dcba() const { return Mem::permute(data()); } -VC_SWIZZLES_16BIT_IMPL(short) -VC_SWIZZLES_16BIT_IMPL(unsigned short) -#undef VC_SWIZZLES_16BIT_IMPL - -/////////////////////////////////////////////////////////////////////////////////////////// -// division {{{1 -template inline Vector &Vector::operator/=(EntryType x) -{ - if (HasVectorDivision) { - return operator/=(Vector(x)); - } - for_all_vector_entries(i, - d.m(i) /= x; - ); - return *this; +template <> +template +inline void AVX2::float_v::gatherImplementation(const MT *mem, IT &&indexes) +{ + d.v() = _mm256_setr_ps(mem[indexes[0]], + mem[indexes[1]], + mem[indexes[2]], + mem[indexes[3]], + mem[indexes[4]], + mem[indexes[5]], + mem[indexes[6]], + mem[indexes[7]]); +} + +#ifdef Vc_IMPL_AVX2 +template <> +template +inline void AVX2::int_v::gatherImplementation(const MT *mem, IT &&indexes) +{ + d.v() = _mm256_setr_epi32(mem[indexes[0]], mem[indexes[1]], mem[indexes[2]], + mem[indexes[3]], mem[indexes[4]], mem[indexes[5]], + mem[indexes[6]], mem[indexes[7]]); +} + +template <> +template +inline void AVX2::uint_v::gatherImplementation(const MT *mem, IT &&indexes) +{ + d.v() = _mm256_setr_epi32(mem[indexes[0]], mem[indexes[1]], mem[indexes[2]], + mem[indexes[3]], mem[indexes[4]], mem[indexes[5]], + mem[indexes[6]], mem[indexes[7]]); +} + +template <> +template +inline void AVX2::short_v::gatherImplementation(const MT *mem, IT &&indexes) +{ + d.v() = _mm256_setr_epi16(mem[indexes[0]], mem[indexes[1]], mem[indexes[2]], + mem[indexes[3]], mem[indexes[4]], mem[indexes[5]], + mem[indexes[6]], mem[indexes[7]], mem[indexes[8]], + mem[indexes[9]], mem[indexes[10]], mem[indexes[11]], + mem[indexes[12]], mem[indexes[13]], mem[indexes[14]], + mem[indexes[15]]); +} + +template <> +template +inline void AVX2::ushort_v::gatherImplementation(const MT *mem, IT &&indexes) +{ + d.v() = _mm256_setr_epi16(mem[indexes[0]], mem[indexes[1]], mem[indexes[2]], + mem[indexes[3]], mem[indexes[4]], mem[indexes[5]], + mem[indexes[6]], mem[indexes[7]], mem[indexes[8]], + mem[indexes[9]], mem[indexes[10]], mem[indexes[11]], + mem[indexes[12]], mem[indexes[13]], mem[indexes[14]], + mem[indexes[15]]); } -template template inline Vc_PURE VC_EXACT_TYPE(TT, typename DetermineEntryType::Type, Vector) Vector::operator/(TT x) const -{ - if (HasVectorDivision) { - 
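The masked gather/scatter paths above choose between a simple per-lane loop, a bit-scan loop and a popcount-driven switch. A sketch of the bit-scan-loop idea, visiting only the set mask bits via count-trailing-zeros; __builtin_ctz is a GCC/Clang builtin and for_each_set_lane is an illustrative helper, not Vc's Common::executeGather:

#include <cstdint>
#include <cstdio>

// Visit only the lanes whose mask bit is set, in ascending order, by
// repeatedly extracting the lowest set bit: the "BitScanLoop" strategy.
template <typename F>
void for_each_set_lane(uint32_t maskBits, F &&f)
{
    while (maskBits) {
        const int i = __builtin_ctz(maskBits);  // index of lowest set bit
        maskBits &= maskBits - 1;               // clear that bit
        f(i);
    }
}

int main()
{
    const float table[8] = {0, 10, 20, 30, 40, 50, 60, 70};
    const int   idx[8]   = {7, 6, 5, 4, 3, 2, 1, 0};
    float       out[8]   = {};
    const uint32_t mask  = 0b10100101u;         // lanes 0, 2, 5, 7 active
    for_each_set_lane(mask, [&](int i) { out[i] = table[idx[i]]; });
    for (float x : out) std::printf("%g ", x);  // 70 0 50 0 0 20 0 0
}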
return operator/(Vector(x)); - } - Vector r; - for_all_vector_entries(i, - r.d.m(i) = d.m(i) / x; - ); - return r; -} -// per default fall back to scalar division -template inline Vector &Vector::operator/=(const Vector &x) -{ - for_all_vector_entries(i, - d.m(i) /= x.d.m(i); - ); - return *this; +#endif + +template +template +inline void Vector::gatherImplementation(const MT *mem, IT &&indexes, MaskArgument mask) +{ + using Selector = std::integral_constant < Common::GatherScatterImplementation, +#ifdef Vc_USE_SET_GATHERS + Traits::is_simd_vector::value ? Common::GatherScatterImplementation::SetIndexZero : +#endif +#ifdef Vc_USE_BSF_GATHERS + Common::GatherScatterImplementation::BitScanLoop +#elif defined Vc_USE_POPCNT_BSF_GATHERS + Common::GatherScatterImplementation::PopcntSwitch +#else + Common::GatherScatterImplementation::SimpleLoop +#endif + > ; + Common::executeGather(Selector(), *this, mem, std::forward(indexes), mask); } -template inline Vector Vc_PURE Vector::operator/(const Vector &x) const +template +template +inline void Vector::scatterImplementation(MT *mem, IT &&indexes) const { - Vector r; - for_all_vector_entries(i, - r.d.m(i) = d.m(i) / x.d.m(i); - ); - return r; -} -// specialize division on type -static Vc_INTRINSIC m256i Vc_CONST divInt(param256i a, param256i b) { - const m256d lo1 = _mm256_cvtepi32_pd(lo128(a)); - const m256d lo2 = _mm256_cvtepi32_pd(lo128(b)); - const m256d hi1 = _mm256_cvtepi32_pd(hi128(a)); - const m256d hi2 = _mm256_cvtepi32_pd(hi128(b)); - return concat( - _mm256_cvttpd_epi32(_mm256_div_pd(lo1, lo2)), - _mm256_cvttpd_epi32(_mm256_div_pd(hi1, hi2)) - ); + Common::unrolled_loop([&](std::size_t i) { mem[indexes[i]] = d.m(i); }); } -template<> inline Vector &Vector::operator/=(const Vector &x) -{ - d.v() = divInt(d.v(), x.d.v()); - return *this; -} -template<> inline Vector Vc_PURE Vector::operator/(const Vector &x) const + +template +template +inline void Vector::scatterImplementation(MT *mem, IT &&indexes, MaskArgument mask) const { - return divInt(d.v(), x.d.v()); -} -static inline m256i Vc_CONST divUInt(param256i a, param256i b) { - m256d loa = _mm256_cvtepi32_pd(lo128(a)); - m256d hia = _mm256_cvtepi32_pd(hi128(a)); - m256d lob = _mm256_cvtepi32_pd(lo128(b)); - m256d hib = _mm256_cvtepi32_pd(hi128(b)); - // if a >= 2^31 then after conversion to double it will contain a negative number (i.e. a-2^32) - // to get the right number back we have to add 2^32 where a >= 2^31 - loa = _mm256_add_pd(loa, _mm256_and_pd(_mm256_cmp_pd(loa, _mm256_setzero_pd(), _CMP_LT_OS), _mm256_set1_pd(4294967296.))); - hia = _mm256_add_pd(hia, _mm256_and_pd(_mm256_cmp_pd(hia, _mm256_setzero_pd(), _CMP_LT_OS), _mm256_set1_pd(4294967296.))); - // we don't do the same for b because division by b >= 2^31 should be a seldom corner case and - // we rather want the standard stuff fast - // - // there is one remaining problem: a >= 2^31 and b == 1 - // in that case the return value would be 2^31 - return avx_cast(_mm256_blendv_ps(avx_cast(concat( - _mm256_cvttpd_epi32(_mm256_div_pd(loa, lob)), - _mm256_cvttpd_epi32(_mm256_div_pd(hia, hib)) - )), avx_cast(a), avx_cast(concat( - _mm_cmpeq_epi32(lo128(b), _mm_setone_epi32()), - _mm_cmpeq_epi32(hi128(b), _mm_setone_epi32()))))); + using Selector = std::integral_constant < Common::GatherScatterImplementation, +#ifdef Vc_USE_SET_GATHERS + Traits::is_simd_vector::value ? 
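The old divUInt comment notes that the epi32-to-double conversion is signed, so an unsigned operand of 2^31 or more converts to value - 2^32 and needs 2^32 added back. A scalar sketch of that correction; the cast through int32_t models what the conversion instruction does on the usual targets:

#include <cstdint>
#include <cstdio>

// Scalar model of the divUInt correction: the signed int32 -> double
// conversion turns an unsigned value >= 2^31 into (value - 2^32);
// adding 2^32 back where the double is negative recovers the operand.
static double to_double_as_unsigned(uint32_t a)
{
    double d = static_cast<double>(static_cast<int32_t>(a));  // what cvtepi32_pd does
    if (d < 0.0) d += 4294967296.0;                           // + 2^32 correction
    return d;
}

int main()
{
    const uint32_t a = 0xF0000000u;  // 4026531840, i.e. >= 2^31
    const uint32_t b = 3;
    const double   q = to_double_as_unsigned(a) / to_double_as_unsigned(b);
    std::printf("%u / %u = %u\n", a, b, static_cast<uint32_t>(q));  // 1342177280
}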
Common::GatherScatterImplementation::SetIndexZero : +#endif +#ifdef Vc_USE_BSF_GATHERS + Common::GatherScatterImplementation::BitScanLoop +#elif defined Vc_USE_POPCNT_BSF_GATHERS + Common::GatherScatterImplementation::PopcntSwitch +#else + Common::GatherScatterImplementation::SimpleLoop +#endif + > ; + Common::executeScatter(Selector(), *this, mem, std::forward(indexes), mask); } -template<> Vc_ALWAYS_INLINE Vector &Vector::operator/=(const Vector &x) + +/////////////////////////////////////////////////////////////////////////////////////////// +// operator- {{{1 +#ifdef Vc_USE_BUILTIN_VECTOR_TYPES +template Vc_ALWAYS_INLINE Vc_PURE AVX2::Vector Vector::operator-() const { - d.v() = divUInt(d.v(), x.d.v()); - return *this; + return VectorType(-d.builtin()); } -template<> Vc_ALWAYS_INLINE Vector Vc_PURE Vector::operator/(const Vector &x) const +#else +template Vc_ALWAYS_INLINE Vc_PURE AVX2::Vector Vector::operator-() const { - return divUInt(d.v(), x.d.v()); + return Detail::negate(d.v(), std::integral_constant()); } -template static inline m128i Vc_CONST divShort(param128i a, param128i b) +#endif + +/////////////////////////////////////////////////////////////////////////////////////////// +// horizontal ops {{{1 +template +Vc_INTRINSIC std::pair, int> +Vector::minIndex() const { - const m256 r = _mm256_div_ps(StaticCastHelper::cast(a), - StaticCastHelper::cast(b)); - return StaticCastHelper::cast(r); + AVX2::Vector x = min(); + return std::make_pair(x, (*this == x).firstOne()); } -template<> Vc_ALWAYS_INLINE Vector &Vector::operator/=(const Vector &x) +template +Vc_INTRINSIC std::pair, int> +Vector::maxIndex() const { - d.v() = divShort(d.v(), x.d.v()); - return *this; + AVX2::Vector x = max(); + return std::make_pair(x, (*this == x).firstOne()); } -template<> Vc_ALWAYS_INLINE Vector Vc_PURE Vector::operator/(const Vector &x) const +template <> Vc_INTRINSIC std::pair AVX2::float_v::minIndex() const { - return divShort(d.v(), x.d.v()); + /* + // 28 cycles latency: + __m256 x = _mm256_min_ps(Mem::permute128(d.v()), d.v()); + x = _mm256_min_ps(x, Reg::permute(x)); + AVX2::float_v xx = _mm256_min_ps(x, Reg::permute(x)); + AVX2::uint_v idx = AVX2::uint_v::IndexesFromZero(); + idx = _mm256_castps_si256( + _mm256_or_ps((*this != xx).data(), _mm256_castsi256_ps(idx.data()))); + return std::make_pair(xx, (*this == xx).firstOne()); + + __m128 loData = AVX::lo128(d.v()); + __m128 hiData = AVX::hi128(d.v()); + const __m128 less2 = _mm_cmplt_ps(hiData, loData); + loData = _mm_min_ps(loData, hiData); + hiData = Mem::permute(loData); + const __m128 less1 = _mm_cmplt_ps(hiData, loData); + loData = _mm_min_ps(loData, hiData); + hiData = Mem::permute(loData); + const __m128 less0 = _mm_cmplt_ps(hiData, loData); + unsigned bits = _mm_movemask_ps(less0) & 0x1; + bits |= ((_mm_movemask_ps(less1) << 1) - bits) & 0x2; + bits |= ((_mm_movemask_ps(less2) << 3) - bits) & 0x4; + loData = _mm_min_ps(loData, hiData); + return std::make_pair(AVX::concat(loData, loData), bits); + */ + + // 28 cycles Latency: + __m256 x = d.v(); + __m256 idx = Vector::IndexesFromZero().data(); + __m256 y = Mem::permute128(x); + __m256 idy = Mem::permute128(idx); + __m256 less = AVX::cmplt_ps(x, y); + + x = _mm256_blendv_ps(y, x, less); + idx = _mm256_blendv_ps(idy, idx, less); + y = Reg::permute(x); + idy = Reg::permute(idx); + less = AVX::cmplt_ps(x, y); + + x = _mm256_blendv_ps(y, x, less); + idx = _mm256_blendv_ps(idy, idx, less); + y = Reg::permute(x); + idy = Reg::permute(idx); + less = AVX::cmplt_ps(x, y); + + idx = 
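The generic minIndex()/maxIndex() above first reduce to the extreme value and then report the first lane that compares equal to it. A scalar sketch of that min() + firstOne() pattern:

#include <array>
#include <cstdio>
#include <utility>

constexpr int N = 8;

// Generic minIndex(): reduce to the minimum, then take the first lane that
// equals it (the min() followed by firstOne(*this == x) pattern above).
static std::pair<float, int> minIndex(const std::array<float, N> &v)
{
    float m = v[0];
    for (int i = 1; i < N; ++i) if (v[i] < m) m = v[i];  // min() reduction
    int first = 0;
    while (v[first] != m) ++first;                       // firstOne of the equality mask
    return {m, first};
}

int main()
{
    const std::array<float, N> v = {3.0f, 1.5f, 9.0f, -2.0f, 7.0f, 0.5f, -2.0f, 4.0f};
    const auto r = minIndex(v);
    std::printf("min = %g at lane %d\n", r.first, r.second);  // min = -2 at lane 3
}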
_mm256_blendv_ps(idy, idx, less); + + const auto index = _mm_cvtsi128_si32(AVX::avx_cast<__m128i>(idx)); +#ifdef Vc_GNU_ASM + __asm__ __volatile__(""); // help GCC to order the instructions better +#endif + x = _mm256_blendv_ps(y, x, less); + return std::make_pair(x, index); } -template<> Vc_ALWAYS_INLINE Vector &Vector::operator/=(const Vector &x) +template Vc_ALWAYS_INLINE AVX2::Vector Vector::partialSum() const { - d.v() = divShort(d.v(), x.d.v()); - return *this; + // a b c d e f g h + // + a b c d e f g -> a ab bc cd de ef fg gh + // + a ab bc cd de ef -> a ab abc abcd bcde cdef defg efgh + // + a ab abc abcd -> a ab abc abcd abcde abcdef abcdefg abcdefgh + AVX2::Vector tmp = *this; + if (Size > 1) tmp += tmp.shifted(-1); + if (Size > 2) tmp += tmp.shifted(-2); + if (Size > 4) tmp += tmp.shifted(-4); + if (Size > 8) tmp += tmp.shifted(-8); + if (Size > 16) tmp += tmp.shifted(-16); + return tmp; +} + +/* This function requires correct masking because the neutral element of \p op is not necessarily 0 + * +template template Vc_ALWAYS_INLINE AVX2::Vector Vector::partialSum(BinaryOperation op) const +{ + // a b c d e f g h + // + a b c d e f g -> a ab bc cd de ef fg gh + // + a ab bc cd de ef -> a ab abc abcd bcde cdef defg efgh + // + a ab abc abcd -> a ab abc abcd abcde abcdef abcdefg abcdefgh + AVX2::Vector tmp = *this; + Mask mask(true); + if (Size > 1) tmp(mask) = op(tmp, tmp.shifted(-1)); + if (Size > 2) tmp(mask) = op(tmp, tmp.shifted(-2)); + if (Size > 4) tmp(mask) = op(tmp, tmp.shifted(-4)); + if (Size > 8) tmp(mask) = op(tmp, tmp.shifted(-8)); + if (Size > 16) tmp(mask) = op(tmp, tmp.shifted(-16)); + return tmp; } -template<> Vc_ALWAYS_INLINE Vector Vc_PURE Vector::operator/(const Vector &x) const +*/ + +template Vc_ALWAYS_INLINE typename Vector::EntryType Vector::min(MaskArgument m) const { - return divShort(d.v(), x.d.v()); + AVX2::Vector tmp = std::numeric_limits >::max(); + tmp(m) = *this; + return tmp.min(); } -template<> Vc_INTRINSIC float_v &float_v::operator/=(const float_v &x) +template Vc_ALWAYS_INLINE typename Vector::EntryType Vector::max(MaskArgument m) const { - d.v() = _mm256_div_ps(d.v(), x.d.v()); - return *this; + AVX2::Vector tmp = std::numeric_limits >::min(); + tmp(m) = *this; + return tmp.max(); } -template<> Vc_INTRINSIC float_v Vc_PURE float_v::operator/(const float_v &x) const +template Vc_ALWAYS_INLINE typename Vector::EntryType Vector::product(MaskArgument m) const { - return _mm256_div_ps(d.v(), x.d.v()); + AVX2::Vector tmp(Vc::One); + tmp(m) = *this; + return tmp.product(); } -template<> Vc_INTRINSIC sfloat_v &sfloat_v::operator/=(const sfloat_v &x) +template Vc_ALWAYS_INLINE typename Vector::EntryType Vector::sum(MaskArgument m) const { - d.v() = _mm256_div_ps(d.v(), x.d.v()); - return *this; -} -template<> Vc_INTRINSIC sfloat_v Vc_PURE sfloat_v::operator/(const sfloat_v &x) const + AVX2::Vector tmp(Vc::Zero); + tmp(m) = *this; + return tmp.sum(); +}//}}} +// exponent {{{1 +namespace Detail { - return _mm256_div_ps(d.v(), x.d.v()); -} -template<> Vc_INTRINSIC double_v &double_v::operator/=(const double_v &x) +Vc_INTRINSIC Vc_CONST __m256 exponent(__m256 v) { - d.v() = _mm256_div_pd(d.v(), x.d.v()); - return *this; + using namespace AVX; + __m128i tmp0 = _mm_srli_epi32(avx_cast<__m128i>(v), 23); + __m128i tmp1 = _mm_srli_epi32(avx_cast<__m128i>(hi128(v)), 23); + tmp0 = _mm_sub_epi32(tmp0, _mm_set1_epi32(0x7f)); + tmp1 = _mm_sub_epi32(tmp1, _mm_set1_epi32(0x7f)); + return _mm256_cvtepi32_ps(concat(tmp0, tmp1)); } -template<> Vc_INTRINSIC double_v 
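partialSum() above is a log-step inclusive prefix sum: adding copies shifted by 1, 2, 4, ... leaves lane i holding the sum of lanes 0..i, as the a / ab / abc diagram in the comment shows. A scalar sketch; shiftedDown() models Vc's zero-filling shifted(-k):

#include <array>
#include <cstdio>

constexpr int N = 8;

// Move lanes towards higher indices, filling with zeros: the scalar
// counterpart of tmp.shifted(-k) in the code above.
static std::array<int, N> shiftedDown(const std::array<int, N> &v, int k)
{
    std::array<int, N> r{};
    for (int i = k; i < N; ++i) r[i] = v[i - k];
    return r;
}

// Log-step inclusive prefix sum: a, ab, abc, abcd, ... after log2(N) rounds.
static std::array<int, N> partialSum(std::array<int, N> v)
{
    for (int k = 1; k < N; k *= 2) {
        const auto s = shiftedDown(v, k);
        for (int i = 0; i < N; ++i) v[i] += s[i];
    }
    return v;
}

int main()
{
    for (int x : partialSum({1, 2, 3, 4, 5, 6, 7, 8}))
        std::printf("%d ", x);  // 1 3 6 10 15 21 28 36
}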
Vc_PURE double_v::operator/(const double_v &x) const +Vc_INTRINSIC Vc_CONST __m256d exponent(__m256d v) { - return _mm256_div_pd(d.v(), x.d.v()); + using namespace AVX; + __m128i tmp0 = _mm_srli_epi64(avx_cast<__m128i>(v), 52); + __m128i tmp1 = _mm_srli_epi64(avx_cast<__m128i>(hi128(v)), 52); + tmp0 = _mm_sub_epi32(tmp0, _mm_set1_epi32(0x3ff)); + tmp1 = _mm_sub_epi32(tmp1, _mm_set1_epi32(0x3ff)); + return _mm256_cvtepi32_pd(avx_cast<__m128i>(Mem::shuffle(avx_cast<__m128>(tmp0), avx_cast<__m128>(tmp1)))); } +} // namespace Detail -/////////////////////////////////////////////////////////////////////////////////////////// -// integer ops {{{1 -#define OP_IMPL(T, symbol) \ -template<> Vc_ALWAYS_INLINE Vector &Vector::operator symbol##=(AsArg x) \ -{ \ - for_all_vector_entries(i, d.m(i) symbol##= x.d.m(i); ); \ - return *this; \ -} \ -template<> Vc_ALWAYS_INLINE Vc_PURE Vector Vector::operator symbol(AsArg x) const \ -{ \ - Vector r; \ - for_all_vector_entries(i, r.d.m(i) = d.m(i) symbol x.d.m(i); ); \ - return r; \ -} -OP_IMPL(int, <<) -OP_IMPL(int, >>) -OP_IMPL(unsigned int, <<) -OP_IMPL(unsigned int, >>) -OP_IMPL(short, <<) -OP_IMPL(short, >>) -OP_IMPL(unsigned short, <<) -OP_IMPL(unsigned short, >>) -#undef OP_IMPL - -template Vc_ALWAYS_INLINE Vector &Vector::operator>>=(int shift) { - d.v() = VectorHelper::shiftRight(d.v(), shift); - return *static_cast *>(this); -} -template Vc_ALWAYS_INLINE Vc_PURE Vector Vector::operator>>(int shift) const { - return VectorHelper::shiftRight(d.v(), shift); -} -template Vc_ALWAYS_INLINE Vector &Vector::operator<<=(int shift) { - d.v() = VectorHelper::shiftLeft(d.v(), shift); - return *static_cast *>(this); -} -template Vc_ALWAYS_INLINE Vc_PURE Vector Vector::operator<<(int shift) const { - return VectorHelper::shiftLeft(d.v(), shift); -} - -#define OP_IMPL(T, symbol, fun) \ - template<> Vc_ALWAYS_INLINE Vector &Vector::operator symbol##=(AsArg x) { d.v() = HV::fun(d.v(), x.d.v()); return *this; } \ - template<> Vc_ALWAYS_INLINE Vc_PURE Vector Vector::operator symbol(AsArg x) const { return Vector(HV::fun(d.v(), x.d.v())); } - OP_IMPL(int, &, and_) - OP_IMPL(int, |, or_) - OP_IMPL(int, ^, xor_) - OP_IMPL(unsigned int, &, and_) - OP_IMPL(unsigned int, |, or_) - OP_IMPL(unsigned int, ^, xor_) - OP_IMPL(short, &, and_) - OP_IMPL(short, |, or_) - OP_IMPL(short, ^, xor_) - OP_IMPL(unsigned short, &, and_) - OP_IMPL(unsigned short, |, or_) - OP_IMPL(unsigned short, ^, xor_) - OP_IMPL(float, &, and_) - OP_IMPL(float, |, or_) - OP_IMPL(float, ^, xor_) - OP_IMPL(sfloat, &, and_) - OP_IMPL(sfloat, |, or_) - OP_IMPL(sfloat, ^, xor_) - OP_IMPL(double, &, and_) - OP_IMPL(double, |, or_) - OP_IMPL(double, ^, xor_) -#undef OP_IMPL - -// operators {{{1 -#include "../common/operators.h" -// isNegative {{{1 -template<> Vc_INTRINSIC Vc_PURE float_m float_v::isNegative() const -{ - return avx_cast(_mm256_srai_epi32(avx_cast(_mm256_and_ps(_mm256_setsignmask_ps(), d.v())), 31)); -} -template<> Vc_INTRINSIC Vc_PURE sfloat_m sfloat_v::isNegative() const -{ - return avx_cast(_mm256_srai_epi32(avx_cast(_mm256_and_ps(_mm256_setsignmask_ps(), d.v())), 31)); -} -template<> Vc_INTRINSIC Vc_PURE double_m double_v::isNegative() const -{ - return Mem::permute(avx_cast( - _mm256_srai_epi32(avx_cast(_mm256_and_pd(_mm256_setsignmask_pd(), d.v())), 31) - )); -} -// gathers {{{1 -// Better implementation (hopefully) with _mm256_set_ -//X template template Vector::Vector(const EntryType *mem, const Index *indexes) -//X { -//X for_all_vector_entries(int i, -//X d.m(i) = mem[indexes[i]]; -//X 
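Detail::exponent() reads the biased exponent field straight out of the IEEE 754 bit pattern (shift right by 23 for float, 52 for double) and subtracts the bias (0x7f or 0x3ff). A scalar sketch of the float case, valid for the positive finite inputs the Vc_ASSERT below allows:

#include <cstdint>
#include <cstring>
#include <cstdio>

// Scalar model of Detail::exponent(__m256): extract the biased exponent
// bits of a positive, normalized float and remove the bias of 127.
static int exponent_of(float x)
{
    uint32_t bits;
    std::memcpy(&bits, &x, sizeof bits);
    return static_cast<int>(bits >> 23) - 0x7f;
}

int main()
{
    std::printf("%d %d %d %d\n",
                exponent_of(1.0f),    // 0
                exponent_of(8.0f),    // 3
                exponent_of(0.75f),   // -1
                exponent_of(1e20f));  // 66
}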
); -//X } -template template Vc_ALWAYS_INLINE Vector::Vector(const EntryType *mem, const IndexT *indexes) +Vc_INTRINSIC Vc_CONST AVX2::float_v exponent(AVX2::float_v x) { - gather(mem, indexes); + using Detail::operator>=; + Vc_ASSERT((x >= x.Zero()).isFull()); + return Detail::exponent(x.data()); } -template template Vc_ALWAYS_INLINE Vector::Vector(const EntryType *mem, VC_ALIGNED_PARAMETER(Vector) indexes) +Vc_INTRINSIC Vc_CONST AVX2::double_v exponent(AVX2::double_v x) { - gather(mem, indexes); + using Detail::operator>=; + Vc_ASSERT((x >= x.Zero()).isFull()); + return Detail::exponent(x.data()); } - -template template Vc_ALWAYS_INLINE Vector::Vector(const EntryType *mem, const IndexT *indexes, MaskArg mask) - : d(HT::zero()) +// }}}1 +// Random {{{1 +static Vc_ALWAYS_INLINE __m256i _doRandomStep() { - gather(mem, indexes, mask); + using Detail::operator*; + using Detail::operator+; +#ifdef Vc_IMPL_AVX2 + using AVX2::uint_v; + uint_v state0(&Common::RandomState[0]); + uint_v state1(&Common::RandomState[uint_v::Size]); + (state1 * uint_v(0xdeece66du) + uint_v(11)).store(&Common::RandomState[uint_v::Size]); + uint_v(Detail::xor_((state0 * uint_v(0xdeece66du) + uint_v(11)).data(), + _mm256_srli_epi32(state1.data(), 16))) + .store(&Common::RandomState[0]); + return state0.data(); +#else + using SSE::uint_v; + uint_v state0(&Common::RandomState[0]); + uint_v state1(&Common::RandomState[uint_v::Size]); + uint_v state2(&Common::RandomState[2 * uint_v::Size]); + uint_v state3(&Common::RandomState[3 * uint_v::Size]); + (state2 * uint_v(0xdeece66du) + uint_v(11)) + .store(&Common::RandomState[2 * uint_v::Size]); + (state3 * uint_v(0xdeece66du) + uint_v(11)) + .store(&Common::RandomState[3 * uint_v::Size]); + uint_v(Detail::xor_((state0 * uint_v(0xdeece66du) + uint_v(11)).data(), + _mm_srli_epi32(state2.data(), 16))) + .store(&Common::RandomState[0]); + uint_v(Detail::xor_((state1 * uint_v(0xdeece66du) + uint_v(11)).data(), + _mm_srli_epi32(state3.data(), 16))) + .store(&Common::RandomState[uint_v::Size]); + return AVX::concat(state0.data(), state1.data()); +#endif } -template template Vc_ALWAYS_INLINE Vector::Vector(const EntryType *mem, VC_ALIGNED_PARAMETER(Vector) indexes, MaskArg mask) - : d(HT::zero()) +#ifdef Vc_IMPL_AVX2 +template Vc_ALWAYS_INLINE AVX2::Vector Vector::Random() { - gather(mem, indexes, mask); + return {_doRandomStep()}; } +#endif -template template Vc_ALWAYS_INLINE Vector::Vector(const S1 *array, const EntryType S1::* member1, VC_ALIGNED_PARAMETER(IT) indexes) -{ - gather(array, member1, indexes); -} -template template Vc_ALWAYS_INLINE Vector::Vector(const S1 *array, const EntryType S1::* member1, VC_ALIGNED_PARAMETER(IT) indexes, MaskArg mask) - : d(HT::zero()) -{ - gather(array, member1, indexes, mask); -} -template template Vc_ALWAYS_INLINE Vector::Vector(const S1 *array, const S2 S1::* member1, const EntryType S2::* member2, VC_ALIGNED_PARAMETER(IT) indexes) -{ - gather(array, member1, member2, indexes); -} -template template Vc_ALWAYS_INLINE Vector::Vector(const S1 *array, const S2 S1::* member1, const EntryType S2::* member2, VC_ALIGNED_PARAMETER(IT) indexes, MaskArg mask) - : d(HT::zero()) -{ - gather(array, member1, member2, indexes, mask); -} -template template Vc_ALWAYS_INLINE Vector::Vector(const S1 *array, const EntryType *const S1::* ptrMember1, VC_ALIGNED_PARAMETER(IT1) outerIndexes, VC_ALIGNED_PARAMETER(IT2) innerIndexes) -{ - gather(array, ptrMember1, outerIndexes, innerIndexes); -} -template template Vc_ALWAYS_INLINE Vector::Vector(const S1 *array, const 
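_doRandomStep() advances per-lane linear congruential generators with multiplier 0xdeece66d and increment 11 (the low bits of the classic drand48 constants) and perturbs one state block with the high half of the other. A single-lane scalar sketch of that arithmetic; LaneRng is illustrative and ignores Vc's RandomState layout:

#include <cstdint>
#include <cstdio>

// One lane of the random step: two LCG states; the pre-update state0 is the
// random output, and the old state1's high half is xor-ed into the new state0.
struct LaneRng {
    uint32_t state0, state1;

    uint32_t step()
    {
        const uint32_t out  = state0;
        const uint32_t new0 = (state0 * 0xdeece66du + 11u) ^ (state1 >> 16);
        const uint32_t new1 =  state1 * 0xdeece66du + 11u;
        state0 = new0;
        state1 = new1;
        return out;
    }
};

int main()
{
    LaneRng rng{0x12345678u, 0x9abcdef0u};
    for (int i = 0; i < 4; ++i) std::printf("%08x\n", rng.step());
}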
EntryType *const S1::* ptrMember1, VC_ALIGNED_PARAMETER(IT1) outerIndexes, VC_ALIGNED_PARAMETER(IT2) innerIndexes, MaskArg mask) - : d(HT::zero()) +template <> Vc_ALWAYS_INLINE AVX2::float_v AVX2::float_v::Random() { - gather(array, ptrMember1, outerIndexes, innerIndexes, mask); + return HT::sub(Detail::or_(_cast(AVX::srli_epi32<2>(_doRandomStep())), HT::one()), + HT::one()); } -template struct IndexSizeChecker { static void check() {} }; -template struct IndexSizeChecker, Size> +template<> Vc_ALWAYS_INLINE AVX2::double_v AVX2::double_v::Random() { - static void check() { - VC_STATIC_ASSERT(Vector::Size >= Size, IndexVector_must_have_greater_or_equal_number_of_entries); + const __m256i state = Detail::load(&Common::RandomState[0], Vc::Aligned, + Detail::LoadTag<__m256i, int>()); + for (size_t k = 0; k < 8; k += 2) { + typedef unsigned long long uint64 Vc_MAY_ALIAS; + const uint64 stateX = *reinterpret_cast(&Common::RandomState[k]); + *reinterpret_cast(&Common::RandomState[k]) = (stateX * 0x5deece66dull + 11); } -}; -template<> template Vc_ALWAYS_INLINE void Vc_FLATTEN Vector::gather(const EntryType *mem, VC_ALIGNED_PARAMETER(Index) indexes) -{ - IndexSizeChecker::check(); - d.v() = _mm256_setr_pd(mem[indexes[0]], mem[indexes[1]], mem[indexes[2]], mem[indexes[3]]); + return HT::sub(Detail::or_(_cast(AVX::srli_epi64<12>(state)), HT::one()), HT::one()); } -template<> template Vc_ALWAYS_INLINE void Vc_FLATTEN Vector::gather(const EntryType *mem, VC_ALIGNED_PARAMETER(Index) indexes) -{ - IndexSizeChecker::check(); - d.v() = _mm256_setr_ps(mem[indexes[0]], mem[indexes[1]], mem[indexes[2]], mem[indexes[3]], - mem[indexes[4]], mem[indexes[5]], mem[indexes[6]], mem[indexes[7]]); -} -template<> template Vc_ALWAYS_INLINE void Vc_FLATTEN Vector::gather(const EntryType *mem, VC_ALIGNED_PARAMETER(Index) indexes) -{ - IndexSizeChecker::check(); - d.v() = _mm256_setr_ps(mem[indexes[0]], mem[indexes[1]], mem[indexes[2]], mem[indexes[3]], - mem[indexes[4]], mem[indexes[5]], mem[indexes[6]], mem[indexes[7]]); -} -template<> template Vc_ALWAYS_INLINE void Vc_FLATTEN Vector::gather(const EntryType *mem, VC_ALIGNED_PARAMETER(Index) indexes) -{ - IndexSizeChecker::check(); - d.v() = _mm256_setr_epi32(mem[indexes[0]], mem[indexes[1]], mem[indexes[2]], mem[indexes[3]], - mem[indexes[4]], mem[indexes[5]], mem[indexes[6]], mem[indexes[7]]); -} -template<> template Vc_ALWAYS_INLINE void Vc_FLATTEN Vector::gather(const EntryType *mem, VC_ALIGNED_PARAMETER(Index) indexes) +// }}}1 +// shifted / rotated {{{1 +template Vc_INTRINSIC AVX2::Vector Vector::shifted(int amount) const { - IndexSizeChecker::check(); - d.v() = _mm256_setr_epi32(mem[indexes[0]], mem[indexes[1]], mem[indexes[2]], mem[indexes[3]], - mem[indexes[4]], mem[indexes[5]], mem[indexes[6]], mem[indexes[7]]); + return Detail::shifted(d.v(), amount); } -template<> template Vc_ALWAYS_INLINE void Vc_FLATTEN Vector::gather(const EntryType *mem, VC_ALIGNED_PARAMETER(Index) indexes) + +template +Vc_INTRINSIC Vc_CONST VectorType shifted_shortcut(VectorType left, VectorType right, Common::WidthT<__m128>) { - IndexSizeChecker::check(); - d.v() = _mm_setr_epi16(mem[indexes[0]], mem[indexes[1]], mem[indexes[2]], mem[indexes[3]], - mem[indexes[4]], mem[indexes[5]], mem[indexes[6]], mem[indexes[7]]); + return Mem::shuffle(left, right); } -template<> template Vc_ALWAYS_INLINE void Vc_FLATTEN Vector::gather(const EntryType *mem, VC_ALIGNED_PARAMETER(Index) indexes) +template +Vc_INTRINSIC Vc_CONST VectorType shifted_shortcut(VectorType left, VectorType right, 
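double_v::Random() above shifts the exponent bits out of the raw random words, ORs in the exponent of 1.0 and subtracts 1.0, giving a uniform value in [0, 1); the float path follows the same pattern through Vc's helpers. A scalar sketch of the canonical 23-bit form of this trick; std::mt19937 only supplies the raw bits here:

#include <cstdint>
#include <cstring>
#include <cstdio>
#include <random>

// "Mantissa splice": put 23 random bits under the exponent of 1.0f, giving a
// float uniformly distributed in [1, 2), then subtract 1.0f to land in [0, 1).
static float random01(uint32_t randomBits)
{
    const uint32_t bits = (randomBits >> 9) | 0x3f800000u;  // keep 23 mantissa bits
    float f;
    std::memcpy(&f, &bits, sizeof f);
    return f - 1.0f;
}

int main()
{
    std::mt19937 gen(42);
    for (int i = 0; i < 4; ++i) std::printf("%f\n", random01(gen()));
}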
Common::WidthT<__m256>) { - IndexSizeChecker::check(); - d.v() = _mm_setr_epi16(mem[indexes[0]], mem[indexes[1]], mem[indexes[2]], mem[indexes[3]], - mem[indexes[4]], mem[indexes[5]], mem[indexes[6]], mem[indexes[7]]); + return Mem::shuffle128(left, right); } -#ifdef VC_USE_SET_GATHERS -template template Vc_ALWAYS_INLINE void Vector::gather(const EntryType *mem, VC_ALIGNED_PARAMETER(Vector) indexes, MaskArg mask) +template Vc_INTRINSIC AVX2::Vector Vector::shifted(int amount, Vector shiftIn) const { - IndexSizeChecker, Size>::check(); - Vector indexesTmp = indexes; - indexesTmp.setZero(!mask); - (*this)(mask) = Vector(mem, indexesTmp); -} -#endif - -#ifdef VC_USE_BSF_GATHERS -#define VC_MASKED_GATHER \ - int bits = mask.toInt(); \ - while (bits) { \ - const int i = _bit_scan_forward(bits); \ - bits &= ~(1 << i); /* btr? */ \ - d.m(i) = ith_value(i); \ - } -#elif defined(VC_USE_POPCNT_BSF_GATHERS) -#define VC_MASKED_GATHER \ - unsigned int bits = mask.toInt(); \ - unsigned int low, high = 0; \ - switch (_mm_popcnt_u32(bits)) { \ - case 8: \ - high = _bit_scan_reverse(bits); \ - d.m(high) = ith_value(high); \ - high = (1 << high); \ - case 7: \ - low = _bit_scan_forward(bits); \ - bits ^= high | (1 << low); \ - d.m(low) = ith_value(low); \ - case 6: \ - high = _bit_scan_reverse(bits); \ - d.m(high) = ith_value(high); \ - high = (1 << high); \ - case 5: \ - low = _bit_scan_forward(bits); \ - bits ^= high | (1 << low); \ - d.m(low) = ith_value(low); \ - case 4: \ - high = _bit_scan_reverse(bits); \ - d.m(high) = ith_value(high); \ - high = (1 << high); \ - case 3: \ - low = _bit_scan_forward(bits); \ - bits ^= high | (1 << low); \ - d.m(low) = ith_value(low); \ - case 2: \ - high = _bit_scan_reverse(bits); \ - d.m(high) = ith_value(high); \ - case 1: \ - low = _bit_scan_forward(bits); \ - d.m(low) = ith_value(low); \ - case 0: \ - break; \ - } -#else -#define VC_MASKED_GATHER \ - if (mask.isEmpty()) { \ - return; \ - } \ - for_all_vector_entries(i, \ - if (mask[i]) d.m(i) = ith_value(i); \ - ); -#endif - -template template -Vc_INTRINSIC void Vector::gather(const EntryType *mem, VC_ALIGNED_PARAMETER(Index) indexes, MaskArg mask) -{ - IndexSizeChecker::check(); -#define ith_value(_i_) (mem[indexes[_i_]]) - VC_MASKED_GATHER -#undef ith_value -} - -template<> template -Vc_ALWAYS_INLINE void Vc_FLATTEN Vector::gather(const S1 *array, const EntryType S1::* member1, VC_ALIGNED_PARAMETER(IT) indexes) -{ - IndexSizeChecker::check(); - d.v() = _mm256_setr_pd(array[indexes[0]].*(member1), array[indexes[1]].*(member1), - array[indexes[2]].*(member1), array[indexes[3]].*(member1)); -} -template<> template -Vc_ALWAYS_INLINE void Vc_FLATTEN Vector::gather(const S1 *array, const EntryType S1::* member1, VC_ALIGNED_PARAMETER(IT) indexes) -{ - IndexSizeChecker::check(); - d.v() = _mm256_setr_ps(array[indexes[0]].*(member1), array[indexes[1]].*(member1), array[indexes[2]].*(member1), - array[indexes[3]].*(member1), array[indexes[4]].*(member1), array[indexes[5]].*(member1), - array[indexes[6]].*(member1), array[indexes[7]].*(member1)); -} -template<> template -Vc_ALWAYS_INLINE void Vc_FLATTEN Vector::gather(const S1 *array, const EntryType S1::* member1, VC_ALIGNED_PARAMETER(IT) indexes) -{ - IndexSizeChecker::check(); - d.v() = _mm256_setr_ps(array[indexes[0]].*(member1), array[indexes[1]].*(member1), array[indexes[2]].*(member1), - array[indexes[3]].*(member1), array[indexes[4]].*(member1), array[indexes[5]].*(member1), - array[indexes[6]].*(member1), array[indexes[7]].*(member1)); -} -template<> template 
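The popcount variant of VC_MASKED_GATHER above enters a switch at the number of set mask bits and falls through, peeling one bit from the top and one from the bottom per pair of cases. A scalar sketch of that control flow; maskedCopy is illustrative and uses the GCC/Clang popcount/clz/ctz builtins plus C++17 [[fallthrough]]:

#include <cstdint>
#include <cstdio>

// Popcount-switch masked copy: enter at case popcount(bits) and fall through,
// handling the highest remaining bit (clz) and the lowest (ctz) alternately.
static void maskedCopy(float *dst, const float *src, uint32_t bits)
{
    unsigned low = 0, high = 0;
    switch (__builtin_popcount(bits)) {
    case 8: high = 31 - __builtin_clz(bits); dst[high] = src[high]; high = 1u << high; [[fallthrough]];
    case 7: low = __builtin_ctz(bits); bits ^= high | (1u << low); dst[low] = src[low]; [[fallthrough]];
    case 6: high = 31 - __builtin_clz(bits); dst[high] = src[high]; high = 1u << high; [[fallthrough]];
    case 5: low = __builtin_ctz(bits); bits ^= high | (1u << low); dst[low] = src[low]; [[fallthrough]];
    case 4: high = 31 - __builtin_clz(bits); dst[high] = src[high]; high = 1u << high; [[fallthrough]];
    case 3: low = __builtin_ctz(bits); bits ^= high | (1u << low); dst[low] = src[low]; [[fallthrough]];
    case 2: high = 31 - __builtin_clz(bits); dst[high] = src[high]; [[fallthrough]];
    case 1: low = __builtin_ctz(bits); dst[low] = src[low]; [[fallthrough]];
    case 0: break;
    }
}

int main()
{
    const float src[8] = {10, 11, 12, 13, 14, 15, 16, 17};
    float dst[8] = {};
    maskedCopy(dst, src, 0b11010110u);           // lanes 1, 2, 4, 6, 7 active
    for (float x : dst) std::printf("%g ", x);   // 0 11 12 0 14 0 16 17
}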
-Vc_ALWAYS_INLINE void Vc_FLATTEN Vector::gather(const S1 *array, const EntryType S1::* member1, VC_ALIGNED_PARAMETER(IT) indexes) -{ - IndexSizeChecker::check(); - d.v() = _mm256_setr_epi32(array[indexes[0]].*(member1), array[indexes[1]].*(member1), array[indexes[2]].*(member1), - array[indexes[3]].*(member1), array[indexes[4]].*(member1), array[indexes[5]].*(member1), - array[indexes[6]].*(member1), array[indexes[7]].*(member1)); -} -template<> template -Vc_ALWAYS_INLINE void Vc_FLATTEN Vector::gather(const S1 *array, const EntryType S1::* member1, VC_ALIGNED_PARAMETER(IT) indexes) -{ - IndexSizeChecker::check(); - d.v() = _mm256_setr_epi32(array[indexes[0]].*(member1), array[indexes[1]].*(member1), array[indexes[2]].*(member1), - array[indexes[3]].*(member1), array[indexes[4]].*(member1), array[indexes[5]].*(member1), - array[indexes[6]].*(member1), array[indexes[7]].*(member1)); -} -template<> template -Vc_ALWAYS_INLINE void Vc_FLATTEN Vector::gather(const S1 *array, const EntryType S1::* member1, VC_ALIGNED_PARAMETER(IT) indexes) -{ - IndexSizeChecker::check(); - d.v() = _mm_setr_epi16(array[indexes[0]].*(member1), array[indexes[1]].*(member1), array[indexes[2]].*(member1), - array[indexes[3]].*(member1), array[indexes[4]].*(member1), array[indexes[5]].*(member1), - array[indexes[6]].*(member1), array[indexes[7]].*(member1)); -} -template<> template -Vc_ALWAYS_INLINE void Vc_FLATTEN Vector::gather(const S1 *array, const EntryType S1::* member1, VC_ALIGNED_PARAMETER(IT) indexes) -{ - IndexSizeChecker::check(); - d.v() = _mm_setr_epi16(array[indexes[0]].*(member1), array[indexes[1]].*(member1), array[indexes[2]].*(member1), - array[indexes[3]].*(member1), array[indexes[4]].*(member1), array[indexes[5]].*(member1), - array[indexes[6]].*(member1), array[indexes[7]].*(member1)); -} -template template -Vc_ALWAYS_INLINE void Vc_FLATTEN Vector::gather(const S1 *array, const EntryType S1::* member1, VC_ALIGNED_PARAMETER(IT) indexes, MaskArg mask) -{ - IndexSizeChecker::check(); -#define ith_value(_i_) (array[indexes[_i_]].*(member1)) - VC_MASKED_GATHER -#undef ith_value -} -template<> template -Vc_ALWAYS_INLINE void Vc_FLATTEN Vector::gather(const S1 *array, const S2 S1::* member1, const EntryType S2::* member2, VC_ALIGNED_PARAMETER(IT) indexes) -{ - IndexSizeChecker::check(); - d.v() = _mm256_setr_pd(array[indexes[0]].*(member1).*(member2), array[indexes[1]].*(member1).*(member2), - array[indexes[2]].*(member1).*(member2), array[indexes[3]].*(member1).*(member2)); -} -template<> template -Vc_ALWAYS_INLINE void Vc_FLATTEN Vector::gather(const S1 *array, const S2 S1::* member1, const EntryType S2::* member2, VC_ALIGNED_PARAMETER(IT) indexes) -{ - IndexSizeChecker::check(); - d.v() = _mm256_setr_ps(array[indexes[0]].*(member1).*(member2), array[indexes[1]].*(member1).*(member2), array[indexes[2]].*(member1).*(member2), - array[indexes[3]].*(member1).*(member2), array[indexes[4]].*(member1).*(member2), array[indexes[5]].*(member1).*(member2), - array[indexes[6]].*(member1).*(member2), array[indexes[7]].*(member1).*(member2)); -} -template<> template -Vc_ALWAYS_INLINE void Vc_FLATTEN Vector::gather(const S1 *array, const S2 S1::* member1, const EntryType S2::* member2, VC_ALIGNED_PARAMETER(IT) indexes) -{ - IndexSizeChecker::check(); - d.v() = _mm256_setr_ps(array[indexes[0]].*(member1).*(member2), array[indexes[1]].*(member1).*(member2), array[indexes[2]].*(member1).*(member2), - array[indexes[3]].*(member1).*(member2), array[indexes[4]].*(member1).*(member2), 
array[indexes[5]].*(member1).*(member2), - array[indexes[6]].*(member1).*(member2), array[indexes[7]].*(member1).*(member2)); -} -template<> template -Vc_ALWAYS_INLINE void Vc_FLATTEN Vector::gather(const S1 *array, const S2 S1::* member1, const EntryType S2::* member2, VC_ALIGNED_PARAMETER(IT) indexes) -{ - IndexSizeChecker::check(); - d.v() = _mm256_setr_epi32(array[indexes[0]].*(member1).*(member2), array[indexes[1]].*(member1).*(member2), array[indexes[2]].*(member1).*(member2), - array[indexes[3]].*(member1).*(member2), array[indexes[4]].*(member1).*(member2), array[indexes[5]].*(member1).*(member2), - array[indexes[6]].*(member1).*(member2), array[indexes[7]].*(member1).*(member2)); -} -template<> template -Vc_ALWAYS_INLINE void Vc_FLATTEN Vector::gather(const S1 *array, const S2 S1::* member1, const EntryType S2::* member2, VC_ALIGNED_PARAMETER(IT) indexes) -{ - IndexSizeChecker::check(); - d.v() = _mm256_setr_epi32(array[indexes[0]].*(member1).*(member2), array[indexes[1]].*(member1).*(member2), array[indexes[2]].*(member1).*(member2), - array[indexes[3]].*(member1).*(member2), array[indexes[4]].*(member1).*(member2), array[indexes[5]].*(member1).*(member2), - array[indexes[6]].*(member1).*(member2), array[indexes[7]].*(member1).*(member2)); -} -template<> template -Vc_ALWAYS_INLINE void Vc_FLATTEN Vector::gather(const S1 *array, const S2 S1::* member1, const EntryType S2::* member2, VC_ALIGNED_PARAMETER(IT) indexes) -{ - IndexSizeChecker::check(); - d.v() = _mm_setr_epi16(array[indexes[0]].*(member1).*(member2), array[indexes[1]].*(member1).*(member2), array[indexes[2]].*(member1).*(member2), - array[indexes[3]].*(member1).*(member2), array[indexes[4]].*(member1).*(member2), array[indexes[5]].*(member1).*(member2), - array[indexes[6]].*(member1).*(member2), array[indexes[7]].*(member1).*(member2)); -} -template<> template -Vc_ALWAYS_INLINE void Vc_FLATTEN Vector::gather(const S1 *array, const S2 S1::* member1, const EntryType S2::* member2, VC_ALIGNED_PARAMETER(IT) indexes) -{ - IndexSizeChecker::check(); - d.v() = _mm_setr_epi16(array[indexes[0]].*(member1).*(member2), array[indexes[1]].*(member1).*(member2), array[indexes[2]].*(member1).*(member2), - array[indexes[3]].*(member1).*(member2), array[indexes[4]].*(member1).*(member2), array[indexes[5]].*(member1).*(member2), - array[indexes[6]].*(member1).*(member2), array[indexes[7]].*(member1).*(member2)); -} -template template -Vc_ALWAYS_INLINE void Vc_FLATTEN Vector::gather(const S1 *array, const S2 S1::* member1, const EntryType S2::* member2, VC_ALIGNED_PARAMETER(IT) indexes, MaskArg mask) -{ - IndexSizeChecker::check(); -#define ith_value(_i_) (array[indexes[_i_]].*(member1).*(member2)) - VC_MASKED_GATHER -#undef ith_value -} -template<> template -Vc_ALWAYS_INLINE void Vc_FLATTEN Vector::gather(const S1 *array, const EntryType *const S1::* ptrMember1, VC_ALIGNED_PARAMETER(IT1) outerIndexes, VC_ALIGNED_PARAMETER(IT2) innerIndexes) -{ - IndexSizeChecker::check(); - IndexSizeChecker::check(); - d.v() = _mm256_setr_pd((array[outerIndexes[0]].*(ptrMember1))[innerIndexes[0]], (array[outerIndexes[1]].*(ptrMember1))[innerIndexes[1]], - (array[outerIndexes[2]].*(ptrMember1))[innerIndexes[2]], (array[outerIndexes[3]].*(ptrMember1))[innerIndexes[3]]); -} -template<> template -Vc_ALWAYS_INLINE void Vc_FLATTEN Vector::gather(const S1 *array, const EntryType *const S1::* ptrMember1, VC_ALIGNED_PARAMETER(IT1) outerIndexes, VC_ALIGNED_PARAMETER(IT2) innerIndexes) -{ - IndexSizeChecker::check(); - IndexSizeChecker::check(); - d.v() = 
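The struct-member gather overloads above read array[indexes[i]].*member (or a chain of two members) for every lane. A scalar sketch of gathering through a pointer to member; Particle and gatherMember are illustrative only:

#include <array>
#include <cstddef>
#include <cstdio>

struct Particle { float x, y, weight; };

// Scalar model of the member gathers: for each lane, index into the array of
// structs and read the field selected by the pointer to member.
template <std::size_t N, typename IT>
std::array<float, N> gatherMember(const Particle *array, float Particle::*member,
                                  const std::array<IT, N> &indexes)
{
    std::array<float, N> out;
    for (std::size_t i = 0; i < N; ++i) out[i] = array[indexes[i]].*member;
    return out;
}

int main()
{
    const Particle ps[4] = {{1, 2, 0.1f}, {3, 4, 0.2f}, {5, 6, 0.3f}, {7, 8, 0.4f}};
    const std::array<int, 4> idx = {3, 1, 2, 0};
    for (float w : gatherMember(ps, &Particle::weight, idx))
        std::printf("%g ", w);  // 0.4 0.2 0.3 0.1
}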
_mm256_setr_ps((array[outerIndexes[0]].*(ptrMember1))[innerIndexes[0]], (array[outerIndexes[1]].*(ptrMember1))[innerIndexes[1]], - (array[outerIndexes[2]].*(ptrMember1))[innerIndexes[2]], (array[outerIndexes[3]].*(ptrMember1))[innerIndexes[3]], - (array[outerIndexes[4]].*(ptrMember1))[innerIndexes[4]], (array[outerIndexes[5]].*(ptrMember1))[innerIndexes[5]], - (array[outerIndexes[6]].*(ptrMember1))[innerIndexes[6]], (array[outerIndexes[7]].*(ptrMember1))[innerIndexes[7]]); -} -template<> template -Vc_ALWAYS_INLINE void Vc_FLATTEN Vector::gather(const S1 *array, const EntryType *const S1::* ptrMember1, VC_ALIGNED_PARAMETER(IT1) outerIndexes, VC_ALIGNED_PARAMETER(IT2) innerIndexes) -{ - IndexSizeChecker::check(); - IndexSizeChecker::check(); - d.v() = _mm256_setr_ps((array[outerIndexes[0]].*(ptrMember1))[innerIndexes[0]], (array[outerIndexes[1]].*(ptrMember1))[innerIndexes[1]], - (array[outerIndexes[2]].*(ptrMember1))[innerIndexes[2]], (array[outerIndexes[3]].*(ptrMember1))[innerIndexes[3]], - (array[outerIndexes[4]].*(ptrMember1))[innerIndexes[4]], (array[outerIndexes[5]].*(ptrMember1))[innerIndexes[5]], - (array[outerIndexes[6]].*(ptrMember1))[innerIndexes[6]], (array[outerIndexes[7]].*(ptrMember1))[innerIndexes[7]]); -} -template<> template -Vc_ALWAYS_INLINE void Vc_FLATTEN Vector::gather(const S1 *array, const EntryType *const S1::* ptrMember1, VC_ALIGNED_PARAMETER(IT1) outerIndexes, VC_ALIGNED_PARAMETER(IT2) innerIndexes) -{ - IndexSizeChecker::check(); - IndexSizeChecker::check(); - d.v() = _mm256_setr_epi32((array[outerIndexes[0]].*(ptrMember1))[innerIndexes[0]], (array[outerIndexes[1]].*(ptrMember1))[innerIndexes[1]], - (array[outerIndexes[2]].*(ptrMember1))[innerIndexes[2]], (array[outerIndexes[3]].*(ptrMember1))[innerIndexes[3]], - (array[outerIndexes[4]].*(ptrMember1))[innerIndexes[4]], (array[outerIndexes[5]].*(ptrMember1))[innerIndexes[5]], - (array[outerIndexes[6]].*(ptrMember1))[innerIndexes[6]], (array[outerIndexes[7]].*(ptrMember1))[innerIndexes[7]]); -} -template<> template -Vc_ALWAYS_INLINE void Vc_FLATTEN Vector::gather(const S1 *array, const EntryType *const S1::* ptrMember1, VC_ALIGNED_PARAMETER(IT1) outerIndexes, VC_ALIGNED_PARAMETER(IT2) innerIndexes) -{ - IndexSizeChecker::check(); - IndexSizeChecker::check(); - d.v() = _mm256_setr_epi32((array[outerIndexes[0]].*(ptrMember1))[innerIndexes[0]], (array[outerIndexes[1]].*(ptrMember1))[innerIndexes[1]], - (array[outerIndexes[2]].*(ptrMember1))[innerIndexes[2]], (array[outerIndexes[3]].*(ptrMember1))[innerIndexes[3]], - (array[outerIndexes[4]].*(ptrMember1))[innerIndexes[4]], (array[outerIndexes[5]].*(ptrMember1))[innerIndexes[5]], - (array[outerIndexes[6]].*(ptrMember1))[innerIndexes[6]], (array[outerIndexes[7]].*(ptrMember1))[innerIndexes[7]]); -} -template<> template -Vc_ALWAYS_INLINE void Vc_FLATTEN Vector::gather(const S1 *array, const EntryType *const S1::* ptrMember1, VC_ALIGNED_PARAMETER(IT1) outerIndexes, VC_ALIGNED_PARAMETER(IT2) innerIndexes) -{ - IndexSizeChecker::check(); - IndexSizeChecker::check(); - d.v() = _mm_setr_epi16((array[outerIndexes[0]].*(ptrMember1))[innerIndexes[0]], (array[outerIndexes[1]].*(ptrMember1))[innerIndexes[1]], - (array[outerIndexes[2]].*(ptrMember1))[innerIndexes[2]], (array[outerIndexes[3]].*(ptrMember1))[innerIndexes[3]], - (array[outerIndexes[4]].*(ptrMember1))[innerIndexes[4]], (array[outerIndexes[5]].*(ptrMember1))[innerIndexes[5]], - (array[outerIndexes[6]].*(ptrMember1))[innerIndexes[6]], (array[outerIndexes[7]].*(ptrMember1))[innerIndexes[7]]); -} -template<> template 
-Vc_ALWAYS_INLINE void Vc_FLATTEN Vector::gather(const S1 *array, const EntryType *const S1::* ptrMember1, VC_ALIGNED_PARAMETER(IT1) outerIndexes, VC_ALIGNED_PARAMETER(IT2) innerIndexes) -{ - IndexSizeChecker::check(); - IndexSizeChecker::check(); - d.v() = _mm_setr_epi16((array[outerIndexes[0]].*(ptrMember1))[innerIndexes[0]], (array[outerIndexes[1]].*(ptrMember1))[innerIndexes[1]], - (array[outerIndexes[2]].*(ptrMember1))[innerIndexes[2]], (array[outerIndexes[3]].*(ptrMember1))[innerIndexes[3]], - (array[outerIndexes[4]].*(ptrMember1))[innerIndexes[4]], (array[outerIndexes[5]].*(ptrMember1))[innerIndexes[5]], - (array[outerIndexes[6]].*(ptrMember1))[innerIndexes[6]], (array[outerIndexes[7]].*(ptrMember1))[innerIndexes[7]]); -} -template template -Vc_ALWAYS_INLINE void Vc_FLATTEN Vector::gather(const S1 *array, const EntryType *const S1::* ptrMember1, VC_ALIGNED_PARAMETER(IT1) outerIndexes, VC_ALIGNED_PARAMETER(IT2) innerIndexes, MaskArg mask) -{ - IndexSizeChecker::check(); - IndexSizeChecker::check(); -#define ith_value(_i_) (array[outerIndexes[_i_]].*(ptrMember1))[innerIndexes[_i_]] - VC_MASKED_GATHER -#undef ith_value -} - -#undef VC_MASKED_GATHER -#ifdef VC_USE_BSF_SCATTERS -#define VC_MASKED_SCATTER \ - int bits = mask.toInt(); \ - while (bits) { \ - const int i = _bit_scan_forward(bits); \ - bits ^= (1 << i); /* btr? */ \ - ith_value(i) = d.m(i); \ - } -#elif defined(VC_USE_POPCNT_BSF_SCATTERS) -#define VC_MASKED_SCATTER \ - unsigned int bits = mask.toInt(); \ - unsigned int low, high = 0; \ - switch (_mm_popcnt_u32(bits)) { \ - case 8: \ - high = _bit_scan_reverse(bits); \ - ith_value(high) = d.m(high); \ - high = (1 << high); \ - case 7: \ - low = _bit_scan_forward(bits); \ - bits ^= high | (1 << low); \ - ith_value(low) = d.m(low); \ - case 6: \ - high = _bit_scan_reverse(bits); \ - ith_value(high) = d.m(high); \ - high = (1 << high); \ - case 5: \ - low = _bit_scan_forward(bits); \ - bits ^= high | (1 << low); \ - ith_value(low) = d.m(low); \ - case 4: \ - high = _bit_scan_reverse(bits); \ - ith_value(high) = d.m(high); \ - high = (1 << high); \ - case 3: \ - low = _bit_scan_forward(bits); \ - bits ^= high | (1 << low); \ - ith_value(low) = d.m(low); \ - case 2: \ - high = _bit_scan_reverse(bits); \ - ith_value(high) = d.m(high); \ - case 1: \ - low = _bit_scan_forward(bits); \ - ith_value(low) = d.m(low); \ - case 0: \ - break; \ +#ifdef __GNUC__ + if (__builtin_constant_p(amount)) { + switch (amount * 2) { + case int(Size): + return shifted_shortcut(d.v(), shiftIn.d.v(), WidthT()); + case -int(Size): + return shifted_shortcut(shiftIn.d.v(), d.v(), WidthT()); + } } -#else -#define VC_MASKED_SCATTER \ - if (mask.isEmpty()) { \ - return; \ - } \ - for_all_vector_entries(i, \ - if (mask[i]) ith_value(i) = d.m(i); \ - ); #endif - -template template Vc_ALWAYS_INLINE void Vc_FLATTEN Vector::scatter(EntryType *mem, VC_ALIGNED_PARAMETER(Index) indexes) const -{ - for_all_vector_entries(i, - mem[indexes[i]] = d.m(i); - ); -} -#if defined(VC_MSVC) && VC_MSVC >= 170000000 -// MSVC miscompiles the store mem[indexes[1]] = d.m(1) for T = (u)short -template<> template Vc_ALWAYS_INLINE void short_v::scatter(EntryType *mem, VC_ALIGNED_PARAMETER(Index) indexes) const -{ - const unsigned int tmp = d.v()._d.m128i_u32[0]; - mem[indexes[0]] = tmp & 0xffff; - mem[indexes[1]] = tmp >> 16; - mem[indexes[2]] = _mm_extract_epi16(d.v(), 2); - mem[indexes[3]] = _mm_extract_epi16(d.v(), 3); - mem[indexes[4]] = _mm_extract_epi16(d.v(), 4); - mem[indexes[5]] = _mm_extract_epi16(d.v(), 5); - mem[indexes[6]] = 
_mm_extract_epi16(d.v(), 6); - mem[indexes[7]] = _mm_extract_epi16(d.v(), 7); -} -template<> template Vc_ALWAYS_INLINE void ushort_v::scatter(EntryType *mem, VC_ALIGNED_PARAMETER(Index) indexes) const -{ - const unsigned int tmp = d.v()._d.m128i_u32[0]; - mem[indexes[0]] = tmp & 0xffff; - mem[indexes[1]] = tmp >> 16; - mem[indexes[2]] = _mm_extract_epi16(d.v(), 2); - mem[indexes[3]] = _mm_extract_epi16(d.v(), 3); - mem[indexes[4]] = _mm_extract_epi16(d.v(), 4); - mem[indexes[5]] = _mm_extract_epi16(d.v(), 5); - mem[indexes[6]] = _mm_extract_epi16(d.v(), 6); - mem[indexes[7]] = _mm_extract_epi16(d.v(), 7); + using Detail::operator|; + return shifted(amount) | (amount > 0 ? + shiftIn.shifted(amount - Size) : + shiftIn.shifted(Size + amount)); } -#endif -template template Vc_ALWAYS_INLINE void Vc_FLATTEN Vector::scatter(EntryType *mem, VC_ALIGNED_PARAMETER(Index) indexes, MaskArg mask) const +template Vc_INTRINSIC AVX2::Vector Vector::rotated(int amount) const { -#define ith_value(_i_) mem[indexes[_i_]] - VC_MASKED_SCATTER -#undef ith_value + return Detail::rotated(d.v(), amount); } -template template Vc_ALWAYS_INLINE void Vc_FLATTEN Vector::scatter(S1 *array, EntryType S1::* member1, VC_ALIGNED_PARAMETER(IT) indexes) const +// sorted {{{1 +template +Vc_ALWAYS_INLINE Vc_PURE Vector Vector::sorted() + const { - for_all_vector_entries(i, - array[indexes[i]].*(member1) = d.m(i); - ); + return Detail::sorted(*this); } -template template Vc_ALWAYS_INLINE void Vc_FLATTEN Vector::scatter(S1 *array, EntryType S1::* member1, VC_ALIGNED_PARAMETER(IT) indexes, MaskArg mask) const +// interleaveLow/-High {{{1 +template <> Vc_INTRINSIC AVX2::double_v AVX2::double_v::interleaveLow(AVX2::double_v x) const { -#define ith_value(_i_) array[indexes[_i_]].*(member1) - VC_MASKED_SCATTER -#undef ith_value + return Mem::shuffle128(_mm256_unpacklo_pd(data(), x.data()), + _mm256_unpackhi_pd(data(), x.data())); } -template template Vc_ALWAYS_INLINE void Vc_FLATTEN Vector::scatter(S1 *array, S2 S1::* member1, EntryType S2::* member2, VC_ALIGNED_PARAMETER(IT) indexes) const +template <> Vc_INTRINSIC AVX2::double_v AVX2::double_v::interleaveHigh(AVX2::double_v x) const { - for_all_vector_entries(i, - array[indexes[i]].*(member1).*(member2) = d.m(i); - ); + return Mem::shuffle128(_mm256_unpacklo_pd(data(), x.data()), + _mm256_unpackhi_pd(data(), x.data())); } -template template Vc_ALWAYS_INLINE void Vc_FLATTEN Vector::scatter(S1 *array, S2 S1::* member1, EntryType S2::* member2, VC_ALIGNED_PARAMETER(IT) indexes, MaskArg mask) const +template <> Vc_INTRINSIC AVX2::float_v AVX2::float_v::interleaveLow(AVX2::float_v x) const { -#define ith_value(_i_) array[indexes[_i_]].*(member1).*(member2) - VC_MASKED_SCATTER -#undef ith_value + return Mem::shuffle128(_mm256_unpacklo_ps(data(), x.data()), + _mm256_unpackhi_ps(data(), x.data())); } -template template Vc_ALWAYS_INLINE void Vc_FLATTEN Vector::scatter(S1 *array, EntryType *S1::* ptrMember1, VC_ALIGNED_PARAMETER(IT1) outerIndexes, VC_ALIGNED_PARAMETER(IT2) innerIndexes) const +template <> Vc_INTRINSIC AVX2::float_v AVX2::float_v::interleaveHigh(AVX2::float_v x) const { - for_all_vector_entries(i, - (array[innerIndexes[i]].*(ptrMember1))[outerIndexes[i]] = d.m(i); - ); + return Mem::shuffle128(_mm256_unpacklo_ps(data(), x.data()), + _mm256_unpackhi_ps(data(), x.data())); } -template template Vc_ALWAYS_INLINE void Vc_FLATTEN Vector::scatter(S1 *array, EntryType *S1::* ptrMember1, VC_ALIGNED_PARAMETER(IT1) outerIndexes, VC_ALIGNED_PARAMETER(IT2) innerIndexes, MaskArg mask) const 
-{ -#define ith_value(_i_) (array[outerIndexes[_i_]].*(ptrMember1))[innerIndexes[_i_]] - VC_MASKED_SCATTER -#undef ith_value +#ifdef Vc_IMPL_AVX2 +template <> Vc_INTRINSIC AVX2::int_v AVX2::int_v::interleaveLow ( AVX2::int_v x) const { + return Mem::shuffle128(_mm256_unpacklo_epi32(data(), x.data()), + _mm256_unpackhi_epi32(data(), x.data())); } - -/////////////////////////////////////////////////////////////////////////////////////////// -// operator- {{{1 -template<> Vc_ALWAYS_INLINE Vector Vc_PURE Vc_FLATTEN Vector::operator-() const -{ - return _mm256_xor_pd(d.v(), _mm256_setsignmask_pd()); +template <> Vc_INTRINSIC AVX2::int_v AVX2::int_v::interleaveHigh( AVX2::int_v x) const { + return Mem::shuffle128(_mm256_unpacklo_epi32(data(), x.data()), + _mm256_unpackhi_epi32(data(), x.data())); } -template<> Vc_ALWAYS_INLINE Vector Vc_PURE Vc_FLATTEN Vector::operator-() const -{ - return _mm256_xor_ps(d.v(), _mm256_setsignmask_ps()); +template <> Vc_INTRINSIC AVX2::uint_v AVX2::uint_v::interleaveLow ( AVX2::uint_v x) const { + return Mem::shuffle128(_mm256_unpacklo_epi32(data(), x.data()), + _mm256_unpackhi_epi32(data(), x.data())); } -template<> Vc_ALWAYS_INLINE Vector Vc_PURE Vc_FLATTEN Vector::operator-() const -{ - return _mm256_xor_ps(d.v(), _mm256_setsignmask_ps()); +template <> Vc_INTRINSIC AVX2::uint_v AVX2::uint_v::interleaveHigh( AVX2::uint_v x) const { + return Mem::shuffle128(_mm256_unpacklo_epi32(data(), x.data()), + _mm256_unpackhi_epi32(data(), x.data())); } -template<> Vc_ALWAYS_INLINE Vector Vc_PURE Vc_FLATTEN Vector::operator-() const -{ - return _mm256_sign_epi32(d.v(), _mm256_setallone_si256()); +template <> Vc_INTRINSIC AVX2::short_v AVX2::short_v::interleaveLow ( AVX2::short_v x) const { + return Mem::shuffle128(_mm256_unpacklo_epi16(data(), x.data()), + _mm256_unpackhi_epi16(data(), x.data())); } -template<> Vc_ALWAYS_INLINE Vector Vc_PURE Vc_FLATTEN Vector::operator-() const -{ - return _mm256_sign_epi32(d.v(), _mm256_setallone_si256()); +template <> Vc_INTRINSIC AVX2::short_v AVX2::short_v::interleaveHigh( AVX2::short_v x) const { + return Mem::shuffle128(_mm256_unpacklo_epi16(data(), x.data()), + _mm256_unpackhi_epi16(data(), x.data())); } -template<> Vc_ALWAYS_INLINE Vector Vc_PURE Vc_FLATTEN Vector::operator-() const -{ - return _mm_sign_epi16(d.v(), _mm_setallone_si128()); +template <> Vc_INTRINSIC AVX2::ushort_v AVX2::ushort_v::interleaveLow (AVX2::ushort_v x) const { + return Mem::shuffle128(_mm256_unpacklo_epi16(data(), x.data()), + _mm256_unpackhi_epi16(data(), x.data())); } -template<> Vc_ALWAYS_INLINE Vector Vc_PURE Vc_FLATTEN Vector::operator-() const -{ - return _mm_sign_epi16(d.v(), _mm_setallone_si128()); +template <> Vc_INTRINSIC AVX2::ushort_v AVX2::ushort_v::interleaveHigh(AVX2::ushort_v x) const { + return Mem::shuffle128(_mm256_unpacklo_epi16(data(), x.data()), + _mm256_unpackhi_epi16(data(), x.data())); } - -/////////////////////////////////////////////////////////////////////////////////////////// -// horizontal ops {{{1 -template Vc_ALWAYS_INLINE typename Vector::EntryType Vector::min(MaskArg m) const +#endif +// permutation via operator[] {{{1 +template <> Vc_INTRINSIC Vc_PURE AVX2::double_v AVX2::double_v::operator[](Permutation::ReversedTag) const { - Vector tmp = std::numeric_limits >::max(); - tmp(m) = *this; - return tmp.min(); + return Mem::permute128(Mem::permute(d.v())); } -template Vc_ALWAYS_INLINE typename Vector::EntryType Vector::max(MaskArg m) const +template <> Vc_INTRINSIC Vc_PURE AVX2::float_v 
AVX2::float_v::operator[](Permutation::ReversedTag) const { - Vector tmp = std::numeric_limits >::min(); - tmp(m) = *this; - return tmp.max(); + return Mem::permute128(Mem::permute(d.v())); } -template Vc_ALWAYS_INLINE typename Vector::EntryType Vector::product(MaskArg m) const +#ifdef Vc_IMPL_AVX2 +template <> +Vc_INTRINSIC Vc_PURE AVX2::int_v AVX2::int_v::operator[](Permutation::ReversedTag) const { - Vector tmp(VectorSpecialInitializerOne::One); - tmp(m) = *this; - return tmp.product(); + return Mem::permute128(Mem::permute(d.v())); } -template Vc_ALWAYS_INLINE typename Vector::EntryType Vector::sum(MaskArg m) const +template <> +Vc_INTRINSIC Vc_PURE AVX2::uint_v AVX2::uint_v::operator[](Permutation::ReversedTag) const { - Vector tmp(VectorSpecialInitializerZero::Zero); - tmp(m) = *this; - return tmp.sum(); -}//}}} -// copySign {{{1 -template<> Vc_INTRINSIC Vector Vector::copySign(Vector::AsArg reference) const -{ - return _mm256_or_ps( - _mm256_and_ps(reference.d.v(), _mm256_setsignmask_ps()), - _mm256_and_ps(d.v(), _mm256_setabsmask_ps()) - ); -} -template<> Vc_INTRINSIC Vector Vector::copySign(Vector::AsArg reference) const -{ - return _mm256_or_ps( - _mm256_and_ps(reference.d.v(), _mm256_setsignmask_ps()), - _mm256_and_ps(d.v(), _mm256_setabsmask_ps()) - ); -} -template<> Vc_INTRINSIC Vector Vector::copySign(Vector::AsArg reference) const -{ - return _mm256_or_pd( - _mm256_and_pd(reference.d.v(), _mm256_setsignmask_pd()), - _mm256_and_pd(d.v(), _mm256_setabsmask_pd()) - ); -}//}}}1 -// exponent {{{1 -template<> Vc_INTRINSIC Vector Vector::exponent() const -{ - VC_ASSERT((*this >= 0.f).isFull()); - return Internal::exponent(d.v()); -} -template<> Vc_INTRINSIC Vector Vector::exponent() const -{ - VC_ASSERT((*this >= 0.f).isFull()); - return Internal::exponent(d.v()); + return Mem::permute128(Mem::permute(d.v())); } -template<> Vc_INTRINSIC Vector Vector::exponent() const +template <> +Vc_INTRINSIC Vc_PURE AVX2::short_v AVX2::short_v::operator[]( + Permutation::ReversedTag) const { - VC_ASSERT((*this >= 0.).isFull()); - return Internal::exponent(d.v()); + return Mem::permute128(AVX::avx_cast<__m256i>(Mem::shuffle( + AVX::avx_cast<__m256d>(Mem::permuteHi(d.v())), + AVX::avx_cast<__m256d>(Mem::permuteLo(d.v()))))); } -// }}}1 -// Random {{{1 -static Vc_ALWAYS_INLINE void _doRandomStep(Vector &state0, - Vector &state1) +template <> +Vc_INTRINSIC Vc_PURE AVX2::ushort_v AVX2::ushort_v::operator[]( + Permutation::ReversedTag) const { - state0.load(&Vc::RandomState[0]); - state1.load(&Vc::RandomState[uint_v::Size]); - (state1 * 0xdeece66du + 11).store(&Vc::RandomState[uint_v::Size]); - uint_v(_mm256_xor_si256((state0 * 0xdeece66du + 11).data(), _mm256_srli_epi32(state1.data(), 16))).store(&Vc::RandomState[0]); + return Mem::permute128(AVX::avx_cast<__m256i>(Mem::shuffle( + AVX::avx_cast<__m256d>(Mem::permuteHi(d.v())), + AVX::avx_cast<__m256d>(Mem::permuteLo(d.v()))))); } - -template Vc_ALWAYS_INLINE Vector Vector::Random() -{ - Vector state0, state1; - _doRandomStep(state0, state1); - return state0.reinterpretCast >(); -} - -template<> Vc_ALWAYS_INLINE Vector Vector::Random() +#endif +template <> Vc_INTRINSIC AVX2::float_v Vector::operator[](const IndexType &/*perm*/) const { - Vector state0, state1; - _doRandomStep(state0, state1); - return HT::sub(HV::or_(_cast(_mm256_srli_epi32(state0.data(), 2)), HT::one()), HT::one()); + // TODO + return *this; +#ifdef Vc_IMPL_AVX2 +#else + /* + const int_m cross128 = AVX::concat(_mm_cmpgt_epi32(AVX::lo128(perm.data()), _mm_set1_epi32(3)), + 
_mm_cmplt_epi32(AVX::hi128(perm.data()), _mm_set1_epi32(4))); + if (cross128.isNotEmpty()) { + AVX2::float_v x = _mm256_permutevar_ps(d.v(), perm.data()); + x(cross128) = _mm256_permutevar_ps(Mem::permute128(d.v()), perm.data()); + return x; + } else { + */ +#endif } -template<> Vc_ALWAYS_INLINE Vector Vector::Random() +// reversed {{{1 +template +Vc_INTRINSIC Vc_PURE Vector Vector::reversed() const { - Vector state0, state1; - _doRandomStep(state0, state1); - return HT::sub(HV::or_(_cast(_mm256_srli_epi32(state0.data(), 2)), HT::one()), HT::one()); + return (*this)[Permutation::Reversed]; } -template<> Vc_ALWAYS_INLINE Vector Vector::Random() +// broadcast from constexpr index {{{1 +template <> template Vc_INTRINSIC AVX2::float_v AVX2::float_v::broadcast() const { - const m256i state = VectorHelper::load(&Vc::RandomState[0], Vc::Aligned); - for (size_t k = 0; k < 8; k += 2) { - typedef unsigned long long uint64 Vc_MAY_ALIAS; - const uint64 stateX = *reinterpret_cast(&Vc::RandomState[k]); - *reinterpret_cast(&Vc::RandomState[k]) = (stateX * 0x5deece66dull + 11); - } - return (Vector(_cast(_mm256_srli_epi64(state, 12))) | One()) - One(); + constexpr VecPos Inner = static_cast(Index & 0x3); + constexpr VecPos Outer = static_cast((Index & 0x4) / 4); + return Mem::permute(Mem::permute128(d.v())); } -// }}}1 -// shifted / rotated {{{1 -template struct VectorShift; -template<> struct VectorShift<32, 4, m256d, double> -{ - static Vc_INTRINSIC m256d shifted(param256d v, int amount) - { - switch (amount) { - case 0: return v; - case 1: return avx_cast(_mm256_srli_si256(avx_cast(v), 1 * sizeof(double))); - case 2: return avx_cast(_mm256_srli_si256(avx_cast(v), 2 * sizeof(double))); - case 3: return avx_cast(_mm256_srli_si256(avx_cast(v), 3 * sizeof(double))); - case -1: return avx_cast(_mm256_slli_si256(avx_cast(v), 1 * sizeof(double))); - case -2: return avx_cast(_mm256_slli_si256(avx_cast(v), 2 * sizeof(double))); - case -3: return avx_cast(_mm256_slli_si256(avx_cast(v), 3 * sizeof(double))); - } - return _mm256_setzero_pd(); - } -}; -template struct VectorShift<32, 8, VectorType, EntryType> -{ - typedef typename SseVectorType::Type SmallV; - static Vc_INTRINSIC VectorType shifted(VC_ALIGNED_PARAMETER(VectorType) v, int amount) - { - switch (amount) { - case 0: return v; - case 1: return avx_cast(_mm256_srli_si256(avx_cast(v), 1 * sizeof(EntryType))); - case 2: return avx_cast(_mm256_srli_si256(avx_cast(v), 2 * sizeof(EntryType))); - case 3: return avx_cast(_mm256_srli_si256(avx_cast(v), 3 * sizeof(EntryType))); - case 4: return avx_cast(_mm256_srli_si256(avx_cast(v), 4 * sizeof(EntryType))); - case 5: return avx_cast(_mm256_srli_si256(avx_cast(v), 5 * sizeof(EntryType))); - case 6: return avx_cast(_mm256_srli_si256(avx_cast(v), 6 * sizeof(EntryType))); - case 7: return avx_cast(_mm256_srli_si256(avx_cast(v), 7 * sizeof(EntryType))); - case -1: return avx_cast(_mm256_slli_si256(avx_cast(v), 1 * sizeof(EntryType))); - case -2: return avx_cast(_mm256_slli_si256(avx_cast(v), 2 * sizeof(EntryType))); - case -3: return avx_cast(_mm256_slli_si256(avx_cast(v), 3 * sizeof(EntryType))); - case -4: return avx_cast(_mm256_slli_si256(avx_cast(v), 4 * sizeof(EntryType))); - case -5: return avx_cast(_mm256_slli_si256(avx_cast(v), 5 * sizeof(EntryType))); - case -6: return avx_cast(_mm256_slli_si256(avx_cast(v), 6 * sizeof(EntryType))); - case -7: return avx_cast(_mm256_slli_si256(avx_cast(v), 7 * sizeof(EntryType))); - } - return avx_cast(_mm256_setzero_ps()); - } -}; -template struct VectorShift<16, 8, 
VectorType, EntryType> -{ - enum { - EntryTypeSizeof = sizeof(EntryType) - }; - static Vc_INTRINSIC VectorType shifted(VC_ALIGNED_PARAMETER(VectorType) v, int amount) - { - switch (amount) { - case 0: return v; - case 1: return avx_cast(_mm_srli_si128(avx_cast(v), 1 * EntryTypeSizeof)); - case 2: return avx_cast(_mm_srli_si128(avx_cast(v), 2 * EntryTypeSizeof)); - case 3: return avx_cast(_mm_srli_si128(avx_cast(v), 3 * EntryTypeSizeof)); - case 4: return avx_cast(_mm_srli_si128(avx_cast(v), 4 * EntryTypeSizeof)); - case 5: return avx_cast(_mm_srli_si128(avx_cast(v), 5 * EntryTypeSizeof)); - case 6: return avx_cast(_mm_srli_si128(avx_cast(v), 6 * EntryTypeSizeof)); - case 7: return avx_cast(_mm_srli_si128(avx_cast(v), 7 * EntryTypeSizeof)); - case -1: return avx_cast(_mm_slli_si128(avx_cast(v), 1 * EntryTypeSizeof)); - case -2: return avx_cast(_mm_slli_si128(avx_cast(v), 2 * EntryTypeSizeof)); - case -3: return avx_cast(_mm_slli_si128(avx_cast(v), 3 * EntryTypeSizeof)); - case -4: return avx_cast(_mm_slli_si128(avx_cast(v), 4 * EntryTypeSizeof)); - case -5: return avx_cast(_mm_slli_si128(avx_cast(v), 5 * EntryTypeSizeof)); - case -6: return avx_cast(_mm_slli_si128(avx_cast(v), 6 * EntryTypeSizeof)); - case -7: return avx_cast(_mm_slli_si128(avx_cast(v), 7 * EntryTypeSizeof)); - } - return _mm_setzero_si128(); - } -}; -template Vc_INTRINSIC Vector Vector::shifted(int amount) const +template <> template Vc_INTRINSIC AVX2::double_v AVX2::double_v::broadcast() const { - return VectorShift::shifted(d.v(), amount); -} -template struct VectorRotate; -template struct VectorRotate<32, 4, VectorType, EntryType> -{ - typedef typename SseVectorType::Type SmallV; - enum { - EntryTypeSizeof = sizeof(EntryType) - }; - static Vc_INTRINSIC VectorType rotated(VC_ALIGNED_PARAMETER(VectorType) v, int amount) - { - const m128i vLo = avx_cast(lo128(v)); - const m128i vHi = avx_cast(hi128(v)); - switch (static_cast(amount) % 4) { - case 0: return v; - case 1: return concat(avx_cast(_mm_alignr_epi8(vHi, vLo, 1 * EntryTypeSizeof)), avx_cast(_mm_alignr_epi8(vLo, vHi, 1 * EntryTypeSizeof))); - case 2: return Mem::permute128(v); - case 3: return concat(avx_cast(_mm_alignr_epi8(vLo, vHi, 1 * EntryTypeSizeof)), avx_cast(_mm_alignr_epi8(vHi, vLo, 1 * EntryTypeSizeof))); - } - return _mm256_setzero_pd(); - } -}; -template struct VectorRotate<32, 8, VectorType, EntryType> -{ - typedef typename SseVectorType::Type SmallV; - enum { - EntryTypeSizeof = sizeof(EntryType) - }; - static Vc_INTRINSIC VectorType rotated(VC_ALIGNED_PARAMETER(VectorType) v, int amount) - { - const m128i vLo = avx_cast(lo128(v)); - const m128i vHi = avx_cast(hi128(v)); - switch (static_cast(amount) % 8) { - case 0: return v; - case 1: return concat(avx_cast(_mm_alignr_epi8(vHi, vLo, 1 * EntryTypeSizeof)), avx_cast(_mm_alignr_epi8(vLo, vHi, 1 * EntryTypeSizeof))); - case 2: return concat(avx_cast(_mm_alignr_epi8(vHi, vLo, 2 * EntryTypeSizeof)), avx_cast(_mm_alignr_epi8(vLo, vHi, 2 * EntryTypeSizeof))); - case 3: return concat(avx_cast(_mm_alignr_epi8(vHi, vLo, 3 * EntryTypeSizeof)), avx_cast(_mm_alignr_epi8(vLo, vHi, 3 * EntryTypeSizeof))); - case 4: return Mem::permute128(v); - case 5: return concat(avx_cast(_mm_alignr_epi8(vLo, vHi, 1 * EntryTypeSizeof)), avx_cast(_mm_alignr_epi8(vHi, vLo, 1 * EntryTypeSizeof))); - case 6: return concat(avx_cast(_mm_alignr_epi8(vLo, vHi, 2 * EntryTypeSizeof)), avx_cast(_mm_alignr_epi8(vHi, vLo, 2 * EntryTypeSizeof))); - case 7: return concat(avx_cast(_mm_alignr_epi8(vLo, vHi, 3 * EntryTypeSizeof)), 
avx_cast(_mm_alignr_epi8(vHi, vLo, 3 * EntryTypeSizeof))); - } - return avx_cast(_mm256_setzero_ps()); - } -}; -template struct VectorRotate<16, 8, VectorType, EntryType> -{ - enum { - EntryTypeSizeof = sizeof(EntryType) - }; - static Vc_INTRINSIC VectorType rotated(VC_ALIGNED_PARAMETER(VectorType) v, int amount) - { - switch (static_cast(amount) % 8) { - case 0: return v; - case 1: return avx_cast(_mm_alignr_epi8(v, v, 1 * EntryTypeSizeof)); - case 2: return avx_cast(_mm_alignr_epi8(v, v, 2 * EntryTypeSizeof)); - case 3: return avx_cast(_mm_alignr_epi8(v, v, 3 * EntryTypeSizeof)); - case 4: return avx_cast(_mm_alignr_epi8(v, v, 4 * EntryTypeSizeof)); - case 5: return avx_cast(_mm_alignr_epi8(v, v, 5 * EntryTypeSizeof)); - case 6: return avx_cast(_mm_alignr_epi8(v, v, 6 * EntryTypeSizeof)); - case 7: return avx_cast(_mm_alignr_epi8(v, v, 7 * EntryTypeSizeof)); - } - return _mm_setzero_si128(); - } -}; -template Vc_INTRINSIC Vector Vector::rotated(int amount) const -{ - return VectorRotate::rotated(d.v(), amount); - /* - const m128i v0 = avx_cast(d.v()[0]); - const m128i v1 = avx_cast(d.v()[1]); - switch (static_cast(amount) % Size) { - case 0: return *this; - case 1: return concat(avx_cast(_mm_alignr_epi8(v1, v0, 1 * sizeof(EntryType))), avx_cast(_mm_alignr_epi8(v0, v1, 1 * sizeof(EntryType)))); - case 2: return concat(avx_cast(_mm_alignr_epi8(v1, v0, 2 * sizeof(EntryType))), avx_cast(_mm_alignr_epi8(v0, v1, 2 * sizeof(EntryType)))); - case 3: return concat(avx_cast(_mm_alignr_epi8(v1, v0, 3 * sizeof(EntryType))), avx_cast(_mm_alignr_epi8(v0, v1, 3 * sizeof(EntryType)))); - case 4: return concat(d.v()[1], d.v()[0]); - case 5: return concat(avx_cast(_mm_alignr_epi8(v0, v1, 1 * sizeof(EntryType))), avx_cast(_mm_alignr_epi8(v1, v0, 1 * sizeof(EntryType)))); - case 6: return concat(avx_cast(_mm_alignr_epi8(v0, v1, 2 * sizeof(EntryType))), avx_cast(_mm_alignr_epi8(v1, v0, 2 * sizeof(EntryType)))); - case 7: return concat(avx_cast(_mm_alignr_epi8(v0, v1, 3 * sizeof(EntryType))), avx_cast(_mm_alignr_epi8(v1, v0, 3 * sizeof(EntryType)))); - } - */ + constexpr VecPos Inner = static_cast(Index & 0x1); + constexpr VecPos Outer = static_cast((Index & 0x2) / 2); + return Mem::permute(Mem::permute128(d.v())); } // }}}1 -} // namespace AVX -} // namespace Vc -/*OUTER_NAMESPACE_END*/ - -#include "undomacros.h" +} // namespace Vc // vim: foldmethod=marker diff -Nru vc-0.7.4/avx/writemaskedvector.h vc-1.3.0/avx/writemaskedvector.h --- vc-0.7.4/avx/writemaskedvector.h 2014-05-15 06:38:24.000000000 -0500 +++ vc-1.3.0/avx/writemaskedvector.h 1969-12-31 18:00:00.000000000 -0600 @@ -1,82 +0,0 @@ -/* This file is part of the Vc library. - - Copyright (C) 2009-2012 Matthias Kretz - - Vc is free software: you can redistribute it and/or modify - it under the terms of the GNU Lesser General Public License as - published by the Free Software Foundation, either version 3 of - the License, or (at your option) any later version. - - Vc is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with Vc. If not, see . 
- -*/ - -#ifndef VC_AVX_WRITEMASKEDVECTOR_H -#define VC_AVX_WRITEMASKEDVECTOR_H - -#include "macros.h" -/*OUTER_NAMESPACE_BEGIN*/ -namespace Vc -{ -namespace AVX -{ - -template -class WriteMaskedVector -{ - friend class Vector; - typedef typename VectorTypeHelper::Type VectorType; - typedef typename DetermineEntryType::Type EntryType; - enum Constants { Size = sizeof(VectorType) / sizeof(EntryType) }; - typedef typename Vc::AVX::Mask Mask; - public: - FREE_STORE_OPERATORS_ALIGNED(32) - //prefix - Vector Vc_ALWAYS_INLINE_L &operator++() Vc_ALWAYS_INLINE_R; - Vector Vc_ALWAYS_INLINE_L &operator--() Vc_ALWAYS_INLINE_R; - //postfix - Vector Vc_ALWAYS_INLINE_L operator++(int) Vc_ALWAYS_INLINE_R; - Vector Vc_ALWAYS_INLINE_L operator--(int) Vc_ALWAYS_INLINE_R; - - Vector Vc_ALWAYS_INLINE_L &operator+=(const Vector &x) Vc_ALWAYS_INLINE_R; - Vector Vc_ALWAYS_INLINE_L &operator-=(const Vector &x) Vc_ALWAYS_INLINE_R; - Vector Vc_ALWAYS_INLINE_L &operator*=(const Vector &x) Vc_ALWAYS_INLINE_R; - Vector Vc_ALWAYS_INLINE_L &operator/=(const Vector &x) Vc_ALWAYS_INLINE_R; - Vector Vc_ALWAYS_INLINE &operator+=(EntryType x) { return operator+=(Vector(x)); } - Vector Vc_ALWAYS_INLINE &operator-=(EntryType x) { return operator-=(Vector(x)); } - Vector Vc_ALWAYS_INLINE &operator*=(EntryType x) { return operator*=(Vector(x)); } - Vector Vc_ALWAYS_INLINE &operator/=(EntryType x) { return operator/=(Vector(x)); } - - Vector Vc_ALWAYS_INLINE_L &operator=(const Vector &x) Vc_ALWAYS_INLINE_R; - Vector Vc_ALWAYS_INLINE &operator=(EntryType x) { return operator=(Vector(x)); } - - template Vc_INTRINSIC void call(const F &f) const { - return vec->call(f, mask); - } - template Vc_INTRINSIC void call(F &f) const { - return vec->call(f, mask); - } - template Vc_INTRINSIC Vector apply(const F &f) const { - return vec->apply(f, mask); - } - template Vc_INTRINSIC Vector apply(F &f) const { - return vec->apply(f, mask); - } - private: - Vc_ALWAYS_INLINE WriteMaskedVector(Vector *v, const Mask &k) : vec(v), mask(k) {} - Vector *const vec; - Mask mask; -}; - -} // namespace AVX -} // namespace Vc -/*OUTER_NAMESPACE_END*/ -#include "writemaskedvector.tcc" -#include "undomacros.h" -#endif // VC_AVX_WRITEMASKEDVECTOR_H diff -Nru vc-0.7.4/avx/writemaskedvector.tcc vc-1.3.0/avx/writemaskedvector.tcc --- vc-0.7.4/avx/writemaskedvector.tcc 2014-05-15 06:38:24.000000000 -0500 +++ vc-1.3.0/avx/writemaskedvector.tcc 1969-12-31 18:00:00.000000000 -0600 @@ -1,93 +0,0 @@ -/* This file is part of the Vc library. - - Copyright (C) 2010-2011 Matthias Kretz - - Vc is free software: you can redistribute it and/or modify - it under the terms of the GNU Lesser General Public License as - published by the Free Software Foundation, either version 3 of - the License, or (at your option) any later version. - - Vc is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with Vc. If not, see . 
- -*/ - -/*OUTER_NAMESPACE_BEGIN*/ -namespace Vc -{ -namespace AVX -{ - -template -Vc_ALWAYS_INLINE Vector &WriteMaskedVector::operator++() -{ - vec->data() = VectorHelper::add(vec->data(), - VectorHelper::notMaskedToZero(VectorHelper::one(), mask.data()) - ); - return *vec; -} - -template -Vc_ALWAYS_INLINE Vector &WriteMaskedVector::operator--() { - vec->data() = VectorHelper::sub(vec->data(), - VectorHelper::notMaskedToZero(VectorHelper::one(), mask.data()) - ); - return *vec; -} - -template -Vc_ALWAYS_INLINE Vector WriteMaskedVector::operator++(int) { - Vector ret(*vec); - vec->data() = VectorHelper::add(vec->data(), - VectorHelper::notMaskedToZero(VectorHelper::one(), mask.data()) - ); - return ret; -} - -template -Vc_ALWAYS_INLINE Vector WriteMaskedVector::operator--(int) { - Vector ret(*vec); - vec->data() = VectorHelper::sub(vec->data(), - VectorHelper::notMaskedToZero(VectorHelper::one(), mask.data()) - ); - return ret; -} - -template -Vc_ALWAYS_INLINE Vector &WriteMaskedVector::operator+=(const Vector &x) { - vec->data() = VectorHelper::add(vec->data(), VectorHelper::notMaskedToZero(x.data(), mask.data())); - return *vec; -} - -template -Vc_ALWAYS_INLINE Vector &WriteMaskedVector::operator-=(const Vector &x) { - vec->data() = VectorHelper::sub(vec->data(), VectorHelper::notMaskedToZero(x.data(), mask.data())); - return *vec; -} - -template -Vc_ALWAYS_INLINE Vector &WriteMaskedVector::operator*=(const Vector &x) { - vec->assign(VectorHelper::mul(vec->data(), x.data()), mask); - return *vec; -} - -template -Vc_ALWAYS_INLINE Vector &WriteMaskedVector::operator/=(const Vector &x) { - vec->assign(*vec / x, mask); - return *vec; -} - -template -Vc_ALWAYS_INLINE Vector &WriteMaskedVector::operator=(const Vector &x) { - vec->assign(x, mask); - return *vec; -} - -} // namespace AVX -} // namespace Vc -/*OUTER_NAMESPACE_END*/ diff -Nru vc-0.7.4/.clang-format vc-1.3.0/.clang-format --- vc-0.7.4/.clang-format 2014-05-15 06:38:24.000000000 -0500 +++ vc-1.3.0/.clang-format 2016-10-27 02:05:02.000000000 -0500 @@ -18,6 +18,8 @@ # If true, while (true) continue; can be put on a single line. AllowShortLoopsOnASingleLine: false +AllowShortFunctionsOnASingleLine: true + # If true, always break before multiline string literals. AlwaysBreakBeforeMultilineStrings: false @@ -25,7 +27,7 @@ AlwaysBreakTemplateDeclarations: false # If false, a function call’s or function definition’s parameters will either all be on the same line or will have one line each. -BinPackParameters: false +BinPackParameters: true # If true, binary operators will be placed after line breaks. BreakBeforeBinaryOperators: false @@ -43,7 +45,7 @@ # The column limit. # A column limit of 0 means that there is no column limit. In this case, clang-format will respect the input’s line breaking decisions within statements. -ColumnLimit: 100 +ColumnLimit: 90 # If the constructor initializers don’t fit on a line, put each initializer on its own line. 
#ConstructorInitializerAllOnOneLineOrOnePerLine (bool) diff -Nru vc-0.7.4/cmake/AddCompilerFlag.cmake vc-1.3.0/cmake/AddCompilerFlag.cmake --- vc-0.7.4/cmake/AddCompilerFlag.cmake 2014-05-15 06:38:24.000000000 -0500 +++ vc-1.3.0/cmake/AddCompilerFlag.cmake 2016-10-27 02:05:02.000000000 -0500 @@ -5,7 +5,7 @@ # [CXX_RESULT ]) #============================================================================= -# Copyright 2010-2013 Matthias Kretz +# Copyright 2010-2015 Matthias Kretz # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions are @@ -18,10 +18,9 @@ # this list of conditions and the following disclaimer in the documentation # and/or other materials provided with the distribution. # -# * The names of Kitware, Inc., the Insight Consortium, or the names of -# any consortium members, or of any contributors, may not be used to -# endorse or promote products derived from this software without -# specific prior written permission. +# * Neither the names of contributing organizations nor the +# names of its contributors may be used to endorse or promote products +# derived from this software without specific prior written permission. # # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER AND CONTRIBUTORS ``AS IS'' # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE @@ -38,14 +37,20 @@ get_filename_component(_currentDir "${CMAKE_CURRENT_LIST_FILE}" PATH) include("${_currentDir}/CheckCCompilerFlag.cmake") include("${_currentDir}/CheckCXXCompilerFlag.cmake") +include("${_currentDir}/CheckMicCCompilerFlag.cmake") +include("${_currentDir}/CheckMicCXXCompilerFlag.cmake") macro(AddCompilerFlag _flag) string(REGEX REPLACE "[-.+/:= ]" "_" _flag_esc "${_flag}") set(_c_flags "CMAKE_C_FLAGS") set(_cxx_flags "CMAKE_CXX_FLAGS") + set(_mic_c_flags "CMAKE_MIC_C_FLAGS") + set(_mic_cxx_flags "CMAKE_MIC_CXX_FLAGS") set(_c_result tmp) set(_cxx_result tmp) + set(_mic_c_result) + set(_mic_cxx_result) if(${ARGC} EQUAL 2) message(WARNING "Deprecated use of the AddCompilerFlag macro.") unset(_c_result) @@ -54,23 +59,41 @@ set(state 0) unset(_c_flags) unset(_cxx_flags) + unset(_mic_c_flags) + unset(_mic_cxx_flags) unset(_c_result) unset(_cxx_result) + unset(_mic_c_result) + unset(_mic_cxx_result) foreach(_arg ${ARGN}) - if(_arg STREQUAL "C_FLAGS") + if("x${_arg}" STREQUAL "xC_FLAGS") set(state 1) if(NOT DEFINED _c_result) - set(_c_result tmp) + set(_c_result tmp0) endif() - elseif(_arg STREQUAL "CXX_FLAGS") + elseif("x${_arg}" STREQUAL "xCXX_FLAGS") set(state 2) if(NOT DEFINED _cxx_result) - set(_cxx_result tmp) + set(_cxx_result tmp1) endif() - elseif(_arg STREQUAL "C_RESULT") + elseif("x${_arg}" STREQUAL "xC_RESULT") set(state 3) - elseif(_arg STREQUAL "CXX_RESULT") + elseif("x${_arg}" STREQUAL "xCXX_RESULT") set(state 4) + elseif("x${_arg}" STREQUAL "xMIC_C_RESULT") + set(state 5) + elseif("x${_arg}" STREQUAL "xMIC_CXX_RESULT") + set(state 6) + elseif("x${_arg}" STREQUAL "xMIC_C_FLAGS") + if(NOT DEFINED _mic_c_result) + set(_mic_c_result tmp2) + endif() + set(state 7) + elseif("x${_arg}" STREQUAL "xMIC_CXX_FLAGS") + if(NOT DEFINED _mic_cxx_result) + set(_mic_cxx_result tmp3) + endif() + set(state 8) elseif(state EQUAL 1) set(_c_flags "${_arg}") elseif(state EQUAL 2) @@ -79,40 +102,79 @@ set(_c_result "${_arg}") elseif(state EQUAL 4) set(_cxx_result "${_arg}") + elseif(state EQUAL 5) + set(_mic_c_result "${_arg}") + elseif(state EQUAL 6) + set(_mic_cxx_result "${_arg}") + elseif(state EQUAL 7) + set(_mic_c_flags 
"${_arg}") + elseif(state EQUAL 8) + set(_mic_cxx_flags "${_arg}") else() message(FATAL_ERROR "Syntax error for AddCompilerFlag") endif() endforeach() endif() + set(_c_code "int main() { return 0; }") + set(_cxx_code "int main() { return 0; }") if("${_flag}" STREQUAL "-mfma") # Compiling with FMA3 support may fail only at the assembler level. # In that case we need to have such an instruction in the test code - set(_code "#include + set(_c_code "#include __m128 foo(__m128 x) { return _mm_fmadd_ps(x, x, x); } int main() { return 0; }") + set(_cxx_code "${_c_code}") elseif("${_flag}" STREQUAL "-stdlib=libc++") # Compiling with libc++ not only requires a compiler that understands it, but also # the libc++ headers itself - set(_code "#include + set(_cxx_code "#include + #include int main() { return 0; }") else() - set(_code "int main() { return 0; }") + set(_cxx_code "#include + int main() { return 0; }") endif() if(DEFINED _c_result) - check_c_compiler_flag("${_flag}" check_c_compiler_flag_${_flag_esc} "${_code}") + check_c_compiler_flag("${_flag}" check_c_compiler_flag_${_flag_esc} "${_c_code}") set(${_c_result} ${check_c_compiler_flag_${_flag_esc}}) endif() if(DEFINED _cxx_result) - check_cxx_compiler_flag("${_flag}" check_cxx_compiler_flag_${_flag_esc} "${_code}") + check_cxx_compiler_flag("${_flag}" check_cxx_compiler_flag_${_flag_esc} "${_cxx_code}") set(${_cxx_result} ${check_cxx_compiler_flag_${_flag_esc}}) endif() + macro(my_append _list _flag _special) + if("x${_list}" STREQUAL "x${_special}") + set(${_list} "${${_list}} ${_flag}") + else() + list(APPEND ${_list} "${_flag}") + endif() + endmacro() + if(check_c_compiler_flag_${_flag_esc} AND DEFINED _c_flags) - set(${_c_flags} "${${_c_flags}} ${_flag}") + my_append(${_c_flags} "${_flag}" CMAKE_C_FLAGS) endif() if(check_cxx_compiler_flag_${_flag_esc} AND DEFINED _cxx_flags) - set(${_cxx_flags} "${${_cxx_flags}} ${_flag}") + my_append(${_cxx_flags} "${_flag}" CMAKE_CXX_FLAGS) + endif() + + if(MIC_NATIVE_FOUND) + if(DEFINED _mic_c_result) + check_mic_c_compiler_flag("${_flag}" check_mic_c_compiler_flag_${_flag_esc} "${_c_code}") + set(${_mic_c_result} ${check_mic_c_compiler_flag_${_flag_esc}}) + endif() + if(DEFINED _mic_cxx_result) + check_mic_cxx_compiler_flag("${_flag}" check_mic_cxx_compiler_flag_${_flag_esc} "${_cxx_code}") + set(${_mic_cxx_result} ${check_mic_cxx_compiler_flag_${_flag_esc}}) + endif() + + if(check_mic_c_compiler_flag_${_flag_esc} AND DEFINED _mic_c_flags) + my_append(${_mic_c_flags} "${_flag}" CMAKE_MIC_C_FLAGS) + endif() + if(check_mic_cxx_compiler_flag_${_flag_esc} AND DEFINED _mic_cxx_flags) + my_append(${_mic_cxx_flags} "${_flag}" CMAKE_MIC_CXX_FLAGS) + endif() endif() endmacro(AddCompilerFlag) diff -Nru vc-0.7.4/cmake/AddTargetProperty.cmake vc-1.3.0/cmake/AddTargetProperty.cmake --- vc-0.7.4/cmake/AddTargetProperty.cmake 2014-05-15 06:38:24.000000000 -0500 +++ vc-1.3.0/cmake/AddTargetProperty.cmake 2016-10-27 02:05:02.000000000 -0500 @@ -1,5 +1,5 @@ #============================================================================= -# Copyright 2010-2013 Matthias Kretz +# Copyright 2010-2015 Matthias Kretz # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions are @@ -7,15 +7,12 @@ # # * Redistributions of source code must retain the above copyright notice, # this list of conditions and the following disclaimer. 
-# # * Redistributions in binary form must reproduce the above copyright notice, # this list of conditions and the following disclaimer in the documentation # and/or other materials provided with the distribution. -# -# * The names of Kitware, Inc., the Insight Consortium, or the names of -# any consortium members, or of any contributors, may not be used to -# endorse or promote products derived from this software without -# specific prior written permission. +# * Neither the names of contributing organizations nor the +# names of its contributors may be used to endorse or promote products +# derived from this software without specific prior written permission. # # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER AND CONTRIBUTORS ``AS IS'' # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE diff -Nru vc-0.7.4/cmake/CheckCCompilerFlag.cmake vc-1.3.0/cmake/CheckCCompilerFlag.cmake --- vc-0.7.4/cmake/CheckCCompilerFlag.cmake 2014-05-15 06:38:24.000000000 -0500 +++ vc-1.3.0/cmake/CheckCCompilerFlag.cmake 2016-10-27 02:05:02.000000000 -0500 @@ -57,10 +57,14 @@ FAIL_REGEX "unrecognized .*option" # GNU FAIL_REGEX "ignored for target" # GNU FAIL_REGEX "ignoring unknown option" # MSVC + FAIL_REGEX "warning D9002" # MSVC FAIL_REGEX "[Uu]nknown option" # HP FAIL_REGEX "[Ww]arning: [Oo]ption" # SunPro FAIL_REGEX "command option .* is not recognized" # XL FAIL_REGEX "WARNING: unknown flag:" # Open64 + FAIL_REGEX "command line error" # ICC + FAIL_REGEX "command line warning" # ICC + FAIL_REGEX "#10236:" # ICC: File not found FAIL_REGEX " #10159: " # ICC FAIL_REGEX " #10353: " # ICC: option '-mfma' ignored, suggest using '-march=core-avx2' ) diff -Nru vc-0.7.4/cmake/CheckCXXCompilerFlag.cmake vc-1.3.0/cmake/CheckCXXCompilerFlag.cmake --- vc-0.7.4/cmake/CheckCXXCompilerFlag.cmake 2014-05-15 06:38:24.000000000 -0500 +++ vc-1.3.0/cmake/CheckCXXCompilerFlag.cmake 2016-10-27 02:05:02.000000000 -0500 @@ -57,10 +57,14 @@ FAIL_REGEX "unrecognized .*option" # GNU FAIL_REGEX "ignored for target" # GNU FAIL_REGEX "ignoring unknown option" # MSVC + FAIL_REGEX "warning D9002" # MSVC FAIL_REGEX "[Uu]nknown option" # HP FAIL_REGEX "[Ww]arning: [Oo]ption" # SunPro FAIL_REGEX "command option .* is not recognized" # XL FAIL_REGEX "WARNING: unknown flag:" # Open64 + FAIL_REGEX "command line error" # ICC + FAIL_REGEX "command line warning" # ICC + FAIL_REGEX "#10236:" # ICC: File not found FAIL_REGEX " #10159: " # ICC FAIL_REGEX " #10353: " # ICC: option '-mfma' ignored, suggest using '-march=core-avx2' ) diff -Nru vc-0.7.4/cmake/CheckMicCCompilerFlag.cmake vc-1.3.0/cmake/CheckMicCCompilerFlag.cmake --- vc-0.7.4/cmake/CheckMicCCompilerFlag.cmake 1969-12-31 18:00:00.000000000 -0600 +++ vc-1.3.0/cmake/CheckMicCCompilerFlag.cmake 2016-10-27 02:05:02.000000000 -0500 @@ -0,0 +1,101 @@ +# - Check whether the MIC C compiler supports a given flag. +# CHECK_MIC_C_COMPILER_FLAG( ) +# - the compiler flag +# - variable to store the result +# This internally calls the check_c_source_compiles macro. See help +# for CheckCSourceCompiles for a listing of variables that can +# modify the build. + +#============================================================================= +# Copyright 2006-2009 Kitware, Inc. 
+# Copyright 2006 Alexander Neundorf +# Copyright 2011-2013 Matthias Kretz +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: +# +# * Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# * The names of Kitware, Inc., the Insight Consortium, or the names of +# any consortium members, or of any contributors, may not be used to +# endorse or promote products derived from this software without +# specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER AND CONTRIBUTORS ``AS IS'' +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE FOR +# ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +#============================================================================= + +macro(check_mic_c_compiler_flag _FLAG _RESULT) + if(NOT DEFINED "${_RESULT}") + set(_tmpdir "${CMAKE_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/CMakeTmp") + if(${ARGC} GREATER 2) + file(WRITE "${_tmpdir}/src.c" "${ARGV2}") + else() + file(WRITE "${_tmpdir}/src.c" "int main() { return 0; }") + endif() + + execute_process( + COMMAND "${MIC_CC}" -mmic -c -o "${_tmpdir}/src.o" + "${_FLAG}" "${_tmpdir}/src.c" + WORKING_DIRECTORY ${CMAKE_BINARY_DIR} + RESULT_VARIABLE ${_RESULT} + OUTPUT_VARIABLE OUTPUT + ERROR_VARIABLE OUTPUT + ) + + if(${_RESULT} EQUAL 0) + foreach(_fail_regex + "error: bad value (.*) for .* switch" # GNU + "argument unused during compilation" # clang + "is valid for .* but not for C" # GNU + "unrecognized .*option" # GNU + "ignored for target" # GNU + "ignoring unknown option" # MSVC + "[Uu]nknown option" # HP + "[Ww]arning: [Oo]ption" # SunPro + "command option .* is not recognized" # XL + "WARNING: unknown flag:" # Open64 + "command line error" # ICC + "command line warning" # ICC + "#10236:" # ICC: File not found + ) + if("${OUTPUT}" MATCHES "${_fail_regex}") + set(${_RESULT} 1) + endif() + endforeach() + endif() + + if(${_RESULT} EQUAL 0) + set(${_RESULT} 1 CACHE INTERNAL "Test ${_FLAG}") + message(STATUS "Performing Test Check MIC C Compiler flag ${_FLAG} - Success") + file(APPEND ${CMAKE_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/CMakeOutput.log + "Performing MIC C Compiler Flag Test ${_FLAG} succeded with the following output:\n" + "${OUTPUT}\n" + "COMMAND: ${MIC_CC} -mmic -c -o ${_tmpdir}/src.o ${_FLAG} ${_tmpdir}/src.cpp\n" + ) + else() + message(STATUS "Performing Test Check MIC C Compiler flag ${_FLAG} - Failed") + set(${_RESULT} "" CACHE INTERNAL "Test ${_FLAG}") + file(APPEND ${CMAKE_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/CMakeError.log + "Performing MIC C Compiler Flag Test ${_FLAG} failed with the following 
output:\n" + "${OUTPUT}\n" + "COMMAND: ${MIC_CC} -mmic -c -o ${_tmpdir}/src.o ${_FLAG} ${_tmpdir}/src.cpp\n" + ) + endif() + endif() +endmacro() + diff -Nru vc-0.7.4/cmake/CheckMicCXXCompilerFlag.cmake vc-1.3.0/cmake/CheckMicCXXCompilerFlag.cmake --- vc-0.7.4/cmake/CheckMicCXXCompilerFlag.cmake 1969-12-31 18:00:00.000000000 -0600 +++ vc-1.3.0/cmake/CheckMicCXXCompilerFlag.cmake 2016-10-27 02:05:02.000000000 -0500 @@ -0,0 +1,101 @@ +# - Check whether the MIC CXX compiler supports a given flag. +# CHECK_MIC_CXX_COMPILER_FLAG( ) +# - the compiler flag +# - variable to store the result +# This internally calls the check_cxx_source_compiles macro. See help +# for CheckCXXSourceCompiles for a listing of variables that can +# modify the build. + +#============================================================================= +# Copyright 2006-2009 Kitware, Inc. +# Copyright 2006 Alexander Neundorf +# Copyright 2011-2013 Matthias Kretz +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: +# +# * Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# * The names of Kitware, Inc., the Insight Consortium, or the names of +# any consortium members, or of any contributors, may not be used to +# endorse or promote products derived from this software without +# specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER AND CONTRIBUTORS ``AS IS'' +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE FOR +# ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
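For context, a minimal usage sketch for these new MIC flag checks; it is an illustration rather than part of either release. It assumes the FindMIC.cmake module introduced further below has located the MIC compilers (MIC_CC/MIC_CXX) and set MIC_NATIVE_FOUND, that Vc's cmake/ directory is on CMAKE_MODULE_PATH, and that the flag and result-variable names (host_has_cxx11, mic_has_cxx11) are invented for the example.

  include(CheckMicCXXCompilerFlag)

  # Direct use: test one flag against the MIC C++ compiler only.
  check_mic_cxx_compiler_flag("-std=c++11" mic_has_cxx11)
  if(mic_has_cxx11)
    list(APPEND CMAKE_MIC_CXX_FLAGS "-std=c++11")
  endif()

  # Equivalent via the extended AddCompilerFlag wrapper, which tests the host
  # and MIC compilers in one call and appends the flag to the named variables
  # only where it is accepted.
  AddCompilerFlag("-std=c++11"
                  CXX_FLAGS     CMAKE_CXX_FLAGS     CXX_RESULT     host_has_cxx11
                  MIC_CXX_FLAGS CMAKE_MIC_CXX_FLAGS MIC_CXX_RESULT mic_has_cxx11)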
+#============================================================================= + +macro(check_mic_cxx_compiler_flag _FLAG _RESULT) + if(NOT DEFINED "${_RESULT}") + set(_tmpdir "${CMAKE_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/CMakeTmp") + if(${ARGC} GREATER 2) + file(WRITE "${_tmpdir}/src.cpp" "${ARGV2}") + else() + file(WRITE "${_tmpdir}/src.cpp" "int main() { return 0; }") + endif() + + execute_process( + COMMAND "${MIC_CXX}" -mmic -c -o "${_tmpdir}/src.o" + "${_FLAG}" "${_tmpdir}/src.cpp" + WORKING_DIRECTORY ${CMAKE_BINARY_DIR} + RESULT_VARIABLE ${_RESULT} + OUTPUT_VARIABLE OUTPUT + ERROR_VARIABLE OUTPUT + ) + + if(${_RESULT} EQUAL 0) + foreach(_fail_regex + "error: bad value (.*) for .* switch" # GNU + "argument unused during compilation" # clang + "is valid for .* but not for C\\\\+\\\\+" # GNU + "unrecognized .*option" # GNU + "ignored for target" # GNU + "ignoring unknown option" # MSVC + "[Uu]nknown option" # HP + "[Ww]arning: [Oo]ption" # SunPro + "command option .* is not recognized" # XL + "WARNING: unknown flag:" # Open64 + "command line error" # ICC + "command line warning" # ICC + "#10236:" # ICC: File not found + ) + if("${OUTPUT}" MATCHES "${_fail_regex}") + set(${_RESULT} 1) + endif() + endforeach() + endif() + + if(${_RESULT} EQUAL 0) + set(${_RESULT} 1 CACHE INTERNAL "Test ${_FLAG}") + message(STATUS "Performing Test Check MIC C++ Compiler flag ${_FLAG} - Success") + file(APPEND ${CMAKE_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/CMakeOutput.log + "Performing MIC C++ Compiler Flag Test ${_FLAG} succeded with the following output:\n" + "${OUTPUT}\n" + "COMMAND: ${MIC_CXX} -mmic -c -o ${_tmpdir}/src.o ${_FLAG} ${_tmpdir}/src.cpp\n" + ) + else() + message(STATUS "Performing Test Check MIC C++ Compiler flag ${_FLAG} - Failed") + set(${_RESULT} "" CACHE INTERNAL "Test ${_FLAG}") + file(APPEND ${CMAKE_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/CMakeError.log + "Performing MIC C++ Compiler Flag Test ${_FLAG} failed with the following output:\n" + "${OUTPUT}\n" + "COMMAND: ${MIC_CXX} -mmic -c -o ${_tmpdir}/src.o ${_FLAG} ${_tmpdir}/src.cpp\n" + ) + endif() + endif() +endmacro() + diff -Nru vc-0.7.4/cmake/FindMIC.cmake vc-1.3.0/cmake/FindMIC.cmake --- vc-0.7.4/cmake/FindMIC.cmake 1969-12-31 18:00:00.000000000 -0600 +++ vc-1.3.0/cmake/FindMIC.cmake 2016-10-27 02:05:02.000000000 -0500 @@ -0,0 +1,449 @@ +#============================================================================= +# Copyright © 2010-2015 Matthias Kretz +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the names of contributing organizations nor the +# names of its contributors may be used to endorse or promote products +# derived from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY +# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +#============================================================================= +# +# This check will search for a MIC compiler and check whether the C and C++ +# compilers are able to offload via offload pragma and target(mic) attribute. +# The project may choose to either build native MIC binaries, or offload +# binaries (hybrid code), or both. In the case where only native MIC binaries +# are built, the compiler does not need to support offloading +# +# MIC_NATIVE_FOUND is true if native MIC binaries can be built +# MIC_OFFLOAD_FOUND is true if hybrid host/MIC binaries via offload can be built +# MIC_FOUND is true if either MIC_NATIVE_FOUND or MIC_OFFLOAD_FOUND is true +# +# When MIC_NATIVE_FOUND is true you can use the macros +# mic_add_definitions +# mic_include_directories +# mic_set_link_libraries +# mic_add_library +# mic_add_executable +# for building native libraries/executables +# +# When MIC_OFFLOAD_FOUND is true you use the standard cmake macros to build +# libraries and executables but have to make sure manually that the necessary +# offload compiler switches are present. You might want to add something like: +# if(MIC_OFFLOAD_FOUND) +# AddCompilerFlag("-offload-build") +# AddCompilerFlag("-offload-copts=-vec-report=3 -H") +# AddCompilerFlag("-offload-ldopts=-lmylib") +# AddCompilerFlag("-opt-report-phase=offload") +# endif() + +set(MIC_FOUND false) +set(MIC_NATIVE_FOUND false) +set(MIC_OFFLOAD_FOUND false) + +option(ENABLE_MIC "Enable native builds for the MIC architecture (Intel Knights Corner)" ON) +if(ENABLE_MIC) + file(GLOB _intel_dirs "/opt/intel/compilers_and_libraries_*/linux") + if ("${_intel_dirs}" STREQUAL "") + file(GLOB _intel_dirs "/opt/intel/composer_xe_*") + endif() + + list(SORT _intel_dirs) + list(REVERSE _intel_dirs) + find_path(MIC_SDK_DIR bin/intel64_mic/icpc PATHS + "$ENV{MIC_SDK_DIR}" + ${_intel_dirs} + ) + mark_as_advanced(MIC_SDK_DIR) + + ############################################################################## + # First check whether offload works + + # For now offload is not supported so skip it +# if(NOT DEFINED c_compiler_can_offload OR NOT DEFINED cxx_compiler_can_offload) +# set(c_compiler_can_offload FALSE) +# set(cxx_compiler_can_offload FALSE) +# +# include(CheckCSourceCompiles) +# include(CheckCXXSourceCompiles) +# +# #find_library(MIC_HOST_IMF_LIBRARY imf HINTS ENV LIBRARY_PATH) +# #find_library(MIC_HOST_SVML_LIBRARY svml HINTS ENV LIBRARY_PATH) +# #find_library(MIC_HOST_INTLC_LIBRARY intlc HINTS ENV LIBRARY_PATH) +# +# #set(MIC_HOST_LIBS ${MIC_HOST_IMF_LIBRARY} ${MIC_HOST_SVML_LIBRARY} ${MIC_HOST_INTLC_LIBRARY}) +# +# set(_mic_offload_test_source " +##ifdef __MIC__ +##include +##endif +#__attribute__((target(mic))) void test() +#{ +##ifdef __MIC__ +# __m512 v = _mm512_setzero_ps(); +# (void)v; +##endif +#} +# +#int main() +#{ +##pragma offload target(mic) +# test(); +# return 0; +#} +#") +# set(CMAKE_REQUIRED_FLAGS "-offload-build") +# check_c_source_compiles("${_mic_offload_test_source}" c_compiler_can_offload) +# 
check_cxx_source_compiles("${_mic_offload_test_source}" cxx_compiler_can_offload) +# set(CMAKE_REQUIRED_FLAGS) +# endif() +# +# if(c_compiler_can_offload AND cxx_compiler_can_offload) +# message(STATUS "C/C++ Compiler can offload to MIC.") +# set(MIC_OFFLOAD_FOUND true) +# else() +# message(STATUS "C/C++ Compiler can NOT offload to MIC.") +# endif() + + ############################################################################## + # Next check whether everything required for native builds is available + + find_path(MIC_TARGET_TOOLS_DIR bin/x86_64-k1om-linux-ar HINTS + "$ENV{MIC_TARGET_TOOLS_DIR}" + "${MIC_SDK_DIR}/target" + "/usr/linux-k1om-4.7" + ) + find_program(MIC_AR x86_64-k1om-linux-ar PATHS "${MIC_TARGET_TOOLS_DIR}/bin") + find_program(MIC_RANLIB x86_64-k1om-linux-ranlib PATHS "${MIC_TARGET_TOOLS_DIR}/bin") + find_program(MIC_OBJCOPY x86_64-k1om-linux-objcopy PATHS "${MIC_TARGET_TOOLS_DIR}/bin") + find_program(MIC_NATIVELOAD micnativeloadex PATHS ENV PATH) + mark_as_advanced(MIC_TARGET_TOOLS_DIR MIC_AR MIC_RANLIB MIC_NATIVELOAD MIC_OBJCOPY) + + if(MIC_SDK_DIR AND MIC_AR AND MIC_RANLIB) + find_program(MIC_CC icc HINTS "${MIC_SDK_DIR}/bin" "${MIC_SDK_DIR}/bin/intel64") + find_program(MIC_CXX icpc HINTS "${MIC_SDK_DIR}/bin" "${MIC_SDK_DIR}/bin/intel64") + + find_library(MIC_IMF_LIBRARY imf HINTS "${MIC_SDK_DIR}/compiler/lib/mic") + find_library(MIC_SVML_LIBRARY svml HINTS "${MIC_SDK_DIR}/compiler/lib/mic") + find_library(MIC_INTLC_LIBRARY intlc HINTS "${MIC_SDK_DIR}/compiler/lib/mic") + mark_as_advanced(MIC_CC MIC_CXX MIC_IMF_LIBRARY MIC_SVML_LIBRARY MIC_INTLC_LIBRARY) + + set(MIC_LIBS ${MIC_IMF_LIBRARY} ${MIC_SVML_LIBRARY} ${MIC_INTLC_LIBRARY}) + set(MIC_CFLAGS "-O2 -vec") + + exec_program(${MIC_CXX} ARGS -V OUTPUT_VARIABLE _mic_icc_version_string RETURN_VALUE _mic_icc_ok) + if(0 EQUAL _mic_icc_ok) + string(REGEX MATCH "Version (Mainline)?[0-9. 
a-zA-Z]+" Vc_MIC_ICC_VERSION "${_mic_icc_version_string}") + string(SUBSTRING "${Vc_MIC_ICC_VERSION}" 8 -1 Vc_MIC_ICC_VERSION) + message(STATUS "MIC ICC Version: \"${Vc_MIC_ICC_VERSION}\"") + + if(MIC_CC AND MIC_CXX AND MIC_IMF_LIBRARY AND MIC_SVML_LIBRARY AND MIC_INTLC_LIBRARY) + set(MIC_NATIVE_FOUND true) + endif() + else() + message(STATUS "MIC ICC found, but not usable.") + endif() + endif() +endif(ENABLE_MIC) + +if(MIC_NATIVE_FOUND OR MIC_OFFLOAD_FOUND) + set(MIC_FOUND true) + list(APPEND CMAKE_MIC_CXX_FLAGS "-diag-disable 2338") # this switch statement does not have a default clause + list(APPEND CMAKE_MIC_CXX_FLAGS "-diag-disable 193") # zero used for undefined preprocessing identifier "Vc_GCC" + list(APPEND CMAKE_MIC_CXX_FLAGS "-diag-disable 61") # warning #61: integer operation result is out of range + list(APPEND CMAKE_MIC_CXX_FLAGS "-diag-disable 173") # warning #173: floating-point value does not fit in required integral type + list(APPEND CMAKE_MIC_CXX_FLAGS "-diag-disable 264") # warning #264: floating-point value does not fit in required floating-point type + + list(APPEND CMAKE_MIC_CXX_FLAGS "-fp-model source") # fix IEEE FP comliance + + set(Vc_MIC_CXX_FLAGS "") + + macro(mic_add_definitions) + add_definitions(${ARGN}) + foreach(_def ${ARGN}) + set(_mic_cflags ${_mic_cflags} "${_def}") + endforeach() + endmacro() + macro(mic_include_directories) + foreach(_dir ${ARGN}) + set(_mic_cflags ${_mic_cflags} "-I${_dir}") + endforeach() + include_directories(${ARGN}) + endmacro() + if(NOT DEFINED MIC_C_FLAGS) + set(MIC_C_FLAGS) + endif() + if(NOT DEFINED MIC_CXX_FLAGS) + set(MIC_CXX_FLAGS) + endif() +else() + message(STATUS "MIC SDK was not found!") +endif() + +if(MIC_NATIVE_FOUND) + macro(_mic_add_object _target _source _output) + get_property(_deps SOURCE "${_source}" PROPERTY OBJECT_DEPENDS) + get_filename_component(_abs "${_source}" ABSOLUTE) + get_filename_component(_ext "${_source}" EXT) + get_filename_component(_tmp "${_source}" NAME_WE) + set(${_output} "${CMAKE_CURRENT_BINARY_DIR}/CMakeFiles/${_target}.dir/${_tmp}${_ext}.mic.o") + set(_lang CXX) + set(_compiler "${MIC_CXX}") + if(_ext STREQUAL "c") + set(_lang C) + set(_compiler "${MIC_CC}") + endif() + + string(TOUPPER "${CMAKE_BUILD_TYPE}" _tmp) + string(STRIP "${CMAKE_MIC_${_lang}_FLAGS} ${CMAKE_${_lang}_FLAGS_${_tmp}} ${_mic_cflags} ${Vc_MIC_CXX_FLAGS}" _flags) + string(REPLACE " " ";" _flags "${_flags} ${ARGN}") + get_directory_property(_inc INCLUDE_DIRECTORIES) + foreach(_i ${_inc}) + list(APPEND _flags "-I${_i}") + endforeach() + + get_property(_launch_rule GLOBAL PROPERTY RULE_LAUNCH_COMPILE) + string(REPLACE "\"" "" _launch_rule "${_launch_rule}") + string(REPLACE " " ";" _launch_rule "${_launch_rule}") + string(REPLACE "" "${_target}" _launch_rule "${_launch_rule}") + string(REPLACE "" "${CMAKE_CURRENT_BINARY_DIR}" _launch_rule "${_launch_rule}") + string(REPLACE "" "${${_output}}" _launch_rule "${_launch_rule}") + string(REPLACE "" "${_abs}" _launch_rule "${_launch_rule}") + string(REPLACE "" "C++" _launch_rule "${_launch_rule}") + add_custom_command(OUTPUT "${${_output}}" + COMMAND ${_launch_rule} "${_compiler}" -mmic + -DVc_IMPL=MIC + ${_flags} -c -o "${${_output}}" "${_abs}" + DEPENDS "${_abs}" ${_deps} + IMPLICIT_DEPENDS ${_lang} "${_abs}" + WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR} + COMMENT "Compiling (MIC) ${${_output}}" + VERBATIM + ) + endmacro() + macro(mic_set_link_libraries) + set(_mic_lflags) + foreach(_lib ${ARGN}) + get_filename_component(_lpath "${_lib}" PATH) + get_filename_component(_lname 
"${_lib}" NAME) + set(_mic_lflags ${_mic_lflags} "-L${_lpath}" "-l${_lname}") + endforeach() + endmacro() + macro(mic_add_library _target) + set(_state 0) + if(BUILD_SHARED_LIBS) + set(_type SHARED) + else() + set(_type STATIC) + endif() + set(_all ALL) + set(_srcs) + set(_cflags) + set(_libs) + foreach(_arg ${ARGN}) + if(_arg MATCHES "^(STATIC|SHARED|MODULE)$") + set(_type ${_arg}) + elseif(_arg STREQUAL "EXCLUDE_FROM_ALL") + set(_all) + elseif(_arg STREQUAL "COMPILE_FLAGS" OR _arg STREQUAL "COMPILE_OPTIONS") + set(_state 1) + elseif(_arg STREQUAL "LINK_LIBRARIES") + set(_state 2) + elseif(_arg STREQUAL "SOURCES") + set(_state 0) + elseif(_state EQUAL 0) # SOURCES + set(_srcs ${_srcs} "${_arg}") + elseif(_state EQUAL 1) # COMPILE_FLAGS + list(APPEND _cflags ${_arg}) + elseif(_state EQUAL 2) # LINK_LIBRARIES + get_filename_component(_lpath "${_arg}" PATH) + get_filename_component(_lname "${_arg}" NAME) + set(_libs ${_libs} "-L${_lpath}" "-l${_lname}") + endif() + endforeach() + set(_objects) + set(_objectsStr) + foreach(_src ${_srcs}) + _mic_add_object("${_target}" "${_src}" _obj ${_cflags}) + list(APPEND _objects "${_obj}") + set(_objectsStr "${_objectsStr} \"${_obj}\"") + endforeach() + + set(_outdir "${CMAKE_CURRENT_BINARY_DIR}/x86_64-k1om-linux") + file(MAKE_DIRECTORY "${_outdir}") + + #TODO: handle STATIC/SHARED/MODULE differently + set(_output "lib${_target}.a") + set(_linkscript "${CMAKE_CURRENT_BINARY_DIR}/CMakeFiles/${_target}.dir/link.txt") + set(_cleanscript "CMakeFiles/${_target}.dir/cmake_clean_target.cmake") + file(WRITE "${_linkscript}" + "${MIC_AR} cr ${_output} ${_objectsStr} +${MIC_RANLIB} ${_output} +") + file(WRITE "${CMAKE_CURRENT_BINARY_DIR}/${_cleanscript}" + "FILE(REMOVE_RECURSE \"${_output}\") +") + add_custom_command(OUTPUT "${_outdir}/${_output}" + COMMAND "${CMAKE_COMMAND}" -E cmake_link_script "${_linkscript}" --verbose=$(VERBOSE) + DEPENDS ${_objects} + WORKING_DIRECTORY "${_outdir}" + COMMENT "Linking (MIC) ${_output}" + VERBATIM + ) + add_custom_target("${_target}" ${_all} + DEPENDS "${_outdir}/${_output}" + COMMENT "" + SOURCES ${_srcs} + ) + set_target_properties("${_target}" PROPERTIES + OUTPUT_NAME "${_outdir}/${_output}" + ) + endmacro() + + macro(mic_add_executable _target) + set(_state 0) + set(_all ALL) + set(_srcs) + set(_cflags) + set(_libs) + set(_libTargets) + set(_dump_asm false) + set(_exec_output_name "${_target}") + set(_objects) + set(_objectsStr) + foreach(_arg ${ARGN}) + if(_arg STREQUAL "EXCLUDE_FROM_ALL") + set(_all) + elseif(_arg STREQUAL "COMPILE_FLAGS") + set(_state 1) + elseif(_arg STREQUAL "LINK_LIBRARIES") + set(_state 2) + elseif(_arg STREQUAL "OUTPUT_NAME") + set(_state 3) + elseif(_arg STREQUAL "SOURCES") + set(_state 0) + elseif(_arg STREQUAL "OBJECTS") + set(_state 4) + elseif(_arg STREQUAL "DUMP_ASM") + set(_dump_asm true) + elseif(_state EQUAL 0) # SOURCES + set(_srcs ${_srcs} "${_arg}") + elseif(_state EQUAL 1) # COMPILE_FLAGS + set(_cflags ${_cflags} "${_arg}") + elseif(_state EQUAL 2) # LINK_LIBRARIES + if(TARGET ${_arg}) + get_target_property(_tmp "${_arg}" OUTPUT_NAME) + if(_tmp) + set(_libs ${_libs} "${_tmp}") + set(_libTargets ${_libTargets} "${_tmp}" "${_arg}") + else() + set(_libs ${_libs} "${_arg}") + if(EXISTS "${_arg}") + set(_libTargets ${_libTargets} "${_arg}") + endif() + endif() + else() + set(_libs ${_libs} "${_arg}") + if(EXISTS "${_arg}") + set(_libTargets ${_libTargets} "${_arg}") + endif() + endif() + elseif(_state EQUAL 3) # OUTPUT_NAME + set(_exec_output_name "${_arg}") + elseif(_state EQUAL 4) # 
OBJECTS + set(_objects ${_objects} "${_arg}") + set(_objectsStr "${_objectsStr} \"${_arg}\"") + endif() + endforeach() + foreach(_src ${_srcs}) + _mic_add_object("${_target}" "${_src}" _obj ${_cflags}) + set(_objects ${_objects} "${_obj}") + set(_objectsStr "${_objectsStr} \"${_obj}\"") + endforeach() + + set(_exec_output "${CMAKE_CURRENT_BINARY_DIR}/${_exec_output_name}") + get_property(_launch_rule GLOBAL PROPERTY RULE_LAUNCH_LINK) + string(REPLACE "\"" "" _launch_rule "${_launch_rule}") + string(REPLACE " " ";" _launch_rule "${_launch_rule}") + string(REPLACE "<TARGET_NAME>" "${_target}" _launch_rule "${_launch_rule}") + string(REPLACE "<CMAKE_CURRENT_BINARY_DIR>" "${CMAKE_CURRENT_BINARY_DIR}" _launch_rule "${_launch_rule}") + add_custom_command(OUTPUT "${_exec_output}" + COMMAND ${_launch_rule} "${MIC_CXX}" -mmic + "-L${MIC_SDK_DIR}/compiler/lib/mic/" + ${_mic_lflags} ${_objects} -o "${_exec_output}" + ${_libs} + DEPENDS ${_objects} ${_libTargets} + WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR} + COMMENT "Linking (MIC) ${_exec_output}" + VERBATIM + ) + set(_dump_asm_output) + if(_dump_asm) + foreach(_src ${_srcs}) + get_filename_component(_abs "${_src}" ABSOLUTE) + get_filename_component(_name "${_src}" NAME) + add_custom_command(OUTPUT "${_name}.s" + COMMAND "${MIC_CXX}" -mmic + -DVc_IMPL=MIC ${_mic_cflags} ${_cflags} ${Vc_MIC_CXX_FLAGS} + ${_abs} + -S -fsource-asm -fno-verbose-asm -o "${_name}.x" + COMMAND sh -c "grep -v ___tag_value '${_name}.x' | c++filt > '${_name}.s'" + COMMAND rm "${_name}.x" + DEPENDS ${_abs} + WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR} + COMMENT "Creating MIC Assembly ${_name}.s" + VERBATIM + ) + set(_dump_asm_output ${_dump_asm_output} "${CMAKE_CURRENT_BINARY_DIR}/${_name}.s") + endforeach() + endif() + add_custom_target("${_target}" ${_all} + DEPENDS "${_exec_output}" ${_dump_asm_output} + COMMENT "" + SOURCES ${_srcs} + ) + set_target_properties("${_target}" PROPERTIES OUTPUT_NAME "${_exec_output_name}") + endmacro() +endif() + +if(MIC_OFFLOAD_FOUND) + macro(mic_offload _target) + set(_mic_debug) + if(CMAKE_BUILD_TYPE STREQUAL "Debug" OR CMAKE_BUILD_TYPE STREQUAL "RelWithDebInfo") + set(_mic_debug "-g") + endif() + add_target_property(${_target} COMPILE_FLAGS "-offload-build -DCAN_OFFLOAD ${_mic_debug}") + set(_offload_ldflags "${_mic_debug}") + set(_libTargets) + foreach(_lib ${ARGN}) + get_target_property(_tmp "${_lib}" OUTPUT_NAME) + if(_tmp) + set(_offload_ldflags "${_offload_ldflags} ${_tmp}") + set(_libTargets ${_libTargets} "${_arg}") + else() + get_filename_component(_lpath "${_arg}" PATH) + get_filename_component(_lname "${_arg}" NAME) + set(_offload_ldflags "${_offload_ldflags} -L${_lpath} -l${_lname}") + endif() + endforeach() + add_target_property(${_target} LINK_FLAGS "-offload-build -offload-ldopts=\"${_offload_ldflags}\" ${_mic_debug}") + if(_libTargets) + add_dependencies(${_target} ${_libTargets}) + endif() + endmacro() +endif() diff -Nru vc-0.7.4/cmake/FindVc.cmake vc-1.3.0/cmake/FindVc.cmake --- vc-0.7.4/cmake/FindVc.cmake 2014-05-15 06:38:24.000000000 -0500 +++ vc-1.3.0/cmake/FindVc.cmake 2016-10-27 02:05:02.000000000 -0500 @@ -1,4 +1,4 @@ -# Locate the Vc template library. Vc can be found at http://gitorious.org/Vc/ +# Locate the Vc template library. Vc can be found at https://github.com/VcDevel/Vc # # This file is meant to be copied into projects that want to use Vc. 
It will # search for VcConfig.cmake, which ships with Vc and will provide up-to-date @@ -10,6 +10,9 @@ # Vc_INCLUDE_DIR # Vc_LIBRARIES # Vc_DEFINITIONS +# Vc_COMPILE_FLAGS +# Vc_ARCHITECTURE_FLAGS +# Vc_ALL_FLAGS (the union of the above three variables) # Vc_VERSION_MAJOR # Vc_VERSION_MINOR # Vc_VERSION_PATCH @@ -25,7 +28,7 @@ # Vc_AVX_INTRINSICS_BROKEN # #============================================================================= -# Copyright 2009-2012 Matthias Kretz +# Copyright 2009-2015 Matthias Kretz # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions are @@ -33,15 +36,12 @@ # # * Redistributions of source code must retain the above copyright notice, # this list of conditions and the following disclaimer. -# # * Redistributions in binary form must reproduce the above copyright notice, # this list of conditions and the following disclaimer in the documentation # and/or other materials provided with the distribution. -# -# * The names of Kitware, Inc., the Insight Consortium, or the names of -# any consortium members, or of any contributors, may not be used to -# endorse or promote products derived from this software without -# specific prior written permission. +# * Neither the names of contributing organizations nor the +# names of its contributors may be used to endorse or promote products +# derived from this software without specific prior written permission. # # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER AND CONTRIBUTORS ``AS IS'' # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE diff -Nru vc-0.7.4/cmake/OptimizeForArchitecture.cmake vc-1.3.0/cmake/OptimizeForArchitecture.cmake --- vc-0.7.4/cmake/OptimizeForArchitecture.cmake 2014-05-15 06:38:24.000000000 -0500 +++ vc-1.3.0/cmake/OptimizeForArchitecture.cmake 2016-10-27 02:05:02.000000000 -0500 @@ -1,5 +1,23 @@ +# Determine the host CPU feature set and determine the best set of compiler +# flags to enable all supported SIMD relevant features. Alternatively, the +# target CPU can be explicitly selected (for generating more generic binaries +# or for targeting a different system). +# Compilers provide e.g. the -march=native flag to achieve a similar result. +# This fails to address the need for building for a different microarchitecture +# than the current host. +# The script tries to deduce all settings from the model and family numbers of +# the CPU instead of reading the CPUID flags from e.g. /proc/cpuinfo. This makes +# the detection more independent from the CPUID code in the kernel (e.g. avx2 is +# not listed on older kernels). +# +# Usage: +# OptimizeForArchitecture() +# If either of Vc_SSE_INTRINSICS_BROKEN, Vc_AVX_INTRINSICS_BROKEN, +# Vc_AVX2_INTRINSICS_BROKEN is defined and set, the OptimizeForArchitecture +# macro will consequently disable the relevant features via compiler flags. + #============================================================================= -# Copyright 2010-2013 Matthias Kretz +# Copyright 2010-2015 Matthias Kretz # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions are @@ -7,15 +25,12 @@ # # * Redistributions of source code must retain the above copyright notice, # this list of conditions and the following disclaimer. 
-# # * Redistributions in binary form must reproduce the above copyright notice, # this list of conditions and the following disclaimer in the documentation # and/or other materials provided with the distribution. -# -# * The names of Kitware, Inc., the Insight Consortium, or the names of -# any consortium members, or of any contributors, may not be used to -# endorse or promote products derived from this software without -# specific prior written permission. +# * Neither the names of contributing organizations nor the +# names of its contributors may be used to endorse or promote products +# derived from this software without specific prior written permission. # # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER AND CONTRIBUTORS ``AS IS'' # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE @@ -31,7 +46,7 @@ get_filename_component(_currentDir "${CMAKE_CURRENT_LIST_FILE}" PATH) include("${_currentDir}/AddCompilerFlag.cmake") -include(CheckIncludeFile) +include(CheckIncludeFileCXX) macro(_my_find _list _value _ret) list(FIND ${_list} "${_value}" _found) @@ -70,37 +85,58 @@ endif(CMAKE_SYSTEM_NAME STREQUAL "Linux") if(_vendor_id STREQUAL "GenuineIntel") if(_cpu_family EQUAL 6) - # Any recent Intel CPU except NetBurst - if(_cpu_model EQUAL 62) + # taken from the Intel ORM + # http://www.intel.com/content/www/us/en/processors/architectures-software-developer-manuals.html + # CPUID Signature Values of Of Recent Intel Microarchitectures + # 4E 5E | Skylake microarchitecture + # 3D 47 56 | Broadwell microarchitecture + # 3C 45 46 3F | Haswell microarchitecture + # 3A 3E | Ivy Bridge microarchitecture + # 2A 2D | Sandy Bridge microarchitecture + # 25 2C 2F | Intel microarchitecture Westmere + # 1A 1E 1F 2E | Intel microarchitecture Nehalem + # 17 1D | Enhanced Intel Core microarchitecture + # 0F | Intel Core microarchitecture + # + # Values from the Intel SDE: + # 5C | Goldmont + # 5A | Silvermont + # 57 | Knights Landing + # 66 | Cannonlake + # 55 | Skylake Server + # 4E | Skylake Client + # 3C | Broadwell (likely a bug in the SDE) + # 3C | Haswell + if(_cpu_model EQUAL 87) + set(TARGET_ARCHITECTURE "knl") # Knights Landing + elseif(_cpu_model EQUAL 92) + set(TARGET_ARCHITECTURE "goldmont") + elseif(_cpu_model EQUAL 90) + set(TARGET_ARCHITECTURE "silvermont") + elseif(_cpu_model EQUAL 102) + set(TARGET_ARCHITECTURE "cannonlake") + elseif(_cpu_model EQUAL 85) # 55 + set(TARGET_ARCHITECTURE "skylake-avx512") + elseif(_cpu_model EQUAL 78 OR _cpu_model EQUAL 94) # 4E, 5E + set(TARGET_ARCHITECTURE "skylake") + elseif(_cpu_model EQUAL 61 OR _cpu_model EQUAL 71 OR _cpu_model EQUAL 86) + set(TARGET_ARCHITECTURE "broadwell") + elseif(_cpu_model EQUAL 60 OR _cpu_model EQUAL 69 OR _cpu_model EQUAL 70 OR _cpu_model EQUAL 63) + set(TARGET_ARCHITECTURE "haswell") + elseif(_cpu_model EQUAL 58 OR _cpu_model EQUAL 62) set(TARGET_ARCHITECTURE "ivy-bridge") - elseif(_cpu_model EQUAL 58) - set(TARGET_ARCHITECTURE "ivy-bridge") - elseif(_cpu_model EQUAL 47) # Xeon E7 4860 - set(TARGET_ARCHITECTURE "westmere") - elseif(_cpu_model EQUAL 46) # Xeon 7500 series - set(TARGET_ARCHITECTURE "westmere") - elseif(_cpu_model EQUAL 45) # Xeon TNG - set(TARGET_ARCHITECTURE "sandy-bridge") - elseif(_cpu_model EQUAL 44) # Xeon 5600 series - set(TARGET_ARCHITECTURE "westmere") - elseif(_cpu_model EQUAL 42) # Core TNG + elseif(_cpu_model EQUAL 42 OR _cpu_model EQUAL 45) set(TARGET_ARCHITECTURE "sandy-bridge") - elseif(_cpu_model EQUAL 37) # Core i7/i5/i3 - set(TARGET_ARCHITECTURE "westmere") - 
elseif(_cpu_model EQUAL 31) # Core i7/i5 + elseif(_cpu_model EQUAL 37 OR _cpu_model EQUAL 44 OR _cpu_model EQUAL 47) set(TARGET_ARCHITECTURE "westmere") - elseif(_cpu_model EQUAL 30) # Core i7/i5 - set(TARGET_ARCHITECTURE "westmere") - elseif(_cpu_model EQUAL 29) - set(TARGET_ARCHITECTURE "penryn") - elseif(_cpu_model EQUAL 28) - set(TARGET_ARCHITECTURE "atom") - elseif(_cpu_model EQUAL 26) + elseif(_cpu_model EQUAL 26 OR _cpu_model EQUAL 30 OR _cpu_model EQUAL 31 OR _cpu_model EQUAL 46) set(TARGET_ARCHITECTURE "nehalem") - elseif(_cpu_model EQUAL 23) + elseif(_cpu_model EQUAL 23 OR _cpu_model EQUAL 29) set(TARGET_ARCHITECTURE "penryn") elseif(_cpu_model EQUAL 15) set(TARGET_ARCHITECTURE "merom") + elseif(_cpu_model EQUAL 28) + set(TARGET_ARCHITECTURE "atom") elseif(_cpu_model EQUAL 14) set(TARGET_ARCHITECTURE "core") elseif(_cpu_model LESS 14) @@ -142,7 +178,15 @@ endmacro() macro(OptimizeForArchitecture) - set(TARGET_ARCHITECTURE "auto" CACHE STRING "CPU architecture to optimize for. Using an incorrect setting here can result in crashes of the resulting binary because of invalid instructions used.\nSetting the value to \"auto\" will try to optimize for the architecture where cmake is called.\nOther supported values are: \"none\", \"generic\", \"core\", \"merom\" (65nm Core2), \"penryn\" (45nm Core2), \"nehalem\", \"westmere\", \"sandy-bridge\", \"ivy-bridge\", \"atom\", \"k8\", \"k8-sse3\", \"barcelona\", \"istanbul\", \"magny-cours\", \"bulldozer\", \"interlagos\", \"piledriver\", \"AMD 14h\", \"AMD 16h\".") + set(TARGET_ARCHITECTURE "auto" CACHE STRING "CPU architecture to optimize for. \ +Using an incorrect setting here can result in crashes of the resulting binary because of invalid instructions used. \ +Setting the value to \"auto\" will try to optimize for the architecture where cmake is called. 
\ +Other supported values are: \"none\", \"generic\", \"core\", \"merom\" (65nm Core2), \ +\"penryn\" (45nm Core2), \"nehalem\", \"westmere\", \"sandy-bridge\", \"ivy-bridge\", \ +\"haswell\", \"broadwell\", \"skylake\", \"skylake-avx512\", \"cannonlake\", \"silvermont\", \ +\"goldmont\", \"knl\" (Knights Landing), \"atom\", \"k8\", \"k8-sse3\", \"barcelona\", \ +\"istanbul\", \"magny-cours\", \"bulldozer\", \"interlagos\", \"piledriver\", \ +\"AMD 14h\", \"AMD 16h\".") set(_force) if(NOT _last_target_arch STREQUAL "${TARGET_ARCHITECTURE}") message(STATUS "target changed from \"${_last_target_arch}\" to \"${TARGET_ARCHITECTURE}\"") @@ -160,6 +204,58 @@ message(STATUS "Detected CPU: ${TARGET_ARCHITECTURE}") endif(TARGET_ARCHITECTURE STREQUAL "auto") + macro(_nehalem) + list(APPEND _march_flag_list "nehalem") + list(APPEND _march_flag_list "corei7") + list(APPEND _march_flag_list "core2") + list(APPEND _available_vector_units_list "sse" "sse2" "sse3" "ssse3" "sse4.1" "sse4.2") + endmacro() + macro(_westmere) + list(APPEND _march_flag_list "westmere") + _nehalem() + endmacro() + macro(_sandybridge) + list(APPEND _march_flag_list "sandybridge") + list(APPEND _march_flag_list "corei7-avx") + _westmere() + list(APPEND _available_vector_units_list "sse" "sse2" "sse3" "ssse3" "sse4.1" "sse4.2" "avx") + endmacro() + macro(_ivybridge) + list(APPEND _march_flag_list "ivybridge") + list(APPEND _march_flag_list "core-avx-i") + _sandybridge() + list(APPEND _available_vector_units_list "rdrnd" "f16c") + endmacro() + macro(_haswell) + list(APPEND _march_flag_list "haswell") + list(APPEND _march_flag_list "core-avx2") + _ivybridge() + list(APPEND _available_vector_units_list "avx2" "fma" "bmi" "bmi2") + endmacro() + macro(_broadwell) + list(APPEND _march_flag_list "broadwell") + _haswell() + endmacro() + macro(_skylake) + list(APPEND _march_flag_list "skylake") + _broadwell() + endmacro() + macro(_skylake_avx512) + list(APPEND _march_flag_list "skylake-avx512") + _skylake() + list(APPEND _available_vector_units_list "avx512f" "avx512cd" "avx512dq" "avx512bw" "avx512vl") + endmacro() + macro(_cannonlake) + list(APPEND _march_flag_list "cannonlake") + _skylake_avx512() + list(APPEND _available_vector_units_list "avx512ifma" "avx512vbmi") + endmacro() + macro(_knightslanding) + list(APPEND _march_flag_list "knl") + _broadwell() + list(APPEND _available_vector_units_list "avx512f" "avx512pf" "avx512er" "avx512cd") + endmacro() + if(TARGET_ARCHITECTURE STREQUAL "core") list(APPEND _march_flag_list "core2") list(APPEND _available_vector_units_list "sse" "sse2" "sse3") @@ -178,27 +274,26 @@ else() message(STATUS "SSE4.1: disabled (auto-detected from this computer's CPU flags)") endif() - elseif(TARGET_ARCHITECTURE STREQUAL "nehalem") - list(APPEND _march_flag_list "nehalem") - list(APPEND _march_flag_list "corei7") - list(APPEND _march_flag_list "core2") - list(APPEND _available_vector_units_list "sse" "sse2" "sse3" "ssse3" "sse4.1" "sse4.2") - elseif(TARGET_ARCHITECTURE STREQUAL "westmere") - list(APPEND _march_flag_list "westmere") - list(APPEND _march_flag_list "corei7") - list(APPEND _march_flag_list "core2") - list(APPEND _available_vector_units_list "sse" "sse2" "sse3" "ssse3" "sse4.1" "sse4.2") + elseif(TARGET_ARCHITECTURE STREQUAL "knl") + _knightslanding() + elseif(TARGET_ARCHITECTURE STREQUAL "cannonlake") + _cannonlake() + elseif(TARGET_ARCHITECTURE STREQUAL "skylake-xeon" OR TARGET_ARCHITECTURE STREQUAL "skylake-avx512") + _skylake_avx512() + elseif(TARGET_ARCHITECTURE STREQUAL "skylake") + _skylake() 
+ elseif(TARGET_ARCHITECTURE STREQUAL "broadwell") + _broadwell() + elseif(TARGET_ARCHITECTURE STREQUAL "haswell") + _haswell() elseif(TARGET_ARCHITECTURE STREQUAL "ivy-bridge") - list(APPEND _march_flag_list "core-avx-i") - list(APPEND _march_flag_list "corei7-avx") - list(APPEND _march_flag_list "core2") - list(APPEND _available_vector_units_list "sse" "sse2" "sse3" "ssse3" "sse4.1" "sse4.2" "avx" "rdrnd" "f16c") + _ivybridge() elseif(TARGET_ARCHITECTURE STREQUAL "sandy-bridge") - list(APPEND _march_flag_list "sandybridge") - list(APPEND _march_flag_list "corei7-avx") - list(APPEND _march_flag_list "corei7") - list(APPEND _march_flag_list "core2") - list(APPEND _available_vector_units_list "sse" "sse2" "sse3" "ssse3" "sse4.1" "sse4.2" "avx") + _sandybridge() + elseif(TARGET_ARCHITECTURE STREQUAL "westmere") + _westmere() + elseif(TARGET_ARCHITECTURE STREQUAL "nehalem") + _nehalem() elseif(TARGET_ARCHITECTURE STREQUAL "atom") list(APPEND _march_flag_list "atom") list(APPEND _march_flag_list "core2") @@ -259,100 +354,84 @@ if(NOT TARGET_ARCHITECTURE STREQUAL "none") set(_disable_vector_unit_list) set(_enable_vector_unit_list) - _my_find(_available_vector_units_list "sse2" SSE2_FOUND) - _my_find(_available_vector_units_list "sse3" SSE3_FOUND) - _my_find(_available_vector_units_list "ssse3" SSSE3_FOUND) - _my_find(_available_vector_units_list "sse4.1" SSE4_1_FOUND) - _my_find(_available_vector_units_list "sse4.2" SSE4_2_FOUND) - _my_find(_available_vector_units_list "sse4a" SSE4a_FOUND) if(DEFINED Vc_AVX_INTRINSICS_BROKEN AND Vc_AVX_INTRINSICS_BROKEN) - UserWarning("AVX disabled per default because of old/broken compiler") - set(AVX_FOUND false) - set(XOP_FOUND false) - set(FMA4_FOUND false) + UserWarning("AVX disabled per default because of old/broken toolchain") + set(_avx_broken true) + set(_avx2_broken true) + set(_fma4_broken true) + set(_xop_broken true) else() - _my_find(_available_vector_units_list "avx" AVX_FOUND) + set(_avx_broken false) if(DEFINED Vc_FMA4_INTRINSICS_BROKEN AND Vc_FMA4_INTRINSICS_BROKEN) - UserWarning("FMA4 disabled per default because of old/broken compiler") - set(FMA4_FOUND false) + UserWarning("FMA4 disabled per default because of old/broken toolchain") + set(_fma4_broken true) else() - _my_find(_available_vector_units_list "fma4" FMA4_FOUND) + set(_fma4_broken false) endif() if(DEFINED Vc_XOP_INTRINSICS_BROKEN AND Vc_XOP_INTRINSICS_BROKEN) - UserWarning("XOP disabled per default because of old/broken compiler") - set(XOP_FOUND false) + UserWarning("XOP disabled per default because of old/broken toolchain") + set(_xop_broken true) else() - _my_find(_available_vector_units_list "xop" XOP_FOUND) + set(_xop_broken false) + endif() + if(DEFINED Vc_AVX2_INTRINSICS_BROKEN AND Vc_AVX2_INTRINSICS_BROKEN) + UserWarning("AVX2 disabled per default because of old/broken toolchain") + set(_avx2_broken true) + else() + set(_avx2_broken false) endif() endif() - set(USE_SSE2 ${SSE2_FOUND} CACHE BOOL "Use SSE2. If SSE2 instructions are not enabled the SSE implementation will be disabled." ${_force}) - set(USE_SSE3 ${SSE3_FOUND} CACHE BOOL "Use SSE3. If SSE3 instructions are not enabled they will be emulated." ${_force}) - set(USE_SSSE3 ${SSSE3_FOUND} CACHE BOOL "Use SSSE3. If SSSE3 instructions are not enabled they will be emulated." ${_force}) - set(USE_SSE4_1 ${SSE4_1_FOUND} CACHE BOOL "Use SSE4.1. If SSE4.1 instructions are not enabled they will be emulated." ${_force}) - set(USE_SSE4_2 ${SSE4_2_FOUND} CACHE BOOL "Use SSE4.2. 
If SSE4.2 instructions are not enabled they will be emulated." ${_force}) - set(USE_SSE4a ${SSE4a_FOUND} CACHE BOOL "Use SSE4a. If SSE4a instructions are not enabled they will be emulated." ${_force}) - set(USE_AVX ${AVX_FOUND} CACHE BOOL "Use AVX. This will double some of the vector sizes relative to SSE." ${_force}) - set(USE_XOP ${XOP_FOUND} CACHE BOOL "Use XOP." ${_force}) - set(USE_FMA4 ${FMA4_FOUND} CACHE BOOL "Use FMA4." ${_force}) - mark_as_advanced(USE_SSE2 USE_SSE3 USE_SSSE3 USE_SSE4_1 USE_SSE4_2 USE_SSE4a USE_AVX USE_XOP USE_FMA4) - if(USE_SSE2) - list(APPEND _enable_vector_unit_list "sse2") - else(USE_SSE2) - list(APPEND _disable_vector_unit_list "sse2") - endif(USE_SSE2) - if(USE_SSE3) - list(APPEND _enable_vector_unit_list "sse3") - else(USE_SSE3) - list(APPEND _disable_vector_unit_list "sse3") - endif(USE_SSE3) - if(USE_SSSE3) - list(APPEND _enable_vector_unit_list "ssse3") - else(USE_SSSE3) - list(APPEND _disable_vector_unit_list "ssse3") - endif(USE_SSSE3) - if(USE_SSE4_1) - list(APPEND _enable_vector_unit_list "sse4.1") - else(USE_SSE4_1) - list(APPEND _disable_vector_unit_list "sse4.1") - endif(USE_SSE4_1) - if(USE_SSE4_2) - list(APPEND _enable_vector_unit_list "sse4.2") - else(USE_SSE4_2) - list(APPEND _disable_vector_unit_list "sse4.2") - endif(USE_SSE4_2) - if(USE_SSE4a) - list(APPEND _enable_vector_unit_list "sse4a") - else(USE_SSE4a) - list(APPEND _disable_vector_unit_list "sse4a") - endif(USE_SSE4a) - if(USE_AVX) - list(APPEND _enable_vector_unit_list "avx") - # we want SSE intrinsics to result in instructions using the VEX prefix. - # Otherwise integer ops (which require the older SSE intrinsics) would - # always have a large penalty. - list(APPEND _enable_vector_unit_list "sse2avx") - else(USE_AVX) - list(APPEND _disable_vector_unit_list "avx") - endif(USE_AVX) - if(USE_XOP) - list(APPEND _enable_vector_unit_list "xop") - else() - list(APPEND _disable_vector_unit_list "xop") - endif() - if(USE_FMA4) - list(APPEND _enable_vector_unit_list "fma4") - else() - list(APPEND _disable_vector_unit_list "fma4") - endif() + + macro(_enable_or_disable _name _flag _documentation _broken) + if(_broken) + set(_found false) + else() + _my_find(_available_vector_units_list "${_flag}" _found) + endif() + set(USE_${_name} ${_found} CACHE BOOL "${documentation}" ${_force}) + mark_as_advanced(USE_${_name}) + if(USE_${_name}) + list(APPEND _enable_vector_unit_list "${_flag}") + else() + list(APPEND _disable_vector_unit_list "${_flag}") + endif() + endmacro() + _enable_or_disable(SSE2 "sse2" "Use SSE2. If SSE2 instructions are not enabled the SSE implementation will be disabled." false) + _enable_or_disable(SSE3 "sse3" "Use SSE3. If SSE3 instructions are not enabled they will be emulated." false) + _enable_or_disable(SSSE3 "ssse3" "Use SSSE3. If SSSE3 instructions are not enabled they will be emulated." false) + _enable_or_disable(SSE4_1 "sse4.1" "Use SSE4.1. If SSE4.1 instructions are not enabled they will be emulated." false) + _enable_or_disable(SSE4_2 "sse4.2" "Use SSE4.2. If SSE4.2 instructions are not enabled they will be emulated." false) + _enable_or_disable(SSE4a "sse4a" "Use SSE4a. If SSE4a instructions are not enabled they will be emulated." false) + _enable_or_disable(AVX "avx" "Use AVX. This will double all floating-point vector sizes relative to SSE." _avx_broken) + _enable_or_disable(FMA "fma" "Use FMA." _avx_broken) + _enable_or_disable(BMI2 "bmi2" "Use BMI2." _avx_broken) + _enable_or_disable(AVX2 "avx2" "Use AVX2. This will double all of the vector sizes relative to SSE." 
_avx2_broken) + _enable_or_disable(XOP "xop" "Use XOP." _xop_broken) + _enable_or_disable(FMA4 "fma4" "Use FMA4." _fma4_broken) + _enable_or_disable(AVX512F "avx512f" "Use AVX512F. This will double all floating-point vector sizes relative to AVX2." false) + _enable_or_disable(AVX512VL "avx512vl" "Use AVX512VL. This enables 128- and 256-bit vector length instructions with EVEX coding (improved write-masking & more vector registers)." _avx2_broken) + _enable_or_disable(AVX512PF "avx512pf" "Use AVX512PF. This enables prefetch instructions for gathers and scatters." false) + _enable_or_disable(AVX512ER "avx512er" "Use AVX512ER. This enables exponential and reciprocal instructions." false) + _enable_or_disable(AVX512CD "avx512cd" "Use AVX512CD." false) + _enable_or_disable(AVX512DQ "avx512dq" "Use AVX512DQ." false) + _enable_or_disable(AVX512BW "avx512bw" "Use AVX512BW." false) + _enable_or_disable(AVX512IFMA "avx512ifma" "Use AVX512IFMA." false) + _enable_or_disable(AVX512VBMI "avx512vbmi" "Use AVX512VBMI." false) + if(MSVC) # MSVC on 32 bit can select /arch:SSE2 (since 2010 also /arch:AVX) # MSVC on 64 bit cannot select anything (should have changed with MSVC 2010) - _my_find(_enable_vector_unit_list "avx" _avx) - set(_avx_flag FALSE) - if(_avx) - AddCompilerFlag("/arch:AVX" CXX_FLAGS Vc_ARCHITECTURE_FLAGS CXX_RESULT _avx_flag) + _my_find(_enable_vector_unit_list "avx2" _found) + if(_found) + AddCompilerFlag("/arch:AVX2" CXX_FLAGS Vc_ARCHITECTURE_FLAGS CXX_RESULT _found) endif() - if(NOT _avx_flag) + if(NOT _found) + _my_find(_enable_vector_unit_list "avx" _found) + if(_found) + AddCompilerFlag("/arch:AVX" CXX_FLAGS Vc_ARCHITECTURE_FLAGS CXX_RESULT _found) + endif() + endif() + if(NOT _found) _my_find(_enable_vector_unit_list "sse2" _found) if(_found) AddCompilerFlag("/arch:SSE2" CXX_FLAGS Vc_ARCHITECTURE_FLAGS) @@ -435,6 +514,8 @@ set(_header "ammintrin.h") elseif(_flag STREQUAL "avx") set(_header "immintrin.h") + elseif(_flag STREQUAL "avx2") + set(_header "immintrin.h") elseif(_flag STREQUAL "fma4") set(_header "x86intrin.h") elseif(_flag STREQUAL "xop") @@ -443,7 +524,7 @@ set(_resultVar "HAVE_${_header}") string(REPLACE "." 
"_" _resultVar "${_resultVar}") if(_header) - CHECK_INCLUDE_FILE("${_header}" ${_resultVar} "-m${_flag}") + CHECK_INCLUDE_FILE_CXX("${_header}" ${_resultVar} "-m${_flag}") if(NOT ${_resultVar}) set(_useVar "USE_${_flag}") string(TOUPPER "${_useVar}" _useVar) @@ -454,7 +535,7 @@ endif() endif() if(NOT _header OR ${_resultVar}) - set(Vc_ARCHITECTURE_FLAGS "${Vc_ARCHITECTURE_FLAGS} -m${_flag}") + list(APPEND Vc_ARCHITECTURE_FLAGS "-m${_flag}") endif() endif() endforeach(_flag) diff -Nru vc-0.7.4/cmake/VcConfig.cmake.in vc-1.3.0/cmake/VcConfig.cmake.in --- vc-0.7.4/cmake/VcConfig.cmake.in 2014-05-15 06:38:24.000000000 -0500 +++ vc-1.3.0/cmake/VcConfig.cmake.in 2016-10-27 02:05:02.000000000 -0500 @@ -6,13 +6,27 @@ set(Vc_INSTALL_DIR "@CMAKE_INSTALL_PREFIX@") -set(Vc_LIB_DIR "@CMAKE_INSTALL_PREFIX@/lib") -set(Vc_INCLUDE_DIR "@CMAKE_INSTALL_PREFIX@/include") -set(Vc_CMAKE_MODULES_DIR "@CMAKE_INSTALL_PREFIX@/lib/cmake/Vc") +set(Vc_LIB_DIR "@CMAKE_INSTALL_PREFIX@/lib@LIB_SUFFIX@") +find_path(Vc_INCLUDE_DIR Vc/global.h HINTS "@CMAKE_INSTALL_PREFIX@/include") +find_path(Vc_CMAKE_MODULES_DIR AddCompilerFlag.cmake HINTS "${Vc_LIB_DIR}/cmake/Vc") +list(APPEND CMAKE_MODULE_PATH "${Vc_CMAKE_MODULES_DIR}") find_library(Vc_LIBRARIES Vc PATHS "${Vc_LIB_DIR}" NO_DEFAULT_PATH) +find_library(Vc_MIC_LIBRARIES Vc_MIC PATHS "${Vc_LIB_DIR}" NO_DEFAULT_PATH) include("${Vc_CMAKE_MODULES_DIR}/VcMacros.cmake") set(Vc_DEFINITIONS) +set(Vc_COMPILE_FLAGS) +set(Vc_ARCHITECTURE_FLAGS) vc_set_preferred_compiler_flags() +separate_arguments(Vc_ALL_FLAGS UNIX_COMMAND "${Vc_DEFINITIONS}") +list(APPEND Vc_ALL_FLAGS ${Vc_COMPILE_FLAGS}) +list(APPEND Vc_ALL_FLAGS ${Vc_ARCHITECTURE_FLAGS}) + +include(FindPackageHandleStandardArgs) +find_package_handle_standard_args(Vc + FOUND_VAR Vc_FOUND + REQUIRED_VARS Vc_LIBRARIES Vc_INCLUDE_DIR Vc_CMAKE_MODULES_DIR + VERSION_VAR Vc_VERSION + ) diff -Nru vc-0.7.4/cmake/VcMacros.cmake vc-1.3.0/cmake/VcMacros.cmake --- vc-0.7.4/cmake/VcMacros.cmake 2014-05-15 06:38:24.000000000 -0500 +++ vc-1.3.0/cmake/VcMacros.cmake 2016-10-27 02:05:02.000000000 -0500 @@ -5,7 +5,7 @@ # vc_set_preferred_compiler_flags # #============================================================================= -# Copyright 2009-2013 Matthias Kretz +# Copyright 2009-2015 Matthias Kretz # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions are @@ -13,15 +13,12 @@ # # * Redistributions of source code must retain the above copyright notice, # this list of conditions and the following disclaimer. -# # * Redistributions in binary form must reproduce the above copyright notice, # this list of conditions and the following disclaimer in the documentation # and/or other materials provided with the distribution. -# -# * The names of Kitware, Inc., the Insight Consortium, or the names of -# any consortium members, or of any contributors, may not be used to -# endorse or promote products derived from this software without -# specific prior written permission. +# * Neither the names of contributing organizations nor the +# names of its contributors may be used to endorse or promote products +# derived from this software without specific prior written permission. 
# # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER AND CONTRIBUTORS ``AS IS'' # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE @@ -52,8 +49,13 @@ set(Vc_COMPILER_IS_GCC false) if(CMAKE_CXX_COMPILER MATCHES "/(icpc|icc)$") set(Vc_COMPILER_IS_INTEL true) - exec_program(${CMAKE_C_COMPILER} ARGS -dumpversion OUTPUT_VARIABLE Vc_ICC_VERSION) + exec_program(${CMAKE_CXX_COMPILER} ARGS -dumpversion OUTPUT_VARIABLE Vc_ICC_VERSION) message(STATUS "Detected Compiler: Intel ${Vc_ICC_VERSION}") + + # break build with too old clang as early as possible. + if(Vc_ICC_VERSION VERSION_LESS 15.0.3) + message(FATAL_ERROR "Vc 1.x requires C++11 support. This requires at least ICC 15.0.3") + endif() elseif(CMAKE_CXX_COMPILER MATCHES "(opencc|openCC)$") set(Vc_COMPILER_IS_OPEN64 true) message(STATUS "Detected Compiler: Open64") @@ -62,43 +64,60 @@ exec_program(${CMAKE_CXX_COMPILER} ARGS --version OUTPUT_VARIABLE Vc_CLANG_VERSION) string(REGEX MATCH "[0-9]+\\.[0-9]+(\\.[0-9]+)?" Vc_CLANG_VERSION "${Vc_CLANG_VERSION}") message(STATUS "Detected Compiler: Clang ${Vc_CLANG_VERSION}") + + # break build with too old clang as early as possible. + if(Vc_CLANG_VERSION VERSION_LESS 3.4) + message(FATAL_ERROR "Vc 1.x requires C++11 support. This requires at least clang 3.4") + endif() elseif(MSVC) set(Vc_COMPILER_IS_MSVC true) - message(STATUS "Detected Compiler: MSVC ${MSVC_VERSION}") + execute_process(COMMAND ${CMAKE_CXX_COMPILER} /nologo -EP "${_currentDir}/msvc_version.c" OUTPUT_STRIP_TRAILING_WHITESPACE OUTPUT_VARIABLE Vc_MSVC_VERSION) + string(STRIP "${Vc_MSVC_VERSION}" Vc_MSVC_VERSION) + string(REPLACE "MSVC " "" Vc_MSVC_VERSION "${Vc_MSVC_VERSION}") + message(STATUS "Detected Compiler: MSVC ${Vc_MSVC_VERSION}") elseif(CMAKE_COMPILER_IS_GNUCXX) set(Vc_COMPILER_IS_GCC true) - exec_program(${CMAKE_C_COMPILER} ARGS -dumpversion OUTPUT_VARIABLE Vc_GCC_VERSION) + exec_program(${CMAKE_CXX_COMPILER} ARGS -dumpversion OUTPUT_VARIABLE Vc_GCC_VERSION) message(STATUS "Detected Compiler: GCC ${Vc_GCC_VERSION}") # some distributions patch their GCC to return nothing or only major and minor version on -dumpversion. # In that case we must extract the version number from --version. if(NOT Vc_GCC_VERSION OR Vc_GCC_VERSION MATCHES "^[0-9]\\.[0-9]+$") - exec_program(${CMAKE_C_COMPILER} ARGS --version OUTPUT_VARIABLE Vc_GCC_VERSION) + exec_program(${CMAKE_CXX_COMPILER} ARGS --version OUTPUT_VARIABLE Vc_GCC_VERSION) string(REGEX MATCH "[0-9]+\\.[0-9]+\\.[0-9]+" Vc_GCC_VERSION "${Vc_GCC_VERSION}") message(STATUS "GCC Version from --version: ${Vc_GCC_VERSION}") endif() # some distributions patch their GCC to be API incompatible to what the FSF released. 
In # those cases we require a macro to identify the distribution version - find_program(_lsb_release lsb_release) - mark_as_advanced(_lsb_release) - if(_lsb_release) - execute_process(COMMAND ${_lsb_release} -is OUTPUT_VARIABLE _distributor_id OUTPUT_STRIP_TRAILING_WHITESPACE) - execute_process(COMMAND ${_lsb_release} -rs OUTPUT_VARIABLE _distributor_release OUTPUT_STRIP_TRAILING_WHITESPACE) - string(TOUPPER "${_distributor_id}" _distributor_id) - if(_distributor_id STREQUAL "UBUNTU") - execute_process(COMMAND ${CMAKE_C_COMPILER} --version OUTPUT_STRIP_TRAILING_WHITESPACE OUTPUT_VARIABLE _gcc_version) + find_program(Vc_lsb_release lsb_release) + mark_as_advanced(Vc_lsb_release) + if(Vc_lsb_release) + if(NOT Vc_distributor_id) + execute_process(COMMAND ${Vc_lsb_release} -is OUTPUT_VARIABLE Vc_distributor_id OUTPUT_STRIP_TRAILING_WHITESPACE) + string(TOUPPER "${Vc_distributor_id}" Vc_distributor_id) + set(Vc_distributor_id "${Vc_distributor_id}" CACHE STRING "lsb distribution id") + execute_process(COMMAND ${Vc_lsb_release} -rs OUTPUT_VARIABLE Vc_distributor_release OUTPUT_STRIP_TRAILING_WHITESPACE) + set(Vc_distributor_release "${Vc_distributor_release}" CACHE STRING "lsb release id") + endif() + if(Vc_distributor_id STREQUAL "UBUNTU") + execute_process(COMMAND ${CMAKE_CXX_COMPILER} --version OUTPUT_STRIP_TRAILING_WHITESPACE OUTPUT_VARIABLE _gcc_version) string(REGEX MATCH "\\(.* ${Vc_GCC_VERSION}-([0-9]+).*\\)" _tmp "${_gcc_version}") if(_tmp) set(_patch ${CMAKE_MATCH_1}) - string(REGEX MATCH "^([0-9]+)\\.([0-9]+)$" _tmp "${_distributor_release}") + string(REGEX MATCH "^([0-9]+)\\.([0-9]+)$" _tmp "${Vc_distributor_release}") execute_process(COMMAND printf 0x%x%02x%02x ${CMAKE_MATCH_1} ${CMAKE_MATCH_2} ${_patch} OUTPUT_STRIP_TRAILING_WHITESPACE OUTPUT_VARIABLE _tmp) set(Vc_DEFINITIONS "${Vc_DEFINITIONS} -D__GNUC_UBUNTU_VERSION__=${_tmp}") endif() endif() endif() + + # break build with too old GCC as early as possible. + if(Vc_GCC_VERSION VERSION_LESS 4.8.1) + message(FATAL_ERROR "Vc 1.x requires C++11 support. This requires at least GCC 4.8.1") + endif() else() - message(WARNING "Untested/-supported Compiler for use with Vc.\nPlease fill out the missing parts in the CMake scripts and submit a patch to http://code.compeng.uni-frankfurt.de/projects/vc") + message(WARNING "Untested/-supported Compiler (${CMAKE_CXX_COMPILER}) for use with Vc.\nPlease fill out the missing parts in the CMake scripts and submit a patch to http://code.compeng.uni-frankfurt.de/projects/vc") endif() endif() endmacro() @@ -128,47 +147,24 @@ endmacro() macro(vc_check_assembler) - if(APPLE) - if(NOT Vc_COMPILER_IS_CLANG) - message(WARNING "Apple does not provide an assembler with AVX support. AVX will not be available. Please use Clang if you want to use AVX.") - set(Vc_DEFINITIONS "${Vc_DEFINITIONS} -DVC_NO_XGETBV") + exec_program(${CMAKE_CXX_COMPILER} ARGS -print-prog-name=as OUTPUT_VARIABLE _as) + mark_as_advanced(_as) + if(NOT _as) + message(WARNING "Could not find 'as', the assembler used by GCC. Hoping everything will work out...") + else() + exec_program(${_as} ARGS --version OUTPUT_VARIABLE _as_version) + string(REGEX REPLACE "\\([^\\)]*\\)" "" _as_version "${_as_version}") + string(REGEX MATCH "[1-9]\\.[0-9]+(\\.[0-9]+)?" _as_version "${_as_version}") + if(_as_version VERSION_LESS "2.18.93") + UserWarning("Your binutils is too old (${_as_version}). 
Some optimizations of Vc will be disabled.") + set(Vc_DEFINITIONS "${Vc_DEFINITIONS} -DVc_NO_XGETBV") # old assembler doesn't know the xgetbv instruction set(Vc_AVX_INTRINSICS_BROKEN true) - endif() - else(APPLE) - if(${ARGC} EQUAL 1) - set(_as "${ARGV1}") - else() - exec_program(${CMAKE_CXX_COMPILER} ARGS -print-prog-name=as OUTPUT_VARIABLE _as) - mark_as_advanced(_as) - endif() - if(NOT _as) - message(WARNING "Could not find 'as', the assembler used by GCC. Hoping everything will work out...") - else() - exec_program(${_as} ARGS --version OUTPUT_VARIABLE _as_version) - string(REGEX REPLACE "\\([^\\)]*\\)" "" _as_version "${_as_version}") - string(REGEX MATCH "[1-9]\\.[0-9]+(\\.[0-9]+)?" _as_version "${_as_version}") - if(_as_version VERSION_LESS "2.18.93") - UserWarning("Your binutils is too old (${_as_version}). Some optimizations of Vc will be disabled.") - add_definitions(-DVC_NO_XGETBV) # old assembler doesn't know the xgetbv instruction - set(Vc_AVX_INTRINSICS_BROKEN true) - set(Vc_XOP_INTRINSICS_BROKEN true) - set(Vc_FMA4_INTRINSICS_BROKEN true) - elseif(_as_version VERSION_LESS "2.21.0") - UserWarning("Your binutils is too old (${_as_version}) for XOP instructions. They will therefore not be provided in libVc.") - set(Vc_XOP_INTRINSICS_BROKEN true) - endif() - endif() - endif(APPLE) -endmacro() - -macro(vc_check_fpmath) - # if compiling for 32 bit x86 we need to use the -mfpmath=sse since the x87 is broken by design - include (CheckCXXSourceRuns) - check_cxx_source_runs("int main() { return sizeof(void*) != 8; }" Vc_VOID_PTR_IS_64BIT) - if(NOT Vc_VOID_PTR_IS_64BIT) - exec_program(${CMAKE_C_COMPILER} ARGS -dumpmachine OUTPUT_VARIABLE _gcc_machine) - if(_gcc_machine MATCHES "[x34567]86" OR _gcc_machine STREQUAL "mingw32") - vc_add_compiler_flag(Vc_DEFINITIONS "-mfpmath=sse") + set(Vc_XOP_INTRINSICS_BROKEN true) + set(Vc_FMA4_INTRINSICS_BROKEN true) + elseif(_as_version VERSION_LESS "2.21.0") + UserWarning("Your binutils is too old (${_as_version}) for XOP and AVX2 instructions. 
They will therefore not be provided in libVc.") + set(Vc_XOP_INTRINSICS_BROKEN true) + set(Vc_AVX2_INTRINSICS_BROKEN true) endif() endif() endmacro() @@ -188,6 +184,7 @@ set(Vc_SSE_INTRINSICS_BROKEN false) set(Vc_AVX_INTRINSICS_BROKEN false) + set(Vc_AVX2_INTRINSICS_BROKEN false) set(Vc_XOP_INTRINSICS_BROKEN false) set(Vc_FMA4_INTRINSICS_BROKEN false) @@ -208,7 +205,6 @@ AddCompilerFlag("-Wpointer-arith") AddCompilerFlag("-Wcast-align") AddCompilerFlag("-Wreturn-type") - AddCompilerFlag("-ansi") AddCompilerFlag("-pedantic") AddCompilerFlag("-Wno-long-long") AddCompilerFlag("-Wshadow") @@ -223,78 +219,40 @@ # Open64 4.5.1 still doesn't ship immintrin.h set(Vc_AVX_INTRINSICS_BROKEN true) + set(Vc_AVX2_INTRINSICS_BROKEN true) elseif(Vc_COMPILER_IS_GCC) ################################################################################################## # GCC # ################################################################################################## if(_add_warning_flags) - set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -W -Wall -Wswitch -Wformat -Wchar-subscripts -Wparentheses -Wmultichar -Wtrigraphs -Wpointer-arith -Wcast-align -Wreturn-type -pedantic -Wno-long-long -Wshadow") - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -W -Wall -Wswitch -Wformat -Wchar-subscripts -Wparentheses -Wmultichar -Wtrigraphs -Wpointer-arith -Wcast-align -Wreturn-type -pedantic -Wno-long-long -Wshadow") - if(NOT WIN32) - # the -ansi flag makes MinGW unusable, so maybe it's better to omit it - set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -ansi") - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -ansi") - endif() - AddCompilerFlag("-Wundef") - AddCompilerFlag("-Wold-style-cast") - AddCompilerFlag("-Wno-variadic-macros") - if(Vc_GCC_VERSION VERSION_GREATER "4.5.2" AND Vc_GCC_VERSION VERSION_LESS "4.6.4") - # GCC gives bogus "array subscript is above array bounds" warnings in math.cpp - AddCompilerFlag("-Wno-array-bounds") - endif() + foreach(_f -W -Wall -Wswitch -Wformat -Wchar-subscripts -Wparentheses -Wmultichar -Wtrigraphs -Wpointer-arith -Wcast-align -Wreturn-type -pedantic -Wshadow -Wundef) + AddCompilerFlag("${_f}") + endforeach() + foreach(_f -Wold-style-cast) + AddCompilerFlag("${_f}" CXX_FLAGS CMAKE_CXX_FLAGS) + endforeach() endif() - vc_add_compiler_flag(Vc_DEFINITIONS "-Wabi") - vc_add_compiler_flag(Vc_DEFINITIONS "-fabi-version=0") # ABI version 4 is required to make __m128 and __m256 appear as different types. 0 should give us the latest version. + vc_add_compiler_flag(Vc_COMPILE_FLAGS "-Wabi") + vc_add_compiler_flag(Vc_COMPILE_FLAGS "-fabi-version=0") # ABI version 4 is required to make __m128 and __m256 appear as different types. 0 should give us the latest version. + vc_add_compiler_flag(Vc_COMPILE_FLAGS "-fabi-compat-version=0") # GCC 5 introduced this switch + # and defaults it to 2 if -fabi-version is 0. But in that case the bug -fabi-version=0 is + # supposed to fix resurfaces. For now just make sure that it compiles and links. + # Bug report pending. if(_add_buildtype_flags) vc_set_gnu_buildtype_flags() endif() - # GCC 4.5.[01] fail at inlining some functions, creating functions with a single instructions, - # thus creating a large overhead. - if(Vc_GCC_VERSION VERSION_LESS "4.5.2" AND NOT Vc_GCC_VERSION VERSION_LESS "4.5.0") - UserWarning("GCC 4.5.0 and 4.5.1 have problems with inlining correctly. 
Setting early-inlining-insns=12 as workaround.") - AddCompilerFlag("--param early-inlining-insns=12") - endif() - - if(Vc_GCC_VERSION VERSION_LESS "4.1.99") - UserWarning("Your GCC is ancient and crashes on some important optimizations. The full set of SSE2 intrinsics is not supported. Vc will fall back to the scalar implementation. Use of the may_alias and always_inline attributes will be disabled. In turn all code using Vc must be compiled with -fno-strict-aliasing") - vc_add_compiler_flag(Vc_DEFINITIONS "-fno-strict-aliasing") - set(Vc_AVX_INTRINSICS_BROKEN true) - set(Vc_SSE_INTRINSICS_BROKEN true) - elseif(Vc_GCC_VERSION VERSION_LESS "4.4.6") - UserWarning("Your GCC is older than 4.4.6. This is known to cause problems/bugs. Please update to the latest GCC if you can.") - set(Vc_AVX_INTRINSICS_BROKEN true) - if(Vc_GCC_VERSION VERSION_LESS "4.3.0") - UserWarning("Your GCC is older than 4.3.0. It is unable to handle the full set of SSE2 intrinsics. All SSE code will be disabled. Please update to the latest GCC if you can.") - set(Vc_SSE_INTRINSICS_BROKEN true) - endif() - endif() - - if(Vc_GCC_VERSION VERSION_LESS 4.5.0) - UserWarning("GCC 4.4.x shows false positives for -Wparentheses, thus we rather disable the warning.") - string(REPLACE " -Wparentheses " " " CMAKE_C_FLAGS "${CMAKE_C_FLAGS}") - string(REPLACE " -Wparentheses " " " CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}") - set(Vc_DEFINITIONS "${Vc_DEFINITIONS} -Wno-parentheses") - - UserWarning("GCC 4.4.x shows false positives for -Wstrict-aliasing, thus we rather disable the warning. Use a newer GCC for better warnings.") - AddCompilerFlag("-Wno-strict-aliasing") - - UserWarning("GCC 4.4.x shows false positives for -Wuninitialized, thus we rather disable the warning. Use a newer GCC for better warnings.") - AddCompilerFlag("-Wno-uninitialized") - elseif(Vc_GCC_VERSION VERSION_EQUAL 4.6.0) - UserWarning("GCC 4.6.0 miscompiles AVX loads/stores, leading to spurious segfaults. Disabling AVX per default.") - set(Vc_AVX_INTRINSICS_BROKEN true) - elseif(Vc_GCC_VERSION VERSION_EQUAL 4.7.0) - UserWarning("GCC 4.7.0 miscompiles at -O3, adding -fno-predictive-commoning to the compiler flags as workaround") - set(Vc_DEFINITIONS "${Vc_DEFINITIONS} -fno-predictive-commoning") - elseif(Vc_GCC_VERSION VERSION_EQUAL 4.8.0) - UserWarning("GCC 4.8.0 miscompiles at -O3, adding -fno-tree-vectorize to the compiler flags as workaround") - set(Vc_DEFINITIONS "${Vc_DEFINITIONS} -fno-tree-vectorize") + if(APPLE) + # The GNU assembler (as) on Mac OS X is hopelessly outdated. The -q flag + # to as tells it to use the clang assembler, though, which is fine. + # -Wa,-q tells GCC to pass -q to as. + vc_add_compiler_flag(Vc_COMPILE_FLAGS "-Wa,-q") + # Apparently the MacOS clang assember doesn't understand XOP instructions. + set(Vc_XOP_INTRINSICS_BROKEN true) + else() + vc_check_assembler() endif() - - vc_check_fpmath() - vc_check_assembler() elseif(Vc_COMPILER_IS_INTEL) ################################################################################################## # Intel Compiler # @@ -305,32 +263,36 @@ set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "${CMAKE_CXX_FLAGS_RELWITHDEBINFO} -DNDEBUG -O3") set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} -O3") set(CMAKE_C_FLAGS_RELWITHDEBINFO "${CMAKE_C_FLAGS_RELWITHDEBINFO} -DNDEBUG -O3") - - set(ALIAS_FLAGS "-no-ansi-alias") - if(CMAKE_BUILD_TYPE STREQUAL "Release" OR CMAKE_BUILD_TYPE STREQUAL "RelWithDebInfo") - # default ICC to -no-ansi-alias because otherwise tests/utils_sse fails. So far I suspect a miscompilation... 
- set(ENABLE_STRICT_ALIASING false CACHE BOOL "Enables strict aliasing rules for more aggressive optimizations") - if(ENABLE_STRICT_ALIASING) - set(ALIAS_FLAGS "-ansi-alias") - endif(ENABLE_STRICT_ALIASING) - endif() - set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${ALIAS_FLAGS}") - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${ALIAS_FLAGS}") endif() - vc_add_compiler_flag(Vc_DEFINITIONS "-diag-disable 913") + vc_add_compiler_flag(Vc_COMPILE_FLAGS "-diag-disable 913") # Disable warning #13211 "Immediate parameter to intrinsic call too large". (sse/vector.tcc rotated(int)) - vc_add_compiler_flag(Vc_DEFINITIONS "-diag-disable 13211") + vc_add_compiler_flag(Vc_COMPILE_FLAGS "-diag-disable 13211") + vc_add_compiler_flag(Vc_COMPILE_FLAGS "-diag-disable 61") # warning #61: integer operation result is out of range + vc_add_compiler_flag(Vc_COMPILE_FLAGS "-diag-disable 173") # warning #173: floating-point value does not fit in required integral type + vc_add_compiler_flag(Vc_COMPILE_FLAGS "-diag-disable 264") # warning #264: floating-point value does not fit in required floating-point type + if(CMAKE_BUILD_TYPE STREQUAL "Release" OR CMAKE_BUILD_TYPE STREQUAL "RelWithDebInfo") + set(ENABLE_STRICT_ALIASING true CACHE BOOL "Enables strict aliasing rules for more aggressive optimizations") + if(ENABLE_STRICT_ALIASING) + AddCompilerFlag(-ansi-alias CXX_FLAGS Vc_COMPILE_FLAGS) + else() + AddCompilerFlag(-no-ansi-alias CXX_FLAGS Vc_COMPILE_FLAGS) + endif() + endif() if(NOT "$ENV{DASHBOARD_TEST_FROM_CTEST}" STREQUAL "") # disable warning #2928: the __GXX_EXPERIMENTAL_CXX0X__ macro is disabled when using GNU version 4.6 with the c++0x option # this warning just adds noise about problems in the compiler - but I'm only interested in seeing problems in Vc - vc_add_compiler_flag(Vc_DEFINITIONS "-diag-disable 2928") + vc_add_compiler_flag(Vc_COMPILE_FLAGS "-diag-disable 2928") endif() # Intel doesn't implement the XOP or FMA4 intrinsics set(Vc_XOP_INTRINSICS_BROKEN true) set(Vc_FMA4_INTRINSICS_BROKEN true) elseif(Vc_COMPILER_IS_MSVC) + ################################################################################################## + # Microsoft Visual Studio # + ################################################################################################## + if(_add_warning_flags) AddCompilerFlag("/wd4800") # Disable warning "forcing value to bool" AddCompilerFlag("/wd4996") # Disable warning about strdup vs. _strdup @@ -341,116 +303,95 @@ AddCompilerFlag("/wd4748") # Disable warning "/GS can not protect parameters and local variables from local buffer overrun because optimizations are disabled in function" (I don't get it) add_definitions(-D_CRT_SECURE_NO_WARNINGS) endif() + vc_add_compiler_flag(Vc_COMPILE_FLAGS "/Gv") # default to __vectorcall - # MSVC does not support inline assembly on 64 bit! :( - # searching the help for xgetbv doesn't turn up anything. So just fall back to not supporting AVX on Windows :( - # TODO: apparently MSVC 2010 SP1 added _xgetbv - set(Vc_DEFINITIONS "${Vc_DEFINITIONS} -DVC_NO_XGETBV") - - # get rid of the min/max macros - set(Vc_DEFINITIONS "${Vc_DEFINITIONS} -DNOMINMAX") - - # MSVC doesn't implement the XOP or FMA4 intrinsics - set(Vc_XOP_INTRINSICS_BROKEN true) - set(Vc_FMA4_INTRINSICS_BROKEN true) - - if(MSVC_VERSION LESS 1700) - UserWarning("MSVC before 2012 has a broken std::vector::resize implementation. 
STL + Vc code will probably not compile.") + if(MSVC_VERSION LESS 1900) + UserWarning("MSVC before 2015 does not support enough of C++11") endif() elseif(Vc_COMPILER_IS_CLANG) - # for now I don't know of any arguments I want to pass. -march and stuff is tried by OptimizeForArchitecture... - if(Vc_CLANG_VERSION VERSION_EQUAL "3.0") - UserWarning("Clang 3.0 has serious issues to compile Vc code and will most likely crash when trying to do so.\nPlease update to a recent clang version.") - elseif(Vc_CLANG_VERSION VERSION_EQUAL "3.2" AND NOT APPLE) - # the LLVM assembler gets FMAs wrong (bug 15040) - vc_add_compiler_flag(Vc_DEFINITIONS "-no-integrated-as") + ################################################################################################## + # Clang # + ################################################################################################## + + if(Vc_CLANG_VERSION VERSION_GREATER "3.5.99" AND Vc_CLANG_VERSION VERSION_LESS 3.7.0) + UserWarning("Clang 3.6 has serious issues with AVX code generation, frequently losing 50% of the data. AVX is therefore disabled.\nPlease update to a more recent clang version.\n") + set(Vc_AVX_INTRINSICS_BROKEN true) + set(Vc_AVX2_INTRINSICS_BROKEN true) endif() # disable these warnings because clang shows them for function overloads that were discarded via SFINAE - vc_add_compiler_flag(Vc_DEFINITIONS "-Wno-local-type-template-args") - vc_add_compiler_flag(Vc_DEFINITIONS "-Wno-unnamed-type-template-args") - - if(NOT DEFINED Vc_INSIDE_ROOT) # ROOT has to set this up - AddCompilerFlag(-stdlib=libc++) - endif() + vc_add_compiler_flag(Vc_COMPILE_FLAGS "-Wno-local-type-template-args") + vc_add_compiler_flag(Vc_COMPILE_FLAGS "-Wno-unnamed-type-template-args") endif() if(NOT Vc_COMPILER_IS_MSVC) - vc_add_compiler_flag(Vc_DEFINITIONS "-ffp-contract=fast") + vc_add_compiler_flag(Vc_COMPILE_FLAGS "-ffp-contract=fast") endif() OptimizeForArchitecture() - set(Vc_DEFINITIONS "${Vc_ARCHITECTURE_FLAGS} ${Vc_DEFINITIONS}") - - set(VC_IMPL "auto" CACHE STRING "Force the Vc implementation globally to the selected instruction set. \"auto\" lets Vc use the best available instructions.") - if(NOT VC_IMPL STREQUAL "auto") - set(Vc_DEFINITIONS "${Vc_DEFINITIONS} -DVC_IMPL=${VC_IMPL}") - if(NOT VC_IMPL STREQUAL "Scalar") - set(_use_var "USE_${VC_IMPL}") - if(VC_IMPL STREQUAL "SSE") + set(Vc_IMPL "auto" CACHE STRING "Force the Vc implementation globally to the selected instruction set. 
\"auto\" lets Vc use the best available instructions.") + if(NOT Vc_IMPL STREQUAL "auto") + set(Vc_DEFINITIONS "${Vc_DEFINITIONS} -DVc_IMPL=${Vc_IMPL}") + if(NOT Vc_IMPL STREQUAL "Scalar") + set(_use_var "USE_${Vc_IMPL}") + if(Vc_IMPL STREQUAL "SSE") set(_use_var "USE_SSE2") endif() if(NOT ${_use_var}) - message(WARNING "The selected value for VC_IMPL (${VC_IMPL}) will not work because the relevant instructions are not enabled via compiler flags.") + message(WARNING "The selected value for Vc_IMPL (${Vc_IMPL}) will not work because the relevant instructions are not enabled via compiler flags.") endif() endif() endif() endmacro() # helper macro for vc_compile_for_all_implementations -macro(_vc_compile_one_implementation _objs _impl) +macro(_vc_compile_one_implementation _srcs _impl) list(FIND _disabled_targets "${_impl}" _disabled_index) list(FIND _only_targets "${_impl}" _only_index) if(${_disabled_index} EQUAL -1 AND (NOT _only_targets OR ${_only_index} GREATER -1)) set(_extra_flags) set(_ok FALSE) - foreach(_flag ${ARGN}) - if(_flag STREQUAL "NO_FLAG") + foreach(_flags_it ${ARGN}) + if(_flags_it STREQUAL "NO_FLAG") set(_ok TRUE) break() endif() - string(REPLACE " " ";" _flag_list "${_flag}") - foreach(_flag ${_flag_list}) - AddCompilerFlag(${_flag} CXX_RESULT _ok) + string(REPLACE " " ";" _flag_list "${_flags_it}") + foreach(_f ${_flag_list}) + AddCompilerFlag(${_f} CXX_RESULT _ok) if(NOT _ok) break() endif() endforeach() if(_ok) - set(_extra_flags ${_flag_list}) + set(_extra_flags ${_flags_it}) break() endif() endforeach() - set(_outfile_flag -c -o) if(Vc_COMPILER_IS_MSVC) # MSVC for 64bit does not recognize /arch:SSE2 anymore. Therefore we set override _ok if _impl # says SSE if("${_impl}" MATCHES "SSE") set(_ok TRUE) endif() - set(_outfile_flag /c /Fo) endif() if(_ok) get_filename_component(_out "${_vc_compile_src}" NAME_WE) get_filename_component(_ext "${_vc_compile_src}" EXT) - if(Vc_COMPILER_IS_MSVC) - set(_out "${_out}_${_impl}${_ext}.obj") - else() - set(_out "${_out}_${_impl}${_ext}.o") - endif() - add_custom_command(OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/${_out} - COMMAND ${CMAKE_CXX_COMPILER} ${_flags} ${_extra_flags} - -DVC_IMPL=${_impl} - ${_outfile_flag}${_out} ${CMAKE_CURRENT_SOURCE_DIR}/${_vc_compile_src} - MAIN_DEPENDENCY ${CMAKE_CURRENT_SOURCE_DIR}/${_vc_compile_src} - IMPLICIT_DEPENDS CXX ${CMAKE_CURRENT_SOURCE_DIR}/${_vc_compile_src} - COMMENT "Building CXX object ${_out}" - WORKING_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}" - VERBATIM - ) - list(APPEND ${_objs} "${CMAKE_CURRENT_BINARY_DIR}/${_out}") + set(_out "${CMAKE_CURRENT_BINARY_DIR}/${_out}_${_impl}${_ext}") + add_custom_command(OUTPUT "${_out}" + COMMAND ${CMAKE_COMMAND} -E copy "${_vc_compile_src}" "${_out}" + DEPENDS "${_vc_compile_src}" + COMMENT "Copy to ${_out}" + WORKING_DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}" + VERBATIM) + set_source_files_properties( "${_out}" PROPERTIES + COMPILE_DEFINITIONS "Vc_IMPL=${_impl}" + COMPILE_FLAGS "${_flags} ${_extra_flags}" + ) + list(APPEND ${_srcs} "${_out}") endif() endif() endmacro() @@ -461,19 +402,8 @@ # Example: # vc_compile_for_all_implementations(_objs src/trigonometric.cpp FLAGS -DCOMPILE_BLAH EXCLUDE Scalar) # add_executable(executable main.cpp ${_objs}) -macro(vc_compile_for_all_implementations _objs _src) - set(${_objs}) - - # remove all -march, -msse, etc. 
flags from the flags we want to pass - string(REPLACE "${Vc_ARCHITECTURE_FLAGS}" "" _flags "${Vc_DEFINITIONS}") - string(REPLACE "-DVC_IMPL=[^ ]*" "" _flags "${_flags}") - - # capture the -march= switch as -mtune; if there is none skip it - if(Vc_ARCHITECTURE_FLAGS MATCHES "-march=") - string(REGEX REPLACE "^.*-march=([^ ]*).*$" "-mtune=\\1" _tmp "${Vc_ARCHITECTURE_FLAGS}") - set(_flags "${_flags} ${_tmp}") - endif() - +macro(vc_compile_for_all_implementations _srcs _src) + set(_flags) unset(_disabled_targets) unset(_only_targets) set(_state 0) @@ -495,49 +425,35 @@ endif() endforeach() - # make a semicolon separated list of all flags - string(TOUPPER "${CMAKE_BUILD_TYPE}" _tmp) - set(_tmp "CMAKE_CXX_FLAGS_${_tmp}") - string(REPLACE " " ";" _tmp "${CMAKE_CXX_FLAGS} ${${_tmp}} ${_flags}") - set(_flags) - foreach(item ${_tmp}) - if(item MATCHES "^[^']*'[^']*$") - if(_str) - list(APPEND _flags "${_str} ${item}") - unset(_str) - else() - set(_str "${item}") - endif() - else() - list(APPEND _flags "${item}") - endif() - endforeach() - get_directory_property(_inc INCLUDE_DIRECTORIES) - foreach(_i ${_inc}) - list(APPEND _flags "-I${_i}") - endforeach() - set(_vc_compile_src "${_src}") - _vc_compile_one_implementation(${_objs} Scalar NO_FLAG) + _vc_compile_one_implementation(${_srcs} Scalar NO_FLAG) if(NOT Vc_SSE_INTRINSICS_BROKEN) - _vc_compile_one_implementation(${_objs} SSE2 "-msse2" "-xSSE2" "/arch:SSE2") - _vc_compile_one_implementation(${_objs} SSE3 "-msse3" "-xSSE3" "/arch:SSE2") - _vc_compile_one_implementation(${_objs} SSSE3 "-mssse3" "-xSSSE3" "/arch:SSE2") - _vc_compile_one_implementation(${_objs} SSE4_1 "-msse4.1" "-xSSE4.1" "/arch:SSE2") - _vc_compile_one_implementation(${_objs} SSE4_2 "-msse4.2" "-xSSE4.2" "/arch:SSE2") - _vc_compile_one_implementation(${_objs} SSE3+SSE4a "-msse4a") + _vc_compile_one_implementation(${_srcs} SSE2 "-xSSE2" "-msse2" "/arch:SSE2") + _vc_compile_one_implementation(${_srcs} SSE3 "-xSSE3" "-msse3" "/arch:SSE2") + _vc_compile_one_implementation(${_srcs} SSSE3 "-xSSSE3" "-mssse3" "/arch:SSE2") + _vc_compile_one_implementation(${_srcs} SSE4_1 "-xSSE4.1" "-msse4.1" "/arch:SSE2") + _vc_compile_one_implementation(${_srcs} SSE4_2 "-xSSE4.2" "-msse4.2" "/arch:SSE2") + _vc_compile_one_implementation(${_srcs} SSE3+SSE4a "-msse4a") endif() if(NOT Vc_AVX_INTRINSICS_BROKEN) - _vc_compile_one_implementation(${_objs} AVX "-mavx" "-xAVX" "/arch:AVX") + _vc_compile_one_implementation(${_srcs} AVX "-xAVX" "-mavx" "/arch:AVX") if(NOT Vc_XOP_INTRINSICS_BROKEN) if(NOT Vc_FMA4_INTRINSICS_BROKEN) - _vc_compile_one_implementation(${_objs} SSE+XOP+FMA4 "-mxop -mfma4" "" "") - _vc_compile_one_implementation(${_objs} AVX+XOP+FMA4 "-mavx -mxop -mfma4" "" "") + _vc_compile_one_implementation(${_srcs} SSE+XOP+FMA4 "-mxop -mfma4" "" "") + _vc_compile_one_implementation(${_srcs} AVX+XOP+FMA4 "-mavx -mxop -mfma4" "" "") endif() - _vc_compile_one_implementation(${_objs} SSE+XOP+FMA "-mxop -mfma" "" "") - _vc_compile_one_implementation(${_objs} AVX+XOP+FMA "-mavx -mxop -mfma" "" "") + _vc_compile_one_implementation(${_srcs} SSE+XOP+FMA "-mxop -mfma" "" "") + _vc_compile_one_implementation(${_srcs} AVX+XOP+FMA "-mavx -mxop -mfma" "" "") endif() - _vc_compile_one_implementation(${_objs} AVX+FMA "-mavx -mfma" "" "") + _vc_compile_one_implementation(${_srcs} AVX+FMA "-mavx -mfma" "" "") + endif() + if(NOT Vc_AVX2_INTRINSICS_BROKEN) + # The necessary list is not clear to me yet. 
At this point I'll only consider Intel CPUs, in + # which case AVX2 implies the availability of FMA and BMI2 + #_vc_compile_one_implementation(${_srcs} AVX2 "-mavx2") + #_vc_compile_one_implementation(${_srcs} AVX2+BMI2 "-mavx2 -mbmi2") + _vc_compile_one_implementation(${_srcs} AVX2+FMA+BMI2 "-xCORE-AVX2" "-mavx2 -mfma -mbmi2" "/arch:AVX2") + #_vc_compile_one_implementation(${_srcs} AVX2+FMA "-mavx2 -mfma") endif() endmacro() diff -Nru vc-0.7.4/CMakeLists.txt vc-1.3.0/CMakeLists.txt --- vc-0.7.4/CMakeLists.txt 2014-05-15 06:38:24.000000000 -0500 +++ vc-1.3.0/CMakeLists.txt 2016-10-27 02:05:02.000000000 -0500 @@ -1,4 +1,4 @@ -cmake_minimum_required(VERSION 2.8.3) +cmake_minimum_required(VERSION 3.0) if(CMAKE_SOURCE_DIR STREQUAL CMAKE_BINARY_DIR) message(FATAL_ERROR "You don't want to configure in the source directory!") @@ -7,9 +7,6 @@ project(Vc) set(CMAKE_MODULE_PATH "${CMAKE_SOURCE_DIR}/cmake") -set(ROOT_RELEASE FALSE CACHE BOOL "Set up for creating a Vc copy inside ROOT/AliRoot.") -mark_as_advanced(ROOT_RELEASE) - set(disabled_targets) include (VcMacros) @@ -17,82 +14,78 @@ include (OptimizeForArchitecture) vc_determine_compiler() +find_package(MIC) -if(ROOT_RELEASE) - if(EXISTS "${CMAKE_INSTALL_PREFIX}/Module.mk") - file(READ "${CMAKE_INSTALL_PREFIX}/Module.mk" ROOT_MODULE_MK) - if(NOT "${ROOT_MODULE_MK}" MATCHES "\nMODNAME *:= *vc *\n") - message(FATAL_ERROR "CMAKE_INSTALL_PREFIX is incorrect. It must point to the Vc subdirectory inside ROOT/AliRoot") - endif() - set(_extra_namespace "ROOT") +option(USE_CCACHE "If enabled, ccache will be used (if it exists on the system) to speed up recompiles." OFF) +if(USE_CCACHE) + find_program(CCACHE_COMMAND ccache) + if(CCACHE_COMMAND) + mark_as_advanced(CCACHE_COMMAND) + set_property(GLOBAL PROPERTY RULE_LAUNCH_COMPILE "${CCACHE_COMMAND}") endif() - if(EXISTS "${CMAKE_INSTALL_PREFIX}/Vc.cmake") - file(READ "${CMAKE_INSTALL_PREFIX}/Vc.cmake" ALIROOT_VC_CMAKE) - if(NOT "${ALIROOT_VC_CMAKE}" MATCHES "\nmacro\\(ALICE_UseVc\\)\n") - message(FATAL_ERROR "CMAKE_INSTALL_PREFIX is incorrect. It must point to the Vc subdirectory inside ROOT/AliRoot") +endif() + +# TODO: check that 'decltype' compiles +# TODO: check that 'constexpr' compiles +if(NOT Vc_COMPILER_IS_MSVC) # MSVC doesn't provide a switch to turn C++11 on/off AFAIK + AddCompilerFlag("-std=c++14" CXX_RESULT _ok MIC_CXX_RESULT _mic_ok CXX_FLAGS CMAKE_CXX_FLAGS MIC_CXX_FLAGS Vc_MIC_CXX_FLAGS) + if(MIC_NATIVE_FOUND AND NOT _mic_ok) + AddCompilerFlag("-std=c++1y" MIC_CXX_RESULT _mic_ok MIC_CXX_FLAGS Vc_MIC_CXX_FLAGS) + if(NOT _mic_ok) + AddCompilerFlag("-std=c++11" MIC_CXX_RESULT _mic_ok MIC_CXX_FLAGS Vc_MIC_CXX_FLAGS) + if(NOT _mic_ok) + AddCompilerFlag("-std=c++0x" MIC_CXX_RESULT _mic_ok MIC_CXX_FLAGS Vc_MIC_CXX_FLAGS) + if(NOT _mic_ok) + message(FATAL_ERROR "Vc 1.x requires C++11, better even C++14. The MIC native compiler does not support any of the C++11 language flags.") + endif() + endif() endif() - set(_extra_namespace "AliRoot") endif() -else() - if(Vc_COMPILER_IS_GCC) - if(Vc_GCC_VERSION STREQUAL "4.6.0") - UserWarning("GCC 4.6.0 is broken. 
The following tests are therefore disabled: - gather_avx, gather_sse, gather_VC_USE_SET_GATHERS_avx, gather_VC_USE_SET_GATHERS_sse, - gather_sse_LOOP, scatter_avx, and scatter_sse") - list(APPEND disabled_targets - gather_avx - gather_sse - gather_VC_USE_SET_GATHERS_avx - gather_VC_USE_SET_GATHERS_sse - scatter_avx - scatter_sse - c++11_gather_avx - c++11_gather_sse - c++11_gather_VC_USE_SET_GATHERS_avx - c++11_gather_VC_USE_SET_GATHERS_sse - c++11_scatter_avx - c++11_scatter_sse - ) - elseif(Vc_GCC_VERSION STREQUAL "4.5.0" OR Vc_GCC_VERSION STREQUAL "4.5.1") - UserWarning("GCC 4.5.[12] are known to generate an internal compiler error on the memory unit test. - (http://gcc.gnu.org/bugzilla/show_bug.cgi?id=46723) - The test will therefore not be compiled and executed.") - list(APPEND disabled_targets - memory_scalar - memory_sse - memory_avx - c++11_memory_scalar - c++11_memory_sse - c++11_memory_avx - ) - elseif(Vc_GCC_VERSION STREQUAL "4.5.2") - UserWarning("GCC 4.5.2 generates an internal compiler error on the memory_scalar unit test. The test will not be compiled and executed.") - list(APPEND disabled_targets - memory_scalar - c++11_memory_scalar - ) + if(NOT _ok) + AddCompilerFlag("-std=c++1y" CXX_RESULT _ok CXX_FLAGS CMAKE_CXX_FLAGS) + if(NOT _ok) + AddCompilerFlag("-std=c++11" CXX_RESULT _ok CXX_FLAGS CMAKE_CXX_FLAGS) + if(NOT _ok) + AddCompilerFlag("-std=c++0x" CXX_RESULT _ok CXX_FLAGS CMAKE_CXX_FLAGS) + if(NOT _ok) + message(FATAL_ERROR "Vc 1.x requires C++11, better even C++14. It seems this is not available. If this was incorrectly determined please notify vc-devel@compeng.uni-frankfurt.de") + endif() + endif() endif() - elseif(Vc_COMPILER_IS_CLANG) - if(Vc_CLANG_VERSION VERSION_EQUAL "3.0") - UserWarning("Clang 3.0 generates an internal compiler error on the finitediff example. The example will not be compiled.") - list(APPEND disabled_targets - example_finitediff + endif() +elseif(Vc_MSVC_VERSION LESS 180021114) + message(FATAL_ERROR "Vc 1.x requires C++11 support. This requires at least Visual Studio 2013 with the Nov 2013 CTP.") +endif() + +if(Vc_COMPILER_IS_GCC) + if(Vc_GCC_VERSION VERSION_GREATER "5.0.0" AND Vc_GCC_VERSION VERSION_LESS "6.0.0") + UserWarning("GCC 5 goes into an endless loop comiling example_scaling_scalar. Therefore, this target is disabled.") + list(APPEND disabled_targets + example_scaling_scalar + ) + elseif(Vc_GCC_VERSION VERSION_GREATER "5.99") + AddCompilerFlag(-Wno-ignored-attributes) + endif() +elseif(Vc_COMPILER_IS_MSVC) + if(MSVC_VERSION LESS 1700) + # MSVC before 2012 has a broken std::vector::resize implementation. STL + Vc code will probably not compile. + # UserWarning in VcMacros.cmake + list(APPEND disabled_targets + stlcontainer_sse + stlcontainer_avx + ) + endif() + # Disable warning "C++ exception specification ignored except to indicate a function is not __declspec(nothrow)" + # MSVC emits the warning for the _UnitTest_Compare desctructor which needs the throw declaration so that it doesn't std::terminate + AddCompilerFlag("/wd4290") +endif() +if(MIC_NATIVE_FOUND) + if("${Vc_MIC_ICC_VERSION}" VERSION_LESS "16.1.0") + UserWarning("ICC for MIC uses an incompatible STL. Disabling simdize_mic.") + list(APPEND disabled_targets + simdize_mic + example_simdize_mic ) - endif() - elseif(Vc_COMPILER_IS_MSVC) - if(MSVC_VERSION LESS 1700) - # MSVC before 2012 has a broken std::vector::resize implementation. STL + Vc code will probably not compile. 
- # UserWarning in VcMacros.cmake - list(APPEND disabled_targets - stlcontainer_sse - stlcontainer_avx - c++11_stlcontainer_sse - c++11_stlcontainer_avx - ) - endif() - # Disable warning "C++ exception specification ignored except to indicate a function is not __declspec(nothrow)" - # MSVC emits the warning for the _UnitTest_Compare desctructor which needs the throw declaration so that it doesn't std::terminate - AddCompilerFlag("/wd4290") endif() endif() @@ -103,15 +96,13 @@ endif(NOT CMAKE_BUILD_TYPE) vc_set_preferred_compiler_flags(WARNING_FLAGS BUILDTYPE_FLAGS) -add_definitions("${Vc_DEFINITIONS}") -if(Vc_COMPILER_IS_GCC AND Vc_GCC_VERSION VERSION_LESS 4.3.0) - add_definitions(-DVC_DONT_WARN_OLD_GCC) # this warning is only interesting for external users of Vc -endif() + +add_definitions(${Vc_DEFINITIONS}) +add_compile_options(${Vc_COMPILE_FLAGS}) if(Vc_COMPILER_IS_INTEL) # per default icc is not IEEE compliant, but we need that for verification - set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fp-model source") - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fp-model source") + AddCompilerFlag("-fp-model source") endif() if(CMAKE_BUILD_TYPE STREQUAL "" AND NOT CMAKE_CXX_FLAGS MATCHES "-O[123]") @@ -120,86 +111,88 @@ include_directories(${CMAKE_CURRENT_SOURCE_DIR} ${CMAKE_CURRENT_SOURCE_DIR}/include) -if(NOT ROOT_RELEASE) - add_custom_target(other VERBATIM) - add_custom_target(Scalar COMMENT "build Scalar code" VERBATIM) - add_custom_target(SSE COMMENT "build SSE code" VERBATIM) - add_custom_target(AVX COMMENT "build AVX code" VERBATIM) - - set(libvc_compile_flags "-DVC_COMPILE_LIB") - AddCompilerFlag("-fPIC" CXX_FLAGS libvc_compile_flags) - vc_compile_for_all_implementations(_objs src/trigonometric.cpp FLAGS ${libvc_compile_flags} - ONLY SSE2 SSE3 SSSE3 SSE4_1 AVX SSE+XOP+FMA4 AVX+XOP+FMA4 AVX+XOP+FMA AVX+FMA) - set(_srcs src/const.cpp src/cpuid.cpp src/support.cpp ${_objs}) - vc_compile_for_all_implementations(_objs src/avx_sorthelper.cpp FLAGS ${libvc_compile_flags} ONLY AVX) - set(_srcs ${_srcs} ${_objs}) - add_library(Vc STATIC ${_srcs}) - add_target_property(Vc COMPILE_FLAGS ${libvc_compile_flags}) - add_target_property(Vc LABELS "other") - add_dependencies(other Vc) - - install(TARGETS Vc DESTINATION lib${LIB_SUFFIX}) - install(DIRECTORY include/Vc/ DESTINATION include/Vc) - install(DIRECTORY scalar sse avx common DESTINATION include/Vc FILES_MATCHING REGEX "/*.(h|tcc|def)$") +add_custom_target(other VERBATIM) +add_custom_target(Scalar COMMENT "build Scalar code" VERBATIM) +add_custom_target(SSE COMMENT "build SSE code" VERBATIM) +add_custom_target(AVX COMMENT "build AVX code" VERBATIM) +add_custom_target(AVX2 COMMENT "build AVX2 code" VERBATIM) +add_custom_target(MIC COMMENT "build MIC code" VERBATIM) + +AddCompilerFlag(-ftemplate-depth=128 CXX_FLAGS CMAKE_CXX_FLAGS MIC_CXX_FLAGS CMAKE_MIC_CXX_FLAGS) + +set(libvc_compile_flags "-DVc_COMPILE_LIB") +set(libvc_mic_compile_flags "-DVc_COMPILE_LIB") +AddCompilerFlag("-fPIC" CXX_FLAGS libvc_compile_flags MIC_CXX_FLAGS libvc_mic_compile_flags) + +if(MIC_FOUND) + mic_add_library(Vc_MIC STATIC src/mic_const.cpp src/cpuid.cpp src/support_x86.cpp src/mic_sorthelper.cpp + COMPILE_FLAGS ${libvc_mic_compile_flags}) + add_target_property(Vc_MIC LABELS "MIC") + add_dependencies(MIC Vc_MIC) + get_target_property(outputName Vc_MIC OUTPUT_NAME) + install(FILES ${outputName} DESTINATION lib${LIB_SUFFIX}) +endif() + +set(_srcs src/const.cpp) +if("${CMAKE_SYSTEM_PROCESSOR}" MATCHES "([x3-7]86|AMD64)") + + list(APPEND _srcs src/cpuid.cpp src/support_x86.cpp) + 
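The vc_compile_for_all_implementations(<out-list> <source> ...) macro from VcMacros.cmake, shown earlier in this patch, compiles the given source once for each implementation requested via ONLY, applying the matching ICC/GCC/MSVC switch through _vc_compile_one_implementation and appending the generated per-ISA entries to the named output list. A hypothetical sketch of how a downstream project could reuse it (target and file names are placeholders, not taken from the Vc sources):

    # placeholder project code, not part of this patch
    vc_compile_for_all_implementations(kernel_srcs src/kernel.cpp
                                       ONLY SSE2 AVX AVX2+FMA+BMI2)
    add_library(kernels STATIC ${kernel_srcs})
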
vc_compile_for_all_implementations(_srcs src/trigonometric.cpp ONLY SSE2 SSE3 SSSE3 SSE4_1 AVX SSE+XOP+FMA4 AVX+XOP+FMA4 AVX+XOP+FMA AVX+FMA)# AVX2+FMA+BMI2) + vc_compile_for_all_implementations(_srcs src/sse_sorthelper.cpp ONLY SSE2 SSE4_1 AVX AVX2+FMA+BMI2) + vc_compile_for_all_implementations(_srcs src/avx_sorthelper.cpp ONLY AVX AVX2+FMA+BMI2) else() - # libVc should be compiled in the ROOT/AliRoot tree, so we need to install the sources - # - # Sadly there are messed up systems where putting include/Vc in the include paths will - # break the standard library (e.g. MacOS X Lion with case insensitive filesystem). - # Thus, we modify the includes such that include/Vc never needs to be in the path. - file(GLOB_RECURSE _srcs RELATIVE "${CMAKE_SOURCE_DIR}" src/*.cpp examples/*.cpp examples/*.h tests/*.cpp tests/*.h) - foreach(_src ${_srcs}) - message(STATUS "Processing ${CMAKE_SOURCE_DIR}/${_src} -> ${CMAKE_BINARY_DIR}/${_src}") - get_filename_component(_path "${CMAKE_BINARY_DIR}/${_src}" PATH) - file(MAKE_DIRECTORY "${_path}") - execute_process( - COMMAND sed -e "s,#include \\(.\\)\\(common\\|avx\\|sse\\|scalar\\)/,#include \\1Vc/\\2/," - -e "s,::Vc::,::${_extra_namespace}::Vc::,g" - -e "s,/\\*OUTER_NAMESPACE_BEGIN\\*/,namespace ${_extra_namespace} {," - -e "s,/\\*OUTER_NAMESPACE_END\\*/,} // namespace ${_extra_namespace}," - -e "s,/\\*NAMESPACE_ALIAS\\*/,namespace Vc = ${_extra_namespace}::Vc;," - INPUT_FILE ${CMAKE_SOURCE_DIR}/${_src} - OUTPUT_FILE ${CMAKE_BINARY_DIR}/${_src} - ) - endforeach() - - set(includes) - macro(copy_and_set_outer_namespace dst) - foreach(_name ${ARGN}) - set(_dst "${dst}${_name}") - set(_src "${CMAKE_SOURCE_DIR}/${_name}") - get_filename_component(_dir "${_dst}" PATH) - add_custom_command(OUTPUT "${_dst}" - COMMAND mkdir -p "${_dir}" - COMMAND cp "${_src}" "${_dst}" - COMMAND sed -e "s,::Vc::,::${_extra_namespace}::Vc::,g" - -e "s,/\\*OUTER_NAMESPACE_BEGIN\\*/,namespace ${_extra_namespace} {," - -e "s,/\\*OUTER_NAMESPACE_END\\*/,} // namespace ${_extra_namespace}," - -e "s,/\\*NAMESPACE_ALIAS\\*/,namespace Vc = ${_extra_namespace}::Vc;," - -i "${_dst}" - MAIN_DEPENDENCY "${_src}" - COMMENT "Rewrite ${_dst}" - WORKING_DIRECTORY "${CMAKE_BINARY_DIR}" - VERBATIM) - list(APPEND includes "${_dst}") - endforeach() - endmacro() - - file(GLOB_RECURSE _srcs RELATIVE "${CMAKE_SOURCE_DIR}" include/*.h include/*.tcc include/*.def) - file(GLOB _src2 RELATIVE "${CMAKE_SOURCE_DIR}" include/Vc/*) - list(APPEND _srcs ${_src2}) - list(REMOVE_DUPLICATES _srcs) - copy_and_set_outer_namespace("" "${_srcs}") - - foreach(_dir in scalar sse avx common) - file(GLOB_RECURSE _srcs RELATIVE "${CMAKE_SOURCE_DIR}" ${_dir}/*.h ${_dir}/*.tcc ${_dir}/*.def) - copy_and_set_outer_namespace("include/Vc/" "${_srcs}") - endforeach() - add_custom_target(rewrite ALL DEPENDS ${includes}) + message(FATAL_ERROR "Unsupported target architecture '${CMAKE_SYSTEM_PROCESSOR}'. 
No support_???.cpp file exists for this architecture.") endif() +add_library(Vc STATIC ${_srcs}) +set_property(TARGET Vc APPEND PROPERTY COMPILE_OPTIONS ${libvc_compile_flags}) +add_target_property(Vc LABELS "other") +if(XCODE) + # TODO: document what this does and why it has no counterpart in the non-XCODE logic + set_target_properties(Vc PROPERTIES XCODE_ATTRIBUTE_GCC_INLINES_ARE_PRIVATE_EXTERN "NO") + set_target_properties(Vc PROPERTIES XCODE_ATTRIBUTE_GCC_SYMBOLS_PRIVATE_EXTERN "YES") + set_target_properties(Vc PROPERTIES XCODE_ATTRIBUTE_CLANG_CXX_LANGUAGE_STANDARD "c++0x") + set_target_properties(Vc PROPERTIES XCODE_ATTRIBUTE_CLANG_CXX_LIBRARY "libc++") +elseif(UNIX AND Vc_COMPILER_IS_CLANG) + # On UNIX (Linux) the standard library used by default typically is libstdc++ (GCC). + # To get the full clang deal we rather want to build against libc++. This requires + # additionally the libc++abi and libsupc++ libraries in all linker invokations. + option(USE_LIBC++ "Use libc++ instead of the system default C++ standard library." ON) + if(USE_LIBC++) + AddCompilerFlag(-stdlib=libc++ CXX_FLAGS CMAKE_CXX_FLAGS CXX_RESULT _use_libcxx) + if(_use_libcxx) + find_library(LIBC++ABI c++abi) + mark_as_advanced(LIBC++ABI) + if(LIBC++ABI) + set(CMAKE_REQUIRED_LIBRARIES "${LIBC++ABI};supc++") + CHECK_CXX_SOURCE_COMPILES("#include + #include + void foo() { + std::cout << 'h' << std::flush << std::endl; + throw std::exception(); + } + int main() { + try { foo(); } + catch (int) { return 0; } + return 1; + }" libcxx_compiles) + unset(CMAKE_REQUIRED_LIBRARIES) + if(libcxx_compiles) + link_libraries(${LIBC++ABI} supc++) + endif() + endif() + endif() + endif() +endif() +add_dependencies(other Vc) + +install(TARGETS Vc DESTINATION lib${LIB_SUFFIX}) +install(DIRECTORY include/Vc/ DESTINATION include/Vc) + +# Install all implementation headers +install(DIRECTORY scalar sse avx mic common traits DESTINATION include/Vc FILES_MATCHING REGEX "/*.(h|tcc|def)$") # read version parts from version.h to be put into VcConfig.cmake -file(STRINGS ${CMAKE_CURRENT_SOURCE_DIR}/include/Vc/version.h _version_lines REGEX "^#define VC_VERSION_STRING ") +file(STRINGS ${CMAKE_CURRENT_SOURCE_DIR}/include/Vc/version.h _version_lines REGEX "^#define Vc_VERSION_STRING ") string(REGEX MATCH "([0-9]+)\\.([0-9]+)\\.([0-9]+)" _version_matches "${_version_lines}") set(Vc_VERSION_MAJOR ${CMAKE_MATCH_1}) set(Vc_VERSION_MINOR ${CMAKE_MATCH_2}) @@ -210,55 +203,49 @@ configure_file(${CMAKE_CURRENT_SOURCE_DIR}/cmake/VcConfigVersion.cmake.in ${CMAKE_CURRENT_BINARY_DIR}/cmake/VcConfigVersion.cmake @ONLY) -set(cmake_install_files +install(FILES cmake/UserWarning.cmake cmake/VcMacros.cmake cmake/AddCompilerFlag.cmake cmake/CheckCCompilerFlag.cmake cmake/CheckCXXCompilerFlag.cmake + cmake/CheckMicCCompilerFlag.cmake + cmake/CheckMicCXXCompilerFlag.cmake + cmake/FindMIC.cmake + cmake/OptimizeForArchitecture.cmake + cmake/FindVc.cmake + ${CMAKE_CURRENT_BINARY_DIR}/cmake/VcConfig.cmake + ${CMAKE_CURRENT_BINARY_DIR}/cmake/VcConfigVersion.cmake + DESTINATION lib${LIB_SUFFIX}/cmake/Vc ) -if(ROOT_RELEASE) - execute_process( - COMMAND sed "s, \"auto\" CACHE, \"none\" CACHE," - INPUT_FILE ${CMAKE_SOURCE_DIR}/cmake/OptimizeForArchitecture.cmake - OUTPUT_FILE ${CMAKE_BINARY_DIR}/cmake/OptimizeForArchitecture.cmake - ) - install(FILES - ${cmake_install_files} - cmake/AddTargetProperty.cmake - ${CMAKE_BINARY_DIR}/cmake/OptimizeForArchitecture.cmake - DESTINATION cmake + +option(BUILD_TESTING "Build the testing tree." 
OFF) +include (CTest) +configure_file(${CMAKE_SOURCE_DIR}/CTestCustom.cmake ${CMAKE_BINARY_DIR}/CTestCustom.cmake COPYONLY) +if(BUILD_TESTING) + add_custom_target(build_tests ALL VERBATIM) + add_subdirectory(tests) +endif(BUILD_TESTING) + +set(BUILD_EXAMPLES FALSE CACHE BOOL "Build examples.") +if(BUILD_EXAMPLES) + add_subdirectory(examples) +endif(BUILD_EXAMPLES) + +# Hide Vc_IMPL as it is only meant for users of Vc +mark_as_advanced(Vc_IMPL) + +find_program(BIN_CAT cat) +mark_as_advanced(BIN_CAT) +if(BIN_CAT) + file(REMOVE ${CMAKE_BINARY_DIR}/help.txt) + add_custom_command(OUTPUT ${CMAKE_BINARY_DIR}/help.txt + COMMAND ${CMAKE_MAKE_PROGRAM} help > ${CMAKE_BINARY_DIR}/help.txt + VERBATIM ) - install(DIRECTORY ${CMAKE_BINARY_DIR}/examples/ DESTINATION examples) - install(DIRECTORY ${CMAKE_BINARY_DIR}/tests/ DESTINATION tests) - install(FILES tests/CMakeLists.txt tests/download.cmake DESTINATION tests) - install(DIRECTORY ${CMAKE_BINARY_DIR}/src/ DESTINATION src) - install(DIRECTORY ${CMAKE_BINARY_DIR}/include/Vc/ DESTINATION include/Vc) - install(DIRECTORY examples/ DESTINATION examples FILES_MATCHING PATTERN CMakeLists.txt) -else() - install(FILES - ${cmake_install_files} - ${CMAKE_CURRENT_BINARY_DIR}/cmake/VcConfig.cmake - ${CMAKE_CURRENT_BINARY_DIR}/cmake/VcConfigVersion.cmake - cmake/OptimizeForArchitecture.cmake - cmake/FindVc.cmake - DESTINATION lib/cmake/Vc + add_custom_target(cached_help + ${BIN_CAT} ${CMAKE_BINARY_DIR}/help.txt + DEPENDS ${CMAKE_BINARY_DIR}/help.txt + VERBATIM ) endif() - -if(NOT ROOT_RELEASE) - include (CTest) - configure_file(${CMAKE_SOURCE_DIR}/CTestCustom.cmake ${CMAKE_BINARY_DIR}/CTestCustom.cmake COPYONLY) - if(BUILD_TESTING) - add_custom_target(build_tests VERBATIM) - add_subdirectory(tests) - endif(BUILD_TESTING) - - set(BUILD_EXAMPLES FALSE CACHE BOOL "Build examples.") - if(BUILD_EXAMPLES) - add_subdirectory(examples) - endif(BUILD_EXAMPLES) -endif() - -# Hide VC_IMPL as it is only meant for users of Vc -mark_as_advanced(VC_IMPL) diff -Nru vc-0.7.4/common/algorithms.h vc-1.3.0/common/algorithms.h --- vc-0.7.4/common/algorithms.h 1969-12-31 18:00:00.000000000 -0600 +++ vc-1.3.0/common/algorithms.h 2016-10-27 02:05:02.000000000 -0500 @@ -0,0 +1,210 @@ +/* This file is part of the Vc library. {{{ +Copyright © 2013-2015 Matthias Kretz + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the names of contributing organizations nor the + names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY +DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +}}}*/ + +#ifndef VC_COMMON_ALGORITHMS_H_ +#define VC_COMMON_ALGORITHMS_H_ + +#include "macros.h" + +namespace Vc_VERSIONED_NAMESPACE +{ +/** + * \ingroup Utilities + * + * \name Boolean Reductions + */ +//@{ +/** \ingroup Utilities + * Returns whether all entries in the mask \p m are \c true. + */ +template constexpr bool all_of(const Mask &m) { return m.isFull(); } +/** \ingroup Utilities + * Returns \p b + */ +constexpr bool all_of(bool b) { return b; } + +/** \ingroup Utilities + * Returns whether at least one entry in the mask \p m is \c true. + */ +template constexpr bool any_of(const Mask &m) { return m.isNotEmpty(); } +/** \ingroup Utilities + * Returns \p b + */ +constexpr bool any_of(bool b) { return b; } + +/** \ingroup Utilities + * Returns whether all entries in the mask \p m are \c false. + */ +template constexpr bool none_of(const Mask &m) { return m.isEmpty(); } +/** \ingroup Utilities + * Returns \p !b + */ +constexpr bool none_of(bool b) { return !b; } + +/** \ingroup Utilities + * Returns whether at least one entry in \p m is \c true and at least one entry in \p m is \c + * false. + */ +template constexpr bool some_of(const Mask &m) { return m.isMix(); } +/** \ingroup Utilities + * Returns \c false + */ +constexpr bool some_of(bool) { return false; } +//@} + +template +inline enable_if::value && + Traits::is_functor_argument_immutable< + UnaryFunction, Vector>::value, + UnaryFunction> +simd_for_each(InputIt first, InputIt last, UnaryFunction f) +{ + typedef Vector V; + typedef Scalar::Vector V1; + for (; reinterpret_cast(std::addressof(*first)) & + (V::MemoryAlignment - 1) && + first != last; + ++first) { + f(V1(std::addressof(*first), Vc::Aligned)); + } + const auto lastV = last - (V::Size + 1); + for (; first < lastV; first += V::Size) { + f(V(std::addressof(*first), Vc::Aligned)); + } + for (; first != last; ++first) { + f(V1(std::addressof(*first), Vc::Aligned)); + } + return std::move(f); +} + +template +inline enable_if::value && + !Traits::is_functor_argument_immutable< + UnaryFunction, Vector>::value, + UnaryFunction> +simd_for_each(InputIt first, InputIt last, UnaryFunction f) +{ + typedef Vector V; + typedef Scalar::Vector V1; + for (; reinterpret_cast(std::addressof(*first)) & + (V::MemoryAlignment - 1) && + first != last; + ++first) { + V1 tmp(std::addressof(*first), Vc::Aligned); + f(tmp); + tmp.store(std::addressof(*first), Vc::Aligned); + } + const auto lastV = last - (V::Size + 1); + for (; first < lastV; first += V::Size) { + V tmp(std::addressof(*first), Vc::Aligned); + f(tmp); + tmp.store(std::addressof(*first), Vc::Aligned); + } + for (; first != last; ++first) { + V1 tmp(std::addressof(*first), Vc::Aligned); + f(tmp); + tmp.store(std::addressof(*first), Vc::Aligned); + } + return std::move(f); +} + +template +inline enable_if::value, UnaryFunction> +simd_for_each(InputIt first, InputIt last, UnaryFunction f) +{ + return std::for_each(first, last, std::move(f)); +} + +/////////////////////////////////////////////////////////////////////////////// 
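The simd_for_each overloads above pick between a load-only path for functors that do not modify their argument, a load-modify-store path for functors that do, and a plain std::for_each fallback otherwise; the unaligned head and the tail of the range are handled with scalar-width vectors, the aligned middle with full-width chunks. A minimal usage sketch, not part of this patch, assuming the <Vc/Vc> umbrella header, a contiguous std::vector<float>, and C++14 generic lambdas (the functor must accept both the full-width and the scalar-width vector types):

    #include <Vc/Vc>
    #include <vector>

    void scale(std::vector<float> &data)
    {
        // the mutable reference parameter selects the load-modify-store
        // overload, so the doubled chunks are written back to the container
        Vc::simd_for_each(data.begin(), data.end(),
                          [](auto &v) { v *= 2.f; });
    }

The boolean reductions declared in the same hunk reduce masks to a plain bool, e.g. Vc::any_of(v > 0.f) for a Vc::float_v v.
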
+template +inline enable_if::value && + Traits::is_functor_argument_immutable< + UnaryFunction, Vector>::value, + UnaryFunction> +simd_for_each_n(InputIt first, std::size_t count, UnaryFunction f) +{ + typename std::make_signed::type len = count; + typedef Vector V; + typedef Scalar::Vector V1; + for (; reinterpret_cast(std::addressof(*first)) & + (V::MemoryAlignment - 1) && + len != 0; + --len, ++first) { + f(V1(std::addressof(*first), Vc::Aligned)); + } + for (; len >= int(V::Size); len -= V::Size, first += V::Size) { + f(V(std::addressof(*first), Vc::Aligned)); + } + for (; len != 0; --len, ++first) { + f(V1(std::addressof(*first), Vc::Aligned)); + } + return std::move(f); +} + +template +inline enable_if::value && + !Traits::is_functor_argument_immutable< + UnaryFunction, Vector>::value, + UnaryFunction> +simd_for_each_n(InputIt first, std::size_t count, UnaryFunction f) +{ + typename std::make_signed::type len = count; + typedef Vector V; + typedef Scalar::Vector V1; + for (; reinterpret_cast(std::addressof(*first)) & + (V::MemoryAlignment - 1) && + len != 0; + --len, ++first) { + V1 tmp(std::addressof(*first), Vc::Aligned); + f(tmp); + tmp.store(std::addressof(*first), Vc::Aligned); + } + for (; len >= int(V::Size); len -= V::Size, first += V::Size) { + V tmp(std::addressof(*first), Vc::Aligned); + f(tmp); + tmp.store(std::addressof(*first), Vc::Aligned); + } + for (; len != 0; --len, ++first) { + V1 tmp(std::addressof(*first), Vc::Aligned); + f(tmp); + tmp.store(std::addressof(*first), Vc::Aligned); + } + return std::move(f); +} + +#ifdef Vc_CXX17 +template +inline enable_if::value, UnaryFunction> +simd_for_each_n(InputIt first, std::size_t count, UnaryFunction f) +{ + return std::for_each_n(first, count, std::move(f)); +} +#endif + +} // namespace Vc + +#endif // VC_COMMON_ALGORITHMS_H_ diff -Nru vc-0.7.4/common/aliasingentryhelper.h vc-1.3.0/common/aliasingentryhelper.h --- vc-0.7.4/common/aliasingentryhelper.h 2014-05-15 06:38:24.000000000 -0500 +++ vc-1.3.0/common/aliasingentryhelper.h 2016-10-27 02:05:02.000000000 -0500 @@ -1,29 +1,36 @@ -/* This file is part of the Vc library. +/* This file is part of the Vc library. {{{ +Copyright © 2010-2015 Matthias Kretz - Copyright (C) 2010-2011 Matthias Kretz +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the names of contributing organizations nor the + names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY +DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - Vc is free software: you can redistribute it and/or modify - it under the terms of the GNU Lesser General Public License as - published by the Free Software Foundation, either version 3 of - the License, or (at your option) any later version. +}}}*/ - Vc is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with Vc. If not, see . - -*/ - -#ifndef VC_COMMON_ALIASINGENTRYHELPER_H -#define VC_COMMON_ALIASINGENTRYHELPER_H +#ifndef VC_COMMON_ALIASINGENTRYHELPER_H_ +#define VC_COMMON_ALIASINGENTRYHELPER_H_ #include "macros.h" -/*OUTER_NAMESPACE_BEGIN*/ -namespace Vc +namespace Vc_VERSIONED_NAMESPACE { namespace Common { @@ -32,12 +39,13 @@ { private: typedef typename StorageType::EntryType T; -#ifdef VC_ICC +#ifdef Vc_ICC StorageType *const m_storage; const int m_index; public: Vc_ALWAYS_INLINE AliasingEntryHelper(StorageType *d, int index) : m_storage(d), m_index(index) {} - Vc_ALWAYS_INLINE AliasingEntryHelper(const AliasingEntryHelper &rhs) : m_storage(rhs.m_storage), m_index(rhs.m_index) {} + Vc_ALWAYS_INLINE AliasingEntryHelper(const AliasingEntryHelper &) = default; + Vc_ALWAYS_INLINE AliasingEntryHelper(AliasingEntryHelper &&) = default; Vc_ALWAYS_INLINE AliasingEntryHelper &operator=(const AliasingEntryHelper &rhs) { m_storage->assign(m_index, rhs); return *this; @@ -54,11 +62,6 @@ Vc_ALWAYS_INLINE AliasingEntryHelper &operator %=(T x) { m_storage->assign(m_index, m_storage->m(m_index) % x); return *this; } Vc_ALWAYS_INLINE AliasingEntryHelper &operator<<=(T x) { m_storage->assign(m_index, m_storage->m(m_index)<< x); return *this; } Vc_ALWAYS_INLINE AliasingEntryHelper &operator>>=(T x) { m_storage->assign(m_index, m_storage->m(m_index)>> x); return *this; } - - Vc_ALWAYS_INLINE AliasingEntryHelper &operator++() { m_storage->assign(m_index, m_storage->m(m_index) + T(1)); return *this; } - Vc_ALWAYS_INLINE T operator++(int) { T r = m_storage->m(m_index); m_storage->assign(m_index, m_storage->m(m_index) + T(1)); return r; } - Vc_ALWAYS_INLINE AliasingEntryHelper &operator--() { m_storage->assign(m_index, m_storage->m(m_index) - T(1)); return *this; } - Vc_ALWAYS_INLINE T operator--(int) { T r = m_storage->m(m_index); m_storage->assign(m_index, m_storage->m(m_index) - T(1)); return r; } #define m_data m_storage->read(m_index) #else typedef T A Vc_MAY_ALIAS; @@ -84,11 +87,6 @@ Vc_ALWAYS_INLINE AliasingEntryHelper &operator%=(T x) { m_data %= x; return *this; } Vc_ALWAYS_INLINE AliasingEntryHelper &operator<<=(T x) { m_data <<= x; return *this; } Vc_ALWAYS_INLINE AliasingEntryHelper &operator>>=(T x) { m_data >>= x; return *this; } - - Vc_ALWAYS_INLINE AliasingEntryHelper &operator++() { ++m_data; return *this; } - Vc_ALWAYS_INLINE T operator++(int) { T r = m_data; ++m_data; return r; } - Vc_ALWAYS_INLINE AliasingEntryHelper 
&operator--() { --m_data; return *this; } - Vc_ALWAYS_INLINE T operator--(int) { T r = m_data; --m_data; return r; } #endif Vc_ALWAYS_INLINE Vc_PURE operator const T() const { return m_data; } @@ -117,10 +115,7 @@ #endif }; -} // namespace Common -} // namespace Vc -/*OUTER_NAMESPACE_END*/ - -#include "undomacros.h" +} // namespace Common +} // namespace Vc -#endif // VC_COMMON_ALIASINGENTRYHELPER_H +#endif // VC_COMMON_ALIASINGENTRYHELPER_H_ diff -Nru vc-0.7.4/common/alignedbase.h vc-1.3.0/common/alignedbase.h --- vc-0.7.4/common/alignedbase.h 1969-12-31 18:00:00.000000000 -0600 +++ vc-1.3.0/common/alignedbase.h 2016-10-27 02:05:02.000000000 -0500 @@ -0,0 +1,137 @@ +/* This file is part of the Vc library. {{{ +Copyright © 2015 Matthias Kretz + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the names of contributing organizations nor the + names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY +DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +}}}*/ + +#ifndef VC_COMMON_ALIGNEDBASE_H_ +#define VC_COMMON_ALIGNEDBASE_H_ + +#include "types.h" +#include "macros.h" + +namespace Vc_VERSIONED_NAMESPACE +{ +namespace Detail +{ +/**\internal + * Break the recursion of the function below. + */ +template constexpr T max(T a) { return a; } +/**\internal + * \returns the maximum of all specified arguments. + */ +template constexpr T max(T a, T b, Ts... rest) +{ + return a > b ? max(a, rest...) : max(b, rest...); +} +} // namespace Detail +namespace Common +{ +template Vc_INTRINSIC void *aligned_malloc(std::size_t); +Vc_ALWAYS_INLINE void free(void *); +} // namespace Common + +/** + * \ingroup Utilities + * + * Helper class to ensure a given alignment. + * + * This class reimplements the \c new and \c delete operators to align objects allocated + * on the heap suitably with the specified alignment \c Alignment. + * + * \see Vc::VectorAlignedBase + * \see Vc::MemoryAlignedBase + */ +template struct alignas(Alignment) AlignedBase +{ + Vc_FREE_STORE_OPERATORS_ALIGNED(Alignment); +}; + +/** + * \ingroup Utilities + * + * Helper type to ensure suitable alignment for any Vc::Vector type (using the default + * VectorAbi). + * + * This class reimplements the \c new and \c delete operators to align objects allocated + * on the heap suitably for objects of Vc::Vector type. 
This is necessary since the + * standard \c new operator does not adhere to the alignment requirements of the type. + * + * \see Vc::VectorAlignedBaseT + * \see Vc::MemoryAlignedBase + * \see Vc::AlignedBase + */ +using VectorAlignedBase = AlignedBase< + Detail::max(alignof(Vector), alignof(Vector), alignof(Vector), + alignof(Vector), alignof(Vector), alignof(Vector), + alignof(Vector), alignof(Vector), alignof(Vector), + alignof(Vector), alignof(Vector), alignof(Vector))>; + +/** + * \ingroup Utilities + * Variant of the above type ensuring suitable alignment only for the specified vector + * type \p V. + * + * \see Vc::VectorAlignedBase + * \see Vc::MemoryAlignedBaseT + */ +template using VectorAlignedBaseT = AlignedBase; + +/** + * \ingroup Utilities + * + * Helper class to ensure suitable alignment for arrays of scalar objects for any + * Vc::Vector type (using the default VectorAbi). + * + * This class reimplements the \c new and \c delete operators to align objects allocated + * on the heap suitably for arrays of type \p Vc::Vector::EntryType. Subsequent load + * and store operations are safe to use the aligned variant. + * + * \see Vc::MemoryAlignedBaseT + * \see Vc::VectorAlignedBase + * \see Vc::AlignedBase + */ +using MemoryAlignedBase = AlignedBase< + Detail::max(Vector::MemoryAlignment, Vector::MemoryAlignment, + Vector::MemoryAlignment, Vector::MemoryAlignment, + Vector::MemoryAlignment, Vector::MemoryAlignment, + Vector::MemoryAlignment, Vector::MemoryAlignment, + Vector::MemoryAlignment, Vector::MemoryAlignment, + Vector::MemoryAlignment, Vector::MemoryAlignment)>; + +/** + * \ingroup Utilities + * Variant of the above type ensuring suitable alignment only for the specified vector + * type \p V. + * + * \see Vc::MemoryAlignedBase + * \see Vc::VectorAlignedBaseT + */ +template using MemoryAlignedBaseT = AlignedBase; +} + +#endif // VC_COMMON_ALIGNEDBASE_H_ + +// vim: foldmethod=marker diff -Nru vc-0.7.4/common/bitscanintrinsics.h vc-1.3.0/common/bitscanintrinsics.h --- vc-0.7.4/common/bitscanintrinsics.h 2014-05-15 06:38:24.000000000 -0500 +++ vc-1.3.0/common/bitscanintrinsics.h 2016-10-27 02:05:02.000000000 -0500 @@ -1,27 +1,35 @@ -/* This file is part of the Vc library. +/* This file is part of the Vc library. {{{ +Copyright © 2011-2015 Matthias Kretz - Copyright (C) 2011-2012 Matthias Kretz +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the names of contributing organizations nor the + names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY +DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - Vc is free software: you can redistribute it and/or modify - it under the terms of the GNU Lesser General Public License as - published by the Free Software Foundation, either version 3 of - the License, or (at your option) any later version. +}}}*/ - Vc is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU Lesser General Public License for more details. +#ifndef VC_COMMON_BITSCANINTRINSICS_H_ +#define VC_COMMON_BITSCANINTRINSICS_H_ - You should have received a copy of the GNU Lesser General Public - License along with Vc. If not, see . - -*/ - -#ifndef VC_COMMON_BITSCANINTRINSICS_H -#define VC_COMMON_BITSCANINTRINSICS_H - -#if defined(VC_GCC) || defined(VC_CLANG) -# if VC_GCC >= 0x40500 +#if defined(Vc_GCC) || defined(Vc_CLANG) || defined(Vc_APPLECLANG) +# if Vc_GCC >= 0x40500 // GCC 4.5.0 introduced _bit_scan_forward / _bit_scan_reverse # include # else @@ -33,30 +41,25 @@ __asm__("bsr %1,%0" : "=r"(r) : "X"(x)); return r; } -#include "undomacros.h" # define _bit_scan_reverse(x) _Vc_bit_scan_reverse_asm(x) # endif -#elif defined(VC_ICC) -// for all I know ICC supports the _bit_scan_* intrinsics -#elif defined(VC_OPEN64) -// TODO -#elif defined(VC_MSVC) -#include "windows_fix_intrin.h" -#pragma intrinsic(_BitScanForward) -#pragma intrinsic(_BitScanReverse) +#elif defined(_WIN32) +#include "intrin.h" static inline __forceinline unsigned long _bit_scan_forward(unsigned long x) { - unsigned long index; - _BitScanForward(&index, x); - return index; + unsigned long index; + _BitScanForward(&index, x); + return index; } static inline __forceinline unsigned long _bit_scan_reverse(unsigned long x) { - unsigned long index; - _BitScanReverse(&index, x); - return index; + unsigned long index; + _BitScanReverse(&index, x); + return index; } +#elif defined(Vc_ICC) +// for all I know ICC supports the _bit_scan_* intrinsics #else // just assume the compiler can do it #endif -#endif // VC_COMMON_BITSCANINTRINSICS_H +#endif // VC_COMMON_BITSCANINTRINSICS_H_ diff -Nru vc-0.7.4/common/const.h vc-1.3.0/common/const.h --- vc-0.7.4/common/const.h 1969-12-31 18:00:00.000000000 -0600 +++ vc-1.3.0/common/const.h 2016-10-27 02:05:02.000000000 -0500 @@ -0,0 +1,92 @@ +/* This file is part of the Vc library. {{{ +Copyright © 2013-2015 Matthias Kretz + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. 
+ * Neither the names of contributing organizations nor the + names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY +DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +}}}*/ + +#ifndef VC_COMMON_CONST_H_ +#define VC_COMMON_CONST_H_ + +#include +#include + +namespace Vc_VERSIONED_NAMESPACE +{ +namespace Detail +{ + +template constexpr double exponentToFloat(std::integral_constant); +template constexpr double exponentToFloat(std::integral_constant); +template <> constexpr double exponentToFloat<0>(std::integral_constant) +{ + return 1.; +} +template <> constexpr double exponentToFloat<0>(std::integral_constant) +{ + return 1.; +} +template <> constexpr double exponentToFloat<-32>(std::integral_constant) +{ + return 1. / (65536. * 65536.); +} +template <> constexpr double exponentToFloat<32>(std::integral_constant) +{ + return 65536. * 65536.; +} +template <> constexpr double exponentToFloat<-64>(std::integral_constant) +{ + return 1. / (65536. * 65536. * 65536. * 65536.); +} +template <> constexpr double exponentToFloat<64>(std::integral_constant) +{ + return 65536. * 65536. * 65536. * 65536.; +} +template +constexpr double exponentToFloat(std::integral_constant negative) +{ + return exponentToFloat(negative) * 2.0; +} +template +constexpr double exponentToFloat(std::integral_constant negative) +{ + return exponentToFloat(negative) * 0.5; +} +template constexpr double doubleConstant() +{ + return (static_cast((mantissa & 0x000fffffffffffffull) | 0x0010000000000000ull) / + 0x0010000000000000ull) * + exponentToFloat(std::integral_constant()) * sign; +} +template constexpr float floatConstant() +{ + return (static_cast((mantissa & 0x007fffffu) | 0x00800000u) / 0x00800000u) * + static_cast( + exponentToFloat(std::integral_constant())) * + sign; +} + +} // namespace Detail +} // namespace Vc + +#endif // VC_COMMON_CONST_H_ diff -Nru vc-0.7.4/common/data.h vc-1.3.0/common/data.h --- vc-0.7.4/common/data.h 1969-12-31 18:00:00.000000000 -0600 +++ vc-1.3.0/common/data.h 2016-10-27 02:05:02.000000000 -0500 @@ -0,0 +1,43 @@ +/* This file is part of the Vc library. {{{ +Copyright © 2013-2015 Matthias Kretz + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. 
+ * Neither the names of contributing organizations nor the + names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY +DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +}}}*/ + +#ifndef VC_COMMON_CONST_DATA_H_ +#define VC_COMMON_CONST_DATA_H_ + +#include "macros.h" +namespace Vc_VERSIONED_NAMESPACE +{ +namespace Common +{ + +alignas(64) extern unsigned int RandomState[]; +alignas(32) extern const unsigned int AllBitsSet[8]; + +} // namespace Common +} // namespace Vc + +#endif // VC_COMMON_CONST_DATA_H_ diff -Nru vc-0.7.4/common/deinterleave.h vc-1.3.0/common/deinterleave.h --- vc-0.7.4/common/deinterleave.h 2014-05-15 06:38:24.000000000 -0500 +++ vc-1.3.0/common/deinterleave.h 2016-10-27 02:05:02.000000000 -0500 @@ -1,34 +1,43 @@ -/* This file is part of the Vc library. +/* This file is part of the Vc library. {{{ +Copyright © 2010-2015 Matthias Kretz - Copyright (C) 2010-2011 Matthias Kretz +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the names of contributing organizations nor the + names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY +DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - Vc is free software: you can redistribute it and/or modify - it under the terms of the GNU Lesser General Public License as - published by the Free Software Foundation, either version 3 of - the License, or (at your option) any later version. 
+}}}*/ - Vc is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with Vc. If not, see . - -*/ - -#ifndef VC_COMMON_DEINTERLEAVE_H -#define VC_COMMON_DEINTERLEAVE_H +#ifndef VC_COMMON_DEINTERLEAVE_H_ +#define VC_COMMON_DEINTERLEAVE_H_ #include "macros.h" -/*OUTER_NAMESPACE_BEGIN*/ -namespace Vc +namespace Vc_VERSIONED_NAMESPACE { /** * \ingroup Vectors * + * \deprecated Turn to InterleavedMemoryWrapper for a more flexible and complete solution. + * * Loads two vectors of values from an interleaved array. * * \param a, b The vectors to load the values from memory into. @@ -53,8 +62,6 @@ =========|=======|========|========|=======|======|===== float_v | X | | X | X | | ---------|-------|--------|--------|-------|------|----- -sfloat_v | X | | X | X | | ----------|-------|--------|--------|-------|------|----- double_v | | X | | | | ---------|-------|--------|--------|-------|------|----- int_v | | | | X | | X @@ -69,19 +76,16 @@ template Vc_ALWAYS_INLINE void deinterleave(V *a, V *b, const M *memory, A align) { - Internal::Helper::deinterleave(*a, *b, memory, align); + Detail::deinterleave(*a, *b, memory, align); } // documented as default for align above template Vc_ALWAYS_INLINE void deinterleave(V *a, V *b, const M *memory) { - Internal::Helper::deinterleave(*a, *b, memory, Aligned); + Detail::deinterleave(*a, *b, memory, Aligned); } -} // namespace Vc -/*OUTER_NAMESPACE_END*/ - -#include "undomacros.h" +} // namespace Vc -#endif // VC_COMMON_DEINTERLEAVE_H +#endif // VC_COMMON_DEINTERLEAVE_H_ diff -Nru vc-0.7.4/common/elementreference.h vc-1.3.0/common/elementreference.h --- vc-0.7.4/common/elementreference.h 1969-12-31 18:00:00.000000000 -0600 +++ vc-1.3.0/common/elementreference.h 2016-10-27 02:05:02.000000000 -0500 @@ -0,0 +1,140 @@ +/* This file is part of the Vc library. {{{ +Copyright © 2016 Matthias Kretz + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the names of contributing organizations nor the + names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY +DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +}}}*/ + +#ifndef VC_COMMON_ELEMENTREFERENCE_H_ +#define VC_COMMON_ELEMENTREFERENCE_H_ + +#include "macros.h" + +namespace Vc_VERSIONED_NAMESPACE +{ +namespace Detail +{ +template class ElementReference +{ + using value_type = typename U::value_type; + friend U; + friend Accessor; + Vc_INTRINSIC ElementReference(U &o, int i) noexcept : index(i), obj(o) {} + + static constexpr bool get_noexcept = + noexcept(Accessor::get(std::declval(), int())); + template static constexpr bool set_noexcept() + { + return noexcept(Accessor::set(std::declval(), int(), std::declval())); + } + +public: + Vc_INTRINSIC ElementReference(const ElementReference &) = delete; + + Vc_INTRINSIC operator value_type() const noexcept(get_noexcept) + { + return Accessor::get(obj, index); + } + + template + Vc_INTRINSIC ElementReference &operator=(T &&x) && + noexcept(noexcept(Accessor::set(std::declval(), int(), std::declval()))) + { + Accessor::set(obj, index, std::forward(x)); + return *this; + } + +// TODO: improve with operator.() + +#define Vc_OP_(op_) \ + template () \ + op_ std::declval())> \ + Vc_INTRINSIC ElementReference &operator op_##=(T &&x) && \ + noexcept(get_noexcept && noexcept(Accessor::set(std::declval(), int(), \ + std::declval()))) \ + { \ + const value_type &lhs = Accessor::get(obj, index); \ + Accessor::set(obj, index, lhs op_ std::forward(x)); \ + return *this; \ + } + Vc_ALL_ARITHMETICS(Vc_OP_); + Vc_ALL_SHIFTS(Vc_OP_); + Vc_ALL_BINARY(Vc_OP_); +#undef Vc_OP_ + + template + Vc_INTRINSIC ElementReference &operator++() && + noexcept(noexcept(std::declval() = + Accessor::get(std::declval(), int())) && + set_noexcept())>()) + { + value_type x = Accessor::get(obj, index); + Accessor::set(obj, index, ++x); + return *this; + } + + template + Vc_INTRINSIC value_type operator++(int) && + noexcept(noexcept(std::declval() = + Accessor::get(std::declval(), int())) && + set_noexcept()++)>()) + { + const value_type r = Accessor::get(obj, index); + value_type x = r; + Accessor::set(obj, index, ++x); + return r; + } + + template + Vc_INTRINSIC ElementReference &operator--() && + noexcept(noexcept(std::declval() = + Accessor::get(std::declval(), int())) && + set_noexcept())>()) + { + value_type x = Accessor::get(obj, index); + Accessor::set(obj, index, --x); + return *this; + } + + template + Vc_INTRINSIC value_type operator--(int) && + noexcept(noexcept(std::declval() = + Accessor::get(std::declval(), int())) && + set_noexcept()--)>()) + { + const value_type r = Accessor::get(obj, index); + value_type x = r; + Accessor::set(obj, index, --x); + return r; + } + +private: + int index; + U &obj; +}; +} // namespace Detail +} // namespace Vc + +#endif // VC_COMMON_ELEMENTREFERENCE_H_ + +// vim: foldmethod=marker diff -Nru vc-0.7.4/common/exponential.h vc-1.3.0/common/exponential.h --- vc-0.7.4/common/exponential.h 2014-05-15 06:38:24.000000000 -0500 +++ vc-1.3.0/common/exponential.h 2016-10-27 02:05:02.000000000 -0500 @@ -1,62 +1,53 @@ -#ifndef COMMON_EXPONENTIAL_H -#define COMMON_EXPONENTIAL_H /* This file is part of the Vc library. {{{ +Copyright © 2012-2015 Matthias Kretz - Copyright (C) 2012 Matthias Kretz - - Vc is free software: you can redistribute it and/or modify - it under the terms of the GNU Lesser General Public License as - published by the Free Software Foundation, either version 3 of - the License, or (at your option) any later version. 
- - Vc is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with Vc. If not, see . - - ------------------------------------------------------------------- - - The exp implementation is derived from Cephes, which carries the - following Copyright notice: - - Cephes Math Library Release 2.2: June, 1992 - Copyright 1984, 1987, 1989 by Stephen L. Moshier - Direct inquiries to 30 Frost Street, Cambridge, MA 02140 +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the names of contributing organizations nor the + names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY +DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +------------------------------------------------------------------- + +The exp implementation is derived from Cephes, which carries the +following Copyright notice: + +Cephes Math Library Release 2.2: June, 1992 +Copyright 1984, 1987, 1989 by Stephen L. 
Moshier +Direct inquiries to 30 Frost Street, Cambridge, MA 02140 }}}*/ -#ifndef VC_COMMON_EXPONENTIAL_H -#define VC_COMMON_EXPONENTIAL_H +#ifdef Vc_COMMON_MATH_H_INTERNAL -#include "macros.h" -/*OUTER_NAMESPACE_BEGIN*/ -namespace Vc +constexpr float log2_e = 1.44269504088896341f; +constexpr float MAXLOGF = 88.72283905206835f; +constexpr float MINLOGF = -103.278929903431851103f; /* log(2^-149) */ +constexpr float MAXNUMF = 3.4028234663852885981170418348451692544e38f; + +template ::value || + std::is_same::value>> +inline Vector exp(Vector x) { -namespace Common -{ - using Vc::VC__USE_NAMESPACE::c_log; - using Vc::VC__USE_NAMESPACE::Vector; - using Vc::VC__USE_NAMESPACE::floor; - using Vc::VC__USE_NAMESPACE::ldexp; - - static const float log2_e = 1.44269504088896341f; - static const float MAXLOGF = 88.72283905206835f; - static const float MINLOGF = -103.278929903431851103f; /* log(2^-149) */ - static const float MAXNUMF = 3.4028234663852885981170418348451692544e38f; - - template struct TypenameForLdexp { typedef Vector Type; }; - template<> struct TypenameForLdexp { typedef Vector Type; }; - - template static inline Vector exp(VC_ALIGNED_PARAMETER(Vector) _x) { - typedef Vector V; - typedef typename V::Mask M; - typedef typename TypenameForLdexp::Type I; - typedef Const C; - - V x(_x); + using V = Vector; + typedef typename V::Mask M; + typedef Detail::Const C; const M overflow = x > MAXLOGF; const M underflow = x < MINLOGF; @@ -68,7 +59,7 @@ // => y = x - n * ln(2) | recall that: ln(2) * log₂(e) == 1 // <=> eˣ = 2ⁿ * eʸ V z = floor(C::log2_e() * x + 0.5f); - I n = static_cast(z); + const auto n = static_cast>(z); x -= z * C::ln2_large(); x -= z * C::ln2_small(); @@ -89,57 +80,5 @@ return x; } - static inline Vector exp(Vector::AsArg _x) { - Vector x = _x; - typedef Vector V; - typedef V::Mask M; - typedef Const C; - - const M overflow = x > Vc_buildDouble( 1, 0x0006232bdd7abcd2ull, 9); // max log - const M underflow = x < Vc_buildDouble(-1, 0x0006232bdd7abcd2ull, 9); // min log - - V px = floor(C::log2_e() * x + 0.5); -#ifdef VC_IMPL_SSE - Vector n(px); - n.data() = Mem::permute(n.data()); -#elif defined(VC_IMPL_AVX) - __m128i tmp = _mm256_cvttpd_epi32(px.data()); - Vector n = AVX::concat(_mm_unpacklo_epi32(tmp, tmp), _mm_unpackhi_epi32(tmp, tmp)); -#endif - x -= px * C::ln2_large(); //Vc_buildDouble(1, 0x00062e4000000000ull, -1); // ln2 - x -= px * C::ln2_small(); //Vc_buildDouble(1, 0x0007f7d1cf79abcaull, -20); // ln2 - - const double P[] = { - Vc_buildDouble(1, 0x000089cdd5e44be8ull, -13), - Vc_buildDouble(1, 0x000f06d10cca2c7eull, -6), - Vc_buildDouble(1, 0x0000000000000000ull, 0) - }; - const double Q[] = { - Vc_buildDouble(1, 0x00092eb6bc365fa0ull, -19), - Vc_buildDouble(1, 0x0004ae39b508b6c0ull, -9), - Vc_buildDouble(1, 0x000d17099887e074ull, -3), - Vc_buildDouble(1, 0x0000000000000000ull, 1) - }; - const V x2 = x * x; - px = x * ((P[0] * x2 + P[1]) * x2 + P[2]); - x = px / ((((Q[0] * x2 + Q[1]) * x2 + Q[2]) * x2 + Q[3]) - px); - x = V::One() + 2.0 * x; - - x = ldexp(x, n); // == x * 2ⁿ - - x(overflow) = std::numeric_limits::infinity(); - x.setZero(underflow); - - return x; - } -} // namespace Common -namespace VC__USE_NAMESPACE -{ - using Vc::Common::exp; -} // namespace VC__USE_NAMESPACE -} // namespace Vc -/*OUTER_NAMESPACE_END*/ -#include "undomacros.h" -#endif // VC_COMMON_EXPONENTIAL_H -#endif // COMMON_EXPONENTIAL_H +#endif // Vc_COMMON_MATH_H_INTERNAL diff -Nru vc-0.7.4/common/fix_clang_emmintrin.h vc-1.3.0/common/fix_clang_emmintrin.h --- 
vc-0.7.4/common/fix_clang_emmintrin.h 2014-05-15 06:38:24.000000000 -0500 +++ vc-1.3.0/common/fix_clang_emmintrin.h 2016-10-27 02:05:02.000000000 -0500 @@ -1,5 +1,5 @@ /*{{{ - Copyright (C) 2013 Matthias Kretz + Copyright (C) 2013-2015 Matthias Kretz Permission to use, copy, modify, and distribute this software and its documentation for any purpose and without fee is hereby @@ -21,12 +21,12 @@ }}}*/ -#ifndef VC_COMMON_FIX_CLANG_EMMINTRIN_H -#define VC_COMMON_FIX_CLANG_EMMINTRIN_H +#ifndef VC_COMMON_FIX_CLANG_EMMINTRIN_H_ +#define VC_COMMON_FIX_CLANG_EMMINTRIN_H_ #include -#ifdef VC_CLANG +#if (defined Vc_CLANG && Vc_CLANG < 0x30700) || (defined Vc_APPLECLANG && Vc_APPLECLANG < 0x70000) #ifdef _mm_slli_si128 #undef _mm_slli_si128 @@ -74,6 +74,6 @@ __builtin_shufflevector((__m128d)(a), (__m128d)(b), (i) & 1, (((i) & 2) >> 1) + 2); }) #endif -#endif // VC_CLANG +#endif // Vc_CLANG || Vc_APPLECLANG -#endif // VC_COMMON_FIX_CLANG_EMMINTRIN_H +#endif // VC_COMMON_FIX_CLANG_EMMINTRIN_H_ diff -Nru vc-0.7.4/common/gatherimplementation.h vc-1.3.0/common/gatherimplementation.h --- vc-0.7.4/common/gatherimplementation.h 1969-12-31 18:00:00.000000000 -0600 +++ vc-1.3.0/common/gatherimplementation.h 2016-10-27 02:05:02.000000000 -0500 @@ -0,0 +1,282 @@ +/* This file is part of the Vc library. {{{ +Copyright © 2014-2015 Matthias Kretz + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the names of contributing organizations nor the + names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY +DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +}}}*/ + +#ifndef VC_COMMON_GATHERIMPLEMENTATION_H_ +#define VC_COMMON_GATHERIMPLEMENTATION_H_ + +#include "macros.h" + +namespace Vc_VERSIONED_NAMESPACE +{ +namespace Common +{ + +enum class GatherScatterImplementation : int { + SimpleLoop, + SetIndexZero, + BitScanLoop, + PopcntSwitch +}; + +using SimpleLoopT = std::integral_constant; +using SetIndexZeroT = std::integral_constant; +using BitScanLoopT = std::integral_constant; +using PopcntSwitchT = std::integral_constant; + +template +Vc_ALWAYS_INLINE void executeGather(SetIndexZeroT, + V &v, + const MT *mem, + IT &&indexes_, + typename V::MaskArgument mask) +{ + auto indexes = std::forward(indexes_); + indexes.setZeroInverted(static_cast(mask)); + const V tmp(mem, indexes); + where(mask) | v = tmp; +} + +template +Vc_ALWAYS_INLINE void executeGather(SimpleLoopT, + V &v, + const MT *mem, + const IT &indexes, + typename V::MaskArgument mask) +{ + if (Vc_IS_UNLIKELY(mask.isEmpty())) { + return; + } + Common::unrolled_loop([&](std::size_t i) { + if (mask[i]) + v[i] = mem[indexes[i]]; + }); +} + +template +Vc_ALWAYS_INLINE void executeGather(BitScanLoopT, + V &v, + const MT *mem, + const IT &indexes, + typename V::MaskArgument mask) +{ +#ifdef Vc_GNU_ASM + size_t bits = mask.toInt(); + while (Vc_IS_LIKELY(bits > 0)) { + size_t i, j; + asm("bsf %[bits],%[i]\n\t" + "bsr %[bits],%[j]\n\t" + "btr %[i],%[bits]\n\t" + "btr %[j],%[bits]\n\t" + : [i] "=r"(i), [j] "=r"(j), [bits] "+r"(bits)); + v[i] = mem[indexes[i]]; + v[j] = mem[indexes[j]]; + } +#else + // Alternative from Vc::SSE (0.7) + int bits = mask.toInt(); + while (bits) { + const int i = _bit_scan_forward(bits); + bits &= bits - 1; + v[i] = mem[indexes[i]]; + } +#endif // Vc_GNU_ASM +} + +template +Vc_ALWAYS_INLINE void executeGather(PopcntSwitchT, + V &v, + const MT *mem, + const IT &indexes, + typename V::MaskArgument mask, + enable_if = nullarg) +{ + unsigned int bits = mask.toInt(); + unsigned int low, high = 0; + switch (Vc::Detail::popcnt16(bits)) { + case 16: + v.gather(mem, indexes); + break; + case 15: + low = _bit_scan_forward(bits); + bits ^= 1 << low; + v[low] = mem[indexes[low]]; + case 14: + high = _bit_scan_reverse(bits); + v[high] = mem[indexes[high]]; + high = (1 << high); + case 13: + low = _bit_scan_forward(bits); + bits ^= high | (1 << low); + v[low] = mem[indexes[low]]; + case 12: + high = _bit_scan_reverse(bits); + v[high] = mem[indexes[high]]; + high = (1 << high); + case 11: + low = _bit_scan_forward(bits); + bits ^= high | (1 << low); + v[low] = mem[indexes[low]]; + case 10: + high = _bit_scan_reverse(bits); + v[high] = mem[indexes[high]]; + high = (1 << high); + case 9: + low = _bit_scan_forward(bits); + bits ^= high | (1 << low); + v[low] = mem[indexes[low]]; + case 8: + high = _bit_scan_reverse(bits); + v[high] = mem[indexes[high]]; + high = (1 << high); + case 7: + low = _bit_scan_forward(bits); + bits ^= high | (1 << low); + v[low] = mem[indexes[low]]; + case 6: + high = _bit_scan_reverse(bits); + v[high] = mem[indexes[high]]; + high = (1 << high); + case 5: + low = _bit_scan_forward(bits); + bits ^= high | (1 << low); + v[low] = mem[indexes[low]]; + case 4: + high = _bit_scan_reverse(bits); + v[high] = mem[indexes[high]]; + high = (1 << high); + case 3: + low = _bit_scan_forward(bits); + bits ^= high | (1 << low); + v[low] = mem[indexes[low]]; + case 2: + high = _bit_scan_reverse(bits); + v[high] = mem[indexes[high]]; + case 1: + low = _bit_scan_forward(bits); + v[low] = mem[indexes[low]]; + case 0: + break; + } +} +template +Vc_ALWAYS_INLINE void 
executeGather(PopcntSwitchT, + V &v, + const MT *mem, + const IT &indexes, + typename V::MaskArgument mask, + enable_if = nullarg) +{ + unsigned int bits = mask.toInt(); + unsigned int low, high = 0; + switch (Vc::Detail::popcnt8(bits)) { + case 8: + v.gather(mem, indexes); + break; + case 7: + low = _bit_scan_forward(bits); + bits ^= 1 << low; + v[low] = mem[indexes[low]]; + case 6: + high = _bit_scan_reverse(bits); + v[high] = mem[indexes[high]]; + high = (1 << high); + case 5: + low = _bit_scan_forward(bits); + bits ^= high | (1 << low); + v[low] = mem[indexes[low]]; + case 4: + high = _bit_scan_reverse(bits); + v[high] = mem[indexes[high]]; + high = (1 << high); + case 3: + low = _bit_scan_forward(bits); + bits ^= high | (1 << low); + v[low] = mem[indexes[low]]; + case 2: + high = _bit_scan_reverse(bits); + v[high] = mem[indexes[high]]; + case 1: + low = _bit_scan_forward(bits); + v[low] = mem[indexes[low]]; + case 0: + break; + } +} +template +Vc_ALWAYS_INLINE void executeGather(PopcntSwitchT, + V &v, + const MT *mem, + const IT &indexes, + typename V::MaskArgument mask, + enable_if = nullarg) +{ + unsigned int bits = mask.toInt(); + unsigned int low, high = 0; + switch (Vc::Detail::popcnt4(bits)) { + case 4: + v.gather(mem, indexes); + break; + case 3: + low = _bit_scan_forward(bits); + bits ^= 1 << low; + v[low] = mem[indexes[low]]; + case 2: + high = _bit_scan_reverse(bits); + v[high] = mem[indexes[high]]; + case 1: + low = _bit_scan_forward(bits); + v[low] = mem[indexes[low]]; + case 0: + break; + } +} +template +Vc_ALWAYS_INLINE void executeGather(PopcntSwitchT, + V &v, + const MT *mem, + const IT &indexes, + typename V::MaskArgument mask, + enable_if = nullarg) +{ + unsigned int bits = mask.toInt(); + unsigned int low; + switch (Vc::Detail::popcnt4(bits)) { + case 2: + v.gather(mem, indexes); + break; + case 1: + low = _bit_scan_forward(bits); + v[low] = mem[indexes[low]]; + case 0: + break; + } +} + +} // namespace Common +} // namespace Vc + +#endif // VC_COMMON_GATHERIMPLEMENTATION_H_ diff -Nru vc-0.7.4/common/gatherinterface.h vc-1.3.0/common/gatherinterface.h --- vc-0.7.4/common/gatherinterface.h 1969-12-31 18:00:00.000000000 -0600 +++ vc-1.3.0/common/gatherinterface.h 2016-10-27 02:05:02.000000000 -0500 @@ -0,0 +1,539 @@ +/* This file is part of the Vc library. {{{ +Copyright © 2014-2015 Matthias Kretz + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the names of contributing organizations nor the + names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. 
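The executeGather() overloads above implement the GatherScatterImplementation strategies: SetIndexZero gathers every lane but zeroes the masked-off indexes first, SimpleLoop tests each lane in turn, and BitScanLoop/PopcntSwitch visit only the active lanes by scanning the mask bits (PopcntSwitch additionally dispatches on the popcount of the mask into an unrolled fall-through switch). A standalone scalar sketch of the bit-scan idea, assuming a GCC/Clang builtin and plain arrays instead of Vc types:

    #include <cstdint>

    // Gather only the lanes whose mask bit is set: find the lowest set bit,
    // load that lane, clear the bit, repeat.
    inline void gather_bitscan_sketch(float *out, const float *mem,
                                      const int *indexes, std::uint32_t maskBits)
    {
        while (maskBits != 0) {
            const int lane = __builtin_ctz(maskBits);  // lowest set bit (GCC/Clang)
            maskBits &= maskBits - 1;                  // clear that bit
            out[lane] = mem[indexes[lane]];
        }
    }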
IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY +DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +}}}*/ + +#ifndef Vc_CURRENT_CLASS_NAME +#error "incorrect use of common/gatherinterface.h: Vc_CURRENT_CLASS_NAME must be defined to the current class name for declaring constructors." +#endif + +/////////////////////////////////////////////////////////////////////////////////////////// +// gathers +// A gather takes the following arguments: +// 1. A const pointer to memory of any type that can convert to EntryType +// 2. An indexes “vector”. The requirement is that the type implements the subscript operator, +// stores «Size» valid index values, and each offset to the pointer above yields a valid +// memory location for reading. +// 3. Optionally the third argument may be a mask. The mask disables several memory reads and +// thus removes the requirements in (2.) for the disabled entries. + +private: + /**\internal + * This function implements a gather given a pointer to memory \p mem and some + * container object storing the gather \p indexes. + * + * \param mem This pointer must be aligned correctly for the type \p MT. This is the + * natural behavior of C++, so this is typically the case. + * \param indexes This object contains at least \VSize{T} indexes that denote the + * offset in \p mem where the components for the current vector should be copied from. + * The offset is not in Bytes, but in multiples of `sizeof(MT)`. + */ + // enable_if::value && + // has_subscript_operator::value> + template + inline void gatherImplementation(const MT *mem, IT &&indexes); + + /**\internal + * This overload of the above function adds a \p mask argument to disable memory + * accesses at the \p indexes offsets where \p mask is \c false. + */ + template + inline void gatherImplementation(const MT *mem, IT &&indexes, MaskArgument mask); + + /**\internal + * Overload for the case of C-arrays or %Vc vector objects. + * + * In this case the \p indexes parameter is usable without adjustment. + * + * \param indexes An object to be used for gather or scatter. + * \returns Forwards the \p indexes parameter. + */ + template ::value || + Traits::is_simd_vector::value>> + static Vc_INTRINSIC IT adjustIndexParameter(IT &&indexes) + { + return std::forward(indexes); + } + + /**\internal + * Overload for the case of a container that returns an lvalue reference from its + * subscript operator. + * + * In this case the container is assumed to use contiguous storage and therefore the + * \p indexes object is converted to a C-array interface. + * + * \param indexes An object to be used for gather or scatter. + * \returns A pointer to the first object in the \p indexes container. + */ + template ::value && !Traits::is_simd_vector::value && + std::is_lvalue_reference()[0])>::value>> + static Vc_INTRINSIC decltype(std::addressof(std::declval()[0])) + adjustIndexParameter(IT &&i) + { + return std::addressof(i[0]); + } + + /**\internal + * Overload for the case of a container that returns an rvalue from its + * subscript operator. + * + * \param indexes An object to be used for gather or scatter. 
+ * \returns Forwards the \p indexes parameter. + */ + template + static Vc_INTRINSIC + enable_if::value && !Traits::is_simd_vector::value && + !std::is_lvalue_reference()[0])>::value, + IT> + adjustIndexParameter(IT &&i) + { + return std::forward(i); + } + +public: +#define Vc_ASSERT_GATHER_PARAMETER_TYPES_ \ + static_assert( \ + std::is_convertible::value, \ + "The memory pointer needs to point to a type that can be converted to the " \ + "EntryType of this SIMD vector type."); \ + static_assert( \ + Vc::Traits::has_subscript_operator::value, \ + "The indexes argument must be a type that implements the subscript operator."); \ + static_assert( \ + !Traits::is_simd_vector::value || \ + Traits::simd_vector_size::value >= Size, \ + "If you use a SIMD vector for the indexes parameter, the index vector must " \ + "have at least as many entries as this SIMD vector."); \ + static_assert( \ + !std::is_array::value || \ + (std::rank::value == 1 && \ + (std::extent::value == 0 || std::extent::value >= Size)), \ + "If you use a simple array for the indexes parameter, the array must have " \ + "at least as many entries as this SIMD vector.") + + /** + * \name Gather constructors and member functions + * + * Constructs or loads a vector from the objects at `mem[indexes[0]]`, + * `mem[indexes[1]]`, `mem[indexes[2]]`, ... + * + * All gather functions optionally take a mask as last argument. In that case only the + * entries that are selected in the mask are accessed in memory and copied to the + * vector. This enables invalid indexes in the \p indexes vector if those are masked + * off in \p mask. + * + * Gathers from structured data (AoS: arrays of struct) are possible via a special + * subscript operator of the container (array). You can use \ref Vc::array and \ref + * Vc::vector as drop-in replacements for \c std::array and \c std::vector. These + * container classes contain the necessary subscript operator overload. Example: + * \code + * Vc::vector data(100); + * std::iota(data.begin(), data.end(), 0.f); // fill with values 0, 1, 2, ... + * auto indexes = float_v::IndexType::IndexesFromZero(); + * float_v gathered = data[indexes]; // gathered == [0, 1, 2, ...] + * \endcode + * + * Alternatively, you can use Vc::Common::AdaptSubscriptOperator to extend a given + * container class with the necessary subscript operator. Example: + * \code + * template > + * using my_vector = Vc::Common::AdaptSubscriptOperator>; + * \endcode + * + * \param mem A pointer to memory which contains objects of type \p MT at the offsets + * given by \p indexes. + * \param indexes A container/vector of offsets into \p mem. + * The type of \p indexes (\p IT) may either be a pointer to integers + * (C-array) or a vector of integers (preferrably IndexType). + * \param mask If a mask is given, only the active entries will be copied from memory. + * + * \note If you use a masked gather constructor the masked-off entries of the vector + * are zero-initilized. 
+ */ + ///@{ + + /// Gather constructor + template ::value>> + Vc_INTRINSIC Vc_CURRENT_CLASS_NAME(const MT *mem, IT &&indexes) + { + Vc_ASSERT_GATHER_PARAMETER_TYPES_; + gatherImplementation(mem, adjustIndexParameter(std::forward(indexes))); + } + + /// Masked gather constructor + template ::value>> + Vc_INTRINSIC Vc_CURRENT_CLASS_NAME(const MT *mem, IT &&indexes, MaskArgument mask) + { + Vc_ASSERT_GATHER_PARAMETER_TYPES_; + gatherImplementation(mem, adjustIndexParameter(std::forward(indexes)), mask); + } + + /// Gather function + template ::value>> + Vc_INTRINSIC void gather(const MT *mem, IT &&indexes) + { + Vc_ASSERT_GATHER_PARAMETER_TYPES_; + gatherImplementation(mem, adjustIndexParameter(std::forward(indexes))); + } + + /// Masked gather function + template ::value>> + Vc_INTRINSIC void gather(const MT *mem, IT &&indexes, MaskArgument mask) + { + Vc_ASSERT_GATHER_PARAMETER_TYPES_; + gatherImplementation(mem, adjustIndexParameter(std::forward(indexes)), mask); + } + ///@} + + /// \name Deprecated Members + ///@{ + + /** + * \deprecated Use Vc::array or Vc::vector subscripting instead. + * + * \param array A pointer into memory (without alignment restrictions). + * \param member1 If \p array points to a struct, \p member1 determines the member in the struct to + * be read. Thus the offsets in \p indexes are relative to the \p array and not to + * the size of the gathered type (i.e. array[i].*member1 is accessed instead of + * (&(array->*member1))[i]) + * \param indexes Determines the offsets into \p array where the values are gathered from/scattered + * to. The type of indexes can either be an integer vector or a type that supports + * operator[] access. + */ + template + Vc_DEPRECATED("use the subscript operator to Vc::array or Vc::vector " + "instead.") inline Vc_CURRENT_CLASS_NAME(const S1 *array, + const EntryType S1::*member1, + IT indexes) + { + gather(Common::SubscriptOperation, true>( + array, indexes)[member1] + .gatherArguments()); + } + + /** + * \deprecated Use Vc::array or Vc::vector subscripting instead. + * + * \param array A pointer into memory (without alignment restrictions). + * \param member1 If \p array points to a struct, \p member1 determines the member in the struct to + * be read. Thus the offsets in \p indexes are relative to the \p array and not to + * the size of the gathered type (i.e. array[i].*member1 is accessed instead of + * (&(array->*member1))[i]) + * \param indexes Determines the offsets into \p array where the values are gathered from/scattered + * to. The type of indexes can either be an integer vector or a type that supports + * operator[] access. + * \param mask If a mask is given only the active entries will be gathered/scattered. + */ + template + Vc_DEPRECATED("use the subscript operator to Vc::array or Vc::vector " + "instead.") inline Vc_CURRENT_CLASS_NAME(const S1 *array, + const EntryType S1::*member1, + IT indexes, MaskArgument mask) + { + gather(Common::SubscriptOperation, true>( + array, indexes)[member1] + .gatherArguments(), + mask); + } + + /** + * \deprecated Use Vc::array or Vc::vector subscripting instead. + * + * \param array A pointer into memory (without alignment restrictions). + * \param member1 If \p array points to a struct, \p member1 determines the member in the struct to + * be read. Thus the offsets in \p indexes are relative to the \p array and not to + * the size of the gathered type (i.e. 
array[i].*member1 is accessed instead of + * (&(array->*member1))[i]) + * \param member2 If \p member1 is a struct then \p member2 selects the member to be read from that + * struct (i.e. array[i].*member1.*member2 is read). + * \param indexes Determines the offsets into \p array where the values are gathered from/scattered + * to. The type of indexes can either be an integer vector or a type that supports + * operator[] access. + */ + template + Vc_DEPRECATED("use the subscript operator to Vc::array or Vc::vector " + "instead.") inline Vc_CURRENT_CLASS_NAME(const S1 *array, + const S2 S1::*member1, + const EntryType S2::*member2, + IT indexes) + { + gather(Common::SubscriptOperation, true>( + array, indexes)[member1][member2] + .gatherArguments()); + } + + /** + * \deprecated Use Vc::array or Vc::vector subscripting instead. + * + * \param array A pointer into memory (without alignment restrictions). + * \param member1 If \p array points to a struct, \p member1 determines the member in the struct to + * be read. Thus the offsets in \p indexes are relative to the \p array and not to + * the size of the gathered type (i.e. array[i].*member1 is accessed instead of + * (&(array->*member1))[i]) + * \param member2 If \p member1 is a struct then \p member2 selects the member to be read from that + * struct (i.e. array[i].*member1.*member2 is read). + * \param indexes Determines the offsets into \p array where the values are gathered from/scattered + * to. The type of indexes can either be an integer vector or a type that supports + * operator[] access. + * \param mask If a mask is given only the active entries will be gathered/scattered. + */ + template + Vc_DEPRECATED("use the subscript operator to Vc::array or Vc::vector " + "instead.") inline Vc_CURRENT_CLASS_NAME(const S1 *array, + const S2 S1::*member1, + const EntryType S2::*member2, + IT indexes, MaskArgument mask) + { + gather(Common::SubscriptOperation, true>( + array, indexes)[member1][member2] + .gatherArguments(), + mask); + } + + /** + * \deprecated Use Vc::array or Vc::vector subscripting instead. + * + * \param array A pointer into memory (without alignment restrictions). + * \param ptrMember1 If \p array points to a struct, \p member1 determines the member in the struct to + * be read. Thus the offsets in \p indexes are relative to the \p array and not to + * the size of the gathered type (i.e. array[i].*member1 is accessed instead of + * (&(array->*member1))[i]) + * \param outerIndexes + * \param innerIndexes + */ + template + Vc_DEPRECATED( + "use the subscript operator to Vc::array or Vc::vector " + "instead.") inline Vc_CURRENT_CLASS_NAME(const S1 *array, + const EntryType *const S1::*ptrMember1, + IT1 outerIndexes, IT2 innerIndexes) + { + gather(Common::SubscriptOperation, true>( + array, outerIndexes)[ptrMember1][innerIndexes] + .gatherArguments()); + } + + /** + * \deprecated Use Vc::array or Vc::vector subscripting instead. + * + * \param array A pointer into memory (without alignment restrictions). + * \param ptrMember1 If \p array points to a struct, \p member1 determines the member in the struct to + * be read. Thus the offsets in \p indexes are relative to the \p array and not to + * the size of the gathered type (i.e. array[i].*member1 is accessed instead of + * (&(array->*member1))[i]) + * \param outerIndexes + * \param innerIndexes + * \param mask If a mask is given only the active entries will be gathered/scattered. 
+ */ + template + Vc_DEPRECATED( + "use the subscript operator to Vc::array or Vc::vector " + "instead.") inline Vc_CURRENT_CLASS_NAME(const S1 *array, + const EntryType *const S1::*ptrMember1, + IT1 outerIndexes, IT2 innerIndexes, + MaskArgument mask) + { + gather(Common::SubscriptOperation, true>( + array, outerIndexes)[ptrMember1][innerIndexes] + .gatherArguments(), + mask); + } + + /** + * \deprecated Use Vc::array or Vc::vector subscripting instead. + * + * \param array A pointer into memory (without alignment restrictions). + * \param member1 If \p array points to a struct, \p member1 determines the member in the struct to + * be read. Thus the offsets in \p indexes are relative to the \p array and not to + * the size of the gathered type (i.e. array[i].*member1 is accessed instead of + * (&(array->*member1))[i]) + * \param indexes Determines the offsets into \p array where the values are gathered from/scattered + * to. The type of indexes can either be an integer vector or a type that supports + * operator[] access. + */ + template + Vc_DEPRECATED("use the subscript operator to Vc::array or Vc::vector " + "instead.") inline void gather(const S1 *array, + const EntryType S1::*member1, IT indexes) + { + gather(Common::SubscriptOperation, true>( + array, indexes)[member1] + .gatherArguments()); + } + + /** + * \deprecated Use Vc::array or Vc::vector subscripting instead. + * + * \param array A pointer into memory (without alignment restrictions). + * \param member1 If \p array points to a struct, \p member1 determines the member in the struct to + * be read. Thus the offsets in \p indexes are relative to the \p array and not to + * the size of the gathered type (i.e. array[i].*member1 is accessed instead of + * (&(array->*member1))[i]) + * \param indexes Determines the offsets into \p array where the values are gathered from/scattered + * to. The type of indexes can either be an integer vector or a type that supports + * operator[] access. + * \param mask If a mask is given only the active entries will be gathered/scattered. + */ + template + Vc_DEPRECATED("use the subscript operator to Vc::array or Vc::vector " + "instead.") inline void gather(const S1 *array, + const EntryType S1::*member1, + IT indexes, + MaskArgument mask) + { + gather(Common::SubscriptOperation, true>( + array, indexes)[member1] + .gatherArguments(), + mask); + } + + /** + * \deprecated Use Vc::array or Vc::vector subscripting instead. + * + * \param array A pointer into memory (without alignment restrictions). + * \param member1 If \p array points to a struct, \p member1 determines the member in the struct to + * be read. Thus the offsets in \p indexes are relative to the \p array and not to + * the size of the gathered type (i.e. array[i].*member1 is accessed instead of + * (&(array->*member1))[i]) + * \param member2 If \p member1 is a struct then \p member2 selects the member to be read from that + * struct (i.e. array[i].*member1.*member2 is read). + * \param indexes Determines the offsets into \p array where the values are gathered from/scattered + * to. The type of indexes can either be an integer vector or a type that supports + * operator[] access. 
+ */ + template + Vc_DEPRECATED("use the subscript operator to Vc::array or Vc::vector " + "instead.") inline void gather(const S1 *array, const S2 S1::*member1, + const EntryType S2::*member2, IT indexes) + { + gather(Common::SubscriptOperation, true>( + array, indexes)[member1][member2] + .gatherArguments()); + } + + /** + * \deprecated Use Vc::array or Vc::vector subscripting instead. + * + * \param array A pointer into memory (without alignment restrictions). + * \param member1 If \p array points to a struct, \p member1 determines the member in the struct to + * be read. Thus the offsets in \p indexes are relative to the \p array and not to + * the size of the gathered type (i.e. array[i].*member1 is accessed instead of + * (&(array->*member1))[i]) + * \param member2 If \p member1 is a struct then \p member2 selects the member to be read from that + * struct (i.e. array[i].*member1.*member2 is read). + * \param indexes Determines the offsets into \p array where the values are gathered from/scattered + * to. The type of indexes can either be an integer vector or a type that supports + * operator[] access. + * \param mask If a mask is given only the active entries will be gathered/scattered. + */ + template + Vc_DEPRECATED("use the subscript operator to Vc::array or Vc::vector " + "instead.") inline void gather(const S1 *array, const S2 S1::*member1, + const EntryType S2::*member2, IT indexes, + MaskArgument mask) + { + gather(Common::SubscriptOperation, true>( + array, indexes)[member1][member2] + .gatherArguments(), + mask); + } + + /** + * \deprecated Use Vc::array or Vc::vector subscripting instead. + * + * \param array A pointer into memory (without alignment restrictions). + * \param ptrMember1 If \p array points to a struct, \p member1 determines the member in the struct to + * be read. Thus the offsets in \p indexes are relative to the \p array and not to + * the size of the gathered type (i.e. array[i].*member1 is accessed instead of + * (&(array->*member1))[i]) + * \param outerIndexes + * \param innerIndexes + */ + template + Vc_DEPRECATED("use the subscript operator to Vc::array or Vc::vector " + "instead.") inline void gather(const S1 *array, + const EntryType *const S1::*ptrMember1, + IT1 outerIndexes, IT2 innerIndexes) + { + gather(Common::SubscriptOperation, true>( + array, outerIndexes)[ptrMember1][innerIndexes] + .gatherArguments()); + } + + /** + * \deprecated Use Vc::array or Vc::vector subscripting instead. + * + * \param array A pointer into memory (without alignment restrictions). + * \param ptrMember1 If \p array points to a struct, \p member1 determines the member in the struct to + * be read. Thus the offsets in \p indexes are relative to the \p array and not to + * the size of the gathered type (i.e. array[i].*member1 is accessed instead of + * (&(array->*member1))[i]) + * \param outerIndexes + * \param innerIndexes + * \param mask If a mask is given only the active entries will be gathered/scattered. 
+ */ + template + Vc_DEPRECATED("use the subscript operator to Vc::array or Vc::vector " + "instead.") inline void gather(const S1 *array, + const EntryType *const S1::*ptrMember1, + IT1 outerIndexes, IT2 innerIndexes, + MaskArgument mask) + { + gather(Common::SubscriptOperation, true>( + array, outerIndexes)[ptrMember1][innerIndexes] + .gatherArguments(), + mask); + } + ///@} + + /**\internal + * \name Gather function to use from Vc::Common::subscript_operator + * + * \param args + * \param mask + */ + ///@{ + template + Vc_INTRINSIC void gather(const Common::GatherArguments &args) + { + gather(args.address, adjustIndexParameter(args.indexes)); + } + + template + Vc_INTRINSIC void gather(const Common::GatherArguments &args, MaskArgument mask) + { + gather(args.address, adjustIndexParameter(args.indexes), mask); + } + ///@} + +#undef Vc_ASSERT_GATHER_PARAMETER_TYPES_ diff -Nru vc-0.7.4/common/generalinterface.h vc-1.3.0/common/generalinterface.h --- vc-0.7.4/common/generalinterface.h 1969-12-31 18:00:00.000000000 -0600 +++ vc-1.3.0/common/generalinterface.h 2016-10-27 02:05:02.000000000 -0500 @@ -0,0 +1,52 @@ +/* This file is part of the Vc library. {{{ +Copyright © 2014-2015 Matthias Kretz + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the names of contributing organizations nor the + names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY +DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
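All of the deprecated member-pointer gathers above funnel into Common::SubscriptOperation; the replacement recommended by the deprecation messages is the subscript operator provided by Vc::vector and Vc::array. A hedged sketch of that replacement (struct, container size, and values are illustrative only):

    #include <Vc/Vc>

    struct Particle { float x, y; };

    // Gather the .x member of the particles selected by an index vector,
    // i.e. the modern spelling of gather(array, &Particle::x, indexes).
    Vc::float_v gather_x_sketch()
    {
        Vc::vector<Particle> particles(16, Particle{1.f, 2.f});
        const auto indexes = Vc::float_v::IndexType::IndexesFromZero();
        Vc::float_v xs = particles[indexes][&Particle::x];  // AoS gather
        return xs;
    }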
+ +}}}*/ + +public: + /////////////////////////////////////////////////////////////////////////// + // init to zero + Vector() = default; + + /////////////////////////////////////////////////////////////////////////// + // types + + /////////////////////////////////////////////////////////////////////////// + // constants + static constexpr std::size_t size() { return Size; } + + /////////////////////////////////////////////////////////////////////////// + // constant Vectors + explicit Vc_INTRINSIC_L Vector(VectorSpecialInitializerZero) Vc_INTRINSIC_R; + explicit Vc_INTRINSIC_L Vector(VectorSpecialInitializerOne) Vc_INTRINSIC_R; + explicit Vc_INTRINSIC_L Vector(VectorSpecialInitializerIndexesFromZero) Vc_INTRINSIC_R; + static Vc_INTRINSIC Vc_CONST Vector Zero() { return Vector(Vc::Zero); } + static Vc_INTRINSIC Vc_CONST Vector One() { return Vector(Vc::One); } + static Vc_INTRINSIC Vc_CONST Vector IndexesFromZero() + { + return Vector(Vc::IndexesFromZero); + } + +// vim: foldmethod=marker diff -Nru vc-0.7.4/common/iif.h vc-1.3.0/common/iif.h --- vc-0.7.4/common/iif.h 2014-05-15 06:38:24.000000000 -0500 +++ vc-1.3.0/common/iif.h 2016-10-27 02:05:02.000000000 -0500 @@ -1,32 +1,42 @@ /* This file is part of the Vc library. {{{ +Copyright © 2012-2015 Matthias Kretz - Copyright (C) 2012 Matthias Kretz - - Vc is free software: you can redistribute it and/or modify - it under the terms of the GNU Lesser General Public License as - published by the Free Software Foundation, either version 3 of - the License, or (at your option) any later version. - - Vc is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with Vc. If not, see . +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the names of contributing organizations nor the + names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY +DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
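The general interface above gives every vector type a default constructor, a constexpr size(), and the named constants Zero(), One(), and IndexesFromZero(). A short usage sketch:

    #include <Vc/Vc>

    void constants_sketch()
    {
        const Vc::float_v zeros = Vc::float_v::Zero();           // [0, 0, 0, ...]
        const Vc::float_v ones  = Vc::float_v::One();            // [1, 1, 1, ...]
        const Vc::int_v   iota  = Vc::int_v::IndexesFromZero();  // [0, 1, 2, ...]
        static_assert(Vc::float_v::size() > 0, "size() is usable at compile time");
        (void)zeros; (void)ones; (void)iota;
    }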
}}}*/ -#ifndef VC_COMMON_IIF_H -#define VC_COMMON_IIF_H +#ifndef VC_COMMON_IIF_H_ +#define VC_COMMON_IIF_H_ +#include #include "macros.h" -/*OUTER_NAMESPACE_BEGIN*/ -namespace Vc +namespace Vc_VERSIONED_NAMESPACE { /** - * Function to mimic the ternary operator '?:'. + * \ingroup Utilities + * + * Function to mimic the ternary operator '?:' (inline-if). * * \param condition Determines which values are returned. This is analog to the first argument to * the ternary operator. @@ -42,21 +52,46 @@ * \code * float_v x = Vc::iif (a > 1.f, b, b + c); * \endcode + * + * Assuming \c a has the values [0, 3, 5, 1], \c b is [1, 1, 1, 1], and \c c is [1, 2, 3, 4], then x + * will be [2, 2, 3, 5]. */ -#ifndef VC_MSVC -template static Vc_ALWAYS_INLINE Vector iif (typename Vector::Mask condition, Vector trueValue, Vector falseValue) +template +Vc_ALWAYS_INLINE enable_if::value && is_simd_vector::value, T> iif( + const Mask &condition, const T &trueValue, const T &falseValue) { -#else -template static Vc_ALWAYS_INLINE Vector iif (const typename Vector::Mask &condition, const Vector &trueValue, const Vector &_falseValue) + T result(falseValue); + Vc::where(condition) | result = trueValue; + return result; +} + +/**\internal + * The following declaration makes it explicit that `iif (Mask, non-vector, non-vector)` + * is not supposed to work. Doing the same thing with \c static_assert would break SFINAE. + */ +template +enable_if::value && !is_simd_vector::value, T> iif( + const Mask &, const T &, const T &) = delete; + +/** + * \ingroup Utilities + * + * Overload of the above for boolean conditions. + * + * This typically results in direct use of the ternary operator. This function makes it easier to + * switch from a Vc type to a builtin type. + * + * \param condition Determines which value is returned. This is analog to the first argument to + * the ternary operator. + * \param trueValue The value to return if \p condition is \c true. + * \param falseValue The value to return if \p condition is \c false. + * \return Either \p trueValue or \p falseValue, depending on \p condition. + */ +template constexpr T iif (bool condition, const T &trueValue, const T &falseValue) { - Vector falseValue(_falseValue); -#endif - falseValue(condition) = trueValue; - return falseValue; + return condition ? trueValue : falseValue; } -} // namespace Vc -/*OUTER_NAMESPACE_END*/ -#include "undomacros.h" +} // namespace Vc -#endif // VC_COMMON_IIF_H +#endif // VC_COMMON_IIF_H_ diff -Nru vc-0.7.4/common/indexsequence.h vc-1.3.0/common/indexsequence.h --- vc-0.7.4/common/indexsequence.h 1969-12-31 18:00:00.000000000 -0600 +++ vc-1.3.0/common/indexsequence.h 2016-10-27 02:05:02.000000000 -0500 @@ -0,0 +1,79 @@ +/* This file is part of the Vc library. {{{ +Copyright © 2014-2015 Matthias Kretz + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the names of contributing organizations nor the + names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. 
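A worked example of the mask overload of iif() above, using the list-initializable Vc::SimdArray so the lane values are explicit (a sketch; any SIMD vector type works the same way):

    #include <Vc/Vc>

    using V4 = Vc::SimdArray<float, 4>;

    void iif_sketch()
    {
        const V4 a = {0.f, 3.f, 5.f, 1.f};
        const V4 b(1.f);                      // [1, 1, 1, 1]
        const V4 c = {1.f, 2.f, 3.f, 4.f};
        const V4 x = Vc::iif(a > 1.f, b, b + c);
        // a > 1.f -> [false, true, true, false]
        // b + c   -> [2, 3, 4, 5]
        // x       -> [2, 1, 1, 5]   (trueValue where the mask is set)
        (void)x;
    }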
+ +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY +DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +}}}*/ + +#ifndef VC_COMMON_INDEXSEQUENCE_H_ +#define VC_COMMON_INDEXSEQUENCE_H_ + +#include + +namespace Vc_VERSIONED_NAMESPACE +{ +/** \internal + * Helper class for a sequence of size_t values from 0 to N. This type will be included in + * C++14. + */ +template struct index_sequence +{ + static constexpr std::size_t size() noexcept { return sizeof...(I); } +}; + +/** \internal + * This struct builds an index_sequence type from a given upper bound \p N. + * It does so recursively via concatenation of to index sequences of length N/2. + */ +template struct make_index_sequence_impl { + template + static index_sequence join(std::false_type, + index_sequence); + template + static index_sequence join( + std::true_type, index_sequence); + + using is_odd = std::integral_constant; + using half = typename make_index_sequence_impl::type; + using type = decltype(join<(N + 1) / 2>(is_odd(), half())); +}; +template <> struct make_index_sequence_impl<0> { + using type = index_sequence<>; +}; +template <> struct make_index_sequence_impl<1> { + using type = index_sequence<0>; +}; +template <> struct make_index_sequence_impl<2> { + using type = index_sequence<0, 1>; +}; + +/** \internal + * Creates an index_sequence type for the upper bound \p N. + */ +template +using make_index_sequence = typename make_index_sequence_impl::type; +} + +#endif // VC_COMMON_INDEXSEQUENCE_H_ + +// vim: foldmethod=marker diff -Nru vc-0.7.4/common/interleavedmemory.h vc-1.3.0/common/interleavedmemory.h --- vc-0.7.4/common/interleavedmemory.h 2014-05-15 06:38:24.000000000 -0500 +++ vc-1.3.0/common/interleavedmemory.h 2016-10-27 02:05:02.000000000 -0500 @@ -1,49 +1,50 @@ /* This file is part of the Vc library. {{{ +Copyright © 2012-2015 Matthias Kretz - Copyright (C) 2012 Matthias Kretz - - Vc is free software: you can redistribute it and/or modify - it under the terms of the GNU Lesser General Public License as - published by the Free Software Foundation, either version 3 of - the License, or (at your option) any later version. - - Vc is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with Vc. If not, see . +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. 
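make_index_sequence above builds its result recursively, concatenating the sequence for N/2 with an offset copy of itself and inserting one extra element when N is odd. A small usage sketch; to_array is an illustrative helper, not part of Vc:

    #include <array>
    #include <cstddef>
    #include <Vc/Vc>

    template <std::size_t... I>
    std::array<std::size_t, sizeof...(I)> to_array(Vc::index_sequence<I...>)
    {
        return {{I...}};
    }

    // to_array(Vc::make_index_sequence<4>()) yields {0, 1, 2, 3}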
+ * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the names of contributing organizations nor the + names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY +DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. }}}*/ -#ifndef VC_COMMON_INTERLEAVEDMEMORY_H -#define VC_COMMON_INTERLEAVEDMEMORY_H +#ifndef VC_COMMON_INTERLEAVEDMEMORY_H_ +#define VC_COMMON_INTERLEAVEDMEMORY_H_ #include "macros.h" -/*OUTER_NAMESPACE_BEGIN*/ -namespace Vc +namespace Vc_VERSIONED_NAMESPACE { namespace Common { - -namespace Internal -{ -template struct CopyConst { typedef B Type; }; -template struct CopyConst { typedef const B Type; }; - -template struct EnableInterleaves { typedef R Type; }; -template struct EnableInterleaves; -} // namespace Internal - /** * \internal */ -template struct InterleavedMemoryAccessBase +template struct InterleavedMemoryAccessBase { - typedef typename V::EntryType T; - typedef typename V::IndexType I; + // Partial specialization doesn't work for functions without partial specialization of the whole + // class. Therefore we capture the contents of InterleavedMemoryAccessBase in a macro to easily + // copy it into its specializations. + typedef typename std::conditional< + Readonly, typename std::add_const::type, + typename V::EntryType>::type T; typedef typename V::AsArg VArg; typedef T Ta Vc_MAY_ALIAS; const I m_indexes; @@ -54,93 +55,114 @@ { } - // implementations of the following are in {scalar,sse,avx}/interleavedmemory.tcc - void deinterleave(V &v0, V &v1) const; - void deinterleave(V &v0, V &v1, V &v2) const; - void deinterleave(V &v0, V &v1, V &v2, V &v3) const; - void deinterleave(V &v0, V &v1, V &v2, V &v3, V &v4) const; - void deinterleave(V &v0, V &v1, V &v2, V &v3, V &v4, V &v5) const; - void deinterleave(V &v0, V &v1, V &v2, V &v3, V &v4, V &v5, V &v6) const; - void deinterleave(V &v0, V &v1, V &v2, V &v3, V &v4, V &v5, V &v6, V &v7) const; - - void interleave(VArg v0, VArg v1); - void interleave(VArg v0, VArg v1, VArg v2); - void interleave(VArg v0, VArg v1, VArg v2, VArg v3); - void interleave(VArg v0, VArg v1, VArg v2, VArg v3, VArg v4); - void interleave(VArg v0, VArg v1, VArg v2, VArg v3, VArg v4, VArg v5); - void interleave(VArg v0, VArg v1, VArg v2, VArg v3, VArg v4, VArg v5, VArg v6); - void interleave(VArg v0, VArg v1, VArg v2, VArg v3, VArg v4, VArg v5, VArg v6, VArg v7); + // implementations of the following are in {scalar,sse,avx}/detail.h + template Vc_INTRINSIC void deinterleave(Vs &&... 
vs) const + { + Impl::deinterleave(m_data, m_indexes, std::forward(vs)...); + } + +protected: + using Impl = Vc::Detail::InterleaveImpl; + + template + Vc_INTRINSIC void callInterleave(T &&a, index_sequence) + { + Impl::interleave(m_data, m_indexes, a[Indexes]...); + } }; /** * \internal */ // delay execution of the deinterleaving gather until operator= -template struct InterleavedMemoryReadAccess : public InterleavedMemoryAccessBase +template +struct InterleavedMemoryReadAccess : public InterleavedMemoryAccessBase { - typedef InterleavedMemoryAccessBase Base; + typedef InterleavedMemoryAccessBase Base; typedef typename Base::Ta Ta; - typedef typename Base::I I; - Vc_ALWAYS_INLINE InterleavedMemoryReadAccess(const Ta *data, typename I::AsArg indexes) - : Base(indexes * I(StructSize), const_cast(data)) // this needs to be refactored to properly keep the constness + Vc_ALWAYS_INLINE InterleavedMemoryReadAccess(Ta *data, typename I::AsArg indexes) + : Base(StructSize == 1u + ? indexes + : StructSize == 2u + ? indexes << 1 + : StructSize == 4u + ? indexes << 2 + : StructSize == 8u + ? indexes << 3 + : StructSize == 16u ? indexes << 4 + : indexes * I(int(StructSize)), + data) { } + + template + Vc_ALWAYS_INLINE T deinterleave_unpack(index_sequence) const + { + T r; + Base::Impl::deinterleave(this->m_data, this->m_indexes, std::get(r)...); + return r; + } + + template ::value && + std::is_same( + std::declval()))>>::value)>> + Vc_ALWAYS_INLINE operator T() const + { + return deinterleave_unpack(make_index_sequence::value>()); + } +}; + +///\internal Runtime check (NDEBUG) for asserting unique indexes. +template struct CheckIndexesUnique +{ +#ifdef NDEBUG + static Vc_INTRINSIC void test(const I &) {} +#else + static void test(const I &indexes) + { + const I test = indexes.sorted(); + Vc_ASSERT(I::Size == 1 || (test == test.rotated(1)).isEmpty()) + } +#endif +}; +///\internal For SuccessiveEntries there can never be a problem. 
+template struct CheckIndexesUnique > +{ + static Vc_INTRINSIC void test(const SuccessiveEntries &) {} }; /** * \internal */ -template struct InterleavedMemoryAccess : public InterleavedMemoryReadAccess +template +struct InterleavedMemoryAccess : public InterleavedMemoryReadAccess { - typedef InterleavedMemoryAccessBase Base; + typedef InterleavedMemoryAccessBase Base; typedef typename Base::Ta Ta; - typedef typename Base::I I; Vc_ALWAYS_INLINE InterleavedMemoryAccess(Ta *data, typename I::AsArg indexes) - : InterleavedMemoryReadAccess(data, indexes) + : InterleavedMemoryReadAccess(data, indexes) { + CheckIndexesUnique::test(indexes); } -#define _VC_SCATTER_ASSIGNMENT(LENGTH, parameters) \ - Vc_ALWAYS_INLINE void operator=(const VectorTuple &rhs) \ - { \ - VC_STATIC_ASSERT(LENGTH <= StructSize, You_are_trying_to_scatter_more_data_into_the_struct_than_it_has); \ - this->interleave parameters ; \ - } \ - Vc_ALWAYS_INLINE void operator=(const VectorTuple &rhs) \ - { \ - VC_STATIC_ASSERT(LENGTH <= StructSize, You_are_trying_to_scatter_more_data_into_the_struct_than_it_has); \ - checkIndexesUnique(); \ - this->interleave parameters ; \ - } - _VC_SCATTER_ASSIGNMENT(2, (rhs.l, rhs.r)) - _VC_SCATTER_ASSIGNMENT(3, (rhs.l.l, rhs.l.r, rhs.r)); - _VC_SCATTER_ASSIGNMENT(4, (rhs.l.l.l, rhs.l.l.r, rhs.l.r, rhs.r)); - _VC_SCATTER_ASSIGNMENT(5, (rhs.l.l.l.l, rhs.l.l.l.r, rhs.l.l.r, rhs.l.r, rhs.r)); - _VC_SCATTER_ASSIGNMENT(6, (rhs.l.l.l.l.l, rhs.l.l.l.l.r, rhs.l.l.l.r, rhs.l.l.r, rhs.l.r, rhs.r)); - _VC_SCATTER_ASSIGNMENT(7, (rhs.l.l.l.l.l.l, rhs.l.l.l.l.l.r, rhs.l.l.l.l.r, rhs.l.l.l.r, rhs.l.l.r, rhs.l.r, rhs.r)); - _VC_SCATTER_ASSIGNMENT(8, (rhs.l.l.l.l.l.l.l, rhs.l.l.l.l.l.l.r, rhs.l.l.l.l.l.r, rhs.l.l.l.l.r, rhs.l.l.l.r, rhs.l.l.r, rhs.l.r, rhs.r)); -#undef _VC_SCATTER_ASSIGNMENT - -private: -#ifdef NDEBUG - Vc_ALWAYS_INLINE void checkIndexesUnique() const {} -#else - void checkIndexesUnique() const + template Vc_ALWAYS_INLINE void operator=(VectorReferenceArray &&rhs) { - const I test = Base::m_indexes.sorted(); - VC_ASSERT(I::Size == 1 || (test == test.rotated(1)).isEmpty()) + static_assert(N <= StructSize, + "You_are_trying_to_scatter_more_data_into_the_struct_than_it_has"); + this->callInterleave(std::move(rhs), make_index_sequence()); + } + template Vc_ALWAYS_INLINE void operator=(VectorReferenceArray &&rhs) + { + static_assert(N <= StructSize, + "You_are_trying_to_scatter_more_data_into_the_struct_than_it_has"); + this->callInterleave(std::move(rhs), make_index_sequence()); } -#endif }; -#ifdef DOXYGEN -} // namespace Common -// in doxygen InterleavedMemoryWrapper should appear in the Vc namespace (see the using statement -// below) -#endif - /** * Wraps a pointer to memory with convenience functions to access it via vectors. 
* @@ -154,16 +176,22 @@ */ template class InterleavedMemoryWrapper { - typedef typename V::EntryType T; + typedef typename std::conditional::value, + const typename V::EntryType, + typename V::EntryType>::type T; typedef typename V::IndexType I; typedef typename V::AsArg VArg; - typedef typename I::AsArg IndexType; - typedef InterleavedMemoryAccess Access; - typedef InterleavedMemoryReadAccess ReadAccess; - typedef typename Internal::CopyConst::Type Ta Vc_MAY_ALIAS; + typedef const I &IndexType; + static constexpr std::size_t StructSize = sizeof(S) / sizeof(T); + typedef InterleavedMemoryAccess Access; + typedef InterleavedMemoryReadAccess ReadAccess; + typedef InterleavedMemoryAccess > AccessSuccessiveEntries; + typedef InterleavedMemoryReadAccess > ReadSuccessiveEntries; + typedef T Ta Vc_MAY_ALIAS; Ta *const m_data; - VC_STATIC_ASSERT((sizeof(S) / sizeof(T)) * sizeof(T) == sizeof(S), InterleavedMemoryAccess_does_not_support_packed_structs); + static_assert(StructSize * sizeof(T) == sizeof(S), + "InterleavedMemoryAccess_does_not_support_packed_structs"); public: /** @@ -228,41 +256,77 @@ * \warning If \p indexes contains non-unique entries on scatter, the result is undefined. If * \c NDEBUG is not defined the implementation will assert that the \p indexes entries are unique. */ -#ifdef DOXYGEN - Vc_ALWAYS_INLINE Access operator[](IndexType indexes) -#else - // need to SFINAE disable this for objects that wrap constant data - template - Vc_ALWAYS_INLINE typename Internal::EnableInterleaves::Type operator[]( - VC_ALIGNED_PARAMETER(U) indexes) -#endif + template + Vc_ALWAYS_INLINE enable_if< + std::is_convertible::value && !std::is_const::value, Access> + operator[](IT indexes) { return Access(m_data, indexes); } /// const overload (gathers only) of the above function - Vc_ALWAYS_INLINE ReadAccess operator[](VC_ALIGNED_PARAMETER(IndexType) indexes) const + Vc_ALWAYS_INLINE ReadAccess operator[](IndexType indexes) const { return ReadAccess(m_data, indexes); } /// alias of the above function - Vc_ALWAYS_INLINE ReadAccess gather(VC_ALIGNED_PARAMETER(IndexType) indexes) const + Vc_ALWAYS_INLINE ReadAccess gather(IndexType indexes) const { return operator[](indexes); } + + /** + * Interleaved access. + * + * This function is an optimization of the function above, for cases where the index vector + * contains consecutive values. It will load \p V::Size consecutive entries from memory and + * deinterleave them into Vc vectors. + * + * \param first The first of \p V::Size indizes to be accessed. + * + * \return A special (magic) object that executes the loads and deinterleave on assignment to a + * vector tuple. + * + * Example: + * \code + * struct Foo { + * float x, y, z; + * }; + * + * void foo(Foo *_data) + * { + * Vc::InterleavedMemoryWrapper data(_data); + * for (size_t i = 0; i < 32U; i += float_v::Size) { + * float_v x, y, z; + * (x, y, z) = data[i]; + * // now: + * // x = { _data[i].x, _data[i + 1].x, _data[i + 2].x, ... } + * // y = { _data[i].y, _data[i + 1].y, _data[i + 2].y, ... } + * // z = { _data[i].z, _data[i + 1].z, _data[i + 2].z, ... } + * ... 
+ * } + * } + * \endcode + */ + Vc_ALWAYS_INLINE ReadSuccessiveEntries operator[](size_t first) const + { + return ReadSuccessiveEntries(m_data, first); + } + + Vc_ALWAYS_INLINE AccessSuccessiveEntries operator[](size_t first) { - return operator[](indexes); + return AccessSuccessiveEntries(m_data, first); } //Vc_ALWAYS_INLINE Access scatter(I indexes, VArg v0, VArg v1); }; -#ifndef DOXYGEN -} // namespace Common +} // namespace Common using Common::InterleavedMemoryWrapper; -#endif -} // namespace Vc -/*OUTER_NAMESPACE_END*/ - -#include "undomacros.h" +template +inline Common::InterleavedMemoryWrapper make_interleave_wrapper(S *s) +{ + return Common::InterleavedMemoryWrapper(s); +} +} // namespace Vc -#endif // VC_COMMON_INTERLEAVEDMEMORY_H +#endif // VC_COMMON_INTERLEAVEDMEMORY_H_ diff -Nru vc-0.7.4/common/interleave.h vc-1.3.0/common/interleave.h --- vc-0.7.4/common/interleave.h 1969-12-31 18:00:00.000000000 -0600 +++ vc-1.3.0/common/interleave.h 2016-10-27 02:05:02.000000000 -0500 @@ -0,0 +1,63 @@ +/* This file is part of the Vc library. {{{ +Copyright © 2014-2015 Matthias Kretz + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the names of contributing organizations nor the + names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY +DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +}}}*/ + +#ifndef VC_COMMON_INTERLEAVE_H_ +#define VC_COMMON_INTERLEAVE_H_ + +#include "macros.h" + +namespace Vc_VERSIONED_NAMESPACE +{ +/** \ingroup Utilities + Interleaves the entries from \p a and \p b into two vectors of the same type. The order + in the returned vector contains the elements `a[0], b[0], a[1], b[1], a[2], b[2], a[3], + b[3], ...`. 
+ +Example: +\code +Vc::SimdArray a = { 1, 2, 3, 4 }; +Vc::SimdArray b = { 9, 8, 7, 6 }; +std::tie(a, b) = Vc::interleave(a, b); +std::cout << a << b; +// prints: +// <1 9 2 8><3 7 4 6> +\endcode + + \param a input vector whose data will appear at even indexes in the output + \param b input vector whose data will appear at odd indexes in the output + \return two vectors with data from \p a and \p b interleaved + */ +template ::value>> +std::pair interleave(const V &a, const V &b) +{ + return {a.interleaveLow(b), a.interleaveHigh(b)}; +} +} // namespace Vc + +#endif // VC_COMMON_INTERLEAVE_H_ + +// vim: foldmethod=marker diff -Nru vc-0.7.4/common/iterators.h vc-1.3.0/common/iterators.h --- vc-0.7.4/common/iterators.h 1969-12-31 18:00:00.000000000 -0600 +++ vc-1.3.0/common/iterators.h 2016-10-27 02:05:02.000000000 -0500 @@ -0,0 +1,307 @@ +/* This file is part of the Vc library. {{{ +Copyright © 2013-2015 Matthias Kretz + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the names of contributing organizations nor the + names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY +DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +}}}*/ + +#ifndef VC_COMMON_ITERATORS_H_ +#define VC_COMMON_ITERATORS_H_ + +#include +#include +#include "where.h" +#include "elementreference.h" +#include "macros.h" + +namespace Vc_VERSIONED_NAMESPACE +{ +namespace Common +{ + +template class MemoryVector; +template class MemoryVectorIterator; + +template class Iterator; +template class IteratorBase; +template class IteratorBase +{ +public: + using iterator_category = std::input_iterator_tag; + using value_type = typename V::value_type; + using difference_type = int; + using reference = value_type; + Vc_ALWAYS_INLINE reference operator*() const { return v()[i()]; } + Vc_ALWAYS_INLINE reference operator[](difference_type i2) const { return v()[i2]; } + +private: + Vc_INTRINSIC V &v() const { return *static_cast *>(this)->v; } + Vc_INTRINSIC difference_type i() const + { + return static_cast *>(this)->i; + } +}; + +template class IteratorBase +{ +public: + using iterator_category = std::input_iterator_tag; + using value_type = typename V::value_type; + using difference_type = int; + using reference = Vc::Detail::ElementReference; + Vc_ALWAYS_INLINE reference operator*() const { return {*v(), i()}; } + Vc_ALWAYS_INLINE reference operator[](difference_type i2) const { return {*v(), i2}; } + +private: + Vc_INTRINSIC V *v() const { return static_cast *>(this)->v; } + Vc_INTRINSIC difference_type i() const + { + return static_cast *>(this)->i; + } + + friend reference; + static Vc_INTRINSIC value_type get(const V &o, int i) + { + return o[i]; + } + template static Vc_INTRINSIC void set(V &o, int i, T &&v) + { + o[i] = std::forward(v); + } +}; + +// class Iterator {{{ +template class Iterator : public IteratorBase::value> +{ + using Base = IteratorBase::value>; + friend Base; + +public: + using typename Base::iterator_category; + using typename Base::value_type; + using typename Base::difference_type; + using pointer = const Iterator *; + using typename Base::reference; + + constexpr Iterator() = default; + constexpr Iterator(V &_v, difference_type _i) : v(&_v), i(_i) {} + // rely on implicit copy constructor/assignment + + Vc_ALWAYS_INLINE pointer operator->() const { return this; } + using Base::operator*; + + Vc_ALWAYS_INLINE Iterator &operator++() { ++i; return *this; } + Vc_ALWAYS_INLINE Iterator operator++(int) { Iterator tmp = *this; ++i; return tmp; } + + // bidirectional iteration is supported + Vc_ALWAYS_INLINE Iterator &operator--() { --i; return *this; } + Vc_ALWAYS_INLINE Iterator operator--(int) { Iterator tmp = *this; --i; return tmp; } + + // RandomAccessIterator: + using Base::operator[]; + Vc_ALWAYS_INLINE Iterator &operator+=(difference_type d) { i += d; return *this; } + Vc_ALWAYS_INLINE Iterator &operator-=(difference_type d) { i -= d; return *this; } + Vc_ALWAYS_INLINE Iterator operator+(difference_type d) const { return {*v, i + d}; } + Vc_ALWAYS_INLINE Iterator operator-(difference_type d) const { return {*v, i - d}; } + Vc_ALWAYS_INLINE difference_type operator-(const Iterator &rhs) const { return i - rhs.i; } + friend Vc_ALWAYS_INLINE Iterator operator+(difference_type d, const Iterator &rhs) + { + return {*rhs.v, rhs.i + d}; + } + + // InputIterator would not need to test v == rhs.v, but except for `reference` this + // class implements a complete RandomAccessIterator + Vc_ALWAYS_INLINE bool operator==(const Iterator &rhs) const { return v == rhs.v && i == rhs.i; } + Vc_ALWAYS_INLINE bool operator!=(const Iterator &rhs) const { return v == rhs.v && i != rhs.i; } + Vc_ALWAYS_INLINE bool operator< (const Iterator &rhs) 
const { return v == rhs.v && i < rhs.i; } + Vc_ALWAYS_INLINE bool operator<=(const Iterator &rhs) const { return v == rhs.v && i <= rhs.i; } + Vc_ALWAYS_INLINE bool operator> (const Iterator &rhs) const { return v == rhs.v && i > rhs.i; } + Vc_ALWAYS_INLINE bool operator>=(const Iterator &rhs) const { return v == rhs.v && i >= rhs.i; } + +private: + V *v = nullptr; + difference_type i = 0; +};/*}}}*/ + +template using ConstIterator = Iterator; + +#ifdef Vc_IMPL_MIC + class BitmaskIterator/*{{{*/ + { + const int mask; + int bit; + public: + Vc_ALWAYS_INLINE BitmaskIterator(int m) : mask(m), bit(_mm_tzcnt_32(mask)) {} + Vc_ALWAYS_INLINE BitmaskIterator(const BitmaskIterator &) = default; + Vc_ALWAYS_INLINE BitmaskIterator(BitmaskIterator &&) = default; + + Vc_ALWAYS_INLINE size_t operator->() const { return bit; } + Vc_ALWAYS_INLINE size_t operator*() const { return bit; } + + Vc_ALWAYS_INLINE BitmaskIterator &operator++() { + bit = _mm_tzcnti_32(bit, mask); + return *this; + } + Vc_ALWAYS_INLINE BitmaskIterator operator++(int) { + BitmaskIterator tmp = *this; + bit = _mm_tzcnti_32(bit, mask); + return tmp; + } + + Vc_ALWAYS_INLINE bool operator==(const BitmaskIterator &rhs) const { return bit == rhs.bit; } + Vc_ALWAYS_INLINE bool operator!=(const BitmaskIterator &rhs) const { return bit != rhs.bit; } + };/*}}}*/ +#else + class BitmaskIterator/*{{{*/ + { +#ifdef Vc_MSVC + unsigned long mask; + unsigned long bit; +#else + size_t mask; + size_t bit; +#endif + + void nextBit() + { +#ifdef Vc_GNU_ASM + bit = __builtin_ctzl(mask); +#elif defined(Vc_MSVC) + _BitScanForward(&bit, mask); +#else +#error "Not implemented yet. Please contact vc-devel@compeng.uni-frankfurt.de" +#endif + } + void resetLsb() + { + // 01100100 - 1 = 01100011 + mask &= (mask - 1); + /* +#ifdef Vc_GNU_ASM + __asm__("btr %1,%0" : "+r"(mask) : "r"(bit)); +#elif defined(_WIN64) + _bittestandreset64(&mask, bit); +#elif defined(_WIN32) + _bittestandreset(&mask, bit); +#else +#error "Not implemented yet. 
Please contact vc-devel@compeng.uni-frankfurt.de" +#endif + */ + } + public: + BitmaskIterator(decltype(mask) m) : mask(m) { nextBit(); } + BitmaskIterator(const BitmaskIterator &) = default; + BitmaskIterator(BitmaskIterator &&) = default; + + Vc_ALWAYS_INLINE size_t operator->() const { return bit; } + Vc_ALWAYS_INLINE size_t operator*() const { return bit; } + + Vc_ALWAYS_INLINE BitmaskIterator &operator++() { resetLsb(); nextBit(); return *this; } + Vc_ALWAYS_INLINE BitmaskIterator operator++(int) { BitmaskIterator tmp = *this; resetLsb(); nextBit(); return tmp; } + + Vc_ALWAYS_INLINE bool operator==(const BitmaskIterator &rhs) const { return mask == rhs.mask; } + Vc_ALWAYS_INLINE bool operator!=(const BitmaskIterator &rhs) const { return mask != rhs.mask; } + };/*}}}*/ +#endif + +template +Vc_ALWAYS_INLINE + enable_if::value || Traits::is_simd_mask::value, + Iterator::type>> + begin(T &&x) +{ + return {std::forward(x), 0}; +} + +template +Vc_ALWAYS_INLINE + enable_if::value || Traits::is_simd_mask::value, + Iterator::type>> + end(T &&x) +{ + using TT = typename std::decay::type; + return {std::forward(x), int(TT::size())}; +} + +template +Vc_ALWAYS_INLINE enable_if< + Traits::is_simd_mask::value || Traits::is_simd_vector::value, ConstIterator> +cbegin(const T &v) +{ + return {v, 0}; +} + +template +Vc_ALWAYS_INLINE enable_if< + Traits::is_simd_mask::value || Traits::is_simd_vector::value, ConstIterator> +cend(const T &v) +{ + return {v, int(T::size())}; +} + +template Vc_ALWAYS_INLINE BitmaskIterator begin(const WhereImpl::WhereMask &w) +{ + return w.mask.toInt(); +} + +template Vc_ALWAYS_INLINE BitmaskIterator end(const WhereImpl::WhereMask &) +{ + return 0; +} + +template Vc_ALWAYS_INLINE MemoryVectorIterator + makeIterator(T *mem, Flags) +{ + return new(mem) MemoryVector; +} + +template Vc_ALWAYS_INLINE MemoryVectorIterator + makeIterator(const T *mem, Flags) +{ + return new(const_cast(mem)) MemoryVector; +} + +template Vc_ALWAYS_INLINE MemoryVectorIterator + makeIterator(MemoryVector &mv, Flags) +{ + return new(&mv) MemoryVector; +} + +template Vc_ALWAYS_INLINE MemoryVectorIterator + makeIterator(MemoryVector &mv, Flags) +{ + return new(&mv) MemoryVector; +} + +} // namespace Common + +using Common::begin; +using Common::end; +using Common::cbegin; +using Common::cend; +using Common::makeIterator; +} // namespace Vc + +#endif // VC_COMMON_ITERATORS_H_ + +// vim: foldmethod=marker diff -Nru vc-0.7.4/common/loadinterface.h vc-1.3.0/common/loadinterface.h --- vc-0.7.4/common/loadinterface.h 1969-12-31 18:00:00.000000000 -0600 +++ vc-1.3.0/common/loadinterface.h 2016-10-27 02:05:02.000000000 -0500 @@ -0,0 +1,105 @@ +/* This file is part of the Vc library. {{{ +Copyright © 2014-2015 Matthias Kretz + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the names of contributing organizations nor the + names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. 
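A minimal usage sketch of the iteration support from common/iterators.h above (a sketch only; it assumes a complete Vc build where float_v and float_m are available):

    #include <Vc/Vc>
    #include <iostream>

    void iterationExample()
    {
        Vc::float_v v = Vc::float_v::IndexesFromZero();  // 0, 1, 2, ...
        for (float x : v) {                    // Common::begin/end -> Iterator over the entries
            std::cout << x << ' ';
        }
        const Vc::float_m m = v > 1.f;
        for (size_t i : Vc::where(m)) {        // begin/end(WhereMask) -> BitmaskIterator
            v[i] = 0.f;                        // visits only the set mask bits
        }
    }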
+ +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY +DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +}}}*/ + +// load ctors{{{1 +/** + * Construct a vector from loading its entries from the array at \p mem. + * + * \param mem A pointer to data. The pointer must not be aligned on a + * MemoryAlignment boundary unless you add the Vc::Aligned flag as a second + * argument. + */ +explicit Vc_INTRINSIC Vector(const EntryType *mem) +{ + load(mem); +} +/** + * Construct a vector from loading its entries from the array at \p mem. + * + * \param mem A pointer to data. If \p flags contains the Vc::Aligned flag, the pointer + * must be aligned on a MemoryAlignment boundary. + * \param flags A (combination of) flag object(s), such as Vc::Aligned, Vc::Streaming, + * Vc::Unaligned, and/or Vc::PrefetchDefault. + */ +template ::value>> +explicit Vc_INTRINSIC Vector(const EntryType *mem, Flags flags) +{ + load(mem, flags); +} + +template ::value || !std::is_integral::value || + sizeof(EntryType) >= sizeof(U)) && + std::is_arithmetic::value &&Traits::is_load_store_flag::value>> +explicit Vc_INTRINSIC Vector(const U *x, Flags flags = Flags()) +{ + load(x, flags); +} + +// load member functions{{{1 +/** + * Load the vector entries from \p mem, overwriting the previous values. + * + * \param mem + * A pointer to data. The pointer must not be aligned on a MemoryAlignment boundary unless + * you add the Vc::Aligned flag as a second argument. + */ +Vc_INTRINSIC void load(const EntryType *mem) +{ + load(mem, DefaultLoadTag()); +} +/** + * Load the vector entries from \p mem, overwriting the previous values. + * + * \param mem + * A pointer to data. If \p flags contains the Vc::Aligned flag, the pointer must be + * aligned on a MemoryAlignment boundary. + * \param flags + * A (combination of) flag object(s), such as Vc::Aligned, Vc::Streaming, Vc::Unaligned, + * and/or Vc::PrefetchDefault. + */ +template +Vc_INTRINSIC enable_if::value, void> +load(const EntryType *mem, Flags flags) +{ + load(mem, flags); +} +private: +template +struct load_concept : public std::enable_if< + (!std::is_integral::value || !std::is_integral::value || + sizeof(EntryType) >= sizeof(U)) && + std::is_arithmetic::value && Traits::is_load_store_flag::value, void> +{}; + +public: +template +Vc_INTRINSIC_L typename load_concept::type load(const U *mem, Flags = Flags()) Vc_INTRINSIC_R; +//}}}1 + +// vim: foldmethod=marker diff -Nru vc-0.7.4/common/loadstoreflags.h vc-1.3.0/common/loadstoreflags.h --- vc-0.7.4/common/loadstoreflags.h 1969-12-31 18:00:00.000000000 -0600 +++ vc-1.3.0/common/loadstoreflags.h 2016-10-27 02:05:02.000000000 -0500 @@ -0,0 +1,248 @@ +/* This file is part of the Vc library. 
{{{ +Copyright © 2013-2015 Matthias Kretz + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the names of contributing organizations nor the + names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY +DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +}}}*/ + +#ifndef VC_COMMON_LOADSTOREFLAGS_H_ +#define VC_COMMON_LOADSTOREFLAGS_H_ + +#include "../traits/type_traits.h" + +namespace Vc_VERSIONED_NAMESPACE +{ + +/** + * Hint for \ref Prefetch to select prefetches that mark the memory as exclusive. + * + * This hint may optimize the prefetch if the memory will subsequently be written to. + */ +struct Exclusive {}; +/** + * Hint for \ref Prefetch to select prefetches that mark the memory as shared. + */ +struct Shared {}; + +namespace LoadStoreFlags +{ + +struct StreamingFlag {}; +struct UnalignedFlag {}; +struct PrefetchFlagBase {}; +#ifdef Vc_IMPL_MIC +template struct PrefetchFlag : public PrefetchFlagBase +{ + typedef ExclusiveOrShared_ ExclusiveOrShared; + static constexpr size_t L1Stride = L1; + static constexpr size_t L2Stride = L2; + static constexpr bool IsExclusive = std::is_same::value; + static constexpr bool IsShared = std::is_same::value; +}; + +template struct ExtractType +{ + typedef Default type; +}; +template struct ExtractType +{ + typedef typename std::conditional::value, T, typename ExtractType::type>::type type; +}; + +// ICC warns about the constexpr members in LoadStoreFlags: member "LoadStoreFlags::IsAligned" was declared but never referenced +// who needs that warning, especially if it was referenced... +// The warning cannot be reenabled because it gets emitted whenever the LoadStoreFlags is instantiated +// somewhere, so it could be anywhere. +#ifdef Vc_ICC +#pragma warning(disable: 177) +#endif +/**\internal + * Implementation of the load/store flags mechanism. This is internal API. Only some + * concrete aliases are API-relevant types. 
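 *
 * A rough sketch of how the public flag objects defined further below map onto this
 * class template (template arguments reconstructed here for illustration only):
 * \code
 * // Vc::Aligned                  -> LoadStoreFlags<>                (IsAligned)
 * // Vc::Unaligned                -> LoadStoreFlags<UnalignedFlag>   (IsUnaligned)
 * // Vc::Streaming                -> LoadStoreFlags<StreamingFlag>   (IsAligned && IsStreaming)
 * // Vc::Aligned | Vc::Streaming  -> LoadStoreFlags<StreamingFlag>   (operator| concatenates)
 * // Vc::PrefetchDefault          -> LoadStoreFlags<PrefetchFlag<>>  (IsPrefetch)
 * \endcode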
+ */ +template struct LoadStoreFlags +{ +private: + // ICC doesn't grok this line: + //template using TestFlag = std::is_same::type, void>; + typedef typename ExtractType, Flags...>::type Prefetch; + +public: + constexpr LoadStoreFlags() {} + + static constexpr bool IsStreaming = !std::is_same::type, void>::value; + static constexpr bool IsUnaligned = !std::is_same::type, void>::value; + static constexpr bool IsAligned = !IsUnaligned; + static constexpr bool IsPrefetch = !std::is_same::type, void>::value; + static constexpr bool IsExclusivePrefetch = Prefetch::IsExclusive; + static constexpr bool IsSharedPrefetch = Prefetch::IsShared; + static constexpr size_t L1Stride = Prefetch::L1Stride; + static constexpr size_t L2Stride = Prefetch::L2Stride; + + typedef LoadStoreFlags::value, void, Flags>::type...> UnalignedRemoved; + + // The following EnableIf* convenience types cannot use enable_if because then no LoadStoreFlags type + // could ever be instantiated. Instead these types are defined either as void* or void. The + // function that does SFINAE then assigns "= nullptr" to this type. Thus, the ones with just + // void result in substitution failure. + typedef typename std::conditional::type EnableIfAligned; + typedef typename std::conditional::type EnableIfStreaming; + typedef typename std::conditional::type EnableIfUnalignedNotStreaming; + typedef typename std::conditional::type EnableIfUnalignedAndStreaming; + typedef typename std::conditional::type EnableIfUnaligned; + typedef typename std::conditional::type EnableIfNotUnaligned; + typedef typename std::conditional::type EnableIfPrefetch; + typedef typename std::conditional::type EnableIfNotPrefetch; +}; + +/**\internal + * Specialization for no flags (i.e aligned, non-streaming, no prefetching) + */ +template<> struct LoadStoreFlags<> +{ + constexpr LoadStoreFlags() {} + + static constexpr bool IsStreaming = false; + static constexpr bool IsUnaligned = false; + static constexpr bool IsAligned = !IsUnaligned; + static constexpr bool IsPrefetch = false; + static constexpr bool IsExclusivePrefetch = false; + static constexpr bool IsSharedPrefetch = false; + static constexpr size_t L1Stride = 0; + static constexpr size_t L2Stride = 0; + typedef void* EnableIfAligned; + typedef void* EnableIfNotUnaligned; + typedef void* EnableIfNotPrefetch; +}; + +/** + * Operator for concatenation of LoadStoreFlags. + * + * Example: + * \code + * float_v x(mem, Vc::Aligned | Vc::Streaming); + * \endcode + */ +template +constexpr LoadStoreFlags operator|(LoadStoreFlags, LoadStoreFlags) +{ + return LoadStoreFlags(); +} + +} // LoadStoreFlags namespace + +using LoadStoreFlags::PrefetchFlag; + +typedef LoadStoreFlags::LoadStoreFlags<> AlignedTag; +typedef LoadStoreFlags::LoadStoreFlags StreamingTag; +typedef LoadStoreFlags::LoadStoreFlags UnalignedTag; + +/// The default load tag type uses unaligned (non-streaming) loads. +typedef UnalignedTag DefaultLoadTag; +/// The default store tag type uses unaligned (non-streaming) stores. +typedef UnalignedTag DefaultStoreTag; + +/**\addtogroup Utilities + * @{ + */ +/** + * Use this object for a \p flags parameter to request aligned loads and stores. + * + * It specifies that a load/store can expect a memory address that is aligned on + * the correct boundary. (i.e. \p MemoryAlignment) + * + * \warning + * If you specify Aligned, but the memory address is not aligned the program + * will most likely crash. 
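 *
 * A minimal sketch (assuming the pointer comes from Vc::malloc and therefore
 * satisfies the MemoryAlignment requirement):
 * \code
 * float *mem = Vc::malloc<float, Vc::AlignOnVector>(float_v::Size);
 * float_v v(mem, Vc::Aligned);   // aligned load
 * v.store(mem, Vc::Aligned);     // aligned store
 * Vc::free(mem);
 * \endcode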
+ */ +constexpr AlignedTag Aligned; + +/** + * Use this object for a \p flags parameter to request unaligned loads and stores. + * + * It specifies that a load/store can \em not expect a memory address that is + * aligned on the correct boundary. (i.e. alignment is less than + * \p MemoryAlignment) + * + * \note + * If you specify Unaligned, but the memory address is aligned the load/store + * will execute slightly slower than necessary. + */ +constexpr UnalignedTag Unaligned; + +/** + * Use this object for a \p flags parameter to request streaming loads and stores. + * + * It specifies that the cache should be bypassed for the given load/store. + * Whether this will actually be done depends on the target system's capabilities. + * + * Streaming stores can be interesting when the code calculates values that, after being + * written to memory, will not be used for a long time or used by a different thread. + * + * \note + * Expect that most target systems do not support unaligned streaming loads or stores. + * Therefore, make sure that you also specify Aligned. + */ +constexpr StreamingTag Streaming; + +/** + * Use this object for a \p flags parameter to request default software prefetches to be + * emitted. + */ +constexpr LoadStoreFlags::LoadStoreFlags> PrefetchDefault; +///@} + +/** + * \tparam L1 + * \tparam L2 + * \tparam ExclusiveOrShared + */ +template ::L1Stride, + size_t L2 = PrefetchFlag<>::L2Stride, + typename ExclusiveOrShared = PrefetchFlag<>::ExclusiveOrShared> +struct Prefetch : public LoadStoreFlags::LoadStoreFlags> +{ +}; + +namespace Traits +{ +///\internal partial specialization for detecting LoadStoreFlags types +template +struct is_loadstoreflag_internal> : public std::true_type +{ +}; +///\internal partial specialization for detecting the derived Prefetch type as a +/// load/store flag. +template +struct is_loadstoreflag_internal> : public std::true_type +{ +}; +} // namespace Traits +} // namespace Vc + +#endif // VC_COMMON_LOADSTOREFLAGS_H_ diff -Nru vc-0.7.4/common/logarithm.h vc-1.3.0/common/logarithm.h --- vc-0.7.4/common/logarithm.h 2014-05-15 06:38:24.000000000 -0500 +++ vc-1.3.0/common/logarithm.h 2016-10-27 02:05:02.000000000 -0500 @@ -1,21 +1,29 @@ -/* This file is part of the Vc library. +/* This file is part of the Vc library. {{{ +Copyright © 2009-2015 Matthias Kretz - Copyright (C) 2009-2012 Matthias Kretz +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the names of contributing organizations nor the + names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY +DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - Vc is free software: you can redistribute it and/or modify - it under the terms of the GNU Lesser General Public License as - published by the Free Software Foundation, either version 3 of - the License, or (at your option) any later version. - - Vc is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with Vc. If not, see . - -*/ +}}}*/ /* The log implementations are based on code from Julien Pommier which carries the following copyright information: @@ -45,33 +53,24 @@ (this is the zlib license) */ -#ifndef VC_COMMON_LOGARITHM_H -#define VC_COMMON_LOGARITHM_H +#ifdef Vc_COMMON_MATH_H_INTERNAL -#include "macros.h" -/*OUTER_NAMESPACE_BEGIN*/ -namespace Vc -{ -namespace Common -{ -#ifdef VC__USE_NAMESPACE -using Vc::VC__USE_NAMESPACE::Const; -using Vc::VC__USE_NAMESPACE::Vector; -namespace Internal -{ - using namespace Vc::VC__USE_NAMESPACE::Internal; -} // namespace Internal -#endif enum LogarithmBase { BaseE, Base10, Base2 }; +namespace Detail +{ +template +using Const = typename std::conditional::value, + AVX::Const, SSE::Const>::type; + template struct LogImpl { - template static Vc_ALWAYS_INLINE void log_series(Vector &VC_RESTRICT x, typename Vector::AsArg exponent) { - typedef Vector V; - typedef Const C; + template static Vc_ALWAYS_INLINE void log_series(Vector &Vc_RESTRICT x, typename Vector::AsArg exponent) { + typedef Vector V; + typedef Detail::Const C; // Taylor series around x = 2^exponent // f(x) = ln(x) → exponent * ln(2) → C::ln2_small + C::ln2_large // f'(x) = x⁻¹ → x → 1 @@ -85,7 +84,7 @@ // P(0) is the smallest term and |x| < 1 ⇒ |xⁿ| > |xⁿ⁺¹| // The order of additions must go from smallest to largest terms const V x2 = x * x; // 0 → 4 -#ifdef VC_LOG_ILP +#ifdef Vc_LOG_ILP V y2 = (C::P(6) * /*4 → 8*/ x2 + /* 8 → 11*/ C::P(7) * /*1 → 5*/ x) + /*11 → 14*/ C::P(8); V y0 = (C::P(0) * /*5 → 9*/ x2 + /* 9 → 12*/ C::P(1) * /*2 → 6*/ x) + /*12 → 15*/ C::P(2); V y1 = (C::P(3) * /*6 → 10*/ x2 + /*10 → 13*/ C::P(4) * /*3 → 7*/ x) + /*13 → 16*/ C::P(5); @@ -93,7 +92,7 @@ const V x6 = x3 * x3; // 11 → 15 const V x9 = x6 * x3; // 15 → 19 V y = (y0 * /*19 → 23*/ x9 + /*23 → 26*/ y1 * /*16 → 20*/ x6) + /*26 → 29*/ y2 * /*14 → 18*/ x3; -#elif defined VC_LOG_ILP2 +#elif defined Vc_LOG_ILP2 /* * name start done * movaps %xmm0, %xmm1 ; x 0 1 @@ -138,9 +137,7 @@ + C::P(5) * x6 + C::P(6) * x5 + C::P(7) * x4 + C::P(8) * x3; #else V y = C::P(0); - unrolled_loop16(i, 1, 9, - y = y * x + C::P(i); - ); + Vc::Common::unrolled_loop([&](int i) { y = y * x + C::P(i); }); y *= x * x2; #endif switch (Base) { @@ -171,16 +168,19 @@ } } - static Vc_ALWAYS_INLINE void log_series(Vector &VC_RESTRICT x, Vector::AsArg exponent) { - typedef Vector V; - typedef Const C; +template +static Vc_ALWAYS_INLINE void log_series(Vector &Vc_RESTRICT x, + typename 
Vector::AsArg exponent) +{ + typedef Vector V; + typedef Detail::Const C; const V x2 = x * x; V y = C::P(0); V y2 = C::Q(0) + x; - unrolled_loop16(i, 1, 5, - y = y * x + C::P(i); - y2 = y2 * x + C::Q(i); - ); + Vc::Common::unrolled_loop([&](int i) { + y = y * x + C::P(i); + y2 = y2 * x + C::Q(i); + }); y2 = x / y2; y = y * x + C::P(5); y = x2 * y * y2; @@ -213,10 +213,11 @@ } } - template static inline Vector calc(VC_ALIGNED_PARAMETER(Vector) _x) { - typedef Vector V; +template > +static inline Vector calc(V _x) +{ typedef typename V::Mask M; - typedef Const C; + typedef Detail::Const C; V x(_x); @@ -224,12 +225,13 @@ const M infinityMask = x == V::Zero(); const M denormal = x <= C::min(); - x(denormal) *= V(Vc_buildDouble(1, 0, 54)); // 2²⁵ - V exponent = Internal::exponent(x.data()); // = ⎣log₂(x)⎦ + x(denormal) *= V(Vc::Detail::doubleConstant<1, 0, 54>()); // 2²⁵ + V exponent = Detail::exponent(x.data()); // = ⎣log₂(x)⎦ exponent(denormal) -= 54; x.setZero(C::exponentMask()); // keep only the fractional part ⇒ x ∈ [1, 2[ - x |= C::_1_2(); // and set the exponent to 2⁻¹ ⇒ x ∈ [½, 1[ + x = Detail::operator|(x, + C::_1_2()); // and set the exponent to 2⁻¹ ⇒ x ∈ [½, 1[ // split calculation in two cases: // A: x ∈ [½, √½[ @@ -250,28 +252,22 @@ return x; } }; +} // namespace Detail -template static Vc_ALWAYS_INLINE Vc_CONST Vector log(VC_ALIGNED_PARAMETER(Vector) x) { - return LogImpl::calc(x); -} -template static Vc_ALWAYS_INLINE Vc_CONST Vector log10(VC_ALIGNED_PARAMETER(Vector) x) { - return LogImpl::calc(x); +template +Vc_INTRINSIC Vc_CONST Vector log(const Vector &x) +{ + return Detail::LogImpl::calc(x); } -template static Vc_ALWAYS_INLINE Vc_CONST Vector log2(VC_ALIGNED_PARAMETER(Vector) x) { - return LogImpl::calc(x); +template +Vc_INTRINSIC Vc_CONST Vector log10(const Vector &x) +{ + return Detail::LogImpl::calc(x); } -} // namespace Common -#ifdef VC__USE_NAMESPACE -namespace VC__USE_NAMESPACE +template +Vc_INTRINSIC Vc_CONST Vector log2(const Vector &x) { - using Vc::Common::log; - using Vc::Common::log10; - using Vc::Common::log2; -} // namespace VC__USE_NAMESPACE -#undef VC__USE_NAMESPACE -#endif -} // namespace Vc -/*OUTER_NAMESPACE_END*/ -#include "undomacros.h" + return Detail::LogImpl::calc(x); +} -#endif // VC_COMMON_LOGARITHM_H +#endif // Vc_COMMON_MATH_H_INTERNAL diff -Nru vc-0.7.4/common/macros.h vc-1.3.0/common/macros.h --- vc-0.7.4/common/macros.h 2014-05-15 06:38:24.000000000 -0500 +++ vc-1.3.0/common/macros.h 2016-10-27 02:05:02.000000000 -0500 @@ -1,56 +1,77 @@ -/* This file is part of the Vc library. +/* This file is part of the Vc library. {{{ +Copyright © 2010-2015 Matthias Kretz - Copyright (C) 2010-2012 Matthias Kretz +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the names of contributing organizations nor the + names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. 
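A short usage sketch for the element-wise log functions defined in common/logarithm.h above (a sketch only; double_v works the same way as float_v):

    #include <Vc/Vc>

    Vc::float_v logExample(Vc::float_v x)
    {
        const Vc::float_v ln  = Vc::log(x);    // natural logarithm of every element
        const Vc::float_v l10 = Vc::log10(x);  // base-10 logarithm
        const Vc::float_v l2  = Vc::log2(x);   // base-2 logarithm
        return ln + l10 + l2;
    }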
+ +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY +DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - Vc is free software: you can redistribute it and/or modify - it under the terms of the GNU Lesser General Public License as - published by the Free Software Foundation, either version 3 of - the License, or (at your option) any later version. +}}}*/ - Vc is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with Vc. If not, see . - -*/ - -#ifndef VC_COMMON_MACROS_H -#define VC_COMMON_MACROS_H -#undef VC_COMMON_UNDOMACROS_H +#ifndef VC_COMMON_MACROS_H_ +#define VC_COMMON_MACROS_H_ #include -#if defined(VC_GCC) && !defined(__OPTIMIZE__) -# if VC_GCC >= 0x40500 -# pragma GCC diagnostic push -# define Vc_POP_GCC_DIAGNOSTIC__ 1 -# endif -// GCC uses lots of old-style-casts in macros that disguise as intrinsics -# pragma GCC diagnostic ignored "-Wold-style-cast" -#endif - -#ifdef VC_MSVC -# define ALIGN(n) __declspec(align(n)) -# define STRUCT_ALIGN1(n) ALIGN(n) -# define STRUCT_ALIGN2(n) -# define ALIGNED_TYPEDEF(n, _type_, _newType_) typedef ALIGN(n) _type_ _newType_ -#else -# define ALIGN(n) __attribute__((aligned(n))) -# define STRUCT_ALIGN1(n) -# define STRUCT_ALIGN2(n) ALIGN(n) -# define ALIGNED_TYPEDEF(n, _type_, _newType_) typedef _type_ _newType_ ALIGN(n) -#endif - -#ifdef VC_CXX11 -#define Vc_ALIGNOF(_TYPE_) alignof(_TYPE_) -#else -#define Vc_ALIGNOF(_TYPE_) __alignof(_TYPE_) -#endif -#ifdef VC_CLANG +#ifdef Vc_MSVC +#define Vc_ALIGNED_TYPEDEF(n_, type_, new_type_) \ + typedef __declspec(align(n_)) type_ new_type_ +#elif __GNUC__ +#define Vc_ALIGNED_TYPEDEF(n_, type_, new_type_) \ + typedef type_ new_type_[[gnu::aligned(n_)]] +#else // the following is actually ill-formed according to C++1[14] +#define Vc_ALIGNED_TYPEDEF(n_, type_, new_type_) \ + using new_type_ alignas(sizeof(n_)) = type_ +#endif + +// On Windows (WIN32) we might see macros called min and max. Just undefine them and hope +// noone (re)defines them (NOMINMAX should help). +#ifdef WIN32 +#define NOMINMAX 1 +#if defined min +#undef min +#endif +#if defined max +#undef max +#endif +#endif // WIN32 + +#if defined Vc_GCC && Vc_GCC >= 0x60000 +// GCC 6 drops all attributes on types passed as template arguments. This is important +// if a may_alias gets lost and therefore needs to be readded in the implementation of +// the class template. +#define Vc_TEMPLATES_DROP_ATTRIBUTES 1 +#endif + +#if Vc_IS_VERSION_2 || (defined Vc_GCC && Vc_GCC >= 0x60000) +// GCC 6 optimizes the RowMemory::fromRawData hack away (common/memorybase.h). Therefore +// the 2D Memory class is implemented recursively using 1D Memory members. 
Since this is +// an ABI break this is only enabled for GCC 6. With Vc 2.x all implementations should do +// this. +#define Vc_RECURSIVE_MEMORY 1 +#endif + +#if defined Vc_CLANG || defined Vc_APPLECLANG +# define Vc_UNREACHABLE __builtin_unreachable +# define Vc_NEVER_INLINE [[gnu::noinline]] # define Vc_INTRINSIC_L inline # define Vc_INTRINSIC_R __attribute__((always_inline)) # define Vc_INTRINSIC Vc_INTRINSIC_L Vc_INTRINSIC_R @@ -65,37 +86,33 @@ # define Vc_ALWAYS_INLINE_L inline # define Vc_ALWAYS_INLINE_R __attribute__((always_inline)) # define Vc_ALWAYS_INLINE Vc_ALWAYS_INLINE_L Vc_ALWAYS_INLINE_R -# define VC_IS_UNLIKELY(x) __builtin_expect(x, 0) -# define VC_IS_LIKELY(x) __builtin_expect(x, 1) -# define VC_RESTRICT __restrict__ -# define VC_DEPRECATED(msg) +# define Vc_IS_UNLIKELY(x) __builtin_expect(x, 0) +# define Vc_IS_LIKELY(x) __builtin_expect(x, 1) +# define Vc_RESTRICT __restrict__ +# define Vc_DEPRECATED(msg) +# define Vc_DEPRECATED_ALIAS(msg) +# define Vc_WARN_UNUSED_RESULT __attribute__((__warn_unused_result__)) #elif defined(__GNUC__) -# if (defined(VC_GCC) && VC_GCC < 0x40300) || defined(VC_OPEN64) -// GCC 4.1 and 4.2 ICE on may_alias. Since Open64 uses the GCC 4.2 frontend it has the same problem. +# define Vc_UNREACHABLE __builtin_unreachable +# if defined Vc_GCC && !defined __OPTIMIZE__ # define Vc_MAY_ALIAS # else # define Vc_MAY_ALIAS __attribute__((__may_alias__)) # endif -# if (defined(VC_GCC) && VC_GCC < 0x40300) -// GCC 4.1 fails with "sorry unimplemented: inlining failed" -# define Vc_INTRINSIC_R __attribute__((__flatten__)) -# elif defined(VC_OPEN64) -// the GCC 4.2 frontend doesn't know the __artificial__ attribute -# define Vc_INTRINSIC_R __attribute__((__flatten__, __always_inline__)) -# else -# define Vc_INTRINSIC_R __attribute__((__flatten__, __always_inline__, __artificial__)) -# endif +# define Vc_INTRINSIC_R __attribute__((__always_inline__, __artificial__)) # define Vc_INTRINSIC_L inline # define Vc_INTRINSIC Vc_INTRINSIC_L Vc_INTRINSIC_R # define Vc_FLATTEN __attribute__((__flatten__)) # define Vc_ALWAYS_INLINE_L inline # define Vc_ALWAYS_INLINE_R __attribute__((__always_inline__)) # define Vc_ALWAYS_INLINE Vc_ALWAYS_INLINE_L Vc_ALWAYS_INLINE_R -# ifdef VC_ICC - // ICC miscompiles if there are functions marked as pure or const +# ifdef Vc_ICC +// ICC miscompiles if there are functions marked as pure or const # define Vc_PURE # define Vc_CONST +# define Vc_NEVER_INLINE # else +# define Vc_NEVER_INLINE [[gnu::noinline]] # define Vc_PURE __attribute__((__pure__)) # define Vc_CONST __attribute__((__const__)) # endif @@ -103,17 +120,25 @@ # define Vc_CONST_R Vc_CONST # define Vc_PURE_L # define Vc_PURE_R Vc_PURE -# define VC_IS_UNLIKELY(x) __builtin_expect(x, 0) -# define VC_IS_LIKELY(x) __builtin_expect(x, 1) -# define VC_RESTRICT __restrict__ -# define VC_DEPRECATED(msg) __attribute__((__deprecated__(msg))) +# define Vc_IS_UNLIKELY(x) __builtin_expect(x, 0) +# define Vc_IS_LIKELY(x) __builtin_expect(x, 1) +# define Vc_RESTRICT __restrict__ +# ifdef Vc_ICC +# define Vc_DEPRECATED(msg) +# define Vc_DEPRECATED_ALIAS(msg) +# else +# define Vc_DEPRECATED(msg) __attribute__((__deprecated__(msg))) +# define Vc_DEPRECATED_ALIAS(msg) __attribute__((__deprecated__(msg))) +# endif +# define Vc_WARN_UNUSED_RESULT __attribute__((__warn_unused_result__)) #else +# define Vc_NEVER_INLINE # define Vc_FLATTEN # ifdef Vc_PURE # undef Vc_PURE # endif # define Vc_MAY_ALIAS -# ifdef VC_MSVC +# ifdef Vc_MSVC # define Vc_ALWAYS_INLINE inline __forceinline # define 
Vc_ALWAYS_INLINE_L Vc_ALWAYS_INLINE # define Vc_ALWAYS_INLINE_R @@ -126,6 +151,13 @@ # define Vc_INTRINSIC inline __forceinline # define Vc_INTRINSIC_L Vc_INTRINSIC # define Vc_INTRINSIC_R +namespace Vc_VERSIONED_NAMESPACE { +namespace detail +{ +static Vc_INTRINSIC void unreachable() { __assume(0); } +} // namespace detail +} +# define Vc_UNREACHABLE Vc::detail::unreachable # else # define Vc_ALWAYS_INLINE # define Vc_ALWAYS_INLINE_L @@ -139,246 +171,162 @@ # define Vc_INTRINSIC # define Vc_INTRINSIC_L # define Vc_INTRINSIC_R +# define Vc_UNREACHABLE std::abort # endif -# define VC_IS_UNLIKELY(x) x -# define VC_IS_LIKELY(x) x -# define VC_RESTRICT __restrict -# define VC_DEPRECATED(msg) __declspec(deprecated(msg)) -#endif +# define Vc_IS_UNLIKELY(x) x +# define Vc_IS_LIKELY(x) x +# define Vc_RESTRICT __restrict +# define Vc_DEPRECATED(msg) __declspec(deprecated(msg)) +# define Vc_DEPRECATED_ALIAS(msg) +# define Vc_WARN_UNUSED_RESULT +#endif + +#ifdef Vc_CXX14 +#undef Vc_DEPRECATED +#define Vc_DEPRECATED(msg_) [[deprecated(msg_)]] +#endif + +#define Vc_NOTHING_EXPECTING_SEMICOLON static_assert(true, "") + +#define Vc_FREE_STORE_OPERATORS_ALIGNED(align_) \ + /**\name new/delete overloads for correct alignment */ \ + /**@{*/ \ + /*!\brief Allocates correctly aligned memory */ \ + Vc_ALWAYS_INLINE void *operator new(size_t size) \ + { \ + return Vc::Common::aligned_malloc(size); \ + } \ + /*!\brief Returns \p p. */ \ + Vc_ALWAYS_INLINE void *operator new(size_t, void *p) { return p; } \ + /*!\brief Allocates correctly aligned memory */ \ + Vc_ALWAYS_INLINE void *operator new[](size_t size) \ + { \ + return Vc::Common::aligned_malloc(size); \ + } \ + /*!\brief Returns \p p. */ \ + Vc_ALWAYS_INLINE void *operator new[](size_t, void *p) { return p; } \ + /*!\brief Frees aligned memory. */ \ + Vc_ALWAYS_INLINE void operator delete(void *ptr, size_t) { Vc::Common::free(ptr); } \ + /*!\brief Does nothing. */ \ + Vc_ALWAYS_INLINE void operator delete(void *, void *) {} \ + /*!\brief Frees aligned memory. */ \ + Vc_ALWAYS_INLINE void operator delete[](void *ptr, size_t) \ + { \ + Vc::Common::free(ptr); \ + } \ + /*!\brief Does nothing. 
*/ \ + Vc_ALWAYS_INLINE void operator delete[](void *, void *) {} \ + /**@}*/ \ + Vc_NOTHING_EXPECTING_SEMICOLON -#if __cplusplus >= 201103 /*C++11*/ -#define _VC_CONSTEXPR constexpr -#define _VC_CONSTEXPR_L _VC_CONSTEXPR -#define _VC_CONSTEXPR_R -#else -#define _VC_CONSTEXPR Vc_INTRINSIC Vc_CONST -#define _VC_CONSTEXPR_L Vc_INTRINSIC_L Vc_CONST_L -#define _VC_CONSTEXPR_R Vc_INTRINSIC_R Vc_CONST_R -#endif - -#ifdef VC_CXX11 -# define _VC_NOEXCEPT noexcept -#else -# define _VC_NOEXCEPT throw() -#endif - -#define FREE_STORE_OPERATORS_ALIGNED(alignment) \ - Vc_ALWAYS_INLINE void *operator new(size_t size) { return _mm_malloc(size, alignment); } \ - Vc_ALWAYS_INLINE void *operator new(size_t, void *p) { return p; } \ - Vc_ALWAYS_INLINE void *operator new[](size_t size) { return _mm_malloc(size, alignment); } \ - Vc_ALWAYS_INLINE void *operator new[](size_t , void *p) { return p; } \ - Vc_ALWAYS_INLINE void operator delete(void *ptr, size_t) { _mm_free(ptr); } \ - Vc_ALWAYS_INLINE void operator delete(void *, void *) {} \ - Vc_ALWAYS_INLINE void operator delete[](void *ptr, size_t) { _mm_free(ptr); } \ - Vc_ALWAYS_INLINE void operator delete[](void *, void *) {} - -#ifdef VC_GCC -# define VC_WARN_INLINE -# define VC_WARN(msg) __attribute__((warning("\n\t" msg))) -#else -# define VC_WARN_INLINE inline -# define VC_WARN(msg) -#endif - -#define unrolled_loop16(_it_, _start_, _end_, _code_) \ -if (_start_ + 0 < _end_) { enum { _it_ = (_start_ + 0) < _end_ ? (_start_ + 0) : _start_ }; _code_ } \ -if (_start_ + 1 < _end_) { enum { _it_ = (_start_ + 1) < _end_ ? (_start_ + 1) : _start_ }; _code_ } \ -if (_start_ + 2 < _end_) { enum { _it_ = (_start_ + 2) < _end_ ? (_start_ + 2) : _start_ }; _code_ } \ -if (_start_ + 3 < _end_) { enum { _it_ = (_start_ + 3) < _end_ ? (_start_ + 3) : _start_ }; _code_ } \ -if (_start_ + 4 < _end_) { enum { _it_ = (_start_ + 4) < _end_ ? (_start_ + 4) : _start_ }; _code_ } \ -if (_start_ + 5 < _end_) { enum { _it_ = (_start_ + 5) < _end_ ? (_start_ + 5) : _start_ }; _code_ } \ -if (_start_ + 6 < _end_) { enum { _it_ = (_start_ + 6) < _end_ ? (_start_ + 6) : _start_ }; _code_ } \ -if (_start_ + 7 < _end_) { enum { _it_ = (_start_ + 7) < _end_ ? (_start_ + 7) : _start_ }; _code_ } \ -if (_start_ + 8 < _end_) { enum { _it_ = (_start_ + 8) < _end_ ? (_start_ + 8) : _start_ }; _code_ } \ -if (_start_ + 9 < _end_) { enum { _it_ = (_start_ + 9) < _end_ ? (_start_ + 9) : _start_ }; _code_ } \ -if (_start_ + 10 < _end_) { enum { _it_ = (_start_ + 10) < _end_ ? (_start_ + 10) : _start_ }; _code_ } \ -if (_start_ + 11 < _end_) { enum { _it_ = (_start_ + 11) < _end_ ? (_start_ + 11) : _start_ }; _code_ } \ -if (_start_ + 12 < _end_) { enum { _it_ = (_start_ + 12) < _end_ ? (_start_ + 12) : _start_ }; _code_ } \ -if (_start_ + 13 < _end_) { enum { _it_ = (_start_ + 13) < _end_ ? (_start_ + 13) : _start_ }; _code_ } \ -if (_start_ + 14 < _end_) { enum { _it_ = (_start_ + 14) < _end_ ? (_start_ + 14) : _start_ }; _code_ } \ -if (_start_ + 15 < _end_) { enum { _it_ = (_start_ + 15) < _end_ ? 
(_start_ + 15) : _start_ }; _code_ } \ -do {} while ( false ) - -#define for_all_vector_entries(_it_, _code_) \ - unrolled_loop16(_it_, 0, Size, _code_) - -#ifdef VC_ASSERT -#define VC_EXTERNAL_ASSERT 1 +#ifdef Vc_ASSERT +#define Vc_EXTERNAL_ASSERT 1 #else #ifdef NDEBUG -#define VC_ASSERT(x) +#define Vc_ASSERT(x) #else #include -#define VC_ASSERT(x) assert(x); +#define Vc_ASSERT(x) assert(x); #endif #endif -#ifdef VC_CLANG -#define VC_HAS_BUILTIN(x) __has_builtin(x) +#if defined Vc_CLANG || defined Vc_APPLECLANG +#define Vc_HAS_BUILTIN(x) __has_builtin(x) #else -#define VC_HAS_BUILTIN(x) 0 +#define Vc_HAS_BUILTIN(x) 0 #endif -#ifndef VC_COMMON_MACROS_H_ONCE -#define VC_COMMON_MACROS_H_ONCE +#define Vc_CAT_HELPER_(a, b, c, d) a##b##c##d +#define Vc_CAT(a, b, c, d) Vc_CAT_HELPER_(a, b, c, d) -#define _VC_CAT_HELPER(a, b, c, d) a##b##c##d -#define _VC_CAT(a, b, c, d) _VC_CAT_HELPER(a, b, c, d) +#define Vc_CAT_IMPL(a, b) a##b +#define Vc_CAT2(a, b) Vc_CAT_IMPL(a, b) -#if __cplusplus >= 201103 /*C++11*/ || (defined(VC_MSVC) && VC_MSVC >= 160000000) -#define VC_STATIC_ASSERT_NC(cond, msg) \ - static_assert(cond, #msg) -#define VC_STATIC_ASSERT(cond, msg) VC_STATIC_ASSERT_NC(cond, msg) -#else // C++98 -/*OUTER_NAMESPACE_BEGIN*/ -namespace Vc { - namespace { - template struct STATIC_ASSERT_FAILURE; - template<> struct STATIC_ASSERT_FAILURE {}; -}} -/*OUTER_NAMESPACE_END*/ - -#define VC_STATIC_ASSERT_NC(cond, msg) \ - typedef STATIC_ASSERT_FAILURE _VC_CAT(static_assert_failed_on_line_,__LINE__,_,msg); \ - enum { \ - _VC_CAT(static_assert_failed__on_line_,__LINE__,_,msg) = sizeof(_VC_CAT(static_assert_failed_on_line_,__LINE__,_,msg)) \ - } -#define VC_STATIC_ASSERT(cond, msg) VC_STATIC_ASSERT_NC(cond, msg) -#endif // C++11/98 - - template struct exponentToMultiplier { enum Values__ { - X = exponentToMultiplier::X * ((e - center < 31) ? 2 : 1), - Value = (X == 0 ? 1 : X) - }; }; - template struct exponentToMultiplier { enum Values__ { X = 1, Value = X }; }; - template struct exponentToMultiplier< -1, center> { enum Values__ { X = 0, Value = 1 }; }; - template struct exponentToMultiplier< -128, center> { enum Values__ { X = 0, Value = 1 }; }; - template struct exponentToMultiplier< -256, center> { enum Values__ { X = 0, Value = 1 }; }; - template struct exponentToMultiplier< -384, center> { enum Values__ { X = 0, Value = 1 }; }; - template struct exponentToMultiplier< -512, center> { enum Values__ { X = 0, Value = 1 }; }; - template struct exponentToMultiplier< -640, center> { enum Values__ { X = 0, Value = 1 }; }; - template struct exponentToMultiplier< -768, center> { enum Values__ { X = 0, Value = 1 }; }; - template struct exponentToMultiplier< -896, center> { enum Values__ { X = 0, Value = 1 }; }; - template struct exponentToMultiplier<-1024, center> { enum Values__ { X = 0, Value = 1 }; }; - - template struct exponentToDivisor { enum Values__ { - X = exponentToDivisor::X * ((center - e < 31) ? 2 : 1), - Value = (X == 0 ? 
1 : X) - }; }; - template struct exponentToDivisor { enum Values__ { X = 1, Value = X }; }; - template struct exponentToDivisor< 1, center> { enum Values__ { X = 0, Value = 1 }; }; - template struct exponentToDivisor< 128, center> { enum Values__ { X = 0, Value = 1 }; }; - template struct exponentToDivisor< 256, center> { enum Values__ { X = 0, Value = 1 }; }; - template struct exponentToDivisor< 384, center> { enum Values__ { X = 0, Value = 1 }; }; - template struct exponentToDivisor< 512, center> { enum Values__ { X = 0, Value = 1 }; }; - template struct exponentToDivisor< 640, center> { enum Values__ { X = 0, Value = 1 }; }; - template struct exponentToDivisor< 768, center> { enum Values__ { X = 0, Value = 1 }; }; - template struct exponentToDivisor< 896, center> { enum Values__ { X = 0, Value = 1 }; }; - template struct exponentToDivisor< 1024, center> { enum Values__ { X = 0, Value = 1 }; }; -#endif // VC_COMMON_MACROS_H_ONCE - -#define _CAT_IMPL(a, b) a##b -#define CAT(a, b) _CAT_IMPL(a, b) - -#define Vc_buildDouble(sign, mantissa, exponent) \ - ((static_cast((mantissa & 0x000fffffffffffffull) | 0x0010000000000000ull) / 0x0010000000000000ull) \ - * exponentToMultiplier::Value \ - * exponentToMultiplier::Value \ - * exponentToMultiplier::Value \ - * exponentToMultiplier::Value \ - / exponentToDivisor::Value \ - / exponentToDivisor::Value \ - / exponentToDivisor::Value \ - / exponentToDivisor::Value \ - * static_cast(sign)) -#define Vc_buildFloat(sign, mantissa, exponent) \ - ((static_cast((mantissa & 0x007fffffu) | 0x00800000) / 0x00800000) \ - * exponentToMultiplier::Value \ - * exponentToMultiplier::Value \ - * exponentToMultiplier::Value \ - * exponentToMultiplier::Value \ - / exponentToDivisor::Value \ - / exponentToDivisor::Value \ - / exponentToDivisor::Value \ - / exponentToDivisor::Value \ - * static_cast(sign)) - -#define _VC_APPLY_IMPL_1(macro, a, b, c, d, e) macro(a) -#define _VC_APPLY_IMPL_2(macro, a, b, c, d, e) macro(a, b) -#define _VC_APPLY_IMPL_3(macro, a, b, c, d, e) macro(a, b, c) -#define _VC_APPLY_IMPL_4(macro, a, b, c, d, e) macro(a, b, c, d) -#define _VC_APPLY_IMPL_5(macro, a, b, c, d, e) macro(a, b, c, d, e) +#define Vc_APPLY_IMPL_1_(macro, a, b, c, d, e) macro(a) +#define Vc_APPLY_IMPL_2_(macro, a, b, c, d, e) macro(a, b) +#define Vc_APPLY_IMPL_3_(macro, a, b, c, d, e) macro(a, b, c) +#define Vc_APPLY_IMPL_4_(macro, a, b, c, d, e) macro(a, b, c, d) +#define Vc_APPLY_IMPL_5_(macro, a, b, c, d, e) macro(a, b, c, d, e) -#define VC_LIST_FLOAT_VECTOR_TYPES(size, macro, a, b, c, d) \ +#define Vc_LIST_FLOAT_VECTOR_TYPES(size, macro, a, b, c, d) \ size(macro, double_v, a, b, c, d) \ - size(macro, float_v, a, b, c, d) \ - size(macro, sfloat_v, a, b, c, d) -#define VC_LIST_INT_VECTOR_TYPES(size, macro, a, b, c, d) \ + size(macro, float_v, a, b, c, d) +#define Vc_LIST_INT_VECTOR_TYPES(size, macro, a, b, c, d) \ size(macro, int_v, a, b, c, d) \ size(macro, uint_v, a, b, c, d) \ size(macro, short_v, a, b, c, d) \ size(macro, ushort_v, a, b, c, d) -#define VC_LIST_VECTOR_TYPES(size, macro, a, b, c, d) \ - VC_LIST_FLOAT_VECTOR_TYPES(size, macro, a, b, c, d) \ - VC_LIST_INT_VECTOR_TYPES(size, macro, a, b, c, d) -#define VC_LIST_COMPARES(size, macro, a, b, c, d) \ +#define Vc_LIST_VECTOR_TYPES(size, macro, a, b, c, d) \ + Vc_LIST_FLOAT_VECTOR_TYPES(size, macro, a, b, c, d) \ + Vc_LIST_INT_VECTOR_TYPES(size, macro, a, b, c, d) +#define Vc_LIST_COMPARES(size, macro, a, b, c, d) \ size(macro, ==, a, b, c, d) \ size(macro, !=, a, b, c, d) \ size(macro, <=, a, b, c, d) \ size(macro, 
>=, a, b, c, d) \ size(macro, < , a, b, c, d) \ size(macro, > , a, b, c, d) -#define VC_LIST_LOGICAL(size, macro, a, b, c, d) \ +#define Vc_LIST_LOGICAL(size, macro, a, b, c, d) \ size(macro, &&, a, b, c, d) \ size(macro, ||, a, b, c, d) -#define VC_LIST_BINARY(size, macro, a, b, c, d) \ +#define Vc_LIST_BINARY(size, macro, a, b, c, d) \ size(macro, |, a, b, c, d) \ size(macro, &, a, b, c, d) \ size(macro, ^, a, b, c, d) -#define VC_LIST_SHIFTS(size, macro, a, b, c, d) \ +#define Vc_LIST_SHIFTS(size, macro, a, b, c, d) \ size(macro, <<, a, b, c, d) \ size(macro, >>, a, b, c, d) -#define VC_LIST_ARITHMETICS(size, macro, a, b, c, d) \ +#define Vc_LIST_ARITHMETICS(size, macro, a, b, c, d) \ size(macro, +, a, b, c, d) \ size(macro, -, a, b, c, d) \ size(macro, *, a, b, c, d) \ size(macro, /, a, b, c, d) \ size(macro, %, a, b, c, d) -#define VC_APPLY_0(_list, macro) _list(_VC_APPLY_IMPL_1, macro, 0, 0, 0, 0) -#define VC_APPLY_1(_list, macro, a) _list(_VC_APPLY_IMPL_2, macro, a, 0, 0, 0) -#define VC_APPLY_2(_list, macro, a, b) _list(_VC_APPLY_IMPL_3, macro, a, b, 0, 0) -#define VC_APPLY_3(_list, macro, a, b, c) _list(_VC_APPLY_IMPL_4, macro, a, b, c, 0) -#define VC_APPLY_4(_list, macro, a, b, c, d) _list(_VC_APPLY_IMPL_5, macro, a, b, c, d) - -#define VC_ALL_COMPARES(macro) VC_APPLY_0(VC_LIST_COMPARES, macro) -#define VC_ALL_LOGICAL(macro) VC_APPLY_0(VC_LIST_LOGICAL, macro) -#define VC_ALL_BINARY(macro) VC_APPLY_0(VC_LIST_BINARY, macro) -#define VC_ALL_SHIFTS(macro) VC_APPLY_0(VC_LIST_SHIFTS, macro) -#define VC_ALL_ARITHMETICS(macro) VC_APPLY_0(VC_LIST_ARITHMETICS, macro) -#define VC_ALL_FLOAT_VECTOR_TYPES(macro) VC_APPLY_0(VC_LIST_FLOAT_VECTOR_TYPES, macro) -#define VC_ALL_VECTOR_TYPES(macro) VC_APPLY_0(VC_LIST_VECTOR_TYPES, macro) +#define Vc_APPLY_0(_list, macro) _list(Vc_APPLY_IMPL_1_, macro, 0, 0, 0, 0) Vc_NOTHING_EXPECTING_SEMICOLON +#define Vc_APPLY_1(_list, macro, a) _list(Vc_APPLY_IMPL_2_, macro, a, 0, 0, 0) Vc_NOTHING_EXPECTING_SEMICOLON +#define Vc_APPLY_2(_list, macro, a, b) _list(Vc_APPLY_IMPL_3_, macro, a, b, 0, 0) Vc_NOTHING_EXPECTING_SEMICOLON +#define Vc_APPLY_3(_list, macro, a, b, c) _list(Vc_APPLY_IMPL_4_, macro, a, b, c, 0) Vc_NOTHING_EXPECTING_SEMICOLON +#define Vc_APPLY_4(_list, macro, a, b, c, d) _list(Vc_APPLY_IMPL_5_, macro, a, b, c, d) Vc_NOTHING_EXPECTING_SEMICOLON -#define VC_EXACT_TYPE(_test, _reference, _type) \ - typename EnableIf::Value, _type>::Value +#define Vc_ALL_COMPARES(macro) Vc_APPLY_0(Vc_LIST_COMPARES, macro) +#define Vc_ALL_LOGICAL(macro) Vc_APPLY_0(Vc_LIST_LOGICAL, macro) +#define Vc_ALL_BINARY(macro) Vc_APPLY_0(Vc_LIST_BINARY, macro) +#define Vc_ALL_SHIFTS(macro) Vc_APPLY_0(Vc_LIST_SHIFTS, macro) +#define Vc_ALL_ARITHMETICS(macro) Vc_APPLY_0(Vc_LIST_ARITHMETICS, macro) +#define Vc_ALL_FLOAT_VECTOR_TYPES(macro) Vc_APPLY_0(Vc_LIST_FLOAT_VECTOR_TYPES, macro) +#define Vc_ALL_VECTOR_TYPES(macro) Vc_APPLY_0(Vc_LIST_VECTOR_TYPES, macro) -#ifdef VC_PASSING_VECTOR_BY_VALUE_IS_BROKEN -#define VC_ALIGNED_PARAMETER(_Type) const _Type & -#else -#define VC_ALIGNED_PARAMETER(_Type) const _Type -#endif +#define Vc_EXACT_TYPE(_test, _reference, _type) \ + typename std::enable_if::value, _type>::type + +#define Vc_make_unique(name) Vc_CAT(Vc_,name,_,__LINE__) -#ifndef Vc__make_unique -#define Vc__make_unique(name) _VC_CAT(Vc__,name,_,__LINE__) +#if defined(Vc_ICC) || defined(Vc_CLANG) || defined Vc_APPLECLANG +#define Vc_OFFSETOF(Type, member) (reinterpret_cast(&reinterpret_cast(0)->member) - reinterpret_cast(0)) +#else +#define Vc_OFFSETOF(Type, member) offsetof(Type, 
member) #endif -#if defined(VC_ICC) || defined(VC_CLANG) -#define VC_OFFSETOF(Type, member) (reinterpret_cast(&reinterpret_cast(0)->member) - reinterpret_cast(0)) -#elif defined(VC_GCC) && VC_GCC < 0x40500 -#define VC_OFFSETOF(Type, member) (reinterpret_cast(&reinterpret_cast(0x1000)->member) - reinterpret_cast(0x1000)) +#if defined(Vc_NO_NOEXCEPT) +#define Vc_NOEXCEPT throw() #else -#define VC_OFFSETOF(Type, member) offsetof(Type, member) +#define Vc_NOEXCEPT noexcept #endif +#ifdef Vc_NO_ALWAYS_INLINE +#undef Vc_ALWAYS_INLINE +#undef Vc_ALWAYS_INLINE_L +#undef Vc_ALWAYS_INLINE_R +#define Vc_ALWAYS_INLINE inline +#define Vc_ALWAYS_INLINE_L inline +#define Vc_ALWAYS_INLINE_R +#undef Vc_INTRINSIC +#undef Vc_INTRINSIC_L +#undef Vc_INTRINSIC_R +#define Vc_INTRINSIC inline +#define Vc_INTRINSIC_L inline +#define Vc_INTRINSIC_R +#endif -#endif // VC_COMMON_MACROS_H +#endif // VC_COMMON_MACROS_H_ diff -Nru vc-0.7.4/common/makeContainer.h vc-1.3.0/common/makeContainer.h --- vc-0.7.4/common/makeContainer.h 1969-12-31 18:00:00.000000000 -0600 +++ vc-1.3.0/common/makeContainer.h 2016-10-27 02:05:02.000000000 -0500 @@ -0,0 +1,151 @@ +/* This file is part of the Vc library. {{{ +Copyright © 2013-2015 Matthias Kretz + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the names of contributing organizations nor the + names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY +DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
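A sketch of how the list/apply macros from common/macros.h above are meant to be used; Vc_PRINT_SIZE is a hypothetical helper macro, not part of Vc, and the sketch assumes the macros are still defined at the point of use:

    #include <Vc/Vc>
    #include <iostream>

    // Vc_ALL_VECTOR_TYPES(M) expands to
    //   M(double_v) M(float_v) M(int_v) M(uint_v) M(short_v) M(ushort_v)
    #define Vc_PRINT_SIZE(V_) std::cout << #V_ << " holds " << Vc::V_::Size << " entries\n";
    void printVectorSizes() { Vc_ALL_VECTOR_TYPES(Vc_PRINT_SIZE); }
    #undef Vc_PRINT_SIZE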
+ +}}}*/ + +#ifndef VC_COMMON_MAKECONTAINER_H_ +#define VC_COMMON_MAKECONTAINER_H_ + +#include +#include +#include "macros.h" + +namespace Vc_VERSIONED_NAMESPACE +{ + + namespace + { + template struct make_container_helper + { + static constexpr Container help(std::initializer_list list) { return { list }; } + }; + + template class Container> + struct make_container_helper, Alloc>, + typename Vector::EntryType> { + typedef Vector V; + typedef typename V::EntryType T; + typedef Container C; + static inline C help(std::initializer_list list) { + const std::size_t size = (list.size() + (V::Size - 1)) / V::Size; + C v(size); + auto containerIt = v.begin(); + auto init = std::begin(list); + const auto initEnd = std::end(list); + for (std::size_t i = 0; i < size - 1; ++i) { + *containerIt++ = V(init, Vc::Unaligned); + init += V::Size; + } + Vc_ASSERT(all_of(*containerIt == V::Zero())); + int j = 0; + while (init != initEnd) { + (*containerIt)[j++] = *init++; + } + return v; + } + }; + + template class Container> + struct make_container_helper, N>, + typename Vector::EntryType> { + typedef Vector V; + typedef typename V::EntryType T; + static constexpr std::size_t size = (N + (V::Size - 1)) / V::Size; + typedef Container< + V, +#if defined Vc_CLANG && Vc_CLANG < 0x30700 // TODO: when did Vc_APPLECLANG fix it? + // clang before 3.7.0 has a bug when returning std::array<__m256x, 1>. So + // increase it to std::array<__m256x, 2> and fill it with zeros. Better + // than returning garbage. + (size == 1 && std::is_same::value) ? 2 : +#endif + size> C; + static inline C help(std::initializer_list list) { + Vc_ASSERT(N == list.size()) + Vc_ASSERT(size == (list.size() + (V::Size - 1)) / V::Size) + C v; + auto containerIt = v.begin(); + auto init = std::begin(list); + const auto initEnd = std::end(list); + for (std::size_t i = 0; i < size - 1; ++i) { + *containerIt++ = V(init, Vc::Unaligned); + init += V::Size; + } + Vc_ASSERT(all_of(*containerIt == V::Zero())); + int j = 0; + while (init != initEnd) { + (*containerIt)[j++] = *init++; + } + return v; + } + }; + } // anonymous namespace + + /** + * \ingroup Utilities + * \headerfile Utils + * + * Construct a container of Vc vectors from a std::initializer_list of scalar entries. + * + * \tparam Container The container type to construct. + * \tparam T The scalar type to use for the initializer_list. + * + * \param list An initializer list of arbitrary size. The type of the entries is important! + * If you pass a list of integers you will get a container filled with Vc::int_v objects. + * If, instead, you want to have a container of Vc::float_v objects, be sure the include a + * period (.) and the 'f' postfix in the literals. Alternatively, you can pass the + * type as second template argument to makeContainer. + * + * \return Returns a container of the requested class filled with the minimum number of SIMD + * vectors to hold the values in the initializer list. + * If the number of values in \p list does not match the number of values in the + * returned container object, the remaining values in the returned object will be + * zero-initialized. + * + * Example: + * \code + * auto data = Vc::makeContainer>({ 1.f, 2.f, 3.f, 4.f, 5.f }); + * // data.size() == 5 if float_v::Size == 1 (i.e. Vc_IMPL=Scalar) + * // data.size() == 2 if float_v::Size == 4 (i.e. Vc_IMPL=SSE) + * // data.size() == 1 if float_v::Size == 8 (i.e. 
Vc_IMPL=AVX) + * \endcode + */ + template + constexpr auto makeContainer(std::initializer_list list) -> decltype(make_container_helper::help(list)) + { + return make_container_helper::help(list); + } + + template + constexpr auto make_container(std::initializer_list list) -> decltype(makeContainer(list)) + { + return makeContainer(list); + } + +} // namespace Vc + +#endif // VC_COMMON_MAKECONTAINER_H_ diff -Nru vc-0.7.4/common/make_unique.h vc-1.3.0/common/make_unique.h --- vc-0.7.4/common/make_unique.h 1969-12-31 18:00:00.000000000 -0600 +++ vc-1.3.0/common/make_unique.h 2016-10-27 02:05:02.000000000 -0500 @@ -0,0 +1,57 @@ +/* This file is part of the Vc library. {{{ +Copyright © 2013-2015 Matthias Kretz + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the names of contributing organizations nor the + names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY +DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +}}}*/ + +#ifndef VC_COMMON_MAKE_UNIQUE_H_ +#define VC_COMMON_MAKE_UNIQUE_H_ + +#include + +#include "macros.h" + +namespace Vc_VERSIONED_NAMESPACE +{ +namespace Common +{ + +template struct Deleter +{ + Vc_ALWAYS_INLINE void operator()(T *ptr) { + ptr->~T(); + Vc::free(ptr); + } +}; + +template +inline std::unique_ptr> make_unique(Args&&... args) +{ + return std::unique_ptr>(new(Vc::malloc(1)) T(std::forward(args)...)); +} + +} // namespace Common +} // namespace Vc + +#endif // VC_COMMON_MAKE_UNIQUE_H_ diff -Nru vc-0.7.4/common/malloc.h vc-1.3.0/common/malloc.h --- vc-0.7.4/common/malloc.h 1969-12-31 18:00:00.000000000 -0600 +++ vc-1.3.0/common/malloc.h 2016-10-27 02:05:02.000000000 -0500 @@ -0,0 +1,106 @@ +/* This file is part of the Vc library. {{{ +Copyright © 2013-2015 Matthias Kretz + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. 
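A minimal sketch of Common::make_unique from common/make_unique.h above; the second template parameter is assumed to be the Vc::MallocAlignment value that is passed on to Vc::malloc:

    #include <Vc/Vc>

    struct Particle { Vc::float_v x, y, z; };  // a type that wants vector alignment

    void makeUniqueExample()
    {
        // constructed in place on memory from Vc::malloc; destroyed and freed by Common::Deleter
        auto p = Vc::Common::make_unique<Particle, Vc::AlignOnVector>();
        p->x = Vc::float_v::Zero();
    }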
+ * Neither the names of contributing organizations nor the + names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY +DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +}}}*/ + +#ifndef VC_COMMON_MALLOC_H_ +#define VC_COMMON_MALLOC_H_ + +#ifndef Vc_VECTOR_DECLARED_ +#error "Incorrect inclusion order. This header must be included from Vc/vector.h only." +#endif + +#if defined _WIN32 || defined _WIN64 +#include +#else +#include +#endif + +#include "macros.h" + +namespace Vc_VERSIONED_NAMESPACE +{ +namespace Common +{ + +template static constexpr size_t nextMultipleOf(size_t value) +{ + return (value % X) > 0 ? value + X - (value % X) : value; +} + +template Vc_INTRINSIC void *aligned_malloc(std::size_t n) +{ +#ifdef __MIC__ + return _mm_malloc(nextMultipleOf(n), alignment); +#elif defined(_WIN32) +# ifdef __GNUC__ + return __mingw_aligned_malloc(nextMultipleOf(n), alignment); +# else + return _aligned_malloc(nextMultipleOf(n), alignment); +# endif +#else + void *ptr = nullptr; + if (0 == posix_memalign(&ptr, alignment < sizeof(void *) ? sizeof(void *) : alignment, + nextMultipleOf(n))) { + return ptr; + } + return ptr; +#endif +} + +template Vc_ALWAYS_INLINE void *malloc(size_t n) +{ + switch (A) { + case Vc::AlignOnVector: + return aligned_malloc(n); + case Vc::AlignOnCacheline: + // TODO: hardcoding 64 is not such a great idea + return aligned_malloc<64>(n); + case Vc::AlignOnPage: + // TODO: hardcoding 4096 is not such a great idea + return aligned_malloc<4096>(n); + } + return nullptr; +} + +Vc_ALWAYS_INLINE void free(void *p) +{ +#ifdef __MIC__ + _mm_free(p); +#elif defined(_WIN32) +# ifdef __GNUC__ + return __mingw_aligned_free(p); +# else + return _aligned_free(p); +# endif +#else + std::free(p); +#endif +} + +} // namespace Common +} // namespace Vc + +#endif // VC_COMMON_MALLOC_H_ diff -Nru vc-0.7.4/common/maskbool.h vc-1.3.0/common/maskbool.h --- vc-0.7.4/common/maskbool.h 1969-12-31 18:00:00.000000000 -0600 +++ vc-1.3.0/common/maskbool.h 2016-10-27 02:05:02.000000000 -0500 @@ -0,0 +1,102 @@ +/* This file is part of the Vc library. {{{ +Copyright © 2013-2015 Matthias Kretz + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. 
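The allocation helpers above first round the requested byte count up to the alignment with nextMultipleOf and then dispatch to the platform allocator. A self-contained sketch of that rounding rule plus the POSIX branch follows; the numeric assertions are illustrative assumptions about the intended behaviour, not tests taken from this diff.

\code
#include <cassert>
#include <cstddef>
#include <cstdint>
#include <cstdlib>   // posix_memalign, std::free (POSIX branch only)

// same rounding rule as Common::nextMultipleOf<X> in the hunk above
template <std::size_t X> constexpr std::size_t nextMultipleOf(std::size_t value)
{
    return (value % X) > 0 ? value + X - (value % X) : value;
}

int main()
{
    static_assert(nextMultipleOf<16>(20) == 32, "20 is rounded up to the next multiple of 16");
    static_assert(nextMultipleOf<16>(32) == 32, "exact multiples stay unchanged");

    // POSIX branch of aligned_malloc: the alignment must be at least sizeof(void *)
    void *ptr = nullptr;
    if (posix_memalign(&ptr, 64, nextMultipleOf<64>(100)) != 0) {
        ptr = nullptr;   // allocation failed
    }
    assert(ptr == nullptr || reinterpret_cast<std::uintptr_t>(ptr) % 64 == 0);
    std::free(ptr);
    return 0;
}
\endcode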
+ * Neither the names of contributing organizations nor the + names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY +DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +}}}*/ + +#ifndef VC_COMMON_MASKENTRY_H_ +#define VC_COMMON_MASKENTRY_H_ + +#include "macros.h" + +namespace Vc_VERSIONED_NAMESPACE +{ +namespace Common +{ + +namespace +{ + template struct MaskBoolStorage; + // the following for typedefs must use std::intN_t and NOT! Vc::intN_t. The latter + // segfaults ICC 15.0.3. + template<> struct MaskBoolStorage<1> { typedef std::int8_t type; }; + template<> struct MaskBoolStorage<2> { typedef std::int16_t type; }; + template<> struct MaskBoolStorage<4> { typedef std::int32_t type; }; + template<> struct MaskBoolStorage<8> { typedef std::int64_t type; }; +} // anonymous namespace + +template class MaskBool +{ + typedef typename MaskBoolStorage::type storage_type Vc_MAY_ALIAS; + storage_type data; +public: + constexpr MaskBool(bool x) noexcept : data(x ? -1 : 0) {} + Vc_ALWAYS_INLINE MaskBool &operator=(bool x) noexcept { data = x ? -1 : 0; return *this; } + template ::value && + std::is_fundamental::value)>> + Vc_ALWAYS_INLINE MaskBool &operator=(T x) noexcept + { + data = reinterpret_cast(x); + return *this; + } + + Vc_ALWAYS_INLINE MaskBool(const MaskBool &) noexcept = default; + Vc_ALWAYS_INLINE MaskBool &operator=(const MaskBool &) noexcept = default; + + template ::value || + (std::is_fundamental::value && + sizeof(storage_type) == sizeof(T)))>> + constexpr operator T() const noexcept + { + return std::is_same::value ? T((data & 1) != 0) + : reinterpret_cast &>(data); + } +} Vc_MAY_ALIAS; + +template ::value &&std::is_convertible::value, + int>::type = 0> +constexpr bool operator==(A &&a, B &&b) +{ + return static_cast(a) == static_cast(b); +} +template ::value &&std::is_convertible::value, + int>::type = 0> +constexpr bool operator!=(A &&a, B &&b) +{ + return static_cast(a) != static_cast(b); +} + +static_assert(true == MaskBool<4>(true), "true == MaskBool<4>(true)"); +static_assert(true != MaskBool<4>(false), "true != MaskBool<4>(false)"); + +} // namespace Common +} // namespace Vc + +#endif // VC_COMMON_MASKENTRY_H_ diff -Nru vc-0.7.4/common/mask.h vc-1.3.0/common/mask.h --- vc-0.7.4/common/mask.h 1969-12-31 18:00:00.000000000 -0600 +++ vc-1.3.0/common/mask.h 2016-10-27 02:05:02.000000000 -0500 @@ -0,0 +1,392 @@ +/* This file is part of the Vc library. {{{ +Copyright © 2015 Matthias Kretz + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. 
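MaskBool exists so that a boolean can be stored in an integer exactly as wide as one SIMD lane, with all bits set for true and all bits cleared for false, which lets it alias the bit pattern a vector compare produces. A standalone sketch of that idea (a simplified stand-in, not the library class itself):

\code
#include <cassert>
#include <cstdint>

// simplified analogue of Common::MaskBool<4>: 32 bits per entry, -1 for true, 0 for false
struct MaskBool32
{
    std::int32_t data;
    constexpr MaskBool32(bool x) : data(x ? -1 : 0) {}
    constexpr operator bool() const { return (data & 1) != 0; }
};

int main()
{
    static_assert(sizeof(MaskBool32) == sizeof(float), "one mask entry per 32-bit lane");
    constexpr MaskBool32 t(true), f(false);
    static_assert(t == true && f != true, "behaves like bool in comparisons");
    assert(static_cast<std::uint32_t>(t.data) == 0xFFFFFFFFu);   // all bits set, like a SIMD compare result
    return 0;
}
\endcode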
+ * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the names of contributing organizations nor the + names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY +DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +}}}*/ + +#ifndef VC_COMMON_MASK_H_ +#define VC_COMMON_MASK_H_ + +#include "macros.h" + +namespace Vc_VERSIONED_NAMESPACE +{ +/** + * \class Mask mask.h + * \ingroup Masks + * + * The main SIMD mask class. + */ +template > class Mask +{ +public: + /** + * Returns the number of boolean components (\VSize{T}) in a mask of this type. + * + * The size of the mask. I.e. the number of boolean entries in the mask. Do not + * make any assumptions about the size of masks. + * + * In addition, you can easily use if clauses that compare sizes. The compiler can + * statically evaluate and fully optimize dead code away (very much like \#ifdef, but + * with syntax checking). + * + * \returns The number of components (i.e. \VSize{T}) objects of this mask type store + * and manipulate. + */ + static constexpr size_t size() { return VectorTraits::size(); } + ///\copydoc size + ///\deprecated Use Vc::Mask::size instead. + static constexpr size_t Size = VectorTraits::size(); + + /** + * Specifies the alignment requirement for aligned load and store calls for objects of + * this mask type. + */ + static constexpr size_t MemoryAlignment = VectorTraits::maskMemoryAlignment(); + + /// The ABI tag type of the current template instantiation. + using abi = Abi; + + /** + * The \c EntryType of masks is always \c bool, independent of \c T. + */ + using EntryType = bool; + /// \copydoc EntryType + using value_type = EntryType; + + /// The reference wrapper type used for accessing individual mask components. + using EntryReference = typename VectorTraits::EntryReference; + /// \copydoc EntryReference + using value_reference = EntryReference; + + /** + * The \c VectorEntryType, in contrast to \c EntryType, reveals information about the SIMD + * implementation. + * This type is useful for the \c sizeof operator in generic functions. + */ + using VectorEntryType = typename VectorTraits::VectorEntryType; + + /**\internal + * The \c VectorType reveals the implementation-specific internal type used for the SIMD type. + */ + using VectorType = typename VectorTraits::VectorType; + /**\internal + * \copydoc VectorType + */ + using vector_type = VectorType; + + /* + * The associated Vector type. + */ + //using Vector = Vector; + + /// \name Generators + ///@{ + /** + * Creates a new mask object initialized to zero/\c false. 
+ * + * \returns A mask object with zero-initialized components. + */ + Vc_INTRINSIC static Mask Zero(); + + /** + * Creates a mask object initialized to one/\c true. + * + * \returns A mask object with components initialized to \c true. + */ + Vc_INTRINSIC static Mask One(); + + /// Generate a mask object from booleans returned from the function \p gen. + template static Vc_INTRINSIC Mask generate(G &&gen); + ///@} + + /// \name Compile-Time Constant Initialization + ///@{ + /** + * Construct a zero-initialized vector object. + * + * This constructor follows the behavior of the underlying \c bool type in that the + * expression `bool()` zero-initializes the object (to \c false). On the other hand + * the variable \c x in `bool x;` is uninitialized. + * Since, for class types, both expressions call the default constructor `Mask x` + * must zero-initialize \c x as well. + */ + Vc_INTRINSIC Mask() = default; + + /// Zero-initialize the new mask object (\c false). + /// \see Vc::Zero, Zero() + Vc_INTRINSIC explicit Mask(VectorSpecialInitializerZero); + + /// Initialize the new mask object to one (\c true). + /// \see Vc::One, One() + Vc_INTRINSIC explicit Mask(VectorSpecialInitializerOne); + ///@} + + /// \name Conversion/Broadcast Constructors + ///@{ + /** + * Broadcast constructor. + * + * Set all components of the new mask object to \p b. + * + * \param b Determines the initial state of the mask. + */ + Vc_INTRINSIC explicit Mask(bool b); + + /** + * Implicit conversion from a compatible (equal \VSize{T} on every platform) mask + * object. + * + * \param otherMask The mask to be converted. + */ + template + Vc_INTRINSIC Mask(U &&otherMask, + Common::enable_if_mask_converts_implicitly = nullarg); + +#if Vc_IS_VERSION_1 + /** + * Explicit conversion (static_cast) from a mask object that potentially has a + * different \VSize{T}. + * + * \param otherMask The mask to be converted. + * + * \internal This is implemented via simd_cast in scalar/simd_cast_caller.h + */ + template + Vc_DEPRECATED( + "use simd_cast instead of explicit type casting to convert between mask types") + Vc_INTRINSIC_L + explicit Mask(U &&otherMask, Common::enable_if_mask_converts_explicitly = + nullarg) Vc_INTRINSIC_R; + ///@} +#endif + + /** + * \name Loads & Stores + */ + ///@{ + /** + * Load constructor from an array of \c bool. + * + * This constructor implements an explicit conversion from an array of booleans to a + * mask object. It corresponds to a Vector load constructor. + * + * \param mem A pointer to the start of the array of booleans. + * \see Mask(const bool *, Flags), load(const bool *) + */ + Vc_ALWAYS_INLINE explicit Mask(const bool *mem); + /** + * Overload of the above with a load/store flag argument. + * + * \param mem A pointer to the start of the array of booleans. + * \param flags Choose a combination of flags such as Vc::Aligned, Vc::Streaming, + * Vc::Unaligned, Vc::PrefetchDefault, ... + * \see load(const bool *, Flags) + */ + template Vc_ALWAYS_INLINE explicit Mask(const bool *mem, Flags flags); + + /** + * Load the components of the mask from an array of \c bool. + * + * \param mem A pointer to the start of the array of booleans. + * \see load(const bool *, Flags), Mask(const bool *) + */ + Vc_ALWAYS_INLINE void load(const bool *mem); + /** + * Overload of the above with a load/store flag argument. + * + * \param mem A pointer to the start of the array of booleans. + * \param flags Choose a combination of flags such as Vc::Aligned, Vc::Streaming, + * Vc::Unaligned, Vc::PrefetchDefault, ... 
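The generators and constructors above make it possible to build masks without comparing vectors first. A brief usage sketch, using the V::Mask spelling that also appears elsewhere in this diff; that the generator lambda receives the component index as an integer is an assumption consistent with the declaration above.

\code
#include <Vc/Vc>

void example()
{
    using M = Vc::float_v::Mask;

    M none = M::Zero();    // all components false
    M all  = M::One();     // all components true
    M bcast(true);         // broadcast constructor: every component true
    M even = M::generate([](int i) { return i % 2 == 0; });   // per-component generator

    bool mem[Vc::float_v::Size];
    even.store(mem);       // store to an array of bool
    M reloaded(mem);       // load constructor from const bool *
    (void)none; (void)all; (void)bcast; (void)reloaded;
}
\endcode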
+ * \see Mask(const bool *, Flags) + */ + template Vc_ALWAYS_INLINE void load(const bool *mem, Flags flags); + + /** + * Store the values of the mask to an array of \c bool. + * + * \param mem A pointer to the start of the array of booleans. + * \see store(bool *, Flags) + */ + Vc_ALWAYS_INLINE void store(bool *mem) const; + /** + * Overload of the above with a load/store flag argument. + * + * \param mem A pointer to the start of the array of booleans. + * \param flags Choose a combination of flags such as Vc::Aligned, Vc::Streaming, + * Vc::Unaligned, Vc::PrefetchDefault, ... + */ + template Vc_ALWAYS_INLINE void store(bool *mem, Flags flags) const; + ///@} + + /// \name Comparison Operators + ///@{ + /** + * Returns whether the two masks are equal in all components. + * + * \param mask The other mask to compare against. + * \returns A scalar boolean value that says whether all components of the two masks + * are equal. + * + * \note If you expected a behavior similar to the compare operator of Vc::Vector, + * consider that the bitwise operators already implement such functionality. There is + * little use, typically, in having `a == b` return the same as `a ^ b`. In general, + * it is more useful to query `all_of(a ^ b)` which is the same as this equality + * operator. + */ + Vc_ALWAYS_INLINE bool operator==(const Mask &mask) const; + + /** + * Returns whether the two masks are different in at least one component. + * + * \param mask The other mask to compare against. + * \returns A scalar boolean value that says whether at least one component of the two masks is different. + * + * \note `(a == b) == !(a != b)` holds + * \see Mask::operator==(const Mask &) + */ + Vc_ALWAYS_INLINE bool operator!=(const Mask &mask) const; + ///@} + + /** + * \name Logical and Binary Operators + * + * \brief Component-wise logical/binary operations on mask objects. + * + * The effect of logical and binary \c AND and \c OR is equivalent for mask types (as + * it is for \c bool). + */ + ///@{ + + /// Returns the component-wise application of a logical \c AND to \p mask. + Vc_ALWAYS_INLINE Mask operator&&(const Mask &mask) const; + /// Returns the component-wise application of a binary \c AND to \p mask. + Vc_ALWAYS_INLINE Mask operator&(const Mask &mask) const; + /// Returns the component-wise application of a logical \c OR to \p mask. + Vc_ALWAYS_INLINE Mask operator||(const Mask &mask) const; + /// Returns the component-wise application of a binary \c OR to \p mask. + Vc_ALWAYS_INLINE Mask operator|(const Mask &mask) const; + /// Returns the component-wise application of a binary \c XOR to \p mask. + Vc_ALWAYS_INLINE Mask operator^(const Mask &mask) const; + /// Returns a mask with inverted components. + Vc_ALWAYS_INLINE Mask operator!() const; + + /// Modifies the mask using an \c AND operation with \p mask. + Vc_ALWAYS_INLINE Mask &operator&=(const Mask &mask); + /// Modifies the mask using an \c OR operation with \p mask. + Vc_ALWAYS_INLINE Mask &operator|=(const Mask &mask); + /// Modifies the mask using an \c XOR operation with \p mask. + Vc_ALWAYS_INLINE Mask &operator^=(const Mask &mask); + ///@} + + /** + * \name Reductions + * + * \see any_of, all_of, none_of, some_of + */ + ///@{ + + /// Returns a logical \c AND of all components. + Vc_ALWAYS_INLINE bool isFull() const; + /// Returns a logical \c OR of all components. + Vc_ALWAYS_INLINE bool isNotEmpty() const; + /// Returns \c true if components are \c false, \c false otherwise. 
+ Vc_ALWAYS_INLINE bool isEmpty() const; + /// Returns `!isFull() && !isEmpty()`. + Vc_ALWAYS_INLINE bool isMix() const; + ///@} + + /**\internal + * \name Internal Data Access + */ + ///@{ + Vc_ALWAYS_INLINE bool data() const; + Vc_ALWAYS_INLINE bool dataI() const; + Vc_ALWAYS_INLINE bool dataD() const; + ///@} + + /// \name Scalar Subscript Operators + ///@{ + /** + * Lvalue-reference-like access to mask entries. + * + * \param index Determines the boolean to be accessed. + * \return a temporary proxy object referencing the \p index th entry of the mask. + * + * \warning This operator does not return an lvalue reference (to \c bool), but rather + * a temporary (rvalue) object that mimics an lvalue reference (as much as is possible + * with C++11/14). + */ + Vc_ALWAYS_INLINE EntryReference operator[](size_t index); + + /** + * Read-only access to mask entries. + * + * \param index Determines the boolean to be accessed. + * \return The \p index th entry of the mask as a \c bool (rvalue). + * + * \warning This operator does not return an lvalue reference (to `const bool`), but + * rather a temporary (rvalue) \c bool. + */ + Vc_ALWAYS_INLINE EntryType operator[](size_t index) const; + ///@} + + /// Returns how many components of the mask are \c true. + Vc_ALWAYS_INLINE int count() const; + + /** + * Returns the index of the first one in the mask. + * + * \returns the index of the first component that is \c true. + * + * \warning The return value is undefined if the mask is empty. + * + * Thus, unless `none_of(mask)`, `mask[mask.firstOne()] == true` holds and `mask[i] == + * false` for all `i < mask.firstOne()`. + */ + Vc_ALWAYS_INLINE int firstOne() const; + + /** + * Convert the boolean components of the mask into bits of an integer. + * + * \return An \c int where each bit corresponds to the boolean value in the mask. + * + * For example, the mask `[true, false, false, true]` results in a `9` (in binary: `1001`). + */ + Vc_ALWAYS_INLINE int toInt() const; + + /// Returns a mask with components shifted by \p amount places. + Vc_INTRINSIC Vc_PURE Mask shifted(int amount) const; + + Vc_FREE_STORE_OPERATORS_ALIGNED(alignof(Mask)); + +private: + VectorType d; +}; + +} // namespace Vc + +#endif // VC_COMMON_MASK_H_ + +// vim: foldmethod=marker diff -Nru vc-0.7.4/common/math.h vc-1.3.0/common/math.h --- vc-0.7.4/common/math.h 1969-12-31 18:00:00.000000000 -0600 +++ vc-1.3.0/common/math.h 2016-10-27 02:05:02.000000000 -0500 @@ -0,0 +1,130 @@ +/* This file is part of the Vc library. {{{ +Copyright © 2013-2015 Matthias Kretz + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the names of contributing organizations nor the + names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. 
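To make the reduction and query functions above concrete, here is a short sketch using the same [true, false, false, true] pattern as the toInt() example. It assumes float_v::Size is at least 4 (i.e. not the scalar implementation); otherwise the counts differ.

\code
#include <Vc/Vc>
#include <cassert>

void example()
{
    using M = Vc::float_v::Mask;
    const M m = M::generate([](int i) { return i == 0 || i == 3; });   // 1001 in the low bits

    assert(m.count() == 2);            // two components are true
    assert(m.firstOne() == 0);         // index of the first true component
    assert((m.toInt() & 0xF) == 9);    // binary 1001 in the low four bits
    assert(!m.isFull() && !m.isEmpty() && m.isMix());
    assert(any_of(m) && some_of(m) && !all_of(m) && !none_of(m));
}
\endcode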
IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY +DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +}}}*/ + +#ifndef VC_COMMON_MATH_H_ +#define VC_COMMON_MATH_H_ + +#define Vc_COMMON_MATH_H_INTERNAL 1 + +#include "trigonometric.h" + +#include "const.h" +#include "macros.h" + +namespace Vc_VERSIONED_NAMESPACE +{ +#ifdef Vc_IMPL_SSE +// for SSE, AVX, and AVX2 +#include "logarithm.h" +#include "exponential.h" +#ifdef Vc_IMPL_AVX +inline AVX::double_v exp(AVX::double_v _x) +{ + AVX::Vector x = _x; + typedef AVX::Vector V; + typedef V::Mask M; + typedef AVX::Const C; + + const M overflow = x > Vc::Detail::doubleConstant< 1, 0x0006232bdd7abcd2ull, 9>(); // max log + const M underflow = x < Vc::Detail::doubleConstant<-1, 0x0006232bdd7abcd2ull, 9>(); // min log + + V px = floor(C::log2_e() * x + 0.5); + __m128i tmp = _mm256_cvttpd_epi32(px.data()); + const SimdArray n = SSE::int_v{tmp}; + x -= px * C::ln2_large(); //Vc::Detail::doubleConstant<1, 0x00062e4000000000ull, -1>(); // ln2 + x -= px * C::ln2_small(); //Vc::Detail::doubleConstant<1, 0x0007f7d1cf79abcaull, -20>(); // ln2 + + const double P[] = { + Vc::Detail::doubleConstant<1, 0x000089cdd5e44be8ull, -13>(), + Vc::Detail::doubleConstant<1, 0x000f06d10cca2c7eull, -6>(), + Vc::Detail::doubleConstant<1, 0x0000000000000000ull, 0>() + }; + const double Q[] = { + Vc::Detail::doubleConstant<1, 0x00092eb6bc365fa0ull, -19>(), + Vc::Detail::doubleConstant<1, 0x0004ae39b508b6c0ull, -9>(), + Vc::Detail::doubleConstant<1, 0x000d17099887e074ull, -3>(), + Vc::Detail::doubleConstant<1, 0x0000000000000000ull, 1>() + }; + const V x2 = x * x; + px = x * ((P[0] * x2 + P[1]) * x2 + P[2]); + x = px / ((((Q[0] * x2 + Q[1]) * x2 + Q[2]) * x2 + Q[3]) - px); + x = V::One() + 2.0 * x; + + x = ldexp(x, n); // == x * 2ⁿ + + x(overflow) = std::numeric_limits::infinity(); + x.setZero(underflow); + + return x; + } +#endif // Vc_IMPL_AVX + +inline SSE::double_v exp(SSE::double_v::AsArg _x) { + SSE::Vector x = _x; + typedef SSE::Vector V; + typedef V::Mask M; + typedef SSE::Const C; + + const M overflow = x > Vc::Detail::doubleConstant< 1, 0x0006232bdd7abcd2ull, 9>(); // max log + const M underflow = x < Vc::Detail::doubleConstant<-1, 0x0006232bdd7abcd2ull, 9>(); // min log + + V px = floor(C::log2_e() * x + 0.5); + SimdArray n; + _mm_storel_epi64(reinterpret_cast<__m128i *>(&n), _mm_cvttpd_epi32(px.data())); + x -= px * C::ln2_large(); //Vc::Detail::doubleConstant<1, 0x00062e4000000000ull, -1>(); // ln2 + x -= px * C::ln2_small(); //Vc::Detail::doubleConstant<1, 0x0007f7d1cf79abcaull, -20>(); // ln2 + + const double P[] = { + Vc::Detail::doubleConstant<1, 0x000089cdd5e44be8ull, -13>(), + Vc::Detail::doubleConstant<1, 0x000f06d10cca2c7eull, -6>(), + Vc::Detail::doubleConstant<1, 0x0000000000000000ull, 0>() + }; + const double Q[] = { + Vc::Detail::doubleConstant<1, 0x00092eb6bc365fa0ull, -19>(), + Vc::Detail::doubleConstant<1, 0x0004ae39b508b6c0ull, -9>(), + Vc::Detail::doubleConstant<1, 0x000d17099887e074ull, -3>(), + Vc::Detail::doubleConstant<1, 0x0000000000000000ull, 1>() + }; + const V x2 = x * x; + px = x * ((P[0] * x2 + P[1]) * x2 + P[2]); + x = px / ((((Q[0] * x2 + 
Q[1]) * x2 + Q[2]) * x2 + Q[3]) - px); + x = V::One() + 2.0 * x; + + x = ldexp(x, n); // == x * 2ⁿ + + x(overflow) = std::numeric_limits::infinity(); + x.setZero(underflow); + + return x; + } + +#endif +} // namespace Vc + +#undef Vc_COMMON_MATH_H_INTERNAL + +#endif // VC_COMMON_MATH_H_ diff -Nru vc-0.7.4/common/memorybase.h vc-1.3.0/common/memorybase.h --- vc-0.7.4/common/memorybase.h 2014-05-15 06:38:24.000000000 -0500 +++ vc-1.3.0/common/memorybase.h 2016-10-27 02:05:02.000000000 -0500 @@ -1,77 +1,51 @@ -/* This file is part of the Vc library. +/* This file is part of the Vc library. {{{ +Copyright © 2009-2015 Matthias Kretz - Copyright (C) 2009-2012 Matthias Kretz +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the names of contributing organizations nor the + names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY +DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - Vc is free software: you can redistribute it and/or modify - it under the terms of the GNU Lesser General Public License as - published by the Free Software Foundation, either version 3 of - the License, or (at your option) any later version. +}}}*/ - Vc is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with Vc. If not, see . - -*/ - -#ifndef VC_COMMON_MEMORYBASE_H -#define VC_COMMON_MEMORYBASE_H +#ifndef VC_COMMON_MEMORYBASE_H_ +#define VC_COMMON_MEMORYBASE_H_ #include +#include +#include #include "macros.h" -/*OUTER_NAMESPACE_BEGIN*/ -namespace Vc +namespace Vc_VERSIONED_NAMESPACE { - -#if __cplusplus >= 201103 || defined(VC_MSVC) -#define VC_DECLTYPE(T1, op, T2) decltype(T1() op T2()) -#elif defined(VC_OPEN64) || (defined(VC_GCC) && VC_GCC < 0x40300) -#define VC_DECLTYPE(T1, op, T2) T1 -#else -namespace +namespace Common { - struct one { char x; }; - struct two { one x, y; }; - template struct DecltypeHelper - { - static one test(const T1 &) { return one(); } - static two test(const T2 &) { return two(); } - //static void test(...) 
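Both exp() overloads above use the same scheme: write x as n·ln2 plus a small remainder r, approximate exp(r) with a rational polynomial, and scale the result by 2^n through ldexp. The following scalar paraphrase shows just that skeleton, with std::exp standing in for the polynomial, so it illustrates the structure rather than the accuracy or the exact constants used by the library.

\code
#include <cassert>
#include <cmath>

double exp_by_range_reduction(double x)
{
    const double log2e = 1.4426950408889634074;   // 1/ln(2), cf. C::log2_e()
    const double ln2   = 0.6931471805599453094;   // cf. C::ln2_large() + C::ln2_small()
    const double n = std::floor(log2e * x + 0.5); // nearest integer, as in the code above
    const double r = x - n * ln2;                 // remainder with |r| <= ln2/2
    // the library evaluates exp(r) as 1 + 2*r*P(r*r) / (Q(r*r) - r*P(r*r));
    // std::exp(r) stands in for that rational approximation in this sketch
    return std::ldexp(std::exp(r), static_cast<int>(n));
}

int main()
{
    assert(std::abs(exp_by_range_reduction(3.5) - std::exp(3.5)) < 1e-12);
    return 0;
}
\endcode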
{} - }; - template struct DecltypeHelper - { - static one test(const T1 &) { return one(); } - //static void test(...) {} - }; - template struct Decltype { typedef T1 Value; }; - template struct Decltype { typedef T1 Value; }; - template struct Decltype { typedef T2 Value; }; -#ifdef VC_CLANG - // this special case is only necessary to silence a warning (which is rather a note that clang - // did the expected optimization): - // warning: variable 'SOME_PTR' is not needed and will not be emitted [-Wunneeded-internal-declaration] - // Then again, I don't remember why the SOME_PTR hack was necessary in the first place - some - // strange compiler quirk... -#define VC_DECLTYPE(T1, op, T2) typename Decltype::test(T1() op T2()))>::Value -#else - static const void *SOME_PTR; -#define VC_DECLTYPE(T1, op, T2) typename Decltype::test(*static_cast(SOME_PTR) op *static_cast(SOME_PTR)))>::Value -#endif -} // anonymous namespace -#endif -#define VC_MEM_OPERATOR_EQ(op) \ +#define Vc_MEM_OPERATOR_EQ(op) \ template \ - Vc_ALWAYS_INLINE VectorPointerHelper &operator op##=(const T &x) { \ - const V result = V(m_ptr, Internal::FlagObject::the()) op x; \ - result.store(m_ptr, Internal::FlagObject::the()); \ + Vc_ALWAYS_INLINE enable_if_mutable operator op##=(const T &x) { \ + const V v = value() op x; \ + v.store(&m_data[0], Flags()); \ return *this; \ } - +/*dox{{{*/ /** * Helper class for the Memory::vector(size_t) class of functions. * @@ -79,71 +53,149 @@ * Memory API. * * \headerfile memorybase.h - */ -template class VectorPointerHelperConst + *//*}}}*/ +template class MemoryVector/*{{{*/ { - typedef typename V::EntryType EntryType; - typedef typename V::Mask Mask; - public: - const EntryType *const m_ptr; + typedef typename std::remove_cv<_V>::type V; - explicit VectorPointerHelperConst(const EntryType *ptr) : m_ptr(ptr) {} - - /** - * Cast to \p V operator. - * - * This function allows to assign this object to any object of type \p V. - */ - Vc_ALWAYS_INLINE Vc_PURE operator const V() const { return V(m_ptr, Internal::FlagObject::the()); } -}; + template using enable_if_mutable = + typename std::enable_if::value && !std::is_const<_V>::value, R>::type; -/** - * Helper class for the Memory::vector(size_t) class of functions. - * - * You will never need to directly make use of this class. It is an implementation detail of the - * Memory API. - * - * \headerfile memorybase.h - */ -template class VectorPointerHelper -{ typedef typename V::EntryType EntryType; typedef typename V::Mask Mask; - public: - EntryType *const m_ptr; - explicit VectorPointerHelper(EntryType *ptr) : m_ptr(ptr) {} + EntryType m_data[V::Size]; +public: + // It is important that neither initialization nor cleanup is done as MemoryVector aliases + // other memory + Vc_ALWAYS_INLINE MemoryVector() {} + + // disable copies because this type is supposed to alias the data in a Memory object, + // nothing else + MemoryVector(const MemoryVector &) = delete; + MemoryVector(MemoryVector &&) = delete; + // Do not disable MemoryVector &operator=(const MemoryVector &) = delete; because it is + // covered nicely by the operator= below. + + //! \internal + Vc_ALWAYS_INLINE Vc_PURE V value() const { return V(&m_data[0], Flags()); } /** * Cast to \p V operator. * * This function allows to assign this object to any object of type \p V. 
*/ - Vc_ALWAYS_INLINE Vc_PURE operator const V() const { return V(m_ptr, Internal::FlagObject::the()); } + Vc_ALWAYS_INLINE Vc_PURE operator V() const { return value(); } template - Vc_ALWAYS_INLINE VectorPointerHelper &operator=(const T &x) { + Vc_ALWAYS_INLINE enable_if_mutable operator=(const T &x) { V v; v = x; - v.store(m_ptr, Internal::FlagObject::the()); + v.store(&m_data[0], Flags()); return *this; } - VC_ALL_BINARY(VC_MEM_OPERATOR_EQ) - VC_ALL_ARITHMETICS(VC_MEM_OPERATOR_EQ) + Vc_ALL_BINARY(Vc_MEM_OPERATOR_EQ); + Vc_ALL_ARITHMETICS(Vc_MEM_OPERATOR_EQ); +}; + +template class MemoryVectorIterator +{ + typedef typename std::remove_cv<_V>::type V; + + template using enable_if_mutable = + typename std::enable_if::value && !std::is_const<_V>::value, R>::type; + + using iterator_traits = std::iterator_traits *>; + + MemoryVector<_V, Flags> *d; +public: + typedef typename iterator_traits::difference_type difference_type; + typedef typename iterator_traits::value_type value_type; + typedef typename iterator_traits::pointer pointer; + typedef typename iterator_traits::reference reference; + typedef typename iterator_traits::iterator_category iterator_category; + + constexpr MemoryVectorIterator(MemoryVector<_V, Flags> *dd) : d(dd) {} + constexpr MemoryVectorIterator(const MemoryVectorIterator &) = default; + constexpr MemoryVectorIterator(MemoryVectorIterator &&) = default; + Vc_ALWAYS_INLINE MemoryVectorIterator &operator=(const MemoryVectorIterator &) = default; + + Vc_ALWAYS_INLINE void *orderBy() const { return d; } + + Vc_ALWAYS_INLINE difference_type operator-(const MemoryVectorIterator &rhs) const { return d - rhs.d; } + Vc_ALWAYS_INLINE reference operator[](size_t i) const { return d[i]; } + Vc_ALWAYS_INLINE reference operator*() const { return *d; } + Vc_ALWAYS_INLINE pointer operator->() const { return d; } + Vc_ALWAYS_INLINE MemoryVectorIterator &operator++() { ++d; return *this; } + Vc_ALWAYS_INLINE MemoryVectorIterator operator++(int) { MemoryVectorIterator r(*this); ++d; return r; } + Vc_ALWAYS_INLINE MemoryVectorIterator &operator--() { --d; return *this; } + Vc_ALWAYS_INLINE MemoryVectorIterator operator--(int) { MemoryVectorIterator r(*this); --d; return r; } + Vc_ALWAYS_INLINE MemoryVectorIterator &operator+=(size_t n) { d += n; return *this; } + Vc_ALWAYS_INLINE MemoryVectorIterator &operator-=(size_t n) { d -= n; return *this; } + Vc_ALWAYS_INLINE MemoryVectorIterator operator+(size_t n) const { return MemoryVectorIterator(d + n); } + Vc_ALWAYS_INLINE MemoryVectorIterator operator-(size_t n) const { return MemoryVectorIterator(d - n); } }; -#undef VC_MEM_OPERATOR_EQ -#define VC_VPH_OPERATOR(op) \ -template \ -VC_DECLTYPE(V1, op, V2) operator op(const VectorPointerHelper &x, const VectorPointerHelper &y) { \ - return V1(x.m_ptr, Internal::FlagObject::the()) op V2(y.m_ptr, Internal::FlagObject::the()); \ +template +Vc_ALWAYS_INLINE bool operator==(const MemoryVectorIterator &l, const MemoryVectorIterator &r) +{ + return l.orderBy() == r.orderBy(); } -VC_ALL_ARITHMETICS(VC_VPH_OPERATOR) -VC_ALL_BINARY (VC_VPH_OPERATOR) -VC_ALL_COMPARES (VC_VPH_OPERATOR) -#undef VC_VPH_OPERATOR +template +Vc_ALWAYS_INLINE bool operator!=(const MemoryVectorIterator &l, const MemoryVectorIterator &r) +{ + return l.orderBy() != r.orderBy(); +} +template +Vc_ALWAYS_INLINE bool operator>=(const MemoryVectorIterator &l, const MemoryVectorIterator &r) +{ + return l.orderBy() >= r.orderBy(); +} +template +Vc_ALWAYS_INLINE bool operator<=(const MemoryVectorIterator &l, const MemoryVectorIterator &r) 
+{ + return l.orderBy() <= r.orderBy(); +} +template +Vc_ALWAYS_INLINE bool operator> (const MemoryVectorIterator &l, const MemoryVectorIterator &r) +{ + return l.orderBy() > r.orderBy(); +} +template +Vc_ALWAYS_INLINE bool operator< (const MemoryVectorIterator &l, const MemoryVectorIterator &r) +{ + return l.orderBy() < r.orderBy(); +} +/*}}}*/ +#undef Vc_MEM_OPERATOR_EQ + +#define Vc_VPH_OPERATOR(op) \ + template \ + decltype(std::declval() op std::declval()) operator op( \ + const MemoryVector &x, const MemoryVector &y) \ + { \ + return x.value() op y.value(); \ + } +Vc_ALL_ARITHMETICS(Vc_VPH_OPERATOR); +Vc_ALL_BINARY (Vc_VPH_OPERATOR); +Vc_ALL_COMPARES (Vc_VPH_OPERATOR); +#undef Vc_VPH_OPERATOR +template> class MemoryRange/*{{{*/ +{ + Parent *m_parent; + size_t m_first; + size_t m_last; + +public: + MemoryRange(Parent *p, size_t firstIndex, size_t lastIndex) + : m_parent(p), m_first(firstIndex), m_last(lastIndex) + {} + + MemoryVectorIterator begin() const { return &m_parent->vector(m_first , Flags()); } + MemoryVectorIterator end() const { return &m_parent->vector(m_last + 1, Flags()); } +};/*}}}*/ template class MemoryDimensionBase; template class MemoryDimensionBase // {{{1 { @@ -170,6 +222,7 @@ /// Const overload of the above function. Vc_ALWAYS_INLINE Vc_PURE const EntryType scalar(size_t i) const { return entries()[i]; } +#ifdef DOXYGEN /** * Cast operator to the scalar type. This allows to use the object very much like a standard * C array. @@ -177,17 +230,52 @@ Vc_ALWAYS_INLINE Vc_PURE operator EntryType*() { return entries(); } /// Const overload of the above function. Vc_ALWAYS_INLINE Vc_PURE operator const EntryType*() const { return entries(); } +#else + // The above conversion operator allows implicit conversion to bool. To prohibit this + // conversion we use SFINAE to allow only conversion to EntryType* and void*. + template ::type, EntryType *>::value || + std::is_same::type, void *>::value, + int>::type = 0> + Vc_ALWAYS_INLINE Vc_PURE operator T() + { + return entries(); + } + template ::value || + std::is_same::value, + int>::type = 0> + Vc_ALWAYS_INLINE Vc_PURE operator T() const + { + return entries(); + } +#endif + + /** + * + */ + template + Vc_ALWAYS_INLINE MemoryRange range(size_t firstIndex, size_t lastIndex, Flags) { + return MemoryRange(p(), firstIndex, lastIndex); + } + Vc_ALWAYS_INLINE MemoryRange range(size_t firstIndex, size_t lastIndex) { + return MemoryRange(p(), firstIndex, lastIndex); + } + template + Vc_ALWAYS_INLINE MemoryRange range(size_t firstIndex, size_t lastIndex, Flags) const { + return MemoryRange(p(), firstIndex, lastIndex); + } + Vc_ALWAYS_INLINE MemoryRange range(size_t firstIndex, size_t lastIndex) const { + return MemoryRange(p(), firstIndex, lastIndex); + } - // omit operator[] because the EntryType* cast operator suffices, for dox it makes sense to - // show it, though because it helps API discoverability. -#ifdef DOXYGEN /** * Returns the \p i-th scalar value in the memory. */ - inline EntryType &operator[](size_t i); + Vc_ALWAYS_INLINE EntryType &operator[](size_t i) { return entries()[i]; } /// Const overload of the above function. 
- inline const EntryType &operator[](size_t i) const; -#endif + Vc_ALWAYS_INLINE const EntryType &operator[](size_t i) const { return entries()[i]; } /** * Uses a vector gather to combine the entries at the indexes in \p i into the returned @@ -216,7 +304,7 @@ */ typedef typename V::EntryType EntryType; - static _VC_CONSTEXPR size_t rowCount() { return Parent::RowCount; } + static constexpr size_t rowCount() { return Parent::RowCount; } /** * Returns a pointer to the start of the allocated memory. @@ -236,11 +324,19 @@ * Returns the \p i-th row in the memory. */ Vc_ALWAYS_INLINE Vc_PURE RowMemory &operator[](size_t i) { +#ifdef Vc_RECURSIVE_MEMORY + return p()->m_mem[i]; +#else return RowMemory::fromRawData(entries(i)); +#endif } /// Const overload of the above function. Vc_ALWAYS_INLINE Vc_PURE const RowMemory &operator[](size_t i) const { +#ifdef Vc_RECURSIVE_MEMORY + return p()->m_mem[i]; +#else return RowMemory::fromRawData(const_cast(entries(i))); +#endif } /** @@ -251,7 +347,7 @@ Vc_ALWAYS_INLINE Vc_PURE size_t rowsCount() const { return p()->rowsCount(); } }; -//{{{1 +//dox{{{1 /** * \headerfile memorybase.h * @@ -264,6 +360,10 @@ */ template class MemoryBase : public MemoryDimensionBase //{{{1 { + static_assert((V::size() * sizeof(typename V::EntryType)) % V::MemoryAlignment == 0, + "Vc::Memory can only be used for data-parallel types storing a number " + "of values that's a multiple of the memory alignment."); + private: Parent *p() { return static_cast(this); } const Parent *p() const { return static_cast(this); } @@ -288,6 +388,24 @@ using MemoryDimensionBase::scalar; /** + * Return a (vectorized) iterator to the start of this memory object. + */ + template + Vc_ALWAYS_INLINE MemoryVectorIterator< V, Flags> begin(Flags flags = Flags()) { return &firstVector(flags); } + //! const overload of the above + template + Vc_ALWAYS_INLINE MemoryVectorIterator begin(Flags flags = Flags()) const { return &firstVector(flags); } + + /** + * Return a (vectorized) iterator to the end of this memory object. + */ + template + Vc_ALWAYS_INLINE MemoryVectorIterator< V, Flags> end(Flags flags = Flags()) { return &lastVector(flags) + 1; } + //! const overload of the above + template + Vc_ALWAYS_INLINE MemoryVectorIterator end(Flags flags = Flags()) const { return &lastVector(flags) + 1; } + + /** * \param i Selects the offset, where the vector should be read. * * \return a smart object to wrap the \p i-th vector in the memory. @@ -307,8 +425,9 @@ * access memory at fixed strides. If access to known offsets from the aligned vectors is * needed the vector(size_t, int) function can be used. */ - Vc_ALWAYS_INLINE Vc_PURE VectorPointerHelper vector(size_t i) { - return VectorPointerHelper(&entries()[i * V::Size]); + template + Vc_ALWAYS_INLINE Vc_PURE typename std::enable_if::value, MemoryVector>::type &vector(size_t i, Flags = Flags()) { + return *new(&entries()[i * V::Size]) MemoryVector; } /** \brief Const overload of the above function * @@ -316,8 +435,9 @@ * * \return a smart object to wrap the \p i-th vector in the memory. */ - Vc_ALWAYS_INLINE Vc_PURE const VectorPointerHelperConst vector(size_t i) const { - return VectorPointerHelperConst(&entries()[i * V::Size]); + template + Vc_ALWAYS_INLINE Vc_PURE typename std::enable_if::value, MemoryVector>::type &vector(size_t i, Flags = Flags()) const { + return *new(const_cast(&entries()[i * V::Size])) MemoryVector; } /** @@ -335,12 +455,14 @@ * \param i Specifies the scalar entry from where the vector will be loaded/stored. I.e. 
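The flag-aware vector(i) overloads and the new begin()/end() iterators both expose the storage as MemoryVector chunks that convert to, and assign from, real vectors. A brief usage sketch on a fixed-size Memory (the size 100 is arbitrary):

\code
#include <Vc/Vc>
#include <cstddef>

void example()
{
    Vc::Memory<Vc::float_v, 100> mem;   // aligned, padded storage for 100 floats

    // whole-vector stores at aligned vector indices
    for (std::size_t i = 0; i < mem.vectorsCount(); ++i) {
        mem.vector(i) = Vc::float_v(static_cast<float>(i));
    }

    // iterator interface: every *it is a MemoryVector chunk aliasing the storage
    Vc::float_v sum = Vc::float_v::Zero();
    for (auto it = mem.begin(); it != mem.end(); ++it) {
        const Vc::float_v chunk = *it;   // MemoryVector converts to float_v
        sum += chunk;
    }

    mem[3] = sum.sum();                  // scalar access through operator[]
}
\endcode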
the * values scalar(i), scalar(i + 1), ..., scalar(i + V::Size - 1) will be read/overwritten. * - * \param align You must take care to determine whether an unaligned load/store is - * required. Per default an aligned load/store is used. If \p i is not a multiple of \c V::Size - * you must pass Vc::Unaligned here. - */ -#ifdef DOXYGEN - template inline VectorPointerHelper vectorAt(size_t i, A align = Vc::Aligned); + * \param flags You must take care to determine whether an unaligned load/store is + * required. Per default an unaligned load/store is used. If \p i is a multiple of \c V::Size + * you may want to pass Vc::Aligned here. + */ + template + Vc_ALWAYS_INLINE Vc_PURE MemoryVector &vectorAt(size_t i, Flags flags = Flags()) { + return *new(&entries()[i]) MemoryVector; + } /** \brief Const overload of the above function * * \return a smart object to wrap the vector starting from the \p i-th scalar entry in the memory. @@ -348,28 +470,14 @@ * \param i Specifies the scalar entry from where the vector will be loaded/stored. I.e. the * values scalar(i), scalar(i + 1), ..., scalar(i + V::Size - 1) will be read/overwritten. * - * \param align You must take care to determine whether an unaligned load/store is - * required. Per default an aligned load/store is used. If \p i is not a multiple of \c V::Size - * you must pass Vc::Unaligned here. - */ - template inline const VectorPointerHelperConst vectorAt(size_t i, A align = Vc::Aligned) const; -#else - template - Vc_ALWAYS_INLINE Vc_PURE VectorPointerHelper vectorAt(size_t i, A) { - return VectorPointerHelper(&entries()[i]); - } - template - Vc_ALWAYS_INLINE Vc_PURE const VectorPointerHelperConst vectorAt(size_t i, A) const { - return VectorPointerHelperConst(&entries()[i]); - } - - Vc_ALWAYS_INLINE Vc_PURE VectorPointerHelper vectorAt(size_t i) { - return VectorPointerHelper(&entries()[i]); - } - Vc_ALWAYS_INLINE Vc_PURE const VectorPointerHelperConst vectorAt(size_t i) const { - return VectorPointerHelperConst(&entries()[i]); + * \param flags You must take care to determine whether an unaligned load/store is + * required. Per default an unaligned load/store is used. If \p i is a multiple of \c V::Size + * you may want to pass Vc::Aligned here. + */ + template + Vc_ALWAYS_INLINE Vc_PURE MemoryVector &vectorAt(size_t i, Flags flags = Flags()) const { + return *new(const_cast(&entries()[i])) MemoryVector; } -#endif /** * \return a smart object to wrap the \p i-th vector + \p shift in the memory. @@ -398,12 +506,24 @@ * mem.vector(0, i) += 1; * \endcode */ - Vc_ALWAYS_INLINE Vc_PURE VectorPointerHelper vector(size_t i, int shift) { - return VectorPointerHelper(&entries()[i * V::Size + shift]); + template + Vc_ALWAYS_INLINE Vc_PURE typename std::enable_if< + std::is_convertible::value, + MemoryVector() | Unaligned)>>::type & + vector(size_t i, ShiftT shift, Flags = Flags()) + { + return *new (&entries()[i * V::Size + shift]) + MemoryVector() | Unaligned)>; } /// Const overload of the above function. - Vc_ALWAYS_INLINE Vc_PURE const VectorPointerHelperConst vector(size_t i, int shift) const { - return VectorPointerHelperConst(&entries()[i * V::Size + shift]); + template + Vc_ALWAYS_INLINE Vc_PURE typename std::enable_if< + std::is_convertible::value, + MemoryVector() | Unaligned)>>::type & + vector(size_t i, ShiftT shift, Flags = Flags()) const + { + return *new (const_cast(&entries()[i * V::Size + shift])) + MemoryVector() | Unaligned)>; } /** @@ -411,12 +531,14 @@ * * This function is simply a shorthand for vector(0). 
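vectorAt() addresses the array by scalar entry and defaults to an unaligned access, which is what sliding-window kernels need; vector(i, shift) serves the same purpose relative to an aligned vector index. A sketch of the former (the three-point average is an arbitrary stand-in computation):

\code
#include <Vc/Vc>
#include <cstddef>

// out[j] = (in[j-1] + in[j] + in[j+1]) / 3 for the interior entries
void smooth(const Vc::Memory<Vc::float_v, 128> &in, Vc::Memory<Vc::float_v, 128> &out)
{
    using V = Vc::float_v;
    for (std::size_t i = 1; i + V::Size + 1 <= 128; i += V::Size) {
        const V left   = in.vectorAt(i - 1);   // unaligned load by default
        const V center = in.vectorAt(i);
        const V right  = in.vectorAt(i + 1);
        out.vectorAt(i) = (left + center + right) / 3.f;
    }
}
\endcode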
*/ - Vc_ALWAYS_INLINE Vc_PURE VectorPointerHelper firstVector() { - return VectorPointerHelper(entries()); + template + Vc_ALWAYS_INLINE Vc_PURE MemoryVector &firstVector(Flags = Flags()) { + return *new(entries()) MemoryVector; } /// Const overload of the above function. - Vc_ALWAYS_INLINE Vc_PURE const VectorPointerHelperConst firstVector() const { - return VectorPointerHelperConst(entries()); + template + Vc_ALWAYS_INLINE Vc_PURE MemoryVector &firstVector(Flags = Flags()) const { + return *new(const_cast(entries())) MemoryVector; } /** @@ -424,12 +546,14 @@ * * This function is simply a shorthand for vector(vectorsCount() - 1). */ - Vc_ALWAYS_INLINE Vc_PURE VectorPointerHelper lastVector() { - return VectorPointerHelper(&entries()[vectorsCount() * V::Size - V::Size]); + template + Vc_ALWAYS_INLINE Vc_PURE MemoryVector &lastVector(Flags = Flags()) { + return *new(&entries()[vectorsCount() * V::Size - V::Size]) MemoryVector; } /// Const overload of the above function. - Vc_ALWAYS_INLINE Vc_PURE const VectorPointerHelperConst lastVector() const { - return VectorPointerHelperConst(&entries()[vectorsCount() * V::Size - V::Size]); + template + Vc_ALWAYS_INLINE Vc_PURE MemoryVector &lastVector(Flags = Flags()) const { + return *new(const_cast(&entries()[vectorsCount() * V::Size - V::Size])) MemoryVector; } Vc_ALWAYS_INLINE Vc_PURE V gather(const unsigned char *indexes) const { return V(entries(), indexes); } @@ -437,6 +561,9 @@ Vc_ALWAYS_INLINE Vc_PURE V gather(const unsigned int *indexes) const { return V(entries(), indexes); } Vc_ALWAYS_INLINE Vc_PURE V gather(const unsigned long *indexes) const { return V(entries(), indexes); } + /** + * Zero the whole memory area. + */ Vc_ALWAYS_INLINE void setZero() { V zero(Vc::Zero); for (size_t i = 0; i < vectorsCount(); ++i) { @@ -444,6 +571,19 @@ } } + /** + * Assign a value to all vectors in the array. + */ + template + Vc_ALWAYS_INLINE Parent &operator=(U &&x) { + for (size_t i = 0; i < vectorsCount(); ++i) { + vector(i) = std::forward(x); + } + } + + /** + * (Inefficient) shorthand to add up two arrays. + */ template inline Parent &operator+=(const MemoryBase &rhs) { assert(vectorsCount() == rhs.vectorsCount()); @@ -452,6 +592,10 @@ } return static_cast(*this); } + + /** + * (Inefficient) shorthand to subtract two arrays. + */ template inline Parent &operator-=(const MemoryBase &rhs) { assert(vectorsCount() == rhs.vectorsCount()); @@ -460,6 +604,10 @@ } return static_cast(*this); } + + /** + * (Inefficient) shorthand to multiply two arrays. + */ template inline Parent &operator*=(const MemoryBase &rhs) { assert(vectorsCount() == rhs.vectorsCount()); @@ -468,6 +616,10 @@ } return static_cast(*this); } + + /** + * (Inefficient) shorthand to divide two arrays. + */ template inline Parent &operator/=(const MemoryBase &rhs) { assert(vectorsCount() == rhs.vectorsCount()); @@ -476,6 +628,10 @@ } return static_cast(*this); } + + /** + * (Inefficient) shorthand to add a value to an array. + */ inline Parent &operator+=(EntryType rhs) { V v(rhs); for (size_t i = 0; i < vectorsCount(); ++i) { @@ -483,6 +639,10 @@ } return static_cast(*this); } + + /** + * (Inefficient) shorthand to subtract a value from an array. + */ inline Parent &operator-=(EntryType rhs) { V v(rhs); for (size_t i = 0; i < vectorsCount(); ++i) { @@ -490,6 +650,10 @@ } return static_cast(*this); } + + /** + * (Inefficient) shorthand to multiply a value to an array. 
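The whole-array conveniences documented above (setZero plus the scalar and array compound operators) are plain loops over vector(i); they are handy for setup code even if they are not the fastest way to express such operations. For illustration:

\code
#include <Vc/Vc>

void example()
{
    Vc::Memory<Vc::float_v, 64> a, b;

    a.setZero();     // every vector of a becomes V::Zero()
    b.setZero();
    b += 2.f;        // scalar shorthand: adds 2 to every entry of b
    a += 1.f;        // adds 1 to every entry of a
    a *= 3.f;        // multiplies every entry of a by 3
    a += b;          // element-wise shorthand over two equally sized arrays

    // the last line is equivalent to:
    // for (std::size_t i = 0; i < a.vectorsCount(); ++i) a.vector(i) += b.vector(i);
}
\endcode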
+ */ inline Parent &operator*=(EntryType rhs) { V v(rhs); for (size_t i = 0; i < vectorsCount(); ++i) { @@ -497,6 +661,10 @@ } return static_cast(*this); } + + /** + * (Inefficient) shorthand to divide an array with a value. + */ inline Parent &operator/=(EntryType rhs) { V v(rhs); for (size_t i = 0; i < vectorsCount(); ++i) { @@ -504,6 +672,10 @@ } return static_cast(*this); } + + /** + * (Inefficient) shorthand compare equality of two arrays. + */ template inline bool operator==(const MemoryBase &rhs) const { assert(vectorsCount() == rhs.vectorsCount()); @@ -514,6 +686,10 @@ } return true; } + + /** + * (Inefficient) shorthand compare two arrays. + */ template inline bool operator!=(const MemoryBase &rhs) const { assert(vectorsCount() == rhs.vectorsCount()); @@ -524,6 +700,10 @@ } return true; } + + /** + * (Inefficient) shorthand compare two arrays. + */ template inline bool operator<(const MemoryBase &rhs) const { assert(vectorsCount() == rhs.vectorsCount()); @@ -534,6 +714,10 @@ } return true; } + + /** + * (Inefficient) shorthand compare two arrays. + */ template inline bool operator<=(const MemoryBase &rhs) const { assert(vectorsCount() == rhs.vectorsCount()); @@ -544,6 +728,10 @@ } return true; } + + /** + * (Inefficient) shorthand compare two arrays. + */ template inline bool operator>(const MemoryBase &rhs) const { assert(vectorsCount() == rhs.vectorsCount()); @@ -554,6 +742,10 @@ } return true; } + + /** + * (Inefficient) shorthand compare two arrays. + */ template inline bool operator>=(const MemoryBase &rhs) const { assert(vectorsCount() == rhs.vectorsCount()); @@ -566,7 +758,7 @@ } }; -namespace Internal +namespace Detail { template - Copyright (C) 2011 Matthias Kretz +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the names of contributing organizations nor the + names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY +DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - Vc is free software: you can redistribute it and/or modify - it under the terms of the GNU Lesser General Public License as - published by the Free Software Foundation, either version 3 of - the License, or (at your option) any later version. 
+}}}*/ - Vc is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU Lesser General Public License for more details. +#ifndef VC_COMMON_MEMORYFWD_H_ +#define VC_COMMON_MEMORYFWD_H_ - You should have received a copy of the GNU Lesser General Public - License along with Vc. If not, see . - -*/ - -#ifndef VC_COMMON_MEMORYFWD_H -#define VC_COMMON_MEMORYFWD_H - -/*OUTER_NAMESPACE_BEGIN*/ -namespace Vc +namespace Vc_VERSIONED_NAMESPACE { - template class Memory; -} // namespace Vc -/*OUTER_NAMESPACE_END*/ +namespace Common +{ +template +class Memory; + +template +class MemoryBase; +} // namespace Common + +using Common::Memory; +} // namespace Vc -#endif // VC_COMMON_MEMORYFWD_H +#endif // VC_COMMON_MEMORYFWD_H_ diff -Nru vc-0.7.4/common/memory.h vc-1.3.0/common/memory.h --- vc-0.7.4/common/memory.h 2014-05-15 06:38:24.000000000 -0500 +++ vc-1.3.0/common/memory.h 2016-10-27 02:05:02.000000000 -0500 @@ -1,37 +1,45 @@ -/* This file is part of the Vc library. +/* This file is part of the Vc library. {{{ +Copyright © 2009-2015 Matthias Kretz - Copyright (C) 2009-2012 Matthias Kretz +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the names of contributing organizations nor the + names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY +DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - Vc is free software: you can redistribute it and/or modify - it under the terms of the GNU Lesser General Public License as - published by the Free Software Foundation, either version 3 of - the License, or (at your option) any later version. +}}}*/ - Vc is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with Vc. If not, see . 
- -*/ - -#ifndef VC_COMMON_MEMORY_H -#define VC_COMMON_MEMORY_H +#ifndef VC_COMMON_MEMORY_H_ +#define VC_COMMON_MEMORY_H_ #include "memorybase.h" #include #include #include #include +#include #include "memoryfwd.h" +#include "malloc.h" #include "macros.h" -/*OUTER_NAMESPACE_BEGIN*/ -namespace Vc +namespace Vc_VERSIONED_NAMESPACE { - /** * Allocates memory on the Heap with alignment and padding suitable for vectorized access. * @@ -64,9 +72,9 @@ * \headerfile memory.h */ template -Vc_ALWAYS_INLINE_L T *Vc_ALWAYS_INLINE_R malloc(size_t n) +Vc_ALWAYS_INLINE T *malloc(size_t n) { - return static_cast(Internal::Helper::malloc(n * sizeof(T))); + return static_cast(Common::malloc(n * sizeof(T))); } /** @@ -93,9 +101,11 @@ template Vc_ALWAYS_INLINE void free(T *p) { - Internal::Helper::free(p); + Common::free(p); } +namespace Common +{ template struct _MemorySizeCalculation { enum AlignmentCalculations { @@ -117,28 +127,42 @@ * \param Size1 Number of rows * \param Size2 Number of columns */ -template class Memory : public VectorAlignedBaseT, public MemoryBase, 2, Memory > +template +#ifdef Vc_RECURSIVE_MEMORY +class Memory : public MemoryBase, 2, + Memory> +#else +class Memory : public AlignedBase, + public MemoryBase, 2, + Memory> +#endif { - public: - typedef typename V::EntryType EntryType; - private: - typedef MemoryBase, 2, Memory > Base; - friend class MemoryBase, 2, Memory >; - friend class MemoryDimensionBase, 2, Memory >; - enum InternalConstants { - PaddedSize2 = _MemorySizeCalculation::PaddedSize - }; -#if defined(VC_ICC) && defined(_WIN32) - __declspec(align(__alignof(VectorAlignedBaseT))) -#elif defined(VC_CLANG) - __attribute__((aligned(__alignof(VectorAlignedBaseT)))) -#elif defined(VC_MSVC) - VectorAlignedBaseT _force_alignment; - // __declspec(align(#)) accepts only numbers not __alignof nor just VectorAlignment - // by putting VectorAlignedBaseT here _force_alignment is aligned correctly. - // the downside is that there's a lot of padding before m_mem (32 Bytes with SSE) :( +public: + typedef typename V::EntryType EntryType; + +private: +#ifdef Vc_RECURSIVE_MEMORY + using RowMemory = Memory; +#else + using RowMemory = Memory; #endif - EntryType m_mem[Size1][PaddedSize2]; + typedef MemoryBase, 2, RowMemory> Base; + friend class MemoryBase, 2, RowMemory>; + friend class MemoryDimensionBase, 2, + RowMemory>; + enum : size_t { + Alignment = V::MemoryAlignment, + PaddedSize2 = _MemorySizeCalculation::PaddedSize + }; + alignas(static_cast(Alignment)) // GCC complains about 'is not an + // integer constant' unless the + // static_cast is present +#ifdef Vc_RECURSIVE_MEMORY + RowMemory m_mem[Size1]; +#else + EntryType m_mem[Size1][PaddedSize2]; +#endif + public: using Base::vector; enum Constants { @@ -146,12 +170,26 @@ VectorsCount = PaddedSize2 / V::Size }; +#ifdef Vc_RECURSIVE_MEMORY + Memory() = default; +#else + Memory() + { + if (InitPadding) { + if (Size1 > 32) + for (size_t i = 0; i < Size1; ++i) { + V::Zero().store(&m_mem[i][PaddedSize2 - V::Size], Vc::Streaming); + } + } + } +#endif + /** * \return the number of rows in the array. * * \note This function can be eliminated by an optimizing compiler. */ - _VC_CONSTEXPR size_t rowsCount() const { return RowCount; } + static constexpr size_t rowsCount() { return RowCount; } /** * \return the number of scalar entries in the whole array. * @@ -160,13 +198,13 @@ * * \note This function can be optimized into a compile-time constant. 
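The two-dimensional specialisation above pads every row separately, so each row starts at an aligned address and can be used exactly like a one-dimensional Memory. A usage sketch with an arbitrary 8 by 100 shape:

\code
#include <Vc/Vc>
#include <cstddef>

void example()
{
    // 8 rows of 100 floats; every row is padded to a multiple of float_v::Size
    Vc::Memory<Vc::float_v, 8, 100> table;

    for (std::size_t r = 0; r < table.rowsCount(); ++r) {
        for (std::size_t v = 0; v < table[r].vectorsCount(); ++v) {
            table[r].vector(v) = Vc::float_v(static_cast<float>(r));   // fill row r with r
        }
    }

    table[2][99] = 42.f;   // scalar access: row 2, column 99
}
\endcode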
*/ - _VC_CONSTEXPR size_t entriesCount() const { return Size1 * Size2; } + static constexpr size_t entriesCount() { return Size1 * Size2; } /** * \return the number of vectors in the whole array. * * \note This function can be optimized into a compile-time constant. */ - _VC_CONSTEXPR size_t vectorsCount() const { return VectorsCount * Size1; } + static constexpr size_t vectorsCount() { return VectorsCount * Size1; } /** * Copies the data from a different object. @@ -180,12 +218,12 @@ template Vc_ALWAYS_INLINE Memory &operator=(const MemoryBase &rhs) { assert(vectorsCount() == rhs.vectorsCount()); - Internal::copyVectors(*this, rhs); + Detail::copyVectors(*this, rhs); return *this; } Vc_ALWAYS_INLINE Memory &operator=(const Memory &rhs) { - Internal::copyVectors(*this, rhs); + Detail::copyVectors(*this, rhs); return *this; } @@ -202,11 +240,7 @@ } return *this; } - } -#if defined(VC_ICC) && VC_ICC < 20120212 && !defined(_WIN32) - __attribute__((__aligned__(__alignof(VectorAlignedBaseT)))) -#endif - ; +}; /** * A helper class to simplify usage of correctly aligned and padded memory, allowing both vector and @@ -251,32 +285,31 @@ * \ingroup Utilities * \headerfile memory.h */ - template class Memory : public VectorAlignedBaseT, public MemoryBase, 1, void> +template +class Memory : +#ifndef Vc_RECURSIVE_MEMORY + public AlignedBase, +#endif + public MemoryBase, 1, void> { public: typedef typename V::EntryType EntryType; private: - typedef MemoryBase, 1, void> Base; - friend class MemoryBase, 1, void>; - friend class MemoryDimensionBase, 1, void>; - enum InternalConstants { - Alignment = V::Size, - AlignmentMask = Alignment - 1, - MaskedSize = Size & AlignmentMask, - Padding = Alignment - MaskedSize, + typedef MemoryBase, 1, void> Base; + friend class MemoryBase, 1, void>; + friend class MemoryDimensionBase, 1, void>; + enum : size_t { + Alignment = V::MemoryAlignment, // in Bytes + MaskedSize = Size & (V::Size - 1), // the fraction of Size that exceeds + // an integral multiple of V::Size + Padding = V::Size - MaskedSize, PaddedSize = MaskedSize == 0 ? Size : Size + Padding }; -#if defined(VC_ICC) && defined(_WIN32) - __declspec(align(__alignof(VectorAlignedBaseT))) -#elif defined(VC_CLANG) - __attribute__((aligned(__alignof(VectorAlignedBaseT)))) -#elif defined(VC_MSVC) - VectorAlignedBaseT _force_alignment; - // __declspec(align(#)) accepts only numbers not __alignof nor just VectorAlignment - // by putting VectorAlignedBaseT here _force_alignment is aligned correctly. - // the downside is that there's a lot of padding before m_mem (32 Bytes with SSE) :( -#endif - EntryType m_mem[PaddedSize]; + alignas(static_cast(Alignment)) // GCC complains about 'is not an + // integer constant' unless the + // static_cast is present + EntryType m_mem[PaddedSize]; + public: using Base::vector; enum Constants { @@ -284,6 +317,20 @@ VectorsCount = PaddedSize / V::Size }; + Memory() + { + if (InitPadding) { + Base::lastVector() = V::Zero(); + } + } + + Memory(std::initializer_list init) + { + Vc_ASSERT(init.size() <= Size); + Base::lastVector() = V::Zero(); + std::copy(init.begin(), init.end(), &m_mem[0]); + } + /** * Wrap existing data with the Memory convenience class. * @@ -306,14 +353,14 @@ * (not too early/not leaked). This function simply adds convenience functions to \em * access the memory. */ - static Vc_ALWAYS_INLINE Vc_CONST Memory &fromRawData(EntryType *ptr) + static Vc_ALWAYS_INLINE Vc_CONST Memory &fromRawData(EntryType *ptr) { // DANGER! This placement new has to use the right address. 
If the compiler decides // RowMemory requires padding before the actual data then the address has to be adjusted // accordingly char *addr = reinterpret_cast(ptr); - typedef Memory MM; - addr -= VC_OFFSETOF(MM, m_mem); + typedef Memory MM; + addr -= Vc_OFFSETOF(MM, m_mem); return *new(addr) MM; } @@ -322,42 +369,36 @@ * * \note This function can be optimized into a compile-time constant. */ - _VC_CONSTEXPR size_t entriesCount() const { return EntriesCount; } + static constexpr size_t entriesCount() { return EntriesCount; } /** * \return the number of vectors in the whole array. * * \note This function can be optimized into a compile-time constant. */ - _VC_CONSTEXPR size_t vectorsCount() const { return VectorsCount; } - -#ifdef VC_CXX11 - Vc_ALWAYS_INLINE Memory() = default; -#else - Vc_ALWAYS_INLINE Memory() {} -#endif + static constexpr size_t vectorsCount() { return VectorsCount; } inline Memory(const Memory &rhs) { - Internal::copyVectors(*this, rhs); + Detail::copyVectors(*this, rhs); } template inline Memory(const Memory &rhs) { assert(vectorsCount() == rhs.vectorsCount()); - Internal::copyVectors(*this, rhs); + Detail::copyVectors(*this, rhs); } inline Memory &operator=(const Memory &rhs) { - Internal::copyVectors(*this, rhs); + Detail::copyVectors(*this, rhs); return *this; } template inline Memory &operator=(const Memory &rhs) { assert(vectorsCount() == rhs.vectorsCount()); - Internal::copyVectors(*this, rhs); + Detail::copyVectors(*this, rhs); return *this; } @@ -371,11 +412,7 @@ } return *this; } - } -#if defined(VC_ICC) && VC_ICC < 20120212 && !defined(_WIN32) - __attribute__((__aligned__(__alignof(VectorAlignedBaseT)) )) -#endif - ; + }; /** * A helper class that is very similar to Memory but with dynamically allocated memory and @@ -417,7 +454,7 @@ * \ingroup Utilities * \headerfile memory.h */ - template class Memory : public MemoryBase, 1, void> + template class Memory : public MemoryBase, 1, void> { public: typedef typename V::EntryType EntryType; @@ -453,6 +490,7 @@ m_mem(Vc::malloc(m_vectorsCount)) { m_vectorsCount /= V::Size; + Base::lastVector() = V::Zero(); } /** @@ -468,7 +506,7 @@ m_vectorsCount(rhs.vectorsCount()), m_mem(Vc::malloc(m_vectorsCount * V::Size)) { - Internal::copyVectors(*this, rhs); + Detail::copyVectors(*this, rhs); } /** @@ -483,7 +521,7 @@ m_vectorsCount(rhs.vectorsCount()), m_mem(Vc::malloc(m_vectorsCount * V::Size)) { - Internal::copyVectors(*this, rhs); + Detail::copyVectors(*this, rhs); } /** @@ -527,13 +565,13 @@ template Vc_ALWAYS_INLINE Memory &operator=(const MemoryBase &rhs) { assert(vectorsCount() == rhs.vectorsCount()); - Internal::copyVectors(*this, rhs); + Detail::copyVectors(*this, rhs); return *this; } Vc_ALWAYS_INLINE Memory &operator=(const Memory &rhs) { assert(vectorsCount() == rhs.vectorsCount()); - Internal::copyVectors(*this, rhs); + Detail::copyVectors(*this, rhs); return *this; } @@ -564,7 +602,7 @@ */ Vc_ALWAYS_INLINE void prefetchForOneRead(const void *addr) { - Internal::Helper::prefetchForOneRead(addr); + Vc::Detail::prefetchForOneRead(addr, VectorAbi::Best()); } /** @@ -581,7 +619,7 @@ */ Vc_ALWAYS_INLINE void prefetchForModify(const void *addr) { - Internal::Helper::prefetchForModify(addr); + Vc::Detail::prefetchForModify(addr, VectorAbi::Best()); } /** @@ -596,7 +634,7 @@ */ Vc_ALWAYS_INLINE void prefetchClose(const void *addr) { - Internal::Helper::prefetchClose(addr); + Vc::Detail::prefetchClose(addr, VectorAbi::Best()); } /** @@ -611,7 +649,7 @@ */ Vc_ALWAYS_INLINE void prefetchMid(const void *addr) { - 
Internal::Helper::prefetchMid(addr); + Vc::Detail::prefetchMid(addr, VectorAbi::Best()); } /** @@ -626,17 +664,21 @@ */ Vc_ALWAYS_INLINE void prefetchFar(const void *addr) { - Internal::Helper::prefetchFar(addr); + Vc::Detail::prefetchFar(addr, VectorAbi::Best()); } +} // namespace Common -} // namespace Vc -/*OUTER_NAMESPACE_END*/ +using Common::Memory; +using Common::prefetchForOneRead; +using Common::prefetchForModify; +using Common::prefetchClose; +using Common::prefetchMid; +using Common::prefetchFar; +} // namespace Vc namespace std { template Vc_ALWAYS_INLINE void swap(Vc::Memory &a, Vc::Memory &b) { a.swap(b); } } // namespace std -#include "undomacros.h" - -#endif // VC_COMMON_MEMORY_H +#endif // VC_COMMON_MEMORY_H_ diff -Nru vc-0.7.4/common/operators.h vc-1.3.0/common/operators.h --- vc-0.7.4/common/operators.h 2014-05-15 06:38:24.000000000 -0500 +++ vc-1.3.0/common/operators.h 2016-10-27 02:05:02.000000000 -0500 @@ -1,209 +1,257 @@ -#ifndef VC_ICC -// ICC ICEs if the following type-traits are in the anonymous namespace -namespace -{ -#endif -template struct EnableIfNeitherIntegerNorVector : public EnableIf::Value, T> {}; -template struct EnableIfNeitherIntegerNorVector, T>; +/* This file is part of the Vc library. {{{ +Copyright © 2012-2016 Matthias Kretz -template struct IsVector { enum { Value = false }; }; -template struct IsVector > { enum { Value = true }; }; +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the names of contributing organizations nor the + names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY +DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +}}}*/ + +#ifndef COMMON_OPERATORS_H_ +#define COMMON_OPERATORS_H_ +#include "macros.h" -template struct IsTypeCombinationOf +namespace Vc_VERSIONED_NAMESPACE +{ +namespace Detail { - enum { - Value = IsVector::Value ? (IsVector::Value ? 
( // Vec × Vec - ( IsEqualType::Value && HasImplicitCast::Value && !HasImplicitCast::Value) || - (HasImplicitCast::Value && IsEqualType::Value && !HasImplicitCast::Value) || - ( IsEqualType::Value && HasImplicitCast::Value && !HasImplicitCast::Value) || - (HasImplicitCast::Value && IsEqualType::Value && !HasImplicitCast::Value) - ) : ( // Vec × Scalar - (HasImplicitCast::Value && IsEqualType::Value && !HasImplicitCast::Value) || - ( IsEqualType::Value && HasImplicitCast::Value && !HasImplicitCast::Value) - )) : (IsVector::Value ? ( // Scalar × Vec - ( IsEqualType::Value && HasImplicitCast::Value && !HasImplicitCast::Value) || - (HasImplicitCast::Value && IsEqualType::Value && !HasImplicitCast::Value) - ) : ( // Scalar × Scalar - ( IsEqualType::Value && IsEqualType::Value) || - ( IsEqualType::Value && IsEqualType::Value) - )) - }; +template +enable_if::value, U> is_convertible_to_any_vector(Vector); +template T is_convertible_to_any_vector(Vector); +template void is_convertible_to_any_vector(...); + +template ::value, + bool = std::is_integral::value> +struct FundamentalReturnType; +template struct FundamentalReturnType { + using type = typename std::conditional< + std::is_arithmetic::value, + typename std::conditional<(sizeof(T) < sizeof(U)), U, T>::type, + // U is not arithmetic, e.g. an enum or a type with e.g. operator int() + T>::type; +}; +template struct FundamentalReturnType { + using type = typename std::conditional< + std::is_arithmetic::value, U, + // U is not arithmetic, e.g. an enum or a type with e.g. operator int() + T>::type; +}; +template struct FundamentalReturnType { + using type = T; }; -template struct IsVectorOperands -{ - enum { - Value = (HasImplicitCast::Value && !HasImplicitCast::Value && !IsEqualType::Value && IsEqualType::Value) - || (HasImplicitCast::Value && !HasImplicitCast::Value && !IsEqualType::Value && IsEqualType::Value) - }; +template struct my_make_signed : public std::make_signed { +}; +template <> struct my_make_signed { + using type = bool; +}; + +template +struct higher_conversion_rank { + template + using fix_sign = + typename std::conditional<(std::is_unsigned::value || + std::is_unsigned::value), + typename std::make_unsigned::type, A>::type; + using T = typename my_make_signed::type; + using U = typename my_make_signed::type; + template + using c = typename std::conditional::value || + std::is_same::value, + Test, Otherwise>::type; + + using type = fix_sign>>>>>; +}; + +template struct FundamentalReturnType { + template + using c = typename std::conditional::type; + using type = + c<(sizeof(T) > sizeof(U)), T, + c<(sizeof(T) < sizeof(U)), U, typename higher_conversion_rank::type>>; +}; +static_assert(std::is_same::type>::value, ""); + +template struct ReturnTypeImpl { + // no type => SFINAE +}; +template +struct ReturnTypeImpl, Vector, false, Deduced, false> { + using type = Vc::Vector::type, Abi>; +}; +template +struct ReturnTypeImpl, int, true, T, true> { + using type = Vc::Vector; +}; +template +struct ReturnTypeImpl, unsigned int, true, T, true> { + using type = Vc::Vector::type, Abi>; +}; +template +struct ReturnTypeImpl, U, true, T, Integral> { + using type = Vc::Vector::type, Abi>; +}; +template +struct ReturnTypeImpl, U, false, void, Integral> { + // no type => SFINAE +}; +template +struct ReturnTypeImpl, U, false, V, Integral> { + using type = Vc::Vector::type, Abi>; +}; +template +using ReturnType = ReturnTypeImpl< + V, T, std::is_arithmetic::value || std::is_convertible::value, + decltype(is_convertible_to_any_vector( + 
std::declval())), + std::is_integral::value>; + +template struct is_a_type : public std::true_type { }; -#ifndef VC_ICC -} -#endif -// float-int arithmetic operators //{{{1 -// These operators must be very picky about the exact types they want to handle. Once (uncontrolled) -// implicit type conversions get involved, ambiguous overloads will occur. E.g. a simple int × enum -// will become ambiguous because it can convert both to a vector type, which then can execute the -// operator. We can't argue that such code should not be used - it could break existing code, not -// under control of the developer, just by putting the Vc header somewhere on top. -// -// The following type combinations are safe (always symmetric): -// 1. Vector × Vector -// 2. Vector × Scalar (int, float, enum value, ...) -// 3. Some object that has a vector cast operator × Vector -// 4. Some object that has a vector cast operator × Scalar -// -// Additionally there are restrictions on which types combine to what resulting type: -// 1.a. float × double_v -> double_v -// 1.b. any int × double_v -> double_v -// 2.a. (u)int_v × float_v -> float_v -// 2.b. (u)int_v × float -> float_v -// 2.c. any int × float_v -> float_v -// 3.a. (u)short_v × sfloat_v -> sfloat_v -// 3.b. (u)short_v × float -> sfloat_v -// 3.c. short × sfloat_v -> sfloat_v -// 4.a. int_v × uint_v -> uint_v -// 4.b. any int × uint_v -> uint_v -// 4.c. unsigned int × int_v -> uint_v -// 4.d. signed int × int_v -> int_v -// 5. shorts like ints - -#define VC_OPERATOR_FORWARD_(ret, op) \ -template static Vc_ALWAYS_INLINE typename EnableIf< \ - IsVectorOperands::Value || \ - ((IsEqualType::Value || IsLikeInteger::Value) && HasImplicitCast::Value && !HasImplicitCast::Value) || \ - ((IsEqualType::Value || IsLikeInteger::Value) && HasImplicitCast::Value && !HasImplicitCast::Value) || \ - false, double_##ret>::Value operator op(const T0 &x, const T1 &y) { return double_v(x) op double_v(y); } \ -\ -template static Vc_ALWAYS_INLINE typename EnableIf< \ - IsVectorOperands::Value || \ - IsTypeCombinationOf::Value || \ - IsTypeCombinationOf::Value || \ - IsTypeCombinationOf::Value || \ - IsTypeCombinationOf::Value || \ - (IsLikeInteger::Value && HasImplicitCast::Value && !HasImplicitCast::Value) || \ - (IsLikeInteger::Value && HasImplicitCast::Value && !HasImplicitCast::Value) || \ - false, float_##ret>::Value operator op(const T0 &x, const T1 &y) { return float_v(x) op float_v(y); } \ -\ -template static Vc_ALWAYS_INLINE typename EnableIf< \ - IsVectorOperands::Value || \ - IsTypeCombinationOf::Value || \ - IsTypeCombinationOf::Value || \ - IsTypeCombinationOf::Value || \ - IsTypeCombinationOf::Value || \ - (IsLikeInteger::Value && HasImplicitCast::Value && !HasImplicitCast::Value) || \ - (IsLikeInteger::Value && HasImplicitCast::Value && !HasImplicitCast::Value) || \ - false, sfloat_##ret>::Value operator op(const T0 &x, const T1 &y) { return sfloat_v(x) op sfloat_v(y); } \ -\ -template static Vc_ALWAYS_INLINE typename EnableIf< \ - IsVectorOperands::Value || \ - IsTypeCombinationOf::Value || \ - (IsUnsignedInteger::Value && HasImplicitCast::Value && !HasImplicitCast::Value) || \ - (IsUnsignedInteger::Value && HasImplicitCast::Value && !HasImplicitCast::Value) || \ - (IsLikeInteger::Value && !IsEqualType::Value && HasImplicitCast::Value && !HasImplicitCast::Value) || \ - (IsLikeInteger::Value && !IsEqualType::Value && HasImplicitCast::Value && !HasImplicitCast::Value) || \ - false, uint_##ret>::Value operator op(const T0 &x, const T1 &y) { return uint_v(x) op uint_v(y); 
} \ -template static Vc_ALWAYS_INLINE typename EnableIf< \ - IsVectorOperands::Value || \ - (IsLikeSignedInteger::Value && !IsEqualType::Value && HasImplicitCast::Value && !HasImplicitCast::Value) || \ - (IsLikeSignedInteger::Value && !IsEqualType::Value && HasImplicitCast::Value && !HasImplicitCast::Value) || \ - false, int_##ret>::Value operator op(const T0 &x, const T1 &y) { return int_v(x) op int_v(y); } \ -\ -template static Vc_ALWAYS_INLINE typename EnableIf< \ - IsVectorOperands::Value || \ - IsTypeCombinationOf::Value || \ - (IsUnsignedInteger::Value && HasImplicitCast::Value && !HasImplicitCast::Value) || \ - (IsUnsignedInteger::Value && HasImplicitCast::Value && !HasImplicitCast::Value) || \ - (IsLikeInteger::Value && !IsEqualType::Value && HasImplicitCast::Value && !HasImplicitCast::Value) || \ - (IsLikeInteger::Value && !IsEqualType::Value && HasImplicitCast::Value && !HasImplicitCast::Value) || \ - false, ushort_##ret>::Value operator op(const T0 &x, const T1 &y) { return ushort_v(x) op ushort_v(y); } \ -template static Vc_ALWAYS_INLINE typename EnableIf< \ - IsVectorOperands::Value || \ - (IsLikeSignedInteger::Value && !IsEqualType::Value && HasImplicitCast::Value && !HasImplicitCast::Value) || \ - (IsLikeSignedInteger::Value && !IsEqualType::Value && HasImplicitCast::Value && !HasImplicitCast::Value) || \ - false, short_##ret>::Value operator op(const T0 &x, const T1 &y) { return short_v(x) op short_v(y); } - - -// break incorrect combinations -#define VC_OPERATOR_INTENTIONAL_ERROR_1(V, op) \ -template static inline typename EnableIfNeitherIntegerNorVector >::Value operator op(const V &, const T &) { return Vc::Error::invalid_operands_of_types(); } \ -template static inline typename EnableIfNeitherIntegerNorVector >::Value operator op(const T &, const V &) { return Vc::Error::invalid_operands_of_types(); } - -#define VC_OPERATOR_INTENTIONAL_ERROR_2(V1, V2, op) \ -static inline Vc::Error::invalid_operands_of_types operator op(V1::AsArg, V2::AsArg) { return Vc::Error::invalid_operands_of_types(); } \ -static inline Vc::Error::invalid_operands_of_types operator op(V2::AsArg, V1::AsArg) { return Vc::Error::invalid_operands_of_types(); } - -#define VC_OPERATOR_INTENTIONAL_ERROR_3(V, _T, op) \ -template static inline typename EnableIf::Value, Vc::Error::invalid_operands_of_types >::Value operator op(const V &, const T &) { return Vc::Error::invalid_operands_of_types(); } \ -template static inline typename EnableIf::Value, Vc::Error::invalid_operands_of_types >::Value operator op(const T &, const V &) { return Vc::Error::invalid_operands_of_types(); } - -//#define VC_EXTRA_CHECKING -#ifdef VC_EXTRA_CHECKING -#define VC_OPERATOR_INTENTIONAL_ERROR(op) \ - VC_OPERATOR_INTENTIONAL_ERROR_2(double_v, sfloat_v, op) \ - VC_OPERATOR_INTENTIONAL_ERROR_2(double_v, float_v, op) \ - VC_OPERATOR_INTENTIONAL_ERROR_2(double_v, int_v, op) \ - VC_OPERATOR_INTENTIONAL_ERROR_2(double_v, uint_v, op) \ - VC_OPERATOR_INTENTIONAL_ERROR_2(double_v, short_v, op) \ - VC_OPERATOR_INTENTIONAL_ERROR_2(double_v, ushort_v, op) \ - VC_OPERATOR_INTENTIONAL_ERROR_2( int_v, short_v, op) \ - VC_OPERATOR_INTENTIONAL_ERROR_2( uint_v, short_v, op) \ - VC_OPERATOR_INTENTIONAL_ERROR_2( int_v, ushort_v, op) \ - VC_OPERATOR_INTENTIONAL_ERROR_2( uint_v, ushort_v, op) \ - VC_APPLY_1(VC_LIST_VECTOR_TYPES, VC_OPERATOR_INTENTIONAL_ERROR_1, op) \ - VC_OPERATOR_INTENTIONAL_ERROR_2( float_v, short_v, op) \ - VC_OPERATOR_INTENTIONAL_ERROR_2( float_v, ushort_v, op) \ - VC_OPERATOR_INTENTIONAL_ERROR_2(sfloat_v, float_v, op) \ - 
VC_OPERATOR_INTENTIONAL_ERROR_2(sfloat_v, int_v, op) \ - VC_OPERATOR_INTENTIONAL_ERROR_2(sfloat_v, uint_v, op) \ - VC_OPERATOR_INTENTIONAL_ERROR_3( float_v, double, op) \ - VC_OPERATOR_INTENTIONAL_ERROR_3(sfloat_v, double, op) +#ifdef Vc_ENABLE_FLOAT_BIT_OPERATORS +#define Vc_TEST_FOR_BUILTIN_OPERATOR(op_) true #else -#define VC_OPERATOR_INTENTIONAL_ERROR(op) +#define Vc_TEST_FOR_BUILTIN_OPERATOR(op_) \ + Detail::is_a_type, U>::type::EntryType>() \ + op_ std::declval, \ + U>::type::EntryType>())>::value #endif +} // namespace Detail -#define VC_OPERATOR_FORWARD_COMMUTATIVE(ret, op, op2) \ -template static Vc_ALWAYS_INLINE VC_EXACT_TYPE(T, double, double_##ret) operator op(T x, double_v::AsArg y) { return y op2 x; } \ -template static Vc_ALWAYS_INLINE VC_EXACT_TYPE(T, float, sfloat_##ret) operator op(T x, sfloat_v::AsArg y) { return y op2 x; } \ -template static Vc_ALWAYS_INLINE VC_EXACT_TYPE(T, float, float_##ret) operator op(T x, float_v::AsArg y) { return y op2 x; } \ -template static Vc_ALWAYS_INLINE VC_EXACT_TYPE(T, int, int_##ret) operator op(T x, int_v::AsArg y) { return y op2 x; } \ -template static Vc_ALWAYS_INLINE VC_EXACT_TYPE(T, unsigned int, uint_##ret) operator op(T x, uint_v::AsArg y) { return y op2 x; } \ -template static Vc_ALWAYS_INLINE VC_EXACT_TYPE(T, short, short_##ret) operator op(T x, short_v::AsArg y) { return y op2 x; } \ -template static Vc_ALWAYS_INLINE VC_EXACT_TYPE(T, unsigned short, ushort_##ret) operator op(T x, ushort_v::AsArg y) { return y op2 x; } \ -VC_OPERATOR_FORWARD_(ret, op) \ -VC_OPERATOR_INTENTIONAL_ERROR(op) - -#define VC_OPERATOR_FORWARD(ret, op) \ -template static Vc_ALWAYS_INLINE VC_EXACT_TYPE(T, double, double_##ret) operator op(T x, double_v::AsArg y) { return double_v(x) op y; } \ -template static Vc_ALWAYS_INLINE VC_EXACT_TYPE(T, float, sfloat_##ret) operator op(T x, sfloat_v::AsArg y) { return sfloat_v(x) op y; } \ -template static Vc_ALWAYS_INLINE VC_EXACT_TYPE(T, float, float_##ret) operator op(T x, float_v::AsArg y) { return float_v(x) op y; } \ -template static Vc_ALWAYS_INLINE VC_EXACT_TYPE(T, int, int_##ret) operator op(T x, int_v::AsArg y) { return int_v(x) op y; } \ -template static Vc_ALWAYS_INLINE VC_EXACT_TYPE(T, unsigned int, uint_##ret) operator op(T x, uint_v::AsArg y) { return uint_v(x) op y; } \ -template static Vc_ALWAYS_INLINE VC_EXACT_TYPE(T, short, short_##ret) operator op(T x, short_v::AsArg y) { return short_v(x) op y; } \ -template static Vc_ALWAYS_INLINE VC_EXACT_TYPE(T, unsigned short, ushort_##ret) operator op(T x, ushort_v::AsArg y) { return ushort_v(x) op y; } \ -VC_OPERATOR_FORWARD_(ret, op) \ -VC_OPERATOR_INTENTIONAL_ERROR(op) - -VC_OPERATOR_FORWARD_COMMUTATIVE(v, *, *) -VC_OPERATOR_FORWARD(v, /) -VC_OPERATOR_FORWARD_COMMUTATIVE(v, +, +) -VC_OPERATOR_FORWARD(v, -) -VC_OPERATOR_FORWARD_COMMUTATIVE(v, |, |) -VC_OPERATOR_FORWARD_COMMUTATIVE(v, &, &) -VC_OPERATOR_FORWARD_COMMUTATIVE(v, ^, ^) -VC_OPERATOR_FORWARD_COMMUTATIVE(m, <, >) -VC_OPERATOR_FORWARD_COMMUTATIVE(m, >, <) -VC_OPERATOR_FORWARD_COMMUTATIVE(m, <=, >=) -VC_OPERATOR_FORWARD_COMMUTATIVE(m, >=, <=) -VC_OPERATOR_FORWARD_COMMUTATIVE(m, ==, ==) -VC_OPERATOR_FORWARD_COMMUTATIVE(m, !=, !=) - -#undef VC_OPERATOR_FORWARD_ -#undef VC_OPERATOR_INTENTIONAL_ERROR_1 -#undef VC_OPERATOR_INTENTIONAL_ERROR_2 -#undef VC_OPERATOR_INTENTIONAL_ERROR -#undef VC_OPERATOR_FORWARD_COMMUTATIVE -#undef VC_OPERATOR_FORWARD +#define Vc_GENERIC_OPERATOR(op_) \ + template \ + Vc_ALWAYS_INLINE enable_if< \ + Vc_TEST_FOR_BUILTIN_OPERATOR(op_) && \ + std::is_convertible, typename 
Detail::ReturnType< \ + Vector, U>::type>::value && \ + std::is_convertible< \ + U, typename Detail::ReturnType, U>::type>::value, \ + typename Detail::ReturnType, U>::type> \ + operator op_(Vector x, const U &y) \ + { \ + using V = typename Detail::ReturnType, U>::type; \ + return Detail::operator op_(V(x), V(y)); \ + } \ + template \ + Vc_ALWAYS_INLINE enable_if< \ + Vc_TEST_FOR_BUILTIN_OPERATOR(op_) && \ + !Traits::is_simd_vector_internal::value && \ + std::is_convertible, typename Detail::ReturnType< \ + Vector, U>::type>::value && \ + std::is_convertible< \ + U, typename Detail::ReturnType, U>::type>::value, \ + typename Detail::ReturnType, U>::type> \ + operator op_(const U &x, Vector y) \ + { \ + using V = typename Detail::ReturnType, U>::type; \ + return Detail::operator op_(V(x), V(y)); \ + } \ + template \ + Vc_ALWAYS_INLINE enable_if< \ + Vc_TEST_FOR_BUILTIN_OPERATOR(op_) && \ + std::is_convertible, typename Detail::ReturnType< \ + Vector, U>::type>::value && \ + std::is_convertible< \ + U, typename Detail::ReturnType, U>::type>::value, \ + Vector &> \ + operator op_##=(Vector &x, const U &y) \ + { \ + using V = typename Detail::ReturnType, U>::type; \ + x = Detail::operator op_(V(x), V(y)); \ + return x; \ + } + +#define Vc_LOGICAL_OPERATOR(op_) \ + template \ + Vc_ALWAYS_INLINE typename Vector::Mask operator op_(Vector x, \ + Vector y) \ + { \ + return !!x op_ !!y; \ + } \ + template \ + Vc_ALWAYS_INLINE enable_if< \ + std::is_convertible, Vector>::value && \ + std::is_convertible, Vector>::value, \ + typename Detail::ReturnType, Vector>::type::Mask> \ + operator op_(Vector x, Vector y) \ + { \ + return !!x op_ !!y; \ + } \ + template \ + Vc_ALWAYS_INLINE \ + enable_if())>::value, \ + typename Vector::Mask> \ + operator op_(Vector x, const U &y) \ + { \ + using M = typename Vector::Mask; \ + return !!x op_ M(!!y); \ + } \ + template \ + Vc_ALWAYS_INLINE \ + enable_if())>::value, \ + typename Vector::Mask> \ + operator op_(const U &x, Vector y) \ + { \ + using M = typename Vector::Mask; \ + return M(!!x) op_ !!y; \ + } + +#define Vc_COMPARE_OPERATOR(op_) \ + template \ + Vc_ALWAYS_INLINE enable_if< \ + std::is_convertible, typename Detail::ReturnType< \ + Vector, U>::type>::value && \ + std::is_convertible< \ + U, typename Detail::ReturnType, U>::type>::value, \ + typename Detail::ReturnType, U>::type::Mask> \ + operator op_(Vector x, const U &y) \ + { \ + using V = typename Detail::ReturnType, U>::type; \ + return Detail::operator op_(V(x), V(y)); \ + } \ + template \ + Vc_ALWAYS_INLINE enable_if< \ + !Traits::is_simd_vector_internal::value && \ + std::is_convertible, typename Detail::ReturnType< \ + Vector, U>::type>::value && \ + std::is_convertible< \ + U, typename Detail::ReturnType, U>::type>::value, \ + typename Detail::ReturnType, U>::type::Mask> \ + operator op_(const U &x, Vector y) \ + { \ + using V = typename Detail::ReturnType, U>::type; \ + return Detail::operator op_(V(x), V(y)); \ + } + +Vc_ALL_LOGICAL (Vc_LOGICAL_OPERATOR); +Vc_ALL_BINARY (Vc_GENERIC_OPERATOR); +Vc_ALL_ARITHMETICS(Vc_GENERIC_OPERATOR); +Vc_ALL_COMPARES (Vc_COMPARE_OPERATOR); + +#undef Vc_LOGICAL_OPERATOR +#undef Vc_GENERIC_OPERATOR +#undef Vc_COMPARE_OPERATOR +#undef Vc_INVALID_OPERATOR -// }}}1 +} // namespace Vc +#endif // COMMON_OPERATORS_H_ diff -Nru vc-0.7.4/common/permutation.h vc-1.3.0/common/permutation.h --- vc-0.7.4/common/permutation.h 1969-12-31 18:00:00.000000000 -0600 +++ vc-1.3.0/common/permutation.h 2016-10-27 02:05:02.000000000 -0500 @@ -0,0 +1,44 @@ +/* This file is part of 
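// ---------------------------------------------------------------------------
// [Example sketch, not part of the patch] What the mixed-type operator rules
// in the new common/operators.h boil down to for users, assuming the Vc 1.x
// public vector types. The result types follow Detail::ReturnType /
// FundamentalReturnType shown above (for instance, an unsigned integer scalar
// promotes a signed integer vector to its unsigned counterpart).
#include <Vc/Vc>

void mixedTypeOperators()
{
    Vc::float_v f = 2.f;
    Vc::int_v   i = 3;

    auto a = f * 2;      // int scalar is broadcast and converted: result is float_v
    auto b = i + 1;      // int scalar keeps the vector type:      result is int_v
    auto c = i * 2u;     // unsigned scalar promotes:              result is uint_v
    auto m = f > 1;      // comparisons yield masks:               result is float_v::Mask
    (void)a; (void)b; (void)c; (void)m;
}
// ---------------------------------------------------------------------------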
the Vc library. {{{ +Copyright © 2015 Matthias Kretz + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the names of contributing organizations nor the + names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY +DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +}}}*/ + +#ifndef VC_COMMON_PERMUTATION_H_ +#define VC_COMMON_PERMUTATION_H_ + +#include "macros.h" + +namespace Vc_VERSIONED_NAMESPACE +{ +namespace Permutation +{ +struct ReversedTag {}; +constexpr ReversedTag Reversed{}; +} // namespace Permutation +} + +#endif // VC_COMMON_PERMUTATION_H_ + +// vim: foldmethod=marker diff -Nru vc-0.7.4/common/scatterimplementation.h vc-1.3.0/common/scatterimplementation.h --- vc-0.7.4/common/scatterimplementation.h 1969-12-31 18:00:00.000000000 -0600 +++ vc-1.3.0/common/scatterimplementation.h 2016-10-27 02:05:02.000000000 -0500 @@ -0,0 +1,270 @@ +/* This file is part of the Vc library. {{{ +Copyright © 2014-2015 Matthias Kretz + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the names of contributing organizations nor the + names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY +DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +}}}*/ + +#ifndef VC_COMMON_SCATTERIMPLEMENTATION_H_ +#define VC_COMMON_SCATTERIMPLEMENTATION_H_ + +#include "gatherimplementation.h" +#include "macros.h" + +namespace Vc_VERSIONED_NAMESPACE +{ +namespace Common +{ + +template +Vc_ALWAYS_INLINE void executeScatter(SetIndexZeroT, + V &v, + MT *mem, + IT indexes, + typename V::MaskArgument mask) +{ + indexes.setZeroInverted(static_cast(mask)); + // Huh? + const V tmp(mem, indexes); + where(mask) | v = tmp; +} + +template +Vc_ALWAYS_INLINE void executeScatter(SimpleLoopT, + V &v, + MT *mem, + const IT &indexes, + typename V::MaskArgument mask) +{ + if (Vc_IS_UNLIKELY(mask.isEmpty())) { + return; + } + Common::unrolled_loop([&](std::size_t i) { + if (mask[i]) + mem[indexes[i]] = v[i]; + }); +} + +template +Vc_ALWAYS_INLINE void executeScatter(BitScanLoopT, + V &v, + MT *mem, + const IT &indexes, + typename V::MaskArgument mask) +{ + size_t bits = mask.toInt(); + while (Vc_IS_LIKELY(bits > 0)) { + size_t i, j; + asm("bsf %[bits],%[i]\n\t" + "bsr %[bits],%[j]\n\t" + "btr %[i],%[bits]\n\t" + "btr %[j],%[bits]\n\t" + : [i] "=r"(i), [j] "=r"(j), [bits] "+r"(bits)); + mem[indexes[i]] = v[i]; + mem[indexes[j]] = v[j]; + } + + /* Alternative from Vc::SSE (0.7) + int bits = mask.toInt(); + while (bits) { + const int i = _bit_scan_forward(bits); + bits ^= (1 << i); // btr? 
+ mem[indexes[i]] = v[i]; + } + */ +} + +template +Vc_ALWAYS_INLINE void executeScatter(PopcntSwitchT, + V &v, + MT *mem, + const IT &indexes, + typename V::MaskArgument mask, + enable_if = nullarg) +{ + unsigned int bits = mask.toInt(); + unsigned int low, high = 0; + switch (Vc::Detail::popcnt16(bits)) { + case 16: + v.scatter(mem, indexes); + break; + case 15: + low = _bit_scan_forward(bits); + bits ^= 1 << low; + mem[indexes[low]] = v[low]; + case 14: + high = _bit_scan_reverse(bits); + mem[indexes[high]] = v[high]; + high = (1 << high); + case 13: + low = _bit_scan_forward(bits); + bits ^= high | (1 << low); + mem[indexes[low]] = v[low]; + case 12: + high = _bit_scan_reverse(bits); + mem[indexes[high]] = v[high]; + high = (1 << high); + case 11: + low = _bit_scan_forward(bits); + bits ^= high | (1 << low); + mem[indexes[low]] = v[low]; + case 10: + high = _bit_scan_reverse(bits); + mem[indexes[high]] = v[high]; + high = (1 << high); + case 9: + low = _bit_scan_forward(bits); + bits ^= high | (1 << low); + mem[indexes[low]] = v[low]; + case 8: + high = _bit_scan_reverse(bits); + mem[indexes[high]] = v[high]; + high = (1 << high); + case 7: + low = _bit_scan_forward(bits); + bits ^= high | (1 << low); + mem[indexes[low]] = v[low]; + case 6: + high = _bit_scan_reverse(bits); + mem[indexes[high]] = v[high]; + high = (1 << high); + case 5: + low = _bit_scan_forward(bits); + bits ^= high | (1 << low); + mem[indexes[low]] = v[low]; + case 4: + high = _bit_scan_reverse(bits); + mem[indexes[high]] = v[high]; + high = (1 << high); + case 3: + low = _bit_scan_forward(bits); + bits ^= high | (1 << low); + mem[indexes[low]] = v[low]; + case 2: + high = _bit_scan_reverse(bits); + mem[indexes[high]] = v[high]; + case 1: + low = _bit_scan_forward(bits); + mem[indexes[low]] = v[low]; + case 0: + break; + } +} +template +Vc_ALWAYS_INLINE void executeScatter(PopcntSwitchT, + V &v, + MT *mem, + const IT &indexes, + typename V::MaskArgument mask, + enable_if = nullarg) +{ + unsigned int bits = mask.toInt(); + unsigned int low, high = 0; + switch (Vc::Detail::popcnt8(bits)) { + case 8: + v.scatter(mem, indexes); + break; + case 7: + low = _bit_scan_forward(bits); + bits ^= 1 << low; + mem[indexes[low]] = v[low]; + case 6: + high = _bit_scan_reverse(bits); + mem[indexes[high]] = v[high]; + high = (1 << high); + case 5: + low = _bit_scan_forward(bits); + bits ^= high | (1 << low); + mem[indexes[low]] = v[low]; + case 4: + high = _bit_scan_reverse(bits); + mem[indexes[high]] = v[high]; + high = (1 << high); + case 3: + low = _bit_scan_forward(bits); + bits ^= high | (1 << low); + mem[indexes[low]] = v[low]; + case 2: + high = _bit_scan_reverse(bits); + mem[indexes[high]] = v[high]; + case 1: + low = _bit_scan_forward(bits); + mem[indexes[low]] = v[low]; + case 0: + break; + } +} +template +Vc_ALWAYS_INLINE void executeScatter(PopcntSwitchT, + V &v, + MT *mem, + const IT &indexes, + typename V::MaskArgument mask, + enable_if = nullarg) +{ + unsigned int bits = mask.toInt(); + unsigned int low, high = 0; + switch (Vc::Detail::popcnt4(bits)) { + case 4: + v.scatter(mem, indexes); + break; + case 3: + low = _bit_scan_forward(bits); + bits ^= 1 << low; + mem[indexes[low]] = v[low]; + case 2: + high = _bit_scan_reverse(bits); + mem[indexes[high]] = v[high]; + case 1: + low = _bit_scan_forward(bits); + mem[indexes[low]] = v[low]; + case 0: + break; + } +} +template +Vc_ALWAYS_INLINE void executeScatter(PopcntSwitchT, + V &v, + MT *mem, + const IT &indexes, + typename V::MaskArgument mask, + enable_if = nullarg) +{ + 
unsigned int bits = mask.toInt(); + unsigned int low; + switch (Vc::Detail::popcnt4(bits)) { + case 2: + v.scatter(mem, indexes); + break; + case 1: + low = _bit_scan_forward(bits); + mem[indexes[low]] = v[low]; + case 0: + break; + } +} + +} // namespace Common +} // namespace Vc + +#endif // VC_COMMON_SCATTERIMPLEMENTATION_H_ diff -Nru vc-0.7.4/common/scatterinterface.h vc-1.3.0/common/scatterinterface.h --- vc-0.7.4/common/scatterinterface.h 1969-12-31 18:00:00.000000000 -0600 +++ vc-1.3.0/common/scatterinterface.h 2016-10-27 02:05:02.000000000 -0500 @@ -0,0 +1,282 @@ +/* This file is part of the Vc library. {{{ +Copyright © 2014-2015 Matthias Kretz + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the names of contributing organizations nor the + names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY +DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +}}}*/ + +/////////////////////////////////////////////////////////////////////////////////////////// +// scatters +// A scatter takes the following arguments: +// 1. A pointer to memory of any type that EntryType can convert to. +// 2. An indexes “vector”. The requirement is that the type implements the subscript operator, +// stores «Size» valid index values, and each offset to the pointer above yields a valid +// memory location for reading. +// 3. Optionally the third argument may be a mask. The mask disables several memory stores and +// thus removes the requirements in (2.) for the disabled entries. + +private: + /**\internal + * This function implements a scatter given a pointer to memory \p mem and some + * container object storing the scatter \p indexes. + * + * \param mem This pointer must be aligned correctly for the type \p MT. This is the + * natural behavior of C++, so this is typically the case. + * \param indexes This object contains at least \VSize{T} indexes that denote the + * offset in \p mem where the components for the current vector should be copied to. + * The offset is not in Bytes, but in multiples of `sizeof(MT)`. 
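// ---------------------------------------------------------------------------
// [Example sketch, not part of the patch] Usage of the scatter interface
// documented above: a memory pointer, an index container, and an optional
// mask. It assumes the Vc 1.x public types (Vc::float_v and its Mask) and uses
// a plain std::array as index container, which satisfies the
// subscript-operator requirement stated in the static_asserts below.
#include <Vc/Vc>
#include <array>

void scatterExample(float *mem)  // mem must have room for the largest index used
{
    Vc::float_v v = 1.5f;
    std::array<int, Vc::float_v::Size> idx;
    for (size_t i = 0; i < idx.size(); ++i) {
        idx[i] = int(3 * i);                 // lane i goes to mem[3 * i]
    }
    v.scatter(mem, idx);                     // unmasked: all lanes are stored

    const Vc::float_v::Mask mask = v > 1.f;  // stores are suppressed where mask is false
    v.scatter(mem, idx, mask);
}
// ---------------------------------------------------------------------------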
+ */ + // enable_if::value && has_subscript_operator::value> + template + inline void scatterImplementation(MT *mem, IT &&indexes) const; + + /**\internal + * This overload of the above function adds a \p mask argument to disable memory + * accesses at the \p indexes offsets where \p mask is \c false. + */ + template + inline void scatterImplementation(MT *mem, IT &&indexes, MaskArgument mask) const; + +public: +#define Vc_ASSERT_SCATTER_PARAMETER_TYPES_ \ + static_assert( \ + std::is_convertible::value, \ + "The memory pointer needs to point to a type that the EntryType of this " \ + "SIMD vector type can be converted to."); \ + static_assert( \ + Vc::Traits::has_subscript_operator::value, \ + "The indexes argument must be a type that implements the subscript operator."); \ + static_assert( \ + !Traits::is_simd_vector::value || \ + Traits::simd_vector_size::value >= Size, \ + "If you use a SIMD vector for the indexes parameter, the index vector must " \ + "have at least as many entries as this SIMD vector."); \ + static_assert( \ + !std::is_array::value || \ + (std::rank::value == 1 && \ + (std::extent::value == 0 || std::extent::value >= Size)), \ + "If you use a simple array for the indexes parameter, the array must have " \ + "at least as many entries as this SIMD vector.") + + /** + * \name Scatter functions + * + * Stores a vector to the objects at `mem[indexes[0]]`, `mem[indexes[1]]`, + * `mem[indexes[2]]`, ... + * + * \param mem A pointer to memory which contains objects of type \p MT at the offsets + * given by \p indexes. + * \param indexes + * \param mask + */ + ///@{ + + /// Scatter function + template ::value>> + Vc_INTRINSIC void scatter(MT *mem, IT &&indexes) const + { + Vc_ASSERT_SCATTER_PARAMETER_TYPES_; + scatterImplementation(mem, std::forward(indexes)); + } + + /// Masked scatter function + template ::value>> + Vc_INTRINSIC void scatter(MT *mem, IT &&indexes, MaskArgument mask) const + { + Vc_ASSERT_SCATTER_PARAMETER_TYPES_; + scatterImplementation(mem, std::forward(indexes), mask); + } + ///@} + + /// \name Deprecated Members + ///@{ + + /** + * \deprecated Use Vc::array or Vc::vector subscripting instead. + * + * \param array A pointer into memory (without alignment restrictions). + * \param member1 If \p array points to a struct, \p member1 determines the member in the struct to + * be read. Thus the offsets in \p indexes are relative to the \p array and not to + * the size of the gathered type (i.e. array[i].*member1 is accessed instead of + * (&(array->*member1))[i]) + * \param indexes Determines the offsets into \p array where the values are gathered from/scattered + * to. The type of indexes can either be an integer vector or a type that supports + * operator[] access. + */ + template + Vc_DEPRECATED("use the subscript operator to Vc::array or Vc::vector " + "instead.") inline void scatter(S1 *array, EntryType S1::*member1, + IT indexes) const + { + scatter(Common::SubscriptOperation, true>( + array, indexes)[member1] + .scatterArguments()); + } + + /** + * \deprecated Use Vc::array or Vc::vector subscripting instead. + * + * \param array A pointer into memory (without alignment restrictions). + * \param member1 If \p array points to a struct, \p member1 determines the member in the struct to + * be read. Thus the offsets in \p indexes are relative to the \p array and not to + * the size of the gathered type (i.e. 
array[i].*member1 is accessed instead of + * (&(array->*member1))[i]) + * \param indexes Determines the offsets into \p array where the values are gathered from/scattered + * to. The type of indexes can either be an integer vector or a type that supports + * operator[] access. + * \param mask If a mask is given only the active entries will be gathered/scattered. + */ + template + Vc_DEPRECATED("use the subscript operator to Vc::array or Vc::vector " + "instead.") inline void scatter(S1 *array, EntryType S1::*member1, + IT indexes, MaskArgument mask) const + { + scatter(Common::SubscriptOperation, true>( + array, indexes)[member1] + .scatterArguments(), + mask); + } + + /** + * \deprecated Use Vc::array or Vc::vector subscripting instead. + * + * \param array A pointer into memory (without alignment restrictions). + * \param member1 If \p array points to a struct, \p member1 determines the member in the struct to + * be read. Thus the offsets in \p indexes are relative to the \p array and not to + * the size of the gathered type (i.e. array[i].*member1 is accessed instead of + * (&(array->*member1))[i]) + * \param member2 If \p member1 is a struct then \p member2 selects the member to be read from that + * struct (i.e. array[i].*member1.*member2 is read). + * \param indexes Determines the offsets into \p array where the values are gathered from/scattered + * to. The type of indexes can either be an integer vector or a type that supports + * operator[] access. + */ + template + Vc_DEPRECATED("use the subscript operator to Vc::array or Vc::vector " + "instead.") inline void scatter(S1 *array, S2 S1::*member1, + EntryType S2::*member2, + IT indexes) const + { + scatter(Common::SubscriptOperation, true>( + array, indexes)[member1][member2] + .scatterArguments()); + } + + /** + * \deprecated Use Vc::array or Vc::vector subscripting instead. + * + * \param array A pointer into memory (without alignment restrictions). + * \param member1 If \p array points to a struct, \p member1 determines the member in the struct to + * be read. Thus the offsets in \p indexes are relative to the \p array and not to + * the size of the gathered type (i.e. array[i].*member1 is accessed instead of + * (&(array->*member1))[i]) + * \param member2 If \p member1 is a struct then \p member2 selects the member to be read from that + * struct (i.e. array[i].*member1.*member2 is read). + * \param indexes Determines the offsets into \p array where the values are gathered from/scattered + * to. The type of indexes can either be an integer vector or a type that supports + * operator[] access. + * \param mask If a mask is given only the active entries will be gathered/scattered. + */ + template + Vc_DEPRECATED("use the subscript operator to Vc::array or Vc::vector " + "instead.") inline void scatter(S1 *array, S2 S1::*member1, + EntryType S2::*member2, IT indexes, + MaskArgument mask) const + { + scatter(Common::SubscriptOperation, true>( + array, indexes)[member1][member2] + .scatterArguments(), + mask); + } + + /** + * \deprecated Use Vc::array or Vc::vector subscripting instead. + * + * \param array A pointer into memory (without alignment restrictions). + * \param ptrMember1 If \p array points to a struct, \p member1 determines the member in the struct to + * be read. Thus the offsets in \p indexes are relative to the \p array and not to + * the size of the gathered type (i.e. 
array[i].*member1 is accessed instead of + * (&(array->*member1))[i]) + * \param outerIndexes + * \param innerIndexes + */ + template + Vc_DEPRECATED("use the subscript operator to Vc::array or Vc::vector " + "instead.") inline void scatter(S1 *array, EntryType *S1::*ptrMember1, + IT1 outerIndexes, + IT2 innerIndexes) const + { + scatter(Common::SubscriptOperation, true>( + array, outerIndexes)[ptrMember1][innerIndexes] + .scatterArguments()); + } + + /** + * \deprecated Use Vc::array or Vc::vector subscripting instead. + * + * \param array A pointer into memory (without alignment restrictions). + * \param ptrMember1 If \p array points to a struct, \p member1 determines the member in the struct to + * be read. Thus the offsets in \p indexes are relative to the \p array and not to + * the size of the gathered type (i.e. array[i].*member1 is accessed instead of + * (&(array->*member1))[i]) + * \param outerIndexes + * \param innerIndexes + * \param mask If a mask is given only the active entries will be gathered/scattered. + */ + template + Vc_DEPRECATED("use the subscript operator to Vc::array or Vc::vector " + "instead.") inline void scatter(S1 *array, EntryType *S1::*ptrMember1, + IT1 outerIndexes, IT2 innerIndexes, + MaskArgument mask) const + { + scatter(Common::SubscriptOperation, true>( + array, outerIndexes)[ptrMember1][innerIndexes] + .scatterArguments(), + mask); + } + ///@} + + /**\internal + * \name Scatter function to use from Vc::Common::subscript_operator + * + * \param args + * \param mask + */ + ///@{ + template + Vc_INTRINSIC void scatter(const Common::ScatterArguments &args) const + { + scatter(args.address, args.indexes); + } + + template + Vc_INTRINSIC void scatter(const Common::ScatterArguments &args, MaskArgument mask) const + { + scatter(args.address, args.indexes, mask); + } + ///@} +#undef Vc_ASSERT_SCATTER_PARAMETER_TYPES_ diff -Nru vc-0.7.4/common/set.h vc-1.3.0/common/set.h --- vc-0.7.4/common/set.h 1969-12-31 18:00:00.000000000 -0600 +++ vc-1.3.0/common/set.h 2016-10-27 02:05:02.000000000 -0500 @@ -0,0 +1,92 @@ +/* This file is part of the Vc library. {{{ +Copyright © 2013-2015 Matthias Kretz + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the names of contributing organizations nor the + names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. 
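// ---------------------------------------------------------------------------
// [Example sketch, not part of the patch] The deprecation messages above point
// to subscripting Vc::array / Vc::vector instead of the member-pointer scatter
// overloads. A minimal sketch of that replacement, assuming the Vc 1.x
// container adaptors and that <Vc/vector> provides Vc::vector:
#include <Vc/Vc>
#include <Vc/vector>

void subscriptScatter()
{
    Vc::vector<float> data(64, 0.f);
    Vc::float_v v = 1.f;
    const auto idx = Vc::float_v::IndexType::IndexesFromZero() * 2;  // 0, 2, 4, ...
    data[idx] = v;  // scatters through the ScatterArguments overload declared above
}
// ---------------------------------------------------------------------------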
IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY +DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +}}}*/ + +#ifndef VC_COMMON_SET_H_ +#define VC_COMMON_SET_H_ + +#include "macros.h" +namespace Vc_VERSIONED_NAMESPACE +{ +namespace +{ + static Vc_INTRINSIC Vc_CONST __m128i set(unsigned short x0, unsigned short x1, unsigned short x2, unsigned short x3, + unsigned short x4, unsigned short x5, unsigned short x6, unsigned short x7) + { +#if defined(Vc_GNU_ASM) +#if 0 // defined(__x86_64__) + // it appears that the 32bit variant is always faster + __m128i r; + unsigned long long tmp0 = x3; tmp0 = (tmp0 << 16) | x2; + unsigned long long tmp1 = x1; tmp1 = (tmp1 << 16) | x0; + asm("vmovq %1,%0" : "=x"(r) : "r"((tmp0 << 32) | tmp1)); + unsigned long long tmp2 = x7; tmp2 = (tmp2 << 16) | x6; + unsigned long long tmp3 = x5; tmp3 = (tmp3 << 16) | x4; + asm("vpinsrq $1,%1,%0,%0" : "+x"(r) : "r"((tmp2 << 32) | tmp3)); + return r; +#elif defined(Vc_USE_VEX_CODING) + __m128i r0, r1; + unsigned int tmp0 = x1; tmp0 = (tmp0 << 16) | x0; + unsigned int tmp1 = x3; tmp1 = (tmp1 << 16) | x2; + unsigned int tmp2 = x5; tmp2 = (tmp2 << 16) | x4; + unsigned int tmp3 = x7; tmp3 = (tmp3 << 16) | x6; + asm("vmovd %1,%0" : "=x"(r0) : "r"(tmp0)); + asm("vpinsrd $1,%1,%0,%0" : "+x"(r0) : "r"(tmp1)); + asm("vmovd %1,%0" : "=x"(r1) : "r"(tmp2)); + asm("vpinsrd $1,%1,%0,%0" : "+x"(r1) : "r"(tmp3)); + asm("vpunpcklqdq %1,%0,%0" : "+x"(r0) : "x"(r1)); + return r0; +#else + __m128i r0, r1; + unsigned int tmp0 = x1; tmp0 = (tmp0 << 16) | x0; + unsigned int tmp1 = x3; tmp1 = (tmp1 << 16) | x2; + unsigned int tmp2 = x5; tmp2 = (tmp2 << 16) | x4; + unsigned int tmp3 = x7; tmp3 = (tmp3 << 16) | x6; + asm("movd %1,%0" : "=x"(r0) : "r"(tmp0)); + asm("pinsrd $1,%1,%0" : "+x"(r0) : "r"(tmp1)); + asm("movd %1,%0" : "=x"(r1) : "r"(tmp2)); + asm("pinsrd $1,%1,%0" : "+x"(r1) : "r"(tmp3)); + asm("punpcklqdq %1,%0" : "+x"(r0) : "x"(r1)); + return r0; +#endif +#else + unsigned int tmp0 = x1; tmp0 = (tmp0 << 16) | x0; + unsigned int tmp1 = x3; tmp1 = (tmp1 << 16) | x2; + unsigned int tmp2 = x5; tmp2 = (tmp2 << 16) | x4; + unsigned int tmp3 = x7; tmp3 = (tmp3 << 16) | x6; + return _mm_setr_epi32(tmp0, tmp1, tmp2, tmp3); +#endif + } + static Vc_INTRINSIC Vc_CONST __m128i set(short x0, short x1, short x2, short x3, short x4, short x5, short x6, short x7) + { + return set(static_cast(x0), static_cast(x1), static_cast(x2), + static_cast(x3), static_cast(x4), static_cast(x5), + static_cast(x6), static_cast(x7)); + } +} // anonymous namespace +} // namespace Vc + +#endif // VC_COMMON_SET_H_ diff -Nru vc-0.7.4/common/simdarrayfwd.h vc-1.3.0/common/simdarrayfwd.h --- vc-0.7.4/common/simdarrayfwd.h 1969-12-31 18:00:00.000000000 -0600 +++ vc-1.3.0/common/simdarrayfwd.h 2016-10-27 02:05:02.000000000 -0500 @@ -0,0 +1,172 @@ +/* This file is part of the Vc library. 
{{{ +Copyright © 2014-2015 Matthias Kretz + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the names of contributing organizations nor the + names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY +DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +}}}*/ + +#ifndef VC_COMMON_SIMDARRAYFWD_H_ +#define VC_COMMON_SIMDARRAYFWD_H_ + +#include "../scalar/types.h" +#include "../sse/types.h" +#include "../avx/types.h" +#include "../mic/types.h" + +#include "utility.h" +#include "macros.h" + +namespace Vc_VERSIONED_NAMESPACE +{ +namespace Common +{ +/// \addtogroup SimdArray +/// @{ +/*select_best_vector_type{{{*/ +/** + * \internal + * Selects the best SIMD type out of a typelist to store N scalar values. + */ +template struct select_best_vector_type_impl; + +template struct select_best_vector_type_impl +{ + using type = T; +}; +template struct select_best_vector_type_impl +{ + using type = typename std::conditional< + (N < T::Size), typename select_best_vector_type_impl::type, + T>::type; +}; +template +using select_best_vector_type = + typename select_best_vector_type_impl, + Vc::SSE::Vector, + Vc::Scalar::Vector +#elif defined(Vc_IMPL_AVX) + Vc::AVX::Vector, + Vc::SSE::Vector, + Vc::Scalar::Vector +#elif defined(Vc_IMPL_Scalar) + Vc::Scalar::Vector +#elif defined(Vc_IMPL_SSE) + Vc::SSE::Vector, + Vc::Scalar::Vector +#elif defined(Vc_IMPL_MIC) + Vc::MIC::Vector, + Vc::Scalar::Vector +#endif + >::type; +//}}} +/// @} +} // namespace Common + +// === having SimdArray in the Vc namespace leads to a ABI bug === +// +// SimdArray can be { double[4] }, { __m128d[2] }, or { __m256d } even though the type +// is the same. +// The question is, what should SimdArray focus on? +// a) A type that makes interfacing between different implementations possible? +// b) Or a type that makes fixed size SIMD easier and efficient? +// +// a) can be achieved by using a union with T[N] as one member. But this may have more serious +// performance implications than only less efficient parameter passing (because compilers have a +// much harder time wrt. aliasing issues). Also alignment would need to be set to the sizeof in +// order to be compatible with targets with larger alignment requirements. +// But, the in-memory representation of masks is not portable. 
Thus, at the latest with AVX-512, +// there would be a problem with requiring SimdMaskArray to be an ABI compatible type. +// AVX-512 uses one bit per boolean, whereas SSE/AVX use sizeof(T) Bytes per boolean. Conversion +// between the two representations is not a trivial operation. Therefore choosing one or the other +// representation will have a considerable impact for the targets that do not use this +// representation. Since the future probably belongs to one bit per boolean representation, I would +// go with that choice. +// +// b) requires that SimdArray != SimdArray if +// SimdArray::vector_type != SimdArray::vector_type +// +// Therefore use SimdArray, where V follows from the above. +template , + size_t Wt = V::Size // this last parameter is only used for specialization of N + // == VectorSize + > +class SimdArray; + +template , + size_t Wt = V::Size // this last parameter is only used for specialization of N + // == VectorSize + > +class SimdMaskArray; + +/** \internal + * Simple traits for SimdArray to easily access internal types of non-atomic SimdArray + * types. + */ +template struct SimdArrayTraits { + static constexpr std::size_t N0 = Common::left_size(); + static constexpr std::size_t N1 = Common::right_size(); + + using storage_type0 = SimdArray; + using storage_type1 = SimdArray; +}; + +template +Vc_INTRINSIC_L typename SimdArrayTraits::storage_type0 &internal_data0( + SimdArray &x) Vc_INTRINSIC_R; +template +Vc_INTRINSIC_L typename SimdArrayTraits::storage_type1 &internal_data1( + SimdArray &x) Vc_INTRINSIC_R; +template +Vc_INTRINSIC_L const typename SimdArrayTraits::storage_type0 &internal_data0( + const SimdArray &x) Vc_INTRINSIC_R; +template +Vc_INTRINSIC_L const typename SimdArrayTraits::storage_type1 &internal_data1( + const SimdArray &x) Vc_INTRINSIC_R; + +template +Vc_INTRINSIC_L V &internal_data(SimdArray &x) Vc_INTRINSIC_R; +template +Vc_INTRINSIC_L const V &internal_data(const SimdArray &x) Vc_INTRINSIC_R; + +namespace Traits +{ +template struct is_atomic_simdarray_internal> : public std::true_type {}; +template struct is_atomic_simd_mask_array_internal> : public std::true_type {}; + +template struct is_simdarray_internal> : public std::true_type {}; +template struct is_simd_mask_array_internal> : public std::true_type {}; +template struct is_integral_internal , false> : public std::is_integral {}; +template struct is_floating_point_internal, false> : public std::is_floating_point {}; +template struct is_signed_internal , false> : public std::is_signed {}; +template struct is_unsigned_internal , false> : public std::is_unsigned {}; + +template struct has_no_allocated_data_impl> : public std::true_type {}; +} // namespace Traits + +} // namespace Vc + +#endif // VC_COMMON_SIMDARRAYFWD_H_ + +// vim: foldmethod=marker diff -Nru vc-0.7.4/common/simdarray.h vc-1.3.0/common/simdarray.h --- vc-0.7.4/common/simdarray.h 1969-12-31 18:00:00.000000000 -0600 +++ vc-1.3.0/common/simdarray.h 2016-10-27 02:05:02.000000000 -0500 @@ -0,0 +1,2608 @@ +/* This file is part of the Vc library. {{{ +Copyright © 2013-2015 Matthias Kretz + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. 
+ * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the names of contributing organizations nor the + names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY +DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +}}}*/ + +#ifndef VC_COMMON_SIMDARRAY_H_ +#define VC_COMMON_SIMDARRAY_H_ + +//#define Vc_DEBUG_SIMD_CAST 1 +//#define Vc_DEBUG_SORTED 1 +#if defined Vc_DEBUG_SIMD_CAST || defined Vc_DEBUG_SORTED +#include +#endif + +#include + +#include "writemaskedvector.h" +#include "simdarrayhelper.h" +#include "simdmaskarray.h" +#include "utility.h" +#include "interleave.h" +#include "indexsequence.h" +#include "transpose.h" +#include "macros.h" + +namespace Vc_VERSIONED_NAMESPACE +{ +// internal namespace (product & sum helper) {{{1 +namespace internal +{ +template T Vc_INTRINSIC Vc_PURE product_helper_(const T &l, const T &r) { return l * r; } +template T Vc_INTRINSIC Vc_PURE sum_helper_(const T &l, const T &r) { return l + r; } +} // namespace internal + +// min & max declarations {{{1 +template +inline SimdArray min(const SimdArray &x, + const SimdArray &y); +template +inline SimdArray max(const SimdArray &x, + const SimdArray &y); + +// SimdArray class {{{1 +/// \addtogroup SimdArray +/// @{ + +// atomic SimdArray {{{1 +#define Vc_CURRENT_CLASS_NAME SimdArray +/**\internal + * Specialization of `SimdArray` for the case where `N == + * VectorSize`. + * + * This is specialized for implementation purposes: Since the general implementation uses + * two SimdArray data members it recurses over different SimdArray instantiations. The + * recursion is ended by this specialization, which has a single \p VectorType_ data + * member to which all functions are forwarded more or less directly. 
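+ *
+ * For example, assuming an SSE target (where float_v::Size == 4): SimdArray<float, 8>
+ * uses the generic, recursing class template and holds two SimdArray<float, 4> halves,
+ * whereas SimdArray<float, 4> matches this specialization (N == VectorSize) and wraps a
+ * single SSE::float_v directly.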
+ */ +template +class SimdArray +{ + static_assert(std::is_same::value || std::is_same::value || + std::is_same::value || + std::is_same::value || + std::is_same::value || + std::is_same::value, + "SimdArray may only be used with T = { double, float, int32_t, uint32_t, " + "int16_t, uint16_t }"); + +public: + using VectorType = VectorType_; + using vector_type = VectorType; + using storage_type = vector_type; + using vectorentry_type = typename vector_type::VectorEntryType; + using value_type = T; + using mask_type = SimdMaskArray; + using index_type = SimdArray; + static constexpr std::size_t size() { return N; } + using Mask = mask_type; + using MaskType = Mask; + using MaskArgument = const MaskType &; + using VectorEntryType = vectorentry_type; + using EntryType = value_type; + using IndexType = index_type; + using AsArg = const SimdArray &; + using reference = Detail::ElementReference; + static constexpr std::size_t Size = size(); + static constexpr std::size_t MemoryAlignment = storage_type::MemoryAlignment; + + // zero init +#ifndef Vc_MSVC // bogus error C2580 + Vc_INTRINSIC SimdArray() = default; +#endif + + // default copy ctor/operator + Vc_INTRINSIC SimdArray(const SimdArray &) = default; + Vc_INTRINSIC SimdArray(SimdArray &&) = default; + Vc_INTRINSIC SimdArray &operator=(const SimdArray &) = default; + + // broadcast + Vc_INTRINSIC SimdArray(const value_type &a) : data(a) {} + Vc_INTRINSIC SimdArray(value_type &a) : data(a) {} + Vc_INTRINSIC SimdArray(value_type &&a) : data(a) {} + template < + typename U, + typename = enable_if::value && !std::is_same::value>> + Vc_INTRINSIC SimdArray(U a) + : SimdArray(static_cast(a)) + { + } + + // implicit casts + template + Vc_INTRINSIC SimdArray(const SimdArray &x, enable_if = nullarg) + : data(simd_cast(internal_data(x))) + { + } + template + Vc_INTRINSIC SimdArray(const SimdArray &x, + enable_if<(N > V::Size && N <= 2 * V::Size)> = nullarg) + : data(simd_cast(internal_data(internal_data0(x)), internal_data(internal_data1(x)))) + { + } + template + Vc_INTRINSIC SimdArray(const SimdArray &x, + enable_if<(N > 2 * V::Size && N <= 4 * V::Size)> = nullarg) + : data(simd_cast(internal_data(internal_data0(internal_data0(x))), + internal_data(internal_data1(internal_data0(x))), + internal_data(internal_data0(internal_data1(x))), + internal_data(internal_data1(internal_data1(x))))) + { + } + + template + Vc_INTRINSIC SimdArray(Common::Segment &&x) + : data(simd_cast(x.data)) + { + } + + Vc_INTRINSIC SimdArray(const std::initializer_list &init) + : data(init.begin(), Vc::Unaligned) + { +#if defined Vc_CXX14 && 0 // doesn't compile yet + static_assert(init.size() == size(), "The initializer_list argument to " + "SimdArray must contain exactly N " + "values."); +#else + Vc_ASSERT(init.size() == size()); +#endif + } + + // implicit conversion from underlying vector_type + template < + typename V, + typename = enable_if::value && !Traits::isSimdArray::value>> + explicit Vc_INTRINSIC SimdArray(const V &x) + : data(simd_cast(x)) + { + } + + // implicit conversion to Vector for if Vector::size() == N and + // T implicitly convertible to U + template < + typename U, typename A, + typename = enable_if::value && Vector::Size == N>> + Vc_INTRINSIC operator Vector() const + { + return simd_cast>(data); + } + +#include "gatherinterface.h" +#include "scatterinterface.h" + + // forward all remaining ctors + template ::value && + !Traits::is_gather_signature::value && + !Traits::is_initializer_list::value>> + explicit Vc_INTRINSIC SimdArray(Args &&... 
args) + : data(std::forward(args)...) + { + } + + template + explicit Vc_INTRINSIC SimdArray( + Common::AddOffset) + : data(Vc::IndexesFromZero) + { + data += value_type(Offset); + } + + Vc_INTRINSIC void setZero() { data.setZero(); } + Vc_INTRINSIC void setZero(mask_type k) { data.setZero(internal_data(k)); } + Vc_INTRINSIC void setZeroInverted() { data.setZeroInverted(); } + Vc_INTRINSIC void setZeroInverted(mask_type k) { data.setZeroInverted(internal_data(k)); } + + Vc_INTRINSIC void setQnan() { data.setQnan(); } + Vc_INTRINSIC void setQnan(mask_type m) { data.setQnan(internal_data(m)); } + + // internal: execute specified Operation + template + static Vc_INTRINSIC SimdArray fromOperation(Op op, Args &&... args) + { + SimdArray r; + Common::unpackArgumentsAuto(op, r.data, std::forward(args)...); + return r; + } + + template + static Vc_INTRINSIC void callOperation(Op op, Args &&... args) + { + Common::unpackArgumentsAuto(op, nullptr, std::forward(args)...); + } + + static Vc_INTRINSIC SimdArray Zero() + { + return SimdArray(Vc::Zero); + } + static Vc_INTRINSIC SimdArray One() + { + return SimdArray(Vc::One); + } + static Vc_INTRINSIC SimdArray IndexesFromZero() + { + return SimdArray(Vc::IndexesFromZero); + } + static Vc_INTRINSIC SimdArray Random() + { + return fromOperation(Common::Operations::random()); + } + + template Vc_INTRINSIC void load(Args &&... args) + { + data.load(std::forward(args)...); + } + + template Vc_INTRINSIC void store(Args &&... args) const + { + data.store(std::forward(args)...); + } + + Vc_INTRINSIC mask_type operator!() const + { + return {!data}; + } + + Vc_INTRINSIC SimdArray operator-() const + { + return {-data}; + } + + /// Returns a copy of itself + Vc_INTRINSIC SimdArray operator+() const { return *this; } + + Vc_INTRINSIC SimdArray operator~() const + { + return {~data}; + } + + template ::value && std::is_integral::value>> + Vc_INTRINSIC Vc_CONST SimdArray operator<<(U x) const + { + return {data << x}; + } + template ::value && std::is_integral::value>> + Vc_INTRINSIC SimdArray &operator<<=(U x) + { + data <<= x; + return *this; + } + template ::value && std::is_integral::value>> + Vc_INTRINSIC Vc_CONST SimdArray operator>>(U x) const + { + return {data >> x}; + } + template ::value && std::is_integral::value>> + Vc_INTRINSIC SimdArray &operator>>=(U x) + { + data >>= x; + return *this; + } + +#define Vc_BINARY_OPERATOR_(op) \ + Vc_INTRINSIC Vc_CONST SimdArray operator op(const SimdArray &rhs) const \ + { \ + return {data op rhs.data}; \ + } \ + Vc_INTRINSIC SimdArray &operator op##=(const SimdArray &rhs) \ + { \ + data op## = rhs.data; \ + return *this; \ + } + Vc_ALL_ARITHMETICS(Vc_BINARY_OPERATOR_); + Vc_ALL_BINARY(Vc_BINARY_OPERATOR_); + Vc_ALL_SHIFTS(Vc_BINARY_OPERATOR_); +#undef Vc_BINARY_OPERATOR_ + +#define Vc_COMPARES(op) \ + Vc_INTRINSIC mask_type operator op(const SimdArray &rhs) const \ + { \ + return {data op rhs.data}; \ + } + Vc_ALL_COMPARES(Vc_COMPARES); +#undef Vc_COMPARES + + /// \copydoc Vector::isNegative + Vc_DEPRECATED("use isnegative(x) instead") Vc_INTRINSIC MaskType isNegative() const + { + return {isnegative(data)}; + } + +private: + friend reference; + Vc_INTRINSIC static value_type get(const SimdArray &o, int i) noexcept + { + return o.data[i]; + } + template + Vc_INTRINSIC static void set(SimdArray &o, int i, U &&v) noexcept( + noexcept(std::declval() = v)) + { + o.data[i] = v; + } + +public: + Vc_INTRINSIC reference operator[](size_t i) noexcept + { + static_assert(noexcept(reference{std::declval(), int()}), ""); + 
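+        // The returned ElementReference is a proxy object rather than a real lvalue
+        // reference into the SIMD register: reads go through get() and writes through
+        // set() above.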
return {*this, int(i)}; + } + Vc_INTRINSIC value_type operator[](size_t i) const noexcept + { + return get(*this, int(i)); + } + + Vc_INTRINSIC Common::WriteMaskedVector operator()(const mask_type &k) + { + return {*this, k}; + } + + Vc_INTRINSIC void assign(const SimdArray &v, const mask_type &k) + { + data.assign(v.data, internal_data(k)); + } + + // reductions //////////////////////////////////////////////////////// +#define Vc_REDUCTION_FUNCTION_(name_) \ + Vc_INTRINSIC Vc_PURE value_type name_() const { return data.name_(); } \ + Vc_INTRINSIC Vc_PURE value_type name_(mask_type mask) const \ + { \ + return data.name_(internal_data(mask)); \ + } \ + Vc_NOTHING_EXPECTING_SEMICOLON + Vc_REDUCTION_FUNCTION_(min); + Vc_REDUCTION_FUNCTION_(max); + Vc_REDUCTION_FUNCTION_(product); + Vc_REDUCTION_FUNCTION_(sum); +#undef Vc_REDUCTION_FUNCTION_ + Vc_INTRINSIC Vc_PURE SimdArray partialSum() const { return data.partialSum(); } + + template Vc_INTRINSIC SimdArray apply(F &&f) const + { + return {data.apply(std::forward(f))}; + } + template Vc_INTRINSIC SimdArray apply(F &&f, const mask_type &k) const + { + return {data.apply(std::forward(f), k)}; + } + + Vc_INTRINSIC SimdArray shifted(int amount) const + { + return {data.shifted(amount)}; + } + + template + Vc_INTRINSIC SimdArray shifted(int amount, const SimdArray &shiftIn) + const + { + return {data.shifted(amount, simd_cast(shiftIn))}; + } + + Vc_INTRINSIC SimdArray rotated(int amount) const + { + return {data.rotated(amount)}; + } + + /// \copydoc Vector::exponent + Vc_DEPRECATED("use exponent(x) instead") Vc_INTRINSIC SimdArray exponent() const + { + return {exponent(data)}; + } + + Vc_INTRINSIC SimdArray interleaveLow(SimdArray x) const + { + return {data.interleaveLow(x.data)}; + } + Vc_INTRINSIC SimdArray interleaveHigh(SimdArray x) const + { + return {data.interleaveHigh(x.data)}; + } + + Vc_INTRINSIC SimdArray reversed() const + { + return {data.reversed()}; + } + + Vc_INTRINSIC SimdArray sorted() const + { + return {data.sorted()}; + } + + template static Vc_INTRINSIC SimdArray generate(const G &gen) + { + return {VectorType::generate(gen)}; + } + + Vc_DEPRECATED("use copysign(x, y) instead") Vc_INTRINSIC SimdArray + copySign(const SimdArray &reference) const + { + return {Vc::copysign(data, reference.data)}; + } + + friend VectorType &internal_data<>(SimdArray &x); + friend const VectorType &internal_data<>(const SimdArray &x); + + /// \internal + Vc_INTRINSIC SimdArray(VectorType &&x) : data(std::move(x)) {} + + Vc_FREE_STORE_OPERATORS_ALIGNED(alignof(storage_type)); + +private: + // The alignas attribute attached to the class declaration above is ignored by ICC + // 17.0.0 (at least). So just move the alignas attribute down here where it works for + // all compilers. 
+ alignas(static_cast( + Common::BoundedAlignment::value * sizeof(VectorType_) / + VectorType_::size()>::value)) storage_type data; +}; +template constexpr std::size_t SimdArray::Size; +template +constexpr std::size_t SimdArray::MemoryAlignment; +template +#ifndef Vc_MSVC +Vc_INTRINSIC +#endif +VectorType &internal_data(SimdArray &x) +{ + return x.data; +} +template +#ifndef Vc_MSVC +Vc_INTRINSIC +#endif +const VectorType &internal_data(const SimdArray &x) +{ + return x.data; +} + +// unpackIfSegment {{{2 +template T unpackIfSegment(T &&x) { return std::forward(x); } +template +auto unpackIfSegment(Common::Segment &&x) -> decltype(x.asSimdArray()) +{ + return x.asSimdArray(); +} + +// gatherImplementation {{{2 +template +template +inline void SimdArray::gatherImplementation(const MT *mem, + IT &&indexes) +{ + data.gather(mem, unpackIfSegment(std::forward(indexes))); +} +template +template +inline void SimdArray::gatherImplementation(const MT *mem, + IT &&indexes, + MaskArgument mask) +{ + data.gather(mem, unpackIfSegment(std::forward(indexes)), mask); +} + +// scatterImplementation {{{2 +template +template +inline void SimdArray::scatterImplementation(MT *mem, + IT &&indexes) const +{ + data.scatter(mem, unpackIfSegment(std::forward(indexes))); +} +template +template +inline void SimdArray::scatterImplementation(MT *mem, + IT &&indexes, + MaskArgument mask) const +{ + data.scatter(mem, unpackIfSegment(std::forward(indexes)), mask); +} + +// generic SimdArray {{{1 +/** + * Data-parallel arithmetic type with user-defined number of elements. + * + * \tparam T The type of the vector's elements. The supported types currently are limited + * to the types supported by Vc::Vector. + * + * \tparam N The number of elements to store and process concurrently. You can choose an + * arbitrary number, though not every number is a good idea. + * Generally, a power of two value or the sum of two power of two values might + * work efficiently, though this depends a lot on the target system. + * + * \tparam V Don't change the default value unless you really know what you are doing. + * This type is set to the underlying native Vc::Vector type used in the + * implementation of the type. + * Having it as part of the type name guards against some cases of ODR + * violations (i.e. linking incompatible translation units / libraries). + * + * \tparam Wt Don't ever change the default value. + * This parameter is an unfortunate implementation detail shining through. + * + * \warning Choosing \p N too large (what “too large” means depends on the target) will + * result in excessive compilation times and high (or too high) register + * pressure, thus potentially negating the improvement from concurrent execution. + * As a rule of thumb, keep \p N less or equal to `2 * float_v::size()`. + * + * \warning A special portability concern arises from a current limitation in the MIC + * implementation (Intel Knights Corner), where SimdArray types with \p T = \p + * (u)short require an \p N either less than short_v::size() or a multiple of + * short_v::size(). 
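+ *
+ * A minimal usage sketch (illustrative only; it relies on nothing beyond the interface
+ * declared in this header):
+ * \code
+ * Vc::SimdArray<float, 8> x = Vc::SimdArray<float, 8>::IndexesFromZero();
+ * x = Vc::sqrt(x * 2.f + 1.f);  // component-wise, independent of the native SIMD width
+ * float f0 = x[0];              // scalar subscript access
+ * \endcode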
+ * + * \headerfile simdarray.h + */ +template class SimdArray +{ + static_assert(std::is_same::value || + std::is_same::value || + std::is_same::value || + std::is_same::value || + std::is_same::value || + std::is_same::value, "SimdArray may only be used with T = { double, float, int32_t, uint32_t, int16_t, uint16_t }"); + static_assert( + // either the EntryType and VectorEntryType of the main V are equal + std::is_same::value || + // or N is a multiple of V::size() + (N % V::size() == 0), + "SimdArray<(un)signed short, N> on MIC only works correctly for N = k * " + "MIC::(u)short_v::size(), i.e. k * 16."); + + using my_traits = SimdArrayTraits; + static constexpr std::size_t N0 = my_traits::N0; + static constexpr std::size_t N1 = my_traits::N1; + using Split = Common::Split; + template using CArray = U[K]; + +public: + using storage_type0 = typename my_traits::storage_type0; + using storage_type1 = typename my_traits::storage_type1; + static_assert(storage_type0::size() == N0, ""); + + /**\internal + * This type reveals the implementation-specific type used for the data member. + */ + using vector_type = V; + using vectorentry_type = typename storage_type0::vectorentry_type; + typedef vectorentry_type alias_type Vc_MAY_ALIAS; + + /// The type of the elements (i.e.\ \p T) + using value_type = T; + + /// The type of the mask used for masked operations and returned from comparisons. + using mask_type = SimdMaskArray; + + /// The type of the vector used for indexes in gather and scatter operations. + using index_type = SimdArray; + + /** + * Returns \p N, the number of scalar components in an object of this type. + * + * The size of the SimdArray, i.e. the number of scalar elements in the vector. In + * contrast to Vector::size() you have control over this value via the \p N template + * parameter of the SimdArray class template. + * + * \returns The number of scalar values stored and manipulated concurrently by objects + * of this type. + */ + static constexpr std::size_t size() { return N; } + + /// \copydoc mask_type + using Mask = mask_type; + /// \copydoc mask_type + using MaskType = Mask; + using MaskArgument = const MaskType &; + using VectorEntryType = vectorentry_type; + /// \copydoc value_type + using EntryType = value_type; + /// \copydoc index_type + using IndexType = index_type; + using AsArg = const SimdArray &; + + using reference = Detail::ElementReference; + + ///\copydoc Vector::MemoryAlignment + static constexpr std::size_t MemoryAlignment = + storage_type0::MemoryAlignment > storage_type1::MemoryAlignment + ? storage_type0::MemoryAlignment + : storage_type1::MemoryAlignment; + + /// \name Generators + ///@{ + + ///\copybrief Vector::Zero + static Vc_INTRINSIC SimdArray Zero() + { + return SimdArray(Vc::Zero); + } + + ///\copybrief Vector::One + static Vc_INTRINSIC SimdArray One() + { + return SimdArray(Vc::One); + } + + ///\copybrief Vector::IndexesFromZero + static Vc_INTRINSIC SimdArray IndexesFromZero() + { + return SimdArray(Vc::IndexesFromZero); + } + + ///\copydoc Vector::Random + static Vc_INTRINSIC SimdArray Random() + { + return fromOperation(Common::Operations::random()); + } + + ///\copybrief Vector::generate + template static Vc_INTRINSIC SimdArray generate(const G &gen) // {{{2 + { + auto tmp = storage_type0::generate(gen); // GCC bug: the order of evaluation in + // an initializer list is well-defined + // (front to back), but GCC 4.8 doesn't + // implement this correctly. Therefore + // we enforce correct order. 
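+        // Without the named temporary, a GCC 4.8 build could evaluate
+        // storage_type1::generate(...) before storage_type0::generate(gen), so a
+        // stateful generator would see the element indexes in the wrong order.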
+ return {std::move(tmp), + storage_type1::generate([&](std::size_t i) { return gen(i + N0); })}; + } + ///@} + + /// \name Compile-Time Constant Initialization + ///@{ + + ///\copydoc Vector::Vector() +#ifndef Vc_MSVC // bogus error C2580 + SimdArray() = default; +#endif + ///@} + + /// \name Conversion/Broadcast Constructors + ///@{ + + ///\copydoc Vector::Vector(EntryType) + Vc_INTRINSIC SimdArray(value_type a) : data0(a), data1(a) {} + template < + typename U, + typename = enable_if::value && !std::is_same::value>> + SimdArray(U a) + : SimdArray(static_cast(a)) + { + } + ///@} + + // default copy ctor/operator + SimdArray(const SimdArray &) = default; + SimdArray(SimdArray &&) = default; + SimdArray &operator=(const SimdArray &) = default; + + // load ctor + template ::value>> + explicit Vc_INTRINSIC SimdArray(const U *mem, Flags f = Flags()) + : data0(mem, f), data1(mem + storage_type0::size(), f) + { + } + +// MSVC does overload resolution differently and takes the const U *mem overload (I hope) +#ifndef Vc_MSVC + /**\internal + * Load from a C-array. This is basically the same function as the load constructor + * above, except that the forwarding reference overload would steal the deal and the + * constructor above doesn't get called. This overload is required to enable loads + * from C-arrays. + */ + template ::value>> + explicit Vc_INTRINSIC SimdArray(CArray &mem, Flags f = Flags()) + : data0(&mem[0], f), data1(&mem[storage_type0::size()], f) + { + } + /**\internal + * Const overload of the above. + */ + template ::value>> + explicit Vc_INTRINSIC SimdArray(const CArray &mem, Flags f = Flags()) + : data0(&mem[0], f), data1(&mem[storage_type0::size()], f) + { + } +#endif + + // initializer list + Vc_INTRINSIC SimdArray(const std::initializer_list &init) + : data0(init.begin(), Vc::Unaligned) + , data1(init.begin() + storage_type0::size(), Vc::Unaligned) + { +#if defined Vc_CXX14 && 0 // doesn't compile yet + static_assert(init.size() == size(), "The initializer_list argument to " + "SimdArray must contain exactly N " + "values."); +#else + Vc_ASSERT(init.size() == size()); +#endif + } + +#include "gatherinterface.h" +#include "scatterinterface.h" + + // forward all remaining ctors + template ::value && + !Traits::is_initializer_list::value && + !Traits::is_gather_signature::value && + !Traits::is_load_arguments::value>> + explicit Vc_INTRINSIC SimdArray(Args &&... args) + : data0(Split::lo(args)...) // no forward here - it could move and thus + // break the next line + , data1(Split::hi(std::forward(args))...) 
+ { + } + + // explicit casts + template + Vc_INTRINSIC explicit SimdArray( + W &&x, + enable_if<(Traits::is_simd_vector::value && Traits::simd_vector_size::value == N && + !(std::is_convertible, T>::value && + Traits::isSimdArray::value))> = nullarg) + : data0(Split::lo(x)), data1(Split::hi(x)) + { + } + + // implicit casts + template + Vc_INTRINSIC SimdArray( + W &&x, + enable_if<(Traits::isSimdArray::value && Traits::simd_vector_size::value == N && + std::is_convertible, T>::value)> = nullarg) + : data0(Split::lo(x)), data1(Split::hi(x)) + { + } + + // implicit conversion to Vector for if Vector::size() == N and + // T implicitly convertible to U + template < + typename U, typename A, + typename = enable_if::value && Vector::Size == N>> + operator Vector() const + { + return simd_cast>(data0, data1); + } + + //////////////////// other functions /////////////// + + Vc_INTRINSIC void setZero() + { + data0.setZero(); + data1.setZero(); + } + Vc_INTRINSIC void setZero(const mask_type &k) + { + data0.setZero(Split::lo(k)); + data1.setZero(Split::hi(k)); + } + Vc_INTRINSIC void setZeroInverted() + { + data0.setZeroInverted(); + data1.setZeroInverted(); + } + Vc_INTRINSIC void setZeroInverted(const mask_type &k) + { + data0.setZeroInverted(Split::lo(k)); + data1.setZeroInverted(Split::hi(k)); + } + + + Vc_INTRINSIC void setQnan() { + data0.setQnan(); + data1.setQnan(); + } + Vc_INTRINSIC void setQnan(const mask_type &m) { + data0.setQnan(Split::lo(m)); + data1.setQnan(Split::hi(m)); + } + + ///\internal execute specified Operation + template + static Vc_INTRINSIC SimdArray fromOperation(Op op, Args &&... args) + { + SimdArray r = { + storage_type0::fromOperation(op, Split::lo(args)...), // no forward here - it + // could move and thus + // break the next line + storage_type1::fromOperation(op, Split::hi(std::forward(args))...)}; + return r; + } + + ///\internal + template + static Vc_INTRINSIC void callOperation(Op op, Args &&... args) + { + storage_type0::callOperation(op, Split::lo(args)...); + storage_type1::callOperation(op, Split::hi(std::forward(args))...); + } + + + template Vc_INTRINSIC void load(const U *mem, Args &&... args) + { + data0.load(mem, Split::lo(args)...); // no forward here - it could move and thus + // break the next line + data1.load(mem + storage_type0::size(), Split::hi(std::forward(args))...); + } + + template Vc_INTRINSIC void store(U *mem, Args &&... 
args) const + { + data0.store(mem, Split::lo(args)...); // no forward here - it could move and thus + // break the next line + data1.store(mem + storage_type0::size(), Split::hi(std::forward(args))...); + } + + Vc_INTRINSIC mask_type operator!() const + { + return {!data0, !data1}; + } + + Vc_INTRINSIC SimdArray operator-() const + { + return {-data0, -data1}; + } + + /// Returns a copy of itself + Vc_INTRINSIC SimdArray operator+() const { return *this; } + + Vc_INTRINSIC SimdArray operator~() const + { + return {~data0, ~data1}; + } + + // left/right shift operators {{{2 + template ::value && std::is_integral::value>> + Vc_INTRINSIC Vc_CONST SimdArray operator<<(U x) const + { + return {data0 << x, data1 << x}; + } + template ::value && std::is_integral::value>> + Vc_INTRINSIC SimdArray &operator<<=(U x) + { + data0 <<= x; + data1 <<= x; + return *this; + } + template ::value && std::is_integral::value>> + Vc_INTRINSIC Vc_CONST SimdArray operator>>(U x) const + { + return {data0 >> x, data1 >> x}; + } + template ::value && std::is_integral::value>> + Vc_INTRINSIC SimdArray &operator>>=(U x) + { + data0 >>= x; + data1 >>= x; + return *this; + } + + // binary operators {{{2 +#define Vc_BINARY_OPERATOR_(op) \ + Vc_INTRINSIC Vc_CONST SimdArray operator op(const SimdArray &rhs) const \ + { \ + return {data0 op rhs.data0, data1 op rhs.data1}; \ + } \ + Vc_INTRINSIC SimdArray &operator op##=(const SimdArray &rhs) \ + { \ + data0 op## = rhs.data0; \ + data1 op## = rhs.data1; \ + return *this; \ + } + Vc_ALL_ARITHMETICS(Vc_BINARY_OPERATOR_); + Vc_ALL_BINARY(Vc_BINARY_OPERATOR_); + Vc_ALL_SHIFTS(Vc_BINARY_OPERATOR_); +#undef Vc_BINARY_OPERATOR_ + +#define Vc_COMPARES(op) \ + Vc_INTRINSIC mask_type operator op(const SimdArray &rhs) const \ + { \ + return {data0 op rhs.data0, data1 op rhs.data1}; \ + } + Vc_ALL_COMPARES(Vc_COMPARES); +#undef Vc_COMPARES + + // operator[] {{{2 + /// \name Scalar Subscript Operators + ///@{ + +private: + friend reference; + Vc_INTRINSIC static value_type get(const SimdArray &o, int i) noexcept + { + return reinterpret_cast(&o)[i]; + } + template + Vc_INTRINSIC static void set(SimdArray &o, int i, U &&v) noexcept( + noexcept(std::declval() = v)) + { + reinterpret_cast(&o)[i] = v; + } + +public: + ///\copydoc Vector::operator[](size_t) + Vc_INTRINSIC reference operator[](size_t i) noexcept + { + static_assert(noexcept(reference{std::declval(), int()}), ""); + return {*this, int(i)}; + } + + ///\copydoc Vector::operator[](size_t) const + Vc_INTRINSIC value_type operator[](size_t index) const noexcept + { + return get(*this, int(index)); + } + ///@} + + // operator(){{{2 + ///\copydoc Vector::operator()(MaskType) + Vc_INTRINSIC Common::WriteMaskedVector operator()( + const mask_type &mask) + { + return {*this, mask}; + } + + ///\internal + Vc_INTRINSIC void assign(const SimdArray &v, const mask_type &k) //{{{2 + { + data0.assign(v.data0, internal_data0(k)); + data1.assign(v.data1, internal_data1(k)); + } + + // reductions {{{2 +#define Vc_REDUCTION_FUNCTION_(name_, binary_fun_, scalar_fun_) \ +private: \ + template \ + Vc_INTRINSIC enable_if::value && \ + storage_type0::Size == storage_type1::Size, \ + value_type> name_##_impl() const \ + { \ + return binary_fun_(data0, data1).name_(); \ + } \ + \ + template \ + Vc_INTRINSIC enable_if::value && \ + storage_type0::Size != storage_type1::Size, \ + value_type> name_##_impl() const \ + { \ + return scalar_fun_(data0.name_(), data1.name_()); \ + } \ + \ +public: \ + /**\copybrief Vector::##name_ */ \ + Vc_INTRINSIC value_type 
name_() const { return name_##_impl(); } \ + /**\copybrief Vector::##name_ */ \ + Vc_INTRINSIC value_type name_(const mask_type &mask) const \ + { \ + if (Vc_IS_UNLIKELY(Split::lo(mask).isEmpty())) { \ + return data1.name_(Split::hi(mask)); \ + } else if (Vc_IS_UNLIKELY(Split::hi(mask).isEmpty())) { \ + return data0.name_(Split::lo(mask)); \ + } else { \ + return scalar_fun_(data0.name_(Split::lo(mask)), \ + data1.name_(Split::hi(mask))); \ + } \ + } \ + Vc_NOTHING_EXPECTING_SEMICOLON + Vc_REDUCTION_FUNCTION_(min, Vc::min, std::min); + Vc_REDUCTION_FUNCTION_(max, Vc::max, std::max); + Vc_REDUCTION_FUNCTION_(product, internal::product_helper_, internal::product_helper_); + Vc_REDUCTION_FUNCTION_(sum, internal::sum_helper_, internal::sum_helper_); +#undef Vc_REDUCTION_FUNCTION_ + ///\copybrief Vector::partialSum + Vc_INTRINSIC Vc_PURE SimdArray partialSum() const //{{{2 + { + auto ps0 = data0.partialSum(); + auto tmp = data1; + tmp[0] += ps0[data0.size() - 1]; + return {std::move(ps0), tmp.partialSum()}; + } + + // apply {{{2 + ///\copybrief Vector::apply(F &&) const + template inline SimdArray apply(F &&f) const + { + return {data0.apply(f), data1.apply(f)}; + } + ///\copybrief Vector::apply(F &&, MaskType) const + template inline SimdArray apply(F &&f, const mask_type &k) const + { + return {data0.apply(f, Split::lo(k)), data1.apply(f, Split::hi(k))}; + } + + // shifted {{{2 + ///\copybrief Vector::shifted(int) const + inline SimdArray shifted(int amount) const + { + constexpr int SSize = Size; + constexpr int SSize0 = storage_type0::Size; + constexpr int SSize1 = storage_type1::Size; + if (amount == 0) { + return *this; + } + if (amount < 0) { + if (amount > -SSize0) { + return {data0.shifted(amount), data1.shifted(amount, data0)}; + } + if (amount == -SSize0) { + return {storage_type0::Zero(), simd_cast(data0)}; + } + if (amount < -SSize0) { + return {storage_type0::Zero(), simd_cast(data0.shifted( + amount + SSize0))}; + } + return Zero(); + } else { + if (amount >= SSize) { + return Zero(); + } else if (amount >= SSize0) { + return { + simd_cast(data1).shifted(amount - SSize0), + storage_type1::Zero()}; + } else if (amount >= SSize1) { + return {data0.shifted(amount, data1), storage_type1::Zero()}; + } else { + return {data0.shifted(amount, data1), data1.shifted(amount)}; + } + } + } + + template + inline enable_if< + !(std::is_same::value && // not bisectable + N == NN), + SimdArray> + shifted(int amount, const SimdArray &shiftIn) const + { + constexpr int SSize = Size; + if (amount < 0) { + return SimdArray::generate([&](int i) -> value_type { + i += amount; + if (i >= 0) { + return operator[](i); + } else if (i >= -SSize) { + return shiftIn[i + SSize]; + } + return 0; + }); + } + return SimdArray::generate([&](int i) -> value_type { + i += amount; + if (i < SSize) { + return operator[](i); + } else if (i < 2 * SSize) { + return shiftIn[i - SSize]; + } + return 0; + }); + } + +private: + // workaround for MSVC not understanding the simpler and shorter expression of the boolean + // expression directly in the enable_if below + template struct bisectable_shift + : public std::integral_constant::value && // bisectable + N == NN> + { + }; + +public: + template + inline SimdArray shifted(enable_if::value, int> amount, + const SimdArray &shiftIn) const + { + constexpr int SSize = Size; + if (amount < 0) { + if (amount > -static_cast(storage_type0::Size)) { + return {data0.shifted(amount, internal_data1(shiftIn)), + data1.shifted(amount, data0)}; + } + if (amount == 
-static_cast(storage_type0::Size)) { + return {storage_type0(internal_data1(shiftIn)), storage_type1(data0)}; + } + if (amount > -SSize) { + return { + internal_data1(shiftIn) + .shifted(amount + static_cast(storage_type0::Size), internal_data0(shiftIn)), + data0.shifted(amount + static_cast(storage_type0::Size), internal_data1(shiftIn))}; + } + if (amount == -SSize) { + return shiftIn; + } + if (amount > -2 * SSize) { + return shiftIn.shifted(amount + SSize); + } + } + if (amount == 0) { + return *this; + } + if (amount < static_cast(storage_type0::Size)) { + return {data0.shifted(amount, data1), + data1.shifted(amount, internal_data0(shiftIn))}; + } + if (amount == static_cast(storage_type0::Size)) { + return {storage_type0(data1), storage_type1(internal_data0(shiftIn))}; + } + if (amount < SSize) { + return {data1.shifted(amount - static_cast(storage_type0::Size), internal_data0(shiftIn)), + internal_data0(shiftIn) + .shifted(amount - static_cast(storage_type0::Size), internal_data1(shiftIn))}; + } + if (amount == SSize) { + return shiftIn; + } + if (amount < 2 * SSize) { + return shiftIn.shifted(amount - SSize); + } + return Zero(); + } + + // rotated {{{2 + ///\copybrief Vector::rotated + Vc_INTRINSIC SimdArray rotated(int amount) const + { + amount %= int(size()); + if (amount == 0) { + return *this; + } else if (amount < 0) { + amount += size(); + } + +#ifdef Vc_MSVC + // MSVC fails to find a SimdArray::shifted function with 2 arguments. So use store + // -> + // load to implement the function instead. + alignas(MemoryAlignment) T tmp[N + data0.size()]; + data0.store(&tmp[0], Vc::Aligned); + data1.store(&tmp[data0.size()], Vc::Aligned); + data0.store(&tmp[N], Vc::Unaligned); + SimdArray r; + r.data0.load(&tmp[amount], Vc::Unaligned); + r.data1.load(&tmp[(amount + data0.size()) % size()], Vc::Unaligned); + return r; +#else + auto &&d0cvtd = simd_cast(data0); + auto &&d1cvtd = simd_cast(data1); + constexpr int size0 = storage_type0::size(); + constexpr int size1 = storage_type1::size(); + + if (amount == size0 && std::is_same::value) { + return {std::move(d1cvtd), std::move(d0cvtd)}; + } else if (amount < size1) { + return {data0.shifted(amount, d1cvtd), data1.shifted(amount, d0cvtd)}; + } else if (amount == size1) { + return {data0.shifted(amount, d1cvtd), std::move(d0cvtd)}; + } else if (int(size()) - amount < size1) { + return {data0.shifted(amount - int(size()), d1cvtd.shifted(size1 - size0)), + data1.shifted(amount - int(size()), data0.shifted(size0 - size1))}; + } else if (int(size()) - amount == size1) { + return {data0.shifted(-size1, d1cvtd.shifted(size1 - size0)), + simd_cast(data0.shifted(size0 - size1))}; + } else if (amount <= size0) { + return {data0.shifted(size1, d1cvtd).shifted(amount - size1, data0), + simd_cast(data0.shifted(amount - size1))}; + } else { + return {data0.shifted(size1, d1cvtd).shifted(amount - size1, data0), + simd_cast(data0.shifted(amount - size1, d1cvtd))}; + } + return *this; +#endif + } + + // interleaveLow/-High {{{2 + ///\internal \copydoc Vector::interleaveLow + Vc_INTRINSIC SimdArray interleaveLow(const SimdArray &x) const + { + // return data0[0], x.data0[0], data0[1], x.data0[1], ... 
+ return {data0.interleaveLow(x.data0), + simd_cast(data0.interleaveHigh(x.data0))}; + } + ///\internal \copydoc Vector::interleaveHigh + Vc_INTRINSIC SimdArray interleaveHigh(const SimdArray &x) const + { + return interleaveHighImpl( + x, + std::integral_constant()); + } + +private: + ///\internal + Vc_INTRINSIC SimdArray interleaveHighImpl(const SimdArray &x, std::true_type) const + { + return {data1.interleaveLow(x.data1), data1.interleaveHigh(x.data1)}; + } + ///\internal + inline SimdArray interleaveHighImpl(const SimdArray &x, std::false_type) const + { + return {data0.interleaveHigh(x.data0) + .shifted(storage_type1::Size, + simd_cast(data1.interleaveLow(x.data1))), + data1.interleaveHigh(x.data1)}; + } + +public: + ///\copybrief Vector::reversed + inline SimdArray reversed() const //{{{2 + { + if (std::is_same::value) { + return {simd_cast(data1).reversed(), + simd_cast(data0).reversed()}; + } else { +#ifdef Vc_MSVC + // MSVC fails to find a SimdArray::shifted function with 2 arguments. So use + // store + // -> load to implement the function instead. + alignas(MemoryAlignment) T tmp[N]; + data1.reversed().store(&tmp[0], Vc::Aligned); + data0.reversed().store(&tmp[data1.size()], Vc::Unaligned); + return SimdArray{&tmp[0], Vc::Aligned}; +#else + return {data0.shifted(storage_type1::Size, data1).reversed(), + simd_cast(data0.reversed().shifted( + storage_type0::Size - storage_type1::Size))}; +#endif + } + } + ///\copydoc Vector::sorted + inline SimdArray sorted() const //{{{2 + { + return sortedImpl( + std::integral_constant()); + } + + ///\internal + Vc_INTRINSIC SimdArray sortedImpl(std::true_type) const + { +#ifdef Vc_DEBUG_SORTED + std::cerr << "-- " << data0 << data1 << '\n'; +#endif + const auto a = data0.sorted(); + const auto b = data1.sorted().reversed(); + const auto lo = Vc::min(a, b); + const auto hi = Vc::max(a, b); + return {lo.sorted(), hi.sorted()}; + } + + ///\internal + Vc_INTRINSIC SimdArray sortedImpl(std::false_type) const + { + using SortableArray = + SimdArray::value>; + auto sortable = simd_cast(*this); + for (std::size_t i = Size; i < SortableArray::Size; ++i) { + using limits = std::numeric_limits; + if (limits::has_infinity) { + sortable[i] = limits::infinity(); + } else { + sortable[i] = std::numeric_limits::max(); + } + } + return simd_cast(sortable.sorted()); + + /* The following implementation appears to be less efficient. But this may need further + * work. + const auto a = data0.sorted(); + const auto b = data1.sorted(); +#ifdef Vc_DEBUG_SORTED + std::cerr << "== " << a << b << '\n'; +#endif + auto aIt = Vc::begin(a); + auto bIt = Vc::begin(b); + const auto aEnd = Vc::end(a); + const auto bEnd = Vc::end(b); + return SimdArray::generate([&](std::size_t) { + if (aIt == aEnd) { + return *(bIt++); + } + if (bIt == bEnd) { + return *(aIt++); + } + if (*aIt < *bIt) { + return *(aIt++); + } else { + return *(bIt++); + } + }); + */ + } + + /// \name Deprecated Members + ///@{ + + ///\copydoc size + ///\deprecated Use size() instead. 
+ static constexpr std::size_t Size = size(); + + /// \copydoc Vector::exponent + Vc_DEPRECATED("use exponent(x) instead") Vc_INTRINSIC SimdArray exponent() const + { + return {exponent(data0), exponent(data1)}; + } + + /// \copydoc Vector::isNegative + Vc_DEPRECATED("use isnegative(x) instead") Vc_INTRINSIC MaskType isNegative() const + { + return {isnegative(data0), isnegative(data1)}; + } + + ///\copydoc Vector::copySign + Vc_DEPRECATED("use copysign(x, y) instead") Vc_INTRINSIC SimdArray + copySign(const SimdArray &reference) const + { + return {Vc::copysign(data0, reference.data0), + Vc::copysign(data1, reference.data1)}; + } + ///@} + + // internal_data0/1 {{{2 + friend storage_type0 &internal_data0<>(SimdArray &x); + friend storage_type1 &internal_data1<>(SimdArray &x); + friend const storage_type0 &internal_data0<>(const SimdArray &x); + friend const storage_type1 &internal_data1<>(const SimdArray &x); + + /// \internal + Vc_INTRINSIC SimdArray(storage_type0 &&x, storage_type1 &&y) //{{{2 + : data0(std::move(x)), data1(std::move(y)) + { + } + + Vc_FREE_STORE_OPERATORS_ALIGNED(alignof(storage_type0)); + +private: //{{{2 + // The alignas attribute attached to the class declaration above is ignored by ICC + // 17.0.0 (at least). So just move the alignas attribute down here where it works for + // all compilers. + alignas(static_cast( + Common::BoundedAlignment::value * sizeof(V) / + V::size()>::value)) storage_type0 data0; + storage_type1 data1; +}; +#undef Vc_CURRENT_CLASS_NAME +template +constexpr std::size_t SimdArray::Size; +template +constexpr std::size_t SimdArray::MemoryAlignment; + +// gatherImplementation {{{2 +template +template +inline void SimdArray::gatherImplementation(const MT *mem, + IT &&indexes) +{ + data0.gather(mem, Split::lo(Common::Operations::gather(), + indexes)); // don't forward indexes - it could move and + // thus break the next line + data1.gather(mem, Split::hi(Common::Operations::gather(), std::forward(indexes))); +} +template +template +inline void SimdArray::gatherImplementation(const MT *mem, + IT &&indexes, MaskArgument mask) +{ + data0.gather(mem, Split::lo(Common::Operations::gather(), indexes), + Split::lo(mask)); // don't forward indexes - it could move and + // thus break the next line + data1.gather(mem, Split::hi(Common::Operations::gather(), std::forward(indexes)), + Split::hi(mask)); +} + +// scatterImplementation {{{2 +template +template +inline void SimdArray::scatterImplementation(MT *mem, + IT &&indexes) const +{ + data0.scatter(mem, Split::lo(Common::Operations::gather(), + indexes)); // don't forward indexes - it could move and + // thus break the next line + data1.scatter(mem, Split::hi(Common::Operations::gather(), std::forward(indexes))); +} +template +template +inline void SimdArray::scatterImplementation(MT *mem, + IT &&indexes, MaskArgument mask) const +{ + data0.scatter(mem, Split::lo(Common::Operations::gather(), indexes), + Split::lo(mask)); // don't forward indexes - it could move and + // thus break the next line + data1.scatter(mem, Split::hi(Common::Operations::gather(), std::forward(indexes)), + Split::hi(mask)); +} + +// internal_data0/1 (SimdArray) {{{1 +///\internal Returns the first data member of a generic SimdArray +template +#ifndef Vc_MSVC +Vc_INTRINSIC +#endif +typename SimdArrayTraits::storage_type0 &internal_data0( + SimdArray &x) +{ + return x.data0; +} +///\internal Returns the second data member of a generic SimdArray +template +#ifndef Vc_MSVC +Vc_INTRINSIC +#endif +typename SimdArrayTraits::storage_type1 
&internal_data1( + SimdArray &x) +{ + return x.data1; +} +///\internal Returns the first data member of a generic SimdArray (const overload) +template +#ifndef Vc_MSVC +Vc_INTRINSIC +#endif +const typename SimdArrayTraits::storage_type0 &internal_data0( + const SimdArray &x) +{ + return x.data0; +} +///\internal Returns the second data member of a generic SimdArray (const overload) +template +#ifndef Vc_MSVC +Vc_INTRINSIC +#endif +const typename SimdArrayTraits::storage_type1 &internal_data1( + const SimdArray &x) +{ + return x.data1; +} + +// MSVC workaround for SimdArray(storage_type0, storage_type1) ctor{{{1 +// MSVC sometimes stores x to data1. By first broadcasting 0 and then assigning y +// in the body the bug is supressed. +#if defined Vc_MSVC && defined Vc_IMPL_SSE +template <> +Vc_INTRINSIC SimdArray, 2>::SimdArray( + SimdArray &&x, SimdArray &&y) + : data0(x), data1(0) +{ + data1 = y; +} +#endif + +// binary operators {{{1 +namespace result_vector_type_internal +{ +template +using type = typename std::remove_cv::type>::type; + +template +using is_integer_larger_than_int = std::integral_constant< + bool, std::is_integral::value &&(sizeof(T) > sizeof(int) || + std::is_same::value || + std::is_same::value)>; + +template < + typename L, typename R, + std::size_t N = Traits::isSimdArray::value ? Traits::simd_vector_size::value + : Traits::simd_vector_size::value, + bool = + (Traits::isSimdArray::value || + Traits::isSimdArray::value) // one of the operands must be a SimdArray + && !std::is_same, type>::value // if the operands are of the same type + // use the member function + && + ((std::is_arithmetic>::value && + !is_integer_larger_than_int>::value) || + (std::is_arithmetic>::value && + !is_integer_larger_than_int>::value) // one of the operands is a scalar + // type + || + ( // or one of the operands is Vector with Vector::size() == + // SimdArray::size() + Traits::simd_vector_size::value == Traits::simd_vector_size::value && + ((Traits::is_simd_vector::value && !Traits::isSimdArray::value) || + (Traits::is_simd_vector::value && !Traits::isSimdArray::value))))> +struct evaluate; + +template struct evaluate +{ +private: + using LScalar = Traits::entry_type_of; + using RScalar = Traits::entry_type_of; + + template + using conditional = typename std::conditional::type; + +public: + // In principle we want the exact same rules for SimdArray ⨉ SimdArray as the standard + // defines for T ⨉ U. BUT: short ⨉ short returns int (because all integral types smaller than + // int are promoted to int before any operation). This would imply that SIMD types with integral + // types smaller than int are more or less useless - and you could use SimdArray from the + // start. Therefore we special-case those operations where the scalar type of both operands is + // integral and smaller than int. + // In addition to that there is no generic support for 64-bit int SIMD types. Therefore + // promotion to a 64-bit integral type (including `long` because it can potentially have 64 + // bits) also is not done. But if one of the operands is a scalar type that is larger than int + // then the operator is disabled altogether. We do not want an implicit demotion. 
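+    // Two illustrative consequences of these rules (the first one is also what the
+    // static_assert below checks):
+    //   result_vector_type<short, SimdArray<unsigned short, 32>> is
+    //     SimdArray<unsigned short, 32>  (both integral and smaller than int, equal
+    //                                     size, so the unsigned type wins)
+    //   result_vector_type<float, SimdArray<int, 8>> is
+    //     SimdArray<float, 8>            (not both integral, so it falls through to
+    //                                     decltype(float() + int()), i.e. float)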
+ using type = SimdArray< + conditional<(std::is_integral::value &&std::is_integral::value && + sizeof(LScalar) < sizeof(int) && + sizeof(RScalar) < sizeof(int)), + conditional<(sizeof(LScalar) == sizeof(RScalar)), + conditional::value, LScalar, RScalar>, + conditional<(sizeof(LScalar) > sizeof(RScalar)), LScalar, RScalar>>, + decltype(std::declval() + std::declval())>, + N>; +}; + +} // namespace result_vector_type_internal + +template +using result_vector_type = typename result_vector_type_internal::evaluate::type; + +static_assert( + std::is_same>, + Vc::SimdArray>::value, + "result_vector_type does not work"); + +#define Vc_BINARY_OPERATORS_(op_) \ + /*!\brief Applies op_ component-wise and concurrently. */ \ + template \ + Vc_INTRINSIC result_vector_type operator op_(L &&lhs, R &&rhs) \ + { \ + using Return = result_vector_type; \ + return Return(std::forward(lhs)) op_ Return(std::forward(rhs)); \ + } +/** + * \name Arithmetic and Bitwise Operators + * + * Applies the operator component-wise and concurrently on \p lhs and \p rhs and returns + * a new SimdArray object containing the result values. + * + * This operator only participates in overload resolution if: + * \li At least one of the template parameters \p L or \p R is a SimdArray type. + * \li Either \p L or \p R is a fundamental arithmetic type but not an integral type + * larger than \c int \n + * or \n + * \p L or \p R is a Vc::Vector type with equal number of elements (Vector::size() == + * SimdArray::size()). + * + * The return type of the operator is a SimdArray type using the more precise EntryType of + * \p L or \p R and the same number of elements as the SimdArray argument(s). + */ +///@{ +Vc_ALL_ARITHMETICS(Vc_BINARY_OPERATORS_); +Vc_ALL_BINARY(Vc_BINARY_OPERATORS_); +///@} +#undef Vc_BINARY_OPERATORS_ +#define Vc_BINARY_OPERATORS_(op_) \ + /*!\brief Applies op_ component-wise and concurrently. */ \ + template \ + Vc_INTRINSIC typename result_vector_type::mask_type operator op_(L &&lhs, \ + R &&rhs) \ + { \ + using Promote = result_vector_type; \ + return Promote(std::forward(lhs)) op_ Promote(std::forward(rhs)); \ + } +/** + * \name Compare Operators + * + * Applies the operator component-wise and concurrently on \p lhs and \p rhs and returns + * a new SimdMaskArray object containing the result values. + * + * This operator only participates in overload resolution if (same rules as above): + * \li At least one of the template parameters \p L or \p R is a SimdArray type. + * \li Either \p L or \p R is a fundamental arithmetic type but not an integral type + * larger than \c int \n + * or \n + * \p L or \p R is a Vc::Vector type with equal number of elements (Vector::size() == + * SimdArray::size()). + * + * The return type of the operator is a SimdMaskArray type using the more precise EntryType of + * \p L or \p R and the same number of elements as the SimdArray argument(s). + */ +///@{ +Vc_ALL_COMPARES(Vc_BINARY_OPERATORS_); +///@} +#undef Vc_BINARY_OPERATORS_ + +// math functions {{{1 +#define Vc_FORWARD_UNARY_OPERATOR(name_) \ + /*!\brief Applies the std::name_ function component-wise and concurrently. */ \ + template \ + inline SimdArray name_(const SimdArray &x) \ + { \ + return SimdArray::fromOperation( \ + Common::Operations::Forward_##name_(), x); \ + } \ + Vc_NOTHING_EXPECTING_SEMICOLON + +#define Vc_FORWARD_UNARY_BOOL_OPERATOR(name_) \ + /*!\brief Applies the std::name_ function component-wise and concurrently. 
*/ \ + template \ + inline SimdMaskArray name_(const SimdArray &x) \ + { \ + return SimdMaskArray::fromOperation( \ + Common::Operations::Forward_##name_(), x); \ + } \ + Vc_NOTHING_EXPECTING_SEMICOLON + +#define Vc_FORWARD_BINARY_OPERATOR(name_) \ + /*!\brief Applies the std::name_ function component-wise and concurrently. */ \ + template \ + inline SimdArray name_(const SimdArray &x, \ + const SimdArray &y) \ + { \ + return SimdArray::fromOperation( \ + Common::Operations::Forward_##name_(), x, y); \ + } \ + Vc_NOTHING_EXPECTING_SEMICOLON + +/** + * \name Math functions + * These functions evaluate the + */ +///@{ +Vc_FORWARD_UNARY_OPERATOR(abs); +Vc_FORWARD_UNARY_OPERATOR(asin); +Vc_FORWARD_UNARY_OPERATOR(atan); +Vc_FORWARD_BINARY_OPERATOR(atan2); +Vc_FORWARD_UNARY_OPERATOR(ceil); +Vc_FORWARD_BINARY_OPERATOR(copysign); +Vc_FORWARD_UNARY_OPERATOR(cos); +Vc_FORWARD_UNARY_OPERATOR(exp); +Vc_FORWARD_UNARY_OPERATOR(exponent); +Vc_FORWARD_UNARY_OPERATOR(floor); +/// Applies the std::fma function component-wise and concurrently. +template +inline SimdArray fma(const SimdArray &a, const SimdArray &b, + const SimdArray &c) +{ + return SimdArray::fromOperation(Common::Operations::Forward_fma(), a, b, c); +} +Vc_FORWARD_UNARY_BOOL_OPERATOR(isfinite); +Vc_FORWARD_UNARY_BOOL_OPERATOR(isinf); +Vc_FORWARD_UNARY_BOOL_OPERATOR(isnan); +#if defined Vc_MSVC && defined Vc_IMPL_SSE +inline SimdMaskArray, 2> isnan( + const SimdArray, 2> &x) +{ + using V = SSE::Vector; + const SimdArray &x0 = internal_data0(x); + const SimdArray &x1 = internal_data1(x); + SimdMaskArray r0; + SimdMaskArray r1; + internal_data(internal_data0(r0)) = isnan(internal_data(internal_data0(x0))); + internal_data(internal_data1(r0)) = isnan(internal_data(internal_data1(x0))); + internal_data(internal_data0(r1)) = isnan(internal_data(internal_data0(x1))); + internal_data(internal_data1(r1)) = isnan(internal_data(internal_data1(x1))); + return {std::move(r0), std::move(r1)}; +} +#endif +Vc_FORWARD_UNARY_BOOL_OPERATOR(isnegative); +/// Applies the std::frexp function component-wise and concurrently. +template +inline SimdArray frexp(const SimdArray &x, SimdArray *e) +{ + return SimdArray::fromOperation(Common::Operations::Forward_frexp(), x, e); +} +/// Applies the std::ldexp function component-wise and concurrently. +template +inline SimdArray ldexp(const SimdArray &x, const SimdArray &e) +{ + return SimdArray::fromOperation(Common::Operations::Forward_ldexp(), x, e); +} +Vc_FORWARD_UNARY_OPERATOR(log); +Vc_FORWARD_UNARY_OPERATOR(log10); +Vc_FORWARD_UNARY_OPERATOR(log2); +Vc_FORWARD_UNARY_OPERATOR(reciprocal); +Vc_FORWARD_UNARY_OPERATOR(round); +Vc_FORWARD_UNARY_OPERATOR(rsqrt); +Vc_FORWARD_UNARY_OPERATOR(sin); +/// Determines sine and cosine concurrently and component-wise on \p x. 
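+/// For illustration (assuming T = float and N = 8):
+/// \code
+/// Vc::SimdArray<float, 8> x = Vc::SimdArray<float, 8>::IndexesFromZero();
+/// Vc::SimdArray<float, 8> s, c;
+/// Vc::sincos(x, &s, &c);  // now s[i] and c[i] hold sin(x[i]) and cos(x[i])
+/// \endcode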
+template +void sincos(const SimdArray &x, SimdArray *sin, SimdArray *cos) +{ + SimdArray::callOperation(Common::Operations::Forward_sincos(), x, sin, cos); +} +Vc_FORWARD_UNARY_OPERATOR(sqrt); +Vc_FORWARD_UNARY_OPERATOR(trunc); +Vc_FORWARD_BINARY_OPERATOR(min); +Vc_FORWARD_BINARY_OPERATOR(max); +///@} +#undef Vc_FORWARD_UNARY_OPERATOR +#undef Vc_FORWARD_UNARY_BOOL_OPERATOR +#undef Vc_FORWARD_BINARY_OPERATOR + +// simd_cast {{{1 +#ifdef Vc_MSVC +#define Vc_DUMMY_ARG0 , int = 0 +#define Vc_DUMMY_ARG1 , long = 0 +#define Vc_DUMMY_ARG2 , short = 0 +#define Vc_DUMMY_ARG3 , char = '0' +#define Vc_DUMMY_ARG4 , unsigned = 0u +#define Vc_DUMMY_ARG5 , unsigned short = 0u +#else +#define Vc_DUMMY_ARG0 +#define Vc_DUMMY_ARG1 +#define Vc_DUMMY_ARG2 +#define Vc_DUMMY_ARG3 +#define Vc_DUMMY_ARG4 +#define Vc_DUMMY_ARG5 +#endif // Vc_MSVC + +// simd_cast_impl_smaller_input {{{2 +// The following function can be implemented without the sizeof...(From) overload. +// However, ICC has a bug (Premier Issue #6000116338) which leads to an ICE. Splitting the +// function in two works around the issue. +template +Vc_INTRINSIC Vc_CONST enable_if +simd_cast_impl_smaller_input(const From &... xs, const T &last) +{ + Return r = simd_cast(xs...); + for (size_t i = 0; i < N; ++i) { + r[i + N * sizeof...(From)] = static_cast(last[i]); + } + return r; +} +template +Vc_INTRINSIC Vc_CONST Return simd_cast_impl_smaller_input(const T &last) +{ + Return r = Return(); + for (size_t i = 0; i < N; ++i) { + r[i] = static_cast(last[i]); + } + return r; +} +template +Vc_INTRINSIC Vc_CONST enable_if simd_cast_impl_larger_input( + const From &... xs, const T &last) +{ + Return r = simd_cast(xs...); + for (size_t i = N * sizeof...(From); i < Return::Size; ++i) { + r[i] = static_cast(last[i - N * sizeof...(From)]); + } + return r; +} +template +Vc_INTRINSIC Vc_CONST Return simd_cast_impl_larger_input(const T &last) +{ + Return r = Return(); + for (size_t i = 0; i < Return::size(); ++i) { + r[i] = static_cast(last[i]); + } + return r; +} + +// simd_cast_without_last (declaration) {{{2 +template +Vc_INTRINSIC_L Vc_CONST_L Return + simd_cast_without_last(const From &... xs, const T &) Vc_INTRINSIC_R Vc_CONST_R; + +// are_all_types_equal {{{2 +template struct are_all_types_equal; +template +struct are_all_types_equal : public std::integral_constant +{ +}; +template +struct are_all_types_equal + : public std::integral_constant< + bool, std::is_same::value && are_all_types_equal::value> +{ +}; + +// simd_cast_interleaved_argument_order (declarations) {{{2 +/*! \internal + The need for simd_cast_interleaved_argument_order stems from a shortcoming in pack + expansion of variadic templates in C++. For a simd_cast with SimdArray arguments that + are bisectable (i.e. \c storage_type0 and \c storage_type1 are equal) the generic + implementation needs to forward to a simd_cast of the \c internal_data0 and \c + internal_data1 of the arguments. But the required order of arguments is + `internal_data0(arg0), internal_data1(arg0), internal_data0(arg1), ...`. This is + impossible to achieve with pack expansion. It is only possible to write + `internal_data0(args)..., internal_data1(args)...` and thus have the argument order + mixed up. The simd_cast_interleaved_argument_order “simply” calls simd_cast with the + arguments correctly reordered (i.e. interleaved). + + The implementation of simd_cast_interleaved_argument_order is done generically, so that + it supports any number of arguments. 
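+ Concretely, for two bisectable SimdArray arguments a and b the generic implementation
+ needs the call simd_cast<Return>(internal_data0(a), internal_data1(a),
+ internal_data0(b), internal_data1(b)), but plain pack expansion can only produce the
+ order internal_data0(a), internal_data0(b), internal_data1(a), internal_data1(b).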
The central idea of the implementation is an + `extract` function which returns one value of an argument pack determined via an index + passed as template argument. This index is generated via an index_sequence. The + `extract` function uses two argument packs (of equal size) to easily return values from + the front and middle of the argument pack (for doing the deinterleave). + */ +template +Vc_INTRINSIC Vc_CONST Return + simd_cast_interleaved_argument_order(const Ts &... a, const Ts &... b); + +// simd_cast_with_offset (declarations and one impl) {{{2 +// offset == 0 {{{3 +template +Vc_INTRINSIC Vc_CONST + enable_if<(are_all_types_equal::value && offset == 0), Return> + simd_cast_with_offset(const From &x, const Froms &... xs); +// offset > 0 && offset divisible by Return::Size {{{3 +template +Vc_INTRINSIC Vc_CONST + enable_if<(From::Size > offset && offset > 0 && offset % Return::Size == 0), Return> + simd_cast_with_offset(const From &x); +// offset > 0 && offset NOT divisible && Return is non-atomic simd(mask)array {{{3 +template +Vc_INTRINSIC Vc_CONST + enable_if<(From::Size > offset && offset > 0 && offset % Return::Size != 0 && + ((Traits::isSimdArray::value && + !Traits::isAtomicSimdArray::value) || + (Traits::isSimdMaskArray::value && + !Traits::isAtomicSimdMaskArray::value))), + Return> + simd_cast_with_offset(const From &x); +// offset > 0 && offset NOT divisible && Return is atomic simd(mask)array {{{3 +template +Vc_INTRINSIC Vc_CONST + enable_if<(From::Size > offset && offset > 0 && offset % Return::Size != 0 && + ((Traits::isSimdArray::value && + Traits::isAtomicSimdArray::value) || + (Traits::isSimdMaskArray::value && + Traits::isAtomicSimdMaskArray::value))), + Return> + simd_cast_with_offset(const From &x); +// offset > first argument (drops first arg) {{{3 +template +Vc_INTRINSIC Vc_CONST enable_if< + (are_all_types_equal::value && From::Size <= offset), Return> + simd_cast_with_offset(const From &, const Froms &... xs) +{ + return simd_cast_with_offset(xs...); +} + +// offset > first and only argument (returns Zero) {{{3 +template +Vc_INTRINSIC Vc_CONST enable_if<(From::Size <= offset), Return> simd_cast_with_offset( + const From &) +{ + return Return::Zero(); +} + +// first_type_of {{{2 +template struct first_type_of_impl +{ + using type = T; +}; +template using first_type_of = typename first_type_of_impl::type; + +// simd_cast_drop_arguments (declarations) {{{2 +template +Vc_INTRINSIC Vc_CONST Return simd_cast_drop_arguments(From x); +template +Vc_INTRINSIC Vc_CONST + enable_if<(are_all_types_equal::value && + sizeof...(Froms) * first_type_of::Size < Return::Size), + Return> + simd_cast_drop_arguments(Froms... xs, first_type_of x); +// The following function can be implemented without the sizeof...(From) overload. +// However, ICC has a bug (Premier Issue #6000116338) which leads to an ICE. Splitting the +// function in two works around the issue. +template +Vc_INTRINSIC Vc_CONST enable_if< + (are_all_types_equal::value && + (1 + sizeof...(Froms)) * From::Size >= Return::Size && sizeof...(Froms) != 0), + Return> +simd_cast_drop_arguments(Froms... xs, From x, From); +template +Vc_INTRINSIC Vc_CONST + enable_if<(are_all_types_equal::value && From::Size >= Return::Size), Return> + simd_cast_drop_arguments(From x, From); + +namespace +{ +#ifdef Vc_DEBUG_SIMD_CAST +void debugDoNothing(const std::initializer_list &) {} +template +inline void vc_debug_(const char *prefix, const char *suffix, const T0 &arg0, + const Ts &... 
args) +{ + std::cerr << prefix << arg0; + debugDoNothing({&(std::cerr << ", " << args)...}); + std::cerr << suffix; +} +#else +template +Vc_INTRINSIC void vc_debug_(const char *, const char *, const T0 &, const Ts &...) +{ +} +#endif +} // unnamed namespace + +// is_less trait{{{2 +template +struct is_less : public std::integral_constant { +}; + +// is_power_of_2 trait{{{2 +template +struct is_power_of_2 : public std::integral_constant { +}; + +// simd_cast(xs...) to SimdArray/-mask {{{2 +#define Vc_SIMDARRAY_CASTS(SimdArrayType_, NativeType_) \ + template \ + Vc_INTRINSIC Vc_CONST enable_if< \ + (Traits::isAtomic##SimdArrayType_::value && \ + is_less::Size * sizeof...(Froms), Return::Size>::value && \ + are_all_types_equal, Froms...>::value), \ + Return> \ + simd_cast(NativeType_ x, Froms... xs) \ + { \ + vc_debug_("simd_cast{1}(", ")\n", x, xs...); \ + return {simd_cast(x, xs...)}; \ + } \ + template \ + Vc_INTRINSIC Vc_CONST enable_if< \ + (Traits::isAtomic##SimdArrayType_::value && \ + !is_less::Size * sizeof...(Froms), Return::Size>::value && \ + are_all_types_equal, Froms...>::value), \ + Return> \ + simd_cast(NativeType_ x, Froms... xs) \ + { \ + vc_debug_("simd_cast{2}(", ")\n", x, xs...); \ + return {simd_cast_without_last, Froms...>(x, xs...)}; \ + } \ + template \ + Vc_INTRINSIC Vc_CONST \ + enable_if<(Traits::is##SimdArrayType_::value && \ + !Traits::isAtomic##SimdArrayType_::value && \ + is_less(), \ + NativeType_::Size *(1 + sizeof...(Froms))>::value && \ + are_all_types_equal, Froms...>::value), \ + Return> \ + simd_cast(NativeType_ x, Froms... xs) \ + { \ + vc_debug_("simd_cast{3}(", ")\n", x, xs...); \ + using R0 = typename Return::storage_type0; \ + using R1 = typename Return::storage_type1; \ + return {simd_cast_drop_arguments(x, xs...), \ + simd_cast_with_offset(x, xs...)}; \ + } \ + template \ + Vc_INTRINSIC Vc_CONST \ + enable_if<(Traits::is##SimdArrayType_::value && \ + !Traits::isAtomic##SimdArrayType_::value && \ + !is_less(), \ + NativeType_::Size *(1 + sizeof...(Froms))>::value && \ + are_all_types_equal, Froms...>::value), \ + Return> \ + simd_cast(NativeType_ x, Froms... 
xs) \ + { \ + vc_debug_("simd_cast{4}(", ")\n", x, xs...); \ + using R0 = typename Return::storage_type0; \ + using R1 = typename Return::storage_type1; \ + return {simd_cast(x, xs...), R1::Zero()}; \ + } \ + Vc_NOTHING_EXPECTING_SEMICOLON + +Vc_SIMDARRAY_CASTS(SimdArray, Vc::Vector); +Vc_SIMDARRAY_CASTS(SimdMaskArray, Vc::Mask); +#undef Vc_SIMDARRAY_CASTS + +// simd_cast(V) {{{2 +#define Vc_SIMDARRAY_CASTS(SimdArrayType_, NativeType_) \ + /* SIMD Vector/Mask to atomic SimdArray/simdmaskarray */ \ + template \ + Vc_INTRINSIC Vc_CONST \ + enable_if::value, Return> \ + simd_cast(NativeType_ x Vc_DUMMY_ARG0) \ + { \ + vc_debug_("simd_cast{offset, atomic}(", ")\n", offset, x); \ + return {simd_cast(x)}; \ + } \ + /* both halves of Return array are extracted from argument */ \ + template \ + Vc_INTRINSIC Vc_CONST \ + enable_if<(Traits::is##SimdArrayType_::value && \ + !Traits::isAtomic##SimdArrayType_::value && \ + Return::Size * offset + Common::left_size() < \ + NativeType_::Size), \ + Return> \ + simd_cast(NativeType_ x Vc_DUMMY_ARG1) \ + { \ + vc_debug_("simd_cast{offset, split Return}(", ")\n", offset, x); \ + using R0 = typename Return::storage_type0; \ + constexpr int entries_offset = offset * Return::Size; \ + constexpr int entries_offset_right = entries_offset + R0::Size; \ + return { \ + simd_cast_with_offset(x), \ + simd_cast_with_offset( \ + x)}; \ + } \ + /* SIMD Vector/Mask to non-atomic SimdArray/simdmaskarray */ \ + /* right half of Return array is zero */ \ + template \ + Vc_INTRINSIC Vc_CONST \ + enable_if<(Traits::is##SimdArrayType_::value && \ + !Traits::isAtomic##SimdArrayType_::value && \ + Return::Size * offset + Common::left_size() >= \ + NativeType_::Size), \ + Return> \ + simd_cast(NativeType_ x Vc_DUMMY_ARG2) \ + { \ + vc_debug_("simd_cast{offset, R1::Zero}(", ")\n", offset, x); \ + using R0 = typename Return::storage_type0; \ + using R1 = typename Return::storage_type1; \ + constexpr int entries_offset = offset * Return::Size; \ + return {simd_cast_with_offset(x), R1::Zero()}; \ + } \ + Vc_NOTHING_EXPECTING_SEMICOLON + +Vc_SIMDARRAY_CASTS(SimdArray, Vc::Vector); +Vc_SIMDARRAY_CASTS(SimdMaskArray, Vc::Mask); +#undef Vc_SIMDARRAY_CASTS + +// simd_cast(xs...) from SimdArray/-mask {{{2 +#define Vc_SIMDARRAY_CASTS(SimdArrayType_) \ + /* indivisible SimdArrayType_ */ \ + template \ + Vc_INTRINSIC Vc_CONST \ + enable_if<(are_all_types_equal, From...>::value && \ + (sizeof...(From) == 0 || N * sizeof...(From) < Return::Size) && \ + !std::is_same>::value), \ + Return> \ + simd_cast(const SimdArrayType_ &x0, const From &... xs) \ + { \ + vc_debug_("simd_cast{indivisible}(", ")\n", x0, xs...); \ + return simd_cast(internal_data(x0), internal_data(xs)...); \ + } \ + /* indivisible SimdArrayType_ && can drop arguments from the end */ \ + template \ + Vc_INTRINSIC Vc_CONST \ + enable_if<(are_all_types_equal, From...>::value && \ + (sizeof...(From) > 0 && (N * sizeof...(From) >= Return::Size)) && \ + !std::is_same>::value), \ + Return> \ + simd_cast(const SimdArrayType_ &x0, const From &... 
xs) \ + { \ + vc_debug_("simd_cast{indivisible2}(", ")\n", x0, xs...); \ + return simd_cast_without_last::storage_type, \ + typename From::storage_type...>( \ + internal_data(x0), internal_data(xs)...); \ + } \ + /* bisectable SimdArrayType_ (N = 2^n) && never too large */ \ + template \ + Vc_INTRINSIC Vc_CONST enable_if< \ + (N != M && are_all_types_equal, From...>::value && \ + !std::is_same>::value && \ + is_less::value && is_power_of_2::value), \ + Return> \ + simd_cast(const SimdArrayType_ &x0, const From &... xs) \ + { \ + vc_debug_("simd_cast{bisectable}(", ")\n", x0, xs...); \ + return simd_cast_interleaved_argument_order< \ + Return, typename SimdArrayType_::storage_type0, \ + typename From::storage_type0...>(internal_data0(x0), internal_data0(xs)..., \ + internal_data1(x0), internal_data1(xs)...); \ + } \ + /* bisectable SimdArrayType_ (N = 2^n) && input so large that at least the last \ + * input can be dropped */ \ + template \ + Vc_INTRINSIC Vc_CONST enable_if< \ + (N != M && are_all_types_equal, From...>::value && \ + !is_less::value && is_power_of_2::value), \ + Return> \ + simd_cast(const SimdArrayType_ &x0, const From &... xs) \ + { \ + vc_debug_("simd_cast{bisectable2}(", ")\n", x0, xs...); \ + return simd_cast_without_last, From...>( \ + x0, xs...); \ + } \ + /* remaining SimdArrayType_ input never larger (N != 2^n) */ \ + template \ + Vc_INTRINSIC Vc_CONST enable_if< \ + (N != M && are_all_types_equal, From...>::value && \ + N * (1 + sizeof...(From)) <= Return::Size && !is_power_of_2::value), \ + Return> \ + simd_cast(const SimdArrayType_ &x0, const From &... xs) \ + { \ + vc_debug_("simd_cast{remaining}(", ")\n", x0, xs...); \ + return simd_cast_impl_smaller_input, \ + From...>(x0, xs...); \ + } \ + /* remaining SimdArrayType_ input larger (N != 2^n) */ \ + template \ + Vc_INTRINSIC Vc_CONST enable_if< \ + (N != M && are_all_types_equal, From...>::value && \ + N * (1 + sizeof...(From)) > Return::Size && !is_power_of_2::value), \ + Return> \ + simd_cast(const SimdArrayType_ &x0, const From &... 
xs) \ + { \ + vc_debug_("simd_cast{remaining2}(", ")\n", x0, xs...); \ + return simd_cast_impl_larger_input, \ + From...>(x0, xs...); \ + } \ + /* a single bisectable SimdArrayType_ (N = 2^n) too large */ \ + template \ + Vc_INTRINSIC Vc_CONST \ + enable_if<(N != M && N >= 2 * Return::Size && is_power_of_2::value), Return> \ + simd_cast(const SimdArrayType_ &x) \ + { \ + vc_debug_("simd_cast{single bisectable}(", ")\n", x); \ + return simd_cast(internal_data0(x)); \ + } \ + template \ + Vc_INTRINSIC Vc_CONST enable_if<(N != M && N > Return::Size && \ + N < 2 * Return::Size && is_power_of_2::value), \ + Return> \ + simd_cast(const SimdArrayType_ &x) \ + { \ + vc_debug_("simd_cast{single bisectable2}(", ")\n", x); \ + return simd_cast(internal_data0(x), internal_data1(x)); \ + } \ + Vc_NOTHING_EXPECTING_SEMICOLON + +Vc_SIMDARRAY_CASTS(SimdArray); +Vc_SIMDARRAY_CASTS(SimdMaskArray); +#undef Vc_SIMDARRAY_CASTS + +// simd_cast(SimdArray/-mask) {{{2 +#define Vc_SIMDARRAY_CASTS(SimdArrayType_) \ + /* offset == 0 is like without offset */ \ + template \ + Vc_INTRINSIC Vc_CONST enable_if<(offset == 0), Return> simd_cast( \ + const SimdArrayType_ &x Vc_DUMMY_ARG0) \ + { \ + vc_debug_("simd_cast{offset == 0}(", ")\n", offset, x); \ + return simd_cast(x); \ + } \ + /* forward to V */ \ + template \ + Vc_INTRINSIC Vc_CONST enable_if<(offset != 0), Return> simd_cast( \ + const SimdArrayType_ &x Vc_DUMMY_ARG1) \ + { \ + vc_debug_("simd_cast{offset, forward}(", ")\n", offset, x); \ + return simd_cast(internal_data(x)); \ + } \ + /* convert from right member of SimdArray */ \ + template \ + Vc_INTRINSIC Vc_CONST \ + enable_if<(N != M && offset * Return::Size >= Common::left_size() && \ + offset != 0 && Common::left_size() % Return::Size == 0), \ + Return> \ + simd_cast(const SimdArrayType_ &x Vc_DUMMY_ARG2) \ + { \ + vc_debug_("simd_cast{offset, right}(", ")\n", offset, x); \ + return simd_cast() / Return::Size>( \ + internal_data1(x)); \ + } \ + /* same as above except for odd cases where offset * Return::Size doesn't fit the \ + * left side of the SimdArray */ \ + template \ + Vc_INTRINSIC Vc_CONST \ + enable_if<(N != M && offset * Return::Size >= Common::left_size() && \ + offset != 0 && Common::left_size() % Return::Size != 0), \ + Return> \ + simd_cast(const SimdArrayType_ &x Vc_DUMMY_ARG3) \ + { \ + vc_debug_("simd_cast{offset, right, nofit}(", ")\n", offset, x); \ + return simd_cast_with_offset()>( \ + internal_data1(x)); \ + } \ + /* convert from left member of SimdArray */ \ + template \ + Vc_INTRINSIC Vc_CONST enable_if< \ + (N != M && /*offset * Return::Size < Common::left_size() &&*/ \ + offset != 0 && (offset + 1) * Return::Size <= Common::left_size()), \ + Return> \ + simd_cast(const SimdArrayType_ &x Vc_DUMMY_ARG4) \ + { \ + vc_debug_("simd_cast{offset, left}(", ")\n", offset, x); \ + return simd_cast(internal_data0(x)); \ + } \ + /* fallback to copying scalars */ \ + template \ + Vc_INTRINSIC Vc_CONST \ + enable_if<(N != M && (offset * Return::Size < Common::left_size()) && \ + offset != 0 && (offset + 1) * Return::Size > Common::left_size()), \ + Return> \ + simd_cast(const SimdArrayType_ &x Vc_DUMMY_ARG5) \ + { \ + vc_debug_("simd_cast{offset, copy scalars}(", ")\n", offset, x); \ + using R = typename Return::EntryType; \ + Return r = Return::Zero(); \ + for (std::size_t i = offset * Return::Size; \ + i < std::min(N, (offset + 1) * Return::Size); ++i) { \ + r[i - offset * Return::Size] = static_cast(x[i]); \ + } \ + return r; \ + } \ + Vc_NOTHING_EXPECTING_SEMICOLON 
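The overload set generated by this macro is partitioned with mutually exclusive enable_if conditions on the non-type offset parameter, so exactly one overload is viable for any given (Return, offset, argument) combination. The following sketch shows the same dispatch idea in isolation; it is not Vc code: chunk_cast is an invented name, std::array stands in for SimdArray, and only the scalar-copy fallback path is mirrored, not the vectorized ones.

\code
#include <array>
#include <cstddef>
#include <type_traits>

// offset == 0: copy the leading Ret::size() elements
template <class Ret, std::size_t Offset, class From>
std::enable_if_t<(Offset == 0), Ret> chunk_cast(const From &x)
{
    Ret r{};
    for (std::size_t i = 0; i < r.size(); ++i) r[i] = x[i];
    return r;
}

// offset != 0: copy element-wise starting at Offset * Ret::size(),
// the moral equivalent of the "fallback to copying scalars" overload above
template <class Ret, std::size_t Offset, class From>
std::enable_if_t<(Offset != 0), Ret> chunk_cast(const From &x)
{
    Ret r{};
    for (std::size_t i = 0; i < r.size(); ++i) r[i] = x[i + Offset * r.size()];
    return r;
}

int main()
{
    std::array<float, 8> big = {{0.f, 1.f, 2.f, 3.f, 4.f, 5.f, 6.f, 7.f}};
    auto lo = chunk_cast<std::array<float, 4>, 0>(big);  // {0, 1, 2, 3}
    auto hi = chunk_cast<std::array<float, 4>, 1>(big);  // {4, 5, 6, 7}
    (void)lo;
    (void)hi;
}
\endcode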
+Vc_SIMDARRAY_CASTS(SimdArray); +Vc_SIMDARRAY_CASTS(SimdMaskArray); +#undef Vc_SIMDARRAY_CASTS +// simd_cast_drop_arguments (definitions) {{{2 +template +Vc_INTRINSIC Vc_CONST Return simd_cast_drop_arguments(From x) +{ + return simd_cast(x); +} +template +Vc_INTRINSIC Vc_CONST + enable_if<(are_all_types_equal::value && + sizeof...(Froms) * first_type_of::Size < Return::Size), + Return> + simd_cast_drop_arguments(Froms... xs, first_type_of x) +{ + return simd_cast(xs..., x); +} +// The following function can be implemented without the sizeof...(From) overload. +// However, ICC has a bug (Premier Issue #6000116338) which leads to an ICE. Splitting the +// function in two works around the issue. +template +Vc_INTRINSIC Vc_CONST enable_if< + (are_all_types_equal::value && + (1 + sizeof...(Froms)) * From::Size >= Return::Size && sizeof...(Froms) != 0), + Return> +simd_cast_drop_arguments(Froms... xs, From x, From) +{ + return simd_cast_drop_arguments(xs..., x); +} +template +Vc_INTRINSIC Vc_CONST + enable_if<(are_all_types_equal::value && From::Size >= Return::Size), Return> + simd_cast_drop_arguments(From x, From) +{ + return simd_cast_drop_arguments(x); +} + +// simd_cast_with_offset (definitions) {{{2 + template + Vc_INTRINSIC Vc_CONST + enable_if<(From::Size > offset && offset > 0 && offset % Return::Size == 0), + Return> simd_cast_with_offset(const From &x) +{ + return simd_cast(x); +} +template +Vc_INTRINSIC Vc_CONST + enable_if<(From::Size > offset && offset > 0 && offset % Return::Size != 0 && + ((Traits::isSimdArray::value && + !Traits::isAtomicSimdArray::value) || + (Traits::isSimdMaskArray::value && + !Traits::isAtomicSimdMaskArray::value))), + Return> + simd_cast_with_offset(const From &x) +{ + using R0 = typename Return::storage_type0; + using R1 = typename Return::storage_type1; + return {simd_cast_with_offset(x), + simd_cast_with_offset(x)}; +} +template +Vc_INTRINSIC Vc_CONST + enable_if<(From::Size > offset && offset > 0 && offset % Return::Size != 0 && + ((Traits::isSimdArray::value && + Traits::isAtomicSimdArray::value) || + (Traits::isSimdMaskArray::value && + Traits::isAtomicSimdMaskArray::value))), + Return> + simd_cast_with_offset(const From &x) +{ + return simd_cast(x.shifted(offset % Return::Size)); +} +template +Vc_INTRINSIC Vc_CONST + enable_if<(are_all_types_equal::value && offset == 0), Return> + simd_cast_with_offset(const From &x, const Froms &... xs) +{ + return simd_cast(x, xs...); +} + +// simd_cast_without_last (definition) {{{2 +template +Vc_INTRINSIC Vc_CONST Return simd_cast_without_last(const From &... xs, const T &) +{ + return simd_cast(xs...); +} + +// simd_cast_interleaved_argument_order (definitions) {{{2 + +#ifdef Vc_MSVC +// MSVC doesn't see that the Ts pack below can be empty and thus complains when extract_interleaved +// is called with only 2 arguments. These overloads here are *INCORRECT standard C++*, but they make +// MSVC do the right thing. +template +Vc_INTRINSIC Vc_CONST enable_if<(I == 0), T0> extract_interleaved(const T0 &a0, const T0 &) +{ + return a0; +} +template +Vc_INTRINSIC Vc_CONST enable_if<(I == 1), T0> extract_interleaved(const T0 &, const T0 &b0) +{ + return b0; +} +#endif // Vc_MSVC + +/// \internal returns the first argument +template +Vc_INTRINSIC Vc_CONST enable_if<(I == 0), T0> extract_interleaved(const T0 &a0, + const Ts &..., + const T0 &, + const Ts &...) 
+{ + return a0; +} +/// \internal returns the center argument +template +Vc_INTRINSIC Vc_CONST enable_if<(I == 1), T0> extract_interleaved(const T0 &, + const Ts &..., + const T0 &b0, + const Ts &...) +{ + return b0; +} +/// \internal drops the first and center arguments and recurses +template +Vc_INTRINSIC Vc_CONST enable_if<(I > 1), T0> extract_interleaved(const T0 &, + const Ts &... a, + const T0 &, + const Ts &... b) +{ + return extract_interleaved(a..., b...); +} +/// \internal calls simd_cast with correct argument order thanks to extract_interleaved +template +Vc_INTRINSIC Vc_CONST Return + simd_cast_interleaved_argument_order_1(index_sequence, const Ts &... a, + const Ts &... b) +{ + return simd_cast(extract_interleaved(a..., b...)...); +} +/// \internal constructs the necessary index_sequence to pass it to +/// simd_cast_interleaved_argument_order_1 +template +Vc_INTRINSIC Vc_CONST Return + simd_cast_interleaved_argument_order(const Ts &... a, const Ts &... b) +{ + using seq = make_index_sequence; + return simd_cast_interleaved_argument_order_1(seq(), a..., b...); +} + +// conditional_assign {{{1 +#define Vc_CONDITIONAL_ASSIGN(name_, op_) \ + template \ + Vc_INTRINSIC enable_if conditional_assign( \ + SimdArray &lhs, M &&mask, U &&rhs) \ + { \ + lhs(mask) op_ rhs; \ + } \ + Vc_NOTHING_EXPECTING_SEMICOLON +Vc_CONDITIONAL_ASSIGN( Assign, =); +Vc_CONDITIONAL_ASSIGN( PlusAssign, +=); +Vc_CONDITIONAL_ASSIGN( MinusAssign, -=); +Vc_CONDITIONAL_ASSIGN( MultiplyAssign, *=); +Vc_CONDITIONAL_ASSIGN( DivideAssign, /=); +Vc_CONDITIONAL_ASSIGN( RemainderAssign, %=); +Vc_CONDITIONAL_ASSIGN( XorAssign, ^=); +Vc_CONDITIONAL_ASSIGN( AndAssign, &=); +Vc_CONDITIONAL_ASSIGN( OrAssign, |=); +Vc_CONDITIONAL_ASSIGN( LeftShiftAssign,<<=); +Vc_CONDITIONAL_ASSIGN(RightShiftAssign,>>=); +#undef Vc_CONDITIONAL_ASSIGN + +#define Vc_CONDITIONAL_ASSIGN(name_, expr_) \ + template \ + Vc_INTRINSIC enable_if> \ + conditional_assign(SimdArray &lhs, M &&mask) \ + { \ + return expr_; \ + } \ + Vc_NOTHING_EXPECTING_SEMICOLON +Vc_CONDITIONAL_ASSIGN(PostIncrement, lhs(mask)++); +Vc_CONDITIONAL_ASSIGN( PreIncrement, ++lhs(mask)); +Vc_CONDITIONAL_ASSIGN(PostDecrement, lhs(mask)--); +Vc_CONDITIONAL_ASSIGN( PreDecrement, --lhs(mask)); +#undef Vc_CONDITIONAL_ASSIGN +// transpose_impl {{{1 +namespace Common +{ +template +inline void transpose_impl( + TransposeTag<4, 4>, SimdArray *Vc_RESTRICT r[], + const TransposeProxy, SimdArray, + SimdArray, SimdArray> &proxy) +{ + V *Vc_RESTRICT r2[4] = {&internal_data(*r[0]), &internal_data(*r[1]), + &internal_data(*r[2]), &internal_data(*r[3])}; + transpose_impl(TransposeTag<4, 4>(), &r2[0], + TransposeProxy{internal_data(std::get<0>(proxy.in)), + internal_data(std::get<1>(proxy.in)), + internal_data(std::get<2>(proxy.in)), + internal_data(std::get<3>(proxy.in))}); +} + +template +inline void transpose_impl( + TransposeTag<2, 4>, SimdArray *Vc_RESTRICT r[], + const TransposeProxy, SimdArray, + SimdArray, SimdArray> &proxy) +{ + auto &lo = *r[0]; + auto &hi = *r[1]; + internal_data0(internal_data0(lo)) = internal_data0(std::get<0>(proxy.in)); + internal_data1(internal_data0(lo)) = internal_data0(std::get<1>(proxy.in)); + internal_data0(internal_data1(lo)) = internal_data0(std::get<2>(proxy.in)); + internal_data1(internal_data1(lo)) = internal_data0(std::get<3>(proxy.in)); + internal_data0(internal_data0(hi)) = internal_data1(std::get<0>(proxy.in)); + internal_data1(internal_data0(hi)) = internal_data1(std::get<1>(proxy.in)); + internal_data0(internal_data1(hi)) = 
internal_data1(std::get<2>(proxy.in)); + internal_data1(internal_data1(hi)) = internal_data1(std::get<3>(proxy.in)); +} + +template +inline void transpose_impl( + TransposeTag<4, 4>, SimdArray *Vc_RESTRICT r[], + const TransposeProxy, SimdArray, + SimdArray, SimdArray> &proxy) +{ + V *Vc_RESTRICT r2[4] = {&internal_data(*r[0]), &internal_data(*r[1]), + &internal_data(*r[2]), &internal_data(*r[3])}; + transpose_impl(TransposeTag<4, 4>(), &r2[0], + TransposeProxy{internal_data(std::get<0>(proxy.in)), + internal_data(std::get<1>(proxy.in)), + internal_data(std::get<2>(proxy.in)), + internal_data(std::get<3>(proxy.in))}); +} + +template +inline void transpose_impl( + TransposeTag<4, 4>, SimdArray *Vc_RESTRICT r[], + const TransposeProxy, SimdArray, + SimdArray, SimdArray> &proxy) +{ + SimdArray *Vc_RESTRICT r0[4 / 2] = {r[0], r[1]}; + SimdArray *Vc_RESTRICT r1[4 / 2] = {r[2], r[3]}; + using H = SimdArray; + transpose_impl(TransposeTag<2, 4>(), &r0[0], + TransposeProxy{internal_data0(std::get<0>(proxy.in)), + internal_data0(std::get<1>(proxy.in)), + internal_data0(std::get<2>(proxy.in)), + internal_data0(std::get<3>(proxy.in))}); + transpose_impl(TransposeTag<2, 4>(), &r1[0], + TransposeProxy{internal_data1(std::get<0>(proxy.in)), + internal_data1(std::get<1>(proxy.in)), + internal_data1(std::get<2>(proxy.in)), + internal_data1(std::get<3>(proxy.in))}); +} + +/* TODO: +template +inline enable_if<(N > VSize), void> transpose_impl( + std::array * Vc_RESTRICT, 4> & r, + const TransposeProxy, SimdArray, + SimdArray, SimdArray> &proxy) +{ + typedef SimdArray SA; + std::array r0 = { + {&internal_data0(*r[0]), &internal_data0(*r[1]), &internal_data0(*r[2]), + &internal_data0(*r[3])}}; + transpose_impl( + r0, TransposeProxy{ + internal_data0(std::get<0>(proxy.in)), + internal_data0(std::get<1>(proxy.in)), + internal_data0(std::get<2>(proxy.in)), + internal_data0(std::get<3>(proxy.in))}); + + std::array r1 = { + {&internal_data1(*r[0]), &internal_data1(*r[1]), &internal_data1(*r[2]), + &internal_data1(*r[3])}}; + transpose_impl( + r1, TransposeProxy{ + internal_data1(std::get<0>(proxy.in)), + internal_data1(std::get<1>(proxy.in)), + internal_data1(std::get<2>(proxy.in)), + internal_data1(std::get<3>(proxy.in))}); +} +*/ +} // namespace Common + +// Traits static assertions {{{1 +static_assert(Traits::has_no_allocated_data &>::value, ""); +static_assert(Traits::has_no_allocated_data>::value, ""); +static_assert(Traits::has_no_allocated_data &>::value, ""); +static_assert(Traits::has_no_allocated_data>::value, ""); +static_assert(Traits::has_no_allocated_data &>::value, ""); +static_assert(Traits::has_no_allocated_data>::value, ""); +static_assert(Traits::has_no_allocated_data>::value, ""); +static_assert(Traits::has_no_allocated_data &&>::value, ""); +// }}}1 +/// @} + +} // namespace Vc_VERSIONED_NAMESPACE + +// numeric_limits {{{1 +namespace std +{ +template +struct numeric_limits> : public numeric_limits { +private: + using R = Vc::SimdArray; + +public: + static Vc_ALWAYS_INLINE Vc_CONST R max() noexcept { return numeric_limits::max(); } + static Vc_ALWAYS_INLINE Vc_CONST R min() noexcept { return numeric_limits::min(); } + static Vc_ALWAYS_INLINE Vc_CONST R lowest() noexcept + { + return numeric_limits::lowest(); + } + static Vc_ALWAYS_INLINE Vc_CONST R epsilon() noexcept + { + return numeric_limits::epsilon(); + } + static Vc_ALWAYS_INLINE Vc_CONST R round_error() noexcept + { + return numeric_limits::round_error(); + } + static Vc_ALWAYS_INLINE Vc_CONST R infinity() noexcept + { + return 
numeric_limits::infinity(); + } + static Vc_ALWAYS_INLINE Vc_CONST R quiet_NaN() noexcept + { + return numeric_limits::quiet_NaN(); + } + static Vc_ALWAYS_INLINE Vc_CONST R signaling_NaN() noexcept + { + return numeric_limits::signaling_NaN(); + } + static Vc_ALWAYS_INLINE Vc_CONST R denorm_min() noexcept + { + return numeric_limits::denorm_min(); + } +}; +} // namespace std +//}}}1 + +#endif // VC_COMMON_SIMDARRAY_H_ + +// vim: foldmethod=marker diff -Nru vc-0.7.4/common/simdarrayhelper.h vc-1.3.0/common/simdarrayhelper.h --- vc-0.7.4/common/simdarrayhelper.h 1969-12-31 18:00:00.000000000 -0600 +++ vc-1.3.0/common/simdarrayhelper.h 2016-10-27 02:05:02.000000000 -0500 @@ -0,0 +1,563 @@ +/* This file is part of the Vc library. {{{ +Copyright © 2013-2015 Matthias Kretz + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the names of contributing organizations nor the + names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY +DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +}}}*/ + +#ifndef VC_COMMON_SIMDARRAYHELPER_H_ +#define VC_COMMON_SIMDARRAYHELPER_H_ + +#include "macros.h" + +namespace Vc_VERSIONED_NAMESPACE +{ +namespace Common +{ + +/// \addtogroup SimdArray +/// @{ + +namespace Operations/*{{{*/ +{ +struct tag {}; +#define Vc_DEFINE_OPERATION(name_) \ + struct name_ : public tag { \ + template \ + Vc_INTRINSIC void operator()(V &v, Args &&... args) \ + { \ + v.name_(std::forward(args)...); \ + } \ + } +Vc_DEFINE_OPERATION(gather); +Vc_DEFINE_OPERATION(scatter); +Vc_DEFINE_OPERATION(load); +Vc_DEFINE_OPERATION(store); +Vc_DEFINE_OPERATION(setZero); +Vc_DEFINE_OPERATION(setZeroInverted); +Vc_DEFINE_OPERATION(assign); +#undef Vc_DEFINE_OPERATION +#define Vc_DEFINE_OPERATION(name_, code_) \ + struct name_ : public tag { \ + template Vc_INTRINSIC void operator()(V &v) { code_; } \ + } +Vc_DEFINE_OPERATION(increment, ++(v)); +Vc_DEFINE_OPERATION(decrement, --(v)); +Vc_DEFINE_OPERATION(random, v = V::Random()); +#undef Vc_DEFINE_OPERATION +#define Vc_DEFINE_OPERATION_FORWARD(name_) \ + struct Forward_##name_ : public tag \ + { \ + template ()...))> \ + Vc_INTRINSIC void operator()(decltype(name_(std::declval()...)) &v, \ + Args &&... args) \ + { \ + v = name_(std::forward(args)...); \ + } \ + template ()...))> \ + Vc_INTRINSIC void operator()(std::nullptr_t, Args && ... 
args) \ + { \ + name_(std::forward(args)...); \ + } \ + } +Vc_DEFINE_OPERATION_FORWARD(abs); +Vc_DEFINE_OPERATION_FORWARD(asin); +Vc_DEFINE_OPERATION_FORWARD(atan); +Vc_DEFINE_OPERATION_FORWARD(atan2); +Vc_DEFINE_OPERATION_FORWARD(cos); +Vc_DEFINE_OPERATION_FORWARD(ceil); +Vc_DEFINE_OPERATION_FORWARD(copysign); +Vc_DEFINE_OPERATION_FORWARD(exp); +Vc_DEFINE_OPERATION_FORWARD(exponent); +Vc_DEFINE_OPERATION_FORWARD(fma); +Vc_DEFINE_OPERATION_FORWARD(floor); +Vc_DEFINE_OPERATION_FORWARD(frexp); +Vc_DEFINE_OPERATION_FORWARD(isfinite); +Vc_DEFINE_OPERATION_FORWARD(isinf); +Vc_DEFINE_OPERATION_FORWARD(isnan); +Vc_DEFINE_OPERATION_FORWARD(isnegative); +Vc_DEFINE_OPERATION_FORWARD(ldexp); +Vc_DEFINE_OPERATION_FORWARD(log); +Vc_DEFINE_OPERATION_FORWARD(log10); +Vc_DEFINE_OPERATION_FORWARD(log2); +Vc_DEFINE_OPERATION_FORWARD(reciprocal); +Vc_DEFINE_OPERATION_FORWARD(round); +Vc_DEFINE_OPERATION_FORWARD(rsqrt); +Vc_DEFINE_OPERATION_FORWARD(sin); +Vc_DEFINE_OPERATION_FORWARD(sincos); +Vc_DEFINE_OPERATION_FORWARD(sqrt); +Vc_DEFINE_OPERATION_FORWARD(trunc); +Vc_DEFINE_OPERATION_FORWARD(min); +Vc_DEFINE_OPERATION_FORWARD(max); +#undef Vc_DEFINE_OPERATION_FORWARD +template using is_operation = std::is_base_of; +} // namespace Operations }}} + +/** + * \internal + * Helper type to statically communicate segmentation of one vector register into 2^n parts + * (Pieces). + */ +template struct Segment/*{{{*/ +{ + static_assert(Index_ < Pieces_, "You found a bug in Vc. Please report."); + + using type = T_; + using type_decayed = typename std::decay::type; + static constexpr std::size_t Pieces = Pieces_; + static constexpr std::size_t Index = Index_; + using simd_array_type = SimdArray< + typename std::conditional::value, + typename type_decayed::EntryType, float>::type, + type_decayed::Size / Pieces>; + + type data; + + static constexpr std::size_t EntryOffset = Index * type_decayed::Size / Pieces; + + // no non-const operator[] needed (and problematic because of non-movable ElementReference) + decltype(std::declval()[0]) operator[](size_t i) const { return data[i + EntryOffset]; } + + simd_array_type asSimdArray() const + { + return simd_cast(data); + } +};/*}}}*/ + +//Segment specialization {{{ +template +struct Segment { + static_assert(Index_ < Pieces_, "You found a bug in Vc. Please report."); + + using type = T_ *; + using type_decayed = typename std::decay::type; + static constexpr size_t Pieces = Pieces_; + static constexpr size_t Index = Index_; + using simd_array_type = SimdArray< + typename std::conditional::value, + typename type_decayed::VectorEntryType, float>::type, + type_decayed::Size / Pieces> *; + + type data; + + static constexpr std::size_t EntryOffset = Index * type_decayed::size() / Pieces; + + simd_array_type asSimdArray() const + { + return reinterpret_cast< +#ifdef Vc_GCC + // GCC might ICE if this type is declared with may_alias. If it doesn't + // ICE it warns about ignoring the attribute. + typename std::remove_pointer::type +#else + MayAlias::type> +#endif + *>(data) + + Index; + } + + //decltype(std::declval()[0]) operator[](size_t i) { return data[i + EntryOffset]; } + //decltype(std::declval()[0]) operator[](size_t i) const { return data[i + EntryOffset]; } +};/*}}}*/ + +/** \internal + Template class that is used to attach an offset value to an existing type. It is used + for IndexesFromZero construction in SimdArray. 
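The same idea in a tiny, self-contained form (invented names, not Vc code): a tag type that carries a compile-time offset lets the second half of a split container continue the 0, 1, 2, ... sequence where the first half stopped.

\code
#include <cstddef>
#include <cstdio>

struct FromZeroTag {};                                       // plays the role of IndexesFromZero
template <class T, std::size_t Offset> struct WithOffset {}; // plays the role of AddOffset

template <std::size_t N> struct Half {
    int v[N];
    explicit Half(FromZeroTag)
    {
        for (std::size_t i = 0; i < N; ++i) v[i] = int(i);
    }
    template <std::size_t O> explicit Half(WithOffset<FromZeroTag, O>)
    {
        for (std::size_t i = 0; i < N; ++i) v[i] = int(O + i);
    }
};

template <std::size_t N> struct Pair {
    Half<N / 2> data0;
    Half<N / 2> data1;
    // data1 continues counting at N / 2 instead of restarting at 0
    explicit Pair(FromZeroTag t) : data0(t), data1(WithOffset<FromZeroTag, N / 2>{}) {}
};

int main()
{
    Pair<8> p{FromZeroTag{}};                            // data0 = 0..3, data1 = 4..7
    std::printf("%d %d\n", p.data0.v[0], p.data1.v[0]);  // prints "0 4"
}
\endcode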
The \c data1 constructor needs to know + that the IndexesFromZero constructor requires an offset so that the whole data is + constructed as a correct sequence from `0` to `Size - 1`. + + \tparam T The original type that needs the offset attached. + \tparam Offset An integral value that determines the offset in the complete SimdArray. + */ +template struct AddOffset +{ + constexpr AddOffset() = default; +}; + +// class Split {{{1 +/** \internal + Helper type with static functions to generically adjust arguments for the \c data0 and + \c data1 members of SimdArray and SimdMaskArray. + + \tparam secondOffset The offset in number of elements that \c data1 has in the SimdArray + / SimdMaskArray. This is essentially equal to the number of + elements in \c data0. + */ +template class Split +{ + static Vc_INTRINSIC AddOffset + hiImpl(VectorSpecialInitializerIndexesFromZero) + { + return {}; + } + template + static Vc_INTRINSIC + AddOffset + hiImpl(AddOffset) + { + return {}; + } + + // split composite SimdArray + template > + static Vc_INTRINSIC auto loImpl(const SimdArray &x) + -> decltype(internal_data0(x)) + { + return internal_data0(x); + } + template > + static Vc_INTRINSIC auto hiImpl(const SimdArray &x) + -> decltype(internal_data1(x)) + { + return internal_data1(x); + } + template > + static Vc_INTRINSIC auto loImpl(SimdArray *x) + -> decltype(&internal_data0(*x)) + { + return &internal_data0(*x); + } + template > + static Vc_INTRINSIC auto hiImpl(SimdArray *x) + -> decltype(&internal_data1(*x)) + { + return &internal_data1(*x); + } + + // split atomic SimdArray + template + static Vc_INTRINSIC Segment loImpl(const SimdArray &x) + { + return {internal_data(x)}; + } + template + static Vc_INTRINSIC Segment hiImpl(const SimdArray &x) + { + return {internal_data(x)}; + } + template + static Vc_INTRINSIC Segment loImpl(SimdArray *x) + { + return {&internal_data(*x)}; + } + template + static Vc_INTRINSIC Segment hiImpl(SimdArray *x) + { + return {&internal_data(*x)}; + } + + // split composite SimdMaskArray + template + static Vc_INTRINSIC auto loImpl(const SimdMaskArray &x) -> decltype(internal_data0(x)) + { + return internal_data0(x); + } + template + static Vc_INTRINSIC auto hiImpl(const SimdMaskArray &x) -> decltype(internal_data1(x)) + { + return internal_data1(x); + } + + template + static Vc_INTRINSIC Segment::mask_type, 2, 0> loImpl( + const SimdMaskArray &x) + { + return {internal_data(x)}; + } + template + static Vc_INTRINSIC Segment::mask_type, 2, 1> hiImpl( + const SimdMaskArray &x) + { + return {internal_data(x)}; + } + + // split Vector and Mask + template + static constexpr bool is_vector_or_mask(){ + return (Traits::is_simd_vector::value && !Traits::isSimdArray::value) || + (Traits::is_simd_mask::value && !Traits::isSimdMaskArray::value); + } + template + static Vc_INTRINSIC Segment loImpl(V &&x, enable_if()> = nullarg) + { + return {std::forward(x)}; + } + template + static Vc_INTRINSIC Segment hiImpl(V &&x, enable_if()> = nullarg) + { + return {std::forward(x)}; + } + + // generically split Segments + template + static Vc_INTRINSIC Segment loImpl( + const Segment &x) + { + return {x.data}; + } + template + static Vc_INTRINSIC Segment hiImpl( + const Segment &x) + { + return {x.data}; + } + + /** \internal + * \name Checks for existence of \c loImpl / \c hiImpl + */ + //@{ + template ()))> + static std::true_type have_lo_impl(int); + template static std::false_type have_lo_impl(float); + template static constexpr bool have_lo_impl() + { + return 
decltype(have_lo_impl(1))::value; + } + + template ()))> + static std::true_type have_hi_impl(int); + template static std::false_type have_hi_impl(float); + template static constexpr bool have_hi_impl() + { + return decltype(have_hi_impl(1))::value; + } + //@} + +public: + /** \internal + * \name with Operations tag + * + * These functions don't overload on the data parameter. The first parameter (the tag) clearly + * identifies the intended function. + */ + //@{ + template + static Vc_INTRINSIC const U *lo(Operations::gather, const U *ptr) + { + return ptr; + } + template + static Vc_INTRINSIC const U *hi(Operations::gather, const U *ptr) + { + return ptr + secondOffset; + } + template ::value>> + static Vc_ALWAYS_INLINE decltype(loImpl(std::declval())) + lo(Operations::gather, U &&x) + { + return loImpl(std::forward(x)); + } + template ::value>> + static Vc_ALWAYS_INLINE decltype(hiImpl(std::declval())) + hi(Operations::gather, U &&x) + { + return hiImpl(std::forward(x)); + } + template + static Vc_INTRINSIC const U *lo(Operations::scatter, const U *ptr) + { + return ptr; + } + template + static Vc_INTRINSIC const U *hi(Operations::scatter, const U *ptr) + { + return ptr + secondOffset; + } + //@} + + /** \internal + \name without Operations tag + + These functions are not clearly tagged as to where they are used and therefore + behave differently depending on the type of the parameter. Different behavior is + implemented via overloads of \c loImpl and \c hiImpl. They are not overloads of \c + lo and \c hi directly because it's hard to compete against a universal reference + (i.e. an overload for `int` requires overloads for `int &`, `const int &`, and `int + &&`. If one of them were missing `U &&` would win in overload resolution). + */ + //@{ + template + static Vc_ALWAYS_INLINE decltype(loImpl(std::declval())) lo(U &&x) + { + return loImpl(std::forward(x)); + } + template + static Vc_ALWAYS_INLINE decltype(hiImpl(std::declval())) hi(U &&x) + { + return hiImpl(std::forward(x)); + } + + template + static Vc_ALWAYS_INLINE enable_if(), U> lo(U &&x) + { + return std::forward(x); + } + template + static Vc_ALWAYS_INLINE enable_if(), U> hi(U &&x) + { + return std::forward(x); + } + //@} +}; + +// actual_value {{{1 +template +static Vc_INTRINSIC const V &actual_value(Op, const SimdArray &x) +{ + return internal_data(x); +} +template +static Vc_INTRINSIC V *actual_value(Op, SimdArray *x) +{ + return &internal_data(*x); +} +template +static Vc_INTRINSIC typename Segment::simd_array_type actual_value( + Op, Segment &&seg) +{ + return seg.asSimdArray(); +} + +template +static Vc_INTRINSIC const typename V::Mask &actual_value(Op, const SimdMaskArray &x) +{ + return internal_data(x); +} +template +static Vc_INTRINSIC typename V::Mask *actual_value(Op, SimdMaskArray *x) +{ + return &internal_data(*x); +} + +// unpackArgumentsAuto {{{1 +/**\internal + * \name unpackArgumentsAuto + * + * Search for the right amount of SimdArray "unpacking" (via actual_value) to match the + * interface of the function to be called. + * + * The compiler can figure this out for us thanks to SFINAE. The approach is to have a + * number \c I that determines the indexes of the arguments to be transformed via + * actual_value. Each bit of \c I identifies an argument. unpackArgumentsAuto starts the + * recursion with `I = 0`, i.e. no actual_value transformations. If the overload calling + * \c op is unavailable due to a substitution failure \c I is incremented and the function + * recurses. 
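A stripped-down, self-contained sketch of this search idiom (invented names; Vc's real machinery additionally threads a return slot through and handles arbitrary arity): the int-parameter overload wins whenever its decltype return type is well-formed, and the float-parameter overload catches the substitution failure and retries with the next bit pattern of I.

\code
#include <cstddef>
#include <cstdio>
#include <type_traits>

struct Wrapped { int value; };
inline int unwrap(const Wrapped &w) { return w.value; }   // plays the role of actual_value

// op only accepts plain ints, so wrapped arguments must be unwrapped first
inline void op(int a, int b) { std::printf("%d %d\n", a, b); }

// unwrap the argument if the selector says so, otherwise forward it unchanged
template <class T>
auto maybe_unwrap(std::true_type, const T &x) -> decltype(unwrap(x)) { return unwrap(x); }
template <class T>
const T &maybe_unwrap(std::false_type, const T &x) { return x; }

// preferred overload: viable only if op can be called with this bit pattern of I
template <std::size_t I, class A, class B>
auto try_call(int, const A &a, const B &b)
    -> decltype(op(maybe_unwrap(std::integral_constant<bool, (I & 1) != 0>(), a),
                   maybe_unwrap(std::integral_constant<bool, (I & 2) != 0>(), b)))
{
    op(maybe_unwrap(std::integral_constant<bool, (I & 1) != 0>(), a),
       maybe_unwrap(std::integral_constant<bool, (I & 2) != 0>(), b));
}

// fallback: substitution above failed, so recurse with the next bit pattern
template <std::size_t I, class A, class B>
void try_call(float, const A &a, const B &b)
{
    static_assert(I < 4, "no combination of unwrapped arguments works");
    try_call<I + 1>(int(), a, b);
}

int main()
{
    try_call<0>(int(), Wrapped{1}, 2);  // settles on I == 1 and prints "1 2"
}
\endcode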
Otherwise there are two unpackArgumentsAutoImpl functions in the overload + * set. The first argument (\c int / \c float) leads to a preference of the function + * calling \c op, thus ending the recursion. + */ +///@{ + +///\internal transforms \p arg via actual_value +template +Vc_INTRINSIC decltype(actual_value(std::declval(), std::declval())) +conditionalUnpack(std::true_type, Op op, Arg &&arg) +{ + return actual_value(op, std::forward(arg)); +} +///\internal forwards \p arg to its return value +template +Vc_INTRINSIC Arg conditionalUnpack(std::false_type, Op, Arg &&arg) +{ + return std::forward(arg); +} + +///\internal true-/false_type that selects whether the argument with index B should be unpacked +template +struct selectorType : public std::integral_constant { +}; + +///\internal ends the recursion, transforms arguments, and calls \p op +template +Vc_INTRINSIC decltype(std::declval()(std::declval(), + conditionalUnpack(selectorType(), + std::declval(), + std::declval())...)) +unpackArgumentsAutoImpl(int, index_sequence, Op op, R &&r, Args &&... args) +{ + op(std::forward(r), + conditionalUnpack(selectorType(), op, std::forward(args))...); +} + +///\internal the current actual_value calls don't work: recurse to I + 1 +template +Vc_INTRINSIC enable_if<(I <= (size_t(1) << sizeof...(Args))), void> unpackArgumentsAutoImpl( + float, index_sequence is, Op op, R &&r, Args &&... args) +{ + // if R is nullptr_t then the return type cannot enforce that actually any unwrapping + // of the SimdArray types happens. Thus, you could get an endless loop of the + // SimdArray function overload calling itself, if the index goes up to (1 << + // sizeof...(Args)) - 1 (which means no argument transformations via actual_value). + static_assert( + I < (1 << sizeof...(Args)) - (std::is_same::value ? 1 : 0), + "Vc or compiler bug. Please report. Failed to find a combination of " + "actual_value(arg) transformations that allows calling Op."); + unpackArgumentsAutoImpl(int(), is, op, std::forward(r), + std::forward(args)...); +} + +#ifdef Vc_ICC +template struct IccWorkaround { + using type = void; +}; +template struct IccWorkaround<2, Ts...> { + using type = typename std::remove_pointer>::type>::type>::type; +}; +#endif + +///\internal The interface to start the machinery. +template +Vc_INTRINSIC void unpackArgumentsAuto(Op op, R &&r, Args &&... args) +{ +#ifdef Vc_ICC + // ugly hacky workaround for ICC: + // The compiler fails to do SFINAE right on recursion. We have to hit the right + // recursionStart number from the start. + const int recursionStart = + Traits::isSimdArray< + typename IccWorkaround::type>::value && + (std::is_same::value || + std::is_same::value) + ? 2 + : 0; +#else + const int recursionStart = 0; +#endif + unpackArgumentsAutoImpl( + int(), make_index_sequence(), op, std::forward(r), + std::forward(args)...); +} +///@} + +//}}}1 +///@} +} // namespace Common +} // namespace Vc + +#endif // VC_COMMON_SIMDARRAYHELPER_H_ + +// vim: foldmethod=marker diff -Nru vc-0.7.4/common/simd_cast_caller.tcc vc-1.3.0/common/simd_cast_caller.tcc --- vc-0.7.4/common/simd_cast_caller.tcc 1969-12-31 18:00:00.000000000 -0600 +++ vc-1.3.0/common/simd_cast_caller.tcc 2016-10-27 02:05:02.000000000 -0500 @@ -0,0 +1,85 @@ +/* This file is part of the Vc library. 
{{{ +Copyright © 2014-2015 Matthias Kretz + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the names of contributing organizations nor the + names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY +DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +}}}*/ + +#ifndef VC_COMMON_SIMD_CAST_CALLER_TCC_ +#define VC_COMMON_SIMD_CAST_CALLER_TCC_ + +#include "macros.h" +namespace Vc_VERSIONED_NAMESPACE { + +template +template +Vc_INTRINSIC SimdMaskArray::SimdMaskArray( + const SimdMaskArray &x, + enable_if) + : data(simd_cast(internal_data(x))) +{ +} +template +template +Vc_INTRINSIC SimdMaskArray::SimdMaskArray( + const SimdMaskArray &x, + enable_if<(N > V::Size && N <= 2 * V::Size)>) + : data(simd_cast(internal_data(internal_data0(x)), internal_data(internal_data1(x)))) +{ +} +template +template +Vc_INTRINSIC SimdMaskArray::SimdMaskArray( + const SimdMaskArray &x, + enable_if<(N > 2 * V::Size && N <= 4 * V::Size)>) + : data(simd_cast(internal_data(internal_data0(internal_data0(x))), + internal_data(internal_data1(internal_data0(x))), + internal_data(internal_data0(internal_data1(x))), + internal_data(internal_data1(internal_data1(x))))) +{ +} +// conversion from any Segment object (could be SimdMaskArray or Mask) +template +template +Vc_INTRINSIC SimdMaskArray::SimdMaskArray( + Common::Segment &&x, + enable_if::value == Size * Pieces>) + : data(simd_cast(x.data)) +{ +} +// conversion from Mask +template +template +Vc_INTRINSIC SimdMaskArray::SimdMaskArray( + M k, + enable_if<(Traits::is_simd_mask::value && !Traits::isSimdMaskArray::value && + Traits::simd_vector_size::value == Size)>) + : data(simd_cast(k)) +{ +} + +} + +#endif // VC_COMMON_SIMD_CAST_CALLER_TCC_ + +// vim: foldmethod=marker diff -Nru vc-0.7.4/common/simd_cast.h vc-1.3.0/common/simd_cast.h --- vc-0.7.4/common/simd_cast.h 1969-12-31 18:00:00.000000000 -0600 +++ vc-1.3.0/common/simd_cast.h 2016-10-27 02:05:02.000000000 -0500 @@ -0,0 +1,68 @@ +/* This file is part of the Vc library. {{{ +Copyright © 2014-2015 Matthias Kretz + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. 
+ * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the names of contributing organizations nor the + names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY +DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +}}}*/ + +#ifndef VC_COMMON_SIMD_CAST_H_ +#define VC_COMMON_SIMD_CAST_H_ + +#include +#include "macros.h" + +// declare a bogus simd_cast function template in the global namespace to enable ADL for +// simd_cast +template void simd_cast(); + +namespace Vc_VERSIONED_NAMESPACE +{ +/** + * Casts the argument \p x from type \p From to type \p To. + * + * This function implements the trivial case where \p To and \p From are the same type. + * + * \param x The object of type \p From to be converted to type \p To. + * \returns An object of type \p To with all vector components converted according to + * standard conversion behavior as mandated by the C++ standard for the + * underlying arithmetic types. + */ +template +Vc_INTRINSIC Vc_CONST To +simd_cast(From &&x, enable_if>::value> = nullarg) +{ + return std::forward(x); +} + +/** + * A cast from nothing results in default-initialization of \p To. + * + * This function can be useful in generic code where a parameter pack expands to nothing. + * + * \returns A zero-initialized object of type \p To. + */ +template Vc_INTRINSIC Vc_CONST To simd_cast() { return To(); } + +} // namespace Vc + +#endif // VC_COMMON_SIMD_CAST_H_ diff -Nru vc-0.7.4/common/simdize.h vc-1.3.0/common/simdize.h --- vc-0.7.4/common/simdize.h 1969-12-31 18:00:00.000000000 -0600 +++ vc-1.3.0/common/simdize.h 2016-10-27 02:05:02.000000000 -0500 @@ -0,0 +1,1809 @@ +/* This file is part of the Vc library. {{{ +Copyright © 2014-2015 Matthias Kretz + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the names of contributing organizations nor the + names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. 
+ +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL BE LIABLE FOR ANY +DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +}}}*/ + +#ifndef VC_COMMON_SIMDIZE_H_ +#define VC_COMMON_SIMDIZE_H_ + +#include +#include + +#include "macros.h" + +/*! +\addtogroup Simdize + +Automatic type vectorization. + +The simdize expression transforms the type \c T to a vectorized variant. This requires the type +\c T to be a class template instance. + +Example: +First, we declare a class template for a three-dimensional point. The template parameter \c T +determines the type of the members and is \c float in the scalar (classical) case. +\code +template struct PointTemplate +{ + T x, y, z; + + // Declares tuple_size and makes the members accessible via get(point), allowing + // the simdize implementation to convert between Point and PointV (see below). + Vc_SIMDIZE_INTERFACE((x, y, z)); + + PointTemplate(T xx, T yy, T zz) : x{xx}, y{yy}, z{zz} {}; + + // The following function will automatically be vectorized in the PointV type. + T distance_to_origin() const { return std::sqrt(x * x + y * y + z * z); } +}; +\endcode + +In the following we create a type alias for the scalar type, which simply means instantiating +\c PointTemplate with \c float. The resulting type can then be transformed with \ref simdize. +\code +using Point = PointTemplate; // A simple struct with three floats and two functions. +using PointV = Vc::simdize; // The vectorization of Point stores three float_v and thus + // float_v::size() Points. +\endcode + +The following shows a code example using the above \c Point and \c PointV types. +\code +PointV pv = Point{0.f, 1.f, 2.f}; // Constructs a PointV containing PointV::size() + // copies of Point{0, 1, 2}. +for (int i = 1; i < int(pv.size()); ++i) { + assign(pv, i, {i + 0.f, i + 1.f, i + 2.f}); +} + +const Vc::float_v l = pv.distance_to_origin(); +std::cout << l << '\n'; +// prints [2.23607, 3.74166, 5.38516, 7.07107, 8.77496, 10.4881, 12.2066, 13.9284] with +// float_v::size() == 8 + +const Point most_distant = extract(pv, (l.max() == l).firstOne()); +std::cout << '(' << most_distant.x << ", " << most_distant.y << ", " << most_distant.z << ")\n"; +// prints (7, 8, 9) with float_v::size() == 8 +\endcode + */ +namespace Vc_VERSIONED_NAMESPACE +{ +/**\internal + * \ingroup Simdize + * This namespace contains all the required code for implementing simdize. None of this + * code should be directly accessed by users, though the unit test for simdize + * certainly may look into some of the details if necessary. 
+ */ +namespace SimdizeDetail +{ +/** + * \addtogroup Simdize + * @{ + */ +using std::is_same; +using std::is_base_of; +using std::false_type; +using std::true_type; +using std::iterator_traits; +using std::conditional; +using std::size_t; +template +using conditional_t = typename conditional::type; + +/**\internal + * Typelist is a simple helper class for supporting multiple parameter packs in one class + * template. + */ +template struct Typelist; + +/**\internal + * The Category identifies how the type argument to simdize has to be transformed. + */ +enum class Category { + ///\internal No transformation + None, + ///\internal simple Vector transformation + ArithmeticVectorizable, + ///\internal transform an input iterator to return vectorized entries + InputIterator, + ///\internal transform a forward iterator to return vectorized entries + OutputIterator, + ///\internal transform an output iterator to return vectorized entries + ForwardIterator, + ///\internal transform a bidirectional iterator to return vectorized entries + BidirectionalIterator, + ///\internal transform a random access iterator to return vectorized entries + RandomAccessIterator, + ///\internal transform a class template recursively + ClassTemplate +}; + +/**\internal + * iteratorCategories(int()) returns whether iterator_traits::iterator_category is a + * valid type and whether it is derived from RandomAccessIterator or ForwardIterator. + */ +template +constexpr Category iteratorCategories(int, ItCat * = nullptr) +{ + return is_base_of::value + ? Category::RandomAccessIterator + : is_base_of::value + ? Category::BidirectionalIterator + : is_base_of::value + ? Category::ForwardIterator + : is_base_of::value + ? Category::OutputIterator + : is_base_of::value + ? Category::InputIterator + : Category::None; +} +/**\internal + * This overload is selected for pointer types => RandomAccessIterator. + */ +template +constexpr enable_if::value, Category> iteratorCategories(float) +{ + return Category::RandomAccessIterator; +} +/**\internal + * This overload is selected if T does not work with iterator_traits. + */ +template constexpr Category iteratorCategories(...) +{ + return Category::None; +} + +/**\internal + * Simple trait to identify whether a type T is a class template or not. + */ +template struct is_class_template : public false_type +{ +}; +template