1 /**
2 * AVX and FP16C intrinsics.
3 * https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#techs=AVX
4 *
5 * Copyright: Guillaume Piolat 2022.
6 *            Johan Engelen 2022.
7 *            cet 2024.
8 * License:   $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
9 */
10 module inteli.avxintrin;
11 
12 // AVX instructions
13 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#avxnewtechs=AVX
14 // Note: this header will work whether you have AVX enabled or not.
15 // With LDC, use "dflags-ldc": ["-mattr=+avx"] or equivalent to actively
16 // generate AVX instructions.
17 // With GDC, use "dflags-gdc": ["-mavx"] or equivalent to actively
18 // generate AVX instructions.
19 
20 // This header also implements FP16C intrinsics.
21 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#avxnewtechs=F16C
22 // With LDC, use "dflags-ldc": ["-mattr=+f16c"] or equivalent to actively
23 // generate F16C instructions.
24 // With GDC, use "dflags-gdc": ["-mf16c"] or equivalent to actively
25 // generate F16C instructions.
26 
27 /// IMPORTANT NOTE ABOUT MASK LOAD/STORE:
28 ///
/// In theory, masked load/store can address unaddressable memory provided the mask is zero.
30 /// In practice, that is not the case for the following reasons:
31 /// 
32 /// - AMD manual says:
33 ///   "Exception and trap behavior for elements not selected for loading or storing from/to memory
34 ///   is implementation dependent. For instance, a given implementation may signal a data 
35 ///   breakpoint or a page fault for doublewords that are zero-masked and not actually written."
36 ///
37 /// - Intel fetches the whole cacheline anyway:
38 ///   https://erik.science/2019/06/21/AVX-fun.html
39 ///   "Even if the mask is stored in the special mask registers, it will still first fetch the data
40 ///    before checking the mask."
41 ///
/// So intel-intrinsics adopted the tightened semantics of only addressing fully addressable memory 
43 /// with masked loads and stores.
44 
45 
/// Some AVX intrinsics take a float comparison constant.
/// When labelled "ordered" it means "AND ordered"
/// When labelled "unordered" it means "OR unordered"
/// "signaling" predicates raise an FP exception on quiet NaN operands; "non-signaling" do not.
alias _CMP_EQ = int;
///ditto
enum : _CMP_EQ
{
    _CMP_EQ_OQ    = 0x00, // Equal (ordered, non-signaling)
    _CMP_LT_OS    = 0x01, // Less-than (ordered, signaling)
    _CMP_LE_OS    = 0x02, // Less-than-or-equal (ordered, signaling)
    _CMP_UNORD_Q  = 0x03, // Unordered (non-signaling)
    _CMP_NEQ_UQ   = 0x04, // Not-equal (unordered, non-signaling)
    _CMP_NLT_US   = 0x05, // Not-less-than (unordered, signaling)
    _CMP_NLE_US   = 0x06, // Not-less-than-or-equal (unordered, signaling)
    _CMP_ORD_Q    = 0x07, // Ordered (non-signaling)
    _CMP_EQ_UQ    = 0x08, // Equal (unordered, non-signaling)
    _CMP_NGE_US   = 0x09, // Not-greater-than-or-equal (unordered, signaling)
    _CMP_NGT_US   = 0x0a, // Not-greater-than (unordered, signaling)
    _CMP_FALSE_OQ = 0x0b, // False (ordered, non-signaling)
    _CMP_NEQ_OQ   = 0x0c, // Not-equal (ordered, non-signaling)
    _CMP_GE_OS    = 0x0d, // Greater-than-or-equal (ordered, signaling)
    _CMP_GT_OS    = 0x0e, // Greater-than (ordered, signaling)
    _CMP_TRUE_UQ  = 0x0f, // True (unordered, non-signaling)
    _CMP_EQ_OS    = 0x10, // Equal (ordered, signaling)
    _CMP_LT_OQ    = 0x11, // Less-than (ordered, non-signaling)
    _CMP_LE_OQ    = 0x12, // Less-than-or-equal (ordered, non-signaling)
    _CMP_UNORD_S  = 0x13, // Unordered (signaling)
    _CMP_NEQ_US   = 0x14, // Not-equal (unordered, signaling)
    _CMP_NLT_UQ   = 0x15, // Not-less-than (unordered, non-signaling)
    _CMP_NLE_UQ   = 0x16, // Not-less-than-or-equal (unordered, non-signaling)
    _CMP_ORD_S    = 0x17, // Ordered (signaling)
    _CMP_EQ_US    = 0x18, // Equal (unordered, signaling)
    _CMP_NGE_UQ   = 0x19, // Not-greater-than-or-equal (unordered, non-signaling)
    _CMP_NGT_UQ   = 0x1a, // Not-greater-than (unordered, non-signaling)
    _CMP_FALSE_OS = 0x1b, // False (ordered, signaling)
    _CMP_NEQ_OS   = 0x1c, // Not-equal (ordered, signaling)
    _CMP_GE_OQ    = 0x1d, // Greater-than-or-equal (ordered, non-signaling)
    _CMP_GT_OQ    = 0x1e, // Greater-than (ordered, non-signaling)
    _CMP_TRUE_US  = 0x1f  // True (unordered, signaling)
}
86 
87 public import inteli.types;
88 import inteli.internals;
89 
90 // Pull in all previous instruction set intrinsics.
91 public import inteli.smmintrin;
92 public import inteli.tmmintrin;
93 public import inteli.nmmintrin;
94 
95 
96 
97 // In x86, LDC earlier version may have trouble preserving the stack pointer when an unsupported
98 // 256-bit vector type is passed, and AVX is disabled.
99 // This leads to disabling some intrinsics in this particular situation, since they are not safe for
100 // the caller.
version(LDC)
{
    version(X86)
    {
        // Only affected: 32-bit x86 LDC with a frontend older than 2.099
        // (__VERSION__ is the D frontend version, e.g. 2098 for 2.098).
        enum llvm256BitStackWorkaroundIn32BitX86 = __VERSION__ < 2099;
    }
    else 
        enum llvm256BitStackWorkaroundIn32BitX86 = false;
}
else
    enum llvm256BitStackWorkaroundIn32BitX86 = false;
112 
113 
114 
115 
116 nothrow @nogc:
117 
/// Add packed double-precision (64-bit) floating-point elements in `a` and `b`.
__m256d _mm256_add_pd (__m256d a, __m256d b) pure @trusted
{
    // Plain vector addition; lowers to the target's SIMD add.
    __m256d sum = a + b;
    return sum;
}
unittest
{
    align(32) double[4] X = [-1, 2, -3, 40000];
    align(32) double[4] Y = [ 9, -7, 8, -0.5];
    __m256d S = _mm256_add_pd(_mm256_load_pd(X.ptr), _mm256_load_pd(Y.ptr));
    double[4] expected = [8, -5, 5, 39999.5];
    assert(S.array == expected);
}
131 
/// Add packed single-precision (32-bit) floating-point elements in `a` and `b`.
__m256 _mm256_add_ps (__m256 a, __m256 b) pure @trusted
{
    // Plain vector addition; lowers to the target's SIMD add.
    __m256 sum = a + b;
    return sum;
}
unittest
{
    align(32) float[8] X = [-1.0f, 2, -3, 40000, 0, 3, 5, 6];
    align(32) float[8] Y = [ 9.0f, -7, 8,  -0.5, 8, 7, 3, -1];
    __m256 S = _mm256_add_ps(_mm256_load_ps(X.ptr), _mm256_load_ps(Y.ptr));
    float[8] expected    = [8, -5, 5, 39999.5, 8, 10, 8, 5];
    assert(S.array == expected);
}
145 
/// Alternatively add and subtract packed double-precision (64-bit) floating-point
///  elements in `a` to/from packed elements in `b`.
/// Even-indexed lanes compute `a - b`, odd-indexed lanes compute `a + b`.
__m256d _mm256_addsub_pd (__m256d a, __m256d b) pure @trusted
{
    // PERF DMD
    static if (GDC_or_LDC_with_AVX)
    {
        return __builtin_ia32_addsubpd256(a, b);
    }
    else
    {
        //// Note: GDC x86 generates addsubpd since GDC 11.1 with -O3
        ////       LDC x86 generates addsubpd since LDC 1.18 with -O2
        //// LDC ARM: not fantastic, ok since LDC 1.18 -O2
        // Scalar fallback: negate `b` in even lanes so every lane is a single
        // addition, a shape optimizers recognize and turn into addsubpd.
        a.ptr[0] = a.array[0] + (-b.array[0]);
        a.ptr[1] = a.array[1] + b.array[1];
        a.ptr[2] = a.array[2] + (-b.array[2]);
        a.ptr[3] = a.array[3] + b.array[3];
        return a;
    }
}
unittest
{
    align(32) double[4] A = [-1, 2, -3, 40000];
    align(32) double[4] B = [ 9, -7, 8, -0.5];
    __m256d R = _mm256_addsub_pd(_mm256_load_pd(A.ptr), _mm256_load_pd(B.ptr));
    double[4] correct = [-10, -5, -11, 39999.5];
    assert(R.array == correct);
}
175 
/// Alternatively add and subtract packed single-precision (32-bit) floating-point elements 
/// in `a` to/from packed elements in `b`.
/// Even-indexed lanes compute `a - b`, odd-indexed lanes compute `a + b`.
__m256 _mm256_addsub_ps (__m256 a, __m256 b) pure @trusted
{
    // PERF DMD
    static if (GDC_or_LDC_with_AVX)
    {
        return __builtin_ia32_addsubps256(a, b);
    }
    else
    {
        // Note: GDC x86 generates addsubps since GDC 11 -O3
        //               and in absence of AVX, a pair of SSE3 addsubps since GDC 12 -O2
        //       LDC x86 generates addsubps since LDC 1.18 -O2
        //               and in absence of AVX, a pair of SSE3 addsubps since LDC 1.1 -O1
        // LDC ARM: neat output since LDC 1.21 -O2

        // Scalar fallback: negate `b` in even lanes so every lane is a single
        // addition, a shape optimizers recognize and turn into addsubps.
        a.ptr[0] = a.array[0] + (-b.array[0]);
        a.ptr[1] = a.array[1] + b.array[1];
        a.ptr[2] = a.array[2] + (-b.array[2]);
        a.ptr[3] = a.array[3] + b.array[3];
        a.ptr[4] = a.array[4] + (-b.array[4]);
        a.ptr[5] = a.array[5] + b.array[5];
        a.ptr[6] = a.array[6] + (-b.array[6]);
        a.ptr[7] = a.array[7] + b.array[7];
        return a;
    }
}
unittest
{
    align(32) float[8] A = [-1.0f,  2,  -3, 40000,    0, 3,  5,  6];
    align(32) float[8] B = [ 9.0f, -7,   8,  -0.5,    8, 7,  3, -1];
    __m256 R = _mm256_addsub_ps(_mm256_load_ps(A.ptr), _mm256_load_ps(B.ptr));
    float[8] correct     = [  -10, -5, -11, 39999.5, -8, 10, 2,  5];
    assert(R.array == correct);
}
212 
/// Compute the bitwise AND of packed double-precision (64-bit) floating-point elements in `a` and `b`.
__m256d _mm256_and_pd (__m256d a, __m256d b) pure @trusted
{
    // Reinterpret both operands as integer vectors, AND, reinterpret back.
    // (GCC avxintrin.h uses dedicated builtins for AND/ANDNOT/OR of _ps and _pd,
    //  but those do not seem needed at any optimization level.)
    __m256i ia = cast(__m256i) a;
    __m256i ib = cast(__m256i) b;
    return cast(__m256d)(ia & ib);
}
unittest
{
    double a = 4.32;
    double b = -78.99;
    long correct = (*cast(long*)(&a)) & (*cast(long*)(&b));
    __m256d A = _mm256_set_pd(a, b, a, b);
    __m256d B = _mm256_set_pd(b, a, b, a);
    long4 R = cast(long4)( _mm256_and_pd(A, B) );
    foreach(i; 0..4)
        assert(R.array[i] == correct);
}
233 
/// Compute the bitwise AND of packed single-precision (32-bit) floating-point elements in `a` and `b`.
__m256 _mm256_and_ps (__m256 a, __m256 b) pure @trusted
{
    // Bitwise AND on the integer view of the two operands.
    __m256i ia = cast(__m256i) a;
    __m256i ib = cast(__m256i) b;
    return cast(__m256)(ia & ib);
}
unittest
{
    float a = 4.32f;
    float b = -78.99f;
    int correct = (*cast(int*)(&a)) & (*cast(int*)(&b));
    __m256 A = _mm256_set_ps(a, b, a, b, a, b, a, b);
    __m256 B = _mm256_set_ps(b, a, b, a, b, a, b, a);
    int8 R = cast(int8)( _mm256_and_ps(A, B) );
    foreach(i; 0..8)
        assert(R.array[i] == correct);
}
250 
/// Compute the bitwise NOT of packed double-precision (64-bit) floating-point elements in `a`
/// and then AND with b.
__m256d _mm256_andnot_pd (__m256d a, __m256d b) pure @trusted
{
    // PERF DMD
    // r = (~a) & b, computed on the integer view of the operands.
    __m256i r = _mm256_not_si256(cast(__m256i)a) & cast(__m256i)b;
    return cast(__m256d) r;
}
unittest
{
    double a = 4.32;
    double b = -78.99;
    long notA = ~ ( *cast(long*)(&a) );
    long correct = notA & (*cast(long*)(&b));
    __m256d A = _mm256_set_pd(a, a, a, a);
    __m256d B = _mm256_set_pd(b, b, b, b);
    long4 R = cast(long4)( _mm256_andnot_pd(A, B) );
    foreach(i; 0..4)
        assert(R.array[i] == correct);
}
273 
/// Compute the bitwise NOT of packed single-precision (32-bit) floating-point elements in `a`
/// and then AND with b.
__m256 _mm256_andnot_ps (__m256 a, __m256 b) pure @trusted
{
    // PERF DMD
    // r = (~a) & b, computed on the integer view of the operands.
    __m256i r = _mm256_not_si256(cast(__m256i)a) & cast(__m256i)b;
    return cast(__m256) r;
}
unittest
{
    float a = 4.32f;
    float b = -78.99f;
    int notA = ~ ( *cast(int*)(&a) );
    int correct = notA & (*cast(int*)(&b));
    __m256 A = _mm256_set1_ps(a);
    __m256 B = _mm256_set1_ps(b);
    int8 R = cast(int8)( _mm256_andnot_ps(A, B) );
    foreach(i; 0..8)
        assert(R.array[i] == correct);
}
296 
/// Blend packed double-precision (64-bit) floating-point elements from `a` and `b` using control 
/// mask `imm8`. Bit n of `imm8` selects lane n: 0 takes the lane from `a`, 1 from `b`.
/// Note: `pure @trusted` added explicitly so the template is usable from `@safe` code,
/// matching `_mm256_blend_ps` (attribute inference would yield @system here because of
/// the raw `.ptr` lane writes in the fallback).
__m256d _mm256_blend_pd(int imm8)(__m256d a, __m256d b) pure @trusted
{
    static assert(imm8 >= 0 && imm8 < 16);

    // PERF DMD
    static if (GDC_with_AVX)
    {
        return __builtin_ia32_blendpd256 (a, b, imm8);
    }
    else
    {
        // Works great with LDC.
        double4 r;
        for (int n = 0; n < 4; ++n)
        {
            r.ptr[n] = (imm8 & (1 << n)) ? b.array[n] : a.array[n];
        }
        return r;
    }
}
unittest
{
    __m256d A = _mm256_setr_pd(0, 1, 2, 3);
    __m256d B = _mm256_setr_pd(8, 9, 10, 11);
    double4 C = _mm256_blend_pd!0x06(A, B);
    double[4] correct =    [0, 9, 10, 3];
    assert(C.array == correct);
}
327 
/// Blend packed single-precision (32-bit) floating-point elements from `a` and `b` using control 
/// mask `imm8`. Bit n of `imm8` selects lane n: 0 takes the lane from `a`, 1 from `b`.
__m256 _mm256_blend_ps(int imm8)(__m256 a, __m256 b) pure @trusted
{
    static assert(imm8 >= 0 && imm8 < 256);
    // PERF DMD
    static if (GDC_with_AVX)
    {
        return __builtin_ia32_blendps256 (a, b, imm8);
    }
    else version(LDC)
    {
        // LDC x86: generates a vblendps since LDC 1.1 -O0
        //   arm64: pretty good, four instructions worst case
        // Shuffle indices 0..7 select lanes of `a`, 8..15 select lanes of `b`.
        return shufflevectorLDC!(float8, (imm8 & 1) ? 8 : 0,
                                 (imm8 & 2) ? 9 : 1,
                                 (imm8 & 4) ? 10 : 2,
                                 (imm8 & 8) ? 11 : 3,
                                 (imm8 & 16) ? 12 : 4,
                                 (imm8 & 32) ? 13 : 5,
                                 (imm8 & 64) ? 14 : 6,
                                 (imm8 & 128) ? 15 : 7)(a, b);
    }
    else
    {
        // LDC x86: vblendps generated since LDC 1.27 -O1
        float8 r;
        for (int n = 0; n < 8; ++n)
        {
            r.ptr[n] = (imm8 & (1 << n)) ? b.array[n] : a.array[n];
        }
        return r;
    }
}
unittest
{
    __m256 A = _mm256_setr_ps(0, 1,  2,  3,  4,  5,  6,  7);
    __m256 B = _mm256_setr_ps(8, 9, 10, 11, 12, 13, 14, 15);
    float8 C = _mm256_blend_ps!0xe7(A, B);
    float[8] correct =       [8, 9, 10,  3,  4, 13, 14, 15];
    assert(C.array == correct);
}
370 
/// Blend packed double-precision (64-bit) floating-point elements from `a` and `b` using mask.
/// Lane n of the result is taken from `b` when the sign (most significant) bit of
/// mask lane n is set, else from `a`.
__m256d _mm256_blendv_pd (__m256d a, __m256d b, __m256d mask) @trusted
{
    // PERF DMD
    static if (GDC_with_AVX)
    {
        // Amazingly enough, GCC/GDC generates the vblendvpd instruction
        // with -mavx2 but not -mavx.
        // Not sure what is the reason, and there is a replacement sequence.
        // Sounds like a bug, similar to _mm_blendv_pd
        // or maybe the instruction in unsafe?
        return __builtin_ia32_blendvpd256(a, b, mask);
    }
    else static if (LDC_with_AVX)
    {
        return __builtin_ia32_blendvpd256(a, b, mask);
    }
    else
    {
        // LDC x86: vblendvpd since LDC 1.27 -O2
        //     arm64: only 4 instructions, since LDC 1.27 -O2
        __m256d r;
        long4 lmask = cast(long4)mask;
        for (int n = 0; n < 4; ++n)
        {
            // Signed "< 0" tests exactly the sign bit of the mask lane.
            r.ptr[n] = (lmask.array[n] < 0) ? b.array[n] : a.array[n];
        }
        return r;
    }
}
unittest
{
    __m256d A = _mm256_setr_pd(1.0, 2.0, 3.0, 4.0);
    __m256d B = _mm256_setr_pd(5.0, 6.0, 7.0, 8.0);
    __m256d M = _mm256_setr_pd(-3.0, 2.0, 1.0, -4.0);
    __m256d R = _mm256_blendv_pd(A, B, M);
    double[4] correct1 = [5.0, 2.0, 3.0, 8.0];
    assert(R.array == correct1);
}
410 
/// Blend packed single-precision (32-bit) floating-point elements from `a` and `b` 
/// using `mask`.
/// Lane n of the result is taken from `b` when the sign (most significant) bit of
/// mask lane n is set, else from `a`.
__m256 _mm256_blendv_ps (__m256 a, __m256 b, __m256 mask) @trusted
{
    // PERF DMD
    static if (GDC_or_LDC_with_AVX)
    {
        return __builtin_ia32_blendvps256(a, b, mask);
    }
    else static if (LDC_with_ARM64)
    {
        // Arithmetic shift right by 31 smears each lane's sign bit into an
        // all-ones/all-zeros lane mask; then bit-select between a and b.
        int8 shift;
        shift = 31;
        int8 lmask = cast(int8)mask >> shift;     
        int8 ia = cast(int8)a;   
        int8 ib = cast(int8)b;
        return cast(__m256)(ia ^ ((ia ^ ib) & lmask));
    }
    else
    {
        // In both LDC and GDC with SSE4.1, this generates blendvps as fallback
        __m256 r;
        int8 lmask = cast(int8)mask;
        for (int n = 0; n < 8; ++n)
        {
            // Signed "< 0" tests exactly the sign bit of the mask lane.
            r.ptr[n] = (lmask.array[n] < 0) ? b.array[n] : a.array[n];
        }
        return r;
    }
}
unittest
{
    __m256 A = _mm256_setr_ps(1.0f, 2.0f, 3.0f, 4.0f, 1.0f, 2.0f, 3.0f, 4.0f);
    __m256 B = _mm256_setr_ps(5.0f, 6.0f, 7.0f, 8.0f, 5.0f, 6.0f, 7.0f, 8.0f);
    __m256 M = _mm256_setr_ps(-3.0f, 2.0f, 1.0f, -4.0f, -3.0f, 2.0f, 1.0f, -4.0f);
    __m256 R = _mm256_blendv_ps(A, B, M);
    float[8] correct1 = [5.0f, 2.0f, 3.0f, 8.0f, 5.0f, 2.0f, 3.0f, 8.0f];
    assert(R.array == correct1);
}
450 
/// Broadcast 128 bits from memory (composed of 2 packed double-precision (64-bit)
/// floating-point elements) to all elements.
/// This effectively duplicates the 128-bit vector.
__m256d _mm256_broadcast_pd (const(__m128d)* mem_addr) pure @trusted
{
    // PERF DMD
    static if (GDC_with_AVX)
    {
        // NOTE(review): the cast(float4*) looks odd for a _pd builtin — presumably it
        // matches the parameter type GDC declares for this builtin; confirm against
        // inteli.internals / GDC's avxintrin declarations.
        return __builtin_ia32_vbroadcastf128_pd256(cast(float4*)mem_addr);
    }
    else
    {
        // Scalar fallback: read the two doubles once and write them to both halves.
        const(double)* p = cast(const(double)*) mem_addr;
        __m256d r;
        r.ptr[0] = p[0];
        r.ptr[1] = p[1];
        r.ptr[2] = p[0];
        r.ptr[3] = p[1];
        return r;
    }
}
unittest
{
    __m128d A = _mm_setr_pd(3, -4);
    __m256d B = _mm256_broadcast_pd(&A);
    double[4] correct = [3, -4, 3, -4];
    assert(B.array == correct);
}
479 
/// Broadcast 128 bits from memory (composed of 4 packed single-precision (32-bit) 
/// floating-point elements) to all elements.
/// This effectively duplicates the 128-bit vector.
__m256 _mm256_broadcast_ps (const(__m128)* mem_addr) pure @trusted
{
    // PERF DMD
    static if (GDC_with_AVX)
    {
        return __builtin_ia32_vbroadcastf128_ps256(cast(float4*)mem_addr);
    }   
    else
    {
        // Scalar fallback: read the four floats once and write them to both halves.
        const(float)* p = cast(const(float)*)mem_addr;
        __m256 r;
        r.ptr[0] = p[0];
        r.ptr[1] = p[1];
        r.ptr[2] = p[2];
        r.ptr[3] = p[3];
        r.ptr[4] = p[0];
        r.ptr[5] = p[1];
        r.ptr[6] = p[2];
        r.ptr[7] = p[3];
        return r;
    }
}
unittest
{
    __m128 A = _mm_setr_ps(1, 2, 3, -4);
    __m256 B = _mm256_broadcast_ps(&A);
    float[8] correct = [1.0f, 2, 3, -4, 1, 2, 3, -4];
    assert(B.array == correct);
}
512 
/// Broadcast a double-precision (64-bit) floating-point element from memory to all elements.
__m256d _mm256_broadcast_sd (const(double)* mem_addr) pure @trusted
{
    static if (GDC_with_AVX)
    {
        return __builtin_ia32_vbroadcastsd256(mem_addr);
    }
    else
    {
        // Scalar fallback: a single load, then splat to the four lanes.
        double a = *mem_addr;
        __m256d r;
        r.ptr[0] = a;
        r.ptr[1] = a;
        r.ptr[2] = a;
        r.ptr[3] = a;
        return r;
    }
}
unittest
{
    double t = 7.5f;
    __m256d A = _mm256_broadcast_sd(&t);
    double[4] correct = [7.5, 7.5, 7.5, 7.5];
    assert(A.array == correct);
}
538 
/// Broadcast a single-precision (32-bit) floating-point element from memory to all elements.
__m128 _mm_broadcast_ss (const(float)* mem_addr) pure @trusted
{
    // PERF DMD
    static if (GDC_with_AVX)
    {
        return __builtin_ia32_vbroadcastss(mem_addr);
    }
    else
    {
        // Scalar fallback: a single load, then splat to the four lanes.
        float a = *mem_addr;
        __m128 r;
        r.ptr[0] = a;
        r.ptr[1] = a;
        r.ptr[2] = a;
        r.ptr[3] = a;
        return r;
    }
}
unittest
{
    float t = 7.5f;
    __m128 A = _mm_broadcast_ss(&t);
    float[4] correct = [7.5f, 7.5f, 7.5f, 7.5f];
    assert(A.array == correct);
}
565 
/// Broadcast a single-precision (32-bit) floating-point element from memory to all 8 elements.
/// (Doc comment and `pure @trusted` added for consistency with `_mm_broadcast_ss`, which
/// wraps the analogous 128-bit builtin with the same attributes.)
__m256 _mm256_broadcast_ss (const(float)* mem_addr) pure @trusted
{
    // PERF DMD
    static if (GDC_with_AVX)
    {
        return __builtin_ia32_vbroadcastss256 (mem_addr);
    }
    else
    {
        // Scalar fallback: one load, then the vector constructor splats it.
        float a = *mem_addr;
        __m256 r = __m256(a);
        return r;
    }
}
unittest
{
    float t = 7.5f;
    __m256 A = _mm256_broadcast_ss(&t);
    float[8] correct = [7.5f, 7.5f, 7.5f, 7.5f, 7.5f, 7.5f, 7.5f, 7.5f];
    assert(A.array == correct);
}
587 
/// Cast vector of type `__m256d` to type `__m256`.
/// Pure bit reinterpretation; no instruction is generated.
__m256 _mm256_castpd_ps (__m256d a) pure @safe
{
    __m256 reinterpreted = cast(__m256) a;
    return reinterpreted;
}
593 
/// Cast vector of type `__m256d` to type `__m256i`.
/// Pure bit reinterpretation; no instruction is generated.
__m256i _mm256_castpd_si256 (__m256d a) pure @safe
{
    __m256i reinterpreted = cast(__m256i) a;
    return reinterpreted;
}
599 
/// Cast vector of type `__m128d` to type `__m256d`; the upper 128 bits of the result are undefined.
__m256d _mm256_castpd128_pd256 (__m128d a) pure @trusted
{
    static if (GDC_with_AVX)
    {
        return __builtin_ia32_pd256_pd(a);
    }
    else
    {
        // `= void` leaves the upper two lanes deliberately uninitialized,
        // matching the intrinsic's "undefined upper bits" contract.
        __m256d r = void;
        r.ptr[0] = a.array[0];
        r.ptr[1] = a.array[1];
        return r;
    }
}
unittest
{
    __m128d A = _mm_setr_pd(4.0, -6.125);
    __m256d B = _mm256_castpd128_pd256(A);
    // Only the low 128 bits are specified, so only those are checked.
    assert(B.array[0] == 4.0);
    assert(B.array[1] == -6.125);
}
622 
/// Cast vector of type `__m256d` to type `__m128d`; the upper 128 bits of `a` are lost.
__m128d _mm256_castpd256_pd128 (__m256d a) pure @trusted
{
    static if (GDC_with_AVX)
    {
        return __builtin_ia32_pd_pd256(a);
    }
    else
    {
        // Copy only the low two lanes; the high half is dropped.
        __m128d r;
        r.ptr[0] = a.array[0];
        r.ptr[1] = a.array[1];
        return r;
    }
}
unittest
{
    __m256d A = _mm256_set_pd(1, 2, -6.25, 4.0);
    __m128d B = _mm256_castpd256_pd128(A);
    assert(B.array[0] == 4.0);
    assert(B.array[1] == -6.25);
}
645 
/// Cast vector of type `__m256` to type `__m256d`.
/// Pure bit reinterpretation; no instruction is generated.
__m256d _mm256_castps_pd (__m256 a) pure @safe
{
    __m256d reinterpreted = cast(__m256d) a;
    return reinterpreted;
}
651 
/// Cast vector of type `__m256` to type `__m256i`.
/// Pure bit reinterpretation; no instruction is generated.
__m256i _mm256_castps_si256 (__m256 a) pure @safe
{
    __m256i reinterpreted = cast(__m256i) a;
    return reinterpreted;
}
657 
/// Cast vector of type `__m128` to type `__m256`; the upper 128 bits of the result are undefined.
__m256 _mm256_castps128_ps256 (__m128 a) pure @trusted
{
    static if (GDC_with_AVX)
    {
        return __builtin_ia32_ps256_ps(a);
    }
    else
    {
        // `= void` leaves the upper four lanes deliberately uninitialized,
        // matching the intrinsic's "undefined upper bits" contract.
        __m256 r = void;
        r.ptr[0] = a.array[0];
        r.ptr[1] = a.array[1];
        r.ptr[2] = a.array[2];
        r.ptr[3] = a.array[3];
        return r;
    }
}
unittest
{
    __m128 A = _mm_setr_ps(1.0f, 2, 3, 4);
    __m256 B = _mm256_castps128_ps256(A);
    float[4] correct = [1.0f, 2, 3, 4];
    // Only the low 128 bits are specified, so only those are checked.
    assert(B.array[0..4] == correct);
}
682 
/// Cast vector of type `__m256` to type `__m128`. The upper 128-bit of `a` are lost.
__m128 _mm256_castps256_ps128 (__m256 a) pure @trusted
{
    // Reinterpret the low 128 bits of `a` in place; the high half is simply dropped.
    const(__m128)* low = cast(const(__m128)*)(&a);
    return *low;
}
unittest
{
    __m256 A = _mm256_setr_ps(1.0f, 2, 3, 4, 5, 6, 7, 8);
    __m128 B = _mm256_castps256_ps128(A);
    float[4] expected = [1.0f, 2, 3, 4];
    assert(B.array == expected);
}
695 
/// Cast vector of type `__m128i` to type `__m256i`; the upper 128 bits of the result are undefined.
__m256i _mm256_castsi128_si256 (__m128i a) pure @trusted
{
    // Copy the two low 64-bit lanes; `= void` leaves the high half uninitialized,
    // matching the intrinsic's "undefined upper bits" contract.
    long2 lo = cast(long2) a;
    long4 r = void;
    r.ptr[0] = lo.array[0];
    r.ptr[1] = lo.array[1];
    return r;
}
unittest
{
    __m128i A = _mm_setr_epi64(-1, 42);
    __m256i B = _mm256_castsi128_si256(A);
    long[2] expected = [-1, 42];
    // Only the low 128 bits are specified, so only those are checked.
    assert(B.array[0..2] == expected);
}
712 
/// Cast vector of type `__m256i` to type `__m256d`.
/// Pure bit reinterpretation; no instruction is generated.
__m256d _mm256_castsi256_pd (__m256i a) pure @safe
{
    __m256d reinterpreted = cast(__m256d) a;
    return reinterpreted;
}
718 
/// Cast vector of type `__m256i` to type `__m256`.
/// Pure bit reinterpretation; no instruction is generated.
__m256 _mm256_castsi256_ps (__m256i a) pure @safe
{
    __m256 reinterpreted = cast(__m256) a;
    return reinterpreted;
}
724 
/// Cast vector of type `__m256i` to type `__m128i`. The upper 128-bit of `a` are lost.
__m128i _mm256_castsi256_si128 (__m256i a) pure @trusted
{
    // Keep only the low two 64-bit lanes; the high half is dropped.
    long2 lo = void;
    lo.ptr[0] = a.array[0];
    lo.ptr[1] = a.array[1];
    return cast(__m128i) lo;
}
unittest
{
    long4 A;
    A.ptr[0] = -1;
    A.ptr[1] = 42;
    long2 B = cast(long2)(_mm256_castsi256_si128(A));
    long[2] expected = [-1, 42];
    assert(B.array[0..2] == expected);
}
742 
/// Round the packed double-precision (64-bit) floating-point elements in `a` up to an integer 
/// value, and store the results as packed double-precision floating-point elements.
__m256d _mm256_ceil_pd (__m256d a) @safe
{
    static if (LDC_with_ARM64)
    {
        // Split into 128-bit halves so each half maps to the SSE4.1-style ceil.
         __m128d lo = _mm256_extractf128_pd!0(a);
        __m128d hi = _mm256_extractf128_pd!1(a);
        __m128d ilo = _mm_ceil_pd(lo);
        __m128d ihi = _mm_ceil_pd(hi);
        return _mm256_set_m128d(ihi, ilo);
    }
    else
    {
        // Rounding-mode immediate 2 = round toward +infinity (ceil).
        return _mm256_round_pd!2(a);
    }
}
unittest
{
    __m256d A = _mm256_setr_pd(1.3f, -2.12f, 53.6f, -2.7f);
    A = _mm256_ceil_pd(A);
    double[4] correct = [2.0, -2.0, 54.0, -2.0];
    assert(A.array == correct);
}
767 
/// Round the packed single-precision (32-bit) floating-point elements in `a` up to an integer 
/// value, and store the results as packed single-precision floating-point elements.
__m256 _mm256_ceil_ps (__m256 a) @safe
{
    static if (LDC_with_ARM64)
    {
        // Split into 128-bit halves so each half maps to the SSE4.1-style ceil.
        __m128 lo = _mm256_extractf128_ps!0(a);
        __m128 hi = _mm256_extractf128_ps!1(a);
        __m128 ilo = _mm_ceil_ps(lo);
        __m128 ihi = _mm_ceil_ps(hi);
        return _mm256_set_m128(ihi, ilo);
    }
    else
    {
        // Rounding-mode immediate 2 = round toward +infinity (ceil).
        return _mm256_round_ps!2(a);
    }
}
unittest
{
    __m256 A = _mm256_setr_ps(1.3f, -2.12f, 53.6f, -2.7f, -1.3f, 2.12f, -53.6f, 2.7f);
    __m256 C = _mm256_ceil_ps(A);
    float[8] correct       = [2.0f, -2.0f,  54.0f, -2.0f, -1,    3,     -53,    3];
    assert(C.array == correct);
}
792 
/// Compare packed double-precision (64-bit) floating-point elements in `a` and `b` based on the 
/// comparison operand specified by `imm8` (one of the `_CMP_*` constants).
/// Each result lane is all-ones when the predicate holds, all-zeros otherwise.
__m128d _mm_cmp_pd(int imm8)(__m128d a, __m128d b) pure @safe
{
    // Translate the AVX 5-bit predicate to the internal comparison enum at compile time.
    enum comparison = mapAVXFPComparison(imm8);
    return cast(__m128d) cmppd!comparison(a, b);
}
unittest
{
    __m128d A = _mm_setr_pd(double.infinity, double.nan);
    __m128d B = _mm_setr_pd(3.0,             4.0);
    // Ordered GT: inf > 3 holds; NaN compares false under an ordered predicate.
    long2 R = cast(long2) _mm_cmp_pd!_CMP_GT_OS(A, B);
    long[2] correct = [-1, 0];
    assert(R.array == correct);

    // Unordered NLE: NaN makes the predicate true.
    long2 R2 = cast(long2) _mm_cmp_pd!_CMP_NLE_UQ(A, B);
    long[2] correct2 = [-1, -1];
    assert(R2.array == correct2);
}
812 
///ditto
__m256d _mm256_cmp_pd(int imm8)(__m256d a, __m256d b) pure @safe
{
    // Translate the AVX 5-bit predicate to the internal comparison enum at compile time.
    enum comparison = mapAVXFPComparison(imm8);
    return cast(__m256d) cmppd256!comparison(a, b);
}
unittest
{
    __m256d A = _mm256_setr_pd(1.0, 2.0, 3.0, double.nan);
    __m256d B = _mm256_setr_pd(3.0, 2.0, 1.0, double.nan);
    // Ordered LT: only lane 0 holds; the NaN lane is false under an ordered predicate.
    __m256i R = cast(__m256i) _mm256_cmp_pd!_CMP_LT_OS(A, B);
    long[4] correct = [-1, 0, 0, 0];
    assert(R.array == correct);
}
827 
/// Compare packed single-precision (32-bit) floating-point elements in `a` and `b` based on the 
/// comparison operand specified by `imm8` (one of the `_CMP_*` constants).
/// Each result lane is all-ones when the predicate holds, all-zeros otherwise.
__m128 _mm_cmp_ps(int imm8)(__m128 a, __m128 b) pure @safe
{
    // Translate the AVX 5-bit predicate to the internal comparison enum at compile time.
    enum comparison = mapAVXFPComparison(imm8);
    return cast(__m128) cmpps!comparison(a, b);
}
835 
///ditto
__m256 _mm256_cmp_ps(int imm8)(__m256 a, __m256 b) pure @safe
{
    // Translate the AVX 5-bit predicate to the internal comparison enum at compile time.
    enum comparison = mapAVXFPComparison(imm8);
    return cast(__m256) cmpps256!comparison(a, b);
}
842 
/// Compare the lower double-precision (64-bit) floating-point element in `a` and `b` based on the
/// comparison operand specified by `imm8`, store the result in the lower element of result, and 
/// copy the upper element from `a` to the upper element of result.
__m128d _mm_cmp_sd(int imm8)(__m128d a, __m128d b) pure @safe
{
    // Translate the AVX 5-bit predicate to the internal comparison enum at compile time.
    enum comparison = mapAVXFPComparison(imm8);
    return cast(__m128d) cmpsd!comparison(a, b);
}
851 
/// Compare the lower single-precision (32-bit) floating-point element in `a` and `b` based on the
/// comparison operand specified by `imm8`, store the result in the lower element of result, and 
/// copy the upper 3 packed elements from `a` to the upper elements of result.
__m128 _mm_cmp_ss(int imm8)(__m128 a, __m128 b) pure @safe
{
    // Translate the AVX 5-bit predicate to the internal comparison enum at compile time.
    enum comparison = mapAVXFPComparison(imm8);
    return cast(__m128) cmpss!comparison(a, b);
}
860 
/// Convert packed signed 32-bit integers in a to packed double-precision (64-bit) floating-point 
/// elements. Every int32 is exactly representable as a double, so this is lossless.
__m256d _mm256_cvtepi32_pd (__m128i a) pure @trusted
{
    static if (LDC_with_optimizations)
    {
        // Direct signed int -> double conversion via LLVM IR.
        enum ir = `
            %r = sitofp <4 x i32> %0 to <4 x double>
            ret <4 x double> %r`;
        return LDCInlineIR!(ir, double4, __m128i)(a);
    }
    else static if (GDC_with_AVX)
    {
        return __builtin_ia32_cvtdq2pd256(a);
    }
    else
    {
        // Scalar fallback: lane-by-lane implicit int -> double conversion.
        double4 r;
        r.ptr[0] = a.array[0];
        r.ptr[1] = a.array[1];
        r.ptr[2] = a.array[2];
        r.ptr[3] = a.array[3];
        return r;
    }
}
unittest
{
    __m256d R = _mm256_cvtepi32_pd(_mm_set1_epi32(54));
    double[4] correct = [54.0, 54, 54, 54];
    assert(R.array == correct);
}
892 
/// Convert packed signed 32-bit integers in `a` to packed single-precision (32-bit) floating-point 
/// elements.
__m256 _mm256_cvtepi32_ps (__m256i a) pure @trusted
{
    static if (LDC_with_optimizations)
    {
        // Direct signed int -> float conversion via LLVM IR.
        enum ir = `
            %r = sitofp <8 x i32> %0 to <8 x float>
            ret <8 x float> %r`;
        return LDCInlineIR!(ir, float8, int8)(cast(int8)a);
    }
    else static if (GDC_with_AVX)
    {
        return __builtin_ia32_cvtdq2ps256(cast(int8)a);
    }
    else
    {
        // Scalar fallback: lane-by-lane implicit int -> float conversion.
        int8 ia = cast(int8)a;
        __m256 r;
        r.ptr[0] = ia.array[0];
        r.ptr[1] = ia.array[1];
        r.ptr[2] = ia.array[2];
        r.ptr[3] = ia.array[3];
        r.ptr[4] = ia.array[4];
        r.ptr[5] = ia.array[5];
        r.ptr[6] = ia.array[6];
        r.ptr[7] = ia.array[7];
        return r;
    }
}
unittest
{
    __m256 R = _mm256_cvtepi32_ps(_mm256_set1_epi32(5));
    float[8] correct = [5.0f, 5, 5, 5, 5, 5, 5, 5];
    assert(R.array == correct);
}
929 
/// Convert packed double-precision (64-bit) floating-point elements in `a` to packed 32-bit 
/// integers. Follows the current rounding mode.
/// Note: not `pure`, since the result depends on the current MXCSR rounding mode.
__m128i _mm256_cvtpd_epi32 (__m256d a) @safe
{
    static if (GDC_or_LDC_with_AVX)
    {
        return __builtin_ia32_cvtpd2dq256(a);
    }
    else
    {
        // Split into two 128-bit halves and use the SSE2 conversion on each.
        __m128d lo = _mm256_extractf128_pd!0(a);
        __m128d hi = _mm256_extractf128_pd!1(a);
        __m128i ilo = _mm_cvtpd_epi32(lo); // Only lower 64-bit contains significant values
        __m128i ihi = _mm_cvtpd_epi32(hi);
        return _mm_unpacklo_epi64(ilo, ihi);
    }
}
unittest
{
    int4 A = _mm256_cvtpd_epi32(_mm256_setr_pd(61.0, 55.0, -100, 1_000_000));
    int[4] correct = [61, 55, -100, 1_000_000];
    assert(A.array == correct);
}
953 
/// Convert packed double-precision (64-bit) floating-point elements in `a` to packed single-precision (32-bit) 
/// floating-point elements.
__m128 _mm256_cvtpd_ps (__m256d a) pure @trusted
{
    // PERF DMD
    static if (GDC_or_LDC_with_AVX)
    {
        return __builtin_ia32_cvtpd2ps256(a);
    }
    else
    {
        // Generic fallback: scalar double -> float narrowing per lane.
        __m128 r;
        r.ptr[0] = a.array[0];
        r.ptr[1] = a.array[1];
        r.ptr[2] = a.array[2];
        r.ptr[3] = a.array[3];
        return r;
    }
}
unittest
{
    __m256d A = _mm256_setr_pd(1.0, 2, 3, 5);
    __m128 R = _mm256_cvtpd_ps(A);
    float[4] correct = [1.0f, 2, 3, 5];
    assert(R.array == correct);
}
980 
/// Convert packed single-precision (32-bit) floating-point elements in `a` to packed 32-bit 
/// integers, using the current rounding mode.
/// Note: not `pure`, since the result depends on the current MXCSR rounding mode.
__m256i _mm256_cvtps_epi32 (__m256 a) @trusted
{
    static if (GDC_or_LDC_with_AVX)
    {
        return cast(__m256i) __builtin_ia32_cvtps2dq256(a);
    }
    else
    {
        // Split into two 128-bit halves and use the SSE2 conversion on each.
        __m128 lo = _mm256_extractf128_ps!0(a);
        __m128 hi = _mm256_extractf128_ps!1(a);
        __m128i ilo = _mm_cvtps_epi32(lo);
        __m128i ihi = _mm_cvtps_epi32(hi);
        return _mm256_set_m128i(ihi, ilo);
    }
}
unittest
{
    // Exercise all four MXCSR rounding modes, then restore the caller's mode.
    uint savedRounding = _MM_GET_ROUNDING_MODE();

    _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST);
    __m256i A = _mm256_cvtps_epi32(_mm256_setr_ps(1.4f, -2.1f, 53.5f, -2.9f, -1.4f, 2.1f, -53.5f, 2.9f));
    assert( (cast(int8)A).array == [1, -2, 54, -3, -1, 2, -54, 3]);

    _MM_SET_ROUNDING_MODE(_MM_ROUND_DOWN);
    A = _mm256_cvtps_epi32(_mm256_setr_ps(1.3f, -2.11f, 53.4f, -2.8f, -1.3f, 2.11f, -53.4f, 2.8f));
    assert( (cast(int8)A).array == [1, -3, 53, -3, -2, 2, -54, 2]);

    _MM_SET_ROUNDING_MODE(_MM_ROUND_UP);
    A = _mm256_cvtps_epi32(_mm256_setr_ps(1.3f, -2.12f, 53.6f, -2.7f, -1.3f, 2.12f, -53.6f, 2.7f));
    assert( (cast(int8)A).array == [2, -2, 54, -2, -1, 3, -53, 3]);

    _MM_SET_ROUNDING_MODE(_MM_ROUND_TOWARD_ZERO);
    A = _mm256_cvtps_epi32(_mm256_setr_ps(1.4f, -2.17f, 53.8f, -2.91f, -1.4f, 2.17f, -53.8f, 2.91f));
    assert( (cast(int8)A).array == [1, -2, 53, -2, -1, 2, -53, 2]);

    _MM_SET_ROUNDING_MODE(savedRounding);
}
1020 
1021 
/// Convert packed single-precision (32-bit) floating-point elements in `a` to packed double-precision 
/// (64-bit) floating-point elements.
__m256d _mm256_cvtps_pd (__m128 a) pure @trusted
{   
    // PERF DMD
    static if (GDC_with_AVX)
    {
        return __builtin_ia32_cvtps2pd256(a); // LDC doesn't have the builtin
    }
    else
    {
        // LDC: x86, needs -O2 to generate cvtps2pd since LDC 1.2.0
        __m256d r;
        r.ptr[0] = a.array[0];
        r.ptr[1] = a.array[1];
        r.ptr[2] = a.array[2];
        r.ptr[3] = a.array[3];
        return r;
    }
}
unittest
{
    __m128 A = _mm_setr_ps(1.0f, 2, 3, 5);
    __m256d R = _mm256_cvtps_pd(A);
    double[4] correct = [1.0, 2, 3, 5];
    assert(R.array == correct);
}
1049 
/// Return the lower double-precision (64-bit) floating-point element of `a`.
double _mm256_cvtsd_f64 (__m256d a) pure @safe
{
    return a.array[0];
}
1055 
/// Return the lower 32-bit integer in `a`.
int _mm256_cvtsi256_si32 (__m256i a) pure @safe
{
    // Reinterpret as eight 32-bit lanes and pick lane 0.
    int8 lanes = cast(int8) a;
    return lanes.array[0];
}
1061 
/// Return the lower single-precision (32-bit) floating-point element of `a`.
float _mm256_cvtss_f32 (__m256 a) pure @safe
{
    return a.array[0];
}
1067 
/// Convert packed double-precision (64-bit) floating-point elements in `a` to packed 32-bit 
/// integers with truncation.
__m128i _mm256_cvttpd_epi32 (__m256d a) pure @trusted
{
    // PERF DMD
    static if (GDC_or_LDC_with_AVX)
    {
        return cast(__m128i)__builtin_ia32_cvttpd2dq256(a);
    }
    else
    {
        // Generic fallback: D's cast(int) from double truncates toward zero.
        __m128i r;
        r.ptr[0] = cast(int)a.array[0];
        r.ptr[1] = cast(int)a.array[1];
        r.ptr[2] = cast(int)a.array[2];
        r.ptr[3] = cast(int)a.array[3];
        return r;
    }
}
unittest
{
    // _mm256_set_pd takes elements from highest to lowest, hence the reversed expectation.
    __m256d A = _mm256_set_pd(4.7, -1000.9, -7.1, 3.1);
    __m128i R = _mm256_cvttpd_epi32(A);
    int[4] correct = [3, -7, -1000, 4];
    assert(R.array == correct);
}
1094 
/// Convert packed single-precision (32-bit) floating-point elements in `a` to packed 32-bit 
/// integers with truncation.
__m256i _mm256_cvttps_epi32 (__m256 a) pure @trusted
{
    // PERF DMD
    static if (GDC_or_LDC_with_AVX)
    {
        return cast(__m256i)__builtin_ia32_cvttps2dq256(a);
    }
    else
    {
        // Generic fallback: D's cast(int) from float truncates toward zero.
        int8 r;
        r.ptr[0] = cast(int)a.array[0];
        r.ptr[1] = cast(int)a.array[1];
        r.ptr[2] = cast(int)a.array[2];
        r.ptr[3] = cast(int)a.array[3];
        r.ptr[4] = cast(int)a.array[4];
        r.ptr[5] = cast(int)a.array[5];
        r.ptr[6] = cast(int)a.array[6];
        r.ptr[7] = cast(int)a.array[7];
        return cast(__m256i)r;
    }
}
unittest
{
    // _mm256_set_ps takes elements from highest to lowest, hence the reversed expectation.
    __m256 A = _mm256_set_ps(4.7, -1000.9, -7.1, 3.1, 1.4, 2.9, -2.9, 0);
    int8 R = cast(int8) _mm256_cvttps_epi32(A);
    int[8] correct = [0, -2, 2, 1, 3, -7, -1000, 4];
    assert(R.array == correct);
}
1124 
/// Divide packed double-precision (64-bit) floating-point elements in `a` by packed elements in `b`.
__m256d _mm256_div_pd (__m256d a, __m256d b) pure @safe
{
    // D's built-in vector division maps to the packed divide.
    return a / b;
}
unittest
{
    __m256d a = [1.5, -2.0, 3.0, 1.0];
    a = _mm256_div_pd(a, a);
    double[4] correct = [1.0, 1.0, 1.0, 1.0];
    assert(a.array == correct);
}
1137 
/// Divide packed single-precision (32-bit) floating-point elements in `a` by packed elements in `b`.
__m256 _mm256_div_ps (__m256 a, __m256 b) pure @safe
{
    // D's built-in vector division maps to the packed divide.
    return a / b;
}
unittest
{
    __m256 a = [1.5f, -2.0f, 3.0f, 1.0f, 4.5f, -5.0f, 6.0f, 7.0f];
    a = _mm256_div_ps(a, a);
    float[8] correct = [1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f];
    assert(a.array == correct);
}
1150 
/// Conditionally multiply the packed single-precision (32-bit) floating-point elements in `a` and 
/// `b` using the high 4 bits in `imm8`, sum the four products, and conditionally store the sum 
/// using the low 4 bits of `imm8`. The operation is performed independently on each 128-bit lane.
__m256 _mm256_dp_ps(int imm8)(__m256 a, __m256 b)
{
    // PERF DMD
    static if (GDC_or_LDC_with_AVX)
    {
        return __builtin_ia32_dpps256(a, b, cast(ubyte)imm8);
    }
    else
    {
        // Note: in LDC with SSE4.1 but no AVX, we _could_ increase perf a bit by using two 
        // _mm_dp_ps.
        __m256 zero = _mm256_setzero_ps();
        // High nibble of imm8 selects which products participate in the sum;
        // it is duplicated so the same selection applies to both 128-bit lanes.
        enum ubyte op = (imm8 >>> 4) & 15;
        __m256 temp = _mm256_blend_ps!( op | (op << 4) )(zero, a * b);
        float lo = temp.array[0] + temp.array[1] + temp.array[2] + temp.array[3];
        float hi = temp.array[4] + temp.array[5] + temp.array[6] + temp.array[7];
        __m256 r = _mm256_set_m128(_mm_set1_ps(hi), _mm_set1_ps(lo));
        // Low nibble of imm8 selects which destination lanes receive the sum (others get 0).
        enum ubyte op2 = (imm8 & 15);
        return _mm256_blend_ps!(op2 | (op2 << 4))(zero, r);
    }
}
unittest
{
    // Products:                 9    14    20   24     6    16    12   -24
    __m256 A = _mm256_setr_ps(1.0f, 2.0f, 4.0f, 8.0f, 1.0f, 2.0f, 4.0f, 8.0f);
    __m256 B = _mm256_setr_ps(9.0f, 7.0f, 5.0f, 3.0f, 6.0f, 8.0f, 3.0f,-3.0f);
    float8 R1 = _mm256_dp_ps!(0xf0 + 0xf)(A, B);
    float8 R2 = _mm256_dp_ps!(0x30 + 0x5)(A, B);
    float8 R3 = _mm256_dp_ps!(0x50 + 0xa)(A, B);
    float[8] correct1 =   [67.0f, 67.0f, 67.0f,67.0f,  10,   10,   10,  10];
    float[8] correct2 =   [23.0f, 0.0f, 23.0f,  0.0f,  22,    0,   22,   0];
    float[8] correct3 =   [0.0f, 29.0f, 0.0f,  29.0f,   0,   18,    0,  18];
    assert(R1.array == correct1);
    assert(R2.array == correct2);
    assert(R3.array == correct3);
}
1190 
/// Extract a 32-bit integer from `a`, selected with `imm8`.
/// The selector wraps modulo 8.
int _mm256_extract_epi32 (__m256i a, const int imm8) pure @trusted
{
    int8 lanes = cast(int8) a;  // view the 256-bit vector as eight ints
    return lanes.array[imm8 & 7];
}
unittest
{
    align(16) int[8] data = [-1, 2, -3, 4, 9, -7, 8, -6];
    auto A = _mm256_loadu_si256(cast(__m256i*) data.ptr);
    // Out-of-range selectors wrap modulo 8.
    assert(_mm256_extract_epi32(A, 0) == -1);
    assert(_mm256_extract_epi32(A, 1 + 8) == 2);
    assert(_mm256_extract_epi32(A, 3 + 16) == 4);
    assert(_mm256_extract_epi32(A, 7 + 32) == -6);
}
1205 
/// Extract a 64-bit integer from `a`, selected with `index`.
/// The selector wraps modulo 4.
long _mm256_extract_epi64 (__m256i a, const int index) pure @safe
{
    const int lane = index & 3;  // keep the selector in range, like the hardware does
    return a.array[lane];
}
unittest
{
    __m256i A = _mm256_setr_epi64x(-7, 6, 42, 0);
    // Negative and out-of-range selectors wrap modulo 4.
    assert(_mm256_extract_epi64(A, -8) == -7);
    assert(_mm256_extract_epi64(A, 1) == 6);
    assert(_mm256_extract_epi64(A, 2 + 4) == 42);
}
1218 
/// Extract a 128-bits lane from `a`, selected with `imm8` (0 or 1).
/// Note: `_mm256_extractf128_pd!0` is equivalent to `_mm256_castpd256_pd128`.
__m128d _mm256_extractf128_pd(ubyte imm8)(__m256d a) pure @trusted
{
    version(GNU) pragma(inline, true); // else GDC has trouble inlining this

    // PERF DMD
    static if (GDC_with_AVX)
    {
        // Note: needs to be a template intrinsics because of this builtin.
        return __builtin_ia32_vextractf128_pd256(a, imm8 & 1);
    }
    else
    {
        double2 r = void;
        // Lane 0 starts at element 0, lane 1 at element 2.
        enum int index = 2*(imm8 & 1);
        r.ptr[0] = a.array[index+0];
        r.ptr[1] = a.array[index+1];
        return r;
    }
}
unittest
{
    __m256d A = _mm256_setr_pd(1.0, 2, 3, 4);
    double[4] correct = [1.0, 2, 3, 4];
    // Only bit 0 of imm8 matters: 18 selects lane 0, 55 selects lane 1.
    __m128d l0 = _mm256_extractf128_pd!18(A);
    __m128d l1 = _mm256_extractf128_pd!55(A);
    assert(l0.array == correct[0..2]);
    assert(l1.array == correct[2..4]);
}
1249 
///ditto
__m128 _mm256_extractf128_ps(ubyte imm8)(__m256 a) pure @trusted
{
    version(GNU) pragma(inline, true); // else GDC has trouble inlining this

    // PERF DMD
    static if (GDC_with_AVX)
    {
        return __builtin_ia32_vextractf128_ps256(a, imm8 & 1);
    }
    else
    {
        float4 r = void; // Optimize well since LDC 1.1 -O1
        // Lane 0 starts at element 0, lane 1 at element 4.
        enum int index = 4*(imm8 & 1);
        r.ptr[0] = a.array[index+0];
        r.ptr[1] = a.array[index+1];
        r.ptr[2] = a.array[index+2];
        r.ptr[3] = a.array[index+3];
        return r;
    }
}
unittest
{
    __m256 A = _mm256_setr_ps(1.0, 2, 3, 4, 5, 6, 7, 8);
    float[8] correct = [1.0, 2, 3, 4, 5, 6, 7, 8];
    // Only bit 0 of imm8 matters: 8 selects lane 0, 255 selects lane 1.
    __m128 l0 = _mm256_extractf128_ps!8(A);
    __m128 l1 = _mm256_extractf128_ps!255(A);
    assert(l0.array == correct[0..4]);
    assert(l1.array == correct[4..8]);
}
1280 
///ditto
__m128i _mm256_extractf128_si256(ubyte imm8)(__m256i a) pure @trusted
{
    version(GNU) pragma(inline, true); // else GDC has trouble inlining this

    // PERF DMD
    static if (GDC_with_AVX)
    {
        // Note: if it weren't for this GDC intrinsic, _mm256_extractf128_si256
        // could be a non-template, however, this wins in -O0.
        // Same story for _mm256_extractf128_ps and _mm256_extractf128_pd
        return __builtin_ia32_vextractf128_si256(cast(int8)a, imm8 & 1);
    }
    else
    {
        long2 r = void;
        // Lane 0 starts at 64-bit element 0, lane 1 at element 2.
        enum int index = 2*(imm8 & 1);
        r.ptr[0] = a.array[index+0];
        r.ptr[1] = a.array[index+1];
        return cast(__m128i)r;
    }
}
unittest
{
    __m256i A = _mm256_setr_epi32(9, 2, 3, 4, 5, 6, 7, 8);
    int[8] correct = [9, 2, 3, 4, 5, 6, 7, 8];
    __m128i l0 = _mm256_extractf128_si256!0(A);
    __m128i l1 = _mm256_extractf128_si256!1(A);
    assert(l0.array == correct[0..4]);
    assert(l1.array == correct[4..8]);
}
1312 
/// Round the packed double-precision (64-bit) floating-point elements in `a` down to an integer 
/// value, and store the results as packed double-precision floating-point elements.
__m256d _mm256_floor_pd (__m256d a) @safe
{
    static if (LDC_with_ARM64)
    {
        // On ARM64, floor each 128-bit half separately.
        __m128d lo = _mm256_extractf128_pd!0(a);
        __m128d hi = _mm256_extractf128_pd!1(a);
        __m128d ilo = _mm_floor_pd(lo);
        __m128d ihi = _mm_floor_pd(hi);
        return _mm256_set_m128d(ihi, ilo);
    }
    else
    {
        // Rounding-control 1 = round down (toward -infinity).
        return _mm256_round_pd!1(a);
    }
}
unittest
{
    __m256d A = _mm256_setr_pd(1.3f, -2.12f, 53.6f, -2.7f);
    A = _mm256_floor_pd(A);
    double[4] correct = [1.0, -3.0, 53.0, -3.0];
    assert(A.array == correct);
}
1337 
/// Round the packed single-precision (32-bit) floating-point elements in `a` down to an integer 
/// value, and store the results as packed single-precision floating-point elements.
__m256 _mm256_floor_ps (__m256 a) @safe
{
    static if (LDC_with_ARM64)
    {
        // On ARM64, floor each 128-bit half separately.
        __m128 lo = _mm256_extractf128_ps!0(a);
        __m128 hi = _mm256_extractf128_ps!1(a);
        __m128 ilo = _mm_floor_ps(lo);
        __m128 ihi = _mm_floor_ps(hi);
        return _mm256_set_m128(ihi, ilo);
    }
    else
    {
        // Rounding-control 1 = round down (toward -infinity).
        return _mm256_round_ps!1(a);
    }
}
unittest
{
    __m256 A = _mm256_setr_ps(1.3f, -2.12f, 53.6f, -2.7f, -1.3f, 2.12f, -53.6f, 2.7f);
    __m256 C = _mm256_floor_ps(A);
    float[8] correct       = [1.0f, -3.0f,  53.0f, -3.0f, -2,    2,     -54,    2];
    assert(C.array == correct);
}
1362 
/// Horizontally add adjacent pairs of double-precision (64-bit) floating-point elements in `a` 
/// and `b`. 
__m256d _mm256_hadd_pd (__m256d a, __m256d b) pure @trusted
{
    static if (GDC_or_LDC_with_AVX)
    {
        return __builtin_ia32_haddpd256(a, b);
    }
    else
    {
        // vhaddpd interleaves per 128-bit lane: results from a and b alternate.
        __m256d res;
        res.ptr[0] = a.array[1] + a.array[0];
        res.ptr[1] = b.array[1] + b.array[0];
        res.ptr[2] = a.array[3] + a.array[2];
        res.ptr[3] = b.array[3] + b.array[2];
        return res;
    }
}
unittest
{
    __m256d A =_mm256_setr_pd(1.5, 2.0, 21.0, 9.0);
    __m256d B =_mm256_setr_pd(1.0, 7.0, 100.0, 14.0);
    __m256d C = _mm256_hadd_pd(A, B);
    double[4] correct =      [3.5, 8.0, 30.0, 114.0];
    assert(C.array == correct);
}
1389 
/// Horizontally add adjacent pairs of single-precision (32-bit) floating-point elements in `a` and
/// `b`.
__m256 _mm256_hadd_ps (__m256 a, __m256 b) pure @trusted
{
    // PERF DMD
    static if (GDC_or_LDC_with_AVX)
    {
        return __builtin_ia32_haddps256(a, b);
    }
    else static if (LDC_with_ARM64)
    {
        // ARM64: pairwise-add each 128-bit half with vpaddq_f32.
        __m128 a_hi = _mm256_extractf128_ps!1(a);
        __m128 a_lo = _mm256_extractf128_ps!0(a);
        __m128 b_hi = _mm256_extractf128_ps!1(b);
        __m128 b_lo = _mm256_extractf128_ps!0(b);
        __m128 hi = vpaddq_f32(a_hi, b_hi);
        __m128 lo = vpaddq_f32(a_lo, b_lo);
        return _mm256_set_m128(hi, lo);
    }
    else
    {    
        // vhaddps lays out results per 128-bit lane: two sums from a, then two from b.
        __m256 res;
        res.ptr[0] = a.array[1] + a.array[0];
        res.ptr[1] = a.array[3] + a.array[2];
        res.ptr[2] = b.array[1] + b.array[0];
        res.ptr[3] = b.array[3] + b.array[2];
        res.ptr[4] = a.array[5] + a.array[4];
        res.ptr[5] = a.array[7] + a.array[6];
        res.ptr[6] = b.array[5] + b.array[4];
        res.ptr[7] = b.array[7] + b.array[6];
        return res;
    }
}
unittest
{
    __m256 A =_mm256_setr_ps(1.0f, 2.0f, 3.0f, 5.0f, 1.0f, 2.0f, 3.0f, 5.0f);
    __m256 B =_mm256_setr_ps(1.5f, 2.0f, 3.5f, 4.0f, 1.5f, 2.0f, 3.5f, 5.0f);
    __m256 R = _mm256_hadd_ps(A, B);
    float[8] correct =      [3.0f, 8.0f, 3.5f, 7.5f, 3.0f, 8.0f, 3.5f, 8.5f];
    assert(R.array == correct);
}
1431 
/// Horizontally subtract adjacent pairs of double-precision (64-bit) floating-point elements in
/// `a` and `b`. 
__m256d _mm256_hsub_pd (__m256d a, __m256d b) pure @trusted
{
    static if (GDC_or_LDC_with_AVX)
    {
        return __builtin_ia32_hsubpd256(a, b);
    }
    else 
    {
        // 2 zip1, 2 zip2, 2 fsub... I don't think there is better in arm64
        // vhsubpd interleaves per 128-bit lane: results from a and b alternate.
        __m256d res;
        res.ptr[0] = a.array[0] - a.array[1];
        res.ptr[1] = b.array[0] - b.array[1];
        res.ptr[2] = a.array[2] - a.array[3];
        res.ptr[3] = b.array[2] - b.array[3];
        return res;
    }
}
unittest
{
    __m256d A =_mm256_setr_pd(1.5, 2.0, 21.0, 9.0);
    __m256d B =_mm256_setr_pd(1.0, 7.0, 100.0, 14.0);
    __m256d C = _mm256_hsub_pd(A, B);
    double[4] correct =      [-0.5, -6.0, 12.0, 86.0];
    assert(C.array == correct);
}
1459 
/// Horizontally subtract adjacent pairs of single-precision (32-bit) floating-point elements in
/// `a` and `b`.
__m256 _mm256_hsub_ps (__m256 a, __m256 b) pure @trusted
{
    // PERF DMD
    static if (GDC_or_LDC_with_AVX)
    {
        return __builtin_ia32_hsubps256(a, b);
    }
    else
    {
        // Fallback: horizontal-subtract each 128-bit half with the SSE3 intrinsic.
        __m128 a_hi = _mm256_extractf128_ps!1(a);
        __m128 a_lo = _mm256_extractf128_ps!0(a);
        __m128 b_hi = _mm256_extractf128_ps!1(b);
        __m128 b_lo = _mm256_extractf128_ps!0(b);
        __m128 hi = _mm_hsub_ps(a_hi, b_hi);
        __m128 lo = _mm_hsub_ps(a_lo, b_lo);
        return _mm256_set_m128(hi, lo);
    }
}
unittest
{
    __m256 A =_mm256_setr_ps(1.0f, 2.0f, 3.0f, 5.0f, 1.0f, 2.0f, 3.0f, 5.0f);
    __m256 B =_mm256_setr_ps(1.5f, 2.0f, 3.5f, 4.0f, 1.5f, 2.0f, 3.5f, 5.0f);
    __m256 R = _mm256_hsub_ps(A, B);
    float[8] correct =   [-1.0f, -2.0f, -0.5f, -0.5f, -1.0f, -2.0f, -0.5f, -1.5f];
    assert(R.array == correct);
}
1486 
/// Copy `a`, and insert the 16-bit integer `i` into the result at the location specified by 
/// `index & 15`.
__m256i _mm256_insert_epi16 (__m256i a, short i, const int index) pure @trusted
{
    short16 lanes = cast(short16) a;  // view the 256-bit vector as 16 shorts
    lanes.ptr[index & 15] = i;        // selector wraps modulo 16
    return cast(__m256i) lanes;
}
unittest
{
    __m256i A = _mm256_set1_epi16(1);
    // 16 + 16 + 7 wraps to lane 7.
    short16 R = cast(short16) _mm256_insert_epi16(A, 2, 16 + 16 + 7);
    short[16] correct = [1, 1, 1, 1, 1, 1, 1, 2, 
                         1, 1, 1, 1, 1, 1, 1, 1 ];
    assert(R.array == correct);
}
1503 
/// Copy `a`, and insert the 32-bit integer `i` into the result at the location specified by 
/// `index & 7`.
__m256i _mm256_insert_epi32 (__m256i a, int i, const int index) pure @trusted
{
    int8 lanes = cast(int8) a;  // view the 256-bit vector as eight ints
    lanes.ptr[index & 7] = i;   // selector wraps modulo 8
    return cast(__m256i) lanes;
}
unittest
{
    __m256i A = _mm256_set1_epi32(1);
    // 8 + 8 + 1 wraps to lane 1.
    int8 R = cast(int8) _mm256_insert_epi32(A, -2, 8 + 8 + 1);
    int[8] correct = [1, -2, 1, 1, 1, 1, 1, 1];
    assert(R.array == correct);
}
1519 
/// Copy `a`, and insert the 64-bit integer `i` into the result at the location specified by 
/// `index & 3`.
__m256i _mm256_insert_epi64(__m256i a, long i, const int index) pure @trusted
{
    // __m256i already holds four 64-bit lanes, so no reinterpreting cast is needed.
    __m256i r = a;
    r.ptr[index & 3] = i;  // selector wraps modulo 4
    return r;
}
unittest
{
    __m256i A = _mm256_set1_epi64(1);
    // 2 - 4 - 4 is negative but wraps to lane 2.
    long4 R = cast(long4) _mm256_insert_epi64(A, -2, 2 - 4 - 4);
    long[4] correct = [1, 1, -2, 1];
    assert(R.array == correct);
}
1534 
/// Copy `a`, and insert the 8-bit integer `i` into the result at the location specified by 
/// `index & 31`.
__m256i _mm256_insert_epi8(__m256i a, byte i, const int index) pure @trusted
{
    byte32 lanes = cast(byte32) a;  // view the 256-bit vector as 32 bytes
    lanes.ptr[index & 31] = i;      // selector wraps modulo 32
    return cast(__m256i) lanes;
}
unittest
{
    __m256i A = _mm256_set1_epi8(1);
    // 7 - 32 - 32 is negative but wraps to lane 7.
    byte32 R = cast(byte32) _mm256_insert_epi8(A, -2, 7 - 32 - 32);
    byte[32] correct = [1, 1, 1, 1, 1, 1, 1,-2, 1, 1, 1, 1, 1, 1, 1, 1,
                        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 ];
    assert(R.array == correct);
}
1551 
/// Copy `a`, then insert 128 bits (composed of 2 packed double-precision (64-bit) 
/// floating-point elements) from `b` at the location specified by `imm8` (0 = low lane, 1 = high lane).
__m256d _mm256_insertf128_pd(int imm8)(__m256d a, __m128d b) pure @trusted
{
    static if (GDC_with_AVX)
    {
        enum ubyte lane = imm8 & 1;
        return __builtin_ia32_vinsertf128_pd256(a, b, lane);
    }
    else
    {
        __m256d r = a;
        // Lane 0 starts at element 0, lane 1 at element 2.
        enum int index = (imm8 & 1) ? 2 : 0;
        r.ptr[index] = b.array[0];
        r.ptr[index+1] = b.array[1];
        return r;
    }
}
1570 
/// Copy `a` then insert 128 bits (composed of 4 packed single-precision (32-bit) floating-point
/// elements) from `b`, at the location specified by `imm8` (0 = low lane, 1 = high lane).
__m256 _mm256_insertf128_ps(int imm8)(__m256 a, __m128 b) pure @trusted
{
    static if (GDC_with_AVX)
    {
        enum ubyte lane = imm8 & 1;
        return __builtin_ia32_vinsertf128_ps256(a, b, lane);
    }
    else
    {
        __m256 r = a;
        // Lane 0 starts at element 0, lane 1 at element 4.
        enum int index = (imm8 & 1) ? 4 : 0;
        r.ptr[index] = b.array[0];
        r.ptr[index+1] = b.array[1];
        r.ptr[index+2] = b.array[2];
        r.ptr[index+3] = b.array[3];
        return r;
    }
}
1591 
/// Copy `a`, then insert 128 bits from `b` at the location specified by `imm8`
/// (0 = low lane, 1 = high lane).
__m256i _mm256_insertf128_si256(int imm8)(__m256i a, __m128i b) pure @trusted
{
    static if (GDC_with_AVX)
    {
        enum ubyte lane = imm8 & 1;
        return cast(__m256i) __builtin_ia32_vinsertf128_si256 (cast(int8)a, b, lane);
    }
    else
    {
        long2 lb = cast(long2)b;
        __m256i r = a;
        // Lane 0 starts at 64-bit element 0, lane 1 at element 2.
        enum int index = (imm8 & 1) ? 2 : 0;
        r.ptr[index] = lb.array[0];
        r.ptr[index+1] = lb.array[1];
        return r;
    }
}
1610 
/// Load 256-bits of integer data from unaligned memory into dst. 
/// This intrinsic may run better than `_mm256_loadu_si256` when the data crosses a cache 
/// line boundary.
__m256i _mm256_lddqu_si256(const(__m256i)* mem_addr) @trusted
{
    // PERF DMD
    static if (GDC_or_LDC_with_AVX)
    {
        return cast(__m256i) __builtin_ia32_lddqu256(cast(const(char)*)mem_addr);
    }
    else
        // Without the vlddqu builtin, a plain unaligned load is equivalent.
        return _mm256_loadu_si256(mem_addr);
}
unittest
{
    // Deliberately misaligned source (&correct[1]) to exercise the unaligned path.
    int[10] correct = [0, -1, 2, -3, 4, 9, -7, 8, -6, 34];
    int8 A = cast(int8) _mm256_lddqu_si256(cast(__m256i*) &correct[1]);
    assert(A.array == correct[1..9]);
}
1630 
/// Load 256-bits (composed of 4 packed double-precision (64-bit) floating-point elements) 
/// from memory. `mem_addr` must be aligned on a 32-byte boundary or a general-protection 
/// exception may be generated.
__m256d _mm256_load_pd (const(double)* mem_addr) pure @trusted
{
    // A plain aligned vector dereference.
    return *cast(__m256d*)mem_addr;
}
unittest
{
    static immutable align(32) double[4] correct = [1.0, 2.0, 3.5, -42.0];
    __m256d A = _mm256_load_pd(correct.ptr);
    assert(A.array == correct);
}
1644 
/// Load 256-bits (composed of 8 packed single-precision (32-bit) 
/// floating-point elements) from memory. 
/// `mem_addr` must be aligned on a 32-byte boundary or a 
/// general-protection exception may be generated.
__m256 _mm256_load_ps (const(float)* mem_addr) pure @trusted
{
    // A plain aligned vector dereference.
    return *cast(__m256*)mem_addr;
}
unittest
{
    static immutable align(32) float[8] correct = 
        [1.0, 2.0, 3.5, -42.0, 7.43f, 0.0f, 3, 2];
    __m256 A = _mm256_load_ps(correct.ptr);
    assert(A.array == correct);
}
1660 
/// Load 256-bits of integer data from memory. `mem_addr` does not need to be aligned on
/// any particular boundary.
// See this dlang forum post => https://forum.dlang.org/thread/vymrsngsfibkmqsqffce@forum.dlang.org
__m256i _mm256_loadu_si256 (const(__m256i)* mem_addr) pure @trusted
{
    // PERF DMD
    static if (GDC_with_AVX)
    {
        return cast(__m256i) __builtin_ia32_loaddqu256(cast(const(char)*) mem_addr);
    }
    else static if (LDC_with_optimizations)
    {
        return loadUnaligned!(__m256i)(cast(long*)mem_addr);
    }
    else
    {
        // Generic fallback: four scalar 64-bit loads (no alignment requirement).
        const(long)* p = cast(const(long)*)mem_addr; 
        long4 r;
        r.ptr[0] = p[0];
        r.ptr[1] = p[1];
        r.ptr[2] = p[2];
        r.ptr[3] = p[3];
        return r;
    }
}
unittest
{
    align(16) int[8] correct = [-1, 2, -3, 4, 9, -7, 8, -6];
    int8 A = cast(int8) _mm256_loadu_si256(cast(__m256i*) correct.ptr);
    assert(A.array == correct);
}
1692 
/// Load 256-bits of integer data from memory. `mem_addr` must be aligned on a 
/// 32-byte boundary or a general-protection exception may be generated.
__m256i _mm256_load_si256 (const(void)* mem_addr) pure @system
{
    // A plain aligned vector dereference.
    return *cast(__m256i*)mem_addr;
}
unittest
{
    static immutable align(64) long[4] correct = [1, -2, long.min, long.max];
    __m256i A = _mm256_load_si256(correct.ptr);
    assert(A.array == correct);
}
1705 
/// Load 256-bits (composed of 4 packed double-precision (64-bit) floating-point elements) 
/// from memory. `mem_addr` does not need to be aligned on any particular boundary.
__m256d _mm256_loadu_pd (const(void)* mem_addr) pure @system
{
    // PERF DMD
    static if (GDC_with_AVX)
    {
        return __builtin_ia32_loadupd256 ( cast(const(double)*) mem_addr);
    }
    else static if (LDC_with_optimizations)
    {
        return loadUnaligned!(__m256d)(cast(double*)mem_addr);
    }    
    else
    {
        // Generic fallback: four scalar loads (no alignment requirement).
        const(double)* p = cast(const(double)*)mem_addr; 
        double4 r;
        r.ptr[0] = p[0];
        r.ptr[1] = p[1];
        r.ptr[2] = p[2];
        r.ptr[3] = p[3];
        return r;
    }
}
unittest
{
    double[4] correct = [1.0, -2.0, 0.0, 768.5];
    __m256d A = _mm256_loadu_pd(correct.ptr);
    assert(A.array == correct);
}
1736 
/// Load 256-bits (composed of 8 packed single-precision (32-bit) floating-point elements) from memory.
/// `mem_addr` does not need to be aligned on any particular boundary.
__m256 _mm256_loadu_ps (const(float)* mem_addr) pure @system
{
    // PERF DMD
    static if (GDC_with_AVX)
    {
        return __builtin_ia32_loadups256 ( cast(const(float)*) mem_addr);
    }
    else static if (LDC_with_optimizations)
    {
        return loadUnaligned!(__m256)(cast(float*)mem_addr);
    }    
    else
    {
        // Generic fallback: eight scalar loads (no alignment requirement).
        const(float)* p = cast(const(float)*)mem_addr; 
        float8 r = void;
        r.ptr[0] = p[0];
        r.ptr[1] = p[1];
        r.ptr[2] = p[2];
        r.ptr[3] = p[3];
        r.ptr[4] = p[4];
        r.ptr[5] = p[5];
        r.ptr[6] = p[6];
        r.ptr[7] = p[7];
        return r;
    }
}
unittest
{
    // Deliberately misaligned source (&correct[1]) to exercise the unaligned path.
    align(32) float[10] correct = [0.0f, 1, 2, 3, 4, 5, 6, 7, 8, 9];
    __m256 A = _mm256_loadu_ps(&correct[1]);
    assert(A.array == correct[1..9]);
}
1771 
/// Load two 128-bit values (composed of 4 packed single-precision (32-bit) floating-point 
/// elements) from memory, and combine them into a 256-bit value. 
/// `hiaddr` and `loaddr` do not need to be aligned on any particular boundary.
__m256 _mm256_loadu2_m128 (const(float)* hiaddr, const(float)* loaddr) pure @system
{
    // Note: no particular instruction for this in x86.
    return _mm256_set_m128(_mm_loadu_ps(hiaddr), _mm_loadu_ps(loaddr));
}
unittest
{
    // Deliberately misaligned sources (&A[1], &B[1]).
    align(32) float[6] A = [4.5f, 2, 8, 97, -1, 3];
    align(32) float[6] B = [6.5f, 3, 9, 98, -2, 4];
    __m256 R = _mm256_loadu2_m128(&B[1], &A[1]);
    float[8] correct = [2.0f, 8, 97, -1, 3, 9, 98, -2];
    assert(R.array == correct);
}
1788 
/// Load two 128-bit values (composed of 2 packed double-precision (64-bit) floating-point
/// elements) from memory, and combine them into a 256-bit value. 
/// `hiaddr` and `loaddr` do not need to be aligned on any particular boundary.
__m256d _mm256_loadu2_m128d (const(double)* hiaddr, const(double)* loaddr) pure @system
{
    // Note: no particular instruction for this in x86.
    return _mm256_set_m128d(_mm_loadu_pd(hiaddr), _mm_loadu_pd(loaddr));
}
unittest
{
    // Deliberately misaligned sources (&A[1], &B[1]).
    align(32) double[4] A = [4.5f, 2, 8, 97];
    align(32) double[4] B = [6.5f, 3, 9, 98];
    __m256d R = _mm256_loadu2_m128d(&B[1], &A[1]);
    double[4] correct = [2.0, 8, 3, 9];
    assert(R.array == correct);
}
1805 
/// Load two 128-bit values (composed of integer data) from memory, and combine them into a 
/// 256-bit value. `hiaddr` and `loaddr` do not need to be aligned on any particular boundary.
__m256i _mm256_loadu2_m128i (const(__m128i)* hiaddr, const(__m128i)* loaddr) pure @trusted
{
    // Note: no particular instruction for this in x86.
    return _mm256_set_m128i(_mm_loadu_si128(hiaddr), _mm_loadu_si128(loaddr));
}
unittest
{
    // Deliberately misaligned sources (&A[1], &B[1]).
    align(32) long[4] A = [5, 2, 8, 97];
    align(32) long[4] B = [6, 3, 9, 98];
    __m256i R = _mm256_loadu2_m128i(cast(const(__m128i)*) &B[1], cast(const(__m128i)*)  &A[1]);
    long[4] correct = [2, 8, 3, 9];
    assert(R.array == correct);
}
1821 
// Guard used by the mask-load unittests below: DMD older than 2.099 miscompiles
// them with `-a x86 -O`, so those tests are skipped on affected compilers.
version(DigitalMars)
{
    // this avoids a bug with DMD < 2.099 -a x86 -O
    private enum bool maskLoadWorkaroundDMD = (__VERSION__ < 2099);
}
else
{
    // Other compilers (LDC, GDC) are unaffected.
    private enum bool maskLoadWorkaroundDMD = false;
}
1831 
/// Load packed double-precision (64-bit) floating-point elements from memory using `mask` 
/// (elements are zeroed out when the high bit of the corresponding element is not set).
/// Note: emulating that instruction isn't efficient, since it needs to perform memory access
/// only when needed.
/// See: "Note about mask load/store" to know why you must address valid memory only.
__m128d _mm_maskload_pd (const(double)* mem_addr, __m128i mask) /* pure */ @system
{
    // PERF DMD
    static if (LDC_with_AVX)
    {
        // MAYDO report that the builtin is impure
        return __builtin_ia32_maskloadpd(mem_addr, cast(long2)mask);
    }
    else static if (GDC_with_AVX)
    {
        // GDC's builtin takes a vector pointer rather than a scalar one.
        return __builtin_ia32_maskloadpd(cast(double2*)mem_addr, cast(long2)mask);
    }
    else
    {
        // Fallback: load the full 16 bytes, then select with the mask sign bits.
        // This is why the whole span must be addressable (see note at top of file).
        __m128d a = _mm_loadu_pd(mem_addr);
        __m128d zero = _mm_setzero_pd();
        return _mm_blendv_pd(zero, a, cast(double2)mask);
    }
}
unittest
{
    static if (!maskLoadWorkaroundDMD) 
    {
        double[2] A = [7.5, 1];
        double2 B = _mm_maskload_pd(A.ptr, _mm_setr_epi64(-1, 1));
        double[2] correct = [7.5, 0];
        assert(B.array == correct);
    }
}
1866 
/// Load packed double-precision (64-bit) floating-point elements from memory using `mask`
/// (elements are zeroed out when the high bit of the corresponding element is not set).
/// See: "Note about mask load/store" to know why you must address valid memory only.
__m256d _mm256_maskload_pd (const(double)* mem_addr, __m256i mask) /*pure*/ @system
{
    // PERF DMD
    static if (LDC_with_AVX)
    {
        // MAYDO that the builtin is impure
        return __builtin_ia32_maskloadpd256(mem_addr, mask);
    }
    else static if (GDC_with_AVX)
    {
        // GDC's builtin takes a vector pointer rather than a scalar one.
        return __builtin_ia32_maskloadpd256(cast(double4*)mem_addr, mask);
    }
    else
    {
        // Fallback: load the full 32 bytes, then select with the mask sign bits.
        // This is why the whole span must be addressable (see note at top of file).
        __m256d a = _mm256_loadu_pd(mem_addr);
        __m256d zero = _mm256_setzero_pd();
        return _mm256_blendv_pd(zero, a, cast(double4)mask);
    }
}
unittest
{
    static if (!maskLoadWorkaroundDMD)
    {
        double[4] A = [7.5, 1, 2, 3];
        double4 B = _mm256_maskload_pd(A.ptr, _mm256_setr_epi64(1, -1, -1, 1));
        double[4] correct = [0.0, 1, 2, 0];
        assert(B.array == correct);
    }
}
1899 
/// Load packed single-precision (32-bit) floating-point elements from memory using mask (elements
/// are zeroed out when the high bit of the corresponding element is not set).
/// Warning: See "Note about mask load/store" to know why you must address valid memory only.
__m128 _mm_maskload_ps (const(float)* mem_addr, __m128i mask) /* pure */ @system
{
    // PERF DMD
    static if (LDC_with_AVX)
    {
        // MAYDO report that the builtin is impure
        return __builtin_ia32_maskloadps(mem_addr, mask);
    }
    else static if (GDC_with_AVX)
    {
        // GDC's builtin takes a vector pointer rather than a scalar one.
        return __builtin_ia32_maskloadps(cast(float4*)mem_addr, mask);
    }
    else
    {
        // Fallback: load the full 16 bytes, then select with the mask sign bits.
        // This is why the whole span must be addressable (see note at top of file).
        __m128 a = _mm_loadu_ps(mem_addr);
        __m128 zero = _mm_setzero_ps();
        return _mm_blendv_ps(zero, a, cast(float4)mask);
    }
}
unittest
{
    static if (!maskLoadWorkaroundDMD)
    {
        float[4] A = [7.5f, 1, 2, 3];
        float4 B = _mm_maskload_ps(A.ptr, _mm_setr_epi32(1, -1, -1, 1));  // can NOT address invalid memory!
        float[4] correct = [0.0f, 1, 2, 0];
        assert(B.array == correct);
    }
}
1932 
/// Load packed single-precision (32-bit) floating-point elements from memory using `mask`
/// (elements are zeroed out when the high bit of the corresponding element is not set).
/// Note: emulating that instruction isn't efficient, since it needs to perform memory access
/// only when needed.
/// See: "Note about mask load/store" to know why you must address valid memory only.
__m256 _mm256_maskload_ps (const(float)* mem_addr, __m256i mask) /*pure*/ @system
{
    // PERF DMD
    static if (LDC_with_AVX)
    {
        // MAYDO report that the builtin is impure
        return __builtin_ia32_maskloadps256(mem_addr, cast(int8)mask);
    }
    else static if (GDC_with_AVX)
    {
        // GDC's builtin takes a vector pointer rather than a scalar one.
        return __builtin_ia32_maskloadps256(cast(float8*)mem_addr, cast(int8)mask);
    }
    else
    {
        // Fallback: load the full 32 bytes, then select with the mask sign bits.
        // This is why the whole span must be addressable (see note at top of file).
        __m256 a = _mm256_loadu_ps(mem_addr);
        __m256 zero = _mm256_setzero_ps();
        return _mm256_blendv_ps(zero, a, cast(float8)mask);
    }
}
unittest
{
    float[8] A                  = [1,   7.5f,  1,  2, 3,  4,  5, 6];
    __m256i  M = _mm256_setr_epi32(1,     -1,  1, -1, 1, -1, -1, 1);
    float8 B = _mm256_maskload_ps(A.ptr, M);
    float[8] correct =            [0.0f, 7.5f, 0,  2, 0,  4,  5, 0];
    assert(B.array == correct);
}
1965 
/// Store packed double-precision (64-bit) floating-point elements from `a` into memory using `mask`.
/// Note: emulating that instruction isn't efficient, since it needs to perform memory access
/// only when needed.
/// See: "Note about mask load/store" to know why you must address valid memory only.
void _mm_maskstore_pd (double * mem_addr, __m128i mask, __m128d a) /* pure */ @system
{
    // PERF DMD
    static if (LDC_with_AVX)
    {
        // MAYDO that the builtin is impure
        __builtin_ia32_maskstorepd(mem_addr, cast(long2)mask, a);
    }
    else static if (GDC_with_AVX)
    {
        // GDC's builtin takes a vector pointer rather than a scalar one.
        __builtin_ia32_maskstorepd(cast(double2*)mem_addr, cast(long2)mask, a);
    }
    else
    {
        // Fallback: read-modify-write the whole 16 bytes, blending `a` in
        // where the mask sign bit is set. This both reads and writes
        // unselected lanes, hence the tightened addressing requirement.
        __m128d source = _mm_loadu_pd(mem_addr);
        __m128d r = _mm_blendv_pd(source, a, cast(double2) mask);
        _mm_storeu_pd(mem_addr, r);
    }
}
unittest
{
    double[2] A = [0.0, 1.0];
    __m128i M = _mm_setr_epi64(-1, 0);
    __m128d B = _mm_setr_pd(2.0, 3.0);
    _mm_maskstore_pd(A.ptr, M, B);
    double[2] correct = [2.0, 1.0];
    assert(A == correct);
}
1998 
1999 
/// Store packed double-precision (64-bit) floating-point elements from `a` into memory using `mask`.
/// See: "Note about mask load/store" to know why you must address valid memory only.
// Not available when the LDC 32-bit x86 256-bit-stack workaround is active.
static if (!llvm256BitStackWorkaroundIn32BitX86)
{
    void _mm256_maskstore_pd (double * mem_addr, __m256i mask, __m256d a) /* pure */ @system
    {
        // PERF DMD
        static if (LDC_with_AVX)
        {
            // MAYDO that the builtin is impure
            __builtin_ia32_maskstorepd256(mem_addr, cast(long4)mask, a);
        }
        else static if (GDC_with_AVX)
        {
            // GDC's builtin takes a vector pointer rather than a scalar one.
            __builtin_ia32_maskstorepd256(cast(double4*)mem_addr, cast(long4)mask, a);
        }
        else
        {
            // Fallback: read-modify-write the whole 32 bytes, blending `a` in
            // where the mask sign bit is set.
            __m256d source = _mm256_loadu_pd(mem_addr);
            __m256d r = _mm256_blendv_pd(source, a, cast(double4) mask);
            _mm256_storeu_pd(mem_addr, r);
        }
    }
    unittest
    {
        double[4] A = [0.0, 1, 2, 3];
        __m256i M = _mm256_setr_epi64x(-9, 0, -1, 0);
        __m256d B = _mm256_setr_pd(2, 3, 4, 5);
        _mm256_maskstore_pd(A.ptr, M, B);
        double[4] correct = [2.0, 1, 4, 3];
        assert(A == correct);
    }
}
2033 
/// Store packed single-precision (32-bit) floating-point elements from `a` into memory using `mask`.
/// Note: emulating that instruction isn't efficient, since it needs to perform memory access
/// only when needed.
/// See: "Note about mask load/store" to know why you must address valid memory only.
void _mm_maskstore_ps (float * mem_addr, __m128i mask, __m128 a)  /* pure */ @system
{
    // PERF DMD
    static if (LDC_with_AVX)
    {
        // MAYDO report that the builtin is impure
        __builtin_ia32_maskstoreps(mem_addr, mask, a);
    }
    else static if (GDC_with_AVX)
    {
        // GDC's builtin takes a vector pointer rather than a scalar one.
        __builtin_ia32_maskstoreps(cast(float4*)mem_addr, mask, a);
    }
    else
    {
        // Fallback: read-modify-write the whole 16 bytes, blending `a` in
        // where the mask sign bit is set.
        __m128 source = _mm_loadu_ps(mem_addr);
        __m128 r = _mm_blendv_ps(source, a, cast(float4) mask);
        _mm_storeu_ps(mem_addr, r);
    }
}
unittest
{
    float[4] A = [0.0f, 1, 2, 6];
    __m128i M = _mm_setr_epi32(-1, 0, -1, 0);
    __m128 B = _mm_setr_ps(2, 3, 4, 5);
    _mm_maskstore_ps(A.ptr, M, B);
    float[4] correct = [2.0f, 1, 4, 6];
    assert(A == correct);
}
2066 
// Not available when the LDC 32-bit x86 256-bit-stack workaround is active.
static if (!llvm256BitStackWorkaroundIn32BitX86)
{
    /// Store packed single-precision (32-bit) floating-point elements from `a` into memory using `mask`.
    /// See: "Note about mask load/store" to know why you must address valid memory only.
    void _mm256_maskstore_ps (float * mem_addr, __m256i mask, __m256 a) /* pure */ @system
    {
        // PERF DMD
        static if (LDC_with_AVX)
        {
            // MAYDO report that the builtin is impure
            __builtin_ia32_maskstoreps256(mem_addr, cast(int8)mask, a);
        }
        else static if (GDC_with_AVX)
        {
            // GDC's builtin takes a vector pointer rather than a scalar one.
            __builtin_ia32_maskstoreps256(cast(float8*)mem_addr, cast(int8)mask, a);
        }
        else
        {
            // Fallback: read-modify-write the whole 32 bytes, blending `a` in
            // where the mask sign bit is set.
            __m256 source = _mm256_loadu_ps(mem_addr);
            __m256 r = _mm256_blendv_ps(source, a, cast(float8) mask);
            _mm256_storeu_ps(mem_addr, r);
        }
    }
    unittest
    {
        float[8] A                 = [0.0f, 0, 1,  2, 3,  4,  5, 7];
        __m256i M = _mm256_setr_epi32(  0, -1, 0, -1, 0, -1, -1, 0);
        __m256 B = _mm256_set1_ps(6.0f);
        _mm256_maskstore_ps(A.ptr, M, B);
        float[8] correct           = [0.0f, 6, 1,  6, 3,  6,  6, 7];
        assert(A == correct);
    }
}
2100 
/// Compare packed double-precision (64-bit) floating-point elements in `a` and `b`, and return 
/// packed maximum values.
__m256d _mm256_max_pd (__m256d a, __m256d b) pure @trusted
{
    // PERF DMD
    static if (GDC_or_LDC_with_AVX)
    {
        return __builtin_ia32_maxpd256(a, b);
    }
    else
    {
        // LDC: becomes good in -O2
        // PERF: GDC without AVX
        // `(a > b) ? a : b` returns `b` when equal or when either is NaN,
        // which matches the asymmetric behavior of the maxpd instruction.
        a.ptr[0] = (a.array[0] > b.array[0]) ? a.array[0] : b.array[0];
        a.ptr[1] = (a.array[1] > b.array[1]) ? a.array[1] : b.array[1];
        a.ptr[2] = (a.array[2] > b.array[2]) ? a.array[2] : b.array[2];
        a.ptr[3] = (a.array[3] > b.array[3]) ? a.array[3] : b.array[3];
        return a;
    }
}
unittest
{
    __m256d A = _mm256_setr_pd(4.0, 1.0, -9.0, double.infinity);
    __m256d B = _mm256_setr_pd(1.0, 8.0,  0.0, 100000.0);
    __m256d M = _mm256_max_pd(A, B);
    double[4] correct =       [4.0, 8.0, 0.0, double.infinity];
    assert(M.array == correct); // was missing: `correct` was computed but never checked
}
2128 
/// Compare packed single-precision (32-bit) floating-point elements in `a` and `b`, and return 
/// packed maximum values.
__m256 _mm256_max_ps (__m256 a, __m256 b) pure @trusted
{
    // PERF DMD
    static if (GDC_or_LDC_with_AVX)
    {
        return __builtin_ia32_maxps256(a, b);
    }
    else
    {
        // LDC: becomes good in -O2, but looks brittle.
        // PERF GDC without AVX
        // `(a > b) ? a : b` returns `b` when equal or when either is NaN,
        // which matches the asymmetric behavior of the maxps instruction.
        a.ptr[0] = (a.array[0] > b.array[0]) ? a.array[0] : b.array[0];
        a.ptr[1] = (a.array[1] > b.array[1]) ? a.array[1] : b.array[1];
        a.ptr[2] = (a.array[2] > b.array[2]) ? a.array[2] : b.array[2];
        a.ptr[3] = (a.array[3] > b.array[3]) ? a.array[3] : b.array[3];
        a.ptr[4] = (a.array[4] > b.array[4]) ? a.array[4] : b.array[4];
        a.ptr[5] = (a.array[5] > b.array[5]) ? a.array[5] : b.array[5];
        a.ptr[6] = (a.array[6] > b.array[6]) ? a.array[6] : b.array[6];
        a.ptr[7] = (a.array[7] > b.array[7]) ? a.array[7] : b.array[7];
        return a;
    }
}
unittest
{
    __m256 A = _mm256_setr_ps(4.0, 1.0, -9.0, float.infinity, 1, 2, 3, 4);
    __m256 B = _mm256_setr_ps(1.0, 8.0,  0.0, 100000.0f     , 4, 3, 2, 1);
    __m256 M = _mm256_max_ps(A, B);
    float[8] correct =       [4.0, 8.0,  0.0, float.infinity , 4, 3, 3, 4];
    assert(M.array == correct); // was missing: `correct` was computed but never checked
}
2160 
/// Compare packed double-precision (64-bit) floating-point elements in `a` and `b`, and return 
/// packed minimum values.
__m256d _mm256_min_pd (__m256d a, __m256d b) pure @trusted
{
    // PERF DMD
    static if (GDC_or_LDC_with_AVX)
    {
        return __builtin_ia32_minpd256(a, b);
    }
    else
    {
        // LDC: becomes good in -O2
        // PERF: GDC without AVX
        // `(a < b) ? a : b` returns `b` when equal or when either is NaN,
        // which matches the asymmetric behavior of the minpd instruction.
        a.ptr[0] = (a.array[0] < b.array[0]) ? a.array[0] : b.array[0];
        a.ptr[1] = (a.array[1] < b.array[1]) ? a.array[1] : b.array[1];
        a.ptr[2] = (a.array[2] < b.array[2]) ? a.array[2] : b.array[2];
        a.ptr[3] = (a.array[3] < b.array[3]) ? a.array[3] : b.array[3];
        return a;
    }
}
unittest
{
    __m256d A = _mm256_setr_pd(4.0, 1.0, -9.0, double.infinity);
    __m256d B = _mm256_setr_pd(1.0, 8.0,  0.0, 100000.0);
    __m256d M = _mm256_min_pd(A, B);
    double[4] correct =       [1.0, 1.0, -9.0, 100000.0];
    assert(M.array == correct); // was missing: `correct` was computed but never checked
}
2188 
/// Compare packed single-precision (32-bit) floating-point elements in `a` and `b`, and return 
/// packed minimum values.
__m256 _mm256_min_ps (__m256 a, __m256 b) pure @trusted
{
    // PERF DMD
    static if (GDC_or_LDC_with_AVX)
    {
        return __builtin_ia32_minps256(a, b);
    }
    else
    {
        // LDC: becomes good in -O2, but looks brittle.
        // PERF GDC without AVX
        // `(a < b) ? a : b` returns `b` when equal or when either is NaN,
        // which matches the asymmetric behavior of the minps instruction.
        a.ptr[0] = (a.array[0] < b.array[0]) ? a.array[0] : b.array[0];
        a.ptr[1] = (a.array[1] < b.array[1]) ? a.array[1] : b.array[1];
        a.ptr[2] = (a.array[2] < b.array[2]) ? a.array[2] : b.array[2];
        a.ptr[3] = (a.array[3] < b.array[3]) ? a.array[3] : b.array[3];
        a.ptr[4] = (a.array[4] < b.array[4]) ? a.array[4] : b.array[4];
        a.ptr[5] = (a.array[5] < b.array[5]) ? a.array[5] : b.array[5];
        a.ptr[6] = (a.array[6] < b.array[6]) ? a.array[6] : b.array[6];
        a.ptr[7] = (a.array[7] < b.array[7]) ? a.array[7] : b.array[7];
        return a;
    }
}
unittest
{
    __m256 A = _mm256_setr_ps(4.0, 1.0, -9.0, float.infinity, 1, 2, 3, 4);
    __m256 B = _mm256_setr_ps(1.0, 8.0,  0.0, 100000.0f     , 4, 3, 2, 1);
    __m256 M = _mm256_min_ps(A, B);
    float[8] correct =       [1.0, 1.0, -9.0, 100000.0f     , 1, 2, 2, 1];
    assert(M.array == correct); // was missing: `correct` was computed but never checked
}
2220 
/// Duplicate even-indexed double-precision (64-bit) floating-point elements from `a`.
__m256d _mm256_movedup_pd (__m256d a) @trusted
{
    // PERF DMD
    static if (GDC_with_AVX)
    {
        return __builtin_ia32_movddup256 (a);
    }
    else
    {
        // Build the result lane by lane: each pair takes its even element.
        __m256d result;
        result.ptr[0] = a.array[0];
        result.ptr[1] = a.array[0];
        result.ptr[2] = a.array[2];
        result.ptr[3] = a.array[2];
        return result;
    }
}
unittest
{
    __m256d A = _mm256_setr_pd(1.0, 2, 3, 4);
    A = _mm256_movedup_pd(A);
    double[4] correct = [1.0, 1, 3, 3];
    assert(A.array == correct);
}
2243 
/// Duplicate odd-indexed single-precision (32-bit) floating-point elements from `a`.
__m256 _mm256_movehdup_ps (__m256 a) @trusted
{
    // PERF DMD
    static if (GDC_with_AVX)
    {
        return __builtin_ia32_movshdup256 (a);
    }
    else
    {
        // Build the result lane by lane: each pair takes its odd element.
        __m256 result;
        result.ptr[0] = a.array[1];
        result.ptr[1] = a.array[1];
        result.ptr[2] = a.array[3];
        result.ptr[3] = a.array[3];
        result.ptr[4] = a.array[5];
        result.ptr[5] = a.array[5];
        result.ptr[6] = a.array[7];
        result.ptr[7] = a.array[7];
        return result;
    }
}
unittest
{
    __m256 A = _mm256_setr_ps(1.0f, 2, 3, 4, 5, 6, 7, 8);
    A = _mm256_movehdup_ps(A);
    float[8] correct = [2.0, 2, 4, 4, 6, 6, 8, 8];
    assert(A.array == correct);
}
2268 
/// Duplicate even-indexed single-precision (32-bit) floating-point elements from `a`.
__m256 _mm256_moveldup_ps (__m256 a) @trusted
{
    // PERF DMD
    static if (GDC_with_AVX)
    {
        return __builtin_ia32_movsldup256 (a);
    }
    else
    {
        // Build the result lane by lane: each pair takes its even element.
        __m256 result;
        result.ptr[0] = a.array[0];
        result.ptr[1] = a.array[0];
        result.ptr[2] = a.array[2];
        result.ptr[3] = a.array[2];
        result.ptr[4] = a.array[4];
        result.ptr[5] = a.array[4];
        result.ptr[6] = a.array[6];
        result.ptr[7] = a.array[6];
        return result;
    }
}
unittest
{
    __m256 A = _mm256_setr_ps(1.0f, 2, 3, 4, 5, 6, 7, 8);
    A = _mm256_moveldup_ps(A);
    float[8] correct = [1.0, 1, 3, 3, 5, 5, 7, 7];
    assert(A.array == correct);
}
2293 
/// Set each bit of result mask based on the most significant bit of the corresponding packed 
/// double-precision (64-bit) floating-point element in `a`.
int _mm256_movemask_pd (__m256d a) @safe
{
    // PERF DMD
    static if (GDC_or_LDC_with_AVX)
    {
        return __builtin_ia32_movmskpd256(a);
    }
    else static if (LDC_with_SSE2)
    {
        // this doesn't benefit GDC, and not clear for arm64.
        // Split into two 128-bit halves and combine the two 2-bit masks.
        __m128d A_lo = _mm256_extractf128_pd!0(a);
        __m128d A_hi = _mm256_extractf128_pd!1(a);

        return (_mm_movemask_pd(A_hi) << 2) | _mm_movemask_pd(A_lo);
    }
    else
    {
        // Fortunately, branchless on arm64
        // A negative 64-bit integer has its sign (most significant) bit set,
        // which is exactly the bit movmskpd extracts.
        long4 lv = cast(long4)a;
        int r = 0;
        if (lv.array[0] < 0) r += 1;
        if (lv.array[1] < 0) r += 2;
        if (lv.array[2] < 0) r += 4;
        if (lv.array[3] < 0) r += 8;
        return r;
    }
}
unittest
{
    __m256d A = _mm256_setr_pd(-1, -double.infinity, 0, -1);
    assert(_mm256_movemask_pd(A) == 1 + 2 + 8);
}
2328 
/// Set each bit of mask result based on the most significant bit of the corresponding packed 
/// single-precision (32-bit) floating-point element in `a`.
int _mm256_movemask_ps (__m256 a) @system
{
    // PERF DMD
    // PERF GDC without AVX
    static if (GDC_or_LDC_with_AVX)
    {
        return __builtin_ia32_movmskps256(a);
    }
    else version(LDC)
    {
        // this doesn't benefit GDC (unable to inline), but benefits both LDC with SSE2 and ARM64
        // Split into two 128-bit halves and combine the two 4-bit masks.
        __m128 A_lo = _mm256_extractf128_ps!0(a);
        __m128 A_hi = _mm256_extractf128_ps!1(a);
        return (_mm_movemask_ps(A_hi) << 4) | _mm_movemask_ps(A_lo);
    }
    else
    {
        // A negative 32-bit integer has its sign (most significant) bit set,
        // which is exactly the bit movmskps extracts.
        int8 lv = cast(int8)a;
        int r = 0;
        if (lv.array[0] < 0) r += 1;
        if (lv.array[1] < 0) r += 2;
        if (lv.array[2] < 0) r += 4;
        if (lv.array[3] < 0) r += 8;
        if (lv.array[4] < 0) r += 16;
        if (lv.array[5] < 0) r += 32;
        if (lv.array[6] < 0) r += 64;
        if (lv.array[7] < 0) r += 128;
        return r;
    }
}
unittest
{
    __m256 A = _mm256_setr_ps(-1, -double.infinity, 0, -1, 1, double.infinity, -2, double.nan);
    assert(_mm256_movemask_ps(A) == 1 + 2 + 8 + 64);
}
2366 
/// Multiply packed double-precision (64-bit) floating-point elements in `a` and `b`.
__m256d _mm256_mul_pd (__m256d a, __m256d b) pure @safe
{
    // Element-wise vector multiply; maps directly to vmulpd.
    __m256d product = a * b;
    return product;
}
unittest
{
    __m256d v = [-2.0, 1.5, -2.0, 1.5];
    v = _mm256_mul_pd(v, v);
    assert(v.array == [4.0, 2.25, 4.0, 2.25]);
}
2378 
/// Multiply packed single-precision (32-bit) floating-point elements in `a` and `b`.
__m256 _mm256_mul_ps (__m256 a, __m256 b) pure @safe
{
    // Element-wise vector multiply; maps directly to vmulps.
    __m256 product = a * b;
    return product;
}
unittest
{
    __m256 v = [1.5f, -2.0f, 3.0f, 1.0f, 1.5f, -2.0f, 3.0f, 1.0f];
    v = _mm256_mul_ps(v, v);
    float[8] expected = [2.25f, 4.0f, 9.0f, 1.0f, 2.25f, 4.0f, 9.0f, 1.0f];
    assert(v.array == expected);
}
2391 
2392 
/// Compute the bitwise NOT of 256 bits in `a`. #BONUS
__m256i _mm256_not_si256 (__m256i a) pure @safe
{
    // Flip every one of the 256 bits.
    __m256i flipped = ~a;
    return flipped;
}
unittest
{
    __m256i A = _mm256_set1_epi64x(-748);
    long4 notA = cast(long4) _mm256_not_si256(A);
    long[4] expected = [747, 747, 747, 747]; // ~(-748) == 747
    assert(notA.array == expected);
}
2405 
/// Compute the bitwise OR of packed double-precision (64-bit) floating-point elements in `a` and `b`.
__m256d _mm256_or_pd (__m256d a, __m256d b) pure @safe
{
    // OR the raw bit patterns, then reinterpret back as doubles.
    __m256i ia = cast(__m256i) a;
    __m256i ib = cast(__m256i) b;
    return cast(__m256d) (ia | ib);
}
2411 
/// Compute the bitwise OR of packed single-precision (32-bit) floating-point elements in `a` and `b`.
__m256 _mm256_or_ps (__m256 a, __m256 b) pure @safe
{
    // OR the raw bit patterns, then reinterpret back as floats.
    __m256i ia = cast(__m256i) a;
    __m256i ib = cast(__m256i) b;
    return cast(__m256) (ia | ib);
}
2417 
/// Shuffle double-precision (64-bit) floating-point elements in `a` using the control in `imm8`.
__m128d _mm_permute_pd(int imm8)(__m128d a) pure @trusted
{
    static if (GDC_with_AVX)
    {
        return __builtin_ia32_vpermilpd(a, imm8 & 3);
    }
    else
    {
        // Shufflevector not particularly better for LDC here
        // Bit 0 of imm8 selects the source for lane 0, bit 1 for lane 1.
        __m128d r;
        r.ptr[0] = a.array[imm8 & 1];
        r.ptr[1] = a.array[(imm8 >> 1) & 1];
        return r;
    }
}
unittest
{
    __m128d A = _mm_setr_pd(5, 6);
    __m128d B = _mm_permute_pd!1(A);
    __m128d C = _mm_permute_pd!3(A);
    double[2] RB = [6, 5];
    double[2] RC = [6, 6];
    assert(B.array == RB);
    assert(C.array == RC);
}
2444 
///ditto
__m256d _mm256_permute_pd(int imm8)(__m256d a) pure @trusted
{
    // PERF DMD
    static if (GDC_with_AVX)
    {
        return __builtin_ia32_vpermilpd256(a, imm8 & 15);
    }
    else version(LDC)
    {
        // Two independent in-lane shuffles: bits 0-1 control the low 128-bit
        // lane, bits 2-3 the high lane (hence the +2 element offsets).
        return shufflevectorLDC!(double4,        
                                       (imm8 >> 0) & 1,
                                     ( (imm8 >> 1) & 1),
                                 2 + ( (imm8 >> 2) & 1),
                                 2 + ( (imm8 >> 3) & 1) )(a, a);
    }
    else
    {
        // Scalar fallback with the same lane-local indexing.
        __m256d r;
        r.ptr[0] = a.array[ imm8       & 1];
        r.ptr[1] = a.array[(imm8 >> 1) & 1];
        r.ptr[2] = a.array[2 + ((imm8 >> 2) & 1)];
        r.ptr[3] = a.array[2 + ((imm8 >> 3) & 1)];
        return r;
    }
}
unittest
{
    __m256d A = _mm256_setr_pd(0.0, 1, 2, 3);
    __m256d R = _mm256_permute_pd!(1 + 4)(A);
    double[4] correct = [1.0, 0, 3, 2];
    assert(R.array == correct);
}
2478 
/// Shuffle single-precision (32-bit) floating-point elements in `a` using the control in `imm8`.
__m128 _mm_permute_ps(int imm8)(__m128 a) pure @trusted
{
    // PERF DMD
    static if (GDC_with_AVX)
    {
        return __builtin_ia32_vpermilps(a, cast(ubyte)imm8);
    }
    else version(LDC)
    {
        return shufflevectorLDC!(float4, (imm8 >> 0) & 3, (imm8 >> 2) & 3, (imm8 >> 4) & 3, 
            (imm8 >> 6) & 3)(a, a);
    }
    else
    {
        // PERF: could use _mm_shuffle_ps which is a super set
        // when AVX isn't available
        // Each 2-bit field of imm8 selects the source element for one lane.
        __m128 r;
        r.ptr[0] = a.array[(imm8 >> 0) & 3];
        r.ptr[1] = a.array[(imm8 >> 2) & 3];
        r.ptr[2] = a.array[(imm8 >> 4) & 3];
        r.ptr[3] = a.array[(imm8 >> 6) & 3];
        return r;
    }
}
unittest
{
    __m128 A = _mm_setr_ps(0.0f, 1, 2, 3);
    __m128 R = _mm_permute_ps!(1 + 4 * 3 + 16 * 0 + 64 * 2)(A);
    float[4] correct = [1.0f, 3, 0, 2];
    assert(R.array == correct);
}
2511 
/// Shuffle single-precision (32-bit) floating-point elements in `a` within 128-bit lanes using 
/// the control in `imm8`. The same shuffle is applied in lower and higher 128-bit lane.
__m256 _mm256_permute_ps(int imm8)(__m256 a) pure @trusted // was `(__m256 a,)`: stray trailing comma removed
{
    // PERF DMD
    static if (GDC_with_AVX)
    {
        return __builtin_ia32_vpermilps256(a, cast(ubyte)imm8);
    }
    else version(LDC)
    {
        // Same 2-bit selectors applied to both lanes (+4 offsets for the high lane).
        return shufflevectorLDC!(float8, 
            (imm8 >> 0) & 3, (imm8 >> 2) & 3, (imm8 >> 4) & 3, (imm8 >> 6) & 3,
            4 + ((imm8 >> 0) & 3), 4 + ((imm8 >> 2) & 3), 4 + ((imm8 >> 4) & 3), 
            4 + ((imm8 >> 6) & 3))(a, a);
    }
    else
    {
        // Scalar fallback; each 2-bit field of imm8 selects a lane-local element.
        __m256 r;
        r.ptr[0] = a.array[(imm8 >> 0) & 3];
        r.ptr[1] = a.array[(imm8 >> 2) & 3];
        r.ptr[2] = a.array[(imm8 >> 4) & 3];
        r.ptr[3] = a.array[(imm8 >> 6) & 3];
        r.ptr[4] = a.array[4 + ((imm8 >> 0) & 3)];
        r.ptr[5] = a.array[4 + ((imm8 >> 2) & 3)];
        r.ptr[6] = a.array[4 + ((imm8 >> 4) & 3)];
        r.ptr[7] = a.array[4 + ((imm8 >> 6) & 3)];
        return r;
    }
}
unittest
{
    __m256 A = _mm256_setr_ps(0.0f, 1, 2, 3, 4, 5, 6, 7);
    __m256 R = _mm256_permute_ps!(1 + 4 * 3 + 16 * 0 + 64 * 2)(A);
    float[8] correct = [1.0f, 3, 0, 2, 5, 7, 4, 6];
    assert(R.array == correct);
}
2549 
/// Shuffle 128-bits (composed of 2 packed double-precision (64-bit) floating-point elements) 
/// selected by `imm8` from `a` and `b`.
/// See the documentation as the `imm8` format is quite complex.
__m256d _mm256_permute2f128_pd(int imm8)(__m256d a, __m256d b) pure @safe
{
    // Delegate to the integer variant; the permutation is type-agnostic.
    __m256i ai = cast(__m256i) a;
    __m256i bi = cast(__m256i) b;
    return cast(__m256d) _mm256_permute2f128_si256!imm8(ai, bi);
}
///ditto
__m256 _mm256_permute2f128_ps(int imm8)(__m256 a, __m256 b) pure @safe
{
    // Delegate to the integer variant; the permutation is type-agnostic.
    __m256i ai = cast(__m256i) a;
    __m256i bi = cast(__m256i) b;
    return cast(__m256) _mm256_permute2f128_si256!imm8(ai, bi);
}
///ditto
__m256i _mm256_permute2f128_si256(int imm8)(__m256i a, __m256i b) pure @trusted
{
    static if (GDC_with_AVX)
    {
        return cast(__m256i) __builtin_ia32_vperm2f128_si256(cast(int8)a, cast(int8)b, cast(ubyte)imm8);
    }
    else 
    {
        // Selects one 128-bit half of the result according to a 4-bit field:
        // bit 3 zeroes the lane, bit 1 picks b over a, bit 0 picks the high half.
        static __m128i SELECT4(int imm4)(__m256i a, __m256i b) pure @trusted
        {
            static assert(imm4 >= 0 && imm4 <= 15);
            static if (imm4 & 8)
            {
                return _mm_setzero_si128();
            }
            else static if ((imm4 & 2) == 0)
            {
                long2 r;
                enum int index = 2*(imm4 & 1);
                r.ptr[0] = a.array[index+0];
                r.ptr[1] = a.array[index+1];
                return cast(__m128i)r;
            }
            else
            {
                static assert( (imm4 & 2) != 0);
                long2 r;
                enum int index = 2*(imm4 & 1);
                r.ptr[0] = b.array[index+0];
                r.ptr[1] = b.array[index+1];
                return cast(__m128i)r;
            }
        }

        // Low nibble of imm8 controls the low lane, high nibble the high lane.
        // (removed unused local `long4 r;` that shadowed SELECT4's locals)
        __m128i lo = SELECT4!(imm8 & 15)(a, b);
        __m128i hi = SELECT4!((imm8 >> 4) & 15)(a, b);
        return _mm256_set_m128i(hi, lo);
    }
}
unittest
{
    __m256d A = _mm256_setr_pd(8.0, 1, 2, 3);
    __m256d B = _mm256_setr_pd(4.0, 5, 6, 7);
    __m256d R = _mm256_permute2f128_pd!(128 + 2)(A, B);
    double[4] correct = [4.0, 5.0, 0.0, 0.0];
    assert(R.array == correct);

    __m256d R2 = _mm256_permute2f128_pd!(3*16 + 1)(A, B);
    double[4] correct2 = [2.0, 3.0, 6.0, 7.0];
    assert(R2.array == correct2);
}
2615 
/// Shuffle double-precision (64-bit) floating-point elements in `a` using the control in `b`.
/// Warning: the selector is in bit 1, not bit 0, of each 64-bit element!
///          This is really not intuitive.
__m128d _mm_permutevar_pd(__m128d a, __m128i b) pure @trusted
{
    enum bool implementWithByteShuffle = GDC_with_SSSE3 || LDC_with_SSSE3 || LDC_with_ARM64;

    static if (GDC_or_LDC_with_AVX)
    {
        return cast(__m128d) __builtin_ia32_vpermilvarpd(a, cast(long2)b);
    }
    else static if (implementWithByteShuffle)
    {
        // Turn the per-64-bit selector (bit 1) into per-byte indices for pshufb.
        align(16) static immutable byte[16] mmAddBase_u8 = [0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7];
        align(16) static immutable byte[16] mmBroadcast_u8 = [0, 0, 0, 0, 0, 0, 0, 0, 8, 8, 8, 8, 8, 8, 8, 8];
        int4 bi = cast(int4)b;
        long2 two;
        two = 2;
        // Isolate bit 1 of each selector and scale it to 0 or 8 (byte offset of the element).
        bi = _mm_slli_epi64(cast(__m128i)( (cast(long2)bi) & two), 2);
        bi = _mm_shuffle_epi8(bi, *cast(__m128i*)mmBroadcast_u8.ptr);
        // bi is now [ind0 ind0 ind0 ind0 ind0 ind0 ind0 ind0 ind1 ind1 ind1 ind1 ind1 ind1 ind1 ind1 ]
        byte16 bytesIndices = cast(byte16)bi;
        bytesIndices = bytesIndices + *cast(byte16*)mmAddBase_u8.ptr;

        // which allows us to make a single _mm_shuffle_epi8
        return cast(__m128d) _mm_shuffle_epi8(cast(__m128i)a, cast(__m128i)bytesIndices);
    }
    else
    {
        // This isn't great in ARM64, TBL or TBX instructions can't do that.
        // that could fit the bill, if it had 64-bit operands. But it only has 8-bit operands.
        // SVE2 could do it with svtbx[_f64] probably.
        long2 bl = cast(long2)b;
        __m128d r;
        // Selector is bit 1 of each 64-bit mask element (see warning above).
        r.ptr[0] = a.array[ (bl.array[0] & 2) >> 1];
        r.ptr[1] = a.array[ (bl.array[1] & 2) >> 1];
        return r;
    }
}
unittest
{
    __m128d A = _mm_setr_pd(5, 6);
    __m128d B = _mm_permutevar_pd(A, _mm_setr_epi64(2, 1));
    __m128d C = _mm_permutevar_pd(A, _mm_setr_epi64(1 + 2 + 4, 2));    
    // yup, this is super strange, it's actually taking bit 1 and not bit 0 of each 64-bit element
    double[2] RB = [6, 5];
    double[2] RC = [6, 6];
    assert(B.array == RB);
    assert(C.array == RC);
}
2666 
///ditto
__m256d _mm256_permutevar_pd (__m256d a, __m256i b) pure @trusted
{
    // Worth it: for GDC, in SSSE3+
    //           for LDC, all the time
    version(LDC)
        enum bool implementWithByteShuffle = true;
    else
        enum bool implementWithByteShuffle = GDC_with_SSSE3;

    // PERF DMD
    static if (GDC_or_LDC_with_AVX)
    {
        return cast(__m256d) __builtin_ia32_vpermilvarpd256(a, cast(long4)b);
    }
    else static if (implementWithByteShuffle)
    {
        // because we don't have 256-bit vectors, split and use _mm_permutevar_ps
        __m128d a_lo = _mm256_extractf128_pd!0(a);
        __m128d a_hi = _mm256_extractf128_pd!1(a);
        __m128i b_lo = _mm256_extractf128_si256!0(b);
        __m128i b_hi = _mm256_extractf128_si256!1(b);
        __m128d r_lo = _mm_permutevar_pd(a_lo, b_lo);
        __m128d r_hi = _mm_permutevar_pd(a_hi, b_hi);
        return _mm256_set_m128d(r_hi, r_lo);
    }
    else
    {
        // Scalar fallback; the selector is bit 1 of each 64-bit mask element,
        // and each 128-bit lane shuffles independently (+2 offsets for high lane).
        long4 bl = cast(long4)b;
        __m256d r;
        r.ptr[0] = a.array[ (bl.array[0] & 2) >> 1];
        r.ptr[1] = a.array[ (bl.array[1] & 2) >> 1];
        r.ptr[2] = a.array[2 + ((bl.array[2] & 2) >> 1)];
        r.ptr[3] = a.array[2 + ((bl.array[3] & 2) >> 1)];
        return r;
    }
}
unittest
{
    __m256d A = _mm256_setr_pd(5, 6, 7, 8);
    __m256d B = _mm256_permutevar_pd(A, _mm256_setr_epi64(2, 1, 0, 2));
    __m256d C = _mm256_permutevar_pd(A, _mm256_setr_epi64(1 + 2 + 4, 2, 2, 0));
    // yup, this is super strange, it's actually taking bit 1 and not bit 0 of each 64-bit element
    double[4] RB = [6, 5, 7, 8];
    double[4] RC = [6, 6, 8, 7];
    assert(B.array == RB);
    assert(C.array == RC);
}
2715 
/// Shuffle single-precision (32-bit) floating-point elements in `a` using the control in `b`.
/// Only bits 1:0 of each 32-bit element of `b` are used as source index.
__m128 _mm_permutevar_ps (__m128 a, __m128i b) @trusted
{
    // PERF DMD

    enum bool implementWithByteShuffle = GDC_with_SSSE3 || LDC_with_SSSE3 || LDC_with_ARM64;

    static if (GDC_or_LDC_with_AVX)
    {
        // Native AVX: single vpermilps instruction.
        return cast(__m128) __builtin_ia32_vpermilvarps(a, cast(int4)b);
    }
    else static if (implementWithByteShuffle)
    {
        // This workaround is worth it: in GDC with SSSE3, in LDC with SSSE3, in ARM64 (neon)
        int4 bi = cast(int4)b;
        int4 three;
        three = 3;
        // Keep the 2 index bits of each element, scaled by 4 to make a byte offset.
        bi = _mm_slli_epi32(bi & three, 2);
        // bi is [ind0 0 0 0 ind1 0 0 0 ind2 0 0 0 ind3 0 0 0]
        bi = bi | _mm_slli_si128!1(bi);
        bi = bi | _mm_slli_si128!2(bi);
        // bi is now [ind0 ind0 ind0 ind0 ind1 ind1 ind1 ind1 ind2 ind2 ind2 ind2 ind3 ind3 ind3 ind3]
        byte16 bytesIndices = cast(byte16)bi;
        // Add 0,1,2,3 inside each group of four to address the individual bytes of each float.
        align(16) static immutable byte[16] mmAddBase_u8 = [0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3];
        bytesIndices = bytesIndices + *cast(byte16*)mmAddBase_u8.ptr;

        // which allows us to make a single _mm_shuffle_epi8
        return cast(__m128) _mm_shuffle_epi8(cast(__m128i)a, cast(__m128i)bytesIndices);
    }
    else
    {
        // Scalar fallback: the index is the low 2 bits of each 32-bit element.
        int4 bi = cast(int4)b;
        __m128 r;
        r.ptr[0] = a.array[ (bi.array[0] & 3) ];
        r.ptr[1] = a.array[ (bi.array[1] & 3) ];
        r.ptr[2] = a.array[ (bi.array[2] & 3) ];
        r.ptr[3] = a.array[ (bi.array[3] & 3) ];
        return r;
    }
}
unittest
{
    __m128 A = _mm_setr_ps(5, 6, 7, 8);
    __m128 B = _mm_permutevar_ps(A, _mm_setr_epi32(2, 1, 0, 2 + 4));
    __m128 C = _mm_permutevar_ps(A, _mm_setr_epi32(2, 3 + 8, 1, 0));
    float[4] RB = [7, 6, 5, 7];
    float[4] RC = [7, 8, 6, 5];
    assert(B.array == RB);
    assert(C.array == RC);
}
2767 
///ditto
__m256 _mm256_permutevar_ps (__m256 a, __m256i b) @trusted
{
    // Without AVX, each 128-bit half can still go through the byte-shuffle
    // trick in _mm_permutevar_ps (a single _mm_shuffle_epi8 per half).
    enum bool implementWithByteShuffle = GDC_with_SSSE3 || LDC_with_SSSE3 || LDC_with_ARM64;

    static if (GDC_or_LDC_with_AVX)
    {
        // Native AVX: single vpermilps instruction.
        return __builtin_ia32_vpermilvarps256(a, cast(int8)b);
    }
    else static if (implementWithByteShuffle)
    {
        // because we don't have 256-bit vectors, split and use _mm_permutevar_ps
        __m128 a_lo = _mm256_extractf128_ps!0(a);
        __m128 a_hi = _mm256_extractf128_ps!1(a);
        __m128i b_lo = _mm256_extractf128_si256!0(b);
        __m128i b_hi = _mm256_extractf128_si256!1(b);
        __m128 r_lo = _mm_permutevar_ps(a_lo, b_lo);
        __m128 r_hi = _mm_permutevar_ps(a_hi, b_hi);
        return _mm256_set_m128(r_hi, r_lo);
    }
    else
    {
        // Scalar fallback: low 2 bits of each element select within each
        // 128-bit lane (hence the +4 offset for the upper half).
        int8 bi = cast(int8)b;
        __m256 r;
        r.ptr[0] = a.array[ (bi.array[0] & 3) ];
        r.ptr[1] = a.array[ (bi.array[1] & 3) ];
        r.ptr[2] = a.array[ (bi.array[2] & 3) ];
        r.ptr[3] = a.array[ (bi.array[3] & 3) ];
        r.ptr[4] = a.array[ 4 + (bi.array[4] & 3) ];
        r.ptr[5] = a.array[ 4 + (bi.array[5] & 3) ];
        r.ptr[6] = a.array[ 4 + (bi.array[6] & 3) ];
        r.ptr[7] = a.array[ 4 + (bi.array[7] & 3) ];
        return r;
    } 
}
unittest
{
    __m256 A = _mm256_setr_ps(1, 2, 3, 4, 5, 6, 7, 8);
    __m256 B = _mm256_permutevar_ps(A, _mm256_setr_epi32(2,     1, 0, 2, 3, 2, 1, 0));
    __m256 C = _mm256_permutevar_ps(A, _mm256_setr_epi32(2, 3 + 8, 1, 0, 2, 3, 0, 1));
    float[8] RB = [3.0f, 2, 1, 3, 8, 7, 6, 5];
    float[8] RC = [3.0f, 4, 2, 1, 7, 8, 5, 6];
    assert(B.array == RB);
    assert(C.array == RC);
}
2814 
/// Compute the approximate reciprocal of packed single-precision (32-bit) floating-point elements
/// in `a`. The maximum relative error for this approximation is less than 1.5*2^-12.
__m256 _mm256_rcp_ps (__m256 a) pure @trusted
{
    // PERF DMD
    static if (GDC_or_LDC_with_AVX)
    {
        return __builtin_ia32_rcpps256(a);
    }
    else
    {
        // Fallback: an exact reciprocal trivially satisfies the 1.5*2^-12
        // precision guarantee of the hardware approximation.
        foreach (lane; 0..8)
            a.ptr[lane] = 1.0f / a.array[lane];
        return a;
    }
}
unittest
{
    __m256 input = _mm256_setr_ps(2.34f, -70000.0f, 0.00001f, 345.5f, 9, -46, 1869816, 55583);
    __m256 exact = _mm256_set1_ps(1.0f) / input;
    __m256 approx = _mm256_rcp_ps(input);
    foreach (lane; 0..8)
    {
        double relError = (cast(double)(exact.array[lane]) / approx.array[lane]) - 1;
        assert(abs_double(relError) < 0.00037); // 1.5*2^-12 is 0.00036621093
    }
}
2848 
/// Round the packed double-precision (64-bit) floating-point elements in `a` using the 
/// rounding parameter, and store the results as packed double-precision floating-point elements.
/// Rounding is done according to the rounding[3:0] parameter, which can be one of:
///    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
///    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
///    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
///    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
///    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
__m256d _mm256_round_pd(int rounding)(__m256d a) @trusted
{
    // PERF DMD
    static if (GDC_or_LDC_with_AVX)
    {
        // GDC and LDC expose the very same builtin here, so use a single
        // merged branch, consistent with _mm256_round_ps.
        return __builtin_ia32_roundpd256(a, rounding);
    }
    else
    {
        static if (rounding & _MM_FROUND_CUR_DIRECTION)
        {
            // Current-direction rounding: cvtsd2si already honours MXCSR.RC.
            // PERF: non-AVX x86, would probably be faster to convert those double at once to int64

            __m128d A_lo = _mm256_extractf128_pd!0(a);
            __m128d A_hi = _mm256_extractf128_pd!1(a);

            // Convert to 64-bit integers one by one
            long x0 = _mm_cvtsd_si64(A_lo);
            long x2 = _mm_cvtsd_si64(A_hi);
            A_lo.ptr[0] = A_lo.array[1];
            A_hi.ptr[0] = A_hi.array[1];
            long x1 = _mm_cvtsd_si64(A_lo);
            long x3 = _mm_cvtsd_si64(A_hi);

            return _mm256_setr_pd(x0, x1, x2, x3);
        }
        else
        {
            version(GNU) pragma(inline, false); // this was required for SSE4.1 rounding, let it here

            // Temporarily force MXCSR.RC to the requested mode, round by a
            // double -> int64 -> double round-trip, then restore the old mode.
            uint old = _MM_GET_ROUNDING_MODE();
            _MM_SET_ROUNDING_MODE((rounding & 3) << 13);
            
            __m128d A_lo = _mm256_extractf128_pd!0(a);
            __m128d A_hi = _mm256_extractf128_pd!1(a);

            // Convert to 64-bit integers one by one
            long x0 = _mm_cvtsd_si64(A_lo);
            long x2 = _mm_cvtsd_si64(A_hi);
            A_lo.ptr[0] = A_lo.array[1];
            A_hi.ptr[0] = A_hi.array[1];
            long x1 = _mm_cvtsd_si64(A_lo);
            long x3 = _mm_cvtsd_si64(A_hi);

            // Convert back to double to achieve the rounding
            // The problem is that a 64-bit double can't represent all the values 
            // a 64-bit integer can (and vice-versa). So this function won't work for
            // large values. (FUTURE: what range exactly?)
            _MM_SET_ROUNDING_MODE(old);
            return _mm256_setr_pd(x0, x1, x2, x3);
        }
    }
}
unittest
{
    // tested in other intrinsics
}
2918 
/// Round the packed single-precision (32-bit) floating-point elements in `a` using the 
/// rounding parameter, and store the results as packed single-precision floating-point elements.
/// Rounding is done according to the rounding[3:0] parameter, which can be one of:
///    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
///    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
///    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
///    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
///    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
__m256 _mm256_round_ps(int rounding)(__m256 a) @trusted
{
    // PERF DMD
    static if (GDC_or_LDC_with_AVX)
    {
        return __builtin_ia32_roundps256(a, rounding);
    }
    else static if (GDC_or_LDC_with_SSE41)
    {
        // we can use _mm_round_ps
        __m128 lo = _mm256_extractf128_ps!0(a);
        __m128 hi = _mm256_extractf128_ps!1(a);
        __m128 ilo = _mm_round_ps!rounding(lo); // unfortunately _mm_round_ps isn't fast for arm64, so we avoid that in that case
        __m128 ihi = _mm_round_ps!rounding(hi);
        return _mm256_set_m128(ihi, ilo);
    }
    else
    {
        static if (rounding & _MM_FROUND_CUR_DIRECTION)
        {
            // Current-direction rounding: cvtps2dq already honours MXCSR.RC.
            __m256i integers = _mm256_cvtps_epi32(a);
            return _mm256_cvtepi32_ps(integers);
        }
        else
        {
            version(LDC) pragma(inline, false); // else _MM_SET_ROUNDING_MODE and _mm_cvtps_epi32 gets shuffled
            // Temporarily force MXCSR.RC to the requested mode; restored on scope exit.
            uint old = _MM_GET_ROUNDING_MODE();
            _MM_SET_ROUNDING_MODE((rounding & 3) << 13);
            scope(exit) _MM_SET_ROUNDING_MODE(old);

            // Convert to 32-bit integers
            __m256i integers = _mm256_cvtps_epi32(a);

            // Convert back to float to achieve the rounding
            // The problem is that a 32-float can't represent all the values 
            // a 32-bit integer can (and vice-versa). So this function won't work for
            // large values. (FUTURE: what range exactly?)
            __m256 result = _mm256_cvtepi32_ps(integers);

            return result;
        }
    }
}
unittest
{
    // tested in other intrinsics
}
2974 
2975 
/// Compute the approximate reciprocal square root of packed single-precision (32-bit) 
/// floating-point elements in `a`. The maximum relative error for this approximation is less than
/// 1.5*2^-12.
__m256 _mm256_rsqrt_ps (__m256 a) pure @trusted
{
    static if (GDC_or_LDC_with_AVX)
    {
        return __builtin_ia32_rsqrtps256(a);
    }
    else version(LDC)
    {
        // Exact computation trivially satisfies the 1.5*2^-12 error bound.
        foreach (lane; 0..8)
            a.ptr[lane] = 1.0f / llvm_sqrt(a.array[lane]);
        return a;
    }
    else
    {
        foreach (lane; 0..8)
            a.ptr[lane] = 1.0f / sqrt(a.array[lane]);
        return a;
    }
}
unittest
{
    __m256 input = _mm256_setr_ps(2.34f, 70000.0f, 0.00001f, 345.5f, 2.34f, 70000.0f, 0.00001f, 345.5f);
    __m256 expected = _mm256_setr_ps(0.65372045f, 0.00377964473f, 316.227766f, 0.05379921937f,
                                     0.65372045f, 0.00377964473f, 316.227766f, 0.05379921937f);
    __m256 approx = _mm256_rsqrt_ps(input);
    foreach (lane; 0..8)
    {
        double relError = (cast(double)(expected.array[lane]) / approx.array[lane]) - 1;
        assert(abs_double(relError) < 0.00037); // 1.5*2^-12 is 0.00036621093
    }
}
3022 
/// Set packed 16-bit integers with the supplied values.
/// Note: the first argument lands in the highest lane.
__m256i _mm256_set_epi16 (short e15, short e14, short e13, short e12, short e11, short e10, short e9, short e8, short e7, short e6, short e5, short e4, short e3, short e2, short e1, short e0) pure @trusted
{
    short16 v; // Note: = void would prevent GDC from inlining a constant short16...
    v.ptr[15] = e15;
    v.ptr[14] = e14;
    v.ptr[13] = e13;
    v.ptr[12] = e12;
    v.ptr[11] = e11;
    v.ptr[10] = e10;
    v.ptr[9]  = e9;
    v.ptr[8]  = e8;
    v.ptr[7]  = e7;
    v.ptr[6]  = e6;
    v.ptr[5]  = e5;
    v.ptr[4]  = e4;
    v.ptr[3]  = e3;
    v.ptr[2]  = e2;
    v.ptr[1]  = e1;
    v.ptr[0]  = e0;
    return cast(__m256i) v;
}
unittest
{
    short16 S = cast(short16) _mm256_set_epi16(15, 14, 13, 12, 11, 10, 9, 8, 
                                               7, 6, 5, 4, 3, 2, 1, 0);
    foreach (lane; 0..16)
        assert(S.array[lane] == lane);
}
3052 
/// Set packed 32-bit integers with the supplied values.
/// Note: the first argument lands in the highest lane.
__m256i _mm256_set_epi32 (int e7, int e6, int e5, int e4, int e3, int e2, int e1, int e0) pure @trusted
{
    // Inlines a constant with GCC -O1, LDC -O2
    int8 v; // = void would prevent GCC from inlining a constant call
    v.ptr[7] = e7;
    v.ptr[6] = e6;
    v.ptr[5] = e5;
    v.ptr[4] = e4;
    v.ptr[3] = e3;
    v.ptr[2] = e2;
    v.ptr[1] = e1;
    v.ptr[0] = e0;
    return cast(__m256i) v;
}
unittest
{
    int8 V = cast(int8) _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
    foreach (lane; 0..8)
        assert(V.array[lane] == lane);
}
3074 
/// Set packed 64-bit integers with the supplied values.
/// Note: the first argument lands in the highest lane.
__m256i _mm256_set_epi64x (long e3, long e2, long e1, long e0) pure @trusted
{
    long4 v = void;
    v.ptr[3] = e3;
    v.ptr[2] = e2;
    v.ptr[1] = e1;
    v.ptr[0] = e0;
    return v;
}
unittest
{
    __m256i V = _mm256_set_epi64x(-1, 42, long.min, long.max);
    long[4] expected = [long.max, long.min, 42, -1];
    assert(V.array == expected);
}

///ditto
alias _mm256_set_epi64 = _mm256_set_epi64x; // #BONUS, not sure why this isn't in Intel Intrinsics API.
3094 
/// Set packed 8-bit integers with the supplied values.
/// Note: the first argument lands in the highest lane.
__m256i _mm256_set_epi8 (byte e31, byte e30, byte e29, byte e28, byte e27, byte e26, byte e25, byte e24, 
                         byte e23, byte e22, byte e21, byte e20, byte e19, byte e18, byte e17, byte e16, 
                         byte e15, byte e14, byte e13, byte e12, byte e11, byte e10,  byte e9,  byte e8, 
                          byte e7,  byte e6,  byte e5,  byte e4,  byte e3,  byte e2,  byte e1,  byte e0) pure @trusted
{
    // `pure @trusted` added for consistency with _mm256_setr_epi8;
    // the body only touches the local stack array.
    // Inline a constant call in GDC -O1 and LDC -O2
    align(32) byte[32] result = [ e0,  e1,  e2,  e3,  e4,  e5,  e6,  e7,
                                  e8,  e9, e10, e11, e12, e13, e14, e15,
                                 e16, e17, e18, e19, e20, e21, e22, e23,
                                 e24, e25, e26, e27, e28, e29, e30, e31 ];
    return *cast(__m256i*)(result.ptr);
}
unittest
{
    byte32 R = cast(byte32) _mm256_set_epi8(-1, 0, 56, 127, -128, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7, 4, 5, 6, 7);
    byte[32] correct = [7, 6, 5, 4, 7, 6, 5, 4, 3, 2, 1, 0, 3, 2, 1, 0,
                        14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, -128, 127, 56, 0, -1];
    assert(R.array == correct);
}
3115 
/// Set packed `__m256` vector with the supplied `__m128` halves
/// (`hi` in the upper 128 bits, `lo` in the lower 128 bits).
__m256 _mm256_set_m128 (__m128 hi, __m128 lo) pure @trusted
{
    // DMD PERF
    static if (GDC_with_AVX)
    {
        // Build the low half, then insert the high half with vinsertf128.
        __m256 r = __builtin_ia32_ps256_ps(lo);
        return __builtin_ia32_vinsertf128_ps256(r, hi, 1);
    }
    else
    {
        // Element-by-element copy; avoids reinterpreting the 256-bit vector
        // as two 128-bit vectors (see the disabled code below).
        __m256 r = void;
        r.ptr[0] = lo.array[0];
        r.ptr[1] = lo.array[1];
        r.ptr[2] = lo.array[2];
        r.ptr[3] = lo.array[3];
        r.ptr[4] = hi.array[0];
        r.ptr[5] = hi.array[1];
        r.ptr[6] = hi.array[2];
        r.ptr[7] = hi.array[3];
        return r;
    }

    /*
        // BUG, doesn't work if AVX vector is emulated, but SSE vector is not
        // See issue #108
        __m256 r = void;
        __m128* p = cast(__m128*)(&r);
        p[0] = lo;
        p[1] = hi;
        return r;
    */
}
unittest
{
    __m128 lo = _mm_setr_ps(1.0f, 2, 3, 4);
    __m128 hi = _mm_setr_ps(3.0f, 4, 5, 6);
    __m256 R = _mm256_set_m128(hi, lo);
    float[8] correct = [1.0f, 2, 3, 4, 3, 4, 5, 6];
    assert(R.array == correct);
}
3157 
/// Set packed `__m256d` vector with the supplied `__m128d` halves
/// (`hi` in the upper 128 bits, `lo` in the lower 128 bits).
__m256d _mm256_set_m128d (__m128d hi, __m128d lo) pure @trusted
{
    __m256d res = void;
    res.ptr[3] = hi.array[1];
    res.ptr[2] = hi.array[0];
    res.ptr[1] = lo.array[1];
    res.ptr[0] = lo.array[0];
    return res;
}
unittest
{
    __m128d low  = _mm_setr_pd(1.0, 2.0);
    __m128d high = _mm_setr_pd(3.0, 4.0);
    __m256d R = _mm256_set_m128d(high, low);
    double[4] expected = [1.0, 2.0, 3.0, 4.0];
    assert(R.array == expected);
}
3176 
/// Set packed `__m256i` vector with the supplied `__m128i` halves
/// (`hi` in the upper 128 bits, `lo` in the lower 128 bits).
__m256i _mm256_set_m128i (__m128i hi, __m128i lo) pure @trusted
{
    // DMD PERF
    static if (GDC_with_AVX)
    {
        // Build the low half, then insert the high half with vinsertf128.
        __m256i r = cast(long4) __builtin_ia32_si256_si (lo);
        return cast(long4) __builtin_ia32_vinsertf128_si256(cast(int8)r, hi, 1);
    }
    else
    {
        // Copy as 32-bit lanes; avoids reinterpreting the 256-bit vector
        // as two 128-bit vectors (see issue #108 note in _mm256_set_m128).
        int8 r = void;
        r.ptr[0] = lo.array[0];
        r.ptr[1] = lo.array[1];
        r.ptr[2] = lo.array[2];
        r.ptr[3] = lo.array[3];
        r.ptr[4] = hi.array[0];
        r.ptr[5] = hi.array[1];
        r.ptr[6] = hi.array[2];
        r.ptr[7] = hi.array[3];
        return cast(long4)r;
    }
}
unittest
{
    __m128i lo = _mm_setr_epi32( 1,  2,  3,  4);
    __m128i hi =  _mm_set_epi32(-3, -4, -5, -6);
    int8 R = cast(int8)_mm256_set_m128i(hi, lo);
    int[8] correct = [1, 2, 3, 4, -6, -5, -4, -3];
    assert(R.array == correct);
}
3208 
/// Set packed double-precision (64-bit) floating-point elements with the supplied values.
/// Note: the first argument lands in the highest lane.
__m256d _mm256_set_pd (double e3, double e2, double e1, double e0) pure @trusted
{
    __m256d v = void;
    v.ptr[3] = e3;
    v.ptr[2] = e2;
    v.ptr[1] = e1;
    v.ptr[0] = e0;
    return v;
}
unittest
{
    __m256d V = _mm256_set_pd(3, 2, 1, 546);
    double[4] expected = [546.0, 1.0, 2.0, 3.0];
    assert(V.array == expected);
}
3225 
/// Set packed single-precision (32-bit) floating-point elements with the supplied values.
/// Note: the first argument lands in the highest lane.
__m256 _mm256_set_ps (float e7, float e6, float e5, float e4, float e3, float e2, float e1, float e0) pure @trusted
{
    // PERF: see #102, use = void?
    __m256 v;
    v.ptr[7] = e7;
    v.ptr[6] = e6;
    v.ptr[5] = e5;
    v.ptr[4] = e4;
    v.ptr[3] = e3;
    v.ptr[2] = e2;
    v.ptr[1] = e1;
    v.ptr[0] = e0;
    return v;
}
unittest
{
    __m256 V = _mm256_set_ps(3, 2, 1, 546.0f, -1.25f, -2, -3, 0);
    float[8] expected = [0, -3, -2, -1.25f, 546.0f, 1.0, 2.0, 3.0];
    assert(V.array == expected);
}
3247 
/// Broadcast 16-bit integer `a` to all elements of the return value.
__m256i _mm256_set1_epi16 (short a) pure @trusted
{
    version(DigitalMars) 
    {
        // workaround https://issues.dlang.org/show_bug.cgi?id=21469
        // It used to ICE, after that the codegen was just wrong.
        // No issue anymore in DMD 2.101, we can eventually remove that
        static if (__VERSION__ < 2101)
        {
            // Scalar-to-vector assignment broadcasts to all lanes.
            short16 v = a;
            return cast(__m256i) v;
        }
        else
        {
            pragma(inline, true);
            return cast(__m256i)(short16(a)); // vector construction from scalar broadcasts
        }
    }
    else
    {
        pragma(inline, true);
        return cast(__m256i)(short16(a)); // vector construction from scalar broadcasts
    }
}
unittest
{
    short16 a = cast(short16) _mm256_set1_epi16(31);
    for (int i = 0; i < 16; ++i)
        assert(a.array[i] == 31);
}
3279 
/// Broadcast 32-bit integer `a` to all elements.
__m256i _mm256_set1_epi32 (int a) pure @trusted
{
    version(DigitalMars) 
    {
        // Workaround for https://issues.dlang.org/show_bug.cgi?id=21469
        // No issue anymore in DMD 2.101, we can eventually remove that
        static if (__VERSION__ < 2101)
        {
            // Scalar-to-vector assignment broadcasts to all lanes.
            int8 v = a;
            return cast(__m256i) v;
        }
        else
        {
            pragma(inline, true);
            return cast(__m256i)(int8(a)); // vector construction from scalar broadcasts
        }
    }
    else
    {
        pragma(inline, true);
        return cast(__m256i)(int8(a)); // vector construction from scalar broadcasts
    }
}
unittest
{
    int8 a = cast(int8) _mm256_set1_epi32(31);
    for (int i = 0; i < 8; ++i)
        assert(a.array[i] == 31);
}
3309 
/// Broadcast 64-bit integer `a` to all elements of the return value.
__m256i _mm256_set1_epi64x (long a) pure
{
    long4 v = a; // scalar-to-vector assignment broadcasts to all four lanes
    return cast(__m256i) v;
}
unittest
{
    long4 v = cast(long4) _mm256_set1_epi64x(-31);
    foreach (lane; 0..4)
        assert(v.array[lane] == -31);
}
///ditto
alias _mm256_set1_epi64 = _mm256_set1_epi64x; // #BONUS, not sure why this isn't in Intel Intrinsics API.
3323 
/// Broadcast 8-bit integer `a` to all elements of the return value.
__m256i _mm256_set1_epi8 (byte a) pure @trusted
{
    version(DigitalMars) // workaround https://issues.dlang.org/show_bug.cgi?id=21469
    {
        // Scalar-to-vector assignment broadcasts to all lanes.
        byte32 v = a;
        return cast(__m256i) v;
    }
    else
    {
        pragma(inline, true);
        return cast(__m256i)(byte32(a)); // vector construction from scalar broadcasts
    }
}
unittest
{
    byte32 a = cast(byte32) _mm256_set1_epi8(31);
    for (int i = 0; i < 32; ++i)
        assert(a.array[i] == 31);
}
3344 
/// Broadcast double-precision (64-bit) floating-point value `a` to all elements of the return value.
__m256d _mm256_set1_pd (double a) pure @trusted
{
    __m256d v = a; // scalar-to-vector assignment broadcasts to all four lanes
    return v;
}
unittest
{
    double x = 464.21;
    double[4] expected = [x, x, x, x];
    double4 V = cast(double4) _mm256_set1_pd(x);
    assert(V.array == expected);
}
3357 
/// Broadcast single-precision (32-bit) floating-point value `a` to all elements of the return value.
__m256 _mm256_set1_ps (float a) pure @trusted
{
    __m256 v = a; // scalar-to-vector assignment broadcasts to all eight lanes
    return v;
}
unittest
{
    float x = 464.21f;
    float[8] expected = [x, x, x, x, x, x, x, x];
    float8 V = cast(float8) _mm256_set1_ps(x);
    assert(V.array == expected);
}
3370 
/// Set packed 16-bit integers with the supplied values in reverse order.
/// Unlike `_mm256_set_epi16`, the first argument lands in the lowest lane.
__m256i _mm256_setr_epi16 (short e15, short e14, short e13, short e12, short e11, short e10, short e9,  short e8,
                           short e7,  short e6,  short e5,  short e4,  short e3,  short e2,  short e1,  short e0) pure @trusted
{
    // Build on the stack, then load as one 256-bit unaligned load.
    short[16] result = [ e15,  e14,  e13,  e12,  e11,  e10,  e9,   e8,
                         e7,   e6,   e5,   e4,   e3,   e2,   e1,   e0];
    static if (GDC_with_AVX)
    {
         return cast(__m256i) __builtin_ia32_loaddqu256(cast(const(char)*) result.ptr);
    }
    else static if (LDC_with_optimizations)
    {
        return cast(__m256i)( loadUnaligned!(short16)(result.ptr) );
    }
    else
    {
        // Generic fallback: element-by-element copy.
        short16 r;
        for(int n = 0; n < 16; ++n)
            r.ptr[n] = result[n];
        return cast(__m256i)r;
    }
}
unittest
{
    short16 A = cast(short16) _mm256_setr_epi16(-1, 0, -21, 21, 42, 127, -42, -128,
                                                -1, 0, -21, 21, 42, 127, -42, -128);
    short[16] correct = [-1, 0, -21, 21, 42, 127, -42, -128,
                         -1, 0, -21, 21, 42, 127, -42, -128];
    assert(A.array == correct);
}
3401 
/// Set packed 32-bit integers with the supplied values in reverse order.
/// Unlike `_mm256_set_epi32`, the first argument lands in the lowest lane.
__m256i _mm256_setr_epi32 (int e7, int e6, int e5, int e4, int e3, int e2, int e1, int e0) pure @trusted
{
    // Inlines a constant with GCC -O1, LDC -O2
    int8 v; // = void would prevent GDC from inlining a constant call
    v.ptr[7] = e0;
    v.ptr[6] = e1;
    v.ptr[5] = e2;
    v.ptr[4] = e3;
    v.ptr[3] = e4;
    v.ptr[2] = e5;
    v.ptr[1] = e6;
    v.ptr[0] = e7;
    return cast(__m256i) v;
}
unittest
{
    int8 V = cast(int8) _mm256_setr_epi32(-1, 0, -2147483648, 2147483647, 42, 666, -42, -666);
    int[8] expected = [-1, 0, -2147483648, 2147483647, 42, 666, -42, -666];
    assert(V.array == expected);
}
3423 
/// Set packed 64-bit integers with the supplied values in reverse order.
/// Unlike `_mm256_set_epi64x`, the first argument lands in the lowest lane.
__m256i _mm256_setr_epi64x (long e3, long e2, long e1, long e0) pure @trusted
{
    long4 v = void;
    v.ptr[3] = e0;
    v.ptr[2] = e1;
    v.ptr[1] = e2;
    v.ptr[0] = e3;
    return v;
}
unittest
{
    __m256i V = _mm256_setr_epi64x(-1, 42, long.min, long.max);
    long[4] expected = [-1, 42, long.min, long.max];
    assert(V.array == expected);
}
///ditto
alias _mm256_setr_epi64 = _mm256_setr_epi64x; // #BONUS, not sure why this isn't in Intel Intrinsics API.
3442 
/// Set packed 8-bit integers with the supplied values in reverse order.
/// Unlike `_mm256_set_epi8`, the first argument lands in the lowest lane.
__m256i _mm256_setr_epi8 (byte e31, byte e30, byte e29, byte e28, byte e27, byte e26, byte e25, byte e24,
                          byte e23, byte e22, byte e21, byte e20, byte e19, byte e18, byte e17, byte e16,
                          byte e15, byte e14, byte e13, byte e12, byte e11, byte e10, byte e9,  byte e8,
                          byte e7,  byte e6,  byte e5,  byte e4,  byte e3,  byte e2,  byte e1,  byte e0) pure @trusted
{
    // Build a 32-byte-aligned stack array, then reinterpret as a vector.
    // Inline a constant call in GDC -O1 and LDC -O2
    align(32) byte[32] result = [ e31,  e30,  e29,  e28,  e27,  e26,  e25,  e24,
                                  e23,  e22,  e21,  e20,  e19,  e18,  e17,  e16,
                                  e15,  e14,  e13,  e12,  e11,  e10,  e9,   e8,
                                   e7,   e6,   e5,   e4,   e3,   e2,   e1,   e0];
    return *cast(__m256i*)(result.ptr);
}
unittest
{
    byte32 A = cast(byte32) _mm256_setr_epi8( -1, 0, -21, 21, 42, 127, -42, -128,
                                              -1, 0, -21, 21, 42, 127, -42, -128,
                                              -1, 0, -21, 21, 42, 127, -42, -128,
                                              -1, 0, -21, 21, 42, 127, -42, -128);
    byte[32] correct = [-1, 0, -21, 21, 42, 127, -42, -128,
                        -1, 0, -21, 21, 42, 127, -42, -128,
                        -1, 0, -21, 21, 42, 127, -42, -128,
                        -1, 0, -21, 21, 42, 127, -42, -128];
    assert(A.array == correct);
}
3468 
/// Set packed `__m256` vector with the supplied `__m128` halves
/// (`lo` in the lower 128 bits, `hi` in the upper 128 bits).
__m256 _mm256_setr_m128 (__m128 lo, __m128 hi) pure
{
    // Identical to _mm256_set_m128 with the halves swapped.
    return _mm256_set_m128(hi, lo);
}
unittest
{
    __m128 first  = _mm_setr_ps(1.0f, 2, 3, 4);
    __m128 second = _mm_setr_ps(3.0f, 4, 5, 6);
    __m256 R = _mm256_setr_m128(second, first);
    float[8] expected = [3.0f, 4, 5, 6, 1, 2, 3, 4];
    assert(R.array == expected);
}
3482 
/// Set packed `__m256d` vector with the supplied `__m128d` halves
/// (`lo` in the lower 128 bits, `hi` in the upper 128 bits).
__m256d _mm256_setr_m128d (__m128d lo, __m128d hi) pure
{
    // Identical to _mm256_set_m128d with the halves swapped.
    return _mm256_set_m128d(hi, lo);
}
unittest
{
    __m128d first  = _mm_setr_pd(1.0, 2.0);
    __m128d second = _mm_setr_pd(3.0, 4.0);
    __m256d R = _mm256_setr_m128d(second, first);
    double[4] expected = [3.0, 4.0, 1.0, 2.0];
    assert(R.array == expected);
}
3496 
/// Set packed `__m256i` vector with the supplied `__m128i` halves
/// (`lo` in the lower 128 bits, `hi` in the upper 128 bits).
__m256i _mm256_setr_m128i (__m128i lo, __m128i hi) pure
{
    // Identical to _mm256_set_m128i with the halves swapped.
    return _mm256_set_m128i(hi, lo);
}
unittest
{
    __m128i first  = _mm_setr_epi32( 1,  2,  3,  4);
    __m128i second =  _mm_set_epi32(-3, -4, -5, -6);
    int8 R = cast(int8)_mm256_setr_m128i(second, first);
    int[8] expected = [-6, -5, -4, -3, 1, 2, 3, 4];
    assert(R.array == expected);
}
3510 
/// Set packed double-precision (64-bit) floating-point elements with the supplied values in reverse order.
/// The first argument lands in lane 0.
__m256d _mm256_setr_pd (double e3, double e2, double e1, double e0) pure @trusted
{
    static if (LDC_with_optimizations)
    {
        // PERF, probably not the best
        // Build the value on the stack and let LDC fold the unaligned load
        // into a vector build; the temporary disappears after optimization.
        double[4] result = [e3, e2, e1, e0];
        return loadUnaligned!(double4)(result.ptr);
    }
    else
    {
        // Generic path: fill the lanes one by one, lowest lane first.
        __m256d r;
        r.ptr[0] = e3;
        r.ptr[1] = e2;
        r.ptr[2] = e1;
        r.ptr[3] = e0;
        return r;
    }
}
unittest
{
    __m256d A = _mm256_setr_pd(3, 2, 1, 546.125);
    double[4] correct = [3.0, 2.0, 1.0, 546.125];
    assert(A.array == correct);
}
3536 
3537 
/// Set packed single-precision (32-bit) floating-point elements with the supplied values in reverse order.
/// The first argument lands in lane 0.
__m256 _mm256_setr_ps (float e7, float e6, float e5, float e4, float e3, float e2, float e1, float e0) pure @trusted
{
    // PERF DMD
    static if (GDC_with_AVX)
    {
        // Build a 32-byte-aligned temporary and reinterpret it as a vector.
        align(32) float[8] r = [ e7,   e6,   e5,   e4,   e3,   e2,   e1,   e0];
        return *cast(__m256*)r;
    }
    else version(LDC)
    {
        // Same trick for LDC; the optimizer turns this into a vector build.
        align(32) float[8] r = [ e7,   e6,   e5,   e4,   e3,   e2,   e1,   e0];
        return *cast(__m256*)r;
    }
    else
    {
        // Generic path: fill the lanes one by one, lowest lane first.
        __m256 r;
        r.ptr[0] = e7;
        r.ptr[1] = e6;
        r.ptr[2] = e5;
        r.ptr[3] = e4;
        r.ptr[4] = e3;
        r.ptr[5] = e2;
        r.ptr[6] = e1;
        r.ptr[7] = e0;
        return r;
    }
}
unittest
{
    __m256 A = _mm256_setr_ps(   3, 2, 1, 546.125f, 4, 5, 6, 7);
    float[8] correct       = [3.0f, 2, 1, 546.125f, 4, 5, 6, 7];
    assert(A.array == correct);
}
3572 
/// Return vector of type `__m256d` with all elements set to zero.
__m256d _mm256_setzero_pd() pure @safe
{
    // Broadcast-construct: every one of the four lanes gets 0.0.
    __m256d zeroes = __m256d(0.0);
    return zeroes;
}
unittest
{
    __m256d Z = _mm256_setzero_pd();
    double[4] expected = [0.0, 0.0, 0.0, 0.0];
    assert(Z.array == expected);
}
3584 
/// Return vector of type `__m256` with all elements set to zero.
__m256 _mm256_setzero_ps() pure @safe
{
    // Broadcast-construct: every one of the eight lanes gets 0.0f.
    __m256 zeroes = __m256(0.0f);
    return zeroes;
}
unittest
{
    __m256 Z = _mm256_setzero_ps();
    float[8] expected = [0.0f, 0, 0, 0, 0, 0, 0, 0];
    assert(Z.array == expected);
}
3596 
/// Return vector of type `__m256i` with all elements set to zero.
__m256i _mm256_setzero_si256() pure @trusted
{
    // All four 64-bit lanes cleared.
    long4 zeroes = 0;
    return cast(__m256i) zeroes;
}
unittest
{
    __m256i Z = _mm256_setzero_si256();
    long[4] expected = [0, 0, 0, 0];
    assert(Z.array == expected);
}
3608 
/// Shuffle double-precision (64-bit) floating-point elements within 128-bit lanes using the 
/// control in `imm8`.
/// Selection rule: r0 = a[bit0], r1 = b[bit1], r2 = a[2 + bit2], r3 = b[2 + bit3],
/// where bitK is bit K of `imm8` (result alternates a/b, each bit picks within a half).
__m256d _mm256_shuffle_pd(int imm8)(__m256d a, __m256d b) pure @trusted
{
    // PERF DMD
    static if (GDC_with_AVX)
    {
        return __builtin_ia32_shufpd256(a, b, imm8);
    }
    else version(LDC)
    {
        // shufflevector indices: 0..3 select from a, 4..7 select from b.
        return shufflevectorLDC!(double4,
                                       (imm8 >> 0) & 1,
                                 4 + ( (imm8 >> 1) & 1),
                                 2 + ( (imm8 >> 2) & 1),
                                 6 + ( (imm8 >> 3) & 1) )(a, b);
    }
    else
    {
        // Generic fallback: same selection rule with scalar element access.
        double4 r = void;
        r.ptr[0] = a.array[(imm8 >> 0) & 1];
        r.ptr[1] = b.array[(imm8 >> 1) & 1];
        r.ptr[2] = a.array[2 + ( (imm8 >> 2) & 1)];
        r.ptr[3] = b.array[2 + ( (imm8 >> 3) & 1)];
        return r;
    }
}
unittest
{
    __m256d A = _mm256_setr_pd( 0, 1, 2, 3);
    __m256d B = _mm256_setr_pd( 4, 5, 6, 7);
    __m256d C = _mm256_shuffle_pd!75 /* 01001011 */(A, B);
    double[4] correct = [1.0, 5.0, 2.0, 7.0];
    assert(C.array == correct);
}
3644 
/// Shuffle single-precision (32-bit) floating-point elements in `a` within 128-bit lanes using 
/// the control in `imm8`.
/// Per 128-bit lane: the low two result elements come from `a`, the high two from `b`;
/// each 2-bit field of `imm8` selects the element within the source lane.
__m256 _mm256_shuffle_ps(int imm8)(__m256 a, __m256 b) pure @trusted
{
    // PERF DMD
    static if (GDC_with_AVX)
    {
        return __builtin_ia32_shufps256(a, b, imm8);
    }
    else version(LDC)
    {
        // shufflevector indices: 0..7 select from a, 8..15 select from b.
        // The same imm8 fields are reused for the upper 128-bit lane (+4 / +12).
        return shufflevectorLDC!(float8, (imm8 >> 0) & 3,
                                 (imm8 >> 2) & 3,
                                 8 + ( (imm8 >> 4) & 3),
                                 8 + ( (imm8 >> 6) & 3),
                                 4 + ( (imm8 >> 0) & 3),
                                 4 + ( (imm8 >> 2) & 3),
                                 12 + ( (imm8 >> 4) & 3),
                                 12 + ( (imm8 >> 6) & 3) )(a, b);
    }
    else
    {
        // Generic fallback: same selection rule with scalar element access.
        float8 r = void;
        r.ptr[0] = a.array[(imm8 >> 0) & 3];
        r.ptr[1] = a.array[(imm8 >> 2) & 3];
        r.ptr[2] = b.array[(imm8 >> 4) & 3];
        r.ptr[3] = b.array[(imm8 >> 6) & 3];
        r.ptr[4] = a.array[4 + ( (imm8 >> 0) & 3 )];
        r.ptr[5] = a.array[4 + ( (imm8 >> 2) & 3 )];
        r.ptr[6] = b.array[4 + ( (imm8 >> 4) & 3 )];
        r.ptr[7] = b.array[4 + ( (imm8 >> 6) & 3 )];
        return r;
    }
}
unittest
{
    __m256 A = _mm256_setr_ps( 0,  1,  2,  3,  4,  5,  6,  7);
    __m256 B = _mm256_setr_ps( 8,  9, 10, 11, 12, 13, 14, 15);
    __m256 C = _mm256_shuffle_ps!75 /* 01001011 */(A, B);
    float[8] correct = [3.0f, 2, 8, 9, 7, 6, 12, 13];
    assert(C.array == correct);
}
3687 
/// Compute the square root of packed double-precision (64-bit) floating-point elements in `a`.
__m256d _mm256_sqrt_pd (__m256d a) pure @trusted
{
    static if (GDC_with_AVX)
    {
        return __builtin_ia32_sqrtpd256(a);
    } 
    else version(LDC)
    {   
        static if (__VERSION__ >= 2084) 
            return llvm_sqrt(a); // that capability appeared in LDC 1.14
        else
        {
            // Older LDC: apply the scalar LLVM sqrt lane by lane, in place.
            a.ptr[0] = llvm_sqrt(a.array[0]);
            a.ptr[1] = llvm_sqrt(a.array[1]);
            a.ptr[2] = llvm_sqrt(a.array[2]);
            a.ptr[3] = llvm_sqrt(a.array[3]);
            return a;
        }
    }    
    else
    {
        // Generic fallback: scalar sqrt per lane, in place.
        a.ptr[0] = sqrt(a.array[0]);
        a.ptr[1] = sqrt(a.array[1]);
        a.ptr[2] = sqrt(a.array[2]);
        a.ptr[3] = sqrt(a.array[3]);
        return a;
    }
}
unittest
{
    __m256d A = _mm256_sqrt_pd(_mm256_set1_pd(4.0));
    double[4] correct = [2.0, 2, 2, 2];
    assert(A.array == correct);
}
3723 
/// Compute the square root of packed single-precision (32-bit) floating-point elements in `a`.
__m256 _mm256_sqrt_ps (__m256 a) pure @trusted
{
    static if (GDC_with_AVX)
    {
        return __builtin_ia32_sqrtps256(a);
    } 
    else version(LDC)
    {  
        static if (__VERSION__ >= 2084) 
            return llvm_sqrt(a); // that capability appeared in LDC 1.14
        else
        {  
            // Older LDC: apply the scalar LLVM sqrt lane by lane, in place.
            a.ptr[0] = llvm_sqrt(a.array[0]);
            a.ptr[1] = llvm_sqrt(a.array[1]);
            a.ptr[2] = llvm_sqrt(a.array[2]);
            a.ptr[3] = llvm_sqrt(a.array[3]);
            a.ptr[4] = llvm_sqrt(a.array[4]);
            a.ptr[5] = llvm_sqrt(a.array[5]);
            a.ptr[6] = llvm_sqrt(a.array[6]);
            a.ptr[7] = llvm_sqrt(a.array[7]);
            return a;
        }
    }    
    else
    {
        // Generic fallback: scalar sqrt per lane, in place.
        a.ptr[0] = sqrt(a.array[0]);
        a.ptr[1] = sqrt(a.array[1]);
        a.ptr[2] = sqrt(a.array[2]);
        a.ptr[3] = sqrt(a.array[3]);
        a.ptr[4] = sqrt(a.array[4]);
        a.ptr[5] = sqrt(a.array[5]);
        a.ptr[6] = sqrt(a.array[6]);
        a.ptr[7] = sqrt(a.array[7]);
        return a;
    }
}
unittest
{
    __m256 A = _mm256_sqrt_ps(_mm256_set1_ps(4.0f));
    float[8] correct = [2.0f, 2, 2, 2, 2, 2, 2, 2];
    assert(A.array == correct);
}
3767 
/// Store 256-bits (composed of 4 packed double-precision (64-bit) floating-point elements) from 
/// `a` into memory. `mem_addr` must be aligned on a 32-byte boundary or a general-protection 
/// exception may be generated.
void _mm256_store_pd (double* mem_addr, __m256d a) pure @system
{
    // Aligned 256-bit store expressed as a plain vector assignment.
    __m256d* dest = cast(__m256d*) mem_addr;
    *dest = a;
}
unittest
{
    align(32) double[4] storage;
    double[4] expected = [1.0, 2, 3, 4];
    _mm256_store_pd(storage.ptr, _mm256_setr_pd(1.0, 2, 3, 4));
    assert(storage == expected);
}
3782 
/// Store 256-bits (composed of 8 packed single-precision (32-bit) floating-point elements) from 
/// `a` into memory. `mem_addr` must be aligned on a 32-byte boundary or a general-protection 
/// exception may be generated.
void _mm256_store_ps (float* mem_addr, __m256 a) pure @system
{
    // Aligned 256-bit store expressed as a plain vector assignment.
    __m256* dest = cast(__m256*) mem_addr;
    *dest = a;
}
unittest
{
    align(32) float[8] storage;
    float[8] expected = [1.0, 2, 3, 4, 5, 6, 7, 8];
    _mm256_store_ps(storage.ptr, _mm256_set_ps(8.0, 7, 6, 5, 4, 3, 2, 1));
    assert(storage == expected);
}
3797 
/// Store 256-bits of integer data from `a` into memory. `mem_addr` must be aligned on a 32-byte 
/// boundary or a general-protection exception may be generated.
void _mm256_store_si256 (__m256i * mem_addr, __m256i a) pure @safe
{
    // The destination already has vector type, so a direct assignment suffices.
    __m256i value = a;
    *mem_addr = value;
}
unittest
{
    align(32) long[4] storage;
    long[4] expected = [5, -6, -7, 8];
    _mm256_store_si256(cast(__m256i*)(storage.ptr), _mm256_setr_epi64x(5, -6, -7, 8));
    assert(storage == expected);
}
3811 
/// Store 256-bits (composed of 4 packed double-precision (64-bit) floating-point elements) from 
/// `a` into memory. `mem_addr` does not need to be aligned on any particular boundary.
void _mm256_storeu_pd (double * mem_addr, __m256d a) pure @system
{
    // PERF DMD
    static if (GDC_with_AVX)
    {
        // GDC's unaligned 256-bit store builtin.
        __builtin_ia32_storeupd256(mem_addr, a);
    }
    else static if (LDC_with_optimizations)
    {
        storeUnaligned!__m256d(a, mem_addr);
    }
    else
    {
        // Generic fallback: scalar stores, alignment-agnostic by construction.
        for(int n = 0; n < 4; ++n)
            mem_addr[n] = a.array[n];
    }
}
unittest
{
    align(32) double[6] arr = [0.0, 0, 0, 0, 0, 0];
    _mm256_storeu_pd(&arr[1], _mm256_set1_pd(4.0));
    double[4] correct = [4.0, 4, 4, 4];
    assert(arr[1..5] == correct);
}
3838 
/// Store 256-bits (composed of 8 packed single-precision (32-bit) floating-point elements) from 
/// `a` into memory. `mem_addr` does not need to be aligned on any particular boundary.
void _mm256_storeu_ps (float* mem_addr, __m256 a) pure @system
{
    // PERF DMD
    static if (GDC_with_AVX)
    {
        // GDC's unaligned 256-bit store builtin.
        __builtin_ia32_storeups256(mem_addr, a);
    }
    else static if (LDC_with_optimizations)
    {
        storeUnaligned!__m256(a, mem_addr);
    }
    else
    {
        // Generic fallback: scalar stores, alignment-agnostic by construction.
        for(int n = 0; n < 8; ++n)
            mem_addr[n] = a.array[n];
    }
}
unittest
{
    align(32) float[10] arr = [0.0f, 0, 0, 0, 0, 0, 0, 0, 0, 0];
    _mm256_storeu_ps(&arr[1], _mm256_set1_ps(4.0f));
    float[8] correct = [4.0f, 4, 4, 4, 4, 4, 4, 4];
    assert(arr[1..9] == correct);
}
3865 
3866 
/// Store 256-bits of integer data from `a` into memory. `mem_addr` does not need to be aligned
///  on any particular boundary.
void _mm256_storeu_si256 (__m256i* mem_addr, __m256i a) pure @trusted
{
    // PERF DMD
    static if (GDC_with_AVX)
    {
        // GDC's unaligned 256-bit store builtin takes a char* destination.
        __builtin_ia32_storedqu256(cast(char*)mem_addr, cast(ubyte32) a);
    }
    else static if (LDC_with_optimizations)
    {
        storeUnaligned!__m256i(a, cast(long*)mem_addr);
    }
    else
    {
        // Generic fallback: copy the four 64-bit lanes one by one.
        // Use .array[n] for element access — consistent with the other scalar
        // fallbacks in this file (see _mm256_storeu_pd/_mm256_storeu_ps) and
        // portable across the supported D compilers.
        long4 v = cast(long4)a;
        long* p = cast(long*)mem_addr;
        for(int n = 0; n < 4; ++n)
            p[n] = v.array[n];
    }
}
unittest
{
    align(32) long[6] arr = [0, 0, 0, 0, 0, 0];
    _mm256_storeu_si256( cast(__m256i*) &arr[1], _mm256_set1_epi64x(4));
    long[4] correct = [4, 4, 4, 4];
    assert(arr[1..5] == correct);
}
3895 
/// Store the high and low 128-bit halves (each composed of 4 packed single-precision (32-bit) 
/// floating-point elements) from `a` into memory two different 128-bit locations. 
/// `hiaddr` and `loaddr` do not need to be aligned on any particular boundary.
void _mm256_storeu2_m128 (float* hiaddr, float* loaddr, __m256 a) pure @system
{
    // This is way better on GDC, and similarly in LDC, vs using other intrinsics
    // Low half (lanes 0..3) goes to loaddr, high half (lanes 4..7) to hiaddr,
    // as plain scalar stores; low half is written first.
    loaddr[0] = a.array[0];
    loaddr[1] = a.array[1];
    loaddr[2] = a.array[2];
    loaddr[3] = a.array[3];
    hiaddr[0] = a.array[4];
    hiaddr[1] = a.array[5];
    hiaddr[2] = a.array[6];
    hiaddr[3] = a.array[7];
}
unittest
{
    align(32) float[11] A = [0.0f, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0];
    _mm256_storeu2_m128(&A[1], &A[6], _mm256_set1_ps(2.0f));
    float[11] correct     = [0.0f, 2, 2, 2, 2, 0, 2, 2, 2, 2, 0];
    assert(A == correct);
}
3918 
/// Store the high and low 128-bit halves (each composed of 2 packed double-precision (64-bit)
/// floating-point elements) from `a` into memory two different 128-bit locations. 
/// `hiaddr` and `loaddr` do not need to be aligned on any particular boundary.
void _mm256_storeu2_m128d (double* hiaddr, double* loaddr, __m256d a) pure @system
{
    // Plain scalar stores; low half (lanes 0..1) is written first,
    // then the high half (lanes 2..3).
    foreach (i; 0 .. 2)
        loaddr[i] = a.array[i];
    foreach (i; 0 .. 2)
        hiaddr[i] = a.array[2 + i];
}
unittest
{
    double[2] loHalf;
    double[2] hiHalf;
    _mm256_storeu2_m128d(loHalf.ptr, hiHalf.ptr, _mm256_set1_pd(-43.0));
    double[2] expected = [-43.0, -43];
    assert(loHalf == expected);
    assert(hiHalf == expected);
}
3938 
/// Store the high and low 128-bit halves (each composed of integer data) from `a` into memory two 
/// different 128-bit locations. 
/// `hiaddr` and `loaddr` do not need to be aligned on any particular boundary.
void _mm256_storeu2_m128i (__m128i* hiaddr, __m128i* loaddr, __m256i a) pure @trusted
{
    // View both destinations as 64-bit slots; low half (lanes 0..1) is
    // written first, then the high half (lanes 2..3).
    long* loSlots = cast(long*)loaddr;
    long* hiSlots = cast(long*)hiaddr;
    foreach (i; 0 .. 2)
        loSlots[i] = a.array[i];
    foreach (i; 0 .. 2)
        hiSlots[i] = a.array[2 + i];
}
unittest
{
    long[2] loHalf;
    long[2] hiHalf;
    _mm256_storeu2_m128i(cast(__m128i*)loHalf.ptr, cast(__m128i*)hiHalf.ptr, _mm256_set1_epi64x(-42));
    long[2] expected = [-42, -42];
    assert(loHalf == expected);
    assert(hiHalf == expected);
}
3960 
/// Store 256-bits (composed of 4 packed single-precision (64-bit) floating-point elements) from
/// `a` into memory using a non-temporal memory hint. `mem_addr` must be aligned on a 32-byte 
/// boundary or a general-protection exception may be generated.
/// Note: non-temporal stores should be followed by `_mm_sfence()` for reader threads.
void _mm256_stream_pd (double* mem_addr, __m256d a) pure @system
{
    // PERF DMD
    // PERF GDC + SSE2
    static if (LDC_with_InlineIREx && LDC_with_optimizations)
    {
        // Inline LLVM IR: a 32-byte-aligned store tagged with !nontemporal
        // metadata, which selects the streaming-store form.
        enum prefix = `!0 = !{ i32 1 }`;
        enum ir = `
            store <4 x double> %1, <4 x double>* %0, align 32, !nontemporal !0
            ret void`;
        LDCInlineIREx!(prefix, ir, "", void, double4*, double4)(cast(double4*)mem_addr, a);
    }   
    else static if (GDC_with_AVX) // any hope to be non-temporal? Using SSE2 instructions.
    {
        __builtin_ia32_movntpd256 (mem_addr, a);
    }
    else
    {
        // Regular store instead.
        __m256d* dest = cast(__m256d*)mem_addr;
        *dest = a;
    }
}
unittest
{
    align(32) double[4] mem;
    double[4] correct = [5.0, -6, -7, 8];
    _mm256_stream_pd(mem.ptr, _mm256_setr_pd(5.0, -6, -7, 8));
    assert(mem == correct);
}
3995 
/// Store 256-bits (composed of 8 packed single-precision (32-bit) floating-point elements) from
/// `a` into memory using a non-temporal memory hint. `mem_addr` must be aligned on a 32-byte 
/// boundary or a general-protection exception may be generated.
/// Note: non-temporal stores should be followed by `_mm_sfence()` for reader threads.
void _mm256_stream_ps (float* mem_addr, __m256 a) pure @system
{
    // PERF DMD
    // PERF GDC + SSE2
    static if (LDC_with_InlineIREx && LDC_with_optimizations)
    {
        // Inline LLVM IR: a 32-byte-aligned store tagged with !nontemporal
        // metadata, which selects the streaming-store form.
        enum prefix = `!0 = !{ i32 1 }`;
        enum ir = `
            store <8 x float> %1, <8 x float>* %0, align 32, !nontemporal !0
            ret void`;
        LDCInlineIREx!(prefix, ir, "", void, float8*, float8)(cast(float8*)mem_addr, a);
    }   
    else static if (GDC_with_AVX)
    {
        __builtin_ia32_movntps256 (mem_addr, a);
    }
    else
    {
        // Regular store instead.
        __m256* dest = cast(__m256*)mem_addr;
        *dest = a;
    }
}
unittest
{
    align(32) float[8] mem;
    float[8] correct = [5, -6, -7, 8, 1, 2, 3, 4];
    _mm256_stream_ps(mem.ptr, _mm256_setr_ps(5, -6, -7, 8, 1, 2, 3, 4));
    assert(mem == correct);
}
4030 
/// Store 256-bits of integer data from `a` into memory using a non-temporal memory hint. 
/// `mem_addr` must be aligned on a 32-byte boundary or a general-protection exception may be
/// generated.
/// Note: there isn't any particular instruction in AVX to do that. It just defers to SSE2.
/// Note: non-temporal stores should be followed by `_mm_sfence()` for reader threads.
void _mm256_stream_si256 (__m256i * mem_addr, __m256i a) pure @trusted
{
    // PERF DMD
    // PERF GDC
    static if (LDC_with_InlineIREx && LDC_with_optimizations)
    {
        // Inline LLVM IR: store tagged with !nontemporal metadata.
        // NOTE(review): this uses `align 16` whereas the pd/ps variants use
        // `align 32`; presumably intentional given the "defers to SSE2" note
        // above, but it may make LLVM split the store — TODO confirm.
        enum prefix = `!0 = !{ i32 1 }`;
        enum ir = `
            store <4 x i64> %1, <4 x i64>* %0, align 16, !nontemporal !0
            ret void`;
        LDCInlineIREx!(prefix, ir, "", void, long4*, long4)(mem_addr, a);
    }
    else static if (GDC_with_SSE2) // any hope to be non-temporal? Using SSE2 instructions.
    {
        // Split into two 128-bit halves and use the SSE2 streaming store twice.
        long2 lo, hi;
        lo.ptr[0] = a.array[0];
        lo.ptr[1] = a.array[1];
        hi.ptr[0] = a.array[2];
        hi.ptr[1] = a.array[3];
        _mm_stream_si128(cast(__m128i*)mem_addr, cast(__m128i)lo);
        _mm_stream_si128((cast(__m128i*)mem_addr) + 1, cast(__m128i)hi);
    }
    else
    {
        // Regular store instead.
        __m256i* dest = cast(__m256i*)mem_addr;
        *dest = a;
    }
}
unittest
{
    align(32) long[4] mem;
    long[4] correct = [5, -6, -7, 8];
    _mm256_stream_si256(cast(__m256i*)(mem.ptr), _mm256_setr_epi64x(5, -6, -7, 8));
    assert(mem == correct);
}
4072 
/// Subtract packed double-precision (64-bit) floating-point elements in `b` from 
/// packed double-precision (64-bit) floating-point elements in `a`.
__m256d _mm256_sub_pd (__m256d a, __m256d b) pure @safe
{
    // Lane-wise subtraction via the compiler's native vector arithmetic.
    __m256d difference = a - b;
    return difference;
}
unittest
{
    __m256d v = [1.5, -2.0, 3.0, 200000.0];
    v = _mm256_sub_pd(v, v);
    double[4] zeroes = [0.0, 0, 0, 0];
    assert(v.array == zeroes);
}
4086 
/// Subtract packed single-precision (32-bit) floating-point elements in `b` from 
/// packed single-precision (32-bit) floating-point elements in `a`.
__m256 _mm256_sub_ps (__m256 a, __m256 b) pure @safe
{
    // Lane-wise subtraction via the compiler's native vector arithmetic.
    __m256 difference = a - b;
    return difference;
}
unittest
{
    __m256 v = [1.5f, -2.0f, 3.0f, 1.0f, 1.5f, -2000.0f, 3.0f, 1.0f];
    v = _mm256_sub_ps(v, v);
    float[8] zeroes = [0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f];
    assert(v.array == zeroes);
}
4100 
/// Compute the bitwise NOT of `a` and then AND with `b`, producing an intermediate value, and 
/// return 1 if the sign bit of each 64-bit element in the intermediate value is zero, 
/// otherwise return 0.
int _mm_testc_pd (__m128d a, __m128d b) pure @trusted
{
    static if (GDC_or_LDC_with_AVX)
    {
        // vtestpd builtin returns the CF flag described above.
        return __builtin_ia32_vtestcpd(a, b);
    }
    else
    {
        // PERF: maybe do the generic version more like simde
        // CF emulation: result is 1 iff no element of (~a & b) has its sign
        // bit set, i.e. every sign bit set in b is also set in a.
        long2 la = cast(long2)a;
        long2 lb = cast(long2)b;
        long2 r = ~la & lb;
        return r.array[0] >= 0 && r.array[1] >= 0;
    }
}
unittest
{
    __m128d A  = _mm_setr_pd(-1, 1);
    __m128d B = _mm_setr_pd(-1, -1);
    __m128d C = _mm_setr_pd(1, -1);
    assert(_mm_testc_pd(A, A) == 1);
    assert(_mm_testc_pd(A, B) == 0);
    assert(_mm_testc_pd(B, A) == 1);
}
4128 
///ditto
int _mm256_testc_pd (__m256d a, __m256d b) pure @safe
{
    static if (GDC_or_LDC_with_AVX)
    {
        // 256-bit vtestpd builtin returns the CF flag.
        return __builtin_ia32_vtestcpd256(a, b);
    }
    else static if (LDC_with_ARM64)
    {
        // better to split than do vanilla (down to 10 inst)
        // Both halves must pass: AND the two 128-bit results.
        __m128d lo_a = _mm256_extractf128_pd!0(a);
        __m128d lo_b = _mm256_extractf128_pd!0(b);
        __m128d hi_a = _mm256_extractf128_pd!1(a);
        __m128d hi_b = _mm256_extractf128_pd!1(b);
        return _mm_testc_pd(lo_a, lo_b) & _mm_testc_pd(hi_a, hi_b);
    }
    else
    {
        // PERF: do the generic version more like simde, maybe this get rids of arm64 version
        // CF emulation: 1 iff no element of (~a & b) has its sign bit set.
        long4 la = cast(long4)a;
        long4 lb = cast(long4)b;
        long4 r = ~la & lb;
        return r.array[0] >= 0 && r.array[1] >= 0 && r.array[2] >= 0 && r.array[3] >= 0;
    }
}
unittest
{
    __m256d A = _mm256_setr_pd(-1, 1, -1, 1);
    __m256d B = _mm256_setr_pd(-1, -1, -1, -1);
    __m256d C = _mm256_setr_pd(1, -1, 1, -1);
    assert(_mm256_testc_pd(A, A) == 1);
    assert(_mm256_testc_pd(A, B) == 0);
    assert(_mm256_testc_pd(B, A) == 1);
}
4163 
/// Compute the bitwise NOT of `a` and then AND with `b`, producing an intermediate value, and 
/// return 1 if the sign bit of each 32-bit element in the intermediate value is zero, 
/// otherwise return 0.
int _mm_testc_ps (__m128 a, __m128 b) pure @safe
{
    // PERF DMD
    static if (GDC_or_LDC_with_AVX)
    {
        // vtestps builtin returns the CF flag described above.
        return __builtin_ia32_vtestcps(a, b);
    }   
    else static if (LDC_with_ARM64)
    {
        // Shift each lane's sign bit down to the whole lane (arithmetic >> 31
        // yields 0 or -1), then check all lanes are zero.
        int4 la = cast(int4)a;
        int4 lb = cast(int4)b;
        int4 r = ~la & lb;
        int4 shift;
        shift = 31;
        r >>= shift;
        int[4] zero = [0, 0, 0, 0];
        return r.array == zero;
    }
    else
    {
        // PERF: do the generic version more like simde, maybe this get rids of arm64 version
        // CF emulation: 1 iff no element of (~a & b) has its sign bit set.
        int4 la = cast(int4)a;
        int4 lb = cast(int4)b;
        int4 r = ~la & lb;
        return r.array[0] >= 0 && r.array[1] >= 0 && r.array[2] >= 0 && r.array[3] >= 0;
    }
}
unittest
{
    __m128 A = _mm_setr_ps(-1, 1, -1, 1);
    __m128 B = _mm_setr_ps(-1, -1, -1, -1);
    __m128 C = _mm_setr_ps(1, -1, 1, -1);
    assert(_mm_testc_ps(A, A) == 1);
    assert(_mm_testc_ps(A, B) == 0);
    assert(_mm_testc_ps(B, A) == 1);
}
4203 
///ditto
int _mm256_testc_ps (__m256 a, __m256 b) pure @safe
{
    // PERF DMD
    static if (GDC_or_LDC_with_AVX)
    {
        // 256-bit vtestps builtin returns the CF flag.
        return __builtin_ia32_vtestcps256(a, b);
    }
    else static if (LDC_with_ARM64)
    {
        // Shift each lane's sign bit down to the whole lane (arithmetic >> 31
        // yields 0 or -1), then check all eight lanes are zero.
        int8 la = cast(int8)a;
        int8 lb = cast(int8)b;
        int8 r = ~la & lb;
        int8 shift;
        shift = 31;
        r >>= shift;
        int[8] zero = [0, 0, 0, 0, 0, 0, 0, 0];
        return r.array == zero;
    }
    else
    {
        // PERF: do the generic version more like simde, maybe this get rids of arm64 version
        // CF emulation: 1 iff no element of (~a & b) has its sign bit set.
        int8 la = cast(int8)a;
        int8 lb = cast(int8)b;
        int8 r = ~la & lb;
        return r.array[0] >= 0 
            && r.array[1] >= 0
            && r.array[2] >= 0
            && r.array[3] >= 0
            && r.array[4] >= 0
            && r.array[5] >= 0
            && r.array[6] >= 0
            && r.array[7] >= 0;
    }
}
unittest
{
    __m256 A = _mm256_setr_ps(-1,  1, -1,  1, -1,  1, -1,  1);
    __m256 B = _mm256_setr_ps(-1, -1, -1, -1, -1, -1, -1, -1);
    __m256 C = _mm256_setr_ps( 1, -1,  1, -1,  1,  1,  1,  1);
    assert(_mm256_testc_ps(A, A) == 1);
    assert(_mm256_testc_ps(B, B) == 1);
    assert(_mm256_testc_ps(A, B) == 0);
    assert(_mm256_testc_ps(B, A) == 1);
    assert(_mm256_testc_ps(C, B) == 0);
    assert(_mm256_testc_ps(B, C) == 1);
}
4251 
/// Compute the bitwise NOT of `a` and then AND with `b`, and return 1 if the result is zero,
/// otherwise return 0.
/// In other words, test if all bits masked by `b` are also 1 in `a`.
int _mm256_testc_si256 (__m256i a, __m256i b) pure @trusted
{
    static if (GDC_or_LDC_with_AVX)
    {
        // ptest builtin returns the CF flag: 1 iff (~a & b) == 0.
        return __builtin_ia32_ptestc256(cast(long4)a, cast(long4)b);
    }
    else static if (LDC_with_ARM64)
    {
        // better to split than do vanilla (down to 10 inst)
        // Both halves must pass: AND the two 128-bit results.
        __m128i lo_a = _mm256_extractf128_si256!0(a);
        __m128i lo_b = _mm256_extractf128_si256!0(b);
        __m128i hi_a = _mm256_extractf128_si256!1(a);
        __m128i hi_b = _mm256_extractf128_si256!1(b);
        return _mm_testc_si128(lo_a, lo_b) & _mm_testc_si128(hi_a, hi_b);
    }
    else
    {
        // Generic fallback: compare (~a & b) against all-zero directly.
        __m256i c = ~a & b;
        long[4] zero = [0, 0, 0, 0];
        return c.array == zero;
    }
}
unittest
{
    __m256i A  = _mm256_setr_epi64(0x01, 0x02, 0x04, 0xf8);
    __m256i M1 = _mm256_setr_epi64(0xfe, 0xfd, 0x00, 0x00);
    __m256i M2 = _mm256_setr_epi64(0x00, 0x00, 0x04, 0x00);
    assert(_mm256_testc_si256(A, A) == 1);
    assert(_mm256_testc_si256(A, M1) == 0);
    assert(_mm256_testc_si256(A, M2) == 1);
}
4286 
/// Compute the bitwise AND of 128 bits (representing double-precision (64-bit) floating-point 
/// elements) in `a` and `b`, producing an intermediate 128-bit value, and set ZF to 1 if the 
/// sign bit of each 64-bit element in the intermediate value is zero, otherwise set ZF to 0. 
/// Compute the bitwise NOT of a and then AND with b, producing an intermediate value, and set
/// CF to 1 if the sign bit of each 64-bit element in the intermediate value is zero, otherwise
/// set CF to 0. Return 1 if both the ZF and CF values are zero, otherwise return 0.
///
/// In other words: there is at least one negative number in `b` that correspond to a positive number in `a`,
///             AND there is at least one negative number in `b` that correspond to a negative number in `a`.
int _mm_testnzc_pd (__m128d a, __m128d b) pure @safe
{
    // PERF DMD
    static if (GDC_or_LDC_with_AVX)
    {
        return __builtin_ia32_vtestnzcpd(a, b);
    }
    else
    {
        // ZF = 0 means "there is at least one pair of negative numbers"
        // ZF = 1 means "no pairs of negative numbers"
        // CF = 0 means "there is a negative number in b that is next to a positive number in a"
        // CF = 1 means "all negative numbers in b are also negative in a"
        // Consequently, CF = 0 and ZF = 0 means:
        //   "There is a pair of matching negative numbers in a and b, 
        //   AND also there is a negative number in b, that is matching a positive number in a"
        // Phew.

        // courtesy of simd-everywhere
        // srli_epi64 by 63 isolates each 64-bit lane's sign bit in its low bit;
        // .array[0]/.array[2] read the low 32 bits of each 64-bit lane.
        __m128i m = _mm_and_si128(cast(__m128i)a, cast(__m128i)b);
        __m128i m2 = _mm_andnot_si128(cast(__m128i)a, cast(__m128i)b);
        m = _mm_srli_epi64(m, 63);
        m2 = _mm_srli_epi64(m2, 63);
        return cast(int)( m.array[0] | m.array[2]) & (m2.array[0] | m2.array[2]);
    }
}
unittest
{
    __m128d PM = _mm_setr_pd( 1, -1);
    __m128d MP = _mm_setr_pd(-1,  1);
    __m128d MM = _mm_setr_pd(-1, -1);
    assert(_mm_testnzc_pd(PM, MP) == 0);
    assert(_mm_testnzc_pd(PM, MM) == 1);
    assert(_mm_testnzc_pd(MP, MP) == 0);
    assert(_mm_testnzc_pd(MP, MM) == 1);
    assert(_mm_testnzc_pd(MM, MM) == 0);
}
4333 
/// Compute the bitwise AND of 256 bits (representing double-precision (64-bit) floating-point 
/// elements) in `a` and `b`, producing an intermediate 256-bit value, and set ZF to 1 if the 
/// sign bit of each 64-bit element in the intermediate value is zero, otherwise set ZF to 0. 
/// Compute the bitwise NOT of a and then AND with b, producing an intermediate value, and set
/// CF to 1 if the sign bit of each 64-bit element in the intermediate value is zero, otherwise
/// set CF to 0. Return 1 if both the ZF and CF values are zero, otherwise return 0.
///
/// In other words: there is at least one negative number in `b` that correspond to a positive number in `a`,
///             AND there is at least one negative number in `b` that correspond to a negative number in `a`.
int _mm256_testnzc_pd (__m256d a, __m256d b) pure @safe
{
    // PERF DMD
    static if (GDC_or_LDC_with_AVX)
    {
        return __builtin_ia32_vtestnzcpd256(a, b);
    }
    else
    {
        // ZF emulation: OR all lanes of (a & b); ZF = 1 iff the combined
        // sign bit is clear. CF likewise from (~a & b).
        long4 la = cast(long4)a;
        long4 lb = cast(long4)b;
        long4 r = la & lb;
        long m = r.array[0] | r.array[1] | r.array[2] | r.array[3];
        int ZF = (~m >> 63) & 1;
        long4 r2 = ~la & lb;
        long m2 = r2.array[0] | r2.array[1] | r2.array[2] | r2.array[3];
        int CF = (~m2 >> 63) & 1;
        // NZC result: 1 only when both flags are 0.
        return (CF | ZF) == 0;
    }
}
unittest
{
    __m256d PM = _mm256_setr_pd( 1, -1, 1, 1);
    __m256d MP = _mm256_setr_pd(-1,  1, 1, 1);
    __m256d MM = _mm256_setr_pd(-1, -1, 1, 1);
    assert(_mm256_testnzc_pd(PM, MP) == 0);
    assert(_mm256_testnzc_pd(PM, MM) == 1);
    assert(_mm256_testnzc_pd(MP, MP) == 0);
    assert(_mm256_testnzc_pd(MP, MM) == 1);
    assert(_mm256_testnzc_pd(MM, MM) == 0);
}
4374 
/// Compute the bitwise AND of 128 bits (representing single-precision (32-bit) floating-point 
/// elements) in `a` and `b`, producing an intermediate 128-bit value, and set ZF to 1 if the 
/// sign bit of each 32-bit element in the intermediate value is zero, otherwise set ZF to 0. 
/// Compute the bitwise NOT of `a` and then AND with `b`, producing an intermediate value, and set
/// CF to 1 if the sign bit of each 32-bit element in the intermediate value is zero, otherwise
/// set CF to 0. Return 1 if both the ZF and CF values are zero, otherwise return 0.
///
/// In other words: there is at least one negative number in `b` that corresponds to a positive number in `a`,
///             AND there is at least one negative number in `b` that corresponds to a negative number in `a`.
int _mm_testnzc_ps (__m128 a, __m128 b) pure @safe
{
    // PERF DMD
    static if (GDC_or_LDC_with_AVX)
    {
        return __builtin_ia32_vtestnzcps(a, b);
    }
    else
    {
        int4 la = cast(int4)a;
        int4 lb = cast(int4)b;
        int4 r = la & lb;           // sign bit set where both lanes are negative
        int m = r.array[0] | r.array[1] | r.array[2] | r.array[3];
        int ZF = (~m >> 31) & 1;    // ZF = 1 iff no lane is negative in both a and b
        int4 r2 = ~la & lb;         // sign bit set where b is negative but a is not
        int m2 = r2.array[0] | r2.array[1] | r2.array[2] | r2.array[3];
        int CF = (~m2 >> 31) & 1;   // CF = 1 iff no lane is negative in b only
        return (CF | ZF) == 0;
    }
}
unittest
{
    __m128 PM = _mm_setr_ps( 1, -1, 1, 1);
    __m128 MP = _mm_setr_ps(-1,  1, 1, 1);
    __m128 MM = _mm_setr_ps(-1, -1, 1, 1);
    assert(_mm_testnzc_ps(PM, MP) == 0);
    assert(_mm_testnzc_ps(PM, MM) == 1);
    assert(_mm_testnzc_ps(MP, MP) == 0);
    assert(_mm_testnzc_ps(MP, MM) == 1);
    assert(_mm_testnzc_ps(MM, MM) == 0);
}
4415 
/// Compute the bitwise AND of 256 bits (representing single-precision (32-bit) floating-point 
/// elements) in `a` and `b`, producing an intermediate 256-bit value, and set ZF to 1 if the 
/// sign bit of each 32-bit element in the intermediate value is zero, otherwise set ZF to 0. 
/// Compute the bitwise NOT of `a` and then AND with `b`, producing an intermediate value, and set
/// CF to 1 if the sign bit of each 32-bit element in the intermediate value is zero, otherwise
/// set CF to 0. Return 1 if both the ZF and CF values are zero, otherwise return 0.
///
/// In other words: there is at least one negative number in `b` that corresponds to a positive number in `a`,
///             AND there is at least one negative number in `b` that corresponds to a negative number in `a`.
int _mm256_testnzc_ps (__m256 a, __m256 b) pure @safe
{
    // PERF DMD
    static if (GDC_or_LDC_with_AVX)
    {
        return __builtin_ia32_vtestnzcps256(a, b);
    }
    else
    {
        int8 la = cast(int8)a;
        int8 lb = cast(int8)b;
        int8 r = la & lb;           // sign bit set where both lanes are negative
        int m = r.array[0] | r.array[1] | r.array[2] | r.array[3]
            |   r.array[4] | r.array[5] | r.array[6] | r.array[7];
        int ZF = (~m >> 31) & 1;    // ZF = 1 iff no lane is negative in both a and b
        int8 r2 = ~la & lb;         // sign bit set where b is negative but a is not
        int m2 = r2.array[0] | r2.array[1] | r2.array[2] | r2.array[3]
               | r2.array[4] | r2.array[5] | r2.array[6] | r2.array[7];
        int CF = (~m2 >> 31) & 1;   // CF = 1 iff no lane is negative in b only
        return (CF | ZF) == 0;
    }
}
unittest
{
    __m256 PM = _mm256_setr_ps(1, 1, 1, 1,  1, -1, 1, 1);
    __m256 MP = _mm256_setr_ps(1, 1, 1, 1, -1,  1, 1, 1);
    __m256 MM = _mm256_setr_ps(1, 1, 1, 1, -1, -1, 1, 1);
    assert(_mm256_testnzc_ps(PM, MP) == 0);
    assert(_mm256_testnzc_ps(PM, MM) == 1);
    assert(_mm256_testnzc_ps(MP, MP) == 0);
    assert(_mm256_testnzc_ps(MP, MM) == 1);
    assert(_mm256_testnzc_ps(MM, MM) == 0);
}
4458 
/// Compute the bitwise AND of 256 bits (representing integer data) in `a` and `b`, 
/// and set ZF to 1 if the result is zero, otherwise set ZF to 0. 
/// Compute the bitwise NOT of `a` and then AND with `b`, and set CF to 1 if the 
/// result is zero, otherwise set CF to 0. 
/// Return 1 if both the ZF and CF values are zero, otherwise return 0.
int _mm256_testnzc_si256 (__m256i a, __m256i b) pure @trusted
{
    // PERF ARM64
    // PERF DMD
    // PERF LDC without AVX
    static if (GDC_or_LDC_with_AVX)
    {
        return __builtin_ia32_ptestnzc256(cast(long4) a, cast(long4) b);
    }
    else
    {
        // Need to defer to _mm_testnzc_si128 if possible, for more speed
        __m256i c = a & b;    // ZF = 0 iff any bit survives here
        __m256i d = ~a & b;   // CF = 0 iff any bit survives here
        long m = c.array[0] | c.array[1] | c.array[2] | c.array[3];
        long n = d.array[0] | d.array[1] | d.array[2] | d.array[3];
        return (m != 0) & (n != 0);
    }
}
unittest
{
    __m256i A  = _mm256_setr_epi32(0x01, 0x02, 0x04, 0xf8, 0, 0, 0, 0);
    __m256i M  = _mm256_setr_epi32(0x01, 0x40, 0x00, 0x00, 0, 0, 0, 0);
    __m256i Z = _mm256_setzero_si256();
    assert(_mm256_testnzc_si256(A, Z) == 0);
    assert(_mm256_testnzc_si256(A, M) == 1);
    assert(_mm256_testnzc_si256(A, A) == 0);
}
4492 
/// Compute the bitwise AND of 128 bits (representing double-precision (64-bit) floating-point 
/// elements) in `a` and `b`, producing an intermediate 128-bit value, return 1 if the sign bit of
/// each 64-bit element in the intermediate value is zero, otherwise return 0.
/// In other words, return 1 if `a` and `b` don't both have a negative number at the same place.
int _mm_testz_pd (__m128d a, __m128d b) pure @trusted
{
    static if (GDC_or_LDC_with_AVX)
    {
        return __builtin_ia32_vtestzpd(a, b);
    }
    else
    {
        long2 la = cast(long2)a;
        long2 lb = cast(long2)b;
        long2 r = la & lb;    // sign bit survives only where both lanes are negative
        long m = r.array[0] | r.array[1];
        return (~m >> 63) & 1;
    }
}
unittest
{
    __m128d A  = _mm_setr_pd(-1, 1);
    __m128d B = _mm_setr_pd(-1, -1);
    __m128d C = _mm_setr_pd(1, -1);
    assert(_mm_testz_pd(A, A) == 0);
    assert(_mm_testz_pd(A, B) == 0);
    assert(_mm_testz_pd(C, A) == 1);
}
4521 
/// Compute the bitwise AND of 256 bits (representing double-precision (64-bit) floating-point 
/// elements) in `a` and `b`, producing an intermediate 256-bit value, return 1 if the sign bit of
/// each 64-bit element in the intermediate value is zero, otherwise return 0.
/// In other words, return 1 if `a` and `b` don't both have a negative number at the same place.
int _mm256_testz_pd (__m256d a, __m256d b) pure @trusted
{
    static if (GDC_or_LDC_with_AVX)
    {
        return __builtin_ia32_vtestzpd256(a, b);
    }
    else
    {
        long4 la = cast(long4)a;
        long4 lb = cast(long4)b;
        long4 r = la & lb;    // sign bit survives only where both lanes are negative
        long r2 = r.array[0] | r.array[1] | r.array[2] | r.array[3];
        return (~r2 >> 63) & 1;
    }
}
unittest
{
    __m256d A = _mm256_setr_pd(-1, 1, -1, 1);
    __m256d B = _mm256_setr_pd(1,  1, -1, 1);
    __m256d C = _mm256_setr_pd(1, -1, 1, -1);
    assert(_mm256_testz_pd(A, A) == 0);
    assert(_mm256_testz_pd(A, B) == 0);
    assert(_mm256_testz_pd(C, A) == 1);
}
4550 
/// Compute the bitwise AND of 128 bits (representing single-precision (32-bit) floating-point 
/// elements) in `a` and `b`, producing an intermediate 128-bit value, return 1 if the sign bit of
/// each 32-bit element in the intermediate value is zero, otherwise return 0.
/// In other words, return 1 if `a` and `b` don't both have a negative number at the same place.
int _mm_testz_ps (__m128 a, __m128 b) pure @safe
{
    // PERF DMD
    static if (GDC_or_LDC_with_AVX)
    {
        return __builtin_ia32_vtestzps(a, b);
    }
    else
    {
        int4 la = cast(int4)a;
        int4 lb = cast(int4)b;
        int4 r = la & lb;    // sign bit survives only where both lanes are negative
        int m = r.array[0] | r.array[1] | r.array[2] | r.array[3];
        return (~m >> 31) & 1;
    }
}
unittest
{
    __m128 A = _mm_setr_ps(-1,  1, -1,  1);
    __m128 B = _mm_setr_ps( 1,  1, -1,  1);
    __m128 C = _mm_setr_ps( 1, -1,  1, -1);
    assert(_mm_testz_ps(A, A) == 0);
    assert(_mm_testz_ps(A, B) == 0);
    assert(_mm_testz_ps(C, A) == 1);
    assert(_mm_testz_ps(C, B) == 1);
}
4581 
/// Compute the bitwise AND of 256 bits (representing single-precision (32-bit) floating-point 
/// elements) in `a` and `b`, producing an intermediate 256-bit value, return 1 if the sign bit of
/// each 32-bit element in the intermediate value is zero, otherwise return 0.
/// In other words, return 1 if `a` and `b` don't both have a negative number at the same place.
int _mm256_testz_ps (__m256 a, __m256 b) pure @safe
{
    // PERF DMD
    static if (GDC_or_LDC_with_AVX)
    {
        return __builtin_ia32_vtestzps256(a, b);
    }
    else
    {
        int8 la = cast(int8)a;
        int8 lb = cast(int8)b;
        int8 r = la & lb;    // sign bit survives only where both lanes are negative
        int m = r.array[0] | r.array[1] | r.array[2] | r.array[3]
            | r.array[4] | r.array[5] | r.array[6] | r.array[7];
        return (~m >> 31) & 1;
    }
}
unittest
{
    __m256 A = _mm256_setr_ps(-1,  1, -1,  1, 1, 1, 1, 1);
    __m256 B = _mm256_setr_ps( 1,  1, -1,  1, 1, 1, 1, 1);
    __m256 C = _mm256_setr_ps( 1, -1,  1, -1, 1, 1, 1, 1);
    assert(_mm256_testz_ps(A, A) == 0);
    assert(_mm256_testz_ps(A, B) == 0);
    assert(_mm256_testz_ps(C, A) == 1);
    assert(_mm256_testz_ps(C, B) == 1);
}
4603 
/// Compute the bitwise AND of 256 bits (representing integer data) in `a` and `b`, 
/// and return 1 if the result is zero, otherwise return 0.
/// In other words, test if all bits masked by `b` are 0 in `a`.
int _mm256_testz_si256 (__m256i a, __m256i b) @trusted
{
    // PERF DMD
    static if (GDC_or_LDC_with_AVX)
    {
        // GDC and LDC share the same builtin here; merged for consistency
        // with the other ptest-style intrinsics in this file.
        return __builtin_ia32_ptestz256(cast(long4)a, cast(long4)b);
    }
    else version(LDC)
    {
        // better to split than do vanilla (down to 8 inst in arm64)
        __m128i lo_a = _mm256_extractf128_si256!0(a);
        __m128i lo_b = _mm256_extractf128_si256!0(b);
        __m128i hi_a = _mm256_extractf128_si256!1(a);
        __m128i hi_b = _mm256_extractf128_si256!1(b);
        return _mm_testz_si128(lo_a, lo_b) & _mm_testz_si128(hi_a, hi_b);
    }
    else
    {
        __m256i c = a & b;
        long[4] zero = [0, 0, 0, 0];
        return c.array == zero;
    }
}
unittest
{
    __m256i A  = _mm256_setr_epi32(0x01, 0x02, 0x04, 0xf8, 0x01, 0x02, 0x04, 0xf8);
    __m256i M1 = _mm256_setr_epi32(0xfe, 0xfd, 0x00, 0x07, 0xfe, 0xfd, 0x00, 0x07);
    __m256i M2 = _mm256_setr_epi32(0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x04, 0x00);
    assert(_mm256_testz_si256(A, A) == 0);
    assert(_mm256_testz_si256(A, M1) == 1);
    assert(_mm256_testz_si256(A, M2) == 0);
}
4643 
/// Return vector of type __m256d with undefined elements.
__m256d _mm256_undefined_pd () pure @safe
{
    // Deliberately uninitialized; callers must not rely on the contents.
    __m256d result = void;
    return result;
}
4650 
/// Return vector of type __m256 with undefined elements.
__m256 _mm256_undefined_ps () pure @safe
{
    // Deliberately uninitialized; callers must not rely on the contents.
    __m256 result = void;
    return result;
}
4657 
/// Return vector of type __m256i with undefined elements.
__m256i _mm256_undefined_si256 () pure @safe
{
    // Deliberately uninitialized; callers must not rely on the contents.
    __m256i result = void;
    return result;
}
4664 
/// Unpack and interleave double-precision (64-bit) floating-point elements from the high half of 
/// each 128-bit lane in `a` and `b`.
__m256d _mm256_unpackhi_pd (__m256d a, __m256d b) pure @trusted
{
    static if (LDC_with_optimizations)
    {
        // Shuffle indices 1,5,3,7 = high element of each 128-bit lane, interleaved a/b.
        enum ir = `%r = shufflevector <4 x double> %0, <4 x double> %1, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
                   ret <4 x double> %r`;
        return LDCInlineIR!(ir, double4, double4, double4)(a, b);
    }
    else static if (GDC_with_AVX)
    {
        return __builtin_ia32_unpckhpd256 (a, b);
    }
    else
    {
        // Scalar fallback: result = [a[1], b[1], a[3], b[3]].
        __m256d r;
        r.ptr[0] = a.array[1];
        r.ptr[1] = b.array[1];
        r.ptr[2] = a.array[3];
        r.ptr[3] = b.array[3];
        return r;
    } 
}
unittest
{
    __m256d A = _mm256_setr_pd(1.0, 2, 3, 4);
    __m256d B = _mm256_setr_pd(5.0, 6, 7, 8);
    __m256d C = _mm256_unpackhi_pd(A, B);
    double[4] correct =       [2.0, 6, 4, 8];
    assert(C.array == correct);
}
4697 
4698 
/// Unpack and interleave single-precision (32-bit) floating-point elements from the high half of 
/// each 128-bit lane in `a` and `b`.
__m256 _mm256_unpackhi_ps (__m256 a, __m256 b) pure @trusted
{
    static if (LDC_with_optimizations)
    {
        // High half of each 128-bit lane, interleaved a/b.
        enum ir = `%r = shufflevector <8 x float> %0, <8 x float> %1, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15>
            ret <8 x float> %r`;
        return LDCInlineIR!(ir, float8, float8, float8)(a, b);
    }
    else static if (GDC_with_AVX)
    {
        return __builtin_ia32_unpckhps256 (a, b);
    }
    else
    {
        // Scalar fallback: result = [a[2], b[2], a[3], b[3], a[6], b[6], a[7], b[7]].
        __m256 r;
        r.ptr[0] = a.array[2];
        r.ptr[1] = b.array[2];
        r.ptr[2] = a.array[3];
        r.ptr[3] = b.array[3];
        r.ptr[4] = a.array[6];
        r.ptr[5] = b.array[6];
        r.ptr[6] = a.array[7];
        r.ptr[7] = b.array[7];
        return r;
    } 
}
unittest
{
    __m256 A = _mm256_setr_ps(0.0f,  1,  2,  3,  4,  5,  6,  7);
    __m256 B = _mm256_setr_ps(8.0f,  9, 10, 11, 12, 13, 14, 15);
    __m256 C = _mm256_unpackhi_ps(A, B);
    float[8] correct =       [2.0f, 10,  3, 11,  6, 14,  7, 15];
    assert(C.array == correct);
}
4735 
/// Unpack and interleave double-precision (64-bit) floating-point elements from the low half of 
/// each 128-bit lane in `a` and `b`.
__m256d _mm256_unpacklo_pd (__m256d a, __m256d b) pure @trusted // attributes added for consistency with _mm256_unpackhi_pd
{
    static if (LDC_with_optimizations)
    {
        // Shuffle indices 0,4,2,6 = low element of each 128-bit lane, interleaved a/b.
        enum ir = `%r = shufflevector <4 x double> %0, <4 x double> %1, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
            ret <4 x double> %r`;
        return LDCInlineIR!(ir, double4, double4, double4)(a, b);
    }
    else static if (GDC_with_AVX)
    {
        return __builtin_ia32_unpcklpd256 (a, b);
    }
    else
    {
        // Scalar fallback: result = [a[0], b[0], a[2], b[2]].
        __m256d r;
        r.ptr[0] = a.array[0];
        r.ptr[1] = b.array[0];
        r.ptr[2] = a.array[2];
        r.ptr[3] = b.array[2];
        return r;
    }
}
unittest
{
    __m256d A = _mm256_setr_pd(1.0, 2, 3, 4);
    __m256d B = _mm256_setr_pd(5.0, 6, 7, 8);
    __m256d C = _mm256_unpacklo_pd(A, B);
    double[4] correct =       [1.0, 5, 3, 7];
    assert(C.array == correct);
}
4768 
/// Unpack and interleave single-precision (32-bit) floating-point elements from the low half of
/// each 128-bit lane in `a` and `b`.
__m256 _mm256_unpacklo_ps (__m256 a, __m256 b) pure @trusted // attributes added for consistency with _mm256_unpackhi_ps
{
    static if (LDC_with_optimizations)
    {
        // Low half of each 128-bit lane, interleaved a/b.
        enum ir = `%r = shufflevector <8 x float> %0, <8 x float> %1, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13>
            ret <8 x float> %r`;
        return LDCInlineIR!(ir, float8, float8, float8)(a, b);
    }
    else static if (GDC_with_AVX)
    {
        return __builtin_ia32_unpcklps256 (a, b);
    }
    else
    {
        // Scalar fallback: result = [a[0], b[0], a[1], b[1], a[4], b[4], a[5], b[5]].
        __m256 r;
        r.ptr[0] = a.array[0];
        r.ptr[1] = b.array[0];
        r.ptr[2] = a.array[1];
        r.ptr[3] = b.array[1];
        r.ptr[4] = a.array[4];
        r.ptr[5] = b.array[4];
        r.ptr[6] = a.array[5];
        r.ptr[7] = b.array[5];
        return r;
    }
}
unittest
{
    __m256 A = _mm256_setr_ps(0.0f,  1,  2,  3,  4,  5,  6,  7);
    __m256 B = _mm256_setr_ps(8.0f,  9, 10, 11, 12, 13, 14, 15);
    __m256 C = _mm256_unpacklo_ps(A, B);
    float[8] correct =       [0.0f,  8,  1,  9,  4, 12,  5, 13];
    assert(C.array == correct);
}
4805 
/// Compute the bitwise XOR of packed double-precision (64-bit) floating-point elements in `a` and `b`.
__m256d _mm256_xor_pd (__m256d a, __m256d b) pure @safe
{
    // XOR the raw bit patterns, then reinterpret as doubles.
    __m256i bitsA = cast(__m256i)a;
    __m256i bitsB = cast(__m256i)b;
    return cast(__m256d)(bitsA ^ bitsB);
}
4811 
/// Compute the bitwise XOR of packed single-precision (32-bit) floating-point elements in `a` and `b`.
__m256 _mm256_xor_ps (__m256 a, __m256 b) pure @safe
{
    // XOR the raw bit patterns, then reinterpret as floats.
    __m256i bitsA = cast(__m256i)a;
    __m256i bitsB = cast(__m256i)b;
    return cast(__m256)(bitsA ^ bitsB);
}
4817 
/// Zero the contents of all XMM/YMM registers (VZEROALL).
/// Note: only emitted explicitly for GDC with AVX; elsewhere it is a no-op and
/// SSE/AVX transition penalties are left to the backend.
void _mm256_zeroall () pure @safe
{
    // PERF DMD needs to do it explicitely if AVX is ever used one day.

    static if (GDC_with_AVX)
    {
        __builtin_ia32_vzeroall();
    }
    else
    {
        // Do nothing. The transitions penalty are supposed handled by the backend (eg: LLVM).
    }
}
4831 
/// Zero the upper 128 bits of all YMM registers (VZEROUPPER).
/// Note: only emitted explicitly for GDC with AVX; elsewhere it is a no-op and
/// SSE/AVX transition penalties are left to the backend.
void _mm256_zeroupper () pure @safe
{
    // PERF DMD needs to do it explicitely if AVX is ever used.

    static if (GDC_with_AVX)
    {
        __builtin_ia32_vzeroupper();
    }
    else
    {
        // Do nothing. The transitions penalty are supposed handled by the backend (eg: LLVM).
    }
    
}
4846 
/// Cast vector of type `__m128d` to type `__m256d`; the upper 128 bits of the result are zeroed.
__m256d _mm256_zextpd128_pd256 (__m128d a) pure @trusted
{
    __m256d res;
    res.ptr[0] = a.array[0];   // low 128 bits copied verbatim
    res.ptr[1] = a.array[1];
    res.ptr[2] = 0.0;          // upper half cleared
    res.ptr[3] = 0.0;
    return res;
}
unittest
{
    __m256d extended = _mm256_zextpd128_pd256(_mm_setr_pd(2.0, -3.0));
    double[4] expected = [2.0, -3, 0, 0];
    assert(extended.array == expected);
}
4863 
/// Cast vector of type `__m128` to type `__m256`; the upper 128 bits of the result are zeroed.
__m256 _mm256_zextps128_ps256 (__m128 a) pure @trusted
{
    // Move the 128 input bits as two 64-bit chunks, then clear the top half.
    double2 chunks = cast(double2)a;
    double4 res;
    res.ptr[0] = chunks.array[0];
    res.ptr[1] = chunks.array[1];
    res.ptr[2] = 0.0;
    res.ptr[3] = 0.0;
    return cast(__m256)res;
}
unittest
{
    __m256 extended = _mm256_zextps128_ps256(_mm_setr_ps(2.0, -3.0, 4, -5));
    float[8] expected = [2.0, -3, 4, -5, 0, 0, 0, 0];
    assert(extended.array == expected);
}
4881 
/// Cast vector of type `__m128i` to type `__m256i`; the upper 128 bits of the result are zeroed. 
__m256i _mm256_zextsi128_si256 (__m128i a) pure @trusted
{
    // Copy the input as two 64-bit chunks, then clear the top half.
    long2 chunks = cast(long2)a;
    __m256i res;
    res.ptr[0] = chunks.array[0];
    res.ptr[1] = chunks.array[1];
    res.ptr[2] = 0;
    res.ptr[3] = 0;
    return res;
}
unittest
{
    __m256i extended = _mm256_zextsi128_si256(_mm_setr_epi64(-1, 99));
    long[4] expected = [-1, 99, 0, 0];
    assert(extended.array == expected);
}
4899 
4900 
4901 // F16C start here
4902 
/// Rounding-control constants used as the `imm8` argument of `_mm_cvtps_ph` and
/// `_mm256_cvtps_ph`. Bits [1:0] select the rounding direction; `_MM_FROUND_CUR_DIRECTION`
/// defers to MXCSR.RC; `_MM_FROUND_NO_EXC` (bit 3) suppresses precision exceptions.
/// The `FLOOR`/`CEIL`/`TRUNC`/`RINT`/`NEARBYINT` names are the usual libm-style combinations.
enum _MM_FROUND_TO_NEAREST_INT = 0,
     _MM_FROUND_TO_NEG_INF     = 1,
     _MM_FROUND_TO_POS_INF     = 2,
     _MM_FROUND_TO_ZERO        = 3,
     _MM_FROUND_CUR_DIRECTION  = 4,
     _MM_FROUND_RAISE_EXC      = 0,
     _MM_FROUND_NO_EXC         = 8,
     _MM_FROUND_NINT           = 0,
     _MM_FROUND_FLOOR = _MM_FROUND_RAISE_EXC | _MM_FROUND_TO_NEG_INF,
     _MM_FROUND_CEIL  = _MM_FROUND_RAISE_EXC | _MM_FROUND_TO_POS_INF,
     _MM_FROUND_TRUNC = _MM_FROUND_RAISE_EXC | _MM_FROUND_TO_ZERO,
     _MM_FROUND_RINT  = _MM_FROUND_RAISE_EXC | _MM_FROUND_CUR_DIRECTION,
     _MM_FROUND_NEARBYINT = _MM_FROUND_NO_EXC | _MM_FROUND_CUR_DIRECTION;
4916 
/// Convert 4 packed half-precision (16-bit) floating-point elements 
/// in `a` to packed single-precision (32-bit) floating-point elements.
/// Note: Only lowest 64-bit of input considered.
///       Preserve infinities, sign of zeroes, and NaN-ness.
__m128 _mm_cvtph_ps(__m128i a) pure @trusted
{
    // Note: an unused `short8 sa` local was removed here; no branch read it.
    static if (LDC_with_ARM64)
    {
        // Gather the 4 low 16-bit lanes, then use the native half->float conversion.
        short4 h;
        short8 as = cast(short8) a;
        h.ptr[0] = as.array[0];
        h.ptr[1] = as.array[1];
        h.ptr[2] = as.array[2];
        h.ptr[3] = as.array[3];
        return vcvt_f32_f16(h);
    }
    else static if (LDC_with_F16C)
    {
        // Note: clang has a __builtin_ia32_vcvtph2ps256 but we don't
        // Note: LLVM IR fpext leads to  call __extendhfsf2@PLT
        // Same with the pragma llvm.convert.from.fp16, so not sure 
        // what to do
        return cast(__m128)__asm!(float4)("vcvtph2ps $1, $0", "=v,v", a);
    }
    else
    {
        // Scalar SSE emulation of VCVTPH2PS.
        // Reference: stb_image_resize2.h has F16C emulation.
        __m128i mask_nosign      = _mm_set1_epi32(0x7fff);
        __m128i smallest_normal  = _mm_set1_epi32(0x0400);
        __m128i infinity         = _mm_set1_epi32(0x7c00);
        __m128i expadjust_normal = _mm_set1_epi32((127 - 15) << 23);
        __m128i magic_denorm     = _mm_set1_epi32(113 << 23);
        __m128i i = a;
        __m128i h = _mm_unpacklo_epi16 ( i, _mm_setzero_si128() );
        __m128i mnosign     = mask_nosign;
        __m128i eadjust     = expadjust_normal;
        __m128i smallest    = smallest_normal;
        __m128i infty       = infinity;
        __m128i expmant     = _mm_and_si128(mnosign, h);
        __m128i justsign    = _mm_xor_si128(h, expmant);
        __m128i b_notinfnan = _mm_cmpgt_epi32(infty, expmant);
        __m128i b_isdenorm  = _mm_cmpgt_epi32(smallest, expmant);
        __m128i shifted     = _mm_slli_epi32(expmant, 13);
        __m128i adj_infnan  = _mm_andnot_si128(b_notinfnan, eadjust);
        __m128i adjusted    = _mm_add_epi32(eadjust, shifted);
        __m128i den1        = _mm_add_epi32(shifted, magic_denorm);
        __m128i adjusted2   = _mm_add_epi32(adjusted, adj_infnan);
        __m128  den2        = _mm_sub_ps(cast(__m128)den1, *cast(const(__m128)*)&magic_denorm);
        __m128  adjusted3   = _mm_and_ps(den2, cast(__m128)b_isdenorm);
        __m128  adjusted4   = _mm_andnot_ps(cast(__m128)b_isdenorm, cast(__m128)adjusted2);
        __m128  adjusted5   = _mm_or_ps(adjusted3, adjusted4);
        __m128i sign        = _mm_slli_epi32(justsign, 16);
        __m128  final_      = _mm_or_ps(adjusted5, cast(__m128)sign);
        return final_;
    }
}
unittest
{
    __m128i A = _mm_setr_epi16(cast(short)0x8000, 0x7C00, cast(short)0xDA90, 0x5000, 0, 0, 0, 0);
    float[4] correct      = [-0.0f, float.infinity, -210.0f,  32.0f];
    __m128 R = _mm_cvtph_ps(A);
    assert(R.array == correct);
}
4984 
/// Convert 8 packed half-precision (16-bit) floating-point elements 
/// in `a` to packed single-precision (32-bit) floating-point elements.
/// Note: Preserve infinities, sign of zeroes, and NaN-ness.
__m256 _mm256_cvtph_ps(__m128i a) pure @trusted
{
    static if (LDC_with_F16C)
    {
        return __asm!(float8)("vcvtph2ps $1, $0", "=v,v", a);
    }
    else
    {
        // In stb_image_resize2.h, _mm_cvtph_ps is simply hand-inlined 2x
        // so we do the same here.
        int4 ihi;                       // upper 4 halves moved into low 64 bits
        ihi.ptr[0] = a.array[2];
        ihi.ptr[1] = a.array[3];
        __m128 lo = _mm_cvtph_ps(a);    // converts input lanes 0..3
        __m128 hi = _mm_cvtph_ps(ihi);  // converts input lanes 4..7
        return _mm256_set_m128(hi, lo);
    }
}
unittest
{
    __m128i A = _mm_setr_epi16(0, cast(short)-32768, 0, cast(short)0xFC00, 0x7C00, 0x5A90,cast(short)0xDA90, 0x5000);
    float[8] correct      = [0.0f, -0.0f, 0.0f, -float.infinity, float.infinity, 210.0f, -210.0f,  32.0f];
    __m256 R = _mm256_cvtph_ps(A);
    assert(R.array == correct);
}
5013 
/// Convert packed single-precision (32-bit) floating-point elements in a to 
/// packed half-precision (16-bit) floating-point elements USING NEAREST ROUNDING.
///
/// Normally rounding is done according to the imm8[2:0] parameter, which can be one of:
/// - _MM_FROUND_TO_NEAREST_INT // round to nearest
/// - _MM_FROUND_TO_NEG_INF     // round down
/// - _MM_FROUND_TO_POS_INF     // round up
/// - _MM_FROUND_TO_ZERO        // truncate
/// - _MM_FROUND_CUR_DIRECTION  // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
///
/// HOWEVER BUG: here only nearest rounding is supported, this is ensured statically.
/// Note: Preserve infinities, sign of zeroes, and NaN-ness.
__m128i _mm_cvtps_ph(int imm8)(__m128 a)
{
    // Only nearest rounding is supported
    // If you REALLY need another rounding mode, contact intel-intrinsics authors.
    static assert((imm8 & 3) == _MM_FROUND_TO_NEAREST_INT);

    static if (LDC_with_ARM64)
    {
        // BUG: this has semantics different from the x86
        // version. vcvt_f16_f32 uses FPCR, while the x86
        // path uses nearest-rounding. 

        short8 r;
        short4 h = vcvt_f16_f32(a);
        r.ptr[0] = h.array[0];
        r.ptr[1] = h.array[1];
        r.ptr[2] = h.array[2];
        r.ptr[3] = h.array[3];
        r.ptr[4] = 0;
        r.ptr[5] = 0;
        r.ptr[6] = 0;
        r.ptr[7] = 0;
        return cast(__m128i) r;
    }
    else static if (GDC_or_LDC_with_F16C)
    {
        return cast(__m128i) __builtin_ia32_vcvtps2ph(a, imm8);
    }
    else
    {
        // Source: stb_image_resize2.h
        // Fabian Giesen's round-to-nearest-even float to half routine
        __m128  msign       = cast(__m128) _mm_set1_epi32(0x80000000);
        __m128  justsign    = _mm_and_ps(msign, a);             // isolated sign bits
        __m128  absf        = _mm_xor_ps(a, justsign);          // |a|
        __m128i absf_int    = cast(__m128i) absf;
        __m128i f16max      = _mm_set1_epi32((127 + 16) << 23); // first FP32 too large for FP16
        __m128  b_isnan     = _mm_cmpunord_ps(absf, absf); // is this a NaN?
        __m128i b_isregular = _mm_cmpgt_epi32(f16max, absf_int); // (sub)normalized or special?
        __m128i c_nanbit    = _mm_set1_epi32(0x200);
        __m128i c_infty_as_fp16 = _mm_set1_epi32(0x7c00);
        __m128i nanbit      = _mm_and_si128(_mm_castps_si128(b_isnan), c_nanbit);
        __m128i inf_or_nan  = _mm_or_si128(nanbit, c_infty_as_fp16); // output for specials

        __m128i min_normal  = _mm_set1_epi32( (127 - 14) << 23 ); // smallest FP32 that yields a normalized FP16
        __m128i b_issub     = _mm_cmpgt_epi32(min_normal, absf_int);
        __m128i c_subnorm_magic = _mm_set1_epi32( ((127 - 15) + (23 - 10) + 1) << 23 );

        // "result is subnormal" path
        __m128  subnorm1    = _mm_add_ps(absf, cast(__m128) c_subnorm_magic); // magic value to round output mantissa
        __m128i subnorm2    = _mm_sub_epi32(cast(__m128i)(subnorm1), c_subnorm_magic); // subtract out bias

        // "result is normal" path
        __m128i mantoddbit  = _mm_slli_epi32(absf_int, 31 - 13); // shift bit 13 (mantissa LSB) to sign
        __m128i mantodd     = _mm_srai_epi32(mantoddbit, 31); // -1 if FP16 mantissa odd, else 0

        __m128i c_normal_bias = _mm_set1_epi32(0xfff - ((127 - 15) << 23) );
        __m128i round1      = _mm_add_epi32(absf_int, c_normal_bias);
        __m128i round2      = _mm_sub_epi32(round1, mantodd); // if mantissa LSB odd, bias towards rounding up (RTNE)
        __m128i normal      = _mm_srli_epi32(round2, 13); // rounded result

        // combine the two non-specials
        __m128i nonspecial  = _mm_or_si128(_mm_and_si128(subnorm2, b_issub), _mm_andnot_si128(b_issub, normal));

        // merge in specials as well
        __m128i joined      = _mm_or_si128(_mm_and_si128(nonspecial, b_isregular), _mm_andnot_si128(b_isregular, inf_or_nan));

        __m128i sign_shift  = _mm_srai_epi32(_mm_castps_si128(justsign), 16);
        __m128i final_ = _mm_or_si128(joined, sign_shift);
        // Pack the 4 results into the low 4 lanes; upper 4 lanes become 0.
        return _mm_packs_epi32(final_, _mm_setzero_si128());
    }
}
unittest
{
    float[4] A = [0.0f, -0.0f, 0.0f, -float.infinity];
    short8 RA = cast(short8) _mm_cvtps_ph!_MM_FROUND_TO_NEAREST_INT(_mm_loadu_ps(A.ptr));
    short[8] correctA = [0, cast(short)-32768, 0, cast(short)0xFC00, 0, 0, 0, 0];
    assert(RA.array == correctA);

    float[4] B = [float.infinity, 210.0f, -210.0f,  32.0000001f];
    short8 RB = cast(short8) _mm_cvtps_ph!_MM_FROUND_TO_NEAREST_INT(_mm_loadu_ps(B.ptr));
    short[8] correctB = [0x7C00, 0x5A90, cast(short)0xDA90, 0x5000, 0, 0, 0, 0];
    assert(RB.array == correctB);
}
5110 
/// Convert packed single-precision (32-bit) floating-point elements in a to 
/// packed half-precision (16-bit) floating-point elements USING NEAREST ROUNDING.
///
/// Normally rounding is done according to the imm8[2:0] parameter, which can be one of:
/// - _MM_FROUND_TO_NEAREST_INT // round to nearest
/// - _MM_FROUND_TO_NEG_INF     // round down
/// - _MM_FROUND_TO_POS_INF     // round up
/// - _MM_FROUND_TO_ZERO        // truncate
/// - _MM_FROUND_CUR_DIRECTION  // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
///
/// HOWEVER here only nearest rounding is supported, this is ensured statically.
/// Note: Preserve infinities, sign of zeroes, and NaN-ness.
__m128i _mm256_cvtps_ph(int imm8)(__m256 a)
{
    // Only nearest rounding is supported
    // If you REALLY need another rounding mode, contact intel-intrinsics authors.
    static assert((imm8 & 3) == _MM_FROUND_TO_NEAREST_INT);

    static if (LDC_with_ARM64)
    {
        // BUG: this has semantics different from the x86
        // version. vcvt_f16_f32 uses FPCR, while the x86
        // path uses nearest-rounding. 

        // Convert each 128-bit half separately, then interleave the two short4 results.
        __m128 a_lo = _mm256_extractf128_ps!0(a);
        __m128 a_hi = _mm256_extractf128_ps!1(a);
        short8 r;
        short4 h1 = vcvt_f16_f32(a_lo);
        short4 h2 = vcvt_f16_f32(a_hi);
        r.ptr[0] = h1.array[0];
        r.ptr[1] = h1.array[1];
        r.ptr[2] = h1.array[2];
        r.ptr[3] = h1.array[3];
        r.ptr[4] = h2.array[0];
        r.ptr[5] = h2.array[1];
        r.ptr[6] = h2.array[2];
        r.ptr[7] = h2.array[3];
        return cast(__m128i) r;
    }
    else static if (GDC_or_LDC_with_F16C)
    {
        return cast(__m128i) __builtin_ia32_vcvtps2ph256(a, imm8);
    }
    else
    {
        // Fallback: _mm_cvtps_ph on each half; each call yields 4 halves
        // in its low 64 bits, so keep only the low 64 bits of each result.
        __m128 a_lo = _mm256_extractf128_ps!0(a);
        __m128 a_hi = _mm256_extractf128_ps!1(a);
        long2 r_lo = cast(long2) _mm_cvtps_ph!imm8(a_lo);
        long2 r_hi = cast(long2) _mm_cvtps_ph!imm8(a_hi);
        long2 r;
        r.ptr[0] = r_lo.array[0];
        r.ptr[1] = r_hi.array[0];
        return cast(__m128i) r;
    }
}
unittest
{
    float[8] A = [0.0f, -0.0f, 0.0f, -float.infinity, float.infinity, 210.0f, -210.0f,  32.0000001f];
    short8 R = cast(short8) _mm256_cvtps_ph!_MM_FROUND_TO_NEAREST_INT(_mm256_loadu_ps(A.ptr));
    short[8] correct = [0, -32768, 0, cast(short)0xFC00, 0x7C00, 0x5A90, cast(short)0xDA90, 0x5000];
    assert(R.array == correct);
}