1 /**
2 * AVX intrinsics.
3 * https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#techs=AVX
4 *
5 * Copyright: Guillaume Piolat 2022.
6 *            Johan Engelen 2022.
7 * License:   $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
8 */
9 module inteli.avxintrin;
10 
11 // AVX instructions
12 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#techs=AVX
13 // Note: this header will work whether you have AVX enabled or not.
14 // With LDC, use "dflags-ldc": ["-mattr=+avx"] or equivalent to actively
15 // generate AVX instructions.
16 // With GDC, use "dflags-gdc": ["-mavx"] or equivalent to actively
17 // generate AVX instructions.
18 
19 public import inteli.types;
20 import inteli.internals;
21 
22 // Pull in all previous instruction set intrinsics.
23 public import inteli.tmmintrin;
24 
25 nothrow @nogc:
26 
27 /// Add packed double-precision (64-bit) floating-point elements in `a` and `b`.
28 __m256d _mm256_add_pd (__m256d a, __m256d b) pure @trusted
29 {
30     return a + b;
31 }
32 unittest
33 {
34     align(32) double[4] A = [-1, 2, -3, 40000];
35     align(32) double[4] B = [ 9, -7, 8, -0.5];
36     __m256d R = _mm256_add_pd(_mm256_load_pd(A.ptr), _mm256_load_pd(B.ptr));
37     double[4] correct = [8, -5, 5, 39999.5];
38     assert(R.array == correct);
39 }
40 
41 /// Add packed single-precision (32-bit) floating-point elements in `a` and `b`.
42 __m256 _mm256_add_ps (__m256 a, __m256 b) pure @trusted
43 {
44     return a + b;
45 }
46 unittest
47 {
48     align(32) float[8] A = [-1.0f, 2, -3, 40000, 0, 3, 5, 6];
49     align(32) float[8] B = [ 9.0f, -7, 8,  -0.5, 8, 7, 3, -1];
50     __m256 R = _mm256_add_ps(_mm256_load_ps(A.ptr), _mm256_load_ps(B.ptr));
51     float[8] correct     = [8, -5, 5, 39999.5, 8, 10, 8, 5];
52     assert(R.array == correct);
53 }
54 
55 /// Alternatively add and subtract packed double-precision (64-bit) floating-point
56 ///  elements in `a` to/from packed elements in `b`.
57 __m256d _mm256_addsub_pd (__m256d a, __m256d b) pure @trusted
58 {
59     // PERF DMD
60     static if (GDC_or_LDC_with_AVX)
61     {
62         return __builtin_ia32_addsubpd256(a, b);
63     }
64     else
65     {
66         //// Note: GDC x86 generates addsubpd since GDC 11.1 with -O3
67         ////       LDC x86 generates addsubpd since LDC 1.18 with -O2
68         //// LDC ARM: not fantastic, ok since LDC 1.18 -O2
69         a.ptr[0] = a.array[0] + (-b.array[0]);
70         a.ptr[1] = a.array[1] + b.array[1];
71         a.ptr[2] = a.array[2] + (-b.array[2]);
72         a.ptr[3] = a.array[3] + b.array[3];
73         return a;
74     }
75 }
76 unittest
77 {
78     align(32) double[4] A = [-1, 2, -3, 40000];
79     align(32) double[4] B = [ 9, -7, 8, -0.5];
80     __m256d R = _mm256_addsub_pd(_mm256_load_pd(A.ptr), _mm256_load_pd(B.ptr));
81     double[4] correct = [-10, -5, -11, 39999.5];
82     assert(R.array == correct);
83 }
84 
85 /// Alternatively add and subtract packed single-precision (32-bit) floating-point elements 
86 /// in `a` to/from packed elements in `b`.
87 __m256 _mm256_addsub_ps (__m256 a, __m256 b) pure @trusted
88 {
89     // PERF DMD
90     static if (GDC_or_LDC_with_AVX)
91     {
92         return __builtin_ia32_addsubps256(a, b);
93     }
94     else
95     {
96         // Note: GDC x86 generates addsubps since GDC 11 -O3
97         //               and in absence of AVX, a pair of SSE3 addsubps since GDC 12 -O2
98         //       LDC x86 generates addsubps since LDC 1.18 -O2
99         //               and in absence of AVX, a pair of SSE3 addsubps since LDC 1.1 -O1
100         // LDC ARM: neat output since LDC 1.21 -O2
101    
102         a.ptr[0] = a.array[0] + (-b.array[0]);
103         a.ptr[1] = a.array[1] + b.array[1];
104         a.ptr[2] = a.array[2] + (-b.array[2]);
105         a.ptr[3] = a.array[3] + b.array[3];
106         a.ptr[4] = a.array[4] + (-b.array[4]);
107         a.ptr[5] = a.array[5] + b.array[5];
108         a.ptr[6] = a.array[6] + (-b.array[6]);
109         a.ptr[7] = a.array[7] + b.array[7];
110         return a;
111     }
112 }
113 unittest
114 {
115     align(32) float[8] A = [-1.0f,  2,  -3, 40000,    0, 3,  5,  6];
116     align(32) float[8] B = [ 9.0f, -7,   8,  -0.5,    8, 7,  3, -1];
117     __m256 R = _mm256_addsub_ps(_mm256_load_ps(A.ptr), _mm256_load_ps(B.ptr));
118     float[8] correct     = [  -10, -5, -11, 39999.5, -8, 10, 2,  5];
119     assert(R.array == correct);
120 }
121 
122 /// Compute the bitwise AND of packed double-precision (64-bit) floating-point elements in `a` and `b`.
123 __m256d _mm256_and_pd (__m256d a, __m256d b) pure @trusted
124 {
125     // Note: GCC avxintrin.h uses the builtins for AND NOTAND OR of _ps and _pd,
126     //       but those do not seem needed at any optimization level.
127     return cast(__m256d)(cast(__m256i)a & cast(__m256i)b);
128 }
129 unittest
130 {
131     double a = 4.32;
132     double b = -78.99;
133     long correct = (*cast(long*)(&a)) & (*cast(long*)(&b));
134     __m256d A = _mm256_set_pd(a, b, a, b);
135     __m256d B = _mm256_set_pd(b, a, b, a);
136     long4 R = cast(long4)( _mm256_and_pd(A, B) );
137     assert(R.array[0] == correct);
138     assert(R.array[1] == correct);
139     assert(R.array[2] == correct);
140     assert(R.array[3] == correct);
141 }
142 
143 /// Compute the bitwise AND of packed single-precision (32-bit) floating-point elements in `a` and `b`.
144 __m256 _mm256_and_ps (__m256 a, __m256 b) pure @trusted
145 {
146     return cast(__m256)(cast(__m256i)a & cast(__m256i)b);
147 }
148 unittest
149 {
150     float a = 4.32f;
151     float b = -78.99f;
152     int correct = (*cast(int*)(&a)) & (*cast(int*)(&b));
153     __m256 A = _mm256_set_ps(a, b, a, b, a, b, a, b);
154     __m256 B = _mm256_set_ps(b, a, b, a, b, a, b, a);
155     int8 R = cast(int8)( _mm256_and_ps(A, B) );
156     foreach(i; 0..8)
157         assert(R.array[i] == correct);
158 }
159 
160 /// Compute the bitwise NOT of packed double-precision (64-bit) floating-point elements in `a`
161 /// and then AND with b.
162 __m256d _mm256_andnot_pd (__m256d a, __m256d b) pure @trusted
163 {
164     // PERF DMD
165     __m256i notA = _mm256_not_si256(cast(__m256i)a);
166     __m256i ib = cast(__m256i)b;
167     __m256i ab = notA & ib;
168     return cast(__m256d)ab;
169 }
170 unittest
171 {
172     double a = 4.32;
173     double b = -78.99;
174     long notA = ~ ( *cast(long*)(&a) );
175     long correct = notA & (*cast(long*)(&b));
176     __m256d A = _mm256_set_pd(a, a, a, a);
177     __m256d B = _mm256_set_pd(b, b, b, b);
178     long4 R = cast(long4)( _mm256_andnot_pd(A, B) );
179     foreach(i; 0..4)
180         assert(R.array[i] == correct);
181 }
182 
183 /// Compute the bitwise NOT of packed single-precision (32-bit) floating-point elements in `a`
184 /// and then AND with b.
185 __m256 _mm256_andnot_ps (__m256 a, __m256 b) pure @trusted
186 {
187     // PERF DMD
188     __m256i notA = _mm256_not_si256(cast(__m256i)a);
189     __m256i ib = cast(__m256i)b;
190     __m256i ab = notA & ib;
191     return cast(__m256)ab;
192 }
193 unittest
194 {
195     float a = 4.32f;
196     float b = -78.99f;
197     int notA = ~ ( *cast(int*)(&a) );
198     int correct = notA & (*cast(int*)(&b));
199     __m256 A = _mm256_set1_ps(a);
200     __m256 B = _mm256_set1_ps(b);
201     int8 R = cast(int8)( _mm256_andnot_ps(A, B) );
202     foreach(i; 0..8)
203         assert(R.array[i] == correct);
204 }
205 
206 /// Blend packed double-precision (64-bit) floating-point elements from `a` and `b` using control 
207 /// mask `imm8`.
208 __m256d _mm256_blend_pd(int imm8)(__m256d a, __m256d b)
209 {
210     static assert(imm8 >= 0 && imm8 < 16);
211 
212     // PERF DMD
213     static if (GDC_with_AVX)
214     {
215         return __builtin_ia32_blendpd256 (a, b, imm8);
216     }
217     else
218     {
219         // Works great with LDC.
220         double4 r;
221         for (int n = 0; n < 4; ++n)
222         {
223             r.ptr[n] = (imm8 & (1 << n)) ? b.array[n] : a.array[n];
224         }
225         return r;
226     }
227 }
228 unittest
229 {
230     __m256d A = _mm256_setr_pd(0, 1, 2, 3);
231     __m256d B = _mm256_setr_pd(8, 9, 10, 11);
232     double4 C = _mm256_blend_pd!0x06(A, B);
233     double[4] correct =    [0, 9, 10, 3];
234     assert(C.array == correct);
235 }
236 
237 /// Blend packed single-precision (32-bit) floating-point elements from `a` and `b` using control 
238 /// mask `imm8`.
239 __m256 _mm256_blend_ps(int imm8)(__m256 a, __m256 b) pure @trusted
240 {
241     static assert(imm8 >= 0 && imm8 < 256);
242     // PERF DMD
243     // PERF ARM64: not awesome with some constant values, up to 8/9 instructions
244     static if (GDC_with_AVX)
245     {
246         return __builtin_ia32_blendps256 (a, b, imm8);
247     }
248     else
249     {
250         // LDC x86: vblendps generated since LDC 1.27 -O1
251         float8 r;
252         for (int n = 0; n < 8; ++n)
253         {
254             r.ptr[n] = (imm8 & (1 << n)) ? b.array[n] : a.array[n];
255         }
256         return r;
257     }
258 }
259 unittest
260 {
261     __m256 A = _mm256_setr_ps(0, 1,  2,  3,  4,  5,  6,  7);
262     __m256 B = _mm256_setr_ps(8, 9, 10, 11, 12, 13, 14, 15);
263     float8 C = _mm256_blend_ps!0xe7(A, B);
264     float[8] correct =       [8, 9, 10,  3,  4, 13, 14, 15];
265     assert(C.array == correct);
266 }
267 
268 /// Blend packed double-precision (64-bit) floating-point elements from `a` and `b` using mask.
269 __m256d _mm256_blendv_pd (__m256d a, __m256d b, __m256d mask) @trusted
270 {
271     // PERF DMD
272     static if (GDC_with_AVX)
273     {
274         // Amazingly enough, GCC/GDC generates the vblendvpd instruction
275         // with -mavx2 but not -mavx.
276         // Not sure what is the reason, and there is a replacement sequence.
277         // PERF: Sounds like a bug, similar to _mm_blendv_pd
278         return __builtin_ia32_blendvpd256(a, b, mask);
279     }
280     else static if (LDC_with_AVX)
281     {
282         return __builtin_ia32_blendvpd256(a, b, mask);
283     }
284     else
285     {
286         // LDC x86: vblendvpd since LDC 1.27 -O2
287         //     arm64: only 4 instructions, since LDC 1.27 -O2
288         __m256d r;
289         long4 lmask = cast(long4)mask;
290         for (int n = 0; n < 4; ++n)
291         {
292             r.ptr[n] = (lmask.array[n] < 0) ? b.array[n] : a.array[n];
293         }
294         return r;
295     }
296 }
297 unittest
298 {
299     __m256d A = _mm256_setr_pd(1.0, 2.0, 3.0, 4.0);
300     __m256d B = _mm256_setr_pd(5.0, 6.0, 7.0, 8.0);
301     __m256d M = _mm256_setr_pd(-3.0, 2.0, 1.0, -4.0);
302     __m256d R = _mm256_blendv_pd(A, B, M);
303     double[4] correct1 = [5.0, 2.0, 3.0, 8.0];
304     assert(R.array == correct1); // Note: probably the same NaN-mask oddity exist on arm64+linux than with _mm_blendv_pd
305 }
306 
307 /// Blend packed single-precision (32-bit) floating-point elements from `a` and `b` 
308 /// using `mask`.
309 /// Blend packed single-precision (32-bit) floating-point elements from `a` and `b` 
310 /// using `mask`.
311 __m256 _mm256_blendv_ps (__m256 a, __m256 b, __m256 mask) @trusted
312 {
313     // PERF DMD
314     // PERF LDC/GDC without AVX could use two intrinsics for each part
315     static if (GDC_or_LDC_with_AVX)
316     {
317         return __builtin_ia32_blendvps256(a, b, mask);
318     }
319     else static if (LDC_with_ARM64)
320     {
321         int8 shift;
322         shift = 31;
323         int8 lmask = cast(int8)mask >> shift;     
324         int8 ia = cast(int8)a;   
325         int8 ib = cast(int8)b;
326         return cast(__m256)(ia ^ ((ia ^ ib) & lmask));
327     }
328     else
329     {
330         __m256 r = void; // PERF =void;
331         int8 lmask = cast(int8)mask;
332         for (int n = 0; n < 8; ++n)
333         {
334             r.ptr[n] = (lmask.array[n] < 0) ? b.array[n] : a.array[n];
335         }
336         return r;
337     }
338 }
339 unittest
340 {
341     __m256 A = _mm256_setr_ps(1.0f, 2.0f, 3.0f, 4.0f, 1.0f, 2.0f, 3.0f, 4.0f);
342     __m256 B = _mm256_setr_ps(5.0f, 6.0f, 7.0f, 8.0f, 5.0f, 6.0f, 7.0f, 8.0f);
343     __m256 M = _mm256_setr_ps(-3.0f, 2.0f, 1.0f, -4.0f, -3.0f, 2.0f, 1.0f, -4.0f);
344     __m256 R = _mm256_blendv_ps(A, B, M);
345     float[8] correct1 = [5.0f, 2.0f, 3.0f, 8.0f, 5.0f, 2.0f, 3.0f, 8.0f];
346     assert(R.array == correct1); // Note: probably the same NaN-mask oddity exist on arm64+linux than with _mm_blendv_pd
347 }
348 
349 /// Broadcast 128 bits from memory (composed of 2 packed double-precision (64-bit)
350 /// floating-point elements) to all elements.
351 /// This effectively duplicates the 128-bit vector.
352 __m256d _mm256_broadcast_pd (const(__m128d)* mem_addr) pure @trusted
353 {
354     // PERF DMD
355     static if (GDC_with_AVX)
356     {
357         return __builtin_ia32_vbroadcastf128_pd256(cast(float4*)mem_addr);
358     }
359     else
360     {
361         const(double)* p = cast(const(double)*) mem_addr;
362         __m256d r;
363         r.ptr[0] = p[0];
364         r.ptr[1] = p[1];
365         r.ptr[2] = p[0];
366         r.ptr[3] = p[1];
367         return r;
368     }
369 }
370 unittest
371 {
372     __m128d A = _mm_setr_pd(3, -4);
373     __m256d B = _mm256_broadcast_pd(&A);
374     double[4] correct = [3, -4, 3, -4];
375     assert(B.array == correct);
376 }
377 
378 /// Broadcast 128 bits from memory (composed of 4 packed single-precision (32-bit) 
379 /// floating-point elements) to all elements.
380 /// This effectively duplicates the 128-bit vector.
381 __m256 _mm256_broadcast_ps (const(__m128)* mem_addr) pure @trusted
382 {
383     // PERF DMD
384     static if (GDC_with_AVX)
385     {
386         return __builtin_ia32_vbroadcastf128_ps256(cast(float4*)mem_addr);
387     }   
388     else
389     {
390         const(float)* p = cast(const(float)*)mem_addr;
391         __m256 r;
392         r.ptr[0] = p[0];
393         r.ptr[1] = p[1];
394         r.ptr[2] = p[2];
395         r.ptr[3] = p[3];
396         r.ptr[4] = p[0];
397         r.ptr[5] = p[1];
398         r.ptr[6] = p[2];
399         r.ptr[7] = p[3];
400         return r;
401     }
402 }
403 unittest
404 {
405     __m128 A = _mm_setr_ps(1, 2, 3, -4);
406     __m256 B = _mm256_broadcast_ps(&A);
407     float[8] correct = [1.0f, 2, 3, -4, 1, 2, 3, -4];
408     assert(B.array == correct);
409 }
410 
411 /// Broadcast a single-precision (32-bit) floating-point element from memory to all elements.
412 __m256d _mm256_broadcast_sd (const(double)* mem_addr) pure @trusted
413 {
414     static if (GDC_with_AVX)
415     {
416         return __builtin_ia32_vbroadcastsd256(mem_addr);
417     }
418     else
419     {
420         double a = *mem_addr;
421         __m256d r;
422         r.ptr[0] = a;
423         r.ptr[1] = a;
424         r.ptr[2] = a;
425         r.ptr[3] = a;
426         return r;
427     }
428 }
429 unittest
430 {
431     double t = 7.5f;
432     __m256d A = _mm256_broadcast_sd(&t);
433     double[4] correct = [7.5, 7.5, 7.5, 7.5];
434     assert(A.array == correct);
435 }
436 
437 /// Broadcast a single-precision (32-bit) floating-point element from memory to all elements.
438 __m128 _mm_broadcast_ss (const(float)* mem_addr) pure @trusted
439 {
440     // PERF: DMD
441     static if (GDC_with_AVX)
442     {
443         return __builtin_ia32_vbroadcastss(mem_addr);
444     }
445     else
446     {
447         float a = *mem_addr;
448         __m128 r;
449         r.ptr[0] = a;
450         r.ptr[1] = a;
451         r.ptr[2] = a;
452         r.ptr[3] = a;
453         return r;
454     }
455 }
456 unittest
457 {
458     float t = 7.5f;
459     __m128 A = _mm_broadcast_ss(&t);
460     float[4] correct = [7.5f, 7.5f, 7.5f, 7.5f];
461     assert(A.array == correct);
462 }
463 
464 __m256 _mm256_broadcast_ss (const(float)* mem_addr)
465 {
466     // PERF: DMD
467     static if (GDC_with_AVX)
468     {
469         return __builtin_ia32_vbroadcastss256 (mem_addr);
470     }
471     else
472     {
473         float a = *mem_addr;
474         __m256 r = __m256(a);
475         return r;
476     }
477 }
478 unittest
479 {
480     float t = 7.5f;
481     __m256 A = _mm256_broadcast_ss(&t);
482     float[8] correct = [7.5f, 7.5f, 7.5f, 7.5f, 7.5f, 7.5f, 7.5f, 7.5f];
483     assert(A.array == correct);
484 }
485 
486 /// Cast vector of type `__m256d` to type `__m256`.
487 __m256 _mm256_castpd_ps (__m256d a) pure @safe
488 {
489     return cast(__m256)a;
490 }
491 
492 /// Cast vector of type `__m256d` to type `__m256i`.
493 __m256i _mm256_castpd_si256 (__m256d a) pure @safe
494 {
495     return cast(__m256i)a;
496 }
497 
498 /// Cast vector of type `__m128d` to type `__m256d`; the upper 128 bits of the result are undefined.
499 __m256d _mm256_castpd128_pd256 (__m128d a) pure @trusted
500 {
501     static if (GDC_with_AVX)
502     {
503         return __builtin_ia32_pd256_pd(a);
504     }
505     else
506     {
507         __m256d r = void;
508         r.ptr[0] = a.array[0];
509         r.ptr[1] = a.array[1];
510         return r;
511     }
512 }
513 unittest
514 {
515     __m128d A = _mm_setr_pd(4.0, -6.125);
516     __m256d B = _mm256_castpd128_pd256(A);
517     assert(B.array[0] == 4.0);
518     assert(B.array[1] == -6.125);
519 }
520 
521 /// Cast vector of type `__m256d` to type `__m128d`; the upper 128 bits of `a` are lost.
522 __m128d _mm256_castpd256_pd128 (__m256d a) pure @trusted
523 {
524     static if (GDC_with_AVX)
525     {
526         return __builtin_ia32_pd_pd256(a);
527     }
528     else
529     {
530         __m128d r;
531         r.ptr[0] = a.array[0];
532         r.ptr[1] = a.array[1];
533         return r;
534     }
535 }
536 unittest
537 {
538     __m256d A = _mm256_set_pd(1, 2, -6.25, 4.0);
539     __m128d B = _mm256_castpd256_pd128(A);
540     assert(B.array[0] == 4.0);
541     assert(B.array[1] == -6.25);
542 }
543 
544 /// Cast vector of type `__m256` to type `__m256d`.
545 __m256d _mm256_castps_pd (__m256 a) pure @safe
546 {
547     return cast(__m256d)a;
548 }
549 
550 /// Cast vector of type `__m256` to type `__m256i`.
551 __m256i _mm256_castps_si256 (__m256 a) pure @safe
552 {
553     return cast(__m256i)a;
554 }
555 
556 /// Cast vector of type `__m128` to type `__m256`; the upper 128 bits of the result are undefined.
557 __m256 _mm256_castps128_ps256 (__m128 a) pure @trusted
558 {
559     static if (GDC_with_AVX)
560     {
561         return __builtin_ia32_ps256_ps(a);
562     }
563     else
564     {
565         __m256 r = void;
566         r.ptr[0] = a.array[0];
567         r.ptr[1] = a.array[1];
568         r.ptr[2] = a.array[2];
569         r.ptr[3] = a.array[3];
570         return r;
571     }
572 }
573 unittest
574 {
575     __m128 A = _mm_setr_ps(1.0f, 2, 3, 4);
576     __m256 B = _mm256_castps128_ps256(A);
577     float[4] correct = [1.0f, 2, 3, 4];
578     assert(B.array[0..4] == correct);
579 }
580 
581 /// Cast vector of type `__m256` to type `__m128`. The upper 128-bit of `a` are lost.
582 __m128 _mm256_castps256_ps128 (__m256 a) pure @trusted
583 {
584     return *cast(const(__m128)*)(&a);
585 }
586 unittest
587 {
588     __m256 A = _mm256_setr_ps(1.0f, 2, 3, 4, 5, 6, 7, 8);
589     __m128 B = _mm256_castps256_ps128(A);
590     float[4] correct = [1.0f, 2, 3, 4];
591     assert(B.array == correct);
592 }
593 
594 /// Cast vector of type `__m128i` to type `__m256i`; the upper 128 bits of the result are undefined.
595 __m256i _mm256_castsi128_si256 (__m128i a) pure @trusted
596 {
597     long2 la = cast(long2)a;
598     long4 r = void;
599     r.ptr[0] = la.array[0];
600     r.ptr[1] = la.array[1];
601     return r;
602 }
603 unittest
604 {
605     __m128i A = _mm_setr_epi64(-1, 42);
606     __m256i B = _mm256_castsi128_si256(A);
607     long[2] correct = [-1, 42];
608     assert(B.array[0..2] == correct);
609 }
610 
611 /// Cast vector of type `__m256i` to type `__m256d`.
612 __m256d _mm256_castsi256_pd (__m256i a) pure @safe
613 {
614     return cast(__m256d)a;
615 }
616 
617 /// Cast vector of type `__m256i` to type `__m256`.
618 __m256 _mm256_castsi256_ps (__m256i a) pure @safe
619 {
620     return cast(__m256)a;
621 }
622 
623 /// Cast vector of type `__m256i` to type `__m128i`. The upper 128-bit of `a` are lost.
624 __m128i _mm256_castsi256_si128 (__m256i a) pure @trusted
625 {
626     long2 r = void;
627     r.ptr[0] = a.array[0];
628     r.ptr[1] = a.array[1];
629     return cast(__m128i)r;
630 }
631 unittest
632 {
633     long4 A;
634     A.ptr[0] = -1;
635     A.ptr[1] = 42;
636     long2 B = cast(long2)(_mm256_castsi256_si128(A));
637     long[2] correct = [-1, 42];
638     assert(B.array[0..2] == correct);
639 }
640 
641 
642 // TODO __m256d _mm256_ceil_pd (__m256d a)
643 // TODO __m256 _mm256_ceil_ps (__m256 a)
644 
645 // TODO __m128d _mm_cmp_pd (__m128d a, __m128d b, const int imm8)
646 // TODO __m256d _mm256_cmp_pd (__m256d a, __m256d b, const int imm8)
647 // TODO __m128 _mm_cmp_ps (__m128 a, __m128 b, const int imm8)
648 // TODO __m256 _mm256_cmp_ps (__m256 a, __m256 b, const int imm8)
649 // TODO __m128d _mm_cmp_sd (__m128d a, __m128d b, const int imm8)
650 // TODO __m128 _mm_cmp_ss (__m128 a, __m128 b, const int imm8)
651 
652 /// Convert packed signed 32-bit integers in a to packed double-precision (64-bit) floating-point 
653 /// elements.
654 __m256d _mm256_cvtepi32_pd (__m128i a) pure @trusted
655 {
656     version(LDC)
657     {
658         enum ir = `
659             %r = sitofp <4 x i32> %0 to <4 x double>
660             ret <4 x double> %r`;
661         return LDCInlineIR!(ir, double4, __m128i)(a);
662     }
663     else static if (GDC_with_AVX)
664     {
665         return __builtin_ia32_cvtdq2pd256(a);
666     }
667     else
668     {
669         double4 r;
670         r.ptr[0] = a.array[0];
671         r.ptr[1] = a.array[1];
672         r.ptr[2] = a.array[2];
673         r.ptr[3] = a.array[3];
674         return r;
675     }
676 }
677 unittest
678 {
679     __m256d R = _mm256_cvtepi32_pd(_mm_set1_epi32(54));
680     double[4] correct = [54.0, 54, 54, 54];
681     assert(R.array == correct);
682 }
683 
684 /// Convert packed signed 32-bit integers in `a` to packed single-precision (32-bit) floating-point 
685 /// elements.
686 __m256 _mm256_cvtepi32_ps (__m256i a) pure @trusted
687 {
688     version(LDC)
689     {
690         enum ir = `
691             %r = sitofp <8 x i32> %0 to <8 x float>
692             ret <8 x float> %r`;
693         return LDCInlineIR!(ir, float8, int8)(cast(int8)a);
694     }
695     else static if (GDC_with_AVX)
696     {
697         return __builtin_ia32_cvtdq2ps256(cast(int8)a);
698     }
699     else
700     {
701         int8 ia = cast(int8)a;
702         __m256 r;
703         r.ptr[0] = ia.array[0];
704         r.ptr[1] = ia.array[1];
705         r.ptr[2] = ia.array[2];
706         r.ptr[3] = ia.array[3];
707         r.ptr[4] = ia.array[4];
708         r.ptr[5] = ia.array[5];
709         r.ptr[6] = ia.array[6];
710         r.ptr[7] = ia.array[7];
711         return r;
712     }
713 }
714 unittest
715 {
716     __m256 R = _mm256_cvtepi32_ps(_mm256_set1_epi32(5));
717     float[8] correct = [5.0f, 5, 5, 5, 5, 5, 5, 5];
718     assert(R.array == correct);
719 }
720 
721 // TODO __m128i _mm256_cvtpd_epi32 (__m256d a)
722 
723 
724 /// Convert packed double-precision (64-bit) floating-point elements in `a` to packed single-precision (32-bit) 
725 /// floating-point elements.
726 __m128 _mm256_cvtpd_ps (__m256d a) pure @trusted
727 {
728     // PERF DMD
729     static if (GDC_or_LDC_with_AVX)
730     {
731         return __builtin_ia32_cvtpd2ps256(a);
732     }
733     else
734     {
735         __m128 r;
736         r.ptr[0] = a.array[0];
737         r.ptr[1] = a.array[1];
738         r.ptr[2] = a.array[2];
739         r.ptr[3] = a.array[3];
740         return r;
741     }
742 }
743 unittest
744 {
745     __m256d A = _mm256_setr_pd(1.0, 2, 3, 5);
746     __m128 R = _mm256_cvtpd_ps(A);
747     float[4] correct = [1.0f, 2, 3, 5];
748     assert(R.array == correct);
749 }
750 
751 
752 // TODO __m256i _mm256_cvtps_epi32 (__m256 a)
753 
754 /// Convert packed single-precision (32-bit) floating-point elements in `a`` to packed double-precision 
755 /// (64-bit) floating-point elements.
756 __m256d _mm256_cvtps_pd (__m128 a) pure @trusted
757 {   
758     // PERF DMD
759     static if (GDC_with_AVX)
760     {
761         return __builtin_ia32_cvtps2pd256(a); // LDC doesn't have the builtin
762     }
763     else
764     {
765         // LDC: x86, needs -O2 to generate cvtps2pd since LDC 1.2.0
766         __m256d r;
767         r.ptr[0] = a.array[0];
768         r.ptr[1] = a.array[1];
769         r.ptr[2] = a.array[2];
770         r.ptr[3] = a.array[3];
771         return r;
772     }
773 }
774 unittest
775 {
776     __m128 A = _mm_setr_ps(1.0f, 2, 3, 5);
777     __m256d R = _mm256_cvtps_pd(A);
778     double[4] correct = [1.0, 2, 3, 5];
779     assert(R.array == correct);
780 }
781 
782 /// Return the lower double-precision (64-bit) floating-point element of `a`.
783 double _mm256_cvtsd_f64 (__m256d a) pure @safe
784 {
785     return a.array[0];
786 }
787 
788 /// Return the lower 32-bit integer in `a`.
789 int _mm256_cvtsi256_si32 (__m256i a) pure @safe
790 {
791     return (cast(int8)a).array[0];
792 }
793 
794 /// Return the lower single-precision (32-bit) floating-point element of `a`.
795 float _mm256_cvtss_f32 (__m256 a) pure @safe
796 {
797     return a.array[0];
798 }
799 
800 /// Convert packed double-precision (64-bit) floating-point elements in `a` to packed 32-bit 
801 /// integers with truncation.
802 __m128i _mm256_cvttpd_epi32 (__m256d a) pure @trusted
803 {
804     // PERF DMD
805     static if (GDC_or_LDC_with_AVX)
806     {
807         return cast(__m128i)__builtin_ia32_cvttpd2dq256(a);
808     }
809     else
810     {
811         __m128i r;
812         r.ptr[0] = cast(int)a.array[0];
813         r.ptr[1] = cast(int)a.array[1];
814         r.ptr[2] = cast(int)a.array[2];
815         r.ptr[3] = cast(int)a.array[3];
816         return r;
817     }
818 }
819 unittest
820 {
821     __m256d A = _mm256_set_pd(4.7, -1000.9, -7.1, 3.1);
822     __m128i R = _mm256_cvttpd_epi32(A);
823     int[4] correct = [3, -7, -1000, 4];
824     assert(R.array == correct);
825 }
826 
827 /// Convert packed single-precision (32-bit) floating-point elements in `a`.
828 __m256i _mm256_cvttps_epi32 (__m256 a) pure @trusted
829 {
830     // PERF DMD
831     static if (GDC_or_LDC_with_AVX)
832     {
833         return cast(__m256i)__builtin_ia32_cvttps2dq256(a);
834     }
835     else
836     {
837         int8 r;
838         r.ptr[0] = cast(int)a.array[0];
839         r.ptr[1] = cast(int)a.array[1];
840         r.ptr[2] = cast(int)a.array[2];
841         r.ptr[3] = cast(int)a.array[3];
842         r.ptr[4] = cast(int)a.array[4];
843         r.ptr[5] = cast(int)a.array[5];
844         r.ptr[6] = cast(int)a.array[6];
845         r.ptr[7] = cast(int)a.array[7];
846         return cast(__m256i)r;
847     }
848 }
849 unittest
850 {
851     __m256 A = _mm256_set_ps(4.7, -1000.9, -7.1, 3.1, 1.4, 2.9, -2.9, 0);
852     int8 R = cast(int8) _mm256_cvttps_epi32(A);
853     int[8] correct = [0, -2, 2, 1, 3, -7, -1000, 4];
854     assert(R.array == correct);
855 }
856 
857 /// Divide packed double-precision (64-bit) floating-point elements in `a` by packed elements in `b`.
858 __m256d _mm256_div_pd (__m256d a, __m256d b) pure @safe
859 {
860     return a / b;
861 }
862 unittest
863 {
864     __m256d a = [1.5, -2.0, 3.0, 1.0];
865     a = _mm256_div_pd(a, a);
866     double[4] correct = [1.0, 1.0, 1.0, 1.0];
867     assert(a.array == correct);
868 }
869 
870 /// Divide packed single-precision (32-bit) floating-point elements in `a` by packed elements in `b`.
871 __m256 _mm256_div_ps (__m256 a, __m256 b) pure @safe
872 {
873     return a / b;
874 }
875 unittest
876 {
877     __m256 a = [1.5f, -2.0f, 3.0f, 1.0f, 4.5f, -5.0f, 6.0f, 7.0f];
878     a = _mm256_div_ps(a, a);
879     float[8] correct = [1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f];
880     assert(a.array == correct);
881 }
882 
883 /// Conditionally multiply the packed single-precision (32-bit) floating-point elements in `a` and 
884 /// `b` using the high 4 bits in `imm8`, sum the four products, and conditionally store the sum 
885 /// using the low 4 bits of `imm8`.
886 __m256 _mm256_dp_ps(int imm8)(__m256 a, __m256 b)
887 {
888     // PERF DMD
889     // PERF without AVX, can use 2 _mm_dp_ps exactly (beware the imm8 is tricky)
890     static if (GDC_or_LDC_with_AVX)
891     {
892         return __builtin_ia32_dpps256(a, b, cast(ubyte)imm8);
893     }
894     else
895     {
896         __m256 zero = _mm256_setzero_ps();
897         enum ubyte op = (imm8 >>> 4) & 15;
898         __m256 temp = _mm256_blend_ps!( op | (op << 4) )(zero, a * b);
899         float lo = temp.array[0] + temp.array[1] + temp.array[2] + temp.array[3];
900         float hi = temp.array[4] + temp.array[5] + temp.array[6] + temp.array[7];
901         __m256 r = _mm256_set_m128(_mm_set1_ps(hi), _mm_set1_ps(lo));
902         enum ubyte op2 = (imm8 & 15);
903         return _mm256_blend_ps!(op2 | (op2 << 4))(zero, r);
904     }
905 }
906 unittest
907 {
908     // Products:                 9    14    20   24     6    16    12   -24
909     __m256 A = _mm256_setr_ps(1.0f, 2.0f, 4.0f, 8.0f, 1.0f, 2.0f, 4.0f, 8.0f);
910     __m256 B = _mm256_setr_ps(9.0f, 7.0f, 5.0f, 3.0f, 6.0f, 8.0f, 3.0f,-3.0f);
911     float8 R1 = _mm256_dp_ps!(0xf0 + 0xf)(A, B);
912     float8 R2 = _mm256_dp_ps!(0x30 + 0x5)(A, B);
913     float8 R3 = _mm256_dp_ps!(0x50 + 0xa)(A, B);
914     float[8] correct1 =   [67.0f, 67.0f, 67.0f,67.0f,  10,   10,   10,  10];
915     float[8] correct2 =   [23.0f, 0.0f, 23.0f,  0.0f,  22,    0,   22,   0];
916     float[8] correct3 =   [0.0f, 29.0f, 0.0f,  29.0f,   0,   18,    0,  18];
917     assert(R1.array == correct1);
918     assert(R2.array == correct2);
919     assert(R3.array == correct3);
920 }
921 
922 /// Extract a 32-bit integer from `a`, selected with `imm8`.
923 int _mm256_extract_epi32 (__m256i a, const int imm8) pure @trusted
924 {
925     return (cast(int8)a).array[imm8 & 7];
926 }
927 unittest
928 {
929     align(16) int[8] data = [-1, 2, -3, 4, 9, -7, 8, -6];
930     auto A = _mm256_loadu_si256(cast(__m256i*) data.ptr);
931     assert(_mm256_extract_epi32(A, 0) == -1);
932     assert(_mm256_extract_epi32(A, 1 + 8) == 2);
933     assert(_mm256_extract_epi32(A, 3 + 16) == 4);
934     assert(_mm256_extract_epi32(A, 7 + 32) == -6);
935 }
936 
937 /// Extract a 64-bit integer from `a`, selected with `index`.
938 long _mm256_extract_epi64 (__m256i a, const int index) pure @safe
939 {
940     return a.array[index & 3];
941 }
942 unittest
943 {
944     __m256i A = _mm256_setr_epi64x(-7, 6, 42, 0);
945     assert(_mm256_extract_epi64(A, -8) == -7);
946     assert(_mm256_extract_epi64(A, 1) == 6);
947     assert(_mm256_extract_epi64(A, 2 + 4) == 42);
948 }
949 
950 /// Extract a 128-bits lane from `a`, selected with `index` (0 or 1).
951 __m128d _mm256_extractf128_pd(ubyte imm8)(__m256d a) pure @trusted
952 {
953     // PERF DMD D_SIMD
954     static if (GDC_with_AVX)
955     {
956         // Note: needs to be a template intrinsics because of this builtin.
957         return __builtin_ia32_vextractf128_pd256(a, imm8 & 1);
958     }
959     else
960     {
961         double2 r = void;
962         enum int index = 2*(imm8 & 1);
963         r.ptr[0] = a.array[index+0];
964         r.ptr[1] = a.array[index+1];
965         return r;
966     }
967 }
968 unittest
969 {
970     __m256d A = _mm256_setr_pd(1.0, 2, 3, 4);
971     double[4] correct = [1.0, 2, 3, 4];
972     __m128d l0 = _mm256_extractf128_pd!18(A);
973     __m128d l1 = _mm256_extractf128_pd!55(A);
974     assert(l0.array == correct[0..2]);
975     assert(l1.array == correct[2..4]);
976 }
977 
978 ///ditto
979 __m128 _mm256_extractf128_ps(ubyte imm8)(__m256 a) pure @trusted
980 {
981     // PERF DMD D_SIMD
982     static if (GDC_with_AVX)
983     {
984         return __builtin_ia32_vextractf128_ps256(a, imm8 & 1);
985     }
986     else
987     {
988         float4 r = void; // Optimize well since LDC 1.1 -O1
989         enum int index = 4*(imm8 & 1);
990         r.ptr[0] = a.array[index+0];
991         r.ptr[1] = a.array[index+1];
992         r.ptr[2] = a.array[index+2];
993         r.ptr[3] = a.array[index+3];
994         return r;
995     }
996 }
997 unittest
998 {
999     __m256 A = _mm256_setr_ps(1.0, 2, 3, 4, 5, 6, 7, 8);
1000     float[8] correct = [1.0, 2, 3, 4, 5, 6, 7, 8];
1001     __m128 l0 = _mm256_extractf128_ps!8(A);
1002     __m128 l1 = _mm256_extractf128_ps!255(A);
1003     assert(l0.array == correct[0..4]);
1004     assert(l1.array == correct[4..8]);
1005 }
1006 
1007 ///ditto
1008 __m128i _mm256_extractf128_si256(ubyte imm8)(__m256i a) pure @trusted
1009 {
1010     // PERF DMD D_SIMD
1011     static if (GDC_with_AVX)
1012     {
1013         // Note: if it weren't for this GDC intrinsic, _mm256_extractf128_si256
1014         // could be a non-template, however, this wins in -O0.
1015         // Same story for _mm256_extractf128_ps and _mm256_extractf128_pd
1016         return __builtin_ia32_vextractf128_si256(cast(int8)a, imm8 & 1);
1017     }
1018     else
1019     {
1020         long2 r = void;
1021         enum int index = 2*(imm8 & 1);
1022         r.ptr[0] = a.array[index+0];
1023         r.ptr[1] = a.array[index+1];
1024         return cast(__m128i)r;
1025     }
1026 }
1027 unittest
1028 {
1029     __m256i A = _mm256_setr_epi32(9, 2, 3, 4, 5, 6, 7, 8);
1030     int[8] correct = [9, 2, 3, 4, 5, 6, 7, 8];
1031     __m128i l0 = _mm256_extractf128_si256!0(A);
1032     __m128i l1 = _mm256_extractf128_si256!1(A);
1033     assert(l0.array == correct[0..4]);
1034     assert(l1.array == correct[4..8]);
1035 }
1036 
1037 // TODO __m256d _mm256_floor_pd (__m256d a)
1038 // TODO __m256 _mm256_floor_ps (__m256 a)
1039 
1040 /// Horizontally add adjacent pairs of double-precision (64-bit) floating-point elements in `a` 
1041 /// and `b`. 
1042 __m256d _mm256_hadd_pd (__m256d a, __m256d b) pure @trusted
1043 {
1044     static if (GDC_or_LDC_with_AVX)
1045     {
1046         return __builtin_ia32_haddpd256(a, b);
1047     }
1048     else
1049     {
1050         __m256d res;
1051         res.ptr[0] = a.array[1] + a.array[0];
1052         res.ptr[1] = b.array[1] + b.array[0];
1053         res.ptr[2] = a.array[3] + a.array[2];
1054         res.ptr[3] = b.array[3] + b.array[2];
1055         return res;
1056     }
1057 }
1058 unittest
1059 {
1060     __m256d A =_mm256_setr_pd(1.5, 2.0, 21.0, 9.0);
1061     __m256d B =_mm256_setr_pd(1.0, 7.0, 100.0, 14.0);
1062     __m256d C = _mm256_hadd_pd(A, B);
1063     double[4] correct =      [3.5, 8.0, 30.0, 114.0];
1064     assert(C.array == correct);
1065 }
1066 
1067 /// Horizontally add adjacent pairs of single-precision (32-bit) floating-point elements in `a` and
1068 /// `b`.
1069 __m256 _mm256_hadd_ps (__m256 a, __m256 b) pure @trusted
1070 {
1071     // PERD DMD
1072     static if (GDC_or_LDC_with_AVX)
1073     {
1074         return __builtin_ia32_haddps256(a, b);
1075     }
1076     else static if (LDC_with_ARM64)
1077     {
1078         __m128 a_hi = _mm256_extractf128_ps!1(a);
1079         __m128 a_lo = _mm256_extractf128_ps!0(a);
1080         __m128 b_hi = _mm256_extractf128_ps!1(b);
1081         __m128 b_lo = _mm256_extractf128_ps!0(b);
1082         __m128 hi = vpaddq_f32(a_hi, b_hi);
1083         __m128 lo = vpaddq_f32(a_lo, b_lo);
1084         return _mm256_set_m128(hi, lo);
1085     }
1086     else
1087     {    
1088         __m256 res;
1089         res.ptr[0] = a.array[1] + a.array[0];
1090         res.ptr[1] = a.array[3] + a.array[2];
1091         res.ptr[2] = b.array[1] + b.array[0];
1092         res.ptr[3] = b.array[3] + b.array[2];
1093         res.ptr[4] = a.array[5] + a.array[4];
1094         res.ptr[5] = a.array[7] + a.array[6];
1095         res.ptr[6] = b.array[5] + b.array[4];
1096         res.ptr[7] = b.array[7] + b.array[6];
1097         return res;
1098     }
1099 }
1100 unittest
1101 {
1102     __m256 A =_mm256_setr_ps(1.0f, 2.0f, 3.0f, 5.0f, 1.0f, 2.0f, 3.0f, 5.0f);
1103     __m256 B =_mm256_setr_ps(1.5f, 2.0f, 3.5f, 4.0f, 1.5f, 2.0f, 3.5f, 5.0f);
1104     __m256 R = _mm256_hadd_ps(A, B);
1105     float[8] correct =      [3.0f, 8.0f, 3.5f, 7.5f, 3.0f, 8.0f, 3.5f, 8.5f];
1106     assert(R.array == correct);
1107 }
1108 
1109 /// Horizontally subtract adjacent pairs of double-precision (64-bit) floating-point elements in
1110 /// `a` and `b`. 
1111 __m256d _mm256_hsub_pd (__m256d a, __m256d b) pure @trusted
1112 {
1113     static if (GDC_or_LDC_with_AVX)
1114     {
1115         return __builtin_ia32_hsubpd256(a, b);
1116     }
1117     else 
1118     {
1119         // 2 zip1, 2 zip2, 2 fsub... I don't think there is better in arm64
1120         __m256d res;
1121         res.ptr[0] = a.array[0] - a.array[1];
1122         res.ptr[1] = b.array[0] - b.array[1];
1123         res.ptr[2] = a.array[2] - a.array[3];
1124         res.ptr[3] = b.array[2] - b.array[3];
1125         return res;
1126     }
1127 }
1128 unittest
1129 {
1130     __m256d A =_mm256_setr_pd(1.5, 2.0, 21.0, 9.0);
1131     __m256d B =_mm256_setr_pd(1.0, 7.0, 100.0, 14.0);
1132     __m256d C = _mm256_hsub_pd(A, B);
1133     double[4] correct =      [-0.5, -6.0, 12.0, 86.0];
1134     assert(C.array == correct);
1135 }
1136 
1137 __m256 _mm256_hsub_ps (__m256 a, __m256 b) pure @trusted
1138 {
1139     // PERD DMD
1140     static if (GDC_or_LDC_with_AVX)
1141     {
1142         return __builtin_ia32_hsubps256(a, b);
1143     }
1144     else
1145     {
1146         __m128 a_hi = _mm256_extractf128_ps!1(a);
1147         __m128 a_lo = _mm256_extractf128_ps!0(a);
1148         __m128 b_hi = _mm256_extractf128_ps!1(b);
1149         __m128 b_lo = _mm256_extractf128_ps!0(b);
1150         __m128 hi = _mm_hsub_ps(a_hi, b_hi);
1151         __m128 lo = _mm_hsub_ps(a_lo, b_lo);
1152         return _mm256_set_m128(hi, lo);
1153     }
1154 }
1155 unittest
1156 {
1157     __m256 A =_mm256_setr_ps(1.0f, 2.0f, 3.0f, 5.0f, 1.0f, 2.0f, 3.0f, 5.0f);
1158     __m256 B =_mm256_setr_ps(1.5f, 2.0f, 3.5f, 4.0f, 1.5f, 2.0f, 3.5f, 5.0f);
1159     __m256 R = _mm256_hsub_ps(A, B);
1160     float[8] correct =   [-1.0f, -2.0f, -0.5f, -0.5f, -1.0f, -2.0f, -0.5f, -1.5f];
1161     assert(R.array == correct);
1162 }
1163 
1164 // TODO __m256i _mm256_insert_epi16 (__m256i a, __int16 i, const int index)
1165 // TODO __m256i _mm256_insert_epi32 (__m256i a, __int32 i, const int index)
1166 // TODO __m256i _mm256_insert_epi64 (__m256i a, __int64 i, const int index)
1167 // TODO __m256i _mm256_insert_epi8 (__m256i a, __int8 i, const int index)
1168 
1169 
1170 /// Copy `a`, then insert 128 bits (composed of 2 packed double-precision (64-bit) 
1171 /// floating-point elements) from `b` at the location specified by `imm8`.
1172 __m256d _mm256_insertf128_pd(int imm8)(__m256d a, __m128d b) pure @trusted
1173 {
1174     static if (GDC_with_AVX)
1175     {
1176         enum ubyte lane = imm8 & 1;
1177         return __builtin_ia32_vinsertf128_pd256(a, b, lane);
1178     }
1179     else
1180     {
1181         __m256d r = a;
1182         enum int index = (imm8 & 1) ? 2 : 0;
1183         r.ptr[index] = b.array[0];
1184         r.ptr[index+1] = b.array[1];
1185         return r;
1186     }
1187 }
1188 
1189 /// Copy `a` then insert 128 bits (composed of 4 packed single-precision (32-bit) floating-point
1190 /// elements) from `b`, at the location specified by `imm8`.
1191 __m256 _mm256_insertf128_ps(int imm8)(__m256 a, __m128 b) pure @trusted
1192 {
1193     static if (GDC_with_AVX)
1194     {
1195         enum ubyte lane = imm8 & 1;
1196         return __builtin_ia32_vinsertf128_ps256(a, b, lane);
1197     }
1198     else
1199     {
1200         __m256 r = a;
1201         enum int index = (imm8 & 1) ? 4 : 0;
1202         r.ptr[index] = b.array[0];
1203         r.ptr[index+1] = b.array[1];
1204         r.ptr[index+2] = b.array[2];
1205         r.ptr[index+3] = b.array[3];
1206         return r;
1207     }
1208 }
1209 
1210 /// Copy `a`, then insert 128 bits from `b` at the location specified by `imm8`.
1211 __m256i _mm256_insertf128_si256(int imm8)(__m256i a, __m128i b) pure @trusted
1212 {
1213     static if (GDC_with_AVX)
1214     {
1215         enum ubyte lane = imm8 & 1;
1216         return cast(__m256i) __builtin_ia32_vinsertf128_si256 (cast(int8)a, b, lane);
1217     }
1218     else
1219     {
1220         long2 lb = cast(long2)b;
1221         __m256i r = a;
1222         enum int index = (imm8 & 1) ? 2 : 0;
1223         r.ptr[index] = lb.array[0];
1224         r.ptr[index+1] = lb.array[1];
1225         return r;
1226     }
1227 }
1228 
1229 /// Load 256-bits of integer data from unaligned memory into dst. 
1230 /// This intrinsic may perform better than `_mm256_loadu_si256` when the data crosses a cache 
1231 /// line boundary.
1232 __m256i _mm256_lddqu_si256(const(__m256i)* mem_addr) @trusted
1233 {
1234     // PERF DMD D_SIMD
1235     static if (GDC_or_LDC_with_AVX)
1236     {
1237         return cast(__m256i) __builtin_ia32_lddqu256(cast(const(char)*)mem_addr);
1238     }
1239     else
1240         return _mm256_loadu_si256(mem_addr);
1241 }
1242 unittest
1243 {
1244     int[10] correct = [0, -1, 2, -3, 4, 9, -7, 8, -6, 34];
1245     int8 A = cast(int8) _mm256_lddqu_si256(cast(__m256i*) &correct[1]);
1246     assert(A.array == correct[1..9]);
1247 }
1248 
1249 /// Load 256-bits (composed of 4 packed double-precision (64-bit) floating-point elements) 
1250 /// from memory. `mem_addr` must be aligned on a 32-byte boundary or a general-protection 
1251 /// exception may be generated.
1252 __m256d _mm256_load_pd (const(double)* mem_addr) pure @trusted
1253 {
1254     return *cast(__m256d*)mem_addr;
1255 }
1256 unittest
1257 {
1258     static immutable align(32) double[4] correct = [1.0, 2.0, 3.5, -42.0];
1259     __m256d A = _mm256_load_pd(correct.ptr);
1260     assert(A.array == correct);
1261 }
1262 
1263 /// Load 256-bits (composed of 8 packed single-precision (32-bit) 
1264 /// floating-point elements) from memory. 
1265 /// `mem_addr` must be aligned on a 32-byte boundary or a 
1266 /// general-protection exception may be generated.
1267 __m256 _mm256_load_ps (const(float)* mem_addr) pure @trusted
1268 {
1269     return *cast(__m256*)mem_addr;
1270 }
1271 unittest
1272 {
1273     static immutable align(32) float[8] correct = 
1274         [1.0, 2.0, 3.5, -42.0, 7.43f, 0.0f, 3, 2];
1275     __m256 A = _mm256_load_ps(correct.ptr);
1276     assert(A.array == correct);
1277 }
1278 
1279 /// Load 256-bits of integer data from memory. `mem_addr` does not need to be aligned on
1280 /// any particular boundary.
1281 // See this dlang forum post => https://forum.dlang.org/thread/vymrsngsfibkmqsqffce@forum.dlang.org
1282 __m256i _mm256_loadu_si256 (const(__m256i)* mem_addr) pure @trusted // TODO: signature
1283 {
1284     // PERF DMD
1285     static if (GDC_with_AVX)
1286     {
1287         return cast(__m256i) __builtin_ia32_loaddqu256(cast(const(char)*) mem_addr);
1288     }
1289     else version(LDC)
1290     {
1291         return loadUnaligned!(__m256i)(cast(long*)mem_addr);
1292     }
1293     else
1294     {
1295         const(long)* p = cast(const(long)*)mem_addr; 
1296         long4 r;
1297         r.ptr[0] = p[0];
1298         r.ptr[1] = p[1];
1299         r.ptr[2] = p[2];
1300         r.ptr[3] = p[3];
1301         return r;
1302     }
1303 }
1304 unittest
1305 {
1306     align(16) int[8] correct = [-1, 2, -3, 4, 9, -7, 8, -6];
1307     int8 A = cast(int8) _mm256_loadu_si256(cast(__m256i*) correct.ptr);
1308     assert(A.array == correct);
1309 }
1310 
1311 /// Load 256-bits of integer data from memory. `mem_addr` must be aligned on a 
1312 /// 32-byte boundary or a general-protection exception may be generated.
1313 __m256i _mm256_load_si256 (const(void)* mem_addr) pure @system
1314 {
1315     return *cast(__m256i*)mem_addr;
1316 }
1317 unittest
1318 {
1319     static immutable align(64) long[4] correct = [1, -2, long.min, long.max];
1320     __m256i A = _mm256_load_si256(correct.ptr);
1321     assert(A.array == correct);
1322 }
1323 
1324 /// Load 256-bits (composed of 4 packed double-precision (64-bit) floating-point elements) 
1325 /// from memory. `mem_addr` does not need to be aligned on any particular boundary.
1326 __m256d _mm256_loadu_pd (const(void)* mem_addr) pure @system
1327 {
1328     // PERF DMD
1329     static if (GDC_with_AVX)
1330     {
1331         return __builtin_ia32_loadupd256 ( cast(const(double)*) mem_addr);
1332     }
1333     else version(LDC)
1334     {
1335         return loadUnaligned!(__m256d)(cast(double*)mem_addr);
1336     }    
1337     else
1338     {
1339         const(double)* p = cast(const(double)*)mem_addr; 
1340         double4 r;
1341         r.ptr[0] = p[0];
1342         r.ptr[1] = p[1];
1343         r.ptr[2] = p[2];
1344         r.ptr[3] = p[3];
1345         return r;
1346     }
1347 }
1348 unittest
1349 {
1350     double[4] correct = [1.0, -2.0, 0.0, 768.5];
1351     __m256d A = _mm256_loadu_pd(correct.ptr);
1352     assert(A.array == correct);
1353 }
1354 
1355 /// Load 256-bits (composed of 8 packed single-precision (32-bit) floating-point elements) from memory.
1356 /// `mem_addr` does not need to be aligned on any particular boundary.
1357 __m256 _mm256_loadu_ps (const(float)* mem_addr) pure @system
1358 {
1359     // PERF DMD
1360     static if (GDC_with_AVX)
1361     {
1362         return __builtin_ia32_loadups256 ( cast(const(float)*) mem_addr);
1363     }
1364     else version(LDC)
1365     {
1366         return loadUnaligned!(__m256)(cast(float*)mem_addr);
1367     }    
1368     else
1369     {
1370         const(float)* p = cast(const(float)*)mem_addr; 
1371         float8 r = void;
1372         r.ptr[0] = p[0];
1373         r.ptr[1] = p[1];
1374         r.ptr[2] = p[2];
1375         r.ptr[3] = p[3];
1376         r.ptr[4] = p[4];
1377         r.ptr[5] = p[5];
1378         r.ptr[6] = p[6];
1379         r.ptr[7] = p[7];
1380         return r;
1381     }
1382 }
1383 unittest
1384 {
1385     align(32) float[10] correct = [0.0f, 1, 2, 3, 4, 5, 6, 7, 8, 9];
1386     __m256 A = _mm256_loadu_ps(&correct[1]);
1387     assert(A.array == correct[1..9]);
1388 }
1389 
1390 /// Load two 128-bit values (composed of 4 packed single-precision (32-bit) floating-point 
1391 /// elements) from memory, and combine them into a 256-bit value. 
1392 /// `hiaddr` and `loaddr` do not need to be aligned on any particular boundary.
1393 __m256 _mm256_loadu2_m128 (const(float)* hiaddr, const(float)* loaddr) pure @system
1394 {
1395     // Note: no particular instruction for this in x86.
1396     return _mm256_set_m128(_mm_loadu_ps(hiaddr), _mm_loadu_ps(loaddr));
1397 }
1398 unittest
1399 {
1400     align(32) float[6] A = [4.5f, 2, 8, 97, -1, 3];
1401     align(32) float[6] B = [6.5f, 3, 9, 98, -2, 4];
1402     __m256 R = _mm256_loadu2_m128(&B[1], &A[1]);
1403     float[8] correct = [2.0f, 8, 97, -1, 3, 9, 98, -2];
1404     assert(R.array == correct);
1405 }
1406 
1407 /// Load two 128-bit values (composed of 2 packed double-precision (64-bit) floating-point
1408 /// elements) from memory, and combine them into a 256-bit value. 
1409 /// `hiaddr` and `loaddr` do not need to be aligned on any particular boundary.
1410 __m256d _mm256_loadu2_m128d (const(double)* hiaddr, const(double)* loaddr) pure @system
1411 {
1412     // Note: no particular instruction for this in x86.
1413     return _mm256_set_m128d(_mm_loadu_pd(hiaddr), _mm_loadu_pd(loaddr));
1414 }
1415 unittest
1416 {
1417     align(32) double[4] A = [4.5f, 2, 8, 97];
1418     align(32) double[4] B = [6.5f, 3, 9, 98];
1419     __m256d R = _mm256_loadu2_m128d(&B[1], &A[1]);
1420     double[4] correct = [2.0, 8, 3, 9];
1421     assert(R.array == correct);
1422 }
1423 
1424 /// Load two 128-bit values (composed of integer data) from memory, and combine them into a 
1425 /// 256-bit value. `hiaddr` and `loaddr` do not need to be aligned on any particular boundary.
1426 __m256i _mm256_loadu2_m128i (const(__m128i)* hiaddr, const(__m128i)* loaddr) pure @trusted
1427 {
1428     // Note: no particular instruction for this in x86.
1429     return _mm256_set_m128i(_mm_loadu_si128(hiaddr), _mm_loadu_si128(loaddr));
1430 }
1431 unittest
1432 {
1433     align(32) long[4] A = [5, 2, 8, 97];
1434     align(32) long[4] B = [6, 3, 9, 98];
1435     __m256i R = _mm256_loadu2_m128i(cast(const(__m128i)*) &B[1], cast(const(__m128i)*)  &A[1]);
1436     long[4] correct = [2, 8, 3, 9];
1437     assert(R.array == correct);
1438 }
1439 
1440 
1441 // TODO __m128d _mm_maskload_pd (double const * mem_addr, __m128i mask)
1442 // TODO __m256d _mm256_maskload_pd (double const * mem_addr, __m256i mask)
1443 // TODO __m128 _mm_maskload_ps (float const * mem_addr, __m128i mask)
1444 // TODO __m256 _mm256_maskload_ps (float const * mem_addr, __m256i mask)
1445 // TODO void _mm_maskstore_pd (double * mem_addr, __m128i mask, __m128d a)
1446 // TODO void _mm256_maskstore_pd (double * mem_addr, __m256i mask, __m256d a)
1447 // TODO void _mm_maskstore_ps (float * mem_addr, __m128i mask, __m128 a)
1448 // TODO void _mm256_maskstore_ps (float * mem_addr, __m256i mask, __m256 a)
1449 
1450 /// Compare packed double-precision (64-bit) floating-point elements in `a` and `b`, and return 
1451 /// packed maximum values.
1452 __m256d _mm256_max_pd (__m256d a, __m256d b) pure @trusted
1453 {    
1454     // PERF DMD D_SIMD
1455     static if (GDC_or_LDC_with_AVX)
1456     {
1457         return __builtin_ia32_maxpd256(a, b);
1458     }
1459     else
1460     {
1461         // LDC: becomes good in -O2
1462         // PERF: GDC without AVX
1463         a.ptr[0] = (a.array[0] > b.array[0]) ? a.array[0] : b.array[0];
1464         a.ptr[1] = (a.array[1] > b.array[1]) ? a.array[1] : b.array[1];
1465         a.ptr[2] = (a.array[2] > b.array[2]) ? a.array[2] : b.array[2];
1466         a.ptr[3] = (a.array[3] > b.array[3]) ? a.array[3] : b.array[3];
1467         return a;
1468     }
1469 }
1470 unittest
1471 {
1472     __m256d A = _mm256_setr_pd(4.0, 1.0, -9.0, double.infinity);
1473     __m256d B = _mm256_setr_pd(1.0, 8.0,  0.0, 100000.0);
1474     __m256d M = _mm256_max_pd(A, B);
1475     double[4] correct =       [4.0, 8.0, 0.0, double.infinity];
1476 }
1477 
1478 /// Compare packed single-precision (32-bit) floating-point elements in `a` and `b`, and return 
1479 /// packed maximum values.
1480 __m256 _mm256_max_ps (__m256 a, __m256 b) pure @trusted
1481 {
1482     // PERF DMD D_SIMD
1483     static if (GDC_or_LDC_with_AVX)
1484     {
1485         return __builtin_ia32_maxps256(a, b);
1486     }
1487     else
1488     {
1489         // LDC: becomes good in -O2, but looks brittle.
1490         // PERF GDC without AVX
1491         a.ptr[0] = (a.array[0] > b.array[0]) ? a.array[0] : b.array[0];
1492         a.ptr[1] = (a.array[1] > b.array[1]) ? a.array[1] : b.array[1];
1493         a.ptr[2] = (a.array[2] > b.array[2]) ? a.array[2] : b.array[2];
1494         a.ptr[3] = (a.array[3] > b.array[3]) ? a.array[3] : b.array[3];
1495         a.ptr[4] = (a.array[4] > b.array[4]) ? a.array[4] : b.array[4];
1496         a.ptr[5] = (a.array[5] > b.array[5]) ? a.array[5] : b.array[5];
1497         a.ptr[6] = (a.array[6] > b.array[6]) ? a.array[6] : b.array[6];
1498         a.ptr[7] = (a.array[7] > b.array[7]) ? a.array[7] : b.array[7];
1499         return a;
1500     }
1501 }
1502 unittest
1503 {
1504     __m256 A = _mm256_setr_ps(4.0, 1.0, -9.0, float.infinity, 1, 2, 3, 4);
1505     __m256 B = _mm256_setr_ps(1.0, 8.0,  0.0, 100000.0f     , 4, 3, 2, 1);
1506     __m256 M = _mm256_max_ps(A, B);
1507     float[8] correct =       [4.0, 8.0,  0.0, float.infinity , 4, 3, 3, 4];
1508 }
1509 
1510 // Compare packed double-precision (64-bit) floating-point elements in `a` and `b`, and return 
1511 /// packed minimum values.
1512 __m256d _mm256_min_pd (__m256d a, __m256d b) pure @trusted
1513 {
1514     // PERF DMD D_SIMD
1515     static if (GDC_or_LDC_with_AVX)
1516     {
1517         return __builtin_ia32_minpd256(a, b);
1518     }
1519     else
1520     {
1521         // LDC: becomes good in -O2
1522         // PERF: GDC without AVX
1523         a.ptr[0] = (a.array[0] < b.array[0]) ? a.array[0] : b.array[0];
1524         a.ptr[1] = (a.array[1] < b.array[1]) ? a.array[1] : b.array[1];
1525         a.ptr[2] = (a.array[2] < b.array[2]) ? a.array[2] : b.array[2];
1526         a.ptr[3] = (a.array[3] < b.array[3]) ? a.array[3] : b.array[3];
1527         return a;
1528     }
1529 }
1530 unittest
1531 {
1532     __m256d A = _mm256_setr_pd(4.0, 1.0, -9.0, double.infinity);
1533     __m256d B = _mm256_setr_pd(1.0, 8.0,  0.0, 100000.0);
1534     __m256d M = _mm256_min_pd(A, B);
1535     double[4] correct =       [1.0, 8.0, -9.0, 100000.0];
1536 }
1537 
1538 /// Compare packed single-precision (32-bit) floating-point elements in `a` and `b`, and return 
1539 /// packed maximum values.
1540 __m256 _mm256_min_ps (__m256 a, __m256 b) pure @trusted
1541 {
1542     // PERF DMD D_SIMD
1543     static if (GDC_or_LDC_with_AVX)
1544     {
1545         return __builtin_ia32_minps256(a, b);
1546     }
1547     else
1548     {
1549         // LDC: becomes good in -O2, but looks brittle.
1550         // PERF GDC without AVX
1551         a.ptr[0] = (a.array[0] < b.array[0]) ? a.array[0] : b.array[0];
1552         a.ptr[1] = (a.array[1] < b.array[1]) ? a.array[1] : b.array[1];
1553         a.ptr[2] = (a.array[2] < b.array[2]) ? a.array[2] : b.array[2];
1554         a.ptr[3] = (a.array[3] < b.array[3]) ? a.array[3] : b.array[3];
1555         a.ptr[4] = (a.array[4] < b.array[4]) ? a.array[4] : b.array[4];
1556         a.ptr[5] = (a.array[5] < b.array[5]) ? a.array[5] : b.array[5];
1557         a.ptr[6] = (a.array[6] < b.array[6]) ? a.array[6] : b.array[6];
1558         a.ptr[7] = (a.array[7] < b.array[7]) ? a.array[7] : b.array[7];
1559         return a;
1560     }
1561 }
1562 unittest
1563 {
1564     __m256 A = _mm256_setr_ps(4.0, 1.0, -9.0, float.infinity, 1, 2, 3, 4);
1565     __m256 B = _mm256_setr_ps(1.0, 8.0,  0.0, 100000.0f     , 4, 3, 2, 1);
1566     __m256 M = _mm256_min_ps(A, B);
1567     float[8] correct =       [1.0, 1.0, -9.0, 100000.0f     , 1, 2, 2, 1];
1568 }
1569 
1570 
1571 // TODO __m256d _mm256_movedup_pd (__m256d a)
1572 // TODO __m256 _mm256_movehdup_ps (__m256 a)
1573 // TODO __m256 _mm256_moveldup_ps (__m256 a)
1574 // TODO int _mm256_movemask_pd (__m256d a)
1575 // TODO int _mm256_movemask_ps (__m256 a)
1576 
1577 /// Multiply packed double-precision (64-bit) floating-point elements in `a` and `b`.
1578 __m256d _mm256_mul_pd (__m256d a, __m256d b) pure @safe
1579 {
1580     return a * b;
1581 }
1582 unittest
1583 {
1584     __m256d a = [-2.0, 1.5, -2.0, 1.5];
1585     a = _mm256_mul_pd(a, a);
1586     assert(a.array == [4.0, 2.25, 4.0, 2.25]);
1587 }
1588 
1589 /// Multiply packed single-precision (32-bit) floating-point elements in `a` and `b`.
1590 __m256 _mm256_mul_ps (__m256 a, __m256 b) pure @safe
1591 {
1592     return a * b;
1593 }
1594 unittest
1595 {
1596     __m256 a = [1.5f, -2.0f, 3.0f, 1.0f, 1.5f, -2.0f, 3.0f, 1.0f];
1597     a = _mm256_mul_ps(a, a);
1598     float[8] correct = [2.25f, 4.0f, 9.0f, 1.0f, 2.25f, 4.0f, 9.0f, 1.0f];
1599     assert(a.array == correct);
1600 }
1601 
1602 
1603 /// Compute the bitwise NOT of 256 bits in `a`. #BONUS
1604 __m256i _mm256_not_si256 (__m256i a) pure @safe
1605 {
1606     return ~a;
1607 }
1608 unittest
1609 {
1610     __m256i A = _mm256_set1_epi64x(-748);
1611     long4 notA = cast(long4) _mm256_not_si256(A);
1612     int[4] correct = [747, 747, 747, 747];
1613     assert(notA.array == correct);
1614 }
1615 
1616 /// Compute the bitwise OR of packed double-precision (64-bit) floating-point elements in `a` and `b`.
1617 __m256d _mm256_or_pd (__m256d a, __m256d b) pure @safe
1618 {
1619     return cast(__m256d)( cast(__m256i)a | cast(__m256i)b );
1620 }
1621 
1622 /// Compute the bitwise OR of packed single-precision (32-bit) floating-point elements in `a` and `b`.
1623 __m256 _mm256_or_ps (__m256 a, __m256 b) pure @safe
1624 {
1625     return cast(__m256)( cast(__m256i)a | cast(__m256i)b );
1626 }
1627 
1628 // TODO __m128d _mm_permute_pd (__m128d a, int imm8)
1629 // TODO __m256d _mm256_permute_pd (__m256d a, int imm8)
1630 // TODO __m128 _mm_permute_ps (__m128 a, int imm8)
1631 // TODO __m256 _mm256_permute_ps (__m256 a, int imm8)
1632 // TODO __m256d _mm256_permute2f128_pd (__m256d a, __m256d b, int imm8)
1633 // TODO __m256 _mm256_permute2f128_ps (__m256 a, __m256 b, int imm8)
1634 // TODO __m256i _mm256_permute2f128_si256 (__m256i a, __m256i b, int imm8)
1635 // TODO __m128d _mm_permutevar_pd (__m128d a, __m128i b)
1636 // TODO __m256d _mm256_permutevar_pd (__m256d a, __m256i b)
1637 // TODO __m128 _mm_permutevar_ps (__m128 a, __m128i b)
1638 // TODO __m256 _mm256_permutevar_ps (__m256 a, __m256i b)
1639 
1640 // TODO __m256 _mm256_rcp_ps (__m256 a)
1641 
1642 // TODO __m256d _mm256_round_pd (__m256d a, int rounding)
1643 // TODO __m256 _mm256_round_ps (__m256 a, int rounding)
1644 
1645 // TODO __m256 _mm256_rsqrt_ps (__m256 a)
1646 
1647 
1648 /// Set packed 16-bit integers with the supplied values.
1649 __m256i _mm256_set_epi16 (short e15, short e14, short e13, short e12, short e11, short e10, short e9, short e8, short e7, short e6, short e5, short e4, short e3, short e2, short e1, short e0) pure @trusted
1650 {
1651     short16 r; // Note: = void would prevent GDC from inlining a constant short16...
1652     r.ptr[0] = e0;
1653     r.ptr[1] = e1;
1654     r.ptr[2] = e2;
1655     r.ptr[3] = e3;
1656     r.ptr[4] = e4;
1657     r.ptr[5] = e5;
1658     r.ptr[6] = e6;
1659     r.ptr[7] = e7;
1660     r.ptr[8] = e8;
1661     r.ptr[9] = e9;
1662     r.ptr[10] = e10;
1663     r.ptr[11] = e11;
1664     r.ptr[12] = e12;
1665     r.ptr[13] = e13;
1666     r.ptr[14] = e14;
1667     r.ptr[15] = e15;
1668     return cast(__m256i) r;
1669 }
1670 unittest
1671 {
1672     short16 A = cast(short16) _mm256_set_epi16(15, 14, 13, 12, 11, 10, 9, 8, 
1673                                                7, 6, 5, 4, 3, 2, 1, 0);
1674     foreach(i; 0..16)
1675         assert(A.array[i] == i);
1676 }
1677 
1678 /// Set packed 32-bit integers with the supplied values.
1679 __m256i _mm256_set_epi32 (int e7, int e6, int e5, int e4, int e3, int e2, int e1, int e0) pure @trusted
1680 {
1681     // Inlines a constant with GCC -O1, LDC -O2
1682     int8 r; // = void would prevent GCC from inlining a constant call
1683     r.ptr[0] = e0;
1684     r.ptr[1] = e1;
1685     r.ptr[2] = e2;
1686     r.ptr[3] = e3;
1687     r.ptr[4] = e4;
1688     r.ptr[5] = e5;
1689     r.ptr[6] = e6;
1690     r.ptr[7] = e7;
1691     return cast(__m256i)r;
1692 }
1693 unittest
1694 {
1695     int8 A = cast(int8) _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
1696     foreach(i; 0..8)
1697         assert(A.array[i] == i);
1698 }
1699 
1700 /// Set packed 64-bit integers with the supplied values.
1701 __m256i _mm256_set_epi64x (long e3, long e2, long e1, long e0) pure @trusted
1702 {
1703     long4 r = void;
1704     r.ptr[0] = e0;
1705     r.ptr[1] = e1;
1706     r.ptr[2] = e2;
1707     r.ptr[3] = e3;
1708     return r;
1709 }
1710 unittest
1711 {
1712     __m256i A = _mm256_set_epi64x(-1, 42, long.min, long.max);
1713     long[4] correct = [long.max, long.min, 42, -1];
1714     assert(A.array == correct);
1715 }
1716 
1717 /// Set packed 8-bit integers with the supplied values.
1718 __m256i _mm256_set_epi8 (byte e31, byte e30, byte e29, byte e28, byte e27, byte e26, byte e25, byte e24, 
1719                          byte e23, byte e22, byte e21, byte e20, byte e19, byte e18, byte e17, byte e16, 
1720                          byte e15, byte e14, byte e13, byte e12, byte e11, byte e10,  byte e9,  byte e8, 
1721                           byte e7,  byte e6,  byte e5,  byte e4,  byte e3,  byte e2,  byte e1,  byte e0)
1722 {
1723     // Inline a constant call in GDC -O1 and LDC -O2
1724     align(32) byte[32] result = [ e0,  e1,  e2,  e3,  e4,  e5,  e6,  e7,
1725                                   e8,  e9, e10, e11, e12, e13, e14, e15,
1726                                  e16, e17, e18, e19, e20, e21, e22, e23,
1727                                  e24, e25, e26, e27, e28, e29, e30, e31 ];
1728     return *cast(__m256i*)(result.ptr);
1729 }
1730 unittest
1731 {
1732     byte32 R = cast(byte32) _mm256_set_epi8(-1, 0, 56, 127, -128, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7, 4, 5, 6, 7);
1733     byte[32] correct = [7, 6, 5, 4, 7, 6, 5, 4, 3, 2, 1, 0, 3, 2, 1, 0,
1734                         14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, -128, 127, 56, 0, -1];
1735     assert(R.array == correct);
1736 }
1737 
1738 /// Set packed `__m256d` vector with the supplied values.
1739 __m256 _mm256_set_m128 (__m128 hi, __m128 lo) pure @trusted
1740 {
1741     // DMD PERF
1742     static if (GDC_with_AVX)
1743     {
1744         __m256 r = __builtin_ia32_ps256_ps(lo);
1745         return __builtin_ia32_vinsertf128_ps256(r, hi, 1);
1746     }
1747     else version(DigitalMars)
1748     {
1749         __m256 r = void;
1750         r.ptr[0] = lo.array[0];
1751         r.ptr[1] = lo.array[1];
1752         r.ptr[2] = lo.array[2];
1753         r.ptr[3] = lo.array[3];
1754         r.ptr[4] = hi.array[0];
1755         r.ptr[5] = hi.array[1];
1756         r.ptr[6] = hi.array[2];
1757         r.ptr[7] = hi.array[3];
1758         return r;
1759     }
1760     else
1761     {
1762         // TODO: BUG, doesn't work if AVX vector is emulated, but SSE vector is not
1763         // PERF: this crash on DMD v100.2 on Linux x86_64, find out why since 
1764         // it would be better performance wise
1765         // Note: probably because emulated AVX vectors have no alignment requisites!
1766         __m256 r = void;
1767         __m128* p = cast(__m128*)(&r);
1768         p[0] = lo;
1769         p[1] = hi;
1770         return r;
1771     }
1772 }
1773 unittest
1774 {
1775     __m128 lo = _mm_setr_ps(1.0f, 2, 3, 4);
1776     __m128 hi = _mm_setr_ps(3.0f, 4, 5, 6);
1777     __m256 R = _mm256_set_m128(hi, lo);
1778     float[8] correct = [1.0f, 2, 3, 4, 3, 4, 5, 6];
1779     assert(R.array == correct);
1780 }
1781 
1782 /// Set packed `__m256d` vector with the supplied values.
1783 __m256d _mm256_set_m128d (__m128d hi, __m128d lo) pure @trusted
1784 {
1785     __m256d r = void;
1786     r.ptr[0] = lo.array[0];
1787     r.ptr[1] = lo.array[1];
1788     r.ptr[2] = hi.array[0];
1789     r.ptr[3] = hi.array[1];
1790     return r;
1791 }
1792 unittest
1793 {
1794     __m128d lo = _mm_setr_pd(1.0, 2.0);
1795     __m128d hi = _mm_setr_pd(3.0, 4.0);
1796     __m256d R = _mm256_set_m128d(hi, lo);
1797     double[4] correct = [1.0, 2.0, 3.0, 4.0];
1798     assert(R.array == correct);
1799 }
1800 
1801 /// Set packed `__m256i` vector with the supplied values.
1802 __m256i _mm256_set_m128i (__m128i hi, __m128i lo) pure @trusted
1803 {
1804     // DMD PERF
1805     static if (GDC_with_AVX)
1806     {
1807         __m256i r = cast(long4) __builtin_ia32_si256_si (lo);
1808         return cast(long4) __builtin_ia32_vinsertf128_si256(cast(int8)r, hi, 1);
1809     }
1810     else version(DigitalMars)
1811     {
1812         int8 r = void;
1813         r.ptr[0] = lo.array[0];
1814         r.ptr[1] = lo.array[1];
1815         r.ptr[2] = lo.array[2];
1816         r.ptr[3] = lo.array[3];
1817         r.ptr[4] = hi.array[0];
1818         r.ptr[5] = hi.array[1];
1819         r.ptr[6] = hi.array[2];
1820         r.ptr[7] = hi.array[3];
1821         return cast(long4)r;
1822     }
1823     else
1824     {
1825         // PERF Does this also vcrash for DMD? with DMD v100.2 on Linux x86_64
1826         __m256i r = void;
1827         __m128i* p = cast(__m128i*)(&r);
1828         p[0] = lo;
1829         p[1] = hi;
1830         return r;
1831     }
1832 }
1833 unittest
1834 {
1835     __m128i lo = _mm_setr_epi32( 1,  2,  3,  4);
1836     __m128i hi =  _mm_set_epi32(-3, -4, -5, -6);
1837     int8 R = cast(int8)_mm256_set_m128i(hi, lo);
1838     int[8] correct = [1, 2, 3, 4, -6, -5, -4, -3];
1839     assert(R.array == correct);
1840 }
1841 
1842 /// Set packed double-precision (64-bit) floating-point elements with the supplied values.
1843 __m256d _mm256_set_pd (double e3, double e2, double e1, double e0) pure @trusted
1844 {
1845     __m256d r = void;
1846     r.ptr[0] = e0;
1847     r.ptr[1] = e1;
1848     r.ptr[2] = e2;
1849     r.ptr[3] = e3;
1850     return r;
1851 }
1852 unittest
1853 {
1854     __m256d A = _mm256_set_pd(3, 2, 1, 546);
1855     double[4] correct = [546.0, 1.0, 2.0, 3.0];
1856     assert(A.array == correct);
1857 }
1858 
1859 /// Set packed single-precision (32-bit) floating-point elements with the supplied values.
1860 __m256 _mm256_set_ps (float e7, float e6, float e5, float e4, float e3, float e2, float e1, float e0) pure @trusted
1861 {
1862     // PERF: see #102, use = void?
1863     __m256 r;
1864     r.ptr[0] = e0;
1865     r.ptr[1] = e1;
1866     r.ptr[2] = e2;
1867     r.ptr[3] = e3;
1868     r.ptr[4] = e4;
1869     r.ptr[5] = e5;
1870     r.ptr[6] = e6;
1871     r.ptr[7] = e7;
1872     return r;
1873 }
1874 unittest
1875 {
1876     __m256 A = _mm256_set_ps(3, 2, 1, 546.0f, -1.25f, -2, -3, 0);
1877     float[8] correct = [0, -3, -2, -1.25f, 546.0f, 1.0, 2.0, 3.0];
1878     assert(A.array == correct);
1879 }
1880 
1881 /// Broadcast 16-bit integer `a` to all elements of the return value.
1882 __m256i _mm256_set1_epi16 (short a) pure @trusted
1883 {
1884     // workaround https://issues.dlang.org/show_bug.cgi?id=21469
1885     // It used to ICE, now the codegen is just wrong.
1886     // TODO report this backend issue.
1887     version(DigitalMars) 
1888     {
1889         short16 v = a;
1890         return cast(__m256i) v;
1891     }
1892     else
1893     {
1894         pragma(inline, true);
1895         return cast(__m256i)(short16(a));
1896     }
1897 }
1898 unittest
1899 {
1900     short16 a = cast(short16) _mm256_set1_epi16(31);
1901     for (int i = 0; i < 16; ++i)
1902         assert(a.array[i] == 31);
1903 }
1904 
1905 /// Broadcast 32-bit integer `a` to all elements.
1906 __m256i _mm256_set1_epi32 (int a) pure @trusted
1907 {
1908     // Bad codegen else in DMD.
1909     // TODO report this backend issue.
1910     version(DigitalMars) 
1911     {
1912         int8 v = a;
1913         return cast(__m256i) v;
1914     }
1915     else
1916     {
1917         pragma(inline, true);
1918         return cast(__m256i)(int8(a));
1919     }
1920 }
1921 unittest
1922 {
1923     int8 a = cast(int8) _mm256_set1_epi32(31);
1924     for (int i = 0; i < 8; ++i)
1925         assert(a.array[i] == 31);
1926 }
1927 
1928 /// Broadcast 64-bit integer `a` to all elements of the return value.
1929 __m256i _mm256_set1_epi64x (long a)
1930 {
1931     return cast(__m256i)(long4(a));
1932 }
1933 unittest
1934 {
1935     long4 a = cast(long4) _mm256_set1_epi64x(-31);
1936     for (int i = 0; i < 4; ++i)
1937         assert(a.array[i] == -31);
1938 }
1939 
1940 /// Broadcast 8-bit integer `a` to all elements of the return value.
1941 __m256i _mm256_set1_epi8 (byte a) pure @trusted
1942 {
1943     version(DigitalMars) // workaround https://issues.dlang.org/show_bug.cgi?id=21469
1944     {
1945         byte32 v = a;
1946         return cast(__m256i) v;
1947     }
1948     else
1949     {
1950         pragma(inline, true);
1951         return cast(__m256i)(byte32(a));
1952     }
1953 }
1954 unittest
1955 {
1956     byte32 a = cast(byte32) _mm256_set1_epi8(31);
1957     for (int i = 0; i < 32; ++i)
1958         assert(a.array[i] == 31);
1959 }
1960 
1961 /// Broadcast double-precision (64-bit) floating-point value `a` to all elements of the return value.
1962 __m256d _mm256_set1_pd (double a) pure @trusted
1963 {
1964     return __m256d(a);
1965 }
1966 unittest
1967 {
1968     double a = 464.21;
1969     double[4] correct = [a, a, a, a];
1970     double4 A = cast(double4) _mm256_set1_pd(a);
1971     assert(A.array == correct);
1972 }
1973 
1974 /// Broadcast single-precision (32-bit) floating-point value `a` to all elements of the return value.
1975 __m256 _mm256_set1_ps (float a) pure @trusted
1976 {
1977     return __m256(a);
1978 }
1979 unittest
1980 {
1981     float a = 464.21f;
1982     float[8] correct = [a, a, a, a, a, a, a, a];
1983     float8 A = cast(float8) _mm256_set1_ps(a);
1984     assert(A.array == correct);
1985 }
1986 
1987 /// Set packed 16-bit integers with the supplied values in reverse order.
1988 __m256i _mm256_setr_epi16 (short e15, short e14, short e13, short e12, short e11, short e10, short e9,  short e8,
1989                            short e7,  short e6,  short e5,  short e4,  short e3,  short e2,  short e1,  short e0) pure @trusted
1990 {
1991     short[16] result = [ e15,  e14,  e13,  e12,  e11,  e10,  e9,   e8,
1992                          e7,   e6,   e5,   e4,   e3,   e2,   e1,   e0];
1993     static if (GDC_with_AVX)
1994     {
1995          return cast(__m256i) __builtin_ia32_loaddqu256(cast(const(char)*) result.ptr);
1996     }
1997     else version(LDC)
1998     {
1999         return cast(__m256i)( loadUnaligned!(short16)(result.ptr) );
2000     }
2001     else
2002     {
2003         short16 r;
2004         for(int n = 0; n < 16; ++n)
2005             r.ptr[n] = result[n];
2006         return cast(__m256i)r;
2007     }
2008 }
2009 unittest
2010 {
2011     short16 A = cast(short16) _mm256_setr_epi16(-1, 0, -21, 21, 42, 127, -42, -128,
2012                                                 -1, 0, -21, 21, 42, 127, -42, -128);
2013     short[16] correct = [-1, 0, -21, 21, 42, 127, -42, -128,
2014                          -1, 0, -21, 21, 42, 127, -42, -128];
2015     assert(A.array == correct);
2016 }
2017 
2018 /// Set packed 32-bit integers with the supplied values in reverse order.
2019 __m256i _mm256_setr_epi32 (int e7, int e6, int e5, int e4, int e3, int e2, int e1, int e0) pure @trusted
2020 {
2021     // Inlines a constant with GCC -O1, LDC -O2
2022     int8 r; // = void would prevent GDC from inlining a constant call
2023     r.ptr[0] = e7;
2024     r.ptr[1] = e6;
2025     r.ptr[2] = e5;
2026     r.ptr[3] = e4;
2027     r.ptr[4] = e3;
2028     r.ptr[5] = e2;
2029     r.ptr[6] = e1;
2030     r.ptr[7] = e0;
2031     return cast(__m256i)r;
2032 }
2033 unittest
2034 {
2035     int8 A = cast(int8) _mm256_setr_epi32(-1, 0, -2147483648, 2147483647, 42, 666, -42, -666);
2036     int[8] correct = [-1, 0, -2147483648, 2147483647, 42, 666, -42, -666];
2037     assert(A.array == correct);
2038 }
2039 
2040 /// Set packed 64-bit integers with the supplied values in reverse order.
2041 __m256i _mm256_setr_epi64x (long e3, long e2, long e1, long e0) pure @trusted
2042 {
2043     long4 r = void;
2044     r.ptr[0] = e3;
2045     r.ptr[1] = e2;
2046     r.ptr[2] = e1;
2047     r.ptr[3] = e0;
2048     return r;
2049 }
2050 unittest
2051 {
2052     __m256i A = _mm256_setr_epi64x(-1, 42, long.min, long.max);
2053     long[4] correct = [-1, 42, long.min, long.max];
2054     assert(A.array == correct);
2055 }
2056 
2057 /// Set packed 8-bit integers with the supplied values in reverse order.
2058 __m256i _mm256_setr_epi8 (byte e31, byte e30, byte e29, byte e28, byte e27, byte e26, byte e25, byte e24,
2059                           byte e23, byte e22, byte e21, byte e20, byte e19, byte e18, byte e17, byte e16,
2060                           byte e15, byte e14, byte e13, byte e12, byte e11, byte e10, byte e9,  byte e8,
2061                           byte e7,  byte e6,  byte e5,  byte e4,  byte e3,  byte e2,  byte e1,  byte e0) pure @trusted
2062 {
2063     // Inline a constant call in GDC -O1 and LDC -O2
2064     align(32) byte[32] result = [ e31,  e30,  e29,  e28,  e27,  e26,  e25,  e24,
2065                                   e23,  e22,  e21,  e20,  e19,  e18,  e17,  e16,
2066                                   e15,  e14,  e13,  e12,  e11,  e10,  e9,   e8,
2067                                    e7,   e6,   e5,   e4,   e3,   e2,   e1,   e0];
2068     return *cast(__m256i*)(result.ptr);
2069 }
2070 unittest
2071 {
2072     byte32 A = cast(byte32) _mm256_setr_epi8( -1, 0, -21, 21, 42, 127, -42, -128,
2073                                               -1, 0, -21, 21, 42, 127, -42, -128,
2074                                               -1, 0, -21, 21, 42, 127, -42, -128,
2075                                               -1, 0, -21, 21, 42, 127, -42, -128);
2076     byte[32] correct = [-1, 0, -21, 21, 42, 127, -42, -128,
2077                         -1, 0, -21, 21, 42, 127, -42, -128,
2078                         -1, 0, -21, 21, 42, 127, -42, -128,
2079                         -1, 0, -21, 21, 42, 127, -42, -128];
2080     assert(A.array == correct);
2081 }
2082 
2083 /// Set packed `__m256` vector with the supplied values.
2084 __m256 _mm256_setr_m128 (__m128 lo, __m128 hi)
2085 {
2086     return _mm256_set_m128(hi, lo);
2087 }
2088 unittest
2089 {
2090     __m128 A = _mm_setr_ps(1.0f, 2, 3, 4);
2091     __m128 B = _mm_setr_ps(3.0f, 4, 5, 6);
2092     __m256 R = _mm256_setr_m128(B, A);
2093     float[8] correct = [3.0f, 4, 5, 6, 1, 2, 3, 4,];
2094     assert(R.array == correct);
2095 }
2096 
2097 /// Set packed `__m256d` vector with the supplied values.
2098 __m256d _mm256_setr_m128d (__m128d lo, __m128d hi)
2099 {
2100     return _mm256_set_m128d(hi, lo);
2101 }
2102 unittest
2103 {
2104     __m128d A = _mm_setr_pd(1.0, 2.0);
2105     __m128d B = _mm_setr_pd(3.0, 4.0);
2106     __m256d R = _mm256_setr_m128d(B, A);
2107     double[4] correct = [3.0, 4.0, 1.0, 2.0];
2108     assert(R.array == correct);
2109 }
2110 
2111 /// Set packed `__m256i` vector with the supplied values.
2112 __m256i _mm256_setr_m128i (__m128i lo, __m128i hi)
2113 {
2114     return _mm256_set_m128i(hi, lo);
2115 }
2116 unittest
2117 {
2118     __m128i A = _mm_setr_epi32( 1,  2,  3,  4);
2119     __m128i B =  _mm_set_epi32(-3, -4, -5, -6);
2120     int8 R = cast(int8)_mm256_setr_m128i(B, A);
2121     int[8] correct = [-6, -5, -4, -3, 1, 2, 3, 4];
2122     assert(R.array == correct);
2123 }
2124 
2125 /// Set packed double-precision (64-bit) floating-point elements with the supplied values in reverse order.
2126 __m256d _mm256_setr_pd (double e3, double e2, double e1, double e0) pure @trusted
2127 {
2128     version(LDC)
2129     {
2130         // PERF, probably not the best
2131         double[4] result = [e3, e2, e1, e0];
2132         return loadUnaligned!(double4)(result.ptr);
2133     }
2134     else
2135     {
2136         __m256d r;
2137         r.ptr[0] = e3;
2138         r.ptr[1] = e2;
2139         r.ptr[2] = e1;
2140         r.ptr[3] = e0;
2141         return r;
2142     }
2143 }
2144 unittest
2145 {
2146     __m256d A = _mm256_setr_pd(3, 2, 1, 546.125);
2147     double[4] correct = [3.0, 2.0, 1.0, 546.125];
2148     assert(A.array == correct);
2149 }
2150 
2151 
2152 /// Set packed single-precision (32-bit) floating-point elements with the supplied values in reverse order.
2153 __m256 _mm256_setr_ps (float e7, float e6, float e5, float e4, float e3, float e2, float e1, float e0) pure @trusted
2154 {
2155     // PERF DMD
2156     static if (GDC_with_AVX)
2157     {
2158         align(32) float[8] r = [ e7,   e6,   e5,   e4,   e3,   e2,   e1,   e0];
2159         return *cast(__m256*)r;
2160     }
2161     else version(LDC)
2162     {
2163         align(32) float[8] r = [ e7,   e6,   e5,   e4,   e3,   e2,   e1,   e0];
2164         return *cast(__m256*)r;
2165     }
2166     else
2167     {
2168         __m256 r;
2169         r.ptr[0] = e7;
2170         r.ptr[1] = e6;
2171         r.ptr[2] = e5;
2172         r.ptr[3] = e4;
2173         r.ptr[4] = e3;
2174         r.ptr[5] = e2;
2175         r.ptr[6] = e1;
2176         r.ptr[7] = e0;
2177         return r;
2178     }
2179 }
2180 unittest
2181 {
2182     __m256 A = _mm256_setr_ps(   3, 2, 1, 546.125f, 4, 5, 6, 7);
2183     float[8] correct       = [3.0f, 2, 1, 546.125f, 4, 5, 6, 7];
2184     assert(A.array == correct);
2185 }
2186 
2187 /// Return vector of type `__m256d` with all elements set to zero.
2188 __m256d _mm256_setzero_pd() pure @safe
2189 {
2190     return double4(0.0);
2191 }
2192 unittest
2193 {
2194     __m256d A = _mm256_setzero_pd();
2195     double[4] correct = [0.0, 0.0, 0.0, 0.0];
2196     assert(A.array == correct);
2197 }
2198 
2199 /// Return vector of type `__m256` with all elements set to zero.
2200 __m256 _mm256_setzero_ps() pure @safe
2201 {
2202     return float8(0.0f);
2203 }
2204 unittest
2205 {
2206     __m256 A = _mm256_setzero_ps();
2207     float[8] correct = [0.0f, 0, 0, 0, 0, 0, 0, 0];
2208     assert(A.array == correct);
2209 }
2210 
2211 /// Return vector of type `__m256i` with all elements set to zero.
2212 __m256i _mm256_setzero_si256() pure @trusted
2213 {
2214     return __m256i(0);
2215 }
2216 unittest
2217 {
2218     __m256i A = _mm256_setzero_si256();
2219     long[4] correct = [0, 0, 0, 0];
2220     assert(A.array == correct);
2221 }
2222 
2223 /// Shuffle double-precision (64-bit) floating-point elements within 128-bit lanes using the 
2224 /// control in `imm8`.
2225 __m256d _mm256_shuffle_pd(int imm8)(__m256d a, __m256d b) pure @trusted
2226 {
2227     // PERF DMD D_SIMD
2228     static if (GDC_with_AVX)
2229     {
2230         return __builtin_ia32_shufpd256(a, b, imm8);
2231     }
2232     else version(LDC)
2233     {
2234         return shufflevectorLDC!(double4,        
2235                                        (imm8 >> 0) & 1,
2236                                  4 + ( (imm8 >> 1) & 1),
2237                                  2 + ( (imm8 >> 2) & 1),
2238                                  6 + ( (imm8 >> 3) & 1) )(a, b);
2239     }
2240     else
2241     {
2242         double4 r = void;
2243         r.ptr[0] = a.array[(imm8 >> 0) & 1];
2244         r.ptr[1] = b.array[(imm8 >> 1) & 1];
2245         r.ptr[2] = a.array[2 + ( (imm8 >> 2) & 1)];
2246         r.ptr[3] = b.array[2 + ( (imm8 >> 3) & 1)];
2247         return r;
2248     }
2249 }
2250 unittest
2251 {
2252     __m256d A = _mm256_setr_pd( 0, 1, 2, 3);
2253     __m256d B = _mm256_setr_pd( 4, 5, 6, 7);
2254     __m256d C = _mm256_shuffle_pd!75 /* 01001011 */(A, B);
2255     double[4] correct = [1.0, 5.0, 2.0, 7.0];
2256     assert(C.array == correct);
2257 } 
2258 
2259 /// Shuffle single-precision (32-bit) floating-point elements in `a` within 128-bit lanes using 
2260 /// the control in `imm8`.
2261 __m256 _mm256_shuffle_ps(int imm8)(__m256 a, __m256 b) pure @trusted
2262 {
2263     // PERF DMD D_SIMD
2264     static if (GDC_with_AVX)
2265     {
2266         return __builtin_ia32_shufps256(a, b, imm8);
2267     }
2268     else version(LDC)
2269     {
2270         return shufflevectorLDC!(float8, (imm8 >> 0) & 3,
2271                                  (imm8 >> 2) & 3,
2272                                  8 + ( (imm8 >> 4) & 3),
2273                                  8 + ( (imm8 >> 6) & 3),
2274                                  4 + ( (imm8 >> 0) & 3),
2275                                  4 + ( (imm8 >> 2) & 3),
2276                                  12 + ( (imm8 >> 4) & 3),
2277                                  12 + ( (imm8 >> 6) & 3) )(a, b);
2278     }
2279     else
2280     {
2281         float8 r = void;
2282         r.ptr[0] = a.array[(imm8 >> 0) & 3];
2283         r.ptr[1] = a.array[(imm8 >> 2) & 3];
2284         r.ptr[2] = b.array[(imm8 >> 4) & 3];
2285         r.ptr[3] = b.array[(imm8 >> 6) & 3];
2286         r.ptr[4] = a.array[4 + ( (imm8 >> 0) & 3 )];
2287         r.ptr[5] = a.array[4 + ( (imm8 >> 2) & 3 )];
2288         r.ptr[6] = b.array[4 + ( (imm8 >> 4) & 3 )];
2289         r.ptr[7] = b.array[4 + ( (imm8 >> 6) & 3 )];
2290         return r;
2291     }
2292 }
2293 unittest
2294 {
2295     __m256 A = _mm256_setr_ps( 0,  1,  2,  3,  4,  5,  6,  7);
2296     __m256 B = _mm256_setr_ps( 8,  9, 10, 11, 12, 13, 14, 15);
2297     __m256 C = _mm256_shuffle_ps!75 /* 01001011 */(A, B);
2298     float[8] correct = [3.0f, 2, 8, 9, 7, 6, 12, 13];
2299     assert(C.array == correct);
2300 } 
2301 
2302 /// Compute the square root of packed double-precision (64-bit) floating-point elements in `a`.
2303 __m256d _mm256_sqrt_pd (__m256d a) pure @trusted
2304 {
2305     static if (GDC_with_AVX)
2306     {
2307         return __builtin_ia32_sqrtpd256(a);
2308     } 
2309     else version(LDC)
2310     {    
2311         return llvm_sqrt(a);
2312     }    
2313     else
2314     {
2315         a.ptr[0] = sqrt(a.array[0]);
2316         a.ptr[1] = sqrt(a.array[1]);
2317         a.ptr[2] = sqrt(a.array[2]);
2318         a.ptr[3] = sqrt(a.array[3]);
2319         return a;
2320     }
2321 }
2322 unittest
2323 {
2324     __m256d A = _mm256_sqrt_pd(_mm256_set1_pd(4.0));
2325     double[4] correct = [2.0, 2, 2, 2];
2326     assert(A.array == correct);
2327 }
2328 
2329 /// Compute the square root of packed single-precision (32-bit) floating-point elements in `a`.
2330 __m256 _mm256_sqrt_ps (__m256 a) pure @trusted
2331 {
2332     static if (GDC_with_AVX)
2333     {
2334         return __builtin_ia32_sqrtps256(a);
2335     } 
2336     else version(LDC)
2337     {    
2338         return llvm_sqrt(a);
2339     }    
2340     else
2341     {
2342         a.ptr[0] = sqrt(a.array[0]);
2343         a.ptr[1] = sqrt(a.array[1]);
2344         a.ptr[2] = sqrt(a.array[2]);
2345         a.ptr[3] = sqrt(a.array[3]);
2346         a.ptr[4] = sqrt(a.array[4]);
2347         a.ptr[5] = sqrt(a.array[5]);
2348         a.ptr[6] = sqrt(a.array[6]);
2349         a.ptr[7] = sqrt(a.array[7]);
2350         return a;
2351     }
2352 }
2353 unittest
2354 {
2355     __m256 A = _mm256_sqrt_ps(_mm256_set1_ps(4.0f));
2356     float[8] correct = [2.0f, 2, 2, 2, 2, 2, 2, 2];
2357     assert(A.array == correct);
2358 }
2359 
2360 /// Store 256-bits (composed of 4 packed double-precision (64-bit) floating-point elements) from 
2361 /// `a` into memory. `mem_addr` must be aligned on a 32-byte boundary or a general-protection 
2362 /// exception may be generated.
2363 void _mm256_store_pd (double* mem_addr, __m256d a) pure @system
2364 {
2365     *cast(__m256d*)mem_addr = a;
2366 }
2367 unittest
2368 {
2369     align(32) double[4] mem;
2370     double[4] correct = [1.0, 2, 3, 4];
2371     _mm256_store_pd(mem.ptr, _mm256_setr_pd(1.0, 2, 3, 4));
2372     assert(mem == correct);
2373 }
2374 
2375 /// Store 256-bits (composed of 8 packed single-precision (32-bit) floating-point elements) from 
2376 /// `a` into memory. `mem_addr` must be aligned on a 32-byte boundary or a general-protection 
2377 /// exception may be generated.
2378 void _mm256_store_ps (float* mem_addr, __m256 a) pure @system
2379 {
2380     *cast(__m256*)mem_addr = a;
2381 }
2382 unittest
2383 {
2384     align(32) float[8] mem;
2385     float[8] correct = [1.0, 2, 3, 4, 5, 6, 7, 8];
2386     _mm256_store_ps(mem.ptr, _mm256_set_ps(8.0, 7, 6, 5, 4, 3, 2, 1));
2387     assert(mem == correct);
2388 }
2389 
2390 /// Store 256-bits of integer data from `a` into memory. `mem_addr` must be aligned on a 32-byte 
2391 /// boundary or a general-protection exception may be generated.
2392 void _mm256_store_si256 (__m256i * mem_addr, __m256i a) pure @safe
2393 {
2394     *mem_addr = a;
2395 }
2396 unittest
2397 {
2398     align(32) long[4] mem;
2399     long[4] correct = [5, -6, -7, 8];
2400     _mm256_store_si256(cast(__m256i*)(mem.ptr), _mm256_setr_epi64x(5, -6, -7, 8));
2401     assert(mem == correct);
2402 }
2403 
2404 /// Store 256-bits (composed of 4 packed double-precision (64-bit) floating-point elements) from 
2405 /// `a` into memory. `mem_addr` does not need to be aligned on any particular boundary.
2406 void _mm256_storeu_pd (double * mem_addr, __m256d a) pure @system
2407 {
2408     // PERF: DMD
2409     static if (GDC_with_AVX)
2410     {
2411         __builtin_ia32_storeupd256(mem_addr, a);
2412     }
2413     else version(LDC)
2414     {
2415         storeUnaligned!__m256d(a, mem_addr);
2416     }
2417     else
2418     {
2419         for(int n = 0; n < 4; ++n)
2420             mem_addr[n] = a.array[n];
2421     }
2422 }
2423 unittest
2424 {
2425     align(32) double[6] arr = [0.0, 0, 0, 0, 0, 0];
2426     _mm256_storeu_pd(&arr[1], _mm256_set1_pd(4.0));
2427     double[4] correct = [4.0, 4, 4, 4];
2428     assert(arr[1..5] == correct);
2429 }
2430 
2431 /// Store 256-bits (composed of 8 packed single-precision (32-bit) floating-point elements) from 
2432 /// `a` into memory. `mem_addr` does not need to be aligned on any particular boundary.
2433 void _mm256_storeu_ps (float* mem_addr, __m256 a) pure @system
2434 {
2435     // PERF: DMD
2436     static if (GDC_with_AVX)
2437     {
2438         __builtin_ia32_storeups256(mem_addr, a);
2439     }
2440     else version(LDC)
2441     {
2442         storeUnaligned!__m256(a, mem_addr);
2443     }
2444     else
2445     {
2446         for(int n = 0; n < 8; ++n)
2447             mem_addr[n] = a.array[n];
2448     }
2449 }
2450 unittest
2451 {
2452     align(32) float[10] arr = [0.0f, 0, 0, 0, 0, 0, 0, 0, 0, 0];
2453     _mm256_storeu_ps(&arr[1], _mm256_set1_ps(4.0f));
2454     float[8] correct = [4.0f, 4, 4, 4, 4, 4, 4, 4];
2455     assert(arr[1..9] == correct);
2456 }
2457 
2458 
2459 /// Store 256-bits of integer data from `a` into memory. `mem_addr` does not need to be aligned
2460 ///  on any particular boundary.
2461 void _mm256_storeu_si256 (__m256i* mem_addr, __m256i a) pure @trusted
2462 {
2463     // PERF: DMD
2464     static if (GDC_with_AVX)
2465     {
2466         __builtin_ia32_storedqu256(cast(char*)mem_addr, cast(ubyte32) a);
2467     }
2468     else version(LDC)
2469     {
2470         storeUnaligned!__m256i(a, cast(long*)mem_addr);
2471     }
2472     else
2473     {
2474         long4 v = cast(long4)a;
2475         long* p = cast(long*)mem_addr;
2476         for(int n = 0; n < 4; ++n)
2477             p[n] = v[n];
2478     }
2479 }
2480 unittest
2481 {
2482     align(32) long[6] arr = [0, 0, 0, 0, 0, 0];
2483     _mm256_storeu_si256( cast(__m256i*) &arr[1], _mm256_set1_epi64x(4));
2484     long[4] correct = [4, 4, 4, 4];
2485     assert(arr[1..5] == correct);
2486 }
2487 
2488 /// Store the high and low 128-bit halves (each composed of 4 packed single-precision (32-bit) 
2489 /// floating-point elements) from `a` into memory two different 128-bit locations. 
2490 /// `hiaddr` and `loaddr` do not need to be aligned on any particular boundary.
2491 void _mm256_storeu2_m128 (float* hiaddr, float* loaddr, __m256 a) pure @system
2492 {
2493     // This performed way better on GDC, and similarly in LDC, vs using other intrinsics
2494     loaddr[0] = a.array[0];
2495     loaddr[1] = a.array[1];
2496     loaddr[2] = a.array[2];
2497     loaddr[3] = a.array[3];
2498     hiaddr[0] = a.array[4];
2499     hiaddr[1] = a.array[5];
2500     hiaddr[2] = a.array[6];
2501     hiaddr[3] = a.array[7];
2502 }
2503 unittest
2504 {
2505     align(32) float[11] A = [0.0f, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0];
2506     _mm256_storeu2_m128(&A[1], &A[6], _mm256_set1_ps(2.0f));
2507     float[11] correct     = [0.0f, 2, 2, 2, 2, 0, 2, 2, 2, 2, 0];
2508     assert(A == correct);
2509 }
2510 
2511 /// Store the high and low 128-bit halves (each composed of 2 packed double-precision (64-bit)
2512 /// floating-point elements) from `a` into memory two different 128-bit locations. 
2513 /// `hiaddr` and `loaddr` do not need to be aligned on any particular boundary.
2514 void _mm256_storeu2_m128d (double* hiaddr, double* loaddr, __m256d a) pure @system
2515 {
2516     loaddr[0] = a.array[0];
2517     loaddr[1] = a.array[1];
2518     hiaddr[0] = a.array[2];
2519     hiaddr[1] = a.array[3];
2520 }
2521 unittest
2522 {
2523     double[2] A;
2524     double[2] B;
2525     _mm256_storeu2_m128d(A.ptr, B.ptr, _mm256_set1_pd(-43.0));
2526     double[2] correct = [-43.0, -43];
2527     assert(A == correct);
2528     assert(B == correct);
2529 }
2530 
2531 /// Store the high and low 128-bit halves (each composed of integer data) from `a` into memory two 
2532 /// different 128-bit locations. 
2533 /// `hiaddr` and `loaddr` do not need to be aligned on any particular boundary.
2534 void _mm256_storeu2_m128i (__m128i* hiaddr, __m128i* loaddr, __m256i a) pure @trusted // TODO: signature
2535 {
2536     long* hi = cast(long*)hiaddr;
2537     long* lo = cast(long*)loaddr;
2538     lo[0] = a.array[0];
2539     lo[1] = a.array[1];
2540     hi[0] = a.array[2];
2541     hi[1] = a.array[3];
2542 }
2543 unittest
2544 {
2545     long[2] A;
2546     long[2] B;
2547     _mm256_storeu2_m128i(cast(__m128i*)A.ptr, cast(__m128i*)B.ptr, _mm256_set1_epi64x(-42));
2548     long[2] correct = [-42, -42];
2549     assert(A == correct);
2550     assert(B == correct);
2551 }
2552 
2553 /// Store 256-bits (composed of 4 packed single-precision (64-bit) floating-point elements) from
2554 /// `a` into memory using a non-temporal memory hint. `mem_addr` must be aligned on a 32-byte 
2555 /// boundary or a general-protection exception may be generated.
2556 /// Note: non-temporal stores should be followed by `_mm_sfence()` for reader threads.
2557 void _mm256_stream_pd (double* mem_addr, __m256d a) pure @system
2558 {
2559     // PERF DMD
2560     // PERF GDC + SSE2
2561     version(LDC)
2562     {
2563         enum prefix = `!0 = !{ i32 1 }`;
2564         enum ir = `
2565             store <4 x double> %1, <4 x double>* %0, align 32, !nontemporal !0
2566             ret void`;
2567         LDCInlineIREx!(prefix, ir, "", void, double4*, double4)(cast(double4*)mem_addr, a);
2568     }   
2569     else static if (GDC_with_AVX) // any hope to be non-temporal? Using SSE2 instructions.
2570     {
2571         __builtin_ia32_movntpd256 (mem_addr, a);
2572     }
2573     else
2574     {
2575         // Regular store instead.
2576         __m256d* dest = cast(__m256d*)mem_addr;
2577         *dest = a;
2578     }
2579 }
2580 unittest
2581 {
2582     align(32) double[4] mem;
2583     double[4] correct = [5.0, -6, -7, 8];
2584     _mm256_stream_pd(mem.ptr, _mm256_setr_pd(5.0, -6, -7, 8));
2585     assert(mem == correct);
2586 }
2587 
2588 /// Store 256-bits (composed of 8 packed single-precision (32-bit) floating-point elements) from
2589 /// `a` into memory using a non-temporal memory hint. `mem_addr` must be aligned on a 32-byte 
2590 /// boundary or a general-protection exception may be generated.
2591 /// Note: non-temporal stores should be followed by `_mm_sfence()` for reader threads.
2592 void _mm256_stream_ps (float* mem_addr, __m256 a) pure @system
2593 {
2594     // PERF DMD
2595     // PERF GDC + SSE2
2596     version(LDC)
2597     {
2598         enum prefix = `!0 = !{ i32 1 }`;
2599         enum ir = `
2600             store <8 x float> %1, <8 x float>* %0, align 32, !nontemporal !0
2601             ret void`;
2602         LDCInlineIREx!(prefix, ir, "", void, float8*, float8)(cast(float8*)mem_addr, a);
2603     }   
2604     else static if (GDC_with_AVX)
2605     {
2606         __builtin_ia32_movntps256 (mem_addr, a);
2607     }
2608     else
2609     {
2610         // Regular store instead.
2611         __m256* dest = cast(__m256*)mem_addr;
2612         *dest = a;
2613     }
2614 }
2615 unittest
2616 {
2617     align(32) float[8] mem;
2618     float[8] correct = [5, -6, -7, 8, 1, 2, 3, 4];
2619     _mm256_stream_ps(mem.ptr, _mm256_setr_ps(5, -6, -7, 8, 1, 2, 3, 4));
2620     assert(mem == correct);
2621 }
2622 
2623 /// Store 256-bits of integer data from `a` into memory using a non-temporal memory hint. 
2624 /// `mem_addr` must be aligned on a 32-byte boundary or a general-protection exception may be
2625 /// generated.
2626 /// Note: there isn't any particular instruction in AVX to do that. It just defers to SSE2.
2627 /// Note: non-temporal stores should be followed by `_mm_sfence()` for reader threads.
2628 void _mm256_stream_si256 (__m256i * mem_addr, __m256i a) pure @trusted
2629 {
2630     // PERF DMD
2631     // PERF GDC
2632     version(LDC)
2633     {
2634         enum prefix = `!0 = !{ i32 1 }`;
2635         enum ir = `
2636             store <4 x i64> %1, <4 x i64>* %0, align 16, !nontemporal !0
2637             ret void`;
2638         LDCInlineIREx!(prefix, ir, "", void, long4*, long4)(mem_addr, a);
2639     }
2640     else static if (GDC_with_SSE2) // any hope to be non-temporal? Using SSE2 instructions.
2641     {
2642         long2 lo, hi;
2643         lo.ptr[0] = a.array[0];
2644         lo.ptr[1] = a.array[1];
2645         hi.ptr[0] = a.array[2];
2646         hi.ptr[1] = a.array[3];
2647         _mm_stream_si128(cast(__m128i*)mem_addr, cast(__m128i)lo);
2648         _mm_stream_si128((cast(__m128i*)mem_addr) + 1, cast(__m128i)hi);
2649     }
2650     else
2651     {
2652         // Regular store instead.
2653         __m256i* dest = cast(__m256i*)mem_addr;
2654         *dest = a;
2655     }
2656 }
2657 unittest
2658 {
2659     align(32) long[4] mem;
2660     long[4] correct = [5, -6, -7, 8];
2661     _mm256_stream_si256(cast(__m256i*)(mem.ptr), _mm256_setr_epi64x(5, -6, -7, 8));
2662     assert(mem == correct);
2663 }
2664 
2665 /// Subtract packed double-precision (64-bit) floating-point elements in `b` from 
2666 /// packed double-precision (64-bit) floating-point elements in `a`.
2667 __m256d _mm256_sub_pd (__m256d a, __m256d b) pure @safe
2668 {
2669     return a - b;
2670 }
2671 unittest
2672 {
2673     __m256d a = [1.5, -2.0, 3.0, 200000.0];
2674     a = _mm256_sub_pd(a, a);
2675     double[4] correct = [0.0, 0, 0, 0];
2676     assert(a.array == correct);
2677 }
2678 
2679 /// Subtract packed single-precision (32-bit) floating-point elements in `b` from 
2680 /// packed single-precision (32-bit) floating-point elements in `a`.
2681 __m256 _mm256_sub_ps (__m256 a, __m256 b) pure @safe
2682 {
2683     return a - b;
2684 }
2685 unittest
2686 {
2687     __m256 a = [1.5f, -2.0f, 3.0f, 1.0f, 1.5f, -2000.0f, 3.0f, 1.0f];
2688     a = _mm256_sub_ps(a, a);
2689     float[8] correct = [0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f];
2690     assert(a.array == correct);
2691 }
2692 
2693 
2694 // TODO int _mm_testc_pd (__m128d a, __m128d b)
2695 // TODO int _mm256_testc_pd (__m256d a, __m256d b)
2696 // TODO int _mm_testc_ps (__m128 a, __m128 b)
2697 // TODO int _mm256_testc_ps (__m256 a, __m256 b)
2698 // TODO int _mm256_testc_si256 (__m256i a, __m256i b)
2699 // TODO int _mm_testnzc_pd (__m128d a, __m128d b)
2700 // TODO int _mm256_testnzc_pd (__m256d a, __m256d b)
2701 // TODO int _mm_testnzc_ps (__m128 a, __m128 b)
2702 // TODO int _mm256_testnzc_ps (__m256 a, __m256 b)
2703 // TODO int _mm256_testnzc_si256 (__m256i a, __m256i b)
2704 // TODO int _mm_testz_pd (__m128d a, __m128d b)
2705 // TODO int _mm256_testz_pd (__m256d a, __m256d b)
2706 // TODO int _mm_testz_ps (__m128 a, __m128 b)
2707 // TODO int _mm256_testz_ps (__m256 a, __m256 b)
2708 // TODO int _mm256_testz_si256 (__m256i a, __m256i b)
2709 
2710 /// Return vector of type __m256d with undefined elements.
2711 __m256d _mm256_undefined_pd () pure @safe
2712 {
2713     __m256d r = void;
2714     return r;
2715 }
2716 
2717 /// Return vector of type __m256 with undefined elements.
2718 __m256 _mm256_undefined_ps () pure @safe
2719 {
2720     __m256 r = void;
2721     return r;
2722 }
2723 
2724 /// Return vector of type __m256i with undefined elements.
2725 __m256i _mm256_undefined_si256 () pure @safe
2726 {
2727     __m256i r = void;
2728     return r;
2729 }
2730 
2731 /// Unpack and interleave double-precision (64-bit) floating-point elements from the high half of 
2732 /// each 128-bit lane in `a` and `b`.
2733 __m256d _mm256_unpackhi_pd (__m256d a, __m256d b) pure @trusted
2734 {
2735     version(LDC)
2736     {
2737         return shufflevectorLDC!(double4, 1, 5, 3, 7)(a, b);
2738     }
2739     else static if (GDC_with_AVX)
2740     {
2741         return __builtin_ia32_unpckhpd256 (a, b);
2742     }
2743     else
2744     {
2745         __m256d r;
2746         r.ptr[0] = a.array[1];
2747         r.ptr[1] = b.array[1];
2748         r.ptr[2] = a.array[3];
2749         r.ptr[3] = b.array[3];
2750         return r;
2751     } 
2752 }
2753 unittest
2754 {
2755     __m256d A = _mm256_setr_pd(1.0, 2, 3, 4);
2756     __m256d B = _mm256_setr_pd(5.0, 6, 7, 8);
2757     __m256d C = _mm256_unpackhi_pd(A, B);
2758     double[4] correct =       [2.0, 6, 4, 8];
2759     assert(C.array == correct);
2760 }
2761 
2762 
2763 /// Unpack and interleave double-precision (64-bit) floating-point elements from the high half of 
2764 /// each 128-bit lane in `a` and `b`.
2765 __m256 _mm256_unpackhi_ps (__m256 a, __m256 b) pure @trusted
2766 {
2767     version(LDC)
2768     {
2769         return shufflevectorLDC!(float8, 2, 10, 3, 11, 6, 14, 7, 15)(a, b);
2770     }
2771     else static if (GDC_with_AVX)
2772     {
2773         return __builtin_ia32_unpckhps256 (a, b);
2774     }
2775     else
2776     {
2777         __m256 r;
2778         r.ptr[0] = a.array[2];
2779         r.ptr[1] = b.array[2];
2780         r.ptr[2] = a.array[3];
2781         r.ptr[3] = b.array[3];
2782         r.ptr[4] = a.array[6];
2783         r.ptr[5] = b.array[6];
2784         r.ptr[6] = a.array[7];
2785         r.ptr[7] = b.array[7];
2786         return r;
2787     } 
2788 }
2789 unittest
2790 {
2791     __m256 A = _mm256_setr_ps(0.0f,  1,  2,  3,  4,  5,  6,  7);
2792     __m256 B = _mm256_setr_ps(8.0f,  9, 10, 11, 12, 13, 14, 15);
2793     __m256 C = _mm256_unpackhi_ps(A, B);
2794     float[8] correct =       [2.0f, 10,  3, 11,  6, 14,  7, 15];
2795     assert(C.array == correct);
2796 }
2797 
2798 /// Unpack and interleave double-precision (64-bit) floating-point elements from the low half of 
2799 /// each 128-bit lane in `a` and `b`.
2800 __m256d _mm256_unpacklo_pd (__m256d a, __m256d b)
2801 {
2802     version(LDC)
2803     {
2804         return shufflevectorLDC!(double4, 0, 4, 2, 6)(a, b);
2805     }
2806     else static if (GDC_with_AVX)
2807     {
2808         return __builtin_ia32_unpcklpd256 (a, b);
2809     }
2810     else
2811     {
2812         __m256d r;
2813         r.ptr[0] = a.array[0];
2814         r.ptr[1] = b.array[0];
2815         r.ptr[2] = a.array[2];
2816         r.ptr[3] = b.array[2];
2817         return r;        
2818     } 
2819 }
2820 unittest
2821 {
2822     __m256d A = _mm256_setr_pd(1.0, 2, 3, 4);
2823     __m256d B = _mm256_setr_pd(5.0, 6, 7, 8);
2824     __m256d C = _mm256_unpacklo_pd(A, B);
2825     double[4] correct =       [1.0, 5, 3, 7];
2826     assert(C.array == correct);
2827 }
2828 
2829 /// Unpack and interleave single-precision (32-bit) floating-point elements from the low half of
2830 /// each 128-bit lane in `a` and `b`.
2831 __m256 _mm256_unpacklo_ps (__m256 a, __m256 b)
2832 {
2833     version(LDC)
2834     {
2835         return shufflevectorLDC!(float8, 0, 8, 1, 9, 4, 12, 5, 13)(a, b);
2836     }
2837     else static if (GDC_with_AVX)
2838     {
2839         return __builtin_ia32_unpcklps256 (a, b);
2840     }
2841     else
2842     {
2843         __m256 r;
2844         r.ptr[0] = a.array[0];
2845         r.ptr[1] = b.array[0];
2846         r.ptr[2] = a.array[1];
2847         r.ptr[3] = b.array[1];
2848         r.ptr[4] = a.array[4];
2849         r.ptr[5] = b.array[4];
2850         r.ptr[6] = a.array[5];
2851         r.ptr[7] = b.array[5];
2852         return r;        
2853     } 
2854 }
2855 unittest
2856 {
2857     __m256 A = _mm256_setr_ps(0.0f,  1,  2,  3,  4,  5,  6,  7);
2858     __m256 B = _mm256_setr_ps(8.0f,  9, 10, 11, 12, 13, 14, 15);
2859     __m256 C = _mm256_unpacklo_ps(A, B);
2860     float[8] correct =       [0.0f,  8,  1,  9,  4, 12,  5, 13];
2861     assert(C.array == correct);
2862 }
2863 
2864 /// Compute the bitwise XOR of packed double-precision (64-bit) floating-point elements in `a` and `b`.
2865 __m256d _mm256_xor_pd (__m256d a, __m256d b) pure @safe
2866 {
2867     return cast(__m256d)( cast(__m256i)a ^ cast(__m256i)b );
2868 }
2869 
2870 /// Compute the bitwise XOR of packed single-precision (32-bit) floating-point elements in `a` and `b`.
2871 __m256 _mm256_xor_ps (__m256 a, __m256 b) pure @safe
2872 {
2873     return cast(__m256)( cast(__m256i)a ^ cast(__m256i)b );
2874 }
2875 
2876 void _mm256_zeroall () pure @safe
2877 {
2878     // PERF: DMD needs to do it explicitely if AVX is ever used.
2879 
2880     static if (GDC_with_AVX)
2881     {
2882         __builtin_ia32_vzeroall();
2883     }
2884     else
2885     {
2886         // Do nothing. The transitions penalty are supposed handled by the backend.
2887     }
2888 }
2889 
2890 void _mm256_zeroupper () pure @safe
2891 {
2892     // PERF: DMD needs to do it explicitely if AVX is ever used.
2893 
2894     static if (GDC_with_AVX)
2895     {
2896         __builtin_ia32_vzeroupper();
2897     }
2898     else
2899     {
2900         // Do nothing. The transitions penalty are supposed handled by the backend.
2901     }
2902     
2903 }
2904 
2905 /// Cast vector of type `__m128d` to type `__m256d`; the upper 128 bits of the result are zeroed.
2906 __m256d _mm256_zextpd128_pd256 (__m128d a) pure @trusted
2907 {
2908     __m256d r;
2909     r.ptr[0] = a.array[0];
2910     r.ptr[1] = a.array[1];
2911     r.ptr[2] = 0;
2912     r.ptr[3] = 0;
2913     return r;
2914 }
2915 unittest
2916 {
2917     __m256d R = _mm256_zextpd128_pd256(_mm_setr_pd(2.0, -3.0));
2918     double[4] correct = [2.0, -3, 0, 0];
2919     assert(R.array == correct);
2920 }
2921 
2922 /// Cast vector of type `__m128` to type `__m256`; the upper 128 bits of the result are zeroed.
2923 __m256 _mm256_zextps128_ps256 (__m128 a) pure @trusted
2924 {
2925     double2 la = cast(double2)a;
2926     double4 r;
2927     r.ptr[0] = la.array[0];
2928     r.ptr[1] = la.array[1];
2929     r.ptr[2] = 0;
2930     r.ptr[3] = 0;
2931     return cast(__m256)r;
2932 }
2933 unittest
2934 {
2935     __m256 R = _mm256_zextps128_ps256(_mm_setr_ps(2.0, -3.0, 4, -5));
2936     float[8] correct = [2.0, -3, 4, -5, 0, 0, 0, 0];
2937     assert(R.array == correct);
2938 }
2939 
2940 /// Cast vector of type `__m128i` to type `__m256i`; the upper 128 bits of the result are zeroed. 
2941 __m256i _mm256_zextsi128_si256 (__m128i a) pure @trusted
2942 {
2943     long2 la = cast(long2)a;
2944     __m256i r;
2945     r.ptr[0] = la.array[0];
2946     r.ptr[1] = la.array[1];
2947     r.ptr[2] = 0;
2948     r.ptr[3] = 0;
2949     return r;
2950 }
2951 unittest
2952 {
2953     __m256i R = _mm256_zextsi128_si256(_mm_setr_epi64(-1, 99));
2954     long[4] correct = [-1, 99, 0, 0];
2955     assert(R.array == correct);
2956 }
2957 
2958 /+
2959 
2960 
2961 pragma(LDC_intrinsic, "llvm.x86.avx.cvtt.pd2dq.256")
2962     int4 __builtin_ia32_cvttpd2dq256(double4) pure @safe;
2963 
2964 pragma(LDC_intrinsic, "llvm.x86.avx.cvtt.ps2dq.256")
2965     int8 __builtin_ia32_cvttps2dq256(float8) pure @safe;
2966 
2967 pragma(LDC_intrinsic, "llvm.x86.avx.hadd.pd.256")
2968     double4 __builtin_ia32_haddpd256(double4, double4) pure @safe;
2969 
2970 pragma(LDC_intrinsic, "llvm.x86.avx.hadd.ps.256")
2971     float8 __builtin_ia32_haddps256(float8, float8) pure @safe;
2972 
2973 pragma(LDC_intrinsic, "llvm.x86.avx.hsub.pd.256")
2974     double4 __builtin_ia32_hsubpd256(double4, double4) pure @safe;
2975 
2976 pragma(LDC_intrinsic, "llvm.x86.avx.hsub.ps.256")
2977     float8 __builtin_ia32_hsubps256(float8, float8) pure @safe;
2978 
2979 
2980 pragma(LDC_intrinsic, "llvm.x86.avx.maskload.pd")
2981     double2 __builtin_ia32_maskloadpd(const void*, long2);
2982 
2983 pragma(LDC_intrinsic, "llvm.x86.avx.maskload.pd.256")
2984     double4 __builtin_ia32_maskloadpd256(const void*, long4);
2985 
2986 pragma(LDC_intrinsic, "llvm.x86.avx.maskload.ps")
2987     float4 __builtin_ia32_maskloadps(const void*, int4);
2988 
2989 pragma(LDC_intrinsic, "llvm.x86.avx.maskload.ps.256")
2990     float8 __builtin_ia32_maskloadps256(const void*, int8);
2991 
2992 pragma(LDC_intrinsic, "llvm.x86.avx.maskstore.pd")
2993     void __builtin_ia32_maskstorepd(void*, long2, double2);
2994 
2995 pragma(LDC_intrinsic, "llvm.x86.avx.maskstore.pd.256")
2996     void __builtin_ia32_maskstorepd256(void*, long4, double4);
2997 
2998 pragma(LDC_intrinsic, "llvm.x86.avx.maskstore.ps")
2999     void __builtin_ia32_maskstoreps(void*, int4, float4);
3000 
3001 pragma(LDC_intrinsic, "llvm.x86.avx.maskstore.ps.256")
3002     void __builtin_ia32_maskstoreps256(void*, int8, float8);
3003 
3004 
3005 
3006 pragma(LDC_intrinsic, "llvm.x86.avx.movmsk.pd.256")
3007     int __builtin_ia32_movmskpd256(double4) pure @safe;
3008 
3009 pragma(LDC_intrinsic, "llvm.x86.avx.movmsk.ps.256")
3010     int __builtin_ia32_movmskps256(float8) pure @safe;
3011 
3012 pragma(LDC_intrinsic, "llvm.x86.avx.ptestc.256")
3013     int __builtin_ia32_ptestc256(long4, long4) pure @safe;
3014 
3015 pragma(LDC_intrinsic, "llvm.x86.avx.ptestnzc.256")
3016     int __builtin_ia32_ptestnzc256(long4, long4) pure @safe;
3017 
3018 pragma(LDC_intrinsic, "llvm.x86.avx.ptestz.256")
3019     int __builtin_ia32_ptestz256(long4, long4) pure @safe;
3020 
3021 pragma(LDC_intrinsic, "llvm.x86.avx.rcp.ps.256")
3022     float8 __builtin_ia32_rcpps256(float8) pure @safe;
3023 
3024 pragma(LDC_intrinsic, "llvm.x86.avx.round.pd.256")
3025     double4 __builtin_ia32_roundpd256(double4, int) pure @safe;
3026 
3027 pragma(LDC_intrinsic, "llvm.x86.avx.round.ps.256")
3028     float8 __builtin_ia32_roundps256(float8, int) pure @safe;
3029 
3030 pragma(LDC_intrinsic, "llvm.x86.avx.rsqrt.ps.256")
3031     float8 __builtin_ia32_rsqrtps256(float8) pure @safe;
3032 
3033 pragma(LDC_intrinsic, "llvm.x86.avx.vpermilvar.pd")
3034     double2 __builtin_ia32_vpermilvarpd(double2, long2) pure @safe;
3035 
3036 pragma(LDC_intrinsic, "llvm.x86.avx.vpermilvar.pd.256")
3037     double4 __builtin_ia32_vpermilvarpd256(double4, long4) pure @safe;
3038 
3039 pragma(LDC_intrinsic, "llvm.x86.avx.vpermilvar.ps")
3040     float4 __builtin_ia32_vpermilvarps(float4, int4) pure @safe;
3041 
3042 pragma(LDC_intrinsic, "llvm.x86.avx.vpermilvar.ps.256")
3043     float8 __builtin_ia32_vpermilvarps256(float8, int8) pure @safe;
3044 
3045 pragma(LDC_intrinsic, "llvm.x86.avx.vtestc.pd")
3046     int __builtin_ia32_vtestcpd(double2, double2) pure @safe;
3047 
3048 pragma(LDC_intrinsic, "llvm.x86.avx.vtestc.pd.256")
3049     int __builtin_ia32_vtestcpd256(double4, double4) pure @safe;
3050 
3051 pragma(LDC_intrinsic, "llvm.x86.avx.vtestc.ps")
3052     int __builtin_ia32_vtestcps(float4, float4) pure @safe;
3053 
3054 pragma(LDC_intrinsic, "llvm.x86.avx.vtestc.ps.256")
3055     int __builtin_ia32_vtestcps256(float8, float8) pure @safe;
3056 
3057 pragma(LDC_intrinsic, "llvm.x86.avx.vtestnzc.pd")
3058     int __builtin_ia32_vtestnzcpd(double2, double2) pure @safe;
3059 
3060 pragma(LDC_intrinsic, "llvm.x86.avx.vtestnzc.pd.256")
3061     int __builtin_ia32_vtestnzcpd256(double4, double4) pure @safe;
3062 
3063 pragma(LDC_intrinsic, "llvm.x86.avx.vtestnzc.ps")
3064     int __builtin_ia32_vtestnzcps(float4, float4) pure @safe;
3065 
3066 pragma(LDC_intrinsic, "llvm.x86.avx.vtestnzc.ps.256")
3067     int __builtin_ia32_vtestnzcps256(float8, float8) pure @safe;
3068 
3069 pragma(LDC_intrinsic, "llvm.x86.avx.vtestz.pd")
3070     int __builtin_ia32_vtestzpd(double2, double2) pure @safe;
3071 
3072 pragma(LDC_intrinsic, "llvm.x86.avx.vtestz.pd.256")
3073     int __builtin_ia32_vtestzpd256(double4, double4) pure @safe;
3074 
3075 pragma(LDC_intrinsic, "llvm.x86.avx.vtestz.ps")
3076     int __builtin_ia32_vtestzps(float4, float4) pure @safe;
3077 
3078 pragma(LDC_intrinsic, "llvm.x86.avx.vtestz.ps.256")
3079     int __builtin_ia32_vtestzps256(float8, float8) pure @safe;
3080 
3081 +/