1 /**
2 * AVX2 intrinsics.
3 * https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#techs=AVX2
4 *
5 * Copyright: Guillaume Piolat 2022-2025.
6 *            Johan Engelen 2022.
7 *            cet 2024.
8 * License:   $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
9 */
10 module inteli.avx2intrin;
11 
12 // AVX2 instructions
13 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#techs=AVX2
14 // Note: this header will work whether you have AVX2 enabled or not.
15 // With LDC, use "dflags-ldc": ["-mattr=+avx2"] or equivalent to actively
16 // generate AVX2 instructions.
17 // With GDC, use "dflags-gdc": ["-mavx2"] or equivalent to actively
18 // generate AVX2 instructions.
19 
20 
// Note: many special cases for GDC, because when supporting SIMD_COMPARISON_MASKS_32B but not having AVX2,
// the replaced operators have terrible performance. Mostly a problem for -mavx on x86
23 
24 public import inteli.types;
25 import inteli.internals;
26 
27 // Pull in all previous instruction set intrinsics.
28 public import inteli.avxintrin;
29 
30 nothrow @nogc:
31 
/// Compute the absolute value of packed signed 16-bit integers in `a`.
/// Note: like the hardware instruction, abs(-32768) remains -32768 (see unittest).
__m256i _mm256_abs_epi16 (__m256i a) @trusted
{
    // PERF DMD
    version(LDC)
        enum split = true; // always beneficial in LDC neon, ssse3, or even sse2
    else
        enum split = GDC_with_SSSE3;

    static if (GDC_with_AVX2)
    {
        // Single vpabsw instruction.
        return cast(__m256i) __builtin_ia32_pabsw256(cast(short16)a);
    }
    else static if (__VERSION__ >= 2097 && LDC_with_AVX2)
    {
        // Before LDC 1.27 llvm.abs LLVM intrinsic didn't exist, and hence 
        // no good way to do abs(256-bit)
        return cast(__m256i) inteli_llvm_abs!short16(cast(short16)a, false);
    }    
    else static if (split)
    {
        // Process each 128-bit half independently with the SSSE3-level intrinsic.
        __m128i a_lo = _mm256_extractf128_si256!0(a);
        __m128i a_hi = _mm256_extractf128_si256!1(a);
        __m128i r_lo = _mm_abs_epi16(a_lo);
        __m128i r_hi = _mm_abs_epi16(a_hi);
        return _mm256_set_m128i(r_hi, r_lo);
    }    
    else
    {        
        // Scalar fallback; negation goes through int to avoid UB-ish short overflow,
        // then truncates back (so -32768 wraps to -32768, matching hardware).
        short16 sa = cast(short16)a;
        for (int i = 0; i < 16; ++i)
        {
            short s = sa.array[i];
            sa.ptr[i] = s >= 0 ? s : cast(short)(-cast(int)(s));
        }  
        return cast(__m256i)sa;
    }
}
unittest
{
    __m256i A = _mm256_setr_epi16(0, -1, -32768, 32767, 10, -10, 1000, -1000,
                                  1, -1, -32768, 32767, 12, -13, 1000, -1040);
    short16 B = cast(short16) _mm256_abs_epi16(A);
    short[16] correct = [0, 1, -32768, 32767, 10, 10, 1000, 1000,
                         1, 1, -32768, 32767, 12, 13, 1000, 1040];
    assert(B.array == correct);
}
79 
/// Compute the absolute value of packed signed 32-bit integers in `a`.
/// Note: like the hardware instruction, abs(int.min) remains int.min (see unittest).
__m256i _mm256_abs_epi32 (__m256i a) @trusted
{
    // PERF DMD
    version(LDC)
        enum split = true; // always beneficial in LDC neon, ssse3, or even sse2
    else
        enum split = false; // GDC manages to split and use pabsd in SSSE3 without guidance

    static if (GDC_with_AVX2)
    {
        // Single vpabsd instruction.
        return cast(__m256i) __builtin_ia32_pabsd256(cast(int8)a);
    }
    else static if (__VERSION__ >= 2097 && LDC_with_AVX2)
    {
        // Before LDC 1.27 llvm.abs LLVM intrinsic didn't exist, and hence 
        // no good way to do abs(256-bit)
        return cast(__m256i) inteli_llvm_abs!int8(cast(int8)a, false);
    }
    else static if (split)
    {
        // Process each 128-bit half independently with the SSSE3-level intrinsic.
        __m128i a_lo = _mm256_extractf128_si256!0(a);
        __m128i a_hi = _mm256_extractf128_si256!1(a);
        __m128i r_lo = _mm_abs_epi32(a_lo);
        __m128i r_hi = _mm_abs_epi32(a_hi);
        return _mm256_set_m128i(r_hi, r_lo);
    }
    else
    {
        // Scalar fallback; -int.min wraps back to int.min, matching hardware.
        int8 sa = cast(int8)a;
        for (int i = 0; i < 8; ++i)
        {
            int s = sa.array[i];
            sa.ptr[i] = (s >= 0 ? s : -s);
        }
        return cast(__m256i)sa;
    }
}
unittest
{
    __m256i A = _mm256_setr_epi32(0, -1, -2_147_483_648, -2_147_483_647, -1, 0, -2_147_483_648, -2_147_483_646);
    int8 B = cast(int8) _mm256_abs_epi32(A);
    int[8] correct = [0, 1, -2_147_483_648, 2_147_483_647, 1, 0, -2_147_483_648, 2_147_483_646];
    assert(B.array == correct);
}
125 
/// Compute the absolute value of packed signed 8-bit integers in `a`.
/// Note: like the hardware instruction, abs(-128) remains -128 (see unittest).
__m256i _mm256_abs_epi8 (__m256i a) @trusted
{
    // PERF DMD
    // PERF GDC in SSSE3 to AVX doesn't use pabsb and split is catastrophic because of _mm_min_epu8
    version(LDC)
        enum split = true; // always beneficial in LDC neon, ssse3, sse2
    else
        enum split = false;

    static if (GDC_with_AVX2)
    {
        // Single vpabsb instruction.
        return cast(__m256i) __builtin_ia32_pabsb256(cast(ubyte32)a);
    }
    else static if (__VERSION__ >= 2097 && LDC_with_AVX2)
    {
        // Before LDC 1.27 llvm.abs LLVM intrinsic didn't exist, and hence 
        // no good way to do abs(256-bit)
        return cast(__m256i) inteli_llvm_abs!byte32(cast(byte32)a, false);
    }
    else static if (split)
    {
        // Process each 128-bit half independently with the SSSE3-level intrinsic.
        __m128i a_lo = _mm256_extractf128_si256!0(a);
        __m128i a_hi = _mm256_extractf128_si256!1(a);
        __m128i r_lo = _mm_abs_epi8(a_lo);
        __m128i r_hi = _mm_abs_epi8(a_hi);
        return _mm256_set_m128i(r_hi, r_lo);
    }
    else
    {
        // Basically this loop is poison for LDC optimizer
        // Scalar fallback; negation through int then truncation means -128 wraps to -128.
        byte32 sa = cast(byte32)a;
        for (int i = 0; i < 32; ++i)
        {
            byte s = sa.array[i];
            sa.ptr[i] = s >= 0 ? s : cast(byte)(-cast(int)(s));
        }
        return cast(__m256i)sa;
    }
}
unittest
{
    __m256i A = _mm256_setr_epi8(0, -1, -128, -127, 127,  0,  0,  0,  0,  0, 0, 0, 0, 0, 0, 0,
                                 0, -1, -128, -126, 127, -6, -5, -4, -3, -2, 0, 1, 2, 3, 4, 5);
    byte32 B = cast(byte32) _mm256_abs_epi8(A);
    byte[32] correct =          [0,  1, -128,  127, 127,  0,  0,  0,  0,  0, 0, 0, 0, 0, 0, 0,
                                 0,  1, -128,  126, 127,  6,  5,  4,  3,  2, 0, 1, 2, 3, 4, 5];
    assert(B.array == correct);
}
175 
/// Add packed 16-bit integers in `a` and `b` (wrapping on overflow).
__m256i _mm256_add_epi16 (__m256i a, __m256i b) pure @safe
{
    pragma(inline, true);
    // Reinterpret both operands as 16 lanes of short, then use the
    // language-level vector addition, which wraps like paddw.
    short16 sa = cast(short16)a;
    short16 sb = cast(short16)b;
    return cast(__m256i)(sa + sb);
}
unittest
{
    __m256i v = _mm256_setr_epi16( -7, -1, 0, 9, -100, 100, 234, 432, -32768, 32767, 0, -1, -20000, 0,  6, -2);
    short16 doubled = cast(short16) _mm256_add_epi16(v, v);
    short[16] expected = [ -14, -2, 0, 18, -200, 200, 468, 864, 0, -2, 0, -2, 25536, 0, 12, -4 ];
    assert(doubled.array == expected);
}
189 
/// Add packed 32-bit integers in `a` and `b` (wrapping on overflow).
__m256i _mm256_add_epi32(__m256i a, __m256i b) pure @safe
{
    pragma(inline, true);
    // Lane-wise int addition via the language-level vector operator.
    int8 sa = cast(int8)a;
    int8 sb = cast(int8)b;
    return cast(__m256i)(sa + sb);
}
unittest
{
    __m256i v = _mm256_setr_epi32( -7, -1, 0, 9, -100, 100, 234, 432);
    int8 doubled = cast(int8) _mm256_add_epi32(v, v);
    int[8] expected = [ -14, -2, 0, 18, -200, 200, 468, 864 ];
    assert(doubled.array == expected);
}
203 
/// Add packed 64-bit integers in `a` and `b` (wrapping on overflow).
__m256i _mm256_add_epi64 (__m256i a, __m256i b) pure @safe
{
    pragma(inline, true);
    // __m256i is already a vector of 64-bit lanes, so plain vector
    // addition is the whole operation.
    __m256i sum = a + b;
    return sum;
}
unittest
{
    __m256i v = _mm256_setr_epi64(-1, 0x8000_0000_0000_0000, 42, -12);
    long4 doubled = cast(long4) _mm256_add_epi64(v, v);
    long[4] expected = [ -2, 0, 84, -24 ];
    assert(doubled.array == expected);
}
217 
/// Add packed 8-bit integers in `a` and `b` (wrapping on overflow).
__m256i _mm256_add_epi8 (__m256i a, __m256i b) pure @safe
{
    pragma(inline, true);
    // Reinterpret as 32 lanes of byte and add lane-wise, wrapping like paddb.
    byte32 sa = cast(byte32)a;
    byte32 sb = cast(byte32)b;
    return cast(__m256i)(sa + sb);
}
unittest
{
    __m256i v = _mm256_setr_epi8(4, 8, 13, -7, -1, 0, 9, 77, 4, 8, 13, -7, -1, 0, 9, 78,
                                 4, 9, 13, -7, -1, 0, 9, 77, 4, 8, 13, -7, -2, 0, 10, 78);
    byte32 doubled = cast(byte32) _mm256_add_epi8(v, v);
    byte[32] expected = [8, 16, 26, -14, -2, 0, 18, -102, 8, 16, 26, -14, -2, 0, 18, -100,
                         8, 18, 26, -14, -2, 0, 18, -102, 8, 16, 26, -14, -4, 0, 20, -100];
    assert(doubled.array == expected);
}
233 
/// Add packed 16-bit signed integers in `a` and `b` using signed saturation
/// (results clamp to [-32768, 32767], see unittest).
__m256i _mm256_adds_epi16 (__m256i a, __m256i b) pure @trusted
{
    // PERF DMD
    static if (GDC_with_AVX2)
    {
        // Single vpaddsw instruction.
        return cast(__m256i) __builtin_ia32_paddsw256(cast(short16)a, cast(short16)b);
    }
    else static if(LDC_with_saturated_intrinsics)
    {
        return cast(__m256i) inteli_llvm_adds!short16(cast(short16)a, cast(short16)b);
    }
    else
    {
        // Scalar fallback: widen each sum to int, then clamp back to short range.
        short16 r;
        short16 sa = cast(short16)a;
        short16 sb = cast(short16)b;
        foreach(i; 0..16)
            r.ptr[i] = saturateSignedIntToSignedShort(sa.array[i] + sb.array[i]);
        return cast(__m256i)r;
    }
}
unittest
{
    short16 res = cast(short16) _mm256_adds_epi16(_mm256_setr_epi16( 7,  6,  5, -32768, 3, 3, 32767,   0,  7,  6,  5, -32768, 3, 3, 32767,   0),
                                                  _mm256_setr_epi16( 7,  6,  5, -30000, 3, 1,     1, -10,  7,  6,  5, -30000, 3, 1,     1, -10));
    static immutable short[16] correctResult                    =  [14, 12, 10, -32768, 6, 4, 32767, -10, 14, 12, 10, -32768, 6, 4, 32767, -10];
    assert(res.array == correctResult);
}
263 
/// Add packed 8-bit signed integers in `a` and `b` using signed saturation
/// (results clamp to [-128, 127], see unittest).
__m256i _mm256_adds_epi8 (__m256i a, __m256i b) pure @trusted
{
    // PERF DMD
    static if (GDC_with_AVX2)
    {
        // Single vpaddsb instruction.
        return cast(__m256i) __builtin_ia32_paddsb256(cast(ubyte32)a, cast(ubyte32)b);
    }
    else static if(LDC_with_saturated_intrinsics)
    {
        return cast(__m256i) inteli_llvm_adds!byte32(cast(byte32)a, cast(byte32)b);
    }
    else
    {
        // Scalar fallback: widen each sum, then clamp back to byte range.
        byte32 r;
        byte32 sa = cast(byte32)a;
        byte32 sb = cast(byte32)b;
        foreach(i; 0..32)
            r.ptr[i] = saturateSignedWordToSignedByte(sa.array[i] + sb.array[i]);
        return cast(__m256i)r;
    }
}
unittest
{
    byte32 res = cast(byte32) _mm256_adds_epi8(_mm256_setr_epi8(15, 14, 13, 12, 11, 127, 9, 8, 7, 6, 5, -128, 3, 2, 1, 0, 15, 14, 13, 12, 11, 127, 9, 8, 7, 6, 5, -128, 3, 2, 1, 0),
                                               _mm256_setr_epi8(15, 14, 13, 12, 11,  10, 9, 8, 7, 6, 5,   -4, 3, 2, 1, 0, 15, 14, 13, 12, 11,  10, 9, 8, 7, 6, 5,   -4, 3, 2, 1, 0));
    static immutable byte[32] correctResult                  = [30, 28, 26, 24, 22, 127,18,16,14,12,10, -128, 6, 4, 2, 0, 30, 28, 26, 24, 22, 127,18,16,14,12,10, -128, 6, 4, 2, 0]; 
    assert(res.array == correctResult);
}
293 
/// Add packed 16-bit unsigned integers in `a` and `b` using unsigned saturation
/// (results clamp to 65535).
__m256i _mm256_adds_epu16 (__m256i a, __m256i b) pure @trusted
{
    // PERF DMD
    static if (GDC_with_AVX2)
    {
        // Single vpaddusw instruction.
        return cast(__m256i) __builtin_ia32_paddusw256(cast(short16)a, cast(short16)b);
    }
    else static if(LDC_with_saturated_intrinsics)
    {
        return cast(__m256i) inteli_llvm_addus!short16(cast(short16)a, cast(short16)b);
    }
    else
    {
        // Scalar fallback: reinterpret lanes as ushort, add in int width, clamp.
        short16 r;
        short16 sa = cast(short16)a;
        short16 sb = cast(short16)b;
        foreach(i; 0..16)
            r.ptr[i] = saturateSignedIntToUnsignedShort(cast(ushort)(sa.array[i]) + cast(ushort)(sb.array[i]));
        return cast(__m256i)r;
    }
}
unittest
{
    short16 res = cast(short16) _mm256_adds_epu16(_mm256_set_epi16(3, 2, cast(short)65535, 0, 3, 2, cast(short)65535, 0, 3, 2, cast(short)65535, 0, 3, 2, cast(short)65535, 0),
                                             _mm256_set_epi16(3, 2, 1, 0, 3, 2, 1, 0, 3, 2, 1, 0, 3, 2, 1, 0));
    static immutable short[16] correctResult = [0, cast(short)65535, 4, 6, 0, cast(short)65535, 4, 6, 0, cast(short)65535, 4, 6, 0, cast(short)65535, 4, 6];
    assert(res.array == correctResult);
}
323 
/// Add packed 8-bit unsigned integers in `a` and `b` using unsigned saturation
/// (results clamp to 255).
__m256i _mm256_adds_epu8 (__m256i a, __m256i b) pure @trusted
{
    // PERF DMD
    static if (GDC_with_AVX2)
    {
        // Single vpaddusb instruction.
        return cast(__m256i) __builtin_ia32_paddusb256(cast(ubyte32)a, cast(ubyte32)b);
    }
    else static if(LDC_with_saturated_intrinsics)
    {
        return cast(__m256i) inteli_llvm_addus!byte32(cast(byte32)a, cast(byte32)b);
    }
    else
    {
        // Scalar fallback: reinterpret lanes as ubyte, add in int width, clamp.
        byte32 r;
        byte32 sa = cast(byte32)a;
        byte32 sb = cast(byte32)b;
        foreach(i; 0..32)
            r.ptr[i] = saturateSignedWordToUnsignedByte(cast(ubyte)(sa.array[i]) + cast(ubyte)(sb.array[i]));
        return cast(__m256i)r;
    }
}
unittest
{
    __m256i A          = _mm256_setr_epi8(0, 0, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, cast(byte)255, 0, 0, 0, 0, 0, 0, 0, 0, cast(byte)136, 0, 0, 0, cast(byte)136, 0, 0, 0, 0, 0, 0);
    __m256i B          = _mm256_setr_epi8(0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0,             1, 0, 0, 0, 0, 0, 0, 0, 0, cast(byte)136, 0, 0, 0,            40, 0, 0, 0, 0, 0, 0);
    byte32 R = cast(byte32) _mm256_adds_epu8(A, B);
    static immutable byte[32] correct =  [0, 0, 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, cast(byte)255, 0, 0, 0, 0, 0, 0, 0, 0, cast(byte)255, 0, 0, 0, cast(byte)176, 0, 0, 0, 0, 0, 0];
    assert(R.array == correct);
}
354 
/// Concatenate pairs of 16-byte blocks in `a` and `b` into a 32-byte temporary result, shift the 
/// result right by `imm8` bytes, and return the low 16 bytes of that in each lane.
/// Counts of 32 and above yield all-zero lanes (see unittest).
__m256i _mm256_alignr_epi8(ubyte count)(__m256i a, __m256i b) pure @trusted
{

    // PERF DMD
    static if (GDC_with_AVX2)
    {
        // The builtin takes a *bit* count, hence count * 8.
        // NOTE(review): for count > 31 this passes an immediate >= 256 — presumably the
        // builtin still zeroes the result like the split path; confirm on GDC CI.
        return cast(__m256i)__builtin_ia32_palignr256(a, b, count * 8);
    }
    else
    {
        // Note that palignr 256-bit does the same as palignr 128-bit by lane. Can split.
        // With LDC 1.24 + avx2 feature + -02, that correctly gives a AVX2 vpalignr despite being split.
        // I guess we could do it with a big 32-items shufflevector but not sure if best.
        // 2 inst on ARM64 neon, which is optimal.
        __m128i a_lo = _mm256_extractf128_si256!0(a);
        __m128i a_hi = _mm256_extractf128_si256!1(a);
        __m128i b_lo = _mm256_extractf128_si256!0(b);
        __m128i b_hi = _mm256_extractf128_si256!1(b);
        __m128i r_lo = _mm_alignr_epi8!count(a_lo, b_lo);
        __m128i r_hi = _mm_alignr_epi8!count(a_hi, b_hi);
        return _mm256_set_m128i(r_hi, r_lo);   
    }
}
unittest
{
    __m128i A = _mm_setr_epi8( 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16);
    __m128i B = _mm_setr_epi8(17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32);
    __m256i AA = _mm256_set_m128i(A, A);
    __m256i BB = _mm256_set_m128i(B, B);

    {
        byte32 C = cast(byte32) _mm256_alignr_epi8!0(AA, BB);
        byte[32] correct = [17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32];
        assert(C.array == correct);
    }
    {
        byte32 C = cast(byte32) _mm256_alignr_epi8!20(AA, BB);
        byte[32] correct = [5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 0, 0, 0, 0, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 0, 0, 0, 0];
        assert(C.array == correct);
    }
    {
        byte32 C = cast(byte32) _mm256_alignr_epi8!34(AA, BB);
        byte[32] correct = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0];
        assert(C.array == correct);
    }
}
403 
/// Compute the bitwise AND of 256 bits (representing integer data) in `a` and `b`.
__m256i _mm256_and_si256 (__m256i a, __m256i b) pure @safe
{
    pragma(inline, true);
    // Plain vector AND; operand order is irrelevant for AND.
    return b & a;
}
unittest
{
    __m256i x = _mm256_set1_epi32(7);
    __m256i y = _mm256_set1_epi32(14);
    int8 anded = cast(int8) _mm256_and_si256(x, y);
    int[8] expected = [6, 6, 6, 6, 6, 6, 6, 6];
    assert(anded.array == expected);
}
418 
/// Compute the bitwise NOT of 256 bits (representing integer data) in `a` and then AND with `b`.
__m256i _mm256_andnot_si256 (__m256i a, __m256i b) pure @safe
{
    // See: https://issues.dlang.org/show_bug.cgi?id=24283, 
    // need workaround if we ever use DMD AVX codegen

    pragma(inline, true);
    // b & ~a, i.e. keep the bits of b not set in a (vpandn semantics).
    return b & ~a;
}
unittest
{
    __m256i maskSrc = _mm256_setr_epi32(7, -2, 9, 54654, 7, -2, 9, 54654);
    __m256i data    = _mm256_setr_epi32(14, 78, 111, -256, 14, 78, 111, -256);
    int8 result = cast(int8) _mm256_andnot_si256(maskSrc, data);
    int[8] expected = [8, 0, 102, -54784, 8, 0, 102, -54784];
    assert(result.array == expected);
}
436 
/// Average packed unsigned 16-bit integers in `a` and `b`,
/// i.e. (a + b + 1) >> 1 per lane, treating lanes as unsigned.
__m256i _mm256_avg_epu16 (__m256i a, __m256i b) pure @trusted
{
    static if (GDC_with_AVX2)
    {
        // Single vpavgw instruction.
        return cast(__m256i) __builtin_ia32_pavgw256(cast(short16)a, cast(short16)b);
    }
    else static if (LDC_with_AVX2 && __VERSION__ >= 2094)
    {
        // Same GCC-style builtin, only exposed by sufficiently recent LDC front-ends.
        return cast(__m256i) __builtin_ia32_pavgw256(cast(short16)a, cast(short16)b);
    }
    else
    {
        // Splitting is always beneficial here, except -O0
        __m128i a_lo = _mm256_extractf128_si256!0(a);
        __m128i a_hi = _mm256_extractf128_si256!1(a);
        __m128i b_lo = _mm256_extractf128_si256!0(b);
        __m128i b_hi = _mm256_extractf128_si256!1(b);
        __m128i r_lo = _mm_avg_epu16(a_lo, b_lo);
        __m128i r_hi = _mm_avg_epu16(a_hi, b_hi);
        return _mm256_set_m128i(r_hi, r_lo);
    }
}
unittest
{
    __m256i A = _mm256_set1_epi16(31457);
    __m256i B = _mm256_set1_epi16(cast(short)64000);
    short16 avg = cast(short16)(_mm256_avg_epu16(A, B));
    foreach(i; 0..16)
        assert(avg.array[i] == cast(short)47729);
}
468 
/// Average packed unsigned 8-bit integers in `a` and `b`,
/// i.e. (a + b + 1) >> 1 per lane, treating lanes as unsigned.
__m256i _mm256_avg_epu8 (__m256i a, __m256i b) pure @trusted
{
    static if (GDC_with_AVX2)
    {
        // Single vpavgb instruction (GDC builtin takes ubyte32).
        return cast(__m256i) __builtin_ia32_pavgb256(cast(ubyte32)a, cast(ubyte32)b);
    }
    else static if (LDC_with_AVX2 && __VERSION__ >= 2094)
    {
        // LDC's builtin declares byte32 operands; bit pattern is identical.
        return cast(__m256i) __builtin_ia32_pavgb256(cast(byte32)a, cast(byte32)b);
    }
    else
    {
        // Splitting is always beneficial here, except -O0
        __m128i a_lo = _mm256_extractf128_si256!0(a);
        __m128i a_hi = _mm256_extractf128_si256!1(a);
        __m128i b_lo = _mm256_extractf128_si256!0(b);
        __m128i b_hi = _mm256_extractf128_si256!1(b);
        __m128i r_lo = _mm_avg_epu8(a_lo, b_lo);
        __m128i r_hi = _mm_avg_epu8(a_hi, b_hi);
        return _mm256_set_m128i(r_hi, r_lo);
    }
}
unittest
{
    __m256i A = _mm256_set1_epi8(-1);
    __m256i B = _mm256_set1_epi8(13);
    byte32 avg = cast(byte32)(_mm256_avg_epu8(A, B));
    foreach(i; 0..32)
        assert(avg.array[i] == cast(byte)134);
}
500 
/// Blend packed 16-bit integers from `a` and `b` within 128-bit lanes using 8-bit control
/// mask `imm8`, in each of the two lanes.
/// Note: this is functionally equivalent to two `_mm_blend_epi16`.
__m256i _mm256_blend_epi16(int imm8) (__m256i a, __m256i b) pure @trusted
{
    // PERF DMD
    // imm8 is a template parameter, so validate it at compile time, consistently
    // with _mm_blend_epi32 and _mm256_blend_epi32 (was a runtime assert before).
    static assert(imm8 >= 0 && imm8 < 256);
    enum bool split = true; // makes things better, except on ARM32 which is no better than naive

    static if (GDC_with_AVX2)
    {
        // Single vpblendw instruction.
        return cast(__m256i) __builtin_ia32_pblendw256(cast(short16)a, cast(short16)b, imm8);
    }
    else static if (split)
    {
        // The same 8-bit mask is applied to both 128-bit lanes.
        __m128i a_lo = _mm256_extractf128_si256!0(a);
        __m128i a_hi = _mm256_extractf128_si256!1(a);
        __m128i b_lo = _mm256_extractf128_si256!0(b);
        __m128i b_hi = _mm256_extractf128_si256!1(b);
        __m128i r_lo = _mm_blend_epi16!(imm8)(a_lo, b_lo);
        __m128i r_hi = _mm_blend_epi16!(imm8)(a_hi, b_hi);
        return _mm256_set_m128i(r_hi, r_lo);
    }
}
unittest
{
    __m256i A = _mm256_setr_epi16(0, 1,  2,  3,  4,  5,  6,  7,  0, -1,  -2,  -3,  -4,  -5,  -6,  -7);
    __m256i B = _mm256_setr_epi16(8, 9, 10, 11, 12, 13, 14, 15, -8, -9, -10, -11, -12, -13, -14, -15);
    short16 C = cast(short16) _mm256_blend_epi16!147(A, B); // 10010011 10010011
    short[16] correct =        [8, 9,  2,  3, 12,  5,  6, 15, -8, -9,  -2, -3, -12,  -5,  -6, -15];
    assert(C.array == correct);
}
533 
/// Blend packed 32-bit integers from `a` and `b` using 4-bit control mask `imm8`:
/// bit i set selects lane i from `b`, clear selects it from `a`.
__m128i _mm_blend_epi32(int imm8)(__m128i a, __m128i b) pure @trusted
{
    // This one is interesting, it is functionally equivalent to SSE4.1 blendps (_mm_blend_ps)
    // So without AVX2 we can always fallback to _mm_blend_ps
    // And indeed, a shufflevector!int4 doesn't even use vpblendd with LDC, and prefer
    // blendps and shufps so why bother.

    // PERF DMD
    static assert(imm8 >= 0 && imm8 < 16);
    static if (GDC_with_AVX2)
    {
        // Single vpblendd instruction.
        return __builtin_ia32_pblendd128(a, b, imm8);
    }
    else
    {
        // Bit-identical to the integer blend; lane selection ignores the float view.
        return cast(__m128i) _mm_blend_ps!imm8(cast(__m128)a, cast(__m128)b);
    }
}
unittest
{
    __m128i A = _mm_setr_epi32(0, 1,  2,  3);
    __m128i B = _mm_setr_epi32(8, 9, 10, 11);
    int4 C = _mm_blend_epi32!13(A, B); // 1101
    int[4] correct =    [8, 1, 10, 11];
    assert(C.array == correct);
}
561 
/// Blend packed 32-bit integers from `a` and `b` using 8-bit control mask `imm8`:
/// bit i set selects lane i from `b`, clear selects it from `a`.
__m256i _mm256_blend_epi32(int imm8)(__m256i a, __m256i b) pure @trusted
{
    // This one is functionally equivalent to AVX _mm256_blend_ps, except with integers.
    // With LDC, doing a shufflevector here would select the vblendps instruction anyway,
    // so we might as well defer to _mm256_blend_ps.

    // PERF DMD
    static assert(imm8 >= 0 && imm8 < 256);
    static if (GDC_with_AVX2)
    {
        // Single vpblendd instruction.
        return cast(__m256i) __builtin_ia32_pblendd256 (cast(int8)a, cast(int8)b, imm8);
    }
    else
    {
        // Bit-identical to the integer blend; lane selection ignores the float view.
        return cast(__m256i) _mm256_blend_ps!imm8(cast(__m256)a, cast(__m256)b);
    }
}
unittest
{
    __m256i A = _mm256_setr_epi32(0, 1,  2,  3,  4,  5,  6,  7);
    __m256i B = _mm256_setr_epi32(8, 9, 10, 11, 12, 13, 147, 15);
    int8 C = cast(int8) _mm256_blend_epi32!0xe7(A, B);
    int[8] correct =             [8, 9, 10,  3,  4, 13, 147, 15];
    assert(C.array == correct);
}
588 
/// Blend packed 8-bit integers from `a` and `b` using `mask`.
/// Select from `b` if the high-order bit of the corresponding 8-bit element in `mask` is set, else select from `a`.
 __m256i _mm256_blendv_epi8 (__m256i a, __m256i b, __m256i mask) pure @safe
 {
    // BUG PERF: this would fail the CI with GDC 12
    /*static if (GDC_with_AVX2)
        return cast(__m256i)__builtin_ia32_pblendvb256(cast(ubyte32)a, cast(ubyte32)b, cast(ubyte32)mask);
    else 
*/

    static if (LDC_with_AVX2)
    {
        // Single vpblendvb instruction.
        return cast(__m256i) __builtin_ia32_pblendvb256(cast(byte32)a, cast(byte32)b, cast(byte32)mask);
    }
    else
    {
        // Split into two 128-bit blends; each half gets its own mask half.
        __m128i a_lo = _mm256_extractf128_si256!0(a);
        __m128i a_hi = _mm256_extractf128_si256!1(a);
        __m128i b_lo = _mm256_extractf128_si256!0(b);
        __m128i b_hi = _mm256_extractf128_si256!1(b);
        __m128i m_lo = _mm256_extractf128_si256!0(mask);
        __m128i m_hi = _mm256_extractf128_si256!1(mask);
        __m128i r_lo = _mm_blendv_epi8(a_lo, b_lo, m_lo);
        __m128i r_hi = _mm_blendv_epi8(a_hi, b_hi, m_hi);
        return _mm256_set_m128i(r_hi, r_lo);
    }
}
unittest
{
    __m128i A = _mm_setr_epi8( 0,  1,  2,  3,  4,  5,  6,  7,  
                               8,  9, 10, 11, 12, 13, 14, 15);
    __m128i B = _mm_setr_epi8(16, 17, 18, 19, 20, 21, 22, 23, 
                              24, 25, 26, 27, 28, 29, 30, 31);
    __m128i M = _mm_setr_epi8( 1, -1,  1,  1, -4,  1, -8,  127,  
                               1,  1, -1, -1,  4,  1,  8, -128);
    __m256i AA = _mm256_set_m128i(A, A);
    __m256i BB = _mm256_set_m128i(B, B);
    __m256i MM = _mm256_set_m128i(M, M);
    byte32 R = cast(byte32) _mm256_blendv_epi8(AA, BB, MM);
    byte[32] correct =      [  0, 17,  2,  3, 20,  5, 22,  7, 8,  9, 26, 27, 12, 13, 14, 31,
                               0, 17,  2,  3, 20,  5, 22,  7, 8,  9, 26, 27, 12, 13, 14, 31 ];
    assert(R.array == correct);
}
632 
/// Broadcast the low packed 8-bit integer from `a` to all elements of result.
__m128i _mm_broadcastb_epi8 (__m128i a) pure @safe
{
    byte16 lanes = cast(byte16)a;
    byte16 splat;
    splat = lanes.array[0]; // scalar assignment broadcasts to all 16 lanes
    return cast(__m128i)splat;
}
unittest
{
    byte16 src;
    src.ptr[0] = 2;
    byte16 dst = cast(byte16) _mm_broadcastb_epi8(cast(__m128i)src);
    byte[16] expected = [2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2];
    assert(dst.array == expected);
}
649 
/// Broadcast the low packed 8-bit integer from `a` to all elements of result.
__m256i _mm256_broadcastb_epi8(__m128i a) pure @safe
{
    byte16 lanes = cast(byte16)a;
    byte32 splat;
    splat = lanes.array[0]; // scalar assignment broadcasts to all 32 lanes
    return cast(__m256i)splat;
}
unittest
{
    byte16 src;
    src.ptr[0] = 2;
    byte32 dst = cast(byte32) _mm256_broadcastb_epi8(cast(__m128i)src);
    byte[32] expected = [2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
                         2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2];
    assert(dst.array == expected);
}
667 
/// Broadcast the low packed 32-bit integer from `a` to all elements of result.
__m128i _mm_broadcastd_epi32 (__m128i a) pure @safe
{
    int4 lanes = cast(int4)a;
    int4 splat;
    splat = lanes.array[0]; // scalar assignment broadcasts to all 4 lanes
    return cast(__m128i)splat;
}
unittest
{
    int4 src;
    src.ptr[0] = -2;
    int4 dst = cast(int4) _mm_broadcastd_epi32(cast(__m128i)src);
    int[4] expected = [-2, -2, -2, -2];
    assert(dst.array == expected);
}
684 
/// Broadcast the low packed 32-bit integer from `a` to all elements of result.
__m256i _mm256_broadcastd_epi32 (__m128i a) pure @safe
{
    int4 lanes = cast(int4)a;
    int8 splat;
    splat = lanes.array[0]; // scalar assignment broadcasts to all 8 lanes
    return cast(__m256i)splat;
}
unittest
{
    int4 src;
    src.ptr[0] = -2;
    int8 dst = cast(int8) _mm256_broadcastd_epi32(cast(__m128i)src);
    int[8] expected = [-2, -2, -2, -2, -2, -2, -2, -2];
    assert(dst.array == expected);
}
701 
/// Broadcast the low packed 64-bit integer from `a` to all elements of result.
__m128i _mm_broadcastq_epi64 (__m128i a) pure @safe
{
    long2 lanes = cast(long2)a;
    long2 splat;
    splat = lanes.array[0]; // scalar assignment broadcasts to both lanes
    return cast(__m128i)splat;
}
unittest
{
    long2 src;
    src.ptr[0] = -2;
    long2 dst = cast(long2) _mm_broadcastq_epi64(cast(__m128i)src);
    long[2] expected = [-2, -2];
    assert(dst.array == expected);
}
718 
/// Broadcast the low packed 64-bit integer from `a` to all elements of result.
__m256i _mm256_broadcastq_epi64 (__m128i a) pure @safe
{
    long2 lanes = cast(long2)a;
    long4 splat;
    splat = lanes.array[0]; // scalar assignment broadcasts to all 4 lanes
    return cast(__m256i)splat;
}
unittest
{
    long2 src;
    src.ptr[0] = -2;
    long4 dst = cast(long4) _mm256_broadcastq_epi64(cast(__m128i)src);
    long[4] expected = [-2, -2, -2, -2];
    assert(dst.array == expected);
}
735 
/// Broadcast the low double-precision (64-bit) floating-point element from `a` to all elements of result.
__m128d _mm_broadcastsd_pd (__m128d a) pure @safe
{
    double lo = a.array[0];
    double2 splat;
    splat = lo; // scalar assignment broadcasts to both lanes
    return splat;
}
unittest
{
    double2 src;
    src.ptr[0] = 2;
    double2 dst = _mm_broadcastsd_pd(src);
    double[2] expected = [2.0, 2.0];
    assert(dst.array == expected);
}
751 
/// Broadcast the low double-precision (64-bit) floating-point element from `a` to all elements of result.
__m256d _mm256_broadcastsd_pd (__m128d a) pure @safe
{
    double lo = a.array[0];
    double4 splat;
    splat = lo; // scalar assignment broadcasts to all 4 lanes
    return splat;
}
unittest
{
    double2 src;
    src.ptr[0] = 3;
    double4 dst = _mm256_broadcastsd_pd(src);
    double[4] expected = [3.0, 3, 3, 3];
    assert(dst.array == expected);
}
767 
/// Broadcast 128 bits of integer data from `a` to all 128-bit lanes in result.
/// Note: also exists with name `_mm256_broadcastsi128_si256` which is identical.
__m256i _mm_broadcastsi128_si256 (__m128i a) pure @trusted
{
    // Note that GDC will prefer vinserti128 to vbroadcast, for some reason
    // So in the end it's the same as naive code.
    // For this reason, __builtin_ia32_vbroadcastsi256 isn't used
    long2 lane = cast(long2)a;
    long4 r;
    foreach (i; 0 .. 4)
        r.ptr[i] = lane.array[i & 1]; // lanes 0/2 get word 0, lanes 1/3 get word 1
    return cast(__m256i)r;
}
unittest
{
    long2 src;
    src.ptr[0] = 34;
    src.ptr[1] = -56;
    long4 dst = cast(long4) _mm_broadcastsi128_si256(cast(__m128i)src);
    long[4] expected = [34, -56, 34, -56];
    assert(dst.array == expected);
}

///ditto
alias _mm256_broadcastsi128_si256 = _mm_broadcastsi128_si256; // intrinsic is duplicated in the Guide, for some reason
795 
/// Broadcast the low single-precision (32-bit) floating-point element from `a` to all elements of result.
__m128 _mm_broadcastss_ps (__m128 a) pure @safe
{
    float lo = a.array[0];
    float4 splat;
    splat = lo; // scalar assignment broadcasts to all 4 lanes
    return splat;
}
unittest
{
    float4 src;
    src.ptr[0] = 2;
    float4 dst = _mm_broadcastss_ps(src);
    float[4] expected = [2.0f, 2, 2, 2];
    assert(dst.array == expected);
}
811 
/// Broadcast the low single-precision (32-bit) floating-point element from `a` to all elements of result.
__m256 _mm256_broadcastss_ps (__m128 a) pure @safe
{
    float lo = a.array[0];
    float8 splat;
    splat = lo; // scalar assignment broadcasts to all 8 lanes
    return splat;
}
unittest
{
    float4 src;
    src.ptr[0] = 2;
    float8 dst = _mm256_broadcastss_ps(src);
    float[8] expected = [2.0f, 2, 2, 2, 2, 2, 2, 2];
    assert(dst.array == expected);
}
827 
/// Broadcast the low packed 16-bit integer from `a` to all elements of result.
__m128i _mm_broadcastw_epi16 (__m128i a) pure @safe
{
    short8 lanes = cast(short8)a;
    short8 splat;
    splat = lanes.array[0]; // scalar assignment broadcasts to all 8 lanes
    return cast(__m128i)splat;
}
unittest
{
    short8 src;
    src.ptr[0] = 13;
    short8 dst = cast(short8) _mm_broadcastw_epi16(cast(__m128i)src);
    short[8] expected = [13, 13, 13, 13, 13, 13, 13, 13];
    assert(dst.array == expected);
}
844 
/// Broadcast the low packed 16-bit integer from `a` to all elements of result.
__m256i _mm256_broadcastw_epi16 (__m128i a) pure @safe
{
    short8 ba = cast(short8)a;
    short16 r;
    r = ba.array[0]; // scalar-to-vector assignment splats to all lanes
    return cast(__m256i)r;
}
unittest
{
    short8 A;
    A.ptr[0] = 13;
    short16 B = cast(short16) _mm256_broadcastw_epi16(cast(__m128i)A);
    short[16] correct = [13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13];
    assert(B.array == correct);
}
861 
862 
/// Shift 128-bit lanes in `a` left by `bytes` bytes while shifting in zeroes.
/// `bytes` is a compile-time argument (matching the immediate operand of vpslldq).
__m256i _mm256_bslli_epi128(ubyte bytes)(__m256i a) pure @trusted
{
    // Note: can't use __builtin_ia32_pslldqi256 with GDC, wants an immediate
    //       and even string mixin do not make it
    // PERF: hence GDC AVX2 doesn't use the instruction, and nothing inlines very well in GDC either
    static if (bytes >= 16)
    {
        // Shifting a whole 128-bit lane (or more) yields zero.
        return _mm256_setzero_si256();
    }
    else static if (LDC_with_AVX2)
    {
        // `bytes` is a template parameter, so the "I" (immediate) constraint holds.
        return cast(__m256i)__asm!(long4)("vpslldq $2, $1, $0", "=v,v,I", a, bytes);
    }
    else // split
    {
        // Each 128-bit lane shifts independently; do it as two SSE byte-shifts.
        __m128i lo = _mm_slli_si128!bytes(_mm256_extractf128_si256!0(a));
        __m128i hi = _mm_slli_si128!bytes(_mm256_extractf128_si256!1(a));
        return _mm256_set_m128i(hi, lo);
    }
}
unittest
{
    __m256i a = _mm256_setr_epi8(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32);
    assert(_mm256_bslli_epi128!7(a).array == [72057594037927936, 650777868590383874, 1224979098644774912, 1808220633999610642]);
}
889 
/// Shift 128-bit lanes in `a` right by `bytes` bytes while shifting in zeroes.
/// `bytes` is a compile-time argument (matching the immediate operand of vpsrldq).
__m256i _mm256_bsrli_epi128(ubyte bytes)(__m256i a) pure @trusted
{
    // Note: can't use __builtin_ia32_psrldqi256 with GDC, wants an immediate
    //       and even string mixin do not make it
    // PERF: hence GDC AVX2 doesn't use the instruction, and nothing inlines very well in GDC either
    static if (bytes >= 16)
    {
        // Shifting a whole 128-bit lane (or more) yields zero.
        return _mm256_setzero_si256();
    }
    else static if (LDC_with_AVX2)
    {
        // `bytes` is a template parameter, so the "I" (immediate) constraint holds.
        return cast(__m256i)__asm!(long4)("vpsrldq $2, $1, $0", "=v,v,I", a, bytes);
    }
    else // split
    {
        // Each 128-bit lane shifts independently; do it as two SSE byte-shifts.
        __m128i lo = _mm_srli_si128!bytes(_mm256_extractf128_si256!0(a));
        __m128i hi = _mm_srli_si128!bytes(_mm256_extractf128_si256!1(a));
        return _mm256_set_m128i(hi, lo);
    }
}
unittest
{
    __m256i a = _mm256_setr_epi8(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32);
    assert(_mm256_bsrli_epi128!7(a).array == [1084818905618843912, 16, 2242261671028070680, 32]);
}
916 
/// Compare packed 16-bit integers in `a` and `b` for equality.
/// Each result element is 0xFFFF where equal, 0 otherwise.
__m256i _mm256_cmpeq_epi16 (__m256i a, __m256i b) pure @trusted
{
    // PERF: GDC without AVX
    // PERF: DMD
    static if (SIMD_COMPARISON_MASKS_32B)
    {
        // PERF: catastrophic in GDC without AVX2
        return cast(__m256i)(cast(short16)a == cast(short16)b);
    }
    else static if (GDC_with_AVX2)
    {
        return cast(__m256i) __builtin_ia32_pcmpeqw256(cast(short16)a, cast(short16)b);
    }
    else version(LDC)
    {
        return cast(__m256i) equalMask!short16(cast(short16)a, cast(short16)b);
    }
    else
    {
        // Portable scalar fallback: -1 (all bits set) marks equal elements.
        short16 sa = cast(short16)a;
        short16 sb = cast(short16)b;
        short16 sr;
        for (int n = 0; n < 16; ++n)
        {
            bool cond = sa.array[n] == sb.array[n];
            sr.ptr[n] = cond ? -1 : 0;
        }
        return cast(__m256i) sr;
    }
}
unittest
{
    short16   A = [-3, -2, -1,  0,  0,  1,  2,  3, -3, -2, -1,  0,  0,  1,  2,  3];
    short16   B = [ 4,  3,  2,  1,  0, -1, -2, -3, -3,  3,  2,  1,  0, -1, -2, -3];
    short[16] E = [ 0,  0,  0,  0, -1,  0,  0,  0, -1,  0,  0,  0, -1,  0,  0,  0];
    short16   R = cast(short16)(_mm256_cmpeq_epi16(cast(__m256i)A, cast(__m256i)B));
    assert(R.array == E);
}
956 
/// Compare packed 32-bit integers in `a` and `b` for equality.
/// Each result element is 0xFFFFFFFF where equal, 0 otherwise.
__m256i _mm256_cmpeq_epi32 (__m256i a, __m256i b) pure @trusted
{
    // PERF: GDC without AVX
    // PERF: DMD
    static if (SIMD_COMPARISON_MASKS_32B)
    {
        // Quite bad in GDC -mavx (with no AVX2)
        return cast(__m256i)(cast(int8)a == cast(int8)b);
    }
    else static if (GDC_with_AVX2)
    {
        return cast(__m256i) __builtin_ia32_pcmpeqd256(cast(int8)a, cast(int8)b);
    }
    else version(LDC)
    {
        return cast(__m256i) equalMask!int8(cast(int8)a, cast(int8)b);
    }
    else
    {
        // Portable scalar fallback: -1 (all bits set) marks equal elements.
        int8 ia = cast(int8)a;
        int8 ib = cast(int8)b;
        int8 ir;
        for (int n = 0; n < 8; ++n)
        {
            bool cond = ia.array[n] == ib.array[n];
            ir.ptr[n] = cond ? -1 : 0;
        }
        return cast(__m256i) ir;
    }
}
unittest
{
    int8   A = [-3, -2, -1,  0, -3, -2, -1,  0];
    int8   B = [ 4, -2,  2,  0,  4, -2,  2,  0];
    int[8] E = [ 0, -1,  0, -1,  0, -1,  0, -1];
    int8   R = cast(int8)(_mm256_cmpeq_epi32(cast(__m256i)A, cast(__m256i)B));
    assert(R.array == E);
}
996 
/// Compare packed 64-bit integers in `a` and `b` for equality.
/// Each result element is all ones where equal, 0 otherwise.
__m256i _mm256_cmpeq_epi64 (__m256i a, __m256i b) pure @trusted
{
    // PERF: GDC without AVX
    // PERF: DMD
    static if (SIMD_COMPARISON_MASKS_32B)
    {
        // Note: enabling this with DMD will probably lead to same bug as _mm_cmpeq_epi64
        return cast(__m256i)(cast(long4)a == cast(long4)b);
    }
    else static if (GDC_with_AVX2)
    {
        return cast(__m256i)__builtin_ia32_pcmpeqq256(cast(long4)a, cast(long4)b);
    }
    else version(LDC)
    {
        return cast(__m256i) equalMask!long4(cast(long4)a, cast(long4)b);
    }
    else
    {
        // Portable scalar fallback, unrolled over the 4 lanes; -1 marks equality.
        long4 la = cast(long4)a;
        long4 lb = cast(long4)b;
        long4 res;
        res.ptr[0] = (la.array[0] == lb.array[0]) ? -1 : 0;
        res.ptr[1] = (la.array[1] == lb.array[1]) ? -1 : 0;
        res.ptr[2] = (la.array[2] == lb.array[2]) ? -1 : 0;
        res.ptr[3] = (la.array[3] == lb.array[3]) ? -1 : 0;
        return cast(__m256i)res;
    }
}
unittest
{
    __m256i A = _mm256_setr_epi64(-1, -2, -1, -2);
    __m256i B = _mm256_setr_epi64(-3, -2, -3, -3);
    __m256i C = _mm256_setr_epi64(-1, -4, -1, -2);
    long4 AB = cast(long4) _mm256_cmpeq_epi64(A, B);
    long4 AC = cast(long4) _mm256_cmpeq_epi64(A, C);
    long[4] correct1 = [ 0, -1,  0,  0];
    long[4] correct2 = [-1,  0, -1, -1];
    assert(AB.array == correct1);
    assert(AC.array == correct2);
}
1039 
/// Compare packed 8-bit integers in `a` and `b` for equality.
/// Each result element is 0xFF where equal, 0 otherwise.
__m256i _mm256_cmpeq_epi8 (__m256i a, __m256i b) pure @trusted
{
    // PERF: GDC without AVX2, need split
    // PERF: DMD
    static if (SIMD_COMPARISON_MASKS_32B)
    {
        return cast(__m256i)(cast(byte32)a == cast(byte32)b);
    }
    else static if (GDC_with_AVX2)
    {
        // GDC's builtin takes unsigned byte vectors; bit pattern is identical.
        return cast(__m256i) __builtin_ia32_pcmpeqb256(cast(ubyte32)a, cast(ubyte32)b);
    }
    else version(LDC)
    {
        return cast(__m256i) equalMask!byte32(cast(byte32)a, cast(byte32)b);
    }
    else
    {
        // Portable scalar fallback: -1 (all bits set) marks equal elements.
        byte32 ba = cast(byte32)a;
        byte32 bb = cast(byte32)b;
        byte32 br;
        for (int n = 0; n < 32; ++n)
        {
            bool cond = ba.array[n] == bb.array[n];
            br.ptr[n] = cond ? -1 : 0;
        }
        return cast(__m256i) br;
    }
}
unittest
{
    __m256i A = _mm256_setr_epi8(1, 2, 3, 1, 2, 1, 1, 2, 3, 2, 1, 0, 0, 1, 2, 1,
                                 1, 2, 3, 1, 2, 1, 1, 2, 3, 2, 1, 0, 0, 1, 2, 42);
    __m256i B = _mm256_setr_epi8(2, 2, 1, 2, 3, 1, 2, 3, 2, 1, 0, 0, 1, 2, 1, 1,
                                 2, 2, 1, 2, 3, 1, 2, 3, 2, 1, 0, 0, 1, 2, 1, 1);
    byte32 C = cast(byte32) _mm256_cmpeq_epi8(A, B);
    byte[32] correct =       [0,-1, 0, 0, 0,-1, 0, 0, 0, 0, 0,-1, 0, 0, 0, -1,
                              0,-1, 0, 0, 0,-1, 0, 0, 0, 0, 0,-1, 0, 0, 0,  0];
    assert(C.array == correct);
}
1081 
/// Compare packed signed 16-bit integers in `a` and `b` for greater-than.
/// Each result element is 0xFFFF where `a > b` (signed), 0 otherwise.
__m256i _mm256_cmpgt_epi16 (__m256i a, __m256i b) pure @safe
{
    version(GNU)
        enum bool mayUseComparisonOperator = GDC_with_AVX2; // too slow in GDC without AVX2
    else
        enum bool mayUseComparisonOperator = true;

    static if (SIMD_COMPARISON_MASKS_32B && mayUseComparisonOperator)
    {
        return cast(__m256i)(cast(short16)a > cast(short16)b);
    }
    else static if (GDC_with_AVX2)
    {
        return cast(__m256i) __builtin_ia32_pcmpgtw256(cast(short16)a, cast(short16)b);
    }
    else // split
    {
        // Compare each 128-bit lane separately with the SSE intrinsic.
        __m128i a_lo = _mm256_extractf128_si256!0(a);
        __m128i a_hi = _mm256_extractf128_si256!1(a);
        __m128i b_lo = _mm256_extractf128_si256!0(b);
        __m128i b_hi = _mm256_extractf128_si256!1(b);
        __m128i r_lo = _mm_cmpgt_epi16(a_lo, b_lo);
        __m128i r_hi = _mm_cmpgt_epi16(a_hi, b_hi);
        return _mm256_set_m128i(r_hi, r_lo);
    }
}
unittest
{
    short16   A = [-3, -2, -1,  0,  0,  1,  2,  3, -3, -2, -1,  0,  0,  1,  2,  3];
    short16   B = [ 4,  3,  2,  1,  0, -1, -2, -3,  4, -3,  2,  1,  0, -1, -2, -3];
    short[16] E = [ 0,  0,  0,  0,  0, -1, -1, -1,  0, -1,  0,  0,  0, -1, -1, -1];
    short16   R = cast(short16)(_mm256_cmpgt_epi16(cast(__m256i)A, cast(__m256i)B));
    assert(R.array == E);
}
1117 
/// Compare packed signed 32-bit integers in `a` and `b` for greater-than.
/// Each result element is 0xFFFFFFFF where `a > b` (signed), 0 otherwise.
__m256i _mm256_cmpgt_epi32 (__m256i a, __m256i b) pure @safe
{
    version(GNU)
        enum bool mayUseComparisonOperator = GDC_with_AVX2; // too slow in GDC else
    else
        enum bool mayUseComparisonOperator = true;

    static if (SIMD_COMPARISON_MASKS_32B && mayUseComparisonOperator)
    {
        return cast(__m256i)(cast(int8)a > cast(int8)b);
    }
    else static if (GDC_with_AVX2)
    {
        return cast(__m256i) __builtin_ia32_pcmpgtd256(cast(int8)a, cast(int8)b);
    }
    else // split
    {
        // Compare each 128-bit lane separately with the SSE intrinsic.
        __m128i a_lo = _mm256_extractf128_si256!0(a);
        __m128i a_hi = _mm256_extractf128_si256!1(a);
        __m128i b_lo = _mm256_extractf128_si256!0(b);
        __m128i b_hi = _mm256_extractf128_si256!1(b);
        __m128i r_lo = _mm_cmpgt_epi32(a_lo, b_lo);
        __m128i r_hi = _mm_cmpgt_epi32(a_hi, b_hi);
        return _mm256_set_m128i(r_hi, r_lo);
    }
}
unittest
{
    int8   A = [-3,  2, -1,  0, -3,  2, -1,  0];
    int8   B = [ 4, -2,  2,  0,  4, -2,  2,  0];
    int[8] E = [ 0, -1,  0,  0,  0, -1,  0,  0];
    int8   R = cast(int8) _mm256_cmpgt_epi32(cast(__m256i)A, cast(__m256i)B);
    assert(R.array == E);
}
1153 
/// Compare packed signed 64-bit integers in `a` and `b` for greater-than.
/// Each result element is all ones where `a > b` (signed), 0 otherwise.
__m256i _mm256_cmpgt_epi64 (__m256i a, __m256i b) pure @safe
{
    version(GNU)
        enum bool mayUseComparisonOperator = GDC_with_AVX2; // too slow in GDC else
    else
        enum bool mayUseComparisonOperator = true;

    static if (SIMD_COMPARISON_MASKS_32B && mayUseComparisonOperator)
    {
        return cast(__m256i)(cast(long4)a > cast(long4)b);
    }
    else static if (GDC_with_AVX2)
    {
        return cast(__m256i) __builtin_ia32_pcmpgtq256(cast(long4)a, cast(long4)b);
    }
    else // split
    {
        // Compare each 128-bit lane separately with the SSE intrinsic.
        __m128i a_lo = _mm256_extractf128_si256!0(a);
        __m128i a_hi = _mm256_extractf128_si256!1(a);
        __m128i b_lo = _mm256_extractf128_si256!0(b);
        __m128i b_hi = _mm256_extractf128_si256!1(b);
        __m128i r_lo = _mm_cmpgt_epi64(a_lo, b_lo);
        __m128i r_hi = _mm_cmpgt_epi64(a_hi, b_hi);
        return _mm256_set_m128i(r_hi, r_lo);
    }
}
unittest
{
    __m256i A = _mm256_setr_epi64(-3,  2, 70,  2);
    __m256i B = _mm256_setr_epi64 (4, -2,  4, -2);
    long[4] correct = [ 0, -1, -1, -1 ];
    long4 R = cast(long4)(_mm256_cmpgt_epi64(A, B));
    assert(R.array == correct);
}
1188 
/// Compare packed signed 8-bit integers in `a` and `b` for greater-than.
/// Each result element is 0xFF where `a > b` (signed), 0 otherwise.
__m256i _mm256_cmpgt_epi8 (__m256i a, __m256i b) pure @safe
{
    version(GNU)
    {
        // too slow in GDC without AVX2, but also doesn't 
        // work in CI? BUG PERF
        enum bool mayUseComparisonOperator = false; 
    }
    else
        enum bool mayUseComparisonOperator = true;

    static if (SIMD_COMPARISON_MASKS_32B && mayUseComparisonOperator)
    {
        return cast(__m256i)(cast(byte32)a > cast(byte32)b);
    }
    /*else static if (GDC_with_AVX2)
    {
        return cast(__m256i) __builtin_ia32_pcmpgtb256(cast(ubyte32)a, cast(ubyte32)b);
    }*/
    else // split
    {
        // Compare each 128-bit lane separately with the SSE intrinsic.
        __m128i a_lo = _mm256_extractf128_si256!0(a);
        __m128i a_hi = _mm256_extractf128_si256!1(a);
        __m128i b_lo = _mm256_extractf128_si256!0(b);
        __m128i b_hi = _mm256_extractf128_si256!1(b);
        __m128i r_lo = _mm_cmpgt_epi8(a_lo, b_lo);
        __m128i r_hi = _mm_cmpgt_epi8(a_hi, b_hi);
        return _mm256_set_m128i(r_hi, r_lo);
    }
}
unittest
{
    __m256i A = _mm256_setr_epi8(1, 2, 3, 1,  127, -80, 1, 2, 3, 2, 1, 0, 0, 1, 2, 1,   1, 2, 3, 1,  127, -80, 1, 2, 3, 2, 1, 0, 0, 1, 2, 1);
    __m256i B = _mm256_setr_epi8(2, 2, 1, 2, -128, -42, 2, 3, 2, 1, 0, 0, 1, 2, 1, 1,   2, 2, 1, 2, -128, -42, 2, 3, 2, 1, 0, 0, 1, 2, 1, 0);
    byte32 C = cast(byte32) _mm256_cmpgt_epi8(A, B);
    byte[32] correct =          [0, 0,-1, 0,   -1,   0, 0, 0,-1,-1,-1, 0, 0, 0,-1, 0,   0, 0,-1, 0,   -1,   0, 0, 0,-1,-1,-1, 0, 0, 0,-1,-1];
    assert(C.array == correct);
}
1228 
1229 
/// Sign extend packed 16-bit integers in `a` to packed 32-bit integers.
__m256i _mm256_cvtepi16_epi32 (__m128i a) pure @trusted
{
    static if (GDC_with_AVX2)
    {
        return cast(__m256i) __builtin_ia32_pmovsxwd256(cast(short8)a);
    }
    else static if (LDC_with_optimizations)
    {
        // Direct LLVM sext of the whole 8-element vector.
        enum ir = `
            %r = sext <8 x i16> %0 to <8 x i32>
            ret <8 x i32> %r`;
        return cast(__m256i) LDCInlineIR!(ir, int8, short8)(cast(short8)a);
    }
    else
    {
        // Scalar fallback: per-element assignment sign-extends short -> int.
        short8 sa = cast(short8)a;
        int8 r;
        r.ptr[0] = sa.array[0];
        r.ptr[1] = sa.array[1];
        r.ptr[2] = sa.array[2];
        r.ptr[3] = sa.array[3];
        r.ptr[4] = sa.array[4];
        r.ptr[5] = sa.array[5];
        r.ptr[6] = sa.array[6];
        r.ptr[7] = sa.array[7];
        return cast(__m256i)r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi16(-1, 0, -32768, 32767, -1, 0, -32768, 32767);
    int8 C = cast(int8) _mm256_cvtepi16_epi32(A);
    int[8] correct = [-1, 0, -32768, 32767, -1, 0, -32768, 32767];
    assert(C.array == correct);
}
1266 
1267 
/// Sign extend packed 16-bit integers in `a` to packed 64-bit integers.
/// Only the low 4 elements of `a` are used.
__m256i _mm256_cvtepi16_epi64 (__m128i a) pure @trusted
{
    static if (GDC_with_AVX2)
    {
        return cast(__m256i) __builtin_ia32_pmovsxwq256(cast(short8)a);
    }
    else static if (LDC_with_optimizations)
    {
        // Select the low 4 elements, then LLVM sext to i64.
        enum ir = `
            %v = shufflevector <8 x i16> %0,<8 x i16> %0, <4 x i32> <i32 0, i32 1,i32 2, i32 3>
            %r = sext <4 x i16> %v to <4 x i64>
            ret <4 x i64> %r`;
        return cast(__m256i) LDCInlineIR!(ir, long4, short8)(cast(short8)a);
    }
    else
    {
        // LDC x86 generates vpmovsxwq since LDC 1.12 -O1
        short8 sa = cast(short8)a;
        long4 r;
        r.ptr[0] = sa.array[0];
        r.ptr[1] = sa.array[1];
        r.ptr[2] = sa.array[2];
        r.ptr[3] = sa.array[3];
        return cast(__m256i)r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi16(-1, 0, short.min, short.max, 2, 3, 4, 5);
    long4 C = cast(long4) _mm256_cvtepi16_epi64(A);
    long[4] correct = [-1, 0, short.min, short.max];
    assert(C.array == correct);
}
1302 
/// Sign extend packed 32-bit integers in `a` to packed 64-bit integers.
__m256i _mm256_cvtepi32_epi64 (__m128i a) pure @trusted
{
    // Scalar per-element assignment sign-extends int -> long.
    // NOTE(review): this exact unrolled shape is presumably pattern-matched
    // by optimizing backends into a single vpmovsxdq, as with the
    // _mm256_cvtepi16_epi64 fallback — confirm before restructuring.
    long4 r;
    r.ptr[0] = a.array[0];
    r.ptr[1] = a.array[1];
    r.ptr[2] = a.array[2];
    r.ptr[3] = a.array[3];
    return cast(__m256i)r;
}
unittest
{
    __m128i A = _mm_setr_epi32(-1, 0, int.min, int.max);
    long4 C = cast(long4) _mm256_cvtepi32_epi64(A);
    long[4] correct = [-1, 0, int.min, int.max];
    assert(C.array == correct);
}
1320 
/// Sign extend packed 8-bit integers in `a` to packed 16-bit integers.
__m256i _mm256_cvtepi8_epi16 (__m128i a) pure @trusted
{
    static if (GDC_with_AVX2)
    {
        return cast(__m256i) __builtin_ia32_pmovsxbw256(cast(ubyte16)a);
    }
    else static if (LDC_with_optimizations)
    {
        // Direct LLVM sext of the whole 16-element vector.
        enum ir = `
            %r = sext <16 x i8> %0 to <16 x i16>
            ret <16 x i16> %r`;
        return cast(__m256i) LDCInlineIR!(ir, short16, byte16)(cast(byte16)a);
    }
    else
    {
        // Scalar fallback: per-element assignment sign-extends byte -> short.
        short16 r;
        byte16 ba = cast(byte16)a;
        for (int n = 0; n < 16; ++n)
        {
            r.ptr[n] = ba.array[n];
        }
        return cast(__m256i)r; 
    }
}
unittest
{
    __m128i A = _mm_setr_epi8(-1, 0, byte.min, byte.max, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13);
    short16 C = cast(short16) _mm256_cvtepi8_epi16(A);
    short[16] correct = [-1, 0, byte.min, byte.max, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13];
    assert(C.array == correct);
}
1353 
/// Sign extend packed 8-bit integers in `a` to packed 32-bit integers.
/// Only the low 8 elements of `a` are used.
__m256i _mm256_cvtepi8_epi32 (__m128i a) pure @trusted
{
    static if (GDC_with_AVX2)
    {
        return cast(__m256i) __builtin_ia32_pmovsxbd256(cast(ubyte16)a);
    }
    else static if (LDC_with_optimizations)
    {
        // Select the low 8 elements, then LLVM sext to i32.
        enum ir = `
            %v = shufflevector <16 x i8> %0,<16 x i8> undef, <8 x i32> <i32 0, i32 1,i32 2, i32 3, i32 4, i32 5,i32 6, i32 7>
            %r = sext <8 x i8> %v to <8 x i32>
            ret <8 x i32> %r`;
        return cast(__m256i) LDCInlineIR!(ir, int8, byte16)(cast(byte16)a);
    }
    else
    {
        // PERF This is rather bad in GDC without AVX, or with DMD
        // should split that
        int8 r;
        byte16 ba = cast(byte16)a;
        for (int n = 0; n < 8; ++n)
        {
            r.ptr[n] = ba.array[n];
        }
        return cast(__m256i)r; 
    }
}
unittest
{
    __m128i A = _mm_setr_epi8(-1, 0, byte.min, byte.max, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13);
    int8 C = cast(int8) _mm256_cvtepi8_epi32(A);
    int[8] correct = [-1, 0, byte.min, byte.max, 2, 3, 4, 5];
    assert(C.array == correct);
}
1389 
/// Sign extend packed 8-bit integers in the low 8 bytes of `a` to packed 64-bit integers.
/// Only the low 4 elements of `a` end up in the result.
__m256i _mm256_cvtepi8_epi64 (__m128i a) pure @trusted
{
    // PERF This is rather bad in GDC without AVX
    static if (GDC_with_AVX2)
    {
        return cast(__m256i) __builtin_ia32_pmovsxbq256(cast(ubyte16)a);
    }
    else static if (LDC_with_ARM64)
    {
        // 4 inst since LDC 1.22 -O2 
        return _mm256_cvtepi16_epi64(_mm_cvtepi8_epi16(a));
    }
    else static if (LDC_with_optimizations)
    {
        // Select the low 4 elements, then LLVM sext to i64.
        enum ir = `
            %v = shufflevector <16 x i8> %0,<16 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
            %r = sext <4 x i8> %v to <4 x i64>
            ret <4 x i64> %r`;
        return cast(__m256i) LDCInlineIR!(ir, long4, byte16)(cast(byte16)a);
    }
    else
    {
        // Scalar fallback: per-element assignment sign-extends byte -> long.
        long4 r;
        byte16 ba = cast(byte16)a;
        for (int n = 0; n < 4; ++n)
        {
            r.ptr[n] = ba.array[n];
        }
        return cast(__m256i)r; 
    }
}
unittest
{
    __m128i A = _mm_setr_epi8(-1, 0, byte.min, byte.max, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13);
    long4 C = cast(long4) _mm256_cvtepi8_epi64(A);
    long[4] correct = [-1, 0, byte.min, byte.max];
    assert(C.array == correct);
}
1429 
/// Zero-extend packed unsigned 16-bit integers in `a` to packed 32-bit integers.
__m256i _mm256_cvtepu16_epi32(__m128i a) pure @trusted
{
    static if (GDC_with_AVX2)
    {
        return cast(__m256i) __builtin_ia32_pmovzxwd256(cast(short8)a);
    }
    else
    {
        // The cast(ushort) reinterprets the signed lane, so the subsequent
        // int assignment zero-extends instead of sign-extending.
        short8 sa = cast(short8)a;
        int8 r;
        r.ptr[0] = cast(ushort)sa.array[0];
        r.ptr[1] = cast(ushort)sa.array[1];
        r.ptr[2] = cast(ushort)sa.array[2];
        r.ptr[3] = cast(ushort)sa.array[3];
        r.ptr[4] = cast(ushort)sa.array[4];
        r.ptr[5] = cast(ushort)sa.array[5];
        r.ptr[6] = cast(ushort)sa.array[6];
        r.ptr[7] = cast(ushort)sa.array[7];
        return cast(__m256i)r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi16(-1, 0, -32768, 32767, -1, 0, -32768, 32767);
    int8 C = cast(int8) _mm256_cvtepu16_epi32(A);
    int[8] correct = [65535, 0, 32768, 32767, 65535, 0, 32768, 32767];
    assert(C.array == correct);
}
1459 
/// Zero-extend packed unsigned 16-bit integers in `a` to packed 64-bit integers.
/// Only the low 4 elements of `a` are used.
__m256i _mm256_cvtepu16_epi64(__m128i a) pure @trusted
{
    static if (GDC_with_AVX2)
    {
        return cast(__m256i) __builtin_ia32_pmovzxwq256(cast(short8)a);
    }
    else static if (LDC_with_optimizations)
    {
        // Select the low 4 elements, then LLVM zext to i64.
        enum ir = `
            %v = shufflevector <8 x i16> %0,<8 x i16> %0, <4 x i32> <i32 0, i32 1,i32 2, i32 3>
            %r = zext <4 x i16> %v to <4 x i64>
            ret <4 x i64> %r`;
        return cast(__m256i) LDCInlineIR!(ir, long4, short8)(cast(short8)a);
    }
    else
    {
        // cast(ushort) forces zero-extension rather than sign-extension.
        short8 sa = cast(short8)a;
        long4 r;
        r.ptr[0] = cast(ushort)sa.array[0];
        r.ptr[1] = cast(ushort)sa.array[1];
        r.ptr[2] = cast(ushort)sa.array[2];
        r.ptr[3] = cast(ushort)sa.array[3];
        return cast(__m256i)r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi16(-1, 0, -32768, 32767, 2, 3, 4, 5);
    long4 C = cast(long4) _mm256_cvtepu16_epi64(A);
    long[4] correct = [65535, 0, 32768, 32767];
    assert(C.array == correct);
}
1493 
/// Zero-extend packed unsigned 32-bit integers in `a` to packed 64-bit integers.
__m256i _mm256_cvtepu32_epi64 (__m128i a) pure @trusted
{
    static if (GDC_with_AVX2)
    {
        return cast(__m256i) __builtin_ia32_pmovzxdq256(cast(int4)a);
    }
    else static if (LDC_with_optimizations)
    {
        // Direct LLVM zext of all 4 elements.
        enum ir = `
            %r = zext <4 x i32> %0 to <4 x i64>
            ret <4 x i64> %r`;
        return cast(__m256i) LDCInlineIR!(ir, long4, int4)(cast(int4)a);
    }
    else
    {
        // cast(uint) forces zero-extension rather than sign-extension.
        long4 r;
        r.ptr[0] = cast(uint)a.array[0];
        r.ptr[1] = cast(uint)a.array[1];
        r.ptr[2] = cast(uint)a.array[2];
        r.ptr[3] = cast(uint)a.array[3];
        return cast(__m256i)r; 
    }
}
unittest
{
    __m128i A = _mm_setr_epi32(-1, 0, int.min, int.max);
    long4 C = cast(long4) _mm256_cvtepu32_epi64(A);
    long[4] correct = [uint.max, 0, 2_147_483_648, int.max];
    assert(C.array == correct);
}
1525 
/// Zero-extend packed unsigned 8-bit integers in `a` to packed 16-bit integers.
__m256i _mm256_cvtepu8_epi16 (__m128i a) pure @trusted
{
    static if (GDC_with_AVX2)
    {
        return cast(__m256i) __builtin_ia32_pmovzxbw256(cast(ubyte16)a);
    }
    else static if (LDC_with_optimizations)
    {
        // Direct LLVM zext of the whole 16-element vector.
        enum ir = `
            %r = zext <16 x i8> %0 to <16 x i16>
            ret <16 x i16> %r`;
        return cast(__m256i) LDCInlineIR!(ir, short16, byte16)(cast(byte16)a);
    }
    else
    {
        // cast(ubyte) forces zero-extension rather than sign-extension.
        short16 r;
        byte16 ba = cast(byte16)a;
        for (int n = 0; n < 16; ++n)
        {
            r.ptr[n] = cast(ubyte)ba.array[n];
        }
        return cast(__m256i)r; 
    }
}
unittest
{
    __m128i A = _mm_setr_epi8(-1, 0, -128, 127, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13);
    short16 C = cast(short16) _mm256_cvtepu8_epi16(A);
    short[16] correct     = [255, 0,  128, 127, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13];
    assert(C.array == correct);
}
1558 
/// Zero-extend packed unsigned 8-bit integers in `a` to packed 32-bit integers.
/// Only the low 8 elements of `a` are used.
__m256i _mm256_cvtepu8_epi32 (__m128i a) pure @trusted
{
    static if (GDC_with_AVX2)
    {
        return cast(__m256i) __builtin_ia32_pmovzxbd256(cast(ubyte16)a);
    }
    else static if (LDC_with_optimizations)
    {
        // Select the low 8 elements, then LLVM zext to i32.
        enum ir = `
            %v = shufflevector <16 x i8> %0,<16 x i8> %0, <8 x i32> <i32 0, i32 1,i32 2, i32 3, i32 4, i32 5,i32 6, i32 7>
            %r = zext <8 x i8> %v to <8 x i32>
            ret <8 x i32> %r`;
        return cast(__m256i) LDCInlineIR!(ir, int8, byte16)(cast(byte16)a);
    }
    else
    {
        // cast(ubyte) forces zero-extension rather than sign-extension.
        int8 r;
        byte16 ba = cast(byte16)a;
        for (int n = 0; n < 8; ++n)
        {
            r.ptr[n] = cast(ubyte)ba.array[n];
        }
        return cast(__m256i)r; 
    }
}
unittest
{
    __m128i A = _mm_setr_epi8(-1, 0, -128, 127, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13);
    int8 C = cast(int8) _mm256_cvtepu8_epi32(A);
    int[8] correct     = [255, 0,  128, 127, 2, 3, 4, 5];
    assert(C.array == correct);
}
1592 
/// Zero-extend packed unsigned 8-bit integers in `a` to packed 64-bit integers.
/// Only the low 4 elements of `a` are used.
__m256i _mm256_cvtepu8_epi64 (__m128i a) pure @trusted
{
    // PERF ARM64+LDC, not awesome
    static if (GDC_with_AVX2)
    {
        return cast(__m256i) __builtin_ia32_pmovzxbq256(cast(ubyte16)a);
    }
    else static if (LDC_with_optimizations)
    {
        // Select the low 4 elements, then LLVM zext to i64.
        enum ir = `
            %v = shufflevector <16 x i8> %0,<16 x i8> %0, <4 x i32> <i32 0, i32 1,i32 2, i32 3>
            %r = zext <4 x i8> %v to <4 x i64>
            ret <4 x i64> %r`;
        return cast(__m256i) LDCInlineIR!(ir, long4, byte16)(cast(byte16)a);
    }
    else
    {
        // cast(ubyte) forces zero-extension rather than sign-extension.
        long4 r;
        byte16 ba = cast(byte16)a;
        for (int n = 0; n < 4; ++n)
        {
            r.ptr[n] = cast(ubyte)ba.array[n];
        }
        return cast(__m256i)r; 
    }
}
unittest
{
    __m128i A = _mm_setr_epi8(-1, 0, -128, 127, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13);
    long4 C = cast(long4) _mm256_cvtepu8_epi64(A);
    long[4] correct     = [255, 0,  128, 127];
    assert(C.array == correct);
}
1627 
/// Extract a 16-bit integer from `a`, selected with `index`.
/// Like the hardware instruction, only the low 4 bits of `index` matter.
int _mm256_extract_epi16 (__m256i a, int index) pure @trusted
{
    short16 lanes = cast(short16)a;
    return lanes.ptr[index & 15];
}
unittest
{
    short16 v;
    v = 43;
    assert(_mm256_extract_epi16(cast(__m256i)v, 7) == 43);
}
1640 
/// Extract an 8-bit integer from `a`, selected with `index`.
/// Like the hardware instruction, only the low 5 bits of `index` matter.
int _mm256_extract_epi8 (__m256i a, int index) pure @trusted
{
    byte32 lanes = cast(byte32)a;
    return lanes.ptr[index & 31];
}
unittest
{
    byte32 v;
    v = -44;
    assert(_mm256_extract_epi8(cast(__m256i)v, 5) == -44);
    assert(_mm256_extract_epi8(cast(__m256i)v, 5 + 32) == -44);
}
1654 
/// Extract 128 bits (composed of integer data) from `a`, selected with `imm8`.
/// `imm8 == 0` selects the low lane, `imm8 == 1` the high lane.
__m128i _mm256_extracti128_si256(int imm8)(__m256i a) pure @trusted
    if ( (imm8 == 0) || (imm8 == 1) )
{
    pragma(inline, true);

    static if (GDC_with_AVX2)
    {
        return cast(__m128i) __builtin_ia32_extract128i256(a, imm8);
    }
    else static if (LDC_with_optimizations)
    {
        // Lane selection expressed as an LLVM shufflevector on 64-bit elements.
        enum str = (imm8 == 1) ? "<i32 2, i32 3>" : "<i32 0, i32 1>";
        enum ir = "%r = shufflevector <4 x i64> %0, <4 x i64> undef, <2 x i32>" ~ str ~ "\n" ~
                  "ret <2 x i64> %r";
        return cast(__m128i) LDCInlineIR!(ir, ulong2, ulong4)(cast(ulong4)a);
    }
    else
    {
        // Scalar fallback; imm8 is a compile-time constant so the ternaries fold.
        long4 al = cast(long4) a;
        long2 ret;
        ret.ptr[0] = (imm8==1) ? al.array[2] : al.array[0];
        ret.ptr[1] = (imm8==1) ? al.array[3] : al.array[1];
        return cast(__m128i) ret;
    }
}
unittest
{
    __m256i A = _mm256_setr_epi32( -7, -1, 0, 9, -100, 100, 234, 432 );
    int[4] correct0 = [ -7, -1, 0, 9 ];
    int[4] correct1 = [ -100, 100, 234, 432 ];
    __m128i R0 = _mm256_extracti128_si256!(0)(A);
    __m128i R1 = _mm256_extracti128_si256!(1)(A);
    assert(R0.array == correct0);
    assert(R1.array == correct1);
}
1691 
/// Horizontally add adjacent pairs of 16-bit integers in `a` and `b`, and pack the signed 16-bit results.
/// Note: additions wrap on overflow (see the 32767 results in the unittest).
__m256i _mm256_hadd_epi16 (__m256i a, __m256i b) pure @safe
{
    static if (GDC_or_LDC_with_AVX2)
    {
        return cast(__m256i) __builtin_ia32_phaddw256(cast(short16)a, cast(short16)b);
    }
    else
    {
        // Per-lane semantics: horizontally add each 128-bit lane with the SSSE3 intrinsic.
        __m128i a_lo = _mm256_extractf128_si256!0(a);
        __m128i a_hi = _mm256_extractf128_si256!1(a);
        __m128i b_lo = _mm256_extractf128_si256!0(b);
        __m128i b_hi = _mm256_extractf128_si256!1(b);
        __m128i r_lo = _mm_hadd_epi16(a_lo, b_lo);
        __m128i r_hi = _mm_hadd_epi16(a_hi, b_hi);
        return _mm256_set_m128i(r_hi, r_lo);
    }
}
unittest
{
    __m256i A = _mm256_setr_epi16(1, -2, 4, 8, 16, 32, -1, -32768, 1, -2, 4, 8, 16, 32, -1, -32768);
    short16 C = cast(short16) _mm256_hadd_epi16(A, A);
    short[16] correct = [ -1, 12, 48, 32767, -1, 12, 48, 32767,  -1, 12, 48, 32767, -1, 12, 48, 32767];
    assert(C.array == correct);
}
1717 
/// Horizontally add adjacent pairs of 32-bit integers in `a` and `b`, and pack the signed 32-bit results.
__m256i _mm256_hadd_epi32 (__m256i a, __m256i b) pure @safe
{
    static if (GDC_or_LDC_with_AVX2)
    {
        // Single vphaddd instruction.
        return cast(__m256i) __builtin_ia32_phaddd256(cast(int8)a, cast(int8)b);
    }
    else
    {
        // Fallback: split into 128-bit halves and apply the SSSE3
        // horizontal add per half (phaddd operates per 128-bit lane).
        __m128i a_lo = _mm256_extractf128_si256!0(a);
        __m128i a_hi = _mm256_extractf128_si256!1(a);
        __m128i b_lo = _mm256_extractf128_si256!0(b);
        __m128i b_hi = _mm256_extractf128_si256!1(b);
        __m128i r_lo = _mm_hadd_epi32(a_lo, b_lo);
        __m128i r_hi = _mm_hadd_epi32(a_hi, b_hi);
        return _mm256_set_m128i(r_hi, r_lo);
    }
}
unittest
{
    // int.min + -1 wraps to int.max; int.max + 1 wraps to int.min (no saturation).
    __m256i A = _mm256_setr_epi32(1, -2, int.min, -1, 1, -2, int.min, -1);
    __m256i B = _mm256_setr_epi32(1, int.max, 4, -4, 1, int.max, 4, -4);
    int8 C = cast(int8) _mm256_hadd_epi32(A, B);
    int[8] correct = [ -1, int.max, int.min, 0, -1, int.max, int.min, 0 ];
    assert(C.array == correct);
}
1744 
/// Horizontally add adjacent pairs of signed 16-bit integers in `a` and `b` using saturation, and pack the signed 16-bit results.
__m256i _mm256_hadds_epi16 (__m256i a, __m256i b) pure @safe
{
    static if (GDC_or_LDC_with_AVX2)
    {
        // Single vphaddsw instruction.
        return cast(__m256i) __builtin_ia32_phaddsw256(cast(short16)a, cast(short16)b);
    }
    else
    {
        // Fallback: split into 128-bit halves and apply the SSSE3 saturating
        // horizontal add per half (vphaddsw operates per 128-bit lane).
        __m128i a_lo = _mm256_extractf128_si256!0(a);
        __m128i a_hi = _mm256_extractf128_si256!1(a);
        __m128i b_lo = _mm256_extractf128_si256!0(b);
        __m128i b_hi = _mm256_extractf128_si256!1(b);
        __m128i r_lo = _mm_hadds_epi16(a_lo, b_lo);
        __m128i r_hi = _mm_hadds_epi16(a_hi, b_hi);
        return _mm256_set_m128i(r_hi, r_lo);
    }
}
unittest
{
    // -1 + -32768 saturates to -32768 (contrast with _mm256_hadd_epi16 which wraps).
    __m256i A = _mm256_setr_epi16(1, -2, 4, 8, 16, 32, -1, -32768, 1, -2, 4, 8, 16, 32, -1, -32768);
    short16 C = cast(short16) _mm256_hadds_epi16(A, A);
    short[16] correct = [ -1, 12, 48, -32768, -1, 12, 48, -32768, -1, 12, 48, -32768, -1, 12, 48, -32768];
    assert(C.array == correct);
}
1770 
/// Horizontally subtract adjacent pairs of 16-bit integers in `a` and `b`, and pack the signed 16-bit results.
__m256i _mm256_hsub_epi16 (__m256i a, __m256i b) pure @safe
{
    static if (GDC_or_LDC_with_AVX2)
    {
        // Single vphsubw instruction.
        return cast(__m256i) __builtin_ia32_phsubw256(cast(short16)a, cast(short16)b);
    }
    else
    {
        // Fallback: split into 128-bit halves and apply the SSSE3 16-bit
        // horizontal subtract per half (vphsubw operates per 128-bit lane).
        // Fixed: this path previously called the 32-bit variant _mm_hsub_epi32,
        // producing wrong results on non-AVX2 targets.
        __m128i a_lo = _mm256_extractf128_si256!0(a);
        __m128i a_hi = _mm256_extractf128_si256!1(a);
        __m128i b_lo = _mm256_extractf128_si256!0(b);
        __m128i b_hi = _mm256_extractf128_si256!1(b);
        __m128i r_lo = _mm_hsub_epi16(a_lo, b_lo);
        __m128i r_hi = _mm_hsub_epi16(a_hi, b_hi);
        return _mm256_set_m128i(r_hi, r_lo);
    }
}
unittest
{
    // 16-bit horizontal subtract wraps around (no saturation):
    // 32767 - (-1) wraps to -32768, and -10 - 32767 wraps to 32759.
    __m256i A = _mm256_setr_epi16(1, -2, 4, 8, 32767, -1, -10, 32767, 1, -2, 4, 8, 32767, -1, -10, 32767);
    short16 C = cast(short16) _mm256_hsub_epi16(A, A);
    short[16] correct = [ 3, -4, -32768, 32759, 3, -4, -32768, 32759, 3, -4, -32768, 32759, 3, -4, -32768, 32759 ];
    assert(C.array == correct);
}
1797 
/// Horizontally subtract adjacent pairs of 32-bit integers in `a` and `b`, and pack the signed 32-bit results.
__m256i _mm256_hsub_epi32 (__m256i a, __m256i b) pure @safe
{
    static if (GDC_or_LDC_with_AVX2)
    {
        // Single vphsubd instruction.
        return cast(__m256i) __builtin_ia32_phsubd256(cast(int8)a, cast(int8)b);
    }
    else
    {
        // Fallback: split into 128-bit halves and apply the SSSE3
        // horizontal subtract per half (vphsubd operates per 128-bit lane).
        __m128i a_lo = _mm256_extractf128_si256!0(a);
        __m128i a_hi = _mm256_extractf128_si256!1(a);
        __m128i b_lo = _mm256_extractf128_si256!0(b);
        __m128i b_hi = _mm256_extractf128_si256!1(b);
        __m128i r_lo = _mm_hsub_epi32(a_lo, b_lo);
        __m128i r_hi = _mm_hsub_epi32(a_hi, b_hi);
        return _mm256_set_m128i(r_hi, r_lo);
    }
}
unittest
{
    // int.min - 1 wraps to int.max; 1 - 2 = -1 (no saturation in hsub).
    __m256i A = _mm256_setr_epi32(1, 2, int.min, 1, 1, 2, int.min, 1);
    __m256i B = _mm256_setr_epi32(int.max, -1, 4, 4, int.max, -1, 4, 4);
    int8 C = cast(int8) _mm256_hsub_epi32(A, B);
    int[8] correct = [ -1, int.max, int.min, 0,  -1, int.max, int.min, 0 ];
    assert(C.array == correct);
}
1824 
/// Horizontally subtract adjacent pairs of signed 16-bit integers in `a` and `b` using saturation, and pack the signed 16-bit results.
__m256i _mm256_hsubs_epi16 (__m256i a, __m256i b) pure @safe
{
    static if (GDC_or_LDC_with_AVX2)
    {
        // Single vphsubsw instruction.
        return cast(__m256i) __builtin_ia32_phsubsw256(cast(short16)a, cast(short16)b);
    }
    else
    {
        // Fallback: split into 128-bit halves and apply the SSSE3 saturating
        // horizontal subtract per half (vphsubsw operates per 128-bit lane).
        __m128i a_lo = _mm256_extractf128_si256!0(a);
        __m128i a_hi = _mm256_extractf128_si256!1(a);
        __m128i b_lo = _mm256_extractf128_si256!0(b);
        __m128i b_hi = _mm256_extractf128_si256!1(b);
        __m128i r_lo = _mm_hsubs_epi16(a_lo, b_lo);
        __m128i r_hi = _mm_hsubs_epi16(a_hi, b_hi);
        return _mm256_set_m128i(r_hi, r_lo);
    }
}
unittest
{
    // 32767 - (-1) saturates to 32767; -10 - 32767 saturates to -32768.
    __m256i A = _mm256_setr_epi16(1, -2, 4, 8, 32767, -1, -10, 32767, 1, -2, 4, 8, 32767, -1, -10, 32767);
    short16 C = cast(short16) _mm256_hsubs_epi16(A, A);
    short[16] correct = [ 3, -4, 32767, -32768, 3, -4, 32767, -32768, 3, -4, 32767, -32768, 3, -4, 32767, -32768 ];
    assert(C.array == correct);
}
1850 
/// Gather 32-bit integers from memory using 32-bit indices. 32-bit elements are loaded
/// from addresses starting at `base_addr` and offset by each 32-bit element in `vindex`
/// (each index is scaled by the factor in `scale`). Return gathered elements.
/// `scale` should be 1, 2, 4 or 8.
__m128i _mm_i32gather_epi32(int scale)(const(int)* base_addr, __m128i vindex) @system
{
    // An all-ones mask gathers every lane, so the source operand's
    // contents never reach the result.
    __m128i unusedSrc;
    return _mm_mask_i32gather_epi32!scale(unusedSrc, base_addr, vindex, _mm_set1_epi32(-1));
}
unittest
{
    int[8] data = [0, 1, 2, 3, 4, 5, 6, 7];
    __m128i offsets = _mm_setr_epi32(-2, 0, 4, 6);
    int4 R = cast(int4) _mm_i32gather_epi32!2(&data[1], offsets);
    int[4] expected = [0, 1, 3, 4];
    assert(R.array == expected);
}
1869 
/// Gather 32-bit integers from memory using 32-bit indices. 32-bit elements are loaded 
/// from addresses starting at `base_addr` and offset by each 32-bit element in `vindex` 
/// (each index is scaled by the factor in `scale`). Gathered elements are merged using 
/// `mask` (elements are copied from `src` when the highest bit is not set in the 
/// corresponding element). `scale` should be 1, 2, 4 or 8.
__m128i _mm_mask_i32gather_epi32(int scale)(__m128i src, const(int)* base_addr, __m128i vindex, __m128i mask) @system
{
    static assert(isValidSIBScale(scale));
    static if (LDC_with_AVX2)
    {
        return cast(__m128i) __builtin_ia32_gatherd_d(src, base_addr, vindex, mask, cast(ubyte)scale);
    }
    else static if (GDC_with_AVX2)
    {
        // Not pure, so the intrinsic cannot be pure.
        return cast(__m128i) __builtin_ia32_gathersiv4si (src, base_addr, vindex, mask, scale);
    }
    else
    {
        // Scalar fallback: per lane, load from base_addr + index*scale if the
        // mask lane's sign bit is set, else keep the corresponding src lane.
        __m128i r;
        for (int n = 0; n < 4; ++n)
        {
            int index = vindex.array[n];
            // Widen before scaling so a negative or large index doesn't
            // overflow 32-bit arithmetic.
            long offset = cast(long)index * scale;
            void* p = cast(void*)(base_addr);
            if (mask.array[n] < 0)
                r.ptr[n] = *cast(int*)(p + offset);
            else
                r.ptr[n] = src.ptr[n];
        }
        return r;
    }
}
unittest
{
    int[24] data = [0, 1, 2, 3, 
                    4, 5, 6, 7, 
                    8, 9, 10, 11, 
                    12, 13, 14, 15,
                    16, 17, 18, 19,
                    20, 21, 22, 23];
    __m128i src    = _mm_setr_epi32(-1, -2, -3, -4);
    __m128i mask   = _mm_setr_epi32(-4,  4, -1, -2);
    __m128i vindex = _mm_setr_epi32(-4,  4,  0,  8);

    // Lane 1's mask (4) is non-negative, so src lane -2 passes through.
    int4 A = cast(int4) _mm_mask_i32gather_epi32!1(src, &data[10], vindex, mask);
    int4 B = cast(int4) _mm_mask_i32gather_epi32!2(src, &data[10], vindex, mask);
    int4 C = cast(int4) _mm_mask_i32gather_epi32!4(src, &data[10], vindex, mask);
    int[4] correctA = [9, -2, 10, 12];
    int[4] correctB = [8, -2, 10, 14];
    int[4] correctC = [6, -2, 10, 18];
    assert(A.array == correctA);
    assert(B.array == correctB);
    assert(C.array == correctC);
}
1925 
/// Gather 32-bit integers from memory using 32-bit indices. 32-bit elements are loaded
/// from addresses starting at `base_addr` and offset by each 32-bit element in `vindex`
/// (each index is scaled by the factor in `scale`). Gathered elements are returned.
/// `scale` should be 1, 2, 4 or 8.
__m256i _mm256_i32gather_epi32(int scale)(const(int)* base_addr, __m256i vindex) @system
{
    // An all-ones mask gathers every lane, so the source operand's
    // contents never reach the result.
    __m256i unusedSrc;
    return _mm256_mask_i32gather_epi32!scale(unusedSrc, base_addr, vindex, _mm256_set1_epi32(-1));
}
unittest
{
    int[8] data = [0, 1, 2, 3, 4, 5, 6, 7];
    __m256i offsets = _mm256_setr_epi32(-1, 0, 2, 1, -2, -1, 1, 1);
    int8 R = cast(int8) _mm256_i32gather_epi32!4(&data[3], offsets);
    int[8] expected = [2, 3, 5, 4, 1, 2, 4, 4];
    assert(R.array == expected);
}
1944 
/// Gather 32-bit integers from memory using 32-bit indices. 32-bit elements are loaded
/// from addresses starting at `base_addr` and offset by each 32-bit element in `vindex` 
/// (each index is scaled by the factor in `scale`). Gathered elements are merged using mask
/// (elements are copied from `src` when the highest bit is not set in the corresponding element).
/// `scale` should be 1, 2, 4 or 8.
__m256i _mm256_mask_i32gather_epi32(int scale)(__m256i src, const(int)* base_addr, __m256i vindex, __m256i mask) @system
{
    static assert(isValidSIBScale(scale));
    static if (LDC_with_AVX2)
    {
        // Not pure, so the intrinsic cannot be pure.
        return cast(__m256i) __builtin_ia32_gatherd_d256(cast(int8)src, base_addr, cast(int8)vindex, cast(int8)mask, cast(ubyte)scale);
    }
    else static if (GDC_with_AVX2)
    {
        return cast(__m256i) __builtin_ia32_gathersiv8si (cast(int8)src, base_addr, cast(int8)vindex, cast(int8)mask, scale);
    }
    else
    {
        // Scalar fallback: per lane, load from base_addr + index*scale if the
        // mask lane's sign bit is set, else keep the corresponding src lane.
        int8 r;
        int8 vindexi = cast(int8)vindex;
        int8 srci = cast(int8)src;
        int8 maski = cast(int8)mask;
        for (int n = 0; n < 8; ++n)
        {
            int index = vindexi.array[n];
            // Widen before scaling to avoid 32-bit overflow of the byte offset.
            long offset = cast(long)index * scale;
            void* p = cast(void*)(base_addr);
            if (maski.array[n] < 0)
                r.ptr[n] = *cast(int*)(p + offset);
            else
                r.ptr[n] = srci.ptr[n];
        }
        return cast(__m256i)r;
    }
}
unittest
{
    int[24] data = [0, 1, 2, 3, 
                    4, 5, 6, 7, 
                    8, 9, 10, 11, 
                    12, 13, 14, 15,
                    16, 17, 18, 19,
                    20, 21, 22, 23];
    __m256i src    = _mm256_setr_epi32(-1, -2, -3, -4, -5, -6, -7, -8);
    __m256i mask   = _mm256_setr_epi32(-4,  4, -1, -2,  0,  0, -8, -9);
    __m256i vindex = _mm256_setr_epi32(-4,  4,  0,  8,  0, 12, -8,  4);

    // Lanes 1, 4 and 5 have non-negative mask values, so src passes through there.
    int8 A = cast(int8) _mm256_mask_i32gather_epi32!1(src, &data[10], vindex, mask);
    int8 B = cast(int8) _mm256_mask_i32gather_epi32!2(src, &data[10], vindex, mask);
    int8 C = cast(int8) _mm256_mask_i32gather_epi32!4(src, &data[10], vindex, mask);
    int[8] correctA = [9, -2, 10, 12, -5, -6, 8, 11];
    int[8] correctB = [8, -2, 10, 14, -5, -6, 6, 12];
    int[8] correctC = [6, -2, 10, 18, -5, -6, 2, 14];
    assert(A.array == correctA);
    assert(B.array == correctB);
    assert(C.array == correctC);
}
2003 
/// Gather 64-bit integers from memory using 32-bit indices. 64-bit elements are loaded
/// from addresses starting at `base_addr` and offset by each 32-bit element in `vindex`
/// (each index is scaled by the factor in `scale`). Gathered elements are returned.
/// `scale` should be 1, 2, 4 or 8.
__m128i _mm_i32gather_epi64(int scale)(const(long)* base_addr, __m128i vindex) @system
{
    // An all-ones mask gathers every lane, so the source operand's
    // contents never reach the result.
    __m128i unusedSrc;
    return _mm_mask_i32gather_epi64!scale(unusedSrc, base_addr, vindex, _mm_set1_epi64x(-1));
}
unittest
{
    long[8] data = [0, 1, 2, 3, 4, 5, 6, 7];
    // Only the first two 32-bit indices are used for a 2-lane 64-bit gather.
    __m128i offsets = _mm_setr_epi32(-4, 24, 420, 420);
    long2 R = cast(long2) _mm_i32gather_epi64!2(&data[1], offsets);
    long[2] expected = [0, 7];
    assert(R.array == expected);
}
2022 
/// Gather 64-bit integers from memory using 32-bit indices. 64-bit elements are loaded 
/// from addresses starting at `base_addr` and offset by each 32-bit element in `vindex` 
/// (each index is scaled by the factor in `scale`). Gathered elements are merged using mask 
/// (elements are copied from `src` when the highest bit is not set in the corresponding element). 
/// `scale` should be 1, 2, 4 or 8.
__m128i _mm_mask_i32gather_epi64(int scale)(__m128i src, const(long)* base_addr, __m128i vindex, __m128i mask) @system
{
    static assert(isValidSIBScale(scale));
    static if (GDC_with_AVX2)
    {
        return cast(__m128i) __builtin_ia32_gathersiv2di(cast(long2)src, base_addr, cast(int4)vindex, cast(long2)mask, scale);
    }
    else static if (LDC_with_AVX2)
    {
        return cast(__m128i) __builtin_ia32_gatherd_q(cast(long2)src, base_addr, cast(int4)vindex, cast(long2)mask, scale);
    }
    else
    {
        // Note: top 2 indexes in vindex are unused
        // Scalar fallback: per 64-bit lane, load if the 64-bit mask lane's
        // sign bit is set, else keep the corresponding src lane.
        long2 r;
        int4 vindexi = cast(int4)vindex;
        long2 srci = cast(long2)src;
        long2 maski = cast(long2)mask;
        for (int n = 0; n < 2; ++n)
        {
            int index = vindexi.array[n];
            // Widen before scaling to avoid 32-bit overflow of the byte offset.
            long offset = cast(long)index * scale;
            void* p = cast(void*)(base_addr);
            if (maski.array[n] < 0)
                r.ptr[n] = *cast(long*)(p + offset);
            else
                r.ptr[n] = srci.ptr[n];
        }
        return cast(__m128i)r;
    }
}
unittest
{
    long[8] data = [0, 1, 2, 3, 
                    4, 5, 6, 7]; 
    __m128i src    = _mm_setr_epi64(-1, -2);
    __m128i mask   = _mm_setr_epi64(0, -1);
    // Lane 0's mask (0) is non-negative: the wild -400 index is never dereferenced.
    __m128i vindex = _mm_setr_epi32(-400, 3*8, 420, 420);
    long2 A = cast(long2) _mm_mask_i32gather_epi64!2(src, &data[1], vindex, mask);
    long[2] correctA = [-1, 7];
    assert(A.array == correctA);
}
2070 
/// Gather 64-bit integers from memory using 32-bit indices. 64-bit elements are loaded
/// from addresses starting at `base_addr` and offset by each 32-bit element in `vindex`
/// (each index is scaled by the factor in `scale`). Gathered elements are returned.
/// `scale` should be 1, 2, 4 or 8.
__m256i _mm256_i32gather_epi64(int scale)(const(long)* base_addr, __m128i vindex) @system
{
    // An all-ones mask gathers every lane, so the source operand's
    // contents never reach the result.
    __m256i unusedSrc;
    return _mm256_mask_i32gather_epi64!scale(unusedSrc, base_addr, vindex, _mm256_set1_epi64x(-1));
}
unittest
{
    long[8] data = [0, 1, 2, 3, 4, 5, 6, 7];
    __m128i offsets = _mm_setr_epi32(-4, 24, 0, 12);
    long4 R = cast(long4) _mm256_i32gather_epi64!2(&data[1], offsets);
    long[4] expected = [0, 7, 1, 4];
    assert(R.array == expected);
}
2089 
/// Gather 64-bit integers from memory using 32-bit indices. 64-bit elements are loaded 
/// from addresses starting at `base_addr` and offset by each 32-bit element in `vindex` 
/// (each index is scaled by the factor in `scale`). Gathered elements are merged using mask 
/// (elements are copied from `src` when the highest bit is not set in the corresponding element). 
/// `scale` should be 1, 2, 4 or 8.
__m256i _mm256_mask_i32gather_epi64(int scale)(__m256i src, const(long)* base_addr, __m128i vindex, __m256i mask) @system
{
    static assert(isValidSIBScale(scale));
    static if (LDC_with_AVX2)
    {
        return cast(__m256i) __builtin_ia32_gatherd_q256(cast(long4)src, base_addr, cast(int4)vindex, cast(long4)mask, cast(ubyte)scale);
    }
    else static if (GDC_with_AVX2)
    {
        return cast(__m256i) __builtin_ia32_gathersiv4di (cast(long4)src, base_addr, cast(int4)vindex, cast(long4)mask, scale);
    }
    else
    {
        // Scalar fallback: per 64-bit lane, load if the 64-bit mask lane's
        // sign bit is set, else keep the corresponding src lane.
        long4 r;
        int4 vindexi = cast(int4)vindex;
        long4 srci = cast(long4)src;
        long4 maski = cast(long4)mask;
        for (int n = 0; n < 4; ++n)
        {
            int index = vindexi.array[n];
            // Widen before scaling to avoid 32-bit overflow of the byte offset.
            long offset = cast(long)index * scale;
            void* p = cast(void*)(base_addr);
            if (maski.array[n] < 0)
                r.ptr[n] = *cast(long*)(p + offset);
            else
                r.ptr[n] = srci.ptr[n];
        }
        return cast(__m256i)r;
    }
}
unittest
{
    long[8] data = [0, 1, 2, 3, 
                    4, 5, 6, 7]; 
    __m256i src    = _mm256_setr_epi64(-1, -2, -3, -4);
    __m256i mask   = _mm256_setr_epi64(0, -1, 0, -1);
    // Masked-off lanes (0 and 2) never dereference their wild indexes.
    __m128i vindex = _mm_setr_epi32(-400, 3*8, 420, 4);
    long4 A = cast(long4) _mm256_mask_i32gather_epi64!2(src, &data[1], vindex, mask);
    long[4] correctA = [-1, 7, -3, 2];
    assert(A.array == correctA);
}
2136 
2137 // Note: the floating point gathers reuse the integer gathers
2138 
/// Gather double-precision (64-bit) floating-point elements from memory using 32-bit indices.
/// 64-bit elements are loaded from addresses starting at `base_addr` and offset by each 32-bit
/// element in `vindex` (each index is scaled by the factor in `scale`). Gathered elements are returned.
/// `scale` should be 1, 2, 4 or 8.
__m128d _mm_i32gather_pd(int scale)(const(double)* base_addr, __m128i vindex) @system
{
    // Reuse the 64-bit integer gather; the bit pattern is preserved through casts.
    const(long)* asLongs = cast(const(long)*) base_addr;
    __m128i bits = _mm_i32gather_epi64!scale(asLongs, vindex);
    return cast(__m128d) bits;
}
unittest
{
    double[8] data = [0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0];
    __m128i offsets = _mm_setr_epi32(-4, 24, 420, 420);
    __m128d R = _mm_i32gather_pd!2(&data[1], offsets);
    double[2] expected = [0.0, 7.0];
    assert(R.array == expected);
}
2156 
/// Gather double-precision (64-bit) floating-point elements from memory using 32-bit indices.
/// 64-bit elements are loaded from addresses starting at `base_addr` and offset by each 32-bit
/// element in `vindex` (each index is scaled by the factor in `scale`). Gathered elements are merged using `mask`
/// (elements are copied from `src` when the highest bit is not set in the corresponding element).
/// `scale` should be 1, 2, 4 or 8.
__m128d _mm_mask_i32gather_pd(int scale)(__m128d src, const(double)* base_addr, __m128i vindex, __m128d mask) @system
{
    // Reuse the masked 64-bit integer gather; only bit patterns matter here.
    __m128i srcBits  = cast(__m128i) src;
    __m128i maskBits = cast(__m128i) mask;
    const(long)* asLongs = cast(const(long)*) base_addr;
    return cast(__m128d) _mm_mask_i32gather_epi64!scale(srcBits, asLongs, vindex, maskBits);
}
unittest
{
    double[8] data = [0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0];
    __m128d passthru = _mm_setr_pd(-1.0, -2.0);
    __m128d gmask    = _mm_setr_pd(0.0, -1.0);
    __m128i offsets  = _mm_setr_epi32(-400, 3*8, 420, 420);
    __m128d R = _mm_mask_i32gather_pd!2(passthru, &data[1], offsets, gmask);
    double[2] expected = [-1.0, 7.0];
    assert(R.array == expected);
}
2177 
/// Gather double-precision (64-bit) floating-point elements from memory using 32-bit indices.
/// 64-bit elements are loaded from addresses starting at `base_addr` and offset by each 32-bit
/// element in `vindex` (each index is scaled by the factor in `scale`). Gathered elements are returned.
/// `scale` should be 1, 2, 4 or 8.
__m256d _mm256_i32gather_pd(int scale)(const(double)* base_addr, __m128i vindex) @system
{
    // Reuse the 64-bit integer gather; the bit pattern is preserved through casts.
    const(long)* asLongs = cast(const(long)*) base_addr;
    __m256i bits = _mm256_i32gather_epi64!scale(asLongs, vindex);
    return cast(__m256d) bits;
}
unittest
{
    double[8] data = [0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0];
    __m128i offsets = _mm_setr_epi32(-4, 24, 0, 12);
    __m256d R = _mm256_i32gather_pd!2(&data[1], offsets);
    double[4] expected = [0.0, 7.0, 1.0, 4.0];
    assert(R.array == expected);
}
2195 
/// Gather double-precision (64-bit) floating-point elements from memory using 32-bit indices.
/// 64-bit elements are loaded from addresses starting at `base_addr` and offset by each 32-bit
/// element in `vindex` (each index is scaled by the factor in `scale`). Gathered elements are merged using `mask`
/// (elements are copied from `src` when the highest bit is not set in the corresponding element).
/// `scale` should be 1, 2, 4 or 8.
__m256d _mm256_mask_i32gather_pd(int scale)(__m256d src, const(double)* base_addr, __m128i vindex, __m256d mask) @system
{
    // Reuse the masked 64-bit integer gather; only bit patterns matter here.
    __m256i srcBits  = cast(__m256i) src;
    __m256i maskBits = cast(__m256i) mask;
    const(long)* asLongs = cast(const(long)*) base_addr;
    return cast(__m256d) _mm256_mask_i32gather_epi64!scale(srcBits, asLongs, vindex, maskBits);
}
unittest
{    
    double[8] data = [0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0];
    __m256d passthru = _mm256_setr_pd(-1.0, -2.0, -3.0, -4.0);
    __m256d gmask    = _mm256_setr_pd(0.0, -1.0, 0.0, -1.0);
    __m128i offsets  = _mm_setr_epi32(-400, 3*8, 420, 4);
    __m256d R = _mm256_mask_i32gather_pd!2(passthru, &data[1], offsets, gmask);
    double[4] expected = [-1.0, 7.0, -3.0, 2.0];
    assert(R.array == expected);
}
2216 
/// Gather single-precision (32-bit) floating-point elements from memory using 32-bit indices.
/// 32-bit elements are loaded from addresses starting at `base_addr` and offset by each 32-bit
/// element in `vindex` (each index is scaled by the factor in `scale`). Gathered elements are returned.
/// `scale` should be 1, 2, 4 or 8.
__m128 _mm_i32gather_ps(int scale)(const(float)* base_addr, __m128i vindex) @system
{
    // Reuse the 32-bit integer gather; the bit pattern is preserved through casts.
    const(int)* asInts = cast(const(int)*) base_addr;
    __m128i bits = _mm_i32gather_epi32!scale(asInts, vindex);
    return cast(__m128) bits;
}
unittest
{
    float[8] data = [0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f];
    __m128i offsets = _mm_setr_epi32(-2, 12, 0, 4);
    __m128 R = _mm_i32gather_ps!2(&data[1], offsets);
    float[4] expected = [0.0f, 7.0f, 1.0f, 3.0f];
    assert(R.array == expected);
}
2234 
/// Gather single-precision (32-bit) floating-point elements from memory using 32-bit indices.
/// 32-bit elements are loaded from addresses starting at `base_addr` and offset by each 32-bit
/// element in `vindex` (each index is scaled by the factor in `scale`). Gathered elements are merged using `mask`
/// (elements are copied from `src` when the highest bit is not set in the corresponding element).
/// `scale` should be 1, 2, 4 or 8.
__m128 _mm_mask_i32gather_ps(int scale)(__m128 src, const(float)* base_addr, __m128i vindex, __m128 mask) @system
{
    // Reuse the masked 32-bit integer gather; only bit patterns matter here.
    __m128i srcBits  = cast(__m128i) src;
    __m128i maskBits = cast(__m128i) mask;
    const(int)* asInts = cast(const(int)*) base_addr;
    return cast(__m128) _mm_mask_i32gather_epi32!scale(srcBits, asInts, vindex, maskBits);
}
unittest
{
    float[24] data = [0.0f, 1.0f, 2.0f, 3.0f, 
                      4.0f, 5.0f, 6.0f, 7.0f, 
                      8.0f, 9.0f, 10.0f, 11.0f, 
                      12.0f, 13.0f, 14.0f, 15.0f,
                      16.0f, 17.0f, 18.0f, 19.0f,
                      20.0f, 21.0f, 22.0f, 23.0f];
    __m128 passthru = _mm_setr_ps(-1.0f, -2.0f, -3.0f, -4.0f);
    __m128 gmask    = _mm_setr_ps(-4.0f,  4.0f, -1.0f, -2.0f);
    __m128i offsets = _mm_setr_epi32(-4,  4,  0,  8);
    __m128 R = _mm_mask_i32gather_ps!1(passthru, &data[10], offsets, gmask);
    float[4] expected = [9.0f, -2.0f, 10.0f, 12.0f];
    assert(R.array == expected);
}
2259 
/// Gather single-precision (32-bit) floating-point elements from memory using 32-bit indices.
/// 32-bit elements are loaded from addresses starting at `base_addr` and offset by each 32-bit
/// element in `vindex` (each index is scaled by the factor in `scale`). Gathered elements are returned.
/// `scale` should be 1, 2, 4 or 8.
__m256 _mm256_i32gather_ps(int scale)(const(float)* base_addr, __m256i vindex) @system
{
    // Reuse the 32-bit integer gather; the bit pattern is preserved through casts.
    const(int)* asInts = cast(const(int)*) base_addr;
    __m256i bits = _mm256_i32gather_epi32!scale(asInts, vindex);
    return cast(__m256) bits;
}
unittest
{
    float[8] data = [0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f];
    __m256i offsets = _mm256_setr_epi32(-1, 0, 2, 1, -2, -1, 1, 1);
    __m256 R = _mm256_i32gather_ps!4(&data[3], offsets);
    float[8] expected = [2.0f, 3.0f, 5.0f, 4.0f, 1.0f, 2.0f, 4.0f, 4.0f];
    assert(R.array == expected);
}
2277 
/// Gather single-precision (32-bit) floating-point elements from memory using 32-bit indices.
/// 32-bit elements are loaded from addresses starting at `base_addr` and offset by each 32-bit
/// element in `vindex` (each index is scaled by the factor in `scale`). Gathered elements are merged using `mask`
/// (elements are copied from `src` when the highest bit is not set in the corresponding element).
/// `scale` should be 1, 2, 4 or 8.
__m256 _mm256_mask_i32gather_ps(int scale)(__m256 src, const(float)* base_addr, __m256i vindex, __m256 mask) @system
{
    // Reuse the masked 32-bit integer gather; only bit patterns matter here.
    __m256i srcBits  = cast(__m256i) src;
    __m256i maskBits = cast(__m256i) mask;
    const(int)* asInts = cast(const(int)*) base_addr;
    return cast(__m256) _mm256_mask_i32gather_epi32!scale(srcBits, asInts, vindex, maskBits);
}
unittest 
{
    float[24] data = [0.0f, 1.0f, 2.0f, 3.0f,
                      4.0f, 5.0f, 6.0f, 7.0f,
                      8.0f, 9.0f, 10.0f, 11.0f,
                      12.0f, 13.0f, 14.0f, 15.0f,
                      16.0f, 17.0f, 18.0f, 19.0f,
                      20.0f, 21.0f, 22.0f, 23.0f];
    __m256 passthru = _mm256_setr_ps(-1.0f, -2.0f, -3.0f, -4.0f, -5.0f, -6.0f, -7.0f, -8.0f);
    __m256 gmask    = _mm256_setr_ps(-4.0f, 4.0f, -1.0f, -2.0f, 0.0f, 0.0f, -8.0f, -9.0f);
    __m256i offsets = _mm256_setr_epi32(-4, 4, 0, 8, 0, 12, -8, 4);

    __m256 R = _mm256_mask_i32gather_ps!2(passthru, &data[10], offsets, gmask);
    float[8] expected = [8.0f, -2.0f, 10.0f, 14.0f, -5.0f, -6.0f, 6.0f, 12.0f];
    assert(R.array == expected);
}
2303 
/// Gather 32-bit integers from memory using 64-bit indices. 32-bit elements are loaded
/// from addresses starting at `base_addr` and offset by each 64-bit element in `vindex`
/// (each index is scaled by the factor in `scale`). Return gathered elements.
/// `scale` should be 1, 2, 4 or 8.
__m128i _mm_i64gather_epi32(int scale)(const(int)* base_addr, __m128i vindex) @system
{
    // An all-ones mask gathers every available lane; the source operand's
    // contents never reach the result.
    __m128i unusedSrc;
    return _mm_mask_i64gather_epi32!scale(unusedSrc, base_addr, vindex, _mm_set1_epi32(-1));
}
unittest
{
    int[8] data = [0, 1, 2, 3, 4, 5, 6, 7];
    __m128i offsets = _mm_setr_epi64(-2, 4);
    // Only two 64-bit indices: the upper two result lanes are zeroed.
    int4 R = cast(int4) _mm_i64gather_epi32!2(&data[1], offsets);
    int[4] expected = [0, 3, 0, 0];
    assert(R.array == expected);
}
2322 
/// Gather 32-bit integers from memory using 64-bit indices. 32-bit elements are loaded 
/// from addresses starting at `base_addr` and offset by each 64-bit element in `vindex` 
/// (each index is scaled by the factor in `scale`). Gathered elements are merged using 
/// `mask` (elements are copied from `src` when the highest bit is not set in the 
/// corresponding element). `scale` should be 1, 2, 4 or 8.
__m128i _mm_mask_i64gather_epi32(int scale)(__m128i src, const(int)* base_addr, __m128i vindex, __m128i mask) @system
{
    static assert(isValidSIBScale(scale));

    static if (GDC_with_AVX2)
    {
        return cast(__m128i) __builtin_ia32_gatherdiv4si(cast(int4)src, base_addr, cast(long2)vindex, cast(int4)mask, scale);
    }
    else static if (LDC_with_AVX2)
    {
        return cast(__m128i) __builtin_ia32_gatherq_d(cast(int4)src, base_addr, cast(long2)vindex, cast(int4)mask, scale);
    }
    else
    {
        // Scalar fallback: only two 64-bit indices exist, so only the low two
        // 32-bit result lanes are produced; the top two are zeroed, matching
        // the hardware vpgatherqd behavior.
        __m128i r;
        long2 vindexl = cast(long2)vindex;
        int4 srci = cast(int4)src;
        int4 maski = cast(int4)mask;
        for (int n = 0; n < 2; ++n)
        {
            long index = vindexl.array[n];
            long offset = index * scale;
            void* p = cast(void*)(base_addr);
            if (maski.array[n] < 0)
                r.ptr[n] = *cast(int*)(p + offset);
            else
                r.ptr[n] = srci.array[n];
        }
        // Upper two lanes are always zero for a 64-bit-index, 32-bit-data gather.
        r.ptr[2] = 0;
        r.ptr[3] = 0;
        return r;
    }
}
unittest
{
    int[24] data = [0, 1, 2, 3, 
                    4, 5, 6, 7, 
                    8, 9, 10, 11, 
                    12, 13, 14, 15,
                    16, 17, 18, 19,
                    20, 21, 22, 23];
    __m128i src    = _mm_setr_epi32(-1, -2, -3, -4);
    __m128i mask   = _mm_setr_epi32(-4,  4, -1, -2);
    __m128i vindex = _mm_setr_epi64(-4,  8);
    // Lane 1's mask (4) is non-negative, so src lane -2 passes through.
    int4 C = cast(int4) _mm_mask_i64gather_epi32!4(src, &data[10], vindex, mask);
    int[4] correctC = [6, -2, 0, 0];
    assert(C.array == correctC);
}
2376 
/// Gather 32-bit integers from memory using 64-bit indices. 32-bit elements are loaded
/// from addresses starting at `base_addr` and offset by each 64-bit element in `vindex`
/// (each index is scaled by the factor in `scale`). Return gathered elements.
/// `scale` should be 1, 2, 4 or 8.
__m128i _mm256_i64gather_epi32(int scale)(const(int)* base_addr, __m256i vindex) @system
{
    // An all-ones mask gathers every lane, so the source operand's
    // contents never reach the result.
    __m128i unusedSrc;
    return _mm256_mask_i64gather_epi32!scale(unusedSrc, base_addr, vindex, _mm_set1_epi32(-1));
}
unittest
{
    int[8] data = [0, 1, 2, 3, 4, 5, 6, 7];
    __m256i offsets = _mm256_setr_epi64(-2, 4, 0, 2);
    int4 R = cast(int4) _mm256_i64gather_epi32!2(&data[1], offsets);
    int[4] expected = [0, 3, 1, 2];
    assert(R.array == expected);
}
2395 
/// Gather 32-bit integers from memory using 64-bit indices. 32-bit elements are loaded 
/// from addresses starting at `base_addr` and offset by each 64-bit element in `vindex` 
/// (each index is scaled by the factor in `scale`). Gathered elements are merged using 
/// `mask` (elements are copied from `src` when the highest bit is not set in the 
/// corresponding element). `scale` should be 1, 2, 4 or 8.
__m128i _mm256_mask_i64gather_epi32(int scale)(__m128i src, const(int)* base_addr, __m256i vindex, __m128i mask) @system
{
    static assert(isValidSIBScale(scale));

    static if (GDC_with_AVX2)
    {
        return cast(__m128i) __builtin_ia32_gatherdiv4si256(cast(int4)src, base_addr, cast(long4)vindex, cast(int4)mask, scale);
    }
    else static if (LDC_with_AVX2)
    {
        return cast(__m128i) __builtin_ia32_gatherq_d256(cast(int4)src, base_addr, cast(long4)vindex, cast(int4)mask, cast(ubyte)scale);
    }
    else
    {
        // Scalar fallback: load lane-by-lane when the mask's sign bit is set,
        // otherwise keep the corresponding lane of `src`.
        __m128i r = src;
        long4 vindexl = cast(long4)vindex;
        int4 srci = cast(int4)src;
        int4 maski = cast(int4)mask;
        for (int n = 0; n < 4; ++n)
        {
            long index = vindexl.array[n];
            long offset = index * scale;            // byte offset; may be negative
            void* p = cast(void*)(base_addr);
            if (maski.array[n] < 0)
                r.ptr[n] = *cast(int*)(p + offset);
            else
                r.ptr[n] = srci.array[n];           // fixed: read via .array like the other gather fallbacks (was .ptr[n], an unchecked read)
        }
        return r;
    }
}
unittest
{
    // Same source/mask at scales 1, 2 and 4; lane 1 always keeps src (-2)
    // because its mask element (4) has a clear sign bit.
    int[24] data = [0, 1, 2, 3, 
                    4, 5, 6, 7, 
                    8, 9, 10, 11, 
                    12, 13, 14, 15,
                    16, 17, 18, 19,
                    20, 21, 22, 23];
    __m128i src    = _mm_setr_epi32(-1, -2, -3, -4);
    __m128i mask   = _mm_setr_epi32(-4,  4, -1, -2);
    __m256i vindex = _mm256_setr_epi64(-4,  8, 0, 12);

    int4 A = cast(int4) _mm256_mask_i64gather_epi32!1(src, &data[10], vindex, mask);
    int4 B = cast(int4) _mm256_mask_i64gather_epi32!2(src, &data[10], vindex, mask);
    int4 C = cast(int4) _mm256_mask_i64gather_epi32!4(src, &data[10], vindex, mask);
    int[4] correctA = [9, -2, 10, 13];
    int[4] correctB = [8, -2, 10, 16];
    int[4] correctC = [6, -2, 10, 22];
    assert(A.array == correctA);
    assert(B.array == correctB);
    assert(C.array == correctC);
}
2454 
/// Gather 64-bit integers from memory using 64-bit indices. 64-bit elements are loaded 
/// from addresses starting at `base_addr` and offset by each 64-bit element in `vindex` 
/// (each index is scaled by the factor in `scale`). Gathered elements are returned. 
/// `scale` should be 1, 2, 4 or 8.
__m128i _mm_i64gather_epi64(int scale)(const(long)* base_addr, __m128i vindex) @system
{
    // All-ones mask gathers every lane; the source vector is never selected.
    __m128i unusedSrc;
    return _mm_mask_i64gather_epi64!scale(unusedSrc, base_addr, vindex, _mm_set1_epi64x(-1));
}
unittest
{
    // Scale 2 on long indices: byte offsets -8 and 48 from &data[1] reach data[0] and data[7].
    long[8] data = [0, 1, 2, 3, 
                    4, 5, 6, 7]; 
    __m128i vindex = _mm_setr_epi64(-4, 24);
    long2 A = cast(long2) _mm_i64gather_epi64!2(&data[1], vindex);
    long[2] correctA = [0, 7];
    assert(A.array == correctA);
}
2473 
/// Gather 64-bit integers from memory using 64-bit indices. 64-bit elements are loaded 
/// from addresses starting at `base_addr` and offset by each 64-bit element in `vindex` 
/// (each index is scaled by the factor in `scale`). Gathered elements are merged using mask 
/// (elements are copied from `src` when the highest bit is not set in the corresponding element). 
/// `scale` should be 1, 2, 4 or 8.
__m128i _mm_mask_i64gather_epi64(int scale)(__m128i src, const(long)* base_addr, __m128i vindex, __m128i mask) @system
{
    static assert(isValidSIBScale(scale));

    static if (GDC_with_AVX2)
    {
        return cast(__m128i) __builtin_ia32_gatherdiv2di(cast(long2)src, base_addr, cast(long2)vindex, cast(long2)mask, scale);
    }
    else static if (LDC_with_AVX2)
    {
        return cast(__m128i) __builtin_ia32_gatherq_q(cast(long2)src, base_addr, cast(long2)vindex, cast(long2)mask, scale);
    }
    else
    {
        // Scalar fallback: both 64-bit indices of vindex are used, one per lane.
        long2 r;
        long2 vindexi = cast(long2)vindex;
        long2 srci = cast(long2)src;
        long2 maski = cast(long2)mask;
        for (int n = 0; n < 2; ++n)
        {
            long index = vindexi.array[n];
            long offset = index * scale;             // byte offset; may be negative
            void* p = cast(void*)(base_addr);
            if (maski.array[n] < 0)
                r.ptr[n] = *cast(long*)(p + offset); // mask sign bit set: load from memory
            else
                r.ptr[n] = srci.array[n];            // mask sign bit clear: keep src lane
        }
        return cast(__m128i)r;
    }
}
unittest
{
    // Lane 0 is masked off (mask 0 has clear sign bit) so src (-1) is kept and the
    // wild index -400 is never dereferenced; lane 1 gathers at scale 2 then 1.
    long[8] data = [0, 1, 2, 3, 
                    4, 5, 6, 7]; 
    __m128i src    = _mm_setr_epi64(-1, -2);
    __m128i mask   = _mm_setr_epi64(0, -1);
    __m128i vindex = _mm_setr_epi64(-400, 3*8);
    long2 A = cast(long2) _mm_mask_i64gather_epi64!2(src, &data[1], vindex, mask);
    long2 B = cast(long2) _mm_mask_i64gather_epi64!1(src, &data[1], vindex, mask);
    long[2] correctA = [-1, 7];
    long[2] correctB = [-1, 4];
    assert(A.array == correctA);
    assert(B.array == correctB);
}
2525 
/// Gather 64-bit integers from memory using 64-bit indices. 
/// 64-bit elements are loaded from addresses starting at `base_addr` and 
/// offset by each 64-bit element in `vindex` (each index is scaled by the 
/// factor in `scale`). Gathered elements are returned. 
/// `scale` should be 1, 2, 4 or 8.
__m256i _mm256_i64gather_epi64(int scale)(const(long)* base_addr, __m256i vindex) @system
{
    // All-ones mask gathers every lane; the source vector is never selected.
    __m256i unusedSrc;
    return _mm256_mask_i64gather_epi64!scale(unusedSrc, base_addr, vindex, _mm256_set1_epi64x(-1));
}
unittest
{
    // Four 64-bit indices at scale 2, including a negative byte offset.
    long[8] data = [0, 1, 2, 3, 
                    4, 5, 6, 7]; 
    __m256i vindex = _mm256_setr_epi64(-4, 24, 12, 4);
    long4 A = cast(long4) _mm256_i64gather_epi64!2(&data[1], vindex);
    long[4] correctA = [0, 7, 4, 2];
    assert(A.array == correctA);
}
2545 
/// Gather 64-bit integers from memory using 64-bit indices. 
/// 64-bit elements are loaded from addresses starting at `base_addr` and offset by each
/// 64-bit element in `vindex` (each index is scaled by the factor in `scale`). 
/// Gathered elements are merged into `dst` using mask (elements are copied from `src`
/// when the highest bit is not set in the corresponding element). 
/// `scale` should be 1, 2, 4 or 8.
__m256i _mm256_mask_i64gather_epi64(int scale)(__m256i src, const(long)* base_addr, __m256i vindex, __m256i mask) @system
{
    static assert(isValidSIBScale(scale));
    static if (LDC_with_AVX2)
    {
        return cast(__m256i) __builtin_ia32_gatherq_q256(cast(long4)src, base_addr, cast(long4)vindex, cast(long4)mask, scale);
    }
    else static if (GDC_with_AVX2)
    {
        return cast(__m256i) __builtin_ia32_gatherdiv4di (cast(long4)src, base_addr, cast(long4)vindex, cast(long4)mask, scale);
    }
    else 
    {
        // Scalar fallback: per lane, load when the mask's sign bit is set,
        // otherwise keep the source lane. Offsets are signed byte offsets.
        long4 indices  = cast(long4)vindex;
        long4 passthru = cast(long4)src;
        long4 selector = cast(long4)mask;
        const(ubyte)* bytes = cast(const(ubyte)*) base_addr;
        long4 res;
        foreach (lane; 0 .. 4)
        {
            if (selector.array[lane] < 0)
                res.ptr[lane] = *cast(const(long)*)(bytes + indices.array[lane] * scale);
            else
                res.ptr[lane] = passthru.array[lane];
        }
        return cast(__m256i)res;
    }
}
unittest
{
    // Lanes 0 and 2 are masked off, so src values survive and the out-of-range
    // indices (-400, 420) are never dereferenced.
    long[8] data = [0, 1, 2, 3, 
                    4, 5, 6, 7]; 
    __m256i src    = _mm256_setr_epi64(-1, -2, -3, -4);
    __m256i mask   = _mm256_setr_epi64(0, -1, 0, -1);
    __m256i vindex = _mm256_setr_epi64(-400, 3*8, 420, 4);
    long4 A = cast(long4) _mm256_mask_i64gather_epi64!2(src, &data[1], vindex, mask);
    long[4] correctA = [-1, 7, -3, 2];
    assert(A.array == correctA);
}
2593 
/// Gather double-precision (64-bit) floating-point elements from memory using 64-bit indices. 
/// 64-bit elements are loaded from addresses starting at `base_addr` and offset by each 64-bit 
/// element in `vindex` (each index is scaled by the factor in `scale`). Gathered elements are 
/// returned. `scale` should be 1, 2, 4 or 8.
__m128d _mm_i64gather_pd(int scale)(const(double)* base_addr, __m128i vindex) @system
{
    // A double gather moves the same 64-bit patterns as a long gather; reinterpret.
    __m128i bits = _mm_i64gather_epi64!scale(cast(const(long)*)base_addr, vindex);
    return cast(__m128d) bits;
}
unittest
{
    // Scale 2 with a negative index; mirrors the _mm_i64gather_epi64 test with doubles.
    double[8] data = [0.0, 1.0, 2.0, 3.0, 
                      4.0, 5.0, 6.0, 7.0]; 
    __m128i vindex = _mm_setr_epi64(-4, 24);
    __m128d A = _mm_i64gather_pd!2(&data[1], vindex);
    double[2] correctA = [0.0, 7.0];
    assert(A.array == correctA);
}
2611 
/// Gather double-precision (64-bit) floating-point elements from memory using 64-bit indices. 
/// 64-bit elements are loaded from addresses starting at `base_addr` and offset by each 64-bit 
/// element in `vindex` (each index is scaled by the factor in `scale`). Gathered elements are merged 
/// into `dst` using `mask` (elements are copied from `src` when the highest bit is not set in the 
/// corresponding element). `scale` should be 1, 2, 4 or 8.
__m128d _mm_mask_i64gather_pd(int scale)(__m128d src, const(double)* base_addr, __m128i vindex, __m128d mask) @system
{
    // Delegate to the integer gather; the mask sign bit sits in the same place
    // whether the lanes are viewed as doubles or as 64-bit integers.
    __m128i bits = _mm_mask_i64gather_epi64!scale(cast(__m128i)src,
                                                  cast(const(long)*)base_addr,
                                                  vindex,
                                                  cast(__m128i) mask);
    return cast(__m128d) bits;
}
unittest
{
    // Lane 0 is masked off (0.0 has a clear sign bit) so src is kept
    // and the wild index -400 is never dereferenced.
    double[8] data = [0.0, 1.0, 2.0, 3.0, 
                      4.0, 5.0, 6.0, 7.0]; 
    __m128d src    = _mm_setr_pd(-1.0, -2.0);
    __m128d mask   = _mm_setr_pd(0.0, -1.0);
    __m128i vindex = _mm_setr_epi64(-400, 3*8);
    __m128d A = _mm_mask_i64gather_pd!2(src, &data[1], vindex, mask);
    double[2] correctA = [-1.0, 7.0];
    assert(A.array == correctA);
}
2632 
/// Gather double-precision (64-bit) floating-point elements from memory using 64-bit indices.
/// 64-bit elements are loaded from addresses starting at `base_addr` and offset by each 64-bit
/// element in `vindex` (each index is scaled by the factor in `scale`). Gathered elements are returned.
/// `scale` should be 1, 2, 4 or 8.
__m256d _mm256_i64gather_pd(int scale)(const(double)* base_addr, __m256i vindex) @system
{
    // Same bit movement as a 64-bit integer gather; reinterpret the result.
    __m256i bits = _mm256_i64gather_epi64!scale(cast(const(long)*)base_addr, vindex);
    return cast(__m256d) bits;
}
unittest
{
    // Four indices at scale 2, including a negative byte offset.
    double[8] data = [0.0, 1.0, 2.0, 3.0,
                      4.0, 5.0, 6.0, 7.0];
    __m256i vindex = _mm256_setr_epi64(-4, 24, 0, 12);
    __m256d A = _mm256_i64gather_pd!2(&data[1], vindex);
    double[4] correctA = [0.0, 7.0, 1.0, 4.0];
    assert(A.array == correctA);
}
2650 
/// Gather double-precision (64-bit) floating-point elements from memory using 64-bit indices. 
/// 64-bit elements are loaded from addresses starting at `base_addr` and offset by each 64-bit 
/// element in `vindex` (each index is scaled by the factor in `scale`). Gathered elements are merged using `mask` 
/// (elements are copied from `src` when the highest bit is not set in the corresponding element). 
/// `scale` should be 1, 2, 4 or 8.
__m256d _mm256_mask_i64gather_pd(int scale)(__m256d src, const(double)* base_addr, __m256i vindex, __m256d mask) @system
{
    // The integer gather performs the identical lane selection and load.
    __m256i bits = _mm256_mask_i64gather_epi64!scale(cast(__m256i)src,
                                                     cast(const(long)*)base_addr,
                                                     vindex,
                                                     cast(__m256i) mask);
    return cast(__m256d) bits;
}
unittest
{
    // Lanes 0 and 2 masked off: src survives, wild indices never dereferenced.
    double[8] data = [0.0, 1.0, 2.0, 3.0,
                      4.0, 5.0, 6.0, 7.0];
    __m256d src    = _mm256_setr_pd(-1.0, -2.0, -3.0, -4.0);
    __m256d mask   = _mm256_setr_pd(0.0, -1.0, 0.0, -1.0);
    __m256i vindex = _mm256_setr_epi64(-400, 3*8, 420, 4);
    __m256d A = _mm256_mask_i64gather_pd!2(src, &data[1], vindex, mask);
    double[4] correctA = [-1.0, 7.0, -3.0, 2.0];
    assert(A.array == correctA);
}
2671 
/// Gather single-precision (32-bit) floating-point elements from memory using 64-bit indices. 
/// 32-bit elements are loaded from addresses starting at `base_addr` and offset by each 64-bit 
/// element in `vindex` (each index is scaled by the factor in `scale`). Gathered elements are returned.
/// `scale` should be 1, 2, 4 or 8.
__m128 _mm_i64gather_ps(int scale)(const(float)* base_addr, __m128i vindex) @system
{
    // A float gather moves the same 32-bit patterns as an int gather; reinterpret.
    __m128i bits = _mm_i64gather_epi32!scale(cast(const(int)*)base_addr, vindex);
    return cast(__m128) bits;
}
unittest
{
    // Two 64-bit indices produce two floats; the upper two lanes come back zeroed.
    float[8] data = [0.0f, 1.0f, 2.0f, 3.0f, 
                     4.0f, 5.0f, 6.0f, 7.0f]; 
    __m128i vindex = _mm_setr_epi64(-2, 12);
    __m128 A = _mm_i64gather_ps!2(&data[1], vindex);
    float[4] correctA = [0.0f, 7.0f, 0.0f, 0.0f];
    assert(A.array == correctA);
}
2689 
/// Gather single-precision (32-bit) floating-point elements from memory using 64-bit indices.
/// 32-bit elements are loaded from addresses starting at `base_addr` and offset by each 64-bit 
/// element in `vindex` (each index is scaled by the factor in `scale`). Gathered elements are merged using `mask` 
/// (elements are copied from `src` when the highest bit is not set in the corresponding element). 
/// `scale` should be 1, 2, 4 or 8.
__m128 _mm_mask_i64gather_ps(int scale)(__m128 src, const(float)* base_addr, __m128i vindex, __m128 mask) @system
{
    // Delegate to the integer form; the mask sign bit is position-identical.
    __m128i bits = _mm_mask_i64gather_epi32!scale(cast(__m128i) src,
                                                  cast(const(int)*) base_addr,
                                                  vindex,
                                                  cast(__m128i) mask);
    return cast(__m128) bits;
}
unittest
{
    // Lane 1 keeps src (mask 4.0f has clear sign bit); upper two lanes are zeroed.
    float[24] data = [0.0f, 1.0f, 2.0f, 3.0f, 
                      4.0f, 5.0f, 6.0f, 7.0f, 
                      8.0f, 9.0f, 10.0f, 11.0f, 
                      12.0f, 13.0f, 14.0f, 15.0f,
                      16.0f, 17.0f, 18.0f, 19.0f,
                      20.0f, 21.0f, 22.0f, 23.0f];
    __m128 src    = _mm_setr_ps(-1.0f, -2.0f, -3.0f, -4.0f);
    __m128 mask   = _mm_setr_ps(-4.0f,  4.0f, -1.0f, -2.0f);
    __m128i vindex = _mm_setr_epi64(-4,  4);
    __m128 A = _mm_mask_i64gather_ps!1(src, &data[10], vindex, mask);
    float[4] correctA = [9.0f, -2.0f, 0.0f, 0.0f];
    assert(A.array == correctA);
}
2714 
/// Gather single-precision (32-bit) floating-point elements from memory using 64-bit indices. 
/// 32-bit elements are loaded from addresses starting at `base_addr` and offset by each 64-bit 
/// element in `vindex` (each index is scaled by the factor in `scale`). Gathered elements are returned.
/// `scale` should be 1, 2, 4 or 8.
__m128 _mm256_i64gather_ps(int scale)(const(float)* base_addr, __m256i vindex) @system
{
    // Same bit movement as the 32-bit integer gather; reinterpret the result.
    __m128i bits = _mm256_i64gather_epi32!scale(cast(const(int)*)base_addr, vindex);
    return cast(__m128) bits;
}
unittest
{
    // Four 64-bit indices gather four floats, including index -1 (one element before base).
    float[8] data = [0.0f, 1.0f, 2.0f, 3.0f, 
                     4.0f, 5.0f, 6.0f, 7.0f]; 
    __m256i vindex = _mm256_setr_epi64(-1, 0, 2, 1);
    __m128 A = _mm256_i64gather_ps!4(&data[3], vindex);
    float[4] correctA = [2.0f, 3.0f, 5.0f, 4.0f];
    assert(A.array == correctA);
}
2732 
/// Gather single-precision (32-bit) floating-point elements from memory using 64-bit indices.
/// 32-bit elements are loaded from addresses starting at `base_addr` and offset by each 64-bit 
/// element in `vindex` (each index is scaled by the factor in `scale`). Gathered elements are merged using `mask` 
/// (elements are copied from `src` when the highest bit is not set in the corresponding element). 
/// `scale` should be 1, 2, 4 or 8.
__m128 _mm256_mask_i64gather_ps(int scale)(__m128 src, const(float)* base_addr, __m256i vindex, __m128 mask) @system
{
    // The integer gather performs the identical lane selection and load.
    __m128i bits = _mm256_mask_i64gather_epi32!scale(cast(__m128i) src,
                                                     cast(const(int)*) base_addr,
                                                     vindex,
                                                     cast(__m128i) mask);
    return cast(__m128) bits;
}
unittest 
{
    // Lane 1 keeps src (mask 4.0f has clear sign bit); the other three lanes gather at scale 2.
    float[24] data = [0.0f, 1.0f, 2.0f, 3.0f,
                      4.0f, 5.0f, 6.0f, 7.0f,
                      8.0f, 9.0f, 10.0f, 11.0f,
                      12.0f, 13.0f, 14.0f, 15.0f,
                      16.0f, 17.0f, 18.0f, 19.0f,
                      20.0f, 21.0f, 22.0f, 23.0f];
    __m128 src    = _mm_setr_ps(-1.0f, -2.0f, -3.0f, -4.0f);
    __m128 mask   = _mm_setr_ps(-4.0f, 4.0f, -1.0f, -2.0f);
    __m256i vindex = _mm256_setr_epi64(-4, 4, 0, 8);
    __m128 A = _mm256_mask_i64gather_ps!2(src, &data[10], vindex, mask);
    float[4] correctA = [8.0f, -2.0f, 10.0f, 14.0f];
    assert(A.array == correctA);
}
2757 
/// Copy `a` to result, then insert 128 bits from `b` into result at the location specified by 
/// `imm8` (only its lowest bit selects the destination half).
__m256i _mm256_inserti128_si256 (__m256i a, __m128i b, const int imm8) pure @trusted
{
    long2 half = cast(long2)b;
    int base = (imm8 & 1) * 2;       // 0 = low 128 bits, 2 = high 128 bits
    a.ptr[base    ] = half.array[0];
    a.ptr[base + 1] = half.array[1];
    return a;
}
unittest
{
    // Only bit 0 of imm8 is honoured: 0 + 8 behaves like 0 (insert into low half).
    __m256i A = [0, 1, 2, 3];
    long2 B = [4, 5];
    __m256i C = _mm256_inserti128_si256(A, cast(__m128i)B, 0 + 8);
    __m256i D = _mm256_inserti128_si256(A, cast(__m128i)B, 1);
    long[4] correctC = [4, 5, 2, 3]; 
    long[4] correctD = [0, 1, 4, 5];
    assert(C.array == correctC);
    assert(D.array == correctD);
}
2778 
/// Multiply packed signed 16-bit integers in `a` and `b`, producing intermediate
/// signed 32-bit integers. Horizontally add adjacent pairs of intermediate 32-bit integers,
/// and pack the results in destination.
__m256i _mm256_madd_epi16 (__m256i a, __m256i b) pure @trusted
{
    static if (GDC_with_AVX2 || LDC_with_AVX2)
    {
        // GDC and LDC expose this builtin with the same name and signature,
        // so one branch covers both (was two identical branches).
        return cast(__m256i) __builtin_ia32_pmaddwd256(cast(short16)a, cast(short16)b);
    }
    else
    {
        // split is beneficial for ARM64, LDC and GDC without AVX2
        __m128i a_lo = _mm256_extractf128_si256!0(a);
        __m128i a_hi = _mm256_extractf128_si256!1(a);
        __m128i b_lo = _mm256_extractf128_si256!0(b);
        __m128i b_hi = _mm256_extractf128_si256!1(b);
        __m128i r_lo = _mm_madd_epi16(a_lo, b_lo);
        __m128i r_hi = _mm_madd_epi16(a_hi, b_hi);
        return _mm256_set_m128i(r_hi, r_lo);
    }
}
unittest
{
    // (-32768)*(-32768) + (-32768)*(-32768) = 2^31, which wraps to -2147483648.
    short16 A = [0, 1, 2, 3, -32768, -32768, 32767, 32767, 0, 1, 2, 3, -32768, -32768, 32767, 32767];
    short16 B = [0, 1, 2, 3, -32768, -32768, 32767, 32767, 0, 1, 2, 3, -32768, -32768, 32767, 32767];
    int8 R = cast(int8) _mm256_madd_epi16(cast(__m256i)A, cast(__m256i)B);
    int[8] correct = [1, 13, -2147483648, 2*32767*32767, 1, 13, -2147483648, 2*32767*32767];
    assert(R.array == correct);
}
2812 
/// Vertically multiply each unsigned 8-bit integer from `a` with the corresponding 
/// signed 8-bit integer from `b`, producing intermediate signed 16-bit integers. 
/// Horizontally add adjacent pairs of intermediate signed 16-bit integers, 
/// and pack the saturated results.
__m256i _mm256_maddubs_epi16 (__m256i a, __m256i b) @safe
{
    static if (GDC_with_AVX2)
    {
        // Note: GDC's builtin wants ubyte32 for both operands.
        return cast(__m256i)__builtin_ia32_pmaddubsw256(cast(ubyte32)a, cast(ubyte32)b);
    }
    else static if (LDC_with_AVX2)
    {
        // Note: LDC's builtin wants byte32 for both operands.
        return cast(__m256i)__builtin_ia32_pmaddubsw256(cast(byte32)a, cast(byte32)b);
    }
    else
    {
        // Fallback: process each 128-bit half with the SSSE3 intrinsic.
        __m128i a_lo = _mm256_extractf128_si256!0(a);
        __m128i a_hi = _mm256_extractf128_si256!1(a);
        __m128i b_lo = _mm256_extractf128_si256!0(b);
        __m128i b_hi = _mm256_extractf128_si256!1(b);
        __m128i r_lo = _mm_maddubs_epi16(a_lo, b_lo);
        __m128i r_hi = _mm_maddubs_epi16(a_hi, b_hi);
        return _mm256_set_m128i(r_hi, r_lo);
    }
}
unittest
{
    // First pair saturates: 255*(-128) + 10*(-30) < -32768 clamps to -32768.
    __m128i A = _mm_setr_epi8(  -1,  10, 100, -128, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); // u8
    __m128i B = _mm_setr_epi8(-128, -30, 100,  127, -1, 2, 4, 6, 0, 0, 0, 0, 0, 0, 0, 0); // i8
    __m256i AA = _mm256_set_m128i(A, A);
    __m256i BB = _mm256_set_m128i(B, B);
    short16 C = cast(short16) _mm256_maddubs_epi16(AA, BB);
    short[16] correct =       [   -32768,     26256, 0, 0, 0, 0, 0, 0,
                                  -32768,     26256, 0, 0, 0, 0, 0, 0];
    assert(C.array == correct);
}
2849 
version(DigitalMars)
{
    // this avoids a bug with DMD < 2.099 -a x86 -O
    // (the maskload unittests below are skipped when this flag is set)
    private enum bool maskLoadWorkaroundDMD = (__VERSION__ < 2099);
}
else
{
    // Other compilers don't need the workaround.
    private enum bool maskLoadWorkaroundDMD = false;
}
2859 
/// Load packed 32-bit integers from memory using `mask` (elements are zeroed out when the highest
/// bit is not set in the corresponding element).
/// Warning: See "Note about mask load/store" to know why you must address valid memory only.
__m128i _mm_maskload_epi32 (const(int)* mem_addr, __m128i mask) /* pure */ @system
{
    // PERF DMD
    static if (LDC_with_AVX2)
    {
        // MAYDO report that the builtin is impure
        return __builtin_ia32_maskloadd(mem_addr, mask);
    }
    else static if (GDC_with_AVX2)
    {
        // GDC's builtin takes a vector pointer rather than an int pointer.
        return __builtin_ia32_maskloadd(cast(__m128i*)mem_addr, mask);
    }
    else
    {
        // Fallback: the AVX float mask-load moves the same 32-bit patterns.
        return cast(__m128i) _mm_maskload_ps(cast(const(float)*)mem_addr, mask);
    }
}
unittest
{
    // Guarded by the DMD < 2.099 workaround flag declared above.
    static if (!maskLoadWorkaroundDMD)
    {
        int[4] A = [7, 1, 2, 3];
        int4 B = _mm_maskload_epi32(A.ptr, _mm_setr_epi32(1, -1, -1, 1));  // can NOT address invalid memory with mask load and writes!
        int[4] correct = [0, 1, 2, 0];
        assert(B.array == correct);
    }
}
2890 
/// Load packed 32-bit integers from memory using `mask` (elements are zeroed out when the highest 
/// bit is not set in the corresponding element).
/// Warning: See "Note about mask load/store" to know why you must address valid memory only.
__m256i _mm256_maskload_epi32 (const(int)* mem_addr, __m256i mask) /* pure */ @system
{
    static if (LDC_with_AVX2)
    {
        return cast(__m256i) __builtin_ia32_maskloadd256(mem_addr, cast(int8)mask);
    }
    else static if (GDC_with_AVX2)
    {
        // GDC's builtin takes a vector pointer rather than an int pointer.
        return cast(__m256i)__builtin_ia32_maskloadd256(cast(__m256i*)mem_addr, cast(int8)mask);
    }
    else
    {
        // Fallback: the AVX float mask-load moves the same 32-bit patterns.
        return cast(__m256i) _mm256_maskload_ps(cast(const(float*)) mem_addr, mask);
    }
}
unittest
{
    // Lanes with a clear mask sign bit load as zero; the rest load from memory.
    int[8] A = [7, 1, 2, 3, 8, -2, 4, 5];
    int8 B = cast(int8) _mm256_maskload_epi32(A.ptr, _mm256_setr_epi32(1, -1, -1, 1, -1, -1, 1, 1));
    int[8] correct = [0, 1, 2, 0, 8, -2, 0, 0];
    assert(B.array == correct);
}
2916 
/// Load packed 64-bit integers from memory using `mask` (elements are zeroed out when the highest 
/// bit is not set in the corresponding element).
/// Warning: See "Note about mask load/store" to know why you must address valid memory only.
__m128i _mm_maskload_epi64 (const(long)* mem_addr, __m128i mask) @system
{
    // PERF DMD
    static if (LDC_with_AVX2)
    {
        return cast(__m128i) __builtin_ia32_maskloadq(mem_addr, cast(long2) mask);
    }
    else static if (GDC_with_AVX2)
    {
        // GDC's builtin takes a vector pointer rather than a long pointer.
        return cast(__m128i) __builtin_ia32_maskloadq(cast(long2*)mem_addr, cast(long2) mask);
    }
    else
    {
        // Fallback: the AVX double mask-load moves the same 64-bit patterns.
        return cast(__m128i) _mm_maskload_pd(cast(const(double)*)mem_addr, mask);
    }
}
unittest
{
    // Guarded by the DMD < 2.099 workaround flag declared above.
    static if (!maskLoadWorkaroundDMD)
    {
        long[2] A = [-7, -8];
        long2 B = cast(long2) _mm_maskload_epi64(A.ptr, _mm_setr_epi64(1, -1));
        long[2] correct = [0, -8];
        assert(B.array == correct);
    }
}
2946 
/// Load packed 64-bit integers from memory using `mask` (elements are zeroed out when the highest 
/// bit is not set in the corresponding element).
/// Warning: See "Note about mask load/store" to know why you must address valid memory only.
__m256i _mm256_maskload_epi64 (const(long)* mem_addr, __m256i mask) /* pure */ @system
{
    static if (LDC_with_AVX2)
    {
        return cast(__m256i) __builtin_ia32_maskloadq256(mem_addr, cast(long4)mask);
    }
    else static if (GDC_with_AVX2)
    {
        // GDC's builtin takes a vector pointer rather than a long pointer.
        return cast(__m256i)__builtin_ia32_maskloadq256(cast(__m256i*)mem_addr, cast(long4)mask);
    }
    else
    {
        // Fallback: the AVX double mask-load moves the same 64-bit patterns.
        return cast(__m256i) _mm256_maskload_pd(cast(const(double*)) mem_addr, mask);
    }
}
unittest
{
    // Lanes with a clear mask sign bit load as zero; the rest load from memory.
    long[4] A = [ 8, -2, 4, 5];
    long4 B = cast(long4) _mm256_maskload_epi64(A.ptr, _mm256_setr_epi64(1, -1, -1, 1));
    long[4] correct = [0, -2, 4, 0];
    assert(B.array == correct); // was missing: the result was computed but never checked
}
2971 
/// Compare packed signed 16-bit integers in `a` and `b`, and return packed maximum values.
__m256i _mm256_max_epi16 (__m256i a, __m256i b) pure @safe
{
    // PERF D_SIMD
    // GDC without AVX2 always splits: its replaced vector comparison operators
    // perform very poorly (see module note at top of file).
    version(GNU)
        enum bool split = true;
    else static if (SIMD_COMPARISON_MASKS_32B)
        enum bool split = false;
    else
        enum bool split = true;

    static if (GDC_with_AVX2)
    {
        return cast(__m256i) __builtin_ia32_pmaxsw256(cast(short16)a, cast(short16)b);
    }
    else static if (split)
    {
        // split: process each 128-bit half with the SSE intrinsic
        __m128i a_lo = _mm256_extractf128_si256!0(a);
        __m128i a_hi = _mm256_extractf128_si256!1(a);
        __m128i b_lo = _mm256_extractf128_si256!0(b);
        __m128i b_hi = _mm256_extractf128_si256!1(b);
        __m128i r_lo = _mm_max_epi16(a_lo, b_lo);
        __m128i r_hi = _mm_max_epi16(a_hi, b_hi);
        return _mm256_set_m128i(r_hi, r_lo);
    }
    else static if (SIMD_COMPARISON_MASKS_32B)
    {
        // catastrophic with GDC x86 for some reason. Sad.
        // Select idiom: lanes where sa > sb take sa, the rest take sb.
        short16 sa = cast(short16)a;
        short16 sb = cast(short16)b;
        short16 greater = sa > sb;
        return cast(__m256i)( (greater & sa) | (~greater & sb) );
    }
    else
        static assert(0);
}
unittest
{
    // Covers extremes (32767, -32768) and mixed signs in both halves.
    short16 R = cast(short16) _mm256_max_epi16(_mm256_setr_epi16(32767, 1, -4, -8, 9,     7, 0,-57, 1, 0, 0, 0, 1, 0, 0, 0),
                                               _mm256_setr_epi16(   -4,-8,  9,  7, 0,-32768, 0,  0, 0, 2, 0, 4, 2, 1, 2, -4));
    short[16] correct =                                         [32767, 1,  9,  7, 9,     7, 0,  0, 1, 2, 0, 4, 2, 1, 2, 0];
    assert(R.array == correct);
}
3016 
/// Compare packed signed 32-bit integers in `a` and `b`, and return packed maximum values.
__m256i _mm256_max_epi32 (__m256i a, __m256i b) pure @safe
{
    // PERF D_SIMD
    // GDC without AVX2 always splits: its replaced vector comparison operators
    // perform very poorly (see module note at top of file).
    version(GNU)
        enum bool split = true;
    else static if (SIMD_COMPARISON_MASKS_32B)
        enum bool split = false;
    else
        enum bool split = true;

    static if (GDC_with_AVX2)
    {
        return cast(__m256i) __builtin_ia32_pmaxsd256(cast(int8)a, cast(int8)b);
    }
    else static if (split)
    {
        // split: process each 128-bit half with the SSE intrinsic
        __m128i a_lo = _mm256_extractf128_si256!0(a);
        __m128i a_hi = _mm256_extractf128_si256!1(a);
        __m128i b_lo = _mm256_extractf128_si256!0(b);
        __m128i b_hi = _mm256_extractf128_si256!1(b);
        __m128i r_lo = _mm_max_epi32(a_lo, b_lo);
        __m128i r_hi = _mm_max_epi32(a_hi, b_hi);
        return _mm256_set_m128i(r_hi, r_lo);
    }
    else static if (SIMD_COMPARISON_MASKS_32B) 
    {
        // catastrophic with GDC x86 for some reason, like for 16-bit numbers.
        // Select idiom: lanes where sa > sb take sa, the rest take sb.
        int8 sa = cast(int8)a;
        int8 sb = cast(int8)b;
        int8 greater = sa > sb;
        return cast(__m256i)( (greater & sa) | (~greater & sb) );
    }
    else
        static assert(0);
}
unittest
{
    // Covers int extremes (0x7fffffff, -0x80000000) and mixed signs.
    int8 R = cast(int8) _mm256_max_epi32(_mm256_setr_epi32(0x7fffffff, 1, -4,  7, 0x7fffffff, 2, -4,  7),
                                         _mm256_setr_epi32(        -4,-8,  9, -8,-0x80000000,-8,  9, -8));
    int[8] correct =                                      [0x7fffffff, 1,  9,  7, 0x7fffffff, 2,  9,  7];
    assert(R.array == correct);
}
3061 
/// Compare packed signed 8-bit integers in `a` and `b`, and return packed maximum values.
__m256i _mm256_max_epi8 (__m256i a, __m256i b) pure @trusted
{
    // PERF D_SIMD
    // GDC without AVX2 always splits: its replaced vector comparison operators
    // perform very poorly (see module note at top of file).
    version(GNU)
        enum bool split = true;
    else static if (SIMD_COMPARISON_MASKS_32B)
        enum bool split = false;
    else
        enum bool split = true;
    static if (GDC_with_AVX2)
    {
        // Strangely, GDC asks for unsigned ubyte32
        return cast(__m256i) __builtin_ia32_pmaxsb256(cast(ubyte32)a, cast(ubyte32)b);
    }
    else static if (split)
    {
        // split: process each 128-bit half with the SSE intrinsic
        __m128i a_lo = _mm256_extractf128_si256!0(a);
        __m128i a_hi = _mm256_extractf128_si256!1(a);
        __m128i b_lo = _mm256_extractf128_si256!0(b);
        __m128i b_hi = _mm256_extractf128_si256!1(b);
        __m128i r_lo = _mm_max_epi8(a_lo, b_lo);
        __m128i r_hi = _mm_max_epi8(a_hi, b_hi);
        return _mm256_set_m128i(r_hi, r_lo);
    }
    else static if (SIMD_COMPARISON_MASKS_32B)
    {
        // This is real bad with GDC, again
        // Select idiom: lanes where sa > sb take sa, the rest take sb.
        byte32 sa = cast(byte32)a;
        byte32 sb = cast(byte32)b;
        byte32 greater = cast(byte32)(sa > sb);
        return cast(__m256i)( (greater & sa) | (~greater & sb) );
    }
    else
        static assert(false);
}
unittest
{
    // Covers byte extremes (127, -128) and mixed signs in both halves.
    __m256i A = _mm256_setr_epi8(127,  1, -4, -8, 9,    7, 0, 57, 0, 0, 0, 0, 0, 0, 0, 0,   127,  1, -4, -8, 9,    7, 0, 57, 0, 0, 0, 0, 0, 0, 0, 0);
    __m256i B = _mm256_setr_epi8(  4, -8,  9, -7, 0, -128, 0,  0, 0, 0, 0, 0, 0, 0, 0, 0,     4, -8,  9, -7, 0, -128, 0,  0, 0, 0, 0, 0, 0, 4, 0, 0);
    byte32 R = cast(byte32) _mm256_max_epi8(A, B);
    byte[32] correct =          [127,  1,  9, -7, 9,    7, 0, 57, 0, 0, 0, 0, 0, 0, 0, 0,   127,  1,  9, -7, 9,    7, 0, 57, 0, 0, 0, 0, 0, 4, 0, 0];
    assert(R.array == correct);
}
3107 
/// Compare packed unsigned 16-bit integers in `a` and `b`, and return packed maximum values.
__m256i _mm256_max_epu16 (__m256i a, __m256i b) pure @trusted
{
    // PERF D_SIMD
    // See _mm256_max_epi8 for the rationale: GDC without AVX2 is bad at
    // 32-byte mask comparisons, so GNU always splits.
    version(GNU)
        enum bool split = true;
    else static if (SIMD_COMPARISON_MASKS_32B)
        enum bool split = false;
    else
        enum bool split = true;

    static if (GDC_with_AVX2)
    {
        return cast(__m256i) __builtin_ia32_pmaxuw256(cast(short16)a, cast(short16)b);
    }
    else static if (split)
    {
        // split into 2 x 128-bit _mm_max_epu16
        __m128i a_lo = _mm256_extractf128_si256!0(a);
        __m128i a_hi = _mm256_extractf128_si256!1(a);
        __m128i b_lo = _mm256_extractf128_si256!0(b);
        __m128i b_hi = _mm256_extractf128_si256!1(b);
        __m128i r_lo = _mm_max_epu16(a_lo, b_lo);
        __m128i r_hi = _mm_max_epu16(a_hi, b_hi);
        return _mm256_set_m128i(r_hi, r_lo);
    }
    else static if (SIMD_COMPARISON_MASKS_32B)
    {
        // catastrophic with GDC x86_64, good with LDC
        short16 sa = cast(short16)a;
        short16 sb = cast(short16)b;
        // Unsigned semantics come from the ushort16 casts inside the compare.
        short16 greater = cast(short16)(cast(ushort16)sa > cast(ushort16)sb);
        return cast(__m256i)( (greater & sa) | (~greater & sb) );
    }
    else
        static assert(false);
}
unittest
{
    short16 R = cast(short16) _mm256_max_epu16(_mm256_setr_epi16(32767, 1, -4, -8, 9,     7, 0,-57, 1, 0, 0, 0, 1, 0, 0, -6),
                                                _mm256_setr_epi16(  -4,-8,  9,  7, 0,-32768, 0,  0, 0, 2, 0, 4, 2, 1, 2, -4));
    short[16] correct =                                            [-4,-8, -4, -8, 9,-32768, 0,-57, 1, 2, 0, 4, 2, 1, 2, -4];
    assert(R.array == correct);
}
3152 
/// Compare packed unsigned 32-bit integers in `a` and `b`, and return packed maximum values.
__m256i _mm256_max_epu32 (__m256i a, __m256i b) pure @safe
{
    // PERF D_SIMD
    // See _mm256_max_epi8 for the rationale of the split heuristic.
    version(GNU)
        enum bool split = true;
    else static if (SIMD_COMPARISON_MASKS_32B)
        enum bool split = false;
    else
        enum bool split = true;

    static if (GDC_with_AVX2)
    {
        return cast(__m256i) __builtin_ia32_pmaxud256(cast(int8)a, cast(int8)b);
    }
    else static if (split)
    {
        // split into 2 x 128-bit _mm_max_epu32
        __m128i a_lo = _mm256_extractf128_si256!0(a);
        __m128i a_hi = _mm256_extractf128_si256!1(a);
        __m128i b_lo = _mm256_extractf128_si256!0(b);
        __m128i b_hi = _mm256_extractf128_si256!1(b);
        __m128i r_lo = _mm_max_epu32(a_lo, b_lo);
        __m128i r_hi = _mm_max_epu32(a_hi, b_hi);
        return _mm256_set_m128i(r_hi, r_lo);
    }
    else static if (SIMD_COMPARISON_MASKS_32B) 
    {
        // catastrophic with GDC x86 for some reason, like for 16-bit numbers.
        uint8 sa = cast(uint8)a;
        uint8 sb = cast(uint8)b;
        uint8 greater = sa > sb;
        return cast(__m256i)( (greater & sa) | (~greater & sb) );
    }
    else
        static assert(0);
}
unittest
{
    int8 R = cast(int8) _mm256_max_epu32(_mm256_setr_epi32(0x7fffffff, 1,  4, -7, 0x7fffffff, 1, 11, -7),
                                         _mm256_setr_epi32(        -4,-8,  9, -8,         -4,-8,  9, -8));
    int[8] correct =                                      [        -4,-8,  9, -7,         -4,-8, 11, -7];
    assert(R.array == correct);
}
3197 
/// Compare packed unsigned 8-bit integers in `a` and `b`, and return packed maximum values.
__m256i _mm256_max_epu8 (__m256i a, __m256i b) pure @safe
{
    // PERF D_SIMD
    // See _mm256_max_epi8 for the rationale of the split heuristic.
    version(GNU)
        enum bool split = true;
    else static if (SIMD_COMPARISON_MASKS_32B)
        enum bool split = false;
    else
        enum bool split = true;
    static if (GDC_with_AVX2)
    {
        return cast(__m256i) __builtin_ia32_pmaxub256(cast(ubyte32)a, cast(ubyte32)b);
    }
    else static if (split)
    {
        // split into 2 x 128-bit _mm_max_epu8
        __m128i a_lo = _mm256_extractf128_si256!0(a);
        __m128i a_hi = _mm256_extractf128_si256!1(a);
        __m128i b_lo = _mm256_extractf128_si256!0(b);
        __m128i b_hi = _mm256_extractf128_si256!1(b);
        __m128i r_lo = _mm_max_epu8(a_lo, b_lo);
        __m128i r_hi = _mm_max_epu8(a_hi, b_hi);
        return _mm256_set_m128i(r_hi, r_lo);
    }
    else static if (SIMD_COMPARISON_MASKS_32B)
    {
        // This is real bad with GDC, again
        ubyte32 sa = cast(ubyte32)a;
        ubyte32 sb = cast(ubyte32)b;
        ubyte32 greater = cast(ubyte32)(sa > sb);
        return cast(__m256i)( (greater & sa) | (~greater & sb) );
    }
    else
        static assert(false);
}
unittest
{
    byte32 R = cast(byte32) _mm256_max_epu8(_mm256_setr_epi8(45, 1, -4, -8, 9,  7, 0,-57, -4,-8,  9,  7, 0,-57, 0,  0,   45, 1, -4, -8, 9,  7, 0,-57, -4,-8,  9,  7, 0,-57, 0,  0),
                                            _mm256_setr_epi8(-4,-8,  9,  7, 0,-57, 0,  0, 45, 1, -4, -8, 9,  7, 0,-57,   -4,-8,  9,  7, 0,-57, 0,  0, 45, 1, -4, -8, 9,  7, 0,-57));
    byte[32] correct =                                      [-4,-8, -4, -8, 9,-57, 0,-57, -4,-8, -4, -8, 9,-57, 0,-57,   -4,-8, -4, -8, 9,-57, 0,-57, -4,-8, -4, -8, 9,-57, 0,-57];
    assert(R.array == correct);
}
3241 
/// Compare packed signed 16-bit integers in `a` and `b`, and return packed minimum values.
__m256i _mm256_min_epi16 (__m256i a, __m256i b) pure @safe
{
    // PERF D_SIMD
    // See _mm256_max_epi8 for the rationale of the split heuristic.
    version(GNU)
        enum bool split = true;
    else static if (SIMD_COMPARISON_MASKS_32B)
        enum bool split = false;
    else
        enum bool split = true;

    static if (GDC_with_AVX2)
    {
        return cast(__m256i) __builtin_ia32_pminsw256(cast(short16)a, cast(short16)b);
    }
    else static if (split)
    {
        // split into 2 x 128-bit _mm_min_epi16
        __m128i a_lo = _mm256_extractf128_si256!0(a);
        __m128i a_hi = _mm256_extractf128_si256!1(a);
        __m128i b_lo = _mm256_extractf128_si256!0(b);
        __m128i b_hi = _mm256_extractf128_si256!1(b);
        __m128i r_lo = _mm_min_epi16(a_lo, b_lo);
        __m128i r_hi = _mm_min_epi16(a_hi, b_hi);
        return _mm256_set_m128i(r_hi, r_lo);
    }
    else static if (SIMD_COMPARISON_MASKS_32B)
    {
        // like the other 32-byte min/max mask blends, this is catastrophic with GDC -mavx
        short16 sa = cast(short16)a;
        short16 sb = cast(short16)b;
        short16 greater = sa > sb;
        // Blend inverted vs. max: take `sa` lanes where sa <= sb, else `sb`.
        return cast(__m256i)( (~greater & sa) | (greater & sb) );
    }
    else
        static assert(0);
}
unittest
{
    short16 R = cast(short16) _mm256_min_epi16(_mm256_setr_epi16(32767, 1, -4, -8, 9,     7, 0,-57, 1, 0, 0, 0, 1, 0, 0,  0),
                                               _mm256_setr_epi16(   -4,-8,  9,  7, 0,-32768, 0,  0, 0, 2, 0, 4, 2, 1, 2, -4));
    short[16] correct =                                         [   -4,-8, -4, -8, 0,-32768, 0,-57, 0, 0, 0, 0, 1, 0, 0, -4];
    assert(R.array == correct);
}
3286 
/// Compare packed signed 32-bit integers in `a` and `b`, and return packed minimum values.
__m256i _mm256_min_epi32 (__m256i a, __m256i b) pure @safe
{
    // PERF D_SIMD
    // See _mm256_max_epi8 for the rationale of the split heuristic.
    version(GNU)
        enum bool split = true;
    else static if (SIMD_COMPARISON_MASKS_32B)
        enum bool split = false;
    else
        enum bool split = true;

    static if (GDC_with_AVX2)
    {
        return cast(__m256i) __builtin_ia32_pminsd256(cast(int8)a, cast(int8)b);
    }
    else static if (split)
    {
        // split into 2 x 128-bit _mm_min_epi32
        __m128i a_lo = _mm256_extractf128_si256!0(a);
        __m128i a_hi = _mm256_extractf128_si256!1(a);
        __m128i b_lo = _mm256_extractf128_si256!0(b);
        __m128i b_hi = _mm256_extractf128_si256!1(b);
        __m128i r_lo = _mm_min_epi32(a_lo, b_lo);
        __m128i r_hi = _mm_min_epi32(a_hi, b_hi);
        return _mm256_set_m128i(r_hi, r_lo);
    }
    else static if (SIMD_COMPARISON_MASKS_32B) 
    {
        // Not checked this one, probably same badness issue with GDC
        int8 sa = cast(int8)a;
        int8 sb = cast(int8)b;
        int8 greater = sa > sb;
        return cast(__m256i)( (~greater & sa) | (greater & sb) );
    }
    else
        static assert(0);    
}
unittest
{
    // Elementwise signed minimum over both 128-bit lanes.
    __m256i A = _mm256_setr_epi32(0x7fffffff,  1, -4,  7,  0x7fffffff,  2, -4,  7);
    __m256i B = _mm256_setr_epi32(        -4, -8,  9, -8, -0x80000000, -8,  9, -8);
    int8 R = cast(int8) _mm256_min_epi32(A, B);
    int[8] correct = [-4, -8, -4, -8, -0x80000000, -8, -4, -8];
    assert(R.array == correct);
}
3331 
3332 
/// Compare packed signed 8-bit integers in `a` and `b`, and return packed minimum values.
__m256i _mm256_min_epi8 (__m256i a, __m256i b) pure @trusted
{
    // PERF D_SIMD
    // See _mm256_max_epi8 for the rationale of the split heuristic.
    version(GNU)
        enum bool split = true;
    else static if (SIMD_COMPARISON_MASKS_32B)
        enum bool split = false;
    else
        enum bool split = true;
    static if (GDC_with_AVX2)
    {
        // Strangely, GDC asks for unsigned ubyte32
        return cast(__m256i) __builtin_ia32_pminsb256(cast(ubyte32)a, cast(ubyte32)b);
    }
    else static if (split)
    {
        // split into 2 x 128-bit _mm_min_epi8
        __m128i a_lo = _mm256_extractf128_si256!0(a);
        __m128i a_hi = _mm256_extractf128_si256!1(a);
        __m128i b_lo = _mm256_extractf128_si256!0(b);
        __m128i b_hi = _mm256_extractf128_si256!1(b);
        __m128i r_lo = _mm_min_epi8(a_lo, b_lo);
        __m128i r_hi = _mm_min_epi8(a_hi, b_hi);
        return _mm256_set_m128i(r_hi, r_lo);
    }
    else static if (SIMD_COMPARISON_MASKS_32B)
    {
        // This is real bad with GDC, again
        byte32 sa = cast(byte32)a;
        byte32 sb = cast(byte32)b;
        byte32 greater = cast(byte32)(sa > sb);
        return cast(__m256i)( (~greater & sa) | (greater & sb) );
    }
    else
        static assert(false);
}
unittest
{
    __m256i A = _mm256_setr_epi8(127,  1, -4, -8, 9,    7, 0, -57, 0, 0, 0, 0, 0, 0, 0, 0,   127,  1, -4, -8, 9,    7, 0, 57, 0, 0, 0, 0, 0, 0, 0, 0);
    __m256i B = _mm256_setr_epi8(  4, -8,  9, -7, 0, -128, 0,   0, 0, 0, 0, 0, 0, 0, 0, 0,     4, -8,  9, -7, 0, -128, 0,  0, 0, 0, 0, 0, 0, -4, 0, 0);
    byte32 R = cast(byte32) _mm256_min_epi8(A, B);
    byte[32] correct =          [  4, -8, -4, -8, 0, -128, 0, -57, 0, 0, 0, 0, 0, 0, 0, 0,     4, -8, -4, -8, 0, -128, 0,  0, 0, 0, 0, 0, 0, -4, 0, 0];
    assert(R.array == correct);
}
3378 
/// Compare packed unsigned 16-bit integers in `a` and `b`, and return packed minimum values.
__m256i _mm256_min_epu16 (__m256i a, __m256i b) pure @trusted
{
    // PERF D_SIMD
    // See _mm256_max_epi8 for the rationale of the split heuristic.
    version(GNU)
        enum bool split = true;
    else static if (SIMD_COMPARISON_MASKS_32B)
        enum bool split = false;
    else
        enum bool split = true;

    static if (GDC_with_AVX2)
    {
        return cast(__m256i) __builtin_ia32_pminuw256(cast(short16)a, cast(short16)b);
    }
    else static if (split)
    {
        // split into 2 x 128-bit _mm_min_epu16
        __m128i a_lo = _mm256_extractf128_si256!0(a);
        __m128i a_hi = _mm256_extractf128_si256!1(a);
        __m128i b_lo = _mm256_extractf128_si256!0(b);
        __m128i b_hi = _mm256_extractf128_si256!1(b);
        __m128i r_lo = _mm_min_epu16(a_lo, b_lo);
        __m128i r_hi = _mm_min_epu16(a_hi, b_hi);
        return _mm256_set_m128i(r_hi, r_lo);
    }
    else static if (SIMD_COMPARISON_MASKS_32B)
    {
        // catastrophic with GDC x86_64
        short16 sa = cast(short16)a;
        short16 sb = cast(short16)b;
        // Unsigned semantics come from the ushort16 casts inside the compare.
        short16 greater = cast(short16)(cast(ushort16)sa > cast(ushort16)sb);
        return cast(__m256i)( (~greater & sa) | (greater & sb) );
    }
    else
        static assert(false);
}
unittest
{
    short16 R = cast(short16) _mm256_min_epu16(_mm256_setr_epi16(32767, 1, -4, -8, 9,     7, 0,-57, 1, 0, 0, 0, 1, 0, 0, -6),
                                               _mm256_setr_epi16(  -4, -8,  9,  7, 0,-32768, 0,  0, 0, 2, 0, 4, 2, 1, 2, -4));
    short[16] correct =                                         [32767, 1,  9,  7, 0,     7, 0,  0, 0, 0, 0, 0, 1, 0, 0, -6];
    assert(R.array == correct);
}
3423 
/// Compare packed unsigned 32-bit integers in `a` and `b`, and return packed minimum values.
__m256i _mm256_min_epu32 (__m256i a, __m256i b) pure @safe
{
    // PERF D_SIMD
    // See _mm256_max_epi8 for the rationale of the split heuristic.
    version(GNU)
        enum bool split = true;
    else static if (SIMD_COMPARISON_MASKS_32B)
        enum bool split = false;
    else
        enum bool split = true;

    static if (GDC_with_AVX2)
    {
        return cast(__m256i) __builtin_ia32_pminud256(cast(int8)a, cast(int8)b);
    }
    else static if (split)
    {
        // split into 2 x 128-bit _mm_min_epu32
        __m128i a_lo = _mm256_extractf128_si256!0(a);
        __m128i a_hi = _mm256_extractf128_si256!1(a);
        __m128i b_lo = _mm256_extractf128_si256!0(b);
        __m128i b_hi = _mm256_extractf128_si256!1(b);
        __m128i r_lo = _mm_min_epu32(a_lo, b_lo);
        __m128i r_hi = _mm_min_epu32(a_hi, b_hi);
        return _mm256_set_m128i(r_hi, r_lo);
    }
    else static if (SIMD_COMPARISON_MASKS_32B) 
    {
        // catastrophic with GDC, so in this case split instead
        uint8 sa = cast(uint8)a;
        uint8 sb = cast(uint8)b;
        uint8 greater = sa > sb;
        return cast(__m256i)( (greater & sb) | (~greater & sa) );
    }
    else
        static assert(0);
}
unittest
{
    int8 R = cast(int8) _mm256_min_epu32(_mm256_setr_epi32(0x7fffffff, 1,  4, -7, 0x7fffffff, 1, 11, -7),
                                         _mm256_setr_epi32(        -4,-8,  9, -8,         -4,-8,  9, -8));
    int[8] correct =                                      [0x7fffffff, 1,  4, -8, 0x7fffffff, 1,  9, -8];
    assert(R.array == correct);
}
3468 
/// Compare packed unsigned 8-bit integers in `a` and `b`, and return packed minimum values.
__m256i _mm256_min_epu8 (__m256i a, __m256i b) pure @safe
{
    // PERF D_SIMD
    // See _mm256_max_epi8 for the rationale of the split heuristic.
    version(GNU)
        enum bool split = true;
    else static if (SIMD_COMPARISON_MASKS_32B)
        enum bool split = false;
    else
        enum bool split = true;
    static if (GDC_with_AVX2)
    {
        return cast(__m256i) __builtin_ia32_pminub256(cast(ubyte32)a, cast(ubyte32)b);
    }
    else static if (split)
    {
        // split into 2 x 128-bit _mm_min_epu8
        __m128i a_lo = _mm256_extractf128_si256!0(a);
        __m128i a_hi = _mm256_extractf128_si256!1(a);
        __m128i b_lo = _mm256_extractf128_si256!0(b);
        __m128i b_hi = _mm256_extractf128_si256!1(b);
        __m128i r_lo = _mm_min_epu8(a_lo, b_lo);
        __m128i r_hi = _mm_min_epu8(a_hi, b_hi);
        return _mm256_set_m128i(r_hi, r_lo);
    }
    else static if (SIMD_COMPARISON_MASKS_32B)
    {
        // Mask blend: take `sa` lanes where sa <= sb (unsigned), else `sb`.
        ubyte32 sa = cast(ubyte32)a;
        ubyte32 sb = cast(ubyte32)b;
        ubyte32 greater = cast(ubyte32)(sa > sb);
        return cast(__m256i)( (~greater & sa) | (greater & sb) );
    }
    else
        static assert(false);
}
unittest
{
    byte32 R = cast(byte32) _mm256_min_epu8(_mm256_setr_epi8(45, 1, -4, -8, 9,  7, 0,-57, -4,-8,  9,  7, 0,-57, 0,  0,   45, 1, -4, -8, 9,  7, 0,-57, -4,-8,  9,  7, 0,-57, 0,  0),
                                            _mm256_setr_epi8(-4,-8,  9,  7, 0,-57, 0,  0, 45, 1, -4, -8, 9,  7, 0,-57,   -4,-8,  9,  7, 0,-57, 0,  0, 45, 1, -4, -8, 9,  7, 0,-57));
    byte[32] correct =                                      [45, 1,  9,  7, 0,  7, 0,  0, 45, 1,  9,  7, 0,  7, 0,  0,   45, 1,  9,  7, 0,  7, 0,  0, 45, 1,  9,  7, 0,  7, 0,  0];
    assert(R.array == correct);
}
3511 
/// Create mask from the most significant bit of each 8-bit element in `a`.
int _mm256_movemask_epi8 (__m256i a) pure @trusted
{
    static if (GDC_with_AVX2)
    {
        // GDC builtin wants an unsigned vector; LDC wants signed (below).
        return __builtin_ia32_pmovmskb256(cast(ubyte32)a);
    }
    else static if (LDC_with_AVX2)
    {
        return __builtin_ia32_pmovmskb256(cast(byte32)a);
    }
    else
    {
        // ARM64 splitting makes it 33 inst instead of 48 for naive version.
        //       PERF not sure if there is something better, sounds likely
        // Otherwise, beneficial for every case.
        // Low lane gives bits 0..15, high lane gives bits 16..31.
        __m128i a_lo = _mm256_extractf128_si256!0(a);
        __m128i a_hi = _mm256_extractf128_si256!1(a);
        return (_mm_movemask_epi8(a_hi) << 16) | _mm_movemask_epi8(a_lo);
    }
}
unittest
{
    assert(0x9D37_9C36 == _mm256_movemask_epi8(_mm256_set_epi8(-1, 1, 2, -3, -1, -1, 4,-8, 127, 0, -1, -1, 0, -1, -1, -1,
                                                               -1, 1, 2, -3, -1, -1, 4, 8, 127, 0, -1, -1, 0, -1, -1, 0)));
}
3538 
/// Basically 2x `_mm_mpsadbw_epu8` in parallel, over the two lanes.
__m256i _mm256_mpsadbw_epu8(int imm8)(__m256i a, __m256i b) pure @safe
{
    static if (GDC_with_AVX2)
    {
        return cast(__m256i) __builtin_ia32_mpsadbw256(cast(ubyte32)a,
                                                       cast(ubyte32)b,
                                                       imm8);
    }
    else static if (LDC_with_AVX2)
    {
        return cast(__m256i) __builtin_ia32_mpsadbw256(cast(byte32)a,
                                                       cast(byte32)b,
                                                       imm8);
    }
    else
    {
        // split; imm8 bits [2:0] control the low lane, bits [5:3] the high lane,
        // matching the vmpsadbw encoding.
        __m128i a_lo = _mm256_extractf128_si256!0(a);
        __m128i a_hi = _mm256_extractf128_si256!1(a);
        __m128i b_lo = _mm256_extractf128_si256!0(b);
        __m128i b_hi = _mm256_extractf128_si256!1(b);
        __m128i r_lo = _mm_mpsadbw_epu8!(imm8 & 7)(a_lo, b_lo);
        __m128i r_hi = _mm_mpsadbw_epu8!((imm8 >> 3) & 7)(a_hi, b_hi);
        return _mm256_set_m128i(r_hi, r_lo);
    }
}
unittest
{
    __m128i A = _mm_setr_epi8(0, 1, 2, 3,  4,  5, 6,  7, 8, 9, 10, 11, 12, 13, 14, 15);
    __m128i B = _mm_setr_epi8(9, 1, 2, 3, -1, -1, 0, -1, 5, 5,  5,  5, 12, 13, 14, 15);
    __m256i AA = _mm256_set_m128i(A, A);
    __m256i BB = _mm256_set_m128i(B, B);
    short[16] correct = [755, 753, 751, 749, 747, 745, 743, 741,
                          32,  28,  24,  20,  16,  12,  8,    4];
    short16 r5 = cast(short16) _mm256_mpsadbw_epu8!(7 * 8 + 5)(AA, BB);
    assert(r5.array == correct);
}
3577 
/// Multiply the low signed 32-bit integers from each packed 64-bit element in `a` and `b`, and 
/// return the signed 64-bit results.
__m256i _mm256_mul_epi32 (__m256i a, __m256i b) pure @trusted
{
    // PERF LDC + SSE2 to SSSE3. I don't quite see what to do, same problem in _mm_mul_epi32.
    static if (GDC_with_AVX2)
    {
        return cast(__m256i) __builtin_ia32_pmuldq256(cast(int8)a, cast(int8)b);
    }
    else static if ( (LDC_with_SSE41 || LDC_with_AVX2) && LDC_with_optimizations) 
    {
        // good with LDC + SSE4.1 to AVX2, else need to split
        // Gather even 32-bit elements, sign-extend them to 64-bit, then multiply.
        enum ir = `
            %ia = shufflevector <8 x i32> %0,<8 x i32> %0, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
            %ib = shufflevector <8 x i32> %1,<8 x i32> %1, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
            %la = sext <4 x i32> %ia to <4 x i64>
            %lb = sext <4 x i32> %ib to <4 x i64>
            %r = mul <4 x i64> %la, %lb
            ret <4 x i64> %r`;
        return cast(__m256i) LDCInlineIR!(ir, long4, int8, int8)(cast(int8)a, cast(int8)b);
    }
    else
    {
        // split, very beneficial with LDC+ARM64
        __m128i a_lo = _mm256_extractf128_si256!0(a);
        __m128i a_hi = _mm256_extractf128_si256!1(a);
        __m128i b_lo = _mm256_extractf128_si256!0(b);
        __m128i b_hi = _mm256_extractf128_si256!1(b);
        __m128i r_lo = _mm_mul_epi32(a_lo, b_lo);
        __m128i r_hi = _mm_mul_epi32(a_hi, b_hi);
        return _mm256_set_m128i(r_hi, r_lo);
    }
}
unittest
{
    __m256i A = _mm256_setr_epi32(61616461, 1915324654, 4564061, 3, 61616466, 1915324654, 4564061, 3);
    __m256i B = _mm256_setr_epi32(49716422, -915616216, -121144, 0, 49716422, -915616216, -121145, 0);
    long4 R = cast(long4) _mm256_mul_epi32(A, B);
    long[4] correct = [cast(long)61616461 * 49716422, cast(long)4564061 * -121144, cast(long)61616466 * 49716422, cast(long)4564061 * -121145];
    assert(R.array == correct);
}
3619 
/// Multiply the low unsigned 32-bit integers from each packed 64-bit element in `a` and `b`, and 
/// return the unsigned 64-bit results.
__m256i _mm256_mul_epu32 (__m256i a, __m256i b) pure @trusted
{
    // PERF DMD
    static if (GDC_with_AVX2)
    {
        return cast(__m256i) __builtin_ia32_pmuludq256(cast(int8)a, cast(int8)b);
    }
    else version(GNU)
    {
        // explicit split needed for GDC without avx2
        __m128i a_lo = _mm256_extractf128_si256!0(a);
        __m128i a_hi = _mm256_extractf128_si256!1(a);
        __m128i b_lo = _mm256_extractf128_si256!0(b);
        __m128i b_hi = _mm256_extractf128_si256!1(b);
        __m128i r_lo = _mm_mul_epu32(a_lo, b_lo);
        __m128i r_hi = _mm_mul_epu32(a_hi, b_hi);
        return _mm256_set_m128i(r_hi, r_lo);
    }       
    else
    {
        // Works well in all LDC cases, surprisingly.
        // Scalar form: multiply the even 32-bit elements (index 0, 2, 4, 6)
        // as unsigned, widening to 64-bit.
        int8 ia = cast(int8)a;
        int8 ib = cast(int8)b;
        long4 r;
        r.ptr[0] = cast(long)cast(uint)ia.array[0] * cast(long)cast(uint)ib.array[0];
        r.ptr[1] = cast(long)cast(uint)ia.array[2] * cast(long)cast(uint)ib.array[2];
        r.ptr[2] = cast(long)cast(uint)ia.array[4] * cast(long)cast(uint)ib.array[4];
        r.ptr[3] = cast(long)cast(uint)ia.array[6] * cast(long)cast(uint)ib.array[6];
        return cast(__m256i)r;
    }
}
unittest
{
    __m256i A = _mm256_set_epi32(42, 0xDEADBEEF, 42, 0xffffffff, 42, 0xDEADBEEF, 42, 0xffffffff);
    __m256i B = _mm256_set_epi32(42, 0xCAFEBABE, 42, 0xffffffff, 42, 0xCAFEBABE, 42, 0xffffffff);
    __m256i C = _mm256_mul_epu32(A, B);
    long4 LC = cast(long4)C;
    long[4] correct = [18446744065119617025uL, 12723420444339690338uL, 18446744065119617025uL, 12723420444339690338uL];
    assert(LC.array == correct);
}
3662 
/// Multiply the packed signed 16-bit integers in `a` and `b`, 
/// producing intermediate 32-bit integers, and return the high 
/// 16 bits of the intermediate integers.
__m256i _mm256_mulhi_epi16 (__m256i a, __m256i b) pure @safe
{
    static if (GDC_or_LDC_with_AVX2)
    {
        // GDC and LDC expose this builtin with the same name and signature,
        // so a single merged branch suffices (consistent with _mm256_mulhrs_epi16).
        return cast(__m256i) __builtin_ia32_pmulhw256(cast(short16)a, cast(short16)b);
    }
    else
    {
        // split into 2 x 128-bit _mm_mulhi_epi16
        __m128i a_lo = _mm256_extractf128_si256!0(a);
        __m128i a_hi = _mm256_extractf128_si256!1(a);
        __m128i b_lo = _mm256_extractf128_si256!0(b);
        __m128i b_hi = _mm256_extractf128_si256!1(b);
        __m128i r_lo = _mm_mulhi_epi16(a_lo, b_lo);
        __m128i r_hi = _mm_mulhi_epi16(a_hi, b_hi);
        return _mm256_set_m128i(r_hi, r_lo);
    }
}
unittest
{
    __m256i A = _mm256_setr_epi16(0, -16, 2, 3, 4, 8, 16, 7, 0, -16, 2, 3, 4, 8, 16, 8);
    __m256i B = _mm256_set1_epi16(16384);
    short16 R = cast(short16)_mm256_mulhi_epi16(A, B);
    short[16] correct = [0, -4, 0, 0, 1, 2, 4, 1, 0, -4, 0, 0, 1, 2, 4, 2];
    assert(R.array == correct);
}
3696 
/// Multiply the packed unsigned 16-bit integers in `a` and `b`, 
/// producing intermediate 32-bit integers, and return the high 
/// 16 bits of the intermediate integers.
__m256i _mm256_mulhi_epu16 (__m256i a, __m256i b) pure @safe
{
    static if (GDC_or_LDC_with_AVX2)
    {
        // GDC and LDC expose this builtin with the same name and signature,
        // so a single merged branch suffices (consistent with _mm256_mulhrs_epi16).
        return cast(__m256i) __builtin_ia32_pmulhuw256(cast(short16)a, cast(short16)b);
    }
    else
    {
        // split into 2 x 128-bit _mm_mulhi_epu16
        __m128i a_lo = _mm256_extractf128_si256!0(a);
        __m128i a_hi = _mm256_extractf128_si256!1(a);
        __m128i b_lo = _mm256_extractf128_si256!0(b);
        __m128i b_hi = _mm256_extractf128_si256!1(b);
        __m128i r_lo = _mm_mulhi_epu16(a_lo, b_lo);
        __m128i r_hi = _mm_mulhi_epu16(a_hi, b_hi);
        return _mm256_set_m128i(r_hi, r_lo);
    }
}
unittest
{
    // -16 is 65520 as ushort; 65520 * 16384 >> 16 == 16380.
    __m256i A = _mm256_setr_epi16(0, -16, 2, 3, 4, 8, 16, 7, 0, -16, 2, 3, 4, 8, 16, 8);
    __m256i B = _mm256_set1_epi16(16384);
    short16 R = cast(short16)_mm256_mulhi_epu16(A, B);
    short[16] correct = [0, 16380, 0, 0, 1, 2, 4, 1, 0, 16380, 0, 0, 1, 2, 4, 2];
    assert(R.array == correct);
}
3722 
/// Multiply packed signed 16-bit integers in `a` and `b`, producing intermediate signed 32-bit integers. 
/// Truncate each intermediate integer to the 18 most significant bits, round by adding 1, and return 
/// bits [16:1] to dst.
__m256i _mm256_mulhrs_epi16 (__m256i a, __m256i b) pure @safe
{
    static if (GDC_or_LDC_with_AVX2)
    {
        return cast(__m256i)__builtin_ia32_pmulhrsw256(cast(short16)a, cast(short16)b);
    }
    else
    {
        // ARM64: 8 instr with LDC >= 1.32 -O2, nice
        // split into 2 x 128-bit _mm_mulhrs_epi16
        __m128i a_lo = _mm256_extractf128_si256!0(a);
        __m128i a_hi = _mm256_extractf128_si256!1(a);
        __m128i b_lo = _mm256_extractf128_si256!0(b);
        __m128i b_hi = _mm256_extractf128_si256!1(b);
        __m128i r_lo = _mm_mulhrs_epi16(a_lo, b_lo);
        __m128i r_hi = _mm_mulhrs_epi16(a_hi, b_hi);
        return _mm256_set_m128i(r_hi, r_lo);
    }
}
unittest
{
    __m128i A = _mm_setr_epi16(12345, -32768, 32767, 0, 1, 845, -6999, -1);
    __m128i B = _mm_setr_epi16(8877, -24487, 15678, 32760, 1, 0, -149, -1);
    __m256i AB = _mm256_set_m128i(B, A);
    __m256i BA = _mm256_set_m128i(A, B);
    short16 C = cast(short16) _mm256_mulhrs_epi16(AB, BA);
    short[16] correct = [3344, 24487, 15678, 0, 0, 0, 32, 0, 3344, 24487, 15678, 0, 0, 0, 32, 0];
    assert(C.array == correct);
}
3754 
/// Multiply the packed signed 16-bit integers in `a` and `b`, producing intermediate 32-bit integers, 
/// and return the low 16 bits of the intermediate integers.
__m256i _mm256_mullo_epi16 (__m256i a, __m256i b) pure @safe
{
    // PERF D_SIMD
    // The low 16 bits of a widening multiply equal a plain wrapping 16-bit
    // multiply, so the vector `*` operator is enough where it codegens well.
    static if (GDC_with_AVX)
    {
        return cast(__m256i)(cast(short16)a * cast(short16)b);
    }
    else version(LDC)
    {
        return cast(__m256i)(cast(short16)a * cast(short16)b);
    }
    else
    {
        // split into 2 x 128-bit _mm_mullo_epi16
        __m128i a_lo = _mm256_extractf128_si256!0(a);
        __m128i a_hi = _mm256_extractf128_si256!1(a);
        __m128i b_lo = _mm256_extractf128_si256!0(b);
        __m128i b_hi = _mm256_extractf128_si256!1(b);
        __m128i r_lo = _mm_mullo_epi16(a_lo, b_lo);
        __m128i r_hi = _mm_mullo_epi16(a_hi, b_hi);
        return _mm256_set_m128i(r_hi, r_lo);
    }
}
unittest
{
    __m256i A = _mm256_setr_epi16(16384, -16, 0,      3, 4, 1, 16, 7, 16384, -16, 0,      3, 4, 1, 16, 7);
    __m256i B = _mm256_set1_epi16(16384);
    short16 R = cast(short16)_mm256_mullo_epi16(A, B);
    short[16] correct = [0, 0, 0, -16384, 0, 16384, 0, -16384, 0, 0, 0, -16384, 0, 16384, 0, -16384];
    assert(R.array == correct);
}
3788 
/// Multiply the packed signed 32-bit integers in `a` and `b`, producing intermediate 64-bit integers,
/// and store the low 32 bits of the intermediate integer.
__m256i _mm256_mullo_epi32 (__m256i a, __m256i b) pure @safe
{
    // PERF D_SIMD
    // The low 32 bits of a widening multiply equal a plain wrapping 32-bit
    // multiply, so the vector `*` operator is enough where it codegens well.
    static if (GDC_with_AVX)
    {
        return cast(__m256i)(cast(int8)a * cast(int8)b);
    }
    else version(LDC)
    {
        return cast(__m256i)(cast(int8)a * cast(int8)b);
    }
    else
    {
        // split into 2 x 128-bit _mm_mullo_epi32
        __m128i a_lo = _mm256_extractf128_si256!0(a);
        __m128i a_hi = _mm256_extractf128_si256!1(a);
        __m128i b_lo = _mm256_extractf128_si256!0(b);
        __m128i b_hi = _mm256_extractf128_si256!1(b);
        __m128i r_lo = _mm_mullo_epi32(a_lo, b_lo);
        __m128i r_hi = _mm_mullo_epi32(a_hi, b_hi);
        return _mm256_set_m128i(r_hi, r_lo);
    }
}
unittest
{
    __m256i A = _mm256_setr_epi32(61616461, 1915324654, 4564061, 3, 61616461, 1915324654, 4564061, 3);
    __m256i B = _mm256_setr_epi32(49716422, -915616216, -121144, 0, 49716422, -915616216, -121144, 1);
    int8 R = cast(int8) _mm256_mullo_epi32(A, B);
    int[8] correct = [cast(int)0xBF370D8E, cast(int)(1915324654 * -915616216), cast(int)(4564061 * -121144), 0,
                      cast(int)0xBF370D8E, cast(int)(1915324654 * -915616216), cast(int)(4564061 * -121144), 3];
    assert(R.array == correct);
}
3823 
/// Compute the bitwise OR of 256 bits (representing integer data) in `a` and `b`.
__m256i _mm256_or_si256 (__m256i a, __m256i b) pure @safe
{
    // The built-in vector OR operator is sufficient here.
    __m256i result = a | b;
    return result;
}
unittest
{
    long odd  = 0x55555555_55555555;
    long even = 0xAAAAAAAA_AAAAAAAA;
    __m256i x = _mm256_set_epi64(odd, even, odd, even);
    __m256i y = _mm256_set_epi64(even, odd, 0, even);
    long[4] expected = [even, odd, -1, -1];
    __m256i r = _mm256_or_si256(x, y);
    assert(r.array == expected);
}
3839 
/// Convert packed signed 16-bit integers from `a` and `b `to packed 8-bit integers using signed saturation.
/// Warning: `a` and `b` are interleaved per-lane. 
///           Result has: `a` lane 0, `b` lane 0, `a` lane 1, `b` lane 1.
__m256i _mm256_packs_epi16 (__m256i a, __m256i b) pure @safe
{
    // PERF D_SIMD
    static if (GDC_or_LDC_with_AVX2)
    {
        // GDC and LDC expose this builtin with the same name and signature,
        // so a single merged branch suffices (consistent with _mm256_mulhrs_epi16).
        return cast(__m256i) __builtin_ia32_packsswb256(cast(short16)a, cast(short16)b);
    }
    else
    {
        // split into 2 x 128-bit _mm_packs_epi16; this naturally reproduces
        // the per-lane interleaving of vpacksswb.
        __m128i a_lo = _mm256_extractf128_si256!0(a);
        __m128i a_hi = _mm256_extractf128_si256!1(a);
        __m128i b_lo = _mm256_extractf128_si256!0(b);
        __m128i b_hi = _mm256_extractf128_si256!1(b);
        __m128i r_lo = _mm_packs_epi16(a_lo, b_lo);
        __m128i r_hi = _mm_packs_epi16(a_hi, b_hi);
        return _mm256_set_m128i(r_hi, r_lo);
    }
}
unittest
{
    __m256i A = _mm256_setr_epi16(1000, -1000, 1000, 0, 256, -129, 254, 0, 
                                 -1000, -1000, 1000, 0, 256, -129, 254, 0);
    byte32 R = cast(byte32) _mm256_packs_epi16(A, A);
    byte[32] correct = [127, -128, 127, 0, 127, -128, 127, 0,
                        127, -128, 127, 0, 127, -128, 127, 0,
                       -128, -128, 127, 0, 127, -128, 127, 0,
                       -128, -128, 127, 0, 127, -128, 127, 0];
    assert(R.array == correct);
}
3876 
/// Convert packed signed 32-bit integers from `a` and `b `to packed 16-bit integers using signed saturation.
/// Warning: `a` and `b` are interleaved per-lane.
///           Result has: `a` lane 0, `b` lane 0, `a` lane 1, `b` lane 1.
__m256i _mm256_packs_epi32 (__m256i a, __m256i b) pure @safe
{
    // PERF D_SIMD
    static if (GDC_or_LDC_with_AVX2)
    {
        // GDC and LDC branches were byte-identical; merged into one path
        // for consistency with other AVX2-gated intrinsics in this file.
        return cast(__m256i) __builtin_ia32_packssdw256(cast(int8)a, cast(int8)b);
    }
    else
    {
        // Split into 128-bit halves and reuse the SSE2 saturating pack.
        __m128i a_lo = _mm256_extractf128_si256!0(a);
        __m128i a_hi = _mm256_extractf128_si256!1(a);
        __m128i b_lo = _mm256_extractf128_si256!0(b);
        __m128i b_hi = _mm256_extractf128_si256!1(b);
        __m128i r_lo = _mm_packs_epi32(a_lo, b_lo);
        __m128i r_hi = _mm_packs_epi32(a_hi, b_hi);
        return _mm256_set_m128i(r_hi, r_lo);
    }
}
unittest
{
    __m256i A = _mm256_setr_epi32(100000, -100000, 1000, 0, 4, 5, -100000, 7);
    short16 R = cast(short16) _mm256_packs_epi32(A, A);
    short[16] correct = [32767, -32768, 1000, 0, 32767, -32768, 1000, 0, 4, 5, -32768, 7, 4, 5, -32768, 7];
    assert(R.array == correct);
}
3909 
3910 
/// Convert packed signed 16-bit integers from `a` and `b `to packed 8-bit integers using unsigned saturation.
/// Warning: `a` and `b` are interleaved per-lane. 
///           Result has: `a` lane 0, `b` lane 0, `a` lane 1, `b` lane 1.
__m256i _mm256_packus_epi16 (__m256i a, __m256i b) pure @trusted
{
    // PERF D_SIMD
    static if (GDC_or_LDC_with_AVX2)
    {
        // GDC and LDC branches were byte-identical; merged into one path
        // for consistency with other AVX2-gated intrinsics in this file.
        return cast(__m256i) __builtin_ia32_packuswb256(cast(short16)a, cast(short16)b);
    }
    else
    {
        // Always beneficial with LDC.
        // arm64: 4 inst with LDC  -O1
        __m128i a_lo = _mm256_extractf128_si256!0(a);
        __m128i a_hi = _mm256_extractf128_si256!1(a);
        __m128i b_lo = _mm256_extractf128_si256!0(b);
        __m128i b_hi = _mm256_extractf128_si256!1(b);
        __m128i r_lo = _mm_packus_epi16(a_lo, b_lo);
        __m128i r_hi = _mm_packus_epi16(a_hi, b_hi);
        return _mm256_set_m128i(r_hi, r_lo);
    }
}
unittest
{
    __m256i A = _mm256_setr_epi16(-10, 400, 0, 256, 255, 2, 1, 0, -10, 400,  0, 256, -32768,  2,  1, 0);
    __m256i B = _mm256_setr_epi16(  0,   1, 2,   3,   4, 5, 6, 7,   8,   9, 10,  11,     12, 13, 14, 15);
    byte32 R = cast(byte32) _mm256_packus_epi16(A, B);
   align(32) static immutable byte[32] correctResult = [0, -1, 0, -1, -1, 2, 1, 0, 0, 1,  2,  3,  4,  5,  6,  7,
                                                        0, -1, 0, -1, 0  , 2, 1, 0, 8, 9, 10, 11, 12, 13, 14, 15];
    assert(R.array == correctResult);
}
3947 
/// Convert packed signed 32-bit integers from `a` and `b `to packed 16-bit integers using unsigned saturation.
/// Warning: `a` and `b` are interleaved per-lane.
///           Result has: `a` lane 0, `b` lane 0, `a` lane 1, `b` lane 1.
__m256i _mm256_packus_epi32 (__m256i a, __m256i b) pure @safe
{
    // PERF D_SIMD
    static if (GDC_or_LDC_with_AVX2)
    {
        // GDC and LDC branches were byte-identical; merged into one path
        // for consistency with other AVX2-gated intrinsics in this file.
        return cast(__m256i) __builtin_ia32_packusdw256(cast(int8)a, cast(int8)b);
    }
    else
    {
        // 8 inst in arm64 since LDC 1.22 -O2,
        // sounds a bit underperforming maybe
        __m128i a_lo = _mm256_extractf128_si256!0(a);
        __m128i a_hi = _mm256_extractf128_si256!1(a);
        __m128i b_lo = _mm256_extractf128_si256!0(b);
        __m128i b_hi = _mm256_extractf128_si256!1(b);
        __m128i r_lo = _mm_packus_epi32(a_lo, b_lo);
        __m128i r_hi = _mm_packus_epi32(a_hi, b_hi);
        return _mm256_set_m128i(r_hi, r_lo);
    }
}
unittest
{
    __m256i A = _mm256_setr_epi32(100000, -100000, 1000, 0, 100000, -100000, 1000, 1);
    short16 R = cast(short16) _mm256_packus_epi32(A, A);
    short[16] correct = [cast(short)65535, 0, 1000, 0, cast(short)65535, 0, 1000, 0,
                         cast(short)65535, 0, 1000, 1, cast(short)65535, 0, 1000, 1];
    assert(R.array == correct);
}
3983 
/// Shuffle 128-bits (composed of 2 packed (128-bit) integer elements)
/// selected by `imm8` from `a` and `b`.
/// See the documentation as the `imm8` format is quite complex.
__m256i _mm256_permute2x128_si256(int imm8)(__m256i a, __m256i b) pure @safe
{
    // PERF: the only difference with _mm256_permute2f128_si256, which we
    // haven't reproduced here, is that _mm256_permute2x128_si256 is supposed
    // to be with an integer hint at instruction level, and requires AVX2.
    return _mm256_permute2f128_si256!imm8(a, b);
}
unittest
{
    // Fix: the previous test exercised _mm256_permute2f128_pd instead of
    // the function defined here.
    __m256i A = _mm256_setr_epi64(8, 1, 2, 3);
    __m256i B = _mm256_setr_epi64(4, 5, 6, 7);
    // imm8 = 57: low dest lane zeroed (bit 3 set), high dest lane = `b` lane 1.
    __m256i R = _mm256_permute2x128_si256!(3*16 + 8 + 1)(A, B);
    long[4] correct = [0, 0, 6, 7];
    assert(R.array == correct);
}
4002 
/// Shuffle 64-bit integers in `a` across lanes using the control in `imm8`.
__m256i _mm256_permute4x64_epi64(int imm8)(__m256i a) pure @trusted
{
    static if (GDC_with_AVX2)
        return cast(__m256i) __builtin_ia32_permdi256(a, imm8);
    else static if (LDC_with_optimizations)
    {
        // Compile-time shuffle: each 2-bit field of imm8 selects one of the
        // four source elements for the corresponding destination slot.
        return shufflevector!(long4, (imm8 >> 0) & 3,
                              (imm8 >> 2) & 3,
                              (imm8 >> 4) & 3,
                              (imm8 >> 6) & 3)(a, a);
    }
    else
    {
        // Scalar fallback: copy `a` first, since destination elements are
        // overwritten in place while sources may still be needed.
        __m256i b = a;
        static foreach (i; 0..4)
            a[i] = b[(imm8 & (0b00000011 << (i * 2))) >> (i * 2)];
        return a;
    }
}
unittest
{
    __m256i A = _mm256_setr_epi64x(1, 2, 3, 4);
    static immutable long[4] correct = [ 4, 3, 2, 1 ];
    assert(_mm256_permute4x64_epi64!(0b00011011)(A).array == correct);

    A = _mm256_setr_epi64x(1, 2, 3, 4);
    static immutable long[4] correct2 = [ 1, 4, 1, 1 ];
    assert(_mm256_permute4x64_epi64!(0b00001100)(A).array == correct2);
}
4033 
/// Shuffle 64-bit double in `a` across lanes using the control in `imm8`.
__m256d _mm256_permute4x64_pd(int imm8)(__m256d a) pure @trusted
{
    // PERF: ignore instruction-level type hint
    // The lane selection is identical to the integer variant, so delegate
    // and reinterpret the bits back to doubles.
    __m256i bits = cast(__m256i) a;
    __m256i shuffled = _mm256_permute4x64_epi64!imm8(bits);
    return cast(__m256d) shuffled;
}
unittest
{
    __m256d input = _mm256_setr_pd(1.0, 2.0, 3.0, 4.0);
    static immutable double[4] expected = [ 4.0, 3.0, 2.0, 1.0 ];
    assert(_mm256_permute4x64_pd!(0b00011011)(input).array == expected);
}
4046 
/// Shuffle 32-bit integers in `a` across lanes using the corresponding index in `idx`.
__m256i _mm256_permutevar8x32_epi32 (__m256i a, __m256i idx) pure @trusted
{
    // While it _should_ be possible to use 4x _mm_shuffle_epi8 for this permute,
    // it is quite hard to pull off and simd-everwhere doesn't attempt either.
    static if (GDC_or_LDC_with_AVX2)
    {
        return cast(__m256i) __builtin_ia32_permvarsi256(cast(int8)a, cast(int8)idx);
    }    
    else
    {
        // PERF ARM64 and x86 without AVX, it's not very good
        int8 ai = cast(int8)a;
        int8 ii = cast(int8)idx;
        int8 ri;

        // Only the low 3 bits of each index are honored (`& 7`), so
        // out-of-range indices wrap instead of being undefined.
        for (int j = 0; j < 8; ++j)
        {
            ri.ptr[j] = ai.array[ ii[j] & 7 ];
        }
        return cast(__m256i) ri;
    }
}
unittest
{
    __m256i A = _mm256_setr_epi32(8, 9, 10, 11, 12, 13, 14, 15);
    __m256i B = _mm256_setr_epi32(8 + 1, 4, 7, 8 + 2, 24, 3, 3, 2);
    int8 R = cast(int8) _mm256_permutevar8x32_epi32(A, B);
    int[8] correct = [ 9, 12, 15, 10, 8, 11, 11, 10 ];
    assert(R.array == correct);
}
4078 
/// Shuffle single-precision (32-bit) floating-point in `a` across lanes using the 
/// corresponding index in `idx`.
__m256 _mm256_permutevar8x32_ps (__m256 a, __m256i idx) pure @safe
{
    // A float permute moves 32-bit units exactly like the integer permute:
    // reinterpret, permute, reinterpret back.
    __m256i bits = cast(__m256i) a;
    __m256i permuted = _mm256_permutevar8x32_epi32(bits, idx);
    return cast(__m256) permuted;
}
4085 
/// Compute the absolute differences of packed unsigned 8-bit integers in `a` and `b`, then horizontally sum each
/// consecutive 8 differences to produce two unsigned 16-bit integers, and pack these unsigned 16-bit integers in the
/// low 16 bits of 64-bit elements in result.
__m256i _mm256_sad_epu8 (__m256i a, __m256i b) pure @trusted
{
    static if (GDC_with_AVX2)
    {
        return cast(__m256i) __builtin_ia32_psadbw256(cast(ubyte32)a, cast(ubyte32)b);
    }
    else static if (LDC_with_AVX2)
    {
        // NOTE(review): same builtin as the GDC path but with byte32 casts,
        // presumably matching each compiler's builtin signature — confirm.
        return cast(__m256i) __builtin_ia32_psadbw256(cast(byte32)a, cast(byte32)b);
    }
    else
    {
        // split is beneficial for ARM64, LDC and GDC without AVX2
        __m128i a_lo = _mm256_extractf128_si256!0(a);
        __m128i a_hi = _mm256_extractf128_si256!1(a);
        __m128i b_lo = _mm256_extractf128_si256!0(b);
        __m128i b_hi = _mm256_extractf128_si256!1(b);
        __m128i r_lo = _mm_sad_epu8(a_lo, b_lo);
        __m128i r_hi = _mm_sad_epu8(a_hi, b_hi);
        return _mm256_set_m128i(r_hi, r_lo);
    }
}
unittest
{
    __m256i A = _mm256_setr_epi8(3, 4, 6, 8, 12, 14, 18, 20, 24, 30, 32, 38, 42, 44, 48, 54,
                              3, 4, 6, 8, 12, 14, 18, 20, 24, 30, 32, 38, 42, 44, 48, 54); // primes + 1
    __m256i B = _mm256_set1_epi8(1);
    int8 R = cast(int8) _mm256_sad_epu8(A, B);
    // Each group of 8 absolute differences sums into the low 16 bits of a
    // 64-bit result element; upper bits are zero.
    int[8] correct = [2 + 3 + 5 + 7 + 11 + 13 + 17 + 19,
                      0,
                      23 + 29 + 31 + 37 + 41 + 43 + 47 + 53,
                      0,
                      2 + 3 + 5 + 7 + 11 + 13 + 17 + 19,
                      0,
                      23 + 29 + 31 + 37 + 41 + 43 + 47 + 53,
                      0];
    assert(R.array == correct);
}
4127 
/// Shuffle 32-bit integers in `a` within 128-bit lanes using the control in `imm8`, and return the results.
__m256i _mm256_shuffle_epi32(int imm8)(__m256i a) pure @trusted
{
    static if (GDC_with_AVX2)
        return cast(__m256i)__builtin_ia32_pshufd256(cast(int8)a, imm8);
    else static if (LDC_with_AVX2)
    {
        // Each 2-bit field of imm8 selects within the lane; same pattern
        // repeated for the upper lane (indices +4).
        return cast(__m256i)shufflevectorLDC!(int8,
            (imm8 >> 0) & 3,
            (imm8 >> 2) & 3,
            (imm8 >> 4) & 3,
            (imm8 >> 6) & 3,
            ((imm8 >> 0) & 3) + 4,
            ((imm8 >> 2) & 3) + 4,
            ((imm8 >> 4) & 3) + 4,
            ((imm8 >> 6) & 3) + 4)(cast(int8)a, cast(int8)a);
    }
    else
    {
        // Fix: the halves were named `hi`/`lo` backwards (extractf128!0 is
        // the LOW lane). Renamed to the r_lo/r_hi convention used by the
        // other split paths in this file; behavior unchanged.
        __m128i r_lo = _mm_shuffle_epi32!imm8(_mm256_extractf128_si256!0(a));
        __m128i r_hi = _mm_shuffle_epi32!imm8(_mm256_extractf128_si256!1(a));
        return _mm256_set_m128i(r_hi, r_lo);
    }
}
unittest
{
    __m256i a = _mm256_set_epi32(32, 31, 30, 29, 28, 27, 26, 25);
    assert(_mm256_shuffle_epi32!255(a).array == [120259084316L, 120259084316, 137438953504, 137438953504]);
}
4157 
/// Shuffle 8-bit integers in `a` within 128-bit lanes according to shuffle control mask in the 
/// corresponding 8-bit element of `b`.
__m256i _mm256_shuffle_epi8(__m256i a, __m256i b) pure @trusted
{
    static if (GDC_with_AVX2)
        return cast(__m256i)__builtin_ia32_pshufb256(cast(ubyte32)a, cast(ubyte32)b);
    else static if (LDC_with_AVX2)
        return cast(__m256i)__builtin_ia32_pshufb256(cast(byte32)a, cast(byte32)b);
    else
    {
        // Fix: the halves were named `hi`/`lo` backwards (extractf128!0 is
        // the LOW lane). Renamed to the r_lo/r_hi convention used by the
        // other split paths in this file; behavior unchanged.
        __m128i r_lo = _mm_shuffle_epi8(_mm256_extractf128_si256!0(a), _mm256_extractf128_si256!0(b));
        __m128i r_hi = _mm_shuffle_epi8(_mm256_extractf128_si256!1(a), _mm256_extractf128_si256!1(b));
        return _mm256_set_m128i(r_hi, r_lo);
    }
}
unittest
{
    __m256i a = _mm256_set_epi8(32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1);
    __m256i b = _mm256_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1);

    __m256i expected = _mm256_setr_epi8(
        2, 2, 2, 2, 2, 2, 2, 2, 
        1, 1, 1, 1, 1, 1, 1, 1, 
        18, 18, 18, 18, 18, 18, 18, 18, 
        17, 17, 17, 17, 17, 17, 17, 17
    );

    assert(_mm256_shuffle_epi8(a, b).array == expected.array);
}
4187 
/// Shuffle 16-bit integers in the high 64 bits of 128-bit lanes of `a` using
/// the control in `imm8`. Store the results in the high 64 bits of 128-bit lanes
/// of result, with the low 64 bits of 128-bit lanes being copied from from `a`.
/// See also: `_MM_SHUFFLE`.
__m256i _mm256_shufflehi_epi16(int imm8)(__m256i a) pure @safe
{
    static if (GDC_with_AVX2)
    {
        return cast(__m256i) __builtin_ia32_pshufhw256(cast(short16)a, imm8);
    }
    else static if (LDC_with_optimizations)
    {
        // Indices 0..3 and 8..11 pass the low quads through unchanged;
        // each 2-bit field of imm8 picks within the high quad of each lane.
        return cast(__m256i) shufflevectorLDC!(short16,
            0, 1, 2, 3,
            4 + ( (imm8 >> 0) & 3 ),
            4 + ( (imm8 >> 2) & 3 ),
            4 + ( (imm8 >> 4) & 3 ),
            4 + ( (imm8 >> 6) & 3 ),
            8, 9, 10, 11,
            12 + ( (imm8 >> 0) & 3 ),
            12 + ( (imm8 >> 2) & 3 ),
            12 + ( (imm8 >> 4) & 3 ),
            12 + ( (imm8 >> 6) & 3 ))
            (cast(short16)a, cast(short16)a);
    }
    else
    {
        // Split into 128-bit halves and reuse the SSE2 shuffle on each.
        __m128i a_lo = _mm256_extractf128_si256!0(a);
        __m128i a_hi = _mm256_extractf128_si256!1(a);
        __m128i r_lo = _mm_shufflehi_epi16!imm8(a_lo);
        __m128i r_hi = _mm_shufflehi_epi16!imm8(a_hi);
        return _mm256_set_m128i(r_hi, r_lo);
    }
}
unittest
{
    __m256i A = _mm256_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
    enum int SHUFFLE = _MM_SHUFFLE(0, 1, 2, 3);
    short16 B = cast(short16) _mm256_shufflehi_epi16!SHUFFLE(A);
    short[16] expectedB = [ 0, 1, 2, 3, 7, 6, 5, 4, 8, 9, 10, 11, 15, 14, 13, 12 ];
    assert(B.array == expectedB);
}
4230 
/// Shuffle 16-bit integers in the low 64 bits of 128-bit lanes of `a` using
/// the control in `imm8`. Store the results in the low 64 bits of 128-bit lanes 
/// of result, with the high 64 bits of 128-bit lanes being copied from from `a`.
/// See also: `_MM_SHUFFLE`.
__m256i _mm256_shufflelo_epi16(int imm8)(__m256i a) pure @safe
{
    static if (GDC_with_AVX2)
    {
        return cast(__m256i) __builtin_ia32_pshuflw256(cast(short16)a, imm8);
    }
    else static if (LDC_with_optimizations)
    { 
        // Indices 4..7 and 12..15 pass the high quads through unchanged;
        // each 2-bit field of imm8 picks within the low quad of each lane.
        return cast(__m256i) shufflevectorLDC!(short16,
            ( (imm8 >> 0) & 3 ),
            ( (imm8 >> 2) & 3 ),
            ( (imm8 >> 4) & 3 ),
            ( (imm8 >> 6) & 3 ), 
            4, 5, 6, 7,
            ( (imm8 >> 0) & 3 ) + 8,
            ( (imm8 >> 2) & 3 ) + 8,
            ( (imm8 >> 4) & 3 ) + 8,
            ( (imm8 >> 6) & 3 ) + 8,
            12, 13, 14, 15)
            (cast(short16)a, cast(short16)a);
    }
    else
    {
        // Split into 128-bit halves and reuse the SSE2 shuffle on each.
        __m128i a_lo = _mm256_extractf128_si256!0(a);
        __m128i a_hi = _mm256_extractf128_si256!1(a);
        __m128i r_lo = _mm_shufflelo_epi16!imm8(a_lo);
        __m128i r_hi = _mm_shufflelo_epi16!imm8(a_hi);
        return _mm256_set_m128i(r_hi, r_lo);
    }
}
unittest
{
    __m256i A = _mm256_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
    enum int SHUFFLE = _MM_SHUFFLE(0, 1, 2, 3);
    short16 B = cast(short16) _mm256_shufflelo_epi16!SHUFFLE(A);
    short[16] expectedB = [ 3, 2, 1, 0, 4, 5, 6, 7, 11, 10, 9, 8, 12, 13, 14, 15 ];
    assert(B.array == expectedB);
}
4273 
/// Negate packed signed 16-bit integers in `a` when the corresponding signed 16-bit integer in `b` is negative.
/// Elements in result are zeroed out when the corresponding element in `b` is zero.
__m256i _mm256_sign_epi16 (__m256i a, __m256i b) pure @safe
{
    // PERF DMD
    static if (GDC_or_LDC_with_AVX2)
    {
        // GDC and LDC branches were byte-identical; merged into one path
        // for consistency with other AVX2-gated intrinsics in this file.
        return cast(__m256i) __builtin_ia32_psignw256(cast(short16)a, cast(short16)b);
    }
    else // split
    {
        __m128i a_lo = _mm256_extractf128_si256!0(a);
        __m128i a_hi = _mm256_extractf128_si256!1(a);
        __m128i b_lo = _mm256_extractf128_si256!0(b);
        __m128i b_hi = _mm256_extractf128_si256!1(b);
        __m128i r_lo = _mm_sign_epi16(a_lo, b_lo);
        __m128i r_hi = _mm_sign_epi16(a_hi, b_hi);
        return _mm256_set_m128i(r_hi, r_lo);
    }
    // PERF: not optimal in AVX without AVX2
}
unittest
{
    __m128i A = _mm_setr_epi16(-2, -1, 0, 1,  2, short.min, short.min, short.min);
    __m128i B = _mm_setr_epi16(-1,  0,-1, 1, -2,       -50,         0,        50);
    __m256i AA = _mm256_set_m128i(A, A);
    __m256i BB = _mm256_set_m128i(B, B);
    short16 C = cast(short16) _mm256_sign_epi16(AA, BB);
    short[16] correct =        [ 2,  0, 0, 1, -2, short.min,         0, short.min, 2,  0, 0, 1, -2, short.min,         0, short.min];
    assert(C.array == correct);
}
4309 
/// Negate packed signed 32-bit integers in `a` when the corresponding signed 32-bit integer in `b` is negative.
/// Elements in result are zeroed out when the corresponding element in `b` is zero.
__m256i _mm256_sign_epi32 (__m256i a, __m256i b) pure @safe
{
    // PERF DMD
    static if (GDC_or_LDC_with_AVX2)
    {
        // GDC and LDC branches were byte-identical; merged into one path
        // for consistency with other AVX2-gated intrinsics in this file.
        return cast(__m256i) __builtin_ia32_psignd256(cast(int8)a, cast(int8)b);
    }
    else // split
    {
        __m128i a_lo = _mm256_extractf128_si256!0(a);
        __m128i a_hi = _mm256_extractf128_si256!1(a);
        __m128i b_lo = _mm256_extractf128_si256!0(b);
        __m128i b_hi = _mm256_extractf128_si256!1(b);
        __m128i r_lo = _mm_sign_epi32(a_lo, b_lo);
        __m128i r_hi = _mm_sign_epi32(a_hi, b_hi);
        return _mm256_set_m128i(r_hi, r_lo);
    }
    // PERF: not optimal in AVX without AVX2
}
unittest
{
    __m256i A = _mm256_setr_epi32(-2, -1,  0, int.max, -2, -1,  0, int.max);
    __m256i B = _mm256_setr_epi32(-1,  0, -1,       1, -1,  0, -1,       1);
    int8 C = cast(int8) _mm256_sign_epi32(A, B);
    int[8] correct =             [ 2,  0, 0, int.max,   2,  0,  0, int.max];
    assert(C.array == correct);
}
4343 
/// Negate packed signed 8-bit integers in `a` when the corresponding signed 8-bit integer in `b` is negative.
/// Elements in result are zeroed out when the corresponding element in `b` is zero.
__m256i _mm256_sign_epi8 (__m256i a, __m256i b) pure @safe
{
    // PERF DMD
    static if (GDC_with_AVX2)
    {
        return cast(__m256i) __builtin_ia32_psignb256(cast(ubyte32)a, cast(ubyte32)b);
    }
    else static if (LDC_with_AVX2)
    {
        // NOTE(review): same builtin as the GDC path but with byte32 casts,
        // presumably matching each compiler's builtin signature — confirm.
        return cast(__m256i) __builtin_ia32_psignb256(cast(byte32)a, cast(byte32)b);
    }
    else // split
    {
        // LDC arm64, 10 inst since LDC 1.32.1 -O1
        __m128i a_lo = _mm256_extractf128_si256!0(a);
        __m128i a_hi = _mm256_extractf128_si256!1(a);
        __m128i b_lo = _mm256_extractf128_si256!0(b);
        __m128i b_hi = _mm256_extractf128_si256!1(b);
        __m128i r_lo = _mm_sign_epi8(a_lo, b_lo);
        __m128i r_hi = _mm_sign_epi8(a_hi, b_hi);
        return _mm256_set_m128i(r_hi, r_lo);
    }
    // PERF: not optimal in AVX without AVX2
}
unittest
{
    __m256i A = _mm256_setr_epi8( 1,  1, 1, 1,  1,        1,       -2,        1,  0,  1, 0, 0,  0,        0,       -2,        1, 
                                 -2, -1, 0, 1,  2, byte.min, byte.min, byte.min, -1,  0,-1, 1, -2,      -50,        0,       50);
    __m256i B = _mm256_setr_epi8(-1,  0,-1, 1, -2,      -50,        0,       50, -1,  0,-1, 1, -2,      -50,        0,       50,
                                 -1,  0,-1, 1, -2,      -50,        0,       50, -2, -1, 0, 1,  2, byte.min, byte.min, byte.min);
    byte32  C = cast(byte32) _mm256_sign_epi8(A, B);
    byte[32] correct =         [ -1, 0,-1, 1, -1,       -1,        0,        1,  0,  0, 0, 0,  0,        0,        0,        1,        
                                  2, 0, 0, 1, -2, byte.min,        0, byte.min,  1,  0, 0, 1, -2,       50,        0,      -50];
    assert(C.array == correct);
}
4381 
/// Shift packed 16-bit integers in `a` left by `count` while shifting in zeroes.
/// Bit-shift is a single value in the low-order 64-bit of `count`. 
/// If bit-shift > 15, result is defined to be all zeroes.
/// Note: prefer `_mm256_slli_epi16`, less of a trap.
__m256i _mm256_sll_epi16 (__m256i a, __m128i count) pure @trusted
{
    // PERF ARM64
    static if (GDC_or_LDC_with_AVX2)
    {
        return cast(__m256i) __builtin_ia32_psllw256(cast(short16)a, cast(short8)count);
    }
    else
    {
        // Split: both 128-bit halves are shifted by the same `count`.
        __m128i a_lo = _mm256_extractf128_si256!0(a);
        __m128i a_hi = _mm256_extractf128_si256!1(a);
        __m128i r_lo = _mm_sll_epi16(a_lo, count);
        __m128i r_hi = _mm_sll_epi16(a_hi, count);
        return _mm256_set_m128i(r_hi, r_lo);
    }
}
unittest
{
    __m128i shift0 = _mm_setzero_si128();
    __m128i shiftX = _mm_set1_epi64x(0x8000_0000_0000_0000); // too large shift
    __m128i shift2 = _mm_setr_epi32(2, 0, 4, 5);
    __m256i A = _mm256_setr_epi16(4, -8, 11, -32768, 4, -8, 11, -32768, 4, -8, 11, -32768, 4, -8, 11, -32768);
    short[16] correct0  = (cast(short16)A).array;
    short[16] correctX  = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]; 
    short[16] correct2  = [16, -32, 44, 0, 16, -32, 44, 0, 16, -32, 44, 0, 16, -32, 44, 0];
    short16 B0 = cast(short16) _mm256_sll_epi16(A, shift0);
    short16 BX = cast(short16) _mm256_sll_epi16(A, shiftX);
    short16 B2 = cast(short16) _mm256_sll_epi16(A, shift2);
    assert(B0.array == correct0);
    assert(BX.array == correctX);
    assert(B2.array == correct2);
}
4418 
/// Shift packed 32-bit integers in `a` left by `count` while shifting in zeroes.
/// Bit-shift is a single value in the low-order 64-bit of `count`. 
/// If bit-shift > 31, result is defined to be all zeroes.
/// Note: prefer `_mm256_slli_epi32`, less of a trap.
__m256i _mm256_sll_epi32 (__m256i a, __m128i count) pure @trusted
{
    // PERF ARM64
    static if (GDC_or_LDC_with_AVX2)
    {
        return cast(__m256i) __builtin_ia32_pslld256(cast(int8)a, count);
    }
    else
    {
        // Split: both 128-bit halves are shifted by the same `count`.
        __m128i a_lo = _mm256_extractf128_si256!0(a);
        __m128i a_hi = _mm256_extractf128_si256!1(a);
        __m128i r_lo = _mm_sll_epi32(a_lo, count);
        __m128i r_hi = _mm_sll_epi32(a_hi, count);
        return _mm256_set_m128i(r_hi, r_lo);
    }
}
unittest
{
    __m128i shift0 = _mm_setzero_si128();
    __m128i shiftX = _mm_set1_epi64x(0x8000_0000_0000_0000); // too large shift
    __m128i shift2 = _mm_setr_epi32(2, 0, 4, 5);
    __m256i A = _mm256_setr_epi32(4, -9, 11, -2147483648, 2, -9, 11, -2147483648);
    int[8] correct0  = (cast(int8)A).array;
    int[8] correctX  = [0, 0, 0, 0, 0, 0, 0, 0]; 
    int[8] correct2  = [16, -36, 44, 0, 8, -36, 44, 0];
    int8 B0 = cast(int8) _mm256_sll_epi32(A, shift0);
    int8 BX = cast(int8) _mm256_sll_epi32(A, shiftX);
    int8 B2 = cast(int8) _mm256_sll_epi32(A, shift2);
    assert(B0.array == correct0);
    assert(BX.array == correctX);
    assert(B2.array == correct2);
}
4455 
/// Shift packed 64-bit integers in `a` left by `count` while shifting in zeroes.
/// Bit-shift is a single value in the low-order 64-bit of `count`. 
/// If bit-shift > 63, result is defined to be all zeroes.
/// Note: prefer `_mm256_slli_epi64`, less of a trap.
__m256i _mm256_sll_epi64 (__m256i a, __m128i count) pure @trusted
{
    // PERF ARM64
    static if (GDC_or_LDC_with_AVX2)
    {
        return cast(__m256i) __builtin_ia32_psllq256(cast(long4)a, cast(long2)count);
    }
    else
    {
        // Split: both 128-bit halves are shifted by the same `count`.
        __m128i a_lo = _mm256_extractf128_si256!0(a);
        __m128i a_hi = _mm256_extractf128_si256!1(a);
        __m128i r_lo = _mm_sll_epi64(a_lo, count);
        __m128i r_hi = _mm_sll_epi64(a_hi, count);
        return _mm256_set_m128i(r_hi, r_lo);
    }
}
unittest
{
    __m128i shift0 = _mm_setzero_si128();
    __m128i shiftX = _mm_set1_epi64x(0x8000_0000_0000_0000); // too large shift
    __m128i shift2 = _mm_setr_epi32(2, 0, 4, 5);
    __m256i A = _mm256_setr_epi64(4, -9, 5, -8);
    long[4] correct0  = [ 4,  -9, 5, -8];
    long[4] correctX  = [ 0,   0,  0, 0];
    long[4] correct2  = [16, -36, 20, -32];
    long4 B0 = cast(long4) _mm256_sll_epi64(A, shift0);
    long4 BX = cast(long4) _mm256_sll_epi64(A, shiftX);
    long4 B2 = cast(long4) _mm256_sll_epi64(A, shift2);
    assert(B0.array == correct0);
    assert(BX.array == correctX);
    assert(B2.array == correct2);
}
4492 
/// Shift packed 16-bit integers in `a` left by `imm8` while shifting in zeros.
__m256i _mm256_slli_epi16(__m256i a, int imm8) pure @safe
{
    static if (GDC_or_LDC_with_AVX2)
    {
        // cast(ubyte): only the low 8 bits of the count reach the builtin
        // (the unittest below checks count 257 behaves like 1).
        return cast(__m256i) __builtin_ia32_psllwi256(cast(short16)a, cast(ubyte)imm8);
    }
    else // split
    {
        __m128i a_lo = _mm256_extractf128_si256!0(a);
        __m128i a_hi = _mm256_extractf128_si256!1(a);
        __m128i r_lo = _mm_slli_epi16(a_lo, imm8);
        __m128i r_hi = _mm_slli_epi16(a_hi, imm8);
        return _mm256_set_m128i(r_hi, r_lo);
    }
}
unittest
{
    __m256i A = _mm256_setr_epi16(0, 1, 2, 3, -4, -5, 6, 7, 0, 1, 2, 3, -4, -5, 6, 7);
    short16 B = cast(short16)( _mm256_slli_epi16(A, 1) );
    short16 B2 = cast(short16)( _mm256_slli_epi16(A, 1 + 256) );
    short[16] expectedB = [ 0, 2, 4, 6, -8, -10, 12, 14, 0, 2, 4, 6, -8, -10, 12, 14 ];
    assert(B.array == expectedB);
    assert(B2.array == expectedB);

    short16 C = cast(short16)( _mm256_slli_epi16(A, 16) );
    short[16] expectedC = [ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ];
    assert(C.array == expectedC);
}
4522 
/// Shift packed 32-bit integers in `a` left by `imm8` while shifting in zeros.
__m256i _mm256_slli_epi32 (__m256i a, int imm8) pure @safe
{
    static if (GDC_or_LDC_with_AVX2)
    {
        // cast(ubyte): only the low 8 bits of the count reach the builtin
        // (the unittest below checks count 257 behaves like 1).
        return cast(__m256i) __builtin_ia32_pslldi256(cast(int8)a, cast(ubyte)imm8);
    }
    else
    {
        // Split into 128-bit halves and reuse the SSE2 immediate shift.
        __m128i a_lo = _mm256_extractf128_si256!0(a);
        __m128i a_hi = _mm256_extractf128_si256!1(a);
        __m128i r_lo = _mm_slli_epi32(a_lo, imm8);
        __m128i r_hi = _mm_slli_epi32(a_hi, imm8);
        return _mm256_set_m128i(r_hi, r_lo);
    }
}
unittest
{
    __m256i A = _mm256_setr_epi32(0, 2, 3, -4, 0, 2, 3, -9);
    int8 B = cast(int8) _mm256_slli_epi32(A, 1);
    int8 B2 = cast(int8) _mm256_slli_epi32(A, 1 + 256);
    int[8] expectedB = [ 0, 4, 6, -8, 0, 4, 6, -18 ];
    assert(B.array == expectedB);
    assert(B2.array == expectedB);

    int8 C = cast(int8) _mm256_slli_epi32(A, 0);
    int[8] expectedC = [ 0, 2, 3, -4, 0, 2, 3, -9 ];
    assert(C.array == expectedC);

    int8 D = cast(int8) _mm256_slli_epi32(A, 65);
    int[8] expectedD = [ 0, 0, 0, 0, 0, 0, 0, 0 ];
    assert(D.array == expectedD);
}
4556 
/// Shift packed 64-bit integers in `a` left by `imm8` while shifting in zeros.
__m256i _mm256_slli_epi64 (__m256i a, int imm8) pure @safe
{
    static if (GDC_or_LDC_with_AVX2)
    {
        // cast(ubyte): only the low 8 bits of the count reach the builtin
        // (the unittest below checks count 257 behaves like 1).
        return cast(__m256i) __builtin_ia32_psllqi256(cast(long4)a, cast(ubyte)imm8);
    }
    else
    {
        // Split into 128-bit halves and reuse the SSE2 immediate shift.
        __m128i a_lo = _mm256_extractf128_si256!0(a);
        __m128i a_hi = _mm256_extractf128_si256!1(a);
        __m128i r_lo = _mm_slli_epi64(a_lo, imm8);
        __m128i r_hi = _mm_slli_epi64(a_hi, imm8);
        return _mm256_set_m128i(r_hi, r_lo);
    }
}
unittest
{
    __m256i A = _mm256_setr_epi64(23, -4, 1, long.max);
    long4 B = cast(long4) _mm256_slli_epi64(A, 1);
    long4 B2 = cast(long4) _mm256_slli_epi64(A, 1 + 256);

    long[4] expectedB = [ 46, -8, 2, -2];
    assert(B.array == expectedB);
    assert(B2.array == expectedB);

    long4 C = cast(long4) _mm256_slli_epi64(A, 0);
    long[4] expectedC = [ 23, -4, 1, long.max ];
    assert(C.array == expectedC);

    long4 D = cast(long4) _mm256_slli_epi64(A, 65);
    long[4] expectedD = [ 0, 0, 0, 0 ];
    assert(D.array == expectedD);
}
4591 
/// Shift 128-bit lanes in `a` left by `bytes` bytes while shifting in zeroes.
/// (Intel legacy name for `_mm256_bslli_epi128`.)
alias _mm256_slli_si256 = _mm256_bslli_epi128;
4594 
/// Shift packed 32-bit integers in `a` left by the amount specified by the corresponding element in `count` while shifting in zeroes.
__m128i _mm_sllv_epi32(__m128i a, __m128i count) pure @trusted
{
    static if (GDC_with_AVX2 || LDC_with_AVX2)
    {
        // BUG FIX: __builtin_ia32_psllv4si operates on v4si operands, so
        // both arguments must be reinterpreted as int4 (they were wrongly
        // cast to byte16).
        return cast(__m128i)__builtin_ia32_psllv4si(cast(int4)a, cast(int4)count);
    }
    else
    {
        // Naive per-lane shift. Lanes where count[n] >= 32 yield an
        // unspecified value here (hardware masks the shift amount); they
        // are forced to zero below.
        __m128i R = _mm_setr_epi32(a.array[0] << count.array[0], 
                                   a.array[1] << count.array[1], 
                                   a.array[2] << count.array[2], 
                                   a.array[3] << count.array[3]);

        // Map large and negative shifts to 32 (unsigned min).
        __m128i mm32 = _mm_set1_epi32(32);
        __m128i shift = _mm_min_epu32(count, mm32);

        // Set to 0 where the shift is >= 32
        R = R & _mm_cmplt_epi32(shift, mm32);
        return R;
    }
}
unittest
{
    __m128i A     = _mm_setr_epi32(-1,  1, 4, -4);
    __m128i shift = _mm_setr_epi32( 2, -6, 1, 32);
    int4 R = cast(int4) _mm_sllv_epi32(A, shift);
    int[4] expected = [ -4, 0, 8, 0 ];
    assert(R.array == expected);
}
4625 
/// Shift packed 32-bit integers in `a` left by the amount specified by the corresponding element in `count` while shifting in zeroes.
__m256i _mm256_sllv_epi32 (__m256i a, __m256i count) pure @safe
{
    static if (GDC_or_LDC_with_AVX2)
        return cast(__m256i)__builtin_ia32_psllv8si(cast(int8)a, cast(int8)count);
    else
    {
        // Without AVX2, apply the 128-bit variable shift to each half.
        __m128i lo = _mm_sllv_epi32(_mm256_extractf128_si256!0(a),
                                    _mm256_extractf128_si256!0(count));
        __m128i hi = _mm_sllv_epi32(_mm256_extractf128_si256!1(a),
                                    _mm256_extractf128_si256!1(count));
        return _mm256_set_m128i(hi, lo);
    }
}
unittest
{
    __m256i A     = _mm256_setr_epi32(-1,  1, 4, -4, -1,  1,  4, -4);
    __m256i shift = _mm256_setr_epi32( 2, -6, 1, 32,  2, -6, 33, 32);
    int8 R = cast(int8) _mm256_sllv_epi32(A, shift);
    int[8] expected = [ -4, 0, 8, 0, -4, 0, 0, 0 ];
    assert(R.array == expected);
}
4651 
4652 
/// Shift packed 64-bit integers in `a` left by the amount specified by the corresponding element in `b` while shifting in zeros.
__m128i _mm_sllv_epi64(__m128i a, __m128i count) pure @trusted
{
    static if (GDC_with_AVX2 || LDC_with_AVX2)
    {
        return cast(__m128i)__builtin_ia32_psllv2di(cast(long2)a, cast(long2)count);
    }
    else
    {
        // PERF arm64
        // LDC: x86, it's not good, but at least it's branchless
        long2 la = cast(long2)a;
        long2 lb = cast(long2)count;
        long2 R;
        // BUG FIX: compare the full 64-bit count as unsigned, matching
        // _mm_srlv_epi64. The previous cast(uint) truncated counts such
        // as 2^32 down to 0, wrongly treating them as in-range shifts
        // instead of yielding 0.
        R.ptr[0] = cast(ulong)(lb.array[0]) < 64 ? (la.array[0] << lb.array[0]) : 0;
        R.ptr[1] = cast(ulong)(lb.array[1]) < 64 ? (la.array[1] << lb.array[1]) : 0;
        return cast(__m128i)R;
    }
}
unittest
{
    __m128i A  = _mm_setr_epi64( -4,  6);
    __m128i B1 = _mm_setr_epi64(  2,  0);
    __m128i B2 = _mm_setr_epi64(-12, 64);
    __m128i B3 = _mm_setr_epi64(4294967296, 65); // counts >= 64 must give 0
    long2 R1 = cast(long2) _mm_sllv_epi64(A, B1);
    long2 R2 = cast(long2) _mm_sllv_epi64(A, B2);
    long2 R3 = cast(long2) _mm_sllv_epi64(A, B3);
    long[2] correct1 = [-16, 6];
    long[2] correct2 = [  0, 0];
    long[2] correct3 = [  0, 0];
    assert(R1.array == correct1);
    assert(R2.array == correct2);
    assert(R3.array == correct3);
}
4684 
/// Shift packed 64-bit integers in `a` left by the amount specified by the corresponding element in `count` while shifting in zeroes.
__m256i _mm256_sllv_epi64 (__m256i a, __m256i count) pure @safe
{
    static if (GDC_with_AVX2 || LDC_with_AVX2)
        return cast(__m256i)__builtin_ia32_psllv4di(cast(long4)a, cast(long4)count);
    else
    {
        // split: apply the 128-bit variable shift to each half.
        __m128i a_lo = _mm256_extractf128_si256!0(a);
        __m128i a_hi = _mm256_extractf128_si256!1(a);
        __m128i c_lo = _mm256_extractf128_si256!0(count);
        __m128i c_hi = _mm256_extractf128_si256!1(count);
        __m128i r_lo = _mm_sllv_epi64(a_lo, c_lo);
        __m128i r_hi = _mm_sllv_epi64(a_hi, c_hi);
        return _mm256_set_m128i(r_hi, r_lo);
    }
}
unittest
{
    __m256i A  = _mm256_setr_epi64( -4,  6, -1, 6);
    __m256i B1 = _mm256_setr_epi64(  2,  0,  3, 1);
    __m256i B2 = _mm256_setr_epi64(-12, 64, 63, 64);
    long4 R1 = cast(long4) _mm256_sllv_epi64(A, B1);
    long4 R2 = cast(long4) _mm256_sllv_epi64(A, B2);
    long[4] correct1 = [-16, 6, -8, 12];
    long[4] correct2 = [  0, 0, long.min, 0];
    assert(R1.array == correct1);
    assert(R2.array == correct2);
}
4714 
4715 
4716 
/// Shift packed 16-bit integers in `a` right by `count` while shifting in sign bits.
/// Bit-shift is a single value in the low-order 64-bit of `count`. 
/// If bit-shift > 15, result is defined to be all sign bits.
/// Warning: prefer `_mm256_srai_epi16`, less of a trap.
__m256i _mm256_sra_epi16 (__m256i a, __m128i count) pure @trusted
{
    static if (GDC_or_LDC_with_AVX2)
    {
        return cast(__m256i) __builtin_ia32_psraw256(cast(short16)a, cast(short8)count);
    }
    else
    {
        // split: both halves use the same 64-bit shift count.
        __m128i a_lo = _mm256_extractf128_si256!0(a);
        __m128i a_hi = _mm256_extractf128_si256!1(a);
        __m128i r_lo = _mm_sra_epi16(a_lo, count);
        __m128i r_hi = _mm_sra_epi16(a_hi, count);
        return _mm256_set_m128i(r_hi, r_lo);
    }
}
unittest
{
    __m128i shift0 = _mm_setzero_si128();
    __m128i shiftX = _mm_set1_epi64x(0x8000_0000_0000_0000); // too large shift
    __m128i shift2 = _mm_setr_epi32(2, 0, 4, 5); // low 64-bit = 2
    __m256i A = _mm256_setr_epi16(4, -9, 11, -32768, 4, -8, 11, -32768,
                                  4, -9, 11, -32768, 4, -8, 11, -32768);
    short[16] correct0  = (cast(short16)A).array;
    short[16] correctX  = [0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1]; 
    short[16] correct2  = [1, -3,  2, -8192,  1, -2,  2, -8192, 1, -3,  2, -8192,  1, -2,  2, -8192];
    short16 B0 = cast(short16) _mm256_sra_epi16(A, shift0);
    short16 BX = cast(short16) _mm256_sra_epi16(A, shiftX);
    short16 B2 = cast(short16) _mm256_sra_epi16(A, shift2);
    assert(B0.array == correct0);
    assert(BX.array == correctX);
    assert(B2.array == correct2);
}
4754 
/// Shift packed 32-bit integers in `a` right by `count` while shifting in sign bits.
/// Bit-shift is a single value in the low-order 64-bit of `count`. 
/// If bit-shift > 31, result is defined to be all sign bits.
/// Warning: prefer `_mm256_srai_epi32`, less of a trap.
__m256i _mm256_sra_epi32 (__m256i a, __m128i count) pure @trusted
{
    static if (GDC_or_LDC_with_AVX2)
    {
        return cast(__m256i) __builtin_ia32_psrad256(cast(int8)a, cast(int4)count);
    }
    else
    {
        // split: both halves use the same 64-bit shift count.
        __m128i a_lo = _mm256_extractf128_si256!0(a);
        __m128i a_hi = _mm256_extractf128_si256!1(a);
        __m128i r_lo = _mm_sra_epi32(a_lo, count);
        __m128i r_hi = _mm_sra_epi32(a_hi, count);
        return _mm256_set_m128i(r_hi, r_lo);
    }
}
unittest
{
    __m128i shift0 = _mm_setzero_si128();
    __m128i shiftX = _mm_set1_epi64x(0x8000_0000_0000_0000); // too large shift
    __m128i shift2 = _mm_setr_epi32(2, 0, 4, 5); // low 64-bit = 2
    __m256i A = _mm256_setr_epi32(4, -9, 11, -2147483648, 8, -9, 11, -2147483648);
    int[8] correct0  = (cast(int8)A).array;
    int[8] correctX  = [0, -1, 0, -1, 0, -1, 0, -1]; 
    int[8] correct2  = [1, -3, 2, -536870912, 2, -3, 2, -536870912];
    int8 B0 = cast(int8) _mm256_sra_epi32(A, shift0);
    int8 BX = cast(int8) _mm256_sra_epi32(A, shiftX);
    int8 B2 = cast(int8) _mm256_sra_epi32(A, shift2);
    assert(B0.array == correct0);
    assert(BX.array == correctX);
    assert(B2.array == correct2);
}
4791 
/// Shift packed 16-bit integers in `a` right by `imm8` while shifting in sign bits.
__m256i _mm256_srai_epi16 (__m256i a, int imm8) pure @safe
{
    static if (GDC_or_LDC_with_AVX2)
    {
        // Native path; the count is truncated to 8 bits, and counts > 15
        // yield all sign bits (see the shift-by-18 case in the unittest).
        return cast(__m256i) __builtin_ia32_psrawi256(cast(short16)a, cast(ubyte)imm8);
    }
    else 
    {
        // split: shift each 128-bit lane with the SSE2 intrinsic.
        __m128i a_lo = _mm256_extractf128_si256!0(a);
        __m128i a_hi = _mm256_extractf128_si256!1(a);
        __m128i r_lo = _mm_srai_epi16(a_lo, imm8);
        __m128i r_hi = _mm_srai_epi16(a_hi, imm8);
        return _mm256_set_m128i(r_hi, r_lo);
    }
}
unittest
{
    __m256i A  = _mm256_setr_epi16(0, 1, 2, 3, -4, -5, 6, 7, short.min, short.max, 2, 3, -4, -5, 6, 7);
    short16 B  = cast(short16)( _mm256_srai_epi16(A, 1) );
    short16 B2 = cast(short16)( _mm256_srai_epi16(A, 1 + 256) );
    short[16] expectedB = [ 0, 0, 1, 1, -2, -3, 3, 3, -16384, 16383, 1, 1, -2, -3, 3, 3 ];
    assert(B.array == expectedB);
    assert(B2.array == expectedB);

    short16 C = cast(short16)( _mm256_srai_epi16(A, 18) );
    short[16] expectedC = [ 0, 0, 0, 0, -1, -1, 0, 0,
                           -1, 0, 0, 0, -1, -1, 0, 0 ];
    assert(C.array == expectedC);
}
4823 
/// Shift packed 32-bit integers in `a` right by `imm8` while shifting in sign bits.
__m256i _mm256_srai_epi32 (__m256i a, int imm8) pure @safe
{
    static if (GDC_or_LDC_with_AVX2)
    {
        // Native path; counts > 31 yield all sign bits (see the
        // shift-by-32 case in the unittest).
        return cast(__m256i) __builtin_ia32_psradi256(cast(int8)a, cast(ubyte)imm8);
    }
    else // split
    {
        __m128i a_lo = _mm256_extractf128_si256!0(a);
        __m128i a_hi = _mm256_extractf128_si256!1(a);
        __m128i r_lo = _mm_srai_epi32(a_lo, imm8);
        __m128i r_hi = _mm_srai_epi32(a_hi, imm8);
        return _mm256_set_m128i(r_hi, r_lo);
    }
}
unittest
{
    __m256i A = _mm256_setr_epi32(0, 2, 3, -4, 0, 2, 3, -4);
    int8 B = cast(int8) _mm256_srai_epi32(A, 1);
    int8 B2 = cast(int8) _mm256_srai_epi32(A, 1 + 256);
    int[8] expectedB = [ 0, 1, 1, -2, 0, 1, 1, -2];
    assert(B.array == expectedB);
    assert(B2.array == expectedB);

    int8 C = cast(int8) _mm256_srai_epi32(A, 32);
    int[8] expectedC = [ 0, 0, 0, -1, 0, 0, 0, -1];
    assert(C.array == expectedC);

    int8 D = cast(int8) _mm256_srai_epi32(A, 0);
    int[8] expectedD = [ 0, 2, 3, -4, 0, 2, 3, -4];
    assert(D.array == expectedD);
}
4857 
/// Shift packed 32-bit integers in `a` right by the amount specified by the
/// corresponding element in `count` while shifting in sign bits.
/// Negative or out-of-range (>= 32) counts yield all sign bits (see unittest).
__m128i _mm_srav_epi32(__m128i a, __m128i count) pure @trusted
{
    static if (GDC_with_AVX2 || LDC_with_AVX2)
        return cast(__m128i)__builtin_ia32_psrav4si(cast(int4)a, cast(int4)count);
    else
    {
        // Naive per-lane arithmetic shift; lanes with an out-of-range
        // count are patched below.
        __m128i R = _mm_setr_epi32(a.array[0] >> count.array[0], 
                                   a.array[1] >> count.array[1], 
                                   a.array[2] >> count.array[2], 
                                   a.array[3] >> count.array[3]);

        // Map large and negative shifts to all sign bits
        __m128i signbits = _mm_srai_epi32(a, 31);
        __m128i mm32 = _mm_set1_epi32(32);
        __m128i shift = _mm_min_epu32(count, mm32);

        // Set to 0 where the shift is >= 32
        __m128i lower = _mm_cmplt_epi32(shift, mm32);

        // Keep the computed lane where the count was valid, else the sign fill.
        R = (R & lower) | (signbits & ~lower);
        return R;
    }
}
unittest
{
    __m128i A     = _mm_setr_epi32(-1,  1, -4, -4);
    __m128i shift = _mm_setr_epi32( 2, -6, 31, 32);
    int4 R = cast(int4) _mm_srav_epi32(A, shift);
    int[4] expected = [ -1, 0, -1, -1 ];
    assert(R.array == expected);
}
4889 
/// Shift packed 32-bit integers in `a` right by the amount specified by the
/// corresponding element in `count` while shifting in sign bits.
__m256i _mm256_srav_epi32 (__m256i a, __m256i count) pure @safe
{
    static if (GDC_or_LDC_with_AVX2)
    {
        return cast(__m256i) __builtin_ia32_psrav8si(cast(int8)a, cast(int8)count);
    }
    else // split
    {
        __m128i a_lo = _mm256_extractf128_si256!0(a);
        __m128i a_hi = _mm256_extractf128_si256!1(a);
        __m128i c_lo = _mm256_extractf128_si256!0(count);
        __m128i c_hi = _mm256_extractf128_si256!1(count);
        __m128i r_lo = _mm_srav_epi32(a_lo, c_lo);
        __m128i r_hi = _mm_srav_epi32(a_hi, c_hi);
        return _mm256_set_m128i(r_hi, r_lo);
    }
}
unittest
{
    // BUG FIX: this unittest previously called the 128-bit _mm_srav_epi32
    // and never exercised _mm256_srav_epi32 at all.
    __m256i A     = _mm256_setr_epi32(-1,  1, -4, -4, -2,  1, -4,  4);
    __m256i shift = _mm256_setr_epi32( 2, -6, 31, 32,  1, -6, 31, 32);
    int8 R = cast(int8) _mm256_srav_epi32(A, shift);
    int[8] expected = [ -1, 0, -1, -1, -1, 0, -1, 0 ];
    assert(R.array == expected);
}
4915 
/// Shift packed 16-bit integers in `a` right by `count` while shifting in zeroes.
/// Bit-shift is a single value in the low-order 64-bit of `count`. 
/// If bit-shift > 15, result is defined to be all zeroes.
/// Note: prefer `_mm256_srli_epi16`, less of a trap.
__m256i _mm256_srl_epi16 (__m256i a, __m128i count) pure @trusted
{
    // PERF ARM64
    static if (GDC_or_LDC_with_AVX2)
    {
        return cast(__m256i) __builtin_ia32_psrlw256(cast(short16)a, cast(short8)count);
    }
    else
    {
        // split: both halves use the same 64-bit shift count.
        __m128i a_lo = _mm256_extractf128_si256!0(a);
        __m128i a_hi = _mm256_extractf128_si256!1(a);
        __m128i r_lo = _mm_srl_epi16(a_lo, count);
        __m128i r_hi = _mm_srl_epi16(a_hi, count);
        return _mm256_set_m128i(r_hi, r_lo);
    }
}
unittest
{
    __m128i shift0 = _mm_setzero_si128();
    __m128i shiftX = _mm_set1_epi64x(0x8000_0000_0000_0000); // too large shift
    __m128i shift2 = _mm_setr_epi32(2, 0, 4, 5); // low 64-bit = 2
    __m256i A = _mm256_setr_epi16(4, -8, 11, -32768, 4, -8, 11, -32768, 4, -8, 11, -32768, 4, -8, 11, -32768);
    short[16] correct0  = (cast(short16)A).array;
    short[16] correctX  = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]; 
    short[16] correct2  = [1, 16382, 2, 8192, 1, 16382, 2, 8192, 1, 16382, 2, 8192, 1, 16382, 2, 8192];
    short16 B0 = cast(short16) _mm256_srl_epi16(A, shift0);
    short16 BX = cast(short16) _mm256_srl_epi16(A, shiftX);
    short16 B2 = cast(short16) _mm256_srl_epi16(A, shift2);
    assert(B0.array == correct0);
    assert(BX.array == correctX);
    assert(B2.array == correct2);
}
4952 
/// Shift packed 32-bit integers in `a` right by `count` while shifting in zeroes.
/// Bit-shift is a single value in the low-order 64-bit of `count`. 
/// If bit-shift > 31, result is defined to be all zeroes.
/// Note: prefer `_mm256_srli_epi32`, less of a trap.
__m256i _mm256_srl_epi32 (__m256i a, __m128i count) pure @trusted
{
    // PERF ARM64
    static if (GDC_or_LDC_with_AVX2)
    {
        return cast(__m256i) __builtin_ia32_psrld256(cast(int8)a, count);
    }
    else
    {
        // split: both halves use the same 64-bit shift count.
        __m128i a_lo = _mm256_extractf128_si256!0(a);
        __m128i a_hi = _mm256_extractf128_si256!1(a);
        __m128i r_lo = _mm_srl_epi32(a_lo, count);
        __m128i r_hi = _mm_srl_epi32(a_hi, count);
        return _mm256_set_m128i(r_hi, r_lo);
    }
}
unittest
{
    __m128i shift0 = _mm_setzero_si128();
    __m128i shiftX = _mm_set1_epi64x(0x8000_0000_0000_0000); // too large shift
    __m128i shift2 = _mm_setr_epi32(2, 0, 4, 5); // low 64-bit = 2
    __m256i A = _mm256_setr_epi32(4, -8, 11, -0x80000000, 0, 1, -11, 0x7fffffff);
    int[8] correct0  = (cast(int8)A).array;
    int[8] correctX  = [0, 0, 0, 0, 0, 0, 0, 0]; 
    int[8] correct2  = [1, 1073741822, 2, 536870912, 0, 0, 1073741821, 0x1fffffff];
    int8 B0 = cast(int8) _mm256_srl_epi32(A, shift0);
    int8 BX = cast(int8) _mm256_srl_epi32(A, shiftX);
    int8 B2 = cast(int8) _mm256_srl_epi32(A, shift2);
    assert(B0.array == correct0);
    assert(BX.array == correctX);
    assert(B2.array == correct2);
}
4989 
/// Shift packed 64-bit integers in `a` right by `count` while shifting in zeroes.
/// Bit-shift is a single value in the low-order 64-bit of `count`. 
/// If bit-shift > 63, result is defined to be all zeroes.
/// Note: prefer `_mm256_srli_epi64`, less of a trap.
__m256i _mm256_srl_epi64 (__m256i a, __m128i count) pure @trusted
{
    // PERF ARM64
    // NOTE(review): the disabled ARM64 path below shifts LEFT (`<<=`);
    // it must use an unsigned right shift before ever being enabled.
    /*
    static if (LDC_with_ARM64)
    { 
        long bs = (cast(long2)count).array[0];
        if (bs > 63)
            return long4(0);
        else 
        {
            a <<= long4(bs);
            return a;
        }
    }
    else*/  static if (GDC_or_LDC_with_AVX2)
    {
        return cast(__m256i) __builtin_ia32_psrlq256(cast(long4)a, cast(long2)count);
    }
    else
    {
        // split: both halves use the same 64-bit shift count.
        __m128i a_lo = _mm256_extractf128_si256!0(a);
        __m128i a_hi = _mm256_extractf128_si256!1(a);
        __m128i r_lo = _mm_srl_epi64(a_lo, count);
        __m128i r_hi = _mm_srl_epi64(a_hi, count);
        return _mm256_set_m128i(r_hi, r_lo);
    }
}
unittest
{
    __m128i shift0 = _mm_setzero_si128();
    __m128i shiftX = _mm_set1_epi64x(0x8000_0000_0000_0000); // too large shift
    __m128i shift2 = _mm_setr_epi32(2, 0, 4, 5); // low 64-bit = 2
    __m256i A = _mm256_setr_epi64(4, -9, 8, -9);
    long[4] correct0  = [ 4,  -9, 8, -9];
    long[4] correctX  = [ 0,   0,  0, 0];
    long[4] correct2  = [ 1,  4611686018427387901,  2, 4611686018427387901];
    long4 B0 = cast(long4) _mm256_srl_epi64(A, shift0);
    long4 BX = cast(long4) _mm256_srl_epi64(A, shiftX);
    long4 B2 = cast(long4) _mm256_srl_epi64(A, shift2);
    assert(B0.array == correct0);
    assert(BX.array == correctX);
    assert(B2.array == correct2);
}
5038 
/// Shift packed 16-bit integers in `a` right by `imm8` while shifting in zeros.
__m256i _mm256_srli_epi16 (__m256i a, int imm8) pure @trusted
{
    static if (GDC_or_LDC_with_AVX2)
    {
        // GDC and LDC expose the same builtin; one branch covers both
        // (the two previous branches were byte-identical).
        return cast(__m256i) __builtin_ia32_psrlwi256(cast(short16)a, cast(ubyte)imm8);
    }
    else
    {
        // Without AVX2, shift each 128-bit half with the SSE2 intrinsic.
        __m128i lo = _mm_srli_epi16(_mm256_extractf128_si256!0(a), imm8);
        __m128i hi = _mm_srli_epi16(_mm256_extractf128_si256!1(a), imm8);
        return _mm256_set_m128i(hi, lo);
    }
}
unittest
{
    __m256i A = _mm256_setr_epi16(0, 1, 2, 3, -4, -5, 6, 7, 0, 1, 2, 3, -4, -5, 6, 7);
    short16 B = cast(short16) _mm256_srli_epi16(A, 1);
    short16 B2 = cast(short16) _mm256_srli_epi16(A, 1 + 256);
    short[16] expectedB = [ 0, 0, 1, 1, 0x7FFE, 0x7FFD, 3, 3, 0, 0, 1, 1, 0x7FFE, 0x7FFD, 3, 3 ];
    assert(B.array == expectedB);
    assert(B2.array == expectedB);

    short16 C = cast(short16) _mm256_srli_epi16(A, 16);
    short[16] expectedC = [ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ];
    assert(C.array == expectedC);

    short16 D = cast(short16) _mm256_srli_epi16(A, 0);
    short[16] expectedD = [ 0, 1, 2, 3, -4, -5, 6, 7, 0, 1, 2, 3, -4, -5, 6, 7 ];
    assert(D.array == expectedD);
}
5076 
/// Shift packed 32-bit integers in `a` right by `imm8` while shifting in zeros.
__m256i _mm256_srli_epi32 (__m256i a, int imm8) pure @trusted
{
    static if (GDC_or_LDC_with_AVX2)
    {
        // GDC and LDC expose the same builtin; one branch covers both
        // (the two previous branches were byte-identical).
        return cast(__m256i) __builtin_ia32_psrldi256(cast(int8)a, cast(ubyte)imm8);
    }
    else 
    {
        // Without AVX2, shift each 128-bit half with the SSE2 intrinsic.
        __m128i lo = _mm_srli_epi32(_mm256_extractf128_si256!0(a), imm8);
        __m128i hi = _mm_srli_epi32(_mm256_extractf128_si256!1(a), imm8);
        return _mm256_set_m128i(hi, lo);
    }
}
unittest
{
    __m256i A = _mm256_setr_epi32(0, 2, 3, -4, 0, 2, 3, -4);
    int8 B = cast(int8) _mm256_srli_epi32(A, 1);
    int8 B2 = cast(int8) _mm256_srli_epi32(A, 1 + 256);
    int[8] expectedB = [ 0, 1, 1, 0x7FFFFFFE, 0, 1, 1, 0x7FFFFFFE];
    assert(B.array == expectedB);
    assert(B2.array == expectedB);

    int8 C = cast(int8) _mm256_srli_epi32(A, 255);
    int[8] expectedC = [ 0, 0, 0, 0, 0, 0, 0, 0 ];
    assert(C.array == expectedC);
}
5111 
/// Shift packed 64-bit integers in `a` right by `imm8` while shifting in zeros.
__m256i _mm256_srli_epi64 (__m256i a, int imm8) pure @safe
{
    static if (GDC_or_LDC_with_AVX2)
    {
        // BUG FIX: psrlqi256 operates on 64-bit lanes, so the operand must
        // be reinterpreted as long4 — it was wrongly cast to int8
        // (compare psllqi256 in _mm256_slli_epi64).
        return cast(__m256i) __builtin_ia32_psrlqi256(cast(long4)a, cast(ubyte)imm8);
    }
    else 
    {
        // split: shift each 128-bit half with the SSE2 intrinsic.
        __m128i a_lo = _mm256_extractf128_si256!0(a);
        __m128i a_hi = _mm256_extractf128_si256!1(a);
        __m128i r_lo = _mm_srli_epi64(a_lo, imm8);
        __m128i r_hi = _mm_srli_epi64(a_hi, imm8);
        return _mm256_set_m128i(r_hi, r_lo);
    }
}
unittest
{
    __m256i A = _mm256_setr_epi64(8, -4, 16, -8);
    long4 B = cast(long4) _mm256_srli_epi64(A, 1);
    long4 B2 = cast(long4) _mm256_srli_epi64(A, 1 + 512);
    long[4] expectedB = [ 4, 0x7FFFFFFFFFFFFFFE, 8, 0x7FFFFFFFFFFFFFFC];
    assert(B.array == expectedB);
    assert(B2.array == expectedB);

    long4 C = cast(long4) _mm256_srli_epi64(A, 64);
    long[4] expectedC = [ 0, 0, 0, 0 ];
    assert(C.array == expectedC);
}
5142 
/// Shift 128-bit lanes in `a` right by `bytes` bytes while shifting in zeroes.
/// (Intel legacy name for `_mm256_bsrli_epi128`.)
alias _mm256_srli_si256 = _mm256_bsrli_epi128;
5145 
/// Shift packed 32-bit integers in `a` right by the amount specified by the corresponding element in `count` while shifting in zeroes.
__m128i _mm_srlv_epi32(__m128i a, __m128i count) pure @trusted
{
    static if (GDC_with_AVX2 || LDC_with_AVX2)
    {
        // BUG FIX: __builtin_ia32_psrlv4si operates on v4si operands, so
        // both arguments must be reinterpreted as int4 (they were wrongly
        // cast to byte16).
        return cast(__m128i)__builtin_ia32_psrlv4si(cast(int4)a, cast(int4)count);
    }
    else
    {
        // Naive per-lane logical shift. Lanes where count[n] >= 32 yield
        // an unspecified value here (hardware masks the shift amount);
        // they are forced to zero below.
        __m128i R = _mm_setr_epi32(a.array[0] >>> count.array[0], 
                                   a.array[1] >>> count.array[1], 
                                   a.array[2] >>> count.array[2], 
                                   a.array[3] >>> count.array[3]);

        // Map large and negative shifts to 32 (unsigned min).
        __m128i mm32 = _mm_set1_epi32(32);
        __m128i shift = _mm_min_epu32(count, mm32);

        // Set to 0 where the shift is >= 32
        R = R & _mm_cmplt_epi32(shift, mm32);
        return R;
    }
}
unittest
{
    __m128i A     = _mm_setr_epi32(-1,  1, 4, -4);
    __m128i shift = _mm_setr_epi32( 2, -6, 1, 32);
    int4 R = cast(int4) _mm_srlv_epi32(A, shift);
    int[4] expected = [ 1073741823, 0, 2, 0 ];
    assert(R.array == expected);
}
5175 
/// Shift packed 32-bit integers in `a` right by the amount specified by the corresponding element in `count` while shifting in zeroes.
__m256i _mm256_srlv_epi32 (__m256i a, __m256i count) pure @trusted
{
    static if (GDC_with_AVX2 || LDC_with_AVX2)
        return cast(__m256i)__builtin_ia32_psrlv8si(cast(int8)a, cast(int8)count);
    else
    {
        // split: apply the 128-bit variable shift to each half.
        __m128i a_lo = _mm256_extractf128_si256!0(a);
        __m128i a_hi = _mm256_extractf128_si256!1(a);
        __m128i c_lo = _mm256_extractf128_si256!0(count);
        __m128i c_hi = _mm256_extractf128_si256!1(count);
        __m128i r_lo = _mm_srlv_epi32(a_lo, c_lo);
        __m128i r_hi = _mm_srlv_epi32(a_hi, c_hi);
        return _mm256_set_m128i(r_hi, r_lo);
    }
}
unittest
{
    __m256i A     = _mm256_setr_epi32(-1,  1, 4, -4, -1,  1, 4, -4);
    __m256i shift = _mm256_setr_epi32( 2, -6, 1, 32, 33,  2, -6, 1);
    int8 R = cast(int8) _mm256_srlv_epi32(A, shift);
    int[8] expected = [ 1073741823, 0, 2, 0, 0, 0, 0, 2147483646 ];
    assert(R.array == expected);
}
5201 
/// Shift packed 64-bit integers in `a` right by the amount specified by the corresponding element in `count` while shifting in zeroes.
__m128i _mm_srlv_epi64(__m128i a, __m128i count) pure @trusted
{
    static if (GDC_or_LDC_with_AVX2)
    {
        return cast(__m128i)__builtin_ia32_psrlv2di(cast(long2)a, cast(long2)count);
    }
    else
    {
        // Note: arm64 rather bad for LDC < 1.34
        //       after that, perfect.
        // LDC: x86, it's not good, but at least it's branchless
        long2 la = cast(long2)a;
        long2 lb = cast(long2)count;
        long2 R;
        // Counts outside [0, 63], checked as unsigned 64-bit, give 0.
        R.ptr[0] = cast(ulong)(lb.array[0]) < 64 ? (la.array[0] >>> lb.array[0]) : 0;
        R.ptr[1] = cast(ulong)(lb.array[1]) < 64 ? (la.array[1] >>> lb.array[1]) : 0;
        return cast(__m128i)R;
    }
}
unittest
{
    // BUG FIX: this unittest previously called the 256-bit
    // _mm256_srlv_epi64 and never exercised the intrinsic defined above.
    __m128i A  = _mm_setr_epi64( -4,  6);
    __m128i B1 = _mm_setr_epi64(  2,  0);
    __m128i B2 = _mm_setr_epi64(-12, 64);
    long2 R1 = cast(long2) _mm_srlv_epi64(A, B1);
    long2 R2 = cast(long2) _mm_srlv_epi64(A, B2);
    long[2] correct1 = [ 4611686018427387903, 6];
    long[2] correct2 = [                   0, 0];
    assert(R1.array == correct1);
    assert(R2.array == correct2);
}
5234 
/// Shift packed 64-bit integers in `a` right by the amount specified by the corresponding element in `count` while shifting in zeroes.
__m256i _mm256_srlv_epi64 (__m256i a, __m256i count) pure @trusted
{
    // PERF: rather lame in non-AVX2 x86
    static if (GDC_with_AVX2 || LDC_with_AVX2)
        return cast(__m256i)__builtin_ia32_psrlv4di(cast(long4)a, cast(long4)count);
    else
    {
        // split: apply the 128-bit variable shift to each half.
        __m128i a_lo = _mm256_extractf128_si256!0(a);
        __m128i a_hi = _mm256_extractf128_si256!1(a);
        __m128i c_lo = _mm256_extractf128_si256!0(count);
        __m128i c_hi = _mm256_extractf128_si256!1(count);
        __m128i r_lo = _mm_srlv_epi64(a_lo, c_lo);
        __m128i r_hi = _mm_srlv_epi64(a_hi, c_hi);
        return _mm256_set_m128i(r_hi, r_lo);
    }
}
unittest
{
    __m256i A  = _mm256_setr_epi64( -4,  6,  -4,  6);
    __m256i B1 = _mm256_setr_epi64(  2,  0,   2,  0);
    __m256i B2 = _mm256_setr_epi64(-12, 64, -12, 64);
    long4 R1 = cast(long4) _mm256_srlv_epi64(A, B1);
    long4 R2 = cast(long4) _mm256_srlv_epi64(A, B2);
    long[4] correct1 = [ 4611686018427387903, 6,  4611686018427387903, 6];
    long[4] correct2 = [                   0, 0,                    0, 0];
    assert(R1.array == correct1);
    assert(R2.array == correct2);
}
5265 
/// Load 256-bits of integer data from memory using a non-temporal memory hint.
/// `mem_addr` must be aligned on a 32-byte boundary or a general-protection exception may be generated.
__m256i _mm256_stream_load_si256 (const(__m256i)* mem_addr) pure @trusted
{
    // PERF DMD D_SIMD
    static if (GDC_with_AVX2)
    {
        return cast(__m256i) __builtin_ia32_movntdqa256(cast(__m256i*)mem_addr); // const_cast
    }
    else static if (LDC_with_InlineIREx && LDC_with_optimizations)
    {
        // Emit a load tagged with LLVM `!nontemporal` metadata so the
        // backend can select a streaming load.
        enum prefix = `!0 = !{ i32 1 }`;
        enum ir = `
            %r = load <4 x i64>, <4 x i64>* %0, !nontemporal !0
            ret <4 x i64> %r`;
        return cast(__m256i) LDCInlineIREx!(prefix, ir, "", long4, const(long4)*)(mem_addr);
    }
    else
    {
        return *mem_addr; // regular move instead
    }
}
unittest
{
    align(32) static immutable int[8] correct = [1, 2, 3, 4, 5, 6, 7, 8];
    __m256i A = _mm256_stream_load_si256(cast(__m256i*)correct.ptr);
    _mm_mfence();
    assert((cast(int8)A).array == correct);
}
5295 
/// Subtract packed 16-bit integers in `b` from packed 16-bit integers in `a`.
__m256i _mm256_sub_epi16 (__m256i a, __m256i b) pure @safe
{
    pragma(inline, true);
    // Wraparound subtraction maps directly to the vector operator.
    short16 sa = cast(short16)a;
    short16 sb = cast(short16)b;
    return cast(__m256i)(sa - sb);
}
unittest
{
    __m256i A = _mm256_setr_epi16( -7, -1, 0, 9, -100, 100, 234, 432, -32768, 32767, 0, -1, -20000, 0,  6, -2);
    short16 R = cast(short16) _mm256_sub_epi16(A, A);
    short[16] correct         = [0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0];
    assert(R.array == correct);
}
5309 
/// Subtract packed 32-bit integers in `b` from packed 32-bit integers in `a`.
__m256i _mm256_sub_epi32(__m256i a, __m256i b) pure @safe
{
    pragma(inline, true);
    // Wraparound subtraction maps directly to the vector operator.
    int8 ia = cast(int8)a;
    int8 ib = cast(int8)b;
    return cast(__m256i)(ia - ib);
}
unittest
{
    __m256i A = _mm256_setr_epi32( -7, -1, 0, 9, -100, 100, 234, 432);
    int8 R = cast(int8) _mm256_sub_epi32(A, A);
    int[8] correct = [ 0, 0, 0, 0, 0, 0, 0, 0];
    assert(R.array == correct);
}
5323 
/// Subtract packed 64-bit integers in `b` from packed 64-bit integers in `a`.
__m256i _mm256_sub_epi64 (__m256i a, __m256i b) pure @safe
{
    pragma(inline, true);
    // __m256i already has 64-bit lanes, so no reinterpreting cast is needed.
    return a - b;
}
unittest
{
    __m256i A = _mm256_setr_epi64(-1, 0x8000_0000_0000_0000, 42, -12);
    long4 R = cast(__m256i) _mm256_sub_epi64(A, A);
    long[4] correct = [ 0, 0, 0, 0 ];
    assert(R.array == correct);
}
5337 
/// Subtract packed 8-bit integers in `b` from packed 8-bit integers in `a`.
__m256i _mm256_sub_epi8 (__m256i a, __m256i b) pure @safe
{
    pragma(inline, true);
    // Wraparound subtraction maps directly to the vector operator.
    byte32 ba = cast(byte32)a;
    byte32 bb = cast(byte32)b;
    return cast(__m256i)(ba - bb);
}
unittest
{
    __m256i A = _mm256_setr_epi8(4, 8, 13, -7, -1, 0, 9, 77, 4, 8, 13, -7, -1, 0, 9, 78,
                                 4, 9, 13, -7, -1, 0, 9, 77, 4, 8, 13, -7, -2, 0, 10, 78);
    byte32 R = cast(byte32) _mm256_sub_epi8(A, A);
    byte[32] correct; // zero initialized
    assert(R.array == correct);
}
5352 
/// Subtract packed signed 16-bit integers in `b` from packed signed 16-bit integers in `a` using 
/// saturation (each lane clamped to [-32768, 32767]).
__m256i _mm256_subs_epi16 (__m256i a, __m256i b) pure @trusted
{
    // PERF DMD
    static if (GDC_with_AVX2)
    {
        // GDC with AVX2: the psubsw256 builtin does the saturating subtract in one go.
        return cast(__m256i) __builtin_ia32_psubsw256(cast(short16)a, cast(short16)b);
    }
    else static if(LDC_with_saturated_intrinsics)
    {
        // LDC: delegate to the saturating-subtract intrinsic helper for short16.
        return cast(__m256i) inteli_llvm_subs!short16(cast(short16)a, cast(short16)b);
    }
    else
    {
        // Scalar fallback: subtract in 32-bit arithmetic (no wraparound possible),
        // then clamp each lane back to the signed 16-bit range.
        short16 r;
        short16 sa = cast(short16)a;
        short16 sb = cast(short16)b;
        foreach(i; 0..16)
            r.ptr[i] = saturateSignedIntToSignedShort(sa.array[i] - sb.array[i]);
        return cast(__m256i)r;
    }
}
unittest
{
    // Exercises saturation at both ends: lane 6 (32766 - -2) clamps to 32767,
    // lane 11 (-32750 - 100) clamps to -32768.
    short16 res = cast(short16) _mm256_subs_epi16(_mm256_setr_epi16( 7,  6,  5, -32768, 3, 3, 32766,   0,  7,  6,  5, -32750, 3, 3, 32767,   0),
                                                  _mm256_setr_epi16( 7,  6,  5, -30000, 3, 1,    -2, -10,  7,  6,  5,    100, 3, 1,     1, -10));
    static immutable short[16] correctResult                    =  [ 0,  0,  0,  -2768, 0, 2, 32767,  10,  0,  0,  0, -32768, 0, 2, 32766,  10];
    assert(res.array == correctResult);
}
5383 
5384 
/// Subtract packed signed 8-bit integers in `b` from packed signed 8-bit integers in `a` using
/// saturation (each lane clamped to [-128, 127]).
__m256i _mm256_subs_epi8 (__m256i a, __m256i b) pure @trusted
{
    // PERF DMD
    static if (GDC_with_AVX2)
    {
        // GDC with AVX2: psubsb256 builtin performs signed saturating subtract.
        // NOTE(review): the builtin is declared with unsigned vectors here, hence
        // the ubyte32 casts; the operation itself saturates as signed.
        return cast(__m256i) __builtin_ia32_psubsb256(cast(ubyte32)a, cast(ubyte32)b);
    }
    else static if(LDC_with_saturated_intrinsics)
    {
        // LDC: delegate to the saturating-subtract intrinsic helper for byte32.
        return cast(__m256i) inteli_llvm_subs!byte32(cast(byte32)a, cast(byte32)b);
    }
    else
    {
        // Scalar fallback: widen to int for the subtraction, then clamp each
        // lane back to the signed 8-bit range.
        byte32 r;
        byte32 sa = cast(byte32)a;
        byte32 sb = cast(byte32)b;
        foreach(i; 0..32)
            r.ptr[i] = saturateSignedWordToSignedByte(sa.array[i] - sb.array[i]);
        return cast(__m256i)r;
    }
}
unittest
{
    // Lane 11 (-128 - 4) and lane 27 (-127 - 4) clamp to -128; lane 21 (126 - -10) clamps to 127.
    byte32 R = cast(byte32) _mm256_subs_epi8(_mm256_setr_epi8(15, 14, 13, 12, 11, 127, 9, 8, 7, 6, 5, -128, 3, 2, 1, 0, 15, 14, 13, 12, 11, 126, 9, 8, 7, 6, 5, -127, 3, 2, 1, 0),
                                             _mm256_setr_epi8(15, 14, 13, 12, 11,  10, 9, 8, 7, 6, 5,    4, 3, 2, 1, 0, 15, 14, 13, 12, 11, -10, 9, 8, 7, 6, 5,    4, 3, 2, 1, 0));
    static immutable byte[32] correct                      = [ 0,  0,  0,  0,  0, 117, 0, 0, 0, 0, 0, -128, 0, 0, 0, 0,  0,  0,  0,  0,  0, 127, 0, 0, 0, 0, 0, -128, 0, 0, 0, 0]; 
    assert(R.array == correct);
}
5415 
/// Subtract packed unsigned 16-bit integers in `b` from packed unsigned 16-bit integers in `a` 
/// using saturation (each lane clamped to [0, 65535]).
__m256i _mm256_subs_epu16 (__m256i a, __m256i b) pure @trusted
{
    // PERF DMD
    static if (GDC_with_AVX2)
    {
        // GDC with AVX2: psubusw256 builtin performs the unsigned saturating subtract.
        return cast(__m256i) __builtin_ia32_psubusw256(cast(short16)a, cast(short16)b);
    }
    else static if(LDC_with_saturated_intrinsics)
    {
        // LDC: delegate to the unsigned saturating-subtract intrinsic helper.
        return cast(__m256i) inteli_llvm_subus!short16(cast(short16)a, cast(short16)b);
    }
    else
    {
        // Scalar fallback: reinterpret lanes as ushort, subtract in int arithmetic
        // (so b > a yields a negative value), then clamp to [0, 65535].
        short16 r;
        short16 sa = cast(short16)a;
        short16 sb = cast(short16)b;
        foreach(i; 0..16)
            r.ptr[i] = saturateSignedIntToUnsignedShort(cast(ushort)(sa.array[i]) - cast(ushort)(sb.array[i]));
        return cast(__m256i)r;
    }
}
unittest
{
    // Lanes where b > a (e.g. 3 - 4, 65534 - 65535) clamp to 0.
    short16 R = cast(short16) _mm256_subs_epu16(_mm256_setr_epi16(3, 2, cast(short)65535, 0, 3, 2, cast(short)65535, 0, 3, 2, cast(short)65535, 0, 3,  2, cast(short)65534, 0),
                                                _mm256_setr_epi16(3, 4,                1, 0, 3, 2,                1, 0, 3, 2,                1, 0, 3, 20, cast(short)65535, 0));
    static immutable short[16] correct =                         [0, 0, cast(short)65534, 0, 0, 0, cast(short)65534, 0, 0, 0, cast(short)65534, 0, 0,  0,                0, 0];
    assert(R.array == correct);
}
5446 
/// Subtract packed unsigned 8-bit integers in `b` from packed unsigned 8-bit integers in `a` using
/// saturation (each lane clamped to [0, 255]).
__m256i _mm256_subs_epu8 (__m256i a, __m256i b) pure @trusted
{
    // PERF DMD
    // PERF GDC without AVX2
    static if (GDC_with_AVX2)
    {
        // GDC with AVX2: psubusb256 builtin performs the unsigned saturating subtract.
        return cast(__m256i) __builtin_ia32_psubusb256(cast(ubyte32)a, cast(ubyte32)b);
    }
    else static if(LDC_with_saturated_intrinsics)
    {
        // LDC: delegate to the unsigned saturating-subtract intrinsic helper.
        return cast(__m256i) inteli_llvm_subus!byte32(cast(byte32)a, cast(byte32)b);
    }
    else
    {
        // Scalar fallback: reinterpret lanes as ubyte, subtract in int arithmetic
        // (so b > a yields a negative value), then clamp to [0, 255].
        byte32 r;
        byte32 sa = cast(byte32)a;
        byte32 sb = cast(byte32)b;
        foreach(i; 0..32)
            r.ptr[i] = saturateSignedWordToUnsignedByte(cast(ubyte)(sa.array[i]) - cast(ubyte)(sb.array[i]));
        return cast(__m256i)r;
    }
}
unittest
{
    // Lane 21 (136 - 137, as unsigned) clamps to 0; lane 25 is 136 - 40 = 96.
    __m256i A          = _mm256_setr_epi8(0, 0, 5, 4, 5, 0, 0, 0, 0, 0, 0, 0, cast(byte)255, 0, 0, 0, 0, 0, 0, 0, 0, cast(byte)136, 0, 0, 0, cast(byte)136, 0, 0, 0, 0, 0, 0);
    __m256i B          = _mm256_setr_epi8(0, 0, 4, 5, 5, 0, 0, 0, 0, 0, 0, 0,             1, 0, 0, 0, 0, 0, 0, 0, 0, cast(byte)137, 0, 0, 0,            40, 0, 0, 0, 0, 0, 0);
    byte32 R = cast(byte32) _mm256_subs_epu8(A, B);
    static immutable byte[32] correct =  [0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, cast(byte)254, 0, 0, 0, 0, 0, 0, 0, 0,   cast(byte)0, 0, 0, 0, cast(byte) 96, 0, 0, 0, 0, 0, 0];
    assert(R.array == correct);
}
5479 
/// Unpack and interleave 16-bit integers from the high half of each 128-bit lane in `a` and `b`.
__m256i _mm256_unpackhi_epi16 (__m256i a, __m256i b) pure @safe
{
    static if (GDC_with_AVX2)
    {
        // GDC with AVX2: single punpckhwd builtin on the full 256-bit vectors.
        return cast(long4) __builtin_ia32_punpckhwd256(cast(short16)a, cast(short16)b);
    }
    else static if (LDC_with_optimizations)
    {
        // LDC: express the lane-wise interleave as one shufflevector; indices
        // 4..7/20..23 take the high half of the low 128-bit lane, 12..15/28..31
        // the high half of the upper lane.
        enum ir = `%r = shufflevector <16 x i16> %0, <16 x i16> %1, <16 x i32> <i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23, i32 12,i32 28, i32 13,i32 29, i32 14,i32 30, i32 15,i32 31>
            ret <16 x i16> %r`;
        return cast(__m256i)LDCInlineIR!(ir, short16, short16, short16)(cast(short16)a, cast(short16)b);
    }
    else
    {
        // Better for arm64, GDC without AVX2: split into two 128-bit unpacks.
        __m128i a_lo = _mm256_extractf128_si256!0(a);
        __m128i a_hi = _mm256_extractf128_si256!1(a);
        __m128i b_lo = _mm256_extractf128_si256!0(b);
        __m128i b_hi = _mm256_extractf128_si256!1(b);
        __m128i r_lo = _mm_unpackhi_epi16(a_lo, b_lo);
        __m128i r_hi = _mm_unpackhi_epi16(a_hi, b_hi);
        return _mm256_set_m128i(r_hi, r_lo);
    }
}
unittest
{
    __m256i A = _mm256_setr_epi16( 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15);
    __m256i B = _mm256_setr_epi16(16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31);
    short16 C = cast(short16) _mm256_unpackhi_epi16(A, B);
    short[16] correct = [4,  20, 5,  21, 6, 22, 7, 23, 
                         12, 28, 13, 29, 14, 30, 15, 31];
    assert(C.array == correct);
}
5514 
/// Unpack and interleave 32-bit integers from the high half of each 128-bit lane in `a` and `b`.
__m256i _mm256_unpackhi_epi32 (__m256i a, __m256i b) pure @trusted
{
    // Split into two 128-bit halves only for GDC without AVX2, where the
    // scalar fallback below would otherwise be generated poorly.
    static if (GDC_with_AVX2)
        enum bool split = false;
    else version(GNU)
        enum bool split = true;
    else
        enum bool split = false;

    static if (GDC_with_AVX2)
    {
        // GDC with AVX2: single punpckhdq builtin.
        return cast(long4) __builtin_ia32_punpckhdq256(cast(int8)a, cast(int8)b);
    }
    else static if (LDC_with_optimizations)
    {
        // LDC AVX2: Surprisingly, this starts using vunpckhps in LDC 1.31 -O2
        enum ir = `%r = shufflevector <8 x i32> %0, <8 x i32> %1, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15>
            ret <8 x i32> %r`;
        return cast(__m256i)LDCInlineIR!(ir, int8, int8, int8)(cast(int8)a, cast(int8)b);
    }
    else static if (split)
    {
        // Two 128-bit unpacks, then recombine.
        __m128i a_lo = _mm256_extractf128_si256!0(a);
        __m128i a_hi = _mm256_extractf128_si256!1(a);
        __m128i b_lo = _mm256_extractf128_si256!0(b);
        __m128i b_hi = _mm256_extractf128_si256!1(b);
        __m128i r_lo = _mm_unpackhi_epi32(a_lo, b_lo);
        __m128i r_hi = _mm_unpackhi_epi32(a_hi, b_hi);
        return _mm256_set_m128i(r_hi, r_lo);
    }
    else
    {
        // Scalar fallback: interleave elements 2..3 (low lane) and 6..7 (high lane).
        int8 R;
        int8 ai = cast(int8)a;
        int8 bi = cast(int8)b;
        R.ptr[0] = ai.array[2];
        R.ptr[1] = bi.array[2];
        R.ptr[2] = ai.array[3];
        R.ptr[3] = bi.array[3];
        R.ptr[4] = ai.array[6];
        R.ptr[5] = bi.array[6];
        R.ptr[6] = ai.array[7];
        R.ptr[7] = bi.array[7];
        return cast(__m256i) R;
    }
}
unittest
{
    __m256i A = _mm256_setr_epi32(0, 1,  2,  3,  4,  5,  6,  7);
    __m256i B = _mm256_setr_epi32(8, 9, 10, 11, 12, 13, 14, 15);
    int8 C = cast(int8) _mm256_unpackhi_epi32(A, B);
    int[8] correct = [2, 10, 3, 11, 6, 14, 7, 15];
    assert(C.array == correct);
}
5570 
/// Unpack and interleave 8-bit integers from the high half of each 128-bit lane in `a` and `b`.
__m256i _mm256_unpackhi_epi8 (__m256i a, __m256i b) pure @trusted
{
    static if (GDC_with_AVX2)
    {
        // GDC with AVX2: single punpckhbw builtin.
        return cast(__m256i) __builtin_ia32_punpckhbw256(cast(ubyte32)a, cast(ubyte32)b);
    }
    else static if (LDC_with_optimizations)
    {
        // LDC: one shufflevector interleaving bytes 8..15 of each 128-bit lane
        // of `a` with bytes 8..15 of the matching lane of `b` (indices 40..47
        // and 56..63 address `b`).
        enum ir = `%r = shufflevector <32 x i8> %0, <32 x i8> %1, <32 x i32> <i32 8, i32 40,  i32 9, i32 41, i32 10, i32 42, i32 11, i32 43, i32 12, i32 44, i32 13, i32 45, i32 14, i32 46, i32 15, i32 47, i32 24, i32 56, i32 25, i32 57, i32 26, i32 58, i32 27, i32 59, i32 28, i32 60, i32 29, i32 61, i32 30, i32 62, i32 31, i32 63>
            ret <32 x i8> %r`;
        return cast(__m256i)LDCInlineIR!(ir, byte32, byte32, byte32)(cast(byte32)a, cast(byte32)b);
    }
    else
    {
        // Splitting always beneficial
        __m128i a_lo = _mm256_extractf128_si256!0(a);
        __m128i a_hi = _mm256_extractf128_si256!1(a);
        __m128i b_lo = _mm256_extractf128_si256!0(b);
        __m128i b_hi = _mm256_extractf128_si256!1(b);
        __m128i r_lo = _mm_unpackhi_epi8(a_lo, b_lo);
        __m128i r_hi = _mm_unpackhi_epi8(a_hi, b_hi);
        return _mm256_set_m128i(r_hi, r_lo);
    }
}
unittest
{
    __m256i A = _mm256_setr_epi8(  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15,
                                  16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31);
    __m256i B = _mm256_setr_epi8( 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
                                  48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63);
    byte32 C = cast(byte32) _mm256_unpackhi_epi8(A, B);
    byte[32] correct =          [  8, 40,  9, 41, 10, 42, 11, 43, 12, 44, 13, 45, 14, 46, 15, 47,
                                  24, 56, 25, 57, 26, 58, 27, 59, 28, 60, 29, 61, 30, 62, 31, 63 ];
    assert(C.array == correct);
}
5607 
/// Unpack and interleave 64-bit integers from the high half of each 128-bit lane in `a` and `b`.
__m256i _mm256_unpackhi_epi64 (__m256i a, __m256i b) pure @trusted
{
    version(GNU)
        enum split = true; // Benefits GDC in non-AVX2 (the GDC_with_AVX2 branch below wins otherwise)
    else
        enum split = false;

    static if (GDC_with_AVX2)
    {
        // GDC with AVX2: single punpckhqdq builtin; operands are already long4.
        return __builtin_ia32_punpckhqdq256(a, b);
    }
    else static if (LDC_with_optimizations)
    {
        // LDC: shufflevector picking a[1], b[1], a[3], b[3].
        enum ir = `%r = shufflevector <4 x i64> %0, <4 x i64> %1, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
            ret <4 x i64> %r`;
        return cast(__m256i)LDCInlineIR!(ir, long4, long4, long4)(a, b);
    }
    else static if (split)
    {
        // Two 128-bit unpacks, then recombine.
        __m128i a_lo = _mm256_extractf128_si256!0(a);
        __m128i a_hi = _mm256_extractf128_si256!1(a);
        __m128i b_lo = _mm256_extractf128_si256!0(b);
        __m128i b_hi = _mm256_extractf128_si256!1(b);
        __m128i r_lo = _mm_unpackhi_epi64(a_lo, b_lo);
        __m128i r_hi = _mm_unpackhi_epi64(a_hi, b_hi);
        return _mm256_set_m128i(r_hi, r_lo);
    }
    else
    {        
        // Scalar fallback: take the upper 64-bit element of each 128-bit lane.
        long4 R;
        R.ptr[0] = a.array[1];
        R.ptr[1] = b.array[1];
        R.ptr[2] = a.array[3];
        R.ptr[3] = b.array[3];
        return R;
    }
}
unittest
{
    __m256i A = _mm256_setr_epi64(0x22222222_22222222, 0x33333333_33333333, 2, 3);
    __m256i B = _mm256_setr_epi64(0x44444444_44444444, 0x55555555_55555555, 4, 5);
    long4 C = _mm256_unpackhi_epi64(A, B);
    long[4] correct = [0x33333333_33333333, 0x55555555_55555555, 3, 5];
    assert(C.array == correct);
}
5654 
/// Unpack and interleave 16-bit integers from the low half of each 128-bit lane in `a` and `b`.
__m256i _mm256_unpacklo_epi16 (__m256i a, __m256i b) pure @safe
{
    static if (GDC_with_AVX2)
    {
        // GDC with AVX2: single punpcklwd builtin.
        return cast(__m256i) __builtin_ia32_punpcklwd256(cast(short16)a, cast(short16)b);
    }
    else static if (LDC_with_optimizations)
    {
        // LDC: shufflevector interleaving elements 0..3 and 8..11 of `a` with
        // the matching elements of `b` (indices 16+ address `b`).
        enum ir = `%r = shufflevector <16 x i16> %0, <16 x i16> %1, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27>
            ret <16 x i16> %r`;
        return cast(__m256i)LDCInlineIR!(ir, short16, short16, short16)(cast(short16)a, cast(short16)b);
    }
    else
    {
        // Fallback: two 128-bit unpacks, then recombine.
        __m128i a_lo = _mm256_extractf128_si256!0(a);
        __m128i a_hi = _mm256_extractf128_si256!1(a);
        __m128i b_lo = _mm256_extractf128_si256!0(b);
        __m128i b_hi = _mm256_extractf128_si256!1(b);
        __m128i r_lo = _mm_unpacklo_epi16(a_lo, b_lo);
        __m128i r_hi = _mm_unpacklo_epi16(a_hi, b_hi);
        return _mm256_set_m128i(r_hi, r_lo);
    }
}
unittest
{
    __m256i A = _mm256_setr_epi16( 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15);
    __m256i B = _mm256_setr_epi16(16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31);
    short16 C = cast(short16) _mm256_unpacklo_epi16(A, B);
    short[16] correct = [0,  16, 1,  17, 2, 18, 3, 19, 
                         8,  24, 9,  25, 10, 26, 11, 27];
    assert(C.array == correct);
}
5688 
/// Unpack and interleave 32-bit integers from the low half of each 128-bit lane in `a` and `b`.
__m256i _mm256_unpacklo_epi32 (__m256i a, __m256i b) pure @trusted
{
    // Split into two 128-bit halves only for GDC without AVX2, where the
    // scalar fallback below would otherwise be generated poorly.
    static if (GDC_with_AVX2)
        enum bool split = false;
    else version(GNU)
        enum bool split = true;
    else
        enum bool split = false;

    static if (GDC_with_AVX2)
    {
        // GDC with AVX2: single punpckldq builtin.
        return cast(long4) __builtin_ia32_punpckldq256(cast(int8)a, cast(int8)b);
    }
    else static if (LDC_with_optimizations)
    {
        // LDC AVX2: Surprisingly, this starts using vunpcklps in LDC 1.31 -O1
        enum ir = `%r = shufflevector <8 x i32> %0, <8 x i32> %1, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13>
            ret <8 x i32> %r`;
        return cast(__m256i)LDCInlineIR!(ir, int8, int8, int8)(cast(int8)a, cast(int8)b);
    }
    else static if (split)
    {
        // Two 128-bit unpacks, then recombine.
        __m128i a_lo = _mm256_extractf128_si256!0(a);
        __m128i a_hi = _mm256_extractf128_si256!1(a);
        __m128i b_lo = _mm256_extractf128_si256!0(b);
        __m128i b_hi = _mm256_extractf128_si256!1(b);
        __m128i r_lo = _mm_unpacklo_epi32(a_lo, b_lo);
        __m128i r_hi = _mm_unpacklo_epi32(a_hi, b_hi);
        return _mm256_set_m128i(r_hi, r_lo);
    }
    else
    {
        // Scalar fallback: interleave elements 0..1 (low lane) and 4..5 (high lane).
        int8 R;
        int8 ai = cast(int8)a;
        int8 bi = cast(int8)b;
        R.ptr[0] = ai.array[0];
        R.ptr[1] = bi.array[0];
        R.ptr[2] = ai.array[1];
        R.ptr[3] = bi.array[1];
        R.ptr[4] = ai.array[4];
        R.ptr[5] = bi.array[4];
        R.ptr[6] = ai.array[5];
        R.ptr[7] = bi.array[5];
        return cast(__m256i) R;
    }
}
unittest
{
    __m256i A = _mm256_setr_epi32(0, 1,  2,  3,  4,  5,  6,  7);
    __m256i B = _mm256_setr_epi32(8, 9, 10, 11, 12, 13, 14, 15);
    int8 C = cast(int8) _mm256_unpacklo_epi32(A, B);
    int[8] correct = [0, 8, 1, 9, 4, 12, 5, 13];
    assert(C.array == correct);
}
5744 
/// Unpack and interleave 64-bit integers from the low half of each 128-bit lane in `a` and `b`.
__m256i _mm256_unpacklo_epi64 (__m256i a, __m256i b) pure @trusted
{
    version(GNU)
        enum split = true; // Benefits GDC in non-AVX2 (the GDC_with_AVX2 branch below wins otherwise)
    else
        enum split = false;

    static if (GDC_with_AVX2)
    {
        // GDC with AVX2: single punpcklqdq builtin; operands are already long4.
        return __builtin_ia32_punpcklqdq256(a, b);
    }
    else static if (LDC_with_optimizations)
    {
        // LDC: shufflevector picking a[0], b[0], a[2], b[2].
        enum ir = `%r = shufflevector <4 x i64> %0, <4 x i64> %1, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
            ret <4 x i64> %r`;
        return cast(__m256i)LDCInlineIR!(ir, long4, long4, long4)(a, b);
    }
    else static if (split)
    {
        // Two 128-bit unpacks, then recombine.
        __m128i a_lo = _mm256_extractf128_si256!0(a);
        __m128i a_hi = _mm256_extractf128_si256!1(a);
        __m128i b_lo = _mm256_extractf128_si256!0(b);
        __m128i b_hi = _mm256_extractf128_si256!1(b);
        __m128i r_lo = _mm_unpacklo_epi64(a_lo, b_lo);
        __m128i r_hi = _mm_unpacklo_epi64(a_hi, b_hi);
        return _mm256_set_m128i(r_hi, r_lo);
    }
    else
    {        
        // Scalar fallback: take the lower 64-bit element of each 128-bit lane.
        long4 R;
        R.ptr[0] = a.array[0];
        R.ptr[1] = b.array[0];
        R.ptr[2] = a.array[2];
        R.ptr[3] = b.array[2];
        return R;
    }
}
unittest
{
    __m256i A = _mm256_setr_epi64(0x22222222_22222222, 0x33333333_33333333, 2, 3);
    __m256i B = _mm256_setr_epi64(0x44444444_44444444, 0x55555555_55555555, 4, 5);
    long4 C = _mm256_unpacklo_epi64(A, B);
    long[4] correct = [0x22222222_22222222, 0x44444444_44444444, 2, 4];
    assert(C.array == correct);
}
5791 
/// Unpack and interleave 8-bit integers from the low half of each 128-bit lane in `a` and `b`.
__m256i _mm256_unpacklo_epi8 (__m256i a, __m256i b) pure @trusted
{
    static if (GDC_with_AVX2)
    {
        // GDC with AVX2: single punpcklbw builtin.
        return cast(__m256i) __builtin_ia32_punpcklbw256(cast(ubyte32)a, cast(ubyte32)b);
    }
    else static if (LDC_with_optimizations)
    {
        // LDC: one shufflevector interleaving bytes 0..7 of each 128-bit lane
        // of `a` with bytes 0..7 of the matching lane of `b` (indices 32..39
        // and 48..55 address `b`).
        enum ir = `%r = shufflevector <32 x i8> %0, <32 x i8> %1, <32 x i32> <i32 0, i32 32, i32 1, i32 33, i32 2, i32 34, i32 3, i32 35, i32 4, i32 36, i32 5, i32 37, i32 6, i32 38, i32 7, i32 39, i32 16, i32 48, i32 17, i32 49, i32 18, i32 50, i32 19, i32 51, i32 20, i32 52, i32 21, i32 53, i32 22, i32 54, i32 23, i32 55>
            ret <32 x i8> %r`;
        return cast(__m256i)LDCInlineIR!(ir, byte32, byte32, byte32)(cast(byte32)a, cast(byte32)b);
    }
    else
    {
        // Splitting always beneficial
        __m128i a_lo = _mm256_extractf128_si256!0(a);
        __m128i a_hi = _mm256_extractf128_si256!1(a);
        __m128i b_lo = _mm256_extractf128_si256!0(b);
        __m128i b_hi = _mm256_extractf128_si256!1(b);
        __m128i r_lo = _mm_unpacklo_epi8(a_lo, b_lo);
        __m128i r_hi = _mm_unpacklo_epi8(a_hi, b_hi);
        return _mm256_set_m128i(r_hi, r_lo);
    }
}
unittest
{
    __m256i A = _mm256_setr_epi8(  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15,
                                  16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31);
    __m256i B = _mm256_setr_epi8( 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
                                  48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63);
    byte32 C = cast(byte32) _mm256_unpacklo_epi8(A, B);
    byte[32] correct =          [  0, 32,  1, 33,  2, 34,  3, 35,  4, 36,  5, 37,  6, 38,  7, 39,
                                  16, 48, 17, 49, 18, 50, 19, 51, 20, 52, 21, 53, 22, 54, 23, 55 ];
    assert(C.array == correct);
}
5828 
/// Compute the bitwise XOR of 256 bits (representing integer data) in `a` and `b`.
__m256i _mm256_xor_si256 (__m256i a, __m256i b) pure @safe
{
    // Lane-agnostic bitwise operation: D's vector XOR covers it directly.
    __m256i result = a;
    result ^= b;
    return result;
}
unittest
{
    __m256i A = _mm256_setr_epi64(975394,    619809709,    -1,    54);
    __m256i B = _mm256_setr_epi64(-920275025,       -6, 85873, 96644);
    long4 X = cast(long4) _mm256_xor_si256(A, B);
    long[4] expected = [975394 ^ (-920275025L), 619809709L ^ -6, (-1) ^ 85873, 54 ^ 96644];
    assert(X.array == expected);
}
5842 
/// Returns: `true` when `scale` is one of the factors an x86 SIB byte can
/// encode with its two scale bits (1, 2, 4 or 8).
private bool isValidSIBScale(const int scale)
{
    switch (scale)
    {
        case 1, 2, 4, 8:
            return true;
        default:
            return false;
    }
}