/**
 * AVX2 intrinsics.
 * https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#techs=AVX2
 *
 * Copyright: Guillaume Piolat 2022.
 *            Johan Engelen 2022.
 * License:   $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
 */
module inteli.avx2intrin;

// AVX2 instructions
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#techs=AVX2
// Note: this header will work whether you have AVX2 enabled or not.
// With LDC, use "dflags-ldc": ["-mattr=+avx2"] or equivalent to actively
// generate AVX2 instructions.
// With GDC, use "dflags-gdc": ["-mavx2"] or equivalent to actively
// generate AVX2 instructions.
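//
// For instance, an illustrative dub.json fragment (the dependency version below
// is a placeholder, not a recommendation):
//
//     {
//         "dependencies": { "intel-intrinsics": "~>1.0" },
//         "dflags-ldc": ["-mattr=+avx2"],
//         "dflags-gdc": ["-mavx2"]
//     }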

public import inteli.types;
import inteli.internals;

// Pull in all previous instruction set intrinsics.
public import inteli.avxintrin;

nothrow @nogc:

/// Compute the absolute value of packed signed 16-bit integers in `a`.
__m256i _mm256_abs_epi16 (__m256i a) @trusted
{
    // PERF DMD
    version(LDC)
        enum split = true; // always beneficial in LDC neon, ssse3, or even sse2
    else
        enum split = GDC_with_SSSE3;

    static if (GDC_with_AVX2)
    {
        return cast(__m256i) __builtin_ia32_pabsw256(cast(short16)a);
    }
    else static if (__VERSION__ >= 2097 && LDC_with_AVX2)
    {
        // Before LDC 1.27, the llvm.abs LLVM intrinsic didn't exist,
        // hence there was no good way to do abs(256-bit).
        return cast(__m256i) inteli_llvm_abs!short16(cast(short16)a, false);
    }
    else static if (split)
    {
        __m128i a_lo = _mm256_extractf128_si256!0(a);
        __m128i a_hi = _mm256_extractf128_si256!1(a);
        __m128i r_lo = _mm_abs_epi16(a_lo);
        __m128i r_hi = _mm_abs_epi16(a_hi);
        return _mm256_set_m128i(r_hi, r_lo);
    }
    else
    {
        short16 sa = cast(short16)a;
        for (int i = 0; i < 16; ++i)
        {
            short s = sa.array[i];
            sa.ptr[i] = s >= 0 ? s : cast(short)(-cast(int)(s));
        }
        return cast(__m256i)sa;
    }
}
unittest
{
    __m256i A = _mm256_setr_epi16(0, -1, -32768, 32767, 10, -10, 1000, -1000,
                                  1, -1, -32768, 32767, 12, -13, 1000, -1040);
    short16 B = cast(short16) _mm256_abs_epi16(A);
    short[16] correct = [0, 1, -32768, 32767, 10, 10, 1000, 1000,
                         1, 1, -32768, 32767, 12, 13, 1000, 1040];
    assert(B.array == correct);
}

/// Compute the absolute value of packed signed 32-bit integers in `a`.
__m256i _mm256_abs_epi32 (__m256i a) @trusted
{
    // PERF DMD
    version(LDC)
        enum split = true; // always beneficial in LDC neon, ssse3, or even sse2
    else
        enum split = false; // GDC manages to split and use pabsd in SSSE3 without guidance

    static if (GDC_with_AVX2)
    {
        return cast(__m256i) __builtin_ia32_pabsd256(cast(int8)a);
    }
    else static if (__VERSION__ >= 2097 && LDC_with_AVX2)
    {
        // Before LDC 1.27, the llvm.abs LLVM intrinsic didn't exist,
        // hence there was no good way to do abs(256-bit).
        return cast(__m256i) inteli_llvm_abs!int8(cast(int8)a, false);
    }
    else static if (split)
    {
        __m128i a_lo = _mm256_extractf128_si256!0(a);
        __m128i a_hi = _mm256_extractf128_si256!1(a);
        __m128i r_lo = _mm_abs_epi32(a_lo);
        __m128i r_hi = _mm_abs_epi32(a_hi);
        return _mm256_set_m128i(r_hi, r_lo);
    }
    else
    {
        int8 sa = cast(int8)a;
        for (int i = 0; i < 8; ++i)
        {
            int s = sa.array[i];
            sa.ptr[i] = (s >= 0 ? s : -s);
        }
        return cast(__m256i)sa;
    }
}
unittest
{
    __m256i A = _mm256_setr_epi32(0, -1, -2_147_483_648, -2_147_483_647, -1, 0, -2_147_483_648, -2_147_483_646);
    int8 B = cast(int8) _mm256_abs_epi32(A);
    int[8] correct = [0, 1, -2_147_483_648, 2_147_483_647, 1, 0, -2_147_483_648, 2_147_483_646];
    assert(B.array == correct);
}

/// Compute the absolute value of packed signed 8-bit integers in `a`.
__m256i _mm256_abs_epi8 (__m256i a) @trusted
{
    // PERF DMD
    // PERF: GDC in SSSE3 through AVX doesn't use pabsb, and split is catastrophic because of _mm_min_epu8
    version(LDC)
        enum split = true; // always beneficial in LDC neon, ssse3, sse2
    else
        enum split = false;

    static if (GDC_with_AVX2)
    {
        return cast(__m256i) __builtin_ia32_pabsb256(cast(ubyte32)a);
    }
    else static if (__VERSION__ >= 2097 && LDC_with_AVX2)
    {
        // Before LDC 1.27, the llvm.abs LLVM intrinsic didn't exist,
        // hence there was no good way to do abs(256-bit).
        return cast(__m256i) inteli_llvm_abs!byte32(cast(byte32)a, false);
    }
    else static if (split)
    {
        __m128i a_lo = _mm256_extractf128_si256!0(a);
        __m128i a_hi = _mm256_extractf128_si256!1(a);
        __m128i r_lo = _mm_abs_epi8(a_lo);
        __m128i r_hi = _mm_abs_epi8(a_hi);
        return _mm256_set_m128i(r_hi, r_lo);
    }
    else
    {
        // Basically this loop is poison for the LDC optimizer
        byte32 sa = cast(byte32)a;
        for (int i = 0; i < 32; ++i)
        {
            byte s = sa.array[i];
            sa.ptr[i] = s >= 0 ? s : cast(byte)(-cast(int)(s));
        }
        return cast(__m256i)sa;
    }
}
unittest
{
    __m256i A = _mm256_setr_epi8(0, -1, -128, -127, 127,  0,  0,  0,  0,  0, 0, 0, 0, 0, 0, 0,
                                 0, -1, -128, -126, 127, -6, -5, -4, -3, -2, 0, 1, 2, 3, 4, 5);
    byte32 B = cast(byte32) _mm256_abs_epi8(A);
    byte[32] correct =          [0,  1, -128,  127, 127,  0,  0,  0,  0,  0, 0, 0, 0, 0, 0, 0,
                                 0,  1, -128,  126, 127,  6,  5,  4,  3,  2, 0, 1, 2, 3, 4, 5];
    assert(B.array == correct);
}

/// Add packed 16-bit integers in `a` and `b`.
__m256i _mm256_add_epi16 (__m256i a, __m256i b) pure @safe
{
    pragma(inline, true);
    return cast(__m256i)(cast(short16)a + cast(short16)b);
}
unittest
{
    __m256i A = _mm256_setr_epi16( -7, -1, 0, 9, -100, 100, 234, 432, -32768, 32767, 0, -1, -20000, 0,  6, -2);
    short16 R = cast(short16) _mm256_add_epi16(A, A);
    short[16] correct         = [ -14, -2, 0, 18, -200, 200, 468, 864,     0,    -2, 0, -2,  25536, 0, 12, -4 ];
    assert(R.array == correct);
}

/// Add packed 32-bit integers in `a` and `b`.
__m256i _mm256_add_epi32(__m256i a, __m256i b) pure @safe
{
    pragma(inline, true);
    return cast(__m256i)(cast(int8)a + cast(int8)b);
}
unittest
{
    __m256i A = _mm256_setr_epi32( -7, -1, 0, 9, -100, 100, 234, 432);
    int8 R = cast(int8) _mm256_add_epi32(A, A);
    int[8] correct = [ -14, -2, 0, 18, -200, 200, 468, 864 ];
    assert(R.array == correct);
}

/// Add packed 64-bit integers in `a` and `b`.
__m256i _mm256_add_epi64 (__m256i a, __m256i b) pure @safe
{
    pragma(inline, true);
    return a + b;
}
unittest
{
    __m256i A = _mm256_setr_epi64(-1, 0x8000_0000_0000_0000, 42, -12);
    long4 R = cast(long4) _mm256_add_epi64(A, A);
    long[4] correct = [ -2, 0, 84, -24 ];
    assert(R.array == correct);
}

/// Add packed 8-bit integers in `a` and `b`.
__m256i _mm256_add_epi8 (__m256i a, __m256i b) pure @safe
{
    pragma(inline, true);
    return cast(__m256i)(cast(byte32)a + cast(byte32)b);
}
unittest
{
    __m256i A = _mm256_setr_epi8(4, 8, 13, -7, -1, 0, 9, 77, 4, 8, 13, -7, -1, 0, 9, 78,
                                 4, 9, 13, -7, -1, 0, 9, 77, 4, 8, 13, -7, -2, 0, 10, 78);
    byte32 R = cast(byte32) _mm256_add_epi8(A, A);
    byte[32] correct = [8, 16, 26, -14, -2, 0, 18, -102, 8, 16, 26, -14, -2, 0, 18, -100,
                        8, 18, 26, -14, -2, 0, 18, -102, 8, 16, 26, -14, -4, 0, 20, -100];
    assert(R.array == correct);
}

/// Add packed 16-bit signed integers in `a` and `b` using signed saturation.
__m256i _mm256_adds_epi16 (__m256i a, __m256i b) pure @trusted
{
    // PERF DMD
    static if (GDC_with_AVX2)
    {
        return cast(__m256i) __builtin_ia32_paddsw256(cast(short16)a, cast(short16)b);
    }
    else version(LDC)
    {
        return cast(__m256i) inteli_llvm_adds!short16(cast(short16)a, cast(short16)b);
    }
    else
    {
        short16 r;
        short16 sa = cast(short16)a;
        short16 sb = cast(short16)b;
        foreach(i; 0..16)
            r.ptr[i] = saturateSignedIntToSignedShort(sa.array[i] + sb.array[i]);
        return cast(__m256i)r;
    }
}
unittest
{
    short16 res = cast(short16) _mm256_adds_epi16(_mm256_setr_epi16( 7,  6,  5, -32768, 3, 3, 32767,   0,  7,  6,  5, -32768, 3, 3, 32767,   0),
                                                  _mm256_setr_epi16( 7,  6,  5, -30000, 3, 1,     1, -10,  7,  6,  5, -30000, 3, 1,     1, -10));
    static immutable short[16] correctResult                    =  [14, 12, 10, -32768, 6, 4, 32767, -10, 14, 12, 10, -32768, 6, 4, 32767, -10];
    assert(res.array == correctResult);
}

/// Add packed 8-bit signed integers in `a` and `b` using signed saturation.
__m256i _mm256_adds_epi8 (__m256i a, __m256i b) pure @trusted
{
    // PERF DMD
    static if (GDC_with_AVX2)
    {
        return cast(__m256i) __builtin_ia32_paddsb256(cast(ubyte32)a, cast(ubyte32)b);
    }
    else version(LDC)
    {
        return cast(__m256i) inteli_llvm_adds!byte32(cast(byte32)a, cast(byte32)b);
    }
    else
    {
        byte32 r;
        byte32 sa = cast(byte32)a;
        byte32 sb = cast(byte32)b;
        foreach(i; 0..32)
            r.ptr[i] = saturateSignedWordToSignedByte(sa.array[i] + sb.array[i]);
        return cast(__m256i)r;
    }
}
unittest
{
    byte32 res = cast(byte32) _mm256_adds_epi8(_mm256_setr_epi8(15, 14, 13, 12, 11, 127, 9, 8, 7, 6, 5, -128, 3, 2, 1, 0, 15, 14, 13, 12, 11, 127, 9, 8, 7, 6, 5, -128, 3, 2, 1, 0),
                                               _mm256_setr_epi8(15, 14, 13, 12, 11,  10, 9, 8, 7, 6, 5,   -4, 3, 2, 1, 0, 15, 14, 13, 12, 11,  10, 9, 8, 7, 6, 5,   -4, 3, 2, 1, 0));
    static immutable byte[32] correctResult = [30, 28, 26, 24, 22, 127, 18, 16, 14, 12, 10, -128, 6, 4, 2, 0, 30, 28, 26, 24, 22, 127, 18, 16, 14, 12, 10, -128, 6, 4, 2, 0];
    assert(res.array == correctResult);
}

/// Add packed 16-bit unsigned integers in `a` and `b` using unsigned saturation.
__m256i _mm256_adds_epu16 (__m256i a, __m256i b) pure @trusted
{
    // PERF DMD
    static if (GDC_with_AVX2)
    {
        return cast(__m256i) __builtin_ia32_paddusw256(cast(short16)a, cast(short16)b);
    }
    else version(LDC)
    {
        return cast(__m256i) inteli_llvm_addus!short16(cast(short16)a, cast(short16)b);
    }
    else
    {
        short16 r;
        short16 sa = cast(short16)a;
        short16 sb = cast(short16)b;
        foreach(i; 0..16)
            r.ptr[i] = saturateSignedIntToUnsignedShort(cast(ushort)(sa.array[i]) + cast(ushort)(sb.array[i]));
        return cast(__m256i)r;
    }
}
unittest
{
    short16 res = cast(short16) _mm256_adds_epu16(_mm256_set_epi16(3, 2, cast(short)65535, 0, 3, 2, cast(short)65535, 0, 3, 2, cast(short)65535, 0, 3, 2, cast(short)65535, 0),
                                                  _mm256_set_epi16(3, 2, 1, 0, 3, 2, 1, 0, 3, 2, 1, 0, 3, 2, 1, 0));
    static immutable short[16] correctResult = [0, cast(short)65535, 4, 6, 0, cast(short)65535, 4, 6, 0, cast(short)65535, 4, 6, 0, cast(short)65535, 4, 6];
    assert(res.array == correctResult);
}

/// Add packed 8-bit unsigned integers in `a` and `b` using unsigned saturation.
__m256i _mm256_adds_epu8 (__m256i a, __m256i b) pure @trusted
{
    // PERF DMD
    static if (GDC_with_AVX2)
    {
        return cast(__m256i) __builtin_ia32_paddusb256(cast(ubyte32)a, cast(ubyte32)b);
    }
    else version(LDC)
    {
        return cast(__m256i) inteli_llvm_addus!byte32(cast(byte32)a, cast(byte32)b);
    }
    else
    {
        byte32 r;
        byte32 sa = cast(byte32)a;
        byte32 sb = cast(byte32)b;
        foreach(i; 0..32)
            r.ptr[i] = saturateSignedWordToUnsignedByte(cast(ubyte)(sa.array[i]) + cast(ubyte)(sb.array[i]));
        return cast(__m256i)r;
    }
}
unittest
{
    __m256i A          = _mm256_setr_epi8(0, 0, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, cast(byte)255, 0, 0, 0, 0, 0, 0, 0, 0, cast(byte)136, 0, 0, 0, cast(byte)136, 0, 0, 0, 0, 0, 0);
    __m256i B          = _mm256_setr_epi8(0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0,             1, 0, 0, 0, 0, 0, 0, 0, 0, cast(byte)136, 0, 0, 0,            40, 0, 0, 0, 0, 0, 0);
    byte32 R = cast(byte32) _mm256_adds_epu8(A, B);
    static immutable byte[32] correct =  [0, 0, 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, cast(byte)255, 0, 0, 0, 0, 0, 0, 0, 0, cast(byte)255, 0, 0, 0, cast(byte)176, 0, 0, 0, 0, 0, 0];
    assert(R.array == correct);
}

// TODO __m256i _mm256_alignr_epi8 (__m256i a, __m256i b, const int imm8) pure @safe

/// Compute the bitwise AND of 256 bits (representing integer data) in `a` and `b`.
__m256i _mm256_and_si256 (__m256i a, __m256i b) pure @safe
{
    pragma(inline, true);
    return a & b;
}
unittest
{
    __m256i A = _mm256_set1_epi32(7);
    __m256i B = _mm256_set1_epi32(14);
    int8 R = cast(int8) _mm256_and_si256(A, B);
    int[8] correct = [6, 6, 6, 6, 6, 6, 6, 6];
    assert(R.array == correct);
}

/// Compute the bitwise NOT of 256 bits (representing integer data) in `a` and then AND with `b`.
__m256i _mm256_andnot_si256 (__m256i a, __m256i b) pure @safe
{
    pragma(inline, true);
    return (~a) & b;
}
unittest
{
    __m256i A = _mm256_setr_epi32(7, -2, 9, 54654, 7, -2, 9, 54654);
    __m256i B = _mm256_setr_epi32(14, 78, 111, -256, 14, 78, 111, -256);
    int8 R = cast(int8) _mm256_andnot_si256(A, B);
    int[8] correct = [8, 0, 102, -54784, 8, 0, 102, -54784];
    assert(R.array == correct);
}


// TODO __m256i _mm256_avg_epu16 (__m256i a, __m256i b) pure @safe
// TODO __m256i _mm256_avg_epu8 (__m256i a, __m256i b) pure @safe
// TODO __m256i _mm256_blend_epi16 (__m256i a, __m256i b, const int imm8) pure @safe
// TODO __m128i _mm_blend_epi32 (__m128i a, __m128i b, const int imm8) pure @safe
// TODO __m256i _mm256_blend_epi32 (__m256i a, __m256i b, const int imm8) pure @safe
// TODO __m256i _mm256_blendv_epi8 (__m256i a, __m256i b, __m256i mask) pure @safe
// TODO __m128i _mm_broadcastb_epi8 (__m128i a) pure @safe
// TODO __m256i _mm256_broadcastb_epi8 (__m128i a) pure @safe
// TODO __m128i _mm_broadcastd_epi32 (__m128i a) pure @safe
// TODO __m256i _mm256_broadcastd_epi32 (__m128i a) pure @safe
// TODO __m128i _mm_broadcastq_epi64 (__m128i a) pure @safe
// TODO __m256i _mm256_broadcastq_epi64 (__m128i a) pure @safe
// TODO __m128d _mm_broadcastsd_pd (__m128d a) pure @safe
// TODO __m256d _mm256_broadcastsd_pd (__m128d a) pure @safe
// TODO __m256i _mm_broadcastsi128_si256 (__m128i a) pure @safe
// TODO __m256i _mm256_broadcastsi128_si256 (__m128i a) pure @safe
// TODO __m128 _mm_broadcastss_ps (__m128 a) pure @safe
// TODO __m256 _mm256_broadcastss_ps (__m128 a) pure @safe
// TODO __m128i _mm_broadcastw_epi16 (__m128i a) pure @safe
// TODO __m256i _mm256_broadcastw_epi16 (__m128i a) pure @safe
// TODO __m256i _mm256_bslli_epi128 (__m256i a, const int imm8) pure @safe
// TODO __m256i _mm256_bsrli_epi128 (__m256i a, const int imm8) pure @safe
// TODO __m256i _mm256_cmpeq_epi16 (__m256i a, __m256i b) pure @safe
// TODO __m256i _mm256_cmpeq_epi32 (__m256i a, __m256i b) pure @safe
// TODO __m256i _mm256_cmpeq_epi64 (__m256i a, __m256i b) pure @safe
// TODO __m256i _mm256_cmpeq_epi8 (__m256i a, __m256i b) pure @safe
// TODO __m256i _mm256_cmpgt_epi16 (__m256i a, __m256i b) pure @safe
// TODO __m256i _mm256_cmpgt_epi32 (__m256i a, __m256i b) pure @safe
// TODO __m256i _mm256_cmpgt_epi64 (__m256i a, __m256i b) pure @safe
// TODO __m256i _mm256_cmpgt_epi8 (__m256i a, __m256i b) pure @safe
// TODO __m256i _mm256_cvtepi16_epi32 (__m128i a) pure @safe
// TODO __m256i _mm256_cvtepi16_epi64 (__m128i a) pure @safe
// TODO __m256i _mm256_cvtepi32_epi64 (__m128i a) pure @safe
// TODO __m256i _mm256_cvtepi8_epi16 (__m128i a) pure @safe
// TODO __m256i _mm256_cvtepi8_epi32 (__m128i a) pure @safe
// TODO __m256i _mm256_cvtepi8_epi64 (__m128i a) pure @safe

/// Zero-extend packed unsigned 16-bit integers in `a` to packed 32-bit integers.
// TODO verify
__m256i _mm256_cvtepu16_epi32(__m128i a) pure @trusted
{
    static if (GDC_with_AVX2)
    {
        return cast(__m256i) __builtin_ia32_pmovzxwd256(cast(short8)a);
    }
    else
    {
        short8 sa = cast(short8)a;
        int8 r; // PERF =void;
        // Explicit cast to unsigned to get *zero* extension (instead of sign extension).
        r.ptr[0] = cast(ushort)sa.array[0];
        r.ptr[1] = cast(ushort)sa.array[1];
        r.ptr[2] = cast(ushort)sa.array[2];
        r.ptr[3] = cast(ushort)sa.array[3];
        r.ptr[4] = cast(ushort)sa.array[4];
        r.ptr[5] = cast(ushort)sa.array[5];
        r.ptr[6] = cast(ushort)sa.array[6];
        r.ptr[7] = cast(ushort)sa.array[7];
        return cast(__m256i)r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi16(-1, 0, -32768, 32767, -1, 0, -32768, 32767);
    int8 C = cast(int8) _mm256_cvtepu16_epi32(A);
    int[8] correct = [65535, 0, 32768, 32767, 65535, 0, 32768, 32767];
    assert(C.array == correct);
}

// TODO __m256i _mm256_cvtepu16_epi64 (__m128i a) pure @safe
// TODO __m256i _mm256_cvtepu32_epi64 (__m128i a) pure @safe
// TODO __m256i _mm256_cvtepu8_epi16 (__m128i a) pure @safe
// TODO __m256i _mm256_cvtepu8_epi32 (__m128i a) pure @safe
// TODO __m256i _mm256_cvtepu8_epi64 (__m128i a) pure @safe
// TODO int _mm256_extract_epi16 (__m256i a, const int index) pure @safe
// TODO int _mm256_extract_epi8 (__m256i a, const int index) pure @safe

/// Extract 128 bits (composed of integer data) from `a`, selected with `imm8`.
// TODO verify
__m128i _mm256_extracti128_si256(int imm8)(__m256i a) pure @trusted
    if ( (imm8 == 0) || (imm8 == 1) )
{
    pragma(inline, true);

    static if (GDC_with_AVX2)
    {
        return cast(__m128i) __builtin_ia32_extract128i256(a, imm8);
    }
    else version (LDC)
    {
        enum str = (imm8 == 1) ? "<i32 2, i32 3>" : "<i32 0, i32 1>";
        enum ir = "%r = shufflevector <4 x i64> %0, <4 x i64> undef, <2 x i32>" ~ str ~ "\n" ~
                  "ret <2 x i64> %r";
        return cast(__m128i) LDCInlineIR!(ir, ulong2, ulong4)(cast(ulong4)a);
    }
    else
    {
        long4 al = cast(long4) a;
        long2 ret;
        ret.ptr[0] = (imm8==1) ? al.array[2] : al.array[0];
        ret.ptr[1] = (imm8==1) ? al.array[3] : al.array[1];
        return cast(__m128i) ret;
    }
}
unittest
{
    __m256i A = _mm256_setr_epi32( -7, -1, 0, 9, -100, 100, 234, 432 );
    int[4] correct0 = [ -7, -1, 0, 9 ];
    int[4] correct1 = [ -100, 100, 234, 432 ];
    __m128i R0 = _mm256_extracti128_si256!(0)(A);
    __m128i R1 = _mm256_extracti128_si256!(1)(A);
    assert(R0.array == correct0);
    assert(R1.array == correct1);
}

// TODO __m256i _mm256_hadd_epi16 (__m256i a, __m256i b) pure @safe
// TODO __m256i _mm256_hadd_epi32 (__m256i a, __m256i b) pure @safe
// TODO __m256i _mm256_hadds_epi16 (__m256i a, __m256i b) pure @safe
// TODO __m256i _mm256_hsub_epi16 (__m256i a, __m256i b) pure @safe
// TODO __m256i _mm256_hsub_epi32 (__m256i a, __m256i b) pure @safe
// TODO __m256i _mm256_hsubs_epi16 (__m256i a, __m256i b) pure @safe
// TODO __m128i _mm_i32gather_epi32 (int const* base_addr, __m128i vindex, const int scale) pure @safe
// TODO __m128i _mm_mask_i32gather_epi32 (__m128i src, int const* base_addr, __m128i vindex, __m128i mask, const int scale) pure @safe
// TODO __m256i _mm256_i32gather_epi32 (int const* base_addr, __m256i vindex, const int scale) pure @safe
// TODO __m256i _mm256_mask_i32gather_epi32 (__m256i src, int const* base_addr, __m256i vindex, __m256i mask, const int scale) pure @safe
// TODO __m128i _mm_i32gather_epi64 (__int64 const* base_addr, __m128i vindex, const int scale) pure @safe
// TODO __m128i _mm_mask_i32gather_epi64 (__m128i src, __int64 const* base_addr, __m128i vindex, __m128i mask, const int scale) pure @safe
// TODO __m256i _mm256_i32gather_epi64 (__int64 const* base_addr, __m128i vindex, const int scale) pure @safe
// TODO __m256i _mm256_mask_i32gather_epi64 (__m256i src, __int64 const* base_addr, __m128i vindex, __m256i mask, const int scale) pure @safe
// TODO __m128d _mm_i32gather_pd (double const* base_addr, __m128i vindex, const int scale) pure @safe
// TODO __m128d _mm_mask_i32gather_pd (__m128d src, double const* base_addr, __m128i vindex, __m128d mask, const int scale) pure @safe
// TODO __m256d _mm256_i32gather_pd (double const* base_addr, __m128i vindex, const int scale) pure @safe
// TODO __m256d _mm256_mask_i32gather_pd (__m256d src, double const* base_addr, __m128i vindex, __m256d mask, const int scale) pure @safe
// TODO __m128 _mm_i32gather_ps (float const* base_addr, __m128i vindex, const int scale) pure @safe
// TODO __m128 _mm_mask_i32gather_ps (__m128 src, float const* base_addr, __m128i vindex, __m128 mask, const int scale) pure @safe
// TODO __m256 _mm256_i32gather_ps (float const* base_addr, __m256i vindex, const int scale) pure @safe
// TODO __m256 _mm256_mask_i32gather_ps (__m256 src, float const* base_addr, __m256i vindex, __m256 mask, const int scale) pure @safe
// TODO __m128i _mm_i64gather_epi32 (int const* base_addr, __m128i vindex, const int scale) pure @safe
// TODO __m128i _mm_mask_i64gather_epi32 (__m128i src, int const* base_addr, __m128i vindex, __m128i mask, const int scale) pure @safe
// TODO __m128i _mm256_i64gather_epi32 (int const* base_addr, __m256i vindex, const int scale) pure @safe
// TODO __m128i _mm256_mask_i64gather_epi32 (__m128i src, int const* base_addr, __m256i vindex, __m128i mask, const int scale) pure @safe
// TODO __m128i _mm_i64gather_epi64 (__int64 const* base_addr, __m128i vindex, const int scale) pure @safe
// TODO __m128i _mm_mask_i64gather_epi64 (__m128i src, __int64 const* base_addr, __m128i vindex, __m128i mask, const int scale) pure @safe
// TODO __m256i _mm256_i64gather_epi64 (__int64 const* base_addr, __m256i vindex, const int scale) pure @safe
// TODO __m256i _mm256_mask_i64gather_epi64 (__m256i src, __int64 const* base_addr, __m256i vindex, __m256i mask, const int scale) pure @safe
// TODO __m128d _mm_i64gather_pd (double const* base_addr, __m128i vindex, const int scale) pure @safe
// TODO __m128d _mm_mask_i64gather_pd (__m128d src, double const* base_addr, __m128i vindex, __m128d mask, const int scale) pure @safe
// TODO __m256d _mm256_i64gather_pd (double const* base_addr, __m256i vindex, const int scale) pure @safe
// TODO __m256d _mm256_mask_i64gather_pd (__m256d src, double const* base_addr, __m256i vindex, __m256d mask, const int scale) pure @safe
// TODO __m128 _mm_i64gather_ps (float const* base_addr, __m128i vindex, const int scale) pure @safe
// TODO __m128 _mm_mask_i64gather_ps (__m128 src, float const* base_addr, __m128i vindex, __m128 mask, const int scale) pure @safe
// TODO __m128 _mm256_i64gather_ps (float const* base_addr, __m256i vindex, const int scale) pure @safe
// TODO __m128 _mm256_mask_i64gather_ps (__m128 src, float const* base_addr, __m256i vindex, __m128 mask, const int scale) pure @safe
// TODO __m256i _mm256_inserti128_si256 (__m256i a, __m128i b, const int imm8) pure @safe

/// Multiply packed signed 16-bit integers in `a` and `b`, producing intermediate
/// signed 32-bit integers. Horizontally add adjacent pairs of intermediate 32-bit integers,
/// and pack the results in destination.
// TODO verify
__m256i _mm256_madd_epi16 (__m256i a, __m256i b) pure @trusted
{
    static if (GDC_with_AVX2)
    {
        return cast(__m256i) __builtin_ia32_pmaddwd256(cast(short16)a, cast(short16)b);
    }
    else static if (LDC_with_AVX2)
    {
        return cast(__m256i) __builtin_ia32_pmaddwd256(cast(short16)a, cast(short16)b);
    }
    else
    {
        short16 sa = cast(short16)a;
        short16 sb = cast(short16)b;
        int8 r; // PERF =void;
        foreach(i; 0..8)
        {
            r.ptr[i] = sa.array[2*i] * sb.array[2*i] + sa.array[2*i+1] * sb.array[2*i+1];
        }
        return cast(__m256i) r;
    }
}
unittest
{
    short16 A = [0, 1, 2, 3, -32768, -32768, 32767, 32767, 0, 1, 2, 3, -32768, -32768, 32767, 32767];
    short16 B = [0, 1, 2, 3, -32768, -32768, 32767, 32767, 0, 1, 2, 3, -32768, -32768, 32767, 32767];
    int8 R = cast(int8) _mm256_madd_epi16(cast(__m256i)A, cast(__m256i)B);
    int[8] correct = [1, 13, -2147483648, 2*32767*32767, 1, 13, -2147483648, 2*32767*32767];
    assert(R.array == correct);
}
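
// Added illustrative sketch (not part of the original test-suite): with uniform
// inputs, each 32-bit lane of the result is simply
// a[2i]*b[2i] + a[2i+1]*b[2i+1], here 2*3 + 2*3 == 12.
unittest
{
    __m256i A = _mm256_set1_epi16(2);
    __m256i B = _mm256_set1_epi16(3);
    int8 R = cast(int8) _mm256_madd_epi16(A, B);
    int[8] correct = [12, 12, 12, 12, 12, 12, 12, 12];
    assert(R.array == correct);
}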

// TODO __m256i _mm256_maddubs_epi16 (__m256i a, __m256i b) pure @safe
// TODO __m128i _mm_maskload_epi32 (int const* mem_addr, __m128i mask) pure @safe
// TODO __m256i _mm256_maskload_epi32 (int const* mem_addr, __m256i mask) pure @safe
// TODO __m128i _mm_maskload_epi64 (__int64 const* mem_addr, __m128i mask) pure @safe
// TODO __m256i _mm256_maskload_epi64 (__int64 const* mem_addr, __m256i mask) pure @safe
// TODO __m256i _mm256_max_epi16 (__m256i a, __m256i b) pure @safe
// TODO __m256i _mm256_max_epi32 (__m256i a, __m256i b) pure @safe
// TODO __m256i _mm256_max_epi8 (__m256i a, __m256i b) pure @safe
// TODO __m256i _mm256_max_epu16 (__m256i a, __m256i b) pure @safe
// TODO __m256i _mm256_max_epu32 (__m256i a, __m256i b) pure @safe
// TODO __m256i _mm256_max_epu8 (__m256i a, __m256i b) pure @safe
// TODO __m256i _mm256_min_epi16 (__m256i a, __m256i b) pure @safe
// TODO __m256i _mm256_min_epi32 (__m256i a, __m256i b) pure @safe
// TODO __m256i _mm256_min_epi8 (__m256i a, __m256i b) pure @safe
// TODO __m256i _mm256_min_epu16 (__m256i a, __m256i b) pure @safe
// TODO __m256i _mm256_min_epu32 (__m256i a, __m256i b) pure @safe
// TODO __m256i _mm256_min_epu8 (__m256i a, __m256i b) pure @safe
// TODO int _mm256_movemask_epi8 (__m256i a) pure @safe
// TODO __m256i _mm256_mpsadbw_epu8 (__m256i a, __m256i b, const int imm8) pure @safe
// TODO __m256i _mm256_mul_epi32 (__m256i a, __m256i b) pure @safe
// TODO __m256i _mm256_mul_epu32 (__m256i a, __m256i b) pure @safe
// TODO __m256i _mm256_mulhi_epi16 (__m256i a, __m256i b) pure @safe
// TODO __m256i _mm256_mulhi_epu16 (__m256i a, __m256i b) pure @safe
// TODO __m256i _mm256_mulhrs_epi16 (__m256i a, __m256i b) pure @safe
// TODO __m256i _mm256_mullo_epi16 (__m256i a, __m256i b) pure @safe
// TODO __m256i _mm256_mullo_epi32 (__m256i a, __m256i b) pure @safe

/// Compute the bitwise OR of 256 bits (representing integer data) in `a` and `b`.
__m256i _mm256_or_si256 (__m256i a, __m256i b) pure @safe
{
    return a | b;
}
unittest
{
    long A = 0x55555555_55555555;
    long B = 0xAAAAAAAA_AAAAAAAA;
    __m256i vA = _mm256_set_epi64(A, B, A, B);
    __m256i vB = _mm256_set_epi64(B, A, 0, B);
    __m256i R  = _mm256_or_si256(vA, vB);
    long[4] correct = [B, A, -1, -1];
    assert(R.array == correct);
}

// TODO __m256i _mm256_packs_epi16 (__m256i a, __m256i b) pure @safe
// TODO __m256i _mm256_packs_epi32 (__m256i a, __m256i b) pure @safe
// TODO __m256i _mm256_packus_epi16 (__m256i a, __m256i b) pure @safe
// TODO __m256i _mm256_packus_epi32 (__m256i a, __m256i b) pure @safe
// TODO __m256i _mm256_permute2x128_si256 (__m256i a, __m256i b, const int imm8) pure @safe
// TODO __m256i _mm256_permute4x64_epi64 (__m256i a, const int imm8) pure @safe
// TODO __m256d _mm256_permute4x64_pd (__m256d a, const int imm8) pure @safe
// TODO __m256i _mm256_permutevar8x32_epi32 (__m256i a, __m256i idx) pure @safe
// TODO __m256 _mm256_permutevar8x32_ps (__m256 a, __m256i idx) pure @safe

/// Compute the absolute differences of packed unsigned 8-bit integers in `a` and `b`, then horizontally sum each
/// consecutive 8 differences to produce four unsigned 16-bit integers, and pack these unsigned 16-bit integers in the
/// low 16 bits of 64-bit elements in result.
// TODO verify
__m256i _mm256_sad_epu8 (__m256i a, __m256i b) pure @trusted
{
    static if (GDC_with_AVX2)
    {
        return cast(__m256i) __builtin_ia32_psadbw256(cast(ubyte32)a, cast(ubyte32)b);
    }
    else static if (LDC_with_AVX2)
    {
        return cast(__m256i) __builtin_ia32_psadbw256(cast(byte32)a, cast(byte32)b);
    }
    else
    {
        // PERF: ARM64/32 is lacking
        byte32 ab = cast(byte32)a;
        byte32 bb = cast(byte32)b;
        ubyte[32] t;
        foreach(i; 0..32)
        {
            int diff = cast(ubyte)(ab.array[i]) - cast(ubyte)(bb.array[i]);
            if (diff < 0) diff = -diff;
            t.ptr[i] = cast(ubyte)(diff);
        }
        int8 r = cast(int8) _mm256_setzero_si256();
        r.ptr[0] = t[0]  + t[1]  + t[2]  + t[3]  + t[4]  + t[5]  + t[6]  + t[7];
        r.ptr[2] = t[8]  + t[9]  + t[10] + t[11] + t[12] + t[13] + t[14] + t[15];
        r.ptr[4] = t[16] + t[17] + t[18] + t[19] + t[20] + t[21] + t[22] + t[23];
        r.ptr[6] = t[24] + t[25] + t[26] + t[27] + t[28] + t[29] + t[30] + t[31];
        return cast(__m256i) r;
    }
}
unittest
{
    __m256i A = _mm256_setr_epi8(3, 4, 6, 8, 12, 14, 18, 20, 24, 30, 32, 38, 42, 44, 48, 54,
                                 3, 4, 6, 8, 12, 14, 18, 20, 24, 30, 32, 38, 42, 44, 48, 54); // primes + 1
    __m256i B = _mm256_set1_epi8(1);
    int8 R = cast(int8) _mm256_sad_epu8(A, B);
    int[8] correct = [2 + 3 + 5 + 7 + 11 + 13 + 17 + 19,
                      0,
                      23 + 29 + 31 + 37 + 41 + 43 + 47 + 53,
                      0,
                      2 + 3 + 5 + 7 + 11 + 13 + 17 + 19,
                      0,
                      23 + 29 + 31 + 37 + 41 + 43 + 47 + 53,
                      0];
    assert(R.array == correct);
}
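
// Added illustrative sketch (not part of the original test-suite): with uniform
// bytes, each 64-bit lane sums eight identical differences, |5 - 2| * 8 == 24.
unittest
{
    __m256i A = _mm256_set1_epi8(5);
    __m256i B = _mm256_set1_epi8(2);
    int8 R = cast(int8) _mm256_sad_epu8(A, B);
    int[8] correct = [24, 0, 24, 0, 24, 0, 24, 0];
    assert(R.array == correct);
}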


// TODO __m256i _mm256_shuffle_epi32 (__m256i a, const int imm8) pure @safe
// TODO __m256i _mm256_shuffle_epi8 (__m256i a, __m256i b) pure @safe
// TODO __m256i _mm256_shufflehi_epi16 (__m256i a, const int imm8) pure @safe
// TODO __m256i _mm256_shufflelo_epi16 (__m256i a, const int imm8) pure @safe
// TODO __m256i _mm256_sign_epi16 (__m256i a, __m256i b) pure @safe
// TODO __m256i _mm256_sign_epi32 (__m256i a, __m256i b) pure @safe
// TODO __m256i _mm256_sign_epi8 (__m256i a, __m256i b) pure @safe
// TODO __m256i _mm256_sll_epi16 (__m256i a, __m128i count) pure @safe
// TODO __m256i _mm256_sll_epi32 (__m256i a, __m128i count) pure @safe
// TODO __m256i _mm256_sll_epi64 (__m256i a, __m128i count) pure @safe

/// Shift packed 16-bit integers in `a` left by `imm8` while shifting in zeros.
// TODO verify
__m256i _mm256_slli_epi16(__m256i a, int imm8) pure @trusted
{
    static if (GDC_with_AVX2)
    {
        return cast(__m256i) __builtin_ia32_psllwi256(cast(short16)a, cast(ubyte)imm8);
    }
    else static if (LDC_with_AVX2)
    {
        return cast(__m256i) __builtin_ia32_psllwi256(cast(short16)a, cast(ubyte)imm8);
    }
    else
    {
        //PERF: ARM
        short16 sa  = cast(short16)a;
        short16 r   = cast(short16)_mm256_setzero_si256();
        ubyte count = cast(ubyte) imm8;
        if (count > 15)
            return cast(__m256i)r;
        foreach(i; 0..16)
            r.ptr[i] = cast(short)(sa.array[i] << count);
        return cast(__m256i)r;
    }
}
unittest
{
    __m256i A = _mm256_setr_epi16(0, 1, 2, 3, -4, -5, 6, 7, 0, 1, 2, 3, -4, -5, 6, 7);
    short16 B = cast(short16)( _mm256_slli_epi16(A, 1) );
    short16 B2 = cast(short16)( _mm256_slli_epi16(A, 1 + 256) );
    short[16] expectedB = [ 0, 2, 4, 6, -8, -10, 12, 14, 0, 2, 4, 6, -8, -10, 12, 14 ];
    assert(B.array == expectedB);
    assert(B2.array == expectedB);

    short16 C = cast(short16)( _mm256_slli_epi16(A, 16) );
    short[16] expectedC = [ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ];
    assert(C.array == expectedC);
}

/// Shift packed 32-bit integers in `a` left by `imm8` while shifting in zeros.
// TODO verify
__m256i _mm256_slli_epi32 (__m256i a, int imm8) pure @trusted
{
    static if (GDC_with_AVX2)
    {
        return cast(__m256i) __builtin_ia32_pslldi256(cast(int8)a, cast(ubyte)imm8);
    }
    else static if (LDC_with_AVX2)
    {
        return cast(__m256i) __builtin_ia32_pslldi256(cast(int8)a, cast(ubyte)imm8);
    }
    else
    {
        // Note: the intrinsics guarantee imm8[0..7] is taken, however
        //       D says "It's illegal to shift by the same or more bits
        //       than the size of the quantity being shifted"
        //       and it's UB instead.
        int8 a_int8 = cast(int8) a;
        int8 r      = cast(int8) _mm256_setzero_si256();

        ubyte count = cast(ubyte) imm8;
        if (count > 31)
            return cast(__m256i) r;

        foreach(i; 0..8)
            r.ptr[i] = cast(uint)(a_int8.array[i]) << count;
        return cast(__m256i) r;
    }
}
unittest
{
    __m256i A = _mm256_setr_epi32(0, 2, 3, -4, 0, 2, 3, -4);
    int8 B = cast(int8) _mm256_slli_epi32(A, 1);
    int8 B2 = cast(int8) _mm256_slli_epi32(A, 1 + 256);
    int[8] expectedB = [ 0, 4, 6, -8, 0, 4, 6, -8 ];
    assert(B.array == expectedB);
    assert(B2.array == expectedB);

    int8 C = cast(int8) _mm256_slli_epi32(A, 0);
    int[8] expectedC = [ 0, 2, 3, -4, 0, 2, 3, -4 ];
    assert(C.array == expectedC);

    int8 D = cast(int8) _mm256_slli_epi32(A, 65);
    int[8] expectedD = [ 0, 0, 0, 0, 0, 0, 0, 0 ];
    assert(D.array == expectedD);
}

// TODO __m256i _mm256_slli_epi64 (__m256i a, int imm8) pure @safe
// TODO __m256i _mm256_slli_si256 (__m256i a, const int imm8) pure @safe
// TODO __m128i _mm_sllv_epi32 (__m128i a, __m128i count) pure @safe
// TODO __m256i _mm256_sllv_epi32 (__m256i a, __m256i count) pure @safe
// TODO __m128i _mm_sllv_epi64 (__m128i a, __m128i count) pure @safe
// TODO __m256i _mm256_sllv_epi64 (__m256i a, __m256i count) pure @safe
// TODO __m256i _mm256_sra_epi16 (__m256i a, __m128i count) pure @safe
// TODO __m256i _mm256_sra_epi32 (__m256i a, __m128i count) pure @safe
// TODO __m256i _mm256_srai_epi16 (__m256i a, int imm8) pure @safe
// TODO __m256i _mm256_srai_epi32 (__m256i a, int imm8) pure @safe
// TODO __m128i _mm_srav_epi32 (__m128i a, __m128i count) pure @safe
// TODO __m256i _mm256_srav_epi32 (__m256i a, __m256i count) pure @safe
// TODO __m256i _mm256_srl_epi16 (__m256i a, __m128i count) pure @safe
// TODO __m256i _mm256_srl_epi32 (__m256i a, __m128i count) pure @safe
// TODO __m256i _mm256_srl_epi64 (__m256i a, __m128i count) pure @safe

/// Shift packed 16-bit integers in `a` right by `imm8` while shifting in zeros.
// TODO verify
__m256i _mm256_srli_epi16 (__m256i a, int imm8) pure @trusted
{
    static if (GDC_with_AVX2)
    {
        return cast(__m256i) __builtin_ia32_psrlwi256(cast(short16)a, cast(ubyte)imm8);
    }
    else static if (LDC_with_AVX2)
    {
        return cast(__m256i) __builtin_ia32_psrlwi256(cast(short16)a, cast(ubyte)imm8);
    }
    else
    {
        //PERF: ARM
        short16 sa  = cast(short16)a;
        ubyte count = cast(ubyte)imm8;
        short16 r   = cast(short16) _mm256_setzero_si256();
        if (count >= 16)
            return cast(__m256i)r;

        foreach(i; 0..16)
            r.ptr[i] = cast(short)(cast(ushort)(sa.array[i]) >> count);
        return cast(__m256i)r;
    }
}
unittest
{
    __m256i A = _mm256_setr_epi16(0, 1, 2, 3, -4, -5, 6, 7, 0, 1, 2, 3, -4, -5, 6, 7);
    short16 B = cast(short16) _mm256_srli_epi16(A, 1);
    short16 B2 = cast(short16) _mm256_srli_epi16(A, 1 + 256);
    short[16] expectedB = [ 0, 0, 1, 1, 0x7FFE, 0x7FFD, 3, 3, 0, 0, 1, 1, 0x7FFE, 0x7FFD, 3, 3 ];
    assert(B.array == expectedB);
    assert(B2.array == expectedB);

    short16 C = cast(short16) _mm256_srli_epi16(A, 16);
    short[16] expectedC = [ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ];
    assert(C.array == expectedC);

    short16 D = cast(short16) _mm256_srli_epi16(A, 0);
    short[16] expectedD = [ 0, 1, 2, 3, -4, -5, 6, 7, 0, 1, 2, 3, -4, -5, 6, 7 ];
    assert(D.array == expectedD);
}

/// Shift packed 32-bit integers in `a` right by `imm8` while shifting in zeros.
// TODO verify
__m256i _mm256_srli_epi32 (__m256i a, int imm8) pure @trusted
{
    static if (GDC_with_AVX2)
    {
        return cast(__m256i) __builtin_ia32_psrldi256(cast(int8)a, cast(ubyte)imm8);
    }
    else static if (LDC_with_AVX2)
    {
        return cast(__m256i) __builtin_ia32_psrldi256(cast(int8)a, cast(ubyte)imm8);
    }
    else
    {
        ubyte count = cast(ubyte) imm8;
        int8 a_int8 = cast(int8) a;

        // Note: the intrinsics guarantee imm8[0..7] is taken, however
        //       D says "It's illegal to shift by the same or more bits
        //       than the size of the quantity being shifted"
        //       and it's UB instead.
        int8 r = cast(int8) _mm256_setzero_si256();
        if (count >= 32)
            return cast(__m256i) r;
        r.ptr[0] = a_int8.array[0] >>> count;
        r.ptr[1] = a_int8.array[1] >>> count;
        r.ptr[2] = a_int8.array[2] >>> count;
        r.ptr[3] = a_int8.array[3] >>> count;
        r.ptr[4] = a_int8.array[4] >>> count;
        r.ptr[5] = a_int8.array[5] >>> count;
        r.ptr[6] = a_int8.array[6] >>> count;
        r.ptr[7] = a_int8.array[7] >>> count;
        return cast(__m256i) r;
    }
}
unittest
{
    __m256i A = _mm256_setr_epi32(0, 2, 3, -4, 0, 2, 3, -4);
    int8 B = cast(int8) _mm256_srli_epi32(A, 1);
    int8 B2 = cast(int8) _mm256_srli_epi32(A, 1 + 256);
    int[8] expectedB = [ 0, 1, 1, 0x7FFFFFFE, 0, 1, 1, 0x7FFFFFFE];
    assert(B.array == expectedB);
    assert(B2.array == expectedB);

    int8 C = cast(int8) _mm256_srli_epi32(A, 255);
    int[8] expectedC = [ 0, 0, 0, 0, 0, 0, 0, 0 ];
    assert(C.array == expectedC);
}

// TODO __m256i _mm256_srli_epi64 (__m256i a, int imm8) pure @safe
// TODO __m256i _mm256_srli_si256 (__m256i a, const int imm8) pure @safe
// TODO __m128i _mm_srlv_epi32 (__m128i a, __m128i count) pure @safe
// TODO __m256i _mm256_srlv_epi32 (__m256i a, __m256i count) pure @safe
// TODO __m128i _mm_srlv_epi64 (__m128i a, __m128i count) pure @safe
// TODO __m256i _mm256_srlv_epi64 (__m256i a, __m256i count) pure @safe
// TODO __m256i _mm256_stream_load_si256 (__m256i const* mem_addr) pure @safe
// TODO __m256i _mm256_sub_epi16 (__m256i a, __m256i b) pure @safe
// TODO __m256i _mm256_sub_epi32 (__m256i a, __m256i b) pure @safe
// TODO __m256i _mm256_sub_epi64 (__m256i a, __m256i b) pure @safe
// TODO __m256i _mm256_sub_epi8 (__m256i a, __m256i b) pure @safe
// TODO __m256i _mm256_subs_epi16 (__m256i a, __m256i b) pure @safe
// TODO __m256i _mm256_subs_epi8 (__m256i a, __m256i b) pure @safe
// TODO __m256i _mm256_subs_epu16 (__m256i a, __m256i b) pure @safe
// TODO __m256i _mm256_subs_epu8 (__m256i a, __m256i b) pure @safe
// TODO __m256i _mm256_unpackhi_epi16 (__m256i a, __m256i b) pure @safe
// TODO __m256i _mm256_unpackhi_epi32 (__m256i a, __m256i b) pure @safe
// TODO __m256i _mm256_unpackhi_epi64 (__m256i a, __m256i b) pure @safe
// TODO __m256i _mm256_unpackhi_epi8 (__m256i a, __m256i b) pure @safe
// TODO __m256i _mm256_unpacklo_epi16 (__m256i a, __m256i b) pure @safe
// TODO __m256i _mm256_unpacklo_epi32 (__m256i a, __m256i b) pure @safe
// TODO __m256i _mm256_unpacklo_epi64 (__m256i a, __m256i b) pure @safe
// TODO __m256i _mm256_unpacklo_epi8 (__m256i a, __m256i b) pure @safe

/// Compute the bitwise XOR of 256 bits (representing integer data) in `a` and `b`.
// TODO verify
__m256i _mm256_xor_si256 (__m256i a, __m256i b) pure @safe
{
    return a ^ b;
}
// TODO unittest and thus force inline
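
// Added minimal sketch of the missing test above (arbitrary values; each lane
// is just the element-wise XOR of the inputs):
unittest
{
    __m256i A = _mm256_setr_epi32(7, -2, 9, 54654, 7, -2, 9, 54654);
    __m256i B = _mm256_setr_epi32(14, 78, 111, -256, 14, 78, 111, -256);
    int8 R = cast(int8) _mm256_xor_si256(A, B);
    int[8] correct = [7 ^ 14, (-2) ^ 78, 9 ^ 111, 54654 ^ (-256),
                      7 ^ 14, (-2) ^ 78, 9 ^ 111, 54654 ^ (-256)];
    assert(R.array == correct);
}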


/+

pragma(LDC_intrinsic, "llvm.x86.avx2.gather.d.d")
int4 __builtin_ia32_gatherd_d(int4, const void*, int4, int4, byte);

pragma(LDC_intrinsic, "llvm.x86.avx2.gather.d.d.256")
int8 __builtin_ia32_gatherd_d256(int8, const void*, int8, int8, byte);

pragma(LDC_intrinsic, "llvm.x86.avx2.gather.d.pd")
double2 __builtin_ia32_gatherd_pd(double2, const void*, int4, double2, byte);

pragma(LDC_intrinsic, "llvm.x86.avx2.gather.d.pd.256")
double4 __builtin_ia32_gatherd_pd256(double4, const void*, int4, double4, byte);

pragma(LDC_intrinsic, "llvm.x86.avx2.gather.d.ps")
float4 __builtin_ia32_gatherd_ps(float4, const void*, int4, float4, byte);

pragma(LDC_intrinsic, "llvm.x86.avx2.gather.d.ps.256")
float8 __builtin_ia32_gatherd_ps256(float8, const void*, int8, float8, byte);

pragma(LDC_intrinsic, "llvm.x86.avx2.gather.d.q")
long2 __builtin_ia32_gatherd_q(long2, const void*, int4, long2, byte);

pragma(LDC_intrinsic, "llvm.x86.avx2.gather.d.q.256")
long4 __builtin_ia32_gatherd_q256(long4, const void*, int4, long4, byte);

pragma(LDC_intrinsic, "llvm.x86.avx2.gather.q.d")
int4 __builtin_ia32_gatherq_d(int4, const void*, long2, int4, byte);

pragma(LDC_intrinsic, "llvm.x86.avx2.gather.q.d.256")
int4 __builtin_ia32_gatherq_d256(int4, const void*, long4, int4, byte);

pragma(LDC_intrinsic, "llvm.x86.avx2.gather.q.pd")
double2 __builtin_ia32_gatherq_pd(double2, const void*, long2, double2, byte);

pragma(LDC_intrinsic, "llvm.x86.avx2.gather.q.pd.256")
double4 __builtin_ia32_gatherq_pd256(double4, const void*, long4, double4, byte);

pragma(LDC_intrinsic, "llvm.x86.avx2.gather.q.ps")
float4 __builtin_ia32_gatherq_ps(float4, const void*, long2, float4, byte);

pragma(LDC_intrinsic, "llvm.x86.avx2.gather.q.ps.256")
float4 __builtin_ia32_gatherq_ps256(float4, const void*, long4, float4, byte);

pragma(LDC_intrinsic, "llvm.x86.avx2.gather.q.q")
long2 __builtin_ia32_gatherq_q(long2, const void*, long2, long2, byte);

pragma(LDC_intrinsic, "llvm.x86.avx2.gather.q.q.256")
long4 __builtin_ia32_gatherq_q256(long4, const void*, long4, long4, byte);

pragma(LDC_intrinsic, "llvm.x86.avx2.maskload.d")
int4 __builtin_ia32_maskloadd(const void*, int4);

pragma(LDC_intrinsic, "llvm.x86.avx2.maskload.d.256")
int8 __builtin_ia32_maskloadd256(const void*, int8);

pragma(LDC_intrinsic, "llvm.x86.avx2.maskload.q")
long2 __builtin_ia32_maskloadq(const void*, long2);

pragma(LDC_intrinsic, "llvm.x86.avx2.maskload.q.256")
long4 __builtin_ia32_maskloadq256(const void*, long4);

pragma(LDC_intrinsic, "llvm.x86.avx2.maskstore.d")
void __builtin_ia32_maskstored(void*, int4, int4);

pragma(LDC_intrinsic, "llvm.x86.avx2.maskstore.d.256")
void __builtin_ia32_maskstored256(void*, int8, int8);

pragma(LDC_intrinsic, "llvm.x86.avx2.maskstore.q")
void __builtin_ia32_maskstoreq(void*, long2, long2);

pragma(LDC_intrinsic, "llvm.x86.avx2.maskstore.q.256")
void __builtin_ia32_maskstoreq256(void*, long4, long4);

pragma(LDC_intrinsic, "llvm.x86.avx2.mpsadbw")
short16 __builtin_ia32_mpsadbw256(byte32, byte32, byte) pure @safe;

pragma(LDC_intrinsic, "llvm.x86.avx2.packssdw")
short16 __builtin_ia32_packssdw256(int8, int8) pure @safe;

pragma(LDC_intrinsic, "llvm.x86.avx2.packsswb")
byte32 __builtin_ia32_packsswb256(short16, short16) pure @safe;

pragma(LDC_intrinsic, "llvm.x86.avx2.packusdw")
short16 __builtin_ia32_packusdw256(int8, int8) pure @safe;

pragma(LDC_intrinsic, "llvm.x86.avx2.packuswb")
byte32 __builtin_ia32_packuswb256(short16, short16) pure @safe;

pragma(LDC_intrinsic, "llvm.x86.avx2.pavg.b")
byte32 __builtin_ia32_pavgb256(byte32, byte32) pure @safe;

pragma(LDC_intrinsic, "llvm.x86.avx2.pavg.w")
short16 __builtin_ia32_pavgw256(short16, short16) pure @safe;

pragma(LDC_intrinsic, "llvm.x86.avx2.pblendvb")
byte32 __builtin_ia32_pblendvb256(byte32, byte32, byte32) pure @safe;

pragma(LDC_intrinsic, "llvm.x86.avx2.permd")
int8 __builtin_ia32_permvarsi256(int8, int8) pure @safe;

pragma(LDC_intrinsic, "llvm.x86.avx2.permps")
float8 __builtin_ia32_permvarsf256(float8, int8) pure @safe;

pragma(LDC_intrinsic, "llvm.x86.avx2.phadd.d")
int8 __builtin_ia32_phaddd256(int8, int8) pure @safe;

pragma(LDC_intrinsic, "llvm.x86.avx2.phadd.sw")
short16 __builtin_ia32_phaddsw256(short16, short16) pure @safe;

pragma(LDC_intrinsic, "llvm.x86.avx2.phadd.w")
short16 __builtin_ia32_phaddw256(short16, short16) pure @safe;

pragma(LDC_intrinsic, "llvm.x86.avx2.phsub.d")
int8 __builtin_ia32_phsubd256(int8, int8) pure @safe;

pragma(LDC_intrinsic, "llvm.x86.avx2.phsub.sw")
short16 __builtin_ia32_phsubsw256(short16, short16) pure @safe;

pragma(LDC_intrinsic, "llvm.x86.avx2.phsub.w")
short16 __builtin_ia32_phsubw256(short16, short16) pure @safe;

pragma(LDC_intrinsic, "llvm.x86.avx2.pmadd.ub.sw")
short16 __builtin_ia32_pmaddubsw256(byte32, byte32) pure @safe;

pragma(LDC_intrinsic, "llvm.x86.avx2.pmadd.wd")
int8 __builtin_ia32_pmaddwd256(short16, short16) pure @safe;

pragma(LDC_intrinsic, "llvm.x86.avx2.pmovmskb")
int __builtin_ia32_pmovmskb256(byte32) pure @safe;

pragma(LDC_intrinsic, "llvm.x86.avx2.pmul.hr.sw")
short16 __builtin_ia32_pmulhrsw256(short16, short16) pure @safe;

pragma(LDC_intrinsic, "llvm.x86.avx2.pmulh.w")
short16 __builtin_ia32_pmulhw256(short16, short16) pure @safe;

pragma(LDC_intrinsic, "llvm.x86.avx2.pmulhu.w")
short16 __builtin_ia32_pmulhuw256(short16, short16) pure @safe;

pragma(LDC_intrinsic, "llvm.x86.avx2.psad.bw")
long4 __builtin_ia32_psadbw256(byte32, byte32) pure @safe;

pragma(LDC_intrinsic, "llvm.x86.avx2.pshuf.b")
byte32 __builtin_ia32_pshufb256(byte32, byte32) pure @safe;

pragma(LDC_intrinsic, "llvm.x86.avx2.psign.b")
byte32 __builtin_ia32_psignb256(byte32, byte32) pure @safe;

pragma(LDC_intrinsic, "llvm.x86.avx2.psign.d")
int8 __builtin_ia32_psignd256(int8, int8) pure @safe;

pragma(LDC_intrinsic, "llvm.x86.avx2.psign.w")
short16 __builtin_ia32_psignw256(short16, short16) pure @safe;

pragma(LDC_intrinsic, "llvm.x86.avx2.psll.d")
int8 __builtin_ia32_pslld256(int8, int4) pure @safe;

pragma(LDC_intrinsic, "llvm.x86.avx2.psll.q")
long4 __builtin_ia32_psllq256(long4, long2) pure @safe;

pragma(LDC_intrinsic, "llvm.x86.avx2.psll.w")
short16 __builtin_ia32_psllw256(short16, short8) pure @safe;

pragma(LDC_intrinsic, "llvm.x86.avx2.pslli.d")
int8 __builtin_ia32_pslldi256(int8, int) pure @safe;

pragma(LDC_intrinsic, "llvm.x86.avx2.pslli.q")
long4 __builtin_ia32_psllqi256(long4, int) pure @safe;

pragma(LDC_intrinsic, "llvm.x86.avx2.pslli.w")
short16 __builtin_ia32_psllwi256(short16, int) pure @safe;

pragma(LDC_intrinsic, "llvm.x86.avx2.psllv.d")
int4 __builtin_ia32_psllv4si(int4, int4) pure @safe;

pragma(LDC_intrinsic, "llvm.x86.avx2.psllv.d.256")
int8 __builtin_ia32_psllv8si(int8, int8) pure @safe;

pragma(LDC_intrinsic, "llvm.x86.avx2.psllv.q")
long2 __builtin_ia32_psllv2di(long2, long2) pure @safe;

pragma(LDC_intrinsic, "llvm.x86.avx2.psllv.q.256")
long4 __builtin_ia32_psllv4di(long4, long4) pure @safe;

pragma(LDC_intrinsic, "llvm.x86.avx2.psra.d")
int8 __builtin_ia32_psrad256(int8, int4) pure @safe;

pragma(LDC_intrinsic, "llvm.x86.avx2.psra.w")
short16 __builtin_ia32_psraw256(short16, short8) pure @safe;

pragma(LDC_intrinsic, "llvm.x86.avx2.psrai.d")
int8 __builtin_ia32_psradi256(int8, int) pure @safe;

pragma(LDC_intrinsic, "llvm.x86.avx2.psrai.w")
short16 __builtin_ia32_psrawi256(short16, int) pure @safe;

pragma(LDC_intrinsic, "llvm.x86.avx2.psrav.d")
int4 __builtin_ia32_psrav4si(int4, int4) pure @safe;

pragma(LDC_intrinsic, "llvm.x86.avx2.psrav.d.256")
int8 __builtin_ia32_psrav8si(int8, int8) pure @safe;

pragma(LDC_intrinsic, "llvm.x86.avx2.psrl.d")
int8 __builtin_ia32_psrld256(int8, int4) pure @safe;

pragma(LDC_intrinsic, "llvm.x86.avx2.psrl.q")
long4 __builtin_ia32_psrlq256(long4, long2) pure @safe;

pragma(LDC_intrinsic, "llvm.x86.avx2.psrl.w")
short16 __builtin_ia32_psrlw256(short16, short8) pure @safe;

pragma(LDC_intrinsic, "llvm.x86.avx2.psrli.d")
int8 __builtin_ia32_psrldi256(int8, int) pure @safe;

pragma(LDC_intrinsic, "llvm.x86.avx2.psrli.q")
long4 __builtin_ia32_psrlqi256(long4, int) pure @safe;

pragma(LDC_intrinsic, "llvm.x86.avx2.psrli.w")
short16 __builtin_ia32_psrlwi256(short16, int) pure @safe;

pragma(LDC_intrinsic, "llvm.x86.avx2.psrlv.d")
int4 __builtin_ia32_psrlv4si(int4, int4) pure @safe;

pragma(LDC_intrinsic, "llvm.x86.avx2.psrlv.d.256")
int8 __builtin_ia32_psrlv8si(int8, int8) pure @safe;

pragma(LDC_intrinsic, "llvm.x86.avx2.psrlv.q")
long2 __builtin_ia32_psrlv2di(long2, long2) pure @safe;

pragma(LDC_intrinsic, "llvm.x86.avx2.psrlv.q.256")
long4 __builtin_ia32_psrlv4di(long4, long4) pure @safe;

+/