1 /**
2 * SSE4.1 intrinsics.
3 * https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#techs=SSE4_1
4 *
5 * Copyright: Guillaume Piolat 2021.
6 *            Johan Engelen 2021.
7 * License:   $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
8 */
9 module inteli.smmintrin;
10 
11 // SSE4.1 instructions
12 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#techs=SSE4_1
13 // Note: this header will work whether you have SSE4.1 enabled or not.
14 // With LDC, use "dflags-ldc": ["-mattr=+sse4.1"] or equivalent to actively
15 // generate SSE4.1 instructions.
16 // With GDC, use "dflags-gdc": ["-msse4.1"] or equivalent to generate SSE4.1 instructions.
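// For reference, a minimal dub.json fragment doing this could look as follows
// (the package and dependency names here are illustrative, not prescribed by this module):
//
//     {
//         "dependencies": { "intel-intrinsics": "*" },
//         "dflags-ldc": ["-mattr=+sse4.1"],
//         "dflags-gdc": ["-msse4.1"]
//     }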
17 
18 public import inteli.types;
19 import inteli.internals;
20 
21 // smmintrin pulls in all previous instruction set intrinsics.
22 public import inteli.tmmintrin;
23 
24 nothrow @nogc:
25 
26 enum int _MM_FROUND_TO_NEAREST_INT = 0x00; /// SSE4.1 rounding modes
27 enum int _MM_FROUND_TO_NEG_INF     = 0x01; /// ditto
28 enum int _MM_FROUND_TO_POS_INF     = 0x02; /// ditto
29 enum int _MM_FROUND_TO_ZERO        = 0x03; /// ditto
30 enum int _MM_FROUND_CUR_DIRECTION  = 0x04; /// ditto
31 enum int _MM_FROUND_RAISE_EXC      = 0x00; /// ditto
32 enum int _MM_FROUND_NO_EXC         = 0x08; /// ditto
33 
34 enum int _MM_FROUND_NINT      = (_MM_FROUND_RAISE_EXC | _MM_FROUND_TO_NEAREST_INT);
35 enum int _MM_FROUND_FLOOR     = (_MM_FROUND_RAISE_EXC | _MM_FROUND_TO_NEG_INF);
36 enum int _MM_FROUND_CEIL      = (_MM_FROUND_RAISE_EXC | _MM_FROUND_TO_POS_INF);
37 enum int _MM_FROUND_TRUNC     = (_MM_FROUND_RAISE_EXC | _MM_FROUND_TO_ZERO);
38 enum int _MM_FROUND_RINT      = (_MM_FROUND_RAISE_EXC | _MM_FROUND_CUR_DIRECTION);
39 enum int _MM_FROUND_NEARBYINT = (_MM_FROUND_NO_EXC    | _MM_FROUND_CUR_DIRECTION);
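
// Illustrative sketch (not one of the original tests): the composite constants above are
// meant to be passed as the compile-time rounding argument of the _mm_round_* intrinsics
// defined later in this module.
unittest
{
    __m128 A = _mm_setr_ps(1.7f, -1.7f, 2.3f, -2.3f);
    __m128 F = _mm_round_ps!_MM_FROUND_FLOOR(A); // round toward -infinity, like _mm_floor_ps
    __m128 C = _mm_round_ps!_MM_FROUND_CEIL(A);  // round toward +infinity, like _mm_ceil_ps
    float[4] correctF = [1.0f, -2.0f, 2.0f, -3.0f];
    float[4] correctC = [2.0f, -1.0f, 3.0f, -2.0f];
    assert(F.array == correctF);
    assert(C.array == correctC);
}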
40 
41 /// Blend packed 16-bit integers from `a` and `b` using control mask `imm8`, and store the results.
42 // Note: changed signature, GDC needs a compile-time value for imm8.
43 __m128i _mm_blend_epi16(int imm8)(__m128i a, __m128i b) @trusted
44 {
45     // PERF DMD
46     static if (GDC_with_SSE41)
47     {
48         return cast(__m128i) __builtin_ia32_pblendw128(cast(short8)a, cast(short8)b, imm8);
49     }
50     else 
51     {
52         // LDC x86: generates pblendw since LDC 1.1 -O2
53         short8 r;
54         short8 sa = cast(short8)a;
55         short8 sb = cast(short8)b;
56         for (int n = 0; n < 8; ++n)
57         {
58             r.ptr[n] = (imm8 & (1 << n)) ? sb.array[n] : sa.array[n];
59         }
60         return cast(__m128i)r;
61     }
62 }
63 unittest
64 {
65     __m128i A = _mm_setr_epi16(0, 1,  2,  3,  4,  5,  6,  7);
66     __m128i B = _mm_setr_epi16(8, 9, 10, 11, 12, 13, 14, 15);
67     short8 C = cast(short8) _mm_blend_epi16!147(A, B); // 10010011
68     short[8] correct =        [8, 9,  2,  3, 12,  5,  6, 15];
69     assert(C.array == correct);
70 }
71 
72 
73 /// Blend packed double-precision (64-bit) floating-point elements from `a` and `b` using control mask `imm8`.
74 // Note: changed signature, GDC needs a compile-time value for `imm8`.
75 __m128d _mm_blend_pd(int imm8)(__m128d a, __m128d b) @trusted
76 {
77     static assert(imm8 >= 0 && imm8 < 4);
78     // PERF DMD
79     static if (GDC_with_SSE41)
80     {
81         return cast(double2) __builtin_ia32_blendpd(cast(double2)a, cast(double2)b, imm8);
82     }
83     else
84     {
85         // LDC x86: blendpd since LDC 1.1 -O2, uses blendps after LDC 1.12
86         double2 r;
87         for (int n = 0; n < 2; ++n)
88         {
89             r.ptr[n] = (imm8 & (1 << n)) ? b.array[n] : a.array[n];
90         }
91         return cast(__m128d)r;
92     }
93 }
94 unittest
95 {
96     __m128d A = _mm_setr_pd(0, 1);
97     __m128d B = _mm_setr_pd(8, 9);
98     double2 C = _mm_blend_pd!2(A, B);
99     double[2] correct =    [0, 9];
100     assert(C.array == correct);
101 }
102 
103 
104 /// Blend packed single-precision (32-bit) floating-point elements from `a` and `b` using control mask `imm8`.
105 // Note: changed signature, GDC needs a compile-time value for imm8.
106 __m128 _mm_blend_ps(int imm8)(__m128 a, __m128 b) @trusted
107 {
108     // PERF DMD
109     static assert(imm8 >= 0 && imm8 < 16);
110     static if (GDC_with_SSE41)
111     {
112         return __builtin_ia32_blendps(a, b, imm8);
113     }
114     else version(LDC)
115     {
116         // LDC x86: generates blendps since LDC 1.1 -O2
117         //   arm64: pretty good, two instructions worst case
118         return shufflevectorLDC!(float4, (imm8 & 1) ? 4 : 0,
119                                          (imm8 & 2) ? 5 : 1,
120                                          (imm8 & 4) ? 6 : 2,
121                                          (imm8 & 8) ? 7 : 3)(a, b);
122     }
123     else
124     {
125         __m128 r; // PERF =void;
126         for (int n = 0; n < 4; ++n)
127         {
128             r.ptr[n] = (imm8 & (1 << n)) ? b.array[n] : a.array[n];
129         }
130         return r;
131     }
132 }
133 unittest
134 {
135     __m128 A = _mm_setr_ps(0, 1,  2,  3);
136     __m128 B = _mm_setr_ps(8, 9, 10, 11);
137     float4 C = cast(float4) _mm_blend_ps!13(A, B); // 1101
138     float[4] correct =    [8, 1, 10, 11];
139     assert(C.array == correct);
140 }
141 
142 /// Blend packed 8-bit integers from `a` and `b` using `mask`.
143 __m128i _mm_blendv_epi8 (__m128i a, __m128i b, __m128i mask) @trusted
144 {
145     // PERF DMD
146     /*static if (GDC_with_SSE41)
147     {
148         // This intrinsic does nothing in GDC 12.
149         // TODO report to GDC. No problem in GCC.
150         return cast(__m128i) __builtin_ia32_pblendvb128 (cast(ubyte16)a, cast(ubyte16)b, cast(ubyte16)mask);
151     }
152     else*/
153     static if (LDC_with_SSE41)
154     {
155         return cast(__m128i) __builtin_ia32_pblendvb(cast(byte16)a, cast(byte16)b, cast(byte16)mask);
156     }
157     else static if (LDC_with_ARM64)
158     {
159         // LDC arm64: two instructions since LDC 1.12 -O2
160         byte16 maskSX = vshrq_n_s8(cast(byte16)mask, 7);
161         return cast(__m128i) vbslq_s8(maskSX, cast(byte16)b, cast(byte16)a);
162     }
163     else
164     {
165         __m128i m = _mm_cmpgt_epi8(_mm_setzero_si128(), mask);
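        // Branch-free select: m is 0xFF where the mask byte is negative.
        // There, subs_epu8(a ^ b, 0xFF) saturates to 0 and the final xor with b yields b;
        // elsewhere subs_epu8(a ^ b, 0) stays a ^ b and the final xor with b yields a.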
166         return _mm_xor_si128(_mm_subs_epu8(_mm_xor_si128(a, b), m), b);
167     }
168 }
169 unittest
170 {
171     __m128i A = _mm_setr_epi8( 0,  1,  2,  3,  4,  5,  6,  7,  
172                                8,  9, 10, 11, 12, 13, 14, 15);
173     __m128i B = _mm_setr_epi8(16, 17, 18, 19, 20, 21, 22, 23, 
174                               24, 25, 26, 27, 28, 29, 30, 31);
175     __m128i M = _mm_setr_epi8( 1, -1,  1,  1, -4,  1, -8,  127,  
176                                1,  1, -1, -1,  4,  1,  8, -128);
177     byte16 R = cast(byte16) _mm_blendv_epi8(A, B, M);
178     byte[16] correct =      [  0, 17,  2,  3, 20,  5, 22,  7,
179                                8,  9, 26, 27, 12, 13, 14, 31 ];
180     assert(R.array == correct);
181 }
182 
183 
184 /// Blend packed double-precision (64-bit) floating-point elements from `a` and `b` using `mask`.
185 __m128d _mm_blendv_pd (__m128d a, __m128d b, __m128d mask) @trusted
186 {
187     // PERF DMD
188     static if (GDC_with_SSE42)
189     {
190         // PERF Amazingly enough, GCC/GDC generates the blendvpd instruction
191         // with -msse4.2 but not -msse4.1.
192         // Not sure what the reason is, and there is a replacement sequence anyway.
193         // Sounds like a bug.
194         return __builtin_ia32_blendvpd(a, b, mask);
195     }
196     else static if (LDC_with_SSE41)
197     {
198         return __builtin_ia32_blendvpd(a, b, mask);
199     }
200     else static if (LDC_with_ARM64)
201     {
202         long2 shift;
203         shift = 63;
204         long2 lmask = cast(long2)mask >> shift;
205         return cast(__m128d) vbslq_s64(lmask, cast(long2)b, cast(long2)a);
206     }
207     else
208     {
209         __m128d r; // PERF =void;
210         long2 lmask = cast(long2)mask;
211         for (int n = 0; n < 2; ++n)
212         {
213             r.ptr[n] = (lmask.array[n] < 0) ? b.array[n] : a.array[n];
214         }
215         return r;
216     }
217 }
218 unittest
219 {
220     __m128d A = _mm_setr_pd(1.0, 2.0);
221     __m128d B = _mm_setr_pd(3.0, 4.0);
222     __m128d M1 = _mm_setr_pd(-3.0, 2.0);
223     __m128d R1 = _mm_blendv_pd(A, B, M1);
224     double[2] correct1 = [3.0, 2.0];
225     assert(R1.array == correct1);
226 
227     // BUG: LDC _mm_blendv_pd doesn't work with a NaN mask on arm64 Linux for some unknown reason,
228     // but it does work on arm64 macOS. It yields different results despite
229     // FP seemingly not being used.
230     version(linux)
231     {}
232     else
233     {
234         __m128d M2 = _mm_setr_pd(double.nan, -double.nan);
235         __m128d R2 = _mm_blendv_pd(A, B, M2);
236         double[2] correct2 = [1.0, 4.0];
237         assert(R2.array == correct2);
238     }
239 }
240 
241 
242 /// Blend packed single-precision (32-bit) floating-point elements from `a` and `b` using `mask`.
243 __m128 _mm_blendv_ps (__m128 a, __m128 b, __m128 mask) @trusted
244 {
245     // PERF DMD
246     static if (GDC_with_SSE41)
247     {
248         return __builtin_ia32_blendvps(a, b, mask);
249     }
250     else static if (LDC_with_SSE41)
251     {
252         return __builtin_ia32_blendvps(a, b, mask);
253     }
254     else static if (LDC_with_ARM64)
255     {
256         int4 shift;
257         shift = 31;
258         int4 lmask = cast(int4)mask >> shift;
259         return cast(__m128) vbslq_s32(lmask, cast(int4)b, cast(int4)a);
260     }
261     else
262     {
263         __m128 r; // PERF =void;
264         int4 lmask = cast(int4)mask;
265         for (int n = 0; n < 4; ++n)
266         {
267             r.ptr[n] = (lmask.array[n] < 0) ? b.array[n] : a.array[n];
268         }
269         return r;
270     }
271 }
272 unittest
273 {
274     __m128 A  = _mm_setr_ps( 0.0f, 1.0f, 2.0f, 3.0f);
275     __m128 B  = _mm_setr_ps( 4.0f, 5.0f, 6.0f, 7.0f);
276     __m128 M1 = _mm_setr_ps(-3.0f, 2.0f, 1.0f, -10000.0f);
277     __m128 M2 = _mm_setr_ps(float.nan, -float.nan, -0.0f, +0.0f);
278     __m128 R1 = _mm_blendv_ps(A, B, M1);
279     __m128 R2 = _mm_blendv_ps(A, B, M2);
280     float[4] correct1 =    [ 4.0f, 1.0f, 2.0f, 7.0f];
281     float[4] correct2 =    [ 0.0f, 5.0f, 6.0f, 3.0f];
282     assert(R1.array == correct1);
283 
284     // BUG: like above, LDC _mm_blendv_ps doesn't work with a NaN mask on arm64 Linux for some unknown reason;
285     // it yields different results despite FP seemingly not being used.
286     version(linux)
287     {}
288     else
289     {
290         assert(R2.array == correct2);
291     }
292 }
293 
294 /// Round the packed double-precision (64-bit) floating-point elements in `a` up to an integer value, 
295 /// and store the results as packed double-precision floating-point elements.
296 __m128d _mm_ceil_pd (__m128d a) @trusted
297 {
298     static if (LDC_with_ARM64)
299     {
300         // LDC arm64 acceptable since 1.8 -O2
301         // Unfortunately x86 intrinsics force a round-trip back to double2
302         // ARM neon semantics wouldn't have that
303         long2 l = vcvtpq_s64_f64(a);
304         double2 r;
305         r.ptr[0] = l.array[0];
306         r.ptr[1] = l.array[1];
307         return r;
308     }
309     else
310     {
311         return _mm_round_pd!2(a);
312     }
313 }
314 unittest
315 {
316     __m128d A = _mm_setr_pd(1.3f, -2.12f);
317     __m128d B = _mm_setr_pd(53.6f, -2.7f);
318     A = _mm_ceil_pd(A);
319     B = _mm_ceil_pd(B);
320     double[2] correctA = [2.0, -2.0];
321     double[2] correctB = [54.0, -2.0];
322     assert(A.array == correctA);
323     assert(B.array == correctB);
324 }
325 
326 /// Round the packed single-precision (32-bit) floating-point elements in `a` up to an integer value, 
327 /// and store the results as packed single-precision floating-point elements.
328 __m128 _mm_ceil_ps (__m128 a) @trusted
329 {
330     static if (LDC_with_ARM64)
331     {
332         // LDC arm64 acceptable since 1.8 -O1
333         int4 l = vcvtpq_s32_f32(a);
334         float4 r;
335         r.ptr[0] = l.array[0];
336         r.ptr[1] = l.array[1];
337         r.ptr[2] = l.array[2];
338         r.ptr[3] = l.array[3];
339         return r;
340     }
341     else
342     {
343         return _mm_round_ps!2(a);
344     }
345 }
346 unittest
347 {
348     __m128 A = _mm_setr_ps(1.3f, -2.12f, 53.6f, -2.7f);
349     __m128 C = _mm_ceil_ps(A);
350     float[4] correct = [2.0f, -2.0f, 54.0f, -2.0f];
351     assert(C.array == correct);
352 }
353 
354 /// Round the lower double-precision (64-bit) floating-point element in `b` up to an integer value, 
355 /// store the result as a double-precision floating-point element in the lower element of result, 
356 /// and copy the upper element from `a` to the upper element of dst.
357 __m128d _mm_ceil_sd (__m128d a, __m128d b) @trusted
358 {
359     static if (LDC_with_ARM64)
360     {
361         a[0] = vcvtps_s64_f64(b[0]);
362         return a;
363     }
364     else
365     {
366         return _mm_round_sd!2(a, b);
367     }
368 }
369 unittest
370 {
371     __m128d A = _mm_setr_pd(1.3, -2.12);
372     __m128d B = _mm_setr_pd(53.6, -3.7);
373     __m128d C = _mm_ceil_sd(A, B);
374     double[2] correct = [54.0, -2.12];
375     assert(C.array == correct);
376 }
377 
378 /// Round the lower single-precision (32-bit) floating-point element in `b` up to an integer value,
379 /// store the result as a single-precision floating-point element in the lower element of result, 
380 /// and copy the upper 3 packed elements from `a` to the upper elements of result.
381 __m128 _mm_ceil_ss (__m128 a, __m128 b) @trusted
382 {
383     static if (LDC_with_ARM64)
384     {
385         a[0] = vcvtps_s32_f32(b[0]);
386         return a;
387     }
388     else
389     {
390         return _mm_round_ss!2(a, b);
391     }
392 }
393 unittest
394 {
395     __m128 A = _mm_setr_ps(1.3f, -2.12f, -4.5f, 1.1f);
396     __m128 B = _mm_setr_ps(53.6f, -3.7f, 8.0f, 7.0f);
397     __m128 C = _mm_ceil_ss(A, B);
398     float[4] correct = [54.0f, -2.12f, -4.5f, 1.1f];
399     assert(C.array == correct);
400 }
401 
402 /// Compare packed 64-bit integers in `a` and `b` for equality.
403 __m128i _mm_cmpeq_epi64 (__m128i a, __m128i b) @trusted
404 {
405     // PERF DMD
406     static if (GDC_with_SSE41)
407     {
408         return cast(__m128i)__builtin_ia32_pcmpeqq(cast(long2)a, cast(long2)b);
409     }
410     else version(LDC)
411     {
412         // LDC x86: generates pcmpeqq since LDC 1.1 -O1
413         //     arm64: generates cmeq since LDC 1.8 -O1
414         return cast(__m128i) equalMask!long2(cast(long2)a, cast(long2)b);
415     }
416     else
417     {
418         // Clever pcmpeqd + pand use with LDC 1.24 -O2
419         long2 la = cast(long2)a;
420         long2 lb = cast(long2)b;
421         long2 res;
422         res.ptr[0] = (la.array[0] == lb.array[0]) ? -1 : 0;
423         res.ptr[1] = (la.array[1] == lb.array[1]) ? -1 : 0;
424         return cast(__m128i)res;
425     }
426 }
427 unittest
428 {
429     __m128i A = _mm_setr_epi64(-1, -2);
430     __m128i B = _mm_setr_epi64(-3, -2);
431     __m128i C = _mm_setr_epi64(-1, -4);
432     long2 AB = cast(long2) _mm_cmpeq_epi64(A, B);
433     long2 AC = cast(long2) _mm_cmpeq_epi64(A, C);
434     long[2] correct1 = [0, -1];
435     long[2] correct2 = [-1, 0];
436     assert(AB.array == correct1);
437     assert(AC.array == correct2);
438 }
439 
440 
441 /// Sign extend packed 16-bit integers in `a` to packed 32-bit integers.
442 __m128i _mm_cvtepi16_epi32 (__m128i a) @trusted
443 {
444     // PERF DMD
445     static if (GDC_with_SSE41)
446     {
447         return cast(__m128i)__builtin_ia32_pmovsxwd128(cast(short8)a);
448     }
449     else version(LDC)
450     {
451         // LDC x86: Generates pmovsxwd since LDC 1.1 -O0, also good in arm64
452         enum ir = `
453             %v = shufflevector <8 x i16> %0,<8 x i16> %0, <4 x i32> <i32 0, i32 1,i32 2, i32 3>
454             %r = sext <4 x i16> %v to <4 x i32>
455             ret <4 x i32> %r`;
456         return cast(__m128i) LDCInlineIR!(ir, int4, short8)(cast(short8)a);
457     }
458     else
459     {
460         short8 sa = cast(short8)a;
461         int4 r;
462         r.ptr[0] = sa.array[0];
463         r.ptr[1] = sa.array[1];
464         r.ptr[2] = sa.array[2];
465         r.ptr[3] = sa.array[3];
466         return r;
467     }
468 }
469 unittest
470 {
471     __m128i A = _mm_setr_epi16(-1, 0, -32768, 32767, 0, 0, 0, 0);
472     int4 C = cast(int4) _mm_cvtepi16_epi32(A);
473     int[4] correct = [-1, 0, -32768, 32767];
474     assert(C.array == correct);
475 }
476 
477 /// Sign extend packed 16-bit integers in `a` to packed 64-bit integers.
478 __m128i _mm_cvtepi16_epi64 (__m128i a) @trusted
479 {
480     // PERF DMD
481     static if (GDC_with_SSE41)
482     {
483         return cast(__m128i)__builtin_ia32_pmovsxwq128(cast(short8)a);
484     }
485     else version(LDC)
486     {
487         // LDC x86: Generates pmovsxwq since LDC 1.1 -O0, also good in arm64
488         enum ir = `
489             %v = shufflevector <8 x i16> %0,<8 x i16> %0, <2 x i32> <i32 0, i32 1>
490             %r = sext <2 x i16> %v to <2 x i64>
491             ret <2 x i64> %r`;
492         return cast(__m128i) LDCInlineIR!(ir, long2, short8)(cast(short8)a);
493     }
494     else
495     {
496         short8 sa = cast(short8)a;
497         long2 r;
498         r.ptr[0] = sa.array[0];
499         r.ptr[1] = sa.array[1];
500         return cast(__m128i)r;
501     }
502 }
503 unittest
504 {
505     __m128i A = _mm_setr_epi16(-32768, 32767, 0, 0, 0, 0, 0, 0);
506     long2 C = cast(long2) _mm_cvtepi16_epi64(A);
507     long[2] correct = [-32768, 32767];
508     assert(C.array == correct);
509 }
510 
511 /// Sign extend packed 32-bit integers in `a` to packed 64-bit integers.
512 __m128i _mm_cvtepi32_epi64 (__m128i a) @trusted
513 {
514     // PERF DMD
515     static if (GDC_with_SSE41)
516     {
517         return cast(__m128i)__builtin_ia32_pmovsxdq128(cast(int4)a);
518     }
519     else version(LDC)
520     {
521         // LDC x86: Generates pmovsxdq since LDC 1.1 -O0, also good in arm64
522         enum ir = `
523             %v = shufflevector <4 x i32> %0,<4 x i32> %0, <2 x i32> <i32 0, i32 1>
524             %r = sext <2 x i32> %v to <2 x i64>
525             ret <2 x i64> %r`;
526         return cast(__m128i) LDCInlineIR!(ir, long2, int4)(cast(int4)a);
527     }
528     else
529     {
530         int4 sa = cast(int4)a;
531         long2 r;
532         r.ptr[0] = sa.array[0];
533         r.ptr[1] = sa.array[1];
534         return cast(__m128i)r;
535     }
536 }
537 unittest
538 {
539     __m128i A = _mm_setr_epi32(-4, 42, 0, 0);
540     long2 C = cast(long2) _mm_cvtepi32_epi64(A);
541     long[2] correct = [-4, 42];
542     assert(C.array == correct);
543 }
544 
545 
546 /// Sign extend packed 8-bit integers in `a` to packed 16-bit integers.
547 __m128i _mm_cvtepi8_epi16 (__m128i a) @trusted
548 {
549     // PERF DMD
550     static if (GDC_with_SSE41)
551     {
552         alias ubyte16 = __vector(ubyte[16]);
553         return cast(__m128i)__builtin_ia32_pmovsxbw128(cast(ubyte16)a);
554     }
555     else version(LDC)
556     {
557         // LDC x86: pmovsxbw generated since LDC 1.1.0 -O0 
558         // LDC ARM64: sshll generated since LDC 1.8.0 -O1
559         enum ir = `
560             %v = shufflevector <16 x i8> %0,<16 x i8> %0, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
561             %r = sext <8 x i8> %v to <8 x i16>
562             ret <8 x i16> %r`;
563         return cast(__m128i) LDCInlineIR!(ir, short8, byte16)(cast(byte16)a);
564     }
565     else
566     {
567         byte16 sa = cast(byte16)a;
568         short8 r;
569         foreach(n; 0..8)
570             r.ptr[n] = sa.array[n];
571         return cast(__m128i)r;
572     }
573 }
574 unittest
575 {
576     __m128i A = _mm_setr_epi8(127, -128, 1, -1, 0, 2, -4, -8, 0, 0, 0, 0, 0, 0, 0, 0);
577     short8 C = cast(short8) _mm_cvtepi8_epi16(A);
578     short[8] correct = [127, -128, 1, -1, 0, 2, -4, -8];
579     assert(C.array == correct);
580 }
581 
582 
583 /// Sign extend packed 8-bit integers in `a` to packed 32-bit integers.
584 __m128i _mm_cvtepi8_epi32 (__m128i a) @trusted
585 {
586     // PERF DMD
587     static if (GDC_with_SSE41)
588     {
589         alias ubyte16 = __vector(ubyte[16]);
590         return cast(__m128i)__builtin_ia32_pmovsxbd128(cast(ubyte16)a);
591     }
592     else static if (LDC_with_SSE41)
593     {
594         // LDC x86: Generates pmovsxbd since LDC 1.1 -O0
595         enum ir = `
596             %v = shufflevector <16 x i8> %0,<16 x i8> %0, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
597             %r = sext <4 x i8> %v to <4 x i32>
598             ret <4 x i32> %r`;
599         return cast(__m128i) LDCInlineIR!(ir, int4, byte16)(cast(byte16)a);
600     }
601     else
602     {
603         // LDC ARM64: this gives the same codegen as a vmovl_s16/vmovl_s8 sequence would
604         byte16 sa = cast(byte16)a;
605         int4 r;
606         r.ptr[0] = sa.array[0];
607         r.ptr[1] = sa.array[1];
608         r.ptr[2] = sa.array[2];
609         r.ptr[3] = sa.array[3];
610         return cast(__m128i)r;
611     }
612 }
613 unittest
614 {
615     __m128i A = _mm_setr_epi8(127, -128, 1, -1, 0, 2, -4, -8, 0, 0, 0, 0, 0, 0, 0, 0);
616     int4 C = cast(int4) _mm_cvtepi8_epi32(A);
617     int[4] correct = [127, -128, 1, -1];
618     assert(C.array == correct);
619 }
620 
621 
622 /// Sign extend packed 8-bit integers in the low 8 bytes of `a` to packed 64-bit integers.
623 __m128i _mm_cvtepi8_epi64 (__m128i a) @trusted
624 {
625     // PERF DMD
626     static if (GDC_with_SSE41)
627     {
628         alias ubyte16 = __vector(ubyte[16]);
629         return cast(__m128i)__builtin_ia32_pmovsxbq128(cast(ubyte16)a);
630     }
631     else version(LDC)
632     {
633         // LDC x86: Generates pmovsxbq since LDC 1.1 -O0, 
634         // LDC arm64: it's ok since LDC 1.8 -O1
635         enum ir = `
636             %v = shufflevector <16 x i8> %0,<16 x i8> %0, <2 x i32> <i32 0, i32 1>
637             %r = sext <2 x i8> %v to <2 x i64>
638             ret <2 x i64> %r`;
639         return cast(__m128i) LDCInlineIR!(ir, long2, byte16)(cast(byte16)a);
640     }
641     else
642     {
643         byte16 sa = cast(byte16)a;
644         long2 r;
645         foreach(n; 0..2)
646             r.ptr[n] = sa.array[n];
647         return cast(__m128i)r;
648     }
649 }
650 unittest
651 {
652     __m128i A = _mm_setr_epi8(127, -128, 1, -1, 0, 2, -4, -8, 0, 0, 0, 0, 0, 0, 0, 0);
653     long2 C = cast(long2) _mm_cvtepi8_epi64(A);
654     long[2] correct = [127, -128];
655     assert(C.array == correct);
656 }
657 
658 
659 /// Zero extend packed unsigned 16-bit integers in `a` to packed 32-bit integers.
660 __m128i _mm_cvtepu16_epi32 (__m128i a) @trusted
661 {
662     // PERF DMD
663     static if (GDC_with_SSE41)
664     {
665         return cast(__m128i) __builtin_ia32_pmovzxwd128(cast(short8)a);
666     }
667     else
668     {
669         // LDC x86: generates pmovzxwd since LDC 1.12 -O1 also good without SSE4.1
670         //     arm64: ushll since LDC 1.12 -O1
671         short8 sa = cast(short8)a;
672         int4 r;
673         r.ptr[0] = cast(ushort)sa.array[0];
674         r.ptr[1] = cast(ushort)sa.array[1];
675         r.ptr[2] = cast(ushort)sa.array[2];
676         r.ptr[3] = cast(ushort)sa.array[3];
677         return cast(__m128i)r;
678     }
679 }
680 unittest
681 {
682     __m128i A = _mm_setr_epi16(-1, 0, -32768, 32767, 0, 0, 0, 0);
683     int4 C = cast(int4) _mm_cvtepu16_epi32(A);
684     int[4] correct = [65535, 0, 32768, 32767];
685     assert(C.array == correct);
686 }
687 
688 
689 /// Zero extend packed unsigned 16-bit integers in `a` to packed 64-bit integers.
690 __m128i _mm_cvtepu16_epi64 (__m128i a) @trusted
691 {
692     // PERF DMD
693     static if (GDC_with_SSE41)
694     {
695         return cast(__m128i) __builtin_ia32_pmovzxwq128(cast(short8)a);
696     }
697     else static if (LDC_with_ARM64)
698     {
699         // LDC arm64: a bit shorter than below, in -O2
700         short8 sa = cast(short8)a;
701         long2 r;
702         for(int n = 0; n < 2; ++n)
703             r.ptr[n] = cast(ushort)sa.array[n];
704         return cast(__m128i)r;
705     }
706     else
707     {
708         // LDC x86: generates pmovzxwq since LDC 1.12 -O1, also good without SSE4.1
709         short8 sa = cast(short8)a;
710         long2 r;
711         r.ptr[0] = cast(ushort)sa.array[0];
712         r.ptr[1] = cast(ushort)sa.array[1];
713         return cast(__m128i)r;
714     }
715 }
716 unittest
717 {
718     __m128i A = _mm_setr_epi16(-1, 0, -32768, 32767, 0, 0, 0, 0);
719     long2 C = cast(long2) _mm_cvtepu16_epi64(A);
720     long[2] correct = [65535, 0];
721     assert(C.array == correct);
722 }
723 
724 
725 /// Zero extend packed unsigned 32-bit integers in `a` to packed 64-bit integers.
726 __m128i _mm_cvtepu32_epi64 (__m128i a) @trusted
727 {
728     // PERF DMD
729     static if (GDC_with_SSE41)
730     {
731         return cast(__m128i) __builtin_ia32_pmovzxdq128(cast(int4)a);
732     }
733     else
734     {
735         // LDC x86: generates pmovzxdq since LDC 1.12 -O1 also good without SSE4.1
736         //     arm64: generates ushll since LDC 1.12 -O1
737         int4 sa = cast(int4)a;
738         long2 r;
739         r.ptr[0] = cast(uint)sa.array[0];
740         r.ptr[1] = cast(uint)sa.array[1];
741         return cast(__m128i)r;
742     }
743 }
744 unittest
745 {
746     __m128i A = _mm_setr_epi32(-1, 42, 0, 0);
747     long2 C = cast(long2) _mm_cvtepu32_epi64(A);
748     long[2] correct = [4294967295, 42];
749     assert(C.array == correct);
750 }
751 
752 
753 /// Zero extend packed unsigned 8-bit integers in `a` to packed 16-bit integers.
754 __m128i _mm_cvtepu8_epi16 (__m128i a) @trusted
755 {
756     // PERF DMD
757     static if (GDC_with_SSE41)
758     {
759         return cast(__m128i) __builtin_ia32_pmovzxbw128(cast(ubyte16)a);
760     }
761     else
762     {
763         // LDC x86: generates pmovzxbw since LDC 1.12 -O1 also good without SSE4.1
764         //     arm64: ushll since LDC 1.12 -O1
765         // PERF: catastrophic with GDC without SSE4.1
766         byte16 sa = cast(byte16)a;
767         short8 r;
768         r.ptr[0] = cast(ubyte)sa.array[0];
769         r.ptr[1] = cast(ubyte)sa.array[1];
770         r.ptr[2] = cast(ubyte)sa.array[2];
771         r.ptr[3] = cast(ubyte)sa.array[3];
772         r.ptr[4] = cast(ubyte)sa.array[4];
773         r.ptr[5] = cast(ubyte)sa.array[5];
774         r.ptr[6] = cast(ubyte)sa.array[6];
775         r.ptr[7] = cast(ubyte)sa.array[7];
776         return cast(__m128i)r;
777     }
778 }
779 unittest
780 {
781     __m128i A = _mm_setr_epi8(127, -128, 1, -1, 0, 2, -4, -8, 0, 0, 0, 0, 0, 0, 0, 0);
782     short8 C = cast(short8) _mm_cvtepu8_epi16(A);
783     short[8] correct = [127, 128, 1, 255, 0, 2, 252, 248];
784     assert(C.array == correct);
785 }
786 
787 
788 /// Zero extend packed unsigned 8-bit integers in `a` to packed 32-bit integers.
789 __m128i _mm_cvtepu8_epi32 (__m128i a) @trusted
790 {
791     // PERF DMD
792     static if (GDC_with_SSE41)
793     {
794         alias ubyte16 = __vector(ubyte[16]);
795         return cast(__m128i) __builtin_ia32_pmovzxbd128(cast(ubyte16)a);
796     }
797     else static if (LDC_with_ARM64)
798     {
799         // LDC arm64: a bit better than below in -O2
800         byte16 sa = cast(byte16)a;
801         int4 r;
802         for(int n = 0; n < 4; ++n) 
803             r.ptr[n] = cast(ubyte)sa.array[n];
804         return cast(__m128i)r;
805     }
806     else
807     {
808         // LDC x86: generates pmovzxbd since LDC 1.12 -O1 also good without SSE4.1
809         // PERF: catastrophic with GDC without SSE4.1
810         byte16 sa = cast(byte16)a;
811         int4 r;
812         r.ptr[0] = cast(ubyte)sa.array[0];
813         r.ptr[1] = cast(ubyte)sa.array[1];
814         r.ptr[2] = cast(ubyte)sa.array[2];
815         r.ptr[3] = cast(ubyte)sa.array[3];
816         return cast(__m128i)r;
817     }
818 }
819 unittest
820 {
821     __m128i A = _mm_setr_epi8(127, -128, 1, -1, 0, 2, -4, -8, 0, 0, 0, 0, 0, 0, 0, 0);
822     int4 C = cast(int4) _mm_cvtepu8_epi32(A);
823     int[4] correct = [127, 128, 1, 255];
824     assert(C.array == correct);
825 }
826 
827 /// Zero extend packed unsigned 8-bit integers in the low 8 bytes of `a` to packed 64-bit integers.
828 __m128i _mm_cvtepu8_epi64 (__m128i a) @trusted
829 {
830     // PERF DMD
831     static if (GDC_with_SSE41)
832     {
833         alias ubyte16 = __vector(ubyte[16]);
834         return cast(__m128i)__builtin_ia32_pmovzxbq128(cast(ubyte16)a);
835     }
836     else static if (LDC_with_ARM64)
837     {
838         // LDC arm64: this optimizes better than the loop below
839         byte16 sa = cast(byte16)a;
840         long2 r;
841         for (int n = 0; n < 2; ++n)
842             r.ptr[n] = cast(ubyte)sa.array[n];
843         return cast(__m128i)r;
844     }
845     else
846     {
847         // LDC x86: Generates pmovzxbq since LDC 1.1 -O0, a pshufb without SSE4.1
848         byte16 sa = cast(byte16)a;
849         long2 r;
850         r.ptr[0] = cast(ubyte)sa.array[0];
851         r.ptr[1] = cast(ubyte)sa.array[1];
852         return cast(__m128i)r;
853     }
854 }
855 unittest
856 {
857     __m128i A = _mm_setr_epi8(127, -2, 1, -1, 0, 2, -4, -8, 0, 0, 0, 0, 0, 0, 0, 0);
858     long2 C = cast(long2) _mm_cvtepu8_epi64(A);
859     long[2] correct = [127, 254];
860     assert(C.array == correct);
861 }
862 
863 /// Conditionally multiply the packed double-precision (64-bit) floating-point elements 
864 /// in `a` and `b` using the high 4 bits in `imm8`, sum the two products, and conditionally
865 /// store the sum in dst using the low 4 bits of `imm8`.
866 __m128d _mm_dp_pd(int imm8)(__m128d a, __m128d b) @trusted
867 {
868     // PERF DMD
869     static if (GDC_with_SSE41)
870     {
871         return __builtin_ia32_dppd(a, b, imm8 & 0x33);
872     }
873     else static if (LDC_with_SSE41)
874     {
875         return __builtin_ia32_dppd(a, b, imm8 & 0x33);
876     }
877     else
878     {
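        // Generic fallback: keep only the products selected by imm8[5:4] via a blend with zero,
        // sum the two lanes, then write the sum into the lanes selected by imm8[1:0].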
879         __m128d zero = _mm_setzero_pd();
880         __m128d temp = _mm_blend_pd!( (imm8 >>> 4) & 3)(zero, a * b);
881         double sum = temp.array[0] + temp.array[1];
882         return _mm_blend_pd!(imm8 & 3)(zero, _mm_set1_pd(sum));
883     }
884 }
885 unittest
886 {
887     __m128d A = _mm_setr_pd(1.0, 2.0);
888     __m128d B = _mm_setr_pd(4.0, 8.0);
889     double2 R1 = _mm_dp_pd!(0x10 + 0x3 + 0x44)(A, B);
890     double2 R2 = _mm_dp_pd!(0x20 + 0x1 + 0x88)(A, B);
891     double2 R3 = _mm_dp_pd!(0x30 + 0x2 + 0x00)(A, B);
892     double[2] correct1 = [ 4.0,  4.0];
893     double[2] correct2 = [16.0,  0.0];
894     double[2] correct3 = [ 0.0, 20.0];
895     assert(R1.array == correct1);
896     assert(R2.array == correct2);
897     assert(R3.array == correct3);
898 }
899 
900 /// Conditionally multiply the packed single-precision (32-bit) floating-point elements 
901 /// in `a` and `b` using the high 4 bits in `imm8`, sum the four products, 
902 /// and conditionally store the sum in result using the low 4 bits of `imm8`.
903 __m128 _mm_dp_ps(int imm8)(__m128 a, __m128 b) @trusted
904 {
905     // PERF DMD
906     static if (GDC_with_SSE41)
907     {
908         return __builtin_ia32_dpps(a, b, cast(ubyte)imm8);
909     }
910     else static if (LDC_with_SSE41)
911     {
912         return __builtin_ia32_dpps(a, b, cast(byte)imm8);
913     }
914     else
915     {
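        // Generic fallback: same idea as _mm_dp_pd, keep only the products selected by imm8[7:4],
        // sum them, and write the sum into the lanes selected by imm8[3:0].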
916         __m128 zero = _mm_setzero_ps();
917         __m128 temp = _mm_blend_ps!( (imm8 >>> 4) & 15)(zero, a * b);
918         float sum = temp.array[0] + temp.array[1] + temp.array[2] + temp.array[3];
919         return _mm_blend_ps!(imm8 & 15)(zero, _mm_set1_ps(sum));
920     }        
921 }
922 unittest
923 {
924     __m128 A = _mm_setr_ps(1.0f, 2.0f, 4.0f, 8.0f);
925     __m128 B = _mm_setr_ps(9.0f, 7.0f, 5.0f, 3.0f);
926     float4 R1 = _mm_dp_ps!(0xf0 + 0xf)(A, B);
927     float4 R2 = _mm_dp_ps!(0x30 + 0x5)(A, B);
928     float4 R3 = _mm_dp_ps!(0x50 + 0xa)(A, B);
929     float[4] correct1 =   [67.0f, 67.0f, 67.0f, 67.0f];
930     float[4] correct2 =   [23.0f, 0.0f, 23.0f, 0.0f];
931     float[4] correct3 =   [0.0f, 29.0f, 0.0f, 29.0f];
932     assert(R1.array == correct1);
933     assert(R2.array == correct2);
934     assert(R3.array == correct3);
935 }
936 
937 
938 /// Extract a 32-bit integer from `a`, selected with `imm8`.
939 int _mm_extract_epi32 (__m128i a, const int imm8) pure @trusted
940 {
941     return (cast(int4)a).array[imm8 & 3];
942 }
943 unittest
944 {
945     __m128i A = _mm_setr_epi32(1, 2, 3, 4);
946     assert(_mm_extract_epi32(A, 0) == 1);
947     assert(_mm_extract_epi32(A, 1 + 8) == 2);
948     assert(_mm_extract_epi32(A, 3 + 4) == 4);
949 }
950 
951 /// Extract a 64-bit integer from `a`, selected with `imm8`.
952 long _mm_extract_epi64 (__m128i a, const int imm8) pure @trusted
953 {
954     long2 la = cast(long2)a;
955     return la.array[imm8 & 1];
956 }
957 unittest
958 {
959     __m128i A = _mm_setr_epi64(45, -67);
960     assert(_mm_extract_epi64(A, 0) == 45);
961     assert(_mm_extract_epi64(A, 1) == -67);
962     assert(_mm_extract_epi64(A, 2) == 45);
963 }
964 
965 /// Extract an 8-bit integer from `a`, selected with `imm8`.
966 /// Warning: the returned value is zero-extended to 32-bits.
967 int _mm_extract_epi8 (__m128i a, const int imm8) @trusted
968 {
969     byte16 ba = cast(byte16)a;
970     return cast(ubyte) ba.array[imm8 & 15];
971 }
972 unittest
973 {
974     __m128i A = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, -1, 14, 15);
975     assert(_mm_extract_epi8(A, 7) == 7);
976     assert(_mm_extract_epi8(A, 13) == 255);
977     assert(_mm_extract_epi8(A, 7 + 16) == 7);
978 }
979 
980 /// Extract a single-precision (32-bit) floating-point element from `a`, selected with `imm8`.
981 /// Note: returns a 32-bit $(I integer).
982 int _mm_extract_ps (__m128 a, const int imm8) @trusted
983 {
984     return (cast(int4)a).array[imm8 & 3];
985 }
986 unittest
987 {
988     __m128 A = _mm_setr_ps(1.0f, 2.0f, 3.0f, -4.0f);
989     assert(_mm_extract_ps(A, 0) == 0x3f800000);
990     assert(_mm_extract_ps(A, 1 + 8) == 0x40000000);
991     assert(_mm_extract_ps(A, 3 + 4) == cast(int)0xc0800000);
992 }
993 
994 
995 
996 /// Round the packed double-precision (64-bit) floating-point elements in `a` down to an 
997 /// integer value, and store the results as packed double-precision floating-point elements.
998 __m128d _mm_floor_pd (__m128d a) @trusted
999 {
1000     static if (LDC_with_ARM64)
1001     {
1002         // LDC arm64 acceptable since 1.8 -O2
1003         long2 l = vcvtmq_s64_f64(a);
1004         double2 r;
1005         r.ptr[0] = l.array[0];
1006         r.ptr[1] = l.array[1];
1007         return r;
1008     }
1009     else
1010     {
1011         return _mm_round_pd!1(a);
1012     }
1013 }
1014 unittest
1015 {
1016     __m128d A = _mm_setr_pd(1.3f, -2.12f);
1017     __m128d B = _mm_setr_pd(53.6f, -2.7f);
1018     A = _mm_floor_pd(A);
1019     B = _mm_floor_pd(B);
1020     double[2] correctA = [1.0, -3.0];
1021     double[2] correctB = [53.0, -3.0];
1022     assert(A.array == correctA);
1023     assert(B.array == correctB);
1024 }
1025 
1026 /// Round the packed single-precision (32-bit) floating-point elements in `a` down to an 
1027 /// integer value, and store the results as packed single-precision floating-point elements.
1028 __m128 _mm_floor_ps (__m128 a) @trusted
1029 {
1030     static if (LDC_with_ARM64)
1031     {
1032         // LDC arm64 acceptable since 1.8 -O1
1033         int4 l = vcvtmq_s32_f32(a);
1034         float4 r;
1035         r.ptr[0] = l.array[0];
1036         r.ptr[1] = l.array[1];
1037         r.ptr[2] = l.array[2];
1038         r.ptr[3] = l.array[3];
1039         return r;
1040     }
1041     else
1042     {
1043         return _mm_round_ps!1(a);
1044     }
1045 }
1046 unittest
1047 {
1048     __m128 A = _mm_setr_ps(1.3f, -2.12f, 53.6f, -2.7f);
1049     __m128 C = _mm_floor_ps(A);
1050     float[4] correct = [1.0f, -3.0f, 53.0f, -3.0f];
1051     assert(C.array == correct);
1052 }
1053 
1054 /// Round the lower double-precision (64-bit) floating-point element in `b` down to an 
1055 /// integer value, store the result as a double-precision floating-point element in the 
1056 /// lower element, and copy the upper element from `a` to the upper element.
1057 __m128d _mm_floor_sd (__m128d a, __m128d b) @trusted
1058 {
1059     static if (LDC_with_ARM64)
1060     {
1061         a[0] = vcvtms_s64_f64(b[0]);
1062         return a;
1063     }
1064     else
1065     {
1066         return _mm_round_sd!1(a, b);
1067     }
1068 }
1069 unittest
1070 {
1071     __m128d A = _mm_setr_pd(1.3, -2.12);
1072     __m128d B = _mm_setr_pd(-53.1, -3.7);
1073     __m128d C = _mm_floor_sd(A, B);
1074     double[2] correct = [-54.0, -2.12];
1075     assert(C.array == correct);
1076 }
1077 
1078 /// Round the lower single-precision (32-bit) floating-point element in `b` down to an
1079 /// integer value, store the result as a single-precision floating-point element in the
1080 /// lower element, and copy the upper 3 packed elements from `a` to the upper elements.
1081 __m128 _mm_floor_ss (__m128 a, __m128 b) @trusted
1082 {
1083     static if (LDC_with_ARM64)
1084     {
1085         a[0] = vcvtms_s32_f32(b[0]);
1086         return a;
1087     }
1088     else
1089     {
1090         return _mm_round_ss!1(a, b);
1091     }
1092 }
1093 unittest
1094 {
1095     __m128 A = _mm_setr_ps(1.3f, -2.12f, -4.5f, 1.1f);
1096     __m128 B = _mm_setr_ps(-539.3f, -3.7f, 8.0f, 7.0f);
1097     __m128 C = _mm_floor_ss(A, B);
1098     float[4] correct = [-540.0f, -2.12f, -4.5f, 1.1f];
1099     assert(C.array == correct);
1100 }
1101 
1102 /// Insert the 32-bit integer `i` into `a` at the location specified by `imm8[1:0]`.
1103 __m128i _mm_insert_epi32 (__m128i a, int i, const int imm8) pure @trusted
1104 {
1105     // GDC: nothing special to do, pinsrd generated with -O1 -msse4.1
1106     // LDC x86: pinsrd since LDC 1.1 -O2 with -mattr=+sse4.1
1107     // LDC arm64: ins.s since LDC 1.8 -O2
1108     int4 ia = cast(int4)a;
1109     ia.ptr[imm8 & 3] = i;
1110     return cast(__m128i)ia; 
1111 }
1112 unittest
1113 {
1114     __m128i A = _mm_setr_epi32(1, 2, 3, 4);
1115     int4 C = cast(int4) _mm_insert_epi32(A, 5, 2 + 4);
1116     int[4] result = [1, 2, 5, 4];
1117     assert(C.array == result);
1118 }
1119 
1120 /// Insert the 64-bit integer `i` into `a` at the location specified by `imm8[0]`.
1121 __m128i _mm_insert_epi64 (__m128i a, long i, const int imm8) pure @trusted
1122 {
1123     // GDC: nothing special to do, pinsrq generated with -O1 -msse4.1
1124     // LDC x86: always does something sensible.
1125     long2 la = cast(long2)a;
1126     la.ptr[imm8 & 1] = i;
1127     return cast(__m128i)la;
1128 }
1129 unittest
1130 {
1131     __m128i A = _mm_setr_epi64(1, 2);
1132     long2 C = cast(long2) _mm_insert_epi64(A, 5, 1 + 2);
1133     long[2] result = [1, 5];
1134     assert(C.array == result);
1135 }
1136 
1137 /// Insert the lower 8 bits of `i` into `a` at the location specified by `imm8[3:0]`.
1139 __m128i _mm_insert_epi8 (__m128i a, int i, const int imm8) @trusted
1140 {
1141     // GDC: nothing special to do, pinsrb generated with -O1 -msse4.1
1142     // LDC x86: doesn't do pinsrb, maybe it's slower. arm64 also spills to memory.
1143     byte16 ba = cast(byte16)a;
1144     ba.ptr[imm8 & 15] = cast(byte)i;
1145     return cast(__m128i)ba; 
1146 }
1147 unittest
1148 {
1149     __m128i A = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
1150     byte16 C = cast(byte16) _mm_insert_epi8(A, 30, 4 + 16);
1151     byte[16] result = [0, 1, 2, 3, 30, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15];
1152     assert(C.array == result);
1153 }
1154 
1155 
1156 /// Warning: of course it does something totally different from `_mm_insert_epi32`!
1157 /// Copy `a` to `tmp`, then insert a single-precision (32-bit) floating-point element from `b` 
1158 /// into `tmp` using the control in `imm8`. Store `tmp` to result using the mask in `imm8[3:0]` 
1159 /// (elements are zeroed out when the corresponding bit is set).
1160 __m128 _mm_insert_ps(int imm8)(__m128 a, __m128 b) @trusted
1161 {
1162     // PERF DMD
1163     static if (GDC_with_SSE41)
1164     {
1165         return __builtin_ia32_insertps128(a, b, cast(ubyte)imm8);
1166     }
1167     else static if (LDC_with_SSE41)
1168     {
1169         return __builtin_ia32_insertps128(a, b, cast(byte)imm8);
1170     }
1171     else
1172     {
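        // imm8[7:6] selects the source element of b, imm8[5:4] the destination lane in a,
        // and imm8[3:0] is the zero mask applied by the final blend.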
1173         float4 tmp2 = a;
1174         float tmp1 = b.array[(imm8 >> 6) & 3];
1175         tmp2.ptr[(imm8 >> 4) & 3] = tmp1;
1176         return _mm_blend_ps!(imm8 & 15)(tmp2, _mm_setzero_ps());
1177     }
1178 }
1179 unittest
1180 {
1181     __m128 A = _mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f);
1182     __m128 B = _mm_setr_ps(5.0f, 6.0f, 7.0f, 8.0f);
1183     __m128 C = _mm_insert_ps!(128 + (32 + 16) + 4)(A, B);
1184     float[4] correct =    [1.0f, 2.0f, 0.0f, 7.0f];
1185     assert(C.array == correct);
1186 }
1187 
1188 
1189 /// Compare packed signed 32-bit integers in `a` and `b`, returns packed maximum values.
1190 __m128i _mm_max_epi32 (__m128i a, __m128i b) @trusted
1191 {
1192     static if (GDC_with_SSE41)
1193     {
1194         return cast(__m128i) __builtin_ia32_pmaxsd128(cast(int4)a, cast(int4)b);
1195     }
1196     else version(LDC)
1197     {
1198         // x86: pmaxsd since LDC 1.1 -O1
1199         // ARM: smax.4s since LDC 1.8 -O1
1200         int4 sa = cast(int4)a;
1201         int4 sb = cast(int4)b;
1202         int4 greater = greaterMask!int4(sa, sb);
1203         return cast(__m128i)( (greater & sa) | (~greater & sb) );
1204     }
1205     else
1206     {
1207         __m128i higher = _mm_cmpgt_epi32(a, b);
1208         __m128i aTob = _mm_xor_si128(a, b); // a ^ (a ^ b) == b
1209         __m128i mask = _mm_and_si128(aTob, higher);
1210         return _mm_xor_si128(b, mask);
1211     }
1212 }
1213 unittest
1214 {
1215     int4 R = cast(int4) _mm_max_epi32(_mm_setr_epi32(0x7fffffff, 1, -4, 7),
1216                                       _mm_setr_epi32(        -4,-8,  9, -8));
1217     int[4] correct =                               [0x7fffffff, 1,  9,  7];
1218     assert(R.array == correct);
1219 }
1220 
1221 /// Compare packed signed 8-bit integers in `a` and `b`, 
1222 /// and return packed maximum values.
1223 __m128i _mm_max_epi8 (__m128i a, __m128i b) @trusted
1224 {
1225     // PERF DMD
1226     static if (GDC_with_SSE41)
1227     {
1228         return cast(__m128i) __builtin_ia32_pmaxsb128(cast(ubyte16)a, cast(ubyte16)b);
1229     }
1230     else version(LDC)
1231     {
1232         // x86: pmaxsb since LDC 1.1 -O1
1233         // ARM64: smax.16b since LDC 1.8.0 -O1
1234         byte16 sa = cast(byte16)a;
1235         byte16 sb = cast(byte16)b;
1236         byte16 greater = cast(byte16) greaterMask!byte16(sa, sb);
1237         return cast(__m128i)( (greater & sa) | (~greater & sb) );
1238     }
1239     else
1240     {
1241         __m128i lower = _mm_cmpgt_epi8(a, b); // ones where a should be selected, b else
1242         __m128i aTob = _mm_xor_si128(a, b); // a ^ (a ^ b) == b
1243         __m128i mask = _mm_and_si128(aTob, lower);
1244         return _mm_xor_si128(b, mask);
1245     }
1246 }
1247 unittest
1248 {
1249     __m128i A = _mm_setr_epi8(127,  1, -4, -8, 9,    7, 0, 57, 0, 0, 0, 0, 0, 0, 0, 0);
1250     __m128i B = _mm_setr_epi8(  4, -8,  9, -7, 0, -128, 0,  0, 0, 0, 0, 0, 0, 0, 0, 0);
1251     byte16 R = cast(byte16) _mm_max_epi8(A, B);
1252     byte[16] correct =       [127,  1,  9, -7, 9,    7, 0, 57, 0, 0, 0, 0, 0, 0, 0, 0];
1253     assert(R.array == correct);
1254 }
1255 
1256 /// Compare packed unsigned 16-bit integers in `a` and `b`, returns packed maximum values.
1257 __m128i _mm_max_epu16 (__m128i a, __m128i b) @trusted
1258 {
1259     // PERF DMD
1260     static if (GDC_with_SSE41)
1261     {
1262         return cast(__m128i) __builtin_ia32_pmaxuw128(cast(short8)a, cast(short8)b);
1263     }
1264     else version(LDC)
1265     {
1266         // x86: pmaxuw since LDC 1.1 -O1
1267         // ARM64: umax.8h since LDC 1.8.0 -O1
1268         // PERF: without sse4.1, LLVM 12 produces a very interesting
1269         //          psubusw xmm0, xmm1
1270         //          paddw   xmm0, xmm1
1271         //       sequence that maybe should go in other min/max intrinsics? 
1272         ushort8 sa = cast(ushort8)a;
1273         ushort8 sb = cast(ushort8)b;
1274         ushort8 greater = cast(ushort8) greaterMask!ushort8(sa, sb);
1275         return cast(__m128i)( (greater & sa) | (~greater & sb) );
1276     }
1277     else
1278     {
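        // Unsigned max via saturation: max(a, b) == a + saturating_sub(b, a).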
1279         b = _mm_subs_epu16(b, a);
1280         b = _mm_add_epi16(b, a);
1281         return b;
1282     }
1283 }
1284 unittest
1285 {
1286     short8 R = cast(short8) _mm_max_epu16(_mm_setr_epi16(32767,  1, -4, -8, 9,     7, 0, 57),
1287                                           _mm_setr_epi16(   -4, -8,  9, -7, 0,-32768, 0,  0));
1288     short[8] correct =                                  [   -4, -8, -4, -7, 9,-32768, 0, 57];
1289     assert(R.array == correct);
1290 }
1291 
1292 /// Compare packed unsigned 32-bit integers in `a` and `b`, returns packed maximum values.
1293 __m128i _mm_max_epu32 (__m128i a, __m128i b) @trusted
1294 {
1295     // PERF DMD
1296     static if (GDC_with_SSE41)
1297     {
1298         return cast(__m128i) __builtin_ia32_pmaxud128(cast(int4)a, cast(int4)b);
1299     }
1300     else version(LDC)
1301     {
1302         // x86: pmaxud since LDC 1.1 -O1, also good without sse4.1
1303         // ARM64: umax.4s since LDC 1.8.0 -O1
1304         uint4 sa = cast(uint4)a;
1305         uint4 sb = cast(uint4)b;
1306         uint4 greater = cast(uint4) greaterMask!uint4(sa, sb);
1307         return cast(__m128i)( (greater & sa) | (~greater & sb) );
1308     }
1309     else
1310     {
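        // Flip the sign bit (add 0x80000000) so that the signed comparison orders unsigned operands.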
1311         __m128i valueShift = _mm_set1_epi32(-0x80000000);
1312         __m128i higher = _mm_cmpgt_epi32(_mm_add_epi32(a, valueShift), _mm_add_epi32(b, valueShift));
1313         __m128i aTob = _mm_xor_si128(a, b); // a ^ (a ^ b) == b
1314         __m128i mask = _mm_and_si128(aTob, higher);
1315         return _mm_xor_si128(b, mask);
1316     }
1317 }
1318 unittest
1319 {
1320     int4 R = cast(int4) _mm_max_epu32(_mm_setr_epi32(0x7fffffff, 1,  4, -7),
1321                                       _mm_setr_epi32(        -4,-8,  9, -8));
1322     int[4] correct =                                [        -4,-8,  9, -7];
1323     assert(R.array == correct);
1324 }
1325 
1326 /// Compare packed signed 32-bit integers in `a` and `b`, returns packed minimum values.
1327 __m128i _mm_min_epi32 (__m128i a, __m128i b) @trusted
1328 {
1329     // PERF DMD
1330     static if (GDC_with_SSE41)
1331     {
1332         return cast(__m128i) __builtin_ia32_pminsd128(cast(int4)a, cast(int4)b);
1333     }
1334     else version(LDC)
1335     {
1336         // x86: pminsd since LDC 1.1 -O1, also good without sse4.1
1337         // ARM: smin.4s since LDC 1.8 -O1
1338         int4 sa = cast(int4)a;
1339         int4 sb = cast(int4)b;
1340         int4 greater = greaterMask!int4(sa, sb);
1341         return cast(__m128i)( (~greater & sa) | (greater & sb) );
1342     }
1343     else
1344     {
1345         __m128i higher = _mm_cmplt_epi32(a, b);
1346         __m128i aTob = _mm_xor_si128(a, b); // a ^ (a ^ b) == b
1347         __m128i mask = _mm_and_si128(aTob, higher);
1348         return _mm_xor_si128(b, mask);
1349     }
1350 }
1351 unittest
1352 {
1353     int4 R = cast(int4) _mm_min_epi32(_mm_setr_epi32(0x7fffffff,  1, -4, 7),
1354                                       _mm_setr_epi32(        -4, -8,  9, -8));
1355     int[4] correct =                               [         -4, -8, -4, -8];
1356     assert(R.array == correct);
1357 }
1358 
1359 /// Compare packed signed 8-bit integers in `a` and `b`, 
1360 /// and return packed minimum values.
1361 __m128i _mm_min_epi8 (__m128i a, __m128i b) @trusted
1362 {
1363     // PERF DMD
1364     static if (GDC_with_SSE41)
1365     {
1366         return cast(__m128i) __builtin_ia32_pminsb128(cast(ubyte16)a, cast(ubyte16)b);
1367     }
1368     else version(LDC)
1369     {
1370         // x86: pminsb since LDC 1.1 -O1
1371         // ARM64: smin.16b since LDC 1.8.0 -O1
1372         byte16 sa = cast(byte16)a;
1373         byte16 sb = cast(byte16)b;
1374         byte16 greater = cast(byte16) greaterMask!byte16(sa, sb);
1375         return cast(__m128i)( (~greater & sa) | (greater & sb) );
1376     }
1377     else
1378     {
1379         __m128i lower = _mm_cmplt_epi8(a, b); // ones where a should be selected, b else
1380         __m128i aTob = _mm_xor_si128(a, b); // a ^ (a ^ b) == b
1381         __m128i mask = _mm_and_si128(aTob, lower);
1382         return _mm_xor_si128(b, mask);
1383     }
1384 }
1385 unittest
1386 {
1387     __m128i A = _mm_setr_epi8(127,  1, -4, -8, 9,    7, 0, 57, 0, 0, 0, 0, 0, 0, 0, 0);
1388     __m128i B = _mm_setr_epi8(  4, -8,  9, -7, 0, -128, 0,  0, 0, 0, 0, 0, 0, 0, 0, 0);
1389     byte16 R = cast(byte16) _mm_min_epi8(A, B);
1390     byte[16] correct =       [  4, -8, -4, -8, 0, -128, 0,  0, 0, 0, 0, 0, 0, 0, 0, 0];
1391     assert(R.array == correct);
1392 }
1393 
1394 /// Compare packed unsigned 16-bit integers in a and b, and store packed minimum values in dst.
1395 __m128i _mm_min_epu16 (__m128i a, __m128i b) @trusted
1396 {
1397     // PERF DMD
1398     static if (GDC_with_SSE41)
1399     {
1400         return cast(__m128i) __builtin_ia32_pminuw128(cast(short8)a, cast(short8)b);
1401     }
1402     else version(LDC)
1403     {
1404         // x86: pminuw since LDC 1.1 -O1, psubusw+psubw sequence without sse4.1
1405         // ARM64: umin.8h since LDC 1.8.0 -O1
1406         ushort8 sa = cast(ushort8)a;
1407         ushort8 sb = cast(ushort8)b;
1408         ushort8 greater = cast(ushort8) greaterMask!ushort8(sb, sa);
1409         return cast(__m128i)( (greater & sa) | (~greater & sb) );
1410     }
1411     else
1412     {
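        // Unsigned min via saturation: min(a, b) == b - saturating_sub(b, a).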
1413         __m128i c = _mm_subs_epu16(b, a);
1414         b = _mm_sub_epi16(b, c);
1415         return b;
1416     }
1417 }
1418 unittest
1419 {
1420     short8 R = cast(short8) _mm_min_epu16(_mm_setr_epi16(32767,  1, -4, -8, 9,     7, 0, 57),
1421                                           _mm_setr_epi16(   -4, -8,  9, -7, 0,-32768, 0,  0));
1422     short[8] correct =                                  [32767,  1,  9, -8, 0,     7, 0,  0];
1423     assert(R.array == correct);
1424 }
1425 
1426 /// Compare packed unsigned 32-bit integers in a and b, and store packed minimum values in dst.
1427 __m128i _mm_min_epu32 (__m128i a, __m128i b) @trusted
1428 {
1429     // PERF DMD
1430     static if (GDC_with_SSE41)
1431     {
1432         return cast(__m128i) __builtin_ia32_pminud128(cast(int4)a, cast(int4)b);
1433     }
1434     else version(LDC)
1435     {
1436         // x86: pminud since LDC 1.1 -O1, also good without sse4.1
1437         // ARM64: umin.4s since LDC 1.8.0 -O1
1438         uint4 sa = cast(uint4)a;
1439         uint4 sb = cast(uint4)b;
1440         uint4 greater = cast(uint4) greaterMask!uint4(sa, sb);
1441         return cast(__m128i)( (~greater & sa) | (greater & sb) );
1442     }
1443     else
1444     {
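        // Same sign-bit flip trick as in _mm_max_epu32: bias both operands by 0x80000000
        // so that the signed comparison orders them as unsigned values.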
1445         __m128i valueShift = _mm_set1_epi32(-0x80000000);
1446         __m128i higher = _mm_cmpgt_epi32(_mm_add_epi32(b, valueShift), _mm_add_epi32(a, valueShift));
1447         __m128i aTob = _mm_xor_si128(a, b); // a ^ (a ^ b) == b
1448         __m128i mask = _mm_and_si128(aTob, higher);
1449         return _mm_xor_si128(b, mask);
1450     }
1451 }
1452 unittest
1453 {
1454     int4 R = cast(int4) _mm_min_epu32(_mm_setr_epi32(0x7fffffff, 1,  4, -7),
1455                                       _mm_setr_epi32(        -4,-8,  9, -8));
1456     int[4] correct =                                [0x7fffffff, 1,  4, -8];
1457     assert(R.array == correct);
1458 }
1459 
1460 /// Horizontally compute the minimum amongst the packed unsigned 16-bit integers in `a`, 
1461 /// store the minimum and index in return value, and zero the remaining bits.
1462 __m128i _mm_minpos_epu16 (__m128i a) @trusted
1463 {
1464     // PERF DMD
1465     static if (GDC_with_SSE41)
1466     {
1467         return cast(__m128i) __builtin_ia32_phminposuw128(cast(short8)a);
1468     }
1469     else static if (LDC_with_SSE41)
1470     {
1471         return cast(__m128i) __builtin_ia32_phminposuw128(cast(short8)a);
1472     }
1473     else static if (LDC_with_ARM64)
1474     {
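        // Pack each 16-bit value of `a` into the high half of a 32-bit lane, with its index
        // in the low half; an unsigned 32-bit min reduction then finds the smallest value
        // and, on ties, the smallest index.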
1475         __m128i indices = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
1476         __m128i combinedLo = _mm_unpacklo_epi16(indices, a);
1477         __m128i combinedHi = _mm_unpackhi_epi16(indices, a);
1478         __m128i best = _mm_min_epu32(combinedLo, combinedHi);
1479         best = _mm_min_epu32(best, _mm_srli_si128!8(best));
1480         best = _mm_min_epu32(best, _mm_srli_si128!4(best));
1481         short8 sbest = cast(short8)best;
1482         short8 r;
1483         r[0] = sbest[1];
1484         r[1] = sbest[0]; // Note: the search must have inverted index in order to prioritize lower index in case of tie
1485         r[2] = 0;
1486         r[3] = 0;
1487         r[4] = 0;
1488         r[5] = 0;
1489         r[6] = 0;
1490         r[7] = 0;
1491         return cast(__m128i)r;
1492     }
1493     else
1494     {
1495         short8 sa = cast(short8)a;
1496         ushort min = 0xffff;
1497         int index = 0;
1498         for(int n = 0; n < 8; ++n)
1499         {
1500             ushort c = sa.array[n];
1501             if (c < min)
1502             {
1503                 min = c;
1504                 index = n;
1505             }
1506         }
1507         short8 r;
1508         r.ptr[0] = min;
1509         r.ptr[1] = cast(short)index;
1510         return cast(__m128i)r;
1511     }
1512 }
1513 unittest
1514 {
1515     __m128i A = _mm_setr_epi16(14, 15, 1, 2, -3, 4, 5, 6);
1516     __m128i B = _mm_setr_epi16(14,  4, 4, 2, -3, 2, 5, 6);
1517     short8 R1 = cast(short8) _mm_minpos_epu16(A);
1518     short8 R2 = cast(short8) _mm_minpos_epu16(B);
1519     short[8] correct1 = [1, 2, 0, 0, 0, 0, 0, 0];
1520     short[8] correct2 = [2, 3, 0, 0, 0, 0, 0, 0];
1521     assert(R1.array == correct1);
1522     assert(R2.array == correct2);
1523 }
1524 
1525 /// Compute the sum of absolute differences (SADs) of quadruplets of unsigned 8-bit integers 
1526 /// in `a` compared to those in `b`, and store the 16-bit results in dst. 
1527 /// Eight SADs are performed using one quadruplet from `b` and eight quadruplets from `a`. 
1528 /// One quadruplet is selected from `b` starting at the offset specified in `imm8[1:0]`. 
1529 /// Eight quadruplets are formed from sequential 8-bit integers selected from `a` starting 
1530 /// at the offset specified in `imm8[2]`.
1531 __m128i _mm_mpsadbw_epu8(int imm8)(__m128i a, __m128i b) @trusted
1532 {
1533     // PERF DMD
1534     static if (GDC_with_SSE41)
1535     {
1536         return cast(__m128i) __builtin_ia32_mpsadbw128(cast(ubyte16)a, cast(ubyte16)b, cast(ubyte)imm8);  
1537     }
1538     else static if (LDC_with_SSE41)
1539     {
1540         return cast(__m128i) __builtin_ia32_mpsadbw128(cast(byte16)a, cast(byte16)b, cast(byte)imm8);
1541     }
1542     else
1543     {
1544         int a_offset = ((imm8 & 4) >> 2) * 4; // Yes, the two high-order quadruplets are unaddressable...
1545         int b_offset = (imm8 & 3) * 4;
1546 
1547         byte16 ba = cast(byte16)a;
1548         byte16 bb = cast(byte16)b;
1549         short8 r;
1550 
1551         __m128i comp_b = _mm_setr_epi32(b.array[imm8 & 3], 0, b.array[imm8 & 3], 0);
1552 
1553         for (int j = 0; j < 8; j += 2)
1554         {
1555             int k = a_offset + j;
1556             __m128i comp_a = _mm_setr_epi8(ba[k+0], ba[k+1], ba[k+2], ba[k+3],
1557                                            0, 0, 0, 0, 
1558                                            ba[k+1], ba[k+2], ba[k+3], ba[k+4],
1559                                            0, 0, 0, 0);
1560             short8 diffs = cast(short8) _mm_sad_epu8(comp_a, comp_b); // reusing this wins instructions in both x86 and arm64
1561             r.ptr[j] = diffs.array[0];
1562             r.ptr[j+1] = diffs.array[4];
1563         }
1564         return cast(__m128i)r;
1565     }
1566 }
1567 unittest
1568 {
1569     __m128i A = _mm_setr_epi8(0, 1, 2, 3,  4,  5, 6,  7, 8, 9, 10, 11, 12, 13, 14, 15);
1570     __m128i B = _mm_setr_epi8(9, 1, 2, 3, -1, -1, 0, -1, 5, 5,  5,  5, 12, 13, 14, 15);
1571     short[8] correct0 = [9, 11, 13, 15, 17, 19, 21, 23];
1572     short[8] correct1 = [763, 761, 759, 757, 755, 753, 751, 749];
1573     short[8] correct4 = [17, 19, 21, 23, 25, 27, 31, 35];
1574     short[8] correct5 = [755, 753, 751, 749, 747, 745, 743, 741];
1575     short[8] correct7 = [32, 28, 24, 20, 16, 12, 8, 4];
1576     short8 r1 = cast(short8) _mm_mpsadbw_epu8!1(A, B);
1577     short8 r4 = cast(short8) _mm_mpsadbw_epu8!4(A, B);
1578     short8 r5 = cast(short8) _mm_mpsadbw_epu8!5(A, B);
1579     short8 r7 = cast(short8) _mm_mpsadbw_epu8!7(A, B);
1580     short8 r8 = cast(short8) _mm_mpsadbw_epu8!8(A, B);
1581     assert(r1.array == correct1);
1582     assert(r4.array == correct4);
1583     assert(r5.array == correct5);
1584     assert(r7.array == correct7);
1585     assert(r8.array == correct0);
1586 }
1587 
/// Multiply the low signed 32-bit integers from each packed 64-bit element in `a` and `b`,
/// and store the signed 64-bit results in dst.
1589 __m128i _mm_mul_epi32 (__m128i a, __m128i b) @trusted
1590 {
1591     // PERF DMD
1592     static if (GDC_with_SSE41)
1593     {
1594         return cast(__m128i) __builtin_ia32_pmuldq128(cast(int4)a, cast(int4)b);
1595     }
1596     else static if (LDC_with_SSE41)
1597     {
1598         // For some reason, clang has the builtin but it's not in IntrinsicsX86.td
1599         // Use IR instead.
        // This generates pmuldq since LDC 1.2.0 -O0
1601         enum ir = `
1602             %ia = shufflevector <4 x i32> %0,<4 x i32> %0, <2 x i32> <i32 0, i32 2>
1603             %ib = shufflevector <4 x i32> %1,<4 x i32> %1, <2 x i32> <i32 0, i32 2>
1604             %la = sext <2 x i32> %ia to <2 x i64>
1605             %lb = sext <2 x i32> %ib to <2 x i64>
1606             %r = mul <2 x i64> %la, %lb
1607             ret <2 x i64> %r`;
1608         return cast(__m128i) LDCInlineIR!(ir, long2, int4, int4)(cast(int4)a, cast(int4)b);
1609     }
1610     else static if (LDC_with_ARM64)  
1611     {
1612         // 3 instructions since LDC 1.8 -O2
        // But had to make vmull_s32 a builtin, else it wouldn't optimize to smull
1614         int2 a_lo = vmovn_s64(cast(long2)a);
1615         int2 b_lo = vmovn_s64(cast(long2)b);
1616         return cast(__m128i) vmull_s32(a_lo, b_lo);
1617     }
1618     else
1619     {
1620         int4 ia = cast(int4)a;
1621         int4 ib = cast(int4)b;
1622         long2 r;
1623         r.ptr[0] = cast(long)ia.array[0] * ib.array[0];
1624         r.ptr[1] = cast(long)ia.array[2] * ib.array[2];
1625         return cast(__m128i)r;
1626     }
1627 }
1628 unittest
1629 {
1630     __m128i A = _mm_setr_epi32(61616461, 1915324654, 4564061, 3);
1631     __m128i B = _mm_setr_epi32(49716422, -915616216, -121144, 0);
1632     long2 R = cast(long2) _mm_mul_epi32(A, B);
1633     long[2] correct = [cast(long)61616461 * 49716422, cast(long)4564061 * -121144];
1634     assert(R.array == correct);
1635 }
1636 
/// Multiply the packed 32-bit integers in `a` and `b`, producing intermediate 64-bit integers,
/// and return the low 32 bits of the intermediate integers.
1639 __m128i _mm_mullo_epi32 (__m128i a, __m128i b) @trusted
1640 {
1641     // PERF DMD
1642     // PERF GDC without SSE4.1 could be better
1643     static if (GDC_with_SSE41)
1644     {
        int4 ia = cast(int4)a;
        int4 ib = cast(int4)b;
        // Note: older GDC doesn't have that op, but older GDC
        // also has no support for -msse4.1 detection
        return cast(__m128i)(ia * ib);
1650     }
1651     else version(LDC)
1652     {
        int4 ia = cast(int4)a;
        int4 ib = cast(int4)b;
        return cast(__m128i)(ia * ib);
1656     }
1657     else
1658     {
        // DMD doesn't accept the vector multiplication above; do it element-wise
1660         int4 ia = cast(int4)a;
1661         int4 ib = cast(int4)b;
1662         int4 r;
1663         r.ptr[0] = ia.array[0] * ib.array[0];
1664         r.ptr[1] = ia.array[1] * ib.array[1];
1665         r.ptr[2] = ia.array[2] * ib.array[2];
1666         r.ptr[3] = ia.array[3] * ib.array[3];
1667         return r;
1668     }
1669 }
1670 unittest
1671 {
1672     __m128i A = _mm_setr_epi32(61616461, 1915324654, 4564061, 3);
1673     __m128i B = _mm_setr_epi32(49716422, -915616216, -121144, 0);
1674     int4 R = cast(int4) _mm_mullo_epi32(A, B);
1675     int[4] correct = [cast(int)0xBF370D8E, cast(int)(1915324654 * -915616216), cast(int)(4564061 * -121144), 0];
1676     assert(R.array == correct);
1677 }
1678 
1679 
1680 /// Convert packed signed 32-bit integers from `a` and `b` 
1681 /// to packed 16-bit integers using unsigned saturation.
1682 __m128i _mm_packus_epi32 (__m128i a, __m128i b) @trusted
1683 {
1684     static if (GDC_with_SSE41)
1685     {
        // PERF For some reason this doesn't generate the builtin???
1687         return cast(__m128i) __builtin_ia32_packusdw128(cast(short8)a, cast(short8)b);
1688     }
1689     else static if (LDC_with_SSE41)
1690     {
1691         return cast(__m128i) __builtin_ia32_packusdw128(cast(short8)a, cast(short8)b);
1692     }
1693     else static if (LDC_with_ARM64)
1694     {
        int4 z = 0;
        return cast(__m128i) vcombine_u16(vqmovn_u32(vmaxq_s32(z, cast(int4)a)),
                                          vqmovn_u32(vmaxq_s32(z, cast(int4)b)));
1699     }
1700     else
1701     {
1702         // PERF: not great without SSE4.1
1703         int4 sa = cast(int4)a;
1704         int4 sb = cast(int4)b;
1705         align(16) ushort[8] result;
1706         for (int i = 0; i < 4; ++i)
1707         {
1708             int s = sa.array[i];
1709             if (s < 0) s = 0;
1710             if (s > 65535) s = 65535;
1711             result.ptr[i] = cast(ushort)s;
1712 
1713             s = sb.array[i];
1714             if (s < 0) s = 0;
1715             if (s > 65535) s = 65535;
1716             result.ptr[i+4] = cast(ushort)s;
1717         }
1718         return *cast(__m128i*)(result.ptr);
1719     }
1720 }
1721 unittest
1722 {
1723     __m128i A = _mm_setr_epi32(100000, -100000, 1000, 0);
1724     short8 R = cast(short8) _mm_packus_epi32(A, A);
1725     short[8] correct = [cast(short)65535, 0, 1000, 0, cast(short)65535, 0, 1000, 0];
1726     assert(R.array == correct);
1727 }
1728 
1729 
1730 /// Round the packed double-precision (64-bit) floating-point elements in `a` using the 
1731 /// rounding parameter, and store the results as packed double-precision floating-point elements.
1732 /// Rounding is done according to the rounding[3:0] parameter, which can be one of:
1733 ///    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
1734 ///    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
1735 ///    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
1736 ///    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
1737 ///    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
1738 __m128d _mm_round_pd(int rounding)(__m128d a) @trusted
1739 {
1740     // PERF DMD
1741     static if (GDC_with_SSE41)
1742     {
1743         return __builtin_ia32_roundpd(a, rounding);
1744     }
1745     else static if (LDC_with_SSE41)
1746     {
1747         return __builtin_ia32_roundpd(a, rounding);
1748     }
1749     else
1750     {
1751         static if (rounding & _MM_FROUND_CUR_DIRECTION)
1752         {
1753             // Convert to 64-bit integers
1754             long lo = _mm_cvtsd_si64(a);
1755             a.ptr[0] = a.array[1];
1756             long hi = _mm_cvtsd_si64(a);
1757             return _mm_setr_pd(lo, hi);
1758         }
1759         else
1760         {
            version(GNU) pragma(inline, false); // else unittests fail with optimizations
1762 
1763             uint old = _MM_GET_ROUNDING_MODE();
1764             _MM_SET_ROUNDING_MODE((rounding & 3) << 13);
1765             
1766             // Convert to 64-bit integers
1767             long lo = _mm_cvtsd_si64(a);
1768             a.ptr[0] = a.array[1];
1769             long hi = _mm_cvtsd_si64(a);
1770 
1771             // Convert back to double to achieve the rounding
1772             // The problem is that a 64-bit double can't represent all the values 
1773             // a 64-bit integer can (and vice-versa). So this function won't work for
1774             // large values. (TODO: what range exactly?)
1775             _MM_SET_ROUNDING_MODE(old);
1776             return _mm_setr_pd(lo, hi);
1777         }
1778     }
1779 }
1780 unittest
1781 {
1782     // tested in other intrinsics
1783 }
1784 
1785 /// Round the packed single-precision (32-bit) floating-point elements in `a` using the 
1786 /// rounding parameter, and store the results as packed single-precision floating-point elements.
1787 /// Rounding is done according to the rounding[3:0] parameter, which can be one of:
1788 ///    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
1789 ///    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
1790 ///    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
1791 ///    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
1792 ///    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
1793 __m128 _mm_round_ps(int rounding)(__m128 a) @trusted
1794 {
1795     static if (GDC_with_SSE41)
1796     {
1797         return __builtin_ia32_roundps(a, rounding);
1798     }
1799     else static if (LDC_with_SSE41)
1800     {
1801         return __builtin_ia32_roundps(a, rounding);
1802     }
1803     else
1804     {
1805         static if (rounding & _MM_FROUND_CUR_DIRECTION)
1806         {
1807             __m128i integers = _mm_cvtps_epi32(a);
1808             return _mm_cvtepi32_ps(integers);
1809         }
1810         else
1811         {
            version(LDC) pragma(inline, false); // else _MM_SET_ROUNDING_MODE and _mm_cvtps_epi32 get reordered
1813             uint old = _MM_GET_ROUNDING_MODE();
1814             _MM_SET_ROUNDING_MODE((rounding & 3) << 13);
1815             scope(exit) _MM_SET_ROUNDING_MODE(old);
1816 
            // Convert to 32-bit integers
1818             __m128i integers = _mm_cvtps_epi32(a);
1819 
1820             // Convert back to float to achieve the rounding
            // The problem is that a 32-bit float can't represent all the values
1822             // a 32-bit integer can (and vice-versa). So this function won't work for
1823             // large values. (TODO: what range exactly?)
1824             __m128 result = _mm_cvtepi32_ps(integers);
1825 
1826             return result;
1827         }
1828     }
1829 }
1830 unittest
1831 {
1832     // tested in other intrinsics
1833 }
1834 
1835 
1836 /// Round the lower double-precision (64-bit) floating-point element in `b` using the
1837 /// rounding parameter, store the result as a double-precision floating-point element 
1838 /// in the lower element of result, and copy the upper element from `a` to the upper element of result.
1839 /// Rounding is done according to the rounding[3:0] parameter, which can be one of:
1840 ///    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
1841 ///    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
1842 ///    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
1843 ///    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
1844 ///    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
1845 __m128d _mm_round_sd(int rounding)(__m128d a, __m128d b) @trusted
1846 {
1847     static if (GDC_with_SSE41)
1848     {
1849         return __builtin_ia32_roundsd(a, b, rounding);
1850     }
1851     else static if (LDC_with_SSE41)
1852     {
1853         return __builtin_ia32_roundsd(a, b, rounding);
1854     }
1855     else
1856     {
1857         static if (rounding & _MM_FROUND_CUR_DIRECTION)
1858         {
1859             // Convert to 64-bit integer
1860             long b0 = _mm_cvtsd_si64(b);
1861             a.ptr[0] = b0;
1862             return a;
1863         }
1864         else
1865         {
            version(GNU) pragma(inline, false); // else unittests fail with optimizations
1867 
1868             uint old = _MM_GET_ROUNDING_MODE();
1869             _MM_SET_ROUNDING_MODE((rounding & 3) << 13);
1870             
1871             // Convert to 64-bit integer
1872             long b0 = _mm_cvtsd_si64(b);
1873             a.ptr[0] = b0;
1874 
1875             // Convert back to double to achieve the rounding
1876             // The problem is that a 64-bit double can't represent all the values 
1877             // a 64-bit integer can (and vice-versa). So this function won't work for
1878             // large values. (TODO: what range exactly?)
1879             _MM_SET_ROUNDING_MODE(old);
1880             return a;
1881         }
1882     }
1883 }
1884 unittest
1885 {
1886     // tested in other intrinsics
1887 }
1888 
1889 
1890 /// Round the lower single-precision (32-bit) floating-point element in `b` using the 
1891 /// rounding parameter, store the result as a single-precision floating-point element 
1892 /// in the lower element of result, and copy the upper 3 packed elements from `a`
1893 /// to the upper elements of result.
1894 /// Rounding is done according to the rounding[3:0] parameter, which can be one of:
1895 ///    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
1896 ///    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
1897 ///    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
1898 ///    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
1899 ///    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
1900 __m128 _mm_round_ss(int rounding)(__m128 a, __m128 b) @trusted
1901 {
1902     static if (GDC_with_SSE41)
1903     {
1904         return __builtin_ia32_roundss(a, b, rounding);
1905     }
1906     else static if (LDC_with_SSE41)
1907     {
1908         return __builtin_ia32_roundss(a, b, rounding);
1909     }
1910     else
1911     {
1912         static if (rounding & _MM_FROUND_CUR_DIRECTION)
1913         {
1914             int b0 = _mm_cvtss_si32(b);
1915             a.ptr[0] = b0;   
1916             return a;
1917         }
1918         else version(GNU)
1919         {
1920             pragma(inline, false)
1921             __m128 GDCworkaround() nothrow @nogc @trusted 
1922             {
1923                 uint old = _MM_GET_ROUNDING_MODE();
1924                 _MM_SET_ROUNDING_MODE((rounding & 3) << 13);
1925 
1926                 // Convert to 32-bit integer
1927                 int b0 = _mm_cvtss_si32(b);
1928                 a.ptr[0] = b0;       
1929 
                // Convert back to float to achieve the rounding
                // The problem is that a 32-bit float can't represent all the values
                // a 32-bit integer can (and vice-versa). So this function won't work for
1933                 // large values. (TODO: what range exactly?)
1934                 _MM_SET_ROUNDING_MODE(old);
1935                 return a;
1936             }
1937             return GDCworkaround();
1938         }
1939         else
1940         {
1941             uint old = _MM_GET_ROUNDING_MODE();
1942             _MM_SET_ROUNDING_MODE((rounding & 3) << 13);
1943 
1944             // Convert to 32-bit integer
1945             int b0 = _mm_cvtss_si32(b);
1946             a.ptr[0] = b0;       
1947 
            // Convert back to float to achieve the rounding
            // The problem is that a 32-bit float can't represent all the values
            // a 32-bit integer can (and vice-versa). So this function won't work for
1951             // large values. (TODO: what range exactly?)
1952             _MM_SET_ROUNDING_MODE(old);
1953             return a;
1954         }
1955     }
1956 }
1957 unittest
1958 {
1959     // tested in other intrinsics
1960 }
1961 
1962 
1963 /// Load 128-bits of integer data from memory using a non-temporal memory hint. 
1964 /// `mem_addr` must be aligned on a 16-byte boundary or a general-protection 
1965 /// exception may be generated.
1966 __m128i _mm_stream_load_si128 (__m128i * mem_addr) pure @trusted
1967 {
1968     // PERF DMD D_SIMD
1969     static if (GDC_with_SSE41)
1970     {
1971         return cast(__m128i) __builtin_ia32_movntdqa(cast(long2*)mem_addr);
1972     }
1973     else version(LDC)
1974     {
1975         enum prefix = `!0 = !{ i32 1 }`;
1976         enum ir = `
1977             %r = load <4 x i32>, <4 x i32>* %0, !nontemporal !0
1978             ret <4 x i32> %r`;
1979         return cast(__m128i) LDCInlineIREx!(prefix, ir, "", int4, int4*)(mem_addr);
1980     }
1981     else
1982     {
1983         return *mem_addr; // regular move instead
1984     }
1985 }
1986 // TODO unittest
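// Minimal sanity check (sketch): the non-temporal load returns the same 16 bytes
// as a regular aligned load.
unittest
{
    align(16) int[4] correct = [-1, 2, 3, 4];
    int4 R = cast(int4) _mm_stream_load_si128(cast(__m128i*) correct.ptr);
    assert(R.array == correct);
}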
1987 
1988 
/// Return 1 if all bits in `a` are 1. Else return 0.
1990 int _mm_test_all_ones (__m128i a) @safe
1991 {
1992     return _mm_testc_si128(a, _mm_set1_epi32(-1));
1993 }
1994 unittest
1995 {
1996     __m128i A = _mm_set1_epi32(-1);
1997     __m128i B = _mm_set_epi32(-1, -2, -1, -1);
1998     assert(_mm_test_all_ones(A) == 1);
1999     assert(_mm_test_all_ones(B) == 0);
2000 }
2001 
/// Return 1 if all bits in `a` are 0. Else return 0.
// This is a #BONUS since it was lacking in the Intel Intrinsics API.
2004 int _mm_test_all_zeros (__m128i a) @safe
2005 {
2006     return _mm_testz_si128(a, _mm_set1_epi32(-1));
2007 }
2008 unittest
2009 {
2010     __m128i A = _mm_set1_epi32(0);
2011     __m128i B = _mm_set_epi32(0, 8, 0, 0);
2012     assert(_mm_test_all_zeros(A) == 1);
2013     assert(_mm_test_all_zeros(B) == 0);
2014 }
2015 
2016 /// Compute the bitwise AND of 128 bits (representing integer data) in `a` and `mask`, 
2017 /// and return 1 if the result is zero, otherwise return 0.
2018 int _mm_test_all_zeros (__m128i a, __m128i mask) @safe
2019 {
2020     return _mm_testz_si128(a, mask); // it's really the same, but with a good name
2021 }
2022 
/// Compute the bitwise AND of 128 bits (representing integer data) in `a` and `mask`,
/// and set ZF to 1 if the result is zero, otherwise set ZF to 0.
/// Compute the bitwise NOT of `a` and then AND with `mask`, and set CF to 1 if the
/// result is zero, otherwise set CF to 0.
/// Return 1 if both the ZF and CF values are zero, otherwise return 0.
2024 int _mm_test_mix_ones_zeros (__m128i a, __m128i mask) @trusted
2025 {
2026     return _mm_testnzc_si128(a, mask);
2027 }
2028 
/// Compute the bitwise NOT of `a` and then AND with `b`, and return 1 if the 
2030 /// result is zero, otherwise return 0.
2031 /// In other words, test if all bits masked by `b` are 1 in `a`.
2032 int _mm_testc_si128 (__m128i a, __m128i b) @trusted
2033 {
2034     // PERF DMD
2035     static if (GDC_with_SSE41)
2036     {
2037         return __builtin_ia32_ptestc128(cast(long2)a, cast(long2)b);
2038     }
2039     else static if (LDC_with_SSE41)
2040     {
2041         return __builtin_ia32_ptestc128(cast(long2)a, cast(long2)b);
2042     }
2043     else static if (LDC_with_ARM64)
2044     {
        // Acceptable since LDC 1.8 -O2
2046         long2 s64 = vbicq_s64(cast(long2)b, cast(long2)a);
2047         return !(vgetq_lane_s64(s64, 0) | vgetq_lane_s64(s64, 1));
2048     }
2049     else
2050     {
2051         __m128i c = ~a & b;
2052         int[4] zero = [0, 0, 0, 0];
2053         return c.array == zero;
2054     }
2055 }
2056 unittest
2057 {
2058     __m128i A  = _mm_setr_epi32(0x01, 0x02, 0x04, 0xf8);
2059     __m128i M1 = _mm_setr_epi32(0xfe, 0xfd, 0x00, 0x00);
2060     __m128i M2 = _mm_setr_epi32(0x00, 0x00, 0x04, 0x00);
2061     assert(_mm_testc_si128(A, A) == 1);
2062     assert(_mm_testc_si128(A, M1) == 0);
2063     assert(_mm_testc_si128(A, M2) == 1);
2064 }
2065 
2066 /// Compute the bitwise AND of 128 bits (representing integer data) in `a` and `b`, 
2067 /// and set ZF to 1 if the result is zero, otherwise set ZF to 0. 
2068 /// Compute the bitwise NOT of `a` and then AND with `b`, and set CF to 1 if the 
2069 /// result is zero, otherwise set CF to 0. 
2070 /// Return 1 if both the ZF and CF values are zero, otherwise return 0.
2071 int _mm_testnzc_si128 (__m128i a, __m128i b) @trusted
2072 {
2073     // PERF DMD
2074     static if (GDC_with_SSE41)
2075     {
2076         return __builtin_ia32_ptestnzc128(cast(long2)a, cast(long2)b);
2077     }
2078     else static if (LDC_with_SSE41)
2079     {
2080         return __builtin_ia32_ptestnzc128(cast(long2)a, cast(long2)b);
2081     }
2082     else static if (LDC_with_ARM64)
2083     {
2084         long2 s640 = vandq_s64(cast(long2)b, cast(long2)a);
2085         long2 s641 = vbicq_s64(cast(long2)b, cast(long2)a);
2086 
2087         return !( !(vgetq_lane_s64(s641, 0) | vgetq_lane_s64(s641, 1))
2088                 | !(vgetq_lane_s64(s640, 0) | vgetq_lane_s64(s640, 1)) );
2089     }
2090     else
2091     {
2092         __m128i c = a & b;
2093         __m128i d = ~a & b;
2094         int[4] zero = [0, 0, 0, 0];
2095         return !( (c.array == zero) || (d.array == zero));
2096     }    
2097 }
2098 unittest
2099 {
2100     __m128i A  = _mm_setr_epi32(0x01, 0x02, 0x04, 0xf8);
2101     __m128i M  = _mm_setr_epi32(0x01, 0x40, 0x00, 0x00);
2102     __m128i Z = _mm_setzero_si128();
2103     assert(_mm_testnzc_si128(A, Z) == 0);
2104     assert(_mm_testnzc_si128(A, M) == 1);
2105     assert(_mm_testnzc_si128(A, A) == 0);
2106 }
2107 
/// Compute the bitwise AND of 128 bits (representing integer data) in `a` and `b`, 
2109 /// and return 1 if the result is zero, otherwise return 0.
2110 /// In other words, test if all bits masked by `b` are 0 in `a`.
2111 int _mm_testz_si128 (__m128i a, __m128i b) @trusted
2112 {
2113     // PERF DMD
2114     static if (GDC_with_SSE41)
2115     {
2116         return __builtin_ia32_ptestz128(cast(long2)a, cast(long2)b);
2117     }
2118     else static if (LDC_with_SSE41)
2119     {
2120         return __builtin_ia32_ptestz128(cast(long2)a, cast(long2)b);
2121     }
2122     else static if (LDC_with_ARM64)
2123     {
        // Acceptable since LDC 1.8 -O2
2125         long2 s64 = vandq_s64(cast(long2)a, cast(long2)b);
2126         return !(vgetq_lane_s64(s64, 0) | vgetq_lane_s64(s64, 1));
2127     }
2128     else 
2129     {
2130         __m128i c = a & b;
2131         int[4] zero = [0, 0, 0, 0];
2132         return c.array == zero;
2133     }    
2134 }
2135 unittest
2136 {
2137     __m128i A  = _mm_setr_epi32(0x01, 0x02, 0x04, 0xf8);
2138     __m128i M1 = _mm_setr_epi32(0xfe, 0xfd, 0x00, 0x07);
2139     __m128i M2 = _mm_setr_epi32(0x00, 0x00, 0x04, 0x00);
2140     assert(_mm_testz_si128(A, A) == 0);
2141     assert(_mm_testz_si128(A, M1) == 1);
2142     assert(_mm_testz_si128(A, M2) == 0);
2143 }
2144