1 /**
2 * SSE4.1 intrinsics.
3 *
4 * Copyright: Guillaume Piolat 2021.
5 *            Johan Engelen 2021.
6 * License:   $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
7 */
8 module inteli.smmintrin;
9 
10 // SSE4.1 instructions
11 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#techs=SSE4_1
12 // Note: this header will work whether you have SSE4.1 enabled or not.
13 // With LDC, use "dflags-ldc": ["-mattr=+sse4.1"] or equivalent to actively
14 // generate SSE4.1 instructions.
15 
16 public import inteli.types;
17 import inteli.internals;
18 
19 // smmintrin pulls in all previous instruction set intrinsics.
20 public import inteli.tmmintrin;
21 
22 nothrow @nogc:
23 
24 enum int _MM_FROUND_TO_NEAREST_INT = 0x00; /// SSE4.1 rounding modes
25 enum int _MM_FROUND_TO_NEG_INF     = 0x01; /// ditto
26 enum int _MM_FROUND_TO_POS_INF     = 0x02; /// ditto
27 enum int _MM_FROUND_TO_ZERO        = 0x03; /// ditto
28 enum int _MM_FROUND_CUR_DIRECTION  = 0x04; /// ditto
29 enum int _MM_FROUND_RAISE_EXC      = 0x00; /// ditto
30 enum int _MM_FROUND_NO_EXC         = 0x08; /// ditto
31 
32 enum int _MM_FROUND_NINT      = (_MM_FROUND_RAISE_EXC | _MM_FROUND_TO_NEAREST_INT);
33 enum int _MM_FROUND_FLOOR     = (_MM_FROUND_RAISE_EXC | _MM_FROUND_TO_NEG_INF);
34 enum int _MM_FROUND_CEIL      = (_MM_FROUND_RAISE_EXC | _MM_FROUND_TO_POS_INF);
35 enum int _MM_FROUND_TRUNC     = (_MM_FROUND_RAISE_EXC | _MM_FROUND_TO_ZERO);
36 enum int _MM_FROUND_RINT      = (_MM_FROUND_RAISE_EXC | _MM_FROUND_CUR_DIRECTION);
37 enum int _MM_FROUND_NEARBYINT = (_MM_FROUND_NO_EXC    | _MM_FROUND_CUR_DIRECTION);
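// For reference: `_mm_round_ps!(_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)(a)` rounds each
// lane to the nearest integer without raising a precision exception, while the
// `_MM_FROUND_CUR_DIRECTION` variants round according to the current MXCSR rounding mode.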
38 
39 /// Blend packed 16-bit integers from `a` and `b` using control mask `imm8`, and store the results.
40 // Note: changed signature, GDC needs a compile-time value for imm8.
41 __m128i _mm_blend_epi16(int imm8)(__m128i a, __m128i b) @trusted
42 {
43     // PERF DMD
44     static if (GDC_with_SSE41)
45     {
46         return cast(__m128i) __builtin_ia32_pblendw128(cast(short8)a, cast(short8)b, imm8);
47     }
48     else 
49     {
        // LDC x86: this generates pblendw since LDC 1.1 -O2
51         short8 r;
52         short8 sa = cast(short8)a;
53         short8 sb = cast(short8)b;
54         for (int n = 0; n < 8; ++n)
55         {
56             r.ptr[n] = (imm8 & (1 << n)) ? sb.array[n] : sa.array[n];
57         }
58         return cast(__m128i)r;
59     }
60 }
61 unittest
62 {
63     __m128i A = _mm_setr_epi16(0, 1,  2,  3,  4,  5,  6,  7);
64     __m128i B = _mm_setr_epi16(8, 9, 10, 11, 12, 13, 14, 15);
65     short8 C = cast(short8) _mm_blend_epi16!147(A, B); // 10010011
66     short[8] correct =        [8, 9,  2,  3, 12,  5,  6, 15];
67     assert(C.array == correct);
68 }
69 
70 
71 /// Blend packed double-precision (64-bit) floating-point elements from `a` and `b` using control mask `imm8`.
72 // Note: changed signature, GDC needs a compile-time value for `imm8`.
73 __m128d _mm_blend_pd(int imm8)(__m128d a, __m128d b) @trusted
74 {
75     static assert(imm8 >= 0 && imm8 < 4);
76     // PERF DMD
77     static if (GDC_with_SSE41)
78     {
79         return cast(double2) __builtin_ia32_blendpd(cast(double2)a, cast(double2)b, imm8);
80     }
81     else
82     {
        // LDC x86: blendpd since LDC 1.1 -O2, uses blendps after LDC 1.12
84         double2 r;
85         for (int n = 0; n < 2; ++n)
86         {
87             r.ptr[n] = (imm8 & (1 << n)) ? b.array[n] : a.array[n];
88         }
89         return cast(__m128d)r;
90     }
91 }
92 unittest
93 {
94     __m128d A = _mm_setr_pd(0, 1);
95     __m128d B = _mm_setr_pd(8, 9);
96     double2 C = _mm_blend_pd!2(A, B);
97     double[2] correct =    [0, 9];
98     assert(C.array == correct);
99 }
100 
101 
102 /// Blend packed single-precision (32-bit) floating-point elements from `a` and `b` using control mask `imm8`.
103 // Note: changed signature, GDC needs a compile-time value for imm8.
104 __m128 _mm_blend_ps(int imm8)(__m128 a, __m128 b) @trusted
105 {
106     // PERF DMD
107     static assert(imm8 >= 0 && imm8 < 16);
108     static if (GDC_with_SSE41)
109     {
110         return __builtin_ia32_blendps(a, b, imm8);
111     }
112     else version(LDC)
113     {
114         // LDC x86: generates blendps since LDC 1.1 -O2
115         //   arm64: pretty good, two instructions worst case
116         return shufflevector!(float4, (imm8 & 1) ? 4 : 0,
117                                       (imm8 & 2) ? 5 : 1,
118                                       (imm8 & 4) ? 6 : 2,
119                                       (imm8 & 8) ? 7 : 3)(a, b);
120     }
121     else
122     {
123         __m128 r;
124         for (int n = 0; n < 4; ++n)
125         {
126             r.ptr[n] = (imm8 & (1 << n)) ? b.array[n] : a.array[n];
127         }
128         return r;
129     }
130 }
131 unittest
132 {
133     __m128 A = _mm_setr_ps(0, 1,  2,  3);
134     __m128 B = _mm_setr_ps(8, 9, 10, 11);
135     float4 C = cast(float4) _mm_blend_ps!13(A, B); // 1101
136     float[4] correct =    [8, 1, 10, 11];
137     assert(C.array == correct);
138 }
139 
140 /// Blend packed 8-bit integers from `a` and `b` using `mask`.
141 __m128i _mm_blendv_epi8 (__m128i a, __m128i b, __m128i mask) @trusted
142 {
143     // PERF DMD
144     static if (GDC_with_SSE41)
145     {
        return cast(__m128i) __builtin_ia32_pblendvb128(cast(byte16)a, cast(byte16)b, cast(byte16)mask);
147     }
148     else static if (LDC_with_SSE41)
149     {
        return cast(__m128i) __builtin_ia32_pblendvb128(cast(byte16)a, cast(byte16)b, cast(byte16)mask);
151     }
152     else static if (LDC_with_ARM64)
153     {
154         // LDC arm64: two instructions since LDC 1.12 -O2
155         byte16 maskSX = vshrq_n_s8(cast(byte16)mask, 7);
156         return cast(__m128i) vbslq_s8(maskSX, cast(byte16)b, cast(byte16)a);
157     }
158     else
159     {
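        // SSE2 fallback. Note on the trick below: `m` is 0xFF in each byte where `mask` is
        // negative and 0x00 elsewhere; `_mm_subs_epu8(a ^ b, m)` saturates to zero wherever
        // `m` is 0xFF and keeps `a ^ b` elsewhere, so xoring with `b` yields `b` for selected
        // bytes and `a` for the rest.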
160         __m128i m = _mm_cmpgt_epi8(_mm_setzero_si128(), mask);
161         return _mm_xor_si128(_mm_subs_epu8(_mm_xor_si128(a, b), m), b);
162     }
163 }
164 unittest
165 {
166     __m128i A = _mm_setr_epi8( 0,  1,  2,  3,  4,  5,  6,  7,  
167                                8,  9, 10, 11, 12, 13, 14, 15);
168     __m128i B = _mm_setr_epi8(16, 17, 18, 19, 20, 21, 22, 23, 
169                               24, 25, 26, 27, 28, 29, 30, 31);
170     __m128i M = _mm_setr_epi8( 1, -1,  1,  1, -4,  1, -8,  127,  
171                                1,  1, -1, -1,  4,  1,  8, -128);
172     byte16 R = cast(byte16) _mm_blendv_epi8(A, B, M);
173     byte[16] correct =      [  0, 17,  2,  3, 20,  5, 22,  7,
174                                8,  9, 26, 27, 12, 13, 14, 31 ];
175     assert(R.array == correct);
176 }
177 
178 
179 /// Blend packed double-precision (64-bit) floating-point elements from `a` and `b` using `mask`.
180 __m128d _mm_blendv_pd (__m128d a, __m128d b, __m128d mask) @trusted
181 {
182     // PERF DMD
183     static if (GDC_with_SSE42)
184     {
        // Amazingly enough, GCC/GDC only generates the blendvpd instruction
        // with -msse4.2, not with -msse4.1; otherwise a replacement sequence is emitted.
        // The reason is unclear; it looks like a GCC bug.
189         return __builtin_ia32_blendvpd(a, b, mask);
190     }
191     else static if (LDC_with_SSE41)
192     {
193         return __builtin_ia32_blendvpd(a, b, mask);
194     }
195     else static if (LDC_with_ARM64)
196     {
197         long2 shift;
198         shift = 63;
199         long2 lmask = cast(long2)mask >> shift;
200         return cast(__m128d) vbslq_s64(lmask, cast(long2)b, cast(long2)a);
201     }
202     else
203     {
204         __m128d r;
205         long2 lmask = cast(long2)mask;
206         for (int n = 0; n < 2; ++n)
207         {
208             r.ptr[n] = (lmask.array[n] < 0) ? b.array[n] : a.array[n];
209         }
210         return r;
211     }
212 }
213 unittest
214 {
215     __m128d A = _mm_setr_pd(1.0, 2.0);
216     __m128d B = _mm_setr_pd(3.0, 4.0);
217     __m128d M1 = _mm_setr_pd(-3.0, 2.0);
218     __m128d R1 = _mm_blendv_pd(A, B, M1);
219     double[2] correct1 = [3.0, 2.0];
220     assert(R1.array == correct1);
221 
    // BUG: for some unknown reason, LDC _mm_blendv_pd doesn't work with a NaN mask on arm64 Linux,
    // although it does work on arm64 macOS.
    // It yields different results even though FP arithmetic is seemingly not involved.
225     version(linux)
226     {}
227     else
228     {
229         __m128d M2 = _mm_setr_pd(double.nan, -double.nan);
230         __m128d R2 = _mm_blendv_pd(A, B, M2);
231         double[2] correct2 = [1.0, 4.0];
232         assert(R2.array == correct2);
233     }
234 }
235 
236 
237 /// Blend packed single-precision (32-bit) floating-point elements from `a` and `b` using `mask`.
238 __m128 _mm_blendv_ps (__m128 a, __m128 b, __m128 mask) @trusted
239 {
240     // PERF DMD
241     static if (GDC_with_SSE41)
242     {
243         return __builtin_ia32_blendvps(a, b, mask);
244     }
245     else static if (LDC_with_SSE41)
246     {
247         return __builtin_ia32_blendvps(a, b, mask);
248     }
249     else static if (LDC_with_ARM64)
250     {
251         int4 shift;
252         shift = 31;
253         int4 lmask = cast(int4)mask >> shift;
254         return cast(__m128) vbslq_s32(lmask, cast(int4)b, cast(int4)a);
255     }
256     else
257     {
258         __m128 r;
259         int4 lmask = cast(int4)mask;
260         for (int n = 0; n < 4; ++n)
261         {
262             r.ptr[n] = (lmask.array[n] < 0) ? b.array[n] : a.array[n];
263         }
264         return r;
265     }
266 }
267 unittest
268 {
269     __m128 A  = _mm_setr_ps( 0.0f, 1.0f, 2.0f, 3.0f);
270     __m128 B  = _mm_setr_ps( 4.0f, 5.0f, 6.0f, 7.0f);
271     __m128 M1 = _mm_setr_ps(-3.0f, 2.0f, 1.0f, -10000.0f);
272     __m128 M2 = _mm_setr_ps(float.nan, -float.nan, -0.0f, +0.0f);
273     __m128 R1 = _mm_blendv_ps(A, B, M1);
274     __m128 R2 = _mm_blendv_ps(A, B, M2);
275     float[4] correct1 =    [ 4.0f, 1.0f, 2.0f, 7.0f];
276     float[4] correct2 =    [ 0.0f, 5.0f, 6.0f, 3.0f];
277     assert(R1.array == correct1);
278 
    // BUG: as above, LDC _mm_blendv_ps doesn't work with a NaN mask on arm64 Linux for some unknown reason.
    // It yields different results even though FP arithmetic is seemingly not involved.
281     version(linux)
282     {}
283     else
284     {
285         assert(R2.array == correct2);
286     }
287 }
288 
289 /// Round the packed double-precision (64-bit) floating-point elements in `a` up to an integer value, 
290 /// and store the results as packed double-precision floating-point elements.
291 __m128d _mm_ceil_pd (__m128d a) @trusted
292 {
293     static if (LDC_with_ARM64)
294     {
295         // LDC arm64 acceptable since 1.8 -O2
296         // Unfortunately x86 intrinsics force a round-trip back to double2
297         // ARM neon semantics wouldn't have that
298         long2 l = vcvtpq_s64_f64(a);
299         double2 r;
300         r.ptr[0] = l.array[0];
301         r.ptr[1] = l.array[1];
302         return r;
303     }
304     else
305     {
306         return _mm_round_pd!2(a);
307     }
308 }
309 unittest
310 {
311     __m128d A = _mm_setr_pd(1.3f, -2.12f);
312     __m128d B = _mm_setr_pd(53.6f, -2.7f);
313     A = _mm_ceil_pd(A);
314     B = _mm_ceil_pd(B);
315     double[2] correctA = [2.0, -2.0];
316     double[2] correctB = [54.0, -2.0];
317     assert(A.array == correctA);
318     assert(B.array == correctB);
319 }
320 
321 /// Round the packed single-precision (32-bit) floating-point elements in `a` up to an integer value, 
322 /// and store the results as packed single-precision floating-point elements.
323 __m128 _mm_ceil_ps (__m128 a) @trusted
324 {
325     static if (LDC_with_ARM64)
326     {
327         // LDC arm64 acceptable since 1.8 -O1
328         int4 l = vcvtpq_s32_f32(a);
329         float4 r;
330         r.ptr[0] = l.array[0];
331         r.ptr[1] = l.array[1];
332         r.ptr[2] = l.array[2];
333         r.ptr[3] = l.array[3];
334         return r;
335     }
336     else
337     {
338         return _mm_round_ps!2(a);
339     }
340 }
341 unittest
342 {
343     __m128 A = _mm_setr_ps(1.3f, -2.12f, 53.6f, -2.7f);
344     __m128 C = _mm_ceil_ps(A);
345     float[4] correct = [2.0f, -2.0f, 54.0f, -2.0f];
346     assert(C.array == correct);
347 }
348 
349 /// Round the lower double-precision (64-bit) floating-point element in `b` up to an integer value, 
350 /// store the result as a double-precision floating-point element in the lower element of result, 
351 /// and copy the upper element from `a` to the upper element of dst.
352 __m128d _mm_ceil_sd (__m128d a, __m128d b) @trusted
353 {
354     static if (LDC_with_ARM64)
355     {
356         a[0] = vcvtps_s64_f64(b[0]);
357         return a;
358     }
359     else
360     {
361         return _mm_round_sd!2(a, b);
362     }
363 }
364 unittest
365 {
366     __m128d A = _mm_setr_pd(1.3, -2.12);
367     __m128d B = _mm_setr_pd(53.6, -3.7);
368     __m128d C = _mm_ceil_sd(A, B);
369     double[2] correct = [54.0, -2.12];
370     assert(C.array == correct);
371 }
372 
373 /// Round the lower single-precision (32-bit) floating-point element in `b` up to an integer value,
374 /// store the result as a single-precision floating-point element in the lower element of result, 
375 /// and copy the upper 3 packed elements from `a` to the upper elements of result.
376 __m128 _mm_ceil_ss (__m128 a, __m128 b) @trusted
377 {
378     static if (LDC_with_ARM64)
379     {
380         a[0] = vcvtps_s32_f32(b[0]);
381         return a;
382     }
383     else
384     {
385         return _mm_round_ss!2(a, b);
386     }
387 }
388 unittest
389 {
390     __m128 A = _mm_setr_ps(1.3f, -2.12f, -4.5f, 1.1f);
391     __m128 B = _mm_setr_ps(53.6f, -3.7f, 8.0f, 7.0f);
392     __m128 C = _mm_ceil_ss(A, B);
393     float[4] correct = [54.0f, -2.12f, -4.5f, 1.1f];
394     assert(C.array == correct);
395 }
396 
397 /// Compare packed 64-bit integers in `a` and `b` for equality.
398 __m128i _mm_cmpeq_epi64 (__m128i a, __m128i b) @trusted
399 {
400     // PERF DMD
401     static if (GDC_with_SSE41)
402     {
403         return cast(__m128i)__builtin_ia32_pcmpeqq(cast(long2)a, cast(long2)b);
404     }
405     else version(LDC)
406     {
407         // LDC x86: generates pcmpeqq since LDC 1.1 -O1
408         //     arm64: generates cmeq since LDC 1.8 -O1
409         return cast(__m128i) equalMask!long2(cast(long2)a, cast(long2)b);
410     }
411     else
412     {
413         // Clever pcmpeqd + pand use with LDC 1.24 -O2
414         long2 la = cast(long2)a;
415         long2 lb = cast(long2)b;
416         long2 res;
417         res.ptr[0] = (la.array[0] == lb.array[0]) ? -1 : 0;
418         res.ptr[1] = (la.array[1] == lb.array[1]) ? -1 : 0;
419         return cast(__m128i)res;
420     }
421 }
422 unittest
423 {
424     __m128i A = _mm_setr_epi64(-1, -2);
425     __m128i B = _mm_setr_epi64(-3, -2);
426     __m128i C = _mm_setr_epi64(-1, -4);
427     long2 AB = cast(long2) _mm_cmpeq_epi64(A, B);
428     long2 AC = cast(long2) _mm_cmpeq_epi64(A, C);
429     long[2] correct1 = [0, -1];
430     long[2] correct2 = [-1, 0];
431     assert(AB.array == correct1);
432     assert(AC.array == correct2);
433 }
434 
435 
436 /// Sign extend packed 16-bit integers in `a` to packed 32-bit integers.
437 __m128i _mm_cvtepi16_epi32 (__m128i a) @trusted
438 {
439     // PERF DMD
440     static if (GDC_with_SSE41)
441     {
442         return cast(__m128i)__builtin_ia32_pmovsxwd128(cast(short8)a);
443     }
444     else version(LDC)
445     {
446         // LDC x86: Generates pmovsxwd since LDC 1.1 -O0, also good in arm64
447         enum ir = `
448             %v = shufflevector <8 x i16> %0,<8 x i16> %0, <4 x i32> <i32 0, i32 1,i32 2, i32 3>
449             %r = sext <4 x i16> %v to <4 x i32>
450             ret <4 x i32> %r`;
        return cast(__m128i) LDCInlineIR!(ir, int4, short8)(cast(short8)a);
452     }
453     else
454     {
455         short8 sa = cast(short8)a;
456         int4 r;
457         r.ptr[0] = sa.array[0];
458         r.ptr[1] = sa.array[1];
459         r.ptr[2] = sa.array[2];
460         r.ptr[3] = sa.array[3];
461         return r;
462     }
463 }
464 unittest
465 {
466     __m128i A = _mm_setr_epi16(-1, 0, -32768, 32767, 0, 0, 0, 0);
467     int4 C = cast(int4) _mm_cvtepi16_epi32(A);
468     int[4] correct = [-1, 0, -32768, 32767];
469     assert(C.array == correct);
470 }
471 
472 /// Sign extend packed 16-bit integers in `a` to packed 64-bit integers.
473 __m128i _mm_cvtepi16_epi64 (__m128i a) @trusted
474 {
475     // PERF DMD
476     static if (GDC_with_SSE41)
477     {
478         return cast(__m128i)__builtin_ia32_pmovsxwq128(cast(short8)a);
479     }
480     else version(LDC)
481     {
482         // LDC x86: Generates pmovsxwq since LDC 1.1 -O0, also good in arm64
483         enum ir = `
484             %v = shufflevector <8 x i16> %0,<8 x i16> %0, <2 x i32> <i32 0, i32 1>
485             %r = sext <2 x i16> %v to <2 x i64>
486             ret <2 x i64> %r`;
487         return cast(__m128i) LDCInlineIR!(ir, long2, short8)(cast(short8)a);
488     }
489     else
490     {
491         short8 sa = cast(short8)a;
492         long2 r;
493         r.ptr[0] = sa.array[0];
494         r.ptr[1] = sa.array[1];
495         return cast(__m128i)r;
496     }
497 }
498 unittest
499 {
500     __m128i A = _mm_setr_epi16(-32768, 32767, 0, 0, 0, 0, 0, 0);
501     long2 C = cast(long2) _mm_cvtepi16_epi64(A);
502     long[2] correct = [-32768, 32767];
503     assert(C.array == correct);
504 }
505 
506 /// Sign extend packed 32-bit integers in `a` to packed 64-bit integers.
507 __m128i _mm_cvtepi32_epi64 (__m128i a) @trusted
508 {
509     // PERF DMD
510     static if (GDC_with_SSE41)
511     {
512         return cast(__m128i)__builtin_ia32_pmovsxdq128(cast(int4)a);
513     }
514     else version(LDC)
515     {
516         // LDC x86: Generates pmovsxdq since LDC 1.1 -O0, also good in arm64
517         enum ir = `
518             %v = shufflevector <4 x i32> %0,<4 x i32> %0, <2 x i32> <i32 0, i32 1>
519             %r = sext <2 x i32> %v to <2 x i64>
520             ret <2 x i64> %r`;
521         return cast(__m128i) LDCInlineIR!(ir, long2, int4)(cast(int4)a);
522     }
523     else
524     {
525         int4 sa = cast(int4)a;
526         long2 r;
527         r.ptr[0] = sa.array[0];
528         r.ptr[1] = sa.array[1];
529         return cast(__m128i)r;
530     }
531 }
532 unittest
533 {
534     __m128i A = _mm_setr_epi32(-4, 42, 0, 0);
535     long2 C = cast(long2) _mm_cvtepi32_epi64(A);
536     long[2] correct = [-4, 42];
537     assert(C.array == correct);
538 }
539 
540 
541 /// Sign extend packed 8-bit integers in `a` to packed 16-bit integers.
542 __m128i _mm_cvtepi8_epi16 (__m128i a) @trusted
543 {
544     // PERF DMD
545     static if (GDC_with_SSE41)
546     {
547         alias ubyte16 = __vector(ubyte[16]);
548         return cast(__m128i)__builtin_ia32_pmovsxbw128(cast(ubyte16)a);
549     }
550     else version(LDC)
551     {
552         // LDC x86: pmovsxbw generated since LDC 1.1.0 -O0 
553         // LDC ARM64: sshll generated since LDC 1.8.0 -O1
554         enum ir = `
555             %v = shufflevector <16 x i8> %0,<16 x i8> %0, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
556             %r = sext <8 x i8> %v to <8 x i16>
557             ret <8 x i16> %r`;
558         return cast(__m128i) LDCInlineIR!(ir, short8, byte16)(cast(byte16)a);
559     }
560     else
561     {
562         byte16 sa = cast(byte16)a;
563         short8 r;
564         foreach(n; 0..8)
565             r.ptr[n] = sa.array[n];
566         return cast(__m128i)r;
567     }
568 }
569 unittest
570 {
571     __m128i A = _mm_setr_epi8(127, -128, 1, -1, 0, 2, -4, -8, 0, 0, 0, 0, 0, 0, 0, 0);
572     short8 C = cast(short8) _mm_cvtepi8_epi16(A);
573     short[8] correct = [127, -128, 1, -1, 0, 2, -4, -8];
574     assert(C.array == correct);
575 }
576 
577 
578 /// Sign extend packed 8-bit integers in `a` to packed 32-bit integers.
579 __m128i _mm_cvtepi8_epi32 (__m128i a) @trusted
580 {
581     // PERF DMD
582     static if (GDC_with_SSE41)
583     {
584         alias ubyte16 = __vector(ubyte[16]);
585         return cast(__m128i)__builtin_ia32_pmovsxbd128(cast(ubyte16)a);
586     }
587     else static if (LDC_with_SSE41)
588     {
589         // LDC x86: Generates pmovsxbd since LDC 1.1 -O0
590         enum ir = `
591             %v = shufflevector <16 x i8> %0,<16 x i8> %0, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
592             %r = sext <4 x i8> %v to <4 x i32>
593             ret <4 x i32> %r`;
594         return cast(__m128i) LDCInlineIR!(ir, int4, byte16)(cast(byte16)a);
595     }
596     else
597     {
        // LDC ARM64: this gives the same codegen as a vmovl_s16/vmovl_s8 sequence would
599         byte16 sa = cast(byte16)a;
600         int4 r;
601         r.ptr[0] = sa.array[0];
602         r.ptr[1] = sa.array[1];
603         r.ptr[2] = sa.array[2];
604         r.ptr[3] = sa.array[3];
605         return cast(__m128i)r;
606     }
607 }
608 unittest
609 {
610     __m128i A = _mm_setr_epi8(127, -128, 1, -1, 0, 2, -4, -8, 0, 0, 0, 0, 0, 0, 0, 0);
611     int4 C = cast(int4) _mm_cvtepi8_epi32(A);
612     int[4] correct = [127, -128, 1, -1];
613     assert(C.array == correct);
614 }
615 
616 
617 /// Sign extend packed 8-bit integers in the low 8 bytes of `a` to packed 64-bit integers.
618 __m128i _mm_cvtepi8_epi64 (__m128i a) @trusted
619 {
620     // PERF DMD
621     static if (GDC_with_SSE41)
622     {
623         alias ubyte16 = __vector(ubyte[16]);
624         return cast(__m128i)__builtin_ia32_pmovsxbq128(cast(ubyte16)a);
625     }
626     else version(LDC)
627     {
628         // LDC x86: Generates pmovsxbq since LDC 1.1 -O0, 
629         // LDC arm64: it's ok since LDC 1.8 -O1
630         enum ir = `
631             %v = shufflevector <16 x i8> %0,<16 x i8> %0, <2 x i32> <i32 0, i32 1>
632             %r = sext <2 x i8> %v to <2 x i64>
633             ret <2 x i64> %r`;
634         return cast(__m128i) LDCInlineIR!(ir, long2, byte16)(cast(byte16)a);
635     }
636     else
637     {
638         byte16 sa = cast(byte16)a;
639         long2 r;
640         foreach(n; 0..2)
641             r.ptr[n] = sa.array[n];
642         return cast(__m128i)r;
643     }
644 }
645 unittest
646 {
647     __m128i A = _mm_setr_epi8(127, -128, 1, -1, 0, 2, -4, -8, 0, 0, 0, 0, 0, 0, 0, 0);
648     long2 C = cast(long2) _mm_cvtepi8_epi64(A);
649     long[2] correct = [127, -128];
650     assert(C.array == correct);
651 }
652 
653 
654 /// Zero extend packed unsigned 16-bit integers in `a` to packed 32-bit integers.
655 __m128i _mm_cvtepu16_epi32 (__m128i a) @trusted
656 {
657     // PERF DMD
658     static if (GDC_with_SSE41)
659     {
660         return cast(__m128i) __builtin_ia32_pmovzxwd128(cast(short8)a);
661     }
662     else
663     {
664         // LDC x86: generates pmovzxwd since LDC 1.12 -O1 also good without SSE4.1
665         //     arm64: ushll since LDC 1.12 -O1
666         short8 sa = cast(short8)a;
667         int4 r;
668         r.ptr[0] = cast(ushort)sa.array[0];
669         r.ptr[1] = cast(ushort)sa.array[1];
670         r.ptr[2] = cast(ushort)sa.array[2];
671         r.ptr[3] = cast(ushort)sa.array[3];
672         return cast(__m128i)r;
673     }
674 }
675 unittest
676 {
677     __m128i A = _mm_setr_epi16(-1, 0, -32768, 32767, 0, 0, 0, 0);
678     int4 C = cast(int4) _mm_cvtepu16_epi32(A);
679     int[4] correct = [65535, 0, 32768, 32767];
680     assert(C.array == correct);
681 }
682 
683 
684 /// Zero extend packed unsigned 16-bit integers in `a` to packed 64-bit integers.
685 __m128i _mm_cvtepu16_epi64 (__m128i a) @trusted
686 {
687     // PERF DMD
688     static if (GDC_with_SSE41)
689     {
690         return cast(__m128i) __builtin_ia32_pmovzxwq128(cast(short8)a);
691     }
692     else static if (LDC_with_ARM64)
693     {
694         // LDC arm64: a bit shorter than below, in -O2
695         short8 sa = cast(short8)a;
696         long2 r;
697         for(int n = 0; n < 2; ++n)
698             r.ptr[n] = cast(ushort)sa.array[n];
699         return cast(__m128i)r;
700     }
701     else
702     {
703         // LDC x86: generates pmovzxwd since LDC 1.12 -O1 also good without SSE4.1
704         short8 sa = cast(short8)a;
705         long2 r;
706         r.ptr[0] = cast(ushort)sa.array[0];
707         r.ptr[1] = cast(ushort)sa.array[1];
708         return cast(__m128i)r;
709     }
710 }
711 unittest
712 {
713     __m128i A = _mm_setr_epi16(-1, 0, -32768, 32767, 0, 0, 0, 0);
714     long2 C = cast(long2) _mm_cvtepu16_epi64(A);
715     long[2] correct = [65535, 0];
716     assert(C.array == correct);
717 }
718 
719 
720 /// Zero extend packed unsigned 32-bit integers in `a` to packed 64-bit integers.
721 __m128i _mm_cvtepu32_epi64 (__m128i a) @trusted
722 {
723     // PERF DMD
724     static if (GDC_with_SSE41)
725     {
        return cast(__m128i) __builtin_ia32_pmovzxdq128(cast(int4)a);
727     }
728     else
729     {
730         // LDC x86: generates pmovzxdq since LDC 1.12 -O1 also good without SSE4.1
731         //     arm64: generates ushll since LDC 1.12 -O1
732         int4 sa = cast(int4)a;
733         long2 r;
734         r.ptr[0] = cast(uint)sa.array[0];
735         r.ptr[1] = cast(uint)sa.array[1];
736         return cast(__m128i)r;
737     }
738 }
739 unittest
740 {
741     __m128i A = _mm_setr_epi32(-1, 42, 0, 0);
742     long2 C = cast(long2) _mm_cvtepu32_epi64(A);
743     long[2] correct = [4294967295, 42];
744     assert(C.array == correct);
745 }
746 
747 
748 /// Zero extend packed unsigned 8-bit integers in `a` to packed 16-bit integers.
749 __m128i _mm_cvtepu8_epi16 (__m128i a) @trusted
750 {
751     // PERF DMD
752     static if (GDC_with_SSE41)
753     {
        alias ubyte16 = __vector(ubyte[16]);
        return cast(__m128i) __builtin_ia32_pmovzxbw128(cast(ubyte16)a);
755     }
756     else
757     {
758         // LDC x86: generates pmovzxbw since LDC 1.12 -O1 also good without SSE4.1
759         //     arm64: ushll since LDC 1.12 -O1
760         // PERF: catastrophic with GDC without SSE4.1
761         byte16 sa = cast(byte16)a;
762         short8 r;
763         r.ptr[0] = cast(ubyte)sa.array[0];
764         r.ptr[1] = cast(ubyte)sa.array[1];
765         r.ptr[2] = cast(ubyte)sa.array[2];
766         r.ptr[3] = cast(ubyte)sa.array[3];
767         r.ptr[4] = cast(ubyte)sa.array[4];
768         r.ptr[5] = cast(ubyte)sa.array[5];
769         r.ptr[6] = cast(ubyte)sa.array[6];
770         r.ptr[7] = cast(ubyte)sa.array[7];
771         return cast(__m128i)r;
772     }
773 }
774 unittest
775 {
776     __m128i A = _mm_setr_epi8(127, -128, 1, -1, 0, 2, -4, -8, 0, 0, 0, 0, 0, 0, 0, 0);
777     short8 C = cast(short8) _mm_cvtepu8_epi16(A);
778     short[8] correct = [127, 128, 1, 255, 0, 2, 252, 248];
779     assert(C.array == correct);
780 }
781 
782 
783 /// Zero extend packed unsigned 8-bit integers in `a` to packed 32-bit integers.
784 __m128i _mm_cvtepu8_epi32 (__m128i a) @trusted
785 {
786     // PERF DMD
787     static if (GDC_with_SSE41)
788     {
789         alias ubyte16 = __vector(ubyte[16]);
790         return cast(__m128i) __builtin_ia32_pmovzxbd128(cast(ubyte16)a);
791     }
792     else static if (LDC_with_ARM64)
793     {
794         // LDC arm64: a bit better than below in -O2
795         byte16 sa = cast(byte16)a;
796         int4 r;
797         for(int n = 0; n < 4; ++n) 
798             r.ptr[n] = cast(ubyte)sa.array[n];
799         return cast(__m128i)r;
800     }
801     else
802     {
803         // LDC x86: generates pmovzxbd since LDC 1.12 -O1 also good without SSE4.1
804         // PERF: catastrophic with GDC without SSE4.1
805         byte16 sa = cast(byte16)a;
806         int4 r;
807         r.ptr[0] = cast(ubyte)sa.array[0];
808         r.ptr[1] = cast(ubyte)sa.array[1];
809         r.ptr[2] = cast(ubyte)sa.array[2];
810         r.ptr[3] = cast(ubyte)sa.array[3];
811         return cast(__m128i)r;
812     }
813 }
814 unittest
815 {
816     __m128i A = _mm_setr_epi8(127, -128, 1, -1, 0, 2, -4, -8, 0, 0, 0, 0, 0, 0, 0, 0);
817     int4 C = cast(int4) _mm_cvtepu8_epi32(A);
818     int[4] correct = [127, 128, 1, 255];
819     assert(C.array == correct);
820 }
821 
822 /// Zero extend packed unsigned 8-bit integers in the low 8 bytes of `a` to packed 64-bit integers.
823 __m128i _mm_cvtepu8_epi64 (__m128i a) @trusted
824 {
825     // PERF DMD
826     static if (GDC_with_SSE41)
827     {
828         alias ubyte16 = __vector(ubyte[16]);
829         return cast(__m128i)__builtin_ia32_pmovzxbq128(cast(ubyte16)a);
830     }
831     else static if (LDC_with_ARM64)
832     {
833         // LDC arm64: this optimizes better than the loop below
834         byte16 sa = cast(byte16)a;
835         long2 r;
836         for (int n = 0; n < 2; ++n)
837             r.ptr[n] = cast(ubyte)sa.array[n];
838         return cast(__m128i)r;
839     }
840     else
841     {
842         // LDC x86: Generates pmovzxbq since LDC 1.1 -O0, a pshufb without SSE4.1
843         byte16 sa = cast(byte16)a;
844         long2 r;
845         r.ptr[0] = cast(ubyte)sa.array[0];
846         r.ptr[1] = cast(ubyte)sa.array[1];
847         return cast(__m128i)r;
848     }
849 }
850 unittest
851 {
852     __m128i A = _mm_setr_epi8(127, -2, 1, -1, 0, 2, -4, -8, 0, 0, 0, 0, 0, 0, 0, 0);
853     long2 C = cast(long2) _mm_cvtepu8_epi64(A);
854     long[2] correct = [127, 254];
855     assert(C.array == correct);
856 }
857 
858 /// Conditionally multiply the packed double-precision (64-bit) floating-point elements 
859 /// in `a` and `b` using the high 4 bits in `imm8`, sum the four products, and conditionally
860 /// store the sum in dst using the low 4 bits of `imm8`.
861 __m128d _mm_dp_pd(int imm8)(__m128d a, __m128d b) @trusted
862 {
863     // PERF DMD
864     static if (GDC_with_SSE41)
865     {
866         return __builtin_ia32_dppd(a, b, imm8 & 0x33);
867     }
868     else static if (LDC_with_SSE41)
869     {
870         return __builtin_ia32_dppd(a, b, imm8 & 0x33);
871     }
872     else
873     {
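        // Portable fallback: bits 5:4 of `imm8` select which products enter the sum
        // (via a blend against zero), and bits 1:0 select which result lanes receive it.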
874         __m128d zero = _mm_setzero_pd();
875         __m128d temp = _mm_blend_pd!( (imm8 >>> 4) & 3)(zero, a * b);
876         double sum = temp.array[0] + temp.array[1];
877         return _mm_blend_pd!(imm8 & 3)(zero, _mm_set1_pd(sum));
878     }
879 }
880 unittest
881 {
882     __m128d A = _mm_setr_pd(1.0, 2.0);
883     __m128d B = _mm_setr_pd(4.0, 8.0);
884     double2 R1 = _mm_dp_pd!(0x10 + 0x3 + 0x44)(A, B);
885     double2 R2 = _mm_dp_pd!(0x20 + 0x1 + 0x88)(A, B);
886     double2 R3 = _mm_dp_pd!(0x30 + 0x2 + 0x00)(A, B);
887     double[2] correct1 = [ 4.0,  4.0];
888     double[2] correct2 = [16.0,  0.0];
889     double[2] correct3 = [ 0.0, 20.0];
890     assert(R1.array == correct1);
891     assert(R2.array == correct2);
892     assert(R3.array == correct3);
893 }
894 
895 /// Conditionally multiply the packed single-precision (32-bit) floating-point elements 
896 /// in `a` and `b` using the high 4 bits in `imm8`, sum the four products, 
897 /// and conditionally store the sum in result using the low 4 bits of `imm8`.
898 __m128 _mm_dp_ps(int imm8)(__m128 a, __m128 b) @trusted
899 {
900       // PERF DMD
901     static if (GDC_with_SSE41)
902     {
903         return __builtin_ia32_dpps(a, b, cast(byte)imm8);
904     }
905     else static if (LDC_with_SSE41)
906     {
907         return __builtin_ia32_dpps(a, b, cast(byte)imm8);
908     }
909     else
910     {
911         __m128 zero = _mm_setzero_ps();
912         __m128 temp = _mm_blend_ps!( (imm8 >>> 4) & 15)(zero, a * b);
913         float sum = temp.array[0] + temp.array[1] + temp.array[2] + temp.array[3];
914         return _mm_blend_ps!(imm8 & 15)(zero, _mm_set1_ps(sum));
915     }        
916 }
917 unittest
918 {
919     __m128 A = _mm_setr_ps(1.0f, 2.0f, 4.0f, 8.0f);
920     __m128 B = _mm_setr_ps(9.0f, 7.0f, 5.0f, 3.0f);
921     float4 R1 = _mm_dp_ps!(0xf0 + 0xf)(A, B);
922     float4 R2 = _mm_dp_ps!(0x30 + 0x5)(A, B);
923     float4 R3 = _mm_dp_ps!(0x50 + 0xa)(A, B);
924     float[4] correct1 =   [67.0f, 67.0f, 67.0f, 67.0f];
925     float[4] correct2 =   [23.0f, 0.0f, 23.0f, 0.0f];
926     float[4] correct3 =   [0.0f, 29.0f, 0.0f, 29.0f];
927     assert(R1.array == correct1);
928     assert(R2.array == correct2);
929     assert(R3.array == correct3);
930 }
931 
932 
933 /// Extract a 32-bit integer from `a`, selected with `imm8`.
934 int _mm_extract_epi32 (__m128i a, const int imm8) pure @trusted
935 {
936     return (cast(int4)a).array[imm8 & 3];
937 }
938 unittest
939 {
940     __m128i A = _mm_setr_epi32(1, 2, 3, 4);
941     assert(_mm_extract_epi32(A, 0) == 1);
942     assert(_mm_extract_epi32(A, 1 + 8) == 2);
943     assert(_mm_extract_epi32(A, 3 + 4) == 4);
944 }
945 
946 /// Extract a 64-bit integer from `a`, selected with `imm8`.
947 long _mm_extract_epi64 (__m128i a, const int imm8) pure @trusted
948 {
949     long2 la = cast(long2)a;
950     return la.array[imm8 & 1];
951 }
952 unittest
953 {
954     __m128i A = _mm_setr_epi64(45, -67);
955     assert(_mm_extract_epi64(A, 0) == 45);
956     assert(_mm_extract_epi64(A, 1) == -67);
957     assert(_mm_extract_epi64(A, 2) == 45);
958 }
959 
960 /// Extract an 8-bit integer from `a`, selected with `imm8`.
/// Warning: the returned value is zero-extended to 32 bits.
962 int _mm_extract_epi8 (__m128i a, const int imm8) @trusted
963 {
964     byte16 ba = cast(byte16)a;
965     return cast(ubyte) ba.array[imm8 & 15];
966 }
967 unittest
968 {
969     __m128i A = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, -1, 14, 15);
970     assert(_mm_extract_epi8(A, 7) == 7);
971     assert(_mm_extract_epi8(A, 13) == 255);
972     assert(_mm_extract_epi8(A, 7 + 16) == 7);
973 }
974 
975 /// Extract a single-precision (32-bit) floating-point element from `a`, selected with `imm8`.
976 /// Note: returns a 32-bit $(I integer).
977 int _mm_extract_ps (__m128 a, const int imm8) @trusted
978 {
979     return (cast(int4)a).array[imm8 & 3];
980 }
981 unittest
982 {
983     __m128 A = _mm_setr_ps(1.0f, 2.0f, 3.0f, -4.0f);
984     assert(_mm_extract_ps(A, 0) == 0x3f800000);
985     assert(_mm_extract_ps(A, 1 + 8) == 0x40000000);
986     assert(_mm_extract_ps(A, 3 + 4) == cast(int)0xc0800000);
987 }
988 
989 
990 
991 /// Round the packed double-precision (64-bit) floating-point elements in `a` down to an 
992 /// integer value, and store the results as packed double-precision floating-point elements.
993 __m128d _mm_floor_pd (__m128d a) @trusted
994 {
995     static if (LDC_with_ARM64)
996     {
997         // LDC arm64 acceptable since 1.8 -O2
998         long2 l = vcvtmq_s64_f64(a);
999         double2 r;
1000         r.ptr[0] = l.array[0];
1001         r.ptr[1] = l.array[1];
1002         return r;
1003     }
1004     else
1005     {
1006         return _mm_round_pd!1(a);
1007     }
1008 }
1009 unittest
1010 {
1011     __m128d A = _mm_setr_pd(1.3f, -2.12f);
1012     __m128d B = _mm_setr_pd(53.6f, -2.7f);
1013     A = _mm_floor_pd(A);
1014     B = _mm_floor_pd(B);
1015     double[2] correctA = [1.0, -3.0];
1016     double[2] correctB = [53.0, -3.0];
1017     assert(A.array == correctA);
1018     assert(B.array == correctB);
1019 }
1020 
1021 /// Round the packed single-precision (32-bit) floating-point elements in `a` down to an 
1022 /// integer value, and store the results as packed single-precision floating-point elements.
1023 __m128 _mm_floor_ps (__m128 a) @trusted
1024 {
1025     static if (LDC_with_ARM64)
1026     {
1027         // LDC arm64 acceptable since 1.8 -O1
1028         int4 l = vcvtmq_s32_f32(a);
1029         float4 r;
1030         r.ptr[0] = l.array[0];
1031         r.ptr[1] = l.array[1];
1032         r.ptr[2] = l.array[2];
1033         r.ptr[3] = l.array[3];
1034         return r;
1035     }
1036     else
1037     {
1038         return _mm_round_ps!1(a);
1039     }
1040 }
1041 unittest
1042 {
1043     __m128 A = _mm_setr_ps(1.3f, -2.12f, 53.6f, -2.7f);
1044     __m128 C = _mm_floor_ps(A);
1045     float[4] correct = [1.0f, -3.0f, 53.0f, -3.0f];
1046     assert(C.array == correct);
1047 }
1048 
1049 /// Round the lower double-precision (64-bit) floating-point element in `b` down to an 
1050 /// integer value, store the result as a double-precision floating-point element in the 
1051 /// lower element, and copy the upper element from `a` to the upper element.
1052 __m128d _mm_floor_sd (__m128d a, __m128d b) @trusted
1053 {
1054     static if (LDC_with_ARM64)
1055     {
1056         a[0] = vcvtms_s64_f64(b[0]);
1057         return a;
1058     }
1059     else
1060     {
1061         return _mm_round_sd!1(a, b);
1062     }
1063 }
1064 unittest
1065 {
1066     __m128d A = _mm_setr_pd(1.3, -2.12);
1067     __m128d B = _mm_setr_pd(-53.1, -3.7);
1068     __m128d C = _mm_floor_sd(A, B);
1069     double[2] correct = [-54.0, -2.12];
1070     assert(C.array == correct);
1071 }
1072 
1073 /// Round the lower single-precision (32-bit) floating-point element in `b` down to an
1074 /// integer value, store the result as a single-precision floating-point element in the
1075 /// lower element, and copy the upper 3 packed elements from `a` to the upper elements.
1076 __m128 _mm_floor_ss (__m128 a, __m128 b) @trusted
1077 {
1078     static if (LDC_with_ARM64)
1079     {
1080         a[0] = vcvtms_s32_f32(b[0]);
1081         return a;
1082     }
1083     else
1084     {
1085         return _mm_round_ss!1(a, b);
1086     }
1087 }
1088 unittest
1089 {
1090     __m128 A = _mm_setr_ps(1.3f, -2.12f, -4.5f, 1.1f);
1091     __m128 B = _mm_setr_ps(-539.3f, -3.7f, 8.0f, 7.0f);
1092     __m128 C = _mm_floor_ss(A, B);
1093     float[4] correct = [-540.0f, -2.12f, -4.5f, 1.1f];
1094     assert(C.array == correct);
1095 }
1096 
1097 /// Insert the 32-bit integer `i` into `a` at the location specified by `imm8[1:0]`.
1098 __m128i _mm_insert_epi32 (__m128i a, int i, const int imm8) pure @trusted
1099 {
1100     // GDC: nothing special to do, pinsrd generated with -O1 -msse4.1
    // LDC x86: pinsrd since LDC 1.1 -O2 with -mattr=+sse4.1
1102     // LDC arm64: ins.s since LDC 1.8 -O2
1103     int4 ia = cast(int4)a;
1104     ia.ptr[imm8 & 3] = i;
1105     return cast(__m128i)ia; 
1106 }
1107 unittest
1108 {
1109     __m128i A = _mm_setr_epi32(1, 2, 3, 4);
1110     int4 C = cast(int4) _mm_insert_epi32(A, 5, 2 + 4);
1111     int[4] result = [1, 2, 5, 4];
1112     assert(C.array == result);
1113 }
1114 
1115 /// Insert the 64-bit integer `i` into `a` at the location specified by `imm8[0]`.
1116 __m128i _mm_insert_epi64 (__m128i a, long i, const int imm8) pure @trusted
1117 {
    // GDC: nothing special to do, pinsrq generated with -O1 -msse4.1
    // LDC x86: always does something sensible.
1120     long2 la = cast(long2)a;
1121     la.ptr[imm8 & 1] = i;
1122     return cast(__m128i)la;
1123 }
1124 unittest
1125 {
1126     __m128i A = _mm_setr_epi64(1, 2);
1127     long2 C = cast(long2) _mm_insert_epi64(A, 5, 1 + 2);
1128     long[2] result = [1, 5];
1129     assert(C.array == result);
1130 }
1131 
/// Insert the lower 8 bits of integer `i` into `a` at the location specified by `imm8[3:0]`.
1134 __m128i _mm_insert_epi8 (__m128i a, int i, const int imm8) @trusted
1135 {
1136     // GDC: nothing special to do, pinsrb generated with -O1 -msse4.1
1137     // LDC x86: doesn't do pinsrb, maybe it's slower. arm64 also spills to memory.
1138     byte16 ba = cast(byte16)a;
1139     ba.ptr[imm8 & 15] = cast(byte)i;
1140     return cast(__m128i)ba; 
1141 }
1142 unittest
1143 {
1144     __m128i A = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
1145     byte16 C = cast(byte16) _mm_insert_epi8(A, 30, 4 + 16);
1146     byte[16] result = [0, 1, 2, 3, 30, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15];
1147     assert(C.array == result);
1148 }
1149 
1150 
1151 /// Warning: of course it does something totally different from `_mm_insert_epi32`!
1152 /// Copy `a` to `tmp`, then insert a single-precision (32-bit) floating-point element from `b` 
1153 /// into `tmp` using the control in `imm8`. Store `tmp` to result using the mask in `imm8[3:0]` 
1154 /// (elements are zeroed out when the corresponding bit is set).
1155 __m128 _mm_insert_ps(int imm8)(__m128 a, __m128 b) @trusted
1156 {
1157     // PERF DMD
1158     static if (GDC_with_SSE41)
1159     {
1160         return __builtin_ia32_insertps128(a, b, cast(byte)imm8);
1161     }
1162     else static if (LDC_with_SSE41)
1163     {
1164         return __builtin_ia32_insertps128(a, b, cast(byte)imm8);
1165     }
1166     else
1167     {
1168         float4 tmp2 = a;
1169         float tmp1 = b.array[(imm8 >> 6) & 3];
1170         tmp2.ptr[(imm8 >> 4) & 3] = tmp1;
1171         return _mm_blend_ps!(imm8 & 15)(tmp2, _mm_setzero_ps());
1172     }
1173 }
1174 unittest
1175 {
1176     __m128 A = _mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f);
1177     __m128 B = _mm_setr_ps(5.0f, 6.0f, 7.0f, 8.0f);
1178     __m128 C = _mm_insert_ps!(128 + (32 + 16) + 4)(A, B);
1179     float[4] correct =    [1.0f, 2.0f, 0.0f, 7.0f];
1180     assert(C.array == correct);
1181 }
1182 
1183 
1184 /// Compare packed signed 32-bit integers in `a` and `b`, returns packed maximum values.
1185 __m128i _mm_max_epi32 (__m128i a, __m128i b) @trusted
1186 {
1187     static if (GDC_with_SSE41)
1188     {
1189         return cast(__m128i) __builtin_ia32_pmaxsd128(cast(int4)a, cast(int4)b);
1190     }
1191     else version(LDC)
1192     {
1193         // x86: pmaxsd since LDC 1.1 -O1
        // ARM64: smax.4s since LDC 1.8 -O1
1195         int4 sa = cast(int4)a;
1196         int4 sb = cast(int4)b;
1197         int4 greater = greaterMask!int4(sa, sb);
1198         return cast(__m128i)( (greater & sa) | (~greater & sb) );
1199     }
1200     else
1201     {
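        // Branchless select: b ^ ((a ^ b) & higher) gives `a` where `higher` is all-ones
        // (a > b) and `b` where it is zero, i.e. the per-lane maximum.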
1202         __m128i higher = _mm_cmpgt_epi32(a, b);
1203         __m128i aTob = _mm_xor_si128(a, b); // a ^ (a ^ b) == b
1204         __m128i mask = _mm_and_si128(aTob, higher);
1205         return _mm_xor_si128(b, mask);
1206     }
1207 }
1208 unittest
1209 {
1210     int4 R = cast(int4) _mm_max_epi32(_mm_setr_epi32(0x7fffffff, 1, -4, 7),
1211                                       _mm_setr_epi32(        -4,-8,  9, -8));
1212     int[4] correct =                               [0x7fffffff, 1,  9,  7];
1213     assert(R.array == correct);
1214 }
1215 
1216 /// Compare packed signed 8-bit integers in `a` and `b`, 
1217 /// and return packed maximum values.
1218 __m128i _mm_max_epi8 (__m128i a, __m128i b) @trusted
1219 {
1220     // PERF DMD
1221     static if (GDC_with_SSE41)
1222     {
1223         return cast(__m128i) __builtin_ia32_pmaxsb128(cast(ubyte16)a, cast(ubyte16)b);
1224     }
1225     else version(LDC)
1226     {
1227         // x86: pmaxsb since LDC 1.1 -O1
1228         // ARM64: smax.16b since LDC 1.8.0 -O1
1229         byte16 sa = cast(byte16)a;
1230         byte16 sb = cast(byte16)b;
1231         byte16 greater = cast(byte16) greaterMask!byte16(sa, sb);
1232         return cast(__m128i)( (greater & sa) | (~greater & sb) );
1233     }
1234     else
1235     {
1236         __m128i lower = _mm_cmpgt_epi8(a, b); // ones where a should be selected, b else
1237         __m128i aTob = _mm_xor_si128(a, b); // a ^ (a ^ b) == b
1238         __m128i mask = _mm_and_si128(aTob, lower);
1239         return _mm_xor_si128(b, mask);
1240     }
1241 }
1242 unittest
1243 {
1244     __m128i A = _mm_setr_epi8(127,  1, -4, -8, 9,    7, 0, 57, 0, 0, 0, 0, 0, 0, 0, 0);
1245     __m128i B = _mm_setr_epi8(  4, -8,  9, -7, 0, -128, 0,  0, 0, 0, 0, 0, 0, 0, 0, 0);
1246     byte16 R = cast(byte16) _mm_max_epi8(A, B);
1247     byte[16] correct =       [127,  1,  9, -7, 9,    7, 0, 57, 0, 0, 0, 0, 0, 0, 0, 0];
1248     assert(R.array == correct);
1249 }
1250 
1251 /// Compare packed unsigned 16-bit integers in `a` and `b`, returns packed maximum values.
1252 __m128i _mm_max_epu16 (__m128i a, __m128i b) @trusted
1253 {
1254     // PERF DMD
1255     static if (GDC_with_SSE41)
1256     {
1257         return cast(__m128i) __builtin_ia32_pmaxuw128(cast(short8)a, cast(short8)b);
1258     }
1259     else version(LDC)
1260     {
1261         // x86: pmaxuw since LDC 1.1 -O1
1262         // ARM64: umax.8h since LDC 1.8.0 -O1
1263         // PERF: without sse4.1, LLVM 12 produces a very interesting
1264         //          psubusw xmm0, xmm1
1265         //          paddw   xmm0, xmm1
1266         //       sequence that maybe should go in other min/max intrinsics? 
1267         ushort8 sa = cast(ushort8)a;
1268         ushort8 sb = cast(ushort8)b;
1269         ushort8 greater = cast(ushort8) greaterMask!ushort8(sa, sb);
1270         return cast(__m128i)( (greater & sa) | (~greater & sb) );
1271     }
1272     else
1273     {
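        // Unsigned max via saturating arithmetic: max(a, b) == a + saturating_sub(b, a).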
1274         b = _mm_subs_epu16(b, a);
1275         b = _mm_add_epi16(b, a);
1276         return b;
1277     }
1278 }
1279 unittest
1280 {
1281     short8 R = cast(short8) _mm_max_epu16(_mm_setr_epi16(32767,  1, -4, -8, 9,     7, 0, 57),
1282                                           _mm_setr_epi16(   -4, -8,  9, -7, 0,-32768, 0,  0));
1283     short[8] correct =                                  [   -4, -8, -4, -7, 9,-32768, 0, 57];
1284     assert(R.array == correct);
1285 }
1286 
1287 /// Compare packed unsigned 32-bit integers in `a` and `b`, returns packed maximum values.
1288 __m128i _mm_max_epu32 (__m128i a, __m128i b) @trusted
1289 {
1290     // PERF DMD
1291     static if (GDC_with_SSE41)
1292     {
1293         return cast(__m128i) __builtin_ia32_pmaxud128(cast(int4)a, cast(int4)b);
1294     }
1295     else version(LDC)
1296     {
1297         // x86: pmaxud since LDC 1.1 -O1, also good without sse4.1
1298         // ARM64: umax.4s since LDC 1.8.0 -O1
1299         uint4 sa = cast(uint4)a;
1300         uint4 sb = cast(uint4)b;
1301         uint4 greater = cast(uint4) greaterMask!uint4(sa, sb);
1302         return cast(__m128i)( (greater & sa) | (~greater & sb) );
1303     }
1304     else
1305     {
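        // Unsigned compare trick: biasing both operands by 0x80000000 maps unsigned order
        // onto signed order, so the signed _mm_cmpgt_epi32 can be used.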
1306         __m128i valueShift = _mm_set1_epi32(-0x80000000);
1307         __m128i higher = _mm_cmpgt_epi32(_mm_add_epi32(a, valueShift), _mm_add_epi32(b, valueShift));
1308         __m128i aTob = _mm_xor_si128(a, b); // a ^ (a ^ b) == b
1309         __m128i mask = _mm_and_si128(aTob, higher);
1310         return _mm_xor_si128(b, mask);
1311     }
1312 }
1313 unittest
1314 {
1315     int4 R = cast(int4) _mm_max_epu32(_mm_setr_epi32(0x7fffffff, 1,  4, -7),
1316                                       _mm_setr_epi32(        -4,-8,  9, -8));
1317     int[4] correct =                                [        -4,-8,  9, -7];
1318     assert(R.array == correct);
1319 }
1320 
/// Compare packed signed 32-bit integers in `a` and `b`, returns packed minimum values.
1322 __m128i _mm_min_epi32 (__m128i a, __m128i b) @trusted
1323 {
1324     // PERF DMD
1325     static if (GDC_with_SSE41)
1326     {
1327         return cast(__m128i) __builtin_ia32_pminsd128(cast(int4)a, cast(int4)b);
1328     }
1329     else version(LDC)
1330     {
1331         // x86: pminsd since LDC 1.1 -O1, also good without sse4.1
        // ARM64: smin.4s since LDC 1.8 -O1
1333         int4 sa = cast(int4)a;
1334         int4 sb = cast(int4)b;
1335         int4 greater = greaterMask!int4(sa, sb);
1336         return cast(__m128i)( (~greater & sa) | (greater & sb) );
1337     }
1338     else
1339     {
1340         __m128i higher = _mm_cmplt_epi32(a, b);
1341         __m128i aTob = _mm_xor_si128(a, b); // a ^ (a ^ b) == b
1342         __m128i mask = _mm_and_si128(aTob, higher);
1343         return _mm_xor_si128(b, mask);
1344     }
1345 }
1346 unittest
1347 {
1348     int4 R = cast(int4) _mm_min_epi32(_mm_setr_epi32(0x7fffffff,  1, -4, 7),
1349                                       _mm_setr_epi32(        -4, -8,  9, -8));
1350     int[4] correct =                               [         -4, -8, -4, -8];
1351     assert(R.array == correct);
1352 }
1353 
1354 /// Compare packed signed 8-bit integers in `a` and `b`, 
1355 /// and return packed minimum values.
1356 __m128i _mm_min_epi8 (__m128i a, __m128i b) @trusted
1357 {
1358     // PERF DMD
1359     static if (GDC_with_SSE41)
1360     {
1361         return cast(__m128i) __builtin_ia32_pminsb128(cast(ubyte16)a, cast(ubyte16)b);
1362     }
1363     else version(LDC)
1364     {
1365         // x86: pminsb since LDC 1.1 -O1
1366         // ARM64: smin.16b since LDC 1.8.0 -O1
1367         byte16 sa = cast(byte16)a;
1368         byte16 sb = cast(byte16)b;
1369         byte16 greater = cast(byte16) greaterMask!byte16(sa, sb);
1370         return cast(__m128i)( (~greater & sa) | (greater & sb) );
1371     }
1372     else
1373     {
1374         __m128i lower = _mm_cmplt_epi8(a, b); // ones where a should be selected, b else
1375         __m128i aTob = _mm_xor_si128(a, b); // a ^ (a ^ b) == b
1376         __m128i mask = _mm_and_si128(aTob, lower);
1377         return _mm_xor_si128(b, mask);
1378     }
1379 }
1380 unittest
1381 {
1382     __m128i A = _mm_setr_epi8(127,  1, -4, -8, 9,    7, 0, 57, 0, 0, 0, 0, 0, 0, 0, 0);
1383     __m128i B = _mm_setr_epi8(  4, -8,  9, -7, 0, -128, 0,  0, 0, 0, 0, 0, 0, 0, 0, 0);
1384     byte16 R = cast(byte16) _mm_min_epi8(A, B);
1385     byte[16] correct =       [  4, -8, -4, -8, 0, -128, 0,  0, 0, 0, 0, 0, 0, 0, 0, 0];
1386     assert(R.array == correct);
1387 }
1388 
1389 /// Compare packed unsigned 16-bit integers in a and b, and store packed minimum values in dst.
1390 __m128i _mm_min_epu16 (__m128i a, __m128i b) @trusted
1391 {
1392     // PERF DMD
1393     static if (GDC_with_SSE41)
1394     {
1395         return cast(__m128i) __builtin_ia32_pminuw128(cast(short8)a, cast(short8)b);
1396     }
1397     else version(LDC)
1398     {
1399         // x86: pminuw since LDC 1.1 -O1, psubusw+psubw sequence without sse4.1
1400         // ARM64: umin.8h since LDC 1.8.0 -O1
1401         ushort8 sa = cast(ushort8)a;
1402         ushort8 sb = cast(ushort8)b;
1403         ushort8 greater = cast(ushort8) greaterMask!ushort8(sb, sa);
1404         return cast(__m128i)( (greater & sa) | (~greater & sb) );
1405     }
1406     else
1407     {
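        // Unsigned min via saturating arithmetic: min(a, b) == b - saturating_sub(b, a).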
1408         __m128i c = _mm_subs_epu16(b, a);
1409         b = _mm_sub_epi16(b, c);
1410         return b;
1411     }
1412 }
1413 unittest
1414 {
1415     short8 R = cast(short8) _mm_min_epu16(_mm_setr_epi16(32767,  1, -4, -8, 9,     7, 0, 57),
1416                                           _mm_setr_epi16(   -4, -8,  9, -7, 0,-32768, 0,  0));
1417     short[8] correct =                                  [32767,  1,  9, -8, 0,     7, 0,  0];
1418     assert(R.array == correct);
1419 }
1420 
1421 /// Compare packed unsigned 32-bit integers in a and b, and store packed minimum values in dst.
1422 __m128i _mm_min_epu32 (__m128i a, __m128i b) @trusted
1423 {
1424     // PERF DMD
1425     static if (GDC_with_SSE41)
1426     {
1427         return cast(__m128i) __builtin_ia32_pminud128(cast(int4)a, cast(int4)b);
1428     }
1429     else version(LDC)
1430     {
1431         // x86: pminud since LDC 1.1 -O1, also good without sse4.1
1432         // ARM64: umin.4s since LDC 1.8.0 -O1
1433         uint4 sa = cast(uint4)a;
1434         uint4 sb = cast(uint4)b;
1435         uint4 greater = cast(uint4) greaterMask!uint4(sa, sb);
1436         return cast(__m128i)( (~greater & sa) | (greater & sb) );
1437     }
1438     else
1439     {
1440         __m128i valueShift = _mm_set1_epi32(-0x80000000);
1441         __m128i higher = _mm_cmpgt_epi32(_mm_add_epi32(b, valueShift), _mm_add_epi32(a, valueShift));
1442         __m128i aTob = _mm_xor_si128(a, b); // a ^ (a ^ b) == b
1443         __m128i mask = _mm_and_si128(aTob, higher);
1444         return _mm_xor_si128(b, mask);
1445     }
1446 }
1447 unittest
1448 {
1449     int4 R = cast(int4) _mm_min_epu32(_mm_setr_epi32(0x7fffffff, 1,  4, -7),
1450                                       _mm_setr_epi32(        -4,-8,  9, -8));
1451     int[4] correct =                                [0x7fffffff, 1,  4, -8];
1452     assert(R.array == correct);
1453 }
1454 
1455 /// Horizontally compute the minimum amongst the packed unsigned 16-bit integers in `a`, 
1456 /// store the minimum and index in return value, and zero the remaining bits.
1457 __m128i _mm_minpos_epu16 (__m128i a) @trusted
1458 {
1459     // PERF DMD
1460     static if (GDC_with_SSE41)
1461     {
1462         return cast(__m128i) __builtin_ia32_phminposuw128(cast(short8)a);
1463     }
1464     else static if (LDC_with_SSE41)
1465     {
1466         return cast(__m128i) __builtin_ia32_phminposuw128(cast(short8)a);
1467     }
1468     else static if (LDC_with_ARM64)
1469     {
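        // Pair each 16-bit value with its lane index inside a 32-bit word (value in the
        // high half, index in the low half); an unsigned 32-bit minimum then finds the
        // smallest value and, on ties, the lowest index.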
1470         __m128i indices = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
1471         __m128i combinedLo = _mm_unpacklo_epi16(indices, a);
1472         __m128i combinedHi = _mm_unpackhi_epi16(indices, a);
1473         __m128i best = _mm_min_epu32(combinedLo, combinedHi);
1474         best = _mm_min_epu32(best, _mm_srli_si128!8(best));
1475         best = _mm_min_epu32(best, _mm_srli_si128!4(best));
1476         short8 sbest = cast(short8)best;
1477         short8 r;
1478         r[0] = sbest[1];
1479         r[1] = sbest[0]; // Note: the search must have inverted index in order to prioritize lower index in case of tie
1480         r[2] = 0;
1481         r[3] = 0;
1482         r[4] = 0;
1483         r[5] = 0;
1484         r[6] = 0;
1485         r[7] = 0;
1486         return cast(__m128i)r;
1487     }
1488     else
1489     {
1490         short8 sa = cast(short8)a;
1491         ushort min = 0xffff;
1492         int index = 0;
1493         for(int n = 0; n < 8; ++n)
1494         {
1495             ushort c = sa.array[n];
1496             if (c < min)
1497             {
1498                 min = c;
1499                 index = n;
1500             }
1501         }
1502         short8 r;
1503         r.ptr[0] = min;
1504         r.ptr[1] = cast(short)index;
1505         return cast(__m128i)r;
1506     }
1507 }
1508 unittest
1509 {
1510     __m128i A = _mm_setr_epi16(14, 15, 1, 2, -3, 4, 5, 6);
1511     __m128i B = _mm_setr_epi16(14,  4, 4, 2, -3, 2, 5, 6);
1512     short8 R1 = cast(short8) _mm_minpos_epu16(A);
1513     short8 R2 = cast(short8) _mm_minpos_epu16(B);
1514     short[8] correct1 = [1, 2, 0, 0, 0, 0, 0, 0];
1515     short[8] correct2 = [2, 3, 0, 0, 0, 0, 0, 0];
1516     assert(R1.array == correct1);
1517     assert(R2.array == correct2);
1518 }
1519 
1520 /// Compute the sum of absolute differences (SADs) of quadruplets of unsigned 8-bit integers 
1521 /// in `a` compared to those in `b`, and store the 16-bit results in dst. 
1522 /// Eight SADs are performed using one quadruplet from `b` and eight quadruplets from `a`. 
/// One quadruplet is selected from `b` starting at the offset specified in `imm8[1:0]`. 
1524 /// Eight quadruplets are formed from sequential 8-bit integers selected from `a` starting 
1525 /// at the offset specified in `imm8[2]`.
1526 __m128i _mm_mpsadbw_epu8(int imm8)(__m128i a, __m128i b) @trusted
1527 {
1528     // PERF DMD
1529     static if (GDC_with_SSE41)
1530     {
1531         return cast(__m128i) __builtin_ia32_mpsadbw128(cast(byte16)a, cast(byte16)b, cast(byte)imm8);
1532     }
1533     else static if (LDC_with_SSE41)
1534     {
1535         return cast(__m128i) __builtin_ia32_mpsadbw128(cast(byte16)a, cast(byte16)b, cast(byte)imm8);
1536     }
1537     else
1538     {
        int a_offset = ((imm8 & 4) >> 2) * 4; // Yes, the two highest-order quadruplets of `a` are unaddressable...
1540         int b_offset = (imm8 & 3) * 4;
1541 
1542         byte16 ba = cast(byte16)a;
1543         byte16 bb = cast(byte16)b;
1544         short8 r;
1545 
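        // Replicate the selected 4-byte quadruplet of `b` into both 64-bit halves so that a
        // single _mm_sad_epu8 below yields two of the eight SADs per loop iteration.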
1546         __m128i comp_b = _mm_setr_epi32(b.array[imm8 & 3], 0, b.array[imm8 & 3], 0);
1547 
1548         for (int j = 0; j < 8; j += 2)
1549         {
1550             int k = a_offset + j;
1551             __m128i comp_a = _mm_setr_epi8(ba[k+0], ba[k+1], ba[k+2], ba[k+3],
1552                                            0, 0, 0, 0, 
1553                                            ba[k+1], ba[k+2], ba[k+3], ba[k+4],
1554                                            0, 0, 0, 0);
1555             short8 diffs = cast(short8) _mm_sad_epu8(comp_a, comp_b); // reusing this wins instructions in both x86 and arm64
1556             r.ptr[j] = diffs.array[0];
1557             r.ptr[j+1] = diffs.array[4];
1558         }
1559         return cast(__m128i)r;
1560     }
1561 }
unittest
{
    __m128i A = _mm_setr_epi8(0, 1, 2, 3,  4,  5, 6,  7, 8, 9, 10, 11, 12, 13, 14, 15);
    __m128i B = _mm_setr_epi8(9, 1, 2, 3, -1, -1, 0, -1, 5, 5,  5,  5, 12, 13, 14, 15);
    short[8] correct0 = [9, 11, 13, 15, 17, 19, 21, 23];
    short[8] correct1 = [763, 761, 759, 757, 755, 753, 751, 749];
    short[8] correct4 = [17, 19, 21, 23, 25, 27, 31, 35];
    short[8] correct5 = [755, 753, 751, 749, 747, 745, 743, 741];
    short[8] correct7 = [32, 28, 24, 20, 16, 12, 8, 4];
    short8 r1 = cast(short8) _mm_mpsadbw_epu8!1(A, B);
    short8 r4 = cast(short8) _mm_mpsadbw_epu8!4(A, B);
    short8 r5 = cast(short8) _mm_mpsadbw_epu8!5(A, B);
    short8 r7 = cast(short8) _mm_mpsadbw_epu8!7(A, B);
    short8 r8 = cast(short8) _mm_mpsadbw_epu8!8(A, B);
    assert(r1.array == correct1);
    assert(r4.array == correct4);
    assert(r5.array == correct5);
    assert(r7.array == correct7);
    assert(r8.array == correct0);
}
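
// Added worked example (not from the original test suite): with imm8 == 1 the quadruplet
// taken from B starts at byte offset 4 and the first quadruplet taken from A starts at
// byte offset 0, so element 0 of the result equals the scalar SAD computed below (763 here).
unittest
{
    __m128i A = _mm_setr_epi8(0, 1, 2, 3,  4,  5, 6,  7, 8, 9, 10, 11, 12, 13, 14, 15);
    __m128i B = _mm_setr_epi8(9, 1, 2, 3, -1, -1, 0, -1, 5, 5,  5,  5, 12, 13, 14, 15);
    short8 r1 = cast(short8) _mm_mpsadbw_epu8!1(A, B);
    byte16 ba = cast(byte16) A;
    byte16 bb = cast(byte16) B;
    int sad0 = 0;
    foreach (i; 0 .. 4)
    {
        int va = cast(ubyte) ba.array[i];     // quadruplet of A starting at offset imm8[2]*4 == 0
        int vb = cast(ubyte) bb.array[4 + i]; // quadruplet of B starting at offset (imm8 & 3)*4 == 4
        int d = va - vb;
        sad0 += (d < 0) ? -d : d;
    }
    assert(r1.array[0] == sad0);
}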

/// Multiply the low signed 32-bit integers from each packed 64-bit element 
/// in `a` and `b`, and store the signed 64-bit results in dst.
__m128i _mm_mul_epi32 (__m128i a, __m128i b) @trusted
{
    // PERF DMD
    static if (GDC_with_SSE41)
    {
        return cast(__m128i) __builtin_ia32_pmuldq128(cast(int4)a, cast(int4)b);
    }
    else static if (LDC_with_SSE41)
    {
        // For some reason, clang has the builtin but it's not in IntrinsicsX86.td
        // Use IR instead.
        // This generates pmuldq since LDC 1.2.0 at -O0
        enum ir = `
            %ia = shufflevector <4 x i32> %0,<4 x i32> %0, <2 x i32> <i32 0, i32 2>
            %ib = shufflevector <4 x i32> %1,<4 x i32> %1, <2 x i32> <i32 0, i32 2>
            %la = sext <2 x i32> %ia to <2 x i64>
            %lb = sext <2 x i32> %ib to <2 x i64>
            %r = mul <2 x i64> %la, %lb
            ret <2 x i64> %r`;
        return cast(__m128i) LDCInlineIR!(ir, long2, int4, int4)(cast(int4)a, cast(int4)b);
    }
    else static if (LDC_with_ARM64)
    {
        // 3 instructions since LDC 1.8 -O2
        // But vmull_s32 had to be made a builtin, else it wouldn't optimize to smull
        int2 a_lo = vmovn_s64(cast(long2)a);
        int2 b_lo = vmovn_s64(cast(long2)b);
        return cast(__m128i) vmull_s32(a_lo, b_lo);
    }
    else
    {
        int4 ia = cast(int4)a;
        int4 ib = cast(int4)b;
        long2 r;
        r.ptr[0] = cast(long)ia.array[0] * ib.array[0];
        r.ptr[1] = cast(long)ia.array[2] * ib.array[2];
        return cast(__m128i)r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi32(61616461, 1915324654, 4564061, 3);
    __m128i B = _mm_setr_epi32(49716422, -915616216, -121144, 0);
    long2 R = cast(long2) _mm_mul_epi32(A, B);
    long[2] correct = [cast(long)61616461 * 49716422, cast(long)4564061 * -121144];
    assert(R.array == correct);
}

/// Multiply the packed 32-bit integers in `a` and `b`, producing intermediate 64-bit integers, 
/// and return the low 32 bits of the intermediate integers.
__m128i _mm_mullo_epi32 (__m128i a, __m128i b) @trusted
{
    // PERF DMD
    // PERF GDC without SSE4.1 could be better
    static if (GDC_with_SSE41)
    {
        int4 ia = cast(int4)a;
        int4 ib = cast(int4)b;
        // Note: older GDC lacks this vector multiply, but older GDC
        // also has no support for -msse4.1 detection, so this branch isn't reached there
        return cast(__m128i)(ia * ib); 
    }
    else version(LDC)
    {
        int4 ia = cast(int4)a;
        int4 ib = cast(int4)b;
        return cast(__m128i)(ia * ib);
    }
    else
    {
        // DMD doesn't accept the vector multiply above
        int4 ia = cast(int4)a;
        int4 ib = cast(int4)b;
        int4 r;
        r.ptr[0] = ia.array[0] * ib.array[0];
        r.ptr[1] = ia.array[1] * ib.array[1];
        r.ptr[2] = ia.array[2] * ib.array[2];
        r.ptr[3] = ia.array[3] * ib.array[3];
        return r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi32(61616461, 1915324654, 4564061, 3);
    __m128i B = _mm_setr_epi32(49716422, -915616216, -121144, 0);
    int4 R = cast(int4) _mm_mullo_epi32(A, B);
    int[4] correct = [cast(int)0xBF370D8E, cast(int)(1915324654 * -915616216), cast(int)(4564061 * -121144), 0];
    assert(R.array == correct);
}


/// Convert packed signed 32-bit integers from `a` and `b` 
/// to packed 16-bit integers using unsigned saturation.
__m128i _mm_packus_epi32 (__m128i a, __m128i b) @trusted
{
    static if (GDC_with_SSE41)
    {
        // PERF For some reason this doesn't generate the builtin?
        return cast(__m128i) __builtin_ia32_packusdw128(cast(short8)a, cast(short8)b);
    }
    else static if (LDC_with_SSE41)
    {
        return cast(__m128i) __builtin_ia32_packusdw128(cast(short8)a, cast(short8)b);
    }
    else static if (LDC_with_ARM64)
    {
       int4 z;
       z = 0;
       return cast(__m128i) vcombine_u16(vqmovn_u32(vmaxq_s32(z, cast(int4)a)),
                                         vqmovn_u32(vmaxq_s32(z, cast(int4)b)));
    }
    else
    {
        // PERF: not great without SSE4.1
        int4 sa = cast(int4)a;
        int4 sb = cast(int4)b;
        ushort[8] result;
        for (int i = 0; i < 4; ++i)
        {
            int s = sa.array[i];
            if (s < 0) s = 0;
            if (s > 65535) s = 65535;
            result.ptr[i] = cast(ushort)s;

            s = sb.array[i];
            if (s < 0) s = 0;
            if (s > 65535) s = 65535;
            result.ptr[i+4] = cast(ushort)s;
        }
        return cast(__m128i) loadUnaligned!(short8)(cast(short*)result.ptr);
    }
}
unittest
{
    __m128i A = _mm_setr_epi32(100000, -100000, 1000, 0);
    short8 R = cast(short8) _mm_packus_epi32(A, A);
    short[8] correct = [cast(short)65535, 0, 1000, 0, cast(short)65535, 0, 1000, 0];
    assert(R.array == correct);
}


/// Round the packed double-precision (64-bit) floating-point elements in `a` using the 
/// rounding parameter, and store the results as packed double-precision floating-point elements.
/// Rounding is done according to the rounding[3:0] parameter, which can be one of:
///    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
///    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
///    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
///    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
///    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
__m128d _mm_round_pd(int rounding)(__m128d a) @trusted
{
    // PERF DMD
    static if (GDC_with_SSE41)
    {
        return __builtin_ia32_roundpd(a, rounding);
    }
    else static if (LDC_with_SSE41)
    {
        return __builtin_ia32_roundpd(a, rounding);
    }
    else
    {
        static if (rounding & _MM_FROUND_CUR_DIRECTION)
        {
            // Convert to 64-bit integers
            long lo = _mm_cvtsd_si64(a);
            a.ptr[0] = a.array[1];
            long hi = _mm_cvtsd_si64(a);
            return _mm_setr_pd(lo, hi);
        }
        else
        {
            version(GNU) pragma(inline, false); // else unittests fail with optimizations enabled

            uint old = _MM_GET_ROUNDING_MODE();
            _MM_SET_ROUNDING_MODE((rounding & 3) << 13);

            // Convert to 64-bit integers
            long lo = _mm_cvtsd_si64(a);
            a.ptr[0] = a.array[1];
            long hi = _mm_cvtsd_si64(a);

            // Convert back to double to achieve the rounding
            // The problem is that a 64-bit double can't represent all the values 
            // a 64-bit integer can (and vice-versa). So this function won't work for
            // large values. (TODO: what range exactly?)
            _MM_SET_ROUNDING_MODE(old);
            return _mm_setr_pd(lo, hi);
        }
    }
}
unittest
{
    // tested in other intrinsics
}
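
// Added illustrative check (not part of the original test suite), assuming the
// _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC combination selects round-down as documented above.
unittest
{
    __m128d A = _mm_setr_pd(1.3, -2.5);
    __m128d R = _mm_round_pd!(_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC)(A);
    double[2] correct = [1.0, -3.0]; // both results are exactly representable
    assert(R.array == correct);
}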

/// Round the packed single-precision (32-bit) floating-point elements in `a` using the 
/// rounding parameter, and store the results as packed single-precision floating-point elements.
/// Rounding is done according to the rounding[3:0] parameter, which can be one of:
///    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
///    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
///    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
///    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
///    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
__m128 _mm_round_ps(int rounding)(__m128 a) @trusted
{
    static if (GDC_with_SSE41)
    {
        return __builtin_ia32_roundps(a, rounding);
    }
    else static if (LDC_with_SSE41)
    {
        return __builtin_ia32_roundps(a, rounding);
    }
    else
    {
        static if (rounding & _MM_FROUND_CUR_DIRECTION)
        {
            __m128i integers = _mm_cvtps_epi32(a);
            return _mm_cvtepi32_ps(integers);
        }
        else
        {
            version(LDC) pragma(inline, false); // else _MM_SET_ROUNDING_MODE and _mm_cvtps_epi32 get reordered
            uint old = _MM_GET_ROUNDING_MODE();
            _MM_SET_ROUNDING_MODE((rounding & 3) << 13);
            scope(exit) _MM_SET_ROUNDING_MODE(old);

            // Convert to 32-bit integers
            __m128i integers = _mm_cvtps_epi32(a);

            // Convert back to float to achieve the rounding
            // The problem is that a 32-bit float can't represent all the values 
            // a 32-bit integer can (and vice-versa). So this function won't work for
            // large values. (TODO: what range exactly?)
            __m128 result = _mm_cvtepi32_ps(integers);

            return result;
        }
    }
}
unittest
{
    // tested in other intrinsics
}
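
// Added illustrative check (not part of the original test suite), assuming the
// _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC combination selects round-up as documented above.
unittest
{
    __m128 A = _mm_setr_ps(1.3f, -2.5f, 53.6f, -2.7f);
    __m128 R = _mm_round_ps!(_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC)(A);
    float[4] correct = [2.0f, -2.0f, 54.0f, -2.0f]; // all exactly representable
    assert(R.array == correct);
}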


/// Round the lower double-precision (64-bit) floating-point element in `b` using the
/// rounding parameter, store the result as a double-precision floating-point element 
/// in the lower element of result, and copy the upper element from `a` to the upper element of result.
/// Rounding is done according to the rounding[3:0] parameter, which can be one of:
///    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
///    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
///    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
///    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
///    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
__m128d _mm_round_sd(int rounding)(__m128d a, __m128d b) @trusted
{
    static if (GDC_with_SSE41)
    {
        return __builtin_ia32_roundsd(a, b, rounding);
    }
    else static if (LDC_with_SSE41)
    {
        return __builtin_ia32_roundsd(a, b, rounding);
    }
    else
    {
        static if (rounding & _MM_FROUND_CUR_DIRECTION)
        {
            // Convert to 64-bit integer
            long b0 = _mm_cvtsd_si64(b);
            a.ptr[0] = b0;
            return a;
        }
        else
        {
            version(GNU) pragma(inline, false); // else unittests fail with optimizations enabled

            uint old = _MM_GET_ROUNDING_MODE();
            _MM_SET_ROUNDING_MODE((rounding & 3) << 13);

            // Convert to 64-bit integer
            long b0 = _mm_cvtsd_si64(b);
            a.ptr[0] = b0;

            // Convert back to double to achieve the rounding
            // The problem is that a 64-bit double can't represent all the values 
            // a 64-bit integer can (and vice-versa). So this function won't work for
            // large values. (TODO: what range exactly?)
            _MM_SET_ROUNDING_MODE(old);
            return a;
        }
    }
}
unittest
{
    // tested in other intrinsics
}
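
// Added illustrative check (not part of the original test suite), assuming the
// _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC combination truncates as documented above.
unittest
{
    __m128d A = _mm_setr_pd(10.0, 20.0);
    __m128d B = _mm_setr_pd(-3.7, 999.0);
    __m128d R = _mm_round_sd!(_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC)(A, B);
    double[2] correct = [-3.0, 20.0]; // lower element from rounded B, upper from A
    assert(R.array == correct);
}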


/// Round the lower single-precision (32-bit) floating-point element in `b` using the 
/// rounding parameter, store the result as a single-precision floating-point element 
/// in the lower element of result, and copy the upper 3 packed elements from `a`
/// to the upper elements of result.
/// Rounding is done according to the rounding[3:0] parameter, which can be one of:
///    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
///    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
///    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
///    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
///    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
__m128 _mm_round_ss(int rounding)(__m128 a, __m128 b) @trusted
{
    static if (GDC_with_SSE41)
    {
        return __builtin_ia32_roundss(a, b, rounding);
    }
    else static if (LDC_with_SSE41)
    {
        return __builtin_ia32_roundss(a, b, rounding);
    }
    else
    {
        static if (rounding & _MM_FROUND_CUR_DIRECTION)
        {
            int b0 = _mm_cvtss_si32(b);
            a.ptr[0] = b0;
            return a;
        }
        else
        {
            uint old = _MM_GET_ROUNDING_MODE();
            _MM_SET_ROUNDING_MODE((rounding & 3) << 13);

            // Convert to 32-bit integer
            int b0 = _mm_cvtss_si32(b);
            a.ptr[0] = b0;

            // Convert back to float to achieve the rounding
            // The problem is that a 32-bit float can't represent all the values 
            // a 32-bit integer can (and vice-versa). So this function won't work for
            // large values. (TODO: what range exactly?)
            _MM_SET_ROUNDING_MODE(old);
            return a;
        }
    }
}
unittest
{
    // tested in other intrinsics
}
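
// Added illustrative check (not part of the original test suite), assuming the
// _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC combination selects round-down as documented above.
unittest
{
    __m128 A = _mm_setr_ps(10.0f, 20.0f, 30.0f, 40.0f);
    __m128 B = _mm_setr_ps(2.9f, 0.0f, 0.0f, 0.0f);
    __m128 R = _mm_round_ss!(_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC)(A, B);
    float[4] correct = [2.0f, 20.0f, 30.0f, 40.0f]; // lower element from rounded B, rest from A
    assert(R.array == correct);
}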


/// Load 128-bits of integer data from memory using a non-temporal memory hint. 
/// `mem_addr` must be aligned on a 16-byte boundary or a general-protection 
/// exception may be generated.
__m128i _mm_stream_load_si128 (__m128i * mem_addr) @trusted
{
    // BUG see `_mm_stream_ps` for an explanation of why we don't implement non-temporal moves
    return *mem_addr; // it's a regular move instead
}
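// Added minimal usage sketch (not from the original test suite): a local __m128i is
// 16-byte aligned by its type, so its address is a valid argument here.
unittest
{
    __m128i A = _mm_setr_epi32(1, 2, 3, 4);
    __m128i B = _mm_stream_load_si128(&A);
    int4 b4 = cast(int4) B;
    int[4] correct = [1, 2, 3, 4];
    assert(b4.array == correct);
}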


/// Return 1 if all bits in `a` are 1's, else return 0.
int _mm_test_all_ones (__m128i a) @safe
{
    return _mm_testc_si128(a, _mm_set1_epi32(-1));
}
unittest
{
    __m128i A = _mm_set1_epi32(-1);
    __m128i B = _mm_set_epi32(-1, -2, -1, -1);
    assert(_mm_test_all_ones(A) == 1);
    assert(_mm_test_all_ones(B) == 0);
}

/// Return 1 if all bits in `a` are 0's, else return 0.
// This is a #BONUS intrinsic, since it is lacking from the Intel Intrinsics API.
int _mm_test_all_zeros (__m128i a) @safe
{
    return _mm_testz_si128(a, _mm_set1_epi32(-1));
}
unittest
{
    __m128i A = _mm_set1_epi32(0);
    __m128i B = _mm_set_epi32(0, 8, 0, 0);
    assert(_mm_test_all_zeros(A) == 1);
    assert(_mm_test_all_zeros(B) == 0);
}

/// Compute the bitwise AND of 128 bits (representing integer data) in `a` and `mask`, 
/// and return 1 if the result is zero, otherwise return 0.
int _mm_test_all_zeros (__m128i a, __m128i mask) @safe
{
    return _mm_testz_si128(a, mask); // it's really the same, but with a good name
}
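// Added illustrative check (not from the original test suite).
unittest
{
    __m128i A  = _mm_setr_epi32(0, 8, 0, 0);
    __m128i M1 = _mm_setr_epi32(0xff, 0x7, 0, 0);
    __m128i M2 = _mm_setr_epi32(0, 8, 0, 0);
    assert(_mm_test_all_zeros(A, M1) == 1);
    assert(_mm_test_all_zeros(A, M2) == 0);
}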

/// Compute the bitwise AND of 128 bits (representing integer data) in `a` and `mask`, 
/// and set ZF to 1 if the result is zero, otherwise set ZF to 0. 
/// Compute the bitwise NOT of `a` and then AND with `mask`, and set CF to 1 if the 
/// result is zero, otherwise set CF to 0. 
/// Return 1 if both the ZF and CF values are zero, otherwise return 0.
int _mm_test_mix_ones_zeros (__m128i a, __m128i mask) @trusted
{
    return _mm_testnzc_si128(a, mask);
}
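// Added illustrative check (not from the original test suite): the mask must cover
// both a 1 bit and a 0 bit of `a` for the result to be 1.
unittest
{
    __m128i A  = _mm_setr_epi32(0x01, 0, 0, 0);
    __m128i M1 = _mm_setr_epi32(0x03, 0, 0, 0); // covers a 1 bit and a 0 bit of A
    __m128i M2 = _mm_setr_epi32(0x01, 0, 0, 0); // all masked bits are 1 in A
    __m128i M3 = _mm_setr_epi32(0x02, 0, 0, 0); // all masked bits are 0 in A
    assert(_mm_test_mix_ones_zeros(A, M1) == 1);
    assert(_mm_test_mix_ones_zeros(A, M2) == 0);
    assert(_mm_test_mix_ones_zeros(A, M3) == 0);
}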

/// Compute the bitwise NOT of `a` and then AND with `b`, and return 1 if the 
/// result is zero, otherwise return 0.
/// In other words, test if all bits masked by `b` are 1 in `a`.
int _mm_testc_si128 (__m128i a, __m128i b) @trusted
{
    // PERF DMD
    static if (GDC_with_SSE41)
    {
        return __builtin_ia32_ptestc128(cast(long2)a, cast(long2)b);
    }
    else static if (LDC_with_SSE41)
    {
        return __builtin_ia32_ptestc128(cast(long2)a, cast(long2)b);
    }
    else static if (LDC_with_ARM64)
    {
        // Acceptable since LDC 1.8 -O2
        long2 s64 = vbicq_s64(cast(long2)b, cast(long2)a);
        return !(vgetq_lane_s64(s64, 0) | vgetq_lane_s64(s64, 1));
    }
    else
    {
        __m128i c = ~a & b;
        int[4] zero = [0, 0, 0, 0];
        return c.array == zero;
    }
}
unittest
{
    __m128i A  = _mm_setr_epi32(0x01, 0x02, 0x04, 0xf8);
    __m128i M1 = _mm_setr_epi32(0xfe, 0xfd, 0x00, 0x00);
    __m128i M2 = _mm_setr_epi32(0x00, 0x00, 0x04, 0x00);
    assert(_mm_testc_si128(A, A) == 1);
    assert(_mm_testc_si128(A, M1) == 0);
    assert(_mm_testc_si128(A, M2) == 1);
}

/// Compute the bitwise AND of 128 bits (representing integer data) in `a` and `b`, 
/// and set ZF to 1 if the result is zero, otherwise set ZF to 0. 
/// Compute the bitwise NOT of `a` and then AND with `b`, and set CF to 1 if the 
/// result is zero, otherwise set CF to 0. 
/// Return 1 if both the ZF and CF values are zero, otherwise return 0.
int _mm_testnzc_si128 (__m128i a, __m128i b) @trusted
{
    // PERF DMD
    static if (GDC_with_SSE41)
    {
        return __builtin_ia32_ptestnzc128(cast(long2)a, cast(long2)b);
    }
    else static if (LDC_with_SSE41)
    {
        return __builtin_ia32_ptestnzc128(cast(long2)a, cast(long2)b);
    }
    else static if (LDC_with_ARM64)
    {
        long2 s640 = vandq_s64(cast(long2)b, cast(long2)a);
        long2 s641 = vbicq_s64(cast(long2)b, cast(long2)a);

        return !( !(vgetq_lane_s64(s641, 0) | vgetq_lane_s64(s641, 1))
                | !(vgetq_lane_s64(s640, 0) | vgetq_lane_s64(s640, 1)) );
    }
    else
    {
        __m128i c = a & b;
        __m128i d = ~a & b;
        int[4] zero = [0, 0, 0, 0];
        return !( (c.array == zero) || (d.array == zero));
    }
}
unittest
{
    __m128i A  = _mm_setr_epi32(0x01, 0x02, 0x04, 0xf8);
    __m128i M  = _mm_setr_epi32(0x01, 0x40, 0x00, 0x00);
    __m128i Z = _mm_setzero_si128();
    assert(_mm_testnzc_si128(A, Z) == 0);
    assert(_mm_testnzc_si128(A, M) == 1);
    assert(_mm_testnzc_si128(A, A) == 0);
}

/// Compute the bitwise AND of 128 bits (representing integer data) in `a` and `b`, 
/// and return 1 if the result is zero, otherwise return 0.
/// In other words, test if all bits masked by `b` are 0 in `a`.
int _mm_testz_si128 (__m128i a, __m128i b) @trusted
{
    // PERF DMD
    static if (GDC_with_SSE41)
    {
        return __builtin_ia32_ptestz128(cast(long2)a, cast(long2)b);
    }
    else static if (LDC_with_SSE41)
    {
        return __builtin_ia32_ptestz128(cast(long2)a, cast(long2)b);
    }
    else static if (LDC_with_ARM64)
    {
        // Acceptable since LDC 1.8 -O2
        long2 s64 = vandq_s64(cast(long2)a, cast(long2)b);
        return !(vgetq_lane_s64(s64, 0) | vgetq_lane_s64(s64, 1));
    }
    else 
    {
        __m128i c = a & b;
        int[4] zero = [0, 0, 0, 0];
        return c.array == zero;
    }
}
unittest
{
    __m128i A  = _mm_setr_epi32(0x01, 0x02, 0x04, 0xf8);
    __m128i M1 = _mm_setr_epi32(0xfe, 0xfd, 0x00, 0x07);
    __m128i M2 = _mm_setr_epi32(0x00, 0x00, 0x04, 0x00);
    assert(_mm_testz_si128(A, A) == 0);
    assert(_mm_testz_si128(A, M1) == 1);
    assert(_mm_testz_si128(A, M2) == 0);
}