1 /**
2 * SSE4.1 intrinsics.
3 * https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#techs=SSE4_1
4 *
5 * Copyright: Guillaume Piolat 2021.
6 *            Johan Engelen 2021.
7 * License:   $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
8 */
9 module inteli.smmintrin;
10 
11 // SSE4.1 instructions
12 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#techs=SSE4_1
13 // Note: this header will work whether you have SSE4.1 enabled or not.
14 // With LDC, use "dflags-ldc": ["-mattr=+sse4.1"] or equivalent to actively
15 // generate SSE4.1 instructions.
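//
// For illustration, a minimal dub.json excerpt enabling this with LDC might look
// like the following (package name and dependency version are just an assumed example):
//
//     {
//         "name": "myapp",
//         "dependencies": { "intel-intrinsics": "~>1.0" },
//         "dflags-ldc": ["-mattr=+sse4.1"]
//     }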
16 
17 public import inteli.types;
18 import inteli.internals;
19 
20 // smmintrin pulls in all previous instruction set intrinsics.
21 public import inteli.tmmintrin;
22 
23 nothrow @nogc:
24 
25 enum int _MM_FROUND_TO_NEAREST_INT = 0x00; /// SSE4.1 rounding modes
26 enum int _MM_FROUND_TO_NEG_INF     = 0x01; /// ditto
27 enum int _MM_FROUND_TO_POS_INF     = 0x02; /// ditto
28 enum int _MM_FROUND_TO_ZERO        = 0x03; /// ditto
29 enum int _MM_FROUND_CUR_DIRECTION  = 0x04; /// ditto
30 enum int _MM_FROUND_RAISE_EXC      = 0x00; /// ditto
31 enum int _MM_FROUND_NO_EXC         = 0x08; /// ditto
32 
33 enum int _MM_FROUND_NINT      = (_MM_FROUND_RAISE_EXC | _MM_FROUND_TO_NEAREST_INT);
34 enum int _MM_FROUND_FLOOR     = (_MM_FROUND_RAISE_EXC | _MM_FROUND_TO_NEG_INF);
35 enum int _MM_FROUND_CEIL      = (_MM_FROUND_RAISE_EXC | _MM_FROUND_TO_POS_INF);
36 enum int _MM_FROUND_TRUNC     = (_MM_FROUND_RAISE_EXC | _MM_FROUND_TO_ZERO);
37 enum int _MM_FROUND_RINT      = (_MM_FROUND_RAISE_EXC | _MM_FROUND_CUR_DIRECTION);
38 enum int _MM_FROUND_NEARBYINT = (_MM_FROUND_NO_EXC    | _MM_FROUND_CUR_DIRECTION);
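
// A minimal usage sketch for the combined rounding constants above: they are the values
// meant for the compile-time `rounding` argument of `_mm_round_ps` / `_mm_round_pd`
// (defined later in this module). For example, `_MM_FROUND_TRUNC` rounds toward zero.
unittest
{
    __m128 A = _mm_setr_ps(1.7f, -1.7f, 2.5f, -2.5f);
    __m128 R = _mm_round_ps!(_MM_FROUND_TRUNC)(A);
    float[4] correct = [1.0f, -1.0f, 2.0f, -2.0f];
    assert(R.array == correct);
}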
39 
40 /// Blend packed 16-bit integers from `a` and `b` using control mask `imm8`, and store the results.
41 // Note: changed signature, GDC needs a compile-time value for imm8.
42 __m128i _mm_blend_epi16(int imm8)(__m128i a, __m128i b) @trusted
43 {
44     // PERF DMD
45     static if (GDC_with_SSE41)
46     {
47         return cast(__m128i) __builtin_ia32_pblendw128(cast(short8)a, cast(short8)b, imm8);
48     }
49     else 
50     {
        // LDC x86: generates pblendw since LDC 1.1 -O2
52         short8 r;
53         short8 sa = cast(short8)a;
54         short8 sb = cast(short8)b;
55         for (int n = 0; n < 8; ++n)
56         {
57             r.ptr[n] = (imm8 & (1 << n)) ? sb.array[n] : sa.array[n];
58         }
59         return cast(__m128i)r;
60     }
61 }
62 unittest
63 {
64     __m128i A = _mm_setr_epi16(0, 1,  2,  3,  4,  5,  6,  7);
65     __m128i B = _mm_setr_epi16(8, 9, 10, 11, 12, 13, 14, 15);
66     short8 C = cast(short8) _mm_blend_epi16!147(A, B); // 10010011
67     short[8] correct =        [8, 9,  2,  3, 12,  5,  6, 15];
68     assert(C.array == correct);
69 }
70 
71 
72 /// Blend packed double-precision (64-bit) floating-point elements from `a` and `b` using control mask `imm8`.
73 // Note: changed signature, GDC needs a compile-time value for `imm8`.
74 __m128d _mm_blend_pd(int imm8)(__m128d a, __m128d b) @trusted
75 {
76     static assert(imm8 >= 0 && imm8 < 4);
77     // PERF DMD
78     static if (GDC_with_SSE41)
79     {
80         return cast(double2) __builtin_ia32_blendpd(cast(double2)a, cast(double2)b, imm8);
81     }
82     else
83     {
        // LDC x86: blendpd since LDC 1.1 -O2, uses blendps after LDC 1.12
85         double2 r;
86         for (int n = 0; n < 2; ++n)
87         {
88             r.ptr[n] = (imm8 & (1 << n)) ? b.array[n] : a.array[n];
89         }
90         return cast(__m128d)r;
91     }
92 }
93 unittest
94 {
95     __m128d A = _mm_setr_pd(0, 1);
96     __m128d B = _mm_setr_pd(8, 9);
97     double2 C = _mm_blend_pd!2(A, B);
98     double[2] correct =    [0, 9];
99     assert(C.array == correct);
100 }
101 
102 
103 /// Blend packed single-precision (32-bit) floating-point elements from `a` and `b` using control mask `imm8`.
104 // Note: changed signature, GDC needs a compile-time value for imm8.
105 __m128 _mm_blend_ps(int imm8)(__m128 a, __m128 b) @trusted
106 {
107     // PERF DMD
108     static assert(imm8 >= 0 && imm8 < 16);
109     static if (GDC_with_SSE41)
110     {
111         return __builtin_ia32_blendps(a, b, imm8);
112     }
113     else version(LDC)
114     {
115         // LDC x86: generates blendps since LDC 1.1 -O2
116         //   arm64: pretty good, two instructions worst case
117         return shufflevector!(float4, (imm8 & 1) ? 4 : 0,
118                                       (imm8 & 2) ? 5 : 1,
119                                       (imm8 & 4) ? 6 : 2,
120                                       (imm8 & 8) ? 7 : 3)(a, b);
121     }
122     else
123     {
124         __m128 r;
125         for (int n = 0; n < 4; ++n)
126         {
127             r.ptr[n] = (imm8 & (1 << n)) ? b.array[n] : a.array[n];
128         }
129         return r;
130     }
131 }
132 unittest
133 {
134     __m128 A = _mm_setr_ps(0, 1,  2,  3);
135     __m128 B = _mm_setr_ps(8, 9, 10, 11);
136     float4 C = cast(float4) _mm_blend_ps!13(A, B); // 1101
137     float[4] correct =    [8, 1, 10, 11];
138     assert(C.array == correct);
139 }
140 
141 /// Blend packed 8-bit integers from `a` and `b` using `mask`.
142 __m128i _mm_blendv_epi8 (__m128i a, __m128i b, __m128i mask) @trusted
143 {
144     // PERF DMD
145     static if (GDC_with_SSE41)
146     {
147         return cast(__m128i) __builtin_ia32_pblendvb(cast(byte16)a, cast(byte16)b, cast(byte16)mask);
148     }
149     else static if (LDC_with_SSE41)
150     {
151         return cast(__m128i) __builtin_ia32_pblendvb(cast(byte16)a, cast(byte16)b, cast(byte16)mask);
152     }
153     else static if (LDC_with_ARM64)
154     {
155         // LDC arm64: two instructions since LDC 1.12 -O2
156         byte16 maskSX = vshrq_n_s8(cast(byte16)mask, 7);
157         return cast(__m128i) vbslq_s8(maskSX, cast(byte16)b, cast(byte16)a);
158     }
159     else
160     {
        // Extend each mask byte's sign bit to a full-lane mask, then select:
        // a ^ ((a ^ b) & m) yields b where m is all-ones, and a elsewhere.
        __m128i m = _mm_cmpgt_epi8(_mm_setzero_si128(), mask);
        return _mm_xor_si128(_mm_and_si128(_mm_xor_si128(a, b), m), a);
163     }
164 }
165 unittest
166 {
167     __m128i A = _mm_setr_epi8( 0,  1,  2,  3,  4,  5,  6,  7,  
168                                8,  9, 10, 11, 12, 13, 14, 15);
169     __m128i B = _mm_setr_epi8(16, 17, 18, 19, 20, 21, 22, 23, 
170                               24, 25, 26, 27, 28, 29, 30, 31);
171     __m128i M = _mm_setr_epi8( 1, -1,  1,  1, -4,  1, -8,  127,  
172                                1,  1, -1, -1,  4,  1,  8, -128);
173     byte16 R = cast(byte16) _mm_blendv_epi8(A, B, M);
174     byte[16] correct =      [  0, 17,  2,  3, 20,  5, 22,  7,
175                                8,  9, 26, 27, 12, 13, 14, 31 ];
176     assert(R.array == correct);
177 }
178 
179 
180 /// Blend packed double-precision (64-bit) floating-point elements from `a` and `b` using `mask`.
181 __m128d _mm_blendv_pd (__m128d a, __m128d b, __m128d mask) @trusted
182 {
183     // PERF DMD
184     static if (GDC_with_SSE42)
185     {
186         // Amazingly enough, GCC/GDC generates the blendvpd instruction
187         // with -msse4.2 but not -msse4.1.
188         // Not sure what is the reason, and there is a replacement sequence.
189         // Sounds like a bug.
190         return __builtin_ia32_blendvpd(a, b, mask);
191     }
192     else static if (LDC_with_SSE41)
193     {
194         return __builtin_ia32_blendvpd(a, b, mask);
195     }
196     else static if (LDC_with_ARM64)
197     {
198         long2 shift;
199         shift = 63;
200         long2 lmask = cast(long2)mask >> shift;
201         return cast(__m128d) vbslq_s64(lmask, cast(long2)b, cast(long2)a);
202     }
203     else
204     {
205         __m128d r;
206         long2 lmask = cast(long2)mask;
207         for (int n = 0; n < 2; ++n)
208         {
209             r.ptr[n] = (lmask.array[n] < 0) ? b.array[n] : a.array[n];
210         }
211         return r;
212     }
213 }
214 unittest
215 {
216     __m128d A = _mm_setr_pd(1.0, 2.0);
217     __m128d B = _mm_setr_pd(3.0, 4.0);
218     __m128d M1 = _mm_setr_pd(-3.0, 2.0);
219     __m128d R1 = _mm_blendv_pd(A, B, M1);
220     double[2] correct1 = [3.0, 2.0];
221     assert(R1.array == correct1);
222 
223     // BUG: LDC _mm_blendv_pd doesn't work with NaN mask in arm64 Linux for some unknown reason.
224     // but it does work in arm64 macOS
225     // yields different results despite FP seemingly not being used
226     version(linux)
227     {}
228     else
229     {
230         __m128d M2 = _mm_setr_pd(double.nan, -double.nan);
231         __m128d R2 = _mm_blendv_pd(A, B, M2);
232         double[2] correct2 = [1.0, 4.0];
233         assert(R2.array == correct2);
234     }
235 }
236 
237 
238 /// Blend packed single-precision (32-bit) floating-point elements from `a` and `b` using `mask`.
239 __m128 _mm_blendv_ps (__m128 a, __m128 b, __m128 mask) @trusted
240 {
241     // PERF DMD
242     static if (GDC_with_SSE41)
243     {
244         return __builtin_ia32_blendvps(a, b, mask);
245     }
246     else static if (LDC_with_SSE41)
247     {
248         return __builtin_ia32_blendvps(a, b, mask);
249     }
250     else static if (LDC_with_ARM64)
251     {
252         int4 shift;
253         shift = 31;
254         int4 lmask = cast(int4)mask >> shift;
255         return cast(__m128) vbslq_s32(lmask, cast(int4)b, cast(int4)a);
256     }
257     else
258     {
259         __m128 r;
260         int4 lmask = cast(int4)mask;
261         for (int n = 0; n < 4; ++n)
262         {
263             r.ptr[n] = (lmask.array[n] < 0) ? b.array[n] : a.array[n];
264         }
265         return r;
266     }
267 }
268 unittest
269 {
270     __m128 A  = _mm_setr_ps( 0.0f, 1.0f, 2.0f, 3.0f);
271     __m128 B  = _mm_setr_ps( 4.0f, 5.0f, 6.0f, 7.0f);
272     __m128 M1 = _mm_setr_ps(-3.0f, 2.0f, 1.0f, -10000.0f);
273     __m128 M2 = _mm_setr_ps(float.nan, -float.nan, -0.0f, +0.0f);
274     __m128 R1 = _mm_blendv_ps(A, B, M1);
275     __m128 R2 = _mm_blendv_ps(A, B, M2);
276     float[4] correct1 =    [ 4.0f, 1.0f, 2.0f, 7.0f];
277     float[4] correct2 =    [ 0.0f, 5.0f, 6.0f, 3.0f];
278     assert(R1.array == correct1);
279 
280     // BUG: like above, LDC _mm_blendv_ps doesn't work with NaN mask in arm64 Linux for some unknown reason.
281     // yields different results despite FP seemingly not being used
282     version(linux)
283     {}
284     else
285     {
286         assert(R2.array == correct2);
287     }
288 }
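
// The mask for `_mm_blendv_ps` typically comes straight from a comparison; below is a
// small sketch of that select-by-predicate pattern (clamping negative lanes to zero),
// using only intrinsics already provided by this package.
unittest
{
    __m128 A = _mm_setr_ps(1.0f, -2.0f, 3.0f, -4.0f);
    __m128 zero = _mm_setzero_ps();
    __m128 mask = _mm_cmplt_ps(A, zero);      // all-ones lanes where A < 0
    __m128 R = _mm_blendv_ps(A, zero, mask);  // pick `zero` where the mask is set
    float[4] correct = [1.0f, 0.0f, 3.0f, 0.0f];
    assert(R.array == correct);
}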
289 
290 /// Round the packed double-precision (64-bit) floating-point elements in `a` up to an integer value, 
291 /// and store the results as packed double-precision floating-point elements.
292 __m128d _mm_ceil_pd (__m128d a) @trusted
293 {
294     static if (LDC_with_ARM64)
295     {
296         // LDC arm64 acceptable since 1.8 -O2
297         // Unfortunately x86 intrinsics force a round-trip back to double2
298         // ARM neon semantics wouldn't have that
299         long2 l = vcvtpq_s64_f64(a);
300         double2 r;
301         r.ptr[0] = l.array[0];
302         r.ptr[1] = l.array[1];
303         return r;
304     }
305     else
306     {
307         return _mm_round_pd!2(a);
308     }
309 }
310 unittest
311 {
312     __m128d A = _mm_setr_pd(1.3f, -2.12f);
313     __m128d B = _mm_setr_pd(53.6f, -2.7f);
314     A = _mm_ceil_pd(A);
315     B = _mm_ceil_pd(B);
316     double[2] correctA = [2.0, -2.0];
317     double[2] correctB = [54.0, -2.0];
318     assert(A.array == correctA);
319     assert(B.array == correctB);
320 }
321 
322 /// Round the packed single-precision (32-bit) floating-point elements in `a` up to an integer value, 
323 /// and store the results as packed single-precision floating-point elements.
324 __m128 _mm_ceil_ps (__m128 a) @trusted
325 {
326     static if (LDC_with_ARM64)
327     {
328         // LDC arm64 acceptable since 1.8 -O1
329         int4 l = vcvtpq_s32_f32(a);
330         float4 r;
331         r.ptr[0] = l.array[0];
332         r.ptr[1] = l.array[1];
333         r.ptr[2] = l.array[2];
334         r.ptr[3] = l.array[3];
335         return r;
336     }
337     else
338     {
339         return _mm_round_ps!2(a);
340     }
341 }
342 unittest
343 {
344     __m128 A = _mm_setr_ps(1.3f, -2.12f, 53.6f, -2.7f);
345     __m128 C = _mm_ceil_ps(A);
346     float[4] correct = [2.0f, -2.0f, 54.0f, -2.0f];
347     assert(C.array == correct);
348 }
349 
350 /// Round the lower double-precision (64-bit) floating-point element in `b` up to an integer value, 
351 /// store the result as a double-precision floating-point element in the lower element of result, 
352 /// and copy the upper element from `a` to the upper element of dst.
353 __m128d _mm_ceil_sd (__m128d a, __m128d b) @trusted
354 {
355     static if (LDC_with_ARM64)
356     {
357         a[0] = vcvtps_s64_f64(b[0]);
358         return a;
359     }
360     else
361     {
362         return _mm_round_sd!2(a, b);
363     }
364 }
365 unittest
366 {
367     __m128d A = _mm_setr_pd(1.3, -2.12);
368     __m128d B = _mm_setr_pd(53.6, -3.7);
369     __m128d C = _mm_ceil_sd(A, B);
370     double[2] correct = [54.0, -2.12];
371     assert(C.array == correct);
372 }
373 
374 /// Round the lower single-precision (32-bit) floating-point element in `b` up to an integer value,
375 /// store the result as a single-precision floating-point element in the lower element of result, 
376 /// and copy the upper 3 packed elements from `a` to the upper elements of result.
377 __m128 _mm_ceil_ss (__m128 a, __m128 b) @trusted
378 {
379     static if (LDC_with_ARM64)
380     {
381         a[0] = vcvtps_s32_f32(b[0]);
382         return a;
383     }
384     else
385     {
386         return _mm_round_ss!2(a, b);
387     }
388 }
389 unittest
390 {
391     __m128 A = _mm_setr_ps(1.3f, -2.12f, -4.5f, 1.1f);
392     __m128 B = _mm_setr_ps(53.6f, -3.7f, 8.0f, 7.0f);
393     __m128 C = _mm_ceil_ss(A, B);
394     float[4] correct = [54.0f, -2.12f, -4.5f, 1.1f];
395     assert(C.array == correct);
396 }
397 
398 /// Compare packed 64-bit integers in `a` and `b` for equality.
399 __m128i _mm_cmpeq_epi64 (__m128i a, __m128i b) @trusted
400 {
401     // PERF DMD
402     static if (GDC_with_SSE41)
403     {
404         return cast(__m128i)__builtin_ia32_pcmpeqq(cast(long2)a, cast(long2)b);
405     }
406     else version(LDC)
407     {
408         // LDC x86: generates pcmpeqq since LDC 1.1 -O1
409         //     arm64: generates cmeq since LDC 1.8 -O1
410         return cast(__m128i) equalMask!long2(cast(long2)a, cast(long2)b);
411     }
412     else
413     {
414         // Clever pcmpeqd + pand use with LDC 1.24 -O2
415         long2 la = cast(long2)a;
416         long2 lb = cast(long2)b;
417         long2 res;
418         res.ptr[0] = (la.array[0] == lb.array[0]) ? -1 : 0;
419         res.ptr[1] = (la.array[1] == lb.array[1]) ? -1 : 0;
420         return cast(__m128i)res;
421     }
422 }
423 unittest
424 {
425     __m128i A = _mm_setr_epi64(-1, -2);
426     __m128i B = _mm_setr_epi64(-3, -2);
427     __m128i C = _mm_setr_epi64(-1, -4);
428     long2 AB = cast(long2) _mm_cmpeq_epi64(A, B);
429     long2 AC = cast(long2) _mm_cmpeq_epi64(A, C);
430     long[2] correct1 = [0, -1];
431     long[2] correct2 = [-1, 0];
432     assert(AB.array == correct1);
433     assert(AC.array == correct2);
434 }
435 
436 
437 /// Sign extend packed 16-bit integers in `a` to packed 32-bit integers.
438 __m128i _mm_cvtepi16_epi32 (__m128i a) @trusted
439 {
440     // PERF DMD
441     static if (GDC_with_SSE41)
442     {
443         return cast(__m128i)__builtin_ia32_pmovsxwd128(cast(short8)a);
444     }
445     else version(LDC)
446     {
447         // LDC x86: Generates pmovsxwd since LDC 1.1 -O0, also good in arm64
448         enum ir = `
449             %v = shufflevector <8 x i16> %0,<8 x i16> %0, <4 x i32> <i32 0, i32 1,i32 2, i32 3>
450             %r = sext <4 x i16> %v to <4 x i32>
451             ret <4 x i32> %r`;
        return cast(__m128i) LDCInlineIR!(ir, int4, short8)(cast(short8)a);
453     }
454     else
455     {
456         short8 sa = cast(short8)a;
457         int4 r;
458         r.ptr[0] = sa.array[0];
459         r.ptr[1] = sa.array[1];
460         r.ptr[2] = sa.array[2];
461         r.ptr[3] = sa.array[3];
462         return r;
463     }
464 }
465 unittest
466 {
467     __m128i A = _mm_setr_epi16(-1, 0, -32768, 32767, 0, 0, 0, 0);
468     int4 C = cast(int4) _mm_cvtepi16_epi32(A);
469     int[4] correct = [-1, 0, -32768, 32767];
470     assert(C.array == correct);
471 }
472 
473 /// Sign extend packed 16-bit integers in `a` to packed 64-bit integers.
474 __m128i _mm_cvtepi16_epi64 (__m128i a) @trusted
475 {
476     // PERF DMD
477     static if (GDC_with_SSE41)
478     {
479         return cast(__m128i)__builtin_ia32_pmovsxwq128(cast(short8)a);
480     }
481     else version(LDC)
482     {
483         // LDC x86: Generates pmovsxwq since LDC 1.1 -O0, also good in arm64
484         enum ir = `
485             %v = shufflevector <8 x i16> %0,<8 x i16> %0, <2 x i32> <i32 0, i32 1>
486             %r = sext <2 x i16> %v to <2 x i64>
487             ret <2 x i64> %r`;
488         return cast(__m128i) LDCInlineIR!(ir, long2, short8)(cast(short8)a);
489     }
490     else
491     {
492         short8 sa = cast(short8)a;
493         long2 r;
494         r.ptr[0] = sa.array[0];
495         r.ptr[1] = sa.array[1];
496         return cast(__m128i)r;
497     }
498 }
499 unittest
500 {
501     __m128i A = _mm_setr_epi16(-32768, 32767, 0, 0, 0, 0, 0, 0);
502     long2 C = cast(long2) _mm_cvtepi16_epi64(A);
503     long[2] correct = [-32768, 32767];
504     assert(C.array == correct);
505 }
506 
507 /// Sign extend packed 32-bit integers in `a` to packed 64-bit integers.
508 __m128i _mm_cvtepi32_epi64 (__m128i a) @trusted
509 {
510     // PERF DMD
511     static if (GDC_with_SSE41)
512     {
513         return cast(__m128i)__builtin_ia32_pmovsxdq128(cast(int4)a);
514     }
515     else version(LDC)
516     {
517         // LDC x86: Generates pmovsxdq since LDC 1.1 -O0, also good in arm64
518         enum ir = `
519             %v = shufflevector <4 x i32> %0,<4 x i32> %0, <2 x i32> <i32 0, i32 1>
520             %r = sext <2 x i32> %v to <2 x i64>
521             ret <2 x i64> %r`;
522         return cast(__m128i) LDCInlineIR!(ir, long2, int4)(cast(int4)a);
523     }
524     else
525     {
526         int4 sa = cast(int4)a;
527         long2 r;
528         r.ptr[0] = sa.array[0];
529         r.ptr[1] = sa.array[1];
530         return cast(__m128i)r;
531     }
532 }
533 unittest
534 {
535     __m128i A = _mm_setr_epi32(-4, 42, 0, 0);
536     long2 C = cast(long2) _mm_cvtepi32_epi64(A);
537     long[2] correct = [-4, 42];
538     assert(C.array == correct);
539 }
540 
541 
542 /// Sign extend packed 8-bit integers in `a` to packed 16-bit integers.
543 __m128i _mm_cvtepi8_epi16 (__m128i a) @trusted
544 {
545     // PERF DMD
546     static if (GDC_with_SSE41)
547     {
548         alias ubyte16 = __vector(ubyte[16]);
549         return cast(__m128i)__builtin_ia32_pmovsxbw128(cast(ubyte16)a);
550     }
551     else version(LDC)
552     {
553         // LDC x86: pmovsxbw generated since LDC 1.1.0 -O0 
554         // LDC ARM64: sshll generated since LDC 1.8.0 -O1
555         enum ir = `
556             %v = shufflevector <16 x i8> %0,<16 x i8> %0, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
557             %r = sext <8 x i8> %v to <8 x i16>
558             ret <8 x i16> %r`;
559         return cast(__m128i) LDCInlineIR!(ir, short8, byte16)(cast(byte16)a);
560     }
561     else
562     {
563         byte16 sa = cast(byte16)a;
564         short8 r;
565         foreach(n; 0..8)
566             r.ptr[n] = sa.array[n];
567         return cast(__m128i)r;
568     }
569 }
570 unittest
571 {
572     __m128i A = _mm_setr_epi8(127, -128, 1, -1, 0, 2, -4, -8, 0, 0, 0, 0, 0, 0, 0, 0);
573     short8 C = cast(short8) _mm_cvtepi8_epi16(A);
574     short[8] correct = [127, -128, 1, -1, 0, 2, -4, -8];
575     assert(C.array == correct);
576 }
577 
578 
579 /// Sign extend packed 8-bit integers in `a` to packed 32-bit integers.
580 __m128i _mm_cvtepi8_epi32 (__m128i a) @trusted
581 {
582     // PERF DMD
583     static if (GDC_with_SSE41)
584     {
585         alias ubyte16 = __vector(ubyte[16]);
586         return cast(__m128i)__builtin_ia32_pmovsxbd128(cast(ubyte16)a);
587     }
588     else static if (LDC_with_SSE41)
589     {
590         // LDC x86: Generates pmovsxbd since LDC 1.1 -O0
591         enum ir = `
592             %v = shufflevector <16 x i8> %0,<16 x i8> %0, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
593             %r = sext <4 x i8> %v to <4 x i32>
594             ret <4 x i32> %r`;
595         return cast(__m128i) LDCInlineIR!(ir, int4, byte16)(cast(byte16)a);
596     }
597     else
598     {
        // LDC ARM64: this gives the same codegen as a vmovl_s16/vmovl_s8 sequence would
600         byte16 sa = cast(byte16)a;
601         int4 r;
602         r.ptr[0] = sa.array[0];
603         r.ptr[1] = sa.array[1];
604         r.ptr[2] = sa.array[2];
605         r.ptr[3] = sa.array[3];
606         return cast(__m128i)r;
607     }
608 }
609 unittest
610 {
611     __m128i A = _mm_setr_epi8(127, -128, 1, -1, 0, 2, -4, -8, 0, 0, 0, 0, 0, 0, 0, 0);
612     int4 C = cast(int4) _mm_cvtepi8_epi32(A);
613     int[4] correct = [127, -128, 1, -1];
614     assert(C.array == correct);
615 }
616 
617 
618 /// Sign extend packed 8-bit integers in the low 8 bytes of `a` to packed 64-bit integers.
619 __m128i _mm_cvtepi8_epi64 (__m128i a) @trusted
620 {
621     // PERF DMD
622     static if (GDC_with_SSE41)
623     {
624         alias ubyte16 = __vector(ubyte[16]);
625         return cast(__m128i)__builtin_ia32_pmovsxbq128(cast(ubyte16)a);
626     }
627     else version(LDC)
628     {
629         // LDC x86: Generates pmovsxbq since LDC 1.1 -O0, 
630         // LDC arm64: it's ok since LDC 1.8 -O1
631         enum ir = `
632             %v = shufflevector <16 x i8> %0,<16 x i8> %0, <2 x i32> <i32 0, i32 1>
633             %r = sext <2 x i8> %v to <2 x i64>
634             ret <2 x i64> %r`;
635         return cast(__m128i) LDCInlineIR!(ir, long2, byte16)(cast(byte16)a);
636     }
637     else
638     {
639         byte16 sa = cast(byte16)a;
640         long2 r;
641         foreach(n; 0..2)
642             r.ptr[n] = sa.array[n];
643         return cast(__m128i)r;
644     }
645 }
646 unittest
647 {
648     __m128i A = _mm_setr_epi8(127, -128, 1, -1, 0, 2, -4, -8, 0, 0, 0, 0, 0, 0, 0, 0);
649     long2 C = cast(long2) _mm_cvtepi8_epi64(A);
650     long[2] correct = [127, -128];
651     assert(C.array == correct);
652 }
653 
654 
655 /// Zero extend packed unsigned 16-bit integers in `a` to packed 32-bit integers.
656 __m128i _mm_cvtepu16_epi32 (__m128i a) @trusted
657 {
658     // PERF DMD
659     static if (GDC_with_SSE41)
660     {
661         return cast(__m128i) __builtin_ia32_pmovzxwd128(cast(short8)a);
662     }
663     else
664     {
665         // LDC x86: generates pmovzxwd since LDC 1.12 -O1 also good without SSE4.1
666         //     arm64: ushll since LDC 1.12 -O1
667         short8 sa = cast(short8)a;
668         int4 r;
669         r.ptr[0] = cast(ushort)sa.array[0];
670         r.ptr[1] = cast(ushort)sa.array[1];
671         r.ptr[2] = cast(ushort)sa.array[2];
672         r.ptr[3] = cast(ushort)sa.array[3];
673         return cast(__m128i)r;
674     }
675 }
676 unittest
677 {
678     __m128i A = _mm_setr_epi16(-1, 0, -32768, 32767, 0, 0, 0, 0);
679     int4 C = cast(int4) _mm_cvtepu16_epi32(A);
680     int[4] correct = [65535, 0, 32768, 32767];
681     assert(C.array == correct);
682 }
683 
684 
685 /// Zero extend packed unsigned 16-bit integers in `a` to packed 64-bit integers.
686 __m128i _mm_cvtepu16_epi64 (__m128i a) @trusted
687 {
688     // PERF DMD
689     static if (GDC_with_SSE41)
690     {
691         return cast(__m128i) __builtin_ia32_pmovzxwq128(cast(short8)a);
692     }
693     else static if (LDC_with_ARM64)
694     {
695         // LDC arm64: a bit shorter than below, in -O2
696         short8 sa = cast(short8)a;
697         long2 r;
698         for(int n = 0; n < 2; ++n)
699             r.ptr[n] = cast(ushort)sa.array[n];
700         return cast(__m128i)r;
701     }
702     else
703     {
        // LDC x86: generates pmovzxwq since LDC 1.12 -O1, also good without SSE4.1
705         short8 sa = cast(short8)a;
706         long2 r;
707         r.ptr[0] = cast(ushort)sa.array[0];
708         r.ptr[1] = cast(ushort)sa.array[1];
709         return cast(__m128i)r;
710     }
711 }
712 unittest
713 {
714     __m128i A = _mm_setr_epi16(-1, 0, -32768, 32767, 0, 0, 0, 0);
715     long2 C = cast(long2) _mm_cvtepu16_epi64(A);
716     long[2] correct = [65535, 0];
717     assert(C.array == correct);
718 }
719 
720 
721 /// Zero extend packed unsigned 32-bit integers in `a` to packed 64-bit integers.
722 __m128i _mm_cvtepu32_epi64 (__m128i a) @trusted
723 {
724     // PERF DMD
725     static if (GDC_with_SSE41)
726     {
        return cast(__m128i) __builtin_ia32_pmovzxdq128(cast(int4)a);
728     }
729     else
730     {
731         // LDC x86: generates pmovzxdq since LDC 1.12 -O1 also good without SSE4.1
732         //     arm64: generates ushll since LDC 1.12 -O1
733         int4 sa = cast(int4)a;
734         long2 r;
735         r.ptr[0] = cast(uint)sa.array[0];
736         r.ptr[1] = cast(uint)sa.array[1];
737         return cast(__m128i)r;
738     }
739 }
740 unittest
741 {
742     __m128i A = _mm_setr_epi32(-1, 42, 0, 0);
743     long2 C = cast(long2) _mm_cvtepu32_epi64(A);
744     long[2] correct = [4294967295, 42];
745     assert(C.array == correct);
746 }
747 
748 
749 /// Zero extend packed unsigned 8-bit integers in `a` to packed 16-bit integers.
750 __m128i _mm_cvtepu8_epi16 (__m128i a) @trusted
751 {
752     // PERF DMD
753     static if (GDC_with_SSE41)
754     {
        alias ubyte16 = __vector(ubyte[16]);
        return cast(__m128i) __builtin_ia32_pmovzxbw128(cast(ubyte16)a);
756     }
757     else
758     {
759         // LDC x86: generates pmovzxbw since LDC 1.12 -O1 also good without SSE4.1
760         //     arm64: ushll since LDC 1.12 -O1
761         // PERF: catastrophic with GDC without SSE4.1
762         byte16 sa = cast(byte16)a;
763         short8 r;
764         r.ptr[0] = cast(ubyte)sa.array[0];
765         r.ptr[1] = cast(ubyte)sa.array[1];
766         r.ptr[2] = cast(ubyte)sa.array[2];
767         r.ptr[3] = cast(ubyte)sa.array[3];
768         r.ptr[4] = cast(ubyte)sa.array[4];
769         r.ptr[5] = cast(ubyte)sa.array[5];
770         r.ptr[6] = cast(ubyte)sa.array[6];
771         r.ptr[7] = cast(ubyte)sa.array[7];
772         return cast(__m128i)r;
773     }
774 }
775 unittest
776 {
777     __m128i A = _mm_setr_epi8(127, -128, 1, -1, 0, 2, -4, -8, 0, 0, 0, 0, 0, 0, 0, 0);
778     short8 C = cast(short8) _mm_cvtepu8_epi16(A);
779     short[8] correct = [127, 128, 1, 255, 0, 2, 252, 248];
780     assert(C.array == correct);
781 }
782 
783 
784 /// Zero extend packed unsigned 8-bit integers in `a` to packed 32-bit integers.
785 __m128i _mm_cvtepu8_epi32 (__m128i a) @trusted
786 {
787     // PERF DMD
788     static if (GDC_with_SSE41)
789     {
790         alias ubyte16 = __vector(ubyte[16]);
791         return cast(__m128i) __builtin_ia32_pmovzxbd128(cast(ubyte16)a);
792     }
793     else static if (LDC_with_ARM64)
794     {
795         // LDC arm64: a bit better than below in -O2
796         byte16 sa = cast(byte16)a;
797         int4 r;
798         for(int n = 0; n < 4; ++n) 
799             r.ptr[n] = cast(ubyte)sa.array[n];
800         return cast(__m128i)r;
801     }
802     else
803     {
804         // LDC x86: generates pmovzxbd since LDC 1.12 -O1 also good without SSE4.1
805         // PERF: catastrophic with GDC without SSE4.1
806         byte16 sa = cast(byte16)a;
807         int4 r;
808         r.ptr[0] = cast(ubyte)sa.array[0];
809         r.ptr[1] = cast(ubyte)sa.array[1];
810         r.ptr[2] = cast(ubyte)sa.array[2];
811         r.ptr[3] = cast(ubyte)sa.array[3];
812         return cast(__m128i)r;
813     }
814 }
815 unittest
816 {
817     __m128i A = _mm_setr_epi8(127, -128, 1, -1, 0, 2, -4, -8, 0, 0, 0, 0, 0, 0, 0, 0);
818     int4 C = cast(int4) _mm_cvtepu8_epi32(A);
819     int[4] correct = [127, 128, 1, 255];
820     assert(C.array == correct);
821 }
822 
823 /// Zero extend packed unsigned 8-bit integers in the low 8 bytes of `a` to packed 64-bit integers.
824 __m128i _mm_cvtepu8_epi64 (__m128i a) @trusted
825 {
826     // PERF DMD
827     static if (GDC_with_SSE41)
828     {
829         alias ubyte16 = __vector(ubyte[16]);
830         return cast(__m128i)__builtin_ia32_pmovzxbq128(cast(ubyte16)a);
831     }
832     else static if (LDC_with_ARM64)
833     {
834         // LDC arm64: this optimizes better than the loop below
835         byte16 sa = cast(byte16)a;
836         long2 r;
837         for (int n = 0; n < 2; ++n)
838             r.ptr[n] = cast(ubyte)sa.array[n];
839         return cast(__m128i)r;
840     }
841     else
842     {
843         // LDC x86: Generates pmovzxbq since LDC 1.1 -O0, a pshufb without SSE4.1
844         byte16 sa = cast(byte16)a;
845         long2 r;
846         r.ptr[0] = cast(ubyte)sa.array[0];
847         r.ptr[1] = cast(ubyte)sa.array[1];
848         return cast(__m128i)r;
849     }
850 }
851 unittest
852 {
853     __m128i A = _mm_setr_epi8(127, -2, 1, -1, 0, 2, -4, -8, 0, 0, 0, 0, 0, 0, 0, 0);
854     long2 C = cast(long2) _mm_cvtepu8_epi64(A);
855     long[2] correct = [127, 254];
856     assert(C.array == correct);
857 }
858 
859 /// Conditionally multiply the packed double-precision (64-bit) floating-point elements 
/// in `a` and `b` using the high 4 bits in `imm8`, sum the two products, and conditionally
861 /// store the sum in dst using the low 4 bits of `imm8`.
862 __m128d _mm_dp_pd(int imm8)(__m128d a, __m128d b) @trusted
863 {
864     // PERF DMD
865     static if (GDC_with_SSE41)
866     {
867         return __builtin_ia32_dppd(a, b, imm8 & 0x33);
868     }
869     else static if (LDC_with_SSE41)
870     {
871         return __builtin_ia32_dppd(a, b, imm8 & 0x33);
872     }
873     else
874     {
875         __m128d zero = _mm_setzero_pd();
876         __m128d temp = _mm_blend_pd!( (imm8 >>> 4) & 3)(zero, a * b);
877         double sum = temp.array[0] + temp.array[1];
878         return _mm_blend_pd!(imm8 & 3)(zero, _mm_set1_pd(sum));
879     }
880 }
881 unittest
882 {
883     __m128d A = _mm_setr_pd(1.0, 2.0);
884     __m128d B = _mm_setr_pd(4.0, 8.0);
885     double2 R1 = _mm_dp_pd!(0x10 + 0x3 + 0x44)(A, B);
886     double2 R2 = _mm_dp_pd!(0x20 + 0x1 + 0x88)(A, B);
887     double2 R3 = _mm_dp_pd!(0x30 + 0x2 + 0x00)(A, B);
888     double[2] correct1 = [ 4.0,  4.0];
889     double[2] correct2 = [16.0,  0.0];
890     double[2] correct3 = [ 0.0, 20.0];
891     assert(R1.array == correct1);
892     assert(R2.array == correct2);
893     assert(R3.array == correct3);
894 }
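
// A sketch of the most common `_mm_dp_pd` use: multiply both lanes and broadcast the
// dot product to both result lanes (imm8 = 0x33).
unittest
{
    __m128d A = _mm_setr_pd(1.0, 2.0);
    __m128d B = _mm_setr_pd(4.0, 8.0);
    double2 R = _mm_dp_pd!0x33(A, B);
    double[2] correct = [20.0, 20.0]; // 1*4 + 2*8
    assert(R.array == correct);
}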
895 
896 /// Conditionally multiply the packed single-precision (32-bit) floating-point elements 
897 /// in `a` and `b` using the high 4 bits in `imm8`, sum the four products, 
898 /// and conditionally store the sum in result using the low 4 bits of `imm8`.
899 __m128 _mm_dp_ps(int imm8)(__m128 a, __m128 b) @trusted
900 {
901       // PERF DMD
902     static if (GDC_with_SSE41)
903     {
904         return __builtin_ia32_dpps(a, b, cast(byte)imm8);
905     }
906     else static if (LDC_with_SSE41)
907     {
908         return __builtin_ia32_dpps(a, b, cast(byte)imm8);
909     }
910     else
911     {
912         __m128 zero = _mm_setzero_ps();
913         __m128 temp = _mm_blend_ps!( (imm8 >>> 4) & 15)(zero, a * b);
914         float sum = temp.array[0] + temp.array[1] + temp.array[2] + temp.array[3];
915         return _mm_blend_ps!(imm8 & 15)(zero, _mm_set1_ps(sum));
916     }        
917 }
918 unittest
919 {
920     __m128 A = _mm_setr_ps(1.0f, 2.0f, 4.0f, 8.0f);
921     __m128 B = _mm_setr_ps(9.0f, 7.0f, 5.0f, 3.0f);
922     float4 R1 = _mm_dp_ps!(0xf0 + 0xf)(A, B);
923     float4 R2 = _mm_dp_ps!(0x30 + 0x5)(A, B);
924     float4 R3 = _mm_dp_ps!(0x50 + 0xa)(A, B);
925     float[4] correct1 =   [67.0f, 67.0f, 67.0f, 67.0f];
926     float[4] correct2 =   [23.0f, 0.0f, 23.0f, 0.0f];
927     float[4] correct3 =   [0.0f, 29.0f, 0.0f, 29.0f];
928     assert(R1.array == correct1);
929     assert(R2.array == correct2);
930     assert(R3.array == correct3);
931 }
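
// A sketch of a typical `_mm_dp_ps` use: a 3-component dot product (lane 3 ignored),
// broadcast to all four lanes with imm8 = 0x7F.
unittest
{
    __m128 A = _mm_setr_ps(1.0f, 2.0f, 4.0f, 100.0f);
    __m128 B = _mm_setr_ps(9.0f, 7.0f, 5.0f, 100.0f);
    float4 R = cast(float4) _mm_dp_ps!0x7F(A, B);
    float[4] correct = [43.0f, 43.0f, 43.0f, 43.0f]; // 1*9 + 2*7 + 4*5
    assert(R.array == correct);
}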
932 
933 
934 /// Extract a 32-bit integer from `a`, selected with `imm8`.
935 int _mm_extract_epi32 (__m128i a, const int imm8) pure @trusted
936 {
937     return (cast(int4)a).array[imm8 & 3];
938 }
939 unittest
940 {
941     __m128i A = _mm_setr_epi32(1, 2, 3, 4);
942     assert(_mm_extract_epi32(A, 0) == 1);
943     assert(_mm_extract_epi32(A, 1 + 8) == 2);
944     assert(_mm_extract_epi32(A, 3 + 4) == 4);
945 }
946 
947 /// Extract a 64-bit integer from `a`, selected with `imm8`.
948 long _mm_extract_epi64 (__m128i a, const int imm8) pure @trusted
949 {
950     long2 la = cast(long2)a;
951     return la.array[imm8 & 1];
952 }
953 unittest
954 {
955     __m128i A = _mm_setr_epi64(45, -67);
956     assert(_mm_extract_epi64(A, 0) == 45);
957     assert(_mm_extract_epi64(A, 1) == -67);
958     assert(_mm_extract_epi64(A, 2) == 45);
959 }
960 
961 /// Extract an 8-bit integer from `a`, selected with `imm8`.
962 /// Warning: the returned value is zero-extended to 32-bits.
963 int _mm_extract_epi8 (__m128i a, const int imm8) @trusted
964 {
965     byte16 ba = cast(byte16)a;
966     return cast(ubyte) ba.array[imm8 & 15];
967 }
968 unittest
969 {
970     __m128i A = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, -1, 14, 15);
971     assert(_mm_extract_epi8(A, 7) == 7);
972     assert(_mm_extract_epi8(A, 13) == 255);
973     assert(_mm_extract_epi8(A, 7 + 16) == 7);
974 }
975 
976 /// Extract a single-precision (32-bit) floating-point element from `a`, selected with `imm8`.
977 /// Note: returns a 32-bit $(I integer).
978 int _mm_extract_ps (__m128 a, const int imm8) @trusted
979 {
980     return (cast(int4)a).array[imm8 & 3];
981 }
982 unittest
983 {
984     __m128 A = _mm_setr_ps(1.0f, 2.0f, 3.0f, -4.0f);
985     assert(_mm_extract_ps(A, 0) == 0x3f800000);
986     assert(_mm_extract_ps(A, 1 + 8) == 0x40000000);
987     assert(_mm_extract_ps(A, 3 + 4) == cast(int)0xc0800000);
988 }
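
// Because `_mm_extract_ps` returns the raw bit pattern, a small sketch of getting the
// actual float back is to reinterpret those bits (plain pointer cast, for illustration):
unittest
{
    __m128 A = _mm_setr_ps(1.0f, 2.0f, 3.0f, -4.0f);
    int bits = _mm_extract_ps(A, 3);
    float f = *cast(float*)&bits;
    assert(f == -4.0f);
}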
989 
990 
991 
992 /// Round the packed double-precision (64-bit) floating-point elements in `a` down to an 
993 /// integer value, and store the results as packed double-precision floating-point elements.
994 __m128d _mm_floor_pd (__m128d a) @trusted
995 {
996     static if (LDC_with_ARM64)
997     {
998         // LDC arm64 acceptable since 1.8 -O2
999         long2 l = vcvtmq_s64_f64(a);
1000         double2 r;
1001         r.ptr[0] = l.array[0];
1002         r.ptr[1] = l.array[1];
1003         return r;
1004     }
1005     else
1006     {
1007         return _mm_round_pd!1(a);
1008     }
1009 }
1010 unittest
1011 {
1012     __m128d A = _mm_setr_pd(1.3f, -2.12f);
1013     __m128d B = _mm_setr_pd(53.6f, -2.7f);
1014     A = _mm_floor_pd(A);
1015     B = _mm_floor_pd(B);
1016     double[2] correctA = [1.0, -3.0];
1017     double[2] correctB = [53.0, -3.0];
1018     assert(A.array == correctA);
1019     assert(B.array == correctB);
1020 }
1021 
1022 /// Round the packed single-precision (32-bit) floating-point elements in `a` down to an 
1023 /// integer value, and store the results as packed single-precision floating-point elements.
1024 __m128 _mm_floor_ps (__m128 a) @trusted
1025 {
1026     static if (LDC_with_ARM64)
1027     {
1028         // LDC arm64 acceptable since 1.8 -O1
1029         int4 l = vcvtmq_s32_f32(a);
1030         float4 r;
1031         r.ptr[0] = l.array[0];
1032         r.ptr[1] = l.array[1];
1033         r.ptr[2] = l.array[2];
1034         r.ptr[3] = l.array[3];
1035         return r;
1036     }
1037     else
1038     {
1039         return _mm_round_ps!1(a);
1040     }
1041 }
1042 unittest
1043 {
1044     __m128 A = _mm_setr_ps(1.3f, -2.12f, 53.6f, -2.7f);
1045     __m128 C = _mm_floor_ps(A);
1046     float[4] correct = [1.0f, -3.0f, 53.0f, -3.0f];
1047     assert(C.array == correct);
1048 }
1049 
1050 /// Round the lower double-precision (64-bit) floating-point element in `b` down to an 
1051 /// integer value, store the result as a double-precision floating-point element in the 
1052 /// lower element, and copy the upper element from `a` to the upper element.
1053 __m128d _mm_floor_sd (__m128d a, __m128d b) @trusted
1054 {
1055     static if (LDC_with_ARM64)
1056     {
1057         a[0] = vcvtms_s64_f64(b[0]);
1058         return a;
1059     }
1060     else
1061     {
1062         return _mm_round_sd!1(a, b);
1063     }
1064 }
1065 unittest
1066 {
1067     __m128d A = _mm_setr_pd(1.3, -2.12);
1068     __m128d B = _mm_setr_pd(-53.1, -3.7);
1069     __m128d C = _mm_floor_sd(A, B);
1070     double[2] correct = [-54.0, -2.12];
1071     assert(C.array == correct);
1072 }
1073 
1074 /// Round the lower single-precision (32-bit) floating-point element in `b` down to an
1075 /// integer value, store the result as a single-precision floating-point element in the
1076 /// lower element, and copy the upper 3 packed elements from `a` to the upper elements.
1077 __m128 _mm_floor_ss (__m128 a, __m128 b) @trusted
1078 {
1079     static if (LDC_with_ARM64)
1080     {
1081         a[0] = vcvtms_s32_f32(b[0]);
1082         return a;
1083     }
1084     else
1085     {
1086         return _mm_round_ss!1(a, b);
1087     }
1088 }
1089 unittest
1090 {
1091     __m128 A = _mm_setr_ps(1.3f, -2.12f, -4.5f, 1.1f);
1092     __m128 B = _mm_setr_ps(-539.3f, -3.7f, 8.0f, 7.0f);
1093     __m128 C = _mm_floor_ss(A, B);
1094     float[4] correct = [-540.0f, -2.12f, -4.5f, 1.1f];
1095     assert(C.array == correct);
1096 }
1097 
1098 /// Insert the 32-bit integer `i` into `a` at the location specified by `imm8[1:0]`.
1099 __m128i _mm_insert_epi32 (__m128i a, int i, const int imm8) pure @trusted
1100 {
1101     // GDC: nothing special to do, pinsrd generated with -O1 -msse4.1
    // LDC x86: pinsrd since LDC 1.1 -O2 with -mattr=+sse4.1
1103     // LDC arm64: ins.s since LDC 1.8 -O2
1104     int4 ia = cast(int4)a;
1105     ia.ptr[imm8 & 3] = i;
1106     return cast(__m128i)ia; 
1107 }
1108 unittest
1109 {
1110     __m128i A = _mm_setr_epi32(1, 2, 3, 4);
1111     int4 C = cast(int4) _mm_insert_epi32(A, 5, 2 + 4);
1112     int[4] result = [1, 2, 5, 4];
1113     assert(C.array == result);
1114 }
1115 
1116 /// Insert the 64-bit integer `i` into `a` at the location specified by `imm8[0]`.
1117 __m128i _mm_insert_epi64 (__m128i a, long i, const int imm8) pure @trusted
1118 {
    // GDC: nothing special to do, pinsrq generated with -O1 -msse4.1
1120     // LDC x86: always do something sensible.
1121     long2 la = cast(long2)a;
1122     la.ptr[imm8 & 1] = i;
1123     return cast(__m128i)la;
1124 }
1125 unittest
1126 {
1127     __m128i A = _mm_setr_epi64(1, 2);
1128     long2 C = cast(long2) _mm_insert_epi64(A, 5, 1 + 2);
1129     long[2] result = [1, 5];
1130     assert(C.array == result);
1131 }
1132 
/// Insert the lower 8-bit integer from `i` into `a` at the location specified by `imm8[3:0]`.
1135 __m128i _mm_insert_epi8 (__m128i a, int i, const int imm8) @trusted
1136 {
1137     // GDC: nothing special to do, pinsrb generated with -O1 -msse4.1
1138     // LDC x86: doesn't do pinsrb, maybe it's slower. arm64 also spills to memory.
1139     byte16 ba = cast(byte16)a;
1140     ba.ptr[imm8 & 15] = cast(byte)i;
1141     return cast(__m128i)ba; 
1142 }
1143 unittest
1144 {
1145     __m128i A = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
1146     byte16 C = cast(byte16) _mm_insert_epi8(A, 30, 4 + 16);
1147     byte[16] result = [0, 1, 2, 3, 30, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15];
1148     assert(C.array == result);
1149 }
1150 
1151 
1152 /// Warning: of course it does something totally different from `_mm_insert_epi32`!
1153 /// Copy `a` to `tmp`, then insert a single-precision (32-bit) floating-point element from `b` 
1154 /// into `tmp` using the control in `imm8`. Store `tmp` to result using the mask in `imm8[3:0]` 
1155 /// (elements are zeroed out when the corresponding bit is set).
1156 __m128 _mm_insert_ps(int imm8)(__m128 a, __m128 b) @trusted
1157 {
1158     // PERF DMD
1159     static if (GDC_with_SSE41)
1160     {
1161         return __builtin_ia32_insertps128(a, b, cast(byte)imm8);
1162     }
1163     else static if (LDC_with_SSE41)
1164     {
1165         return __builtin_ia32_insertps128(a, b, cast(byte)imm8);
1166     }
1167     else
1168     {
1169         float4 tmp2 = a;
1170         float tmp1 = b.array[(imm8 >> 6) & 3];
1171         tmp2.ptr[(imm8 >> 4) & 3] = tmp1;
1172         return _mm_blend_ps!(imm8 & 15)(tmp2, _mm_setzero_ps());
1173     }
1174 }
1175 unittest
1176 {
1177     __m128 A = _mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f);
1178     __m128 B = _mm_setr_ps(5.0f, 6.0f, 7.0f, 8.0f);
1179     __m128 C = _mm_insert_ps!(128 + (32 + 16) + 4)(A, B);
1180     float[4] correct =    [1.0f, 2.0f, 0.0f, 7.0f];
1181     assert(C.array == correct);
1182 }
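
// The low 4 bits of `imm8` can also be used purely as a zeroing mask; a minimal sketch
// that leaves `a` unchanged except for zeroing lane 1 (imm8 = 1 << 1):
unittest
{
    __m128 A = _mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f);
    __m128 R = _mm_insert_ps!(1 << 1)(A, A);
    float[4] correct = [1.0f, 0.0f, 3.0f, 4.0f];
    assert(R.array == correct);
}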
1183 
1184 
1185 /// Compare packed signed 32-bit integers in `a` and `b`, returns packed maximum values.
1186 __m128i _mm_max_epi32 (__m128i a, __m128i b) @trusted
1187 {
1188     static if (GDC_with_SSE41)
1189     {
1190         return cast(__m128i) __builtin_ia32_pmaxsd128(cast(int4)a, cast(int4)b);
1191     }
1192     else version(LDC)
1193     {
1194         // x86: pmaxsd since LDC 1.1 -O1
        // ARM: smax.4s since LDC 1.8 -O1
1196         int4 sa = cast(int4)a;
1197         int4 sb = cast(int4)b;
1198         int4 greater = greaterMask!int4(sa, sb);
1199         return cast(__m128i)( (greater & sa) | (~greater & sb) );
1200     }
1201     else
1202     {
1203         __m128i higher = _mm_cmpgt_epi32(a, b);
1204         __m128i aTob = _mm_xor_si128(a, b); // a ^ (a ^ b) == b
1205         __m128i mask = _mm_and_si128(aTob, higher);
1206         return _mm_xor_si128(b, mask);
1207     }
1208 }
1209 unittest
1210 {
1211     int4 R = cast(int4) _mm_max_epi32(_mm_setr_epi32(0x7fffffff, 1, -4, 7),
1212                                       _mm_setr_epi32(        -4,-8,  9, -8));
1213     int[4] correct =                               [0x7fffffff, 1,  9,  7];
1214     assert(R.array == correct);
1215 }
1216 
1217 /// Compare packed signed 8-bit integers in `a` and `b`, 
1218 /// and return packed maximum values.
1219 __m128i _mm_max_epi8 (__m128i a, __m128i b) @trusted
1220 {
1221     // PERF DMD
1222     static if (GDC_with_SSE41)
1223     {
1224         return cast(__m128i) __builtin_ia32_pmaxsb128(cast(ubyte16)a, cast(ubyte16)b);
1225     }
1226     else version(LDC)
1227     {
1228         // x86: pmaxsb since LDC 1.1 -O1
1229         // ARM64: smax.16b since LDC 1.8.0 -O1
1230         byte16 sa = cast(byte16)a;
1231         byte16 sb = cast(byte16)b;
1232         byte16 greater = cast(byte16) greaterMask!byte16(sa, sb);
1233         return cast(__m128i)( (greater & sa) | (~greater & sb) );
1234     }
1235     else
1236     {
1237         __m128i lower = _mm_cmpgt_epi8(a, b); // ones where a should be selected, b else
1238         __m128i aTob = _mm_xor_si128(a, b); // a ^ (a ^ b) == b
1239         __m128i mask = _mm_and_si128(aTob, lower);
1240         return _mm_xor_si128(b, mask);
1241     }
1242 }
1243 unittest
1244 {
1245     __m128i A = _mm_setr_epi8(127,  1, -4, -8, 9,    7, 0, 57, 0, 0, 0, 0, 0, 0, 0, 0);
1246     __m128i B = _mm_setr_epi8(  4, -8,  9, -7, 0, -128, 0,  0, 0, 0, 0, 0, 0, 0, 0, 0);
1247     byte16 R = cast(byte16) _mm_max_epi8(A, B);
1248     byte[16] correct =       [127,  1,  9, -7, 9,    7, 0, 57, 0, 0, 0, 0, 0, 0, 0, 0];
1249     assert(R.array == correct);
1250 }
1251 
1252 /// Compare packed unsigned 16-bit integers in `a` and `b`, returns packed maximum values.
1253 __m128i _mm_max_epu16 (__m128i a, __m128i b) @trusted
1254 {
1255     // PERF DMD
1256     static if (GDC_with_SSE41)
1257     {
1258         return cast(__m128i) __builtin_ia32_pmaxuw128(cast(short8)a, cast(short8)b);
1259     }
1260     else version(LDC)
1261     {
1262         // x86: pmaxuw since LDC 1.1 -O1
1263         // ARM64: umax.8h since LDC 1.8.0 -O1
1264         // PERF: without sse4.1, LLVM 12 produces a very interesting
1265         //          psubusw xmm0, xmm1
1266         //          paddw   xmm0, xmm1
1267         //       sequence that maybe should go in other min/max intrinsics? 
1268         ushort8 sa = cast(ushort8)a;
1269         ushort8 sb = cast(ushort8)b;
1270         ushort8 greater = cast(ushort8) greaterMask!ushort8(sa, sb);
1271         return cast(__m128i)( (greater & sa) | (~greater & sb) );
1272     }
1273     else
1274     {
1275         b = _mm_subs_epu16(b, a);
1276         b = _mm_add_epi16(b, a);
1277         return b;
1278     }
1279 }
1280 unittest
1281 {
1282     short8 R = cast(short8) _mm_max_epu16(_mm_setr_epi16(32767,  1, -4, -8, 9,     7, 0, 57),
1283                                           _mm_setr_epi16(   -4, -8,  9, -7, 0,-32768, 0,  0));
1284     short[8] correct =                                  [   -4, -8, -4, -7, 9,-32768, 0, 57];
1285     assert(R.array == correct);
1286 }
1287 
1288 /// Compare packed unsigned 32-bit integers in `a` and `b`, returns packed maximum values.
1289 __m128i _mm_max_epu32 (__m128i a, __m128i b) @trusted
1290 {
1291     // PERF DMD
1292     static if (GDC_with_SSE41)
1293     {
1294         return cast(__m128i) __builtin_ia32_pmaxud128(cast(int4)a, cast(int4)b);
1295     }
1296     else version(LDC)
1297     {
1298         // x86: pmaxud since LDC 1.1 -O1, also good without sse4.1
1299         // ARM64: umax.4s since LDC 1.8.0 -O1
1300         uint4 sa = cast(uint4)a;
1301         uint4 sb = cast(uint4)b;
1302         uint4 greater = cast(uint4) greaterMask!uint4(sa, sb);
1303         return cast(__m128i)( (greater & sa) | (~greater & sb) );
1304     }
1305     else
1306     {
1307         __m128i valueShift = _mm_set1_epi32(-0x80000000);
1308         __m128i higher = _mm_cmpgt_epi32(_mm_add_epi32(a, valueShift), _mm_add_epi32(b, valueShift));
1309         __m128i aTob = _mm_xor_si128(a, b); // a ^ (a ^ b) == b
1310         __m128i mask = _mm_and_si128(aTob, higher);
1311         return _mm_xor_si128(b, mask);
1312     }
1313 }
1314 unittest
1315 {
1316     int4 R = cast(int4) _mm_max_epu32(_mm_setr_epi32(0x7fffffff, 1,  4, -7),
1317                                       _mm_setr_epi32(        -4,-8,  9, -8));
1318     int[4] correct =                                [        -4,-8,  9, -7];
1319     assert(R.array == correct);
1320 }
1321 
/// Compare packed signed 32-bit integers in `a` and `b`, returns packed minimum values.
1323 __m128i _mm_min_epi32 (__m128i a, __m128i b) @trusted
1324 {
1325     // PERF DMD
1326     static if (GDC_with_SSE41)
1327     {
1328         return cast(__m128i) __builtin_ia32_pminsd128(cast(int4)a, cast(int4)b);
1329     }
1330     else version(LDC)
1331     {
1332         // x86: pminsd since LDC 1.1 -O1, also good without sse4.1
        // ARM: smin.4s since LDC 1.8 -O1
1334         int4 sa = cast(int4)a;
1335         int4 sb = cast(int4)b;
1336         int4 greater = greaterMask!int4(sa, sb);
1337         return cast(__m128i)( (~greater & sa) | (greater & sb) );
1338     }
1339     else
1340     {
1341         __m128i higher = _mm_cmplt_epi32(a, b);
1342         __m128i aTob = _mm_xor_si128(a, b); // a ^ (a ^ b) == b
1343         __m128i mask = _mm_and_si128(aTob, higher);
1344         return _mm_xor_si128(b, mask);
1345     }
1346 }
1347 unittest
1348 {
1349     int4 R = cast(int4) _mm_min_epi32(_mm_setr_epi32(0x7fffffff,  1, -4, 7),
1350                                       _mm_setr_epi32(        -4, -8,  9, -8));
1351     int[4] correct =                               [         -4, -8, -4, -8];
1352     assert(R.array == correct);
1353 }
1354 
1355 /// Compare packed signed 8-bit integers in `a` and `b`, 
1356 /// and return packed minimum values.
1357 __m128i _mm_min_epi8 (__m128i a, __m128i b) @trusted
1358 {
1359     // PERF DMD
1360     static if (GDC_with_SSE41)
1361     {
1362         return cast(__m128i) __builtin_ia32_pminsb128(cast(ubyte16)a, cast(ubyte16)b);
1363     }
1364     else version(LDC)
1365     {
1366         // x86: pminsb since LDC 1.1 -O1
1367         // ARM64: smin.16b since LDC 1.8.0 -O1
1368         byte16 sa = cast(byte16)a;
1369         byte16 sb = cast(byte16)b;
1370         byte16 greater = cast(byte16) greaterMask!byte16(sa, sb);
1371         return cast(__m128i)( (~greater & sa) | (greater & sb) );
1372     }
1373     else
1374     {
1375         __m128i lower = _mm_cmplt_epi8(a, b); // ones where a should be selected, b else
1376         __m128i aTob = _mm_xor_si128(a, b); // a ^ (a ^ b) == b
1377         __m128i mask = _mm_and_si128(aTob, lower);
1378         return _mm_xor_si128(b, mask);
1379     }
1380 }
1381 unittest
1382 {
1383     __m128i A = _mm_setr_epi8(127,  1, -4, -8, 9,    7, 0, 57, 0, 0, 0, 0, 0, 0, 0, 0);
1384     __m128i B = _mm_setr_epi8(  4, -8,  9, -7, 0, -128, 0,  0, 0, 0, 0, 0, 0, 0, 0, 0);
1385     byte16 R = cast(byte16) _mm_min_epi8(A, B);
1386     byte[16] correct =       [  4, -8, -4, -8, 0, -128, 0,  0, 0, 0, 0, 0, 0, 0, 0, 0];
1387     assert(R.array == correct);
1388 }
1389 
1390 /// Compare packed unsigned 16-bit integers in a and b, and store packed minimum values in dst.
1391 __m128i _mm_min_epu16 (__m128i a, __m128i b) @trusted
1392 {
1393     // PERF DMD
1394     static if (GDC_with_SSE41)
1395     {
1396         return cast(__m128i) __builtin_ia32_pminuw128(cast(short8)a, cast(short8)b);
1397     }
1398     else version(LDC)
1399     {
1400         // x86: pminuw since LDC 1.1 -O1, psubusw+psubw sequence without sse4.1
1401         // ARM64: umin.8h since LDC 1.8.0 -O1
1402         ushort8 sa = cast(ushort8)a;
1403         ushort8 sb = cast(ushort8)b;
1404         ushort8 greater = cast(ushort8) greaterMask!ushort8(sb, sa);
1405         return cast(__m128i)( (greater & sa) | (~greater & sb) );
1406     }
1407     else
1408     {
1409         __m128i c = _mm_subs_epu16(b, a);
1410         b = _mm_sub_epi16(b, c);
1411         return b;
1412     }
1413 }
1414 unittest
1415 {
1416     short8 R = cast(short8) _mm_min_epu16(_mm_setr_epi16(32767,  1, -4, -8, 9,     7, 0, 57),
1417                                           _mm_setr_epi16(   -4, -8,  9, -7, 0,-32768, 0,  0));
1418     short[8] correct =                                  [32767,  1,  9, -8, 0,     7, 0,  0];
1419     assert(R.array == correct);
1420 }
1421 
1422 /// Compare packed unsigned 32-bit integers in a and b, and store packed minimum values in dst.
1423 __m128i _mm_min_epu32 (__m128i a, __m128i b) @trusted
1424 {
1425     // PERF DMD
1426     static if (GDC_with_SSE41)
1427     {
1428         return cast(__m128i) __builtin_ia32_pminud128(cast(int4)a, cast(int4)b);
1429     }
1430     else version(LDC)
1431     {
1432         // x86: pminud since LDC 1.1 -O1, also good without sse4.1
1433         // ARM64: umin.4s since LDC 1.8.0 -O1
1434         uint4 sa = cast(uint4)a;
1435         uint4 sb = cast(uint4)b;
1436         uint4 greater = cast(uint4) greaterMask!uint4(sa, sb);
1437         return cast(__m128i)( (~greater & sa) | (greater & sb) );
1438     }
1439     else
1440     {
1441         __m128i valueShift = _mm_set1_epi32(-0x80000000);
1442         __m128i higher = _mm_cmpgt_epi32(_mm_add_epi32(b, valueShift), _mm_add_epi32(a, valueShift));
1443         __m128i aTob = _mm_xor_si128(a, b); // a ^ (a ^ b) == b
1444         __m128i mask = _mm_and_si128(aTob, higher);
1445         return _mm_xor_si128(b, mask);
1446     }
1447 }
1448 unittest
1449 {
1450     int4 R = cast(int4) _mm_min_epu32(_mm_setr_epi32(0x7fffffff, 1,  4, -7),
1451                                       _mm_setr_epi32(        -4,-8,  9, -8));
1452     int[4] correct =                                [0x7fffffff, 1,  4, -8];
1453     assert(R.array == correct);
1454 }
1455 
1456 /// Horizontally compute the minimum amongst the packed unsigned 16-bit integers in `a`, 
1457 /// store the minimum and index in return value, and zero the remaining bits.
1458 __m128i _mm_minpos_epu16 (__m128i a) @trusted
1459 {
1460     // PERF DMD
1461     static if (GDC_with_SSE41)
1462     {
1463         return cast(__m128i) __builtin_ia32_phminposuw128(cast(short8)a);
1464     }
1465     else static if (LDC_with_SSE41)
1466     {
1467         return cast(__m128i) __builtin_ia32_phminposuw128(cast(short8)a);
1468     }
1469     else static if (LDC_with_ARM64)
1470     {
1471         __m128i indices = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
1472         __m128i combinedLo = _mm_unpacklo_epi16(indices, a);
1473         __m128i combinedHi = _mm_unpackhi_epi16(indices, a);
1474         __m128i best = _mm_min_epu32(combinedLo, combinedHi);
1475         best = _mm_min_epu32(best, _mm_srli_si128!8(best));
1476         best = _mm_min_epu32(best, _mm_srli_si128!4(best));
1477         short8 sbest = cast(short8)best;
1478         short8 r;
1479         r[0] = sbest[1];
1480         r[1] = sbest[0]; // Note: the search must have inverted index in order to prioritize lower index in case of tie
1481         r[2] = 0;
1482         r[3] = 0;
1483         r[4] = 0;
1484         r[5] = 0;
1485         r[6] = 0;
1486         r[7] = 0;
1487         return cast(__m128i)r;
1488     }
1489     else
1490     {
1491         short8 sa = cast(short8)a;
1492         ushort min = 0xffff;
1493         int index = 0;
1494         for(int n = 0; n < 8; ++n)
1495         {
1496             ushort c = sa.array[n];
1497             if (c < min)
1498             {
1499                 min = c;
1500                 index = n;
1501             }
1502         }
1503         short8 r;
1504         r.ptr[0] = min;
1505         r.ptr[1] = cast(short)index;
1506         return cast(__m128i)r;
1507     }
1508 }
1509 unittest
1510 {
1511     __m128i A = _mm_setr_epi16(14, 15, 1, 2, -3, 4, 5, 6);
1512     __m128i B = _mm_setr_epi16(14,  4, 4, 2, -3, 2, 5, 6);
1513     short8 R1 = cast(short8) _mm_minpos_epu16(A);
1514     short8 R2 = cast(short8) _mm_minpos_epu16(B);
1515     short[8] correct1 = [1, 2, 0, 0, 0, 0, 0, 0];
1516     short[8] correct2 = [2, 3, 0, 0, 0, 0, 0, 0];
1517     assert(R1.array == correct1);
1518     assert(R2.array == correct2);
1519 }
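
// A sketch of pulling the two packed results apart: lane 0 holds the minimum value and
// lane 1 its index (here read back with the SSE2 `_mm_extract_epi16`).
unittest
{
    __m128i A = _mm_setr_epi16(14, 15, 1, 2, -3, 4, 5, 6);
    __m128i R = _mm_minpos_epu16(A);
    assert(_mm_extract_epi16(R, 0) == 1); // minimum value
    assert(_mm_extract_epi16(R, 1) == 2); // index of that minimum
}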
1520 
1521 /// Compute the sum of absolute differences (SADs) of quadruplets of unsigned 8-bit integers 
1522 /// in `a` compared to those in `b`, and store the 16-bit results in dst. 
1523 /// Eight SADs are performed using one quadruplet from `b` and eight quadruplets from `a`. 
1524 /// One quadruplet is selected from `b` starting at on the offset specified in `imm8[1:0]`. 
1525 /// Eight quadruplets are formed from sequential 8-bit integers selected from `a` starting 
1526 /// at the offset specified in `imm8[2]`.
1527 __m128i _mm_mpsadbw_epu8(int imm8)(__m128i a, __m128i b) @trusted
1528 {
1529     // PERF DMD
1530     static if (GDC_with_SSE41)
1531     {
1532         return cast(__m128i) __builtin_ia32_mpsadbw128(cast(byte16)a, cast(byte16)b, cast(byte)imm8);
1533     }
1534     else static if (LDC_with_SSE41)
1535     {
1536         return cast(__m128i) __builtin_ia32_mpsadbw128(cast(byte16)a, cast(byte16)b, cast(byte)imm8);
1537     }
1538     else
1539     {
        int a_offset = ((imm8 & 4) >> 2) * 4; // Yes, the two high-order quadruplets are unaddressable...
1541         int b_offset = (imm8 & 3) * 4;
1542 
1543         byte16 ba = cast(byte16)a;
1544         byte16 bb = cast(byte16)b;
1545         short8 r;
1546 
1547         __m128i comp_b = _mm_setr_epi32(b.array[imm8 & 3], 0, b.array[imm8 & 3], 0);
1548 
1549         for (int j = 0; j < 8; j += 2)
1550         {
1551             int k = a_offset + j;
1552             __m128i comp_a = _mm_setr_epi8(ba[k+0], ba[k+1], ba[k+2], ba[k+3],
1553                                            0, 0, 0, 0, 
1554                                            ba[k+1], ba[k+2], ba[k+3], ba[k+4],
1555                                            0, 0, 0, 0);
1556             short8 diffs = cast(short8) _mm_sad_epu8(comp_a, comp_b); // reusing this wins instructions in both x86 and arm64
1557             r.ptr[j] = diffs.array[0];
1558             r.ptr[j+1] = diffs.array[4];
1559         }
1560         return cast(__m128i)r;
1561     }
1562 }
1563 unittest
1564 {
1565     __m128i A = _mm_setr_epi8(0, 1, 2, 3,  4,  5, 6,  7, 8, 9, 10, 11, 12, 13, 14, 15);
1566     __m128i B = _mm_setr_epi8(9, 1, 2, 3, -1, -1, 0, -1, 5, 5,  5,  5, 12, 13, 14, 15);
1567     short[8] correct0 = [9, 11, 13, 15, 17, 19, 21, 23];
1568     short[8] correct1 = [763, 761, 759, 757, 755, 753, 751, 749];
1569     short[8] correct4 = [17, 19, 21, 23, 25, 27, 31, 35];
1570     short[8] correct5 = [755, 753, 751, 749, 747, 745, 743, 741];
1571     short[8] correct7 = [32, 28, 24, 20, 16, 12, 8, 4];
1572     short8 r1 = cast(short8) _mm_mpsadbw_epu8!1(A, B);
1573     short8 r4 = cast(short8) _mm_mpsadbw_epu8!4(A, B);
1574     short8 r5 = cast(short8) _mm_mpsadbw_epu8!5(A, B);
1575     short8 r7 = cast(short8) _mm_mpsadbw_epu8!7(A, B);
1576     short8 r8 = cast(short8) _mm_mpsadbw_epu8!8(A, B);
1577     assert(r1.array == correct1);
1578     assert(r4.array == correct4);
1579     assert(r5.array == correct5);
1580     assert(r7.array == correct7);
1581     assert(r8.array == correct0);
1582 }
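
// Worked example for the quadruplet selection described above (a minimal sketch,
// assuming imm8 == 0 so that both offsets are 0): result[0] is then the SAD of
// the first quadruplet of `a` against the first quadruplet of `b`.
unittest
{
    __m128i A = _mm_setr_epi8(0, 1, 2, 3,  4,  5, 6,  7, 8, 9, 10, 11, 12, 13, 14, 15);
    __m128i B = _mm_setr_epi8(9, 1, 2, 3, -1, -1, 0, -1, 5, 5,  5,  5, 12, 13, 14, 15);
    short8 r0 = cast(short8) _mm_mpsadbw_epu8!0(A, B);
    assert(r0.array[0] == 9); // |0-9| + |1-1| + |2-2| + |3-3|
}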
1583 
1584 /// Multiply the low signed 32-bit integers from each packed 64-bit element in a and b, and store the signed 64-bit results in dst.
1585 __m128i _mm_mul_epi32 (__m128i a, __m128i b) @trusted
1586 {
1587     // PERF DMD
1588     static if (GDC_with_SSE41)
1589     {
1590         return cast(__m128i) __builtin_ia32_pmuldq128(cast(int4)a, cast(int4)b);
1591     }
1592     else static if (LDC_with_SSE41)
1593     {
1594         // For some reason, clang has the builtin but it's not in IntrinsicsX86.td
1595         // Use IR instead.
1596         // This generates pmuldq since LDC 1.2.0 -O0
1597         enum ir = `
1598             %ia = shufflevector <4 x i32> %0,<4 x i32> %0, <2 x i32> <i32 0, i32 2>
1599             %ib = shufflevector <4 x i32> %1,<4 x i32> %1, <2 x i32> <i32 0, i32 2>
1600             %la = sext <2 x i32> %ia to <2 x i64>
1601             %lb = sext <2 x i32> %ib to <2 x i64>
1602             %r = mul <2 x i64> %la, %lb
1603             ret <2 x i64> %r`;
1604         return cast(__m128i) LDCInlineIR!(ir, long2, int4, int4)(cast(int4)a, cast(int4)b);
1605     }
1606     else static if (LDC_with_ARM64)  
1607     {
1608         // 3 instructions since LDC 1.8 -O2
1609         // But had to make vmull_s32 be a builtin else it wouldn't optimize to smull
1610         int2 a_lo = vmovn_s64(cast(long2)a);
1611         int2 b_lo = vmovn_s64(cast(long2)b);
1612         return cast(__m128i) vmull_s32(a_lo, b_lo);
1613     }
1614     else
1615     {
1616         int4 ia = cast(int4)a;
1617         int4 ib = cast(int4)b;
1618         long2 r;
1619         r.ptr[0] = cast(long)ia.array[0] * ib.array[0];
1620         r.ptr[1] = cast(long)ia.array[2] * ib.array[2];
1621         return cast(__m128i)r;
1622     }
1623 }
1624 unittest
1625 {
1626     __m128i A = _mm_setr_epi32(61616461, 1915324654, 4564061, 3);
1627     __m128i B = _mm_setr_epi32(49716422, -915616216, -121144, 0);
1628     long2 R = cast(long2) _mm_mul_epi32(A, B);
1629     long[2] correct = [cast(long)61616461 * 49716422, cast(long)4564061 * -121144];
1630     assert(R.array == correct);
1631 }
1632 
1633 /// Multiply the packed 32-bit integers in `a` and `b`, producing intermediate 64-bit integers, 
1634 /// and return the low 32 bits of the intermediate integers.
1635 __m128i _mm_mullo_epi32 (__m128i a, __m128i b) @trusted
1636 {
1637     // PERF DMD
1638     // PERF GDC without SSE4.1 could be better
1639     static if (GDC_with_SSE41)
1640     {
1641         int4 ia = cast(int4)a;
1642         int4 ib = cast(int4)b;
1643         // Note: older GDC doesn't support this vector multiplication, but older GDC
1644         // also has no support for -msse4.1 detection, so this branch is never taken there.
1645         return cast(__m128i)(ia * ib);
1646     }
1647     else version(LDC)
1648     {
1649         int4 ia = cast(int4)a;
1650         int4 ib = cast(int4)b;
1651         return cast(__m128i)(ia * ib);
1652     }
1653     else
1654     {
1655         // DMD doesn't accept the vector multiplication above
1656         int4 ia = cast(int4)a;
1657         int4 ib = cast(int4)b;
1658         int4 r;
1659         r.ptr[0] = ia.array[0] * ib.array[0];
1660         r.ptr[1] = ia.array[1] * ib.array[1];
1661         r.ptr[2] = ia.array[2] * ib.array[2];
1662         r.ptr[3] = ia.array[3] * ib.array[3];
1663         return r;
1664     }
1665 }
1666 unittest
1667 {
1668     __m128i A = _mm_setr_epi32(61616461, 1915324654, 4564061, 3);
1669     __m128i B = _mm_setr_epi32(49716422, -915616216, -121144, 0);
1670     int4 R = cast(int4) _mm_mullo_epi32(A, B);
1671     int[4] correct = [cast(int)0xBF370D8E, cast(int)(1915324654 * -915616216), cast(int)(4564061 * -121144), 0];
1672     assert(R.array == correct);
1673 }
1674 
1675 
1676 /// Convert packed signed 32-bit integers from `a` and `b` 
1677 /// to packed 16-bit integers using unsigned saturation.
1678 __m128i _mm_packus_epi32 (__m128i a, __m128i b) @trusted
1679 {
1680     static if (GDC_with_SSE41)
1681     {
1682         // PERF For some reason this doesn't generate the builtin???
1683         return cast(__m128i) __builtin_ia32_packusdw128(cast(short8)a, cast(short8)b);
1684     }
1685     else static if (LDC_with_SSE41)
1686     {
1687         return cast(__m128i) __builtin_ia32_packusdw128(cast(short8)a, cast(short8)b);
1688     }
1689     else static if (LDC_with_ARM64)
1690     {
1691        int4 z;
1692        z = 0;       
1693        return cast(__m128i) vcombine_u16(vqmovn_u32(vmaxq_s32(z, cast(int4)a)),
1694                                          vqmovn_u32(vmaxq_s32(z, cast(int4)b)));
1695     }
1696     else
1697     {
1698         // PERF: not great without SSE4.1
1699         int4 sa = cast(int4)a;
1700         int4 sb = cast(int4)b;
1701         ushort[8] result;
1702         for (int i = 0; i < 4; ++i)
1703         {
1704             int s = sa.array[i];
1705             if (s < 0) s = 0;
1706             if (s > 65535) s = 65535;
1707             result.ptr[i] = cast(ushort)s;
1708 
1709             s = sb.array[i];
1710             if (s < 0) s = 0;
1711             if (s > 65535) s = 65535;
1712             result.ptr[i+4] = cast(ushort)s;
1713         }
1714         return cast(__m128i) loadUnaligned!(short8)(cast(short*)result.ptr);
1715     }
1716 }
1717 unittest
1718 {
1719     __m128i A = _mm_setr_epi32(100000, -100000, 1000, 0);
1720     short8 R = cast(short8) _mm_packus_epi32(A, A);
1721     short[8] correct = [cast(short)65535, 0, 1000, 0, cast(short)65535, 0, 1000, 0];
1722     assert(R.array == correct);
1723 }
1724 
1725 
1726 /// Round the packed double-precision (64-bit) floating-point elements in `a` using the 
1727 /// rounding parameter, and store the results as packed double-precision floating-point elements.
1728 /// Rounding is done according to the rounding[3:0] parameter, which can be one of:
1729 ///    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
1730 ///    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
1731 ///    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
1732 ///    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
1733 ///    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
1734 __m128d _mm_round_pd(int rounding)(__m128d a) @trusted
1735 {
1736     // PERF DMD
1737     static if (GDC_with_SSE41)
1738     {
1739         return __builtin_ia32_roundpd(a, rounding);
1740     }
1741     else static if (LDC_with_SSE41)
1742     {
1743         return __builtin_ia32_roundpd(a, rounding);
1744     }
1745     else
1746     {
1747         static if (rounding & _MM_FROUND_CUR_DIRECTION)
1748         {
1749             // Convert to 64-bit integers
1750             long lo = _mm_cvtsd_si64(a);
1751             a.ptr[0] = a.array[1];
1752             long hi = _mm_cvtsd_si64(a);
1753             return _mm_setr_pd(lo, hi);
1754         }
1755         else
1756         {
1757             version(GNU) pragma(inline, false); // else unittests fail with optimizations
1758 
1759             uint old = _MM_GET_ROUNDING_MODE();
1760             _MM_SET_ROUNDING_MODE((rounding & 3) << 13);
1761             
1762             // Convert to 64-bit integers
1763             long lo = _mm_cvtsd_si64(a);
1764             a.ptr[0] = a.array[1];
1765             long hi = _mm_cvtsd_si64(a);
1766 
1767             // Convert back to double to achieve the rounding
1768             // The problem is that a 64-bit double can't represent all the values
1769             // a 64-bit integer can (and vice-versa). So this only works for values
1770             // whose magnitude is below 2^63 (inputs >= 2^52 are already integral anyway).
1771             _MM_SET_ROUNDING_MODE(old);
1772             return _mm_setr_pd(lo, hi);
1773         }
1774     }
1775 }
1776 unittest
1777 {
1778     // tested in other intrinsics
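    // Additionally, a minimal floor/ceil sketch, assuming the SSE4.1 rounding
    // semantics documented above:
    __m128d A = _mm_setr_pd(1.3, -2.5);
    __m128d lo = _mm_round_pd!(_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC)(A);
    __m128d hi = _mm_round_pd!(_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC)(A);
    assert(lo.array[0] == 1.0 && lo.array[1] == -3.0);
    assert(hi.array[0] == 2.0 && hi.array[1] == -2.0);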
1779 }
1780 
1781 /// Round the packed single-precision (32-bit) floating-point elements in `a` using the 
1782 /// rounding parameter, and store the results as packed single-precision floating-point elements.
1783 /// Rounding is done according to the rounding[3:0] parameter, which can be one of:
1784 ///    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
1785 ///    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
1786 ///    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
1787 ///    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
1788 ///    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
1789 __m128 _mm_round_ps(int rounding)(__m128 a) @trusted
1790 {
1791     static if (GDC_with_SSE41)
1792     {
1793         return __builtin_ia32_roundps(a, rounding);
1794     }
1795     else static if (LDC_with_SSE41)
1796     {
1797         return __builtin_ia32_roundps(a, rounding);
1798     }
1799     else
1800     {
1801         static if (rounding & _MM_FROUND_CUR_DIRECTION)
1802         {
1803             __m128i integers = _mm_cvtps_epi32(a);
1804             return _mm_cvtepi32_ps(integers);
1805         }
1806         else
1807         {
1808             version(LDC) pragma(inline, false); // else _MM_SET_ROUNDING_MODE and _mm_cvtps_epi32 get reordered
1809             uint old = _MM_GET_ROUNDING_MODE();
1810             _MM_SET_ROUNDING_MODE((rounding & 3) << 13);
1811             scope(exit) _MM_SET_ROUNDING_MODE(old);
1812 
1813             // Convert to 32-bit integers
1814             __m128i integers = _mm_cvtps_epi32(a);
1815 
1816             // Convert back to float to achieve the rounding
1817             // The problem is that a 32-bit float can't represent all the values
1818             // a 32-bit integer can (and vice-versa). So this only works for values
1819             // whose magnitude is below 2^31 (inputs >= 2^23 are already integral anyway).
1820             __m128 result = _mm_cvtepi32_ps(integers);
1821 
1822             return result;
1823         }
1824     }
1825 }
1826 unittest
1827 {
1828     // tested in other intrinsics
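    // Additionally, a minimal truncation sketch, assuming the SSE4.1 rounding
    // semantics documented above:
    __m128 A = _mm_setr_ps(1.7f, -1.7f, 2.9f, -2.9f);
    __m128 T = _mm_round_ps!(_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC)(A);
    float[4] correct = [1.0f, -1.0f, 2.0f, -2.0f];
    assert(T.array == correct);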
1829 }
1830 
1831 
1832 /// Round the lower double-precision (64-bit) floating-point element in `b` using the
1833 /// rounding parameter, store the result as a double-precision floating-point element 
1834 /// in the lower element of result, and copy the upper element from `a` to the upper element of result.
1835 /// Rounding is done according to the rounding[3:0] parameter, which can be one of:
1836 ///    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
1837 ///    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
1838 ///    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
1839 ///    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
1840 ///    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
1841 __m128d _mm_round_sd(int rounding)(__m128d a, __m128d b) @trusted
1842 {
1843     static if (GDC_with_SSE41)
1844     {
1845         return __builtin_ia32_roundsd(a, b, rounding);
1846     }
1847     else static if (LDC_with_SSE41)
1848     {
1849         return __builtin_ia32_roundsd(a, b, rounding);
1850     }
1851     else
1852     {
1853         static if (rounding & _MM_FROUND_CUR_DIRECTION)
1854         {
1855             // Convert to 64-bit integer
1856             long b0 = _mm_cvtsd_si64(b);
1857             a.ptr[0] = b0;
1858             return a;
1859         }
1860         else
1861         {
1862             version(GNU) pragma(inline, false); // else unittests fail with optimizations
1863 
1864             uint old = _MM_GET_ROUNDING_MODE();
1865             _MM_SET_ROUNDING_MODE((rounding & 3) << 13);
1866             
1867             // Convert to 64-bit integer
1868             long b0 = _mm_cvtsd_si64(b);
1869             a.ptr[0] = b0;
1870 
1871             // Convert back to double to achieve the rounding
1872             // The problem is that a 64-bit double can't represent all the values
1873             // a 64-bit integer can (and vice-versa). So this only works for values
1874             // whose magnitude is below 2^63 (inputs >= 2^52 are already integral anyway).
1875             _MM_SET_ROUNDING_MODE(old);
1876             return a;
1877         }
1878     }
1879 }
1880 unittest
1881 {
1882     // tested in other intrinsics
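    // Additionally, a minimal sketch, assuming the semantics documented above:
    // only the low lane of `b` is rounded, the high lane comes from `a`.
    __m128d A = _mm_setr_pd(42.0, -42.0);
    __m128d B = _mm_setr_pd(2.7, 99.0);
    __m128d R = _mm_round_sd!(_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC)(A, B);
    assert(R.array[0] == 2.0 && R.array[1] == -42.0);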
1883 }
1884 
1885 
1886 /// Round the lower single-precision (32-bit) floating-point element in `b` using the 
1887 /// rounding parameter, store the result as a single-precision floating-point element 
1888 /// in the lower element of result, and copy the upper 3 packed elements from `a`
1889 /// to the upper elements of result.
1890 /// Rounding is done according to the rounding[3:0] parameter, which can be one of:
1891 ///    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
1892 ///    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
1893 ///    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
1894 ///    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
1895 ///    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
1896 __m128 _mm_round_ss(int rounding)(__m128 a, __m128 b) @trusted
1897 {
1898     static if (GDC_with_SSE41)
1899     {
1900         return __builtin_ia32_roundss(a, b, rounding);
1901     }
1902     else static if (LDC_with_SSE41)
1903     {
1904         return __builtin_ia32_roundss(a, b, rounding);
1905     }
1906     else
1907     {
1908         static if (rounding & _MM_FROUND_CUR_DIRECTION)
1909         {
1910             int b0 = _mm_cvtss_si32(b);
1911             a.ptr[0] = b0;   
1912             return a;
1913         }
1914         else version(GNU)
1915         {
1916             pragma(inline, false)
1917             __m128 GDCworkaround() nothrow @nogc @trusted 
1918             {
1919                 uint old = _MM_GET_ROUNDING_MODE();
1920                 _MM_SET_ROUNDING_MODE((rounding & 3) << 13);
1921 
1922                 // Convert to 32-bit integer
1923                 int b0 = _mm_cvtss_si32(b);
1924                 a.ptr[0] = b0;       
1925 
1926                 // Convert back to float to achieve the rounding
1927                 // The problem is that a 32-bit float can't represent all the values
1928                 // a 32-bit integer can (and vice-versa). So this only works for values
1929                 // whose magnitude is below 2^31 (inputs >= 2^23 are already integral anyway).
1930                 _MM_SET_ROUNDING_MODE(old);
1931                 return a;
1932             }
1933             return GDCworkaround();
1934         }
1935         else
1936         {
1937             uint old = _MM_GET_ROUNDING_MODE();
1938             _MM_SET_ROUNDING_MODE((rounding & 3) << 13);
1939 
1940             // Convert to 32-bit integer
1941             int b0 = _mm_cvtss_si32(b);
1942             a.ptr[0] = b0;       
1943 
1944             // Convert back to float to achieve the rounding
1945             // The problem is that a 32-bit float can't represent all the values
1946             // a 32-bit integer can (and vice-versa). So this only works for values
1947             // whose magnitude is below 2^31 (inputs >= 2^23 are already integral anyway).
1948             _MM_SET_ROUNDING_MODE(old);
1949             return a;
1950         }
1951     }
1952 }
1953 unittest
1954 {
1955     // tested in other intrinsics
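    // Additionally, a minimal sketch, assuming the semantics documented above:
    // only the low lane of `b` is rounded, the upper lanes come from `a`.
    __m128 A = _mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f);
    __m128 B = _mm_setr_ps(-2.3f, 8.0f, 8.0f, 8.0f);
    __m128 R = _mm_round_ss!(_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC)(A, B);
    float[4] correct = [-2.0f, 2.0f, 3.0f, 4.0f];
    assert(R.array == correct);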
1956 }
1957 
1958 
1959 /// Load 128-bits of integer data from memory using a non-temporal memory hint. 
1960 /// `mem_addr` must be aligned on a 16-byte boundary or a general-protection 
1961 /// exception may be generated.
1962 __m128i _mm_stream_load_si128 (__m128i * mem_addr) @trusted
1963 {
1964     // BUG see `_mm_stream_ps` for an explanation why we don't implement non-temporal moves
1965     return *mem_addr; // it's a regular move instead
1966 }
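
// A minimal usage sketch: the source must be 16-byte aligned (here a __m128i
// variable, which is naturally aligned), even though this implementation
// currently performs a regular load.
unittest
{
    __m128i A = _mm_setr_epi32(1, 2, 3, 4);
    __m128i R = _mm_stream_load_si128(&A);
    int[4] correct = [1, 2, 3, 4];
    assert(R.array == correct);
}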
1967 
1968 
1969 /// Return 1 if all bits in `a` are 1's, else return 0.
1970 int _mm_test_all_ones (__m128i a) @safe
1971 {
1972     return _mm_testc_si128(a, _mm_set1_epi32(-1));
1973 }
1974 unittest
1975 {
1976     __m128i A = _mm_set1_epi32(-1);
1977     __m128i B = _mm_set_epi32(-1, -2, -1, -1);
1978     assert(_mm_test_all_ones(A) == 1);
1979     assert(_mm_test_all_ones(B) == 0);
1980 }
1981 
1982 /// Return 1 if all bits in `a` are 0's, else return 0.
1983 // This is a #BONUS intrinsic, since it is missing from the Intel Intrinsics API.
1984 int _mm_test_all_zeros (__m128i a) @safe
1985 {
1986     return _mm_testz_si128(a, _mm_set1_epi32(-1));
1987 }
1988 unittest
1989 {
1990     __m128i A = _mm_set1_epi32(0);
1991     __m128i B = _mm_set_epi32(0, 8, 0, 0);
1992     assert(_mm_test_all_zeros(A) == 1);
1993     assert(_mm_test_all_zeros(B) == 0);
1994 }
1995 
1996 /// Compute the bitwise AND of 128 bits (representing integer data) in `a` and `mask`, 
1997 /// and return 1 if the result is zero, otherwise return 0.
1998 int _mm_test_all_zeros (__m128i a, __m128i mask) @safe
1999 {
2000     return _mm_testz_si128(a, mask); // it's really the same, but with a good name
2001 }
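unittest
{
    // A minimal sketch: only the bits selected by `mask` are tested.
    __m128i A    = _mm_setr_epi32(0x10, 0, 0, 0);
    __m128i mask = _mm_setr_epi32(0x0f, -1, -1, -1);
    assert(_mm_test_all_zeros(A, mask) == 1);
    assert(_mm_test_all_zeros(A, A) == 0);
}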
2002 
2003 /// Compute the bitwise AND of 128 bits (representing integer data) in `a` and `mask`; return 1 if the result is neither zero nor equal to `mask` (i.e. `mask` selects both 0 and 1 bits of `a`), otherwise return 0.
2004 int _mm_test_mix_ones_zeros (__m128i a, __m128i mask) @trusted
2005 {
2006     return _mm_testnzc_si128(a, mask);
2007 }
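unittest
{
    // A minimal sketch: returns 1 only when `mask` selects both 0 and 1 bits of `a`.
    __m128i A = _mm_setr_epi32(0xf0, 0, 0, 0);
    assert(_mm_test_mix_ones_zeros(A, _mm_setr_epi32(0xff, 0, 0, 0)) == 1);
    assert(_mm_test_mix_ones_zeros(A, _mm_setr_epi32(0xf0, 0, 0, 0)) == 0); // only 1-bits of A selected
    assert(_mm_test_mix_ones_zeros(A, _mm_setr_epi32(0x0f, 0, 0, 0)) == 0); // only 0-bits of A selected
}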
2008 
2009 /// Compute the bitwise NOT of `a` and then AND with `b`, and return 1 if the 
2010 /// result is zero, otherwise return 0.
2011 /// In other words, test if all bits masked by `b` are 1 in `a`.
2012 int _mm_testc_si128 (__m128i a, __m128i b) @trusted
2013 {
2014     // PERF DMD
2015     static if (GDC_with_SSE41)
2016     {
2017         return __builtin_ia32_ptestc128(cast(long2)a, cast(long2)b);
2018     }
2019     else static if (LDC_with_SSE41)
2020     {
2021         return __builtin_ia32_ptestc128(cast(long2)a, cast(long2)b);
2022     }
2023     else static if (LDC_with_ARM64)
2024     {
2025         // Acceptable since LDC 1.8 -O2
2026         long2 s64 = vbicq_s64(cast(long2)b, cast(long2)a);
2027         return !(vgetq_lane_s64(s64, 0) | vgetq_lane_s64(s64, 1));
2028     }
2029     else
2030     {
2031         __m128i c = ~a & b;
2032         int[4] zero = [0, 0, 0, 0];
2033         return c.array == zero;
2034     }
2035 }
2036 unittest
2037 {
2038     __m128i A  = _mm_setr_epi32(0x01, 0x02, 0x04, 0xf8);
2039     __m128i M1 = _mm_setr_epi32(0xfe, 0xfd, 0x00, 0x00);
2040     __m128i M2 = _mm_setr_epi32(0x00, 0x00, 0x04, 0x00);
2041     assert(_mm_testc_si128(A, A) == 1);
2042     assert(_mm_testc_si128(A, M1) == 0);
2043     assert(_mm_testc_si128(A, M2) == 1);
2044 }
2045 
2046 /// Compute the bitwise AND of 128 bits (representing integer data) in `a` and `b`, 
2047 /// and set ZF to 1 if the result is zero, otherwise set ZF to 0. 
2048 /// Compute the bitwise NOT of `a` and then AND with `b`, and set CF to 1 if the 
2049 /// result is zero, otherwise set CF to 0. 
2050 /// Return 1 if both the ZF and CF values are zero, otherwise return 0.
2051 int _mm_testnzc_si128 (__m128i a, __m128i b) @trusted
2052 {
2053     // PERF DMD
2054     static if (GDC_with_SSE41)
2055     {
2056         return __builtin_ia32_ptestnzc128(cast(long2)a, cast(long2)b);
2057     }
2058     else static if (LDC_with_SSE41)
2059     {
2060         return __builtin_ia32_ptestnzc128(cast(long2)a, cast(long2)b);
2061     }
2062     else static if (LDC_with_ARM64)
2063     {
2064         long2 s640 = vandq_s64(cast(long2)b, cast(long2)a);
2065         long2 s641 = vbicq_s64(cast(long2)b, cast(long2)a);
2066 
2067         return !( !(vgetq_lane_s64(s641, 0) | vgetq_lane_s64(s641, 1))
2068                 | !(vgetq_lane_s64(s640, 0) | vgetq_lane_s64(s640, 1)) );
2069     }
2070     else
2071     {
2072         __m128i c = a & b;
2073         __m128i d = ~a & b;
2074         int[4] zero = [0, 0, 0, 0];
2075         return !( (c.array == zero) || (d.array == zero));
2076     }    
2077 }
2078 unittest
2079 {
2080     __m128i A  = _mm_setr_epi32(0x01, 0x02, 0x04, 0xf8);
2081     __m128i M  = _mm_setr_epi32(0x01, 0x40, 0x00, 0x00);
2082     __m128i Z = _mm_setzero_si128();
2083     assert(_mm_testnzc_si128(A, Z) == 0);
2084     assert(_mm_testnzc_si128(A, M) == 1);
2085     assert(_mm_testnzc_si128(A, A) == 0);
2086 }
2087 
2088 /// Compute the bitwise AND of 128 bits (representing integer data) in `a` and `b`, 
2089 /// and return 1 if the result is zero, otherwise return 0.
2090 /// In other words, test if all bits masked by `b` are 0 in `a`.
2091 int _mm_testz_si128 (__m128i a, __m128i b) @trusted
2092 {
2093     // PERF DMD
2094     static if (GDC_with_SSE41)
2095     {
2096         return __builtin_ia32_ptestz128(cast(long2)a, cast(long2)b);
2097     }
2098     else static if (LDC_with_SSE41)
2099     {
2100         return __builtin_ia32_ptestz128(cast(long2)a, cast(long2)b);
2101     }
2102     else static if (LDC_with_ARM64)
2103     {
2104         // Acceptable since LDC 1.8 -O2
2105         long2 s64 = vandq_s64(cast(long2)a, cast(long2)b);
2106         return !(vgetq_lane_s64(s64, 0) | vgetq_lane_s64(s64, 1));
2107     }
2108     else 
2109     {
2110         __m128i c = a & b;
2111         int[4] zero = [0, 0, 0, 0];
2112         return c.array == zero;
2113     }    
2114 }
2115 unittest
2116 {
2117     __m128i A  = _mm_setr_epi32(0x01, 0x02, 0x04, 0xf8);
2118     __m128i M1 = _mm_setr_epi32(0xfe, 0xfd, 0x00, 0x07);
2119     __m128i M2 = _mm_setr_epi32(0x00, 0x00, 0x04, 0x00);
2120     assert(_mm_testz_si128(A, A) == 0);
2121     assert(_mm_testz_si128(A, M1) == 1);
2122     assert(_mm_testz_si128(A, M2) == 0);
2123 }
2124