1 /**
2 * SSE4.1 intrinsics.
3 * https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#techs=SSE4_1
4 *
5 * Copyright: Guillaume Piolat 2021.
6 *            Johan Engelen 2021.
7 * License:   $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
8 */
9 module inteli.smmintrin;
10 
11 // SSE4.1 instructions
12 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#techs=SSE4_1
13 // Note: this header will work whether you have SSE4.1 enabled or not.
14 // With LDC, use "dflags-ldc": ["-mattr=+sse4.1"] or equivalent to actively
15 // generate SSE4.1 instructions.
16 // With GDC, use "dflags-gdc": ["-msse4.1"] or equivalent to generate SSE4.1 instructions.
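//
// For example, with dub, a configuration fragment could look like the following
// (illustrative only; adapt the keys/flags to your build system):
//
//     "dflags-ldc": ["-mattr=+sse4.1"],
//     "dflags-gdc": ["-msse4.1"],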
17 
18 public import inteli.types;
19 import inteli.internals;
20 
21 // smmintrin pulls in all previous instruction set intrinsics.
22 public import inteli.tmmintrin;
23 
24 nothrow @nogc:
25 
26 enum int _MM_FROUND_TO_NEAREST_INT = 0x00; /// SSE4.1 rounding modes
27 enum int _MM_FROUND_TO_NEG_INF     = 0x01; /// ditto
28 enum int _MM_FROUND_TO_POS_INF     = 0x02; /// ditto
29 enum int _MM_FROUND_TO_ZERO        = 0x03; /// ditto
30 enum int _MM_FROUND_CUR_DIRECTION  = 0x04; /// ditto
31 enum int _MM_FROUND_RAISE_EXC      = 0x00; /// ditto
32 enum int _MM_FROUND_NO_EXC         = 0x08; /// ditto
33 
34 enum int _MM_FROUND_NINT      = (_MM_FROUND_RAISE_EXC | _MM_FROUND_TO_NEAREST_INT);
35 enum int _MM_FROUND_FLOOR     = (_MM_FROUND_RAISE_EXC | _MM_FROUND_TO_NEG_INF);
36 enum int _MM_FROUND_CEIL      = (_MM_FROUND_RAISE_EXC | _MM_FROUND_TO_POS_INF);
37 enum int _MM_FROUND_TRUNC     = (_MM_FROUND_RAISE_EXC | _MM_FROUND_TO_ZERO);
38 enum int _MM_FROUND_RINT      = (_MM_FROUND_RAISE_EXC | _MM_FROUND_CUR_DIRECTION);
39 enum int _MM_FROUND_NEARBYINT = (_MM_FROUND_NO_EXC    | _MM_FROUND_CUR_DIRECTION);
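
// The composed constants above are meant to be passed to the _mm_round_* intrinsics of this
// module. A minimal usage sketch (illustrative values, not part of the original test suite):
unittest
{
    __m128 v = _mm_setr_ps(1.25f, -1.25f, 2.5f, -2.5f);
    float4 up   = cast(float4) _mm_round_ps!(_MM_FROUND_CEIL)(v);  // round toward +inf
    float4 down = cast(float4) _mm_round_ps!(_MM_FROUND_FLOOR)(v); // round toward -inf
    float[4] correctUp   = [2.0f, -1.0f, 3.0f, -2.0f];
    float[4] correctDown = [1.0f, -2.0f, 2.0f, -3.0f];
    assert(up.array == correctUp);
    assert(down.array == correctDown);
}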
40 
41 /// Blend packed 16-bit integers from `a` and `b` using control mask `imm8`, and store the results.
42 // Note: changed signature, GDC needs a compile-time value for imm8.
43 __m128i _mm_blend_epi16(int imm8)(__m128i a, __m128i b) @trusted
44 {
45     // PERF DMD
46     static if (GDC_with_SSE41)
47     {
48         return cast(__m128i) __builtin_ia32_pblendw128(cast(short8)a, cast(short8)b, imm8);
49     }
50     else 
51     {
52         // LDC x86 This generates pblendw since LDC 1.1 and -O2
53         short8 r;
54         short8 sa = cast(short8)a;
55         short8 sb = cast(short8)b;
56         for (int n = 0; n < 8; ++n)
57         {
58             r.ptr[n] = (imm8 & (1 << n)) ? sb.array[n] : sa.array[n];
59         }
60         return cast(__m128i)r;
61     }
62 }
63 unittest
64 {
65     __m128i A = _mm_setr_epi16(0, 1,  2,  3,  4,  5,  6,  7);
66     __m128i B = _mm_setr_epi16(8, 9, 10, 11, 12, 13, 14, 15);
67     short8 C = cast(short8) _mm_blend_epi16!147(A, B); // 10010011
68     short[8] correct =        [8, 9,  2,  3, 12,  5,  6, 15];
69     assert(C.array == correct);
70 }
71 
72 
73 /// Blend packed double-precision (64-bit) floating-point elements from `a` and `b` using control mask `imm8`.
74 // Note: changed signature, GDC needs a compile-time value for `imm8`.
75 __m128d _mm_blend_pd(int imm8)(__m128d a, __m128d b) @trusted
76 {
77     static assert(imm8 >= 0 && imm8 < 4);
78     // PERF DMD
79     static if (GDC_with_SSE41)
80     {
81         return cast(double2) __builtin_ia32_blendpd(cast(double2)a, cast(double2)b, imm8);
82     }
83     else
84     {
        // LDC x86: blendpd since LDC 1.1 -O2, uses blendps after LDC 1.12
86         double2 r;
87         for (int n = 0; n < 2; ++n)
88         {
89             r.ptr[n] = (imm8 & (1 << n)) ? b.array[n] : a.array[n];
90         }
91         return cast(__m128d)r;
92     }
93 }
94 unittest
95 {
96     __m128d A = _mm_setr_pd(0, 1);
97     __m128d B = _mm_setr_pd(8, 9);
98     double2 C = _mm_blend_pd!2(A, B);
99     double[2] correct =    [0, 9];
100     assert(C.array == correct);
101 }
102 
103 
104 /// Blend packed single-precision (32-bit) floating-point elements from `a` and `b` using control mask `imm8`.
105 // Note: changed signature, GDC needs a compile-time value for imm8.
106 __m128 _mm_blend_ps(int imm8)(__m128 a, __m128 b) @trusted
107 {
108     // PERF DMD
109     static assert(imm8 >= 0 && imm8 < 16);
110     static if (GDC_with_SSE41)
111     {
112         return __builtin_ia32_blendps(a, b, imm8);
113     }
114     else version(LDC)
115     {
116         // LDC x86: generates blendps since LDC 1.1 -O2
117         //   arm64: pretty good, two instructions worst case
118         return shufflevector!(float4, (imm8 & 1) ? 4 : 0,
119                                       (imm8 & 2) ? 5 : 1,
120                                       (imm8 & 4) ? 6 : 2,
121                                       (imm8 & 8) ? 7 : 3)(a, b);
122     }
123     else
124     {
125         __m128 r; // PERF =void;
126         for (int n = 0; n < 4; ++n)
127         {
128             r.ptr[n] = (imm8 & (1 << n)) ? b.array[n] : a.array[n];
129         }
130         return r;
131     }
132 }
133 unittest
134 {
135     __m128 A = _mm_setr_ps(0, 1,  2,  3);
136     __m128 B = _mm_setr_ps(8, 9, 10, 11);
137     float4 C = cast(float4) _mm_blend_ps!13(A, B); // 1101
138     float[4] correct =    [8, 1, 10, 11];
139     assert(C.array == correct);
140 }
141 
142 /// Blend packed 8-bit integers from `a` and `b` using `mask`.
143 __m128i _mm_blendv_epi8 (__m128i a, __m128i b, __m128i mask) @trusted
144 {
145     // PERF DMD
146     // TODO BUG GDC version
147     static if (GDC_with_SSE41)
148     {
149         return cast(__m128i) __builtin_ia32_pblendvb(cast(byte16)a, cast(byte16)b, cast(byte16)mask);
150     }
151     else static if (LDC_with_SSE41)
152     {
153         return cast(__m128i) __builtin_ia32_pblendvb(cast(byte16)a, cast(byte16)b, cast(byte16)mask);
154     }
155     else static if (LDC_with_ARM64)
156     {
157         // LDC arm64: two instructions since LDC 1.12 -O2
158         byte16 maskSX = vshrq_n_s8(cast(byte16)mask, 7);
159         return cast(__m128i) vbslq_s8(maskSX, cast(byte16)b, cast(byte16)a);
160     }
161     else
162     {
163         __m128i m = _mm_cmpgt_epi8(_mm_setzero_si128(), mask);
164         return _mm_xor_si128(_mm_subs_epu8(_mm_xor_si128(a, b), m), b);
165     }
166 }
167 unittest
168 {
169     __m128i A = _mm_setr_epi8( 0,  1,  2,  3,  4,  5,  6,  7,  
170                                8,  9, 10, 11, 12, 13, 14, 15);
171     __m128i B = _mm_setr_epi8(16, 17, 18, 19, 20, 21, 22, 23, 
172                               24, 25, 26, 27, 28, 29, 30, 31);
173     __m128i M = _mm_setr_epi8( 1, -1,  1,  1, -4,  1, -8,  127,  
174                                1,  1, -1, -1,  4,  1,  8, -128);
175     byte16 R = cast(byte16) _mm_blendv_epi8(A, B, M);
176     byte[16] correct =      [  0, 17,  2,  3, 20,  5, 22,  7,
177                                8,  9, 26, 27, 12, 13, 14, 31 ];
178     assert(R.array == correct);
179 }
180 
181 
182 /// Blend packed double-precision (64-bit) floating-point elements from `a` and `b` using `mask`.
183 __m128d _mm_blendv_pd (__m128d a, __m128d b, __m128d mask) @trusted
184 {
185     // PERF DMD
186     static if (GDC_with_SSE42)
187     {
188         // PERF Amazingly enough, GCC/GDC generates the blendvpd instruction
189         // with -msse4.2 but not -msse4.1.
        // Not sure what the reason is; a replacement sequence is generated instead.
191         // Sounds like a bug.
192         return __builtin_ia32_blendvpd(a, b, mask);
193     }
194     else static if (LDC_with_SSE41)
195     {
196         return __builtin_ia32_blendvpd(a, b, mask);
197     }
198     else static if (LDC_with_ARM64)
199     {
200         long2 shift;
201         shift = 63;
202         long2 lmask = cast(long2)mask >> shift;
203         return cast(__m128d) vbslq_s64(lmask, cast(long2)b, cast(long2)a);
204     }
205     else
206     {
207         __m128d r; // PERF =void;
208         long2 lmask = cast(long2)mask;
209         for (int n = 0; n < 2; ++n)
210         {
211             r.ptr[n] = (lmask.array[n] < 0) ? b.array[n] : a.array[n];
212         }
213         return r;
214     }
215 }
216 unittest
217 {
218     __m128d A = _mm_setr_pd(1.0, 2.0);
219     __m128d B = _mm_setr_pd(3.0, 4.0);
220     __m128d M1 = _mm_setr_pd(-3.0, 2.0);
221     __m128d R1 = _mm_blendv_pd(A, B, M1);
222     double[2] correct1 = [3.0, 2.0];
223     assert(R1.array == correct1);
224 
    // BUG: on arm64 Linux, LDC's _mm_blendv_pd doesn't work with a NaN mask for some unknown reason,
    // though it does work on arm64 macOS.
    // It yields different results despite FP seemingly not being used.
228     version(linux)
229     {}
230     else
231     {
232         __m128d M2 = _mm_setr_pd(double.nan, -double.nan);
233         __m128d R2 = _mm_blendv_pd(A, B, M2);
234         double[2] correct2 = [1.0, 4.0];
235         assert(R2.array == correct2);
236     }
237 }
238 
239 
240 /// Blend packed single-precision (32-bit) floating-point elements from `a` and `b` using `mask`.
241 __m128 _mm_blendv_ps (__m128 a, __m128 b, __m128 mask) @trusted
242 {
243     // PERF DMD
244     static if (GDC_with_SSE41)
245     {
246         return __builtin_ia32_blendvps(a, b, mask);
247     }
248     else static if (LDC_with_SSE41)
249     {
250         return __builtin_ia32_blendvps(a, b, mask);
251     }
252     else static if (LDC_with_ARM64)
253     {
254         int4 shift;
255         shift = 31;
256         int4 lmask = cast(int4)mask >> shift;
257         return cast(__m128) vbslq_s32(lmask, cast(int4)b, cast(int4)a);
258     }
259     else
260     {
261         __m128 r; // PERF =void;
262         int4 lmask = cast(int4)mask;
263         for (int n = 0; n < 4; ++n)
264         {
265             r.ptr[n] = (lmask.array[n] < 0) ? b.array[n] : a.array[n];
266         }
267         return r;
268     }
269 }
270 unittest
271 {
272     __m128 A  = _mm_setr_ps( 0.0f, 1.0f, 2.0f, 3.0f);
273     __m128 B  = _mm_setr_ps( 4.0f, 5.0f, 6.0f, 7.0f);
274     __m128 M1 = _mm_setr_ps(-3.0f, 2.0f, 1.0f, -10000.0f);
275     __m128 M2 = _mm_setr_ps(float.nan, -float.nan, -0.0f, +0.0f);
276     __m128 R1 = _mm_blendv_ps(A, B, M1);
277     __m128 R2 = _mm_blendv_ps(A, B, M2);
278     float[4] correct1 =    [ 4.0f, 1.0f, 2.0f, 7.0f];
279     float[4] correct2 =    [ 0.0f, 5.0f, 6.0f, 3.0f];
280     assert(R1.array == correct1);
281 
    // BUG: as above, LDC _mm_blendv_ps doesn't work with a NaN mask on arm64 Linux for some unknown reason;
    // it yields different results despite FP seemingly not being used.
284     version(linux)
285     {}
286     else
287     {
288         assert(R2.array == correct2);
289     }
290 }
291 
292 /// Round the packed double-precision (64-bit) floating-point elements in `a` up to an integer value, 
293 /// and store the results as packed double-precision floating-point elements.
294 __m128d _mm_ceil_pd (__m128d a) @trusted
295 {
296     static if (LDC_with_ARM64)
297     {
298         // LDC arm64 acceptable since 1.8 -O2
299         // Unfortunately x86 intrinsics force a round-trip back to double2
300         // ARM neon semantics wouldn't have that
301         long2 l = vcvtpq_s64_f64(a);
302         double2 r;
303         r.ptr[0] = l.array[0];
304         r.ptr[1] = l.array[1];
305         return r;
306     }
307     else
308     {
309         return _mm_round_pd!2(a);
310     }
311 }
312 unittest
313 {
314     __m128d A = _mm_setr_pd(1.3f, -2.12f);
315     __m128d B = _mm_setr_pd(53.6f, -2.7f);
316     A = _mm_ceil_pd(A);
317     B = _mm_ceil_pd(B);
318     double[2] correctA = [2.0, -2.0];
319     double[2] correctB = [54.0, -2.0];
320     assert(A.array == correctA);
321     assert(B.array == correctB);
322 }
323 
324 /// Round the packed single-precision (32-bit) floating-point elements in `a` up to an integer value, 
325 /// and store the results as packed single-precision floating-point elements.
326 __m128 _mm_ceil_ps (__m128 a) @trusted
327 {
328     static if (LDC_with_ARM64)
329     {
330         // LDC arm64 acceptable since 1.8 -O1
331         int4 l = vcvtpq_s32_f32(a);
332         float4 r;
333         r.ptr[0] = l.array[0];
334         r.ptr[1] = l.array[1];
335         r.ptr[2] = l.array[2];
336         r.ptr[3] = l.array[3];
337         return r;
338     }
339     else
340     {
341         return _mm_round_ps!2(a);
342     }
343 }
344 unittest
345 {
346     __m128 A = _mm_setr_ps(1.3f, -2.12f, 53.6f, -2.7f);
347     __m128 C = _mm_ceil_ps(A);
348     float[4] correct = [2.0f, -2.0f, 54.0f, -2.0f];
349     assert(C.array == correct);
350 }
351 
352 /// Round the lower double-precision (64-bit) floating-point element in `b` up to an integer value, 
353 /// store the result as a double-precision floating-point element in the lower element of result, 
354 /// and copy the upper element from `a` to the upper element of dst.
355 __m128d _mm_ceil_sd (__m128d a, __m128d b) @trusted
356 {
357     static if (LDC_with_ARM64)
358     {
359         a[0] = vcvtps_s64_f64(b[0]);
360         return a;
361     }
362     else
363     {
364         return _mm_round_sd!2(a, b);
365     }
366 }
367 unittest
368 {
369     __m128d A = _mm_setr_pd(1.3, -2.12);
370     __m128d B = _mm_setr_pd(53.6, -3.7);
371     __m128d C = _mm_ceil_sd(A, B);
372     double[2] correct = [54.0, -2.12];
373     assert(C.array == correct);
374 }
375 
376 /// Round the lower single-precision (32-bit) floating-point element in `b` up to an integer value,
377 /// store the result as a single-precision floating-point element in the lower element of result, 
378 /// and copy the upper 3 packed elements from `a` to the upper elements of result.
379 __m128 _mm_ceil_ss (__m128 a, __m128 b) @trusted
380 {
381     static if (LDC_with_ARM64)
382     {
383         a[0] = vcvtps_s32_f32(b[0]);
384         return a;
385     }
386     else
387     {
388         return _mm_round_ss!2(a, b);
389     }
390 }
391 unittest
392 {
393     __m128 A = _mm_setr_ps(1.3f, -2.12f, -4.5f, 1.1f);
394     __m128 B = _mm_setr_ps(53.6f, -3.7f, 8.0f, 7.0f);
395     __m128 C = _mm_ceil_ss(A, B);
396     float[4] correct = [54.0f, -2.12f, -4.5f, 1.1f];
397     assert(C.array == correct);
398 }
399 
400 /// Compare packed 64-bit integers in `a` and `b` for equality.
401 __m128i _mm_cmpeq_epi64 (__m128i a, __m128i b) @trusted
402 {
403     // PERF DMD
404     static if (GDC_with_SSE41)
405     {
406         return cast(__m128i)__builtin_ia32_pcmpeqq(cast(long2)a, cast(long2)b);
407     }
408     else version(LDC)
409     {
410         // LDC x86: generates pcmpeqq since LDC 1.1 -O1
411         //     arm64: generates cmeq since LDC 1.8 -O1
412         return cast(__m128i) equalMask!long2(cast(long2)a, cast(long2)b);
413     }
414     else
415     {
416         // Clever pcmpeqd + pand use with LDC 1.24 -O2
417         long2 la = cast(long2)a;
418         long2 lb = cast(long2)b;
419         long2 res;
420         res.ptr[0] = (la.array[0] == lb.array[0]) ? -1 : 0;
421         res.ptr[1] = (la.array[1] == lb.array[1]) ? -1 : 0;
422         return cast(__m128i)res;
423     }
424 }
425 unittest
426 {
427     __m128i A = _mm_setr_epi64(-1, -2);
428     __m128i B = _mm_setr_epi64(-3, -2);
429     __m128i C = _mm_setr_epi64(-1, -4);
430     long2 AB = cast(long2) _mm_cmpeq_epi64(A, B);
431     long2 AC = cast(long2) _mm_cmpeq_epi64(A, C);
432     long[2] correct1 = [0, -1];
433     long[2] correct2 = [-1, 0];
434     assert(AB.array == correct1);
435     assert(AC.array == correct2);
436 }
437 
438 
439 /// Sign extend packed 16-bit integers in `a` to packed 32-bit integers.
440 __m128i _mm_cvtepi16_epi32 (__m128i a) @trusted
441 {
442     // PERF DMD
443     static if (GDC_with_SSE41)
444     {
445         return cast(__m128i)__builtin_ia32_pmovsxwd128(cast(short8)a);
446     }
447     else version(LDC)
448     {
449         // LDC x86: Generates pmovsxwd since LDC 1.1 -O0, also good in arm64
450         enum ir = `
451             %v = shufflevector <8 x i16> %0,<8 x i16> %0, <4 x i32> <i32 0, i32 1,i32 2, i32 3>
452             %r = sext <4 x i16> %v to <4 x i32>
453             ret <4 x i32> %r`;
        return cast(__m128i) LDCInlineIR!(ir, int4, short8)(cast(short8)a);
455     }
456     else
457     {
458         short8 sa = cast(short8)a;
459         int4 r;
460         r.ptr[0] = sa.array[0];
461         r.ptr[1] = sa.array[1];
462         r.ptr[2] = sa.array[2];
463         r.ptr[3] = sa.array[3];
464         return r;
465     }
466 }
467 unittest
468 {
469     __m128i A = _mm_setr_epi16(-1, 0, -32768, 32767, 0, 0, 0, 0);
470     int4 C = cast(int4) _mm_cvtepi16_epi32(A);
471     int[4] correct = [-1, 0, -32768, 32767];
472     assert(C.array == correct);
473 }
474 
475 /// Sign extend packed 16-bit integers in `a` to packed 64-bit integers.
476 __m128i _mm_cvtepi16_epi64 (__m128i a) @trusted
477 {
478     // PERF DMD
479     static if (GDC_with_SSE41)
480     {
481         return cast(__m128i)__builtin_ia32_pmovsxwq128(cast(short8)a);
482     }
483     else version(LDC)
484     {
485         // LDC x86: Generates pmovsxwq since LDC 1.1 -O0, also good in arm64
486         enum ir = `
487             %v = shufflevector <8 x i16> %0,<8 x i16> %0, <2 x i32> <i32 0, i32 1>
488             %r = sext <2 x i16> %v to <2 x i64>
489             ret <2 x i64> %r`;
490         return cast(__m128i) LDCInlineIR!(ir, long2, short8)(cast(short8)a);
491     }
492     else
493     {
494         short8 sa = cast(short8)a;
495         long2 r;
496         r.ptr[0] = sa.array[0];
497         r.ptr[1] = sa.array[1];
498         return cast(__m128i)r;
499     }
500 }
501 unittest
502 {
503     __m128i A = _mm_setr_epi16(-32768, 32767, 0, 0, 0, 0, 0, 0);
504     long2 C = cast(long2) _mm_cvtepi16_epi64(A);
505     long[2] correct = [-32768, 32767];
506     assert(C.array == correct);
507 }
508 
509 /// Sign extend packed 32-bit integers in `a` to packed 64-bit integers.
510 __m128i _mm_cvtepi32_epi64 (__m128i a) @trusted
511 {
512     // PERF DMD
513     static if (GDC_with_SSE41)
514     {
515         return cast(__m128i)__builtin_ia32_pmovsxdq128(cast(int4)a);
516     }
517     else version(LDC)
518     {
519         // LDC x86: Generates pmovsxdq since LDC 1.1 -O0, also good in arm64
520         enum ir = `
521             %v = shufflevector <4 x i32> %0,<4 x i32> %0, <2 x i32> <i32 0, i32 1>
522             %r = sext <2 x i32> %v to <2 x i64>
523             ret <2 x i64> %r`;
524         return cast(__m128i) LDCInlineIR!(ir, long2, int4)(cast(int4)a);
525     }
526     else
527     {
528         int4 sa = cast(int4)a;
529         long2 r;
530         r.ptr[0] = sa.array[0];
531         r.ptr[1] = sa.array[1];
532         return cast(__m128i)r;
533     }
534 }
535 unittest
536 {
537     __m128i A = _mm_setr_epi32(-4, 42, 0, 0);
538     long2 C = cast(long2) _mm_cvtepi32_epi64(A);
539     long[2] correct = [-4, 42];
540     assert(C.array == correct);
541 }
542 
543 
544 /// Sign extend packed 8-bit integers in `a` to packed 16-bit integers.
545 __m128i _mm_cvtepi8_epi16 (__m128i a) @trusted
546 {
547     // PERF DMD
548     static if (GDC_with_SSE41)
549     {
550         alias ubyte16 = __vector(ubyte[16]);
551         return cast(__m128i)__builtin_ia32_pmovsxbw128(cast(ubyte16)a);
552     }
553     else version(LDC)
554     {
555         // LDC x86: pmovsxbw generated since LDC 1.1.0 -O0 
556         // LDC ARM64: sshll generated since LDC 1.8.0 -O1
557         enum ir = `
558             %v = shufflevector <16 x i8> %0,<16 x i8> %0, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
559             %r = sext <8 x i8> %v to <8 x i16>
560             ret <8 x i16> %r`;
561         return cast(__m128i) LDCInlineIR!(ir, short8, byte16)(cast(byte16)a);
562     }
563     else
564     {
565         byte16 sa = cast(byte16)a;
566         short8 r;
567         foreach(n; 0..8)
568             r.ptr[n] = sa.array[n];
569         return cast(__m128i)r;
570     }
571 }
572 unittest
573 {
574     __m128i A = _mm_setr_epi8(127, -128, 1, -1, 0, 2, -4, -8, 0, 0, 0, 0, 0, 0, 0, 0);
575     short8 C = cast(short8) _mm_cvtepi8_epi16(A);
576     short[8] correct = [127, -128, 1, -1, 0, 2, -4, -8];
577     assert(C.array == correct);
578 }
579 
580 
581 /// Sign extend packed 8-bit integers in `a` to packed 32-bit integers.
582 __m128i _mm_cvtepi8_epi32 (__m128i a) @trusted
583 {
584     // PERF DMD
585     static if (GDC_with_SSE41)
586     {
587         alias ubyte16 = __vector(ubyte[16]);
588         return cast(__m128i)__builtin_ia32_pmovsxbd128(cast(ubyte16)a);
589     }
590     else static if (LDC_with_SSE41)
591     {
592         // LDC x86: Generates pmovsxbd since LDC 1.1 -O0
593         enum ir = `
594             %v = shufflevector <16 x i8> %0,<16 x i8> %0, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
595             %r = sext <4 x i8> %v to <4 x i32>
596             ret <4 x i32> %r`;
597         return cast(__m128i) LDCInlineIR!(ir, int4, byte16)(cast(byte16)a);
598     }
599     else
600     {
        // LDC ARM64: this gives the same codegen as a vmovl_s16/vmovl_s8 sequence would
602         byte16 sa = cast(byte16)a;
603         int4 r;
604         r.ptr[0] = sa.array[0];
605         r.ptr[1] = sa.array[1];
606         r.ptr[2] = sa.array[2];
607         r.ptr[3] = sa.array[3];
608         return cast(__m128i)r;
609     }
610 }
611 unittest
612 {
613     __m128i A = _mm_setr_epi8(127, -128, 1, -1, 0, 2, -4, -8, 0, 0, 0, 0, 0, 0, 0, 0);
614     int4 C = cast(int4) _mm_cvtepi8_epi32(A);
615     int[4] correct = [127, -128, 1, -1];
616     assert(C.array == correct);
617 }
618 
619 
/// Sign extend packed 8-bit integers in the low 2 bytes of `a` to packed 64-bit integers.
621 __m128i _mm_cvtepi8_epi64 (__m128i a) @trusted
622 {
623     // PERF DMD
624     static if (GDC_with_SSE41)
625     {
626         alias ubyte16 = __vector(ubyte[16]);
627         return cast(__m128i)__builtin_ia32_pmovsxbq128(cast(ubyte16)a);
628     }
629     else version(LDC)
630     {
631         // LDC x86: Generates pmovsxbq since LDC 1.1 -O0, 
632         // LDC arm64: it's ok since LDC 1.8 -O1
633         enum ir = `
634             %v = shufflevector <16 x i8> %0,<16 x i8> %0, <2 x i32> <i32 0, i32 1>
635             %r = sext <2 x i8> %v to <2 x i64>
636             ret <2 x i64> %r`;
637         return cast(__m128i) LDCInlineIR!(ir, long2, byte16)(cast(byte16)a);
638     }
639     else
640     {
641         byte16 sa = cast(byte16)a;
642         long2 r;
643         foreach(n; 0..2)
644             r.ptr[n] = sa.array[n];
645         return cast(__m128i)r;
646     }
647 }
648 unittest
649 {
650     __m128i A = _mm_setr_epi8(127, -128, 1, -1, 0, 2, -4, -8, 0, 0, 0, 0, 0, 0, 0, 0);
651     long2 C = cast(long2) _mm_cvtepi8_epi64(A);
652     long[2] correct = [127, -128];
653     assert(C.array == correct);
654 }
655 
656 
657 /// Zero extend packed unsigned 16-bit integers in `a` to packed 32-bit integers.
658 __m128i _mm_cvtepu16_epi32 (__m128i a) @trusted
659 {
660     // PERF DMD
661     static if (GDC_with_SSE41)
662     {
663         return cast(__m128i) __builtin_ia32_pmovzxwd128(cast(short8)a);
664     }
665     else
666     {
667         // LDC x86: generates pmovzxwd since LDC 1.12 -O1 also good without SSE4.1
668         //     arm64: ushll since LDC 1.12 -O1
669         short8 sa = cast(short8)a;
670         int4 r;
671         r.ptr[0] = cast(ushort)sa.array[0];
672         r.ptr[1] = cast(ushort)sa.array[1];
673         r.ptr[2] = cast(ushort)sa.array[2];
674         r.ptr[3] = cast(ushort)sa.array[3];
675         return cast(__m128i)r;
676     }
677 }
678 unittest
679 {
680     __m128i A = _mm_setr_epi16(-1, 0, -32768, 32767, 0, 0, 0, 0);
681     int4 C = cast(int4) _mm_cvtepu16_epi32(A);
682     int[4] correct = [65535, 0, 32768, 32767];
683     assert(C.array == correct);
684 }
685 
686 
687 /// Zero extend packed unsigned 16-bit integers in `a` to packed 64-bit integers.
688 __m128i _mm_cvtepu16_epi64 (__m128i a) @trusted
689 {
690     // PERF DMD
691     static if (GDC_with_SSE41)
692     {
693         return cast(__m128i) __builtin_ia32_pmovzxwq128(cast(short8)a);
694     }
695     else static if (LDC_with_ARM64)
696     {
697         // LDC arm64: a bit shorter than below, in -O2
698         short8 sa = cast(short8)a;
699         long2 r;
700         for(int n = 0; n < 2; ++n)
701             r.ptr[n] = cast(ushort)sa.array[n];
702         return cast(__m128i)r;
703     }
704     else
705     {
706         // LDC x86: generates pmovzxwd since LDC 1.12 -O1 also good without SSE4.1
707         short8 sa = cast(short8)a;
708         long2 r;
709         r.ptr[0] = cast(ushort)sa.array[0];
710         r.ptr[1] = cast(ushort)sa.array[1];
711         return cast(__m128i)r;
712     }
713 }
714 unittest
715 {
716     __m128i A = _mm_setr_epi16(-1, 0, -32768, 32767, 0, 0, 0, 0);
717     long2 C = cast(long2) _mm_cvtepu16_epi64(A);
718     long[2] correct = [65535, 0];
719     assert(C.array == correct);
720 }
721 
722 
723 /// Zero extend packed unsigned 32-bit integers in `a` to packed 64-bit integers.
724 __m128i _mm_cvtepu32_epi64 (__m128i a) @trusted
725 {
726     // PERF DMD
727     static if (GDC_with_SSE41)
728     {
        return cast(__m128i) __builtin_ia32_pmovzxdq128(cast(int4)a);
730     }
731     else
732     {
733         // LDC x86: generates pmovzxdq since LDC 1.12 -O1 also good without SSE4.1
734         //     arm64: generates ushll since LDC 1.12 -O1
735         int4 sa = cast(int4)a;
736         long2 r;
737         r.ptr[0] = cast(uint)sa.array[0];
738         r.ptr[1] = cast(uint)sa.array[1];
739         return cast(__m128i)r;
740     }
741 }
742 unittest
743 {
744     __m128i A = _mm_setr_epi32(-1, 42, 0, 0);
745     long2 C = cast(long2) _mm_cvtepu32_epi64(A);
746     long[2] correct = [4294967295, 42];
747     assert(C.array == correct);
748 }
749 
750 
751 /// Zero extend packed unsigned 8-bit integers in `a` to packed 16-bit integers.
752 __m128i _mm_cvtepu8_epi16 (__m128i a) @trusted
753 {
754     // PERF DMD
755     static if (GDC_with_SSE41)
756     {
        alias ubyte16 = __vector(ubyte[16]);
        return cast(__m128i) __builtin_ia32_pmovzxbw128(cast(ubyte16)a);
758     }
759     else
760     {
761         // LDC x86: generates pmovzxbw since LDC 1.12 -O1 also good without SSE4.1
762         //     arm64: ushll since LDC 1.12 -O1
763         // PERF: catastrophic with GDC without SSE4.1
764         byte16 sa = cast(byte16)a;
765         short8 r;
766         r.ptr[0] = cast(ubyte)sa.array[0];
767         r.ptr[1] = cast(ubyte)sa.array[1];
768         r.ptr[2] = cast(ubyte)sa.array[2];
769         r.ptr[3] = cast(ubyte)sa.array[3];
770         r.ptr[4] = cast(ubyte)sa.array[4];
771         r.ptr[5] = cast(ubyte)sa.array[5];
772         r.ptr[6] = cast(ubyte)sa.array[6];
773         r.ptr[7] = cast(ubyte)sa.array[7];
774         return cast(__m128i)r;
775     }
776 }
777 unittest
778 {
779     __m128i A = _mm_setr_epi8(127, -128, 1, -1, 0, 2, -4, -8, 0, 0, 0, 0, 0, 0, 0, 0);
780     short8 C = cast(short8) _mm_cvtepu8_epi16(A);
781     short[8] correct = [127, 128, 1, 255, 0, 2, 252, 248];
782     assert(C.array == correct);
783 }
784 
785 
786 /// Zero extend packed unsigned 8-bit integers in `a` to packed 32-bit integers.
787 __m128i _mm_cvtepu8_epi32 (__m128i a) @trusted
788 {
789     // PERF DMD
790     static if (GDC_with_SSE41)
791     {
792         alias ubyte16 = __vector(ubyte[16]);
793         return cast(__m128i) __builtin_ia32_pmovzxbd128(cast(ubyte16)a);
794     }
795     else static if (LDC_with_ARM64)
796     {
797         // LDC arm64: a bit better than below in -O2
798         byte16 sa = cast(byte16)a;
799         int4 r;
800         for(int n = 0; n < 4; ++n) 
801             r.ptr[n] = cast(ubyte)sa.array[n];
802         return cast(__m128i)r;
803     }
804     else
805     {
806         // LDC x86: generates pmovzxbd since LDC 1.12 -O1 also good without SSE4.1
807         // PERF: catastrophic with GDC without SSE4.1
808         byte16 sa = cast(byte16)a;
809         int4 r;
810         r.ptr[0] = cast(ubyte)sa.array[0];
811         r.ptr[1] = cast(ubyte)sa.array[1];
812         r.ptr[2] = cast(ubyte)sa.array[2];
813         r.ptr[3] = cast(ubyte)sa.array[3];
814         return cast(__m128i)r;
815     }
816 }
817 unittest
818 {
819     __m128i A = _mm_setr_epi8(127, -128, 1, -1, 0, 2, -4, -8, 0, 0, 0, 0, 0, 0, 0, 0);
820     int4 C = cast(int4) _mm_cvtepu8_epi32(A);
821     int[4] correct = [127, 128, 1, 255];
822     assert(C.array == correct);
823 }
824 
/// Zero extend packed unsigned 8-bit integers in the low 2 bytes of `a` to packed 64-bit integers.
826 __m128i _mm_cvtepu8_epi64 (__m128i a) @trusted
827 {
828     // PERF DMD
829     static if (GDC_with_SSE41)
830     {
831         alias ubyte16 = __vector(ubyte[16]);
832         return cast(__m128i)__builtin_ia32_pmovzxbq128(cast(ubyte16)a);
833     }
834     else static if (LDC_with_ARM64)
835     {
836         // LDC arm64: this optimizes better than the loop below
837         byte16 sa = cast(byte16)a;
838         long2 r;
839         for (int n = 0; n < 2; ++n)
840             r.ptr[n] = cast(ubyte)sa.array[n];
841         return cast(__m128i)r;
842     }
843     else
844     {
845         // LDC x86: Generates pmovzxbq since LDC 1.1 -O0, a pshufb without SSE4.1
846         byte16 sa = cast(byte16)a;
847         long2 r;
848         r.ptr[0] = cast(ubyte)sa.array[0];
849         r.ptr[1] = cast(ubyte)sa.array[1];
850         return cast(__m128i)r;
851     }
852 }
853 unittest
854 {
855     __m128i A = _mm_setr_epi8(127, -2, 1, -1, 0, 2, -4, -8, 0, 0, 0, 0, 0, 0, 0, 0);
856     long2 C = cast(long2) _mm_cvtepu8_epi64(A);
857     long[2] correct = [127, 254];
858     assert(C.array == correct);
859 }
860 
861 /// Conditionally multiply the packed double-precision (64-bit) floating-point elements 
/// in `a` and `b` using the high 4 bits in `imm8`, sum the two products, and conditionally
863 /// store the sum in dst using the low 4 bits of `imm8`.
864 __m128d _mm_dp_pd(int imm8)(__m128d a, __m128d b) @trusted
865 {
866     // PERF DMD
867     static if (GDC_with_SSE41)
868     {
869         return __builtin_ia32_dppd(a, b, imm8 & 0x33);
870     }
871     else static if (LDC_with_SSE41)
872     {
873         return __builtin_ia32_dppd(a, b, imm8 & 0x33);
874     }
875     else
876     {
877         __m128d zero = _mm_setzero_pd();
878         __m128d temp = _mm_blend_pd!( (imm8 >>> 4) & 3)(zero, a * b);
879         double sum = temp.array[0] + temp.array[1];
880         return _mm_blend_pd!(imm8 & 3)(zero, _mm_set1_pd(sum));
881     }
882 }
883 unittest
884 {
885     __m128d A = _mm_setr_pd(1.0, 2.0);
886     __m128d B = _mm_setr_pd(4.0, 8.0);
887     double2 R1 = _mm_dp_pd!(0x10 + 0x3 + 0x44)(A, B);
888     double2 R2 = _mm_dp_pd!(0x20 + 0x1 + 0x88)(A, B);
889     double2 R3 = _mm_dp_pd!(0x30 + 0x2 + 0x00)(A, B);
890     double[2] correct1 = [ 4.0,  4.0];
891     double[2] correct2 = [16.0,  0.0];
892     double[2] correct3 = [ 0.0, 20.0];
893     assert(R1.array == correct1);
894     assert(R2.array == correct2);
895     assert(R3.array == correct3);
896 }
897 
898 /// Conditionally multiply the packed single-precision (32-bit) floating-point elements 
899 /// in `a` and `b` using the high 4 bits in `imm8`, sum the four products, 
900 /// and conditionally store the sum in result using the low 4 bits of `imm8`.
901 __m128 _mm_dp_ps(int imm8)(__m128 a, __m128 b) @trusted
902 {
    // PERF DMD
904     static if (GDC_with_SSE41)
905     {
906         return __builtin_ia32_dpps(a, b, cast(byte)imm8);
907     }
908     else static if (LDC_with_SSE41)
909     {
910         return __builtin_ia32_dpps(a, b, cast(byte)imm8);
911     }
912     else
913     {
914         __m128 zero = _mm_setzero_ps();
915         __m128 temp = _mm_blend_ps!( (imm8 >>> 4) & 15)(zero, a * b);
916         float sum = temp.array[0] + temp.array[1] + temp.array[2] + temp.array[3];
917         return _mm_blend_ps!(imm8 & 15)(zero, _mm_set1_ps(sum));
918     }        
919 }
920 unittest
921 {
922     __m128 A = _mm_setr_ps(1.0f, 2.0f, 4.0f, 8.0f);
923     __m128 B = _mm_setr_ps(9.0f, 7.0f, 5.0f, 3.0f);
924     float4 R1 = _mm_dp_ps!(0xf0 + 0xf)(A, B);
925     float4 R2 = _mm_dp_ps!(0x30 + 0x5)(A, B);
926     float4 R3 = _mm_dp_ps!(0x50 + 0xa)(A, B);
927     float[4] correct1 =   [67.0f, 67.0f, 67.0f, 67.0f];
928     float[4] correct2 =   [23.0f, 0.0f, 23.0f, 0.0f];
929     float[4] correct3 =   [0.0f, 29.0f, 0.0f, 29.0f];
930     assert(R1.array == correct1);
931     assert(R2.array == correct2);
932     assert(R3.array == correct3);
933 }
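
// A common use of _mm_dp_ps is a full 4-component dot product broadcast to all lanes
// with imm8 = 0xFF; a small sketch of that pattern (illustrative values, not part of
// the original test suite):
unittest
{
    __m128 x = _mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f);
    __m128 y = _mm_setr_ps(5.0f, 6.0f, 7.0f, 8.0f);
    // 1*5 + 2*6 + 3*7 + 4*8 = 70, broadcast to every lane; lane 0 extracted here
    float dot = _mm_cvtss_f32(_mm_dp_ps!0xFF(x, y));
    assert(dot == 70.0f);
}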
934 
935 
936 /// Extract a 32-bit integer from `a`, selected with `imm8`.
937 int _mm_extract_epi32 (__m128i a, const int imm8) pure @trusted
938 {
939     return (cast(int4)a).array[imm8 & 3];
940 }
941 unittest
942 {
943     __m128i A = _mm_setr_epi32(1, 2, 3, 4);
944     assert(_mm_extract_epi32(A, 0) == 1);
945     assert(_mm_extract_epi32(A, 1 + 8) == 2);
946     assert(_mm_extract_epi32(A, 3 + 4) == 4);
947 }
948 
949 /// Extract a 64-bit integer from `a`, selected with `imm8`.
950 long _mm_extract_epi64 (__m128i a, const int imm8) pure @trusted
951 {
952     long2 la = cast(long2)a;
953     return la.array[imm8 & 1];
954 }
955 unittest
956 {
957     __m128i A = _mm_setr_epi64(45, -67);
958     assert(_mm_extract_epi64(A, 0) == 45);
959     assert(_mm_extract_epi64(A, 1) == -67);
960     assert(_mm_extract_epi64(A, 2) == 45);
961 }
962 
963 /// Extract an 8-bit integer from `a`, selected with `imm8`.
964 /// Warning: the returned value is zero-extended to 32-bits.
965 int _mm_extract_epi8 (__m128i a, const int imm8) @trusted
966 {
967     byte16 ba = cast(byte16)a;
968     return cast(ubyte) ba.array[imm8 & 15];
969 }
970 unittest
971 {
972     __m128i A = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, -1, 14, 15);
973     assert(_mm_extract_epi8(A, 7) == 7);
974     assert(_mm_extract_epi8(A, 13) == 255);
975     assert(_mm_extract_epi8(A, 7 + 16) == 7);
976 }
977 
978 /// Extract a single-precision (32-bit) floating-point element from `a`, selected with `imm8`.
979 /// Note: returns a 32-bit $(I integer).
980 int _mm_extract_ps (__m128 a, const int imm8) @trusted
981 {
982     return (cast(int4)a).array[imm8 & 3];
983 }
984 unittest
985 {
986     __m128 A = _mm_setr_ps(1.0f, 2.0f, 3.0f, -4.0f);
987     assert(_mm_extract_ps(A, 0) == 0x3f800000);
988     assert(_mm_extract_ps(A, 1 + 8) == 0x40000000);
989     assert(_mm_extract_ps(A, 3 + 4) == cast(int)0xc0800000);
990 }
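
// Since _mm_extract_ps returns the raw bit pattern, getting the element back as a float
// requires a reinterpretation rather than a conversion; a small sketch (assumption: this
// is the usual idiom, it is not an intrinsic of its own):
unittest
{
    __m128 A = _mm_setr_ps(1.0f, 2.0f, 3.0f, -4.0f);
    int bits = _mm_extract_ps(A, 3); // bit pattern of -4.0f
    float f = *cast(float*)&bits;    // reinterpret the bits, do not convert
    assert(f == -4.0f);
}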
991 
992 
993 
994 /// Round the packed double-precision (64-bit) floating-point elements in `a` down to an 
995 /// integer value, and store the results as packed double-precision floating-point elements.
996 __m128d _mm_floor_pd (__m128d a) @trusted
997 {
998     static if (LDC_with_ARM64)
999     {
1000         // LDC arm64 acceptable since 1.8 -O2
1001         long2 l = vcvtmq_s64_f64(a);
1002         double2 r;
1003         r.ptr[0] = l.array[0];
1004         r.ptr[1] = l.array[1];
1005         return r;
1006     }
1007     else
1008     {
1009         return _mm_round_pd!1(a);
1010     }
1011 }
1012 unittest
1013 {
1014     __m128d A = _mm_setr_pd(1.3f, -2.12f);
1015     __m128d B = _mm_setr_pd(53.6f, -2.7f);
1016     A = _mm_floor_pd(A);
1017     B = _mm_floor_pd(B);
1018     double[2] correctA = [1.0, -3.0];
1019     double[2] correctB = [53.0, -3.0];
1020     assert(A.array == correctA);
1021     assert(B.array == correctB);
1022 }
1023 
1024 /// Round the packed single-precision (32-bit) floating-point elements in `a` down to an 
1025 /// integer value, and store the results as packed single-precision floating-point elements.
1026 __m128 _mm_floor_ps (__m128 a) @trusted
1027 {
1028     static if (LDC_with_ARM64)
1029     {
1030         // LDC arm64 acceptable since 1.8 -O1
1031         int4 l = vcvtmq_s32_f32(a);
1032         float4 r;
1033         r.ptr[0] = l.array[0];
1034         r.ptr[1] = l.array[1];
1035         r.ptr[2] = l.array[2];
1036         r.ptr[3] = l.array[3];
1037         return r;
1038     }
1039     else
1040     {
1041         return _mm_round_ps!1(a);
1042     }
1043 }
1044 unittest
1045 {
1046     __m128 A = _mm_setr_ps(1.3f, -2.12f, 53.6f, -2.7f);
1047     __m128 C = _mm_floor_ps(A);
1048     float[4] correct = [1.0f, -3.0f, 53.0f, -3.0f];
1049     assert(C.array == correct);
1050 }
1051 
1052 /// Round the lower double-precision (64-bit) floating-point element in `b` down to an 
1053 /// integer value, store the result as a double-precision floating-point element in the 
1054 /// lower element, and copy the upper element from `a` to the upper element.
1055 __m128d _mm_floor_sd (__m128d a, __m128d b) @trusted
1056 {
1057     static if (LDC_with_ARM64)
1058     {
1059         a[0] = vcvtms_s64_f64(b[0]);
1060         return a;
1061     }
1062     else
1063     {
1064         return _mm_round_sd!1(a, b);
1065     }
1066 }
1067 unittest
1068 {
1069     __m128d A = _mm_setr_pd(1.3, -2.12);
1070     __m128d B = _mm_setr_pd(-53.1, -3.7);
1071     __m128d C = _mm_floor_sd(A, B);
1072     double[2] correct = [-54.0, -2.12];
1073     assert(C.array == correct);
1074 }
1075 
1076 /// Round the lower single-precision (32-bit) floating-point element in `b` down to an
1077 /// integer value, store the result as a single-precision floating-point element in the
1078 /// lower element, and copy the upper 3 packed elements from `a` to the upper elements.
1079 __m128 _mm_floor_ss (__m128 a, __m128 b) @trusted
1080 {
1081     static if (LDC_with_ARM64)
1082     {
1083         a[0] = vcvtms_s32_f32(b[0]);
1084         return a;
1085     }
1086     else
1087     {
1088         return _mm_round_ss!1(a, b);
1089     }
1090 }
1091 unittest
1092 {
1093     __m128 A = _mm_setr_ps(1.3f, -2.12f, -4.5f, 1.1f);
1094     __m128 B = _mm_setr_ps(-539.3f, -3.7f, 8.0f, 7.0f);
1095     __m128 C = _mm_floor_ss(A, B);
1096     float[4] correct = [-540.0f, -2.12f, -4.5f, 1.1f];
1097     assert(C.array == correct);
1098 }
1099 
1100 /// Insert the 32-bit integer `i` into `a` at the location specified by `imm8[1:0]`.
1101 __m128i _mm_insert_epi32 (__m128i a, int i, const int imm8) pure @trusted
1102 {
1103     // GDC: nothing special to do, pinsrd generated with -O1 -msse4.1
    // LDC x86: pinsrd since LDC 1.1 -O2 with -mattr=+sse4.1
1105     // LDC arm64: ins.s since LDC 1.8 -O2
1106     int4 ia = cast(int4)a;
1107     ia.ptr[imm8 & 3] = i;
1108     return cast(__m128i)ia; 
1109 }
1110 unittest
1111 {
1112     __m128i A = _mm_setr_epi32(1, 2, 3, 4);
1113     int4 C = cast(int4) _mm_insert_epi32(A, 5, 2 + 4);
1114     int[4] result = [1, 2, 5, 4];
1115     assert(C.array == result);
1116 }
1117 
1118 /// Insert the 64-bit integer `i` into `a` at the location specified by `imm8[0]`.
1119 __m128i _mm_insert_epi64 (__m128i a, long i, const int imm8) pure @trusted
1120 {
    // GDC: nothing special to do, pinsrq generated with -O1 -msse4.1
1122     // LDC x86: always do something sensible.
1123     long2 la = cast(long2)a;
1124     la.ptr[imm8 & 1] = i;
1125     return cast(__m128i)la;
1126 }
1127 unittest
1128 {
1129     __m128i A = _mm_setr_epi64(1, 2);
1130     long2 C = cast(long2) _mm_insert_epi64(A, 5, 1 + 2);
1131     long[2] result = [1, 5];
1132     assert(C.array == result);
1133 }
1134 
/// Insert the lower 8 bits of the integer `i` into `a` at the location specified by `imm8[3:0]`.
1137 __m128i _mm_insert_epi8 (__m128i a, int i, const int imm8) @trusted
1138 {
1139     // GDC: nothing special to do, pinsrb generated with -O1 -msse4.1
1140     // LDC x86: doesn't do pinsrb, maybe it's slower. arm64 also spills to memory.
1141     byte16 ba = cast(byte16)a;
1142     ba.ptr[imm8 & 15] = cast(byte)i;
1143     return cast(__m128i)ba; 
1144 }
1145 unittest
1146 {
1147     __m128i A = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
1148     byte16 C = cast(byte16) _mm_insert_epi8(A, 30, 4 + 16);
1149     byte[16] result = [0, 1, 2, 3, 30, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15];
1150     assert(C.array == result);
1151 }
1152 
1153 
/// Warning: of course it does something totally different from `_mm_insert_epi32`!
/// Copy `a` to `tmp`, then insert a single-precision (32-bit) floating-point element from `b` 
/// into `tmp` using the control in `imm8`: `imm8[7:6]` selects the source element of `b`, and
/// `imm8[5:4]` selects the destination element of `tmp`. Store `tmp` to result using the zeroing
/// mask in `imm8[3:0]` (elements are zeroed out when the corresponding bit is set).
1158 __m128 _mm_insert_ps(int imm8)(__m128 a, __m128 b) @trusted
1159 {
1160     // PERF DMD
1161     static if (GDC_with_SSE41)
1162     {
1163         return __builtin_ia32_insertps128(a, b, cast(byte)imm8);
1164     }
1165     else static if (LDC_with_SSE41)
1166     {
1167         return __builtin_ia32_insertps128(a, b, cast(byte)imm8);
1168     }
1169     else
1170     {
1171         float4 tmp2 = a;
1172         float tmp1 = b.array[(imm8 >> 6) & 3];
1173         tmp2.ptr[(imm8 >> 4) & 3] = tmp1;
1174         return _mm_blend_ps!(imm8 & 15)(tmp2, _mm_setzero_ps());
1175     }
1176 }
1177 unittest
1178 {
1179     __m128 A = _mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f);
1180     __m128 B = _mm_setr_ps(5.0f, 6.0f, 7.0f, 8.0f);
1181     __m128 C = _mm_insert_ps!(128 + (32 + 16) + 4)(A, B);
1182     float[4] correct =    [1.0f, 2.0f, 0.0f, 7.0f];
1183     assert(C.array == correct);
1184 }
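
// Because of the zeroing mask, _mm_insert_ps can also be used purely to clear lanes by
// passing the same vector twice; a small sketch (illustrative, values chosen arbitrarily):
unittest
{
    __m128 A = _mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f);
    __m128 C = _mm_insert_ps!(0b1001)(A, A); // zero lanes 0 and 3, keep the others
    float[4] correct = [0.0f, 2.0f, 3.0f, 0.0f];
    assert(C.array == correct);
}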
1185 
1186 
1187 /// Compare packed signed 32-bit integers in `a` and `b`, returns packed maximum values.
1188 __m128i _mm_max_epi32 (__m128i a, __m128i b) @trusted
1189 {
1190     static if (GDC_with_SSE41)
1191     {
1192         return cast(__m128i) __builtin_ia32_pmaxsd128(cast(int4)a, cast(int4)b);
1193     }
1194     else version(LDC)
1195     {
1196         // x86: pmaxsd since LDC 1.1 -O1
        // ARM: smax.4s since LDC 1.8 -O1
1198         int4 sa = cast(int4)a;
1199         int4 sb = cast(int4)b;
1200         int4 greater = greaterMask!int4(sa, sb);
1201         return cast(__m128i)( (greater & sa) | (~greater & sb) );
1202     }
1203     else
1204     {
1205         __m128i higher = _mm_cmpgt_epi32(a, b);
1206         __m128i aTob = _mm_xor_si128(a, b); // a ^ (a ^ b) == b
1207         __m128i mask = _mm_and_si128(aTob, higher);
1208         return _mm_xor_si128(b, mask);
1209     }
1210 }
1211 unittest
1212 {
1213     int4 R = cast(int4) _mm_max_epi32(_mm_setr_epi32(0x7fffffff, 1, -4, 7),
1214                                       _mm_setr_epi32(        -4,-8,  9, -8));
1215     int[4] correct =                               [0x7fffffff, 1,  9,  7];
1216     assert(R.array == correct);
1217 }
1218 
1219 /// Compare packed signed 8-bit integers in `a` and `b`, 
1220 /// and return packed maximum values.
1221 __m128i _mm_max_epi8 (__m128i a, __m128i b) @trusted
1222 {
1223     // PERF DMD
1224     static if (GDC_with_SSE41)
1225     {
1226         return cast(__m128i) __builtin_ia32_pmaxsb128(cast(ubyte16)a, cast(ubyte16)b);
1227     }
1228     else version(LDC)
1229     {
1230         // x86: pmaxsb since LDC 1.1 -O1
1231         // ARM64: smax.16b since LDC 1.8.0 -O1
1232         byte16 sa = cast(byte16)a;
1233         byte16 sb = cast(byte16)b;
1234         byte16 greater = cast(byte16) greaterMask!byte16(sa, sb);
1235         return cast(__m128i)( (greater & sa) | (~greater & sb) );
1236     }
1237     else
1238     {
1239         __m128i lower = _mm_cmpgt_epi8(a, b); // ones where a should be selected, b else
1240         __m128i aTob = _mm_xor_si128(a, b); // a ^ (a ^ b) == b
1241         __m128i mask = _mm_and_si128(aTob, lower);
1242         return _mm_xor_si128(b, mask);
1243     }
1244 }
1245 unittest
1246 {
1247     __m128i A = _mm_setr_epi8(127,  1, -4, -8, 9,    7, 0, 57, 0, 0, 0, 0, 0, 0, 0, 0);
1248     __m128i B = _mm_setr_epi8(  4, -8,  9, -7, 0, -128, 0,  0, 0, 0, 0, 0, 0, 0, 0, 0);
1249     byte16 R = cast(byte16) _mm_max_epi8(A, B);
1250     byte[16] correct =       [127,  1,  9, -7, 9,    7, 0, 57, 0, 0, 0, 0, 0, 0, 0, 0];
1251     assert(R.array == correct);
1252 }
1253 
1254 /// Compare packed unsigned 16-bit integers in `a` and `b`, returns packed maximum values.
1255 __m128i _mm_max_epu16 (__m128i a, __m128i b) @trusted
1256 {
1257     // PERF DMD
1258     static if (GDC_with_SSE41)
1259     {
1260         return cast(__m128i) __builtin_ia32_pmaxuw128(cast(short8)a, cast(short8)b);
1261     }
1262     else version(LDC)
1263     {
1264         // x86: pmaxuw since LDC 1.1 -O1
1265         // ARM64: umax.8h since LDC 1.8.0 -O1
1266         // PERF: without sse4.1, LLVM 12 produces a very interesting
1267         //          psubusw xmm0, xmm1
1268         //          paddw   xmm0, xmm1
1269         //       sequence that maybe should go in other min/max intrinsics? 
1270         ushort8 sa = cast(ushort8)a;
1271         ushort8 sb = cast(ushort8)b;
1272         ushort8 greater = cast(ushort8) greaterMask!ushort8(sa, sb);
1273         return cast(__m128i)( (greater & sa) | (~greater & sb) );
1274     }
1275     else
1276     {
1277         b = _mm_subs_epu16(b, a);
1278         b = _mm_add_epi16(b, a);
1279         return b;
1280     }
1281 }
1282 unittest
1283 {
1284     short8 R = cast(short8) _mm_max_epu16(_mm_setr_epi16(32767,  1, -4, -8, 9,     7, 0, 57),
1285                                           _mm_setr_epi16(   -4, -8,  9, -7, 0,-32768, 0,  0));
1286     short[8] correct =                                  [   -4, -8, -4, -7, 9,-32768, 0, 57];
1287     assert(R.array == correct);
1288 }
1289 
1290 /// Compare packed unsigned 32-bit integers in `a` and `b`, returns packed maximum values.
1291 __m128i _mm_max_epu32 (__m128i a, __m128i b) @trusted
1292 {
1293     // PERF DMD
1294     static if (GDC_with_SSE41)
1295     {
1296         return cast(__m128i) __builtin_ia32_pmaxud128(cast(int4)a, cast(int4)b);
1297     }
1298     else version(LDC)
1299     {
1300         // x86: pmaxud since LDC 1.1 -O1, also good without sse4.1
1301         // ARM64: umax.4s since LDC 1.8.0 -O1
1302         uint4 sa = cast(uint4)a;
1303         uint4 sb = cast(uint4)b;
1304         uint4 greater = cast(uint4) greaterMask!uint4(sa, sb);
1305         return cast(__m128i)( (greater & sa) | (~greater & sb) );
1306     }
1307     else
1308     {
1309         __m128i valueShift = _mm_set1_epi32(-0x80000000);
1310         __m128i higher = _mm_cmpgt_epi32(_mm_add_epi32(a, valueShift), _mm_add_epi32(b, valueShift));
1311         __m128i aTob = _mm_xor_si128(a, b); // a ^ (a ^ b) == b
1312         __m128i mask = _mm_and_si128(aTob, higher);
1313         return _mm_xor_si128(b, mask);
1314     }
1315 }
1316 unittest
1317 {
1318     int4 R = cast(int4) _mm_max_epu32(_mm_setr_epi32(0x7fffffff, 1,  4, -7),
1319                                       _mm_setr_epi32(        -4,-8,  9, -8));
1320     int[4] correct =                                [        -4,-8,  9, -7];
1321     assert(R.array == correct);
1322 }
1323 
/// Compare packed signed 32-bit integers in `a` and `b`, returns packed minimum values.
1325 __m128i _mm_min_epi32 (__m128i a, __m128i b) @trusted
1326 {
1327     // PERF DMD
1328     static if (GDC_with_SSE41)
1329     {
1330         return cast(__m128i) __builtin_ia32_pminsd128(cast(int4)a, cast(int4)b);
1331     }
1332     else version(LDC)
1333     {
1334         // x86: pminsd since LDC 1.1 -O1, also good without sse4.1
        // ARM: smin.4s since LDC 1.8 -O1
1336         int4 sa = cast(int4)a;
1337         int4 sb = cast(int4)b;
1338         int4 greater = greaterMask!int4(sa, sb);
1339         return cast(__m128i)( (~greater & sa) | (greater & sb) );
1340     }
1341     else
1342     {
        __m128i lower = _mm_cmplt_epi32(a, b); // ones where a should be selected, b else
        __m128i aTob = _mm_xor_si128(a, b); // a ^ (a ^ b) == b
        __m128i mask = _mm_and_si128(aTob, lower);
1346         return _mm_xor_si128(b, mask);
1347     }
1348 }
1349 unittest
1350 {
1351     int4 R = cast(int4) _mm_min_epi32(_mm_setr_epi32(0x7fffffff,  1, -4, 7),
1352                                       _mm_setr_epi32(        -4, -8,  9, -8));
1353     int[4] correct =                               [         -4, -8, -4, -8];
1354     assert(R.array == correct);
1355 }
1356 
1357 /// Compare packed signed 8-bit integers in `a` and `b`, 
1358 /// and return packed minimum values.
1359 __m128i _mm_min_epi8 (__m128i a, __m128i b) @trusted
1360 {
1361     // PERF DMD
1362     static if (GDC_with_SSE41)
1363     {
1364         return cast(__m128i) __builtin_ia32_pminsb128(cast(ubyte16)a, cast(ubyte16)b);
1365     }
1366     else version(LDC)
1367     {
1368         // x86: pminsb since LDC 1.1 -O1
1369         // ARM64: smin.16b since LDC 1.8.0 -O1
1370         byte16 sa = cast(byte16)a;
1371         byte16 sb = cast(byte16)b;
1372         byte16 greater = cast(byte16) greaterMask!byte16(sa, sb);
1373         return cast(__m128i)( (~greater & sa) | (greater & sb) );
1374     }
1375     else
1376     {
1377         __m128i lower = _mm_cmplt_epi8(a, b); // ones where a should be selected, b else
1378         __m128i aTob = _mm_xor_si128(a, b); // a ^ (a ^ b) == b
1379         __m128i mask = _mm_and_si128(aTob, lower);
1380         return _mm_xor_si128(b, mask);
1381     }
1382 }
1383 unittest
1384 {
1385     __m128i A = _mm_setr_epi8(127,  1, -4, -8, 9,    7, 0, 57, 0, 0, 0, 0, 0, 0, 0, 0);
1386     __m128i B = _mm_setr_epi8(  4, -8,  9, -7, 0, -128, 0,  0, 0, 0, 0, 0, 0, 0, 0, 0);
1387     byte16 R = cast(byte16) _mm_min_epi8(A, B);
1388     byte[16] correct =       [  4, -8, -4, -8, 0, -128, 0,  0, 0, 0, 0, 0, 0, 0, 0, 0];
1389     assert(R.array == correct);
1390 }
1391 
/// Compare packed unsigned 16-bit integers in `a` and `b`, and return packed minimum values.
1393 __m128i _mm_min_epu16 (__m128i a, __m128i b) @trusted
1394 {
1395     // PERF DMD
1396     static if (GDC_with_SSE41)
1397     {
1398         return cast(__m128i) __builtin_ia32_pminuw128(cast(short8)a, cast(short8)b);
1399     }
1400     else version(LDC)
1401     {
1402         // x86: pminuw since LDC 1.1 -O1, psubusw+psubw sequence without sse4.1
1403         // ARM64: umin.8h since LDC 1.8.0 -O1
1404         ushort8 sa = cast(ushort8)a;
1405         ushort8 sb = cast(ushort8)b;
1406         ushort8 greater = cast(ushort8) greaterMask!ushort8(sb, sa);
1407         return cast(__m128i)( (greater & sa) | (~greater & sb) );
1408     }
1409     else
1410     {
1411         __m128i c = _mm_subs_epu16(b, a);
1412         b = _mm_sub_epi16(b, c);
1413         return b;
1414     }
1415 }
1416 unittest
1417 {
1418     short8 R = cast(short8) _mm_min_epu16(_mm_setr_epi16(32767,  1, -4, -8, 9,     7, 0, 57),
1419                                           _mm_setr_epi16(   -4, -8,  9, -7, 0,-32768, 0,  0));
1420     short[8] correct =                                  [32767,  1,  9, -8, 0,     7, 0,  0];
1421     assert(R.array == correct);
1422 }
1423 
/// Compare packed unsigned 32-bit integers in `a` and `b`, and return packed minimum values.
1425 __m128i _mm_min_epu32 (__m128i a, __m128i b) @trusted
1426 {
1427     // PERF DMD
1428     static if (GDC_with_SSE41)
1429     {
1430         return cast(__m128i) __builtin_ia32_pminud128(cast(int4)a, cast(int4)b);
1431     }
1432     else version(LDC)
1433     {
1434         // x86: pminud since LDC 1.1 -O1, also good without sse4.1
1435         // ARM64: umin.4s since LDC 1.8.0 -O1
1436         uint4 sa = cast(uint4)a;
1437         uint4 sb = cast(uint4)b;
1438         uint4 greater = cast(uint4) greaterMask!uint4(sa, sb);
1439         return cast(__m128i)( (~greater & sa) | (greater & sb) );
1440     }
1441     else
1442     {
1443         __m128i valueShift = _mm_set1_epi32(-0x80000000);
1444         __m128i higher = _mm_cmpgt_epi32(_mm_add_epi32(b, valueShift), _mm_add_epi32(a, valueShift));
1445         __m128i aTob = _mm_xor_si128(a, b); // a ^ (a ^ b) == b
1446         __m128i mask = _mm_and_si128(aTob, higher);
1447         return _mm_xor_si128(b, mask);
1448     }
1449 }
1450 unittest
1451 {
1452     int4 R = cast(int4) _mm_min_epu32(_mm_setr_epi32(0x7fffffff, 1,  4, -7),
1453                                       _mm_setr_epi32(        -4,-8,  9, -8));
1454     int[4] correct =                                [0x7fffffff, 1,  4, -8];
1455     assert(R.array == correct);
1456 }
1457 
/// Horizontally compute the minimum amongst the packed unsigned 16-bit integers in `a`,
/// store the minimum in the lowest 16 bits of the result and its index in the next 16 bits, and zero the remaining bits.
1460 __m128i _mm_minpos_epu16 (__m128i a) @trusted
1461 {
1462     // PERF DMD
1463     static if (GDC_with_SSE41)
1464     {
1465         return cast(__m128i) __builtin_ia32_phminposuw128(cast(short8)a);
1466     }
1467     else static if (LDC_with_SSE41)
1468     {
1469         return cast(__m128i) __builtin_ia32_phminposuw128(cast(short8)a);
1470     }
1471     else static if (LDC_with_ARM64)
1472     {
1473         __m128i indices = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
1474         __m128i combinedLo = _mm_unpacklo_epi16(indices, a);
1475         __m128i combinedHi = _mm_unpackhi_epi16(indices, a);
1476         __m128i best = _mm_min_epu32(combinedLo, combinedHi);
1477         best = _mm_min_epu32(best, _mm_srli_si128!8(best));
1478         best = _mm_min_epu32(best, _mm_srli_si128!4(best));
1479         short8 sbest = cast(short8)best;
1480         short8 r;
        r[0] = sbest[1];
        r[1] = sbest[0]; // Note: the index sits in the low 16 bits of each combined word so that the lower index wins ties; hence the swap here.
1483         r[2] = 0;
1484         r[3] = 0;
1485         r[4] = 0;
1486         r[5] = 0;
1487         r[6] = 0;
1488         r[7] = 0;
1489         return cast(__m128i)r;
1490     }
1491     else
1492     {
1493         short8 sa = cast(short8)a;
1494         ushort min = 0xffff;
1495         int index = 0;
1496         for(int n = 0; n < 8; ++n)
1497         {
1498             ushort c = sa.array[n];
1499             if (c < min)
1500             {
1501                 min = c;
1502                 index = n;
1503             }
1504         }
1505         short8 r;
1506         r.ptr[0] = min;
1507         r.ptr[1] = cast(short)index;
1508         return cast(__m128i)r;
1509     }
1510 }
1511 unittest
1512 {
1513     __m128i A = _mm_setr_epi16(14, 15, 1, 2, -3, 4, 5, 6);
1514     __m128i B = _mm_setr_epi16(14,  4, 4, 2, -3, 2, 5, 6);
1515     short8 R1 = cast(short8) _mm_minpos_epu16(A);
1516     short8 R2 = cast(short8) _mm_minpos_epu16(B);
1517     short[8] correct1 = [1, 2, 0, 0, 0, 0, 0, 0];
1518     short[8] correct2 = [2, 3, 0, 0, 0, 0, 0, 0];
1519     assert(R1.array == correct1);
1520     assert(R2.array == correct2);
1521 }
1522 
1523 /// Compute the sum of absolute differences (SADs) of quadruplets of unsigned 8-bit integers 
1524 /// in `a` compared to those in `b`, and store the 16-bit results in dst. 
1525 /// Eight SADs are performed using one quadruplet from `b` and eight quadruplets from `a`. 
/// One quadruplet is selected from `b` starting at the offset specified in `imm8[1:0]`. 
1527 /// Eight quadruplets are formed from sequential 8-bit integers selected from `a` starting 
1528 /// at the offset specified in `imm8[2]`.
1529 __m128i _mm_mpsadbw_epu8(int imm8)(__m128i a, __m128i b) @trusted
1530 {
1531     // PERF DMD
1532     static if (GDC_with_SSE41)
1533     {
1534         return cast(__m128i) __builtin_ia32_mpsadbw128(cast(byte16)a, cast(byte16)b, cast(byte)imm8);
1535     }
1536     else static if (LDC_with_SSE41)
1537     {
1538         return cast(__m128i) __builtin_ia32_mpsadbw128(cast(byte16)a, cast(byte16)b, cast(byte)imm8);
1539     }
1540     else
1541     {
        int a_offset = ((imm8 & 4) >> 2) * 4; // Yes, the two high-order quadruplets are unaddressable...
1543         int b_offset = (imm8 & 3) * 4;
1544 
1545         byte16 ba = cast(byte16)a;
1546         byte16 bb = cast(byte16)b;
1547         short8 r;
1548 
1549         __m128i comp_b = _mm_setr_epi32(b.array[imm8 & 3], 0, b.array[imm8 & 3], 0);
1550 
1551         for (int j = 0; j < 8; j += 2)
1552         {
1553             int k = a_offset + j;
1554             __m128i comp_a = _mm_setr_epi8(ba[k+0], ba[k+1], ba[k+2], ba[k+3],
1555                                            0, 0, 0, 0, 
1556                                            ba[k+1], ba[k+2], ba[k+3], ba[k+4],
1557                                            0, 0, 0, 0);
1558             short8 diffs = cast(short8) _mm_sad_epu8(comp_a, comp_b); // reusing this wins instructions in both x86 and arm64
1559             r.ptr[j] = diffs.array[0];
1560             r.ptr[j+1] = diffs.array[4];
1561         }
1562         return cast(__m128i)r;
1563     }
1564 }
1565 unittest
1566 {
1567     __m128i A = _mm_setr_epi8(0, 1, 2, 3,  4,  5, 6,  7, 8, 9, 10, 11, 12, 13, 14, 15);
1568     __m128i B = _mm_setr_epi8(9, 1, 2, 3, -1, -1, 0, -1, 5, 5,  5,  5, 12, 13, 14, 15);
1569     short[8] correct0 = [9, 11, 13, 15, 17, 19, 21, 23];
1570     short[8] correct1 = [763, 761, 759, 757, 755, 753, 751, 749];
1571     short[8] correct4 = [17, 19, 21, 23, 25, 27, 31, 35];
1572     short[8] correct5 = [755, 753, 751, 749, 747, 745, 743, 741];
1573     short[8] correct7 = [32, 28, 24, 20, 16, 12, 8, 4];
    short8 r1 = cast(short8) _mm_mpsadbw_epu8!1(A, B);
    short8 r4 = cast(short8) _mm_mpsadbw_epu8!4(A, B);
    short8 r5 = cast(short8) _mm_mpsadbw_epu8!5(A, B);
    short8 r7 = cast(short8) _mm_mpsadbw_epu8!7(A, B);
    short8 r8 = cast(short8) _mm_mpsadbw_epu8!8(A, B);
1579     assert(r1.array == correct1);
1580     assert(r4.array == correct4);
1581     assert(r5.array == correct5);
1582     assert(r7.array == correct7);
1583     assert(r8.array == correct0);
1584 }
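
// Editor's usage sketch (not part of the original test suite): _mm_mpsadbw_epu8 pairs
// naturally with _mm_minpos_epu16 for small block matching. Here we locate the 4-byte
// window of `haystack` that best matches the first quadruplet of `needle`.
unittest
{
    __m128i haystack = _mm_setr_epi8(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
    __m128i needle   = _mm_setr_epi8(6, 7, 8, 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
    short8 sads = cast(short8) _mm_mpsadbw_epu8!0(haystack, needle); // SADs at byte offsets 0..7
    short8 best = cast(short8) _mm_minpos_epu16(cast(__m128i) sads);
    assert(best.array[0] == 0); // an exact match exists...
    assert(best.array[1] == 5); // ...at byte offset 5 in `haystack`
}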
1585 
/// Multiply the low signed 32-bit integers from each packed 64-bit element in `a` and `b`, and store the signed 64-bit results in dst.
1587 __m128i _mm_mul_epi32 (__m128i a, __m128i b) @trusted
1588 {
1589     // PERF DMD
1590     static if (GDC_with_SSE41)
1591     {
1592         return cast(__m128i) __builtin_ia32_pmuldq128(cast(int4)a, cast(int4)b);
1593     }
1594     else static if (LDC_with_SSE41)
1595     {
1596         // For some reason, clang has the builtin but it's not in IntrinsicsX86.td
1597         // Use IR instead.
        // This generates pmuldq since LDC 1.2.0 -O0 
1599         enum ir = `
1600             %ia = shufflevector <4 x i32> %0,<4 x i32> %0, <2 x i32> <i32 0, i32 2>
1601             %ib = shufflevector <4 x i32> %1,<4 x i32> %1, <2 x i32> <i32 0, i32 2>
1602             %la = sext <2 x i32> %ia to <2 x i64>
1603             %lb = sext <2 x i32> %ib to <2 x i64>
1604             %r = mul <2 x i64> %la, %lb
1605             ret <2 x i64> %r`;
1606         return cast(__m128i) LDCInlineIR!(ir, long2, int4, int4)(cast(int4)a, cast(int4)b);
1607     }
1608     else static if (LDC_with_ARM64)  
1609     {
1610         // 3 instructions since LDC 1.8 -O2
1611         // But had to make vmull_s32 be a builtin else it wouldn't optimize to smull
1612         int2 a_lo = vmovn_s64(cast(long2)a);
1613         int2 b_lo = vmovn_s64(cast(long2)b);
1614         return cast(__m128i) vmull_s32(a_lo, b_lo);
1615     }
1616     else
1617     {
1618         int4 ia = cast(int4)a;
1619         int4 ib = cast(int4)b;
1620         long2 r;
1621         r.ptr[0] = cast(long)ia.array[0] * ib.array[0];
1622         r.ptr[1] = cast(long)ia.array[2] * ib.array[2];
1623         return cast(__m128i)r;
1624     }
1625 }
1626 unittest
1627 {
1628     __m128i A = _mm_setr_epi32(61616461, 1915324654, 4564061, 3);
1629     __m128i B = _mm_setr_epi32(49716422, -915616216, -121144, 0);
1630     long2 R = cast(long2) _mm_mul_epi32(A, B);
1631     long[2] correct = [cast(long)61616461 * 49716422, cast(long)4564061 * -121144];
1632     assert(R.array == correct);
1633 }
1634 
/// Multiply the packed 32-bit integers in `a` and `b`, producing intermediate 64-bit integers, 
/// and return the low 32 bits of the intermediate integers.
1637 __m128i _mm_mullo_epi32 (__m128i a, __m128i b) @trusted
1638 {
1639     // PERF DMD
1640     // PERF GDC without SSE4.1 could be better
1641     static if (GDC_with_SSE41)
1642     {
        int4 ia = cast(int4)a;
        int4 ib = cast(int4)b;
        // Note: older GDC lacks this vector operation, but such versions also have
        // no support for -msse4.1 detection, so this branch is never taken there.
        return cast(__m128i)(ia * ib); 
1648     }
1649     else version(LDC)
1650     {
        int4 ia = cast(int4)a;
        int4 ib = cast(int4)b;
        return cast(__m128i)(ia * ib);
1654     }
1655     else
1656     {
        // DMD doesn't accept the vector multiply above.
1658         int4 ia = cast(int4)a;
1659         int4 ib = cast(int4)b;
1660         int4 r;
1661         r.ptr[0] = ia.array[0] * ib.array[0];
1662         r.ptr[1] = ia.array[1] * ib.array[1];
1663         r.ptr[2] = ia.array[2] * ib.array[2];
1664         r.ptr[3] = ia.array[3] * ib.array[3];
1665         return r;
1666     }
1667 }
1668 unittest
1669 {
1670     __m128i A = _mm_setr_epi32(61616461, 1915324654, 4564061, 3);
1671     __m128i B = _mm_setr_epi32(49716422, -915616216, -121144, 0);
1672     int4 R = cast(int4) _mm_mullo_epi32(A, B);
1673     int[4] correct = [cast(int)0xBF370D8E, cast(int)(1915324654 * -915616216), cast(int)(4564061 * -121144), 0];
1674     assert(R.array == correct);
1675 }
1676 
1677 
1678 /// Convert packed signed 32-bit integers from `a` and `b` 
1679 /// to packed 16-bit integers using unsigned saturation.
1680 __m128i _mm_packus_epi32 (__m128i a, __m128i b) @trusted
1681 {
1682     static if (GDC_with_SSE41)
1683     {
        // PERF For some reason this doesn't generate the expected packusdw instruction from the builtin???
1685         return cast(__m128i) __builtin_ia32_packusdw128(cast(short8)a, cast(short8)b);
1686     }
1687     else static if (LDC_with_SSE41)
1688     {
1689         return cast(__m128i) __builtin_ia32_packusdw128(cast(short8)a, cast(short8)b);
1690     }
1691     else static if (LDC_with_ARM64)
1692     {
        int4 z = 0;
        return cast(__m128i) vcombine_u16(vqmovn_u32(vmaxq_s32(z, cast(int4)a)),
                                          vqmovn_u32(vmaxq_s32(z, cast(int4)b)));
1697     }
1698     else
1699     {
1700         // PERF: not great without SSE4.1
1701         int4 sa = cast(int4)a;
1702         int4 sb = cast(int4)b;
1703         align(16) ushort[8] result;
1704         for (int i = 0; i < 4; ++i)
1705         {
1706             int s = sa.array[i];
1707             if (s < 0) s = 0;
1708             if (s > 65535) s = 65535;
1709             result.ptr[i] = cast(ushort)s;
1710 
1711             s = sb.array[i];
1712             if (s < 0) s = 0;
1713             if (s > 65535) s = 65535;
1714             result.ptr[i+4] = cast(ushort)s;
1715         }
1716         return *cast(__m128i*)(result.ptr);
1717     }
1718 }
1719 unittest
1720 {
1721     __m128i A = _mm_setr_epi32(100000, -100000, 1000, 0);
1722     short8 R = cast(short8) _mm_packus_epi32(A, A);
1723     short[8] correct = [cast(short)65535, 0, 1000, 0, cast(short)65535, 0, 1000, 0];
1724     assert(R.array == correct);
1725 }
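
// Editor's boundary-value sketch (not part of the original test suite): values at and
// just above the ushort range, read back as signed shorts.
unittest
{
    __m128i A = _mm_setr_epi32(-1, 65535, 65536, 32768);
    __m128i B = _mm_setr_epi32( 0,     1,     2,     3);
    short8 R = cast(short8) _mm_packus_epi32(A, B);
    short[8] correct = [0, -1, -1, cast(short)32768, 0, 1, 2, 3];
    assert(R.array == correct);
}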
1726 
1727 
1728 /// Round the packed double-precision (64-bit) floating-point elements in `a` using the 
1729 /// rounding parameter, and store the results as packed double-precision floating-point elements.
1730 /// Rounding is done according to the rounding[3:0] parameter, which can be one of:
1731 ///    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
1732 ///    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
1733 ///    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
1734 ///    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
1735 ///    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
1736 __m128d _mm_round_pd(int rounding)(__m128d a) @trusted
1737 {
1738     // PERF DMD
1739     static if (GDC_with_SSE41)
1740     {
1741         return __builtin_ia32_roundpd(a, rounding);
1742     }
1743     else static if (LDC_with_SSE41)
1744     {
1745         return __builtin_ia32_roundpd(a, rounding);
1746     }
1747     else
1748     {
1749         static if (rounding & _MM_FROUND_CUR_DIRECTION)
1750         {
1751             // Convert to 64-bit integers
1752             long lo = _mm_cvtsd_si64(a);
1753             a.ptr[0] = a.array[1];
1754             long hi = _mm_cvtsd_si64(a);
1755             return _mm_setr_pd(lo, hi);
1756         }
1757         else
1758         {
            version(GNU) pragma(inline, false); // else unittests fail with optimizations enabled
1760 
1761             uint old = _MM_GET_ROUNDING_MODE();
1762             _MM_SET_ROUNDING_MODE((rounding & 3) << 13);
1763             
1764             // Convert to 64-bit integers
1765             long lo = _mm_cvtsd_si64(a);
1766             a.ptr[0] = a.array[1];
1767             long hi = _mm_cvtsd_si64(a);
1768 
            // Convert back to double to achieve the rounding.
            // Note: the round-trip goes through a 64-bit integer, so this only works
            // for inputs whose magnitude is below 2^63; doubles at or above 2^52 have
            // no fractional part anyway.
1773             _MM_SET_ROUNDING_MODE(old);
1774             return _mm_setr_pd(lo, hi);
1775         }
1776     }
1777 }
1778 unittest
1779 {
1780     // tested in other intrinsics
1781 }
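
// Editor's usage sketch (not part of the original test suite): floor and ceiling of a
// pair of doubles, with floating-point exceptions suppressed.
unittest
{
    __m128d V = _mm_setr_pd(2.7, -2.3);
    __m128d F = _mm_round_pd!(_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC)(V);
    __m128d C = _mm_round_pd!(_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC)(V);
    double[2] correctF = [2.0, -3.0];
    double[2] correctC = [3.0, -2.0];
    assert(F.array == correctF);
    assert(C.array == correctC);
}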
1782 
1783 /// Round the packed single-precision (32-bit) floating-point elements in `a` using the 
1784 /// rounding parameter, and store the results as packed single-precision floating-point elements.
1785 /// Rounding is done according to the rounding[3:0] parameter, which can be one of:
1786 ///    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
1787 ///    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
1788 ///    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
1789 ///    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
1790 ///    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
1791 __m128 _mm_round_ps(int rounding)(__m128 a) @trusted
1792 {
1793     static if (GDC_with_SSE41)
1794     {
1795         return __builtin_ia32_roundps(a, rounding);
1796     }
1797     else static if (LDC_with_SSE41)
1798     {
1799         return __builtin_ia32_roundps(a, rounding);
1800     }
1801     else
1802     {
1803         static if (rounding & _MM_FROUND_CUR_DIRECTION)
1804         {
1805             __m128i integers = _mm_cvtps_epi32(a);
1806             return _mm_cvtepi32_ps(integers);
1807         }
1808         else
1809         {
            version(LDC) pragma(inline, false); // else _MM_SET_ROUNDING_MODE and _mm_cvtps_epi32 get reordered
1811             uint old = _MM_GET_ROUNDING_MODE();
1812             _MM_SET_ROUNDING_MODE((rounding & 3) << 13);
1813             scope(exit) _MM_SET_ROUNDING_MODE(old);
1814 
            // Convert to 32-bit integers
1816             __m128i integers = _mm_cvtps_epi32(a);
1817 
            // Convert back to float to achieve the rounding.
            // Note: the round-trip goes through 32-bit integers, so this only works
            // for inputs whose magnitude is below 2^31; floats at or above 2^23 have
            // no fractional part anyway.
1822             __m128 result = _mm_cvtepi32_ps(integers);
1823 
1824             return result;
1825         }
1826     }
1827 }
1828 unittest
1829 {
1830     // tested in other intrinsics
1831 }
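
// Editor's usage sketch (not part of the original test suite): round-to-nearest sends
// halfway cases to the even neighbour.
unittest
{
    __m128 V = _mm_setr_ps(1.5f, 2.5f, -1.5f, -2.5f);
    __m128 R = _mm_round_ps!(_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)(V);
    float[4] correct = [2.0f, 2.0f, -2.0f, -2.0f];
    assert(R.array == correct);
}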
1832 
1833 
1834 /// Round the lower double-precision (64-bit) floating-point element in `b` using the
1835 /// rounding parameter, store the result as a double-precision floating-point element 
1836 /// in the lower element of result, and copy the upper element from `a` to the upper element of result.
1837 /// Rounding is done according to the rounding[3:0] parameter, which can be one of:
1838 ///    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
1839 ///    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
1840 ///    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
1841 ///    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
1842 ///    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
1843 __m128d _mm_round_sd(int rounding)(__m128d a, __m128d b) @trusted
1844 {
1845     static if (GDC_with_SSE41)
1846     {
1847         return __builtin_ia32_roundsd(a, b, rounding);
1848     }
1849     else static if (LDC_with_SSE41)
1850     {
1851         return __builtin_ia32_roundsd(a, b, rounding);
1852     }
1853     else
1854     {
1855         static if (rounding & _MM_FROUND_CUR_DIRECTION)
1856         {
1857             // Convert to 64-bit integer
1858             long b0 = _mm_cvtsd_si64(b);
1859             a.ptr[0] = b0;
1860             return a;
1861         }
1862         else
1863         {
            version(GNU) pragma(inline, false); // else unittests fail with optimizations enabled
1865 
1866             uint old = _MM_GET_ROUNDING_MODE();
1867             _MM_SET_ROUNDING_MODE((rounding & 3) << 13);
1868             
1869             // Convert to 64-bit integer
1870             long b0 = _mm_cvtsd_si64(b);
1871             a.ptr[0] = b0;
1872 
            // Convert back to double to achieve the rounding.
            // Note: the round-trip goes through a 64-bit integer, so this only works
            // for inputs whose magnitude is below 2^63; doubles at or above 2^52 have
            // no fractional part anyway.
1877             _MM_SET_ROUNDING_MODE(old);
1878             return a;
1879         }
1880     }
1881 }
1882 unittest
1883 {
1884     // tested in other intrinsics
1885 }
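
// Editor's usage sketch (not part of the original test suite): the lower lane of `b`
// is truncated toward zero, the upper lane is copied from `a`.
unittest
{
    __m128d A = _mm_setr_pd(100.0, 200.0);
    __m128d B = _mm_setr_pd(-1.75, 9999.0);
    __m128d R = _mm_round_sd!(_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC)(A, B);
    double[2] correct = [-1.0, 200.0];
    assert(R.array == correct);
}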
1886 
1887 
1888 /// Round the lower single-precision (32-bit) floating-point element in `b` using the 
1889 /// rounding parameter, store the result as a single-precision floating-point element 
1890 /// in the lower element of result, and copy the upper 3 packed elements from `a`
1891 /// to the upper elements of result.
1892 /// Rounding is done according to the rounding[3:0] parameter, which can be one of:
1893 ///    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
1894 ///    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
1895 ///    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
1896 ///    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
1897 ///    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
1898 __m128 _mm_round_ss(int rounding)(__m128 a, __m128 b) @trusted
1899 {
1900     static if (GDC_with_SSE41)
1901     {
1902         return __builtin_ia32_roundss(a, b, rounding);
1903     }
1904     else static if (LDC_with_SSE41)
1905     {
1906         return __builtin_ia32_roundss(a, b, rounding);
1907     }
1908     else
1909     {
1910         static if (rounding & _MM_FROUND_CUR_DIRECTION)
1911         {
1912             int b0 = _mm_cvtss_si32(b);
1913             a.ptr[0] = b0;   
1914             return a;
1915         }
1916         else version(GNU)
1917         {
1918             pragma(inline, false)
1919             __m128 GDCworkaround() nothrow @nogc @trusted 
1920             {
1921                 uint old = _MM_GET_ROUNDING_MODE();
1922                 _MM_SET_ROUNDING_MODE((rounding & 3) << 13);
1923 
1924                 // Convert to 32-bit integer
1925                 int b0 = _mm_cvtss_si32(b);
1926                 a.ptr[0] = b0;       
1927 
                // Convert back to float to achieve the rounding.
                // Note: the round-trip goes through a 32-bit integer, so this only works
                // for inputs whose magnitude is below 2^31; floats at or above 2^23 have
                // no fractional part anyway.
1932                 _MM_SET_ROUNDING_MODE(old);
1933                 return a;
1934             }
1935             return GDCworkaround();
1936         }
1937         else
1938         {
1939             uint old = _MM_GET_ROUNDING_MODE();
1940             _MM_SET_ROUNDING_MODE((rounding & 3) << 13);
1941 
1942             // Convert to 32-bit integer
1943             int b0 = _mm_cvtss_si32(b);
1944             a.ptr[0] = b0;       
1945 
            // Convert back to float to achieve the rounding.
            // Note: the round-trip goes through a 32-bit integer, so this only works
            // for inputs whose magnitude is below 2^31; floats at or above 2^23 have
            // no fractional part anyway.
1950             _MM_SET_ROUNDING_MODE(old);
1951             return a;
1952         }
1953     }
1954 }
1955 unittest
1956 {
1957     // tested in other intrinsics
1958 }
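
// Editor's usage sketch (not part of the original test suite): the lower lane of `b`
// is rounded down, the three upper lanes are copied from `a`.
unittest
{
    __m128 A = _mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f);
    __m128 B = _mm_setr_ps(-2.25f, 0.0f, 0.0f, 0.0f);
    __m128 R = _mm_round_ss!(_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC)(A, B);
    float[4] correct = [-3.0f, 2.0f, 3.0f, 4.0f];
    assert(R.array == correct);
}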
1959 
1960 
1961 /// Load 128-bits of integer data from memory using a non-temporal memory hint. 
1962 /// `mem_addr` must be aligned on a 16-byte boundary or a general-protection 
1963 /// exception may be generated.
1964 __m128i _mm_stream_load_si128 (__m128i * mem_addr) @trusted
1965 {
    // BUG: see `_mm_stream_ps` for an explanation of why non-temporal moves aren't implemented here
1967     return *mem_addr; // it's a regular move instead
1968 }
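
// Editor's usage sketch (not part of the original test suite): with this implementation
// the call behaves exactly like an aligned 16-byte load.
unittest
{
    align(16) int[4] data = [1, 2, 3, 4];
    __m128i V = _mm_stream_load_si128(cast(__m128i*) data.ptr);
    int[4] correct = [1, 2, 3, 4];
    assert((cast(int4)V).array == correct);
}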
1969 
1970 
/// Return 1 if all bits in `a` are 1, otherwise return 0.
1972 int _mm_test_all_ones (__m128i a) @safe
1973 {
1974     return _mm_testc_si128(a, _mm_set1_epi32(-1));
1975 }
1976 unittest
1977 {
1978     __m128i A = _mm_set1_epi32(-1);
1979     __m128i B = _mm_set_epi32(-1, -2, -1, -1);
1980     assert(_mm_test_all_ones(A) == 1);
1981     assert(_mm_test_all_ones(B) == 0);
1982 }
1983 
/// Return 1 if all bits in `a` are 0, otherwise return 0.
// This is a #BONUS intrinsic since it is absent from the Intel Intrinsics API.
1986 int _mm_test_all_zeros (__m128i a) @safe
1987 {
1988     return _mm_testz_si128(a, _mm_set1_epi32(-1));
1989 }
1990 unittest
1991 {
1992     __m128i A = _mm_set1_epi32(0);
1993     __m128i B = _mm_set_epi32(0, 8, 0, 0);
1994     assert(_mm_test_all_zeros(A) == 1);
1995     assert(_mm_test_all_zeros(B) == 0);
1996 }
1997 
1998 /// Compute the bitwise AND of 128 bits (representing integer data) in `a` and `mask`, 
1999 /// and return 1 if the result is zero, otherwise return 0.
2000 int _mm_test_all_zeros (__m128i a, __m128i mask) @safe
2001 {
2002     return _mm_testz_si128(a, mask); // it's really the same, but with a good name
2003 }
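
// Editor's usage sketch (not part of the original test suite).
unittest
{
    __m128i A    = _mm_setr_epi32(0x50, 0, 0, 0);
    __m128i mask = _mm_setr_epi32(0x0f, -1, 0, 0);
    assert(_mm_test_all_zeros(A, mask) == 1); // no bit of A survives the mask
    assert(_mm_test_all_zeros(A, A) == 0);
}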
2004 
/// Compute the bitwise AND of 128 bits (representing integer data) in `a` and `mask`, 
/// and set ZF to 1 if the result is zero, otherwise set ZF to 0. 
/// Compute the bitwise NOT of `a` and then AND with `mask`, and set CF to 1 if the 
/// result is zero, otherwise set CF to 0. 
/// Return 1 if both the ZF and CF values are zero, otherwise return 0.
2006 int _mm_test_mix_ones_zeros (__m128i a, __m128i mask) @trusted
2007 {
2008     return _mm_testnzc_si128(a, mask);
2009 }
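
// Editor's usage sketch (not part of the original test suite): the result is 1 only when
// the masked bits of `a` contain both zeros and ones.
unittest
{
    __m128i mask  = _mm_set1_epi32(-1);
    __m128i mixed = _mm_setr_epi32(0, -1, 0, 0);
    assert(_mm_test_mix_ones_zeros(_mm_setzero_si128(), mask) == 0); // all zeros
    assert(_mm_test_mix_ones_zeros(mask, mask) == 0);                // all ones
    assert(_mm_test_mix_ones_zeros(mixed, mask) == 1);               // both present
}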
2010 
/// Compute the bitwise NOT of `a` and then AND with `b`, and return 1 if the 
/// result is zero, otherwise return 0.
/// In other words, test if all bits masked by `b` are 1 in `a`.
2014 int _mm_testc_si128 (__m128i a, __m128i b) @trusted
2015 {
2016     // PERF DMD
2017     static if (GDC_with_SSE41)
2018     {
2019         return __builtin_ia32_ptestc128(cast(long2)a, cast(long2)b);
2020     }
2021     else static if (LDC_with_SSE41)
2022     {
2023         return __builtin_ia32_ptestc128(cast(long2)a, cast(long2)b);
2024     }
2025     else static if (LDC_with_ARM64)
2026     {
        // Acceptable since LDC 1.8 -O2
2028         long2 s64 = vbicq_s64(cast(long2)b, cast(long2)a);
2029         return !(vgetq_lane_s64(s64, 0) | vgetq_lane_s64(s64, 1));
2030     }
2031     else
2032     {
2033         __m128i c = ~a & b;
2034         int[4] zero = [0, 0, 0, 0];
2035         return c.array == zero;
2036     }
2037 }
2038 unittest
2039 {
2040     __m128i A  = _mm_setr_epi32(0x01, 0x02, 0x04, 0xf8);
2041     __m128i M1 = _mm_setr_epi32(0xfe, 0xfd, 0x00, 0x00);
2042     __m128i M2 = _mm_setr_epi32(0x00, 0x00, 0x04, 0x00);
2043     assert(_mm_testc_si128(A, A) == 1);
2044     assert(_mm_testc_si128(A, M1) == 0);
2045     assert(_mm_testc_si128(A, M2) == 1);
2046 }
2047 
2048 /// Compute the bitwise AND of 128 bits (representing integer data) in `a` and `b`, 
2049 /// and set ZF to 1 if the result is zero, otherwise set ZF to 0. 
2050 /// Compute the bitwise NOT of `a` and then AND with `b`, and set CF to 1 if the 
2051 /// result is zero, otherwise set CF to 0. 
2052 /// Return 1 if both the ZF and CF values are zero, otherwise return 0.
2053 int _mm_testnzc_si128 (__m128i a, __m128i b) @trusted
2054 {
2055     // PERF DMD
2056     static if (GDC_with_SSE41)
2057     {
2058         return __builtin_ia32_ptestnzc128(cast(long2)a, cast(long2)b);
2059     }
2060     else static if (LDC_with_SSE41)
2061     {
2062         return __builtin_ia32_ptestnzc128(cast(long2)a, cast(long2)b);
2063     }
2064     else static if (LDC_with_ARM64)
2065     {
2066         long2 s640 = vandq_s64(cast(long2)b, cast(long2)a);
2067         long2 s641 = vbicq_s64(cast(long2)b, cast(long2)a);
2068 
2069         return !( !(vgetq_lane_s64(s641, 0) | vgetq_lane_s64(s641, 1))
2070                 | !(vgetq_lane_s64(s640, 0) | vgetq_lane_s64(s640, 1)) );
2071     }
2072     else
2073     {
2074         __m128i c = a & b;
2075         __m128i d = ~a & b;
2076         int[4] zero = [0, 0, 0, 0];
2077         return !( (c.array == zero) || (d.array == zero));
2078     }    
2079 }
2080 unittest
2081 {
2082     __m128i A  = _mm_setr_epi32(0x01, 0x02, 0x04, 0xf8);
2083     __m128i M  = _mm_setr_epi32(0x01, 0x40, 0x00, 0x00);
2084     __m128i Z = _mm_setzero_si128();
2085     assert(_mm_testnzc_si128(A, Z) == 0);
2086     assert(_mm_testnzc_si128(A, M) == 1);
2087     assert(_mm_testnzc_si128(A, A) == 0);
2088 }
2089 
/// Compute the bitwise AND of 128 bits (representing integer data) in `a` and `b`, 
/// and return 1 if the result is zero, otherwise return 0.
2092 /// In other words, test if all bits masked by `b` are 0 in `a`.
2093 int _mm_testz_si128 (__m128i a, __m128i b) @trusted
2094 {
2095     // PERF DMD
2096     static if (GDC_with_SSE41)
2097     {
2098         return __builtin_ia32_ptestz128(cast(long2)a, cast(long2)b);
2099     }
2100     else static if (LDC_with_SSE41)
2101     {
2102         return __builtin_ia32_ptestz128(cast(long2)a, cast(long2)b);
2103     }
2104     else static if (LDC_with_ARM64)
2105     {
        // Acceptable since LDC 1.8 -O2
2107         long2 s64 = vandq_s64(cast(long2)a, cast(long2)b);
2108         return !(vgetq_lane_s64(s64, 0) | vgetq_lane_s64(s64, 1));
2109     }
2110     else 
2111     {
2112         __m128i c = a & b;
2113         int[4] zero = [0, 0, 0, 0];
2114         return c.array == zero;
2115     }    
2116 }
2117 unittest
2118 {
2119     __m128i A  = _mm_setr_epi32(0x01, 0x02, 0x04, 0xf8);
2120     __m128i M1 = _mm_setr_epi32(0xfe, 0xfd, 0x00, 0x07);
2121     __m128i M2 = _mm_setr_epi32(0x00, 0x00, 0x04, 0x00);
2122     assert(_mm_testz_si128(A, A) == 0);
2123     assert(_mm_testz_si128(A, M1) == 1);
2124     assert(_mm_testz_si128(A, M2) == 0);
2125 }
2126