1 /**
2 * SSE4.1 intrinsics.
3 * https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#techs=SSE4_1
4 *
5 * Copyright: Guillaume Piolat 2021.
6 *            Johan Engelen 2021.
7 * License:   $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
8 */
9 module inteli.smmintrin;
10 
11 // SSE4.1 instructions
12 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#techs=SSE4_1
13 // Note: this header will work whether you have SSE4.1 enabled or not.
14 // With LDC, use "dflags-ldc": ["-mattr=+sse4.1"] or equivalent to actively
15 // generate SSE4.1 instructions.
16 // With GDC, use "dflags-gdc": ["-msse4.1"] or equivalent to generate SSE4.1 instructions.
17 
18 public import inteli.types;
19 import inteli.internals;
20 
21 // smmintrin pulls in all previous instruction set intrinsics.
22 public import inteli.tmmintrin;
23 
24 nothrow @nogc:
25 
// Elementary rounding-control flags accepted by the SSE4.1 rounding intrinsics.
enum : int
{
    _MM_FROUND_TO_NEAREST_INT = 0x00, /// SSE4.1 rounding modes
    _MM_FROUND_TO_NEG_INF     = 0x01, /// ditto
    _MM_FROUND_TO_POS_INF     = 0x02, /// ditto
    _MM_FROUND_TO_ZERO        = 0x03, /// ditto
    _MM_FROUND_CUR_DIRECTION  = 0x04, /// ditto
    _MM_FROUND_RAISE_EXC      = 0x00, /// ditto
    _MM_FROUND_NO_EXC         = 0x08, /// ditto
}

// Presets: one rounding selection OR-ed with one exception flag.
enum : int
{
    _MM_FROUND_NINT      = _MM_FROUND_RAISE_EXC | _MM_FROUND_TO_NEAREST_INT,
    _MM_FROUND_FLOOR     = _MM_FROUND_RAISE_EXC | _MM_FROUND_TO_NEG_INF,
    _MM_FROUND_CEIL      = _MM_FROUND_RAISE_EXC | _MM_FROUND_TO_POS_INF,
    _MM_FROUND_TRUNC     = _MM_FROUND_RAISE_EXC | _MM_FROUND_TO_ZERO,
    _MM_FROUND_RINT      = _MM_FROUND_RAISE_EXC | _MM_FROUND_CUR_DIRECTION,
    _MM_FROUND_NEARBYINT = _MM_FROUND_NO_EXC    | _MM_FROUND_CUR_DIRECTION,
}
40 
/// Blend packed 16-bit integers from `a` and `b` using control mask `imm8`, and store the results.
/// Lane `n` of the result comes from `b` when bit `n` of `imm8` is set, and from `a` otherwise.
// Note: changed signature, GDC needs a compile-time value for imm8.
__m128i _mm_blend_epi16(int imm8)(__m128i a, __m128i b) @trusted
{
    // PERF DMD
    static if (GDC_with_SSE41)
    {
        return cast(__m128i) __builtin_ia32_pblendw128(cast(short8)a, cast(short8)b, imm8);
    }
    else
    {
        // LDC x86 This generates pblendw since LDC 1.1 and -O2
        short8 lanesA = cast(short8)a;
        short8 lanesB = cast(short8)b;
        short8 blended;
        foreach (lane; 0 .. 8)
        {
            immutable bool takeB = (imm8 & (1 << lane)) != 0;
            blended.ptr[lane] = takeB ? lanesB.array[lane] : lanesA.array[lane];
        }
        return cast(__m128i)blended;
    }
}
unittest
{
    __m128i lo = _mm_setr_epi16(0, 1,  2,  3,  4,  5,  6,  7);
    __m128i hi = _mm_setr_epi16(8, 9, 10, 11, 12, 13, 14, 15);
    short8 mixed = cast(short8) _mm_blend_epi16!0b1001_0011(lo, hi); // 147
    short[8] expected =           [8, 9,  2,  3, 12,  5,  6, 15];
    assert(mixed.array == expected);
}
71 
72 
/// Blend packed double-precision (64-bit) floating-point elements from `a` and `b` using control mask `imm8`.
/// Bit 0 of `imm8` selects the low lane and bit 1 the high lane; a set bit takes the lane from `b`.
// Note: changed signature, GDC needs a compile-time value for `imm8`.
__m128d _mm_blend_pd(int imm8)(__m128d a, __m128d b) @trusted
{
    static assert(imm8 >= 0 && imm8 < 4);
    // PERF DMD
    static if (GDC_with_SSE41)
    {
        return cast(double2) __builtin_ia32_blendpd(cast(double2)a, cast(double2)b, imm8);
    }
    else
    {
        // LDC x86: blendpd since LDC 1.1 -O2, uses blendps after LDC 1.12
        double2 blended;
        blended.ptr[0] = (imm8 & 1) ? b.array[0] : a.array[0];
        blended.ptr[1] = (imm8 & 2) ? b.array[1] : a.array[1];
        return cast(__m128d)blended;
    }
}
unittest
{
    __m128d first  = _mm_setr_pd(0, 1);
    __m128d second = _mm_setr_pd(8, 9);
    double2 mixed = _mm_blend_pd!2(first, second);
    double[2] expected = [0, 9];
    assert(mixed.array == expected);
}
102 
103 
/// Blend packed single-precision (32-bit) floating-point elements from `a` and `b` using control mask `imm8`.
/// Bit `n` of `imm8` (n in 0..3) takes lane `n` from `b` when set, from `a` when clear.
// Note: changed signature, GDC needs a compile-time value for imm8.
__m128 _mm_blend_ps(int imm8)(__m128 a, __m128 b) @trusted
{
    // PERF DMD
    static assert(imm8 >= 0 && imm8 < 16);
    static if (GDC_with_SSE41)
    {
        return __builtin_ia32_blendps(a, b, imm8);
    }
    else version(LDC)
    {
        // LDC x86: generates blendps since LDC 1.1 -O2
        //   arm64: pretty good, two instructions worst case
        // Shuffle indices 0..3 address `a`, 4..7 address `b`.
        enum int pick0 = (imm8 & 1) ? 4 : 0;
        enum int pick1 = (imm8 & 2) ? 5 : 1;
        enum int pick2 = (imm8 & 4) ? 6 : 2;
        enum int pick3 = (imm8 & 8) ? 7 : 3;
        return shufflevectorLDC!(float4, pick0, pick1, pick2, pick3)(a, b);
    }
    else
    {
        __m128 blended; // PERF =void;
        foreach (lane; 0 .. 4)
        {
            immutable bool takeB = (imm8 & (1 << lane)) != 0;
            blended.ptr[lane] = takeB ? b.array[lane] : a.array[lane];
        }
        return blended;
    }
}
unittest
{
    __m128 first  = _mm_setr_ps(0, 1,  2,  3);
    __m128 second = _mm_setr_ps(8, 9, 10, 11);
    float4 mixed = cast(float4) _mm_blend_ps!0b1101(first, second); // 13
    float[4] expected =    [8, 1, 10, 11];
    assert(mixed.array == expected);
}
141 
/// Blend packed 8-bit integers from `a` and `b` using `mask`.
/// A result byte comes from `b` when the matching `mask` byte is negative
/// (top bit set), and from `a` otherwise.
__m128i _mm_blendv_epi8 (__m128i a, __m128i b, __m128i mask) @trusted
{
    // PERF DMD
    /*static if (GDC_with_SSE41)
    {
        // This intrinsic do nothing in GDC 12.
        // TODO report to GDC. No problem in GCC.
        return cast(__m128i) __builtin_ia32_pblendvb128 (cast(ubyte16)a, cast(ubyte16)b, cast(ubyte16)mask);
    }
    else*/
    static if (LDC_with_SSE41)
    {
        return cast(__m128i) __builtin_ia32_pblendvb(cast(byte16)a, cast(byte16)b, cast(byte16)mask);
    }
    else static if (LDC_with_ARM64)
    {
        // LDC arm64: two instructions since LDC 1.12 -O2
        // The shift by 7 spreads each byte's sign bit over the whole byte.
        return cast(__m128i) vbslq_s8(vshrq_n_s8(cast(byte16)mask, 7),
                                      cast(byte16)b,
                                      cast(byte16)a);
    }
    else
    {
        // All-ones in every byte where `mask` is negative, zero elsewhere.
        __m128i selectB = _mm_cmpgt_epi8(_mm_setzero_si128(), mask);
        __m128i difference = _mm_xor_si128(a, b);
        // The saturating subtract clears a^b where selected and keeps it elsewhere,
        // so the final xor with `b` yields `b` or `a` respectively.
        __m128i kept = _mm_subs_epu8(difference, selectB);
        return _mm_xor_si128(kept, b);
    }
}
unittest
{
    __m128i first  = _mm_setr_epi8( 0,  1,  2,  3,  4,  5,  6,  7,
                                    8,  9, 10, 11, 12, 13, 14, 15);
    __m128i second = _mm_setr_epi8(16, 17, 18, 19, 20, 21, 22, 23,
                                   24, 25, 26, 27, 28, 29, 30, 31);
    __m128i select = _mm_setr_epi8( 1, -1,  1,  1, -4,  1, -8,  127,
                                    1,  1, -1, -1,  4,  1,  8, -128);
    byte16 mixed = cast(byte16) _mm_blendv_epi8(first, second, select);
    byte[16] expected =          [  0, 17,  2,  3, 20,  5, 22,  7,
                                    8,  9, 26, 27, 12, 13, 14, 31 ];
    assert(mixed.array == expected);
}
182 
183 
/// Blend packed double-precision (64-bit) floating-point elements from `a` and `b` using `mask`.
/// A lane is taken from `b` when the sign bit of the matching `mask` lane is set, else from `a`.
__m128d _mm_blendv_pd (__m128d a, __m128d b, __m128d mask) @trusted
{
    // PERF DMD
    // PERF Amazingly enough, GCC/GDC generates the blendvpd instruction
    // with -msse4.2 but not -msse4.1.
    // Not sure what is the reason, and there is a replacement sequence.
    // Sounds like a bug. Hence the SSE4.2 requirement for GDC below.
    static if (GDC_with_SSE42 || LDC_with_SSE41)
    {
        return __builtin_ia32_blendvpd(a, b, mask);
    }
    else static if (LDC_with_ARM64)
    {
        long2 signShift;
        signShift = 63;
        // Shift by 63 on signed lanes: every bit becomes a copy of the sign bit.
        long2 selectB = cast(long2)mask >> signShift;
        return cast(__m128d) vbslq_s64(selectB, cast(long2)b, cast(long2)a);
    }
    else
    {
        long2 signs = cast(long2)mask;
        __m128d blended; // PERF =void;
        blended.ptr[0] = (signs.array[0] < 0) ? b.array[0] : a.array[0];
        blended.ptr[1] = (signs.array[1] < 0) ? b.array[1] : a.array[1];
        return blended;
    }
}
unittest
{
    __m128d first  = _mm_setr_pd(1.0, 2.0);
    __m128d second = _mm_setr_pd(3.0, 4.0);
    __m128d select1 = _mm_setr_pd(-3.0, 2.0);
    __m128d mixed1 = _mm_blendv_pd(first, second, select1);
    double[2] expected1 = [3.0, 2.0];
    assert(mixed1.array == expected1);

    // Note: wouldn't work with -double.nan, since in some AArch64 archs the NaN sign bit is lost
    // See Issue #78
    __m128d select2 = _mm_setr_pd(double.nan, double.infinity);
    __m128d mixed2 = _mm_blendv_pd(first, second, select2);
    double[2] expected2 = [1.0, 2.0];
    assert(mixed2.array == expected2);
}
234 
235 
/// Blend packed single-precision (32-bit) floating-point elements from `a` and `b` using `mask`.
/// A lane is taken from `b` when the sign bit of the matching `mask` lane is set, else from `a`.
__m128 _mm_blendv_ps (__m128 a, __m128 b, __m128 mask) @trusted
{
    // PERF DMD
    static if (GDC_with_SSE41 || LDC_with_SSE41)
    {
        return __builtin_ia32_blendvps(a, b, mask);
    }
    else static if (LDC_with_ARM64)
    {
        int4 signShift;
        signShift = 31;
        // Shift by 31 on signed lanes: every bit becomes a copy of the sign bit.
        int4 selectB = cast(int4)mask >> signShift;
        return cast(__m128) vbslq_s32(selectB, cast(int4)b, cast(int4)a);
    }
    else
    {
        int4 signs = cast(int4)mask;
        __m128 blended; // PERF =void;
        foreach (lane; 0 .. 4)
        {
            if (signs.array[lane] < 0)
                blended.ptr[lane] = b.array[lane];
            else
                blended.ptr[lane] = a.array[lane];
        }
        return blended;
    }
}
unittest
{
    __m128 first   = _mm_setr_ps( 0.0f, 1.0f, 2.0f, 3.0f);
    __m128 second  = _mm_setr_ps( 4.0f, 5.0f, 6.0f, 7.0f);
    __m128 select1 = _mm_setr_ps(-3.0f, 2.0f, 1.0f, -10000.0f);
    __m128 select2 = _mm_setr_ps(float.nan, float.nan, -0.0f, +0.0f);
    __m128 mixed1 = _mm_blendv_ps(first, second, select1);
    __m128 mixed2 = _mm_blendv_ps(first, second, select2);
    float[4] expected1 =    [ 4.0f, 1.0f, 2.0f, 7.0f];
    float[4] expected2 =    [ 0.0f, 1.0f, 6.0f, 3.0f];
    assert(mixed1.array == expected1);

    // Note: wouldn't work with -float.nan, since in some AArch64 archs the NaN sign bit is lost
    // See Issue #78
    assert(mixed2.array == expected2);
}
282 
/// Round the packed double-precision (64-bit) floating-point elements in `a` up to an integer value,
/// and store the results as packed double-precision floating-point elements.
__m128d _mm_ceil_pd (__m128d a) @trusted
{
    static if (LDC_with_ARM64)
    {
        // LDC arm64 acceptable since 1.8 -O2
        // Unfortunately x86 intrinsics force a round-trip back to double2
        // ARM neon semantics wouldn't have that
        long2 rounded = vcvtpq_s64_f64(a);
        double2 result;
        foreach (lane; 0 .. 2)
            result.ptr[lane] = rounded.array[lane];
        return result;
    }
    else
    {
        // _MM_FROUND_TO_POS_INF is the value 2.
        return _mm_round_pd!_MM_FROUND_TO_POS_INF(a);
    }
}
unittest
{
    __m128d first  = _mm_setr_pd(1.3f, -2.12f);
    __m128d second = _mm_setr_pd(53.6f, -2.7f);
    first  = _mm_ceil_pd(first);
    second = _mm_ceil_pd(second);
    double[2] expectedFirst  = [2.0, -2.0];
    double[2] expectedSecond = [54.0, -2.0];
    assert(first.array == expectedFirst);
    assert(second.array == expectedSecond);
}
314 
/// Round the packed single-precision (32-bit) floating-point elements in `a` up to an integer value,
/// and store the results as packed single-precision floating-point elements.
__m128 _mm_ceil_ps (__m128 a) @trusted
{
    static if (LDC_with_ARM64)
    {
        // LDC arm64 acceptable since 1.8 -O1
        int4 rounded = vcvtpq_s32_f32(a);
        float4 result;
        foreach (lane; 0 .. 4)
            result.ptr[lane] = rounded.array[lane];
        return result;
    }
    else
    {
        // _MM_FROUND_TO_POS_INF is the value 2.
        return _mm_round_ps!_MM_FROUND_TO_POS_INF(a);
    }
}
unittest
{
    __m128 input = _mm_setr_ps(1.3f, -2.12f, 53.6f, -2.7f);
    __m128 rounded = _mm_ceil_ps(input);
    float[4] expected = [2.0f, -2.0f, 54.0f, -2.0f];
    assert(rounded.array == expected);
}
342 
/// Round the lower double-precision (64-bit) floating-point element in `b` up to an integer value,
/// store the result as a double-precision floating-point element in the lower element of result,
/// and copy the upper element from `a` to the upper element of dst.
__m128d _mm_ceil_sd (__m128d a, __m128d b) @trusted
{
    static if (LDC_with_ARM64)
    {
        // Only lane 0 of the by-value copy of `a` is overwritten; lane 1 is returned untouched.
        a.ptr[0] = vcvtps_s64_f64(b.array[0]);
        return a;
    }
    else
    {
        // _MM_FROUND_TO_POS_INF is the value 2.
        return _mm_round_sd!_MM_FROUND_TO_POS_INF(a, b);
    }
}
unittest
{
    __m128d upperSource = _mm_setr_pd(1.3, -2.12);
    __m128d lowerSource = _mm_setr_pd(53.6, -3.7);
    __m128d result = _mm_ceil_sd(upperSource, lowerSource);
    double[2] expected = [54.0, -2.12];
    assert(result.array == expected);
}
366 
/// Round the lower single-precision (32-bit) floating-point element in `b` up to an integer value,
/// store the result as a single-precision floating-point element in the lower element of result,
/// and copy the upper 3 packed elements from `a` to the upper elements of result.
__m128 _mm_ceil_ss (__m128 a, __m128 b) @trusted
{
    static if (LDC_with_ARM64)
    {
        // Only lane 0 of the by-value copy of `a` is overwritten; lanes 1..3 are returned untouched.
        a.ptr[0] = vcvtps_s32_f32(b.array[0]);
        return a;
    }
    else
    {
        // _MM_FROUND_TO_POS_INF is the value 2.
        return _mm_round_ss!_MM_FROUND_TO_POS_INF(a, b);
    }
}
unittest
{
    __m128 upperSource = _mm_setr_ps(1.3f, -2.12f, -4.5f, 1.1f);
    __m128 lowerSource = _mm_setr_ps(53.6f, -3.7f, 8.0f, 7.0f);
    __m128 result = _mm_ceil_ss(upperSource, lowerSource);
    float[4] expected = [54.0f, -2.12f, -4.5f, 1.1f];
    assert(result.array == expected);
}
390 
/// Compare packed 64-bit integers in `a` and `b` for equality.
/// Each 64-bit result lane is all ones (-1) when the lanes are equal and 0 otherwise.
__m128i _mm_cmpeq_epi64 (__m128i a, __m128i b) @trusted
{
    // PERF DMD
    static if (GDC_with_SSE41)
    {
        return cast(__m128i)__builtin_ia32_pcmpeqq(cast(long2)a, cast(long2)b);
    }
    else version(LDC)
    {
        // LDC x86: generates pcmpeqq since LDC 1.1 -O1
        //     arm64: generates cmeq since LDC 1.8 -O1
        return cast(__m128i) equalMask!long2(cast(long2)a, cast(long2)b);
    }
    else
    {
        // Clever pcmpeqd + pand use with LDC 1.24 -O2
        long2 left  = cast(long2)a;
        long2 right = cast(long2)b;
        long2 equalLanes;
        foreach (lane; 0 .. 2)
        {
            equalLanes.ptr[lane] = (left.array[lane] == right.array[lane]) ? -1 : 0;
        }
        return cast(__m128i)equalLanes;
    }
}
unittest
{
    __m128i reference   = _mm_setr_epi64(-1, -2);
    __m128i sameHigh    = _mm_setr_epi64(-3, -2);
    __m128i sameLow     = _mm_setr_epi64(-1, -4);
    long2 highMatches = cast(long2) _mm_cmpeq_epi64(reference, sameHigh);
    long2 lowMatches  = cast(long2) _mm_cmpeq_epi64(reference, sameLow);
    long[2] expectedHigh = [0, -1];
    long[2] expectedLow  = [-1, 0];
    assert(highMatches.array == expectedHigh);
    assert(lowMatches.array == expectedLow);
}
428 
429 
/// Sign extend packed 16-bit integers in `a` to packed 32-bit integers.
/// Only the four lowest 16-bit lanes of `a` are read; each becomes one 32-bit lane of the result.
__m128i _mm_cvtepi16_epi32 (__m128i a) @trusted
{
    // PERF DMD
    static if (GDC_with_SSE41)
    {
        return cast(__m128i)__builtin_ia32_pmovsxwd128(cast(short8)a);
    }
    else version(LDC)
    {
        // LDC x86: Generates pmovsxwd since LDC 1.1 -O0, also good in arm64
        enum ir = `
            %v = shufflevector <8 x i16> %0,<8 x i16> %0, <4 x i32> <i32 0, i32 1,i32 2, i32 3>
            %r = sext <4 x i16> %v to <4 x i32>
            ret <4 x i32> %r`;
        // The inline IR returns an int4, which must be reinterpreted as the integer
        // vector type __m128i that this function returns (not as __m128d).
        return cast(__m128i) LDCInlineIR!(ir, int4, short8)(cast(short8)a);
    }
    else
    {
        short8 sa = cast(short8)a;
        int4 r;
        // Implicit short -> int conversion performs the sign extension.
        r.ptr[0] = sa.array[0];
        r.ptr[1] = sa.array[1];
        r.ptr[2] = sa.array[2];
        r.ptr[3] = sa.array[3];
        return cast(__m128i)r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi16(-1, 0, -32768, 32767, 0, 0, 0, 0);
    int4 C = cast(int4) _mm_cvtepi16_epi32(A);
    int[4] correct = [-1, 0, -32768, 32767];
    assert(C.array == correct);
}
465 
/// Sign extend packed 16-bit integers in `a` to packed 64-bit integers.
/// Only the two lowest 16-bit lanes of `a` are read.
__m128i _mm_cvtepi16_epi64 (__m128i a) @trusted
{
    // PERF DMD
    static if (GDC_with_SSE41)
    {
        return cast(__m128i)__builtin_ia32_pmovsxwq128(cast(short8)a);
    }
    else version(LDC)
    {
        // LDC x86: Generates pmovsxwq since LDC 1.1 -O0, also good in arm64
        enum ir = `
            %v = shufflevector <8 x i16> %0,<8 x i16> %0, <2 x i32> <i32 0, i32 1>
            %r = sext <2 x i16> %v to <2 x i64>
            ret <2 x i64> %r`;
        return cast(__m128i) LDCInlineIR!(ir, long2, short8)(cast(short8)a);
    }
    else
    {
        short8 narrow = cast(short8)a;
        long2 wide;
        // Implicit short -> long conversion performs the sign extension.
        foreach (lane; 0 .. 2)
            wide.ptr[lane] = narrow.array[lane];
        return cast(__m128i)wide;
    }
}
unittest
{
    __m128i input = _mm_setr_epi16(-32768, 32767, 0, 0, 0, 0, 0, 0);
    long2 widened = cast(long2) _mm_cvtepi16_epi64(input);
    long[2] expected = [-32768, 32767];
    assert(widened.array == expected);
}
499 
/// Sign extend packed 32-bit integers in `a` to packed 64-bit integers.
/// Only the two lowest 32-bit lanes of `a` are read.
__m128i _mm_cvtepi32_epi64 (__m128i a) @trusted
{
    // PERF DMD
    static if (GDC_with_SSE41)
    {
        return cast(__m128i)__builtin_ia32_pmovsxdq128(cast(int4)a);
    }
    else version(LDC)
    {
        // LDC x86: Generates pmovsxdq since LDC 1.1 -O0, also good in arm64
        enum ir = `
            %v = shufflevector <4 x i32> %0,<4 x i32> %0, <2 x i32> <i32 0, i32 1>
            %r = sext <2 x i32> %v to <2 x i64>
            ret <2 x i64> %r`;
        return cast(__m128i) LDCInlineIR!(ir, long2, int4)(cast(int4)a);
    }
    else
    {
        int4 narrow = cast(int4)a;
        long2 wide;
        // Implicit int -> long conversion performs the sign extension.
        foreach (lane; 0 .. 2)
            wide.ptr[lane] = narrow.array[lane];
        return cast(__m128i)wide;
    }
}
unittest
{
    __m128i input = _mm_setr_epi32(-4, 42, 0, 0);
    long2 widened = cast(long2) _mm_cvtepi32_epi64(input);
    long[2] expected = [-4, 42];
    assert(widened.array == expected);
}
533 
534 
/// Sign extend packed 8-bit integers in `a` to packed 16-bit integers.
/// Only the eight lowest bytes of `a` are read.
__m128i _mm_cvtepi8_epi16 (__m128i a) @trusted
{
    // PERF DMD
    static if (GDC_with_SSE41)
    {
        alias ubyte16 = __vector(ubyte[16]);
        return cast(__m128i)__builtin_ia32_pmovsxbw128(cast(ubyte16)a);
    }
    else version(LDC)
    {
        // LDC x86: pmovsxbw generated since LDC 1.1.0 -O0
        // LDC ARM64: sshll generated since LDC 1.8.0 -O1
        enum ir = `
            %v = shufflevector <16 x i8> %0,<16 x i8> %0, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
            %r = sext <8 x i8> %v to <8 x i16>
            ret <8 x i16> %r`;
        return cast(__m128i) LDCInlineIR!(ir, short8, byte16)(cast(byte16)a);
    }
    else
    {
        byte16 narrow = cast(byte16)a;
        short8 wide;
        // Implicit byte -> short conversion performs the sign extension.
        for (int lane = 0; lane < 8; ++lane)
        {
            wide.ptr[lane] = narrow.array[lane];
        }
        return cast(__m128i)wide;
    }
}
unittest
{
    __m128i input = _mm_setr_epi8(127, -128, 1, -1, 0, 2, -4, -8, 0, 0, 0, 0, 0, 0, 0, 0);
    short8 widened = cast(short8) _mm_cvtepi8_epi16(input);
    short[8] expected = [127, -128, 1, -1, 0, 2, -4, -8];
    assert(widened.array == expected);
}
570 
571 
/// Sign extend packed 8-bit integers in `a` to packed 32-bit integers.
/// Only the four lowest bytes of `a` are read.
__m128i _mm_cvtepi8_epi32 (__m128i a) @trusted
{
    // PERF DMD
    static if (GDC_with_SSE41)
    {
        alias ubyte16 = __vector(ubyte[16]);
        return cast(__m128i)__builtin_ia32_pmovsxbd128(cast(ubyte16)a);
    }
    else static if (LDC_with_SSE41)
    {
        // LDC x86: Generates pmovsxbd since LDC 1.1 -O0
        enum ir = `
            %v = shufflevector <16 x i8> %0,<16 x i8> %0, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
            %r = sext <4 x i8> %v to <4 x i32>
            ret <4 x i32> %r`;
        return cast(__m128i) LDCInlineIR!(ir, int4, byte16)(cast(byte16)a);
    }
    else
    {
        // LDC ARM64: this gives the same codegen than a vmovl_s16/vmovl_s8 sequence would
        byte16 narrow = cast(byte16)a;
        int4 wide;
        // Implicit byte -> int conversion performs the sign extension.
        wide.ptr[0] = narrow.array[0];
        wide.ptr[1] = narrow.array[1];
        wide.ptr[2] = narrow.array[2];
        wide.ptr[3] = narrow.array[3];
        return cast(__m128i)wide;
    }
}
unittest
{
    __m128i input = _mm_setr_epi8(127, -128, 1, -1, 0, 2, -4, -8, 0, 0, 0, 0, 0, 0, 0, 0);
    int4 widened = cast(int4) _mm_cvtepi8_epi32(input);
    int[4] expected = [127, -128, 1, -1];
    assert(widened.array == expected);
}
609 
610 
/// Sign extend packed 8-bit integers in the low 8 bytes of `a` to packed 64-bit integers.
/// Note: the code below only reads the two lowest bytes of `a`, one per 64-bit result lane.
__m128i _mm_cvtepi8_epi64 (__m128i a) @trusted
{
    // PERF DMD
    static if (GDC_with_SSE41)
    {
        alias ubyte16 = __vector(ubyte[16]);
        return cast(__m128i)__builtin_ia32_pmovsxbq128(cast(ubyte16)a);
    }
    else version(LDC)
    {
        // LDC x86: Generates pmovsxbq since LDC 1.1 -O0,
        // LDC arm64: it's ok since LDC 1.8 -O1
        enum ir = `
            %v = shufflevector <16 x i8> %0,<16 x i8> %0, <2 x i32> <i32 0, i32 1>
            %r = sext <2 x i8> %v to <2 x i64>
            ret <2 x i64> %r`;
        return cast(__m128i) LDCInlineIR!(ir, long2, byte16)(cast(byte16)a);
    }
    else
    {
        byte16 narrow = cast(byte16)a;
        long2 wide;
        // Implicit byte -> long conversion performs the sign extension.
        for (int lane = 0; lane < 2; ++lane)
        {
            wide.ptr[lane] = narrow.array[lane];
        }
        return cast(__m128i)wide;
    }
}
unittest
{
    __m128i input = _mm_setr_epi8(127, -128, 1, -1, 0, 2, -4, -8, 0, 0, 0, 0, 0, 0, 0, 0);
    long2 widened = cast(long2) _mm_cvtepi8_epi64(input);
    long[2] expected = [127, -128];
    assert(widened.array == expected);
}
646 
647 
/// Zero extend packed unsigned 16-bit integers in `a` to packed 32-bit integers.
/// Only the four lowest 16-bit lanes of `a` are read.
__m128i _mm_cvtepu16_epi32 (__m128i a) @trusted
{
    // PERF DMD
    static if (GDC_with_SSE41)
    {
        return cast(__m128i) __builtin_ia32_pmovzxwd128(cast(short8)a);
    }
    else
    {
        // LDC x86: generates pmovzxwd since LDC 1.12 -O1 also good without SSE4.1
        //     arm64: ushll since LDC 1.12 -O1
        short8 narrow = cast(short8)a;
        int4 wide;
        // Going through ushort first makes the widening a zero extension.
        wide.ptr[0] = cast(ushort) narrow.array[0];
        wide.ptr[1] = cast(ushort) narrow.array[1];
        wide.ptr[2] = cast(ushort) narrow.array[2];
        wide.ptr[3] = cast(ushort) narrow.array[3];
        return cast(__m128i)wide;
    }
}
unittest
{
    __m128i input = _mm_setr_epi16(-1, 0, -32768, 32767, 0, 0, 0, 0);
    int4 widened = cast(int4) _mm_cvtepu16_epi32(input);
    int[4] expected = [65535, 0, 32768, 32767];
    assert(widened.array == expected);
}
676 
677 
/// Zero extend packed unsigned 16-bit integers in `a` to packed 64-bit integers.
/// Only the two lowest 16-bit lanes of `a` are read.
__m128i _mm_cvtepu16_epi64 (__m128i a) @trusted
{
    // PERF DMD
    static if (GDC_with_SSE41)
    {
        return cast(__m128i) __builtin_ia32_pmovzxwq128(cast(short8)a);
    }
    else static if (LDC_with_ARM64)
    {
        // LDC arm64: a bit shorter than below, in -O2
        short8 narrow = cast(short8)a;
        long2 wide;
        for (int lane = 0; lane < 2; ++lane)
        {
            // Going through ushort first makes the widening a zero extension.
            wide.ptr[lane] = cast(ushort) narrow.array[lane];
        }
        return cast(__m128i)wide;
    }
    else
    {
        // LDC x86: generates pmovzxwq since LDC 1.12 -O1 also good without SSE4.1
        short8 narrow = cast(short8)a;
        long2 wide;
        // Kept unrolled on purpose: the loop form is reserved for the arm64 branch.
        wide.ptr[0] = cast(ushort) narrow.array[0];
        wide.ptr[1] = cast(ushort) narrow.array[1];
        return cast(__m128i)wide;
    }
}
unittest
{
    __m128i input = _mm_setr_epi16(-1, 0, -32768, 32767, 0, 0, 0, 0);
    long2 widened = cast(long2) _mm_cvtepu16_epi64(input);
    long[2] expected = [65535, 0];
    assert(widened.array == expected);
}
712 
713 
/// Zero extend packed unsigned 32-bit integers in `a` to packed 64-bit integers.
/// Only the two lowest 32-bit lanes of `a` are read.
__m128i _mm_cvtepu32_epi64 (__m128i a) @trusted
{
    // PERF DMD
    static if (GDC_with_SSE41)
    {
        // pmovzxdq consumes a vector of four 32-bit integers, so the operand is
        // passed as int4 (as the signed sibling _mm_cvtepi32_epi64 does), not short8.
        return cast(__m128i) __builtin_ia32_pmovzxdq128(cast(int4)a);
    }
    else
    {
        // LDC x86: generates pmovzxdq since LDC 1.12 -O1 also good without SSE4.1
        //     arm64: generates ushll since LDC 1.12 -O1
        int4 sa = cast(int4)a;
        long2 r;
        // Going through uint first makes the widening a zero extension.
        r.ptr[0] = cast(uint)sa.array[0];
        r.ptr[1] = cast(uint)sa.array[1];
        return cast(__m128i)r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi32(-1, 42, 0, 0);
    long2 C = cast(long2) _mm_cvtepu32_epi64(A);
    long[2] correct = [4294967295, 42];
    assert(C.array == correct);
}
740 
741 
/// Zero extend packed unsigned 8-bit integers in `a` to packed 16-bit integers.
/// Only the eight lowest bytes of `a` are read.
__m128i _mm_cvtepu8_epi16 (__m128i a) @trusted
{
    // PERF DMD
    static if (GDC_with_SSE41)
    {
        // Declared locally, as the other GDC byte-extension branches in this file do,
        // so this branch does not depend on `ubyte16` being visible at module scope.
        alias ubyte16 = __vector(ubyte[16]);
        return cast(__m128i) __builtin_ia32_pmovzxbw128(cast(ubyte16)a);
    }
    else
    {
        // LDC x86: generates pmovzxbw since LDC 1.12 -O1 also good without SSE4.1
        //     arm64: ushll since LDC 1.12 -O1
        // PERF: catastrophic with GDC without SSE4.1
        byte16 sa = cast(byte16)a;
        short8 r;
        // Going through ubyte first makes the widening a zero extension.
        r.ptr[0] = cast(ubyte)sa.array[0];
        r.ptr[1] = cast(ubyte)sa.array[1];
        r.ptr[2] = cast(ubyte)sa.array[2];
        r.ptr[3] = cast(ubyte)sa.array[3];
        r.ptr[4] = cast(ubyte)sa.array[4];
        r.ptr[5] = cast(ubyte)sa.array[5];
        r.ptr[6] = cast(ubyte)sa.array[6];
        r.ptr[7] = cast(ubyte)sa.array[7];
        return cast(__m128i)r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi8(127, -128, 1, -1, 0, 2, -4, -8, 0, 0, 0, 0, 0, 0, 0, 0);
    short8 C = cast(short8) _mm_cvtepu8_epi16(A);
    short[8] correct = [127, 128, 1, 255, 0, 2, 252, 248];
    assert(C.array == correct);
}
775 
776 
/// Zero extend packed unsigned 8-bit integers in `a` to packed 32-bit integers.
/// Only the four lowest bytes of `a` are read.
__m128i _mm_cvtepu8_epi32 (__m128i a) @trusted
{
    // PERF DMD
    static if (GDC_with_SSE41)
    {
        alias ubyte16 = __vector(ubyte[16]);
        return cast(__m128i) __builtin_ia32_pmovzxbd128(cast(ubyte16)a);
    }
    else static if (LDC_with_ARM64)
    {
        // LDC arm64: a bit better than below in -O2
        byte16 narrow = cast(byte16)a;
        int4 wide;
        for (int lane = 0; lane < 4; ++lane)
        {
            // Going through ubyte first makes the widening a zero extension.
            wide.ptr[lane] = cast(ubyte) narrow.array[lane];
        }
        return cast(__m128i)wide;
    }
    else
    {
        // LDC x86: generates pmovzxbd since LDC 1.12 -O1 also good without SSE4.1
        // PERF: catastrophic with GDC without SSE4.1
        byte16 narrow = cast(byte16)a;
        int4 wide;
        // Kept unrolled on purpose: the loop form is reserved for the arm64 branch.
        wide.ptr[0] = cast(ubyte) narrow.array[0];
        wide.ptr[1] = cast(ubyte) narrow.array[1];
        wide.ptr[2] = cast(ubyte) narrow.array[2];
        wide.ptr[3] = cast(ubyte) narrow.array[3];
        return cast(__m128i)wide;
    }
}
unittest
{
    __m128i input = _mm_setr_epi8(127, -128, 1, -1, 0, 2, -4, -8, 0, 0, 0, 0, 0, 0, 0, 0);
    int4 widened = cast(int4) _mm_cvtepu8_epi32(input);
    int[4] expected = [127, 128, 1, 255];
    assert(widened.array == expected);
}
815 
/// Zero extend packed unsigned 8-bit integers in the low 8 bytes of `a` to packed 64-bit integers.
/// Note: the code below only reads the two lowest bytes of `a`, one per 64-bit result lane.
__m128i _mm_cvtepu8_epi64 (__m128i a) @trusted
{
    // PERF DMD
    static if (GDC_with_SSE41)
    {
        alias ubyte16 = __vector(ubyte[16]);
        return cast(__m128i)__builtin_ia32_pmovzxbq128(cast(ubyte16)a);
    }
    else static if (LDC_with_ARM64)
    {
        // LDC arm64: this optimizes better than the loop below
        byte16 narrow = cast(byte16)a;
        long2 wide;
        for (int lane = 0; lane < 2; ++lane)
        {
            // Going through ubyte first makes the widening a zero extension.
            wide.ptr[lane] = cast(ubyte) narrow.array[lane];
        }
        return cast(__m128i)wide;
    }
    else
    {
        // LDC x86: Generates pmovzxbq since LDC 1.1 -O0, a pshufb without SSE4.1
        byte16 narrow = cast(byte16)a;
        long2 wide;
        wide.ptr[0] = cast(ubyte) narrow.array[0];
        wide.ptr[1] = cast(ubyte) narrow.array[1];
        return cast(__m128i)wide;
    }
}
unittest
{
    __m128i input = _mm_setr_epi8(127, -2, 1, -1, 0, 2, -4, -8, 0, 0, 0, 0, 0, 0, 0, 0);
    long2 widened = cast(long2) _mm_cvtepu8_epi64(input);
    long[2] expected = [127, 254];
    assert(widened.array == expected);
}
851 
/// Conditionally multiply the packed double-precision (64-bit) floating-point elements
/// in `a` and `b` using bits 4 and 5 of `imm8`, sum the two products, and conditionally
/// store the sum in each result lane using bits 0 and 1 of `imm8` (unselected lanes are zero).
/// All other bits of `imm8` are ignored.
__m128d _mm_dp_pd(int imm8)(__m128d a, __m128d b) @trusted
{
    // PERF DMD
    static if (GDC_with_SSE41 || LDC_with_SSE41)
    {
        // Only bits 0, 1, 4 and 5 are forwarded to the builtin.
        return __builtin_ia32_dppd(a, b, imm8 & 0x33);
    }
    else
    {
        enum int productMask = (imm8 >>> 4) & 3; // which lane products enter the sum
        enum int storeMask   = imm8 & 3;         // which result lanes receive the sum

        __m128d zero = _mm_setzero_pd();
        __m128d products = _mm_blend_pd!productMask(zero, a * b);
        double sum = products.array[0] + products.array[1];
        return _mm_blend_pd!storeMask(zero, _mm_set1_pd(sum));
    }
}
unittest
{
    __m128d left  = _mm_setr_pd(1.0, 2.0);
    __m128d right = _mm_setr_pd(4.0, 8.0);
    double2 dot1 = _mm_dp_pd!(0x10 + 0x3 + 0x44)(left, right);
    double2 dot2 = _mm_dp_pd!(0x20 + 0x1 + 0x88)(left, right);
    double2 dot3 = _mm_dp_pd!(0x30 + 0x2 + 0x00)(left, right);
    double[2] expected1 = [ 4.0,  4.0];
    double[2] expected2 = [16.0,  0.0];
    double[2] expected3 = [ 0.0, 20.0];
    assert(dot1.array == expected1);
    assert(dot2.array == expected2);
    assert(dot3.array == expected3);
}
888 
/// Conditionally multiply the packed single-precision (32-bit) floating-point elements
/// in `a` and `b` using the high 4 bits in `imm8`, sum the four products,
/// and conditionally store the sum in result using the low 4 bits of `imm8`.
/// Result lanes whose low bit is clear are zero.
__m128 _mm_dp_ps(int imm8)(__m128 a, __m128 b) @trusted
{
    // PERF DMD
    static if (GDC_with_SSE41)
    {
        return __builtin_ia32_dpps(a, b, cast(ubyte)imm8);
    }
    else static if (LDC_with_SSE41)
    {
        return __builtin_ia32_dpps(a, b, cast(byte)imm8);
    }
    else
    {
        enum int productMask = (imm8 >>> 4) & 15; // which lane products enter the sum
        enum int storeMask   = imm8 & 15;         // which result lanes receive the sum

        __m128 zero = _mm_setzero_ps();
        __m128 products = _mm_blend_ps!productMask(zero, a * b);
        float sum = products.array[0] + products.array[1] + products.array[2] + products.array[3];
        return _mm_blend_ps!storeMask(zero, _mm_set1_ps(sum));
    }
}
unittest
{
    __m128 left  = _mm_setr_ps(1.0f, 2.0f, 4.0f, 8.0f);
    __m128 right = _mm_setr_ps(9.0f, 7.0f, 5.0f, 3.0f);
    float4 dot1 = _mm_dp_ps!(0xf0 + 0xf)(left, right);
    float4 dot2 = _mm_dp_ps!(0x30 + 0x5)(left, right);
    float4 dot3 = _mm_dp_ps!(0x50 + 0xa)(left, right);
    float[4] expected1 =   [67.0f, 67.0f, 67.0f, 67.0f];
    float[4] expected2 =   [23.0f, 0.0f, 23.0f, 0.0f];
    float[4] expected3 =   [0.0f, 29.0f, 0.0f, 29.0f];
    assert(dot1.array == expected1);
    assert(dot2.array == expected2);
    assert(dot3.array == expected3);
}
925 
926 
/// Return the 32-bit integer lane of `a` selected by the two low bits of `imm8`.
/// Higher bits of `imm8` are ignored.
int _mm_extract_epi32 (__m128i a, const int imm8) pure @trusted
{
    int4 lanes = cast(int4)a;
    const int lane = imm8 & 3;
    return lanes.array[lane];
}
unittest
{
    __m128i V = _mm_setr_epi32(1, 2, 3, 4);
    assert(_mm_extract_epi32(V, 0) == 1);
    // Bits above the low two must not change the selected lane.
    assert(_mm_extract_epi32(V, 1 + 8) == 2);
    assert(_mm_extract_epi32(V, 3 + 4) == 4);
}
939 
/// Return the 64-bit integer lane of `a` selected by the lowest bit of `imm8`.
/// Higher bits of `imm8` are ignored.
long _mm_extract_epi64 (__m128i a, const int imm8) pure @trusted
{
    const int lane = imm8 & 1;
    return (cast(long2)a).array[lane];
}
unittest
{
    __m128i V = _mm_setr_epi64(45, -67);
    assert(_mm_extract_epi64(V, 0) == 45);
    assert(_mm_extract_epi64(V, 1) == -67);
    // Index 2 wraps back to lane 0.
    assert(_mm_extract_epi64(V, 2) == 45);
}
953 
/// Return the 8-bit lane of `a` selected by the four low bits of `imm8`.
/// Warning: the byte is zero-extended (not sign-extended) to 32 bits.
int _mm_extract_epi8 (__m128i a, const int imm8) @trusted
{
    byte16 lanes = cast(byte16)a;
    byte picked = lanes.array[imm8 & 15];
    return cast(ubyte) picked; // going through ubyte gives the zero-extension
}
unittest
{
    __m128i V = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, -1, 14, 15);
    assert(_mm_extract_epi8(V, 7) == 7);
    // -1 comes back as 255, not -1.
    assert(_mm_extract_epi8(V, 13) == 255);
    // Bits above the low four are ignored.
    assert(_mm_extract_epi8(V, 7 + 16) == 7);
}
968 
/// Return the bit pattern of the single-precision (32-bit) floating-point lane of `a`
/// selected by the two low bits of `imm8`.
/// Note: the result is a 32-bit $(I integer) holding the raw bits, not a `float`.
int _mm_extract_ps (__m128 a, const int imm8) @trusted
{
    int4 rawBits = cast(int4)a;
    return rawBits.array[imm8 & 3];
}
unittest
{
    __m128 V = _mm_setr_ps(1.0f, 2.0f, 3.0f, -4.0f);
    assert(_mm_extract_ps(V, 0) == 0x3f800000);
    // Bits above the low two are ignored.
    assert(_mm_extract_ps(V, 1 + 8) == 0x40000000);
    assert(_mm_extract_ps(V, 3 + 4) == cast(int)0xc0800000);
}
982 
983 
984 
/// Round each packed double-precision (64-bit) floating-point element of `a` down
/// (toward negative infinity) to an integer value, returned as packed doubles.
__m128d _mm_floor_pd (__m128d a) @trusted
{
    static if (LDC_with_ARM64)
    {
        // LDC arm64 acceptable since 1.8 -O2
        long2 floored = vcvtmq_s64_f64(a);
        double2 result;
        result.ptr[0] = floored.array[0];
        result.ptr[1] = floored.array[1];
        return result;
    }
    else
    {
        // _MM_FROUND_FLOOR has the value 1 (round toward negative infinity).
        return _mm_round_pd!_MM_FROUND_FLOOR(a);
    }
}
unittest
{
    __m128d first  = _mm_setr_pd(1.3f, -2.12f);
    __m128d second = _mm_setr_pd(53.6f, -2.7f);
    first  = _mm_floor_pd(first);
    second = _mm_floor_pd(second);
    double[2] expectedFirst  = [1.0, -3.0];
    double[2] expectedSecond = [53.0, -3.0];
    assert(first.array == expectedFirst);
    assert(second.array == expectedSecond);
}
1014 
/// Round each packed single-precision (32-bit) floating-point element of `a` down
/// (toward negative infinity) to an integer value, returned as packed floats.
__m128 _mm_floor_ps (__m128 a) @trusted
{
    static if (LDC_with_ARM64)
    {
        // LDC arm64 acceptable since 1.8 -O1
        int4 floored = vcvtmq_s32_f32(a);
        float4 result;
        result.ptr[0] = floored.array[0];
        result.ptr[1] = floored.array[1];
        result.ptr[2] = floored.array[2];
        result.ptr[3] = floored.array[3];
        return result;
    }
    else
    {
        // _MM_FROUND_FLOOR has the value 1 (round toward negative infinity).
        return _mm_round_ps!_MM_FROUND_FLOOR(a);
    }
}
unittest
{
    __m128 input = _mm_setr_ps(1.3f, -2.12f, 53.6f, -2.7f);
    __m128 floored = _mm_floor_ps(input);
    float[4] expected = [1.0f, -3.0f, 53.0f, -3.0f];
    assert(floored.array == expected);
}
1042 
/// Round the lower double-precision (64-bit) floating-point element of `b` down to an
/// integer value and return it in the lower element of the result; the upper element
/// of the result is copied from `a`.
__m128d _mm_floor_sd (__m128d a, __m128d b) @trusted
{
    static if (LDC_with_ARM64)
    {
        // Only lane 0 of `a` is overwritten; lane 1 passes through.
        a[0] = vcvtms_s64_f64(b[0]);
        return a;
    }
    else
    {
        // _MM_FROUND_FLOOR has the value 1 (round toward negative infinity).
        return _mm_round_sd!_MM_FROUND_FLOOR(a, b);
    }
}
unittest
{
    __m128d upperSource = _mm_setr_pd(1.3, -2.12);
    __m128d lowerSource = _mm_setr_pd(-53.1, -3.7);
    __m128d result = _mm_floor_sd(upperSource, lowerSource);
    double[2] expected = [-54.0, -2.12];
    assert(result.array == expected);
}
1066 
/// Round the lower single-precision (32-bit) floating-point element of `b` down to an
/// integer value and return it in the lower element of the result; the upper 3 elements
/// of the result are copied from `a`.
__m128 _mm_floor_ss (__m128 a, __m128 b) @trusted
{
    static if (LDC_with_ARM64)
    {
        // Only lane 0 of `a` is overwritten; lanes 1..3 pass through.
        a[0] = vcvtms_s32_f32(b[0]);
        return a;
    }
    else
    {
        // _MM_FROUND_FLOOR has the value 1 (round toward negative infinity).
        return _mm_round_ss!_MM_FROUND_FLOOR(a, b);
    }
}
unittest
{
    __m128 upperSource = _mm_setr_ps(1.3f, -2.12f, -4.5f, 1.1f);
    __m128 lowerSource = _mm_setr_ps(-539.3f, -3.7f, 8.0f, 7.0f);
    __m128 result = _mm_floor_ss(upperSource, lowerSource);
    float[4] expected = [-540.0f, -2.12f, -4.5f, 1.1f];
    assert(result.array == expected);
}
1090 
/// Return a copy of `a` whose 32-bit lane selected by `imm8[1:0]` is replaced by `i`.
/// Higher bits of `imm8` are ignored.
__m128i _mm_insert_epi32 (__m128i a, int i, const int imm8) pure @trusted
{
    // GDC: nothing special to do, pinsrd generated with -O1 -msse4.1
    // LDC x86: pinsrd since LDC 1.1 -O2 with -mattr=+sse4.1
    // LDC arm64: ins.s since LDC 1.8 -O2
    const int lane = imm8 & 3;
    int4 lanes = cast(int4)a;
    lanes.ptr[lane] = i;
    return cast(__m128i)lanes;
}
unittest
{
    __m128i source = _mm_setr_epi32(1, 2, 3, 4);
    // 2 + 4: the extra bit must not change the target lane.
    int4 patched = cast(int4) _mm_insert_epi32(source, 5, 2 + 4);
    int[4] expected = [1, 2, 5, 4];
    assert(patched.array == expected);
}
1108 
/// Return a copy of `a` whose 64-bit lane selected by `imm8[0]` is replaced by `i`.
/// Higher bits of `imm8` are ignored.
__m128i _mm_insert_epi64 (__m128i a, long i, const int imm8) pure @trusted
{
    // GDC: nothing special to do, pinsrq generated with -O1 -msse4.1
    // LDC x86: always do something sensible.
    const int lane = imm8 & 1;
    long2 lanes = cast(long2)a;
    lanes.ptr[lane] = i;
    return cast(__m128i)lanes;
}
unittest
{
    __m128i source = _mm_setr_epi64(1, 2);
    // 1 + 2: the extra bit must not change the target lane.
    long2 patched = cast(long2) _mm_insert_epi64(source, 5, 1 + 2);
    long[2] expected = [1, 5];
    assert(patched.array == expected);
}
1125 
/// Return a copy of `a` whose 8-bit lane selected by `imm8[3:0]` is replaced by the
/// lower 8 bits of `i`. Higher bits of `imm8` are ignored.
__m128i _mm_insert_epi8 (__m128i a, int i, const int imm8) @trusted
{
    // GDC: nothing special to do, pinsrb generated with -O1 -msse4.1
    // LDC x86: doesn't do pinsrb, maybe it's slower. arm64 also spills to memory.
    const int lane = imm8 & 15;
    byte16 lanes = cast(byte16)a;
    lanes.ptr[lane] = cast(byte)i; // keeps only the low byte of `i`
    return cast(__m128i)lanes;
}
unittest
{
    __m128i source = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
    // 4 + 16: the extra bit must not change the target lane.
    byte16 patched = cast(byte16) _mm_insert_epi8(source, 30, 4 + 16);
    byte[16] expected = [0, 1, 2, 3, 30, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15];
    assert(patched.array == expected);
}
1143 
1144 
/// Warning: this is not the floating-point counterpart of `_mm_insert_epi32`!
/// Start from a copy of `a`, overwrite the lane selected by `imm8[5:4]` with the lane of
/// `b` selected by `imm8[7:6]`, then zero every lane whose bit is set in `imm8[3:0]`.
__m128 _mm_insert_ps(int imm8)(__m128 a, __m128 b) @trusted
{
    // PERF DMD
    static if (GDC_with_SSE41)
    {
        return __builtin_ia32_insertps128(a, b, cast(ubyte)imm8);
    }
    else static if (LDC_with_SSE41)
    {
        return __builtin_ia32_insertps128(a, b, cast(byte)imm8);
    }
    else
    {
        enum int sourceLane = (imm8 >> 6) & 3; // lane read from `b`
        enum int targetLane = (imm8 >> 4) & 3; // lane written in the copy of `a`
        enum int zeroMask   = imm8 & 15;       // lanes cleared afterwards

        float4 merged = a;
        merged.ptr[targetLane] = b.array[sourceLane];
        return _mm_blend_ps!zeroMask(merged, _mm_setzero_ps());
    }
}
unittest
{
    __m128 base  = _mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f);
    __m128 donor = _mm_setr_ps(5.0f, 6.0f, 7.0f, 8.0f);
    // Take lane 2 of donor (128), write it to lane 3 (32 + 16), then zero lane 2 (4).
    __m128 result = _mm_insert_ps!(128 + (32 + 16) + 4)(base, donor);
    float[4] expected =    [1.0f, 2.0f, 0.0f, 7.0f];
    assert(result.array == expected);
}
1176 
1177 
/// Lane-wise maximum of the packed signed 32-bit integers in `a` and `b`.
__m128i _mm_max_epi32 (__m128i a, __m128i b) @trusted
{
    static if (GDC_with_SSE41)
    {
        return cast(__m128i) __builtin_ia32_pmaxsd128(cast(int4)a, cast(int4)b);
    }
    else version(LDC)
    {
        // x86: pmaxsd since LDC 1.1 -O1
        // ARM: smax.4s since LDC 1.8 -O1
        int4 lhs = cast(int4)a;
        int4 rhs = cast(int4)b;
        // Used as a per-lane select mask (presumably all-ones where lhs > rhs).
        int4 lhsWins = greaterMask!int4(lhs, rhs);
        return cast(__m128i)( (lhsWins & lhs) | (~lhsWins & rhs) );
    }
    else
    {
        // Branch-free select: b ^ ((a ^ b) & mask) is `a` where mask is set, `b` elsewhere.
        __m128i aIsGreater = _mm_cmpgt_epi32(a, b);
        __m128i toggle = _mm_and_si128(_mm_xor_si128(a, b), aIsGreater);
        return _mm_xor_si128(b, toggle);
    }
}
unittest
{
    int4 maxima = cast(int4) _mm_max_epi32(_mm_setr_epi32(0x7fffffff, 1, -4, 7),
                                           _mm_setr_epi32(        -4,-8,  9, -8));
    int[4] expected =                                   [0x7fffffff, 1,  9,  7];
    assert(maxima.array == expected);
}
1209 
/// Lane-wise maximum of the packed signed 8-bit integers in `a` and `b`.
__m128i _mm_max_epi8 (__m128i a, __m128i b) @trusted
{
    // PERF DMD
    static if (GDC_with_SSE41)
    {
        return cast(__m128i) __builtin_ia32_pmaxsb128(cast(ubyte16)a, cast(ubyte16)b);
    }
    else version(LDC)
    {
        // x86: pmaxsb since LDC 1.1 -O1
        // ARM64: smax.16b since LDC 1.8.0 -O1
        byte16 lhs = cast(byte16)a;
        byte16 rhs = cast(byte16)b;
        // Used as a per-lane select mask (presumably all-ones where lhs > rhs).
        byte16 lhsWins = cast(byte16) greaterMask!byte16(lhs, rhs);
        return cast(__m128i)( (lhsWins & lhs) | (~lhsWins & rhs) );
    }
    else
    {
        // Branch-free select: b ^ ((a ^ b) & mask) is `a` where mask is set, `b` elsewhere.
        __m128i aIsGreater = _mm_cmpgt_epi8(a, b);
        __m128i toggle = _mm_and_si128(_mm_xor_si128(a, b), aIsGreater);
        return _mm_xor_si128(b, toggle);
    }
}
unittest
{
    __m128i lhs = _mm_setr_epi8(127,  1, -4, -8, 9,    7, 0, 57, 0, 0, 0, 0, 0, 0, 0, 0);
    __m128i rhs = _mm_setr_epi8(  4, -8,  9, -7, 0, -128, 0,  0, 0, 0, 0, 0, 0, 0, 0, 0);
    byte16 maxima = cast(byte16) _mm_max_epi8(lhs, rhs);
    byte[16] expected =        [127,  1,  9, -7, 9,    7, 0, 57, 0, 0, 0, 0, 0, 0, 0, 0];
    assert(maxima.array == expected);
}
1244 
/// Lane-wise maximum of the packed unsigned 16-bit integers in `a` and `b`.
__m128i _mm_max_epu16 (__m128i a, __m128i b) @trusted
{
    // PERF DMD
    static if (GDC_with_SSE41)
    {
        return cast(__m128i) __builtin_ia32_pmaxuw128(cast(short8)a, cast(short8)b);
    }
    else version(LDC)
    {
        // x86: pmaxuw since LDC 1.1 -O1
        // ARM64: umax.8h since LDC 1.8.0 -O1
        // PERF: without sse4.1, LLVM 12 produces a very interesting
        //          psubusw xmm0, xmm1
        //          paddw   xmm0, xmm1
        //       sequence that maybe should go in other min/max intrinsics? 
        ushort8 lhs = cast(ushort8)a;
        ushort8 rhs = cast(ushort8)b;
        // Used as a per-lane select mask (presumably all-ones where lhs > rhs).
        ushort8 lhsWins = cast(ushort8) greaterMask!ushort8(lhs, rhs);
        return cast(__m128i)( (lhsWins & lhs) | (~lhsWins & rhs) );
    }
    else
    {
        // The saturating difference (b - a) is 0 when b <= a, so adding `a` back
        // yields `a` in that case and `b` otherwise.
        __m128i excess = _mm_subs_epu16(b, a);
        return _mm_add_epi16(excess, a);
    }
}
unittest
{
    // Negative literals stand for large unsigned values (e.g. -4 is 65532).
    short8 maxima = cast(short8) _mm_max_epu16(_mm_setr_epi16(32767,  1, -4, -8, 9,     7, 0, 57),
                                               _mm_setr_epi16(   -4, -8,  9, -7, 0,-32768, 0,  0));
    short[8] expected =                                      [   -4, -8, -4, -7, 9,-32768, 0, 57];
    assert(maxima.array == expected);
}
1280 
/// Lane-wise maximum of the packed unsigned 32-bit integers in `a` and `b`.
__m128i _mm_max_epu32 (__m128i a, __m128i b) @trusted
{
    // PERF DMD
    static if (GDC_with_SSE41)
    {
        return cast(__m128i) __builtin_ia32_pmaxud128(cast(int4)a, cast(int4)b);
    }
    else version(LDC)
    {
        // x86: pmaxud since LDC 1.1 -O1, also good without sse4.1
        // ARM64: umax.4s since LDC 1.8.0 -O1
        uint4 lhs = cast(uint4)a;
        uint4 rhs = cast(uint4)b;
        // Used as a per-lane select mask (presumably all-ones where lhs > rhs).
        uint4 lhsWins = cast(uint4) greaterMask!uint4(lhs, rhs);
        return cast(__m128i)( (lhsWins & lhs) | (~lhsWins & rhs) );
    }
    else
    {
        // Only a signed comparison is used here: bias both operands by 0x80000000
        // so that the signed comparison orders them like unsigned values.
        __m128i bias = _mm_set1_epi32(-0x80000000);
        __m128i biasedA = _mm_add_epi32(a, bias);
        __m128i biasedB = _mm_add_epi32(b, bias);
        __m128i aIsGreater = _mm_cmpgt_epi32(biasedA, biasedB);

        // Branch-free select: b ^ ((a ^ b) & mask) is `a` where mask is set, `b` elsewhere.
        __m128i toggle = _mm_and_si128(_mm_xor_si128(a, b), aIsGreater);
        return _mm_xor_si128(b, toggle);
    }
}
unittest
{
    // Negative literals stand for large unsigned values.
    int4 maxima = cast(int4) _mm_max_epu32(_mm_setr_epi32(0x7fffffff, 1,  4, -7),
                                           _mm_setr_epi32(        -4,-8,  9, -8));
    int[4] expected =                                    [        -4,-8,  9, -7];
    assert(maxima.array == expected);
}
1314 
/// Lane-wise minimum of the packed signed 32-bit integers in `a` and `b`.
__m128i _mm_min_epi32 (__m128i a, __m128i b) @trusted
{
    // PERF DMD
    static if (GDC_with_SSE41)
    {
        return cast(__m128i) __builtin_ia32_pminsd128(cast(int4)a, cast(int4)b);
    }
    else version(LDC)
    {
        // x86: pminsd since LDC 1.1 -O1, also good without sse4.1
        // ARM: smin.4s since LDC 1.8 -O1
        int4 lhs = cast(int4)a;
        int4 rhs = cast(int4)b;
        // Used as a per-lane select mask (presumably all-ones where lhs > rhs);
        // for a minimum, rhs is taken in those lanes.
        int4 lhsIsGreater = greaterMask!int4(lhs, rhs);
        return cast(__m128i)( (~lhsIsGreater & lhs) | (lhsIsGreater & rhs) );
    }
    else
    {
        // Branch-free select: b ^ ((a ^ b) & mask) is `a` where mask is set, `b` elsewhere.
        __m128i aIsLower = _mm_cmplt_epi32(a, b);
        __m128i toggle = _mm_and_si128(_mm_xor_si128(a, b), aIsLower);
        return _mm_xor_si128(b, toggle);
    }
}
unittest
{
    int4 minima = cast(int4) _mm_min_epi32(_mm_setr_epi32(0x7fffffff,  1, -4, 7),
                                           _mm_setr_epi32(        -4, -8,  9, -8));
    int[4] expected =                                   [         -4, -8, -4, -8];
    assert(minima.array == expected);
}
1347 
/// Lane-wise minimum of the packed signed 8-bit integers in `a` and `b`.
__m128i _mm_min_epi8 (__m128i a, __m128i b) @trusted
{
    // PERF DMD
    static if (GDC_with_SSE41)
    {
        return cast(__m128i) __builtin_ia32_pminsb128(cast(ubyte16)a, cast(ubyte16)b);
    }
    else version(LDC)
    {
        // x86: pminsb since LDC 1.1 -O1
        // ARM64: smin.16b since LDC 1.8.0 -O1
        byte16 lhs = cast(byte16)a;
        byte16 rhs = cast(byte16)b;
        // Used as a per-lane select mask (presumably all-ones where lhs > rhs);
        // for a minimum, rhs is taken in those lanes.
        byte16 lhsIsGreater = cast(byte16) greaterMask!byte16(lhs, rhs);
        return cast(__m128i)( (~lhsIsGreater & lhs) | (lhsIsGreater & rhs) );
    }
    else
    {
        // Branch-free select: b ^ ((a ^ b) & mask) is `a` where mask is set, `b` elsewhere.
        __m128i aIsLower = _mm_cmplt_epi8(a, b);
        __m128i toggle = _mm_and_si128(_mm_xor_si128(a, b), aIsLower);
        return _mm_xor_si128(b, toggle);
    }
}
unittest
{
    __m128i lhs = _mm_setr_epi8(127,  1, -4, -8, 9,    7, 0, 57, 0, 0, 0, 0, 0, 0, 0, 0);
    __m128i rhs = _mm_setr_epi8(  4, -8,  9, -7, 0, -128, 0,  0, 0, 0, 0, 0, 0, 0, 0, 0);
    byte16 minima = cast(byte16) _mm_min_epi8(lhs, rhs);
    byte[16] expected =        [  4, -8, -4, -8, 0, -128, 0,  0, 0, 0, 0, 0, 0, 0, 0, 0];
    assert(minima.array == expected);
}
1382 
/// Lane-wise minimum of the packed unsigned 16-bit integers in `a` and `b`.
__m128i _mm_min_epu16 (__m128i a, __m128i b) @trusted
{
    // PERF DMD
    static if (GDC_with_SSE41)
    {
        return cast(__m128i) __builtin_ia32_pminuw128(cast(short8)a, cast(short8)b);
    }
    else version(LDC)
    {
        // x86: pminuw since LDC 1.1 -O1, psubusw+psubw sequence without sse4.1
        // ARM64: umin.8h since LDC 1.8.0 -O1
        ushort8 lhs = cast(ushort8)a;
        ushort8 rhs = cast(ushort8)b;
        // Note the swapped operands: the mask is built from (rhs, lhs), and lhs is
        // taken where it is set (presumably all-ones where rhs > lhs).
        ushort8 rhsIsGreater = cast(ushort8) greaterMask!ushort8(rhs, lhs);
        return cast(__m128i)( (rhsIsGreater & lhs) | (~rhsIsGreater & rhs) );
    }
    else
    {
        // The saturating difference (b - a) is 0 when b <= a, so subtracting it from
        // `b` yields `b` in that case and `a` otherwise.
        __m128i excess = _mm_subs_epu16(b, a);
        return _mm_sub_epi16(b, excess);
    }
}
unittest
{
    // Negative literals stand for large unsigned values (e.g. -4 is 65532).
    short8 minima = cast(short8) _mm_min_epu16(_mm_setr_epi16(32767,  1, -4, -8, 9,     7, 0, 57),
                                               _mm_setr_epi16(   -4, -8,  9, -7, 0,-32768, 0,  0));
    short[8] expected =                                      [32767,  1,  9, -8, 0,     7, 0,  0];
    assert(minima.array == expected);
}
1414 
/// Lane-wise minimum of the packed unsigned 32-bit integers in `a` and `b`.
__m128i _mm_min_epu32 (__m128i a, __m128i b) @trusted
{
    // PERF DMD
    static if (GDC_with_SSE41)
    {
        return cast(__m128i) __builtin_ia32_pminud128(cast(int4)a, cast(int4)b);
    }
    else version(LDC)
    {
        // x86: pminud since LDC 1.1 -O1, also good without sse4.1
        // ARM64: umin.4s since LDC 1.8.0 -O1
        uint4 lhs = cast(uint4)a;
        uint4 rhs = cast(uint4)b;
        // Used as a per-lane select mask (presumably all-ones where lhs > rhs);
        // for a minimum, rhs is taken in those lanes.
        uint4 lhsIsGreater = cast(uint4) greaterMask!uint4(lhs, rhs);
        return cast(__m128i)( (~lhsIsGreater & lhs) | (lhsIsGreater & rhs) );
    }
    else
    {
        // Only a signed comparison is used here: bias both operands by 0x80000000
        // so that the signed comparison orders them like unsigned values.
        __m128i bias = _mm_set1_epi32(-0x80000000);
        __m128i biasedA = _mm_add_epi32(a, bias);
        __m128i biasedB = _mm_add_epi32(b, bias);
        __m128i bIsGreater = _mm_cmpgt_epi32(biasedB, biasedA);

        // Branch-free select: b ^ ((a ^ b) & mask) is `a` where mask is set, `b` elsewhere.
        __m128i toggle = _mm_and_si128(_mm_xor_si128(a, b), bIsGreater);
        return _mm_xor_si128(b, toggle);
    }
}
unittest
{
    // Negative literals stand for large unsigned values.
    int4 minima = cast(int4) _mm_min_epu32(_mm_setr_epi32(0x7fffffff, 1,  4, -7),
                                           _mm_setr_epi32(        -4,-8,  9, -8));
    int[4] expected =                                    [0x7fffffff, 1,  4, -8];
    assert(minima.array == expected);
}
1448 
/// Find the smallest of the eight packed unsigned 16-bit integers in `a`.
/// The result holds that minimum in its 16-bit lane 0 and the index of its first
/// occurrence in lane 1; the remaining lanes are zero.
__m128i _mm_minpos_epu16 (__m128i a) @trusted
{
    // PERF DMD
    static if (GDC_with_SSE41)
    {
        return cast(__m128i) __builtin_ia32_phminposuw128(cast(short8)a);
    }
    else static if (LDC_with_SSE41)
    {
        return cast(__m128i) __builtin_ia32_phminposuw128(cast(short8)a);
    }
    else static if (LDC_with_ARM64)
    {
        // Interleave so that each 32-bit lane holds (value << 16) | index: an unsigned
        // 32-bit minimum then orders by value first and by index on ties.
        __m128i positions = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
        __m128i pairsLo = _mm_unpacklo_epi16(positions, a);
        __m128i pairsHi = _mm_unpackhi_epi16(positions, a);

        // Tree reduction of the eight pairs down to 32-bit lane 0.
        __m128i winner = _mm_min_epu32(pairsLo, pairsHi);
        winner = _mm_min_epu32(winner, _mm_srli_si128!8(winner));
        winner = _mm_min_epu32(winner, _mm_srli_si128!4(winner));

        short8 packed = cast(short8)winner;
        short8 r;
        r[0] = packed[1]; // the minimum value
        r[1] = packed[0]; // Note: the search must have inverted index in order to prioritize lower index in case of tie
        r[2] = 0;
        r[3] = 0;
        r[4] = 0;
        r[5] = 0;
        r[6] = 0;
        r[7] = 0;
        return cast(__m128i)r;
    }
    else
    {
        short8 lanes = cast(short8)a;
        ushort smallest = 0xffff;
        int smallestAt = 0;
        foreach (int n; 0 .. 8)
        {
            ushort candidate = lanes.array[n];
            // Strict comparison: on ties the earliest index is kept.
            if (candidate < smallest)
            {
                smallest = candidate;
                smallestAt = n;
            }
        }
        // Lanes 2..7 are never written and keep r's default initialization
        // (the unittest expects them to be zero).
        short8 r;
        r.ptr[0] = smallest;
        r.ptr[1] = cast(short)smallestAt;
        return cast(__m128i)r;
    }
}
unittest
{
    __m128i uniqueMin = _mm_setr_epi16(14, 15, 1, 2, -3, 4, 5, 6);
    // The minimum 2 appears at indices 3 and 5: the lower index must win.
    __m128i tiedMin   = _mm_setr_epi16(14,  4, 4, 2, -3, 2, 5, 6);
    short8 fromUnique = cast(short8) _mm_minpos_epu16(uniqueMin);
    short8 fromTied   = cast(short8) _mm_minpos_epu16(tiedMin);
    short[8] expectedUnique = [1, 2, 0, 0, 0, 0, 0, 0];
    short[8] expectedTied   = [2, 3, 0, 0, 0, 0, 0, 0];
    assert(fromUnique.array == expectedUnique);
    assert(fromTied.array == expectedTied);
}
1513 
/// Compute the sum of absolute differences (SADs) of quadruplets of unsigned 8-bit integers 
/// in `a` compared to those in `b`, and return the eight 16-bit results.
/// Eight SADs are performed using one quadruplet from `b` and eight quadruplets from `a`. 
/// The quadruplet of `b` is the 32-bit lane selected by `imm8[1:0]`.
/// The eight quadruplets of `a` are overlapping runs of 4 consecutive bytes, starting at
/// byte offsets `base + 0` to `base + 7`, where `base` is 0 or 4 depending on `imm8[2]`.
/// Higher bits of `imm8` are ignored.
__m128i _mm_mpsadbw_epu8(int imm8)(__m128i a, __m128i b) @trusted
{
    // PERF DMD
    static if (GDC_with_SSE41)
    {
        return cast(__m128i) __builtin_ia32_mpsadbw128(cast(ubyte16)a, cast(ubyte16)b, cast(ubyte)imm8);  
    }
    else static if (LDC_with_SSE41)
    {
        return cast(__m128i) __builtin_ia32_mpsadbw128(cast(byte16)a, cast(byte16)b, cast(byte)imm8);
    }
    else
    {
        int a_offset = ((imm8 & 4) >> 2) * 4; // Yes, the two high order quadruplet are unaddressable...

        byte16 ba = cast(byte16)a;
        short8 r;

        // The selected quadruplet of `b`, placed in the low 4 bytes of each 64-bit half
        // so that a single _mm_sad_epu8 yields two SADs at once.
        __m128i comp_b = _mm_setr_epi32(b.array[imm8 & 3], 0, b.array[imm8 & 3], 0);

        for (int j = 0; j < 8; j += 2)
        {
            int k = a_offset + j;
            // Low half: quadruplet starting at byte k; high half: quadruplet starting at k+1.
            // The zero padding matches the zeros in comp_b and contributes nothing to the sums.
            __m128i comp_a = _mm_setr_epi8(ba[k+0], ba[k+1], ba[k+2], ba[k+3],
                                           0, 0, 0, 0, 
                                           ba[k+1], ba[k+2], ba[k+3], ba[k+4],
                                           0, 0, 0, 0);
            short8 diffs = cast(short8) _mm_sad_epu8(comp_a, comp_b); // reusing this wins instructions in both x86 and arm64
            r.ptr[j] = diffs.array[0];   // SAD of the low 64-bit half
            r.ptr[j+1] = diffs.array[4]; // SAD of the high 64-bit half
        }
        return cast(__m128i)r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi8(0, 1, 2, 3,  4,  5, 6,  7, 8, 9, 10, 11, 12, 13, 14, 15);
    __m128i B = _mm_setr_epi8(9, 1, 2, 3, -1, -1, 0, -1, 5, 5,  5,  5, 12, 13, 14, 15);
    short[8] correct0 = [9, 11, 13, 15, 17, 19, 21, 23];
    short[8] correct1 = [763, 761, 759, 757, 755, 753, 751, 749];
    short[8] correct4 = [17, 19, 21, 23, 25, 27, 31, 35];
    short[8] correct5 = [755, 753, 751, 749, 747, 745, 743, 741];
    short[8] correct7 = [32, 28, 24, 20, 16, 12, 8, 4];
    short8 r1 = cast(short8) _mm_mpsadbw_epu8!1(A, B);
    short8 r4 = cast(short8) _mm_mpsadbw_epu8!4(A, B);
    short8 r5 = cast(short8) _mm_mpsadbw_epu8!5(A, B);
    short8 r7 = cast(short8) _mm_mpsadbw_epu8!7(A, B);
    // imm8 = 8 has no bit set in imm8[2:0], so it must behave like imm8 = 0.
    short8 r8 = cast(short8) _mm_mpsadbw_epu8!8(A, B);
    assert(r1.array == correct1);
    assert(r4.array == correct4);
    assert(r5.array == correct5);
    assert(r7.array == correct7);
    assert(r8.array == correct0);
}
1576 
/// Multiply the low signed 32-bit integer of each packed 64-bit element in `a` and `b`
/// (that is, 32-bit lanes 0 and 2), and return the two signed 64-bit products.
__m128i _mm_mul_epi32 (__m128i a, __m128i b) @trusted
{
    // PERF DMD
    static if (GDC_with_SSE41)
    {
        return cast(__m128i) __builtin_ia32_pmuldq128(cast(int4)a, cast(int4)b);
    }
    else static if (LDC_with_SSE41)
    {
        // For some reason, clang has the builtin but it's not in IntrinsicsX86.td
        // Use IR instead.
        // This generates pmuldq with since LDC 1.2.0 -O0 
        enum ir = `
            %ia = shufflevector <4 x i32> %0,<4 x i32> %0, <2 x i32> <i32 0, i32 2>
            %ib = shufflevector <4 x i32> %1,<4 x i32> %1, <2 x i32> <i32 0, i32 2>
            %la = sext <2 x i32> %ia to <2 x i64>
            %lb = sext <2 x i32> %ib to <2 x i64>
            %r = mul <2 x i64> %la, %lb
            ret <2 x i64> %r`;
        return cast(__m128i) LDCInlineIR!(ir, long2, int4, int4)(cast(int4)a, cast(int4)b);
    }
    else static if (LDC_with_ARM64)  
    {
        // 3 instructions since LDC 1.8 -O2
        // But had to make vmull_s32 be a builtin else it wouldn't optimize to smull
        int2 lowOfA = vmovn_s64(cast(long2)a);
        int2 lowOfB = vmovn_s64(cast(long2)b);
        return cast(__m128i) vmull_s32(lowOfA, lowOfB);
    }
    else
    {
        int4 lhs = cast(int4)a;
        int4 rhs = cast(int4)b;
        long2 products;
        // Widen the left operand first so the multiplication is done in 64 bits.
        products.ptr[0] = cast(long)lhs.array[0] * rhs.array[0];
        products.ptr[1] = cast(long)lhs.array[2] * rhs.array[2];
        return cast(__m128i)products;
    }
}
unittest
{
    // Lanes 1 and 3 are present only to show that they are ignored.
    __m128i lhs = _mm_setr_epi32(61616461, 1915324654, 4564061, 3);
    __m128i rhs = _mm_setr_epi32(49716422, -915616216, -121144, 0);
    long2 products = cast(long2) _mm_mul_epi32(lhs, rhs);
    long[2] expected = [cast(long)61616461 * 49716422, cast(long)4564061 * -121144];
    assert(products.array == expected);
}
1625 
/// Multiply the packed 32-bit integers in `a` and `b`, producing intermediate 64-bit integers, 
/// return the low 32 bits of the intermediate integers.
__m128i _mm_mullo_epi32 (__m128i a, __m128i b) @trusted
{
    // PERF DMD
    // PERF GDC without SSE4.1 could be better
    static if (GDC_with_SSE41)
    {
        int4 ia = cast(int4)a;
        int4 ib = cast(int4)b;
        // Note: older GDC doesn't have that op, but older GDC
        // also has no support for -msse4.1 detection
        return cast(__m128i)(ia * ib);
    }
    else version(LDC)
    {
        int4 ia = cast(int4)a;
        int4 ib = cast(int4)b;
        return cast(__m128i)(ia * ib);
    }
    else
    {
        // DMD doesn't take the above
        int4 ia = cast(int4)a;
        int4 ib = cast(int4)b;
        int4 r;
        // 32-bit multiplication wraps around, which keeps exactly the low 32 bits.
        r.ptr[0] = ia.array[0] * ib.array[0];
        r.ptr[1] = ia.array[1] * ib.array[1];
        r.ptr[2] = ia.array[2] * ib.array[2];
        r.ptr[3] = ia.array[3] * ib.array[3];
        return r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi32(61616461, 1915324654, 4564061, 3);
    __m128i B = _mm_setr_epi32(49716422, -915616216, -121144, 0);
    int4 R = cast(int4) _mm_mullo_epi32(A, B);
    int[4] correct = [cast(int)0xBF370D8E, cast(int)(1915324654 * -915616216), cast(int)(4564061 * -121144), 0];
    assert(R.array == correct);
}
1667 
1668 
/// Narrow the packed signed 32-bit integers of `a` and `b` to packed 16-bit integers
/// using unsigned saturation (values are clamped to 0 .. 65535).
/// The four values from `a` fill result lanes 0..3, those from `b` fill lanes 4..7.
__m128i _mm_packus_epi32 (__m128i a, __m128i b) @trusted
{
    static if (GDC_with_SSE41)
    {
        // PERF For some reason doesn't generates the builtin???
        return cast(__m128i) __builtin_ia32_packusdw128(cast(short8)a, cast(short8)b);
    }
    else static if (LDC_with_SSE41)
    {
        return cast(__m128i) __builtin_ia32_packusdw128(cast(short8)a, cast(short8)b);
    }
    else static if (LDC_with_ARM64)
    {
       int4 z;
       z = 0;       
       return cast(__m128i) vcombine_u16(vqmovn_u32(vmaxq_s32(z, cast(int4)a)),
                                         vqmovn_u32(vmaxq_s32(z, cast(int4)b)));
    }
    else
    {
        // PERF: not great without SSE4.1
        int4 lanesA = cast(int4)a;
        int4 lanesB = cast(int4)b;
        align(16) ushort[8] packed;
        foreach (int lane; 0 .. 4)
        {
            int fromA = lanesA.array[lane];
            int fromB = lanesB.array[lane];

            // Clamp both values to the unsigned 16-bit range.
            fromA = (fromA < 0) ? 0 : ((fromA > 65535) ? 65535 : fromA);
            fromB = (fromB < 0) ? 0 : ((fromB > 65535) ? 65535 : fromB);

            packed.ptr[lane]   = cast(ushort)fromA;
            packed.ptr[lane+4] = cast(ushort)fromB;
        }
        return *cast(__m128i*)(packed.ptr);
    }
}
unittest
{
    // Covers saturation above 65535, clamping of a negative value, and pass-through.
    __m128i input = _mm_setr_epi32(100000, -100000, 1000, 0);
    short8 narrowed = cast(short8) _mm_packus_epi32(input, input);
    short[8] expected = [cast(short)65535, 0, 1000, 0, cast(short)65535, 0, 1000, 0];
    assert(narrowed.array == expected);
}
1717 
1718 
/// Round the packed double-precision (64-bit) floating-point elements in `a` using the 
/// rounding parameter, and store the results as packed double-precision floating-point elements.
/// Rounding is done according to the rounding[3:0] parameter, which can be one of:
///    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
///    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
///    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
///    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
///    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
/// Note: without SSE4.1 the rounding goes through a 64-bit integer conversion.
/// Elements that are NaN, infinite, or of magnitude 2^52 and above (hence already
/// integral) are returned unchanged in that case.
__m128d _mm_round_pd(int rounding)(__m128d a) @trusted
{
    // PERF DMD
    static if (GDC_with_SSE41)
    {
        return __builtin_ia32_roundpd(a, rounding);
    }
    else static if (LDC_with_SSE41)
    {
        return __builtin_ia32_roundpd(a, rounding);
    }
    else
    {
        // 2^52. A double of at least this magnitude has no fractional bits, so it is
        // its own rounded value; below it, the trip through a 64-bit integer is exact.
        // NaN fails both range comparisons below and is therefore passed through too.
        enum double integralThreshold = 4503599627370496.0;

        static if (rounding & _MM_FROUND_CUR_DIRECTION)
        {
            // Keep the inputs: they are returned as-is when out of the safe range.
            double x0 = a.array[0];
            double x1 = a.array[1];

            // Convert to 64-bit integers, using the current rounding mode
            long lo = _mm_cvtsd_si64(a);
            a.ptr[0] = a.array[1];
            long hi = _mm_cvtsd_si64(a);

            double r0 = (x0 > -integralThreshold && x0 < integralThreshold) ? cast(double)lo : x0;
            double r1 = (x1 > -integralThreshold && x1 < integralThreshold) ? cast(double)hi : x1;
            return _mm_setr_pd(r0, r1);
        }
        else
        {
            version(GNU) pragma(inline, false); // else fail unittest with optimizations

            // Keep the inputs: they are returned as-is when out of the safe range.
            double x0 = a.array[0];
            double x1 = a.array[1];

            // Temporarily install the requested rounding mode (rounding[1:0])
            uint old = _MM_GET_ROUNDING_MODE();
            _MM_SET_ROUNDING_MODE((rounding & 3) << 13);
            
            // Convert to 64-bit integers
            long lo = _mm_cvtsd_si64(a);
            a.ptr[0] = a.array[1];
            long hi = _mm_cvtsd_si64(a);

            // Convert back to double to achieve the rounding.
            // A 64-bit double can't represent all the values a 64-bit integer can
            // (and vice-versa), which is why only inputs below 2^52 in magnitude
            // take the integer result.
            _MM_SET_ROUNDING_MODE(old);
            double r0 = (x0 > -integralThreshold && x0 < integralThreshold) ? cast(double)lo : x0;
            double r1 = (x1 > -integralThreshold && x1 < integralThreshold) ? cast(double)hi : x1;
            return _mm_setr_pd(r0, r1);
        }
    }
}
unittest
{
    // Ordinary values are tested in other intrinsics.
    // Values that do not fit a 64-bit integer are already integral (or infinite)
    // and must come back unchanged.
    __m128d A = _mm_setr_pd(1.0e300, -double.infinity);
    __m128d R = _mm_round_pd!(_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC)(A);
    double[2] correct = [1.0e300, -double.infinity];
    assert(R.array == correct);
}
1773 
/// Rounds the four single-precision (32-bit) floating-point lanes of `a` according to the
/// compile-time `rounding` parameter and returns them as packed single-precision elements.
/// Rounding is done according to the rounding[3:0] parameter, which can be one of:
///    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
///    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
///    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
///    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
///    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
/// Note: when the SSE4.1 builtin is unavailable, rounding is emulated by a round-trip
/// through 32-bit integers, which does not work for large values.
__m128 _mm_round_ps(int rounding)(__m128 a) @trusted
{
    // PERF ARM64: there is duplication because this isn't optimal for ARM64, so it is avoided externally
    static if (GDC_or_LDC_with_SSE41)
    {
        return __builtin_ia32_roundps(a, rounding);
    }
    else static if (rounding & _MM_FROUND_CUR_DIRECTION)
    {
        // No mode switch in this branch: float -> 32-bit integer -> float.
        return _mm_cvtepi32_ps(_mm_cvtps_epi32(a));
    }
    else
    {
        version(LDC) pragma(inline, false); // else _MM_SET_ROUNDING_MODE and _mm_cvtps_epi32 gets shuffled

        // Only the two direction bits of `rounding` are used here; they are shifted
        // to bit 13 to form the value given to _MM_SET_ROUNDING_MODE.
        uint savedMode = _MM_GET_ROUNDING_MODE();
        _MM_SET_ROUNDING_MODE((rounding & 3) << 13);
        scope(exit) _MM_SET_ROUNDING_MODE(savedMode); // put the previous mode back on the way out

        // Convert to 32-bit integers under the temporary rounding mode...
        __m128i asInts = _mm_cvtps_epi32(a);

        // ...and back to float to obtain the rounded values.
        // A 32-bit float can't represent all the values a 32-bit integer can
        // (and vice-versa), so this emulation won't work for large values.
        // (TODO: what range exactly?)
        return _mm_cvtepi32_ps(asInts);
    }
}
unittest
{
    // tested in other intrinsics
}
1820 
1821 
/// Rounds the lower double-precision (64-bit) floating-point element of `b` according to the
/// compile-time `rounding` parameter, places it in the lower element of the result, and
/// copies the upper element of the result from `a`.
/// Rounding is done according to the rounding[3:0] parameter, which can be one of:
///    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
///    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
///    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
///    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
///    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
/// Note: when the SSE4.1 builtin is unavailable, rounding is emulated by a round-trip
/// through a 64-bit integer, which does not work for large values.
__m128d _mm_round_sd(int rounding)(__m128d a, __m128d b) @trusted
{
    static if (GDC_with_SSE41 || LDC_with_SSE41)
    {
        return __builtin_ia32_roundsd(a, b, rounding);
    }
    else static if (rounding & _MM_FROUND_CUR_DIRECTION)
    {
        // No mode switch in this branch. Storing the 64-bit integer into the
        // double lane converts it back to floating point.
        long rounded = _mm_cvtsd_si64(b);
        a.ptr[0] = rounded;
        return a;
    }
    else
    {
        version(GNU) pragma(inline, false); // else fail unittest with optimizations

        // Only the two direction bits of `rounding` are used here; they are shifted
        // to bit 13 to form the value given to _MM_SET_ROUNDING_MODE.
        uint savedMode = _MM_GET_ROUNDING_MODE();
        _MM_SET_ROUNDING_MODE((rounding & 3) << 13);

        // Convert to a 64-bit integer under the temporary mode, then store it into
        // the double lane, which converts it back to floating point.
        long rounded = _mm_cvtsd_si64(b);
        a.ptr[0] = rounded;

        // A 64-bit double can't represent all the values a 64-bit integer can
        // (and vice-versa), so this emulation won't work for large values.
        // (TODO: what range exactly?)
        _MM_SET_ROUNDING_MODE(savedMode);
        return a;
    }
}
unittest
{
    // tested in other intrinsics
}
1874 
1875 
/// Rounds the lower single-precision (32-bit) floating-point element of `b` according to the
/// compile-time `rounding` parameter, places it in the lower element of the result, and
/// copies the upper 3 packed elements of the result from `a`.
/// Rounding is done according to the rounding[3:0] parameter, which can be one of:
///    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
///    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
///    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
///    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
///    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
/// Note: when the SSE4.1 builtin is unavailable, rounding is emulated by a round-trip
/// through a 32-bit integer, which does not work for large values.
__m128 _mm_round_ss(int rounding)(__m128 a, __m128 b) @trusted
{
    static if (GDC_with_SSE41 || LDC_with_SSE41)
    {
        return __builtin_ia32_roundss(a, b, rounding);
    }
    else static if (rounding & _MM_FROUND_CUR_DIRECTION)
    {
        // No mode switch in this branch. Storing the 32-bit integer into the
        // float lane converts it back to floating point.
        int rounded = _mm_cvtss_si32(b);
        a.ptr[0] = rounded;
        return a;
    }
    else version(GNU)
    {
        // GDC: the mode switch and the conversion are kept together inside a
        // nested function that is never inlined.
        pragma(inline, false)
        __m128 GDCworkaround() nothrow @nogc @trusted
        {
            uint savedMode = _MM_GET_ROUNDING_MODE();
            _MM_SET_ROUNDING_MODE((rounding & 3) << 13);

            // Convert to a 32-bit integer under the temporary mode, then store it
            // into the float lane, which converts it back to floating point.
            int rounded = _mm_cvtss_si32(b);
            a.ptr[0] = rounded;

            // A 32-bit float can't represent all the values a 32-bit integer can
            // (and vice-versa), so this emulation won't work for large values.
            // (TODO: what range exactly?)
            _MM_SET_ROUNDING_MODE(savedMode);
            return a;
        }
        return GDCworkaround();
    }
    else
    {
        // Only the two direction bits of `rounding` are used here; they are shifted
        // to bit 13 to form the value given to _MM_SET_ROUNDING_MODE.
        uint savedMode = _MM_GET_ROUNDING_MODE();
        _MM_SET_ROUNDING_MODE((rounding & 3) << 13);

        // Convert to a 32-bit integer under the temporary mode, then store it
        // into the float lane, which converts it back to floating point.
        int rounded = _mm_cvtss_si32(b);
        a.ptr[0] = rounded;

        // A 32-bit float can't represent all the values a 32-bit integer can
        // (and vice-versa), so this emulation won't work for large values.
        // (TODO: what range exactly?)
        _MM_SET_ROUNDING_MODE(savedMode);
        return a;
    }
}
unittest
{
    // tested in other intrinsics
}
1947 
1948 
/// Load 128-bits of integer data from memory using a non-temporal memory hint. 
/// `mem_addr` must be aligned on a 16-byte boundary or a general-protection 
/// exception may be generated.
/// When neither the GDC SSE4.1 builtin nor LDC is available, this is an ordinary load
/// and the non-temporal hint is dropped.
__m128i _mm_stream_load_si128 (__m128i * mem_addr) pure @trusted
{
    // PERF DMD D_SIMD
    static if (GDC_with_SSE41)
    {
        return cast(__m128i) __builtin_ia32_movntdqa(cast(long2*)mem_addr);
    }
    else version(LDC)
    {
        // Inline LLVM IR: a vector load tagged with `!nontemporal` metadata.
        enum prefix = `!0 = !{ i32 1 }`;
        enum ir = `
            %r = load <4 x i32>, <4 x i32>* %0, !nontemporal !0
            ret <4 x i32> %r`;
        return cast(__m128i) LDCInlineIREx!(prefix, ir, "", int4, int4*)(mem_addr);
    }
    else
    {
        return *mem_addr; // regular move instead
    }
}
unittest
{
    // NOTE(review): relies on a `__m128i` local satisfying the 16-byte alignment
    // requirement on the builtin/inline-IR paths; the fallback path is a plain load.
    __m128i A = _mm_setr_epi32(-1, 2, -3, 4);
    __m128i B = _mm_stream_load_si128(&A);
    int[4] correct = [-1, 2, -3, 4];
    assert(B.array == correct);
}
1973 
1974 
/// Returns 1 when every one of the 128 bits of `a` is set, 0 otherwise.
int _mm_test_all_ones (__m128i a) @safe
{
    // Check `a` against an all-ones mask: every masked bit must be 1 in `a`.
    __m128i everyBit = _mm_set1_epi32(-1);
    return _mm_testc_si128(a, everyBit);
}
unittest
{
    // All bits set -> 1; a single cleared bit (the -2 lane) -> 0.
    __m128i allSet   = _mm_set1_epi32(-1);
    __m128i oneClear = _mm_set_epi32(-1, -2, -1, -1);
    assert(_mm_test_all_ones(oneClear) == 0);
    assert(_mm_test_all_ones(allSet) == 1);
}
1987 
/// Returns 1 when every one of the 128 bits of `a` is clear, 0 otherwise.
// This is a #BONUS since it was lacking in Intel Intrinsics API.
int _mm_test_all_zeros (__m128i a) @safe
{
    // Check `a` against an all-ones mask: every masked bit must be 0 in `a`.
    __m128i everyBit = _mm_set1_epi32(-1);
    return _mm_testz_si128(a, everyBit);
}
unittest
{
    // No bit set -> 1; a single set bit (the 8 lane) -> 0.
    __m128i allClear = _mm_set1_epi32(0);
    __m128i oneSet   = _mm_set_epi32(0, 8, 0, 0);
    assert(_mm_test_all_zeros(oneSet) == 0);
    assert(_mm_test_all_zeros(allClear) == 1);
}
2001 
/// Returns 1 when the bitwise AND of the 128 bits (representing integer data) of `a`
/// and `mask` is zero, otherwise returns 0.
int _mm_test_all_zeros (__m128i a, __m128i mask) @safe
{
    // Same operation as _mm_testz_si128, offered under a more descriptive name.
    int noMaskedBitSet = _mm_testz_si128(a, mask);
    return noMaskedBitSet;
}
2008 
/// Compute the bitwise AND of 128 bits (representing integer data) in `a` and `mask`, and set ZF to 1 
/// if the result is zero, otherwise set ZF to 0. Compute the bitwise NOT of `a` and then AND with 
/// `mask`, and set CF to 1 if the result is zero, otherwise set CF to 0. Return 1 if both the ZF and
/// CF values are zero, otherwise return 0.
/// In other words, return 1 when `mask` selects at least one set bit and at least one cleared bit of `a`.
int _mm_test_mix_ones_zeros (__m128i a, __m128i mask) @safe
{
    // Plain forwarding call, so `@safe` suffices (matching the other `_mm_test_*` wrappers).
    return _mm_testnzc_si128(a, mask);
}
unittest
{
    __m128i A = _mm_setr_epi32(0x01, 0x02, 0x04, 0xf8);
    __m128i M = _mm_setr_epi32(0x01, 0x40, 0x00, 0x00);
    __m128i Z = _mm_setzero_si128();
    assert(_mm_test_mix_ones_zeros(A, M) == 1); // M selects a set bit and a cleared bit of A
    assert(_mm_test_mix_ones_zeros(A, A) == 0); // only set bits are selected
    assert(_mm_test_mix_ones_zeros(A, Z) == 0); // nothing is selected
}
2017 
/// Computes `(~a) & b` over all 128 bits and returns 1 when that result is zero,
/// otherwise returns 0.
/// In other words, test if all bits masked by `b` are 1 in `a`.
int _mm_testc_si128 (__m128i a, __m128i b) pure @trusted
{
    // PERF DMD
    static if (GDC_with_SSE41 || LDC_with_SSE41)
    {
        return __builtin_ia32_ptestc128(cast(long2)a, cast(long2)b);
    }
    else static if (LDC_with_ARM64)
    {
        // Acceptable since LDC 1.8 -O2
        long2 uncovered = vbicq_s64(cast(long2)b, cast(long2)a);
        auto anyBit = vgetq_lane_s64(uncovered, 0) | vgetq_lane_s64(uncovered, 1);
        return !anyBit;
    }
    else
    {
        // Bits of `b` that are not set in `a`; the test passes when there are none.
        int[4] noBits = [0, 0, 0, 0];
        __m128i uncovered = b & ~a;
        return uncovered.array == noBits;
    }
}
unittest
{
    __m128i value       = _mm_setr_epi32(0x01, 0x02, 0x04, 0xf8);
    __m128i maskMissing = _mm_setr_epi32(0xfe, 0xfd, 0x00, 0x00); // selects bits cleared in `value`
    __m128i maskCovered = _mm_setr_epi32(0x00, 0x00, 0x04, 0x00); // selects only a set bit of `value`
    assert(_mm_testc_si128(value, maskCovered) == 1);
    assert(_mm_testc_si128(value, maskMissing) == 0);
    assert(_mm_testc_si128(value, value) == 1);
}
2054 
/// Computes `a & b` over all 128 bits (representing integer data) and sets ZF to 1 when
/// that result is zero, otherwise ZF is 0.
/// Computes `(~a) & b` and sets CF to 1 when that result is zero, otherwise CF is 0.
/// Returns 1 if both the ZF and CF values are zero, otherwise returns 0.
int _mm_testnzc_si128 (__m128i a, __m128i b) @trusted
{
    // PERF DMD
    static if (GDC_with_SSE41 || LDC_with_SSE41)
    {
        return __builtin_ia32_ptestnzc128(cast(long2)a, cast(long2)b);
    }
    else static if (LDC_with_ARM64)
    {
        long2 overlap = vandq_s64(cast(long2)b, cast(long2)a);
        long2 onlyInB = vbicq_s64(cast(long2)b, cast(long2)a);

        auto overlapBits = vgetq_lane_s64(overlap, 0) | vgetq_lane_s64(overlap, 1);
        auto onlyInBBits = vgetq_lane_s64(onlyInB, 0) | vgetq_lane_s64(onlyInB, 1);

        // Both intermediate results must be non-zero.
        return (onlyInBBits != 0) && (overlapBits != 0);
    }
    else
    {
        int[4] noBits = [0, 0, 0, 0];
        __m128i overlap = b & a;
        __m128i onlyInB = b & ~a;

        // Both intermediate results must be non-zero.
        return (overlap.array != noBits) && (onlyInB.array != noBits);
    }
}
unittest
{
    __m128i value     = _mm_setr_epi32(0x01, 0x02, 0x04, 0xf8);
    __m128i mixedMask = _mm_setr_epi32(0x01, 0x40, 0x00, 0x00); // one set bit and one cleared bit of `value`
    __m128i emptyMask = _mm_setzero_si128();
    assert(_mm_testnzc_si128(value, mixedMask) == 1);
    assert(_mm_testnzc_si128(value, value) == 0);
    assert(_mm_testnzc_si128(value, emptyMask) == 0);
}
2096 
/// Computes `a & b` over all 128 bits (representing integer data) and returns 1 when
/// that result is zero, otherwise returns 0.
/// In other words, test if all bits masked by `b` are 0 in `a`.
int _mm_testz_si128 (__m128i a, __m128i b) @trusted
{
    // PERF DMD
    static if (GDC_with_SSE41 || LDC_with_SSE41)
    {
        return __builtin_ia32_ptestz128(cast(long2)a, cast(long2)b);
    }
    else static if (LDC_with_ARM64)
    {
        // Acceptable since LDC 1.8 -O2
        long2 overlap = vandq_s64(cast(long2)a, cast(long2)b);
        auto anyBit = vgetq_lane_s64(overlap, 0) | vgetq_lane_s64(overlap, 1);
        return !anyBit;
    }
    else
    {
        // Bits set in both operands; the test passes when there are none.
        int[4] noBits = [0, 0, 0, 0];
        __m128i overlap = a & b;
        return overlap.array == noBits;
    }
}
unittest
{
    __m128i value        = _mm_setr_epi32(0x01, 0x02, 0x04, 0xf8);
    __m128i maskDisjoint = _mm_setr_epi32(0xfe, 0xfd, 0x00, 0x07); // shares no set bit with `value`
    __m128i maskOverlap  = _mm_setr_epi32(0x00, 0x00, 0x04, 0x00); // shares one set bit with `value`
    assert(_mm_testz_si128(value, maskOverlap) == 0);
    assert(_mm_testz_si128(value, maskDisjoint) == 1);
    assert(_mm_testz_si128(value, value) == 0);
}
2133