1 /**
2 * SSE4.1 intrinsics.
3 * https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#techs=SSE4_1
4 *
5 * Copyright: Guillaume Piolat 2021.
6 *            Johan Engelen 2021.
7 * License:   $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
8 */
9 module inteli.smmintrin;
10 
11 // SSE4.1 instructions
12 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#techs=SSE4_1
13 // Note: this header will work whether you have SSE4.1 enabled or not.
14 // With LDC, use "dflags-ldc": ["-mattr=+sse4.1"] or equivalent to actively
15 // generate SSE4.1 instructions.
16 // With GDC, use "dflags-gdc": ["-msse4.1"] or equivalent to generate SSE4.1 instructions.
17 
18 public import inteli.types;
19 import inteli.internals;
20 
21 // smmintrin pulls in all previous instruction set intrinsics.
22 public import inteli.tmmintrin;
23 
24 nothrow @nogc:
25 
26 enum int _MM_FROUND_TO_NEAREST_INT = 0x00; /// SSE4.1 rounding modes
27 enum int _MM_FROUND_TO_NEG_INF     = 0x01; /// ditto
28 enum int _MM_FROUND_TO_POS_INF     = 0x02; /// ditto
29 enum int _MM_FROUND_TO_ZERO        = 0x03; /// ditto
30 enum int _MM_FROUND_CUR_DIRECTION  = 0x04; /// ditto
31 enum int _MM_FROUND_RAISE_EXC      = 0x00; /// ditto
32 enum int _MM_FROUND_NO_EXC         = 0x08; /// ditto
33 
34 enum int _MM_FROUND_NINT      = (_MM_FROUND_RAISE_EXC | _MM_FROUND_TO_NEAREST_INT);
35 enum int _MM_FROUND_FLOOR     = (_MM_FROUND_RAISE_EXC | _MM_FROUND_TO_NEG_INF);
36 enum int _MM_FROUND_CEIL      = (_MM_FROUND_RAISE_EXC | _MM_FROUND_TO_POS_INF);
37 enum int _MM_FROUND_TRUNC     = (_MM_FROUND_RAISE_EXC | _MM_FROUND_TO_ZERO);
38 enum int _MM_FROUND_RINT      = (_MM_FROUND_RAISE_EXC | _MM_FROUND_CUR_DIRECTION);
39 enum int _MM_FROUND_NEARBYINT = (_MM_FROUND_NO_EXC    | _MM_FROUND_CUR_DIRECTION);
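// Note: in this translation the rounding-mode constants above are passed to the `_mm_round_*`
// intrinsics as template arguments, e.g. `_mm_round_pd!(_MM_FROUND_TO_NEG_INF)(a)` rounds both
// lanes toward -infinity; `_mm_floor_*` and `_mm_ceil_*` below are thin wrappers over such calls.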
40 
41 /// Blend packed 16-bit integers from `a` and `b` using control mask `imm8`, and store the results.
42 // Note: changed signature, GDC needs a compile-time value for imm8.
43 __m128i _mm_blend_epi16(int imm8)(__m128i a, __m128i b) @trusted
44 {
45     // PERF DMD
46     static if (GDC_with_SSE41)
47     {
48         return cast(__m128i) __builtin_ia32_pblendw128(cast(short8)a, cast(short8)b, imm8);
49     }
50     else 
51     {
        // LDC x86: this generates pblendw since LDC 1.1 -O2
53         short8 r;
54         short8 sa = cast(short8)a;
55         short8 sb = cast(short8)b;
56         for (int n = 0; n < 8; ++n)
57         {
58             r.ptr[n] = (imm8 & (1 << n)) ? sb.array[n] : sa.array[n];
59         }
60         return cast(__m128i)r;
61     }
62 }
63 unittest
64 {
65     __m128i A = _mm_setr_epi16(0, 1,  2,  3,  4,  5,  6,  7);
66     __m128i B = _mm_setr_epi16(8, 9, 10, 11, 12, 13, 14, 15);
67     short8 C = cast(short8) _mm_blend_epi16!147(A, B); // 10010011
68     short[8] correct =        [8, 9,  2,  3, 12,  5,  6, 15];
69     assert(C.array == correct);
70 }
71 
72 
73 /// Blend packed double-precision (64-bit) floating-point elements from `a` and `b` using control mask `imm8`.
74 // Note: changed signature, GDC needs a compile-time value for `imm8`.
75 __m128d _mm_blend_pd(int imm8)(__m128d a, __m128d b) @trusted
76 {
77     static assert(imm8 >= 0 && imm8 < 4);
78     // PERF DMD
79     static if (GDC_with_SSE41)
80     {
81         return cast(double2) __builtin_ia32_blendpd(cast(double2)a, cast(double2)b, imm8);
82     }
83     else
84     {
        // LDC x86: blendpd since LDC 1.1 -O2, uses blendps after LDC 1.12
86         double2 r;
87         for (int n = 0; n < 2; ++n)
88         {
89             r.ptr[n] = (imm8 & (1 << n)) ? b.array[n] : a.array[n];
90         }
91         return cast(__m128d)r;
92     }
93 }
94 unittest
95 {
96     __m128d A = _mm_setr_pd(0, 1);
97     __m128d B = _mm_setr_pd(8, 9);
98     double2 C = _mm_blend_pd!2(A, B);
99     double[2] correct =    [0, 9];
100     assert(C.array == correct);
101 }
102 
103 
104 /// Blend packed single-precision (32-bit) floating-point elements from `a` and `b` using control 
105 /// mask `imm8`.
106 // Note: changed signature, GDC needs a compile-time value for imm8.
107 __m128 _mm_blend_ps(int imm8)(__m128 a, __m128 b) @trusted
108 {
109     // PERF DMD
110     static assert(imm8 >= 0 && imm8 < 16);
111     static if (GDC_with_SSE41)
112     {
113         return __builtin_ia32_blendps(a, b, imm8);
114     }
115     else version(LDC)
116     {
117         // LDC x86: generates blendps since LDC 1.1 -O2
118         //   arm64: pretty good, two instructions worst case
119         return shufflevectorLDC!(float4, (imm8 & 1) ? 4 : 0,
120                                          (imm8 & 2) ? 5 : 1,
121                                          (imm8 & 4) ? 6 : 2,
122                                          (imm8 & 8) ? 7 : 3)(a, b);
123     }
124     else
125     {
126         __m128 r; // PERF =void;
127         for (int n = 0; n < 4; ++n)
128         {
129             r.ptr[n] = (imm8 & (1 << n)) ? b.array[n] : a.array[n];
130         }
131         return r;
132     }
133 }
134 unittest
135 {
136     __m128 A = _mm_setr_ps(0, 1,  2,  3);
137     __m128 B = _mm_setr_ps(8, 9, 10, 11);
138     float4 C = cast(float4) _mm_blend_ps!13(A, B); // 1101
139     float[4] correct =    [8, 1, 10, 11];
140     assert(C.array == correct);
141 }
142 
143 /// Blend packed 8-bit integers from `a` and `b` using `mask`.
144 __m128i _mm_blendv_epi8 (__m128i a, __m128i b, __m128i mask) @trusted
145 {
146     // PERF DMD
147     /*static if (GDC_with_SSE41)
148     {
        // This intrinsic does nothing in GDC 12.
150         // TODO report to GDC. No problem in GCC.
151         return cast(__m128i) __builtin_ia32_pblendvb128 (cast(ubyte16)a, cast(ubyte16)b, cast(ubyte16)mask);
152     }
153     else*/
154     static if (LDC_with_SSE41)
155     {
156         return cast(__m128i) __builtin_ia32_pblendvb(cast(byte16)a, cast(byte16)b, cast(byte16)mask);
157     }
158     else static if (LDC_with_ARM64)
159     {
160         // LDC arm64: two instructions since LDC 1.12 -O2
161         byte16 maskSX = vshrq_n_s8(cast(byte16)mask, 7);
162         return cast(__m128i) vbslq_s8(maskSX, cast(byte16)b, cast(byte16)a);
163     }
164     else
165     {
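        // Branchless select: `m` is 0xFF in lanes where the sign bit of `mask` is set.
        // Subtracting `m` from (a ^ b) with unsigned saturation zeroes exactly those lanes,
        // so the final xor with `b` yields `b` there and (a ^ b) ^ b == a everywhere else.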
166         __m128i m = _mm_cmpgt_epi8(_mm_setzero_si128(), mask);
167         return _mm_xor_si128(_mm_subs_epu8(_mm_xor_si128(a, b), m), b);
168     }
169 }
170 unittest
171 {
172     __m128i A = _mm_setr_epi8( 0,  1,  2,  3,  4,  5,  6,  7,  
173                                8,  9, 10, 11, 12, 13, 14, 15);
174     __m128i B = _mm_setr_epi8(16, 17, 18, 19, 20, 21, 22, 23, 
175                               24, 25, 26, 27, 28, 29, 30, 31);
176     __m128i M = _mm_setr_epi8( 1, -1,  1,  1, -4,  1, -8,  127,  
177                                1,  1, -1, -1,  4,  1,  8, -128);
178     byte16 R = cast(byte16) _mm_blendv_epi8(A, B, M);
179     byte[16] correct =      [  0, 17,  2,  3, 20,  5, 22,  7,
180                                8,  9, 26, 27, 12, 13, 14, 31 ];
181     assert(R.array == correct);
182 }
183 
184 
185 /// Blend packed double-precision (64-bit) floating-point elements from `a` and `b` using `mask`.
186 __m128d _mm_blendv_pd (__m128d a, __m128d b, __m128d mask) @trusted
187 {
188     // PERF DMD
189     static if (GDC_with_SSE42)
190     {
        // PERF Amazingly enough, GCC/GDC generates the blendvpd instruction
        // with -msse4.2 but not with -msse4.1.
        // Not sure what the reason is, and there is a replacement sequence below.
        // Sounds like a bug.
195         return __builtin_ia32_blendvpd(a, b, mask);
196     }
197     else static if (LDC_with_SSE41)
198     {
199         return __builtin_ia32_blendvpd(a, b, mask);
200     }
201     else static if (LDC_with_ARM64)
202     {
203         long2 shift;
204         shift = 63;
205         long2 lmask = cast(long2)mask >> shift;
206         return cast(__m128d) vbslq_s64(lmask, cast(long2)b, cast(long2)a);
207     }
208     else
209     {
210         __m128d r; // PERF =void;
211         long2 lmask = cast(long2)mask;
212         for (int n = 0; n < 2; ++n)
213         {
214             r.ptr[n] = (lmask.array[n] < 0) ? b.array[n] : a.array[n];
215         }
216         return r;
217     }
218 }
219 unittest
220 {
221     __m128d A = _mm_setr_pd(1.0, 2.0);
222     __m128d B = _mm_setr_pd(3.0, 4.0);
223     __m128d M1 = _mm_setr_pd(-3.0, 2.0);
224     __m128d R1 = _mm_blendv_pd(A, B, M1);
225     double[2] correct1 = [3.0, 2.0];
226     assert(R1.array == correct1);
227 
228     // Note: wouldn't work with -double.nan, since in some AArch64 archs the NaN sign bit is lost
229     // See Issue #78
230     __m128d M2 = _mm_setr_pd(double.nan, double.infinity);
231     __m128d R2 = _mm_blendv_pd(A, B, M2);
232     double[2] correct2 = [1.0, 2.0];
233     assert(R2.array == correct2);
234 }
235 
236 
237 /// Blend packed single-precision (32-bit) floating-point elements from `a` and `b` using `mask`.
238 __m128 _mm_blendv_ps (__m128 a, __m128 b, __m128 mask) @trusted
239 {
240     // PERF DMD
241     static if (GDC_with_SSE41)
242     {
243         return __builtin_ia32_blendvps(a, b, mask);
244     }
245     else static if (LDC_with_SSE41)
246     {
247         return __builtin_ia32_blendvps(a, b, mask);
248     }
249     else static if (LDC_with_ARM64)
250     {
251         int4 shift;
252         shift = 31;
253         int4 lmask = cast(int4)mask >> shift;
254         return cast(__m128) vbslq_s32(lmask, cast(int4)b, cast(int4)a);
255     }
256     else
257     {
258         __m128 r; // PERF =void;
259         int4 lmask = cast(int4)mask;
260         for (int n = 0; n < 4; ++n)
261         {
262             r.ptr[n] = (lmask.array[n] < 0) ? b.array[n] : a.array[n];
263         }
264         return r;
265     }
266 }
267 unittest
268 {
269     __m128 A  = _mm_setr_ps( 0.0f, 1.0f, 2.0f, 3.0f);
270     __m128 B  = _mm_setr_ps( 4.0f, 5.0f, 6.0f, 7.0f);
271     __m128 M1 = _mm_setr_ps(-3.0f, 2.0f, 1.0f, -10000.0f);
272     __m128 M2 = _mm_setr_ps(float.nan, float.nan, -0.0f, +0.0f);
273     __m128 R1 = _mm_blendv_ps(A, B, M1);
274     __m128 R2 = _mm_blendv_ps(A, B, M2);
275     float[4] correct1 =    [ 4.0f, 1.0f, 2.0f, 7.0f];
276     float[4] correct2 =    [ 0.0f, 1.0f, 6.0f, 3.0f];
277     assert(R1.array == correct1);
278 
279     // Note: wouldn't work with -float.nan, since in some AArch64 archs the NaN sign bit is lost
280     // See Issue #78
281     assert(R2.array == correct2);
282 }
283 
284 /// Round the packed double-precision (64-bit) floating-point elements in `a` up to an integer value, 
285 /// and store the results as packed double-precision floating-point elements.
286 __m128d _mm_ceil_pd (__m128d a) @trusted
287 {
288     static if (LDC_with_ARM64)
289     {
290         // LDC arm64 acceptable since 1.8 -O2
291         // Unfortunately x86 intrinsics force a round-trip back to double2
292         // ARM neon semantics wouldn't have that
293         long2 l = vcvtpq_s64_f64(a);
294         double2 r;
295         r.ptr[0] = l.array[0];
296         r.ptr[1] = l.array[1];
297         return r;
298     }
299     else
300     {
301         return _mm_round_pd!2(a);
302     }
303 }
304 unittest
305 {
306     __m128d A = _mm_setr_pd(1.3f, -2.12f);
307     __m128d B = _mm_setr_pd(53.6f, -2.7f);
308     A = _mm_ceil_pd(A);
309     B = _mm_ceil_pd(B);
310     double[2] correctA = [2.0, -2.0];
311     double[2] correctB = [54.0, -2.0];
312     assert(A.array == correctA);
313     assert(B.array == correctB);
314 }
315 
316 /// Round the packed single-precision (32-bit) floating-point elements in `a` up to an integer value, 
317 /// and store the results as packed single-precision floating-point elements.
318 __m128 _mm_ceil_ps (__m128 a) @trusted
319 {
320     static if (LDC_with_ARM64)
321     {
322         // LDC arm64 acceptable since 1.8 -O1
323         int4 l = vcvtpq_s32_f32(a);
324         float4 r;
325         r.ptr[0] = l.array[0];
326         r.ptr[1] = l.array[1];
327         r.ptr[2] = l.array[2];
328         r.ptr[3] = l.array[3];
329         return r;
330     }
331     else
332     {
333         return _mm_round_ps!2(a);
334     }
335 }
336 unittest
337 {
338     __m128 A = _mm_setr_ps(1.3f, -2.12f, 53.6f, -2.7f);
339     __m128 C = _mm_ceil_ps(A);
340     float[4] correct = [2.0f, -2.0f, 54.0f, -2.0f];
341     assert(C.array == correct);
342 }
343 
344 /// Round the lower double-precision (64-bit) floating-point element in `b` up to an integer value, 
345 /// store the result as a double-precision floating-point element in the lower element of result, 
346 /// and copy the upper element from `a` to the upper element of dst.
347 __m128d _mm_ceil_sd (__m128d a, __m128d b) @trusted
348 {
349     static if (LDC_with_ARM64)
350     {
351         a[0] = vcvtps_s64_f64(b[0]);
352         return a;
353     }
354     else
355     {
356         return _mm_round_sd!2(a, b);
357     }
358 }
359 unittest
360 {
361     __m128d A = _mm_setr_pd(1.3, -2.12);
362     __m128d B = _mm_setr_pd(53.6, -3.7);
363     __m128d C = _mm_ceil_sd(A, B);
364     double[2] correct = [54.0, -2.12];
365     assert(C.array == correct);
366 }
367 
368 /// Round the lower single-precision (32-bit) floating-point element in `b` up to an integer value,
369 /// store the result as a single-precision floating-point element in the lower element of result, 
370 /// and copy the upper 3 packed elements from `a` to the upper elements of result.
371 __m128 _mm_ceil_ss (__m128 a, __m128 b) @trusted
372 {
373     static if (LDC_with_ARM64)
374     {
375         a[0] = vcvtps_s32_f32(b[0]);
376         return a;
377     }
378     else
379     {
380         return _mm_round_ss!2(a, b);
381     }
382 }
383 unittest
384 {
385     __m128 A = _mm_setr_ps(1.3f, -2.12f, -4.5f, 1.1f);
386     __m128 B = _mm_setr_ps(53.6f, -3.7f, 8.0f, 7.0f);
387     __m128 C = _mm_ceil_ss(A, B);
388     float[4] correct = [54.0f, -2.12f, -4.5f, 1.1f];
389     assert(C.array == correct);
390 }
391 
392 /// Compare packed 64-bit integers in `a` and `b` for equality.
393 __m128i _mm_cmpeq_epi64 (__m128i a, __m128i b) @trusted
394 {
395     // PERF DMD
396     static if (GDC_with_SSE41)
397     {
398         return cast(__m128i)__builtin_ia32_pcmpeqq(cast(long2)a, cast(long2)b);
399     }
400     else version(LDC)
401     {
402         // LDC x86: generates pcmpeqq since LDC 1.1 -O1
403         //     arm64: generates cmeq since LDC 1.8 -O1
404         return cast(__m128i) equalMask!long2(cast(long2)a, cast(long2)b);
405     }
406     else
407     {
408         // Clever pcmpeqd + pand use with LDC 1.24 -O2
409         long2 la = cast(long2)a;
410         long2 lb = cast(long2)b;
411         long2 res;
412         res.ptr[0] = (la.array[0] == lb.array[0]) ? -1 : 0;
413         res.ptr[1] = (la.array[1] == lb.array[1]) ? -1 : 0;
414         return cast(__m128i)res;
415     }
416 }
417 unittest
418 {
419     __m128i A = _mm_setr_epi64(-1, -2);
420     __m128i B = _mm_setr_epi64(-3, -2);
421     __m128i C = _mm_setr_epi64(-1, -4);
422     long2 AB = cast(long2) _mm_cmpeq_epi64(A, B);
423     long2 AC = cast(long2) _mm_cmpeq_epi64(A, C);
424     long[2] correct1 = [0, -1];
425     long[2] correct2 = [-1, 0];
426     assert(AB.array == correct1);
427     assert(AC.array == correct2);
428 }
429 
430 
431 /// Sign extend packed 16-bit integers in `a` to packed 32-bit integers.
432 __m128i _mm_cvtepi16_epi32 (__m128i a) @trusted
433 {
434     // PERF DMD
435     static if (GDC_with_SSE41)
436     {
437         return cast(__m128i)__builtin_ia32_pmovsxwd128(cast(short8)a);
438     }
439     else version(LDC)
440     {
441         // LDC x86: Generates pmovsxwd since LDC 1.1 -O0, also good in arm64
442         enum ir = `
443             %v = shufflevector <8 x i16> %0,<8 x i16> %0, <4 x i32> <i32 0, i32 1,i32 2, i32 3>
444             %r = sext <4 x i16> %v to <4 x i32>
445             ret <4 x i32> %r`;
        return cast(__m128i) LDCInlineIR!(ir, int4, short8)(cast(short8)a);
447     }
448     else
449     {
450         short8 sa = cast(short8)a;
451         int4 r;
452         r.ptr[0] = sa.array[0];
453         r.ptr[1] = sa.array[1];
454         r.ptr[2] = sa.array[2];
455         r.ptr[3] = sa.array[3];
456         return r;
457     }
458 }
459 unittest
460 {
461     __m128i A = _mm_setr_epi16(-1, 0, -32768, 32767, 0, 0, 0, 0);
462     int4 C = cast(int4) _mm_cvtepi16_epi32(A);
463     int[4] correct = [-1, 0, -32768, 32767];
464     assert(C.array == correct);
465 }
466 
467 /// Sign extend packed 16-bit integers in `a` to packed 64-bit integers.
468 __m128i _mm_cvtepi16_epi64 (__m128i a) @trusted
469 {
470     // PERF DMD
471     static if (GDC_with_SSE41)
472     {
473         return cast(__m128i)__builtin_ia32_pmovsxwq128(cast(short8)a);
474     }
475     else version(LDC)
476     {
477         // LDC x86: Generates pmovsxwq since LDC 1.1 -O0, also good in arm64
478         enum ir = `
479             %v = shufflevector <8 x i16> %0,<8 x i16> %0, <2 x i32> <i32 0, i32 1>
480             %r = sext <2 x i16> %v to <2 x i64>
481             ret <2 x i64> %r`;
482         return cast(__m128i) LDCInlineIR!(ir, long2, short8)(cast(short8)a);
483     }
484     else
485     {
486         short8 sa = cast(short8)a;
487         long2 r;
488         r.ptr[0] = sa.array[0];
489         r.ptr[1] = sa.array[1];
490         return cast(__m128i)r;
491     }
492 }
493 unittest
494 {
495     __m128i A = _mm_setr_epi16(-32768, 32767, 0, 0, 0, 0, 0, 0);
496     long2 C = cast(long2) _mm_cvtepi16_epi64(A);
497     long[2] correct = [-32768, 32767];
498     assert(C.array == correct);
499 }
500 
501 /// Sign extend packed 32-bit integers in `a` to packed 64-bit integers.
502 __m128i _mm_cvtepi32_epi64 (__m128i a) @trusted
503 {
504     // PERF DMD
505     static if (GDC_with_SSE41)
506     {
507         return cast(__m128i)__builtin_ia32_pmovsxdq128(cast(int4)a);
508     }
509     else version(LDC)
510     {
511         // LDC x86: Generates pmovsxdq since LDC 1.1 -O0, also good in arm64
512         enum ir = `
513             %v = shufflevector <4 x i32> %0,<4 x i32> %0, <2 x i32> <i32 0, i32 1>
514             %r = sext <2 x i32> %v to <2 x i64>
515             ret <2 x i64> %r`;
516         return cast(__m128i) LDCInlineIR!(ir, long2, int4)(cast(int4)a);
517     }
518     else
519     {
520         int4 sa = cast(int4)a;
521         long2 r;
522         r.ptr[0] = sa.array[0];
523         r.ptr[1] = sa.array[1];
524         return cast(__m128i)r;
525     }
526 }
527 unittest
528 {
529     __m128i A = _mm_setr_epi32(-4, 42, 0, 0);
530     long2 C = cast(long2) _mm_cvtepi32_epi64(A);
531     long[2] correct = [-4, 42];
532     assert(C.array == correct);
533 }
534 
535 
536 /// Sign extend packed 8-bit integers in `a` to packed 16-bit integers.
537 __m128i _mm_cvtepi8_epi16 (__m128i a) @trusted
538 {
539     // PERF DMD
540     static if (GDC_with_SSE41)
541     {
542         alias ubyte16 = __vector(ubyte[16]);
543         return cast(__m128i)__builtin_ia32_pmovsxbw128(cast(ubyte16)a);
544     }
545     else version(LDC)
546     {
547         // LDC x86: pmovsxbw generated since LDC 1.1.0 -O0 
548         // LDC ARM64: sshll generated since LDC 1.8.0 -O1
549         enum ir = `
550             %v = shufflevector <16 x i8> %0,<16 x i8> %0, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
551             %r = sext <8 x i8> %v to <8 x i16>
552             ret <8 x i16> %r`;
553         return cast(__m128i) LDCInlineIR!(ir, short8, byte16)(cast(byte16)a);
554     }
555     else
556     {
557         byte16 sa = cast(byte16)a;
558         short8 r;
559         foreach(n; 0..8)
560             r.ptr[n] = sa.array[n];
561         return cast(__m128i)r;
562     }
563 }
564 unittest
565 {
566     __m128i A = _mm_setr_epi8(127, -128, 1, -1, 0, 2, -4, -8, 0, 0, 0, 0, 0, 0, 0, 0);
567     short8 C = cast(short8) _mm_cvtepi8_epi16(A);
568     short[8] correct = [127, -128, 1, -1, 0, 2, -4, -8];
569     assert(C.array == correct);
570 }
571 
572 
573 /// Sign extend packed 8-bit integers in `a` to packed 32-bit integers.
574 __m128i _mm_cvtepi8_epi32 (__m128i a) @trusted
575 {
576     // PERF DMD
577     static if (GDC_with_SSE41)
578     {
579         alias ubyte16 = __vector(ubyte[16]);
580         return cast(__m128i)__builtin_ia32_pmovsxbd128(cast(ubyte16)a);
581     }
582     else static if (LDC_with_SSE41)
583     {
584         // LDC x86: Generates pmovsxbd since LDC 1.1 -O0
585         enum ir = `
586             %v = shufflevector <16 x i8> %0,<16 x i8> %0, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
587             %r = sext <4 x i8> %v to <4 x i32>
588             ret <4 x i32> %r`;
589         return cast(__m128i) LDCInlineIR!(ir, int4, byte16)(cast(byte16)a);
590     }
591     else
592     {
593         // LDC ARM64: this gives the same codegen than a vmovl_s16/vmovl_s8 sequence would
594         byte16 sa = cast(byte16)a;
595         int4 r;
596         r.ptr[0] = sa.array[0];
597         r.ptr[1] = sa.array[1];
598         r.ptr[2] = sa.array[2];
599         r.ptr[3] = sa.array[3];
600         return cast(__m128i)r;
601     }
602 }
603 unittest
604 {
605     __m128i A = _mm_setr_epi8(127, -128, 1, -1, 0, 2, -4, -8, 0, 0, 0, 0, 0, 0, 0, 0);
606     int4 C = cast(int4) _mm_cvtepi8_epi32(A);
607     int[4] correct = [127, -128, 1, -1];
608     assert(C.array == correct);
609 }
610 
611 
612 /// Sign extend packed 8-bit integers in the low 8 bytes of `a` to packed 64-bit integers.
613 __m128i _mm_cvtepi8_epi64 (__m128i a) @trusted
614 {
615     // PERF DMD
616     static if (GDC_with_SSE41)
617     {
618         alias ubyte16 = __vector(ubyte[16]);
619         return cast(__m128i)__builtin_ia32_pmovsxbq128(cast(ubyte16)a);
620     }
621     else version(LDC)
622     {
623         // LDC x86: Generates pmovsxbq since LDC 1.1 -O0, 
624         // LDC arm64: it's ok since LDC 1.8 -O1
625         enum ir = `
626             %v = shufflevector <16 x i8> %0,<16 x i8> %0, <2 x i32> <i32 0, i32 1>
627             %r = sext <2 x i8> %v to <2 x i64>
628             ret <2 x i64> %r`;
629         return cast(__m128i) LDCInlineIR!(ir, long2, byte16)(cast(byte16)a);
630     }
631     else
632     {
633         byte16 sa = cast(byte16)a;
634         long2 r;
635         foreach(n; 0..2)
636             r.ptr[n] = sa.array[n];
637         return cast(__m128i)r;
638     }
639 }
640 unittest
641 {
642     __m128i A = _mm_setr_epi8(127, -128, 1, -1, 0, 2, -4, -8, 0, 0, 0, 0, 0, 0, 0, 0);
643     long2 C = cast(long2) _mm_cvtepi8_epi64(A);
644     long[2] correct = [127, -128];
645     assert(C.array == correct);
646 }
647 
648 
649 /// Zero extend packed unsigned 16-bit integers in `a` to packed 32-bit integers.
650 __m128i _mm_cvtepu16_epi32 (__m128i a) @trusted
651 {
652     // PERF DMD
653     static if (GDC_with_SSE41)
654     {
655         return cast(__m128i) __builtin_ia32_pmovzxwd128(cast(short8)a);
656     }
657     else
658     {
659         // LDC x86: generates pmovzxwd since LDC 1.12 -O1 also good without SSE4.1
660         //     arm64: ushll since LDC 1.12 -O1
661         short8 sa = cast(short8)a;
662         int4 r;
663         r.ptr[0] = cast(ushort)sa.array[0];
664         r.ptr[1] = cast(ushort)sa.array[1];
665         r.ptr[2] = cast(ushort)sa.array[2];
666         r.ptr[3] = cast(ushort)sa.array[3];
667         return cast(__m128i)r;
668     }
669 }
670 unittest
671 {
672     __m128i A = _mm_setr_epi16(-1, 0, -32768, 32767, 0, 0, 0, 0);
673     int4 C = cast(int4) _mm_cvtepu16_epi32(A);
674     int[4] correct = [65535, 0, 32768, 32767];
675     assert(C.array == correct);
676 }
677 
678 
679 /// Zero extend packed unsigned 16-bit integers in `a` to packed 64-bit integers.
680 __m128i _mm_cvtepu16_epi64 (__m128i a) @trusted
681 {
682     // PERF DMD
683     static if (GDC_with_SSE41)
684     {
685         return cast(__m128i) __builtin_ia32_pmovzxwq128(cast(short8)a);
686     }
687     else static if (LDC_with_ARM64)
688     {
689         // LDC arm64: a bit shorter than below, in -O2
690         short8 sa = cast(short8)a;
691         long2 r;
692         for(int n = 0; n < 2; ++n)
693             r.ptr[n] = cast(ushort)sa.array[n];
694         return cast(__m128i)r;
695     }
696     else
697     {
698         // LDC x86: generates pmovzxwd since LDC 1.12 -O1 also good without SSE4.1
699         short8 sa = cast(short8)a;
700         long2 r;
701         r.ptr[0] = cast(ushort)sa.array[0];
702         r.ptr[1] = cast(ushort)sa.array[1];
703         return cast(__m128i)r;
704     }
705 }
706 unittest
707 {
708     __m128i A = _mm_setr_epi16(-1, 0, -32768, 32767, 0, 0, 0, 0);
709     long2 C = cast(long2) _mm_cvtepu16_epi64(A);
710     long[2] correct = [65535, 0];
711     assert(C.array == correct);
712 }
713 
714 
715 /// Zero extend packed unsigned 32-bit integers in `a` to packed 64-bit integers.
716 __m128i _mm_cvtepu32_epi64 (__m128i a) @trusted
717 {
718     // PERF DMD
719     static if (GDC_with_SSE41)
720     {
721         return cast(__m128i) __builtin_ia32_pmovzxdq128(cast(short8)a);
722     }
723     else
724     {
725         // LDC x86: generates pmovzxdq since LDC 1.12 -O1 also good without SSE4.1
726         //     arm64: generates ushll since LDC 1.12 -O1
727         int4 sa = cast(int4)a;
728         long2 r;
729         r.ptr[0] = cast(uint)sa.array[0];
730         r.ptr[1] = cast(uint)sa.array[1];
731         return cast(__m128i)r;
732     }
733 }
734 unittest
735 {
736     __m128i A = _mm_setr_epi32(-1, 42, 0, 0);
737     long2 C = cast(long2) _mm_cvtepu32_epi64(A);
738     long[2] correct = [4294967295, 42];
739     assert(C.array == correct);
740 }
741 
742 
743 /// Zero extend packed unsigned 8-bit integers in `a` to packed 16-bit integers.
744 __m128i _mm_cvtepu8_epi16 (__m128i a) @trusted
745 {
746     // PERF DMD
747     static if (GDC_with_SSE41)
748     {
749         return cast(__m128i) __builtin_ia32_pmovzxbw128(cast(ubyte16)a);
750     }
751     else
752     {
753         // LDC x86: generates pmovzxbw since LDC 1.12 -O1 also good without SSE4.1
754         //     arm64: ushll since LDC 1.12 -O1
755         // PERF: catastrophic with GDC without SSE4.1
756         byte16 sa = cast(byte16)a;
757         short8 r;
758         r.ptr[0] = cast(ubyte)sa.array[0];
759         r.ptr[1] = cast(ubyte)sa.array[1];
760         r.ptr[2] = cast(ubyte)sa.array[2];
761         r.ptr[3] = cast(ubyte)sa.array[3];
762         r.ptr[4] = cast(ubyte)sa.array[4];
763         r.ptr[5] = cast(ubyte)sa.array[5];
764         r.ptr[6] = cast(ubyte)sa.array[6];
765         r.ptr[7] = cast(ubyte)sa.array[7];
766         return cast(__m128i)r;
767     }
768 }
769 unittest
770 {
771     __m128i A = _mm_setr_epi8(127, -128, 1, -1, 0, 2, -4, -8, 0, 0, 0, 0, 0, 0, 0, 0);
772     short8 C = cast(short8) _mm_cvtepu8_epi16(A);
773     short[8] correct = [127, 128, 1, 255, 0, 2, 252, 248];
774     assert(C.array == correct);
775 }
776 
777 
778 /// Zero extend packed unsigned 8-bit integers in `a` to packed 32-bit integers.
779 __m128i _mm_cvtepu8_epi32 (__m128i a) @trusted
780 {
781     // PERF DMD
782     static if (GDC_with_SSE41)
783     {
784         alias ubyte16 = __vector(ubyte[16]);
785         return cast(__m128i) __builtin_ia32_pmovzxbd128(cast(ubyte16)a);
786     }
787     else static if (LDC_with_ARM64)
788     {
789         // LDC arm64: a bit better than below in -O2
790         byte16 sa = cast(byte16)a;
791         int4 r;
792         for(int n = 0; n < 4; ++n) 
793             r.ptr[n] = cast(ubyte)sa.array[n];
794         return cast(__m128i)r;
795     }
796     else
797     {
798         // LDC x86: generates pmovzxbd since LDC 1.12 -O1 also good without SSE4.1
799         // PERF: catastrophic with GDC without SSE4.1
800         byte16 sa = cast(byte16)a;
801         int4 r;
802         r.ptr[0] = cast(ubyte)sa.array[0];
803         r.ptr[1] = cast(ubyte)sa.array[1];
804         r.ptr[2] = cast(ubyte)sa.array[2];
805         r.ptr[3] = cast(ubyte)sa.array[3];
806         return cast(__m128i)r;
807     }
808 }
809 unittest
810 {
811     __m128i A = _mm_setr_epi8(127, -128, 1, -1, 0, 2, -4, -8, 0, 0, 0, 0, 0, 0, 0, 0);
812     int4 C = cast(int4) _mm_cvtepu8_epi32(A);
813     int[4] correct = [127, 128, 1, 255];
814     assert(C.array == correct);
815 }
816 
817 /// Zero extend packed unsigned 8-bit integers in the low 8 bytes of `a` to packed 64-bit integers.
818 __m128i _mm_cvtepu8_epi64 (__m128i a) @trusted
819 {
820     // PERF DMD
821     static if (GDC_with_SSE41)
822     {
823         alias ubyte16 = __vector(ubyte[16]);
824         return cast(__m128i)__builtin_ia32_pmovzxbq128(cast(ubyte16)a);
825     }
826     else static if (LDC_with_ARM64)
827     {
828         // LDC arm64: this optimizes better than the loop below
829         byte16 sa = cast(byte16)a;
830         long2 r;
831         for (int n = 0; n < 2; ++n)
832             r.ptr[n] = cast(ubyte)sa.array[n];
833         return cast(__m128i)r;
834     }
835     else
836     {
837         // LDC x86: Generates pmovzxbq since LDC 1.1 -O0, a pshufb without SSE4.1
838         byte16 sa = cast(byte16)a;
839         long2 r;
840         r.ptr[0] = cast(ubyte)sa.array[0];
841         r.ptr[1] = cast(ubyte)sa.array[1];
842         return cast(__m128i)r;
843     }
844 }
845 unittest
846 {
847     __m128i A = _mm_setr_epi8(127, -2, 1, -1, 0, 2, -4, -8, 0, 0, 0, 0, 0, 0, 0, 0);
848     long2 C = cast(long2) _mm_cvtepu8_epi64(A);
849     long[2] correct = [127, 254];
850     assert(C.array == correct);
851 }
852 
853 /// Conditionally multiply the packed double-precision (64-bit) floating-point elements 
/// in `a` and `b` using the high 4 bits in `imm8`, sum the two products, and conditionally
855 /// store the sum in dst using the low 4 bits of `imm8`.
856 __m128d _mm_dp_pd(int imm8)(__m128d a, __m128d b) @trusted
857 {
858     // PERF DMD
859     static if (GDC_with_SSE41)
860     {
861         return __builtin_ia32_dppd(a, b, imm8 & 0x33);
862     }
863     else static if (LDC_with_SSE41)
864     {
865         return __builtin_ia32_dppd(a, b, imm8 & 0x33);
866     }
867     else
868     {
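        // Fallback: the high nibble of imm8 selects which products take part in the sum
        // (first blend), and the low nibble selects which result lanes receive that sum.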
869         __m128d zero = _mm_setzero_pd();
870         __m128d temp = _mm_blend_pd!( (imm8 >>> 4) & 3)(zero, a * b);
871         double sum = temp.array[0] + temp.array[1];
872         return _mm_blend_pd!(imm8 & 3)(zero, _mm_set1_pd(sum));
873     }
874 }
875 unittest
876 {
877     __m128d A = _mm_setr_pd(1.0, 2.0);
878     __m128d B = _mm_setr_pd(4.0, 8.0);
879     double2 R1 = _mm_dp_pd!(0x10 + 0x3 + 0x44)(A, B);
880     double2 R2 = _mm_dp_pd!(0x20 + 0x1 + 0x88)(A, B);
881     double2 R3 = _mm_dp_pd!(0x30 + 0x2 + 0x00)(A, B);
882     double[2] correct1 = [ 4.0,  4.0];
883     double[2] correct2 = [16.0,  0.0];
884     double[2] correct3 = [ 0.0, 20.0];
885     assert(R1.array == correct1);
886     assert(R2.array == correct2);
887     assert(R3.array == correct3);
888 }
889 
890 /// Conditionally multiply the packed single-precision (32-bit) floating-point elements 
891 /// in `a` and `b` using the high 4 bits in `imm8`, sum the four products, 
892 /// and conditionally store the sum in result using the low 4 bits of `imm8`.
893 __m128 _mm_dp_ps(int imm8)(__m128 a, __m128 b) @trusted
894 {
895       // PERF DMD
896     static if (GDC_with_SSE41)
897     {
898         return __builtin_ia32_dpps(a, b, cast(ubyte)imm8);
899     }
900     else static if (LDC_with_SSE41)
901     {
902         return __builtin_ia32_dpps(a, b, cast(byte)imm8);
903     }
904     else
905     {
906         __m128 zero = _mm_setzero_ps();
907         __m128 temp = _mm_blend_ps!( (imm8 >>> 4) & 15)(zero, a * b);
908         float sum = temp.array[0] + temp.array[1] + temp.array[2] + temp.array[3];
909         return _mm_blend_ps!(imm8 & 15)(zero, _mm_set1_ps(sum));
910     }        
911 }
912 unittest
913 {
914     __m128 A = _mm_setr_ps(1.0f, 2.0f, 4.0f, 8.0f);
915     __m128 B = _mm_setr_ps(9.0f, 7.0f, 5.0f, 3.0f);
916     float4 R1 = _mm_dp_ps!(0xf0 + 0xf)(A, B);
917     float4 R2 = _mm_dp_ps!(0x30 + 0x5)(A, B);
918     float4 R3 = _mm_dp_ps!(0x50 + 0xa)(A, B);
919     float[4] correct1 =   [67.0f, 67.0f, 67.0f, 67.0f];
920     float[4] correct2 =   [23.0f, 0.0f, 23.0f, 0.0f];
921     float[4] correct3 =   [0.0f, 29.0f, 0.0f, 29.0f];
922     assert(R1.array == correct1);
923     assert(R2.array == correct2);
924     assert(R3.array == correct3);
925 }
926 
927 
928 /// Extract a 32-bit integer from `a`, selected with `imm8`.
929 int _mm_extract_epi32 (__m128i a, const int imm8) pure @trusted
930 {
931     return (cast(int4)a).array[imm8 & 3];
932 }
933 unittest
934 {
935     __m128i A = _mm_setr_epi32(1, 2, 3, 4);
936     assert(_mm_extract_epi32(A, 0) == 1);
937     assert(_mm_extract_epi32(A, 1 + 8) == 2);
938     assert(_mm_extract_epi32(A, 3 + 4) == 4);
939 }
940 
941 /// Extract a 64-bit integer from `a`, selected with `imm8`.
942 long _mm_extract_epi64 (__m128i a, const int imm8) pure @trusted
943 {
944     long2 la = cast(long2)a;
945     return la.array[imm8 & 1];
946 }
947 unittest
948 {
949     __m128i A = _mm_setr_epi64(45, -67);
950     assert(_mm_extract_epi64(A, 0) == 45);
951     assert(_mm_extract_epi64(A, 1) == -67);
952     assert(_mm_extract_epi64(A, 2) == 45);
953 }
954 
955 /// Extract an 8-bit integer from `a`, selected with `imm8`.
/// Warning: the returned value is zero-extended to 32 bits.
957 int _mm_extract_epi8 (__m128i a, const int imm8) @trusted
958 {
959     byte16 ba = cast(byte16)a;
960     return cast(ubyte) ba.array[imm8 & 15];
961 }
962 unittest
963 {
964     __m128i A = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, -1, 14, 15);
965     assert(_mm_extract_epi8(A, 7) == 7);
966     assert(_mm_extract_epi8(A, 13) == 255);
967     assert(_mm_extract_epi8(A, 7 + 16) == 7);
968 }
969 
970 /// Extract a single-precision (32-bit) floating-point element from `a`, selected with `imm8`.
971 /// Note: returns a 32-bit $(I integer).
972 int _mm_extract_ps (__m128 a, const int imm8) @trusted
973 {
974     return (cast(int4)a).array[imm8 & 3];
975 }
976 unittest
977 {
978     __m128 A = _mm_setr_ps(1.0f, 2.0f, 3.0f, -4.0f);
979     assert(_mm_extract_ps(A, 0) == 0x3f800000);
980     assert(_mm_extract_ps(A, 1 + 8) == 0x40000000);
981     assert(_mm_extract_ps(A, 3 + 4) == cast(int)0xc0800000);
982 }
983 
984 
985 
986 /// Round the packed double-precision (64-bit) floating-point elements in `a` down to an 
987 /// integer value, and store the results as packed double-precision floating-point elements.
988 __m128d _mm_floor_pd (__m128d a) @trusted
989 {
990     static if (LDC_with_ARM64)
991     {
992         // LDC arm64 acceptable since 1.8 -O2
993         long2 l = vcvtmq_s64_f64(a);
994         double2 r;
995         r.ptr[0] = l.array[0];
996         r.ptr[1] = l.array[1];
997         return r;
998     }
999     else
1000     {
1001         return _mm_round_pd!1(a);
1002     }
1003 }
1004 unittest
1005 {
1006     __m128d A = _mm_setr_pd(1.3f, -2.12f);
1007     __m128d B = _mm_setr_pd(53.6f, -2.7f);
1008     A = _mm_floor_pd(A);
1009     B = _mm_floor_pd(B);
1010     double[2] correctA = [1.0, -3.0];
1011     double[2] correctB = [53.0, -3.0];
1012     assert(A.array == correctA);
1013     assert(B.array == correctB);
1014 }
1015 
1016 /// Round the packed single-precision (32-bit) floating-point elements in `a` down to an 
1017 /// integer value, and store the results as packed single-precision floating-point elements.
1018 __m128 _mm_floor_ps (__m128 a) @trusted
1019 {
1020     static if (LDC_with_ARM64)
1021     {
1022         // LDC arm64 acceptable since 1.8 -O1
1023         int4 l = vcvtmq_s32_f32(a);
1024         float4 r;
1025         r.ptr[0] = l.array[0];
1026         r.ptr[1] = l.array[1];
1027         r.ptr[2] = l.array[2];
1028         r.ptr[3] = l.array[3];
1029         return r;
1030     }
1031     else
1032     {
1033         return _mm_round_ps!1(a);
1034     }
1035 }
1036 unittest
1037 {
1038     __m128 A = _mm_setr_ps(1.3f, -2.12f, 53.6f, -2.7f);
1039     __m128 C = _mm_floor_ps(A);
1040     float[4] correct = [1.0f, -3.0f, 53.0f, -3.0f];
1041     assert(C.array == correct);
1042 }
1043 
1044 /// Round the lower double-precision (64-bit) floating-point element in `b` down to an 
1045 /// integer value, store the result as a double-precision floating-point element in the 
1046 /// lower element, and copy the upper element from `a` to the upper element.
1047 __m128d _mm_floor_sd (__m128d a, __m128d b) @trusted
1048 {
1049     static if (LDC_with_ARM64)
1050     {
1051         a[0] = vcvtms_s64_f64(b[0]);
1052         return a;
1053     }
1054     else
1055     {
1056         return _mm_round_sd!1(a, b);
1057     }
1058 }
1059 unittest
1060 {
1061     __m128d A = _mm_setr_pd(1.3, -2.12);
1062     __m128d B = _mm_setr_pd(-53.1, -3.7);
1063     __m128d C = _mm_floor_sd(A, B);
1064     double[2] correct = [-54.0, -2.12];
1065     assert(C.array == correct);
1066 }
1067 
1068 /// Round the lower single-precision (32-bit) floating-point element in `b` down to an
1069 /// integer value, store the result as a single-precision floating-point element in the
1070 /// lower element, and copy the upper 3 packed elements from `a` to the upper elements.
1071 __m128 _mm_floor_ss (__m128 a, __m128 b) @trusted
1072 {
1073     static if (LDC_with_ARM64)
1074     {
1075         a[0] = vcvtms_s32_f32(b[0]);
1076         return a;
1077     }
1078     else
1079     {
1080         return _mm_round_ss!1(a, b);
1081     }
1082 }
1083 unittest
1084 {
1085     __m128 A = _mm_setr_ps(1.3f, -2.12f, -4.5f, 1.1f);
1086     __m128 B = _mm_setr_ps(-539.3f, -3.7f, 8.0f, 7.0f);
1087     __m128 C = _mm_floor_ss(A, B);
1088     float[4] correct = [-540.0f, -2.12f, -4.5f, 1.1f];
1089     assert(C.array == correct);
1090 }
1091 
1092 /// Insert the 32-bit integer `i` into `a` at the location specified by `imm8[1:0]`.
1093 __m128i _mm_insert_epi32 (__m128i a, int i, const int imm8) pure @trusted
1094 {
1095     // GDC: nothing special to do, pinsrd generated with -O1 -msse4.1
    // LDC x86: pinsrd since LDC 1.1 -O2 with -mattr=+sse4.1
1097     // LDC arm64: ins.s since LDC 1.8 -O2
1098     int4 ia = cast(int4)a;
1099     ia.ptr[imm8 & 3] = i;
1100     return cast(__m128i)ia; 
1101 }
1102 unittest
1103 {
1104     __m128i A = _mm_setr_epi32(1, 2, 3, 4);
1105     int4 C = cast(int4) _mm_insert_epi32(A, 5, 2 + 4);
1106     int[4] result = [1, 2, 5, 4];
1107     assert(C.array == result);
1108 }
1109 
1110 /// Insert the 64-bit integer `i` into `a` at the location specified by `imm8[0]`.
1111 __m128i _mm_insert_epi64 (__m128i a, long i, const int imm8) pure @trusted
1112 {
    // GDC: nothing special to do, pinsrq generated with -O1 -msse4.1
    // LDC x86: always does something sensible.
1115     long2 la = cast(long2)a;
1116     la.ptr[imm8 & 1] = i;
1117     return cast(__m128i)la;
1118 }
1119 unittest
1120 {
1121     __m128i A = _mm_setr_epi64(1, 2);
1122     long2 C = cast(long2) _mm_insert_epi64(A, 5, 1 + 2);
1123     long[2] result = [1, 5];
1124     assert(C.array == result);
1125 }
1126 
1127 /// Insert the 8-bit integer `i` into `a` at the location specified by `imm8[2:0]`.
1128 /// Copy a to dst, and insert the lower 8-bit integer from i into dst at the location specified by imm8.
1129 __m128i _mm_insert_epi8 (__m128i a, int i, const int imm8) @trusted
1130 {
1131     // GDC: nothing special to do, pinsrb generated with -O1 -msse4.1
1132     // LDC x86: doesn't do pinsrb, maybe it's slower. arm64 also spills to memory.
1133     byte16 ba = cast(byte16)a;
1134     ba.ptr[imm8 & 15] = cast(byte)i;
1135     return cast(__m128i)ba; 
1136 }
1137 unittest
1138 {
1139     __m128i A = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
1140     byte16 C = cast(byte16) _mm_insert_epi8(A, 30, 4 + 16);
1141     byte[16] result = [0, 1, 2, 3, 30, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15];
1142     assert(C.array == result);
1143 }
1144 
1145 
1146 /// Warning: of course it does something totally different from `_mm_insert_epi32`!
1147 /// Copy `a` to `tmp`, then insert a single-precision (32-bit) floating-point element from `b` 
1148 /// into `tmp` using the control in `imm8`. Store `tmp` to result using the mask in `imm8[3:0]` 
1149 /// (elements are zeroed out when the corresponding bit is set).
1150 __m128 _mm_insert_ps(int imm8)(__m128 a, __m128 b) @trusted
1151 {
1152     // PERF DMD
1153     static if (GDC_with_SSE41)
1154     {
1155         return __builtin_ia32_insertps128(a, b, cast(ubyte)imm8);
1156     }
1157     else static if (LDC_with_SSE41)
1158     {
1159         return __builtin_ia32_insertps128(a, b, cast(byte)imm8);
1160     }
1161     else
1162     {
1163         float4 tmp2 = a;
1164         float tmp1 = b.array[(imm8 >> 6) & 3];
1165         tmp2.ptr[(imm8 >> 4) & 3] = tmp1;
1166         return _mm_blend_ps!(imm8 & 15)(tmp2, _mm_setzero_ps());
1167     }
1168 }
1169 unittest
1170 {
1171     __m128 A = _mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f);
1172     __m128 B = _mm_setr_ps(5.0f, 6.0f, 7.0f, 8.0f);
1173     __m128 C = _mm_insert_ps!(128 + (32 + 16) + 4)(A, B);
1174     float[4] correct =    [1.0f, 2.0f, 0.0f, 7.0f];
1175     assert(C.array == correct);
1176 }
1177 
1178 
1179 /// Compare packed signed 32-bit integers in `a` and `b`, returns packed maximum values.
1180 __m128i _mm_max_epi32 (__m128i a, __m128i b) @trusted
1181 {
1182     static if (GDC_with_SSE41)
1183     {
1184         return cast(__m128i) __builtin_ia32_pmaxsd128(cast(int4)a, cast(int4)b);
1185     }
1186     else version(LDC)
1187     {
1188         // x86: pmaxsd since LDC 1.1 -O1
        // ARM: smax.4s since LDC 1.8 -O1
1190         int4 sa = cast(int4)a;
1191         int4 sb = cast(int4)b;
1192         int4 greater = greaterMask!int4(sa, sb);
1193         return cast(__m128i)( (greater & sa) | (~greater & sb) );
1194     }
1195     else
1196     {
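        // Branchless max: mask = (a ^ b) & (a > b), so b ^ mask is a where a > b, else b.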
1197         __m128i higher = _mm_cmpgt_epi32(a, b);
1198         __m128i aTob = _mm_xor_si128(a, b); // a ^ (a ^ b) == b
1199         __m128i mask = _mm_and_si128(aTob, higher);
1200         return _mm_xor_si128(b, mask);
1201     }
1202 }
1203 unittest
1204 {
1205     int4 R = cast(int4) _mm_max_epi32(_mm_setr_epi32(0x7fffffff, 1, -4, 7),
1206                                       _mm_setr_epi32(        -4,-8,  9, -8));
1207     int[4] correct =                               [0x7fffffff, 1,  9,  7];
1208     assert(R.array == correct);
1209 }
1210 
1211 /// Compare packed signed 8-bit integers in `a` and `b`, 
1212 /// and return packed maximum values.
1213 __m128i _mm_max_epi8 (__m128i a, __m128i b) @trusted
1214 {
1215     // PERF DMD
1216     static if (GDC_with_SSE41)
1217     {
1218         return cast(__m128i) __builtin_ia32_pmaxsb128(cast(ubyte16)a, cast(ubyte16)b);
1219     }
1220     else version(LDC)
1221     {
1222         // x86: pmaxsb since LDC 1.1 -O1
1223         // ARM64: smax.16b since LDC 1.8.0 -O1
1224         byte16 sa = cast(byte16)a;
1225         byte16 sb = cast(byte16)b;
1226         byte16 greater = cast(byte16) greaterMask!byte16(sa, sb);
1227         return cast(__m128i)( (greater & sa) | (~greater & sb) );
1228     }
1229     else
1230     {
1231         __m128i lower = _mm_cmpgt_epi8(a, b); // ones where a should be selected, b else
1232         __m128i aTob = _mm_xor_si128(a, b); // a ^ (a ^ b) == b
1233         __m128i mask = _mm_and_si128(aTob, lower);
1234         return _mm_xor_si128(b, mask);
1235     }
1236 }
1237 unittest
1238 {
1239     __m128i A = _mm_setr_epi8(127,  1, -4, -8, 9,    7, 0, 57, 0, 0, 0, 0, 0, 0, 0, 0);
1240     __m128i B = _mm_setr_epi8(  4, -8,  9, -7, 0, -128, 0,  0, 0, 0, 0, 0, 0, 0, 0, 0);
1241     byte16 R = cast(byte16) _mm_max_epi8(A, B);
1242     byte[16] correct =       [127,  1,  9, -7, 9,    7, 0, 57, 0, 0, 0, 0, 0, 0, 0, 0];
1243     assert(R.array == correct);
1244 }
1245 
1246 /// Compare packed unsigned 16-bit integers in `a` and `b`, returns packed maximum values.
1247 __m128i _mm_max_epu16 (__m128i a, __m128i b) @trusted
1248 {
1249     // PERF DMD
1250     static if (GDC_with_SSE41)
1251     {
1252         return cast(__m128i) __builtin_ia32_pmaxuw128(cast(short8)a, cast(short8)b);
1253     }
1254     else version(LDC)
1255     {
1256         // x86: pmaxuw since LDC 1.1 -O1
1257         // ARM64: umax.8h since LDC 1.8.0 -O1
1258         // PERF: without sse4.1, LLVM 12 produces a very interesting
1259         //          psubusw xmm0, xmm1
1260         //          paddw   xmm0, xmm1
1261         //       sequence that maybe should go in other min/max intrinsics? 
1262         ushort8 sa = cast(ushort8)a;
1263         ushort8 sb = cast(ushort8)b;
1264         ushort8 greater = cast(ushort8) greaterMask!ushort8(sa, sb);
1265         return cast(__m128i)( (greater & sa) | (~greater & sb) );
1266     }
1267     else
1268     {
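        // Unsigned max via saturation: subs_epu16(b, a) == max(b - a, 0),
        // so adding a back yields max(a, b) without needing an unsigned compare.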
1269         b = _mm_subs_epu16(b, a);
1270         b = _mm_add_epi16(b, a);
1271         return b;
1272     }
1273 }
1274 unittest
1275 {
1276     short8 R = cast(short8) _mm_max_epu16(_mm_setr_epi16(32767,  1, -4, -8, 9,     7, 0, 57),
1277                                           _mm_setr_epi16(   -4, -8,  9, -7, 0,-32768, 0,  0));
1278     short[8] correct =                                  [   -4, -8, -4, -7, 9,-32768, 0, 57];
1279     assert(R.array == correct);
1280 }
1281 
1282 /// Compare packed unsigned 32-bit integers in `a` and `b`, returns packed maximum values.
1283 __m128i _mm_max_epu32 (__m128i a, __m128i b) @trusted
1284 {
1285     // PERF DMD
1286     static if (GDC_with_SSE41)
1287     {
1288         return cast(__m128i) __builtin_ia32_pmaxud128(cast(int4)a, cast(int4)b);
1289     }
1290     else version(LDC)
1291     {
1292         // x86: pmaxud since LDC 1.1 -O1, also good without sse4.1
1293         // ARM64: umax.4s since LDC 1.8.0 -O1
1294         uint4 sa = cast(uint4)a;
1295         uint4 sb = cast(uint4)b;
1296         uint4 greater = cast(uint4) greaterMask!uint4(sa, sb);
1297         return cast(__m128i)( (greater & sa) | (~greater & sb) );
1298     }
1299     else
1300     {
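        // SSE2 only has a signed 32-bit compare; adding 0x80000000 to both operands
        // flips their sign bits so that the signed compare orders them as unsigned.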
1301         __m128i valueShift = _mm_set1_epi32(-0x80000000);
1302         __m128i higher = _mm_cmpgt_epi32(_mm_add_epi32(a, valueShift), _mm_add_epi32(b, valueShift));
1303         __m128i aTob = _mm_xor_si128(a, b); // a ^ (a ^ b) == b
1304         __m128i mask = _mm_and_si128(aTob, higher);
1305         return _mm_xor_si128(b, mask);
1306     }
1307 }
1308 unittest
1309 {
1310     int4 R = cast(int4) _mm_max_epu32(_mm_setr_epi32(0x7fffffff, 1,  4, -7),
1311                                       _mm_setr_epi32(        -4,-8,  9, -8));
1312     int[4] correct =                                [        -4,-8,  9, -7];
1313     assert(R.array == correct);
1314 }
1315 
/// Compare packed signed 32-bit integers in `a` and `b`, returns packed minimum values.
1317 __m128i _mm_min_epi32 (__m128i a, __m128i b) @trusted
1318 {
1319     // PERF DMD
1320     static if (GDC_with_SSE41)
1321     {
1322         return cast(__m128i) __builtin_ia32_pminsd128(cast(int4)a, cast(int4)b);
1323     }
1324     else version(LDC)
1325     {
1326         // x86: pminsd since LDC 1.1 -O1, also good without sse4.1
        // ARM: smin.4s since LDC 1.8 -O1
1328         int4 sa = cast(int4)a;
1329         int4 sb = cast(int4)b;
1330         int4 greater = greaterMask!int4(sa, sb);
1331         return cast(__m128i)( (~greater & sa) | (greater & sb) );
1332     }
1333     else
1334     {
1335         __m128i higher = _mm_cmplt_epi32(a, b);
1336         __m128i aTob = _mm_xor_si128(a, b); // a ^ (a ^ b) == b
1337         __m128i mask = _mm_and_si128(aTob, higher);
1338         return _mm_xor_si128(b, mask);
1339     }
1340 }
1341 unittest
1342 {
1343     int4 R = cast(int4) _mm_min_epi32(_mm_setr_epi32(0x7fffffff,  1, -4, 7),
1344                                       _mm_setr_epi32(        -4, -8,  9, -8));
1345     int[4] correct =                               [         -4, -8, -4, -8];
1346     assert(R.array == correct);
1347 }
1348 
1349 /// Compare packed signed 8-bit integers in `a` and `b`, 
1350 /// and return packed minimum values.
1351 __m128i _mm_min_epi8 (__m128i a, __m128i b) @trusted
1352 {
1353     // PERF DMD
1354     static if (GDC_with_SSE41)
1355     {
1356         return cast(__m128i) __builtin_ia32_pminsb128(cast(ubyte16)a, cast(ubyte16)b);
1357     }
1358     else version(LDC)
1359     {
1360         // x86: pminsb since LDC 1.1 -O1
1361         // ARM64: smin.16b since LDC 1.8.0 -O1
1362         byte16 sa = cast(byte16)a;
1363         byte16 sb = cast(byte16)b;
1364         byte16 greater = cast(byte16) greaterMask!byte16(sa, sb);
1365         return cast(__m128i)( (~greater & sa) | (greater & sb) );
1366     }
1367     else
1368     {
1369         __m128i lower = _mm_cmplt_epi8(a, b); // ones where a should be selected, b else
1370         __m128i aTob = _mm_xor_si128(a, b); // a ^ (a ^ b) == b
1371         __m128i mask = _mm_and_si128(aTob, lower);
1372         return _mm_xor_si128(b, mask);
1373     }
1374 }
1375 unittest
1376 {
1377     __m128i A = _mm_setr_epi8(127,  1, -4, -8, 9,    7, 0, 57, 0, 0, 0, 0, 0, 0, 0, 0);
1378     __m128i B = _mm_setr_epi8(  4, -8,  9, -7, 0, -128, 0,  0, 0, 0, 0, 0, 0, 0, 0, 0);
1379     byte16 R = cast(byte16) _mm_min_epi8(A, B);
1380     byte[16] correct =       [  4, -8, -4, -8, 0, -128, 0,  0, 0, 0, 0, 0, 0, 0, 0, 0];
1381     assert(R.array == correct);
1382 }
1383 
1384 /// Compare packed unsigned 16-bit integers in a and b, and store packed minimum values in dst.
1385 __m128i _mm_min_epu16 (__m128i a, __m128i b) @trusted
1386 {
1387     // PERF DMD
1388     static if (GDC_with_SSE41)
1389     {
1390         return cast(__m128i) __builtin_ia32_pminuw128(cast(short8)a, cast(short8)b);
1391     }
1392     else version(LDC)
1393     {
1394         // x86: pminuw since LDC 1.1 -O1, psubusw+psubw sequence without sse4.1
1395         // ARM64: umin.8h since LDC 1.8.0 -O1
1396         ushort8 sa = cast(ushort8)a;
1397         ushort8 sb = cast(ushort8)b;
1398         ushort8 greater = cast(ushort8) greaterMask!ushort8(sb, sa);
1399         return cast(__m128i)( (greater & sa) | (~greater & sb) );
1400     }
1401     else
1402     {
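        // Unsigned min via saturation: c == max(b - a, 0), hence b - c == min(a, b).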
1403         __m128i c = _mm_subs_epu16(b, a);
1404         b = _mm_sub_epi16(b, c);
1405         return b;
1406     }
1407 }
1408 unittest
1409 {
1410     short8 R = cast(short8) _mm_min_epu16(_mm_setr_epi16(32767,  1, -4, -8, 9,     7, 0, 57),
1411                                           _mm_setr_epi16(   -4, -8,  9, -7, 0,-32768, 0,  0));
1412     short[8] correct =                                  [32767,  1,  9, -8, 0,     7, 0,  0];
1413     assert(R.array == correct);
1414 }
1415 
1416 /// Compare packed unsigned 32-bit integers in a and b, and store packed minimum values in dst.
1417 __m128i _mm_min_epu32 (__m128i a, __m128i b) @trusted
1418 {
1419     // PERF DMD
1420     static if (GDC_with_SSE41)
1421     {
1422         return cast(__m128i) __builtin_ia32_pminud128(cast(int4)a, cast(int4)b);
1423     }
1424     else version(LDC)
1425     {
1426         // x86: pminud since LDC 1.1 -O1, also good without sse4.1
1427         // ARM64: umin.4s since LDC 1.8.0 -O1
1428         uint4 sa = cast(uint4)a;
1429         uint4 sb = cast(uint4)b;
1430         uint4 greater = cast(uint4) greaterMask!uint4(sa, sb);
1431         return cast(__m128i)( (~greater & sa) | (greater & sb) );
1432     }
1433     else
1434     {
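        // Same sign-bias trick as in _mm_max_epu32: biasing both operands by 0x80000000
        // lets the signed compare act as an unsigned one.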
1435         __m128i valueShift = _mm_set1_epi32(-0x80000000);
1436         __m128i higher = _mm_cmpgt_epi32(_mm_add_epi32(b, valueShift), _mm_add_epi32(a, valueShift));
1437         __m128i aTob = _mm_xor_si128(a, b); // a ^ (a ^ b) == b
1438         __m128i mask = _mm_and_si128(aTob, higher);
1439         return _mm_xor_si128(b, mask);
1440     }
1441 }
1442 unittest
1443 {
1444     int4 R = cast(int4) _mm_min_epu32(_mm_setr_epi32(0x7fffffff, 1,  4, -7),
1445                                       _mm_setr_epi32(        -4,-8,  9, -8));
1446     int[4] correct =                                [0x7fffffff, 1,  4, -8];
1447     assert(R.array == correct);
1448 }
1449 
1450 /// Horizontally compute the minimum amongst the packed unsigned 16-bit integers in `a`, 
/// store the minimum and index in the return value, and zero the remaining bits.
1452 __m128i _mm_minpos_epu16 (__m128i a) @trusted
1453 {
1454     // PERF DMD
1455     static if (GDC_with_SSE41)
1456     {
1457         return cast(__m128i) __builtin_ia32_phminposuw128(cast(short8)a);
1458     }
1459     else static if (LDC_with_SSE41)
1460     {
1461         return cast(__m128i) __builtin_ia32_phminposuw128(cast(short8)a);
1462     }
1463     else static if (LDC_with_ARM64)
1464     {
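        // Each 32-bit lane of `combinedLo`/`combinedHi` packs a lane index in its low 16 bits
        // and the corresponding value in its high 16 bits, so an unsigned 32-bit minimum finds
        // the smallest value and, on ties, the smallest index; the halves are swapped back below.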
1465         __m128i indices = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
1466         __m128i combinedLo = _mm_unpacklo_epi16(indices, a);
1467         __m128i combinedHi = _mm_unpackhi_epi16(indices, a);
1468         __m128i best = _mm_min_epu32(combinedLo, combinedHi);
1469         best = _mm_min_epu32(best, _mm_srli_si128!8(best));
1470         best = _mm_min_epu32(best, _mm_srli_si128!4(best));
1471         short8 sbest = cast(short8)best;
1472         short8 r;
1473         r[0] = sbest[1];
1474         r[1] = sbest[0]; // Note: the search must have inverted index in order to prioritize lower index in case of tie
1475         r[2] = 0;
1476         r[3] = 0;
1477         r[4] = 0;
1478         r[5] = 0;
1479         r[6] = 0;
1480         r[7] = 0;
1481         return cast(__m128i)r;
1482     }
1483     else
1484     {
1485         short8 sa = cast(short8)a;
1486         ushort min = 0xffff;
1487         int index = 0;
1488         for(int n = 0; n < 8; ++n)
1489         {
1490             ushort c = sa.array[n];
1491             if (c < min)
1492             {
1493                 min = c;
1494                 index = n;
1495             }
1496         }
1497         short8 r;
1498         r.ptr[0] = min;
1499         r.ptr[1] = cast(short)index;
1500         return cast(__m128i)r;
1501     }
1502 }
1503 unittest
1504 {
1505     __m128i A = _mm_setr_epi16(14, 15, 1, 2, -3, 4, 5, 6);
1506     __m128i B = _mm_setr_epi16(14,  4, 4, 2, -3, 2, 5, 6);
1507     short8 R1 = cast(short8) _mm_minpos_epu16(A);
1508     short8 R2 = cast(short8) _mm_minpos_epu16(B);
1509     short[8] correct1 = [1, 2, 0, 0, 0, 0, 0, 0];
1510     short[8] correct2 = [2, 3, 0, 0, 0, 0, 0, 0];
1511     assert(R1.array == correct1);
1512     assert(R2.array == correct2);
1513 }
1514 
1515 /// Compute the sum of absolute differences (SADs) of quadruplets of unsigned 8-bit integers 
1516 /// in `a` compared to those in `b`, and store the 16-bit results in dst. 
1517 /// Eight SADs are performed using one quadruplet from `b` and eight quadruplets from `a`. 
/// One quadruplet is selected from `b` starting at the offset specified in `imm8[1:0]`. 
1519 /// Eight quadruplets are formed from sequential 8-bit integers selected from `a` starting 
1520 /// at the offset specified in `imm8[2]`.
1521 __m128i _mm_mpsadbw_epu8(int imm8)(__m128i a, __m128i b) @trusted
1522 {
1523     // PERF DMD
1524     static if (GDC_with_SSE41)
1525     {
1526         return cast(__m128i) __builtin_ia32_mpsadbw128(cast(ubyte16)a, cast(ubyte16)b, cast(ubyte)imm8);  
1527     }
1528     else static if (LDC_with_SSE41)
1529     {
1530         return cast(__m128i) __builtin_ia32_mpsadbw128(cast(byte16)a, cast(byte16)b, cast(byte)imm8);
1531     }
1532     else
1533     {
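        // Fallback: each loop iteration packs two overlapping quadruplets from `a` into one
        // vector and reuses _mm_sad_epu8 against the selected quadruplet of `b`, replicated in
        // both 64-bit halves, yielding two of the eight 16-bit SADs per iteration.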
        int a_offset = ((imm8 & 4) >> 2) * 4; // Yes, the two high-order quadruplets are unaddressable...
1535         int b_offset = (imm8 & 3) * 4;
1536 
1537         byte16 ba = cast(byte16)a;
1538         byte16 bb = cast(byte16)b;
1539         short8 r;
1540 
1541         __m128i comp_b = _mm_setr_epi32(b.array[imm8 & 3], 0, b.array[imm8 & 3], 0);
1542 
1543         for (int j = 0; j < 8; j += 2)
1544         {
1545             int k = a_offset + j;
1546             __m128i comp_a = _mm_setr_epi8(ba[k+0], ba[k+1], ba[k+2], ba[k+3],
1547                                            0, 0, 0, 0, 
1548                                            ba[k+1], ba[k+2], ba[k+3], ba[k+4],
1549                                            0, 0, 0, 0);
1550             short8 diffs = cast(short8) _mm_sad_epu8(comp_a, comp_b); // reusing this wins instructions in both x86 and arm64
1551             r.ptr[j] = diffs.array[0];
1552             r.ptr[j+1] = diffs.array[4];
1553         }
1554         return cast(__m128i)r;
1555     }
1556 }
1557 unittest
1558 {
1559     __m128i A = _mm_setr_epi8(0, 1, 2, 3,  4,  5, 6,  7, 8, 9, 10, 11, 12, 13, 14, 15);
1560     __m128i B = _mm_setr_epi8(9, 1, 2, 3, -1, -1, 0, -1, 5, 5,  5,  5, 12, 13, 14, 15);
1561     short[8] correct0 = [9, 11, 13, 15, 17, 19, 21, 23];
1562     short[8] correct1 = [763, 761, 759, 757, 755, 753, 751, 749];
1563     short[8] correct4 = [17, 19, 21, 23, 25, 27, 31, 35];
1564     short[8] correct5 = [755, 753, 751, 749, 747, 745, 743, 741];
1565     short[8] correct7 = [32, 28, 24, 20, 16, 12, 8, 4];
1566     short8 r1 = cast(short8) _mm_mpsadbw_epu8!1(A, B);
1567     short8 r4 = cast(short8) _mm_mpsadbw_epu8!4(A, B);
1568     short8 r5 = cast(short8) _mm_mpsadbw_epu8!5(A, B);
1569     short8 r7 = cast(short8) _mm_mpsadbw_epu8!7(A, B);
1570     short8 r8 = cast(short8) _mm_mpsadbw_epu8!8(A, B);
1571     assert(r1.array == correct1);
1572     assert(r4.array == correct4);
1573     assert(r5.array == correct5);
1574     assert(r7.array == correct7);
1575     assert(r8.array == correct0);
1576 }
1577 
1578 /// Multiply the low signed 32-bit integers from each packed 64-bit element in a and b, and store the signed 64-bit results in dst.
1579 __m128i _mm_mul_epi32 (__m128i a, __m128i b) @trusted
1580 {
1581     // PERF DMD
1582     static if (GDC_with_SSE41)
1583     {
1584         return cast(__m128i) __builtin_ia32_pmuldq128(cast(int4)a, cast(int4)b);
1585     }
1586     else static if (LDC_with_SSE41)
1587     {
1588         // For some reason, clang has the builtin but it's not in IntrinsicsX86.td
1589         // Use IR instead.
1590         // This generates pmuldq since LDC 1.2.0 -O0
1591         enum ir = `
1592             %ia = shufflevector <4 x i32> %0,<4 x i32> %0, <2 x i32> <i32 0, i32 2>
1593             %ib = shufflevector <4 x i32> %1,<4 x i32> %1, <2 x i32> <i32 0, i32 2>
1594             %la = sext <2 x i32> %ia to <2 x i64>
1595             %lb = sext <2 x i32> %ib to <2 x i64>
1596             %r = mul <2 x i64> %la, %lb
1597             ret <2 x i64> %r`;
1598         return cast(__m128i) LDCInlineIR!(ir, long2, int4, int4)(cast(int4)a, cast(int4)b);
1599     }
1600     else static if (LDC_with_ARM64)  
1601     {
1602         // 3 instructions since LDC 1.8 -O2
1603         // But vmull_s32 had to be made a builtin, otherwise it wouldn't optimize to smull
1604         int2 a_lo = vmovn_s64(cast(long2)a);
1605         int2 b_lo = vmovn_s64(cast(long2)b);
1606         return cast(__m128i) vmull_s32(a_lo, b_lo);
1607     }
1608     else
1609     {
1610         int4 ia = cast(int4)a;
1611         int4 ib = cast(int4)b;
1612         long2 r;
1613         r.ptr[0] = cast(long)ia.array[0] * ib.array[0];
1614         r.ptr[1] = cast(long)ia.array[2] * ib.array[2];
1615         return cast(__m128i)r;
1616     }
1617 }
1618 unittest
1619 {
1620     __m128i A = _mm_setr_epi32(61616461, 1915324654, 4564061, 3);
1621     __m128i B = _mm_setr_epi32(49716422, -915616216, -121144, 0);
1622     long2 R = cast(long2) _mm_mul_epi32(A, B);
1623     long[2] correct = [cast(long)61616461 * 49716422, cast(long)4564061 * -121144];
1624     assert(R.array == correct);
1625 }
1626 
1627 /// Multiply the packed 32-bit integers in `a` and `b`, producing intermediate 64-bit integers, 
1628 /// return the low 32 bits of the intermediate integers.
1629 __m128i _mm_mullo_epi32 (__m128i a, __m128i b) @trusted
1630 {
1631     // PERF DMD
1632     // PERF GDC without SSE4.1 could be better
1633     static if (GDC_with_SSE41)
1634     {
1635         int4 ia = cast(int4)a;
1636         int4 ib = cast(int4)b;
1637         // Note: older GDC doesn't have that op, but older GDC
1638         // also has no support for -msse4.1 detection
1639         return cast(__m128i)(ia * ib);
1640     }
1641     else version(LDC)
1642     {
1643         int4 ia = cast(int4)a;
1644         int4 ib = cast(int4)b;
1645         return cast(__m128i)(ia * ib);
1646     }
1647     else
1648     {
1649         // DMD doesn't take the above
1650         int4 ia = cast(int4)a;
1651         int4 ib = cast(int4)b;
1652         int4 r;
1653         r.ptr[0] = ia.array[0] * ib.array[0];
1654         r.ptr[1] = ia.array[1] * ib.array[1];
1655         r.ptr[2] = ia.array[2] * ib.array[2];
1656         r.ptr[3] = ia.array[3] * ib.array[3];
1657         return r;
1658     }
1659 }
1660 unittest
1661 {
1662     __m128i A = _mm_setr_epi32(61616461, 1915324654, 4564061, 3);
1663     __m128i B = _mm_setr_epi32(49716422, -915616216, -121144, 0);
1664     int4 R = cast(int4) _mm_mullo_epi32(A, B);
1665     int[4] correct = [cast(int)0xBF370D8E, cast(int)(1915324654 * -915616216), cast(int)(4564061 * -121144), 0];
1666     assert(R.array == correct);
1667 }
1668 
1669 
1670 /// Convert packed signed 32-bit integers from `a` and `b` 
1671 /// to packed 16-bit integers using unsigned saturation.
1672 __m128i _mm_packus_epi32 (__m128i a, __m128i b) @trusted
1673 {
1674     static if (GDC_with_SSE41)
1675     {
1676         // PERF For some reason GDC doesn't generate the builtin???
1677         return cast(__m128i) __builtin_ia32_packusdw128(cast(short8)a, cast(short8)b);
1678     }
1679     else static if (LDC_with_SSE41)
1680     {
1681         return cast(__m128i) __builtin_ia32_packusdw128(cast(short8)a, cast(short8)b);
1682     }
1683     else static if (LDC_with_ARM64)
1684     {
1685        int4 z = 0;
1687        return cast(__m128i) vcombine_u16(vqmovn_u32(vmaxq_s32(z, cast(int4)a)),
1688                                          vqmovn_u32(vmaxq_s32(z, cast(int4)b)));
1689     }
1690     else
1691     {
1692         // PERF: not great without SSE4.1
1693         int4 sa = cast(int4)a;
1694         int4 sb = cast(int4)b;
1695         align(16) ushort[8] result;
1696         for (int i = 0; i < 4; ++i)
1697         {
1698             int s = sa.array[i];
1699             if (s < 0) s = 0;
1700             if (s > 65535) s = 65535;
1701             result.ptr[i] = cast(ushort)s;
1702 
1703             s = sb.array[i];
1704             if (s < 0) s = 0;
1705             if (s > 65535) s = 65535;
1706             result.ptr[i+4] = cast(ushort)s;
1707         }
1708         return *cast(__m128i*)(result.ptr);
1709     }
1710 }
1711 unittest
1712 {
1713     __m128i A = _mm_setr_epi32(100000, -100000, 1000, 0);
1714     short8 R = cast(short8) _mm_packus_epi32(A, A);
1715     short[8] correct = [cast(short)65535, 0, 1000, 0, cast(short)65535, 0, 1000, 0];
1716     assert(R.array == correct);
1717 }
1718 
1719 
1720 /// Round the packed double-precision (64-bit) floating-point elements in `a` using the 
1721 /// rounding parameter, and store the results as packed double-precision floating-point elements.
1722 /// Rounding is done according to the rounding[3:0] parameter, which can be one of:
1723 ///    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
1724 ///    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
1725 ///    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
1726 ///    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
1727 ///    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
1728 __m128d _mm_round_pd(int rounding)(__m128d a) @trusted
1729 {
1730     // PERF DMD
1731     static if (GDC_with_SSE41)
1732     {
1733         return __builtin_ia32_roundpd(a, rounding);
1734     }
1735     else static if (LDC_with_SSE41)
1736     {
1737         return __builtin_ia32_roundpd(a, rounding);
1738     }
1739     else
1740     {
1741         static if (rounding & _MM_FROUND_CUR_DIRECTION)
1742         {
1743             // Convert to 64-bit integers
1744             long lo = _mm_cvtsd_si64(a);
1745             a.ptr[0] = a.array[1];
1746             long hi = _mm_cvtsd_si64(a);
1747             return _mm_setr_pd(lo, hi);
1748         }
1749         else
1750         {
1751             version(GNU) pragma(inline, false); // else unittests fail with optimizations
1752 
1753             uint old = _MM_GET_ROUNDING_MODE();
1754             _MM_SET_ROUNDING_MODE((rounding & 3) << 13);
1755             
1756             // Convert to 64-bit integers
1757             long lo = _mm_cvtsd_si64(a);
1758             a.ptr[0] = a.array[1];
1759             long hi = _mm_cvtsd_si64(a);
1760 
1761             // Convert back to double to achieve the rounding
1762             // The problem is that a 64-bit double can't represent all the values 
1763             // a 64-bit integer can (and vice-versa). So this function won't work for
1764             // large values. (TODO: what range exactly?)
1765             _MM_SET_ROUNDING_MODE(old);
1766             return _mm_setr_pd(lo, hi);
1767         }
1768     }
1769 }
1770 unittest
1771 {
1772     // tested in other intrinsics
1773 }
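// Illustrative check (values chosen arbitrarily): floor-style rounding with
// _MM_FROUND_TO_NEG_INF, using small values so the integer round-trip in the
// fallback path is exact.
unittest
{
    __m128d A = _mm_setr_pd(1.3, -2.5);
    __m128d R = _mm_round_pd!(_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC)(A);
    double[2] correct = [1.0, -3.0];
    assert(R.array == correct);
}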
1774 
1775 /// Round the packed single-precision (32-bit) floating-point elements in `a` using the 
1776 /// rounding parameter, and store the results as packed single-precision floating-point elements.
1777 /// Rounding is done according to the rounding[3:0] parameter, which can be one of:
1778 ///    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
1779 ///    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
1780 ///    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
1781 ///    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
1782 ///    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
1783 __m128 _mm_round_ps(int rounding)(__m128 a) @trusted
1784 {
1785     // PERF ARM64: there is duplication because this isn't optimal for ARM64, so it is avoided externally
1786     static if (GDC_or_LDC_with_SSE41)
1787     {
1788         return __builtin_ia32_roundps(a, rounding);
1789     }
1790     else
1791     {
1792         static if (rounding & _MM_FROUND_CUR_DIRECTION)
1793         {
1794             __m128i integers = _mm_cvtps_epi32(a);
1795             return _mm_cvtepi32_ps(integers);
1796         }
1797         else
1798         {
1799             version(LDC) pragma(inline, false); // else _MM_SET_ROUNDING_MODE and _mm_cvtps_epi32 get shuffled
1800             uint old = _MM_GET_ROUNDING_MODE();
1801             _MM_SET_ROUNDING_MODE((rounding & 3) << 13);
1802             scope(exit) _MM_SET_ROUNDING_MODE(old);
1803 
1804             // Convert to 32-bit integers
1805             __m128i integers = _mm_cvtps_epi32(a);
1806 
1807             // Convert back to float to achieve the rounding
1808             // The problem is that a 32-bit float can't represent all the values
1809             // a 32-bit integer can (and vice-versa). So this function won't work for
1810             // large values. (TODO: what range exactly?)
1811             __m128 result = _mm_cvtepi32_ps(integers);
1812 
1813             return result;
1814         }
1815     }
1816 }
1817 unittest
1818 {
1819     // tested in other intrinsics
1820 }
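// Illustrative check (values chosen arbitrarily): truncation with
// _MM_FROUND_TO_ZERO on small values, where the round-trip through int is exact.
unittest
{
    __m128 A = _mm_setr_ps(2.7f, -2.7f, 1.0f, -3.5f);
    __m128 R = _mm_round_ps!(_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC)(A);
    float[4] correct = [2.0f, -2.0f, 1.0f, -3.0f];
    assert(R.array == correct);
}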
1821 
1822 
1823 /// Round the lower double-precision (64-bit) floating-point element in `b` using the
1824 /// rounding parameter, store the result as a double-precision floating-point element 
1825 /// in the lower element of result, and copy the upper element from `a` to the upper element of result.
1826 /// Rounding is done according to the rounding[3:0] parameter, which can be one of:
1827 ///    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
1828 ///    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
1829 ///    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
1830 ///    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
1831 ///    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
1832 __m128d _mm_round_sd(int rounding)(__m128d a, __m128d b) @trusted
1833 {
1834     static if (GDC_with_SSE41)
1835     {
1836         return __builtin_ia32_roundsd(a, b, rounding);
1837     }
1838     else static if (LDC_with_SSE41)
1839     {
1840         return __builtin_ia32_roundsd(a, b, rounding);
1841     }
1842     else
1843     {
1844         static if (rounding & _MM_FROUND_CUR_DIRECTION)
1845         {
1846             // Convert to 64-bit integer
1847             long b0 = _mm_cvtsd_si64(b);
1848             a.ptr[0] = b0;
1849             return a;
1850         }
1851         else
1852         {
1853             version(GNU) pragma(inline, false); // else unittests fail with optimizations
1854 
1855             uint old = _MM_GET_ROUNDING_MODE();
1856             _MM_SET_ROUNDING_MODE((rounding & 3) << 13);
1857             
1858             // Convert to 64-bit integer
1859             long b0 = _mm_cvtsd_si64(b);
1860             a.ptr[0] = b0;
1861 
1862             // Convert back to double to achieve the rounding
1863             // The problem is that a 64-bit double can't represent all the values 
1864             // a 64-bit integer can (and vice-versa). So this function won't work for
1865             // large values. (TODO: what range exactly?)
1866             _MM_SET_ROUNDING_MODE(old);
1867             return a;
1868         }
1869     }
1870 }
1871 unittest
1872 {
1873     // tested in other intrinsics
1874 }
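// Illustrative check (values chosen arbitrarily): ceiling of the low lane of `b`,
// upper lane copied from `a`.
unittest
{
    __m128d A = _mm_setr_pd(100.0, 200.0);
    __m128d B = _mm_setr_pd(1.2, -7.5);
    __m128d R = _mm_round_sd!(_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC)(A, B);
    double[2] correct = [2.0, 200.0];
    assert(R.array == correct);
}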
1875 
1876 
1877 /// Round the lower single-precision (32-bit) floating-point element in `b` using the 
1878 /// rounding parameter, store the result as a single-precision floating-point element 
1879 /// in the lower element of result, and copy the upper 3 packed elements from `a`
1880 /// to the upper elements of result.
1881 /// Rounding is done according to the rounding[3:0] parameter, which can be one of:
1882 ///    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
1883 ///    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
1884 ///    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
1885 ///    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
1886 ///    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
1887 __m128 _mm_round_ss(int rounding)(__m128 a, __m128 b) @trusted
1888 {
1889     static if (GDC_with_SSE41)
1890     {
1891         return __builtin_ia32_roundss(a, b, rounding);
1892     }
1893     else static if (LDC_with_SSE41)
1894     {
1895         return __builtin_ia32_roundss(a, b, rounding);
1896     }
1897     else
1898     {
1899         static if (rounding & _MM_FROUND_CUR_DIRECTION)
1900         {
1901             int b0 = _mm_cvtss_si32(b);
1902             a.ptr[0] = b0;   
1903             return a;
1904         }
1905         else version(GNU)
1906         {
1907             pragma(inline, false)
1908             __m128 GDCworkaround() nothrow @nogc @trusted 
1909             {
1910                 uint old = _MM_GET_ROUNDING_MODE();
1911                 _MM_SET_ROUNDING_MODE((rounding & 3) << 13);
1912 
1913                 // Convert to 32-bit integer
1914                 int b0 = _mm_cvtss_si32(b);
1915                 a.ptr[0] = b0;       
1916 
1917                 // Convert back to float to achieve the rounding
1918                 // The problem is that a 32-bit float can't represent all the values 
1919                 // a 32-bit integer can (and vice-versa). So this function won't work for
1920                 // large values. (TODO: what range exactly?)
1921                 _MM_SET_ROUNDING_MODE(old);
1922                 return a;
1923             }
1924             return GDCworkaround();
1925         }
1926         else
1927         {
1928             uint old = _MM_GET_ROUNDING_MODE();
1929             _MM_SET_ROUNDING_MODE((rounding & 3) << 13);
1930 
1931             // Convert to 32-bit integer
1932             int b0 = _mm_cvtss_si32(b);
1933             a.ptr[0] = b0;       
1934 
1935             // Convert back to float to achieve the rounding
1936             // The problem is that a 32-bit float can't represent all the values 
1937             // a 32-bit integer can (and vice-versa). So this function won't work for
1938             // large values. (TODO: what range exactly?)
1939             _MM_SET_ROUNDING_MODE(old);
1940             return a;
1941         }
1942     }
1943 }
1944 unittest
1945 {
1946     // tested in other intrinsics
1947 }
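// Illustrative check (values chosen arbitrarily): round-to-nearest of the low lane
// of `b`, upper lanes copied from `a`.
unittest
{
    __m128 A = _mm_setr_ps(4.0f, 5.0f, 6.0f, 7.0f);
    __m128 B = _mm_setr_ps(2.6f, -2.0f, -3.0f, -4.0f);
    __m128 R = _mm_round_ss!(_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)(A, B);
    float[4] correct = [3.0f, 5.0f, 6.0f, 7.0f];
    assert(R.array == correct);
}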
1948 
1949 
1950 /// Load 128-bits of integer data from memory using a non-temporal memory hint. 
1951 /// `mem_addr` must be aligned on a 16-byte boundary or a general-protection 
1952 /// exception may be generated.
1953 __m128i _mm_stream_load_si128 (__m128i * mem_addr) pure @trusted
1954 {
1955     // PERF DMD D_SIMD
1956     static if (GDC_with_SSE41)
1957     {
1958         return cast(__m128i) __builtin_ia32_movntdqa(cast(long2*)mem_addr);
1959     }
1960     else static if (LDC_with_InlineIREx)
1961     {
1962         enum prefix = `!0 = !{ i32 1 }`;
1963         enum ir = `
1964             %r = load <4 x i32>, <4 x i32>* %0, !nontemporal !0
1965             ret <4 x i32> %r`;
1966         return cast(__m128i) LDCInlineIREx!(prefix, ir, "", int4, int4*)(mem_addr);
1967     }
1968     else
1969     {
1970         return *mem_addr; // regular move instead
1971     }
1972 }
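// Basic sanity check (a minimal sketch, not exhaustive): a non-temporal load from a
// 16-byte aligned buffer must return the stored values.
unittest
{
    align(16) int[4] data = [-5, 6, 7, 8];
    int4 R = cast(int4) _mm_stream_load_si128(cast(__m128i*) data.ptr);
    assert(R.array == data);
}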
1974 
1975 
1976 /// Return 1 if all bits in `a` are all 1's. Else return 0.
1977 int _mm_test_all_ones (__m128i a) @safe
1978 {
1979     return _mm_testc_si128(a, _mm_set1_epi32(-1));
1980 }
1981 unittest
1982 {
1983     __m128i A = _mm_set1_epi32(-1);
1984     __m128i B = _mm_set_epi32(-1, -2, -1, -1);
1985     assert(_mm_test_all_ones(A) == 1);
1986     assert(_mm_test_all_ones(B) == 0);
1987 }
1988 
1989 /// Return 1 if all bits in `a` are all 0's. Else return 0.
1990 // This is a #BONUS since it is lacking in the Intel Intrinsics API.
1991 int _mm_test_all_zeros (__m128i a) @safe
1992 {
1993     return _mm_testz_si128(a, _mm_set1_epi32(-1));
1994 }
1995 unittest
1996 {
1997     __m128i A = _mm_set1_epi32(0);
1998     __m128i B = _mm_set_epi32(0, 8, 0, 0);
1999     assert(_mm_test_all_zeros(A) == 1);
2000     assert(_mm_test_all_zeros(B) == 0);
2001 }
2002 
2003 /// Compute the bitwise AND of 128 bits (representing integer data) in `a` and `mask`, 
2004 /// and return 1 if the result is zero, otherwise return 0.
2005 int _mm_test_all_zeros (__m128i a, __m128i mask) @safe
2006 {
2007     return _mm_testz_si128(a, mask); // it's really the same, but with a good name
2008 }
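// Illustrative check (values chosen arbitrarily): the mask selects which bits of `a`
// are required to be zero.
unittest
{
    __m128i A = _mm_setr_epi32(0x80, 0, 0, 0);
    __m128i M = _mm_setr_epi32(0x7f, -1, -1, -1);
    assert(_mm_test_all_zeros(A, M) == 1);
    assert(_mm_test_all_zeros(A, A) == 0);
}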
2009 
2010 /// Compute the bitwise AND of 128 bits (representing integer data) in `a` and mask, and set ZF to 1 
2011 /// if the result is zero, otherwise set ZF to 0. Compute the bitwise NOT of a and then AND with 
2012 /// mask, and set CF to 1 if the result is zero, otherwise set CF to 0. Return 1 if both the ZF and
2013 /// CF values are zero, otherwise return 0.
2014 int _mm_test_mix_ones_zeros (__m128i a, __m128i mask) @trusted
2015 {
2016     return _mm_testnzc_si128(a, mask);
2017 }
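// Illustrative check (values chosen arbitrarily): returns 1 only when the bits of `a`
// selected by `mask` are neither all ones nor all zeros.
unittest
{
    __m128i M = _mm_setr_epi32(0xff, 0, 0, 0);
    __m128i A = _mm_setr_epi32(0x0f, 0, 0, 0);
    __m128i Z = _mm_setzero_si128();
    assert(_mm_test_mix_ones_zeros(A, M) == 1); // mixed ones and zeros under the mask
    assert(_mm_test_mix_ones_zeros(M, M) == 0); // all masked bits are ones
    assert(_mm_test_mix_ones_zeros(Z, M) == 0); // all masked bits are zeros
}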
2018 
2019 /// Compute the bitwise NOT of a and then AND with b, and return 1 if the 
2020 /// result is zero, otherwise return 0.
2021 /// In other words, test if all bits masked by `b` are 1 in `a`.
2022 int _mm_testc_si128 (__m128i a, __m128i b) pure @trusted
2023 {
2024     // PERF DMD
2025     static if (GDC_with_SSE41)
2026     {
2027         return __builtin_ia32_ptestc128(cast(long2)a, cast(long2)b);
2028     }
2029     else static if (LDC_with_SSE41)
2030     {
2031         return __builtin_ia32_ptestc128(cast(long2)a, cast(long2)b);
2032     }
2033     else static if (LDC_with_ARM64)
2034     {
2035         // Acceptable since LDC 1.8 -O2
2036         long2 s64 = vbicq_s64(cast(long2)b, cast(long2)a);
2037         return !(vgetq_lane_s64(s64, 0) | vgetq_lane_s64(s64, 1));
2038     }
2039     else
2040     {
2041         __m128i c = ~a & b;
2042         int[4] zero = [0, 0, 0, 0];
2043         return c.array == zero;
2044     }
2045 }
2046 unittest
2047 {
2048     __m128i A  = _mm_setr_epi32(0x01, 0x02, 0x04, 0xf8);
2049     __m128i M1 = _mm_setr_epi32(0xfe, 0xfd, 0x00, 0x00);
2050     __m128i M2 = _mm_setr_epi32(0x00, 0x00, 0x04, 0x00);
2051     assert(_mm_testc_si128(A, A) == 1);
2052     assert(_mm_testc_si128(A, M1) == 0);
2053     assert(_mm_testc_si128(A, M2) == 1);
2054 }
2055 
2056 /// Compute the bitwise AND of 128 bits (representing integer data) in `a` and `b`, 
2057 /// and set ZF to 1 if the result is zero, otherwise set ZF to 0. 
2058 /// Compute the bitwise NOT of `a` and then AND with `b`, and set CF to 1 if the 
2059 /// result is zero, otherwise set CF to 0. 
2060 /// Return 1 if both the ZF and CF values are zero, otherwise return 0.
2061 int _mm_testnzc_si128 (__m128i a, __m128i b) @trusted
2062 {
2063     // PERF DMD
2064     static if (GDC_with_SSE41)
2065     {
2066         return __builtin_ia32_ptestnzc128(cast(long2)a, cast(long2)b);
2067     }
2068     else static if (LDC_with_SSE41)
2069     {
2070         return __builtin_ia32_ptestnzc128(cast(long2)a, cast(long2)b);
2071     }
2072     else static if (LDC_with_ARM64)
2073     {
2074         long2 s640 = vandq_s64(cast(long2)b, cast(long2)a);
2075         long2 s641 = vbicq_s64(cast(long2)b, cast(long2)a);
2076 
2077         return !( !(vgetq_lane_s64(s641, 0) | vgetq_lane_s64(s641, 1))
2078                 | !(vgetq_lane_s64(s640, 0) | vgetq_lane_s64(s640, 1)) );
2079     }
2080     else
2081     {
2082         __m128i c = a & b;
2083         __m128i d = ~a & b;
2084         int[4] zero = [0, 0, 0, 0];
2085         return !( (c.array == zero) || (d.array == zero));
2086     }    
2087 }
2088 unittest
2089 {
2090     __m128i A  = _mm_setr_epi32(0x01, 0x02, 0x04, 0xf8);
2091     __m128i M  = _mm_setr_epi32(0x01, 0x40, 0x00, 0x00);
2092     __m128i Z = _mm_setzero_si128();
2093     assert(_mm_testnzc_si128(A, Z) == 0);
2094     assert(_mm_testnzc_si128(A, M) == 1);
2095     assert(_mm_testnzc_si128(A, A) == 0);
2096 }
2097 
2098 /// Compute the bitwise AND of 128 bits (representing integer data) in a and b, 
2099 /// and return 1 if the result is zero, otherwise return 0.
2100 /// In other words, test if all bits masked by `b` are 0 in `a`.
2101 int _mm_testz_si128 (__m128i a, __m128i b) @trusted
2102 {
2103     // PERF DMD
2104     static if (GDC_with_SSE41)
2105     {
2106         return __builtin_ia32_ptestz128(cast(long2)a, cast(long2)b);
2107     }
2108     else static if (LDC_with_SSE41)
2109     {
2110         return __builtin_ia32_ptestz128(cast(long2)a, cast(long2)b);
2111     }
2112     else static if (LDC_with_ARM64)
2113     {
2114         // Acceptable since LDC 1.8 -O2
2115         long2 s64 = vandq_s64(cast(long2)a, cast(long2)b);
2116         return !(vgetq_lane_s64(s64, 0) | vgetq_lane_s64(s64, 1));
2117     }
2118     else 
2119     {
2120         __m128i c = a & b;
2121         int[4] zero = [0, 0, 0, 0];
2122         return c.array == zero;
2123     }    
2124 }
2125 unittest
2126 {
2127     __m128i A  = _mm_setr_epi32(0x01, 0x02, 0x04, 0xf8);
2128     __m128i M1 = _mm_setr_epi32(0xfe, 0xfd, 0x00, 0x07);
2129     __m128i M2 = _mm_setr_epi32(0x00, 0x00, 0x04, 0x00);
2130     assert(_mm_testz_si128(A, A) == 0);
2131     assert(_mm_testz_si128(A, M1) == 1);
2132     assert(_mm_testz_si128(A, M2) == 0);
2133 }
2134