1 /**
2 * SSE4.1 intrinsics.
3 * https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#techs=SSE4_1
4 *
5 * Copyright: Guillaume Piolat 2021.
6 *            Johan Engelen 2021.
7 *            cet 2024.
8 * License:   $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
9 */
10 module inteli.smmintrin;
11 
12 // SSE4.1 instructions
13 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#techs=SSE4_1
14 // Note: this header will work whether you have SSE4.1 enabled or not.
15 // With LDC, use "dflags-ldc": ["-mattr=+sse4.1"] or equivalent to actively
16 // generate SSE4.1 instructions.
17 // With GDC, use "dflags-gdc": ["-msse4.1"] or equivalent to generate SSE4.1 instructions.
18 
19 public import inteli.types;
20 import inteli.internals;
21 
22 // smmintrin pulls in all previous instruction set intrinsics.
23 public import inteli.tmmintrin;
24 
25 nothrow @nogc:
26 
enum int _MM_FROUND_TO_NEAREST_INT = 0x00; /// SSE4.1 rounding modes
enum int _MM_FROUND_TO_NEG_INF     = 0x01; /// ditto
enum int _MM_FROUND_TO_POS_INF     = 0x02; /// ditto
enum int _MM_FROUND_TO_ZERO        = 0x03; /// ditto
enum int _MM_FROUND_CUR_DIRECTION  = 0x04; /// ditto
enum int _MM_FROUND_RAISE_EXC      = 0x00; /// ditto
enum int _MM_FROUND_NO_EXC         = 0x08; /// ditto

// Composite constants: a rounding direction OR'd with an exception-control
// flag, for use as the `rounding` argument of _mm_round_pd/ps/sd/ss.
enum int _MM_FROUND_NINT      = (_MM_FROUND_RAISE_EXC | _MM_FROUND_TO_NEAREST_INT); /// Round to nearest integer.
enum int _MM_FROUND_FLOOR     = (_MM_FROUND_RAISE_EXC | _MM_FROUND_TO_NEG_INF);     /// Round towards -infinity.
enum int _MM_FROUND_CEIL      = (_MM_FROUND_RAISE_EXC | _MM_FROUND_TO_POS_INF);     /// Round towards +infinity.
enum int _MM_FROUND_TRUNC     = (_MM_FROUND_RAISE_EXC | _MM_FROUND_TO_ZERO);        /// Round towards zero.
enum int _MM_FROUND_RINT      = (_MM_FROUND_RAISE_EXC | _MM_FROUND_CUR_DIRECTION);  /// Round in current direction.
enum int _MM_FROUND_NEARBYINT = (_MM_FROUND_NO_EXC    | _MM_FROUND_CUR_DIRECTION);  /// Round in current direction, no exceptions.
41 
/// Add packed signed 32-bit integers in `a` and `b` using saturation.
/// #BONUS
__m128i _mm_adds_epi32(__m128i a, __m128i b) pure
{
    // PERF: ARM64 should use 2x vqadd_s32
    static if (LDC_with_saturated_intrinsics)
        return cast(__m128i)inteli_llvm_adds!int4(cast(int4)a, cast(int4)b);
    else
    {
        __m128i int_max = _mm_set1_epi32(0x7FFFFFFF);
        __m128i res = _mm_add_epi32(a, b);
        // Signed overflow occurred in a lane iff a and b have the same sign
        // there, but res has the opposite sign of a.
        __m128i sign_bit = _mm_srli_epi32(a, 31);
        __m128i sign_xor  = _mm_xor_si128(a, b);
        __m128i overflow = _mm_andnot_si128(sign_xor, _mm_xor_si128(a, res));
        // saturated = INT_MAX + sign(a): 0x7FFFFFFF when a >= 0,
        // 0x80000000 (INT_MIN) when a < 0.
        __m128i saturated = _mm_add_epi32(int_max, sign_bit);
        // Per-lane select on the sign bit of `overflow` (blendv semantics).
        return cast(__m128i) _mm_blendv_ps(cast(__m128)res, 
            cast(__m128)saturated, 
            cast(__m128)overflow);
    }
}
unittest
{
    __m128i a = _mm_setr_epi32(int.max, 1, 2, int.min);
    __m128i b = _mm_setr_epi32(1, 2, 3, -4);
    assert(_mm_adds_epi32(a, b).array == [int.max, 3, 5, int.min]);
}
68 
/// Blend packed 16-bit integers from `a` and `b` using control mask `imm8`, and store the results.
/// Bit `n` of `imm8` selects element `n` from `b` (set) or from `a` (clear).
// Note: changed signature, GDC needs a compile-time value for imm8.
__m128i _mm_blend_epi16(int imm8)(__m128i a, __m128i b) pure @trusted
{
    // PERF DMD
    static if (GDC_with_SSE41)
    {
        pragma(inline, true); // else wouldn't inline in _mm256_blend_epi16
        return cast(__m128i) __builtin_ia32_pblendw128(cast(short8)a, cast(short8)b, imm8);
    }
    else 
    {
        // LDC x86 This generates pblendw since LDC 1.1 and -O2
        short8 r;
        short8 sa = cast(short8)a;
        short8 sb = cast(short8)b;
        for (int n = 0; n < 8; ++n)
        {
            r.ptr[n] = (imm8 & (1 << n)) ? sb.array[n] : sa.array[n];
        }
        return cast(__m128i)r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi16(0, 1,  2,  3,  4,  5,  6,  7);
    __m128i B = _mm_setr_epi16(8, 9, 10, 11, 12, 13, 14, 15);
    short8 C = cast(short8) _mm_blend_epi16!147(A, B); // 10010011
    short[8] correct =        [8, 9,  2,  3, 12,  5,  6, 15];
    assert(C.array == correct);
}
100 
101 
/// Blend packed double-precision (64-bit) floating-point elements from `a` and `b` using control mask `imm8`.
/// Bit `n` of `imm8` (n in 0..2) selects element `n` from `b` (set) or from `a` (clear).
// Note: changed signature, GDC needs a compile-time value for `imm8`.
__m128d _mm_blend_pd(int imm8)(__m128d a, __m128d b) @trusted
{
    // Only the low 2 bits are meaningful for 2 lanes.
    static assert(imm8 >= 0 && imm8 < 4);
    // PERF DMD
    static if (GDC_with_SSE41)
    {
        return cast(double2) __builtin_ia32_blendpd(cast(double2)a, cast(double2)b, imm8);
    }
    else
    {
        // LDC x86: blendpd since LDC 1.1 -02, uses blendps after LDC 1.12
        double2 r;
        for (int n = 0; n < 2; ++n)
        {
            r.ptr[n] = (imm8 & (1 << n)) ? b.array[n] : a.array[n];
        }
        return cast(__m128d)r;
    }
}
unittest
{
    __m128d A = _mm_setr_pd(0, 1);
    __m128d B = _mm_setr_pd(8, 9);
    double2 C = _mm_blend_pd!2(A, B);
    double[2] correct =    [0, 9];
    assert(C.array == correct);
}
131 
132 
/// Blend packed single-precision (32-bit) floating-point elements from `a` and `b` using control 
/// mask `imm8`. Bit `n` of `imm8` (n in 0..4) selects element `n` from `b` (set) or from `a` (clear).
// Note: changed signature, GDC needs a compile-time value for imm8.
__m128 _mm_blend_ps(int imm8)(__m128 a, __m128 b) pure @trusted
{
    // PERF DMD
    // Only the low 4 bits are meaningful for 4 lanes.
    static assert(imm8 >= 0 && imm8 < 16);
    static if (GDC_with_SSE41)
    {
        return __builtin_ia32_blendps(a, b, imm8);
    }
    else version(LDC)
    {
        // LDC x86: generates blendps since LDC 1.1 -O2
        //   arm64: pretty good, two instructions worst case
        // Shuffle indices 0..3 pick from `a`, 4..7 pick from `b`.
        return shufflevectorLDC!(float4, (imm8 & 1) ? 4 : 0,
                                         (imm8 & 2) ? 5 : 1,
                                         (imm8 & 4) ? 6 : 2,
                                         (imm8 & 8) ? 7 : 3)(a, b);
    }
    else
    {
        // PERF GDC without SSE4.1 is quite bad
        __m128 r;
        for (int n = 0; n < 4; ++n)
        {
            r.ptr[n] = (imm8 & (1 << n)) ? b.array[n] : a.array[n];
        }
        return r;
    }
}
unittest
{
    __m128 A = _mm_setr_ps(0, 1,  2,  3);
    __m128 B = _mm_setr_ps(8, 9, 10, 11);
    float4 C = cast(float4) _mm_blend_ps!13(A, B); // 1101
    float[4] correct =    [8, 1, 10, 11];
    assert(C.array == correct);
}
172 
/// Blend packed 8-bit integers from `a` and `b` using `mask`.
/// Select from `b` if the high-order bit of the corresponding 8-bit element in `mask` is set, else select from `a`.
__m128i _mm_blendv_epi8 (__m128i a, __m128i b, __m128i mask) pure @trusted
{
    // PERF DMD
    /*static if (GDC_with_SSE41)
    {
        // This intrinsic do nothing in GDC 12.
        // TODO report to GDC. No problem in GCC.
        return cast(__m128i) __builtin_ia32_pblendvb128 (cast(ubyte16)a, cast(ubyte16)b, cast(ubyte16)mask);
    }
    else*/
    static if (LDC_with_SSE41)
    {
        return cast(__m128i) __builtin_ia32_pblendvb(cast(byte16)a, cast(byte16)b, cast(byte16)mask);
    }
    else static if (LDC_with_ARM64)
    {
        // LDC arm64: two instructions since LDC 1.12 -O2
        // Broadcast each lane's sign bit to all 8 bits, then bit-select.
        byte16 maskSX = vshrq_n_s8(cast(byte16)mask, 7);
        return cast(__m128i) vbslq_s8(maskSX, cast(byte16)b, cast(byte16)a);
    }
    else
    {
        // m = 0xFF where mask lane is negative (sign bit set), else 0x00.
        __m128i m = _mm_cmpgt_epi8(_mm_setzero_si128(), mask);
        // (a^b) saturating-minus m is 0 where m is 0xFF, a^b where m is 0;
        // XOR with b then yields b or a respectively.
        return _mm_xor_si128(_mm_subs_epu8(_mm_xor_si128(a, b), m), b);
    }
}
unittest
{
    __m128i A = _mm_setr_epi8( 0,  1,  2,  3,  4,  5,  6,  7,  
                               8,  9, 10, 11, 12, 13, 14, 15);
    __m128i B = _mm_setr_epi8(16, 17, 18, 19, 20, 21, 22, 23, 
                              24, 25, 26, 27, 28, 29, 30, 31);
    __m128i M = _mm_setr_epi8( 1, -1,  1,  1, -4,  1, -8,  127,  
                               1,  1, -1, -1,  4,  1,  8, -128);
    byte16 R = cast(byte16) _mm_blendv_epi8(A, B, M);
    byte[16] correct =      [  0, 17,  2,  3, 20,  5, 22,  7,
                               8,  9, 26, 27, 12, 13, 14, 31 ];
    assert(R.array == correct);
}
214 
215 
/// Blend packed double-precision (64-bit) floating-point elements from `a` and `b` using `mask`.
/// Select from `b` where the sign bit of the corresponding 64-bit `mask` element is set, else from `a`.
__m128d _mm_blendv_pd (__m128d a, __m128d b, __m128d mask) @trusted
{
    // PERF DMD
    static if (GDC_with_SSE42)
    {
        // PERF Amazingly enough, GCC/GDC generates the blendvpd instruction
        // with -msse4.2 but not -msse4.1.
        // Not sure what is the reason, and there is a replacement sequence.
        // Sounds like a bug.
        return __builtin_ia32_blendvpd(a, b, mask);
    }
    else static if (LDC_with_SSE41)
    {
        return __builtin_ia32_blendvpd(a, b, mask);
    }
    else static if (LDC_with_ARM64)
    {
        // Arithmetic shift broadcasts each lane's sign bit, then bit-select.
        long2 shift;
        shift = 63;
        long2 lmask = cast(long2)mask >> shift;
        return cast(__m128d) vbslq_s64(lmask, cast(long2)b, cast(long2)a);
    }
    else
    {
        __m128d r; // PERF =void;
        long2 lmask = cast(long2)mask;
        for (int n = 0; n < 2; ++n)
        {
            // lmask.array[n] < 0 tests exactly the sign bit of the mask lane.
            r.ptr[n] = (lmask.array[n] < 0) ? b.array[n] : a.array[n];
        }
        return r;
    }
}
unittest
{
    __m128d A = _mm_setr_pd(1.0, 2.0);
    __m128d B = _mm_setr_pd(3.0, 4.0);
    __m128d M1 = _mm_setr_pd(-3.0, 2.0);
    __m128d R1 = _mm_blendv_pd(A, B, M1);
    double[2] correct1 = [3.0, 2.0];
    assert(R1.array == correct1);

    // Note: wouldn't work with -double.nan, since in some AArch64 archs the NaN sign bit is lost
    // See Issue #78
    __m128d M2 = _mm_setr_pd(double.nan, double.infinity);
    __m128d R2 = _mm_blendv_pd(A, B, M2);
    double[2] correct2 = [1.0, 2.0];
    assert(R2.array == correct2);
}
266 
267 
/// Blend packed single-precision (32-bit) floating-point elements from `a` and `b` using `mask`.
/// Select from `b` where the sign bit of the corresponding 32-bit `mask` element is set, else from `a`.
__m128 _mm_blendv_ps (__m128 a, __m128 b, __m128 mask) pure @trusted
{
    // PERF DMD
    static if (GDC_with_SSE41)
    {
        return __builtin_ia32_blendvps(a, b, mask);
    }
    else static if (LDC_with_SSE41)
    {
        return __builtin_ia32_blendvps(a, b, mask);
    }
    else static if (LDC_with_ARM64)
    {
        // Arithmetic shift broadcasts each lane's sign bit, then bit-select.
        int4 shift;
        shift = 31;
        int4 lmask = cast(int4)mask >> shift;
        return cast(__m128) vbslq_s32(lmask, cast(int4)b, cast(int4)a);
    }
    else
    {
        // LDC x86_64: Compiles to 5 instr since LDC 1.27 -O2
        // If lack of optimization, consider replacing by an explicit
        // mask/select sequence:
        //   __m128i m = _mm_srai_epi32(cast(__m128i)mask, 31);
        //   return _mm_or_si128(_mm_and_si128(m, b),
        //                       _mm_andnot_si128(m, a));
        // LLVM makes almost the same sequence when optimized.
        __m128 r;
        int4 lmask = cast(int4)mask;
        for (int n = 0; n < 4; ++n)
        {
            // lmask.array[n] < 0 tests exactly the sign bit of the mask lane.
            r.ptr[n] = (lmask.array[n] < 0) ? b.array[n] : a.array[n];
        }
        return r;
    }
}
unittest
{
    __m128 A  = _mm_setr_ps( 0.0f, 1.0f, 2.0f, 3.0f);
    __m128 B  = _mm_setr_ps( 4.0f, 5.0f, 6.0f, 7.0f);
    __m128 M1 = _mm_setr_ps(-3.0f, 2.0f, 1.0f, -10000.0f);
    __m128 M2 = _mm_setr_ps(float.nan, float.nan, -0.0f, +0.0f);
    __m128 R1 = _mm_blendv_ps(A, B, M1);
    __m128 R2 = _mm_blendv_ps(A, B, M2);
    float[4] correct1 =    [ 4.0f, 1.0f, 2.0f, 7.0f];
    float[4] correct2 =    [ 0.0f, 1.0f, 6.0f, 3.0f];
    assert(R1.array == correct1);

    // Note: wouldn't work with -float.nan, since in some AArch64 archs the NaN sign bit is lost
    // See Issue #78
    assert(R2.array == correct2);
}
321 
/// Round the packed double-precision (64-bit) floating-point elements in `a` up to an integer value, 
/// and store the results as packed double-precision floating-point elements.
__m128d _mm_ceil_pd (__m128d a) @trusted
{
    static if (LDC_with_ARM64)
    {
        // LDC arm64 acceptable since 1.8 -O2
        // Unfortunately x86 intrinsics force a round-trip back to double2
        // ARM neon semantics wouldn't have that
        // NOTE(review): ceil is done via convert-to-s64-rounding-up; values
        // outside long range presumably differ from roundpd — verify.
        long2 l = vcvtpq_s64_f64(a);
        double2 r;
        r.ptr[0] = l.array[0];
        r.ptr[1] = l.array[1];
        return r;
    }
    else
    {
        // 2 == _MM_FROUND_TO_POS_INF
        return _mm_round_pd!2(a);
    }
}
unittest
{
    __m128d A = _mm_setr_pd(1.3f, -2.12f);
    __m128d B = _mm_setr_pd(53.6f, -2.7f);
    A = _mm_ceil_pd(A);
    B = _mm_ceil_pd(B);
    double[2] correctA = [2.0, -2.0];
    double[2] correctB = [54.0, -2.0];
    assert(A.array == correctA);
    assert(B.array == correctB);
}
353 
/// Round the packed single-precision (32-bit) floating-point elements in `a` up to an integer value, 
/// and store the results as packed single-precision floating-point elements.
__m128 _mm_ceil_ps (__m128 a) @trusted
{
    static if (LDC_with_ARM64)
    {
        // LDC arm64 acceptable since 1.8 -O1
        // NOTE(review): ceil is done via convert-to-s32-rounding-up; values
        // outside int range presumably differ from roundps — verify.
        int4 l = vcvtpq_s32_f32(a);
        float4 r;
        r.ptr[0] = l.array[0];
        r.ptr[1] = l.array[1];
        r.ptr[2] = l.array[2];
        r.ptr[3] = l.array[3];
        return r;
    }
    else
    {
        // 2 == _MM_FROUND_TO_POS_INF
        return _mm_round_ps!2(a);
    }
}
unittest
{
    __m128 A = _mm_setr_ps(1.3f, -2.12f, 53.6f, -2.7f);
    __m128 C = _mm_ceil_ps(A);
    float[4] correct = [2.0f, -2.0f, 54.0f, -2.0f];
    assert(C.array == correct);
}
381 
/// Round the lower double-precision (64-bit) floating-point element in `b` up to an integer value, 
/// store the result as a double-precision floating-point element in the lower element of result, 
/// and copy the upper element from `a` to the upper element of dst.
__m128d _mm_ceil_sd (__m128d a, __m128d b) @trusted
{
    static if (LDC_with_ARM64)
    {
        // Scalar convert-to-s64 rounding towards +inf, upper lane kept from a.
        a[0] = vcvtps_s64_f64(b[0]);
        return a;
    }
    else
    {
        // 2 == _MM_FROUND_TO_POS_INF
        return _mm_round_sd!2(a, b);
    }
}
unittest
{
    __m128d A = _mm_setr_pd(1.3, -2.12);
    __m128d B = _mm_setr_pd(53.6, -3.7);
    __m128d C = _mm_ceil_sd(A, B);
    double[2] correct = [54.0, -2.12];
    assert(C.array == correct);
}
405 
/// Round the lower single-precision (32-bit) floating-point element in `b` up to an integer value,
/// store the result as a single-precision floating-point element in the lower element of result, 
/// and copy the upper 3 packed elements from `a` to the upper elements of result.
__m128 _mm_ceil_ss (__m128 a, __m128 b) @trusted
{
    static if (LDC_with_ARM64)
    {
        // Scalar convert-to-s32 rounding towards +inf, upper lanes kept from a.
        a[0] = vcvtps_s32_f32(b[0]);
        return a;
    }
    else
    {
        // 2 == _MM_FROUND_TO_POS_INF
        return _mm_round_ss!2(a, b);
    }
}
unittest
{
    __m128 A = _mm_setr_ps(1.3f, -2.12f, -4.5f, 1.1f);
    __m128 B = _mm_setr_ps(53.6f, -3.7f, 8.0f, 7.0f);
    __m128 C = _mm_ceil_ss(A, B);
    float[4] correct = [54.0f, -2.12f, -4.5f, 1.1f];
    assert(C.array == correct);
}
429 
/// Compare packed 64-bit integers in `a` and `b` for equality.
/// Returns all-ones (-1) in each 64-bit lane where equal, zero otherwise.
__m128i _mm_cmpeq_epi64 (__m128i a, __m128i b) @trusted
{
    static if (SIMD_COMPARISON_MASKS_16B)
    {
        version(DigitalMars)
        {
            // DMD doesn't recognize long2 == long2
            long2 la = cast(long2)a;
            long2 lb = cast(long2)b;
            long2 res;
            res.ptr[0] = (la.array[0] == lb.array[0]) ? -1 : 0;
            res.ptr[1] = (la.array[1] == lb.array[1]) ? -1 : 0;
            return cast(__m128i)res;
        }
        else
        {
            return cast(__m128i)(cast(long2)a == cast(long2)b);
        }
    }
    else static if (GDC_with_SSE41)
    {
        return cast(__m128i)__builtin_ia32_pcmpeqq(cast(long2)a, cast(long2)b);
    }
    else version(LDC)
    {
        // LDC x86: generates pcmpeqq since LDC 1.1 -O1
        //     arm64: generates cmeq since LDC 1.8 -O1
        return cast(__m128i) equalMask!long2(cast(long2)a, cast(long2)b);
    }
    else
    {
        // Scalar fallback (same as the DMD path above).
        // Clever pcmpeqd + pand use with LDC 1.24 -O2
        long2 la = cast(long2)a;
        long2 lb = cast(long2)b;
        long2 res;
        res.ptr[0] = (la.array[0] == lb.array[0]) ? -1 : 0;
        res.ptr[1] = (la.array[1] == lb.array[1]) ? -1 : 0;
        return cast(__m128i)res;
    }
}
unittest
{
    __m128i A = _mm_setr_epi64(-1, -2);
    __m128i B = _mm_setr_epi64(-3, -2);
    __m128i C = _mm_setr_epi64(-1, -4);
    long2 AB = cast(long2) _mm_cmpeq_epi64(A, B);
    long2 AC = cast(long2) _mm_cmpeq_epi64(A, C);
    long[2] correct1 = [0, -1];
    long[2] correct2 = [-1, 0];
    assert(AB.array == correct1);
    assert(AC.array == correct2);
}
483 
484 
/// Sign extend packed 16-bit integers in `a` to packed 32-bit integers.
/// Only the low 4 shorts of `a` are used.
__m128i _mm_cvtepi16_epi32 (__m128i a) @trusted
{
    // PERF DMD
    static if (GDC_with_SSE41)
    {
        return cast(__m128i)__builtin_ia32_pmovsxwd128(cast(short8)a);
    }
    else static if (LDC_with_optimizations)
    {
        // LDC x86: Generates pmovsxwd since LDC 1.1 -O0, also good in arm64
        enum ir = `
            %v = shufflevector <8 x i16> %0,<8 x i16> %0, <4 x i32> <i32 0, i32 1,i32 2, i32 3>
            %r = sext <4 x i16> %v to <4 x i32>
            ret <4 x i32> %r`;
        // Fix: the inline IR yields <4 x i32> (int4); cast the result to
        // __m128i to match the return type (was wrongly cast to __m128d).
        return cast(__m128i) LDCInlineIR!(ir, int4, short8)(cast(short8)a);
    }
    else
    {
        // Scalar fallback: widen the low 4 lanes one by one.
        short8 sa = cast(short8)a;
        int4 r;
        r.ptr[0] = sa.array[0];
        r.ptr[1] = sa.array[1];
        r.ptr[2] = sa.array[2];
        r.ptr[3] = sa.array[3];
        return r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi16(-1, 0, -32768, 32767, 0, 0, 0, 0);
    int4 C = cast(int4) _mm_cvtepi16_epi32(A);
    int[4] correct = [-1, 0, -32768, 32767];
    assert(C.array == correct);
}
520 
/// Sign extend packed 16-bit integers in `a` to packed 64-bit integers.
/// Only the low 2 shorts of `a` are used.
__m128i _mm_cvtepi16_epi64 (__m128i a) @trusted
{
    // PERF DMD
    static if (GDC_with_SSE41)
    {
        return cast(__m128i)__builtin_ia32_pmovsxwq128(cast(short8)a);
    }
    else static if (LDC_with_optimizations)
    {
        // LDC x86: Generates pmovsxwq since LDC 1.1 -O0, also good in arm64
        enum ir = `
            %v = shufflevector <8 x i16> %0,<8 x i16> %0, <2 x i32> <i32 0, i32 1>
            %r = sext <2 x i16> %v to <2 x i64>
            ret <2 x i64> %r`;
        return cast(__m128i) LDCInlineIR!(ir, long2, short8)(cast(short8)a);
    }
    else
    {
        // Scalar fallback: widen the low 2 lanes one by one.
        short8 sa = cast(short8)a;
        long2 r;
        r.ptr[0] = sa.array[0];
        r.ptr[1] = sa.array[1];
        return cast(__m128i)r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi16(-32768, 32767, 0, 0, 0, 0, 0, 0);
    long2 C = cast(long2) _mm_cvtepi16_epi64(A);
    long[2] correct = [-32768, 32767];
    assert(C.array == correct);
}
554 
/// Sign extend packed 32-bit integers in `a` to packed 64-bit integers.
/// Only the low 2 ints of `a` are used.
__m128i _mm_cvtepi32_epi64 (__m128i a) @trusted
{
    // PERF DMD
    static if (GDC_with_SSE41)
    {
        return cast(__m128i)__builtin_ia32_pmovsxdq128(cast(int4)a);
    }
    else static if (LDC_with_optimizations)
    {
        // LDC x86: Generates pmovsxdq since LDC 1.1 -O0, also good in arm64
        enum ir = `
            %v = shufflevector <4 x i32> %0,<4 x i32> %0, <2 x i32> <i32 0, i32 1>
            %r = sext <2 x i32> %v to <2 x i64>
            ret <2 x i64> %r`;
        return cast(__m128i) LDCInlineIR!(ir, long2, int4)(cast(int4)a);
    }
    else
    {
        // Scalar fallback: widen the low 2 lanes one by one.
        int4 sa = cast(int4)a;
        long2 r;
        r.ptr[0] = sa.array[0];
        r.ptr[1] = sa.array[1];
        return cast(__m128i)r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi32(-4, 42, 0, 0);
    long2 C = cast(long2) _mm_cvtepi32_epi64(A);
    long[2] correct = [-4, 42];
    assert(C.array == correct);
}
588 
589 
/// Sign extend packed 8-bit integers in `a` to packed 16-bit integers.
/// Only the low 8 bytes of `a` are used.
__m128i _mm_cvtepi8_epi16 (__m128i a) pure @trusted
{
    // PERF DMD
    static if (GDC_with_SSE41)
    {
        // GDC's builtin takes unsigned lanes; the cast is bit-preserving.
        alias ubyte16 = __vector(ubyte[16]);
        return cast(__m128i)__builtin_ia32_pmovsxbw128(cast(ubyte16)a);
    }
    else static if (LDC_with_optimizations)
    {
        // LDC x86: pmovsxbw generated since LDC 1.1.0 -O0 
        // LDC ARM64: sshll generated since LDC 1.8.0 -O1
        enum ir = `
            %v = shufflevector <16 x i8> %0,<16 x i8> %0, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
            %r = sext <8 x i8> %v to <8 x i16>
            ret <8 x i16> %r`;
        return cast(__m128i) LDCInlineIR!(ir, short8, byte16)(cast(byte16)a);
    }
    else
    {
        // Scalar fallback: widen the low 8 lanes one by one.
        byte16 sa = cast(byte16)a;
        short8 r;
        foreach(n; 0..8)
            r.ptr[n] = sa.array[n];
        return cast(__m128i)r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi8(127, -128, 1, -1, 0, 2, -4, -8, 0, 0, 0, 0, 0, 0, 0, 0);
    short8 C = cast(short8) _mm_cvtepi8_epi16(A);
    short[8] correct = [127, -128, 1, -1, 0, 2, -4, -8];
    assert(C.array == correct);
}
625 
626 
/// Sign extend packed 8-bit integers in `a` to packed 32-bit integers.
/// Only the low 4 bytes of `a` are used.
__m128i _mm_cvtepi8_epi32 (__m128i a) @trusted
{
    // PERF DMD
    static if (GDC_with_SSE41)
    {
        // GDC's builtin takes unsigned lanes; the cast is bit-preserving.
        alias ubyte16 = __vector(ubyte[16]);
        return cast(__m128i)__builtin_ia32_pmovsxbd128(cast(ubyte16)a);
    }
    else static if (LDC_with_SSE41 && LDC_with_optimizations)
    {
        // LDC x86: Generates pmovsxbd since LDC 1.1 -O0
        // NOTE(review): unlike sibling conversions, this path also requires
        // LDC_with_SSE41 — presumably to route ARM64 to the fallback below;
        // confirm before unifying the conditions.
        enum ir = `
            %v = shufflevector <16 x i8> %0,<16 x i8> %0, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
            %r = sext <4 x i8> %v to <4 x i32>
            ret <4 x i32> %r`;
        return cast(__m128i) LDCInlineIR!(ir, int4, byte16)(cast(byte16)a);
    }
    else
    {
        // LDC ARM64: this gives the same codegen than a vmovl_s16/vmovl_s8 sequence would
        byte16 sa = cast(byte16)a;
        int4 r;
        r.ptr[0] = sa.array[0];
        r.ptr[1] = sa.array[1];
        r.ptr[2] = sa.array[2];
        r.ptr[3] = sa.array[3];
        return cast(__m128i)r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi8(127, -128, 1, -1, 0, 2, -4, -8, 0, 0, 0, 0, 0, 0, 0, 0);
    int4 C = cast(int4) _mm_cvtepi8_epi32(A);
    int[4] correct = [127, -128, 1, -1];
    assert(C.array == correct);
}
664 
665 
/// Sign extend packed 8-bit integers in the low 8 bytes of `a` to packed 64-bit integers.
/// Only the low 2 bytes of `a` actually contribute to the result.
__m128i _mm_cvtepi8_epi64 (__m128i a) @trusted
{
    // PERF DMD
    static if (GDC_with_SSE41)
    {
        // GDC's builtin takes unsigned lanes; the cast is bit-preserving.
        alias ubyte16 = __vector(ubyte[16]);
        return cast(__m128i)__builtin_ia32_pmovsxbq128(cast(ubyte16)a);
    }
    else static if (LDC_with_optimizations)
    {
        // LDC x86: Generates pmovsxbq since LDC 1.1 -O0, 
        // LDC arm64: it's ok since LDC 1.8 -O1
        enum ir = `
            %v = shufflevector <16 x i8> %0,<16 x i8> %0, <2 x i32> <i32 0, i32 1>
            %r = sext <2 x i8> %v to <2 x i64>
            ret <2 x i64> %r`;
        return cast(__m128i) LDCInlineIR!(ir, long2, byte16)(cast(byte16)a);
    }
    else
    {
        // Scalar fallback: widen the low 2 lanes one by one.
        byte16 sa = cast(byte16)a;
        long2 r;
        foreach(n; 0..2)
            r.ptr[n] = sa.array[n];
        return cast(__m128i)r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi8(127, -128, 1, -1, 0, 2, -4, -8, 0, 0, 0, 0, 0, 0, 0, 0);
    long2 C = cast(long2) _mm_cvtepi8_epi64(A);
    long[2] correct = [127, -128];
    assert(C.array == correct);
}
701 
702 
/// Zero extend packed unsigned 16-bit integers in `a` to packed 32-bit integers.
/// Only the low 4 shorts of `a` are used.
__m128i _mm_cvtepu16_epi32 (__m128i a) @trusted
{
    // PERF DMD
    static if (GDC_with_SSE41)
    {
        return cast(__m128i) __builtin_ia32_pmovzxwd128(cast(short8)a);
    }
    else
    {
        // LDC x86: generates pmovzxwd since LDC 1.12 -O1 also good without SSE4.1
        //     arm64: ushll since LDC 1.12 -O1
        // cast(ushort) reinterprets the signed lane as unsigned => zero-extension.
        short8 sa = cast(short8)a;
        int4 r;
        r.ptr[0] = cast(ushort)sa.array[0];
        r.ptr[1] = cast(ushort)sa.array[1];
        r.ptr[2] = cast(ushort)sa.array[2];
        r.ptr[3] = cast(ushort)sa.array[3];
        return cast(__m128i)r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi16(-1, 0, -32768, 32767, 0, 0, 0, 0);
    int4 C = cast(int4) _mm_cvtepu16_epi32(A);
    int[4] correct = [65535, 0, 32768, 32767];
    assert(C.array == correct);
}
731 
732 
/// Zero extend packed unsigned 16-bit integers in `a` to packed 64-bit integers.
/// Only the low 2 shorts of `a` are used.
__m128i _mm_cvtepu16_epi64 (__m128i a) @trusted
{
    // PERF DMD
    static if (GDC_with_SSE41)
    {
        return cast(__m128i) __builtin_ia32_pmovzxwq128(cast(short8)a);
    }
    else static if (LDC_with_ARM64)
    {
        // LDC arm64: a bit shorter than below, in -O2
        short8 sa = cast(short8)a;
        long2 r;
        for(int n = 0; n < 2; ++n)
            r.ptr[n] = cast(ushort)sa.array[n];
        return cast(__m128i)r;
    }
    else
    {
        // LDC x86: generates pmovzxwd since LDC 1.12 -O1 also good without SSE4.1
        // cast(ushort) reinterprets the signed lane as unsigned => zero-extension.
        short8 sa = cast(short8)a;
        long2 r;
        r.ptr[0] = cast(ushort)sa.array[0];
        r.ptr[1] = cast(ushort)sa.array[1];
        return cast(__m128i)r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi16(-1, 0, -32768, 32767, 0, 0, 0, 0);
    long2 C = cast(long2) _mm_cvtepu16_epi64(A);
    long[2] correct = [65535, 0];
    assert(C.array == correct);
}
767 
768 
/// Zero extend packed unsigned 32-bit integers in `a` to packed 64-bit integers.
/// Only the low 2 ints of `a` are used.
__m128i _mm_cvtepu32_epi64 (__m128i a) @trusted
{
    // PERF DMD
    static if (GDC_with_SSE41)
    {
        // Fix: pmovzxdq operates on packed 32-bit lanes; pass the argument
        // as int4 (was wrongly cast to short8, mismatching the builtin's
        // v4si operand and the instruction's semantics).
        return cast(__m128i) __builtin_ia32_pmovzxdq128(cast(int4)a);
    }
    else
    {
        // LDC x86: generates pmovzxdq since LDC 1.12 -O1 also good without SSE4.1
        //     arm64: generates ushll since LDC 1.12 -O1
        // cast(uint) reinterprets the signed lane as unsigned => zero-extension.
        int4 sa = cast(int4)a;
        long2 r;
        r.ptr[0] = cast(uint)sa.array[0];
        r.ptr[1] = cast(uint)sa.array[1];
        return cast(__m128i)r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi32(-1, 42, 0, 0);
    long2 C = cast(long2) _mm_cvtepu32_epi64(A);
    long[2] correct = [4294967295, 42];
    assert(C.array == correct);
}
795 
796 
/// Zero extend packed unsigned 8-bit integers in `a` to packed 16-bit integers.
/// Only the low 8 bytes of `a` are used.
__m128i _mm_cvtepu8_epi16 (__m128i a) pure @trusted
{
    // PERF DMD
    static if (GDC_with_SSE41)
    {
        return cast(__m128i) __builtin_ia32_pmovzxbw128(cast(ubyte16)a);
    }
    else static if (LDC_with_optimizations)
    {
        enum ir = `
            %v = shufflevector <16 x i8> %0,<16 x i8> %0, <8 x i32> <i32 0, i32 1,i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
            %r = zext <8 x i8> %v to <8 x i16>
            ret <8 x i16> %r`;
        return cast(__m128i) LDCInlineIR!(ir, short8, byte16)(cast(byte16)a);
    }
    else
    {
        // Interleaving with zero bytes is exactly zero-extension to 16-bit.
        return _mm_unpacklo_epi8(a, _mm_setzero_si128());
    }
}
unittest
{
    __m128i A = _mm_setr_epi8(127, -128, 1, -1, 0, 2, -4, -8, 0, 0, 0, 0, 0, 0, 0, 0);
    short8 C = cast(short8) _mm_cvtepu8_epi16(A);
    short[8] correct = [127, 128, 1, 255, 0, 2, 252, 248];
    assert(C.array == correct);
}
825 
826 
/// Zero extend packed unsigned 8-bit integers in `a` to packed 32-bit integers.
/// Only the low 4 bytes of `a` are used.
__m128i _mm_cvtepu8_epi32 (__m128i a) @trusted
{
    // PERF DMD
    static if (GDC_with_SSE41)
    {
        alias ubyte16 = __vector(ubyte[16]);
        return cast(__m128i) __builtin_ia32_pmovzxbd128(cast(ubyte16)a);
    }
    else static if (LDC_with_ARM64)
    {
        // LDC arm64: a bit better than below in -O2
        byte16 sa = cast(byte16)a;
        int4 r;
        for(int n = 0; n < 4; ++n) 
            r.ptr[n] = cast(ubyte)sa.array[n];
        return cast(__m128i)r;
    }
    else
    {
        // LDC x86: generates pmovzxbd since LDC 1.12 -O1 also good without SSE4.1
        // PERF: catastrophic with GDC without SSE4.1
        // cast(ubyte) reinterprets the signed lane as unsigned => zero-extension.
        byte16 sa = cast(byte16)a;
        int4 r;
        r.ptr[0] = cast(ubyte)sa.array[0];
        r.ptr[1] = cast(ubyte)sa.array[1];
        r.ptr[2] = cast(ubyte)sa.array[2];
        r.ptr[3] = cast(ubyte)sa.array[3];
        return cast(__m128i)r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi8(127, -128, 1, -1, 0, 2, -4, -8, 0, 0, 0, 0, 0, 0, 0, 0);
    int4 C = cast(int4) _mm_cvtepu8_epi32(A);
    int[4] correct = [127, 128, 1, 255];
    assert(C.array == correct);
}
865 
/// Zero extend packed unsigned 8-bit integers in the low 8 bytes of `a` to packed 64-bit integers.
/// Only the low 2 bytes of `a` actually contribute to the result.
__m128i _mm_cvtepu8_epi64 (__m128i a) @trusted
{
    // PERF DMD
    static if (GDC_with_SSE41)
    {
        alias ubyte16 = __vector(ubyte[16]);
        return cast(__m128i)__builtin_ia32_pmovzxbq128(cast(ubyte16)a);
    }
    else static if (LDC_with_ARM64)
    {
        // LDC arm64: this optimizes better than the loop below
        byte16 sa = cast(byte16)a;
        long2 r;
        for (int n = 0; n < 2; ++n)
            r.ptr[n] = cast(ubyte)sa.array[n];
        return cast(__m128i)r;
    }
    else
    {
        // LDC x86: Generates pmovzxbq since LDC 1.1 -O0, a pshufb without SSE4.1
        // cast(ubyte) reinterprets the signed lane as unsigned => zero-extension.
        byte16 sa = cast(byte16)a;
        long2 r;
        r.ptr[0] = cast(ubyte)sa.array[0];
        r.ptr[1] = cast(ubyte)sa.array[1];
        return cast(__m128i)r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi8(127, -2, 1, -1, 0, 2, -4, -8, 0, 0, 0, 0, 0, 0, 0, 0);
    long2 C = cast(long2) _mm_cvtepu8_epi64(A);
    long[2] correct = [127, 254];
    assert(C.array == correct);
}
901 
/// Conditionally multiply the packed double-precision (64-bit) floating-point elements 
/// in `a` and `b` using the high 4 bits in `imm8`, sum the four products, and conditionally
/// store the sum in dst using the low 4 bits of `imm8`.
/// (For 2 lanes only bits 4-5 of the multiply mask and bits 0-1 of the store mask matter.)
__m128d _mm_dp_pd(int imm8)(__m128d a, __m128d b) @trusted
{
    // PERF DMD
    static if (GDC_with_SSE41)
    {
        // Mask with 0x33 to keep only the bits meaningful for 2 lanes.
        return __builtin_ia32_dppd(a, b, imm8 & 0x33);
    }
    else static if (LDC_with_SSE41)
    {
        return __builtin_ia32_dppd(a, b, imm8 & 0x33);
    }
    else
    {
        __m128d zero = _mm_setzero_pd();
        // Multiply mask (imm8 bits 4-5): zero out non-participating products.
        __m128d temp = _mm_blend_pd!( (imm8 >>> 4) & 3)(zero, a * b);
        double sum = temp.array[0] + temp.array[1];
        // Store mask (imm8 bits 0-1): broadcast the sum to selected lanes.
        return _mm_blend_pd!(imm8 & 3)(zero, _mm_set1_pd(sum));
    }
}
unittest
{
    __m128d A = _mm_setr_pd(1.0, 2.0);
    __m128d B = _mm_setr_pd(4.0, 8.0);
    double2 R1 = _mm_dp_pd!(0x10 + 0x3 + 0x44)(A, B);
    double2 R2 = _mm_dp_pd!(0x20 + 0x1 + 0x88)(A, B);
    double2 R3 = _mm_dp_pd!(0x30 + 0x2 + 0x00)(A, B);
    double[2] correct1 = [ 4.0,  4.0];
    double[2] correct2 = [16.0,  0.0];
    double[2] correct3 = [ 0.0, 20.0];
    assert(R1.array == correct1);
    assert(R2.array == correct2);
    assert(R3.array == correct3);
}
938 
/// Conditionally multiply the packed single-precision (32-bit) floating-point elements 
/// in `a` and `b` using the high 4 bits in `imm8`, sum the four products, 
/// and conditionally store the sum in result using the low 4 bits of `imm8`.
__m128 _mm_dp_ps(int imm8)(__m128 a, __m128 b) @trusted
{
      // PERF DMD
    static if (GDC_with_SSE41)
    {
        return __builtin_ia32_dpps(a, b, cast(ubyte)imm8);
    }
    else static if (LDC_with_SSE41)
    {
        return __builtin_ia32_dpps(a, b, cast(byte)imm8);
    }
    else
    {
        // Fallback: mask the products with imm8[7:4], sum the four lanes,
        // then mask the broadcast sum with imm8[3:0].
        __m128 zero = _mm_setzero_ps();
        __m128 temp = _mm_blend_ps!( (imm8 >>> 4) & 15)(zero, a * b);
        float sum = temp.array[0] + temp.array[1] + temp.array[2] + temp.array[3];
        return _mm_blend_ps!(imm8 & 15)(zero, _mm_set1_ps(sum));
    }        
}
unittest
{
    __m128 A = _mm_setr_ps(1.0f, 2.0f, 4.0f, 8.0f);
    __m128 B = _mm_setr_ps(9.0f, 7.0f, 5.0f, 3.0f);
    float4 R1 = _mm_dp_ps!(0xf0 + 0xf)(A, B);
    float4 R2 = _mm_dp_ps!(0x30 + 0x5)(A, B);
    float4 R3 = _mm_dp_ps!(0x50 + 0xa)(A, B);
    float[4] correct1 =   [67.0f, 67.0f, 67.0f, 67.0f];
    float[4] correct2 =   [23.0f, 0.0f, 23.0f, 0.0f];
    float[4] correct3 =   [0.0f, 29.0f, 0.0f, 29.0f];
    assert(R1.array == correct1);
    assert(R2.array == correct2);
    assert(R3.array == correct3);
}
975 
976 
/// Extract a 32-bit integer from `a`, selected with `imm8`.
/// Only `imm8[1:0]` is used, matching hardware behavior.
int _mm_extract_epi32 (__m128i a, const int imm8) pure @trusted
{
    int4 lanes = cast(int4)a;
    return lanes.array[imm8 & 3];
}
unittest
{
    __m128i v = _mm_setr_epi32(1, 2, 3, 4);
    assert(_mm_extract_epi32(v, 0) == 1);
    assert(_mm_extract_epi32(v, 1 + 8) == 2); // index wraps modulo 4
    assert(_mm_extract_epi32(v, 3 + 4) == 4);
}
989 
/// Extract a 64-bit integer from `a`, selected with `imm8`.
/// Only `imm8[0]` is used, matching hardware behavior.
long _mm_extract_epi64 (__m128i a, const int imm8) pure @trusted
{
    return (cast(long2)a).array[imm8 & 1];
}
unittest
{
    __m128i v = _mm_setr_epi64(45, -67);
    assert(_mm_extract_epi64(v, 0) == 45);
    assert(_mm_extract_epi64(v, 1) == -67);
    assert(_mm_extract_epi64(v, 2) == 45); // index wraps modulo 2
}
1003 
/// Extract an 8-bit integer from `a`, selected with `imm8` (only `imm8[3:0]` is used).
/// Warning: the returned value is zero-extended to 32-bits.
int _mm_extract_epi8 (__m128i a, const int imm8) @trusted
{
    byte16 lanes = cast(byte16)a;
    byte e = lanes.array[imm8 & 15];
    return cast(ubyte)e; // zero-extend into the 32-bit return
}
unittest
{
    __m128i v = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, -1, 14, 15);
    assert(_mm_extract_epi8(v, 7) == 7);
    assert(_mm_extract_epi8(v, 13) == 255);     // -1 zero-extends to 255
    assert(_mm_extract_epi8(v, 7 + 16) == 7);   // index wraps modulo 16
}
1018 
/// Extract a single-precision (32-bit) floating-point element from `a`, selected with `imm8`.
/// Note: the raw bit pattern is returned as a 32-bit $(I integer), like the hardware instruction.
int _mm_extract_ps (__m128 a, const int imm8) @trusted
{
    int4 bits = cast(int4)a;
    return bits.array[imm8 & 3];
}
unittest
{
    __m128 v = _mm_setr_ps(1.0f, 2.0f, 3.0f, -4.0f);
    assert(_mm_extract_ps(v, 0) == 0x3f800000);               // bit pattern of 1.0f
    assert(_mm_extract_ps(v, 1 + 8) == 0x40000000);           // 2.0f, index wraps modulo 4
    assert(_mm_extract_ps(v, 3 + 4) == cast(int)0xc0800000);  // -4.0f
}
1032 
1033 
1034 
/// Round the packed double-precision (64-bit) floating-point elements in `a` down to an 
/// integer value, and store the results as packed double-precision floating-point elements.
__m128d _mm_floor_pd (__m128d a) @trusted
{
    static if (LDC_with_ARM64)
    {
        // LDC arm64 acceptable since 1.8 -O2
        // NEON convert-to-int with round-toward-minus-infinity, then back to double.
        long2 l = vcvtmq_s64_f64(a);
        double2 r;
        r.ptr[0] = l.array[0];
        r.ptr[1] = l.array[1];
        return r;
    }
    else
    {
        return _mm_round_pd!1(a); // 1 == _MM_FROUND_TO_NEG_INF
    }
}
unittest
{
    __m128d A = _mm_setr_pd(1.3f, -2.12f);
    __m128d B = _mm_setr_pd(53.6f, -2.7f);
    A = _mm_floor_pd(A);
    B = _mm_floor_pd(B);
    double[2] correctA = [1.0, -3.0];
    double[2] correctB = [53.0, -3.0];
    assert(A.array == correctA);
    assert(B.array == correctB);
}
1064 
/// Round the packed single-precision (32-bit) floating-point elements in `a` down to an 
/// integer value, and store the results as packed single-precision floating-point elements.
__m128 _mm_floor_ps (__m128 a) @trusted
{
    static if (LDC_with_ARM64)
    {
        // LDC arm64 acceptable since 1.8 -O1
        // NEON convert-to-int with round-toward-minus-infinity, then back to float.
        int4 l = vcvtmq_s32_f32(a);
        float4 r;
        r.ptr[0] = l.array[0];
        r.ptr[1] = l.array[1];
        r.ptr[2] = l.array[2];
        r.ptr[3] = l.array[3];
        return r;
    }
    else
    {
        return _mm_round_ps!1(a); // 1 == _MM_FROUND_TO_NEG_INF
    }
}
unittest
{
    __m128 A = _mm_setr_ps(1.3f, -2.12f, 53.6f, -2.7f);
    __m128 C = _mm_floor_ps(A);
    float[4] correct = [1.0f, -3.0f, 53.0f, -3.0f];
    assert(C.array == correct);
}
1092 
/// Round the lower double-precision (64-bit) floating-point element in `b` down to an 
/// integer value, store the result as a double-precision floating-point element in the 
/// lower element, and copy the upper element from `a` to the upper element.
__m128d _mm_floor_sd (__m128d a, __m128d b) @trusted
{
    static if (LDC_with_ARM64)
    {
        // Scalar NEON convert with round-toward-minus-infinity; upper lane of `a` kept.
        a[0] = vcvtms_s64_f64(b[0]);
        return a;
    }
    else
    {
        return _mm_round_sd!1(a, b); // 1 == _MM_FROUND_TO_NEG_INF
    }
}
unittest
{
    __m128d A = _mm_setr_pd(1.3, -2.12);
    __m128d B = _mm_setr_pd(-53.1, -3.7);
    __m128d C = _mm_floor_sd(A, B);
    double[2] correct = [-54.0, -2.12]; // lane 0 from floor(B[0]), lane 1 from A
    assert(C.array == correct);
}
1116 
/// Round the lower single-precision (32-bit) floating-point element in `b` down to an
/// integer value, store the result as a single-precision floating-point element in the
/// lower element, and copy the upper 3 packed elements from `a` to the upper elements.
__m128 _mm_floor_ss (__m128 a, __m128 b) @trusted
{
    static if (LDC_with_ARM64)
    {
        // Scalar NEON convert with round-toward-minus-infinity; upper lanes of `a` kept.
        a[0] = vcvtms_s32_f32(b[0]);
        return a;
    }
    else
    {
        return _mm_round_ss!1(a, b); // 1 == _MM_FROUND_TO_NEG_INF
    }
}
unittest
{
    __m128 A = _mm_setr_ps(1.3f, -2.12f, -4.5f, 1.1f);
    __m128 B = _mm_setr_ps(-539.3f, -3.7f, 8.0f, 7.0f);
    __m128 C = _mm_floor_ss(A, B);
    float[4] correct = [-540.0f, -2.12f, -4.5f, 1.1f]; // lane 0 = floor(B[0]), rest from A
    assert(C.array == correct);
}
1140 
/// Insert the 32-bit integer `i` into `a` at the location specified by `imm8[1:0]`.
__m128i _mm_insert_epi32 (__m128i a, int i, const int imm8) pure @trusted
{
    // Codegen notes: pinsrd with GDC -O1 -msse4.1; pinsrd with LDC x86 since 1.1 -O2
    // with -mattr=+sse4.1; ins.s with LDC arm64 since 1.8 -O2.
    int4 result = cast(int4)a;
    result.ptr[imm8 & 3] = i;
    return cast(__m128i)result;
}
unittest
{
    __m128i v = _mm_setr_epi32(1, 2, 3, 4);
    int4 r = cast(int4) _mm_insert_epi32(v, 5, 2 + 4); // index wraps to 2
    int[4] expected = [1, 2, 5, 4];
    assert(r.array == expected);
}
1158 
/// Insert the 64-bit integer `i` into `a` at the location specified by `imm8[0]`.
__m128i _mm_insert_epi64 (__m128i a, long i, const int imm8) pure @trusted
{
    // Codegen notes: pinsrq with GDC -O1 -msse4.1; LDC x86 always emits something sensible.
    long2 result = cast(long2)a;
    result.ptr[imm8 & 1] = i;
    return cast(__m128i)result;
}
unittest
{
    __m128i v = _mm_setr_epi64(1, 2);
    long2 r = cast(long2) _mm_insert_epi64(v, 5, 1 + 2); // index wraps to 1
    long[2] expected = [1, 5];
    assert(r.array == expected);
}
1175 
/// Insert the lower 8 bits of `i` into `a` at the byte location specified by `imm8[3:0]`.
__m128i _mm_insert_epi8 (__m128i a, int i, const int imm8) @trusted
{
    // Codegen notes: pinsrb with GDC -O1 -msse4.1. LDC x86 doesn't emit pinsrb
    // (maybe slower); LDC arm64 also spills to memory.
    byte16 result = cast(byte16)a;
    result.ptr[imm8 & 15] = cast(byte)i;
    return cast(__m128i)result;
}
unittest
{
    __m128i v = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
    byte16 r = cast(byte16) _mm_insert_epi8(v, 30, 4 + 16); // index wraps to 4
    byte[16] expected = [0, 1, 2, 3, 30, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15];
    assert(r.array == expected);
}
1193 
1194 
/// Warning: of course it does something totally different from `_mm_insert_epi32`!
/// Copy `a` to `tmp`, then insert a single-precision (32-bit) floating-point element from `b` 
/// into `tmp` using the control in `imm8`. Store `tmp` to result using the mask in `imm8[3:0]` 
/// (elements are zeroed out when the corresponding bit is set).
__m128 _mm_insert_ps(int imm8)(__m128 a, __m128 b) @trusted
{
    // PERF DMD
    static if (GDC_with_SSE41)
    {
        return __builtin_ia32_insertps128(a, b, cast(ubyte)imm8);
    }
    else static if (LDC_with_SSE41)
    {
        return __builtin_ia32_insertps128(a, b, cast(byte)imm8);
    }
    else
    {
        float4 tmp2 = a;
        float tmp1 = b.array[(imm8 >> 6) & 3];  // imm8[7:6] selects source element of b
        tmp2.ptr[(imm8 >> 4) & 3] = tmp1;       // imm8[5:4] selects destination slot
        return _mm_blend_ps!(imm8 & 15)(tmp2, _mm_setzero_ps()); // imm8[3:0] zeroes selected lanes
    }
}
unittest
{
    __m128 A = _mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f);
    __m128 B = _mm_setr_ps(5.0f, 6.0f, 7.0f, 8.0f);
    __m128 C = _mm_insert_ps!(128 + (32 + 16) + 4)(A, B); // src 2, dst 3, zero lane 2
    float[4] correct =    [1.0f, 2.0f, 0.0f, 7.0f];
    assert(C.array == correct);
}
1226 
1227 
/// Compare packed signed 32-bit integers in `a` and `b`, returns packed maximum values.
__m128i _mm_max_epi32 (__m128i a, __m128i b) pure @trusted
{
    static if (GDC_with_SSE41)
    {
        return cast(__m128i) __builtin_ia32_pmaxsd128(cast(int4)a, cast(int4)b);
    }
    else version(LDC)
    {
        // x86: pmaxsd since LDC 1.1 -O1
        // ARM: smax.4s since LDC 1.8 -01
        int4 sa = cast(int4)a;
        int4 sb = cast(int4)b;
        static if (SIMD_COMPARISON_MASKS_16B)
            int4 greater = sa > sb;
        else
            int4 greater = greaterMask!int4(sa, sb);
        // Mask-select: take a where a > b, else b.
        return cast(__m128i)( (greater & sa) | (~greater & sb) );
    }
    else
    {
        // Branchless select using the identity b ^ ((a ^ b) & mask).
        __m128i higher = _mm_cmpgt_epi32(a, b);
        __m128i aTob = _mm_xor_si128(a, b); // a ^ (a ^ b) == b
        __m128i mask = _mm_and_si128(aTob, higher);
        return _mm_xor_si128(b, mask);
    }
}
unittest
{
    int4 R = cast(int4) _mm_max_epi32(_mm_setr_epi32(0x7fffffff, 1, -4, 7),
                                      _mm_setr_epi32(        -4,-8,  9, -8));
    int[4] correct =                               [0x7fffffff, 1,  9,  7];
    assert(R.array == correct);
}
1262 
/// Compare packed signed 8-bit integers in `a` and `b`, 
/// and return packed maximum values.
__m128i _mm_max_epi8 (__m128i a, __m128i b) pure @trusted
{
    // PERF DMD
    static if (GDC_with_SSE41)
    {
        // Note: the builtin takes ubyte16 operands but implements the signed pmaxsb.
        return cast(__m128i) __builtin_ia32_pmaxsb128(cast(ubyte16)a, cast(ubyte16)b);
    }
    else version(LDC)
    {
        // x86: pmaxsb since LDC 1.1 -O1
        // ARM64: smax.16b since LDC 1.8.0 -O1
        byte16 sa = cast(byte16)a;
        byte16 sb = cast(byte16)b;
        static if (SIMD_COMPARISON_MASKS_16B)
            byte16 greater = sa > sb;
        else
            byte16 greater = cast(byte16) greaterMask!byte16(sa, sb);
        // Mask-select: take a where a > b, else b.
        return cast(__m128i)( (greater & sa) | (~greater & sb) );
    }
    else
    {
        // Branchless select using the identity b ^ ((a ^ b) & mask).
        __m128i lower = _mm_cmpgt_epi8(a, b); // ones where a should be selected, b else
        __m128i aTob = _mm_xor_si128(a, b); // a ^ (a ^ b) == b
        __m128i mask = _mm_and_si128(aTob, lower);
        return _mm_xor_si128(b, mask);
    }
}
unittest
{
    __m128i A = _mm_setr_epi8(127,  1, -4, -8, 9,    7, 0, 57, 0, 0, 0, 0, 0, 0, 0, 0);
    __m128i B = _mm_setr_epi8(  4, -8,  9, -7, 0, -128, 0,  0, 0, 0, 0, 0, 0, 0, 0, 0);
    byte16 R = cast(byte16) _mm_max_epi8(A, B);
    byte[16] correct =       [127,  1,  9, -7, 9,    7, 0, 57, 0, 0, 0, 0, 0, 0, 0, 0];
    assert(R.array == correct);
}
1300 
/// Compare packed unsigned 16-bit integers in `a` and `b`, returns packed maximum values.
__m128i _mm_max_epu16 (__m128i a, __m128i b) pure @trusted
{
    // PERF DMD
    static if (GDC_with_SSE41)
    {
        return cast(__m128i) __builtin_ia32_pmaxuw128(cast(short8)a, cast(short8)b);
    }
    else version(LDC)
    {
        // x86: pmaxuw since LDC 1.1 -O1
        // ARM64: umax.8h since LDC 1.8.0 -O1
        // PERF: without sse4.1, LLVM 12 produces a very interesting
        //          psubusw xmm0, xmm1
        //          paddw   xmm0, xmm1
        //       sequence that maybe should go in other min/max intrinsics? 
        ushort8 sa = cast(ushort8)a;
        ushort8 sb = cast(ushort8)b;
        static if (SIMD_COMPARISON_MASKS_16B)
        {
            // Note: doesn't work well with GDC, which prefers the builtin.
            ushort8 greater = sa > sb;
        }
        else
            ushort8 greater = cast(ushort8) greaterMask!ushort8(sa, sb);
        return cast(__m128i)( (greater & sa) | (~greater & sb) );
    }
    else
    {
        // b := a + saturate(b - a) == max(a, b), unsigned.
        b = _mm_subs_epu16(b, a);
        b = _mm_add_epi16(b, a);
        return b;
    }
}
unittest
{
    short8 R = cast(short8) _mm_max_epu16(_mm_setr_epi16(32767,  1, -4, -8, 9,     7, 0, 57),
                                          _mm_setr_epi16(   -4, -8,  9, -7, 0,-32768, 0,  0));
    short[8] correct =                                  [   -4, -8, -4, -7, 9,-32768, 0, 57];
    assert(R.array == correct);
}
1342 
/// Compare packed unsigned 32-bit integers in `a` and `b`, returns packed maximum values.
__m128i _mm_max_epu32 (__m128i a, __m128i b) pure @trusted
{
    // PERF DMD
    static if (GDC_with_SSE41)
    {
        return cast(__m128i) __builtin_ia32_pmaxud128(cast(int4)a, cast(int4)b);
    }
    else version(LDC)
    {
        // x86: pmaxud since LDC 1.1 -O1, also good without sse4.1
        // ARM64: umax.4s since LDC 1.8.0 -O1
        uint4 sa = cast(uint4)a;
        uint4 sb = cast(uint4)b;
        static if (SIMD_COMPARISON_MASKS_16B)
            uint4 greater = sa > sb;
        else
            uint4 greater = cast(uint4) greaterMask!uint4(sa, sb);
        return cast(__m128i)( (greater & sa) | (~greater & sb) );
    }
    else
    {
        // Bias both operands by 0x80000000 so a signed compare orders them as unsigned.
        // PERF: LLVM suggests to replace the _mm_add_epi32 by _mm_xor_si128, and the last xor by an "_mm_or_si128"
        /+
        movdqa  xmm2, xmmword ptr [-0x80000000, -0x80000000, -0x80000000, -0x80000000]
        movdqa  xmm3, xmm1
        pxor    xmm3, xmm2
        pxor    xmm2, xmm0
        pcmpgtd xmm2, xmm3
        pand    xmm0, xmm2
        pandn   xmm2, xmm1
        por     xmm0, xmm2
        +/
        __m128i valueShift = _mm_set1_epi32(-0x80000000);
        __m128i higher = _mm_cmpgt_epi32(_mm_add_epi32(a, valueShift), _mm_add_epi32(b, valueShift));
        __m128i aTob = _mm_xor_si128(a, b); // a ^ (a ^ b) == b
        __m128i mask = _mm_and_si128(aTob, higher);
        return _mm_xor_si128(b, mask);
    }
}
unittest
{
    int4 R = cast(int4) _mm_max_epu32(_mm_setr_epi32(0x7fffffff, 1,  4, -7),
                                      _mm_setr_epi32(        -4,-8,  9, -8));
    int[4] correct =                                [        -4,-8,  9, -7];
    assert(R.array == correct);
}
1390 
/// Compare packed signed 32-bit integers in `a` and `b`, returns packed minimum values.
__m128i _mm_min_epi32 (__m128i a, __m128i b) pure @trusted
{
    // PERF DMD
    static if (GDC_with_SSE41)
    {
        return cast(__m128i) __builtin_ia32_pminsd128(cast(int4)a, cast(int4)b);
    }
    else version(LDC)
    {
        // x86: pminsd since LDC 1.1 -O1, also good without sse4.1
        // ARM: smin.4s since LDC 1.8 -01
        int4 sa = cast(int4)a;
        int4 sb = cast(int4)b;
        static if (SIMD_COMPARISON_MASKS_16B)
            int4 greater = sa > sb;
        else
            int4 greater = greaterMask!int4(sa, sb);
        // Mask-select inverted vs. max: take a where a <= b, else b.
        return cast(__m128i)( (~greater & sa) | (greater & sb) );
    }
    else
    {
        // Branchless select using the identity b ^ ((a ^ b) & mask).
        __m128i higher = _mm_cmplt_epi32(a, b);
        __m128i aTob = _mm_xor_si128(a, b); // a ^ (a ^ b) == b
        __m128i mask = _mm_and_si128(aTob, higher);
        return _mm_xor_si128(b, mask);
    }
}
unittest
{
    int4 R = cast(int4) _mm_min_epi32(_mm_setr_epi32(0x7fffffff,  1, -4, 7),
                                      _mm_setr_epi32(        -4, -8,  9, -8));
    int[4] correct =                               [         -4, -8, -4, -8];
    assert(R.array == correct);
}
1426 
/// Compare packed signed 8-bit integers in `a` and `b`, 
/// and return packed minimum values.
__m128i _mm_min_epi8 (__m128i a, __m128i b) pure @trusted
{
    // PERF DMD
    static if (GDC_with_SSE41)
    {
        // Note: the builtin takes ubyte16 operands but implements the signed pminsb.
        return cast(__m128i) __builtin_ia32_pminsb128(cast(ubyte16)a, cast(ubyte16)b);
    }
    else version(LDC)
    {
        // x86: pminsb since LDC 1.1 -O1
        // ARM64: smin.16b since LDC 1.8.0 -O1
        byte16 sa = cast(byte16)a;
        byte16 sb = cast(byte16)b;
        static if (SIMD_COMPARISON_MASKS_16B)
            byte16 greater = sa > sb;
        else
            byte16 greater = cast(byte16) greaterMask!byte16(sa, sb);
        // Mask-select inverted vs. max: take a where a <= b, else b.
        return cast(__m128i)( (~greater & sa) | (greater & sb) );
    }
    else
    {
        // Branchless select using the identity b ^ ((a ^ b) & mask).
        __m128i lower = _mm_cmplt_epi8(a, b); // ones where a should be selected, b else
        __m128i aTob = _mm_xor_si128(a, b); // a ^ (a ^ b) == b
        __m128i mask = _mm_and_si128(aTob, lower);
        return _mm_xor_si128(b, mask);
    }
}
unittest
{
    __m128i A = _mm_setr_epi8(127,  1, -4, -8, 9,    7, 0, 57, 0, 0, 0, 0, 0, 0, 0, 0);
    __m128i B = _mm_setr_epi8(  4, -8,  9, -7, 0, -128, 0,  0, 0, 0, 0, 0, 0, 0, 0, 0);
    byte16 R = cast(byte16) _mm_min_epi8(A, B);
    byte[16] correct =       [  4, -8, -4, -8, 0, -128, 0,  0, 0, 0, 0, 0, 0, 0, 0, 0];
    assert(R.array == correct);
}
1464 
/// Compare packed unsigned 16-bit integers in a and b, and store packed minimum values in dst.
__m128i _mm_min_epu16 (__m128i a, __m128i b) pure @trusted
{
    // PERF DMD
    static if (GDC_with_SSE41)
    {
        return cast(__m128i) __builtin_ia32_pminuw128(cast(short8)a, cast(short8)b);
    }
    else version(LDC)
    {
        // x86: pminuw since LDC 1.1 -O1, psubusw+psubw sequence without sse4.1
        // ARM64: umin.8h since LDC 1.8.0 -O1
        ushort8 sa = cast(ushort8)a;
        ushort8 sb = cast(ushort8)b;
        static if (SIMD_COMPARISON_MASKS_16B)
            ushort8 greater = (sb > sa);
        else
            ushort8 greater = cast(ushort8) greaterMask!ushort8(sb, sa);
        // greater marks lanes where b > a, so keep a there, b elsewhere.
        return cast(__m128i)( (greater & sa) | (~greater & sb) );
    }
    else
    {
        // b := b - saturate(b - a) == min(a, b), unsigned.
        __m128i c = _mm_subs_epu16(b, a);
        b = _mm_sub_epi16(b, c);
        return b;
    }
}
unittest
{
    short8 R = cast(short8) _mm_min_epu16(_mm_setr_epi16(32767,  1, -4, -8, 9,     7, 0, 57),
                                          _mm_setr_epi16(   -4, -8,  9, -7, 0,-32768, 0,  0));
    short[8] correct =                                  [32767,  1,  9, -8, 0,     7, 0,  0];
    assert(R.array == correct);
}
1499 
/// Compare packed unsigned 32-bit integers in a and b, and store packed minimum values in dst.
__m128i _mm_min_epu32 (__m128i a, __m128i b) pure @trusted
{
    // PERF DMD
    static if (GDC_with_SSE41)
    {
        return cast(__m128i) __builtin_ia32_pminud128(cast(int4)a, cast(int4)b);
    }
    else version(LDC)
    {
        // x86: pminud since LDC 1.1 -O1, also good without sse4.1
        // ARM64: umin.4s since LDC 1.8.0 -O1
        uint4 sa = cast(uint4)a;
        uint4 sb = cast(uint4)b;
        static if (SIMD_COMPARISON_MASKS_16B)
            uint4 greater = sa > sb;
        else
            uint4 greater = cast(uint4) greaterMask!uint4(sa, sb);
        // Mask-select inverted vs. max: take a where a <= b, else b.
        return cast(__m128i)( (~greater & sa) | (greater & sb) );
    }
    else
    {
        // Bias both operands by 0x80000000 so a signed compare orders them as unsigned.
        // PERF: same remark as in _mm_max_epu32
        __m128i valueShift = _mm_set1_epi32(-0x80000000);
        __m128i higher = _mm_cmpgt_epi32(_mm_add_epi32(b, valueShift), _mm_add_epi32(a, valueShift));
        __m128i aTob = _mm_xor_si128(a, b); // a ^ (a ^ b) == b
        __m128i mask = _mm_and_si128(aTob, higher);
        return _mm_xor_si128(b, mask);
    }
}
unittest
{
    int4 R = cast(int4) _mm_min_epu32(_mm_setr_epi32(0x7fffffff, 1,  4, -7),
                                      _mm_setr_epi32(        -4,-8,  9, -8));
    int[4] correct =                                [0x7fffffff, 1,  4, -8];
    assert(R.array == correct);
}
1537 
/// Horizontally compute the minimum amongst the packed unsigned 16-bit integers in `a`, 
/// store the minimum and index in return value, and zero the remaining bits.
__m128i _mm_minpos_epu16 (__m128i a) @trusted
{
    // PERF DMD
    static if (GDC_with_SSE41)
    {
        return cast(__m128i) __builtin_ia32_phminposuw128(cast(short8)a);
    }
    else static if (LDC_with_SSE41)
    {
        return cast(__m128i) __builtin_ia32_phminposuw128(cast(short8)a);
    }
    else static if (LDC_with_ARM64)
    {
        // Pack (index, value) into 32-bit lanes: index in the low 16 bits, value in
        // the high 16 bits. An unsigned 32-bit min then orders by value first, and
        // breaks value ties by the lower index. Tree-reduce 8 lanes down to 1.
        __m128i indices = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
        __m128i combinedLo = _mm_unpacklo_epi16(indices, a);
        __m128i combinedHi = _mm_unpackhi_epi16(indices, a);
        __m128i best = _mm_min_epu32(combinedLo, combinedHi);
        best = _mm_min_epu32(best, _mm_srli_si128!8(best));
        best = _mm_min_epu32(best, _mm_srli_si128!4(best));
        short8 sbest = cast(short8)best;
        short8 r;
        r[0] = sbest[1]; // minimum value (was in the high half of the winning pair)
        r[1] = sbest[0]; // Note: the search must have inverted index in order to prioritize lower index in case of tie
        r[2] = 0;
        r[3] = 0;
        r[4] = 0;
        r[5] = 0;
        r[6] = 0;
        r[7] = 0;
        return cast(__m128i)r;
    }
    else
    {
        // Scalar fallback: linear scan. Strict `<` keeps the lowest index on ties,
        // as the hardware instruction does.
        short8 sa = cast(short8)a;
        ushort min = 0xffff;
        int index = 0;
        for(int n = 0; n < 8; ++n)
        {
            ushort c = sa.array[n];
            if (c < min)
            {
                min = c;
                index = n;
            }
        }
        short8 r;
        r.ptr[0] = min;              // element 0 = minimum value
        r.ptr[1] = cast(short)index; // element 1 = its index; all other elements zero
        return cast(__m128i)r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi16(14, 15, 1, 2, -3, 4, 5, 6);
    __m128i B = _mm_setr_epi16(14,  4, 4, 2, -3, 2, 5, 6);
    short8 R1 = cast(short8) _mm_minpos_epu16(A);
    short8 R2 = cast(short8) _mm_minpos_epu16(B);
    short[8] correct1 = [1, 2, 0, 0, 0, 0, 0, 0];
    short[8] correct2 = [2, 3, 0, 0, 0, 0, 0, 0]; // ties at value 2 resolve to lower index 3
    assert(R1.array == correct1);
    assert(R2.array == correct2);
}
1602 
/// Compute the sum of absolute differences (SADs) of quadruplets of unsigned 8-bit integers 
/// in `a` compared to those in `b`, and store the 16-bit results in dst. 
/// Eight SADs are performed using one quadruplet from `b` and eight quadruplets from `a`. 
/// One quadruplet is selected from `b` starting at on the offset specified in `imm8[1:0]`. 
/// Eight quadruplets are formed from sequential 8-bit integers selected from `a` starting 
/// at the offset specified in `imm8[2]`.
__m128i _mm_mpsadbw_epu8(int imm8)(__m128i a, __m128i b) @trusted
{
    // PERF DMD
    static if (GDC_with_SSE41)
    {
        return cast(__m128i) __builtin_ia32_mpsadbw128(cast(ubyte16)a, cast(ubyte16)b, cast(ubyte)imm8);  
    }
    else static if (LDC_with_SSE41)
    {
        return cast(__m128i) __builtin_ia32_mpsadbw128(cast(byte16)a, cast(byte16)b, cast(byte)imm8);
    }
    else
    {
        // imm8[2] selects the a start offset: byte 0 or byte 4.
        // Yes, the two high order quadruplets are unaddressable...
        int a_offset = ((imm8 & 4) >> 2) * 4;

        byte16 ba = cast(byte16)a;
        short8 r;

        // The b quadruplet is dword imm8[1:0] of b, placed in dwords 0 and 2 so a
        // single _mm_sad_epu8 can evaluate two a-quadruplets per iteration.
        __m128i comp_b = _mm_setr_epi32(b.array[imm8 & 3], 0, b.array[imm8 & 3], 0);

        for (int j = 0; j < 8; j += 2)
        {
            int k = a_offset + j;
            // Low 8 bytes: quadruplet starting at k; high 8 bytes: quadruplet at k+1.
            __m128i comp_a = _mm_setr_epi8(ba[k+0], ba[k+1], ba[k+2], ba[k+3],
                                           0, 0, 0, 0, 
                                           ba[k+1], ba[k+2], ba[k+3], ba[k+4],
                                           0, 0, 0, 0);
            short8 diffs = cast(short8) _mm_sad_epu8(comp_a, comp_b); // reusing this wins instructions in both x86 and arm64
            r.ptr[j] = diffs.array[0];
            r.ptr[j+1] = diffs.array[4];
        }
        return cast(__m128i)r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi8(0, 1, 2, 3,  4,  5, 6,  7, 8, 9, 10, 11, 12, 13, 14, 15);
    __m128i B = _mm_setr_epi8(9, 1, 2, 3, -1, -1, 0, -1, 5, 5,  5,  5, 12, 13, 14, 15);
    short[8] correct0 = [9, 11, 13, 15, 17, 19, 21, 23];
    short[8] correct1 = [763, 761, 759, 757, 755, 753, 751, 749];
    short[8] correct4 = [17, 19, 21, 23, 25, 27, 31, 35];
    short[8] correct5 = [755, 753, 751, 749, 747, 745, 743, 741];
    short[8] correct7 = [32, 28, 24, 20, 16, 12, 8, 4];
    short8 r1 = cast(short8) _mm_mpsadbw_epu8!1(A, B);
    short8 r4 = cast(short8) _mm_mpsadbw_epu8!4(A, B);
    short8 r5 = cast(short8) _mm_mpsadbw_epu8!5(A, B);
    short8 r7 = cast(short8) _mm_mpsadbw_epu8!7(A, B);
    short8 r8 = cast(short8) _mm_mpsadbw_epu8!8(A, B); // only imm8[2:0] matter: 8 behaves as 0
    assert(r1.array == correct1);
    assert(r4.array == correct4);
    assert(r5.array == correct5);
    assert(r7.array == correct7);
    assert(r8.array == correct0);
}
1665 
/// Multiply the low signed 32-bit integers from each packed 64-bit element in a and b, and store the signed 64-bit results in dst.
__m128i _mm_mul_epi32 (__m128i a, __m128i b) pure @trusted
{
    // PERF DMD
    static if (GDC_with_SSE41)
    {
        return cast(__m128i) __builtin_ia32_pmuldq128(cast(int4)a, cast(int4)b);
    }
    else static if (LDC_with_SSE41 && LDC_with_optimizations)
    {
        // For some reason, clang has the builtin but it's not in IntrinsicsX86.td
        // Use IR instead.
        // This generates pmuldq with since LDC 1.2.0 -O0 
        enum ir = `
            %ia = shufflevector <4 x i32> %0,<4 x i32> %0, <2 x i32> <i32 0, i32 2>
            %ib = shufflevector <4 x i32> %1,<4 x i32> %1, <2 x i32> <i32 0, i32 2>
            %la = sext <2 x i32> %ia to <2 x i64>
            %lb = sext <2 x i32> %ib to <2 x i64>
            %r = mul <2 x i64> %la, %lb
            ret <2 x i64> %r`;
        return cast(__m128i) LDCInlineIR!(ir, long2, int4, int4)(cast(int4)a, cast(int4)b);
    }
    else static if (LDC_with_ARM64)  
    {
        // 3 instructions since LDC 1.8 -O2
        // But had to make vmull_s32 be a builtin else it wouldn't optimize to smull
        int2 a_lo = vmovn_s64(cast(long2)a); // narrow: keep low 32 bits of each 64-bit lane
        int2 b_lo = vmovn_s64(cast(long2)b);
        return cast(__m128i) vmull_s32(a_lo, b_lo); // widening signed multiply
    }
    else
    {
        // Scalar fallback: lanes 0 and 2 hold the low dword of each 64-bit element.
        int4 ia = cast(int4)a;
        int4 ib = cast(int4)b;
        long2 r;
        r.ptr[0] = cast(long)ia.array[0] * ib.array[0];
        r.ptr[1] = cast(long)ia.array[2] * ib.array[2];
        return cast(__m128i)r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi32(61616461, 1915324654, 4564061, 3);
    __m128i B = _mm_setr_epi32(49716422, -915616216, -121144, 0);
    long2 R = cast(long2) _mm_mul_epi32(A, B);
    long[2] correct = [cast(long)61616461 * 49716422, cast(long)4564061 * -121144];
    assert(R.array == correct);
}
1714 
/// Multiply the packed 32-bit integers in `a` and `b`, producing intermediate 64-bit integers, 
/// return the low 32 bits of the intermediate integers.
__m128i _mm_mullo_epi32 (__m128i a, __m128i b) pure @trusted
{
    // PERF DMD
    // PERF GDC without SSE4.1 could be better
    static if (GDC_with_SSE41)
    {
        // Note: older GDC doesn't have that op, but older GDC
        // also has no support for -msse4.1 detection
        // Lane-wise 32-bit multiply; GDC selects pmulld from the vector operator.
        return cast(__m128i)(cast(int4)a * cast(int4)b);
    }
    else version(LDC)
    {
        // Same vector-operator route for LDC.
        return cast(__m128i)(cast(int4)a * cast(int4)b);
    }
    else
    {
        // DMD doesn't take the vector operator above: multiply lane by lane.
        // Truncation to the low 32 bits comes from ordinary int wraparound.
        int4 ia = cast(int4)a;
        int4 ib = cast(int4)b;
        int4 r;
        r.ptr[0] = ia.array[0] * ib.array[0];
        r.ptr[1] = ia.array[1] * ib.array[1];
        r.ptr[2] = ia.array[2] * ib.array[2];
        r.ptr[3] = ia.array[3] * ib.array[3];
        return r;
    }
}
unittest
{
    __m128i x = _mm_setr_epi32(61616461, 1915324654, 4564061, 3);
    __m128i y = _mm_setr_epi32(49716422, -915616216, -121144, 0);
    int4 lowBits = cast(int4) _mm_mullo_epi32(x, y);
    // Lane 0 spelled as the raw truncated bit pattern of 61616461 * 49716422.
    int[4] expected = [cast(int)0xBF370D8E, cast(int)(1915324654 * -915616216), cast(int)(4564061 * -121144), 0];
    assert(lowBits.array == expected);
}
1756 
1757 
/// Convert packed signed 32-bit integers from `a` and `b` 
/// to packed 16-bit integers using unsigned saturation.
/// Result lanes 0..3 come from `a`, lanes 4..7 from `b`, each clamped to [0, 65535].
__m128i _mm_packus_epi32 (__m128i a, __m128i b) pure @trusted
{
    static if (GDC_with_SSE41)
    {
        return cast(__m128i) __builtin_ia32_packusdw128(cast(short8)a, cast(short8)b);
    }
    else static if (LDC_with_SSE41)
    {
        return cast(__m128i) __builtin_ia32_packusdw128(cast(short8)a, cast(short8)b);
    }
    else static if (LDC_with_ARM64)
    {
       // Clamp negatives to zero first, then narrow with unsigned saturation.
       int4 z;
       z = 0;       
       return cast(__m128i) vcombine_u16(vqmovn_u32(vmaxq_s32(z, cast(int4)a)),
                                         vqmovn_u32(vmaxq_s32(z, cast(int4)b)));
    }
    else
    {
        // Scalar fallback with true unsigned saturation.
        // Note: the former subtract-32768 / packs / add-32768 trick wrapped
        // around for inputs in [int.min, int.min + 32767] (the signed subtract
        // overflows), producing 65535 where 0 is expected.
        static int clampU16(int x)
        {
            if (x < 0) return 0;
            if (x > 65535) return 65535;
            return x;
        }
        int4 ia = cast(int4)a;
        int4 ib = cast(int4)b;
        short8 r;
        foreach (i; 0 .. 4)
        {
            r.ptr[i]     = cast(short) clampU16(ia.array[i]);
            r.ptr[i + 4] = cast(short) clampU16(ib.array[i]);
        }
        return cast(__m128i)r;
    }
}
unittest
{
    __m128i input = _mm_setr_epi32(100000, -100000, 1000, 0);
    short8 packed = cast(short8) _mm_packus_epi32(input, input);
    // 100000 saturates to 65535, -100000 saturates to 0; pattern repeats for both halves.
    short[8] expected = [cast(short)65535, 0, 1000, 0, cast(short)65535, 0, 1000, 0];
    assert(packed.array == expected);
}
1794 
1795 
/// Round the packed double-precision (64-bit) floating-point elements in `a` using the 
/// rounding parameter, and store the results as packed double-precision floating-point elements.
/// Rounding is done according to the rounding[3:0] parameter, which can be one of:
///    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
///    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
///    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
///    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
///    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
__m128d _mm_round_pd(int rounding)(__m128d a) @trusted
{
    // PERF DMD
    static if (GDC_with_SSE41)
    {
        // Native roundpd; `rounding` is the instruction immediate.
        return __builtin_ia32_roundpd(a, rounding);
    }
    else static if (LDC_with_SSE41)
    {
        return __builtin_ia32_roundpd(a, rounding);
    }
    else
    {
        static if (rounding & _MM_FROUND_CUR_DIRECTION)
        {
            // Current-direction mode: no MXCSR change needed, just round-trip
            // through 64-bit integers (cvtsd follows the current rounding mode).
            // Convert to 64-bit integers
            long lo = _mm_cvtsd_si64(a);
            a.ptr[0] = a.array[1];          // move upper lane down to convert it too
            long hi = _mm_cvtsd_si64(a);
            return _mm_setr_pd(lo, hi);
        }
        else
        {
            version(GNU) pragma(inline, false); // else fail unittest with optimizations

            // Temporarily force MXCSR.RC (bits 13-14) to the requested mode,
            // do the double -> long -> double round-trip, then restore.
            uint old = _MM_GET_ROUNDING_MODE();
            _MM_SET_ROUNDING_MODE((rounding & 3) << 13);
            
            // Convert to 64-bit integers
            long lo = _mm_cvtsd_si64(a);
            a.ptr[0] = a.array[1];          // move upper lane down to convert it too
            long hi = _mm_cvtsd_si64(a);

            // Convert back to double to achieve the rounding
            // The problem is that a 64-bit double can't represent all the values 
            // a 64-bit integer can (and vice-versa). So this function won't work for
            // large values. (MAYDO: what range exactly?)
            _MM_SET_ROUNDING_MODE(old);
            return _mm_setr_pd(lo, hi);
        }
    }
}
unittest
{
    // tested in other intrinsics
}
1850 
/// Round the packed single-precision (32-bit) floating-point elements in `a` using the 
/// rounding parameter, and store the results as packed single-precision floating-point elements.
/// Rounding is done according to the rounding[3:0] parameter, which can be one of:
///    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
///    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
///    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
///    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
///    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
__m128 _mm_round_ps(int rounding)(__m128 a) @trusted
{
    // PERF ARM64: there is duplication because this isn't optimal for ARM64, so it is avoided externally
    static if (GDC_or_LDC_with_SSE41)
    {
        // Native roundps; `rounding` is the instruction immediate.
        return __builtin_ia32_roundps(a, rounding);
    }
    else
    {
        static if (rounding & _MM_FROUND_CUR_DIRECTION)
        {
            // Current-direction mode: a float -> int -> float round-trip
            // suffices, no MXCSR change required.
            __m128i integers = _mm_cvtps_epi32(a);
            return _mm_cvtepi32_ps(integers);
        }
        else
        {
            version(LDC) pragma(inline, false); // else _MM_SET_ROUNDING_MODE and _mm_cvtps_epi32 gets shuffled
            // Temporarily force MXCSR.RC (bits 13-14) to the requested mode;
            // scope(exit) guarantees the caller's mode is restored.
            uint old = _MM_GET_ROUNDING_MODE();
            _MM_SET_ROUNDING_MODE((rounding & 3) << 13);
            scope(exit) _MM_SET_ROUNDING_MODE(old);

            // Convert to 32-bit integers
            __m128i integers = _mm_cvtps_epi32(a);

            // Convert back to float to achieve the rounding
            // The problem is that a 32-bit float can't represent all the values 
            // a 32-bit integer can (and vice-versa). So this function won't work for
            // large values. (MAYDO: what range exactly?)
            __m128 result = _mm_cvtepi32_ps(integers);

            return result;
        }
    }
}
unittest
{
    // tested in other intrinsics
}
1897 
1898 
/// Round the lower double-precision (64-bit) floating-point element in `b` using the
/// rounding parameter, store the result as a double-precision floating-point element 
/// in the lower element of result, and copy the upper element from `a` to the upper element of result.
/// Rounding is done according to the rounding[3:0] parameter, which can be one of:
///    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
///    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
///    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
///    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
///    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
__m128d _mm_round_sd(int rounding)(__m128d a, __m128d b) @trusted
{
    static if (GDC_with_SSE41)
    {
        // Native roundsd; `rounding` is the instruction immediate.
        return __builtin_ia32_roundsd(a, b, rounding);
    }
    else static if (LDC_with_SSE41)
    {
        return __builtin_ia32_roundsd(a, b, rounding);
    }
    else
    {
        static if (rounding & _MM_FROUND_CUR_DIRECTION)
        {
            // Current-direction mode: cvtsd follows MXCSR.RC, so just
            // round-trip b's low lane through a 64-bit integer.
            // Convert to 64-bit integer
            long b0 = _mm_cvtsd_si64(b);
            a.ptr[0] = b0;                  // upper lane of `a` is kept as-is
            return a;
        }
        else
        {
            version(GNU) pragma(inline, false); // else fail unittest with optimizations

            // Temporarily force MXCSR.RC (bits 13-14) to the requested mode,
            // then restore it before returning.
            uint old = _MM_GET_ROUNDING_MODE();
            _MM_SET_ROUNDING_MODE((rounding & 3) << 13);
            
            // Convert to 64-bit integer
            long b0 = _mm_cvtsd_si64(b);
            a.ptr[0] = b0;                  // upper lane of `a` is kept as-is

            // Convert back to double to achieve the rounding
            // The problem is that a 64-bit double can't represent all the values 
            // a 64-bit integer can (and vice-versa). So this function won't work for
            // large values. (MAYDO: what range exactly?)
            _MM_SET_ROUNDING_MODE(old);
            return a;
        }
    }
}
unittest
{
    // tested in other intrinsics
}
1951 
1952 
/// Round the lower single-precision (32-bit) floating-point element in `b` using the 
/// rounding parameter, store the result as a single-precision floating-point element 
/// in the lower element of result, and copy the upper 3 packed elements from `a`
/// to the upper elements of result.
/// Rounding is done according to the rounding[3:0] parameter, which can be one of:
///    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
///    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
///    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
///    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
///    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
__m128 _mm_round_ss(int rounding)(__m128 a, __m128 b) @trusted
{
    static if (GDC_with_SSE41)
    {
        // Native roundss; `rounding` is the instruction immediate.
        return __builtin_ia32_roundss(a, b, rounding);
    }
    else static if (LDC_with_SSE41)
    {
        return __builtin_ia32_roundss(a, b, rounding);
    }
    else
    {
        static if (rounding & _MM_FROUND_CUR_DIRECTION)
        {
            // Current-direction mode: round-trip b's low lane through a
            // 32-bit integer, keeping a's upper three lanes.
            int b0 = _mm_cvtss_si32(b);
            a.ptr[0] = b0;   
            return a;
        }
        else version(GNU)
        {
            // GDC workaround: the mode-switching sequence must not be inlined,
            // else the unittest fails with optimizations (see _mm_round_sd).
            pragma(inline, false)
            __m128 GDCworkaround() nothrow @nogc @trusted 
            {
                uint old = _MM_GET_ROUNDING_MODE();
                _MM_SET_ROUNDING_MODE((rounding & 3) << 13);

                // Convert to 32-bit integer
                int b0 = _mm_cvtss_si32(b);
                a.ptr[0] = b0;       

                // Convert back to float to achieve the rounding
                // The problem is that a 32-bit float can't represent all the values 
                // a 32-bit integer can (and vice-versa). So this function won't work for
                // large values. (MAYDO: what range exactly?)
                _MM_SET_ROUNDING_MODE(old);
                return a;
            }
            return GDCworkaround();
        }
        else
        {
            // Temporarily force MXCSR.RC (bits 13-14) to the requested mode,
            // then restore it before returning.
            uint old = _MM_GET_ROUNDING_MODE();
            _MM_SET_ROUNDING_MODE((rounding & 3) << 13);

            // Convert to 32-bit integer
            int b0 = _mm_cvtss_si32(b);
            a.ptr[0] = b0;       

            // Convert back to float to achieve the rounding
            // The problem is that a 32-bit float can't represent all the values 
            // a 32-bit integer can (and vice-versa). So this function won't work for
            // large values. (MAYDO: what range exactly?)
            _MM_SET_ROUNDING_MODE(old);
            return a;
        }
    }
}
unittest
{
    // tested in other intrinsics
}
2024 
2025 
/// Load 128-bits of integer data from memory using a non-temporal memory hint. 
/// `mem_addr` must be aligned on a 16-byte boundary or a general-protection 
/// exception may be generated.
__m128i _mm_stream_load_si128 (void* mem_addr) pure @trusted
{
    // PERF DMD D_SIMD
    static if (GDC_with_SSE41)
    {
        // movntdqa: aligned load with non-temporal hint.
        return cast(__m128i) __builtin_ia32_movntdqa(cast(long2*)mem_addr);
    }
    else static if (LDC_with_InlineIREx && LDC_with_optimizations)
    {
        // LLVM IR load tagged with !nontemporal metadata (value 1).
        enum prefix = `!0 = !{ i32 1 }`;
        enum ir = `
            %r = load <4 x i32>, <4 x i32>* %0, !nontemporal !0
            ret <4 x i32> %r`;
        return cast(__m128i) LDCInlineIREx!(prefix, ir, "", int4, int4*)(cast(__m128i*)mem_addr);
    }
    else
    {
        return *cast(__m128i*)mem_addr; // regular move instead
    }
}
unittest
{
    align(16) static immutable int[4] correct = [1, 2, 3, 4];
    __m128i A = _mm_stream_load_si128(cast(__m128i*)(correct.ptr));
    _mm_mfence();
    assert(A.array == correct);
}
2056 
/// Return 1 if all bits in `a` are all 1's. Else return 0.
int _mm_test_all_ones (__m128i a) @safe
{
    // `a` is all-ones iff the all-ones mask is entirely contained in `a`.
    __m128i allOnes = _mm_set1_epi32(-1);
    return _mm_testc_si128(a, allOnes);
}
unittest
{
    __m128i fullySet = _mm_set1_epi32(-1);
    __m128i oneBitClear = _mm_set_epi32(-1, -2, -1, -1);
    assert(_mm_test_all_ones(fullySet) == 1);
    assert(_mm_test_all_ones(oneBitClear) == 0);
}
2069 
/// Return 1 if all bits in `a` are all 0's. Else return 0.
// This is a #BONUS since it was lacking in Intel Intrinsics API.
int _mm_test_all_zeros (__m128i a) @safe
{
    // `a` is all-zeroes iff a AND all-ones is zero.
    __m128i allOnes = _mm_set1_epi32(-1);
    return _mm_testz_si128(a, allOnes);
}
unittest
{
    __m128i fullyClear = _mm_set1_epi32(0);
    __m128i oneBitSet = _mm_set_epi32(0, 8, 0, 0);
    assert(_mm_test_all_zeros(fullyClear) == 1);
    assert(_mm_test_all_zeros(oneBitSet) == 0);
}
2083 
/// Compute the bitwise AND of 128 bits (representing integer data) in `a` and `mask`, 
/// and return 1 if the result is zero, otherwise return 0.
int _mm_test_all_zeros (__m128i a, __m128i mask) @safe
{
    // Same operation as _mm_testz_si128, re-exposed under a clearer name.
    return _mm_testz_si128(a, mask);
}
2090 
/// Compute the bitwise AND of 128 bits (representing integer data) in `a` and mask, and set ZF to 1 
/// if the result is zero, otherwise set ZF to 0. Compute the bitwise NOT of a and then AND with 
/// mask, and set CF to 1 if the result is zero, otherwise set CF to 0. Return 1 if both the ZF and
/// CF values are zero, otherwise return 0.
int _mm_test_mix_ones_zeros (__m128i a, __m128i mask) @trusted
{
    // Identical to _mm_testnzc_si128: returns 1 when `mask` selects
    // at least one set bit AND at least one clear bit of `a`.
    return _mm_testnzc_si128(a, mask);
}
2099 
/// Compute the bitwise NOT of a and then AND with b, and return 1 if the 
/// result is zero, otherwise return 0.
/// In other words, test if all bits masked by `b` are 1 in `a`.
int _mm_testc_si128 (__m128i a, __m128i b) pure @trusted
{
    // PERF DMD
    static if (GDC_or_LDC_with_SSE41)
    {
        // ptest sets CF when (~a & b) == 0; the builtin returns CF.
        return __builtin_ia32_ptestc128(cast(long2)a, cast(long2)b);
    }
    else static if (LDC_with_ARM64)
    {
        // Acceptable since LDC 1.8 -02
        long2 missing = vbicq_s64(cast(long2)b, cast(long2)a); // bits of b absent from a
        return !(vgetq_lane_s64(missing, 0) | vgetq_lane_s64(missing, 1));
    }
    else
    {
        // Generic: all bits of b must be covered by a.
        __m128i uncovered = (~a) & b;
        int[4] allZero = [0, 0, 0, 0];
        return uncovered.array == allZero;
    }
}
unittest
{
    __m128i bits     = _mm_setr_epi32(0x01, 0x02, 0x04, 0xf8);
    __m128i maskMiss = _mm_setr_epi32(0xfe, 0xfd, 0x00, 0x00);
    __m128i maskHit  = _mm_setr_epi32(0x00, 0x00, 0x04, 0x00);
    assert(_mm_testc_si128(bits, bits) == 1);
    assert(_mm_testc_si128(bits, maskMiss) == 0);
    assert(_mm_testc_si128(bits, maskHit) == 1);
}
2136 
/// Compute the bitwise AND of 128 bits (representing integer data) in `a` and `b`, 
/// and set ZF to 1 if the result is zero, otherwise set ZF to 0. 
/// Compute the bitwise NOT of `a` and then AND with `b`, and set CF to 1 if the 
/// result is zero, otherwise set CF to 0. 
/// Return 1 if both the ZF and CF values are zero, otherwise return 0.
int _mm_testnzc_si128 (__m128i a, __m128i b) @trusted
{
    // PERF DMD
    static if (GDC_or_LDC_with_SSE41)
    {
        return __builtin_ia32_ptestnzc128(cast(long2)a, cast(long2)b);
    }
    else static if (LDC_with_ARM64)
    {
        long2 inBoth  = vandq_s64(cast(long2)b, cast(long2)a); // a & b
        long2 onlyInB = vbicq_s64(cast(long2)b, cast(long2)a); // ~a & b
        int zf = !(vgetq_lane_s64(inBoth, 0) | vgetq_lane_s64(inBoth, 1));
        int cf = !(vgetq_lane_s64(onlyInB, 0) | vgetq_lane_s64(onlyInB, 1));
        return !(zf | cf);
    }
    else
    {
        // Generic: result is 1 iff b selects a mix of set and clear bits in a.
        __m128i inBoth  = a & b;
        __m128i onlyInB = ~a & b;
        int[4] allZero = [0, 0, 0, 0];
        bool zf = inBoth.array == allZero;
        bool cf = onlyInB.array == allZero;
        return !(zf || cf);
    }    
}
unittest
{
    __m128i bits = _mm_setr_epi32(0x01, 0x02, 0x04, 0xf8);
    __m128i mix  = _mm_setr_epi32(0x01, 0x40, 0x00, 0x00);
    __m128i none = _mm_setzero_si128();
    assert(_mm_testnzc_si128(bits, none) == 0);
    assert(_mm_testnzc_si128(bits, mix) == 1);
    assert(_mm_testnzc_si128(bits, bits) == 0);
}
2178 
/// Compute the bitwise AND of 128 bits (representing integer data) in a and b, 
/// and return 1 if the result is zero, otherwise return 0.
/// In other words, test if all bits masked by `b` are 0 in `a`.
int _mm_testz_si128 (__m128i a, __m128i b) @trusted
{
    // PERF DMD
    static if (GDC_or_LDC_with_SSE41)
    {
        // ptest sets ZF when (a & b) == 0; the builtin returns ZF.
        return __builtin_ia32_ptestz128(cast(long2)a, cast(long2)b);
    }
    else static if (LDC_with_ARM64)
    {
        // Acceptable since LDC 1.8 -02
        long2 inBoth = vandq_s64(cast(long2)a, cast(long2)b);
        return !(vgetq_lane_s64(inBoth, 0) | vgetq_lane_s64(inBoth, 1));
    }
    else 
    {
        // Generic: no bit of b may be set in a.
        __m128i inBoth = a & b;
        int[4] allZero = [0, 0, 0, 0];
        return inBoth.array == allZero;
    }    
}
unittest
{
    __m128i bits      = _mm_setr_epi32(0x01, 0x02, 0x04, 0xf8);
    __m128i disjoint  = _mm_setr_epi32(0xfe, 0xfd, 0x00, 0x07);
    __m128i overlap   = _mm_setr_epi32(0x00, 0x00, 0x04, 0x00);
    assert(_mm_testz_si128(bits, bits) == 0);
    assert(_mm_testz_si128(bits, disjoint) == 1);
    assert(_mm_testz_si128(bits, overlap) == 0);
}
2215