1 /**
2 * SSE2 intrinsics. 
3 * https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#techs=SSE2
4 *
5 * Copyright: Copyright Guillaume Piolat 2016-2020, Stefanos Baziotis 2019.
6 * License:   $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
7 */
8 module inteli.emmintrin;
9 
10 public import inteli.types;
11 public import inteli.xmmintrin; // SSE2 includes SSE1
12 import inteli.mmx;
13 import inteli.internals;
14 
15 nothrow @nogc:
16 
17 
18 // SSE2 instructions
19 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#techs=SSE2
20 
/// Add packed 16-bit integers in `a` and `b`.
__m128i _mm_add_epi16 (__m128i a, __m128i b) pure @safe
{
    pragma(inline, true);
    short8 sa = cast(short8)a;
    short8 sb = cast(short8)b;
    return cast(__m128i)(sa + sb);
}
unittest
{
    __m128i v = _mm_setr_epi16(4, 8, 13, -7, -1, 0, 9, 77);
    short8 doubled = cast(short8) _mm_add_epi16(v, v);
    short[8] expected = [8, 16, 26, -14, -2, 0, 18, 154];
    assert(doubled.array == expected);
}
34 
/// Add packed 32-bit integers in `a` and `b`.
__m128i _mm_add_epi32 (__m128i a, __m128i b) pure @safe
{
    pragma(inline, true);
    int4 sa = cast(int4)a;
    int4 sb = cast(int4)b;
    return cast(__m128i)(sa + sb);
}
unittest
{
    __m128i v = _mm_setr_epi32(-7, -1, 0, 9);
    int4 doubled = _mm_add_epi32(v, v);
    int[4] expected = [-14, -2, 0, 18];
    assert(doubled.array == expected);
}
48 
/// Add packed 64-bit integers in `a` and `b`.
__m128i _mm_add_epi64 (__m128i a, __m128i b) pure @safe
{
    pragma(inline, true);
    long2 la = cast(long2)a;
    long2 lb = cast(long2)b;
    return cast(__m128i)(la + lb);
}
unittest
{
    // lane 1 wraps around: 0x8000... + 0x8000... == 0 (mod 2^64)
    __m128i v = _mm_setr_epi64(-1, 0x8000_0000_0000_0000);
    long2 doubled = cast(long2) _mm_add_epi64(v, v);
    long[2] expected = [-2, 0];
    assert(doubled.array == expected);
}
62 
/// Add packed 8-bit integers in `a` and `b`.
__m128i _mm_add_epi8 (__m128i a, __m128i b) pure @safe
{
    pragma(inline, true);
    byte16 ba = cast(byte16)a;
    byte16 bb = cast(byte16)b;
    return cast(__m128i)(ba + bb);
}
unittest
{
    // 77+77 and 78+78 wrap around in 8-bit arithmetic
    __m128i v = _mm_setr_epi8(4, 8, 13, -7, -1, 0, 9, 77, 4, 8, 13, -7, -1, 0, 9, 78);
    byte16 doubled = cast(byte16) _mm_add_epi8(v, v);
    byte[16] expected = [8, 16, 26, -14, -2, 0, 18, -102, 8, 16, 26, -14, -2, 0, 18, -100];
    assert(doubled.array == expected);
}
76 
/// Add the lower double-precision (64-bit) floating-point element 
/// in `a` and `b`, store the result in the lower element of dst, 
/// and copy the upper element from `a` to the upper element of destination. 
__m128d _mm_add_sd(__m128d a, __m128d b) pure @safe
{
    static if (DMD_with_DSIMD)
    {
        // DMD with D_SIMD: emit ADDSD directly.
        return cast(__m128d) __simd(XMM.ADDSD, a, b);
    }
    else static if (GDC_with_SSE2)
    {
        return __builtin_ia32_addsd(a, b);
    }
    else version(DigitalMars)
    {
        // Work-around for https://issues.dlang.org/show_bug.cgi?id=19599
        // Note that this is unneeded since DMD >= 2.094.0 at least, haven't investigated again
        asm pure nothrow @nogc @trusted { nop;}
        a[0] = a[0] + b[0];
        return a;
    }
    else
    {
        // Generic fallback: scalar add on lane 0; upper lane of `a` is kept.
        a[0] += b[0];
        return a;
    }
}
unittest
{
    __m128d a = [1.5, -2.0];
    a = _mm_add_sd(a, a);
    assert(a.array == [3.0, -2.0]);
}
110 
/// Add packed double-precision (64-bit) floating-point elements in `a` and `b`.
__m128d _mm_add_pd (__m128d a, __m128d b) pure @safe
{
    pragma(inline, true);
    __m128d sum = a + b;
    return sum;
}
unittest
{
    __m128d v = [1.5, -2.0];
    v = _mm_add_pd(v, v);
    assert(v.array == [3.0, -4.0]);
}
123 
/// Add 64-bit integers `a` and `b`.
__m64 _mm_add_si64 (__m64 a, __m64 b) pure @safe
{
    // MMX-width operand: a single 64-bit lane, added with wrap-around.
    // PERF DMD
    pragma(inline, true);
    return a + b;
}
131 
/// Add packed 16-bit integers in `a` and `b` using signed saturation.
__m128i _mm_adds_epi16(__m128i a, __m128i b) pure @trusted
{
    static if (DMD_with_DSIMD)
    {
        return cast(__m128i) __simd(XMM.PADDSW, a, b);
    }
    else static if (GDC_with_SSE2)
    {
        return cast(__m128i) __builtin_ia32_paddsw128(cast(short8)a, cast(short8)b);
    }
    else static if(LDC_with_saturated_intrinsics)
    {
        return cast(__m128i) inteli_llvm_adds!short8(cast(short8)a, cast(short8)b);
    }
    else
    {
        // Scalar fallback: add in 32-bit then clamp each lane to [-32768, 32767].
        short[8] res; // PERF =void;
        short8 sa = cast(short8)a;
        short8 sb = cast(short8)b;
        foreach(i; 0..8)
            res[i] = saturateSignedIntToSignedShort(sa.array[i] + sb.array[i]);
        return _mm_loadu_si128(cast(int4*)res.ptr);
    }
}
unittest
{
    short8 res = cast(short8) _mm_adds_epi16(_mm_setr_epi16( 7,  6,  5, -32768, 3, 3, 32767,   0),
                                             _mm_setr_epi16( 7,  6,  5, -30000, 3, 1,     1, -10));
    static immutable short[8] correctResult             =  [14, 12, 10, -32768, 6, 4, 32767, -10];
    assert(res.array == correctResult);
}
164 
/// Add packed 8-bit signed integers in `a` and `b` using signed saturation.
__m128i _mm_adds_epi8(__m128i a, __m128i b) pure @trusted
{
    static if (DMD_with_DSIMD)
    {
        return cast(__m128i) __simd(XMM.PADDSB, a, b);
    }
    else static if (GDC_with_SSE2)
    {
        // Note: the GDC builtin's signature takes ubyte16 even for the signed op.
        return cast(__m128i) __builtin_ia32_paddsb128(cast(ubyte16)a, cast(ubyte16)b);
    }
    else static if(LDC_with_saturated_intrinsics)
    {
        return cast(__m128i) inteli_llvm_adds!byte16(cast(byte16)a, cast(byte16)b);
    }
    else
    {
        // Scalar fallback: widen, add, then clamp each lane to [-128, 127].
        byte[16] res; // PERF =void;
        byte16 sa = cast(byte16)a;
        byte16 sb = cast(byte16)b;
        foreach(i; 0..16)
            res[i] = saturateSignedWordToSignedByte(sa[i] + sb[i]);
        return _mm_loadu_si128(cast(int4*)res.ptr);
    }
}
unittest
{
    byte16 res = cast(byte16) _mm_adds_epi8(_mm_set_epi8(15, 14, 13, 12, 11, 127, 9, 8, 7, 6, 5, -128, 3, 2, 1, 0),
                                            _mm_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, -4, 3, 2, 1, 0));
    static immutable byte[16] correctResult = [0, 2, 4, 6, -128, 10, 12, 14,
                                               16, 18, 127, 22, 24, 26, 28, 30];
    assert(res.array == correctResult);
}
198 
/// Add packed 8-bit unsigned integers in `a` and `b` using unsigned saturation.
__m128i _mm_adds_epu8(__m128i a, __m128i b) pure @trusted
{
    static if (DMD_with_DSIMD)
    {
        return cast(__m128i) __simd(XMM.PADDUSB, a, b);
    }
    else static if (GDC_with_SSE2)
    {
        return cast(__m128i) __builtin_ia32_paddusb128(cast(ubyte16)a, cast(ubyte16)b);
    }
    else static if(LDC_with_saturated_intrinsics)
    {
        return cast(__m128i) inteli_llvm_addus!byte16(cast(byte16)a, cast(byte16)b);
    }
    else
    {
        // Scalar fallback: unsigned add per lane, clamped to [0, 255].
        ubyte[16] res; // PERF =void;
        byte16 sa = cast(byte16)a;
        byte16 sb = cast(byte16)b;
        foreach(i; 0..16)
            res[i] = saturateSignedWordToUnsignedByte(cast(ubyte)(sa.array[i]) + cast(ubyte)(sb.array[i]));
        return _mm_loadu_si128(cast(int4*)res.ptr);
    }
}
unittest
{
    byte16 res = cast(byte16) 
        _mm_adds_epu8(_mm_set_epi8(7, 6, 5, 4, 3, 2, cast(byte)255, 0, 7, 6, 5, 4, 3, 2, cast(byte)255, 0),
                      _mm_set_epi8(7, 6, 5, 4, 3, 2, 1, 0, 7, 6, 5, 4, 3, 2, 1, 0));
    static immutable byte[16] correctResult = [0, cast(byte)255, 4, 6, 8, 10, 12, 14, 
                                               0, cast(byte)255, 4, 6, 8, 10, 12, 14];
    assert(res.array == correctResult);
}
233 
/// Add packed unsigned 16-bit integers in `a` and `b` using unsigned saturation.
__m128i _mm_adds_epu16(__m128i a, __m128i b) pure @trusted
{
    static if (DMD_with_DSIMD)
    {
        // Note: DMD generates a reverted paddusw vs LDC and GDC, but that doesn't change the result anyway
        return cast(__m128i) __simd(XMM.PADDUSW, a, b);
    }
    else static if (GDC_with_SSE2)
    {
        return cast(__m128i) __builtin_ia32_paddusw128(cast(short8)a, cast(short8)b);
    }
    else static if(LDC_with_saturated_intrinsics)
    {
        return cast(__m128i) inteli_llvm_addus!short8(cast(short8)a, cast(short8)b);
    }
    else
    {
        // Scalar fallback: unsigned add per lane, clamped to [0, 65535].
        ushort[8] res; // PERF =void;
        short8 sa = cast(short8)a;
        short8 sb = cast(short8)b;
        foreach(i; 0..8)
            res[i] = saturateSignedIntToUnsignedShort(cast(ushort)(sa.array[i]) + cast(ushort)(sb.array[i]));
        return _mm_loadu_si128(cast(int4*)res.ptr);
    }
}
unittest
{
    short8 res = cast(short8) _mm_adds_epu16(_mm_set_epi16(3, 2, cast(short)65535, 0, 3, 2, cast(short)65535, 0),
                                             _mm_set_epi16(3, 2, 1, 0, 3, 2, 1, 0));
    static immutable short[8] correctResult = [0, cast(short)65535, 4, 6, 0, cast(short)65535, 4, 6];
    assert(res.array == correctResult);
}
267 
/// Compute the bitwise AND of packed double-precision (64-bit) 
/// floating-point elements in `a` and `b`.
__m128d _mm_and_pd (__m128d a, __m128d b) pure @safe
{
    pragma(inline, true);
    long2 la = cast(long2)a;
    long2 lb = cast(long2)b;
    return cast(__m128d)(la & lb);
}
unittest
{
    double x = 4.32;
    double y = -78.99;
    // AND of the two bit patterns, regardless of operand order.
    long expected = (*cast(long*)(&x)) & (*cast(long*)(&y));
    __m128d P = _mm_set_pd(x, y);
    __m128d Q = _mm_set_pd(y, x);
    long2 res = cast(long2)( _mm_and_pd(P, Q) );
    assert(res.array[0] == expected);
    assert(res.array[1] == expected);
}
286 
/// Compute the bitwise AND of 128 bits (representing integer data) in `a` and `b`.
__m128i _mm_and_si128 (__m128i a, __m128i b) pure @safe
{
    pragma(inline, true);
    __m128i r = a & b;
    return r;
}
unittest
{
    __m128i x = _mm_set1_epi32(7);
    __m128i y = _mm_set1_epi32(14);
    int[4] expected = [6, 6, 6, 6]; // 7 & 14 == 6
    __m128i res = _mm_and_si128(x, y);
    assert(res.array == expected);
}
301 
/// Compute the bitwise NOT of packed double-precision (64-bit) 
/// floating-point elements in `a` and then AND with `b`.
__m128d _mm_andnot_pd (__m128d a, __m128d b) pure @safe
{
    static if (DMD_with_DSIMD)
    {
        return cast(__m128d) __simd(XMM.ANDNPD, a, b);
    }
    else
    {
        // Like ANDNPD, the complement applies to the FIRST operand: (~a) & b.
        return cast(__m128d)( ~(cast(long2)a) & cast(long2)b);
    }
}
unittest
{
    double a = 4.32;
    double b = -78.99;
    long correct  = (~*cast(long*)(&a)) & ( *cast(long*)(&b));
    long correct2 = ( *cast(long*)(&a)) & (~*cast(long*)(&b));
    __m128d A = _mm_setr_pd(a, b);
    __m128d B = _mm_setr_pd(b, a);
    long2 R = cast(long2)( _mm_andnot_pd(A, B) );
    assert(R.array[0] == correct);
    assert(R.array[1] == correct2);
}
327 
/// Compute the bitwise NOT of 128 bits (representing integer data) 
/// in `a` and then AND with `b`.
__m128i _mm_andnot_si128 (__m128i a, __m128i b) pure @safe
{
    static if (DMD_with_DSIMD)
    {
        return cast(__m128i) __simd(XMM.PANDN, a, b);
    }
    else
    {
        // Like PANDN, the complement applies to the FIRST operand: (~a) & b.
        return (~a) & b;
    }
}
unittest
{
    __m128i A = _mm_setr_epi32(7, -2, 9, 54654);
    __m128i B = _mm_setr_epi32(14, 78, 111, -256);
    __m128i R = _mm_andnot_si128(A, B);
    int[4] correct = [8, 0, 102, -54784];
    assert(R.array == correct);
}
349 
/// Average packed unsigned 16-bit integers in `a` and `b`.
/// The average is rounded up: (a + b + 1) >> 1, like PAVGW.
__m128i _mm_avg_epu16 (__m128i a, __m128i b) pure @trusted
{
    static if (DMD_with_DSIMD)
    {
        return cast(__m128i) __simd(XMM.PAVGW, a, b);
    }
    else static if (GDC_with_SSE2)
    {
        return cast(__m128i) __builtin_ia32_pavgw128(cast(short8)a, cast(short8)b);
    }
    else static if (LDC_with_ARM64)
    {
        // ARM64: URHADD performs the same rounded-halving add.
        return cast(__m128i) vrhadd_u16(cast(short8)a, cast(short8)b);
    }
    else static if (LDC_with_SSE2 && __VERSION__ >= 2094)
    {
        // Exists since LDC 1.18
        return cast(__m128i) __builtin_ia32_pavgw128(cast(short8)a, cast(short8)b);
    }
    else static if (LDC_with_optimizations)
    {
        // Generates pavgw even in LDC 1.0, even in -O0
        // But not in ARM
        // IR: widen to 32-bit, add, add 1, shift right, truncate back to 16-bit.
        enum ir = `
            %ia = zext <8 x i16> %0 to <8 x i32>
            %ib = zext <8 x i16> %1 to <8 x i32>
            %isum = add <8 x i32> %ia, %ib
            %isum1 = add <8 x i32> %isum, < i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
            %isums = lshr <8 x i32> %isum1, < i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
            %r = trunc <8 x i32> %isums to <8 x i16>
            ret <8 x i16> %r`;
        return cast(__m128i) LDCInlineIR!(ir, short8, short8, short8)(cast(short8)a, cast(short8)b);
    }
    else
    {
        // Scalar fallback: rounded average per lane, in unsigned arithmetic.
        short8 sa = cast(short8)a;
        short8 sb = cast(short8)b;
        short8 sr = void;
        foreach(i; 0..8)
        {
            sr.ptr[i] = cast(ushort)( (cast(ushort)(sa.array[i]) + cast(ushort)(sb.array[i]) + 1) >> 1 );
        }
        return cast(int4)sr;
    }
}
unittest
{
    __m128i A = _mm_set1_epi16(31);
    __m128i B = _mm_set1_epi16(64);
    short8 avg = cast(short8)(_mm_avg_epu16(A, B));
    foreach(i; 0..8)
        assert(avg.array[i] == 48);
}
404 
/// Average packed unsigned 8-bit integers in `a` and `b`.
/// The average is rounded up: (a + b + 1) >> 1, like PAVGB.
__m128i _mm_avg_epu8 (__m128i a, __m128i b) pure @trusted
{
    static if (DMD_with_DSIMD)
    {
        return cast(__m128i) __simd(XMM.PAVGB, a, b);
    }
    else static if (GDC_with_SSE2)
    {
        return cast(__m128i) __builtin_ia32_pavgb128(cast(ubyte16)a, cast(ubyte16)b);
    }
    else static if (LDC_with_SSE2 && __VERSION__ >= 2094)
    {
        // Exists since LDC 1.18
        return cast(__m128i) __builtin_ia32_pavgb128(cast(byte16)a, cast(byte16)b);
    }
    else static if (LDC_with_ARM64)
    {
        // ARM64: URHADD performs the same rounded-halving add.
        return cast(__m128i) vrhadd_u8(cast(byte16)a, cast(byte16)b);
    }
    else static if (LDC_with_optimizations)
    {
        // Generates pavgb even in LDC 1.0, even in -O0
        // But not in ARM
        // IR: widen to 16-bit, add, add 1, shift right, truncate back to 8-bit.
        enum ir = `
            %ia = zext <16 x i8> %0 to <16 x i16>
            %ib = zext <16 x i8> %1 to <16 x i16>
            %isum = add <16 x i16> %ia, %ib
            %isum1 = add <16 x i16> %isum, < i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
            %isums = lshr <16 x i16> %isum1, < i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
            %r = trunc <16 x i16> %isums to <16 x i8>
            ret <16 x i8> %r`;
        return cast(__m128i) LDCInlineIR!(ir, byte16, byte16, byte16)(cast(byte16)a, cast(byte16)b);
    }
    else
    {
        // Scalar fallback: rounded average per lane, in unsigned arithmetic.
        byte16 sa = cast(byte16)a;
        byte16 sb = cast(byte16)b;
        byte16 sr = void;
        foreach(i; 0..16)
        {
            sr.ptr[i] = cast(ubyte)( (cast(ubyte)(sa.array[i]) + cast(ubyte)(sb.array[i]) + 1) >> 1 );
        }
        return cast(int4)sr;
    }
}
unittest
{
    __m128i A = _mm_set1_epi8(31);
    __m128i B = _mm_set1_epi8(64);
    byte16 avg = cast(byte16)(_mm_avg_epu8(A, B));
    foreach(i; 0..16)
        assert(avg.array[i] == 48);
}
459 
/// Shift `a` left by `bytes` bytes while shifting in zeros.
/// `bytes` is a compile-time (template) argument, e.g. `_mm_bslli_si128!5(v)`.
alias _mm_bslli_si128 = _mm_slli_si128;
unittest
{
    __m128i toShift = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
    byte[16] exact =               [0, 0, 0, 0, 0, 0, 1, 2, 3, 4,  5,  6,  7,  8,  9, 10];
    __m128i result = _mm_bslli_si128!5(toShift);
    assert( (cast(byte16)result).array == exact);
}
469 
/// Shift `v` right by `bytes` bytes while shifting in zeros.
/// `bytes` is a compile-time (template) argument, e.g. `_mm_bsrli_si128!5(v)`.
alias _mm_bsrli_si128 = _mm_srli_si128;
unittest
{
    __m128i toShift = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
    byte[16] exact =               [5, 6, 7, 8, 9,10,11,12,13,14, 15,  0,  0,  0,  0,  0];
    __m128i result = _mm_bsrli_si128!5(toShift);
    assert( (cast(byte16)result).array == exact);
}
479 
/// Cast vector of type `__m128d` to type `__m128`. 
/// This reinterprets the 128 bits as-is (no value conversion).
/// Note: Also possible with a regular `cast(__m128)(a)`.
__m128 _mm_castpd_ps (__m128d a) pure @safe
{
    return cast(__m128)a;
}
486 
/// Cast vector of type `__m128d` to type `__m128i`. 
/// This reinterprets the 128 bits as-is (no value conversion).
/// Note: Also possible with a regular `cast(__m128i)(a)`.
__m128i _mm_castpd_si128 (__m128d a) pure @safe
{
    return cast(__m128i)a;
}
493 
/// Cast vector of type `__m128` to type `__m128d`. 
/// This reinterprets the 128 bits as-is (no value conversion).
/// Note: Also possible with a regular `cast(__m128d)(a)`.
__m128d _mm_castps_pd (__m128 a) pure @safe
{
    return cast(__m128d)a;
}
500 
/// Cast vector of type `__m128` to type `__m128i`. 
/// This reinterprets the 128 bits as-is (no value conversion).
/// Note: Also possible with a regular `cast(__m128i)(a)`.
__m128i _mm_castps_si128 (__m128 a) pure @safe
{
    return cast(__m128i)a;
}
507 
/// Cast vector of type `__m128i` to type `__m128d`. 
/// This reinterprets the 128 bits as-is (no value conversion).
/// Note: Also possible with a regular `cast(__m128d)(a)`.
__m128d _mm_castsi128_pd (__m128i a) pure @safe
{
    return cast(__m128d)a;
}
514 
/// Cast vector of type `__m128i` to type `__m128`. 
/// This reinterprets the 128 bits as-is (no value conversion).
/// Note: Also possible with a regular `cast(__m128)(a)`.
__m128 _mm_castsi128_ps (__m128i a) pure @safe
{
    return cast(__m128)a;
}
521 
/// Invalidate and flush the cache line that contains `p` 
/// from all levels of the cache hierarchy.
void _mm_clflush (const(void)* p) @trusted
{
    static if (GDC_with_SSE2)
    {
        __builtin_ia32_clflush(p);
    }
    else static if (LDC_with_SSE2)
    {
        // LDC's builtin takes a mutable pointer, hence the cast.
        __builtin_ia32_clflush(cast(void*)p);
    }
    else version(D_InlineAsm_X86)
    {
        asm pure nothrow @nogc @trusted
        {
            mov EAX, p;
            clflush [EAX];
        }
    }
    else version(D_InlineAsm_X86_64)
    {
        asm pure nothrow @nogc @trusted
        {
            mov RAX, p;
            clflush [RAX];
        }
    }
    else 
    {
        // Do nothing. Invalidating cacheline does
        // not affect correctness.
    }
}
unittest
{
    ubyte[64] cacheline;
    _mm_clflush(cacheline.ptr);
}
561 
/// Compare packed 16-bit integers in `a` and `b` for equality.
/// Equal lanes yield -1 (all bits set), others 0.
__m128i _mm_cmpeq_epi16 (__m128i a, __m128i b) pure @safe
{
    static if (SIMD_COMPARISON_MASKS_16B)
    {
        // Compiler supports vector == directly producing a lane mask.
        return cast(__m128i)(cast(short8)a == cast(short8)b);
    }
    else static if (GDC_with_SSE2)
    {
        return cast(__m128i) __builtin_ia32_pcmpeqw128(cast(short8)a, cast(short8)b);
    }
    else
    {
        return cast(__m128i) equalMask!short8(cast(short8)a, cast(short8)b);
    }
}
unittest
{
    short8   A = [-3, -2, -1,  0,  0,  1,  2,  3];
    short8   B = [ 4,  3,  2,  1,  0, -1, -2, -3];
    short[8] E = [ 0,  0,  0,  0, -1,  0,  0,  0];
    short8   R = cast(short8)(_mm_cmpeq_epi16(cast(__m128i)A, cast(__m128i)B));
    assert(R.array == E);
}
586 
/// Compare packed 32-bit integers in `a` and `b` for equality.
/// Equal lanes yield -1 (all bits set), others 0.
__m128i _mm_cmpeq_epi32 (__m128i a, __m128i b) pure @safe
{
    static if (SIMD_COMPARISON_MASKS_16B)
    {
        return cast(__m128i)(cast(int4)a == cast(int4)b);
    }
    else static if (GDC_with_SSE2)
    {
        return __builtin_ia32_pcmpeqd128(a, b);
    }
    else
    {
        return equalMask!__m128i(a, b);
    }
}
unittest
{
    int4   A = [-3, -2, -1,  0];
    int4   B = [ 4, -2,  2,  0];
    int[4] E = [ 0, -1,  0, -1];
    int4   R = cast(int4)(_mm_cmpeq_epi32(A, B));
    assert(R.array == E);
}
611 
/// Compare packed 8-bit integers in `a` and `b` for equality.
/// Equal lanes yield -1 (all bits set), others 0.
__m128i _mm_cmpeq_epi8 (__m128i a, __m128i b) pure @safe
{
    static if (SIMD_COMPARISON_MASKS_16B)
    {
        return cast(__m128i)(cast(byte16)a == cast(byte16)b);
    }
    else static if (GDC_with_SSE2)
    {
        return cast(__m128i) __builtin_ia32_pcmpeqb128(cast(ubyte16)a, cast(ubyte16)b);
    }
    else
    {
        return cast(__m128i) equalMask!byte16(cast(byte16)a, cast(byte16)b);
    }
}
unittest
{
    __m128i A = _mm_setr_epi8(1, 2, 3, 1, 2, 1, 1, 2, 3, 2, 1, 0, 0, 1, 2, 1);
    __m128i B = _mm_setr_epi8(2, 2, 1, 2, 3, 1, 2, 3, 2, 1, 0, 0, 1, 2, 1, 1);
    byte16 C = cast(byte16) _mm_cmpeq_epi8(A, B);
    byte[16] correct =       [0,-1, 0, 0, 0,-1, 0, 0, 0, 0, 0,-1, 0, 0, 0, -1];
    assert(C.array == correct);
}
636 
/// Compare packed double-precision (64-bit) floating-point elements 
/// in `a` and `b` for equality.
/// Ordered comparison: lanes involving NaN yield 0 (see unittest).
__m128d _mm_cmpeq_pd (__m128d a, __m128d b) pure @safe
{
    static if (SIMD_COMPARISON_MASKS_16B)
    {
        return cast(double2)(cast(double2)a == cast(double2)b);
    }
    else static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmpeqpd(a, b);
    }
    else
    {
        return cast(__m128d) cmppd!(FPComparison.oeq)(a, b);
    }
}
unittest
{
    double2 A = _mm_setr_pd(1.0, 2.0);
    double2 B = _mm_setr_pd(0.0, 2.0);
    double2 N = _mm_setr_pd(double.nan, double.nan);
    long2 C = cast(long2) _mm_cmpeq_pd(A, B);
    long[2] correctC = [0, -1];
    assert(C.array == correctC);
    long2 D = cast(long2) _mm_cmpeq_pd(N, N);
    long[2] correctD = [0, 0];
    assert(D.array == correctD);
}
666 
/// Compare the lower double-precision (64-bit) floating-point elements
/// in `a` and `b` for equality, store the result in the lower element,
/// and copy the upper element from `a`.
__m128d _mm_cmpeq_sd (__m128d a, __m128d b) pure @safe
{
    static if (DMD_with_DSIMD)
    {
        // CMPSD with imm8 = 0 selects the EQ predicate.
        return cast(__m128d) __simd(XMM.CMPSD, a, b, 0);
    }
    else static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmpeqsd(a, b);
    }
    else
    {
        return cast(__m128d) cmpsd!(FPComparison.oeq)(a, b);
    }
}
unittest
{
    double2 A = _mm_setr_pd(0.0, 2.0);
    double2 B = _mm_setr_pd(1.0, 2.0);
    double2 C = _mm_setr_pd(1.0, 3.0);
    double2 D = cast(double2) _mm_cmpeq_sd(A, B);
    long2 E = cast(long2) _mm_cmpeq_sd(B, C);
    double[2] correctD = [0.0, 2.0];
    double two = 2.0;
    long[2] correctE = [-1, *cast(long*)&two];
    assert(D.array == correctD);
    assert(E.array == correctE);
}
698 
/// Compare packed 16-bit integers in `a` and `b` for greater-than-or-equal.
/// #BONUS
__m128i _mm_cmpge_epi16 (__m128i a, __m128i b) pure @safe
{
    static if (SIMD_COMPARISON_MASKS_16B)
    {
        return cast(__m128i)(cast(short8)a >= cast(short8)b);
    }
    else version (LDC)
    {
        // LDC ARM64: generates cmge since -O1
        return cast(__m128i) greaterOrEqualMask!short8(cast(short8)a, cast(short8)b);
    }
    else
    {
        // (a >= b) == (a == b) | (a > b); the two masks are disjoint, so XOR acts as OR.
        return _mm_xor_si128(_mm_cmpeq_epi16(a, b), _mm_cmpgt_epi16(a, b));
    }
}
unittest
{
    short8   A = [-3, -2, -32768,  0,  0,  1,  2,  3];
    short8   B = [ 4,  3,  32767,  1,  0, -1, -2, -3];
    short[8] E = [ 0,  0,      0,  0,  -1, -1, -1, -1];
    short8   R = cast(short8)(_mm_cmpge_epi16(cast(__m128i)A, cast(__m128i)B));
    assert(R.array == E);
}
725 
/// Compare packed double-precision (64-bit) floating-point elements 
/// in `a` and `b` for greater-than-or-equal.
__m128d _mm_cmpge_pd (__m128d a, __m128d b) pure @safe
{
    static if (SIMD_COMPARISON_MASKS_16B)
    {
        return cast(__m128d)(a >= b);
    }
    else static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmpgepd(a, b);
    }
    else
    {
        // Ordered GE: lanes involving NaN yield 0.
        return cast(__m128d) cmppd!(FPComparison.oge)(a, b);
    }
}
743 
/// Compare the lower double-precision (64-bit) floating-point elements 
/// in `a` and `b` for greater-than-or-equal, store the result in the 
/// lower element, and copy the upper element from `a`.
__m128d _mm_cmpge_sd (__m128d a, __m128d b) pure @safe
{
    // No GE predicate exists for CMPSD; use LE with swapped operands:
    // (b <= a) equals (a >= b) for the lower lane.
    static if (DMD_with_DSIMD)
    {
        // NOTE(review): with operands swapped, the upper lane of the result
        // appears to come from `b` rather than `a` as documented; the unittest
        // below only checks lane 0 — confirm upper-lane semantics.
        return cast(__m128d) __simd(XMM.CMPSD, b, a, 2);
    }
    else static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmplesd(b, a);
    }
    else
    {
        return cast(__m128d) cmpsd!(FPComparison.oge)(a, b);
    }
}
unittest
{
    __m128d A = _mm_setr_pd(1.0, 0.0);
    __m128d B = _mm_setr_pd(double.nan, 0.0);
    __m128d C = _mm_setr_pd(2.0, 0.0);
    assert( (cast(long2)_mm_cmpge_sd(A, A)).array[0] == -1);
    assert( (cast(long2)_mm_cmpge_sd(A, B)).array[0] ==  0);
    assert( (cast(long2)_mm_cmpge_sd(A, C)).array[0] ==  0);
    assert( (cast(long2)_mm_cmpge_sd(B, A)).array[0] ==  0);
    assert( (cast(long2)_mm_cmpge_sd(B, B)).array[0] ==  0);
    assert( (cast(long2)_mm_cmpge_sd(B, C)).array[0] ==  0);
    assert( (cast(long2)_mm_cmpge_sd(C, A)).array[0] == -1);
    assert( (cast(long2)_mm_cmpge_sd(C, B)).array[0] ==  0);
    assert( (cast(long2)_mm_cmpge_sd(C, C)).array[0] == -1);
}
777 
/// Compare packed 16-bit integers in `a` and `b` for greater-than.
/// Lanes where `a > b` (signed) yield -1, others 0.
__m128i _mm_cmpgt_epi16 (__m128i a, __m128i b) pure @safe
{
    static if (SIMD_COMPARISON_MASKS_16B)
    {
        return cast(__m128i)(cast(short8)a > cast(short8)b);
    }
    else static if (GDC_with_SSE2)
    {
        return cast(__m128i) __builtin_ia32_pcmpgtw128(cast(short8)a, cast(short8)b);
    }
    else
    {
        return cast(__m128i) greaterMask!short8(cast(short8)a, cast(short8)b);
    }
}
unittest
{
    short8   A = [-3, -2, -1,  0,  0,  1,  2,  3];
    short8   B = [ 4,  3,  2,  1,  0, -1, -2, -3];
    short[8] E = [ 0,  0,  0,  0,  0, -1, -1, -1];
    short8   R = cast(short8)(_mm_cmpgt_epi16(cast(__m128i)A, cast(__m128i)B));
    assert(R.array == E);
}
802 
/// Compare packed 32-bit integers in `a` and `b` for greater-than.
/// Lanes where `a > b` (signed) yield -1, others 0.
__m128i _mm_cmpgt_epi32 (__m128i a, __m128i b) pure @safe
{
    static if (SIMD_COMPARISON_MASKS_16B)
    {
        return cast(__m128i)(cast(int4)a > cast(int4)b);
    }
    else static if (GDC_with_SSE2)
    {
        return __builtin_ia32_pcmpgtd128(a, b); 
    }
    else
    {
        return cast(__m128i)( greaterMask!int4(a, b));
    }
}
unittest
{
    int4   A = [-3,  2, -1,  0];
    int4   B = [ 4, -2,  2,  0];
    int[4] E = [ 0, -1,  0,  0];
    int4   R = cast(int4)(_mm_cmpgt_epi32(A, B));
    assert(R.array == E);
}
827 
/// Compare packed 8-bit integers in `a` and `b` for greater-than.
/// Lanes where `a > b` (signed) yield -1, others 0.
__m128i _mm_cmpgt_epi8 (__m128i a, __m128i b) pure @safe
{
    static if (SIMD_COMPARISON_MASKS_16B)
    {
        return cast(__m128i)(cast(byte16)a > cast(byte16)b);
    }
    else
    {
        // Note: __builtin_ia32_pcmpgtb128 is buggy on some old GDC, do not use
        return cast(__m128i) greaterMask!byte16(cast(byte16)a, cast(byte16)b);
    }
}
unittest
{
    __m128i A = _mm_setr_epi8(1, 2, 3, 1,  127, -80, 1, 2, 3, 2, 1, 0, 0, 1, 2, 1);
    __m128i B = _mm_setr_epi8(2, 2, 1, 2, -128, -42, 2, 3, 2, 1, 0, 0, 1, 2, 1, 1);
    byte16 C = cast(byte16) _mm_cmpgt_epi8(A, B);
    byte[16] correct =       [0, 0,-1, 0,   -1,   0, 0, 0,-1,-1,-1, 0, 0, 0,-1, 0];
    assert(C.array == correct);
}
850 
/// Compare packed double-precision (64-bit) floating-point elements 
/// in `a` and `b` for greater-than.
__m128d _mm_cmpgt_pd (__m128d a, __m128d b) pure @safe
{
    static if (SIMD_COMPARISON_MASKS_16B)
    {
        return cast(__m128d)(a > b);
    }
    else static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmpgtpd(a, b); 
    }
    else
    {
        // Ordered GT: lanes involving NaN yield 0.
        return cast(__m128d) cmppd!(FPComparison.ogt)(a, b);
    }
}
868 
/// Compare the lower double-precision (64-bit) floating-point elements 
/// in `a` and `b` for greater-than, store the result in the lower element,
/// and copy the upper element from `a`.
__m128d _mm_cmpgt_sd (__m128d a, __m128d b) pure @safe
{
    // No GT predicate exists for CMPSD; use LT with swapped operands:
    // (b < a) equals (a > b) for the lower lane.
    static if (DMD_with_DSIMD)
    {
        // NOTE(review): with operands swapped, the upper lane of the result
        // appears to come from `b` rather than `a` as documented; the unittest
        // below only checks lane 0 — confirm upper-lane semantics.
        return cast(__m128d) __simd(XMM.CMPSD, b, a, 1);
    }
    else static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmpltsd(b, a);
    }
    else
    {
        return cast(__m128d) cmpsd!(FPComparison.ogt)(a, b);
    }
}
unittest
{
    __m128d A = _mm_setr_pd(1.0, 0.0);
    __m128d B = _mm_setr_pd(double.nan, 0.0);
    __m128d C = _mm_setr_pd(2.0, 0.0);
    assert( (cast(long2)_mm_cmpgt_sd(A, A)).array[0] ==  0);
    assert( (cast(long2)_mm_cmpgt_sd(A, B)).array[0] ==  0);
    assert( (cast(long2)_mm_cmpgt_sd(A, C)).array[0] ==  0);
    assert( (cast(long2)_mm_cmpgt_sd(B, A)).array[0] ==  0);
    assert( (cast(long2)_mm_cmpgt_sd(B, B)).array[0] ==  0);
    assert( (cast(long2)_mm_cmpgt_sd(B, C)).array[0] ==  0);
    assert( (cast(long2)_mm_cmpgt_sd(C, A)).array[0] == -1);
    assert( (cast(long2)_mm_cmpgt_sd(C, B)).array[0] ==  0);
    assert( (cast(long2)_mm_cmpgt_sd(C, C)).array[0] ==  0);
}
902 
903 
/// Compare packed 16-bit integers in `a` and `b` for less-than-or-equal.
/// #BONUS
__m128i _mm_cmple_epi16 (__m128i a, __m128i b) pure @safe
{
    static if (SIMD_COMPARISON_MASKS_16B)
    {
        return cast(__m128i)(cast(short8)a <= cast(short8)b);
    }
    else version (LDC)
    {
        // LDC ARM64: generates cmge since -O1
        // a <= b  is  b >= a, hence the swapped operands.
        return cast(__m128i) greaterOrEqualMask!short8(cast(short8)b, cast(short8)a);
    }
    else
    {
        // (a <= b) == (b == a) | (b > a); the two masks are disjoint, so XOR acts as OR.
        return _mm_xor_si128(_mm_cmpeq_epi16(b, a), _mm_cmpgt_epi16(b, a));
    }
}
unittest
{
    short8   A = [-3, -2, -32768,  1,  0,  1,  2,  3];
    short8   B = [ 4,  3,  32767,  0,  0, -1, -2, -3];
    short[8] E = [-1, -1,     -1,  0,  -1, 0,  0,  0];
    short8   R = cast(short8)(_mm_cmple_epi16(cast(__m128i)A, cast(__m128i)B));
    assert(R.array == E);
}
930 
/// Compare packed double-precision (64-bit) floating-point elements 
/// in `a` and `b` for less-than-or-equal.
__m128d _mm_cmple_pd (__m128d a, __m128d b) pure @safe
{
    static if (SIMD_COMPARISON_MASKS_16B)
    {
        return cast(__m128d)(a <= b);
    }
    else static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmplepd(a, b); 
    }
    else
    {
        // Ordered LE: lanes involving NaN yield 0.
        return cast(__m128d) cmppd!(FPComparison.ole)(a, b);
    }
}
948 
/// Compare the lower double-precision (64-bit) floating-point elements 
/// in `a` and `b` for less-than-or-equal, store the result in the 
/// lower element, and copy the upper element from `a`.
__m128d _mm_cmple_sd (__m128d a, __m128d b) pure @safe
{
    static if (DMD_with_DSIMD)
    {
        // CMPSD immediate 2 selects the LE predicate.
        return cast(__m128d) __simd(XMM.CMPSD, a, b, 2);
    }
    else static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmplesd(a, b); 
    }
    else
    {
        // ole = ordered less-or-equal: NaN in lane 0 yields a zero mask.
        return cast(__m128d) cmpsd!(FPComparison.ole)(a, b);
    }
}
967 
/// Compare packed 16-bit integers in `a` and `b` for less-than.
__m128i _mm_cmplt_epi16 (__m128i a, __m128i b) pure @safe
{
    // a < b is exactly b > a; reuse the greater-than intrinsic.
    return _mm_cmpgt_epi16(b, a);
}
973 
/// Compare packed 32-bit integers in `a` and `b` for less-than.
__m128i _mm_cmplt_epi32 (__m128i a, __m128i b) pure @safe
{
    // a < b is exactly b > a; reuse the greater-than intrinsic.
    return _mm_cmpgt_epi32(b, a);
}
979 
/// Compare packed 8-bit integers in `a` and `b` for less-than.
__m128i _mm_cmplt_epi8 (__m128i a, __m128i b) pure @safe
{
    // a < b is exactly b > a; reuse the greater-than intrinsic.
    return _mm_cmpgt_epi8(b, a);
}
985 
/// Compare packed double-precision (64-bit) floating-point elements
/// in `a` and `b` for less-than.
__m128d _mm_cmplt_pd (__m128d a, __m128d b) pure @safe
{
    static if (SIMD_COMPARISON_MASKS_16B)
    {
        return cast(__m128d)(a < b);
    }
    else static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmpltpd(a, b); 
    }
    else
    {
        // olt = ordered less-than: a NaN operand yields a zero mask.
        return cast(__m128d) cmppd!(FPComparison.olt)(a, b);
    }
}
1003 
/// Compare the lower double-precision (64-bit) floating-point elements
/// in `a` and `b` for less-than, store the result in the lower 
/// element, and copy the upper element from `a`.
__m128d _mm_cmplt_sd (__m128d a, __m128d b) pure @safe
{
    static if (DMD_with_DSIMD)
    {
        // CMPSD immediate 1 selects the LT predicate.
        return cast(__m128d) __simd(XMM.CMPSD, a, b, 1);
    }
    else static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmpltsd(a, b); 
    }
    else
    {
        // olt = ordered less-than: NaN in lane 0 yields a zero mask.
        return cast(__m128d) cmpsd!(FPComparison.olt)(a, b);
    }
}
1022 
/// Compare packed double-precision (64-bit) floating-point elements
/// in `a` and `b` for not-equal.
__m128d _mm_cmpneq_pd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmpneqpd(a, b); 
    }
    else
    {
        // une = unordered-or-not-equal: a NaN operand yields an all-ones mask,
        // matching CMPNEQPD semantics.
        return cast(__m128d) cmppd!(FPComparison.une)(a, b);
    }
}
1036 
/// Compare the lower double-precision (64-bit) floating-point elements
/// in `a` and `b` for not-equal, store the result in the lower 
/// element, and copy the upper element from `a`.
__m128d _mm_cmpneq_sd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmpneqsd(a, b); 
    }
    else
    {
        // une = unordered-or-not-equal: NaN in lane 0 yields an all-ones mask.
        return cast(__m128d) cmpsd!(FPComparison.une)(a, b);
    }
}
1051 
/// Compare packed double-precision (64-bit) floating-point elements 
/// in `a` and `b` for not-greater-than-or-equal.
__m128d _mm_cmpnge_pd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmpngepd(a, b); 
    }
    else
    {
        // NOT(a >= b) is the unordered less-than predicate: NaN yields all-ones.
        return cast(__m128d) cmppd!(FPComparison.ult)(a, b);
    }
}
1065 
/// Compare the lower double-precision (64-bit) floating-point elements 
/// in `a` and `b` for not-greater-than-or-equal, store the result in 
/// the lower element, and copy the upper element from `a`.
__m128d _mm_cmpnge_sd (__m128d a, __m128d b) pure @safe
{
    // Note: There is no __builtin_ia32_cmpngesd builtin.
    static if (GDC_with_SSE2)
    {
        // NGE(a, b) == ULT(a, b) == NLE(b, a), so use cmpnlesd with swapped
        // operands (the same trick clang's emmintrin.h uses).
        // The previous code called __builtin_ia32_cmpltsd(b, a), i.e.
        // OLT(b, a) == OGT(a, b) — the opposite predicate of the generic
        // branch below, and false on NaN where NGE must be true.
        // Caveat (shared with _mm_cmpgt_sd above): with this builtin the
        // upper element of the result comes from `b` rather than `a`.
        return __builtin_ia32_cmpnlesd(b, a); 
    }
    else
    {
        // ult = unordered-or-less-than: NaN in lane 0 yields an all-ones mask.
        return cast(__m128d) cmpsd!(FPComparison.ult)(a, b);
    }
}
1081 
/// Compare packed double-precision (64-bit) floating-point elements 
/// in `a` and `b` for not-greater-than.
__m128d _mm_cmpngt_pd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmpngtpd(a, b);
    }
    else
    {
        // NOT(a > b) is the unordered less-or-equal predicate: NaN yields all-ones.
        return cast(__m128d) cmppd!(FPComparison.ule)(a, b);
    }
}
1095 
/// Compare the lower double-precision (64-bit) floating-point elements 
/// in `a` and `b` for not-greater-than, store the result in the 
/// lower element, and copy the upper element from `a`.
__m128d _mm_cmpngt_sd (__m128d a, __m128d b) pure @safe
{
    // Note: There is no __builtin_ia32_cmpngtsd builtin.
    static if (GDC_with_SSE2)
    {
        // NGT(a, b) == ULE(a, b) == NLT(b, a), so use cmpnltsd with swapped
        // operands (the same trick clang's emmintrin.h uses).
        // The previous code called __builtin_ia32_cmplesd(b, a), i.e.
        // OLE(b, a) == OGE(a, b) — the opposite predicate of the generic
        // branch below, and false on NaN where NGT must be true.
        // Caveat (shared with _mm_cmpgt_sd above): with this builtin the
        // upper element of the result comes from `b` rather than `a`.
        return __builtin_ia32_cmpnltsd(b, a);
    }
    else
    {
        // ule = unordered-or-less-or-equal: NaN in lane 0 yields an all-ones mask.
        return cast(__m128d) cmpsd!(FPComparison.ule)(a, b);
    }
}
1111 
/// Compare packed double-precision (64-bit) floating-point elements 
/// in `a` and `b` for not-less-than-or-equal.
__m128d _mm_cmpnle_pd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmpnlepd(a, b);
    }
    else
    {
        // NOT(a <= b) is the unordered greater-than predicate: NaN yields all-ones.
        return cast(__m128d) cmppd!(FPComparison.ugt)(a, b);
    }
}
1125 
/// Compare the lower double-precision (64-bit) floating-point elements 
/// in `a` and `b` for not-less-than-or-equal, store the result in the 
/// lower element, and copy the upper element from `a`.
__m128d _mm_cmpnle_sd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmpnlesd(a, b);
    }
    else
    {
        // ugt = unordered-or-greater-than: NaN in lane 0 yields an all-ones mask.
        return cast(__m128d) cmpsd!(FPComparison.ugt)(a, b);
    }
}
1140  
/// Compare packed double-precision (64-bit) floating-point elements 
/// in `a` and `b` for not-less-than.
__m128d _mm_cmpnlt_pd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmpnltpd(a, b);
    }
    else
    {
        // NOT(a < b) is the unordered greater-or-equal predicate: NaN yields all-ones.
        return cast(__m128d) cmppd!(FPComparison.uge)(a, b);
    }
}
1154 
/// Compare the lower double-precision (64-bit) floating-point elements 
/// in `a` and `b` for not-less-than, store the result in the lower 
/// element, and copy the upper element from `a`.
__m128d _mm_cmpnlt_sd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmpnltsd(a, b);
    }
    else
    {
        // uge = unordered-or-greater-or-equal: NaN in lane 0 yields an all-ones mask.
        return cast(__m128d) cmpsd!(FPComparison.uge)(a, b);
    }
}
1169 
/// Compare packed double-precision (64-bit) floating-point elements 
/// in `a` and `b` to see if neither is NaN.
__m128d _mm_cmpord_pd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmpordpd(a, b);
    }
    else
    {
        // ord: all-ones mask where both operands are non-NaN.
        return cast(__m128d) cmppd!(FPComparison.ord)(a, b);
    }
}
1183 
/// Compare the lower double-precision (64-bit) floating-point elements 
/// in `a` and `b` to see if neither is NaN, store the result in the 
/// lower element, and copy the upper element from `a` to the upper element.
__m128d _mm_cmpord_sd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmpordsd(a, b);
    }
    else
    {
        // ord: all-ones mask in lane 0 when both lower elements are non-NaN.
        return cast(__m128d) cmpsd!(FPComparison.ord)(a, b);
    }
}
1198 
/// Compare packed double-precision (64-bit) floating-point elements 
/// in `a` and `b` to see if either is NaN.
__m128d _mm_cmpunord_pd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmpunordpd(a, b);
    }
    else
    {
        // uno: all-ones mask where at least one operand is NaN.
        return cast(__m128d) cmppd!(FPComparison.uno)(a, b);
    }
}
1212 
/// Compare the lower double-precision (64-bit) floating-point elements 
/// in `a` and `b` to see if either is NaN, store the result in the lower 
/// element, and copy the upper element from `a` to the upper element.
__m128d _mm_cmpunord_sd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmpunordsd(a, b);
    }
    else
    {
        // uno: all-ones mask in lane 0 when either lower element is NaN.
        return cast(__m128d) cmpsd!(FPComparison.uno)(a, b);
    }
}
1227 
/// Compare the lower double-precision (64-bit) floating-point element 
/// in `a` and `b` for equality, and return the boolean result (0 or 1).
int _mm_comieq_sd (__m128d a, __m128d b) pure @safe
{
    // Note: NaN semantics of the _mm_comixx_sd intrinsics differ from the raw
    // comisd instruction, which reports true on an unordered comparison.
    //
    // C++ compilers actually disagree on this intrinsic: GCC follows the
    // instruction (true on unordered), while ICC, clang and MSVC follow the
    // Intel Intrinsics Guide (false on unordered). We side with the majority;
    // GCC appears buggy with NaNs here.
    double lowerA = a.array[0];
    double lowerB = b.array[0];
    return (lowerA == lowerB) ? 1 : 0;
}
unittest
{
    // Ordered equality: NaN operands give 0; signed zeroes compare equal.
    assert(1 == _mm_comieq_sd(_mm_set_sd(78.0), _mm_set_sd(78.0)));
    assert(0 == _mm_comieq_sd(_mm_set_sd(78.0), _mm_set_sd(-78.0)));
    assert(0 == _mm_comieq_sd(_mm_set_sd(78.0), _mm_set_sd(double.nan)));
    assert(0 == _mm_comieq_sd(_mm_set_sd(double.nan), _mm_set_sd(-4.22)));
    assert(1 == _mm_comieq_sd(_mm_set_sd(0.0), _mm_set_sd(-0.0)));
}
1249 
/// Compare the lower double-precision (64-bit) floating-point element 
/// in `a` and `b` for greater-than-or-equal, and return the boolean 
/// result (0 or 1).
int _mm_comige_sd (__m128d a, __m128d b) pure @safe
{
    // Any NaN operand makes `>=` false, so unordered compares return 0.
    double lowerA = a.array[0];
    double lowerB = b.array[0];
    return (lowerA >= lowerB) ? 1 : 0;
}
unittest
{
    // Ordered >= on lane 0: NaN operands give 0; -0.0 >= 0.0 holds.
    assert(1 == _mm_comige_sd(_mm_set_sd(78.0), _mm_set_sd(78.0)));
    assert(1 == _mm_comige_sd(_mm_set_sd(78.0), _mm_set_sd(-78.0)));
    assert(0 == _mm_comige_sd(_mm_set_sd(-78.0), _mm_set_sd(78.0)));
    assert(0 == _mm_comige_sd(_mm_set_sd(78.0), _mm_set_sd(double.nan)));
    assert(0 == _mm_comige_sd(_mm_set_sd(double.nan), _mm_set_sd(-4.22)));
    assert(1 == _mm_comige_sd(_mm_set_sd(-0.0), _mm_set_sd(0.0)));
}
1266 
/// Compare the lower double-precision (64-bit) floating-point element 
/// in `a` and `b` for greater-than, and return the boolean result (0 or 1).
int _mm_comigt_sd (__m128d a, __m128d b) pure @safe
{
    // Any NaN operand makes `>` false, so unordered compares return 0.
    double lowerA = a.array[0];
    double lowerB = b.array[0];
    return (lowerA > lowerB) ? 1 : 0;
}
unittest
{
    // Ordered > on lane 0: NaN operands give 0.
    assert(0 == _mm_comigt_sd(_mm_set_sd(78.0), _mm_set_sd(78.0)));
    assert(1 == _mm_comigt_sd(_mm_set_sd(78.0), _mm_set_sd(-78.0)));
    assert(0 == _mm_comigt_sd(_mm_set_sd(78.0), _mm_set_sd(double.nan)));
    assert(0 == _mm_comigt_sd(_mm_set_sd(double.nan), _mm_set_sd(-4.22)));
    assert(0 == _mm_comigt_sd(_mm_set_sd(0.0), _mm_set_sd(-0.0)));
}
1281 
/// Compare the lower double-precision (64-bit) floating-point element 
/// in `a` and `b` for less-than-or-equal.
int _mm_comile_sd (__m128d a, __m128d b) pure @safe
{
    // Any NaN operand makes `<=` false, so unordered compares return 0.
    double lowerA = a.array[0];
    double lowerB = b.array[0];
    return (lowerA <= lowerB) ? 1 : 0;
}
unittest
{
    // Ordered <= on lane 0: NaN operands give 0; 0.0 <= -0.0 holds.
    assert(1 == _mm_comile_sd(_mm_set_sd(78.0), _mm_set_sd(78.0)));
    assert(0 == _mm_comile_sd(_mm_set_sd(78.0), _mm_set_sd(-78.0)));
    assert(1 == _mm_comile_sd(_mm_set_sd(-78.0), _mm_set_sd(78.0)));
    assert(0 == _mm_comile_sd(_mm_set_sd(78.0), _mm_set_sd(double.nan)));
    assert(0 == _mm_comile_sd(_mm_set_sd(double.nan), _mm_set_sd(-4.22)));
    assert(1 == _mm_comile_sd(_mm_set_sd(0.0), _mm_set_sd(-0.0)));
}
1297 
/// Compare the lower double-precision (64-bit) floating-point element 
/// in `a` and `b` for less-than, and return the boolean result (0 or 1).
int _mm_comilt_sd (__m128d a, __m128d b) pure @safe
{
    // Any NaN operand makes `<` false, so unordered compares return 0.
    double lowerA = a.array[0];
    double lowerB = b.array[0];
    return (lowerA < lowerB) ? 1 : 0;
}
unittest
{
    // Ordered < on lane 0: NaN operands give 0; -0.0 < 0.0 does not hold.
    assert(0 == _mm_comilt_sd(_mm_set_sd(78.0), _mm_set_sd(78.0)));
    assert(0 == _mm_comilt_sd(_mm_set_sd(78.0), _mm_set_sd(-78.0)));
    assert(1 == _mm_comilt_sd(_mm_set_sd(-78.0), _mm_set_sd(78.0)));
    assert(0 == _mm_comilt_sd(_mm_set_sd(78.0), _mm_set_sd(double.nan)));
    assert(0 == _mm_comilt_sd(_mm_set_sd(double.nan), _mm_set_sd(-4.22)));
    assert(0 == _mm_comilt_sd(_mm_set_sd(-0.0), _mm_set_sd(0.0)));
}
1313 
/// Compare the lower double-precision (64-bit) floating-point element
/// in `a` and `b` for not-equal, and return the boolean result (0 or 1).
int _mm_comineq_sd (__m128d a, __m128d b) pure @safe
{
    // Any NaN operand makes `!=` true, so unordered compares return 1.
    double lowerA = a.array[0];
    double lowerB = b.array[0];
    return (lowerA != lowerB) ? 1 : 0;
}
unittest
{
    // Unordered-or-not-equal on lane 0: NaN operands give 1; 0.0 == -0.0.
    assert(0 == _mm_comineq_sd(_mm_set_sd(78.0), _mm_set_sd(78.0)));
    assert(1 == _mm_comineq_sd(_mm_set_sd(78.0), _mm_set_sd(-78.0)));
    assert(1 == _mm_comineq_sd(_mm_set_sd(78.0), _mm_set_sd(double.nan)));
    assert(1 == _mm_comineq_sd(_mm_set_sd(double.nan), _mm_set_sd(-4.22)));
    assert(0 == _mm_comineq_sd(_mm_set_sd(0.0), _mm_set_sd(-0.0)));
}
1328 
/// Convert packed 32-bit integers in `a` to packed double-precision (64-bit)
/// floating-point elements.
/// Only the two low 32-bit lanes of `a` are converted.
__m128d _mm_cvtepi32_pd (__m128i a) pure @trusted
{
    static if (LDC_with_optimizations)
    {
        // Generates cvtdq2pd since LDC 1.0, even without optimizations
        enum ir = `
            %v = shufflevector <4 x i32> %0,<4 x i32> %0, <2 x i32> <i32 0, i32 1>
            %r = sitofp <2 x i32> %v to <2 x double>
            ret <2 x double> %r`;
        return cast(__m128d) LDCInlineIR!(ir, __m128d, __m128i)(a);
    }
    else static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cvtdq2pd(a);
    }
    else
    {
        // Scalar fallback: widen lanes 0 and 1 (int -> double is exact).
        double2 r = void;
        r.ptr[0] = a.array[0];
        r.ptr[1] = a.array[1];
        return r;
    }
}
unittest
{
    __m128d A = _mm_cvtepi32_pd(_mm_set1_epi32(54));
    assert(A.array[0] == 54.0);
    assert(A.array[1] == 54.0);
}
1360 
/// Convert packed 32-bit integers in `a` to packed single-precision (32-bit) 
/// floating-point elements.
/// Note: int -> float conversion may round (floats have 24-bit mantissa).
__m128 _mm_cvtepi32_ps(__m128i a) pure @trusted
{
    static if (DMD_with_DSIMD)
    {
        return cast(__m128)__simd(XMM.CVTDQ2PS, cast(void16) a);
    }
    else static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cvtdq2ps(a);
    }
    else static if (LDC_with_optimizations)
    {
        // See #86 for why we had to resort to LLVM IR.
        // Plain code below was leading to catastrophic behaviour. 
        // x86: Generates cvtdq2ps since LDC 1.1.0 -O0
        // ARM: Generats scvtf.4s since LDC 1.8.0 -O0
        enum ir = `
            %r = sitofp <4 x i32> %0 to <4 x float>
            ret <4 x float> %r`;
        return cast(__m128) LDCInlineIR!(ir, float4, int4)(a);
    }
    else static if (LDC_with_x86_asm)
    {
        // Inline-asm fallback for LDC without optimizations on x86.
        __m128 r;
        asm pure nothrow @nogc @trusted
        {
            movdqu XMM0, a;
            cvtdq2ps XMM0, XMM0;
            movdqu r, XMM0;
        }
        return r;
    }
    else
    {
        // Scalar fallback: convert each lane individually.
        __m128 res; // PERF =void;
        res.ptr[0] = cast(float)a.array[0];
        res.ptr[1] = cast(float)a.array[1];
        res.ptr[2] = cast(float)a.array[2];
        res.ptr[3] = cast(float)a.array[3];
        return res;
    }
}
unittest
{
    __m128 a = _mm_cvtepi32_ps(_mm_setr_epi32(-1, 0, 1, 1000));
    assert(a.array == [-1.0f, 0.0f, 1.0f, 1000.0f]);
}
1410 
/// Convert packed double-precision (64-bit) floating-point elements 
/// in `a` to packed 32-bit integers.
/// The conversion honours the current rounding mode; results land in
/// lanes 0 and 1, the upper two lanes are zeroed.
/// Not `pure`: the result depends on MXCSR (or FPCR on ARM).
__m128i _mm_cvtpd_epi32 (__m128d a) @trusted
{
    // PERF ARM32
    static if (LDC_with_SSE2)
    {
        return __builtin_ia32_cvtpd2dq(a);
    }
    else static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cvtpd2dq(a);
    }
    else static if (LDC_with_ARM64)
    {
        // Get current rounding mode.
        uint fpscr = arm_get_fpcr();
        long2 i;
        // Pick the NEON convert that matches the active rounding mode.
        switch(fpscr & _MM_ROUND_MASK_ARM)
        {
            default:
            case _MM_ROUND_NEAREST_ARM:     i = vcvtnq_s64_f64(a); break;
            case _MM_ROUND_DOWN_ARM:        i = vcvtmq_s64_f64(a); break;
            case _MM_ROUND_UP_ARM:          i = vcvtpq_s64_f64(a); break;
            case _MM_ROUND_TOWARD_ZERO_ARM: i = vcvtzq_s64_f64(a); break;
        }
        int4 zero = 0;
        // Narrow the two 64-bit lanes into lanes 0-1 and zero lanes 2-3.
        return cast(__m128i) shufflevectorLDC!(int4, 0, 2, 4, 6)(cast(int4)i, zero); // PERF: this slow down build for nothing, test without shufflevector
    }
    else
    {
        // PERF ARM32
        __m128i r = _mm_setzero_si128();
        r.ptr[0] = convertDoubleToInt32UsingMXCSR(a.array[0]);
        r.ptr[1] = convertDoubleToInt32UsingMXCSR(a.array[1]);
        return r;
    }
}
unittest
{
    int4 A = _mm_cvtpd_epi32(_mm_set_pd(61.0, 55.0));
    assert(A.array[0] == 55 && A.array[1] == 61 && A.array[2] == 0 && A.array[3] == 0);
}
1454 
/// Convert packed double-precision (64-bit) floating-point elements in `v`
/// to packed 32-bit integers
/// (MMX result; honours the current rounding mode like `_mm_cvtpd_epi32`).
__m64 _mm_cvtpd_pi32 (__m128d v) @safe
{
    // Reuse the SSE2 conversion and keep only the low 64 bits.
    return to_m64(_mm_cvtpd_epi32(v));
}
unittest
{
    int2 A = cast(int2) _mm_cvtpd_pi32(_mm_set_pd(61.0, 55.0));
    assert(A.array[0] == 55 && A.array[1] == 61);
}
1466 
/// Convert packed double-precision (64-bit) floating-point elements 
/// in `a` to packed single-precision (32-bit) floating-point elements.
/// The two converted values land in lanes 0-1; lanes 2-3 are zeroed.
__m128 _mm_cvtpd_ps (__m128d a) pure @trusted
{
    static if (LDC_with_SSE2)
    {
        return __builtin_ia32_cvtpd2ps(a); // can't be done with IR unfortunately
    }
    else static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cvtpd2ps(a);
    }
    else
    { 
        // Scalar fallback: narrow each double and zero the upper lanes.
        __m128 r = void;
        r.ptr[0] = a.array[0];
        r.ptr[1] = a.array[1];
        r.ptr[2] = 0;
        r.ptr[3] = 0;
        return r;
    }
}
unittest
{
    __m128d A = _mm_set_pd(5.25, 4.0);
    __m128 B = _mm_cvtpd_ps(A);
    assert(B.array == [4.0f, 5.25f, 0, 0]);
}
1495 
/// Convert packed 32-bit integers in `v` to packed double-precision 
/// (64-bit) floating-point elements.
__m128d _mm_cvtpi32_pd (__m64 v) pure @safe
{
    // Widen the MMX value to 128 bits, then reuse the SSE2 conversion.
    return _mm_cvtepi32_pd(to_m128i(v));
}
unittest
{
    __m128d A = _mm_cvtpi32_pd(_mm_setr_pi32(4, -5));
    assert(A.array[0] == 4.0 && A.array[1] == -5.0);
}
1507 
/// Convert packed single-precision (32-bit) floating-point elements 
/// in `a` to packed 32-bit integers
/// using the current rounding mode.
/// Not `pure`: the result depends on MXCSR (or FPCR on ARM).
__m128i _mm_cvtps_epi32 (__m128 a) @trusted
{
    static if (LDC_with_SSE2)
    {
        return cast(__m128i) __builtin_ia32_cvtps2dq(a);
    }
    else static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cvtps2dq(a);
    }
    else static if (LDC_with_ARM64)
    {
        // Get current rounding mode.
        uint fpscr = arm_get_fpcr();
        // Pick the NEON convert that matches the active rounding mode.
        switch(fpscr & _MM_ROUND_MASK_ARM)
        {
            default:
            case _MM_ROUND_NEAREST_ARM:     return vcvtnq_s32_f32(a);
            case _MM_ROUND_DOWN_ARM:        return vcvtmq_s32_f32(a);
            case _MM_ROUND_UP_ARM:          return vcvtpq_s32_f32(a);
            case _MM_ROUND_TOWARD_ZERO_ARM: return vcvtzq_s32_f32(a);
        }
    }
    else
    {
        // Scalar fallback: each lane converted via the MXCSR-aware helper.
        __m128i r = void;
        r.ptr[0] = convertFloatToInt32UsingMXCSR(a.array[0]);
        r.ptr[1] = convertFloatToInt32UsingMXCSR(a.array[1]);
        r.ptr[2] = convertFloatToInt32UsingMXCSR(a.array[2]);
        r.ptr[3] = convertFloatToInt32UsingMXCSR(a.array[3]);
        return r;
    }
}
unittest
{
    // GDC bug #98607
    // https://gcc.gnu.org/bugzilla/show_bug.cgi?id=98607
    // GDC does not provide optimization barrier for rounding mode.
    // Workarounded with different literals. This bug will likely only manifest in unittest.
    // GCC people provided no actual fix and instead say other compilers are buggy... when they aren't.

    uint savedRounding = _MM_GET_ROUNDING_MODE();

    _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST);
    __m128i A = _mm_cvtps_epi32(_mm_setr_ps(1.4f, -2.1f, 53.5f, -2.9f));
    assert(A.array == [1, -2, 54, -3]);

    _MM_SET_ROUNDING_MODE(_MM_ROUND_DOWN);
    A = _mm_cvtps_epi32(_mm_setr_ps(1.3f, -2.11f, 53.4f, -2.8f));
    assert(A.array == [1, -3, 53, -3]);

    _MM_SET_ROUNDING_MODE(_MM_ROUND_UP);
    A = _mm_cvtps_epi32(_mm_setr_ps(1.3f, -2.12f, 53.6f, -2.7f));
    assert(A.array == [2, -2, 54, -2]);

    _MM_SET_ROUNDING_MODE(_MM_ROUND_TOWARD_ZERO);
    A = _mm_cvtps_epi32(_mm_setr_ps(1.4f, -2.17f, 53.8f, -2.91f));
    assert(A.array == [1, -2, 53, -2]);

    _MM_SET_ROUNDING_MODE(savedRounding);
}
1571 
/// Convert packed single-precision (32-bit) floating-point elements 
/// in `a` to packed double-precision (64-bit) floating-point elements.
/// Only lanes 0 and 1 of `a` are converted (float -> double is exact).
__m128d _mm_cvtps_pd (__m128 a) pure @trusted
{
    static if (LDC_with_optimizations)
    {
        // Generates cvtps2pd since LDC 1.0 -O0
        enum ir = `
            %v = shufflevector <4 x float> %0,<4 x float> %0, <2 x i32> <i32 0, i32 1>
            %r = fpext <2 x float> %v to <2 x double>
            ret <2 x double> %r`;
        return cast(__m128d) LDCInlineIR!(ir, __m128d, __m128)(a);
    }
    else static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cvtps2pd(a);
    }
    else
    {
        // Scalar fallback: widen lanes 0 and 1.
        double2 r = void;
        r.ptr[0] = a.array[0];
        r.ptr[1] = a.array[1];
        return r;
    }
}
unittest
{
    __m128d A = _mm_cvtps_pd(_mm_set1_ps(54.0f));
    assert(A.array[0] == 54.0);
    assert(A.array[1] == 54.0);
}
1603 
/// Copy the lower double-precision (64-bit) floating-point element of `a`.
double _mm_cvtsd_f64 (__m128d a) pure @safe
{
    // Simple lane extraction, no conversion involved.
    return a.array[0];
}
1609 
/// Convert the lower double-precision (64-bit) floating-point element
/// in `a` to a 32-bit integer.
/// Uses the current rounding mode; not `pure` since it reads MXCSR.
int _mm_cvtsd_si32 (__m128d a) @safe
{
    static if (LDC_with_SSE2)
    {
        return __builtin_ia32_cvtsd2si(a);
    }
    else static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cvtsd2si(a);
    }
    else
    {
        // Scalar fallback honouring the MXCSR rounding mode.
        return convertDoubleToInt32UsingMXCSR(a[0]);
    }
}
unittest
{
    assert(4 == _mm_cvtsd_si32(_mm_set1_pd(4.0)));
}
1631 
/// Convert the lower double-precision (64-bit) floating-point element in `a` to a 64-bit integer.
/// Uses the current rounding mode; not `pure` since it reads MXCSR.
long _mm_cvtsd_si64 (__m128d a) @trusted
{
    static if (LDC_with_SSE2)
    {
        version (X86_64)
        {
            return __builtin_ia32_cvtsd2si64(a);
        }
        else
        {
            // Note: In 32-bit x86, there is no way to convert from float/double to 64-bit integer
            // using SSE instructions only. So the builtin doesn't exist for this arch.
            return convertDoubleToInt64UsingMXCSR(a[0]);
        }
    }
    else
    {
        // Generic fallback honouring the MXCSR rounding mode.
        return convertDoubleToInt64UsingMXCSR(a.array[0]);
    }
}
unittest
{
    assert(-4 == _mm_cvtsd_si64(_mm_set1_pd(-4.0)));

    uint savedRounding = _MM_GET_ROUNDING_MODE();

    _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST);
    assert(-56468486186 == _mm_cvtsd_si64(_mm_set1_pd(-56468486186.49)));

    _MM_SET_ROUNDING_MODE(_MM_ROUND_DOWN);
    assert(-56468486187 == _mm_cvtsd_si64(_mm_set1_pd(-56468486186.1)));

    _MM_SET_ROUNDING_MODE(_MM_ROUND_UP);
    assert(56468486187 == _mm_cvtsd_si64(_mm_set1_pd(56468486186.1)));

    _MM_SET_ROUNDING_MODE(_MM_ROUND_TOWARD_ZERO);
    assert(-56468486186 == _mm_cvtsd_si64(_mm_set1_pd(-56468486186.9)));

    _MM_SET_ROUNDING_MODE(savedRounding);
}

deprecated("Use _mm_cvtsd_si64 instead") alias _mm_cvtsd_si64x = _mm_cvtsd_si64; ///
1675 
/// Convert the lower double-precision (64-bit) floating-point element in `b` to a single-precision (32-bit) 
/// floating-point element, store that in the lower element of result, and copy the upper 3 packed elements from `a`
/// to the upper elements of result.
__m128 _mm_cvtsd_ss (__m128 a, __m128d b) pure @trusted
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cvtsd2ss(a, b); 
    }
    else
    {
        // Generates cvtsd2ss since LDC 1.3 -O0
        // Implicit double -> float narrowing on the scalar store.
        a.ptr[0] = b.array[0];
        return a;
    }
}
unittest
{
    __m128 R = _mm_cvtsd_ss(_mm_set1_ps(4.0f), _mm_set1_pd(3.0));
    assert(R.array == [3.0f, 4.0f, 4.0f, 4.0f]);
}
1697 
/// Get the lower 32-bit integer in `a`.
int _mm_cvtsi128_si32 (__m128i a) pure @safe
{
    // Lane 0 extraction; no conversion.
    return a.array[0];
}
1703 
/// Get the lower 64-bit integer in `a`.
long _mm_cvtsi128_si64 (__m128i a) pure @safe
{
    // Reinterpret as two 64-bit lanes and take lane 0.
    long2 la = cast(long2)a;
    return la.array[0];
}
deprecated("Use _mm_cvtsi128_si64 instead") alias _mm_cvtsi128_si64x = _mm_cvtsi128_si64;
1711 
/// Convert the signed 32-bit integer `b` to a double-precision (64-bit) floating-point element, store that in the 
/// lower element of result, and copy the upper element from `a` to the upper element of result.
__m128d _mm_cvtsi32_sd(__m128d a, int b) pure @trusted
{
    // int -> double is exact; upper lane of `a` is preserved.
    a.ptr[0] = cast(double)b;
    return a;
}
unittest
{
    __m128d a = _mm_cvtsi32_sd(_mm_set1_pd(0.0f), 42);
    assert(a.array == [42.0, 0]);
}
1724 
/// Copy 32-bit integer `a` to the lower element of result, and zero the upper elements.
__m128i _mm_cvtsi32_si128 (int a) pure @trusted
{
    int4 result = 0;     // broadcast zero to all four lanes
    result.ptr[0] = a;   // then place `a` in lane 0
    return result;
}
unittest
{
    __m128i a = _mm_cvtsi32_si128(65);
    assert(a.array == [65, 0, 0, 0]);
}
1737 
/// Convert the signed 64-bit integer `b` to a double-precision (64-bit) floating-point element, store the result in 
/// the lower element of result, and copy the upper element from `a` to the upper element of result.

__m128d _mm_cvtsi64_sd(__m128d a, long b) pure @trusted
{
    // long -> double may round for |b| > 2^53; upper lane of `a` is preserved.
    a.ptr[0] = cast(double)b;
    return a;
}
unittest
{
    __m128d a = _mm_cvtsi64_sd(_mm_set1_pd(0.0f), 42);
    assert(a.array == [42.0, 0]);
}
1751 
/// Copy 64-bit integer `a` to the lower element of result, and zero the upper element.
__m128i _mm_cvtsi64_si128 (long a) pure @trusted
{
    long2 result = 0;    // both 64-bit lanes start at zero
    result.ptr[0] = a;   // lane 0 receives `a`
    return cast(__m128i)(result);
}
1759 
1760 deprecated("Use _mm_cvtsi64_sd instead") alias _mm_cvtsi64x_sd = _mm_cvtsi64_sd; ///
1761 deprecated("Use _mm_cvtsi64_si128 instead") alias _mm_cvtsi64x_si128 = _mm_cvtsi64_si128; ///
1762 
/// Convert the lower single-precision (32-bit) floating-point element in `b` to a double-precision (64-bit) 
/// floating-point element, store that in the lower element of result, and copy the upper element from `a` to the upper 
// element of result.
double2 _mm_cvtss_sd(double2 a, float4 b) pure @trusted
{
    // float -> double widening is exact; upper lane of `a` is preserved.
    a.ptr[0] = b.array[0];
    return a;
}
unittest
{
    __m128d a = _mm_cvtss_sd(_mm_set1_pd(0.0f), _mm_set1_ps(42.0f));
    assert(a.array == [42.0, 0]);
}
1776 
/// Convert the lower single-precision (32-bit) floating-point element in `a` to a 64-bit integer with truncation.
long _mm_cvttss_si64 (__m128 a) pure @safe
{
    // Truncation (round toward zero) regardless of the MXCSR rounding mode.
    return cast(long)(a.array[0]); // Generates cvttss2si as expected
}
unittest
{
    assert(1 == _mm_cvttss_si64(_mm_setr_ps(1.9f, 2.0f, 3.0f, 4.0f)));
}
1786 
/// Convert packed double-precision (64-bit) floating-point elements in `a` to packed 32-bit integers with truncation.
/// Put zeroes in the upper elements of result.
__m128i _mm_cvttpd_epi32 (__m128d a) pure @trusted
{
    static if (LDC_with_SSE2)
    {
        return __builtin_ia32_cvttpd2dq(a);
    }
    else static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cvttpd2dq(a);
    }
    else
    {
        // Note: doesn't generate cvttpd2dq as of LDC 1.13
        // cast(int) truncates toward zero, matching the intrinsic.
        __m128i r; // PERF =void;
        r.ptr[0] = cast(int)a.array[0];
        r.ptr[1] = cast(int)a.array[1];
        r.ptr[2] = 0;
        r.ptr[3] = 0;
        return r;
    }
}
unittest
{
    __m128i R = _mm_cvttpd_epi32(_mm_setr_pd(-4.9, 45641.5f));
    assert(R.array == [-4, 45641, 0, 0]);
}
1815 
/// Convert packed double-precision (64-bit) floating-point elements in `v` 
/// to packed 32-bit integers with truncation.
__m64 _mm_cvttpd_pi32 (__m128d v) pure @safe
{
    // Reuse the SSE2 truncating conversion and keep the low 64 bits.
    return to_m64(_mm_cvttpd_epi32(v));
}
unittest
{
    int2 R = cast(int2) _mm_cvttpd_pi32(_mm_setr_pd(-4.9, 45641.7f));
    int[2] correct = [-4, 45641];
    assert(R.array == correct);
}
1828 
/// Convert packed single-precision (32-bit) floating-point elements in `a` to packed 32-bit integers with truncation.
__m128i _mm_cvttps_epi32 (__m128 a) pure @trusted
{
    // x86: Generates cvttps2dq since LDC 1.3 -O2
    // ARM64: generates fcvtze since LDC 1.8 -O2
    // cast(int) truncates toward zero, matching the intrinsic.
    __m128i r; // PERF = void;
    r.ptr[0] = cast(int)a.array[0];
    r.ptr[1] = cast(int)a.array[1];
    r.ptr[2] = cast(int)a.array[2];
    r.ptr[3] = cast(int)a.array[3];
    return r;
}
unittest
{
    __m128i R = _mm_cvttps_epi32(_mm_setr_ps(-4.9, 45641.5f, 0.0f, 1.0f));
    assert(R.array == [-4, 45641, 0, 1]);
}
1846 
/// Convert the lower double-precision (64-bit) floating-point element in `a` to a 32-bit integer with truncation.
int _mm_cvttsd_si32 (__m128d a)
{
    // Generates cvttsd2si since LDC 1.3 -O0
    // Truncation (round toward zero) regardless of the MXCSR rounding mode.
    return cast(int)a.array[0];
}
1853 
/// Convert the lower double-precision (64-bit) floating-point element in `a` to a 64-bit integer with truncation.
long _mm_cvttsd_si64 (__m128d a)
{
    // Generates cvttsd2si since LDC 1.3 -O0
    // but in 32-bit instead, it's a long sequence that resort to FPU
    double lower = a.array[0];
    return cast(long)lower;
}

deprecated("Use _mm_cvttsd_si64 instead") alias _mm_cvttsd_si64x = _mm_cvttsd_si64; ///
1863 
/// Divide packed double-precision (64-bit) floating-point elements in `a` by packed elements in `b`.
__m128d _mm_div_pd(__m128d a, __m128d b) pure @safe
{
    pragma(inline, true);
    // Element-wise vector division maps directly to divpd.
    __m128d quotient = a / b;
    return quotient;
}
1870 
/// Divide the lower double-precision (64-bit) floating-point element in `a` by the lower element in `b`, store the
/// result in the lower element of result, and copy the upper element from `a` to the upper element of result.
__m128d _mm_div_sd(__m128d a, __m128d b) pure @trusted
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_divsd(a, b);
    }
    else version(DigitalMars)
    {
        // Work-around for https://issues.dlang.org/show_bug.cgi?id=19599
        // Note that this is unneeded since DMD >= 2.094.0 at least, haven't investigated again
        asm pure nothrow @nogc @trusted { nop;}
        a.array[0] = a.array[0] / b.array[0];
        return a;
    }
    else
    {
        // Only the low lane is divided; a.array[1] passes through unchanged.
        a.ptr[0] /= b.array[0];
        return a;
    }
}
unittest
{
    __m128d a = [2.0, 4.5];
    a = _mm_div_sd(a, a);
    assert(a.array == [1.0, 4.5]);
}
1897 
/// Extract a 16-bit integer from `v`, selected with `index`.
/// Warning: the returned value is zero-extended to 32-bits.
int _mm_extract_epi16(__m128i v, int index) pure @safe
{
    // Only the low 3 bits of index select the lane, like the hardware instruction.
    short8 lanes = cast(short8)v;
    ushort selected = cast(ushort) lanes.array[index & 7];
    return selected;
}
unittest
{
    __m128i A = _mm_set_epi16(7, 6, 5, 4, 3, 2, 1, -1);
    assert(_mm_extract_epi16(A, 6) == 6);
    assert(_mm_extract_epi16(A, 0) == 65535);
    assert(_mm_extract_epi16(A, 5 + 8) == 5);
}
1912 
/// Copy `v`, and insert the 16-bit integer `i` at the location specified by `index`.
__m128i _mm_insert_epi16 (__m128i v, int i, int index) @trusted
{
    // Lane selection wraps on the low 3 bits of index, like pinsrw.
    short8 result = cast(short8)v;
    result.ptr[index & 7] = cast(short)i;
    return cast(__m128i)result;
}
unittest
{
    __m128i A = _mm_set_epi16(7, 6, 5, 4, 3, 2, 1, 0);
    short8 R = cast(short8) _mm_insert_epi16(A, 42, 6);
    short[8] correct = [0, 1, 2, 3, 4, 5, 42, 7];
    assert(R.array == correct);
}
1927 
/// Perform a serializing operation on all load-from-memory instructions that were issued prior 
/// to this instruction. Guarantees that every load instruction that precedes, in program order, 
/// is globally visible before any load instruction which follows the fence in program order.
void _mm_lfence() @trusted
{
    version(GNU)
    {
        static if (GDC_with_SSE2)
        {
            __builtin_ia32_lfence();
        }
        else version(X86)
        {
            // GDC without SSE2 support but on x86: emit the instruction directly.
            asm pure nothrow @nogc @trusted
            {
                "lfence;\n" : : : ;
            }
        }
        else __warn_noop();
    }
    else static if (LDC_with_SSE2)
    {
        __builtin_ia32_lfence();
    }
    else static if (LDC_with_ARM64)
    {
         __builtin_arm_dmb(9);  // dmb ishld
    }
    else static if (DMD_with_asm)
    {
        asm nothrow @nogc pure @trusted
        {
            lfence;
        }
    }
    else version(LDC)
    {
        // When the architecture is unknown, generate a full memory barrier,
        // as the semantics of lfence do not really match those of atomics.
        llvm_memory_fence();
    }
    else
        static assert(false);
}
unittest
{
    _mm_lfence();
}
1976 
/// Load 128-bits (composed of 2 packed double-precision (64-bit) floating-point elements) from memory.
/// `mem_addr` must be aligned on a 16-byte boundary or a general-protection exception may be generated.
__m128d _mm_load_pd (const(double) * mem_addr) pure
{
    pragma(inline, true);
    // An aligned load is just a dereference through a vector pointer.
    return *cast(__m128d*)mem_addr;
}
unittest
{
    align(16) double[2] S = [-5.0, 7.0];
    __m128d R = _mm_load_pd(S.ptr);
    assert(R.array == S);
}
1991 
/// Load a double-precision (64-bit) floating-point element from memory into both elements of dst.
/// `mem_addr` does not need to be aligned on any particular boundary.
__m128d _mm_load_pd1 (const(double)* mem_addr) pure
{
    // Read once, then broadcast the scalar into both lanes.
    double value = *mem_addr;
    __m128d broadcast; // PERF =void;
    broadcast.ptr[0] = value;
    broadcast.ptr[1] = value;
    return broadcast;
}
unittest
{
    double what = 4;
    __m128d R = _mm_load_pd1(&what);
    double[2] correct = [4.0, 4];
    assert(R.array == correct);
}
2009 
/// Load a double-precision (64-bit) floating-point element from memory into the lower of result, and zero the upper 
/// element. `mem_addr` does not need to be aligned on any particular boundary.
__m128d _mm_load_sd (const(double)* mem_addr) pure @trusted
{
    // Start from an all-zero vector, then overwrite the low lane.
    double2 loaded = [0, 0];
    loaded.ptr[0] = *mem_addr;
    return loaded;
}
unittest
{
    double x = -42;
    __m128d a = _mm_load_sd(&x);
    assert(a.array == [-42.0, 0.0]);
}
2024 
/// Load 128-bits of integer data from memory into dst.
/// `mem_addr` must be aligned on a 16-byte boundary or a general-protection exception may be generated.
__m128i _mm_load_si128 (const(__m128i)* mem_addr) pure @safe
{
    pragma(inline, true);
    // Aligned load: a plain dereference of the vector pointer.
    return *mem_addr;
}
unittest
{
    align(16) int[4] correct = [-1, 2, 3, 4];
    int4 A = cast(int4) _mm_load_si128(cast(__m128i*) correct.ptr);
    assert(A.array == correct);
}

alias _mm_load1_pd = _mm_load_pd1; ///
2040 
/// Load a double-precision (64-bit) floating-point element from memory into the upper element of result, and copy the 
/// lower element from `a` to result. `mem_addr` does not need to be aligned on any particular boundary.
__m128d _mm_loadh_pd (__m128d a, const(double)* mem_addr) pure @trusted
{
    pragma(inline, true);
    // Keep the low lane of `a`, replace the high lane with the loaded scalar.
    __m128d r = a;
    r.ptr[1] = *mem_addr;
    return r;
}
unittest
{
    double A = 7.0;
    __m128d B = _mm_setr_pd(4.0, -5.0);
    __m128d R = _mm_loadh_pd(B, &A);
    double[2] correct = [ 4.0, 7.0 ];
    assert(R.array == correct);
}
2057 
/// Load 64-bit integer from memory into the first element of result. Zero out the other.
/// Note: strange signature since the memory doesn't have to aligned, and should point to addressable 64-bit, not 128-bit.
/// You may use `_mm_loadu_si64` instead.
__m128i _mm_loadl_epi64 (const(__m128i)* mem_addr) pure @trusted
{
    pragma(inline, true);
    static if (DMD_with_DSIMD)
    {
        return cast(__m128i) __simd(XMM.LODQ, *cast(__m128i*)mem_addr);
    }
    else
    {
        // Only 64 bits are actually read through pLong; the high lane is zeroed.
        auto pLong = cast(const(long)*)mem_addr;
        long2 r = [0, 0];
        r.ptr[0] = *pLong;
        return cast(__m128i)(r);
    }
}
unittest
{
    long A = 0x7878787870707070;
    long2 R = cast(long2) _mm_loadl_epi64(cast(__m128i*)&A);
    long[2] correct = [0x7878787870707070, 0];
    assert(R.array == correct);
}
2083 
/// Load a double-precision (64-bit) floating-point element from memory into the lower element of result, and copy the 
/// upper element from `a` to result. mem_addr does not need to be aligned on any particular boundary.
__m128d _mm_loadl_pd (__m128d a, const(double)* mem_addr) pure @trusted
{
    // Keep the high lane of `a`, replace the low lane with the loaded scalar.
    __m128d r = a;
    r.ptr[0] = *mem_addr;
    return r;
}
unittest
{
    double A = 7.0;
    __m128d B = _mm_setr_pd(4.0, -5.0);
    __m128d R = _mm_loadl_pd(B, &A);
    double[2] correct = [ 7.0, -5.0 ];
    assert(R.array == correct);
}
2099 
/// Load 2 double-precision (64-bit) floating-point elements from memory into result in reverse order. 
/// `mem_addr` must be aligned on a 16-byte boundary or a general-protection exception may be generated.
__m128d _mm_loadr_pd (const(double)* mem_addr) pure @trusted
{
    // Aligned load, then swap the two lanes.
    __m128d loaded = *cast(__m128d*)(mem_addr);
    __m128d swapped; // PERF =void;
    swapped.ptr[0] = loaded.array[1];
    swapped.ptr[1] = loaded.array[0];
    return swapped;
}
unittest
{
    align(16) double[2] A = [56.0, -74.0];
    __m128d R = _mm_loadr_pd(A.ptr);
    double[2] correct = [-74.0, 56.0];
    assert(R.array == correct);
}
2117 
/// Load 128-bits (composed of 2 packed double-precision (64-bit) floating-point elements) from memory. 
/// `mem_addr` does not need to be aligned on any particular boundary.
__m128d _mm_loadu_pd (const(double)* mem_addr) pure @trusted
{
    pragma(inline, true);
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_loadupd(mem_addr); 
    }
    else static if (LDC_with_optimizations)
    {
        return loadUnaligned!(double2)(mem_addr);
    }
    else version(DigitalMars)
    {
        // Apparently inside __simd you can use aligned dereferences without fear.
        // That was issue 23048 on dlang's Bugzilla.
        static if (DMD_with_DSIMD)
        {
            return cast(__m128d)__simd(XMM.LODUPD, *cast(double2*)mem_addr);
        }
        else static if (SSESizedVectorsAreEmulated)
        {
            // Since this vector is emulated, it doesn't have alignement constraints
            // and as such we can just cast it.
            return *cast(__m128d*)(mem_addr);
        }
        else
        {
            // Element-wise copy avoids any unaligned vector dereference.
            __m128d result;
            result.ptr[0] = mem_addr[0];
            result.ptr[1] = mem_addr[1];
            return result;
        }
    }
    else
    {
        // Generic fallback: element-wise copy, same as the DMD path above.
        __m128d result;
        result.ptr[0] = mem_addr[0];
        result.ptr[1] = mem_addr[1];
        return result;
    }
}
unittest
{
    double[2] A = [56.0, -75.0];
    __m128d R = _mm_loadu_pd(A.ptr);
    double[2] correct = [56.0, -75.0];
    assert(R.array == correct);
}
2168 
/// Load 128-bits of integer data from memory. `mem_addr` does not need to be aligned on any particular boundary.
__m128i _mm_loadu_si128 (const(__m128i)* mem_addr) pure @trusted
{
    // PERF DMD
    pragma(inline, true);
    static if (GDC_with_SSE2)
    {
        return cast(__m128i) __builtin_ia32_loaddqu(cast(const(char*))mem_addr);
    }
    else static if (LDC_with_optimizations)
    {
        return loadUnaligned!(__m128i)(cast(int*)mem_addr);
    }
    else
    {
        // Element-wise copy avoids any unaligned vector dereference.
        const(int)* p = cast(const(int)*)mem_addr;
        __m128i r = void;
        r.ptr[0] = p[0];
        r.ptr[1] = p[1];
        r.ptr[2] = p[2];
        r.ptr[3] = p[3];
        return r;
    }
}
unittest
{
    align(16) int[4] correct = [-1, 2, -3, 4];
    int4 A = cast(int4) _mm_loadu_si128(cast(__m128i*) correct.ptr);
    assert(A.array == correct);
}
2199 
/// Load unaligned 16-bit integer from memory into the first element, fill with zeroes otherwise.
__m128i _mm_loadu_si16(const(void)* mem_addr) pure @trusted // TODO: should be @system actually
{
    static if (DMD_with_DSIMD)
    {
        // Widen the short to an int first so LODD zero-fills the other lanes.
        int r = *cast(short*)(mem_addr);
        return cast(__m128i) __simd(XMM.LODD, *cast(__m128i*)&r);
    }
    else version(DigitalMars)
    {
        // Workaround issue: https://issues.dlang.org/show_bug.cgi?id=21672
        // DMD cannot handle the below code...
        align(16) short[8] r = [0, 0, 0, 0, 0, 0, 0, 0];
        r[0] = *cast(short*)(mem_addr);
        return *cast(int4*)(r.ptr);
    }
    else
    {
        // Zero vector with only lane 0 filled from memory.
        short r = *cast(short*)(mem_addr);
        short8 result = [0, 0, 0, 0, 0, 0, 0, 0];
        result.ptr[0] = r;
        return cast(__m128i)result;
    }
}
unittest
{
    short r = 13;
    short8 A = cast(short8) _mm_loadu_si16(&r);
    short[8] correct = [13, 0, 0, 0, 0, 0, 0, 0];
    assert(A.array == correct);
}
2231 
/// Load unaligned 32-bit integer from memory into the first element of result.
__m128i _mm_loadu_si32 (const(void)* mem_addr) pure @trusted // TODO: should be @system actually
{
    pragma(inline, true);
    // Zero vector with only lane 0 filled from memory.
    int4 result = [0, 0, 0, 0];
    result.ptr[0] = *cast(int*)(mem_addr);
    return result;
}
unittest
{
    int r = 42;
    __m128i A = _mm_loadu_si32(&r);
    int[4] correct = [42, 0, 0, 0];
    assert(A.array == correct);
}
2248 
/// Load unaligned 64-bit integer from memory into the first element of result.
/// Upper 64-bit is zeroed.
__m128i _mm_loadu_si64 (const(void)* mem_addr) pure @system
{
    pragma(inline, true);
    static if (DMD_with_DSIMD)
    {
        return cast(__m128i) __simd(XMM.LODQ, *cast(__m128i*)mem_addr);
    }
    else
    {    
        // Only 64 bits are actually read through pLong; the high lane is zeroed.
        auto pLong = cast(const(long)*)mem_addr;
        long2 r = [0, 0];
        r.ptr[0] = *pLong;
        return cast(__m128i)r;
    }
}
unittest
{
    long r = 446446446446;
    long2 A = cast(long2) _mm_loadu_si64(&r);
    long[2] correct = [446446446446, 0];
    assert(A.array == correct);
}
2273 
/// Multiply packed signed 16-bit integers in `a` and `b`, producing intermediate
/// signed 32-bit integers. Horizontally add adjacent pairs of intermediate 32-bit integers,
/// and pack the results in destination.
__m128i _mm_madd_epi16 (__m128i a, __m128i b) pure @trusted
{
    static if (GDC_with_SSE2)
    {
        return cast(__m128i) __builtin_ia32_pmaddwd128(cast(short8)a, cast(short8)b);
    }
    else static if (LDC_with_SSE2)
    {
        return cast(__m128i) __builtin_ia32_pmaddwd128(cast(short8)a, cast(short8)b);
    }
    else static if (LDC_with_optimizations)
    {
        // Sign-extend to 32-bit, multiply, then add even/odd product pairs.
        // 5 inst with arm64 + LDC 1.32 + -O1
        enum ir = `            
            %ia = sext <8 x i16> %0 to <8 x i32>
            %ib = sext <8 x i16> %1 to <8 x i32>
            %p = mul <8 x i32> %ia, %ib
            %p_even = shufflevector <8 x i32> %p, <8 x i32> undef, <4 x i32> <i32 0, i32 2,i32 4, i32 6>
            %p_odd  = shufflevector <8 x i32> %p, <8 x i32> undef, <4 x i32> <i32 1, i32 3,i32 5, i32 7>            
            %p_sum = add <4 x i32> %p_even, %p_odd
            ret <4 x i32> %p_sum`;
        return cast(__m128i) LDCInlineIR!(ir, int4, short8, short8)(cast(short8)a, cast(short8)b);
    }
    else
    {
        // Portable fallback: each pair of 16-bit products is summed in 32-bit
        // arithmetic (wrapping), as exercised by the -32768 case in the unittest.
        short8 sa = cast(short8)a;
        short8 sb = cast(short8)b;
        int4 r;
        foreach(i; 0..4)
        {
            r.ptr[i] = sa.array[2*i] * sb.array[2*i] + sa.array[2*i+1] * sb.array[2*i+1];
        }
        return r;
    }
}
unittest
{
    short8 A = [0, 1, 2, 3, -32768, -32768, 32767, 32767];
    short8 B = [0, 1, 2, 3, -32768, -32768, 32767, 32767];
    int4 R = _mm_madd_epi16(cast(__m128i)A, cast(__m128i)B);
    int[4] correct = [1, 13, -2147483648, 2*32767*32767];
    assert(R.array == correct);
}
2320 
/// Conditionally store 8-bit integer elements from `a` into memory using `mask`
/// (elements are not stored when the highest bit is not set in the corresponding element)
/// and a non-temporal memory hint. `mem_addr` does not need to be aligned on any particular
/// boundary.
void _mm_maskmoveu_si128 (__m128i a, __m128i mask, void* mem_addr) @trusted
{
    static if (GDC_with_SSE2)
    {    
        return __builtin_ia32_maskmovdqu(cast(ubyte16)a, cast(ubyte16)mask, cast(char*)mem_addr);
    }
    else static if (LDC_with_SSE2)
    {
        return __builtin_ia32_maskmovdqu(cast(byte16)a, cast(byte16)mask, cast(char*)mem_addr);
    }
    else static if (LDC_with_ARM64)
    {
        // PERF: catastrophic on ARM32
        // Emulated as read-modify-write: blend `a` into the existing memory
        // contents using the sign-extended byte mask. Note: no non-temporal hint here.
        byte16 bmask  = cast(byte16)mask;
        byte16 shift = 7;
        bmask = bmask >> shift; // sign-extend to have a 0xff or 0x00 mask
        mask = cast(__m128i) bmask;
        __m128i dest = loadUnaligned!__m128i(cast(int*)mem_addr);
        dest = (a & mask) | (dest & ~mask);
        storeUnaligned!__m128i(dest, cast(int*)mem_addr);
    }
    else
    {
        // Scalar fallback: store each byte whose mask byte has the top bit set.
        byte16 b = cast(byte16)a;
        byte16 m = cast(byte16)mask;
        byte* dest = cast(byte*)(mem_addr);
        foreach(j; 0..16)
        {
            if (m.array[j] & 128)
            {
                dest[j] = b.array[j];
            }
        }
    }
}
unittest
{
    ubyte[16] dest =           [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42];
    __m128i mask = _mm_setr_epi8(0,-1, 0,-1,-1, 1,-1,-1, 0,-1,-4,-1,-1, 0,-127, 0);
    __m128i A    = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15);
    _mm_maskmoveu_si128(A, mask, dest.ptr);
    ubyte[16] correct =        [42, 1,42, 3, 4,42, 6, 7,42, 9,10,11,12,42,14,42];
    assert(dest == correct);
}
2369 
/// Compare packed signed 16-bit integers in `a` and `b`, and return packed maximum values.
__m128i _mm_max_epi16 (__m128i a, __m128i b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return cast(__m128i) __builtin_ia32_pmaxsw128(cast(short8)a, cast(short8)b);
    }
    else version(LDC)
    {
        // x86: pmaxsw since LDC 1.0 -O1
        // ARM: smax.8h since LDC 1.5 -01
        short8 sa = cast(short8)a;
        short8 sb = cast(short8)b;
        static if (SIMD_COMPARISON_MASKS_16B)
            short8 greater = sa > sb;
        else
            short8 greater = greaterMask!short8(sa, sb);
        // Blend: lanes of `sa` where greater, else lanes of `sb`.
        return cast(__m128i)( (greater & sa) | (~greater & sb) );
    }
    else
    {
        __m128i lowerShorts = _mm_cmpgt_epi16(a, b); // ones where a should be selected, b else
        __m128i aTob = _mm_xor_si128(a, b); // a ^ (a ^ b) == b
        __m128i mask = _mm_and_si128(aTob, lowerShorts);
        return _mm_xor_si128(b, mask);
    }
}
unittest
{
    short8 R = cast(short8) _mm_max_epi16(_mm_setr_epi16(32767, 1, -4, -8, 9,  7, 0,-57),
                                          _mm_setr_epi16(-4,-8,  9,  7, 0,-32768, 0,  0));
    short[8] correct =                                  [32767, 1,  9,  7, 9,  7, 0,  0];
    assert(R.array == correct);
}
2404 
/// Compare packed unsigned 8-bit integers in a and b, and return packed maximum values.
__m128i _mm_max_epu8 (__m128i a, __m128i b) pure @safe
{
    // PERF DMD
    static if (GDC_with_SSE2)
    {
        return cast(__m128i) __builtin_ia32_pmaxub128(cast(ubyte16)a, cast(ubyte16)b);
    }
    else version(LDC)
    {
        // x86: pmaxub since LDC 1.0.0 -O1
        // ARM64: umax.16b since LDC 1.5.0 -O1
        // PERF: catastrophic on ARM32
        ubyte16 sa = cast(ubyte16)a;
        ubyte16 sb = cast(ubyte16)b;
        static if (SIMD_COMPARISON_MASKS_16B)
            ubyte16 greater = sa > sb; // reuse sa/sb, consistent with _mm_max_epi16
        else
            ubyte16 greater = cast(ubyte16) greaterMask!ubyte16(sa, sb);
        // Blend: lanes of `sa` where greater, else lanes of `sb`.
        return cast(__m128i)( (greater & sa) | (~greater & sb) );
    }
    else
    {
        // PERF: use algorithm from _mm_max_epu16
        // Unsigned compare emulated as a signed compare after biasing both sides by -128.
        __m128i value128 = _mm_set1_epi8(-128);
        __m128i higher = _mm_cmpgt_epi8(_mm_add_epi8(a, value128), _mm_add_epi8(b, value128)); // signed comparison
        __m128i aTob = a ^ b; // a ^ (a ^ b) == b
        __m128i mask = aTob & higher;
        return b ^ mask;
    }
}
unittest
{
    byte16 R = cast(byte16) _mm_max_epu8(_mm_setr_epi8(45, 1, -4, -8, 9,  7, 0,-57, -4,-8,  9,  7, 0,-57, 0,  0),
                                         _mm_setr_epi8(-4,-8,  9,  7, 0,-57, 0,  0, 45, 1, -4, -8, 9,  7, 0,-57));
    byte[16] correct =                                [-4,-8, -4, -8, 9,-57, 0,-57, -4,-8, -4, -8, 9,-57, 0,-57];
    assert(R.array == correct);
}
2444 
/// Compare packed double-precision (64-bit) floating-point elements in `a` and `b`, and return 
/// packed maximum values.
__m128d _mm_max_pd (__m128d a, __m128d b) pure @trusted
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_maxpd(a, b);
    }
    else
    {
        // x86: Generates maxpd starting with LDC 1.9 -O2
        // Per-lane select: lane from `a` only when strictly greater, else from `b`
        // (matches the (a > b) ? a : b form of the original implementation).
        __m128d r = a;
        if (!(a.array[0] > b.array[0])) r.ptr[0] = b.array[0];
        if (!(a.array[1] > b.array[1])) r.ptr[1] = b.array[1];
        return r;
    }
}
unittest
{
    __m128d A = _mm_setr_pd(4.0, 1.0);
    __m128d B = _mm_setr_pd(1.0, 8.0);
    __m128d M = _mm_max_pd(A, B);
    assert(M.array[0] == 4.0);
    assert(M.array[1] == 8.0);
}
2469 
/// Compare the lower double-precision (64-bit) floating-point elements in `a` and `b`, store the maximum value in the 
/// lower element of result, and copy the upper element from `a` to the upper element of result.
__m128d _mm_max_sd (__m128d a, __m128d b) pure @trusted
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_maxsd(a, b);
    }
    else
    {
        // Generates maxsd starting with LDC 1.3
        // Low lane from `a` only when strictly greater, else from `b`; high lane from `a`.
        __m128d r = a;
        if (!(a.array[0] > b.array[0]))
            r.ptr[0] = b.array[0];
        return r;
    }
}
unittest
{
    __m128d A = _mm_setr_pd(1.0, 1.0);
    __m128d B = _mm_setr_pd(4.0, 2.0);
    __m128d M = _mm_max_sd(A, B);
    assert(M.array[0] == 4.0);
    assert(M.array[1] == 1.0);
}
2494 
/// Perform a serializing operation on all load-from-memory and store-to-memory instructions that were issued prior to 
/// this instruction. Guarantees that every memory access that precedes, in program order, the memory fence instruction 
/// is globally visible before any memory instruction which follows the fence in program order.
void _mm_mfence() @trusted // not pure!
{
    version(GNU)
    {
        static if (GDC_with_SSE2)
        {
            __builtin_ia32_mfence();
        }
        else version(X86)
        {
            // GDC without SSE2 support but on x86: emit the instruction directly.
            asm pure nothrow @nogc @trusted
            {
                "mfence;\n" : : : ;
            }
        }
        else __warn_noop();
    }
    else static if (LDC_with_SSE2)
    {
        __builtin_ia32_mfence();
    }
    else static if (DMD_with_asm)
    {
        asm nothrow @nogc pure @trusted
        {
            mfence;
        }
    }
    else version(LDC)
    {
        // Note: will generate the DMB ish instruction on ARM
        llvm_memory_fence();
    }
    else
        static assert(false);
}
unittest
{
    _mm_mfence();
}
2538 
/// Compare packed signed 16-bit integers in `a` and `b`, and return packed minimum values.
__m128i _mm_min_epi16 (__m128i a, __m128i b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return cast(__m128i) __builtin_ia32_pminsw128(cast(short8)a, cast(short8)b);
    }
    else version(LDC)
    {
        // x86: pminsw since LDC 1.0 -O1
        // ARM64: smin.8h since LDC 1.5 -01
        short8 sa = cast(short8)a;
        short8 sb = cast(short8)b;
        static if (SIMD_COMPARISON_MASKS_16B)
            short8 greater = sa > sb;
        else
            short8 greater = greaterMask!short8(sa, sb);
        // Blend: lanes of `sb` where sa > sb, else lanes of `sa`.
        return cast(__m128i)( (~greater & sa) | (greater & sb) );
    }
    else
    {
        __m128i lowerShorts = _mm_cmplt_epi16(a, b); // ones where a should be selected, b else
        __m128i aTob = _mm_xor_si128(a, b); // a ^ (a ^ b) == b
        __m128i mask = _mm_and_si128(aTob, lowerShorts);
        return _mm_xor_si128(b, mask);
    }
}
unittest
{
    short8 R = cast(short8) _mm_min_epi16(_mm_setr_epi16(45, 1, -4, -8, 9,  7, 0,-32768),
                                          _mm_setr_epi16(-4,-8,  9,  7, 0,-57, 0,  0));
    short[8] correct =                                  [-4,-8, -4, -8, 0,-57, 0, -32768];
    assert(R.array == correct);
}
2573 
/// Compare packed unsigned 8-bit integers in `a` and `b`, and return packed minimum values.
__m128i _mm_min_epu8 (__m128i a, __m128i b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return cast(__m128i) __builtin_ia32_pminub128(cast(ubyte16)a, cast(ubyte16)b);
    }
    else version(LDC)
    {
        // x86: pminub since LDC 1.0.0 -O1
        // ARM: umin.16b since LDC 1.5.0 -O1
        // PERF: catastrophic on ARM32
        ubyte16 sa = cast(ubyte16)a;
        ubyte16 sb = cast(ubyte16)b;
        static if (SIMD_COMPARISON_MASKS_16B)
            ubyte16 greater = sa > sb; // reuse sa/sb, consistent with _mm_min_epi16
        else
            ubyte16 greater = cast(ubyte16) greaterMask!ubyte16(sa, sb);
        // Blend: lanes of `sb` where sa > sb, else lanes of `sa`.
        return cast(__m128i)( (~greater & sa) | (greater & sb) );
    }
    else
    {
        // PERF: use the algorithm from _mm_max_epu16
        // Unsigned compare emulated as a signed compare after biasing both sides by -128.
        __m128i value128 = _mm_set1_epi8(-128);
        __m128i lower = _mm_cmplt_epi8(_mm_add_epi8(a, value128), _mm_add_epi8(b, value128)); // signed comparison
        __m128i aTob = a ^ b; // a ^ (a ^ b) == b
        __m128i mask = aTob & lower;
        return b ^ mask;
    }
}
unittest
{
    byte16 R = cast(byte16) _mm_min_epu8(_mm_setr_epi8(45, 1, -4, -8, 9,  7, 0,-57, -4,-8,  9,  7, 0,-57, 0,  0),
                                         _mm_setr_epi8(-4,-8,  9,  7, 0,-57, 0,  0, 45, 1, -4, -8, 9,  7, 0,-57));
    byte[16] correct =                                [45, 1,  9,  7, 0,  7, 0,  0, 45, 1,  9,  7, 0,  7, 0,  0];
    assert(R.array == correct);
}
2611 
/// Compare packed double-precision (64-bit) floating-point elements in `a` and `b`, and return packed minimum values.
__m128d _mm_min_pd (__m128d a, __m128d b) pure @trusted
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_minpd(a, b);
    }
    else
    {
        // Generates minpd starting with LDC 1.9
        // Per-lane select: lane from `a` only when strictly less, else from `b`
        // (matches the (a < b) ? a : b form of the original implementation).
        __m128d r = a;
        if (!(a.array[0] < b.array[0])) r.ptr[0] = b.array[0];
        if (!(a.array[1] < b.array[1])) r.ptr[1] = b.array[1];
        return r;
    }
}
unittest
{
    __m128d A = _mm_setr_pd(1.0, 2.0);
    __m128d B = _mm_setr_pd(4.0, 1.0);
    __m128d M = _mm_min_pd(A, B);
    assert(M.array[0] == 1.0);
    assert(M.array[1] == 1.0);
}
2635 
/// Compare the lower double-precision (64-bit) floating-point elements in `a` and `b`, store the minimum value in 
/// the lower element of result, and copy the upper element from `a` to the upper element of result.
__m128d _mm_min_sd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_minsd(a, b);
    }
    else
    {
        // Generates minsd starting with LDC 1.3
        // NOTE(review): writes through r.array[0] under @safe, whereas sibling
        // _mm_max_sd writes r.ptr[0] under @trusted — confirm both forms are intended.
        __m128d r = a;
        r.array[0] = (a.array[0] < b.array[0]) ? a.array[0] : b.array[0];
        return r;
    }
}
unittest
{
    __m128d A = _mm_setr_pd(1.0, 3.0);
    __m128d B = _mm_setr_pd(4.0, 2.0);
    __m128d M = _mm_min_sd(A, B);
    assert(M.array[0] == 1.0);
    assert(M.array[1] == 3.0);
}
2660 
/// Copy the lower 64-bit integer in `a` to the lower element of result, and zero the upper element.
__m128i _mm_move_epi64 (__m128i a) pure @trusted
{
    static if (GDC_with_SSE2)
    {
        // slightly better with GDC -O0
        return cast(__m128i) __builtin_ia32_movq128(cast(long2)a); 
    }
    else
    {
        // Zeroed vector whose low lane is copied from `a`.
        long2 r = [ 0, 0 ];
        r.ptr[0] = (cast(long2)a).array[0];
        return cast(__m128i) r;
    }
}
unittest
{
    long2 A = [13, 47];
    long2 B = cast(long2) _mm_move_epi64( cast(__m128i)A );
    long[2] correct = [13, 0];
    assert(B.array == correct);
}
2684 
/// Move the lower double-precision (64-bit) floating-point element from `b` to the lower element of result, and copy 
/// the upper element from `a` to the upper element of dst.
__m128d _mm_move_sd (__m128d a, __m128d b) pure @trusted
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_movsd(a, b); 
    }
    else
    {
        // Start from `b` (low lane wanted), then restore the high lane of `a`.
        __m128d r = b;
        r.ptr[1] = a.array[1];
        return r;
    }
}
unittest
{
    double2 A = [13.0, 47.0];
    double2 B = [34.0, 58.0];
    double2 C = _mm_move_sd(A, B);
    double[2] correct = [34.0, 47.0];
    assert(C.array == correct);
}
2707 
/// Create mask from the most significant bit of each 8-bit element in `v`.
int _mm_movemask_epi8 (__m128i a) pure @trusted
{
    // PERF: Not possible in D_SIMD because of https://issues.dlang.org/show_bug.cgi?id=8047
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_pmovmskb128(cast(ubyte16)a);
    }
    else static if (LDC_with_SSE2)
    {
        return __builtin_ia32_pmovmskb128(cast(byte16)a);
    }
    else static if (LDC_with_ARM64)
    {
        // Solution from https://stackoverflow.com/questions/11870910/sse-mm-movemask-epi8-equivalent-method-for-arm-neon
        // The other two solutions lead to unfound intrinsics in LLVM and that took a long time.
        // SO there might be something a bit faster, but this one is reasonable and branchless.
        // Each half: isolate the sign bit, shift it down to its lane position,
        // then three pairwise adds fold the 8 bits into one byte per half.
        byte8 mask_shift;
        mask_shift.ptr[0] = 7;
        mask_shift.ptr[1] = 6;
        mask_shift.ptr[2] = 5;
        mask_shift.ptr[3] = 4;
        mask_shift.ptr[4] = 3;
        mask_shift.ptr[5] = 2;
        mask_shift.ptr[6] = 1;
        mask_shift.ptr[7] = 0;
        byte8 mask_and = byte8(-128);
        byte8 lo = vget_low_u8(cast(byte16)a);
        byte8 hi = vget_high_u8(cast(byte16)a);
        lo = vand_u8(lo, mask_and);
        lo = vshr_u8(lo, mask_shift);
        hi = vand_u8(hi, mask_and);
        hi = vshr_u8(hi, mask_shift);
        lo = vpadd_u8(lo,lo);
        lo = vpadd_u8(lo,lo);
        lo = vpadd_u8(lo,lo);
        hi = vpadd_u8(hi,hi);
        hi = vpadd_u8(hi,hi);
        hi = vpadd_u8(hi,hi);
        return (cast(ubyte)(hi[0]) << 8) | cast(ubyte)(lo[0]);
    }
    else
    {
        // Scalar fallback: bit i of the result is the sign bit of byte i.
        byte16 ai = cast(byte16)a;
        int r = 0;
        foreach(bit; 0..16)
        {
            if (ai.array[bit] < 0) r += (1 << bit);
        }
        return r;
    }
}
unittest
{
    assert(0x9C36 == _mm_movemask_epi8(_mm_set_epi8(-1, 1, 2, -3, -1, -1, 4, 8, 127, 0, -1, -1, 0, -1, -1, 0)));
}
2764 
/// Create mask from the most significant bit of each 16-bit element in `v`. #BONUS
int _mm_movemask_epi16 (__m128i a) pure @trusted
{
    // Narrow the eight 16-bit lanes to bytes (signed saturation preserves each
    // sign bit), then reuse the 8-bit movemask; the zero upper half contributes 0.
    __m128i narrowed = _mm_packs_epi16(a, _mm_setzero_si128());
    return _mm_movemask_epi8(narrowed);
}
unittest
{
    assert(0x9C == _mm_movemask_epi16(_mm_set_epi16(-1, 1, 2, -3, -32768, -1, 32767, 8)));
}
2774 
/// Set each bit of mask result based on the most significant bit of the corresponding packed double-precision (64-bit) 
/// floating-point element in `v`.
int _mm_movemask_pd(__m128d v) pure @safe
{
    // PERF: Not possible in D_SIMD because of https://issues.dlang.org/show_bug.cgi?id=8047
    static if (GDC_or_LDC_with_SSE2)
    {
        return __builtin_ia32_movmskpd(v);
    }
    else
    {
        // A double's MSB is its sign bit; test it through a 64-bit integer view.
        long2 bits = cast(long2)v;
        int mask = (bits.array[0] < 0) ? 1 : 0;
        if (bits.array[1] < 0)
            mask |= 2;
        return mask;
    }
}
unittest
{
    __m128d A = cast(__m128d) _mm_set_epi64x(-1, 0);
    assert(_mm_movemask_pd(A) == 2);
}
2798 
/// Copy the lower 64-bit integer in `v`.
__m64 _mm_movepi64_pi64 (__m128i v) pure @safe
{
    // Reinterpret as two 64-bit lanes and keep only the low one.
    long2 lanes = cast(long2)v;
    return long1(lanes.array[0]);
}
unittest
{
    __m128i A = _mm_set_epi64x(-1, -2);
    assert(_mm_movepi64_pi64(A).array[0] == -2);
}
2811 
/// Copy the 64-bit integer `a` to the lower element of dest, and zero the upper element.
__m128i _mm_movpi64_epi64 (__m64 a) pure @trusted
{
    long2 r;
    r.ptr[0] = a.array[0];
    r.ptr[1] = 0;
    return cast(__m128i)r;
}
unittest
{
    // Added test: this was the only intrinsic in the vicinity without one.
    __m64 A = _mm_cvtsi64_m64(-42);
    long2 R = cast(long2) _mm_movpi64_epi64(A);
    assert(R.array[0] == -42);
    assert(R.array[1] == 0);
}
2820 
/// Multiply the low unsigned 32-bit integers from each packed 64-bit element in `a` and `b`, 
/// and store the unsigned 64-bit results.
__m128i _mm_mul_epu32 (__m128i a, __m128i b) pure @trusted
{    
    // PERF DMD D_SIMD
    static if (GDC_with_SSE2)
    {
        return cast(__m128i) __builtin_ia32_pmuludq128 (a, b);
    }
    else
    {
        version(LDC)
        {
            static if (__VERSION__ >= 2088)
            {
                // Need LLVM9 for proper optimization
                // Zero-extend 32-bit lanes 0 and 2 of each operand into 64-bit lanes.
                long2 la, lb;
                la.ptr[0] = cast(uint)a.array[0];
                la.ptr[1] = cast(uint)a.array[2];
                lb.ptr[0] = cast(uint)b.array[0];
                lb.ptr[1] = cast(uint)b.array[2];
            }
            else
            {
                // Older LDC: build the zero-extended lanes by shuffling against zero.
                __m128i zero;
                zero = 0;
                long2 la = cast(long2) shufflevectorLDC!(int4, 0, 4, 2, 6)(a, zero);
                long2 lb = cast(long2) shufflevectorLDC!(int4, 0, 4, 2, 6)(b, zero);
            }
        }
        else
        {
            // Generic path: same zero-extension of lanes 0 and 2 as above.
            long2 la, lb;
            la.ptr[0] = cast(uint)a.array[0];
            la.ptr[1] = cast(uint)a.array[2];
            lb.ptr[0] = cast(uint)b.array[0];
            lb.ptr[1] = cast(uint)b.array[2];
        }

        version(DigitalMars)
        {
            // DMD has no long2 mul
            la.ptr[0] *= lb.array[0];
            la.ptr[1] *= lb.array[1];
            return cast(__m128i)(la);
        }
        else
        {
            static if (__VERSION__ >= 2076)
            {
                return cast(__m128i)(la * lb);
            }
            else
            {
                // long2 mul not supported before LDC 1.5
                la.ptr[0] *= lb.array[0];
                la.ptr[1] *= lb.array[1];
                return cast(__m128i)(la);
            }
        }
    }
}
unittest
{
    __m128i A = _mm_set_epi32(42, 0xDEADBEEF, 42, 0xffffffff);
    __m128i B = _mm_set_epi32(42, 0xCAFEBABE, 42, 0xffffffff);
    __m128i C = _mm_mul_epu32(A, B);
    long2 LC = cast(long2)C;
    assert(LC.array[0] == 18446744065119617025uL);
    assert(LC.array[1] == 12723420444339690338uL);
}
2892 
/// Multiply packed double-precision (64-bit) floating-point elements in `a` and `b`, and return the results. 
__m128d _mm_mul_pd(__m128d a, __m128d b) pure @safe
{
    pragma(inline, true);
    // Element-wise vector multiply; maps directly to a packed multiply.
    __m128d product = a * b;
    return product;
}
unittest
{
    __m128d x = [-2.0, 1.5];
    x = _mm_mul_pd(x, x);
    assert(x.array == [4.0, 2.25]);
}
2905 
/// Multiply the lower double-precision (64-bit) floating-point element in `a` and `b`, store the result in the lower 
/// element of result, and copy the upper element from `a` to the upper element of result.
__m128d _mm_mul_sd(__m128d a, __m128d b) pure @trusted
{
    version(DigitalMars)
    {    
        // Work-around for https://issues.dlang.org/show_bug.cgi?id=19599
        // Note that this is unneeded since DMD >= 2.094.0 at least, haven't investigated again
        // The `nop` asm block acts as a barrier so DMD doesn't miscompile the
        // low-lane update (see the linked issue).
        asm pure nothrow @nogc @trusted { nop;}
        a.array[0] = a.array[0] * b.array[0];
        return a;
    }
    else static if (GDC_with_SSE2)
    {
        return __builtin_ia32_mulsd(a, b);
    }
    else
    {
        // Scalar multiply in the low lane; the high lane of `a` passes through.
        a.ptr[0] *= b.array[0];
        return a;
    }
}
unittest
{
    __m128d a = [-2.0, 1.5];
    a = _mm_mul_sd(a, a);
    assert(a.array == [4.0, 1.5]);
}
2934 
/// Multiply the low unsigned 32-bit integers from `a` and `b`, 
/// and get an unsigned 64-bit result.
__m64 _mm_mul_su32 (__m64 a, __m64 b) pure @safe
{
    // Widen both operands to 128-bit, reuse the SSE2 variant, then narrow back.
    __m128i wideA = to_m128i(a);
    __m128i wideB = to_m128i(b);
    return to_m64(_mm_mul_epu32(wideA, wideB));
}
unittest
{
    __m64 A = _mm_set_pi32(42, 0xDEADBEEF);
    __m64 B = _mm_set_pi32(42, 0xCAFEBABE);
    assert(_mm_mul_su32(A, B).array[0] == 0xDEADBEEFuL * 0xCAFEBABEuL);
}
2948 
/// Multiply the packed signed 16-bit integers in `a` and `b`, producing intermediate 32-bit integers, and return the 
/// high 16 bits of the intermediate integers.
__m128i _mm_mulhi_epi16 (__m128i a, __m128i b) pure @trusted
{
    static if (GDC_or_LDC_with_SSE2)
    {
        // The GDC and LDC branches were byte-identical; merged under the
        // combined guard (same pattern as _mm_movemask_pd).
        return cast(__m128i) __builtin_ia32_pmulhw128(cast(short8)a, cast(short8)b);
    }
    else
    {
        // ARM64: LDC 1.5 -O2 or later gives a nice sequence with 2 x ext.16b, 2 x smull.4s and shrn.4h shrn2.8h
        //        PERF: it seems the simde solution has one less instruction in ARM64.
        // PERF: Catastrophic in ARM32.
        // Each lane: full 32-bit signed product, keep the high 16 bits.
        short8 sa = cast(short8)a;
        short8 sb = cast(short8)b;
        short8 r = void;
        r.ptr[0] = (sa.array[0] * sb.array[0]) >> 16;
        r.ptr[1] = (sa.array[1] * sb.array[1]) >> 16;
        r.ptr[2] = (sa.array[2] * sb.array[2]) >> 16;
        r.ptr[3] = (sa.array[3] * sb.array[3]) >> 16;
        r.ptr[4] = (sa.array[4] * sb.array[4]) >> 16;
        r.ptr[5] = (sa.array[5] * sb.array[5]) >> 16;
        r.ptr[6] = (sa.array[6] * sb.array[6]) >> 16;
        r.ptr[7] = (sa.array[7] * sb.array[7]) >> 16;
        return cast(__m128i)r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi16(0, -16, 2, 3, 4, 8, 16, 7);
    __m128i B = _mm_set1_epi16(16384);
    short8 R = cast(short8)_mm_mulhi_epi16(A, B);
    short[8] correct = [0, -4, 0, 0, 1, 2, 4, 1];
    assert(R.array == correct);
}
2988 
/// Multiply the packed unsigned 16-bit integers in `a` and `b`, producing intermediate 32-bit integers, and return the 
/// high 16 bits of the intermediate integers.
__m128i _mm_mulhi_epu16 (__m128i a, __m128i b) pure @trusted
{
    static if (GDC_or_LDC_with_SSE2)
    {
        // The GDC and LDC branches were byte-identical; merged under the
        // combined guard (same pattern as _mm_movemask_pd).
        return cast(__m128i) __builtin_ia32_pmulhuw128(cast(short8)a, cast(short8)b);
    }
    else
    {
        // ARM64: LDC 1.5 -O2 or later gives a nice sequence with 2 x ext.16b, 2 x umull.4s and shrn.4h shrn2.8h
        //      it seems the simde solution has one less instruction in ARM64
        // PERF: Catastrophic in ARM32.
        // Each lane: full 32-bit unsigned product, keep the high 16 bits.
        short8 sa = cast(short8)a;
        short8 sb = cast(short8)b;
        short8 r = void;
        r.ptr[0] = cast(short)( (cast(ushort)sa.array[0] * cast(ushort)sb.array[0]) >> 16 );
        r.ptr[1] = cast(short)( (cast(ushort)sa.array[1] * cast(ushort)sb.array[1]) >> 16 );
        r.ptr[2] = cast(short)( (cast(ushort)sa.array[2] * cast(ushort)sb.array[2]) >> 16 );
        r.ptr[3] = cast(short)( (cast(ushort)sa.array[3] * cast(ushort)sb.array[3]) >> 16 );
        r.ptr[4] = cast(short)( (cast(ushort)sa.array[4] * cast(ushort)sb.array[4]) >> 16 );
        r.ptr[5] = cast(short)( (cast(ushort)sa.array[5] * cast(ushort)sb.array[5]) >> 16 );
        r.ptr[6] = cast(short)( (cast(ushort)sa.array[6] * cast(ushort)sb.array[6]) >> 16 );
        r.ptr[7] = cast(short)( (cast(ushort)sa.array[7] * cast(ushort)sb.array[7]) >> 16 );
        return cast(__m128i)r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi16(0, -16, 2, 3, 4, 8, 16, 7);
    __m128i B = _mm_set1_epi16(16384);
    short8 R = cast(short8)_mm_mulhi_epu16(A, B);
    short[8] correct = [0, 0x3FFC, 0, 0, 1, 2, 4, 1];
    assert(R.array == correct);
}
3028 
/// Multiply the packed 16-bit integers in `a` and `b`, producing intermediate 32-bit integers, and return the low 16 
/// bits of the intermediate integers.
__m128i _mm_mullo_epi16 (__m128i a, __m128i b) pure @safe
{
    // A wrapping short8 multiply already keeps only the low 16 bits per lane.
    short8 sa = cast(short8)a;
    short8 sb = cast(short8)b;
    return cast(__m128i)(sa * sb);
}
unittest
{
    __m128i A = _mm_setr_epi16(16384, -16, 0,      3, 4, 1, 16, 7);
    __m128i B = _mm_set1_epi16(16384);
    short8 R = cast(short8)_mm_mullo_epi16(A, B);
    short[8] correct = [0, 0, 0, -16384, 0, 16384, 0, -16384];
    assert(R.array == correct);
}
3043 
/// Compute the bitwise NOT of 128 bits in `a`. #BONUS
__m128i _mm_not_si128 (__m128i a) pure @safe
{
    // Per-lane complement flips all 128 bits.
    __m128i flipped = ~a;
    return flipped;
}
unittest
{
    __m128i A = _mm_set1_epi32(-748);
    int4 notA = cast(int4) _mm_not_si128(A);
    int[4] correct = [747, 747, 747, 747];
    assert(notA.array == correct);
}
3056 
/// Compute the bitwise OR of packed double-precision (64-bit) floating-point elements in `a` and `b`.
__m128d _mm_or_pd (__m128d a, __m128d b) pure @safe
{
    pragma(inline, true);
    // OR the raw bit patterns, then view the result as doubles again.
    __m128i bits = cast(__m128i)a | cast(__m128i)b;
    return cast(__m128d)bits;
}
3063 
/// Compute the bitwise OR of 128 bits (representing integer data) in `a` and `b`.
__m128i _mm_or_si128 (__m128i a, __m128i b) pure @safe
{
    pragma(inline, true);
    __m128i r = a | b;
    return r;
}
3070 
/// Convert packed signed 32-bit integers from `a` and `b` to packed 16-bit integers using signed saturation.
__m128i _mm_packs_epi32 (__m128i a, __m128i b) pure @trusted
{
    static if (DMD_with_DSIMD)
    {
        return cast(__m128i) __simd(XMM.PACKSSDW, a, b);
    }
    else static if (GDC_or_LDC_with_SSE2)
    {
        // The GDC and LDC branches were byte-identical; merged under the
        // combined guard (same pattern as _mm_movemask_pd).
        return cast(__m128i) __builtin_ia32_packssdw128(a, b);
    }
    else static if (LDC_with_ARM64)
    {
        short4 ra = vqmovn_s32(cast(int4)a);
        short4 rb = vqmovn_s32(cast(int4)b);
        return cast(__m128i)vcombine_s16(ra, rb);
    }
    else
    {
        // PERF: catastrophic on ARM32
        // Low 4 lanes come from `a`, high 4 lanes from `b`, each clamped to short range.
        short8 r;
        r.ptr[0] = saturateSignedIntToSignedShort(a.array[0]);
        r.ptr[1] = saturateSignedIntToSignedShort(a.array[1]);
        r.ptr[2] = saturateSignedIntToSignedShort(a.array[2]);
        r.ptr[3] = saturateSignedIntToSignedShort(a.array[3]);
        r.ptr[4] = saturateSignedIntToSignedShort(b.array[0]);
        r.ptr[5] = saturateSignedIntToSignedShort(b.array[1]);
        r.ptr[6] = saturateSignedIntToSignedShort(b.array[2]);
        r.ptr[7] = saturateSignedIntToSignedShort(b.array[3]);
        return cast(__m128i)r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi32(100000, -100000, 1000, 0);
    short8 R = cast(short8) _mm_packs_epi32(A, A);
    short[8] correct = [32767, -32768, 1000, 0, 32767, -32768, 1000, 0];
    assert(R.array == correct);
}
3114 
/// Convert packed signed 16-bit integers from `a` and `b` to packed 8-bit integers using signed saturation.
__m128i _mm_packs_epi16 (__m128i a, __m128i b) pure @trusted
{
    static if (DMD_with_DSIMD)
    {
        return cast(__m128i) __simd(XMM.PACKSSWB, a, b);
    }
    else static if (GDC_or_LDC_with_SSE2)
    {
        // The GDC and LDC branches were byte-identical; merged under the
        // combined guard (same pattern as _mm_movemask_pd).
        return cast(__m128i) __builtin_ia32_packsswb128(cast(short8)a, cast(short8)b);
    }
    else static if (LDC_with_ARM64)
    {
        // generate a nice pair of sqxtn.8b + sqxtn2 since LDC 1.5 -02
        byte8 ra = vqmovn_s16(cast(short8)a);
        byte8 rb = vqmovn_s16(cast(short8)b);
        return cast(__m128i)vcombine_s8(ra, rb);
    }
    else
    {
        // PERF: ARM32 is missing
        // Low 8 bytes come from `a`, high 8 bytes from `b`, each clamped to byte range.
        byte16 r;
        short8 sa = cast(short8)a;
        short8 sb = cast(short8)b;
        foreach(i; 0..8)
            r.ptr[i] = saturateSignedWordToSignedByte(sa.array[i]);
        foreach(i; 0..8)
            r.ptr[i+8] = saturateSignedWordToSignedByte(sb.array[i]);
        return cast(__m128i)r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi16(1000, -1000, 1000, 0, 256, -129, 254, 0);
    byte16 R = cast(byte16) _mm_packs_epi16(A, A);
    byte[16] correct = [127, -128, 127, 0, 127, -128, 127, 0,
                        127, -128, 127, 0, 127, -128, 127, 0];
    assert(R.array == correct);
}
3158 
/// Convert packed signed 16-bit integers from `a` and `b` to packed 8-bit integers using unsigned saturation.
__m128i _mm_packus_epi16 (__m128i a, __m128i b) pure @trusted
{
    // PERF DMD catastrophic
    static if (DMD_with_DSIMD)
    {
        return cast(__m128i) __simd(XMM.PACKUSWB, a, b);
    }
    else static if (GDC_or_LDC_with_SSE2)
    {
        // The GDC and LDC branches were byte-identical; merged under the
        // combined guard (same pattern as _mm_movemask_pd).
        return cast(__m128i) __builtin_ia32_packuswb128(cast(short8)a, cast(short8)b);
    }
    else static if (LDC_with_ARM64)
    {
        // generate a nice pair of sqxtun + sqxtun2 since LDC 1.5 -02
        byte8 ra = vqmovun_s16(cast(short8)a);
        byte8 rb = vqmovun_s16(cast(short8)b);
        return cast(__m128i)vcombine_s8(ra, rb);
    }
    else
    {
        // Scalar fallback: clamp each signed short into [0, 255];
        // low 8 bytes from `a`, high 8 bytes from `b`.
        short8 sa = cast(short8)a;
        short8 sb = cast(short8)b;
        align(16) ubyte[16] result = void;
        for (int i = 0; i < 8; ++i)
        {
            short s = sa[i];
            if (s < 0) s = 0;
            if (s > 255) s = 255;
            result[i] = cast(ubyte)s;

            s = sb[i];
            if (s < 0) s = 0;
            if (s > 255) s = 255;
            result[i+8] = cast(ubyte)s;
        }
        return *cast(__m128i*)(result.ptr);
    }
}
unittest
{
    __m128i A = _mm_setr_epi16(-10, 400, 0, 256, 255, 2, 1, 0);
    byte16 AA = cast(byte16) _mm_packus_epi16(A, A);
    static immutable ubyte[16] correctResult = [0, 255, 0, 255, 255, 2, 1, 0,
                                                0, 255, 0, 255, 255, 2, 1, 0];
    foreach(i; 0..16)
        assert(AA.array[i] == cast(byte)(correctResult[i]));
}
3211 
/// Provide a hint to the processor that the code sequence is a spin-wait loop. This can help improve the performance 
/// and power consumption of spin-wait loops.
void _mm_pause() @trusted
{
    version(GNU)
    {
        static if (GDC_with_SSE2)
        {
            __builtin_ia32_pause();
        }
        else version(X86)
        {
            // GDC on x86 without SSE2 enabled: emit the instruction textually.
            asm pure nothrow @nogc @trusted
            {
                "pause;\n" : : : ;
            }
        }
        else __warn_noop();
    }
    else static if (LDC_with_SSE2)
    {
        __builtin_ia32_pause();
    }
    else static if (DMD_with_asm)
    {
        // DMD's inline assembler has no `pause` mnemonic; encode it as rep+nop.
        asm nothrow @nogc pure @trusted
        {
            rep; nop; // F3 90 =  pause
        }
    }
    else version (LDC)
    {
        // PERF: Do nothing currently, could be the "yield" instruction on ARM.
    }
    else
        static assert(false);
}
unittest
{
    _mm_pause();
}
3253 
/// Compute the absolute differences of packed unsigned 8-bit integers in `a` and `b`, then horizontally sum each 
/// consecutive 8 differences to produce two unsigned 16-bit integers, and pack these unsigned 16-bit integers in the 
/// low 16 bits of 64-bit elements in result.
__m128i _mm_sad_epu8 (__m128i a, __m128i b) pure @trusted
{
    static if (GDC_with_SSE2)
    {
        return cast(__m128i) __builtin_ia32_psadbw128(cast(ubyte16)a, cast(ubyte16)b);
    }
    else static if (LDC_with_SSE2)
    {
        return cast(__m128i) __builtin_ia32_psadbw128(cast(byte16)a, cast(byte16)b);
    }
    else static if (LDC_with_ARM64)
    {
        // vabdq_u8: per-byte absolute difference; vpaddlq_u8: widening pairwise add to 16-bit.
        ushort8 t = cast(ushort8) vpaddlq_u8(vabdq_u8(cast(byte16) a, cast(byte16) b));

        // PERF: Looks suboptimal vs addp
        ushort r0 = cast(ushort)(t[0] + t[1] + t[2] + t[3]);
        ushort r4 = cast(ushort)(t[4] + t[5] + t[6] + t[7]);
        ushort8 r = 0;
        r[0] = r0;
        r[4] = r4;
        return cast(__m128i) r;
    }
    else
    {
        // PERF: ARM32 is lacking
        // Scalar fallback: per-byte absolute differences, then one 8-byte sum per half.
        byte16 ab = cast(byte16)a;
        byte16 bb = cast(byte16)b;
        ubyte[16] t;
        foreach(i; 0..16)
        {
            int diff = cast(ubyte)(ab.array[i]) - cast(ubyte)(bb.array[i]);
            if (diff < 0) diff = -diff;
            t[i] = cast(ubyte)(diff);
        }
        // Each sum is at most 16*255, so it fits the low 16 bits of int lanes 0 and 2.
        int4 r = _mm_setzero_si128();
        r.ptr[0] = t[0] + t[1] + t[2] + t[3] + t[4] + t[5] + t[6] + t[7];
        r.ptr[2] = t[8] + t[9] + t[10]+ t[11]+ t[12]+ t[13]+ t[14]+ t[15];
        return r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi8(3, 4, 6, 8, 12, 14, 18, 20, 24, 30, 32, 38, 42, 44, 48, 54); // primes + 1
    __m128i B = _mm_set1_epi8(1);
    __m128i R = _mm_sad_epu8(A, B);
    int[4] correct = [2 + 3 + 5 + 7 + 11 + 13 + 17 + 19,
                      0,
                      23 + 29 + 31 + 37 + 41 + 43 + 47 + 53,
                      0];
    assert(R.array == correct);
}
3308 
/// Set packed 16-bit integers with the supplied values.
__m128i _mm_set_epi16 (short e7, short e6, short e5, short e4, short e3, short e2, short e1, short e0) pure @trusted
{
    // `e0` lands in the lowest lane, matching the Intel argument convention.
    align(16) short[8] lanes = [e0, e1, e2, e3, e4, e5, e6, e7];
    return *cast(__m128i*)(lanes.ptr);
}
unittest
{
    short8 B = cast(short8) _mm_set_epi16(7, 6, 5, 4, 3, 2, 1, 0);
    short[8] correct = [0, 1, 2, 3, 4, 5, 6, 7];
    assert(B.array == correct);
}
3330 
/// Set packed 32-bit integers with the supplied values.
__m128i _mm_set_epi32 (int e3, int e2, int e1, int e0) pure @trusted
{
    // PERF: does a constant inline correctly? vs int4 field assignment
    align(16) int[4] r = [e0, e1, e2, e3];
    return *cast(int4*)&r;
}
unittest
{
    __m128i A = _mm_set_epi32(3, 2, 1, 0);
    foreach(i; 0..4)
        assert(A.array[i] == i);
        
    // NOTE(review): this CTFE check exercises _mm_setr_epi32, not _mm_set_epi32 —
    // looks like a copy-paste; confirm whether _mm_set_epi32 is meant to be CTFE-able.
    static if (__VERSION__ >= 2094)
        enum __m128i B = _mm_setr_epi32(0, 1, 2, 3);
}
3347 
/// Set packed 64-bit integers with the supplied values.
__m128i _mm_set_epi64(__m64 e1, __m64 e0) pure @trusted
{
    pragma(inline, true);
    // `e0` goes to the low lane, `e1` to the high lane.
    long2 combined = void;
    combined.ptr[0] = e0.array[0];
    combined.ptr[1] = e1.array[0];
    return cast(__m128i)combined;
}
unittest
{
    long2 B = cast(long2) _mm_set_epi64(_mm_cvtsi64_m64(1234), _mm_cvtsi64_m64(5678));
    assert(B.array[0] == 5678);
    assert(B.array[1] == 1234);
}
3364 
/// Set packed 64-bit integers with the supplied values.
__m128i _mm_set_epi64x (long e1, long e0) pure @trusted
{
    pragma(inline, true);
    // `e0` goes to the low lane, `e1` to the high lane.
    long2 combined = void;
    combined.ptr[0] = e0;
    combined.ptr[1] = e1;
    return cast(__m128i)combined;
}
unittest
{
    long2 B = cast(long2) _mm_set_epi64x(1234, -5678);
    assert(B.array[0] == -5678);
    assert(B.array[1] == 1234);
}
3381 
/// Set packed 8-bit integers with the supplied values.
__m128i _mm_set_epi8 (byte e15, byte e14, byte e13, byte e12,
                      byte e11, byte e10, byte e9, byte e8,
                      byte e7, byte e6, byte e5, byte e4,
                      byte e3, byte e2, byte e1, byte e0) pure @trusted
{
    // `e0` lands in the lowest byte, matching the Intel argument convention.
    align(16) byte[16] lanes = [e0, e1,  e2,  e3,  e4,  e5,  e6, e7,
                                e8, e9, e10, e11, e12, e13, e14, e15];
    return *cast(__m128i*)(lanes.ptr);
}
unittest
{
    byte16 R = cast(byte16) _mm_set_epi8(-1, 0, 56, 127, -128, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14);
    byte[16] correct = [14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, -128, 127, 56, 0, -1];
    assert(R.array == correct);
}
3398 
/// Set packed double-precision (64-bit) floating-point elements with the supplied values.
__m128d _mm_set_pd (double e1, double e0) pure @trusted
{
    pragma(inline, true);
    // `e0` goes to the low lane, `e1` to the high lane.
    double2 packed = void;
    packed.ptr[0] = e0;
    packed.ptr[1] = e1;
    return packed;
}
unittest
{
    double[2] correct = [55.0, 61.0];
    assert(_mm_set_pd(61.0, 55.0).array == correct);
}
3414 
/// Broadcast double-precision (64-bit) floating-point value `a` to all element.
__m128d _mm_set_pd1 (double a) pure @trusted
{
    pragma(inline, true);
    // Duplicate the scalar into both lanes.
    __m128d bcast = void;
    bcast.ptr[0] = a;
    bcast.ptr[1] = a;
    return bcast;
}
unittest
{
    double[2] correct = [61.0, 61.0];
    assert(_mm_set_pd1(61.0).array == correct);
}
3430 
/// Copy double-precision (64-bit) floating-point element `a` to the lower element of result, 
/// and zero the upper element.
__m128d _mm_set_sd (double a) pure @trusted
{
    // Scalar in the low lane, zero above.
    double2 r = void;
    r.ptr[1] = 0.0;
    r.ptr[0] = a;
    return r;
}
unittest
{
    double[2] correct = [61.0, 0.0];
    assert(_mm_set_sd(61.0).array == correct);
}
3446 
/// Broadcast 16-bit integer a to all elements of dst.
__m128i _mm_set1_epi16 (short a) pure @trusted
{
    version(DigitalMars) // workaround https://issues.dlang.org/show_bug.cgi?id=21469 
    {
        // The named variable (instead of a short8(a) temporary) sidesteps the DMD bug.
        short8 v = a;
        return cast(__m128i) v;
    }
    else
    {
        pragma(inline, true);
        return cast(__m128i)(short8(a));
    }
}
unittest
{
    short8 a = cast(short8) _mm_set1_epi16(31);
    for (int i = 0; i < 8; ++i)
        assert(a.array[i] == 31);
}
3467 
/// Broadcast 32-bit integer `a` to all elements.
__m128i _mm_set1_epi32 (int a) pure @trusted
{
    pragma(inline, true);
    // Scalar-to-vector assignment broadcasts `a` to every lane
    // (same named-variable form as _mm_set1_epi64x / _mm_set1_epi8).
    int4 splat = a;
    return cast(__m128i)splat;
}
unittest
{
    int4 A = cast(int4) _mm_set1_epi32(31);
    foreach(i; 0..4)
        assert(A.array[i] == 31);

    // compile-time should work
    static if (__VERSION__ >= 2094)
        enum __m128i B = _mm_set1_epi32(3); 
}
3484 
/// Broadcast 64-bit integer `a` to all elements.
__m128i _mm_set1_epi64 (__m64 a) pure @safe
{
    // Duplicate the single 64-bit lane into both halves.
    return _mm_set_epi64(a, a);
}
unittest
{
    long b = 0x1DEADCAFE; 
    __m64 a;
    a.ptr[0] = b;
    long2 c = cast(long2) _mm_set1_epi64(a);
    assert(c.array[0] == b);
    assert(c.array[1] == b);
}
3499 
/// Broadcast 64-bit integer `a` to all elements
__m128i _mm_set1_epi64x (long a) pure @trusted
{
    // Scalar-to-vector assignment broadcasts `a` to both lanes.
    long2 b = a; // Must be on its own line to workaround https://issues.dlang.org/show_bug.cgi?id=21470
    return cast(__m128i)(b);
}
unittest
{
    long b = 0x1DEADCAFE;
    long2 c = cast(long2) _mm_set1_epi64x(b);
    for (int i = 0; i < 2; ++i)
        assert(c.array[i] == b);
}
3513 
/// Broadcast 8-bit integer `a` to all elements.
__m128i _mm_set1_epi8 (byte a) pure @trusted
{
    pragma(inline, true);
    // Scalar-to-vector assignment broadcasts `a` to all 16 lanes.
    byte16 b = a; // Must be on its own line to workaround https://issues.dlang.org/show_bug.cgi?id=21470
    return cast(__m128i)(b);
}
unittest
{
    byte16 b = cast(byte16) _mm_set1_epi8(31);
    for (int i = 0; i < 16; ++i)
        assert(b.array[i] == 31);
}
3527 
/// Broadcast double-precision (64-bit) floating-point value to all elements; same as `_mm_set_pd1`.
alias _mm_set1_pd = _mm_set_pd1;
3529 
/// Set packed 16-bit integers with the supplied values in reverse order.
__m128i _mm_setr_epi16 (short e7, short e6, short e5, short e4, 
                        short e3, short e2, short e1, short e0) pure @trusted
{
    // Reverse order: the first argument (e7) is stored in the lowest lane.
    align(16) short[8] lanes = [e7, e6, e5, e4, e3, e2, e1, e0];
    return *cast(__m128i*)(lanes.ptr);
}
unittest
{
    short8 A = cast(short8) _mm_setr_epi16(7, 6, 5, -32768, 32767, 2, 1, 0);
    short[8] correct = [7, 6, 5, -32768, 32767, 2, 1, 0];
    assert(A.array == correct);
}
3551 
/// Set packed 32-bit integers with the supplied values in reverse order.
__m128i _mm_setr_epi32 (int e3, int e2, int e1, int e0) pure @trusted
{
    // Two paths: the CTFE branch builds the vector lane by lane, presumably
    // because the pointer reinterpret in the runtime path isn't CTFE-able.
    if (__ctfe)
    {
        __m128i r;
        r.ptr[0] = e3;
        r.ptr[1] = e2;
        r.ptr[2] = e1;
        r.ptr[3] = e0;
        return r;
    }
    else
    {
        // Performs better than = void; with GDC
        pragma(inline, true);
        align(16) int[4] result = [e3, e2, e1, e0];
        return *cast(__m128i*)(result.ptr);
    }
}
unittest
{
    int4 A = cast(int4) _mm_setr_epi32(-1, 0, -2147483648, 2147483647);
    int[4] correct = [-1, 0, -2147483648, 2147483647];
    assert(A.array == correct);
    
    // compile-time should work
    static if (__VERSION__ >= 2094)
        enum __m128i B = _mm_setr_epi32(0, 1, 2, 3);
}
3582 
/// Set packed 64-bit integers with the supplied values in reverse order.
__m128i _mm_setr_epi64 (long e1, long e0) pure @trusted
{
    // Reverse order: the first argument lands in the low lane.
    long2 pair = void;
    pair.ptr[0] = e1;
    pair.ptr[1] = e0;
    return cast(__m128i)pair;
}
unittest
{
    long2 A = cast(long2) _mm_setr_epi64(-1, 0);
    long[2] correct = [-1, 0];
    assert(A.array == correct);
}
3597 
/// Set packed 8-bit integers with the supplied values in reverse order.
__m128i _mm_setr_epi8 (byte e15, byte e14, byte e13, byte e12,
                       byte e11, byte e10, byte e9,  byte e8,
                       byte e7,  byte e6,  byte e5,  byte e4,
                       byte e3,  byte e2,  byte e1,  byte e0) pure @trusted
{
    // Reverse order: the first argument (e15) is stored in the lowest byte.
    align(16) byte[16] lanes = [e15, e14, e13, e12, e11, e10, e9, e8,
                                e7,  e6,  e5,  e4,  e3,  e2, e1, e0];
    return *cast(__m128i*)(lanes.ptr);
}
unittest
{
    byte16 R = cast(byte16) _mm_setr_epi8(-1, 0, 56, 127, -128, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14);
    byte[16] correct = [-1, 0, 56, 127, -128, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14];
    assert(R.array == correct);
}
3614 
/// Set packed double-precision (64-bit) floating-point elements with the supplied values in reverse order.
__m128d _mm_setr_pd (double e1, double e0) pure @trusted
{
    pragma(inline, true);
    // Reverse order: the first argument lands in the low lane.
    double2 r = [e1, e0];
    return r;
}
unittest
{
    double[2] correct = [61.0, 55.0];
    assert(_mm_setr_pd(61.0, 55.0).array == correct);
}
3630 
/// Return vector of type `__m128d` with all elements set to zero.
__m128d _mm_setzero_pd() pure @trusted
{
    pragma(inline, true);
    double2 zero = [0.0, 0.0];
    return zero;
}
unittest
{
    double[2] correct = [0.0, 0.0];
    assert(_mm_setzero_pd().array == correct);
}
3646 
/// Return vector of type `__m128i` with all elements set to zero.
__m128i _mm_setzero_si128() pure @trusted
{
    pragma(inline, true);
    int4 zero = [0, 0, 0, 0];
    return zero;
}
unittest
{
    int[4] correct = [0, 0, 0, 0];
    assert(_mm_setzero_si128().array == correct);
}
3664 
/// Shuffle 32-bit integers in `a` using the control in `imm8`.
/// Each 2-bit field of `imm8` (bits 1:0, 3:2, 5:4, 7:6) selects the source
/// lane of `a` for result lanes 0 through 3 respectively.
/// See_also: `_MM_SHUFFLE`.
__m128i _mm_shuffle_epi32(int imm8)(__m128i a) pure @trusted
{
    // PERF DMD D_SIMD
    static if (GDC_with_SSE2)
    {
        // GDC: maps directly to the PSHUFD builtin.
        return __builtin_ia32_pshufd(a, imm8);
    }
    else static if (LDC_with_optimizations)
    {
        // LDC: a compile-time shufflevector; the optimizer recognizes it as PSHUFD.
        return shufflevectorLDC!(int4, (imm8 >> 0) & 3,
                                 (imm8 >> 2) & 3,
                                 (imm8 >> 4) & 3,
                                 (imm8 >> 6) & 3)(a, a);
    }
    else
    {
        // Portable fallback: select one source lane per result lane.
        int4 r = void;
        r.ptr[0] = a.ptr[(imm8 >> 0) & 3];
        r.ptr[1] = a.ptr[(imm8 >> 2) & 3];
        r.ptr[2] = a.ptr[(imm8 >> 4) & 3];
        r.ptr[3] = a.ptr[(imm8 >> 6) & 3];
        return r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi32(0, 1, 2, 3);
    enum int SHUFFLE = _MM_SHUFFLE(0, 1, 2, 3);
    int4 B = cast(int4) _mm_shuffle_epi32!SHUFFLE(A);
    int[4] expectedB = [ 3, 2, 1, 0 ];
    assert(B.array == expectedB);
}
3699 
/// Shuffle double-precision (64-bit) floating-point elements using the control in `imm8`.
/// Bit 0 of `imm8` selects the lane of `a` for the low result lane;
/// bit 1 selects the lane of `b` for the high result lane.
/// See_also: `_MM_SHUFFLE2`.
__m128d _mm_shuffle_pd (int imm8)(__m128d a, __m128d b) pure @trusted
{
    // PERF DMD D_SIMD
    static if (GDC_with_SSE2)
    {
        // GDC: maps directly to the SHUFPD builtin.
        return __builtin_ia32_shufpd(a, b, imm8);
    }
    else version(LDC)
    {
        // LDC: indices 0-1 address lanes of `a`, 2-3 address lanes of `b`.
        return shufflevectorLDC!(double2, 0 + ( imm8 & 1 ),
                                 2 + ( (imm8 >> 1) & 1 ))(a, b);
    }
    else
    {
        // Portable fallback.
        double2 r = void;
        r.ptr[0] = a.array[imm8 & 1];
        r.ptr[1] = b.array[(imm8 >> 1) & 1];
        return r;
    }
}
unittest
{
    __m128d A = _mm_setr_pd(0.5, 2.0);
    __m128d B = _mm_setr_pd(4.0, 5.0);
    enum int SHUFFLE = _MM_SHUFFLE2(1, 1);
    __m128d R = _mm_shuffle_pd!SHUFFLE(A, B);
    double[2] correct = [ 2.0, 5.0 ];
    assert(R.array == correct);
}
3731 
/// Shuffle 16-bit integers in the high 64 bits of `a` using the control in `imm8`. Store the results in the high 
/// 64 bits of result, with the low 64 bits being copied from from `a` to result.
/// Each 2-bit field of `imm8` picks one of lanes 4..7 of `a` for result lanes 4..7.
/// See also: `_MM_SHUFFLE`.
__m128i _mm_shufflehi_epi16(int imm8)(__m128i a) pure @trusted
{
    static if (DMD_with_DSIMD)
    {
        // DMD: emit PSHUFHW directly.
        return cast(__m128i) __simd(XMM.PSHUFHW, a, a, cast(ubyte)imm8);
    }
    else static if (GDC_with_SSE2)
    {
        return cast(__m128i) __builtin_ia32_pshufhw(cast(short8)a, imm8);
    }
    else static if (LDC_with_optimizations)
    {
        // Lanes 0..3 pass through unchanged; lanes 4..7 are selected by imm8.
        return cast(__m128i) shufflevectorLDC!(short8, 0, 1, 2, 3,
                                          4 + ( (imm8 >> 0) & 3 ),
                                          4 + ( (imm8 >> 2) & 3 ),
                                          4 + ( (imm8 >> 4) & 3 ),
                                          4 + ( (imm8 >> 6) & 3 ))(cast(short8)a, cast(short8)a);
    }
    else
    {
        // Portable fallback: start from a copy of `a` so the low half is preserved,
        // then rewrite only the high four lanes.
        short8 r = cast(short8)a;
        short8 sa = cast(short8)a;
        r.ptr[4] = sa.array[4 + ( (imm8 >> 0) & 3 ) ];
        r.ptr[5] = sa.array[4 + ( (imm8 >> 2) & 3 ) ];
        r.ptr[6] = sa.array[4 + ( (imm8 >> 4) & 3 ) ];
        r.ptr[7] = sa.array[4 + ( (imm8 >> 6) & 3 ) ];
        return cast(__m128i) r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
    enum int SHUFFLE = _MM_SHUFFLE(0, 1, 2, 3);
    short8 C = cast(short8) _mm_shufflehi_epi16!SHUFFLE(A);
    short[8] expectedC = [ 0, 1, 2, 3, 7, 6, 5, 4 ];
    assert(C.array == expectedC);
}
3772 
/// Shuffle 16-bit integers in the low 64 bits of `a` using the control in `imm8`. Store the results in the low 64 
/// bits of result, with the high 64 bits being copied from from `a` to result.
/// Each 2-bit field of `imm8` picks one of lanes 0..3 of `a` for result lanes 0..3.
/// See_also: `_MM_SHUFFLE`.
__m128i _mm_shufflelo_epi16(int imm8)(__m128i a) pure @trusted
{
    static if (DMD_with_DSIMD)
    {
        // DMD: emit PSHUFLW directly.
        return cast(__m128i) __simd(XMM.PSHUFLW, a, a, cast(ubyte)imm8);
    }
    else static if (GDC_with_SSE2)
    {
        return cast(__m128i) __builtin_ia32_pshuflw(cast(short8)a, imm8);
    }
    else static if (LDC_with_optimizations)
    {
        // Lanes 0..3 are selected by imm8; lanes 4..7 pass through unchanged.
        return cast(__m128i) shufflevectorLDC!(short8, ( (imm8 >> 0) & 3 ),
                                                       ( (imm8 >> 2) & 3 ),
                                                       ( (imm8 >> 4) & 3 ),
                                                       ( (imm8 >> 6) & 3 ), 4, 5, 6, 7)(cast(short8)a, cast(short8)a);
    }
    else
    {
        // Portable fallback: start from a copy of `a` so the high half is preserved,
        // then rewrite only the low four lanes.
        short8 r = cast(short8)a;
        short8 sa = cast(short8)a;
        r.ptr[0] = sa.array[(imm8 >> 0) & 3];
        r.ptr[1] = sa.array[(imm8 >> 2) & 3];
        r.ptr[2] = sa.array[(imm8 >> 4) & 3];
        r.ptr[3] = sa.array[(imm8 >> 6) & 3];
        return cast(__m128i) r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
    enum int SHUFFLE = _MM_SHUFFLE(0, 1, 2, 3);
    short8 B = cast(short8) _mm_shufflelo_epi16!SHUFFLE(A);
    short[8] expectedB = [ 3, 2, 1, 0, 4, 5, 6, 7 ];
    assert(B.array == expectedB);
}
3812 
/// Shift packed 32-bit integers in `a` left by `count` while shifting in zeros.
/// Bit-shift is a single value in the low-order 64-bit of `count`. 
/// If bit-shift > 31, result is defined to be all zeroes.
/// Note: prefer `_mm_slli_epi32`, less of a trap.
__m128i _mm_sll_epi32 (__m128i a, __m128i count) pure @trusted
{
    static if (GDC_or_LDC_with_SSE2)
    {
        return __builtin_ia32_pslld128(a, count);
    }
    else
    {
        // The shift amount is the whole low 64-bit of `count`.
        long2 lc = cast(long2)count;
        ulong bits = cast(ulong)(lc.array[0]);

        // Range-check BEFORE shifting: in D, shifting by >= the width of
        // the type is undefined behavior (see the note in `_mm_slli_epi32`),
        // so we must not compute the shift first and discard it afterwards.
        if (bits > 31)
            return _mm_setzero_si128();

        int4 r = void;
        foreach(i; 0..4)
            r.ptr[i] = cast(uint)(a.array[i]) << bits;
        return r;
    }
}
unittest
{
    __m128i shift0 = _mm_setzero_si128();
    __m128i shiftX = _mm_set1_epi64x(0x8000_0000_0000_0000); // too large shift
    __m128i shift2 = _mm_setr_epi32(2, 0, 4, 5);
    __m128i A = _mm_setr_epi32(4, -9, 11, -2147483648);
    int[4] correct0  = A.array;
    int[4] correctX  = [0, 0, 0, 0];
    int[4] correct2  = [16, -36, 44, 0];
    int4 B0 = cast(int4) _mm_sll_epi32(A, shift0);
    int4 BX = cast(int4) _mm_sll_epi32(A, shiftX);
    int4 B2 = cast(int4) _mm_sll_epi32(A, shift2);
    assert(B0.array == correct0);
    assert(BX.array == correctX);
    assert(B2.array == correct2);
}
3851 
/// Shift packed 64-bit integers in `a` left by `count` while shifting in zeros.
/// Bit-shift is a single value in the low-order 64-bit of `count`. 
/// If bit-shift > 63, result is defined to be all zeroes.
/// Note: prefer `_mm_slli_epi64`, less of a trap.
__m128i _mm_sll_epi64 (__m128i a, __m128i count) pure @trusted
{
    static if (GDC_or_LDC_with_SSE2)
    {
        return cast(__m128i) __builtin_ia32_psllq128(cast(long2)a, cast(long2)count);
    }
    else
    {
        // ARM: good since LDC 1.12 -O2
        // ~but -O0 version is catastrophic
        long2 sa = cast(long2)a;
        long2 lc = cast(long2)count;
        ulong bits = cast(ulong)(lc.array[0]);

        // Range-check BEFORE shifting: a shift by >= 64 bits is undefined
        // behavior in D, so the check cannot come after the shift.
        if (bits > 63)
            return _mm_setzero_si128();

        long2 r = void;
        foreach(i; 0..2)
            r.ptr[i] = cast(ulong)(sa.array[i]) << bits;
        return cast(__m128i)r;
    }
}
unittest
{
    __m128i shift0 = _mm_setzero_si128();
    __m128i shiftX = _mm_set1_epi64x(0x8000_0000_0000_0000); // too large shift
    __m128i shift2 = _mm_setr_epi32(2, 0, 4, 5);
    __m128i A = _mm_setr_epi64(4, -9);
    long[2] correct0  = [ 4,  -9];
    long[2] correctX  = [ 0,   0];
    long[2] correct2  = [16, -36];
    long2 B0 = cast(long2) _mm_sll_epi64(A, shift0);
    long2 BX = cast(long2) _mm_sll_epi64(A, shiftX);
    long2 B2 = cast(long2) _mm_sll_epi64(A, shift2);
    assert(B0.array == correct0);
    assert(BX.array == correctX);
    assert(B2.array == correct2);
}
3893 
/// Shift packed 16-bit integers in `a` left by `count` while shifting in zeros.
/// Bit-shift is a single value in the low-order 64-bit of `count`. 
/// If bit-shift > 15, result is defined to be all zeroes.
/// Warning: prefer `_mm_slli_epi16`, less of a trap.
__m128i _mm_sll_epi16 (__m128i a, __m128i count) pure @trusted
{
    static if (GDC_or_LDC_with_SSE2)
    {
        return cast(__m128i)__builtin_ia32_psllw128(cast(short8)a, cast(short8)count);
    }
    else
    {
        short8 sa = cast(short8)a;
        long2 lc = cast(long2)count;
        ulong bits = cast(ulong)(lc.array[0]);

        // Range-check BEFORE shifting: `ushort` promotes to `int`, and a
        // shift of `int` by >= 32 bits is undefined behavior in D, so the
        // check cannot come after the shift.
        if (bits > 15)
            return _mm_setzero_si128();

        short8 r = void;
        foreach(i; 0..8)
            r.ptr[i] = cast(short)(cast(ushort)(sa.array[i]) << bits);
        return cast(int4)r;
    }
}
unittest
{
    __m128i shift0 = _mm_setzero_si128();
    __m128i shiftX = _mm_set1_epi64x(0x8000_0000_0000_0000); // too large shift
    __m128i shift2 = _mm_setr_epi32(2, 0, 4, 5);
    __m128i A = _mm_setr_epi16(4, -8, 11, -32768, 4, -8, 11, -32768);
    short[8] correct0  = (cast(short8)A).array;
    short[8] correctX  = [0, 0, 0, 0, 0, 0, 0, 0]; 
    short[8] correct2  = [16, -32, 44, 0, 16, -32, 44, 0];
    short8 B0 = cast(short8) _mm_sll_epi16(A, shift0);
    short8 BX = cast(short8) _mm_sll_epi16(A, shiftX);
    short8 B2 = cast(short8) _mm_sll_epi16(A, shift2);
    assert(B0.array == correct0);
    assert(BX.array == correctX);
    assert(B2.array == correct2);
}
3933 
/// Shift packed 32-bit integers in `a` left by `imm8` while shifting in zeros.
__m128i _mm_slli_epi32 (__m128i a, int imm8) pure @trusted
{
    static if (GDC_or_LDC_with_SSE2)
    {
        // GDC and LDC expose the same builtin; lowers to PSLLD.
        return __builtin_ia32_pslldi128(a, cast(ubyte)imm8);
    }
    else
    {
        // Note: the intrinsics guarantee imm8[0..7] is taken, however
        //       D says "It's illegal to shift by the same or more bits 
        //       than the size of the quantity being shifted"
        //       and it's UB instead.
        int4 r = _mm_setzero_si128();

        ubyte count = cast(ubyte) imm8;
        if (count > 31)
            return r;

        foreach(i; 0..4)
            r.ptr[i] = cast(uint)(a.array[i]) << count;
        return r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi32(0, 2, 3, -4);
    __m128i B = _mm_slli_epi32(A, 1);
    __m128i B2 = _mm_slli_epi32(A, 1 + 256);
    int[4] expectedB = [ 0, 4, 6, -8];
    assert(B.array == expectedB);
    assert(B2.array == expectedB);

    __m128i C = _mm_slli_epi32(A, 0);
    int[4] expectedC = [ 0, 2, 3, -4];
    assert(C.array == expectedC);

    __m128i D = _mm_slli_epi32(A, 65);
    int[4] expectedD = [ 0, 0, 0, 0];
    assert(D.array == expectedD);
}
3979 
/// Shift packed 64-bit integers in `a` left by `imm8` while shifting in zeros.
__m128i _mm_slli_epi64 (__m128i a, int imm8) pure @trusted
{
    static if (GDC_or_LDC_with_SSE2)
    {
        // GDC and LDC expose the same builtin; lowers to PSLLQ.
        return cast(__m128i) __builtin_ia32_psllqi128(cast(long2)a, cast(ubyte)imm8);
    }
    else
    {
        long2 sa = cast(long2)a;

        // Note: the intrinsics guarantee imm8[0..7] is taken, however
        //       D says "It's illegal to shift by the same or more bits 
        //       than the size of the quantity being shifted"
        //       and it's UB instead.
        long2 r = cast(long2) _mm_setzero_si128();
        ubyte count = cast(ubyte) imm8;
        if (count > 63)
            return cast(__m128i)r;

        r.ptr[0] = cast(ulong)(sa.array[0]) << count;
        r.ptr[1] = cast(ulong)(sa.array[1]) << count;
        return cast(__m128i)r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi64(8, -4);
    long2 B = cast(long2) _mm_slli_epi64(A, 1);
    long2 B2 = cast(long2) _mm_slli_epi64(A, 1 + 1024);
    long[2] expectedB = [ 16, -8];
    assert(B.array == expectedB);
    assert(B2.array == expectedB);

    long2 C = cast(long2) _mm_slli_epi64(A, 0);
    long[2] expectedC = [ 8, -4];
    assert(C.array == expectedC);

    long2 D = cast(long2) _mm_slli_epi64(A, 64);
    long[2] expectedD = [ 0, -0];
    assert(D.array == expectedD);
}
4026 
/// Shift packed 16-bit integers in `a` left by `imm8` while shifting in zeros.
__m128i _mm_slli_epi16(__m128i a, int imm8) pure @trusted
{
    static if (GDC_or_LDC_with_SSE2)
    {
        // GDC and LDC expose the same builtin; lowers to PSLLW.
        return cast(__m128i) __builtin_ia32_psllwi128(cast(short8)a, cast(ubyte)imm8);
    }
    else static if (LDC_with_ARM64)
    {
        // ARM64: a vector shift by a splatted amount maps to a NEON shift.
        short8 sa = cast(short8)a;
        short8 r = cast(short8)_mm_setzero_si128();
        ubyte count = cast(ubyte) imm8;
        if (count > 15)
            return cast(__m128i)r;
        r = sa << short8(count);
        return cast(__m128i)r;
    }
    else
    {
        // Portable fallback; range-check first since a shift by >= the
        // promoted width would be UB in D.
        short8 sa = cast(short8)a;
        short8 r = cast(short8)_mm_setzero_si128();
        ubyte count = cast(ubyte) imm8;
        if (count > 15)
            return cast(__m128i)r;
        foreach(i; 0..8)
            r.ptr[i] = cast(short)(sa.array[i] << count);
        return cast(__m128i)r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi16(0, 1, 2, 3, -4, -5, 6, 7);
    short8 B = cast(short8)( _mm_slli_epi16(A, 1) );
    short8 B2 = cast(short8)( _mm_slli_epi16(A, 1 + 256) );
    short[8] expectedB = [ 0, 2, 4, 6, -8, -10, 12, 14 ];
    assert(B.array == expectedB);
    assert(B2.array == expectedB);

    short8 C = cast(short8)( _mm_slli_epi16(A, 16) );
    short[8] expectedC = [ 0, 0, 0, 0, 0, 0, 0, 0 ];
    assert(C.array == expectedC);
}
4073 
4074 
/// Shift `a` left by `bytes` bytes while shifting in zeros.
__m128i _mm_slli_si128(ubyte bytes)(__m128i op) pure @trusted
{
    static if (bytes & 0xF0)
    {
        // A byte-shift of 16 or more clears the whole register.
        return _mm_setzero_si128();
    }
    else static if (DMD_with_DSIMD)
    {
        return cast(__m128i) __simd_ib(XMM.PSLLDQ, op, bytes);
    }
    else static if (GDC_with_SSE2)
    {
        pragma(inline, true); // else it doesn't seem to be inlined at all by GDC PERF do it in _mm_srli_si128 and check
        // The GDC builtin takes the amount in bits, hence `bytes * 8`.
        return cast(__m128i) __builtin_ia32_pslldqi128(cast(long2)op, cast(ubyte)(bytes * 8)); 
    }
    else static if (LDC_with_optimizations)
    {
        // Shuffle over the concatenation (zero, op): index 16 is op[0], so
        // starting at 16 - bytes prepends `bytes` zero bytes. LDC folds this
        // back into a single PSLLDQ.
        return cast(__m128i) shufflevectorLDC!(byte16,
                                               16 - bytes, 17 - bytes, 18 - bytes, 19 - bytes, 20 - bytes, 21 - bytes,
                                               22 - bytes, 23 - bytes, 24 - bytes, 25 - bytes, 26 - bytes, 27 - bytes,
                                               28 - bytes, 29 - bytes, 30 - bytes, 31 - bytes)
                                               (cast(byte16)_mm_setzero_si128(), cast(byte16)op);
    }
    else static if (DMD_with_32bit_asm)
    {
        asm pure nothrow @nogc @trusted // somehow doesn't work for x86_64
        {
            movdqu XMM0, op;
            pslldq XMM0, bytes;
            movdqu op, XMM0;
        }
        return op;
    }
    else
    {
        // Portable fallback: copy bytes upward, then zero-fill the low part.
        byte16 A = cast(byte16)op;
        byte16 R = void;
        for (int n = 15; n >= bytes; --n)
            R.ptr[n] = A.array[n-bytes];
        for (int n = bytes-1; n >= 0; --n)
            R.ptr[n] = 0;
        return cast(__m128i)R;
    }
}
unittest
{
    __m128i A = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
    short8 R = cast(short8) _mm_slli_si128!8(A); // shift 8 bytes to the left
    short[8] correct = [ 0, 0, 0, 0, 0, 1, 2, 3 ];
    assert(R.array == correct);

    __m128i B = _mm_slli_si128!16(_mm_set1_epi32(-1));
    int[4] expectedB = [0, 0, 0, 0];
    assert(B.array == expectedB);
}
4131 
/// Compute the square root of packed double-precision (64-bit) floating-point elements in `vec`.
__m128d _mm_sqrt_pd(__m128d vec) pure @trusted
{
    version(LDC)
    {
        // Disappeared with LDC 1.11
        static if (__VERSION__ < 2081)
            return __builtin_ia32_sqrtpd(vec);
        else
        {
            // PERF: use llvm_sqrt on the vector
            // NOTE(review): lane writes go through `.array` here, unlike the
            // `.ptr` convention used elsewhere in this file — presumably fine
            // on LDC where this is a native vector; confirm.
            vec.array[0] = llvm_sqrt(vec.array[0]); 
            vec.array[1] = llvm_sqrt(vec.array[1]);
            return vec;
        }
    }
    else static if (GDC_with_SSE2)    
    {
        return __builtin_ia32_sqrtpd(vec);
    }
    else
    {
        // Scalar fallback: one sqrt per lane.
        vec.ptr[0] = sqrt(vec.array[0]);
        vec.ptr[1] = sqrt(vec.array[1]);
        return vec;
    }
}
4159 
/// Compute the square root of the lower double-precision (64-bit) floating-point element in `b`, store the result in 
/// the lower element of result, and copy the upper element from `a` to the upper element of result.
__m128d _mm_sqrt_sd(__m128d a, __m128d b) pure @trusted
{
    // Note: the builtin has one argument, since the legacy `sqrtsd` SSE2 instruction operates on the same register only.
    //       "128-bit Legacy SSE version: The first source operand and the destination operand are the same. 
    //        The quadword at bits 127:64 of the destination operand remains unchanged."
    version(LDC)
    {
        // Disappeared with LDC 1.11
        static if (__VERSION__ < 2081)
        {
            __m128d c = __builtin_ia32_sqrtsd(b);
            a[0] = c[0];
            return a;
        }
        else
        {
            // Scalar sqrt of b's low lane; a's high lane is kept as-is.
            a.array[0] = llvm_sqrt(b.array[0]);
            return a;
        }
    }
    else static if (GDC_with_SSE2)
    {
        __m128d c = __builtin_ia32_sqrtsd(b);
        a.ptr[0] = c.array[0];
        return a;
    }
    else
    {
        // Portable fallback.
        a.ptr[0] = sqrt(b.array[0]);
        return a;
    }
}
unittest
{
    __m128d A = _mm_setr_pd(1.0, 3.0);
    __m128d B = _mm_setr_pd(4.0, 5.0);
    __m128d R = _mm_sqrt_sd(A, B);
    double[2] correct = [2.0, 3.0 ];
    assert(R.array == correct);
}
4202 
/// Shift packed 16-bit integers in `a` right by `count` while shifting in sign bits.
/// Bit-shift is a single value in the low-order 64-bit of `count`. 
/// If bit-shift > 15, result is defined to be all sign bits.
/// Warning: prefer `_mm_srai_epi16`, less of a trap.
__m128i _mm_sra_epi16 (__m128i a, __m128i count) pure @trusted
{
    static if (GDC_or_LDC_with_SSE2)
    {
        return cast(__m128i) __builtin_ia32_psraw128(cast(short8)a, cast(short8)count);
    }
    else
    {
        // Saturate the amount at 15: an arithmetic shift by 15 already
        // collapses every element to its sign bits, matching x86.
        long2 lc = cast(long2)count;
        ulong shift = cast(ulong)(lc.array[0]);
        if (shift > 15)
            shift = 15;

        short8 input = cast(short8)a;
        short8 res = void;
        foreach(lane; 0..8)
            res.ptr[lane] = cast(short)(input.array[lane] >> shift);
        return cast(int4)res;
    }
}
unittest
{
    __m128i shift0 = _mm_setzero_si128();
    __m128i shiftX = _mm_set1_epi64x(0x8000_0000_0000_0000); // too large shift
    __m128i shift2 = _mm_setr_epi32(2, 0, 4, 5);
    __m128i A = _mm_setr_epi16(4, -9, 11, -32768, 4, -8, 11, -32768);
    short[8] correct0  = (cast(short8)A).array;
    short[8] correctX  = [0, -1, 0, -1, 0, -1, 0, -1]; 
    short[8] correct2  =      [1, -3,  2, -8192,  1, -2,  2, -8192];
    short8 B0 = cast(short8) _mm_sra_epi16(A, shift0);
    short8 BX = cast(short8) _mm_sra_epi16(A, shiftX);
    short8 B2 = cast(short8) _mm_sra_epi16(A, shift2);
    assert(B0.array == correct0);
    assert(BX.array == correctX);
    assert(B2.array == correct2);
}
4242 
/// Shift packed 32-bit integers in `a` right by `count` while shifting in sign bits.
/// Bit-shift is a single value in the low-order 64-bit of `count`. 
/// If bit-shift > 31, result is defined to be all sign bits.
/// Note: prefer `_mm_srai_epi32`, less of a trap.
__m128i _mm_sra_epi32 (__m128i a, __m128i count) pure @trusted
{
    static if (GDC_or_LDC_with_SSE2)
    {
        return __builtin_ia32_psrad128(a, count);
    }
    else
    {
        // Saturate the amount at 31: an arithmetic shift by 31 already
        // collapses every element to its sign bits, matching x86.
        long2 lc = cast(long2)count;
        ulong shift = cast(ulong)(lc.array[0]);
        if (shift > 31)
            shift = 31;

        int4 res = void;
        foreach(lane; 0..4)
            res.ptr[lane] = (a.array[lane] >> shift);
        return res;
    }
}
unittest
{
    __m128i shift0 = _mm_setzero_si128();
    __m128i shiftX = _mm_set1_epi64x(0x8000_0000_0000_0000); // too large shift
    __m128i shift2 = _mm_setr_epi32(2, 0, 4, 5);
    __m128i A = _mm_setr_epi32(4, -9, 11, -2147483648);
    int[4] correct0  = A.array;
    int[4] correctX  = [0, -1, 0, -1]; 
    int[4] correct2  = [1, -3, 2, -536870912];
    int4 B0 = cast(int4) _mm_sra_epi32(A, shift0);
    int4 BX = cast(int4) _mm_sra_epi32(A, shiftX);
    int4 B2 = cast(int4) _mm_sra_epi32(A, shift2);
    assert(B0.array == correct0);
    assert(BX.array == correctX);
    assert(B2.array == correct2);
}
4283 
/// Shift packed 16-bit integers in `a` right by `imm8` while shifting in sign bits.
__m128i _mm_srai_epi16 (__m128i a, int imm8) pure @trusted
{
    static if (GDC_or_LDC_with_SSE2)
    {
        // GDC and LDC expose the same builtin; lowers to PSRAW.
        return cast(__m128i) __builtin_ia32_psrawi128(cast(short8)a, cast(ubyte)imm8);
    }
    else static if (LDC_with_ARM64)
    {
        // ARM64: vector shift by a splatted, clamped amount.
        short8 sa = cast(short8)a;
        ubyte count = cast(ubyte)imm8;
        if (count > 15) 
            count = 15;
        short8 r = sa >> short8(count);
        return cast(__m128i)r;
    }
    else
    {
        short8 sa = cast(short8)a;
        short8 r = void;

        // Note: the intrinsics guarantee imm8[0..7] is taken, however
        //       D says "It's illegal to shift by the same or more bits 
        //       than the size of the quantity being shifted"
        //       and it's UB instead.
        ubyte count = cast(ubyte)imm8;
        if (count > 15) 
            count = 15;
        foreach(i; 0..8)
            r.ptr[i] = cast(short)(sa.array[i] >> count);
        return cast(int4)r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi16(0, 1, 2, 3, -4, -5, 6, 7);
    short8 B = cast(short8)( _mm_srai_epi16(A, 1) );
    short8 B2 = cast(short8)( _mm_srai_epi16(A, 1 + 256) );
    short[8] expectedB = [ 0, 0, 1, 1, -2, -3, 3, 3 ];
    assert(B.array == expectedB);
    assert(B2.array == expectedB);

    short8 C = cast(short8)( _mm_srai_epi16(A, 18) );
    short[8] expectedC = [ 0, 0, 0, 0, -1, -1, 0, 0 ];
    assert(C.array == expectedC);
}
4334 
/// Shift packed 32-bit integers in `a` right by `imm8` while shifting in sign bits.
__m128i _mm_srai_epi32 (__m128i a, int imm8) pure @trusted
{
    static if (GDC_or_LDC_with_SSE2)
    {
        // GDC and LDC expose the same builtin; lowers to PSRAD.
        return __builtin_ia32_psradi128(a, cast(ubyte)imm8);
    }
    else
    {
        int4 r = void;

        // Note: the intrinsics guarantee imm8[0..7] is taken, however
        //       D says "It's illegal to shift by the same or more bits 
        //       than the size of the quantity being shifted"
        //       and it's UB instead.
        // See Issue: #56
        ubyte count = cast(ubyte) imm8;
        if (count > 31)
            count = 31;

        r.ptr[0] = (a.array[0] >> count);
        r.ptr[1] = (a.array[1] >> count);
        r.ptr[2] = (a.array[2] >> count);
        r.ptr[3] = (a.array[3] >> count);
        return r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi32(0, 2, 3, -4);
    __m128i B = _mm_srai_epi32(A, 1);
    __m128i B2 = _mm_srai_epi32(A, 1 + 256);
    int[4] expectedB = [ 0, 1, 1, -2];
    assert(B.array == expectedB);
    assert(B2.array == expectedB);

    __m128i C = _mm_srai_epi32(A, 32);
    int[4] expectedC = [ 0, 0, 0, -1];
    assert(C.array == expectedC);

    __m128i D = _mm_srai_epi32(A, 0);
    int[4] expectedD = [ 0, 2, 3, -4];
    assert(D.array == expectedD);
}
4383 
/// Shift packed 16-bit integers in `a` right by `count` while shifting in zeros.
/// Bit-shift is a single value in the low-order 64-bit of `count`. 
/// If bit-shift > 15, result is defined to be all zeroes.
/// Warning: prefer `_mm_srli_epi16`, less of a trap.
__m128i _mm_srl_epi16 (__m128i a, __m128i count) pure @trusted
{
    // PERF ARM64
    static if (GDC_or_LDC_with_SSE2)
    {
        return cast(__m128i) __builtin_ia32_psrlw128(cast(short8)a, cast(short8)count);
    }
    else
    {
        short8 sa = cast(short8)a;
        long2 lc = cast(long2)count;
        ulong bits = cast(ulong)(lc.array[0]);

        // Range-check BEFORE shifting: `ushort` promotes to `int`, and a
        // shift of `int` by >= 32 bits is undefined behavior in D, so the
        // check cannot come after the shift.
        if (bits > 15)
            return _mm_setzero_si128();

        short8 r = void;
        foreach(i; 0..8)
            r.ptr[i] = cast(short)(cast(ushort)(sa.array[i]) >> bits);
        return cast(__m128i)r;
    }
}
unittest
{
    __m128i shift0 = _mm_setzero_si128();
    __m128i shiftX = _mm_set1_epi64x(0x8000_0000_0000_0000); // too large shift
    __m128i shift2 = _mm_setr_epi32(2, 0, 4, 5);
    __m128i A = _mm_setr_epi16(4, -8, 11, -32768, 4, -8, 11, -32768);
    short[8] correct0  = (cast(short8)A).array;
    short[8] correctX  = [0, 0, 0, 0, 0, 0, 0, 0]; 
    short[8] correct2  = [1, 16382, 2, 8192, 1, 16382, 2, 8192];
    short8 B0 = cast(short8) _mm_srl_epi16(A, shift0);
    short8 BX = cast(short8) _mm_srl_epi16(A, shiftX);
    short8 B2 = cast(short8) _mm_srl_epi16(A, shift2);
    assert(B0.array == correct0);
    assert(BX.array == correctX);
    assert(B2.array == correct2);
}
4424 
/// Shift packed 32-bit integers in `a` right by `count` while shifting in zeros.
/// Bit-shift is a single value in the low-order 64-bit of `count`. 
/// If bit-shift > 31, result is defined to be all zeroes.
/// Note: prefer `_mm_srli_epi32`, less of a trap.
__m128i _mm_srl_epi32 (__m128i a, __m128i count) pure @trusted
{
    static if (GDC_or_LDC_with_SSE2)
    {
        return __builtin_ia32_psrld128(a, count);
    }
    else
    {
        long2 lc = cast(long2)count;
        ulong bits = cast(ulong)(lc.array[0]);

        // Range-check BEFORE shifting: a shift by >= 32 bits is undefined
        // behavior in D, so the check cannot come after the shift.
        // Same all-zeroes semantics as the x86 instruction.
        if (bits > 31)
            return _mm_setzero_si128();

        int4 r = void;
        r.ptr[0] = cast(uint)(a.array[0]) >> bits;
        r.ptr[1] = cast(uint)(a.array[1]) >> bits;
        r.ptr[2] = cast(uint)(a.array[2]) >> bits;
        r.ptr[3] = cast(uint)(a.array[3]) >> bits;
        return r;
    }
}
unittest
{
    __m128i shift0 = _mm_setzero_si128();
    __m128i shiftX = _mm_set1_epi64x(0x8000_0000_0000_0000); // too large shift
    __m128i shift2 = _mm_setr_epi32(2, 0, 4, 5);
    __m128i A = _mm_setr_epi32(4, -8, 11, -0x80000000);
    int[4] correct0  = A.array;
    int[4] correctX  = [0, 0, 0, 0]; 
    int[4] correct2  = [1, 1073741822, 2, 536870912];
    int4 B0 = cast(int4) _mm_srl_epi32(A, shift0);
    int4 BX = cast(int4) _mm_srl_epi32(A, shiftX);
    int4 B2 = cast(int4) _mm_srl_epi32(A, shift2);
    assert(B0.array == correct0);
    assert(BX.array == correctX);
    assert(B2.array == correct2);
}
4465 
/// Shift packed 64-bit integers in `a` right by `count` while shifting in zeroes.
/// Bit-shift is a single value in the low-order 64-bit of `count`. 
/// If bit-shift > 63, result is defined to be all zeroes.
/// Note: prefer `_mm_srli_epi64`, less of a trap.
__m128i _mm_srl_epi64 (__m128i a, __m128i count) pure @trusted
{
    static if (GDC_or_LDC_with_SSE2)
    {
        return cast(__m128i) __builtin_ia32_psrlq128(cast(long2)a, cast(long2)count);
    }
    else
    {
        long2 sa = cast(long2)a;
        long2 lc = cast(long2)count;
        ulong bits = cast(ulong)(lc.array[0]);

        // Range-check BEFORE shifting: a shift by >= 64 bits is undefined
        // behavior in D, so the check cannot come after the shift.
        if (bits > 63)
            return _mm_setzero_si128();

        long2 r = void;
        r.ptr[0] = cast(ulong)(sa.array[0]) >> bits;
        r.ptr[1] = cast(ulong)(sa.array[1]) >> bits;
        return cast(__m128i)r;
    }
}
unittest
{
    __m128i shift0 = _mm_setzero_si128();
    __m128i shiftX = _mm_set1_epi64x(0x8000_0000_0000_0000); // too large shift
    __m128i shift2 = _mm_setr_epi32(2, 0, 4, 5);
    __m128i A = _mm_setr_epi64(4, -9);
    long[2] correct0  = [4, -9];
    long[2] correctX  = [0,  0];
    long[2] correct2  = [1, 4611686018427387901];
    long2 B0 = cast(long2) _mm_srl_epi64(A, shift0);
    long2 BX = cast(long2) _mm_srl_epi64(A, shiftX);
    long2 B2 = cast(long2) _mm_srl_epi64(A, shift2);
    assert(B0.array == correct0);
    assert(BX.array == correctX);
    assert(B2.array == correct2);
}
4505 
/// Shift packed 16-bit integers in `a` right by `imm8` while shifting in zeros.
/// A count of 16 or more yields an all-zero result.
__m128i _mm_srli_epi16 (__m128i a, int imm8) pure @trusted
{
    static if (GDC_or_LDC_with_SSE2)
    {
        // GDC and LDC expose the exact same builtin; one merged branch,
        // consistent with _mm_srl_epi64 / _mm_srli_epi64 above.
        return cast(__m128i) __builtin_ia32_psrlwi128(cast(short8)a, cast(ubyte)imm8);
    }
    else static if (LDC_with_ARM64)
    {
        short8 sa = cast(short8)a;
        short8 r = cast(short8) _mm_setzero_si128();

        ubyte count = cast(ubyte)imm8;
        if (count >= 16)
            return cast(__m128i)r;

        r = sa >>> short8(count); // This facility offered with LDC, but not DMD.
        return cast(__m128i)r;
    }
    else
    {
        short8 sa = cast(short8)a;
        ubyte count = cast(ubyte)imm8;

        short8 r = cast(short8) _mm_setzero_si128();
        if (count >= 16)
            return cast(__m128i)r;

        // Shift each lane as unsigned; write through .ptr like the rest of the file.
        foreach(i; 0..8)
            r.ptr[i] = cast(short)(cast(ushort)(sa.array[i]) >> count);
        return cast(__m128i)r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi16(0, 1, 2, 3, -4, -5, 6, 7);
    short8 B = cast(short8)( _mm_srli_epi16(A, 1) );
    short8 B2 = cast(short8)( _mm_srli_epi16(A, 1 + 256) );
    short[8] expectedB = [ 0, 0, 1, 1, 0x7FFE, 0x7FFD, 3, 3 ];
    assert(B.array == expectedB);
    assert(B2.array == expectedB);

    short8 C = cast(short8)( _mm_srli_epi16(A, 16) );
    short[8] expectedC = [ 0, 0, 0, 0, 0, 0, 0, 0];
    assert(C.array == expectedC);

    short8 D = cast(short8)( _mm_srli_epi16(A, 0) );
    short[8] expectedD = [ 0, 1, 2, 3, -4, -5, 6, 7 ];
    assert(D.array == expectedD);
}
4560 
4561 
/// Shift packed 32-bit integers in `a` right by `imm8` while shifting in zeros.
/// A count of 32 or more yields an all-zero result.
__m128i _mm_srli_epi32 (__m128i a, int imm8) pure @trusted
{
    static if (GDC_or_LDC_with_SSE2)
    {
        // GDC and LDC expose the exact same builtin; one merged branch,
        // consistent with _mm_srl_epi64 / _mm_srli_epi64.
        return __builtin_ia32_psrldi128(a, cast(ubyte)imm8);
    }
    else
    {
        ubyte count = cast(ubyte) imm8;

        // Note: the intrinsics guarantee imm8[0..7] is taken, however
        //       D says "It's illegal to shift by the same or more bits 
        //       than the size of the quantity being shifted"
        //       and it's UB instead. So test the count before shifting.
        int4 r = _mm_setzero_si128();
        if (count >= 32)
            return r;
        r.ptr[0] = a.array[0] >>> count;
        r.ptr[1] = a.array[1] >>> count;
        r.ptr[2] = a.array[2] >>> count;
        r.ptr[3] = a.array[3] >>> count;
        return r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi32(0, 2, 3, -4);
    __m128i B = _mm_srli_epi32(A, 1);
    __m128i B2 = _mm_srli_epi32(A, 1 + 256);
    int[4] expectedB = [ 0, 1, 1, 0x7FFFFFFE];
    assert(B.array == expectedB);
    assert(B2.array == expectedB);
 
    __m128i C = _mm_srli_epi32(A, 255);
    int[4] expectedC = [ 0, 0, 0, 0 ];
    assert(C.array == expectedC);
}
4604 
/// Shift packed 64-bit integers in `a` right by `imm8` while shifting in zeros.
/// A count of 64 or more yields an all-zero result.
__m128i _mm_srli_epi64 (__m128i a, int imm8) pure @trusted
{
    // PERF DMD
    static if (GDC_or_LDC_with_SSE2)
    {
        // GDC and LDC share the same builtin for this shift.
        return cast(__m128i) __builtin_ia32_psrlqi128(cast(long2)a, cast(ubyte)imm8);
    }
    else
    {
        long2 r = cast(long2) _mm_setzero_si128();
        long2 sa = cast(long2)a;

        // Only the low 8 bits of the count are considered (Intel semantics);
        // counts >= 64 must return zero without evaluating the shift,
        // since shifting a 64-bit value by >= 64 is illegal in D.
        ubyte count = cast(ubyte) imm8;
        if (count >= 64)
            return cast(__m128i)r;

        r.ptr[0] = sa.array[0] >>> count;
        r.ptr[1] = sa.array[1] >>> count;
        return cast(__m128i)r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi64(8, -4);
    long2 B = cast(long2) _mm_srli_epi64(A, 1);
    long2 B2 = cast(long2) _mm_srli_epi64(A, 1 + 512);
    long[2] expectedB = [ 4, 0x7FFFFFFFFFFFFFFE];
    assert(B.array == expectedB);
    assert(B2.array == expectedB);

    long2 C = cast(long2) _mm_srli_epi64(A, 64);
    long[2] expectedC = [ 0, 0 ];
    assert(C.array == expectedC);
}
4640 
/// Shift `v` right by `bytes` bytes while shifting in zeros.
/// Shifts of 16 bytes or more produce an all-zero result.
__m128i _mm_srli_si128(ubyte bytes)(__m128i v) pure @trusted
{
    static if (bytes & 0xF0)
    {
        // bytes >= 16: the whole register is shifted out.
        return _mm_setzero_si128();
    }
    else static if (DMD_with_DSIMD)
    {
        return cast(__m128i) __simd_ib(XMM.PSRLDQ, v, bytes);
    }
    else static if (GDC_with_SSE2)
    {
        // The GDC builtin takes its shift amount in bits, hence the * 8.
        return cast(__m128i) __builtin_ia32_psrldqi128(cast(long2)v, cast(ubyte)(bytes * 8));
    }
    else static if (DMD_with_32bit_asm)
    {
        asm pure nothrow @nogc @trusted
        {
            movdqu XMM0, v;
            psrldq XMM0, bytes;
            movdqu v, XMM0;
        }
        return v;
    }
    else static if (LDC_with_optimizations)
    {
        // Select bytes [bytes .. bytes+15] out of (v ~ zero vector),
        // which is exactly a byte-wise right shift with zero fill.
        return cast(__m128i) shufflevectorLDC!(byte16,
                                               bytes+0, bytes+1, bytes+2, bytes+3, bytes+4, bytes+5, bytes+6, bytes+7,
                                               bytes+8, bytes+9, bytes+10, bytes+11, bytes+12, bytes+13, bytes+14, bytes+15)
                                               (cast(byte16) v, cast(byte16)_mm_setzero_si128());
    }
    else
    {
        // Generic fallback: zero the top `bytes` lanes, move the rest down.
        byte16 A = cast(byte16)v;
        byte16 R = void;
        for (int n = 0; n < bytes; ++n)
            R.ptr[15-n] = 0;
        for (int n = bytes; n < 16; ++n)
            R.ptr[15-n] = A.array[15 - n + bytes];
        return cast(__m128i)R;
    }
}
unittest
{
    __m128i R = _mm_srli_si128!4(_mm_set_epi32(4, 3, -2, 1));
    int[4] correct = [-2, 3, 4, 0];
    assert(R.array == correct);

    __m128i A = _mm_srli_si128!16(_mm_set1_epi32(-1));
    int[4] expectedA = [0, 0, 0, 0];
    assert(A.array == expectedA);
}
4694 
/// Shift `v` right by `bytes` bytes while shifting in zeros.
/// #BONUS
__m128 _mm_srli_ps(ubyte bytes)(__m128 v) pure @safe
{
    // Reinterpret as integers, delegate to the integer byte-shift,
    // then reinterpret the bits back as floats.
    __m128i shifted = _mm_srli_si128!bytes(cast(__m128i) v);
    return cast(__m128) shifted;
}
unittest
{
    __m128 res = _mm_srli_ps!8(_mm_set_ps(4.0f, 3.0f, 2.0f, 1.0f));
    float[4] expected = [3.0f, 4.0f, 0, 0];
    assert(res.array == expected);
}
4707 
/// Shift `v` right by `bytes` bytes while shifting in zeros.
/// #BONUS
__m128d _mm_srli_pd(ubyte bytes)(__m128d v) pure @safe
{
    // Same byte-wise right shift as _mm_srli_si128, viewed as doubles.
    __m128i shifted = _mm_srli_si128!bytes(cast(__m128i) v);
    return cast(__m128d) shifted;
}
4714 
/// Store 128-bits (composed of 2 packed double-precision (64-bit) floating-point elements) from `a` into memory. 
/// `mem_addr` must be aligned on a 16-byte boundary or a general-protection exception may be generated.
void _mm_store_pd (double* mem_addr, __m128d a) pure @trusted
{
    pragma(inline, true);
    // Single aligned 128-bit store through a vector pointer.
    *(cast(__m128d*) mem_addr) = a;
}
unittest
{
    align(16) double[2] buf;
    __m128d v = _mm_setr_pd(-8.0, 9.0);
    _mm_store_pd(buf.ptr, v);
    assert(buf == [-8.0, 9.0]);
}
4730 
/// Store the lower double-precision (64-bit) floating-point element from `a` into 2 contiguous elements in memory. 
/// `mem_addr` must be aligned on a 16-byte boundary or a general-protection exception may be generated.
void _mm_store_pd1 (double* mem_addr, __m128d a) pure @trusted
{
    // Splat lane 0 into both lanes, then do one aligned 128-bit store.
    double lo = a.array[0];
    __m128d splat; // PERF =void;
    splat.ptr[0] = lo;
    splat.ptr[1] = lo;
    *(cast(__m128d*) mem_addr) = splat;
}
4741 
/// Store the lower double-precision (64-bit) floating-point element from `a` into memory. `mem_addr` does not need to 
/// be aligned on any particular boundary.
void _mm_store_sd (double* mem_addr, __m128d a) pure @safe
{
    pragma(inline, true);
    double lo = a.array[0];
    *mem_addr = lo;
}
4749 
/// Store 128-bits of integer data from `a` into memory. `mem_addr` must be aligned on a 16-byte boundary or a 
/// general-protection exception may be generated.
void _mm_store_si128 (__m128i* mem_addr, __m128i a) pure @safe
{
    pragma(inline, true);
    __m128i* dst = mem_addr;
    *dst = a;
}
4757 
4758 alias _mm_store1_pd = _mm_store_pd1; ///
4759 
/// Store the upper double-precision (64-bit) floating-point element from `a` into memory.
void _mm_storeh_pd (double* mem_addr, __m128d a) pure @safe
{
    pragma(inline, true);
    double hi = a.array[1];
    *mem_addr = hi;
}
4766 
/// Store 64-bit integer from the lower element of `a` into memory.
// Note: `mem_addr` doesn't have to actually be aligned, which breaks
// expectations from the user point of view. This problem also exist in C++.
void _mm_storel_epi64 (__m128i* mem_addr, __m128i a) pure @safe
{
    pragma(inline, true);
    long* dest = cast(long*)mem_addr; // only 8 bytes are written
    long2 la = cast(long2)a;
    *dest = la.array[0];
}
unittest
{
    long[3] A = [1, 2, 3];
    _mm_storel_epi64(cast(__m128i*)(&A[1]), _mm_set_epi64x(0x1_0000_0000, 0x1_0000_0000));
    long[3] correct = [1, 0x1_0000_0000, 3];
    assert(A == correct);
}
4783 
/// Store the lower double-precision (64-bit) floating-point element from `a` into memory.
void _mm_storel_pd (double* mem_addr, __m128d a) pure @safe
{
    pragma(inline, true);
    double lo = a.array[0];
    *mem_addr = lo;
}
4790 
/// Store 2 double-precision (64-bit) floating-point elements from `a` into memory in reverse 
/// order. `mem_addr` must be aligned on a 16-byte boundary or a general-protection exception 
/// may be generated.
void _mm_storer_pd (double* mem_addr, __m128d a) pure @system
{
    // Build the swapped vector, then write it with one aligned store.
    __m128d swapped = void;
    swapped.ptr[0] = a.array[1];
    swapped.ptr[1] = a.array[0];
    *(cast(__m128d*) mem_addr) = swapped;
}
unittest
{
    align(16) double[2] buf = [0.0, 1.0];
    _mm_storer_pd(buf.ptr, _mm_setr_pd(2.0, 3.0));
    assert(buf[0] == 3.0 && buf[1] == 2.0);
}
4807 
/// Store 128-bits (composed of 2 packed double-precision (64-bit) floating-point elements) from 
/// `a` into memory. `mem_addr` does not need to be aligned on any particular boundary.
void _mm_storeu_pd (double* mem_addr, __m128d a) pure @trusted // TODO: signature, should be system
{
    // PERF DMD
    pragma(inline, true);
    static if (GDC_with_SSE2)
    {
        // GDC builtin for an unaligned 128-bit store.
        __builtin_ia32_storeupd(mem_addr, a);
    }
    else static if (LDC_with_optimizations)
    {
        // Unaligned vector store helper (from inteli.internals).
        storeUnaligned!double2(a, mem_addr);
    }
    else
    {
        // Portable fallback: two scalar stores, no alignment requirement.
        mem_addr[0] = a.array[0];
        mem_addr[1] = a.array[1];
    }
}
unittest
{
    __m128d A = _mm_setr_pd(3.0, 4.0);
    align(16) double[4] R = [0.0, 0, 0, 0];
    double[2] correct = [3.0, 4.0];
    _mm_storeu_pd(&R[1], A); // deliberately misaligned destination
    assert(R[1..3] == correct);
}
4836 
/// Store 128-bits of integer data from `a` into memory. `mem_addr` does not need to be aligned on any particular 
/// boundary.
void _mm_storeu_si128 (__m128i* mem_addr, __m128i a) pure @trusted // TODO: signature is wrong, mem_addr is not aligned. Make it @system
{
    // PERF: DMD
    pragma(inline, true);
    static if (GDC_with_SSE2)
    {
        // GDC builtin for an unaligned 128-bit store.
        __builtin_ia32_storedqu(cast(char*)mem_addr, cast(ubyte16)a);
    }
    else static if (LDC_with_optimizations)
    {
        // Unaligned vector store helper (from inteli.internals).
        storeUnaligned!__m128i(a, cast(int*)mem_addr);
    }
    else
    {
        // Portable fallback: four scalar 32-bit stores.
        int* p = cast(int*)mem_addr;
        p[0] = a.array[0];
        p[1] = a.array[1];
        p[2] = a.array[2];
        p[3] = a.array[3];
    }
}
unittest
{
    __m128i A = _mm_setr_epi32(1, 2, 3, 4);
    align(16) int[6] R = [0, 0, 0, 0, 0, 0];
    int[4] correct = [1, 2, 3, 4];
    _mm_storeu_si128(cast(__m128i*)(&R[1]), A); // deliberately misaligned destination
    assert(R[1..5] == correct);
}
4868 
/// Store 16-bit integer from the first element of `a` into memory. 
/// `mem_addr` does not need to be aligned on any particular boundary.
void _mm_storeu_si16 (void* mem_addr, __m128i a) pure @system
{
    short8 sa = cast(short8) a;
    *(cast(short*) mem_addr) = sa.array[0];
}
unittest
{
    short[2] buf = [-24, 12];
    _mm_storeu_si16(&buf[1], _mm_set1_epi16(26));
    short[2] expected = [-24, 26];
    assert(buf == expected);
}
4883 
/// Store 32-bit integer from the first element of `a` into memory. 
/// `mem_addr` does not need to be aligned on any particular boundary.
void _mm_storeu_si32 (void* mem_addr, __m128i a) pure @trusted // TODO should really be @system
{
    pragma(inline, true);
    *(cast(int*) mem_addr) = a.array[0];
}
unittest
{
    int[2] buf = [-24, 12];
    _mm_storeu_si32(&buf[1], _mm_setr_epi32(-1, -2, -6, -7));
    assert(buf == [-24, -1]);
}
4898 
/// Store 64-bit integer from the first element of `a` into memory. 
/// `mem_addr` does not need to be aligned on any particular boundary.
void _mm_storeu_si64 (void* mem_addr, __m128i a) pure @system
{
    pragma(inline, true);
    long2 la = cast(long2) a;
    *(cast(long*) mem_addr) = la.array[0];
}
unittest
{
    long[3] buf = [1, 2, 3];
    _mm_storeu_si64(&buf[1], _mm_set_epi64x(0x1_0000_0000, 0x1_0000_0000));
    long[3] expected = [1, 0x1_0000_0000, 3];
    assert(buf == expected);
}
4915 
/// Store 128-bits (composed of 2 packed double-precision (64-bit) floating-point elements)
/// from `a` into memory using a non-temporal memory hint. `mem_addr` must be aligned on a 16-byte
/// boundary or a general-protection exception may be generated.
/// Note: non-temporal stores should be followed by `_mm_sfence()` for reader threads.
void _mm_stream_pd (double* mem_addr, __m128d a) pure @system
{
    // PERF DMD D_SIMD
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_movntpd(mem_addr, a); 
    }
    else static if (LDC_with_InlineIREx && LDC_with_optimizations)
    {
        // The !nontemporal metadata asks LLVM to emit a streaming store.
        enum prefix = `!0 = !{ i32 1 }`;
        enum ir = `
            store <2 x double> %1, <2 x double>* %0, align 16, !nontemporal !0
            ret void`;
        LDCInlineIREx!(prefix, ir, "", void, double2*, double2)(cast(double2*)mem_addr, a);
    }
    else
    {
        // Regular store instead: same observable result, no cache hint.
        __m128d* dest = cast(__m128d*)mem_addr;
        *dest = a;
    }
}
unittest
{
    align(16) double[2] A;
    __m128d B = _mm_setr_pd(-8.0, 9.0);
    _mm_stream_pd(A.ptr, B);
    assert(A == [-8.0, 9.0]);
}
4949 
/// Store 128-bits of integer data from a into memory using a non-temporal memory hint.
/// `mem_addr` must be aligned on a 16-byte boundary or a general-protection exception
/// may be generated.
/// Note: non-temporal stores should be followed by `_mm_sfence()` for reader threads.
void _mm_stream_si128 (__m128i* mem_addr, __m128i a) pure @trusted
{
    // PERF DMD D_SIMD
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_movntdq (cast(long2*)mem_addr, cast(long2)a); 
    }
    else static if (LDC_with_InlineIREx && LDC_with_optimizations)
    {
        // The !nontemporal metadata asks LLVM to emit a streaming store.
        enum prefix = `!0 = !{ i32 1 }`;
        enum ir = `
            store <4 x i32> %1, <4 x i32>* %0, align 16, !nontemporal !0
            ret void`;
        LDCInlineIREx!(prefix, ir, "", void, int4*, int4)(cast(int4*)mem_addr, a);
    }
    else
    {
        // Regular store instead: same observable result, no cache hint.
        __m128i* dest = cast(__m128i*)mem_addr;
        *dest = a;
    }
}
unittest
{
    align(16) int[4] A;
    __m128i B = _mm_setr_epi32(-8, 9, 10, -11);
    _mm_stream_si128(cast(__m128i*)A.ptr, B);
    assert(A == [-8, 9, 10, -11]);
}
4983 
/// Store 32-bit integer a into memory using a non-temporal hint to minimize cache
/// pollution. If the cache line containing address `mem_addr` is already in the cache,
/// the cache will be updated.
/// Note: non-temporal stores should be followed by `_mm_sfence()` for reader threads.
void _mm_stream_si32 (int* mem_addr, int a) pure @trusted
{
    // PERF DMD D_SIMD
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_movnti(mem_addr, a);
    }
    else static if (LDC_with_InlineIREx && LDC_with_optimizations)
    {
        // The !nontemporal metadata asks LLVM to emit a streaming store.
        enum prefix = `!0 = !{ i32 1 }`;
        enum ir = `
            store i32 %1, i32* %0, !nontemporal !0
            ret void`;
        LDCInlineIREx!(prefix, ir, "", void, int*, int)(mem_addr, a);
    }
    else
    {
        // Regular store instead: same observable result, no cache hint.
        *mem_addr = a;
    }
}
unittest
{
    int A;
    _mm_stream_si32(&A, -34);
    assert(A == -34);
}
5015 
/// Store 64-bit integer a into memory using a non-temporal hint to minimize
/// cache pollution. If the cache line containing address `mem_addr` is already
/// in the cache, the cache will be updated.
/// Note: non-temporal stores should be followed by `_mm_sfence()` for reader threads.
void _mm_stream_si64 (long* mem_addr, long a) pure @trusted
{
    // PERF DMD D_SIMD
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_movnti64(mem_addr, a);
    }
    else static if (LDC_with_InlineIREx && LDC_with_optimizations)
    {
        // The !nontemporal metadata asks LLVM to emit a streaming store.
        enum prefix = `!0 = !{ i32 1 }`;
        enum ir = `
            store i64 %1, i64* %0, !nontemporal !0
            ret void`;
        LDCInlineIREx!(prefix, ir, "", void, long*, long)(mem_addr, a);

    }
    else
    {
        // Regular store instead: same observable result, no cache hint.
        *mem_addr = a;
    }
}
unittest
{
    long A;
    _mm_stream_si64(&A, -46);
    assert(A == -46);
}
5048 
/// Subtract packed 16-bit integers in `b` from packed 16-bit integers in `a`.
__m128i _mm_sub_epi16(__m128i a, __m128i b) pure @safe
{
    pragma(inline, true);
    short8 lhs = cast(short8) a;
    short8 rhs = cast(short8) b;
    return cast(__m128i)(lhs - rhs);
}
unittest
{
    __m128i X = _mm_setr_epi16(16,  32767, 1, 2,    3, 4, 6, 6);
    __m128i Y = _mm_setr_epi16(15, -32768, 6, 8, 1000, 1, 5, 6);
    short8 R = cast(short8) _mm_sub_epi16(X, Y);
    short[8] expected =       [ 1,     -1,-5,-6, -997, 3, 1, 0];
    assert(R.array == expected);
}
5063 
/// Subtract packed 32-bit integers in `b` from packed 32-bit integers in `a`.
__m128i _mm_sub_epi32(__m128i a, __m128i b) pure @safe
{
    pragma(inline, true);
    int4 lhs = cast(int4) a;
    int4 rhs = cast(int4) b;
    return cast(__m128i)(lhs - rhs);
}
unittest
{
    __m128i X = _mm_setr_epi32(16, int.max, 1, 8);
    __m128i Y = _mm_setr_epi32(15, int.min, 6, 2);
    int4 R = cast(int4) _mm_sub_epi32(X, Y);
    int[4] expected =         [ 1,      -1,-5, 6];
    assert(R.array == expected);
}
5078 
/// Subtract packed 64-bit integers in `b` from packed 64-bit integers in `a`.
__m128i _mm_sub_epi64(__m128i a, __m128i b) pure @safe
{
    pragma(inline, true);
    long2 lhs = cast(long2) a;
    long2 rhs = cast(long2) b;
    return cast(__m128i)(lhs - rhs);
}
unittest
{
    __m128i X = _mm_setr_epi64(  16, long.max);
    __m128i Y = _mm_setr_epi64( 199, long.min);
    long2 R = cast(long2) _mm_sub_epi64(X, Y);
    long[2] expected =        [-183,       -1];
    assert(R.array == expected);
}
5093 
/// Subtract packed 8-bit integers in `b` from packed 8-bit integers in `a`.
__m128i _mm_sub_epi8(__m128i a, __m128i b) pure @safe
{
    pragma(inline, true);
    byte16 lhs = cast(byte16) a;
    byte16 rhs = cast(byte16) b;
    return cast(__m128i)(lhs - rhs);
}
unittest
{
    __m128i X = _mm_setr_epi8(16,  127, 1, 2, 3, 4, 6, 6, 16,  127, 1, 2, 3, 4, 6, 6);
    __m128i Y = _mm_setr_epi8(15, -128, 6, 8, 3, 1, 5, 6, 16,  127, 1, 2, 3, 4, 6, 6);
    byte16 R = cast(byte16) _mm_sub_epi8(X, Y);
    byte[16] expected =      [ 1,   -1,-5,-6, 0, 3, 1, 0,  0,    0, 0, 0, 0, 0, 0, 0];
    assert(R.array == expected);
}
5108 
/// Subtract packed double-precision (64-bit) floating-point elements in `b` from packed double-precision (64-bit) 
/// floating-point elements in `a`.
__m128d _mm_sub_pd(__m128d a, __m128d b) pure @safe
{
    pragma(inline, true);
    __m128d diff = a - b;
    return diff;
}
unittest
{
    __m128d X = _mm_setr_pd(4000.0, -8.0);
    __m128d Y = _mm_setr_pd(12.0, -8450.0);
    __m128d R = _mm_sub_pd(X, Y);
    double[2] expected =    [3988.0, 8442.0];
    assert(R.array == expected);
}
5124 
/// Subtract the lower double-precision (64-bit) floating-point element in `b` from the lower double-precision (64-bit) 
/// floating-point element in `a`, store that in the lower element of result, and copy the upper element from `a` to the
/// upper element of result.
__m128d _mm_sub_sd(__m128d a, __m128d b) pure @trusted
{
    version(DigitalMars)
    {
        // Work-around for https://issues.dlang.org/show_bug.cgi?id=19599
        // Note that this is unneeded since DMD >= 2.094.0 at least, haven't investigated again
        asm pure nothrow @nogc @trusted { nop;}
        a[0] = a[0] - b[0];
        return a;
    }
    else static if (GDC_with_SSE2)
    {
        return __builtin_ia32_subsd(a, b);
    }
    else
    {
        // Only lane 0 is modified; lane 1 keeps a's upper element.
        a.ptr[0] -= b.array[0];
        return a;
    }
}
unittest
{
    __m128d a = [1.5, -2.0];
    a = _mm_sub_sd(a, a);
    assert(a.array == [0.0, -2.0]);
}
5154 
/// Subtract 64-bit integer `b` from 64-bit integer `a`.
__m64 _mm_sub_si64 (__m64 a, __m64 b) pure @safe
{
    pragma(inline, true);
    __m64 diff = a - b;
    return diff;
}
unittest
{
    __m64 X, Y;
    X = -1214;
    Y = 489415;
    __m64 R = _mm_sub_si64(Y, X);
    assert(R.array[0] == 489415 + 1214);
}
5169 
/// Subtract packed signed 16-bit integers in `b` from packed 16-bit integers in `a` using
/// saturation.
__m128i _mm_subs_epi16(__m128i a, __m128i b) pure @trusted
{
    // PERF DMD psubsw
    static if(LDC_with_saturated_intrinsics)
    {
        // LLVM's saturating-subtract intrinsic, wrapped by inteli.internals.
        return cast(__m128i) inteli_llvm_subs!short8(cast(short8)a, cast(short8)b);
    }
    else static if (GDC_with_SSE2)
    {
        return cast(__m128i) __builtin_ia32_psubsw128(cast(short8) a, cast(short8) b);
    }
    else
    {
        // Scalar fallback: subtract in 32-bit, then saturate back to 16-bit.
        short[8] res; // PERF =void;
        short8 sa = cast(short8)a;
        short8 sb = cast(short8)b;
        foreach(i; 0..8)
            res.ptr[i] = saturateSignedIntToSignedShort(sa.array[i] - sb.array[i]);
        return _mm_loadu_si128(cast(int4*)res.ptr);
    }
}
unittest
{
    short8 res = cast(short8) _mm_subs_epi16(_mm_setr_epi16(32760, -32760, 5, 4, 3, 2, 1, 0),
                                             _mm_setr_epi16(-10  ,     16, 5, 4, 3, 2, 1, 0));
    static immutable short[8] correctResult =              [32767, -32768, 0, 0, 0, 0, 0, 0];
    assert(res.array == correctResult);
}
5200 
/// Subtract packed signed 8-bit integers in `b` from packed 8-bit integers in `a` using
/// saturation.
__m128i _mm_subs_epi8(__m128i a, __m128i b) pure @trusted
{
    static if(LDC_with_saturated_intrinsics)
    {
        // LLVM's saturating-subtract intrinsic, wrapped by inteli.internals.
        return cast(__m128i) inteli_llvm_subs!byte16(cast(byte16)a, cast(byte16)b);
    }
    else static if (GDC_with_SSE2)
    {
        return cast(__m128i) __builtin_ia32_psubsb128(cast(ubyte16) a, cast(ubyte16) b);
    }
    else
    {
        // Scalar fallback: subtract in wider precision, then saturate to 8-bit.
        byte[16] res; // PERF =void;
        byte16 sa = cast(byte16)a;
        byte16 sb = cast(byte16)b;
        foreach(i; 0..16)
            res[i] = saturateSignedWordToSignedByte(sa.array[i] - sb.array[i]);
        return _mm_loadu_si128(cast(int4*)res.ptr);
    }
}
unittest
{
    byte16 res = cast(byte16) _mm_subs_epi8(_mm_setr_epi8(-128, 127, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0),
                                            _mm_setr_epi8(  15, -14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0));
    static immutable byte[16] correctResult            = [-128, 127,  0,  0,  0,  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0];
    assert(res.array == correctResult);
}
5230 
/// Subtract packed 16-bit unsigned integers in `a` and `b` using unsigned saturation.
__m128i _mm_subs_epu16(__m128i a, __m128i b) pure @trusted
{
    static if(LDC_with_saturated_intrinsics)
    {
        // LLVM's unsigned saturating-subtract intrinsic, wrapped by inteli.internals.
        return cast(__m128i) inteli_llvm_subus!short8(cast(short8)a, cast(short8)b);
    }
    else static if (GDC_with_SSE2)
    {
        return cast(__m128i) __builtin_ia32_psubusw128(cast(short8)a, cast(short8)b);
    }
    else
    {
        // Scalar fallback: reinterpret lanes as unsigned, subtract in 32-bit,
        // then saturate the difference to the [0, 65535] range.
        short[8] res; // PERF =void;
        short8 sa = cast(short8)a;
        short8 sb = cast(short8)b;
        foreach(i; 0..8)
        {
            int sum = cast(ushort)(sa.array[i]) - cast(ushort)(sb.array[i]);
            res[i] = saturateSignedIntToUnsignedShort(sum);
        }
        return _mm_loadu_si128(cast(int4*)res.ptr);
    }
}
unittest
{
    short8 R = cast(short8) _mm_subs_epu16(_mm_setr_epi16(cast(short)65534,  1, 5, 4, 3, 2, 1, 0),
                                           _mm_setr_epi16(cast(short)65535, 16, 4, 4, 3, 0, 1, 0));
    static immutable short[8] correct =                  [               0,  0, 1, 0, 0, 2, 0, 0];
    assert(R.array == correct);
}
5262 
/// Subtract packed 8-bit unsigned integers in `a` and `b` using unsigned saturation.
__m128i _mm_subs_epu8(__m128i a, __m128i b) pure @trusted
{
    static if(LDC_with_saturated_intrinsics)
    {
        // LLVM's unsigned saturating-subtract intrinsic, wrapped by inteli.internals.
        return cast(__m128i) inteli_llvm_subus!byte16(cast(byte16)a, cast(byte16)b);
    }
    else static if (GDC_with_SSE2)
    {
        return cast(__m128i) __builtin_ia32_psubusb128(cast(ubyte16) a, cast(ubyte16) b);
    }
    else
    {
        // Scalar fallback: reinterpret lanes as unsigned, subtract in wider
        // precision, then saturate the difference to the [0, 255] range.
        ubyte[16] res; // PERF =void;
        byte16 sa = cast(byte16)a;
        byte16 sb = cast(byte16)b;
        foreach(i; 0..16)
            res[i] = saturateSignedWordToUnsignedByte(cast(ubyte)(sa.array[i]) - cast(ubyte)(sb.array[i]));
        return _mm_loadu_si128(cast(int4*)res.ptr);
    }
}
unittest
{
    byte16 res = cast(byte16) _mm_subs_epu8(_mm_setr_epi8(cast(byte)254, 127, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0),
                                            _mm_setr_epi8(cast(byte)255, 120, 14, 42, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0));
    static immutable byte[16] correctResult =            [            0,   7,  0,  0,  0,  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0];
    assert(res.array == correctResult);
}
5291 
5292 // Note: the only difference between these intrinsics is the signalling
5293 //       behaviour of quiet NaNs. This is incorrect but the case where
5294 //       you would want to differentiate between qNaN and sNaN and then
5295 //       treat them differently on purpose seems extremely rare.
5296 alias _mm_ucomieq_sd = _mm_comieq_sd; ///
5297 alias _mm_ucomige_sd = _mm_comige_sd; ///
5298 alias _mm_ucomigt_sd = _mm_comigt_sd; ///
5299 alias _mm_ucomile_sd = _mm_comile_sd; ///
5300 alias _mm_ucomilt_sd = _mm_comilt_sd; ///
5301 alias _mm_ucomineq_sd = _mm_comineq_sd; ///
5302 
/// Return vector of type `__m128d` with undefined elements.
__m128d _mm_undefined_pd() pure @safe
{
    pragma(inline, true);
    __m128d result = void; // intentionally uninitialized, per the intrinsic's contract
    return result;
}
5310 
/// Return vector of type `__m128i` with undefined elements.
__m128i _mm_undefined_si128() pure @safe
{
    pragma(inline, true);
    __m128i result = void; // intentionally uninitialized, per the intrinsic's contract
    return result;
}
5318 
/// Unpack and interleave 16-bit integers from the high half of `a` and `b`.
__m128i _mm_unpackhi_epi16 (__m128i a, __m128i b) pure @trusted
{
    static if (DMD_with_DSIMD)
    {
        return cast(__m128i) __simd(XMM.PUNPCKHWD, a, b);
    }
    else static if (GDC_with_SSE2)
    {
        return cast(__m128i) __builtin_ia32_punpckhwd128(cast(short8) a, cast(short8) b);
    }
    else static if (LDC_with_optimizations)
    {
        // shufflevector picking lanes a4,b4,a5,b5,a6,b6,a7,b7
        // (indices 8..15 refer to the second operand).
        enum ir = `%r = shufflevector <8 x i16> %0, <8 x i16> %1, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
                   ret <8 x i16> %r`;
        return cast(__m128i) LDCInlineIR!(ir, short8, short8, short8)(cast(short8)a, cast(short8)b);
    }
    else static if (DMD_with_32bit_asm || LDC_with_x86_asm)
    {
        asm pure nothrow @nogc @trusted
        {
            movdqu XMM0, a;
            movdqu XMM1, b;
            punpckhwd XMM0, XMM1;
            movdqu a, XMM0;
        }
        return a;
    }   
    else
    {
        // Scalar fallback: interleave the upper four 16-bit lanes of each input.
        short8 r = void;
        short8 sa = cast(short8)a;
        short8 sb = cast(short8)b;
        r.ptr[0] = sa.array[4];
        r.ptr[1] = sb.array[4];
        r.ptr[2] = sa.array[5];
        r.ptr[3] = sb.array[5];
        r.ptr[4] = sa.array[6];
        r.ptr[5] = sb.array[6];
        r.ptr[6] = sa.array[7];
        r.ptr[7] = sb.array[7];
        return cast(__m128i)r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi16(4,   5,  6,  7,  8,  9, 10, 11);
    __m128i B = _mm_setr_epi16(12, 13, 14, 15, 16, 17, 18, 19);
    short8 C = cast(short8)(_mm_unpackhi_epi16(A, B));
    short[8] correct = [8, 16, 9, 17, 10, 18, 11, 19];
    assert(C.array == correct);
}
5371 
/// Unpack and interleave 32-bit integers from the high half of `a` and `b`.
__m128i _mm_unpackhi_epi32 (__m128i a, __m128i b) pure @trusted
{
    static if (DMD_with_DSIMD)
    {
        // DMD core.simd path: emit PUNPCKHDQ directly.
        return cast(__m128i) __simd(XMM.PUNPCKHDQ, a, b);
    }
    else static if (GDC_with_SSE2)
    {
        // GDC has a dedicated SSE2 builtin for this instruction.
        return __builtin_ia32_punpckhdq128(a, b);
    }
    else static if (LDC_with_optimizations)
    {
        // shufflevector lane indices 4..7 address the second operand.
        enum ir = `%r = shufflevector <4 x i32> %0, <4 x i32> %1, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
                   ret <4 x i32> %r`;
        return LDCInlineIR!(ir, int4, int4, int4)(cast(int4)a, cast(int4)b);
    }
    else
    {
        // Portable scalar fallback: result is [a2, b2, a3, b3].
        __m128i res = void;
        res.ptr[0] = a.array[2];
        res.ptr[1] = b.array[2];
        res.ptr[2] = a.array[3];
        res.ptr[3] = b.array[3];
        return res;
    }
}
unittest
{
    // High halves [3,4] and [7,8] interleave to [3,7,4,8].
    __m128i x = _mm_setr_epi32(1, 2, 3, 4);
    __m128i y = _mm_setr_epi32(5, 6, 7, 8);
    __m128i res = _mm_unpackhi_epi32(x, y);
    int[4] expected = [3, 7, 4, 8];
    assert(res.array == expected);
}
5407 
/// Unpack and interleave 64-bit integers from the high half of `a` and `b`.
__m128i _mm_unpackhi_epi64 (__m128i a, __m128i b) pure @trusted
{
    static if (GDC_with_SSE2)
    {
        return cast(__m128i) __builtin_ia32_punpckhqdq128(cast(long2) a, cast(long2) b);
    }
    else
    {
        // Portable fallback: result is [high 64 bits of a, high 64 bits of b].
        long2 la = cast(long2)a;
        long2 lb = cast(long2)b;
        long2 r = void;
        r.ptr[0] = la.array[1];
        r.ptr[1] = lb.array[1];
        return cast(__m128i)r;
    }
}
unittest // Issue #36
{
    // Keeps the upper 64-bit lane of each operand.
    __m128i x = _mm_setr_epi64(0x22222222_22222222, 0x33333333_33333333);
    __m128i y = _mm_setr_epi64(0x44444444_44444444, 0x55555555_55555555);
    long2 res = cast(long2)(_mm_unpackhi_epi64(x, y));
    long[2] expected = [0x33333333_33333333, 0x55555555_55555555];
    assert(res.array == expected);
}
5431 
/// Unpack and interleave 8-bit integers from the high half of `a` and `b`.
/// Result lanes: [a8, b8, a9, b9, ..., a15, b15].
__m128i _mm_unpackhi_epi8 (__m128i a, __m128i b) pure @trusted
{
    static if (DMD_with_DSIMD)
    {
        // DMD core.simd path: emit PUNPCKHBW directly.
        return cast(__m128i) __simd(XMM.PUNPCKHBW, a, b);
    }
    else static if (GDC_with_SSE2)
    {
        // GDC has a dedicated SSE2 builtin for this instruction.
        return cast(__m128i) __builtin_ia32_punpckhbw128(cast(ubyte16)a, cast(ubyte16)b);
    }
    else static if (LDC_with_optimizations)
    {
        // shufflevector: indices 16..31 address lanes of the second operand,
        // so this interleaves bytes 8..15 of `a` with bytes 8..15 of `b`.
        enum ir = `%r = shufflevector <16 x i8> %0, <16 x i8> %1, <16 x i32> <i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
                   ret <16 x i8> %r`;
        return cast(__m128i)LDCInlineIR!(ir, byte16, byte16, byte16)(cast(byte16)a, cast(byte16)b);
    }
    else static if (DMD_with_32bit_asm || LDC_with_x86_asm)
    {
        // Inline x86 fallback; the result is stored back into `a` and returned.
        asm pure nothrow @nogc @trusted
        {
            movdqu XMM0, a;
            movdqu XMM1, b;
            punpckhbw XMM0, XMM1;
            movdqu a, XMM0;
        }
        return a;
    }
    else
    {
        // Generic scalar fallback: unrolled byte-by-byte interleave.
        byte16 r = void;
        byte16 ba = cast(byte16)a;
        byte16 bb = cast(byte16)b;
        r.ptr[0] = ba.array[8];
        r.ptr[1] = bb.array[8];
        r.ptr[2] = ba.array[9];
        r.ptr[3] = bb.array[9];
        r.ptr[4] = ba.array[10];
        r.ptr[5] = bb.array[10];
        r.ptr[6] = ba.array[11];
        r.ptr[7] = bb.array[11];
        r.ptr[8] = ba.array[12];
        r.ptr[9] = bb.array[12];
        r.ptr[10] = ba.array[13];
        r.ptr[11] = bb.array[13];
        r.ptr[12] = ba.array[14];
        r.ptr[13] = bb.array[14];
        r.ptr[14] = ba.array[15];
        r.ptr[15] = bb.array[15];
        return cast(__m128i)r;
    }
}
unittest
{
    // Interleave bytes 8..15 of each operand.
    __m128i x = _mm_setr_epi8( 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15);
    __m128i y = _mm_setr_epi8(16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31);
    byte16 res = cast(byte16)(_mm_unpackhi_epi8(x, y));
    byte[16] expected = [8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31];
    assert(res.array == expected);
}
5492 
/// Unpack and interleave double-precision (64-bit) floating-point elements from the high half of `a` and `b`.
__m128d _mm_unpackhi_pd (__m128d a, __m128d b) pure @trusted
{
    // PERF DMD D_SIMD
    static if (GDC_with_SSE2)
    {
        // GDC builtin maps to UNPCKHPD.
        return __builtin_ia32_unpckhpd(a, b);
    }
    else static if (LDC_with_optimizations)
    {
        // Lane index 3 addresses element 1 of the second operand.
        enum ir = `%r = shufflevector <2 x double> %0, <2 x double> %1, <2 x i32> <i32 1, i32 3>
                   ret <2 x double> %r`;
        return LDCInlineIR!(ir, double2, double2, double2)(a, b);
    }
    else
    {
        // Scalar fallback: result is [a1, b1].
        double2 hi = void;
        hi.ptr[0] = a.array[1];
        hi.ptr[1] = b.array[1];
        return hi;
    }
}
unittest
{
    // Upper doubles 6.0 and 9.0 are paired.
    __m128d x = _mm_setr_pd(4.0, 6.0);
    __m128d y = _mm_setr_pd(7.0, 9.0);
    __m128d res = _mm_unpackhi_pd(x, y);
    double[2] expected = [6.0, 9.0];
    assert(res.array == expected);
}
5523 
/// Unpack and interleave 16-bit integers from the low half of `a` and `b`.
/// Result lanes: [a0, b0, a1, b1, a2, b2, a3, b3].
__m128i _mm_unpacklo_epi16 (__m128i a, __m128i b) pure @trusted
{
    static if (DMD_with_DSIMD)
    {
        // DMD core.simd path: emit PUNPCKLWD directly.
        return cast(__m128i) __simd(XMM.PUNPCKLWD, a, b);
    }
    else static if (GDC_with_SSE2)
    {
        // GDC has a dedicated SSE2 builtin for this instruction.
        return cast(__m128i) __builtin_ia32_punpcklwd128(cast(short8) a, cast(short8) b);
    }
    else static if (LDC_with_optimizations)
    {
        // shufflevector: indices 8..15 address lanes of the second operand,
        // so this interleaves words 0..3 of `a` with words 0..3 of `b`.
        enum ir = `%r = shufflevector <8 x i16> %0, <8 x i16> %1, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
            ret <8 x i16> %r`;
        return cast(__m128i) LDCInlineIR!(ir, short8, short8, short8)(cast(short8)a, cast(short8)b);
    }
    else static if (DMD_with_32bit_asm || LDC_with_x86_asm)
    {
        // Inline x86 fallback; the result is stored back into `a` and returned.
        asm pure nothrow @nogc @trusted
        {
            movdqu XMM0, a;
            movdqu XMM1, b;
            punpcklwd XMM0, XMM1;
            movdqu a, XMM0;
        }
        return a;
    }
    else
    {
        // Generic scalar fallback: unrolled word-by-word interleave.
        short8 r = void;
        short8 sa = cast(short8)a;
        short8 sb = cast(short8)b;
        r.ptr[0] = sa.array[0];
        r.ptr[1] = sb.array[0];
        r.ptr[2] = sa.array[1];
        r.ptr[3] = sb.array[1];
        r.ptr[4] = sa.array[2];
        r.ptr[5] = sb.array[2];
        r.ptr[6] = sa.array[3];
        r.ptr[7] = sb.array[3];
        return cast(__m128i)r;
    }
}
unittest
{
    // Interleave words 0..3 of each operand.
    __m128i x = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
    __m128i y = _mm_setr_epi16(8, 9, 10, 11, 12, 13, 14, 15);
    short8 res = cast(short8)(_mm_unpacklo_epi16(x, y));
    short[8] expected = [0, 8, 1, 9, 2, 10, 3, 11];
    assert(res.array == expected);
}
5576 
/// Unpack and interleave 32-bit integers from the low half of `a` and `b`.
/// Result lanes: [a0, b0, a1, b1].
__m128i _mm_unpacklo_epi32 (__m128i a, __m128i b) pure @trusted
{
    static if (DMD_with_DSIMD)
    {
        // DMD core.simd path: emit PUNPCKLDQ directly.
        return cast(__m128i) __simd(XMM.PUNPCKLDQ, a, b);
    }
    else static if (GDC_with_SSE2)
    {
        // GDC has a dedicated SSE2 builtin for this instruction.
        return __builtin_ia32_punpckldq128(a, b);
    }
    else static if (LDC_with_optimizations)
    {
        // shufflevector lane indices 4..7 address the second operand.
        enum ir = `%r = shufflevector <4 x i32> %0, <4 x i32> %1, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
            ret <4 x i32> %r`;
        return LDCInlineIR!(ir, int4, int4, int4)(cast(int4)a, cast(int4)b);
    }
    else
    {
        // Portable scalar fallback.
        // `= void` skips the redundant zero-initialization: every lane is
        // assigned below. This matches the sibling unpack fallbacks in this
        // file, which already use `= void` for the same reason.
        __m128i r = void;
        r.ptr[0] = a.array[0];
        r.ptr[1] = b.array[0];
        r.ptr[2] = a.array[1];
        r.ptr[3] = b.array[1];
        return r;
    }
}
unittest
{
    // Low halves [1,2] and [5,6] interleave to [1,5,2,6].
    __m128i x = _mm_setr_epi32(1, 2, 3, 4);
    __m128i y = _mm_setr_epi32(5, 6, 7, 8);
    __m128i res = _mm_unpacklo_epi32(x, y);
    int[4] expected = [1, 5, 2, 6];
    assert(res.array == expected);
}
5612 
/// Unpack and interleave 64-bit integers from the low half of `a` and `b`.
__m128i _mm_unpacklo_epi64 (__m128i a, __m128i b) pure @trusted
{
    static if (GDC_with_SSE2)
    {
        return cast(__m128i) __builtin_ia32_punpcklqdq128(cast(long2) a, cast(long2) b);
    }
    else
    {
        // Portable fallback: result is [low 64 bits of a, low 64 bits of b].
        long2 lA = cast(long2)a;
        long2 lB = cast(long2)b;
        // `= void` resolves the former "PERF =void" note: both lanes are
        // assigned immediately below, so zero-initialization was wasted work.
        long2 R = void;
        R.ptr[0] = lA.array[0];
        R.ptr[1] = lB.array[0];
        return cast(__m128i)R;
    }
}
unittest // Issue #36
{
    // Keeps the lower 64-bit lane of each operand.
    __m128i x = _mm_setr_epi64(0x22222222_22222222, 0x33333333_33333333);
    __m128i y = _mm_setr_epi64(0x44444444_44444444, 0x55555555_55555555);
    long2 res = cast(long2)(_mm_unpacklo_epi64(x, y));
    long[2] expected = [0x22222222_22222222, 0x44444444_44444444];
    assert(res.array == expected);
}
5638 
/// Unpack and interleave 8-bit integers from the low half of `a` and `b`.
/// Result lanes: [a0, b0, a1, b1, ..., a7, b7].
__m128i _mm_unpacklo_epi8 (__m128i a, __m128i b) pure @trusted
{
    static if (DMD_with_DSIMD)
    {
        // DMD core.simd path: emit PUNPCKLBW directly.
        return cast(__m128i) __simd(XMM.PUNPCKLBW, a, b);
    }
    else static if (GDC_with_SSE2)
    {
        // GDC has a dedicated SSE2 builtin for this instruction.
        return cast(__m128i) __builtin_ia32_punpcklbw128(cast(ubyte16) a, cast(ubyte16) b);
    }
    else static if (LDC_with_optimizations)
    {
        // shufflevector: indices 16..31 address lanes of the second operand,
        // so this interleaves bytes 0..7 of `a` with bytes 0..7 of `b`.
        enum ir = `%r = shufflevector <16 x i8> %0, <16 x i8> %1, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23>
            ret <16 x i8> %r`;
        return cast(__m128i)LDCInlineIR!(ir, byte16, byte16, byte16)(cast(byte16)a, cast(byte16)b);
    }
    else static if (DMD_with_32bit_asm || LDC_with_x86_asm)
    {
        // Inline x86 fallback; the result is stored back into `a` and returned.
        asm pure nothrow @nogc @trusted
        {
            movdqu XMM0, a;
            movdqu XMM1, b;
            punpcklbw XMM0, XMM1;
            movdqu a, XMM0;
        }
        return a;
    }
    else
    {
        // Generic scalar fallback: unrolled byte-by-byte interleave.
        byte16 r = void;
        byte16 ba = cast(byte16)a;
        byte16 bb = cast(byte16)b;
        r.ptr[0] = ba.array[0];
        r.ptr[1] = bb.array[0];
        r.ptr[2] = ba.array[1];
        r.ptr[3] = bb.array[1];
        r.ptr[4] = ba.array[2];
        r.ptr[5] = bb.array[2];
        r.ptr[6] = ba.array[3];
        r.ptr[7] = bb.array[3];
        r.ptr[8] = ba.array[4];
        r.ptr[9] = bb.array[4];
        r.ptr[10] = ba.array[5];
        r.ptr[11] = bb.array[5];
        r.ptr[12] = ba.array[6];
        r.ptr[13] = bb.array[6];
        r.ptr[14] = ba.array[7];
        r.ptr[15] = bb.array[7];
        return cast(__m128i)r;
    }
}
unittest
{
    // Interleave bytes 0..7 of each operand.
    __m128i x = _mm_setr_epi8( 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15);
    __m128i y = _mm_setr_epi8(16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31);
    byte16 res = cast(byte16)(_mm_unpacklo_epi8(x, y));
    byte[16] expected = [0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23];
    assert(res.array == expected);
}
5699 
/// Unpack and interleave double-precision (64-bit) floating-point elements from the low half of `a` and `b`.
__m128d _mm_unpacklo_pd (__m128d a, __m128d b) pure @trusted
{
    // PERF DMD D_SIMD
    static if (GDC_with_SSE2)
    {
        // GDC builtin maps to UNPCKLPD.
        return __builtin_ia32_unpcklpd(a, b);
    }
    else static if (LDC_with_optimizations)
    {
        // Lane index 2 addresses element 0 of the second operand.
        enum ir = `%r = shufflevector <2 x double> %0, <2 x double> %1, <2 x i32> <i32 0, i32 2>
                   ret <2 x double> %r`;
        return LDCInlineIR!(ir, double2, double2, double2)(a, b);
    }
    else
    {
        // Scalar fallback: result is [a0, b0].
        double2 lo = void;
        lo.ptr[0] = a.array[0];
        lo.ptr[1] = b.array[0];
        return lo;
    }
}
unittest
{
    // Lower doubles 4.0 and 7.0 are paired.
    __m128d x = _mm_setr_pd(4.0, 6.0);
    __m128d y = _mm_setr_pd(7.0, 9.0);
    __m128d res = _mm_unpacklo_pd(x, y);
    double[2] expected = [4.0, 7.0];
    assert(res.array == expected);
}
5730 
/// Compute the bitwise XOR of packed double-precision (64-bit) floating-point elements in `a` and `b`.
__m128d _mm_xor_pd (__m128d a, __m128d b) pure @safe
{
    // Reinterpret as integer lanes; XOR is purely bitwise, so the lane
    // width chosen for the cast does not affect the result.
    return cast(__m128d)(cast(long2)a ^ cast(long2)b);
}
unittest
{
    // XOR of x and -x leaves only the sign bit set in each lane.
    __m128d x = _mm_setr_pd(-4.0, 6.0);
    __m128d y = _mm_setr_pd(4.0, -6.0);
    long2 res = cast(long2)(_mm_xor_pd(x, y));
    long[2] expected = [long.min, long.min];
    assert(res.array == expected);
}
5744 
/// Compute the bitwise XOR of 128 bits (representing integer data) in `a` and `b`.
__m128i _mm_xor_si128 (__m128i a, __m128i b) pure @safe
{
    // Element-wise vector XOR; equivalent to PXOR.
    __m128i r = a;
    r ^= b;
    return r;
}
unittest
{
    __m128i x = _mm_setr_epi64(975394, 619809709);
    __m128i y = _mm_setr_epi64(-920275025, -6);
    long2 res = cast(long2)(_mm_xor_si128(x, y));
    long[2] expected = [975394 ^ (-920275025L), 619809709L ^ -6];
    assert(res.array == expected);
}
5758 
unittest
{
    // Usage example: Euclidean distance between two 4-element vectors,
    // computed with SSE intrinsics from this module.
    float distance(float[4] a, float[4] b) nothrow @nogc
    {
        __m128 va = _mm_loadu_ps(a.ptr);
        __m128 vb = _mm_loadu_ps(b.ptr);
        __m128 diffSquared = _mm_sub_ps(va, vb);
        diffSquared = _mm_mul_ps(diffSquared, diffSquared);
        // Horizontal sum of the four squared differences.
        // NOTE(review): _mm_srli_ps!N appears to shift the whole vector right
        // by N bytes (library helper, not a standard Intel intrinsic) — the
        // two shift/add steps fold all four lanes into lane 0; confirm against
        // the helper's definition.
        __m128 sum = _mm_add_ps(diffSquared, _mm_srli_ps!8(diffSquared));
        sum = _mm_add_ps(sum, _mm_srli_ps!4(sum));
        return _mm_cvtss_f32(_mm_sqrt_ss(sum));
    }
    assert(distance([0, 2, 0, 0], [0, 0, 0, 0]) == 2);
}