1 /**
2 * SSE2 intrinsics. 
3 * https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#techs=SSE2
4 *
5 * Copyright: Copyright Guillaume Piolat 2016-2020, Stefanos Baziotis 2019.
6 * License:   $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
7 */
8 module inteli.emmintrin;
9 
10 public import inteli.types;
11 public import inteli.xmmintrin; // SSE2 includes SSE1
12 import inteli.mmx;
13 import inteli.internals;
14 
15 nothrow @nogc:
16 
17 
18 // SSE2 instructions
19 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#techs=SSE2
20 
21 /// Add packed 16-bit integers in `a` and `b`.
22 __m128i _mm_add_epi16 (__m128i a, __m128i b) pure @safe
23 {
24     pragma(inline, true);
25     return cast(__m128i)(cast(short8)a + cast(short8)b);
26 }
27 unittest
28 {
29     __m128i A = _mm_setr_epi16(4, 8, 13, -7, -1, 0, 9, 77);
30     short8 R = cast(short8) _mm_add_epi16(A, A);
31     short[8] correct = [8, 16, 26, -14, -2, 0, 18, 154];
32     assert(R.array == correct);
33 }
34 
35 /// Add packed 32-bit integers in `a` and `b`.
36 __m128i _mm_add_epi32 (__m128i a, __m128i b) pure @safe
37 {
38     pragma(inline, true);
39     return cast(__m128i)(cast(int4)a + cast(int4)b);
40 }
41 unittest
42 {
43     __m128i A = _mm_setr_epi32( -7, -1, 0, 9);
44     int4 R = _mm_add_epi32(A, A);
45     int[4] correct = [ -14, -2, 0, 18 ];
46     assert(R.array == correct);
47 }
48 
49 /// Add packed 64-bit integers in `a` and `b`.
50 __m128i _mm_add_epi64 (__m128i a, __m128i b) pure @safe
51 {
52     pragma(inline, true);
53     return cast(__m128i)(cast(long2)a + cast(long2)b);
54 }
55 unittest
56 {
57     __m128i A = _mm_setr_epi64(-1, 0x8000_0000_0000_0000);
58     long2 R = cast(long2) _mm_add_epi64(A, A);
59     long[2] correct = [ -2, 0 ];
60     assert(R.array == correct);
61 }
62 
63 /// Add packed 8-bit integers in `a` and `b`.
64 __m128i _mm_add_epi8 (__m128i a, __m128i b) pure @safe
65 {
66     pragma(inline, true);
67     return cast(__m128i)(cast(byte16)a + cast(byte16)b);
68 }
69 unittest
70 {
71     __m128i A = _mm_setr_epi8(4, 8, 13, -7, -1, 0, 9, 77, 4, 8, 13, -7, -1, 0, 9, 78);
72     byte16 R = cast(byte16) _mm_add_epi8(A, A);
73     byte[16] correct = [8, 16, 26, -14, -2, 0, 18, -102, 8, 16, 26, -14, -2, 0, 18, -100];
74     assert(R.array == correct);
75 }
76 
77 /// Add the lower double-precision (64-bit) floating-point element 
78 /// in `a` and `b`, store the result in the lower element of result, 
79 /// and copy the upper element from `a` to the upper element of result. 
80 __m128d _mm_add_sd(__m128d a, __m128d b) pure @safe
81 {
82     static if (DMD_with_DSIMD)
83     {
84         return cast(__m128d) __simd(XMM.ADDSD, a, b);
85     }
86     else static if (GDC_with_SSE2)
87     {
88         return __builtin_ia32_addsd(a, b);
89     }
90     else version(DigitalMars)
91     {
92         // Work-around for https://issues.dlang.org/show_bug.cgi?id=19599
93         // Note: this is probably unneeded since DMD >= 2.094.0, but that hasn't been investigated again.
94         asm pure nothrow @nogc @trusted { nop;}
95         a[0] = a[0] + b[0];
96         return a;
97     }
98     else
99     {
100         a[0] += b[0];
101         return a;
102     }
103 }
104 unittest
105 {
106     __m128d a = [1.5, -2.0];
107     a = _mm_add_sd(a, a);
108     assert(a.array == [3.0, -2.0]);
109 }
110 
111 /// Add packed double-precision (64-bit) floating-point elements in `a` and `b`.
112 __m128d _mm_add_pd (__m128d a, __m128d b) pure @safe
113 {
114     pragma(inline, true);
115     return a + b;
116 }
117 unittest
118 {
119     __m128d a = [1.5, -2.0];
120     a = _mm_add_pd(a, a);
121     assert(a.array == [3.0, -4.0]);
122 }
123 
124 /// Add 64-bit integers `a` and `b`.
125 __m64 _mm_add_si64 (__m64 a, __m64 b) pure @safe
126 {
127     // PERF DMD
128     pragma(inline, true);
129     return a + b;
130 }
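unittest
{
    // Minimal check, built with the to_m64/to_m128i helpers already used in this module.
    __m64 A = to_m64(_mm_setr_epi32(12, 0, 0, 0));
    __m64 B = to_m64(_mm_setr_epi32(30, 0, 0, 0));
    long2 R = cast(long2) to_m128i(_mm_add_si64(A, B));
    assert(R.array[0] == 42);
}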
131 
132 /// Add packed 16-bit integers in `a` and `b` using signed saturation.
133 __m128i _mm_adds_epi16(__m128i a, __m128i b) pure @trusted
134 {
135     static if (DMD_with_DSIMD)
136     {
137         return cast(__m128i) __simd(XMM.PADDSW, a, b);
138     }
139     else static if (GDC_with_SSE2)
140     {
141         return cast(__m128i) __builtin_ia32_paddsw128(cast(short8)a, cast(short8)b);
142     }
143     else version(LDC)
144     {
145         return cast(__m128i) inteli_llvm_adds!short8(cast(short8)a, cast(short8)b);
146     }
147     else
148     {
149         short[8] res; // PERF =void;
150         short8 sa = cast(short8)a;
151         short8 sb = cast(short8)b;
152         foreach(i; 0..8)
153             res[i] = saturateSignedIntToSignedShort(sa.array[i] + sb.array[i]);
154         return _mm_loadu_si128(cast(int4*)res.ptr);
155     }
156 }
157 unittest
158 {
159     short8 res = cast(short8) _mm_adds_epi16(_mm_setr_epi16( 7,  6,  5, -32768, 3, 3, 32767,   0),
160                                              _mm_setr_epi16( 7,  6,  5, -30000, 3, 1,     1, -10));
161     static immutable short[8] correctResult             =  [14, 12, 10, -32768, 6, 4, 32767, -10];
162     assert(res.array == correctResult);
163 }
164 
165 /// Add packed 8-bit signed integers in `a` and `b` using signed saturation.
166 __m128i _mm_adds_epi8(__m128i a, __m128i b) pure @trusted
167 {
168     static if (DMD_with_DSIMD)
169     {
170         return cast(__m128i) __simd(XMM.PADDSB, a, b);
171     }
172     else static if (GDC_with_SSE2)
173     {
174         return cast(__m128i) __builtin_ia32_paddsb128(cast(ubyte16)a, cast(ubyte16)b);
175     }
176     else version(LDC)
177     {
178         return cast(__m128i) inteli_llvm_adds!byte16(cast(byte16)a, cast(byte16)b);
179     }
180     else
181     {
182         byte[16] res; // PERF =void;
183         byte16 sa = cast(byte16)a;
184         byte16 sb = cast(byte16)b;
185         foreach(i; 0..16)
186             res[i] = saturateSignedWordToSignedByte(sa[i] + sb[i]);
187         return _mm_loadu_si128(cast(int4*)res.ptr);
188     }
189 }
190 unittest
191 {
192     byte16 res = cast(byte16) _mm_adds_epi8(_mm_set_epi8(15, 14, 13, 12, 11, 127, 9, 8, 7, 6, 5, -128, 3, 2, 1, 0),
193                                             _mm_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, -4, 3, 2, 1, 0));
194     static immutable byte[16] correctResult = [0, 2, 4, 6, -128, 10, 12, 14,
195                                                16, 18, 127, 22, 24, 26, 28, 30];
196     assert(res.array == correctResult);
197 }
198 
199 /// Add packed 8-bit unsigned integers in `a` and `b` using unsigned saturation.
200 __m128i _mm_adds_epu8(__m128i a, __m128i b) pure @trusted
201 {
202     static if (DMD_with_DSIMD)
203     {
204         return cast(__m128i) __simd(XMM.PADDUSB, a, b);
205     }
206     else static if (GDC_with_SSE2)
207     {
208         return cast(__m128i) __builtin_ia32_paddusb128(cast(ubyte16)a, cast(ubyte16)b);
209     }
210     else version(LDC)
211     {
212         return cast(__m128i) inteli_llvm_addus!byte16(cast(byte16)a, cast(byte16)b);
213     }
214     else
215     {
216         ubyte[16] res; // PERF =void;
217         byte16 sa = cast(byte16)a;
218         byte16 sb = cast(byte16)b;
219         foreach(i; 0..16)
220             res[i] = saturateSignedWordToUnsignedByte(cast(ubyte)(sa.array[i]) + cast(ubyte)(sb.array[i]));
221         return _mm_loadu_si128(cast(int4*)res.ptr);
222     }
223 }
224 unittest
225 {
226     byte16 res = cast(byte16) 
227         _mm_adds_epu8(_mm_set_epi8(7, 6, 5, 4, 3, 2, cast(byte)255, 0, 7, 6, 5, 4, 3, 2, cast(byte)255, 0),
228                       _mm_set_epi8(7, 6, 5, 4, 3, 2, 1, 0, 7, 6, 5, 4, 3, 2, 1, 0));
229     static immutable byte[16] correctResult = [0, cast(byte)255, 4, 6, 8, 10, 12, 14, 
230                                                0, cast(byte)255, 4, 6, 8, 10, 12, 14];
231     assert(res.array == correctResult);
232 }
233 
234 /// Add packed unsigned 16-bit integers in `a` and `b` using unsigned saturation.
235 __m128i _mm_adds_epu16(__m128i a, __m128i b) pure @trusted
236 {
237     static if (DMD_with_DSIMD)
238     {
239         // Note: DMD generates paddusw with reversed operands compared to LDC and GDC, but that doesn't change the result anyway.
240         return cast(__m128i) __simd(XMM.PADDUSW, a, b);
241     }
242     else static if (GDC_with_SSE2)
243     {
244         return cast(__m128i) __builtin_ia32_paddusw128(cast(short8)a, cast(short8)b);
245     }
246     else version(LDC)
247     {
248         return cast(__m128i) inteli_llvm_addus!short8(cast(short8)a, cast(short8)b);
249     }
250     else
251     {
252         ushort[8] res; // PERF =void;
253         short8 sa = cast(short8)a;
254         short8 sb = cast(short8)b;
255         foreach(i; 0..8)
256             res[i] = saturateSignedIntToUnsignedShort(cast(ushort)(sa.array[i]) + cast(ushort)(sb.array[i]));
257         return _mm_loadu_si128(cast(int4*)res.ptr);
258     }
259 }
260 unittest
261 {
262     short8 res = cast(short8) _mm_adds_epu16(_mm_set_epi16(3, 2, cast(short)65535, 0, 3, 2, cast(short)65535, 0),
263                                              _mm_set_epi16(3, 2, 1, 0, 3, 2, 1, 0));
264     static immutable short[8] correctResult = [0, cast(short)65535, 4, 6, 0, cast(short)65535, 4, 6];
265     assert(res.array == correctResult);
266 }
267 
268 /// Compute the bitwise AND of packed double-precision (64-bit) 
269 /// floating-point elements in `a` and `b`.
270 __m128d _mm_and_pd (__m128d a, __m128d b) pure @safe
271 {
272     pragma(inline, true);
273     return cast(__m128d)( cast(long2)a & cast(long2)b );
274 }
275 unittest
276 {
277     double a = 4.32;
278     double b = -78.99;
279     long correct = (*cast(long*)(&a)) & (*cast(long*)(&b));
280     __m128d A = _mm_set_pd(a, b);
281     __m128d B = _mm_set_pd(b, a);
282     long2 R = cast(long2)( _mm_and_pd(A, B) );
283     assert(R.array[0] == correct);
284     assert(R.array[1] == correct);
285 }
286 
287 /// Compute the bitwise AND of 128 bits (representing integer data) in `a` and `b`.
288 __m128i _mm_and_si128 (__m128i a, __m128i b) pure @safe
289 {
290     pragma(inline, true);
291     return a & b;
292 }
293 unittest
294 {
295     __m128i A = _mm_set1_epi32(7);
296     __m128i B = _mm_set1_epi32(14);
297     __m128i R = _mm_and_si128(A, B);
298     int[4] correct = [6, 6, 6, 6];
299     assert(R.array == correct);
300 }
301 
302 /// Compute the bitwise NOT of packed double-precision (64-bit) 
303 /// floating-point elements in `a` and then AND with `b`.
304 __m128d _mm_andnot_pd (__m128d a, __m128d b) pure @safe
305 {
306     static if (DMD_with_DSIMD)
307     {
308         return cast(__m128d) __simd(XMM.ANDNPD, a, b);
309     }
310     else
311     {
312         return cast(__m128d)( ~(cast(long2)a) & cast(long2)b);
313     }
314 }
315 unittest
316 {
317     double a = 4.32;
318     double b = -78.99;
319     long correct  = (~*cast(long*)(&a)) & ( *cast(long*)(&b));
320     long correct2 = ( *cast(long*)(&a)) & (~*cast(long*)(&b));
321     __m128d A = _mm_setr_pd(a, b);
322     __m128d B = _mm_setr_pd(b, a);
323     long2 R = cast(long2)( _mm_andnot_pd(A, B) );
324     assert(R.array[0] == correct);
325     assert(R.array[1] == correct2);
326 }
327 
328 /// Compute the bitwise NOT of 128 bits (representing integer data) 
329 /// in `a` and then AND with `b`.
330 __m128i _mm_andnot_si128 (__m128i a, __m128i b) pure @safe
331 {
332     static if (DMD_with_DSIMD)
333     {
334         return cast(__m128i) __simd(XMM.PANDN, a, b);
335     }
336     else
337     {
338         return (~a) & b;
339     }
340 }
341 unittest
342 {
343     __m128i A = _mm_setr_epi32(7, -2, 9, 54654);
344     __m128i B = _mm_setr_epi32(14, 78, 111, -256);
345     __m128i R = _mm_andnot_si128(A, B);
346     int[4] correct = [8, 0, 102, -54784];
347     assert(R.array == correct);
348 }
349 
350 /// Average packed unsigned 16-bit integers in `a` and `b`.
351 __m128i _mm_avg_epu16 (__m128i a, __m128i b) pure @trusted
352 {
353     static if (DMD_with_DSIMD)
354     {
355         return cast(__m128i) __simd(XMM.PAVGW, a, b);
356     }
357     else static if (GDC_with_SSE2)
358     {
359         return cast(__m128i) __builtin_ia32_pavgw128(cast(short8)a, cast(short8)b);
360     }
361     else static if (LDC_with_ARM64)
362     {
363         return cast(__m128i) vrhadd_u16(cast(short8)a, cast(short8)b);
364     }
365     else version(LDC)
366     {
367         // Generates pavgw even in LDC 1.0, even at -O0,
368         // but not on ARM.
369         enum ir = `
370             %ia = zext <8 x i16> %0 to <8 x i32>
371             %ib = zext <8 x i16> %1 to <8 x i32>
372             %isum = add <8 x i32> %ia, %ib
373             %isum1 = add <8 x i32> %isum, < i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
374             %isums = lshr <8 x i32> %isum1, < i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
375             %r = trunc <8 x i32> %isums to <8 x i16>
376             ret <8 x i16> %r`;
377         return cast(__m128i) LDCInlineIR!(ir, short8, short8, short8)(cast(short8)a, cast(short8)b);
378     }
379     else
380     {
381         short8 sa = cast(short8)a;
382         short8 sb = cast(short8)b;
383         short8 sr = void;
384         foreach(i; 0..8)
385         {
386             sr.ptr[i] = cast(ushort)( (cast(ushort)(sa.array[i]) + cast(ushort)(sb.array[i]) + 1) >> 1 );
387         }
388         return cast(int4)sr;
389     }
390 }
391 unittest
392 {
393     __m128i A = _mm_set1_epi16(31);
394     __m128i B = _mm_set1_epi16(64);
395     short8 avg = cast(short8)(_mm_avg_epu16(A, B));
396     foreach(i; 0..8)
397         assert(avg.array[i] == 48);
398 }
399 
400 /// Average packed unsigned 8-bit integers in `a` and `b`.
401 __m128i _mm_avg_epu8 (__m128i a, __m128i b) pure @trusted
402 {
403     static if (DMD_with_DSIMD)
404     {
405         return cast(__m128i) __simd(XMM.PAVGB, a, b);
406     }
407     else static if (GDC_with_SSE2)
408     {
409         return cast(__m128i) __builtin_ia32_pavgb128(cast(ubyte16)a, cast(ubyte16)b);
410     }
411     else static if (LDC_with_ARM64)
412     {
413         return cast(__m128i) vrhadd_u8(cast(byte16)a, cast(byte16)b);
414     }
415     else version(LDC)
416     {
417         // Generates pavgb even in LDC 1.0, even at -O0,
418         // but not on ARM.
419         enum ir = `
420             %ia = zext <16 x i8> %0 to <16 x i16>
421             %ib = zext <16 x i8> %1 to <16 x i16>
422             %isum = add <16 x i16> %ia, %ib
423             %isum1 = add <16 x i16> %isum, < i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
424             %isums = lshr <16 x i16> %isum1, < i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
425             %r = trunc <16 x i16> %isums to <16 x i8>
426             ret <16 x i8> %r`;
427         return cast(__m128i) LDCInlineIR!(ir, byte16, byte16, byte16)(cast(byte16)a, cast(byte16)b);
428     }
429     else
430     {
431         byte16 sa = cast(byte16)a;
432         byte16 sb = cast(byte16)b;
433         byte16 sr = void;
434         foreach(i; 0..16)
435         {
436             sr[i] = cast(ubyte)( (cast(ubyte)(sa[i]) + cast(ubyte)(sb[i]) + 1) >> 1 );
437         }
438         return cast(int4)sr;
439     }
440 }
441 unittest
442 {
443     __m128i A = _mm_set1_epi8(31);
444     __m128i B = _mm_set1_epi8(64);
445     byte16 avg = cast(byte16)(_mm_avg_epu8(A, B));
446     foreach(i; 0..16)
447         assert(avg.array[i] == 48);
448 }
449 
450 /// Shift `a` left by `bytes` bytes while shifting in zeros.
451 alias _mm_bslli_si128 = _mm_slli_si128;
452 unittest
453 {
454     __m128i toShift = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
455     byte[16] exact =               [0, 0, 0, 0, 0, 0, 1, 2, 3, 4,  5,  6,  7,  8,  9, 10];
456     __m128i result = _mm_bslli_si128!5(toShift);
457     assert( (cast(byte16)result).array == exact);
458 }
459 
460 /// Shift `v` right by `bytes` bytes while shifting in zeros.
461 alias _mm_bsrli_si128 = _mm_srli_si128;
462 unittest
463 {
464     __m128i toShift = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
465     byte[16] exact =               [5, 6, 7, 8, 9,10,11,12,13,14, 15,  0,  0,  0,  0,  0];
466     __m128i result = _mm_bsrli_si128!5(toShift);
467     assert( (cast(byte16)result).array == exact);
468 }
469 
470 /// Cast vector of type `__m128d` to type `__m128`. 
471 /// Note: Also possible with a regular `cast(__m128)(a)`.
472 __m128 _mm_castpd_ps (__m128d a) pure @safe
473 {
474     return cast(__m128)a;
475 }
476 
477 /// Cast vector of type `__m128d` to type `__m128i`. 
478 /// Note: Also possible with a regular `cast(__m128i)(a)`.
479 __m128i _mm_castpd_si128 (__m128d a) pure @safe
480 {
481     return cast(__m128i)a;
482 }
483 
484 /// Cast vector of type `__m128` to type `__m128d`. 
485 /// Note: Also possible with a regular `cast(__m128d)(a)`.
486 __m128d _mm_castps_pd (__m128 a) pure @safe
487 {
488     return cast(__m128d)a;
489 }
490 
491 /// Cast vector of type `__m128` to type `__m128i`. 
492 /// Note: Also possible with a regular `cast(__m128i)(a)`.
493 __m128i _mm_castps_si128 (__m128 a) pure @safe
494 {
495     return cast(__m128i)a;
496 }
497 
498 /// Cast vector of type `__m128i` to type `__m128d`. 
499 /// Note: Also possible with a regular `cast(__m128d)(a)`.
500 __m128d _mm_castsi128_pd (__m128i a) pure @safe
501 {
502     return cast(__m128d)a;
503 }
504 
505 /// Cast vector of type `__m128i` to type `__m128`. 
506 /// Note: Also possible with a regular `cast(__m128)(a)`.
507 __m128 _mm_castsi128_ps (__m128i a) pure @safe
508 {
509     return cast(__m128)a;
510 }
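unittest
{
    // These casts only reinterpret bits; a round-trip through __m128i preserves the value.
    __m128d A = _mm_setr_pd(4.0, -8.0);
    __m128d B = _mm_castsi128_pd(_mm_castpd_si128(A));
    assert(B.array == A.array);
}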
511 
512 /// Invalidate and flush the cache line that contains `p` 
513 /// from all levels of the cache hierarchy.
514 void _mm_clflush (const(void)* p) @trusted
515 {
516     static if (GDC_with_SSE2)
517     {
518         __builtin_ia32_clflush(p);
519     }
520     else static if (LDC_with_SSE2)
521     {
522         __builtin_ia32_clflush(cast(void*)p);
523     }
524     else version(D_InlineAsm_X86)
525     {
526         asm pure nothrow @nogc @safe
527         {
528             mov EAX, p;
529             clflush [EAX];
530         }
531     }
532     else version(D_InlineAsm_X86_64)
533     {
534         asm pure nothrow @nogc @safe
535         {
536             mov RAX, p;
537             clflush [RAX];
538         }
539     }
540     else 
541     {
542         // Do nothing. Invalidating cacheline does
543         // not affect correctness.
544     }
545 }
546 unittest
547 {
548     ubyte[64] cacheline;
549     _mm_clflush(cacheline.ptr);
550 }
551 
552 /// Compare packed 16-bit integers in `a` and `b` for equality.
553 __m128i _mm_cmpeq_epi16 (__m128i a, __m128i b) pure @safe
554 {
555     static if (GDC_with_SSE2)
556     {
557         return cast(__m128i) __builtin_ia32_pcmpeqw128(cast(short8)a, cast(short8)b);
558     }
559     else
560     {
561         return cast(__m128i) equalMask!short8(cast(short8)a, cast(short8)b);
562     }
563 }
564 unittest
565 {
566     short8   A = [-3, -2, -1,  0,  0,  1,  2,  3];
567     short8   B = [ 4,  3,  2,  1,  0, -1, -2, -3];
568     short[8] E = [ 0,  0,  0,  0, -1,  0,  0,  0];
569     short8   R = cast(short8)(_mm_cmpeq_epi16(cast(__m128i)A, cast(__m128i)B));
570     assert(R.array == E);
571 }
572 
573 /// Compare packed 32-bit integers in `a` and `b` for equality.
574 __m128i _mm_cmpeq_epi32 (__m128i a, __m128i b) pure @safe
575 {
576     static if (GDC_with_SSE2)
577     {
578         return __builtin_ia32_pcmpeqd128(a, b);
579     }
580     else
581     {
582         return equalMask!__m128i(a, b);
583     }
584 }
585 unittest
586 {
587     int4   A = [-3, -2, -1,  0];
588     int4   B = [ 4, -2,  2,  0];
589     int[4] E = [ 0, -1,  0, -1];
590     int4   R = cast(int4)(_mm_cmpeq_epi32(A, B));
591     assert(R.array == E);
592 }
593 
594 /// Compare packed 8-bit integers in `a` and `b` for equality.
595 __m128i _mm_cmpeq_epi8 (__m128i a, __m128i b) pure @safe
596 {
597     static if (GDC_with_SSE2)
598     {
599         return cast(__m128i) __builtin_ia32_pcmpeqb128(cast(ubyte16)a, cast(ubyte16)b);
600     }
601     else
602     {
603         return cast(__m128i) equalMask!byte16(cast(byte16)a, cast(byte16)b);
604     }
605 }
606 unittest
607 {
608     __m128i A = _mm_setr_epi8(1, 2, 3, 1, 2, 1, 1, 2, 3, 2, 1, 0, 0, 1, 2, 1);
609     __m128i B = _mm_setr_epi8(2, 2, 1, 2, 3, 1, 2, 3, 2, 1, 0, 0, 1, 2, 1, 1);
610     byte16 C = cast(byte16) _mm_cmpeq_epi8(A, B);
611     byte[16] correct =       [0,-1, 0, 0, 0,-1, 0, 0, 0, 0, 0,-1, 0, 0, 0, -1];
612     assert(C.array == correct);
613 }
614 
615 /// Compare packed double-precision (64-bit) floating-point elements 
616 /// in `a` and `b` for equality.
617 __m128d _mm_cmpeq_pd (__m128d a, __m128d b) pure @safe
618 {
619     static if (GDC_with_SSE2)
620     {
621         return __builtin_ia32_cmpeqpd(a, b);
622     }
623     else
624     {
625         return cast(__m128d) cmppd!(FPComparison.oeq)(a, b);
626     }
627 }
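unittest
{
    // Equal lanes yield an all-ones mask (-1 when read as long), other lanes yield 0.
    __m128d A = _mm_setr_pd(1.0, 2.0);
    __m128d B = _mm_setr_pd(1.0, 3.0);
    long2 R = cast(long2) _mm_cmpeq_pd(A, B);
    assert(R.array[0] == -1);
    assert(R.array[1] == 0);
}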
628 
629 /// Compare the lower double-precision (64-bit) floating-point elements
630 /// in `a` and `b` for equality, store the result in the lower element,
631 /// and copy the upper element from `a`.
632 __m128d _mm_cmpeq_sd (__m128d a, __m128d b) pure @safe
633 {
634     static if (GDC_with_SSE2)
635     {
636         return __builtin_ia32_cmpeqsd(a, b);
637     }
638     else
639     {
640         return cast(__m128d) cmpsd!(FPComparison.oeq)(a, b);
641     }
642 }
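unittest
{
    // Only the lower lane is compared; the upper lane is copied from `a`.
    __m128d A = _mm_setr_pd(1.0, 2.0);
    __m128d B = _mm_setr_pd(1.0, 3.0);
    __m128d R = _mm_cmpeq_sd(A, B);
    assert((cast(long2)R).array[0] == -1);
    assert(R.array[1] == 2.0);
}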
643 
644 /// Compare packed 16-bit integer elements in `a` and `b` for greater-than-or-equal.
645 /// #BONUS
646 __m128i _mm_cmpge_epi16 (__m128i a, __m128i b) pure @safe
647 {
648     version (LDC)
649     {
650         // LDC ARM64: generates cmge since -O1
651         return cast(__m128i) greaterOrEqualMask!short8(cast(short8)a, cast(short8)b);
652     }
653     else
654     {        
655         return _mm_xor_si128(_mm_cmpeq_epi16(a, b), _mm_cmpgt_epi16(a, b));
656     }
657 }
658 unittest
659 {
660     short8   A = [-3, -2, -32768,  0,  0,  1,  2,  3];
661     short8   B = [ 4,  3,  32767,  1,  0, -1, -2, -3];
662     short[8] E = [ 0,  0,      0,  0,  -1, -1, -1, -1];
663     short8   R = cast(short8)(_mm_cmpge_epi16(cast(__m128i)A, cast(__m128i)B));
664     assert(R.array == E);
665 }
666 
667 /// Compare packed double-precision (64-bit) floating-point elements 
668 /// in `a` and `b` for greater-than-or-equal.
669 __m128d _mm_cmpge_pd (__m128d a, __m128d b) pure @safe
670 {
671     static if (GDC_with_SSE2)
672     {
673         return __builtin_ia32_cmpgepd(a, b);
674     }
675     else
676     {
677         return cast(__m128d) cmppd!(FPComparison.oge)(a, b);
678     }
679 }
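unittest
{
    __m128d A = _mm_setr_pd(1.0, 2.0);
    __m128d B = _mm_setr_pd(1.0, 3.0);
    long2 R = cast(long2) _mm_cmpge_pd(A, B);
    assert(R.array[0] == -1); // 1.0 >= 1.0
    assert(R.array[1] == 0);  // 2.0 >= 3.0 is false
}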
680 
681 /// Compare the lower double-precision (64-bit) floating-point elements 
682 /// in `a` and `b` for greater-than-or-equal, store the result in the 
683 /// lower element, and copy the upper element from `a`.
684 __m128d _mm_cmpge_sd (__m128d a, __m128d b) pure @safe
685 {
686     // Note: There is no __builtin_ia32_cmpgesd builtin.
687     static if (GDC_with_SSE2)
688     {
689         return __builtin_ia32_cmpnltsd(b, a);
690     }
691     else
692     {
693         return cast(__m128d) cmpsd!(FPComparison.oge)(a, b);
694     }
695 }
696 
697 /// Compare packed 16-bit integers in `a` and `b` for greater-than.
698 __m128i _mm_cmpgt_epi16 (__m128i a, __m128i b) pure @safe
699 {
700     static if (GDC_with_SSE2)
701     {
702         return cast(__m128i) __builtin_ia32_pcmpgtw128(cast(short8)a, cast(short8)b);
703     }
704     else
705     {
706         return cast(__m128i) greaterMask!short8(cast(short8)a, cast(short8)b);
707     }
708 }
709 unittest
710 {
711     short8   A = [-3, -2, -1,  0,  0,  1,  2,  3];
712     short8   B = [ 4,  3,  2,  1,  0, -1, -2, -3];
713     short[8] E = [ 0,  0,  0,  0,  0, -1, -1, -1];
714     short8   R = cast(short8)(_mm_cmpgt_epi16(cast(__m128i)A, cast(__m128i)B));
715     assert(R.array == E);
716 }
717 
718 /// Compare packed 32-bit integers in `a` and `b` for greater-than.
719 __m128i _mm_cmpgt_epi32 (__m128i a, __m128i b) pure @safe
720 {
721     static if (GDC_with_SSE2)
722     {
723         return __builtin_ia32_pcmpgtd128(a, b); 
724     }
725     else
726     {
727         return cast(__m128i)( greaterMask!int4(a, b));
728     }
729 }
730 unittest
731 {
732     int4   A = [-3,  2, -1,  0];
733     int4   B = [ 4, -2,  2,  0];
734     int[4] E = [ 0, -1,  0,  0];
735     int4   R = cast(int4)(_mm_cmpgt_epi32(A, B));
736     assert(R.array == E);
737 }
738 
739 /// Compare packed 8-bit integers in `a` and `b` for greater-than.
740 __m128i _mm_cmpgt_epi8 (__m128i a, __m128i b) pure @safe
741 {
742     // Work-around for a GCC bug here.
743     // The GCC builtin __builtin_ia32_pcmpgtb128 is buggy and generates a weird
744     // (and wrong) instruction sequence, so it isn't used.
745     // GCC's own emmintrin.h uses vector comparison operators we don't have instead.
746     // PERF: this is a quite severe GDC performance problem.
747     // It could be worked around with inline assembly, or with another algorithm.
748   
749   /*
750     static if (GDC_with_SSE2)
751     {
752         return cast(__m128i) __builtin_ia32_pcmpgtb128(cast(ubyte16)a, cast(ubyte16)b);
753     }
754     else */
755     {
756         return cast(__m128i) greaterMask!byte16(cast(byte16)a, cast(byte16)b);
757     }
758 }
759 unittest
760 {
761     __m128i A = _mm_setr_epi8(1, 2, 3, 1,  127, -80, 1, 2, 3, 2, 1, 0, 0, 1, 2, 1);
762     __m128i B = _mm_setr_epi8(2, 2, 1, 2, -128, -42, 2, 3, 2, 1, 0, 0, 1, 2, 1, 1);
763     byte16 C = cast(byte16) _mm_cmpgt_epi8(A, B);
764     byte[16] correct =       [0, 0,-1, 0,   -1,   0, 0, 0,-1,-1,-1, 0, 0, 0,-1, 0];
765     __m128i D = _mm_cmpeq_epi8(A, B);
766     assert(C.array == correct);
767 }
768 
769 /// Compare packed double-precision (64-bit) floating-point elements 
770 /// in `a` and `b` for greater-than.
771 __m128d _mm_cmpgt_pd (__m128d a, __m128d b) pure @safe
772 {
773     static if (GDC_with_SSE2)
774     {
775         return __builtin_ia32_cmpgtpd(a, b); 
776     }
777     else
778     {
779         return cast(__m128d) cmppd!(FPComparison.ogt)(a, b);
780     }
781 }
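unittest
{
    __m128d A = _mm_setr_pd(1.0, 3.0);
    __m128d B = _mm_setr_pd(1.0, 2.0);
    long2 R = cast(long2) _mm_cmpgt_pd(A, B);
    assert(R.array[0] == 0);  // 1.0 > 1.0 is false
    assert(R.array[1] == -1); // 3.0 > 2.0
}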
782 
783 /// Compare the lower double-precision (64-bit) floating-point elements 
784 /// in `a` and `b` for greater-than, store the result in the lower element,
785 /// and copy the upper element from `a`.
786 __m128d _mm_cmpgt_sd (__m128d a, __m128d b) pure @safe
787 {
788     // Note: There is no __builtin_ia32_cmpgtsd builtin.
789     static if (GDC_with_SSE2)
790     {
791         return __builtin_ia32_cmpnlesd(b, a);
792     }
793     else
794     {
795         return cast(__m128d) cmpsd!(FPComparison.ogt)(a, b);
796     }
797 }
798 
799 /// Compare packed 16-bit integer elements in `a` and `b` for less-than-or-equal.
800 /// #BONUS
801 __m128i _mm_cmple_epi16 (__m128i a, __m128i b) pure @safe
802 {
803     version (LDC)
804     {
805         // LDC ARM64: generates cmge since -O1
806         return cast(__m128i) greaterOrEqualMask!short8(cast(short8)b, cast(short8)a);
807     }
808     else
809     {
810         return _mm_xor_si128(_mm_cmpeq_epi16(b, a), _mm_cmpgt_epi16(b, a));
811     }
812 }
813 unittest
814 {
815     short8   A = [-3, -2, -32768,  1,  0,  1,  2,  3];
816     short8   B = [ 4,  3,  32767,  0,  0, -1, -2, -3];
817     short[8] E = [-1, -1,     -1,  0,  -1, 0,  0,  0];
818     short8   R = cast(short8)(_mm_cmple_epi16(cast(__m128i)A, cast(__m128i)B));
819     assert(R.array == E);
820 }
821 
822 /// Compare packed double-precision (64-bit) floating-point elements 
823 /// in `a` and `b` for less-than-or-equal.
824 __m128d _mm_cmple_pd (__m128d a, __m128d b) pure @safe
825 {
826     static if (GDC_with_SSE2)
827     {
828         return __builtin_ia32_cmplepd(a, b); 
829     }
830     else
831     {
832         return cast(__m128d) cmppd!(FPComparison.ole)(a, b);
833     }
834 }
835 
836 /// Compare the lower double-precision (64-bit) floating-point elements 
837 /// in `a` and `b` for less-than-or-equal, store the result in the 
838 /// lower element, and copy the upper element from `a`.
839 __m128d _mm_cmple_sd (__m128d a, __m128d b) pure @safe
840 {
841     static if (GDC_with_SSE2)
842     {
843         return __builtin_ia32_cmplesd(a, b); 
844     }
845     else
846     {
847         return cast(__m128d) cmpsd!(FPComparison.ole)(a, b);
848     }
849 }
850 
851 /// Compare packed 16-bit integers in `a` and `b` for less-than.
852 __m128i _mm_cmplt_epi16 (__m128i a, __m128i b) pure @safe
853 {
854     return _mm_cmpgt_epi16(b, a);
855 }
856 
857 /// Compare packed 32-bit integers in `a` and `b` for less-than.
858 __m128i _mm_cmplt_epi32 (__m128i a, __m128i b) pure @safe
859 {
860     return _mm_cmpgt_epi32(b, a);
861 }
862 
863 /// Compare packed 8-bit integers in `a` and `b` for less-than.
864 __m128i _mm_cmplt_epi8 (__m128i a, __m128i b) pure @safe
865 {
866     return _mm_cmpgt_epi8(b, a);
867 }
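unittest
{
    // Same mask convention as the other integer comparisons: -1 where a < b, 0 elsewhere.
    __m128i A = _mm_setr_epi8(-1, 0, 1, 2, -1, 0, 1, 2, -1, 0, 1, 2, -1, 0, 1, 2);
    __m128i B = _mm_setr_epi8( 0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0);
    byte16 R = cast(byte16) _mm_cmplt_epi8(A, B);
    byte[16] correct = [-1, 0, 0, 0, -1, 0, 0, 0, -1, 0, 0, 0, -1, 0, 0, 0];
    assert(R.array == correct);
}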
868 
869 /// Compare packed double-precision (64-bit) floating-point elements
870 /// in `a` and `b` for less-than.
871 __m128d _mm_cmplt_pd (__m128d a, __m128d b) pure @safe
872 {
873     static if (GDC_with_SSE2)
874     {
875         return __builtin_ia32_cmpltpd(a, b); 
876     }
877     else
878     {
879         return cast(__m128d) cmppd!(FPComparison.olt)(a, b);
880     }
881 }
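unittest
{
    __m128d A = _mm_setr_pd(1.0, 3.0);
    __m128d B = _mm_setr_pd(2.0, 3.0);
    long2 R = cast(long2) _mm_cmplt_pd(A, B);
    assert(R.array[0] == -1); // 1.0 < 2.0
    assert(R.array[1] == 0);  // 3.0 < 3.0 is false
}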
882 
883 /// Compare the lower double-precision (64-bit) floating-point elements
884 /// in `a` and `b` for less-than, store the result in the lower 
885 /// element, and copy the upper element from `a`.
886 __m128d _mm_cmplt_sd (__m128d a, __m128d b) pure @safe
887 {
888     static if (GDC_with_SSE2)
889     {
890         return __builtin_ia32_cmpltsd(a, b); 
891     }
892     else
893     {
894         return cast(__m128d) cmpsd!(FPComparison.olt)(a, b);
895     }
896 }
897 
898 /// Compare packed double-precision (64-bit) floating-point elements
899 /// in `a` and `b` for not-equal.
900 __m128d _mm_cmpneq_pd (__m128d a, __m128d b) pure @safe
901 {
902     static if (GDC_with_SSE2)
903     {
904         return __builtin_ia32_cmpneqpd(a, b); 
905     }
906     else
907     {
908         return cast(__m128d) cmppd!(FPComparison.une)(a, b);
909     }
910 }
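unittest
{
    // Not-equal is an unordered comparison, so a NaN operand makes the lane compare true.
    __m128d A = _mm_setr_pd(1.0, double.nan);
    __m128d B = _mm_setr_pd(1.0, 3.0);
    long2 R = cast(long2) _mm_cmpneq_pd(A, B);
    assert(R.array[0] == 0);
    assert(R.array[1] == -1);
}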
911 
912 /// Compare the lower double-precision (64-bit) floating-point elements
913 /// in `a` and `b` for not-equal, store the result in the lower 
914 /// element, and copy the upper element from `a`.
915 __m128d _mm_cmpneq_sd (__m128d a, __m128d b) pure @safe
916 {
917     static if (GDC_with_SSE2)
918     {
919         return __builtin_ia32_cmpneqsd(a, b); 
920     }
921     else
922     {
923         return cast(__m128d) cmpsd!(FPComparison.une)(a, b);
924     }
925 }
926 
927 /// Compare packed double-precision (64-bit) floating-point elements 
928 /// in `a` and `b` for not-greater-than-or-equal.
929 __m128d _mm_cmpnge_pd (__m128d a, __m128d b) pure @safe
930 {
931     static if (GDC_with_SSE2)
932     {
933         return __builtin_ia32_cmpngepd(a, b); 
934     }
935     else
936     {
937         return cast(__m128d) cmppd!(FPComparison.ult)(a, b);
938     }
939 }
940 
941 /// Compare the lower double-precision (64-bit) floating-point elements 
942 /// in `a` and `b` for not-greater-than-or-equal, store the result in 
943 /// the lower element, and copy the upper element from `a`.
944 __m128d _mm_cmpnge_sd (__m128d a, __m128d b) pure @safe
945 {
946     // Note: There is no __builtin_ia32_cmpngesd builtin.
947     static if (GDC_with_SSE2)
948     {
949         return __builtin_ia32_cmpltsd(b, a); 
950     }
951     else
952     {
953         return cast(__m128d) cmpsd!(FPComparison.ult)(a, b);
954     }
955 }
956 
957 /// Compare packed double-precision (64-bit) floating-point elements 
958 /// in `a` and `b` for not-greater-than.
959 __m128d _mm_cmpngt_pd (__m128d a, __m128d b) pure @safe
960 {
961     static if (GDC_with_SSE2)
962     {
963         return __builtin_ia32_cmpngtpd(a, b);
964     }
965     else
966     {
967         return cast(__m128d) cmppd!(FPComparison.ule)(a, b);
968     }
969 }
970 
971 /// Compare the lower double-precision (64-bit) floating-point elements 
972 /// in `a` and `b` for not-greater-than, store the result in the 
973 /// lower element, and copy the upper element from `a`.
974 __m128d _mm_cmpngt_sd (__m128d a, __m128d b) pure @safe
975 {
976     // Note: There is no __builtin_ia32_cmpngtsd builtin.
977     static if (GDC_with_SSE2)
978     {
979         return __builtin_ia32_cmplesd(b, a);
980     }
981     else
982     {
983         return cast(__m128d) cmpsd!(FPComparison.ule)(a, b);
984     }
985 }
986 
987 /// Compare packed double-precision (64-bit) floating-point elements 
988 /// in `a` and `b` for not-less-than-or-equal.
989 __m128d _mm_cmpnle_pd (__m128d a, __m128d b) pure @safe
990 {
991     static if (GDC_with_SSE2)
992     {
993         return __builtin_ia32_cmpnlepd(a, b);
994     }
995     else
996     {
997         return cast(__m128d) cmppd!(FPComparison.ugt)(a, b);
998     }
999 }
1000 
1001 /// Compare the lower double-precision (64-bit) floating-point elements 
1002 /// in `a` and `b` for not-less-than-or-equal, store the result in the 
1003 /// lower element, and copy the upper element from `a`.
1004 __m128d _mm_cmpnle_sd (__m128d a, __m128d b) pure @safe
1005 {
1006     static if (GDC_with_SSE2)
1007     {
1008         return __builtin_ia32_cmpnlesd(a, b);
1009     }
1010     else
1011     {
1012         return cast(__m128d) cmpsd!(FPComparison.ugt)(a, b);
1013     }
1014 }
1015  
1016 /// Compare packed double-precision (64-bit) floating-point elements 
1017 /// in `a` and `b` for not-less-than.
1018 __m128d _mm_cmpnlt_pd (__m128d a, __m128d b) pure @safe
1019 {
1020     static if (GDC_with_SSE2)
1021     {
1022         return __builtin_ia32_cmpnltpd(a, b);
1023     }
1024     else
1025     {
1026         return cast(__m128d) cmppd!(FPComparison.uge)(a, b);
1027     }
1028 }
1029 
1030 /// Compare the lower double-precision (64-bit) floating-point elements 
1031 /// in `a` and `b` for not-less-than, store the result in the lower 
1032 /// element, and copy the upper element from `a`.
1033 __m128d _mm_cmpnlt_sd (__m128d a, __m128d b) pure @safe
1034 {
1035     static if (GDC_with_SSE2)
1036     {
1037         return __builtin_ia32_cmpnltsd(a, b);
1038     }
1039     else
1040     {
1041         return cast(__m128d) cmpsd!(FPComparison.uge)(a, b);
1042     }
1043 }
1044 
1045 /// Compare packed double-precision (64-bit) floating-point elements 
1046 /// in `a` and `b` to see if neither is NaN.
1047 __m128d _mm_cmpord_pd (__m128d a, __m128d b) pure @safe
1048 {
1049     static if (GDC_with_SSE2)
1050     {
1051         return __builtin_ia32_cmpordpd(a, b);
1052     }
1053     else
1054     {
1055         return cast(__m128d) cmppd!(FPComparison.ord)(a, b);
1056     }
1057 }
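unittest
{
    __m128d A = _mm_setr_pd(1.0, double.nan);
    __m128d B = _mm_setr_pd(2.0, 3.0);
    long2 R = cast(long2) _mm_cmpord_pd(A, B);
    assert(R.array[0] == -1); // both lanes ordered
    assert(R.array[1] == 0);  // NaN present => unordered
}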
1058 
1059 /// Compare the lower double-precision (64-bit) floating-point elements 
1060 /// in `a` and `b` to see if neither is NaN, store the result in the 
1061 /// lower element, and copy the upper element from `a` to the upper element.
1062 __m128d _mm_cmpord_sd (__m128d a, __m128d b) pure @safe
1063 {
1064     static if (GDC_with_SSE2)
1065     {
1066         return __builtin_ia32_cmpordsd(a, b);
1067     }
1068     else
1069     {
1070         return cast(__m128d) cmpsd!(FPComparison.ord)(a, b);
1071     }
1072 }
1073 
1074 /// Compare packed double-precision (64-bit) floating-point elements 
1075 /// in `a` and `b` to see if either is NaN.
1076 __m128d _mm_cmpunord_pd (__m128d a, __m128d b) pure @safe
1077 {
1078     static if (GDC_with_SSE2)
1079     {
1080         return __builtin_ia32_cmpunordpd(a, b);
1081     }
1082     else
1083     {
1084         return cast(__m128d) cmppd!(FPComparison.uno)(a, b);
1085     }
1086 }
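unittest
{
    __m128d A = _mm_setr_pd(1.0, double.nan);
    __m128d B = _mm_setr_pd(2.0, 3.0);
    long2 R = cast(long2) _mm_cmpunord_pd(A, B);
    assert(R.array[0] == 0);
    assert(R.array[1] == -1);
}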
1087 
1088 /// Compare the lower double-precision (64-bit) floating-point elements 
1089 /// in `a` and `b` to see if either is NaN, store the result in the lower 
1090 /// element, and copy the upper element from `a` to the upper element.
1091 __m128d _mm_cmpunord_sd (__m128d a, __m128d b) pure @safe
1092 {
1093     static if (GDC_with_SSE2)
1094     {
1095         return __builtin_ia32_cmpunordsd(a, b);
1096     }
1097     else
1098     {
1099         return cast(__m128d) cmpsd!(FPComparison.uno)(a, b);
1100     }
1101 }
1102 
1103 /// Compare the lower double-precision (64-bit) floating-point element 
1104 /// in `a` and `b` for equality, and return the boolean result (0 or 1).
1105 int _mm_comieq_sd (__m128d a, __m128d b) pure @safe
1106 {
1107     // Note: For some of the _mm_comixx_sx intrinsics, the NaN semantics are not the same as those of the 
1108     // comisd instruction: the intrinsic returns false for unordered operands instead.
1109     //
1110     // C++ compilers actually disagree over the meaning of these intrinsics.
1111     // GCC handles NaNs like the comisd instruction (returns true if unordered), 
1112     // but ICC, clang and MSVC deal with NaN like the Intel Intrinsics Guide says.
1113     // We choose to follow the majority; GCC seems to be buggy with NaNs here.
1114     return a.array[0] == b.array[0];
1115 }
1116 unittest
1117 {
1118     assert(1 == _mm_comieq_sd(_mm_set_sd(78.0), _mm_set_sd(78.0)));
1119     assert(0 == _mm_comieq_sd(_mm_set_sd(78.0), _mm_set_sd(-78.0)));
1120     assert(0 == _mm_comieq_sd(_mm_set_sd(78.0), _mm_set_sd(double.nan)));
1121     assert(0 == _mm_comieq_sd(_mm_set_sd(double.nan), _mm_set_sd(-4.22)));
1122     assert(1 == _mm_comieq_sd(_mm_set_sd(0.0), _mm_set_sd(-0.0)));
1123 }
1124 
1125 /// Compare the lower double-precision (64-bit) floating-point element 
1126 /// in `a` and `b` for greater-than-or-equal, and return the boolean 
1127 /// result (0 or 1).
1128 int _mm_comige_sd (__m128d a, __m128d b) pure @safe
1129 {
1130     return a.array[0] >= b.array[0];
1131 }
1132 unittest
1133 {
1134     assert(1 == _mm_comige_sd(_mm_set_sd(78.0), _mm_set_sd(78.0)));
1135     assert(1 == _mm_comige_sd(_mm_set_sd(78.0), _mm_set_sd(-78.0)));
1136     assert(0 == _mm_comige_sd(_mm_set_sd(-78.0), _mm_set_sd(78.0)));
1137     assert(0 == _mm_comige_sd(_mm_set_sd(78.0), _mm_set_sd(double.nan)));
1138     assert(0 == _mm_comige_sd(_mm_set_sd(double.nan), _mm_set_sd(-4.22)));
1139     assert(1 == _mm_comige_sd(_mm_set_sd(-0.0), _mm_set_sd(0.0)));
1140 }
1141 
1142 /// Compare the lower double-precision (64-bit) floating-point element 
1143 /// in `a` and `b` for greater-than, and return the boolean result (0 or 1).
1144 int _mm_comigt_sd (__m128d a, __m128d b) pure @safe
1145 {
1146     return a.array[0] > b.array[0];
1147 }
1148 unittest
1149 {
1150     assert(0 == _mm_comigt_sd(_mm_set_sd(78.0), _mm_set_sd(78.0)));
1151     assert(1 == _mm_comigt_sd(_mm_set_sd(78.0), _mm_set_sd(-78.0)));
1152     assert(0 == _mm_comigt_sd(_mm_set_sd(78.0), _mm_set_sd(double.nan)));
1153     assert(0 == _mm_comigt_sd(_mm_set_sd(double.nan), _mm_set_sd(-4.22)));
1154     assert(0 == _mm_comigt_sd(_mm_set_sd(0.0), _mm_set_sd(-0.0)));
1155 }
1156 
1157 /// Compare the lower double-precision (64-bit) floating-point element 
1158 /// in `a` and `b` for less-than-or-equal, and return the boolean result (0 or 1).
1159 int _mm_comile_sd (__m128d a, __m128d b) pure @safe
1160 {
1161     return a.array[0] <= b.array[0];
1162 }
1163 unittest
1164 {
1165     assert(1 == _mm_comile_sd(_mm_set_sd(78.0), _mm_set_sd(78.0)));
1166     assert(0 == _mm_comile_sd(_mm_set_sd(78.0), _mm_set_sd(-78.0)));
1167     assert(1 == _mm_comile_sd(_mm_set_sd(-78.0), _mm_set_sd(78.0)));
1168     assert(0 == _mm_comile_sd(_mm_set_sd(78.0), _mm_set_sd(double.nan)));
1169     assert(0 == _mm_comile_sd(_mm_set_sd(double.nan), _mm_set_sd(-4.22)));
1170     assert(1 == _mm_comile_sd(_mm_set_sd(0.0), _mm_set_sd(-0.0)));
1171 }
1172 
1173 /// Compare the lower double-precision (64-bit) floating-point element 
1174 /// in `a` and `b` for less-than, and return the boolean result (0 or 1).
1175 int _mm_comilt_sd (__m128d a, __m128d b) pure @safe
1176 {
1177     return a.array[0] < b.array[0];
1178 }
1179 unittest
1180 {
1181     assert(0 == _mm_comilt_sd(_mm_set_sd(78.0), _mm_set_sd(78.0)));
1182     assert(0 == _mm_comilt_sd(_mm_set_sd(78.0), _mm_set_sd(-78.0)));
1183     assert(1 == _mm_comilt_sd(_mm_set_sd(-78.0), _mm_set_sd(78.0)));
1184     assert(0 == _mm_comilt_sd(_mm_set_sd(78.0), _mm_set_sd(double.nan)));
1185     assert(0 == _mm_comilt_sd(_mm_set_sd(double.nan), _mm_set_sd(-4.22)));
1186     assert(0 == _mm_comilt_sd(_mm_set_sd(-0.0), _mm_set_sd(0.0)));
1187 }
1188 
1189 /// Compare the lower double-precision (64-bit) floating-point element
1190 /// in `a` and `b` for not-equal, and return the boolean result (0 or 1).
1191 int _mm_comineq_sd (__m128d a, __m128d b) pure @safe
1192 {
1193     return a.array[0] != b.array[0];
1194 }
1195 unittest
1196 {
1197     assert(0 == _mm_comineq_sd(_mm_set_sd(78.0), _mm_set_sd(78.0)));
1198     assert(1 == _mm_comineq_sd(_mm_set_sd(78.0), _mm_set_sd(-78.0)));
1199     assert(1 == _mm_comineq_sd(_mm_set_sd(78.0), _mm_set_sd(double.nan)));
1200     assert(1 == _mm_comineq_sd(_mm_set_sd(double.nan), _mm_set_sd(-4.22)));
1201     assert(0 == _mm_comineq_sd(_mm_set_sd(0.0), _mm_set_sd(-0.0)));
1202 }
1203 
1204 /// Convert packed 32-bit integers in `a` to packed double-precision (64-bit)
1205 /// floating-point elements.
1206 __m128d _mm_cvtepi32_pd (__m128i a) pure @trusted
1207 {
1208     version(LDC)
1209     {
1210         // Generates cvtdq2pd since LDC 1.0, even without optimizations
1211         enum ir = `
1212             %v = shufflevector <4 x i32> %0,<4 x i32> %0, <2 x i32> <i32 0, i32 1>
1213             %r = sitofp <2 x i32> %v to <2 x double>
1214             ret <2 x double> %r`;
1215         return cast(__m128d) LDCInlineIR!(ir, __m128d, __m128i)(a);
1216     }
1217     else static if (GDC_with_SSE2)
1218     {
1219         return __builtin_ia32_cvtdq2pd(a);
1220     }
1221     else
1222     {
1223         double2 r = void;
1224         r.ptr[0] = a.array[0];
1225         r.ptr[1] = a.array[1];
1226         return r;
1227     }
1228 }
1229 unittest
1230 {
1231     __m128d A = _mm_cvtepi32_pd(_mm_set1_epi32(54));
1232     assert(A.array[0] == 54.0);
1233     assert(A.array[1] == 54.0);
1234 }
1235 
1236 /// Convert packed 32-bit integers in `a` to packed single-precision (32-bit) 
1237 /// floating-point elements.
1238 __m128 _mm_cvtepi32_ps(__m128i a) pure @trusted
1239 {
1240     static if (DMD_with_DSIMD)
1241     {
1242         return cast(__m128)__simd(XMM.CVTDQ2PS, cast(void16) a);
1243     }
1244     else static if (GDC_with_SSE2)
1245     {
1246         return __builtin_ia32_cvtdq2ps(a);
1247     }
1248     else version(LDC)
1249     {
1250         // See #86 for why we had to resort to LLVM IR.
1251         // Plain code below was leading to catastrophic behaviour. 
1252         // x86: Generates cvtdq2ps since LDC 1.1.0 -O0
1253         // ARM: Generates scvtf.4s since LDC 1.8.0 -O0
1254         enum ir = `
1255             %r = sitofp <4 x i32> %0 to <4 x float>
1256             ret <4 x float> %r`;
1257         return cast(__m128) LDCInlineIR!(ir, float4, int4)(a);
1258     }
1259     else
1260     {
1261         __m128 res; // PERF =void;
1262         res.ptr[0] = cast(float)a.array[0];
1263         res.ptr[1] = cast(float)a.array[1];
1264         res.ptr[2] = cast(float)a.array[2];
1265         res.ptr[3] = cast(float)a.array[3];
1266         return res;
1267     }
1268 }
1269 unittest
1270 {
1271     __m128 a = _mm_cvtepi32_ps(_mm_setr_epi32(-1, 0, 1, 1000));
1272     assert(a.array == [-1.0f, 0.0f, 1.0f, 1000.0f]);
1273 }
1274 
1275 /// Convert packed double-precision (64-bit) floating-point elements 
1276 /// in `a` to packed 32-bit integers.
1277 __m128i _mm_cvtpd_epi32 (__m128d a) @trusted
1278 {
1279     // PERF ARM32
1280     static if (LDC_with_SSE2)
1281     {
1282         return __builtin_ia32_cvtpd2dq(a);
1283     }
1284     else static if (GDC_with_SSE2)
1285     {
1286         return __builtin_ia32_cvtpd2dq(a);
1287     }
1288     else static if (LDC_with_ARM64)
1289     {
1290         // Get current rounding mode.
1291         uint fpscr = arm_get_fpcr();
1292         long2 i;
1293         switch(fpscr & _MM_ROUND_MASK_ARM)
1294         {
1295             default:
1296             case _MM_ROUND_NEAREST_ARM:     i = vcvtnq_s64_f64(a); break;
1297             case _MM_ROUND_DOWN_ARM:        i = vcvtmq_s64_f64(a); break;
1298             case _MM_ROUND_UP_ARM:          i = vcvtpq_s64_f64(a); break;
1299             case _MM_ROUND_TOWARD_ZERO_ARM: i = vcvtzq_s64_f64(a); break;
1300         }
1301         int4 zero = 0;
1302         return cast(__m128i) shufflevectorLDC!(int4, 0, 2, 4, 6)(cast(int4)i, zero);
1303     }
1304     else
1305     {
1306         // PERF ARM32
1307         __m128i r = _mm_setzero_si128();
1308         r.ptr[0] = convertDoubleToInt32UsingMXCSR(a.array[0]);
1309         r.ptr[1] = convertDoubleToInt32UsingMXCSR(a.array[1]);
1310         return r;
1311     }
1312 }
1313 unittest
1314 {
1315     int4 A = _mm_cvtpd_epi32(_mm_set_pd(61.0, 55.0));
1316     assert(A.array[0] == 55 && A.array[1] == 61 && A.array[2] == 0 && A.array[3] == 0);
1317 }
1318 
1319 /// Convert packed double-precision (64-bit) floating-point elements in `v`
1320 /// to packed 32-bit integers.
1321 __m64 _mm_cvtpd_pi32 (__m128d v) @safe
1322 {
1323     return to_m64(_mm_cvtpd_epi32(v));
1324 }
1325 unittest
1326 {
1327     int2 A = cast(int2) _mm_cvtpd_pi32(_mm_set_pd(61.0, 55.0));
1328     assert(A.array[0] == 55 && A.array[1] == 61);
1329 }
1330 
1331 /// Convert packed double-precision (64-bit) floating-point elements 
1332 /// in `a` to packed single-precision (32-bit) floating-point elements.
1333 __m128 _mm_cvtpd_ps (__m128d a) pure @trusted
1334 {
1335     static if (LDC_with_SSE2)
1336     {
1337         return __builtin_ia32_cvtpd2ps(a); // can't be done with IR unfortunately
1338     }
1339     else static if (GDC_with_SSE2)
1340     {
1341         return __builtin_ia32_cvtpd2ps(a);
1342     }
1343     else
1344     { 
1345         __m128 r = void;
1346         r.ptr[0] = a.array[0];
1347         r.ptr[1] = a.array[1];
1348         r.ptr[2] = 0;
1349         r.ptr[3] = 0;
1350         return r;
1351     }
1352 }
1353 unittest
1354 {
1355     __m128d A = _mm_set_pd(5.25, 4.0);
1356     __m128 B = _mm_cvtpd_ps(A);
1357     assert(B.array == [4.0f, 5.25f, 0, 0]);
1358 }
1359 
1360 /// Convert packed 32-bit integers in `v` to packed double-precision 
1361 /// (64-bit) floating-point elements.
1362 __m128d _mm_cvtpi32_pd (__m64 v) pure @safe
1363 {
1364     return _mm_cvtepi32_pd(to_m128i(v));
1365 }
1366 unittest
1367 {
1368     __m128d A = _mm_cvtpi32_pd(_mm_setr_pi32(4, -5));
1369     assert(A.array[0] == 4.0 && A.array[1] == -5.0);
1370 }
1371 
1372 /// Convert packed single-precision (32-bit) floating-point elements 
1373 /// in `a` to packed 32-bit integers.
1374 __m128i _mm_cvtps_epi32 (__m128 a) @trusted
1375 {
1376     static if (LDC_with_SSE2)
1377     {
1378         return cast(__m128i) __builtin_ia32_cvtps2dq(a);
1379     }
1380     else static if (GDC_with_SSE2)
1381     {
1382         return __builtin_ia32_cvtps2dq(a);
1383     }
1384     else static if (LDC_with_ARM64)
1385     {
1386         // Get current rounding mode.
1387         uint fpscr = arm_get_fpcr();
1388         switch(fpscr & _MM_ROUND_MASK_ARM)
1389         {
1390             default:
1391             case _MM_ROUND_NEAREST_ARM:     return vcvtnq_s32_f32(a);
1392             case _MM_ROUND_DOWN_ARM:        return vcvtmq_s32_f32(a);
1393             case _MM_ROUND_UP_ARM:          return vcvtpq_s32_f32(a);
1394             case _MM_ROUND_TOWARD_ZERO_ARM: return vcvtzq_s32_f32(a);
1395         }
1396     }
1397     else
1398     {
1399         __m128i r = void;
1400         r.ptr[0] = convertFloatToInt32UsingMXCSR(a.array[0]);
1401         r.ptr[1] = convertFloatToInt32UsingMXCSR(a.array[1]);
1402         r.ptr[2] = convertFloatToInt32UsingMXCSR(a.array[2]);
1403         r.ptr[3] = convertFloatToInt32UsingMXCSR(a.array[3]);
1404         return r;
1405     }
1406 }
1407 unittest
1408 {
1409     // GDC bug #98607
1410     // https://gcc.gnu.org/bugzilla/show_bug.cgi?id=98607
1411     // GDC does not provide an optimization barrier for the rounding mode.
1412     // Worked around with different literals. This bug will likely only manifest in unittests.
1413     // The GCC developers provided no actual fix and instead claim other compilers are buggy... when they aren't.
1414 
1415     uint savedRounding = _MM_GET_ROUNDING_MODE();
1416 
1417     _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST);
1418     __m128i A = _mm_cvtps_epi32(_mm_setr_ps(1.4f, -2.1f, 53.5f, -2.9f));
1419     assert(A.array == [1, -2, 54, -3]);
1420 
1421     _MM_SET_ROUNDING_MODE(_MM_ROUND_DOWN);
1422     A = _mm_cvtps_epi32(_mm_setr_ps(1.3f, -2.11f, 53.4f, -2.8f));
1423     assert(A.array == [1, -3, 53, -3]);
1424 
1425     _MM_SET_ROUNDING_MODE(_MM_ROUND_UP);
1426     A = _mm_cvtps_epi32(_mm_setr_ps(1.3f, -2.12f, 53.6f, -2.7f));
1427     assert(A.array == [2, -2, 54, -2]);
1428 
1429     _MM_SET_ROUNDING_MODE(_MM_ROUND_TOWARD_ZERO);
1430     A = _mm_cvtps_epi32(_mm_setr_ps(1.4f, -2.17f, 53.8f, -2.91f));
1431     assert(A.array == [1, -2, 53, -2]);
1432 
1433     _MM_SET_ROUNDING_MODE(savedRounding);
1434 }
1435 
1436 /// Convert packed single-precision (32-bit) floating-point elements 
1437 /// in `a` to packed double-precision (64-bit) floating-point elements.
1438 __m128d _mm_cvtps_pd (__m128 a) pure @trusted
1439 {
1440     version(LDC)
1441     {
1442         // Generates cvtps2pd since LDC 1.0 -O0
1443         enum ir = `
1444             %v = shufflevector <4 x float> %0,<4 x float> %0, <2 x i32> <i32 0, i32 1>
1445             %r = fpext <2 x float> %v to <2 x double>
1446             ret <2 x double> %r`;
1447         return cast(__m128d) LDCInlineIR!(ir, __m128d, __m128)(a);
1448     }
1449     else static if (GDC_with_SSE2)
1450     {
1451         return __builtin_ia32_cvtps2pd(a);
1452     }
1453     else
1454     {
1455         double2 r = void;
1456         r.ptr[0] = a.array[0];
1457         r.ptr[1] = a.array[1];
1458         return r;
1459     }
1460 }
1461 unittest
1462 {
1463     __m128d A = _mm_cvtps_pd(_mm_set1_ps(54.0f));
1464     assert(A.array[0] == 54.0);
1465     assert(A.array[1] == 54.0);
1466 }
1467 
1468 /// Copy the lower double-precision (64-bit) floating-point element of `a`.
1469 double _mm_cvtsd_f64 (__m128d a) pure @safe
1470 {
1471     return a.array[0];
1472 }
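unittest
{
    assert(_mm_cvtsd_f64(_mm_setr_pd(42.0, -1.0)) == 42.0);
}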
1473 
1474 /// Convert the lower double-precision (64-bit) floating-point element
1475 /// in `a` to a 32-bit integer.
1476 int _mm_cvtsd_si32 (__m128d a) @safe
1477 {
1478     static if (LDC_with_SSE2)
1479     {
1480         return __builtin_ia32_cvtsd2si(a);
1481     }
1482     else static if (GDC_with_SSE2)
1483     {
1484         return __builtin_ia32_cvtsd2si(a);
1485     }
1486     else
1487     {
1488         return convertDoubleToInt32UsingMXCSR(a[0]);
1489     }
1490 }
1491 unittest
1492 {
1493     assert(4 == _mm_cvtsd_si32(_mm_set1_pd(4.0)));
1494 }
1495 
1496 /// Convert the lower double-precision (64-bit) floating-point element in `a` to a 64-bit integer.
1497 long _mm_cvtsd_si64 (__m128d a) @trusted
1498 {
1499     version (LDC)
1500     {
1501         version (X86_64)
1502         {
1503             return __builtin_ia32_cvtsd2si64(a);
1504         }
1505         else
1506         {
1507             // Note: In 32-bit x86, there is no way to convert from float/double to 64-bit integer
1508             // using SSE instructions only. So the builtin doesn't exist for this arch.
1509             return convertDoubleToInt64UsingMXCSR(a[0]);
1510         }
1511     }
1512     else
1513     {
1514         return convertDoubleToInt64UsingMXCSR(a.array[0]);
1515     }
1516 }
1517 unittest
1518 {
1519     assert(-4 == _mm_cvtsd_si64(_mm_set1_pd(-4.0)));
1520 
1521     uint savedRounding = _MM_GET_ROUNDING_MODE();
1522 
1523     _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST);
1524     assert(-56468486186 == _mm_cvtsd_si64(_mm_set1_pd(-56468486186.49)));
1525 
1526     _MM_SET_ROUNDING_MODE(_MM_ROUND_DOWN);
1527     assert(-56468486187 == _mm_cvtsd_si64(_mm_set1_pd(-56468486186.1)));
1528 
1529     _MM_SET_ROUNDING_MODE(_MM_ROUND_UP);
1530     assert(56468486187 == _mm_cvtsd_si64(_mm_set1_pd(56468486186.1)));
1531 
1532     _MM_SET_ROUNDING_MODE(_MM_ROUND_TOWARD_ZERO);
1533     assert(-56468486186 == _mm_cvtsd_si64(_mm_set1_pd(-56468486186.9)));
1534 
1535     _MM_SET_ROUNDING_MODE(savedRounding);
1536 }
1537 
1538 deprecated("Use _mm_cvtsd_si64 instead") alias _mm_cvtsd_si64x = _mm_cvtsd_si64; ///
1539 
1540 /// Convert the lower double-precision (64-bit) floating-point element in `b` to a single-precision (32-bit) 
1541 /// floating-point element, store that in the lower element of result, and copy the upper 3 packed elements from `a`
1542 /// to the upper elements of result.
1543 __m128 _mm_cvtsd_ss (__m128 a, __m128d b) pure @trusted
1544 {
1545     static if (GDC_with_SSE2)
1546     {
1547         return __builtin_ia32_cvtsd2ss(a, b); 
1548     }
1549     else
1550     {
1551         // Generates cvtsd2ss since LDC 1.3 -O0
1552         a.ptr[0] = b.array[0];
1553         return a;
1554     }
1555 }
1556 unittest
1557 {
1558     __m128 R = _mm_cvtsd_ss(_mm_set1_ps(4.0f), _mm_set1_pd(3.0));
1559     assert(R.array == [3.0f, 4.0f, 4.0f, 4.0f]);
1560 }
1561 
1562 /// Get the lower 32-bit integer in `a`.
1563 int _mm_cvtsi128_si32 (__m128i a) pure @safe
1564 {
1565     return a.array[0];
1566 }
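unittest
{
    assert(-1 == _mm_cvtsi128_si32(_mm_setr_epi32(-1, 2, 3, 4)));
}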
1567 
1568 /// Get the lower 64-bit integer in `a`.
1569 long _mm_cvtsi128_si64 (__m128i a) pure @safe
1570 {
1571     long2 la = cast(long2)a;
1572     return la.array[0];
1573 }
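unittest
{
    // The two lower 32-bit lanes (-42, -1) form the 64-bit value -42.
    assert(-42 == _mm_cvtsi128_si64(_mm_setr_epi32(-42, -1, 3, 4)));
}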
1574 deprecated("Use _mm_cvtsi128_si64 instead") alias _mm_cvtsi128_si64x = _mm_cvtsi128_si64;
1575 
1576 /// Convert the signed 32-bit integer `b` to a double-precision (64-bit) floating-point element, store that in the 
1577 /// lower element of result, and copy the upper element from `a` to the upper element of result.
1578 __m128d _mm_cvtsi32_sd(__m128d a, int b) pure @trusted
1579 {
1580     a.ptr[0] = cast(double)b;
1581     return a;
1582 }
1583 unittest
1584 {
1585     __m128d a = _mm_cvtsi32_sd(_mm_set1_pd(0.0f), 42);
1586     assert(a.array == [42.0, 0]);
1587 }
1588 
1589 /// Copy 32-bit integer `a` to the lower element of result, and zero the upper elements.
1590 __m128i _mm_cvtsi32_si128 (int a) pure @trusted
1591 {
1592     int4 r = [0, 0, 0, 0];
1593     r.ptr[0] = a;
1594     return r;
1595 }
1596 unittest
1597 {
1598     __m128i a = _mm_cvtsi32_si128(65);
1599     assert(a.array == [65, 0, 0, 0]);
1600 }
1601 
1602 /// Convert the signed 64-bit integer `b` to a double-precision (64-bit) floating-point element, store the result in 
1603 /// the lower element of result, and copy the upper element from `a` to the upper element of result.
1605 __m128d _mm_cvtsi64_sd(__m128d a, long b) pure @trusted
1606 {
1607     a.ptr[0] = cast(double)b;
1608     return a;
1609 }
1610 unittest
1611 {
1612     __m128d a = _mm_cvtsi64_sd(_mm_set1_pd(0.0f), 42);
1613     assert(a.array == [42.0, 0]);
1614 }
1615 
1616 /// Copy 64-bit integer `a` to the lower element of result, and zero the upper element.
1617 __m128i _mm_cvtsi64_si128 (long a) pure @trusted
1618 {
1619     long2 r = [0, 0];
1620     r.ptr[0] = a;
1621     return cast(__m128i)(r);
1622 }
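unittest
{
    long2 R = cast(long2) _mm_cvtsi64_si128(-42);
    assert(R.array[0] == -42);
    assert(R.array[1] == 0);
}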
1623 
1624 deprecated("Use _mm_cvtsi64_sd instead") alias _mm_cvtsi64x_sd = _mm_cvtsi64_sd; ///
1625 deprecated("Use _mm_cvtsi64_si128 instead") alias _mm_cvtsi64x_si128 = _mm_cvtsi64_si128; ///
1626 
1627 /// Convert the lower single-precision (32-bit) floating-point element in `b` to a double-precision (64-bit) 
1628 /// floating-point element, store that in the lower element of result, and copy the upper element from `a` to the upper 
1629 /// element of result.
1630 double2 _mm_cvtss_sd(double2 a, float4 b) pure @trusted
1631 {
1632     a.ptr[0] = b.array[0];
1633     return a;
1634 }
1635 unittest
1636 {
1637     __m128d a = _mm_cvtss_sd(_mm_set1_pd(0.0f), _mm_set1_ps(42.0f));
1638     assert(a.array == [42.0, 0]);
1639 }
1640 
1641 /// Convert the lower single-precision (32-bit) floating-point element in `a` to a 64-bit integer with truncation.
1642 long _mm_cvttss_si64 (__m128 a) pure @safe
1643 {
1644     return cast(long)(a.array[0]); // Generates cvttss2si as expected
1645 }
1646 unittest
1647 {
1648     assert(1 == _mm_cvttss_si64(_mm_setr_ps(1.9f, 2.0f, 3.0f, 4.0f)));
1649 }
1650 
1651 /// Convert packed double-precision (64-bit) floating-point elements in `a` to packed 32-bit integers with truncation.
1652 /// Put zeroes in the upper elements of result.
1653 __m128i _mm_cvttpd_epi32 (__m128d a) pure @trusted
1654 {
1655     static if (LDC_with_SSE2)
1656     {
1657         return __builtin_ia32_cvttpd2dq(a);
1658     }
1659     else static if (GDC_with_SSE2)
1660     {
1661         return __builtin_ia32_cvttpd2dq(a);
1662     }
1663     else
1664     {
1665         // Note: doesn't generate cvttpd2dq as of LDC 1.13
1666         __m128i r; // PERF =void;
1667         r.ptr[0] = cast(int)a.array[0];
1668         r.ptr[1] = cast(int)a.array[1];
1669         r.ptr[2] = 0;
1670         r.ptr[3] = 0;
1671         return r;
1672     }
1673 }
1674 unittest
1675 {
1676     __m128i R = _mm_cvttpd_epi32(_mm_setr_pd(-4.9, 45641.5f));
1677     assert(R.array == [-4, 45641, 0, 0]);
1678 }
1679 
1680 /// Convert packed double-precision (64-bit) floating-point elements in `v` 
1681 /// to packed 32-bit integers with truncation.
1682 __m64 _mm_cvttpd_pi32 (__m128d v) pure @safe
1683 {
1684     return to_m64(_mm_cvttpd_epi32(v));
1685 }
1686 unittest
1687 {
1688     int2 R = cast(int2) _mm_cvttpd_pi32(_mm_setr_pd(-4.9, 45641.7f));
1689     int[2] correct = [-4, 45641];
1690     assert(R.array == correct);
1691 }
1692 
1693 /// Convert packed single-precision (32-bit) floating-point elements in `a` to packed 32-bit integers with truncation.
1694 __m128i _mm_cvttps_epi32 (__m128 a) pure @trusted
1695 {
1696     // x86: Generates cvttps2dq since LDC 1.3 -O2
1697     // ARM64: generates fcvtze since LDC 1.8 -O2
1698     __m128i r; // PERF = void;
1699     r.ptr[0] = cast(int)a.array[0];
1700     r.ptr[1] = cast(int)a.array[1];
1701     r.ptr[2] = cast(int)a.array[2];
1702     r.ptr[3] = cast(int)a.array[3];
1703     return r;
1704 }
1705 unittest
1706 {
1707     __m128i R = _mm_cvttps_epi32(_mm_setr_ps(-4.9, 45641.5f, 0.0f, 1.0f));
1708     assert(R.array == [-4, 45641, 0, 1]);
1709 }
1710 
1711 /// Convert the lower double-precision (64-bit) floating-point element in `a` to a 32-bit integer with truncation.
1712 int _mm_cvttsd_si32 (__m128d a)
1713 {
1714     // Generates cvttsd2si since LDC 1.3 -O0
1715     return cast(int)a.array[0];
1716 }
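unittest
{
    // Small sanity check with arbitrary values: conversion truncates toward zero.
    assert(-4 == _mm_cvttsd_si32(_mm_setr_pd(-4.9, 45641.5)));
}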
1717 
1718 /// Convert the lower double-precision (64-bit) floating-point element in `a` to a 64-bit integer with truncation.
1719 long _mm_cvttsd_si64 (__m128d a)
1720 {
1721     // Generates cvttsd2si since LDC 1.3 -O0
1722     // but in 32-bit instead, it's a long sequence that resort to FPU
1723     return cast(long)a.array[0];
1724 }
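unittest
{
    // Small sanity check with arbitrary values: truncation toward zero, including a value outside 32-bit range.
    assert(-4 == _mm_cvttsd_si64(_mm_setr_pd(-4.9, 0.0)));
    assert(1_000_000_000_000 == _mm_cvttsd_si64(_mm_setr_pd(1e12, 0.0)));
}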
1725 
1726 deprecated("Use _mm_cvttsd_si64 instead") alias _mm_cvttsd_si64x = _mm_cvttsd_si64; ///
1727 
1728 /// Divide packed double-precision (64-bit) floating-point elements in `a` by packed elements in `b`.
1729 __m128d _mm_div_pd(__m128d a, __m128d b) pure @safe
1730 {
1731     pragma(inline, true);
1732     return a / b;
1733 }
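unittest
{
    // Small sanity check with arbitrary values: element-wise division.
    __m128d A = _mm_setr_pd(6.0, -9.0);
    __m128d B = _mm_setr_pd(3.0, -3.0);
    __m128d R = _mm_div_pd(A, B);
    double[2] correct = [2.0, 3.0];
    assert(R.array == correct);
}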
1734 
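/// Divide the lower double-precision (64-bit) floating-point element in `a` by the lower double-precision (64-bit) 
/// floating-point element in `b`, store the result in the lower element of result, and copy the upper element from 
/// `a` to the upper element of result.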
1735 __m128d _mm_div_sd(__m128d a, __m128d b) pure @trusted
1736 {
1737     static if (GDC_with_SSE2)
1738     {
1739         return __builtin_ia32_divsd(a, b);
1740     }
1741     else version(DigitalMars)
1742     {
1743         // Work-around for https://issues.dlang.org/show_bug.cgi?id=19599
1744         // Note that this is unneeded since DMD >= 2.094.0 at least, haven't investigated again
1745         asm pure nothrow @nogc @trusted { nop;}
1746         a.array[0] = a.array[0] / b.array[0];
1747         return a;
1748     }
1749     else
1750     {
1751         a.ptr[0] /= b.array[0];
1752         return a;
1753     }
1754 }
1755 unittest
1756 {
1757     __m128d a = [2.0, 4.5];
1758     a = _mm_div_sd(a, a);
1759     assert(a.array == [1.0, 4.5]);
1760 }
1761 
1762 /// Extract a 16-bit integer from `v`, selected with `index`.
1763 /// Warning: the returned value is zero-extended to 32-bits.
1764 int _mm_extract_epi16(__m128i v, int index) pure @safe
1765 {
1766     short8 r = cast(short8)v;
1767     return cast(ushort)(r.array[index & 7]);
1768 }
1769 unittest
1770 {
1771     __m128i A = _mm_set_epi16(7, 6, 5, 4, 3, 2, 1, -1);
1772     assert(_mm_extract_epi16(A, 6) == 6);
1773     assert(_mm_extract_epi16(A, 0) == 65535);
1774     assert(_mm_extract_epi16(A, 5 + 8) == 5);
1775 }
1776 
1777 /// Copy `v`, and insert the 16-bit integer `i` at the location specified by `index`.
1778 __m128i _mm_insert_epi16 (__m128i v, int i, int index) @trusted
1779 {
1780     short8 r = cast(short8)v;
1781     r.ptr[index & 7] = cast(short)i;
1782     return cast(__m128i)r;
1783 }
1784 unittest
1785 {
1786     __m128i A = _mm_set_epi16(7, 6, 5, 4, 3, 2, 1, 0);
1787     short8 R = cast(short8) _mm_insert_epi16(A, 42, 6);
1788     short[8] correct = [0, 1, 2, 3, 4, 5, 42, 7];
1789     assert(R.array == correct);
1790 }
1791 
1792 /// Perform a serializing operation on all load-from-memory instructions that were issued prior 
1793 /// to this instruction. Guarantees that every load instruction that precedes, in program order, 
1794 /// is globally visible before any load instruction which follows the fence in program order.
1795 void _mm_lfence() @trusted
1796 {
1797     version(GNU)
1798     {
1799         static if (GDC_with_SSE2)
1800         {
1801             __builtin_ia32_lfence();
1802         }
1803         else version(X86)
1804         {
1805             asm pure nothrow @nogc @trusted
1806             {
1807                 "lfence;\n" : : : ;
1808             }
1809         }
1810         else
1811             static assert(false);
1812     }
1813     else static if (LDC_with_SSE2)
1814     {
1815         __builtin_ia32_lfence();
1816     }
1817     else static if (LDC_with_ARM64)
1818     {
1819          __builtin_arm_dmb(9);  // dmb ishld
1820     }
1821     else static if (DMD_with_asm)
1822     {
1823         asm nothrow @nogc pure @safe
1824         {
1825             lfence;
1826         }
1827     }
1828     else version(LDC)
1829     {
        // When the architecture is unknown, generate a full memory barrier,
        // as the semantics of lfence do not really match those of atomics.
1832         llvm_memory_fence();
1833     }
1834     else
1835         static assert(false);
1836 }
1837 unittest
1838 {
1839     _mm_lfence();
1840 }
1841 
1842 /// Load 128-bits (composed of 2 packed double-precision (64-bit) floating-point elements) from memory.
1843 /// `mem_addr` must be aligned on a 16-byte boundary or a general-protection exception may be generated.
1844 __m128d _mm_load_pd (const(double) * mem_addr) pure
1845 {
1846     pragma(inline, true);
1847     __m128d* aligned = cast(__m128d*)mem_addr;
1848     return *aligned;
1849 }
1850 unittest
1851 {
1852     align(16) double[2] S = [-5.0, 7.0];
1853     __m128d R = _mm_load_pd(S.ptr);
1854     assert(R.array == S);
1855 }
1856 
1857 /// Load a double-precision (64-bit) floating-point element from memory into both elements of dst.
1858 /// `mem_addr` does not need to be aligned on any particular boundary.
1859 __m128d _mm_load_pd1 (const(double)* mem_addr) pure
1860 {
1861     double m = *mem_addr;
1862     __m128d r; // PERF =void;
1863     r.ptr[0] = m;
1864     r.ptr[1] = m;
1865     return r;
1866 }
1867 unittest
1868 {
1869     double what = 4;
1870     __m128d R = _mm_load_pd1(&what);
1871     double[2] correct = [4.0, 4];
1872     assert(R.array == correct);
1873 }
1874 
1875 /// Load a double-precision (64-bit) floating-point element from memory into the lower of result, and zero the upper 
1876 /// element. `mem_addr` does not need to be aligned on any particular boundary.
1877 __m128d _mm_load_sd (const(double)* mem_addr) pure @trusted
1878 {
1879     double2 r = [0, 0];
1880     r.ptr[0] = *mem_addr;
1881     return r;
1882 }
1883 unittest
1884 {
1885     double x = -42;
1886     __m128d a = _mm_load_sd(&x);
1887     assert(a.array == [-42.0, 0.0]);
1888 }
1889 
1890 /// Load 128-bits of integer data from memory into dst. 
1891 /// `mem_addr` must be aligned on a 16-byte boundary or a general-protection exception may be generated.
1892 __m128i _mm_load_si128 (const(__m128i)* mem_addr) pure @safe
1893 {
1894     pragma(inline, true);
1895     return *mem_addr;
1896 }
1897 unittest
1898 {
1899     align(16) int[4] correct = [-1, 2, 3, 4];
1900     int4 A = cast(int4) _mm_load_si128(cast(__m128i*) correct.ptr);
1901     assert(A.array == correct);
1902 }
1903 
1904 alias _mm_load1_pd = _mm_load_pd1; ///
1905 
1906 /// Load a double-precision (64-bit) floating-point element from memory into the upper element of result, and copy the 
1907 /// lower element from `a` to result. `mem_addr` does not need to be aligned on any particular boundary.
1908 __m128d _mm_loadh_pd (__m128d a, const(double)* mem_addr) pure @trusted
1909 {
1910     pragma(inline, true);
1911     a.ptr[1] = *mem_addr;
1912     return a;
1913 }
1914 unittest
1915 {
1916     double A = 7.0;
1917     __m128d B = _mm_setr_pd(4.0, -5.0);
1918     __m128d R = _mm_loadh_pd(B, &A);
1919     double[2] correct = [ 4.0, 7.0 ];
1920     assert(R.array == correct);
1921 }
1922 
/// Load 64-bit integer from memory into the first element of result. Zero out the other element.
/// Note: strange signature, since the memory doesn't have to be aligned and only needs to point to addressable 64-bit,
/// not 128-bit. You may use `_mm_loadu_si64` instead.
1926 __m128i _mm_loadl_epi64 (const(__m128i)* mem_addr) pure @trusted
1927 {
1928     pragma(inline, true);
1929     static if (DMD_with_DSIMD)
1930     {
1931         return cast(__m128i) __simd(XMM.LODQ, *cast(__m128i*)mem_addr);
1932     }
1933     else
1934     {
1935         auto pLong = cast(const(long)*)mem_addr;
1936         long2 r = [0, 0];
1937         r.ptr[0] = *pLong;
1938         return cast(__m128i)(r);
1939     }
1940 }
1941 unittest
1942 {
1943     long A = 0x7878787870707070;
1944     long2 R = cast(long2) _mm_loadl_epi64(cast(__m128i*)&A);
1945     long[2] correct = [0x7878787870707070, 0];
1946     assert(R.array == correct);
1947 }
1948 
1949 /// Load a double-precision (64-bit) floating-point element from memory into the lower element of result, and copy the 
1950 /// upper element from `a` to result. mem_addr does not need to be aligned on any particular boundary.
1951 __m128d _mm_loadl_pd (__m128d a, const(double)* mem_addr) pure @trusted
1952 {
1953     a.ptr[0] = *mem_addr;
1954     return a;
1955 }
1956 unittest
1957 {
1958     double A = 7.0;
1959     __m128d B = _mm_setr_pd(4.0, -5.0);
1960     __m128d R = _mm_loadl_pd(B, &A);
1961     double[2] correct = [ 7.0, -5.0 ];
1962     assert(R.array == correct);
1963 }
1964 
1965 /// Load 2 double-precision (64-bit) floating-point elements from memory into result in reverse order. 
1966 /// `mem_addr` must be aligned on a 16-byte boundary or a general-protection exception may be generated.
1967 __m128d _mm_loadr_pd (const(double)* mem_addr) pure @trusted
1968 {
1969     __m128d a = *cast(__m128d*)(mem_addr);
1970     __m128d r; // PERF =void;
1971     r.ptr[0] = a.array[1];
1972     r.ptr[1] = a.array[0];
1973     return r;
1974 }
1975 unittest
1976 {
1977     align(16) double[2] A = [56.0, -74.0];
1978     __m128d R = _mm_loadr_pd(A.ptr);
1979     double[2] correct = [-74.0, 56.0];
1980     assert(R.array == correct);
1981 }
1982 
1983 /// Load 128-bits (composed of 2 packed double-precision (64-bit) floating-point elements) from memory. 
1984 /// `mem_addr` does not need to be aligned on any particular boundary.
1985 __m128d _mm_loadu_pd (const(double)* mem_addr) pure @trusted
1986 {
1987     pragma(inline, true);
1988     static if (GDC_with_SSE2)
1989     {
1990         return __builtin_ia32_loadupd(mem_addr); 
1991     }
1992     else version(LDC)
1993     {
1994         return loadUnaligned!(double2)(mem_addr);
1995     }
1996     else version(DigitalMars)
1997     {
1998         // Apparently inside __simd you can use aligned dereferences without fear.
1999         // That was issue 23048 on dlang's Bugzilla.
2000         static if (DMD_with_DSIMD)
2001         {
2002             return cast(__m128d)__simd(XMM.LODUPD, *cast(double2*)mem_addr);
2003         }
2004         else static if (SSESizedVectorsAreEmulated)
2005         {
            // Since this vector is emulated, it doesn't have alignment constraints
2007             // and as such we can just cast it.
2008             return *cast(__m128d*)(mem_addr);
2009         }
2010         else
2011         {
2012             __m128d result;
2013             result.ptr[0] = mem_addr[0];
2014             result.ptr[1] = mem_addr[1];
2015             return result;
2016         }
2017     }
2018     else
2019     {
2020         __m128d result;
2021         result.ptr[0] = mem_addr[0];
2022         result.ptr[1] = mem_addr[1];
2023         return result;
2024     }
2025 }
2026 unittest
2027 {
2028     double[2] A = [56.0, -75.0];
2029     __m128d R = _mm_loadu_pd(A.ptr);
2030     double[2] correct = [56.0, -75.0];
2031     assert(R.array == correct);
2032 }
2033 
2034 /// Load 128-bits of integer data from memory. `mem_addr` does not need to be aligned on any particular boundary.
2035 __m128i _mm_loadu_si128 (const(__m128i)* mem_addr) pure @trusted
2036 {
2037     // PERF DMD
2038     pragma(inline, true);
2039     static if (GDC_with_SSE2)
2040     {
2041         return cast(__m128i) __builtin_ia32_loaddqu(cast(const(char*))mem_addr);
2042     }
2043     else version(LDC)
2044     {
2045         return loadUnaligned!(__m128i)(cast(int*)mem_addr);
2046     }
2047     else
2048     {
2049         const(int)* p = cast(const(int)*)mem_addr;
2050         __m128i r = void;
2051         r.ptr[0] = p[0];
2052         r.ptr[1] = p[1];
2053         r.ptr[2] = p[2];
2054         r.ptr[3] = p[3];
2055         return r;
2056     }
2057 }
2058 unittest
2059 {
2060     align(16) int[4] correct = [-1, 2, -3, 4];
2061     int4 A = cast(int4) _mm_loadu_si128(cast(__m128i*) correct.ptr);
2062     assert(A.array == correct);
2063 }
2064 
2065 /// Load unaligned 16-bit integer from memory into the first element, fill with zeroes otherwise.
2066 __m128i _mm_loadu_si16(const(void)* mem_addr) pure @trusted // TODO: should be @system actually
2067 {
2068     static if (DMD_with_DSIMD)
2069     {
2070         int r = *cast(short*)(mem_addr);
2071         return cast(__m128i) __simd(XMM.LODD, *cast(__m128i*)&r);
2072     }
2073     else version(DigitalMars)
2074     {
2075         // Workaround issue: https://issues.dlang.org/show_bug.cgi?id=21672
2076         // DMD cannot handle the below code...
2077         align(16) short[8] r = [0, 0, 0, 0, 0, 0, 0, 0];
2078         r[0] = *cast(short*)(mem_addr);
2079         return *cast(int4*)(r.ptr);
2080     }
2081     else
2082     {
2083         short r = *cast(short*)(mem_addr);
2084         short8 result = [0, 0, 0, 0, 0, 0, 0, 0];
2085         result.ptr[0] = r;
2086         return cast(__m128i)result;
2087     }
2088 }
2089 unittest
2090 {
2091     short r = 13;
2092     short8 A = cast(short8) _mm_loadu_si16(&r);
2093     short[8] correct = [13, 0, 0, 0, 0, 0, 0, 0];
2094     assert(A.array == correct);
2095 }
2096 
2097 /// Load unaligned 32-bit integer from memory into the first element of result.
2098 __m128i _mm_loadu_si32 (const(void)* mem_addr) pure @trusted // TODO: should be @system actually
2099 {
2100     pragma(inline, true);
2101     int r = *cast(int*)(mem_addr);
2102     int4 result = [0, 0, 0, 0];
2103     result.ptr[0] = r;
2104     return result;
2105 }
2106 unittest
2107 {
2108     int r = 42;
2109     __m128i A = _mm_loadu_si32(&r);
2110     int[4] correct = [42, 0, 0, 0];
2111     assert(A.array == correct);
2112 }
2113 
2114 /// Load unaligned 64-bit integer from memory into the first element of result.
2115 /// Upper 64-bit is zeroed.
2116 __m128i _mm_loadu_si64 (const(void)* mem_addr) pure @system
2117 {
2118     pragma(inline, true);
2119     static if (DMD_with_DSIMD)
2120     {
2121         return cast(__m128i) __simd(XMM.LODQ, *cast(__m128i*)mem_addr);
2122     }
2123     else
2124     {    
2125         auto pLong = cast(const(long)*)mem_addr;
2126         long2 r = [0, 0];
2127         r.ptr[0] = *pLong;
2128         return cast(__m128i)r;
2129     }
2130 }
2131 unittest
2132 {
2133     long r = 446446446446;
2134     long2 A = cast(long2) _mm_loadu_si64(&r);
2135     long[2] correct = [446446446446, 0];
2136     assert(A.array == correct);
2137 }
2138 
2139 /// Multiply packed signed 16-bit integers in `a` and `b`, producing intermediate
2140 /// signed 32-bit integers. Horizontally add adjacent pairs of intermediate 32-bit integers,
2141 /// and pack the results in destination.
2142 __m128i _mm_madd_epi16 (__m128i a, __m128i b) pure @trusted
2143 {
2144     static if (GDC_with_SSE2)
2145     {
2146         return cast(__m128i) __builtin_ia32_pmaddwd128(cast(short8)a, cast(short8)b);
2147     }
2148     else static if (LDC_with_SSE2)
2149     {
2150         return cast(__m128i) __builtin_ia32_pmaddwd128(cast(short8)a, cast(short8)b);
2151     }
2152     else static if (LDC_with_ARM64)
2153     {
2154         int4 pl = vmull_s16(vget_low_s16(cast(short8)a), vget_low_s16(cast(short8)b));
2155         int4 ph = vmull_s16(vget_high_s16(cast(short8)a), vget_high_s16(cast(short8)b));
2156         int2 rl = vpadd_s32(vget_low_s32(pl), vget_high_s32(pl));
2157         int2 rh = vpadd_s32(vget_low_s32(ph), vget_high_s32(ph));
2158         return vcombine_s32(rl, rh);
2159     }
2160     else
2161     {
2162         short8 sa = cast(short8)a;
2163         short8 sb = cast(short8)b;
2164         int4 r;
2165         foreach(i; 0..4)
2166         {
2167             r.ptr[i] = sa.array[2*i] * sb.array[2*i] + sa.array[2*i+1] * sb.array[2*i+1];
2168         }
2169         return r;
2170     }
2171 }
2172 unittest
2173 {
2174     short8 A = [0, 1, 2, 3, -32768, -32768, 32767, 32767];
2175     short8 B = [0, 1, 2, 3, -32768, -32768, 32767, 32767];
2176     int4 R = _mm_madd_epi16(cast(__m128i)A, cast(__m128i)B);
2177     int[4] correct = [1, 13, -2147483648, 2*32767*32767];
2178     assert(R.array == correct);
2179 }
2180 
2181 /// Conditionally store 8-bit integer elements from `a` into memory using `mask`
2182 /// (elements are not stored when the highest bit is not set in the corresponding element)
2183 /// and a non-temporal memory hint. `mem_addr` does not need to be aligned on any particular
2184 /// boundary.
2185 void _mm_maskmoveu_si128 (__m128i a, __m128i mask, void* mem_addr) @trusted
2186 {
2187     static if (GDC_with_SSE2)
2188     {    
2189         return __builtin_ia32_maskmovdqu(cast(ubyte16)a, cast(ubyte16)mask, cast(char*)mem_addr);
2190     }
2191     else static if (LDC_with_SSE2)
2192     {
2193         return __builtin_ia32_maskmovdqu(cast(byte16)a, cast(byte16)mask, cast(char*)mem_addr);
2194     }
2195     else static if (LDC_with_ARM64)
2196     {
2197         // PERF: catastrophic on ARM32
2198         byte16 bmask  = cast(byte16)mask;
2199         byte16 shift = 7;
2200         bmask = bmask >> shift; // sign-extend to have a 0xff or 0x00 mask
2201         mask = cast(__m128i) bmask;
2202         __m128i dest = loadUnaligned!__m128i(cast(int*)mem_addr);
2203         dest = (a & mask) | (dest & ~mask);
2204         storeUnaligned!__m128i(dest, cast(int*)mem_addr);
2205     }
2206     else
2207     {
2208         byte16 b = cast(byte16)a;
2209         byte16 m = cast(byte16)mask;
2210         byte* dest = cast(byte*)(mem_addr);
2211         foreach(j; 0..16)
2212         {
2213             if (m.array[j] & 128)
2214             {
2215                 dest[j] = b.array[j];
2216             }
2217         }
2218     }
2219 }
2220 unittest
2221 {
2222     ubyte[16] dest =           [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42];
2223     __m128i mask = _mm_setr_epi8(0,-1, 0,-1,-1, 1,-1,-1, 0,-1,-4,-1,-1, 0,-127, 0);
2224     __m128i A    = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15);
2225     _mm_maskmoveu_si128(A, mask, dest.ptr);
2226     ubyte[16] correct =        [42, 1,42, 3, 4,42, 6, 7,42, 9,10,11,12,42,14,42];
2227     assert(dest == correct);
2228 }
2229 
2230 /// Compare packed signed 16-bit integers in `a` and `b`, and return packed maximum values.
2231 __m128i _mm_max_epi16 (__m128i a, __m128i b) pure @safe
2232 {
2233     static if (GDC_with_SSE2)
2234     {
2235         return cast(__m128i) __builtin_ia32_pmaxsw128(cast(short8)a, cast(short8)b);
2236     }
2237     else version(LDC)
2238     {
2239         // x86: pmaxsw since LDC 1.0 -O1
        // ARM: smax.8h since LDC 1.5 -O1
2241         short8 sa = cast(short8)a;
2242         short8 sb = cast(short8)b;
2243         short8 greater = greaterMask!short8(sa, sb);
2244         return cast(__m128i)( (greater & sa) | (~greater & sb) );
2245     }
2246     else
2247     {
2248         __m128i lowerShorts = _mm_cmpgt_epi16(a, b); // ones where a should be selected, b else
2249         __m128i aTob = _mm_xor_si128(a, b); // a ^ (a ^ b) == b
2250         __m128i mask = _mm_and_si128(aTob, lowerShorts);
2251         return _mm_xor_si128(b, mask);
2252     }
2253 }
2254 unittest
2255 {
2256     short8 R = cast(short8) _mm_max_epi16(_mm_setr_epi16(32767, 1, -4, -8, 9,  7, 0,-57),
2257                                           _mm_setr_epi16(-4,-8,  9,  7, 0,-32768, 0,  0));
2258     short[8] correct =                                  [32767, 1,  9,  7, 9,  7, 0,  0];
2259     assert(R.array == correct);
2260 }
2261 
2262 /// Compare packed unsigned 8-bit integers in a and b, and return packed maximum values.
2263 __m128i _mm_max_epu8 (__m128i a, __m128i b) pure @safe
2264 {
2265     version(LDC)
2266     {
2267         // x86: pmaxub since LDC 1.0.0 -O1
2268         // ARM64: umax.16b since LDC 1.5.0 -O1
2269         // PERF: catastrophic on ARM32
2270         ubyte16 sa = cast(ubyte16)a;
2271         ubyte16 sb = cast(ubyte16)b;
2272         ubyte16 greater = cast(ubyte16) greaterMask!ubyte16(sa, sb);
2273         return cast(__m128i)( (greater & sa) | (~greater & sb) );
2274     }
2275     else
2276     {
2277         __m128i value128 = _mm_set1_epi8(-128);
2278         __m128i higher = _mm_cmpgt_epi8(_mm_add_epi8(a, value128), _mm_add_epi8(b, value128)); // signed comparison
2279         __m128i aTob = _mm_xor_si128(a, b); // a ^ (a ^ b) == b
2280         __m128i mask = _mm_and_si128(aTob, higher);
2281         return _mm_xor_si128(b, mask);
2282     }
2283 }
2284 unittest
2285 {
2286     byte16 R = cast(byte16) _mm_max_epu8(_mm_setr_epi8(45, 1, -4, -8, 9,  7, 0,-57, -4,-8,  9,  7, 0,-57, 0,  0),
2287                                          _mm_setr_epi8(-4,-8,  9,  7, 0,-57, 0,  0, 45, 1, -4, -8, 9,  7, 0,-57));
2288     byte[16] correct =                                [-4,-8, -4, -8, 9,-57, 0,-57, -4,-8, -4, -8, 9,-57, 0,-57];
2289     assert(R.array == correct);
2290 }
2291 
2292 /// Compare packed double-precision (64-bit) floating-point elements in `a` and `b`, and return 
2293 /// packed maximum values.
2294 __m128d _mm_max_pd (__m128d a, __m128d b) pure @trusted
2295 {
2296     static if (GDC_with_SSE2)
2297     {
2298         return __builtin_ia32_maxpd(a, b);
2299     }
2300     else
2301     {
2302         // x86: Generates maxpd starting with LDC 1.9 -O2
2303         a.ptr[0] = (a.array[0] > b.array[0]) ? a.array[0] : b.array[0];
2304         a.ptr[1] = (a.array[1] > b.array[1]) ? a.array[1] : b.array[1];
2305         return a;
2306     }
2307 }
2308 unittest
2309 {
2310     __m128d A = _mm_setr_pd(4.0, 1.0);
2311     __m128d B = _mm_setr_pd(1.0, 8.0);
2312     __m128d M = _mm_max_pd(A, B);
2313     assert(M.array[0] == 4.0);
2314     assert(M.array[1] == 8.0);
2315 }
2316 
2317 /// Compare the lower double-precision (64-bit) floating-point elements in `a` and `b`, store the maximum value in the 
2318 /// lower element of result, and copy the upper element from `a` to the upper element of result.
2319 __m128d _mm_max_sd (__m128d a, __m128d b) pure @trusted
2320 {
2321     static if (GDC_with_SSE2)
2322     {
2323         return __builtin_ia32_maxsd(a, b);
2324     }
2325     else
2326     {
        __m128d r = a;
2328         // Generates maxsd starting with LDC 1.3
2329         r.ptr[0] = (a.array[0] > b.array[0]) ? a.array[0] : b.array[0];
2330         return r;
2331     }
2332 }
2333 unittest
2334 {
2335     __m128d A = _mm_setr_pd(1.0, 1.0);
2336     __m128d B = _mm_setr_pd(4.0, 2.0);
2337     __m128d M = _mm_max_sd(A, B);
2338     assert(M.array[0] == 4.0);
2339     assert(M.array[1] == 1.0);
2340 }
2341 
2342 /// Perform a serializing operation on all load-from-memory and store-to-memory instructions that were issued prior to 
2343 /// this instruction. Guarantees that every memory access that precedes, in program order, the memory fence instruction 
2344 /// is globally visible before any memory instruction which follows the fence in program order.
2345 void _mm_mfence() @trusted // not pure!
2346 {
2347     version(GNU)
2348     {
2349         static if (GDC_with_SSE2)
2350         {
2351             __builtin_ia32_mfence();
2352         }
2353         else version(X86)
2354         {
2355             asm pure nothrow @nogc @trusted
2356             {
2357                 "mfence;\n" : : : ;
2358             }
2359         }
2360         else
2361             static assert(false);
2362     }
2363     else static if (LDC_with_SSE2)
2364     {
2365         __builtin_ia32_mfence();
2366     }
2367     else static if (DMD_with_asm)
2368     {
2369         asm nothrow @nogc pure @safe
2370         {
2371             mfence;
2372         }
2373     }
2374     else version(LDC)
2375     {
2376         // Note: will generate the DMB ish instruction on ARM
2377         llvm_memory_fence();
2378     }
2379     else
2380         static assert(false);
2381 }
2382 unittest
2383 {
2384     _mm_mfence();
2385 }
2386 
2387 /// Compare packed signed 16-bit integers in `a` and `b`, and return packed minimum values.
2388 __m128i _mm_min_epi16 (__m128i a, __m128i b) pure @safe
2389 {
2390     static if (GDC_with_SSE2)
2391     {
2392         return cast(__m128i) __builtin_ia32_pminsw128(cast(short8)a, cast(short8)b);
2393     }
2394     else version(LDC)
2395     {
2396         // x86: pminsw since LDC 1.0 -O1
        // ARM64: smin.8h since LDC 1.5 -O1
2398         short8 sa = cast(short8)a;
2399         short8 sb = cast(short8)b;
2400         short8 greater = greaterMask!short8(sa, sb);
2401         return cast(__m128i)( (~greater & sa) | (greater & sb) );
2402     }
2403     else
2404     {
2405         __m128i lowerShorts = _mm_cmplt_epi16(a, b); // ones where a should be selected, b else
2406         __m128i aTob = _mm_xor_si128(a, b); // a ^ (a ^ b) == b
2407         __m128i mask = _mm_and_si128(aTob, lowerShorts);
2408         return _mm_xor_si128(b, mask);
2409     }
2410 }
2411 unittest
2412 {
2413     short8 R = cast(short8) _mm_min_epi16(_mm_setr_epi16(45, 1, -4, -8, 9,  7, 0,-32768),
2414                                           _mm_setr_epi16(-4,-8,  9,  7, 0,-57, 0,  0));
2415     short[8] correct =                                  [-4,-8, -4, -8, 0,-57, 0, -32768];
2416     assert(R.array == correct);
2417 }
2418 
2419 /// Compare packed unsigned 8-bit integers in `a` and `b`, and return packed minimum values.
2420 __m128i _mm_min_epu8 (__m128i a, __m128i b) pure @safe
2421 {
2422     version(LDC)
2423     {
2424         // x86: pminub since LDC 1.0.0 -O1
2425         // ARM: umin.16b since LDC 1.5.0 -O1
2426         // PERF: catastrophic on ARM32
2427         ubyte16 sa = cast(ubyte16)a;
2428         ubyte16 sb = cast(ubyte16)b;
2429         ubyte16 greater = cast(ubyte16) greaterMask!ubyte16(sa, sb);
2430         return cast(__m128i)( (~greater & sa) | (greater & sb) );
2431     }
2432     else
2433     {
2434         __m128i value128 = _mm_set1_epi8(-128);
2435         __m128i lower = _mm_cmplt_epi8(_mm_add_epi8(a, value128), _mm_add_epi8(b, value128)); // signed comparison
2436         __m128i aTob = _mm_xor_si128(a, b); // a ^ (a ^ b) == b
2437         __m128i mask = _mm_and_si128(aTob, lower);
2438         return _mm_xor_si128(b, mask);
2439     }
2440 }
2441 unittest
2442 {
2443     byte16 R = cast(byte16) _mm_min_epu8(_mm_setr_epi8(45, 1, -4, -8, 9,  7, 0,-57, -4,-8,  9,  7, 0,-57, 0,  0),
2444                                          _mm_setr_epi8(-4,-8,  9,  7, 0,-57, 0,  0, 45, 1, -4, -8, 9,  7, 0,-57));
2445     byte[16] correct =                                [45, 1,  9,  7, 0,  7, 0,  0, 45, 1,  9,  7, 0,  7, 0,  0];
2446     assert(R.array == correct);
2447 }
2448 
2449 /// Compare packed double-precision (64-bit) floating-point elements in `a` and `b`, and return packed minimum values.
2450 __m128d _mm_min_pd (__m128d a, __m128d b) pure @trusted
2451 {
2452     static if (GDC_with_SSE2)
2453     {
2454         return __builtin_ia32_minpd(a, b);
2455     }
2456     else
2457     {
2458         // Generates minpd starting with LDC 1.9
2459         a.ptr[0] = (a.array[0] < b.array[0]) ? a.array[0] : b.array[0];
2460         a.ptr[1] = (a.array[1] < b.array[1]) ? a.array[1] : b.array[1];
2461         return a;
2462     }
2463 }
2464 unittest
2465 {
2466     __m128d A = _mm_setr_pd(1.0, 2.0);
2467     __m128d B = _mm_setr_pd(4.0, 1.0);
2468     __m128d M = _mm_min_pd(A, B);
2469     assert(M.array[0] == 1.0);
2470     assert(M.array[1] == 1.0);
2471 }
2472 
2473 /// Compare the lower double-precision (64-bit) floating-point elements in `a` and `b`, store the minimum value in 
2474 /// the lower element of result, and copy the upper element from `a` to the upper element of result.
2475 __m128d _mm_min_sd (__m128d a, __m128d b) pure @safe
2476 {
2477     static if (GDC_with_SSE2)
2478     {
2479         return __builtin_ia32_minsd(a, b);
2480     }
2481     else
2482     {
2483         // Generates minsd starting with LDC 1.3
2484         __m128d r = a;
2485         r.array[0] = (a.array[0] < b.array[0]) ? a.array[0] : b.array[0];
2486         return r;
2487     }
2488 }
2489 unittest
2490 {
2491     __m128d A = _mm_setr_pd(1.0, 3.0);
2492     __m128d B = _mm_setr_pd(4.0, 2.0);
2493     __m128d M = _mm_min_sd(A, B);
2494     assert(M.array[0] == 1.0);
2495     assert(M.array[1] == 3.0);
2496 }
2497 
2498 /// Copy the lower 64-bit integer in `a` to the lower element of result, and zero the upper element.
2499 __m128i _mm_move_epi64 (__m128i a) pure @trusted
2500 {
2501     static if (GDC_with_SSE2)
2502     {
2503         // slightly better with GDC -O0
2504         return cast(__m128i) __builtin_ia32_movq128(cast(long2)a); 
2505     }
2506     else
2507     {
2508         long2 result = [ 0, 0 ];
2509         long2 la = cast(long2) a;
2510         result.ptr[0] = la.array[0];
2511         return cast(__m128i)(result);
2512     }
2513 }
2514 unittest
2515 {
2516     long2 A = [13, 47];
2517     long2 B = cast(long2) _mm_move_epi64( cast(__m128i)A );
2518     long[2] correct = [13, 0];
2519     assert(B.array == correct);
2520 }
2521 
2522 /// Move the lower double-precision (64-bit) floating-point element from `b` to the lower element of result, and copy 
2523 /// the upper element from `a` to the upper element of dst.
2524 __m128d _mm_move_sd (__m128d a, __m128d b) pure @trusted
2525 {
2526     static if (GDC_with_SSE2)
2527     {
2528         return __builtin_ia32_movsd(a, b); 
2529     }
2530     else
2531     {
2532         b.ptr[1] = a.array[1];
2533         return b;
2534     }
2535 }
2536 unittest
2537 {
2538     double2 A = [13.0, 47.0];
2539     double2 B = [34.0, 58.0];
2540     double2 C = _mm_move_sd(A, B);
2541     double[2] correct = [34.0, 47.0];
2542     assert(C.array == correct);
2543 }
2544 
/// Create mask from the most significant bit of each 8-bit element in `a`.
2546 int _mm_movemask_epi8 (__m128i a) pure @trusted
2547 {
2548     // PERF: Not possible in D_SIMD because of https://issues.dlang.org/show_bug.cgi?id=8047
2549     static if (GDC_with_SSE2)
2550     {
2551         return __builtin_ia32_pmovmskb128(cast(ubyte16)a);
2552     }
2553     else static if (LDC_with_SSE2)
2554     {
2555         return __builtin_ia32_pmovmskb128(cast(byte16)a);
2556     }
2557     else static if (LDC_with_ARM64)
2558     {
2559         // Solution from https://stackoverflow.com/questions/11870910/sse-mm-movemask-epi8-equivalent-method-for-arm-neon
        // The other two solutions led to unfound intrinsics in LLVM, and that took a long time.
        // So there might be something a bit faster, but this one is reasonable and branchless.
2562         byte8 mask_shift;
2563         mask_shift.ptr[0] = 7;
2564         mask_shift.ptr[1] = 6;
2565         mask_shift.ptr[2] = 5;
2566         mask_shift.ptr[3] = 4;
2567         mask_shift.ptr[4] = 3;
2568         mask_shift.ptr[5] = 2;
2569         mask_shift.ptr[6] = 1;
2570         mask_shift.ptr[7] = 0;
2571         byte8 mask_and = byte8(-128);
2572         byte8 lo = vget_low_u8(cast(byte16)a);
2573         byte8 hi = vget_high_u8(cast(byte16)a);
2574         lo = vand_u8(lo, mask_and);
2575         lo = vshr_u8(lo, mask_shift);
2576         hi = vand_u8(hi, mask_and);
2577         hi = vshr_u8(hi, mask_shift);
2578         lo = vpadd_u8(lo,lo);
2579         lo = vpadd_u8(lo,lo);
2580         lo = vpadd_u8(lo,lo);
2581         hi = vpadd_u8(hi,hi);
2582         hi = vpadd_u8(hi,hi);
2583         hi = vpadd_u8(hi,hi);
2584         return (cast(ubyte)(hi[0]) << 8) | cast(ubyte)(lo[0]);
2585     }
2586     else
2587     {
2588         byte16 ai = cast(byte16)a;
2589         int r = 0;
2590         foreach(bit; 0..16)
2591         {
2592             if (ai.array[bit] < 0) r += (1 << bit);
2593         }
2594         return r;
2595     }
2596 }
2597 unittest
2598 {
2599     assert(0x9C36 == _mm_movemask_epi8(_mm_set_epi8(-1, 1, 2, -3, -1, -1, 4, 8, 127, 0, -1, -1, 0, -1, -1, 0)));
2600 }
2601 
/// Create mask from the most significant bit of each 16-bit element in `a`. #BONUS
2603 int _mm_movemask_epi16 (__m128i a) pure @trusted
2604 {
2605     return _mm_movemask_epi8(_mm_packs_epi16(a, _mm_setzero_si128()));
2606 }
2607 unittest
2608 {
2609     assert(0x9C == _mm_movemask_epi16(_mm_set_epi16(-1, 1, 2, -3, -32768, -1, 32767, 8)));
2610 }
2611 
2612 /// Set each bit of mask result based on the most significant bit of the corresponding packed double-precision (64-bit) 
/// floating-point element in `v`.
2614 int _mm_movemask_pd(__m128d v) pure @safe
2615 {
2616     // PERF: Not possible in D_SIMD because of https://issues.dlang.org/show_bug.cgi?id=8047
2617     static if (GDC_or_LDC_with_SSE2)
2618     {
2619         return __builtin_ia32_movmskpd(v);
2620     }
2621     else
2622     {
2623         long2 lv = cast(long2)v;
2624         int r = 0;
2625         if (lv.array[0] < 0) r += 1;
2626         if (lv.array[1] < 0) r += 2;
2627         return r;
2628     }
2629 }
2630 unittest
2631 {
2632     __m128d A = cast(__m128d) _mm_set_epi64x(-1, 0);
2633     assert(_mm_movemask_pd(A) == 2);
2634 }
2635 
2636 /// Copy the lower 64-bit integer in `v`.
2637 __m64 _mm_movepi64_pi64 (__m128i v) pure @safe
2638 {
2639     long2 lv = cast(long2)v;
2640     return long1(lv.array[0]);
2641 }
2642 unittest
2643 {
2644     __m128i A = _mm_set_epi64x(-1, -2);
2645     __m64 R = _mm_movepi64_pi64(A);
2646     assert(R.array[0] == -2);
2647 }
2648 
2649 /// Copy the 64-bit integer `a` to the lower element of dest, and zero the upper element.
2650 __m128i _mm_movpi64_epi64 (__m64 a) pure @trusted
2651 {
2652     long2 r;
2653     r.ptr[0] = a.array[0];
2654     r.ptr[1] = 0;
2655     return cast(__m128i)r;
2656 }
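unittest
{
    // Small sanity check with an arbitrary value: the MMX value moves to the low lane, the upper lane is zeroed.
    __m64 A = _mm_cvtsi64_m64(-1024);
    long2 R = cast(long2) _mm_movpi64_epi64(A);
    long[2] correct = [-1024, 0];
    assert(R.array == correct);
}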
2657 
2658 /// Multiply the low unsigned 32-bit integers from each packed 64-bit element in `a` and `b`, 
2659 /// and store the unsigned 64-bit results.
2660 __m128i _mm_mul_epu32 (__m128i a, __m128i b) pure @trusted
2661 {    
2662     // PERF DMD D_SIMD
2663     static if (GDC_with_SSE2)
2664     {
2665         return cast(__m128i) __builtin_ia32_pmuludq128 (a, b);
2666     }
2667     else
2668     {
2669         version(LDC)
2670         {
2671             static if (__VERSION__ >= 2088)
2672             {
2673                 // Need LLVM9 for proper optimization
2674                 long2 la, lb;
2675                 la.ptr[0] = cast(uint)a.array[0];
2676                 la.ptr[1] = cast(uint)a.array[2];
2677                 lb.ptr[0] = cast(uint)b.array[0];
2678                 lb.ptr[1] = cast(uint)b.array[2];
2679             }
2680             else
2681             {
2682                 __m128i zero;
2683                 zero = 0;
2684                 long2 la = cast(long2) shufflevectorLDC!(int4, 0, 4, 2, 6)(a, zero);
2685                 long2 lb = cast(long2) shufflevectorLDC!(int4, 0, 4, 2, 6)(b, zero);
2686             }
2687         }
2688         else
2689         {
2690             long2 la, lb;
2691             la.ptr[0] = cast(uint)a.array[0];
2692             la.ptr[1] = cast(uint)a.array[2];
2693             lb.ptr[0] = cast(uint)b.array[0];
2694             lb.ptr[1] = cast(uint)b.array[2];
2695         }
2696 
2697         version(DigitalMars)
2698         {
2699             // DMD has no long2 mul
2700             la.ptr[0] *= lb.array[0];
2701             la.ptr[1] *= lb.array[1];
2702             return cast(__m128i)(la);
2703         }
2704         else
2705         {
2706             static if (__VERSION__ >= 2076)
2707             {
2708                 return cast(__m128i)(la * lb);
2709             }
2710             else
2711             {
2712                 // long2 mul not supported before LDC 1.5
2713                 la.ptr[0] *= lb.array[0];
2714                 la.ptr[1] *= lb.array[1];
2715                 return cast(__m128i)(la);
2716             }
2717         }
2718     }
2719 }
2720 unittest
2721 {
2722     __m128i A = _mm_set_epi32(42, 0xDEADBEEF, 42, 0xffffffff);
2723     __m128i B = _mm_set_epi32(42, 0xCAFEBABE, 42, 0xffffffff);
2724     __m128i C = _mm_mul_epu32(A, B);
2725     long2 LC = cast(long2)C;
2726     assert(LC.array[0] == 18446744065119617025uL);
2727     assert(LC.array[1] == 12723420444339690338uL);
2728 }
2729 
2730 /// Multiply packed double-precision (64-bit) floating-point elements in `a` and `b`, and return the results. 
2731 __m128d _mm_mul_pd(__m128d a, __m128d b) pure @safe
2732 {
2733     pragma(inline, true);
2734     return a * b;
2735 }
2736 unittest
2737 {
2738     __m128d a = [-2.0, 1.5];
2739     a = _mm_mul_pd(a, a);
2740     assert(a.array == [4.0, 2.25]);
2741 }
2742 
2743 /// Multiply the lower double-precision (64-bit) floating-point element in `a` and `b`, store the result in the lower 
2744 /// element of result, and copy the upper element from `a` to the upper element of result.
2745 __m128d _mm_mul_sd(__m128d a, __m128d b) pure @trusted
2746 {
2747     version(DigitalMars)
2748     {    
2749         // Work-around for https://issues.dlang.org/show_bug.cgi?id=19599
2750         // Note that this is unneeded since DMD >= 2.094.0 at least, haven't investigated again
2751         asm pure nothrow @nogc @trusted { nop;}
2752         a.array[0] = a.array[0] * b.array[0];
2753         return a;
2754     }
2755     else static if (GDC_with_SSE2)
2756     {
2757         return __builtin_ia32_mulsd(a, b);
2758     }
2759     else
2760     {
2761         a.ptr[0] *= b.array[0];
2762         return a;
2763     }
2764 }
2765 unittest
2766 {
2767     __m128d a = [-2.0, 1.5];
2768     a = _mm_mul_sd(a, a);
2769     assert(a.array == [4.0, 1.5]);
2770 }
2771 
2772 /// Multiply the low unsigned 32-bit integers from `a` and `b`, 
2773 /// and get an unsigned 64-bit result.
2774 __m64 _mm_mul_su32 (__m64 a, __m64 b) pure @safe
2775 {
2776     return to_m64(_mm_mul_epu32(to_m128i(a), to_m128i(b)));
2777 }
2778 unittest
2779 {
2780     __m64 A = _mm_set_pi32(42, 0xDEADBEEF);
2781     __m64 B = _mm_set_pi32(42, 0xCAFEBABE);
2782     __m64 C = _mm_mul_su32(A, B);
2783     assert(C.array[0] == 0xDEADBEEFuL * 0xCAFEBABEuL);
2784 }
2785 
2786 /// Multiply the packed signed 16-bit integers in `a` and `b`, producing intermediate 32-bit integers, and return the 
2787 /// high 16 bits of the intermediate integers.
2788 __m128i _mm_mulhi_epi16 (__m128i a, __m128i b) pure @trusted
2789 {
2790     static if (GDC_with_SSE2)
2791     {
2792         return cast(__m128i) __builtin_ia32_pmulhw128(cast(short8)a, cast(short8)b);
2793     }
2794     else static if (LDC_with_SSE2)
2795     {
2796         return cast(__m128i) __builtin_ia32_pmulhw128(cast(short8)a, cast(short8)b);
2797     }
2798     else
2799     {
2800         // ARM64: LDC 1.5 -O2 or later gives a nice sequence with 2 x ext.16b, 2 x smull.4s and shrn.4h shrn2.8h
2801         //        PERF: it seems the simde solution has one less instruction in ARM64.
2802         // PERF: Catastrophic in ARM32.
2803         short8 sa = cast(short8)a;
2804         short8 sb = cast(short8)b;
2805         short8 r = void;
2806         r.ptr[0] = (sa.array[0] * sb.array[0]) >> 16;
2807         r.ptr[1] = (sa.array[1] * sb.array[1]) >> 16;
2808         r.ptr[2] = (sa.array[2] * sb.array[2]) >> 16;
2809         r.ptr[3] = (sa.array[3] * sb.array[3]) >> 16;
2810         r.ptr[4] = (sa.array[4] * sb.array[4]) >> 16;
2811         r.ptr[5] = (sa.array[5] * sb.array[5]) >> 16;
2812         r.ptr[6] = (sa.array[6] * sb.array[6]) >> 16;
2813         r.ptr[7] = (sa.array[7] * sb.array[7]) >> 16;
2814         return cast(__m128i)r;
2815     }
2816 }
2817 unittest
2818 {
2819     __m128i A = _mm_setr_epi16(0, -16, 2, 3, 4, 8, 16, 7);
2820     __m128i B = _mm_set1_epi16(16384);
2821     short8 R = cast(short8)_mm_mulhi_epi16(A, B);
2822     short[8] correct = [0, -4, 0, 0, 1, 2, 4, 1];
2823     assert(R.array == correct);
2824 }
2825 
2826 /// Multiply the packed unsigned 16-bit integers in `a` and `b`, producing intermediate 32-bit integers, and return the 
2827 /// high 16 bits of the intermediate integers.
2828 __m128i _mm_mulhi_epu16 (__m128i a, __m128i b) pure @trusted
2829 {
2830     static if (GDC_with_SSE2)
2831     {
2832         return cast(__m128i) __builtin_ia32_pmulhuw128(cast(short8)a, cast(short8)b);
2833     }
2834     else static if (LDC_with_SSE2)
2835     {
2836         return cast(__m128i) __builtin_ia32_pmulhuw128(cast(short8)a, cast(short8)b);
2837     }
2838     else
2839     {
2840         // ARM64: LDC 1.5 -O2 or later gives a nice sequence with 2 x ext.16b, 2 x umull.4s and shrn.4h shrn2.8h
2841         //      it seems the simde solution has one less instruction in ARM64
2842         // PERF: Catastrophic in ARM32.
2843         short8 sa = cast(short8)a;
2844         short8 sb = cast(short8)b;
2845         short8 r = void;
2846         r.ptr[0] = cast(short)( (cast(ushort)sa.array[0] * cast(ushort)sb.array[0]) >> 16 );
2847         r.ptr[1] = cast(short)( (cast(ushort)sa.array[1] * cast(ushort)sb.array[1]) >> 16 );
2848         r.ptr[2] = cast(short)( (cast(ushort)sa.array[2] * cast(ushort)sb.array[2]) >> 16 );
2849         r.ptr[3] = cast(short)( (cast(ushort)sa.array[3] * cast(ushort)sb.array[3]) >> 16 );
2850         r.ptr[4] = cast(short)( (cast(ushort)sa.array[4] * cast(ushort)sb.array[4]) >> 16 );
2851         r.ptr[5] = cast(short)( (cast(ushort)sa.array[5] * cast(ushort)sb.array[5]) >> 16 );
2852         r.ptr[6] = cast(short)( (cast(ushort)sa.array[6] * cast(ushort)sb.array[6]) >> 16 );
2853         r.ptr[7] = cast(short)( (cast(ushort)sa.array[7] * cast(ushort)sb.array[7]) >> 16 );
2854         return cast(__m128i)r;
2855     }
2856 }
2857 unittest
2858 {
2859     __m128i A = _mm_setr_epi16(0, -16, 2, 3, 4, 8, 16, 7);
2860     __m128i B = _mm_set1_epi16(16384);
2861     short8 R = cast(short8)_mm_mulhi_epu16(A, B);
2862     short[8] correct = [0, 0x3FFC, 0, 0, 1, 2, 4, 1];
2863     assert(R.array == correct);
2864 }
2865 
2866 /// Multiply the packed 16-bit integers in `a` and `b`, producing intermediate 32-bit integers, and return the low 16 
2867 /// bits of the intermediate integers.
2868 __m128i _mm_mullo_epi16 (__m128i a, __m128i b) pure @safe
2869 {
2870     return cast(__m128i)(cast(short8)a * cast(short8)b);
2871 }
2872 unittest
2873 {
2874     __m128i A = _mm_setr_epi16(16384, -16, 0,      3, 4, 1, 16, 7);
2875     __m128i B = _mm_set1_epi16(16384);
2876     short8 R = cast(short8)_mm_mullo_epi16(A, B);
2877     short[8] correct = [0, 0, 0, -16384, 0, 16384, 0, -16384];
2878     assert(R.array == correct);
2879 }
2880 
2881 /// Compute the bitwise NOT of 128 bits in `a`. #BONUS
2882 __m128i _mm_not_si128 (__m128i a) pure @safe
2883 {
2884     return ~a;
2885 }
2886 unittest
2887 {
2888     __m128i A = _mm_set1_epi32(-748);
2889     int4 notA = cast(int4) _mm_not_si128(A);
2890     int[4] correct = [747, 747, 747, 747];
2891     assert(notA.array == correct);
2892 }
2893 
2894 /// Compute the bitwise OR of packed double-precision (64-bit) floating-point elements in `a` and `b`.
2895 __m128d _mm_or_pd (__m128d a, __m128d b) pure @safe
2896 {
2897     pragma(inline, true);
2898     return cast(__m128d)( cast(__m128i)a | cast(__m128i)b );
2899 }
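unittest
{
    // Small sanity check with arbitrary bit patterns: bitwise OR of the raw 64-bit lanes.
    __m128d A = cast(__m128d) _mm_set_epi64x(3, 5);
    __m128d B = cast(__m128d) _mm_set_epi64x(4, 2);
    long2 R = cast(long2) _mm_or_pd(A, B);
    long[2] correct = [7, 7];
    assert(R.array == correct);
}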
2900 
2901 /// Compute the bitwise OR of 128 bits (representing integer data) in `a` and `b`.
2902 __m128i _mm_or_si128 (__m128i a, __m128i b) pure @safe
2903 {
2904     pragma(inline, true);
2905     return a | b;
2906 }
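unittest
{
    // Small sanity check with arbitrary values: bitwise OR over the full 128 bits.
    __m128i A = _mm_setr_epi32(1, 2, 4, 8);
    __m128i B = _mm_setr_epi32(2, 2, 1, 8);
    int4 R = cast(int4) _mm_or_si128(A, B);
    int[4] correct = [3, 2, 5, 8];
    assert(R.array == correct);
}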
2907 
2908 /// Convert packed signed 32-bit integers from `a` and `b` to packed 16-bit integers using signed saturation.
2909 __m128i _mm_packs_epi32 (__m128i a, __m128i b) pure @trusted
2910 {
2911     static if (GDC_with_SSE2)
2912     {
2913         return cast(__m128i) __builtin_ia32_packssdw128(a, b);
2914     }    
2915     else static if (LDC_with_SSE2)
2916     {
2917         return cast(__m128i) __builtin_ia32_packssdw128(a, b);
2918     }
2919     else static if (LDC_with_ARM64)
2920     {
2921         short4 ra = vqmovn_s32(cast(int4)a);
2922         short4 rb = vqmovn_s32(cast(int4)b);
2923         return cast(__m128i)vcombine_s16(ra, rb);
2924     }
2925     else
2926     {
2927         // PERF: catastrophic on ARM32
2928         short8 r;
2929         r.ptr[0] = saturateSignedIntToSignedShort(a.array[0]);
2930         r.ptr[1] = saturateSignedIntToSignedShort(a.array[1]);
2931         r.ptr[2] = saturateSignedIntToSignedShort(a.array[2]);
2932         r.ptr[3] = saturateSignedIntToSignedShort(a.array[3]);
2933         r.ptr[4] = saturateSignedIntToSignedShort(b.array[0]);
2934         r.ptr[5] = saturateSignedIntToSignedShort(b.array[1]);
2935         r.ptr[6] = saturateSignedIntToSignedShort(b.array[2]);
2936         r.ptr[7] = saturateSignedIntToSignedShort(b.array[3]);
2937         return cast(__m128i)r;
2938     }
2939 }
2940 unittest
2941 {
2942     __m128i A = _mm_setr_epi32(100000, -100000, 1000, 0);
2943     short8 R = cast(short8) _mm_packs_epi32(A, A);
2944     short[8] correct = [32767, -32768, 1000, 0, 32767, -32768, 1000, 0];
2945     assert(R.array == correct);
2946 }
2947 
2948 /// Convert packed signed 16-bit integers from `a` and `b` to packed 8-bit integers using signed saturation.
2949 __m128i _mm_packs_epi16 (__m128i a, __m128i b) pure @trusted
2950 {
2951     static if (GDC_with_SSE2)
2952     {
2953         return cast(__m128i) __builtin_ia32_packsswb128(cast(short8)a, cast(short8)b);
2954     }
2955     else static if (LDC_with_SSE2)
2956     {
2957         return cast(__m128i) __builtin_ia32_packsswb128(cast(short8)a, cast(short8)b);
2958     }
2959     else static if (LDC_with_ARM64)
2960     {
        // Generates a nice pair of sqxtn.8b + sqxtn2 since LDC 1.5 -O2
2962         byte8 ra = vqmovn_s16(cast(short8)a);
2963         byte8 rb = vqmovn_s16(cast(short8)b);
2964         return cast(__m128i)vcombine_s8(ra, rb);
2965     }
2966     else
2967     {
2968         // PERF: ARM32 is missing
2969         byte16 r;
2970         short8 sa = cast(short8)a;
2971         short8 sb = cast(short8)b;
2972         foreach(i; 0..8)
2973             r.ptr[i] = saturateSignedWordToSignedByte(sa.array[i]);
2974         foreach(i; 0..8)
2975             r.ptr[i+8] = saturateSignedWordToSignedByte(sb.array[i]);
2976         return cast(__m128i)r;
2977     }
2978 }
2979 unittest
2980 {
2981     __m128i A = _mm_setr_epi16(1000, -1000, 1000, 0, 256, -129, 254, 0);
2982     byte16 R = cast(byte16) _mm_packs_epi16(A, A);
2983     byte[16] correct = [127, -128, 127, 0, 127, -128, 127, 0,
2984                         127, -128, 127, 0, 127, -128, 127, 0];
2985     assert(R.array == correct);
2986 }
2987 
2988 /// Convert packed signed 16-bit integers from `a` and `b` to packed 8-bit integers using unsigned saturation.
2989 __m128i _mm_packus_epi16 (__m128i a, __m128i b) pure @trusted
2990 {
2991     // PERF DMD catastrophic
2992     static if (GDC_with_SSE2)
2993     {
2994         return cast(__m128i) __builtin_ia32_packuswb128(cast(short8)a, cast(short8)b);
2995     }
2996     else static if (LDC_with_SSE2)
2997     {
2998         return cast(__m128i) __builtin_ia32_packuswb128(cast(short8)a, cast(short8)b);
2999     }
3000     else static if (LDC_with_ARM64)
3001     {
        // Generates a nice pair of sqxtun + sqxtun2 since LDC 1.5 -O2
3003         byte8 ra = vqmovun_s16(cast(short8)a);
3004         byte8 rb = vqmovun_s16(cast(short8)b);
3005         return cast(__m128i)vcombine_s8(ra, rb);
3006     }
3007     else
3008     {
3009         short8 sa = cast(short8)a;
3010         short8 sb = cast(short8)b;
3011         align(16) ubyte[16] result = void;
3012         for (int i = 0; i < 8; ++i)
3013         {
3014             short s = sa[i];
3015             if (s < 0) s = 0;
3016             if (s > 255) s = 255;
3017             result[i] = cast(ubyte)s;
3018 
3019             s = sb[i];
3020             if (s < 0) s = 0;
3021             if (s > 255) s = 255;
3022             result[i+8] = cast(ubyte)s;
3023         }
3024         return *cast(__m128i*)(result.ptr);
3025     }
3026 }
3027 unittest
3028 {
3029     __m128i A = _mm_setr_epi16(-10, 400, 0, 256, 255, 2, 1, 0);
3030     byte16 AA = cast(byte16) _mm_packus_epi16(A, A);
3031     static immutable ubyte[16] correctResult = [0, 255, 0, 255, 255, 2, 1, 0,
3032                                                 0, 255, 0, 255, 255, 2, 1, 0];
3033     foreach(i; 0..16)
3034         assert(AA.array[i] == cast(byte)(correctResult[i]));
3035 }
3036 
3037 /// Provide a hint to the processor that the code sequence is a spin-wait loop. This can help improve the performance 
3038 /// and power consumption of spin-wait loops.
3039 void _mm_pause() @trusted
3040 {
3041     version(GNU)
3042     {
3043         static if (GDC_with_SSE2)
3044         {
3045             __builtin_ia32_pause();
3046         }
3047         else version(X86)
3048         {
3049             asm pure nothrow @nogc @trusted
3050             {
3051                 "pause;\n" : : : ;
3052             }
3053         }
3054         else
3055             static assert(false);
3056     }
3057     else static if (LDC_with_SSE2)
3058     {
3059         __builtin_ia32_pause();
3060     }
3061     else static if (DMD_with_asm)
3062     {
3063         asm nothrow @nogc pure @safe
3064         {
3065             rep; nop; // F3 90 =  pause
3066         }
3067     }
3068     else version (LDC)
3069     {
        // PERF: Does nothing currently; could be the "yield" instruction on ARM.
3071     }
3072     else
3073         static assert(false);
3074 }
3075 unittest
3076 {
3077     _mm_pause();
3078 }
3079 
3080 /// Compute the absolute differences of packed unsigned 8-bit integers in `a` and `b`, then horizontally sum each 
3081 /// consecutive 8 differences to produce two unsigned 16-bit integers, and pack these unsigned 16-bit integers in the 
3082 /// low 16 bits of 64-bit elements in result.
3083 __m128i _mm_sad_epu8 (__m128i a, __m128i b) pure @trusted
3084 {
3085     static if (GDC_with_SSE2)
3086     {
3087         return cast(__m128i) __builtin_ia32_psadbw128(cast(ubyte16)a, cast(ubyte16)b);
3088     }
3089     else static if (LDC_with_SSE2)
3090     {
3091         return cast(__m128i) __builtin_ia32_psadbw128(cast(byte16)a, cast(byte16)b);
3092     }
3093     else static if (LDC_with_ARM64)
3094     {
3095         ushort8 t = cast(ushort8) vpaddlq_u8(vabdq_u8(cast(byte16) a, cast(byte16) b));
3096 
3097         // PERF: Looks suboptimal vs addp
3098         ushort r0 = cast(ushort)(t[0] + t[1] + t[2] + t[3]);
3099         ushort r4 = cast(ushort)(t[4] + t[5] + t[6] + t[7]);
3100         ushort8 r = 0;
3101         r[0] = r0;
3102         r[4] = r4;
3103         return cast(__m128i) r;
3104     }
3105     else
3106     {
3107         // PERF: ARM32 is lacking
3108         byte16 ab = cast(byte16)a;
3109         byte16 bb = cast(byte16)b;
3110         ubyte[16] t;
3111         foreach(i; 0..16)
3112         {
3113             int diff = cast(ubyte)(ab.array[i]) - cast(ubyte)(bb.array[i]);
3114             if (diff < 0) diff = -diff;
3115             t[i] = cast(ubyte)(diff);
3116         }
3117         int4 r = _mm_setzero_si128();
3118         r.ptr[0] = t[0] + t[1] + t[2] + t[3] + t[4] + t[5] + t[6] + t[7];
3119         r.ptr[2] = t[8] + t[9] + t[10]+ t[11]+ t[12]+ t[13]+ t[14]+ t[15];
3120         return r;
3121     }
3122 }
3123 unittest
3124 {
3125     __m128i A = _mm_setr_epi8(3, 4, 6, 8, 12, 14, 18, 20, 24, 30, 32, 38, 42, 44, 48, 54); // primes + 1
3126     __m128i B = _mm_set1_epi8(1);
3127     __m128i R = _mm_sad_epu8(A, B);
3128     int[4] correct = [2 + 3 + 5 + 7 + 11 + 13 + 17 + 19,
3129                       0,
3130                       23 + 29 + 31 + 37 + 41 + 43 + 47 + 53,
3131                       0];
3132     assert(R.array == correct);
3133 }
3134 
3135 /// Set packed 16-bit integers with the supplied values.
3136 __m128i _mm_set_epi16 (short e7, short e6, short e5, short e4, short e3, short e2, short e1, short e0) pure @trusted
3137 {
3138     short8 r = void;
3139     r.ptr[0] = e0;
3140     r.ptr[1] = e1;
3141     r.ptr[2] = e2;
3142     r.ptr[3] = e3;
3143     r.ptr[4] = e4;
3144     r.ptr[5] = e5;
3145     r.ptr[6] = e6;
3146     r.ptr[7] = e7;
3147     return cast(__m128i) r;
3148 }
3149 unittest
3150 {
3151     __m128i A = _mm_set_epi16(7, 6, 5, 4, 3, 2, 1, 0);
3152     short8 B = cast(short8) A;
3153     foreach(i; 0..8)
3154         assert(B.array[i] == i);
3155 }
3156 
3157 /// Set packed 32-bit integers with the supplied values.
3158 __m128i _mm_set_epi32 (int e3, int e2, int e1, int e0) pure @trusted
3159 {
3160     // PERF: does a constant inline correctly? vs int4 field assignment
3161     align(16) int[4] r = [e0, e1, e2, e3];
3162     return *cast(int4*)&r;
3163 }
3164 unittest
3165 {
3166     __m128i A = _mm_set_epi32(3, 2, 1, 0);
3167     foreach(i; 0..4)
3168         assert(A.array[i] == i);
3169 }
3170 
3171 /// Set packed 64-bit integers with the supplied values.
3172 __m128i _mm_set_epi64(__m64 e1, __m64 e0) pure @trusted
3173 {
3174     pragma(inline, true);
3175     long2 r = void;
3176     r.ptr[0] = e0.array[0];
3177     r.ptr[1] = e1.array[0];
3178     return cast(__m128i)(r);
3179 }
3180 unittest
3181 {
3182     __m128i A = _mm_set_epi64(_mm_cvtsi64_m64(1234), _mm_cvtsi64_m64(5678));
3183     long2 B = cast(long2) A;
3184     assert(B.array[0] == 5678);
3185     assert(B.array[1] == 1234);
3186 }
3187 
3188 /// Set packed 64-bit integers with the supplied values.
3189 __m128i _mm_set_epi64x (long e1, long e0) pure @trusted
3190 {
3191     pragma(inline, true);
3192     long2 r = void;
3193     r.ptr[0] = e0;
3194     r.ptr[1] = e1;
3195     return cast(__m128i)(r);
3196 }
3197 unittest
3198 {
3199     __m128i A = _mm_set_epi64x(1234, -5678);
3200     long2 B = cast(long2) A;
3201     assert(B.array[0] == -5678);
3202     assert(B.array[1] == 1234);
3203 }
3204 
3205 /// Set packed 8-bit integers with the supplied values.
3206 __m128i _mm_set_epi8 (byte e15, byte e14, byte e13, byte e12,
3207                       byte e11, byte e10, byte e9, byte e8,
3208                       byte e7, byte e6, byte e5, byte e4,
3209                       byte e3, byte e2, byte e1, byte e0) pure @trusted
3210 {
3211     align(16) byte[16] result = [e0, e1,  e2,  e3,  e4,  e5,  e6, e7,
3212                                  e8, e9, e10, e11, e12, e13, e14, e15];
3213     return *cast(__m128i*)(result.ptr);
3214 }
3215 unittest
3216 {
3217     byte16 R = cast(byte16) _mm_set_epi8(-1, 0, 56, 127, -128, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14);
3218     byte[16] correct = [14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, -128, 127, 56, 0, -1];
3219     assert(R.array == correct);
3220 }
3221 
3222 /// Set packed double-precision (64-bit) floating-point elements with the supplied values.
3223 __m128d _mm_set_pd (double e1, double e0) pure @trusted
3224 {
3225     pragma(inline, true);
3226     double2 r = void;
3227     r.ptr[0] = e0;
3228     r.ptr[1] = e1;
3229     return r;
3230 }
3231 unittest
3232 {
3233     __m128d A = _mm_set_pd(61.0, 55.0);
3234     double[2] correct = [55.0, 61.0];
3235     assert(A.array == correct);
3236 }
3237 
/// Broadcast double-precision (64-bit) floating-point value `a` to all elements.
3239 __m128d _mm_set_pd1 (double a) pure @trusted
3240 {
3241     pragma(inline, true);
3242     __m128d r = void;
3243     r.ptr[0] = a;
3244     r.ptr[1] = a;
3245     return r;
3246 }
3247 unittest
3248 {
3249     __m128d A = _mm_set_pd1(61.0);
3250     double[2] correct = [61.0, 61.0];
3251     assert(A.array == correct);
3252 }
3253 
3254 /// Copy double-precision (64-bit) floating-point element `a` to the lower element of result, 
3255 /// and zero the upper element.
3256 __m128d _mm_set_sd (double a) pure @trusted
3257 {
3258     double2 r = void;
3259     r.ptr[0] = a;
3260     r.ptr[1] = 0.0;
3261     return r;
3262 }
3263 unittest
3264 {
3265     __m128d A = _mm_set_sd(61.0);
3266     double[2] correct = [61.0, 0.0];
3267     assert(A.array == correct);
3268 }
3269 
/// Broadcast 16-bit integer `a` to all elements.
3271 __m128i _mm_set1_epi16 (short a) pure @trusted
3272 {
3273     version(DigitalMars) // workaround https://issues.dlang.org/show_bug.cgi?id=21469 
3274     {
3275         short8 v = a;
3276         return cast(__m128i) v;
3277     }
3278     else
3279     {
3280         pragma(inline, true);
3281         return cast(__m128i)(short8(a));
3282     }
3283 }
3284 unittest
3285 {
3286     short8 a = cast(short8) _mm_set1_epi16(31);
3287     for (int i = 0; i < 8; ++i)
3288         assert(a.array[i] == 31);
3289 }
3290 
3291 /// Broadcast 32-bit integer `a` to all elements.
3292 __m128i _mm_set1_epi32 (int a) pure @trusted
3293 {
3294     pragma(inline, true);
3295     return cast(__m128i)(int4(a));
3296 }
3297 unittest
3298 {
3299     int4 a = cast(int4) _mm_set1_epi32(31);
3300     for (int i = 0; i < 4; ++i)
3301         assert(a.array[i] == 31);
3302 }
3303 
3304 /// Broadcast 64-bit integer `a` to all elements.
3305 __m128i _mm_set1_epi64 (__m64 a) pure @safe
3306 {
3307     return _mm_set_epi64(a, a);
3308 }
3309 unittest
3310 {
3311     long b = 0x1DEADCAFE; 
3312     __m64 a;
3313     a.ptr[0] = b;
3314     long2 c = cast(long2) _mm_set1_epi64(a);
3315     assert(c.array[0] == b);
3316     assert(c.array[1] == b);
3317 }
3318 
/// Broadcast 64-bit integer `a` to all elements.
3320 __m128i _mm_set1_epi64x (long a) pure @trusted
3321 {
3322     long2 b = a; // Must be on its own line to workaround https://issues.dlang.org/show_bug.cgi?id=21470
3323     return cast(__m128i)(b);
3324 }
3325 unittest
3326 {
3327     long b = 0x1DEADCAFE;
3328     long2 c = cast(long2) _mm_set1_epi64x(b);
3329     for (int i = 0; i < 2; ++i)
3330         assert(c.array[i] == b);
3331 }
3332 
3333 /// Broadcast 8-bit integer `a` to all elements.
3334 __m128i _mm_set1_epi8 (byte a) pure @trusted
3335 {
3336     pragma(inline, true);
3337     byte16 b = a; // Must be on its own line to workaround https://issues.dlang.org/show_bug.cgi?id=21470
3338     return cast(__m128i)(b);
3339 }
3340 unittest
3341 {
3342     byte16 b = cast(byte16) _mm_set1_epi8(31);
3343     for (int i = 0; i < 16; ++i)
3344         assert(b.array[i] == 31);
3345 }
3346 
alias _mm_set1_pd = _mm_set_pd1; ///
3348 
3349 /// Set packed 16-bit integers with the supplied values in reverse order.
3350 __m128i _mm_setr_epi16 (short e7, short e6, short e5, short e4, 
3351                         short e3, short e2, short e1, short e0) pure @trusted
3352 {
3353     short8 r = void;
3354     r.ptr[0] = e7;
3355     r.ptr[1] = e6;
3356     r.ptr[2] = e5;
3357     r.ptr[3] = e4;
3358     r.ptr[4] = e3;
3359     r.ptr[5] = e2;
3360     r.ptr[6] = e1;
3361     r.ptr[7] = e0;
3362     return cast(__m128i)(r);
3363 }
3364 unittest
3365 {
3366     short8 A = cast(short8) _mm_setr_epi16(7, 6, 5, -32768, 32767, 2, 1, 0);
3367     short[8] correct = [7, 6, 5, -32768, 32767, 2, 1, 0];
3368     assert(A.array == correct);
3369 }
3370 
3371 /// Set packed 32-bit integers with the supplied values in reverse order.
3372 __m128i _mm_setr_epi32 (int e3, int e2, int e1, int e0) pure @trusted
3373 {
3374     // Performs better than = void; with GDC
3375     pragma(inline, true);
3376     align(16) int[4] result = [e3, e2, e1, e0];
3377     return *cast(__m128i*)(result.ptr);
3378 }
3379 unittest
3380 {
3381     int4 A = cast(int4) _mm_setr_epi32(-1, 0, -2147483648, 2147483647);
3382     int[4] correct = [-1, 0, -2147483648, 2147483647];
3383     assert(A.array == correct);
3384 }
3385 
3386 /// Set packed 64-bit integers with the supplied values in reverse order.
3387 __m128i _mm_setr_epi64 (long e1, long e0) pure @trusted
3388 {
3389     long2 r = void;
3390     r.ptr[0] = e1;
3391     r.ptr[1] = e0;
3392     return cast(__m128i)(r);
3393 }
3394 unittest
3395 {
3396     long2 A = cast(long2) _mm_setr_epi64(-1, 0);
3397     long[2] correct = [-1, 0];
3398     assert(A.array == correct);
3399 }
3400 
3401 /// Set packed 8-bit integers with the supplied values in reverse order.
3402 __m128i _mm_setr_epi8 (byte e15, byte e14, byte e13, byte e12,
3403                        byte e11, byte e10, byte e9,  byte e8,
3404                        byte e7,  byte e6,  byte e5,  byte e4,
3405                        byte e3,  byte e2,  byte e1,  byte e0) pure @trusted
3406 {
3407     align(16) byte[16] result = [e15, e14, e13, e12, e11, e10, e9, e8,
3408                                  e7,  e6,  e5,  e4,  e3,  e2, e1, e0];
3409     return *cast(__m128i*)(result.ptr);
3410 }
3411 unittest
3412 {
3413     byte16 R = cast(byte16) _mm_setr_epi8(-1, 0, 56, 127, -128, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14);
3414     byte[16] correct = [-1, 0, 56, 127, -128, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14];
3415     assert(R.array == correct);
3416 }
3417 
3418 /// Set packed double-precision (64-bit) floating-point elements with the supplied values in reverse order.
3419 __m128d _mm_setr_pd (double e1, double e0) pure @trusted
3420 {
3421     pragma(inline, true);
3422     double2 result;
3423     result.ptr[0] = e1;
3424     result.ptr[1] = e0;
3425     return result;
3426 }
3427 unittest
3428 {
3429     __m128d A = _mm_setr_pd(61.0, 55.0);
3430     double[2] correct = [61.0, 55.0];
3431     assert(A.array == correct);
3432 }
3433 
3434 /// Return vector of type `__m128d` with all elements set to zero.
3435 __m128d _mm_setzero_pd() pure @trusted
3436 {
3437     pragma(inline, true);
3438     double2 r = void;
3439     r.ptr[0] = 0.0;
3440     r.ptr[1] = 0.0;
3441     return r;
3442 }
3443 unittest
3444 {
3445     __m128d A = _mm_setzero_pd();
3446     double[2] correct = [0.0, 0.0];
3447     assert(A.array == correct);
3448 }
3449 
3450 /// Return vector of type `__m128i` with all elements set to zero.
3451 __m128i _mm_setzero_si128() pure @trusted
3452 {
3453     pragma(inline, true);
3454     int4 r = void;
3455     r.ptr[0] = 0;
3456     r.ptr[1] = 0;
3457     r.ptr[2] = 0;
3458     r.ptr[3] = 0;
3459     return r;
3460 }
3461 unittest
3462 {
3463     __m128i A = _mm_setzero_si128();
3464     int[4] correct = [0, 0, 0, 0];
3465     assert(A.array == correct);
3466 }
3467 
3468 /// Shuffle 32-bit integers in `a` using the control in `imm8`.
3469 /// See_also: `_MM_SHUFFLE`.
3470 __m128i _mm_shuffle_epi32(int imm8)(__m128i a) pure @trusted
3471 {
3472     // PERF DMD D_SIMD
3473     static if (GDC_with_SSE2)
3474     {
3475         return __builtin_ia32_pshufd(a, imm8);
3476     }
3477     else version(LDC)
3478     {
3479         return shufflevectorLDC!(int4, (imm8 >> 0) & 3,
3480                                  (imm8 >> 2) & 3,
3481                                  (imm8 >> 4) & 3,
3482                                  (imm8 >> 6) & 3)(a, a);
3483     }
3484     else
3485     {
3486         int4 r = void;
3487         r.ptr[0] = a.ptr[(imm8 >> 0) & 3];
3488         r.ptr[1] = a.ptr[(imm8 >> 2) & 3];
3489         r.ptr[2] = a.ptr[(imm8 >> 4) & 3];
3490         r.ptr[3] = a.ptr[(imm8 >> 6) & 3];
3491         return r;
3492     }
3493 }
3494 unittest
3495 {
3496     __m128i A = _mm_setr_epi32(0, 1, 2, 3);
3497     enum int SHUFFLE = _MM_SHUFFLE(0, 1, 2, 3);
3498     int4 B = cast(int4) _mm_shuffle_epi32!SHUFFLE(A);
3499     int[4] expectedB = [ 3, 2, 1, 0 ];
3500     assert(B.array == expectedB);
3501 }
3502 
3503 /// Shuffle double-precision (64-bit) floating-point elements using the control in `imm8`.
3504 /// See_also: `_MM_SHUFFLE2`.
3505 __m128d _mm_shuffle_pd (int imm8)(__m128d a, __m128d b) pure @trusted
3506 {
3507     // PERF DMD D_SIMD
3508     static if (GDC_with_SSE2)
3509     {
3510         return __builtin_ia32_shufpd(a, b, imm8);
3511     }
3512     else version(LDC)
3513     {
3514         return shufflevectorLDC!(double2, 0 + ( imm8 & 1 ),
3515                                  2 + ( (imm8 >> 1) & 1 ))(a, b);
3516     }
3517     else
3518     {
3519         double2 r = void;
3520         r.ptr[0] = a.array[imm8 & 1];
3521         r.ptr[1] = b.array[(imm8 >> 1) & 1];
3522         return r;
3523     }
3524 }
3525 unittest
3526 {
3527     __m128d A = _mm_setr_pd(0.5, 2.0);
3528     __m128d B = _mm_setr_pd(4.0, 5.0);
3529     enum int SHUFFLE = _MM_SHUFFLE2(1, 1);
3530     __m128d R = _mm_shuffle_pd!SHUFFLE(A, B);
3531     double[2] correct = [ 2.0, 5.0 ];
3532     assert(R.array == correct);
3533 }
3534 
3535 /// Shuffle 16-bit integers in the high 64 bits of `a` using the control in `imm8`. Store the results in the high 
/// 64 bits of result, with the low 64 bits being copied from `a` to result.
/// See_also: `_MM_SHUFFLE`.
3538 __m128i _mm_shufflehi_epi16(int imm8)(__m128i a) pure @trusted
3539 {
3540     // PERF DMD D_SIMD
3541     static if (GDC_with_SSE2)
3542     {
3543         return cast(__m128i) __builtin_ia32_pshufhw(cast(short8)a, imm8);
3544     }
3545     else version(LDC)
3546     {
3547         return cast(__m128i) shufflevectorLDC!(short8, 0, 1, 2, 3,
3548                                           4 + ( (imm8 >> 0) & 3 ),
3549                                           4 + ( (imm8 >> 2) & 3 ),
3550                                           4 + ( (imm8 >> 4) & 3 ),
3551                                           4 + ( (imm8 >> 6) & 3 ))(cast(short8)a, cast(short8)a);
3552     }
3553     else
3554     {
3555         short8 r = cast(short8)a;
3556         short8 sa = cast(short8)a;
3557         r.ptr[4] = sa.array[4 + ( (imm8 >> 0) & 3 ) ];
3558         r.ptr[5] = sa.array[4 + ( (imm8 >> 2) & 3 ) ];
3559         r.ptr[6] = sa.array[4 + ( (imm8 >> 4) & 3 ) ];
3560         r.ptr[7] = sa.array[4 + ( (imm8 >> 6) & 3 ) ];
3561         return cast(__m128i) r;
3562     }
3563 }
3564 unittest
3565 {
3566     __m128i A = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
3567     enum int SHUFFLE = _MM_SHUFFLE(0, 1, 2, 3);
3568     short8 C = cast(short8) _mm_shufflehi_epi16!SHUFFLE(A);
3569     short[8] expectedC = [ 0, 1, 2, 3, 7, 6, 5, 4 ];
3570     assert(C.array == expectedC);
3571 }
3572 
3573 /// Shuffle 16-bit integers in the low 64 bits of `a` using the control in `imm8`. Store the results in the low 64 
/// bits of result, with the high 64 bits being copied from `a` to result.
3575 /// See_also: `_MM_SHUFFLE`.
3576 __m128i _mm_shufflelo_epi16(int imm8)(__m128i a) pure @trusted
3577 {
3578     // PERF DMD D_SIMD
3579     static if (GDC_with_SSE2)
3580     {
3581         return cast(__m128i) __builtin_ia32_pshuflw(cast(short8)a, imm8);
3582     }
3583     else version(LDC)
3584     {
3585         return cast(__m128i) shufflevectorLDC!(short8, ( (imm8 >> 0) & 3 ),
3586                                                        ( (imm8 >> 2) & 3 ),
3587                                                        ( (imm8 >> 4) & 3 ),
3588                                                        ( (imm8 >> 6) & 3 ), 4, 5, 6, 7)(cast(short8)a, cast(short8)a);
3589     }
3590     else
3591     {
3592         short8 r = cast(short8)a;
3593         short8 sa = cast(short8)a;
3594         r.ptr[0] = sa.array[(imm8 >> 0) & 3];
3595         r.ptr[1] = sa.array[(imm8 >> 2) & 3];
3596         r.ptr[2] = sa.array[(imm8 >> 4) & 3];
3597         r.ptr[3] = sa.array[(imm8 >> 6) & 3];
3598         return cast(__m128i) r;
3599     }
3600 }
3601 unittest
3602 {
3603     __m128i A = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
3604     enum int SHUFFLE = _MM_SHUFFLE(0, 1, 2, 3);
3605     short8 B = cast(short8) _mm_shufflelo_epi16!SHUFFLE(A);
3606     short[8] expectedB = [ 3, 2, 1, 0, 4, 5, 6, 7 ];
3607     assert(B.array == expectedB);
3608 }
3609 
3610 /// Shift packed 32-bit integers in `a` left by `count` while shifting in zeros.
3611 deprecated("Use _mm_slli_epi32 instead.") __m128i _mm_sll_epi32 (__m128i a, __m128i count) pure @trusted
3612 {
3613     static if (LDC_with_SSE2)
3614     {
3615         return __builtin_ia32_pslld128(a, count);
3616     }
3617     else static if (GDC_with_SSE2)
3618     {
3619         return __builtin_ia32_pslld128(a, count);
3620     }
3621     else static if (DMD_with_32bit_asm)
3622     {
3623         asm pure nothrow @nogc @trusted
3624         {
3625             movdqu XMM0, a;
3626             movdqu XMM1, count;
3627             pslld XMM0, XMM1;
3628             movdqu a, XMM0;
3629         }
3630         return a;
3631     }
3632     else
3633     {
3634         int4 r = void;
3635         long2 lc = cast(long2)count;
3636         int bits = cast(int)(lc.array[0]);
3637         foreach(i; 0..4)
3638             r[i] = cast(uint)(a[i]) << bits;
3639         return r;
3640     }
3641 }
3642 
3643 /// Shift packed 64-bit integers in `a` left by `count` while shifting in zeros.
3644 deprecated("Use _mm_slli_epi64 instead.") __m128i _mm_sll_epi64 (__m128i a, __m128i count) pure @trusted
3645 {
3646     static if (LDC_with_SSE2)
3647     {
3648         return cast(__m128i) __builtin_ia32_psllq128(cast(long2)a, cast(long2)count);
3649     }
3650     else static if (GDC_with_SSE2)
3651     {
3652         return cast(__m128i) __builtin_ia32_psllq128(cast(long2)a, cast(long2)count);
3653     }
3654     else static if (DMD_with_32bit_asm)
3655     {
3656         asm pure nothrow @nogc @trusted
3657         {
3658             movdqu XMM0, a;
3659             movdqu XMM1, count;
3660             psllq XMM0, XMM1;
3661             movdqu a, XMM0;
3662         }
3663         return a;
3664     }
3665     else
3666     {
3667         // ARM: good since LDC 1.12 -O2
        // but the -O0 version is catastrophic
3669         long2 r = void;
3670         long2 sa = cast(long2)a;
3671         long2 lc = cast(long2)count;
3672         int bits = cast(int)(lc.array[0]);
3673         foreach(i; 0..2)
3674             r.array[i] = cast(ulong)(sa.array[i]) << bits;
3675         return cast(__m128i)r;
3676     }
3677 }
3678 
3679 /// Shift packed 16-bit integers in `a` left by `count` while shifting in zeros.
3680 deprecated("Use _mm_slli_epi16 instead.") __m128i _mm_sll_epi16 (__m128i a, __m128i count) pure @trusted
3681 {
3682     static if (LDC_with_SSE2)
3683     {
        return cast(__m128i) __builtin_ia32_psllw128(cast(short8)a, cast(short8)count);
3685     }
3686     else static if (GDC_with_SSE2)
3687     {
        return cast(__m128i) __builtin_ia32_psllw128(cast(short8)a, cast(short8)count);
3689     }
3690     else static if (DMD_with_32bit_asm)
3691     {
3692         asm pure nothrow @nogc
3693         {
3694             movdqu XMM0, a;
3695             movdqu XMM1, count;
3696             psllw XMM0, XMM1;
3697             movdqu a, XMM0;
3698         }
3699         return a;
3700     }
3701     else
3702     {
3703         short8 sa = cast(short8)a;
3704         long2 lc = cast(long2)count;
3705         int bits = cast(int)(lc.array[0]);
3706         short8 r = void;
3707         foreach(i; 0..8)
3708             r.ptr[i] = cast(short)(cast(ushort)(sa.array[i]) << bits);
3709         return cast(int4)r;
3710     }
3711 }
3712 
3713 
3714 /// Shift packed 32-bit integers in `a` left by `imm8` while shifting in zeros.
3715 __m128i _mm_slli_epi32 (__m128i a, int imm8) pure @trusted
3716 {
3717     static if (GDC_with_SSE2)
3718     {
3719         return __builtin_ia32_pslldi128(a, cast(ubyte)imm8);
3720     }
3721     else static if (LDC_with_SSE2)
3722     {
3723         return __builtin_ia32_pslldi128(a, cast(ubyte)imm8);
3724     }
3725     else
3726     {
3727         // Note: the intrinsics guarantee imm8[0..7] is taken, however
3728         //       D says "It's illegal to shift by the same or more bits 
3729         //       than the size of the quantity being shifted"
3730         //       and it's UB instead.
3731         int4 r = _mm_setzero_si128();
3732 
3733         ubyte count = cast(ubyte) imm8;
3734         if (count > 31)
3735             return r;
3736         
3737         foreach(i; 0..4)
3738             r.array[i] = cast(uint)(a.array[i]) << count;
3739         return r;
3740     }
3741 }
3742 unittest
3743 {
3744     __m128i A = _mm_setr_epi32(0, 2, 3, -4);
3745     __m128i B = _mm_slli_epi32(A, 1);
3746     __m128i B2 = _mm_slli_epi32(A, 1 + 256);
3747     int[4] expectedB = [ 0, 4, 6, -8];
3748     assert(B.array == expectedB);
3749     assert(B2.array == expectedB);
3750 
3751     __m128i C = _mm_slli_epi32(A, 0);
3752     int[4] expectedC = [ 0, 2, 3, -4];
3753     assert(C.array == expectedC);
3754 
3755     __m128i D = _mm_slli_epi32(A, 65);
3756     int[4] expectedD = [ 0, 0, 0, 0];
3757     assert(D.array == expectedD);
3758 }
3759 
3760 /// Shift packed 64-bit integers in `a` left by `imm8` while shifting in zeros.
3761 __m128i _mm_slli_epi64 (__m128i a, int imm8) pure @trusted
3762 {
3763     static if (GDC_with_SSE2)
3764     {
3765         return cast(__m128i) __builtin_ia32_psllqi128(cast(long2)a, cast(ubyte)imm8);
3766     }
3767     else static if (LDC_with_SSE2)
3768     {
3769         return cast(__m128i) __builtin_ia32_psllqi128(cast(long2)a, cast(ubyte)imm8);
3770     }
3771     else
3772     {
3773         long2 sa = cast(long2)a;
3774 
3775         // Note: the intrinsics guarantee imm8[0..7] is taken, however
3776         //       D says "It's illegal to shift by the same or more bits 
3777         //       than the size of the quantity being shifted"
3778         //       and it's UB instead.
3779         long2 r = cast(long2) _mm_setzero_si128();
3780         ubyte count = cast(ubyte) imm8;
3781         if (count > 63)
3782             return cast(__m128i)r;
3783 
3784         r.ptr[0] = cast(ulong)(sa.array[0]) << count;
3785         r.ptr[1] = cast(ulong)(sa.array[1]) << count;
3786         return cast(__m128i)r;
3787     }
3788 }
3789 unittest
3790 {
3791     __m128i A = _mm_setr_epi64(8, -4);
3792     long2 B = cast(long2) _mm_slli_epi64(A, 1);
3793     long2 B2 = cast(long2) _mm_slli_epi64(A, 1 + 1024);
3794     long[2] expectedB = [ 16, -8];
3795     assert(B.array == expectedB);
3796     assert(B2.array == expectedB);
3797 
3798     long2 C = cast(long2) _mm_slli_epi64(A, 0);
3799     long[2] expectedC = [ 8, -4];
3800     assert(C.array == expectedC);
3801 
3802     long2 D = cast(long2) _mm_slli_epi64(A, 64);
3803     long[2] expectedD = [ 0, -0];
3804     assert(D.array == expectedD);
3805 }
3806 
3807 /// Shift packed 16-bit integers in `a` left by `imm8` while shifting in zeros.
3808 __m128i _mm_slli_epi16(__m128i a, int imm8) pure @trusted
3809 {
3810     static if (GDC_with_SSE2)
3811     {
3812         return cast(__m128i) __builtin_ia32_psllwi128(cast(short8)a, cast(ubyte)imm8);
3813     }
3814     else static if (LDC_with_SSE2)
3815     {
3816         return cast(__m128i) __builtin_ia32_psllwi128(cast(short8)a, cast(ubyte)imm8);
3817     }
3818     else static if (LDC_with_ARM64)
3819     {
3820         short8 sa = cast(short8)a;
3821         short8 r = cast(short8)_mm_setzero_si128();
3822         ubyte count = cast(ubyte) imm8;
3823         if (count > 15)
3824             return cast(__m128i)r;
3825         r = sa << short8(count);
3826         return cast(__m128i)r;
3827     }
3828     else
3829     {
3830         short8 sa = cast(short8)a;
3831         short8 r = cast(short8)_mm_setzero_si128();
3832         ubyte count = cast(ubyte) imm8;
3833         if (count > 15)
3834             return cast(__m128i)r;
3835         foreach(i; 0..8)
3836             r.ptr[i] = cast(short)(sa.array[i] << count);
3837         return cast(__m128i)r;
3838     }
3839 }
3840 unittest
3841 {
3842     __m128i A = _mm_setr_epi16(0, 1, 2, 3, -4, -5, 6, 7);
3843     short8 B = cast(short8)( _mm_slli_epi16(A, 1) );
3844     short8 B2 = cast(short8)( _mm_slli_epi16(A, 1 + 256) );
3845     short[8] expectedB = [ 0, 2, 4, 6, -8, -10, 12, 14 ];
3846     assert(B.array == expectedB);
3847     assert(B2.array == expectedB);
3848 
3849     short8 C = cast(short8)( _mm_slli_epi16(A, 16) );
3850     short[8] expectedC = [ 0, 0, 0, 0, 0, 0, 0, 0 ];
3851     assert(C.array == expectedC);
3852 }
3853 
3854 
/// Shift `op` left by `bytes` bytes while shifting in zeros.
3856 __m128i _mm_slli_si128(ubyte bytes)(__m128i op) pure @trusted
3857 {
3858     static if (bytes & 0xF0)
3859     {
3860         return _mm_setzero_si128();
3861     }
3862     else static if (DMD_with_DSIMD)
3863     {
3864         return cast(__m128i) __simd_ib(XMM.PSLLDQ, op, bytes);
3865     }
3866     else static if (GDC_with_SSE2)
3867     {
        pragma(inline, true); // else it doesn't seem to be inlined at all by GDC; TODO: same for _mm_srli_si128
3869         return cast(__m128i) __builtin_ia32_pslldqi128(cast(long2)op, cast(ubyte)(bytes * 8)); 
3870     }
3871     else version(LDC)
3872     {
3873         return cast(__m128i) shufflevectorLDC!(byte16,
3874                                                16 - bytes, 17 - bytes, 18 - bytes, 19 - bytes, 20 - bytes, 21 - bytes,
3875                                                22 - bytes, 23 - bytes, 24 - bytes, 25 - bytes, 26 - bytes, 27 - bytes,
3876                                                28 - bytes, 29 - bytes, 30 - bytes, 31 - bytes)
3877                                                (cast(byte16)_mm_setzero_si128(), cast(byte16)op);
3878     }
3879     else static if (DMD_with_32bit_asm)
3880     {
3881         asm pure nothrow @nogc @trusted // somehow doesn't work for x86_64
3882         {
3883             movdqu XMM0, op;
3884             pslldq XMM0, bytes;
3885             movdqu op, XMM0;
3886         }
3887         return op;
3888     }
3889     else
3890     {
3891         byte16 A = cast(byte16)op;
3892         byte16 R = void;
3893         for (int n = 15; n >= bytes; --n)
3894             R.ptr[n] = A.array[n-bytes];
3895         for (int n = bytes-1; n >= 0; --n)
3896             R.ptr[n] = 0;
3897         return cast(__m128i)R;
3898     }
3899 }
3900 unittest
3901 {
3902     __m128i A = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
3903     short8 R = cast(short8) _mm_slli_si128!8(A); // shift 8 bytes to the left
3904     short[8] correct = [ 0, 0, 0, 0, 0, 1, 2, 3 ];
3905     assert(R.array == correct);
3906 
3907     __m128i B = _mm_slli_si128!16(_mm_set1_epi32(-1));
3908     int[4] expectedB = [0, 0, 0, 0];
3909     assert(B.array == expectedB);
3910 }
3911 
3912 /// Compute the square root of packed double-precision (64-bit) floating-point elements in `vec`.
3913 __m128d _mm_sqrt_pd(__m128d vec) pure @trusted
3914 {
3915     version(LDC)
3916     {
3917         // Disappeared with LDC 1.11
3918         static if (__VERSION__ < 2081)
3919             return __builtin_ia32_sqrtpd(vec);
3920         else
3921         {
3922             // PERF: use llvm_sqrt on the vector
3923             vec.array[0] = llvm_sqrt(vec.array[0]); 
3924             vec.array[1] = llvm_sqrt(vec.array[1]);
3925             return vec;
3926         }
3927     }
3928     else static if (GDC_with_SSE2)    
3929     {
3930         return __builtin_ia32_sqrtpd(vec);
3931     }
3932     else
3933     {
3934         vec.ptr[0] = sqrt(vec.array[0]);
3935         vec.ptr[1] = sqrt(vec.array[1]);
3936         return vec;
3937     }
3938 }
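// Basic sanity check for `_mm_sqrt_pd`, using lanes whose square roots are exact in double precision.
unittest
{
    __m128d A = _mm_setr_pd(4.0, 16.0);
    __m128d R = _mm_sqrt_pd(A);
    double[2] correct = [2.0, 4.0];
    assert(R.array == correct);
}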
3939 
3940 /// Compute the square root of the lower double-precision (64-bit) floating-point element in `b`, store the result in 
3941 /// the lower element of result, and copy the upper element from `a` to the upper element of result.
3942 __m128d _mm_sqrt_sd(__m128d a, __m128d b) pure @trusted
3943 {
3944     // Note: the builtin has one argument, since the legacy `sqrtsd` SSE2 instruction operates on the same register only.
3945     //       "128-bit Legacy SSE version: The first source operand and the destination operand are the same. 
3946     //        The quadword at bits 127:64 of the destination operand remains unchanged."
3947     version(LDC)
3948     {
3949         // Disappeared with LDC 1.11
3950         static if (__VERSION__ < 2081)
3951         {
3952             __m128d c = __builtin_ia32_sqrtsd(b);
3953             a[0] = c[0];
3954             return a;
3955         }
3956         else
3957         {
3958             a.array[0] = llvm_sqrt(b.array[0]);
3959             return a;
3960         }
3961     }
3962     else static if (GDC_with_SSE2)
3963     {
3964         __m128d c = __builtin_ia32_sqrtsd(b);
3965         a.ptr[0] = c.array[0];
3966         return a;
3967     }
3968     else
3969     {
3970         a.ptr[0] = sqrt(b.array[0]);
3971         return a;
3972     }
3973 }
3974 unittest
3975 {
3976     __m128d A = _mm_setr_pd(1.0, 3.0);
3977     __m128d B = _mm_setr_pd(4.0, 5.0);
3978     __m128d R = _mm_sqrt_sd(A, B);
3979     double[2] correct = [2.0, 3.0 ];
3980     assert(R.array == correct);
3981 }
3982 
3983 /// Shift packed 16-bit integers in `a` right by `count` while shifting in sign bits.
3984 deprecated("Use _mm_srai_epi16 instead.") __m128i _mm_sra_epi16 (__m128i a, __m128i count) pure @trusted
3985 {
3986     static if (GDC_with_SSE2)
3987     {
3988         return cast(__m128i) __builtin_ia32_psraw128(cast(short8)a, cast(short8)count);
3989     }
3990     else static if (LDC_with_SSE2)
3991     {
3992         return cast(__m128i) __builtin_ia32_psraw128(cast(short8)a, cast(short8)count);
3993     }
3994     else
3995     {
3996         short8 sa = cast(short8)a;
3997         long2 lc = cast(long2)count;
3998         int bits = cast(int)(lc.array[0]);
3999         short8 r = void;
4000         foreach(i; 0..8)
4001             r.ptr[i] = cast(short)(sa.array[i] >> bits);
4002         return cast(int4)r;
4003     }
4004 }
4005 
4006 /// Shift packed 32-bit integers in `a` right by `count` while shifting in sign bits.
4007 deprecated("Use _mm_srai_epi32 instead.") __m128i _mm_sra_epi32 (__m128i a, __m128i count) pure @trusted
4008 {
4009     static if (LDC_with_SSE2)
4010     {
4011         return __builtin_ia32_psrad128(a, count);
4012     }
4013     else static if (GDC_with_SSE2)
4014     {
4015         return __builtin_ia32_psrad128(a, count);
4016     }
4017     else
4018     {    
4019         int4 r = void;
4020         long2 lc = cast(long2)count;
4021         int bits = cast(int)(lc.array[0]);
4022         r.ptr[0] = (a.array[0] >> bits);
4023         r.ptr[1] = (a.array[1] >> bits);
4024         r.ptr[2] = (a.array[2] >> bits);
4025         r.ptr[3] = (a.array[3] >> bits);
4026         return r;
4027     }
4028 }
4029 
4030 
4031 /// Shift packed 16-bit integers in `a` right by `imm8` while shifting in sign bits.
4032 __m128i _mm_srai_epi16 (__m128i a, int imm8) pure @trusted
4033 {
4034     static if (GDC_with_SSE2)
4035     {
4036         return cast(__m128i) __builtin_ia32_psrawi128(cast(short8)a, cast(ubyte)imm8);
4037     }
4038     else static if (LDC_with_SSE2)
4039     {
4040         return cast(__m128i) __builtin_ia32_psrawi128(cast(short8)a, cast(ubyte)imm8);
4041     }
4042     else static if (LDC_with_ARM64)
4043     {
4044         short8 sa = cast(short8)a;
4045         ubyte count = cast(ubyte)imm8;
4046         if (count > 15) 
4047             count = 15;
4048         short8 r = sa >> short8(count);
4049         return cast(__m128i)r;
4050     }
4051     else
4052     {
4053         short8 sa = cast(short8)a;
4054         short8 r = void;
4055 
4056         // Note: the intrinsics guarantee imm8[0..7] is taken, however
4057         //       D says "It's illegal to shift by the same or more bits 
4058         //       than the size of the quantity being shifted"
4059         //       and it's UB instead.
4060         ubyte count = cast(ubyte)imm8;
4061         if (count > 15) 
4062             count = 15;
4063         foreach(i; 0..8)
4064             r.ptr[i] = cast(short)(sa.array[i] >> count);
4065         return cast(int4)r;
4066     }
4067 }
4068 unittest
4069 {
4070     __m128i A = _mm_setr_epi16(0, 1, 2, 3, -4, -5, 6, 7);
4071     short8 B = cast(short8)( _mm_srai_epi16(A, 1) );
4072     short8 B2 = cast(short8)( _mm_srai_epi16(A, 1 + 256) );
4073     short[8] expectedB = [ 0, 0, 1, 1, -2, -3, 3, 3 ];
4074     assert(B.array == expectedB);
4075     assert(B2.array == expectedB);
4076 
4077     short8 C = cast(short8)( _mm_srai_epi16(A, 18) );
4078     short[8] expectedC = [ 0, 0, 0, 0, -1, -1, 0, 0 ];
4079     assert(C.array == expectedC);
4080 }
4081 
4082 /// Shift packed 32-bit integers in `a` right by `imm8` while shifting in sign bits.
4083 __m128i _mm_srai_epi32 (__m128i a, int imm8) pure @trusted
4084 {
4085     static if (LDC_with_SSE2)
4086     {
4087         return __builtin_ia32_psradi128(a, cast(ubyte)imm8);
4088     }
4089     else static if (GDC_with_SSE2)
4090     {
4091         return __builtin_ia32_psradi128(a, cast(ubyte)imm8);
4092     }
4093     else
4094     {
4095         int4 r = void;
4096 
4097         // Note: the intrinsics guarantee imm8[0..7] is taken, however
4098         //       D says "It's illegal to shift by the same or more bits 
4099         //       than the size of the quantity being shifted"
4100         //       and it's UB instead.
4101         ubyte count = cast(ubyte) imm8;
4102         if (count > 31)
4103             count = 31;
4104 
4105         r.ptr[0] = (a.array[0] >> count);
4106         r.ptr[1] = (a.array[1] >> count);
4107         r.ptr[2] = (a.array[2] >> count);
4108         r.ptr[3] = (a.array[3] >> count);
4109         return r;
4110     }
4111 }
4112 unittest
4113 {
4114     __m128i A = _mm_setr_epi32(0, 2, 3, -4);
4115     __m128i B = _mm_srai_epi32(A, 1);
4116     __m128i B2 = _mm_srai_epi32(A, 1 + 256);
4117     int[4] expectedB = [ 0, 1, 1, -2];
4118     assert(B.array == expectedB);
4119     assert(B2.array == expectedB);
4120 
4121     __m128i C = _mm_srai_epi32(A, 32);
4122     int[4] expectedC = [ 0, 0, 0, -1];
4123     assert(C.array == expectedC);
4124 
4125     __m128i D = _mm_srai_epi32(A, 0);
4126     int[4] expectedD = [ 0, 2, 3, -4];
4127     assert(D.array == expectedD);
4128 }
4129 
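/// Shift packed 16-bit integers in `a` right by `count` while shifting in zeros.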
4130 deprecated("Use _mm_srli_epi16 instead.") __m128i _mm_srl_epi16 (__m128i a, __m128i count) pure @trusted
4131 {
4132     static if (LDC_with_SSE2)
4133     {
4134         return cast(__m128i) __builtin_ia32_psrlw128(cast(short8)a, cast(short8)count);
4135     }
4136     else static if (GDC_with_SSE2)
4137     {
4138         return cast(__m128i) __builtin_ia32_psrlw128(cast(short8)a, cast(short8)count);
4139     }
4140     else
4141     {
4142         short8 sa = cast(short8)a;
4143         long2 lc = cast(long2)count;
4144         int bits = cast(int)(lc.array[0]);
4145         short8 r = void;
4146         foreach(i; 0..8)
4147             r.ptr[i] = cast(short)(cast(ushort)(sa.array[i]) >> bits);
4148         return cast(int4)r;
4149     }
4150 }
4151 
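/// Shift packed 32-bit integers in `a` right by `count` while shifting in zeros.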
4152 deprecated("Use _mm_srli_epi32 instead.") __m128i _mm_srl_epi32 (__m128i a, __m128i count) pure @trusted
4153 {
4154     static if (LDC_with_SSE2)
4155     {
4156         return __builtin_ia32_psrld128(a, count);
4157     }
4158     else static if (GDC_with_SSE2)
4159     {
4160         return __builtin_ia32_psrld128(a, count);
4161     }
4162     else
4163     {
4164         int4 r = void;
4165         long2 lc = cast(long2)count;
4166         int bits = cast(int)(lc.array[0]);
4167         r.ptr[0] = cast(uint)(a.array[0]) >> bits;
4168         r.ptr[1] = cast(uint)(a.array[1]) >> bits;
4169         r.ptr[2] = cast(uint)(a.array[2]) >> bits;
4170         r.ptr[3] = cast(uint)(a.array[3]) >> bits;
4171         return r;
4172     }
4173 }
4174 
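/// Shift packed 64-bit integers in `a` right by `count` while shifting in zeros.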
4175 deprecated("Use _mm_srli_epi64 instead.") __m128i _mm_srl_epi64 (__m128i a, __m128i count) pure @trusted
4176 {
4177     static if (LDC_with_SSE2)
4178     {
4179         return cast(__m128i) __builtin_ia32_psrlq128(cast(long2)a, cast(long2)count);
4180     }
4181     else static if (GDC_with_SSE2)
4182     {
4183         return cast(__m128i) __builtin_ia32_psrlq128(cast(long2)a, cast(long2)count);
4184     }
4185     else
4186     {
4187         // Workaround for https://issues.dlang.org/show_bug.cgi?id=23047
4188         // => avoid void initialization.
4189         long2 r;
4190         long2 sa = cast(long2)a;
4191         long2 lc = cast(long2)count;
4192         int bits = cast(int)(lc.array[0]);
4193         r.ptr[0] = cast(ulong)(sa.array[0]) >> bits;
4194         r.ptr[1] = cast(ulong)(sa.array[1]) >> bits;
4195         return cast(__m128i)r;
4196     }
4197 }
4198 
4199 /// Shift packed 16-bit integers in `a` right by `imm8` while shifting in zeros.
4200 __m128i _mm_srli_epi16 (__m128i a, int imm8) pure @trusted
4201 {
4202     static if (GDC_with_SSE2)
4203     {
4204         return cast(__m128i) __builtin_ia32_psrlwi128(cast(short8)a, cast(ubyte)imm8);
4205     }
4206     else static if (LDC_with_SSE2)
4207     {
4208         return cast(__m128i) __builtin_ia32_psrlwi128(cast(short8)a, cast(ubyte)imm8);
4209     }
4210     else static if (LDC_with_ARM64)
4211     {
4212         short8 sa = cast(short8)a;
4213         short8 r = cast(short8) _mm_setzero_si128();
4214 
4215         ubyte count = cast(ubyte)imm8;
4216         if (count >= 16)
4217             return cast(__m128i)r;
4218 
        r = sa >>> short8(count); // This facility is offered by LDC, but not DMD.
4220         return cast(__m128i)r;
4221     }
4222     else
4223     {
4224         short8 sa = cast(short8)a;
4225         ubyte count = cast(ubyte)imm8;
4226 
4227         short8 r = cast(short8) _mm_setzero_si128();
4228         if (count >= 16)
4229             return cast(__m128i)r;
4230 
4231         foreach(i; 0..8)
4232             r.array[i] = cast(short)(cast(ushort)(sa.array[i]) >> count);
4233         return cast(__m128i)r;
4234     }
4235 }
4236 unittest
4237 {
4238     __m128i A = _mm_setr_epi16(0, 1, 2, 3, -4, -5, 6, 7);
4239     short8 B = cast(short8)( _mm_srli_epi16(A, 1) );
4240     short8 B2 = cast(short8)( _mm_srli_epi16(A, 1 + 256) );
4241     short[8] expectedB = [ 0, 0, 1, 1, 0x7FFE, 0x7FFD, 3, 3 ];
4242     assert(B.array == expectedB);
4243     assert(B2.array == expectedB);
4244 
4245     short8 C = cast(short8)( _mm_srli_epi16(A, 16) );
4246     short[8] expectedC = [ 0, 0, 0, 0, 0, 0, 0, 0];
4247     assert(C.array == expectedC);
4248 
4249     short8 D = cast(short8)( _mm_srli_epi16(A, 0) );
4250     short[8] expectedD = [ 0, 1, 2, 3, -4, -5, 6, 7 ];
4251     assert(D.array == expectedD);
4252 }
4253 
4254 
4255 /// Shift packed 32-bit integers in `a` right by `imm8` while shifting in zeros.
4256 __m128i _mm_srli_epi32 (__m128i a, int imm8) pure @trusted
4257 {
4258     static if (GDC_with_SSE2)
4259     {
4260         return __builtin_ia32_psrldi128(a, cast(ubyte)imm8);
4261     }
4262     else static if (LDC_with_SSE2)
4263     {
4264         return __builtin_ia32_psrldi128(a, cast(ubyte)imm8);
4265     }
4266     else
4267     {
4268         ubyte count = cast(ubyte) imm8;
4269 
4270         // Note: the intrinsics guarantee imm8[0..7] is taken, however
4271         //       D says "It's illegal to shift by the same or more bits 
4272         //       than the size of the quantity being shifted"
4273         //       and it's UB instead.
4274         int4 r = _mm_setzero_si128();
4275         if (count >= 32)
4276             return r;
4277         r.ptr[0] = a.array[0] >>> count;
4278         r.ptr[1] = a.array[1] >>> count;
4279         r.ptr[2] = a.array[2] >>> count;
4280         r.ptr[3] = a.array[3] >>> count;
4281         return r;
4282     }
4283 }
4284 unittest
4285 {
4286     __m128i A = _mm_setr_epi32(0, 2, 3, -4);
4287     __m128i B = _mm_srli_epi32(A, 1);
4288     __m128i B2 = _mm_srli_epi32(A, 1 + 256);
4289     int[4] expectedB = [ 0, 1, 1, 0x7FFFFFFE];
4290     assert(B.array == expectedB);
4291     assert(B2.array == expectedB);
4292  
4293     __m128i C = _mm_srli_epi32(A, 255);
4294     int[4] expectedC = [ 0, 0, 0, 0 ];
4295     assert(C.array == expectedC);
4296 }
4297 
4298 /// Shift packed 64-bit integers in `a` right by `imm8` while shifting in zeros.
4299 __m128i _mm_srli_epi64 (__m128i a, int imm8) pure @trusted
4300 {
4301     // PERF DMD
4302     static if (GDC_with_SSE2)
4303     {
4304         return cast(__m128i) __builtin_ia32_psrlqi128(cast(long2)a, cast(ubyte)imm8);
4305     }
4306     else static if (LDC_with_SSE2)
4307     {
4308         return cast(__m128i) __builtin_ia32_psrlqi128(cast(long2)a, cast(ubyte)imm8);
4309     }
4310     else
4311     {
4312         long2 r = cast(long2) _mm_setzero_si128();
4313         long2 sa = cast(long2)a;
4314 
4315         ubyte count = cast(ubyte) imm8;
4316         if (count >= 64)
4317             return cast(__m128i)r;
4318 
4319         r.ptr[0] = sa.array[0] >>> count;
4320         r.ptr[1] = sa.array[1] >>> count;
4321         return cast(__m128i)r;
4322     }
4323 }
4324 unittest
4325 {
4326     __m128i A = _mm_setr_epi64(8, -4);
4327     long2 B = cast(long2) _mm_srli_epi64(A, 1);
4328     long2 B2 = cast(long2) _mm_srli_epi64(A, 1 + 512);
4329     long[2] expectedB = [ 4, 0x7FFFFFFFFFFFFFFE];
4330     assert(B.array == expectedB);
4331     assert(B2.array == expectedB);
4332 
4333     long2 C = cast(long2) _mm_srli_epi64(A, 64);
4334     long[2] expectedC = [ 0, 0 ];
4335     assert(C.array == expectedC);
4336 }
4337 
4338 /// Shift `v` right by `bytes` bytes while shifting in zeros.
4339 __m128i _mm_srli_si128(ubyte bytes)(__m128i v) pure @trusted
4340 {
4341     static if (bytes & 0xF0)
4342     {
4343         return _mm_setzero_si128();
4344     }
4345     else static if (DMD_with_DSIMD)
4346     {
4347         return cast(__m128i) __simd_ib(XMM.PSRLDQ, v, bytes);
4348     }
4349     else static if (GDC_with_SSE2)
4350     {
4351         return cast(__m128i) __builtin_ia32_psrldqi128(cast(long2)v, cast(ubyte)(bytes * 8));
4352     }
4353     else static if (DMD_with_32bit_asm)
4354     {
4355         asm pure nothrow @nogc @trusted
4356         {
4357             movdqu XMM0, v;
4358             psrldq XMM0, bytes;
4359             movdqu v, XMM0;
4360         }
4361         return v;
4362     }
4363     else version(LDC)
4364     {
4365         return cast(__m128i) shufflevectorLDC!(byte16,
4366                                                bytes+0, bytes+1, bytes+2, bytes+3, bytes+4, bytes+5, bytes+6, bytes+7,
4367                                                bytes+8, bytes+9, bytes+10, bytes+11, bytes+12, bytes+13, bytes+14, bytes+15)
4368                                                (cast(byte16) v, cast(byte16)_mm_setzero_si128());
4369     }
4370     else
4371     {
4372         byte16 A = cast(byte16)v;
4373         byte16 R = void;
4374         for (int n = 0; n < bytes; ++n)
4375             R.ptr[15-n] = 0;
4376         for (int n = bytes; n < 16; ++n)
4377             R.ptr[15-n] = A.array[15 - n + bytes];
4378         return cast(__m128i)R;
4379     }
4380 }
4381 unittest
4382 {
4383     __m128i R = _mm_srli_si128!4(_mm_set_epi32(4, 3, -2, 1));
4384     int[4] correct = [-2, 3, 4, 0];
4385     assert(R.array == correct);
4386 
4387     __m128i A = _mm_srli_si128!16(_mm_set1_epi32(-1));
4388     int[4] expectedA = [0, 0, 0, 0];
4389     assert(A.array == expectedA);
4390 }
4391 
4392 /// Shift `v` right by `bytes` bytes while shifting in zeros.
4393 /// #BONUS
4394 __m128 _mm_srli_ps(ubyte bytes)(__m128 v) pure @safe
4395 {
4396     return cast(__m128)_mm_srli_si128!bytes(cast(__m128i)v);
4397 }
4398 unittest
4399 {
4400     __m128 R = _mm_srli_ps!8(_mm_set_ps(4.0f, 3.0f, 2.0f, 1.0f));
4401     float[4] correct = [3.0f, 4.0f, 0, 0];
4402     assert(R.array == correct);
4403 }
4404 
4405 /// Shift `v` right by `bytes` bytes while shifting in zeros.
4406 /// #BONUS
4407 __m128d _mm_srli_pd(ubyte bytes)(__m128d v) pure @safe
4408 {
4409     return cast(__m128d) _mm_srli_si128!bytes(cast(__m128i)v);
4410 }
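// Minimal check mirroring the `_mm_srli_ps` test above: an 8-byte shift moves the
// upper double to the lower lane and zeroes the upper lane.
unittest
{
    __m128d R = _mm_srli_pd!8(_mm_setr_pd(2.0, 3.0));
    double[2] correct = [3.0, 0.0];
    assert(R.array == correct);
}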
4411 
4412 /// Store 128-bits (composed of 2 packed double-precision (64-bit) floating-point elements) from `a` into memory. 
4413 /// `mem_addr` must be aligned on a 16-byte boundary or a general-protection exception may be generated.
4414 void _mm_store_pd (double* mem_addr, __m128d a) pure @trusted
4415 {
4416     pragma(inline, true);
4417     __m128d* aligned = cast(__m128d*)mem_addr;
4418     *aligned = a;
4419 }
4420 unittest
4421 {
4422     align(16) double[2] A;
4423     __m128d B = _mm_setr_pd(-8.0, 9.0);
4424     _mm_store_pd(A.ptr, B);
4425     assert(A == [-8.0, 9.0]);
4426 }
4427 
4428 /// Store the lower double-precision (64-bit) floating-point element from `a` into 2 contiguous elements in memory. 
4429 /// `mem_addr` must be aligned on a 16-byte boundary or a general-protection exception may be generated.
4430 void _mm_store_pd1 (double* mem_addr, __m128d a) pure @trusted
4431 {
4432     __m128d* aligned = cast(__m128d*)mem_addr;
4433     __m128d r; // PERF =void;
4434     r.ptr[0] = a.array[0];
4435     r.ptr[1] = a.array[0];
4436     *aligned = r;
4437 }
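// The lower element of `a` is duplicated into both destination slots.
unittest
{
    align(16) double[2] A;
    _mm_store_pd1(A.ptr, _mm_setr_pd(-8.0, 9.0));
    assert(A == [-8.0, -8.0]);
}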
4438 
4439 /// Store the lower double-precision (64-bit) floating-point element from `a` into memory. `mem_addr` does not need to 
4440 /// be aligned on any particular boundary.
4441 void _mm_store_sd (double* mem_addr, __m128d a) pure @safe
4442 {
4443     pragma(inline, true);
4444     *mem_addr = a.array[0];
4445 }
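// The lower element is stored to a possibly unaligned address.
unittest
{
    double a = 4.5;
    _mm_store_sd(&a, _mm_setr_pd(-8.0, 9.0));
    assert(a == -8.0);
}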
4446 
4447 /// Store 128-bits of integer data from `a` into memory. `mem_addr` must be aligned on a 16-byte boundary or a 
4448 /// general-protection exception may be generated.
4449 void _mm_store_si128 (__m128i* mem_addr, __m128i a) pure @safe
4450 {
4451     pragma(inline, true);
4452     *mem_addr = a;
4453 }
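// Aligned 128-bit integer store round-trips all four lanes.
unittest
{
    align(16) int[4] A;
    _mm_store_si128(cast(__m128i*)A.ptr, _mm_setr_epi32(-8, 9, 10, -11));
    assert(A == [-8, 9, 10, -11]);
}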
4454 
4455 alias _mm_store1_pd = _mm_store_pd1; ///
4456 
4457 /// Store the upper double-precision (64-bit) floating-point element from `a` into memory.
4458 void _mm_storeh_pd (double* mem_addr, __m128d a) pure @safe
4459 {
4460     pragma(inline, true);
4461     *mem_addr = a.array[1];
4462 }
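// The upper lane of `a` is stored.
unittest
{
    double a = 4.5;
    _mm_storeh_pd(&a, _mm_setr_pd(-8.0, 9.0));
    assert(a == 9.0);
}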
4463 
/// Store the lower 64-bit integer from `a` into memory.
/// Note: `mem_addr` doesn't have to actually be aligned, which breaks
/// expectations from the user point of view. This problem also exists in C++.
4466 void _mm_storel_epi64 (__m128i* mem_addr, __m128i a) pure @safe
4467 {
4468     pragma(inline, true);
4469     long* dest = cast(long*)mem_addr;
4470     long2 la = cast(long2)a;
4471     *dest = la.array[0];
4472 }
4473 unittest
4474 {
4475     long[3] A = [1, 2, 3];
4476     _mm_storel_epi64(cast(__m128i*)(&A[1]), _mm_set_epi64x(0x1_0000_0000, 0x1_0000_0000));
4477     long[3] correct = [1, 0x1_0000_0000, 3];
4478     assert(A == correct);
4479 }
4480 
4481 /// Store the lower double-precision (64-bit) floating-point element from `a` into memory.
4482 void _mm_storel_pd (double* mem_addr, __m128d a) pure @safe
4483 {
4484     pragma(inline, true);
4485     *mem_addr = a.array[0];
4486 }
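// The lower lane of `a` is stored.
unittest
{
    double a = 4.5;
    _mm_storel_pd(&a, _mm_setr_pd(-8.0, 9.0));
    assert(a == -8.0);
}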
4487 
4488 /// Store 2 double-precision (64-bit) floating-point elements from `a` into memory in reverse 
4489 /// order. `mem_addr` must be aligned on a 16-byte boundary or a general-protection exception 
4490 /// may be generated.
4491 void _mm_storer_pd (double* mem_addr, __m128d a) pure @system
4492 {
4493     __m128d reversed = void;
4494     reversed.ptr[0] = a.array[1];
4495     reversed.ptr[1] = a.array[0];
4496     *cast(__m128d*)mem_addr = reversed;
4497 }
4498 unittest
4499 {
4500     align(16) double[2] A = [0.0, 1.0];
4501     _mm_storer_pd(A.ptr, _mm_setr_pd(2.0, 3.0));
4502     assert(A[0] == 3.0 && A[1] == 2.0);
4503 }
4504 
4505 /// Store 128-bits (composed of 2 packed double-precision (64-bit) floating-point elements) from 
4506 /// `a` into memory. `mem_addr` does not need to be aligned on any particular boundary.
void _mm_storeu_pd (double* mem_addr, __m128d a) pure @trusted // TODO: signature should be @system
4508 {
4509     // PERF DMD
4510     pragma(inline, true);
4511     static if (GDC_with_SSE2)
4512     {
4513         __builtin_ia32_storeupd(mem_addr, a);
4514     }
4515     else version(LDC)
4516     {
4517         storeUnaligned!double2(a, mem_addr);
4518     }
4519     else
4520     {
4521         mem_addr[0] = a.array[0];
4522         mem_addr[1] = a.array[1];
4523     }
4524 }
4525 unittest
4526 {
4527     __m128d A = _mm_setr_pd(3.0, 4.0);
4528     align(16) double[4] R = [0.0, 0, 0, 0];
4529     double[2] correct = [3.0, 4.0];
4530     _mm_storeu_pd(&R[1], A);
4531     assert(R[1..3] == correct);
4532 }
4533 
4534 /// Store 128-bits of integer data from `a` into memory. `mem_addr` does not need to be aligned on any particular 
4535 /// boundary.
void _mm_storeu_si128 (__m128i* mem_addr, __m128i a) pure @trusted // TODO: signature is wrong, `mem_addr` is not necessarily aligned; make it @system
4537 {
4538     // PERF: DMD
4539     pragma(inline, true);
4540     static if (GDC_with_SSE2)
4541     {
4542         __builtin_ia32_storedqu(cast(char*)mem_addr, cast(ubyte16)a);
4543     }
4544     else version(LDC)
4545     {
4546         storeUnaligned!__m128i(a, cast(int*)mem_addr);
4547     }
4548     else
4549     {
4550         int* p = cast(int*)mem_addr;
4551         p[0] = a.array[0];
4552         p[1] = a.array[1];
4553         p[2] = a.array[2];
4554         p[3] = a.array[3];
4555     }
4556 }
4557 unittest
4558 {
4559     __m128i A = _mm_setr_epi32(1, 2, 3, 4);
4560     align(16) int[6] R = [0, 0, 0, 0, 0, 0];
4561     int[4] correct = [1, 2, 3, 4];
4562     _mm_storeu_si128(cast(__m128i*)(&R[1]), A);
4563     assert(R[1..5] == correct);
4564 }
4565 
4566 /// Store 16-bit integer from the first element of `a` into memory. 
4567 /// `mem_addr` does not need to be aligned on any particular boundary.
4568 void _mm_storeu_si16 (void* mem_addr, __m128i a) pure @system
4569 {
4570     short* dest = cast(short*)mem_addr;
4571     *dest = (cast(short8)a).array[0];
4572 }
4573 unittest
4574 {
4575     short[2] arr = [-24, 12];
4576     _mm_storeu_si16(&arr[1], _mm_set1_epi16(26));
4577     short[2] correct = [-24, 26];
4578     assert(arr == correct);
4579 }
4580 
4581 /// Store 32-bit integer from the first element of `a` into memory. 
4582 /// `mem_addr` does not need to be aligned on any particular boundary.
void _mm_storeu_si32 (void* mem_addr, __m128i a) pure @trusted // TODO: should really be @system
4584 {
4585     pragma(inline, true);
4586     int* dest = cast(int*)mem_addr;
4587     *dest = a.array[0];
4588 }
4589 unittest
4590 {
4591     int[2] arr = [-24, 12];
4592     _mm_storeu_si32(&arr[1], _mm_setr_epi32(-1, -2, -6, -7));
4593     assert(arr == [-24, -1]);
4594 }
4595 
4596 /// Store 64-bit integer from the first element of `a` into memory. 
4597 /// `mem_addr` does not need to be aligned on any particular boundary.
4598 void _mm_storeu_si64 (void* mem_addr, __m128i a) pure @system
4599 {
4600     pragma(inline, true);
4601     long* dest = cast(long*)mem_addr;
4602     long2 la = cast(long2)a;
4603     *dest = la.array[0];
4604 }
4605 unittest
4606 {
4607     long[3] A = [1, 2, 3];
4608     _mm_storeu_si64(&A[1], _mm_set_epi64x(0x1_0000_0000, 0x1_0000_0000));
4609     long[3] correct = [1, 0x1_0000_0000, 3];
4610     assert(A == correct);
4611 }
4612 
4613 /// Store 128-bits (composed of 2 packed double-precision (64-bit) floating-point elements)
4614 /// from `a` into memory using a non-temporal memory hint. `mem_addr` must be aligned on a 16-byte
4615 /// boundary or a general-protection exception may be generated.
4616 /// Note: non-temporal stores should be followed by `_mm_sfence()` for reader threads.
4617 void _mm_stream_pd (double* mem_addr, __m128d a) pure @system
4618 {
4619     // PERF DMD D_SIMD
4620     static if (GDC_with_SSE2)
4621     {
4622         return __builtin_ia32_movntpd(mem_addr, a); 
4623     }
4624     else static if (LDC_with_InlineIREx)
4625     {
4626         enum prefix = `!0 = !{ i32 1 }`;
4627         enum ir = `
4628             store <2 x double> %1, <2 x double>* %0, align 16, !nontemporal !0
4629             ret void`;
4630         LDCInlineIREx!(prefix, ir, "", void, double2*, double2)(cast(double2*)mem_addr, a);
4631     }
4632     else
4633     {
4634         // Regular store instead.
4635         __m128d* dest = cast(__m128d*)mem_addr;
4636         *dest = a;
4637     }
4638 }
4639 unittest
4640 {
4641     align(16) double[2] A;
4642     __m128d B = _mm_setr_pd(-8.0, 9.0);
4643     _mm_stream_pd(A.ptr, B);
4644     assert(A == [-8.0, 9.0]);
4645 }
4646 
4647 /// Store 128-bits of integer data from a into memory using a non-temporal memory hint.
4648 /// `mem_addr` must be aligned on a 16-byte boundary or a general-protection exception
4649 /// may be generated.
4650 /// Note: non-temporal stores should be followed by `_mm_sfence()` for reader threads.
4651 void _mm_stream_si128 (__m128i* mem_addr, __m128i a) pure @trusted
4652 {
4653     // PERF DMD D_SIMD
4654     static if (GDC_with_SSE2)
4655     {
4656         return __builtin_ia32_movntdq (cast(long2*)mem_addr, cast(long2)a); 
4657     }
4658     else static if (LDC_with_InlineIREx)
4659     {
4660         enum prefix = `!0 = !{ i32 1 }`;
4661         enum ir = `
4662             store <4 x i32> %1, <4 x i32>* %0, align 16, !nontemporal !0
4663             ret void`;
4664         LDCInlineIREx!(prefix, ir, "", void, int4*, int4)(cast(int4*)mem_addr, a);
4665     }
4666     else
4667     {
4668         // Regular store instead.
4669         __m128i* dest = cast(__m128i*)mem_addr;
4670         *dest = a;
4671     }
4672 }
4673 unittest
4674 {
4675     align(16) int[4] A;
4676     __m128i B = _mm_setr_epi32(-8, 9, 10, -11);
4677     _mm_stream_si128(cast(__m128i*)A.ptr, B);
4678     assert(A == [-8, 9, 10, -11]);
4679 }
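// Usage sketch: follow a non-temporal store with `_mm_sfence` (from inteli.xmmintrin, publicly
// imported here) so that readers in other threads observe the stored data, per the note above.
unittest
{
    align(16) int[4] buf;
    _mm_stream_si128(cast(__m128i*)buf.ptr, _mm_set1_epi32(42));
    _mm_sfence(); // make the streaming store globally visible before signalling readers
    assert(buf == [42, 42, 42, 42]);
}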
4680 
4681 /// Store 32-bit integer a into memory using a non-temporal hint to minimize cache
4682 /// pollution. If the cache line containing address `mem_addr` is already in the cache,
4683 /// the cache will be updated.
4684 /// Note: non-temporal stores should be followed by `_mm_sfence()` for reader threads.
4685 void _mm_stream_si32 (int* mem_addr, int a) pure @trusted
4686 {
4687     // PERF DMD D_SIMD
4688     static if (GDC_with_SSE2)
4689     {
4690         return __builtin_ia32_movnti(mem_addr, a);
4691     }
4692     else static if (LDC_with_InlineIREx)
4693     {
4694         enum prefix = `!0 = !{ i32 1 }`;
4695         enum ir = `
4696             store i32 %1, i32* %0, !nontemporal !0
4697             ret void`;
4698         LDCInlineIREx!(prefix, ir, "", void, int*, int)(mem_addr, a);
4699     }
4700     else
4701     {
4702         // Regular store instead.
4703         *mem_addr = a;
4704     }
4705 }
4706 unittest
4707 {
4708     int A;
4709     _mm_stream_si32(&A, -34);
4710     assert(A == -34);
4711 }
4712 
4713 /// Store 64-bit integer a into memory using a non-temporal hint to minimize
4714 /// cache pollution. If the cache line containing address `mem_addr` is already
4715 /// in the cache, the cache will be updated.
4716 /// Note: non-temporal stores should be followed by `_mm_sfence()` for reader threads.
4717 void _mm_stream_si64 (long* mem_addr, long a) pure @trusted
4718 {
4719     // PERF DMD D_SIMD
4720     static if (GDC_with_SSE2)
4721     {
4722         return __builtin_ia32_movnti64(mem_addr, a);
4723     }
4724     else static if (LDC_with_InlineIREx)
4725     {
4726         enum prefix = `!0 = !{ i32 1 }`;
4727         enum ir = `
4728             store i64 %1, i64* %0, !nontemporal !0
4729             ret void`;
4730         LDCInlineIREx!(prefix, ir, "", void, long*, long)(mem_addr, a);
4731 
4732     }
4733     else
4734     {
4735         // Regular store instead.
4736         *mem_addr = a;
4737     }
4738 }
4739 unittest
4740 {
4741     long A;
4742     _mm_stream_si64(&A, -46);
4743     assert(A == -46);
4744 }
4745 
4746 /// Subtract packed 16-bit integers in `b` from packed 16-bit integers in `a`.
4747 __m128i _mm_sub_epi16(__m128i a, __m128i b) pure @safe
4748 {
4749     pragma(inline, true);
4750     return cast(__m128i)(cast(short8)a - cast(short8)b);
4751 }
4752 unittest
4753 {
4754     __m128i A = _mm_setr_epi16(16,  32767, 1, 2,    3, 4, 6, 6);
4755     __m128i B = _mm_setr_epi16(15, -32768, 6, 8, 1000, 1, 5, 6);
4756     short8 C = cast(short8) _mm_sub_epi16(A, B);
4757     short[8] correct =        [ 1,     -1,-5,-6, -997, 3, 1, 0];
4758     assert(C.array == correct);
4759 }
4760 
4761 /// Subtract packed 32-bit integers in `b` from packed 32-bit integers in `a`.
4762 __m128i _mm_sub_epi32(__m128i a, __m128i b) pure @safe
4763 {
4764     pragma(inline, true);
4765     return cast(__m128i)(cast(int4)a - cast(int4)b);
4766 }
4767 unittest
4768 {
4769     __m128i A = _mm_setr_epi32(16, int.max, 1, 8);
4770     __m128i B = _mm_setr_epi32(15, int.min, 6, 2);
4771     int4 C = cast(int4) _mm_sub_epi32(A, B);
4772     int[4] correct =          [ 1,      -1,-5, 6];
4773     assert(C.array == correct);
4774 }
4775 
4776 /// Subtract packed 64-bit integers in `b` from packed 64-bit integers in `a`.
4777 __m128i _mm_sub_epi64(__m128i a, __m128i b) pure @safe
4778 {
4779     pragma(inline, true);
4780     return cast(__m128i)(cast(long2)a - cast(long2)b);
4781 }
4782 unittest
4783 {
4784     __m128i A = _mm_setr_epi64(  16, long.max);
4785     __m128i B = _mm_setr_epi64( 199, long.min);
4786     long2 C = cast(long2) _mm_sub_epi64(A, B);
4787     long[2] correct =         [-183,       -1];
4788     assert(C.array == correct);
4789 }
4790 
4791 /// Subtract packed 8-bit integers in `b` from packed 8-bit integers in `a`.
4792 __m128i _mm_sub_epi8(__m128i a, __m128i b) pure @safe
4793 {
4794     pragma(inline, true);
4795     return cast(__m128i)(cast(byte16)a - cast(byte16)b);
4796 }
4797 unittest
4798 {
4799     __m128i A = _mm_setr_epi8(16,  127, 1, 2, 3, 4, 6, 6, 16,  127, 1, 2, 3, 4, 6, 6);
4800     __m128i B = _mm_setr_epi8(15, -128, 6, 8, 3, 1, 5, 6, 16,  127, 1, 2, 3, 4, 6, 6);
4801     byte16 C = cast(byte16) _mm_sub_epi8(A, B);
4802     byte[16] correct =       [ 1,   -1,-5,-6, 0, 3, 1, 0,  0,    0, 0, 0, 0, 0, 0, 0];
4803     assert(C.array == correct);
4804 }
4805 
4806 /// Subtract packed double-precision (64-bit) floating-point elements in `b` from packed double-precision (64-bit) 
4807 /// floating-point elements in `a`.
4808 __m128d _mm_sub_pd(__m128d a, __m128d b) pure @safe
4809 {
4810     pragma(inline, true);
4811     return a - b;
4812 }
4813 unittest
4814 {
4815     __m128d A = _mm_setr_pd(4000.0, -8.0);
4816     __m128d B = _mm_setr_pd(12.0, -8450.0);
4817     __m128d C = _mm_sub_pd(A, B);
4818     double[2] correct =     [3988.0, 8442.0];
4819     assert(C.array == correct);
4820 }
4821 
4822 /// Subtract the lower double-precision (64-bit) floating-point element in `b` from the lower double-precision (64-bit) 
4823 /// floating-point element in `a`, store that in the lower element of result, and copy the upper element from `a` to the
4824 /// upper element of result.
4825 __m128d _mm_sub_sd(__m128d a, __m128d b) pure @trusted
4826 {
4827     version(DigitalMars)
4828     {
4829         // Work-around for https://issues.dlang.org/show_bug.cgi?id=19599
4830         // Note that this is unneeded since DMD >= 2.094.0 at least, haven't investigated again
4831         asm pure nothrow @nogc @trusted { nop;}
4832         a[0] = a[0] - b[0];
4833         return a;
4834     }
4835     else static if (GDC_with_SSE2)
4836     {
4837         return __builtin_ia32_subsd(a, b);
4838     }
4839     else
4840     {
4841         a.ptr[0] -= b.array[0];
4842         return a;
4843     }
4844 }
4845 unittest
4846 {
4847     __m128d a = [1.5, -2.0];
4848     a = _mm_sub_sd(a, a);
4849     assert(a.array == [0.0, -2.0]);
4850 }
4851 
4852 /// Subtract 64-bit integer `b` from 64-bit integer `a`.
4853 __m64 _mm_sub_si64 (__m64 a, __m64 b) pure @safe
4854 {
4855     pragma(inline, true);
4856     return a - b;
4857 }
4858 unittest
4859 {
4860     __m64 A, B;
4861     A = -1214;
4862     B = 489415;
4863     __m64 C = _mm_sub_si64(B, A);
4864     assert(C.array[0] == 489415 + 1214);
4865 }
4866 
/// Subtract packed signed 16-bit integers in `b` from packed signed 16-bit integers in `a`
/// using saturation.
4869 __m128i _mm_subs_epi16(__m128i a, __m128i b) pure @trusted
4870 {
4871     // PERF DMD psubsw
4872     version(LDC)
4873     {
4874         return cast(__m128i) inteli_llvm_subs!short8(cast(short8)a, cast(short8)b);
4875     }
4876     else static if (GDC_with_SSE2)
4877     {
4878         return cast(__m128i) __builtin_ia32_psubsw128(cast(short8) a, cast(short8) b);
4879     }
4880     else
4881     {
4882         short[8] res; // PERF =void;
4883         short8 sa = cast(short8)a;
4884         short8 sb = cast(short8)b;
4885         foreach(i; 0..8)
4886             res.ptr[i] = saturateSignedIntToSignedShort(sa.array[i] - sb.array[i]);
4887         return _mm_loadu_si128(cast(int4*)res.ptr);
4888     }
4889 }
4890 unittest
4891 {
4892     short8 res = cast(short8) _mm_subs_epi16(_mm_setr_epi16(32760, -32760, 5, 4, 3, 2, 1, 0),
4893                                              _mm_setr_epi16(-10  ,     16, 5, 4, 3, 2, 1, 0));
4894     static immutable short[8] correctResult =              [32767, -32768, 0, 0, 0, 0, 0, 0];
4895     assert(res.array == correctResult);
4896 }
4897 
4898 /// Subtract packed signed 8-bit integers in `b` from packed 8-bit integers in `a` using
4899 /// saturation.
4900 __m128i _mm_subs_epi8(__m128i a, __m128i b) pure @trusted
4901 {
4902     version(LDC)
4903     {
4904         return cast(__m128i) inteli_llvm_subs!byte16(cast(byte16)a, cast(byte16)b);
4905     }
4906     else static if (GDC_with_SSE2)
4907     {
4908         return cast(__m128i) __builtin_ia32_psubsb128(cast(ubyte16) a, cast(ubyte16) b);
4909     }
4910     else
4911     {
4912         byte[16] res; // PERF =void;
4913         byte16 sa = cast(byte16)a;
4914         byte16 sb = cast(byte16)b;
4915         foreach(i; 0..16)
4916             res[i] = saturateSignedWordToSignedByte(sa.array[i] - sb.array[i]);
4917         return _mm_loadu_si128(cast(int4*)res.ptr);
4918     }
4919 }
4920 unittest
4921 {
4922     byte16 res = cast(byte16) _mm_subs_epi8(_mm_setr_epi8(-128, 127, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0),
4923                                             _mm_setr_epi8(  15, -14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0));
4924     static immutable byte[16] correctResult            = [-128, 127,  0,  0,  0,  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0];
4925     assert(res.array == correctResult);
4926 }
4927 
/// Subtract packed unsigned 16-bit integers in `b` from packed unsigned 16-bit integers in `a` using unsigned saturation.
4929 __m128i _mm_subs_epu16(__m128i a, __m128i b) pure @trusted
4930 {
4931     version(LDC)
4932     {
4933         return cast(__m128i) inteli_llvm_subus!short8(cast(short8)a, cast(short8)b);
4934     }
4935     else static if (GDC_with_SSE2)
4936     {
4937         return cast(__m128i) __builtin_ia32_psubusw128(cast(short8)a, cast(short8)b);
4938     }
4939     else
4940     {
4941         short[8] res; // PERF =void;
4942         short8 sa = cast(short8)a;
4943         short8 sb = cast(short8)b;
4944         foreach(i; 0..8)
4945         {
            int diff = cast(ushort)(sa.array[i]) - cast(ushort)(sb.array[i]);
            res[i] = saturateSignedIntToUnsignedShort(diff);
4948         }
4949         return _mm_loadu_si128(cast(int4*)res.ptr);
4950     }
4951 }
4952 unittest
4953 {
4954     short8 R = cast(short8) _mm_subs_epu16(_mm_setr_epi16(cast(short)65534,  1, 5, 4, 3, 2, 1, 0),
4955                                            _mm_setr_epi16(cast(short)65535, 16, 4, 4, 3, 0, 1, 0));
4956     static immutable short[8] correct =                  [               0,  0, 1, 0, 0, 2, 0, 0];
4957     assert(R.array == correct);
4958 }
4959 
/// Subtract packed unsigned 8-bit integers in `b` from packed unsigned 8-bit integers in `a` using unsigned saturation.
4961 __m128i _mm_subs_epu8(__m128i a, __m128i b) pure @trusted
4962 {
4963     version(LDC)
4964     {
4965         return cast(__m128i) inteli_llvm_subus!byte16(cast(byte16)a, cast(byte16)b);
4966     }
4967     else static if (GDC_with_SSE2)
4968     {
4969         return cast(__m128i) __builtin_ia32_psubusb128(cast(ubyte16) a, cast(ubyte16) b);
4970     }
4971     else
4972     {
4973         ubyte[16] res; // PERF =void;
4974         byte16 sa = cast(byte16)a;
4975         byte16 sb = cast(byte16)b;
4976         foreach(i; 0..16)
4977             res[i] = saturateSignedWordToUnsignedByte(cast(ubyte)(sa.array[i]) - cast(ubyte)(sb.array[i]));
4978         return _mm_loadu_si128(cast(int4*)res.ptr);
4979     }
4980 }
4981 unittest
4982 {
4983     byte16 res = cast(byte16) _mm_subs_epu8(_mm_setr_epi8(cast(byte)254, 127, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0),
4984                                             _mm_setr_epi8(cast(byte)255, 120, 14, 42, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0));
4985     static immutable byte[16] correctResult =            [            0,   7,  0,  0,  0,  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0];
4986     assert(res.array == correctResult);
4987 }
4988 
// Note: the only difference between the `_mm_comi*_sd` and `_mm_ucomi*_sd` intrinsics is their
//       signalling behaviour on quiet NaNs, so these aliases are not strictly correct.
//       However, wanting to distinguish qNaN from sNaN and treat them differently on purpose
//       seems extremely rare, hence the shortcut.
4993 alias _mm_ucomieq_sd = _mm_comieq_sd; ///
4994 alias _mm_ucomige_sd = _mm_comige_sd; ///
4995 alias _mm_ucomigt_sd = _mm_comigt_sd; ///
4996 alias _mm_ucomile_sd = _mm_comile_sd; ///
4997 alias _mm_ucomilt_sd = _mm_comilt_sd; ///
4998 alias _mm_ucomineq_sd = _mm_comineq_sd; ///
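
// Since they are aliases, the `ucomi` comparisons necessarily behave exactly like their
// `comi` counterparts for ordinary (non-NaN) operands; a small illustrative check.
unittest
{
    __m128d A = _mm_set_sd(2.0);
    __m128d B = _mm_set_sd(3.0);
    assert(_mm_ucomieq_sd(A, A) == 1);
    assert(_mm_ucomilt_sd(A, B) == 1);
    assert(_mm_ucomige_sd(A, B) == 0);
    assert(_mm_ucomineq_sd(A, B) == 1);
}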
4999 
5000 /// Return vector of type `__m128d` with undefined elements.
5001 __m128d _mm_undefined_pd() pure @safe
5002 {
5003     pragma(inline, true);
5004     __m128d result = void;
5005     return result;
5006 }
5007 
5008 /// Return vector of type `__m128i` with undefined elements.
5009 __m128i _mm_undefined_si128() pure @safe
5010 {
5011     pragma(inline, true);
5012     __m128i result = void;
5013     return result;
5014 }
5015 
5016 /// Unpack and interleave 16-bit integers from the high half of `a` and `b`.
5017 __m128i _mm_unpackhi_epi16 (__m128i a, __m128i b) pure @trusted
5018 {
5019     // PERF DMD D_SIMD
5020     static if (GDC_with_SSE2)
5021     {
5022         return cast(__m128i) __builtin_ia32_punpckhwd128(cast(short8) a, cast(short8) b);
5023     }
5024     else version(LDC)
5025     {
5026         return cast(__m128i) shufflevectorLDC!(short8, 4, 12, 5, 13, 6, 14, 7, 15)
5027                                               (cast(short8)a, cast(short8)b);
5028     }
5029     else static if (DMD_with_32bit_asm)
5030     {
5031         asm pure nothrow @nogc @trusted
5032         {
5033             movdqu XMM0, a;
5034             movdqu XMM1, b;
5035             punpckhwd XMM0, XMM1;
5036             movdqu a, XMM0;
5037         }
5038         return a;
5039     }   
5040     else
5041     {
5042         short8 r = void;
5043         short8 sa = cast(short8)a;
5044         short8 sb = cast(short8)b;
5045         r.ptr[0] = sa.array[4];
5046         r.ptr[1] = sb.array[4];
5047         r.ptr[2] = sa.array[5];
5048         r.ptr[3] = sb.array[5];
5049         r.ptr[4] = sa.array[6];
5050         r.ptr[5] = sb.array[6];
5051         r.ptr[6] = sa.array[7];
5052         r.ptr[7] = sb.array[7];
5053         return cast(__m128i)r;
5054     }
5055 }
5056 unittest
5057 {
5058     __m128i A = _mm_setr_epi16(4,   5,  6,  7,  8,  9, 10, 11);
5059     __m128i B = _mm_setr_epi16(12, 13, 14, 15, 16, 17, 18, 19);
5060     short8 C = cast(short8)(_mm_unpackhi_epi16(A, B));
5061     short[8] correct = [8, 16, 9, 17, 10, 18, 11, 19];
5062     assert(C.array == correct);
5063 }
5064 
5065 /// Unpack and interleave 32-bit integers from the high half of `a` and `b`.
5066 __m128i _mm_unpackhi_epi32 (__m128i a, __m128i b) pure @trusted
5067 {
5068     static if (GDC_with_SSE2)
5069     {
5070         return __builtin_ia32_punpckhdq128(a, b);
5071     }
5072     else version(LDC)
5073     {
5074         return shufflevectorLDC!(int4, 2, 6, 3, 7)(cast(int4)a, cast(int4)b);
5075     }
5076     else
5077     {
5078         __m128i r = void;
5079         r.ptr[0] = a.array[2];
5080         r.ptr[1] = b.array[2];
5081         r.ptr[2] = a.array[3];
5082         r.ptr[3] = b.array[3];
5083         return r;
5084     }
5085 }
5086 unittest
5087 {
5088     __m128i A = _mm_setr_epi32(1, 2, 3, 4);
5089     __m128i B = _mm_setr_epi32(5, 6, 7, 8);
5090     __m128i C = _mm_unpackhi_epi32(A, B);
5091     int[4] correct = [3, 7, 4, 8];
5092     assert(C.array == correct);
5093 }
5094 
5095 /// Unpack and interleave 64-bit integers from the high half of `a` and `b`.
5096 __m128i _mm_unpackhi_epi64 (__m128i a, __m128i b) pure @trusted
5097 {
5098     static if (GDC_with_SSE2)
5099     {
5100         return cast(__m128i) __builtin_ia32_punpckhqdq128(cast(long2) a, cast(long2) b);
5101     }
5102     else
5103     {
5104         __m128i r = cast(__m128i)b;
5105         r[0] = a[2];
5106         r[1] = a[3];
5107         return r; 
5108     }
5109 }
5110 unittest // Issue #36
5111 {
5112     __m128i A = _mm_setr_epi64(0x22222222_22222222, 0x33333333_33333333);
5113     __m128i B = _mm_setr_epi64(0x44444444_44444444, 0x55555555_55555555);
5114     long2 C = cast(long2)(_mm_unpackhi_epi64(A, B));
5115     long[2] correct = [0x33333333_33333333, 0x55555555_55555555];
5116     assert(C.array == correct);
5117 }
5118 
5119 /// Unpack and interleave 8-bit integers from the high half of `a` and `b`.
5120 __m128i _mm_unpackhi_epi8 (__m128i a, __m128i b) pure @trusted
5121 {
5122     // PERF DMD D_SIMD
5123     static if (GDC_with_SSE2)
5124     {
5125         return cast(__m128i) __builtin_ia32_punpckhbw128(cast(ubyte16)a, cast(ubyte16)b);
5126     }
5127     else static if (DMD_with_32bit_asm)
5128     {
5129         asm pure nothrow @nogc @trusted
5130         {
5131             movdqu XMM0, a;
5132             movdqu XMM1, b;
5133             punpckhbw XMM0, XMM1;
5134             movdqu a, XMM0;
5135         }
5136         return a;
5137     }
5138     else version(LDC)
5139     {
5140         return cast(__m128i)shufflevectorLDC!(byte16, 8,  24,  9, 25, 10, 26, 11, 27,
5141                                                       12, 28, 13, 29, 14, 30, 15, 31)
5142             (cast(byte16)a, cast(byte16)b);
5143     }
5144     else
5145     {
5146         byte16 r = void;
5147         byte16 ba = cast(byte16)a;
5148         byte16 bb = cast(byte16)b;
5149         r.ptr[0] = ba.array[8];
5150         r.ptr[1] = bb.array[8];
5151         r.ptr[2] = ba.array[9];
5152         r.ptr[3] = bb.array[9];
5153         r.ptr[4] = ba.array[10];
5154         r.ptr[5] = bb.array[10];
5155         r.ptr[6] = ba.array[11];
5156         r.ptr[7] = bb.array[11];
5157         r.ptr[8] = ba.array[12];
5158         r.ptr[9] = bb.array[12];
5159         r.ptr[10] = ba.array[13];
5160         r.ptr[11] = bb.array[13];
5161         r.ptr[12] = ba.array[14];
5162         r.ptr[13] = bb.array[14];
5163         r.ptr[14] = ba.array[15];
5164         r.ptr[15] = bb.array[15];
5165         return cast(__m128i)r;
5166     }
5167 }
5168 unittest
5169 {
5170     __m128i A = _mm_setr_epi8( 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15);
5171     __m128i B = _mm_setr_epi8(16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31);
5172     byte16 C = cast(byte16) _mm_unpackhi_epi8(A, B);
5173     byte[16] correct = [8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31];
5174     assert(C.array == correct);
5175 }
5176 
5177 /// Unpack and interleave double-precision (64-bit) floating-point elements from the high half of `a` and `b`.
5178 __m128d _mm_unpackhi_pd (__m128d a, __m128d b) pure @trusted
5179 {
5180     // PERF DMD D_SIMD
5181     static if (GDC_with_SSE2)
5182     {
5183         return __builtin_ia32_unpckhpd(a, b);
5184     }
5185     else version(LDC)
5186     {
5187         return shufflevectorLDC!(__m128d, 1, 3)(a, b);
5188     }
5189     else
5190     {
5191         double2 r = void;
5192         r.ptr[0] = a.array[1];
5193         r.ptr[1] = b.array[1];
5194         return r;
5195     }
5196 }
5197 unittest
5198 {
5199     __m128d A = _mm_setr_pd(4.0, 6.0);
5200     __m128d B = _mm_setr_pd(7.0, 9.0);
5201     __m128d C = _mm_unpackhi_pd(A, B);
5202     double[2] correct = [6.0, 9.0];
5203     assert(C.array == correct);
5204 }
5205 
5206 /// Unpack and interleave 16-bit integers from the low half of `a` and `b`.
5207 __m128i _mm_unpacklo_epi16 (__m128i a, __m128i b) pure @trusted
5208 {
5209     // PERF DMD SIMD
5210     static if (GDC_with_SSE2)
5211     {
5212         return cast(__m128i) __builtin_ia32_punpcklwd128(cast(short8) a, cast(short8) b);
5213     }
5214     else version(LDC)
5215     {
5216         return cast(__m128i) shufflevectorLDC!(short8, 0, 8, 1, 9, 2, 10, 3, 11)(cast(short8)a, cast(short8)b);
5217     }
5218     else static if (DMD_with_32bit_asm)
5219     {
5220         asm pure nothrow @nogc @trusted
5221         {
5222             movdqu XMM0, a;
5223             movdqu XMM1, b;
5224             punpcklwd XMM0, XMM1;
5225             movdqu a, XMM0;
5226         }
5227         return a;
5228     }
5229     else
5230     {
5231         short8 r = void;
5232         short8 sa = cast(short8)a;
5233         short8 sb = cast(short8)b;
5234         r.ptr[0] = sa.array[0];
5235         r.ptr[1] = sb.array[0];
5236         r.ptr[2] = sa.array[1];
5237         r.ptr[3] = sb.array[1];
5238         r.ptr[4] = sa.array[2];
5239         r.ptr[5] = sb.array[2];
5240         r.ptr[6] = sa.array[3];
5241         r.ptr[7] = sb.array[3];
5242         return cast(__m128i)r;
5243     }
5244 }
5245 unittest
5246 {
5247     __m128i A = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
5248     __m128i B = _mm_setr_epi16(8, 9, 10, 11, 12, 13, 14, 15);
5249     short8 C = cast(short8) _mm_unpacklo_epi16(A, B);
5250     short[8] correct = [0, 8, 1, 9, 2, 10, 3, 11];
5251     assert(C.array == correct);
5252 }
5253 
5254 /// Unpack and interleave 32-bit integers from the low half of `a` and `b`.
5255 __m128i _mm_unpacklo_epi32 (__m128i a, __m128i b) pure @trusted
5256 {
5257     // PERF DMD
5258     static if (GDC_with_SSE2)
5259     {
5260         return __builtin_ia32_punpckldq128(a, b);
5261     }
5262     else version(LDC)
5263     {
5264         return shufflevectorLDC!(int4, 0, 4, 1, 5)(cast(int4)a, cast(int4)b);
5265     }
5266     else
5267     {
5268         __m128i r;
5269         r.ptr[0] = a.array[0];
5270         r.ptr[1] = b.array[0];
5271         r.ptr[2] = a.array[1];
5272         r.ptr[3] = b.array[1];
5273         return r;
5274     }
5275 }
5276 unittest
5277 {
5278     __m128i A = _mm_setr_epi32(1, 2, 3, 4);
5279     __m128i B = _mm_setr_epi32(5, 6, 7, 8);
5280     __m128i C = _mm_unpacklo_epi32(A, B);
5281     int[4] correct = [1, 5, 2, 6];
5282     assert(C.array == correct);
5283 }
5284 
5285 /// Unpack and interleave 64-bit integers from the low half of `a` and `b`.
5286 __m128i _mm_unpacklo_epi64 (__m128i a, __m128i b) pure @trusted
5287 {
5288     static if (GDC_with_SSE2)
5289     {
5290         return cast(__m128i) __builtin_ia32_punpcklqdq128(cast(long2) a, cast(long2) b);
5291     }
5292     else
5293     {
5294         long2 lA = cast(long2)a;
5295         long2 lB = cast(long2)b;
5296         long2 R; // PERF =void;
5297         R.ptr[0] = lA.array[0];
5298         R.ptr[1] = lB.array[0];
5299         return cast(__m128i)R;
5300     }
5301 }
5302 unittest // Issue #36
5303 {
5304     __m128i A = _mm_setr_epi64(0x22222222_22222222, 0x33333333_33333333);
5305     __m128i B = _mm_setr_epi64(0x44444444_44444444, 0x55555555_55555555);
5306     long2 C = cast(long2)(_mm_unpacklo_epi64(A, B));
5307     long[2] correct = [0x22222222_22222222, 0x44444444_44444444];
5308     assert(C.array == correct);
5309 }
5310 
5311 /// Unpack and interleave 8-bit integers from the low half of `a` and `b`.
5312 __m128i _mm_unpacklo_epi8 (__m128i a, __m128i b) pure @trusted
5313 {
5314     // PERF DMD D_SIMD
5315     static if (GDC_with_SSE2)
5316     {
5317         return cast(__m128i) __builtin_ia32_punpcklbw128(cast(ubyte16) a, cast(ubyte16) b);
5318     }
5319     else static if (DMD_with_32bit_asm)
5320     {
5321         asm pure nothrow @nogc @trusted
5322         {
5323             movdqu XMM0, a;
5324             movdqu XMM1, b;
5325             punpcklbw XMM0, XMM1;
5326             movdqu a, XMM0;
5327         }
5328         return a;
5329     }
5330     else version(LDC)
5331     {
5332         return cast(__m128i) shufflevectorLDC!(byte16, 0, 16, 1, 17, 2, 18, 3, 19,
5333                                                        4, 20, 5, 21, 6, 22, 7, 23)
5334                                                        (cast(byte16)a, cast(byte16)b); 
5335     }
5336     else
5337     {
5338         byte16 r = void;
5339         byte16 ba = cast(byte16)a;
5340         byte16 bb = cast(byte16)b;
5341         r.ptr[0] = ba.array[0];
5342         r.ptr[1] = bb.array[0];
5343         r.ptr[2] = ba.array[1];
5344         r.ptr[3] = bb.array[1];
5345         r.ptr[4] = ba.array[2];
5346         r.ptr[5] = bb.array[2];
5347         r.ptr[6] = ba.array[3];
5348         r.ptr[7] = bb.array[3];
5349         r.ptr[8] = ba.array[4];
5350         r.ptr[9] = bb.array[4];
5351         r.ptr[10] = ba.array[5];
5352         r.ptr[11] = bb.array[5];
5353         r.ptr[12] = ba.array[6];
5354         r.ptr[13] = bb.array[6];
5355         r.ptr[14] = ba.array[7];
5356         r.ptr[15] = bb.array[7];
5357         return cast(__m128i)r;
5358     }
5359 }
5360 unittest
5361 {
5362     __m128i A = _mm_setr_epi8( 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15);
5363     __m128i B = _mm_setr_epi8(16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31);
5364     byte16 C = cast(byte16) _mm_unpacklo_epi8(A, B);
5365     byte[16] correct = [0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23];
5366     assert(C.array == correct);
5367 }
5368 
5369 /// Unpack and interleave double-precision (64-bit) floating-point elements from the low half of `a` and `b`.
5370 __m128d _mm_unpacklo_pd (__m128d a, __m128d b) pure @trusted
5371 {
5372     // PERF DMD D_SIMD
5373     static if (GDC_with_SSE2)
5374     {
5375         return __builtin_ia32_unpcklpd(a, b);
5376     }
5377     else version(LDC)
5378     {
5379         return shufflevectorLDC!(__m128d, 0, 2)(a, b);
5380     }
5381     else
5382     {
5383         double2 r = void;
5384         r.ptr[0] = a.array[0];
5385         r.ptr[1] = b.array[0];
5386         return r;
5387     }
5388 }
5389 unittest
5390 {
5391     __m128d A = _mm_setr_pd(4.0, 6.0);
5392     __m128d B = _mm_setr_pd(7.0, 9.0);
5393     __m128d C = _mm_unpacklo_pd(A, B);
5394     double[2] correct = [4.0, 7.0];
5395     assert(C.array == correct);
5396 }
5397 
5398 /// Compute the bitwise XOR of packed double-precision (64-bit) floating-point elements in `a` and `b`.
5399 __m128d _mm_xor_pd (__m128d a, __m128d b) pure @safe
5400 {
5401     return cast(__m128d)(cast(__m128i)a ^ cast(__m128i)b);
5402 }
5403 unittest
5404 {
5405     __m128d A = _mm_setr_pd(-4.0, 6.0);
5406     __m128d B = _mm_setr_pd(4.0, -6.0);
5407     long2 R = cast(long2) _mm_xor_pd(A, B);
5408     long[2] correct = [long.min, long.min];
5409     assert(R.array == correct);
5410 }
5411 
5412 /// Compute the bitwise XOR of 128 bits (representing integer data) in `a` and `b`.
5413 __m128i _mm_xor_si128 (__m128i a, __m128i b) pure @safe
5414 {
5415     return a ^ b;
5416 }
5417 unittest
5418 {
5419     __m128i A = _mm_setr_epi64(975394, 619809709);
5420     __m128i B = _mm_setr_epi64(-920275025, -6);
5421     long2 R = cast(long2) _mm_xor_si128(A, B);
5422     long[2] correct = [975394 ^ (-920275025L), 619809709L ^ -6];
5423     assert(R.array == correct);
5424 }
5425 
5426 unittest
5427 {
    // Euclidean distance between two 4D points: subtract, square, horizontally add
    // the four lanes, then take the square root of the scalar result in lane 0.
    float distance(float[4] a, float[4] b) nothrow @nogc
    {
        __m128 va = _mm_loadu_ps(a.ptr);
        __m128 vb = _mm_loadu_ps(b.ptr);
        __m128 diffSquared = _mm_sub_ps(va, vb);
        diffSquared = _mm_mul_ps(diffSquared, diffSquared);
        __m128 sum = _mm_add_ps(diffSquared, _mm_srli_ps!8(diffSquared)); // add upper pair onto lower pair
        sum = _mm_add_ps(sum, _mm_srli_ps!4(sum));                        // fold in the remaining lane
        return _mm_cvtss_f32(_mm_sqrt_ss(sum));
    }
5438     assert(distance([0, 2, 0, 0], [0, 0, 0, 0]) == 2);
5439 }