1 /**
2 * SSE2 intrinsics. 
3 * https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#techs=SSE2
4 *
5 * Copyright: Copyright Guillaume Piolat 2016-2020, Stefanos Baziotis 2019.
6 * License:   $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
7 */
8 module inteli.emmintrin;
9 
10 public import inteli.types;
11 public import inteli.xmmintrin; // SSE2 includes SSE1
12 import inteli.mmx;
13 import inteli.internals;
14 
15 nothrow @nogc:
16 
17 
18 // SSE2 instructions
19 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#techs=SSE2
20 
21 /// Add packed 16-bit integers in `a` and `b`.
22 __m128i _mm_add_epi16 (__m128i a, __m128i b) pure @safe
23 {
24     pragma(inline, true);
25     return cast(__m128i)(cast(short8)a + cast(short8)b);
26 }
27 unittest
28 {
29     __m128i A = _mm_setr_epi16(4, 8, 13, -7, -1, 0, 9, 77);
30     short8 R = cast(short8) _mm_add_epi16(A, A);
31     short[8] correct = [8, 16, 26, -14, -2, 0, 18, 154];
32     assert(R.array == correct);
33 }
34 
35 /// Add packed 32-bit integers in `a` and `b`.
36 __m128i _mm_add_epi32 (__m128i a, __m128i b) pure @safe
37 {
38     pragma(inline, true);
39     return cast(__m128i)(cast(int4)a + cast(int4)b);
40 }
41 unittest
42 {
43     __m128i A = _mm_setr_epi32( -7, -1, 0, 9);
44     int4 R = _mm_add_epi32(A, A);
45     int[4] correct = [ -14, -2, 0, 18 ];
46     assert(R.array == correct);
47 }
48 
49 /// Add packed 64-bit integers in `a` and `b`.
50 __m128i _mm_add_epi64 (__m128i a, __m128i b) pure @safe
51 {
52     pragma(inline, true);
53     return cast(__m128i)(cast(long2)a + cast(long2)b);
54 }
55 unittest
56 {
57     __m128i A = _mm_setr_epi64(-1, 0x8000_0000_0000_0000);
58     long2 R = cast(long2) _mm_add_epi64(A, A);
59     long[2] correct = [ -2, 0 ];
60     assert(R.array == correct);
61 }
62 
63 /// Add packed 8-bit integers in `a` and `b`.
64 __m128i _mm_add_epi8 (__m128i a, __m128i b) pure @safe
65 {
66     pragma(inline, true);
67     return cast(__m128i)(cast(byte16)a + cast(byte16)b);
68 }
69 unittest
70 {
71     __m128i A = _mm_setr_epi8(4, 8, 13, -7, -1, 0, 9, 77, 4, 8, 13, -7, -1, 0, 9, 78);
72     byte16 R = cast(byte16) _mm_add_epi8(A, A);
73     byte[16] correct = [8, 16, 26, -14, -2, 0, 18, -102, 8, 16, 26, -14, -2, 0, 18, -100];
74     assert(R.array == correct);
75 }
76 
/// Add the lower double-precision (64-bit) floating-point element 
/// in `a` and `b`, store the result in the lower element of result, 
/// and copy the upper element from `a` to the upper element of result.
80 __m128d _mm_add_sd(__m128d a, __m128d b) pure @safe
81 {
82     static if (GDC_with_SSE2)
83     {
84         return __builtin_ia32_addsd(a, b);
85     }
86     else version(DigitalMars)
87     {
88         // Work-around for https://issues.dlang.org/show_bug.cgi?id=19599
        // Note: this workaround seems unneeded since DMD >= 2.094.0; it hasn't been re-investigated since.
90         asm pure nothrow @nogc @trusted { nop;}
91         a[0] = a[0] + b[0];
92         return a;
93     }
94     else
95     {
96         a[0] += b[0];
97         return a;
98     }
99 }
100 unittest
101 {
102     __m128d a = [1.5, -2.0];
103     a = _mm_add_sd(a, a);
104     assert(a.array == [3.0, -2.0]);
105 }
106 
107 /// Add packed double-precision (64-bit) floating-point elements in `a` and `b`.
108 __m128d _mm_add_pd (__m128d a, __m128d b) pure @safe
109 {
110     pragma(inline, true);
111     return a + b;
112 }
113 unittest
114 {
115     __m128d a = [1.5, -2.0];
116     a = _mm_add_pd(a, a);
117     assert(a.array == [3.0, -4.0]);
118 }
119 
120 /// Add 64-bit integers `a` and `b`.
121 __m64 _mm_add_si64 (__m64 a, __m64 b) pure @safe
122 {
123     pragma(inline, true);
124     return a + b;
125 }
126 
127 /// Add packed 16-bit integers in `a` and `b` using signed saturation.
128 __m128i _mm_adds_epi16(__m128i a, __m128i b) pure @trusted
129 {
130     static if (GDC_with_SSE2)
131     {
132         return cast(__m128i)__builtin_ia32_paddsw128(cast(short8)a, cast(short8)b);
133     }
134     else version(LDC)
135     {
136         static if (__VERSION__ >= 2085) // saturation x86 intrinsics disappeared in LLVM 8
137         {
138             // x86: Generates PADDSW since LDC 1.15 -O0
139             // ARM: Generates sqadd.8h since LDC 1.21 -O1, really bad in <= 1.20            
140             enum prefix = `declare <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> %a, <8 x i16> %b)`;
141             enum ir = `
142                 %r = call <8 x i16> @llvm.sadd.sat.v8i16( <8 x i16> %0, <8 x i16> %1)
143                 ret <8 x i16> %r`;
144             return cast(__m128i) LDCInlineIREx!(prefix, ir, "", short8, short8, short8)(cast(short8)a, cast(short8)b);
145         }
        else static if (LDC_with_ARM) // Raspberry Pi ships with LDC 1.12, which lacks saturation intrinsics
147         {
148             short[8] res;
149             short8 sa = cast(short8)a;
150             short8 sb = cast(short8)b;
151             foreach(i; 0..8)
152                 res[i] = saturateSignedIntToSignedShort(sa.array[i] + sb.array[i]);
153             return _mm_loadu_si128(cast(int4*)res.ptr);
154         }
155         else
156             return cast(__m128i) __builtin_ia32_paddsw128(cast(short8)a, cast(short8)b);
157     }
158     else
159     {
160         short[8] res;
161         short8 sa = cast(short8)a;
162         short8 sb = cast(short8)b;
163         foreach(i; 0..8)
164             res[i] = saturateSignedIntToSignedShort(sa.array[i] + sb.array[i]);
165         return _mm_loadu_si128(cast(int4*)res.ptr);
166     }
167 }
168 unittest
169 {
170     short8 res = cast(short8) _mm_adds_epi16(_mm_set_epi16(7, 6, 5, 4, 3, 2, 1, 0),
171                                              _mm_set_epi16(7, 6, 5, 4, 3, 2, 1, 0));
172     static immutable short[8] correctResult = [0, 2, 4, 6, 8, 10, 12, 14];
173     assert(res.array == correctResult);
174 }
175 
176 /// Add packed 8-bit signed integers in `a` and `b` using signed saturation.
177 __m128i _mm_adds_epi8(__m128i a, __m128i b) pure @trusted
178 {
179     static if (GDC_with_SSE2)
180     {
181         return cast(__m128i) __builtin_ia32_paddsb128(cast(ubyte16)a, cast(ubyte16)b);
182     }
183     else version(LDC)
184     {
185         static if (__VERSION__ >= 2085) // saturation x86 intrinsics disappeared in LLVM 8
186         {
187             // x86: Generates PADDSB since LDC 1.15 -O0
188             // ARM: Generates sqadd.16b since LDC 1.21 -O1, really bad in <= 1.20
189             enum prefix = `declare <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8> %a, <16 x i8> %b)`;
190             enum ir = `
191                 %r = call <16 x i8> @llvm.sadd.sat.v16i8( <16 x i8> %0, <16 x i8> %1)
192                 ret <16 x i8> %r`;
193             return cast(__m128i) LDCInlineIREx!(prefix, ir, "", byte16, byte16, byte16)(cast(byte16)a, cast(byte16)b);
194         }
        else static if (LDC_with_ARM) // Raspberry Pi ships with LDC 1.12, which lacks saturation intrinsics
196         {
197             byte[16] res;
198             byte16 sa = cast(byte16)a;
199             byte16 sb = cast(byte16)b;
200             foreach(i; 0..16)
                res[i] = saturateSignedWordToSignedByte(sa.array[i] + sb.array[i]);
202             return _mm_loadu_si128(cast(int4*)res.ptr);
203         }
204         else
205             return cast(__m128i) __builtin_ia32_paddsb128(cast(byte16)a, cast(byte16)b);
206     }
207     else
208     {
209         byte[16] res;
210         byte16 sa = cast(byte16)a;
211         byte16 sb = cast(byte16)b;
212         foreach(i; 0..16)
            res[i] = saturateSignedWordToSignedByte(sa.array[i] + sb.array[i]);
214         return _mm_loadu_si128(cast(int4*)res.ptr);
215     }
216 }
217 unittest
218 {
219     byte16 res = cast(byte16) _mm_adds_epi8(_mm_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0),
220                                             _mm_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0));
221     static immutable byte[16] correctResult = [0, 2, 4, 6, 8, 10, 12, 14,
222                                                16, 18, 20, 22, 24, 26, 28, 30];
223     assert(res.array == correctResult);
224 }
225 
226 /// Add packed 8-bit unsigned integers in `a` and `b` using unsigned saturation.
227 // PERF: #GDC version?
228 __m128i _mm_adds_epu8(__m128i a, __m128i b) pure @trusted
229 {
230     version(LDC)
231     {
232         static if (__VERSION__ >= 2085) // saturation x86 intrinsics disappeared in LLVM 8
233         {
234             // x86: Generates PADDUSB since LDC 1.15 -O0
235             // ARM: Generates uqadd.16b since LDC 1.21 -O1
236             enum prefix = `declare <16 x i8> @llvm.uadd.sat.v16i8(<16 x i8> %a, <16 x i8> %b)`;
237             enum ir = `
238                 %r = call <16 x i8> @llvm.uadd.sat.v16i8( <16 x i8> %0, <16 x i8> %1)
239                 ret <16 x i8> %r`;
240             return cast(__m128i) LDCInlineIREx!(prefix, ir, "", byte16, byte16, byte16)(cast(byte16)a, cast(byte16)b);
241         }
        else static if (LDC_with_ARM) // Raspberry Pi ships with LDC 1.12, which lacks saturation intrinsics
243         {
244             ubyte[16] res;
245             byte16 sa = cast(byte16)a;
246             byte16 sb = cast(byte16)b;
247             foreach(i; 0..16)
248                 res[i] = saturateSignedWordToUnsignedByte(cast(ubyte)(sa.array[i]) + cast(ubyte)(sb.array[i]));
249             return _mm_loadu_si128(cast(int4*)res.ptr);
250         }
251         else
252             return __builtin_ia32_paddusb128(a, b);
253     }
254     else
255     {
256         ubyte[16] res;
257         byte16 sa = cast(byte16)a;
258         byte16 sb = cast(byte16)b;
259         foreach(i; 0..16)
260             res[i] = saturateSignedWordToUnsignedByte(cast(ubyte)(sa.array[i]) + cast(ubyte)(sb.array[i]));
261         return _mm_loadu_si128(cast(int4*)res.ptr);
262     }
263 }
264 unittest
265 {
266     byte16 res = cast(byte16) 
267         _mm_adds_epu8(_mm_set_epi8(7, 6, 5, 4, 3, 2, cast(byte)255, 0, 7, 6, 5, 4, 3, 2, cast(byte)255, 0),
268                       _mm_set_epi8(7, 6, 5, 4, 3, 2, 1, 0, 7, 6, 5, 4, 3, 2, 1, 0));
269     static immutable byte[16] correctResult = [0, cast(byte)255, 4, 6, 8, 10, 12, 14, 
270                                                0, cast(byte)255, 4, 6, 8, 10, 12, 14];
271     assert(res.array == correctResult);
272 }
273 
274 /// Add packed unsigned 16-bit integers in `a` and `b` using unsigned saturation.
275 // PERF: #GDC version?
276 __m128i _mm_adds_epu16(__m128i a, __m128i b) pure @trusted
277 {
278     version(LDC)
279     {
280         static if (__VERSION__ >= 2085) // saturation x86 intrinsics disappeared in LLVM 8
281         {
282             // x86: Generates PADDUSW since LDC 1.15 -O0
283             // ARM: Generates uqadd.8h since LDC 1.21 -O1
284             enum prefix = `declare <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> %a, <8 x i16> %b)`;
285             enum ir = `
286                 %r = call <8 x i16> @llvm.uadd.sat.v8i16( <8 x i16> %0, <8 x i16> %1)
287                 ret <8 x i16> %r`;
288             return cast(__m128i) LDCInlineIREx!(prefix, ir, "", short8, short8, short8)(cast(short8)a, cast(short8)b);
289         }
        else static if (LDC_with_ARM) // Raspberry Pi ships with LDC 1.12, which lacks saturation intrinsics
291         {
292             ushort[8] res;
293             short8 sa = cast(short8)a;
294             short8 sb = cast(short8)b;
295             foreach(i; 0..8)
296                 res[i] = saturateSignedIntToUnsignedShort(cast(ushort)(sa.array[i]) + cast(ushort)(sb.array[i]));
297             return _mm_loadu_si128(cast(int4*)res.ptr);
298         }
299         else
300             return __builtin_ia32_paddusw128(a, b);
301     }
302     else
303     {
304         ushort[8] res;
305         short8 sa = cast(short8)a;
306         short8 sb = cast(short8)b;
307         foreach(i; 0..8)
308             res[i] = saturateSignedIntToUnsignedShort(cast(ushort)(sa.array[i]) + cast(ushort)(sb.array[i]));
309         return _mm_loadu_si128(cast(int4*)res.ptr);
310     }
311 }
312 unittest
313 {
314     short8 res = cast(short8) _mm_adds_epu16(_mm_set_epi16(3, 2, cast(short)65535, 0, 3, 2, cast(short)65535, 0),
315                                              _mm_set_epi16(3, 2, 1, 0, 3, 2, 1, 0));
316     static immutable short[8] correctResult = [0, cast(short)65535, 4, 6, 0, cast(short)65535, 4, 6];
317     assert(res.array == correctResult);
318 }
319 
320 /// Compute the bitwise AND of packed double-precision (64-bit) 
321 /// floating-point elements in `a` and `b`.
322 __m128d _mm_and_pd (__m128d a, __m128d b) pure @safe
323 {
324     pragma(inline, true);
325     return cast(__m128d)( cast(long2)a & cast(long2)b );
326 }
327 unittest
328 {
329     double a = 4.32;
330     double b = -78.99;
331     long correct = (*cast(long*)(&a)) & (*cast(long*)(&b));
332     __m128d A = _mm_set_pd(a, b);
333     __m128d B = _mm_set_pd(b, a);
334     long2 R = cast(long2)( _mm_and_pd(A, B) );
335     assert(R.array[0] == correct);
336     assert(R.array[1] == correct);
337 }
338 
339 /// Compute the bitwise AND of 128 bits (representing integer data) in `a` and `b`.
340 __m128i _mm_and_si128 (__m128i a, __m128i b) pure @safe
341 {
342     pragma(inline, true);
343     return a & b;
344 }
345 unittest
346 {
347     __m128i A = _mm_set1_epi32(7);
348     __m128i B = _mm_set1_epi32(14);
349     __m128i R = _mm_and_si128(A, B);
350     int[4] correct = [6, 6, 6, 6];
351     assert(R.array == correct);
352 }
353 
354 /// Compute the bitwise NOT of packed double-precision (64-bit) 
355 /// floating-point elements in `a` and then AND with `b`.
356 __m128d _mm_andnot_pd (__m128d a, __m128d b) pure @safe
357 {
358     return cast(__m128d)( ~(cast(long2)a) & cast(long2)b);
359 }
360 unittest
361 {
362     double a = 4.32;
363     double b = -78.99;
364     long correct  = (~*cast(long*)(&a)) & ( *cast(long*)(&b));
365     long correct2 = ( *cast(long*)(&a)) & (~*cast(long*)(&b));
366     __m128d A = _mm_setr_pd(a, b);
367     __m128d B = _mm_setr_pd(b, a);
368     long2 R = cast(long2)( _mm_andnot_pd(A, B) );
369     assert(R.array[0] == correct);
370     assert(R.array[1] == correct2);
371 }
372 
373 /// Compute the bitwise NOT of 128 bits (representing integer data) 
374 /// in `a` and then AND with `b`.
375 __m128i _mm_andnot_si128 (__m128i a, __m128i b) pure @safe
376 {
377     return (~a) & b;
378 }
379 unittest
380 {
381     __m128i A = _mm_set1_epi32(7);
382     __m128i B = _mm_set1_epi32(14);
383     __m128i R = _mm_andnot_si128(A, B);
384     int[4] correct = [8, 8, 8, 8];
385     assert(R.array == correct);
386 }
387 
388 /// Average packed unsigned 16-bit integers in `a` and `b`.
389 __m128i _mm_avg_epu16 (__m128i a, __m128i b) pure @trusted
390 {
391     static if (GDC_with_SSE2)
392     {
393         return cast(__m128i) __builtin_ia32_pavgw128(cast(short8)a, cast(short8)b);
394     }
395     else static if (LDC_with_ARM64)
396     {
397         return cast(__m128i) vrhadd_u16(cast(short8)a, cast(short8)b);
398     }
399     else version(LDC)
400     {
401         // Generates pavgw even in LDC 1.0, even in -O0
        // But not on ARM
403         enum ir = `
404             %ia = zext <8 x i16> %0 to <8 x i32>
405             %ib = zext <8 x i16> %1 to <8 x i32>
406             %isum = add <8 x i32> %ia, %ib
407             %isum1 = add <8 x i32> %isum, < i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
408             %isums = lshr <8 x i32> %isum1, < i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
409             %r = trunc <8 x i32> %isums to <8 x i16>
410             ret <8 x i16> %r`;
411         return cast(__m128i) LDCInlineIR!(ir, short8, short8, short8)(cast(short8)a, cast(short8)b);
412     }
413     else
414     {
415         short8 sa = cast(short8)a;
416         short8 sb = cast(short8)b;
417         short8 sr = void;
418         foreach(i; 0..8)
419         {
420             sr.ptr[i] = cast(ushort)( (cast(ushort)(sa.array[i]) + cast(ushort)(sb.array[i]) + 1) >> 1 );
421         }
422         return cast(int4)sr;
423     }
424 }
425 unittest
426 {
427     __m128i A = _mm_set1_epi16(31);
428     __m128i B = _mm_set1_epi16(64);
429     short8 avg = cast(short8)(_mm_avg_epu16(A, B));
430     foreach(i; 0..8)
431         assert(avg.array[i] == 48);
432 }
433 
434 /// Average packed unsigned 8-bit integers in `a` and `b`.
435 __m128i _mm_avg_epu8 (__m128i a, __m128i b) pure @trusted
436 {
437     static if (GDC_with_SSE2)
438     {
439         return cast(__m128i) __builtin_ia32_pavgb128(cast(ubyte16)a, cast(ubyte16)b);
440     }
441     else static if (LDC_with_ARM64)
442     {
443         return cast(__m128i) vrhadd_u8(cast(byte16)a, cast(byte16)b);
444     }
445     else version(LDC)
446     {
447         // Generates pavgb even in LDC 1.0, even in -O0
        // But not on ARM
449         enum ir = `
450             %ia = zext <16 x i8> %0 to <16 x i16>
451             %ib = zext <16 x i8> %1 to <16 x i16>
452             %isum = add <16 x i16> %ia, %ib
453             %isum1 = add <16 x i16> %isum, < i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
454             %isums = lshr <16 x i16> %isum1, < i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
455             %r = trunc <16 x i16> %isums to <16 x i8>
456             ret <16 x i8> %r`;
457         return cast(__m128i) LDCInlineIR!(ir, byte16, byte16, byte16)(cast(byte16)a, cast(byte16)b);
458     }
459     else
460     {
461         byte16 sa = cast(byte16)a;
462         byte16 sb = cast(byte16)b;
463         byte16 sr = void;
464         foreach(i; 0..16)
465         {
            sr.ptr[i] = cast(ubyte)( (cast(ubyte)(sa.array[i]) + cast(ubyte)(sb.array[i]) + 1) >> 1 );
467         }
468         return cast(int4)sr;
469     }
470 }
471 unittest
472 {
473     __m128i A = _mm_set1_epi8(31);
474     __m128i B = _mm_set1_epi8(64);
475     byte16 avg = cast(byte16)(_mm_avg_epu8(A, B));
476     foreach(i; 0..16)
477         assert(avg.array[i] == 48);
478 }
479 
480 /// Shift `a` left by `bytes` bytes while shifting in zeros.
481 alias _mm_bslli_si128 = _mm_slli_si128;
482 unittest
483 {
484     __m128i toShift = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
485     byte[16] exact =               [0, 0, 0, 0, 0, 0, 1, 2, 3, 4,  5,  6,  7,  8,  9, 10];
486     __m128i result = _mm_bslli_si128!5(toShift);
487     assert( (cast(byte16)result).array == exact);
488 }
489 
490 /// Shift `v` right by `bytes` bytes while shifting in zeros.
491 alias _mm_bsrli_si128 = _mm_srli_si128;
492 unittest
493 {
494     __m128i toShift = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
495     byte[16] exact =               [5, 6, 7, 8, 9,10,11,12,13,14, 15,  0,  0,  0,  0,  0];
496     __m128i result = _mm_bsrli_si128!5(toShift);
497     assert( (cast(byte16)result).array == exact);
498 }
499 
500 /// Cast vector of type `__m128d` to type `__m128`. 
501 /// Note: Also possible with a regular `cast(__m128)(a)`.
502 __m128 _mm_castpd_ps (__m128d a) pure @safe
503 {
504     return cast(__m128)a;
505 }
506 
507 /// Cast vector of type `__m128d` to type `__m128i`. 
508 /// Note: Also possible with a regular `cast(__m128i)(a)`.
509 __m128i _mm_castpd_si128 (__m128d a) pure @safe
510 {
511     return cast(__m128i)a;
512 }
513 
514 /// Cast vector of type `__m128` to type `__m128d`. 
515 /// Note: Also possible with a regular `cast(__m128d)(a)`.
516 __m128d _mm_castps_pd (__m128 a) pure @safe
517 {
518     return cast(__m128d)a;
519 }
520 
521 /// Cast vector of type `__m128` to type `__m128i`. 
522 /// Note: Also possible with a regular `cast(__m128i)(a)`.
523 __m128i _mm_castps_si128 (__m128 a) pure @safe
524 {
525     return cast(__m128i)a;
526 }
527 
528 /// Cast vector of type `__m128i` to type `__m128d`. 
529 /// Note: Also possible with a regular `cast(__m128d)(a)`.
530 __m128d _mm_castsi128_pd (__m128i a) pure @safe
531 {
532     return cast(__m128d)a;
533 }
534 
535 /// Cast vector of type `__m128i` to type `__m128`. 
536 /// Note: Also possible with a regular `cast(__m128)(a)`.
537 __m128 _mm_castsi128_ps (__m128i a) pure @safe
538 {
539     return cast(__m128)a;
540 }
541 
542 /// Invalidate and flush the cache line that contains `p` 
543 /// from all levels of the cache hierarchy.
544 void _mm_clflush (const(void)* p) @trusted
545 {
546     static if (GDC_with_SSE2)
547     {
548         __builtin_ia32_clflush(p);
549     }
550     else static if (LDC_with_SSE2)
551     {
552         __builtin_ia32_clflush(cast(void*)p);
553     }
554     else version(D_InlineAsm_X86)
555     {
556         asm pure nothrow @nogc @safe
557         {
558             mov EAX, p;
559             clflush [EAX];
560         }
561     }
562     else version(D_InlineAsm_X86_64)
563     {
564         asm pure nothrow @nogc @safe
565         {
566             mov RAX, p;
567             clflush [RAX];
568         }
569     }
570     else 
571     {
        // Do nothing. Skipping the cache flush does
        // not affect correctness.
574     }
575 }
576 unittest
577 {
578     ubyte[64] cacheline;
579     _mm_clflush(cacheline.ptr);
580 }
581 
582 /// Compare packed 16-bit integers in `a` and `b` for equality.
583 __m128i _mm_cmpeq_epi16 (__m128i a, __m128i b) pure @safe
584 {
585     static if (GDC_with_SSE2)
586     {
587         return cast(__m128i) __builtin_ia32_pcmpeqw128(cast(short8)a, cast(short8)b);
588     }
589     else
590     {
591         return cast(__m128i) equalMask!short8(cast(short8)a, cast(short8)b);
592     }
593 }
594 unittest
595 {
596     short8   A = [-3, -2, -1,  0,  0,  1,  2,  3];
597     short8   B = [ 4,  3,  2,  1,  0, -1, -2, -3];
598     short[8] E = [ 0,  0,  0,  0, -1,  0,  0,  0];
599     short8   R = cast(short8)(_mm_cmpeq_epi16(cast(__m128i)A, cast(__m128i)B));
600     assert(R.array == E);
601 }
602 
603 /// Compare packed 32-bit integers in `a` and `b` for equality.
604 __m128i _mm_cmpeq_epi32 (__m128i a, __m128i b) pure @safe
605 {
606     static if (GDC_with_SSE2)
607     {
608         return __builtin_ia32_pcmpeqd128(a, b);
609     }
610     else
611     {
612         return equalMask!__m128i(a, b);
613     }
614 }
615 unittest
616 {
617     int4   A = [-3, -2, -1,  0];
618     int4   B = [ 4, -2,  2,  0];
619     int[4] E = [ 0, -1,  0, -1];
620     int4   R = cast(int4)(_mm_cmpeq_epi32(A, B));
621     assert(R.array == E);
622 }
623 
624 /// Compare packed 8-bit integers in `a` and `b` for equality.
625 __m128i _mm_cmpeq_epi8 (__m128i a, __m128i b) pure @safe
626 {
627     static if (GDC_with_SSE2)
628     {
629         return cast(__m128i) __builtin_ia32_pcmpeqb128(cast(ubyte16)a, cast(ubyte16)b);
630     }
631     else
632     {
633         return cast(__m128i) equalMask!byte16(cast(byte16)a, cast(byte16)b);
634     }
635 }
636 unittest
637 {
638     __m128i A = _mm_setr_epi8(1, 2, 3, 1, 2, 1, 1, 2, 3, 2, 1, 0, 0, 1, 2, 1);
639     __m128i B = _mm_setr_epi8(2, 2, 1, 2, 3, 1, 2, 3, 2, 1, 0, 0, 1, 2, 1, 1);
640     byte16 C = cast(byte16) _mm_cmpeq_epi8(A, B);
641     byte[16] correct =       [0,-1, 0, 0, 0,-1, 0, 0, 0, 0, 0,-1, 0, 0, 0, -1];
642     assert(C.array == correct);
643 }
644 
645 /// Compare packed double-precision (64-bit) floating-point elements 
646 /// in `a` and `b` for equality.
647 __m128d _mm_cmpeq_pd (__m128d a, __m128d b) pure @safe
648 {
649     static if (GDC_with_SSE2)
650     {
651         return __builtin_ia32_cmpeqpd(a, b);
652     }
653     else
654     {
655         return cast(__m128d) cmppd!(FPComparison.oeq)(a, b);
656     }
657 }
658 
659 /// Compare the lower double-precision (64-bit) floating-point elements
660 /// in `a` and `b` for equality, store the result in the lower element,
661 /// and copy the upper element from `a`.
662 __m128d _mm_cmpeq_sd (__m128d a, __m128d b) pure @safe
663 {
664     static if (GDC_with_SSE2)
665     {
666         return __builtin_ia32_cmpeqsd(a, b);
667     }
668     else
669     {
670         return cast(__m128d) cmpsd!(FPComparison.oeq)(a, b);
671     }
672 }
673 
674 /// Compare packed double-precision (64-bit) floating-point elements 
675 /// in `a` and `b` for greater-than-or-equal.
676 __m128d _mm_cmpge_pd (__m128d a, __m128d b) pure @safe
677 {
678     static if (GDC_with_SSE2)
679     {
680         return __builtin_ia32_cmpgepd(a, b);
681     }
682     else
683     {
684         return cast(__m128d) cmppd!(FPComparison.oge)(a, b);
685     }
686 }
687 
688 /// Compare the lower double-precision (64-bit) floating-point elements 
689 /// in `a` and `b` for greater-than-or-equal, store the result in the 
690 /// lower element, and copy the upper element from `a`.
691 __m128d _mm_cmpge_sd (__m128d a, __m128d b) pure @safe
692 {
693     // Note: There is no __builtin_ia32_cmpgesd builtin.
694     static if (GDC_with_SSE2)
695     {
696         return __builtin_ia32_cmpnltsd(b, a);
697     }
698     else
699     {
700         return cast(__m128d) cmpsd!(FPComparison.oge)(a, b);
701     }
702 }
703 
704 /// Compare packed 16-bit integers in `a` and `b` for greater-than.
705 __m128i _mm_cmpgt_epi16 (__m128i a, __m128i b) pure @safe
706 {
707     static if (GDC_with_SSE2)
708     {
709         return cast(__m128i) __builtin_ia32_pcmpgtw128(cast(short8)a, cast(short8)b);
710     }
711     else
712     {
713         return cast(__m128i) greaterMask!short8(cast(short8)a, cast(short8)b);
714     }
715 }
716 unittest
717 {
718     short8   A = [-3, -2, -1,  0,  0,  1,  2,  3];
719     short8   B = [ 4,  3,  2,  1,  0, -1, -2, -3];
720     short[8] E = [ 0,  0,  0,  0,  0, -1, -1, -1];
721     short8   R = cast(short8)(_mm_cmpgt_epi16(cast(__m128i)A, cast(__m128i)B));
722     assert(R.array == E);
723 }
724 
725 /// Compare packed 32-bit integers in `a` and `b` for greater-than.
726 __m128i _mm_cmpgt_epi32 (__m128i a, __m128i b) pure @safe
727 {
728     static if (GDC_with_SSE2)
729     {
730         return __builtin_ia32_pcmpgtd128(a, b); 
731     }
732     else
733     {
734         return cast(__m128i)( greaterMask!int4(a, b));
735     }
736 }
737 unittest
738 {
739     int4   A = [-3,  2, -1,  0];
740     int4   B = [ 4, -2,  2,  0];
741     int[4] E = [ 0, -1,  0,  0];
742     int4   R = cast(int4)(_mm_cmpgt_epi32(A, B));
743     assert(R.array == E);
744 }
745 
746 /// Compare packed 8-bit integers in `a` and `b` for greater-than.
747 __m128i _mm_cmpgt_epi8 (__m128i a, __m128i b) pure @safe
748 {
749     static if (GDC_with_SSE2)
750     {
751         return cast(__m128i) __builtin_ia32_pcmpgtb128(cast(ubyte16)a, cast(ubyte16)b);
752     }
753     else
754     {
755         return cast(__m128i) greaterMask!byte16(cast(byte16)a, cast(byte16)b);
756     }
757 }
758 unittest
759 {
760     __m128i A = _mm_setr_epi8(1, 2, 3, 1, 2, 1, 1, 2, 3, 2, 1, 0, 0, 1, 2, 1);
761     __m128i B = _mm_setr_epi8(2, 2, 1, 2, 3, 1, 2, 3, 2, 1, 0, 0, 1, 2, 1, 1);
762     byte16 C = cast(byte16) _mm_cmpgt_epi8(A, B);
763     byte[16] correct =       [0, 0,-1, 0, 0, 0, 0, 0,-1,-1,-1, 0, 0, 0,-1, 0];
765     assert(C.array == correct);
766 }
767 
768 /// Compare packed double-precision (64-bit) floating-point elements 
769 /// in `a` and `b` for greater-than.
770 __m128d _mm_cmpgt_pd (__m128d a, __m128d b) pure @safe
771 {
772     static if (GDC_with_SSE2)
773     {
774         return __builtin_ia32_cmpgtpd(a, b); 
775     }
776     else
777     {
778         return cast(__m128d) cmppd!(FPComparison.ogt)(a, b);
779     }
780 }
781 
782 /// Compare the lower double-precision (64-bit) floating-point elements 
783 /// in `a` and `b` for greater-than, store the result in the lower element,
784 /// and copy the upper element from `a`.
785 __m128d _mm_cmpgt_sd (__m128d a, __m128d b) pure @safe
786 {
787     // Note: There is no __builtin_ia32_cmpgtsd builtin.
788     static if (GDC_with_SSE2)
789     {
790         return __builtin_ia32_cmpnlesd(b, a);
791     }
792     else
793     {
794         return cast(__m128d) cmpsd!(FPComparison.ogt)(a, b);
795     }
796 }
797 
798 /// Compare packed double-precision (64-bit) floating-point elements 
799 /// in `a` and `b` for less-than-or-equal.
800 __m128d _mm_cmple_pd (__m128d a, __m128d b) pure @safe
801 {
802     static if (GDC_with_SSE2)
803     {
804         return __builtin_ia32_cmplepd(a, b); 
805     }
806     else
807     {
808         return cast(__m128d) cmppd!(FPComparison.ole)(a, b);
809     }
810 }
811 
812 /// Compare the lower double-precision (64-bit) floating-point elements 
813 /// in `a` and `b` for less-than-or-equal, store the result in the 
814 /// lower element, and copy the upper element from `a`.
815 __m128d _mm_cmple_sd (__m128d a, __m128d b) pure @safe
816 {
817     static if (GDC_with_SSE2)
818     {
819         return __builtin_ia32_cmplesd(a, b); 
820     }
821     else
822     {
823         return cast(__m128d) cmpsd!(FPComparison.ole)(a, b);
824     }
825 }
826 
827 /// Compare packed 16-bit integers in `a` and `b` for less-than.
828 __m128i _mm_cmplt_epi16 (__m128i a, __m128i b) pure @safe
829 {
830     return _mm_cmpgt_epi16(b, a);
831 }
832 
833 /// Compare packed 32-bit integers in `a` and `b` for less-than.
834 __m128i _mm_cmplt_epi32 (__m128i a, __m128i b) pure @safe
835 {
836     return _mm_cmpgt_epi32(b, a);
837 }
838 
839 /// Compare packed 8-bit integers in `a` and `b` for less-than.
840 __m128i _mm_cmplt_epi8 (__m128i a, __m128i b) pure @safe
841 {
842     return _mm_cmpgt_epi8(b, a);
843 }
844 
845 /// Compare packed double-precision (64-bit) floating-point elements
846 /// in `a` and `b` for less-than.
847 __m128d _mm_cmplt_pd (__m128d a, __m128d b) pure @safe
848 {
849     static if (GDC_with_SSE2)
850     {
851         return __builtin_ia32_cmpltpd(a, b); 
852     }
853     else
854     {
855         return cast(__m128d) cmppd!(FPComparison.olt)(a, b);
856     }
857 }
858 
859 /// Compare the lower double-precision (64-bit) floating-point elements
860 /// in `a` and `b` for less-than, store the result in the lower 
861 /// element, and copy the upper element from `a`.
862 __m128d _mm_cmplt_sd (__m128d a, __m128d b) pure @safe
863 {
864     static if (GDC_with_SSE2)
865     {
866         return __builtin_ia32_cmpltsd(a, b); 
867     }
868     else
869     {
870         return cast(__m128d) cmpsd!(FPComparison.olt)(a, b);
871     }
872 }
873 
874 /// Compare packed double-precision (64-bit) floating-point elements
875 /// in `a` and `b` for not-equal.
876 __m128d _mm_cmpneq_pd (__m128d a, __m128d b) pure @safe
877 {
878     static if (GDC_with_SSE2)
879     {
880         return __builtin_ia32_cmpneqpd(a, b); 
881     }
882     else
883     {
884         return cast(__m128d) cmppd!(FPComparison.une)(a, b);
885     }
886 }
887 
888 /// Compare the lower double-precision (64-bit) floating-point elements
889 /// in `a` and `b` for not-equal, store the result in the lower 
890 /// element, and copy the upper element from `a`.
891 __m128d _mm_cmpneq_sd (__m128d a, __m128d b) pure @safe
892 {
893     static if (GDC_with_SSE2)
894     {
895         return __builtin_ia32_cmpneqsd(a, b); 
896     }
897     else
898     {
899         return cast(__m128d) cmpsd!(FPComparison.une)(a, b);
900     }
901 }
902 
903 /// Compare packed double-precision (64-bit) floating-point elements 
904 /// in `a` and `b` for not-greater-than-or-equal.
905 __m128d _mm_cmpnge_pd (__m128d a, __m128d b) pure @safe
906 {
907     static if (GDC_with_SSE2)
908     {
909         return __builtin_ia32_cmpngepd(a, b); 
910     }
911     else
912     {
913         return cast(__m128d) cmppd!(FPComparison.ult)(a, b);
914     }
915 }
916 
917 /// Compare the lower double-precision (64-bit) floating-point elements 
918 /// in `a` and `b` for not-greater-than-or-equal, store the result in 
919 /// the lower element, and copy the upper element from `a`.
920 __m128d _mm_cmpnge_sd (__m128d a, __m128d b) pure @safe
921 {
922     // Note: There is no __builtin_ia32_cmpngesd builtin.
923     static if (GDC_with_SSE2)
924     {
925         return __builtin_ia32_cmpltsd(b, a); 
926     }
927     else
928     {
929         return cast(__m128d) cmpsd!(FPComparison.ult)(a, b);
930     }
931 }
932 
933 /// Compare packed double-precision (64-bit) floating-point elements 
934 /// in `a` and `b` for not-greater-than.
935 __m128d _mm_cmpngt_pd (__m128d a, __m128d b) pure @safe
936 {
937     static if (GDC_with_SSE2)
938     {
939         return __builtin_ia32_cmpngtpd(a, b);
940     }
941     else
942     {
943         return cast(__m128d) cmppd!(FPComparison.ule)(a, b);
944     }
945 }
946 
947 /// Compare the lower double-precision (64-bit) floating-point elements 
948 /// in `a` and `b` for not-greater-than, store the result in the 
949 /// lower element, and copy the upper element from `a`.
950 __m128d _mm_cmpngt_sd (__m128d a, __m128d b) pure @safe
951 {
952     // Note: There is no __builtin_ia32_cmpngtsd builtin.
953     static if (GDC_with_SSE2)
954     {
955         return __builtin_ia32_cmplesd(b, a);
956     }
957     else
958     {
959         return cast(__m128d) cmpsd!(FPComparison.ule)(a, b);
960     }
961 }
962 
963 /// Compare packed double-precision (64-bit) floating-point elements 
964 /// in `a` and `b` for not-less-than-or-equal.
965 __m128d _mm_cmpnle_pd (__m128d a, __m128d b) pure @safe
966 {
967     static if (GDC_with_SSE2)
968     {
969         return __builtin_ia32_cmpnlepd(a, b);
970     }
971     else
972     {
973         return cast(__m128d) cmppd!(FPComparison.ugt)(a, b);
974     }
975 }
976 
977 /// Compare the lower double-precision (64-bit) floating-point elements 
978 /// in `a` and `b` for not-less-than-or-equal, store the result in the 
979 /// lower element, and copy the upper element from `a`.
980 __m128d _mm_cmpnle_sd (__m128d a, __m128d b) pure @safe
981 {
982     static if (GDC_with_SSE2)
983     {
984         return __builtin_ia32_cmpnlesd(a, b);
985     }
986     else
987     {
988         return cast(__m128d) cmpsd!(FPComparison.ugt)(a, b);
989     }
990 }
991  
992 /// Compare packed double-precision (64-bit) floating-point elements 
993 /// in `a` and `b` for not-less-than.
994 __m128d _mm_cmpnlt_pd (__m128d a, __m128d b) pure @safe
995 {
996     static if (GDC_with_SSE2)
997     {
998         return __builtin_ia32_cmpnltpd(a, b);
999     }
1000     else
1001     {
1002         return cast(__m128d) cmppd!(FPComparison.uge)(a, b);
1003     }
1004 }
1005 
1006 /// Compare the lower double-precision (64-bit) floating-point elements 
1007 /// in `a` and `b` for not-less-than, store the result in the lower 
1008 /// element, and copy the upper element from `a`.
1009 __m128d _mm_cmpnlt_sd (__m128d a, __m128d b) pure @safe
1010 {
1011     static if (GDC_with_SSE2)
1012     {
1013         return __builtin_ia32_cmpnltsd(a, b);
1014     }
1015     else
1016     {
1017         return cast(__m128d) cmpsd!(FPComparison.uge)(a, b);
1018     }
1019 }
1020 
1021 /// Compare packed double-precision (64-bit) floating-point elements 
1022 /// in `a` and `b` to see if neither is NaN.
1023 __m128d _mm_cmpord_pd (__m128d a, __m128d b) pure @safe
1024 {
1025     static if (GDC_with_SSE2)
1026     {
1027         return __builtin_ia32_cmpordpd(a, b);
1028     }
1029     else
1030     {
1031         return cast(__m128d) cmppd!(FPComparison.ord)(a, b);
1032     }
1033 }
1034 
1035 /// Compare the lower double-precision (64-bit) floating-point elements 
1036 /// in `a` and `b` to see if neither is NaN, store the result in the 
1037 /// lower element, and copy the upper element from `a` to the upper element.
1038 __m128d _mm_cmpord_sd (__m128d a, __m128d b) pure @safe
1039 {
1040     static if (GDC_with_SSE2)
1041     {
1042         return __builtin_ia32_cmpordsd(a, b);
1043     }
1044     else
1045     {
1046         return cast(__m128d) cmpsd!(FPComparison.ord)(a, b);
1047     }
1048 }
1049 
1050 /// Compare packed double-precision (64-bit) floating-point elements 
1051 /// in `a` and `b` to see if either is NaN.
1052 __m128d _mm_cmpunord_pd (__m128d a, __m128d b) pure @safe
1053 {
1054     static if (GDC_with_SSE2)
1055     {
1056         return __builtin_ia32_cmpunordpd(a, b);
1057     }
1058     else
1059     {
1060         return cast(__m128d) cmppd!(FPComparison.uno)(a, b);
1061     }
1062 }
1063 
1064 /// Compare the lower double-precision (64-bit) floating-point elements 
1065 /// in `a` and `b` to see if either is NaN, store the result in the lower 
1066 /// element, and copy the upper element from `a` to the upper element.
1067 __m128d _mm_cmpunord_sd (__m128d a, __m128d b) pure @safe
1068 {
1069     static if (GDC_with_SSE2)
1070     {
1071         return __builtin_ia32_cmpunordsd(a, b);
1072     }
1073     else
1074     {
1075         return cast(__m128d) cmpsd!(FPComparison.uno)(a, b);
1076     }
1077 }
1078 
1079 /// Compare the lower double-precision (64-bit) floating-point element 
1080 /// in `a` and `b` for equality, and return the boolean result (0 or 1).
1081 int _mm_comieq_sd (__m128d a, __m128d b) pure @safe
1082 {
    // Note: for some of the _mm_comixx_sx intrinsics, the NaN semantics differ from those of
    // the comisd instruction: the intrinsic returns false for unordered operands instead.
    //
    // C++ compilers actually disagree over the meaning of that instruction.
    // GCC handles NaNs like the comisd instruction (returns true if unordered),
    // but ICC, clang and MSVC handle NaN the way the Intel Intrinsics Guide says.
    // We side with the majority; it seems GCC is buggy with NaNs.
1090     return a.array[0] == b.array[0];
1091 }
1092 unittest
1093 {
1094     assert(1 == _mm_comieq_sd(_mm_set_sd(78.0), _mm_set_sd(78.0)));
1095     assert(0 == _mm_comieq_sd(_mm_set_sd(78.0), _mm_set_sd(-78.0)));
1096     assert(0 == _mm_comieq_sd(_mm_set_sd(78.0), _mm_set_sd(double.nan)));
1097     assert(0 == _mm_comieq_sd(_mm_set_sd(double.nan), _mm_set_sd(-4.22)));
1098     assert(1 == _mm_comieq_sd(_mm_set_sd(0.0), _mm_set_sd(-0.0)));
1099 }
1100 
1101 /// Compare the lower double-precision (64-bit) floating-point element 
1102 /// in `a` and `b` for greater-than-or-equal, and return the boolean 
1103 /// result (0 or 1).
1104 int _mm_comige_sd (__m128d a, __m128d b) pure @safe
1105 {
1106     return a.array[0] >= b.array[0];
1107 }
1108 unittest
1109 {
1110     assert(1 == _mm_comige_sd(_mm_set_sd(78.0), _mm_set_sd(78.0)));
1111     assert(1 == _mm_comige_sd(_mm_set_sd(78.0), _mm_set_sd(-78.0)));
1112     assert(0 == _mm_comige_sd(_mm_set_sd(-78.0), _mm_set_sd(78.0)));
1113     assert(0 == _mm_comige_sd(_mm_set_sd(78.0), _mm_set_sd(double.nan)));
1114     assert(0 == _mm_comige_sd(_mm_set_sd(double.nan), _mm_set_sd(-4.22)));
1115     assert(1 == _mm_comige_sd(_mm_set_sd(-0.0), _mm_set_sd(0.0)));
1116 }
1117 
1118 /// Compare the lower double-precision (64-bit) floating-point element 
1119 /// in `a` and `b` for greater-than, and return the boolean result (0 or 1).
1120 int _mm_comigt_sd (__m128d a, __m128d b) pure @safe
1121 {
1122     return a.array[0] > b.array[0];
1123 }
1124 unittest
1125 {
1126     assert(0 == _mm_comigt_sd(_mm_set_sd(78.0), _mm_set_sd(78.0)));
1127     assert(1 == _mm_comigt_sd(_mm_set_sd(78.0), _mm_set_sd(-78.0)));
1128     assert(0 == _mm_comigt_sd(_mm_set_sd(78.0), _mm_set_sd(double.nan)));
1129     assert(0 == _mm_comigt_sd(_mm_set_sd(double.nan), _mm_set_sd(-4.22)));
1130     assert(0 == _mm_comigt_sd(_mm_set_sd(0.0), _mm_set_sd(-0.0)));
1131 }
1132 
1133 /// Compare the lower double-precision (64-bit) floating-point element 
/// in `a` and `b` for less-than-or-equal, and return the boolean result (0 or 1).
1135 int _mm_comile_sd (__m128d a, __m128d b) pure @safe
1136 {
1137     return a.array[0] <= b.array[0];
1138 }
1139 unittest
1140 {
1141     assert(1 == _mm_comile_sd(_mm_set_sd(78.0), _mm_set_sd(78.0)));
1142     assert(0 == _mm_comile_sd(_mm_set_sd(78.0), _mm_set_sd(-78.0)));
1143     assert(1 == _mm_comile_sd(_mm_set_sd(-78.0), _mm_set_sd(78.0)));
1144     assert(0 == _mm_comile_sd(_mm_set_sd(78.0), _mm_set_sd(double.nan)));
1145     assert(0 == _mm_comile_sd(_mm_set_sd(double.nan), _mm_set_sd(-4.22)));
1146     assert(1 == _mm_comile_sd(_mm_set_sd(0.0), _mm_set_sd(-0.0)));
1147 }
1148 
1149 /// Compare the lower double-precision (64-bit) floating-point element 
1150 /// in `a` and `b` for less-than, and return the boolean result (0 or 1).
1151 int _mm_comilt_sd (__m128d a, __m128d b) pure @safe
1152 {
1153     return a.array[0] < b.array[0];
1154 }
1155 unittest
1156 {
1157     assert(0 == _mm_comilt_sd(_mm_set_sd(78.0), _mm_set_sd(78.0)));
1158     assert(0 == _mm_comilt_sd(_mm_set_sd(78.0), _mm_set_sd(-78.0)));
1159     assert(1 == _mm_comilt_sd(_mm_set_sd(-78.0), _mm_set_sd(78.0)));
1160     assert(0 == _mm_comilt_sd(_mm_set_sd(78.0), _mm_set_sd(double.nan)));
1161     assert(0 == _mm_comilt_sd(_mm_set_sd(double.nan), _mm_set_sd(-4.22)));
1162     assert(0 == _mm_comilt_sd(_mm_set_sd(-0.0), _mm_set_sd(0.0)));
1163 }
1164 
1165 /// Compare the lower double-precision (64-bit) floating-point element
1166 /// in `a` and `b` for not-equal, and return the boolean result (0 or 1).
1167 int _mm_comineq_sd (__m128d a, __m128d b) pure @safe
1168 {
1169     return a.array[0] != b.array[0];
1170 }
1171 unittest
1172 {
1173     assert(0 == _mm_comineq_sd(_mm_set_sd(78.0), _mm_set_sd(78.0)));
1174     assert(1 == _mm_comineq_sd(_mm_set_sd(78.0), _mm_set_sd(-78.0)));
1175     assert(1 == _mm_comineq_sd(_mm_set_sd(78.0), _mm_set_sd(double.nan)));
1176     assert(1 == _mm_comineq_sd(_mm_set_sd(double.nan), _mm_set_sd(-4.22)));
1177     assert(0 == _mm_comineq_sd(_mm_set_sd(0.0), _mm_set_sd(-0.0)));
1178 }
1179 
1180 /// Convert packed 32-bit integers in `a` to packed double-precision (64-bit)
1181 /// floating-point elements.
__m128d _mm_cvtepi32_pd (__m128i a) pure @trusted
1183 {
1184     version(LDC)
1185     {
1186         // Generates cvtdq2pd since LDC 1.0, even without optimizations
1187         enum ir = `
1188             %v = shufflevector <4 x i32> %0,<4 x i32> %0, <2 x i32> <i32 0, i32 1>
1189             %r = sitofp <2 x i32> %v to <2 x double>
1190             ret <2 x double> %r`;
1191         return cast(__m128d) LDCInlineIR!(ir, __m128d, __m128i)(a);
1192     }
1193     else static if (GDC_with_SSE2)
1194     {
1195         return __builtin_ia32_cvtdq2pd(a);
1196     }
1197     else
1198     {
1199         double2 r = void;
1200         r.ptr[0] = a.array[0];
1201         r.ptr[1] = a.array[1];
1202         return r;
1203     }
1204 }
1205 unittest
1206 {
1207     __m128d A = _mm_cvtepi32_pd(_mm_set1_epi32(54));
1208     assert(A.array[0] == 54.0);
1209     assert(A.array[1] == 54.0);
1210 }
1211 
1212 /// Convert packed 32-bit integers in `a` to packed single-precision (32-bit) 
1213 /// floating-point elements.
1214 __m128 _mm_cvtepi32_ps(__m128i a) pure @trusted
1215 {
1216     static if (GDC_with_SSE2)
1217     {
1218         return __builtin_ia32_cvtdq2ps(a);
1219     }
1220     else
1221     {
1222         // x86: Generates cvtdq2ps since LDC 1.0.0 -O1
        // ARM: Generates scvtf.4s since LDC 1.8.0 -O2
1224         __m128 res;
1225         res.ptr[0] = cast(float)a.array[0];
1226         res.ptr[1] = cast(float)a.array[1];
1227         res.ptr[2] = cast(float)a.array[2];
1228         res.ptr[3] = cast(float)a.array[3];
1229         return res;
1230     }
1231 }
1232 unittest
1233 {
1234     __m128 a = _mm_cvtepi32_ps(_mm_setr_epi32(-1, 0, 1, 1000));
1235     assert(a.array == [-1.0f, 0.0f, 1.0f, 1000.0f]);
1236 }
1237 
1238 /// Convert packed double-precision (64-bit) floating-point elements 
1239 /// in `a` to packed 32-bit integers.
1240 __m128i _mm_cvtpd_epi32 (__m128d a) @trusted
1241 {
1243     static if (LDC_with_SSE2)
1244     {
1245         return __builtin_ia32_cvtpd2dq(a);
1246     }
1247     else static if (GDC_with_SSE2)
1248     {
1249         return __builtin_ia32_cvtpd2dq(a);
1250     }
1251     else static if (LDC_with_ARM64)
1252     {
1253         // Get current rounding mode.
1254         uint fpscr = arm_get_fpcr();
1255         long2 i;
1256         switch(fpscr & _MM_ROUND_MASK_ARM)
1257         {
1258             default:
1259             case _MM_ROUND_NEAREST_ARM:     i = vcvtnq_s64_f64(a); break;
1260             case _MM_ROUND_DOWN_ARM:        i = vcvtmq_s64_f64(a); break;
1261             case _MM_ROUND_UP_ARM:          i = vcvtpq_s64_f64(a); break;
1262             case _MM_ROUND_TOWARD_ZERO_ARM: i = vcvtzq_s64_f64(a); break;
1263         }
1264         int4 zero = 0;
1265         return cast(__m128i) shufflevector!(int4, 0, 2, 4, 6)(cast(int4)i, zero);
1266     }
1267     else
1268     {
1269         // PERF ARM32
1270         __m128i r = _mm_setzero_si128();
1271         r.ptr[0] = convertDoubleToInt32UsingMXCSR(a.array[0]);
1272         r.ptr[1] = convertDoubleToInt32UsingMXCSR(a.array[1]);
1273         return r;
1274     }
1275 }
1276 unittest
1277 {
1278     int4 A = _mm_cvtpd_epi32(_mm_set_pd(61.0, 55.0));
1279     assert(A.array[0] == 55 && A.array[1] == 61 && A.array[2] == 0 && A.array[3] == 0);
1280 }
1281 
1282 /// Convert packed double-precision (64-bit) floating-point elements in `v`
/// to packed 32-bit integers.
1284 __m64 _mm_cvtpd_pi32 (__m128d v) @safe
1285 {
1286     return to_m64(_mm_cvtpd_epi32(v));
1287 }
1288 unittest
1289 {
1290     int2 A = cast(int2) _mm_cvtpd_pi32(_mm_set_pd(61.0, 55.0));
1291     assert(A.array[0] == 55 && A.array[1] == 61);
1292 }
1293 
1294 /// Convert packed double-precision (64-bit) floating-point elements 
1295 /// in `a` to packed single-precision (32-bit) floating-point elements.
1296 __m128 _mm_cvtpd_ps (__m128d a) pure @trusted
1297 {
1298     static if (LDC_with_SSE2)
1299     {
1300         return __builtin_ia32_cvtpd2ps(a); // can't be done with IR unfortunately
1301     }
1302     else static if (GDC_with_SSE2)
1303     {
1304         return __builtin_ia32_cvtpd2ps(a);
1305     }
1306     else
1307     { 
1308         __m128 r = void;
1309         r.ptr[0] = a.array[0];
1310         r.ptr[1] = a.array[1];
1311         r.ptr[2] = 0;
1312         r.ptr[3] = 0;
1313         return r;
1314     }
1315 }
1316 unittest
1317 {
1318     __m128d A = _mm_set_pd(5.25, 4.0);
1319     __m128 B = _mm_cvtpd_ps(A);
1320     assert(B.array == [4.0f, 5.25f, 0, 0]);
1321 }
1322 
1323 /// Convert packed 32-bit integers in `v` to packed double-precision 
1324 /// (64-bit) floating-point elements.
1325 __m128d _mm_cvtpi32_pd (__m64 v) pure @safe
1326 {
1327     return _mm_cvtepi32_pd(to_m128i(v));
1328 }
1329 unittest
1330 {
1331     __m128d A = _mm_cvtpi32_pd(_mm_setr_pi32(4, -5));
1332     assert(A.array[0] == 4.0 && A.array[1] == -5.0);
1333 }
1334 
1335 /// Convert packed single-precision (32-bit) floating-point elements 
/// in `a` to packed 32-bit integers.
1337 __m128i _mm_cvtps_epi32 (__m128 a) @trusted
1338 {
1339     static if (LDC_with_SSE2)
1340     {
1341         return cast(__m128i) __builtin_ia32_cvtps2dq(a);
1342     }
1343     else static if (GDC_with_SSE2)
1344     {
1345         return __builtin_ia32_cvtps2dq(a);
1346     }
1347     else static if (LDC_with_ARM64)
1348     {
1349         // Get current rounding mode.
1350         uint fpscr = arm_get_fpcr();
1351         switch(fpscr & _MM_ROUND_MASK_ARM)
1352         {
1353             default:
1354             case _MM_ROUND_NEAREST_ARM:     return vcvtnq_s32_f32(a);
1355             case _MM_ROUND_DOWN_ARM:        return vcvtmq_s32_f32(a);
1356             case _MM_ROUND_UP_ARM:          return vcvtpq_s32_f32(a);
1357             case _MM_ROUND_TOWARD_ZERO_ARM: return vcvtzq_s32_f32(a);
1358         }
1359     }
1360     else
1361     {
1362         __m128i r = void;
1363         r.ptr[0] = convertFloatToInt32UsingMXCSR(a.array[0]);
1364         r.ptr[1] = convertFloatToInt32UsingMXCSR(a.array[1]);
1365         r.ptr[2] = convertFloatToInt32UsingMXCSR(a.array[2]);
1366         r.ptr[3] = convertFloatToInt32UsingMXCSR(a.array[3]);
1367         return r;
1368     }
1369 }
1370 unittest
1371 {
    // GDC bug #98607
    // https://gcc.gnu.org/bugzilla/show_bug.cgi?id=98607
    // GDC does not provide an optimization barrier for rounding mode changes.
    // Worked around with different literals; this bug will likely only manifest in unittests.
    // GCC people provided no actual fix and instead say other compilers are buggy... when they aren't.
1377 
1378     uint savedRounding = _MM_GET_ROUNDING_MODE();
1379 
1380     _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST);
1381     __m128i A = _mm_cvtps_epi32(_mm_setr_ps(1.4f, -2.1f, 53.5f, -2.9f));
1382     assert(A.array == [1, -2, 54, -3]);
1383 
1384     _MM_SET_ROUNDING_MODE(_MM_ROUND_DOWN);
1385     A = _mm_cvtps_epi32(_mm_setr_ps(1.3f, -2.11f, 53.4f, -2.8f));
1386     assert(A.array == [1, -3, 53, -3]);
1387 
1388     _MM_SET_ROUNDING_MODE(_MM_ROUND_UP);
1389     A = _mm_cvtps_epi32(_mm_setr_ps(1.3f, -2.12f, 53.6f, -2.7f));
1390     assert(A.array == [2, -2, 54, -2]);
1391 
1392     _MM_SET_ROUNDING_MODE(_MM_ROUND_TOWARD_ZERO);
1393     A = _mm_cvtps_epi32(_mm_setr_ps(1.4f, -2.17f, 53.8f, -2.91f));
1394     assert(A.array == [1, -2, 53, -2]);
1395 
1396     _MM_SET_ROUNDING_MODE(savedRounding);
1397 }
1398 
1399 /// Convert packed single-precision (32-bit) floating-point elements 
1400 /// in `a` to packed double-precision (64-bit) floating-point elements.
1401 __m128d _mm_cvtps_pd (__m128 a) pure @trusted
1402 {
1403     version(LDC)
1404     {
1405         // Generates cvtps2pd since LDC 1.0 -O0
1406         enum ir = `
1407             %v = shufflevector <4 x float> %0,<4 x float> %0, <2 x i32> <i32 0, i32 1>
1408             %r = fpext <2 x float> %v to <2 x double>
1409             ret <2 x double> %r`;
1410         return cast(__m128d) LDCInlineIR!(ir, __m128d, __m128)(a);
1411     }
1412     else static if (GDC_with_SSE2)
1413     {
1414         return __builtin_ia32_cvtps2pd(a);
1415     }
1416     else
1417     {
1418         double2 r = void;
1419         r.ptr[0] = a.array[0];
1420         r.ptr[1] = a.array[1];
1421         return r;
1422     }
1423 }
1424 unittest
1425 {
1426     __m128d A = _mm_cvtps_pd(_mm_set1_ps(54.0f));
1427     assert(A.array[0] == 54.0);
1428     assert(A.array[1] == 54.0);
1429 }
1430 
1431 /// Copy the lower double-precision (64-bit) floating-point element of `a`.
1432 double _mm_cvtsd_f64 (__m128d a) pure @safe
1433 {
1434     return a.array[0];
1435 }
1436 
1437 /// Convert the lower double-precision (64-bit) floating-point element
1438 /// in `a` to a 32-bit integer.
1439 int _mm_cvtsd_si32 (__m128d a) @safe
1440 {
1441     static if (LDC_with_SSE2)
1442     {
1443         return __builtin_ia32_cvtsd2si(a);
1444     }
1445     else static if (GDC_with_SSE2)
1446     {
1447         return __builtin_ia32_cvtsd2si(a);
1448     }
1449     else
1450     {
1451         return convertDoubleToInt32UsingMXCSR(a[0]);
1452     }
1453 }
1454 unittest
1455 {
1456     assert(4 == _mm_cvtsd_si32(_mm_set1_pd(4.0)));
1457 }
1458 
1459 /// Convert the lower double-precision (64-bit) floating-point element in `a` to a 64-bit integer.
1460 long _mm_cvtsd_si64 (__m128d a) @trusted
1461 {
1462     version (LDC)
1463     {
1464         version (X86_64)
1465         {
1466             return __builtin_ia32_cvtsd2si64(a);
1467         }
1468         else
1469         {
            // Note: on 32-bit x86, there is no way to convert a float/double to a 64-bit integer
            // using SSE instructions only, so the builtin doesn't exist for this arch.
1472             return convertDoubleToInt64UsingMXCSR(a[0]);
1473         }
1474     }
1475     else
1476     {
1477         return convertDoubleToInt64UsingMXCSR(a.array[0]);
1478     }
1479 }
1480 unittest
1481 {
1482     assert(-4 == _mm_cvtsd_si64(_mm_set1_pd(-4.0)));
1483 
1484     uint savedRounding = _MM_GET_ROUNDING_MODE();
1485 
1486     _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST);
1487     assert(-56468486186 == _mm_cvtsd_si64(_mm_set1_pd(-56468486186.49)));
1488 
1489     _MM_SET_ROUNDING_MODE(_MM_ROUND_DOWN);
1490     assert(-56468486187 == _mm_cvtsd_si64(_mm_set1_pd(-56468486186.1)));
1491 
1492     _MM_SET_ROUNDING_MODE(_MM_ROUND_UP);
1493     assert(56468486187 == _mm_cvtsd_si64(_mm_set1_pd(56468486186.1)));
1494 
1495     _MM_SET_ROUNDING_MODE(_MM_ROUND_TOWARD_ZERO);
1496     assert(-56468486186 == _mm_cvtsd_si64(_mm_set1_pd(-56468486186.9)));
1497 
1498     _MM_SET_ROUNDING_MODE(savedRounding);
1499 }
1500 
1501 deprecated("Use _mm_cvtsd_si64 instead") alias _mm_cvtsd_si64x = _mm_cvtsd_si64; ///
1502 
1503 /// Convert the lower double-precision (64-bit) floating-point element in `b` to a single-precision (32-bit) 
1504 /// floating-point element, store that in the lower element of result, and copy the upper 3 packed elements from `a`
1505 /// to the upper elements of result.
1506 __m128 _mm_cvtsd_ss (__m128 a, __m128d b) pure @trusted
1507 {
1508     static if (GDC_with_SSE2)
1509     {
1510         return __builtin_ia32_cvtsd2ss(a, b); 
1511     }
1512     else
1513     {
1514         // Generates cvtsd2ss since LDC 1.3 -O0
1515         a.ptr[0] = b.array[0];
1516         return a;
1517     }
1518 }
1519 unittest
1520 {
1521     __m128 R = _mm_cvtsd_ss(_mm_set1_ps(4.0f), _mm_set1_pd(3.0));
1522     assert(R.array == [3.0f, 4.0f, 4.0f, 4.0f]);
1523 }
1524 
1525 /// Get the lower 32-bit integer in `a`.
1526 int _mm_cvtsi128_si32 (__m128i a) pure @safe
1527 {
1528     return a.array[0];
1529 }
1530 
1531 /// Get the lower 64-bit integer in `a`.
1532 long _mm_cvtsi128_si64 (__m128i a) pure @safe
1533 {
1534     long2 la = cast(long2)a;
1535     return la.array[0];
1536 }
1537 deprecated("Use _mm_cvtsi128_si64 instead") alias _mm_cvtsi128_si64x = _mm_cvtsi128_si64;
1538 
1539 /// Convert the signed 32-bit integer `b` to a double-precision (64-bit) floating-point element, store that in the 
1540 /// lower element of result, and copy the upper element from `a` to the upper element of result.
1541 __m128d _mm_cvtsi32_sd(__m128d a, int b) pure @trusted
1542 {
1543     a.ptr[0] = cast(double)b;
1544     return a;
1545 }
1546 unittest
1547 {
1548     __m128d a = _mm_cvtsi32_sd(_mm_set1_pd(0.0f), 42);
1549     assert(a.array == [42.0, 0]);
1550 }
1551 
1552 /// Copy 32-bit integer `a` to the lower element of result, and zero the upper elements.
1553 __m128i _mm_cvtsi32_si128 (int a) pure @trusted
1554 {
1555     int4 r = [0, 0, 0, 0];
1556     r.ptr[0] = a;
1557     return r;
1558 }
1559 unittest
1560 {
1561     __m128i a = _mm_cvtsi32_si128(65);
1562     assert(a.array == [65, 0, 0, 0]);
1563 }
1564 
/// Convert the signed 64-bit integer `b` to a double-precision (64-bit) floating-point element, store the result in 
/// the lower element of result, and copy the upper element from `a` to the upper element of result.
__m128d _mm_cvtsi64_sd(__m128d a, long b) pure @trusted
1569 {
1570     a.ptr[0] = cast(double)b;
1571     return a;
1572 }
1573 unittest
1574 {
1575     __m128d a = _mm_cvtsi64_sd(_mm_set1_pd(0.0f), 42);
1576     assert(a.array == [42.0, 0]);
1577 }
1578 
1579 /// Copy 64-bit integer `a` to the lower element of result, and zero the upper element.
1580 __m128i _mm_cvtsi64_si128 (long a) pure @trusted
1581 {
1582     long2 r = [0, 0];
1583     r.ptr[0] = a;
1584     return cast(__m128i)(r);
1585 }
1586 
1587 deprecated("Use _mm_cvtsi64_sd instead") alias _mm_cvtsi64x_sd = _mm_cvtsi64_sd; ///
1588 deprecated("Use _mm_cvtsi64_si128 instead") alias _mm_cvtsi64x_si128 = _mm_cvtsi64_si128; ///
1589 
1590 /// Convert the lower single-precision (32-bit) floating-point element in `b` to a double-precision (64-bit) 
1591 /// floating-point element, store that in the lower element of result, and copy the upper element from `a` to the upper 
/// element of result.
1593 double2 _mm_cvtss_sd(double2 a, float4 b) pure @trusted
1594 {
1595     a.ptr[0] = b.array[0];
1596     return a;
1597 }
1598 unittest
1599 {
1600     __m128d a = _mm_cvtss_sd(_mm_set1_pd(0.0f), _mm_set1_ps(42.0f));
1601     assert(a.array == [42.0, 0]);
1602 }
1603 
1604 /// Convert the lower single-precision (32-bit) floating-point element in `a` to a 64-bit integer with truncation.
1605 long _mm_cvttss_si64 (__m128 a) pure @safe
1606 {
1607     return cast(long)(a.array[0]); // Generates cvttss2si as expected
1608 }
1609 unittest
1610 {
1611     assert(1 == _mm_cvttss_si64(_mm_setr_ps(1.9f, 2.0f, 3.0f, 4.0f)));
1612 }
1613 
1614 /// Convert packed double-precision (64-bit) floating-point elements in `a` to packed 32-bit integers with truncation.
1615 /// Put zeroes in the upper elements of result.
1616 __m128i _mm_cvttpd_epi32 (__m128d a) pure @trusted
1617 {
1618     static if (LDC_with_SSE2)
1619     {
1620         return __builtin_ia32_cvttpd2dq(a);
1621     }
1622     else static if (GDC_with_SSE2)
1623     {
1624         return __builtin_ia32_cvttpd2dq(a);
1625     }
1626     else
1627     {
1628         // Note: doesn't generate cvttpd2dq as of LDC 1.13
1629         __m128i r;
1630         r.ptr[0] = cast(int)a.array[0];
1631         r.ptr[1] = cast(int)a.array[1];
1632         r.ptr[2] = 0;
1633         r.ptr[3] = 0;
1634         return r;
1635     }
1636 }
1637 unittest
1638 {
1639     __m128i R = _mm_cvttpd_epi32(_mm_setr_pd(-4.9, 45641.5f));
1640     assert(R.array == [-4, 45641, 0, 0]);
1641 }
1642 
1643 /// Convert packed double-precision (64-bit) floating-point elements in `v` 
1644 /// to packed 32-bit integers with truncation.
1645 __m64 _mm_cvttpd_pi32 (__m128d v) pure @safe
1646 {
1647     return to_m64(_mm_cvttpd_epi32(v));
1648 }
1649 unittest
1650 {
1651     int2 R = cast(int2) _mm_cvttpd_pi32(_mm_setr_pd(-4.9, 45641.7f));
1652     int[2] correct = [-4, 45641];
1653     assert(R.array == correct);
1654 }
1655 
1656 /// Convert packed single-precision (32-bit) floating-point elements in `a` to packed 32-bit integers with truncation.
1657 __m128i _mm_cvttps_epi32 (__m128 a) pure @trusted
1658 {
1659     // x86: Generates cvttps2dq since LDC 1.3 -O2
1660     // ARM64: generates fcvtze since LDC 1.8 -O2
1661     __m128i r;
1662     r.ptr[0] = cast(int)a.array[0];
1663     r.ptr[1] = cast(int)a.array[1];
1664     r.ptr[2] = cast(int)a.array[2];
1665     r.ptr[3] = cast(int)a.array[3];
1666     return r;
1667 }
1668 unittest
1669 {
1670     __m128i R = _mm_cvttps_epi32(_mm_setr_ps(-4.9, 45641.5f, 0.0f, 1.0f));
1671     assert(R.array == [-4, 45641, 0, 1]);
1672 }
1673 
1674 /// Convert the lower double-precision (64-bit) floating-point element in `a` to a 32-bit integer with truncation.
int _mm_cvttsd_si32 (__m128d a) pure @safe
1676 {
1677     // Generates cvttsd2si since LDC 1.3 -O0
1678     return cast(int)a.array[0];
1679 }
1680 
1681 /// Convert the lower double-precision (64-bit) floating-point element in `a` to a 64-bit integer with truncation.
long _mm_cvttsd_si64 (__m128d a) pure @safe
1683 {
1684     // Generates cvttsd2si since LDC 1.3 -O0
    // but in 32-bit it's instead a long sequence that resorts to the FPU
1686     return cast(long)a.array[0];
1687 }
1688 
1689 deprecated("Use _mm_cvttsd_si64 instead") alias _mm_cvttsd_si64x = _mm_cvttsd_si64; ///
1690 
1691 /// Divide packed double-precision (64-bit) floating-point elements in `a` by packed elements in `b`.
1692 __m128d _mm_div_pd(__m128d a, __m128d b) pure @safe
1693 {
1694     pragma(inline, true);
1695     return a / b;
1696 }
1697 
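/// Divide the lower double-precision (64-bit) floating-point element in `a` by the lower double-precision (64-bit) 
/// floating-point element in `b`, store the result in the lower element of result, and copy the upper element from 
/// `a` to the upper element of result.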
1698 __m128d _mm_div_sd(__m128d a, __m128d b) pure @trusted
1699 {
1700     static if (GDC_with_SSE2)
1701     {
1702         return __builtin_ia32_divsd(a, b);
1703     }
1704     else version(DigitalMars)
1705     {
1706         // Work-around for https://issues.dlang.org/show_bug.cgi?id=19599
1707         // Note that this is unneeded since DMD >= 2.094.0 at least, haven't investigated again
1708         asm pure nothrow @nogc @trusted { nop;}
1709         a.array[0] = a.array[0] / b.array[0];
1710         return a;
1711     }
1712     else
1713     {
1714         a.ptr[0] /= b.array[0];
1715         return a;
1716     }
1717 }
1718 unittest
1719 {
1720     __m128d a = [2.0, 4.5];
1721     a = _mm_div_sd(a, a);
1722     assert(a.array == [1.0, 4.5]);
1723 }
1724 
1725 /// Extract a 16-bit integer from `v`, selected with `index`.
/// Warning: the returned value is zero-extended to 32 bits.
1727 int _mm_extract_epi16(__m128i v, int index) pure @safe
1728 {
1729     short8 r = cast(short8)v;
1730     return cast(ushort)(r.array[index & 7]);
1731 }
1732 unittest
1733 {
1734     __m128i A = _mm_set_epi16(7, 6, 5, 4, 3, 2, 1, -1);
1735     assert(_mm_extract_epi16(A, 6) == 6);
1736     assert(_mm_extract_epi16(A, 0) == 65535);
1737     assert(_mm_extract_epi16(A, 5 + 8) == 5);
1738 }
1739 
1740 /// Copy `v`, and insert the 16-bit integer `i` at the location specified by `index`.
__m128i _mm_insert_epi16 (__m128i v, int i, int index) pure @trusted
1742 {
1743     short8 r = cast(short8)v;
1744     r.ptr[index & 7] = cast(short)i;
1745     return cast(__m128i)r;
1746 }
1747 unittest
1748 {
1749     __m128i A = _mm_set_epi16(7, 6, 5, 4, 3, 2, 1, 0);
1750     short8 R = cast(short8) _mm_insert_epi16(A, 42, 6);
1751     short[8] correct = [0, 1, 2, 3, 4, 5, 42, 7];
1752     assert(R.array == correct);
1753 }
1754 
1755 
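/// Perform a serializing operation on all load-from-memory instructions that were issued prior to this instruction. 
/// Guarantees that every load instruction that precedes, in program order, the load fence instruction is globally 
/// visible before any load instruction which follows the fence in program order.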
1756 void _mm_lfence() @trusted
1757 {
1758     version(GNU)
1759     {
1760     
1761         static if (GDC_with_SSE2)
1762         {
1763             __builtin_ia32_lfence();
1764         }
1765         else version(X86)
1766         {
1767             asm pure nothrow @nogc @trusted
1768             {
1769                 "lfence;\n" : : : ;
1770             }
1771         }
1772         else
1773             static assert(false);
1774     }
1775     else static if (LDC_with_SSE2)
1776     {
1777         __builtin_ia32_lfence();
1778     }
1779     else static if (DMD_with_asm)
1780     {
1781         asm nothrow @nogc pure @safe
1782         {
1783             lfence;
1784         }
1785     }
1786     else version(LDC)
1787     {
1788         llvm_memory_fence(); // PERF actually generates mfence
1789     }
1790     else
1791         static assert(false);
1792 }
1793 unittest
1794 {
1795     _mm_lfence();
1796 }
1797 
1798 /// Load 128-bits (composed of 2 packed double-precision (64-bit) floating-point elements) from memory.
1799 /// `mem_addr` must be aligned on a 16-byte boundary or a general-protection exception may be generated.
__m128d _mm_load_pd (const(double)* mem_addr) pure @trusted
1801 {
1802     pragma(inline, true);
1803     __m128d* aligned = cast(__m128d*)mem_addr;
1804     return *aligned;
1805 }
1806 unittest
1807 {
1808     align(16) double[2] S = [-5.0, 7.0];
1809     __m128d R = _mm_load_pd(S.ptr);
1810     assert(R.array == S);
1811 }
1812 
1813 /// Load a double-precision (64-bit) floating-point element from memory into both elements of dst.
1814 /// `mem_addr` does not need to be aligned on any particular boundary.
__m128d _mm_load_pd1 (const(double)* mem_addr) pure @trusted
1816 {
1817     double m = *mem_addr;
1818     __m128d r;
1819     r.ptr[0] = m;
1820     r.ptr[1] = m;
1821     return r;
1822 }
1823 unittest
1824 {
1825     double what = 4;
1826     __m128d R = _mm_load_pd1(&what);
1827     double[2] correct = [4.0, 4];
1828     assert(R.array == correct);
1829 }
1830 
1831 /// Load a double-precision (64-bit) floating-point element from memory into the lower of result, and zero the upper 
1832 /// element. `mem_addr` does not need to be aligned on any particular boundary.
1833 __m128d _mm_load_sd (const(double)* mem_addr) pure @trusted
1834 {
1835     double2 r = [0, 0];
1836     r.ptr[0] = *mem_addr;
1837     return r;
1838 }
1839 unittest
1840 {
1841     double x = -42;
1842     __m128d a = _mm_load_sd(&x);
1843     assert(a.array == [-42.0, 0.0]);
1844 }
1845 
1846 /// Load 128-bits of integer data from memory into dst. 
1847 /// `mem_addr` must be aligned on a 16-byte boundary or a general-protection exception may be generated.
__m128i _mm_load_si128 (const(__m128i)* mem_addr) pure @trusted // TODO: shouldn't be @trusted because of alignment, Issue #62
1849 {
1850     pragma(inline, true);
1851     return *mem_addr;
1852 }
1853 unittest
1854 {
1855     align(16) int[4] correct = [-1, 2, 3, 4];
1856     int4 A = cast(int4) _mm_load_si128(cast(__m128i*) correct.ptr);
1857     assert(A.array == correct);
1858 }
1859 
1860 alias _mm_load1_pd = _mm_load_pd1; ///
1861 
1862 /// Load a double-precision (64-bit) floating-point element from memory into the upper element of result, and copy the 
1863 /// lower element from `a` to result. `mem_addr` does not need to be aligned on any particular boundary.
1864 __m128d _mm_loadh_pd (__m128d a, const(double)* mem_addr) pure @trusted
1865 {
1866     pragma(inline, true);
1867     a.ptr[1] = *mem_addr;
1868     return a;
1869 }
1870 unittest
1871 {
1872     double A = 7.0;
1873     __m128d B = _mm_setr_pd(4.0, -5.0);
1874     __m128d R = _mm_loadh_pd(B, &A);
1875     double[2] correct = [ 4.0, 7.0 ];
1876     assert(R.array == correct);
1877 }
1878 
1879 /// Load 64-bit integer from memory into the first element of result. Zero out the other.
// Note: strange signature since the memory doesn't have to be aligned (Issue #60)
1881 __m128i _mm_loadl_epi64 (const(__m128i)* mem_addr) pure @trusted // TODO signature
1882 {
1883     pragma(inline, true);
1884     auto pLong = cast(const(long)*)mem_addr;
1885     long2 r = [0, 0];
1886     r.ptr[0] = *pLong;
1887     return cast(__m128i)(r);
1888 }
1889 unittest
1890 {
1891     long A = 0x7878787870707070;
1892     long2 R = cast(long2) _mm_loadl_epi64(cast(__m128i*)&A);
1893     long[2] correct = [0x7878787870707070, 0];
1894     assert(R.array == correct);
1895 }
1896 
1897 /// Load a double-precision (64-bit) floating-point element from memory into the lower element of result, and copy the 
1898 /// upper element from `a` to result. mem_addr does not need to be aligned on any particular boundary.
1899 __m128d _mm_loadl_pd (__m128d a, const(double)* mem_addr) pure @trusted
1900 {
1901     a.ptr[0] = *mem_addr;
1902     return a;
1903 }
1904 unittest
1905 {
1906     double A = 7.0;
1907     __m128d B = _mm_setr_pd(4.0, -5.0);
1908     __m128d R = _mm_loadl_pd(B, &A);
1909     double[2] correct = [ 7.0, -5.0 ];
1910     assert(R.array == correct);
1911 }
1912 
1913 /// Load 2 double-precision (64-bit) floating-point elements from memory into result in reverse order. 
1914 /// `mem_addr` must be aligned on a 16-byte boundary or a general-protection exception may be generated.
1915 __m128d _mm_loadr_pd (const(double)* mem_addr) pure @trusted
1916 {
1917     __m128d a = *cast(__m128d*)(mem_addr);
1918     __m128d r;
1919     r.ptr[0] = a.array[1];
1920     r.ptr[1] = a.array[0];
1921     return r;
1922 }
1923 unittest
1924 {
1925     align(16) double[2] A = [56.0, -74.0];
1926     __m128d R = _mm_loadr_pd(A.ptr);
1927     double[2] correct = [-74.0, 56.0];
1928     assert(R.array == correct);
1929 }
1930 
1931 /// Load 128-bits (composed of 2 packed double-precision (64-bit) floating-point elements) from memory. 
1932 /// `mem_addr` does not need to be aligned on any particular boundary.
1933 __m128d _mm_loadu_pd (const(double)* mem_addr) pure @trusted
1934 {
1935     pragma(inline, true);
1936     static if (GDC_with_SSE2)
1937     {
1938         return __builtin_ia32_loadupd(mem_addr); 
1939     }
1940     else version(LDC)
1941     {
1942         return loadUnaligned!(double2)(mem_addr);
1943     }
1944     else version(DigitalMars)
1945     {
1946         static if (DMD_with_DSIMD)
1947         {
1948             return cast(__m128d)__simd(XMM.LODUPD, *mem_addr);
1949         }
1950         else static if (SSESizedVectorsAreEmulated)
1951         {
            // Since this vector is emulated, it doesn't have alignment constraints
1953             // and as such we can just cast it.
1954             return *cast(__m128d*)(mem_addr);
1955         }
1956         else
1957         {
1958             __m128d result;
1959             result.ptr[0] = mem_addr[0];
1960             result.ptr[1] = mem_addr[1];
1961             return result;
1962         }
1963     }
1964     else
1965     {
1966         __m128d result;
1967         result.ptr[0] = mem_addr[0];
1968         result.ptr[1] = mem_addr[1];
1969         return result;
1970     }
1971 }
1972 unittest
1973 {
1974     double[2] A = [56.0, -75.0];
1975     __m128d R = _mm_loadu_pd(A.ptr);
1976     double[2] correct = [56.0, -75.0];
1977     assert(R.array == correct);
1978 }
1979 
1980 /// Load 128-bits of integer data from memory. `mem_addr` does not need to be aligned on any particular boundary.
1981 __m128i _mm_loadu_si128 (const(__m128i)* mem_addr) pure @trusted
1982 {
1983     pragma(inline, true);
1984     static if (GDC_with_SSE2)
1985     {
1986         return cast(__m128i) __builtin_ia32_loaddqu(cast(const(char*))mem_addr);
1987     }
1988     else
1989     {
1990         return loadUnaligned!(__m128i)(cast(int*)mem_addr);
1991     }
1992 }
1993 unittest
1994 {
1995     align(16) int[4] correct = [-1, 2, -3, 4];
1996     int4 A = cast(int4) _mm_loadu_si128(cast(__m128i*) correct.ptr);
1997     assert(A.array == correct);
1998 }
1999 
2000 /// Load unaligned 32-bit integer from memory into the first element of result.
2001 __m128i _mm_loadu_si32 (const(void)* mem_addr) pure @trusted
2002 {
2003     pragma(inline, true);
2004     int r = *cast(int*)(mem_addr);
2005     int4 result = [0, 0, 0, 0];
2006     result.ptr[0] = r;
2007     return result;
2008 }
2009 unittest
2010 {
2011     int r = 42;
2012     __m128i A = _mm_loadu_si32(&r);
2013     int[4] correct = [42, 0, 0, 0];
2014     assert(A.array == correct);
2015 }
2016 
2017 /// Multiply packed signed 16-bit integers in `a` and `b`, producing intermediate
2018 /// signed 32-bit integers. Horizontally add adjacent pairs of intermediate 32-bit integers,
2019 /// and pack the results in destination.
2020 __m128i _mm_madd_epi16 (__m128i a, __m128i b) pure @trusted
2021 {
2022     static if (GDC_with_SSE2)
2023     {
2024         return cast(__m128i) __builtin_ia32_pmaddwd128(cast(short8)a, cast(short8)b);
2025     }
2026     else static if (LDC_with_SSE2)
2027     {
2028         return cast(__m128i) __builtin_ia32_pmaddwd128(cast(short8)a, cast(short8)b);
2029     }
2030     else static if (LDC_with_ARM64)
2031     {
2032         int4 pl = vmull_s16(vget_low_s16(cast(short8)a), vget_low_s16(cast(short8)b));
2033         int4 ph = vmull_s16(vget_high_s16(cast(short8)a), vget_high_s16(cast(short8)b));
2034         int2 rl = vpadd_s32(vget_low_s32(pl), vget_high_s32(pl));
2035         int2 rh = vpadd_s32(vget_low_s32(ph), vget_high_s32(ph));
2036         return vcombine_s32(rl, rh);
2037     }
2038     else
2039     {
2040         short8 sa = cast(short8)a;
2041         short8 sb = cast(short8)b;
2042         int4 r;
2043         foreach(i; 0..4)
2044         {
2045             r.ptr[i] = sa.array[2*i] * sb.array[2*i] + sa.array[2*i+1] * sb.array[2*i+1];
2046         }
2047         return r;
2048     }
2049 }
2050 unittest
2051 {
2052     short8 A = [0, 1, 2, 3, -32768, -32768, 32767, 32767];
2053     short8 B = [0, 1, 2, 3, -32768, -32768, 32767, 32767];
2054     int4 R = _mm_madd_epi16(cast(__m128i)A, cast(__m128i)B);
2055     int[4] correct = [1, 13, -2147483648, 2*32767*32767];
2056     assert(R.array == correct);
2057 }
2058 
2059 /// Conditionally store 8-bit integer elements from `a` into memory using `mask`
2060 /// (elements are not stored when the highest bit is not set in the corresponding element)
2061 /// and a non-temporal memory hint. `mem_addr` does not need to be aligned on any particular
2062 /// boundary.
2063 void _mm_maskmoveu_si128 (__m128i a, __m128i mask, void* mem_addr) @trusted
2064 {
2065     static if (GDC_with_SSE2)
2066     {    
2067         return __builtin_ia32_maskmovdqu(cast(ubyte16)a, cast(ubyte16)mask, cast(char*)mem_addr);
2068     }
2069     else static if (LDC_with_SSE2)
2070     {
2071         return __builtin_ia32_maskmovdqu(cast(byte16)a, cast(byte16)mask, cast(char*)mem_addr);
2072     }
2073     else static if (LDC_with_ARM64)
2074     {
2075         // PERF: catastrophic on ARM32
2076         byte16 bmask  = cast(byte16)mask;
2077         byte16 shift = 7;
2078         bmask = bmask >> shift; // sign-extend to have a 0xff or 0x00 mask
2079         mask = cast(__m128i) bmask;
2080         __m128i dest = loadUnaligned!__m128i(cast(int*)mem_addr);
2081         dest = (a & mask) | (dest & ~mask);
2082         storeUnaligned!__m128i(dest, cast(int*)mem_addr);
2083     }
2084     else
2085     {
2086         byte16 b = cast(byte16)a;
2087         byte16 m = cast(byte16)mask;
2088         byte* dest = cast(byte*)(mem_addr);
2089         foreach(j; 0..16)
2090         {
2091             if (m.array[j] & 128)
2092             {
2093                 dest[j] = b.array[j];
2094             }
2095         }
2096     }
2097 }
2098 unittest
2099 {
2100     ubyte[16] dest =           [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42];
2101     __m128i mask = _mm_setr_epi8(0,-1, 0,-1,-1, 1,-1,-1, 0,-1,-4,-1,-1, 0,-127, 0);
2102     __m128i A    = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15);
2103     _mm_maskmoveu_si128(A, mask, dest.ptr);
2104     ubyte[16] correct =        [42, 1,42, 3, 4,42, 6, 7,42, 9,10,11,12,42,14,42];
2105     assert(dest == correct);
2106 }
2107 
2108 /// Compare packed signed 16-bit integers in `a` and `b`, and return packed maximum values.
2109 __m128i _mm_max_epi16 (__m128i a, __m128i b) pure @safe
2110 {
2111     static if (GDC_with_SSE2)
2112     {
2113         return cast(__m128i) __builtin_ia32_pmaxsw128(cast(short8)a, cast(short8)b);
2114     }
2115     else version(LDC)
2116     {
2117         // x86: pmaxsw since LDC 1.0 -O1
        // ARM: smax.8h since LDC 1.5 -O1
2119         short8 sa = cast(short8)a;
2120         short8 sb = cast(short8)b;
2121         short8 greater = greaterMask!short8(sa, sb);
2122         return cast(__m128i)( (greater & sa) | (~greater & sb) );
2123     }
2124     else
2125     {
2126         __m128i lowerShorts = _mm_cmpgt_epi16(a, b); // ones where a should be selected, b else
2127         __m128i aTob = _mm_xor_si128(a, b); // a ^ (a ^ b) == b
2128         __m128i mask = _mm_and_si128(aTob, lowerShorts);
2129         return _mm_xor_si128(b, mask);
2130     }
2131 }
2132 unittest
2133 {
2134     short8 R = cast(short8) _mm_max_epi16(_mm_setr_epi16(32767, 1, -4, -8, 9,  7, 0,-57),
2135                                           _mm_setr_epi16(-4,-8,  9,  7, 0,-32768, 0,  0));
2136     short[8] correct =                                  [32767, 1,  9,  7, 9,  7, 0,  0];
2137     assert(R.array == correct);
2138 }
2139 
2140 /// Compare packed unsigned 8-bit integers in a and b, and return packed maximum values.
2141 __m128i _mm_max_epu8 (__m128i a, __m128i b) pure @safe
2142 {
2143     version(LDC)
2144     {
2145         // x86: pmaxub since LDC 1.0.0 -O1
2146         // ARM64: umax.16b since LDC 1.5.0 -O1
2147         // PERF: catastrophic on ARM32
2148         ubyte16 sa = cast(ubyte16)a;
2149         ubyte16 sb = cast(ubyte16)b;
2150         ubyte16 greater = cast(ubyte16) greaterMask!ubyte16(sa, sb);
2151         return cast(__m128i)( (greater & sa) | (~greater & sb) );
2152     }
2153     else
2154     {
2155         __m128i value128 = _mm_set1_epi8(-128);
2156         __m128i higher = _mm_cmpgt_epi8(_mm_add_epi8(a, value128), _mm_add_epi8(b, value128)); // signed comparison
2157         __m128i aTob = _mm_xor_si128(a, b); // a ^ (a ^ b) == b
2158         __m128i mask = _mm_and_si128(aTob, higher);
2159         return _mm_xor_si128(b, mask);
2160     }
2161 }
2162 unittest
2163 {
2164     byte16 R = cast(byte16) _mm_max_epu8(_mm_setr_epi8(45, 1, -4, -8, 9,  7, 0,-57, -4,-8,  9,  7, 0,-57, 0,  0),
2165                                          _mm_setr_epi8(-4,-8,  9,  7, 0,-57, 0,  0, 45, 1, -4, -8, 9,  7, 0,-57));
2166     byte[16] correct =                                [-4,-8, -4, -8, 9,-57, 0,-57, -4,-8, -4, -8, 9,-57, 0,-57];
2167     assert(R.array == correct);
2168 }
2169 
2170 /// Compare packed double-precision (64-bit) floating-point elements in `a` and `b`, and return packed maximum values.
2171 __m128d _mm_max_pd (__m128d a, __m128d b) pure @trusted
2172 {
2173     static if (GDC_with_SSE2)
2174     {
2175         return __builtin_ia32_maxpd(a, b);
2176     }
2177     else
2178     {
2179         // x86: Generates maxpd starting with LDC 1.9 -O2
2180         a.ptr[0] = (a.array[0] > b.array[0]) ? a.array[0] : b.array[0];
2181         a.ptr[1] = (a.array[1] > b.array[1]) ? a.array[1] : b.array[1];
2182         return a;
2183     }
2184 }
2185 unittest
2186 {
2187     __m128d A = _mm_setr_pd(4.0, 1.0);
2188     __m128d B = _mm_setr_pd(1.0, 8.0);
2189     __m128d M = _mm_max_pd(A, B);
2190     assert(M.array[0] == 4.0);
2191     assert(M.array[1] == 8.0);
2192 }
2193 
2194 /// Compare the lower double-precision (64-bit) floating-point elements in `a` and `b`, store the maximum value in the 
2195 /// lower element of result, and copy the upper element from `a` to the upper element of result.
2196 __m128d _mm_max_sd (__m128d a, __m128d b) pure @trusted
2197 {
2198     static if (GDC_with_SSE2)
2199     {
2200         return __builtin_ia32_maxsd(a, b);
2201     }
2202     else
2203     {
        // Generates maxsd starting with LDC 1.3
        __m128d r = a;
2206         r.ptr[0] = (a.array[0] > b.array[0]) ? a.array[0] : b.array[0];
2207         return r;
2208     }
2209 }
2210 unittest
2211 {
2212     __m128d A = _mm_setr_pd(1.0, 1.0);
2213     __m128d B = _mm_setr_pd(4.0, 2.0);
2214     __m128d M = _mm_max_sd(A, B);
2215     assert(M.array[0] == 4.0);
2216     assert(M.array[1] == 1.0);
2217 }
2218 
2219 /// Perform a serializing operation on all load-from-memory and store-to-memory instructions that were issued prior to 
2220 /// this instruction. Guarantees that every memory access that precedes, in program order, the memory fence instruction 
2221 /// is globally visible before any memory instruction which follows the fence in program order.
2222 void _mm_mfence() @trusted
2223 {
2224     version(GNU)
2225     {
2226         static if (GDC_with_SSE2)
2227         {
2228             __builtin_ia32_mfence();
2229         }
2230         else version(X86)
2231         {
2232             asm pure nothrow @nogc @trusted
2233             {
2234                 "mfence;\n" : : : ;
2235             }
2236         }
2237         else
2238             static assert(false);
2239     }
2240     else static if (LDC_with_SSE2)
2241     {
2242         __builtin_ia32_mfence();
2243     }
2244     else static if (DMD_with_asm)
2245     {
2246         asm nothrow @nogc pure @safe
2247         {
2248             mfence;
2249         }
2250     }
    else version(LDC)
    {
        // Note: will generate the DMB instruction on ARM
        llvm_memory_fence();
    }
2259     else
2260         static assert(false);
2261 }
2262 unittest
2263 {
2264     _mm_mfence();
2265 }
2266 
2267 /// Compare packed signed 16-bit integers in `a` and `b`, and return packed minimum values.
2268 __m128i _mm_min_epi16 (__m128i a, __m128i b) pure @safe
2269 {
2270     static if (GDC_with_SSE2)
2271     {
2272         return cast(__m128i) __builtin_ia32_pminsw128(cast(short8)a, cast(short8)b);
2273     }
2274     else version(LDC)
2275     {
2276         // x86: pminsw since LDC 1.0 -O1
        // ARM64: smin.8h since LDC 1.5 -O1
2278         short8 sa = cast(short8)a;
2279         short8 sb = cast(short8)b;
2280         short8 greater = greaterMask!short8(sa, sb);
2281         return cast(__m128i)( (~greater & sa) | (greater & sb) );
2282     }
2283     else
2284     {
2285         __m128i lowerShorts = _mm_cmplt_epi16(a, b); // ones where a should be selected, b else
2286         __m128i aTob = _mm_xor_si128(a, b); // a ^ (a ^ b) == b
2287         __m128i mask = _mm_and_si128(aTob, lowerShorts);
2288         return _mm_xor_si128(b, mask);
2289     }
2290 }
2291 unittest
2292 {
2293     short8 R = cast(short8) _mm_min_epi16(_mm_setr_epi16(45, 1, -4, -8, 9,  7, 0,-32768),
2294                                           _mm_setr_epi16(-4,-8,  9,  7, 0,-57, 0,  0));
2295     short[8] correct =                                  [-4,-8, -4, -8, 0,-57, 0, -32768];
2296     assert(R.array == correct);
2297 }
2298 
2299 /// Compare packed unsigned 8-bit integers in `a` and `b`, and return packed minimum values.
2300 __m128i _mm_min_epu8 (__m128i a, __m128i b) pure @safe
2301 {
2302     version(LDC)
2303     {
2304         // x86: pminub since LDC 1.0.0 -O1
2305         // ARM: umin.16b since LDC 1.5.0 -O1
2306         // PERF: catastrophic on ARM32
2307         ubyte16 sa = cast(ubyte16)a;
2308         ubyte16 sb = cast(ubyte16)b;
2309         ubyte16 greater = cast(ubyte16) greaterMask!ubyte16(sa, sb);
2310         return cast(__m128i)( (~greater & sa) | (greater & sb) );
2311     }
2312     else
2313     {
2314         __m128i value128 = _mm_set1_epi8(-128);
2315         __m128i lower = _mm_cmplt_epi8(_mm_add_epi8(a, value128), _mm_add_epi8(b, value128)); // signed comparison
2316         __m128i aTob = _mm_xor_si128(a, b); // a ^ (a ^ b) == b
2317         __m128i mask = _mm_and_si128(aTob, lower);
2318         return _mm_xor_si128(b, mask);
2319     }
2320 }
2321 unittest
2322 {
2323     byte16 R = cast(byte16) _mm_min_epu8(_mm_setr_epi8(45, 1, -4, -8, 9,  7, 0,-57, -4,-8,  9,  7, 0,-57, 0,  0),
2324                                          _mm_setr_epi8(-4,-8,  9,  7, 0,-57, 0,  0, 45, 1, -4, -8, 9,  7, 0,-57));
2325     byte[16] correct =                                [45, 1,  9,  7, 0,  7, 0,  0, 45, 1,  9,  7, 0,  7, 0,  0];
2326     assert(R.array == correct);
2327 }
2328 
2329 /// Compare packed double-precision (64-bit) floating-point elements in `a` and `b`, and return packed minimum values.
2330 __m128d _mm_min_pd (__m128d a, __m128d b) pure @trusted
2331 {
2332     static if (GDC_with_SSE2)
2333     {
2334         return __builtin_ia32_minpd(a, b);
2335     }
2336     else
2337     {
2338         // Generates minpd starting with LDC 1.9
2339         a.ptr[0] = (a.array[0] < b.array[0]) ? a.array[0] : b.array[0];
2340         a.ptr[1] = (a.array[1] < b.array[1]) ? a.array[1] : b.array[1];
2341         return a;
2342     }
2343 }
2344 unittest
2345 {
2346     __m128d A = _mm_setr_pd(1.0, 2.0);
2347     __m128d B = _mm_setr_pd(4.0, 1.0);
2348     __m128d M = _mm_min_pd(A, B);
2349     assert(M.array[0] == 1.0);
2350     assert(M.array[1] == 1.0);
2351 }
2352 
2353 /// Compare the lower double-precision (64-bit) floating-point elements in `a` and `b`, store the minimum value in 
2354 /// the lower element of result, and copy the upper element from `a` to the upper element of result.
__m128d _mm_min_sd (__m128d a, __m128d b) pure @trusted
2356 {
2357     static if (GDC_with_SSE2)
2358     {
2359         return __builtin_ia32_minsd(a, b);
2360     }
2361     else
2362     {
2363         // Generates minsd starting with LDC 1.3
2364         __m128d r = a;
        r.ptr[0] = (a.array[0] < b.array[0]) ? a.array[0] : b.array[0];
2366         return r;
2367     }
2368 }
2369 unittest
2370 {
2371     __m128d A = _mm_setr_pd(1.0, 3.0);
2372     __m128d B = _mm_setr_pd(4.0, 2.0);
2373     __m128d M = _mm_min_sd(A, B);
2374     assert(M.array[0] == 1.0);
2375     assert(M.array[1] == 3.0);
2376 }
2377 
2378 /// Copy the lower 64-bit integer in `a` to the lower element of result, and zero the upper element.
2379 __m128i _mm_move_epi64 (__m128i a) pure @trusted
2380 {
2381     static if (GDC_with_SSE2)
2382     {
2383         // slightly better with GDC -O0
2384         return cast(__m128i) __builtin_ia32_movq128(cast(long2)a); 
2385     }
2386     else
2387     {
2388         long2 result = [ 0, 0 ];
2389         long2 la = cast(long2) a;
2390         result.ptr[0] = la.array[0];
2391         return cast(__m128i)(result);
2392     }
2393 }
2394 unittest
2395 {
2396     long2 A = [13, 47];
2397     long2 B = cast(long2) _mm_move_epi64( cast(__m128i)A );
2398     long[2] correct = [13, 0];
2399     assert(B.array == correct);
2400 }
2401 
2402 /// Move the lower double-precision (64-bit) floating-point element from `b` to the lower element of result, and copy 
2403 /// the upper element from `a` to the upper element of dst.
2404 __m128d _mm_move_sd (__m128d a, __m128d b) pure @trusted
2405 {
2406     static if (GDC_with_SSE2)
2407     {
2408         return __builtin_ia32_movsd(a, b); 
2409     }
2410     else
2411     {
2412         b.ptr[1] = a.array[1];
2413         return b;
2414     }
2415 }
2416 unittest
2417 {
2418     double2 A = [13.0, 47.0];
2419     double2 B = [34.0, 58.0];
2420     double2 C = _mm_move_sd(A, B);
2421     double[2] correct = [34.0, 47.0];
2422     assert(C.array == correct);
2423 }
2424 
/// Create mask from the most significant bit of each 8-bit element in `a`.
2426 int _mm_movemask_epi8 (__m128i a) pure @trusted
2427 {
2428     // PERF: Not possible in D_SIMD because of https://issues.dlang.org/show_bug.cgi?id=8047
2429     static if (GDC_with_SSE2)
2430     {
2431         return __builtin_ia32_pmovmskb128(cast(ubyte16)a);
2432     }
2433     else static if (LDC_with_SSE2)
2434     {
2435         return __builtin_ia32_pmovmskb128(cast(byte16)a);
2436     }
2437     else static if (LDC_with_ARM64)
2438     {
2439         // Solution from https://stackoverflow.com/questions/11870910/sse-mm-movemask-epi8-equivalent-method-for-arm-neon
        // The other two solutions there led to intrinsics LLVM could not find, which took a long time to diagnose.
        // So there might be something a bit faster, but this one is reasonable and branchless.
2442         byte8 mask_shift;
2443         mask_shift.ptr[0] = 7;
2444         mask_shift.ptr[1] = 6;
2445         mask_shift.ptr[2] = 5;
2446         mask_shift.ptr[3] = 4;
2447         mask_shift.ptr[4] = 3;
2448         mask_shift.ptr[5] = 2;
2449         mask_shift.ptr[6] = 1;
2450         mask_shift.ptr[7] = 0;
2451         byte8 mask_and = byte8(-128);
2452         byte8 lo = vget_low_u8(cast(byte16)a);
2453         byte8 hi = vget_high_u8(cast(byte16)a);
2454         lo = vand_u8(lo, mask_and);
2455         lo = vshr_u8(lo, mask_shift);
2456         hi = vand_u8(hi, mask_and);
2457         hi = vshr_u8(hi, mask_shift);
2458         lo = vpadd_u8(lo,lo);
2459         lo = vpadd_u8(lo,lo);
2460         lo = vpadd_u8(lo,lo);
2461         hi = vpadd_u8(hi,hi);
2462         hi = vpadd_u8(hi,hi);
2463         hi = vpadd_u8(hi,hi);
2464         return (cast(ubyte)(hi[0]) << 8) | cast(ubyte)(lo[0]);
2465     }
2466     else
2467     {
2468         byte16 ai = cast(byte16)a;
2469         int r = 0;
2470         foreach(bit; 0..16)
2471         {
2472             if (ai.array[bit] < 0) r += (1 << bit);
2473         }
2474         return r;
2475     }
2476 }
2477 unittest
2478 {
2479     assert(0x9C36 == _mm_movemask_epi8(_mm_set_epi8(-1, 1, 2, -3, -1, -1, 4, 8, 127, 0, -1, -1, 0, -1, -1, 0)));
2480 }
2481 
2482 /// Set each bit of mask result based on the most significant bit of the corresponding packed double-precision (64-bit) 
/// floating-point element in `v`.
2484 int _mm_movemask_pd(__m128d v) pure @safe
2485 {
2486     // PERF: Not possible in D_SIMD because of https://issues.dlang.org/show_bug.cgi?id=8047
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_movmskpd(v);
    }
    else static if (LDC_with_SSE2)
    {
        return __builtin_ia32_movmskpd(v);
    }
2499     else
2500     {
2501         long2 lv = cast(long2)v;
2502         int r = 0;
2503         if (lv.array[0] < 0) r += 1;
2504         if (lv.array[1] < 0) r += 2;
2505         return r;
2506     }
2507 }
2508 unittest
2509 {
2510     __m128d A = cast(__m128d) _mm_set_epi64x(-1, 0);
2511     assert(_mm_movemask_pd(A) == 2);
2512 }
2513 
2514 /// Copy the lower 64-bit integer in `v`.
2515 __m64 _mm_movepi64_pi64 (__m128i v) pure @safe
2516 {
2517     long2 lv = cast(long2)v;
2518     return long1(lv.array[0]);
2519 }
2520 unittest
2521 {
2522     __m128i A = _mm_set_epi64x(-1, -2);
2523     __m64 R = _mm_movepi64_pi64(A);
2524     assert(R.array[0] == -2);
2525 }
2526 
2527 /// Copy the 64-bit integer `a` to the lower element of dest, and zero the upper element.
2528 __m128i _mm_movpi64_epi64 (__m64 a) pure @trusted
2529 {
2530     long2 r;
2531     r.ptr[0] = a.array[0];
2532     r.ptr[1] = 0;
2533     return cast(__m128i)r;
2534 }
2535 
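/// Multiply the low unsigned 32-bit integers from each packed 64-bit element in `a` and `b`, 
/// and return the unsigned 64-bit results.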
2536 // Note: generates pmuludq in LDC with -O1
2537 __m128i _mm_mul_epu32 (__m128i a, __m128i b) pure @trusted
2538 {
2539     __m128i zero = _mm_setzero_si128();
2540 
2541     static if (__VERSION__ >= 2088)
2542     {
2543         // Need LLVM9 to avoid this shufflevector
2544         long2 la, lb;
2545         la.ptr[0] = cast(uint)a.array[0];
2546         la.ptr[1] = cast(uint)a.array[2];
2547         lb.ptr[0] = cast(uint)b.array[0];
2548         lb.ptr[1] = cast(uint)b.array[2];
2549     }
2550     else
2551     {
2552         long2 la = cast(long2) shufflevector!(int4, 0, 4, 2, 6)(a, zero);
2553         long2 lb = cast(long2) shufflevector!(int4, 0, 4, 2, 6)(b, zero);
2554     }
2555 
2556     version(DigitalMars)
2557     {
2558         // DMD has no long2 mul
2559         // long2 mul not supported before LDC 1.5
2560         la.ptr[0] *= lb.array[0];
2561         la.ptr[1] *= lb.array[1];
2562         return cast(__m128i)(la);
2563     }
2564     else
2565     {
2566         static if (__VERSION__ >= 2076)
2567         {
2568             return cast(__m128i)(la * lb);
2569         }
2570         else
2571         {
2572             // long2 mul not supported before LDC 1.5
2573             la.ptr[0] *= lb.array[0];
2574             la.ptr[1] *= lb.array[1];
2575             return cast(__m128i)(la);
2576         }
2577     }
2578 }
2579 unittest
2580 {
2581     __m128i A = _mm_set_epi32(42, 0xDEADBEEF, 42, 0xffffffff);
2582     __m128i B = _mm_set_epi32(42, 0xCAFEBABE, 42, 0xffffffff);
2583     __m128i C = _mm_mul_epu32(A, B);
2584     long2 LC = cast(long2)C;
2585     assert(LC.array[0] == 18446744065119617025uL);
2586     assert(LC.array[1] == 12723420444339690338uL);
2587 }
2588 
2589 /// Multiply packed double-precision (64-bit) floating-point elements in `a` and `b`, and return the results. 
2590 __m128d _mm_mul_pd(__m128d a, __m128d b) pure @safe
2591 {
2592     pragma(inline, true);
2593     return a * b;
2594 }
2595 unittest
2596 {
2597     __m128d a = [-2.0, 1.5];
2598     a = _mm_mul_pd(a, a);
2599     assert(a.array == [4.0, 2.25]);
2600 }
2601 
2602 /// Multiply the lower double-precision (64-bit) floating-point element in `a` and `b`, store the result in the lower 
2603 /// element of result, and copy the upper element from `a` to the upper element of result.
2604 __m128d _mm_mul_sd(__m128d a, __m128d b) pure @trusted
2605 {
2606     version(DigitalMars)
2607     {    
2608         // Work-around for https://issues.dlang.org/show_bug.cgi?id=19599
2609         // Note that this is unneeded since DMD >= 2.094.0 at least, haven't investigated again
2610         asm pure nothrow @nogc @trusted { nop;}
2611         a.array[0] = a.array[0] * b.array[0];
2612         return a;
2613     }
2614     else static if (GDC_with_SSE2)
2615     {
2616         return __builtin_ia32_mulsd(a, b);
2617     }
2618     else
2619     {
2620         a.ptr[0] *= b.array[0];
2621         return a;
2622     }
2623 }
2624 unittest
2625 {
2626     __m128d a = [-2.0, 1.5];
2627     a = _mm_mul_sd(a, a);
2628     assert(a.array == [4.0, 1.5]);
2629 }
2630 
2631 /// Multiply the low unsigned 32-bit integers from `a` and `b`, 
2632 /// and get an unsigned 64-bit result.
2633 __m64 _mm_mul_su32 (__m64 a, __m64 b) pure @safe
2634 {
2635     return to_m64(_mm_mul_epu32(to_m128i(a), to_m128i(b)));
2636 }
2637 unittest
2638 {
2639     __m64 A = _mm_set_pi32(42, 0xDEADBEEF);
2640     __m64 B = _mm_set_pi32(42, 0xCAFEBABE);
2641     __m64 C = _mm_mul_su32(A, B);
2642     assert(C.array[0] == 0xDEADBEEFuL * 0xCAFEBABEuL);
2643 }
2644 
2645 /// Multiply the packed signed 16-bit integers in `a` and `b`, producing intermediate 32-bit integers, and return the 
2646 /// high 16 bits of the intermediate integers.
2647 __m128i _mm_mulhi_epi16 (__m128i a, __m128i b) pure @trusted
2648 {
2649     static if (GDC_with_SSE2)
2650     {
2651         return cast(__m128i) __builtin_ia32_pmulhw128(cast(short8)a, cast(short8)b);
2652     }
2653     else static if (LDC_with_SSE2)
2654     {
2655         return cast(__m128i) __builtin_ia32_pmulhw128(cast(short8)a, cast(short8)b);
2656     }
2657     else
2658     {
2659         // ARM64: LDC 1.5 -O2 or later gives a nice sequence with 2 x ext.16b, 2 x smull.4s and shrn.4h shrn2.8h
2660         //        PERF: it seems the simde solution has one less instruction in ARM64.
2661         // PERF: Catastrophic in ARM32.
2662         short8 sa = cast(short8)a;
2663         short8 sb = cast(short8)b;
2664         short8 r = void;
2665         r.ptr[0] = (sa.array[0] * sb.array[0]) >> 16;
2666         r.ptr[1] = (sa.array[1] * sb.array[1]) >> 16;
2667         r.ptr[2] = (sa.array[2] * sb.array[2]) >> 16;
2668         r.ptr[3] = (sa.array[3] * sb.array[3]) >> 16;
2669         r.ptr[4] = (sa.array[4] * sb.array[4]) >> 16;
2670         r.ptr[5] = (sa.array[5] * sb.array[5]) >> 16;
2671         r.ptr[6] = (sa.array[6] * sb.array[6]) >> 16;
2672         r.ptr[7] = (sa.array[7] * sb.array[7]) >> 16;
2673         return cast(__m128i)r;
2674     }
2675 }
2676 unittest
2677 {
2678     __m128i A = _mm_setr_epi16(0, -16, 2, 3, 4, 8, 16, 7);
2679     __m128i B = _mm_set1_epi16(16384);
2680     short8 R = cast(short8)_mm_mulhi_epi16(A, B);
2681     short[8] correct = [0, -4, 0, 0, 1, 2, 4, 1];
2682     assert(R.array == correct);
2683 }
2684 
2685 /// Multiply the packed unsigned 16-bit integers in `a` and `b`, producing intermediate 32-bit integers, and return the 
2686 /// high 16 bits of the intermediate integers.
2687 __m128i _mm_mulhi_epu16 (__m128i a, __m128i b) pure @trusted
2688 {
2689     static if (GDC_with_SSE2)
2690     {
2691         return cast(__m128i) __builtin_ia32_pmulhuw128(cast(short8)a, cast(short8)b);
2692     }
2693     else static if (LDC_with_SSE2)
2694     {
2695         return cast(__m128i) __builtin_ia32_pmulhuw128(cast(short8)a, cast(short8)b);
2696     }
2697     else
2698     {
2699         // ARM64: LDC 1.5 -O2 or later gives a nice sequence with 2 x ext.16b, 2 x umull.4s and shrn.4h shrn2.8h
2700         //      it seems the simde solution has one less instruction in ARM64
2701         // PERF: Catastrophic in ARM32.
2702         short8 sa = cast(short8)a;
2703         short8 sb = cast(short8)b;
2704         short8 r = void;
2705         r.ptr[0] = cast(short)( (cast(ushort)sa.array[0] * cast(ushort)sb.array[0]) >> 16 );
2706         r.ptr[1] = cast(short)( (cast(ushort)sa.array[1] * cast(ushort)sb.array[1]) >> 16 );
2707         r.ptr[2] = cast(short)( (cast(ushort)sa.array[2] * cast(ushort)sb.array[2]) >> 16 );
2708         r.ptr[3] = cast(short)( (cast(ushort)sa.array[3] * cast(ushort)sb.array[3]) >> 16 );
2709         r.ptr[4] = cast(short)( (cast(ushort)sa.array[4] * cast(ushort)sb.array[4]) >> 16 );
2710         r.ptr[5] = cast(short)( (cast(ushort)sa.array[5] * cast(ushort)sb.array[5]) >> 16 );
2711         r.ptr[6] = cast(short)( (cast(ushort)sa.array[6] * cast(ushort)sb.array[6]) >> 16 );
2712         r.ptr[7] = cast(short)( (cast(ushort)sa.array[7] * cast(ushort)sb.array[7]) >> 16 );
2713         return cast(__m128i)r;
2714     }
2715 }
2716 unittest
2717 {
2718     __m128i A = _mm_setr_epi16(0, -16, 2, 3, 4, 8, 16, 7);
2719     __m128i B = _mm_set1_epi16(16384);
2720     short8 R = cast(short8)_mm_mulhi_epu16(A, B);
2721     short[8] correct = [0, 0x3FFC, 0, 0, 1, 2, 4, 1];
2722     assert(R.array == correct);
2723 }
2724 
2725 /// Multiply the packed 16-bit integers in `a` and `b`, producing intermediate 32-bit integers, and return the low 16 
2726 /// bits of the intermediate integers.
2727 __m128i _mm_mullo_epi16 (__m128i a, __m128i b) pure @safe
2728 {
2729     return cast(__m128i)(cast(short8)a * cast(short8)b);
2730 }
2731 unittest
2732 {
2733     __m128i A = _mm_setr_epi16(16384, -16, 0,      3, 4, 1, 16, 7);
2734     __m128i B = _mm_set1_epi16(16384);
2735     short8 R = cast(short8)_mm_mullo_epi16(A, B);
2736     short[8] correct = [0, 0, 0, -16384, 0, 16384, 0, -16384];
2737     assert(R.array == correct);
2738 }
2739 
2740 /// Compute the bitwise OR of packed double-precision (64-bit) floating-point elements in `a` and `b`.
2741 __m128d _mm_or_pd (__m128d a, __m128d b) pure @safe
2742 {
2743     pragma(inline, true);
2744     return cast(__m128d)( cast(__m128i)a | cast(__m128i)b );
2745 }
2746 
2747 /// Compute the bitwise OR of 128 bits (representing integer data) in `a` and `b`.
2748 __m128i _mm_or_si128 (__m128i a, __m128i b) pure @safe
2749 {
2750     pragma(inline, true);
2751     return a | b;
2752 }
2753 
2754 /// Convert packed signed 32-bit integers from `a` and `b` to packed 16-bit integers using signed saturation.
2755 __m128i _mm_packs_epi32 (__m128i a, __m128i b) pure @trusted
2756 {
2757     static if (GDC_with_SSE2)
2758     {
2759         return cast(__m128i) __builtin_ia32_packssdw128(a, b);
2760     }    
2761     else static if (LDC_with_SSE2)
2762     {
2763         return cast(__m128i) __builtin_ia32_packssdw128(a, b);
2764     }
2765     else static if (LDC_with_ARM64)
2766     {
2767         short4 ra = vqmovn_s32(cast(int4)a);
2768         short4 rb = vqmovn_s32(cast(int4)b);
2769         return cast(__m128i)vcombine_s16(ra, rb);
2770     }
2771     else
2772     {
2773         // PERF: catastrophic on ARM32
2774         short8 r;
2775         r.ptr[0] = saturateSignedIntToSignedShort(a.array[0]);
2776         r.ptr[1] = saturateSignedIntToSignedShort(a.array[1]);
2777         r.ptr[2] = saturateSignedIntToSignedShort(a.array[2]);
2778         r.ptr[3] = saturateSignedIntToSignedShort(a.array[3]);
2779         r.ptr[4] = saturateSignedIntToSignedShort(b.array[0]);
2780         r.ptr[5] = saturateSignedIntToSignedShort(b.array[1]);
2781         r.ptr[6] = saturateSignedIntToSignedShort(b.array[2]);
2782         r.ptr[7] = saturateSignedIntToSignedShort(b.array[3]);
2783         return cast(__m128i)r;
2784     }
2785 }
2786 unittest
2787 {
2788     __m128i A = _mm_setr_epi32(100000, -100000, 1000, 0);
2789     short8 R = cast(short8) _mm_packs_epi32(A, A);
2790     short[8] correct = [32767, -32768, 1000, 0, 32767, -32768, 1000, 0];
2791     assert(R.array == correct);
2792 }
2793 
2794 /// Convert packed signed 16-bit integers from `a` and `b` to packed 8-bit integers using signed saturation.
2795 __m128i _mm_packs_epi16 (__m128i a, __m128i b) pure @trusted
2796 {
2797     static if (GDC_with_SSE2)
2798     {
2799         return cast(__m128i) __builtin_ia32_packsswb128(cast(short8)a, cast(short8)b);
2800     }
2801     else static if (LDC_with_SSE2)
2802     {
2803         return cast(__m128i) __builtin_ia32_packsswb128(cast(short8)a, cast(short8)b);
2804     }
2805     else static if (LDC_with_ARM64)
2806     {
        // generates a nice pair of sqxtn.8b + sqxtn2 since LDC 1.5 -O2
2808         byte8 ra = vqmovn_s16(cast(short8)a);
2809         byte8 rb = vqmovn_s16(cast(short8)b);
2810         return cast(__m128i)vcombine_s8(ra, rb);
2811     }
2812     else
2813     {
2814         // PERF: ARM32 is missing
2815         byte16 r;
2816         short8 sa = cast(short8)a;
2817         short8 sb = cast(short8)b;
2818         foreach(i; 0..8)
2819             r.ptr[i] = saturateSignedWordToSignedByte(sa.array[i]);
2820         foreach(i; 0..8)
2821             r.ptr[i+8] = saturateSignedWordToSignedByte(sb.array[i]);
2822         return cast(__m128i)r;
2823     }
2824 }
2825 unittest
2826 {
2827     __m128i A = _mm_setr_epi16(1000, -1000, 1000, 0, 256, -129, 254, 0);
2828     byte16 R = cast(byte16) _mm_packs_epi16(A, A);
2829     byte[16] correct = [127, -128, 127, 0, 127, -128, 127, 0,
2830                         127, -128, 127, 0, 127, -128, 127, 0];
2831     assert(R.array == correct);
2832 }
2833 
2834 /// Convert packed signed 16-bit integers from `a` and `b` to packed 8-bit integers using unsigned saturation.
2835 __m128i _mm_packus_epi16 (__m128i a, __m128i b) pure @trusted
2836 {
2837     static if (GDC_with_SSE2)
2838     {
2839         return cast(__m128i) __builtin_ia32_packuswb128(cast(short8)a, cast(short8)b);
2840     }
2841     else static if (LDC_with_SSE2)
2842     {
2843         return cast(__m128i) __builtin_ia32_packuswb128(cast(short8)a, cast(short8)b);
2844     }
2845     else static if (LDC_with_ARM64)
2846     {
        // generates a nice pair of sqxtun + sqxtun2 since LDC 1.5 -O2
2848         byte8 ra = vqmovun_s16(cast(short8)a);
2849         byte8 rb = vqmovun_s16(cast(short8)b);
2850         return cast(__m128i)vcombine_s8(ra, rb);
2851     }
2852     else
2853     {
2854         short8 sa = cast(short8)a;
2855         short8 sb = cast(short8)b;
2856         ubyte[16] result = void;
2857         for (int i = 0; i < 8; ++i)
2858         {
2859             short s = sa[i];
2860             if (s < 0) s = 0;
2861             if (s > 255) s = 255;
2862             result[i] = cast(ubyte)s;
2863 
2864             s = sb[i];
2865             if (s < 0) s = 0;
2866             if (s > 255) s = 255;
2867             result[i+8] = cast(ubyte)s;
2868         }
2869         return cast(__m128i) loadUnaligned!(byte16)(cast(byte*)result.ptr);
2870     }
2871 }
2872 unittest
2873 {
2874     __m128i A = _mm_setr_epi16(-10, 400, 0, 256, 255, 2, 1, 0);
2875     byte16 AA = cast(byte16) _mm_packus_epi16(A, A);
2876     static immutable ubyte[16] correctResult = [0, 255, 0, 255, 255, 2, 1, 0,
2877                                                 0, 255, 0, 255, 255, 2, 1, 0];
2878     foreach(i; 0..16)
2879         assert(AA.array[i] == cast(byte)(correctResult[i]));
2880 }
2881 
2882 /// Provide a hint to the processor that the code sequence is a spin-wait loop. This can help improve the performance 
2883 /// and power consumption of spin-wait loops.
2884 void _mm_pause() @trusted
2885 {
2886     version(GNU)
2887     {
2888         static if (GDC_with_SSE2)
2889         {
2890             __builtin_ia32_pause();
2891         }
2892         else version(X86)
2893         {
2894             asm pure nothrow @nogc @trusted
2895             {
2896                 "pause;\n" : : : ;
2897             }
2898         }
2899         else
2900             static assert(false);
2901     }
2902     else static if (LDC_with_SSE2)
2903     {
2904         __builtin_ia32_pause();
2905     }
2906     else static if (DMD_with_asm)
2907     {
2908         asm nothrow @nogc pure @safe
2909         {
2910             rep; nop; // F3 90 =  pause
2911         }
2912     }
2913     else version (LDC)
2914     {
        // PERF: Does nothing currently; could be the "yield" instruction on ARM.
2916     }
2917     else
2918         static assert(false);
2919 }
2920 unittest
2921 {
2922     _mm_pause();
2923 }
2924 
2925 /// Compute the absolute differences of packed unsigned 8-bit integers in `a` and `b`, then horizontally sum each 
2926 /// consecutive 8 differences to produce two unsigned 16-bit integers, and pack these unsigned 16-bit integers in the 
2927 /// low 16 bits of 64-bit elements in result.
2928 __m128i _mm_sad_epu8 (__m128i a, __m128i b) pure @trusted
2929 {
2930     static if (GDC_with_SSE2)
2931     {
2932         return cast(__m128i) __builtin_ia32_psadbw128(cast(ubyte16)a, cast(ubyte16)b);
2933     }
2934     else static if (LDC_with_SSE2)
2935     {
2936         return cast(__m128i) __builtin_ia32_psadbw128(cast(byte16)a, cast(byte16)b);
2937     }
2938     else static if (LDC_with_ARM64)
2939     {
2940         ushort8 t = cast(ushort8) vpaddlq_u8(vabdq_u8(cast(byte16) a, cast(byte16) b));
2941 
2942         // PERF: Looks suboptimal vs addp
2943         ushort r0 = cast(ushort)(t[0] + t[1] + t[2] + t[3]);
2944         ushort r4 = cast(ushort)(t[4] + t[5] + t[6] + t[7]);
2945         ushort8 r = 0;
2946         r[0] = r0;
2947         r[4] = r4;
2948         return cast(__m128i) r;
2949     }
2950     else
2951     {
2952         // PERF: ARM32 is lacking
2953         byte16 ab = cast(byte16)a;
2954         byte16 bb = cast(byte16)b;
2955         ubyte[16] t;
2956         foreach(i; 0..16)
2957         {
2958             int diff = cast(ubyte)(ab.array[i]) - cast(ubyte)(bb.array[i]);
2959             if (diff < 0) diff = -diff;
2960             t[i] = cast(ubyte)(diff);
2961         }
2962         int4 r = _mm_setzero_si128();
2963         r.ptr[0] = t[0] + t[1] + t[2] + t[3] + t[4] + t[5] + t[6] + t[7];
2964         r.ptr[2] = t[8] + t[9] + t[10]+ t[11]+ t[12]+ t[13]+ t[14]+ t[15];
2965         return r;
2966     }
2967 }
2968 unittest
2969 {
2970     __m128i A = _mm_setr_epi8(3, 4, 6, 8, 12, 14, 18, 20, 24, 30, 32, 38, 42, 44, 48, 54); // primes + 1
2971     __m128i B = _mm_set1_epi8(1);
2972     __m128i R = _mm_sad_epu8(A, B);
2973     int[4] correct = [2 + 3 + 5 + 7 + 11 + 13 + 17 + 19,
2974                       0,
2975                       23 + 29 + 31 + 37 + 41 + 43 + 47 + 53,
2976                       0];
2977     assert(R.array == correct);
2978 }
2979 
2980 /// Set packed 16-bit integers with the supplied values.
2981 __m128i _mm_set_epi16 (short e7, short e6, short e5, short e4, short e3, short e2, short e1, short e0) pure @trusted
2982 {
2983     short[8] result = [e0, e1, e2, e3, e4, e5, e6, e7];
2984     return cast(__m128i) loadUnaligned!(short8)(result.ptr);
2985 }
2986 unittest
2987 {
2988     __m128i A = _mm_set_epi16(7, 6, 5, 4, 3, 2, 1, 0);
2989     short8 B = cast(short8) A;
2990     foreach(i; 0..8)
2991         assert(B.array[i] == i);
2992 }
2993 
2994 /// Set packed 32-bit integers with the supplied values.
2995 __m128i _mm_set_epi32 (int e3, int e2, int e1, int e0) pure @trusted
2996 {
2997     pragma(inline, true);
2998     int[4] result = [e0, e1, e2, e3];
2999     return loadUnaligned!(int4)(result.ptr);
3000 }
3001 unittest
3002 {
3003     __m128i A = _mm_set_epi32(3, 2, 1, 0);
3004     foreach(i; 0..4)
3005         assert(A.array[i] == i);
3006 }
3007 
3008 /// Set packed 64-bit integers with the supplied values.
3009 __m128i _mm_set_epi64(__m64 e1, __m64 e0) pure @trusted
3010 {
3011     pragma(inline, true);
3012     long[2] result = [e0.array[0], e1.array[0]];
3013     return cast(__m128i)( loadUnaligned!(long2)(result.ptr) );
3014 }
3015 unittest
3016 {
3017     __m128i A = _mm_set_epi64(_mm_cvtsi64_m64(1234), _mm_cvtsi64_m64(5678));
3018     long2 B = cast(long2) A;
3019     assert(B.array[0] == 5678);
3020     assert(B.array[1] == 1234);
3021 }
3022 
3023 /// Set packed 64-bit integers with the supplied values.
3024 __m128i _mm_set_epi64x (long e1, long e0) pure @trusted
3025 {
3026     pragma(inline, true);
3027     long[2] result = [e0, e1];
3028     return cast(__m128i)( loadUnaligned!(long2)(result.ptr) );
3029 }
3030 unittest
3031 {
3032     __m128i A = _mm_set_epi64x(1234, 5678);
3033     long2 B = cast(long2) A;
3034     assert(B.array[0] == 5678);
3035     assert(B.array[1] == 1234);
3036 }
3037 
3038 /// Set packed 8-bit integers with the supplied values.
3039 __m128i _mm_set_epi8 (byte e15, byte e14, byte e13, byte e12,
3040                       byte e11, byte e10, byte e9, byte e8,
3041                       byte e7, byte e6, byte e5, byte e4,
3042                       byte e3, byte e2, byte e1, byte e0) pure @trusted
3043 {
3044     byte[16] result = [e0, e1,  e2,  e3,  e4,  e5,  e6, e7,
3045                      e8, e9, e10, e11, e12, e13, e14, e15];
3046     return cast(__m128i)( loadUnaligned!(byte16)(result.ptr) );
3047 }
3048 
3049 /// Set packed double-precision (64-bit) floating-point elements with the supplied values.
3050 __m128d _mm_set_pd (double e1, double e0) pure @trusted
3051 {
3052     pragma(inline, true);
3053     double[2] result = [e0, e1];
3054     return loadUnaligned!(double2)(result.ptr);
3055 }
3056 unittest
3057 {
3058     __m128d A = _mm_set_pd(61.0, 55.0);
3059     double[2] correct = [55.0, 61.0];
3060     assert(A.array == correct);
3061 }
3062 
/// Broadcast double-precision (64-bit) floating-point value `a` to all elements.
3064 __m128d _mm_set_pd1 (double a) pure @trusted
3065 {
3066     pragma(inline, true);
3067     double[2] result = [a, a];
3068     return loadUnaligned!(double2)(result.ptr);
3069 }
3070 unittest
3071 {
3072     __m128d A = _mm_set_pd1(61.0);
3073     double[2] correct = [61.0, 61.0];
3074     assert(A.array == correct);
3075 }
3076 
3077 /// Copy double-precision (64-bit) floating-point element `a` to the lower element of result, 
3078 /// and zero the upper element.
3079 __m128d _mm_set_sd (double a) pure @trusted
3080 {
3081     double[2] result = [a, 0];
3082     return loadUnaligned!(double2)(result.ptr);
3083 }
3084 
/// Broadcast 16-bit integer `a` to all elements.
3086 __m128i _mm_set1_epi16 (short a) pure @trusted
3087 {
3088     version(DigitalMars) // workaround https://issues.dlang.org/show_bug.cgi?id=21469 
3089     {
3090         short8 v = a;
3091         return cast(__m128i) v;
3092     }
3093     else
3094     {
3095         pragma(inline, true);
3096         return cast(__m128i)(short8(a));
3097     }
3098 }
3099 unittest
3100 {
3101     short8 a = cast(short8) _mm_set1_epi16(31);
3102     for (int i = 0; i < 8; ++i)
3103         assert(a.array[i] == 31);
3104 }
3105 
3106 /// Broadcast 32-bit integer `a` to all elements.
3107 __m128i _mm_set1_epi32 (int a) pure @trusted
3108 {
3109     pragma(inline, true);
3110     return cast(__m128i)(int4(a));
3111 }
3112 unittest
3113 {
3114     int4 a = cast(int4) _mm_set1_epi32(31);
3115     for (int i = 0; i < 4; ++i)
3116         assert(a.array[i] == 31);
3117 }
3118 
3119 /// Broadcast 64-bit integer `a` to all elements.
3120 __m128i _mm_set1_epi64 (__m64 a) pure @safe
3121 {
3122     return _mm_set_epi64(a, a);
3123 }
3124 unittest
3125 {
3126     long b = 0x1DEADCAFE; 
3127     __m64 a;
3128     a.ptr[0] = b;
3129     long2 c = cast(long2) _mm_set1_epi64(a);
3130     assert(c.array[0] == b);
3131     assert(c.array[1] == b);
3132 }
3133 
/// Broadcast 64-bit integer `a` to all elements.
3135 __m128i _mm_set1_epi64x (long a) pure @trusted
3136 {
3137     long2 b = a; // Must be on its own line to workaround https://issues.dlang.org/show_bug.cgi?id=21470
3138     return cast(__m128i)(b);
3139 }
3140 unittest
3141 {
3142     long b = 0x1DEADCAFE;
3143     long2 c = cast(long2) _mm_set1_epi64x(b);
3144     for (int i = 0; i < 2; ++i)
3145         assert(c.array[i] == b);
3146 }
3147 
3148 /// Broadcast 8-bit integer `a` to all elements.
3149 __m128i _mm_set1_epi8 (byte a) pure @trusted
3150 {
3151     pragma(inline, true);
3152     byte16 b = a; // Must be on its own line to workaround https://issues.dlang.org/show_bug.cgi?id=21470
3153     return cast(__m128i)(b);
3154 }
3155 unittest
3156 {
3157     byte16 b = cast(byte16) _mm_set1_epi8(31);
3158     for (int i = 0; i < 16; ++i)
3159         assert(b.array[i] == 31);
3160 }
3161 
alias _mm_set1_pd = _mm_set_pd1; ///
3163 
3164 /// Set packed 16-bit integers with the supplied values in reverse order.
3165 __m128i _mm_setr_epi16 (short e7, short e6, short e5, short e4, 
3166                         short e3, short e2, short e1, short e0) pure @trusted
3167 {
3168     short[8] result = [e7, e6, e5, e4, e3, e2, e1, e0];
3169     return cast(__m128i)( loadUnaligned!(short8)(result.ptr) );
3170 }
3171 unittest
3172 {
3173     short8 A = cast(short8) _mm_setr_epi16(7, 6, 5, -32768, 32767, 2, 1, 0);
3174     short[8] correct = [7, 6, 5, -32768, 32767, 2, 1, 0];
3175     assert(A.array == correct);
3176 }
3177 
3178 /// Set packed 32-bit integers with the supplied values in reverse order.
3179 __m128i _mm_setr_epi32 (int e3, int e2, int e1, int e0) pure @trusted
3180 {
3181     pragma(inline, true);
3182     int[4] result = [e3, e2, e1, e0];
3183     return cast(__m128i)( loadUnaligned!(int4)(result.ptr) );
3184 }
3185 unittest
3186 {
3187     int4 A = cast(int4) _mm_setr_epi32(-1, 0, -2147483648, 2147483647);
3188     int[4] correct = [-1, 0, -2147483648, 2147483647];
3189     assert(A.array == correct);
3190 }
3191 
3192 /// Set packed 64-bit integers with the supplied values in reverse order.
3193 __m128i _mm_setr_epi64 (long e1, long e0) pure @trusted
3194 {
3195     long[2] result = [e1, e0];
3196     return cast(__m128i)( loadUnaligned!(long2)(result.ptr) );
3197 }
3198 unittest
3199 {
3200     long2 A = cast(long2) _mm_setr_epi64(-1, 0);
3201     long[2] correct = [-1, 0];
3202     assert(A.array == correct);
3203 }
3204 
3205 /// Set packed 8-bit integers with the supplied values in reverse order.
3206 __m128i _mm_setr_epi8 (byte e15, byte e14, byte e13, byte e12,
3207                        byte e11, byte e10, byte e9,  byte e8,
3208                        byte e7,  byte e6,  byte e5,  byte e4,
3209                        byte e3,  byte e2,  byte e1,  byte e0) pure @trusted
3210 {
3211     byte[16] result = [e15, e14, e13, e12, e11, e10, e9, e8,
3212                       e7,  e6,  e5,  e4,  e3,  e2, e1, e0];
3213     return cast(__m128i)( loadUnaligned!(byte16)(result.ptr) );
3214 }
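// Sanity check (this intrinsic had no test): elements appear in the same order as the arguments.
unittest
{
    byte16 A = cast(byte16) _mm_setr_epi8(15, 14, 13, 12, 11, 10, 9, 8,
                                          7,  6,  5,  4,  3,  2, 1, 0);
    byte[16] correct = [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0];
    assert(A.array == correct);
}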
3215 
3216 /// Set packed double-precision (64-bit) floating-point elements with the supplied values in reverse order.
3217 __m128d _mm_setr_pd (double e1, double e0) pure @trusted
3218 {
3219     pragma(inline, true);
3220     double2 result;
3221     result.ptr[0] = e1;
3222     result.ptr[1] = e0;
3223     return result;
3224 }
3225 unittest
3226 {
3227     __m128d A = _mm_setr_pd(61.0, 55.0);
3228     double[2] correct = [61.0, 55.0];
3229     assert(A.array == correct);
3230 }
3231 
3232 /// Return vector of type `__m128d` with all elements set to zero.
3233 __m128d _mm_setzero_pd () pure @trusted
3234 {
3235     pragma(inline, true);
3236     // Note: using loadUnaligned has better -O0 codegen compared to .ptr
3237     double[2] result = [0.0, 0.0];
3238     return loadUnaligned!(double2)(result.ptr);
3239 }
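// Sanity check: both lanes read back as zero.
unittest
{
    double2 Z = _mm_setzero_pd();
    double[2] correct = [0.0, 0.0];
    assert(Z.array == correct);
}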
3240 
3241 /// Return vector of type `__m128i` with all elements set to zero.
3242 __m128i _mm_setzero_si128() pure @trusted
3243 {
3244     pragma(inline, true);
3245     // Note: using loadUnaligned has better -O0 codegen compared to .ptr
3246     int[4] result = [0, 0, 0, 0];
3247     return cast(__m128i)( loadUnaligned!(int4)(result.ptr) );
3248 }
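// Sanity check: all four lanes read back as zero.
unittest
{
    int4 Z = _mm_setzero_si128();
    int[4] correct = [0, 0, 0, 0];
    assert(Z.array == correct);
}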
3249 
3250 /// Shuffle 32-bit integers in a using the control in `imm8`.
3251 /// See_also: `_MM_SHUFFLE`.
3252 __m128i _mm_shuffle_epi32(int imm8)(__m128i a) pure @safe
3253 {
3254     static if (GDC_with_SSE2)
3255     {
3256         return __builtin_ia32_pshufd(a, imm8);
3257     }
3258     else
3259     {
3260         return shufflevector!(int4, (imm8 >> 0) & 3,
3261                                     (imm8 >> 2) & 3,
3262                                     (imm8 >> 4) & 3,
3263                                     (imm8 >> 6) & 3)(a, a);
3264     }
3265 }
3266 unittest
3267 {
3268     __m128i A = _mm_setr_epi32(0, 1, 2, 3);
3269     enum int SHUFFLE = _MM_SHUFFLE(0, 1, 2, 3);
3270     int4 B = cast(int4) _mm_shuffle_epi32!SHUFFLE(A);
3271     int[4] expectedB = [ 3, 2, 1, 0 ];
3272     assert(B.array == expectedB);
3273 }
3274 
3275 /// Shuffle double-precision (64-bit) floating-point elements using the control in `imm8`.
3276 /// See_also: `_MM_SHUFFLE2`.
3277 __m128d _mm_shuffle_pd (int imm8)(__m128d a, __m128d b) pure @safe
3278 {
3279     static if (GDC_with_SSE2)
3280     {
3281         return __builtin_ia32_shufpd(a, b, imm8);
3282     }
3283     else
3284     {
3285         return shufflevector!(double2, 0 + ( imm8 & 1 ),
3286                                        2 + ( (imm8 >> 1) & 1 ))(a, b);
3287     }
3288 }
3289 unittest
3290 {
3291     __m128d A = _mm_setr_pd(0.5, 2.0);
3292     __m128d B = _mm_setr_pd(4.0, 5.0);
3293     enum int SHUFFLE = _MM_SHUFFLE2(1, 1);
3294     __m128d R = _mm_shuffle_pd!SHUFFLE(A, B);
3295     double[2] correct = [ 2.0, 5.0 ];
3296     assert(R.array == correct);
3297 }
3298 
/// Shuffle 16-bit integers in the high 64 bits of `a` using the control in `imm8`. Store the results in the high
/// 64 bits of result, with the low 64 bits being copied from `a` to result.
/// See_also: `_MM_SHUFFLE`.
3302 __m128i _mm_shufflehi_epi16(int imm8)(__m128i a) pure @safe
3303 {
3304     static if (GDC_with_SSE2)
3305     {
3306         return cast(__m128i) __builtin_ia32_pshufhw(cast(short8)a, imm8);
3307     }
3308     else
3309     {
3310         return cast(__m128i) shufflevector!(short8, 0, 1, 2, 3,
3311                                           4 + ( (imm8 >> 0) & 3 ),
3312                                           4 + ( (imm8 >> 2) & 3 ),
3313                                           4 + ( (imm8 >> 4) & 3 ),
3314                                           4 + ( (imm8 >> 6) & 3 ))(cast(short8)a, cast(short8)a);
3315     }
3316 }
3317 unittest
3318 {
3319     __m128i A = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
3320     enum int SHUFFLE = _MM_SHUFFLE(0, 1, 2, 3);
3321     short8 C = cast(short8) _mm_shufflehi_epi16!SHUFFLE(A);
3322     short[8] expectedC = [ 0, 1, 2, 3, 7, 6, 5, 4 ];
3323     assert(C.array == expectedC);
3324 }
3325 
/// Shuffle 16-bit integers in the low 64 bits of `a` using the control in `imm8`. Store the results in the low 64
/// bits of result, with the high 64 bits being copied from `a` to result.
3328 /// See_also: `_MM_SHUFFLE`.
3329 __m128i _mm_shufflelo_epi16(int imm8)(__m128i a) pure @safe
3330 {
3331     static if (GDC_with_SSE2)
3332     {
3333         return cast(__m128i) __builtin_ia32_pshuflw(cast(short8)a, imm8);
3334     }
3335     else
3336     {
3337         return cast(__m128i) shufflevector!(short8, ( (imm8 >> 0) & 3 ),
3338                                                     ( (imm8 >> 2) & 3 ),
3339                                                     ( (imm8 >> 4) & 3 ),
3340                                                     ( (imm8 >> 6) & 3 ), 4, 5, 6, 7)(cast(short8)a, cast(short8)a);
3341     }
3342 }
3343 unittest
3344 {
3345     __m128i A = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
3346     enum int SHUFFLE = _MM_SHUFFLE(0, 1, 2, 3);
3347     short8 B = cast(short8) _mm_shufflelo_epi16!SHUFFLE(A);
3348     short[8] expectedB = [ 3, 2, 1, 0, 4, 5, 6, 7 ];
3349     assert(B.array == expectedB);
3350 }
3351 
3352 /// Shift packed 32-bit integers in `a` left by `count` while shifting in zeros.
3353 deprecated("Use _mm_slli_epi32 instead.") __m128i _mm_sll_epi32 (__m128i a, __m128i count) pure @trusted
3354 {
3355     static if (LDC_with_SSE2)
3356     {
3357         return __builtin_ia32_pslld128(a, count);
3358     }
3359     else static if (GDC_with_SSE2)
3360     {
3361         return __builtin_ia32_pslld128(a, count);
3362     }
3363     else static if (DMD_with_32bit_asm)
3364     {
3365         asm pure nothrow @nogc @trusted
3366         {
3367             movdqu XMM0, a;
3368             movdqu XMM1, count;
3369             pslld XMM0, XMM1;
3370             movdqu a, XMM0;
3371         }
3372         return a;
3373     }
3374     else
3375     {
3376         int4 r = void;
3377         long2 lc = cast(long2)count;
3378         int bits = cast(int)(lc.array[0]);
3379         foreach(i; 0..4)
3380             r[i] = cast(uint)(a[i]) << bits;
3381         return r;
3382     }
3383 }
3384 
3385 /// Shift packed 64-bit integers in `a` left by `count` while shifting in zeros.
3386 deprecated("Use _mm_slli_epi64 instead.") __m128i _mm_sll_epi64 (__m128i a, __m128i count) pure @trusted
3387 {
3388     static if (LDC_with_SSE2)
3389     {
3390         return cast(__m128i) __builtin_ia32_psllq128(cast(long2)a, cast(long2)count);
3391     }
3392     else static if (GDC_with_SSE2)
3393     {
3394         return cast(__m128i) __builtin_ia32_psllq128(cast(long2)a, cast(long2)count);
3395     }
3396     else static if (DMD_with_32bit_asm)
3397     {
3398         asm pure nothrow @nogc @trusted
3399         {
3400             movdqu XMM0, a;
3401             movdqu XMM1, count;
3402             psllq XMM0, XMM1;
3403             movdqu a, XMM0;
3404         }
3405         return a;
3406     }
3407     else
3408     {
        // ARM: good codegen since LDC 1.12 -O2,
        // but the -O0 version is catastrophic.
3411         long2 r = void;
3412         long2 sa = cast(long2)a;
3413         long2 lc = cast(long2)count;
3414         int bits = cast(int)(lc.array[0]);
3415         foreach(i; 0..2)
3416             r.array[i] = cast(ulong)(sa.array[i]) << bits;
3417         return cast(__m128i)r;
3418     }
3419 }
3420 
3421 /// Shift packed 16-bit integers in `a` left by `count` while shifting in zeros.
3422 deprecated("Use _mm_slli_epi16 instead.") __m128i _mm_sll_epi16 (__m128i a, __m128i count) pure @trusted
3423 {
3424     static if (LDC_with_SSE2)
3425     {
        return cast(__m128i) __builtin_ia32_psllw128(cast(short8)a, cast(short8)count);
3427     }
3428     else static if (GDC_with_SSE2)
3429     {
        return cast(__m128i) __builtin_ia32_psllw128(cast(short8)a, cast(short8)count);
3431     }
3432     else static if (DMD_with_32bit_asm)
3433     {
        asm pure nothrow @nogc @trusted
3435         {
3436             movdqu XMM0, a;
3437             movdqu XMM1, count;
3438             psllw XMM0, XMM1;
3439             movdqu a, XMM0;
3440         }
3441         return a;
3442     }
3443     else
3444     {
3445         short8 sa = cast(short8)a;
3446         long2 lc = cast(long2)count;
3447         int bits = cast(int)(lc.array[0]);
3448         short8 r = void;
3449         foreach(i; 0..8)
3450             r.ptr[i] = cast(short)(cast(ushort)(sa.array[i]) << bits);
3451         return cast(int4)r;
3452     }
3453 }
3454 
3455 
3456 /// Shift packed 32-bit integers in `a` left by `imm8` while shifting in zeros.
3457 __m128i _mm_slli_epi32 (__m128i a, int imm8) pure @trusted
3458 {
3459     static if (GDC_with_SSE2)
3460     {
3461         return __builtin_ia32_pslldi128(a, cast(ubyte)imm8);
3462     }
3463     else static if (LDC_with_SSE2)
3464     {
3465         return __builtin_ia32_pslldi128(a, cast(ubyte)imm8);
3466     }
3467     else
3468     {
        // Note: the intrinsic guarantees that only imm8[0..7] is considered, however
        //       D says "It's illegal to shift by the same or more bits
        //       than the size of the quantity being shifted", so such a
        //       shift would be UB; the count is therefore checked explicitly.
3473         int4 r = _mm_setzero_si128();
3474 
3475         ubyte count = cast(ubyte) imm8;
3476         if (count > 31)
3477             return r;
3478         
3479         foreach(i; 0..4)
3480             r.array[i] = cast(uint)(a.array[i]) << count;
3481         return r;
3482     }
3483 }
3484 unittest
3485 {
3486     __m128i A = _mm_setr_epi32(0, 2, 3, -4);
3487     __m128i B = _mm_slli_epi32(A, 1);
3488     __m128i B2 = _mm_slli_epi32(A, 1 + 256);
3489     int[4] expectedB = [ 0, 4, 6, -8];
3490     assert(B.array == expectedB);
3491     assert(B2.array == expectedB);
3492 
3493     __m128i C = _mm_slli_epi32(A, 0);
3494     int[4] expectedC = [ 0, 2, 3, -4];
3495     assert(C.array == expectedC);
3496 
3497     __m128i D = _mm_slli_epi32(A, 65);
3498     int[4] expectedD = [ 0, 0, 0, 0];
3499     assert(D.array == expectedD);
3500 }
3501 
3502 /// Shift packed 64-bit integers in `a` left by `imm8` while shifting in zeros.
3503 __m128i _mm_slli_epi64 (__m128i a, int imm8) pure @trusted
3504 {
3505     static if (GDC_with_SSE2)
3506     {
3507         return cast(__m128i) __builtin_ia32_psllqi128(cast(long2)a, cast(ubyte)imm8);
3508     }
3509     else static if (LDC_with_SSE2)
3510     {
3511         return cast(__m128i) __builtin_ia32_psllqi128(cast(long2)a, cast(ubyte)imm8);
3512     }
3513     else
3514     {
3515         long2 sa = cast(long2)a;
3516 
        // Note: the intrinsic guarantees that only imm8[0..7] is considered, however
        //       D says "It's illegal to shift by the same or more bits
        //       than the size of the quantity being shifted", so such a
        //       shift would be UB; the count is therefore checked explicitly.
3521         long2 r = cast(long2) _mm_setzero_si128();
3522         ubyte count = cast(ubyte) imm8;
3523         if (count > 63)
3524             return cast(__m128i)r;
3525 
3526         r.ptr[0] = cast(ulong)(sa.array[0]) << count;
3527         r.ptr[1] = cast(ulong)(sa.array[1]) << count;
3528         return cast(__m128i)r;
3529     }
3530 }
3531 unittest
3532 {
3533     __m128i A = _mm_setr_epi64(8, -4);
3534     long2 B = cast(long2) _mm_slli_epi64(A, 1);
3535     long2 B2 = cast(long2) _mm_slli_epi64(A, 1 + 1024);
3536     long[2] expectedB = [ 16, -8];
3537     assert(B.array == expectedB);
3538     assert(B2.array == expectedB);
3539 
3540     long2 C = cast(long2) _mm_slli_epi64(A, 0);
3541     long[2] expectedC = [ 8, -4];
3542     assert(C.array == expectedC);
3543 
3544     long2 D = cast(long2) _mm_slli_epi64(A, 64);
    long[2] expectedD = [ 0, 0 ];
3546     assert(D.array == expectedD);
3547 }
3548 
3549 /// Shift packed 16-bit integers in `a` left by `imm8` while shifting in zeros.
3550 __m128i _mm_slli_epi16(__m128i a, int imm8) pure @trusted
3551 {
3552     static if (GDC_with_SSE2)
3553     {
3554         return cast(__m128i) __builtin_ia32_psllwi128(cast(short8)a, cast(ubyte)imm8);
3555     }
3556     else static if (LDC_with_SSE2)
3557     {
3558         return cast(__m128i) __builtin_ia32_psllwi128(cast(short8)a, cast(ubyte)imm8);
3559     }
3560     else static if (LDC_with_ARM64)
3561     {
3562         short8 sa = cast(short8)a;
3563         short8 r = cast(short8)_mm_setzero_si128();
3564         ubyte count = cast(ubyte) imm8;
3565         if (count > 15)
3566             return cast(__m128i)r;
3567         r = sa << short8(count);
3568         return cast(__m128i)r;
3569     }
3570     else
3571     {
3572         short8 sa = cast(short8)a;
3573         short8 r = cast(short8)_mm_setzero_si128();
3574         ubyte count = cast(ubyte) imm8;
3575         if (count > 15)
3576             return cast(__m128i)r;
3577         foreach(i; 0..8)
3578             r.ptr[i] = cast(short)(sa.array[i] << count);
3579         return cast(__m128i)r;
3580     }
3581 }
3582 unittest
3583 {
3584     __m128i A = _mm_setr_epi16(0, 1, 2, 3, -4, -5, 6, 7);
3585     short8 B = cast(short8)( _mm_slli_epi16(A, 1) );
3586     short8 B2 = cast(short8)( _mm_slli_epi16(A, 1 + 256) );
3587     short[8] expectedB = [ 0, 2, 4, 6, -8, -10, 12, 14 ];
3588     assert(B.array == expectedB);
3589     assert(B2.array == expectedB);
3590 
3591     short8 C = cast(short8)( _mm_slli_epi16(A, 16) );
3592     short[8] expectedC = [ 0, 0, 0, 0, 0, 0, 0, 0 ];
3593     assert(C.array == expectedC);
3594 }
3595 
3596 
3597 /// Shift `a` left by `bytes` bytes while shifting in zeros.
3598 __m128i _mm_slli_si128(ubyte bytes)(__m128i op) pure @trusted
3599 {
3600     static if (bytes & 0xF0)
3601     {
3602         return _mm_setzero_si128();
3603     }
3604     else
3605     {
3606         static if (GDC_with_SSE2)
3607         {
3608             return cast(__m128i) __builtin_ia32_pslldqi128(cast(long2)op, cast(ubyte)(bytes * 8)); 
3609         }
3610         else version(DigitalMars)
3611         {
3612             version(D_InlineAsm_X86)
3613             {
3614                 asm pure nothrow @nogc @trusted // somehow doesn't work for x86_64
3615                 {
3616                     movdqu XMM0, op;
3617                     pslldq XMM0, bytes;
3618                     movdqu op, XMM0;
3619                 }
3620                 return op;
3621             }
3622             else
3623             {
3624                 byte16 A = cast(byte16)op;
3625                 byte16 R;
3626                 for (int n = 15; n >= bytes; --n)
3627                     R.ptr[n] = A.array[n-bytes];
3628                 for (int n = bytes-1; n >= 0; --n)
3629                     R.ptr[n] = 0;
3630                 return cast(__m128i)R;
3631             }
3632         }
3633         else
3634         {
3635             return cast(__m128i) shufflevector!(byte16,
3636             16 - bytes, 17 - bytes, 18 - bytes, 19 - bytes, 20 - bytes, 21 - bytes,
3637             22 - bytes, 23 - bytes, 24 - bytes, 25 - bytes, 26 - bytes, 27 - bytes,
3638             28 - bytes, 29 - bytes, 30 - bytes, 31 - bytes)
3639             (cast(byte16)_mm_setzero_si128(), cast(byte16)op);
3640         }
3641     }
3642 }
3643 unittest
3644 {
3645     __m128i A = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
3646     short8 R = cast(short8) _mm_slli_si128!8(A); // shift 8 bytes to the left
3647     short[8] correct = [ 0, 0, 0, 0, 0, 1, 2, 3 ];
3648     assert(R.array == correct);
3649 
    __m128i B = _mm_slli_si128!16(_mm_set1_epi32(-1));
3651     int[4] expectedB = [0, 0, 0, 0];
3652     assert(B.array == expectedB);
3653 }
3654 
3655 /// Compute the square root of packed double-precision (64-bit) floating-point elements in `vec`.
3656 __m128d _mm_sqrt_pd(__m128d vec) pure @trusted
3657 {
3658     version(LDC)
3659     {
3660         // Disappeared with LDC 1.11
3661         static if (__VERSION__ < 2081)
3662             return __builtin_ia32_sqrtpd(vec);
3663         else
3664         {
3665             vec.array[0] = llvm_sqrt(vec.array[0]);
3666             vec.array[1] = llvm_sqrt(vec.array[1]);
3667             return vec;
3668         }
3669     }
3670     else static if (GDC_with_SSE2)    
3671     {
3672         return __builtin_ia32_sqrtpd(vec);
3673     }
3674     else
3675     {
3676         vec.ptr[0] = sqrt(vec.array[0]);
3677         vec.ptr[1] = sqrt(vec.array[1]);
3678         return vec;
3679     }
3680 }
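// Sanity check (this intrinsic had no test); both inputs are perfect squares, so the IEEE results are exact.
unittest
{
    __m128d A = _mm_setr_pd(4.0, 16.0);
    __m128d R = _mm_sqrt_pd(A);
    double[2] correct = [2.0, 4.0];
    assert(R.array == correct);
}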
3681 
3682 /// Compute the square root of the lower double-precision (64-bit) floating-point element in `b`, store the result in 
3683 /// the lower element of result, and copy the upper element from `a` to the upper element of result.
3684 __m128d _mm_sqrt_sd(__m128d a, __m128d b) pure @trusted
3685 {
3686     // Note: the builtin has one argument, since the legacy `sqrtsd` SSE2 instruction operates on the same register only.
3687     //       "128-bit Legacy SSE version: The first source operand and the destination operand are the same. 
3688     //        The quadword at bits 127:64 of the destination operand remains unchanged."
3689     version(LDC)
3690     {
3691         // Disappeared with LDC 1.11
3692         static if (__VERSION__ < 2081)
3693         {
3694             __m128d c = __builtin_ia32_sqrtsd(b);
3695             a[0] = c[0];
3696             return a;
3697         }
3698         else
3699         {
3700             a.array[0] = llvm_sqrt(b.array[0]);
3701             return a;
3702         }
3703     }
3704     else static if (GDC_with_SSE2)
3705     {
3706         __m128d c = __builtin_ia32_sqrtsd(b);
3707         a.ptr[0] = c.array[0];
3708         return a;
3709     }
3710     else
3711     {
3712         a.ptr[0] = sqrt(b.array[0]);
3713         return a;
3714     }
3715 }
3716 unittest
3717 {
3718     __m128d A = _mm_setr_pd(1.0, 3.0);
3719     __m128d B = _mm_setr_pd(4.0, 5.0);
3720     __m128d R = _mm_sqrt_sd(A, B);
3721     double[2] correct = [2.0, 3.0 ];
3722     assert(R.array == correct);
3723 }
3724 
3725 /// Shift packed 16-bit integers in `a` right by `count` while shifting in sign bits.
3726 deprecated("Use _mm_srai_epi16 instead.") __m128i _mm_sra_epi16 (__m128i a, __m128i count) pure @trusted
3727 {
3728     static if (GDC_with_SSE2)
3729     {
3730         return cast(__m128i) __builtin_ia32_psraw128(cast(short8)a, cast(short8)count);
3731     }
3732     else static if (LDC_with_SSE2)
3733     {
3734         return cast(__m128i) __builtin_ia32_psraw128(cast(short8)a, cast(short8)count);
3735     }
3736     else
3737     {
3738         short8 sa = cast(short8)a;
3739         long2 lc = cast(long2)count;
3740         int bits = cast(int)(lc.array[0]);
3741         short8 r = void;
3742         foreach(i; 0..8)
3743             r.ptr[i] = cast(short)(sa.array[i] >> bits);
3744         return cast(int4)r;
3745     }
3746 }
3747 
3748 /// Shift packed 32-bit integers in `a` right by `count` while shifting in sign bits.
3749 deprecated("Use _mm_srai_epi32 instead.") __m128i _mm_sra_epi32 (__m128i a, __m128i count) pure @trusted
3750 {
3751     static if (LDC_with_SSE2)
3752     {
3753         return __builtin_ia32_psrad128(a, count);
3754     }
3755     else static if (GDC_with_SSE2)
3756     {
3757         return __builtin_ia32_psrad128(a, count);
3758     }
3759     else
3760     {    
3761         int4 r = void;
3762         long2 lc = cast(long2)count;
3763         int bits = cast(int)(lc.array[0]);
3764         r.ptr[0] = (a.array[0] >> bits);
3765         r.ptr[1] = (a.array[1] >> bits);
3766         r.ptr[2] = (a.array[2] >> bits);
3767         r.ptr[3] = (a.array[3] >> bits);
3768         return r;
3769     }
3770 }
3771 
3772 
3773 /// Shift packed 16-bit integers in `a` right by `imm8` while shifting in sign bits.
3774 __m128i _mm_srai_epi16 (__m128i a, int imm8) pure @trusted
3775 {
3776     static if (GDC_with_SSE2)
3777     {
3778         return cast(__m128i) __builtin_ia32_psrawi128(cast(short8)a, cast(ubyte)imm8);
3779     }
3780     else static if (LDC_with_SSE2)
3781     {
3782         return cast(__m128i) __builtin_ia32_psrawi128(cast(short8)a, cast(ubyte)imm8);
3783     }
3784     else static if (LDC_with_ARM64)
3785     {
3786         short8 sa = cast(short8)a;
3787         ubyte count = cast(ubyte)imm8;
3788         if (count > 15) 
3789             count = 15;
3790         short8 r = sa >> short8(count);
3791         return cast(__m128i)r;
3792     }
3793     else
3794     {
3795         short8 sa = cast(short8)a;
3796         short8 r = void;
3797 
        // Note: the intrinsic guarantees that only imm8[0..7] is considered, however
        //       D says "It's illegal to shift by the same or more bits
        //       than the size of the quantity being shifted", so such a
        //       shift would be UB; the count is therefore checked explicitly.
3802         ubyte count = cast(ubyte)imm8;
3803         if (count > 15) 
3804             count = 15;
3805         foreach(i; 0..8)
3806             r.ptr[i] = cast(short)(sa.array[i] >> count);
3807         return cast(int4)r;
3808     }
3809 }
3810 unittest
3811 {
3812     __m128i A = _mm_setr_epi16(0, 1, 2, 3, -4, -5, 6, 7);
3813     short8 B = cast(short8)( _mm_srai_epi16(A, 1) );
3814     short8 B2 = cast(short8)( _mm_srai_epi16(A, 1 + 256) );
3815     short[8] expectedB = [ 0, 0, 1, 1, -2, -3, 3, 3 ];
3816     assert(B.array == expectedB);
3817     assert(B2.array == expectedB);
3818 
3819     short8 C = cast(short8)( _mm_srai_epi16(A, 18) );
3820     short[8] expectedC = [ 0, 0, 0, 0, -1, -1, 0, 0 ];
3821     assert(C.array == expectedC);
3822 }
3823 
3824 /// Shift packed 32-bit integers in `a` right by `imm8` while shifting in sign bits.
3825 __m128i _mm_srai_epi32 (__m128i a, int imm8) pure @trusted
3826 {
3827     static if (LDC_with_SSE2)
3828     {
3829         return __builtin_ia32_psradi128(a, cast(ubyte)imm8);
3830     }
3831     else static if (GDC_with_SSE2)
3832     {
3833         return __builtin_ia32_psradi128(a, cast(ubyte)imm8);
3834     }
3835     else
3836     {
3837         int4 r = void;
3838 
        // Note: the intrinsic guarantees that only imm8[0..7] is considered, however
        //       D says "It's illegal to shift by the same or more bits
        //       than the size of the quantity being shifted", so such a
        //       shift would be UB; the count is therefore checked explicitly.
3843         ubyte count = cast(ubyte) imm8;
3844         if (count > 31)
3845             count = 31;
3846 
3847         r.ptr[0] = (a.array[0] >> count);
3848         r.ptr[1] = (a.array[1] >> count);
3849         r.ptr[2] = (a.array[2] >> count);
3850         r.ptr[3] = (a.array[3] >> count);
3851         return r;
3852     }
3853 }
3854 unittest
3855 {
3856     __m128i A = _mm_setr_epi32(0, 2, 3, -4);
3857     __m128i B = _mm_srai_epi32(A, 1);
3858     __m128i B2 = _mm_srai_epi32(A, 1 + 256);
3859     int[4] expectedB = [ 0, 1, 1, -2];
3860     assert(B.array == expectedB);
3861     assert(B2.array == expectedB);
3862 
3863     __m128i C = _mm_srai_epi32(A, 32);
3864     int[4] expectedC = [ 0, 0, 0, -1];
3865     assert(C.array == expectedC);
3866 
3867     __m128i D = _mm_srai_epi32(A, 0);
3868     int[4] expectedD = [ 0, 2, 3, -4];
3869     assert(D.array == expectedD);
3870 }
3871 
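/// Shift packed 16-bit integers in `a` right by `count` while shifting in zeros.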
3872 deprecated("Use _mm_srli_epi16 instead.") __m128i _mm_srl_epi16 (__m128i a, __m128i count) pure @trusted
3873 {
3874     static if (LDC_with_SSE2)
3875     {
3876         return cast(__m128i) __builtin_ia32_psrlw128(cast(short8)a, cast(short8)count);
3877     }
3878     else static if (GDC_with_SSE2)
3879     {
3880         return cast(__m128i) __builtin_ia32_psrlw128(cast(short8)a, cast(short8)count);
3881     }
3882     else
3883     {
3884         short8 sa = cast(short8)a;
3885         long2 lc = cast(long2)count;
3886         int bits = cast(int)(lc.array[0]);
3887         short8 r = void;
3888         foreach(i; 0..8)
3889             r.ptr[i] = cast(short)(cast(ushort)(sa.array[i]) >> bits);
3890         return cast(int4)r;
3891     }
3892 }
3893 
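/// Shift packed 32-bit integers in `a` right by `count` while shifting in zeros.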
3894 deprecated("Use _mm_srli_epi32 instead.") __m128i _mm_srl_epi32 (__m128i a, __m128i count) pure @trusted
3895 {
3896     static if (LDC_with_SSE2)
3897     {
3898         return __builtin_ia32_psrld128(a, count);
3899     }
3900     else static if (GDC_with_SSE2)
3901     {
3902         return __builtin_ia32_psrld128(a, count);
3903     }
3904     else
3905     {
3906         int4 r = void;
3907         long2 lc = cast(long2)count;
3908         int bits = cast(int)(lc.array[0]);
3909         r.ptr[0] = cast(uint)(a.array[0]) >> bits;
3910         r.ptr[1] = cast(uint)(a.array[1]) >> bits;
3911         r.ptr[2] = cast(uint)(a.array[2]) >> bits;
3912         r.ptr[3] = cast(uint)(a.array[3]) >> bits;
3913         return r;
3914     }
3915 }
3916 
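/// Shift packed 64-bit integers in `a` right by `count` while shifting in zeros.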
3917 deprecated("Use _mm_srli_epi64 instead.") __m128i _mm_srl_epi64 (__m128i a, __m128i count) pure @trusted
3918 {
3919     static if (LDC_with_SSE2)
3920     {
3921         return cast(__m128i) __builtin_ia32_psrlq128(cast(long2)a, cast(long2)count);
3922     }
3923     else static if (GDC_with_SSE2)
3924     {
3925         return cast(__m128i) __builtin_ia32_psrlq128(cast(long2)a, cast(long2)count);
3926     }
3927     else
3928     {
3929         long2 r = void;
3930         long2 sa = cast(long2)a;
3931         long2 lc = cast(long2)count;
3932         int bits = cast(int)(lc.array[0]);
3933         r.ptr[0] = cast(ulong)(sa.array[0]) >> bits;
3934         r.ptr[1] = cast(ulong)(sa.array[1]) >> bits;
3935         return cast(__m128i)r;
3936     }
3937 }
3938 
3939 /// Shift packed 16-bit integers in `a` right by `imm8` while shifting in zeros.
3940 __m128i _mm_srli_epi16 (__m128i a, int imm8) pure @trusted
3941 {
3942     static if (GDC_with_SSE2)
3943     {
3944         return cast(__m128i) __builtin_ia32_psrlwi128(cast(short8)a, cast(ubyte)imm8);
3945     }
3946     else static if (LDC_with_SSE2)
3947     {
3948         return cast(__m128i) __builtin_ia32_psrlwi128(cast(short8)a, cast(ubyte)imm8);
3949     }
3950     else static if (LDC_with_ARM64)
3951     {
3952         short8 sa = cast(short8)a;
3953         short8 r = cast(short8) _mm_setzero_si128();
3954 
3955         ubyte count = cast(ubyte)imm8;
3956         if (count >= 16)
3957             return cast(__m128i)r;
3958 
        r = sa >>> short8(count); // Unsigned vector shift is available in LDC, but not in DMD.
3960         return cast(__m128i)r;
3961     }
3962     else
3963     {
3964         short8 sa = cast(short8)a;
3965         ubyte count = cast(ubyte)imm8;
3966 
3967         short8 r = cast(short8) _mm_setzero_si128();
3968         if (count >= 16)
3969             return cast(__m128i)r;
3970 
3971         foreach(i; 0..8)
3972             r.array[i] = cast(short)(cast(ushort)(sa.array[i]) >> count);
3973         return cast(__m128i)r;
3974     }
3975 }
3976 unittest
3977 {
3978     __m128i A = _mm_setr_epi16(0, 1, 2, 3, -4, -5, 6, 7);
3979     short8 B = cast(short8)( _mm_srli_epi16(A, 1) );
3980     short8 B2 = cast(short8)( _mm_srli_epi16(A, 1 + 256) );
3981     short[8] expectedB = [ 0, 0, 1, 1, 0x7FFE, 0x7FFD, 3, 3 ];
3982     assert(B.array == expectedB);
3983     assert(B2.array == expectedB);
3984 
3985     short8 C = cast(short8)( _mm_srli_epi16(A, 16) );
3986     short[8] expectedC = [ 0, 0, 0, 0, 0, 0, 0, 0];
3987     assert(C.array == expectedC);
3988 
3989     short8 D = cast(short8)( _mm_srli_epi16(A, 0) );
3990     short[8] expectedD = [ 0, 1, 2, 3, -4, -5, 6, 7 ];
3991     assert(D.array == expectedD);
3992 }
3993 
3994 
3995 /// Shift packed 32-bit integers in `a` right by `imm8` while shifting in zeros.
3996 __m128i _mm_srli_epi32 (__m128i a, int imm8) pure @trusted
3997 {
3998     static if (GDC_with_SSE2)
3999     {
4000         return __builtin_ia32_psrldi128(a, cast(ubyte)imm8);
4001     }
4002     else static if (LDC_with_SSE2)
4003     {
4004         return __builtin_ia32_psrldi128(a, cast(ubyte)imm8);
4005     }
4006     else
4007     {
4008         ubyte count = cast(ubyte) imm8;
4009 
        // Note: the intrinsic guarantees that only imm8[0..7] is considered, however
        //       D says "It's illegal to shift by the same or more bits
        //       than the size of the quantity being shifted", so such a
        //       shift would be UB; the count is therefore checked explicitly.
4014         int4 r = _mm_setzero_si128();
4015         if (count >= 32)
4016             return r;
4017         r.ptr[0] = a.array[0] >>> count;
4018         r.ptr[1] = a.array[1] >>> count;
4019         r.ptr[2] = a.array[2] >>> count;
4020         r.ptr[3] = a.array[3] >>> count;
4021         return r;
4022     }
4023 }
4024 unittest
4025 {
4026     __m128i A = _mm_setr_epi32(0, 2, 3, -4);
4027     __m128i B = _mm_srli_epi32(A, 1);
4028     __m128i B2 = _mm_srli_epi32(A, 1 + 256);
4029     int[4] expectedB = [ 0, 1, 1, 0x7FFFFFFE];
4030     assert(B.array == expectedB);
4031     assert(B2.array == expectedB);
4032  
4033     __m128i C = _mm_srli_epi32(A, 255);
4034     int[4] expectedC = [ 0, 0, 0, 0 ];
4035     assert(C.array == expectedC);
4036 }
4037 
4038 /// Shift packed 64-bit integers in `a` right by `imm8` while shifting in zeros.
4039 __m128i _mm_srli_epi64 (__m128i a, int imm8) pure @trusted
4040 {
4041     static if (GDC_with_SSE2)
4042     {
4043         return cast(__m128i) __builtin_ia32_psrlqi128(cast(long2)a, cast(ubyte)imm8);
4044     }
4045     else static if (LDC_with_SSE2)
4046     {
4047         return cast(__m128i) __builtin_ia32_psrlqi128(cast(long2)a, cast(ubyte)imm8);
4048     }
4049     else
4050     {
4051         long2 r = cast(long2) _mm_setzero_si128();
4052         long2 sa = cast(long2)a;
4053 
4054         ubyte count = cast(ubyte) imm8;
4055         if (count >= 64)
4056             return cast(__m128i)r;
4057 
4058         r.ptr[0] = sa.array[0] >>> count;
4059         r.ptr[1] = sa.array[1] >>> count;
4060         return cast(__m128i)r;
4061     }
4062 }
4063 unittest
4064 {
4065     __m128i A = _mm_setr_epi64(8, -4);
4066     long2 B = cast(long2) _mm_srli_epi64(A, 1);
4067     long2 B2 = cast(long2) _mm_srli_epi64(A, 1 + 512);
4068     long[2] expectedB = [ 4, 0x7FFFFFFFFFFFFFFE];
4069     assert(B.array == expectedB);
4070     assert(B2.array == expectedB);
4071 
4072     long2 C = cast(long2) _mm_srli_epi64(A, 64);
4073     long[2] expectedC = [ 0, 0 ];
4074     assert(C.array == expectedC);
4075 }
4076 
4077 /// Shift `v` right by `bytes` bytes while shifting in zeros.
4078 __m128i _mm_srli_si128(ubyte bytes)(__m128i v) pure @safe
4079 {
4080     static if (bytes & 0xF0)
4081     {
4082         return _mm_setzero_si128();
4083     }
4084     else static if (GDC_with_SSE2)
4085     {
4086         return cast(__m128i) __builtin_ia32_psrldqi128(cast(long2)v, cast(ubyte)(bytes * 8));
4087     }
4088     else static if (DMD_with_32bit_asm)
4089     {
4090         asm pure nothrow @nogc @trusted
4091         {
4092             movdqu XMM0, v;
4093             psrldq XMM0, bytes;
4094             movdqu v, XMM0;
4095         }
4096         return v;
4097     }
4098     else
4099     {
4100         return cast(__m128i) shufflevector!(byte16,
4101                                             bytes+0, bytes+1, bytes+2, bytes+3, bytes+4, bytes+5, bytes+6, bytes+7,
4102                                             bytes+8, bytes+9, bytes+10, bytes+11, bytes+12, bytes+13, bytes+14, bytes+15)
4103                                            (cast(byte16) v, cast(byte16)_mm_setzero_si128());
4104     }
4105 }
4106 unittest
4107 {
4108     __m128i R = _mm_srli_si128!4(_mm_set_epi32(4, 3, 2, 1));
4109     int[4] correct = [2, 3, 4, 0];
4110     assert(R.array == correct);
4111 
4112     __m128i A = _mm_srli_si128!16(_mm_set1_epi32(-1));
4113     int[4] expectedA = [0, 0, 0, 0];
4114     assert(A.array == expectedA);
4115 }
4116 
4117 /// Shift `v` right by `bytes` bytes while shifting in zeros.
4118 /// #BONUS
4119 __m128 _mm_srli_ps(ubyte bytes)(__m128 v) pure @safe
4120 {
4121     return cast(__m128)_mm_srli_si128!bytes(cast(__m128i)v);
4122 }
4123 unittest
4124 {
4125     __m128 R = _mm_srli_ps!8(_mm_set_ps(4.0f, 3.0f, 2.0f, 1.0f));
4126     float[4] correct = [3.0f, 4.0f, 0, 0];
4127     assert(R.array == correct);
4128 }
4129 
4130 /// Shift `v` right by `bytes` bytes while shifting in zeros.
4131 /// #BONUS
4132 __m128d _mm_srli_pd(ubyte bytes)(__m128d v) pure @safe
4133 {
4134     return cast(__m128d) _mm_srli_si128!bytes(cast(__m128i)v);
4135 }
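// Sanity check (this intrinsic had no test): an 8-byte shift moves the upper double into the lower lane.
unittest
{
    __m128d R = _mm_srli_pd!8(_mm_setr_pd(2.0, 3.0));
    double[2] correct = [3.0, 0.0];
    assert(R.array == correct);
}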
4136 
4137 /// Store 128-bits (composed of 2 packed double-precision (64-bit) floating-point elements) from `a` into memory. 
4138 /// `mem_addr` must be aligned on a 16-byte boundary or a general-protection exception may be generated.
4139 void _mm_store_pd (double* mem_addr, __m128d a) pure @trusted
4140 {
4141     pragma(inline, true);
4142     __m128d* aligned = cast(__m128d*)mem_addr;
4143     *aligned = a;
4144 }
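// Sanity check; a __m128d local provides the required 16-byte alignment.
unittest
{
    __m128d R;
    _mm_store_pd(cast(double*)&R, _mm_setr_pd(1.5, -2.0));
    double[2] correct = [1.5, -2.0];
    assert(R.array == correct);
}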
4145 
4146 /// Store the lower double-precision (64-bit) floating-point element from `a` into 2 contiguous elements in memory. 
4147 /// `mem_addr` must be aligned on a 16-byte boundary or a general-protection exception may be generated.
4148 void _mm_store_pd1 (double* mem_addr, __m128d a) pure @trusted
4149 {
4150     __m128d* aligned = cast(__m128d*)mem_addr;
4151     __m128d r;
4152     r.ptr[0] = a.array[0];
4153     r.ptr[1] = a.array[0];
4154     *aligned = r;
4155 }
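// Sanity check: the lower element is duplicated into both memory slots.
unittest
{
    __m128d R;
    _mm_store_pd1(cast(double*)&R, _mm_setr_pd(2.5, -3.0));
    double[2] correct = [2.5, 2.5];
    assert(R.array == correct);
}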
4156 
4157 /// Store the lower double-precision (64-bit) floating-point element from `a` into memory. `mem_addr` does not need to 
4158 /// be aligned on any particular boundary.
4159 void _mm_store_sd (double* mem_addr, __m128d a) pure @safe
4160 {
4161     pragma(inline, true);
4162     *mem_addr = a.array[0];
4163 }
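// Sanity check: only the lower element is stored.
unittest
{
    double R;
    _mm_store_sd(&R, _mm_setr_pd(2.5, -3.0));
    assert(R == 2.5);
}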
4164 
4165 /// Store 128-bits of integer data from `a` into memory. `mem_addr` must be aligned on a 16-byte boundary or a 
4166 /// general-protection exception may be generated.
4167 void _mm_store_si128 (__m128i* mem_addr, __m128i a) pure @safe
4168 {
4169     pragma(inline, true);
4170     *mem_addr = a;
4171 }
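// Sanity check; a __m128i local provides the required 16-byte alignment.
unittest
{
    __m128i R;
    _mm_store_si128(&R, _mm_setr_epi32(1, 2, 3, 4));
    int[4] correct = [1, 2, 3, 4];
    assert(R.array == correct);
}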
4172 
4173 alias _mm_store1_pd = _mm_store_pd1; ///
4174 
4175 /// Store the upper double-precision (64-bit) floating-point element from `a` into memory.
4176 void _mm_storeh_pd (double* mem_addr, __m128d a) pure @safe
4177 {
4178     pragma(inline, true);
4179     *mem_addr = a.array[1];
4180 }
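// Sanity check: the upper element is stored.
unittest
{
    double R;
    _mm_storeh_pd(&R, _mm_setr_pd(2.5, -3.0));
    assert(R == -3.0);
}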
4181 
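/// Store 64-bit integer from the first element of `a` into memory.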
4182 // Note: `mem_addr` doesn't have to actually be aligned, which breaks
// expectations from the user point of view. This problem also exists in C++.
4184 void _mm_storel_epi64 (__m128i* mem_addr, __m128i a) pure @safe
4185 {
4186     pragma(inline, true);
4187     long* dest = cast(long*)mem_addr;
4188     long2 la = cast(long2)a;
4189     *dest = la.array[0];
4190 }
4191 unittest
4192 {
4193     long[3] A = [1, 2, 3];
4194     _mm_storel_epi64(cast(__m128i*)(&A[1]), _mm_set_epi64x(0x1_0000_0000, 0x1_0000_0000));
4195     long[3] correct = [1, 0x1_0000_0000, 3];
4196     assert(A == correct);
4197 }
4198 
4199 /// Store the lower double-precision (64-bit) floating-point element from `a` into memory.
4200 void _mm_storel_pd (double* mem_addr, __m128d a) pure @safe
4201 {
4202     pragma(inline, true);
4203     *mem_addr = a.array[0];
4204 }
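// Sanity check: the lower element is stored.
unittest
{
    double R;
    _mm_storel_pd(&R, _mm_setr_pd(2.5, -3.0));
    assert(R == 2.5);
}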
4205 
4206 /// Store 2 double-precision (64-bit) floating-point elements from `a` into memory in reverse order. `mem_addr` must be 
4207 /// aligned on a 16-byte boundary or a general-protection exception may be generated.
void _mm_storer_pd (double* mem_addr, __m128d a) pure @trusted
4209 {
4210     __m128d* aligned = cast(__m128d*)mem_addr;
4211     *aligned = shufflevector!(double2, 1, 0)(a, a);
4212 }
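// Sanity check: elements land in memory in reversed order.
unittest
{
    __m128d R;
    _mm_storer_pd(cast(double*)&R, _mm_setr_pd(1.5, -2.0));
    double[2] correct = [-2.0, 1.5];
    assert(R.array == correct);
}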
4213 
4214 /// Store 128-bits (composed of 2 packed double-precision (64-bit) floating-point elements) from `a` into memory. 
4215 /// `mem_addr` does not need to be aligned on any particular boundary.
4216 void _mm_storeu_pd (double* mem_addr, __m128d a) pure @safe
4217 {
4218     pragma(inline, true);
4219     storeUnaligned!double2(a, mem_addr);
4220 }
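// Sanity check: the destination pointer needs no particular alignment.
unittest
{
    double[3] R = [0.0, 0.0, 0.0];
    _mm_storeu_pd(&R[1], _mm_setr_pd(1.5, -2.0));
    double[3] correct = [0.0, 1.5, -2.0];
    assert(R == correct);
}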
4221 
4222 /// Store 128-bits of integer data from `a` into memory. `mem_addr` does not need to be aligned on any particular 
4223 /// boundary.
4224 void _mm_storeu_si128 (__m128i* mem_addr, __m128i a) pure @safe
4225 {
4226     pragma(inline, true);
4227     storeUnaligned!__m128i(a, cast(int*)mem_addr);
4228 }
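// Sanity check: the destination pointer needs no particular alignment.
unittest
{
    int[5] R = [0, 0, 0, 0, 0];
    _mm_storeu_si128(cast(__m128i*)(&R[1]), _mm_setr_epi32(1, 2, 3, 4));
    int[5] correct = [0, 1, 2, 3, 4];
    assert(R == correct);
}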
4229 
4230 /// Store 32-bit integer from the first element of `a` into memory. 
4231 /// `mem_addr` does not need to be aligned on any particular boundary.
4232 void _mm_storeu_si32 (void* mem_addr, __m128i a) pure @trusted
4233 {
4234     pragma(inline, true);
4235     int* dest = cast(int*)mem_addr;
4236     *dest = a.array[0];
4237 }
4238 unittest
4239 {
4240     int[2] arr = [-24, 12];
4241     _mm_storeu_si32(&arr[1], _mm_setr_epi32(-1, -2, -6, -7));
4242     assert(arr == [-24, -1]);
4243 }
4244 
4245 /// Store 128-bits (composed of 2 packed double-precision (64-bit) floating-point elements)
4246 /// from a into memory using a non-temporal memory hint. mem_addr must be aligned on a 16-byte
4247 /// boundary or a general-protection exception may be generated.
4248 void _mm_stream_pd (double* mem_addr, __m128d a)
4249 {
4250     // BUG see `_mm_stream_ps` for an explanation why we don't implement non-temporal moves
4251     __m128d* dest = cast(__m128d*)mem_addr;
4252     *dest = a;
4253 }
4254 
4255 /// Store 128-bits of integer data from a into memory using a non-temporal memory hint.
4256 /// mem_addr must be aligned on a 16-byte boundary or a general-protection exception
4257 /// may be generated.
4258 void _mm_stream_si128 (__m128i* mem_addr, __m128i a)
4259 {
4260     // BUG see `_mm_stream_ps` for an explanation why we don't implement non-temporal moves
4261     __m128i* dest = cast(__m128i*)mem_addr;
4262     *dest = a;
4263 }
4264 
4265 /// Store 32-bit integer a into memory using a non-temporal hint to minimize cache
4266 /// pollution. If the cache line containing address mem_addr is already in the cache,
4267 /// the cache will be updated.
4268 void _mm_stream_si32 (int* mem_addr, int a)
4269 {
4270     // BUG see `_mm_stream_ps` for an explanation why we don't implement non-temporal moves
4271     *mem_addr = a;
4272 }
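// Sanity check: semantically a plain store.
unittest
{
    int a;
    _mm_stream_si32(&a, 42);
    assert(a == 42);
}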
4273 
4274 /// Store 64-bit integer a into memory using a non-temporal hint to minimize
4275 /// cache pollution. If the cache line containing address mem_addr is already
4276 /// in the cache, the cache will be updated.
4277 void _mm_stream_si64 (long* mem_addr, long a)
4278 {
4279     // BUG See `_mm_stream_ps` for an explanation why we don't implement non-temporal moves
4280     *mem_addr = a;
4281 }
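// Sanity check: semantically a plain store.
unittest
{
    long a;
    _mm_stream_si64(&a, 42);
    assert(a == 42);
}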
4282 
4283 /// Subtract packed 16-bit integers in `b` from packed 16-bit integers in `a`.
4284 __m128i _mm_sub_epi16(__m128i a, __m128i b) pure @safe
4285 {
4286     pragma(inline, true);
4287     return cast(__m128i)(cast(short8)a - cast(short8)b);
4288 }
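// Sanity check with no overflow involved.
unittest
{
    __m128i A = _mm_setr_epi16(4, 8, 13, -7, -1, 0, 9, 77);
    __m128i B = _mm_setr_epi16(1, 2,  3,  4,  5, 6, 7,  8);
    short8 R = cast(short8) _mm_sub_epi16(A, B);
    short[8] correct = [3, 6, 10, -11, -6, -6, 2, 69];
    assert(R.array == correct);
}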
4289 
4290 /// Subtract packed 32-bit integers in `b` from packed 32-bit integers in `a`.
4291 __m128i _mm_sub_epi32(__m128i a, __m128i b) pure @safe
4292 {
4293     pragma(inline, true);
4294     return cast(__m128i)(cast(int4)a - cast(int4)b);
4295 }
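// Sanity check.
unittest
{
    __m128i A = _mm_setr_epi32(-7, -1, 0, 9);
    __m128i B = _mm_setr_epi32( 1,  2, 3, 4);
    int4 R = cast(int4) _mm_sub_epi32(A, B);
    int[4] correct = [-8, -3, -3, 5];
    assert(R.array == correct);
}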
4296 
4297 /// Subtract packed 64-bit integers in `b` from packed 64-bit integers in `a`.
4298 __m128i _mm_sub_epi64(__m128i a, __m128i b) pure @safe
4299 {
4300     pragma(inline, true);
4301     return cast(__m128i)(cast(long2)a - cast(long2)b);
4302 }
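// Sanity check.
unittest
{
    __m128i A = _mm_setr_epi64(-1, 0);
    __m128i B = _mm_setr_epi64( 1, 8);
    long2 R = cast(long2) _mm_sub_epi64(A, B);
    long[2] correct = [-2, -8];
    assert(R.array == correct);
}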
4303 
4304 /// Subtract packed 8-bit integers in `b` from packed 8-bit integers in `a`.
4305 __m128i _mm_sub_epi8(__m128i a, __m128i b) pure @safe
4306 {
4307     pragma(inline, true);
4308     return cast(__m128i)(cast(byte16)a - cast(byte16)b);
4309 }
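// Sanity check.
unittest
{
    __m128i A = _mm_setr_epi8(4, 8, 13, -7, -1, 0, 9, 77, 4, 8, 13, -7, -1, 0, 9, 78);
    __m128i B = _mm_set1_epi8(1);
    byte16 R = cast(byte16) _mm_sub_epi8(A, B);
    byte[16] correct = [3, 7, 12, -8, -2, -1, 8, 76, 3, 7, 12, -8, -2, -1, 8, 77];
    assert(R.array == correct);
}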
4310 
4311 /// Subtract packed double-precision (64-bit) floating-point elements in `b` from packed double-precision (64-bit) 
4312 /// floating-point elements in `a`.
4313 __m128d _mm_sub_pd(__m128d a, __m128d b) pure @safe
4314 {
4315     pragma(inline, true);
4316     return a - b;
4317 }
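// Sanity check; the differences are exactly representable in double precision.
unittest
{
    __m128d A = _mm_setr_pd(4000.0, -8.0);
    __m128d B = _mm_setr_pd(  12.0, -8450.0);
    __m128d R = _mm_sub_pd(A, B);
    double[2] correct = [3988.0, 8442.0];
    assert(R.array == correct);
}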
4318 
4319 /// Subtract the lower double-precision (64-bit) floating-point element in `b` from the lower double-precision (64-bit) 
4320 /// floating-point element in `a`, store that in the lower element of result, and copy the upper element from `a` to the
4321 /// upper element of result.
4322 __m128d _mm_sub_sd(__m128d a, __m128d b) pure @trusted
4323 {
4324     version(DigitalMars)
4325     {
4326         // Work-around for https://issues.dlang.org/show_bug.cgi?id=19599
4327         // Note that this is unneeded since DMD >= 2.094.0 at least, haven't investigated again
4328         asm pure nothrow @nogc @trusted { nop;}
4329         a[0] = a[0] - b[0];
4330         return a;
4331     }
4332     else static if (GDC_with_SSE2)
4333     {
4334         return __builtin_ia32_subsd(a, b);
4335     }
4336     else
4337     {
4338         a.ptr[0] -= b.array[0];
4339         return a;
4340     }
4341 }
4342 unittest
4343 {
4344     __m128d a = [1.5, -2.0];
4345     a = _mm_sub_sd(a, a);
4346     assert(a.array == [0.0, -2.0]);
4347 }
4348 
4349 /// Subtract 64-bit integer `b` from 64-bit integer `a`.
4350 __m64 _mm_sub_si64 (__m64 a, __m64 b) pure @safe
4351 {
4352     pragma(inline, true);
4353     return a - b;
4354 }
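// Sanity check, initializing __m64 the same way as the _mm_set1_epi64 test.
unittest
{
    __m64 A, B;
    A.ptr[0] = 8;
    B.ptr[0] = 3;
    __m64 R = _mm_sub_si64(A, B);
    assert(R.array[0] == 5);
}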
4355 
/// Subtract packed 16-bit signed integers in `b` from packed 16-bit signed integers in `a` using signed saturation.
4357 __m128i _mm_subs_epi16(__m128i a, __m128i b) pure @trusted
4358 {
4359     version(LDC)
4360     {
4361         static if (__VERSION__ >= 2085) // saturation x86 intrinsics disappeared in LLVM 8
4362         {
4363             // Generates PSUBSW since LDC 1.15 -O0
4366             enum prefix = `declare <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> %a, <8 x i16> %b)`;
4367             enum ir = `
4368                 %r = call <8 x i16> @llvm.ssub.sat.v8i16( <8 x i16> %0, <8 x i16> %1)
4369                 ret <8 x i16> %r`;
4370             return cast(__m128i) LDCInlineIREx!(prefix, ir, "", short8, short8, short8)(cast(short8)a, cast(short8)b);
4371         }
4372         else static if (LDC_with_ARM) // Raspberry ships with LDC 1.12, no saturation 
4373         {
4375             short[8] res;
4376             short8 sa = cast(short8)a;
4377             short8 sb = cast(short8)b;
4378             foreach(i; 0..8)
4379                 res[i] = saturateSignedIntToSignedShort(sa.array[i] - sb.array[i]);
4380             return _mm_loadu_si128(cast(int4*)res.ptr);
4381         }
4382         else static if (LDC_with_SSE2)
4383         {
4384             return cast(__m128i) __builtin_ia32_psubsw128(cast(short8) a, cast(short8) b);
4385         }
4386         else
4387             static assert(false);
4388     }
4389     else static if (GDC_with_SSE2)
4390     {
4391         return cast(__m128i) __builtin_ia32_psubsw128(cast(short8) a, cast(short8) b);
4392     }
4393     else
4394     {
4395         short[8] res;
4396         short8 sa = cast(short8)a;
4397         short8 sb = cast(short8)b;
4398         foreach(i; 0..8)
4399             res.ptr[i] = saturateSignedIntToSignedShort(sa.array[i] - sb.array[i]);
4400         return _mm_loadu_si128(cast(int4*)res.ptr);
4401     }
4402 }
4403 unittest
4404 {
4405     short8 res = cast(short8) _mm_subs_epi16(_mm_setr_epi16(32760, -32760, 5, 4, 3, 2, 1, 0),
4406                                              _mm_setr_epi16(-10  ,     16, 5, 4, 3, 2, 1, 0));
4407     static immutable short[8] correctResult =              [32767, -32768, 0, 0, 0, 0, 0, 0];
4408     assert(res.array == correctResult);
4409 }
4410 
/// Subtract packed 8-bit signed integers in `b` from packed 8-bit signed integers in `a` using signed saturation.
4412 __m128i _mm_subs_epi8(__m128i a, __m128i b) pure @trusted
4413 {
4414     version(LDC)
4415     {
4416         static if (__VERSION__ >= 2085) // saturation x86 intrinsics disappeared in LLVM 8
4417         {
4418             // x86: Generates PSUBSB since LDC 1.15 -O0
4419             // ARM: Generates sqsub.16b since LDC 1.21 -O0
4420             enum prefix = `declare <16 x i8> @llvm.ssub.sat.v16i8(<16 x i8> %a, <16 x i8> %b)`;
4421             enum ir = `
4422                 %r = call <16 x i8> @llvm.ssub.sat.v16i8( <16 x i8> %0, <16 x i8> %1)
4423                 ret <16 x i8> %r`;
4424             return cast(__m128i) LDCInlineIREx!(prefix, ir, "", byte16, byte16, byte16)(cast(byte16)a, cast(byte16)b);
4425         }
4426         else static if (LDC_with_ARM) // Raspberry ships with LDC 1.12, no saturation 
4427         {
4428             byte[16] res;
4429             byte16 sa = cast(byte16)a;
4430             byte16 sb = cast(byte16)b;
4431             foreach(i; 0..16)
4432                 res[i] = saturateSignedWordToSignedByte(sa.array[i] - sb.array[i]);
4433             return _mm_loadu_si128(cast(int4*)res.ptr);
4434         }
4435         else static if (LDC_with_SSE2)
4436         {
4437             return cast(__m128i) __builtin_ia32_psubsb128(cast(byte16) a, cast(byte16) b);
4438         }
4439         else
4440             static assert(false);
4441     }
4442     else static if (GDC_with_SSE2)
4443     {
4444         return cast(__m128i) __builtin_ia32_psubsb128(cast(ubyte16) a, cast(ubyte16) b);
4445     }
4446     else
4447     {
4448         byte[16] res;
4449         byte16 sa = cast(byte16)a;
4450         byte16 sb = cast(byte16)b;
4451         foreach(i; 0..16)
4452             res[i] = saturateSignedWordToSignedByte(sa.array[i] - sb.array[i]);
4453         return _mm_loadu_si128(cast(int4*)res.ptr);
4454     }
4455 }
4456 unittest
4457 {
4458     byte16 res = cast(byte16) _mm_subs_epi8(_mm_setr_epi8(-128, 127, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0),
4459                                             _mm_setr_epi8(  15, -14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0));
4460     static immutable byte[16] correctResult            = [-128, 127,  0,  0,  0,  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0];
4461     assert(res.array == correctResult);
4462 }
4463 
/// Subtract packed 16-bit unsigned integers in `b` from packed 16-bit unsigned integers in `a` using unsigned saturation.
4465 __m128i _mm_subs_epu16(__m128i a, __m128i b) pure @trusted
4466 {
4467     version(LDC)
4468     {
4469         static if (__VERSION__ >= 2085) // saturation x86 intrinsics disappeared in LLVM 8
4470         {
4471             // x86: Generates PSUBUSW since LDC 1.15 -O0
4472             // ARM: Generates uqsub.8h since LDC 1.21 -O0
4473             enum prefix = `declare <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> %a, <8 x i16> %b)`;
4474             enum ir = `
4475                 %r = call <8 x i16> @llvm.usub.sat.v8i16( <8 x i16> %0, <8 x i16> %1)
4476                 ret <8 x i16> %r`;
4477             return cast(__m128i) LDCInlineIREx!(prefix, ir, "", short8, short8, short8)(cast(short8)a, cast(short8)b);
4478         }
4479         else static if (LDC_with_ARM) // Raspberry ships with LDC 1.12, no saturation 
4480         {
4481             short[8] res;
4482             short8 sa = cast(short8)a;
4483             short8 sb = cast(short8)b;
4484             foreach(i; 0..8)
4485             {
4486                 int sum = cast(ushort)(sa.array[i]) - cast(ushort)(sb.array[i]);
4487                 res[i] = saturateSignedIntToUnsignedShort(sum);
4488             }
4489             return _mm_loadu_si128(cast(int4*)res.ptr);
4490         }
4491         else static if (LDC_with_SSE2)
4492         {
            return cast(__m128i) __builtin_ia32_psubusw128(cast(short8)a, cast(short8)b);
4494         }
4495         else 
4496             static assert(false);
4497     }
4498     else static if (GDC_with_SSE2)
4499     {
4500         return cast(__m128i) __builtin_ia32_psubusw128(cast(short8)a, cast(short8)b);
4501     }
4502     else
4503     {
4504         short[8] res;
4505         short8 sa = cast(short8)a;
4506         short8 sb = cast(short8)b;
4507         foreach(i; 0..8)
4508         {
4509             int sum = cast(ushort)(sa.array[i]) - cast(ushort)(sb.array[i]);
4510             res[i] = saturateSignedIntToUnsignedShort(sum);
4511         }
4512         return _mm_loadu_si128(cast(int4*)res.ptr);
4513     }
4514 }
4515 unittest
4516 {
4517     short8 R = cast(short8) _mm_subs_epu16(_mm_setr_epi16(cast(short)65534,  1, 5, 4, 3, 2, 1, 0),
4518                                            _mm_setr_epi16(cast(short)65535, 16, 4, 4, 3, 0, 1, 0));
4519     static immutable short[8] correct =                  [               0,  0, 1, 0, 0, 2, 0, 0];
4520     assert(R.array == correct);
4521 }
4522 
/// Subtract packed 8-bit unsigned integers in `b` from packed 8-bit unsigned integers in `a` using unsigned saturation.
4524 __m128i _mm_subs_epu8(__m128i a, __m128i b) pure @trusted
4525 {
4526     version(LDC)
4527     {
4528         static if (__VERSION__ >= 2085) // saturation x86 intrinsics disappeared in LLVM 8
4529         {
4530             // x86: Generates PSUBUSB since LDC 1.15 -O0
4531             // ARM: Generates uqsub.16b since LDC 1.21 -O0
4532             enum prefix = `declare <16 x i8> @llvm.usub.sat.v16i8(<16 x i8> %a, <16 x i8> %b)`;
4533             enum ir = `
4534                 %r = call <16 x i8> @llvm.usub.sat.v16i8( <16 x i8> %0, <16 x i8> %1)
4535                 ret <16 x i8> %r`;
4536             return cast(__m128i) LDCInlineIREx!(prefix, ir, "", byte16, byte16, byte16)(cast(byte16)a, cast(byte16)b);
4537         }
4538         else static if (LDC_with_ARM) // Raspberry ships with LDC 1.12, no saturation
4539         {
            ubyte[16] res;
            byte16 sa = cast(byte16)a;
            byte16 sb = cast(byte16)b;
            foreach(i; 0..16)
                res[i] = saturateSignedWordToUnsignedByte(cast(ubyte)(sa.array[i]) - cast(ubyte)(sb.array[i]));
            return _mm_loadu_si128(cast(int4*)res.ptr);
4550         }
4551         else static if (LDC_with_SSE2)
4552         {
            return cast(__m128i) __builtin_ia32_psubusb128(cast(byte16)a, cast(byte16)b);
4554         }
4555         else 
4556             static assert(false);
4557     }
4558     else static if (GDC_with_SSE2)
4559     {
4560         return cast(__m128i) __builtin_ia32_psubusb128(cast(ubyte16) a, cast(ubyte16) b);
4561     }
4562     else
4563     {
4564         ubyte[16] res;
4565         byte16 sa = cast(byte16)a;
4566         byte16 sb = cast(byte16)b;
4567         foreach(i; 0..16)
4568             res[i] = saturateSignedWordToUnsignedByte(cast(ubyte)(sa.array[i]) - cast(ubyte)(sb.array[i]));
4569         return _mm_loadu_si128(cast(int4*)res.ptr);
4570     }
4571 }
4572 unittest
4573 {
4574     byte16 res = cast(byte16) _mm_subs_epu8(_mm_setr_epi8(cast(byte)254, 127, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0),
4575                                             _mm_setr_epi8(cast(byte)255, 120, 14, 42, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0));
4576     static immutable byte[16] correctResult =            [            0,   7,  0,  0,  0,  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0];
4577     assert(res.array == correctResult);
4578 }
4579 
// Note: the only difference between the `ucomi` and `comi` intrinsics is their
//       signalling behaviour on quiet NaNs. Aliasing them is thus technically
//       incorrect, but wanting to differentiate between qNaN and sNaN and then
//       treat them differently on purpose seems extremely rare.
4584 alias _mm_ucomieq_sd = _mm_comieq_sd; ///
4585 alias _mm_ucomige_sd = _mm_comige_sd; ///
4586 alias _mm_ucomigt_sd = _mm_comigt_sd; ///
4587 alias _mm_ucomile_sd = _mm_comile_sd; ///
4588 alias _mm_ucomilt_sd = _mm_comilt_sd; ///
4589 alias _mm_ucomineq_sd = _mm_comineq_sd; ///
4590 
4591 /// Return vector of type `__m128d` with undefined elements.
4592 __m128d _mm_undefined_pd() pure @safe
4593 {
4594     pragma(inline, true);
4595     __m128d result = void;
4596     return result;
4597 }
4598 
4599 /// Return vector of type `__m128i` with undefined elements.
4600 __m128i _mm_undefined_si128() pure @safe
4601 {
4602     pragma(inline, true);
4603     __m128i result = void;
4604     return result;
4605 }
4606 
4607 /// Unpack and interleave 16-bit integers from the high half of `a` and `b`.
4608 __m128i _mm_unpackhi_epi16 (__m128i a, __m128i b) pure @safe
4609 {
4610     static if (GDC_with_SSE2)
4611     {
4612         return cast(__m128i) __builtin_ia32_punpckhwd128(cast(short8) a, cast(short8) b);
4613     }
4614     else static if (DMD_with_32bit_asm)
4615     {
4616         asm pure nothrow @nogc @trusted
4617         {
4618             movdqu XMM0, a;
4619             movdqu XMM1, b;
4620             punpckhwd XMM0, XMM1;
4621             movdqu a, XMM0;
4622         }
4623         return a;
4624     }
4625     else
4626     {
4627         return cast(__m128i) shufflevector!(short8, 4, 12, 5, 13, 6, 14, 7, 15)
4628                                            (cast(short8)a, cast(short8)b);
4629     }
4630 }
4631 unittest
4632 {
4633     __m128i A = _mm_setr_epi16(4,   5,  6,  7,  8,  9, 10, 11);
4634     __m128i B = _mm_setr_epi16(12, 13, 14, 15, 16, 17, 18, 19);
4635     short8 C = cast(short8)(_mm_unpackhi_epi16(A, B));
4636     short[8] correct = [8, 16, 9, 17, 10, 18, 11, 19];
4637     assert(C.array == correct);
4638 }
4639 
4640 /// Unpack and interleave 32-bit integers from the high half of `a` and `b`.
4641 __m128i _mm_unpackhi_epi32 (__m128i a, __m128i b) pure @trusted
4642 {
4643     static if (GDC_with_SSE2)
4644     {
4645         return __builtin_ia32_punpckhdq128(a, b);
4646     }
4647     else version(DigitalMars)
4648     {
4649         __m128i r;
4650         r.ptr[0] = a.array[2];
4651         r.ptr[1] = b.array[2];
4652         r.ptr[2] = a.array[3];
4653         r.ptr[3] = b.array[3];
4654         return r;
4655     }
4656     else
4657     {
4658         return shufflevector!(int4, 2, 6, 3, 7)(cast(int4)a, cast(int4)b);
4659     }
4660 }
4661 unittest
4662 {
4663     __m128i A = _mm_setr_epi32(1, 2, 3, 4);
4664     __m128i B = _mm_setr_epi32(5, 6, 7, 8);
4665     __m128i C = _mm_unpackhi_epi32(A, B);
4666     int[4] correct = [3, 7, 4, 8];
4667     assert(C.array == correct);
4668 }
4669 
4670 /// Unpack and interleave 64-bit integers from the high half of `a` and `b`.
4671 __m128i _mm_unpackhi_epi64 (__m128i a, __m128i b) pure @trusted
4672 {
4673     static if (GDC_with_SSE2)
4674     {
4675         return cast(__m128i) __builtin_ia32_punpckhqdq128(cast(long2) a, cast(long2) b);
4676     }
4677     else
4678     {
4679         __m128i r = cast(__m128i)b;
4680         r[0] = a[2];
4681         r[1] = a[3];
4682         return r; 
4683     }
4684 }
4685 unittest // Issue #36
4686 {
4687     __m128i A = _mm_setr_epi64(0x22222222_22222222, 0x33333333_33333333);
4688     __m128i B = _mm_setr_epi64(0x44444444_44444444, 0x55555555_55555555);
4689     long2 C = cast(long2)(_mm_unpackhi_epi64(A, B));
4690     long[2] correct = [0x33333333_33333333, 0x55555555_55555555];
4691     assert(C.array == correct);
4692 }
4693 
4694 /// Unpack and interleave 8-bit integers from the high half of `a` and `b`.
4695 __m128i _mm_unpackhi_epi8 (__m128i a, __m128i b) pure @safe
4696 {
4697     static if (GDC_with_SSE2)
4698     {
4699         return cast(__m128i) __builtin_ia32_punpckhbw128(cast(ubyte16)a, cast(ubyte16)b);
4700     }
4701     else static if (DMD_with_32bit_asm)
4702     {
4703         asm pure nothrow @nogc @trusted
4704         {
4705             movdqu XMM0, a;
4706             movdqu XMM1, b;
4707             punpckhbw XMM0, XMM1;
4708             movdqu a, XMM0;
4709         }
4710         return a;
4711     }
4712     else
4713     {
4714         return cast(__m128i)shufflevector!(byte16, 8,  24,  9, 25, 10, 26, 11, 27,
4715                                                    12, 28, 13, 29, 14, 30, 15, 31)
4716                                                    (cast(byte16)a, cast(byte16)b);
4717     }
4718 }
4719 unittest
4720 {
4721     __m128i A = _mm_setr_epi8( 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15);
4722     __m128i B = _mm_setr_epi8(16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31);
4723     byte16 C = cast(byte16) _mm_unpackhi_epi8(A, B);
4724     byte[16] correct = [8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31];
4725     assert(C.array == correct);
4726 }
4727 
4728 /// Unpack and interleave double-precision (64-bit) floating-point elements from the high half of `a` and `b`.
4729 __m128d _mm_unpackhi_pd (__m128d a, __m128d b) pure @safe
4730 {
4731     static if (GDC_with_SSE2)
4732     {
4733         return __builtin_ia32_unpckhpd(a, b);
4734     }
4735     else
4736     {
4737         return shufflevector!(__m128d, 1, 3)(a, b);
4738     }
4739 }
4740 unittest
4741 {
4742     __m128d A = _mm_setr_pd(4.0, 6.0);
4743     __m128d B = _mm_setr_pd(7.0, 9.0);
4744     __m128d C = _mm_unpackhi_pd(A, B);
4745     double[2] correct = [6.0, 9.0];
4746     assert(C.array == correct);
4747 }
4748 
4749 /// Unpack and interleave 16-bit integers from the low half of `a` and `b`.
4750 __m128i _mm_unpacklo_epi16 (__m128i a, __m128i b) pure @safe
4751 {
4752     static if (GDC_with_SSE2)
4753     {
4754         return cast(__m128i) __builtin_ia32_punpcklwd128(cast(short8) a, cast(short8) b);
4755     }
4756     else static if (DMD_with_32bit_asm)
4757     {
4758         asm pure nothrow @nogc @trusted
4759         {
4760             movdqu XMM0, a;
4761             movdqu XMM1, b;
4762             punpcklwd XMM0, XMM1;
4763             movdqu a, XMM0;
4764         }
4765         return a;
4766     }
4767     else
4768     {
4769         return cast(__m128i) shufflevector!(short8, 0, 8, 1, 9, 2, 10, 3, 11)
4770                                            (cast(short8)a, cast(short8)b);
4771     }
4772 }
4773 unittest
4774 {
4775     __m128i A = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
4776     __m128i B = _mm_setr_epi16(8, 9, 10, 11, 12, 13, 14, 15);
4777     short8 C = cast(short8) _mm_unpacklo_epi16(A, B);
4778     short[8] correct = [0, 8, 1, 9, 2, 10, 3, 11];
4779     assert(C.array == correct);
4780 }
4781 
4782 /// Unpack and interleave 32-bit integers from the low half of `a` and `b`.
4783 __m128i _mm_unpacklo_epi32 (__m128i a, __m128i b) pure @trusted
4784 {
4785     static if (GDC_with_SSE2)
4786     {
4787         return __builtin_ia32_punpckldq128(a, b);
4788     }
4789     else version(DigitalMars)
4790     {
4791         __m128i r;
4792         r.ptr[0] = a.array[0];
4793         r.ptr[1] = b.array[0];
4794         r.ptr[2] = a.array[1];
4795         r.ptr[3] = b.array[1];
4796         return r;
4797     }
4798     else
4799     {
4800         return shufflevector!(int4, 0, 4, 1, 5)(cast(int4)a, cast(int4)b);
4801     }
4802 }
4803 unittest
4804 {
4805     __m128i A = _mm_setr_epi32(1, 2, 3, 4);
4806     __m128i B = _mm_setr_epi32(5, 6, 7, 8);
4807     __m128i C = _mm_unpacklo_epi32(A, B);
4808     int[4] correct = [1, 5, 2, 6];
4809     assert(C.array == correct);
4810 }
4811 
4812 /// Unpack and interleave 64-bit integers from the low half of `a` and `b`.
4813 __m128i _mm_unpacklo_epi64 (__m128i a, __m128i b) pure @trusted
4814 {
4815     static if (GDC_with_SSE2)
4816     {
4817         return cast(__m128i) __builtin_ia32_punpcklqdq128(cast(long2) a, cast(long2) b);
4818     }
4819     else
4820     {
4821         long2 lA = cast(long2)a;
4822         long2 lB = cast(long2)b;
4823         long2 R;
4824         R.ptr[0] = lA.array[0];
4825         R.ptr[1] = lB.array[0];
4826         return cast(__m128i)R;
4827     }
4828 }
4829 unittest // Issue #36
4830 {
4831     __m128i A = _mm_setr_epi64(0x22222222_22222222, 0x33333333_33333333);
4832     __m128i B = _mm_setr_epi64(0x44444444_44444444, 0x55555555_55555555);
4833     long2 C = cast(long2)(_mm_unpacklo_epi64(A, B));
4834     long[2] correct = [0x22222222_22222222, 0x44444444_44444444];
4835     assert(C.array == correct);
4836 }
4837 
4838 /// Unpack and interleave 8-bit integers from the low half of `a` and `b`.
4839 __m128i _mm_unpacklo_epi8 (__m128i a, __m128i b) pure @safe
4840 {
4841     static if (GDC_with_SSE2)
4842     {
4843         return cast(__m128i) __builtin_ia32_punpcklbw128(cast(ubyte16) a, cast(ubyte16) b);
4844     }
4845     else static if (DMD_with_32bit_asm)
4846     {
4847         asm pure nothrow @nogc @trusted
4848         {
4849             movdqu XMM0, a;
4850             movdqu XMM1, b;
4851             punpcklbw XMM0, XMM1;
4852             movdqu a, XMM0;
4853         }
4854         return a;
4855     }
4856     else
4857     {
4858         return cast(__m128i) shufflevector!(byte16, 0, 16, 1, 17, 2, 18, 3, 19,
4859                                                     4, 20, 5, 21, 6, 22, 7, 23)
4860                                            (cast(byte16)a, cast(byte16)b);
4861     }
4862 }
4863 unittest
4864 {
4865     __m128i A = _mm_setr_epi8( 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15);
4866     __m128i B = _mm_setr_epi8(16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31);
4867     byte16 C = cast(byte16) _mm_unpacklo_epi8(A, B);
4868     byte[16] correct = [0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23];
4869     assert(C.array == correct);
4870 }
4871 
4872 /// Unpack and interleave double-precision (64-bit) floating-point elements from the low half of `a` and `b`.
4873 __m128d _mm_unpacklo_pd (__m128d a, __m128d b) pure @safe
4874 {
4875     static if (GDC_with_SSE2)
4876     {
4877         return __builtin_ia32_unpcklpd(a, b);
4878     }
4879     else
4880     {
4881         return shufflevector!(__m128d, 0, 2)(a, b);
4882     }
4883 }
4884 unittest
4885 {
4886     __m128d A = _mm_setr_pd(4.0, 6.0);
4887     __m128d B = _mm_setr_pd(7.0, 9.0);
4888     __m128d C = _mm_unpacklo_pd(A, B);
4889     double[2] correct = [4.0, 7.0];
4890     assert(C.array == correct);
4891 }
4892 
4893 /// Compute the bitwise XOR of packed double-precision (64-bit) floating-point elements in `a` and `b`.
4894 __m128d _mm_xor_pd (__m128d a, __m128d b) pure @safe
4895 {
4896     return cast(__m128d)(cast(__m128i)a ^ cast(__m128i)b);
4897 }
// TODO: add a unittest, then force inlining with pragma(inline, true)
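
// A minimal sketch of the missing coverage (not part of the upstream test
// suite): XOR against the sign-bit mask flips the sign of each lane.
unittest
{
    __m128d A = _mm_setr_pd(-4.0, 6.0);
    __m128d signMask = _mm_set1_pd(-0.0); // 0x8000_0000_0000_0000 in each lane
    __m128d R = _mm_xor_pd(A, signMask);
    double[2] correct = [4.0, -6.0];
    assert(R.array == correct);
}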
4899 
4900 /// Compute the bitwise XOR of 128 bits (representing integer data) in `a` and `b`.
4901 __m128i _mm_xor_si128 (__m128i a, __m128i b) pure @safe
4902 {
4903     return a ^ b;
4904 }
// TODO: add a unittest, then force inlining with pragma(inline, true)
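
// A minimal sketch of the missing coverage (not part of the upstream test
// suite).
unittest
{
    __m128i A = _mm_setr_epi32(0x0F0F_0F0F, 0x00FF_00FF,  0, -1);
    __m128i B = _mm_setr_epi32(0x5555_5555, 0x00FF_00FF, -1, -1);
    __m128i R = _mm_xor_si128(A, B);
    int[4] correct = [0x5A5A_5A5A, 0, -1, 0];
    assert(R.array == correct);
}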
4906 
unittest
{
    // 4D Euclidean distance. The horizontal sum is done by byte-shifting the
    // vector with _mm_srli_si128; there is no float byte-shift, hence the
    // round-trip casts through __m128i.
    float distance(float[4] a, float[4] b) nothrow @nogc
    {
        __m128 va = _mm_loadu_ps(a.ptr);
        __m128 vb = _mm_loadu_ps(b.ptr);
        __m128 diffSquared = _mm_sub_ps(va, vb);
        diffSquared = _mm_mul_ps(diffSquared, diffSquared);
        __m128 sum = _mm_add_ps(diffSquared, cast(__m128) _mm_srli_si128!8(cast(__m128i) diffSquared));
        sum = _mm_add_ps(sum, cast(__m128) _mm_srli_si128!4(cast(__m128i) sum));
        return _mm_cvtss_f32(_mm_sqrt_ss(sum));
    }
4919     assert(distance([0, 2, 0, 0], [0, 0, 0, 0]) == 2);
4920 }