1 /**
2 * SSE2 intrinsics. 
3 * https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#techs=SSE2
4 *
5 * Copyright: Copyright Guillaume Piolat 2016-2020, Stefanos Baziotis 2019.
6 * License:   $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
7 */
8 module inteli.emmintrin;
9 
10 public import inteli.types;
11 public import inteli.xmmintrin; // SSE2 includes SSE1
12 import inteli.mmx;
13 import inteli.internals;
14 
15 nothrow @nogc:
16 
17 
18 // SSE2 instructions
19 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#techs=SSE2
20 
21 /// Add packed 16-bit integers in `a` and `b`.
22 __m128i _mm_add_epi16 (__m128i a, __m128i b) pure @safe
23 {
24     pragma(inline, true);
25     return cast(__m128i)(cast(short8)a + cast(short8)b);
26 }
27 unittest
28 {
29     __m128i A = _mm_setr_epi16(4, 8, 13, -7, -1, 0, 9, 77);
30     short8 R = cast(short8) _mm_add_epi16(A, A);
31     short[8] correct = [8, 16, 26, -14, -2, 0, 18, 154];
32     assert(R.array == correct);
33 }
34 
35 /// Add packed 32-bit integers in `a` and `b`.
36 __m128i _mm_add_epi32 (__m128i a, __m128i b) pure @safe
37 {
38     pragma(inline, true);
39     return cast(__m128i)(cast(int4)a + cast(int4)b);
40 }
41 unittest
42 {
43     __m128i A = _mm_setr_epi32( -7, -1, 0, 9);
44     int4 R = _mm_add_epi32(A, A);
45     int[4] correct = [ -14, -2, 0, 18 ];
46     assert(R.array == correct);
47 }
48 
49 /// Add packed 64-bit integers in `a` and `b`.
50 __m128i _mm_add_epi64 (__m128i a, __m128i b) pure @safe
51 {
52     pragma(inline, true);
53     return cast(__m128i)(cast(long2)a + cast(long2)b);
54 }
55 unittest
56 {
57     __m128i A = _mm_setr_epi64(-1, 0x8000_0000_0000_0000);
58     long2 R = cast(long2) _mm_add_epi64(A, A);
59     long[2] correct = [ -2, 0 ];
60     assert(R.array == correct);
61 }
62 
63 /// Add packed 8-bit integers in `a` and `b`.
64 __m128i _mm_add_epi8 (__m128i a, __m128i b) pure @safe
65 {
66     pragma(inline, true);
67     return cast(__m128i)(cast(byte16)a + cast(byte16)b);
68 }
69 unittest
70 {
71     __m128i A = _mm_setr_epi8(4, 8, 13, -7, -1, 0, 9, 77, 4, 8, 13, -7, -1, 0, 9, 78);
72     byte16 R = cast(byte16) _mm_add_epi8(A, A);
73     byte[16] correct = [8, 16, 26, -14, -2, 0, 18, -102, 8, 16, 26, -14, -2, 0, 18, -100];
74     assert(R.array == correct);
75 }
76 
/// Add the lower double-precision (64-bit) floating-point element 
/// in `a` and `b`, store the result in the lower element of result, 
/// and copy the upper element from `a` to the upper element of result. 
80 __m128d _mm_add_sd(__m128d a, __m128d b) pure @safe
81 {
82     static if (GDC_with_SSE2)
83     {
84         return __builtin_ia32_addsd(a, b);
85     }
86     else version(DigitalMars)
87     {
88         // Work-around for https://issues.dlang.org/show_bug.cgi?id=19599
        // Note: this workaround seems unneeded since DMD >= 2.094.0, but it hasn't been re-investigated since.
90         asm pure nothrow @nogc @trusted { nop;}
91         a[0] = a[0] + b[0];
92         return a;
93     }
94     else
95     {
96         a[0] += b[0];
97         return a;
98     }
99 }
100 unittest
101 {
102     __m128d a = [1.5, -2.0];
103     a = _mm_add_sd(a, a);
104     assert(a.array == [3.0, -2.0]);
105 }
106 
107 /// Add packed double-precision (64-bit) floating-point elements in `a` and `b`.
108 __m128d _mm_add_pd (__m128d a, __m128d b) pure @safe
109 {
110     pragma(inline, true);
111     return a + b;
112 }
113 unittest
114 {
115     __m128d a = [1.5, -2.0];
116     a = _mm_add_pd(a, a);
117     assert(a.array == [3.0, -4.0]);
118 }
119 
120 /// Add 64-bit integers `a` and `b`.
121 __m64 _mm_add_si64 (__m64 a, __m64 b) pure @safe
122 {
123     pragma(inline, true);
124     return a + b;
125 }
126 
127 /// Add packed 16-bit integers in `a` and `b` using signed saturation.
128 __m128i _mm_adds_epi16(__m128i a, __m128i b) pure @trusted
129 {
130     static if (GDC_with_SSE2)
131     {
132         return cast(__m128i)__builtin_ia32_paddsw128(cast(short8)a, cast(short8)b);
133     }
134     else version(LDC)
135     {
136         static if (__VERSION__ >= 2085) // saturation x86 intrinsics disappeared in LLVM 8
137         {
138             // x86: Generates PADDSW since LDC 1.15 -O0
139             // ARM: Generates sqadd.8h since LDC 1.21 -O1, really bad in <= 1.20            
140             enum prefix = `declare <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> %a, <8 x i16> %b)`;
141             enum ir = `
142                 %r = call <8 x i16> @llvm.sadd.sat.v8i16( <8 x i16> %0, <8 x i16> %1)
143                 ret <8 x i16> %r`;
144             return cast(__m128i) LDCInlineIREx!(prefix, ir, "", short8, short8, short8)(cast(short8)a, cast(short8)b);
145         }
146         else static if (LDC_with_ARM) // Raspberry ships with LDC 1.12, no saturation 
147         {
148             short[8] res;
149             short8 sa = cast(short8)a;
150             short8 sb = cast(short8)b;
151             foreach(i; 0..8)
152                 res[i] = saturateSignedIntToSignedShort(sa.array[i] + sb.array[i]);
153             return _mm_loadu_si128(cast(int4*)res.ptr);
154         }
155         else
156             return cast(__m128i) __builtin_ia32_paddsw128(cast(short8)a, cast(short8)b);
157     }
158     else
159     {
160         short[8] res;
161         short8 sa = cast(short8)a;
162         short8 sb = cast(short8)b;
163         foreach(i; 0..8)
164             res[i] = saturateSignedIntToSignedShort(sa.array[i] + sb.array[i]);
165         return _mm_loadu_si128(cast(int4*)res.ptr);
166     }
167 }
168 unittest
169 {
170     short8 res = cast(short8) _mm_adds_epi16(_mm_set_epi16(7, 6, 5, 4, 3, 2, 1, 0),
171                                              _mm_set_epi16(7, 6, 5, 4, 3, 2, 1, 0));
172     static immutable short[8] correctResult = [0, 2, 4, 6, 8, 10, 12, 14];
173     assert(res.array == correctResult);
174 }
175 
176 /// Add packed 8-bit signed integers in `a` and `b` using signed saturation.
177 __m128i _mm_adds_epi8(__m128i a, __m128i b) pure @trusted
178 {
179     static if (GDC_with_SSE2)
180     {
181         return cast(__m128i) __builtin_ia32_paddsb128(cast(ubyte16)a, cast(ubyte16)b);
182     }
183     else version(LDC)
184     {
185         static if (__VERSION__ >= 2085) // saturation x86 intrinsics disappeared in LLVM 8
186         {
187             // x86: Generates PADDSB since LDC 1.15 -O0
188             // ARM: Generates sqadd.16b since LDC 1.21 -O1, really bad in <= 1.20
189             enum prefix = `declare <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8> %a, <16 x i8> %b)`;
190             enum ir = `
191                 %r = call <16 x i8> @llvm.sadd.sat.v16i8( <16 x i8> %0, <16 x i8> %1)
192                 ret <16 x i8> %r`;
193             return cast(__m128i) LDCInlineIREx!(prefix, ir, "", byte16, byte16, byte16)(cast(byte16)a, cast(byte16)b);
194         }
195         else static if (LDC_with_ARM) // Raspberry ships with LDC 1.12, no saturation 
196         {
197             byte[16] res;
198             byte16 sa = cast(byte16)a;
199             byte16 sb = cast(byte16)b;
200             foreach(i; 0..16)
201                 res[i] = saturateSignedWordToSignedByte(sa[i] + sb[i]);
202             return _mm_loadu_si128(cast(int4*)res.ptr);
203         }
204         else
205             return cast(__m128i) __builtin_ia32_paddsb128(cast(byte16)a, cast(byte16)b);
206     }
207     else
208     {
209         byte[16] res;
210         byte16 sa = cast(byte16)a;
211         byte16 sb = cast(byte16)b;
212         foreach(i; 0..16)
213             res[i] = saturateSignedWordToSignedByte(sa[i] + sb[i]);
214         return _mm_loadu_si128(cast(int4*)res.ptr);
215     }
216 }
217 unittest
218 {
219     byte16 res = cast(byte16) _mm_adds_epi8(_mm_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0),
220                                             _mm_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0));
221     static immutable byte[16] correctResult = [0, 2, 4, 6, 8, 10, 12, 14,
222                                                16, 18, 20, 22, 24, 26, 28, 30];
223     assert(res.array == correctResult);
224 }
225 
226 /// Add packed 8-bit unsigned integers in `a` and `b` using unsigned saturation.
227 // PERF: #GDC version?
228 __m128i _mm_adds_epu8(__m128i a, __m128i b) pure @trusted
229 {
230     version(LDC)
231     {
232         static if (__VERSION__ >= 2085) // saturation x86 intrinsics disappeared in LLVM 8
233         {
234             // x86: Generates PADDUSB since LDC 1.15 -O0
235             // ARM: Generates uqadd.16b since LDC 1.21 -O1
236             enum prefix = `declare <16 x i8> @llvm.uadd.sat.v16i8(<16 x i8> %a, <16 x i8> %b)`;
237             enum ir = `
238                 %r = call <16 x i8> @llvm.uadd.sat.v16i8( <16 x i8> %0, <16 x i8> %1)
239                 ret <16 x i8> %r`;
240             return cast(__m128i) LDCInlineIREx!(prefix, ir, "", byte16, byte16, byte16)(cast(byte16)a, cast(byte16)b);
241         }
242         else static if (LDC_with_ARM) // Raspberry ships with LDC 1.12, no saturation 
243         {
244             ubyte[16] res;
245             byte16 sa = cast(byte16)a;
246             byte16 sb = cast(byte16)b;
247             foreach(i; 0..16)
248                 res[i] = saturateSignedWordToUnsignedByte(cast(ubyte)(sa.array[i]) + cast(ubyte)(sb.array[i]));
249             return _mm_loadu_si128(cast(int4*)res.ptr);
250         }
251         else
252             return __builtin_ia32_paddusb128(a, b);
253     }
254     else
255     {
256         ubyte[16] res;
257         byte16 sa = cast(byte16)a;
258         byte16 sb = cast(byte16)b;
259         foreach(i; 0..16)
260             res[i] = saturateSignedWordToUnsignedByte(cast(ubyte)(sa.array[i]) + cast(ubyte)(sb.array[i]));
261         return _mm_loadu_si128(cast(int4*)res.ptr);
262     }
263 }
264 unittest
265 {
266     byte16 res = cast(byte16) 
267         _mm_adds_epu8(_mm_set_epi8(7, 6, 5, 4, 3, 2, cast(byte)255, 0, 7, 6, 5, 4, 3, 2, cast(byte)255, 0),
268                       _mm_set_epi8(7, 6, 5, 4, 3, 2, 1, 0, 7, 6, 5, 4, 3, 2, 1, 0));
269     static immutable byte[16] correctResult = [0, cast(byte)255, 4, 6, 8, 10, 12, 14, 
270                                                0, cast(byte)255, 4, 6, 8, 10, 12, 14];
271     assert(res.array == correctResult);
272 }
273 
274 /// Add packed unsigned 16-bit integers in `a` and `b` using unsigned saturation.
275 // PERF: #GDC version?
276 __m128i _mm_adds_epu16(__m128i a, __m128i b) pure @trusted
277 {
278     version(LDC)
279     {
280         static if (__VERSION__ >= 2085) // saturation x86 intrinsics disappeared in LLVM 8
281         {
282             // x86: Generates PADDUSW since LDC 1.15 -O0
283             // ARM: Generates uqadd.8h since LDC 1.21 -O1
284             enum prefix = `declare <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> %a, <8 x i16> %b)`;
285             enum ir = `
286                 %r = call <8 x i16> @llvm.uadd.sat.v8i16( <8 x i16> %0, <8 x i16> %1)
287                 ret <8 x i16> %r`;
288             return cast(__m128i) LDCInlineIREx!(prefix, ir, "", short8, short8, short8)(cast(short8)a, cast(short8)b);
289         }
290         else static if (LDC_with_ARM) // Raspberry ships with LDC 1.12, no saturation 
291         {
292             ushort[8] res;
293             short8 sa = cast(short8)a;
294             short8 sb = cast(short8)b;
295             foreach(i; 0..8)
296                 res[i] = saturateSignedIntToUnsignedShort(cast(ushort)(sa.array[i]) + cast(ushort)(sb.array[i]));
297             return _mm_loadu_si128(cast(int4*)res.ptr);
298         }
299         else
300             return __builtin_ia32_paddusw128(a, b);
301     }
302     else
303     {
304         ushort[8] res;
305         short8 sa = cast(short8)a;
306         short8 sb = cast(short8)b;
307         foreach(i; 0..8)
308             res[i] = saturateSignedIntToUnsignedShort(cast(ushort)(sa.array[i]) + cast(ushort)(sb.array[i]));
309         return _mm_loadu_si128(cast(int4*)res.ptr);
310     }
311 }
312 unittest
313 {
314     short8 res = cast(short8) _mm_adds_epu16(_mm_set_epi16(3, 2, cast(short)65535, 0, 3, 2, cast(short)65535, 0),
315                                              _mm_set_epi16(3, 2, 1, 0, 3, 2, 1, 0));
316     static immutable short[8] correctResult = [0, cast(short)65535, 4, 6, 0, cast(short)65535, 4, 6];
317     assert(res.array == correctResult);
318 }
319 
320 /// Compute the bitwise AND of packed double-precision (64-bit) 
321 /// floating-point elements in `a` and `b`.
322 __m128d _mm_and_pd (__m128d a, __m128d b) pure @safe
323 {
324     pragma(inline, true);
325     return cast(__m128d)( cast(long2)a & cast(long2)b );
326 }
327 unittest
328 {
329     double a = 4.32;
330     double b = -78.99;
331     long correct = (*cast(long*)(&a)) & (*cast(long*)(&b));
332     __m128d A = _mm_set_pd(a, b);
333     __m128d B = _mm_set_pd(b, a);
334     long2 R = cast(long2)( _mm_and_pd(A, B) );
335     assert(R.array[0] == correct);
336     assert(R.array[1] == correct);
337 }
338 
339 /// Compute the bitwise AND of 128 bits (representing integer data) in `a` and `b`.
340 __m128i _mm_and_si128 (__m128i a, __m128i b) pure @safe
341 {
342     pragma(inline, true);
343     return a & b;
344 }
345 unittest
346 {
347     __m128i A = _mm_set1_epi32(7);
348     __m128i B = _mm_set1_epi32(14);
349     __m128i R = _mm_and_si128(A, B);
350     int[4] correct = [6, 6, 6, 6];
351     assert(R.array == correct);
352 }
353 
354 /// Compute the bitwise NOT of packed double-precision (64-bit) 
355 /// floating-point elements in `a` and then AND with `b`.
356 __m128d _mm_andnot_pd (__m128d a, __m128d b) pure @safe
357 {
358     return cast(__m128d)( ~(cast(long2)a) & cast(long2)b);
359 }
360 unittest
361 {
362     double a = 4.32;
363     double b = -78.99;
364     long correct  = (~*cast(long*)(&a)) & ( *cast(long*)(&b));
365     long correct2 = ( *cast(long*)(&a)) & (~*cast(long*)(&b));
366     __m128d A = _mm_setr_pd(a, b);
367     __m128d B = _mm_setr_pd(b, a);
368     long2 R = cast(long2)( _mm_andnot_pd(A, B) );
369     assert(R.array[0] == correct);
370     assert(R.array[1] == correct2);
371 }
372 
373 /// Compute the bitwise NOT of 128 bits (representing integer data) 
374 /// in `a` and then AND with `b`.
375 __m128i _mm_andnot_si128 (__m128i a, __m128i b) pure @safe
376 {
377     return (~a) & b;
378 }
379 unittest
380 {
381     __m128i A = _mm_set1_epi32(7);
382     __m128i B = _mm_set1_epi32(14);
383     __m128i R = _mm_andnot_si128(A, B);
384     int[4] correct = [8, 8, 8, 8];
385     assert(R.array == correct);
386 }
387 
388 /// Average packed unsigned 16-bit integers in `a` and `b`.
389 __m128i _mm_avg_epu16 (__m128i a, __m128i b) pure @trusted
390 {
391     static if (GDC_with_SSE2)
392     {
393         return cast(__m128i) __builtin_ia32_pavgw128(cast(short8)a, cast(short8)b);
394     }
395     else static if (LDC_with_ARM64)
396     {
397         return cast(__m128i) vrhadd_u16(cast(short8)a, cast(short8)b);
398     }
399     else version(LDC)
400     {
401         // Generates pavgw even in LDC 1.0, even in -O0
402         // But not in ARM
403         enum ir = `
404             %ia = zext <8 x i16> %0 to <8 x i32>
405             %ib = zext <8 x i16> %1 to <8 x i32>
406             %isum = add <8 x i32> %ia, %ib
407             %isum1 = add <8 x i32> %isum, < i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
408             %isums = lshr <8 x i32> %isum1, < i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
409             %r = trunc <8 x i32> %isums to <8 x i16>
410             ret <8 x i16> %r`;
411         return cast(__m128i) LDCInlineIR!(ir, short8, short8, short8)(cast(short8)a, cast(short8)b);
412     }
413     else
414     {
415         short8 sa = cast(short8)a;
416         short8 sb = cast(short8)b;
417         short8 sr = void;
418         foreach(i; 0..8)
419         {
420             sr.ptr[i] = cast(ushort)( (cast(ushort)(sa.array[i]) + cast(ushort)(sb.array[i]) + 1) >> 1 );
421         }
422         return cast(int4)sr;
423     }
424 }
425 unittest
426 {
427     __m128i A = _mm_set1_epi16(31);
428     __m128i B = _mm_set1_epi16(64);
429     short8 avg = cast(short8)(_mm_avg_epu16(A, B));
430     foreach(i; 0..8)
431         assert(avg.array[i] == 48);
432 }
433 
434 /// Average packed unsigned 8-bit integers in `a` and `b`.
435 __m128i _mm_avg_epu8 (__m128i a, __m128i b) pure @trusted
436 {
437     static if (GDC_with_SSE2)
438     {
439         return cast(__m128i) __builtin_ia32_pavgb128(cast(ubyte16)a, cast(ubyte16)b);
440     }
441     else static if (LDC_with_ARM64)
442     {
443         return cast(__m128i) vrhadd_u8(cast(byte16)a, cast(byte16)b);
444     }
445     else version(LDC)
446     {
447         // Generates pavgb even in LDC 1.0, even in -O0
448         // But not in ARM
449         enum ir = `
450             %ia = zext <16 x i8> %0 to <16 x i16>
451             %ib = zext <16 x i8> %1 to <16 x i16>
452             %isum = add <16 x i16> %ia, %ib
453             %isum1 = add <16 x i16> %isum, < i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
454             %isums = lshr <16 x i16> %isum1, < i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
455             %r = trunc <16 x i16> %isums to <16 x i8>
456             ret <16 x i8> %r`;
457         return cast(__m128i) LDCInlineIR!(ir, byte16, byte16, byte16)(cast(byte16)a, cast(byte16)b);
458     }
459     else
460     {
461         byte16 sa = cast(byte16)a;
462         byte16 sb = cast(byte16)b;
463         byte16 sr = void;
464         foreach(i; 0..16)
465         {
466             sr[i] = cast(ubyte)( (cast(ubyte)(sa[i]) + cast(ubyte)(sb[i]) + 1) >> 1 );
467         }
468         return cast(int4)sr;
469     }
470 }
471 unittest
472 {
473     __m128i A = _mm_set1_epi8(31);
474     __m128i B = _mm_set1_epi8(64);
475     byte16 avg = cast(byte16)(_mm_avg_epu8(A, B));
476     foreach(i; 0..16)
477         assert(avg.array[i] == 48);
478 }
479 
480 /// Shift `a` left by `bytes` bytes while shifting in zeros.
481 alias _mm_bslli_si128 = _mm_slli_si128;
482 unittest
483 {
484     __m128i toShift = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
485     byte[16] exact =               [0, 0, 0, 0, 0, 0, 1, 2, 3, 4,  5,  6,  7,  8,  9, 10];
486     __m128i result = _mm_bslli_si128!5(toShift);
487     assert( (cast(byte16)result).array == exact);
488 }
489 
490 /// Shift `v` right by `bytes` bytes while shifting in zeros.
491 alias _mm_bsrli_si128 = _mm_srli_si128;
492 unittest
493 {
494     __m128i toShift = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
495     byte[16] exact =               [5, 6, 7, 8, 9,10,11,12,13,14, 15,  0,  0,  0,  0,  0];
496     __m128i result = _mm_bsrli_si128!5(toShift);
497     assert( (cast(byte16)result).array == exact);
498 }
499 
500 /// Cast vector of type `__m128d` to type `__m128`. 
501 /// Note: Also possible with a regular `cast(__m128)(a)`.
502 __m128 _mm_castpd_ps (__m128d a) pure @safe
503 {
504     return cast(__m128)a;
505 }
506 
507 /// Cast vector of type `__m128d` to type `__m128i`. 
508 /// Note: Also possible with a regular `cast(__m128i)(a)`.
509 __m128i _mm_castpd_si128 (__m128d a) pure @safe
510 {
511     return cast(__m128i)a;
512 }
513 
514 /// Cast vector of type `__m128` to type `__m128d`. 
515 /// Note: Also possible with a regular `cast(__m128d)(a)`.
516 __m128d _mm_castps_pd (__m128 a) pure @safe
517 {
518     return cast(__m128d)a;
519 }
520 
521 /// Cast vector of type `__m128` to type `__m128i`. 
522 /// Note: Also possible with a regular `cast(__m128i)(a)`.
523 __m128i _mm_castps_si128 (__m128 a) pure @safe
524 {
525     return cast(__m128i)a;
526 }
527 
528 /// Cast vector of type `__m128i` to type `__m128d`. 
529 /// Note: Also possible with a regular `cast(__m128d)(a)`.
530 __m128d _mm_castsi128_pd (__m128i a) pure @safe
531 {
532     return cast(__m128d)a;
533 }
534 
535 /// Cast vector of type `__m128i` to type `__m128`. 
536 /// Note: Also possible with a regular `cast(__m128)(a)`.
537 __m128 _mm_castsi128_ps (__m128i a) pure @safe
538 {
539     return cast(__m128)a;
540 }
541 
542 /// Invalidate and flush the cache line that contains `p` 
543 /// from all levels of the cache hierarchy.
544 void _mm_clflush (const(void)* p) @trusted
545 {
546     static if (GDC_with_SSE2)
547     {
548         __builtin_ia32_clflush(p);
549     }
550     else static if (LDC_with_SSE2)
551     {
552         __builtin_ia32_clflush(cast(void*)p);
553     }
554     else version(D_InlineAsm_X86)
555     {
556         asm pure nothrow @nogc @safe
557         {
558             mov EAX, p;
559             clflush [EAX];
560         }
561     }
562     else version(D_InlineAsm_X86_64)
563     {
564         asm pure nothrow @nogc @safe
565         {
566             mov RAX, p;
567             clflush [RAX];
568         }
569     }
570     else 
571     {
572         // Do nothing. Invalidating cacheline does
573         // not affect correctness.
574     }
575 }
576 unittest
577 {
578     ubyte[64] cacheline;
579     _mm_clflush(cacheline.ptr);
580 }
581 
582 /// Compare packed 16-bit integers in `a` and `b` for equality.
583 __m128i _mm_cmpeq_epi16 (__m128i a, __m128i b) pure @safe
584 {
585     static if (GDC_with_SSE2)
586     {
587         return cast(__m128i) __builtin_ia32_pcmpeqw128(cast(short8)a, cast(short8)b);
588     }
589     else
590     {
591         return cast(__m128i) equalMask!short8(cast(short8)a, cast(short8)b);
592     }
593 }
594 unittest
595 {
596     short8   A = [-3, -2, -1,  0,  0,  1,  2,  3];
597     short8   B = [ 4,  3,  2,  1,  0, -1, -2, -3];
598     short[8] E = [ 0,  0,  0,  0, -1,  0,  0,  0];
599     short8   R = cast(short8)(_mm_cmpeq_epi16(cast(__m128i)A, cast(__m128i)B));
600     assert(R.array == E);
601 }
602 
603 /// Compare packed 32-bit integers in `a` and `b` for equality.
604 __m128i _mm_cmpeq_epi32 (__m128i a, __m128i b) pure @safe
605 {
606     static if (GDC_with_SSE2)
607     {
608         return __builtin_ia32_pcmpeqd128(a, b);
609     }
610     else
611     {
612         return equalMask!__m128i(a, b);
613     }
614 }
615 unittest
616 {
617     int4   A = [-3, -2, -1,  0];
618     int4   B = [ 4, -2,  2,  0];
619     int[4] E = [ 0, -1,  0, -1];
620     int4   R = cast(int4)(_mm_cmpeq_epi32(A, B));
621     assert(R.array == E);
622 }
623 
624 /// Compare packed 8-bit integers in `a` and `b` for equality.
625 __m128i _mm_cmpeq_epi8 (__m128i a, __m128i b) pure @safe
626 {
627     static if (GDC_with_SSE2)
628     {
629         return cast(__m128i) __builtin_ia32_pcmpeqb128(cast(ubyte16)a, cast(ubyte16)b);
630     }
631     else
632     {
633         return cast(__m128i) equalMask!byte16(cast(byte16)a, cast(byte16)b);
634     }
635 }
636 unittest
637 {
638     __m128i A = _mm_setr_epi8(1, 2, 3, 1, 2, 1, 1, 2, 3, 2, 1, 0, 0, 1, 2, 1);
639     __m128i B = _mm_setr_epi8(2, 2, 1, 2, 3, 1, 2, 3, 2, 1, 0, 0, 1, 2, 1, 1);
640     byte16 C = cast(byte16) _mm_cmpeq_epi8(A, B);
641     byte[16] correct =       [0,-1, 0, 0, 0,-1, 0, 0, 0, 0, 0,-1, 0, 0, 0, -1];
642     assert(C.array == correct);
643 }
644 
645 /// Compare packed double-precision (64-bit) floating-point elements 
646 /// in `a` and `b` for equality.
647 __m128d _mm_cmpeq_pd (__m128d a, __m128d b) pure @safe
648 {
649     static if (GDC_with_SSE2)
650     {
651         return __builtin_ia32_cmpeqpd(a, b);
652     }
653     else
654     {
655         return cast(__m128d) cmppd!(FPComparison.oeq)(a, b);
656     }
657 }
658 
659 /// Compare the lower double-precision (64-bit) floating-point elements
660 /// in `a` and `b` for equality, store the result in the lower element,
661 /// and copy the upper element from `a`.
662 __m128d _mm_cmpeq_sd (__m128d a, __m128d b) pure @safe
663 {
664     static if (GDC_with_SSE2)
665     {
666         return __builtin_ia32_cmpeqsd(a, b);
667     }
668     else
669     {
670         return cast(__m128d) cmpsd!(FPComparison.oeq)(a, b);
671     }
672 }
673 
674 /// Compare packed double-precision (64-bit) floating-point elements 
675 /// in `a` and `b` for greater-than-or-equal.
676 __m128d _mm_cmpge_pd (__m128d a, __m128d b) pure @safe
677 {
678     static if (GDC_with_SSE2)
679     {
680         return __builtin_ia32_cmpgepd(a, b);
681     }
682     else
683     {
684         return cast(__m128d) cmppd!(FPComparison.oge)(a, b);
685     }
686 }
687 
688 /// Compare the lower double-precision (64-bit) floating-point elements 
689 /// in `a` and `b` for greater-than-or-equal, store the result in the 
690 /// lower element, and copy the upper element from `a`.
691 __m128d _mm_cmpge_sd (__m128d a, __m128d b) pure @safe
692 {
693     // Note: There is no __builtin_ia32_cmpgesd builtin.
694     static if (GDC_with_SSE2)
695     {
696         return __builtin_ia32_cmpnltsd(b, a);
697     }
698     else
699     {
700         return cast(__m128d) cmpsd!(FPComparison.oge)(a, b);
701     }
702 }
703 
704 /// Compare packed 16-bit integers in `a` and `b` for greater-than.
705 __m128i _mm_cmpgt_epi16 (__m128i a, __m128i b) pure @safe
706 {
707     static if (GDC_with_SSE2)
708     {
709         return cast(__m128i) __builtin_ia32_pcmpgtw128(cast(short8)a, cast(short8)b);
710     }
711     else
712     {
713         return cast(__m128i) greaterMask!short8(cast(short8)a, cast(short8)b);
714     }
715 }
716 unittest
717 {
718     short8   A = [-3, -2, -1,  0,  0,  1,  2,  3];
719     short8   B = [ 4,  3,  2,  1,  0, -1, -2, -3];
720     short[8] E = [ 0,  0,  0,  0,  0, -1, -1, -1];
721     short8   R = cast(short8)(_mm_cmpgt_epi16(cast(__m128i)A, cast(__m128i)B));
722     assert(R.array == E);
723 }
724 
725 /// Compare packed 32-bit integers in `a` and `b` for greater-than.
726 __m128i _mm_cmpgt_epi32 (__m128i a, __m128i b) pure @safe
727 {
728     static if (GDC_with_SSE2)
729     {
730         return __builtin_ia32_pcmpgtd128(a, b); 
731     }
732     else
733     {
734         return cast(__m128i)( greaterMask!int4(a, b));
735     }
736 }
737 unittest
738 {
739     int4   A = [-3,  2, -1,  0];
740     int4   B = [ 4, -2,  2,  0];
741     int[4] E = [ 0, -1,  0,  0];
742     int4   R = cast(int4)(_mm_cmpgt_epi32(A, B));
743     assert(R.array == E);
744 }
745 
746 /// Compare packed 8-bit integers in `a` and `b` for greater-than.
747 __m128i _mm_cmpgt_epi8 (__m128i a, __m128i b) pure @safe
748 {
749     static if (GDC_with_SSE2)
750     {
751         return cast(__m128i) __builtin_ia32_pcmpgtb128(cast(ubyte16)a, cast(ubyte16)b);
752     }
753     else
754     {
755         return cast(__m128i) greaterMask!byte16(cast(byte16)a, cast(byte16)b);
756     }
757 }
758 unittest
759 {
760     __m128i A = _mm_setr_epi8(1, 2, 3, 1, 2, 1, 1, 2, 3, 2, 1, 0, 0, 1, 2, 1);
761     __m128i B = _mm_setr_epi8(2, 2, 1, 2, 3, 1, 2, 3, 2, 1, 0, 0, 1, 2, 1, 1);
762     byte16 C = cast(byte16) _mm_cmpgt_epi8(A, B);
763     byte[16] correct =       [0, 0,-1, 0, 0, 0, 0, 0,-1,-1,-1, 0, 0, 0,-1, 0];
764     __m128i D = _mm_cmpeq_epi8(A, B);
765     assert(C.array == correct);
766 }
767 
768 /// Compare packed double-precision (64-bit) floating-point elements 
769 /// in `a` and `b` for greater-than.
770 __m128d _mm_cmpgt_pd (__m128d a, __m128d b) pure @safe
771 {
772     static if (GDC_with_SSE2)
773     {
774         return __builtin_ia32_cmpgtpd(a, b); 
775     }
776     else
777     {
778         return cast(__m128d) cmppd!(FPComparison.ogt)(a, b);
779     }
780 }
781 
782 /// Compare the lower double-precision (64-bit) floating-point elements 
783 /// in `a` and `b` for greater-than, store the result in the lower element,
784 /// and copy the upper element from `a`.
785 __m128d _mm_cmpgt_sd (__m128d a, __m128d b) pure @safe
786 {
787     // Note: There is no __builtin_ia32_cmpgtsd builtin.
788     static if (GDC_with_SSE2)
789     {
790         return __builtin_ia32_cmpnlesd(b, a);
791     }
792     else
793     {
794         return cast(__m128d) cmpsd!(FPComparison.ogt)(a, b);
795     }
796 }
797 
798 /// Compare packed double-precision (64-bit) floating-point elements 
799 /// in `a` and `b` for less-than-or-equal.
800 __m128d _mm_cmple_pd (__m128d a, __m128d b) pure @safe
801 {
802     static if (GDC_with_SSE2)
803     {
804         return __builtin_ia32_cmplepd(a, b); 
805     }
806     else
807     {
808         return cast(__m128d) cmppd!(FPComparison.ole)(a, b);
809     }
810 }
811 
812 /// Compare the lower double-precision (64-bit) floating-point elements 
813 /// in `a` and `b` for less-than-or-equal, store the result in the 
814 /// lower element, and copy the upper element from `a`.
815 __m128d _mm_cmple_sd (__m128d a, __m128d b) pure @safe
816 {
817     static if (GDC_with_SSE2)
818     {
819         return __builtin_ia32_cmplesd(a, b); 
820     }
821     else
822     {
823         return cast(__m128d) cmpsd!(FPComparison.ole)(a, b);
824     }
825 }
826 
827 /// Compare packed 16-bit integers in `a` and `b` for less-than.
828 __m128i _mm_cmplt_epi16 (__m128i a, __m128i b) pure @safe
829 {
830     return _mm_cmpgt_epi16(b, a);
831 }
832 
833 /// Compare packed 32-bit integers in `a` and `b` for less-than.
834 __m128i _mm_cmplt_epi32 (__m128i a, __m128i b) pure @safe
835 {
836     return _mm_cmpgt_epi32(b, a);
837 }
838 
839 /// Compare packed 8-bit integers in `a` and `b` for less-than.
840 __m128i _mm_cmplt_epi8 (__m128i a, __m128i b) pure @safe
841 {
842     return _mm_cmpgt_epi8(b, a);
843 }
844 
845 /// Compare packed double-precision (64-bit) floating-point elements
846 /// in `a` and `b` for less-than.
847 __m128d _mm_cmplt_pd (__m128d a, __m128d b) pure @safe
848 {
849     static if (GDC_with_SSE2)
850     {
851         return __builtin_ia32_cmpltpd(a, b); 
852     }
853     else
854     {
855         return cast(__m128d) cmppd!(FPComparison.olt)(a, b);
856     }
857 }
858 
859 /// Compare the lower double-precision (64-bit) floating-point elements
860 /// in `a` and `b` for less-than, store the result in the lower 
861 /// element, and copy the upper element from `a`.
862 __m128d _mm_cmplt_sd (__m128d a, __m128d b) pure @safe
863 {
864     static if (GDC_with_SSE2)
865     {
866         return __builtin_ia32_cmpltsd(a, b); 
867     }
868     else
869     {
870         return cast(__m128d) cmpsd!(FPComparison.olt)(a, b);
871     }
872 }
873 
874 /// Compare packed double-precision (64-bit) floating-point elements
875 /// in `a` and `b` for not-equal.
876 __m128d _mm_cmpneq_pd (__m128d a, __m128d b) pure @safe
877 {
878     static if (GDC_with_SSE2)
879     {
880         return __builtin_ia32_cmpneqpd(a, b); 
881     }
882     else
883     {
884         return cast(__m128d) cmppd!(FPComparison.une)(a, b);
885     }
886 }
887 
888 /// Compare the lower double-precision (64-bit) floating-point elements
889 /// in `a` and `b` for not-equal, store the result in the lower 
890 /// element, and copy the upper element from `a`.
891 __m128d _mm_cmpneq_sd (__m128d a, __m128d b) pure @safe
892 {
893     static if (GDC_with_SSE2)
894     {
895         return __builtin_ia32_cmpneqsd(a, b); 
896     }
897     else
898     {
899         return cast(__m128d) cmpsd!(FPComparison.une)(a, b);
900     }
901 }
902 
903 /// Compare packed double-precision (64-bit) floating-point elements 
904 /// in `a` and `b` for not-greater-than-or-equal.
905 __m128d _mm_cmpnge_pd (__m128d a, __m128d b) pure @safe
906 {
907     static if (GDC_with_SSE2)
908     {
909         return __builtin_ia32_cmpngepd(a, b); 
910     }
911     else
912     {
913         return cast(__m128d) cmppd!(FPComparison.ult)(a, b);
914     }
915 }
916 
917 /// Compare the lower double-precision (64-bit) floating-point elements 
918 /// in `a` and `b` for not-greater-than-or-equal, store the result in 
919 /// the lower element, and copy the upper element from `a`.
920 __m128d _mm_cmpnge_sd (__m128d a, __m128d b) pure @safe
921 {
922     // Note: There is no __builtin_ia32_cmpngesd builtin.
923     static if (GDC_with_SSE2)
924     {
925         return __builtin_ia32_cmpltsd(b, a); 
926     }
927     else
928     {
929         return cast(__m128d) cmpsd!(FPComparison.ult)(a, b);
930     }
931 }
932 
933 /// Compare packed double-precision (64-bit) floating-point elements 
934 /// in `a` and `b` for not-greater-than.
935 __m128d _mm_cmpngt_pd (__m128d a, __m128d b) pure @safe
936 {
937     static if (GDC_with_SSE2)
938     {
939         return __builtin_ia32_cmpngtpd(a, b);
940     }
941     else
942     {
943         return cast(__m128d) cmppd!(FPComparison.ule)(a, b);
944     }
945 }
946 
947 /// Compare the lower double-precision (64-bit) floating-point elements 
948 /// in `a` and `b` for not-greater-than, store the result in the 
949 /// lower element, and copy the upper element from `a`.
950 __m128d _mm_cmpngt_sd (__m128d a, __m128d b) pure @safe
951 {
952     // Note: There is no __builtin_ia32_cmpngtsd builtin.
953     static if (GDC_with_SSE2)
954     {
955         return __builtin_ia32_cmplesd(b, a);
956     }
957     else
958     {
959         return cast(__m128d) cmpsd!(FPComparison.ule)(a, b);
960     }
961 }
962 
963 /// Compare packed double-precision (64-bit) floating-point elements 
964 /// in `a` and `b` for not-less-than-or-equal.
965 __m128d _mm_cmpnle_pd (__m128d a, __m128d b) pure @safe
966 {
967     static if (GDC_with_SSE2)
968     {
969         return __builtin_ia32_cmpnlepd(a, b);
970     }
971     else
972     {
973         return cast(__m128d) cmppd!(FPComparison.ugt)(a, b);
974     }
975 }
976 
977 /// Compare the lower double-precision (64-bit) floating-point elements 
978 /// in `a` and `b` for not-less-than-or-equal, store the result in the 
979 /// lower element, and copy the upper element from `a`.
980 __m128d _mm_cmpnle_sd (__m128d a, __m128d b) pure @safe
981 {
982     static if (GDC_with_SSE2)
983     {
984         return __builtin_ia32_cmpnlesd(a, b);
985     }
986     else
987     {
988         return cast(__m128d) cmpsd!(FPComparison.ugt)(a, b);
989     }
990 }
991  
992 /// Compare packed double-precision (64-bit) floating-point elements 
993 /// in `a` and `b` for not-less-than.
994 __m128d _mm_cmpnlt_pd (__m128d a, __m128d b) pure @safe
995 {
996     static if (GDC_with_SSE2)
997     {
998         return __builtin_ia32_cmpnltpd(a, b);
999     }
1000     else
1001     {
1002         return cast(__m128d) cmppd!(FPComparison.uge)(a, b);
1003     }
1004 }
1005 
1006 /// Compare the lower double-precision (64-bit) floating-point elements 
1007 /// in `a` and `b` for not-less-than, store the result in the lower 
1008 /// element, and copy the upper element from `a`.
1009 __m128d _mm_cmpnlt_sd (__m128d a, __m128d b) pure @safe
1010 {
1011     static if (GDC_with_SSE2)
1012     {
1013         return __builtin_ia32_cmpnltsd(a, b);
1014     }
1015     else
1016     {
1017         return cast(__m128d) cmpsd!(FPComparison.uge)(a, b);
1018     }
1019 }
1020 
1021 /// Compare packed double-precision (64-bit) floating-point elements 
1022 /// in `a` and `b` to see if neither is NaN.
1023 __m128d _mm_cmpord_pd (__m128d a, __m128d b) pure @safe
1024 {
1025     static if (GDC_with_SSE2)
1026     {
1027         return __builtin_ia32_cmpordpd(a, b);
1028     }
1029     else
1030     {
1031         return cast(__m128d) cmppd!(FPComparison.ord)(a, b);
1032     }
1033 }
1034 
1035 /// Compare the lower double-precision (64-bit) floating-point elements 
1036 /// in `a` and `b` to see if neither is NaN, store the result in the 
1037 /// lower element, and copy the upper element from `a` to the upper element.
1038 __m128d _mm_cmpord_sd (__m128d a, __m128d b) pure @safe
1039 {
1040     static if (GDC_with_SSE2)
1041     {
1042         return __builtin_ia32_cmpordsd(a, b);
1043     }
1044     else
1045     {
1046         return cast(__m128d) cmpsd!(FPComparison.ord)(a, b);
1047     }
1048 }
1049 
1050 /// Compare packed double-precision (64-bit) floating-point elements 
1051 /// in `a` and `b` to see if either is NaN.
1052 __m128d _mm_cmpunord_pd (__m128d a, __m128d b) pure @safe
1053 {
1054     static if (GDC_with_SSE2)
1055     {
1056         return __builtin_ia32_cmpunordpd(a, b);
1057     }
1058     else
1059     {
1060         return cast(__m128d) cmppd!(FPComparison.uno)(a, b);
1061     }
1062 }
1063 
1064 /// Compare the lower double-precision (64-bit) floating-point elements 
1065 /// in `a` and `b` to see if either is NaN, store the result in the lower 
1066 /// element, and copy the upper element from `a` to the upper element.
1067 __m128d _mm_cmpunord_sd (__m128d a, __m128d b) pure @safe
1068 {
1069     static if (GDC_with_SSE2)
1070     {
1071         return __builtin_ia32_cmpunordsd(a, b);
1072     }
1073     else
1074     {
1075         return cast(__m128d) cmpsd!(FPComparison.uno)(a, b);
1076     }
1077 }
1078 
1079 /// Compare the lower double-precision (64-bit) floating-point element 
1080 /// in `a` and `b` for equality, and return the boolean result (0 or 1).
1081 int _mm_comieq_sd (__m128d a, __m128d b) pure @safe
1082 {
    // Note: For some of the _mm_comixx_sx intrinsics, the NaN semantics are not those of the
    // comisd instruction: the intrinsic returns false on unordered operands instead.
    //
    // C++ compilers actually disagree about the meaning of that instruction.
    // GCC handles NaNs like the comisd instruction (returns true when unordered),
    // but ICC, clang and MSVC follow the Intel Intrinsics Guide.
    // We side with the majority; GCC appears to be buggy with NaNs.
1090     return a.array[0] == b.array[0];
1091 }
1092 unittest
1093 {
1094     assert(1 == _mm_comieq_sd(_mm_set_sd(78.0), _mm_set_sd(78.0)));
1095     assert(0 == _mm_comieq_sd(_mm_set_sd(78.0), _mm_set_sd(-78.0)));
1096     assert(0 == _mm_comieq_sd(_mm_set_sd(78.0), _mm_set_sd(double.nan)));
1097     assert(0 == _mm_comieq_sd(_mm_set_sd(double.nan), _mm_set_sd(-4.22)));
1098     assert(1 == _mm_comieq_sd(_mm_set_sd(0.0), _mm_set_sd(-0.0)));
1099 }
1100 
1101 /// Compare the lower double-precision (64-bit) floating-point element 
1102 /// in `a` and `b` for greater-than-or-equal, and return the boolean 
1103 /// result (0 or 1).
1104 int _mm_comige_sd (__m128d a, __m128d b) pure @safe
1105 {
1106     return a.array[0] >= b.array[0];
1107 }
1108 unittest
1109 {
1110     assert(1 == _mm_comige_sd(_mm_set_sd(78.0), _mm_set_sd(78.0)));
1111     assert(1 == _mm_comige_sd(_mm_set_sd(78.0), _mm_set_sd(-78.0)));
1112     assert(0 == _mm_comige_sd(_mm_set_sd(-78.0), _mm_set_sd(78.0)));
1113     assert(0 == _mm_comige_sd(_mm_set_sd(78.0), _mm_set_sd(double.nan)));
1114     assert(0 == _mm_comige_sd(_mm_set_sd(double.nan), _mm_set_sd(-4.22)));
1115     assert(1 == _mm_comige_sd(_mm_set_sd(-0.0), _mm_set_sd(0.0)));
1116 }
1117 
1118 /// Compare the lower double-precision (64-bit) floating-point element 
1119 /// in `a` and `b` for greater-than, and return the boolean result (0 or 1).
1120 int _mm_comigt_sd (__m128d a, __m128d b) pure @safe
1121 {
1122     return a.array[0] > b.array[0];
1123 }
1124 unittest
1125 {
1126     assert(0 == _mm_comigt_sd(_mm_set_sd(78.0), _mm_set_sd(78.0)));
1127     assert(1 == _mm_comigt_sd(_mm_set_sd(78.0), _mm_set_sd(-78.0)));
1128     assert(0 == _mm_comigt_sd(_mm_set_sd(78.0), _mm_set_sd(double.nan)));
1129     assert(0 == _mm_comigt_sd(_mm_set_sd(double.nan), _mm_set_sd(-4.22)));
1130     assert(0 == _mm_comigt_sd(_mm_set_sd(0.0), _mm_set_sd(-0.0)));
1131 }
1132 
/// Compare the lower double-precision (64-bit) floating-point element 
/// in `a` and `b` for less-than-or-equal, and return the boolean result (0 or 1).
1135 int _mm_comile_sd (__m128d a, __m128d b) pure @safe
1136 {
1137     return a.array[0] <= b.array[0];
1138 }
1139 unittest
1140 {
1141     assert(1 == _mm_comile_sd(_mm_set_sd(78.0), _mm_set_sd(78.0)));
1142     assert(0 == _mm_comile_sd(_mm_set_sd(78.0), _mm_set_sd(-78.0)));
1143     assert(1 == _mm_comile_sd(_mm_set_sd(-78.0), _mm_set_sd(78.0)));
1144     assert(0 == _mm_comile_sd(_mm_set_sd(78.0), _mm_set_sd(double.nan)));
1145     assert(0 == _mm_comile_sd(_mm_set_sd(double.nan), _mm_set_sd(-4.22)));
1146     assert(1 == _mm_comile_sd(_mm_set_sd(0.0), _mm_set_sd(-0.0)));
1147 }
1148 
1149 /// Compare the lower double-precision (64-bit) floating-point element 
1150 /// in `a` and `b` for less-than, and return the boolean result (0 or 1).
1151 int _mm_comilt_sd (__m128d a, __m128d b) pure @safe
1152 {
1153     return a.array[0] < b.array[0];
1154 }
1155 unittest
1156 {
1157     assert(0 == _mm_comilt_sd(_mm_set_sd(78.0), _mm_set_sd(78.0)));
1158     assert(0 == _mm_comilt_sd(_mm_set_sd(78.0), _mm_set_sd(-78.0)));
1159     assert(1 == _mm_comilt_sd(_mm_set_sd(-78.0), _mm_set_sd(78.0)));
1160     assert(0 == _mm_comilt_sd(_mm_set_sd(78.0), _mm_set_sd(double.nan)));
1161     assert(0 == _mm_comilt_sd(_mm_set_sd(double.nan), _mm_set_sd(-4.22)));
1162     assert(0 == _mm_comilt_sd(_mm_set_sd(-0.0), _mm_set_sd(0.0)));
1163 }
1164 
1165 /// Compare the lower double-precision (64-bit) floating-point element
1166 /// in `a` and `b` for not-equal, and return the boolean result (0 or 1).
1167 int _mm_comineq_sd (__m128d a, __m128d b) pure @safe
1168 {
1169     return a.array[0] != b.array[0];
1170 }
1171 unittest
1172 {
1173     assert(0 == _mm_comineq_sd(_mm_set_sd(78.0), _mm_set_sd(78.0)));
1174     assert(1 == _mm_comineq_sd(_mm_set_sd(78.0), _mm_set_sd(-78.0)));
1175     assert(1 == _mm_comineq_sd(_mm_set_sd(78.0), _mm_set_sd(double.nan)));
1176     assert(1 == _mm_comineq_sd(_mm_set_sd(double.nan), _mm_set_sd(-4.22)));
1177     assert(0 == _mm_comineq_sd(_mm_set_sd(0.0), _mm_set_sd(-0.0)));
1178 }
1179 
1180 /// Convert packed 32-bit integers in `a` to packed double-precision (64-bit)
1181 /// floating-point elements.
1182  __m128d _mm_cvtepi32_pd (__m128i a) pure @trusted
1183 {
1184     version(LDC)
1185     {
1186         // Generates cvtdq2pd since LDC 1.0, even without optimizations
1187         enum ir = `
1188             %v = shufflevector <4 x i32> %0,<4 x i32> %0, <2 x i32> <i32 0, i32 1>
1189             %r = sitofp <2 x i32> %v to <2 x double>
1190             ret <2 x double> %r`;
1191         return cast(__m128d) LDCInlineIR!(ir, __m128d, __m128i)(a);
1192     }
1193     else static if (GDC_with_SSE2)
1194     {
1195         return __builtin_ia32_cvtdq2pd(a);
1196     }
1197     else
1198     {
1199         double2 r = void;
1200         r.ptr[0] = a.array[0];
1201         r.ptr[1] = a.array[1];
1202         return r;
1203     }
1204 }
1205 unittest
1206 {
1207     __m128d A = _mm_cvtepi32_pd(_mm_set1_epi32(54));
1208     assert(A.array[0] == 54.0);
1209     assert(A.array[1] == 54.0);
1210 }
1211 
1212 /// Convert packed 32-bit integers in `a` to packed single-precision (32-bit) 
1213 /// floating-point elements.
1214 __m128 _mm_cvtepi32_ps(__m128i a) pure @trusted
1215 {
1216     static if (GDC_with_SSE2)
1217     {
1218         return __builtin_ia32_cvtdq2ps(a);
1219     }
1220     else version(LDC)
1221     {
1222         // See #86 for why we had to resort to LLVM IR.
1223         // Plain code below was leading to catastrophic behaviour. 
1224         // x86: Generates cvtdq2ps since LDC 1.1.0 -O0
        // ARM: Generates scvtf.4s since LDC 1.8.0 -O0
1226         enum ir = `
1227             %r = sitofp <4 x i32> %0 to <4 x float>
1228             ret <4 x float> %r`;
1229         return cast(__m128) LDCInlineIR!(ir, float4, int4)(a);
1230     }
1231     else
1232     {
1233         __m128 res;
1234         res.ptr[0] = cast(float)a.array[0];
1235         res.ptr[1] = cast(float)a.array[1];
1236         res.ptr[2] = cast(float)a.array[2];
1237         res.ptr[3] = cast(float)a.array[3];
1238         return res;
1239     }
1240 }
1241 unittest
1242 {
1243     __m128 a = _mm_cvtepi32_ps(_mm_setr_epi32(-1, 0, 1, 1000));
1244     assert(a.array == [-1.0f, 0.0f, 1.0f, 1000.0f]);
1245 }
1246 
1247 /// Convert packed double-precision (64-bit) floating-point elements 
1248 /// in `a` to packed 32-bit integers.
1249 __m128i _mm_cvtpd_epi32 (__m128d a) @trusted
1250 {
1251     // PERF ARM32
1252     static if (LDC_with_SSE2)
1253     {
1254         return __builtin_ia32_cvtpd2dq(a);
1255     }
1256     else static if (GDC_with_SSE2)
1257     {
1258         return __builtin_ia32_cvtpd2dq(a);
1259     }
1260     else static if (LDC_with_ARM64)
1261     {
1262         // Get current rounding mode.
1263         uint fpscr = arm_get_fpcr();
1264         long2 i;
1265         switch(fpscr & _MM_ROUND_MASK_ARM)
1266         {
1267             default:
1268             case _MM_ROUND_NEAREST_ARM:     i = vcvtnq_s64_f64(a); break;
1269             case _MM_ROUND_DOWN_ARM:        i = vcvtmq_s64_f64(a); break;
1270             case _MM_ROUND_UP_ARM:          i = vcvtpq_s64_f64(a); break;
1271             case _MM_ROUND_TOWARD_ZERO_ARM: i = vcvtzq_s64_f64(a); break;
1272         }
1273         int4 zero = 0;
1274         return cast(__m128i) shufflevector!(int4, 0, 2, 4, 6)(cast(int4)i, zero);
1275     }
1276     else
1277     {
1278         // PERF ARM32
1279         __m128i r = _mm_setzero_si128();
1280         r.ptr[0] = convertDoubleToInt32UsingMXCSR(a.array[0]);
1281         r.ptr[1] = convertDoubleToInt32UsingMXCSR(a.array[1]);
1282         return r;
1283     }
1284 }
1285 unittest
1286 {
1287     int4 A = _mm_cvtpd_epi32(_mm_set_pd(61.0, 55.0));
1288     assert(A.array[0] == 55 && A.array[1] == 61 && A.array[2] == 0 && A.array[3] == 0);
1289 }
1290 
1291 /// Convert packed double-precision (64-bit) floating-point elements in `v`
1292 /// to packed 32-bit integers
1293 __m64 _mm_cvtpd_pi32 (__m128d v) @safe
1294 {
1295     return to_m64(_mm_cvtpd_epi32(v));
1296 }
1297 unittest
1298 {
1299     int2 A = cast(int2) _mm_cvtpd_pi32(_mm_set_pd(61.0, 55.0));
1300     assert(A.array[0] == 55 && A.array[1] == 61);
1301 }
1302 
1303 /// Convert packed double-precision (64-bit) floating-point elements 
1304 /// in `a` to packed single-precision (32-bit) floating-point elements.
1305 __m128 _mm_cvtpd_ps (__m128d a) pure @trusted
1306 {
1307     static if (LDC_with_SSE2)
1308     {
1309         return __builtin_ia32_cvtpd2ps(a); // can't be done with IR unfortunately
1310     }
1311     else static if (GDC_with_SSE2)
1312     {
1313         return __builtin_ia32_cvtpd2ps(a);
1314     }
1315     else
1316     { 
1317         __m128 r = void;
1318         r.ptr[0] = a.array[0];
1319         r.ptr[1] = a.array[1];
1320         r.ptr[2] = 0;
1321         r.ptr[3] = 0;
1322         return r;
1323     }
1324 }
1325 unittest
1326 {
1327     __m128d A = _mm_set_pd(5.25, 4.0);
1328     __m128 B = _mm_cvtpd_ps(A);
1329     assert(B.array == [4.0f, 5.25f, 0, 0]);
1330 }
1331 
1332 /// Convert packed 32-bit integers in `v` to packed double-precision 
1333 /// (64-bit) floating-point elements.
1334 __m128d _mm_cvtpi32_pd (__m64 v) pure @safe
1335 {
1336     return _mm_cvtepi32_pd(to_m128i(v));
1337 }
1338 unittest
1339 {
1340     __m128d A = _mm_cvtpi32_pd(_mm_setr_pi32(4, -5));
1341     assert(A.array[0] == 4.0 && A.array[1] == -5.0);
1342 }
1343 
1344 /// Convert packed single-precision (32-bit) floating-point elements 
1345 /// in `a` to packed 32-bit integers
1346 __m128i _mm_cvtps_epi32 (__m128 a) @trusted
1347 {
1348     static if (LDC_with_SSE2)
1349     {
1350         return cast(__m128i) __builtin_ia32_cvtps2dq(a);
1351     }
1352     else static if (GDC_with_SSE2)
1353     {
1354         return __builtin_ia32_cvtps2dq(a);
1355     }
1356     else static if (LDC_with_ARM64)
1357     {
1358         // Get current rounding mode.
1359         uint fpscr = arm_get_fpcr();
1360         switch(fpscr & _MM_ROUND_MASK_ARM)
1361         {
1362             default:
1363             case _MM_ROUND_NEAREST_ARM:     return vcvtnq_s32_f32(a);
1364             case _MM_ROUND_DOWN_ARM:        return vcvtmq_s32_f32(a);
1365             case _MM_ROUND_UP_ARM:          return vcvtpq_s32_f32(a);
1366             case _MM_ROUND_TOWARD_ZERO_ARM: return vcvtzq_s32_f32(a);
1367         }
1368     }
1369     else
1370     {
1371         __m128i r = void;
1372         r.ptr[0] = convertFloatToInt32UsingMXCSR(a.array[0]);
1373         r.ptr[1] = convertFloatToInt32UsingMXCSR(a.array[1]);
1374         r.ptr[2] = convertFloatToInt32UsingMXCSR(a.array[2]);
1375         r.ptr[3] = convertFloatToInt32UsingMXCSR(a.array[3]);
1376         return r;
1377     }
1378 }
1379 unittest
1380 {
1381     // GDC bug #98607
1382     // https://gcc.gnu.org/bugzilla/show_bug.cgi?id=98607
1383     // GDC does not provide optimization barrier for rounding mode.
    // Worked around with different literals. This bug will likely only manifest in unittests.
    // GCC people provided no actual fix and instead argue other compilers are buggy... when they aren't.
1386 
1387     uint savedRounding = _MM_GET_ROUNDING_MODE();
1388 
1389     _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST);
1390     __m128i A = _mm_cvtps_epi32(_mm_setr_ps(1.4f, -2.1f, 53.5f, -2.9f));
1391     assert(A.array == [1, -2, 54, -3]);
1392 
1393     _MM_SET_ROUNDING_MODE(_MM_ROUND_DOWN);
1394     A = _mm_cvtps_epi32(_mm_setr_ps(1.3f, -2.11f, 53.4f, -2.8f));
1395     assert(A.array == [1, -3, 53, -3]);
1396 
1397     _MM_SET_ROUNDING_MODE(_MM_ROUND_UP);
1398     A = _mm_cvtps_epi32(_mm_setr_ps(1.3f, -2.12f, 53.6f, -2.7f));
1399     assert(A.array == [2, -2, 54, -2]);
1400 
1401     _MM_SET_ROUNDING_MODE(_MM_ROUND_TOWARD_ZERO);
1402     A = _mm_cvtps_epi32(_mm_setr_ps(1.4f, -2.17f, 53.8f, -2.91f));
1403     assert(A.array == [1, -2, 53, -2]);
1404 
1405     _MM_SET_ROUNDING_MODE(savedRounding);
1406 }
1407 
1408 /// Convert packed single-precision (32-bit) floating-point elements 
1409 /// in `a` to packed double-precision (64-bit) floating-point elements.
1410 __m128d _mm_cvtps_pd (__m128 a) pure @trusted
1411 {
1412     version(LDC)
1413     {
1414         // Generates cvtps2pd since LDC 1.0 -O0
1415         enum ir = `
1416             %v = shufflevector <4 x float> %0,<4 x float> %0, <2 x i32> <i32 0, i32 1>
1417             %r = fpext <2 x float> %v to <2 x double>
1418             ret <2 x double> %r`;
1419         return cast(__m128d) LDCInlineIR!(ir, __m128d, __m128)(a);
1420     }
1421     else static if (GDC_with_SSE2)
1422     {
1423         return __builtin_ia32_cvtps2pd(a);
1424     }
1425     else
1426     {
1427         double2 r = void;
1428         r.ptr[0] = a.array[0];
1429         r.ptr[1] = a.array[1];
1430         return r;
1431     }
1432 }
1433 unittest
1434 {
1435     __m128d A = _mm_cvtps_pd(_mm_set1_ps(54.0f));
1436     assert(A.array[0] == 54.0);
1437     assert(A.array[1] == 54.0);
1438 }
1439 
1440 /// Copy the lower double-precision (64-bit) floating-point element of `a`.
1441 double _mm_cvtsd_f64 (__m128d a) pure @safe
1442 {
1443     return a.array[0];
1444 }
1445 
1446 /// Convert the lower double-precision (64-bit) floating-point element
1447 /// in `a` to a 32-bit integer.
1448 int _mm_cvtsd_si32 (__m128d a) @safe
1449 {
1450     static if (LDC_with_SSE2)
1451     {
1452         return __builtin_ia32_cvtsd2si(a);
1453     }
1454     else static if (GDC_with_SSE2)
1455     {
1456         return __builtin_ia32_cvtsd2si(a);
1457     }
1458     else
1459     {
1460         return convertDoubleToInt32UsingMXCSR(a[0]);
1461     }
1462 }
1463 unittest
1464 {
1465     assert(4 == _mm_cvtsd_si32(_mm_set1_pd(4.0)));
1466 }
1467 
1468 /// Convert the lower double-precision (64-bit) floating-point element in `a` to a 64-bit integer.
1469 long _mm_cvtsd_si64 (__m128d a) @trusted
1470 {
1471     version (LDC)
1472     {
1473         version (X86_64)
1474         {
1475             return __builtin_ia32_cvtsd2si64(a);
1476         }
1477         else
1478         {
            // Note: In 32-bit x86, there is no way to convert from float/double to a 64-bit integer
            // using SSE instructions only, so the builtin doesn't exist for this arch.
1481             return convertDoubleToInt64UsingMXCSR(a[0]);
1482         }
1483     }
1484     else
1485     {
1486         return convertDoubleToInt64UsingMXCSR(a.array[0]);
1487     }
1488 }
1489 unittest
1490 {
1491     assert(-4 == _mm_cvtsd_si64(_mm_set1_pd(-4.0)));
1492 
1493     uint savedRounding = _MM_GET_ROUNDING_MODE();
1494 
1495     _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST);
1496     assert(-56468486186 == _mm_cvtsd_si64(_mm_set1_pd(-56468486186.49)));
1497 
1498     _MM_SET_ROUNDING_MODE(_MM_ROUND_DOWN);
1499     assert(-56468486187 == _mm_cvtsd_si64(_mm_set1_pd(-56468486186.1)));
1500 
1501     _MM_SET_ROUNDING_MODE(_MM_ROUND_UP);
1502     assert(56468486187 == _mm_cvtsd_si64(_mm_set1_pd(56468486186.1)));
1503 
1504     _MM_SET_ROUNDING_MODE(_MM_ROUND_TOWARD_ZERO);
1505     assert(-56468486186 == _mm_cvtsd_si64(_mm_set1_pd(-56468486186.9)));
1506 
1507     _MM_SET_ROUNDING_MODE(savedRounding);
1508 }
1509 
1510 deprecated("Use _mm_cvtsd_si64 instead") alias _mm_cvtsd_si64x = _mm_cvtsd_si64; ///
1511 
1512 /// Convert the lower double-precision (64-bit) floating-point element in `b` to a single-precision (32-bit) 
1513 /// floating-point element, store that in the lower element of result, and copy the upper 3 packed elements from `a`
1514 /// to the upper elements of result.
1515 __m128 _mm_cvtsd_ss (__m128 a, __m128d b) pure @trusted
1516 {
1517     static if (GDC_with_SSE2)
1518     {
1519         return __builtin_ia32_cvtsd2ss(a, b); 
1520     }
1521     else
1522     {
1523         // Generates cvtsd2ss since LDC 1.3 -O0
1524         a.ptr[0] = b.array[0];
1525         return a;
1526     }
1527 }
1528 unittest
1529 {
1530     __m128 R = _mm_cvtsd_ss(_mm_set1_ps(4.0f), _mm_set1_pd(3.0));
1531     assert(R.array == [3.0f, 4.0f, 4.0f, 4.0f]);
1532 }
1533 
1534 /// Get the lower 32-bit integer in `a`.
1535 int _mm_cvtsi128_si32 (__m128i a) pure @safe
1536 {
1537     return a.array[0];
1538 }
1539 
1540 /// Get the lower 64-bit integer in `a`.
1541 long _mm_cvtsi128_si64 (__m128i a) pure @safe
1542 {
1543     long2 la = cast(long2)a;
1544     return la.array[0];
1545 }
1546 deprecated("Use _mm_cvtsi128_si64 instead") alias _mm_cvtsi128_si64x = _mm_cvtsi128_si64;
1547 
1548 /// Convert the signed 32-bit integer `b` to a double-precision (64-bit) floating-point element, store that in the 
1549 /// lower element of result, and copy the upper element from `a` to the upper element of result.
1550 __m128d _mm_cvtsi32_sd(__m128d a, int b) pure @trusted
1551 {
1552     a.ptr[0] = cast(double)b;
1553     return a;
1554 }
1555 unittest
1556 {
1557     __m128d a = _mm_cvtsi32_sd(_mm_set1_pd(0.0f), 42);
1558     assert(a.array == [42.0, 0]);
1559 }
1560 
1561 /// Copy 32-bit integer `a` to the lower element of result, and zero the upper elements.
1562 __m128i _mm_cvtsi32_si128 (int a) pure @trusted
1563 {
1564     int4 r = [0, 0, 0, 0];
1565     r.ptr[0] = a;
1566     return r;
1567 }
1568 unittest
1569 {
1570     __m128i a = _mm_cvtsi32_si128(65);
1571     assert(a.array == [65, 0, 0, 0]);
1572 }
1573 
1574 /// Convert the signed 64-bit integer `b` to a double-precision (64-bit) floating-point element, store the result in 
/// the lower element of result, and copy the upper element from `a` to the upper element of result.
1577 __m128d _mm_cvtsi64_sd(__m128d a, long b) pure @trusted
1578 {
1579     a.ptr[0] = cast(double)b;
1580     return a;
1581 }
1582 unittest
1583 {
1584     __m128d a = _mm_cvtsi64_sd(_mm_set1_pd(0.0f), 42);
1585     assert(a.array == [42.0, 0]);
1586 }
1587 
1588 /// Copy 64-bit integer `a` to the lower element of result, and zero the upper element.
1589 __m128i _mm_cvtsi64_si128 (long a) pure @trusted
1590 {
1591     long2 r = [0, 0];
1592     r.ptr[0] = a;
1593     return cast(__m128i)(r);
1594 }
1595 
1596 deprecated("Use _mm_cvtsi64_sd instead") alias _mm_cvtsi64x_sd = _mm_cvtsi64_sd; ///
1597 deprecated("Use _mm_cvtsi64_si128 instead") alias _mm_cvtsi64x_si128 = _mm_cvtsi64_si128; ///
1598 
1599 /// Convert the lower single-precision (32-bit) floating-point element in `b` to a double-precision (64-bit) 
1600 /// floating-point element, store that in the lower element of result, and copy the upper element from `a` to the upper 
/// element of result.
1602 double2 _mm_cvtss_sd(double2 a, float4 b) pure @trusted
1603 {
1604     a.ptr[0] = b.array[0];
1605     return a;
1606 }
1607 unittest
1608 {
1609     __m128d a = _mm_cvtss_sd(_mm_set1_pd(0.0f), _mm_set1_ps(42.0f));
1610     assert(a.array == [42.0, 0]);
1611 }
1612 
1613 /// Convert the lower single-precision (32-bit) floating-point element in `a` to a 64-bit integer with truncation.
1614 long _mm_cvttss_si64 (__m128 a) pure @safe
1615 {
1616     return cast(long)(a.array[0]); // Generates cvttss2si as expected
1617 }
1618 unittest
1619 {
1620     assert(1 == _mm_cvttss_si64(_mm_setr_ps(1.9f, 2.0f, 3.0f, 4.0f)));
1621 }
1622 
1623 /// Convert packed double-precision (64-bit) floating-point elements in `a` to packed 32-bit integers with truncation.
1624 /// Put zeroes in the upper elements of result.
1625 __m128i _mm_cvttpd_epi32 (__m128d a) pure @trusted
1626 {
1627     static if (LDC_with_SSE2)
1628     {
1629         return __builtin_ia32_cvttpd2dq(a);
1630     }
1631     else static if (GDC_with_SSE2)
1632     {
1633         return __builtin_ia32_cvttpd2dq(a);
1634     }
1635     else
1636     {
1637         // Note: doesn't generate cvttpd2dq as of LDC 1.13
1638         __m128i r;
1639         r.ptr[0] = cast(int)a.array[0];
1640         r.ptr[1] = cast(int)a.array[1];
1641         r.ptr[2] = 0;
1642         r.ptr[3] = 0;
1643         return r;
1644     }
1645 }
1646 unittest
1647 {
1648     __m128i R = _mm_cvttpd_epi32(_mm_setr_pd(-4.9, 45641.5f));
1649     assert(R.array == [-4, 45641, 0, 0]);
1650 }
1651 
1652 /// Convert packed double-precision (64-bit) floating-point elements in `v` 
1653 /// to packed 32-bit integers with truncation.
1654 __m64 _mm_cvttpd_pi32 (__m128d v) pure @safe
1655 {
1656     return to_m64(_mm_cvttpd_epi32(v));
1657 }
1658 unittest
1659 {
1660     int2 R = cast(int2) _mm_cvttpd_pi32(_mm_setr_pd(-4.9, 45641.7f));
1661     int[2] correct = [-4, 45641];
1662     assert(R.array == correct);
1663 }
1664 
1665 /// Convert packed single-precision (32-bit) floating-point elements in `a` to packed 32-bit integers with truncation.
1666 __m128i _mm_cvttps_epi32 (__m128 a) pure @trusted
1667 {
1668     // x86: Generates cvttps2dq since LDC 1.3 -O2
1669     // ARM64: generates fcvtze since LDC 1.8 -O2
1670     __m128i r;
1671     r.ptr[0] = cast(int)a.array[0];
1672     r.ptr[1] = cast(int)a.array[1];
1673     r.ptr[2] = cast(int)a.array[2];
1674     r.ptr[3] = cast(int)a.array[3];
1675     return r;
1676 }
1677 unittest
1678 {
1679     __m128i R = _mm_cvttps_epi32(_mm_setr_ps(-4.9, 45641.5f, 0.0f, 1.0f));
1680     assert(R.array == [-4, 45641, 0, 1]);
1681 }
1682 
1683 /// Convert the lower double-precision (64-bit) floating-point element in `a` to a 32-bit integer with truncation.
1684 int _mm_cvttsd_si32 (__m128d a)
1685 {
1686     // Generates cvttsd2si since LDC 1.3 -O0
1687     return cast(int)a.array[0];
1688 }
1689 
1690 /// Convert the lower double-precision (64-bit) floating-point element in `a` to a 64-bit integer with truncation.
1691 long _mm_cvttsd_si64 (__m128d a)
1692 {
1693     // Generates cvttsd2si since LDC 1.3 -O0
    // but on 32-bit x86, it's a long sequence that resorts to the FPU
1695     return cast(long)a.array[0];
1696 }
1697 
1698 deprecated("Use _mm_cvttsd_si64 instead") alias _mm_cvttsd_si64x = _mm_cvttsd_si64; ///
1699 
1700 /// Divide packed double-precision (64-bit) floating-point elements in `a` by packed elements in `b`.
1701 __m128d _mm_div_pd(__m128d a, __m128d b) pure @safe
1702 {
1703     pragma(inline, true);
1704     return a / b;
1705 }
1706 
/// Divide the lower double-precision (64-bit) floating-point element in `a` by the lower element of `b`, store the
/// result in the lower element of result, and copy the upper element from `a` to the upper element of result.
__m128d _mm_div_sd(__m128d a, __m128d b) pure @trusted
1708 {
1709     static if (GDC_with_SSE2)
1710     {
1711         return __builtin_ia32_divsd(a, b);
1712     }
1713     else version(DigitalMars)
1714     {
1715         // Work-around for https://issues.dlang.org/show_bug.cgi?id=19599
1716         // Note that this is unneeded since DMD >= 2.094.0 at least, haven't investigated again
1717         asm pure nothrow @nogc @trusted { nop;}
1718         a.array[0] = a.array[0] / b.array[0];
1719         return a;
1720     }
1721     else
1722     {
1723         a.ptr[0] /= b.array[0];
1724         return a;
1725     }
1726 }
1727 unittest
1728 {
1729     __m128d a = [2.0, 4.5];
1730     a = _mm_div_sd(a, a);
1731     assert(a.array == [1.0, 4.5]);
1732 }
1733 
1734 /// Extract a 16-bit integer from `v`, selected with `index`.
1735 /// Warning: the returned value is zero-extended to 32-bits.
1736 int _mm_extract_epi16(__m128i v, int index) pure @safe
1737 {
1738     short8 r = cast(short8)v;
1739     return cast(ushort)(r.array[index & 7]);
1740 }
1741 unittest
1742 {
1743     __m128i A = _mm_set_epi16(7, 6, 5, 4, 3, 2, 1, -1);
1744     assert(_mm_extract_epi16(A, 6) == 6);
1745     assert(_mm_extract_epi16(A, 0) == 65535);
1746     assert(_mm_extract_epi16(A, 5 + 8) == 5);
1747 }
1748 
1749 /// Copy `v`, and insert the 16-bit integer `i` at the location specified by `index`.
1750 __m128i _mm_insert_epi16 (__m128i v, int i, int index) @trusted
1751 {
1752     short8 r = cast(short8)v;
1753     r.ptr[index & 7] = cast(short)i;
1754     return cast(__m128i)r;
1755 }
1756 unittest
1757 {
1758     __m128i A = _mm_set_epi16(7, 6, 5, 4, 3, 2, 1, 0);
1759     short8 R = cast(short8) _mm_insert_epi16(A, 42, 6);
1760     short[8] correct = [0, 1, 2, 3, 4, 5, 42, 7];
1761     assert(R.array == correct);
1762 }
1763 
1764 
/// Perform a serializing operation on all load-from-memory instructions that were issued prior to this instruction. 
/// Guarantees that every load instruction that precedes, in program order, the load-fence instruction is globally 
/// visible before any load instruction which follows the fence in program order.
void _mm_lfence() @trusted
1766 {
1767     version(GNU)
1768     {
1769     
1770         static if (GDC_with_SSE2)
1771         {
1772             __builtin_ia32_lfence();
1773         }
1774         else version(X86)
1775         {
1776             asm pure nothrow @nogc @trusted
1777             {
1778                 "lfence;\n" : : : ;
1779             }
1780         }
1781         else
1782             static assert(false);
1783     }
1784     else static if (LDC_with_SSE2)
1785     {
1786         __builtin_ia32_lfence();
1787     }
1788     else static if (DMD_with_asm)
1789     {
1790         asm nothrow @nogc pure @safe
1791         {
1792             lfence;
1793         }
1794     }
1795     else version(LDC)
1796     {
1797         llvm_memory_fence(); // PERF actually generates mfence
1798     }
1799     else
1800         static assert(false);
1801 }
1802 unittest
1803 {
1804     _mm_lfence();
1805 }
1806 
1807 /// Load 128-bits (composed of 2 packed double-precision (64-bit) floating-point elements) from memory.
1808 /// `mem_addr` must be aligned on a 16-byte boundary or a general-protection exception may be generated.
1809 __m128d _mm_load_pd (const(double) * mem_addr) pure
1810 {
1811     pragma(inline, true);
1812     __m128d* aligned = cast(__m128d*)mem_addr;
1813     return *aligned;
1814 }
1815 unittest
1816 {
1817     align(16) double[2] S = [-5.0, 7.0];
1818     __m128d R = _mm_load_pd(S.ptr);
1819     assert(R.array == S);
1820 }
1821 
1822 /// Load a double-precision (64-bit) floating-point element from memory into both elements of dst.
1823 /// `mem_addr` does not need to be aligned on any particular boundary.
1824 __m128d _mm_load_pd1 (const(double)* mem_addr) pure
1825 {
1826     double m = *mem_addr;
1827     __m128d r;
1828     r.ptr[0] = m;
1829     r.ptr[1] = m;
1830     return r;
1831 }
1832 unittest
1833 {
1834     double what = 4;
1835     __m128d R = _mm_load_pd1(&what);
1836     double[2] correct = [4.0, 4];
1837     assert(R.array == correct);
1838 }
1839 
1840 /// Load a double-precision (64-bit) floating-point element from memory into the lower of result, and zero the upper 
1841 /// element. `mem_addr` does not need to be aligned on any particular boundary.
1842 __m128d _mm_load_sd (const(double)* mem_addr) pure @trusted
1843 {
1844     double2 r = [0, 0];
1845     r.ptr[0] = *mem_addr;
1846     return r;
1847 }
1848 unittest
1849 {
1850     double x = -42;
1851     __m128d a = _mm_load_sd(&x);
1852     assert(a.array == [-42.0, 0.0]);
1853 }
1854 
1855 /// Load 128-bits of integer data from memory into dst. 
1856 /// `mem_addr` must be aligned on a 16-byte boundary or a general-protection exception may be generated.
__m128i _mm_load_si128 (const(__m128i)* mem_addr) pure @trusted // TODO: shouldn't be trusted because alignment, Issue #62
1858 {
1859     pragma(inline, true);
1860     return *mem_addr;
1861 }
1862 unittest
1863 {
1864     align(16) int[4] correct = [-1, 2, 3, 4];
1865     int4 A = cast(int4) _mm_load_si128(cast(__m128i*) correct.ptr);
1866     assert(A.array == correct);
1867 }
1868 
1869 alias _mm_load1_pd = _mm_load_pd1; ///
1870 
1871 /// Load a double-precision (64-bit) floating-point element from memory into the upper element of result, and copy the 
1872 /// lower element from `a` to result. `mem_addr` does not need to be aligned on any particular boundary.
1873 __m128d _mm_loadh_pd (__m128d a, const(double)* mem_addr) pure @trusted
1874 {
1875     pragma(inline, true);
1876     a.ptr[1] = *mem_addr;
1877     return a;
1878 }
1879 unittest
1880 {
1881     double A = 7.0;
1882     __m128d B = _mm_setr_pd(4.0, -5.0);
1883     __m128d R = _mm_loadh_pd(B, &A);
1884     double[2] correct = [ 4.0, 7.0 ];
1885     assert(R.array == correct);
1886 }
1887 
/// Load 64-bit integer from memory into the lower element of result, and zero the upper element.
// Note: strange signature since the memory doesn't have to be aligned (Issue #60)
1890 __m128i _mm_loadl_epi64 (const(__m128i)* mem_addr) pure @trusted // TODO signature
1891 {
1892     pragma(inline, true);
1893     auto pLong = cast(const(long)*)mem_addr;
1894     long2 r = [0, 0];
1895     r.ptr[0] = *pLong;
1896     return cast(__m128i)(r);
1897 }
1898 unittest
1899 {
1900     long A = 0x7878787870707070;
1901     long2 R = cast(long2) _mm_loadl_epi64(cast(__m128i*)&A);
1902     long[2] correct = [0x7878787870707070, 0];
1903     assert(R.array == correct);
1904 }
1905 
1906 /// Load a double-precision (64-bit) floating-point element from memory into the lower element of result, and copy the 
1907 /// upper element from `a` to result. mem_addr does not need to be aligned on any particular boundary.
1908 __m128d _mm_loadl_pd (__m128d a, const(double)* mem_addr) pure @trusted
1909 {
1910     a.ptr[0] = *mem_addr;
1911     return a;
1912 }
1913 unittest
1914 {
1915     double A = 7.0;
1916     __m128d B = _mm_setr_pd(4.0, -5.0);
1917     __m128d R = _mm_loadl_pd(B, &A);
1918     double[2] correct = [ 7.0, -5.0 ];
1919     assert(R.array == correct);
1920 }
1921 
1922 /// Load 2 double-precision (64-bit) floating-point elements from memory into result in reverse order. 
1923 /// `mem_addr` must be aligned on a 16-byte boundary or a general-protection exception may be generated.
1924 __m128d _mm_loadr_pd (const(double)* mem_addr) pure @trusted
1925 {
1926     __m128d a = *cast(__m128d*)(mem_addr);
1927     __m128d r;
1928     r.ptr[0] = a.array[1];
1929     r.ptr[1] = a.array[0];
1930     return r;
1931 }
1932 unittest
1933 {
1934     align(16) double[2] A = [56.0, -74.0];
1935     __m128d R = _mm_loadr_pd(A.ptr);
1936     double[2] correct = [-74.0, 56.0];
1937     assert(R.array == correct);
1938 }
1939 
1940 /// Load 128-bits (composed of 2 packed double-precision (64-bit) floating-point elements) from memory. 
1941 /// `mem_addr` does not need to be aligned on any particular boundary.
1942 __m128d _mm_loadu_pd (const(double)* mem_addr) pure @trusted
1943 {
1944     pragma(inline, true);
1945     static if (GDC_with_SSE2)
1946     {
1947         return __builtin_ia32_loadupd(mem_addr); 
1948     }
1949     else version(LDC)
1950     {
1951         return loadUnaligned!(double2)(mem_addr);
1952     }
1953     else version(DigitalMars)
1954     {
1955         static if (DMD_with_DSIMD)
1956         {
1957             return cast(__m128d)__simd(XMM.LODUPD, *mem_addr);
1958         }
1959         else static if (SSESizedVectorsAreEmulated)
1960         {
            // Since this vector is emulated, it doesn't have alignment constraints
1962             // and as such we can just cast it.
1963             return *cast(__m128d*)(mem_addr);
1964         }
1965         else
1966         {
1967             __m128d result;
1968             result.ptr[0] = mem_addr[0];
1969             result.ptr[1] = mem_addr[1];
1970             return result;
1971         }
1972     }
1973     else
1974     {
1975         __m128d result;
1976         result.ptr[0] = mem_addr[0];
1977         result.ptr[1] = mem_addr[1];
1978         return result;
1979     }
1980 }
1981 unittest
1982 {
1983     double[2] A = [56.0, -75.0];
1984     __m128d R = _mm_loadu_pd(A.ptr);
1985     double[2] correct = [56.0, -75.0];
1986     assert(R.array == correct);
1987 }
1988 
1989 /// Load 128-bits of integer data from memory. `mem_addr` does not need to be aligned on any particular boundary.
1990 __m128i _mm_loadu_si128 (const(__m128i)* mem_addr) pure @trusted
1991 {
1992     pragma(inline, true);
1993     static if (GDC_with_SSE2)
1994     {
1995         return cast(__m128i) __builtin_ia32_loaddqu(cast(const(char*))mem_addr);
1996     }
1997     else
1998     {
1999         return loadUnaligned!(__m128i)(cast(int*)mem_addr);
2000     }
2001 }
2002 unittest
2003 {
2004     align(16) int[4] correct = [-1, 2, -3, 4];
2005     int4 A = cast(int4) _mm_loadu_si128(cast(__m128i*) correct.ptr);
2006     assert(A.array == correct);
2007 }
2008 
2009 /// Load unaligned 32-bit integer from memory into the first element of result.
2010 __m128i _mm_loadu_si32 (const(void)* mem_addr) pure @trusted
2011 {
2012     pragma(inline, true);
2013     int r = *cast(int*)(mem_addr);
2014     int4 result = [0, 0, 0, 0];
2015     result.ptr[0] = r;
2016     return result;
2017 }
2018 unittest
2019 {
2020     int r = 42;
2021     __m128i A = _mm_loadu_si32(&r);
2022     int[4] correct = [42, 0, 0, 0];
2023     assert(A.array == correct);
2024 }
2025 
2026 /// Multiply packed signed 16-bit integers in `a` and `b`, producing intermediate
2027 /// signed 32-bit integers. Horizontally add adjacent pairs of intermediate 32-bit integers,
2028 /// and pack the results in destination.
2029 __m128i _mm_madd_epi16 (__m128i a, __m128i b) pure @trusted
2030 {
2031     static if (GDC_with_SSE2)
2032     {
2033         return cast(__m128i) __builtin_ia32_pmaddwd128(cast(short8)a, cast(short8)b);
2034     }
2035     else static if (LDC_with_SSE2)
2036     {
2037         return cast(__m128i) __builtin_ia32_pmaddwd128(cast(short8)a, cast(short8)b);
2038     }
2039     else static if (LDC_with_ARM64)
2040     {
2041         int4 pl = vmull_s16(vget_low_s16(cast(short8)a), vget_low_s16(cast(short8)b));
2042         int4 ph = vmull_s16(vget_high_s16(cast(short8)a), vget_high_s16(cast(short8)b));
2043         int2 rl = vpadd_s32(vget_low_s32(pl), vget_high_s32(pl));
2044         int2 rh = vpadd_s32(vget_low_s32(ph), vget_high_s32(ph));
2045         return vcombine_s32(rl, rh);
2046     }
2047     else
2048     {
2049         short8 sa = cast(short8)a;
2050         short8 sb = cast(short8)b;
2051         int4 r;
2052         foreach(i; 0..4)
2053         {
2054             r.ptr[i] = sa.array[2*i] * sb.array[2*i] + sa.array[2*i+1] * sb.array[2*i+1];
2055         }
2056         return r;
2057     }
2058 }
2059 unittest
2060 {
2061     short8 A = [0, 1, 2, 3, -32768, -32768, 32767, 32767];
2062     short8 B = [0, 1, 2, 3, -32768, -32768, 32767, 32767];
2063     int4 R = _mm_madd_epi16(cast(__m128i)A, cast(__m128i)B);
2064     int[4] correct = [1, 13, -2147483648, 2*32767*32767];
2065     assert(R.array == correct);
2066 }
2067 
2068 /// Conditionally store 8-bit integer elements from `a` into memory using `mask`
2069 /// (elements are not stored when the highest bit is not set in the corresponding element)
2070 /// and a non-temporal memory hint. `mem_addr` does not need to be aligned on any particular
2071 /// boundary.
2072 void _mm_maskmoveu_si128 (__m128i a, __m128i mask, void* mem_addr) @trusted
2073 {
2074     static if (GDC_with_SSE2)
2075     {    
2076         return __builtin_ia32_maskmovdqu(cast(ubyte16)a, cast(ubyte16)mask, cast(char*)mem_addr);
2077     }
2078     else static if (LDC_with_SSE2)
2079     {
2080         return __builtin_ia32_maskmovdqu(cast(byte16)a, cast(byte16)mask, cast(char*)mem_addr);
2081     }
2082     else static if (LDC_with_ARM64)
2083     {
2084         // PERF: catastrophic on ARM32
2085         byte16 bmask  = cast(byte16)mask;
2086         byte16 shift = 7;
2087         bmask = bmask >> shift; // sign-extend to have a 0xff or 0x00 mask
2088         mask = cast(__m128i) bmask;
2089         __m128i dest = loadUnaligned!__m128i(cast(int*)mem_addr);
2090         dest = (a & mask) | (dest & ~mask);
2091         storeUnaligned!__m128i(dest, cast(int*)mem_addr);
2092     }
2093     else
2094     {
2095         byte16 b = cast(byte16)a;
2096         byte16 m = cast(byte16)mask;
2097         byte* dest = cast(byte*)(mem_addr);
2098         foreach(j; 0..16)
2099         {
2100             if (m.array[j] & 128)
2101             {
2102                 dest[j] = b.array[j];
2103             }
2104         }
2105     }
2106 }
2107 unittest
2108 {
2109     ubyte[16] dest =           [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42];
2110     __m128i mask = _mm_setr_epi8(0,-1, 0,-1,-1, 1,-1,-1, 0,-1,-4,-1,-1, 0,-127, 0);
2111     __m128i A    = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15);
2112     _mm_maskmoveu_si128(A, mask, dest.ptr);
2113     ubyte[16] correct =        [42, 1,42, 3, 4,42, 6, 7,42, 9,10,11,12,42,14,42];
2114     assert(dest == correct);
2115 }
2116 
2117 /// Compare packed signed 16-bit integers in `a` and `b`, and return packed maximum values.
2118 __m128i _mm_max_epi16 (__m128i a, __m128i b) pure @safe
2119 {
2120     static if (GDC_with_SSE2)
2121     {
2122         return cast(__m128i) __builtin_ia32_pmaxsw128(cast(short8)a, cast(short8)b);
2123     }
2124     else version(LDC)
2125     {
2126         // x86: pmaxsw since LDC 1.0 -O1
        // ARM: smax.8h since LDC 1.5 -O1
2128         short8 sa = cast(short8)a;
2129         short8 sb = cast(short8)b;
2130         short8 greater = greaterMask!short8(sa, sb);
2131         return cast(__m128i)( (greater & sa) | (~greater & sb) );
2132     }
2133     else
2134     {
2135         __m128i lowerShorts = _mm_cmpgt_epi16(a, b); // ones where a should be selected, b else
2136         __m128i aTob = _mm_xor_si128(a, b); // a ^ (a ^ b) == b
2137         __m128i mask = _mm_and_si128(aTob, lowerShorts);
2138         return _mm_xor_si128(b, mask);
2139     }
2140 }
2141 unittest
2142 {
2143     short8 R = cast(short8) _mm_max_epi16(_mm_setr_epi16(32767, 1, -4, -8, 9,  7, 0,-57),
2144                                           _mm_setr_epi16(-4,-8,  9,  7, 0,-32768, 0,  0));
2145     short[8] correct =                                  [32767, 1,  9,  7, 9,  7, 0,  0];
2146     assert(R.array == correct);
2147 }
2148 
2149 /// Compare packed unsigned 8-bit integers in a and b, and return packed maximum values.
2150 __m128i _mm_max_epu8 (__m128i a, __m128i b) pure @safe
2151 {
2152     version(LDC)
2153     {
2154         // x86: pmaxub since LDC 1.0.0 -O1
2155         // ARM64: umax.16b since LDC 1.5.0 -O1
2156         // PERF: catastrophic on ARM32
2157         ubyte16 sa = cast(ubyte16)a;
2158         ubyte16 sb = cast(ubyte16)b;
2159         ubyte16 greater = cast(ubyte16) greaterMask!ubyte16(sa, sb);
2160         return cast(__m128i)( (greater & sa) | (~greater & sb) );
2161     }
2162     else
2163     {
2164         __m128i value128 = _mm_set1_epi8(-128);
2165         __m128i higher = _mm_cmpgt_epi8(_mm_add_epi8(a, value128), _mm_add_epi8(b, value128)); // signed comparison
2166         __m128i aTob = _mm_xor_si128(a, b); // a ^ (a ^ b) == b
2167         __m128i mask = _mm_and_si128(aTob, higher);
2168         return _mm_xor_si128(b, mask);
2169     }
2170 }
2171 unittest
2172 {
2173     byte16 R = cast(byte16) _mm_max_epu8(_mm_setr_epi8(45, 1, -4, -8, 9,  7, 0,-57, -4,-8,  9,  7, 0,-57, 0,  0),
2174                                          _mm_setr_epi8(-4,-8,  9,  7, 0,-57, 0,  0, 45, 1, -4, -8, 9,  7, 0,-57));
2175     byte[16] correct =                                [-4,-8, -4, -8, 9,-57, 0,-57, -4,-8, -4, -8, 9,-57, 0,-57];
2176     assert(R.array == correct);
2177 }
2178 
2179 /// Compare packed double-precision (64-bit) floating-point elements in `a` and `b`, and return packed maximum values.
2180 __m128d _mm_max_pd (__m128d a, __m128d b) pure @trusted
2181 {
2182     static if (GDC_with_SSE2)
2183     {
2184         return __builtin_ia32_maxpd(a, b);
2185     }
2186     else
2187     {
2188         // x86: Generates maxpd starting with LDC 1.9 -O2
2189         a.ptr[0] = (a.array[0] > b.array[0]) ? a.array[0] : b.array[0];
2190         a.ptr[1] = (a.array[1] > b.array[1]) ? a.array[1] : b.array[1];
2191         return a;
2192     }
2193 }
2194 unittest
2195 {
2196     __m128d A = _mm_setr_pd(4.0, 1.0);
2197     __m128d B = _mm_setr_pd(1.0, 8.0);
2198     __m128d M = _mm_max_pd(A, B);
2199     assert(M.array[0] == 4.0);
2200     assert(M.array[1] == 8.0);
2201 }
2202 
2203 /// Compare the lower double-precision (64-bit) floating-point elements in `a` and `b`, store the maximum value in the 
2204 /// lower element of result, and copy the upper element from `a` to the upper element of result.
2205 __m128d _mm_max_sd (__m128d a, __m128d b) pure @trusted
2206 {
2207     static if (GDC_with_SSE2)
2208     {
2209         return __builtin_ia32_maxsd(a, b);
2210     }
2211     else
2212     {
2213          __m128d r = a;
2214         // Generates maxsd starting with LDC 1.3
2215         r.ptr[0] = (a.array[0] > b.array[0]) ? a.array[0] : b.array[0];
2216         return r;
2217     }
2218 }
2219 unittest
2220 {
2221     __m128d A = _mm_setr_pd(1.0, 1.0);
2222     __m128d B = _mm_setr_pd(4.0, 2.0);
2223     __m128d M = _mm_max_sd(A, B);
2224     assert(M.array[0] == 4.0);
2225     assert(M.array[1] == 1.0);
2226 }
2227 
2228 /// Perform a serializing operation on all load-from-memory and store-to-memory instructions that were issued prior to 
2229 /// this instruction. Guarantees that every memory access that precedes, in program order, the memory fence instruction 
2230 /// is globally visible before any memory instruction which follows the fence in program order.
2231 void _mm_mfence() @trusted
2232 {
2233     version(GNU)
2234     {
2235         static if (GDC_with_SSE2)
2236         {
2237             __builtin_ia32_mfence();
2238         }
2239         else version(X86)
2240         {
2241             asm pure nothrow @nogc @trusted
2242             {
2243                 "mfence;\n" : : : ;
2244             }
2245         }
2246         else
2247             static assert(false);
2248     }
2249     else static if (LDC_with_SSE2)
2250     {
2251         __builtin_ia32_mfence();
2252     }
2253     else static if (DMD_with_asm)
2254     {
2255         asm nothrow @nogc pure @safe
2256         {
2257             mfence;
2258         }
2259     }
    else version(LDC)
    {
        // Note: will generate the DMB instruction on ARM
        llvm_memory_fence();
    }
2268     else
2269         static assert(false);
2270 }
2271 unittest
2272 {
2273     _mm_mfence();
2274 }
2275 
2276 /// Compare packed signed 16-bit integers in `a` and `b`, and return packed minimum values.
2277 __m128i _mm_min_epi16 (__m128i a, __m128i b) pure @safe
2278 {
2279     static if (GDC_with_SSE2)
2280     {
2281         return cast(__m128i) __builtin_ia32_pminsw128(cast(short8)a, cast(short8)b);
2282     }
2283     else version(LDC)
2284     {
2285         // x86: pminsw since LDC 1.0 -O1
        // ARM64: smin.8h since LDC 1.5 -O1
2287         short8 sa = cast(short8)a;
2288         short8 sb = cast(short8)b;
2289         short8 greater = greaterMask!short8(sa, sb);
2290         return cast(__m128i)( (~greater & sa) | (greater & sb) );
2291     }
2292     else
2293     {
2294         __m128i lowerShorts = _mm_cmplt_epi16(a, b); // ones where a should be selected, b else
2295         __m128i aTob = _mm_xor_si128(a, b); // a ^ (a ^ b) == b
2296         __m128i mask = _mm_and_si128(aTob, lowerShorts);
2297         return _mm_xor_si128(b, mask);
2298     }
2299 }
2300 unittest
2301 {
2302     short8 R = cast(short8) _mm_min_epi16(_mm_setr_epi16(45, 1, -4, -8, 9,  7, 0,-32768),
2303                                           _mm_setr_epi16(-4,-8,  9,  7, 0,-57, 0,  0));
2304     short[8] correct =                                  [-4,-8, -4, -8, 0,-57, 0, -32768];
2305     assert(R.array == correct);
2306 }
2307 
2308 /// Compare packed unsigned 8-bit integers in `a` and `b`, and return packed minimum values.
2309 __m128i _mm_min_epu8 (__m128i a, __m128i b) pure @safe
2310 {
2311     version(LDC)
2312     {
2313         // x86: pminub since LDC 1.0.0 -O1
2314         // ARM: umin.16b since LDC 1.5.0 -O1
2315         // PERF: catastrophic on ARM32
2316         ubyte16 sa = cast(ubyte16)a;
2317         ubyte16 sb = cast(ubyte16)b;
2318         ubyte16 greater = cast(ubyte16) greaterMask!ubyte16(sa, sb);
2319         return cast(__m128i)( (~greater & sa) | (greater & sb) );
2320     }
2321     else
2322     {
2323         __m128i value128 = _mm_set1_epi8(-128);
2324         __m128i lower = _mm_cmplt_epi8(_mm_add_epi8(a, value128), _mm_add_epi8(b, value128)); // signed comparison
2325         __m128i aTob = _mm_xor_si128(a, b); // a ^ (a ^ b) == b
2326         __m128i mask = _mm_and_si128(aTob, lower);
2327         return _mm_xor_si128(b, mask);
2328     }
2329 }
2330 unittest
2331 {
2332     byte16 R = cast(byte16) _mm_min_epu8(_mm_setr_epi8(45, 1, -4, -8, 9,  7, 0,-57, -4,-8,  9,  7, 0,-57, 0,  0),
2333                                          _mm_setr_epi8(-4,-8,  9,  7, 0,-57, 0,  0, 45, 1, -4, -8, 9,  7, 0,-57));
2334     byte[16] correct =                                [45, 1,  9,  7, 0,  7, 0,  0, 45, 1,  9,  7, 0,  7, 0,  0];
2335     assert(R.array == correct);
2336 }
2337 
2338 /// Compare packed double-precision (64-bit) floating-point elements in `a` and `b`, and return packed minimum values.
2339 __m128d _mm_min_pd (__m128d a, __m128d b) pure @trusted
2340 {
2341     static if (GDC_with_SSE2)
2342     {
2343         return __builtin_ia32_minpd(a, b);
2344     }
2345     else
2346     {
2347         // Generates minpd starting with LDC 1.9
2348         a.ptr[0] = (a.array[0] < b.array[0]) ? a.array[0] : b.array[0];
2349         a.ptr[1] = (a.array[1] < b.array[1]) ? a.array[1] : b.array[1];
2350         return a;
2351     }
2352 }
2353 unittest
2354 {
2355     __m128d A = _mm_setr_pd(1.0, 2.0);
2356     __m128d B = _mm_setr_pd(4.0, 1.0);
2357     __m128d M = _mm_min_pd(A, B);
2358     assert(M.array[0] == 1.0);
2359     assert(M.array[1] == 1.0);
2360 }
2361 
2362 /// Compare the lower double-precision (64-bit) floating-point elements in `a` and `b`, store the minimum value in 
2363 /// the lower element of result, and copy the upper element from `a` to the upper element of result.
2364 __m128d _mm_min_sd (__m128d a, __m128d b) pure @safe
2365 {
2366     static if (GDC_with_SSE2)
2367     {
2368         return __builtin_ia32_minsd(a, b);
2369     }
2370     else
2371     {
2372         // Generates minsd starting with LDC 1.3
2373         __m128d r = a;
2374         r.array[0] = (a.array[0] < b.array[0]) ? a.array[0] : b.array[0];
2375         return r;
2376     }
2377 }
2378 unittest
2379 {
2380     __m128d A = _mm_setr_pd(1.0, 3.0);
2381     __m128d B = _mm_setr_pd(4.0, 2.0);
2382     __m128d M = _mm_min_sd(A, B);
2383     assert(M.array[0] == 1.0);
2384     assert(M.array[1] == 3.0);
2385 }
2386 
2387 /// Copy the lower 64-bit integer in `a` to the lower element of result, and zero the upper element.
2388 __m128i _mm_move_epi64 (__m128i a) pure @trusted
2389 {
2390     static if (GDC_with_SSE2)
2391     {
2392         // slightly better with GDC -O0
2393         return cast(__m128i) __builtin_ia32_movq128(cast(long2)a); 
2394     }
2395     else
2396     {
2397         long2 result = [ 0, 0 ];
2398         long2 la = cast(long2) a;
2399         result.ptr[0] = la.array[0];
2400         return cast(__m128i)(result);
2401     }
2402 }
2403 unittest
2404 {
2405     long2 A = [13, 47];
2406     long2 B = cast(long2) _mm_move_epi64( cast(__m128i)A );
2407     long[2] correct = [13, 0];
2408     assert(B.array == correct);
2409 }
2410 
2411 /// Move the lower double-precision (64-bit) floating-point element from `b` to the lower element of result, and copy 
2412 /// the upper element from `a` to the upper element of dst.
2413 __m128d _mm_move_sd (__m128d a, __m128d b) pure @trusted
2414 {
2415     static if (GDC_with_SSE2)
2416     {
2417         return __builtin_ia32_movsd(a, b); 
2418     }
2419     else
2420     {
2421         b.ptr[1] = a.array[1];
2422         return b;
2423     }
2424 }
2425 unittest
2426 {
2427     double2 A = [13.0, 47.0];
2428     double2 B = [34.0, 58.0];
2429     double2 C = _mm_move_sd(A, B);
2430     double[2] correct = [34.0, 47.0];
2431     assert(C.array == correct);
2432 }
2433 
/// Create mask from the most significant bit of each 8-bit element in `a`.
2435 int _mm_movemask_epi8 (__m128i a) pure @trusted
2436 {
2437     // PERF: Not possible in D_SIMD because of https://issues.dlang.org/show_bug.cgi?id=8047
2438     static if (GDC_with_SSE2)
2439     {
2440         return __builtin_ia32_pmovmskb128(cast(ubyte16)a);
2441     }
2442     else static if (LDC_with_SSE2)
2443     {
2444         return __builtin_ia32_pmovmskb128(cast(byte16)a);
2445     }
2446     else static if (LDC_with_ARM64)
2447     {
2448         // Solution from https://stackoverflow.com/questions/11870910/sse-mm-movemask-epi8-equivalent-method-for-arm-neon
        // The other two solutions there rely on intrinsics that LLVM doesn't expose, which took a long time to find out.
        // So there might be something a bit faster, but this one is reasonable and branchless.
2451         byte8 mask_shift;
2452         mask_shift.ptr[0] = 7;
2453         mask_shift.ptr[1] = 6;
2454         mask_shift.ptr[2] = 5;
2455         mask_shift.ptr[3] = 4;
2456         mask_shift.ptr[4] = 3;
2457         mask_shift.ptr[5] = 2;
2458         mask_shift.ptr[6] = 1;
2459         mask_shift.ptr[7] = 0;
2460         byte8 mask_and = byte8(-128);
2461         byte8 lo = vget_low_u8(cast(byte16)a);
2462         byte8 hi = vget_high_u8(cast(byte16)a);
2463         lo = vand_u8(lo, mask_and);
2464         lo = vshr_u8(lo, mask_shift);
2465         hi = vand_u8(hi, mask_and);
2466         hi = vshr_u8(hi, mask_shift);
2467         lo = vpadd_u8(lo,lo);
2468         lo = vpadd_u8(lo,lo);
2469         lo = vpadd_u8(lo,lo);
2470         hi = vpadd_u8(hi,hi);
2471         hi = vpadd_u8(hi,hi);
2472         hi = vpadd_u8(hi,hi);
2473         return (cast(ubyte)(hi[0]) << 8) | cast(ubyte)(lo[0]);
2474     }
2475     else
2476     {
2477         byte16 ai = cast(byte16)a;
2478         int r = 0;
2479         foreach(bit; 0..16)
2480         {
2481             if (ai.array[bit] < 0) r += (1 << bit);
2482         }
2483         return r;
2484     }
2485 }
2486 unittest
2487 {
2488     assert(0x9C36 == _mm_movemask_epi8(_mm_set_epi8(-1, 1, 2, -3, -1, -1, 4, 8, 127, 0, -1, -1, 0, -1, -1, 0)));
2489 }
2490 
/// Create mask from the most significant bit of each 16-bit element in `a`. #BONUS
2492 int _mm_movemask_epi16 (__m128i a) pure @trusted
2493 {
2494     return _mm_movemask_epi8(_mm_packs_epi16(a, _mm_setzero_si128()));
2495 }
2496 unittest
2497 {
2498     assert(0x9C == _mm_movemask_epi16(_mm_set_epi16(-1, 1, 2, -3, -32768, -1, 32767, 8)));
2499 }
2500 
2501 /// Set each bit of mask result based on the most significant bit of the corresponding packed double-precision (64-bit) 
/// floating-point element in `v`.
2503 int _mm_movemask_pd(__m128d v) pure @safe
2504 {
2505     // PERF: Not possible in D_SIMD because of https://issues.dlang.org/show_bug.cgi?id=8047
2506     static if (GDC_with_SSE2)
2507     {
2508         /// Set each bit of mask `dst` based on the most significant bit of the corresponding
2509         /// packed double-precision (64-bit) floating-point element in `v`.
2510         return __builtin_ia32_movmskpd(v);
2511     }
2512     else static if (LDC_with_SSE2)
2513     {
2514         /// Set each bit of mask `dst` based on the most significant bit of the corresponding
2515         /// packed double-precision (64-bit) floating-point element in `v`.
2516         return __builtin_ia32_movmskpd(v);
2517     }
2518     else
2519     {
2520         long2 lv = cast(long2)v;
2521         int r = 0;
2522         if (lv.array[0] < 0) r += 1;
2523         if (lv.array[1] < 0) r += 2;
2524         return r;
2525     }
2526 }
2527 unittest
2528 {
2529     __m128d A = cast(__m128d) _mm_set_epi64x(-1, 0);
2530     assert(_mm_movemask_pd(A) == 2);
2531 }
2532 
2533 /// Copy the lower 64-bit integer in `v`.
2534 __m64 _mm_movepi64_pi64 (__m128i v) pure @safe
2535 {
2536     long2 lv = cast(long2)v;
2537     return long1(lv.array[0]);
2538 }
2539 unittest
2540 {
2541     __m128i A = _mm_set_epi64x(-1, -2);
2542     __m64 R = _mm_movepi64_pi64(A);
2543     assert(R.array[0] == -2);
2544 }
2545 
2546 /// Copy the 64-bit integer `a` to the lower element of dest, and zero the upper element.
2547 __m128i _mm_movpi64_epi64 (__m64 a) pure @trusted
2548 {
2549     long2 r;
2550     r.ptr[0] = a.array[0];
2551     r.ptr[1] = 0;
2552     return cast(__m128i)r;
2553 }
2554 
/// Multiply the low unsigned 32-bit integers from each packed 64-bit element in `a` and `b`,
/// and return the unsigned 64-bit results.
__m128i _mm_mul_epu32 (__m128i a, __m128i b) pure @trusted
{
    // Note: generates pmuludq in LDC with -O1
2558     __m128i zero = _mm_setzero_si128();
2559 
2560     static if (__VERSION__ >= 2088)
2561     {
2562         // Need LLVM9 to avoid this shufflevector
2563         long2 la, lb;
2564         la.ptr[0] = cast(uint)a.array[0];
2565         la.ptr[1] = cast(uint)a.array[2];
2566         lb.ptr[0] = cast(uint)b.array[0];
2567         lb.ptr[1] = cast(uint)b.array[2];
2568     }
2569     else
2570     {
2571         long2 la = cast(long2) shufflevector!(int4, 0, 4, 2, 6)(a, zero);
2572         long2 lb = cast(long2) shufflevector!(int4, 0, 4, 2, 6)(b, zero);
2573     }
2574 
2575     version(DigitalMars)
2576     {
2577         // DMD has no long2 mul
2578         // long2 mul not supported before LDC 1.5
2579         la.ptr[0] *= lb.array[0];
2580         la.ptr[1] *= lb.array[1];
2581         return cast(__m128i)(la);
2582     }
2583     else
2584     {
2585         static if (__VERSION__ >= 2076)
2586         {
2587             return cast(__m128i)(la * lb);
2588         }
2589         else
2590         {
2591             // long2 mul not supported before LDC 1.5
2592             la.ptr[0] *= lb.array[0];
2593             la.ptr[1] *= lb.array[1];
2594             return cast(__m128i)(la);
2595         }
2596     }
2597 }
2598 unittest
2599 {
2600     __m128i A = _mm_set_epi32(42, 0xDEADBEEF, 42, 0xffffffff);
2601     __m128i B = _mm_set_epi32(42, 0xCAFEBABE, 42, 0xffffffff);
2602     __m128i C = _mm_mul_epu32(A, B);
2603     long2 LC = cast(long2)C;
2604     assert(LC.array[0] == 18446744065119617025uL);
2605     assert(LC.array[1] == 12723420444339690338uL);
2606 }
2607 
2608 /// Multiply packed double-precision (64-bit) floating-point elements in `a` and `b`, and return the results. 
2609 __m128d _mm_mul_pd(__m128d a, __m128d b) pure @safe
2610 {
2611     pragma(inline, true);
2612     return a * b;
2613 }
2614 unittest
2615 {
2616     __m128d a = [-2.0, 1.5];
2617     a = _mm_mul_pd(a, a);
2618     assert(a.array == [4.0, 2.25]);
2619 }
2620 
2621 /// Multiply the lower double-precision (64-bit) floating-point element in `a` and `b`, store the result in the lower 
2622 /// element of result, and copy the upper element from `a` to the upper element of result.
2623 __m128d _mm_mul_sd(__m128d a, __m128d b) pure @trusted
2624 {
2625     version(DigitalMars)
2626     {    
2627         // Work-around for https://issues.dlang.org/show_bug.cgi?id=19599
2628         // Note that this is unneeded since DMD >= 2.094.0 at least, haven't investigated again
2629         asm pure nothrow @nogc @trusted { nop;}
2630         a.array[0] = a.array[0] * b.array[0];
2631         return a;
2632     }
2633     else static if (GDC_with_SSE2)
2634     {
2635         return __builtin_ia32_mulsd(a, b);
2636     }
2637     else
2638     {
2639         a.ptr[0] *= b.array[0];
2640         return a;
2641     }
2642 }
2643 unittest
2644 {
2645     __m128d a = [-2.0, 1.5];
2646     a = _mm_mul_sd(a, a);
2647     assert(a.array == [4.0, 1.5]);
2648 }
2649 
2650 /// Multiply the low unsigned 32-bit integers from `a` and `b`, 
2651 /// and get an unsigned 64-bit result.
2652 __m64 _mm_mul_su32 (__m64 a, __m64 b) pure @safe
2653 {
2654     return to_m64(_mm_mul_epu32(to_m128i(a), to_m128i(b)));
2655 }
2656 unittest
2657 {
2658     __m64 A = _mm_set_pi32(42, 0xDEADBEEF);
2659     __m64 B = _mm_set_pi32(42, 0xCAFEBABE);
2660     __m64 C = _mm_mul_su32(A, B);
2661     assert(C.array[0] == 0xDEADBEEFuL * 0xCAFEBABEuL);
2662 }
2663 
2664 /// Multiply the packed signed 16-bit integers in `a` and `b`, producing intermediate 32-bit integers, and return the 
2665 /// high 16 bits of the intermediate integers.
2666 __m128i _mm_mulhi_epi16 (__m128i a, __m128i b) pure @trusted
2667 {
2668     static if (GDC_with_SSE2)
2669     {
2670         return cast(__m128i) __builtin_ia32_pmulhw128(cast(short8)a, cast(short8)b);
2671     }
2672     else static if (LDC_with_SSE2)
2673     {
2674         return cast(__m128i) __builtin_ia32_pmulhw128(cast(short8)a, cast(short8)b);
2675     }
2676     else
2677     {
2678         // ARM64: LDC 1.5 -O2 or later gives a nice sequence with 2 x ext.16b, 2 x smull.4s and shrn.4h shrn2.8h
2679         //        PERF: it seems the simde solution has one less instruction in ARM64.
2680         // PERF: Catastrophic in ARM32.
2681         short8 sa = cast(short8)a;
2682         short8 sb = cast(short8)b;
2683         short8 r = void;
2684         r.ptr[0] = (sa.array[0] * sb.array[0]) >> 16;
2685         r.ptr[1] = (sa.array[1] * sb.array[1]) >> 16;
2686         r.ptr[2] = (sa.array[2] * sb.array[2]) >> 16;
2687         r.ptr[3] = (sa.array[3] * sb.array[3]) >> 16;
2688         r.ptr[4] = (sa.array[4] * sb.array[4]) >> 16;
2689         r.ptr[5] = (sa.array[5] * sb.array[5]) >> 16;
2690         r.ptr[6] = (sa.array[6] * sb.array[6]) >> 16;
2691         r.ptr[7] = (sa.array[7] * sb.array[7]) >> 16;
2692         return cast(__m128i)r;
2693     }
2694 }
2695 unittest
2696 {
2697     __m128i A = _mm_setr_epi16(0, -16, 2, 3, 4, 8, 16, 7);
2698     __m128i B = _mm_set1_epi16(16384);
2699     short8 R = cast(short8)_mm_mulhi_epi16(A, B);
2700     short[8] correct = [0, -4, 0, 0, 1, 2, 4, 1];
2701     assert(R.array == correct);
2702 }
2703 
2704 /// Multiply the packed unsigned 16-bit integers in `a` and `b`, producing intermediate 32-bit integers, and return the 
2705 /// high 16 bits of the intermediate integers.
2706 __m128i _mm_mulhi_epu16 (__m128i a, __m128i b) pure @trusted
2707 {
2708     static if (GDC_with_SSE2)
2709     {
2710         return cast(__m128i) __builtin_ia32_pmulhuw128(cast(short8)a, cast(short8)b);
2711     }
2712     else static if (LDC_with_SSE2)
2713     {
2714         return cast(__m128i) __builtin_ia32_pmulhuw128(cast(short8)a, cast(short8)b);
2715     }
2716     else
2717     {
2718         // ARM64: LDC 1.5 -O2 or later gives a nice sequence with 2 x ext.16b, 2 x umull.4s and shrn.4h shrn2.8h
2719         //      it seems the simde solution has one less instruction in ARM64
2720         // PERF: Catastrophic in ARM32.
2721         short8 sa = cast(short8)a;
2722         short8 sb = cast(short8)b;
2723         short8 r = void;
2724         r.ptr[0] = cast(short)( (cast(ushort)sa.array[0] * cast(ushort)sb.array[0]) >> 16 );
2725         r.ptr[1] = cast(short)( (cast(ushort)sa.array[1] * cast(ushort)sb.array[1]) >> 16 );
2726         r.ptr[2] = cast(short)( (cast(ushort)sa.array[2] * cast(ushort)sb.array[2]) >> 16 );
2727         r.ptr[3] = cast(short)( (cast(ushort)sa.array[3] * cast(ushort)sb.array[3]) >> 16 );
2728         r.ptr[4] = cast(short)( (cast(ushort)sa.array[4] * cast(ushort)sb.array[4]) >> 16 );
2729         r.ptr[5] = cast(short)( (cast(ushort)sa.array[5] * cast(ushort)sb.array[5]) >> 16 );
2730         r.ptr[6] = cast(short)( (cast(ushort)sa.array[6] * cast(ushort)sb.array[6]) >> 16 );
2731         r.ptr[7] = cast(short)( (cast(ushort)sa.array[7] * cast(ushort)sb.array[7]) >> 16 );
2732         return cast(__m128i)r;
2733     }
2734 }
2735 unittest
2736 {
2737     __m128i A = _mm_setr_epi16(0, -16, 2, 3, 4, 8, 16, 7);
2738     __m128i B = _mm_set1_epi16(16384);
2739     short8 R = cast(short8)_mm_mulhi_epu16(A, B);
2740     short[8] correct = [0, 0x3FFC, 0, 0, 1, 2, 4, 1];
2741     assert(R.array == correct);
2742 }
2743 
2744 /// Multiply the packed 16-bit integers in `a` and `b`, producing intermediate 32-bit integers, and return the low 16 
2745 /// bits of the intermediate integers.
2746 __m128i _mm_mullo_epi16 (__m128i a, __m128i b) pure @safe
2747 {
2748     return cast(__m128i)(cast(short8)a * cast(short8)b);
2749 }
2750 unittest
2751 {
2752     __m128i A = _mm_setr_epi16(16384, -16, 0,      3, 4, 1, 16, 7);
2753     __m128i B = _mm_set1_epi16(16384);
2754     short8 R = cast(short8)_mm_mullo_epi16(A, B);
2755     short[8] correct = [0, 0, 0, -16384, 0, 16384, 0, -16384];
2756     assert(R.array == correct);
2757 }
2758 
2759 /// Compute the bitwise NOT of 128 bits in `a`. #BONUS
2760 __m128i _mm_not_si128 (__m128i a) pure @safe
2761 {
2762     return ~a;
2763 }
2764 unittest
2765 {
2766     __m128i A = _mm_set1_epi32(-748);
2767     int4 notA = cast(int4) _mm_not_si128(A);
2768     int[4] correct = [747, 747, 747, 747];
2769     assert(notA.array == correct);
2770 }
2771 
2772 /// Compute the bitwise OR of packed double-precision (64-bit) floating-point elements in `a` and `b`.
2773 __m128d _mm_or_pd (__m128d a, __m128d b) pure @safe
2774 {
2775     pragma(inline, true);
2776     return cast(__m128d)( cast(__m128i)a | cast(__m128i)b );
2777 }
2778 
2779 /// Compute the bitwise OR of 128 bits (representing integer data) in `a` and `b`.
2780 __m128i _mm_or_si128 (__m128i a, __m128i b) pure @safe
2781 {
2782     pragma(inline, true);
2783     return a | b;
2784 }
2785 
2786 /// Convert packed signed 32-bit integers from `a` and `b` to packed 16-bit integers using signed saturation.
2787 __m128i _mm_packs_epi32 (__m128i a, __m128i b) pure @trusted
2788 {
2789     static if (GDC_with_SSE2)
2790     {
2791         return cast(__m128i) __builtin_ia32_packssdw128(a, b);
2792     }    
2793     else static if (LDC_with_SSE2)
2794     {
2795         return cast(__m128i) __builtin_ia32_packssdw128(a, b);
2796     }
2797     else static if (LDC_with_ARM64)
2798     {
2799         short4 ra = vqmovn_s32(cast(int4)a);
2800         short4 rb = vqmovn_s32(cast(int4)b);
2801         return cast(__m128i)vcombine_s16(ra, rb);
2802     }
2803     else
2804     {
2805         // PERF: catastrophic on ARM32
2806         short8 r;
2807         r.ptr[0] = saturateSignedIntToSignedShort(a.array[0]);
2808         r.ptr[1] = saturateSignedIntToSignedShort(a.array[1]);
2809         r.ptr[2] = saturateSignedIntToSignedShort(a.array[2]);
2810         r.ptr[3] = saturateSignedIntToSignedShort(a.array[3]);
2811         r.ptr[4] = saturateSignedIntToSignedShort(b.array[0]);
2812         r.ptr[5] = saturateSignedIntToSignedShort(b.array[1]);
2813         r.ptr[6] = saturateSignedIntToSignedShort(b.array[2]);
2814         r.ptr[7] = saturateSignedIntToSignedShort(b.array[3]);
2815         return cast(__m128i)r;
2816     }
2817 }
2818 unittest
2819 {
2820     __m128i A = _mm_setr_epi32(100000, -100000, 1000, 0);
2821     short8 R = cast(short8) _mm_packs_epi32(A, A);
2822     short[8] correct = [32767, -32768, 1000, 0, 32767, -32768, 1000, 0];
2823     assert(R.array == correct);
2824 }
2825 
2826 /// Convert packed signed 16-bit integers from `a` and `b` to packed 8-bit integers using signed saturation.
2827 __m128i _mm_packs_epi16 (__m128i a, __m128i b) pure @trusted
2828 {
2829     static if (GDC_with_SSE2)
2830     {
2831         return cast(__m128i) __builtin_ia32_packsswb128(cast(short8)a, cast(short8)b);
2832     }
2833     else static if (LDC_with_SSE2)
2834     {
2835         return cast(__m128i) __builtin_ia32_packsswb128(cast(short8)a, cast(short8)b);
2836     }
2837     else static if (LDC_with_ARM64)
2838     {
        // generates a nice pair of sqxtn.8b + sqxtn2 since LDC 1.5 -O2
2840         byte8 ra = vqmovn_s16(cast(short8)a);
2841         byte8 rb = vqmovn_s16(cast(short8)b);
2842         return cast(__m128i)vcombine_s8(ra, rb);
2843     }
2844     else
2845     {
2846         // PERF: ARM32 is missing
2847         byte16 r;
2848         short8 sa = cast(short8)a;
2849         short8 sb = cast(short8)b;
2850         foreach(i; 0..8)
2851             r.ptr[i] = saturateSignedWordToSignedByte(sa.array[i]);
2852         foreach(i; 0..8)
2853             r.ptr[i+8] = saturateSignedWordToSignedByte(sb.array[i]);
2854         return cast(__m128i)r;
2855     }
2856 }
2857 unittest
2858 {
2859     __m128i A = _mm_setr_epi16(1000, -1000, 1000, 0, 256, -129, 254, 0);
2860     byte16 R = cast(byte16) _mm_packs_epi16(A, A);
2861     byte[16] correct = [127, -128, 127, 0, 127, -128, 127, 0,
2862                         127, -128, 127, 0, 127, -128, 127, 0];
2863     assert(R.array == correct);
2864 }
2865 
2866 /// Convert packed signed 16-bit integers from `a` and `b` to packed 8-bit integers using unsigned saturation.
2867 __m128i _mm_packus_epi16 (__m128i a, __m128i b) pure @trusted
2868 {
2869     static if (GDC_with_SSE2)
2870     {
2871         return cast(__m128i) __builtin_ia32_packuswb128(cast(short8)a, cast(short8)b);
2872     }
2873     else static if (LDC_with_SSE2)
2874     {
2875         return cast(__m128i) __builtin_ia32_packuswb128(cast(short8)a, cast(short8)b);
2876     }
2877     else static if (LDC_with_ARM64)
2878     {
        // generates a nice pair of sqxtun + sqxtun2 since LDC 1.5 -O2
2880         byte8 ra = vqmovun_s16(cast(short8)a);
2881         byte8 rb = vqmovun_s16(cast(short8)b);
2882         return cast(__m128i)vcombine_s8(ra, rb);
2883     }
2884     else
2885     {
2886         short8 sa = cast(short8)a;
2887         short8 sb = cast(short8)b;
2888         ubyte[16] result = void;
2889         for (int i = 0; i < 8; ++i)
2890         {
2891             short s = sa[i];
2892             if (s < 0) s = 0;
2893             if (s > 255) s = 255;
2894             result[i] = cast(ubyte)s;
2895 
2896             s = sb[i];
2897             if (s < 0) s = 0;
2898             if (s > 255) s = 255;
2899             result[i+8] = cast(ubyte)s;
2900         }
2901         return cast(__m128i) loadUnaligned!(byte16)(cast(byte*)result.ptr);
2902     }
2903 }
2904 unittest
2905 {
2906     __m128i A = _mm_setr_epi16(-10, 400, 0, 256, 255, 2, 1, 0);
2907     byte16 AA = cast(byte16) _mm_packus_epi16(A, A);
2908     static immutable ubyte[16] correctResult = [0, 255, 0, 255, 255, 2, 1, 0,
2909                                                 0, 255, 0, 255, 255, 2, 1, 0];
2910     foreach(i; 0..16)
2911         assert(AA.array[i] == cast(byte)(correctResult[i]));
2912 }
2913 
2914 /// Provide a hint to the processor that the code sequence is a spin-wait loop. This can help improve the performance 
2915 /// and power consumption of spin-wait loops.
2916 void _mm_pause() @trusted
2917 {
2918     version(GNU)
2919     {
2920         static if (GDC_with_SSE2)
2921         {
2922             __builtin_ia32_pause();
2923         }
2924         else version(X86)
2925         {
2926             asm pure nothrow @nogc @trusted
2927             {
2928                 "pause;\n" : : : ;
2929             }
2930         }
2931         else
2932             static assert(false);
2933     }
2934     else static if (LDC_with_SSE2)
2935     {
2936         __builtin_ia32_pause();
2937     }
2938     else static if (DMD_with_asm)
2939     {
2940         asm nothrow @nogc pure @safe
2941         {
2942             rep; nop; // F3 90 =  pause
2943         }
2944     }
2945     else version (LDC)
2946     {
        // PERF: Does nothing currently; could be the "yield" instruction on ARM.
2948     }
2949     else
2950         static assert(false);
2951 }
2952 unittest
2953 {
2954     _mm_pause();
2955 }
2956 
2957 /// Compute the absolute differences of packed unsigned 8-bit integers in `a` and `b`, then horizontally sum each 
2958 /// consecutive 8 differences to produce two unsigned 16-bit integers, and pack these unsigned 16-bit integers in the 
2959 /// low 16 bits of 64-bit elements in result.
2960 __m128i _mm_sad_epu8 (__m128i a, __m128i b) pure @trusted
2961 {
2962     static if (GDC_with_SSE2)
2963     {
2964         return cast(__m128i) __builtin_ia32_psadbw128(cast(ubyte16)a, cast(ubyte16)b);
2965     }
2966     else static if (LDC_with_SSE2)
2967     {
2968         return cast(__m128i) __builtin_ia32_psadbw128(cast(byte16)a, cast(byte16)b);
2969     }
2970     else static if (LDC_with_ARM64)
2971     {
2972         ushort8 t = cast(ushort8) vpaddlq_u8(vabdq_u8(cast(byte16) a, cast(byte16) b));
2973 
2974         // PERF: Looks suboptimal vs addp
2975         ushort r0 = cast(ushort)(t[0] + t[1] + t[2] + t[3]);
2976         ushort r4 = cast(ushort)(t[4] + t[5] + t[6] + t[7]);
2977         ushort8 r = 0;
2978         r[0] = r0;
2979         r[4] = r4;
2980         return cast(__m128i) r;
2981     }
2982     else
2983     {
2984         // PERF: ARM32 is lacking
2985         byte16 ab = cast(byte16)a;
2986         byte16 bb = cast(byte16)b;
2987         ubyte[16] t;
2988         foreach(i; 0..16)
2989         {
2990             int diff = cast(ubyte)(ab.array[i]) - cast(ubyte)(bb.array[i]);
2991             if (diff < 0) diff = -diff;
2992             t[i] = cast(ubyte)(diff);
2993         }
2994         int4 r = _mm_setzero_si128();
2995         r.ptr[0] = t[0] + t[1] + t[2] + t[3] + t[4] + t[5] + t[6] + t[7];
2996         r.ptr[2] = t[8] + t[9] + t[10]+ t[11]+ t[12]+ t[13]+ t[14]+ t[15];
2997         return r;
2998     }
2999 }
3000 unittest
3001 {
3002     __m128i A = _mm_setr_epi8(3, 4, 6, 8, 12, 14, 18, 20, 24, 30, 32, 38, 42, 44, 48, 54); // primes + 1
3003     __m128i B = _mm_set1_epi8(1);
3004     __m128i R = _mm_sad_epu8(A, B);
3005     int[4] correct = [2 + 3 + 5 + 7 + 11 + 13 + 17 + 19,
3006                       0,
3007                       23 + 29 + 31 + 37 + 41 + 43 + 47 + 53,
3008                       0];
3009     assert(R.array == correct);
3010 }
3011 
3012 /// Set packed 16-bit integers with the supplied values.
3013 __m128i _mm_set_epi16 (short e7, short e6, short e5, short e4, short e3, short e2, short e1, short e0) pure @trusted
3014 {
3015     short[8] result = [e0, e1, e2, e3, e4, e5, e6, e7];
3016     return cast(__m128i) loadUnaligned!(short8)(result.ptr);
3017 }
3018 unittest
3019 {
3020     __m128i A = _mm_set_epi16(7, 6, 5, 4, 3, 2, 1, 0);
3021     short8 B = cast(short8) A;
3022     foreach(i; 0..8)
3023         assert(B.array[i] == i);
3024 }
3025 
3026 /// Set packed 32-bit integers with the supplied values.
3027 __m128i _mm_set_epi32 (int e3, int e2, int e1, int e0) pure @trusted
3028 {
3029     pragma(inline, true);
3030     int[4] result = [e0, e1, e2, e3];
3031     return loadUnaligned!(int4)(result.ptr);
3032 }
3033 unittest
3034 {
3035     __m128i A = _mm_set_epi32(3, 2, 1, 0);
3036     foreach(i; 0..4)
3037         assert(A.array[i] == i);
3038 }
3039 
3040 /// Set packed 64-bit integers with the supplied values.
3041 __m128i _mm_set_epi64(__m64 e1, __m64 e0) pure @trusted
3042 {
3043     pragma(inline, true);
3044     long[2] result = [e0.array[0], e1.array[0]];
3045     return cast(__m128i)( loadUnaligned!(long2)(result.ptr) );
3046 }
3047 unittest
3048 {
3049     __m128i A = _mm_set_epi64(_mm_cvtsi64_m64(1234), _mm_cvtsi64_m64(5678));
3050     long2 B = cast(long2) A;
3051     assert(B.array[0] == 5678);
3052     assert(B.array[1] == 1234);
3053 }
3054 
3055 /// Set packed 64-bit integers with the supplied values.
3056 __m128i _mm_set_epi64x (long e1, long e0) pure @trusted
3057 {
3058     pragma(inline, true);
3059     long[2] result = [e0, e1];
3060     return cast(__m128i)( loadUnaligned!(long2)(result.ptr) );
3061 }
3062 unittest
3063 {
3064     __m128i A = _mm_set_epi64x(1234, 5678);
3065     long2 B = cast(long2) A;
3066     assert(B.array[0] == 5678);
3067     assert(B.array[1] == 1234);
3068 }
3069 
3070 /// Set packed 8-bit integers with the supplied values.
3071 __m128i _mm_set_epi8 (byte e15, byte e14, byte e13, byte e12,
3072                       byte e11, byte e10, byte e9, byte e8,
3073                       byte e7, byte e6, byte e5, byte e4,
3074                       byte e3, byte e2, byte e1, byte e0) pure @trusted
3075 {
3076     byte[16] result = [e0, e1,  e2,  e3,  e4,  e5,  e6, e7,
3077                      e8, e9, e10, e11, e12, e13, e14, e15];
3078     return cast(__m128i)( loadUnaligned!(byte16)(result.ptr) );
3079 }
3080 
3081 /// Set packed double-precision (64-bit) floating-point elements with the supplied values.
3082 __m128d _mm_set_pd (double e1, double e0) pure @trusted
3083 {
3084     pragma(inline, true);
3085     double[2] result = [e0, e1];
3086     return loadUnaligned!(double2)(result.ptr);
3087 }
3088 unittest
3089 {
3090     __m128d A = _mm_set_pd(61.0, 55.0);
3091     double[2] correct = [55.0, 61.0];
3092     assert(A.array == correct);
3093 }
3094 
/// Broadcast double-precision (64-bit) floating-point value `a` to all elements.
3096 __m128d _mm_set_pd1 (double a) pure @trusted
3097 {
3098     pragma(inline, true);
3099     double[2] result = [a, a];
3100     return loadUnaligned!(double2)(result.ptr);
3101 }
3102 unittest
3103 {
3104     __m128d A = _mm_set_pd1(61.0);
3105     double[2] correct = [61.0, 61.0];
3106     assert(A.array == correct);
3107 }
3108 
3109 /// Copy double-precision (64-bit) floating-point element `a` to the lower element of result, 
3110 /// and zero the upper element.
3111 __m128d _mm_set_sd (double a) pure @trusted
3112 {
3113     double[2] result = [a, 0];
3114     return loadUnaligned!(double2)(result.ptr);
3115 }
3116 
/// Broadcast 16-bit integer `a` to all elements.
3118 __m128i _mm_set1_epi16 (short a) pure @trusted
3119 {
3120     version(DigitalMars) // workaround https://issues.dlang.org/show_bug.cgi?id=21469 
3121     {
3122         short8 v = a;
3123         return cast(__m128i) v;
3124     }
3125     else
3126     {
3127         pragma(inline, true);
3128         return cast(__m128i)(short8(a));
3129     }
3130 }
3131 unittest
3132 {
3133     short8 a = cast(short8) _mm_set1_epi16(31);
3134     for (int i = 0; i < 8; ++i)
3135         assert(a.array[i] == 31);
3136 }
3137 
3138 /// Broadcast 32-bit integer `a` to all elements.
3139 __m128i _mm_set1_epi32 (int a) pure @trusted
3140 {
3141     pragma(inline, true);
3142     return cast(__m128i)(int4(a));
3143 }
3144 unittest
3145 {
3146     int4 a = cast(int4) _mm_set1_epi32(31);
3147     for (int i = 0; i < 4; ++i)
3148         assert(a.array[i] == 31);
3149 }
3150 
3151 /// Broadcast 64-bit integer `a` to all elements.
3152 __m128i _mm_set1_epi64 (__m64 a) pure @safe
3153 {
3154     return _mm_set_epi64(a, a);
3155 }
3156 unittest
3157 {
3158     long b = 0x1DEADCAFE; 
3159     __m64 a;
3160     a.ptr[0] = b;
3161     long2 c = cast(long2) _mm_set1_epi64(a);
3162     assert(c.array[0] == b);
3163     assert(c.array[1] == b);
3164 }
3165 
/// Broadcast 64-bit integer `a` to all elements.
3167 __m128i _mm_set1_epi64x (long a) pure @trusted
3168 {
3169     long2 b = a; // Must be on its own line to workaround https://issues.dlang.org/show_bug.cgi?id=21470
3170     return cast(__m128i)(b);
3171 }
3172 unittest
3173 {
3174     long b = 0x1DEADCAFE;
3175     long2 c = cast(long2) _mm_set1_epi64x(b);
3176     for (int i = 0; i < 2; ++i)
3177         assert(c.array[i] == b);
3178 }
3179 
3180 /// Broadcast 8-bit integer `a` to all elements.
3181 __m128i _mm_set1_epi8 (byte a) pure @trusted
3182 {
3183     pragma(inline, true);
3184     byte16 b = a; // Must be on its own line to workaround https://issues.dlang.org/show_bug.cgi?id=21470
3185     return cast(__m128i)(b);
3186 }
3187 unittest
3188 {
3189     byte16 b = cast(byte16) _mm_set1_epi8(31);
3190     for (int i = 0; i < 16; ++i)
3191         assert(b.array[i] == 31);
3192 }
3193 
alias _mm_set1_pd = _mm_set_pd1; ///
3195 
3196 /// Set packed 16-bit integers with the supplied values in reverse order.
3197 __m128i _mm_setr_epi16 (short e7, short e6, short e5, short e4, 
3198                         short e3, short e2, short e1, short e0) pure @trusted
3199 {
3200     short[8] result = [e7, e6, e5, e4, e3, e2, e1, e0];
3201     return cast(__m128i)( loadUnaligned!(short8)(result.ptr) );
3202 }
3203 unittest
3204 {
3205     short8 A = cast(short8) _mm_setr_epi16(7, 6, 5, -32768, 32767, 2, 1, 0);
3206     short[8] correct = [7, 6, 5, -32768, 32767, 2, 1, 0];
3207     assert(A.array == correct);
3208 }
3209 
3210 /// Set packed 32-bit integers with the supplied values in reverse order.
3211 __m128i _mm_setr_epi32 (int e3, int e2, int e1, int e0) pure @trusted
3212 {
3213     pragma(inline, true);
3214     int[4] result = [e3, e2, e1, e0];
3215     return cast(__m128i)( loadUnaligned!(int4)(result.ptr) );
3216 }
3217 unittest
3218 {
3219     int4 A = cast(int4) _mm_setr_epi32(-1, 0, -2147483648, 2147483647);
3220     int[4] correct = [-1, 0, -2147483648, 2147483647];
3221     assert(A.array == correct);
3222 }
3223 
3224 /// Set packed 64-bit integers with the supplied values in reverse order.
3225 __m128i _mm_setr_epi64 (long e1, long e0) pure @trusted
3226 {
3227     long[2] result = [e1, e0];
3228     return cast(__m128i)( loadUnaligned!(long2)(result.ptr) );
3229 }
3230 unittest
3231 {
3232     long2 A = cast(long2) _mm_setr_epi64(-1, 0);
3233     long[2] correct = [-1, 0];
3234     assert(A.array == correct);
3235 }
3236 
3237 /// Set packed 8-bit integers with the supplied values in reverse order.
3238 __m128i _mm_setr_epi8 (byte e15, byte e14, byte e13, byte e12,
3239                        byte e11, byte e10, byte e9,  byte e8,
3240                        byte e7,  byte e6,  byte e5,  byte e4,
3241                        byte e3,  byte e2,  byte e1,  byte e0) pure @trusted
3242 {
3243     byte[16] result = [e15, e14, e13, e12, e11, e10, e9, e8,
3244                       e7,  e6,  e5,  e4,  e3,  e2, e1, e0];
3245     return cast(__m128i)( loadUnaligned!(byte16)(result.ptr) );
3246 }
3247 
3248 /// Set packed double-precision (64-bit) floating-point elements with the supplied values in reverse order.
3249 __m128d _mm_setr_pd (double e1, double e0) pure @trusted
3250 {
3251     pragma(inline, true);
3252     double2 result;
3253     result.ptr[0] = e1;
3254     result.ptr[1] = e0;
3255     return result;
3256 }
3257 unittest
3258 {
3259     __m128d A = _mm_setr_pd(61.0, 55.0);
3260     double[2] correct = [61.0, 55.0];
3261     assert(A.array == correct);
3262 }
3263 
3264 /// Return vector of type `__m128d` with all elements set to zero.
3265 __m128d _mm_setzero_pd () pure @trusted
3266 {
3267     pragma(inline, true);
3268     // Note: using loadUnaligned has better -O0 codegen compared to .ptr
3269     double[2] result = [0.0, 0.0];
3270     return loadUnaligned!(double2)(result.ptr);
3271 }
3272 
3273 /// Return vector of type `__m128i` with all elements set to zero.
3274 __m128i _mm_setzero_si128() pure @trusted
3275 {
3276     pragma(inline, true);
3277     // Note: using loadUnaligned has better -O0 codegen compared to .ptr
3278     int[4] result = [0, 0, 0, 0];
3279     return cast(__m128i)( loadUnaligned!(int4)(result.ptr) );
3280 }
3281 
3282 /// Shuffle 32-bit integers in a using the control in `imm8`.
3283 /// See_also: `_MM_SHUFFLE`.
3284 __m128i _mm_shuffle_epi32(int imm8)(__m128i a) pure @safe
3285 {
3286     static if (GDC_with_SSE2)
3287     {
3288         return __builtin_ia32_pshufd(a, imm8);
3289     }
3290     else
3291     {
3292         return shufflevector!(int4, (imm8 >> 0) & 3,
3293                                     (imm8 >> 2) & 3,
3294                                     (imm8 >> 4) & 3,
3295                                     (imm8 >> 6) & 3)(a, a);
3296     }
3297 }
3298 unittest
3299 {
3300     __m128i A = _mm_setr_epi32(0, 1, 2, 3);
3301     enum int SHUFFLE = _MM_SHUFFLE(0, 1, 2, 3);
3302     int4 B = cast(int4) _mm_shuffle_epi32!SHUFFLE(A);
3303     int[4] expectedB = [ 3, 2, 1, 0 ];
3304     assert(B.array == expectedB);
3305 }
3306 
3307 /// Shuffle double-precision (64-bit) floating-point elements using the control in `imm8`.
3308 /// See_also: `_MM_SHUFFLE2`.
3309 __m128d _mm_shuffle_pd (int imm8)(__m128d a, __m128d b) pure @safe
3310 {
3311     static if (GDC_with_SSE2)
3312     {
3313         return __builtin_ia32_shufpd(a, b, imm8);
3314     }
3315     else
3316     {
3317         return shufflevector!(double2, 0 + ( imm8 & 1 ),
3318                                        2 + ( (imm8 >> 1) & 1 ))(a, b);
3319     }
3320 }
3321 unittest
3322 {
3323     __m128d A = _mm_setr_pd(0.5, 2.0);
3324     __m128d B = _mm_setr_pd(4.0, 5.0);
3325     enum int SHUFFLE = _MM_SHUFFLE2(1, 1);
3326     __m128d R = _mm_shuffle_pd!SHUFFLE(A, B);
3327     double[2] correct = [ 2.0, 5.0 ];
3328     assert(R.array == correct);
3329 }
3330 
/// Shuffle 16-bit integers in the high 64 bits of `a` using the control in `imm8`. Store the results in the high 
/// 64 bits of result, with the low 64 bits being copied from `a` to result.
/// See_also: `_MM_SHUFFLE`.
3334 __m128i _mm_shufflehi_epi16(int imm8)(__m128i a) pure @safe
3335 {
3336     static if (GDC_with_SSE2)
3337     {
3338         return cast(__m128i) __builtin_ia32_pshufhw(cast(short8)a, imm8);
3339     }
3340     else
3341     {
3342         return cast(__m128i) shufflevector!(short8, 0, 1, 2, 3,
3343                                           4 + ( (imm8 >> 0) & 3 ),
3344                                           4 + ( (imm8 >> 2) & 3 ),
3345                                           4 + ( (imm8 >> 4) & 3 ),
3346                                           4 + ( (imm8 >> 6) & 3 ))(cast(short8)a, cast(short8)a);
3347     }
3348 }
3349 unittest
3350 {
3351     __m128i A = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
3352     enum int SHUFFLE = _MM_SHUFFLE(0, 1, 2, 3);
3353     short8 C = cast(short8) _mm_shufflehi_epi16!SHUFFLE(A);
3354     short[8] expectedC = [ 0, 1, 2, 3, 7, 6, 5, 4 ];
3355     assert(C.array == expectedC);
3356 }
3357 
3358 /// Shuffle 16-bit integers in the low 64 bits of `a` using the control in `imm8`. Store the results in the low 64 
/// bits of result, with the high 64 bits being copied from `a` to result.
3360 /// See_also: `_MM_SHUFFLE`.
3361 __m128i _mm_shufflelo_epi16(int imm8)(__m128i a) pure @safe
3362 {
3363     static if (GDC_with_SSE2)
3364     {
3365         return cast(__m128i) __builtin_ia32_pshuflw(cast(short8)a, imm8);
3366     }
3367     else
3368     {
3369         return cast(__m128i) shufflevector!(short8, ( (imm8 >> 0) & 3 ),
3370                                                     ( (imm8 >> 2) & 3 ),
3371                                                     ( (imm8 >> 4) & 3 ),
3372                                                     ( (imm8 >> 6) & 3 ), 4, 5, 6, 7)(cast(short8)a, cast(short8)a);
3373     }
3374 }
3375 unittest
3376 {
3377     __m128i A = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
3378     enum int SHUFFLE = _MM_SHUFFLE(0, 1, 2, 3);
3379     short8 B = cast(short8) _mm_shufflelo_epi16!SHUFFLE(A);
3380     short[8] expectedB = [ 3, 2, 1, 0, 4, 5, 6, 7 ];
3381     assert(B.array == expectedB);
3382 }
3383 
3384 /// Shift packed 32-bit integers in `a` left by `count` while shifting in zeros.
3385 deprecated("Use _mm_slli_epi32 instead.") __m128i _mm_sll_epi32 (__m128i a, __m128i count) pure @trusted
3386 {
3387     static if (LDC_with_SSE2)
3388     {
3389         return __builtin_ia32_pslld128(a, count);
3390     }
3391     else static if (GDC_with_SSE2)
3392     {
3393         return __builtin_ia32_pslld128(a, count);
3394     }
3395     else static if (DMD_with_32bit_asm)
3396     {
3397         asm pure nothrow @nogc @trusted
3398         {
3399             movdqu XMM0, a;
3400             movdqu XMM1, count;
3401             pslld XMM0, XMM1;
3402             movdqu a, XMM0;
3403         }
3404         return a;
3405     }
3406     else
3407     {
3408         int4 r = void;
3409         long2 lc = cast(long2)count;
3410         int bits = cast(int)(lc.array[0]);
3411         foreach(i; 0..4)
3412             r[i] = cast(uint)(a[i]) << bits;
3413         return r;
3414     }
3415 }
3416 
3417 /// Shift packed 64-bit integers in `a` left by `count` while shifting in zeros.
3418 deprecated("Use _mm_slli_epi64 instead.") __m128i _mm_sll_epi64 (__m128i a, __m128i count) pure @trusted
3419 {
3420     static if (LDC_with_SSE2)
3421     {
3422         return cast(__m128i) __builtin_ia32_psllq128(cast(long2)a, cast(long2)count);
3423     }
3424     else static if (GDC_with_SSE2)
3425     {
3426         return cast(__m128i) __builtin_ia32_psllq128(cast(long2)a, cast(long2)count);
3427     }
3428     else static if (DMD_with_32bit_asm)
3429     {
3430         asm pure nothrow @nogc @trusted
3431         {
3432             movdqu XMM0, a;
3433             movdqu XMM1, count;
3434             psllq XMM0, XMM1;
3435             movdqu a, XMM0;
3436         }
3437         return a;
3438     }
3439     else
3440     {
        // ARM: good since LDC 1.12 -O2
        // but the -O0 version is catastrophic
3443         long2 r = void;
3444         long2 sa = cast(long2)a;
3445         long2 lc = cast(long2)count;
3446         int bits = cast(int)(lc.array[0]);
3447         foreach(i; 0..2)
3448             r.array[i] = cast(ulong)(sa.array[i]) << bits;
3449         return cast(__m128i)r;
3450     }
3451 }
3452 
3453 /// Shift packed 16-bit integers in `a` left by `count` while shifting in zeros.
3454 deprecated("Use _mm_slli_epi16 instead.") __m128i _mm_sll_epi16 (__m128i a, __m128i count) pure @trusted
3455 {
    static if (LDC_with_SSE2)
    {
        return cast(__m128i) __builtin_ia32_psllw128(cast(short8)a, cast(short8)count);
    }
    else static if (GDC_with_SSE2)
    {
        return cast(__m128i) __builtin_ia32_psllw128(cast(short8)a, cast(short8)count);
    }
3464     else static if (DMD_with_32bit_asm)
3465     {
3466         asm pure nothrow @nogc
3467         {
3468             movdqu XMM0, a;
3469             movdqu XMM1, count;
3470             psllw XMM0, XMM1;
3471             movdqu a, XMM0;
3472         }
3473         return a;
3474     }
3475     else
3476     {
3477         short8 sa = cast(short8)a;
3478         long2 lc = cast(long2)count;
3479         int bits = cast(int)(lc.array[0]);
3480         short8 r = void;
3481         foreach(i; 0..8)
3482             r.ptr[i] = cast(short)(cast(ushort)(sa.array[i]) << bits);
3483         return cast(int4)r;
3484     }
3485 }
3486 
3487 
3488 /// Shift packed 32-bit integers in `a` left by `imm8` while shifting in zeros.
3489 __m128i _mm_slli_epi32 (__m128i a, int imm8) pure @trusted
3490 {
3491     static if (GDC_with_SSE2)
3492     {
3493         return __builtin_ia32_pslldi128(a, cast(ubyte)imm8);
3494     }
3495     else static if (LDC_with_SSE2)
3496     {
3497         return __builtin_ia32_pslldi128(a, cast(ubyte)imm8);
3498     }
3499     else
3500     {
3501         // Note: the intrinsics guarantee imm8[0..7] is taken, however
3502         //       D says "It's illegal to shift by the same or more bits 
3503         //       than the size of the quantity being shifted"
3504         //       and it's UB instead.
3505         int4 r = _mm_setzero_si128();
3506 
3507         ubyte count = cast(ubyte) imm8;
3508         if (count > 31)
3509             return r;
3510         
3511         foreach(i; 0..4)
3512             r.array[i] = cast(uint)(a.array[i]) << count;
3513         return r;
3514     }
3515 }
3516 unittest
3517 {
3518     __m128i A = _mm_setr_epi32(0, 2, 3, -4);
3519     __m128i B = _mm_slli_epi32(A, 1);
3520     __m128i B2 = _mm_slli_epi32(A, 1 + 256);
3521     int[4] expectedB = [ 0, 4, 6, -8];
3522     assert(B.array == expectedB);
3523     assert(B2.array == expectedB);
3524 
3525     __m128i C = _mm_slli_epi32(A, 0);
3526     int[4] expectedC = [ 0, 2, 3, -4];
3527     assert(C.array == expectedC);
3528 
3529     __m128i D = _mm_slli_epi32(A, 65);
3530     int[4] expectedD = [ 0, 0, 0, 0];
3531     assert(D.array == expectedD);
3532 }
3533 
3534 /// Shift packed 64-bit integers in `a` left by `imm8` while shifting in zeros.
3535 __m128i _mm_slli_epi64 (__m128i a, int imm8) pure @trusted
3536 {
3537     static if (GDC_with_SSE2)
3538     {
3539         return cast(__m128i) __builtin_ia32_psllqi128(cast(long2)a, cast(ubyte)imm8);
3540     }
3541     else static if (LDC_with_SSE2)
3542     {
3543         return cast(__m128i) __builtin_ia32_psllqi128(cast(long2)a, cast(ubyte)imm8);
3544     }
3545     else
3546     {
3547         long2 sa = cast(long2)a;
3548 
3549         // Note: the intrinsics guarantee imm8[0..7] is taken, however
3550         //       D says "It's illegal to shift by the same or more bits 
3551         //       than the size of the quantity being shifted"
3552         //       and it's UB instead.
3553         long2 r = cast(long2) _mm_setzero_si128();
3554         ubyte count = cast(ubyte) imm8;
3555         if (count > 63)
3556             return cast(__m128i)r;
3557 
3558         r.ptr[0] = cast(ulong)(sa.array[0]) << count;
3559         r.ptr[1] = cast(ulong)(sa.array[1]) << count;
3560         return cast(__m128i)r;
3561     }
3562 }
3563 unittest
3564 {
3565     __m128i A = _mm_setr_epi64(8, -4);
3566     long2 B = cast(long2) _mm_slli_epi64(A, 1);
3567     long2 B2 = cast(long2) _mm_slli_epi64(A, 1 + 1024);
3568     long[2] expectedB = [ 16, -8];
3569     assert(B.array == expectedB);
3570     assert(B2.array == expectedB);
3571 
3572     long2 C = cast(long2) _mm_slli_epi64(A, 0);
3573     long[2] expectedC = [ 8, -4];
3574     assert(C.array == expectedC);
3575 
3576     long2 D = cast(long2) _mm_slli_epi64(A, 64);
    long[2] expectedD = [ 0, 0];
3578     assert(D.array == expectedD);
3579 }
3580 
3581 /// Shift packed 16-bit integers in `a` left by `imm8` while shifting in zeros.
3582 __m128i _mm_slli_epi16(__m128i a, int imm8) pure @trusted
3583 {
3584     static if (GDC_with_SSE2)
3585     {
3586         return cast(__m128i) __builtin_ia32_psllwi128(cast(short8)a, cast(ubyte)imm8);
3587     }
3588     else static if (LDC_with_SSE2)
3589     {
3590         return cast(__m128i) __builtin_ia32_psllwi128(cast(short8)a, cast(ubyte)imm8);
3591     }
3592     else static if (LDC_with_ARM64)
3593     {
3594         short8 sa = cast(short8)a;
3595         short8 r = cast(short8)_mm_setzero_si128();
3596         ubyte count = cast(ubyte) imm8;
3597         if (count > 15)
3598             return cast(__m128i)r;
3599         r = sa << short8(count);
3600         return cast(__m128i)r;
3601     }
3602     else
3603     {
3604         short8 sa = cast(short8)a;
3605         short8 r = cast(short8)_mm_setzero_si128();
3606         ubyte count = cast(ubyte) imm8;
3607         if (count > 15)
3608             return cast(__m128i)r;
3609         foreach(i; 0..8)
3610             r.ptr[i] = cast(short)(sa.array[i] << count);
3611         return cast(__m128i)r;
3612     }
3613 }
3614 unittest
3615 {
3616     __m128i A = _mm_setr_epi16(0, 1, 2, 3, -4, -5, 6, 7);
3617     short8 B = cast(short8)( _mm_slli_epi16(A, 1) );
3618     short8 B2 = cast(short8)( _mm_slli_epi16(A, 1 + 256) );
3619     short[8] expectedB = [ 0, 2, 4, 6, -8, -10, 12, 14 ];
3620     assert(B.array == expectedB);
3621     assert(B2.array == expectedB);
3622 
3623     short8 C = cast(short8)( _mm_slli_epi16(A, 16) );
3624     short[8] expectedC = [ 0, 0, 0, 0, 0, 0, 0, 0 ];
3625     assert(C.array == expectedC);
3626 }
3627 
3628 
/// Shift `op` left by `bytes` bytes while shifting in zeros.
3630 __m128i _mm_slli_si128(ubyte bytes)(__m128i op) pure @trusted
3631 {
3632     static if (bytes & 0xF0)
3633     {
3634         return _mm_setzero_si128();
3635     }
3636     else
3637     {
3638         static if (GDC_with_SSE2)
3639         {
3640             return cast(__m128i) __builtin_ia32_pslldqi128(cast(long2)op, cast(ubyte)(bytes * 8)); 
3641         }
3642         else version(DigitalMars)
3643         {
3644             version(D_InlineAsm_X86)
3645             {
3646                 asm pure nothrow @nogc @trusted // somehow doesn't work for x86_64
3647                 {
3648                     movdqu XMM0, op;
3649                     pslldq XMM0, bytes;
3650                     movdqu op, XMM0;
3651                 }
3652                 return op;
3653             }
3654             else
3655             {
3656                 byte16 A = cast(byte16)op;
3657                 byte16 R;
3658                 for (int n = 15; n >= bytes; --n)
3659                     R.ptr[n] = A.array[n-bytes];
3660                 for (int n = bytes-1; n >= 0; --n)
3661                     R.ptr[n] = 0;
3662                 return cast(__m128i)R;
3663             }
3664         }
3665         else
3666         {
3667             return cast(__m128i) shufflevector!(byte16,
3668             16 - bytes, 17 - bytes, 18 - bytes, 19 - bytes, 20 - bytes, 21 - bytes,
3669             22 - bytes, 23 - bytes, 24 - bytes, 25 - bytes, 26 - bytes, 27 - bytes,
3670             28 - bytes, 29 - bytes, 30 - bytes, 31 - bytes)
3671             (cast(byte16)_mm_setzero_si128(), cast(byte16)op);
3672         }
3673     }
3674 }
3675 unittest
3676 {
3677     __m128i A = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
3678     short8 R = cast(short8) _mm_slli_si128!8(A); // shift 8 bytes to the left
3679     short[8] correct = [ 0, 0, 0, 0, 0, 1, 2, 3 ];
3680     assert(R.array == correct);
3681 
    __m128i B = _mm_slli_si128!16(_mm_set1_epi32(-1));
3683     int[4] expectedB = [0, 0, 0, 0];
3684     assert(B.array == expectedB);
3685 }
3686 
3687 /// Compute the square root of packed double-precision (64-bit) floating-point elements in `vec`.
3688 __m128d _mm_sqrt_pd(__m128d vec) pure @trusted
3689 {
3690     version(LDC)
3691     {
3692         // Disappeared with LDC 1.11
3693         static if (__VERSION__ < 2081)
3694             return __builtin_ia32_sqrtpd(vec);
3695         else
3696         {
3697             vec.array[0] = llvm_sqrt(vec.array[0]);
3698             vec.array[1] = llvm_sqrt(vec.array[1]);
3699             return vec;
3700         }
3701     }
3702     else static if (GDC_with_SSE2)    
3703     {
3704         return __builtin_ia32_sqrtpd(vec);
3705     }
3706     else
3707     {
3708         vec.ptr[0] = sqrt(vec.array[0]);
3709         vec.ptr[1] = sqrt(vec.array[1]);
3710         return vec;
3711     }
3712 }
3713 
3714 /// Compute the square root of the lower double-precision (64-bit) floating-point element in `b`, store the result in 
3715 /// the lower element of result, and copy the upper element from `a` to the upper element of result.
3716 __m128d _mm_sqrt_sd(__m128d a, __m128d b) pure @trusted
3717 {
3718     // Note: the builtin has one argument, since the legacy `sqrtsd` SSE2 instruction operates on the same register only.
3719     //       "128-bit Legacy SSE version: The first source operand and the destination operand are the same. 
3720     //        The quadword at bits 127:64 of the destination operand remains unchanged."
3721     version(LDC)
3722     {
3723         // Disappeared with LDC 1.11
3724         static if (__VERSION__ < 2081)
3725         {
3726             __m128d c = __builtin_ia32_sqrtsd(b);
3727             a[0] = c[0];
3728             return a;
3729         }
3730         else
3731         {
3732             a.array[0] = llvm_sqrt(b.array[0]);
3733             return a;
3734         }
3735     }
3736     else static if (GDC_with_SSE2)
3737     {
3738         __m128d c = __builtin_ia32_sqrtsd(b);
3739         a.ptr[0] = c.array[0];
3740         return a;
3741     }
3742     else
3743     {
3744         a.ptr[0] = sqrt(b.array[0]);
3745         return a;
3746     }
3747 }
3748 unittest
3749 {
3750     __m128d A = _mm_setr_pd(1.0, 3.0);
3751     __m128d B = _mm_setr_pd(4.0, 5.0);
3752     __m128d R = _mm_sqrt_sd(A, B);
3753     double[2] correct = [2.0, 3.0 ];
3754     assert(R.array == correct);
3755 }
3756 
3757 /// Shift packed 16-bit integers in `a` right by `count` while shifting in sign bits.
3758 deprecated("Use _mm_srai_epi16 instead.") __m128i _mm_sra_epi16 (__m128i a, __m128i count) pure @trusted
3759 {
3760     static if (GDC_with_SSE2)
3761     {
3762         return cast(__m128i) __builtin_ia32_psraw128(cast(short8)a, cast(short8)count);
3763     }
3764     else static if (LDC_with_SSE2)
3765     {
3766         return cast(__m128i) __builtin_ia32_psraw128(cast(short8)a, cast(short8)count);
3767     }
3768     else
3769     {
3770         short8 sa = cast(short8)a;
3771         long2 lc = cast(long2)count;
3772         int bits = cast(int)(lc.array[0]);
3773         short8 r = void;
3774         foreach(i; 0..8)
3775             r.ptr[i] = cast(short)(sa.array[i] >> bits);
3776         return cast(int4)r;
3777     }
3778 }
3779 
3780 /// Shift packed 32-bit integers in `a` right by `count` while shifting in sign bits.
3781 deprecated("Use _mm_srai_epi32 instead.") __m128i _mm_sra_epi32 (__m128i a, __m128i count) pure @trusted
3782 {
3783     static if (LDC_with_SSE2)
3784     {
3785         return __builtin_ia32_psrad128(a, count);
3786     }
3787     else static if (GDC_with_SSE2)
3788     {
3789         return __builtin_ia32_psrad128(a, count);
3790     }
3791     else
3792     {    
3793         int4 r = void;
3794         long2 lc = cast(long2)count;
3795         int bits = cast(int)(lc.array[0]);
3796         r.ptr[0] = (a.array[0] >> bits);
3797         r.ptr[1] = (a.array[1] >> bits);
3798         r.ptr[2] = (a.array[2] >> bits);
3799         r.ptr[3] = (a.array[3] >> bits);
3800         return r;
3801     }
3802 }
3803 
3804 
3805 /// Shift packed 16-bit integers in `a` right by `imm8` while shifting in sign bits.
3806 __m128i _mm_srai_epi16 (__m128i a, int imm8) pure @trusted
3807 {
3808     static if (GDC_with_SSE2)
3809     {
3810         return cast(__m128i) __builtin_ia32_psrawi128(cast(short8)a, cast(ubyte)imm8);
3811     }
3812     else static if (LDC_with_SSE2)
3813     {
3814         return cast(__m128i) __builtin_ia32_psrawi128(cast(short8)a, cast(ubyte)imm8);
3815     }
3816     else static if (LDC_with_ARM64)
3817     {
3818         short8 sa = cast(short8)a;
3819         ubyte count = cast(ubyte)imm8;
3820         if (count > 15) 
3821             count = 15;
3822         short8 r = sa >> short8(count);
3823         return cast(__m128i)r;
3824     }
3825     else
3826     {
3827         short8 sa = cast(short8)a;
3828         short8 r = void;
3829 
3830         // Note: the intrinsics guarantee imm8[0..7] is taken, however
3831         //       D says "It's illegal to shift by the same or more bits 
3832         //       than the size of the quantity being shifted"
3833         //       and it's UB instead.
3834         ubyte count = cast(ubyte)imm8;
3835         if (count > 15) 
3836             count = 15;
3837         foreach(i; 0..8)
3838             r.ptr[i] = cast(short)(sa.array[i] >> count);
3839         return cast(int4)r;
3840     }
3841 }
3842 unittest
3843 {
3844     __m128i A = _mm_setr_epi16(0, 1, 2, 3, -4, -5, 6, 7);
3845     short8 B = cast(short8)( _mm_srai_epi16(A, 1) );
3846     short8 B2 = cast(short8)( _mm_srai_epi16(A, 1 + 256) );
3847     short[8] expectedB = [ 0, 0, 1, 1, -2, -3, 3, 3 ];
3848     assert(B.array == expectedB);
3849     assert(B2.array == expectedB);
3850 
3851     short8 C = cast(short8)( _mm_srai_epi16(A, 18) );
3852     short[8] expectedC = [ 0, 0, 0, 0, -1, -1, 0, 0 ];
3853     assert(C.array == expectedC);
3854 }
3855 
3856 /// Shift packed 32-bit integers in `a` right by `imm8` while shifting in sign bits.
3857 __m128i _mm_srai_epi32 (__m128i a, int imm8) pure @trusted
3858 {
3859     static if (LDC_with_SSE2)
3860     {
3861         return __builtin_ia32_psradi128(a, cast(ubyte)imm8);
3862     }
3863     else static if (GDC_with_SSE2)
3864     {
3865         return __builtin_ia32_psradi128(a, cast(ubyte)imm8);
3866     }
3867     else
3868     {
3869         int4 r = void;
3870 
3871         // Note: the intrinsics guarantee imm8[0..7] is taken, however
3872         //       D says "It's illegal to shift by the same or more bits 
3873         //       than the size of the quantity being shifted"
3874         //       and it's UB instead.
3875         ubyte count = cast(ubyte) imm8;
3876         if (count > 31)
3877             count = 31;
3878 
3879         r.ptr[0] = (a.array[0] >> count);
3880         r.ptr[1] = (a.array[1] >> count);
3881         r.ptr[2] = (a.array[2] >> count);
3882         r.ptr[3] = (a.array[3] >> count);
3883         return r;
3884     }
3885 }
3886 unittest
3887 {
3888     __m128i A = _mm_setr_epi32(0, 2, 3, -4);
3889     __m128i B = _mm_srai_epi32(A, 1);
3890     __m128i B2 = _mm_srai_epi32(A, 1 + 256);
3891     int[4] expectedB = [ 0, 1, 1, -2];
3892     assert(B.array == expectedB);
3893     assert(B2.array == expectedB);
3894 
3895     __m128i C = _mm_srai_epi32(A, 32);
3896     int[4] expectedC = [ 0, 0, 0, -1];
3897     assert(C.array == expectedC);
3898 
3899     __m128i D = _mm_srai_epi32(A, 0);
3900     int[4] expectedD = [ 0, 2, 3, -4];
3901     assert(D.array == expectedD);
3902 }
3903 
3904 deprecated("Use _mm_srli_epi16 instead.") __m128i _mm_srl_epi16 (__m128i a, __m128i count) pure @trusted
3905 {
3906     static if (LDC_with_SSE2)
3907     {
3908         return cast(__m128i) __builtin_ia32_psrlw128(cast(short8)a, cast(short8)count);
3909     }
3910     else static if (GDC_with_SSE2)
3911     {
3912         return cast(__m128i) __builtin_ia32_psrlw128(cast(short8)a, cast(short8)count);
3913     }
3914     else
3915     {
3916         short8 sa = cast(short8)a;
3917         long2 lc = cast(long2)count;
3918         int bits = cast(int)(lc.array[0]);
3919         short8 r = void;
3920         foreach(i; 0..8)
3921             r.ptr[i] = cast(short)(cast(ushort)(sa.array[i]) >> bits);
3922         return cast(int4)r;
3923     }
3924 }
3925 
3926 deprecated("Use _mm_srli_epi32 instead.") __m128i _mm_srl_epi32 (__m128i a, __m128i count) pure @trusted
3927 {
3928     static if (LDC_with_SSE2)
3929     {
3930         return __builtin_ia32_psrld128(a, count);
3931     }
3932     else static if (GDC_with_SSE2)
3933     {
3934         return __builtin_ia32_psrld128(a, count);
3935     }
3936     else
3937     {
3938         int4 r = void;
3939         long2 lc = cast(long2)count;
3940         int bits = cast(int)(lc.array[0]);
3941         r.ptr[0] = cast(uint)(a.array[0]) >> bits;
3942         r.ptr[1] = cast(uint)(a.array[1]) >> bits;
3943         r.ptr[2] = cast(uint)(a.array[2]) >> bits;
3944         r.ptr[3] = cast(uint)(a.array[3]) >> bits;
3945         return r;
3946     }
3947 }
3948 
3949 deprecated("Use _mm_srli_epi64 instead.") __m128i _mm_srl_epi64 (__m128i a, __m128i count) pure @trusted
3950 {
3951     static if (LDC_with_SSE2)
3952     {
3953         return cast(__m128i) __builtin_ia32_psrlq128(cast(long2)a, cast(long2)count);
3954     }
3955     else static if (GDC_with_SSE2)
3956     {
3957         return cast(__m128i) __builtin_ia32_psrlq128(cast(long2)a, cast(long2)count);
3958     }
3959     else
3960     {
3961         long2 r = void;
3962         long2 sa = cast(long2)a;
3963         long2 lc = cast(long2)count;
3964         int bits = cast(int)(lc.array[0]);
3965         r.ptr[0] = cast(ulong)(sa.array[0]) >> bits;
3966         r.ptr[1] = cast(ulong)(sa.array[1]) >> bits;
3967         return cast(__m128i)r;
3968     }
3969 }
3970 
3971 /// Shift packed 16-bit integers in `a` right by `imm8` while shifting in zeros.
3972 __m128i _mm_srli_epi16 (__m128i a, int imm8) pure @trusted
3973 {
3974     static if (GDC_with_SSE2)
3975     {
3976         return cast(__m128i) __builtin_ia32_psrlwi128(cast(short8)a, cast(ubyte)imm8);
3977     }
3978     else static if (LDC_with_SSE2)
3979     {
3980         return cast(__m128i) __builtin_ia32_psrlwi128(cast(short8)a, cast(ubyte)imm8);
3981     }
3982     else static if (LDC_with_ARM64)
3983     {
3984         short8 sa = cast(short8)a;
3985         short8 r = cast(short8) _mm_setzero_si128();
3986 
3987         ubyte count = cast(ubyte)imm8;
3988         if (count >= 16)
3989             return cast(__m128i)r;
3990 
        r = sa >>> short8(count); // The unsigned vector shift is offered by LDC, but not DMD.
3992         return cast(__m128i)r;
3993     }
3994     else
3995     {
3996         short8 sa = cast(short8)a;
3997         ubyte count = cast(ubyte)imm8;
3998 
3999         short8 r = cast(short8) _mm_setzero_si128();
4000         if (count >= 16)
4001             return cast(__m128i)r;
4002 
4003         foreach(i; 0..8)
4004             r.array[i] = cast(short)(cast(ushort)(sa.array[i]) >> count);
4005         return cast(__m128i)r;
4006     }
4007 }
4008 unittest
4009 {
4010     __m128i A = _mm_setr_epi16(0, 1, 2, 3, -4, -5, 6, 7);
4011     short8 B = cast(short8)( _mm_srli_epi16(A, 1) );
4012     short8 B2 = cast(short8)( _mm_srli_epi16(A, 1 + 256) );
4013     short[8] expectedB = [ 0, 0, 1, 1, 0x7FFE, 0x7FFD, 3, 3 ];
4014     assert(B.array == expectedB);
4015     assert(B2.array == expectedB);
4016 
4017     short8 C = cast(short8)( _mm_srli_epi16(A, 16) );
4018     short[8] expectedC = [ 0, 0, 0, 0, 0, 0, 0, 0];
4019     assert(C.array == expectedC);
4020 
4021     short8 D = cast(short8)( _mm_srli_epi16(A, 0) );
4022     short[8] expectedD = [ 0, 1, 2, 3, -4, -5, 6, 7 ];
4023     assert(D.array == expectedD);
4024 }
4025 
4026 
4027 /// Shift packed 32-bit integers in `a` right by `imm8` while shifting in zeros.
4028 __m128i _mm_srli_epi32 (__m128i a, int imm8) pure @trusted
4029 {
4030     static if (GDC_with_SSE2)
4031     {
4032         return __builtin_ia32_psrldi128(a, cast(ubyte)imm8);
4033     }
4034     else static if (LDC_with_SSE2)
4035     {
4036         return __builtin_ia32_psrldi128(a, cast(ubyte)imm8);
4037     }
4038     else
4039     {
4040         ubyte count = cast(ubyte) imm8;
4041 
4042         // Note: the intrinsics guarantee imm8[0..7] is taken, however
4043         //       D says "It's illegal to shift by the same or more bits 
4044         //       than the size of the quantity being shifted"
4045         //       and it's UB instead.
4046         int4 r = _mm_setzero_si128();
4047         if (count >= 32)
4048             return r;
4049         r.ptr[0] = a.array[0] >>> count;
4050         r.ptr[1] = a.array[1] >>> count;
4051         r.ptr[2] = a.array[2] >>> count;
4052         r.ptr[3] = a.array[3] >>> count;
4053         return r;
4054     }
4055 }
4056 unittest
4057 {
4058     __m128i A = _mm_setr_epi32(0, 2, 3, -4);
4059     __m128i B = _mm_srli_epi32(A, 1);
4060     __m128i B2 = _mm_srli_epi32(A, 1 + 256);
4061     int[4] expectedB = [ 0, 1, 1, 0x7FFFFFFE];
4062     assert(B.array == expectedB);
4063     assert(B2.array == expectedB);
4064  
4065     __m128i C = _mm_srli_epi32(A, 255);
4066     int[4] expectedC = [ 0, 0, 0, 0 ];
4067     assert(C.array == expectedC);
4068 }
4069 
4070 /// Shift packed 64-bit integers in `a` right by `imm8` while shifting in zeros.
4071 __m128i _mm_srli_epi64 (__m128i a, int imm8) pure @trusted
4072 {
4073     static if (GDC_with_SSE2)
4074     {
4075         return cast(__m128i) __builtin_ia32_psrlqi128(cast(long2)a, cast(ubyte)imm8);
4076     }
4077     else static if (LDC_with_SSE2)
4078     {
4079         return cast(__m128i) __builtin_ia32_psrlqi128(cast(long2)a, cast(ubyte)imm8);
4080     }
4081     else
4082     {
4083         long2 r = cast(long2) _mm_setzero_si128();
4084         long2 sa = cast(long2)a;
4085 
4086         ubyte count = cast(ubyte) imm8;
4087         if (count >= 64)
4088             return cast(__m128i)r;
4089 
4090         r.ptr[0] = sa.array[0] >>> count;
4091         r.ptr[1] = sa.array[1] >>> count;
4092         return cast(__m128i)r;
4093     }
4094 }
4095 unittest
4096 {
4097     __m128i A = _mm_setr_epi64(8, -4);
4098     long2 B = cast(long2) _mm_srli_epi64(A, 1);
4099     long2 B2 = cast(long2) _mm_srli_epi64(A, 1 + 512);
4100     long[2] expectedB = [ 4, 0x7FFFFFFFFFFFFFFE];
4101     assert(B.array == expectedB);
4102     assert(B2.array == expectedB);
4103 
4104     long2 C = cast(long2) _mm_srli_epi64(A, 64);
4105     long[2] expectedC = [ 0, 0 ];
4106     assert(C.array == expectedC);
4107 }
4108 
4109 /// Shift `v` right by `bytes` bytes while shifting in zeros.
4110 __m128i _mm_srli_si128(ubyte bytes)(__m128i v) pure @safe
4111 {
4112     static if (bytes & 0xF0)
4113     {
4114         return _mm_setzero_si128();
4115     }
4116     else static if (GDC_with_SSE2)
4117     {
4118         return cast(__m128i) __builtin_ia32_psrldqi128(cast(long2)v, cast(ubyte)(bytes * 8));
4119     }
4120     else static if (DMD_with_32bit_asm)
4121     {
4122         asm pure nothrow @nogc @trusted
4123         {
4124             movdqu XMM0, v;
4125             psrldq XMM0, bytes;
4126             movdqu v, XMM0;
4127         }
4128         return v;
4129     }
4130     else
4131     {
4132         return cast(__m128i) shufflevector!(byte16,
4133                                             bytes+0, bytes+1, bytes+2, bytes+3, bytes+4, bytes+5, bytes+6, bytes+7,
4134                                             bytes+8, bytes+9, bytes+10, bytes+11, bytes+12, bytes+13, bytes+14, bytes+15)
4135                                            (cast(byte16) v, cast(byte16)_mm_setzero_si128());
4136     }
4137 }
4138 unittest
4139 {
4140     __m128i R = _mm_srli_si128!4(_mm_set_epi32(4, 3, 2, 1));
4141     int[4] correct = [2, 3, 4, 0];
4142     assert(R.array == correct);
4143 
4144     __m128i A = _mm_srli_si128!16(_mm_set1_epi32(-1));
4145     int[4] expectedA = [0, 0, 0, 0];
4146     assert(A.array == expectedA);
4147 }
4148 
4149 /// Shift `v` right by `bytes` bytes while shifting in zeros.
4150 /// #BONUS
4151 __m128 _mm_srli_ps(ubyte bytes)(__m128 v) pure @safe
4152 {
4153     return cast(__m128)_mm_srli_si128!bytes(cast(__m128i)v);
4154 }
4155 unittest
4156 {
4157     __m128 R = _mm_srli_ps!8(_mm_set_ps(4.0f, 3.0f, 2.0f, 1.0f));
4158     float[4] correct = [3.0f, 4.0f, 0, 0];
4159     assert(R.array == correct);
4160 }
4161 
4162 /// Shift `v` right by `bytes` bytes while shifting in zeros.
4163 /// #BONUS
4164 __m128d _mm_srli_pd(ubyte bytes)(__m128d v) pure @safe
4165 {
4166     return cast(__m128d) _mm_srli_si128!bytes(cast(__m128i)v);
4167 }
4168 
4169 /// Store 128-bits (composed of 2 packed double-precision (64-bit) floating-point elements) from `a` into memory. 
4170 /// `mem_addr` must be aligned on a 16-byte boundary or a general-protection exception may be generated.
4171 void _mm_store_pd (double* mem_addr, __m128d a) pure @trusted
4172 {
4173     pragma(inline, true);
4174     __m128d* aligned = cast(__m128d*)mem_addr;
4175     *aligned = a;
4176 }
4177 
4178 /// Store the lower double-precision (64-bit) floating-point element from `a` into 2 contiguous elements in memory. 
4179 /// `mem_addr` must be aligned on a 16-byte boundary or a general-protection exception may be generated.
4180 void _mm_store_pd1 (double* mem_addr, __m128d a) pure @trusted
4181 {
4182     __m128d* aligned = cast(__m128d*)mem_addr;
4183     __m128d r;
4184     r.ptr[0] = a.array[0];
4185     r.ptr[1] = a.array[0];
4186     *aligned = r;
4187 }
4188 
4189 /// Store the lower double-precision (64-bit) floating-point element from `a` into memory. `mem_addr` does not need to 
4190 /// be aligned on any particular boundary.
4191 void _mm_store_sd (double* mem_addr, __m128d a) pure @safe
4192 {
4193     pragma(inline, true);
4194     *mem_addr = a.array[0];
4195 }
4196 
4197 /// Store 128-bits of integer data from `a` into memory. `mem_addr` must be aligned on a 16-byte boundary or a 
4198 /// general-protection exception may be generated.
4199 void _mm_store_si128 (__m128i* mem_addr, __m128i a) pure @safe
4200 {
4201     pragma(inline, true);
4202     *mem_addr = a;
4203 }
4204 
4205 alias _mm_store1_pd = _mm_store_pd1; ///
4206 
4207 /// Store the upper double-precision (64-bit) floating-point element from `a` into memory.
4208 void _mm_storeh_pd (double* mem_addr, __m128d a) pure @safe
4209 {
4210     pragma(inline, true);
4211     *mem_addr = a.array[1];
4212 }
4213 
/// Store 64-bit integer from the first element of `a` into memory.
// Note: `mem_addr` doesn't have to actually be aligned, which breaks
// expectations from the user point of view. This problem also exists in C++.
void _mm_storel_epi64 (__m128i* mem_addr, __m128i a) pure @safe
4217 {
4218     pragma(inline, true);
4219     long* dest = cast(long*)mem_addr;
4220     long2 la = cast(long2)a;
4221     *dest = la.array[0];
4222 }
4223 unittest
4224 {
4225     long[3] A = [1, 2, 3];
4226     _mm_storel_epi64(cast(__m128i*)(&A[1]), _mm_set_epi64x(0x1_0000_0000, 0x1_0000_0000));
4227     long[3] correct = [1, 0x1_0000_0000, 3];
4228     assert(A == correct);
4229 }
4230 
4231 /// Store the lower double-precision (64-bit) floating-point element from `a` into memory.
4232 void _mm_storel_pd (double* mem_addr, __m128d a) pure @safe
4233 {
4234     pragma(inline, true);
4235     *mem_addr = a.array[0];
4236 }
4237 
4238 /// Store 2 double-precision (64-bit) floating-point elements from `a` into memory in reverse order. `mem_addr` must be 
4239 /// aligned on a 16-byte boundary or a general-protection exception may be generated.
void _mm_storer_pd (double* mem_addr, __m128d a) pure @trusted
4241 {
4242     __m128d* aligned = cast(__m128d*)mem_addr;
4243     *aligned = shufflevector!(double2, 1, 0)(a, a);
4244 }
4245 
4246 /// Store 128-bits (composed of 2 packed double-precision (64-bit) floating-point elements) from `a` into memory. 
4247 /// `mem_addr` does not need to be aligned on any particular boundary.
4248 void _mm_storeu_pd (double* mem_addr, __m128d a) pure @safe
4249 {
4250     pragma(inline, true);
4251     storeUnaligned!double2(a, mem_addr);
4252 }
4253 
4254 /// Store 128-bits of integer data from `a` into memory. `mem_addr` does not need to be aligned on any particular 
4255 /// boundary.
4256 void _mm_storeu_si128 (__m128i* mem_addr, __m128i a) pure @safe
4257 {
4258     pragma(inline, true);
4259     storeUnaligned!__m128i(a, cast(int*)mem_addr);
4260 }
4261 
4262 /// Store 32-bit integer from the first element of `a` into memory. 
4263 /// `mem_addr` does not need to be aligned on any particular boundary.
4264 void _mm_storeu_si32 (void* mem_addr, __m128i a) pure @trusted
4265 {
4266     pragma(inline, true);
4267     int* dest = cast(int*)mem_addr;
4268     *dest = a.array[0];
4269 }
4270 unittest
4271 {
4272     int[2] arr = [-24, 12];
4273     _mm_storeu_si32(&arr[1], _mm_setr_epi32(-1, -2, -6, -7));
4274     assert(arr == [-24, -1]);
4275 }
4276 
4277 /// Store 128-bits (composed of 2 packed double-precision (64-bit) floating-point elements)
4278 /// from a into memory using a non-temporal memory hint. mem_addr must be aligned on a 16-byte
4279 /// boundary or a general-protection exception may be generated.
4280 void _mm_stream_pd (double* mem_addr, __m128d a)
4281 {
4282     // BUG see `_mm_stream_ps` for an explanation why we don't implement non-temporal moves
4283     __m128d* dest = cast(__m128d*)mem_addr;
4284     *dest = a;
4285 }
4286 
4287 /// Store 128-bits of integer data from a into memory using a non-temporal memory hint.
4288 /// mem_addr must be aligned on a 16-byte boundary or a general-protection exception
4289 /// may be generated.
4290 void _mm_stream_si128 (__m128i* mem_addr, __m128i a)
4291 {
4292     // BUG see `_mm_stream_ps` for an explanation why we don't implement non-temporal moves
4293     __m128i* dest = cast(__m128i*)mem_addr;
4294     *dest = a;
4295 }
4296 
4297 /// Store 32-bit integer a into memory using a non-temporal hint to minimize cache
4298 /// pollution. If the cache line containing address mem_addr is already in the cache,
4299 /// the cache will be updated.
4300 void _mm_stream_si32 (int* mem_addr, int a)
4301 {
4302     // BUG see `_mm_stream_ps` for an explanation why we don't implement non-temporal moves
4303     *mem_addr = a;
4304 }
4305 
4306 /// Store 64-bit integer a into memory using a non-temporal hint to minimize
4307 /// cache pollution. If the cache line containing address mem_addr is already
4308 /// in the cache, the cache will be updated.
4309 void _mm_stream_si64 (long* mem_addr, long a)
4310 {
4311     // BUG See `_mm_stream_ps` for an explanation why we don't implement non-temporal moves
4312     *mem_addr = a;
4313 }
4314 
4315 /// Subtract packed 16-bit integers in `b` from packed 16-bit integers in `a`.
4316 __m128i _mm_sub_epi16(__m128i a, __m128i b) pure @safe
4317 {
4318     pragma(inline, true);
4319     return cast(__m128i)(cast(short8)a - cast(short8)b);
4320 }
4321 
4322 /// Subtract packed 32-bit integers in `b` from packed 32-bit integers in `a`.
4323 __m128i _mm_sub_epi32(__m128i a, __m128i b) pure @safe
4324 {
4325     pragma(inline, true);
4326     return cast(__m128i)(cast(int4)a - cast(int4)b);
4327 }
4328 
4329 /// Subtract packed 64-bit integers in `b` from packed 64-bit integers in `a`.
4330 __m128i _mm_sub_epi64(__m128i a, __m128i b) pure @safe
4331 {
4332     pragma(inline, true);
4333     return cast(__m128i)(cast(long2)a - cast(long2)b);
4334 }
4335 
4336 /// Subtract packed 8-bit integers in `b` from packed 8-bit integers in `a`.
4337 __m128i _mm_sub_epi8(__m128i a, __m128i b) pure @safe
4338 {
4339     pragma(inline, true);
4340     return cast(__m128i)(cast(byte16)a - cast(byte16)b);
4341 }
4342 
4343 /// Subtract packed double-precision (64-bit) floating-point elements in `b` from packed double-precision (64-bit) 
4344 /// floating-point elements in `a`.
4345 __m128d _mm_sub_pd(__m128d a, __m128d b) pure @safe
4346 {
4347     pragma(inline, true);
4348     return a - b;
4349 }
4350 
4351 /// Subtract the lower double-precision (64-bit) floating-point element in `b` from the lower double-precision (64-bit) 
4352 /// floating-point element in `a`, store that in the lower element of result, and copy the upper element from `a` to the
4353 /// upper element of result.
4354 __m128d _mm_sub_sd(__m128d a, __m128d b) pure @trusted
4355 {
4356     version(DigitalMars)
4357     {
4358         // Work-around for https://issues.dlang.org/show_bug.cgi?id=19599
4359         // Note that this is unneeded since DMD >= 2.094.0 at least, haven't investigated again
4360         asm pure nothrow @nogc @trusted { nop;}
4361         a[0] = a[0] - b[0];
4362         return a;
4363     }
4364     else static if (GDC_with_SSE2)
4365     {
4366         return __builtin_ia32_subsd(a, b);
4367     }
4368     else
4369     {
4370         a.ptr[0] -= b.array[0];
4371         return a;
4372     }
4373 }
4374 unittest
4375 {
4376     __m128d a = [1.5, -2.0];
4377     a = _mm_sub_sd(a, a);
4378     assert(a.array == [0.0, -2.0]);
4379 }
4380 
4381 /// Subtract 64-bit integer `b` from 64-bit integer `a`.
4382 __m64 _mm_sub_si64 (__m64 a, __m64 b) pure @safe
4383 {
4384     pragma(inline, true);
4385     return a - b;
4386 }
4387 
/// Subtract packed 16-bit signed integers in `b` from packed 16-bit signed integers in `a` using signed saturation.
4389 __m128i _mm_subs_epi16(__m128i a, __m128i b) pure @trusted
4390 {
4391     version(LDC)
4392     {
4393         static if (__VERSION__ >= 2085) // saturation x86 intrinsics disappeared in LLVM 8
4394         {
4395             // Generates PSUBSW since LDC 1.15 -O0
4398             enum prefix = `declare <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> %a, <8 x i16> %b)`;
4399             enum ir = `
4400                 %r = call <8 x i16> @llvm.ssub.sat.v8i16( <8 x i16> %0, <8 x i16> %1)
4401                 ret <8 x i16> %r`;
4402             return cast(__m128i) LDCInlineIREx!(prefix, ir, "", short8, short8, short8)(cast(short8)a, cast(short8)b);
4403         }
4404         else static if (LDC_with_ARM) // Raspberry ships with LDC 1.12, no saturation 
4405         {
4407             short[8] res;
4408             short8 sa = cast(short8)a;
4409             short8 sb = cast(short8)b;
4410             foreach(i; 0..8)
4411                 res[i] = saturateSignedIntToSignedShort(sa.array[i] - sb.array[i]);
4412             return _mm_loadu_si128(cast(int4*)res.ptr);
4413         }
4414         else static if (LDC_with_SSE2)
4415         {
4416             return cast(__m128i) __builtin_ia32_psubsw128(cast(short8) a, cast(short8) b);
4417         }
4418         else
4419             static assert(false);
4420     }
4421     else static if (GDC_with_SSE2)
4422     {
4423         return cast(__m128i) __builtin_ia32_psubsw128(cast(short8) a, cast(short8) b);
4424     }
4425     else
4426     {
4427         short[8] res;
4428         short8 sa = cast(short8)a;
4429         short8 sb = cast(short8)b;
4430         foreach(i; 0..8)
4431             res.ptr[i] = saturateSignedIntToSignedShort(sa.array[i] - sb.array[i]);
4432         return _mm_loadu_si128(cast(int4*)res.ptr);
4433     }
4434 }
4435 unittest
4436 {
4437     short8 res = cast(short8) _mm_subs_epi16(_mm_setr_epi16(32760, -32760, 5, 4, 3, 2, 1, 0),
4438                                              _mm_setr_epi16(-10  ,     16, 5, 4, 3, 2, 1, 0));
4439     static immutable short[8] correctResult =              [32767, -32768, 0, 0, 0, 0, 0, 0];
4440     assert(res.array == correctResult);
4441 }
4442 
/// Subtract packed 8-bit signed integers in `b` from packed 8-bit signed integers in `a` using signed saturation.
4444 __m128i _mm_subs_epi8(__m128i a, __m128i b) pure @trusted
4445 {
4446     version(LDC)
4447     {
4448         static if (__VERSION__ >= 2085) // saturation x86 intrinsics disappeared in LLVM 8
4449         {
4450             // x86: Generates PSUBSB since LDC 1.15 -O0
4451             // ARM: Generates sqsub.16b since LDC 1.21 -O0
4452             enum prefix = `declare <16 x i8> @llvm.ssub.sat.v16i8(<16 x i8> %a, <16 x i8> %b)`;
4453             enum ir = `
4454                 %r = call <16 x i8> @llvm.ssub.sat.v16i8( <16 x i8> %0, <16 x i8> %1)
4455                 ret <16 x i8> %r`;
4456             return cast(__m128i) LDCInlineIREx!(prefix, ir, "", byte16, byte16, byte16)(cast(byte16)a, cast(byte16)b);
4457         }
4458         else static if (LDC_with_ARM) // Raspberry ships with LDC 1.12, no saturation 
4459         {
4460             byte[16] res;
4461             byte16 sa = cast(byte16)a;
4462             byte16 sb = cast(byte16)b;
4463             foreach(i; 0..16)
4464                 res[i] = saturateSignedWordToSignedByte(sa.array[i] - sb.array[i]);
4465             return _mm_loadu_si128(cast(int4*)res.ptr);
4466         }
4467         else static if (LDC_with_SSE2)
4468         {
4469             return cast(__m128i) __builtin_ia32_psubsb128(cast(byte16) a, cast(byte16) b);
4470         }
4471         else
4472             static assert(false);
4473     }
4474     else static if (GDC_with_SSE2)
4475     {
4476         return cast(__m128i) __builtin_ia32_psubsb128(cast(ubyte16) a, cast(ubyte16) b);
4477     }
4478     else
4479     {
4480         byte[16] res;
4481         byte16 sa = cast(byte16)a;
4482         byte16 sb = cast(byte16)b;
4483         foreach(i; 0..16)
4484             res[i] = saturateSignedWordToSignedByte(sa.array[i] - sb.array[i]);
4485         return _mm_loadu_si128(cast(int4*)res.ptr);
4486     }
4487 }
4488 unittest
4489 {
4490     byte16 res = cast(byte16) _mm_subs_epi8(_mm_setr_epi8(-128, 127, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0),
4491                                             _mm_setr_epi8(  15, -14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0));
4492     static immutable byte[16] correctResult            = [-128, 127,  0,  0,  0,  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0];
4493     assert(res.array == correctResult);
4494 }
4495 
/// Subtract packed 16-bit unsigned integers in `b` from packed 16-bit unsigned integers in `a` using unsigned saturation.
4497 __m128i _mm_subs_epu16(__m128i a, __m128i b) pure @trusted
4498 {
4499     version(LDC)
4500     {
4501         static if (__VERSION__ >= 2085) // saturation x86 intrinsics disappeared in LLVM 8
4502         {
4503             // x86: Generates PSUBUSW since LDC 1.15 -O0
4504             // ARM: Generates uqsub.8h since LDC 1.21 -O0
4505             enum prefix = `declare <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> %a, <8 x i16> %b)`;
4506             enum ir = `
4507                 %r = call <8 x i16> @llvm.usub.sat.v8i16( <8 x i16> %0, <8 x i16> %1)
4508                 ret <8 x i16> %r`;
4509             return cast(__m128i) LDCInlineIREx!(prefix, ir, "", short8, short8, short8)(cast(short8)a, cast(short8)b);
4510         }
4511         else static if (LDC_with_ARM) // Raspberry ships with LDC 1.12, no saturation 
4512         {
4513             short[8] res;
4514             short8 sa = cast(short8)a;
4515             short8 sb = cast(short8)b;
4516             foreach(i; 0..8)
4517             {
4518                 int sum = cast(ushort)(sa.array[i]) - cast(ushort)(sb.array[i]);
4519                 res[i] = saturateSignedIntToUnsignedShort(sum);
4520             }
4521             return _mm_loadu_si128(cast(int4*)res.ptr);
4522         }
4523         else static if (LDC_with_SSE2)
4524         {
4525             return cast(__m128i) __builtin_ia32_psubusw128(a, b);
4526         }
4527         else 
4528             static assert(false);
4529     }
4530     else static if (GDC_with_SSE2)
4531     {
4532         return cast(__m128i) __builtin_ia32_psubusw128(cast(short8)a, cast(short8)b);
4533     }
4534     else
4535     {
4536         short[8] res;
4537         short8 sa = cast(short8)a;
4538         short8 sb = cast(short8)b;
4539         foreach(i; 0..8)
4540         {
4541             int sum = cast(ushort)(sa.array[i]) - cast(ushort)(sb.array[i]);
4542             res[i] = saturateSignedIntToUnsignedShort(sum);
4543         }
4544         return _mm_loadu_si128(cast(int4*)res.ptr);
4545     }
4546 }
4547 unittest
4548 {
4549     short8 R = cast(short8) _mm_subs_epu16(_mm_setr_epi16(cast(short)65534,  1, 5, 4, 3, 2, 1, 0),
4550                                            _mm_setr_epi16(cast(short)65535, 16, 4, 4, 3, 0, 1, 0));
4551     static immutable short[8] correct =                  [               0,  0, 1, 0, 0, 2, 0, 0];
4552     assert(R.array == correct);
4553 }
4554 
/// Subtract packed 8-bit unsigned integers in `b` from packed 8-bit unsigned integers in `a` using unsigned saturation.
4556 __m128i _mm_subs_epu8(__m128i a, __m128i b) pure @trusted
4557 {
4558     version(LDC)
4559     {
4560         static if (__VERSION__ >= 2085) // saturation x86 intrinsics disappeared in LLVM 8
4561         {
4562             // x86: Generates PSUBUSB since LDC 1.15 -O0
4563             // ARM: Generates uqsub.16b since LDC 1.21 -O0
4564             enum prefix = `declare <16 x i8> @llvm.usub.sat.v16i8(<16 x i8> %a, <16 x i8> %b)`;
4565             enum ir = `
4566                 %r = call <16 x i8> @llvm.usub.sat.v16i8( <16 x i8> %0, <16 x i8> %1)
4567                 ret <16 x i8> %r`;
4568             return cast(__m128i) LDCInlineIREx!(prefix, ir, "", byte16, byte16, byte16)(cast(byte16)a, cast(byte16)b);
4569         }
4570         else static if (LDC_with_ARM) // Raspberry ships with LDC 1.12, no saturation
4571         {
            ubyte[16] res;
            byte16 sa = cast(byte16)a;
            byte16 sb = cast(byte16)b;
            foreach(i; 0..16)
                res[i] = saturateSignedWordToUnsignedByte(cast(ubyte)(sa.array[i]) - cast(ubyte)(sb.array[i]));
            return _mm_loadu_si128(cast(int4*)res.ptr);
4582         }
4583         else static if (LDC_with_SSE2)
4584         {
4585             return __builtin_ia32_psubusb128(a, b);
4586         }
4587         else 
4588             static assert(false);
4589     }
4590     else static if (GDC_with_SSE2)
4591     {
4592         return cast(__m128i) __builtin_ia32_psubusb128(cast(ubyte16) a, cast(ubyte16) b);
4593     }
4594     else
4595     {
4596         ubyte[16] res;
4597         byte16 sa = cast(byte16)a;
4598         byte16 sb = cast(byte16)b;
4599         foreach(i; 0..16)
4600             res[i] = saturateSignedWordToUnsignedByte(cast(ubyte)(sa.array[i]) - cast(ubyte)(sb.array[i]));
4601         return _mm_loadu_si128(cast(int4*)res.ptr);
4602     }
4603 }
4604 unittest
4605 {
4606     byte16 res = cast(byte16) _mm_subs_epu8(_mm_setr_epi8(cast(byte)254, 127, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0),
4607                                             _mm_setr_epi8(cast(byte)255, 120, 14, 42, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0));
4608     static immutable byte[16] correctResult =            [            0,   7,  0,  0,  0,  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0];
4609     assert(res.array == correctResult);
4610 }
4611 
// Note: the only difference between the `ucomi` and `comi` intrinsics is their signalling
//       behaviour on quiet NaNs, so they are aliased here. This is technically incorrect, but
//       the case where you would want to differentiate between qNaN and sNaN and then treat
//       them differently on purpose seems extremely rare.
4616 alias _mm_ucomieq_sd = _mm_comieq_sd; ///
4617 alias _mm_ucomige_sd = _mm_comige_sd; ///
4618 alias _mm_ucomigt_sd = _mm_comigt_sd; ///
4619 alias _mm_ucomile_sd = _mm_comile_sd; ///
4620 alias _mm_ucomilt_sd = _mm_comilt_sd; ///
4621 alias _mm_ucomineq_sd = _mm_comineq_sd; ///
4622 
4623 /// Return vector of type `__m128d` with undefined elements.
4624 __m128d _mm_undefined_pd() pure @safe
4625 {
4626     pragma(inline, true);
4627     __m128d result = void;
4628     return result;
4629 }
4630 
4631 /// Return vector of type `__m128i` with undefined elements.
4632 __m128i _mm_undefined_si128() pure @safe
4633 {
4634     pragma(inline, true);
4635     __m128i result = void;
4636     return result;
4637 }
4638 
4639 /// Unpack and interleave 16-bit integers from the high half of `a` and `b`.
4640 __m128i _mm_unpackhi_epi16 (__m128i a, __m128i b) pure @safe
4641 {
4642     static if (GDC_with_SSE2)
4643     {
4644         return cast(__m128i) __builtin_ia32_punpckhwd128(cast(short8) a, cast(short8) b);
4645     }
4646     else static if (DMD_with_32bit_asm)
4647     {
4648         asm pure nothrow @nogc @trusted
4649         {
4650             movdqu XMM0, a;
4651             movdqu XMM1, b;
4652             punpckhwd XMM0, XMM1;
4653             movdqu a, XMM0;
4654         }
4655         return a;
4656     }
4657     else
4658     {
4659         return cast(__m128i) shufflevector!(short8, 4, 12, 5, 13, 6, 14, 7, 15)
4660                                            (cast(short8)a, cast(short8)b);
4661     }
4662 }
4663 unittest
4664 {
4665     __m128i A = _mm_setr_epi16(4,   5,  6,  7,  8,  9, 10, 11);
4666     __m128i B = _mm_setr_epi16(12, 13, 14, 15, 16, 17, 18, 19);
4667     short8 C = cast(short8)(_mm_unpackhi_epi16(A, B));
4668     short[8] correct = [8, 16, 9, 17, 10, 18, 11, 19];
4669     assert(C.array == correct);
4670 }
4671 
4672 /// Unpack and interleave 32-bit integers from the high half of `a` and `b`.
4673 __m128i _mm_unpackhi_epi32 (__m128i a, __m128i b) pure @trusted
4674 {
4675     static if (GDC_with_SSE2)
4676     {
4677         return __builtin_ia32_punpckhdq128(a, b);
4678     }
4679     else version(DigitalMars)
4680     {
4681         __m128i r;
4682         r.ptr[0] = a.array[2];
4683         r.ptr[1] = b.array[2];
4684         r.ptr[2] = a.array[3];
4685         r.ptr[3] = b.array[3];
4686         return r;
4687     }
4688     else
4689     {
4690         return shufflevector!(int4, 2, 6, 3, 7)(cast(int4)a, cast(int4)b);
4691     }
4692 }
4693 unittest
4694 {
4695     __m128i A = _mm_setr_epi32(1, 2, 3, 4);
4696     __m128i B = _mm_setr_epi32(5, 6, 7, 8);
4697     __m128i C = _mm_unpackhi_epi32(A, B);
4698     int[4] correct = [3, 7, 4, 8];
4699     assert(C.array == correct);
4700 }
4701 
4702 /// Unpack and interleave 64-bit integers from the high half of `a` and `b`.
4703 __m128i _mm_unpackhi_epi64 (__m128i a, __m128i b) pure @trusted
4704 {
4705     static if (GDC_with_SSE2)
4706     {
4707         return cast(__m128i) __builtin_ia32_punpckhqdq128(cast(long2) a, cast(long2) b);
4708     }
4709     else
4710     {
4711         __m128i r = cast(__m128i)b;
4712         r[0] = a[2];
4713         r[1] = a[3];
4714         return r; 
4715     }
4716 }
4717 unittest // Issue #36
4718 {
4719     __m128i A = _mm_setr_epi64(0x22222222_22222222, 0x33333333_33333333);
4720     __m128i B = _mm_setr_epi64(0x44444444_44444444, 0x55555555_55555555);
4721     long2 C = cast(long2)(_mm_unpackhi_epi64(A, B));
4722     long[2] correct = [0x33333333_33333333, 0x55555555_55555555];
4723     assert(C.array == correct);
4724 }
4725 
4726 /// Unpack and interleave 8-bit integers from the high half of `a` and `b`.
4727 __m128i _mm_unpackhi_epi8 (__m128i a, __m128i b) pure @safe
4728 {
4729     static if (GDC_with_SSE2)
4730     {
4731         return cast(__m128i) __builtin_ia32_punpckhbw128(cast(ubyte16)a, cast(ubyte16)b);
4732     }
4733     else static if (DMD_with_32bit_asm)
4734     {
4735         asm pure nothrow @nogc @trusted
4736         {
4737             movdqu XMM0, a;
4738             movdqu XMM1, b;
4739             punpckhbw XMM0, XMM1;
4740             movdqu a, XMM0;
4741         }
4742         return a;
4743     }
4744     else
4745     {
4746         return cast(__m128i)shufflevector!(byte16, 8,  24,  9, 25, 10, 26, 11, 27,
4747                                                    12, 28, 13, 29, 14, 30, 15, 31)
4748                                                    (cast(byte16)a, cast(byte16)b);
4749     }
4750 }
4751 unittest
4752 {
4753     __m128i A = _mm_setr_epi8( 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15);
4754     __m128i B = _mm_setr_epi8(16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31);
4755     byte16 C = cast(byte16) _mm_unpackhi_epi8(A, B);
4756     byte[16] correct = [8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31];
4757     assert(C.array == correct);
4758 }
4759 
4760 /// Unpack and interleave double-precision (64-bit) floating-point elements from the high half of `a` and `b`.
4761 __m128d _mm_unpackhi_pd (__m128d a, __m128d b) pure @safe
4762 {
4763     static if (GDC_with_SSE2)
4764     {
4765         return __builtin_ia32_unpckhpd(a, b);
4766     }
4767     else
4768     {
4769         return shufflevector!(__m128d, 1, 3)(a, b);
4770     }
4771 }
4772 unittest
4773 {
4774     __m128d A = _mm_setr_pd(4.0, 6.0);
4775     __m128d B = _mm_setr_pd(7.0, 9.0);
4776     __m128d C = _mm_unpackhi_pd(A, B);
4777     double[2] correct = [6.0, 9.0];
4778     assert(C.array == correct);
4779 }
4780 
4781 /// Unpack and interleave 16-bit integers from the low half of `a` and `b`.
4782 __m128i _mm_unpacklo_epi16 (__m128i a, __m128i b) pure @safe
4783 {
4784     static if (GDC_with_SSE2)
4785     {
4786         return cast(__m128i) __builtin_ia32_punpcklwd128(cast(short8) a, cast(short8) b);
4787     }
4788     else static if (DMD_with_32bit_asm)
4789     {
4790         asm pure nothrow @nogc @trusted
4791         {
4792             movdqu XMM0, a;
4793             movdqu XMM1, b;
4794             punpcklwd XMM0, XMM1;
4795             movdqu a, XMM0;
4796         }
4797         return a;
4798     }
4799     else
4800     {
4801         return cast(__m128i) shufflevector!(short8, 0, 8, 1, 9, 2, 10, 3, 11)
4802                                            (cast(short8)a, cast(short8)b);
4803     }
4804 }
4805 unittest
4806 {
4807     __m128i A = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
4808     __m128i B = _mm_setr_epi16(8, 9, 10, 11, 12, 13, 14, 15);
4809     short8 C = cast(short8) _mm_unpacklo_epi16(A, B);
4810     short[8] correct = [0, 8, 1, 9, 2, 10, 3, 11];
4811     assert(C.array == correct);
4812 }
4813 
4814 /// Unpack and interleave 32-bit integers from the low half of `a` and `b`.
4815 __m128i _mm_unpacklo_epi32 (__m128i a, __m128i b) pure @trusted
4816 {
4817     static if (GDC_with_SSE2)
4818     {
4819         return __builtin_ia32_punpckldq128(a, b);
4820     }
4821     else version(DigitalMars)
4822     {
4823         __m128i r;
4824         r.ptr[0] = a.array[0];
4825         r.ptr[1] = b.array[0];
4826         r.ptr[2] = a.array[1];
4827         r.ptr[3] = b.array[1];
4828         return r;
4829     }
4830     else
4831     {
4832         return shufflevector!(int4, 0, 4, 1, 5)(cast(int4)a, cast(int4)b);
4833     }
4834 }
4835 unittest
4836 {
4837     __m128i A = _mm_setr_epi32(1, 2, 3, 4);
4838     __m128i B = _mm_setr_epi32(5, 6, 7, 8);
4839     __m128i C = _mm_unpacklo_epi32(A, B);
4840     int[4] correct = [1, 5, 2, 6];
4841     assert(C.array == correct);
4842 }
4843 
4844 /// Unpack and interleave 64-bit integers from the low half of `a` and `b`.
4845 __m128i _mm_unpacklo_epi64 (__m128i a, __m128i b) pure @trusted
4846 {
4847     static if (GDC_with_SSE2)
4848     {
4849         return cast(__m128i) __builtin_ia32_punpcklqdq128(cast(long2) a, cast(long2) b);
4850     }
4851     else
4852     {
4853         long2 lA = cast(long2)a;
4854         long2 lB = cast(long2)b;
4855         long2 R;
4856         R.ptr[0] = lA.array[0];
4857         R.ptr[1] = lB.array[0];
4858         return cast(__m128i)R;
4859     }
4860 }
4861 unittest // Issue #36
4862 {
4863     __m128i A = _mm_setr_epi64(0x22222222_22222222, 0x33333333_33333333);
4864     __m128i B = _mm_setr_epi64(0x44444444_44444444, 0x55555555_55555555);
4865     long2 C = cast(long2)(_mm_unpacklo_epi64(A, B));
4866     long[2] correct = [0x22222222_22222222, 0x44444444_44444444];
4867     assert(C.array == correct);
4868 }
4869 
4870 /// Unpack and interleave 8-bit integers from the low half of `a` and `b`.
4871 __m128i _mm_unpacklo_epi8 (__m128i a, __m128i b) pure @safe
4872 {
4873     static if (GDC_with_SSE2)
4874     {
4875         return cast(__m128i) __builtin_ia32_punpcklbw128(cast(ubyte16) a, cast(ubyte16) b);
4876     }
4877     else static if (DMD_with_32bit_asm)
4878     {
4879         asm pure nothrow @nogc @trusted
4880         {
4881             movdqu XMM0, a;
4882             movdqu XMM1, b;
4883             punpcklbw XMM0, XMM1;
4884             movdqu a, XMM0;
4885         }
4886         return a;
4887     }
4888     else
4889     {
4890         return cast(__m128i) shufflevector!(byte16, 0, 16, 1, 17, 2, 18, 3, 19,
4891                                                     4, 20, 5, 21, 6, 22, 7, 23)
4892                                            (cast(byte16)a, cast(byte16)b);
4893     }
4894 }
4895 unittest
4896 {
4897     __m128i A = _mm_setr_epi8( 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15);
4898     __m128i B = _mm_setr_epi8(16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31);
4899     byte16 C = cast(byte16) _mm_unpacklo_epi8(A, B);
4900     byte[16] correct = [0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23];
4901     assert(C.array == correct);
4902 }
4903 
4904 /// Unpack and interleave double-precision (64-bit) floating-point elements from the low half of `a` and `b`.
4905 __m128d _mm_unpacklo_pd (__m128d a, __m128d b) pure @safe
4906 {
4907     static if (GDC_with_SSE2)
4908     {
4909         return __builtin_ia32_unpcklpd(a, b);
4910     }
4911     else
4912     {
4913         return shufflevector!(__m128d, 0, 2)(a, b);
4914     }
4915 }
4916 unittest
4917 {
4918     __m128d A = _mm_setr_pd(4.0, 6.0);
4919     __m128d B = _mm_setr_pd(7.0, 9.0);
4920     __m128d C = _mm_unpacklo_pd(A, B);
4921     double[2] correct = [4.0, 7.0];
4922     assert(C.array == correct);
4923 }
4924 
4925 /// Compute the bitwise XOR of packed double-precision (64-bit) floating-point elements in `a` and `b`.
4926 __m128d _mm_xor_pd (__m128d a, __m128d b) pure @safe
4927 {
4928     return cast(__m128d)(cast(__m128i)a ^ cast(__m128i)b);
4929 }
// TODO: force inline with pragma(inline, true)
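// Below: a minimal unittest sketch (not from upstream); XOR-ing a value with itself should clear every bit.
unittest
{
    __m128d A = _mm_setr_pd(4.0, -2.0);
    long2 R = cast(long2) _mm_xor_pd(A, A); // reinterpret the result bits as two 64-bit integers
    long[2] correct = [0, 0];
    assert(R.array == correct);
}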
4931 
4932 /// Compute the bitwise XOR of 128 bits (representing integer data) in `a` and `b`.
4933 __m128i _mm_xor_si128 (__m128i a, __m128i b) pure @safe
4934 {
4935     return a ^ b;
4936 }
// TODO: force inline with pragma(inline, true)
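// Below: a minimal unittest sketch (not from upstream); XOR against all-ones flips every bit.
unittest
{
    __m128i A = _mm_setr_epi32( 1,  2,  3,  4);
    __m128i B = _mm_setr_epi32(-1, -1, -1, -1);
    __m128i C = _mm_xor_si128(A, B);
    int[4] correct = [~1, ~2, ~3, ~4];
    assert(C.array == correct);
}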
4938 
4939 unittest
4940 {
    // Example: Euclidean distance between two 4D points, using a shift-and-add horizontal sum.
    float distance(float[4] a, float[4] b) nothrow @nogc
    {
        __m128 va = _mm_loadu_ps(a.ptr);
        __m128 vb = _mm_loadu_ps(b.ptr);
        __m128 diffSquared = _mm_sub_ps(va, vb);
        diffSquared = _mm_mul_ps(diffSquared, diffSquared); // square each difference
        // Horizontal sum: fold the upper lanes down onto lane 0 and accumulate.
        __m128 sum = _mm_add_ps(diffSquared, _mm_srli_ps!8(diffSquared));
        sum = _mm_add_ps(sum, _mm_srli_ps!4(sum));
        return _mm_cvtss_f32(_mm_sqrt_ss(sum)); // square root of lane 0, returned as a scalar
    }
4951     assert(distance([0, 2, 0, 0], [0, 0, 0, 0]) == 2);
4952 }