1 /**
2 * Copyright: Copyright Auburn Sounds 2016-2019, Stefanos Baziotis 2019.
3 * License:   $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
4 * Authors:   Guillaume Piolat
5 */
6 module inteli.emmintrin;
7 
8 public import inteli.types;
9 public import inteli.xmmintrin; // SSE2 includes SSE1
10 import inteli.mmx;
11 import inteli.internals;
12 
13 nothrow @nogc:
14 
15 
16 // SSE2 instructions
17 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#techs=SSE2
18 
/// Add packed 16-bit integers in `a` and `b`.
__m128i _mm_add_epi16 (__m128i a, __m128i b) pure @safe
{
    // Lane-wise wrap-around addition on the 8 signed 16-bit lanes.
    short8 sa = cast(short8)a;
    short8 sb = cast(short8)b;
    return cast(__m128i)(sa + sb);
}
unittest
{
    __m128i v = _mm_setr_epi16(4, 8, 13, -7, -1, 0, 9, 77);
    short8 doubled = cast(short8) _mm_add_epi16(v, v);
    short[8] expected = [8, 16, 26, -14, -2, 0, 18, 154];
    assert(doubled.array == expected);
}
31 
/// Add packed 32-bit integers in `a` and `b`.
__m128i _mm_add_epi32 (__m128i a, __m128i b) pure @safe
{
    // Lane-wise wrap-around addition on the 4 signed 32-bit lanes.
    int4 ia = cast(int4)a;
    int4 ib = cast(int4)b;
    return cast(__m128i)(ia + ib);
}
unittest
{
    __m128i v = _mm_setr_epi32( -7, -1, 0, 9);
    int4 doubled = _mm_add_epi32(v, v);
    int[4] expected = [ -14, -2, 0, 18 ];
    assert(doubled.array == expected);
}
44 
/// Add packed 64-bit integers in `a` and `b`.
__m128i _mm_add_epi64 (__m128i a, __m128i b) pure @safe
{
    // Lane-wise wrap-around addition on the 2 signed 64-bit lanes.
    long2 la = cast(long2)a;
    long2 lb = cast(long2)b;
    return cast(__m128i)(la + lb);
}
unittest
{
    __m128i v = _mm_setr_epi64(-1, 0x8000_0000_0000_0000);
    long2 doubled = cast(long2) _mm_add_epi64(v, v);
    long[2] expected = [ -2, 0 ]; // second lane wraps around
    assert(doubled.array == expected);
}
57 
/// Add packed 8-bit integers in `a` and `b`.
__m128i _mm_add_epi8 (__m128i a, __m128i b) pure @safe
{
    // Lane-wise wrap-around addition on the 16 signed 8-bit lanes.
    byte16 ba = cast(byte16)a;
    byte16 bb = cast(byte16)b;
    return cast(__m128i)(ba + bb);
}
unittest
{
    __m128i v = _mm_setr_epi8(4, 8, 13, -7, -1, 0, 9, 77, 4, 8, 13, -7, -1, 0, 9, 78);
    byte16 doubled = cast(byte16) _mm_add_epi8(v, v);
    byte[16] expected = [8, 16, 26, -14, -2, 0, 18, -102, 8, 16, 26, -14, -2, 0, 18, -100];
    assert(doubled.array == expected);
}
70 
/// Add the lower double-precision (64-bit) floating-point element 
/// in `a` and `b`, store the result in the lower element of dst, 
/// and copy the upper element from `a` to the upper element of destination. 
__m128d _mm_add_sd(__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        // GDC: map directly to the ADDSD builtin.
        return __builtin_ia32_addsd(a, b);
    }
    else version(DigitalMars)
    {
        // Work-around for https://issues.dlang.org/show_bug.cgi?id=19599
        // The empty `nop` asm block is part of the work-around: it inhibits
        // the faulty DMD optimization described in the linked issue.
        asm pure nothrow @nogc @trusted { nop;}
        a[0] = a[0] + b[0];
        return a;
    }
    else
    {
        // Generic path: only lane 0 is modified; lane 1 keeps `a`'s value.
        a[0] += b[0];
        return a;
    }
}
unittest
{
    __m128d a = [1.5, -2.0];
    a = _mm_add_sd(a, a);
    assert(a.array == [3.0, -2.0]); // upper lane untouched
}
99 
/// Add packed double-precision (64-bit) floating-point elements in `a` and `b`.
__m128d _mm_add_pd (__m128d a, __m128d b) pure @safe
{
    // Both lanes added; maps to the native vector add.
    __m128d sum = a + b;
    return sum;
}
unittest
{
    __m128d v = [1.5, -2.0];
    v = _mm_add_pd(v, v);
    assert(v.array == [3.0, -4.0]);
}
111 
/// Add 64-bit integers `a` and `b`.
__m64 _mm_add_si64 (__m64 a, __m64 b) pure @safe
{
    // Single 64-bit wrap-around addition.
    __m64 sum = a + b;
    return sum;
}
117 
/// Add packed 16-bit integers in `a` and `b` using signed saturation.
__m128i _mm_adds_epi16(__m128i a, __m128i b) pure @trusted
{
    static if (GDC_with_SSE2)
    {
        // GDC: direct PADDSW builtin.
        return __builtin_ia32_paddsw128(a, b);
    }
    else version(LDC)
    {
        static if (__VERSION__ >= 2085) // saturation x86 intrinsics disappeared in LLVM 8
        {
            // x86: Generates PADDSW since LDC 1.15 -O0
            // ARM: Generates sqadd.8h since LDC 1.21 -O1, really bad in <= 1.20            
            enum prefix = `declare <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> %a, <8 x i16> %b)`;
            enum ir = `
                %r = call <8 x i16> @llvm.sadd.sat.v8i16( <8 x i16> %0, <8 x i16> %1)
                ret <8 x i16> %r`;
            return cast(__m128i) LDCInlineIREx!(prefix, ir, "", short8, short8, short8)(cast(short8)a, cast(short8)b);
        }
        else static if (LDC_with_ARM) // Raspberry ships with LDC 1.12, no saturation 
        {
            // PERF #ARM32 Use an intrinsic in gccbuiltins_arm.d instead
            // Scalar fallback: add in 32-bit (int promotion), then clamp to short range.
            short[8] res;
            short8 sa = cast(short8)a;
            short8 sb = cast(short8)b;
            foreach(i; 0..8)
                res[i] = saturateSignedIntToSignedShort(sa.array[i] + sb.array[i]);
            return _mm_loadu_si128(cast(int4*)res.ptr);
        }
        else
            return __builtin_ia32_paddsw128(a, b); // older LDC still exposes the x86 builtin
    }
    else
    {
        // Portable scalar fallback (DMD and others).
        short[8] res;
        short8 sa = cast(short8)a;
        short8 sb = cast(short8)b;
        foreach(i; 0..8)
            res[i] = saturateSignedIntToSignedShort(sa.array[i] + sb.array[i]);
        return _mm_loadu_si128(cast(int4*)res.ptr);
    }
}
unittest
{
    short8 res = cast(short8) _mm_adds_epi16(_mm_set_epi16(7, 6, 5, 4, 3, 2, 1, 0),
                                             _mm_set_epi16(7, 6, 5, 4, 3, 2, 1, 0));
    static immutable short[8] correctResult = [0, 2, 4, 6, 8, 10, 12, 14];
    assert(res.array == correctResult);
}
167 
/// Add packed 8-bit signed integers in `a` and `b` using signed saturation.
__m128i _mm_adds_epi8(__m128i a, __m128i b) pure @trusted
{
    static if (GDC_with_SSE2)
    {
        // GDC: direct PADDSB builtin.
        return __builtin_ia32_paddsb128(a, b);
    }
    else version(LDC)
    {
        static if (__VERSION__ >= 2085) // saturation x86 intrinsics disappeared in LLVM 8
        {
            // x86: Generates PADDSB since LDC 1.15 -O0
            // ARM: Generates sqadd.16b since LDC 1.21 -O1, really bad in <= 1.20
            enum prefix = `declare <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8> %a, <16 x i8> %b)`;
            enum ir = `
                %r = call <16 x i8> @llvm.sadd.sat.v16i8( <16 x i8> %0, <16 x i8> %1)
                ret <16 x i8> %r`;
            return cast(__m128i) LDCInlineIREx!(prefix, ir, "", byte16, byte16, byte16)(cast(byte16)a, cast(byte16)b);
        }
        else static if (LDC_with_ARM) // Raspberry ships with LDC 1.12, no saturation 
        {
            // PERF #ARM32 Use an intrinsic in gccbuiltins_arm.d instead
            // Scalar fallback: add in 32-bit (int promotion), then clamp to byte range.
            // Use .array[i] element access for portability across D compilers,
            // consistent with the other scalar fallbacks in this module.
            byte[16] res;
            byte16 sa = cast(byte16)a;
            byte16 sb = cast(byte16)b;
            foreach(i; 0..16)
                res[i] = saturateSignedWordToSignedByte(sa.array[i] + sb.array[i]);
            return _mm_loadu_si128(cast(int4*)res.ptr);
        }
        else
            return __builtin_ia32_paddsb128(a, b); // older LDC still exposes the x86 builtin
    }
    else
    {
        // Portable scalar fallback (DMD and others).
        byte[16] res;
        byte16 sa = cast(byte16)a;
        byte16 sb = cast(byte16)b;
        foreach(i; 0..16)
            res[i] = saturateSignedWordToSignedByte(sa.array[i] + sb.array[i]);
        return _mm_loadu_si128(cast(int4*)res.ptr);
    }
}
unittest
{
    byte16 res = cast(byte16) _mm_adds_epi8(_mm_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0),
                                            _mm_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0));
    static immutable byte[16] correctResult = [0, 2, 4, 6, 8, 10, 12, 14,
                                               16, 18, 20, 22, 24, 26, 28, 30];
    assert(res.array == correctResult);
}
218 
/// Add packed 8-bit unsigned integers in `a` and `b` using unsigned saturation.
// PERF: #GDC version?
__m128i _mm_adds_epu8(__m128i a, __m128i b) pure @trusted
{
    version(LDC)
    {
        static if (__VERSION__ >= 2085) // saturation x86 intrinsics disappeared in LLVM 8
        {
            // x86: Generates PADDUSB since LDC 1.15 -O0
            // ARM: Generates uqadd.16b since LDC 1.21 -O1
            enum prefix = `declare <16 x i8> @llvm.uadd.sat.v16i8(<16 x i8> %a, <16 x i8> %b)`;
            enum ir = `
                %r = call <16 x i8> @llvm.uadd.sat.v16i8( <16 x i8> %0, <16 x i8> %1)
                ret <16 x i8> %r`;
            return cast(__m128i) LDCInlineIREx!(prefix, ir, "", byte16, byte16, byte16)(cast(byte16)a, cast(byte16)b);
        }
        else static if (LDC_with_ARM) // Raspberry ships with LDC 1.12, no saturation 
        {
            // PERF #ARM32 Use an intrinsic in gccbuiltins_arm.d instead
            // Scalar fallback: reinterpret lanes as unsigned, add in 32-bit,
            // clamp to [0, 255].
            ubyte[16] res;
            byte16 sa = cast(byte16)a;
            byte16 sb = cast(byte16)b;
            foreach(i; 0..16)
                res[i] = saturateSignedWordToUnsignedByte(cast(ubyte)(sa.array[i]) + cast(ubyte)(sb.array[i]));
            return _mm_loadu_si128(cast(int4*)res.ptr);
        }
        else
            return __builtin_ia32_paddusb128(a, b); // older LDC still exposes the x86 builtin
    }
    else
    {
        // Portable scalar fallback (DMD, GDC without SSE2, ...).
        ubyte[16] res;
        byte16 sa = cast(byte16)a;
        byte16 sb = cast(byte16)b;
        foreach(i; 0..16)
            res[i] = saturateSignedWordToUnsignedByte(cast(ubyte)(sa.array[i]) + cast(ubyte)(sb.array[i]));
        return _mm_loadu_si128(cast(int4*)res.ptr);
    }
}
unittest
{
    // 255 + 1 saturates to 255 (shown as cast(byte)255 below).
    byte16 res = cast(byte16) _mm_adds_epu8(_mm_set_epi8(7, 6, 5, 4, 3, 2, cast(byte)255, 0, 7, 6, 5, 4, 3, 2, cast(byte)255, 0),
                                            _mm_set_epi8(7, 6, 5, 4, 3, 2, 1, 0, 7, 6, 5, 4, 3, 2, 1, 0));
    static immutable byte[16] correctResult = [0, cast(byte)255, 4, 6, 8, 10, 12, 14, 0, cast(byte)255, 4, 6, 8, 10, 12, 14];
    assert(res.array == correctResult);
}
265 
/// Add packed unsigned 16-bit integers in `a` and `b` using unsigned saturation.
// PERF: #GDC version?
__m128i _mm_adds_epu16(__m128i a, __m128i b) pure @trusted
{
    version(LDC)
    {
        static if (__VERSION__ >= 2085) // saturation x86 intrinsics disappeared in LLVM 8
        {
            // x86: Generates PADDUSW since LDC 1.15 -O0
            // ARM: Generates uqadd.8h since LDC 1.21 -O1
            enum prefix = `declare <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> %a, <8 x i16> %b)`;
            enum ir = `
                %r = call <8 x i16> @llvm.uadd.sat.v8i16( <8 x i16> %0, <8 x i16> %1)
                ret <8 x i16> %r`;
            return cast(__m128i) LDCInlineIREx!(prefix, ir, "", short8, short8, short8)(cast(short8)a, cast(short8)b);
        }
        else static if (LDC_with_ARM) // Raspberry ships with LDC 1.12, no saturation 
        {
            // PERF #ARM32 Use an intrinsic in gccbuiltins_arm.d instead
            // Scalar fallback: reinterpret lanes as unsigned, add in 32-bit,
            // clamp to [0, 65535].
            ushort[8] res;
            short8 sa = cast(short8)a;
            short8 sb = cast(short8)b;
            foreach(i; 0..8)
                res[i] = saturateSignedIntToUnsignedShort(cast(ushort)(sa.array[i]) + cast(ushort)(sb.array[i]));
            return _mm_loadu_si128(cast(int4*)res.ptr);
        }
        else
            return __builtin_ia32_paddusw128(a, b); // older LDC still exposes the x86 builtin
    }
    else
    {
        // Portable scalar fallback (DMD, GDC without SSE2, ...).
        ushort[8] res;
        short8 sa = cast(short8)a;
        short8 sb = cast(short8)b;
        foreach(i; 0..8)
            res[i] = saturateSignedIntToUnsignedShort(cast(ushort)(sa.array[i]) + cast(ushort)(sb.array[i]));
        return _mm_loadu_si128(cast(int4*)res.ptr);
    }
}
unittest
{
    // 65535 + 1 saturates to 65535 (shown as cast(short)65535 below).
    short8 res = cast(short8) _mm_adds_epu16(_mm_set_epi16(3, 2, cast(short)65535, 0, 3, 2, cast(short)65535, 0),
                                             _mm_set_epi16(3, 2, 1, 0, 3, 2, 1, 0));
    static immutable short[8] correctResult = [0, cast(short)65535, 4, 6, 0, cast(short)65535, 4, 6];
    assert(res.array == correctResult);
}
312 
/// Compute the bitwise AND of packed double-precision (64-bit) 
/// floating-point elements in `a` and `b`.
__m128d _mm_and_pd (__m128d a, __m128d b) pure @safe
{
    // Reinterpret as integer lanes to AND the raw bit patterns.
    long2 bitsA = cast(long2)a;
    long2 bitsB = cast(long2)b;
    return cast(__m128d)(bitsA & bitsB);
}
unittest
{
    double x = 4.32;
    double y = -78.99;
    long expected = (*cast(long*)(&x)) & (*cast(long*)(&y));
    __m128d P = _mm_set_pd(x, y);
    __m128d Q = _mm_set_pd(y, x);
    long2 res = cast(long2)( _mm_and_pd(P, Q) );
    assert(res.array[0] == expected);
    assert(res.array[1] == expected);
}
330 
/// Compute the bitwise AND of 128 bits (representing integer data) in `a` and `b`.
__m128i _mm_and_si128 (__m128i a, __m128i b) pure @safe
{
    __m128i anded = a & b;
    return anded;
}
unittest
{
    __m128i x = _mm_set1_epi32(7);
    __m128i y = _mm_set1_epi32(14);
    __m128i r = _mm_and_si128(x, y);
    int[4] expected = [6, 6, 6, 6]; // 7 & 14 == 6
    assert(r.array == expected);
}
344 
/// Compute the bitwise NOT of packed double-precision (64-bit) 
/// floating-point elements in `a` and then AND with `b`.
__m128d _mm_andnot_pd (__m128d a, __m128d b) pure @safe
{
    // (~a) & b on the raw 64-bit patterns of each lane.
    long2 bitsA = cast(long2)a;
    long2 bitsB = cast(long2)b;
    return cast(__m128d)( ~bitsA & bitsB );
}
unittest
{
    double x = 4.32;
    double y = -78.99;
    long expected0 = (~*cast(long*)(&x)) & ( *cast(long*)(&y));
    long expected1 = ( *cast(long*)(&x)) & (~*cast(long*)(&y));
    __m128d P = _mm_setr_pd(x, y);
    __m128d Q = _mm_setr_pd(y, x);
    long2 res = cast(long2)( _mm_andnot_pd(P, Q) );
    assert(res.array[0] == expected0);
    assert(res.array[1] == expected1);
}
363 
/// Compute the bitwise NOT of 128 bits (representing integer data) 
/// in `a` and then AND with `b`.
__m128i _mm_andnot_si128 (__m128i a, __m128i b) pure @safe
{
    __m128i notA = ~a;
    return notA & b;
}
unittest
{
    __m128i x = _mm_set1_epi32(7);
    __m128i y = _mm_set1_epi32(14);
    __m128i r = _mm_andnot_si128(x, y);
    int[4] expected = [8, 8, 8, 8]; // ~7 & 14 == 8
    assert(r.array == expected);
}
378 
/// Average packed unsigned 16-bit integers in `a` and `b`.
// TODO: #ARM
__m128i _mm_avg_epu16 (__m128i a, __m128i b) pure @trusted
{
    static if (GDC_with_SSE2)
    {
        // GDC: direct PAVGW builtin.
        return __builtin_ia32_pavgw128(a, b);
    }
    else version(LDC)
    {
        // Generates pavgw even in LDC 1.0, even in -O0
        // Rounding average: widen to 32-bit, (a + b + 1) >> 1, narrow back.
        enum ir = `
            %ia = zext <8 x i16> %0 to <8 x i32>
            %ib = zext <8 x i16> %1 to <8 x i32>
            %isum = add <8 x i32> %ia, %ib
            %isum1 = add <8 x i32> %isum, < i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
            %isums = lshr <8 x i32> %isum1, < i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
            %r = trunc <8 x i32> %isums to <8 x i16>
            ret <8 x i16> %r`;
        return cast(__m128i) LDCInlineIR!(ir, short8, short8, short8)(cast(short8)a, cast(short8)b);
    }
    else
    {
        // Portable scalar fallback: same rounding average per lane,
        // computed in 32-bit `int` so the +1 cannot overflow.
        short8 sa = cast(short8)a;
        short8 sb = cast(short8)b;
        short8 sr = void;
        foreach(i; 0..8)
        {
            sr.ptr[i] = cast(ushort)( (cast(ushort)(sa.array[i]) + cast(ushort)(sb.array[i]) + 1) >> 1 );
        }
        return cast(int4)sr;
    }
}
unittest
{
    __m128i A = _mm_set1_epi16(31);
    __m128i B = _mm_set1_epi16(64);
    short8 avg = cast(short8)(_mm_avg_epu16(A, B));
    foreach(i; 0..8)
        assert(avg.array[i] == 48); // (31 + 64 + 1) >> 1
}
420 
/// Average packed unsigned 8-bit integers in `a` and `b`.
// TODO: #ARM
__m128i _mm_avg_epu8 (__m128i a, __m128i b) pure @trusted
{
    static if (GDC_with_SSE2)
    {
        // GDC: direct PAVGB builtin.
        return __builtin_ia32_pavgb128(a, b);
    }
    else version(LDC)
    {
        // Generates pavgb even in LDC 1.0, even in -O0
        // Rounding average: widen to 16-bit, (a + b + 1) >> 1, narrow back.
        enum ir = `
            %ia = zext <16 x i8> %0 to <16 x i16>
            %ib = zext <16 x i8> %1 to <16 x i16>
            %isum = add <16 x i16> %ia, %ib
            %isum1 = add <16 x i16> %isum, < i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
            %isums = lshr <16 x i16> %isum1, < i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
            %r = trunc <16 x i16> %isums to <16 x i8>
            ret <16 x i8> %r`;
        return cast(__m128i) LDCInlineIR!(ir, byte16, byte16, byte16)(cast(byte16)a, cast(byte16)b);
    }
    else
    {
        // Portable scalar fallback: same rounding average per lane.
        // Uses .array[i]/.ptr[i] element access for portability across
        // D compilers, matching the _mm_avg_epu16 fallback above.
        byte16 sa = cast(byte16)a;
        byte16 sb = cast(byte16)b;
        byte16 sr = void;
        foreach(i; 0..16)
        {
            sr.ptr[i] = cast(ubyte)( (cast(ubyte)(sa.array[i]) + cast(ubyte)(sb.array[i]) + 1) >> 1 );
        }
        return cast(int4)sr;
    }
}
unittest
{
    __m128i A = _mm_set1_epi8(31);
    __m128i B = _mm_set1_epi8(64);
    byte16 avg = cast(byte16)(_mm_avg_epu8(A, B));
    foreach(i; 0..16)
        assert(avg.array[i] == 48); // (31 + 64 + 1) >> 1
}
462 
/// Shift `a` left by `bytes` bytes while shifting in zeros.
/// `bytes` is the compile-time template argument of `_mm_slli_si128`.
alias _mm_bslli_si128 = _mm_slli_si128;
unittest
{
    __m128i toShift = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
    byte[16] exact =               [0, 0, 0, 0, 0, 0, 1, 2, 3, 4,  5,  6,  7,  8,  9, 10];
    __m128i result = _mm_bslli_si128!5(toShift);
    assert( (cast(byte16)result).array == exact);
}
472 
/// Shift `v` right by `bytes` bytes while shifting in zeros.
/// `bytes` is the compile-time template argument of `_mm_srli_si128`.
alias _mm_bsrli_si128 = _mm_srli_si128;
unittest
{
    __m128i toShift = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
    byte[16] exact =               [5, 6, 7, 8, 9,10,11,12,13,14, 15,  0,  0,  0,  0,  0];
    __m128i result = _mm_bsrli_si128!5(toShift);
    assert( (cast(byte16)result).array == exact);
}
482 
/// Cast vector of type `__m128d` to type `__m128`. 
/// Note: Also possible with a regular `cast(__m128)(a)`.
__m128 _mm_castpd_ps (__m128d a) pure @safe
{
    // Pure bit-pattern reinterpretation; no conversion is performed.
    __m128 r = cast(__m128) a;
    return r;
}

/// Cast vector of type `__m128d` to type `__m128i`. 
/// Note: Also possible with a regular `cast(__m128i)(a)`.
__m128i _mm_castpd_si128 (__m128d a) pure @safe
{
    __m128i r = cast(__m128i) a;
    return r;
}

/// Cast vector of type `__m128` to type `__m128d`. 
/// Note: Also possible with a regular `cast(__m128d)(a)`.
__m128d _mm_castps_pd (__m128 a) pure @safe
{
    __m128d r = cast(__m128d) a;
    return r;
}

/// Cast vector of type `__m128` to type `__m128i`. 
/// Note: Also possible with a regular `cast(__m128i)(a)`.
__m128i _mm_castps_si128 (__m128 a) pure @safe
{
    __m128i r = cast(__m128i) a;
    return r;
}

/// Cast vector of type `__m128i` to type `__m128d`. 
/// Note: Also possible with a regular `cast(__m128d)(a)`.
__m128d _mm_castsi128_pd (__m128i a) pure @safe
{
    __m128d r = cast(__m128d) a;
    return r;
}

/// Cast vector of type `__m128i` to type `__m128`. 
/// Note: Also possible with a regular `cast(__m128)(a)`.
__m128 _mm_castsi128_ps (__m128i a) pure @safe
{
    __m128 r = cast(__m128) a;
    return r;
}
524 
/// Invalidate and flush the cache line that contains `p` 
/// from all levels of the cache hierarchy.
void _mm_clflush (const(void)* p) @trusted
{
    static if (GDC_with_SSE2)
    {
        __builtin_ia32_clflush(p);
    }
    else static if (LDC_with_SSE2)
    {
        // LDC's builtin takes a mutable pointer; casting away const is fine
        // here since CLFLUSH does not write through the pointer.
        __builtin_ia32_clflush(cast(void*)p);
    }
    else version(D_InlineAsm_X86)
    {
        // 32-bit DMD: emit CLFLUSH via inline asm.
        asm pure nothrow @nogc @safe
        {
            mov EAX, p;
            clflush [EAX];
        }
    }
    else version(D_InlineAsm_X86_64)
    {
        // 64-bit DMD: emit CLFLUSH via inline asm.
        asm pure nothrow @nogc @safe
        {
            mov RAX, p;
            clflush [RAX];
        }
    }
    else 
    {
        // Do nothing. Invalidating cacheline does
        // not affect correctness.
    }
}
unittest
{
    ubyte[64] cacheline;
    _mm_clflush(cacheline.ptr);
}
564 
/// Compare packed 16-bit integers in `a` and `b` for equality.
/// Equal lanes become 0xFFFF, others become 0.
__m128i _mm_cmpeq_epi16 (__m128i a, __m128i b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_pcmpeqw128(a, b);
    }
    else
    {
        short8 sa = cast(short8)a;
        short8 sb = cast(short8)b;
        return cast(__m128i) equalMask!short8(sa, sb);
    }
}
unittest
{
    short8   x = [-3, -2, -1,  0,  0,  1,  2,  3];
    short8   y = [ 4,  3,  2,  1,  0, -1, -2, -3];
    short[8] expected = [ 0,  0,  0,  0, -1,  0,  0,  0];
    short8   r = cast(short8)(_mm_cmpeq_epi16(cast(__m128i)x, cast(__m128i)y));
    assert(r.array == expected);
}
585 
/// Compare packed 32-bit integers in `a` and `b` for equality.
/// Equal lanes become 0xFFFFFFFF, others become 0.
__m128i _mm_cmpeq_epi32 (__m128i a, __m128i b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_pcmpeqd128(a, b);
    }
    else
    {
        return equalMask!__m128i(a, b);
    }
}
unittest
{
    int4   A = [-3, -2, -1,  0];
    int4   B = [ 4, -2,  2,  0];
    int[4] E = [ 0, -1,  0, -1];
    // Fixed: previously called _mm_cmpeq_epi16 here, so the 32-bit
    // intrinsic itself was never exercised.
    int4   R = cast(int4)(_mm_cmpeq_epi32(A, B));
    assert(R.array == E);
}
606 
/// Compare packed 8-bit integers in `a` and `b` for equality.
/// Equal lanes become 0xFF, others become 0.
__m128i _mm_cmpeq_epi8 (__m128i a, __m128i b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_pcmpeqb128(a, b); 
    }
    else
    {
        byte16 ba = cast(byte16)a;
        byte16 bb = cast(byte16)b;
        return cast(__m128i) equalMask!byte16(ba, bb);
    }
}
unittest
{
    __m128i x = _mm_setr_epi8(1, 2, 3, 1, 2, 1, 1, 2, 3, 2, 1, 0, 0, 1, 2, 1);
    __m128i y = _mm_setr_epi8(2, 2, 1, 2, 3, 1, 2, 3, 2, 1, 0, 0, 1, 2, 1, 1);
    byte16 r = cast(byte16) _mm_cmpeq_epi8(x, y);
    byte[16] expected =      [0,-1, 0, 0, 0,-1, 0, 0, 0, 0, 0,-1, 0, 0, 0, -1];
    assert(r.array == expected);
}
627 
/// Compare packed double-precision (64-bit) floating-point elements 
/// in `a` and `b` for equality.
__m128d _mm_cmpeq_pd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmpeqpd(a, b);
    }
    else
    {
        // oeq: ordered-equal predicate (named after LLVM fcmp predicates).
        return cast(__m128d) cmppd!(FPComparison.oeq)(a, b);
    }
}

/// Compare the lower double-precision (64-bit) floating-point elements
/// in `a` and `b` for equality, store the result in the lower element,
/// and copy the upper element from `a`.
__m128d _mm_cmpeq_sd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmpeqsd(a, b);
    }
    else
    {
        return cast(__m128d) cmpsd!(FPComparison.oeq)(a, b);
    }
}

/// Compare packed double-precision (64-bit) floating-point elements 
/// in `a` and `b` for greater-than-or-equal.
__m128d _mm_cmpge_pd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmpgepd(a, b);
    }
    else
    {
        // oge: ordered greater-or-equal predicate.
        return cast(__m128d) cmppd!(FPComparison.oge)(a, b);
    }
}

/// Compare the lower double-precision (64-bit) floating-point elements 
/// in `a` and `b` for greater-than-or-equal, store the result in the 
/// lower element, and copy the upper element from `a`.
__m128d _mm_cmpge_sd (__m128d a, __m128d b) pure @safe
{
    // Note: There is no __builtin_ia32_cmpgesd builtin.
    static if (GDC_with_SSE2)
    {
        // NOTE(review): cmpnltsd(b, a) computes !(b[0] < a[0]) and takes its
        // upper lane from `b`, not `a`; its NaN behavior also differs from
        // the ordered `oge` predicate used by the fallback below.
        // TODO: verify this branch against Intel's CMPSD documentation.
        return __builtin_ia32_cmpnltsd(b, a);
    }
    else
    {
        return cast(__m128d) cmpsd!(FPComparison.oge)(a, b);
    }
}
686 
/// Compare packed 16-bit integers in `a` and `b` for greater-than.
/// Lanes where `a > b` (signed) become 0xFFFF, others become 0.
__m128i _mm_cmpgt_epi16 (__m128i a, __m128i b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_pcmpgtw128(a, b); 
    }
    else
    {
        auto mask = greaterMask!short8(cast(short8)a, cast(short8)b);
        return cast(__m128i) mask;
    }
}
unittest
{
    short8   x = [-3, -2, -1,  0,  0,  1,  2,  3];
    short8   y = [ 4,  3,  2,  1,  0, -1, -2, -3];
    short[8] expected = [ 0,  0,  0,  0,  0, -1, -1, -1];
    short8   r = cast(short8)(_mm_cmpgt_epi16(cast(__m128i)x, cast(__m128i)y));
    assert(r.array == expected);
}
707 
/// Compare packed 32-bit integers in `a` and `b` for greater-than.
/// Lanes where `a > b` (signed) become 0xFFFFFFFF, others become 0.
__m128i _mm_cmpgt_epi32 (__m128i a, __m128i b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_pcmpgtd128(a, b); 
    }
    else
    {
        auto mask = greaterMask!int4(a, b);
        return cast(__m128i) mask;
    }
}
unittest
{
    int4   x = [-3,  2, -1,  0];
    int4   y = [ 4, -2,  2,  0];
    int[4] expected = [ 0, -1,  0,  0];
    int4   r = cast(int4)(_mm_cmpgt_epi32(x, y));
    assert(r.array == expected);
}
728 
/// Compare packed 8-bit integers in `a` and `b` for greater-than.
/// Lanes where `a > b` (signed) become 0xFF, others become 0.
__m128i _mm_cmpgt_epi8 (__m128i a, __m128i b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_pcmpgtb128(a, b); 
    }
    else
    {
        return cast(__m128i)( greaterMask!byte16(cast(byte16)a, cast(byte16)b));
    }
}
unittest
{
    __m128i A = _mm_setr_epi8(1, 2, 3, 1, 2, 1, 1, 2, 3, 2, 1, 0, 0, 1, 2, 1);
    __m128i B = _mm_setr_epi8(2, 2, 1, 2, 3, 1, 2, 3, 2, 1, 0, 0, 1, 2, 1, 1);
    byte16 C = cast(byte16) _mm_cmpgt_epi8(A, B);
    byte[16] correct =       [0, 0,-1, 0, 0, 0, 0, 0,-1,-1,-1, 0, 0, 0,-1, 0];
    // Removed an unused local (`__m128i D = _mm_cmpeq_epi8(A, B);`)
    // whose result was never checked.
    assert(C.array == correct);
}
750 
/// Compare packed double-precision (64-bit) floating-point elements 
/// in `a` and `b` for greater-than.
__m128d _mm_cmpgt_pd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmpgtpd(a, b); 
    }
    else
    {
        // ogt: ordered greater-than predicate.
        return cast(__m128d) cmppd!(FPComparison.ogt)(a, b);
    }
}

/// Compare the lower double-precision (64-bit) floating-point elements 
/// in `a` and `b` for greater-than, store the result in the lower element,
/// and copy the upper element from `a`.
__m128d _mm_cmpgt_sd (__m128d a, __m128d b) pure @safe
{
    // Note: There is no __builtin_ia32_cmpgtsd builtin.
    static if (GDC_with_SSE2)
    {
        // NOTE(review): cmpnlesd(b, a) computes !(b[0] <= a[0]) and takes its
        // upper lane from `b`, not `a`; its NaN behavior also differs from
        // the ordered `ogt` predicate used by the fallback below.
        // TODO: verify this branch against Intel's CMPSD documentation.
        return __builtin_ia32_cmpnlesd(b, a);
    }
    else
    {
        return cast(__m128d) cmpsd!(FPComparison.ogt)(a, b);
    }
}

/// Compare packed double-precision (64-bit) floating-point elements 
/// in `a` and `b` for less-than-or-equal.
__m128d _mm_cmple_pd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmplepd(a, b); 
    }
    else
    {
        // ole: ordered less-or-equal predicate.
        return cast(__m128d) cmppd!(FPComparison.ole)(a, b);
    }
}

/// Compare the lower double-precision (64-bit) floating-point elements 
/// in `a` and `b` for less-than-or-equal, store the result in the 
/// lower element, and copy the upper element from `a`.
__m128d _mm_cmple_sd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmplesd(a, b); 
    }
    else
    {
        return cast(__m128d) cmpsd!(FPComparison.ole)(a, b);
    }
}
809 
/// Compare packed 16-bit integers in `a` and `b` for less-than.
__m128i _mm_cmplt_epi16 (__m128i a, __m128i b) pure @safe
{
    // For integers, a < b is exactly b > a, so reuse the greater-than compare.
    return _mm_cmpgt_epi16(b, a);
}

/// Compare packed 32-bit integers in `a` and `b` for less-than.
__m128i _mm_cmplt_epi32 (__m128i a, __m128i b) pure @safe
{
    // Swapped greater-than compare; exact identity for integer lanes.
    return _mm_cmpgt_epi32(b, a);
}

/// Compare packed 8-bit integers in `a` and `b` for less-than.
__m128i _mm_cmplt_epi8 (__m128i a, __m128i b) pure @safe
{
    // Swapped greater-than compare; exact identity for integer lanes.
    return _mm_cmpgt_epi8(b, a);
}
827 
/// Compare packed double-precision (64-bit) floating-point elements
/// in `a` and `b` for less-than.
__m128d _mm_cmplt_pd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmpltpd(a, b); 
    }
    else
    {
        // olt: ordered less-than predicate.
        return cast(__m128d) cmppd!(FPComparison.olt)(a, b);
    }
}

/// Compare the lower double-precision (64-bit) floating-point elements
/// in `a` and `b` for less-than, store the result in the lower 
/// element, and copy the upper element from `a`.
__m128d _mm_cmplt_sd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmpltsd(a, b); 
    }
    else
    {
        return cast(__m128d) cmpsd!(FPComparison.olt)(a, b);
    }
}

/// Compare packed double-precision (64-bit) floating-point elements
/// in `a` and `b` for not-equal.
__m128d _mm_cmpneq_pd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmpneqpd(a, b); 
    }
    else
    {
        // une: unordered-or-not-equal predicate (the inverse of oeq).
        return cast(__m128d) cmppd!(FPComparison.une)(a, b);
    }
}

/// Compare the lower double-precision (64-bit) floating-point elements
/// in `a` and `b` for not-equal, store the result in the lower 
/// element, and copy the upper element from `a`.
__m128d _mm_cmpneq_sd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmpneqsd(a, b); 
    }
    else
    {
        return cast(__m128d) cmpsd!(FPComparison.une)(a, b);
    }
}
885 
/// Compare packed double-precision (64-bit) floating-point elements 
/// in `a` and `b` for not-greater-than-or-equal.
__m128d _mm_cmpnge_pd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmpngepd(a, b); 
    }
    else
    {
        // ult: unordered-or-less-than — !(a >= b), true on NaN.
        return cast(__m128d) cmppd!(FPComparison.ult)(a, b);
    }
}

/// Compare the lower double-precision (64-bit) floating-point elements 
/// in `a` and `b` for not-greater-than-or-equal, store the result in 
/// the lower element, and copy the upper element from `a`.
__m128d _mm_cmpnge_sd (__m128d a, __m128d b) pure @safe
{
    // Note: There is no __builtin_ia32_cmpngesd builtin.
    static if (GDC_with_SSE2)
    {
        // NOTE(review): cmpltsd(b, a) computes (b[0] < a[0]) and takes its
        // upper lane from `b`, not `a`; its NaN behavior also differs from
        // the `ult` predicate used by the fallback below.
        // TODO: verify this branch against Intel's CMPSD documentation.
        return __builtin_ia32_cmpltsd(b, a); 
    }
    else
    {
        return cast(__m128d) cmpsd!(FPComparison.ult)(a, b);
    }
}

/// Compare packed double-precision (64-bit) floating-point elements 
/// in `a` and `b` for not-greater-than.
__m128d _mm_cmpngt_pd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmpngtpd(a, b);
    }
    else
    {
        // ule: unordered-or-less-or-equal — !(a > b), true on NaN.
        return cast(__m128d) cmppd!(FPComparison.ule)(a, b);
    }
}

/// Compare the lower double-precision (64-bit) floating-point elements 
/// in `a` and `b` for not-greater-than, store the result in the 
/// lower element, and copy the upper element from `a`.
__m128d _mm_cmpngt_sd (__m128d a, __m128d b) pure @safe
{
    // Note: There is no __builtin_ia32_cmpngtsd builtin.
    static if (GDC_with_SSE2)
    {
        // NOTE(review): cmplesd(b, a) computes (b[0] <= a[0]) and takes its
        // upper lane from `b`, not `a` — verify against Intel's CMPSD docs.
        return __builtin_ia32_cmplesd(b, a);
    }
    else
    {
        return cast(__m128d) cmpsd!(FPComparison.ule)(a, b);
    }
}
945 
/// Compare packed double-precision (64-bit) floating-point elements 
/// in `a` and `b` for not-less-than-or-equal.
__m128d _mm_cmpnle_pd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmpnlepd(a, b);
    }
    else
    {
        // NLE == !(a <= b) == unordered-or-greater-than (`ugt`):
        // lanes with a NaN operand compare true.
        return cast(__m128d) cmppd!(FPComparison.ugt)(a, b);
    }
}
959 
/// Compare the lower double-precision (64-bit) floating-point elements 
/// in `a` and `b` for not-less-than-or-equal, store the result in the 
/// lower element, and copy the upper element from `a`.
__m128d _mm_cmpnle_sd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmpnlesd(a, b);
    }
    else
    {
        // `ugt` = unordered-or-greater-than: a NaN operand compares true.
        return cast(__m128d) cmpsd!(FPComparison.ugt)(a, b);
    }
}
974  
/// Compare packed double-precision (64-bit) floating-point elements 
/// in `a` and `b` for not-less-than.
__m128d _mm_cmpnlt_pd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmpnltpd(a, b);
    }
    else
    {
        // NLT == !(a < b) == unordered-or-greater-equal (`uge`):
        // lanes with a NaN operand compare true.
        return cast(__m128d) cmppd!(FPComparison.uge)(a, b);
    }
}
988 
/// Compare the lower double-precision (64-bit) floating-point elements 
/// in `a` and `b` for not-less-than, store the result in the lower 
/// element, and copy the upper element from `a`.
__m128d _mm_cmpnlt_sd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmpnltsd(a, b);
    }
    else
    {
        // `uge` = unordered-or-greater-equal: a NaN operand compares true.
        return cast(__m128d) cmpsd!(FPComparison.uge)(a, b);
    }
}
1003 
/// Compare packed double-precision (64-bit) floating-point elements 
/// in `a` and `b` to see if neither is NaN.
__m128d _mm_cmpord_pd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmpordpd(a, b);
    }
    else
    {
        // `ord` = ordered: a lane is all-ones only when neither operand is NaN.
        return cast(__m128d) cmppd!(FPComparison.ord)(a, b);
    }
}
1017 
/// Compare the lower double-precision (64-bit) floating-point elements 
/// in `a` and `b` to see if neither is NaN, store the result in the 
/// lower element, and copy the upper element from `a` to the upper element.
__m128d _mm_cmpord_sd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmpordsd(a, b);
    }
    else
    {
        // `ord` = ordered: true only when neither operand is NaN.
        return cast(__m128d) cmpsd!(FPComparison.ord)(a, b);
    }
}
1032 
/// Compare packed double-precision (64-bit) floating-point elements 
/// in `a` and `b` to see if either is NaN.
__m128d _mm_cmpunord_pd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmpunordpd(a, b);
    }
    else
    {
        // `uno` = unordered: a lane is all-ones when either operand is NaN.
        return cast(__m128d) cmppd!(FPComparison.uno)(a, b);
    }
}
1046 
/// Compare the lower double-precision (64-bit) floating-point elements 
/// in `a` and `b` to see if either is NaN, store the result in the lower 
/// element, and copy the upper element from `a` to the upper element.
__m128d _mm_cmpunord_sd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmpunordsd(a, b);
    }
    else
    {
        // `uno` = unordered: true when either operand is NaN.
        return cast(__m128d) cmpsd!(FPComparison.uno)(a, b);
    }
}
1061 
1062 
// Note: we've reverted clang and GCC behaviour with regards to EFLAGS.
// Some such comparisons yield true for NaNs, others don't.
1065 
/// Compare the lower double-precision (64-bit) floating-point element 
/// in `a` and `b` for equality, and return the boolean result (0 or 1).
int _mm_comieq_sd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_comieq(a, b);
    }
    else
    {
        return comsd!(FPComparison.ueq)(a, b); // yields true for NaN, same as GCC
    }
}
1079 
/// Compare the lower double-precision (64-bit) floating-point element 
/// in `a` and `b` for greater-than-or-equal, and return the boolean 
/// result (0 or 1).
int _mm_comige_sd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_comige(a, b);
    }
    else
    {
        return comsd!(FPComparison.oge)(a, b); // ordered predicate: yields false for NaN
    }
}
1094 
/// Compare the lower double-precision (64-bit) floating-point element 
/// in `a` and `b` for greater-than, and return the boolean result (0 or 1).
int _mm_comigt_sd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_comigt(a, b);
    }
    else
    {
        return comsd!(FPComparison.ogt)(a, b); // ordered predicate: yields false for NaN
    }
}
1108 
/// Compare the lower double-precision (64-bit) floating-point element 
/// in `a` and `b` for less-than-or-equal, and return the boolean
/// result (0 or 1).
int _mm_comile_sd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_comile(a, b);
    }
    else
    {
        return comsd!(FPComparison.ule)(a, b); // yields true for NaN, same as GCC
    }
}
1122 
/// Compare the lower double-precision (64-bit) floating-point element 
/// in `a` and `b` for less-than, and return the boolean result (0 or 1).
int _mm_comilt_sd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_comilt(a, b);
    }
    else
    {
        return comsd!(FPComparison.ult)(a, b); // yields true for NaN, same as GCC
    }
}
1136 
/// Compare the lower double-precision (64-bit) floating-point element
/// in `a` and `b` for not-equal, and return the boolean result (0 or 1).
int _mm_comineq_sd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_comineq(a, b);
    }
    else
    {
        return comsd!(FPComparison.one)(a, b); // ordered predicate: yields false for NaN
    }
}
1150 
/// Convert packed 32-bit integers in `a` to packed double-precision (64-bit)
/// floating-point elements. Only the lower two integers of `a` are used.
 __m128d _mm_cvtepi32_pd (__m128i a) pure @trusted
{
    version(LDC)
    {
        // Generates cvtdq2pd since LDC 1.0, even without optimizations
        enum ir = `
            %v = shufflevector <4 x i32> %0,<4 x i32> %0, <2 x i32> <i32 0, i32 1>
            %r = sitofp <2 x i32> %v to <2 x double>
            ret <2 x double> %r`;
        return cast(__m128d) LDCInlineIR!(ir, __m128d, __m128i)(a);
    }
    else static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cvtdq2pd(a);
    }
    else
    {
        // Scalar fallback: every int is exactly representable as a double.
        double2 r = void;
        r.ptr[0] = a.array[0];
        r.ptr[1] = a.array[1];
        return r;
    }
}
unittest
{
    __m128d A = _mm_cvtepi32_pd(_mm_set1_epi32(54));
    assert(A.array[0] == 54.0);
    assert(A.array[1] == 54.0);
}
1182 
/// Convert packed 32-bit integers in `a` to packed single-precision (32-bit) 
/// floating-point elements.
__m128 _mm_cvtepi32_ps(__m128i a) pure @trusted
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cvtdq2ps(a);
    }
    else
    {
        // x86: Generates cvtdq2ps since LDC 1.0.0 -O1
        // ARM: Generates scvtf.4s since LDC 1.8.0 -O2
        __m128 res;
        res.ptr[0] = cast(float)a.array[0];
        res.ptr[1] = cast(float)a.array[1];
        res.ptr[2] = cast(float)a.array[2];
        res.ptr[3] = cast(float)a.array[3];
        return res;
    }
}
unittest
{
    __m128 a = _mm_cvtepi32_ps(_mm_setr_epi32(-1, 0, 1, 1000));
    assert(a.array == [-1.0f, 0.0f, 1.0f, 1000.0f]);
}
1208 
/// Convert packed double-precision (64-bit) floating-point elements 
/// in `a` to packed 32-bit integers, using the current MXCSR rounding
/// mode. The upper two lanes of the result are zero.
// TODO #ARM
__m128i _mm_cvtpd_epi32 (__m128d a) @trusted
{
    // LDC and GDC expose the same builtin, so share one branch
    // instead of duplicating it.
    static if (LDC_with_SSE2 || GDC_with_SSE2)
    {
        // Like in clang, implemented with a magic intrinsic right now
        return __builtin_ia32_cvtpd2dq(a);
    }
    else
    {
        // Fallback honours the current MXCSR rounding mode, like cvtpd2dq.
        __m128i r = _mm_setzero_si128();
        r.ptr[0] = convertDoubleToInt32UsingMXCSR(a.array[0]);
        r.ptr[1] = convertDoubleToInt32UsingMXCSR(a.array[1]);
        return r;
    }
}
unittest
{
    int4 A = _mm_cvtpd_epi32(_mm_set_pd(61.0, 55.0));
    assert(A.array[0] == 55 && A.array[1] == 61 && A.array[2] == 0 && A.array[3] == 0);
}
1236 
/// Convert packed double-precision (64-bit) floating-point elements in `v`
/// to packed 32-bit integers, using the current MXCSR rounding mode.
__m64 _mm_cvtpd_pi32 (__m128d v) @safe
{
    // Delegates to the SSE2 version, then narrows to the MMX register type.
    return to_m64(_mm_cvtpd_epi32(v));
}
unittest
{
    int2 A = cast(int2) _mm_cvtpd_pi32(_mm_set_pd(61.0, 55.0));
    assert(A.array[0] == 55 && A.array[1] == 61);
}
1248 
/// Convert packed double-precision (64-bit) floating-point elements 
/// in `a` to packed single-precision (32-bit) floating-point elements.
/// The upper two lanes of the result are zero.
__m128 _mm_cvtpd_ps (__m128d a) pure @trusted
{
    // LDC and GDC expose the same builtin, so share one branch
    // instead of duplicating it.
    static if (LDC_with_SSE2 || GDC_with_SSE2)
    {
        return __builtin_ia32_cvtpd2ps(a); // can't be done with IR unfortunately
    }
    else
    { 
        __m128 r = void;
        r.ptr[0] = a.array[0];
        r.ptr[1] = a.array[1];
        r.ptr[2] = 0;
        r.ptr[3] = 0;
        return r;
    }
}
unittest
{
    __m128d A = _mm_set_pd(5.25, 4.0);
    __m128 B = _mm_cvtpd_ps(A);
    assert(B.array == [4.0f, 5.25f, 0, 0]);
}
1277 
/// Convert packed 32-bit integers in `v` to packed double-precision 
/// (64-bit) floating-point elements.
__m128d _mm_cvtpi32_pd (__m64 v) pure @safe
{
    // Widens the MMX value to 128 bits, then reuses the SSE2 conversion.
    return _mm_cvtepi32_pd(to_m128i(v));
}
unittest
{
    __m128d A = _mm_cvtpi32_pd(_mm_setr_pi32(4, -5));
    assert(A.array[0] == 4.0 && A.array[1] == -5.0);
}
1289 
/// Convert packed single-precision (32-bit) floating-point elements 
/// in `a` to packed 32-bit integers, using the current MXCSR rounding
/// mode (see the unittest below for all four modes).
// TODO #ARM
__m128i _mm_cvtps_epi32 (__m128 a) @trusted
{
    static if (LDC_with_SSE2)
    {
        // Disabled, since it fail with optimizations unfortunately
        //alias _mm_cvtps_epi32 = __builtin_ia32_cvtps2dq;
        return __asm!__m128i("cvtps2dq $1,$0","=x,x",a);
    }
    else static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cvtps2dq(a);
    }
    else
    {
        __m128i r = void;
        r.ptr[0] = convertFloatToInt32UsingMXCSR(a.array[0]);
        r.ptr[1] = convertFloatToInt32UsingMXCSR(a.array[1]);
        r.ptr[2] = convertFloatToInt32UsingMXCSR(a.array[2]);
        r.ptr[3] = convertFloatToInt32UsingMXCSR(a.array[3]);
        return r;
    }
}
unittest
{
    uint savedRounding = _MM_GET_ROUNDING_MODE();

    _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST);
    __m128i A = _mm_cvtps_epi32(_mm_setr_ps(1.4f, -2.1f, 53.5f, -2.9f));
    assert(A.array == [1, -2, 54, -3]);

    _MM_SET_ROUNDING_MODE(_MM_ROUND_DOWN);
    A = _mm_cvtps_epi32(_mm_setr_ps(1.4f, -2.1f, 53.5f, -2.9f));
    assert(A.array == [1, -3, 53, -3]);

    _MM_SET_ROUNDING_MODE(_MM_ROUND_UP);
    A = _mm_cvtps_epi32(_mm_setr_ps(1.4f, -2.1f, 53.5f, -2.9f));
    assert(A.array == [2, -2, 54, -2]);

    _MM_SET_ROUNDING_MODE(_MM_ROUND_TOWARD_ZERO);
    A = _mm_cvtps_epi32(_mm_setr_ps(1.4f, -2.1f, 53.5f, -2.9f));
    assert(A.array == [1, -2, 53, -2]);

    _MM_SET_ROUNDING_MODE(savedRounding);
}
1337 
/// Convert packed single-precision (32-bit) floating-point elements 
/// in `a` to packed double-precision (64-bit) floating-point elements.
/// Only the lower two floats of `a` are used.
__m128d _mm_cvtps_pd (__m128 a) pure @trusted
{
    version(LDC)
    {
        // Generates cvtps2pd since LDC 1.0 -O0
        enum ir = `
            %v = shufflevector <4 x float> %0,<4 x float> %0, <2 x i32> <i32 0, i32 1>
            %r = fpext <2 x float> %v to <2 x double>
            ret <2 x double> %r`;
        return cast(__m128d) LDCInlineIR!(ir, __m128d, __m128)(a);
    }
    else static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cvtps2pd(a);
    }
    else
    {
        // Scalar fallback: float -> double widening is always exact.
        double2 r = void;
        r.ptr[0] = a.array[0];
        r.ptr[1] = a.array[1];
        return r;
    }
}
unittest
{
    __m128d A = _mm_cvtps_pd(_mm_set1_ps(54.0f));
    assert(A.array[0] == 54.0);
    assert(A.array[1] == 54.0);
}
1369 
/// Copy the lower double-precision (64-bit) floating-point element of `a`
/// to a scalar `double`.
double _mm_cvtsd_f64 (__m128d a) pure @safe
{
    return a.array[0];
}
1375 
/// Convert the lower double-precision (64-bit) floating-point element
/// in `a` to a 32-bit integer, using the current MXCSR rounding mode.
int _mm_cvtsd_si32 (__m128d a) @safe
{
    // LDC and GDC expose the same builtin, so share one branch
    // instead of duplicating it.
    static if (LDC_with_SSE2 || GDC_with_SSE2)
    {
        return __builtin_ia32_cvtsd2si(a);
    }
    else
    {
        return convertDoubleToInt32UsingMXCSR(a[0]);
    }
}
unittest
{
    assert(4 == _mm_cvtsd_si32(_mm_set1_pd(4.0)));
}
1397 
// _mm_cvtsd_si64: convert the lower double-precision element of `a` to a
// 64-bit integer, using the current MXCSR rounding mode (see unittest).
version(LDC)
{
    // Unfortunately this builtin crashes in 32-bit
    version(X86_64)
        alias _mm_cvtsd_si64 = __builtin_ia32_cvtsd2si64;
    else
    {
        long _mm_cvtsd_si64 (__m128d a) @safe
        {
            return convertDoubleToInt64UsingMXCSR(a[0]);
        }
    }
}
else
{
    long _mm_cvtsd_si64 (__m128d a) @safe
    {
        return convertDoubleToInt64UsingMXCSR(a.array[0]);
    }
}
unittest
{
    assert(-4 == _mm_cvtsd_si64(_mm_set1_pd(-4.0)));

    uint savedRounding = _MM_GET_ROUNDING_MODE();

    _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST);
    assert(-56468486186 == _mm_cvtsd_si64(_mm_set1_pd(-56468486186.49)));

    _MM_SET_ROUNDING_MODE(_MM_ROUND_DOWN);
    assert(-56468486187 == _mm_cvtsd_si64(_mm_set1_pd(-56468486186.1)));

    _MM_SET_ROUNDING_MODE(_MM_ROUND_UP);
    assert(56468486187 == _mm_cvtsd_si64(_mm_set1_pd(56468486186.1)));

    _MM_SET_ROUNDING_MODE(_MM_ROUND_TOWARD_ZERO);
    assert(-56468486186 == _mm_cvtsd_si64(_mm_set1_pd(-56468486186.9)));

    _MM_SET_ROUNDING_MODE(savedRounding);
}

/// Ditto.
alias _mm_cvtsd_si64x = _mm_cvtsd_si64;
1440 
/// Convert the lower double-precision (64-bit) floating-point element in `b`
/// to a single-precision float, store it in the lower element of the result,
/// and copy the upper 3 elements from `a`.
__m128 _mm_cvtsd_ss (__m128 a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cvtsd2ss(a, b); 
    }
    else
    {
        // Generates cvtsd2ss since LDC 1.3 -O0
        a[0] = b[0];
        return a;
    }
}
unittest
{
    __m128 R = _mm_cvtsd_ss(_mm_set1_ps(4.0f), _mm_set1_pd(3.0));
    assert(R.array == [3.0f, 4.0f, 4.0f, 4.0f]);
}
1459 
/// Copy the lowest 32-bit integer of `a` to a scalar `int`.
int _mm_cvtsi128_si32 (__m128i a) pure @safe
{
    return a.array[0];
}
1464 
/// Copy the lowest 64-bit integer of `a` to a scalar `long`.
long _mm_cvtsi128_si64 (__m128i a) pure @safe
{
    long2 la = cast(long2)a;
    return la.array[0];
}
/// Ditto.
alias _mm_cvtsi128_si64x = _mm_cvtsi128_si64;
1471 
/// Convert the 32-bit integer `x` to a double, store it in the lower element
/// of the result, and copy the upper element from `v`.
__m128d _mm_cvtsi32_sd(__m128d v, int x) pure @trusted
{
    v.ptr[0] = cast(double)x;
    return v;
}
unittest
{
    __m128d a = _mm_cvtsi32_sd(_mm_set1_pd(0.0f), 42);
    assert(a.array == [42.0, 0]);
}
1482 
/// Copy the 32-bit integer `a` to the lowest element of the result,
/// and zero the upper three elements.
__m128i _mm_cvtsi32_si128 (int a) pure @trusted
{
    int4 r;
    r.ptr[0] = a;
    r.ptr[1] = 0;
    r.ptr[2] = 0;
    r.ptr[3] = 0;
    return r;
}
unittest
{
    __m128i a = _mm_cvtsi32_si128(65);
    assert(a.array == [65, 0, 0, 0]);
}
1494 
1495 
// Note: on macOS, using "llvm.x86.sse2.cvtsi642sd" was buggy
/// Convert the 64-bit integer `x` to a double, store it in the lower element
/// of the result, and copy the upper element from `v`.
__m128d _mm_cvtsi64_sd(__m128d v, long x) pure @trusted
{
    v.ptr[0] = cast(double)x;
    return v;
}
unittest
{
    __m128d a = _mm_cvtsi64_sd(_mm_set1_pd(0.0f), 42);
    assert(a.array == [42.0, 0]);
}
1507 
/// Copy the 64-bit integer `a` to the lower half of the result,
/// and zero the upper half.
__m128i _mm_cvtsi64_si128 (long a) pure @trusted
{
    long2 r = [0, 0];
    r.ptr[0] = a;
    return cast(__m128i)(r);
}

/// Ditto.
alias _mm_cvtsi64x_sd = _mm_cvtsi64_sd;
/// Ditto.
alias _mm_cvtsi64x_si128 = _mm_cvtsi64_si128;
1517 
/// Convert the lower single-precision float of `x` to a double, store it in
/// the lower element of the result, and copy the upper element from `v`.
double2 _mm_cvtss_sd(double2 v, float4 x) pure @trusted
{
    v.ptr[0] = x.array[0];
    return v;
}
unittest
{
    __m128d a = _mm_cvtss_sd(_mm_set1_pd(0.0f), _mm_set1_ps(42.0f));
    assert(a.array == [42.0, 0]);
}
1528 
/// Convert the lower single-precision float of `a` to a 64-bit integer
/// with truncation (round toward zero).
long _mm_cvttss_si64 (__m128 a) pure @safe
{
    return cast(long)(a.array[0]); // Generates cvttss2si as expected
}
unittest
{
    assert(1 == _mm_cvttss_si64(_mm_setr_ps(1.9f, 2.0f, 3.0f, 4.0f)));
}
1537 
static if (LDC_with_SSE2)
{
    /// Convert packed double-precision (64-bit) floating-point elements in `a`
    /// to packed 32-bit integers with truncation. Upper two lanes are zero.
    alias _mm_cvttpd_epi32 = __builtin_ia32_cvttpd2dq;
}
else static if (GDC_with_SSE2)
{
    /// Ditto.
    alias _mm_cvttpd_epi32 = __builtin_ia32_cvttpd2dq;
}
else
{
    /// Ditto.
    __m128i _mm_cvttpd_epi32 (__m128d a) pure @safe
    {
        // Note: doesn't generate cvttpd2dq as of LDC 1.13
        // cast(int) truncates toward zero, matching the instruction.
        __m128i r;
        r.array[0] = cast(int)a.array[0];
        r.array[1] = cast(int)a.array[1];
        r.array[2] = 0;
        r.array[3] = 0;
        return r;
    }
}
unittest
{
    __m128i R = _mm_cvttpd_epi32(_mm_setr_pd(-4.9, 45641.5f));
    assert(R.array == [-4, 45641, 0, 0]);
}
1564 
1565 
/// Convert packed double-precision (64-bit) floating-point elements in `v` 
/// to packed 32-bit integers with truncation (round toward zero).
__m64 _mm_cvttpd_pi32 (__m128d v) pure @safe
{
    // Delegates to the SSE2 version, then narrows to the MMX register type.
    return to_m64(_mm_cvttpd_epi32(v));
}
unittest
{
    int2 R = cast(int2) _mm_cvttpd_pi32(_mm_setr_pd(-4.9, 45641.7f));
    int[2] correct = [-4, 45641];
    assert(R.array == correct);
}
1578 
/// Convert packed single-precision (32-bit) floating-point elements in `a`
/// to packed 32-bit integers with truncation (round toward zero).
__m128i _mm_cvttps_epi32 (__m128 a) pure @trusted
{
    // Note: Generates cvttps2dq since LDC 1.3 -O2
    __m128i r;
    foreach (lane; 0 .. 4)
        r.ptr[lane] = cast(int) a.array[lane];
    return r;
}
unittest
{
    __m128i R = _mm_cvttps_epi32(_mm_setr_ps(-4.9, 45641.5f, 0.0f, 1.0f));
    assert(R.array == [-4, 45641, 0, 1]);
}
1594 
/// Convert the lower double-precision (64-bit) floating-point element in `a`
/// to a 32-bit integer with truncation (round toward zero).
int _mm_cvttsd_si32 (__m128d a)
{
    // Generates cvttsd2si since LDC 1.3 -O0
    return cast(int)a.array[0];
}
1600 
/// Convert the lower double-precision (64-bit) floating-point element in `a`
/// to a 64-bit integer with truncation (round toward zero).
long _mm_cvttsd_si64 (__m128d a)
{
    // Generates cvttsd2si since LDC 1.3 -O0
    // but in 32-bit instead, it's a long sequence that resort to FPU
    return cast(long)a.array[0];
}

/// Ditto.
alias _mm_cvttsd_si64x = _mm_cvttsd_si64;
1609 
/// Divide packed double-precision (64-bit) floating-point elements
/// in `a` by packed elements in `b`.
__m128d _mm_div_pd(__m128d a, __m128d b) pure @safe
{
    return a / b;
}
1614 
// _mm_div_sd: divide the lower double of `a` by the lower double of `b`,
// and copy the upper element from `a`.
static if (GDC_with_SSE2)
{
    __m128d _mm_div_sd(__m128d a, __m128d b) pure @trusted
    {
        return __builtin_ia32_divsd(a, b);
    }
}
else version(DigitalMars)
{
    // Work-around for https://issues.dlang.org/show_bug.cgi?id=19599
    __m128d _mm_div_sd(__m128d a, __m128d b) pure @safe
    {
        // The no-op asm statement prevents the DMD miscompilation
        // described in the issue above.
        asm pure nothrow @nogc @trusted { nop;}
        a.array[0] = a.array[0] / b.array[0];
        return a;
    }
}
else
{
    __m128d _mm_div_sd(__m128d a, __m128d b) pure @safe
    {
        a.array[0] /= b.array[0];
        return a;
    }
}
unittest
{
    __m128d a = [2.0, 4.5];
    a = _mm_div_sd(a, a);
    assert(a.array == [1.0, 4.5]);
}
1646 
/// Extract a 16-bit integer from `v`, selected with `index`, zero-extended
/// to `int`. Only the low 3 bits of `index` are used, like the PEXTRW
/// instruction's immediate (and like `_mm_insert_epi16` below); previously
/// an out-of-range index was an out-of-bounds array access.
// PERF: ARM version has array bound check
int _mm_extract_epi16(__m128i v, int index) pure @safe
{
    short8 r = cast(short8)v;
    return cast(ushort)(r.array[index & 7]);
}
unittest
{
    __m128i A = _mm_set_epi16(7, 6, 5, 4, 3, 2, 1, -1);
    assert(_mm_extract_epi16(A, 6) == 6);
    assert(_mm_extract_epi16(A, 0) == 65535);
}
1660 
/// Copy `v`, and insert the 16-bit integer `i` at the location specified by `index`.
/// Only the low 3 bits of `index` are used, like the PINSRW instruction.
__m128i _mm_insert_epi16 (__m128i v, int i, int index) @trusted
{
    short8 r = cast(short8)v;
    r.ptr[index & 7] = cast(short)i;
    return cast(__m128i)r;
}
unittest
{
    __m128i A = _mm_set_epi16(7, 6, 5, 4, 3, 2, 1, 0);
    short8 R = cast(short8) _mm_insert_epi16(A, 42, 6);
    short[8] correct = [0, 1, 2, 3, 4, 5, 42, 7];
    assert(R.array == correct);
}
1675 
// _mm_lfence: perform a serializing load-fence operation.
version(GNU)
{
    void _mm_lfence() pure @trusted
    {
        static if (GDC_with_SSE2)
        {
            __builtin_ia32_lfence();
        }
        else version(X86)
        {
            // No SSE2 builtin available: emit the instruction directly.
            asm pure nothrow @nogc @trusted
            {
                "lfence;\n" : : : ;
            }
        }
        else
            static assert(false);
    }
}
else static if (LDC_with_SSE2)
{
    alias _mm_lfence = __builtin_ia32_lfence;
}
else static if (DMD_with_asm)
{
    void _mm_lfence() pure @safe
    {
        asm nothrow @nogc pure @safe
        {
            lfence;
        }
    }
}
else version(LDC)
{
    void _mm_lfence() pure @safe
    {
        llvm_memory_fence(); // Note: actually generates mfence
    }
}
else
    static assert(false);
unittest
{
    _mm_lfence();
}
1722 
1723 
/// Load 128 bits (2 doubles) from memory. `mem_addr` is dereferenced as a
/// `__m128d*`, so it must be 16-byte aligned.
__m128d _mm_load_pd (const(double) * mem_addr) pure
{
    __m128d* aligned = cast(__m128d*)mem_addr;
    return *aligned;
}
1729 
/// Load a double-precision (64-bit) floating-point element from memory
/// into both elements of the result. `mem_addr` needs no alignment.
__m128d _mm_load_pd1 (const(double)* mem_addr) pure
{
    double[2] arr = [*mem_addr, *mem_addr];
    return loadUnaligned!(double2)(&arr[0]);
}
1735 
/// Load a double-precision (64-bit) floating-point element from memory
/// into the lower element, and zero the upper element.
__m128d _mm_load_sd (const(double)* mem_addr) pure @trusted
{
    double2 r = [0, 0];
    r.ptr[0] = *mem_addr;
    return r;
}
unittest
{
    double x = -42;
    __m128d a = _mm_load_sd(&x);
    assert(a.array == [-42.0, 0.0]);
}
1748 
/// Load 128 bits of integer data from memory. `mem_addr` is dereferenced
/// directly, so it must be 16-byte aligned.
__m128i _mm_load_si128 (const(__m128i)* mem_addr) pure @trusted
{
    return *mem_addr;
}

/// Ditto (same operation as `_mm_load_pd1`).
alias _mm_load1_pd = _mm_load_pd1;
1755 
/// Load a double-precision (64-bit) floating-point element from memory into
/// the upper element of the result, and copy the lower element from `a`.
__m128d _mm_loadh_pd (__m128d a, const(double)* mem_addr) pure @trusted
{
    a.ptr[1] = *mem_addr;
    return a;
}
1761 
/// Load a 64-bit integer from memory into the lower half of the result,
/// and zero the upper half.
// Note: strange signature since the memory doesn't have to be aligned
__m128i _mm_loadl_epi64 (const(__m128i)* mem_addr) pure @trusted
{
    auto pLong = cast(const(long)*)mem_addr;
    long2 r = [0, 0];
    r.ptr[0] = *pLong;
    return cast(__m128i)(r);
}
1770 
/// Load a double-precision (64-bit) floating-point element from memory into
/// the lower element of the result, and copy the upper element from `a`.
__m128d _mm_loadl_pd (__m128d a, const(double)* mem_addr) pure @trusted
{
    a.ptr[0] = *mem_addr;
    return a;
}
1776 
/// Load 2 double-precision (64-bit) floating-point elements from memory in
/// reverse order. `mem_addr` is dereferenced as a `__m128d*`, so it must be
/// 16-byte aligned.
/// NOTE(review): Intel names this operation `_mm_loadr_pd`; confirm whether
/// the `2` suffix here is intentional before relying on the name.
__m128d _mm_loadr_pd2 (const(double)* mem_addr) pure @trusted
{
    __m128d a = *cast(__m128d*)(mem_addr);
    __m128d r;
    r.ptr[0] = a.array[1];
    r.ptr[1] = a.array[0];
    return r;
}
1785 
/// Load 128 bits (2 doubles) from memory. `mem_addr` needs no particular
/// alignment.
__m128d _mm_loadu_pd (const(double)* mem_addr) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_loadupd(mem_addr); 
    }
    else
    {
        return loadUnaligned!(double2)(mem_addr);
    }
}
1797 
/// Load 128 bits of integer data from memory. `mem_addr` needs no particular
/// alignment.
__m128i _mm_loadu_si128 (const(__m128i)* mem_addr) pure @trusted
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_loaddqu(cast(const(char*))mem_addr);
    }
    else
    {
        return loadUnaligned!(__m128i)(cast(int*)mem_addr);
    }
}
1809 
/// Load a 32-bit integer from (unaligned) memory into the lowest element of
/// the result, and zero the upper three elements.
__m128i _mm_loadu_si32 (const(void)* mem_addr) pure @trusted
{
    int4 result = [0, 0, 0, 0];
    result.ptr[0] = *cast(int*)(mem_addr);
    return result;
}
unittest
{
    int r = 42;
    __m128i A = _mm_loadu_si32(&r);
    int[4] correct = [42, 0, 0, 0];
    assert(A.array == correct);
}
1824 
static if (GDC_with_SSE2)
{
    /// Multiply packed signed 16-bit integers in `a` and `b`, producing intermediate
    /// signed 32-bit integers. Horizontally add adjacent pairs of intermediate 32-bit integers,
    /// and pack the results in destination.
    alias _mm_madd_epi16 = __builtin_ia32_pmaddwd128;
}
else static if (LDC_with_SSE2)
{
    /// Multiply packed signed 16-bit integers in `a` and `b`, producing intermediate
    /// signed 32-bit integers. Horizontally add adjacent pairs of intermediate 32-bit integers,
    /// and pack the results in destination.
    alias _mm_madd_epi16 = __builtin_ia32_pmaddwd128;
}
else
{
    /// Multiply packed signed 16-bit integers in `a` and `b`, producing intermediate
    /// signed 32-bit integers. Horizontally add adjacent pairs of intermediate 32-bit integers,
    /// and pack the results in destination.
    // TODO: #ARM
    __m128i _mm_madd_epi16 (__m128i a, __m128i b) pure @safe
    {
        short8 sa = cast(short8)a;
        short8 sb = cast(short8)b;

        int4 r;
        foreach(i; 0..4)
        {
            // The pair sum wraps on int overflow, matching PMADDWD:
            // the unittest below checks (-32768)*(-32768)*2 == int.min.
            r.array[i] = sa.array[2*i] * sb.array[2*i] + sa.array[2*i+1] * sb.array[2*i+1];
        }
        return r;
    }
}
unittest
{
    short8 A = [0, 1, 2, 3, -32768, -32768, 32767, 32767];
    short8 B = [0, 1, 2, 3, -32768, -32768, 32767, 32767];
    int4 R = _mm_madd_epi16(cast(__m128i)A, cast(__m128i)B);
    int[4] correct = [1, 13, -2147483648, 2*32767*32767];
    assert(R.array == correct);
}
1866 
static if (LDC_with_SSE2)
{
    /// Conditionally store 8-bit integer elements from `a` into memory using `mask`
    /// (elements are not stored when the highest bit is not set in the corresponding element)
    /// and a non-temporal memory hint. `mem_addr` does not need to be aligned on any particular
    /// boundary.
    alias _mm_maskmoveu_si128 = __builtin_ia32_maskmovdqu; // can't do it with pure IR
}
else
{
    static if (GDC_with_SSE2)
    {
        ///ditto
        void _mm_maskmoveu_si128 (__m128i a, __m128i mask, void* mem_addr) pure @trusted
        {
            return __builtin_ia32_maskmovdqu(cast(ubyte16)a, cast(ubyte16)mask, cast(char*)mem_addr);
        }
    }
    else
    {
        ///ditto
        // PERF: on ARM, is absolutely catastrophic, however needing this intrinsics is rare.
        void _mm_maskmoveu_si128 (__m128i a, __m128i mask, void* mem_addr) pure @trusted
        {
            byte16 b = cast(byte16)a;
            byte16 m = cast(byte16)mask;
            byte* dest = cast(byte*)(mem_addr);
            foreach(j; 0..16)
            {
                // bit 7 of the mask byte selects whether this byte is stored
                if (m.array[j] & 128)
                {
                    dest[j] = b.array[j];
                }
            }
        }
    }
}
unittest
{
    ubyte[16] dest =           [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42];
    __m128i mask = _mm_setr_epi8(0,-1, 0,-1,-1, 1,-1,-1, 0,-1,-4,-1,-1, 0,-127, 0);
    __m128i A    = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15);
    _mm_maskmoveu_si128(A, mask, dest.ptr);
    ubyte[16] correct =        [42, 1,42, 3, 4,42, 6, 7,42, 9,10,11,12,42,14,42];
    assert(dest == correct);
}
1913 
/// Compare packed signed 16-bit integers in `a` and `b`, and return
/// packed maximum values.
__m128i _mm_max_epi16 (__m128i a, __m128i b) pure @safe
{
    // Same remark as with _mm_min_epi16: clang uses mystery intrinsics we don't have.
    // Branchless select: b ^ ((a ^ b) & (a > b)) gives a where a > b, else b.
    __m128i aGreater = _mm_cmpgt_epi16(a, b); // all-ones shorts where a > b
    __m128i flip     = _mm_xor_si128(a, b);   // b ^ flip == a
    return _mm_xor_si128(b, _mm_and_si128(flip, aGreater));
}
unittest
{
    short8 R = cast(short8) _mm_max_epi16(_mm_setr_epi16(45, 1, -4, -8, 9,  7, 0,-57),
                                          _mm_setr_epi16(-4,-8,  9,  7, 0,-57, 0,  0));
    short[8] correct =                                  [45, 1,  9,  7, 9,  7, 0,  0];
    assert(R.array == correct);
}
1929 
1930 
1931 // Same remark as with _mm_min_epi16: clang uses mystery intrinsics we don't have
1932 __m128i _mm_max_epu8 (__m128i a, __m128i b) pure @safe
1933 {
1934     // Same remark as with _mm_min_epi16: clang uses mystery intrinsics we don't have
1935     __m128i value128 = _mm_set1_epi8(-128);
1936     __m128i higher = _mm_cmpgt_epi8(_mm_add_epi8(a, value128), _mm_add_epi8(b, value128)); // signed comparison
1937     __m128i aTob = _mm_xor_si128(a, b); // a ^ (a ^ b) == b
1938     __m128i mask = _mm_and_si128(aTob, higher);
1939     return _mm_xor_si128(b, mask);
1940 }
1941 unittest
1942 {
1943     byte16 R = cast(byte16) _mm_max_epu8(_mm_setr_epi8(45, 1, -4, -8, 9,  7, 0,-57, -4,-8,  9,  7, 0,-57, 0,  0),
1944                                          _mm_setr_epi8(-4,-8,  9,  7, 0,-57, 0,  0, 45, 1, -4, -8, 9,  7, 0,-57));
1945     byte[16] correct =                                [-4,-8, -4, -8, 9,-57, 0,-57, -4,-8, -4, -8, 9,-57, 0,-57];
1946     assert(R.array == correct);
1947 }
1948 
/// Compare packed double-precision (64-bit) floating-point elements in `a`
/// and `b`, and return packed maximum values.
__m128d _mm_max_pd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_maxpd(a, b);
    }
    else
    {
        // Generates maxpd starting with LDC 1.9
        // Like MAXPD, `(a > b) ? a : b` picks `b` when operands compare
        // unordered or equal.
        a[0] = (a[0] > b[0]) ? a[0] : b[0];
        a[1] = (a[1] > b[1]) ? a[1] : b[1];
        return a;
    }
}
unittest
{
    __m128d A = _mm_setr_pd(4.0, 1.0);
    __m128d B = _mm_setr_pd(1.0, 8.0);
    __m128d M = _mm_max_pd(A, B);
    assert(M.array[0] == 4.0);
    assert(M.array[1] == 8.0);
}
1971 
/// Compare the lower double-precision elements of `a` and `b`, return the maximum
/// in the lower lane of the result, and copy the upper lane from `a`.
__m128d _mm_max_sd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_maxsd(a, b);
    }
    else
    {
        // Generates maxsd starting with LDC 1.3
        __m128d res = a;
        double lowA = a.array[0];
        double lowB = b.array[0];
        // Keep the exact (a > b) direction: a NaN comparison is false and selects b.
        res.array[0] = (lowA > lowB) ? lowA : lowB;
        return res;
    }
}
unittest
{
    __m128d A = _mm_setr_pd(1.0, 1.0);
    __m128d B = _mm_setr_pd(4.0, 2.0);
    __m128d M = _mm_max_sd(A, B);
    assert(M.array[0] == 4.0); // low lane takes the max
    assert(M.array[1] == 1.0); // high lane comes from a
}
1994 
/// Perform a serializing operation on all load and store instructions issued
/// prior to this call (full memory fence).
version(GNU)
{
    void _mm_mfence() pure @trusted
    {
        static if (GDC_with_SSE2)
        {
            __builtin_ia32_mfence();
        }
        else version(X86)
        {
            // No SSE2 builtin available: emit the instruction directly.
            asm pure nothrow @nogc @trusted
            {
                "mfence;\n" : : : ;
            }
        }
        else
            static assert(false);
    }
}
else static if (LDC_with_SSE2)
{
    alias _mm_mfence = __builtin_ia32_mfence;
}
else static if (DMD_with_asm)
{
    void _mm_mfence() pure @safe
    {
        asm nothrow @nogc pure @safe
        {
            mfence;
        }
    }
}
else version(LDC)
{
    void _mm_mfence() pure @safe
    {
        // Note: will generate the DMB instruction on ARM
        llvm_memory_fence();
    }
}
else
    static assert(false);
unittest
{
    // Only checks that the fence is callable; ordering effects are not observable here.
    _mm_mfence();
}
2042 
/// Compare packed signed 16-bit integers in `a` and `b`, and return packed minimum values.
__m128i _mm_min_epi16 (__m128i a, __m128i b) pure @safe
{
    // Note: clang uses a __builtin_ia32_pminsw128 which has disappeared from LDC LLVM (?)
    // Branchless select: b ^ ((a ^ b) & mask) yields a where mask is all-ones, b elsewhere.
    __m128i selectA = _mm_cmplt_epi16(a, b);
    __m128i abDiff  = _mm_xor_si128(a, b);
    return _mm_xor_si128(b, _mm_and_si128(abDiff, selectA));
}
unittest
{
    short8 R = cast(short8) _mm_min_epi16(_mm_setr_epi16(45, 1, -4, -8, 9,  7, 0,-57),
                                          _mm_setr_epi16(-4,-8,  9,  7, 0,-57, 0,  0));
    short[8] correct =  [-4,-8, -4, -8, 0,-57, 0, -57];
    assert(R.array == correct);
}
2059 
2060 
/// Compare packed unsigned 8-bit integers in `a` and `b`, and return packed minimum values.
__m128i _mm_min_epu8 (__m128i a, __m128i b) pure @safe
{
    // Same remark as with _mm_min_epi16: clang uses mystery intrinsics we don't have
    // Bias both inputs by -128 so a signed compare orders them as unsigned values,
    // then select a or b with the XOR/AND trick.
    __m128i bias    = _mm_set1_epi8(-128);
    __m128i selectA = _mm_cmplt_epi8(_mm_add_epi8(a, bias), _mm_add_epi8(b, bias));
    __m128i abDiff  = _mm_xor_si128(a, b);
    return _mm_xor_si128(b, _mm_and_si128(abDiff, selectA));
}
unittest
{
    byte16 R = cast(byte16) _mm_min_epu8(_mm_setr_epi8(45, 1, -4, -8, 9,  7, 0,-57, -4,-8,  9,  7, 0,-57, 0,  0),
                                         _mm_setr_epi8(-4,-8,  9,  7, 0,-57, 0,  0, 45, 1, -4, -8, 9,  7, 0,-57));
    byte[16] correct =                                [45, 1,  9,  7, 0,  7, 0,  0, 45, 1,  9,  7, 0,  7, 0,  0];
    assert(R.array == correct);
}
2077 
/// Compare packed double-precision (64-bit) floating-point elements in `a` and `b`,
/// and return packed minimum values.
__m128d _mm_min_pd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_minpd(a, b);
    }
    else
    {
        // Generates minpd starting with LDC 1.9
        // NaN in either lane makes the comparison false, so b's lane is returned
        // (same operand order as the minpd instruction).
        a.array[0] = (a.array[0] < b.array[0]) ? a.array[0] : b.array[0];
        a.array[1] = (a.array[1] < b.array[1]) ? a.array[1] : b.array[1];
        return a;
    }
}
unittest
{
    __m128d A = _mm_setr_pd(1.0, 2.0);
    __m128d B = _mm_setr_pd(4.0, 1.0);
    __m128d M = _mm_min_pd(A, B);
    assert(M.array[0] == 1.0);
    assert(M.array[1] == 1.0);
}

/// Compare the lower double-precision elements of `a` and `b`, return the minimum
/// in the lower lane of the result, and copy the upper lane from `a`.
__m128d _mm_min_sd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_minsd(a, b);
    }
    else
    {
        // Generates minsd starting with LDC 1.3
        __m128d r = a;
        r.array[0] = (a.array[0] < b.array[0]) ? a.array[0] : b.array[0];
        return r;
    }
}
unittest
{
    __m128d A = _mm_setr_pd(1.0, 3.0);
    __m128d B = _mm_setr_pd(4.0, 2.0);
    __m128d M = _mm_min_sd(A, B);
    assert(M.array[0] == 1.0); // low lane takes the min
    assert(M.array[1] == 3.0); // high lane comes from a
}
2123 
/// Copy the lower 64-bit integer of `a` to the result, zeroing the upper 64 bits.
__m128i _mm_move_epi64 (__m128i a) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_movq128(a);
    }
    else
    {
        long2 r = cast(long2) a;
        r.array[1] = 0; // keep lane 0, clear lane 1
        return cast(__m128i) r;
    }
}
unittest
{
    long2 A = [13, 47];
    long2 B = cast(long2) _mm_move_epi64( cast(__m128i)A );
    long[2] correct = [13, 0];
    assert(B.array == correct);
}
2145 
/// Move the lower double-precision element of `b` to the lower lane of the result,
/// and copy the upper lane from `a`.
__m128d _mm_move_sd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_movsd(a, b);
    }
    else
    {
        b.array[1] = a.array[1];
        return b;
    }
}
unittest
{
    double2 A = [13.0, 47.0];
    double2 B = [34.0, 58.0];
    double2 C = _mm_move_sd(A, B);
    double[2] correct = [34.0, 47.0]; // low from B, high from A
    assert(C.array == correct);
}
2166 
static if (GDC_with_SSE2)
{
    /// Create mask from the most significant bit of each 8-bit element in `v`.
    alias _mm_movemask_epi8 = __builtin_ia32_pmovmskb128;
}
else static if (LDC_with_SSE2)
{
    /// Create mask from the most significant bit of each 8-bit element in `v`.
    alias _mm_movemask_epi8 = __builtin_ia32_pmovmskb128;
}
// TODO #ARM: doesn't work
/*
else static if (LDC_with_ARM)
{
    /// Create mask from the most significant bit of each 8-bit element in `v`.
    int _mm_movemask_epi8 (__m128i a) pure @safe
    {
        // PERF: looks worse than the one in simde
        byte16 ai = cast(byte16)a;
        byte16 shift7 = [7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7]; 
        ai = ai >>> shift7;
        byte16 shift  = [0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7]; 
        ai = ai << shift; // 4-way shift, only efficient on ARM.
        short8 lo = cast(short8) _mm_unpacklo_epi8(ai, _mm_setzero_si128());
        short8 hi = cast(short8) _mm_unpackhi_epi8(ai, _mm_setzero_si128());
        short8 shift8 = [8, 8, 8, 8, 8, 8, 8, 8];
        lo |= (hi << shift8);
        return lo.array[0] + lo.array[1] + lo.array[2] + lo.array[3]
             + lo.array[4] + lo.array[5] + lo.array[6] + lo.array[7];
    }
} */
else 
{
    /// Create mask from the most significant bit of each 8-bit element in `v`.
    int _mm_movemask_epi8(__m128i v) pure @safe
    {
        // Portable scalar fallback: a negative signed byte has its MSB set.
        byte16 ai = cast(byte16)v;
        int r = 0;
        foreach(bit; 0..16)
        {
            if (ai.array[bit] < 0) r += (1 << bit);
        }
        return r;
    }
}
unittest
{
    // Lane 0 of _mm_set_epi8 is the LAST argument, hence bit 0 corresponds to it.
    assert(0x9C36 == _mm_movemask_epi8(_mm_set_epi8(-1, 1, 2, -3, -1, -1, 4, 8, 127, 0, -1, -1, 0, -1, -1, 0)));
}
2216 
static if (GDC_with_SSE2)
{
    /// Set each bit of mask `dst` based on the most significant bit of the corresponding
    /// packed double-precision (64-bit) floating-point element in `v`.
    alias _mm_movemask_pd = __builtin_ia32_movmskpd;
}
else static if (LDC_with_SSE2)
{
    /// Set each bit of mask `dst` based on the most significant bit of the corresponding
    /// packed double-precision (64-bit) floating-point element in `v`.
    alias _mm_movemask_pd = __builtin_ia32_movmskpd;
}
else
{
    /// Set each bit of mask `dst` based on the most significant bit of the corresponding
    /// packed double-precision (64-bit) floating-point element in `v`.
    int _mm_movemask_pd(__m128d v) pure @safe
    {
        // Reinterpret as signed 64-bit: the sign bit of the double is the MSB.
        long2 lv = cast(long2)v;
        int r = 0;
        if (lv.array[0] < 0) r += 1;
        if (lv.array[1] < 0) r += 2;
        return r;
    }
}
unittest
{
    __m128d A = cast(__m128d) _mm_set_epi64x(-1, 0); // high lane negative only
    assert(_mm_movemask_pd(A) == 2);
}
2247 
/// Copy the lower 64-bit integer in `v`.
__m64 _mm_movepi64_pi64 (__m128i v) pure @safe
{
    // Reinterpret as two 64-bit lanes and keep the low one.
    return long1((cast(long2) v).array[0]);
}
unittest
{
    __m128i A = _mm_set_epi64x(-1, -2);
    __m64 R = _mm_movepi64_pi64(A);
    assert(R.array[0] == -2); // -2 is the low lane; -1 is discarded
}
2260 
/// Copy the 64-bit integer `a` to the lower element of dest, and zero the upper element.
__m128i _mm_movpi64_epi64 (__m64 a) pure @trusted
{
    long2 r;
    r.ptr[0] = a.array[0];
    r.ptr[1] = 0;
    return cast(__m128i)r;
}
unittest
{
    // Previously the only intrinsic in this section without a unittest.
    __m64 A = _mm_cvtsi64_m64(-1);
    long2 R = cast(long2) _mm_movpi64_epi64(A);
    long[2] correct = [-1, 0];
    assert(R.array == correct);
}
2269 
/// Multiply the low unsigned 32-bit integers from each packed 64-bit element in
/// `a` and `b` (lanes 0 and 2), and return the unsigned 64-bit products.
// Note: generates pmuludq in LDC with -O1
__m128i _mm_mul_epu32 (__m128i a, __m128i b) pure @trusted
{
    __m128i zero = _mm_setzero_si128();

    static if (__VERSION__ >= 2088)
    {
        // Need LLVM9 to avoid this shufflevector
        // Zero-extend lanes 0 and 2 of each operand into 64-bit values.
        long2 la, lb;
        la.ptr[0] = cast(uint)a.array[0];
        la.ptr[1] = cast(uint)a.array[2];
        lb.ptr[0] = cast(uint)b.array[0];
        lb.ptr[1] = cast(uint)b.array[2];
    }
    else
    {
        // Interleave even lanes with zero: same zero-extension via shuffle.
        long2 la = cast(long2) shufflevector!(int4, 0, 4, 2, 6)(a, zero);
        long2 lb = cast(long2) shufflevector!(int4, 0, 4, 2, 6)(b, zero);
    }

    static if (__VERSION__ >= 2076)
    {
        return cast(__m128i)(la * lb);
    }
    else
    {
        // long2 mul not supported before LDC 1.5
        la.ptr[0] *= lb.array[0];
        la.ptr[1] *= lb.array[1];
        return cast(__m128i)(la);
    }
}
unittest
{
    __m128i A = _mm_set_epi32(42, 0xDEADBEEF, 42, 0xffffffff);
    __m128i B = _mm_set_epi32(42, 0xCAFEBABE, 42, 0xffffffff);
    __m128i C = _mm_mul_epu32(A, B);
    long2 LC = cast(long2)C;
    assert(LC.array[0] == 18446744065119617025uL); // 0xffffffff * 0xffffffff
    assert(LC.array[1] == 12723420444339690338uL); // 0xDEADBEEF * 0xCAFEBABE
}
2311 
2312 
/// Multiply packed double-precision (64-bit) floating-point elements in `a` and `b`.
__m128d _mm_mul_pd(__m128d a, __m128d b) pure @safe
{
    return a * b;
}
unittest
{
    __m128d a = [-2.0, 1.5];
    a = _mm_mul_pd(a, a);
    assert(a.array == [4.0, 2.25]);
}

/// Multiply the lower double-precision elements of `a` and `b`; the upper lane
/// of the result is copied from `a`.
version(DigitalMars)
{
    // Work-around for https://issues.dlang.org/show_bug.cgi?id=19599
    __m128d _mm_mul_sd(__m128d a, __m128d b) pure @safe
    {
        asm pure nothrow @nogc @trusted { nop;}
        a.array[0] = a.array[0] * b.array[0];
        return a;
    }
}
else
{
    static if (GDC_with_SSE2)
    {
        alias _mm_mul_sd = __builtin_ia32_mulsd;
    }
    else
    {
        __m128d _mm_mul_sd(__m128d a, __m128d b) pure @safe
        {
            a.array[0] *= b.array[0];
            return a;
        }
    }
}
unittest
{
    __m128d a = [-2.0, 1.5];
    a = _mm_mul_sd(a, a);
    assert(a.array == [4.0, 1.5]); // upper lane untouched
}

/// Multiply the low unsigned 32-bit integers from `a` and `b`, 
/// and get an unsigned 64-bit result.
__m64 _mm_mul_su32 (__m64 a, __m64 b) pure @safe
{
    // Widen to 128-bit, reuse _mm_mul_epu32, then truncate back to 64-bit.
    return to_m64(_mm_mul_epu32(to_m128i(a), to_m128i(b)));
}
unittest
{
    __m64 A = _mm_set_pi32(42, 0xDEADBEEF);
    __m64 B = _mm_set_pi32(42, 0xCAFEBABE);
    __m64 C = _mm_mul_su32(A, B);
    assert(C.array[0] == 0xDEADBEEFuL * 0xCAFEBABEuL);
}
2369 
static if (GDC_with_SSE2)
{
    /// Multiply packed signed 16-bit integers, returning the high 16 bits of each
    /// 32-bit intermediate product.
    alias _mm_mulhi_epi16 = __builtin_ia32_pmulhw128;
}
else static if (LDC_with_SSE2)
{
    /// Multiply packed signed 16-bit integers, returning the high 16 bits of each
    /// 32-bit intermediate product.
    alias _mm_mulhi_epi16 = __builtin_ia32_pmulhw128;
}
else
{    
    /// Multiply packed signed 16-bit integers, returning the high 16 bits of each
    /// 32-bit intermediate product.
    __m128i _mm_mulhi_epi16 (__m128i a, __m128i b) pure @trusted
    {
        short8 sa = cast(short8)a;
        short8 sb = cast(short8)b;
        short8 r = void;
        // short * short promotes to int, so >> 16 extracts the high half of the product.
        r.ptr[0] = (sa.array[0] * sb.array[0]) >> 16;
        r.ptr[1] = (sa.array[1] * sb.array[1]) >> 16;
        r.ptr[2] = (sa.array[2] * sb.array[2]) >> 16;
        r.ptr[3] = (sa.array[3] * sb.array[3]) >> 16;
        r.ptr[4] = (sa.array[4] * sb.array[4]) >> 16;
        r.ptr[5] = (sa.array[5] * sb.array[5]) >> 16;
        r.ptr[6] = (sa.array[6] * sb.array[6]) >> 16;
        r.ptr[7] = (sa.array[7] * sb.array[7]) >> 16;
        return cast(__m128i)r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi16(0, -16, 2, 3, 4, 8, 16, 7);
    __m128i B = _mm_set1_epi16(16384); // 2^14: high half is input >> 2
    short8 R = cast(short8)_mm_mulhi_epi16(A, B);
    short[8] correct = [0, -4, 0, 0, 1, 2, 4, 1];
    assert(R.array == correct);
}

static if (GDC_with_SSE2)
{
    /// Multiply packed unsigned 16-bit integers, returning the high 16 bits of each
    /// 32-bit intermediate product.
    alias _mm_mulhi_epu16 = __builtin_ia32_pmulhuw128;
}
else static if (LDC_with_SSE2)
{
    /// Multiply packed unsigned 16-bit integers, returning the high 16 bits of each
    /// 32-bit intermediate product.
    alias _mm_mulhi_epu16 = __builtin_ia32_pmulhuw128;
}
else
{   
    /// Multiply packed unsigned 16-bit integers, returning the high 16 bits of each
    /// 32-bit intermediate product.
    __m128i _mm_mulhi_epu16 (__m128i a, __m128i b) pure @trusted
    {
        short8 sa = cast(short8)a;
        short8 sb = cast(short8)b;
        short8 r = void;
        // Cast lanes to ushort first so the multiply is unsigned before promotion.
        r.ptr[0] = cast(short)( (cast(ushort)sa.array[0] * cast(ushort)sb.array[0]) >> 16 );
        r.ptr[1] = cast(short)( (cast(ushort)sa.array[1] * cast(ushort)sb.array[1]) >> 16 );
        r.ptr[2] = cast(short)( (cast(ushort)sa.array[2] * cast(ushort)sb.array[2]) >> 16 );
        r.ptr[3] = cast(short)( (cast(ushort)sa.array[3] * cast(ushort)sb.array[3]) >> 16 );
        r.ptr[4] = cast(short)( (cast(ushort)sa.array[4] * cast(ushort)sb.array[4]) >> 16 );
        r.ptr[5] = cast(short)( (cast(ushort)sa.array[5] * cast(ushort)sb.array[5]) >> 16 );
        r.ptr[6] = cast(short)( (cast(ushort)sa.array[6] * cast(ushort)sb.array[6]) >> 16 );
        r.ptr[7] = cast(short)( (cast(ushort)sa.array[7] * cast(ushort)sb.array[7]) >> 16 );
        return cast(__m128i)r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi16(0, -16, 2, 3, 4, 8, 16, 7);
    __m128i B = _mm_set1_epi16(16384);
    short8 R = cast(short8)_mm_mulhi_epu16(A, B);
    short[8] correct = [0, 0x3FFC, 0, 0, 1, 2, 4, 1]; // -16 read as 0xFFF0 unsigned
    assert(R.array == correct);
}
2439 
/// Multiply packed 16-bit integers in `a` and `b`, keeping the low 16 bits
/// of each product.
__m128i _mm_mullo_epi16 (__m128i a, __m128i b) pure @safe
{
    short8 sa = cast(short8) a;
    short8 sb = cast(short8) b;
    return cast(__m128i)(sa * sb);
}
unittest
{
    __m128i A = _mm_setr_epi16(16384, -16, 0,      3, 4, 1, 16, 7);
    __m128i B = _mm_set1_epi16(16384);
    short8 R = cast(short8)_mm_mullo_epi16(A, B);
    short[8] correct = [0, 0, 0, -16384, 0, 16384, 0, -16384];
    assert(R.array == correct);
}
2452 
/// Compute the bitwise OR of packed double-precision (64-bit) elements in `a` and `b`.
__m128d _mm_or_pd (__m128d a, __m128d b) pure @safe
{
    return cast(__m128d)( cast(__m128i)a | cast(__m128i)b );
}

/// Compute the bitwise OR of 128 bits in `a` and `b`.
__m128i _mm_or_si128 (__m128i a, __m128i b) pure @safe
{
    return a | b;
}
2462 
static if (GDC_with_SSE2)
{
    /// Convert packed signed 32-bit integers from `a` and `b` to packed 16-bit
    /// integers using signed saturation.
    alias _mm_packs_epi32 = __builtin_ia32_packssdw128;
}
else static if (LDC_with_SSE2)
{
    /// Convert packed signed 32-bit integers from `a` and `b` to packed 16-bit
    /// integers using signed saturation.
    alias _mm_packs_epi32 = __builtin_ia32_packssdw128;
}
else
{
    /// Convert packed signed 32-bit integers from `a` and `b` to packed 16-bit
    /// integers using signed saturation.
    __m128i _mm_packs_epi32 (__m128i a, __m128i b) pure @trusted
    {
        // Lanes of `a` fill the low half of the result, lanes of `b` the high half.
        short8 r;
        r.ptr[0] = saturateSignedIntToSignedShort(a.array[0]);
        r.ptr[1] = saturateSignedIntToSignedShort(a.array[1]);
        r.ptr[2] = saturateSignedIntToSignedShort(a.array[2]);
        r.ptr[3] = saturateSignedIntToSignedShort(a.array[3]);
        r.ptr[4] = saturateSignedIntToSignedShort(b.array[0]);
        r.ptr[5] = saturateSignedIntToSignedShort(b.array[1]);
        r.ptr[6] = saturateSignedIntToSignedShort(b.array[2]);
        r.ptr[7] = saturateSignedIntToSignedShort(b.array[3]);
        return cast(__m128i)r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi32(100000, -100000, 1000, 0);
    short8 R = cast(short8) _mm_packs_epi32(A, A);
    short[8] correct = [32767, -32768, 1000, 0, 32767, -32768, 1000, 0];
    assert(R.array == correct);
}

static if (GDC_with_SSE2)
{
    /// Convert packed signed 16-bit integers from `a` and `b` to packed 8-bit
    /// integers using signed saturation.
    alias _mm_packs_epi16 = __builtin_ia32_packsswb128;
}
else static if (LDC_with_SSE2)
{
    /// Convert packed signed 16-bit integers from `a` and `b` to packed 8-bit
    /// integers using signed saturation.
    alias _mm_packs_epi16 = __builtin_ia32_packsswb128;
}
else
{   
    /// Convert packed signed 16-bit integers from `a` and `b` to packed 8-bit
    /// integers using signed saturation.
    __m128i _mm_packs_epi16 (__m128i a, __m128i b) pure @trusted
    {
        byte16 r;
        short8 sa = cast(short8)a;
        short8 sb = cast(short8)b;
        foreach(i; 0..8)
            r.ptr[i] = saturateSignedWordToSignedByte(sa.array[i]);
        foreach(i; 0..8)
            r.ptr[i+8] = saturateSignedWordToSignedByte(sb.array[i]);
        return cast(__m128i)r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi16(1000, -1000, 1000, 0, 256, -129, 254, 0);
    byte16 R = cast(byte16) _mm_packs_epi16(A, A);
    byte[16] correct = [127, -128, 127, 0, 127, -128, 127, 0,
                        127, -128, 127, 0, 127, -128, 127, 0];
    assert(R.array == correct);
}
2525 
static if (GDC_with_SSE2)
{
    /// Convert packed signed 16-bit integers from `a` and `b` to packed 8-bit
    /// integers using unsigned saturation.
    alias _mm_packus_epi16 = __builtin_ia32_packuswb128;
}
else static if (LDC_with_SSE2)
{
    /// Convert packed signed 16-bit integers from `a` and `b` to packed 8-bit
    /// integers using unsigned saturation.
    alias _mm_packus_epi16 = __builtin_ia32_packuswb128;
}
else
{   
    /// Convert packed signed 16-bit integers from `a` and `b` to packed 8-bit
    /// integers using unsigned saturation.
    __m128i _mm_packus_epi16 (__m128i a, __m128i b) pure @trusted
    {
        short8 sa = cast(short8)a;
        short8 sb = cast(short8)b;
        ubyte[16] result = void;
        for (int i = 0; i < 8; ++i)
        {
            // Clamp each signed word into [0, 255] before narrowing.
            short s = sa[i];
            if (s < 0) s = 0;
            if (s > 255) s = 255;
            result[i] = cast(ubyte)s;

            s = sb[i];
            if (s < 0) s = 0;
            if (s > 255) s = 255;
            result[i+8] = cast(ubyte)s;
        }
        return cast(__m128i) loadUnaligned!(byte16)(cast(byte*)result.ptr);
    }
}
unittest
{
    __m128i A = _mm_setr_epi16(-10, 400, 0, 256, 255, 2, 1, 0);
    byte16 AA = cast(byte16) _mm_packus_epi16(A, A);
    static immutable ubyte[16] correctResult = [0, 255, 0, 255, 255, 2, 1, 0,
                                                0, 255, 0, 255, 255, 2, 1, 0];
    foreach(i; 0..16)
        assert(AA.array[i] == cast(byte)(correctResult[i]));
}
2565 
2566 
/// Provide a hint to the processor that the code sequence is a spin-wait loop.
version(GNU)
{
    void _mm_pause() pure @trusted
    {
        static if (GDC_with_SSE2)
        {
            __builtin_ia32_pause();
        }
        else version(X86)
        {
            // No SSE2 builtin available: emit the instruction directly.
            asm pure nothrow @nogc @trusted
            {
                "pause;\n" : : : ;
            }
        }
        else
            static assert(false);
    }
}
else static if (LDC_with_SSE2)
{
    alias _mm_pause = __builtin_ia32_pause;
}
else static if (DMD_with_asm)
{
    void _mm_pause() pure @safe
    {
        asm nothrow @nogc pure @safe
        {
            rep; nop; // F3 90 = pause (DMD's assembler has no `pause` mnemonic)
        }
    }
}
else version (LDC)
{
    void _mm_pause() pure @safe
    {
        // PERF: does nothing currently; could be the "yield" instruction on ARM.
    }
}
else
    static assert(false);
unittest
{
    _mm_pause();
}
2613 
static if (GDC_with_SSE2)
{
    /// Compute the absolute differences of packed unsigned 8-bit integers, then
    /// horizontally sum each group of 8 differences into two 64-bit lanes.
    alias _mm_sad_epu8 = __builtin_ia32_psadbw128;
}
else static if (LDC_with_SSE2)
{
    /// Compute the absolute differences of packed unsigned 8-bit integers, then
    /// horizontally sum each group of 8 differences into two 64-bit lanes.
    alias _mm_sad_epu8 = __builtin_ia32_psadbw128;
}
else
{   
    /// Compute the absolute differences of packed unsigned 8-bit integers, then
    /// horizontally sum each group of 8 differences into two 64-bit lanes.
    __m128i _mm_sad_epu8 (__m128i a, __m128i b) pure @trusted
    {
        byte16 ab = cast(byte16)a;
        byte16 bb = cast(byte16)b;
        // Per-byte absolute difference, computed as unsigned.
        ubyte[16] t;
        foreach(i; 0..16)
        {
            int diff = cast(ubyte)(ab.array[i]) - cast(ubyte)(bb.array[i]);
            if (diff < 0) diff = -diff;
            t[i] = cast(ubyte)(diff);
        }
        // Each 8-byte half sums into the low 32 bits of its 64-bit lane
        // (int lanes 0 and 2); the upper halves stay zero.
        int4 r = _mm_setzero_si128();
        r.ptr[0] = t[0] + t[1] + t[2] + t[3] + t[4] + t[5] + t[6] + t[7];
        r.ptr[2] = t[8] + t[9] + t[10]+ t[11]+ t[12]+ t[13]+ t[14]+ t[15];
        return r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi8(3, 4, 6, 8, 12, 14, 18, 20, 24, 30, 32, 38, 42, 44, 48, 54); // primes + 1
    __m128i B = _mm_set1_epi8(1);
    __m128i R = _mm_sad_epu8(A, B);
    int[4] correct = [2 + 3 + 5 + 7 + 11 + 13 + 17 + 19,
                      0,
                      23 + 29 + 31 + 37 + 41 + 43 + 47 + 53,
                      0];
    assert(R.array == correct);
}
2652 
/// Set packed 16-bit integers with the supplied values; `e0` becomes lane 0 (lowest).
__m128i _mm_set_epi16 (short e7, short e6, short e5, short e4, short e3, short e2, short e1, short e0) pure @trusted
{
    short[8] result = [e0, e1, e2, e3, e4, e5, e6, e7];
    return cast(__m128i) loadUnaligned!(short8)(result.ptr);
}
unittest
{
    __m128i A = _mm_set_epi16(7, 6, 5, 4, 3, 2, 1, 0);
    short8 B = cast(short8) A;
    foreach(i; 0..8)
        assert(B.array[i] == i);
}

/// Set packed 32-bit integers with the supplied values; `e0` becomes lane 0 (lowest).
__m128i _mm_set_epi32 (int e3, int e2, int e1, int e0) pure @trusted
{
    int[4] result = [e0, e1, e2, e3];
    return loadUnaligned!(int4)(result.ptr);
}
unittest
{
    __m128i A = _mm_set_epi32(3, 2, 1, 0);
    foreach(i; 0..4)
        assert(A.array[i] == i);
}

/// Set packed 64-bit integers from two `__m64` values; `e0` becomes the low lane.
__m128i _mm_set_epi64(__m64 e1, __m64 e0) pure @trusted
{
    long[2] result = [e0.array[0], e1.array[0]];
    return cast(__m128i)( loadUnaligned!(long2)(result.ptr) );
}
unittest
{
    __m128i A = _mm_set_epi64(_mm_cvtsi64_m64(1234), _mm_cvtsi64_m64(5678));
    long2 B = cast(long2) A;
    assert(B.array[0] == 5678);
    assert(B.array[1] == 1234);
}

/// Set packed 64-bit integers with the supplied values; `e0` becomes the low lane.
__m128i _mm_set_epi64x (long e1, long e0) pure @trusted
{
    long[2] result = [e0, e1];
    return cast(__m128i)( loadUnaligned!(long2)(result.ptr) );
}
unittest
{
    __m128i A = _mm_set_epi64x(1234, 5678);
    long2 B = cast(long2) A;
    assert(B.array[0] == 5678);
    assert(B.array[1] == 1234);
}

/// Set packed 8-bit integers with the supplied values; `e0` becomes lane 0 (lowest).
__m128i _mm_set_epi8 (byte e15, byte e14, byte e13, byte e12,
                      byte e11, byte e10, byte e9, byte e8,
                      byte e7, byte e6, byte e5, byte e4,
                      byte e3, byte e2, byte e1, byte e0) pure @trusted
{
    byte[16] result = [e0, e1,  e2,  e3,  e4,  e5,  e6, e7,
                     e8, e9, e10, e11, e12, e13, e14, e15];
    return cast(__m128i)( loadUnaligned!(byte16)(result.ptr) );
}
2713 
/// Set packed double-precision elements; `e0` becomes the low lane.
__m128d _mm_set_pd (double e1, double e0) pure @trusted
{
    double[2] result = [e0, e1];
    return loadUnaligned!(double2)(result.ptr);
}
unittest
{
    __m128d A = _mm_set_pd(61.0, 55.0);
    double[2] correct = [55.0, 61.0];
    assert(A.array == correct);
}

/// Broadcast double-precision value `a` to both lanes.
__m128d _mm_set_pd1 (double a) pure @trusted
{
    double[2] result = [a, a];
    return loadUnaligned!(double2)(result.ptr);
}
unittest
{
    __m128d A = _mm_set_pd1(61.0);
    double[2] correct = [61.0, 61.0];
    assert(A.array == correct);
}

/// Copy double-precision value `a` to the low lane, zeroing the upper lane.
__m128d _mm_set_sd (double a) pure @trusted
{
    double[2] result = [a, 0];
    return loadUnaligned!(double2)(result.ptr);
}
2743 
/// Broadcast 16-bit integer `a` to all elements.
__m128i _mm_set1_epi16 (short a) pure @trusted
{
    return cast(__m128i)(short8(a));
}

/// Broadcast 32-bit integer `a` to all elements.
__m128i _mm_set1_epi32 (int a) pure @trusted
{
    return cast(__m128i)(int4(a));
}
unittest
{
    // Masking the sign bit of -1.0f with 0x7fffffff yields +1.0f in every lane.
    __m128 a = _mm_set1_ps(-1.0f);
    __m128 b = cast(__m128) _mm_set1_epi32(0x7fffffff);
    assert(_mm_and_ps(a, b).array == [1.0f, 1, 1, 1]);
}

/// Broadcast 64-bit integer `a` to all elements of `dst`.
__m128i _mm_set1_epi64 (__m64 a) pure @safe
{
    return _mm_set_epi64(a, a);
}

/// Broadcast 64-bit integer `a` to all elements.
__m128i _mm_set1_epi64x (long a) pure @trusted
{
    return cast(__m128i)(long2(a));
}

/// Broadcast 8-bit integer `a` to all elements.
__m128i _mm_set1_epi8 (byte a) pure @trusted
{
    return cast(__m128i)(byte16(a));
}

/// Broadcast double-precision value `a` to both lanes.
alias _mm_set1_pd = _mm_set_pd1;
2777 
/// Set packed 16-bit integers in reverse order; `e7` becomes lane 0 (lowest).
__m128i _mm_setr_epi16 (short e7, short e6, short e5, short e4, 
                        short e3, short e2, short e1, short e0) pure @trusted
{
    short[8] result = [e7, e6, e5, e4, e3, e2, e1, e0];
    return cast(__m128i)( loadUnaligned!(short8)(result.ptr) );
}

/// Set packed 32-bit integers in reverse order; `e3` becomes lane 0 (lowest).
__m128i _mm_setr_epi32 (int e3, int e2, int e1, int e0) pure @trusted
{
    int[4] result = [e3, e2, e1, e0];
    return cast(__m128i)( loadUnaligned!(int4)(result.ptr) );
}

/// Set packed 64-bit integers in reverse order; `e1` becomes the low lane.
__m128i _mm_setr_epi64 (long e1, long e0) pure @trusted
{
    long[2] result = [e1, e0];
    return cast(__m128i)( loadUnaligned!(long2)(result.ptr) );
}

/// Set packed 8-bit integers in reverse order; `e15` becomes lane 0 (lowest).
__m128i _mm_setr_epi8 (byte e15, byte e14, byte e13, byte e12,
                       byte e11, byte e10, byte e9,  byte e8,
                       byte e7,  byte e6,  byte e5,  byte e4,
                       byte e3,  byte e2,  byte e1,  byte e0) pure @trusted
{
    byte[16] result = [e15, e14, e13, e12, e11, e10, e9, e8,
                      e7,  e6,  e5,  e4,  e3,  e2, e1, e0];
    return cast(__m128i)( loadUnaligned!(byte16)(result.ptr) );
}

/// Set packed double-precision elements in reverse order; `e1` becomes the low lane.
__m128d _mm_setr_pd (double e1, double e0) pure @trusted
{
    double[2] result = [e1, e0];
    return loadUnaligned!(double2)(result.ptr);
}
unittest
{
    __m128d A = _mm_setr_pd(61.0, 55.0);
    double[2] correct = [61.0, 55.0];
    assert(A.array == correct);
}

/// Return a vector of all-zero doubles.
__m128d _mm_setzero_pd () pure @trusted
{
    // Note: using loadUnaligned has better -O0 codegen compared to .ptr
    double[2] result = [0.0, 0.0];
    return loadUnaligned!(double2)(result.ptr);
}

/// Return a 128-bit vector of all-zero bits.
__m128i _mm_setzero_si128() pure @trusted
{
    // Note: using loadUnaligned has better -O0 codegen compared to .ptr
    int[4] result = [0, 0, 0, 0];
    return cast(__m128i)( loadUnaligned!(int4)(result.ptr) );
}
2832 
/// Shuffle 32-bit integers in `a` using the control `imm8`: each 2-bit field
/// of `imm8` selects the source lane for one destination lane (low to high).
__m128i _mm_shuffle_epi32(int imm8)(__m128i a) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_pshufd(a, imm8);
    }
    else
    {
        return shufflevector!(int4, (imm8 >> 0) & 3,
                                    (imm8 >> 2) & 3,
                                    (imm8 >> 4) & 3,
                                    (imm8 >> 6) & 3)(a, a);
    }
}
unittest
{
    __m128i A = _mm_setr_epi32(0, 1, 2, 3);
    enum int SHUFFLE = _MM_SHUFFLE(0, 1, 2, 3); // reverses the lanes
    int4 B = cast(int4) _mm_shuffle_epi32!SHUFFLE(A);
    int[4] expectedB = [ 3, 2, 1, 0 ];
    assert(B.array == expectedB);
}

/// Shuffle double-precision elements: bit 0 of `imm8` picks the lane of `a`
/// for the low result lane, bit 1 picks the lane of `b` for the high one.
__m128d _mm_shuffle_pd (int imm8)(__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_shufpd(a, b, imm8);
    }
    else
    {
        return shufflevector!(double2, 0 + ( imm8 & 1 ),
                                       2 + ( (imm8 >> 1) & 1 ))(a, b);
    }
}
unittest
{
    __m128d A = _mm_setr_pd(0.5, 2.0);
    __m128d B = _mm_setr_pd(4.0, 5.0);
    enum int SHUFFLE = _MM_SHUFFLE2(1, 1); // high lane of each input
    __m128d R = _mm_shuffle_pd!SHUFFLE(A, B);
    double[2] correct = [ 2.0, 5.0 ];
    assert(R.array == correct);
}
2877 
/// Shuffle the four high 16-bit lanes of `a` using control `imm8`;
/// the low four lanes pass through unchanged.
__m128i _mm_shufflehi_epi16(int imm8)(__m128i a) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_pshufhw(a, imm8);
    }
    else
    {
        // Lanes 0-3 are copied; each 2-bit field of imm8 selects among lanes 4-7.
        return cast(__m128i) shufflevector!(short8, 0, 1, 2, 3,
                                          4 + ( (imm8 >> 0) & 3 ),
                                          4 + ( (imm8 >> 2) & 3 ),
                                          4 + ( (imm8 >> 4) & 3 ),
                                          4 + ( (imm8 >> 6) & 3 ))(cast(short8)a, cast(short8)a);
    }
}
unittest
{
    __m128i A = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
    enum int SHUFFLE = _MM_SHUFFLE(0, 1, 2, 3); // reverse the high quartet
    short8 C = cast(short8) _mm_shufflehi_epi16!SHUFFLE(A);
    short[8] expectedC = [ 0, 1, 2, 3, 7, 6, 5, 4 ];
    assert(C.array == expectedC);
}

/// Shuffle the four low 16-bit lanes of `a` using control `imm8`;
/// the high four lanes pass through unchanged.
__m128i _mm_shufflelo_epi16(int imm8)(__m128i a) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_pshuflw(a, imm8);
    }
    else
    {
        // Each 2-bit field of imm8 selects among lanes 0-3; lanes 4-7 are copied.
        return cast(__m128i) shufflevector!(short8, ( (imm8 >> 0) & 3 ),
                                                    ( (imm8 >> 2) & 3 ),
                                                    ( (imm8 >> 4) & 3 ),
                                                    ( (imm8 >> 6) & 3 ), 4, 5, 6, 7)(cast(short8)a, cast(short8)a);
    }
}
unittest
{
    __m128i A = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
    enum int SHUFFLE = _MM_SHUFFLE(0, 1, 2, 3); // reverse the low quartet
    short8 B = cast(short8) _mm_shufflelo_epi16!SHUFFLE(A);
    short[8] expectedB = [ 3, 2, 1, 0, 4, 5, 6, 7 ];
    assert(B.array == expectedB);
}
2924 
// _mm_sll_epi32: shift packed 32-bit integers in `a` left by the amount held in
// the low 64 bits of `count`, shifting in zeros. Deprecated: use _mm_slli_epi32.
static if (LDC_with_SSE2)
{
    deprecated("Use _mm_slli_epi32 instead.") alias _mm_sll_epi32 = __builtin_ia32_pslld128;
}
else static if (GDC_with_SSE2)
{
    deprecated("Use _mm_slli_epi32 instead.") alias _mm_sll_epi32 = __builtin_ia32_pslld128;
}
else static if (DMD_with_32bit_asm)
{
    deprecated("Use _mm_slli_epi32 instead.") __m128i _mm_sll_epi32 (__m128i a, __m128i count) pure @safe
    {
        asm pure nothrow @nogc @trusted
        {
            movdqu XMM0, a;
            movdqu XMM1, count;
            pslld XMM0, XMM1;
            movdqu a, XMM0;
        }
        return a;
    }
}
else
{
    deprecated("Use _mm_slli_epi32 instead.") __m128i _mm_sll_epi32 (__m128i a, __m128i count) pure @safe
    {
        int4 r = void;
        long2 lc = cast(long2)count;
        int bits = cast(int)(lc.array[0]); // only the low 64 bits of count are used
        // NOTE(review): counts >= 32 should produce zero per Intel semantics;
        // this fallback assumes `bits` is in [0, 31] — confirm against callers.
        foreach(i; 0..4)
            r.array[i] = cast(uint)(a.array[i]) << bits;
        return r;
    }
}
2959 
// _mm_sll_epi64: shift packed 64-bit integers in `a` left by the amount held in
// the low 64 bits of `count`, shifting in zeros. Deprecated: use _mm_slli_epi64.
static if (LDC_with_SSE2)
{
    deprecated("Use _mm_slli_epi64 instead.") alias _mm_sll_epi64  = __builtin_ia32_psllq128;
}
else static if (GDC_with_SSE2)
{
    deprecated("Use _mm_slli_epi64 instead.") alias _mm_sll_epi64  = __builtin_ia32_psllq128;
}
else static if (DMD_with_32bit_asm)
{
    deprecated("Use _mm_slli_epi64 instead.") __m128i _mm_sll_epi64 (__m128i a, __m128i count) pure @safe
    {
        asm pure nothrow @nogc @trusted
        {
            movdqu XMM0, a;
            movdqu XMM1, count;
            psllq XMM0, XMM1;
            movdqu a, XMM0;
        }
        return a;
    }
}
else
{
    deprecated("Use _mm_slli_epi64 instead.") __m128i _mm_sll_epi64 (__m128i a, __m128i count) pure @safe
    {
        // ARM: good since LDC 1.12 -O2
        // ~but -O0 version is catastrophic
        long2 r = void;
        long2 sa = cast(long2)a;
        long2 lc = cast(long2)count;
        int bits = cast(int)(lc.array[0]); // only the low 64 bits of count are used
        foreach(i; 0..2)
            r.array[i] = cast(ulong)(sa.array[i]) << bits; // ulong cast => logical shift
        return cast(__m128i)r;
    }
}
2997 
// _mm_sll_epi16: shift packed 16-bit integers in `a` left by the amount held in
// the low 64 bits of `count`, shifting in zeros. Deprecated: use _mm_slli_epi16.
static if (LDC_with_SSE2)
{
    deprecated("Use _mm_slli_epi16 instead.") alias _mm_sll_epi16 = __builtin_ia32_psllw128;
}
else static if (GDC_with_SSE2)
{
    deprecated("Use _mm_slli_epi16 instead.") alias _mm_sll_epi16 = __builtin_ia32_psllw128;
}
else static if (DMD_with_32bit_asm)
{
    deprecated("Use _mm_slli_epi16 instead.") __m128i _mm_sll_epi16 (__m128i a, __m128i count) pure @trusted
    {
        asm pure nothrow @nogc
        {
            movdqu XMM0, a;
            movdqu XMM1, count;
            psllw XMM0, XMM1;
            movdqu a, XMM0;
        }
        return a;
    }
}
else
{
    deprecated("Use _mm_slli_epi16 instead.") __m128i _mm_sll_epi16 (__m128i a, __m128i count) pure @trusted
    {
        short8 sa = cast(short8)a;
        long2 lc = cast(long2)count;
        int bits = cast(int)(lc.array[0]); // only the low 64 bits of count are used
        short8 r = void;
        foreach(i; 0..8)
            r.ptr[i] = cast(short)(cast(ushort)(sa.array[i]) << bits); // ushort cast => logical shift
        return cast(int4)r;
    }
}
3033 
static if (LDC_with_SSE2)
{
    alias _mm_slli_epi32 = __builtin_ia32_pslldi128;
}
else
{
    static if (GDC_with_SSE2)
    {
        alias _mm_slli_epi32 = __builtin_ia32_pslldi128;
    }
    else
    {
        // TODO #ARM, not fantastic again
        /// Shift packed 32-bit integers in `a` left by `imm8` while shifting in zeros.
        __m128i _mm_slli_epi32 (__m128i a, int imm8) pure @safe
        {
            // Intel semantics: shift counts greater than 31 produce zero.
            // The check also avoids shifting a 32-bit value by >= 32,
            // which is not defined in D.
            if (cast(uint)imm8 > 31)
            {
                int4 zero = 0;
                return zero;
            }
            int4 r = void;
            foreach(i; 0..4)
                r.array[i] = cast(uint)(a.array[i]) << imm8;
            return r;
        }
    }
}
unittest
{
    __m128i A = _mm_setr_epi32(0, 2, 3, -4);
    __m128i B = _mm_slli_epi32(A, 1);
    int[4] expectedB = [ 0, 4, 6, -8];
    assert(B.array == expectedB);
}
3063 
static if (LDC_with_SSE2)
{
    alias _mm_slli_epi64  = __builtin_ia32_psllqi128;
}
else
{
    static if (GDC_with_SSE2)
    {
        alias _mm_slli_epi64  = __builtin_ia32_psllqi128;
    }
    else
    {
        // PERF #ARM: unroll that loop
        /// Shift packed 64-bit integers in `a` left by `imm8` while shifting in zeros.
        __m128i _mm_slli_epi64 (__m128i a, int imm8) pure @safe
        {
            // Intel semantics: shift counts greater than 63 produce zero.
            // The check also avoids shifting a 64-bit value by >= 64,
            // which is not defined in D.
            if (cast(uint)imm8 > 63)
            {
                long2 zero = 0;
                return cast(__m128i)zero;
            }
            long2 r = void;
            long2 sa = cast(long2)a;
            foreach(i; 0..2)
                r.array[i] = cast(ulong)(sa.array[i]) << imm8;
            return cast(__m128i)r;
        }
    }
}
unittest
{
    __m128i A = _mm_setr_epi64(8, -4);
    long2 B = cast(long2) _mm_slli_epi64(A, 1);
    long[2] expectedB = [ 16, -8];
    assert(B.array == expectedB);
}
3094 
static if (LDC_with_SSE2)
{
    alias _mm_slli_epi16 = __builtin_ia32_psllwi128;
}
else
{
    static if (GDC_with_SSE2)
    {
        alias _mm_slli_epi16 = __builtin_ia32_psllwi128;
    }
    else
    {
        // TODO #ARM
        /// Shift packed 16-bit integers in `a` left by `imm8` while shifting in zeros.
        __m128i _mm_slli_epi16 (__m128i a, int imm8) pure @safe
        {
            // Intel semantics: shift counts greater than 15 produce zero.
            // The check also keeps the shift amount in a defined range for D.
            if (cast(uint)imm8 > 15)
            {
                short8 zero = 0;
                return cast(int4)zero;
            }
            short8 sa = cast(short8)a;
            short8 r = void;
            foreach(i; 0..8)
                r.array[i] = cast(short)(cast(ushort)(sa.array[i]) << imm8);
            return cast(int4)r;
        }
    }
}
unittest
{
    __m128i A = _mm_setr_epi16(0, 1, 2, 3, -4, -5, 6, 7);
    short8 B = cast(short8)( _mm_slli_epi16(A, 1) );
    short[8] expectedB = [ 0, 2, 4, 6, -8, -10, 12, 14 ];
    assert(B.array == expectedB);
}
3125 
3126 
/// Shift `a` left by `bytes` bytes while shifting in zeros.
__m128i _mm_slli_si128(ubyte bytes)(__m128i op) pure @trusted
{
    // Shifts of 16 bytes or more clear the whole register.
    static if (bytes & 0xF0)
    {
        return _mm_setzero_si128();
    }
    else
    {
        static if (GDC_with_SSE2)
        {
            // GCC builtin expects a bit count, hence bytes * 8.
            return __builtin_ia32_pslldqi128(op, cast(ubyte)(bytes * 8)); 
        }
        else version(DigitalMars)
        {
            version(D_InlineAsm_X86)
            {
                asm pure nothrow @nogc @trusted // somehow doesn't work for x86_64
                {
                    movdqu XMM0, op;
                    pslldq XMM0, bytes;
                    movdqu op, XMM0;
                }
                return op;
            }
            else
            {
                // Scalar fallback: copy bytes upward, zero-fill the low `bytes` lanes.
                byte16 A = cast(byte16)op;
                byte16 R;
                for (int n = 15; n >= bytes; --n)
                    R.ptr[n] = A.array[n-bytes];
                for (int n = bytes-1; n >= 0; --n)
                    R.ptr[n] = 0;
                return cast(__m128i)R;
            }
        }
        else
        {
            // Select from a (zero, op) pair so the low lanes come from the zero vector.
            return cast(__m128i) shufflevector!(byte16,
            16 - bytes, 17 - bytes, 18 - bytes, 19 - bytes, 20 - bytes, 21 - bytes,
            22 - bytes, 23 - bytes, 24 - bytes, 25 - bytes, 26 - bytes, 27 - bytes,
            28 - bytes, 29 - bytes, 30 - bytes, 31 - bytes)
            (cast(byte16)_mm_setzero_si128(), cast(byte16)op);
        }
    }
}
unittest
{
    __m128i A = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
    short8 R = cast(short8) _mm_slli_si128!8(A); // shift 8 bytes to the left
    short[8] correct = [ 0, 0, 0, 0, 0, 1, 2, 3 ];
    assert(R.array == correct);
}
3180 
// _mm_sqrt_pd: compute the square root of each of the two double-precision lanes.
version(LDC)
{
    // Disappeared with LDC 1.11
    static if (__VERSION__ < 2081)
        alias _mm_sqrt_pd = __builtin_ia32_sqrtpd;
    else
    {
        __m128d _mm_sqrt_pd(__m128d vec) pure @safe
        {
            vec.array[0] = llvm_sqrt(vec.array[0]);
            vec.array[1] = llvm_sqrt(vec.array[1]);
            return vec;
        }
    }
}
else
{
    static if (GDC_with_SSE2)
    {
        alias _mm_sqrt_pd = __builtin_ia32_sqrtpd;
    }
    else
    {
        __m128d _mm_sqrt_pd(__m128d vec) pure @safe
        {
            vec.array[0] = sqrt(vec.array[0]);
            vec.array[1] = sqrt(vec.array[1]);
            return vec;
        }
    }
}
3212 
3213 
// _mm_sqrt_sd: compute the square root of the lower double-precision lane;
// the upper lane is passed through unchanged.
version(LDC)
{
    // Disappeared with LDC 1.11
    static if (__VERSION__ < 2081)
        alias _mm_sqrt_sd = __builtin_ia32_sqrtsd;
    else
    {
        __m128d _mm_sqrt_sd(__m128d vec) pure @safe
        {
            vec.array[0] = llvm_sqrt(vec.array[0]);
            vec.array[1] = vec.array[1]; // upper lane kept; self-assignment is intentional
            return vec;
        }
    }
}
else
{
    static if (GDC_with_SSE2)
    {
        alias _mm_sqrt_sd = __builtin_ia32_sqrtsd;
    }
    else
    {
        __m128d _mm_sqrt_sd(__m128d vec) pure @safe
        {
            vec.array[0] = sqrt(vec.array[0]);
            vec.array[1] = vec.array[1]; // upper lane kept; self-assignment is intentional
            return vec;
        }
    }
}
3245 
3246 
// _mm_sra_epi16: arithmetic right shift of packed 16-bit integers by the amount
// held in the low 64 bits of `count`. Deprecated: use _mm_srai_epi16.
static if (LDC_with_SSE2)
{
    deprecated("Use _mm_srai_epi16 instead.") alias _mm_sra_epi16 = __builtin_ia32_psraw128;
}
else
{
    static if (GDC_with_SSE2)
    {
        deprecated("Use _mm_srai_epi16 instead.") alias _mm_sra_epi16 = __builtin_ia32_psraw128;
    }
    else
    {
        deprecated("Use _mm_srai_epi16 instead.") __m128i _mm_sra_epi16 (__m128i a, __m128i count) pure @safe
        {
            short8 sa = cast(short8)a;
            long2 lc = cast(long2)count;
            int bits = cast(int)(lc.array[0]); // only the low 64 bits of count are used
            short8 r = void;
            foreach(i; 0..8)
                r.array[i] = cast(short)(sa.array[i] >> bits); // signed lane => arithmetic shift
            return cast(int4)r;
        }
    }
}
3271 
// _mm_sra_epi32: arithmetic right shift of packed 32-bit integers by the amount
// held in the low 64 bits of `count`. Deprecated: use _mm_srai_epi32.
static if (LDC_with_SSE2)
{
    deprecated("Use _mm_srai_epi32 instead.") alias _mm_sra_epi32  = __builtin_ia32_psrad128;
}
else static if (GDC_with_SSE2)
{
    deprecated("Use _mm_srai_epi32 instead.") alias _mm_sra_epi32  = __builtin_ia32_psrad128;
}
else
{
    deprecated("Use _mm_srai_epi32 instead.") __m128i _mm_sra_epi32 (__m128i a, __m128i count) pure @safe
    {
        int4 r = void;
        long2 lc = cast(long2)count;
        int bits = cast(int)(lc.array[0]); // only the low 64 bits of count are used
        r.array[0] = (a.array[0] >> bits);
        r.array[1] = (a.array[1] >> bits);
        r.array[2] = (a.array[2] >> bits);
        r.array[3] = (a.array[3] >> bits);
        return r;
    }
}
3294 
static if (LDC_with_SSE2)
{
    alias _mm_srai_epi16 = __builtin_ia32_psrawi128;
}
else
{
    static if (GDC_with_SSE2)
    {
        alias _mm_srai_epi16 = __builtin_ia32_psrawi128;
    }
    else
    {
        // TODO: ARM
        /// Shift packed 16-bit integers in `a` right by `imm8` while shifting in sign bits.
        __m128i _mm_srai_epi16 (__m128i a, int imm8) pure @safe
        {
            // Intel semantics: counts greater than 15 fill every lane with its
            // sign bit, which equals an arithmetic shift by 15. Clamping also
            // keeps the shift amount in a defined range for D.
            int count = imm8;
            if (cast(uint)count > 15)
                count = 15;
            short8 sa = cast(short8)a;
            short8 r = void;
            foreach(i; 0..8)
                r.array[i] = cast(short)(sa.array[i] >> count);
            return cast(int4)r;
        }
    }
}
unittest
{
    __m128i A = _mm_setr_epi16(0, 1, 2, 3, -4, -5, 6, 7);
    short8 B = cast(short8)( _mm_srai_epi16(A, 1) );
    short[8] expectedB = [ 0, 0, 1, 1, -2, -3, 3, 3 ];
    assert(B.array == expectedB);
}
3325 
static if (LDC_with_SSE2)
{
    alias _mm_srai_epi32  = __builtin_ia32_psradi128;
}
else static if (GDC_with_SSE2)
{
    alias _mm_srai_epi32  = __builtin_ia32_psradi128;
}
else
{
    /// Shift packed 32-bit integers in `a` right by `imm8` while shifting in sign bits.
    __m128i _mm_srai_epi32 (__m128i a, int imm8) pure @safe
    {
        // Intel semantics: counts greater than 31 fill every lane with its sign
        // bit, which equals an arithmetic shift by 31. Clamping also avoids
        // shifting an int by >= 32, which is not defined in D.
        int count = imm8;
        if (cast(uint)count > 31)
            count = 31;
        int4 r = void;
        r.array[0] = (a.array[0] >> count);
        r.array[1] = (a.array[1] >> count);
        r.array[2] = (a.array[2] >> count);
        r.array[3] = (a.array[3] >> count);
        return r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi32(0, 2, 3, -4);
    __m128i B = _mm_srai_epi32(A, 1);
    int[4] expectedB = [ 0, 1, 1, -2];
    assert(B.array == expectedB);
}
3353 
// _mm_srl_epi16: logical right shift of packed 16-bit integers by the amount
// held in the low 64 bits of `count`. Deprecated: use _mm_srli_epi16.
static if (LDC_with_SSE2)
{
    deprecated("Use _mm_srli_epi16 instead.") alias _mm_srl_epi16 = __builtin_ia32_psrlw128;
}
else static if (GDC_with_SSE2)
{
    deprecated("Use _mm_srli_epi16 instead.") alias _mm_srl_epi16 = __builtin_ia32_psrlw128;
}
else
{
    deprecated("Use _mm_srli_epi16 instead.") __m128i _mm_srl_epi16 (__m128i a, __m128i count) pure @safe
    {
        short8 sa = cast(short8)a;
        long2 lc = cast(long2)count;
        int bits = cast(int)(lc.array[0]); // only the low 64 bits of count are used
        short8 r = void;
        foreach(i; 0..8)
            r.array[i] = cast(short)(cast(ushort)(sa.array[i]) >> bits); // ushort cast => logical shift
        return cast(int4)r;
    }
}
3375 
// _mm_srl_epi32: logical right shift of packed 32-bit integers by the amount
// held in the low 64 bits of `count`. Deprecated: use _mm_srli_epi32.
static if (LDC_with_SSE2)
{
    deprecated("Use _mm_srli_epi32 instead.") alias _mm_srl_epi32  = __builtin_ia32_psrld128;
}
else static if (GDC_with_SSE2)
{
    deprecated("Use _mm_srli_epi32 instead.") alias _mm_srl_epi32  = __builtin_ia32_psrld128;
}
else
{
    deprecated("Use _mm_srli_epi32 instead.") __m128i _mm_srl_epi32 (__m128i a, __m128i count) pure @safe
    {
        int4 r = void;
        long2 lc = cast(long2)count;
        int bits = cast(int)(lc.array[0]); // only the low 64 bits of count are used
        r.array[0] = cast(uint)(a.array[0]) >> bits;
        r.array[1] = cast(uint)(a.array[1]) >> bits;
        r.array[2] = cast(uint)(a.array[2]) >> bits;
        r.array[3] = cast(uint)(a.array[3]) >> bits;
        return r;
    }
}
3398 
// _mm_srl_epi64: logical right shift of packed 64-bit integers by the amount
// held in the low 64 bits of `count`. Deprecated: use _mm_srli_epi64.
static if (LDC_with_SSE2)
{
    deprecated("Use _mm_srli_epi64 instead.") alias _mm_srl_epi64  = __builtin_ia32_psrlq128;
}
else static if (GDC_with_SSE2)
{
    deprecated("Use _mm_srli_epi64 instead.") alias _mm_srl_epi64  = __builtin_ia32_psrlq128;
}
else
{
    deprecated("Use _mm_srli_epi64 instead.") __m128i _mm_srl_epi64 (__m128i a, __m128i count) pure @safe
    {
        long2 r = void;
        long2 sa = cast(long2)a;
        long2 lc = cast(long2)count;
        int bits = cast(int)(lc.array[0]); // only the low 64 bits of count are used
        r.array[0] = cast(ulong)(sa.array[0]) >> bits;
        r.array[1] = cast(ulong)(sa.array[1]) >> bits;
        return cast(__m128i)r;
    }
}
3420 
static if (LDC_with_SSE2)
{
    alias _mm_srli_epi16 = __builtin_ia32_psrlwi128;
}
else
{
    static if (GDC_with_SSE2)
    {
        alias _mm_srli_epi16 = __builtin_ia32_psrlwi128;
    }
    else
    {
        // TODO #ARM
        /// Shift packed 16-bit integers in `a` right by `imm8` while shifting in zeros.
        __m128i _mm_srli_epi16 (__m128i a, int imm8) pure @safe
        {
            // Intel semantics: shift counts greater than 15 produce zero.
            // The check also keeps the shift amount in a defined range for D.
            if (cast(uint)imm8 > 15)
            {
                short8 zero = 0;
                return cast(int4)zero;
            }
            short8 sa = cast(short8)a;
            short8 r = void;
            foreach(i; 0..8)
                r.array[i] = cast(short)(cast(ushort)(sa.array[i]) >> imm8);
            return cast(int4)r;
        }
    }
}
unittest
{
    __m128i A = _mm_setr_epi16(0, 1, 2, 3, -4, -5, 6, 7);
    short8 B = cast(short8)( _mm_srli_epi16(A, 1) );
    short[8] expectedB = [ 0, 0, 1, 1, 0x7FFE, 0x7FFD, 3, 3 ];
    assert(B.array == expectedB);
}
3451 
static if (LDC_with_SSE2)
{
    alias _mm_srli_epi32  = __builtin_ia32_psrldi128;
}
else
{
    static if (GDC_with_SSE2)
    {
        alias _mm_srli_epi32  = __builtin_ia32_psrldi128;
    }
    else
    {
        /// Shift packed 32-bit integers in `a` right by `imm8` while shifting in zeros.
        __m128i _mm_srli_epi32 (__m128i a, int imm8) pure @trusted
        {
            // Intel semantics: shift counts greater than 31 produce zero.
            // The check also avoids shifting a 32-bit value by >= 32,
            // which is not defined in D.
            if (cast(uint)imm8 > 31)
            {
                int4 zero = 0;
                return zero;
            }
            int4 r = void;
            r.ptr[0] = cast(uint)(a.array[0]) >> imm8;
            r.ptr[1] = cast(uint)(a.array[1]) >> imm8;
            r.ptr[2] = cast(uint)(a.array[2]) >> imm8;
            r.ptr[3] = cast(uint)(a.array[3]) >> imm8;
            return r;
        }
    }
}
unittest
{
    __m128i A = _mm_setr_epi32(0, 2, 3, -4);
    __m128i B = _mm_srli_epi32(A, 1);
    int[4] expectedB = [ 0, 1, 1, 0x7FFFFFFE];
    assert(B.array == expectedB);
}
3482 
static if (LDC_with_SSE2)
{
    alias _mm_srli_epi64  = __builtin_ia32_psrlqi128;
}
else
{
    static if (GDC_with_SSE2)
    {
        alias _mm_srli_epi64  = __builtin_ia32_psrlqi128;
    }
    else
    {
        /// Shift packed 64-bit integers in `a` right by `imm8` while shifting in zeros.
        __m128i _mm_srli_epi64 (__m128i a, int imm8) pure @trusted
        {
            // Intel semantics: shift counts greater than 63 produce zero.
            // The check also avoids shifting a 64-bit value by >= 64,
            // which is not defined in D.
            if (cast(uint)imm8 > 63)
            {
                long2 zero = 0;
                return cast(__m128i)zero;
            }
            long2 r = void;
            long2 sa = cast(long2)a;
            r.ptr[0] = cast(ulong)(sa.array[0]) >> imm8;
            r.ptr[1] = cast(ulong)(sa.array[1]) >> imm8;
            return cast(__m128i)r;
        }
    }
}
unittest
{
    __m128i A = _mm_setr_epi64(8, -4);
    long2 B = cast(long2) _mm_srli_epi64(A, 1);
    long[2] expectedB = [ 4, 0x7FFFFFFFFFFFFFFE];
    assert(B.array == expectedB);
}
3512 
/// Shift `v` right by `bytes` bytes while shifting in zeros.
__m128i _mm_srli_si128(ubyte bytes)(__m128i v) pure @safe
{
    // Shifts of 16 bytes or more clear the whole register.
    static if (bytes & 0xF0)
    {
        return _mm_setzero_si128();
    }
    else
    {
        static if (GDC_with_SSE2)
        {
            // GCC builtin expects a bit count, hence bytes * 8.
            return cast(__m128i) __builtin_ia32_psrldqi128(v, cast(ubyte)(bytes * 8));
        }
        else static if (DMD_with_32bit_asm)
        {
            asm pure nothrow @nogc @trusted
            {
                movdqu XMM0, v;
                psrldq XMM0, bytes;
                movdqu v, XMM0;
            }
            return v;
        }
        else
        {
            // Select from a (v, zero) pair so the high lanes come from the zero vector.
            return cast(__m128i) shufflevector!(byte16,
                                                bytes+0, bytes+1, bytes+2, bytes+3, bytes+4, bytes+5, bytes+6, bytes+7,
                                                bytes+8, bytes+9, bytes+10, bytes+11, bytes+12, bytes+13, bytes+14, bytes+15)
                                               (cast(byte16) v, cast(byte16)_mm_setzero_si128());
        }
    }

}

unittest
{
    __m128i R = _mm_srli_si128!4(_mm_set_epi32(4, 3, 2, 1));
    int[4] correct = [2, 3, 4, 0];
    assert(R.array == correct);
}
3553 
/// Shift `v` right by `bytes` bytes while shifting in zeros.
/// #BONUS (not a standard Intel intrinsic)
__m128 _mm_srli_ps(ubyte bytes)(__m128 v) pure @safe
{
    // Reinterpret as integer lanes, do the byte shift, reinterpret back.
    return cast(__m128)_mm_srli_si128!bytes(cast(__m128i)v);
}
unittest
{
    __m128 R = _mm_srli_ps!8(_mm_set_ps(4.0f, 3.0f, 2.0f, 1.0f));
    float[4] correct = [3.0f, 4.0f, 0, 0];
    assert(R.array == correct);
}
3566 
/// Shift `v` right by `bytes` bytes while shifting in zeros.
/// #BONUS (not a standard Intel intrinsic)
__m128d _mm_srli_pd(ubyte bytes)(__m128d v) pure @safe
{
    // Reinterpret as integer lanes, do the byte shift, reinterpret back.
    return cast(__m128d) _mm_srli_si128!bytes(cast(__m128i)v);
}
3573 
/// Store 128-bits (2 packed double-precision elements) from `a` into memory.
/// `mem_addr` is expected to be 16-byte aligned (the store goes through an
/// aligned vector pointer).
void _mm_store_pd (double* mem_addr, __m128d a) pure @trusted
{
    __m128d* aligned = cast(__m128d*)mem_addr;
    *aligned = a;
}

/// Store the lower double-precision element of `a` into both slots of a
/// 16-byte-aligned 2-element memory location.
void _mm_store_pd1 (double* mem_addr, __m128d a) pure @trusted
{
    __m128d* aligned = cast(__m128d*)mem_addr;
    __m128d r;
    r.ptr[0] = a.array[0];
    r.ptr[1] = a.array[0];
    *aligned = r;
}

/// Store the lower double-precision element of `a` into memory.
void _mm_store_sd (double* mem_addr, __m128d a) pure @safe
{
    *mem_addr = a.array[0];
}

/// Store 128-bits of integer data from `a` into memory through an aligned pointer.
void _mm_store_si128 (__m128i* mem_addr, __m128i a) pure @safe
{
    *mem_addr = a;
}

alias _mm_store1_pd = _mm_store_pd1;

/// Store the upper double-precision element of `a` into memory.
void _mm_storeh_pd (double* mem_addr, __m128d a) pure @safe
{
    *mem_addr = a.array[1];
}

// Store the low 64 bits of `a` into memory.
// Note: `mem_addr` doesn't have to actually be aligned, which breaks
// expectations from the user point of view. This problem also exist in C++.
void _mm_storel_epi64 (__m128i* mem_addr, __m128i a) pure @safe
{
    long* dest = cast(long*)mem_addr;
    long2 la = cast(long2)a;
    *dest = la.array[0];
}
unittest
{
    long[3] A = [1, 2, 3];
    _mm_storel_epi64(cast(__m128i*)(&A[1]), _mm_set_epi64x(0x1_0000_0000, 0x1_0000_0000));
    long[3] correct = [1, 0x1_0000_0000, 3];
    assert(A == correct);
}

/// Store the lower double-precision element of `a` into memory.
void _mm_storel_pd (double* mem_addr, __m128d a) pure @safe
{
    *mem_addr = a.array[0];
}

/// Store the 2 double-precision elements of `a` into memory in reverse order,
/// through an aligned pointer.
void _mm_storer_pd (double* mem_addr, __m128d a) pure
{
    __m128d* aligned = cast(__m128d*)mem_addr;
    *aligned = shufflevector!(double2, 1, 0)(a, a);
}

/// Store 128-bits (2 packed double-precision elements) from `a` into memory.
/// `mem_addr` does not need to be aligned.
void _mm_storeu_pd (double* mem_addr, __m128d a) pure @safe
{
    storeUnaligned!double2(a, mem_addr);
}

/// Store 128-bits of integer data from `a` into memory.
/// `mem_addr` does not need to be aligned.
void _mm_storeu_si128 (__m128i* mem_addr, __m128i a) pure @safe
{
    storeUnaligned!__m128i(a, cast(int*)mem_addr);
}
3642 
/// Store 128-bits (composed of 2 packed double-precision (64-bit) floating-point elements)
/// from a into memory using a non-temporal memory hint. mem_addr must be aligned on a 16-byte
/// boundary or a general-protection exception may be generated.
void _mm_stream_pd (double* mem_addr, __m128d a)
{
    // BUG see `_mm_stream_ps` for an explanation why we don't implement non-temporal moves
    __m128d* dest = cast(__m128d*)mem_addr;
    *dest = a; // regular store, no streaming hint
}

/// Store 128-bits of integer data from a into memory using a non-temporal memory hint.
/// mem_addr must be aligned on a 16-byte boundary or a general-protection exception
/// may be generated.
void _mm_stream_si128 (__m128i* mem_addr, __m128i a)
{
    // BUG see `_mm_stream_ps` for an explanation why we don't implement non-temporal moves
    __m128i* dest = cast(__m128i*)mem_addr;
    *dest = a; // regular store, no streaming hint
}

/// Store 32-bit integer a into memory using a non-temporal hint to minimize cache
/// pollution. If the cache line containing address mem_addr is already in the cache,
/// the cache will be updated.
void _mm_stream_si32 (int* mem_addr, int a)
{
    // BUG see `_mm_stream_ps` for an explanation why we don't implement non-temporal moves
    *mem_addr = a; // regular store, no streaming hint
}

/// Store 64-bit integer a into memory using a non-temporal hint to minimize
/// cache pollution. If the cache line containing address mem_addr is already
/// in the cache, the cache will be updated.
void _mm_stream_si64 (long* mem_addr, long a)
{
    // BUG See `_mm_stream_ps` for an explanation why we don't implement non-temporal moves
    *mem_addr = a; // regular store, no streaming hint
}
3680 
/// Subtract packed 16-bit integers in `b` from packed 16-bit integers in `a`.
__m128i _mm_sub_epi16(__m128i a, __m128i b) pure @safe
{
    short8 lhs = cast(short8)a;
    short8 rhs = cast(short8)b;
    return cast(__m128i)(lhs - rhs);
}

/// Subtract packed 32-bit integers in `b` from packed 32-bit integers in `a`.
__m128i _mm_sub_epi32(__m128i a, __m128i b) pure @safe
{
    int4 lhs = cast(int4)a;
    int4 rhs = cast(int4)b;
    return cast(__m128i)(lhs - rhs);
}

/// Subtract packed 64-bit integers in `b` from packed 64-bit integers in `a`.
__m128i _mm_sub_epi64(__m128i a, __m128i b) pure @safe
{
    long2 lhs = cast(long2)a;
    long2 rhs = cast(long2)b;
    return cast(__m128i)(lhs - rhs);
}

/// Subtract packed 8-bit integers in `b` from packed 8-bit integers in `a`.
__m128i _mm_sub_epi8(__m128i a, __m128i b) pure @safe
{
    byte16 lhs = cast(byte16)a;
    byte16 rhs = cast(byte16)b;
    return cast(__m128i)(lhs - rhs);
}

/// Subtract packed double-precision elements in `b` from those in `a`.
__m128d _mm_sub_pd(__m128d a, __m128d b) pure @safe
{
    __m128d difference = a - b;
    return difference;
}
3705 
// _mm_sub_sd: subtract the lower double of `b` from the lower double of `a`;
// the upper double of `a` is passed through.
version(DigitalMars)
{
    // Work-around for https://issues.dlang.org/show_bug.cgi?id=19599
    __m128d _mm_sub_sd(__m128d a, __m128d b) pure @safe
    {
        asm pure nothrow @nogc @trusted { nop;} // the nop is part of the linked bug work-around
        a[0] = a[0] - b[0];
        return a;
    }
}
else static if (GDC_with_SSE2)
{
    alias _mm_sub_sd = __builtin_ia32_subsd;
}
else
{
    __m128d _mm_sub_sd(__m128d a, __m128d b) pure @safe
    {
        a.array[0] -= b.array[0];
        return a;
    }
}
unittest
{
    __m128d a = [1.5, -2.0];
    a = _mm_sub_sd(a, a);
    assert(a.array == [0.0, -2.0]);
}
3734 
/// Subtract 64-bit integer `b` from 64-bit integer `a` (`__m64` variant).
__m64 _mm_sub_si64 (__m64 a, __m64 b) pure @safe
{
    return a - b;
}
3739 
version(LDC)
{
    static if (__VERSION__ >= 2085) // saturation x86 intrinsics disappeared in LLVM 8
    {
        // Generates PSUBSW since LDC 1.15 -O0
        /// Subtract packed 16-bit signed integers in `b` from packed 16-bit signed
        /// integers in `a` using signed saturation.
        __m128i _mm_subs_epi16(__m128i a, __m128i b) pure @trusted
        {
            enum prefix = `declare <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> %a, <8 x i16> %b)`;
            enum ir = `
                %r = call <8 x i16> @llvm.ssub.sat.v8i16( <8 x i16> %0, <8 x i16> %1)
                ret <8 x i16> %r`;
            return cast(__m128i) LDCInlineIREx!(prefix, ir, "", short8, short8, short8)(cast(short8)a, cast(short8)b);
        }
    }
    else static if (LDC_with_ARM) // Raspberry ships with LDC 1.12, no saturation 
    {
        /// Subtract packed 16-bit signed integers in `b` from packed 16-bit signed
        /// integers in `a` using signed saturation.
        __m128i _mm_subs_epi16(__m128i a, __m128i b) pure @trusted
        {
            // PERF #ARM32 Use an intrinsic in gccbuiltins_arm.d instead
            short[8] res;
            short8 sa = cast(short8)a;
            short8 sb = cast(short8)b;
            foreach(i; 0..8)
                res[i] = saturateSignedIntToSignedShort(sa.array[i] - sb.array[i]);
            return _mm_loadu_si128(cast(int4*)res.ptr);
        }
    }
    else
        alias _mm_subs_epi16 = __builtin_ia32_psubsw128;
}
else
{
    static if (GDC_with_SSE2)
    {
        alias _mm_subs_epi16 = __builtin_ia32_psubsw128;
    }
    else
    {
        /// Subtract packed 16-bit signed integers in `b` from packed 16-bit signed
        /// integers in `a` using signed saturation.
        __m128i _mm_subs_epi16(__m128i a, __m128i b) pure @trusted
        {
            short[8] res;
            short8 sa = cast(short8)a;
            short8 sb = cast(short8)b;
            foreach(i; 0..8)
                res[i] = saturateSignedIntToSignedShort(sa.array[i] - sb.array[i]);
            return _mm_loadu_si128(cast(int4*)res.ptr);
        }
    }
}
unittest
{
    short8 res = cast(short8) _mm_subs_epi16(_mm_setr_epi16(32760, -32760, 5, 4, 3, 2, 1, 0),
                                             _mm_setr_epi16(-10  ,     16, 5, 4, 3, 2, 1, 0));
    static immutable short[8] correctResult =              [32767, -32768, 0, 0, 0, 0, 0, 0];
    assert(res.array == correctResult);
}
3799 
version(LDC)
{
    static if (__VERSION__ >= 2085) // saturation x86 intrinsics disappeared in LLVM 8
    {
        // x86: Generates PSUBSB since LDC 1.15 -O0
        // ARM: Generates sqsub.16b since LDC 1.21 -O0
        /// Subtract packed 8-bit signed integers in `b` from packed 8-bit signed
        /// integers in `a` using signed saturation.
        __m128i _mm_subs_epi8(__m128i a, __m128i b) pure @trusted
        {
            enum prefix = `declare <16 x i8> @llvm.ssub.sat.v16i8(<16 x i8> %a, <16 x i8> %b)`;
            enum ir = `
                %r = call <16 x i8> @llvm.ssub.sat.v16i8( <16 x i8> %0, <16 x i8> %1)
                ret <16 x i8> %r`;
            return cast(__m128i) LDCInlineIREx!(prefix, ir, "", byte16, byte16, byte16)(cast(byte16)a, cast(byte16)b);
        }
    }
    else static if (LDC_with_ARM) // Raspberry ships with LDC 1.12, no saturation 
    {
        /// Subtract packed 8-bit signed integers in `b` from packed 8-bit signed
        /// integers in `a` using signed saturation.
        __m128i _mm_subs_epi8(__m128i a, __m128i b) pure @trusted
        {
            // PERF #ARM32 Use an intrinsic in gccbuiltins_arm.d instead
            byte[16] res;
            byte16 sa = cast(byte16)a;
            byte16 sb = cast(byte16)b;
            foreach(i; 0..16)
                res[i] = saturateSignedWordToSignedByte(sa.array[i] - sb.array[i]);
            return _mm_loadu_si128(cast(int4*)res.ptr);
        }
    }
    else
        alias _mm_subs_epi8 = __builtin_ia32_psubsb128;
}
else
{
    static if (GDC_with_SSE2)
    {
        alias _mm_subs_epi8 = __builtin_ia32_psubsb128;
    }
    else
    {
        /// Subtract packed 8-bit signed integers in `b` from packed 8-bit signed
        /// integers in `a` using signed saturation.
        __m128i _mm_subs_epi8(__m128i a, __m128i b) pure @trusted
        {
            byte[16] res;
            byte16 sa = cast(byte16)a;
            byte16 sb = cast(byte16)b;
            foreach(i; 0..16)
                res[i] = saturateSignedWordToSignedByte(sa.array[i] - sb.array[i]);
            return _mm_loadu_si128(cast(int4*)res.ptr);
        }
    }
}
unittest
{
    byte16 res = cast(byte16) _mm_subs_epi8(_mm_setr_epi8(-128, 127, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0),
                                            _mm_setr_epi8(  15, -14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0));
    static immutable byte[16] correctResult            = [-128, 127,  0,  0,  0,  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0];
    assert(res.array == correctResult);
}
3860 
version(LDC)
{
    static if (__VERSION__ >= 2085) // saturation x86 intrinsics disappeared in LLVM 8
    {
        // x86: Generates PSUBUSW since LDC 1.15 -O0
        // ARM: Generates uqsub.8h since LDC 1.21 -O0
        /// Subtract packed 16-bit unsigned integers in `b` from packed 16-bit unsigned
        /// integers in `a` using unsigned saturation.
        __m128i _mm_subs_epu16(__m128i a, __m128i b) pure @trusted
        {
            enum prefix = `declare <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> %a, <8 x i16> %b)`;
            enum ir = `
                %r = call <8 x i16> @llvm.usub.sat.v8i16( <8 x i16> %0, <8 x i16> %1)
                ret <8 x i16> %r`;
            return cast(__m128i) LDCInlineIREx!(prefix, ir, "", short8, short8, short8)(cast(short8)a, cast(short8)b);
        }
    }
    else static if (LDC_with_ARM) // Raspberry ships with LDC 1.12, no saturation 
    {
        /// Subtract packed 16-bit unsigned integers in `b` from packed 16-bit unsigned
        /// integers in `a` using unsigned saturation.
        __m128i _mm_subs_epu16(__m128i a, __m128i b) pure @trusted
        {
            // PERF #ARM32 Use an intrinsic in gccbuiltins_arm.d instead
            short[8] res;
            short8 sa = cast(short8)a;
            short8 sb = cast(short8)b;
            foreach(i; 0..8)
            {
                int sum = cast(ushort)(sa.array[i]) - cast(ushort)(sb.array[i]);
                res[i] = saturateSignedIntToUnsignedShort(sum);
            }
            return _mm_loadu_si128(cast(int4*)res.ptr);
        }
    }
    else
        alias _mm_subs_epu16 = __builtin_ia32_psubusw128;
}
else
{
    static if (GDC_with_SSE2)
    {
        alias _mm_subs_epu16 = __builtin_ia32_psubusw128;
    }
    else
    {
        /// Subtract packed 16-bit unsigned integers in `b` from packed 16-bit unsigned
        /// integers in `a` using unsigned saturation.
        __m128i _mm_subs_epu16(__m128i a, __m128i b) pure @trusted
        {
            short[8] res;
            short8 sa = cast(short8)a;
            short8 sb = cast(short8)b;
            foreach(i; 0..8)
            {
                int sum = cast(ushort)(sa.array[i]) - cast(ushort)(sb.array[i]);
                res[i] = saturateSignedIntToUnsignedShort(sum);
            }
            return _mm_loadu_si128(cast(int4*)res.ptr);
        }
    }
}
unittest
{
    short8 R = cast(short8) _mm_subs_epu16(_mm_setr_epi16(cast(short)65534,  1, 5, 4, 3, 2, 1, 0),
                                           _mm_setr_epi16(cast(short)65535, 16, 4, 4, 3, 0, 1, 0));
    static immutable short[8] correct =                  [               0,  0, 1, 0, 0, 2, 0, 0];
    assert(R.array == correct);
}
3927 
version(LDC)
{
    static if (__VERSION__ >= 2085) // saturation x86 intrinsics disappeared in LLVM 8
    {
        // x86: Generates PSUBUSB since LDC 1.15 -O0
        // ARM: Generates uqsub.16b since LDC 1.21 -O0
        /// Subtract packed 8-bit unsigned integers in `b` from packed 8-bit
        /// unsigned integers in `a` using unsigned saturation.
        __m128i _mm_subs_epu8(__m128i a, __m128i b) pure @trusted
        {
            // Use the generic LLVM saturating-subtract intrinsic; the backend
            // lowers it to the native instruction on x86 and ARM.
            enum prefix = `declare <16 x i8> @llvm.usub.sat.v16i8(<16 x i8> %a, <16 x i8> %b)`;
            enum ir = `
                %r = call <16 x i8> @llvm.usub.sat.v16i8( <16 x i8> %0, <16 x i8> %1)
                ret <16 x i8> %r`;
            return cast(__m128i) LDCInlineIREx!(prefix, ir, "", byte16, byte16, byte16)(cast(byte16)a, cast(byte16)b);
        }
    }
    else static if (LDC_with_ARM) // Raspberry ships with LDC 1.12, no saturation 
    {
         /// Subtract packed 8-bit unsigned integers in `b` from packed 8-bit
         /// unsigned integers in `a` using unsigned saturation.
        __m128i _mm_subs_epu8(__m128i a, __m128i b) pure @trusted
        {
            // PERF #ARM32 Use an intrinsic in gccbuiltins_arm.d instead
            // Scalar fallback: widen each difference, then clamp to [0, 255].
            ubyte[16] res;
            byte16 sa = cast(byte16)a;
            byte16 sb = cast(byte16)b;
            foreach(i; 0..16)
                res[i] = saturateSignedWordToUnsignedByte(cast(ubyte)(sa.array[i]) - cast(ubyte)(sb.array[i]));
            return _mm_loadu_si128(cast(int4*)res.ptr);
        }
    }
    else    
        alias _mm_subs_epu8 = __builtin_ia32_psubusb128; // older LDC on x86 still has the builtin
}
else
{
    static if (GDC_with_SSE2)
    {
        alias _mm_subs_epu8 = __builtin_ia32_psubusb128;
    }
    else
    {
        /// Subtract packed 8-bit unsigned integers in `b` from packed 8-bit
        /// unsigned integers in `a` using unsigned saturation.
        __m128i _mm_subs_epu8(__m128i a, __m128i b) pure @trusted
        {
            // Portable scalar fallback (DMD and non-SSE2 GDC targets).
            ubyte[16] res;
            byte16 sa = cast(byte16)a;
            byte16 sb = cast(byte16)b;
            foreach(i; 0..16)
                res[i] = saturateSignedWordToUnsignedByte(cast(ubyte)(sa.array[i]) - cast(ubyte)(sb.array[i]));
            return _mm_loadu_si128(cast(int4*)res.ptr);
        }
    }
}
unittest
{
    __m128i A = _mm_setr_epi8(cast(byte)254, 127, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
    __m128i B = _mm_setr_epi8(cast(byte)255, 120, 14, 42, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
    byte16 r = cast(byte16) _mm_subs_epu8(A, B);
    // Differences that would go below zero saturate to 0.
    static immutable byte[16] expected = [0, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0];
    assert(r.array == expected);
}
3988 
// Note: the unordered (`ucomi`) comparisons are implemented as plain aliases of
//       the ordered (`comi`) ones. Architecturally the only difference between
//       the two families is the signalling behaviour on quiet NaN operands, so
//       this aliasing is strictly incorrect — but code that deliberately
//       distinguishes qNaN from sNaN and treats them differently seems
//       extremely rare, so the shortcut is considered acceptable here.
alias _mm_ucomieq_sd = _mm_comieq_sd;   /// Unordered ==, lowest double only.
alias _mm_ucomige_sd = _mm_comige_sd;   /// Unordered >=, lowest double only.
alias _mm_ucomigt_sd = _mm_comigt_sd;   /// Unordered >, lowest double only.
alias _mm_ucomile_sd = _mm_comile_sd;   /// Unordered <=, lowest double only.
alias _mm_ucomilt_sd = _mm_comilt_sd;   /// Unordered <, lowest double only.
alias _mm_ucomineq_sd = _mm_comineq_sd; /// Unordered !=, lowest double only.
3999 
/// Return a `__m128d` with undefined (uninitialized) contents.
__m128d _mm_undefined_pd() pure @safe
{
    __m128d r = void; // deliberately uninitialized
    return r;
}
/// Return a `__m128i` with undefined (uninitialized) contents.
__m128i _mm_undefined_si128() pure @safe
{
    __m128i r = void; // deliberately uninitialized
    return r;
}
4010 
/// Unpack and interleave 16-bit integers from the high half of `a` and `b`.
__m128i _mm_unpackhi_epi16 (__m128i a, __m128i b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        // GDC: direct PUNPCKHWD builtin.
        return __builtin_ia32_punpckhwd128(a, b);
    }
    else static if (DMD_with_32bit_asm)
    {
        // DMD 32-bit: inline asm, result written back through `a`.
        asm pure nothrow @nogc @trusted
        {
            movdqu XMM0, a;
            movdqu XMM1, b;
            punpckhwd XMM0, XMM1;
            movdqu a, XMM0;
        }
        return a;
    }
    else
    {
        // Generic path: lanes 4..7 of `a` interleaved with lanes 4..7 of `b`
        // (indices 12..15 address `b` in the concatenated shuffle input).
        return cast(__m128i) shufflevector!(short8, 4, 12, 5, 13, 6, 14, 7, 15)
                                           (cast(short8)a, cast(short8)b);
    }
}
unittest
{
    short8 res = cast(short8) _mm_unpackhi_epi16(_mm_setr_epi16(4,   5,  6,  7,  8,  9, 10, 11),
                                                 _mm_setr_epi16(12, 13, 14, 15, 16, 17, 18, 19));
    static immutable short[8] expected = [8, 16, 9, 17, 10, 18, 11, 19];
    assert(res.array == expected);
}
4042 
/// Unpack and interleave 32-bit integers from the high half of `a` and `b`.
__m128i _mm_unpackhi_epi32 (__m128i a, __m128i b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        // GDC: direct PUNPCKHDQ builtin.
        return __builtin_ia32_punpckhdq128(a, b);
    }
    else
    {
        // Result lanes: [a2, b2, a3, b3] (indices 6 and 7 address `b`).
        return shufflevector!(int4, 2, 6, 3, 7)(cast(int4)a, cast(int4)b);
    }
}
4054 
/// Unpack and interleave 64-bit integers from the high half of `a` and `b`.
__m128i _mm_unpackhi_epi64 (__m128i a, __m128i b) pure @trusted
{
    static if (GDC_with_SSE2)
    {
        // GDC: direct PUNPCKHQDQ builtin.
        return __builtin_ia32_punpckhqdq128(a, b);
    }
    else
    {
        // Build [a.hi64, b.hi64] explicitly at 64-bit granularity; same idiom
        // as _mm_unpacklo_epi64, instead of patching 32-bit lanes of a copy
        // of `b` (behavior is identical: [a2, a3, b2, b3] as 32-bit lanes).
        long2 lA = cast(long2)a;
        long2 lB = cast(long2)b;
        long2 R;
        R.ptr[0] = lA.array[1];
        R.ptr[1] = lB.array[1];
        return cast(__m128i)R;
    }
}
unittest // Issue #36
{
    __m128i x = _mm_setr_epi64(0x22222222_22222222, 0x33333333_33333333);
    __m128i y = _mm_setr_epi64(0x44444444_44444444, 0x55555555_55555555);
    long2 r = cast(long2) _mm_unpackhi_epi64(x, y);
    // High 64-bit lane of each operand, `x` first.
    static immutable long[2] expected = [0x33333333_33333333, 0x55555555_55555555];
    assert(r.array == expected);
}
4077 
/// Unpack and interleave 8-bit integers from the high half of `a` and `b`.
__m128i _mm_unpackhi_epi8 (__m128i a, __m128i b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        // GDC: direct PUNPCKHBW builtin.
        return __builtin_ia32_punpckhbw128(a, b);
    }
    else static if (DMD_with_32bit_asm)
    {
        // DMD 32-bit: inline asm, result written back through `a`.
        asm pure nothrow @nogc @trusted
        {
            movdqu XMM0, a;
            movdqu XMM1, b;
            punpckhbw XMM0, XMM1;
            movdqu a, XMM0;
        }
        return a;
    }
    else
    {
        // Generic path: bytes 8..15 of `a` interleaved with bytes 8..15 of `b`
        // (indices 24..31 address `b` in the concatenated shuffle input).
        return cast(__m128i)shufflevector!(byte16, 8,  24,  9, 25, 10, 26, 11, 27,
                                                   12, 28, 13, 29, 14, 30, 15, 31)
                                                   (cast(byte16)a, cast(byte16)b);
    }
}
4102 
/// Unpack and interleave double-precision elements from the high half of `a` and `b`.
__m128d _mm_unpackhi_pd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        // GDC: direct UNPCKHPD builtin.
        return __builtin_ia32_unpckhpd(a, b);
    }
    else
    {
        // Result: [a1, b1] (index 3 addresses `b`).
        return shufflevector!(__m128d, 1, 3)(a, b);
    }
}
4114 
/// Unpack and interleave 16-bit integers from the low half of `a` and `b`.
__m128i _mm_unpacklo_epi16 (__m128i a, __m128i b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        // GDC: direct PUNPCKLWD builtin.
        return __builtin_ia32_punpcklwd128(a, b);
    }
    else static if (DMD_with_32bit_asm)
    {
        // DMD 32-bit: inline asm, result written back through `a`.
        asm pure nothrow @nogc @trusted
        {
            movdqu XMM0, a;
            movdqu XMM1, b;
            punpcklwd XMM0, XMM1;
            movdqu a, XMM0;
        }
        return a;
    }
    else
    {
        // Generic path: lanes 0..3 of `a` interleaved with lanes 0..3 of `b`
        // (indices 8..11 address `b` in the concatenated shuffle input).
        return cast(__m128i) shufflevector!(short8, 0, 8, 1, 9, 2, 10, 3, 11)
                                           (cast(short8)a, cast(short8)b);
    }
}
4138 
/// Unpack and interleave 32-bit integers from the low half of `a` and `b`.
__m128i _mm_unpacklo_epi32 (__m128i a, __m128i b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        // GDC: direct PUNPCKLDQ builtin.
        return __builtin_ia32_punpckldq128(a, b);
    }
    else
    {
        // Result lanes: [a0, b0, a1, b1] (indices 4 and 5 address `b`).
        return shufflevector!(int4, 0, 4, 1, 5)
                             (cast(int4)a, cast(int4)b);
    }
}
4151 
/// Unpack and interleave 64-bit integers from the low half of `a` and `b`.
__m128i _mm_unpacklo_epi64 (__m128i a, __m128i b) pure @trusted
{
    static if (GDC_with_SSE2)
    {
        // GDC: direct PUNPCKLQDQ builtin.
        return __builtin_ia32_punpcklqdq128(a, b);
    }
    else
    {
        // Build [a.lo64, b.lo64] at 64-bit granularity.
        long2 la = cast(long2)a;
        long2 lb = cast(long2)b;
        long2 res;
        res.ptr[0] = la.array[0];
        res.ptr[1] = lb.array[0];
        return cast(__m128i)res;
    }
}
unittest // Issue #36
{
    __m128i x = _mm_setr_epi64(0x22222222_22222222, 0x33333333_33333333);
    __m128i y = _mm_setr_epi64(0x44444444_44444444, 0x55555555_55555555);
    long2 r = cast(long2) _mm_unpacklo_epi64(x, y);
    // Low 64-bit lane of each operand, `x` first.
    static immutable long[2] expected = [0x22222222_22222222, 0x44444444_44444444];
    assert(r.array == expected);
}
4176 
4177 
/// Unpack and interleave 8-bit integers from the low half of `a` and `b`.
__m128i _mm_unpacklo_epi8 (__m128i a, __m128i b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        // GDC: direct PUNPCKLBW builtin.
        return __builtin_ia32_punpcklbw128(a, b);
    }
    else static if (DMD_with_32bit_asm)
    {
        // DMD 32-bit: inline asm, result written back through `a`.
        asm pure nothrow @nogc @trusted
        {
            movdqu XMM0, a;
            movdqu XMM1, b;
            punpcklbw XMM0, XMM1;
            movdqu a, XMM0;
        }
        return a;
    }
    else
    {
        // Generic path: bytes 0..7 of `a` interleaved with bytes 0..7 of `b`
        // (indices 16..23 address `b` in the concatenated shuffle input).
        return cast(__m128i) shufflevector!(byte16, 0, 16, 1, 17, 2, 18, 3, 19,
                                                    4, 20, 5, 21, 6, 22, 7, 23)
                                           (cast(byte16)a, cast(byte16)b);
    }
}
4202 
/// Unpack and interleave double-precision elements from the low half of `a` and `b`.
__m128d _mm_unpacklo_pd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        // GDC: direct UNPCKLPD builtin.
        return __builtin_ia32_unpcklpd(a, b);
    }
    else
    {
        // Result: [a0, b0] (index 2 addresses `b`).
        return shufflevector!(__m128d, 0, 2)(a, b);
    }
}
4214 
/// Compute the bitwise XOR of the 128 bits of `a` and `b`,
/// viewed as packed double-precision elements.
__m128d _mm_xor_pd (__m128d a, __m128d b) pure @safe
{
    // XOR is performed on the raw integer bit pattern; lane width is irrelevant.
    __m128i ia = cast(__m128i)a;
    __m128i ib = cast(__m128i)b;
    return cast(__m128d)(ia ^ ib);
}
4219 
/// Compute the bitwise XOR of 128 bits (representing integer data) in `a` and `b`.
__m128i _mm_xor_si128 (__m128i a, __m128i b) pure @safe
{
    return a ^ b;
}
4224 
unittest
{
    // Integration-style smoke test combining several SSE/SSE2 intrinsics:
    // Euclidean distance between two points in 4D.
    float distance(float[4] a, float[4] b) nothrow @nogc
    {
        __m128 va = _mm_loadu_ps(a.ptr);
        __m128 vb = _mm_loadu_ps(b.ptr);
        __m128 diffSquared = _mm_sub_ps(va, vb);
        diffSquared = _mm_mul_ps(diffSquared, diffSquared);
        // Horizontal sum of the four squared differences; the total ends up in
        // lane 0, then sqrt of that lane is extracted.
        // NOTE(review): assumes `_mm_srli_ps!n` (an inteli helper, not an Intel
        // intrinsic) shifts the whole vector right by `n` bytes — confirm in
        // inteli.xmmintrin.
        __m128 sum = _mm_add_ps(diffSquared, _mm_srli_ps!8(diffSquared));
        sum = _mm_add_ps(sum, _mm_srli_ps!4(sum));
        return _mm_cvtss_f32(_mm_sqrt_ss(sum));
    }
    assert(distance([0, 2, 0, 0], [0, 0, 0, 0]) == 2);
}