1 /**
2 * Copyright: Copyright Auburn Sounds 2016-2019, Stefanos Baziotis 2019.
3 * License:   $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
4 * Authors:   Guillaume Piolat
5 */
6 module inteli.emmintrin;
7 
8 public import inteli.types;
9 public import inteli.xmmintrin; // SSE2 includes SSE1
10 import inteli.mmx;
11 import inteli.internals;
12 
13 nothrow @nogc:
14 
15 
16 // SSE2 instructions
17 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#techs=SSE2
18 
/// Add packed 16-bit integers in `a` and `b`.
__m128i _mm_add_epi16 (__m128i a, __m128i b) pure @safe
{
    // Reinterpret both operands as 8 signed shorts and add lane-wise.
    short8 sum = cast(short8)a + cast(short8)b;
    return cast(__m128i) sum;
}
unittest
{
    __m128i v = _mm_setr_epi16(4, 8, 13, -7, -1, 0, 9, 77);
    short8 doubled = cast(short8) _mm_add_epi16(v, v);
    short[8] expected = [8, 16, 26, -14, -2, 0, 18, 154];
    assert(doubled.array == expected);
}
31 
/// Add packed 32-bit integers in `a` and `b`.
__m128i _mm_add_epi32 (__m128i a, __m128i b) pure @safe
{
    // Reinterpret both operands as 4 signed ints and add lane-wise.
    int4 sum = cast(int4)a + cast(int4)b;
    return cast(__m128i) sum;
}
unittest
{
    __m128i v = _mm_setr_epi32(-7, -1, 0, 9);
    int4 doubled = _mm_add_epi32(v, v);
    int[4] expected = [-14, -2, 0, 18];
    assert(doubled.array == expected);
}
44 
/// Add packed 64-bit integers in `a` and `b`.
__m128i _mm_add_epi64 (__m128i a, __m128i b) pure @safe
{
    // Reinterpret both operands as 2 signed longs and add lane-wise.
    long2 sum = cast(long2)a + cast(long2)b;
    return cast(__m128i) sum;
}
unittest
{
    // long.min doubled wraps around to 0.
    __m128i v = _mm_setr_epi64(-1, 0x8000_0000_0000_0000);
    long2 doubled = cast(long2) _mm_add_epi64(v, v);
    long[2] expected = [-2, 0];
    assert(doubled.array == expected);
}
57 
/// Add packed 8-bit integers in `a` and `b`.
__m128i _mm_add_epi8 (__m128i a, __m128i b) pure @safe
{
    // Reinterpret both operands as 16 signed bytes and add lane-wise.
    byte16 sum = cast(byte16)a + cast(byte16)b;
    return cast(__m128i) sum;
}
unittest
{
    // 77+77 and 78+78 wrap around (no saturation in this intrinsic).
    __m128i v = _mm_setr_epi8(4, 8, 13, -7, -1, 0, 9, 77, 4, 8, 13, -7, -1, 0, 9, 78);
    byte16 doubled = cast(byte16) _mm_add_epi8(v, v);
    byte[16] expected = [8, 16, 26, -14, -2, 0, 18, -102, 8, 16, 26, -14, -2, 0, 18, -100];
    assert(doubled.array == expected);
}
70 
/// Add the lower double-precision (64-bit) floating-point element 
/// in `a` and `b`, store the result in the lower element of dst, 
/// and copy the upper element from `a` to the upper element of destination. 
__m128d _mm_add_sd(__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_addsd(a, b);
    }
    else version(DigitalMars)
    {
        // Work-around for https://issues.dlang.org/show_bug.cgi?id=19599
        // NOTE(review): the empty asm block appears to act as an optimization
        // barrier for DMD — confirm it is still required when bumping the
        // minimum supported DMD version.
        asm pure nothrow @nogc @trusted { nop;}
        a[0] = a[0] + b[0];
        return a;
    }
    else
    {
        // Scalar add on lane 0; lane 1 keeps `a`'s upper element.
        a[0] += b[0];
        return a;
    }
}
unittest
{
    __m128d a = [1.5, -2.0];
    a = _mm_add_sd(a, a);
    assert(a.array == [3.0, -2.0]);
}
99 
/// Add packed double-precision (64-bit) floating-point elements in `a` and `b`.
__m128d _mm_add_pd (__m128d a, __m128d b) pure @safe
{
    // Plain vector addition; both lanes are added independently.
    __m128d sum = a + b;
    return sum;
}
unittest
{
    __m128d v = [1.5, -2.0];
    v = _mm_add_pd(v, v);
    assert(v.array == [3.0, -4.0]);
}
111 
/// Add 64-bit integers `a` and `b`.
__m64 _mm_add_si64 (__m64 a, __m64 b) pure @safe
{
    // Single-lane 64-bit addition.
    __m64 sum = a + b;
    return sum;
}
117 
/// Add packed 16-bit integers in `a` and `b` using signed saturation.
__m128i _mm_adds_epi16(__m128i a, __m128i b) pure @trusted
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_paddsw128(a, b);
    }
    else version(LDC)
    {
        static if (__VERSION__ >= 2085) // saturation x86 intrinsics disappeared in LLVM 8
        {
            // Generates PADDSW since LDC 1.15 -O0
            // Uses the target-independent LLVM saturating-add intrinsic.
            enum prefix = `declare <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> %a, <8 x i16> %b)`;
            enum ir = `
                %r = call <8 x i16> @llvm.sadd.sat.v8i16( <8 x i16> %0, <8 x i16> %1)
                ret <8 x i16> %r`;
            return cast(__m128i) LDCInlineIREx!(prefix, ir, "", short8, short8, short8)(cast(short8)a, cast(short8)b);
        }
        else
            return __builtin_ia32_paddsw128(a, b);
    }
    else
    {
        // Portable fallback: add in `int` width, then clamp each lane to
        // [short.min, short.max].
        short[8] res;
        short8 sa = cast(short8)a;
        short8 sb = cast(short8)b;
        foreach(i; 0..8)
            res[i] = saturateSignedIntToSignedShort(sa.array[i] + sb.array[i]);
        return _mm_loadu_si128(cast(int4*)res.ptr);
    }
}
unittest
{
    short8 res = cast(short8) _mm_adds_epi16(_mm_set_epi16(7, 6, 5, 4, 3, 2, 1, 0),
                                             _mm_set_epi16(7, 6, 5, 4, 3, 2, 1, 0));
    static immutable short[8] correctResult = [0, 2, 4, 6, 8, 10, 12, 14];
    assert(res.array == correctResult);
}
156 
/// Add packed 8-bit signed integers in `a` and `b` using signed saturation.
__m128i _mm_adds_epi8(__m128i a, __m128i b) pure @trusted
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_paddsb128(a, b);
    }
    else version(LDC)
    {
        static if (__VERSION__ >= 2085) // saturation x86 intrinsics disappeared in LLVM 8
        {
            // Generates PADDSB since LDC 1.15 -O0
            // Uses the target-independent LLVM saturating-add intrinsic.
            enum prefix = `declare <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8> %a, <16 x i8> %b)`;
            enum ir = `
                %r = call <16 x i8> @llvm.sadd.sat.v16i8( <16 x i8> %0, <16 x i8> %1)
                ret <16 x i8> %r`;
            return cast(__m128i) LDCInlineIREx!(prefix, ir, "", byte16, byte16, byte16)(cast(byte16)a, cast(byte16)b);
        }
        else
            return __builtin_ia32_paddsb128(a, b);
    }
    else
    {
        // Portable fallback: add in `int` width, then clamp each lane to
        // [byte.min, byte.max].
        // Element access goes through `.array[i]`, consistently with the
        // `_mm_adds_epi16` fallback (direct vector indexing is not supported
        // uniformly across frontends).
        byte[16] res;
        byte16 sa = cast(byte16)a;
        byte16 sb = cast(byte16)b;
        foreach(i; 0..16)
            res[i] = saturateSignedWordToSignedByte(sa.array[i] + sb.array[i]);
        return _mm_loadu_si128(cast(int4*)res.ptr);
    }
}
unittest
{
    byte16 res = cast(byte16) _mm_adds_epi8(_mm_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0),
                                            _mm_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0));
    static immutable byte[16] correctResult = [0, 2, 4, 6, 8, 10, 12, 14,
                                               16, 18, 20, 22, 24, 26, 28, 30];
    assert(res.array == correctResult);
}
196 
/// Add packed 8-bit unsigned integers in `a` and `b` using unsigned saturation.
__m128i _mm_adds_epu8(__m128i a, __m128i b) pure @trusted
{
    version(LDC)
    {
        static if (__VERSION__ >= 2085) // saturation x86 intrinsics disappeared in LLVM 8
        {
            // Generates PADDUSB since LDC 1.15 -O0
            // Uses the target-independent LLVM unsigned saturating-add intrinsic.
            enum prefix = `declare <16 x i8> @llvm.uadd.sat.v16i8(<16 x i8> %a, <16 x i8> %b)`;
            enum ir = `
                %r = call <16 x i8> @llvm.uadd.sat.v16i8( <16 x i8> %0, <16 x i8> %1)
                ret <16 x i8> %r`;
            return cast(__m128i) LDCInlineIREx!(prefix, ir, "", byte16, byte16, byte16)(cast(byte16)a, cast(byte16)b);
        }
        else
            return __builtin_ia32_paddusb128(a, b);
    }
    else
    {
        // Portable fallback: reinterpret lanes as unsigned, add in `int`
        // width, then clamp each lane to [0, 255].
        ubyte[16] res;
        byte16 sa = cast(byte16)a;
        byte16 sb = cast(byte16)b;
        foreach(i; 0..16)
            res[i] = saturateSignedWordToUnsignedByte(cast(ubyte)(sa.array[i]) + cast(ubyte)(sb.array[i]));
        return _mm_loadu_si128(cast(int4*)res.ptr);
    }
}
unittest
{
    byte16 res = cast(byte16) _mm_adds_epu8(_mm_set_epi8(7, 6, 5, 4, 3, 2, cast(byte)255, 0, 7, 6, 5, 4, 3, 2, cast(byte)255, 0),
                                            _mm_set_epi8(7, 6, 5, 4, 3, 2, 1, 0, 7, 6, 5, 4, 3, 2, 1, 0));
    // 255 + 1 saturates to 255 (shown here reinterpreted as the signed byte -1).
    static immutable byte[16] correctResult = [0, cast(byte)255, 4, 6, 8, 10, 12, 14, 0, cast(byte)255, 4, 6, 8, 10, 12, 14];
    assert(res.array == correctResult);
}
231 
/// Add packed unsigned 16-bit integers in `a` and `b` using unsigned saturation.
__m128i _mm_adds_epu16(__m128i a, __m128i b) pure @trusted
{
    version(LDC)
    {
        static if (__VERSION__ >= 2085) // saturation x86 intrinsics disappeared in LLVM 8
        {
            // Generates PADDUSW since LDC 1.15 -O0
            // Uses the target-independent LLVM unsigned saturating-add intrinsic.
            enum prefix = `declare <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> %a, <8 x i16> %b)`;
            enum ir = `
                %r = call <8 x i16> @llvm.uadd.sat.v8i16( <8 x i16> %0, <8 x i16> %1)
                ret <8 x i16> %r`;
            return cast(__m128i) LDCInlineIREx!(prefix, ir, "", short8, short8, short8)(cast(short8)a, cast(short8)b);
        }
        else
            return __builtin_ia32_paddusw128(a, b);
    }
    else
    {
        // Portable fallback: reinterpret lanes as unsigned, add in `int`
        // width, then clamp each lane to [0, 65535].
        ushort[8] res;
        short8 sa = cast(short8)a;
        short8 sb = cast(short8)b;
        foreach(i; 0..8)
            res[i] = saturateSignedIntToUnsignedShort(cast(ushort)(sa.array[i]) + cast(ushort)(sb.array[i]));
        return _mm_loadu_si128(cast(int4*)res.ptr);
    }
}
unittest
{
    short8 res = cast(short8) _mm_adds_epu16(_mm_set_epi16(3, 2, cast(short)65535, 0, 3, 2, cast(short)65535, 0),
                                             _mm_set_epi16(3, 2, 1, 0, 3, 2, 1, 0));
    // 65535 + 1 saturates to 65535 (shown here reinterpreted as the signed short -1).
    static immutable short[8] correctResult = [0, cast(short)65535, 4, 6, 0, cast(short)65535, 4, 6];
    assert(res.array == correctResult);
}
266 
/// Compute the bitwise AND of packed double-precision (64-bit) 
/// floating-point elements in `a` and `b`.
__m128d _mm_and_pd (__m128d a, __m128d b) pure @safe
{
    // Bitwise ops are not defined on float vectors: go through long2.
    long2 bits = cast(long2)a & cast(long2)b;
    return cast(__m128d) bits;
}
unittest
{
    double x = 4.32;
    double y = -78.99;
    long expected = (*cast(long*)(&x)) & (*cast(long*)(&y));
    __m128d A = _mm_set_pd(x, y);
    __m128d B = _mm_set_pd(y, x);
    long2 R = cast(long2)( _mm_and_pd(A, B) );
    assert(R.array[0] == expected);
    assert(R.array[1] == expected);
}
284 
/// Compute the bitwise AND of 128 bits (representing integer data) in `a` and `b`.
__m128i _mm_and_si128 (__m128i a, __m128i b) pure @safe
{
    // Plain vector AND.
    __m128i masked = a & b;
    return masked;
}
unittest
{
    __m128i lhs = _mm_set1_epi32(7);
    __m128i rhs = _mm_set1_epi32(14);
    __m128i masked = _mm_and_si128(lhs, rhs);
    int[4] expected = [6, 6, 6, 6];
    assert(masked.array == expected);
}
298 
/// Compute the bitwise NOT of packed double-precision (64-bit) 
/// floating-point elements in `a` and then AND with `b`.
__m128d _mm_andnot_pd (__m128d a, __m128d b) pure @safe
{
    // Bitwise ops are not defined on float vectors: go through long2.
    long2 notA = ~cast(long2)a;
    return cast(__m128d)( notA & cast(long2)b );
}
unittest
{
    double x = 4.32;
    double y = -78.99;
    long expected0 = (~*cast(long*)(&x)) & ( *cast(long*)(&y));
    long expected1 = ( *cast(long*)(&x)) & (~*cast(long*)(&y));
    __m128d A = _mm_setr_pd(x, y);
    __m128d B = _mm_setr_pd(y, x);
    long2 R = cast(long2)( _mm_andnot_pd(A, B) );
    assert(R.array[0] == expected0);
    assert(R.array[1] == expected1);
}
317 
/// Compute the bitwise NOT of 128 bits (representing integer data) 
/// in `a` and then AND with `b`.
__m128i _mm_andnot_si128 (__m128i a, __m128i b) pure @safe
{
    // Note: only `a` is complemented, matching the PANDN instruction.
    __m128i notA = ~a;
    return notA & b;
}
unittest
{
    __m128i lhs = _mm_set1_epi32(7);
    __m128i rhs = _mm_set1_epi32(14);
    __m128i masked = _mm_andnot_si128(lhs, rhs);
    int[4] expected = [8, 8, 8, 8];
    assert(masked.array == expected);
}
332 
/// Average packed unsigned 16-bit integers in `a` and `b`.
__m128i _mm_avg_epu16 (__m128i a, __m128i b) pure @trusted
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_pavgw128(a, b);
    }
    else version(LDC)
    {
        // Generates pavgw even in LDC 1.0, even in -O0
        // Rounding average: zero-extend to 32-bit, (a + b + 1) >> 1, truncate back.
        enum ir = `
            %ia = zext <8 x i16> %0 to <8 x i32>
            %ib = zext <8 x i16> %1 to <8 x i32>
            %isum = add <8 x i32> %ia, %ib
            %isum1 = add <8 x i32> %isum, < i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
            %isums = lshr <8 x i32> %isum1, < i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
            %r = trunc <8 x i32> %isums to <8 x i16>
            ret <8 x i16> %r`;
        return cast(__m128i) LDCInlineIR!(ir, short8, short8, short8)(cast(short8)a, cast(short8)b);
    }
    else
    {
        // Portable fallback: same rounding average, computed per lane in
        // `int` width so the +1 cannot overflow 16 bits.
        short8 sa = cast(short8)a;
        short8 sb = cast(short8)b;
        short8 sr = void;
        foreach(i; 0..8)
        {
            sr.ptr[i] = cast(ushort)( (cast(ushort)(sa.array[i]) + cast(ushort)(sb.array[i]) + 1) >> 1 );
        }
        return cast(int4)sr;
    }
}
unittest
{
    __m128i A = _mm_set1_epi16(31);
    __m128i B = _mm_set1_epi16(64);
    short8 avg = cast(short8)(_mm_avg_epu16(A, B));
    foreach(i; 0..8)
        assert(avg.array[i] == 48); // (31 + 64 + 1) >> 1
}
373 
/// Average packed unsigned 8-bit integers in `a` and `b`.
__m128i _mm_avg_epu8 (__m128i a, __m128i b) pure @trusted
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_pavgb128(a, b);
    }
    else version(LDC)
    {
        // Generates pavgb even in LDC 1.0, even in -O0
        // Rounding average: zero-extend to 16-bit, (a + b + 1) >> 1, truncate back.
        enum ir = `
            %ia = zext <16 x i8> %0 to <16 x i16>
            %ib = zext <16 x i8> %1 to <16 x i16>
            %isum = add <16 x i16> %ia, %ib
            %isum1 = add <16 x i16> %isum, < i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
            %isums = lshr <16 x i16> %isum1, < i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
            %r = trunc <16 x i16> %isums to <16 x i8>
            ret <16 x i8> %r`;
        return cast(__m128i) LDCInlineIR!(ir, byte16, byte16, byte16)(cast(byte16)a, cast(byte16)b);
    }
    else
    {
        // Portable fallback: same rounding average, computed per lane in
        // `int` width so the +1 cannot overflow 8 bits.
        // Element access goes through `.ptr[i]` / `.array[i]`, consistently
        // with the `_mm_avg_epu16` fallback (direct vector indexing is not
        // supported uniformly across frontends).
        byte16 sa = cast(byte16)a;
        byte16 sb = cast(byte16)b;
        byte16 sr = void;
        foreach(i; 0..16)
        {
            sr.ptr[i] = cast(ubyte)( (cast(ubyte)(sa.array[i]) + cast(ubyte)(sb.array[i]) + 1) >> 1 );
        }
        return cast(int4)sr;
    }
}
unittest
{
    __m128i A = _mm_set1_epi8(31);
    __m128i B = _mm_set1_epi8(64);
    byte16 avg = cast(byte16)(_mm_avg_epu8(A, B));
    foreach(i; 0..16)
        assert(avg.array[i] == 48); // (31 + 64 + 1) >> 1
}
414 
/// Shift `a` left by `bytes` bytes while shifting in zeros.
alias _mm_bslli_si128 = _mm_slli_si128;
unittest
{
    __m128i input = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
    byte[16] expected =          [0, 0, 0, 0, 0, 0, 1, 2, 3, 4,  5,  6,  7,  8,  9, 10];
    __m128i shifted = _mm_bslli_si128!5(input);
    assert( (cast(byte16)shifted).array == expected);
}
424 
/// Shift `v` right by `bytes` bytes while shifting in zeros.
alias _mm_bsrli_si128 = _mm_srli_si128;
unittest
{
    __m128i input = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
    byte[16] expected =          [5, 6, 7, 8, 9,10,11,12,13,14, 15,  0,  0,  0,  0,  0];
    __m128i shifted = _mm_bsrli_si128!5(input);
    assert( (cast(byte16)shifted).array == expected);
}
434 
/// Cast vector of type `__m128d` to type `__m128`. 
/// Note: Also possible with a regular `cast(__m128)(a)`.
/// This only reinterprets the same 128 bits; no value conversion occurs.
__m128 _mm_castpd_ps (__m128d a) pure @safe
{
    return cast(__m128)a;
}

/// Cast vector of type `__m128d` to type `__m128i`. 
/// Note: Also possible with a regular `cast(__m128i)(a)`.
/// This only reinterprets the same 128 bits; no value conversion occurs.
__m128i _mm_castpd_si128 (__m128d a) pure @safe
{
    return cast(__m128i)a;
}

/// Cast vector of type `__m128` to type `__m128d`. 
/// Note: Also possible with a regular `cast(__m128d)(a)`.
/// This only reinterprets the same 128 bits; no value conversion occurs.
__m128d _mm_castps_pd (__m128 a) pure @safe
{
    return cast(__m128d)a;
}

/// Cast vector of type `__m128` to type `__m128i`. 
/// Note: Also possible with a regular `cast(__m128i)(a)`.
/// This only reinterprets the same 128 bits; no value conversion occurs.
__m128i _mm_castps_si128 (__m128 a) pure @safe
{
    return cast(__m128i)a;
}

/// Cast vector of type `__m128i` to type `__m128d`. 
/// Note: Also possible with a regular `cast(__m128d)(a)`.
/// This only reinterprets the same 128 bits; no value conversion occurs.
__m128d _mm_castsi128_pd (__m128i a) pure @safe
{
    return cast(__m128d)a;
}

/// Cast vector of type `__m128i` to type `__m128`. 
/// Note: Also possible with a regular `cast(__m128)(a)`.
/// This only reinterprets the same 128 bits; no value conversion occurs.
__m128 _mm_castsi128_ps (__m128i a) pure @safe
{
    return cast(__m128)a;
}
476 
/// Invalidate and flush the cache line that contains `p` 
/// from all levels of the cache hierarchy.
void _mm_clflush (const(void)* p) @trusted
{
    static if (GDC_with_SSE2)
    {
        __builtin_ia32_clflush(p);
    }
    else version(LDC)
    {
        // The LDC builtin takes a mutable pointer, hence the cast.
        __builtin_ia32_clflush(cast(void*)p);
    }
    else version(D_InlineAsm_X86)
    {
        asm pure nothrow @nogc @safe
        {
            mov EAX, p;
            clflush [EAX];
        }
    }
    else version(D_InlineAsm_X86_64)
    {
        asm pure nothrow @nogc @safe
        {
            mov RAX, p;
            clflush [RAX];
        }
    }
    else 
    {
        // Do nothing. Invalidating cacheline does
        // not affect correctness.            
    }
}
unittest
{
    ubyte[64] cacheline;
    _mm_clflush(cacheline.ptr);
}
516 
/// Compare packed 16-bit integers in `a` and `b` for equality.
/// Each lane of the result is all-ones (-1) where equal, 0 elsewhere.
__m128i _mm_cmpeq_epi16 (__m128i a, __m128i b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_pcmpeqw128(a, b);
    }
    else
    {
        return cast(__m128i) equalMask!short8(cast(short8)a, cast(short8)b);
    }
}
unittest
{
    short8   A = [-3, -2, -1,  0,  0,  1,  2,  3];
    short8   B = [ 4,  3,  2,  1,  0, -1, -2, -3];
    short[8] E = [ 0,  0,  0,  0, -1,  0,  0,  0];
    short8   R = cast(short8)(_mm_cmpeq_epi16(cast(__m128i)A, cast(__m128i)B));
    assert(R.array == E);
}
537 
/// Compare packed 32-bit integers in `a` and `b` for equality.
/// Each lane of the result is all-ones (-1) where equal, 0 elsewhere.
__m128i _mm_cmpeq_epi32 (__m128i a, __m128i b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_pcmpeqd128(a, b);
    }
    else
    {
        return equalMask!__m128i(a, b);
    }
}
unittest
{
    int4   A = [-3, -2, -1,  0];
    int4   B = [ 4, -2,  2,  0];
    int[4] E = [ 0, -1,  0, -1];
    // BUGFIX: this test previously called _mm_cmpeq_epi16 (copy-paste error),
    // so _mm_cmpeq_epi32 itself was never exercised.
    int4   R = cast(int4)(_mm_cmpeq_epi32(A, B));
    assert(R.array == E);
}
558 
/// Compare packed 8-bit integers in `a` and `b` for equality.
/// Each lane of the result is all-ones (-1) where equal, 0 elsewhere.
__m128i _mm_cmpeq_epi8 (__m128i a, __m128i b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_pcmpeqb128(a, b); 
    }
    else
    {
        return cast(__m128i) equalMask!byte16(cast(byte16)a, cast(byte16)b);
    }
}
unittest
{
    __m128i A = _mm_setr_epi8(1, 2, 3, 1, 2, 1, 1, 2, 3, 2, 1, 0, 0, 1, 2, 1);
    __m128i B = _mm_setr_epi8(2, 2, 1, 2, 3, 1, 2, 3, 2, 1, 0, 0, 1, 2, 1, 1);
    byte16 C = cast(byte16) _mm_cmpeq_epi8(A, B);
    byte[16] correct =       [0,-1, 0, 0, 0,-1, 0, 0, 0, 0, 0,-1, 0, 0, 0, -1];
    assert(C.array == correct);
}
579 
/// Compare packed double-precision (64-bit) floating-point elements 
/// in `a` and `b` for equality.
__m128d _mm_cmpeq_pd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmpeqpd(a, b);
    }
    else
    {
        // oeq: ordered-equal — false when either operand is NaN.
        return cast(__m128d) cmppd!(FPComparison.oeq)(a, b);
    }
}

/// Compare the lower double-precision (64-bit) floating-point elements
/// in `a` and `b` for equality, store the result in the lower element,
/// and copy the upper element from `a`.
__m128d _mm_cmpeq_sd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmpeqsd(a, b);
    }
    else
    {
        // oeq: ordered-equal — false when either operand is NaN.
        return cast(__m128d) cmpsd!(FPComparison.oeq)(a, b);
    }
}
608 
/// Compare packed double-precision (64-bit) floating-point elements 
/// in `a` and `b` for greater-than-or-equal.
__m128d _mm_cmpge_pd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmpgepd(a, b);
    }
    else
    {
        // oge: ordered greater-or-equal — false when either operand is NaN.
        return cast(__m128d) cmppd!(FPComparison.oge)(a, b);
    }
}
622 
/// Compare the lower double-precision (64-bit) floating-point elements 
/// in `a` and `b` for greater-than-or-equal, store the result in the 
/// lower element, and copy the upper element from `a`.
__m128d _mm_cmpge_sd (__m128d a, __m128d b) pure @safe
{
    // Note: There is no __builtin_ia32_cmpgesd builtin.
    static if (GDC_with_SSE2)
    {
        // `a >= b` (ordered) is `b <= a` with operands swapped; the outer
        // movsd restores `a`'s upper element, which the swapped compare had
        // replaced with `b`'s. This mirrors GCC's own emmintrin.h.
        // BUGFIX: previously returned __builtin_ia32_cmpnltsd(b, a), which
        // computes !(b < a) — the not-greater-than predicate, true on NaN —
        // and kept `b`'s upper element.
        return __builtin_ia32_movsd(a, __builtin_ia32_cmplesd(b, a));
    }
    else
    {
        return cast(__m128d) cmpsd!(FPComparison.oge)(a, b);
    }
}
638 
/// Compare packed 16-bit integers in `a` and `b` for greater-than.
/// Each lane of the result is all-ones (-1) where a > b (signed), 0 elsewhere.
__m128i _mm_cmpgt_epi16 (__m128i a, __m128i b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_pcmpgtw128(a, b); 
    }
    else
    {
        return cast(__m128i)( greaterMask!short8(cast(short8)a, cast(short8)b));
    }
}
unittest
{
    short8   A = [-3, -2, -1,  0,  0,  1,  2,  3];
    short8   B = [ 4,  3,  2,  1,  0, -1, -2, -3];
    short[8] E = [ 0,  0,  0,  0,  0, -1, -1, -1];
    short8   R = cast(short8)(_mm_cmpgt_epi16(cast(__m128i)A, cast(__m128i)B));
    assert(R.array == E);
}

/// Compare packed 32-bit integers in `a` and `b` for greater-than.
/// Each lane of the result is all-ones (-1) where a > b (signed), 0 elsewhere.
__m128i _mm_cmpgt_epi32 (__m128i a, __m128i b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_pcmpgtd128(a, b); 
    }
    else
    {
        return cast(__m128i)( greaterMask!int4(a, b));
    }
}
unittest
{
    int4   A = [-3,  2, -1,  0];
    int4   B = [ 4, -2,  2,  0];
    int[4] E = [ 0, -1,  0,  0];
    int4   R = cast(int4)(_mm_cmpgt_epi32(A, B));
    assert(R.array == E);
}
680 
/// Compare packed 8-bit integers in `a` and `b` for greater-than.
/// Each lane of the result is all-ones (-1) where a > b (signed), 0 elsewhere.
__m128i _mm_cmpgt_epi8 (__m128i a, __m128i b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_pcmpgtb128(a, b); 
    }
    else
    {
        return cast(__m128i)( greaterMask!byte16(cast(byte16)a, cast(byte16)b));
    }
}
unittest
{
    __m128i A = _mm_setr_epi8(1, 2, 3, 1, 2, 1, 1, 2, 3, 2, 1, 0, 0, 1, 2, 1);
    __m128i B = _mm_setr_epi8(2, 2, 1, 2, 3, 1, 2, 3, 2, 1, 0, 0, 1, 2, 1, 1);
    byte16 C = cast(byte16) _mm_cmpgt_epi8(A, B);
    byte[16] correct =       [0, 0,-1, 0, 0, 0, 0, 0,-1,-1,-1, 0, 0, 0,-1, 0];
    // Removed a stray, unused `_mm_cmpeq_epi8(A, B)` call that was left here.
    assert(C.array == correct);
}
702 
/// Compare packed double-precision (64-bit) floating-point elements 
/// in `a` and `b` for greater-than.
__m128d _mm_cmpgt_pd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmpgtpd(a, b); 
    }
    else
    {
        // ogt: ordered greater-than — false when either operand is NaN.
        return cast(__m128d) cmppd!(FPComparison.ogt)(a, b);
    }
}
716 
/// Compare the lower double-precision (64-bit) floating-point elements 
/// in `a` and `b` for greater-than, store the result in the lower element,
/// and copy the upper element from `a`.
__m128d _mm_cmpgt_sd (__m128d a, __m128d b) pure @safe
{
    // Note: There is no __builtin_ia32_cmpgtsd builtin.
    static if (GDC_with_SSE2)
    {
        // `a > b` (ordered) is `b < a` with operands swapped; the outer movsd
        // restores `a`'s upper element, which the swapped compare had replaced
        // with `b`'s. This mirrors GCC's own emmintrin.h.
        // BUGFIX: previously returned __builtin_ia32_cmpnlesd(b, a), which
        // computes !(b <= a) — the not-greater-or-equal predicate, true on
        // NaN — and kept `b`'s upper element.
        return __builtin_ia32_movsd(a, __builtin_ia32_cmpltsd(b, a));
    }
    else
    {
        return cast(__m128d) cmpsd!(FPComparison.ogt)(a, b);
    }
}
732 
/// Compare packed double-precision (64-bit) floating-point elements 
/// in `a` and `b` for less-than-or-equal.
__m128d _mm_cmple_pd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmplepd(a, b); 
    }
    else
    {
        // ole: ordered less-or-equal — false when either operand is NaN.
        return cast(__m128d) cmppd!(FPComparison.ole)(a, b);
    }
}

/// Compare the lower double-precision (64-bit) floating-point elements 
/// in `a` and `b` for less-than-or-equal, store the result in the 
/// lower element, and copy the upper element from `a`.
__m128d _mm_cmple_sd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmplesd(a, b); 
    }
    else
    {
        // ole: ordered less-or-equal — false when either operand is NaN.
        return cast(__m128d) cmpsd!(FPComparison.ole)(a, b);
    }
}
761 
/// Compare packed 16-bit integers in `a` and `b` for less-than.
__m128i _mm_cmplt_epi16 (__m128i a, __m128i b) pure @safe
{
    // a < b is b > a with operands swapped (there is no PCMPLT instruction).
    return _mm_cmpgt_epi16(b, a);
}

/// Compare packed 32-bit integers in `a` and `b` for less-than.
__m128i _mm_cmplt_epi32 (__m128i a, __m128i b) pure @safe
{
    // a < b is b > a with operands swapped (there is no PCMPLT instruction).
    return _mm_cmpgt_epi32(b, a);
}

/// Compare packed 8-bit integers in `a` and `b` for less-than.
__m128i _mm_cmplt_epi8 (__m128i a, __m128i b) pure @safe
{
    // a < b is b > a with operands swapped (there is no PCMPLT instruction).
    return _mm_cmpgt_epi8(b, a);
}
779 
/// Compare packed double-precision (64-bit) floating-point elements
/// in `a` and `b` for less-than.
__m128d _mm_cmplt_pd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmpltpd(a, b); 
    }
    else
    {
        // olt: ordered less-than — false when either operand is NaN.
        return cast(__m128d) cmppd!(FPComparison.olt)(a, b);
    }
}

/// Compare the lower double-precision (64-bit) floating-point elements
/// in `a` and `b` for less-than, store the result in the lower 
/// element, and copy the upper element from `a`.
__m128d _mm_cmplt_sd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmpltsd(a, b); 
    }
    else
    {
        // olt: ordered less-than — false when either operand is NaN.
        return cast(__m128d) cmpsd!(FPComparison.olt)(a, b);
    }
}

/// Compare packed double-precision (64-bit) floating-point elements
/// in `a` and `b` for not-equal.
__m128d _mm_cmpneq_pd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmpneqpd(a, b); 
    }
    else
    {
        // une: unordered not-equal — also true when either operand is NaN.
        return cast(__m128d) cmppd!(FPComparison.une)(a, b);
    }
}

/// Compare the lower double-precision (64-bit) floating-point elements
/// in `a` and `b` for not-equal, store the result in the lower 
/// element, and copy the upper element from `a`.
__m128d _mm_cmpneq_sd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmpneqsd(a, b); 
    }
    else
    {
        // une: unordered not-equal — also true when either operand is NaN.
        return cast(__m128d) cmpsd!(FPComparison.une)(a, b);
    }
}
837 
/// Compare packed double-precision (64-bit) floating-point elements 
/// in `a` and `b` for not-greater-than-or-equal.
__m128d _mm_cmpnge_pd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmpngepd(a, b); 
    }
    else
    {
        // !(a >= b) is ult: unordered less-than — also true when either is NaN.
        return cast(__m128d) cmppd!(FPComparison.ult)(a, b);
    }
}
851 
/// Compare the lower double-precision (64-bit) floating-point elements 
/// in `a` and `b` for not-greater-than-or-equal, store the result in 
/// the lower element, and copy the upper element from `a`.
__m128d _mm_cmpnge_sd (__m128d a, __m128d b) pure @safe
{
    // Note: There is no __builtin_ia32_cmpngesd builtin.
    static if (GDC_with_SSE2)
    {
        // !(a >= b) is !(b <= a) with operands swapped; the outer movsd
        // restores `a`'s upper element, which the swapped compare had replaced
        // with `b`'s. This mirrors GCC's own emmintrin.h.
        // BUGFIX: previously returned __builtin_ia32_cmpltsd(b, a), which
        // computes b < a — the ordered greater-than predicate, false on NaN —
        // and kept `b`'s upper element.
        return __builtin_ia32_movsd(a, __builtin_ia32_cmpnlesd(b, a));
    }
    else
    {
        return cast(__m128d) cmpsd!(FPComparison.ult)(a, b);
    }
}
867 
/// Compare packed double-precision (64-bit) floating-point elements 
/// in `a` and `b` for not-greater-than.
__m128d _mm_cmpngt_pd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmpngtpd(a, b);
    }
    else
    {
        // !(a > b) is ule: unordered less-or-equal — also true when either is NaN.
        return cast(__m128d) cmppd!(FPComparison.ule)(a, b);
    }
}
881 
/// Compare the lower double-precision (64-bit) floating-point elements 
/// in `a` and `b` for not-greater-than, store the result in the 
/// lower element, and copy the upper element from `a`.
__m128d _mm_cmpngt_sd (__m128d a, __m128d b) pure @safe
{
    // Note: There is no __builtin_ia32_cmpngtsd builtin.
    static if (GDC_with_SSE2)
    {
        // !(a > b) is !(b < a) with operands swapped; the outer movsd restores
        // `a`'s upper element, which the swapped compare had replaced with
        // `b`'s. This mirrors GCC's own emmintrin.h.
        // BUGFIX: previously returned __builtin_ia32_cmplesd(b, a), which
        // computes b <= a — the ordered greater-or-equal predicate, false on
        // NaN — and kept `b`'s upper element.
        return __builtin_ia32_movsd(a, __builtin_ia32_cmpnltsd(b, a));
    }
    else
    {
        return cast(__m128d) cmpsd!(FPComparison.ule)(a, b);
    }
}
897 
/// Compare packed double-precision (64-bit) floating-point elements 
/// in `a` and `b` for not-less-than-or-equal.
__m128d _mm_cmpnle_pd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmpnlepd(a, b);
    }
    else
    {
        // !(a <= b) is ugt: unordered greater-than — also true when either is NaN.
        return cast(__m128d) cmppd!(FPComparison.ugt)(a, b);
    }
}

/// Compare the lower double-precision (64-bit) floating-point elements 
/// in `a` and `b` for not-less-than-or-equal, store the result in the 
/// lower element, and copy the upper element from `a`.
__m128d _mm_cmpnle_sd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmpnlesd(a, b);
    }
    else
    {
        // !(a <= b) is ugt: unordered greater-than — also true when either is NaN.
        return cast(__m128d) cmpsd!(FPComparison.ugt)(a, b);
    }
}

/// Compare packed double-precision (64-bit) floating-point elements 
/// in `a` and `b` for not-less-than.
__m128d _mm_cmpnlt_pd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmpnltpd(a, b);
    }
    else
    {
        // !(a < b) is uge: unordered greater-or-equal — also true when either is NaN.
        return cast(__m128d) cmppd!(FPComparison.uge)(a, b);
    }
}

/// Compare the lower double-precision (64-bit) floating-point elements 
/// in `a` and `b` for not-less-than, store the result in the lower 
/// element, and copy the upper element from `a`.
__m128d _mm_cmpnlt_sd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmpnltsd(a, b);
    }
    else
    {
        // !(a < b) is uge: unordered greater-or-equal — also true when either is NaN.
        return cast(__m128d) cmpsd!(FPComparison.uge)(a, b);
    }
}
955 
/// Compare packed double-precision (64-bit) floating-point elements 
/// in `a` and `b` to see if neither is NaN.
__m128d _mm_cmpord_pd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmpordpd(a, b);
    }
    else
    {
        // "ordered": all-ones lane when both inputs are non-NaN.
        return cast(__m128d) cmppd!(FPComparison.ord)(a, b);
    }
}
969 
/// Compare the lower double-precision (64-bit) floating-point elements 
/// in `a` and `b` to see if neither is NaN, store the result in the 
/// lower element, and copy the upper element from `a` to the upper element.
__m128d _mm_cmpord_sd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmpordsd(a, b);
    }
    else
    {
        // Scalar "ordered" predicate: true when both lower elements are non-NaN.
        return cast(__m128d) cmpsd!(FPComparison.ord)(a, b);
    }
}
984 
/// Compare packed double-precision (64-bit) floating-point elements 
/// in `a` and `b` to see if either is NaN.
__m128d _mm_cmpunord_pd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmpunordpd(a, b);
    }
    else
    {
        // "unordered": all-ones lane when at least one input is NaN.
        return cast(__m128d) cmppd!(FPComparison.uno)(a, b);
    }
}
998 
/// Compare the lower double-precision (64-bit) floating-point elements 
/// in `a` and `b` to see if either is NaN, store the result in the lower 
/// element, and copy the upper element from `a` to the upper element.
__m128d _mm_cmpunord_sd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmpunordsd(a, b);
    }
    else
    {
        // Scalar "unordered" predicate: true when either lower element is NaN.
        return cast(__m128d) cmpsd!(FPComparison.uno)(a, b);
    }
}
1013 
1014 
// Note: we've reverted to clang and GCC behaviour with regards to EFLAGS.
// Some such comparisons yield true for NaNs, others don't.
1017 
/// Compare the lower double-precision (64-bit) floating-point element 
/// in `a` and `b` for equality, and return the boolean result (0 or 1).
int _mm_comieq_sd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_comieq(a, b);
    }
    else
    {
        return comsd!(FPComparison.ueq)(a, b); // yields true for NaN, same as GCC
    }
}
1031 
/// Compare the lower double-precision (64-bit) floating-point element 
/// in `a` and `b` for greater-than-or-equal, and return the boolean 
/// result (0 or 1).
int _mm_comige_sd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_comige(a, b);
    }
    else
    {
        // Ordered predicate: yields false when either operand is NaN.
        return comsd!(FPComparison.oge)(a, b);
    }
}
1046 
/// Compare the lower double-precision (64-bit) floating-point element 
/// in `a` and `b` for greater-than, and return the boolean result (0 or 1).
int _mm_comigt_sd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_comigt(a, b);
    }
    else
    {
        // Ordered predicate: yields false when either operand is NaN.
        return comsd!(FPComparison.ogt)(a, b);
    }
}
1060 
/// Compare the lower double-precision (64-bit) floating-point element 
/// in `a` and `b` for less-than-or-equal, and return the boolean result (0 or 1).
int _mm_comile_sd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_comile(a, b);
    }
    else
    {
        return comsd!(FPComparison.ule)(a, b); // yields true for NaN, same as GCC
    }
}
1074 
/// Compare the lower double-precision (64-bit) floating-point element 
/// in `a` and `b` for less-than, and return the boolean result (0 or 1).
int _mm_comilt_sd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_comilt(a, b);
    }
    else
    {
        return comsd!(FPComparison.ult)(a, b); // yields true for NaN, same as GCC
    }
}
1088 
/// Compare the lower double-precision (64-bit) floating-point element
/// in `a` and `b` for not-equal, and return the boolean result (0 or 1).
int _mm_comineq_sd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_comineq(a, b);
    }
    else
    {
        // Ordered not-equal: yields false when either operand is NaN.
        return comsd!(FPComparison.one)(a, b);
    }
}
1102 
/// Convert packed 32-bit integers in `a` to packed double-precision (64-bit)
/// floating-point elements. Only the two lower integers of `a` are converted.
 __m128d _mm_cvtepi32_pd (__m128i a) pure @trusted
{
    version(LDC)
    {
        // Generates cvtdq2pd since LDC 1.0, even without optimizations
        enum ir = `
            %v = shufflevector <4 x i32> %0,<4 x i32> %0, <2 x i32> <i32 0, i32 1>
            %r = sitofp <2 x i32> %v to <2 x double>
            ret <2 x double> %r`;
        return cast(__m128d) LDCInlineIR!(ir, __m128d, __m128i)(a);
    }
    else static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cvtdq2pd(a);
    }
    else
    {
        // Portable fallback: every int is exactly representable as double.
        double2 r = void;
        r.ptr[0] = a.array[0];
        r.ptr[1] = a.array[1];
        return r;
    }
}
unittest
{
    __m128d A = _mm_cvtepi32_pd(_mm_set1_epi32(54));
    assert(A.array[0] == 54.0);
    assert(A.array[1] == 54.0);
}
1134 
/// Convert packed 32-bit integers in `a` to packed single-precision (32-bit) 
/// floating-point elements. Conversion is rounded to nearest-even for ints
/// too large for an exact float representation.
__m128 _mm_cvtepi32_ps(__m128i a) pure @trusted
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cvtdq2ps(a);
    }
    else
    {
        // Generates cvtdq2ps since LDC 1.0.0 -O1
        __m128 res;
        res.ptr[0] = cast(float)a.array[0];
        res.ptr[1] = cast(float)a.array[1];
        res.ptr[2] = cast(float)a.array[2];
        res.ptr[3] = cast(float)a.array[3];
        return res;
    }
}
unittest
{
    __m128 a = _mm_cvtepi32_ps(_mm_setr_epi32(-1, 0, 1, 1000));
    assert(a.array == [-1.0f, 0.0f, 1.0f, 1000.0f]);
}
1159 
/// Convert packed double-precision (64-bit) floating-point elements 
/// in `a` to packed 32-bit integers, using the current MXCSR rounding mode.
/// The two upper lanes of the result are zero.
__m128i _mm_cvtpd_epi32 (__m128d a) pure @trusted
{
    version(LDC)
    {
        // Like in clang, implemented with a magic intrinsic right now
        return __builtin_ia32_cvtpd2dq(a);

    /* Unfortunately this generates a cvttpd2dq instruction
        __m128i _mm_cvtpd_epi32 (__m128d a) pure  @safe
        {
            enum ir = `
                %i = fptosi <2 x double> %0 to <2 x i32>
                %r = shufflevector <2 x i32> %i,<2 x i32> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
                ret <4 x i32> %r`;

            return cast(__m128i) inlineIR!(ir, __m128i, __m128d)(a);
        } */
    }
    else static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cvtpd2dq(a);
    }
    else
    {
        // Fallback honours the MXCSR rounding mode, like the real instruction.
        __m128i r = _mm_setzero_si128();
        r.ptr[0] = convertDoubleToInt32UsingMXCSR(a.array[0]);
        r.ptr[1] = convertDoubleToInt32UsingMXCSR(a.array[1]);
        return r;
    }
}
unittest
{
    int4 A = _mm_cvtpd_epi32(_mm_set_pd(61.0, 55.0));
    assert(A.array[0] == 55 && A.array[1] == 61 && A.array[2] == 0 && A.array[3] == 0);
}
1197 
/// Convert packed double-precision (64-bit) floating-point elements in `v`
/// to packed 32-bit integers, using the current MXCSR rounding mode.
__m64 _mm_cvtpd_pi32 (__m128d v) pure @safe
{
    // Reuses the 128-bit conversion, then keeps the lower 64 bits.
    return to_m64(_mm_cvtpd_epi32(v));
}
unittest
{
    int2 A = cast(int2) _mm_cvtpd_pi32(_mm_set_pd(61.0, 55.0));
    assert(A.array[0] == 55 && A.array[1] == 61);
}
1209 
/// Convert packed double-precision (64-bit) floating-point elements 
/// in `a` to packed single-precision (32-bit) floating-point elements.
/// The two upper lanes of the result are zero.
__m128 _mm_cvtpd_ps (__m128d a) pure @trusted
{
    version(LDC)
    {
        return __builtin_ia32_cvtpd2ps(a); // can't be done with IR unfortunately
    }
    else static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cvtpd2ps(a);
    }
    else
    { 
        // Fallback: narrowing double->float conversion per lane, upper lanes zeroed.
        __m128 r = void;
        r.ptr[0] = a.array[0];
        r.ptr[1] = a.array[1];
        r.ptr[2] = 0;
        r.ptr[3] = 0;
        return r;
    }
}
unittest
{
    __m128d A = _mm_set_pd(5.25, 4.0);
    __m128 B = _mm_cvtpd_ps(A);
    assert(B.array == [4.0f, 5.25f, 0, 0]);
}
1238 
/// Convert packed 32-bit integers in `v` to packed double-precision 
/// (64-bit) floating-point elements.
__m128d _mm_cvtpi32_pd (__m64 v) pure @safe
{
    // Widen the MMX value to 128-bit, then reuse the SSE2 conversion.
    return _mm_cvtepi32_pd(to_m128i(v));
}
unittest
{
    __m128d A = _mm_cvtpi32_pd(_mm_setr_pi32(4, -5));
    assert(A.array[0] == 4.0 && A.array[1] == -5.0);
}
1250 
/// Convert packed single-precision (32-bit) floating-point elements 
/// in `a` to packed 32-bit integers, using the current MXCSR rounding mode.
__m128i _mm_cvtps_epi32 (__m128 a) pure @trusted
{
    version(LDC)
    {
        // Disabled, since it fail with optimizations unfortunately
        //alias _mm_cvtps_epi32 = __builtin_ia32_cvtps2dq;
        return __asm!__m128i("cvtps2dq $1,$0","=x,x",a);
    }
    else static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cvtps2dq(a);
    }
    else
    {
        // Fallback honours the MXCSR rounding mode, like the real instruction.
        __m128i r = void;
        r.ptr[0] = convertFloatToInt32UsingMXCSR(a.array[0]);
        r.ptr[1] = convertFloatToInt32UsingMXCSR(a.array[1]);
        r.ptr[2] = convertFloatToInt32UsingMXCSR(a.array[2]);
        r.ptr[3] = convertFloatToInt32UsingMXCSR(a.array[3]);
        return r;
    }
}
unittest
{
    // Exercise all four rounding modes, then restore the caller's mode.
    uint savedRounding = _MM_GET_ROUNDING_MODE();

    _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST);
    __m128i A = _mm_cvtps_epi32(_mm_setr_ps(1.4f, -2.1f, 53.5f, -2.9f));
    assert(A.array == [1, -2, 54, -3]);

    _MM_SET_ROUNDING_MODE(_MM_ROUND_DOWN);
    A = _mm_cvtps_epi32(_mm_setr_ps(1.4f, -2.1f, 53.5f, -2.9f));
    assert(A.array == [1, -3, 53, -3]);

    _MM_SET_ROUNDING_MODE(_MM_ROUND_UP);
    A = _mm_cvtps_epi32(_mm_setr_ps(1.4f, -2.1f, 53.5f, -2.9f));
    assert(A.array == [2, -2, 54, -2]);

    _MM_SET_ROUNDING_MODE(_MM_ROUND_TOWARD_ZERO);
    A = _mm_cvtps_epi32(_mm_setr_ps(1.4f, -2.1f, 53.5f, -2.9f));
    assert(A.array == [1, -2, 53, -2]);

    _MM_SET_ROUNDING_MODE(savedRounding);
}
1297 
/// Convert packed single-precision (32-bit) floating-point elements 
/// in `a` to packed double-precision (64-bit) floating-point elements.
/// Only the two lower floats of `a` are converted.
__m128d _mm_cvtps_pd (__m128 a) pure @trusted
{
    version(LDC)
    {
        // Generates cvtps2pd since LDC 1.0 -O0
        enum ir = `
            %v = shufflevector <4 x float> %0,<4 x float> %0, <2 x i32> <i32 0, i32 1>
            %r = fpext <2 x float> %v to <2 x double>
            ret <2 x double> %r`;
        return cast(__m128d) LDCInlineIR!(ir, __m128d, __m128)(a);
    }
    else static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cvtps2pd(a);
    }
    else
    {
        // Portable fallback: float->double widening is exact.
        double2 r = void;
        r.ptr[0] = a.array[0];
        r.ptr[1] = a.array[1];
        return r;
    }
}
unittest
{
    __m128d A = _mm_cvtps_pd(_mm_set1_ps(54.0f));
    assert(A.array[0] == 54.0);
    assert(A.array[1] == 54.0);
}
1329 
/// Copy the lower double-precision (64-bit) floating-point element of `a`.
double _mm_cvtsd_f64 (__m128d a) pure @safe
{
    return a.array[0];
}
1335 
/// Convert the lower double-precision (64-bit) floating-point element
/// in `a` to a 32-bit integer, using the current MXCSR rounding mode.
int _mm_cvtsd_si32 (__m128d a) pure @safe
{
    version(LDC)
    {
        return __builtin_ia32_cvtsd2si(a);
    }
    else static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cvtsd2si(a);
    }
    else
    {
        // Fallback honours the MXCSR rounding mode, like cvtsd2si.
        return convertDoubleToInt32UsingMXCSR(a[0]);
    }
}
unittest
{
    assert(4 == _mm_cvtsd_si32(_mm_set1_pd(4.0)));
}
1357 
// _mm_cvtsd_si64: convert the lower double-precision element in `a` to a
// 64-bit integer, using the current MXCSR rounding mode.
version(LDC)
{
    // Unfortunately this builtin crashes in 32-bit
    version(X86_64)
        alias _mm_cvtsd_si64 = __builtin_ia32_cvtsd2si64;
    else
    {
        long _mm_cvtsd_si64 (__m128d a) pure @safe
        {
            return convertDoubleToInt64UsingMXCSR(a[0]);
        }
    }
}
else
{
    long _mm_cvtsd_si64 (__m128d a) pure @safe
    {
        return convertDoubleToInt64UsingMXCSR(a.array[0]);
    }
}
unittest
{
    assert(-4 == _mm_cvtsd_si64(_mm_set1_pd(-4.0)));

    // Check all four MXCSR rounding modes, then restore the caller's mode.
    uint savedRounding = _MM_GET_ROUNDING_MODE();

    _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST);
    assert(-56468486186 == _mm_cvtsd_si64(_mm_set1_pd(-56468486186.5)));

    _MM_SET_ROUNDING_MODE(_MM_ROUND_DOWN);
    assert(-56468486187 == _mm_cvtsd_si64(_mm_set1_pd(-56468486186.1)));

    _MM_SET_ROUNDING_MODE(_MM_ROUND_UP);
    assert(56468486187 == _mm_cvtsd_si64(_mm_set1_pd(56468486186.1)));

    _MM_SET_ROUNDING_MODE(_MM_ROUND_TOWARD_ZERO);
    assert(-56468486186 == _mm_cvtsd_si64(_mm_set1_pd(-56468486186.9)));

    _MM_SET_ROUNDING_MODE(savedRounding);
}

alias _mm_cvtsd_si64x = _mm_cvtsd_si64;
1400 
/// Convert the lower double-precision (64-bit) floating-point element in `b`
/// to a single-precision (32-bit) floating-point element, store it in the
/// lower element of the result, and copy the upper 3 elements from `a`.
__m128 _mm_cvtsd_ss (__m128 a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cvtsd2ss(a, b); 
    }
    else
    {
        // Generates cvtsd2ss since LDC 1.3 -O0
        a[0] = b[0];
        return a;
    }
}
unittest
{
    __m128 R = _mm_cvtsd_ss(_mm_set1_ps(4.0f), _mm_set1_pd(3.0));
    assert(R.array == [3.0f, 4.0f, 4.0f, 4.0f]);
}
1419 
/// Copy the lower 32-bit integer in `a`.
int _mm_cvtsi128_si32 (__m128i a) pure @safe
{
    return a.array[0];
}
1424 
/// Copy the lower 64-bit integer in `a`.
long _mm_cvtsi128_si64 (__m128i a) pure @safe
{
    // Reinterpret the 4x32-bit vector as 2x64-bit and take the low lane.
    return (cast(long2)a).array[0];
}
alias _mm_cvtsi128_si64x = _mm_cvtsi128_si64;
1431 
/// Convert the 32-bit integer `x` to a double-precision (64-bit) floating-point
/// element, store it in the lower element of the result, and copy the upper
/// element from `v`.
__m128d _mm_cvtsi32_sd(__m128d v, int x) pure @trusted
{
    v.ptr[0] = cast(double)x;
    return v;
}
unittest
{
    __m128d a = _mm_cvtsi32_sd(_mm_set1_pd(0.0f), 42);
    assert(a.array == [42.0, 0]);
}
1442 
/// Copy 32-bit integer `a` to the lower element of the result,
/// and zero the three upper elements.
__m128i _mm_cvtsi32_si128 (int a) pure @trusted
{
    int4 zeroed = [0, 0, 0, 0];
    zeroed.ptr[0] = a;  // movd-like: low lane set, rest stays zero
    return zeroed;
}
unittest
{
    __m128i a = _mm_cvtsi32_si128(65);
    assert(a.array == [65, 0, 0, 0]);
}
1454 
1455 
1456 // Note: on macOS, using "llvm.x86.sse2.cvtsi642sd" was buggy
1457 __m128d _mm_cvtsi64_sd(__m128d v, long x) pure @trusted
1458 {
1459     v.ptr[0] = cast(double)x;
1460     return v;
1461 }
1462 unittest
1463 {
1464     __m128d a = _mm_cvtsi64_sd(_mm_set1_pd(0.0f), 42);
1465     assert(a.array == [42.0, 0]);
1466 }
1467 
/// Copy 64-bit integer `a` to the lower element of the result,
/// and zero the upper element.
__m128i _mm_cvtsi64_si128 (long a) pure @trusted
{
    long2 r = [0, 0];
    r.ptr[0] = a;
    return cast(__m128i)(r);
}

// Intel-compatible aliases (the si64x names are the legacy spellings).
alias _mm_cvtsi64x_sd = _mm_cvtsi64_sd;
alias _mm_cvtsi64x_si128 = _mm_cvtsi64_si128;
1477 
/// Convert the lower single-precision (32-bit) floating-point element in `x`
/// to a double-precision (64-bit) floating-point element, store it in the
/// lower element of the result, and copy the upper element from `v`.
double2 _mm_cvtss_sd(double2 v, float4 x) pure @trusted
{
    v.ptr[0] = x.array[0];
    return v;
}
unittest
{
    __m128d a = _mm_cvtss_sd(_mm_set1_pd(0.0f), _mm_set1_ps(42.0f));
    assert(a.array == [42.0, 0]);
}
1488 
/// Convert the lower single-precision (32-bit) floating-point element
/// in `a` to a 64-bit integer with truncation.
long _mm_cvttss_si64 (__m128 a) pure @safe
{
    return cast(long)(a.array[0]); // Generates cvttss2si as expected
}
unittest
{
    assert(1 == _mm_cvttss_si64(_mm_setr_ps(1.9f, 2.0f, 3.0f, 4.0f)));
}
1497 
// _mm_cvttpd_epi32: convert packed double-precision elements in `a` to packed
// 32-bit integers with truncation; the two upper lanes of the result are zero.
version(LDC)
{
    alias _mm_cvttpd_epi32 = __builtin_ia32_cvttpd2dq;
}
else
{
    static if (GDC_with_SSE2)
    {
        alias _mm_cvttpd_epi32 = __builtin_ia32_cvttpd2dq;
    }
    else
    {
        __m128i _mm_cvttpd_epi32 (__m128d a) pure @safe
        {
            // Note: doesn't generate cvttpd2dq as of LDC 1.13
            // cast(int) truncates toward zero, matching the instruction.
            __m128i r;
            r.array[0] = cast(int)a.array[0];
            r.array[1] = cast(int)a.array[1];
            r.array[2] = 0;
            r.array[3] = 0;
            return r;
        }
    }
}
unittest
{
    __m128i R = _mm_cvttpd_epi32(_mm_setr_pd(-4.9, 45641.5f));
    assert(R.array == [-4, 45641, 0, 0]);
}
1527 
1528 
/// Convert packed double-precision (64-bit) floating-point elements in `v` 
/// to packed 32-bit integers with truncation.
__m64 _mm_cvttpd_pi32 (__m128d v) pure @safe
{
    // Reuses the 128-bit truncating conversion, then keeps the lower 64 bits.
    return to_m64(_mm_cvttpd_epi32(v));
}
unittest
{
    int2 R = cast(int2) _mm_cvttpd_pi32(_mm_setr_pd(-4.9, 45641.7f));
    int[2] correct = [-4, 45641];
    assert(R.array == correct);
}
1541 
/// Convert packed single-precision (32-bit) floating-point elements
/// in `a` to packed 32-bit integers with truncation.
__m128i _mm_cvttps_epi32 (__m128 a) pure @trusted
{
    // Note: Generates cvttps2dq since LDC 1.3 -O2
    // cast(int) truncates toward zero, matching the instruction.
    __m128i r;
    r.ptr[0] = cast(int)a.array[0];
    r.ptr[1] = cast(int)a.array[1];
    r.ptr[2] = cast(int)a.array[2];
    r.ptr[3] = cast(int)a.array[3];
    return r;
}
unittest
{
    __m128i R = _mm_cvttps_epi32(_mm_setr_ps(-4.9, 45641.5f, 0.0f, 1.0f));
    assert(R.array == [-4, 45641, 0, 1]);
}
1557 
/// Convert the lower double-precision (64-bit) floating-point element
/// in `a` to a 32-bit integer with truncation.
int _mm_cvttsd_si32 (__m128d a) pure @safe
{
    // Generates cvttsd2si since LDC 1.3 -O0
    // The function body is a pure, memory-safe cast, so it can carry the
    // same `pure @safe` attributes as the sibling conversion intrinsics.
    return cast(int)a.array[0];
}
1563 
/// Convert the lower double-precision (64-bit) floating-point element
/// in `a` to a 64-bit integer with truncation.
long _mm_cvttsd_si64 (__m128d a) pure @safe
{
    // Generates cvttsd2si since LDC 1.3 -O0
    // but in 32-bit instead, it's a long sequence that resort to FPU
    // Attributes added for consistency with the other conversion intrinsics.
    return cast(long)a.array[0];
}

alias _mm_cvttsd_si64x = _mm_cvttsd_si64;
1572 
/// Divide packed double-precision (64-bit) floating-point elements
/// in `a` by packed elements in `b`.
__m128d _mm_div_pd(__m128d a, __m128d b) pure @safe
{
    return a / b;
}
1577 
// _mm_div_sd: divide the lower double-precision element of `a` by the lower
// element of `b`, and copy the upper element from `a`.
static if (GDC_with_SSE2)
{
    __m128d _mm_div_sd(__m128d a, __m128d b) pure @trusted
    {
        return __builtin_ia32_divsd(a, b);
    }
}
else version(DigitalMars)
{
    // Work-around for https://issues.dlang.org/show_bug.cgi?id=19599
    __m128d _mm_div_sd(__m128d a, __m128d b) pure @safe
    {
        // The no-op asm block prevents DMD from miscompiling this (see bug above).
        asm pure nothrow @nogc @trusted { nop;}
        a.array[0] = a.array[0] / b.array[0];
        return a;
    }
}
else
{
    __m128d _mm_div_sd(__m128d a, __m128d b) pure @safe
    {
        a.array[0] /= b.array[0];
        return a;
    }
}
unittest
{
    __m128d a = [2.0, 4.5];
    a = _mm_div_sd(a, a);
    assert(a.array == [1.0, 4.5]);
}
1609 
/// Extract a 16-bit integer from `v`, selected with `index`, zero-extended
/// to 32 bits.
/// Only the low 3 bits of `index` are used, matching the hardware `pextrw`
/// imm8[2:0] semantics and `_mm_insert_epi16` in this module; the mask also
/// prevents an out-of-bounds array read for indices outside 0..7.
int _mm_extract_epi16(__m128i v, int index) pure @safe
{
    short8 r = cast(short8)v;
    return cast(ushort)(r.array[index & 7]);
}
unittest
{
    __m128i A = _mm_set_epi16(7, 6, 5, 4, 3, 2, 1, -1);
    assert(_mm_extract_epi16(A, 6) == 6);
    assert(_mm_extract_epi16(A, 0) == 65535);
    assert(_mm_extract_epi16(A, 8) == 65535); // only low 3 bits of index used
}
1622 
/// Copy `v`, and insert the 16-bit integer `i` at the location specified by `index`.
/// Only the low 3 bits of `index` are used, like the hardware `pinsrw` instruction.
__m128i _mm_insert_epi16 (__m128i v, int i, int index) @trusted
{
    short8 r = cast(short8)v;
    r.ptr[index & 7] = cast(short)i;
    return cast(__m128i)r;
}
unittest
{
    __m128i A = _mm_set_epi16(7, 6, 5, 4, 3, 2, 1, 0);
    short8 R = cast(short8) _mm_insert_epi16(A, 42, 6);
    short[8] correct = [0, 1, 2, 3, 4, 5, 42, 7];
    assert(R.array == correct);
}
1637 
// _mm_lfence: serialize load operations (emits the `lfence` instruction).
// Implemented per compiler: GDC builtin, GDC inline asm fallback (32-bit x86),
// LDC builtin, or DMD inline asm.
version(GNU)
{
    void _mm_lfence() pure @trusted
    {
        static if (GDC_with_SSE2)
        {
            __builtin_ia32_lfence();
        }
        else version(X86)
        {
            asm pure nothrow @nogc @trusted
            {
                "lfence;\n" : : : ;
            }
        }
        else
            static assert(false);
    }
}
else version(LDC)
{
    alias _mm_lfence = __builtin_ia32_lfence;
}
else static if (DMD_with_asm)
{
    void _mm_lfence() pure @safe
    {
        asm nothrow @nogc pure @safe
        {
            lfence;
        }
    }
}
else
    static assert(false);
unittest
{
    _mm_lfence();
}
1677 
1678 
/// Load 128-bits (composed of 2 packed double-precision (64-bit) 
/// floating-point elements) from memory. 
/// `mem_addr` must be aligned on a 16-byte boundary.
__m128d _mm_load_pd (const(double) * mem_addr) pure
{
    __m128d* aligned = cast(__m128d*)mem_addr;
    return *aligned;
}
1684 
/// Load a double-precision (64-bit) floating-point element from memory 
/// into both elements of the result.
__m128d _mm_load_pd1 (const(double)* mem_addr) pure
{
    double[2] arr = [*mem_addr, *mem_addr];
    return loadUnaligned!(double2)(&arr[0]);
}
1690 
/// Load a double-precision (64-bit) floating-point element from memory 
/// into the lower element of the result, and zero the upper element.
__m128d _mm_load_sd (const(double)* mem_addr) pure @trusted
{
    double2 r = [0, 0];
    r.ptr[0] = *mem_addr;
    return r;
}
unittest
{
    double x = -42;
    __m128d a = _mm_load_sd(&x);
    assert(a.array == [-42.0, 0.0]);
}
1703 
/// Load 128-bits of integer data from memory. 
/// `mem_addr` must be aligned on a 16-byte boundary.
__m128i _mm_load_si128 (const(__m128i)* mem_addr) pure @trusted
{
    return *mem_addr;
}

/// Intel-compatible alias of `_mm_load_pd1`.
alias _mm_load1_pd = _mm_load_pd1;
1710 
/// Load a double-precision (64-bit) floating-point element from memory into 
/// the upper element of the result, and copy the lower element from `a`.
__m128d _mm_loadh_pd (__m128d a, const(double)* mem_addr) pure @trusted
{
    a.ptr[1] = *mem_addr;
    return a;
}
1716 
/// Load a 64-bit integer from memory into the lower element of the result,
/// and zero the upper element.
// Note: strange signature since the memory doesn't have to aligned
__m128i _mm_loadl_epi64 (const(__m128i)* mem_addr) pure @trusted
{
    auto pLong = cast(const(long)*)mem_addr;
    long2 r = [0, 0];
    r.ptr[0] = *pLong;
    return cast(__m128i)(r);
}
1725 
/// Load a double-precision (64-bit) floating-point element from memory into 
/// the lower element of the result, and copy the upper element from `a`.
__m128d _mm_loadl_pd (__m128d a, const(double)* mem_addr) pure @trusted
{
    a.ptr[0] = *mem_addr;
    return a;
}
1731 
/// Load 2 double-precision (64-bit) floating-point elements from memory into
/// the result in reverse order. `mem_addr` must be aligned on a 16-byte boundary.
// NOTE(review): Intel names this intrinsic `_mm_loadr_pd`; the `2` suffix here
// is unusual — confirm against callers before considering a rename/alias.
__m128d _mm_loadr_pd2 (const(double)* mem_addr) pure @trusted
{
    __m128d a = *cast(__m128d*)(mem_addr);
    __m128d r;
    r.ptr[0] = a.array[1];
    r.ptr[1] = a.array[0];
    return r;
}
1740 
/// Load 128-bits (composed of 2 packed double-precision (64-bit) 
/// floating-point elements) from memory. 
/// `mem_addr` does not need to be aligned on any particular boundary.
__m128d _mm_loadu_pd (const(double)* mem_addr) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_loadupd(mem_addr); 
    }
    else
    {
        return loadUnaligned!(double2)(mem_addr);
    }
}
1752 
/// Load 128-bits of integer data from memory. 
/// `mem_addr` does not need to be aligned on any particular boundary.
__m128i _mm_loadu_si128 (const(__m128i)* mem_addr) pure @trusted
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_loaddqu(cast(const(char*))mem_addr);
    }
    else
    {
        return loadUnaligned!(__m128i)(cast(int*)mem_addr);
    }
}
1764 
/// Load an unaligned 32-bit integer from memory into the lower element of
/// the result, and zero the three upper elements.
__m128i _mm_loadu_si32 (const(void)* mem_addr) pure @trusted
{
    int4 res = [0, 0, 0, 0];
    res.ptr[0] = *cast(int*)(mem_addr);
    return res;
}
unittest
{
    int r = 42;
    __m128i A = _mm_loadu_si32(&r);
    int[4] correct = [42, 0, 0, 0];
    assert(A.array == correct);
}
1779 
static if (GDC_with_SSE2)
{
    /// Multiply packed signed 16-bit integers in `a` and `b`, producing intermediate
    /// signed 32-bit integers. Horizontally add adjacent pairs of intermediate 32-bit integers,
    /// and pack the results in destination.
    alias _mm_madd_epi16 = __builtin_ia32_pmaddwd128;
}
else version(LDC)
{
    /// Multiply packed signed 16-bit integers in `a` and `b`, producing intermediate
    /// signed 32-bit integers. Horizontally add adjacent pairs of intermediate 32-bit integers,
    /// and pack the results in destination.
    alias _mm_madd_epi16 = __builtin_ia32_pmaddwd128;
}
else
{
    /// Multiply packed signed 16-bit integers in `a` and `b`, producing intermediate
    /// signed 32-bit integers. Horizontally add adjacent pairs of intermediate 32-bit integers,
    /// and pack the results in destination.
    __m128i _mm_madd_epi16 (__m128i a, __m128i b) pure @safe
    {
        short8 sa = cast(short8)a;
        short8 sb = cast(short8)b;

        // Each 32-bit lane = sa[2i]*sb[2i] + sa[2i+1]*sb[2i+1], with the
        // products widened to int before summing (like pmaddwd).
        int4 r;
        foreach(i; 0..4)
        {
            r.array[i] = sa.array[2*i] * sb.array[2*i] + sa.array[2*i+1] * sb.array[2*i+1];
        }
        return r;
    }
}
unittest
{
    short8 A = [0, 1, 2, 3, -32768, -32768, 32767, 32767];
    short8 B = [0, 1, 2, 3, -32768, -32768, 32767, 32767];
    int4 R = _mm_madd_epi16(cast(__m128i)A, cast(__m128i)B);
    // Third lane wraps: 2 * (-32768)^2 == 0x8000_0000 == int.min, like pmaddwd.
    int[4] correct = [1, 13, -2147483648, 2*32767*32767];
    assert(R.array == correct);
}
1820 
version(LDC)
{
    /// Conditionally store 8-bit integer elements from `a` into memory using `mask`
    /// (elements are not stored when the highest bit is not set in the corresponding element)
    /// and a non-temporal memory hint. `mem_addr` does not need to be aligned on any particular
    /// boundary.
    alias _mm_maskmoveu_si128 = __builtin_ia32_maskmovdqu; // can't do it with pure IR
}
else
{
    static if (GDC_with_SSE2)
    {
        ///ditto
        void _mm_maskmoveu_si128 (__m128i a, __m128i mask, void* mem_addr) pure @trusted
        {
            return __builtin_ia32_maskmovdqu(cast(ubyte16)a, cast(ubyte16)mask, cast(char*)mem_addr);
        }
    }
    else
    {
        ///ditto
        void _mm_maskmoveu_si128 (__m128i a, __m128i mask, void* mem_addr) pure @trusted
        {
            // Portable fallback: store byte j only when mask byte j has its
            // sign bit set (bit 7), like the maskmovdqu instruction
            // (minus the non-temporal hint).
            byte16 b = cast(byte16)a;
            byte16 m = cast(byte16)mask;
            byte* dest = cast(byte*)(mem_addr);
            foreach(j; 0..16)
            {
                if (m.array[j] & 128)
                {
                    dest[j] = b.array[j];
                }
            }
        }
    }
}
unittest
{
    ubyte[16] dest =           [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42];
    __m128i mask = _mm_setr_epi8(0,-1, 0,-1,-1, 1,-1,-1, 0,-1,-4,-1,-1, 0,-127, 0);
    __m128i A    = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15);
    _mm_maskmoveu_si128(A, mask, dest.ptr);
    ubyte[16] correct =        [42, 1,42, 3, 4,42, 6, 7,42, 9,10,11,12,42,14,42];
    assert(dest == correct);
}
1866 
/// Compare packed signed 16-bit integers in `a` and `b`, and return packed
/// maximum values.
__m128i _mm_max_epi16 (__m128i a, __m128i b) pure @safe
{
    // Same remark as with _mm_min_epi16: clang uses mystery intrinsics we don't have
    // Blend trick: result = b ^ ((a ^ b) & (a > b)), selecting a where a > b.
    __m128i lowerShorts = _mm_cmpgt_epi16(a, b); // ones where a should be selected, b else
    __m128i aTob = _mm_xor_si128(a, b); // a ^ (a ^ b) == b
    __m128i mask = _mm_and_si128(aTob, lowerShorts);
    return _mm_xor_si128(b, mask);
}
unittest
{
    short8 R = cast(short8) _mm_max_epi16(_mm_setr_epi16(45, 1, -4, -8, 9,  7, 0,-57),
                                          _mm_setr_epi16(-4,-8,  9,  7, 0,-57, 0,  0));
    short[8] correct =                                  [45, 1,  9,  7, 9,  7, 0,  0];
    assert(R.array == correct);
}
1882 
1883 
/// Compare packed unsigned 8-bit integers in `a` and `b`, and return packed
/// maximum values.
__m128i _mm_max_epu8 (__m128i a, __m128i b) pure @safe
{
    // Same remark as with _mm_min_epi16: clang uses mystery intrinsics we don't have
    // Adding -128 maps the unsigned range onto the signed range so that a
    // signed compare orders the bytes as unsigned values.
    __m128i value128 = _mm_set1_epi8(-128);
    __m128i higher = _mm_cmpgt_epi8(_mm_add_epi8(a, value128), _mm_add_epi8(b, value128)); // signed comparison
    __m128i aTob = _mm_xor_si128(a, b); // a ^ (a ^ b) == b
    __m128i mask = _mm_and_si128(aTob, higher);
    return _mm_xor_si128(b, mask);
}
unittest
{
    byte16 R = cast(byte16) _mm_max_epu8(_mm_setr_epi8(45, 1, -4, -8, 9,  7, 0,-57, -4,-8,  9,  7, 0,-57, 0,  0),
                                         _mm_setr_epi8(-4,-8,  9,  7, 0,-57, 0,  0, 45, 1, -4, -8, 9,  7, 0,-57));
    byte[16] correct =                                [-4,-8, -4, -8, 9,-57, 0,-57, -4,-8, -4, -8, 9,-57, 0,-57];
    assert(R.array == correct);
}
1901 
/// Compare packed double-precision (64-bit) floating-point elements in `a`
/// and `b`, and return packed maximum values.
__m128d _mm_max_pd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_maxpd(a, b);
    }
    else
    {
        // Generates maxpd starting with LDC 1.9
        // Like the maxpd instruction, returns b when the comparison is false
        // (so b is returned when either operand is NaN).
        a[0] = (a[0] > b[0]) ? a[0] : b[0];
        a[1] = (a[1] > b[1]) ? a[1] : b[1];
        return a;
    }
}
unittest
{
    __m128d A = _mm_setr_pd(4.0, 1.0);
    __m128d B = _mm_setr_pd(1.0, 8.0);
    __m128d M = _mm_max_pd(A, B);
    assert(M.array[0] == 4.0);
    assert(M.array[1] == 8.0);
}
1924 
/// Compare the lower double-precision (64-bit) floating-point elements in `a`
/// and `b`, store the maximum value in the lower element of the result, and
/// copy the upper element from `a`.
__m128d _mm_max_sd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_maxsd(a, b);
    }
    else
    {
         __m128d r = a;
        // Generates maxsd starting with LDC 1.3
        r.array[0] = (a.array[0] > b.array[0]) ? a.array[0] : b.array[0];
        return r;
    }
}
unittest
{
    __m128d A = _mm_setr_pd(1.0, 1.0);
    __m128d B = _mm_setr_pd(4.0, 2.0);
    __m128d M = _mm_max_sd(A, B);
    assert(M.array[0] == 4.0);
    assert(M.array[1] == 1.0);
}
1947 
// _mm_mfence: serialize load and store operations (emits the `mfence`
// instruction). Implemented per compiler, like _mm_lfence above.
version(GNU)
{
    void _mm_mfence() pure @trusted
    {
        static if (GDC_with_SSE2)
        {
            __builtin_ia32_mfence();
        }
        else version(X86)
        {
            asm pure nothrow @nogc @trusted
            {
                "mfence;\n" : : : ;
            }
        }
        else
            static assert(false);
    }
}
else version(LDC)
{
    alias _mm_mfence = __builtin_ia32_mfence;
}
else static if (DMD_with_asm)
{
    void _mm_mfence() pure @safe
    {
        asm nothrow @nogc pure @safe
        {
            mfence;
        }
    }
}
else
    static assert(false);
unittest
{
    _mm_mfence();
}
1987 
/// Compare packed signed 16-bit integers in `a` and `b`, and return packed
/// minimum values.
__m128i _mm_min_epi16 (__m128i a, __m128i b) pure @safe
{
    // Note: clang uses a __builtin_ia32_pminsw128 which has disappeared from LDC LLVM (?)
    // Implemented using masks and XOR
    // Blend trick: result = b ^ ((a ^ b) & (a < b)), selecting a where a < b.
    __m128i lowerShorts = _mm_cmplt_epi16(a, b); // ones where a should be selected, b else
    __m128i aTob = _mm_xor_si128(a, b); // a ^ (a ^ b) == b
    __m128i mask = _mm_and_si128(aTob, lowerShorts);
    return _mm_xor_si128(b, mask);
}
unittest
{
    short8 R = cast(short8) _mm_min_epi16(_mm_setr_epi16(45, 1, -4, -8, 9,  7, 0,-57),
                                          _mm_setr_epi16(-4,-8,  9,  7, 0,-57, 0,  0));
    short[8] correct =  [-4,-8, -4, -8, 0,-57, 0, -57];
    assert(R.array == correct);
}
2004 
2005 
/// Compare packed unsigned 8-bit integers in `a` and `b`, and return packed minimum values.
__m128i _mm_min_epu8 (__m128i a, __m128i b) pure @safe
{
    // Same remark as with _mm_min_epi16: clang uses mystery intrinsics we don't have
    // Shift both inputs by 128 so that a signed comparison orders them like unsigned values.
    __m128i value128 = _mm_set1_epi8(-128);
    __m128i lower = _mm_cmplt_epi8(_mm_add_epi8(a, value128), _mm_add_epi8(b, value128)); // signed comparison
    __m128i aTob = _mm_xor_si128(a, b); // a ^ (a ^ b) == b
    __m128i mask = _mm_and_si128(aTob, lower);
    return _mm_xor_si128(b, mask);
}
unittest
{
    byte16 R = cast(byte16) _mm_min_epu8(_mm_setr_epi8(45, 1, -4, -8, 9,  7, 0,-57, -4,-8,  9,  7, 0,-57, 0,  0),
                                         _mm_setr_epi8(-4,-8,  9,  7, 0,-57, 0,  0, 45, 1, -4, -8, 9,  7, 0,-57));
    byte[16] correct =                                [45, 1,  9,  7, 0,  7, 0,  0, 45, 1,  9,  7, 0,  7, 0,  0];
    assert(R.array == correct);
}
2022 
/// Compare packed double-precision (64-bit) floating-point elements in `a` and `b`,
/// and return packed minimum values.
__m128d _mm_min_pd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_minpd(a, b);
    }
    else
    {
        // Generates minpd starting with LDC 1.9
        a.array[0] = (a.array[0] < b.array[0]) ? a.array[0] : b.array[0];
        a.array[1] = (a.array[1] < b.array[1]) ? a.array[1] : b.array[1];
        return a;
    }
}
unittest
{
    __m128d A = _mm_setr_pd(1.0, 2.0);
    __m128d B = _mm_setr_pd(4.0, 1.0);
    __m128d M = _mm_min_pd(A, B);
    assert(M.array[0] == 1.0);
    assert(M.array[1] == 1.0);
}
2045 
/// Compare the lower double-precision (64-bit) floating-point elements in `a` and `b`,
/// return the minimum in the lower element of the result, and copy the upper element
/// from `a` to the upper element of the result.
__m128d _mm_min_sd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_minsd(a, b);
    }
    else
    {
        // Generates minsd starting with LDC 1.3
        __m128d r = a;
        r.array[0] = (a.array[0] < b.array[0]) ? a.array[0] : b.array[0];
        return r;
    }
}
unittest
{
    __m128d A = _mm_setr_pd(1.0, 3.0);
    __m128d B = _mm_setr_pd(4.0, 2.0);
    __m128d M = _mm_min_sd(A, B);
    assert(M.array[0] == 1.0);
    assert(M.array[1] == 3.0);
}
2068 
/// Copy the lower 64-bit integer in `a` to the lower element of the result,
/// and zero the upper element (MOVQ).
__m128i _mm_move_epi64 (__m128i a) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_movq128(a);
    }
    else
    {
        long2 result = [ 0, 0 ];
        long2 la = cast(long2) a;
        result.array[0] = la.array[0];
        return cast(__m128i)(result);
    }
}
unittest
{
    long2 A = [13, 47];
    long2 B = cast(long2) _mm_move_epi64( cast(__m128i)A );
    long[2] correct = [13, 0];
    assert(B.array == correct);
}
2090 
/// Move the lower double-precision (64-bit) floating-point element from `b` to the
/// lower element of the result, and copy the upper element from `a` (MOVSD).
__m128d _mm_move_sd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_movsd(a, b); 
    }
    else
    {
        b.array[1] = a.array[1];
        return b;
    }
}
unittest
{
    double2 A = [13.0, 47.0];
    double2 B = [34.0, 58.0];
    double2 C = _mm_move_sd(A, B);
    double[2] correct = [34.0, 47.0];
    assert(C.array == correct);
}
2111 
version(LDC)
{
    /// Create mask from the most significant bit of each 8-bit element in `v`.
    alias _mm_movemask_epi8 = __builtin_ia32_pmovmskb128;
}
else
{
    static if (GDC_with_SSE2)
    {
        /// Create mask from the most significant bit of each 8-bit element in `v`.
        alias _mm_movemask_epi8 = __builtin_ia32_pmovmskb128;
    }
    else
    {
        /// Create mask from the most significant bit of each 8-bit element in `v`.
        int _mm_movemask_epi8(__m128i v) pure @safe
        {
            byte16 ai = cast(byte16)v;
            int r = 0;
            foreach(bit; 0..16)
            {
                // Bit `bit` of the result is the sign bit of byte `bit`.
                if (ai.array[bit] < 0) r += (1 << bit);
            }
            return r;
        }
    }
}
unittest
{
    assert(0x9C36 == _mm_movemask_epi8(_mm_set_epi8(-1, 0, 0, -1, -1, -1, 0, 0, 0, 0, -1, -1, 0, -1, -1, 0)));
}
2143 
version(LDC)
{
    /// Set each bit of mask `dst` based on the most significant bit of the corresponding
    /// packed double-precision (64-bit) floating-point element in `v`.
    alias _mm_movemask_pd = __builtin_ia32_movmskpd;
}
else
{
    static if (GDC_with_SSE2)
    {
        /// Set each bit of mask `dst` based on the most significant bit of the corresponding
        /// packed double-precision (64-bit) floating-point element in `v`.
        alias _mm_movemask_pd = __builtin_ia32_movmskpd;
    }
    else
    {
        /// Set each bit of mask `dst` based on the most significant bit of the corresponding
        /// packed double-precision (64-bit) floating-point element in `v`.
        int _mm_movemask_pd(__m128d v) pure @safe
        {
            // The sign bit of each double is the sign bit of its 64-bit integer reinterpretation.
            long2 lv = cast(long2)v;
            int r = 0;
            if (lv.array[0] < 0) r += 1;
            if (lv.array[1] < 0) r += 2;
            return r;
        }
    }
}
unittest
{
    __m128d A = cast(__m128d) _mm_set_epi64x(-1, 0);
    assert(_mm_movemask_pd(A) == 2);
}
2177 
/// Copy the lower 64-bit integer in `v` to a `__m64`.
__m64 _mm_movepi64_pi64 (__m128i v) pure @safe
{
    long2 lv = cast(long2)v;
    return long1(lv.array[0]);
}
unittest
{
    __m128i A = _mm_set_epi64x(-1, -2);
    __m64 R = _mm_movepi64_pi64(A);
    assert(R.array[0] == -2);
}
2190 
/// Copy the 64-bit integer `a` to the lower element of dest, and zero the upper element.
__m128i _mm_movpi64_epi64 (__m64 a) pure @trusted
{
    long2 r;
    r.ptr[0] = a.array[0]; // .ptr bypasses bounds checks, hence @trusted
    r.ptr[1] = 0;
    return cast(__m128i)r;
}
2199 
/// Multiply the low unsigned 32-bit integers from each packed 64-bit element
/// in `a` and `b`, and return the unsigned 64-bit results.
// Note: generates pmuludq in LDC with -O1
__m128i _mm_mul_epu32 (__m128i a, __m128i b) pure @trusted
{
    __m128i zero = _mm_setzero_si128();

    static if (__VERSION__ >= 2088)
    {
        // Need LLVM9 to avoid this shufflevector
        // Extract elements 0 and 2 of each operand, zero-extended to 64-bit.
        long2 la, lb;
        la.ptr[0] = cast(uint)a.array[0];
        la.ptr[1] = cast(uint)a.array[2];
        lb.ptr[0] = cast(uint)b.array[0];
        lb.ptr[1] = cast(uint)b.array[2];
    }
    else
    {
        // Interleave with zero to zero-extend lanes 0 and 2 into 64-bit lanes.
        long2 la = cast(long2) shufflevector!(int4, 0, 4, 2, 6)(a, zero);
        long2 lb = cast(long2) shufflevector!(int4, 0, 4, 2, 6)(b, zero);
    }

    static if (__VERSION__ >= 2076)
    {
        return cast(__m128i)(la * lb);
    }
    else
    {
        // long2 mul not supported before LDC 1.5
        la.ptr[0] *= lb.array[0];
        la.ptr[1] *= lb.array[1];
        return cast(__m128i)(la);
    }
}
unittest
{
    __m128i A = _mm_set_epi32(42, 0xDEADBEEF, 42, 0xffffffff);
    __m128i B = _mm_set_epi32(42, 0xCAFEBABE, 42, 0xffffffff);
    __m128i C = _mm_mul_epu32(A, B);
    long2 LC = cast(long2)C;
    assert(LC.array[0] == 18446744065119617025uL);
    assert(LC.array[1] == 12723420444339690338uL);
}
2241 
2242 
/// Multiply packed double-precision (64-bit) floating-point elements in `a` and `b`.
__m128d _mm_mul_pd(__m128d a, __m128d b) pure @safe
{
    return a * b;
}
unittest
{
    __m128d a = [-2.0, 1.5];
    a = _mm_mul_pd(a, a);
    assert(a.array == [4.0, 2.25]);
}
2253 
version(DigitalMars)
{
    // Work-around for https://issues.dlang.org/show_bug.cgi?id=19599
    /// Multiply the lower double-precision (64-bit) floating-point element in `a` and `b`,
    /// store it in the lower element of the result, and copy the upper element from `a`.
    __m128d _mm_mul_sd(__m128d a, __m128d b) pure @safe
    {
        asm pure nothrow @nogc @trusted { nop;} // inhibits the miscompilation from issue 19599
        a.array[0] = a.array[0] * b.array[0];
        return a;
    }
}
else
{
    static if (GDC_with_SSE2)
    {
        /// Multiply the lower double-precision (64-bit) floating-point element in `a` and `b`,
        /// store it in the lower element of the result, and copy the upper element from `a`.
        alias _mm_mul_sd = __builtin_ia32_mulsd;
    }
    else
    {
        /// Multiply the lower double-precision (64-bit) floating-point element in `a` and `b`,
        /// store it in the lower element of the result, and copy the upper element from `a`.
        __m128d _mm_mul_sd(__m128d a, __m128d b) pure @safe
        {
            a.array[0] *= b.array[0];
            return a;
        }
    }
}
unittest
{
    __m128d a = [-2.0, 1.5];
    a = _mm_mul_sd(a, a);
    assert(a.array == [4.0, 1.5]);
}
2285 
/// Multiply the low unsigned 32-bit integers from `a` and `b`, 
/// and get an unsigned 64-bit result.
__m64 _mm_mul_su32 (__m64 a, __m64 b) pure @safe
{
    // Implemented by widening to 128-bit and reusing the SSE2 intrinsic.
    return to_m64(_mm_mul_epu32(to_m128i(a), to_m128i(b)));
}
unittest
{
    __m64 A = _mm_set_pi32(42, 0xDEADBEEF);
    __m64 B = _mm_set_pi32(42, 0xCAFEBABE);
    __m64 C = _mm_mul_su32(A, B);
    assert(C.array[0] == 0xDEADBEEFuL * 0xCAFEBABEuL);
}
2299 
version(LDC)
{
    /// Multiply packed signed 16-bit integers in `a` and `b`, returning the high 16 bits
    /// of each 32-bit intermediate product.
    alias _mm_mulhi_epi16 = __builtin_ia32_pmulhw128;
}
else
{
    static if (GDC_with_SSE2)
    {
        /// Multiply packed signed 16-bit integers in `a` and `b`, returning the high 16 bits
        /// of each 32-bit intermediate product.
        alias _mm_mulhi_epi16 = __builtin_ia32_pmulhw128;
    }
    else
    {
        /// Multiply packed signed 16-bit integers in `a` and `b`, returning the high 16 bits
        /// of each 32-bit intermediate product.
        __m128i _mm_mulhi_epi16 (__m128i a, __m128i b) pure @safe
        {
            short8 sa = cast(short8)a;
            short8 sb = cast(short8)b;
            short8 r = void;
            // Products are computed as int, then the arithmetic shift keeps the high half.
            r.array[0] = (sa.array[0] * sb.array[0]) >> 16;
            r.array[1] = (sa.array[1] * sb.array[1]) >> 16;
            r.array[2] = (sa.array[2] * sb.array[2]) >> 16;
            r.array[3] = (sa.array[3] * sb.array[3]) >> 16;
            r.array[4] = (sa.array[4] * sb.array[4]) >> 16;
            r.array[5] = (sa.array[5] * sb.array[5]) >> 16;
            r.array[6] = (sa.array[6] * sb.array[6]) >> 16;
            r.array[7] = (sa.array[7] * sb.array[7]) >> 16;
            return cast(__m128i)r;
        }
    }
}
unittest
{
    __m128i A = _mm_setr_epi16(0, -16, 2, 3, 4, 8, 16, 7);
    __m128i B = _mm_set1_epi16(16384);
    short8 R = cast(short8)_mm_mulhi_epi16(A, B);
    short[8] correct = [0, -4, 0, 0, 1, 2, 4, 1];
    assert(R.array == correct);
}
2337 
version(LDC)
{
    /// Multiply packed unsigned 16-bit integers in `a` and `b`, returning the high 16 bits
    /// of each 32-bit intermediate product.
    alias _mm_mulhi_epu16 = __builtin_ia32_pmulhuw128;
}
else
{
    static if (GDC_with_SSE2)
    {
        /// Multiply packed unsigned 16-bit integers in `a` and `b`, returning the high 16 bits
        /// of each 32-bit intermediate product.
        alias _mm_mulhi_epu16 = __builtin_ia32_pmulhuw128;
    }
    else
    {
        /// Multiply packed unsigned 16-bit integers in `a` and `b`, returning the high 16 bits
        /// of each 32-bit intermediate product.
        __m128i _mm_mulhi_epu16 (__m128i a, __m128i b) pure @safe
        {
            short8 sa = cast(short8)a;
            short8 sb = cast(short8)b;
            short8 r = void;
            // Products may wrap in 32-bit signed arithmetic (0xFFFF * 0xFFFF overflows int),
            // but the final cast(short) keeps only the low 16 bits of the shifted value,
            // which are identical whether the shift was arithmetic or logical.
            r.array[0] = cast(short)( (cast(ushort)sa.array[0] * cast(ushort)sb.array[0]) >> 16 );
            r.array[1] = cast(short)( (cast(ushort)sa.array[1] * cast(ushort)sb.array[1]) >> 16 );
            r.array[2] = cast(short)( (cast(ushort)sa.array[2] * cast(ushort)sb.array[2]) >> 16 );
            r.array[3] = cast(short)( (cast(ushort)sa.array[3] * cast(ushort)sb.array[3]) >> 16 );
            r.array[4] = cast(short)( (cast(ushort)sa.array[4] * cast(ushort)sb.array[4]) >> 16 );
            r.array[5] = cast(short)( (cast(ushort)sa.array[5] * cast(ushort)sb.array[5]) >> 16 );
            r.array[6] = cast(short)( (cast(ushort)sa.array[6] * cast(ushort)sb.array[6]) >> 16 );
            r.array[7] = cast(short)( (cast(ushort)sa.array[7] * cast(ushort)sb.array[7]) >> 16 );
            return cast(__m128i)r;
        }
    }
}
unittest
{
    __m128i A = _mm_setr_epi16(0, -16, 2, 3, 4, 8, 16, 7);
    __m128i B = _mm_set1_epi16(16384);
    short8 R = cast(short8)_mm_mulhi_epu16(A, B);
    short[8] correct = [0, 0x3FFC, 0, 0, 1, 2, 4, 1];
    assert(R.array == correct);
}
2375 
/// Multiply packed 16-bit integers in `a` and `b`, returning the low 16 bits
/// of each 32-bit intermediate product.
__m128i _mm_mullo_epi16 (__m128i a, __m128i b) pure @safe
{
    return cast(__m128i)(cast(short8)a * cast(short8)b);
}
unittest
{
    __m128i A = _mm_setr_epi16(16384, -16, 0,      3, 4, 1, 16, 7);
    __m128i B = _mm_set1_epi16(16384);
    short8 R = cast(short8)_mm_mullo_epi16(A, B);
    short[8] correct = [0, 0, 0, -16384, 0, 16384, 0, -16384];
    assert(R.array == correct);
}
2388 
/// Compute the bitwise OR of packed double-precision (64-bit) floating-point elements.
__m128d _mm_or_pd (__m128d a, __m128d b) pure @safe
{
    return cast(__m128d)( cast(__m128i)a | cast(__m128i)b );
}

/// Compute the bitwise OR of 128 bits (representing integer data) in `a` and `b`.
__m128i _mm_or_si128 (__m128i a, __m128i b) pure @safe
{
    return a | b;
}
2398 
version(LDC)
{
    /// Convert packed 32-bit integers from `a` and `b` to packed 16-bit integers
    /// using signed saturation.
    alias _mm_packs_epi32 = __builtin_ia32_packssdw128;
}
else
{
    static if (GDC_with_SSE2)
    {
        /// Convert packed 32-bit integers from `a` and `b` to packed 16-bit integers
        /// using signed saturation.
        alias _mm_packs_epi32 = __builtin_ia32_packssdw128;
    }
    else
    {
        /// Convert packed 32-bit integers from `a` and `b` to packed 16-bit integers
        /// using signed saturation.
        __m128i _mm_packs_epi32 (__m128i a, __m128i b) pure @safe
        {
            short8 r;
            r.array[0] = saturateSignedIntToSignedShort(a.array[0]);
            r.array[1] = saturateSignedIntToSignedShort(a.array[1]);
            r.array[2] = saturateSignedIntToSignedShort(a.array[2]);
            r.array[3] = saturateSignedIntToSignedShort(a.array[3]);
            r.array[4] = saturateSignedIntToSignedShort(b.array[0]);
            r.array[5] = saturateSignedIntToSignedShort(b.array[1]);
            r.array[6] = saturateSignedIntToSignedShort(b.array[2]);
            r.array[7] = saturateSignedIntToSignedShort(b.array[3]);
            return cast(__m128i)r;
        }
    }
}
unittest
{
    __m128i A = _mm_setr_epi32(100000, -100000, 1000, 0);
    short8 R = cast(short8) _mm_packs_epi32(A, A);
    short[8] correct = [32767, -32768, 1000, 0, 32767, -32768, 1000, 0];
    assert(R.array == correct);
}
2433 
version(LDC)
{
    /// Convert packed 16-bit integers from `a` and `b` to packed 8-bit integers
    /// using signed saturation.
    alias _mm_packs_epi16 = __builtin_ia32_packsswb128;
}
else
{
    static if (GDC_with_SSE2)
    {
        /// Convert packed 16-bit integers from `a` and `b` to packed 8-bit integers
        /// using signed saturation.
        alias _mm_packs_epi16 = __builtin_ia32_packsswb128;
    }
    else
    {
        /// Convert packed 16-bit integers from `a` and `b` to packed 8-bit integers
        /// using signed saturation.
        __m128i _mm_packs_epi16 (__m128i a, __m128i b) pure @safe
        {
            byte16 r;
            short8 sa = cast(short8)a;
            short8 sb = cast(short8)b;
            foreach(i; 0..8)
                r.array[i] = saturateSignedWordToSignedByte(sa.array[i]);
            foreach(i; 0..8)
                r.array[i+8] = saturateSignedWordToSignedByte(sb.array[i]);
            return cast(__m128i)r;
        }
    }
}
unittest
{
    __m128i A = _mm_setr_epi16(1000, -1000, 1000, 0, 256, -129, 254, 0);
    byte16 R = cast(byte16) _mm_packs_epi16(A, A);
    byte[16] correct = [127, -128, 127, 0, 127, -128, 127, 0,
                        127, -128, 127, 0, 127, -128, 127, 0];
    assert(R.array == correct);
}
2467 
version(LDC)
{
    /// Convert packed 16-bit integers from `a` and `b` to packed 8-bit integers
    /// using unsigned saturation.
    alias _mm_packus_epi16 = __builtin_ia32_packuswb128;
}
else
{
    static if (GDC_with_SSE2)
    {
        /// Convert packed 16-bit integers from `a` and `b` to packed 8-bit integers
        /// using unsigned saturation.
        alias _mm_packus_epi16 = __builtin_ia32_packuswb128;
    }
    else
    {
        /// Convert packed 16-bit integers from `a` and `b` to packed 8-bit integers
        /// using unsigned saturation.
        __m128i _mm_packus_epi16 (__m128i a, __m128i b) pure @trusted
        {
            short8 sa = cast(short8)a;
            short8 sb = cast(short8)b;
            ubyte[16] result = void;
            for (int i = 0; i < 8; ++i)
            {
                // Clamp each signed word to [0, 255].
                short s = sa[i];
                if (s < 0) s = 0;
                if (s > 255) s = 255;
                result[i] = cast(ubyte)s;

                s = sb[i];
                if (s < 0) s = 0;
                if (s > 255) s = 255;
                result[i+8] = cast(ubyte)s;
            }
            return cast(__m128i) loadUnaligned!(byte16)(cast(byte*)result.ptr);
        }
    }
}
unittest
{
    __m128i A = _mm_setr_epi16(-10, 400, 0, 256, 255, 2, 1, 0);
    byte16 AA = cast(byte16) _mm_packus_epi16(A, A);
    static immutable ubyte[16] correctResult = [0, 255, 0, 255, 255, 2, 1, 0,
                                                0, 255, 0, 255, 255, 2, 1, 0];
    foreach(i; 0..16)
        assert(AA.array[i] == cast(byte)(correctResult[i]));
}
2510 
2511 
version(GNU)
{
    /// Provide a hint to the processor that the code sequence is a spin-wait loop (PAUSE).
    void _mm_pause() pure @trusted
    {
        static if (GDC_with_SSE2)
        {
            __builtin_ia32_pause();
        }
        else version(X86)
        {
            asm pure nothrow @nogc @trusted
            {
                "pause;\n" : : : ;
            }
        }
        else
            static assert(false);
    }
}
else version(LDC)
{
    /// Provide a hint to the processor that the code sequence is a spin-wait loop (PAUSE).
    alias _mm_pause = __builtin_ia32_pause;
}
else static if (DMD_with_asm)
{
    /// Provide a hint to the processor that the code sequence is a spin-wait loop (PAUSE).
    void _mm_pause() pure @safe
    {
        asm nothrow @nogc pure @safe
        {
            rep; nop; // F3 90 =  pause
        }
    }
}
else
    static assert(false);
unittest
{
    _mm_pause();
}
2551 
2552 
version(LDC)
{
    /// Compute the absolute differences of packed unsigned 8-bit integers in `a` and `b`,
    /// then horizontally sum each consecutive 8 differences to produce two unsigned 16-bit
    /// sums in the low 16 bits of each 64-bit lane (PSADBW).
    alias _mm_sad_epu8 = __builtin_ia32_psadbw128;
}
else
{
    static if (GDC_with_SSE2)
    {
        /// Compute the absolute differences of packed unsigned 8-bit integers in `a` and `b`,
        /// then horizontally sum each consecutive 8 differences to produce two unsigned 16-bit
        /// sums in the low 16 bits of each 64-bit lane (PSADBW).
        alias _mm_sad_epu8 = __builtin_ia32_psadbw128;
    }
    else
    {
        /// Compute the absolute differences of packed unsigned 8-bit integers in `a` and `b`,
        /// then horizontally sum each consecutive 8 differences to produce two unsigned 16-bit
        /// sums in the low 16 bits of each 64-bit lane (PSADBW).
        __m128i _mm_sad_epu8 (__m128i a, __m128i b) pure @safe
        {
            byte16 ab = cast(byte16)a;
            byte16 bb = cast(byte16)b;
            ubyte[16] t;
            foreach(i; 0..16)
            {
                int diff = cast(ubyte)(ab.array[i]) - cast(ubyte)(bb.array[i]);
                if (diff < 0) diff = -diff;
                t[i] = cast(ubyte)(diff);
            }
            // Sums fit in 16 bits: at most 8 * 255 = 2040.
            int4 r = _mm_setzero_si128();
            r.array[0] = t[0] + t[1] + t[2] + t[3] + t[4] + t[5] + t[6] + t[7];
            r.array[2] = t[8] + t[9] + t[10]+ t[11]+ t[12]+ t[13]+ t[14]+ t[15];
            return r;
        }
    }
}
unittest
{
    __m128i A = _mm_setr_epi8(3, 4, 6, 8, 12, 14, 18, 20, 24, 30, 32, 38, 42, 44, 48, 54); // primes + 1
    __m128i B = _mm_set1_epi8(1);
    __m128i R = _mm_sad_epu8(A, B);
    int[4] correct = [2 + 3 + 5 + 7 + 11 + 13 + 17 + 19,
                      0,
                      23 + 29 + 31 + 37 + 41 + 43 + 47 + 53,
                      0];
    assert(R.array == correct);
}
2594 
/// Set packed 16-bit integers with the supplied values (`e0` is the lowest lane).
__m128i _mm_set_epi16 (short e7, short e6, short e5, short e4, short e3, short e2, short e1, short e0) pure @trusted
{
    short[8] result = [e0, e1, e2, e3, e4, e5, e6, e7];
    return cast(__m128i) loadUnaligned!(short8)(result.ptr);
}
unittest
{
    __m128i A = _mm_set_epi16(7, 6, 5, 4, 3, 2, 1, 0);
    short8 B = cast(short8) A;
    foreach(i; 0..8)
        assert(B.array[i] == i);
}

/// Set packed 32-bit integers with the supplied values (`e0` is the lowest lane).
__m128i _mm_set_epi32 (int e3, int e2, int e1, int e0) pure @trusted
{
    int[4] result = [e0, e1, e2, e3];
    return loadUnaligned!(int4)(result.ptr);
}
unittest
{
    __m128i A = _mm_set_epi32(3, 2, 1, 0);
    foreach(i; 0..4)
        assert(A.array[i] == i);
}

/// Set packed 64-bit integers with the supplied `__m64` values (`e0` is the lower lane).
__m128i _mm_set_epi64(__m64 e1, __m64 e0) pure @trusted
{
    long[2] result = [e0.array[0], e1.array[0]];
    return cast(__m128i)( loadUnaligned!(long2)(result.ptr) );
}
unittest
{
    __m128i A = _mm_set_epi64(_mm_cvtsi64_m64(1234), _mm_cvtsi64_m64(5678));
    long2 B = cast(long2) A;
    assert(B.array[0] == 5678);
    assert(B.array[1] == 1234);
}

/// Set packed 64-bit integers with the supplied values (`e0` is the lower lane).
__m128i _mm_set_epi64x (long e1, long e0) pure @trusted
{
    long[2] result = [e0, e1];
    return cast(__m128i)( loadUnaligned!(long2)(result.ptr) );
}
unittest
{
    __m128i A = _mm_set_epi64x(1234, 5678);
    long2 B = cast(long2) A;
    assert(B.array[0] == 5678);
    assert(B.array[1] == 1234);
}

/// Set packed 8-bit integers with the supplied values (`e0` is the lowest lane).
__m128i _mm_set_epi8 (byte e15, byte e14, byte e13, byte e12,
                      byte e11, byte e10, byte e9, byte e8,
                      byte e7, byte e6, byte e5, byte e4,
                      byte e3, byte e2, byte e1, byte e0) pure @trusted
{
    byte[16] result = [e0, e1,  e2,  e3,  e4,  e5,  e6, e7,
                     e8, e9, e10, e11, e12, e13, e14, e15];
    return cast(__m128i)( loadUnaligned!(byte16)(result.ptr) );
}
2655 
/// Set packed double-precision (64-bit) floating-point elements (`e0` is the lower lane).
__m128d _mm_set_pd (double e1, double e0) pure @trusted
{
    double[2] result = [e0, e1];
    return loadUnaligned!(double2)(result.ptr);
}
unittest
{
    __m128d A = _mm_set_pd(61.0, 55.0);
    double[2] correct = [55.0, 61.0];
    assert(A.array == correct);
}

/// Broadcast double-precision (64-bit) floating-point value `a` to both elements.
__m128d _mm_set_pd1 (double a) pure @trusted
{
    double[2] result = [a, a];
    return loadUnaligned!(double2)(result.ptr);
}
unittest
{
    __m128d A = _mm_set_pd1(61.0);
    double[2] correct = [61.0, 61.0];
    assert(A.array == correct);
}

/// Copy `a` to the lower element, and zero the upper element.
__m128d _mm_set_sd (double a) pure @trusted
{
    double[2] result = [a, 0];
    return loadUnaligned!(double2)(result.ptr);
}

/// Broadcast 16-bit integer `a` to all elements.
__m128i _mm_set1_epi16 (short a) pure @trusted
{
    return cast(__m128i)(short8(a));
}

/// Broadcast 32-bit integer `a` to all elements.
__m128i _mm_set1_epi32 (int a) pure @trusted
{
    return cast(__m128i)(int4(a));
}
unittest
{
    __m128 a = _mm_set1_ps(-1.0f);
    __m128 b = cast(__m128) _mm_set1_epi32(0x7fffffff);
    assert(_mm_and_ps(a, b).array == [1.0f, 1, 1, 1]);
}

/// Broadcast 64-bit integer `a` to all elements of `dst`.
__m128i _mm_set1_epi64 (__m64 a) pure @safe
{
    return _mm_set_epi64(a, a);
}

/// Broadcast 64-bit integer `a` to all elements.
__m128i _mm_set1_epi64x (long a) pure @trusted
{
    return cast(__m128i)(long2(a));
}

/// Broadcast 8-bit integer `a` to all elements.
__m128i _mm_set1_epi8 (byte a) pure @trusted
{
    return cast(__m128i)(byte16(a));
}

/// Broadcast double-precision (64-bit) floating-point value `a` to both elements.
alias _mm_set1_pd = _mm_set_pd1;
2719 
/// Set packed 16-bit integers in reverse order (`e7` becomes the lowest lane).
__m128i _mm_setr_epi16 (short e7, short e6, short e5, short e4, 
                        short e3, short e2, short e1, short e0) pure @trusted
{
    short[8] result = [e7, e6, e5, e4, e3, e2, e1, e0];
    return cast(__m128i)( loadUnaligned!(short8)(result.ptr) );
}

/// Set packed 32-bit integers in reverse order (`e3` becomes the lowest lane).
__m128i _mm_setr_epi32 (int e3, int e2, int e1, int e0) pure @trusted
{
    int[4] result = [e3, e2, e1, e0];
    return cast(__m128i)( loadUnaligned!(int4)(result.ptr) );
}

/// Set packed 64-bit integers in reverse order (`e1` becomes the lower lane).
__m128i _mm_setr_epi64 (long e1, long e0) pure @trusted
{
    long[2] result = [e1, e0];
    return cast(__m128i)( loadUnaligned!(long2)(result.ptr) );
}

/// Set packed 8-bit integers in reverse order (`e15` becomes the lowest lane).
__m128i _mm_setr_epi8 (byte e15, byte e14, byte e13, byte e12,
                       byte e11, byte e10, byte e9,  byte e8,
                       byte e7,  byte e6,  byte e5,  byte e4,
                       byte e3,  byte e2,  byte e1,  byte e0) pure @trusted
{
    byte[16] result = [e15, e14, e13, e12, e11, e10, e9, e8,
                      e7,  e6,  e5,  e4,  e3,  e2, e1, e0];
    return cast(__m128i)( loadUnaligned!(byte16)(result.ptr) );
}

/// Set packed double-precision (64-bit) floating-point elements in reverse order
/// (`e1` becomes the lower lane).
__m128d _mm_setr_pd (double e1, double e0) pure @trusted
{
    double[2] result = [e1, e0];
    return loadUnaligned!(double2)(result.ptr);
}
unittest
{
    __m128d A = _mm_setr_pd(61.0, 55.0);
    double[2] correct = [61.0, 55.0];
    assert(A.array == correct);
}

/// Return a vector of two doubles with all elements set to zero.
__m128d _mm_setzero_pd () pure @trusted
{
    double[2] result = [0.0, 0.0];
    return loadUnaligned!(double2)(result.ptr);
}

/// Return a vector of 128 bits of integer data with all elements set to zero.
__m128i _mm_setzero_si128() pure @trusted
{
    int[4] result = [0, 0, 0, 0];
    return cast(__m128i)( loadUnaligned!(int4)(result.ptr) );
}
2772 
/// Shuffle 32-bit integers in `a` using the control `imm8`
/// (2 bits per destination lane, as produced by `_MM_SHUFFLE`).
__m128i _mm_shuffle_epi32(int imm8)(__m128i a) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_pshufd(a, imm8);
    }
    else
    {
        return shufflevector!(int4, (imm8 >> 0) & 3,
                                    (imm8 >> 2) & 3,
                                    (imm8 >> 4) & 3,
                                    (imm8 >> 6) & 3)(a, a);
    }
}
unittest
{
    __m128i A = _mm_setr_epi32(0, 1, 2, 3);
    enum int SHUFFLE = _MM_SHUFFLE(0, 1, 2, 3);
    int4 B = cast(int4) _mm_shuffle_epi32!SHUFFLE(A);
    int[4] expectedB = [ 3, 2, 1, 0 ];
    assert(B.array == expectedB);
}
2795 
/// Shuffle double-precision (64-bit) floating-point elements: bit 0 of `imm8`
/// selects the lower lane from `a`, bit 1 selects the upper lane from `b`.
__m128d _mm_shuffle_pd (int imm8)(__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_shufpd(a, b, imm8);
    }
    else
    {
        return shufflevector!(double2, 0 + ( imm8 & 1 ),
                                       2 + ( (imm8 >> 1) & 1 ))(a, b);
    }
}
unittest
{
    __m128d A = _mm_setr_pd(0.5, 2.0);
    __m128d B = _mm_setr_pd(4.0, 5.0);
    enum int SHUFFLE = _MM_SHUFFLE2(1, 1);
    __m128d R = _mm_shuffle_pd!SHUFFLE(A, B);
    double[2] correct = [ 2.0, 5.0 ];
    assert(R.array == correct);
}
2817 
/// Shuffle 16-bit integers in the high 64 bits of `a` using the control `imm8`;
/// the low 64 bits are passed through unchanged.
__m128i _mm_shufflehi_epi16(int imm8)(__m128i a) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_pshufhw(a, imm8);
    }
    else
    {
        return cast(__m128i) shufflevector!(short8, 0, 1, 2, 3,
                                          4 + ( (imm8 >> 0) & 3 ),
                                          4 + ( (imm8 >> 2) & 3 ),
                                          4 + ( (imm8 >> 4) & 3 ),
                                          4 + ( (imm8 >> 6) & 3 ))(cast(short8)a, cast(short8)a);
    }
}
unittest
{
    __m128i A = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
    enum int SHUFFLE = _MM_SHUFFLE(0, 1, 2, 3);
    short8 C = cast(short8) _mm_shufflehi_epi16!SHUFFLE(A);
    short[8] expectedC = [ 0, 1, 2, 3, 7, 6, 5, 4 ];
    assert(C.array == expectedC);
}
2841 
/// Shuffle 16-bit integers in the low 64 bits of `a` using the control `imm8`;
/// the high 64 bits are passed through unchanged.
__m128i _mm_shufflelo_epi16(int imm8)(__m128i a) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_pshuflw(a, imm8);
    }
    else
    {
        return cast(__m128i) shufflevector!(short8, ( (imm8 >> 0) & 3 ),
                                                    ( (imm8 >> 2) & 3 ),
                                                    ( (imm8 >> 4) & 3 ),
                                                    ( (imm8 >> 6) & 3 ), 4, 5, 6, 7)(cast(short8)a, cast(short8)a);
    }
}
unittest
{
    __m128i A = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
    enum int SHUFFLE = _MM_SHUFFLE(0, 1, 2, 3);
    short8 B = cast(short8) _mm_shufflelo_epi16!SHUFFLE(A);
    short[8] expectedB = [ 3, 2, 1, 0, 4, 5, 6, 7 ];
    assert(B.array == expectedB);
}
2864 
version(LDC)
{
    /// Shift packed 32-bit integers in `a` left by `count` while shifting in zeros.
    alias _mm_sll_epi32 = __builtin_ia32_pslld128;
}
else static if (GDC_with_SSE2)
{
    /// Shift packed 32-bit integers in `a` left by `count` while shifting in zeros.
    alias _mm_sll_epi32 = __builtin_ia32_pslld128;
}
else static if (DMD_with_32bit_asm)
{
    /// Shift packed 32-bit integers in `a` left by `count` while shifting in zeros.
    __m128i _mm_sll_epi32 (__m128i a, __m128i count) pure @safe
    {
        asm pure nothrow @nogc @trusted
        {
            movdqu XMM0, a;
            movdqu XMM1, count;
            pslld XMM0, XMM1;
            movdqu a, XMM0;
        }
        return a;
    }
}
else
{
    /// Shift packed 32-bit integers in `a` left by `count` while shifting in zeros.
    __m128i _mm_sll_epi32 (__m128i a, __m128i count) pure @safe
    {
        // The shift amount is the low 64-bit integer of `count`.
        // Use .array element access (like the _mm_sll_epi64/_mm_sll_epi16 fallbacks)
        // instead of direct vector indexing, which older frontends don't support.
        int4 r = void;
        long2 lc = cast(long2)count;
        int bits = cast(int)(lc.array[0]);
        foreach(i; 0..4)
            r.array[i] = cast(uint)(a.array[i]) << bits;
        return r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi32(0, 2, 3, -4);
    __m128i B = _mm_sll_epi32(A, _mm_cvtsi32_si128(1));
    int[4] expectedB = [ 0, 4, 6, -8];
    assert(B.array == expectedB);
}
2907 
version(LDC)
{
    /// Shift packed 64-bit integers in `a` left by `count` while shifting in zeros.
    alias _mm_sll_epi64  = __builtin_ia32_psllq128;
}
else static if (GDC_with_SSE2)
{
    /// Shift packed 64-bit integers in `a` left by `count` while shifting in zeros.
    alias _mm_sll_epi64  = __builtin_ia32_psllq128;
}
else static if (DMD_with_32bit_asm)
{
    /// Shift packed 64-bit integers in `a` left by `count` while shifting in zeros.
    __m128i _mm_sll_epi64 (__m128i a, __m128i count) pure @safe
    {
        asm pure nothrow @nogc @trusted
        {
            movdqu XMM0, a;
            movdqu XMM1, count;
            psllq XMM0, XMM1;
            movdqu a, XMM0;
        }
        return a;
    }
}
else
{
    /// Shift packed 64-bit integers in `a` left by `count` while shifting in zeros.
    __m128i _mm_sll_epi64 (__m128i a, __m128i count) pure @safe
    {
        // The shift amount is the low 64-bit integer of `count`.
        long2 r = void;
        long2 sa = cast(long2)a;
        long2 lc = cast(long2)count;
        int bits = cast(int)(lc.array[0]);
        foreach(i; 0..2)
            r.array[i] = cast(ulong)(sa.array[i]) << bits;
        return cast(__m128i)r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi64(8, -4);
    long2 B = cast(long2) _mm_sll_epi64(A, _mm_cvtsi32_si128(1));
    long[2] expectedB = [ 16, -8];
    assert(B.array == expectedB);
}
2950 
version(LDC)
{
    /// Shift packed 16-bit integers in `a` left by `count` while shifting in zeros.
    alias _mm_sll_epi16 = __builtin_ia32_psllw128;
}
else static if (GDC_with_SSE2)
{
    /// Shift packed 16-bit integers in `a` left by `count` while shifting in zeros.
    alias _mm_sll_epi16 = __builtin_ia32_psllw128;
}
else static if (DMD_with_32bit_asm)
{
    /// Shift packed 16-bit integers in `a` left by `count` while shifting in zeros.
    __m128i _mm_sll_epi16 (__m128i a, __m128i count) pure @trusted
    {
        asm pure nothrow @nogc
        {
            movdqu XMM0, a;
            movdqu XMM1, count;
            psllw XMM0, XMM1;
            movdqu a, XMM0;
        }
        return a;
    }
}
else
{
    /// Shift packed 16-bit integers in `a` left by `count` while shifting in zeros.
    __m128i _mm_sll_epi16 (__m128i a, __m128i count) pure @trusted
    {
        // The shift amount is the low 64-bit integer of `count`.
        short8 sa = cast(short8)a;
        long2 lc = cast(long2)count;
        int bits = cast(int)(lc.array[0]);
        short8 r = void;
        foreach(i; 0..8)
            r.ptr[i] = cast(short)(cast(ushort)(sa.array[i]) << bits);
        return cast(int4)r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi16(0, 1, 2, 3, -4, -5, 6, 7);
    short8 B = cast(short8)( _mm_sll_epi16(A, _mm_cvtsi32_si128(1)) );
    short[8] expectedB =     [ 0, 2, 4, 6, -8, -10, 12, 14 ];
    assert(B.array == expectedB);
}
2993 
version(LDC)
{
    /// Shift packed 32-bit integers in `a` left by `imm8` while shifting in zeros.
    alias _mm_slli_epi32 = __builtin_ia32_pslldi128;
}
else
{
    static if (GDC_with_SSE2)
    {
        /// Shift packed 32-bit integers in `a` left by `imm8` while shifting in zeros.
        alias _mm_slli_epi32 = __builtin_ia32_pslldi128;
    }
    else
    {
        /// Shift packed 32-bit integers in `a` left by `imm8` while shifting in zeros.
        __m128i _mm_slli_epi32 (__m128i a, int imm8) pure @safe
        {
            int4 r = void;
            foreach(i; 0..4)
                r.array[i] = cast(uint)(a.array[i]) << imm8;
            return r;
        }
    }
}
unittest
{
    __m128i A = _mm_setr_epi32(0, 2, 3, -4);
    __m128i B = _mm_slli_epi32(A, 1);
    int[4] expectedB = [ 0, 4, 6, -8];
    assert(B.array == expectedB);
}
3022 
/// Shift packed 64-bit integers in `a` left by `imm8` while shifting in zeros.
version(LDC)
{
    alias _mm_slli_epi64  = __builtin_ia32_psllqi128;
}
else
{
    static if (GDC_with_SSE2)
    {
        alias _mm_slli_epi64  = __builtin_ia32_psllqi128;
    }
    else
    {
        /// Shift packed 64-bit integers in `a` left by `imm8` while shifting in zeros.
        __m128i _mm_slli_epi64 (__m128i a, int imm8) pure @safe
        {
            long2 r = void;
            long2 sa = cast(long2)a;
            if (cast(uint)imm8 > 63)
            {
                // PSLLQ zeroes the register for any count above 63; a plain
                // D shift by >= 64 would be undefined behaviour instead.
                foreach(i; 0..2)
                    r.array[i] = 0;
            }
            else
            {
                foreach(i; 0..2)
                    r.array[i] = cast(ulong)(sa.array[i]) << imm8;
            }
            return cast(__m128i)r;
        }
    }
}
unittest
{
    __m128i A = _mm_setr_epi64(8, -4);
    long2 B = cast(long2) _mm_slli_epi64(A, 1);
    long[2] expectedB = [ 16, -8];
    assert(B.array == expectedB);

    // Counts above 63 clear every lane.
    long2 C = cast(long2) _mm_slli_epi64(A, 64);
    long[2] zero2 = [0, 0];
    assert(C.array == zero2);
}
3052 
/// Shift packed 16-bit integers in `a` left by `imm8` while shifting in zeros.
version(LDC)
{
    alias _mm_slli_epi16 = __builtin_ia32_psllwi128;
}
else
{
    static if (GDC_with_SSE2)
    {
        alias _mm_slli_epi16 = __builtin_ia32_psllwi128;
    }
    else
    {
        /// Shift packed 16-bit integers in `a` left by `imm8` while shifting in zeros.
        __m128i _mm_slli_epi16 (__m128i a, int imm8) pure @safe
        {
            short8 sa = cast(short8)a;
            short8 r = void;
            if (cast(uint)imm8 > 15)
            {
                // PSLLW clears every lane for counts above 15; a plain D
                // shift by >= 32 would be undefined behaviour instead.
                foreach(i; 0..8)
                    r.array[i] = 0;
            }
            else
            {
                foreach(i; 0..8)
                    r.array[i] = cast(short)(cast(ushort)(sa.array[i]) << imm8);
            }
            return cast(int4)r;
        }
    }
}
unittest
{
    __m128i A = _mm_setr_epi16(0, 1, 2, 3, -4, -5, 6, 7);
    short8 B = cast(short8)( _mm_slli_epi16(A, 1) );
    short[8] expectedB = [ 0, 2, 4, 6, -8, -10, 12, 14 ];
    assert(B.array == expectedB);

    // Counts above 15 clear every lane.
    short8 C = cast(short8)( _mm_slli_epi16(A, 16) );
    short[8] zero8 = [0, 0, 0, 0, 0, 0, 0, 0];
    assert(C.array == zero8);
}
3082 
3083 
/// Shift `a` left by `bytes` bytes while shifting in zeros.
__m128i _mm_slli_si128(ubyte bytes)(__m128i op) pure @trusted
{
    // Shifting by 16 bytes or more empties the whole register.
    static if (bytes & 0xF0)
    {
        return _mm_setzero_si128();
    }
    else
    {
        static if (GDC_with_SSE2)
        {
            // GCC's PSLLDQ builtin takes the shift amount in bits, hence * 8.
            return __builtin_ia32_pslldqi128(op, cast(ubyte)(bytes * 8)); 
        }
        else version(DigitalMars)
        {
            version(D_InlineAsm_X86)
            {
                asm pure nothrow @nogc @trusted // somehow doesn't work for x86_64
                {
                    movdqu XMM0, op;
                    pslldq XMM0, bytes;
                    movdqu op, XMM0;
                }
                return op;
            }
            else
            {
                // Scalar fallback: copy lanes upward, then zero-fill the bottom.
                byte16 A = cast(byte16)op;
                byte16 R;
                for (int n = 15; n >= bytes; --n)
                    R.ptr[n] = A.array[n-bytes];
                for (int n = bytes-1; n >= 0; --n)
                    R.ptr[n] = 0;
                return cast(__m128i)R;
            }
        }
        else
        {
            // Concatenate a zero vector and `op`, then select a 16-byte window
            // such that the low `bytes` lanes come from the zero vector.
            return cast(__m128i) shufflevector!(byte16,
            16 - bytes, 17 - bytes, 18 - bytes, 19 - bytes, 20 - bytes, 21 - bytes,
            22 - bytes, 23 - bytes, 24 - bytes, 25 - bytes, 26 - bytes, 27 - bytes,
            28 - bytes, 29 - bytes, 30 - bytes, 31 - bytes)
            (cast(byte16)_mm_setzero_si128(), cast(byte16)op);
        }
    }
}
unittest
{
    __m128i A = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
    short8 R = cast(short8) _mm_slli_si128!8(A); // shift 8 bytes to the left
    short[8] correct = [ 0, 0, 0, 0, 0, 1, 2, 3 ];
    assert(R.array == correct);
}
3137 
// _mm_sqrt_pd: square root of both double-precision lanes.
version(LDC)
{
    // Disappeared with LDC 1.11
    static if (__VERSION__ < 2081)
        alias _mm_sqrt_pd = __builtin_ia32_sqrtpd;
    else
    {
        /// Compute the square root of each double-precision lane of `vec`.
        __m128d _mm_sqrt_pd(__m128d vec) pure @safe
        {
            vec.array[0] = llvm_sqrt(vec.array[0]);
            vec.array[1] = llvm_sqrt(vec.array[1]);
            return vec;
        }
    }
}
else
{
    static if (GDC_with_SSE2)
    {
        alias _mm_sqrt_pd = __builtin_ia32_sqrtpd;
    }
    else
    {
        /// Compute the square root of each double-precision lane of `vec`.
        __m128d _mm_sqrt_pd(__m128d vec) pure @safe
        {
            vec.array[0] = sqrt(vec.array[0]);
            vec.array[1] = sqrt(vec.array[1]);
            return vec;
        }
    }
}
3169 
3170 
// _mm_sqrt_sd: square root of the lower lane; upper lane passes through.
version(LDC)
{
    // Disappeared with LDC 1.11
    static if (__VERSION__ < 2081)
        alias _mm_sqrt_sd = __builtin_ia32_sqrtsd;
    else
    {
        /// Compute the square root of the lower double-precision lane of `vec`;
        /// the upper lane is returned unchanged.
        __m128d _mm_sqrt_sd(__m128d vec) pure @safe
        {
            vec.array[0] = llvm_sqrt(vec.array[0]);
            vec.array[1] = vec.array[1]; // deliberate no-op: upper lane kept as-is
            return vec;
        }
    }
}
else
{
    static if (GDC_with_SSE2)
    {
        alias _mm_sqrt_sd = __builtin_ia32_sqrtsd;
    }
    else
    {
        /// Compute the square root of the lower double-precision lane of `vec`;
        /// the upper lane is returned unchanged.
        __m128d _mm_sqrt_sd(__m128d vec) pure @safe
        {
            vec.array[0] = sqrt(vec.array[0]);
            vec.array[1] = vec.array[1]; // deliberate no-op: upper lane kept as-is
            return vec;
        }
    }
}
3202 
3203 
/// Shift packed 16-bit integers in `a` right by `count` while shifting in sign bits.
version(LDC)
{
    alias _mm_sra_epi16 = __builtin_ia32_psraw128;
}
else
{
    static if (GDC_with_SSE2)
    {
        alias _mm_sra_epi16 = __builtin_ia32_psraw128;
    }
    else
    {
        /// Shift packed 16-bit integers in `a` right by `count` while shifting in sign bits.
        __m128i _mm_sra_epi16 (__m128i a, __m128i count) pure @safe
        {
            short8 sa = cast(short8)a;
            long2 lc = cast(long2)count;
            // PSRAW reads the full low 64-bit lane and saturates the count:
            // any value above 15 behaves like 15 (every bit becomes the sign
            // bit). The previous int truncation dropped high count bits and
            // counts >= 32 were undefined behaviour in D.
            ulong shift = cast(ulong)(lc.array[0]);
            if (shift > 15)
                shift = 15;
            int bits = cast(int)shift;
            short8 r = void;
            foreach(i; 0..8)
                r.array[i] = cast(short)(sa.array[i] >> bits);
            return cast(int4)r;
        }
    }
}
unittest
{
    __m128i A = _mm_setr_epi16(0, 1, 2, 3, -4, -5, 6, 7);
    short8 B = cast(short8)( _mm_sra_epi16(A, _mm_cvtsi32_si128(1)) );
    short[8] expectedB = [ 0, 0, 1, 1, -2, -3, 3, 3 ];
    assert(B.array == expectedB);

    // Counts above 15 saturate: each lane becomes its sign (0 or -1).
    short8 C = cast(short8)( _mm_sra_epi16(A, _mm_cvtsi32_si128(100)) );
    short[8] expectedC = [ 0, 0, 0, 0, -1, -1, 0, 0 ];
    assert(C.array == expectedC);
}
3235 
/// Shift packed 32-bit integers in `a` right by `count` while shifting in sign bits.
version(LDC)
{
    alias _mm_sra_epi32  = __builtin_ia32_psrad128;
}
else
{
    static if (GDC_with_SSE2)
    {
        alias _mm_sra_epi32  = __builtin_ia32_psrad128;
    }
    else
    {
        /// Shift packed 32-bit integers in `a` right by `count` while shifting in sign bits.
        __m128i _mm_sra_epi32 (__m128i a, __m128i count) pure @safe
        {
            int4 r = void;
            long2 lc = cast(long2)count;
            // PSRAD reads the full low 64-bit lane and saturates the count:
            // any value above 31 behaves like 31 (sign fill). The previous
            // int truncation dropped high bits and counts >= 32 were
            // undefined behaviour in D.
            ulong shift = cast(ulong)(lc.array[0]);
            if (shift > 31)
                shift = 31;
            int bits = cast(int)shift;
            foreach(i; 0..4)
                r.array[i] = (a.array[i] >> bits);
            return r;
        }
    }
}
unittest
{
    __m128i A = _mm_setr_epi32(0, 2, 3, -4);
    __m128i B = _mm_sra_epi32(A, _mm_cvtsi32_si128(1));
    int[4] expectedB = [ 0, 1, 1, -2];
    assert(B.array == expectedB);

    // Counts above 31 saturate: each lane becomes its sign (0 or -1).
    __m128i C = _mm_sra_epi32(A, _mm_cvtsi32_si128(100));
    int[4] expectedC = [ 0, 0, 0, -1];
    assert(C.array == expectedC);
}
3266 
3267 
/// Shift packed 16-bit integers in `a` right by `imm8` while shifting in sign bits.
version(LDC)
{
    alias _mm_srai_epi16 = __builtin_ia32_psrawi128;
}
else
{
    static if (GDC_with_SSE2)
    {
        alias _mm_srai_epi16 = __builtin_ia32_psrawi128;
    }
    else
    {
        /// Shift packed 16-bit integers in `a` right by `imm8` while shifting in sign bits.
        __m128i _mm_srai_epi16 (__m128i a, int imm8) pure @safe
        {
            short8 sa = cast(short8)a;
            // PSRAW saturates the count: above 15 every lane fills with its
            // sign bit, which is what a shift by 15 produces. A plain D
            // shift by >= 32 would otherwise be undefined behaviour.
            if (cast(uint)imm8 > 15)
                imm8 = 15;
            short8 r = void;
            foreach(i; 0..8)
                r.array[i] = cast(short)(sa.array[i] >> imm8);
            return cast(int4)r;
        }
    }
}
unittest
{
    __m128i A = _mm_setr_epi16(0, 1, 2, 3, -4, -5, 6, 7);
    short8 B = cast(short8)( _mm_srai_epi16(A, 1) );
    short[8] expectedB = [ 0, 0, 1, 1, -2, -3, 3, 3 ];
    assert(B.array == expectedB);

    // Counts above 15 saturate: each lane becomes its sign (0 or -1).
    short8 C = cast(short8)( _mm_srai_epi16(A, 16) );
    short[8] expectedC = [ 0, 0, 0, 0, -1, -1, 0, 0 ];
    assert(C.array == expectedC);
}
3297 
/// Shift packed 32-bit integers in `a` right by `imm8` while shifting in sign bits.
version(LDC)
{
    alias _mm_srai_epi32  = __builtin_ia32_psradi128;
}
else
{
    static if (GDC_with_SSE2)
    {
        alias _mm_srai_epi32  = __builtin_ia32_psradi128;
    }
    else
    {
        /// Shift packed 32-bit integers in `a` right by `imm8` while shifting in sign bits.
        __m128i _mm_srai_epi32 (__m128i a, int imm8) pure @safe
        {
            // PSRAD saturates the count: above 31 every lane fills with its
            // sign bit, which is what a shift by 31 produces. A plain D
            // shift by >= 32 would otherwise be undefined behaviour.
            if (cast(uint)imm8 > 31)
                imm8 = 31;
            int4 r = void;
            foreach(i; 0..4)
                r.array[i] = (a.array[i] >> imm8);
            return r;
        }
    }
}
unittest
{
    __m128i A = _mm_setr_epi32(0, 2, 3, -4);
    __m128i B = _mm_srai_epi32(A, 1);
    int[4] expectedB = [ 0, 1, 1, -2];
    assert(B.array == expectedB);

    // Counts above 31 saturate: each lane becomes its sign (0 or -1).
    __m128i C = _mm_srai_epi32(A, 32);
    int[4] expectedC = [ 0, 0, 0, -1];
    assert(C.array == expectedC);
}
3326 
/// Shift packed 16-bit integers in `a` right by `count` while shifting in zeros.
version(LDC)
{
    alias _mm_srl_epi16 = __builtin_ia32_psrlw128;
}
else
{
    static if (GDC_with_SSE2)
    {
        alias _mm_srl_epi16 = __builtin_ia32_psrlw128;
    }
    else
    {
        /// Shift packed 16-bit integers in `a` right by `count` while shifting in zeros.
        __m128i _mm_srl_epi16 (__m128i a, __m128i count) pure @safe
        {
            short8 sa = cast(short8)a;
            long2 lc = cast(long2)count;
            // PSRLW reads the whole low 64-bit lane of `count`; the previous
            // truncation to int dropped high bits, and a D shift by >= 32
            // would have been undefined behaviour.
            ulong shift = cast(ulong)(lc.array[0]);
            short8 r = void;
            if (shift > 15)
            {
                // The instruction clears every lane once the count exceeds 15.
                foreach(i; 0..8)
                    r.array[i] = 0;
            }
            else
            {
                int bits = cast(int)shift;
                foreach(i; 0..8)
                    r.array[i] = cast(short)(cast(ushort)(sa.array[i]) >> bits);
            }
            return cast(int4)r;
        }
    }
}
unittest
{
    __m128i A = _mm_setr_epi16(0, 1, 2, 3, -4, -5, 6, 7);
    short8 B = cast(short8)( _mm_srl_epi16(A, _mm_cvtsi32_si128(1)) );
    short[8] expectedB = [ 0, 0, 1, 1, 0x7FFE, 0x7FFD, 3, 3 ];
    assert(B.array == expectedB);

    // Counts above 15 clear every lane.
    short8 C = cast(short8)( _mm_srl_epi16(A, _mm_cvtsi32_si128(16)) );
    short[8] zero8 = [0, 0, 0, 0, 0, 0, 0, 0];
    assert(C.array == zero8);
}
3358 
/// Shift packed 32-bit integers in `a` right by `count` while shifting in zeros.
version(LDC)
{
    alias _mm_srl_epi32  = __builtin_ia32_psrld128;
}
else
{
    static if (GDC_with_SSE2)
    {
        alias _mm_srl_epi32  = __builtin_ia32_psrld128;
    }
    else
    {
        /// Shift packed 32-bit integers in `a` right by `count` while shifting in zeros.
        __m128i _mm_srl_epi32 (__m128i a, __m128i count) pure @safe
        {
            int4 r = void;
            long2 lc = cast(long2)count;
            // PSRLD reads the whole low 64-bit lane of `count`; the previous
            // truncation to int dropped high bits, and a D shift by >= 32
            // would have been undefined behaviour.
            ulong shift = cast(ulong)(lc.array[0]);
            if (shift > 31)
            {
                // The instruction clears every lane once the count exceeds 31.
                foreach(i; 0..4)
                    r.array[i] = 0;
            }
            else
            {
                int bits = cast(int)shift;
                foreach(i; 0..4)
                    r.array[i] = cast(uint)(a.array[i]) >> bits;
            }
            return r;
        }
    }
}
unittest
{
    __m128i A = _mm_setr_epi32(0, 2, 3, -4);
    __m128i B = _mm_srl_epi32(A, _mm_cvtsi32_si128(1));
    int[4] expectedB = [ 0, 1, 1, 0x7FFFFFFE];
    assert(B.array == expectedB);

    // Counts above 31 clear every lane.
    __m128i C = _mm_srl_epi32(A, _mm_cvtsi32_si128(32));
    int[4] zero4 = [0, 0, 0, 0];
    assert(C.array == zero4);
}
3389 
/// Shift packed 64-bit integers in `a` right by `count` while shifting in zeros.
version(LDC)
{
    alias _mm_srl_epi64  = __builtin_ia32_psrlq128;
}
else
{
    static if (GDC_with_SSE2)
    {
        alias _mm_srl_epi64  = __builtin_ia32_psrlq128;
    }
    else
    {
        /// Shift packed 64-bit integers in `a` right by `count` while shifting in zeros.
        __m128i _mm_srl_epi64 (__m128i a, __m128i count) pure @safe
        {
            long2 r = void;
            long2 sa = cast(long2)a;
            long2 lc = cast(long2)count;
            // PSRLQ reads the whole low 64-bit lane of `count`; the previous
            // truncation to int dropped high bits, and a D shift by >= 64
            // would have been undefined behaviour.
            ulong shift = cast(ulong)(lc.array[0]);
            if (shift > 63)
            {
                // The instruction clears every lane once the count exceeds 63.
                foreach(i; 0..2)
                    r.array[i] = 0;
            }
            else
            {
                int bits = cast(int)shift;
                foreach(i; 0..2)
                    r.array[i] = cast(ulong)(sa.array[i]) >> bits;
            }
            return cast(__m128i)r;
        }
    }
}
unittest
{
    __m128i A = _mm_setr_epi64(8, -4);
    long2 B = cast(long2) _mm_srl_epi64(A, _mm_cvtsi32_si128(1));
    long[2] expectedB = [ 4, 0x7FFFFFFFFFFFFFFE];
    assert(B.array == expectedB);

    // Counts above 63 clear every lane.
    long2 C = cast(long2) _mm_srl_epi64(A, _mm_cvtsi32_si128(64));
    long[2] zero2 = [0, 0];
    assert(C.array == zero2);
}
3421 
/// Shift packed 16-bit integers in `a` right by `imm8` while shifting in zeros.
version(LDC)
{
    alias _mm_srli_epi16 = __builtin_ia32_psrlwi128;
}
else
{
    static if (GDC_with_SSE2)
    {
        alias _mm_srli_epi16 = __builtin_ia32_psrlwi128;
    }
    else
    {
        /// Shift packed 16-bit integers in `a` right by `imm8` while shifting in zeros.
        __m128i _mm_srli_epi16 (__m128i a, int imm8) pure @safe
        {
            short8 sa = cast(short8)a;
            short8 r = void;
            if (cast(uint)imm8 > 15)
            {
                // PSRLW clears every lane for counts above 15; a plain D
                // shift by >= 32 would be undefined behaviour instead.
                foreach(i; 0..8)
                    r.array[i] = 0;
            }
            else
            {
                foreach(i; 0..8)
                    r.array[i] = cast(short)(cast(ushort)(sa.array[i]) >> imm8);
            }
            return cast(int4)r;
        }
    }
}
unittest
{
    __m128i A = _mm_setr_epi16(0, 1, 2, 3, -4, -5, 6, 7);
    short8 B = cast(short8)( _mm_srli_epi16(A, 1) );
    short[8] expectedB = [ 0, 0, 1, 1, 0x7FFE, 0x7FFD, 3, 3 ];
    assert(B.array == expectedB);

    // Counts above 15 clear every lane.
    short8 C = cast(short8)( _mm_srli_epi16(A, 16) );
    short[8] zero8 = [0, 0, 0, 0, 0, 0, 0, 0];
    assert(C.array == zero8);
}
3451 
/// Shift packed 32-bit integers in `a` right by `imm8` while shifting in zeros.
version(LDC)
{
    alias _mm_srli_epi32  = __builtin_ia32_psrldi128;
}
else
{
    static if (GDC_with_SSE2)
    {
        alias _mm_srli_epi32  = __builtin_ia32_psrldi128;
    }
    else
    {
        /// Shift packed 32-bit integers in `a` right by `imm8` while shifting in zeros.
        __m128i _mm_srli_epi32 (__m128i a, int imm8) pure @safe
        {
            int4 r = void;
            if (cast(uint)imm8 > 31)
            {
                // PSRLD clears every lane for counts above 31; a plain D
                // shift by >= 32 would be undefined behaviour instead.
                foreach(i; 0..4)
                    r.array[i] = 0;
            }
            else
            {
                foreach(i; 0..4)
                    r.array[i] = cast(uint)(a.array[i]) >> imm8;
            }
            return r;
        }
    }
}
unittest
{
    __m128i A = _mm_setr_epi32(0, 2, 3, -4);
    __m128i B = _mm_srli_epi32(A, 1);
    int[4] expectedB = [ 0, 1, 1, 0x7FFFFFFE];
    assert(B.array == expectedB);

    // Counts above 31 clear every lane.
    __m128i C = _mm_srli_epi32(A, 32);
    int[4] zero4 = [0, 0, 0, 0];
    assert(C.array == zero4);
}
3480 
/// Shift packed 64-bit integers in `a` right by `imm8` while shifting in zeros.
version(LDC)
{
    alias _mm_srli_epi64  = __builtin_ia32_psrlqi128;
}
else
{
    static if (GDC_with_SSE2)
    {
        alias _mm_srli_epi64  = __builtin_ia32_psrlqi128;
    }
    else
    {
        /// Shift packed 64-bit integers in `a` right by `imm8` while shifting in zeros.
        __m128i _mm_srli_epi64 (__m128i a, int imm8) pure @safe
        {
            long2 r = void;
            long2 sa = cast(long2)a;
            if (cast(uint)imm8 > 63)
            {
                // PSRLQ clears every lane for counts above 63; a plain D
                // shift by >= 64 would be undefined behaviour instead.
                foreach(i; 0..2)
                    r.array[i] = 0;
            }
            else
            {
                foreach(i; 0..2)
                    r.array[i] = cast(ulong)(sa.array[i]) >> imm8;
            }
            return cast(__m128i)r;
        }
    }
}
unittest
{
    __m128i A = _mm_setr_epi64(8, -4);
    long2 B = cast(long2) _mm_srli_epi64(A, 1);
    long[2] expectedB = [ 4, 0x7FFFFFFFFFFFFFFE];
    assert(B.array == expectedB);

    // Counts above 63 clear every lane.
    long2 C = cast(long2) _mm_srli_epi64(A, 64);
    long[2] zero2 = [0, 0];
    assert(C.array == zero2);
}
3510 
/// Shift `v` right by `bytes` bytes while shifting in zeros.
__m128i _mm_srli_si128(ubyte bytes)(__m128i v) pure @safe
{
    // Shifting by 16 bytes or more empties the whole register.
    static if (bytes & 0xF0)
    {
        return _mm_setzero_si128();
    }
    else
    {
        static if (GDC_with_SSE2)
        {
            // GCC's PSRLDQ builtin takes the shift amount in bits, hence * 8.
            return cast(__m128i) __builtin_ia32_psrldqi128(v, cast(ubyte)(bytes * 8));
        }
        else static if (DMD_with_32bit_asm)
        {
            asm pure nothrow @nogc @trusted
            {
                movdqu XMM0, v;
                psrldq XMM0, bytes;
                movdqu v, XMM0;
            }
            return v;
        }
        else
        {
            // Concatenate `v` and a zero vector, then select the 16-byte
            // window starting `bytes` lanes in, so the top lanes are zero.
            return cast(__m128i) shufflevector!(byte16,
                                                bytes+0, bytes+1, bytes+2, bytes+3, bytes+4, bytes+5, bytes+6, bytes+7,
                                                bytes+8, bytes+9, bytes+10, bytes+11, bytes+12, bytes+13, bytes+14, bytes+15)
                                               (cast(byte16) v, cast(byte16)_mm_setzero_si128());
        }
    }

}

unittest
{
    __m128i R = _mm_srli_si128!4(_mm_set_epi32(4, 3, 2, 1));
    int[4] correct = [2, 3, 4, 0];
    assert(R.array == correct);
}
3551 
/// Shift `v` right by `bytes` bytes while shifting in zeros.
/// #BONUS
__m128 _mm_srli_ps(ubyte bytes)(__m128 v) pure @safe
{
    // Reinterpret as integer lanes, byte-shift, reinterpret back to floats.
    return cast(__m128)_mm_srli_si128!bytes(cast(__m128i)v);
}
unittest
{
    __m128 R = _mm_srli_ps!8(_mm_set_ps(4.0f, 3.0f, 2.0f, 1.0f));
    float[4] correct = [3.0f, 4.0f, 0, 0];
    assert(R.array == correct);
}
3564 
/// Shift `v` right by `bytes` bytes while shifting in zeros.
/// #BONUS
__m128d _mm_srli_pd(ubyte bytes)(__m128d v) pure @safe
{
    // Reinterpret as integer lanes, byte-shift, reinterpret back to doubles.
    return cast(__m128d) _mm_srli_si128!bytes(cast(__m128i)v);
}
3571 
/// Store both double-precision lanes of `a` to memory.
/// `mem_addr` is assumed 16-byte aligned — TODO confirm callers guarantee it.
void _mm_store_pd (double* mem_addr, __m128d a) pure @trusted
{
    __m128d* aligned = cast(__m128d*)mem_addr;
    *aligned = a;
}

/// Store the lower double-precision lane of `a` into both lanes at `mem_addr`.
void _mm_store_pd1 (double* mem_addr, __m128d a) pure @trusted
{
    __m128d* aligned = cast(__m128d*)mem_addr;
    __m128d r;
    r.ptr[0] = a.array[0];
    r.ptr[1] = a.array[0];
    *aligned = r;
}

/// Store the lower double-precision lane of `a` to `mem_addr`.
void _mm_store_sd (double* mem_addr, __m128d a) pure @safe
{
    *mem_addr = a.array[0];
}

/// Store 128 bits of integer data from `a` to `mem_addr`.
void _mm_store_si128 (__m128i* mem_addr, __m128i a) pure @safe
{
    *mem_addr = a;
}

/// Same operation as `_mm_store_pd1`.
alias _mm_store1_pd = _mm_store_pd1;

/// Store the upper double-precision lane of `a` to `mem_addr`.
void _mm_storeh_pd (double* mem_addr, __m128d a) pure @safe
{
    *mem_addr = a.array[1];
}
3603 
// Note: `mem_addr` doesn't have to actually be aligned, which breaks
// expectations from the user point of view. This problem also exist in C++.
/// Store the low 64 bits of integer vector `a` to `mem_addr`.
void _mm_storel_epi64 (__m128i* mem_addr, __m128i a) pure @safe
{
    long* dest = cast(long*)mem_addr;
    long2 la = cast(long2)a;
    *dest = la.array[0];
}
unittest
{
    long[3] A = [1, 2, 3];
    _mm_storel_epi64(cast(__m128i*)(&A[1]), _mm_set_epi64x(0x1_0000_0000, 0x1_0000_0000));
    long[3] correct = [1, 0x1_0000_0000, 3];
    assert(A == correct);
}

/// Store the lower double-precision lane of `a` to `mem_addr`.
void _mm_storel_pd (double* mem_addr, __m128d a) pure @safe
{
    *mem_addr = a.array[0];
}
3624 
/// Store the two double-precision lanes of `a` to `mem_addr` in reverse order.
/// `mem_addr` is assumed 16-byte aligned — TODO confirm callers guarantee it.
void _mm_storer_pd (double* mem_addr, __m128d a) pure
{
    __m128d* aligned = cast(__m128d*)mem_addr;
    *aligned = shufflevector!(double2, 1, 0)(a, a); // lanes swapped
}

/// Store both double-precision lanes of `a` to unaligned memory.
void _mm_storeu_pd (double* mem_addr, __m128d a) pure @safe
{
    storeUnaligned!double2(a, mem_addr);
}

/// Store 128 bits of integer data from `a` to unaligned memory.
void _mm_storeu_si128 (__m128i* mem_addr, __m128i a) pure @safe
{
    storeUnaligned!__m128i(a, cast(int*)mem_addr);
}
3640 
/// Store 128-bits (composed of 2 packed double-precision (64-bit) floating-point elements)
/// from a into memory using a non-temporal memory hint. mem_addr must be aligned on a 16-byte
/// boundary or a general-protection exception may be generated.
void _mm_stream_pd (double* mem_addr, __m128d a)
{
    // BUG see `_mm_stream_ps` for an explanation why we don't implement non-temporal moves
    // This is a plain aligned store; the non-temporal hint is not honoured.
    __m128d* dest = cast(__m128d*)mem_addr;
    *dest = a;
}

/// Store 128-bits of integer data from a into memory using a non-temporal memory hint.
/// mem_addr must be aligned on a 16-byte boundary or a general-protection exception
/// may be generated.
void _mm_stream_si128 (__m128i* mem_addr, __m128i a)
{
    // BUG see `_mm_stream_ps` for an explanation why we don't implement non-temporal moves
    // This is a plain aligned store; the non-temporal hint is not honoured.
    __m128i* dest = cast(__m128i*)mem_addr;
    *dest = a;
}

/// Store 32-bit integer a into memory using a non-temporal hint to minimize cache
/// pollution. If the cache line containing address mem_addr is already in the cache,
/// the cache will be updated.
void _mm_stream_si32 (int* mem_addr, int a)
{
    // BUG see `_mm_stream_ps` for an explanation why we don't implement non-temporal moves
    *mem_addr = a;
}

/// Store 64-bit integer a into memory using a non-temporal hint to minimize
/// cache pollution. If the cache line containing address mem_addr is already
/// in the cache, the cache will be updated.
void _mm_stream_si64 (long* mem_addr, long a)
{
    // BUG See `_mm_stream_ps` for an explanation why we don't implement non-temporal moves
    *mem_addr = a;
}
3678 
/// Subtract packed 16-bit integers in `b` from packed 16-bit integers in `a`.
__m128i _mm_sub_epi16(__m128i a, __m128i b) pure @safe
{
    return cast(__m128i)(cast(short8)a - cast(short8)b);
}

/// Subtract packed 32-bit integers in `b` from packed 32-bit integers in `a`.
__m128i _mm_sub_epi32(__m128i a, __m128i b) pure @safe
{
    return cast(__m128i)(cast(int4)a - cast(int4)b);
}

/// Subtract packed 64-bit integers in `b` from packed 64-bit integers in `a`.
__m128i _mm_sub_epi64(__m128i a, __m128i b) pure @safe
{
    return cast(__m128i)(cast(long2)a - cast(long2)b);
}

/// Subtract packed 8-bit integers in `b` from packed 8-bit integers in `a`.
__m128i _mm_sub_epi8(__m128i a, __m128i b) pure @safe
{
    return cast(__m128i)(cast(byte16)a - cast(byte16)b);
}

/// Subtract packed double-precision elements in `b` from those in `a`.
__m128d _mm_sub_pd(__m128d a, __m128d b) pure @safe
{
    return a - b;
}
3703 
version(DigitalMars)
{
    // Work-around for https://issues.dlang.org/show_bug.cgi?id=19599
    // NOTE(review): the empty `nop` asm block presumably inhibits the buggy
    // DMD optimisation described in the issue — keep it.
    /// Subtract the lower double-precision lane of `b` from `a`;
    /// the upper lane of `a` passes through unchanged.
    __m128d _mm_sub_sd(__m128d a, __m128d b) pure @safe
    {
        asm pure nothrow @nogc @trusted { nop;}
        a[0] = a[0] - b[0];
        return a;
    }
}
else static if (GDC_with_SSE2)
{
    alias _mm_sub_sd = __builtin_ia32_subsd;
}
else
{
    /// Subtract the lower double-precision lane of `b` from `a`;
    /// the upper lane of `a` passes through unchanged.
    __m128d _mm_sub_sd(__m128d a, __m128d b) pure @safe
    {
        a.array[0] -= b.array[0];
        return a;
    }
}
unittest
{
    __m128d a = [1.5, -2.0];
    a = _mm_sub_sd(a, a);
    assert(a.array == [0.0, -2.0]);
}
3732 
/// Subtract 64-bit integer `b` from 64-bit integer `a`.
__m64 _mm_sub_si64 (__m64 a, __m64 b) pure @safe
{
    return a - b;
}
3737 
version(LDC)
{
    static if (__VERSION__ >= 2085) // saturation x86 intrinsics disappeared in LLVM 8
    {
        // Generates PSUBSW since LDC 1.15 -O0
        /// Subtract packed 16-bit signed integers in `b` from `a` using signed saturation.
        __m128i _mm_subs_epi16(__m128i a, __m128i b) pure @trusted
        {
            enum prefix = `declare <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> %a, <8 x i16> %b)`;
            enum ir = `
                %r = call <8 x i16> @llvm.ssub.sat.v8i16( <8 x i16> %0, <8 x i16> %1)
                ret <8 x i16> %r`;
            return cast(__m128i) LDCInlineIREx!(prefix, ir, "", short8, short8, short8)(cast(short8)a, cast(short8)b);
        }
    }
    else
        alias _mm_subs_epi16 = __builtin_ia32_psubsw128;
}
else
{
    static if (GDC_with_SSE2)
    {
        alias _mm_subs_epi16 = __builtin_ia32_psubsw128;
    }
    else
    {
        /// Subtract packed 16-bit signed integers in `b` from `a` using signed saturation.
        __m128i _mm_subs_epi16(__m128i a, __m128i b) pure @trusted
        {
            short[8] res;
            short8 sa = cast(short8)a;
            short8 sb = cast(short8)b;
            foreach(i; 0..8)
                res[i] = saturateSignedIntToSignedShort(sa.array[i] - sb.array[i]);
            return _mm_loadu_si128(cast(int4*)res.ptr);
        }
    }
}
unittest
{
    short8 res = cast(short8) _mm_subs_epi16(_mm_setr_epi16(32760, -32760, 5, 4, 3, 2, 1, 0),
                                             _mm_setr_epi16(-10  ,     16, 5, 4, 3, 2, 1, 0));
    static immutable short[8] correctResult =              [32767, -32768, 0, 0, 0, 0, 0, 0];
    assert(res.array == correctResult);
}
3783 
version(LDC)
{
    static if (__VERSION__ >= 2085) // saturation x86 intrinsics disappeared in LLVM 8
    {
        // Generates PSUBSB since LDC 1.15 -O0
        /// Subtract packed 8-bit signed integers in `b` from `a` using signed saturation.
        __m128i _mm_subs_epi8(__m128i a, __m128i b) pure @trusted
        {
            enum prefix = `declare <16 x i8> @llvm.ssub.sat.v16i8(<16 x i8> %a, <16 x i8> %b)`;
            enum ir = `
                %r = call <16 x i8> @llvm.ssub.sat.v16i8( <16 x i8> %0, <16 x i8> %1)
                ret <16 x i8> %r`;
            return cast(__m128i) LDCInlineIREx!(prefix, ir, "", byte16, byte16, byte16)(cast(byte16)a, cast(byte16)b);
        }
    }
    else
        alias _mm_subs_epi8 = __builtin_ia32_psubsb128;
}
else
{
    static if (GDC_with_SSE2)
    {
        alias _mm_subs_epi8 = __builtin_ia32_psubsb128;
    }
    else
    {
        /// Subtract packed 8-bit signed integers in `b` from `a` using signed saturation.
        __m128i _mm_subs_epi8(__m128i a, __m128i b) pure @trusted
        {
            byte[16] res;
            byte16 sa = cast(byte16)a;
            byte16 sb = cast(byte16)b;
            foreach(i; 0..16)
                res[i] = saturateSignedWordToSignedByte(sa.array[i] - sb.array[i]);
            return _mm_loadu_si128(cast(int4*)res.ptr);
        }
    }
}
unittest
{
    byte16 res = cast(byte16) _mm_subs_epi8(_mm_setr_epi8(-128, 127, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0),
                                            _mm_setr_epi8(  15, -14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0));
    static immutable byte[16] correctResult            = [-128, 127,  0,  0,  0,  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0];
    assert(res.array == correctResult);
}
3829 
version(LDC)
{
    static if (__VERSION__ >= 2085) // saturation x86 intrinsics disappeared in LLVM 8
    {
        // Generates PSUBUSW since LDC 1.15 -O0
        /// Subtract packed 16-bit unsigned integers in `b` from `a` using unsigned saturation.
        __m128i _mm_subs_epu16(__m128i a, __m128i b) pure @trusted
        {
            enum prefix = `declare <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> %a, <8 x i16> %b)`;
            enum ir = `
                %r = call <8 x i16> @llvm.usub.sat.v8i16( <8 x i16> %0, <8 x i16> %1)
                ret <8 x i16> %r`;
            return cast(__m128i) LDCInlineIREx!(prefix, ir, "", short8, short8, short8)(cast(short8)a, cast(short8)b);
        }
    }
    else
        alias _mm_subs_epu16 = __builtin_ia32_psubusw128;
}
else
{
    static if (GDC_with_SSE2)
    {
        alias _mm_subs_epu16 = __builtin_ia32_psubusw128;
    }
    else
    {
        /// Subtract packed 16-bit unsigned integers in `b` from `a` using unsigned saturation.
        __m128i _mm_subs_epu16(__m128i a, __m128i b) pure @trusted
        {
            short[8] res;
            short8 sa = cast(short8)a;
            short8 sb = cast(short8)b;
            foreach(i; 0..8)
            {
                // Widen to int so the difference cannot wrap before clamping.
                int sum = cast(ushort)(sa.array[i]) - cast(ushort)(sb.array[i]);
                res[i] = saturateSignedIntToUnsignedShort(sum);
            }
            return _mm_loadu_si128(cast(int4*)res.ptr);
        }
    }
}
unittest
{
    short8 R = cast(short8) _mm_subs_epu16(_mm_setr_epi16(cast(short)65534,  1, 5, 4, 3, 2, 1, 0),
                                           _mm_setr_epi16(cast(short)65535, 16, 4, 4, 3, 0, 1, 0));
    static immutable short[8] correct =                  [               0,  0, 1, 0, 0, 2, 0, 0];
    assert(R.array == correct);
}
3878 
version(LDC)
{
    static if (__VERSION__ >= 2085) // saturation x86 intrinsics disappeared in LLVM 8
    {
        // Generates PSUBUSB since LDC 1.15 -O0
        /// Subtract packed 8-bit unsigned integers in `b` from `a` using unsigned saturation.
        __m128i _mm_subs_epu8(__m128i a, __m128i b) pure @trusted
        {
            enum prefix = `declare <16 x i8> @llvm.usub.sat.v16i8(<16 x i8> %a, <16 x i8> %b)`;
            enum ir = `
                %r = call <16 x i8> @llvm.usub.sat.v16i8( <16 x i8> %0, <16 x i8> %1)
                ret <16 x i8> %r`;
            return cast(__m128i) LDCInlineIREx!(prefix, ir, "", byte16, byte16, byte16)(cast(byte16)a, cast(byte16)b);
        }
    }
    else
        alias _mm_subs_epu8 = __builtin_ia32_psubusb128;
}
else
{
    static if (GDC_with_SSE2)
    {
        alias _mm_subs_epu8 = __builtin_ia32_psubusb128;
    }
    else
    {
        /// Subtract packed 8-bit unsigned integers in `b` from `a` using unsigned saturation.
        __m128i _mm_subs_epu8(__m128i a, __m128i b) pure @trusted
        {
            ubyte[16] res;
            byte16 sa = cast(byte16)a;
            byte16 sb = cast(byte16)b;
            foreach(i; 0..16)
                res[i] = saturateSignedWordToUnsignedByte(cast(ubyte)(sa.array[i]) - cast(ubyte)(sb.array[i]));
            return _mm_loadu_si128(cast(int4*)res.ptr);
        }
    }
}
unittest
{
    byte16 res = cast(byte16) _mm_subs_epu8(_mm_setr_epi8(cast(byte)254, 127, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0),
                                            _mm_setr_epi8(cast(byte)255, 120, 14, 42, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0));
    static immutable byte[16] correctResult =            [            0,   7,  0,  0,  0,  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0];
    assert(res.array == correctResult);
}
3924 
// Note: the only difference between the `_mm_ucomi*_sd` intrinsics and their
//       `_mm_comi*_sd` counterparts is the signalling behaviour on quiet NaNs,
//       so the aliasing below is slightly incorrect. The case where you would
//       want to differentiate between qNaN and sNaN and then treat them
//       differently on purpose seems extremely rare.
alias _mm_ucomieq_sd = _mm_comieq_sd;
alias _mm_ucomige_sd = _mm_comige_sd;
alias _mm_ucomigt_sd = _mm_comigt_sd;
alias _mm_ucomile_sd = _mm_comile_sd;
alias _mm_ucomilt_sd = _mm_comilt_sd;
alias _mm_ucomineq_sd = _mm_comineq_sd;
3935 
/// Return a `__m128d` with indeterminate contents (no initialization cost).
__m128d _mm_undefined_pd() pure @safe
{
    __m128d result = void;
    return result;
}
/// Return a `__m128i` with indeterminate contents (no initialization cost).
__m128i _mm_undefined_si128() pure @safe
{
    __m128i result = void;
    return result;
}
3946 
/// Unpack and interleave 16-bit integers from the high halves of `a` and `b`.
__m128i _mm_unpackhi_epi16 (__m128i a, __m128i b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_punpckhwd128(a, b);
    }
    else static if (DMD_with_32bit_asm)
    {
        asm pure nothrow @nogc @trusted
        {
            movdqu XMM0, a;
            movdqu XMM1, b;
            punpckhwd XMM0, XMM1;
            movdqu a, XMM0;
        }
        return a;
    }
    else
    {
        // Lanes 4..7 of each input, interleaved a, b, a, b, ...
        return cast(__m128i) shufflevector!(short8, 4, 12, 5, 13, 6, 14, 7, 15)
                                           (cast(short8)a, cast(short8)b);
    }
}
unittest
{
    __m128i A = _mm_setr_epi16(4,   5,  6,  7,  8,  9, 10, 11);
    __m128i B = _mm_setr_epi16(12, 13, 14, 15, 16, 17, 18, 19);
    short8 C = cast(short8)(_mm_unpackhi_epi16(A, B));
    short[8] correct = [8, 16, 9, 17, 10, 18, 11, 19];
    assert(C.array == correct);
}
3978 
/// Unpack and interleave 32-bit integers from the high halves of `a` and `b`.
__m128i _mm_unpackhi_epi32 (__m128i a, __m128i b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_punpckhdq128(a, b);
    }
    else
    {
        // Lanes 2..3 of each input, interleaved a, b, a, b.
        return shufflevector!(int4, 2, 6, 3, 7)(cast(int4)a, cast(int4)b);
    }
}
3990 
/// Unpack and interleave 64-bit integers from the high halves of `a` and `b`.
__m128i _mm_unpackhi_epi64 (__m128i a, __m128i b) pure @trusted
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_punpckhqdq128(a, b);
    }
    else
    {
        // Start from b (keeps b's high 64 bits in int lanes 2-3), then
        // overwrite lanes 0-1 with a's high 64 bits.
        __m128i r = cast(__m128i)b;
        r[0] = a[2];
        r[1] = a[3];
        return r; 
    }
}
unittest // Issue #36
{
    __m128i A = _mm_setr_epi64(0x22222222_22222222, 0x33333333_33333333);
    __m128i B = _mm_setr_epi64(0x44444444_44444444, 0x55555555_55555555);
    long2 C = cast(long2)(_mm_unpackhi_epi64(A, B));
    long[2] correct = [0x33333333_33333333, 0x55555555_55555555];
    assert(C.array == correct);
}
4013 
/// Unpack and interleave 8-bit integers from the high halves of `a` and `b`.
__m128i _mm_unpackhi_epi8 (__m128i a, __m128i b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_punpckhbw128(a, b);
    }
    else static if (DMD_with_32bit_asm)
    {
        asm pure nothrow @nogc @trusted
        {
            movdqu XMM0, a;
            movdqu XMM1, b;
            punpckhbw XMM0, XMM1;
            movdqu a, XMM0;
        }
        return a;
    }
    else
    {
        // Lanes 8..15 of each input, interleaved a, b, a, b, ...
        return cast(__m128i)shufflevector!(byte16, 8,  24,  9, 25, 10, 26, 11, 27,
                                                   12, 28, 13, 29, 14, 30, 15, 31)
                                                   (cast(byte16)a, cast(byte16)b);
    }
}
4038 
/// Unpack and interleave double-precision elements from the high half of `a` and `b`.
__m128d _mm_unpackhi_pd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        // GDC with SSE2: map directly to the UNPCKHPD builtin.
        return __builtin_ia32_unpckhpd(a, b);
    }
    else
    {
        // Generic path: result = [a1, b1] (index 3 selects element 1 of `b`).
        return shufflevector!(__m128d, 1, 3)(a, b);
    }
}
4050 
/// Unpack and interleave 16-bit integers from the low half of `a` and `b`.
__m128i _mm_unpacklo_epi16 (__m128i a, __m128i b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        // GDC with SSE2: map directly to the PUNPCKLWD builtin.
        return __builtin_ia32_punpcklwd128(a, b);
    }
    else static if (DMD_with_32bit_asm)
    {
        // 32-bit DMD: use inline asm; result is written back through `a`.
        asm pure nothrow @nogc @trusted
        {
            movdqu XMM0, a;
            movdqu XMM1, b;
            punpcklwd XMM0, XMM1;
            movdqu a, XMM0;
        }
        return a;
    }
    else
    {
        // Generic path: result = [a0,b0, a1,b1, a2,b2, a3,b3]
        // (indices 8..15 of the shuffle select from `b`).
        return cast(__m128i) shufflevector!(short8, 0, 8, 1, 9, 2, 10, 3, 11)
                                           (cast(short8)a, cast(short8)b);
    }
}
4074 
/// Unpack and interleave 32-bit integers from the low half of `a` and `b`.
__m128i _mm_unpacklo_epi32 (__m128i a, __m128i b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        // GDC with SSE2: map directly to the PUNPCKLDQ builtin.
        return __builtin_ia32_punpckldq128(a, b);
    }
    else
    {
        // Generic path: result = [a0, b0, a1, b1]
        // (indices 4..7 of the shuffle select from `b`).
        return shufflevector!(int4, 0, 4, 1, 5)
                             (cast(int4)a, cast(int4)b);
    }
}
4087 
/// Unpack and interleave 64-bit integers from the low half of `a` and `b`.
__m128i _mm_unpacklo_epi64 (__m128i a, __m128i b) pure @trusted
{
    static if (GDC_with_SSE2)
    {
        // GDC with SSE2: map directly to the PUNPCKLQDQ builtin.
        return __builtin_ia32_punpcklqdq128(a, b);
    }
    else
    {
        // Generic path: result = [low64(a), low64(b)].
        // .ptr/.array element access is why this function is @trusted.
        long2 lA = cast(long2)a;
        long2 lB = cast(long2)b;
        long2 R;
        R.ptr[0] = lA.array[0];
        R.ptr[1] = lB.array[0];
        return cast(__m128i)R;
    }
}
unittest // Issue #36
{
    // The low 64-bit lane of each operand must land in lanes 0 and 1.
    __m128i x = _mm_setr_epi64(0x22222222_22222222, 0x33333333_33333333);
    __m128i y = _mm_setr_epi64(0x44444444_44444444, 0x55555555_55555555);
    long2 r = cast(long2) _mm_unpacklo_epi64(x, y);
    long[2] expected = [0x22222222_22222222, 0x44444444_44444444];
    assert(r.array == expected);
}
4112 
4113 
/// Unpack and interleave 8-bit integers from the low half of `a` and `b`.
__m128i _mm_unpacklo_epi8 (__m128i a, __m128i b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        // GDC with SSE2: map directly to the PUNPCKLBW builtin.
        return __builtin_ia32_punpcklbw128(a, b);
    }
    else static if (DMD_with_32bit_asm)
    {
        // 32-bit DMD: use inline asm; result is written back through `a`.
        asm pure nothrow @nogc @trusted
        {
            movdqu XMM0, a;
            movdqu XMM1, b;
            punpcklbw XMM0, XMM1;
            movdqu a, XMM0;
        }
        return a;
    }
    else
    {
        // Generic path: result = [a0,b0, a1,b1, ... a7,b7]
        // (indices 16..31 of the shuffle select from `b`).
        return cast(__m128i) shufflevector!(byte16, 0, 16, 1, 17, 2, 18, 3, 19,
                                                    4, 20, 5, 21, 6, 22, 7, 23)
                                           (cast(byte16)a, cast(byte16)b);
    }
}
4138 
/// Unpack and interleave double-precision elements from the low half of `a` and `b`.
__m128d _mm_unpacklo_pd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        // GDC with SSE2: map directly to the UNPCKLPD builtin.
        return __builtin_ia32_unpcklpd(a, b);
    }
    else
    {
        // Generic path: result = [a0, b0] (index 2 selects element 0 of `b`).
        return shufflevector!(__m128d, 0, 2)(a, b);
    }
}
4150 
/// Compute the bitwise XOR of packed double-precision elements in `a` and `b`.
__m128d _mm_xor_pd (__m128d a, __m128d b) pure @safe
{
    // XOR the raw 128-bit payloads, then reinterpret the bits as doubles again.
    __m128i bitsA = cast(__m128i)a;
    __m128i bitsB = cast(__m128i)b;
    return cast(__m128d)(bitsA ^ bitsB);
}
4155 
/// Compute the bitwise XOR of 128 bits (representing integer data) in `a` and `b`.
__m128i _mm_xor_si128 (__m128i a, __m128i b) pure @safe
{
    __m128i result = a ^ b;
    return result;
}
4160 
unittest
{
    // End-to-end smoke test: Euclidean distance between two 4-D points,
    // computed entirely with the SSE intrinsics above.
    float distance(float[4] p, float[4] q) nothrow @nogc
    {
        __m128 vp = _mm_loadu_ps(p.ptr);
        __m128 vq = _mm_loadu_ps(q.ptr);
        __m128 d2 = _mm_sub_ps(vp, vq);
        d2 = _mm_mul_ps(d2, d2);                          // per-lane squared difference
        // Horizontal sum: fold upper lanes onto lane 0 via byte shifts.
        __m128 acc = _mm_add_ps(d2, _mm_srli_ps!8(d2));
        acc = _mm_add_ps(acc, _mm_srli_ps!4(acc));
        return _mm_cvtss_f32(_mm_sqrt_ss(acc));
    }
    assert(distance([0, 2, 0, 0], [0, 0, 0, 0]) == 2);
}