1 /**
2 * SSE2 intrinsics. 
3 *
4 * Copyright: Copyright Auburn Sounds 2016-2019, Stefanos Baziotis 2019.
5 * License:   $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
6 * Authors:   Guillaume Piolat
7 */
8 module inteli.emmintrin;
9 
10 public import inteli.types;
11 public import inteli.xmmintrin; // SSE2 includes SSE1
12 import inteli.mmx;
13 import inteli.internals;
14 
15 nothrow @nogc:
16 
17 
18 // SSE2 instructions
19 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#techs=SSE2
20 
/// Add packed 16-bit integers in `a` and `b`.
__m128i _mm_add_epi16 (__m128i a, __m128i b) pure @safe
{
    // Lane-wise wrapping addition on the short8 view of both operands.
    short8 sa = cast(short8)a;
    short8 sb = cast(short8)b;
    return cast(__m128i)(sa + sb);
}
unittest
{
    __m128i v = _mm_setr_epi16(4, 8, 13, -7, -1, 0, 9, 77);
    short8 doubled = cast(short8) _mm_add_epi16(v, v);
    short[8] expected = [8, 16, 26, -14, -2, 0, 18, 154];
    assert(doubled.array == expected);
}
33 
/// Add packed 32-bit integers in `a` and `b`.
__m128i _mm_add_epi32 (__m128i a, __m128i b) pure @safe
{
    // Lane-wise wrapping addition on the int4 view of both operands.
    int4 ia = cast(int4)a;
    int4 ib = cast(int4)b;
    return cast(__m128i)(ia + ib);
}
unittest
{
    __m128i v = _mm_setr_epi32( -7, -1, 0, 9);
    int4 doubled = _mm_add_epi32(v, v);
    int[4] expected = [ -14, -2, 0, 18 ];
    assert(doubled.array == expected);
}
46 
/// Add packed 64-bit integers in `a` and `b`.
__m128i _mm_add_epi64 (__m128i a, __m128i b) pure @safe
{
    // Lane-wise wrapping addition on the long2 view of both operands.
    long2 la = cast(long2)a;
    long2 lb = cast(long2)b;
    return cast(__m128i)(la + lb);
}
unittest
{
    __m128i v = _mm_setr_epi64(-1, 0x8000_0000_0000_0000);
    long2 doubled = cast(long2) _mm_add_epi64(v, v);
    long[2] expected = [ -2, 0 ]; // second lane wraps around to zero
    assert(doubled.array == expected);
}
59 
/// Add packed 8-bit integers in `a` and `b`.
__m128i _mm_add_epi8 (__m128i a, __m128i b) pure @safe
{
    // Lane-wise wrapping addition on the byte16 view of both operands.
    byte16 ba = cast(byte16)a;
    byte16 bb = cast(byte16)b;
    return cast(__m128i)(ba + bb);
}
unittest
{
    __m128i v = _mm_setr_epi8(4, 8, 13, -7, -1, 0, 9, 77, 4, 8, 13, -7, -1, 0, 9, 78);
    byte16 doubled = cast(byte16) _mm_add_epi8(v, v);
    byte[16] expected = [8, 16, 26, -14, -2, 0, 18, -102, 8, 16, 26, -14, -2, 0, 18, -100];
    assert(doubled.array == expected);
}
72 
/// Add the lower double-precision (64-bit) floating-point element 
/// in `a` and `b`, store the result in the lower element of dst, 
/// and copy the upper element from `a` to the upper element of destination. 
__m128d _mm_add_sd(__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        // GDC: direct ADDSD builtin.
        return __builtin_ia32_addsd(a, b);
    }
    else version(DigitalMars)
    {
        // Work-around for https://issues.dlang.org/show_bug.cgi?id=19599
        // The do-nothing asm block is the workaround itself; do not remove.
        asm pure nothrow @nogc @trusted { nop;}
        a[0] = a[0] + b[0];
        return a;
    }
    else
    {
        // Generic path: scalar add of lane 0; lane 1 is kept from `a`.
        a[0] += b[0];
        return a;
    }
}
unittest
{
    __m128d a = [1.5, -2.0];
    a = _mm_add_sd(a, a);
    assert(a.array == [3.0, -2.0]); // upper lane untouched
}
101 
/// Add packed double-precision (64-bit) floating-point elements in `a` and `b`.
__m128d _mm_add_pd (__m128d a, __m128d b) pure @safe
{
    // Vector addition maps directly to the builtin vector operator.
    __m128d sum = a + b;
    return sum;
}
unittest
{
    __m128d v = [1.5, -2.0];
    v = _mm_add_pd(v, v);
    assert(v.array == [3.0, -4.0]);
}
113 
/// Add 64-bit integers `a` and `b`.
__m64 _mm_add_si64 (__m64 a, __m64 b) pure @safe
{
    // Single-lane wrapping 64-bit addition.
    __m64 sum = a + b;
    return sum;
}
119 
/// Add packed 16-bit integers in `a` and `b` using signed saturation.
__m128i _mm_adds_epi16(__m128i a, __m128i b) pure @trusted
{
    static if (GDC_with_SSE2)
    {
        // GDC: direct PADDSW builtin.
        return __builtin_ia32_paddsw128(a, b);
    }
    else version(LDC)
    {
        static if (__VERSION__ >= 2085) // saturation x86 intrinsics disappeared in LLVM 8
        {
            // x86: Generates PADDSW since LDC 1.15 -O0
            // ARM: Generates sqadd.8h since LDC 1.21 -O1, really bad in <= 1.20            
            enum prefix = `declare <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> %a, <8 x i16> %b)`;
            enum ir = `
                %r = call <8 x i16> @llvm.sadd.sat.v8i16( <8 x i16> %0, <8 x i16> %1)
                ret <8 x i16> %r`;
            return cast(__m128i) LDCInlineIREx!(prefix, ir, "", short8, short8, short8)(cast(short8)a, cast(short8)b);
        }
        else static if (LDC_with_ARM) // Raspberry ships with LDC 1.12, no saturation 
        {
            // Scalar fallback: widen each lane to int, add, clamp to short range.
            short[8] res;
            short8 sa = cast(short8)a;
            short8 sb = cast(short8)b;
            foreach(i; 0..8)
                res[i] = saturateSignedIntToSignedShort(sa.array[i] + sb.array[i]);
            return _mm_loadu_si128(cast(int4*)res.ptr);
        }
        else
            return __builtin_ia32_paddsw128(a, b);
    }
    else
    {
        // Portable scalar fallback (e.g. DMD): widen, add, clamp.
        short[8] res;
        short8 sa = cast(short8)a;
        short8 sb = cast(short8)b;
        foreach(i; 0..8)
            res[i] = saturateSignedIntToSignedShort(sa.array[i] + sb.array[i]);
        return _mm_loadu_si128(cast(int4*)res.ptr);
    }
}
unittest
{
    short8 res = cast(short8) _mm_adds_epi16(_mm_set_epi16(7, 6, 5, 4, 3, 2, 1, 0),
                                             _mm_set_epi16(7, 6, 5, 4, 3, 2, 1, 0));
    static immutable short[8] correctResult = [0, 2, 4, 6, 8, 10, 12, 14];
    assert(res.array == correctResult);
}
168 
/// Add packed 8-bit signed integers in `a` and `b` using signed saturation.
__m128i _mm_adds_epi8(__m128i a, __m128i b) pure @trusted
{
    static if (GDC_with_SSE2)
    {
        // GDC: direct PADDSB builtin.
        return __builtin_ia32_paddsb128(a, b);
    }
    else version(LDC)
    {
        static if (__VERSION__ >= 2085) // saturation x86 intrinsics disappeared in LLVM 8
        {
            // x86: Generates PADDSB since LDC 1.15 -O0
            // ARM: Generates sqadd.16b since LDC 1.21 -O1, really bad in <= 1.20
            enum prefix = `declare <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8> %a, <16 x i8> %b)`;
            enum ir = `
                %r = call <16 x i8> @llvm.sadd.sat.v16i8( <16 x i8> %0, <16 x i8> %1)
                ret <16 x i8> %r`;
            return cast(__m128i) LDCInlineIREx!(prefix, ir, "", byte16, byte16, byte16)(cast(byte16)a, cast(byte16)b);
        }
        else static if (LDC_with_ARM) // Raspberry ships with LDC 1.12, no saturation 
        {
            // Scalar fallback: widen each lane, add, clamp to byte range.
            byte[16] res;
            byte16 sa = cast(byte16)a;
            byte16 sb = cast(byte16)b;
            foreach(i; 0..16)
                res[i] = saturateSignedWordToSignedByte(sa[i] + sb[i]);
            return _mm_loadu_si128(cast(int4*)res.ptr);
        }
        else
            return __builtin_ia32_paddsb128(a, b);
    }
    else
    {
        // Portable scalar fallback (e.g. DMD): widen, add, clamp.
        byte[16] res;
        byte16 sa = cast(byte16)a;
        byte16 sb = cast(byte16)b;
        foreach(i; 0..16)
            res[i] = saturateSignedWordToSignedByte(sa[i] + sb[i]);
        return _mm_loadu_si128(cast(int4*)res.ptr);
    }
}
unittest
{
    byte16 res = cast(byte16) _mm_adds_epi8(_mm_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0),
                                            _mm_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0));
    static immutable byte[16] correctResult = [0, 2, 4, 6, 8, 10, 12, 14,
                                               16, 18, 20, 22, 24, 26, 28, 30];
    assert(res.array == correctResult);
}
218 
/// Add packed 8-bit unsigned integers in `a` and `b` using unsigned saturation.
// PERF: #GDC version?
__m128i _mm_adds_epu8(__m128i a, __m128i b) pure @trusted
{
    version(LDC)
    {
        static if (__VERSION__ >= 2085) // saturation x86 intrinsics disappeared in LLVM 8
        {
            // x86: Generates PADDUSB since LDC 1.15 -O0
            // ARM: Generates uqadd.16b since LDC 1.21 -O1
            enum prefix = `declare <16 x i8> @llvm.uadd.sat.v16i8(<16 x i8> %a, <16 x i8> %b)`;
            enum ir = `
                %r = call <16 x i8> @llvm.uadd.sat.v16i8( <16 x i8> %0, <16 x i8> %1)
                ret <16 x i8> %r`;
            return cast(__m128i) LDCInlineIREx!(prefix, ir, "", byte16, byte16, byte16)(cast(byte16)a, cast(byte16)b);
        }
        else static if (LDC_with_ARM) // Raspberry ships with LDC 1.12, no saturation 
        {
            // Scalar fallback: reinterpret lanes as unsigned, add, clamp to [0, 255].
            ubyte[16] res;
            byte16 sa = cast(byte16)a;
            byte16 sb = cast(byte16)b;
            foreach(i; 0..16)
                res[i] = saturateSignedWordToUnsignedByte(cast(ubyte)(sa.array[i]) + cast(ubyte)(sb.array[i]));
            return _mm_loadu_si128(cast(int4*)res.ptr);
        }
        else
            return __builtin_ia32_paddusb128(a, b);
    }
    else
    {
        // Portable scalar fallback: reinterpret as unsigned, add, clamp to [0, 255].
        ubyte[16] res;
        byte16 sa = cast(byte16)a;
        byte16 sb = cast(byte16)b;
        foreach(i; 0..16)
            res[i] = saturateSignedWordToUnsignedByte(cast(ubyte)(sa.array[i]) + cast(ubyte)(sb.array[i]));
        return _mm_loadu_si128(cast(int4*)res.ptr);
    }
}
unittest
{
    // 255 + 1 saturates to 255 rather than wrapping.
    byte16 res = cast(byte16) _mm_adds_epu8(_mm_set_epi8(7, 6, 5, 4, 3, 2, cast(byte)255, 0, 7, 6, 5, 4, 3, 2, cast(byte)255, 0),
                                            _mm_set_epi8(7, 6, 5, 4, 3, 2, 1, 0, 7, 6, 5, 4, 3, 2, 1, 0));
    static immutable byte[16] correctResult = [0, cast(byte)255, 4, 6, 8, 10, 12, 14, 0, cast(byte)255, 4, 6, 8, 10, 12, 14];
    assert(res.array == correctResult);
}
264 
/// Add packed unsigned 16-bit integers in `a` and `b` using unsigned saturation.
// PERF: #GDC version?
__m128i _mm_adds_epu16(__m128i a, __m128i b) pure @trusted
{
    version(LDC)
    {
        static if (__VERSION__ >= 2085) // saturation x86 intrinsics disappeared in LLVM 8
        {
            // x86: Generates PADDUSW since LDC 1.15 -O0
            // ARM: Generates uqadd.8h since LDC 1.21 -O1
            enum prefix = `declare <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> %a, <8 x i16> %b)`;
            enum ir = `
                %r = call <8 x i16> @llvm.uadd.sat.v8i16( <8 x i16> %0, <8 x i16> %1)
                ret <8 x i16> %r`;
            return cast(__m128i) LDCInlineIREx!(prefix, ir, "", short8, short8, short8)(cast(short8)a, cast(short8)b);
        }
        else static if (LDC_with_ARM) // Raspberry ships with LDC 1.12, no saturation 
        {
            // Scalar fallback: reinterpret lanes as unsigned, add, clamp to [0, 65535].
            ushort[8] res;
            short8 sa = cast(short8)a;
            short8 sb = cast(short8)b;
            foreach(i; 0..8)
                res[i] = saturateSignedIntToUnsignedShort(cast(ushort)(sa.array[i]) + cast(ushort)(sb.array[i]));
            return _mm_loadu_si128(cast(int4*)res.ptr);
        }
        else
            return __builtin_ia32_paddusw128(a, b);
    }
    else
    {
        // Portable scalar fallback: reinterpret as unsigned, add, clamp to [0, 65535].
        ushort[8] res;
        short8 sa = cast(short8)a;
        short8 sb = cast(short8)b;
        foreach(i; 0..8)
            res[i] = saturateSignedIntToUnsignedShort(cast(ushort)(sa.array[i]) + cast(ushort)(sb.array[i]));
        return _mm_loadu_si128(cast(int4*)res.ptr);
    }
}
unittest
{
    // 65535 + 1 saturates to 65535 rather than wrapping.
    short8 res = cast(short8) _mm_adds_epu16(_mm_set_epi16(3, 2, cast(short)65535, 0, 3, 2, cast(short)65535, 0),
                                             _mm_set_epi16(3, 2, 1, 0, 3, 2, 1, 0));
    static immutable short[8] correctResult = [0, cast(short)65535, 4, 6, 0, cast(short)65535, 4, 6];
    assert(res.array == correctResult);
}
310 
/// Compute the bitwise AND of packed double-precision (64-bit) 
/// floating-point elements in `a` and `b`.
__m128d _mm_and_pd (__m128d a, __m128d b) pure @safe
{
    // Operate on the raw 64-bit lanes, then reinterpret back as doubles.
    long2 la = cast(long2)a;
    long2 lb = cast(long2)b;
    return cast(__m128d)(la & lb);
}
unittest
{
    double x = 4.32;
    double y = -78.99;
    long expected = (*cast(long*)(&x)) & (*cast(long*)(&y));
    __m128d va = _mm_set_pd(x, y);
    __m128d vb = _mm_set_pd(y, x);
    long2 r = cast(long2)( _mm_and_pd(va, vb) );
    assert(r.array[0] == expected);
    assert(r.array[1] == expected);
}
328 
/// Compute the bitwise AND of 128 bits (representing integer data) in `a` and `b`.
__m128i _mm_and_si128 (__m128i a, __m128i b) pure @safe
{
    // Vector operator & performs the full 128-bit AND.
    __m128i result = a & b;
    return result;
}
unittest
{
    __m128i x = _mm_set1_epi32(7);
    __m128i y = _mm_set1_epi32(14);
    __m128i r = _mm_and_si128(x, y);
    int[4] expected = [6, 6, 6, 6]; // 7 & 14 == 6
    assert(r.array == expected);
}
342 
/// Compute the bitwise NOT of packed double-precision (64-bit) 
/// floating-point elements in `a` and then AND with `b`.
__m128d _mm_andnot_pd (__m128d a, __m128d b) pure @safe
{
    // (~a) & b on the raw 64-bit lanes; note only `a` is complemented.
    long2 notA = ~(cast(long2)a);
    long2 lb = cast(long2)b;
    return cast(__m128d)(notA & lb);
}
unittest
{
    double x = 4.32;
    double y = -78.99;
    long expected0 = (~*cast(long*)(&x)) & ( *cast(long*)(&y));
    long expected1 = ( *cast(long*)(&x)) & (~*cast(long*)(&y));
    __m128d va = _mm_setr_pd(x, y);
    __m128d vb = _mm_setr_pd(y, x);
    long2 r = cast(long2)( _mm_andnot_pd(va, vb) );
    assert(r.array[0] == expected0);
    assert(r.array[1] == expected1);
}
361 
/// Compute the bitwise NOT of 128 bits (representing integer data) 
/// in `a` and then AND with `b`.
__m128i _mm_andnot_si128 (__m128i a, __m128i b) pure @safe
{
    // Only the first operand is complemented.
    __m128i complementA = ~a;
    return complementA & b;
}
unittest
{
    __m128i x = _mm_set1_epi32(7);
    __m128i y = _mm_set1_epi32(14);
    __m128i r = _mm_andnot_si128(x, y);
    int[4] expected = [8, 8, 8, 8]; // (~7) & 14 == 8
    assert(r.array == expected);
}
376 
/// Average packed unsigned 16-bit integers in `a` and `b`.
__m128i _mm_avg_epu16 (__m128i a, __m128i b) pure @trusted
{
    static if (GDC_with_SSE2)
    {
        // GDC: direct PAVGW builtin.
        return __builtin_ia32_pavgw128(a, b);
    }
    else static if (LDC_with_ARM64)
    {
        // AArch64: rounding halving add.
        return cast(__m128i) vrhadd_u16(cast(short8)a, cast(short8)b);
    }
    else version(LDC)
    {
        // Generates pavgw even in LDC 1.0, even in -O0
        // But not in ARM
        // IR computes (zext(a) + zext(b) + 1) >> 1 per lane, then truncates.
        enum ir = `
            %ia = zext <8 x i16> %0 to <8 x i32>
            %ib = zext <8 x i16> %1 to <8 x i32>
            %isum = add <8 x i32> %ia, %ib
            %isum1 = add <8 x i32> %isum, < i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
            %isums = lshr <8 x i32> %isum1, < i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
            %r = trunc <8 x i32> %isums to <8 x i16>
            ret <8 x i16> %r`;
        return cast(__m128i) LDCInlineIR!(ir, short8, short8, short8)(cast(short8)a, cast(short8)b);
    }
    else
    {
        // Portable scalar fallback: rounding average (sum + 1) >> 1 per lane.
        short8 sa = cast(short8)a;
        short8 sb = cast(short8)b;
        short8 sr = void;
        foreach(i; 0..8)
        {
            sr.ptr[i] = cast(ushort)( (cast(ushort)(sa.array[i]) + cast(ushort)(sb.array[i]) + 1) >> 1 );
        }
        return cast(int4)sr;
    }
}
unittest
{
    __m128i A = _mm_set1_epi16(31);
    __m128i B = _mm_set1_epi16(64);
    short8 avg = cast(short8)(_mm_avg_epu16(A, B));
    foreach(i; 0..8)
        assert(avg.array[i] == 48); // (31 + 64 + 1) >> 1
}
422 
/// Average packed unsigned 8-bit integers in `a` and `b`.
__m128i _mm_avg_epu8 (__m128i a, __m128i b) pure @trusted
{
    static if (GDC_with_SSE2)
    {
        // GDC: direct PAVGB builtin.
        return __builtin_ia32_pavgb128(a, b);
    }
    else static if (LDC_with_ARM64)
    {
        // AArch64: rounding halving add.
        return cast(__m128i) vrhadd_u8(cast(byte16)a, cast(byte16)b);
    }
    else version(LDC)
    {
        // Generates pavgb even in LDC 1.0, even in -O0
        // But not in ARM
        // IR computes (zext(a) + zext(b) + 1) >> 1 per lane, then truncates.
        enum ir = `
            %ia = zext <16 x i8> %0 to <16 x i16>
            %ib = zext <16 x i8> %1 to <16 x i16>
            %isum = add <16 x i16> %ia, %ib
            %isum1 = add <16 x i16> %isum, < i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
            %isums = lshr <16 x i16> %isum1, < i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
            %r = trunc <16 x i16> %isums to <16 x i8>
            ret <16 x i8> %r`;
        return cast(__m128i) LDCInlineIR!(ir, byte16, byte16, byte16)(cast(byte16)a, cast(byte16)b);
    }
    else
    {
        // Portable scalar fallback: rounding average (sum + 1) >> 1 per lane.
        byte16 sa = cast(byte16)a;
        byte16 sb = cast(byte16)b;
        byte16 sr = void;
        foreach(i; 0..16)
        {
            sr[i] = cast(ubyte)( (cast(ubyte)(sa[i]) + cast(ubyte)(sb[i]) + 1) >> 1 );
        }
        return cast(int4)sr;
    }
}
unittest
{
    __m128i A = _mm_set1_epi8(31);
    __m128i B = _mm_set1_epi8(64);
    byte16 avg = cast(byte16)(_mm_avg_epu8(A, B));
    foreach(i; 0..16)
        assert(avg.array[i] == 48); // (31 + 64 + 1) >> 1
}
468 
/// Shift `a` left by `bytes` bytes while shifting in zeros.
/// Alias of `_mm_slli_si128` (same PSLLDQ semantics, SSE2 naming variant).
alias _mm_bslli_si128 = _mm_slli_si128;
unittest
{
    __m128i toShift = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
    byte[16] exact =               [0, 0, 0, 0, 0, 0, 1, 2, 3, 4,  5,  6,  7,  8,  9, 10];
    __m128i result = _mm_bslli_si128!5(toShift);
    assert( (cast(byte16)result).array == exact);
}

/// Shift `v` right by `bytes` bytes while shifting in zeros.
/// Alias of `_mm_srli_si128` (same PSRLDQ semantics, SSE2 naming variant).
alias _mm_bsrli_si128 = _mm_srli_si128;
unittest
{
    __m128i toShift = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
    byte[16] exact =               [5, 6, 7, 8, 9,10,11,12,13,14, 15,  0,  0,  0,  0,  0];
    __m128i result = _mm_bsrli_si128!5(toShift);
    assert( (cast(byte16)result).array == exact);
}
488 
/// Cast vector of type `__m128d` to type `__m128`. 
/// Note: Also possible with a regular `cast(__m128)(a)`.
/// This is a bit-pattern reinterpretation; no value conversion is performed.
__m128 _mm_castpd_ps (__m128d a) pure @safe
{
    return cast(__m128)a;
}

/// Cast vector of type `__m128d` to type `__m128i`. 
/// Note: Also possible with a regular `cast(__m128i)(a)`.
/// This is a bit-pattern reinterpretation; no value conversion is performed.
__m128i _mm_castpd_si128 (__m128d a) pure @safe
{
    return cast(__m128i)a;
}

/// Cast vector of type `__m128` to type `__m128d`. 
/// Note: Also possible with a regular `cast(__m128d)(a)`.
/// This is a bit-pattern reinterpretation; no value conversion is performed.
__m128d _mm_castps_pd (__m128 a) pure @safe
{
    return cast(__m128d)a;
}

/// Cast vector of type `__m128` to type `__m128i`. 
/// Note: Also possible with a regular `cast(__m128i)(a)`.
/// This is a bit-pattern reinterpretation; no value conversion is performed.
__m128i _mm_castps_si128 (__m128 a) pure @safe
{
    return cast(__m128i)a;
}

/// Cast vector of type `__m128i` to type `__m128d`. 
/// Note: Also possible with a regular `cast(__m128d)(a)`.
/// This is a bit-pattern reinterpretation; no value conversion is performed.
__m128d _mm_castsi128_pd (__m128i a) pure @safe
{
    return cast(__m128d)a;
}

/// Cast vector of type `__m128i` to type `__m128`. 
/// Note: Also possible with a regular `cast(__m128)(a)`.
/// This is a bit-pattern reinterpretation; no value conversion is performed.
__m128 _mm_castsi128_ps (__m128i a) pure @safe
{
    return cast(__m128)a;
}
530 
/// Invalidate and flush the cache line that contains `p` 
/// from all levels of the cache hierarchy.
void _mm_clflush (const(void)* p) @trusted
{
    static if (GDC_with_SSE2)
    {
        __builtin_ia32_clflush(p);
    }
    else static if (LDC_with_SSE2)
    {
        // LDC's builtin takes a mutable pointer; const is cast away here,
        // which is fine since CLFLUSH does not modify the pointed-to data.
        __builtin_ia32_clflush(cast(void*)p);
    }
    else version(D_InlineAsm_X86)
    {
        // DMD 32-bit: emit CLFLUSH through inline asm.
        asm pure nothrow @nogc @safe
        {
            mov EAX, p;
            clflush [EAX];
        }
    }
    else version(D_InlineAsm_X86_64)
    {
        // DMD 64-bit: emit CLFLUSH through inline asm.
        asm pure nothrow @nogc @safe
        {
            mov RAX, p;
            clflush [RAX];
        }
    }
    else 
    {
        // Do nothing. Invalidating cacheline does
        // not affect correctness.
    }
}
unittest
{
    ubyte[64] cacheline;
    _mm_clflush(cacheline.ptr);
}
570 
/// Compare packed 16-bit integers in `a` and `b` for equality.
/// Each lane of the result is -1 (all bits set) when equal, 0 otherwise.
__m128i _mm_cmpeq_epi16 (__m128i a, __m128i b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_pcmpeqw128(a, b);
    }
    else
    {
        return cast(__m128i) equalMask!short8(cast(short8)a, cast(short8)b);
    }
}
unittest
{
    short8   A = [-3, -2, -1,  0,  0,  1,  2,  3];
    short8   B = [ 4,  3,  2,  1,  0, -1, -2, -3];
    short[8] E = [ 0,  0,  0,  0, -1,  0,  0,  0];
    short8   R = cast(short8)(_mm_cmpeq_epi16(cast(__m128i)A, cast(__m128i)B));
    assert(R.array == E);
}
591 
/// Compare packed 32-bit integers in `a` and `b` for equality.
/// Each lane of the result is -1 (all bits set) when equal, 0 otherwise.
__m128i _mm_cmpeq_epi32 (__m128i a, __m128i b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_pcmpeqd128(a, b);
    }
    else
    {
        return equalMask!__m128i(a, b);
    }
}
unittest
{
    int4   A = [-3, -2, -1,  0];
    int4   B = [ 4, -2,  2,  0];
    int[4] E = [ 0, -1,  0, -1];
    // Fixed: previously this unittest mistakenly called _mm_cmpeq_epi16,
    // so _mm_cmpeq_epi32 itself was never exercised.
    int4   R = cast(int4)(_mm_cmpeq_epi32(A, B));
    assert(R.array == E);
}
612 
/// Compare packed 8-bit integers in `a` and `b` for equality.
/// Each lane of the result is -1 (all bits set) when equal, 0 otherwise.
__m128i _mm_cmpeq_epi8 (__m128i a, __m128i b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_pcmpeqb128(a, b); 
    }
    else
    {
        return cast(__m128i) equalMask!byte16(cast(byte16)a, cast(byte16)b);
    }
}
unittest
{
    __m128i A = _mm_setr_epi8(1, 2, 3, 1, 2, 1, 1, 2, 3, 2, 1, 0, 0, 1, 2, 1);
    __m128i B = _mm_setr_epi8(2, 2, 1, 2, 3, 1, 2, 3, 2, 1, 0, 0, 1, 2, 1, 1);
    byte16 C = cast(byte16) _mm_cmpeq_epi8(A, B);
    byte[16] correct =       [0,-1, 0, 0, 0,-1, 0, 0, 0, 0, 0,-1, 0, 0, 0, -1];
    assert(C.array == correct);
}
633 
/// Compare packed double-precision (64-bit) floating-point elements 
/// in `a` and `b` for equality.
/// Predicate is `oeq` (ordered, equal): NaN operands compare false.
__m128d _mm_cmpeq_pd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmpeqpd(a, b);
    }
    else
    {
        return cast(__m128d) cmppd!(FPComparison.oeq)(a, b);
    }
}

/// Compare the lower double-precision (64-bit) floating-point elements
/// in `a` and `b` for equality, store the result in the lower element,
/// and copy the upper element from `a`.
__m128d _mm_cmpeq_sd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmpeqsd(a, b);
    }
    else
    {
        return cast(__m128d) cmpsd!(FPComparison.oeq)(a, b);
    }
}

/// Compare packed double-precision (64-bit) floating-point elements 
/// in `a` and `b` for greater-than-or-equal.
/// Predicate is `oge` (ordered): NaN operands compare false.
__m128d _mm_cmpge_pd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmpgepd(a, b);
    }
    else
    {
        return cast(__m128d) cmppd!(FPComparison.oge)(a, b);
    }
}

/// Compare the lower double-precision (64-bit) floating-point elements 
/// in `a` and `b` for greater-than-or-equal, store the result in the 
/// lower element, and copy the upper element from `a`.
__m128d _mm_cmpge_sd (__m128d a, __m128d b) pure @safe
{
    // Note: There is no __builtin_ia32_cmpgesd builtin.
    static if (GDC_with_SSE2)
    {
        // a >= b is expressed as !(b < a) with swapped operands.
        return __builtin_ia32_cmpnltsd(b, a);
    }
    else
    {
        return cast(__m128d) cmpsd!(FPComparison.oge)(a, b);
    }
}
692 
/// Compare packed 16-bit integers in `a` and `b` for greater-than.
/// Signed comparison; each lane of the result is -1 when a > b, 0 otherwise.
__m128i _mm_cmpgt_epi16 (__m128i a, __m128i b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_pcmpgtw128(a, b); 
    }
    else
    {
        return cast(__m128i)( greaterMask!short8(cast(short8)a, cast(short8)b));
    }
}
unittest
{
    short8   A = [-3, -2, -1,  0,  0,  1,  2,  3];
    short8   B = [ 4,  3,  2,  1,  0, -1, -2, -3];
    short[8] E = [ 0,  0,  0,  0,  0, -1, -1, -1];
    short8   R = cast(short8)(_mm_cmpgt_epi16(cast(__m128i)A, cast(__m128i)B));
    assert(R.array == E);
}

/// Compare packed 32-bit integers in `a` and `b` for greater-than.
/// Signed comparison; each lane of the result is -1 when a > b, 0 otherwise.
__m128i _mm_cmpgt_epi32 (__m128i a, __m128i b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_pcmpgtd128(a, b); 
    }
    else
    {
        return cast(__m128i)( greaterMask!int4(a, b));
    }
}
unittest
{
    int4   A = [-3,  2, -1,  0];
    int4   B = [ 4, -2,  2,  0];
    int[4] E = [ 0, -1,  0,  0];
    int4   R = cast(int4)(_mm_cmpgt_epi32(A, B));
    assert(R.array == E);
}
734 
/// Compare packed 8-bit integers in `a` and `b` for greater-than.
/// Signed comparison; each lane of the result is -1 when a > b, 0 otherwise.
__m128i _mm_cmpgt_epi8 (__m128i a, __m128i b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_pcmpgtb128(a, b); 
    }
    else
    {
        return cast(__m128i)( greaterMask!byte16(cast(byte16)a, cast(byte16)b));
    }
}
unittest
{
    __m128i A = _mm_setr_epi8(1, 2, 3, 1, 2, 1, 1, 2, 3, 2, 1, 0, 0, 1, 2, 1);
    __m128i B = _mm_setr_epi8(2, 2, 1, 2, 3, 1, 2, 3, 2, 1, 0, 0, 1, 2, 1, 1);
    byte16 C = cast(byte16) _mm_cmpgt_epi8(A, B);
    byte[16] correct =       [0, 0,-1, 0, 0, 0, 0, 0,-1,-1,-1, 0, 0, 0,-1, 0];
    // Removed an unused local (`__m128i D = _mm_cmpeq_epi8(A, B);`) whose
    // result was never asserted and tested nothing.
    assert(C.array == correct);
}
756 
/// Compare packed double-precision (64-bit) floating-point elements 
/// in `a` and `b` for greater-than.
/// Predicate is `ogt` (ordered): NaN operands compare false.
__m128d _mm_cmpgt_pd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmpgtpd(a, b); 
    }
    else
    {
        return cast(__m128d) cmppd!(FPComparison.ogt)(a, b);
    }
}

/// Compare the lower double-precision (64-bit) floating-point elements 
/// in `a` and `b` for greater-than, store the result in the lower element,
/// and copy the upper element from `a`.
__m128d _mm_cmpgt_sd (__m128d a, __m128d b) pure @safe
{
    // Note: There is no __builtin_ia32_cmpgtsd builtin.
    static if (GDC_with_SSE2)
    {
        // a > b is expressed as !(b <= a) with swapped operands.
        return __builtin_ia32_cmpnlesd(b, a);
    }
    else
    {
        return cast(__m128d) cmpsd!(FPComparison.ogt)(a, b);
    }
}

/// Compare packed double-precision (64-bit) floating-point elements 
/// in `a` and `b` for less-than-or-equal.
/// Predicate is `ole` (ordered): NaN operands compare false.
__m128d _mm_cmple_pd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmplepd(a, b); 
    }
    else
    {
        return cast(__m128d) cmppd!(FPComparison.ole)(a, b);
    }
}

/// Compare the lower double-precision (64-bit) floating-point elements 
/// in `a` and `b` for less-than-or-equal, store the result in the 
/// lower element, and copy the upper element from `a`.
__m128d _mm_cmple_sd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmplesd(a, b); 
    }
    else
    {
        return cast(__m128d) cmpsd!(FPComparison.ole)(a, b);
    }
}

/// Compare packed 16-bit integers in `a` and `b` for less-than.
/// Implemented as greater-than with swapped operands (no PCMPLT exists).
__m128i _mm_cmplt_epi16 (__m128i a, __m128i b) pure @safe
{
    return _mm_cmpgt_epi16(b, a);
}

/// Compare packed 32-bit integers in `a` and `b` for less-than.
/// Implemented as greater-than with swapped operands (no PCMPLT exists).
__m128i _mm_cmplt_epi32 (__m128i a, __m128i b) pure @safe
{
    return _mm_cmpgt_epi32(b, a);
}

/// Compare packed 8-bit integers in `a` and `b` for less-than.
/// Implemented as greater-than with swapped operands (no PCMPLT exists).
__m128i _mm_cmplt_epi8 (__m128i a, __m128i b) pure @safe
{
    return _mm_cmpgt_epi8(b, a);
}
833 
/// Compare packed double-precision (64-bit) floating-point elements
/// in `a` and `b` for less-than.
/// Predicate is `olt` (ordered): NaN operands compare false.
__m128d _mm_cmplt_pd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmpltpd(a, b); 
    }
    else
    {
        return cast(__m128d) cmppd!(FPComparison.olt)(a, b);
    }
}

/// Compare the lower double-precision (64-bit) floating-point elements
/// in `a` and `b` for less-than, store the result in the lower 
/// element, and copy the upper element from `a`.
__m128d _mm_cmplt_sd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmpltsd(a, b); 
    }
    else
    {
        return cast(__m128d) cmpsd!(FPComparison.olt)(a, b);
    }
}

/// Compare packed double-precision (64-bit) floating-point elements
/// in `a` and `b` for not-equal.
/// Predicate is `une` (unordered): NaN operands compare true.
__m128d _mm_cmpneq_pd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmpneqpd(a, b); 
    }
    else
    {
        return cast(__m128d) cmppd!(FPComparison.une)(a, b);
    }
}

/// Compare the lower double-precision (64-bit) floating-point elements
/// in `a` and `b` for not-equal, store the result in the lower 
/// element, and copy the upper element from `a`.
__m128d _mm_cmpneq_sd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmpneqsd(a, b); 
    }
    else
    {
        return cast(__m128d) cmpsd!(FPComparison.une)(a, b);
    }
}

/// Compare packed double-precision (64-bit) floating-point elements 
/// in `a` and `b` for not-greater-than-or-equal.
/// Equivalent to the `ult` predicate (unordered or less-than).
__m128d _mm_cmpnge_pd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmpngepd(a, b); 
    }
    else
    {
        return cast(__m128d) cmppd!(FPComparison.ult)(a, b);
    }
}

/// Compare the lower double-precision (64-bit) floating-point elements 
/// in `a` and `b` for not-greater-than-or-equal, store the result in 
/// the lower element, and copy the upper element from `a`.
__m128d _mm_cmpnge_sd (__m128d a, __m128d b) pure @safe
{
    // Note: There is no __builtin_ia32_cmpngesd builtin.
    static if (GDC_with_SSE2)
    {
        // !(a >= b) is expressed as (b < a)-style swap of the LT builtin.
        return __builtin_ia32_cmpltsd(b, a); 
    }
    else
    {
        return cast(__m128d) cmpsd!(FPComparison.ult)(a, b);
    }
}
921 
922 /// Compare packed double-precision (64-bit) floating-point elements 
923 /// in `a` and `b` for not-greater-than.
924 __m128d _mm_cmpngt_pd (__m128d a, __m128d b) pure @safe
925 {
926     static if (GDC_with_SSE2)
927     {
928         return __builtin_ia32_cmpngtpd(a, b);
929     }
930     else
931     {
932         return cast(__m128d) cmppd!(FPComparison.ule)(a, b);
933     }
934 }
935 
/// Compare the lower double-precision (64-bit) floating-point elements 
/// in `a` and `b` for not-greater-than, store the result in the 
/// lower element, and copy the upper element from `a`.
/// The predicate is true (all-ones) when either operand is NaN.
__m128d _mm_cmpngt_sd (__m128d a, __m128d b) pure @safe
{
    // Note: There is no __builtin_ia32_cmpngtsd builtin.
    // The former GDC-only path returned __builtin_ia32_cmplesd(b, a), which
    // computes (b <= a, ordered): the opposite predicate of !(a > b), false
    // on NaN, and with the upper lane taken from `b` instead of `a`.
    // Use the generic implementation on every compiler instead:
    // `ule` (unordered-or-less-equal) is exactly !(a > b), and cmpsd
    // preserves the upper lane of `a` as the intrinsic requires.
    return cast(__m128d) cmpsd!(FPComparison.ule)(a, b);
}
951 
/// Compare packed double-precision (64-bit) floating-point elements 
/// in `a` and `b` for not-less-than-or-equal.
/// The predicate is true (all-ones) when either operand is NaN (CMPNLEPD).
__m128d _mm_cmpnle_pd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmpnlepd(a, b);
    }
    else
    {
        // `ugt` = unordered-or-greater-than, i.e. exactly !(a <= b).
        return cast(__m128d) cmppd!(FPComparison.ugt)(a, b);
    }
}
965 
/// Compare the lower double-precision (64-bit) floating-point elements 
/// in `a` and `b` for not-less-than-or-equal, store the result in the 
/// lower element, and copy the upper element from `a`.
/// The predicate is true (all-ones) when either operand is NaN (CMPNLESD).
__m128d _mm_cmpnle_sd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        // Direct builtin exists for this predicate, unlike cmpnge/cmpngt.
        return __builtin_ia32_cmpnlesd(a, b);
    }
    else
    {
        // `ugt` = unordered-or-greater-than, i.e. exactly !(a <= b).
        return cast(__m128d) cmpsd!(FPComparison.ugt)(a, b);
    }
}
980  
/// Compare packed double-precision (64-bit) floating-point elements 
/// in `a` and `b` for not-less-than.
/// The predicate is true (all-ones) when either operand is NaN (CMPNLTPD).
__m128d _mm_cmpnlt_pd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmpnltpd(a, b);
    }
    else
    {
        // `uge` = unordered-or-greater-equal, i.e. exactly !(a < b).
        return cast(__m128d) cmppd!(FPComparison.uge)(a, b);
    }
}
994 
/// Compare the lower double-precision (64-bit) floating-point elements 
/// in `a` and `b` for not-less-than, store the result in the lower 
/// element, and copy the upper element from `a`.
/// The predicate is true (all-ones) when either operand is NaN (CMPNLTSD).
__m128d _mm_cmpnlt_sd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmpnltsd(a, b);
    }
    else
    {
        // `uge` = unordered-or-greater-equal, i.e. exactly !(a < b).
        return cast(__m128d) cmpsd!(FPComparison.uge)(a, b);
    }
}
1009 
/// Compare packed double-precision (64-bit) floating-point elements 
/// in `a` and `b` to see if neither is NaN.
/// A lane is all-ones when both corresponding inputs are non-NaN (CMPORDPD).
__m128d _mm_cmpord_pd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmpordpd(a, b);
    }
    else
    {
        // `ord` = ordered: true iff neither operand is NaN.
        return cast(__m128d) cmppd!(FPComparison.ord)(a, b);
    }
}
1023 
/// Compare the lower double-precision (64-bit) floating-point elements 
/// in `a` and `b` to see if neither is NaN, store the result in the 
/// lower element, and copy the upper element from `a` to the upper element.
__m128d _mm_cmpord_sd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmpordsd(a, b);
    }
    else
    {
        // `ord` = ordered: true iff neither operand is NaN.
        return cast(__m128d) cmpsd!(FPComparison.ord)(a, b);
    }
}
1038 
/// Compare packed double-precision (64-bit) floating-point elements 
/// in `a` and `b` to see if either is NaN.
/// A lane is all-ones when at least one corresponding input is NaN (CMPUNORDPD).
__m128d _mm_cmpunord_pd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmpunordpd(a, b);
    }
    else
    {
        // `uno` = unordered: true iff at least one operand is NaN.
        return cast(__m128d) cmppd!(FPComparison.uno)(a, b);
    }
}
1052 
/// Compare the lower double-precision (64-bit) floating-point elements 
/// in `a` and `b` to see if either is NaN, store the result in the lower 
/// element, and copy the upper element from `a` to the upper element.
__m128d _mm_cmpunord_sd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmpunordsd(a, b);
    }
    else
    {
        // `uno` = unordered: true iff at least one operand is NaN.
        return cast(__m128d) cmpsd!(FPComparison.uno)(a, b);
    }
}
1067 
1068 
// Note: we've reverted to clang's and GCC's behaviour with regards to EFLAGS.
// Some of these comparisons yield true for NaNs, others don't.
1071 
/// Compare the lower double-precision (64-bit) floating-point element 
/// in `a` and `b` for equality, and return the boolean result (0 or 1).
int _mm_comieq_sd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_comieq(a, b);
    }
    else
    {
        // `ueq` = unordered-or-equal.
        return comsd!(FPComparison.ueq)(a, b); // yields true for NaN, same as GCC
    }
}
1085 
/// Compare the lower double-precision (64-bit) floating-point element 
/// in `a` and `b` for greater-than-or-equal, and return the boolean 
/// result (0 or 1).
int _mm_comige_sd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_comige(a, b);
    }
    else
    {
        // `oge` = ordered greater-or-equal: yields false when either is NaN.
        return comsd!(FPComparison.oge)(a, b);
    }
}
1100 
/// Compare the lower double-precision (64-bit) floating-point element 
/// in `a` and `b` for greater-than, and return the boolean result (0 or 1).
int _mm_comigt_sd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_comigt(a, b);
    }
    else
    {
        // `ogt` = ordered greater-than: yields false when either is NaN.
        return comsd!(FPComparison.ogt)(a, b);
    }
}
1114 
/// Compare the lower double-precision (64-bit) floating-point element 
/// in `a` and `b` for less-than-or-equal, and return the boolean result (0 or 1).
int _mm_comile_sd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_comile(a, b);
    }
    else
    {
        // `ule` = unordered-or-less-equal.
        return comsd!(FPComparison.ule)(a, b); // yields true for NaN, same as GCC
    }
}
1128 
/// Compare the lower double-precision (64-bit) floating-point element 
/// in `a` and `b` for less-than, and return the boolean result (0 or 1).
int _mm_comilt_sd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_comilt(a, b);
    }
    else
    {
        // `ult` = unordered-or-less-than.
        return comsd!(FPComparison.ult)(a, b); // yields true for NaN, same as GCC
    }
}
1142 
/// Compare the lower double-precision (64-bit) floating-point element
/// in `a` and `b` for not-equal, and return the boolean result (0 or 1).
int _mm_comineq_sd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_comineq(a, b);
    }
    else
    {
        // `one` = ordered not-equal: yields false when either is NaN.
        return comsd!(FPComparison.one)(a, b);
    }
}
1156 
/// Convert packed 32-bit integers in `a` to packed double-precision (64-bit)
/// floating-point elements. Only the lower two lanes of `a` are used.
 __m128d _mm_cvtepi32_pd (__m128i a) pure @trusted
{
    version(LDC)
    {
        // Generates cvtdq2pd since LDC 1.0, even without optimizations
        // Narrow to the low 2 lanes, then sign-extend-convert to double.
        enum ir = `
            %v = shufflevector <4 x i32> %0,<4 x i32> %0, <2 x i32> <i32 0, i32 1>
            %r = sitofp <2 x i32> %v to <2 x double>
            ret <2 x double> %r`;
        return cast(__m128d) LDCInlineIR!(ir, __m128d, __m128i)(a);
    }
    else static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cvtdq2pd(a);
    }
    else
    {
        // Scalar fallback: int -> double is always exact, no rounding concerns.
        double2 r = void;
        r.ptr[0] = a.array[0];
        r.ptr[1] = a.array[1];
        return r;
    }
}
unittest
{
    __m128d A = _mm_cvtepi32_pd(_mm_set1_epi32(54));
    assert(A.array[0] == 54.0);
    assert(A.array[1] == 54.0);
}
1188 
/// Convert packed 32-bit integers in `a` to packed single-precision (32-bit) 
/// floating-point elements. Conversion rounds per the current rounding mode.
__m128 _mm_cvtepi32_ps(__m128i a) pure @trusted
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cvtdq2ps(a);
    }
    else
    {
        // x86: Generates cvtdq2ps since LDC 1.0.0 -O1
        // ARM: Generates scvtf.4s since LDC 1.8.0 -O2
        __m128 res;
        res.ptr[0] = cast(float)a.array[0];
        res.ptr[1] = cast(float)a.array[1];
        res.ptr[2] = cast(float)a.array[2];
        res.ptr[3] = cast(float)a.array[3];
        return res;
    }
}
unittest
{
    __m128 a = _mm_cvtepi32_ps(_mm_setr_epi32(-1, 0, 1, 1000));
    assert(a.array == [-1.0f, 0.0f, 1.0f, 1000.0f]);
}
1214 
/// Convert packed double-precision (64-bit) floating-point elements 
/// in `a` to packed 32-bit integers, rounding per the current MXCSR
/// rounding mode. The upper two lanes of the result are zero (CVTPD2DQ).
// PERF #ARM
__m128i _mm_cvtpd_epi32 (__m128d a) @trusted
{
    // LDC and GDC expose the same builtin with the same signature, so the
    // two previously duplicated branches are folded into one.
    static if (LDC_with_SSE2 || GDC_with_SSE2)
    {
        // Like in clang, implemented with a magic intrinsic right now.
        return __builtin_ia32_cvtpd2dq(a);
    }
    else
    {
        // Portable fallback honouring the MXCSR rounding mode.
        __m128i r = _mm_setzero_si128();
        r.ptr[0] = convertDoubleToInt32UsingMXCSR(a.array[0]);
        r.ptr[1] = convertDoubleToInt32UsingMXCSR(a.array[1]);
        return r;
    }
}
unittest
{
    int4 A = _mm_cvtpd_epi32(_mm_set_pd(61.0, 55.0));
    assert(A.array[0] == 55 && A.array[1] == 61 && A.array[2] == 0 && A.array[3] == 0);
}
1242 
/// Convert packed double-precision (64-bit) floating-point elements in `v`
/// to packed 32-bit integers, keeping only the lower 64 bits of the result.
__m64 _mm_cvtpd_pi32 (__m128d v) @safe
{
    // Reuse the 128-bit conversion, then truncate to an MMX register.
    __m128i wide = _mm_cvtpd_epi32(v);
    return to_m64(wide);
}
unittest
{
    int2 A = cast(int2) _mm_cvtpd_pi32(_mm_set_pd(61.0, 55.0));
    assert(A.array[0] == 55 && A.array[1] == 61);
}
1254 
/// Convert packed double-precision (64-bit) floating-point elements 
/// in `a` to packed single-precision (32-bit) floating-point elements.
/// The upper two lanes of the result are zero (CVTPD2PS).
__m128 _mm_cvtpd_ps (__m128d a) pure @trusted
{
    // LDC and GDC expose the same builtin with the same signature, so the
    // two previously duplicated branches are folded into one.
    static if (LDC_with_SSE2 || GDC_with_SSE2)
    {
        return __builtin_ia32_cvtpd2ps(a); // can't be done with IR unfortunately
    }
    else
    { 
        // Scalar fallback: double -> float narrowing per current rounding mode.
        __m128 r = void;
        r.ptr[0] = a.array[0];
        r.ptr[1] = a.array[1];
        r.ptr[2] = 0;
        r.ptr[3] = 0;
        return r;
    }
}
unittest
{
    __m128d A = _mm_set_pd(5.25, 4.0);
    __m128 B = _mm_cvtpd_ps(A);
    assert(B.array == [4.0f, 5.25f, 0, 0]);
}
1283 
/// Convert packed 32-bit integers in `v` to packed double-precision 
/// (64-bit) floating-point elements.
__m128d _mm_cvtpi32_pd (__m64 v) pure @safe
{
    // Widen the MMX value to 128 bits, then reuse the SSE2 conversion.
    __m128i widened = to_m128i(v);
    return _mm_cvtepi32_pd(widened);
}
unittest
{
    __m128d A = _mm_cvtpi32_pd(_mm_setr_pi32(4, -5));
    assert(A.array[0] == 4.0 && A.array[1] == -5.0);
}
1295 
/// Convert packed single-precision (32-bit) floating-point elements 
/// in `a` to packed 32-bit integers, rounding per the current rounding
/// mode (MXCSR on x86, FPCR on AArch64).
__m128i _mm_cvtps_epi32 (__m128 a) @trusted
{
    static if (LDC_with_SSE2)
    {
        // Disabled, since it fail with optimizations unfortunately
        //alias _mm_cvtps_epi32 = __builtin_ia32_cvtps2dq;
        // Inline assembly keeps the exact CVTPS2DQ rounding semantics.
        return __asm!__m128i("cvtps2dq $1,$0","=x,x",a);
    }
    else static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cvtps2dq(a);
    }
    else static if (LDC_with_ARM64)
    {
        // Get current rounding mode.
        // AArch64 has one conversion instruction per rounding mode,
        // so dispatch on FPCR instead of relying on a single opcode.
        uint fpscr = arm_get_fpcr();
        switch(fpscr & _MM_ROUND_MASK_ARM)
        {
            default:
            case _MM_ROUND_NEAREST_ARM:     return vcvtnq_s32_f32(a);
            case _MM_ROUND_DOWN_ARM:        return vcvtmq_s32_f32(a);
            case _MM_ROUND_UP_ARM:          return vcvtpq_s32_f32(a);
            case _MM_ROUND_TOWARD_ZERO_ARM: return vcvtzq_s32_f32(a);
        }
    }
    else
    {
        // Portable fallback honouring the MXCSR rounding mode per lane.
        __m128i r = void;
        r.ptr[0] = convertFloatToInt32UsingMXCSR(a.array[0]);
        r.ptr[1] = convertFloatToInt32UsingMXCSR(a.array[1]);
        r.ptr[2] = convertFloatToInt32UsingMXCSR(a.array[2]);
        r.ptr[3] = convertFloatToInt32UsingMXCSR(a.array[3]);
        return r;
    }
}
unittest
{
    uint savedRounding = _MM_GET_ROUNDING_MODE();

    _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST);
    __m128i A = _mm_cvtps_epi32(_mm_setr_ps(1.4f, -2.1f, 53.5f, -2.9f));
    assert(A.array == [1, -2, 54, -3]);

    _MM_SET_ROUNDING_MODE(_MM_ROUND_DOWN);
    A = _mm_cvtps_epi32(_mm_setr_ps(1.4f, -2.1f, 53.5f, -2.9f));
    assert(A.array == [1, -3, 53, -3]);

    _MM_SET_ROUNDING_MODE(_MM_ROUND_UP);
    A = _mm_cvtps_epi32(_mm_setr_ps(1.4f, -2.1f, 53.5f, -2.9f));
    assert(A.array == [2, -2, 54, -2]);

    _MM_SET_ROUNDING_MODE(_MM_ROUND_TOWARD_ZERO);
    A = _mm_cvtps_epi32(_mm_setr_ps(1.4f, -2.1f, 53.5f, -2.9f));
    assert(A.array == [1, -2, 53, -2]);

    _MM_SET_ROUNDING_MODE(savedRounding);
}
1355 
/// Convert packed single-precision (32-bit) floating-point elements 
/// in `a` to packed double-precision (64-bit) floating-point elements.
/// Only the lower two lanes of `a` are used; widening is always exact.
__m128d _mm_cvtps_pd (__m128 a) pure @trusted
{
    version(LDC)
    {
        // Generates cvtps2pd since LDC 1.0 -O0
        // Narrow to the low 2 lanes, then widen float -> double.
        enum ir = `
            %v = shufflevector <4 x float> %0,<4 x float> %0, <2 x i32> <i32 0, i32 1>
            %r = fpext <2 x float> %v to <2 x double>
            ret <2 x double> %r`;
        return cast(__m128d) LDCInlineIR!(ir, __m128d, __m128)(a);
    }
    else static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cvtps2pd(a);
    }
    else
    {
        // Scalar fallback: float -> double is lossless.
        double2 r = void;
        r.ptr[0] = a.array[0];
        r.ptr[1] = a.array[1];
        return r;
    }
}
unittest
{
    __m128d A = _mm_cvtps_pd(_mm_set1_ps(54.0f));
    assert(A.array[0] == 54.0);
    assert(A.array[1] == 54.0);
}
1387 
/// Copy the lower double-precision (64-bit) floating-point element of `a`.
double _mm_cvtsd_f64 (__m128d a) pure @safe
{
    // The scalar result is simply the lowest lane.
    double lower = a.array[0];
    return lower;
}
1393 
/// Convert the lower double-precision (64-bit) floating-point element
/// in `a` to a 32-bit integer, rounding per the current MXCSR rounding
/// mode (CVTSD2SI).
int _mm_cvtsd_si32 (__m128d a) @safe
{
    // LDC and GDC expose the same builtin with the same signature, so the
    // two previously duplicated branches are folded into one.
    static if (LDC_with_SSE2 || GDC_with_SSE2)
    {
        return __builtin_ia32_cvtsd2si(a);
    }
    else
    {
        // Portable fallback honouring the MXCSR rounding mode.
        return convertDoubleToInt32UsingMXCSR(a[0]);
    }
}
unittest
{
    assert(4 == _mm_cvtsd_si32(_mm_set1_pd(4.0)));
}
1415 
/// Convert the lower double-precision (64-bit) floating-point element in `a`
/// to a 64-bit integer, rounding per the current MXCSR rounding mode.
version(LDC)
{
    // Unfortunately this builtin crashes in 32-bit
    version(X86_64)
        alias _mm_cvtsd_si64 = __builtin_ia32_cvtsd2si64;
    else
    {
        // 32-bit LDC: emulate via MXCSR-aware software conversion.
        long _mm_cvtsd_si64 (__m128d a) @safe
        {
            return convertDoubleToInt64UsingMXCSR(a[0]);
        }
    }
}
else
{
    long _mm_cvtsd_si64 (__m128d a) @safe
    {
        return convertDoubleToInt64UsingMXCSR(a.array[0]);
    }
}
unittest
{
    assert(-4 == _mm_cvtsd_si64(_mm_set1_pd(-4.0)));

    uint savedRounding = _MM_GET_ROUNDING_MODE();

    _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST);
    assert(-56468486186 == _mm_cvtsd_si64(_mm_set1_pd(-56468486186.49)));

    _MM_SET_ROUNDING_MODE(_MM_ROUND_DOWN);
    assert(-56468486187 == _mm_cvtsd_si64(_mm_set1_pd(-56468486186.1)));

    _MM_SET_ROUNDING_MODE(_MM_ROUND_UP);
    assert(56468486187 == _mm_cvtsd_si64(_mm_set1_pd(56468486186.1)));

    _MM_SET_ROUNDING_MODE(_MM_ROUND_TOWARD_ZERO);
    assert(-56468486186 == _mm_cvtsd_si64(_mm_set1_pd(-56468486186.9)));

    _MM_SET_ROUNDING_MODE(savedRounding);
}

alias _mm_cvtsd_si64x = _mm_cvtsd_si64;
1458 
/// Convert the lower double-precision (64-bit) floating-point element in `b`
/// to a single-precision (32-bit) floating-point element, store it in the
/// lower element of the result, and copy the upper 3 elements from `a`.
__m128 _mm_cvtsd_ss (__m128 a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cvtsd2ss(a, b); 
    }
    else
    {
        // Generates cvtsd2ss since LDC 1.3 -O0
        // Implicit double -> float narrowing on assignment.
        a[0] = b[0];
        return a;
    }
}
unittest
{
    __m128 R = _mm_cvtsd_ss(_mm_set1_ps(4.0f), _mm_set1_pd(3.0));
    assert(R.array == [3.0f, 4.0f, 4.0f, 4.0f]);
}
1477 
/// Copy the lowest 32-bit integer of `a`.
int _mm_cvtsi128_si32 (__m128i a) pure @safe
{
    int lowest = a.array[0];
    return lowest;
}
1482 
/// Copy the lowest 64-bit integer of `a`.
long _mm_cvtsi128_si64 (__m128i a) pure @safe
{
    // Reinterpret the 4x int vector as 2x long and take the low lane.
    return (cast(long2) a).array[0];
}
alias _mm_cvtsi128_si64x = _mm_cvtsi128_si64;
1489 
/// Convert the 32-bit integer `x` to a double, store it in the lower
/// element of the result, and copy the upper element from `v`.
__m128d _mm_cvtsi32_sd(__m128d v, int x) pure @trusted
{
    __m128d r = v;
    r.ptr[0] = cast(double)x; // int -> double is always exact
    return r;
}
unittest
{
    __m128d a = _mm_cvtsi32_sd(_mm_set1_pd(0.0), 42);
    assert(a.array == [42.0, 0]);
}
1500 
/// Copy 32-bit integer `a` to the lower element of the result,
/// and zero the upper elements.
__m128i _mm_cvtsi32_si128 (int a) pure @trusted
{
    int4 result = [0, 0, 0, 0];
    result.ptr[0] = a;
    return result;
}
unittest
{
    __m128i v = _mm_cvtsi32_si128(65);
    assert(v.array == [65, 0, 0, 0]);
}
1512 
1513 
// Note: on macOS, using "llvm.x86.sse2.cvtsi642sd" was buggy
/// Convert the 64-bit integer `x` to a double, store it in the lower
/// element of the result, and copy the upper element from `v`.
__m128d _mm_cvtsi64_sd(__m128d v, long x) pure @trusted
{
    __m128d r = v;
    r.ptr[0] = cast(double)x; // may round: not every long is exactly representable
    return r;
}
unittest
{
    __m128d a = _mm_cvtsi64_sd(_mm_set1_pd(0.0), 42);
    assert(a.array == [42.0, 0]);
}
1525 
/// Copy 64-bit integer `a` to the lower element of the result,
/// and zero the upper element.
__m128i _mm_cvtsi64_si128 (long a) pure @trusted
{
    long2 result = [0, 0];
    result.ptr[0] = a;
    return cast(__m128i) result;
}

alias _mm_cvtsi64x_sd = _mm_cvtsi64_sd;
alias _mm_cvtsi64x_si128 = _mm_cvtsi64_si128;
1535 
/// Convert the lower single-precision (32-bit) floating-point element of `x`
/// to a double, store it in the lower element of the result, and copy the
/// upper element from `v`.
double2 _mm_cvtss_sd(double2 v, float4 x) pure @trusted
{
    double2 r = v;
    r.ptr[0] = x.array[0]; // float widens to double losslessly
    return r;
}
unittest
{
    __m128d a = _mm_cvtss_sd(_mm_set1_pd(0.0), _mm_set1_ps(42.0f));
    assert(a.array == [42.0, 0]);
}
1546 
/// Convert the lower single-precision (32-bit) floating-point element of `a`
/// to a 64-bit integer with truncation toward zero.
long _mm_cvttss_si64 (__m128 a) pure @safe
{
    float lower = a.array[0];
    return cast(long) lower; // Generates cvttss2si as expected
}
unittest
{
    assert(1 == _mm_cvttss_si64(_mm_setr_ps(1.9f, 2.0f, 3.0f, 4.0f)));
}
1555 
/// Convert packed double-precision (64-bit) floating-point elements in `a`
/// to packed 32-bit integers with truncation toward zero (CVTTPD2DQ).
/// The upper two lanes of the result are zero.
static if (LDC_with_SSE2)
{
    alias _mm_cvttpd_epi32 = __builtin_ia32_cvttpd2dq;
}
else static if (GDC_with_SSE2)
{
    alias _mm_cvttpd_epi32 = __builtin_ia32_cvttpd2dq;
}
else
{
    __m128i _mm_cvttpd_epi32 (__m128d a) pure @safe
    {
        // Note: doesn't generate cvttpd2dq as of LDC 1.13
        // cast(int) truncates toward zero, matching the instruction.
        __m128i r;
        r.array[0] = cast(int)a.array[0];
        r.array[1] = cast(int)a.array[1];
        r.array[2] = 0;
        r.array[3] = 0;
        return r;
    }
}
unittest
{
    __m128i R = _mm_cvttpd_epi32(_mm_setr_pd(-4.9, 45641.5f));
    assert(R.array == [-4, 45641, 0, 0]);
}
1582 
1583 
/// Convert packed double-precision (64-bit) floating-point elements in `v` 
/// to packed 32-bit integers with truncation toward zero.
__m64 _mm_cvttpd_pi32 (__m128d v) pure @safe
{
    // Truncating 128-bit conversion, then keep the low 64 bits.
    __m128i truncated = _mm_cvttpd_epi32(v);
    return to_m64(truncated);
}
unittest
{
    int2 R = cast(int2) _mm_cvttpd_pi32(_mm_setr_pd(-4.9, 45641.7f));
    int[2] correct = [-4, 45641];
    assert(R.array == correct);
}
1596 
/// Convert packed single-precision (32-bit) floating-point elements in `a`
/// to packed 32-bit integers with truncation toward zero.
__m128i _mm_cvttps_epi32 (__m128 a) pure @trusted
{
    // x86: Generates cvttps2dq since LDC 1.3 -O2
    // ARM64: generates fcvtze since LDC 1.8 -O2
    __m128i result;
    foreach(lane; 0..4)
        result.ptr[lane] = cast(int) a.array[lane];
    return result;
}
unittest
{
    __m128i R = _mm_cvttps_epi32(_mm_setr_ps(-4.9, 45641.5f, 0.0f, 1.0f));
    assert(R.array == [-4, 45641, 0, 1]);
}
1613 
/// Convert the lower double-precision (64-bit) floating-point element of `a`
/// to a 32-bit integer with truncation toward zero.
int _mm_cvttsd_si32 (__m128d a)
{
    // Generates cvttsd2si since LDC 1.3 -O0
    double lower = a.array[0];
    return cast(int) lower;
}
1619 
/// Convert the lower double-precision (64-bit) floating-point element of `a`
/// to a 64-bit integer with truncation toward zero.
long _mm_cvttsd_si64 (__m128d a)
{
    // Generates cvttsd2si since LDC 1.3 -O0
    // but in 32-bit instead, it's a long sequence that resort to FPU
    double lower = a.array[0];
    return cast(long) lower;
}

alias _mm_cvttsd_si64x = _mm_cvttsd_si64;
1628 
/// Divide packed double-precision (64-bit) floating-point elements
/// in `a` by those in `b`, element-wise.
__m128d _mm_div_pd(__m128d a, __m128d b) pure @safe
{
    __m128d quotient = a;
    quotient /= b;
    return quotient;
}
1633 
/// Divide the lower double-precision (64-bit) floating-point element of `a`
/// by the lower element of `b`, store the result in the lower element, and
/// copy the upper element from `a`.
static if (GDC_with_SSE2)
{
    __m128d _mm_div_sd(__m128d a, __m128d b) pure @trusted
    {
        return __builtin_ia32_divsd(a, b);
    }
}
else version(DigitalMars)
{
    // Work-around for https://issues.dlang.org/show_bug.cgi?id=19599
    __m128d _mm_div_sd(__m128d a, __m128d b) pure @safe
    {
        // The empty inline-asm statement is the workaround for the DMD
        // codegen bug linked above; do not remove it.
        asm pure nothrow @nogc @trusted { nop;}
        a.array[0] = a.array[0] / b.array[0];
        return a;
    }
}
else
{
    __m128d _mm_div_sd(__m128d a, __m128d b) pure @safe
    {
        a.array[0] /= b.array[0];
        return a;
    }
}
unittest
{
    __m128d a = [2.0, 4.5];
    a = _mm_div_sd(a, a);
    assert(a.array == [1.0, 4.5]);
}
1665 
/// Extract a 16-bit integer from `v`, selected with `index`, zero-extended
/// to 32 bits. `index` is taken modulo 8, matching PEXTRW's use of the
/// immediate's low 3 bits, and making the behaviour consistent with
/// `_mm_insert_epi16` (which already masks). This also avoids the bounds
/// check failure on out-of-range indices in the generic path.
// PERF: ARM version has array bound check
int _mm_extract_epi16(__m128i v, int index) pure @safe
{
    short8 r = cast(short8)v;
    return cast(ushort)(r.array[index & 7]);
}
unittest
{
    __m128i A = _mm_set_epi16(7, 6, 5, 4, 3, 2, 1, -1);
    assert(_mm_extract_epi16(A, 6) == 6);
    assert(_mm_extract_epi16(A, 0) == 65535);
}
1679 
/// Copy `v`, and insert the 16-bit integer `i` at the location specified by `index`.
/// Only the low 3 bits of `index` are used, like PINSRW's immediate.
__m128i _mm_insert_epi16 (__m128i v, int i, int index) @trusted
{
    short8 lanes = cast(short8) v;
    lanes.ptr[index & 7] = cast(short) i;
    return cast(__m128i) lanes;
}
unittest
{
    __m128i A = _mm_set_epi16(7, 6, 5, 4, 3, 2, 1, 0);
    short8 R = cast(short8) _mm_insert_epi16(A, 42, 6);
    short[8] correct = [0, 1, 2, 3, 4, 5, 42, 7];
    assert(R.array == correct);
}
1694 
/// Perform a serializing operation on all load-from-memory instructions
/// issued prior to this one (LFENCE).
version(GNU)
{
    void _mm_lfence() pure @trusted
    {
        static if (GDC_with_SSE2)
        {
            __builtin_ia32_lfence();
        }
        else version(X86)
        {
            // GDC without SSE2 support: emit the opcode directly.
            asm pure nothrow @nogc @trusted
            {
                "lfence;\n" : : : ;
            }
        }
        else
            static assert(false);
    }
}
else static if (LDC_with_SSE2)
{
    alias _mm_lfence = __builtin_ia32_lfence;
}
else static if (DMD_with_asm)
{
    void _mm_lfence() pure @safe
    {
        asm nothrow @nogc pure @safe
        {
            lfence;
        }
    }
}
else version(LDC)
{
    void _mm_lfence() pure @safe
    {
        // Stronger than required, but portable across LDC targets.
        llvm_memory_fence(); // Note: actually generates mfence
    }
}
else
    static assert(false);
unittest
{
    _mm_lfence();
}
1741 
1742 
/// Load 128 bits (2 doubles) from memory. `mem_addr` must be 16-byte aligned.
__m128d _mm_load_pd (const(double) * mem_addr) pure
{
    return *cast(__m128d*) mem_addr;
}
1748 
/// Load a double from memory and broadcast it to both elements of the result.
/// `mem_addr` needs not be aligned.
__m128d _mm_load_pd1 (const(double)* mem_addr) pure
{
    double d = *mem_addr;
    double[2] pair = [d, d];
    return loadUnaligned!(double2)(&pair[0]);
}
1754 
/// Load a double from memory into the lower element of the result,
/// and zero the upper element.
__m128d _mm_load_sd (const(double)* mem_addr) pure @trusted
{
    double2 result = [0.0, 0.0];
    result.ptr[0] = *mem_addr;
    return result;
}
unittest
{
    double x = -42;
    __m128d a = _mm_load_sd(&x);
    assert(a.array == [-42.0, 0.0]);
}
1767 
/// Load 128 bits of integer data from memory. `mem_addr` must be 16-byte aligned.
__m128i _mm_load_si128 (const(__m128i)* mem_addr) pure @trusted
{
    __m128i loaded = *mem_addr;
    return loaded;
}

alias _mm_load1_pd = _mm_load_pd1;
1774 
/// Load a double from memory into the upper element of the result,
/// and copy the lower element from `a`.
__m128d _mm_loadh_pd (__m128d a, const(double)* mem_addr) pure @trusted
{
    __m128d result = a;
    result.ptr[1] = *mem_addr;
    return result;
}
1780 
/// Load a 64-bit integer from memory into the lower element of the result,
/// and zero the upper element.
// Note: strange signature since the memory doesn't have to be aligned.
__m128i _mm_loadl_epi64 (const(__m128i)* mem_addr) pure @trusted
{
    long2 result = [0, 0];
    result.ptr[0] = *cast(const(long)*) mem_addr;
    return cast(__m128i) result;
}
1789 
/// Load a double from memory into the lower element of the result,
/// and copy the upper element from `a`.
__m128d _mm_loadl_pd (__m128d a, const(double)* mem_addr) pure @trusted
{
    __m128d result = a;
    result.ptr[0] = *mem_addr;
    return result;
}
1795 
/// Load 2 doubles from memory in reverse order. `mem_addr` must be
/// 16-byte aligned.
__m128d _mm_loadr_pd2 (const(double)* mem_addr) pure @trusted
{
    __m128d loaded = *cast(__m128d*)(mem_addr);
    __m128d swapped;
    swapped.ptr[0] = loaded.array[1];
    swapped.ptr[1] = loaded.array[0];
    return swapped;
}
1804 
/// Load 128 bits (2 doubles) from memory. `mem_addr` needs not be aligned.
__m128d _mm_loadu_pd (const(double)* mem_addr) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_loadupd(mem_addr); 
    }
    else
    {
        return loadUnaligned!(double2)(mem_addr);
    }
}
1816 
/// Load 128 bits of integer data from memory. `mem_addr` needs not be aligned.
__m128i _mm_loadu_si128 (const(__m128i)* mem_addr) pure @trusted
{
    static if (GDC_with_SSE2)
    {
        // GDC's builtin takes a char pointer.
        return __builtin_ia32_loaddqu(cast(const(char*))mem_addr);
    }
    else
    {
        return loadUnaligned!(__m128i)(cast(int*)mem_addr);
    }
}
1828 
/// Load a 32-bit integer from (possibly unaligned) memory into the lowest
/// element of the result, and zero the upper elements.
__m128i _mm_loadu_si32 (const(void)* mem_addr) pure @trusted
{
    int4 result = [0, 0, 0, 0];
    result.ptr[0] = *cast(const(int)*)(mem_addr);
    return result;
}
unittest
{
    int r = 42;
    __m128i A = _mm_loadu_si32(&r);
    int[4] correct = [42, 0, 0, 0];
    assert(A.array == correct);
}
1843 
static if (GDC_with_SSE2)
{
    /// Multiply packed signed 16-bit integers in `a` and `b`, producing intermediate
    /// signed 32-bit integers. Horizontally add adjacent pairs of intermediate 32-bit integers,
    /// and pack the results in destination.
    __m128i _mm_madd_epi16 (__m128i a, __m128i b) pure @safe
    {
        return cast(__m128i) __builtin_ia32_pmaddwd128(cast(short8)a, cast(short8)b);
    }
}
else static if (LDC_with_SSE2)
{
    /// Multiply packed signed 16-bit integers in `a` and `b`, producing intermediate
    /// signed 32-bit integers. Horizontally add adjacent pairs of intermediate 32-bit integers,
    /// and pack the results in destination.
    __m128i _mm_madd_epi16 (__m128i a, __m128i b) pure @safe
    {
        return cast(__m128i) __builtin_ia32_pmaddwd128(cast(short8)a, cast(short8)b);
    }
}
else static if (LDC_with_ARM64)
{
    /// Multiply packed signed 16-bit integers in `a` and `b`, producing intermediate
    /// signed 32-bit integers. Horizontally add adjacent pairs of intermediate 32-bit integers,
    /// and pack the results in destination.
    __m128i _mm_madd_epi16 (__m128i a, __m128i b) pure @safe
    {
        // Widening multiplies on low/high halves, then pairwise adds.
        int4 pl = vmull_s16(vget_low_s16(cast(short8)a), vget_low_s16(cast(short8)b));
        int4 ph = vmull_s16(vget_high_s16(cast(short8)a), vget_high_s16(cast(short8)b));
        int2 rl = vpadd_s32(vget_low_s32(pl), vget_high_s32(pl));
        int2 rh = vpadd_s32(vget_low_s32(ph), vget_high_s32(ph));
        return vcombine_s32(rl, rh);
    }
}
else
{
    /// Multiply packed signed 16-bit integers in `a` and `b`, producing intermediate
    /// signed 32-bit integers. Horizontally add adjacent pairs of intermediate 32-bit integers,
    /// and pack the results in destination.
    __m128i _mm_madd_epi16 (__m128i a, __m128i b) pure @safe
    {
        short8 sa = cast(short8)a;
        short8 sb = cast(short8)b;

        // 32-bit accumulation; the only possible wrap (both pairs being
        // -32768*-32768) matches PMADDWD's documented result.
        int4 r;
        foreach(i; 0..4)
        {
            r.array[i] = sa.array[2*i] * sb.array[2*i] + sa.array[2*i+1] * sb.array[2*i+1];
        }
        return r;
    }
}
unittest
{
    short8 A = [0, 1, 2, 3, -32768, -32768, 32767, 32767];
    short8 B = [0, 1, 2, 3, -32768, -32768, 32767, 32767];
    int4 R = _mm_madd_epi16(cast(__m128i)A, cast(__m128i)B);
    int[4] correct = [1, 13, -2147483648, 2*32767*32767];
    assert(R.array == correct);
}
1904 
1905 
static if (GDC_with_SSE2)
{
    /// Conditionally store 8-bit integer elements from `a` into memory using `mask`
    /// (elements are not stored when the highest bit is not set in the corresponding element)
    /// and a non-temporal memory hint. `mem_addr` does not need to be aligned on any particular
    /// boundary.
    void _mm_maskmoveu_si128 (__m128i a, __m128i mask, void* mem_addr) @trusted
    {
        // GDC's builtin expects unsigned byte vectors.
        return __builtin_ia32_maskmovdqu(cast(ubyte16)a, cast(ubyte16)mask, cast(char*)mem_addr);
    }
}
else static if (LDC_with_SSE2)
{
    /// Conditionally store 8-bit integer elements from `a` into memory using `mask`
    /// (elements are not stored when the highest bit is not set in the corresponding element)
    /// and a non-temporal memory hint. `mem_addr` does not need to be aligned on any particular
    /// boundary.
    void _mm_maskmoveu_si128 (__m128i a, __m128i mask, void* mem_addr) @trusted
    {
        // LDC's builtin expects signed byte vectors.
        return __builtin_ia32_maskmovdqu(cast(byte16)a, cast(byte16)mask, cast(char*)mem_addr);
    }
}
else
{
    /// Conditionally store 8-bit integer elements from `a` into memory using `mask`
    /// (elements are not stored when the highest bit is not set in the corresponding element)
    /// and a non-temporal memory hint. `mem_addr` does not need to be aligned on any particular
    /// boundary.
    // PERF: catastrophic on ARM
    void _mm_maskmoveu_si128 (__m128i a, __m128i mask, void* mem_addr) @trusted
    {
        byte16 b = cast(byte16)a;
        byte16 m = cast(byte16)mask;
        byte* dest = cast(byte*)(mem_addr);
        foreach(j; 0..16)
        {
            // Store only the bytes whose mask sign bit is set.
            if (m.array[j] & 128)
            {
                dest[j] = b.array[j];
            }
        }
    }
}
unittest
{
    ubyte[16] dest =           [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42];
    __m128i mask = _mm_setr_epi8(0,-1, 0,-1,-1, 1,-1,-1, 0,-1,-4,-1,-1, 0,-127, 0);
    __m128i A    = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15);
    _mm_maskmoveu_si128(A, mask, dest.ptr);
    ubyte[16] correct =        [42, 1,42, 3, 4,42, 6, 7,42, 9,10,11,12,42,14,42];
    assert(dest == correct);
}
1958 
/// Compare packed signed 16-bit integers in `a` and `b`, and return
/// packed maximum values.
__m128i _mm_max_epi16 (__m128i a, __m128i b) pure @safe
{
    // Same remark as with _mm_min_epi16: clang uses mystery intrinsics we don't have.
    // Branchless select: b ^ ((a ^ b) & gtMask) equals a where gtMask is
    // all-ones (a > b) and b where it is zero.
    __m128i gtMask = _mm_cmpgt_epi16(a, b);
    __m128i diff   = _mm_xor_si128(a, b);
    return _mm_xor_si128(b, _mm_and_si128(diff, gtMask));
}
unittest
{
    short8 R = cast(short8) _mm_max_epi16(_mm_setr_epi16(45, 1, -4, -8, 9,  7, 0,-57),
                                          _mm_setr_epi16(-4,-8,  9,  7, 0,-57, 0,  0));
    short[8] correct =                                  [45, 1,  9,  7, 9,  7, 0,  0];
    assert(R.array == correct);
}
1974 
1975 
/// Compare packed unsigned 8-bit integers in `a` and `b`, and return packed maximum values.
__m128i _mm_max_epu8 (__m128i a, __m128i b) pure @safe
{
    // Same remark as with _mm_min_epi16: clang uses mystery intrinsics we don't have
    // SSE2 has no unsigned byte compare: bias both operands by -128 so a signed
    // compare orders them as unsigned, then blend with the XOR-select trick.
    __m128i value128 = _mm_set1_epi8(-128);
    __m128i higher = _mm_cmpgt_epi8(_mm_add_epi8(a, value128), _mm_add_epi8(b, value128)); // signed comparison
    __m128i aTob = _mm_xor_si128(a, b); // a ^ (a ^ b) == b
    __m128i mask = _mm_and_si128(aTob, higher);
    return _mm_xor_si128(b, mask);
}
unittest
{
    byte16 R = cast(byte16) _mm_max_epu8(_mm_setr_epi8(45, 1, -4, -8, 9,  7, 0,-57, -4,-8,  9,  7, 0,-57, 0,  0),
                                         _mm_setr_epi8(-4,-8,  9,  7, 0,-57, 0,  0, 45, 1, -4, -8, 9,  7, 0,-57));
    byte[16] correct =                                [-4,-8, -4, -8, 9,-57, 0,-57, -4,-8, -4, -8, 9,-57, 0,-57];
    assert(R.array == correct);
}
1993 
/// Compare packed double-precision (64-bit) floating-point elements in `a` and `b`,
/// and return packed maximum values.
/// Note: the `(x > y) ? x : y` form returns `b` when either operand is NaN
/// (the comparison is false), mirroring the `maxpd` instruction's second-operand rule.
__m128d _mm_max_pd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_maxpd(a, b);
    }
    else
    {
        // Generates maxpd starting with LDC 1.9
        a[0] = (a[0] > b[0]) ? a[0] : b[0];
        a[1] = (a[1] > b[1]) ? a[1] : b[1];
        return a;
    }
}
unittest
{
    __m128d A = _mm_setr_pd(4.0, 1.0);
    __m128d B = _mm_setr_pd(1.0, 8.0);
    __m128d M = _mm_max_pd(A, B);
    assert(M.array[0] == 4.0);
    assert(M.array[1] == 8.0);
}
2016 
/// Compare the lower double-precision elements in `a` and `b`, store the maximum in the
/// lower element of the result, and copy the upper element from `a`.
__m128d _mm_max_sd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_maxsd(a, b);
    }
    else
    {
         __m128d r = a;
        // Generates maxsd starting with LDC 1.3
        r.array[0] = (a.array[0] > b.array[0]) ? a.array[0] : b.array[0];
        return r;
    }
}
unittest
{
    __m128d A = _mm_setr_pd(1.0, 1.0);
    __m128d B = _mm_setr_pd(4.0, 2.0);
    __m128d M = _mm_max_sd(A, B);
    assert(M.array[0] == 4.0);
    assert(M.array[1] == 1.0);
}
2039 
version(GNU)
{
    /// Perform a serializing operation on all load-from-memory and store-to-memory
    /// instructions issued prior to this call (full memory fence, `mfence`).
    void _mm_mfence() pure @trusted
    {
        static if (GDC_with_SSE2)
        {
            __builtin_ia32_mfence();
        }
        else version(X86)
        {
            asm pure nothrow @nogc @trusted
            {
                "mfence;\n" : : : ;
            }
        }
        else
            static assert(false);
    }
}
else static if (LDC_with_SSE2)
{
    /// Perform a serializing operation on all prior loads and stores (full memory fence).
    alias _mm_mfence = __builtin_ia32_mfence;
}
else static if (DMD_with_asm)
{
    /// Perform a serializing operation on all prior loads and stores (full memory fence).
    void _mm_mfence() pure @safe
    {
        asm nothrow @nogc pure @safe
        {
            mfence;
        }
    }
}
else version(LDC)
{
    /// Perform a serializing operation on all prior loads and stores (full memory fence).
    void _mm_mfence() pure @safe
    {
        // Note: will generate the DMB instruction on ARM
        llvm_memory_fence();
    }
}
else
    static assert(false);
unittest
{
    _mm_mfence();
}
2087 
/// Compare packed signed 16-bit integers in `a` and `b`, and return packed minimum values.
__m128i _mm_min_epi16 (__m128i a, __m128i b) pure @safe
{
    // Note: clang uses a __builtin_ia32_pminsw128 which has disappeared from LDC LLVM (?)
    // Implemented using masks and XOR
    // Branchless select: where `lowerShorts` is all-ones, b ^ (a ^ b) == a, else b.
    __m128i lowerShorts = _mm_cmplt_epi16(a, b); // ones where a should be selected, b else
    __m128i aTob = _mm_xor_si128(a, b); // a ^ (a ^ b) == b
    __m128i mask = _mm_and_si128(aTob, lowerShorts);
    return _mm_xor_si128(b, mask);
}
unittest
{
    short8 R = cast(short8) _mm_min_epi16(_mm_setr_epi16(45, 1, -4, -8, 9,  7, 0,-57),
                                          _mm_setr_epi16(-4,-8,  9,  7, 0,-57, 0,  0));
    short[8] correct =  [-4,-8, -4, -8, 0,-57, 0, -57];
    assert(R.array == correct);
}
2104 
2105 
/// Compare packed unsigned 8-bit integers in `a` and `b`, and return packed minimum values.
__m128i _mm_min_epu8 (__m128i a, __m128i b) pure @safe
{
    // Same remark as with _mm_min_epi16: clang uses mystery intrinsics we don't have
    // SSE2 has no unsigned byte compare: bias both operands by -128 so a signed
    // compare orders them as unsigned, then blend with the XOR-select trick.
    __m128i value128 = _mm_set1_epi8(-128);
    __m128i lower = _mm_cmplt_epi8(_mm_add_epi8(a, value128), _mm_add_epi8(b, value128)); // signed comparison
    __m128i aTob = _mm_xor_si128(a, b); // a ^ (a ^ b) == b
    __m128i mask = _mm_and_si128(aTob, lower);
    return _mm_xor_si128(b, mask);
}
unittest
{
    byte16 R = cast(byte16) _mm_min_epu8(_mm_setr_epi8(45, 1, -4, -8, 9,  7, 0,-57, -4,-8,  9,  7, 0,-57, 0,  0),
                                         _mm_setr_epi8(-4,-8,  9,  7, 0,-57, 0,  0, 45, 1, -4, -8, 9,  7, 0,-57));
    byte[16] correct =                                [45, 1,  9,  7, 0,  7, 0,  0, 45, 1,  9,  7, 0,  7, 0,  0];
    assert(R.array == correct);
}
2122 
/// Compare packed double-precision (64-bit) floating-point elements in `a` and `b`,
/// and return packed minimum values.
/// Note: the `(x < y) ? x : y` form returns `b` when either operand is NaN
/// (the comparison is false), mirroring the `minpd` instruction's second-operand rule.
__m128d _mm_min_pd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_minpd(a, b);
    }
    else
    {
        // Generates minpd starting with LDC 1.9
        a.array[0] = (a.array[0] < b.array[0]) ? a.array[0] : b.array[0];
        a.array[1] = (a.array[1] < b.array[1]) ? a.array[1] : b.array[1];
        return a;
    }
}
unittest
{
    __m128d A = _mm_setr_pd(1.0, 2.0);
    __m128d B = _mm_setr_pd(4.0, 1.0);
    __m128d M = _mm_min_pd(A, B);
    assert(M.array[0] == 1.0);
    assert(M.array[1] == 1.0);
}
2145 
/// Compare the lower double-precision elements in `a` and `b`, store the minimum in the
/// lower element of the result, and copy the upper element from `a`.
__m128d _mm_min_sd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_minsd(a, b);
    }
    else
    {
        // Generates minsd starting with LDC 1.3
        __m128d r = a;
        r.array[0] = (a.array[0] < b.array[0]) ? a.array[0] : b.array[0];
        return r;
    }
}
unittest
{
    __m128d A = _mm_setr_pd(1.0, 3.0);
    __m128d B = _mm_setr_pd(4.0, 2.0);
    __m128d M = _mm_min_sd(A, B);
    assert(M.array[0] == 1.0);
    assert(M.array[1] == 3.0);
}
2168 
/// Copy the lower 64-bit integer of `a` to the lower element of the result,
/// and zero the upper element.
__m128i _mm_move_epi64 (__m128i a) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_movq128(a);
    }
    else
    {
        long2 result = [ 0, 0 ];
        long2 la = cast(long2) a;
        result.array[0] = la.array[0];
        return cast(__m128i)(result);
    }
}
unittest
{
    long2 A = [13, 47];
    long2 B = cast(long2) _mm_move_epi64( cast(__m128i)A );
    long[2] correct = [13, 0];
    assert(B.array == correct);
}
2190 
/// Move the lower double-precision element of `b` to the lower element of the result,
/// and copy the upper element from `a`.
__m128d _mm_move_sd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_movsd(a, b);
    }
    else
    {
        b.array[1] = a.array[1];
        return b;
    }
}
unittest
{
    double2 A = [13.0, 47.0];
    double2 B = [34.0, 58.0];
    double2 C = _mm_move_sd(A, B);
    double[2] correct = [34.0, 47.0];
    assert(C.array == correct);
}
2211 
static if (GDC_with_SSE2)
{
    /// Create mask from the most significant bit of each 8-bit element in `v`.
    alias _mm_movemask_epi8 = __builtin_ia32_pmovmskb128;
}
else static if (LDC_with_SSE2)
{
    /// Create mask from the most significant bit of each 8-bit element in `v`.
    int _mm_movemask_epi8(__m128i v) pure @safe
    {
        return __builtin_ia32_pmovmskb128(cast(byte16)v);
    }
}
else static if (LDC_with_ARM64)
{
    // Solution from https://stackoverflow.com/questions/11870910/sse-mm-movemask-epi8-equivalent-method-for-arm-neon
    // The other two solutions lead to unfound intrinsics in LLVM and that took a long time.
    // SO there might be something a bit faster, but this one is reasonable and branchless.

    /// Create mask from the most significant bit of each 8-bit element in `v`.
    int _mm_movemask_epi8 (__m128i a) pure @trusted
    {
        // Per-lane shift amounts: after `>> (7 - i)` each lane i holds its former
        // sign bit at bit position i.
        byte8 mask_shift;
        mask_shift.ptr[0] = 7;
        mask_shift.ptr[1] = 6;
        mask_shift.ptr[2] = 5;
        mask_shift.ptr[3] = 4;
        mask_shift.ptr[4] = 3;
        mask_shift.ptr[5] = 2;
        mask_shift.ptr[6] = 1;
        mask_shift.ptr[7] = 0;
        byte8 mask_and = byte8(-128); // keep only the MSB of each byte
        byte8 lo = vget_low_u8(cast(byte16)a);
        byte8 hi = vget_high_u8(cast(byte16)a);
        lo = vand_u8(lo, mask_and);
        lo = vshr_u8(lo, mask_shift);
        hi = vand_u8(hi, mask_and);
        hi = vshr_u8(hi, mask_shift);
        // The shifted bits occupy distinct positions, so three pairwise adds
        // combine the 8 lanes into one byte without carries (addition == OR here).
        lo = vpadd_u8(lo,lo);
        lo = vpadd_u8(lo,lo);
        lo = vpadd_u8(lo,lo);
        hi = vpadd_u8(hi,hi);
        hi = vpadd_u8(hi,hi);
        hi = vpadd_u8(hi,hi);
        return (cast(ubyte)(hi[0]) << 8) | cast(ubyte)(lo[0]);
    }
}
else
{
    /// Create mask from the most significant bit of each 8-bit element in `v`.
    int _mm_movemask_epi8(__m128i v) pure @safe
    {
        byte16 ai = cast(byte16)v;
        int r = 0;
        foreach(bit; 0..16)
        {
            // A byte is negative exactly when its MSB is set.
            if (ai.array[bit] < 0) r += (1 << bit);
        }
        return r;
    }
}
unittest
{
    assert(0x9C36 == _mm_movemask_epi8(_mm_set_epi8(-1, 1, 2, -3, -1, -1, 4, 8, 127, 0, -1, -1, 0, -1, -1, 0)));
}
2277 
static if (GDC_with_SSE2)
{
    /// Set each bit of mask `dst` based on the most significant bit of the corresponding
    /// packed double-precision (64-bit) floating-point element in `v`.
    alias _mm_movemask_pd = __builtin_ia32_movmskpd;
}
else static if (LDC_with_SSE2)
{
    /// Set each bit of mask `dst` based on the most significant bit of the corresponding
    /// packed double-precision (64-bit) floating-point element in `v`.
    alias _mm_movemask_pd = __builtin_ia32_movmskpd;
}
else
{
    /// Set each bit of mask `dst` based on the most significant bit of the corresponding
    /// packed double-precision (64-bit) floating-point element in `v`.
    int _mm_movemask_pd(__m128d v) pure @safe
    {
        // Reinterpreted as signed 64-bit integers, an element is negative
        // exactly when its MSB (the double's sign bit) is set.
        long2 lv = cast(long2)v;
        int r = 0;
        if (lv.array[0] < 0) r += 1;
        if (lv.array[1] < 0) r += 2;
        return r;
    }
}
unittest
{
    __m128d A = cast(__m128d) _mm_set_epi64x(-1, 0);
    assert(_mm_movemask_pd(A) == 2);
}
2308 
/// Copy the lower 64-bit integer in `v`.
__m64 _mm_movepi64_pi64 (__m128i v) pure @safe
{
    long2 pair = cast(long2) v;
    return long1(pair.array[0]);
}
unittest
{
    __m64 low = _mm_movepi64_pi64(_mm_set_epi64x(-1, -2));
    assert(low.array[0] == -2);
}
2321 
/// Copy the 64-bit integer `a` to the lower element of dest, and zero the upper element.
__m128i _mm_movpi64_epi64 (__m64 a) pure @trusted
{
    long2 r;
    r.ptr[0] = a.array[0];
    r.ptr[1] = 0;
    return cast(__m128i)r;
}
unittest
{
    __m64 A = _mm_cvtsi64_m64(-1025);
    long2 R = cast(long2) _mm_movpi64_epi64(A);
    long[2] correct = [-1025, 0];
    assert(R.array == correct);
}
2330 
// Note: generates pmuludq in LDC with -O1
/// Multiply the low unsigned 32-bit integers from each packed 64-bit element
/// in `a` and `b`, and return the unsigned 64-bit results.
__m128i _mm_mul_epu32 (__m128i a, __m128i b) pure @trusted
{
    __m128i zero = _mm_setzero_si128();

    static if (__VERSION__ >= 2088)
    {
        // Need LLVM9 to avoid this shufflevector
        // cast(uint) then assignment to a long lane gives zero-extension.
        long2 la, lb;
        la.ptr[0] = cast(uint)a.array[0];
        la.ptr[1] = cast(uint)a.array[2];
        lb.ptr[0] = cast(uint)b.array[0];
        lb.ptr[1] = cast(uint)b.array[2];
    }
    else
    {
        // Zero-extend lanes 0 and 2 by interleaving them with lanes of `zero`.
        long2 la = cast(long2) shufflevector!(int4, 0, 4, 2, 6)(a, zero);
        long2 lb = cast(long2) shufflevector!(int4, 0, 4, 2, 6)(b, zero);
    }

    static if (__VERSION__ >= 2076)
    {
        return cast(__m128i)(la * lb);
    }
    else
    {
        // long2 mul not supported before LDC 1.5
        la.ptr[0] *= lb.array[0];
        la.ptr[1] *= lb.array[1];
        return cast(__m128i)(la);
    }
}
unittest
{
    __m128i A = _mm_set_epi32(42, 0xDEADBEEF, 42, 0xffffffff);
    __m128i B = _mm_set_epi32(42, 0xCAFEBABE, 42, 0xffffffff);
    __m128i C = _mm_mul_epu32(A, B);
    long2 LC = cast(long2)C;
    assert(LC.array[0] == 18446744065119617025uL);
    assert(LC.array[1] == 12723420444339690338uL);
}
2372 
2373 
/// Multiply packed double-precision (64-bit) floating-point elements in `a` and `b`.
__m128d _mm_mul_pd(__m128d a, __m128d b) pure @safe
{
    return a * b;
}
unittest
{
    __m128d a = [-2.0, 1.5];
    a = _mm_mul_pd(a, a);
    assert(a.array == [4.0, 2.25]);
}
2384 
version(DigitalMars)
{
    // Work-around for https://issues.dlang.org/show_bug.cgi?id=19599
    /// Multiply the lower double-precision elements in `a` and `b`, store the result in
    /// the lower element of dest, and copy the upper element from `a`.
    __m128d _mm_mul_sd(__m128d a, __m128d b) pure @safe
    {
        // The inline asm `nop` prevents the DMD miscompilation described in the bug above.
        asm pure nothrow @nogc @trusted { nop;}
        a.array[0] = a.array[0] * b.array[0];
        return a;
    }
}
else
{
    static if (GDC_with_SSE2)
    {
        /// Multiply the lower double-precision elements in `a` and `b`, store the result in
        /// the lower element of dest, and copy the upper element from `a`.
        alias _mm_mul_sd = __builtin_ia32_mulsd;
    }
    else
    {
        /// Multiply the lower double-precision elements in `a` and `b`, store the result in
        /// the lower element of dest, and copy the upper element from `a`.
        __m128d _mm_mul_sd(__m128d a, __m128d b) pure @safe
        {
            a.array[0] *= b.array[0];
            return a;
        }
    }
}
unittest
{
    __m128d a = [-2.0, 1.5];
    a = _mm_mul_sd(a, a);
    assert(a.array == [4.0, 1.5]);
}
2416 
/// Multiply the low unsigned 32-bit integers from `a` and `b`,
/// and get an unsigned 64-bit result.
__m64 _mm_mul_su32 (__m64 a, __m64 b) pure @safe
{
    // Widen to 128-bit, reuse _mm_mul_epu32 (which only reads the low 32 bits
    // of each 64-bit lane), and keep the low 64-bit result.
    return to_m64(_mm_mul_epu32(to_m128i(a), to_m128i(b)));
}
unittest
{
    __m64 A = _mm_set_pi32(42, 0xDEADBEEF);
    __m64 B = _mm_set_pi32(42, 0xCAFEBABE);
    __m64 C = _mm_mul_su32(A, B);
    assert(C.array[0] == 0xDEADBEEFuL * 0xCAFEBABEuL);
}
2430 
static if (GDC_with_SSE2)
{
    /// Multiply packed signed 16-bit integers in `a` and `b`, producing 32-bit
    /// intermediates, and return the high 16 bits of each product.
    __m128i _mm_mulhi_epi16 (__m128i a, __m128i b) pure @trusted
    {
        return cast(__m128i) __builtin_ia32_pmulhw128(cast(short8)a, cast(short8)b);
    }
}
else static if (LDC_with_SSE2)
{
    /// Multiply packed signed 16-bit integers in `a` and `b`, producing 32-bit
    /// intermediates, and return the high 16 bits of each product.
    __m128i _mm_mulhi_epi16 (__m128i a, __m128i b) pure @trusted
    {
        return cast(__m128i) __builtin_ia32_pmulhw128(cast(short8)a, cast(short8)b);
    }
}
else
{
    /// Multiply packed signed 16-bit integers in `a` and `b`, producing 32-bit
    /// intermediates, and return the high 16 bits of each product.
    __m128i _mm_mulhi_epi16 (__m128i a, __m128i b) pure @trusted
    {
        short8 sa = cast(short8)a;
        short8 sb = cast(short8)b;
        short8 r = void;
        // short * short promotes to int (cannot overflow); `>> 16` then yields a
        // value in short's range, which D's value-range propagation accepts here.
        r.ptr[0] = (sa.array[0] * sb.array[0]) >> 16;
        r.ptr[1] = (sa.array[1] * sb.array[1]) >> 16;
        r.ptr[2] = (sa.array[2] * sb.array[2]) >> 16;
        r.ptr[3] = (sa.array[3] * sb.array[3]) >> 16;
        r.ptr[4] = (sa.array[4] * sb.array[4]) >> 16;
        r.ptr[5] = (sa.array[5] * sb.array[5]) >> 16;
        r.ptr[6] = (sa.array[6] * sb.array[6]) >> 16;
        r.ptr[7] = (sa.array[7] * sb.array[7]) >> 16;
        return cast(__m128i)r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi16(0, -16, 2, 3, 4, 8, 16, 7);
    __m128i B = _mm_set1_epi16(16384);
    short8 R = cast(short8)_mm_mulhi_epi16(A, B);
    short[8] correct = [0, -4, 0, 0, 1, 2, 4, 1];
    assert(R.array == correct);
}
2471 
static if (GDC_with_SSE2)
{
    /// Multiply packed unsigned 16-bit integers in `a` and `b`, producing 32-bit
    /// intermediates, and return the high 16 bits of each product.
    __m128i _mm_mulhi_epu16 (__m128i a, __m128i b) pure @trusted
    {
        return cast(__m128i) __builtin_ia32_pmulhuw128(cast(short8)a, cast(short8)b);
    }
}
else static if (LDC_with_SSE2)
{
    /// Multiply packed unsigned 16-bit integers in `a` and `b`, producing 32-bit
    /// intermediates, and return the high 16 bits of each product.
    __m128i _mm_mulhi_epu16 (__m128i a, __m128i b) pure @trusted
    {
        return cast(__m128i) __builtin_ia32_pmulhuw128(cast(short8)a, cast(short8)b);
    }
}
else
{
    /// Multiply packed unsigned 16-bit integers in `a` and `b`, producing 32-bit
    /// intermediates, and return the high 16 bits of each product.
    __m128i _mm_mulhi_epu16 (__m128i a, __m128i b) pure @trusted
    {
        short8 sa = cast(short8)a;
        short8 sb = cast(short8)b;
        short8 r = void;
        // ushort * ushort promotes to int and may exceed int.max; D defines
        // two's-complement wraparound and arithmetic `>>`, so the low 16 bits of
        // the true high half still survive the final cast(short).
        r.ptr[0] = cast(short)( (cast(ushort)sa.array[0] * cast(ushort)sb.array[0]) >> 16 );
        r.ptr[1] = cast(short)( (cast(ushort)sa.array[1] * cast(ushort)sb.array[1]) >> 16 );
        r.ptr[2] = cast(short)( (cast(ushort)sa.array[2] * cast(ushort)sb.array[2]) >> 16 );
        r.ptr[3] = cast(short)( (cast(ushort)sa.array[3] * cast(ushort)sb.array[3]) >> 16 );
        r.ptr[4] = cast(short)( (cast(ushort)sa.array[4] * cast(ushort)sb.array[4]) >> 16 );
        r.ptr[5] = cast(short)( (cast(ushort)sa.array[5] * cast(ushort)sb.array[5]) >> 16 );
        r.ptr[6] = cast(short)( (cast(ushort)sa.array[6] * cast(ushort)sb.array[6]) >> 16 );
        r.ptr[7] = cast(short)( (cast(ushort)sa.array[7] * cast(ushort)sb.array[7]) >> 16 );
        return cast(__m128i)r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi16(0, -16, 2, 3, 4, 8, 16, 7);
    __m128i B = _mm_set1_epi16(16384);
    short8 R = cast(short8)_mm_mulhi_epu16(A, B);
    short[8] correct = [0, 0x3FFC, 0, 0, 1, 2, 4, 1];
    assert(R.array == correct);
}
2512 
/// Multiply packed 16-bit integers in `a` and `b`, producing 32-bit intermediates,
/// and return the low 16 bits of each product.
__m128i _mm_mullo_epi16 (__m128i a, __m128i b) pure @safe
{
    return cast(__m128i)(cast(short8)a * cast(short8)b);
}
unittest
{
    __m128i A = _mm_setr_epi16(16384, -16, 0,      3, 4, 1, 16, 7);
    __m128i B = _mm_set1_epi16(16384);
    short8 R = cast(short8)_mm_mullo_epi16(A, B);
    short[8] correct = [0, 0, 0, -16384, 0, 16384, 0, -16384];
    assert(R.array == correct);
}
2525 
/// Compute the bitwise OR of packed double-precision (64-bit) floating-point
/// elements in `a` and `b`.
__m128d _mm_or_pd (__m128d a, __m128d b) pure @safe
{
    // No bitwise OR on floating-point vectors: reinterpret as integers and back.
    return cast(__m128d)( cast(__m128i)a | cast(__m128i)b );
}
unittest
{
    // OR with an all-zero bit pattern is the identity on the other operand.
    __m128d A = _mm_setr_pd(1.5, 0.0);
    __m128d B = _mm_setr_pd(0.0, 2.25);
    __m128d R = _mm_or_pd(A, B);
    double[2] correct = [1.5, 2.25];
    assert(R.array == correct);
}
2530 
/// Compute the bitwise OR of 128 bits (representing integer data) in `a` and `b`.
__m128i _mm_or_si128 (__m128i a, __m128i b) pure @safe
{
    return a | b;
}
unittest
{
    __m128i A = _mm_setr_epi32(0x0F0F0F0F, 0, -1, 0x12345678);
    __m128i B = _mm_setr_epi32(cast(int)0xF0F0F0F0, 0, 0, 0x00000001);
    int4 R = cast(int4) _mm_or_si128(A, B);
    int[4] correct = [-1, 0, -1, 0x12345679];
    assert(R.array == correct);
}
2535 
static if (GDC_with_SSE2)
{
    /// Convert packed signed 32-bit integers from `a` and `b` to packed 16-bit
    /// integers using signed saturation.
    __m128i _mm_packs_epi32 (__m128i a, __m128i b) pure @trusted
    {
        return cast(__m128i) __builtin_ia32_packssdw128(a, b);
    }
}
else static if (LDC_with_SSE2)
{
    /// Convert packed signed 32-bit integers from `a` and `b` to packed 16-bit
    /// integers using signed saturation.
    __m128i _mm_packs_epi32 (__m128i a, __m128i b) pure @trusted
    {
        return cast(__m128i) __builtin_ia32_packssdw128(a, b);
    }
}
else
{
    /// Convert packed signed 32-bit integers from `a` and `b` to packed 16-bit
    /// integers using signed saturation.
    __m128i _mm_packs_epi32 (__m128i a, __m128i b) pure @trusted
    {
        // Elements of `a` fill the low half of the result, elements of `b` the high half.
        short8 r;
        r.ptr[0] = saturateSignedIntToSignedShort(a.array[0]);
        r.ptr[1] = saturateSignedIntToSignedShort(a.array[1]);
        r.ptr[2] = saturateSignedIntToSignedShort(a.array[2]);
        r.ptr[3] = saturateSignedIntToSignedShort(a.array[3]);
        r.ptr[4] = saturateSignedIntToSignedShort(b.array[0]);
        r.ptr[5] = saturateSignedIntToSignedShort(b.array[1]);
        r.ptr[6] = saturateSignedIntToSignedShort(b.array[2]);
        r.ptr[7] = saturateSignedIntToSignedShort(b.array[3]);
        return cast(__m128i)r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi32(100000, -100000, 1000, 0);
    short8 R = cast(short8) _mm_packs_epi32(A, A);
    short[8] correct = [32767, -32768, 1000, 0, 32767, -32768, 1000, 0];
    assert(R.array == correct);
}
2573 
static if (GDC_with_SSE2)
{
    /// Convert packed signed 16-bit integers from `a` and `b` to packed 8-bit
    /// integers using signed saturation.
    __m128i _mm_packs_epi16 (__m128i a, __m128i b) pure @trusted
    {
        return cast(__m128i) __builtin_ia32_packsswb128(cast(short8)a, cast(short8)b);
    }
}
else static if (LDC_with_SSE2)
{
    /// Convert packed signed 16-bit integers from `a` and `b` to packed 8-bit
    /// integers using signed saturation.
    __m128i _mm_packs_epi16 (__m128i a, __m128i b) pure @trusted
    {
        return cast(__m128i) __builtin_ia32_packsswb128(cast(short8)a, cast(short8)b);
    }
}
else static if (LDC_with_ARM64)
{
    /// Convert packed signed 16-bit integers from `a` and `b` to packed 8-bit
    /// integers using signed saturation.
    __m128i _mm_packs_epi16 (__m128i a, __m128i b) pure @trusted
    {
        // generate a nice pair of sqxtn.8b + sqxtn2 since LDC 1.5 -02
        byte8 ra = vqmovn_s16(cast(short8)a);
        byte8 rb = vqmovn_s16(cast(short8)b);
        return cast(__m128i)vcombine_s8(ra, rb);
    }
}
else
{
    // PERF: ARM
    /// Convert packed signed 16-bit integers from `a` and `b` to packed 8-bit
    /// integers using signed saturation.
    __m128i _mm_packs_epi16 (__m128i a, __m128i b) pure @trusted
    {
        // Elements of `a` fill the low half of the result, elements of `b` the high half.
        byte16 r;
        short8 sa = cast(short8)a;
        short8 sb = cast(short8)b;
        foreach(i; 0..8)
            r.ptr[i] = saturateSignedWordToSignedByte(sa.array[i]);
        foreach(i; 0..8)
            r.ptr[i+8] = saturateSignedWordToSignedByte(sb.array[i]);
        return cast(__m128i)r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi16(1000, -1000, 1000, 0, 256, -129, 254, 0);
    byte16 R = cast(byte16) _mm_packs_epi16(A, A);
    byte[16] correct = [127, -128, 127, 0, 127, -128, 127, 0,
                        127, -128, 127, 0, 127, -128, 127, 0];
    assert(R.array == correct);
}
2621 
static if (GDC_with_SSE2)
{
    /// Convert packed signed 16-bit integers from `a` and `b` to packed 8-bit
    /// integers using unsigned saturation (clamped to [0, 255]).
    __m128i _mm_packus_epi16 (__m128i a, __m128i b) pure @trusted
    {
        return cast(__m128i) __builtin_ia32_packuswb128(cast(short8)a, cast(short8)b);
    }
}
else static if (LDC_with_SSE2)
{
    /// Convert packed signed 16-bit integers from `a` and `b` to packed 8-bit
    /// integers using unsigned saturation (clamped to [0, 255]).
    __m128i _mm_packus_epi16 (__m128i a, __m128i b) pure @trusted
    {
        return cast(__m128i) __builtin_ia32_packuswb128(cast(short8)a, cast(short8)b);
    }
}
else static if (LDC_with_ARM64)
{
    /// Convert packed signed 16-bit integers from `a` and `b` to packed 8-bit
    /// integers using unsigned saturation (clamped to [0, 255]).
    __m128i _mm_packus_epi16 (__m128i a, __m128i b) pure @trusted
    {
        // generate a nice pair of sqxtun + sqxtun2 since LDC 1.5 -02
        byte8 ra = vqmovun_s16(cast(short8)a);
        byte8 rb = vqmovun_s16(cast(short8)b);
        return cast(__m128i)vcombine_s8(ra, rb);
    }
}
else
{
    /// Convert packed signed 16-bit integers from `a` and `b` to packed 8-bit
    /// integers using unsigned saturation (clamped to [0, 255]).
    __m128i _mm_packus_epi16 (__m128i a, __m128i b) pure @trusted
    {
        short8 sa = cast(short8)a;
        short8 sb = cast(short8)b;
        ubyte[16] result = void;
        // Elements of `a` fill bytes 0..7, elements of `b` bytes 8..15.
        for (int i = 0; i < 8; ++i)
        {
            short s = sa[i];
            if (s < 0) s = 0;
            if (s > 255) s = 255;
            result[i] = cast(ubyte)s;

            s = sb[i];
            if (s < 0) s = 0;
            if (s > 255) s = 255;
            result[i+8] = cast(ubyte)s;
        }
        return cast(__m128i) loadUnaligned!(byte16)(cast(byte*)result.ptr);
    }
}
unittest
{
    __m128i A = _mm_setr_epi16(-10, 400, 0, 256, 255, 2, 1, 0);
    byte16 AA = cast(byte16) _mm_packus_epi16(A, A);
    static immutable ubyte[16] correctResult = [0, 255, 0, 255, 255, 2, 1, 0,
                                                0, 255, 0, 255, 255, 2, 1, 0];
    foreach(i; 0..16)
        assert(AA.array[i] == cast(byte)(correctResult[i]));
}
2677 
2678 
version(GNU)
{
    /// Provide a hint to the processor that the code sequence is a spin-wait loop
    /// (the `pause` instruction).
    void _mm_pause() pure @trusted
    {
        static if (GDC_with_SSE2)
        {
            __builtin_ia32_pause();
        }
        else version(X86)
        {
            asm pure nothrow @nogc @trusted
            {
                "pause;\n" : : : ;
            }
        }
        else
            static assert(false);
    }
}
else static if (LDC_with_SSE2)
{
    /// Provide a hint to the processor that the code sequence is a spin-wait loop.
    alias _mm_pause = __builtin_ia32_pause;
}
else static if (DMD_with_asm)
{
    /// Provide a hint to the processor that the code sequence is a spin-wait loop.
    void _mm_pause() pure @safe
    {
        asm nothrow @nogc pure @safe
        {
            rep; nop; // F3 90 =  pause
        }
    }
}
else version (LDC)
{
    /// Provide a hint to the processor that the code sequence is a spin-wait loop.
    void _mm_pause() pure @safe
    {
        // PERF: Do nothing currently , could be the "yield" intruction on ARM.
    }
}
else
    static assert(false);
unittest
{
    _mm_pause();
}
2725 
static if (GDC_with_SSE2)
{
    /// Compute the absolute differences of packed unsigned 8-bit integers in `a` and `b`,
    /// then horizontally sum each consecutive 8 differences to produce two unsigned 16-bit
    /// integers, stored in the low 16 bits of each 64-bit element of the result.
    __m128i _mm_sad_epu8 (__m128i a, __m128i b) pure @trusted
    {
        return cast(__m128i) __builtin_ia32_psadbw128(cast(byte16)a, cast(byte16)b);
    }
}
else static if (LDC_with_SSE2)
{
    /// Compute the absolute differences of packed unsigned 8-bit integers in `a` and `b`,
    /// then horizontally sum each consecutive 8 differences to produce two unsigned 16-bit
    /// integers, stored in the low 16 bits of each 64-bit element of the result.
    __m128i _mm_sad_epu8 (__m128i a, __m128i b) pure @trusted
    {
        return cast(__m128i) __builtin_ia32_psadbw128(cast(byte16)a, cast(byte16)b);
    }
}
else
{
    /// Compute the absolute differences of packed unsigned 8-bit integers in `a` and `b`,
    /// then horizontally sum each consecutive 8 differences to produce two unsigned 16-bit
    /// integers, stored in the low 16 bits of each 64-bit element of the result.
    __m128i _mm_sad_epu8 (__m128i a, __m128i b) pure @trusted
    {
        byte16 ab = cast(byte16)a;
        byte16 bb = cast(byte16)b;
        ubyte[16] t;
        foreach(i; 0..16)
        {
            int diff = cast(ubyte)(ab.array[i]) - cast(ubyte)(bb.array[i]);
            if (diff < 0) diff = -diff;
            t[i] = cast(ubyte)(diff);
        }
        // Sums fit easily: 8 * 255 = 2040 < 65536, so no saturation concern.
        int4 r = _mm_setzero_si128();
        r.ptr[0] = t[0] + t[1] + t[2] + t[3] + t[4] + t[5] + t[6] + t[7];
        r.ptr[2] = t[8] + t[9] + t[10]+ t[11]+ t[12]+ t[13]+ t[14]+ t[15];
        return r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi8(3, 4, 6, 8, 12, 14, 18, 20, 24, 30, 32, 38, 42, 44, 48, 54); // primes + 1
    __m128i B = _mm_set1_epi8(1);
    __m128i R = _mm_sad_epu8(A, B);
    int[4] correct = [2 + 3 + 5 + 7 + 11 + 13 + 17 + 19,
                      0,
                      23 + 29 + 31 + 37 + 41 + 43 + 47 + 53,
                      0];
    assert(R.array == correct);
}
2770 
/// Set packed 16-bit integers with the supplied values; `e0` ends up in the lowest lane.
__m128i _mm_set_epi16 (short e7, short e6, short e5, short e4, short e3, short e2, short e1, short e0) pure @trusted
{
    // Note: using loadUnaligned has better -O0 codegen compared to .ptr
    short[8] result = [e0, e1, e2, e3, e4, e5, e6, e7];
    return cast(__m128i) loadUnaligned!(short8)(result.ptr);
}
unittest
{
    __m128i A = _mm_set_epi16(7, 6, 5, 4, 3, 2, 1, 0);
    short8 B = cast(short8) A;
    foreach(i; 0..8)
        assert(B.array[i] == i);
}
2783 
/// Set packed 32-bit integers with the supplied values; `e0` ends up in the lowest lane.
__m128i _mm_set_epi32 (int e3, int e2, int e1, int e0) pure @trusted
{
    // Note: using loadUnaligned has better -O0 codegen compared to .ptr
    int[4] result = [e0, e1, e2, e3];
    return loadUnaligned!(int4)(result.ptr);
}
unittest
{
    __m128i A = _mm_set_epi32(3, 2, 1, 0);
    foreach(i; 0..4)
        assert(A.array[i] == i);
}
2795 
/// Set packed 64-bit integers with the supplied `__m64` values; `e0` ends up
/// in the lower lane, `e1` in the upper lane.
__m128i _mm_set_epi64(__m64 e1, __m64 e0) pure @trusted
{
    long[2] result = [e0.array[0], e1.array[0]];
    return cast(__m128i)( loadUnaligned!(long2)(result.ptr) );
}
unittest
{
    __m128i A = _mm_set_epi64(_mm_cvtsi64_m64(1234), _mm_cvtsi64_m64(5678));
    long2 B = cast(long2) A;
    assert(B.array[0] == 5678);
    assert(B.array[1] == 1234);
}
2808 
/// Set packed 64-bit integers with the supplied values; `e0` ends up in the
/// lower lane, `e1` in the upper lane.
__m128i _mm_set_epi64x (long e1, long e0) pure @trusted
{
    long[2] result = [e0, e1];
    return cast(__m128i)( loadUnaligned!(long2)(result.ptr) );
}
unittest
{
    __m128i A = _mm_set_epi64x(1234, 5678);
    long2 B = cast(long2) A;
    assert(B.array[0] == 5678);
    assert(B.array[1] == 1234);
}
2821 
/// Set packed 8-bit integers with the supplied values; `e0` ends up in the lowest lane.
__m128i _mm_set_epi8 (byte e15, byte e14, byte e13, byte e12,
                      byte e11, byte e10, byte e9, byte e8,
                      byte e7, byte e6, byte e5, byte e4,
                      byte e3, byte e2, byte e1, byte e0) pure @trusted
{
    // Note: using loadUnaligned has better -O0 codegen compared to .ptr
    byte[16] result = [e0, e1,  e2,  e3,  e4,  e5,  e6, e7,
                     e8, e9, e10, e11, e12, e13, e14, e15];
    return cast(__m128i)( loadUnaligned!(byte16)(result.ptr) );
}
unittest
{
    byte16 B = cast(byte16) _mm_set_epi8(15, 14, 13, 12, 11, 10, 9, 8,
                                          7,  6,  5,  4,  3,  2, 1, 0);
    foreach(i; 0..16)
        assert(B.array[i] == i);
}
2831 
/// Set packed double-precision elements with the supplied values; `e0` ends up
/// in the lower lane, `e1` in the upper lane.
__m128d _mm_set_pd (double e1, double e0) pure @trusted
{
    // Note: using loadUnaligned has better -O0 codegen compared to .ptr
    double[2] result = [e0, e1];
    return loadUnaligned!(double2)(result.ptr);
}
unittest
{
    __m128d A = _mm_set_pd(61.0, 55.0);
    double[2] correct = [55.0, 61.0];
    assert(A.array == correct);
}
2843 
/// Broadcast double-precision (64-bit) floating-point value `a` to both elements.
__m128d _mm_set_pd1 (double a) pure @trusted
{
    // Note: using loadUnaligned has better -O0 codegen compared to .ptr
    double[2] result = [a, a];
    return loadUnaligned!(double2)(result.ptr);
}
unittest
{
    __m128d A = _mm_set_pd1(61.0);
    double[2] correct = [61.0, 61.0];
    assert(A.array == correct);
}
2855 
/// Copy double-precision value `a` to the lower element of dest, and zero the upper element.
__m128d _mm_set_sd (double a) pure @trusted
{
    // Note: using loadUnaligned has better -O0 codegen compared to .ptr
    double[2] result = [a, 0];
    return loadUnaligned!(double2)(result.ptr);
}
unittest
{
    __m128d A = _mm_set_sd(61.0);
    double[2] correct = [61.0, 0.0];
    assert(A.array == correct);
}
2861 
/// Broadcast 16-bit integer `a` to all elements.
__m128i _mm_set1_epi16 (short a) pure @trusted
{
    return cast(__m128i)(short8(a));
}
unittest
{
    short8 R = cast(short8) _mm_set1_epi16(-31);
    foreach(i; 0..8)
        assert(R.array[i] == -31);
}
2866 
/// Broadcast 32-bit integer `a` to all elements.
__m128i _mm_set1_epi32 (int a) pure @trusted
{
    return cast(__m128i)(int4(a));
}
unittest
{
    // Masking out the sign bit of -1.0f yields +1.0f in each lane.
    __m128 a = _mm_set1_ps(-1.0f);
    __m128 b = cast(__m128) _mm_set1_epi32(0x7fffffff);
    assert(_mm_and_ps(a, b).array == [1.0f, 1, 1, 1]);
}
2877 
/// Broadcast 64-bit integer `a` to all elements of `dst`.
__m128i _mm_set1_epi64 (__m64 a) pure @safe
{
    // Same value in both lanes.
    return _mm_set_epi64(a, a);
}
2883 
/// Broadcast 64-bit integer `a` to all elements.
__m128i _mm_set1_epi64x (long a) pure @trusted
{
    return cast(__m128i)(long2(a));
}
unittest
{
    long2 R = cast(long2) _mm_set1_epi64x(-0x123456789ABCDEF0);
    long[2] correct = [-0x123456789ABCDEF0, -0x123456789ABCDEF0];
    assert(R.array == correct);
}
2888 
/// Broadcast 8-bit integer `a` to all elements.
__m128i _mm_set1_epi8 (byte a) pure @trusted
{
    return cast(__m128i)(byte16(a));
}
unittest
{
    byte16 R = cast(byte16) _mm_set1_epi8(42);
    foreach(i; 0..16)
        assert(R.array[i] == 42);
}
2893 
/// Broadcast double-precision (64-bit) floating-point value `a` to both elements.
alias _mm_set1_pd = _mm_set_pd1;
2895 
/// Set packed 16-bit integers in reverse (memory) order: `e7` ends up in the lowest lane.
__m128i _mm_setr_epi16 (short e7, short e6, short e5, short e4,
                        short e3, short e2, short e1, short e0) pure @trusted
{
    short[8] result = [e7, e6, e5, e4, e3, e2, e1, e0];
    return cast(__m128i)( loadUnaligned!(short8)(result.ptr) );
}
2902 
/// Set packed 32-bit integers in reverse (memory) order: `e3` ends up in the lowest lane.
__m128i _mm_setr_epi32 (int e3, int e2, int e1, int e0) pure @trusted
{
    int[4] result = [e3, e2, e1, e0];
    return cast(__m128i)( loadUnaligned!(int4)(result.ptr) );
}
2908 
/// Set packed 64-bit integers in reverse (memory) order: `e1` ends up in the lower lane.
/// NOTE(review): takes plain `long` operands (Intel's signature takes `__m64`) —
/// callers in this module (e.g. the `_mm_add_epi64` unittest) rely on the `long` form.
__m128i _mm_setr_epi64 (long e1, long e0) pure @trusted
{
    long[2] result = [e1, e0];
    return cast(__m128i)( loadUnaligned!(long2)(result.ptr) );
}
2914 
/// Set packed 8-bit integers in reverse (memory) order: `e15` ends up in the lowest lane.
__m128i _mm_setr_epi8 (byte e15, byte e14, byte e13, byte e12,
                       byte e11, byte e10, byte e9,  byte e8,
                       byte e7,  byte e6,  byte e5,  byte e4,
                       byte e3,  byte e2,  byte e1,  byte e0) pure @trusted
{
    byte[16] result = [e15, e14, e13, e12, e11, e10, e9, e8,
                      e7,  e6,  e5,  e4,  e3,  e2, e1, e0];
    return cast(__m128i)( loadUnaligned!(byte16)(result.ptr) );
}
2924 
/// Set packed double-precision elements in reverse (memory) order:
/// `e1` ends up in the lower lane.
__m128d _mm_setr_pd (double e1, double e0) pure @trusted
{
    double[2] result = [e1, e0];
    return loadUnaligned!(double2)(result.ptr);
}
unittest
{
    __m128d A = _mm_setr_pd(61.0, 55.0);
    double[2] correct = [61.0, 55.0];
    assert(A.array == correct);
}
2936 
/// Return vector of type `__m128d` with all elements set to zero.
__m128d _mm_setzero_pd () pure @trusted
{
    // Note: using loadUnaligned has better -O0 codegen compared to .ptr
    double[2] zeroes = [0.0, 0.0];
    return loadUnaligned!(double2)(zeroes.ptr);
}
2943 
/// Return vector of type `__m128i` with all elements set to zero.
__m128i _mm_setzero_si128() pure @trusted
{
    // Note: using loadUnaligned has better -O0 codegen compared to .ptr
    int[4] zeroes = [0, 0, 0, 0];
    return cast(__m128i)( loadUnaligned!(int4)(zeroes.ptr) );
}
2950 
/// Shuffle 32-bit integers in `a` using the compile-time control `imm8`.
/// Each 2-bit field of `imm8` selects which source lane feeds the
/// corresponding destination lane (same encoding as `_MM_SHUFFLE`).
__m128i _mm_shuffle_epi32(int imm8)(__m128i a) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_pshufd(a, imm8);
    }
    else
    {
        // Portable path: decode the four 2-bit lane selectors at compile time.
        return shufflevector!(int4, (imm8 >> 0) & 3,
                                    (imm8 >> 2) & 3,
                                    (imm8 >> 4) & 3,
                                    (imm8 >> 6) & 3)(a, a);
    }
}
unittest
{
    __m128i A = _mm_setr_epi32(0, 1, 2, 3);
    enum int SHUFFLE = _MM_SHUFFLE(0, 1, 2, 3);
    int4 B = cast(int4) _mm_shuffle_epi32!SHUFFLE(A);
    int[4] expectedB = [ 3, 2, 1, 0 ];
    assert(B.array == expectedB);
}
2973 
/// Shuffle double-precision elements using the compile-time control `imm8`:
/// bit 0 selects the lane of `a` for the low result element, bit 1 selects
/// the lane of `b` for the high result element.
__m128d _mm_shuffle_pd (int imm8)(__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_shufpd(a, b, imm8);
    }
    else
    {
        // Indices 0-1 address lanes of `a`, 2-3 address lanes of `b`.
        return shufflevector!(double2, 0 + ( imm8 & 1 ),
                                       2 + ( (imm8 >> 1) & 1 ))(a, b);
    }
}
unittest
{
    __m128d A = _mm_setr_pd(0.5, 2.0);
    __m128d B = _mm_setr_pd(4.0, 5.0);
    enum int SHUFFLE = _MM_SHUFFLE2(1, 1);
    __m128d R = _mm_shuffle_pd!SHUFFLE(A, B);
    double[2] correct = [ 2.0, 5.0 ];
    assert(R.array == correct);
}
2995 
/// Shuffle 16-bit integers in the HIGH 64 bits of `a` using the compile-time
/// control `imm8`; the low 64 bits pass through unchanged.
__m128i _mm_shufflehi_epi16(int imm8)(__m128i a) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_pshufhw(a, imm8);
    }
    else
    {
        // Lanes 0..3 are copied as-is; lanes 4..7 are selected from the
        // upper half by the four 2-bit fields of imm8.
        return cast(__m128i) shufflevector!(short8, 0, 1, 2, 3,
                                          4 + ( (imm8 >> 0) & 3 ),
                                          4 + ( (imm8 >> 2) & 3 ),
                                          4 + ( (imm8 >> 4) & 3 ),
                                          4 + ( (imm8 >> 6) & 3 ))(cast(short8)a, cast(short8)a);
    }
}
unittest
{
    __m128i A = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
    enum int SHUFFLE = _MM_SHUFFLE(0, 1, 2, 3);
    short8 C = cast(short8) _mm_shufflehi_epi16!SHUFFLE(A);
    short[8] expectedC = [ 0, 1, 2, 3, 7, 6, 5, 4 ];
    assert(C.array == expectedC);
}
3019 
/// Shuffle 16-bit integers in the LOW 64 bits of `a` using the compile-time
/// control `imm8`; the high 64 bits pass through unchanged.
__m128i _mm_shufflelo_epi16(int imm8)(__m128i a) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_pshuflw(a, imm8);
    }
    else
    {
        // Lanes 0..3 are selected from the lower half by the four 2-bit
        // fields of imm8; lanes 4..7 are copied as-is.
        return cast(__m128i) shufflevector!(short8, ( (imm8 >> 0) & 3 ),
                                                    ( (imm8 >> 2) & 3 ),
                                                    ( (imm8 >> 4) & 3 ),
                                                    ( (imm8 >> 6) & 3 ), 4, 5, 6, 7)(cast(short8)a, cast(short8)a);
    }
}
unittest
{
    __m128i A = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
    enum int SHUFFLE = _MM_SHUFFLE(0, 1, 2, 3);
    short8 B = cast(short8) _mm_shufflelo_epi16!SHUFFLE(A);
    short[8] expectedB = [ 3, 2, 1, 0, 4, 5, 6, 7 ];
    assert(B.array == expectedB);
}
3042 
// Deprecated _mm_sll_epi32: shift packed 32-bit integers left by the count
// held in the low 64 bits of `count`, shifting in zeros.
// One implementation per compiler: LDC/GDC builtin, DMD 32-bit inline asm,
// and a portable scalar fallback.
static if (LDC_with_SSE2)
{
    deprecated("Use _mm_slli_epi32 instead.") alias _mm_sll_epi32 = __builtin_ia32_pslld128;
}
else static if (GDC_with_SSE2)
{
    deprecated("Use _mm_slli_epi32 instead.") alias _mm_sll_epi32 = __builtin_ia32_pslld128;
}
else static if (DMD_with_32bit_asm)
{
    deprecated("Use _mm_slli_epi32 instead.") __m128i _mm_sll_epi32 (__m128i a, __m128i count) pure @safe
    {
        asm pure nothrow @nogc @trusted
        {
            movdqu XMM0, a;
            movdqu XMM1, count;
            pslld XMM0, XMM1;
            movdqu a, XMM0;
        }
        return a;
    }
}
else
{
    deprecated("Use _mm_slli_epi32 instead.") __m128i _mm_sll_epi32 (__m128i a, __m128i count) pure @safe
    {
        int4 r = void;
        long2 lc = cast(long2)count;
        // Shift amount comes from the low 64-bit lane of `count`.
        int bits = cast(int)(lc.array[0]);
        // NOTE(review): unlike hardware (which yields zero for counts >= 32),
        // shifting by >= 32 here is UB in D — confirm callers never do that.
        foreach(i; 0..4)
            r[i] = cast(uint)(a[i]) << bits;
        return r;
    }
}
3077 
// Deprecated _mm_sll_epi64: shift packed 64-bit integers left by the count
// held in the low 64 bits of `count`, shifting in zeros.
static if (LDC_with_SSE2)
{
    deprecated("Use _mm_slli_epi64 instead.") __m128i _mm_sll_epi64 (__m128i a, __m128i count) pure @safe
    {
        return cast(__m128i) __builtin_ia32_psllq128(cast(long2)a, cast(long2)count);
    }
}
else static if (GDC_with_SSE2)
{
    deprecated("Use _mm_slli_epi64 instead.") __m128i _mm_sll_epi64 (__m128i a, __m128i count) pure @safe
    {
        return cast(__m128i) __builtin_ia32_psllq128(cast(long2)a, cast(long2)count);
    }
}
else static if (DMD_with_32bit_asm)
{
    deprecated("Use _mm_slli_epi64 instead.") __m128i _mm_sll_epi64 (__m128i a, __m128i count) pure @safe
    {
        asm pure nothrow @nogc @trusted
        {
            movdqu XMM0, a;
            movdqu XMM1, count;
            psllq XMM0, XMM1;
            movdqu a, XMM0;
        }
        return a;
    }
}
else
{
    deprecated("Use _mm_slli_epi64 instead.") __m128i _mm_sll_epi64 (__m128i a, __m128i count) pure @safe
    {
        // ARM: good since LDC 1.12 -O2
        // ~but -O0 version is catastrophic
        long2 r = void;
        long2 sa = cast(long2)a;
        long2 lc = cast(long2)count;
        // Shift amount comes from the low 64-bit lane of `count`.
        int bits = cast(int)(lc.array[0]);
        // NOTE(review): shifting by >= 64 is UB in D, unlike the hardware
        // instruction which produces zero — confirm callers never do that.
        foreach(i; 0..2)
            r.array[i] = cast(ulong)(sa.array[i]) << bits;
        return cast(__m128i)r;
    }
}
3121 
// Deprecated _mm_sll_epi16: shift packed 16-bit integers left by the count
// held in the low 64 bits of `count`, shifting in zeros.
static if (LDC_with_SSE2)
{
    deprecated("Use _mm_slli_epi16 instead.") __m128i _mm_sll_epi16 (__m128i a, __m128i count) pure @trusted
    {
        // BUGFIX: this previously called _mm_sll_epi16 itself (infinite
        // recursion). Call the psllw builtin instead, mirroring the
        // _mm_srl_epi16 / _mm_sra_epi16 implementations below.
        return cast(__m128i) __builtin_ia32_psllw128(cast(short8)a, cast(short8)count);
    }
}
else static if (GDC_with_SSE2)
{
    deprecated("Use _mm_slli_epi16 instead.") __m128i _mm_sll_epi16 (__m128i a, __m128i count) pure @trusted
    {
        // BUGFIX: same self-recursion defect as the LDC branch; use the builtin.
        return cast(__m128i) __builtin_ia32_psllw128(cast(short8)a, cast(short8)count);
    }
}
else static if (DMD_with_32bit_asm)
{
    deprecated("Use _mm_slli_epi16 instead.") __m128i _mm_sll_epi16 (__m128i a, __m128i count) pure @trusted
    {
        asm pure nothrow @nogc
        {
            movdqu XMM0, a;
            movdqu XMM1, count;
            psllw XMM0, XMM1;
            movdqu a, XMM0;
        }
        return a;
    }
}
else
{
    deprecated("Use _mm_slli_epi16 instead.") __m128i _mm_sll_epi16 (__m128i a, __m128i count) pure @trusted
    {
        short8 sa = cast(short8)a;
        long2 lc = cast(long2)count;
        // Shift amount comes from the low 64-bit lane of `count`.
        int bits = cast(int)(lc.array[0]);
        short8 r = void;
        foreach(i; 0..8)
            r.ptr[i] = cast(short)(cast(ushort)(sa.array[i]) << bits);
        return cast(int4)r;
    }
}
3163 
static if (GDC_with_SSE2)
{
    /// Shift packed 32-bit integers in `a` left by `imm8` while shifting in zeros.
    __m128i _mm_slli_epi32 (__m128i a, int imm8) pure @safe
    {
        return __builtin_ia32_pslldi128(a, cast(ubyte)imm8);
    }
}
else static if (LDC_with_SSE2)
{
    /// Shift packed 32-bit integers in `a` left by `imm8` while shifting in zeros.
    __m128i _mm_slli_epi32 (__m128i a, int imm8) pure @safe
    {
        return __builtin_ia32_pslldi128(a, cast(ubyte)imm8);
    }
}
else
{
    /// Shift packed 32-bit integers in `a` left by `imm8` while shifting in zeros.
    __m128i _mm_slli_epi32 (__m128i a, int imm8) pure @safe
    {
        // Note: the intrinsics guarantee imm8[0..7] is taken, however
        //       D says "It's illegal to shift by the same or more bits 
        //       than the size of the quantity being shifted"
        //       and it's UB instead.
        int4 r = _mm_setzero_si128();

        // Only the low 8 bits of imm8 are significant; counts >= 32 yield
        // all zeros, matching the hardware behavior (and avoiding D shift UB).
        ubyte count = cast(ubyte) imm8;
        if (count > 31)
            return r;
        
        foreach(i; 0..4)
            r.array[i] = cast(uint)(a.array[i]) << count;
        return r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi32(0, 2, 3, -4);
    __m128i B = _mm_slli_epi32(A, 1);
    __m128i B2 = _mm_slli_epi32(A, 1 + 256);
    int[4] expectedB = [ 0, 4, 6, -8];
    assert(B.array == expectedB);
    assert(B2.array == expectedB);

    __m128i C = _mm_slli_epi32(A, 0);
    int[4] expectedC = [ 0, 2, 3, -4];
    assert(C.array == expectedC);

    __m128i D = _mm_slli_epi32(A, 65);
    int[4] expectedD = [ 0, 0, 0, 0];
    assert(D.array == expectedD);
}
3217 
static if (GDC_with_SSE2)
{
    /// Shift packed 64-bit integers in `a` left by `imm8` while shifting in zeros.
    __m128i _mm_slli_epi64 (__m128i a, int imm8) pure @safe
    {
        return cast(__m128i) __builtin_ia32_psllqi128(cast(long2)a, cast(ubyte)imm8);
    }
}
else static if (LDC_with_SSE2)
{
    /// Shift packed 64-bit integers in `a` left by `imm8` while shifting in zeros.
    __m128i _mm_slli_epi64 (__m128i a, int imm8) pure @safe
    {
        return cast(__m128i) __builtin_ia32_psllqi128(cast(long2)a, cast(ubyte)imm8);
    }
}
else
{
    /// Shift packed 64-bit integers in `a` left by `imm8` while shifting in zeros.
    __m128i _mm_slli_epi64 (__m128i a, int imm8) pure @trusted
    {
        long2 sa = cast(long2)a;

        // Note: the intrinsics guarantee imm8[0..7] is taken, however
        //       D says "It's illegal to shift by the same or more bits 
        //       than the size of the quantity being shifted"
        //       and it's UB instead.
        long2 r = cast(long2) _mm_setzero_si128();
        // Counts >= 64 yield all zeros (hardware behavior) instead of D shift UB.
        ubyte count = cast(ubyte) imm8;
        if (count > 63)
            return cast(__m128i)r;

        r.ptr[0] = cast(ulong)(sa.array[0]) << count;
        r.ptr[1] = cast(ulong)(sa.array[1]) << count;
        return cast(__m128i)r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi64(8, -4);
    long2 B = cast(long2) _mm_slli_epi64(A, 1);
    long2 B2 = cast(long2) _mm_slli_epi64(A, 1 + 1024);
    long[2] expectedB = [ 16, -8];
    assert(B.array == expectedB);
    assert(B2.array == expectedB);

    long2 C = cast(long2) _mm_slli_epi64(A, 0);
    long[2] expectedC = [ 8, -4];
    assert(C.array == expectedC);

    long2 D = cast(long2) _mm_slli_epi64(A, 64);
    long[2] expectedD = [ 0, -0];
    assert(D.array == expectedD);
}
3272 
static if (GDC_with_SSE2)
{
    /// Shift packed 16-bit integers in `a` left by `imm8` while shifting in zeros.
    __m128i _mm_slli_epi16(__m128i a, int imm8) pure @trusted
    {
        return cast(__m128i) __builtin_ia32_psllwi128(cast(short8)a, cast(ubyte)imm8);
    }
}
else static if (LDC_with_SSE2)
{
    /// Shift packed 16-bit integers in `a` left by `imm8` while shifting in zeros.
    __m128i _mm_slli_epi16(__m128i a, int imm8) pure @trusted
    {
        return cast(__m128i) __builtin_ia32_psllwi128(cast(short8)a, cast(ubyte)imm8);
    }
}
else static if (LDC_with_ARM64)
{
    /// Shift packed 16-bit integers in `a` left by `imm8` while shifting in zeros.
    __m128i _mm_slli_epi16 (__m128i a, int imm8) pure @trusted
    {
        short8 sa = cast(short8)a;
        short8 r = cast(short8)_mm_setzero_si128();
        // Counts >= 16 yield all zeros (hardware behavior) instead of D shift UB.
        ubyte count = cast(ubyte) imm8;
        if (count > 15)
            return cast(__m128i)r;
        // Vector-by-vector shift; maps well to a single NEON instruction.
        r = sa << short8(count);
        return cast(__m128i)r;
    }
}
else
{
    /// Shift packed 16-bit integers in `a` left by `imm8` while shifting in zeros.
    __m128i _mm_slli_epi16 (__m128i a, int imm8) pure @trusted
    {
        short8 sa = cast(short8)a;
        short8 r = cast(short8)_mm_setzero_si128();
        // Counts >= 16 yield all zeros (hardware behavior) instead of D shift UB.
        ubyte count = cast(ubyte) imm8;
        if (count > 15)
            return cast(__m128i)r;
        foreach(i; 0..8)
            r.ptr[i] = cast(short)(sa.array[i] << count);
        return cast(__m128i)r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi16(0, 1, 2, 3, -4, -5, 6, 7);
    short8 B = cast(short8)( _mm_slli_epi16(A, 1) );
    short8 B2 = cast(short8)( _mm_slli_epi16(A, 1 + 256) );
    short[8] expectedB = [ 0, 2, 4, 6, -8, -10, 12, 14 ];
    assert(B.array == expectedB);
    assert(B2.array == expectedB);

    short8 C = cast(short8)( _mm_slli_epi16(A, 16) );
    short[8] expectedC = [ 0, 0, 0, 0, 0, 0, 0, 0 ];
    assert(C.array == expectedC);
}
3331 
3332 
/// Shift `a` left by `bytes` bytes while shifting in zeros.
__m128i _mm_slli_si128(ubyte bytes)(__m128i op) pure @trusted
{
    // A byte count of 16 or more clears the whole vector (hardware behavior).
    static if (bytes & 0xF0)
    {
        return _mm_setzero_si128();
    }
    else
    {
        static if (GDC_with_SSE2)
        {
            // The GDC builtin takes the shift amount in BITS, hence bytes * 8.
            return __builtin_ia32_pslldqi128(op, cast(ubyte)(bytes * 8)); 
        }
        else version(DigitalMars)
        {
            version(D_InlineAsm_X86)
            {
                asm pure nothrow @nogc @trusted // somehow doesn't work for x86_64
                {
                    movdqu XMM0, op;
                    pslldq XMM0, bytes;
                    movdqu op, XMM0;
                }
                return op;
            }
            else
            {
                // Scalar fallback: copy bytes upward, zero-fill the low `bytes` slots.
                byte16 A = cast(byte16)op;
                byte16 R;
                for (int n = 15; n >= bytes; --n)
                    R.ptr[n] = A.array[n-bytes];
                for (int n = bytes-1; n >= 0; --n)
                    R.ptr[n] = 0;
                return cast(__m128i)R;
            }
        }
        else
        {
            // Shuffle over (zero, op): indices below 16 pull in zero bytes.
            return cast(__m128i) shufflevector!(byte16,
            16 - bytes, 17 - bytes, 18 - bytes, 19 - bytes, 20 - bytes, 21 - bytes,
            22 - bytes, 23 - bytes, 24 - bytes, 25 - bytes, 26 - bytes, 27 - bytes,
            28 - bytes, 29 - bytes, 30 - bytes, 31 - bytes)
            (cast(byte16)_mm_setzero_si128(), cast(byte16)op);
        }
    }
}
unittest
{
    __m128i A = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
    short8 R = cast(short8) _mm_slli_si128!8(A); // shift 8 bytes to the left
    short[8] correct = [ 0, 0, 0, 0, 0, 1, 2, 3 ];
    assert(R.array == correct);

    // NOTE(review): this case exercises _mm_srli_si128 (the >= 16 bytes path),
    // not _mm_slli_si128 — possibly belongs in the srli unittest; verify.
    __m128i B = _mm_srli_si128!16(_mm_set1_epi32(-1));
    int[4] expectedB = [0, 0, 0, 0];
    assert(B.array == expectedB);
}
3390 
// _mm_sqrt_pd: compute the square root of both double-precision lanes.
version(LDC)
{
    // Disappeared with LDC 1.11
    static if (__VERSION__ < 2081)
        alias _mm_sqrt_pd = __builtin_ia32_sqrtpd;
    else
    {
        /// Compute the square root of packed double-precision elements in `vec`.
        __m128d _mm_sqrt_pd(__m128d vec) pure @safe
        {
            vec.array[0] = llvm_sqrt(vec.array[0]);
            vec.array[1] = llvm_sqrt(vec.array[1]);
            return vec;
        }
    }
}
else
{
    static if (GDC_with_SSE2)
    {
        alias _mm_sqrt_pd = __builtin_ia32_sqrtpd;
    }
    else
    {
        /// Compute the square root of packed double-precision elements in `vec`.
        __m128d _mm_sqrt_pd(__m128d vec) pure @safe
        {
            vec.array[0] = sqrt(vec.array[0]);
            vec.array[1] = sqrt(vec.array[1]);
            return vec;
        }
    }
}
3422 
3423 
// _mm_sqrt_sd: square root of the LOW double-precision lane; the upper lane
// is passed through unchanged.
// NOTE(review): Intel's _mm_sqrt_sd takes two arguments (upper lane copied
// from the first operand); this single-argument variant keeps `vec`'s own
// upper lane — confirm this matches the library's intended API.
version(LDC)
{
    // Disappeared with LDC 1.11
    static if (__VERSION__ < 2081)
        alias _mm_sqrt_sd = __builtin_ia32_sqrtsd;
    else
    {
        __m128d _mm_sqrt_sd(__m128d vec) pure @safe
        {
            vec.array[0] = llvm_sqrt(vec.array[0]);
            vec.array[1] = vec.array[1];  // upper lane deliberately untouched
            return vec;
        }
    }
}
else
{
    static if (GDC_with_SSE2)
    {
        alias _mm_sqrt_sd = __builtin_ia32_sqrtsd;
    }
    else
    {
        __m128d _mm_sqrt_sd(__m128d vec) pure @safe
        {
            vec.array[0] = sqrt(vec.array[0]);
            vec.array[1] = vec.array[1];  // upper lane deliberately untouched
            return vec;
        }
    }
}
3455 
3456 
// Deprecated _mm_sra_epi16: arithmetic right shift of packed 16-bit integers
// by the count held in the low 64 bits of `count` (sign bits shifted in).
static if (GDC_with_SSE2)
{
    deprecated("Use _mm_srai_epi16 instead.") __m128i _mm_sra_epi16 (__m128i a, __m128i count) pure @safe
    {
        return cast(__m128i) __builtin_ia32_psraw128(cast(short8)a, cast(short8)count);
    }
}
else static if (LDC_with_SSE2)
{
    deprecated("Use _mm_srai_epi16 instead.") __m128i _mm_sra_epi16 (__m128i a, __m128i count) pure @safe
    {
        return cast(__m128i) __builtin_ia32_psraw128(cast(short8)a, cast(short8)count);
    }
}
else
{
    deprecated("Use _mm_srai_epi16 instead.") __m128i _mm_sra_epi16 (__m128i a, __m128i count) pure @safe
    {
        short8 sa = cast(short8)a;
        long2 lc = cast(long2)count;
        // Shift amount comes from the low 64-bit lane of `count`.
        int bits = cast(int)(lc.array[0]);
        short8 r = void;
        foreach(i; 0..8)
            r.array[i] = cast(short)(sa.array[i] >> bits);
        return cast(int4)r;
    }
}
3484 
// Deprecated _mm_sra_epi32: arithmetic right shift of packed 32-bit integers
// by the count held in the low 64 bits of `count` (sign bits shifted in).
static if (LDC_with_SSE2)
{
    deprecated("Use _mm_srai_epi32 instead.") alias _mm_sra_epi32  = __builtin_ia32_psrad128;
}
else static if (GDC_with_SSE2)
{
    deprecated("Use _mm_srai_epi32 instead.") alias _mm_sra_epi32  = __builtin_ia32_psrad128;
}
else
{
    deprecated("Use _mm_srai_epi32 instead.") __m128i _mm_sra_epi32 (__m128i a, __m128i count) pure @safe
    {
        int4 r = void;
        long2 lc = cast(long2)count;
        // Shift amount comes from the low 64-bit lane of `count`.
        int bits = cast(int)(lc.array[0]);
        r.array[0] = (a.array[0] >> bits);
        r.array[1] = (a.array[1] >> bits);
        r.array[2] = (a.array[2] >> bits);
        r.array[3] = (a.array[3] >> bits);
        return r;
    }
}
3507 
3508 
static if (GDC_with_SSE2)
{
    /// Shift packed 16-bit integers in `a` right by `imm8` while shifting in sign bits.
    __m128i _mm_srai_epi16 (__m128i a, int imm8) pure @trusted
    {
        return cast(__m128i) __builtin_ia32_psrawi128(cast(short8)a, cast(ubyte)imm8);
    }
}
else static if (LDC_with_SSE2)
{
    /// Shift packed 16-bit integers in `a` right by `imm8` while shifting in sign bits.
    __m128i _mm_srai_epi16 (__m128i a, int imm8) pure @trusted
    {
        return cast(__m128i) __builtin_ia32_psrawi128(cast(short8)a, cast(ubyte)imm8);
    }
}
else static if (LDC_with_ARM64)
{
    /// Shift packed 16-bit integers in `a` right by `imm8` while shifting in sign bits.
    __m128i _mm_srai_epi16 (__m128i a, int imm8) pure @trusted
    {
        short8 sa = cast(short8)a;
        // Arithmetic shifts saturate the count at 15: shifting further only
        // replicates the sign bit, which a 15-bit shift already does.
        ubyte count = cast(ubyte)imm8;
        if (count > 15) 
            count = 15;
        short8 r = sa >> short8(count);
        return cast(__m128i)r;
    }
}
else
{
    /// Shift packed 16-bit integers in `a` right by `imm8` while shifting in sign bits.
    __m128i _mm_srai_epi16 (__m128i a, int imm8) pure @trusted
    {
        short8 sa = cast(short8)a;
        short8 r = void;

        // Note: the intrinsics guarantee imm8[0..7] is taken, however
        //       D says "It's illegal to shift by the same or more bits 
        //       than the size of the quantity being shifted"
        //       and it's UB instead.
        // Clamping to 15 is equivalent: a 15-bit arithmetic shift already
        // fills the lane with the sign bit.
        ubyte count = cast(ubyte)imm8;
        if (count > 15) 
            count = 15;
        foreach(i; 0..8)
            r.ptr[i] = cast(short)(sa.array[i] >> count);
        return cast(int4)r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi16(0, 1, 2, 3, -4, -5, 6, 7);
    short8 B = cast(short8)( _mm_srai_epi16(A, 1) );
    short8 B2 = cast(short8)( _mm_srai_epi16(A, 1 + 256) );
    short[8] expectedB = [ 0, 0, 1, 1, -2, -3, 3, 3 ];
    assert(B.array == expectedB);
    assert(B2.array == expectedB);

    short8 C = cast(short8)( _mm_srai_epi16(A, 18) );
    short[8] expectedC = [ 0, 0, 0, 0, -1, -1, 0, 0 ];
    assert(C.array == expectedC);
}
3571 
static if (LDC_with_SSE2)
{
    /// Shift packed 32-bit integers in `a` right by `imm8` while shifting in sign bits.
    __m128i _mm_srai_epi32 (__m128i a, int imm8) pure @safe
    {
        return __builtin_ia32_psradi128(a, cast(ubyte)imm8);
    }
}
else static if (GDC_with_SSE2)
{
    /// Shift packed 32-bit integers in `a` right by `imm8` while shifting in sign bits.
    __m128i _mm_srai_epi32 (__m128i a, int imm8) pure @safe
    {
        return __builtin_ia32_psradi128(a, cast(ubyte)imm8);
    }
}
else
{
    /// Shift packed 32-bit integers in `a` right by `imm8` while shifting in sign bits.
    __m128i _mm_srai_epi32 (__m128i a, int imm8) pure @trusted
    {
        int4 r = void;

        // Note: the intrinsics guarantee imm8[0..7] is taken, however
        //       D says "It's illegal to shift by the same or more bits 
        //       than the size of the quantity being shifted"
        //       and it's UB instead.
        // Clamping to 31 is equivalent: a 31-bit arithmetic shift already
        // fills the lane with the sign bit.
        ubyte count = cast(ubyte) imm8;
        if (count > 31)
            count = 31;

        r.ptr[0] = (a.array[0] >> count);
        r.ptr[1] = (a.array[1] >> count);
        r.ptr[2] = (a.array[2] >> count);
        r.ptr[3] = (a.array[3] >> count);
        return r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi32(0, 2, 3, -4);
    __m128i B = _mm_srai_epi32(A, 1);
    __m128i B2 = _mm_srai_epi32(A, 1 + 256);
    int[4] expectedB = [ 0, 1, 1, -2];
    assert(B.array == expectedB);
    assert(B2.array == expectedB);

    __m128i C = _mm_srai_epi32(A, 32);
    int[4] expectedC = [ 0, 0, 0, -1];
    assert(C.array == expectedC);

    __m128i D = _mm_srai_epi32(A, 0);
    int[4] expectedD = [ 0, 2, 3, -4];
    assert(D.array == expectedD);
}
3627 
// Deprecated _mm_srl_epi16: logical right shift of packed 16-bit integers
// by the count held in the low 64 bits of `count`, shifting in zeros.
static if (LDC_with_SSE2)
{
    deprecated("Use _mm_srli_epi16 instead.") __m128i _mm_srl_epi16 (__m128i a, __m128i count) pure @safe
    {
        return cast(__m128i) __builtin_ia32_psrlw128(cast(short8)a, cast(short8)count);
    }
}
else static if (GDC_with_SSE2)
{
    deprecated("Use _mm_srli_epi16 instead.") __m128i _mm_srl_epi16 (__m128i a, __m128i count) pure @safe
    {
        return cast(__m128i) __builtin_ia32_psrlw128(cast(short8)a, cast(short8)count);
    }
}
else
{
    deprecated("Use _mm_srli_epi16 instead.") __m128i _mm_srl_epi16 (__m128i a, __m128i count) pure @safe
    {
        short8 sa = cast(short8)a;
        long2 lc = cast(long2)count;
        // Shift amount comes from the low 64-bit lane of `count`.
        int bits = cast(int)(lc.array[0]);
        short8 r = void;
        foreach(i; 0..8)
            r.array[i] = cast(short)(cast(ushort)(sa.array[i]) >> bits);
        return cast(int4)r;
    }
}
3655 
// Deprecated _mm_srl_epi32: logical right shift of packed 32-bit integers
// by the count held in the low 64 bits of `count`, shifting in zeros.
static if (LDC_with_SSE2)
{
    deprecated("Use _mm_srli_epi32 instead.") alias _mm_srl_epi32  = __builtin_ia32_psrld128;
}
else static if (GDC_with_SSE2)
{
    deprecated("Use _mm_srli_epi32 instead.") alias _mm_srl_epi32  = __builtin_ia32_psrld128;
}
else
{
    deprecated("Use _mm_srli_epi32 instead.") __m128i _mm_srl_epi32 (__m128i a, __m128i count) pure @safe
    {
        int4 r = void;
        long2 lc = cast(long2)count;
        // Shift amount comes from the low 64-bit lane of `count`.
        int bits = cast(int)(lc.array[0]);
        r.array[0] = cast(uint)(a.array[0]) >> bits;
        r.array[1] = cast(uint)(a.array[1]) >> bits;
        r.array[2] = cast(uint)(a.array[2]) >> bits;
        r.array[3] = cast(uint)(a.array[3]) >> bits;
        return r;
    }
}
3678 
// Deprecated _mm_srl_epi64: logical right shift of packed 64-bit integers
// by the count held in the low 64 bits of `count`, shifting in zeros.
static if (LDC_with_SSE2)
{
    deprecated("Use _mm_srli_epi64 instead.") __m128i _mm_srl_epi64 (__m128i a, __m128i count) pure @safe
    {
        return cast(__m128i) __builtin_ia32_psrlq128(cast(long2)a, cast(long2)count);
    }
}
else static if (GDC_with_SSE2)
{
    deprecated("Use _mm_srli_epi64 instead.") __m128i _mm_srl_epi64 (__m128i a, __m128i count) pure @safe
    {
        return cast(__m128i) __builtin_ia32_psrlq128(cast(long2)a, cast(long2)count);
    }
}
else
{
    deprecated("Use _mm_srli_epi64 instead.") __m128i _mm_srl_epi64 (__m128i a, __m128i count) pure @safe
    {
        long2 r = void;
        long2 sa = cast(long2)a;
        long2 lc = cast(long2)count;
        // Shift amount comes from the low 64-bit lane of `count`.
        int bits = cast(int)(lc.array[0]);
        r.array[0] = cast(ulong)(sa.array[0]) >> bits;
        r.array[1] = cast(ulong)(sa.array[1]) >> bits;
        return cast(__m128i)r;
    }
}
3706 
3707 
static if (GDC_with_SSE2)
{
    /// Shift packed 16-bit integers in `a` right by `imm8` while shifting in zeros.
    __m128i _mm_srli_epi16 (__m128i a, int imm8) pure @safe
    {
        return cast(__m128i) __builtin_ia32_psrlwi128(cast(short8)a, cast(ubyte)imm8);
    }
}
else static if (LDC_with_SSE2)
{
    /// Shift packed 16-bit integers in `a` right by `imm8` while shifting in zeros.
    __m128i _mm_srli_epi16 (__m128i a, int imm8) pure @safe
    {
        return cast(__m128i) __builtin_ia32_psrlwi128(cast(short8)a, cast(ubyte)imm8);
    }
}
else static if (LDC_with_ARM64)
{
    /// Shift packed 16-bit integers in `a` right by `imm8` while shifting in zeros.
    __m128i _mm_srli_epi16 (__m128i a, int imm8) pure @trusted
    {
        short8 sa = cast(short8)a;
        short8 r = cast(short8) _mm_setzero_si128();

        // Counts >= 16 yield all zeros (hardware behavior) instead of D shift UB.
        ubyte count = cast(ubyte)imm8;
        if (count >= 16)
            return cast(__m128i)r;

        r = sa >>> short8(count); // This facility offered with LDC, but not DMD.
        return cast(__m128i)r;
    }
}
else
{
    /// Shift packed 16-bit integers in `a` right by `imm8` while shifting in zeros.
    __m128i _mm_srli_epi16 (__m128i a, int imm8) pure @safe
    {
        short8 sa = cast(short8)a;
        ubyte count = cast(ubyte)imm8;

        // Counts >= 16 yield all zeros (hardware behavior) instead of D shift UB.
        short8 r = cast(short8) _mm_setzero_si128();
        if (count >= 16)
            return cast(__m128i)r;

        foreach(i; 0..8)
            r.array[i] = cast(short)(cast(ushort)(sa.array[i]) >> count);
        return cast(__m128i)r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi16(0, 1, 2, 3, -4, -5, 6, 7);
    short8 B = cast(short8)( _mm_srli_epi16(A, 1) );
    short8 B2 = cast(short8)( _mm_srli_epi16(A, 1 + 256) );
    short[8] expectedB = [ 0, 0, 1, 1, 0x7FFE, 0x7FFD, 3, 3 ];
    assert(B.array == expectedB);
    assert(B2.array == expectedB);

    short8 C = cast(short8)( _mm_srli_epi16(A, 16) );
    short[8] expectedC = [ 0, 0, 0, 0, 0, 0, 0, 0];
    assert(C.array == expectedC);

    short8 D = cast(short8)( _mm_srli_epi16(A, 0) );
    short[8] expectedD = [ 0, 1, 2, 3, -4, -5, 6, 7 ];
    assert(D.array == expectedD);
}
3774 
3775 
static if (GDC_with_SSE2)
{
    /// Shift packed 32-bit integers in `a` right by `imm8` while shifting in zeros.
    __m128i _mm_srli_epi32 (__m128i a, int imm8) pure @trusted
    {
        return __builtin_ia32_psrldi128(a, cast(ubyte)imm8);
    }
}
else static if (LDC_with_SSE2)
{
    /// Shift packed 32-bit integers in `a` right by `imm8` while shifting in zeros.
    __m128i _mm_srli_epi32 (__m128i a, int imm8) pure @trusted
    {
        return __builtin_ia32_psrldi128(a, cast(ubyte)imm8);
    }
}
else
{
    /// Shift packed 32-bit integers in `a` right by `imm8` while shifting in zeros.
    __m128i _mm_srli_epi32 (__m128i a, int imm8) pure @trusted
    {
        ubyte count = cast(ubyte) imm8;

        // Note: the intrinsics guarantee imm8[0..7] is taken, however
        //       D says "It's illegal to shift by the same or more bits 
        //       than the size of the quantity being shifted"
        //       and it's UB instead.
        // Counts >= 32 yield all zeros, matching hardware behavior.
        int4 r = _mm_setzero_si128();
        if (count >= 32)
            return r;
        r.ptr[0] = a.array[0] >>> count;
        r.ptr[1] = a.array[1] >>> count;
        r.ptr[2] = a.array[2] >>> count;
        r.ptr[3] = a.array[3] >>> count;
        return r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi32(0, 2, 3, -4);
    __m128i B = _mm_srli_epi32(A, 1);
    __m128i B2 = _mm_srli_epi32(A, 1 + 256);
    int[4] expectedB = [ 0, 1, 1, 0x7FFFFFFE];
    assert(B.array == expectedB);
    assert(B2.array == expectedB);
 
    __m128i C = _mm_srli_epi32(A, 255);
    int[4] expectedC = [ 0, 0, 0, 0 ];
    assert(C.array == expectedC);
}
3826 
static if (GDC_with_SSE2)
{
    /// Shift packed 64-bit integers in `a` right by `imm8` while shifting in zeros.
    __m128i _mm_srli_epi64 (__m128i a, int imm8) pure @trusted
    {
        return cast(__m128i) __builtin_ia32_psrlqi128(cast(long2)a, cast(ubyte)imm8);
    }
}
else static if (LDC_with_SSE2)
{
    /// Shift packed 64-bit integers in `a` right by `imm8` while shifting in zeros.
    __m128i _mm_srli_epi64 (__m128i a, int imm8) pure @trusted
    {
        return cast(__m128i) __builtin_ia32_psrlqi128(cast(long2)a, cast(ubyte)imm8);
    }
}
else
{
    /// Shift packed 64-bit integers in `a` right by `imm8` while shifting in zeros.
    __m128i _mm_srli_epi64 (__m128i a, int imm8) pure @trusted
    {
        long2 r = cast(long2) _mm_setzero_si128();
        long2 sa = cast(long2)a;

        // Match the builtin paths above: only imm8[0..7] is considered,
        // hence the truncation to ubyte before the range check.
        ubyte count = cast(ubyte) imm8;
        if (count >= 64)
            return cast(__m128i)r; // shifting by >= 64 would be UB in D; result is all zeros

        r.ptr[0] = sa.array[0] >>> count;
        r.ptr[1] = sa.array[1] >>> count;
        return cast(__m128i)r;
    }
}
unittest
{
    __m128i input = _mm_setr_epi64(8, -4);
    long2 half = cast(long2) _mm_srli_epi64(input, 1);
    long2 halfWrapped = cast(long2) _mm_srli_epi64(input, 1 + 512); // only imm8[0..7] matters
    long[2] expected = [ 4, 0x7FFFFFFFFFFFFFFE];
    assert(half.array == expected);
    assert(halfWrapped.array == expected);

    // Shifting by the full 64-bit width clears both lanes.
    long2 cleared = cast(long2) _mm_srli_epi64(input, 64);
    long[2] allZero = [ 0, 0 ];
    assert(cleared.array == allZero);
}
3873 
/// Shift `v` right by `bytes` bytes while shifting in zeros.
__m128i _mm_srli_si128(ubyte bytes)(__m128i v) pure @safe
{
    static if (bytes & 0xF0)
    {
        // Shifting by 16 bytes or more empties the whole register.
        return _mm_setzero_si128();
    }
    else
    {
        static if (GDC_with_SSE2)
        {
            // The GDC builtin takes the shift amount in bits, hence `bytes * 8`.
            return cast(__m128i) __builtin_ia32_psrldqi128(v, cast(ubyte)(bytes * 8));
        }
        else static if (DMD_with_32bit_asm)
        {
            asm pure nothrow @nogc @trusted
            {
                movdqu XMM0, v;
                psrldq XMM0, bytes; // `bytes` is a template parameter, so a valid immediate
                movdqu v, XMM0;
            }
            return v;
        }
        else
        {
            // Generic path: byte shuffle pulling zeros in from a zero vector.
            return cast(__m128i) shufflevector!(byte16,
                                                bytes+0, bytes+1, bytes+2, bytes+3, bytes+4, bytes+5, bytes+6, bytes+7,
                                                bytes+8, bytes+9, bytes+10, bytes+11, bytes+12, bytes+13, bytes+14, bytes+15)
                                               (cast(byte16) v, cast(byte16)_mm_setzero_si128());
        }
    }

}
3907 
unittest
{
    __m128i shifted = _mm_srli_si128!4(_mm_set_epi32(4, 3, 2, 1));
    int[4] expected = [2, 3, 4, 0];
    assert(shifted.array == expected);

    // A 16-byte shift must clear the register entirely.
    __m128i cleared = _mm_srli_si128!16(_mm_set1_epi32(-1));
    int[4] allZero = [0, 0, 0, 0];
    assert(cleared.array == allZero);
}
3918 
/// Shift `v` right by `bytes` bytes while shifting in zeros.
/// #BONUS
__m128 _mm_srli_ps(ubyte bytes)(__m128 v) pure @safe
{
    // Purely a bit-level operation, so delegate to the integer byte shift.
    return cast(__m128)_mm_srli_si128!bytes(cast(__m128i)v);
}
unittest
{
    __m128 R = _mm_srli_ps!8(_mm_set_ps(4.0f, 3.0f, 2.0f, 1.0f));
    float[4] correct = [3.0f, 4.0f, 0, 0];
    assert(R.array == correct);
}
3931 
/// Shift `v` right by `bytes` bytes while shifting in zeros.
/// #BONUS
__m128d _mm_srli_pd(ubyte bytes)(__m128d v) pure @safe
{
    // Purely a bit-level operation, so delegate to the integer byte shift.
    return cast(__m128d) _mm_srli_si128!bytes(cast(__m128i)v);
}
3938 
/// Store 128-bits (composed of 2 packed double-precision (64-bit) floating-point elements)
/// from `a` into memory. `mem_addr` must be aligned on a 16-byte boundary or a
/// general-protection exception may be generated.
void _mm_store_pd (double* mem_addr, __m128d a) pure @trusted
{
    __m128d* aligned = cast(__m128d*)mem_addr;
    *aligned = a;
}
3944 
/// Store the lower double-precision (64-bit) floating-point element from `a` into
/// 2 contiguous elements in memory. `mem_addr` must be aligned on a 16-byte boundary
/// or a general-protection exception may be generated.
void _mm_store_pd1 (double* mem_addr, __m128d a) pure @trusted
{
    __m128d* aligned = cast(__m128d*)mem_addr;
    __m128d r;
    r.ptr[0] = a.array[0]; // broadcast the low element to both lanes
    r.ptr[1] = a.array[0];
    *aligned = r;
}
3953 
/// Store the lower double-precision (64-bit) floating-point element from `a` into memory.
/// `mem_addr` does not need to be aligned on any particular boundary.
void _mm_store_sd (double* mem_addr, __m128d a) pure @safe
{
    *mem_addr = a.array[0];
}
3958 
/// Store 128-bits of integer data from `a` into memory. `mem_addr` must be aligned
/// on a 16-byte boundary or a general-protection exception may be generated.
void _mm_store_si128 (__m128i* mem_addr, __m128i a) pure @safe
{
    *mem_addr = a;
}
3963 
/// Store the lower double-precision (64-bit) floating-point element from `a` into
/// 2 contiguous elements in memory (same operation as `_mm_store_pd1`).
alias _mm_store1_pd = _mm_store_pd1;
3965 
/// Store the upper double-precision (64-bit) floating-point element from `a` into memory.
void _mm_storeh_pd (double* mem_addr, __m128d a) pure @safe
{
    *mem_addr = a.array[1];
}
3970 
// Note: `mem_addr` doesn't have to actually be aligned, which breaks
// expectations from the user point of view. This problem also exist in C++.
/// Store the low 64-bit integer of `a` into memory.
void _mm_storel_epi64 (__m128i* mem_addr, __m128i a) pure @safe
{
    long* dest = cast(long*)mem_addr;
    long2 la = cast(long2)a;
    *dest = la.array[0]; // only the low 8 bytes are written
}
unittest
{
    long[3] A = [1, 2, 3];
    _mm_storel_epi64(cast(__m128i*)(&A[1]), _mm_set_epi64x(0x1_0000_0000, 0x1_0000_0000));
    long[3] correct = [1, 0x1_0000_0000, 3];
    assert(A == correct);
}
3986 
/// Store the lower double-precision (64-bit) floating-point element from `a` into memory.
void _mm_storel_pd (double* mem_addr, __m128d a) pure @safe
{
    *mem_addr = a.array[0];
}
3991 
/// Store 2 double-precision (64-bit) floating-point elements from `a` into memory
/// in reverse order. `mem_addr` must be aligned on a 16-byte boundary or a
/// general-protection exception may be generated.
void _mm_storer_pd (double* mem_addr, __m128d a) pure @trusted
{
    // @trusted added for consistency with the other stores in this file
    // (e.g. `_mm_store_pd`), which use the same pointer-cast pattern;
    // without it the function was implicitly @system and unusable from @safe code.
    __m128d* aligned = cast(__m128d*)mem_addr;
    *aligned = shufflevector!(double2, 1, 0)(a, a);
}
3997 
/// Store 128-bits (composed of 2 packed double-precision (64-bit) floating-point elements)
/// from `a` into memory. `mem_addr` does not need to be aligned on any particular boundary.
void _mm_storeu_pd (double* mem_addr, __m128d a) pure @safe
{
    storeUnaligned!double2(a, mem_addr);
}
4002 
/// Store 128-bits of integer data from `a` into memory. `mem_addr` does not need
/// to be aligned on any particular boundary.
void _mm_storeu_si128 (__m128i* mem_addr, __m128i a) pure @safe
{
    storeUnaligned!__m128i(a, cast(int*)mem_addr);
}
4007 
/// Store 32-bit integer from the first element of `a` into memory. 
/// `mem_addr` does not need to be aligned on any particular boundary.
void _mm_storeu_si32 (void* mem_addr, __m128i a) pure @trusted
{
    // Only lane 0 of the 4 x int vector is written.
    int* dest = cast(int*)mem_addr;
    *dest = a.array[0];
}
unittest
{
    int[2] arr = [-24, 12];
    _mm_storeu_si32(&arr[1], _mm_setr_epi32(-1, -2, -6, -7));
    assert(arr == [-24, -1]);
}
4021 
/// Store 128-bits (composed of 2 packed double-precision (64-bit) floating-point elements)
/// from a into memory using a non-temporal memory hint. mem_addr must be aligned on a 16-byte
/// boundary or a general-protection exception may be generated.
void _mm_stream_pd (double* mem_addr, __m128d a)
{
    // BUG see `_mm_stream_ps` for an explanation why we don't implement non-temporal moves
    // (performed here as a plain aligned store; the cache hint is dropped).
    __m128d* dest = cast(__m128d*)mem_addr;
    *dest = a;
}
4031 
/// Store 128-bits of integer data from a into memory using a non-temporal memory hint.
/// mem_addr must be aligned on a 16-byte boundary or a general-protection exception
/// may be generated.
void _mm_stream_si128 (__m128i* mem_addr, __m128i a)
{
    // BUG see `_mm_stream_ps` for an explanation why we don't implement non-temporal moves
    // (performed here as a plain aligned store; the cache hint is dropped).
    __m128i* dest = cast(__m128i*)mem_addr;
    *dest = a;
}
4041 
/// Store 32-bit integer a into memory using a non-temporal hint to minimize cache
/// pollution. If the cache line containing address mem_addr is already in the cache,
/// the cache will be updated.
void _mm_stream_si32 (int* mem_addr, int a)
{
    // BUG see `_mm_stream_ps` for an explanation why we don't implement non-temporal moves
    // (performed here as a plain store; the cache hint is dropped).
    *mem_addr = a;
}
4050 
/// Store 64-bit integer a into memory using a non-temporal hint to minimize
/// cache pollution. If the cache line containing address mem_addr is already
/// in the cache, the cache will be updated.
void _mm_stream_si64 (long* mem_addr, long a)
{
    // BUG See `_mm_stream_ps` for an explanation why we don't implement non-temporal moves
    // (performed here as a plain store; the cache hint is dropped).
    *mem_addr = a;
}
4059 
/// Subtract packed 16-bit integers in `b` from packed 16-bit integers in `a`.
__m128i _mm_sub_epi16(__m128i a, __m128i b) pure @safe
{
    return cast(__m128i)(cast(short8)a - cast(short8)b);
}
4064 
/// Subtract packed 32-bit integers in `b` from packed 32-bit integers in `a`.
__m128i _mm_sub_epi32(__m128i a, __m128i b) pure @safe
{
    return cast(__m128i)(cast(int4)a - cast(int4)b);
}
4069 
/// Subtract packed 64-bit integers in `b` from packed 64-bit integers in `a`.
__m128i _mm_sub_epi64(__m128i a, __m128i b) pure @safe
{
    return cast(__m128i)(cast(long2)a - cast(long2)b);
}
4074 
/// Subtract packed 8-bit integers in `b` from packed 8-bit integers in `a`.
__m128i _mm_sub_epi8(__m128i a, __m128i b) pure @safe
{
    return cast(__m128i)(cast(byte16)a - cast(byte16)b);
}
4079 
/// Subtract packed double-precision (64-bit) floating-point elements in `b` from `a`.
__m128d _mm_sub_pd(__m128d a, __m128d b) pure @safe
{
    return a - b;
}
4084 
version(DigitalMars)
{
    // Work-around for https://issues.dlang.org/show_bug.cgi?id=19599
    // NOTE(review): presumably the inline `nop` defeats the DMD miscompilation
    // described in the linked issue — confirm before removing.
    /// Subtract the lower double-precision element in `b` from the lower element of `a`,
    /// store the result in the lower element, and copy the upper element from `a`.
    __m128d _mm_sub_sd(__m128d a, __m128d b) pure @safe
    {
        asm pure nothrow @nogc @trusted { nop;}
        a[0] = a[0] - b[0];
        return a;
    }
}
else static if (GDC_with_SSE2)
{
    /// Subtract the lower double-precision element in `b` from the lower element of `a`,
    /// store the result in the lower element, and copy the upper element from `a`.
    alias _mm_sub_sd = __builtin_ia32_subsd;
}
else
{
    /// Subtract the lower double-precision element in `b` from the lower element of `a`,
    /// store the result in the lower element, and copy the upper element from `a`.
    __m128d _mm_sub_sd(__m128d a, __m128d b) pure @safe
    {
        a.array[0] -= b.array[0];
        return a;
    }
}
unittest
{
    __m128d a = [1.5, -2.0];
    a = _mm_sub_sd(a, a);
    assert(a.array == [0.0, -2.0]);
}
4113 
/// Subtract 64-bit integer `b` from 64-bit integer `a`.
__m64 _mm_sub_si64 (__m64 a, __m64 b) pure @safe
{
    return a - b;
}
4118 
version(LDC)
{
    static if (__VERSION__ >= 2085) // saturation x86 intrinsics disappeared in LLVM 8
    {
        // Generates PSUBSW since LDC 1.15 -O0
        /// Subtract packed 16-bit signed integers in `b` from packed 16-bit signed
        /// integers in `a` using signed saturation.
        __m128i _mm_subs_epi16(__m128i a, __m128i b) pure @trusted
        {
            enum prefix = `declare <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> %a, <8 x i16> %b)`;
            enum ir = `
                %r = call <8 x i16> @llvm.ssub.sat.v8i16( <8 x i16> %0, <8 x i16> %1)
                ret <8 x i16> %r`;
            return cast(__m128i) LDCInlineIREx!(prefix, ir, "", short8, short8, short8)(cast(short8)a, cast(short8)b);
        }
    }
    else static if (LDC_with_ARM) // Raspberry ships with LDC 1.12, no saturation 
    {
        /// Subtract packed 16-bit signed integers in `b` from packed 16-bit signed
        /// integers in `a` using signed saturation.
        __m128i _mm_subs_epi16(__m128i a, __m128i b) pure @trusted
        {
            short[8] res;
            short8 sa = cast(short8)a;
            short8 sb = cast(short8)b;
            foreach(i; 0..8)
                res[i] = saturateSignedIntToSignedShort(sa.array[i] - sb.array[i]);
            return _mm_loadu_si128(cast(int4*)res.ptr);
        }
    }
    else
        alias _mm_subs_epi16 = __builtin_ia32_psubsw128;
}
else
{
    static if (GDC_with_SSE2)
    {
        alias _mm_subs_epi16 = __builtin_ia32_psubsw128;
    }
    else
    {
        /// Subtract packed 16-bit signed integers in `b` from packed 16-bit signed
        /// integers in `a` using signed saturation.
        __m128i _mm_subs_epi16(__m128i a, __m128i b) pure @trusted
        {
            short[8] res;
            short8 sa = cast(short8)a;
            short8 sb = cast(short8)b;
            foreach(i; 0..8)
                res[i] = saturateSignedIntToSignedShort(sa.array[i] - sb.array[i]);
            return _mm_loadu_si128(cast(int4*)res.ptr);
        }
    }
}
unittest
{
    short8 res = cast(short8) _mm_subs_epi16(_mm_setr_epi16(32760, -32760, 5, 4, 3, 2, 1, 0),
                                             _mm_setr_epi16(-10  ,     16, 5, 4, 3, 2, 1, 0));
    static immutable short[8] correctResult =              [32767, -32768, 0, 0, 0, 0, 0, 0];
    assert(res.array == correctResult);
}
4177 
version(LDC)
{
    static if (__VERSION__ >= 2085) // saturation x86 intrinsics disappeared in LLVM 8
    {
        // x86: Generates PSUBSB since LDC 1.15 -O0
        // ARM: Generates sqsub.16b since LDC 1.21 -O0
        /// Subtract packed 8-bit signed integers in `b` from packed 8-bit signed
        /// integers in `a` using signed saturation.
        __m128i _mm_subs_epi8(__m128i a, __m128i b) pure @trusted
        {
            enum prefix = `declare <16 x i8> @llvm.ssub.sat.v16i8(<16 x i8> %a, <16 x i8> %b)`;
            enum ir = `
                %r = call <16 x i8> @llvm.ssub.sat.v16i8( <16 x i8> %0, <16 x i8> %1)
                ret <16 x i8> %r`;
            return cast(__m128i) LDCInlineIREx!(prefix, ir, "", byte16, byte16, byte16)(cast(byte16)a, cast(byte16)b);
        }
    }
    else static if (LDC_with_ARM) // Raspberry ships with LDC 1.12, no saturation 
    {
        /// Subtract packed 8-bit signed integers in `b` from packed 8-bit signed
        /// integers in `a` using signed saturation.
        __m128i _mm_subs_epi8(__m128i a, __m128i b) pure @trusted
        {
            byte[16] res;
            byte16 sa = cast(byte16)a;
            byte16 sb = cast(byte16)b;
            foreach(i; 0..16)
                res[i] = saturateSignedWordToSignedByte(sa.array[i] - sb.array[i]);
            return _mm_loadu_si128(cast(int4*)res.ptr);
        }
    }
    else
        alias _mm_subs_epi8 = __builtin_ia32_psubsb128;
}
else
{
    static if (GDC_with_SSE2)
    {
        alias _mm_subs_epi8 = __builtin_ia32_psubsb128;
    }
    else
    {
        /// Subtract packed 8-bit signed integers in `b` from packed 8-bit signed
        /// integers in `a` using signed saturation.
        __m128i _mm_subs_epi8(__m128i a, __m128i b) pure @trusted
        {
            byte[16] res;
            byte16 sa = cast(byte16)a;
            byte16 sb = cast(byte16)b;
            foreach(i; 0..16)
                res[i] = saturateSignedWordToSignedByte(sa.array[i] - sb.array[i]);
            return _mm_loadu_si128(cast(int4*)res.ptr);
        }
    }
}
unittest
{
    byte16 res = cast(byte16) _mm_subs_epi8(_mm_setr_epi8(-128, 127, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0),
                                            _mm_setr_epi8(  15, -14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0));
    static immutable byte[16] correctResult            = [-128, 127,  0,  0,  0,  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0];
    assert(res.array == correctResult);
}
4237 
version(LDC)
{
    static if (__VERSION__ >= 2085) // saturation x86 intrinsics disappeared in LLVM 8
    {
        // x86: Generates PSUBUSW since LDC 1.15 -O0
        // ARM: Generates uqsub.8h since LDC 1.21 -O0
        /// Subtract packed 16-bit unsigned integers in `b` from packed 16-bit unsigned
        /// integers in `a` using unsigned saturation.
        __m128i _mm_subs_epu16(__m128i a, __m128i b) pure @trusted
        {
            enum prefix = `declare <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> %a, <8 x i16> %b)`;
            enum ir = `
                %r = call <8 x i16> @llvm.usub.sat.v8i16( <8 x i16> %0, <8 x i16> %1)
                ret <8 x i16> %r`;
            return cast(__m128i) LDCInlineIREx!(prefix, ir, "", short8, short8, short8)(cast(short8)a, cast(short8)b);
        }
    }
    else static if (LDC_with_ARM) // Raspberry ships with LDC 1.12, no saturation 
    {
        /// Subtract packed 16-bit unsigned integers in `b` from packed 16-bit unsigned
        /// integers in `a` using unsigned saturation.
        __m128i _mm_subs_epu16(__m128i a, __m128i b) pure @trusted
        {
            short[8] res;
            short8 sa = cast(short8)a;
            short8 sb = cast(short8)b;
            foreach(i; 0..8)
            {
                // Widen to int so the unsigned difference can go negative before clamping.
                int sum = cast(ushort)(sa.array[i]) - cast(ushort)(sb.array[i]);
                res[i] = saturateSignedIntToUnsignedShort(sum);
            }
            return _mm_loadu_si128(cast(int4*)res.ptr);
        }
    }
    else
        alias _mm_subs_epu16 = __builtin_ia32_psubusw128;
}
else
{
    static if (GDC_with_SSE2)
    {
        alias _mm_subs_epu16 = __builtin_ia32_psubusw128;
    }
    else
    {
        /// Subtract packed 16-bit unsigned integers in `b` from packed 16-bit unsigned
        /// integers in `a` using unsigned saturation.
        __m128i _mm_subs_epu16(__m128i a, __m128i b) pure @trusted
        {
            short[8] res;
            short8 sa = cast(short8)a;
            short8 sb = cast(short8)b;
            foreach(i; 0..8)
            {
                // Widen to int so the unsigned difference can go negative before clamping.
                int sum = cast(ushort)(sa.array[i]) - cast(ushort)(sb.array[i]);
                res[i] = saturateSignedIntToUnsignedShort(sum);
            }
            return _mm_loadu_si128(cast(int4*)res.ptr);
        }
    }
}
unittest
{
    short8 R = cast(short8) _mm_subs_epu16(_mm_setr_epi16(cast(short)65534,  1, 5, 4, 3, 2, 1, 0),
                                           _mm_setr_epi16(cast(short)65535, 16, 4, 4, 3, 0, 1, 0));
    static immutable short[8] correct =                  [               0,  0, 1, 0, 0, 2, 0, 0];
    assert(R.array == correct);
}
4303 
version(LDC)
{
    static if (__VERSION__ >= 2085) // saturation x86 intrinsics disappeared in LLVM 8
    {
        // x86: Generates PSUBUSB since LDC 1.15 -O0
        // ARM: Generates uqsub.16b since LDC 1.21 -O0
        /// Subtract packed 8-bit unsigned integers in `b` from packed 8-bit unsigned
        /// integers in `a` using unsigned saturation.
        __m128i _mm_subs_epu8(__m128i a, __m128i b) pure @trusted
        {
            enum prefix = `declare <16 x i8> @llvm.usub.sat.v16i8(<16 x i8> %a, <16 x i8> %b)`;
            enum ir = `
                %r = call <16 x i8> @llvm.usub.sat.v16i8( <16 x i8> %0, <16 x i8> %1)
                ret <16 x i8> %r`;
            return cast(__m128i) LDCInlineIREx!(prefix, ir, "", byte16, byte16, byte16)(cast(byte16)a, cast(byte16)b);
        }
    }
    else static if (LDC_with_ARM) // Raspberry ships with LDC 1.12, no saturation 
    {
         /// Subtract packed 8-bit unsigned integers in `b` from packed 8-bit unsigned
         /// integers in `a` using unsigned saturation.
        __m128i _mm_subs_epu8(__m128i a, __m128i b) pure @trusted
        {
            ubyte[16] res;
            byte16 sa = cast(byte16)a;
            byte16 sb = cast(byte16)b;
            foreach(i; 0..16)
                res[i] = saturateSignedWordToUnsignedByte(cast(ubyte)(sa.array[i]) - cast(ubyte)(sb.array[i]));
            return _mm_loadu_si128(cast(int4*)res.ptr);
        }
    }
    else    
        alias _mm_subs_epu8 = __builtin_ia32_psubusb128;
}
else
{
    static if (GDC_with_SSE2)
    {
        alias _mm_subs_epu8 = __builtin_ia32_psubusb128;
    }
    else
    {
        /// Subtract packed 8-bit unsigned integers in `b` from packed 8-bit unsigned
        /// integers in `a` using unsigned saturation.
        __m128i _mm_subs_epu8(__m128i a, __m128i b) pure @trusted
        {
            ubyte[16] res;
            byte16 sa = cast(byte16)a;
            byte16 sb = cast(byte16)b;
            foreach(i; 0..16)
                res[i] = saturateSignedWordToUnsignedByte(cast(ubyte)(sa.array[i]) - cast(ubyte)(sb.array[i]));
            return _mm_loadu_si128(cast(int4*)res.ptr);
        }
    }
}
unittest
{
    byte16 res = cast(byte16) _mm_subs_epu8(_mm_setr_epi8(cast(byte)254, 127, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0),
                                            _mm_setr_epi8(cast(byte)255, 120, 14, 42, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0));
    static immutable byte[16] correctResult =            [            0,   7,  0,  0,  0,  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0];
    assert(res.array == correctResult);
}
4363 
// Note: the only difference between these intrinsics is the signalling
//       behaviour of quiet NaNs. This is incorrect but the case where
//       you would want to differentiate between qNaN and sNaN and then
//       treat them differently on purpose seems extremely rare.
/// Unordered scalar double comparisons, aliased to the ordered `comi` versions.
alias _mm_ucomieq_sd = _mm_comieq_sd;
/// ditto
alias _mm_ucomige_sd = _mm_comige_sd;
/// ditto
alias _mm_ucomigt_sd = _mm_comigt_sd;
/// ditto
alias _mm_ucomile_sd = _mm_comile_sd;
/// ditto
alias _mm_ucomilt_sd = _mm_comilt_sd;
/// ditto
alias _mm_ucomineq_sd = _mm_comineq_sd;
4374 
/// Return a vector of type `__m128d` with indeterminate element values.
__m128d _mm_undefined_pd() pure @safe
{
    __m128d result = void; // deliberately uninitialized
    return result;
}
/// Return a vector of type `__m128i` with indeterminate element values.
__m128i _mm_undefined_si128() pure @safe
{
    __m128i result = void; // deliberately uninitialized
    return result;
}
4385 
/// Unpack and interleave 16-bit integers from the high half of `a` and `b`.
__m128i _mm_unpackhi_epi16 (__m128i a, __m128i b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_punpckhwd128(a, b);
    }
    else static if (DMD_with_32bit_asm)
    {
        asm pure nothrow @nogc @trusted
        {
            movdqu XMM0, a;
            movdqu XMM1, b;
            punpckhwd XMM0, XMM1;
            movdqu a, XMM0;
        }
        return a;
    }
    else
    {
        // [a4, b4, a5, b5, a6, b6, a7, b7]
        return cast(__m128i) shufflevector!(short8, 4, 12, 5, 13, 6, 14, 7, 15)
                                           (cast(short8)a, cast(short8)b);
    }
}
unittest
{
    __m128i A = _mm_setr_epi16(4,   5,  6,  7,  8,  9, 10, 11);
    __m128i B = _mm_setr_epi16(12, 13, 14, 15, 16, 17, 18, 19);
    short8 C = cast(short8)(_mm_unpackhi_epi16(A, B));
    short[8] correct = [8, 16, 9, 17, 10, 18, 11, 19];
    assert(C.array == correct);
}
4417 
/// Unpack and interleave 32-bit integers from the high half of `a` and `b`.
__m128i _mm_unpackhi_epi32 (__m128i a, __m128i b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_punpckhdq128(a, b);
    }
    else
    {
        // [a2, b2, a3, b3]
        return shufflevector!(int4, 2, 6, 3, 7)(cast(int4)a, cast(int4)b);
    }
}
4429 
/// Unpack and interleave 64-bit integers from the high half of `a` and `b`.
__m128i _mm_unpackhi_epi64 (__m128i a, __m128i b) pure @trusted
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_punpckhqdq128(a, b);
    }
    else
    {
        // Result = [a.hi64, b.hi64]: start from b (its high 64-bit already sits
        // in int lanes 2-3) and overwrite lanes 0-1 with a's high 64-bit.
        __m128i r = cast(__m128i)b;
        r[0] = a[2];
        r[1] = a[3];
        return r; 
    }
}
unittest // Issue #36
{
    __m128i A = _mm_setr_epi64(0x22222222_22222222, 0x33333333_33333333);
    __m128i B = _mm_setr_epi64(0x44444444_44444444, 0x55555555_55555555);
    long2 C = cast(long2)(_mm_unpackhi_epi64(A, B));
    long[2] correct = [0x33333333_33333333, 0x55555555_55555555];
    assert(C.array == correct);
}
4452 
/// Unpack and interleave 8-bit integers from the high half of `a` and `b`.
__m128i _mm_unpackhi_epi8 (__m128i a, __m128i b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_punpckhbw128(a, b);
    }
    else static if (DMD_with_32bit_asm)
    {
        asm pure nothrow @nogc @trusted
        {
            movdqu XMM0, a;
            movdqu XMM1, b;
            punpckhbw XMM0, XMM1;
            movdqu a, XMM0;
        }
        return a;
    }
    else
    {
        // [a8, b8, a9, b9, ..., a15, b15]
        return cast(__m128i)shufflevector!(byte16, 8,  24,  9, 25, 10, 26, 11, 27,
                                                   12, 28, 13, 29, 14, 30, 15, 31)
                                                   (cast(byte16)a, cast(byte16)b);
    }
}
4477 
/// Unpack and interleave double-precision (64-bit) floating-point elements
/// from the high half of `a` and `b`.
__m128d _mm_unpackhi_pd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_unpckhpd(a, b);
    }
    else
    {
        // [a1, b1]
        return shufflevector!(__m128d, 1, 3)(a, b);
    }
}
4489 
/// Unpack and interleave 16-bit integers from the low half of `a` and `b`.
__m128i _mm_unpacklo_epi16 (__m128i a, __m128i b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_punpcklwd128(a, b);
    }
    else static if (DMD_with_32bit_asm)
    {
        asm pure nothrow @nogc @trusted
        {
            movdqu XMM0, a;
            movdqu XMM1, b;
            punpcklwd XMM0, XMM1;
            movdqu a, XMM0;
        }
        return a;
    }
    else
    {
        // [a0, b0, a1, b1, a2, b2, a3, b3]
        return cast(__m128i) shufflevector!(short8, 0, 8, 1, 9, 2, 10, 3, 11)
                                           (cast(short8)a, cast(short8)b);
    }
}
4513 
/// Unpack and interleave 32-bit integers from the low half of `a` and `b`.
__m128i _mm_unpacklo_epi32 (__m128i a, __m128i b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_punpckldq128(a, b);
    }
    else
    {
        // [a0, b0, a1, b1]
        return shufflevector!(int4, 0, 4, 1, 5)
                             (cast(int4)a, cast(int4)b);
    }
}
4526 
/// Unpack and interleave 64-bit integers from the low half of `a` and `b`.
__m128i _mm_unpacklo_epi64 (__m128i a, __m128i b) pure @trusted
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_punpcklqdq128(a, b);
    }
    else
    {
        // Result = [a.lo64, b.lo64]
        long2 lA = cast(long2)a;
        long2 lB = cast(long2)b;
        long2 R;
        R.ptr[0] = lA.array[0];
        R.ptr[1] = lB.array[0];
        return cast(__m128i)R;
    }
}
unittest // Issue #36
{
    __m128i A = _mm_setr_epi64(0x22222222_22222222, 0x33333333_33333333);
    __m128i B = _mm_setr_epi64(0x44444444_44444444, 0x55555555_55555555);
    long2 C = cast(long2)(_mm_unpacklo_epi64(A, B));
    long[2] correct = [0x22222222_22222222, 0x44444444_44444444];
    assert(C.array == correct);
}
4551 
4552 
/// Unpack and interleave 8-bit integers from the low half of `a` and `b`.
__m128i _mm_unpacklo_epi8 (__m128i a, __m128i b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_punpcklbw128(a, b);
    }
    else static if (DMD_with_32bit_asm)
    {
        asm pure nothrow @nogc @trusted
        {
            movdqu XMM0, a;
            movdqu XMM1, b;
            punpcklbw XMM0, XMM1;
            movdqu a, XMM0;
        }
        return a;
    }
    else
    {
        // [a0, b0, a1, b1, ..., a7, b7]
        return cast(__m128i) shufflevector!(byte16, 0, 16, 1, 17, 2, 18, 3, 19,
                                                    4, 20, 5, 21, 6, 22, 7, 23)
                                           (cast(byte16)a, cast(byte16)b);
    }
}
4577 
/// Unpack and interleave double-precision (64-bit) floating-point elements
/// from the low half of `a` and `b`.
__m128d _mm_unpacklo_pd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_unpcklpd(a, b);
    }
    else
    {
        // [a0, b0]
        return shufflevector!(__m128d, 0, 2)(a, b);
    }
}
4589 
/// Compute the bitwise XOR of packed double-precision (64-bit) floating-point
/// elements in `a` and `b`.
__m128d _mm_xor_pd (__m128d a, __m128d b) pure @safe
{
    return cast(__m128d)(cast(__m128i)a ^ cast(__m128i)b);
}
4594 
/// Compute the bitwise XOR of 128 bits (representing integer data) in `a` and `b`.
__m128i _mm_xor_si128 (__m128i a, __m128i b) pure @safe
{
    return a ^ b;
}
4599 
unittest
{
    // 4D Euclidean distance computed with the SSE wrappers defined in this module.
    float distance(float[4] a, float[4] b) nothrow @nogc
    {
        __m128 pa = _mm_loadu_ps(a.ptr);
        __m128 pb = _mm_loadu_ps(b.ptr);
        __m128 diff = _mm_sub_ps(pa, pb);
        __m128 squared = _mm_mul_ps(diff, diff);
        // Horizontal sum of the four squared lanes via two byte-shift-and-add steps.
        __m128 acc = _mm_add_ps(squared, _mm_srli_ps!8(squared));
        acc = _mm_add_ps(acc, _mm_srli_ps!4(acc));
        return _mm_cvtss_f32(_mm_sqrt_ss(acc));
    }
    assert(distance([0, 2, 0, 0], [0, 0, 0, 0]) == 2);
}