1 /**
2 * SSE2 intrinsics. 
3 * https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#techs=SSE2
4 *
5 * Copyright: Copyright Guillaume Piolat 2016-2020, Stefanos Baziotis 2019.
6 * License:   $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
7 */
8 module inteli.emmintrin;
9 
10 public import inteli.types;
11 public import inteli.xmmintrin; // SSE2 includes SSE1
12 import inteli.mmx;
13 import inteli.internals;
14 
15 nothrow @nogc:
16 
17 
18 // SSE2 instructions
19 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#techs=SSE2
20 
21 /// Add packed 16-bit integers in `a` and `b`.
22 __m128i _mm_add_epi16 (__m128i a, __m128i b) pure @safe
23 {
24     pragma(inline, true);
25     return cast(__m128i)(cast(short8)a + cast(short8)b);
26 }
27 unittest
28 {
29     __m128i A = _mm_setr_epi16(4, 8, 13, -7, -1, 0, 9, 77);
30     short8 R = cast(short8) _mm_add_epi16(A, A);
31     short[8] correct = [8, 16, 26, -14, -2, 0, 18, 154];
32     assert(R.array == correct);
33 }
34 
35 /// Add packed 32-bit integers in `a` and `b`.
36 __m128i _mm_add_epi32 (__m128i a, __m128i b) pure @safe
37 {
38     pragma(inline, true);
39     return cast(__m128i)(cast(int4)a + cast(int4)b);
40 }
41 unittest
42 {
43     __m128i A = _mm_setr_epi32( -7, -1, 0, 9);
44     int4 R = _mm_add_epi32(A, A);
45     int[4] correct = [ -14, -2, 0, 18 ];
46     assert(R.array == correct);
47 }
48 
49 /// Add packed 64-bit integers in `a` and `b`.
50 __m128i _mm_add_epi64 (__m128i a, __m128i b) pure @safe
51 {
52     pragma(inline, true);
53     return cast(__m128i)(cast(long2)a + cast(long2)b);
54 }
55 unittest
56 {
57     __m128i A = _mm_setr_epi64(-1, 0x8000_0000_0000_0000);
58     long2 R = cast(long2) _mm_add_epi64(A, A);
59     long[2] correct = [ -2, 0 ];
60     assert(R.array == correct);
61 }
62 
63 /// Add packed 8-bit integers in `a` and `b`.
64 __m128i _mm_add_epi8 (__m128i a, __m128i b) pure @safe
65 {
66     pragma(inline, true);
67     return cast(__m128i)(cast(byte16)a + cast(byte16)b);
68 }
69 unittest
70 {
71     __m128i A = _mm_setr_epi8(4, 8, 13, -7, -1, 0, 9, 77, 4, 8, 13, -7, -1, 0, 9, 78);
72     byte16 R = cast(byte16) _mm_add_epi8(A, A);
73     byte[16] correct = [8, 16, 26, -14, -2, 0, 18, -102, 8, 16, 26, -14, -2, 0, 18, -100];
74     assert(R.array == correct);
75 }
76 
/// Add the lower double-precision (64-bit) floating-point element
/// in `a` and `b`, store the result in the lower element of result,
/// and copy the upper element from `a` to the upper element of result.
80 __m128d _mm_add_sd(__m128d a, __m128d b) pure @safe
81 {
82     static if (GDC_with_SSE2)
83     {
84         return __builtin_ia32_addsd(a, b);
85     }
86     else version(DigitalMars)
87     {
88         // Work-around for https://issues.dlang.org/show_bug.cgi?id=19599
        // Note: this workaround seems unneeded since DMD >= 2.094.0 at least, but hasn't been re-investigated.
90         asm pure nothrow @nogc @trusted { nop;}
91         a[0] = a[0] + b[0];
92         return a;
93     }
94     else
95     {
96         a[0] += b[0];
97         return a;
98     }
99 }
100 unittest
101 {
102     __m128d a = [1.5, -2.0];
103     a = _mm_add_sd(a, a);
104     assert(a.array == [3.0, -2.0]);
105 }
106 
107 /// Add packed double-precision (64-bit) floating-point elements in `a` and `b`.
108 __m128d _mm_add_pd (__m128d a, __m128d b) pure @safe
109 {
110     pragma(inline, true);
111     return a + b;
112 }
113 unittest
114 {
115     __m128d a = [1.5, -2.0];
116     a = _mm_add_pd(a, a);
117     assert(a.array == [3.0, -4.0]);
118 }
119 
120 /// Add 64-bit integers `a` and `b`.
121 __m64 _mm_add_si64 (__m64 a, __m64 b) pure @safe
122 {
123     pragma(inline, true);
124     return a + b;
125 }
126 
127 /// Add packed 16-bit integers in `a` and `b` using signed saturation.
128 __m128i _mm_adds_epi16(__m128i a, __m128i b) pure @trusted
129 {
130     static if (GDC_with_SSE2)
131     {
132         return cast(__m128i)__builtin_ia32_paddsw128(cast(short8)a, cast(short8)b);
133     }
134     else version(LDC)
135     {
136         static if (__VERSION__ >= 2085) // saturation x86 intrinsics disappeared in LLVM 8
137         {
138             // x86: Generates PADDSW since LDC 1.15 -O0
139             // ARM: Generates sqadd.8h since LDC 1.21 -O1, really bad in <= 1.20            
140             enum prefix = `declare <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> %a, <8 x i16> %b)`;
141             enum ir = `
142                 %r = call <8 x i16> @llvm.sadd.sat.v8i16( <8 x i16> %0, <8 x i16> %1)
143                 ret <8 x i16> %r`;
144             return cast(__m128i) LDCInlineIREx!(prefix, ir, "", short8, short8, short8)(cast(short8)a, cast(short8)b);
145         }
        else static if (LDC_with_ARM) // Raspberry Pi OS ships with LDC 1.12, which lacks saturation intrinsics
147         {
148             short[8] res; // PERF =void;
149             short8 sa = cast(short8)a;
150             short8 sb = cast(short8)b;
151             foreach(i; 0..8)
152                 res[i] = saturateSignedIntToSignedShort(sa.array[i] + sb.array[i]);
153             return _mm_loadu_si128(cast(int4*)res.ptr);
154         }
155         else
156             return cast(__m128i) __builtin_ia32_paddsw128(cast(short8)a, cast(short8)b);
157     }
158     else
159     {
160         short[8] res; // PERF =void;
161         short8 sa = cast(short8)a;
162         short8 sb = cast(short8)b;
163         foreach(i; 0..8)
164             res[i] = saturateSignedIntToSignedShort(sa.array[i] + sb.array[i]);
165         return _mm_loadu_si128(cast(int4*)res.ptr);
166     }
167 }
168 unittest
169 {
170     short8 res = cast(short8) _mm_adds_epi16(_mm_set_epi16(7, 6, 5, 4, 3, 2, 1, 0),
171                                              _mm_set_epi16(7, 6, 5, 4, 3, 2, 1, 0));
172     static immutable short[8] correctResult = [0, 2, 4, 6, 8, 10, 12, 14];
173     assert(res.array == correctResult);
174 }
175 
176 /// Add packed 8-bit signed integers in `a` and `b` using signed saturation.
177 __m128i _mm_adds_epi8(__m128i a, __m128i b) pure @trusted
178 {
179     static if (GDC_with_SSE2)
180     {
181         return cast(__m128i) __builtin_ia32_paddsb128(cast(ubyte16)a, cast(ubyte16)b);
182     }
183     else version(LDC)
184     {
185         static if (__VERSION__ >= 2085) // saturation x86 intrinsics disappeared in LLVM 8
186         {
187             // x86: Generates PADDSB since LDC 1.15 -O0
188             // ARM: Generates sqadd.16b since LDC 1.21 -O1, really bad in <= 1.20
189             enum prefix = `declare <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8> %a, <16 x i8> %b)`;
190             enum ir = `
191                 %r = call <16 x i8> @llvm.sadd.sat.v16i8( <16 x i8> %0, <16 x i8> %1)
192                 ret <16 x i8> %r`;
193             return cast(__m128i) LDCInlineIREx!(prefix, ir, "", byte16, byte16, byte16)(cast(byte16)a, cast(byte16)b);
194         }
        else static if (LDC_with_ARM) // Raspberry Pi OS ships with LDC 1.12, which lacks saturation intrinsics
196         {
197             byte[16] res; // PERF =void;
198             byte16 sa = cast(byte16)a;
199             byte16 sb = cast(byte16)b;
200             foreach(i; 0..16)
201                 res[i] = saturateSignedWordToSignedByte(sa[i] + sb[i]);
202             return _mm_loadu_si128(cast(int4*)res.ptr);
203         }
204         else
205             return cast(__m128i) __builtin_ia32_paddsb128(cast(byte16)a, cast(byte16)b);
206     }
207     else
208     {
209         byte[16] res; // PERF =void;
210         byte16 sa = cast(byte16)a;
211         byte16 sb = cast(byte16)b;
212         foreach(i; 0..16)
213             res[i] = saturateSignedWordToSignedByte(sa[i] + sb[i]);
214         return _mm_loadu_si128(cast(int4*)res.ptr);
215     }
216 }
217 unittest
218 {
219     byte16 res = cast(byte16) _mm_adds_epi8(_mm_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0),
220                                             _mm_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0));
221     static immutable byte[16] correctResult = [0, 2, 4, 6, 8, 10, 12, 14,
222                                                16, 18, 20, 22, 24, 26, 28, 30];
223     assert(res.array == correctResult);
224 }
225 
226 /// Add packed 8-bit unsigned integers in `a` and `b` using unsigned saturation.
// PERF: GDC version?
228 __m128i _mm_adds_epu8(__m128i a, __m128i b) pure @trusted
229 {
230     version(LDC)
231     {
232         static if (__VERSION__ >= 2085) // saturation x86 intrinsics disappeared in LLVM 8
233         {
234             // x86: Generates PADDUSB since LDC 1.15 -O0
235             // ARM: Generates uqadd.16b since LDC 1.21 -O1
236             enum prefix = `declare <16 x i8> @llvm.uadd.sat.v16i8(<16 x i8> %a, <16 x i8> %b)`;
237             enum ir = `
238                 %r = call <16 x i8> @llvm.uadd.sat.v16i8( <16 x i8> %0, <16 x i8> %1)
239                 ret <16 x i8> %r`;
240             return cast(__m128i) LDCInlineIREx!(prefix, ir, "", byte16, byte16, byte16)(cast(byte16)a, cast(byte16)b);
241         }
        else static if (LDC_with_ARM) // Raspberry Pi OS ships with LDC 1.12, which lacks saturation intrinsics
243         {
244             ubyte[16] res; // PERF =void;
245             byte16 sa = cast(byte16)a;
246             byte16 sb = cast(byte16)b;
247             foreach(i; 0..16)
248                 res[i] = saturateSignedWordToUnsignedByte(cast(ubyte)(sa.array[i]) + cast(ubyte)(sb.array[i]));
249             return _mm_loadu_si128(cast(int4*)res.ptr);
250         }
251         else
252             return __builtin_ia32_paddusb128(a, b);
253     }
254     else
255     {
256         ubyte[16] res; // PERF =void;
257         byte16 sa = cast(byte16)a;
258         byte16 sb = cast(byte16)b;
259         foreach(i; 0..16)
260             res[i] = saturateSignedWordToUnsignedByte(cast(ubyte)(sa.array[i]) + cast(ubyte)(sb.array[i]));
261         return _mm_loadu_si128(cast(int4*)res.ptr);
262     }
263 }
264 unittest
265 {
266     byte16 res = cast(byte16) 
267         _mm_adds_epu8(_mm_set_epi8(7, 6, 5, 4, 3, 2, cast(byte)255, 0, 7, 6, 5, 4, 3, 2, cast(byte)255, 0),
268                       _mm_set_epi8(7, 6, 5, 4, 3, 2, 1, 0, 7, 6, 5, 4, 3, 2, 1, 0));
269     static immutable byte[16] correctResult = [0, cast(byte)255, 4, 6, 8, 10, 12, 14, 
270                                                0, cast(byte)255, 4, 6, 8, 10, 12, 14];
271     assert(res.array == correctResult);
272 }
273 
274 /// Add packed unsigned 16-bit integers in `a` and `b` using unsigned saturation.
// PERF: GDC version?
276 __m128i _mm_adds_epu16(__m128i a, __m128i b) pure @trusted
277 {
278     version(LDC)
279     {
280         static if (__VERSION__ >= 2085) // saturation x86 intrinsics disappeared in LLVM 8
281         {
282             // x86: Generates PADDUSW since LDC 1.15 -O0
283             // ARM: Generates uqadd.8h since LDC 1.21 -O1
284             enum prefix = `declare <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> %a, <8 x i16> %b)`;
285             enum ir = `
286                 %r = call <8 x i16> @llvm.uadd.sat.v8i16( <8 x i16> %0, <8 x i16> %1)
287                 ret <8 x i16> %r`;
288             return cast(__m128i) LDCInlineIREx!(prefix, ir, "", short8, short8, short8)(cast(short8)a, cast(short8)b);
289         }
        else static if (LDC_with_ARM) // Raspberry Pi OS ships with LDC 1.12, which lacks saturation intrinsics
291         {
292             ushort[8] res; // PERF =void;
293             short8 sa = cast(short8)a;
294             short8 sb = cast(short8)b;
295             foreach(i; 0..8)
296                 res[i] = saturateSignedIntToUnsignedShort(cast(ushort)(sa.array[i]) + cast(ushort)(sb.array[i]));
297             return _mm_loadu_si128(cast(int4*)res.ptr);
298         }
299         else
300             return __builtin_ia32_paddusw128(a, b);
301     }
302     else
303     {
304         ushort[8] res; // PERF =void;
305         short8 sa = cast(short8)a;
306         short8 sb = cast(short8)b;
307         foreach(i; 0..8)
308             res[i] = saturateSignedIntToUnsignedShort(cast(ushort)(sa.array[i]) + cast(ushort)(sb.array[i]));
309         return _mm_loadu_si128(cast(int4*)res.ptr);
310     }
311 }
312 unittest
313 {
314     short8 res = cast(short8) _mm_adds_epu16(_mm_set_epi16(3, 2, cast(short)65535, 0, 3, 2, cast(short)65535, 0),
315                                              _mm_set_epi16(3, 2, 1, 0, 3, 2, 1, 0));
316     static immutable short[8] correctResult = [0, cast(short)65535, 4, 6, 0, cast(short)65535, 4, 6];
317     assert(res.array == correctResult);
318 }
319 
320 /// Compute the bitwise AND of packed double-precision (64-bit) 
321 /// floating-point elements in `a` and `b`.
322 __m128d _mm_and_pd (__m128d a, __m128d b) pure @safe
323 {
324     pragma(inline, true);
325     return cast(__m128d)( cast(long2)a & cast(long2)b );
326 }
327 unittest
328 {
329     double a = 4.32;
330     double b = -78.99;
331     long correct = (*cast(long*)(&a)) & (*cast(long*)(&b));
332     __m128d A = _mm_set_pd(a, b);
333     __m128d B = _mm_set_pd(b, a);
334     long2 R = cast(long2)( _mm_and_pd(A, B) );
335     assert(R.array[0] == correct);
336     assert(R.array[1] == correct);
337 }
338 
339 /// Compute the bitwise AND of 128 bits (representing integer data) in `a` and `b`.
340 __m128i _mm_and_si128 (__m128i a, __m128i b) pure @safe
341 {
342     pragma(inline, true);
343     return a & b;
344 }
345 unittest
346 {
347     __m128i A = _mm_set1_epi32(7);
348     __m128i B = _mm_set1_epi32(14);
349     __m128i R = _mm_and_si128(A, B);
350     int[4] correct = [6, 6, 6, 6];
351     assert(R.array == correct);
352 }
353 
354 /// Compute the bitwise NOT of packed double-precision (64-bit) 
355 /// floating-point elements in `a` and then AND with `b`.
356 __m128d _mm_andnot_pd (__m128d a, __m128d b) pure @safe
357 {
358     return cast(__m128d)( ~(cast(long2)a) & cast(long2)b);
359 }
360 unittest
361 {
362     double a = 4.32;
363     double b = -78.99;
364     long correct  = (~*cast(long*)(&a)) & ( *cast(long*)(&b));
365     long correct2 = ( *cast(long*)(&a)) & (~*cast(long*)(&b));
366     __m128d A = _mm_setr_pd(a, b);
367     __m128d B = _mm_setr_pd(b, a);
368     long2 R = cast(long2)( _mm_andnot_pd(A, B) );
369     assert(R.array[0] == correct);
370     assert(R.array[1] == correct2);
371 }
372 
373 /// Compute the bitwise NOT of 128 bits (representing integer data) 
374 /// in `a` and then AND with `b`.
375 __m128i _mm_andnot_si128 (__m128i a, __m128i b) pure @safe
376 {
377     return (~a) & b;
378 }
379 unittest
380 {
381     __m128i A = _mm_set1_epi32(7);
382     __m128i B = _mm_set1_epi32(14);
383     __m128i R = _mm_andnot_si128(A, B);
384     int[4] correct = [8, 8, 8, 8];
385     assert(R.array == correct);
386 }
387 
388 /// Average packed unsigned 16-bit integers in `a` and `b`.
389 __m128i _mm_avg_epu16 (__m128i a, __m128i b) pure @trusted
390 {
391     static if (GDC_with_SSE2)
392     {
393         return cast(__m128i) __builtin_ia32_pavgw128(cast(short8)a, cast(short8)b);
394     }
395     else static if (LDC_with_ARM64)
396     {
397         return cast(__m128i) vrhadd_u16(cast(short8)a, cast(short8)b);
398     }
399     else version(LDC)
400     {
        // Generates pavgw even in LDC 1.0, even in -O0
        // But not on ARM.
403         enum ir = `
404             %ia = zext <8 x i16> %0 to <8 x i32>
405             %ib = zext <8 x i16> %1 to <8 x i32>
406             %isum = add <8 x i32> %ia, %ib
407             %isum1 = add <8 x i32> %isum, < i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
408             %isums = lshr <8 x i32> %isum1, < i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
409             %r = trunc <8 x i32> %isums to <8 x i16>
410             ret <8 x i16> %r`;
411         return cast(__m128i) LDCInlineIR!(ir, short8, short8, short8)(cast(short8)a, cast(short8)b);
412     }
413     else
414     {
415         short8 sa = cast(short8)a;
416         short8 sb = cast(short8)b;
417         short8 sr = void;
418         foreach(i; 0..8)
419         {
420             sr.ptr[i] = cast(ushort)( (cast(ushort)(sa.array[i]) + cast(ushort)(sb.array[i]) + 1) >> 1 );
421         }
422         return cast(int4)sr;
423     }
424 }
425 unittest
426 {
427     __m128i A = _mm_set1_epi16(31);
428     __m128i B = _mm_set1_epi16(64);
429     short8 avg = cast(short8)(_mm_avg_epu16(A, B));
430     foreach(i; 0..8)
431         assert(avg.array[i] == 48);
432 }
433 
434 /// Average packed unsigned 8-bit integers in `a` and `b`.
435 __m128i _mm_avg_epu8 (__m128i a, __m128i b) pure @trusted
436 {
437     static if (GDC_with_SSE2)
438     {
439         return cast(__m128i) __builtin_ia32_pavgb128(cast(ubyte16)a, cast(ubyte16)b);
440     }
441     else static if (LDC_with_ARM64)
442     {
443         return cast(__m128i) vrhadd_u8(cast(byte16)a, cast(byte16)b);
444     }
445     else version(LDC)
446     {
        // Generates pavgb even in LDC 1.0, even in -O0
        // But not on ARM.
449         enum ir = `
450             %ia = zext <16 x i8> %0 to <16 x i16>
451             %ib = zext <16 x i8> %1 to <16 x i16>
452             %isum = add <16 x i16> %ia, %ib
453             %isum1 = add <16 x i16> %isum, < i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
454             %isums = lshr <16 x i16> %isum1, < i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
455             %r = trunc <16 x i16> %isums to <16 x i8>
456             ret <16 x i8> %r`;
457         return cast(__m128i) LDCInlineIR!(ir, byte16, byte16, byte16)(cast(byte16)a, cast(byte16)b);
458     }
459     else
460     {
461         byte16 sa = cast(byte16)a;
462         byte16 sb = cast(byte16)b;
463         byte16 sr = void;
464         foreach(i; 0..16)
465         {
466             sr[i] = cast(ubyte)( (cast(ubyte)(sa[i]) + cast(ubyte)(sb[i]) + 1) >> 1 );
467         }
468         return cast(int4)sr;
469     }
470 }
471 unittest
472 {
473     __m128i A = _mm_set1_epi8(31);
474     __m128i B = _mm_set1_epi8(64);
475     byte16 avg = cast(byte16)(_mm_avg_epu8(A, B));
476     foreach(i; 0..16)
477         assert(avg.array[i] == 48);
478 }
479 
480 /// Shift `a` left by `bytes` bytes while shifting in zeros.
481 alias _mm_bslli_si128 = _mm_slli_si128;
482 unittest
483 {
484     __m128i toShift = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
485     byte[16] exact =               [0, 0, 0, 0, 0, 0, 1, 2, 3, 4,  5,  6,  7,  8,  9, 10];
486     __m128i result = _mm_bslli_si128!5(toShift);
487     assert( (cast(byte16)result).array == exact);
488 }
489 
/// Shift `a` right by `bytes` bytes while shifting in zeros.
491 alias _mm_bsrli_si128 = _mm_srli_si128;
492 unittest
493 {
494     __m128i toShift = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
495     byte[16] exact =               [5, 6, 7, 8, 9,10,11,12,13,14, 15,  0,  0,  0,  0,  0];
496     __m128i result = _mm_bsrli_si128!5(toShift);
497     assert( (cast(byte16)result).array == exact);
498 }
499 
500 /// Cast vector of type `__m128d` to type `__m128`. 
501 /// Note: Also possible with a regular `cast(__m128)(a)`.
502 __m128 _mm_castpd_ps (__m128d a) pure @safe
503 {
504     return cast(__m128)a;
505 }
506 
507 /// Cast vector of type `__m128d` to type `__m128i`. 
508 /// Note: Also possible with a regular `cast(__m128i)(a)`.
509 __m128i _mm_castpd_si128 (__m128d a) pure @safe
510 {
511     return cast(__m128i)a;
512 }
513 
514 /// Cast vector of type `__m128` to type `__m128d`. 
515 /// Note: Also possible with a regular `cast(__m128d)(a)`.
516 __m128d _mm_castps_pd (__m128 a) pure @safe
517 {
518     return cast(__m128d)a;
519 }
520 
521 /// Cast vector of type `__m128` to type `__m128i`. 
522 /// Note: Also possible with a regular `cast(__m128i)(a)`.
523 __m128i _mm_castps_si128 (__m128 a) pure @safe
524 {
525     return cast(__m128i)a;
526 }
527 
528 /// Cast vector of type `__m128i` to type `__m128d`. 
529 /// Note: Also possible with a regular `cast(__m128d)(a)`.
530 __m128d _mm_castsi128_pd (__m128i a) pure @safe
531 {
532     return cast(__m128d)a;
533 }
534 
535 /// Cast vector of type `__m128i` to type `__m128`. 
536 /// Note: Also possible with a regular `cast(__m128)(a)`.
537 __m128 _mm_castsi128_ps (__m128i a) pure @safe
538 {
539     return cast(__m128)a;
540 }
541 
542 /// Invalidate and flush the cache line that contains `p` 
543 /// from all levels of the cache hierarchy.
544 void _mm_clflush (const(void)* p) @trusted
545 {
546     static if (GDC_with_SSE2)
547     {
548         __builtin_ia32_clflush(p);
549     }
550     else static if (LDC_with_SSE2)
551     {
552         __builtin_ia32_clflush(cast(void*)p);
553     }
554     else version(D_InlineAsm_X86)
555     {
556         asm pure nothrow @nogc @safe
557         {
558             mov EAX, p;
559             clflush [EAX];
560         }
561     }
562     else version(D_InlineAsm_X86_64)
563     {
564         asm pure nothrow @nogc @safe
565         {
566             mov RAX, p;
567             clflush [RAX];
568         }
569     }
570     else 
571     {
        // Do nothing: invalidating the cache line is only a performance
        // hint, so skipping it does not affect correctness.
574     }
575 }
576 unittest
577 {
578     ubyte[64] cacheline;
579     _mm_clflush(cacheline.ptr);
580 }
581 
582 /// Compare packed 16-bit integers in `a` and `b` for equality.
583 __m128i _mm_cmpeq_epi16 (__m128i a, __m128i b) pure @safe
584 {
585     static if (GDC_with_SSE2)
586     {
587         return cast(__m128i) __builtin_ia32_pcmpeqw128(cast(short8)a, cast(short8)b);
588     }
589     else
590     {
591         return cast(__m128i) equalMask!short8(cast(short8)a, cast(short8)b);
592     }
593 }
594 unittest
595 {
596     short8   A = [-3, -2, -1,  0,  0,  1,  2,  3];
597     short8   B = [ 4,  3,  2,  1,  0, -1, -2, -3];
598     short[8] E = [ 0,  0,  0,  0, -1,  0,  0,  0];
599     short8   R = cast(short8)(_mm_cmpeq_epi16(cast(__m128i)A, cast(__m128i)B));
600     assert(R.array == E);
601 }
602 
603 /// Compare packed 32-bit integers in `a` and `b` for equality.
604 __m128i _mm_cmpeq_epi32 (__m128i a, __m128i b) pure @safe
605 {
606     static if (GDC_with_SSE2)
607     {
608         return __builtin_ia32_pcmpeqd128(a, b);
609     }
610     else
611     {
612         return equalMask!__m128i(a, b);
613     }
614 }
615 unittest
616 {
617     int4   A = [-3, -2, -1,  0];
618     int4   B = [ 4, -2,  2,  0];
619     int[4] E = [ 0, -1,  0, -1];
620     int4   R = cast(int4)(_mm_cmpeq_epi32(A, B));
621     assert(R.array == E);
622 }
623 
624 /// Compare packed 8-bit integers in `a` and `b` for equality.
625 __m128i _mm_cmpeq_epi8 (__m128i a, __m128i b) pure @safe
626 {
627     static if (GDC_with_SSE2)
628     {
629         return cast(__m128i) __builtin_ia32_pcmpeqb128(cast(ubyte16)a, cast(ubyte16)b);
630     }
631     else
632     {
633         return cast(__m128i) equalMask!byte16(cast(byte16)a, cast(byte16)b);
634     }
635 }
636 unittest
637 {
638     __m128i A = _mm_setr_epi8(1, 2, 3, 1, 2, 1, 1, 2, 3, 2, 1, 0, 0, 1, 2, 1);
639     __m128i B = _mm_setr_epi8(2, 2, 1, 2, 3, 1, 2, 3, 2, 1, 0, 0, 1, 2, 1, 1);
640     byte16 C = cast(byte16) _mm_cmpeq_epi8(A, B);
641     byte[16] correct =       [0,-1, 0, 0, 0,-1, 0, 0, 0, 0, 0,-1, 0, 0, 0, -1];
642     assert(C.array == correct);
643 }
644 
645 /// Compare packed double-precision (64-bit) floating-point elements 
646 /// in `a` and `b` for equality.
647 __m128d _mm_cmpeq_pd (__m128d a, __m128d b) pure @safe
648 {
649     static if (GDC_with_SSE2)
650     {
651         return __builtin_ia32_cmpeqpd(a, b);
652     }
653     else
654     {
655         return cast(__m128d) cmppd!(FPComparison.oeq)(a, b);
656     }
657 }
658 
659 /// Compare the lower double-precision (64-bit) floating-point elements
660 /// in `a` and `b` for equality, store the result in the lower element,
661 /// and copy the upper element from `a`.
662 __m128d _mm_cmpeq_sd (__m128d a, __m128d b) pure @safe
663 {
664     static if (GDC_with_SSE2)
665     {
666         return __builtin_ia32_cmpeqsd(a, b);
667     }
668     else
669     {
670         return cast(__m128d) cmpsd!(FPComparison.oeq)(a, b);
671     }
672 }
673 
674 /// Compare packed double-precision (64-bit) floating-point elements 
675 /// in `a` and `b` for greater-than-or-equal.
676 __m128d _mm_cmpge_pd (__m128d a, __m128d b) pure @safe
677 {
678     static if (GDC_with_SSE2)
679     {
680         return __builtin_ia32_cmpgepd(a, b);
681     }
682     else
683     {
684         return cast(__m128d) cmppd!(FPComparison.oge)(a, b);
685     }
686 }
687 
688 /// Compare the lower double-precision (64-bit) floating-point elements 
689 /// in `a` and `b` for greater-than-or-equal, store the result in the 
690 /// lower element, and copy the upper element from `a`.
691 __m128d _mm_cmpge_sd (__m128d a, __m128d b) pure @safe
692 {
    // Note: there is no __builtin_ia32_cmpgesd builtin, so use cmpnltsd with swapped operands.
694     static if (GDC_with_SSE2)
695     {
696         return __builtin_ia32_cmpnltsd(b, a);
697     }
698     else
699     {
700         return cast(__m128d) cmpsd!(FPComparison.oge)(a, b);
701     }
702 }
703 
704 /// Compare packed 16-bit integers in `a` and `b` for greater-than.
705 __m128i _mm_cmpgt_epi16 (__m128i a, __m128i b) pure @safe
706 {
707     static if (GDC_with_SSE2)
708     {
709         return cast(__m128i) __builtin_ia32_pcmpgtw128(cast(short8)a, cast(short8)b);
710     }
711     else
712     {
713         return cast(__m128i) greaterMask!short8(cast(short8)a, cast(short8)b);
714     }
715 }
716 unittest
717 {
718     short8   A = [-3, -2, -1,  0,  0,  1,  2,  3];
719     short8   B = [ 4,  3,  2,  1,  0, -1, -2, -3];
720     short[8] E = [ 0,  0,  0,  0,  0, -1, -1, -1];
721     short8   R = cast(short8)(_mm_cmpgt_epi16(cast(__m128i)A, cast(__m128i)B));
722     assert(R.array == E);
723 }
724 
725 /// Compare packed 32-bit integers in `a` and `b` for greater-than.
726 __m128i _mm_cmpgt_epi32 (__m128i a, __m128i b) pure @safe
727 {
728     static if (GDC_with_SSE2)
729     {
730         return __builtin_ia32_pcmpgtd128(a, b); 
731     }
732     else
733     {
734         return cast(__m128i)( greaterMask!int4(a, b));
735     }
736 }
737 unittest
738 {
739     int4   A = [-3,  2, -1,  0];
740     int4   B = [ 4, -2,  2,  0];
741     int[4] E = [ 0, -1,  0,  0];
742     int4   R = cast(int4)(_mm_cmpgt_epi32(A, B));
743     assert(R.array == E);
744 }
745 
746 /// Compare packed 8-bit integers in `a` and `b` for greater-than.
747 __m128i _mm_cmpgt_epi8 (__m128i a, __m128i b) pure @safe
748 {
    // Workaround for a GCC bug: the __builtin_ia32_pcmpgtb128 builtin generates a weird
    // (and wrong) instruction sequence. GCC's own emmintrin.h instead uses vector
    // comparison operators that we don't have here.
    // PERF: this is a quite severe GDC performance problem.
    // It could be worked around with inline assembly, or with another algorithm.
755   
756   /*
757     static if (GDC_with_SSE2)
758     {
759         return cast(__m128i) __builtin_ia32_pcmpgtb128(cast(ubyte16)a, cast(ubyte16)b);
760     }
761     else */
762     {
763         return cast(__m128i) greaterMask!byte16(cast(byte16)a, cast(byte16)b);
764     }
765 }
766 unittest
767 {
768     __m128i A = _mm_setr_epi8(1, 2, 3, 1,  127, -80, 1, 2, 3, 2, 1, 0, 0, 1, 2, 1);
769     __m128i B = _mm_setr_epi8(2, 2, 1, 2, -128, -42, 2, 3, 2, 1, 0, 0, 1, 2, 1, 1);
770     byte16 C = cast(byte16) _mm_cmpgt_epi8(A, B);
771     byte[16] correct =       [0, 0,-1, 0,   -1,   0, 0, 0,-1,-1,-1, 0, 0, 0,-1, 0];
772     __m128i D = _mm_cmpeq_epi8(A, B);
773     assert(C.array == correct);
774 }
775 
776 /// Compare packed double-precision (64-bit) floating-point elements 
777 /// in `a` and `b` for greater-than.
778 __m128d _mm_cmpgt_pd (__m128d a, __m128d b) pure @safe
779 {
780     static if (GDC_with_SSE2)
781     {
782         return __builtin_ia32_cmpgtpd(a, b); 
783     }
784     else
785     {
786         return cast(__m128d) cmppd!(FPComparison.ogt)(a, b);
787     }
788 }
789 
790 /// Compare the lower double-precision (64-bit) floating-point elements 
791 /// in `a` and `b` for greater-than, store the result in the lower element,
792 /// and copy the upper element from `a`.
793 __m128d _mm_cmpgt_sd (__m128d a, __m128d b) pure @safe
794 {
    // Note: there is no __builtin_ia32_cmpgtsd builtin, so use cmpnlesd with swapped operands.
796     static if (GDC_with_SSE2)
797     {
798         return __builtin_ia32_cmpnlesd(b, a);
799     }
800     else
801     {
802         return cast(__m128d) cmpsd!(FPComparison.ogt)(a, b);
803     }
804 }
805 
806 /// Compare packed double-precision (64-bit) floating-point elements 
807 /// in `a` and `b` for less-than-or-equal.
808 __m128d _mm_cmple_pd (__m128d a, __m128d b) pure @safe
809 {
810     static if (GDC_with_SSE2)
811     {
812         return __builtin_ia32_cmplepd(a, b); 
813     }
814     else
815     {
816         return cast(__m128d) cmppd!(FPComparison.ole)(a, b);
817     }
818 }
819 
820 /// Compare the lower double-precision (64-bit) floating-point elements 
821 /// in `a` and `b` for less-than-or-equal, store the result in the 
822 /// lower element, and copy the upper element from `a`.
823 __m128d _mm_cmple_sd (__m128d a, __m128d b) pure @safe
824 {
825     static if (GDC_with_SSE2)
826     {
827         return __builtin_ia32_cmplesd(a, b); 
828     }
829     else
830     {
831         return cast(__m128d) cmpsd!(FPComparison.ole)(a, b);
832     }
833 }
834 
835 /// Compare packed 16-bit integers in `a` and `b` for less-than.
836 __m128i _mm_cmplt_epi16 (__m128i a, __m128i b) pure @safe
837 {
838     return _mm_cmpgt_epi16(b, a);
839 }
840 
841 /// Compare packed 32-bit integers in `a` and `b` for less-than.
842 __m128i _mm_cmplt_epi32 (__m128i a, __m128i b) pure @safe
843 {
844     return _mm_cmpgt_epi32(b, a);
845 }
846 
847 /// Compare packed 8-bit integers in `a` and `b` for less-than.
848 __m128i _mm_cmplt_epi8 (__m128i a, __m128i b) pure @safe
849 {
850     return _mm_cmpgt_epi8(b, a);
851 }
852 
853 /// Compare packed double-precision (64-bit) floating-point elements
854 /// in `a` and `b` for less-than.
855 __m128d _mm_cmplt_pd (__m128d a, __m128d b) pure @safe
856 {
857     static if (GDC_with_SSE2)
858     {
859         return __builtin_ia32_cmpltpd(a, b); 
860     }
861     else
862     {
863         return cast(__m128d) cmppd!(FPComparison.olt)(a, b);
864     }
865 }
866 
867 /// Compare the lower double-precision (64-bit) floating-point elements
868 /// in `a` and `b` for less-than, store the result in the lower 
869 /// element, and copy the upper element from `a`.
870 __m128d _mm_cmplt_sd (__m128d a, __m128d b) pure @safe
871 {
872     static if (GDC_with_SSE2)
873     {
874         return __builtin_ia32_cmpltsd(a, b); 
875     }
876     else
877     {
878         return cast(__m128d) cmpsd!(FPComparison.olt)(a, b);
879     }
880 }
881 
882 /// Compare packed double-precision (64-bit) floating-point elements
883 /// in `a` and `b` for not-equal.
884 __m128d _mm_cmpneq_pd (__m128d a, __m128d b) pure @safe
885 {
886     static if (GDC_with_SSE2)
887     {
888         return __builtin_ia32_cmpneqpd(a, b); 
889     }
890     else
891     {
892         return cast(__m128d) cmppd!(FPComparison.une)(a, b);
893     }
894 }
895 
896 /// Compare the lower double-precision (64-bit) floating-point elements
897 /// in `a` and `b` for not-equal, store the result in the lower 
898 /// element, and copy the upper element from `a`.
899 __m128d _mm_cmpneq_sd (__m128d a, __m128d b) pure @safe
900 {
901     static if (GDC_with_SSE2)
902     {
903         return __builtin_ia32_cmpneqsd(a, b); 
904     }
905     else
906     {
907         return cast(__m128d) cmpsd!(FPComparison.une)(a, b);
908     }
909 }
910 
911 /// Compare packed double-precision (64-bit) floating-point elements 
912 /// in `a` and `b` for not-greater-than-or-equal.
913 __m128d _mm_cmpnge_pd (__m128d a, __m128d b) pure @safe
914 {
915     static if (GDC_with_SSE2)
916     {
917         return __builtin_ia32_cmpngepd(a, b); 
918     }
919     else
920     {
921         return cast(__m128d) cmppd!(FPComparison.ult)(a, b);
922     }
923 }
924 
925 /// Compare the lower double-precision (64-bit) floating-point elements 
926 /// in `a` and `b` for not-greater-than-or-equal, store the result in 
927 /// the lower element, and copy the upper element from `a`.
928 __m128d _mm_cmpnge_sd (__m128d a, __m128d b) pure @safe
929 {
    // Note: there is no __builtin_ia32_cmpngesd builtin, so use cmpltsd with swapped operands.
931     static if (GDC_with_SSE2)
932     {
933         return __builtin_ia32_cmpltsd(b, a); 
934     }
935     else
936     {
937         return cast(__m128d) cmpsd!(FPComparison.ult)(a, b);
938     }
939 }
940 
941 /// Compare packed double-precision (64-bit) floating-point elements 
942 /// in `a` and `b` for not-greater-than.
943 __m128d _mm_cmpngt_pd (__m128d a, __m128d b) pure @safe
944 {
945     static if (GDC_with_SSE2)
946     {
947         return __builtin_ia32_cmpngtpd(a, b);
948     }
949     else
950     {
951         return cast(__m128d) cmppd!(FPComparison.ule)(a, b);
952     }
953 }
954 
955 /// Compare the lower double-precision (64-bit) floating-point elements 
956 /// in `a` and `b` for not-greater-than, store the result in the 
957 /// lower element, and copy the upper element from `a`.
958 __m128d _mm_cmpngt_sd (__m128d a, __m128d b) pure @safe
959 {
    // Note: there is no __builtin_ia32_cmpngtsd builtin, so use cmplesd with swapped operands.
961     static if (GDC_with_SSE2)
962     {
963         return __builtin_ia32_cmplesd(b, a);
964     }
965     else
966     {
967         return cast(__m128d) cmpsd!(FPComparison.ule)(a, b);
968     }
969 }
970 
971 /// Compare packed double-precision (64-bit) floating-point elements 
972 /// in `a` and `b` for not-less-than-or-equal.
973 __m128d _mm_cmpnle_pd (__m128d a, __m128d b) pure @safe
974 {
975     static if (GDC_with_SSE2)
976     {
977         return __builtin_ia32_cmpnlepd(a, b);
978     }
979     else
980     {
981         return cast(__m128d) cmppd!(FPComparison.ugt)(a, b);
982     }
983 }
984 
985 /// Compare the lower double-precision (64-bit) floating-point elements 
986 /// in `a` and `b` for not-less-than-or-equal, store the result in the 
987 /// lower element, and copy the upper element from `a`.
988 __m128d _mm_cmpnle_sd (__m128d a, __m128d b) pure @safe
989 {
990     static if (GDC_with_SSE2)
991     {
992         return __builtin_ia32_cmpnlesd(a, b);
993     }
994     else
995     {
996         return cast(__m128d) cmpsd!(FPComparison.ugt)(a, b);
997     }
998 }
999  
1000 /// Compare packed double-precision (64-bit) floating-point elements 
1001 /// in `a` and `b` for not-less-than.
1002 __m128d _mm_cmpnlt_pd (__m128d a, __m128d b) pure @safe
1003 {
1004     static if (GDC_with_SSE2)
1005     {
1006         return __builtin_ia32_cmpnltpd(a, b);
1007     }
1008     else
1009     {
1010         return cast(__m128d) cmppd!(FPComparison.uge)(a, b);
1011     }
1012 }
1013 
1014 /// Compare the lower double-precision (64-bit) floating-point elements 
1015 /// in `a` and `b` for not-less-than, store the result in the lower 
1016 /// element, and copy the upper element from `a`.
1017 __m128d _mm_cmpnlt_sd (__m128d a, __m128d b) pure @safe
1018 {
1019     static if (GDC_with_SSE2)
1020     {
1021         return __builtin_ia32_cmpnltsd(a, b);
1022     }
1023     else
1024     {
1025         return cast(__m128d) cmpsd!(FPComparison.uge)(a, b);
1026     }
1027 }
1028 
1029 /// Compare packed double-precision (64-bit) floating-point elements 
1030 /// in `a` and `b` to see if neither is NaN.
1031 __m128d _mm_cmpord_pd (__m128d a, __m128d b) pure @safe
1032 {
1033     static if (GDC_with_SSE2)
1034     {
1035         return __builtin_ia32_cmpordpd(a, b);
1036     }
1037     else
1038     {
1039         return cast(__m128d) cmppd!(FPComparison.ord)(a, b);
1040     }
1041 }
1042 
1043 /// Compare the lower double-precision (64-bit) floating-point elements 
1044 /// in `a` and `b` to see if neither is NaN, store the result in the 
1045 /// lower element, and copy the upper element from `a` to the upper element.
1046 __m128d _mm_cmpord_sd (__m128d a, __m128d b) pure @safe
1047 {
1048     static if (GDC_with_SSE2)
1049     {
1050         return __builtin_ia32_cmpordsd(a, b);
1051     }
1052     else
1053     {
1054         return cast(__m128d) cmpsd!(FPComparison.ord)(a, b);
1055     }
1056 }
1057 
1058 /// Compare packed double-precision (64-bit) floating-point elements 
1059 /// in `a` and `b` to see if either is NaN.
1060 __m128d _mm_cmpunord_pd (__m128d a, __m128d b) pure @safe
1061 {
1062     static if (GDC_with_SSE2)
1063     {
1064         return __builtin_ia32_cmpunordpd(a, b);
1065     }
1066     else
1067     {
1068         return cast(__m128d) cmppd!(FPComparison.uno)(a, b);
1069     }
1070 }
1071 
1072 /// Compare the lower double-precision (64-bit) floating-point elements 
1073 /// in `a` and `b` to see if either is NaN, store the result in the lower 
1074 /// element, and copy the upper element from `a` to the upper element.
1075 __m128d _mm_cmpunord_sd (__m128d a, __m128d b) pure @safe
1076 {
1077     static if (GDC_with_SSE2)
1078     {
1079         return __builtin_ia32_cmpunordsd(a, b);
1080     }
1081     else
1082     {
1083         return cast(__m128d) cmpsd!(FPComparison.uno)(a, b);
1084     }
1085 }
1086 
1087 /// Compare the lower double-precision (64-bit) floating-point element 
1088 /// in `a` and `b` for equality, and return the boolean result (0 or 1).
1089 int _mm_comieq_sd (__m128d a, __m128d b) pure @safe
1090 {
    // Note: for some of the _mm_comixx_sd intrinsics, the NaN semantics are not the same
    // as the comisd instruction: the intrinsic returns false on unordered operands instead.
    //
    // C++ compilers actually disagree over the meaning of that instruction.
    // GCC handles NaNs like the comisd instruction (returns true if unordered),
    // but ICC, clang and MSVC follow the Intel Intrinsics Guide.
    // We side with the majority; GCC seems to be buggy with NaNs.
1098     return a.array[0] == b.array[0];
1099 }
1100 unittest
1101 {
1102     assert(1 == _mm_comieq_sd(_mm_set_sd(78.0), _mm_set_sd(78.0)));
1103     assert(0 == _mm_comieq_sd(_mm_set_sd(78.0), _mm_set_sd(-78.0)));
1104     assert(0 == _mm_comieq_sd(_mm_set_sd(78.0), _mm_set_sd(double.nan)));
1105     assert(0 == _mm_comieq_sd(_mm_set_sd(double.nan), _mm_set_sd(-4.22)));
1106     assert(1 == _mm_comieq_sd(_mm_set_sd(0.0), _mm_set_sd(-0.0)));
1107 }
1108 
1109 /// Compare the lower double-precision (64-bit) floating-point element 
1110 /// in `a` and `b` for greater-than-or-equal, and return the boolean 
1111 /// result (0 or 1).
1112 int _mm_comige_sd (__m128d a, __m128d b) pure @safe
1113 {
1114     return a.array[0] >= b.array[0];
1115 }
1116 unittest
1117 {
1118     assert(1 == _mm_comige_sd(_mm_set_sd(78.0), _mm_set_sd(78.0)));
1119     assert(1 == _mm_comige_sd(_mm_set_sd(78.0), _mm_set_sd(-78.0)));
1120     assert(0 == _mm_comige_sd(_mm_set_sd(-78.0), _mm_set_sd(78.0)));
1121     assert(0 == _mm_comige_sd(_mm_set_sd(78.0), _mm_set_sd(double.nan)));
1122     assert(0 == _mm_comige_sd(_mm_set_sd(double.nan), _mm_set_sd(-4.22)));
1123     assert(1 == _mm_comige_sd(_mm_set_sd(-0.0), _mm_set_sd(0.0)));
1124 }
1125 
1126 /// Compare the lower double-precision (64-bit) floating-point element 
1127 /// in `a` and `b` for greater-than, and return the boolean result (0 or 1).
1128 int _mm_comigt_sd (__m128d a, __m128d b) pure @safe
1129 {
1130     return a.array[0] > b.array[0];
1131 }
1132 unittest
1133 {
1134     assert(0 == _mm_comigt_sd(_mm_set_sd(78.0), _mm_set_sd(78.0)));
1135     assert(1 == _mm_comigt_sd(_mm_set_sd(78.0), _mm_set_sd(-78.0)));
1136     assert(0 == _mm_comigt_sd(_mm_set_sd(78.0), _mm_set_sd(double.nan)));
1137     assert(0 == _mm_comigt_sd(_mm_set_sd(double.nan), _mm_set_sd(-4.22)));
1138     assert(0 == _mm_comigt_sd(_mm_set_sd(0.0), _mm_set_sd(-0.0)));
1139 }
1140 
1141 /// Compare the lower double-precision (64-bit) floating-point element 
/// in `a` and `b` for less-than-or-equal, and return the boolean result (0 or 1).
1143 int _mm_comile_sd (__m128d a, __m128d b) pure @safe
1144 {
1145     return a.array[0] <= b.array[0];
1146 }
1147 unittest
1148 {
1149     assert(1 == _mm_comile_sd(_mm_set_sd(78.0), _mm_set_sd(78.0)));
1150     assert(0 == _mm_comile_sd(_mm_set_sd(78.0), _mm_set_sd(-78.0)));
1151     assert(1 == _mm_comile_sd(_mm_set_sd(-78.0), _mm_set_sd(78.0)));
1152     assert(0 == _mm_comile_sd(_mm_set_sd(78.0), _mm_set_sd(double.nan)));
1153     assert(0 == _mm_comile_sd(_mm_set_sd(double.nan), _mm_set_sd(-4.22)));
1154     assert(1 == _mm_comile_sd(_mm_set_sd(0.0), _mm_set_sd(-0.0)));
1155 }
1156 
1157 /// Compare the lower double-precision (64-bit) floating-point element 
1158 /// in `a` and `b` for less-than, and return the boolean result (0 or 1).
1159 int _mm_comilt_sd (__m128d a, __m128d b) pure @safe
1160 {
1161     return a.array[0] < b.array[0];
1162 }
1163 unittest
1164 {
1165     assert(0 == _mm_comilt_sd(_mm_set_sd(78.0), _mm_set_sd(78.0)));
1166     assert(0 == _mm_comilt_sd(_mm_set_sd(78.0), _mm_set_sd(-78.0)));
1167     assert(1 == _mm_comilt_sd(_mm_set_sd(-78.0), _mm_set_sd(78.0)));
1168     assert(0 == _mm_comilt_sd(_mm_set_sd(78.0), _mm_set_sd(double.nan)));
1169     assert(0 == _mm_comilt_sd(_mm_set_sd(double.nan), _mm_set_sd(-4.22)));
1170     assert(0 == _mm_comilt_sd(_mm_set_sd(-0.0), _mm_set_sd(0.0)));
1171 }
1172 
1173 /// Compare the lower double-precision (64-bit) floating-point element
1174 /// in `a` and `b` for not-equal, and return the boolean result (0 or 1).
1175 int _mm_comineq_sd (__m128d a, __m128d b) pure @safe
1176 {
1177     return a.array[0] != b.array[0];
1178 }
1179 unittest
1180 {
1181     assert(0 == _mm_comineq_sd(_mm_set_sd(78.0), _mm_set_sd(78.0)));
1182     assert(1 == _mm_comineq_sd(_mm_set_sd(78.0), _mm_set_sd(-78.0)));
1183     assert(1 == _mm_comineq_sd(_mm_set_sd(78.0), _mm_set_sd(double.nan)));
1184     assert(1 == _mm_comineq_sd(_mm_set_sd(double.nan), _mm_set_sd(-4.22)));
1185     assert(0 == _mm_comineq_sd(_mm_set_sd(0.0), _mm_set_sd(-0.0)));
1186 }
1187 
1188 /// Convert packed 32-bit integers in `a` to packed double-precision (64-bit)
1189 /// floating-point elements.
__m128d _mm_cvtepi32_pd (__m128i a) pure @trusted
1191 {
1192     version(LDC)
1193     {
1194         // Generates cvtdq2pd since LDC 1.0, even without optimizations
1195         enum ir = `
1196             %v = shufflevector <4 x i32> %0,<4 x i32> %0, <2 x i32> <i32 0, i32 1>
1197             %r = sitofp <2 x i32> %v to <2 x double>
1198             ret <2 x double> %r`;
1199         return cast(__m128d) LDCInlineIR!(ir, __m128d, __m128i)(a);
1200     }
1201     else static if (GDC_with_SSE2)
1202     {
1203         return __builtin_ia32_cvtdq2pd(a);
1204     }
1205     else
1206     {
1207         double2 r = void;
1208         r.ptr[0] = a.array[0];
1209         r.ptr[1] = a.array[1];
1210         return r;
1211     }
1212 }
1213 unittest
1214 {
1215     __m128d A = _mm_cvtepi32_pd(_mm_set1_epi32(54));
1216     assert(A.array[0] == 54.0);
1217     assert(A.array[1] == 54.0);
1218 }
1219 
1220 /// Convert packed 32-bit integers in `a` to packed single-precision (32-bit) 
1221 /// floating-point elements.
1222 __m128 _mm_cvtepi32_ps(__m128i a) pure @trusted
1223 {
1224     static if (GDC_with_SSE2)
1225     {
1226         return __builtin_ia32_cvtdq2ps(a);
1227     }
1228     else version(LDC)
1229     {
1230         // See #86 for why we had to resort to LLVM IR.
1231         // Plain code below was leading to catastrophic behaviour. 
1232         // x86: Generates cvtdq2ps since LDC 1.1.0 -O0
        // ARM: Generates scvtf.4s since LDC 1.8.0 -O0
1234         enum ir = `
1235             %r = sitofp <4 x i32> %0 to <4 x float>
1236             ret <4 x float> %r`;
1237         return cast(__m128) LDCInlineIR!(ir, float4, int4)(a);
1238     }
1239     else
1240     {
1241         __m128 res; // PERF =void;
1242         res.ptr[0] = cast(float)a.array[0];
1243         res.ptr[1] = cast(float)a.array[1];
1244         res.ptr[2] = cast(float)a.array[2];
1245         res.ptr[3] = cast(float)a.array[3];
1246         return res;
1247     }
1248 }
1249 unittest
1250 {
1251     __m128 a = _mm_cvtepi32_ps(_mm_setr_epi32(-1, 0, 1, 1000));
1252     assert(a.array == [-1.0f, 0.0f, 1.0f, 1000.0f]);
1253 }
1254 
1255 /// Convert packed double-precision (64-bit) floating-point elements 
1256 /// in `a` to packed 32-bit integers.
1257 __m128i _mm_cvtpd_epi32 (__m128d a) @trusted
1258 {
1259     // PERF ARM32
1260     static if (LDC_with_SSE2)
1261     {
1262         return __builtin_ia32_cvtpd2dq(a);
1263     }
1264     else static if (GDC_with_SSE2)
1265     {
1266         return __builtin_ia32_cvtpd2dq(a);
1267     }
1268     else static if (LDC_with_ARM64)
1269     {
1270         // Get current rounding mode.
1271         uint fpscr = arm_get_fpcr();
1272         long2 i;
1273         switch(fpscr & _MM_ROUND_MASK_ARM)
1274         {
1275             default:
1276             case _MM_ROUND_NEAREST_ARM:     i = vcvtnq_s64_f64(a); break;
1277             case _MM_ROUND_DOWN_ARM:        i = vcvtmq_s64_f64(a); break;
1278             case _MM_ROUND_UP_ARM:          i = vcvtpq_s64_f64(a); break;
1279             case _MM_ROUND_TOWARD_ZERO_ARM: i = vcvtzq_s64_f64(a); break;
1280         }
1281         int4 zero = 0;
1282         return cast(__m128i) shufflevector!(int4, 0, 2, 4, 6)(cast(int4)i, zero);
1283     }
1284     else
1285     {
1286         // PERF ARM32
1287         __m128i r = _mm_setzero_si128();
1288         r.ptr[0] = convertDoubleToInt32UsingMXCSR(a.array[0]);
1289         r.ptr[1] = convertDoubleToInt32UsingMXCSR(a.array[1]);
1290         return r;
1291     }
1292 }
1293 unittest
1294 {
1295     int4 A = _mm_cvtpd_epi32(_mm_set_pd(61.0, 55.0));
1296     assert(A.array[0] == 55 && A.array[1] == 61 && A.array[2] == 0 && A.array[3] == 0);
1297 }
1298 
/// Convert packed double-precision (64-bit) floating-point elements in `v`
/// to packed 32-bit integers.
1301 __m64 _mm_cvtpd_pi32 (__m128d v) @safe
1302 {
1303     return to_m64(_mm_cvtpd_epi32(v));
1304 }
1305 unittest
1306 {
1307     int2 A = cast(int2) _mm_cvtpd_pi32(_mm_set_pd(61.0, 55.0));
1308     assert(A.array[0] == 55 && A.array[1] == 61);
1309 }
1310 
1311 /// Convert packed double-precision (64-bit) floating-point elements 
1312 /// in `a` to packed single-precision (32-bit) floating-point elements.
1313 __m128 _mm_cvtpd_ps (__m128d a) pure @trusted
1314 {
1315     static if (LDC_with_SSE2)
1316     {
1317         return __builtin_ia32_cvtpd2ps(a); // can't be done with IR unfortunately
1318     }
1319     else static if (GDC_with_SSE2)
1320     {
1321         return __builtin_ia32_cvtpd2ps(a);
1322     }
1323     else
1324     { 
1325         __m128 r = void;
1326         r.ptr[0] = a.array[0];
1327         r.ptr[1] = a.array[1];
1328         r.ptr[2] = 0;
1329         r.ptr[3] = 0;
1330         return r;
1331     }
1332 }
1333 unittest
1334 {
1335     __m128d A = _mm_set_pd(5.25, 4.0);
1336     __m128 B = _mm_cvtpd_ps(A);
1337     assert(B.array == [4.0f, 5.25f, 0, 0]);
1338 }
1339 
1340 /// Convert packed 32-bit integers in `v` to packed double-precision 
1341 /// (64-bit) floating-point elements.
1342 __m128d _mm_cvtpi32_pd (__m64 v) pure @safe
1343 {
1344     return _mm_cvtepi32_pd(to_m128i(v));
1345 }
1346 unittest
1347 {
1348     __m128d A = _mm_cvtpi32_pd(_mm_setr_pi32(4, -5));
1349     assert(A.array[0] == 4.0 && A.array[1] == -5.0);
1350 }
1351 
/// Convert packed single-precision (32-bit) floating-point elements
/// in `a` to packed 32-bit integers.
1354 __m128i _mm_cvtps_epi32 (__m128 a) @trusted
1355 {
1356     static if (LDC_with_SSE2)
1357     {
1358         return cast(__m128i) __builtin_ia32_cvtps2dq(a);
1359     }
1360     else static if (GDC_with_SSE2)
1361     {
1362         return __builtin_ia32_cvtps2dq(a);
1363     }
1364     else static if (LDC_with_ARM64)
1365     {
1366         // Get current rounding mode.
1367         uint fpscr = arm_get_fpcr();
1368         switch(fpscr & _MM_ROUND_MASK_ARM)
1369         {
1370             default:
1371             case _MM_ROUND_NEAREST_ARM:     return vcvtnq_s32_f32(a);
1372             case _MM_ROUND_DOWN_ARM:        return vcvtmq_s32_f32(a);
1373             case _MM_ROUND_UP_ARM:          return vcvtpq_s32_f32(a);
1374             case _MM_ROUND_TOWARD_ZERO_ARM: return vcvtzq_s32_f32(a);
1375         }
1376     }
1377     else
1378     {
1379         __m128i r = void;
1380         r.ptr[0] = convertFloatToInt32UsingMXCSR(a.array[0]);
1381         r.ptr[1] = convertFloatToInt32UsingMXCSR(a.array[1]);
1382         r.ptr[2] = convertFloatToInt32UsingMXCSR(a.array[2]);
1383         r.ptr[3] = convertFloatToInt32UsingMXCSR(a.array[3]);
1384         return r;
1385     }
1386 }
1387 unittest
1388 {
1389     // GDC bug #98607
1390     // https://gcc.gnu.org/bugzilla/show_bug.cgi?id=98607
1391     // GDC does not provide optimization barrier for rounding mode.
    // Worked around with different literals. This bug will likely only manifest in unittests.
1393     // GCC people provided no actual fix and instead say other compilers are buggy... when they aren't.
1394 
1395     uint savedRounding = _MM_GET_ROUNDING_MODE();
1396 
1397     _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST);
1398     __m128i A = _mm_cvtps_epi32(_mm_setr_ps(1.4f, -2.1f, 53.5f, -2.9f));
1399     assert(A.array == [1, -2, 54, -3]);
1400 
1401     _MM_SET_ROUNDING_MODE(_MM_ROUND_DOWN);
1402     A = _mm_cvtps_epi32(_mm_setr_ps(1.3f, -2.11f, 53.4f, -2.8f));
1403     assert(A.array == [1, -3, 53, -3]);
1404 
1405     _MM_SET_ROUNDING_MODE(_MM_ROUND_UP);
1406     A = _mm_cvtps_epi32(_mm_setr_ps(1.3f, -2.12f, 53.6f, -2.7f));
1407     assert(A.array == [2, -2, 54, -2]);
1408 
1409     _MM_SET_ROUNDING_MODE(_MM_ROUND_TOWARD_ZERO);
1410     A = _mm_cvtps_epi32(_mm_setr_ps(1.4f, -2.17f, 53.8f, -2.91f));
1411     assert(A.array == [1, -2, 53, -2]);
1412 
1413     _MM_SET_ROUNDING_MODE(savedRounding);
1414 }
1415 
1416 /// Convert packed single-precision (32-bit) floating-point elements 
1417 /// in `a` to packed double-precision (64-bit) floating-point elements.
1418 __m128d _mm_cvtps_pd (__m128 a) pure @trusted
1419 {
1420     version(LDC)
1421     {
1422         // Generates cvtps2pd since LDC 1.0 -O0
1423         enum ir = `
1424             %v = shufflevector <4 x float> %0,<4 x float> %0, <2 x i32> <i32 0, i32 1>
1425             %r = fpext <2 x float> %v to <2 x double>
1426             ret <2 x double> %r`;
1427         return cast(__m128d) LDCInlineIR!(ir, __m128d, __m128)(a);
1428     }
1429     else static if (GDC_with_SSE2)
1430     {
1431         return __builtin_ia32_cvtps2pd(a);
1432     }
1433     else
1434     {
1435         double2 r = void;
1436         r.ptr[0] = a.array[0];
1437         r.ptr[1] = a.array[1];
1438         return r;
1439     }
1440 }
1441 unittest
1442 {
1443     __m128d A = _mm_cvtps_pd(_mm_set1_ps(54.0f));
1444     assert(A.array[0] == 54.0);
1445     assert(A.array[1] == 54.0);
1446 }
1447 
1448 /// Copy the lower double-precision (64-bit) floating-point element of `a`.
1449 double _mm_cvtsd_f64 (__m128d a) pure @safe
1450 {
1451     return a.array[0];
1452 }
1453 
1454 /// Convert the lower double-precision (64-bit) floating-point element
1455 /// in `a` to a 32-bit integer.
1456 int _mm_cvtsd_si32 (__m128d a) @safe
1457 {
1458     static if (LDC_with_SSE2)
1459     {
1460         return __builtin_ia32_cvtsd2si(a);
1461     }
1462     else static if (GDC_with_SSE2)
1463     {
1464         return __builtin_ia32_cvtsd2si(a);
1465     }
1466     else
1467     {
1468         return convertDoubleToInt32UsingMXCSR(a[0]);
1469     }
1470 }
1471 unittest
1472 {
1473     assert(4 == _mm_cvtsd_si32(_mm_set1_pd(4.0)));
1474 }
1475 
1476 /// Convert the lower double-precision (64-bit) floating-point element in `a` to a 64-bit integer.
1477 long _mm_cvtsd_si64 (__m128d a) @trusted
1478 {
1479     version (LDC)
1480     {
1481         version (X86_64)
1482         {
1483             return __builtin_ia32_cvtsd2si64(a);
1484         }
1485         else
1486         {
            // Note: on 32-bit x86, there is no way to convert a float/double to a 64-bit integer
            // using SSE instructions only, so the builtin doesn't exist for this arch.
1489             return convertDoubleToInt64UsingMXCSR(a[0]);
1490         }
1491     }
1492     else
1493     {
1494         return convertDoubleToInt64UsingMXCSR(a.array[0]);
1495     }
1496 }
1497 unittest
1498 {
1499     assert(-4 == _mm_cvtsd_si64(_mm_set1_pd(-4.0)));
1500 
1501     uint savedRounding = _MM_GET_ROUNDING_MODE();
1502 
1503     _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST);
1504     assert(-56468486186 == _mm_cvtsd_si64(_mm_set1_pd(-56468486186.49)));
1505 
1506     _MM_SET_ROUNDING_MODE(_MM_ROUND_DOWN);
1507     assert(-56468486187 == _mm_cvtsd_si64(_mm_set1_pd(-56468486186.1)));
1508 
1509     _MM_SET_ROUNDING_MODE(_MM_ROUND_UP);
1510     assert(56468486187 == _mm_cvtsd_si64(_mm_set1_pd(56468486186.1)));
1511 
1512     _MM_SET_ROUNDING_MODE(_MM_ROUND_TOWARD_ZERO);
1513     assert(-56468486186 == _mm_cvtsd_si64(_mm_set1_pd(-56468486186.9)));
1514 
1515     _MM_SET_ROUNDING_MODE(savedRounding);
1516 }
1517 
1518 deprecated("Use _mm_cvtsd_si64 instead") alias _mm_cvtsd_si64x = _mm_cvtsd_si64; ///
1519 
1520 /// Convert the lower double-precision (64-bit) floating-point element in `b` to a single-precision (32-bit) 
1521 /// floating-point element, store that in the lower element of result, and copy the upper 3 packed elements from `a`
1522 /// to the upper elements of result.
1523 __m128 _mm_cvtsd_ss (__m128 a, __m128d b) pure @trusted
1524 {
1525     static if (GDC_with_SSE2)
1526     {
1527         return __builtin_ia32_cvtsd2ss(a, b); 
1528     }
1529     else
1530     {
1531         // Generates cvtsd2ss since LDC 1.3 -O0
1532         a.ptr[0] = b.array[0];
1533         return a;
1534     }
1535 }
1536 unittest
1537 {
1538     __m128 R = _mm_cvtsd_ss(_mm_set1_ps(4.0f), _mm_set1_pd(3.0));
1539     assert(R.array == [3.0f, 4.0f, 4.0f, 4.0f]);
1540 }
1541 
1542 /// Get the lower 32-bit integer in `a`.
1543 int _mm_cvtsi128_si32 (__m128i a) pure @safe
1544 {
1545     return a.array[0];
1546 }
1547 
1548 /// Get the lower 64-bit integer in `a`.
1549 long _mm_cvtsi128_si64 (__m128i a) pure @safe
1550 {
1551     long2 la = cast(long2)a;
1552     return la.array[0];
1553 }
1554 deprecated("Use _mm_cvtsi128_si64 instead") alias _mm_cvtsi128_si64x = _mm_cvtsi128_si64;
1555 
1556 /// Convert the signed 32-bit integer `b` to a double-precision (64-bit) floating-point element, store that in the 
1557 /// lower element of result, and copy the upper element from `a` to the upper element of result.
1558 __m128d _mm_cvtsi32_sd(__m128d a, int b) pure @trusted
1559 {
1560     a.ptr[0] = cast(double)b;
1561     return a;
1562 }
1563 unittest
1564 {
1565     __m128d a = _mm_cvtsi32_sd(_mm_set1_pd(0.0f), 42);
1566     assert(a.array == [42.0, 0]);
1567 }
1568 
1569 /// Copy 32-bit integer `a` to the lower element of result, and zero the upper elements.
1570 __m128i _mm_cvtsi32_si128 (int a) pure @trusted
1571 {
1572     int4 r = [0, 0, 0, 0];
1573     r.ptr[0] = a;
1574     return r;
1575 }
1576 unittest
1577 {
1578     __m128i a = _mm_cvtsi32_si128(65);
1579     assert(a.array == [65, 0, 0, 0]);
1580 }
1581 
1582 /// Convert the signed 64-bit integer `b` to a double-precision (64-bit) floating-point element, store the result in 
/// the lower element of result, and copy the upper element from `a` to the upper element of result.
__m128d _mm_cvtsi64_sd(__m128d a, long b) pure @trusted
1586 {
1587     a.ptr[0] = cast(double)b;
1588     return a;
1589 }
1590 unittest
1591 {
1592     __m128d a = _mm_cvtsi64_sd(_mm_set1_pd(0.0f), 42);
1593     assert(a.array == [42.0, 0]);
1594 }
1595 
1596 /// Copy 64-bit integer `a` to the lower element of result, and zero the upper element.
1597 __m128i _mm_cvtsi64_si128 (long a) pure @trusted
1598 {
1599     long2 r = [0, 0];
1600     r.ptr[0] = a;
1601     return cast(__m128i)(r);
1602 }
1603 
1604 deprecated("Use _mm_cvtsi64_sd instead") alias _mm_cvtsi64x_sd = _mm_cvtsi64_sd; ///
1605 deprecated("Use _mm_cvtsi64_si128 instead") alias _mm_cvtsi64x_si128 = _mm_cvtsi64_si128; ///
1606 
1607 /// Convert the lower single-precision (32-bit) floating-point element in `b` to a double-precision (64-bit) 
1608 /// floating-point element, store that in the lower element of result, and copy the upper element from `a` to the upper 
/// element of result.
__m128d _mm_cvtss_sd (__m128d a, __m128 b) pure @trusted
1611 {
1612     a.ptr[0] = b.array[0];
1613     return a;
1614 }
1615 unittest
1616 {
1617     __m128d a = _mm_cvtss_sd(_mm_set1_pd(0.0f), _mm_set1_ps(42.0f));
1618     assert(a.array == [42.0, 0]);
1619 }
1620 
1621 /// Convert the lower single-precision (32-bit) floating-point element in `a` to a 64-bit integer with truncation.
1622 long _mm_cvttss_si64 (__m128 a) pure @safe
1623 {
1624     return cast(long)(a.array[0]); // Generates cvttss2si as expected
1625 }
1626 unittest
1627 {
1628     assert(1 == _mm_cvttss_si64(_mm_setr_ps(1.9f, 2.0f, 3.0f, 4.0f)));
1629 }
1630 
1631 /// Convert packed double-precision (64-bit) floating-point elements in `a` to packed 32-bit integers with truncation.
1632 /// Put zeroes in the upper elements of result.
1633 __m128i _mm_cvttpd_epi32 (__m128d a) pure @trusted
1634 {
1635     static if (LDC_with_SSE2)
1636     {
1637         return __builtin_ia32_cvttpd2dq(a);
1638     }
1639     else static if (GDC_with_SSE2)
1640     {
1641         return __builtin_ia32_cvttpd2dq(a);
1642     }
1643     else
1644     {
1645         // Note: doesn't generate cvttpd2dq as of LDC 1.13
1646         __m128i r; // PERF =void;
1647         r.ptr[0] = cast(int)a.array[0];
1648         r.ptr[1] = cast(int)a.array[1];
1649         r.ptr[2] = 0;
1650         r.ptr[3] = 0;
1651         return r;
1652     }
1653 }
1654 unittest
1655 {
1656     __m128i R = _mm_cvttpd_epi32(_mm_setr_pd(-4.9, 45641.5f));
1657     assert(R.array == [-4, 45641, 0, 0]);
1658 }
1659 
1660 /// Convert packed double-precision (64-bit) floating-point elements in `v` 
1661 /// to packed 32-bit integers with truncation.
1662 __m64 _mm_cvttpd_pi32 (__m128d v) pure @safe
1663 {
1664     return to_m64(_mm_cvttpd_epi32(v));
1665 }
1666 unittest
1667 {
1668     int2 R = cast(int2) _mm_cvttpd_pi32(_mm_setr_pd(-4.9, 45641.7f));
1669     int[2] correct = [-4, 45641];
1670     assert(R.array == correct);
1671 }
1672 
1673 /// Convert packed single-precision (32-bit) floating-point elements in `a` to packed 32-bit integers with truncation.
1674 __m128i _mm_cvttps_epi32 (__m128 a) pure @trusted
1675 {
1676     // x86: Generates cvttps2dq since LDC 1.3 -O2
1677     // ARM64: generates fcvtze since LDC 1.8 -O2
1678     __m128i r; // PERF = void;
1679     r.ptr[0] = cast(int)a.array[0];
1680     r.ptr[1] = cast(int)a.array[1];
1681     r.ptr[2] = cast(int)a.array[2];
1682     r.ptr[3] = cast(int)a.array[3];
1683     return r;
1684 }
1685 unittest
1686 {
1687     __m128i R = _mm_cvttps_epi32(_mm_setr_ps(-4.9, 45641.5f, 0.0f, 1.0f));
1688     assert(R.array == [-4, 45641, 0, 1]);
1689 }
1690 
1691 /// Convert the lower double-precision (64-bit) floating-point element in `a` to a 32-bit integer with truncation.
int _mm_cvttsd_si32 (__m128d a) pure @safe
1693 {
1694     // Generates cvttsd2si since LDC 1.3 -O0
1695     return cast(int)a.array[0];
1696 }
1697 
1698 /// Convert the lower double-precision (64-bit) floating-point element in `a` to a 64-bit integer with truncation.
long _mm_cvttsd_si64 (__m128d a) pure @safe
{
    // Generates cvttsd2si since LDC 1.3 -O0,
    // but in 32-bit it's a long sequence that resorts to the FPU.
1703     return cast(long)a.array[0];
1704 }
1705 
1706 deprecated("Use _mm_cvttsd_si64 instead") alias _mm_cvttsd_si64x = _mm_cvttsd_si64; ///
1707 
1708 /// Divide packed double-precision (64-bit) floating-point elements in `a` by packed elements in `b`.
1709 __m128d _mm_div_pd(__m128d a, __m128d b) pure @safe
1710 {
1711     pragma(inline, true);
1712     return a / b;
1713 }
1714 
/// Divide the lower double-precision (64-bit) floating-point element in `a` by the lower double-precision (64-bit) 
/// floating-point element in `b`, store the result in the lower element of result, and copy the upper element from 
/// `a` to the upper element of result.
__m128d _mm_div_sd(__m128d a, __m128d b) pure @trusted
1716 {
1717     static if (GDC_with_SSE2)
1718     {
1719         return __builtin_ia32_divsd(a, b);
1720     }
1721     else version(DigitalMars)
1722     {
1723         // Work-around for https://issues.dlang.org/show_bug.cgi?id=19599
1724         // Note that this is unneeded since DMD >= 2.094.0 at least, haven't investigated again
1725         asm pure nothrow @nogc @trusted { nop;}
1726         a.array[0] = a.array[0] / b.array[0];
1727         return a;
1728     }
1729     else
1730     {
1731         a.ptr[0] /= b.array[0];
1732         return a;
1733     }
1734 }
1735 unittest
1736 {
1737     __m128d a = [2.0, 4.5];
1738     a = _mm_div_sd(a, a);
1739     assert(a.array == [1.0, 4.5]);
1740 }
1741 
1742 /// Extract a 16-bit integer from `v`, selected with `index`.
1743 /// Warning: the returned value is zero-extended to 32-bits.
1744 int _mm_extract_epi16(__m128i v, int index) pure @safe
1745 {
1746     short8 r = cast(short8)v;
1747     return cast(ushort)(r.array[index & 7]);
1748 }
1749 unittest
1750 {
1751     __m128i A = _mm_set_epi16(7, 6, 5, 4, 3, 2, 1, -1);
1752     assert(_mm_extract_epi16(A, 6) == 6);
1753     assert(_mm_extract_epi16(A, 0) == 65535);
1754     assert(_mm_extract_epi16(A, 5 + 8) == 5);
1755 }
1756 
1757 /// Copy `v`, and insert the 16-bit integer `i` at the location specified by `index`.
1758 __m128i _mm_insert_epi16 (__m128i v, int i, int index) @trusted
1759 {
1760     short8 r = cast(short8)v;
1761     r.ptr[index & 7] = cast(short)i;
1762     return cast(__m128i)r;
1763 }
1764 unittest
1765 {
1766     __m128i A = _mm_set_epi16(7, 6, 5, 4, 3, 2, 1, 0);
1767     short8 R = cast(short8) _mm_insert_epi16(A, 42, 6);
1768     short[8] correct = [0, 1, 2, 3, 4, 5, 42, 7];
1769     assert(R.array == correct);
1770 }
1771 
/// Perform a serializing operation on all load-from-memory instructions that were issued prior 
/// to this instruction. Guarantees that every load instruction that precedes, in program order, 
/// the load fence instruction is globally visible before any load instruction which follows the 
/// fence in program order.
1775 void _mm_lfence() @trusted
1776 {
1777     version(GNU)
1778     {
1779         static if (GDC_with_SSE2)
1780         {
1781             __builtin_ia32_lfence();
1782         }
1783         else version(X86)
1784         {
1785             asm pure nothrow @nogc @trusted
1786             {
1787                 "lfence;\n" : : : ;
1788             }
1789         }
1790         else
1791             static assert(false);
1792     }
1793     else static if (LDC_with_SSE2)
1794     {
1795         __builtin_ia32_lfence();
1796     }
1797     else static if (LDC_with_ARM64)
1798     {
1799          __builtin_arm_dmb(9);  // dmb ishld
1800     }
1801     else static if (DMD_with_asm)
1802     {
1803         asm nothrow @nogc pure @safe
1804         {
1805             lfence;
1806         }
1807     }
1808     else version(LDC)
1809     {
        // When the architecture is unknown, generate a full memory barrier,
        // as the semantics of lfence do not really match those of atomics.
1812         llvm_memory_fence();
1813     }
1814     else
1815         static assert(false);
1816 }
1817 unittest
1818 {
1819     _mm_lfence();
1820 }
1821 
1822 /// Load 128-bits (composed of 2 packed double-precision (64-bit) floating-point elements) from memory.
1823 /// `mem_addr` must be aligned on a 16-byte boundary or a general-protection exception may be generated.
__m128d _mm_load_pd (const(double)* mem_addr) pure @trusted
1825 {
1826     pragma(inline, true);
1827     __m128d* aligned = cast(__m128d*)mem_addr;
1828     return *aligned;
1829 }
1830 unittest
1831 {
1832     align(16) double[2] S = [-5.0, 7.0];
1833     __m128d R = _mm_load_pd(S.ptr);
1834     assert(R.array == S);
1835 }
1836 
1837 /// Load a double-precision (64-bit) floating-point element from memory into both elements of dst.
1838 /// `mem_addr` does not need to be aligned on any particular boundary.
__m128d _mm_load_pd1 (const(double)* mem_addr) pure @trusted
1840 {
1841     double m = *mem_addr;
1842     __m128d r; // PERF =void;
1843     r.ptr[0] = m;
1844     r.ptr[1] = m;
1845     return r;
1846 }
1847 unittest
1848 {
1849     double what = 4;
1850     __m128d R = _mm_load_pd1(&what);
1851     double[2] correct = [4.0, 4];
1852     assert(R.array == correct);
1853 }
1854 
1855 /// Load a double-precision (64-bit) floating-point element from memory into the lower of result, and zero the upper 
1856 /// element. `mem_addr` does not need to be aligned on any particular boundary.
1857 __m128d _mm_load_sd (const(double)* mem_addr) pure @trusted
1858 {
1859     double2 r = [0, 0];
1860     r.ptr[0] = *mem_addr;
1861     return r;
1862 }
1863 unittest
1864 {
1865     double x = -42;
1866     __m128d a = _mm_load_sd(&x);
1867     assert(a.array == [-42.0, 0.0]);
1868 }
1869 
1870 /// Load 128-bits of integer data from memory into dst. 
1871 /// `mem_addr` must be aligned on a 16-byte boundary or a general-protection exception may be generated.
__m128i _mm_load_si128 (const(__m128i)* mem_addr) pure @trusted // TODO: shouldn't be @trusted because of alignment, see Issue #62
1873 {
1874     pragma(inline, true);
1875     return *mem_addr;
1876 }
1877 unittest
1878 {
1879     align(16) int[4] correct = [-1, 2, 3, 4];
1880     int4 A = cast(int4) _mm_load_si128(cast(__m128i*) correct.ptr);
1881     assert(A.array == correct);
1882 }
1883 
1884 alias _mm_load1_pd = _mm_load_pd1; ///
1885 
1886 /// Load a double-precision (64-bit) floating-point element from memory into the upper element of result, and copy the 
1887 /// lower element from `a` to result. `mem_addr` does not need to be aligned on any particular boundary.
1888 __m128d _mm_loadh_pd (__m128d a, const(double)* mem_addr) pure @trusted
1889 {
1890     pragma(inline, true);
1891     a.ptr[1] = *mem_addr;
1892     return a;
1893 }
1894 unittest
1895 {
1896     double A = 7.0;
1897     __m128d B = _mm_setr_pd(4.0, -5.0);
1898     __m128d R = _mm_loadh_pd(B, &A);
1899     double[2] correct = [ 4.0, 7.0 ];
1900     assert(R.array == correct);
1901 }
1902 
/// Load 64-bit integer from memory into the lower element of result, and zero the upper element.
// Note: strange signature since the memory doesn't have to be aligned (Issue #60)
__m128i _mm_loadl_epi64 (const(__m128i)* mem_addr) pure @trusted // TODO signature
1906 {
1907     pragma(inline, true);
1908     auto pLong = cast(const(long)*)mem_addr;
1909     long2 r = [0, 0];
1910     r.ptr[0] = *pLong;
1911     return cast(__m128i)(r);
1912 }
1913 unittest
1914 {
1915     long A = 0x7878787870707070;
1916     long2 R = cast(long2) _mm_loadl_epi64(cast(__m128i*)&A);
1917     long[2] correct = [0x7878787870707070, 0];
1918     assert(R.array == correct);
1919 }
1920 
1921 /// Load a double-precision (64-bit) floating-point element from memory into the lower element of result, and copy the 
/// upper element from `a` to result. `mem_addr` does not need to be aligned on any particular boundary.
1923 __m128d _mm_loadl_pd (__m128d a, const(double)* mem_addr) pure @trusted
1924 {
1925     a.ptr[0] = *mem_addr;
1926     return a;
1927 }
1928 unittest
1929 {
1930     double A = 7.0;
1931     __m128d B = _mm_setr_pd(4.0, -5.0);
1932     __m128d R = _mm_loadl_pd(B, &A);
1933     double[2] correct = [ 7.0, -5.0 ];
1934     assert(R.array == correct);
1935 }
1936 
1937 /// Load 2 double-precision (64-bit) floating-point elements from memory into result in reverse order. 
1938 /// `mem_addr` must be aligned on a 16-byte boundary or a general-protection exception may be generated.
1939 __m128d _mm_loadr_pd (const(double)* mem_addr) pure @trusted
1940 {
1941     __m128d a = *cast(__m128d*)(mem_addr);
1942     __m128d r; // PERF =void;
1943     r.ptr[0] = a.array[1];
1944     r.ptr[1] = a.array[0];
1945     return r;
1946 }
1947 unittest
1948 {
1949     align(16) double[2] A = [56.0, -74.0];
1950     __m128d R = _mm_loadr_pd(A.ptr);
1951     double[2] correct = [-74.0, 56.0];
1952     assert(R.array == correct);
1953 }
1954 
1955 /// Load 128-bits (composed of 2 packed double-precision (64-bit) floating-point elements) from memory. 
1956 /// `mem_addr` does not need to be aligned on any particular boundary.
1957 __m128d _mm_loadu_pd (const(double)* mem_addr) pure @trusted
1958 {
1959     pragma(inline, true);
1960     static if (GDC_with_SSE2)
1961     {
1962         return __builtin_ia32_loadupd(mem_addr); 
1963     }
1964     else version(LDC)
1965     {
1966         return loadUnaligned!(double2)(mem_addr);
1967     }
1968     else version(DigitalMars)
1969     {
1970         // Apparently inside __simd you can use aligned dereferences without fear.
1971         // That was issue 23048 on dlang's Bugzilla.
1972         static if (DMD_with_DSIMD)
1973         {
1974             return cast(__m128d)__simd(XMM.LODUPD, *cast(double2*)mem_addr);
1975         }
1976         else static if (SSESizedVectorsAreEmulated)
1977         {
            // Since this vector is emulated, it doesn't have alignment constraints
            // and as such we can just cast it.
1980             return *cast(__m128d*)(mem_addr);
1981         }
1982         else
1983         {
1984             __m128d result;
1985             result.ptr[0] = mem_addr[0];
1986             result.ptr[1] = mem_addr[1];
1987             return result;
1988         }
1989     }
1990     else
1991     {
1992         __m128d result;
1993         result.ptr[0] = mem_addr[0];
1994         result.ptr[1] = mem_addr[1];
1995         return result;
1996     }
1997 }
1998 unittest
1999 {
2000     double[2] A = [56.0, -75.0];
2001     __m128d R = _mm_loadu_pd(A.ptr);
2002     double[2] correct = [56.0, -75.0];
2003     assert(R.array == correct);
2004 }
2005 
2006 /// Load 128-bits of integer data from memory. `mem_addr` does not need to be aligned on any particular boundary.
2007 __m128i _mm_loadu_si128 (const(__m128i)* mem_addr) pure @trusted
2008 {
2009     // PERF DMD
2010     pragma(inline, true);
2011     static if (GDC_with_SSE2)
2012     {
2013         return cast(__m128i) __builtin_ia32_loaddqu(cast(const(char*))mem_addr);
2014     }
2015     else version(LDC)
2016     {
2017         return loadUnaligned!(__m128i)(cast(int*)mem_addr);
2018     }
2019     else
2020     {
2021         const(int)* p = cast(const(int)*)mem_addr;
2022         __m128i r = void;
2023         r.ptr[0] = p[0];
2024         r.ptr[1] = p[1];
2025         r.ptr[2] = p[2];
2026         r.ptr[3] = p[3];
2027         return r;
2028     }
2029 }
2030 unittest
2031 {
2032     align(16) int[4] correct = [-1, 2, -3, 4];
2033     int4 A = cast(int4) _mm_loadu_si128(cast(__m128i*) correct.ptr);
2034     assert(A.array == correct);
2035 }
2036 
2037 /// Load unaligned 32-bit integer from memory into the first element of result.
2038 __m128i _mm_loadu_si32 (const(void)* mem_addr) pure @trusted
2039 {
2040     pragma(inline, true);
2041     int r = *cast(int*)(mem_addr);
2042     int4 result = [0, 0, 0, 0];
2043     result.ptr[0] = r;
2044     return result;
2045 }
2046 unittest
2047 {
2048     int r = 42;
2049     __m128i A = _mm_loadu_si32(&r);
2050     int[4] correct = [42, 0, 0, 0];
2051     assert(A.array == correct);
2052 }
2053 
2054 /// Multiply packed signed 16-bit integers in `a` and `b`, producing intermediate
2055 /// signed 32-bit integers. Horizontally add adjacent pairs of intermediate 32-bit integers,
2056 /// and pack the results in destination.
2057 __m128i _mm_madd_epi16 (__m128i a, __m128i b) pure @trusted
2058 {
2059     static if (GDC_with_SSE2)
2060     {
2061         return cast(__m128i) __builtin_ia32_pmaddwd128(cast(short8)a, cast(short8)b);
2062     }
2063     else static if (LDC_with_SSE2)
2064     {
2065         return cast(__m128i) __builtin_ia32_pmaddwd128(cast(short8)a, cast(short8)b);
2066     }
2067     else static if (LDC_with_ARM64)
2068     {
2069         int4 pl = vmull_s16(vget_low_s16(cast(short8)a), vget_low_s16(cast(short8)b));
2070         int4 ph = vmull_s16(vget_high_s16(cast(short8)a), vget_high_s16(cast(short8)b));
2071         int2 rl = vpadd_s32(vget_low_s32(pl), vget_high_s32(pl));
2072         int2 rh = vpadd_s32(vget_low_s32(ph), vget_high_s32(ph));
2073         return vcombine_s32(rl, rh);
2074     }
2075     else
2076     {
2077         short8 sa = cast(short8)a;
2078         short8 sb = cast(short8)b;
2079         int4 r;
2080         foreach(i; 0..4)
2081         {
2082             r.ptr[i] = sa.array[2*i] * sb.array[2*i] + sa.array[2*i+1] * sb.array[2*i+1];
2083         }
2084         return r;
2085     }
2086 }
2087 unittest
2088 {
2089     short8 A = [0, 1, 2, 3, -32768, -32768, 32767, 32767];
2090     short8 B = [0, 1, 2, 3, -32768, -32768, 32767, 32767];
2091     int4 R = _mm_madd_epi16(cast(__m128i)A, cast(__m128i)B);
2092     int[4] correct = [1, 13, -2147483648, 2*32767*32767];
2093     assert(R.array == correct);
2094 }
2095 
2096 /// Conditionally store 8-bit integer elements from `a` into memory using `mask`
2097 /// (elements are not stored when the highest bit is not set in the corresponding element)
2098 /// and a non-temporal memory hint. `mem_addr` does not need to be aligned on any particular
2099 /// boundary.
2100 void _mm_maskmoveu_si128 (__m128i a, __m128i mask, void* mem_addr) @trusted
2101 {
2102     static if (GDC_with_SSE2)
2103     {    
2104         return __builtin_ia32_maskmovdqu(cast(ubyte16)a, cast(ubyte16)mask, cast(char*)mem_addr);
2105     }
2106     else static if (LDC_with_SSE2)
2107     {
2108         return __builtin_ia32_maskmovdqu(cast(byte16)a, cast(byte16)mask, cast(char*)mem_addr);
2109     }
2110     else static if (LDC_with_ARM64)
2111     {
2112         // PERF: catastrophic on ARM32
2113         byte16 bmask  = cast(byte16)mask;
2114         byte16 shift = 7;
2115         bmask = bmask >> shift; // sign-extend to have a 0xff or 0x00 mask
2116         mask = cast(__m128i) bmask;
2117         __m128i dest = loadUnaligned!__m128i(cast(int*)mem_addr);
2118         dest = (a & mask) | (dest & ~mask);
2119         storeUnaligned!__m128i(dest, cast(int*)mem_addr);
2120     }
2121     else
2122     {
2123         byte16 b = cast(byte16)a;
2124         byte16 m = cast(byte16)mask;
2125         byte* dest = cast(byte*)(mem_addr);
2126         foreach(j; 0..16)
2127         {
2128             if (m.array[j] & 128)
2129             {
2130                 dest[j] = b.array[j];
2131             }
2132         }
2133     }
2134 }
2135 unittest
2136 {
2137     ubyte[16] dest =           [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42];
2138     __m128i mask = _mm_setr_epi8(0,-1, 0,-1,-1, 1,-1,-1, 0,-1,-4,-1,-1, 0,-127, 0);
2139     __m128i A    = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15);
2140     _mm_maskmoveu_si128(A, mask, dest.ptr);
2141     ubyte[16] correct =        [42, 1,42, 3, 4,42, 6, 7,42, 9,10,11,12,42,14,42];
2142     assert(dest == correct);
2143 }
2144 
2145 /// Compare packed signed 16-bit integers in `a` and `b`, and return packed maximum values.
2146 __m128i _mm_max_epi16 (__m128i a, __m128i b) pure @safe
2147 {
2148     static if (GDC_with_SSE2)
2149     {
2150         return cast(__m128i) __builtin_ia32_pmaxsw128(cast(short8)a, cast(short8)b);
2151     }
2152     else version(LDC)
2153     {
        // x86: pmaxsw since LDC 1.0 -O1
        // ARM64: smax.8h since LDC 1.5 -O1
2156         short8 sa = cast(short8)a;
2157         short8 sb = cast(short8)b;
2158         short8 greater = greaterMask!short8(sa, sb);
2159         return cast(__m128i)( (greater & sa) | (~greater & sb) );
2160     }
2161     else
2162     {
2163         __m128i lowerShorts = _mm_cmpgt_epi16(a, b); // ones where a should be selected, b else
2164         __m128i aTob = _mm_xor_si128(a, b); // a ^ (a ^ b) == b
2165         __m128i mask = _mm_and_si128(aTob, lowerShorts);
2166         return _mm_xor_si128(b, mask);
2167     }
2168 }
2169 unittest
2170 {
2171     short8 R = cast(short8) _mm_max_epi16(_mm_setr_epi16(32767, 1, -4, -8, 9,  7, 0,-57),
2172                                           _mm_setr_epi16(-4,-8,  9,  7, 0,-32768, 0,  0));
2173     short[8] correct =                                  [32767, 1,  9,  7, 9,  7, 0,  0];
2174     assert(R.array == correct);
2175 }
2176 
2177 /// Compare packed unsigned 8-bit integers in a and b, and return packed maximum values.
2178 __m128i _mm_max_epu8 (__m128i a, __m128i b) pure @safe
2179 {
2180     version(LDC)
2181     {
2182         // x86: pmaxub since LDC 1.0.0 -O1
2183         // ARM64: umax.16b since LDC 1.5.0 -O1
2184         // PERF: catastrophic on ARM32
2185         ubyte16 sa = cast(ubyte16)a;
2186         ubyte16 sb = cast(ubyte16)b;
2187         ubyte16 greater = cast(ubyte16) greaterMask!ubyte16(sa, sb);
2188         return cast(__m128i)( (greater & sa) | (~greater & sb) );
2189     }
2190     else
2191     {
2192         __m128i value128 = _mm_set1_epi8(-128);
2193         __m128i higher = _mm_cmpgt_epi8(_mm_add_epi8(a, value128), _mm_add_epi8(b, value128)); // signed comparison
2194         __m128i aTob = _mm_xor_si128(a, b); // a ^ (a ^ b) == b
2195         __m128i mask = _mm_and_si128(aTob, higher);
2196         return _mm_xor_si128(b, mask);
2197     }
2198 }
2199 unittest
2200 {
2201     byte16 R = cast(byte16) _mm_max_epu8(_mm_setr_epi8(45, 1, -4, -8, 9,  7, 0,-57, -4,-8,  9,  7, 0,-57, 0,  0),
2202                                          _mm_setr_epi8(-4,-8,  9,  7, 0,-57, 0,  0, 45, 1, -4, -8, 9,  7, 0,-57));
2203     byte[16] correct =                                [-4,-8, -4, -8, 9,-57, 0,-57, -4,-8, -4, -8, 9,-57, 0,-57];
2204     assert(R.array == correct);
2205 }
2206 
2207 /// Compare packed double-precision (64-bit) floating-point elements in `a` and `b`, and return packed maximum values.
2208 __m128d _mm_max_pd (__m128d a, __m128d b) pure @trusted
2209 {
2210     static if (GDC_with_SSE2)
2211     {
2212         return __builtin_ia32_maxpd(a, b);
2213     }
2214     else
2215     {
2216         // x86: Generates maxpd starting with LDC 1.9 -O2
2217         a.ptr[0] = (a.array[0] > b.array[0]) ? a.array[0] : b.array[0];
2218         a.ptr[1] = (a.array[1] > b.array[1]) ? a.array[1] : b.array[1];
2219         return a;
2220     }
2221 }
2222 unittest
2223 {
2224     __m128d A = _mm_setr_pd(4.0, 1.0);
2225     __m128d B = _mm_setr_pd(1.0, 8.0);
2226     __m128d M = _mm_max_pd(A, B);
2227     assert(M.array[0] == 4.0);
2228     assert(M.array[1] == 8.0);
2229 }
2230 
2231 /// Compare the lower double-precision (64-bit) floating-point elements in `a` and `b`, store the maximum value in the 
2232 /// lower element of result, and copy the upper element from `a` to the upper element of result.
2233 __m128d _mm_max_sd (__m128d a, __m128d b) pure @trusted
2234 {
2235     static if (GDC_with_SSE2)
2236     {
2237         return __builtin_ia32_maxsd(a, b);
2238     }
2239     else
2240     {
2241          __m128d r = a;
2242         // Generates maxsd starting with LDC 1.3
2243         r.ptr[0] = (a.array[0] > b.array[0]) ? a.array[0] : b.array[0];
2244         return r;
2245     }
2246 }
2247 unittest
2248 {
2249     __m128d A = _mm_setr_pd(1.0, 1.0);
2250     __m128d B = _mm_setr_pd(4.0, 2.0);
2251     __m128d M = _mm_max_sd(A, B);
2252     assert(M.array[0] == 4.0);
2253     assert(M.array[1] == 1.0);
2254 }
2255 
2256 /// Perform a serializing operation on all load-from-memory and store-to-memory instructions that were issued prior to 
2257 /// this instruction. Guarantees that every memory access that precedes, in program order, the memory fence instruction 
2258 /// is globally visible before any memory instruction which follows the fence in program order.
2259 void _mm_mfence() @trusted // not pure!
2260 {
2261     version(GNU)
2262     {
2263         static if (GDC_with_SSE2)
2264         {
2265             __builtin_ia32_mfence();
2266         }
2267         else version(X86)
2268         {
2269             asm pure nothrow @nogc @trusted
2270             {
2271                 "mfence;\n" : : : ;
2272             }
2273         }
2274         else
2275             static assert(false);
2276     }
2277     else static if (LDC_with_SSE2)
2278     {
2279         __builtin_ia32_mfence();
2280     }
2281     else static if (DMD_with_asm)
2282     {
2283         asm nothrow @nogc pure @safe
2284         {
2285             mfence;
2286         }
2287     }
2288     else version(LDC)
2289     {
2290         // Note: will generate the DMB ish instruction on ARM
2291         llvm_memory_fence();
2292     }
2293     else
2294         static assert(false);
2295 }
2296 unittest
2297 {
2298     _mm_mfence();
2299 }
2300 
2301 /// Compare packed signed 16-bit integers in `a` and `b`, and return packed minimum values.
2302 __m128i _mm_min_epi16 (__m128i a, __m128i b) pure @safe
2303 {
2304     static if (GDC_with_SSE2)
2305     {
2306         return cast(__m128i) __builtin_ia32_pminsw128(cast(short8)a, cast(short8)b);
2307     }
2308     else version(LDC)
2309     {
        // x86: pminsw since LDC 1.0 -O1
        // ARM64: smin.8h since LDC 1.5 -O1
2312         short8 sa = cast(short8)a;
2313         short8 sb = cast(short8)b;
2314         short8 greater = greaterMask!short8(sa, sb);
2315         return cast(__m128i)( (~greater & sa) | (greater & sb) );
2316     }
2317     else
2318     {
2319         __m128i lowerShorts = _mm_cmplt_epi16(a, b); // ones where a should be selected, b else
2320         __m128i aTob = _mm_xor_si128(a, b); // a ^ (a ^ b) == b
2321         __m128i mask = _mm_and_si128(aTob, lowerShorts);
2322         return _mm_xor_si128(b, mask);
2323     }
2324 }
2325 unittest
2326 {
2327     short8 R = cast(short8) _mm_min_epi16(_mm_setr_epi16(45, 1, -4, -8, 9,  7, 0,-32768),
2328                                           _mm_setr_epi16(-4,-8,  9,  7, 0,-57, 0,  0));
2329     short[8] correct =                                  [-4,-8, -4, -8, 0,-57, 0, -32768];
2330     assert(R.array == correct);
2331 }
2332 
2333 /// Compare packed unsigned 8-bit integers in `a` and `b`, and return packed minimum values.
2334 __m128i _mm_min_epu8 (__m128i a, __m128i b) pure @safe
2335 {
2336     version(LDC)
2337     {
2338         // x86: pminub since LDC 1.0.0 -O1
        // ARM64: umin.16b since LDC 1.5.0 -O1
2340         // PERF: catastrophic on ARM32
2341         ubyte16 sa = cast(ubyte16)a;
2342         ubyte16 sb = cast(ubyte16)b;
2343         ubyte16 greater = cast(ubyte16) greaterMask!ubyte16(sa, sb);
2344         return cast(__m128i)( (~greater & sa) | (greater & sb) );
2345     }
2346     else
2347     {
2348         __m128i value128 = _mm_set1_epi8(-128);
2349         __m128i lower = _mm_cmplt_epi8(_mm_add_epi8(a, value128), _mm_add_epi8(b, value128)); // signed comparison
2350         __m128i aTob = _mm_xor_si128(a, b); // a ^ (a ^ b) == b
2351         __m128i mask = _mm_and_si128(aTob, lower);
2352         return _mm_xor_si128(b, mask);
2353     }
2354 }
2355 unittest
2356 {
2357     byte16 R = cast(byte16) _mm_min_epu8(_mm_setr_epi8(45, 1, -4, -8, 9,  7, 0,-57, -4,-8,  9,  7, 0,-57, 0,  0),
2358                                          _mm_setr_epi8(-4,-8,  9,  7, 0,-57, 0,  0, 45, 1, -4, -8, 9,  7, 0,-57));
2359     byte[16] correct =                                [45, 1,  9,  7, 0,  7, 0,  0, 45, 1,  9,  7, 0,  7, 0,  0];
2360     assert(R.array == correct);
2361 }
2362 
2363 /// Compare packed double-precision (64-bit) floating-point elements in `a` and `b`, and return packed minimum values.
2364 __m128d _mm_min_pd (__m128d a, __m128d b) pure @trusted
2365 {
2366     static if (GDC_with_SSE2)
2367     {
2368         return __builtin_ia32_minpd(a, b);
2369     }
2370     else
2371     {
2372         // Generates minpd starting with LDC 1.9
2373         a.ptr[0] = (a.array[0] < b.array[0]) ? a.array[0] : b.array[0];
2374         a.ptr[1] = (a.array[1] < b.array[1]) ? a.array[1] : b.array[1];
2375         return a;
2376     }
2377 }
2378 unittest
2379 {
2380     __m128d A = _mm_setr_pd(1.0, 2.0);
2381     __m128d B = _mm_setr_pd(4.0, 1.0);
2382     __m128d M = _mm_min_pd(A, B);
2383     assert(M.array[0] == 1.0);
2384     assert(M.array[1] == 1.0);
2385 }
2386 
2387 /// Compare the lower double-precision (64-bit) floating-point elements in `a` and `b`, store the minimum value in 
2388 /// the lower element of result, and copy the upper element from `a` to the upper element of result.
2389 __m128d _mm_min_sd (__m128d a, __m128d b) pure @safe
2390 {
2391     static if (GDC_with_SSE2)
2392     {
2393         return __builtin_ia32_minsd(a, b);
2394     }
2395     else
2396     {
2397         // Generates minsd starting with LDC 1.3
2398         __m128d r = a;
2399         r.array[0] = (a.array[0] < b.array[0]) ? a.array[0] : b.array[0];
2400         return r;
2401     }
2402 }
2403 unittest
2404 {
2405     __m128d A = _mm_setr_pd(1.0, 3.0);
2406     __m128d B = _mm_setr_pd(4.0, 2.0);
2407     __m128d M = _mm_min_sd(A, B);
2408     assert(M.array[0] == 1.0);
2409     assert(M.array[1] == 3.0);
2410 }
2411 
2412 /// Copy the lower 64-bit integer in `a` to the lower element of result, and zero the upper element.
2413 __m128i _mm_move_epi64 (__m128i a) pure @trusted
2414 {
2415     static if (GDC_with_SSE2)
2416     {
2417         // slightly better with GDC -O0
2418         return cast(__m128i) __builtin_ia32_movq128(cast(long2)a); 
2419     }
2420     else
2421     {
2422         long2 result = [ 0, 0 ];
2423         long2 la = cast(long2) a;
2424         result.ptr[0] = la.array[0];
2425         return cast(__m128i)(result);
2426     }
2427 }
2428 unittest
2429 {
2430     long2 A = [13, 47];
2431     long2 B = cast(long2) _mm_move_epi64( cast(__m128i)A );
2432     long[2] correct = [13, 0];
2433     assert(B.array == correct);
2434 }
2435 
2436 /// Move the lower double-precision (64-bit) floating-point element from `b` to the lower element of result, and copy 
2437 /// the upper element from `a` to the upper element of dst.
2438 __m128d _mm_move_sd (__m128d a, __m128d b) pure @trusted
2439 {
2440     static if (GDC_with_SSE2)
2441     {
2442         return __builtin_ia32_movsd(a, b); 
2443     }
2444     else
2445     {
2446         b.ptr[1] = a.array[1];
2447         return b;
2448     }
2449 }
2450 unittest
2451 {
2452     double2 A = [13.0, 47.0];
2453     double2 B = [34.0, 58.0];
2454     double2 C = _mm_move_sd(A, B);
2455     double[2] correct = [34.0, 47.0];
2456     assert(C.array == correct);
2457 }
2458 
/// Create mask from the most significant bit of each 8-bit element in `a`.
2460 int _mm_movemask_epi8 (__m128i a) pure @trusted
2461 {
2462     // PERF: Not possible in D_SIMD because of https://issues.dlang.org/show_bug.cgi?id=8047
2463     static if (GDC_with_SSE2)
2464     {
2465         return __builtin_ia32_pmovmskb128(cast(ubyte16)a);
2466     }
2467     else static if (LDC_with_SSE2)
2468     {
2469         return __builtin_ia32_pmovmskb128(cast(byte16)a);
2470     }
2471     else static if (LDC_with_ARM64)
2472     {
        // Solution from https://stackoverflow.com/questions/11870910/sse-mm-movemask-epi8-equivalent-method-for-arm-neon
        // The other two solutions there lead to intrinsics that LLVM doesn't know, which took a long time to track down.
        // So there might be something a bit faster, but this one is reasonable and branchless.
2476         byte8 mask_shift;
2477         mask_shift.ptr[0] = 7;
2478         mask_shift.ptr[1] = 6;
2479         mask_shift.ptr[2] = 5;
2480         mask_shift.ptr[3] = 4;
2481         mask_shift.ptr[4] = 3;
2482         mask_shift.ptr[5] = 2;
2483         mask_shift.ptr[6] = 1;
2484         mask_shift.ptr[7] = 0;
2485         byte8 mask_and = byte8(-128);
2486         byte8 lo = vget_low_u8(cast(byte16)a);
2487         byte8 hi = vget_high_u8(cast(byte16)a);
2488         lo = vand_u8(lo, mask_and);
2489         lo = vshr_u8(lo, mask_shift);
2490         hi = vand_u8(hi, mask_and);
2491         hi = vshr_u8(hi, mask_shift);
2492         lo = vpadd_u8(lo,lo);
2493         lo = vpadd_u8(lo,lo);
2494         lo = vpadd_u8(lo,lo);
2495         hi = vpadd_u8(hi,hi);
2496         hi = vpadd_u8(hi,hi);
2497         hi = vpadd_u8(hi,hi);
2498         return (cast(ubyte)(hi[0]) << 8) | cast(ubyte)(lo[0]);
2499     }
2500     else
2501     {
2502         byte16 ai = cast(byte16)a;
2503         int r = 0;
2504         foreach(bit; 0..16)
2505         {
2506             if (ai.array[bit] < 0) r += (1 << bit);
2507         }
2508         return r;
2509     }
2510 }
2511 unittest
2512 {
2513     assert(0x9C36 == _mm_movemask_epi8(_mm_set_epi8(-1, 1, 2, -3, -1, -1, 4, 8, 127, 0, -1, -1, 0, -1, -1, 0)));
2514 }
2515 
/// Create mask from the most significant bit of each 16-bit element in `a`. #BONUS
2517 int _mm_movemask_epi16 (__m128i a) pure @trusted
2518 {
2519     return _mm_movemask_epi8(_mm_packs_epi16(a, _mm_setzero_si128()));
2520 }
2521 unittest
2522 {
2523     assert(0x9C == _mm_movemask_epi16(_mm_set_epi16(-1, 1, 2, -3, -32768, -1, 32767, 8)));
2524 }
2525 
/// Set each bit of mask result based on the most significant bit of the corresponding packed double-precision (64-bit) 
/// floating-point element in `v`.
2528 int _mm_movemask_pd(__m128d v) pure @safe
2529 {
2530     // PERF: Not possible in D_SIMD because of https://issues.dlang.org/show_bug.cgi?id=8047
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_movmskpd(v);
    }
    else static if (LDC_with_SSE2)
    {
        return __builtin_ia32_movmskpd(v);
    }
2543     else
2544     {
2545         long2 lv = cast(long2)v;
2546         int r = 0;
2547         if (lv.array[0] < 0) r += 1;
2548         if (lv.array[1] < 0) r += 2;
2549         return r;
2550     }
2551 }
2552 unittest
2553 {
2554     __m128d A = cast(__m128d) _mm_set_epi64x(-1, 0);
2555     assert(_mm_movemask_pd(A) == 2);
2556 }
2557 
2558 /// Copy the lower 64-bit integer in `v`.
2559 __m64 _mm_movepi64_pi64 (__m128i v) pure @safe
2560 {
2561     long2 lv = cast(long2)v;
2562     return long1(lv.array[0]);
2563 }
2564 unittest
2565 {
2566     __m128i A = _mm_set_epi64x(-1, -2);
2567     __m64 R = _mm_movepi64_pi64(A);
2568     assert(R.array[0] == -2);
2569 }
2570 
2571 /// Copy the 64-bit integer `a` to the lower element of dest, and zero the upper element.
2572 __m128i _mm_movpi64_epi64 (__m64 a) pure @trusted
2573 {
2574     long2 r;
2575     r.ptr[0] = a.array[0];
2576     r.ptr[1] = 0;
2577     return cast(__m128i)r;
2578 }
2579 
/// Multiply the low unsigned 32-bit integers from each packed 64-bit element in `a` and `b`, 
/// and return the unsigned 64-bit results.
// Note: generates pmuludq in LDC with -O1
2581 __m128i _mm_mul_epu32 (__m128i a, __m128i b) pure @trusted
2582 {
2583     __m128i zero = _mm_setzero_si128();
2584 
2585     static if (__VERSION__ >= 2088)
2586     {
2587         // Need LLVM9 to avoid this shufflevector
2588         long2 la, lb;
2589         la.ptr[0] = cast(uint)a.array[0];
2590         la.ptr[1] = cast(uint)a.array[2];
2591         lb.ptr[0] = cast(uint)b.array[0];
2592         lb.ptr[1] = cast(uint)b.array[2];
2593     }
2594     else
2595     {
2596         long2 la = cast(long2) shufflevector!(int4, 0, 4, 2, 6)(a, zero); // TODO remove this use of shufflevector except for LDC
2597         long2 lb = cast(long2) shufflevector!(int4, 0, 4, 2, 6)(b, zero);
2598     }
2599 
2600     version(DigitalMars)
2601     {
        // DMD has no long2 multiplication
2604         la.ptr[0] *= lb.array[0];
2605         la.ptr[1] *= lb.array[1];
2606         return cast(__m128i)(la);
2607     }
2608     else
2609     {
2610         static if (__VERSION__ >= 2076)
2611         {
2612             return cast(__m128i)(la * lb);
2613         }
2614         else
2615         {
2616             // long2 mul not supported before LDC 1.5
2617             la.ptr[0] *= lb.array[0];
2618             la.ptr[1] *= lb.array[1];
2619             return cast(__m128i)(la);
2620         }
2621     }
2622 }
2623 unittest
2624 {
2625     __m128i A = _mm_set_epi32(42, 0xDEADBEEF, 42, 0xffffffff);
2626     __m128i B = _mm_set_epi32(42, 0xCAFEBABE, 42, 0xffffffff);
2627     __m128i C = _mm_mul_epu32(A, B);
2628     long2 LC = cast(long2)C;
2629     assert(LC.array[0] == 18446744065119617025uL);
2630     assert(LC.array[1] == 12723420444339690338uL);
2631 }
2632 
2633 /// Multiply packed double-precision (64-bit) floating-point elements in `a` and `b`, and return the results. 
2634 __m128d _mm_mul_pd(__m128d a, __m128d b) pure @safe
2635 {
2636     pragma(inline, true);
2637     return a * b;
2638 }
2639 unittest
2640 {
2641     __m128d a = [-2.0, 1.5];
2642     a = _mm_mul_pd(a, a);
2643     assert(a.array == [4.0, 2.25]);
2644 }
2645 
2646 /// Multiply the lower double-precision (64-bit) floating-point element in `a` and `b`, store the result in the lower 
2647 /// element of result, and copy the upper element from `a` to the upper element of result.
2648 __m128d _mm_mul_sd(__m128d a, __m128d b) pure @trusted
2649 {
2650     version(DigitalMars)
2651     {    
2652         // Work-around for https://issues.dlang.org/show_bug.cgi?id=19599
2653         // Note that this is unneeded since DMD >= 2.094.0 at least, haven't investigated again
2654         asm pure nothrow @nogc @trusted { nop;}
2655         a.array[0] = a.array[0] * b.array[0];
2656         return a;
2657     }
2658     else static if (GDC_with_SSE2)
2659     {
2660         return __builtin_ia32_mulsd(a, b);
2661     }
2662     else
2663     {
2664         a.ptr[0] *= b.array[0];
2665         return a;
2666     }
2667 }
2668 unittest
2669 {
2670     __m128d a = [-2.0, 1.5];
2671     a = _mm_mul_sd(a, a);
2672     assert(a.array == [4.0, 1.5]);
2673 }
2674 
2675 /// Multiply the low unsigned 32-bit integers from `a` and `b`, 
2676 /// and get an unsigned 64-bit result.
2677 __m64 _mm_mul_su32 (__m64 a, __m64 b) pure @safe
2678 {
2679     return to_m64(_mm_mul_epu32(to_m128i(a), to_m128i(b)));
2680 }
2681 unittest
2682 {
2683     __m64 A = _mm_set_pi32(42, 0xDEADBEEF);
2684     __m64 B = _mm_set_pi32(42, 0xCAFEBABE);
2685     __m64 C = _mm_mul_su32(A, B);
2686     assert(C.array[0] == 0xDEADBEEFuL * 0xCAFEBABEuL);
2687 }
2688 
2689 /// Multiply the packed signed 16-bit integers in `a` and `b`, producing intermediate 32-bit integers, and return the 
2690 /// high 16 bits of the intermediate integers.
2691 __m128i _mm_mulhi_epi16 (__m128i a, __m128i b) pure @trusted
2692 {
2693     static if (GDC_with_SSE2)
2694     {
2695         return cast(__m128i) __builtin_ia32_pmulhw128(cast(short8)a, cast(short8)b);
2696     }
2697     else static if (LDC_with_SSE2)
2698     {
2699         return cast(__m128i) __builtin_ia32_pmulhw128(cast(short8)a, cast(short8)b);
2700     }
2701     else
2702     {
2703         // ARM64: LDC 1.5 -O2 or later gives a nice sequence with 2 x ext.16b, 2 x smull.4s and shrn.4h shrn2.8h
2704         //        PERF: it seems the simde solution has one less instruction in ARM64.
2705         // PERF: Catastrophic in ARM32.
2706         short8 sa = cast(short8)a;
2707         short8 sb = cast(short8)b;
2708         short8 r = void;
2709         r.ptr[0] = (sa.array[0] * sb.array[0]) >> 16;
2710         r.ptr[1] = (sa.array[1] * sb.array[1]) >> 16;
2711         r.ptr[2] = (sa.array[2] * sb.array[2]) >> 16;
2712         r.ptr[3] = (sa.array[3] * sb.array[3]) >> 16;
2713         r.ptr[4] = (sa.array[4] * sb.array[4]) >> 16;
2714         r.ptr[5] = (sa.array[5] * sb.array[5]) >> 16;
2715         r.ptr[6] = (sa.array[6] * sb.array[6]) >> 16;
2716         r.ptr[7] = (sa.array[7] * sb.array[7]) >> 16;
2717         return cast(__m128i)r;
2718     }
2719 }
2720 unittest
2721 {
2722     __m128i A = _mm_setr_epi16(0, -16, 2, 3, 4, 8, 16, 7);
2723     __m128i B = _mm_set1_epi16(16384);
2724     short8 R = cast(short8)_mm_mulhi_epi16(A, B);
2725     short[8] correct = [0, -4, 0, 0, 1, 2, 4, 1];
2726     assert(R.array == correct);
2727 }
2728 
2729 /// Multiply the packed unsigned 16-bit integers in `a` and `b`, producing intermediate 32-bit integers, and return the 
2730 /// high 16 bits of the intermediate integers.
2731 __m128i _mm_mulhi_epu16 (__m128i a, __m128i b) pure @trusted
2732 {
2733     static if (GDC_with_SSE2)
2734     {
2735         return cast(__m128i) __builtin_ia32_pmulhuw128(cast(short8)a, cast(short8)b);
2736     }
2737     else static if (LDC_with_SSE2)
2738     {
2739         return cast(__m128i) __builtin_ia32_pmulhuw128(cast(short8)a, cast(short8)b);
2740     }
2741     else
2742     {
2743         // ARM64: LDC 1.5 -O2 or later gives a nice sequence with 2 x ext.16b, 2 x umull.4s and shrn.4h shrn2.8h
2744         //      it seems the simde solution has one less instruction in ARM64
2745         // PERF: Catastrophic in ARM32.
2746         short8 sa = cast(short8)a;
2747         short8 sb = cast(short8)b;
2748         short8 r = void;
2749         r.ptr[0] = cast(short)( (cast(ushort)sa.array[0] * cast(ushort)sb.array[0]) >> 16 );
2750         r.ptr[1] = cast(short)( (cast(ushort)sa.array[1] * cast(ushort)sb.array[1]) >> 16 );
2751         r.ptr[2] = cast(short)( (cast(ushort)sa.array[2] * cast(ushort)sb.array[2]) >> 16 );
2752         r.ptr[3] = cast(short)( (cast(ushort)sa.array[3] * cast(ushort)sb.array[3]) >> 16 );
2753         r.ptr[4] = cast(short)( (cast(ushort)sa.array[4] * cast(ushort)sb.array[4]) >> 16 );
2754         r.ptr[5] = cast(short)( (cast(ushort)sa.array[5] * cast(ushort)sb.array[5]) >> 16 );
2755         r.ptr[6] = cast(short)( (cast(ushort)sa.array[6] * cast(ushort)sb.array[6]) >> 16 );
2756         r.ptr[7] = cast(short)( (cast(ushort)sa.array[7] * cast(ushort)sb.array[7]) >> 16 );
2757         return cast(__m128i)r;
2758     }
2759 }
2760 unittest
2761 {
2762     __m128i A = _mm_setr_epi16(0, -16, 2, 3, 4, 8, 16, 7);
2763     __m128i B = _mm_set1_epi16(16384);
2764     short8 R = cast(short8)_mm_mulhi_epu16(A, B);
2765     short[8] correct = [0, 0x3FFC, 0, 0, 1, 2, 4, 1];
2766     assert(R.array == correct);
2767 }
2768 
2769 /// Multiply the packed 16-bit integers in `a` and `b`, producing intermediate 32-bit integers, and return the low 16 
2770 /// bits of the intermediate integers.
2771 __m128i _mm_mullo_epi16 (__m128i a, __m128i b) pure @safe
2772 {
2773     return cast(__m128i)(cast(short8)a * cast(short8)b);
2774 }
2775 unittest
2776 {
2777     __m128i A = _mm_setr_epi16(16384, -16, 0,      3, 4, 1, 16, 7);
2778     __m128i B = _mm_set1_epi16(16384);
2779     short8 R = cast(short8)_mm_mullo_epi16(A, B);
2780     short[8] correct = [0, 0, 0, -16384, 0, 16384, 0, -16384];
2781     assert(R.array == correct);
2782 }
2783 
2784 /// Compute the bitwise NOT of 128 bits in `a`. #BONUS
2785 __m128i _mm_not_si128 (__m128i a) pure @safe
2786 {
2787     return ~a;
2788 }
2789 unittest
2790 {
2791     __m128i A = _mm_set1_epi32(-748);
2792     int4 notA = cast(int4) _mm_not_si128(A);
2793     int[4] correct = [747, 747, 747, 747];
2794     assert(notA.array == correct);
2795 }
2796 
2797 /// Compute the bitwise OR of packed double-precision (64-bit) floating-point elements in `a` and `b`.
2798 __m128d _mm_or_pd (__m128d a, __m128d b) pure @safe
2799 {
2800     pragma(inline, true);
2801     return cast(__m128d)( cast(__m128i)a | cast(__m128i)b );
2802 }
2803 
2804 /// Compute the bitwise OR of 128 bits (representing integer data) in `a` and `b`.
2805 __m128i _mm_or_si128 (__m128i a, __m128i b) pure @safe
2806 {
2807     pragma(inline, true);
2808     return a | b;
2809 }
2810 
2811 /// Convert packed signed 32-bit integers from `a` and `b` to packed 16-bit integers using signed saturation.
2812 __m128i _mm_packs_epi32 (__m128i a, __m128i b) pure @trusted
2813 {
2814     static if (GDC_with_SSE2)
2815     {
2816         return cast(__m128i) __builtin_ia32_packssdw128(a, b);
2817     }    
2818     else static if (LDC_with_SSE2)
2819     {
2820         return cast(__m128i) __builtin_ia32_packssdw128(a, b);
2821     }
2822     else static if (LDC_with_ARM64)
2823     {
2824         short4 ra = vqmovn_s32(cast(int4)a);
2825         short4 rb = vqmovn_s32(cast(int4)b);
2826         return cast(__m128i)vcombine_s16(ra, rb);
2827     }
2828     else
2829     {
2830         // PERF: catastrophic on ARM32
2831         short8 r;
2832         r.ptr[0] = saturateSignedIntToSignedShort(a.array[0]);
2833         r.ptr[1] = saturateSignedIntToSignedShort(a.array[1]);
2834         r.ptr[2] = saturateSignedIntToSignedShort(a.array[2]);
2835         r.ptr[3] = saturateSignedIntToSignedShort(a.array[3]);
2836         r.ptr[4] = saturateSignedIntToSignedShort(b.array[0]);
2837         r.ptr[5] = saturateSignedIntToSignedShort(b.array[1]);
2838         r.ptr[6] = saturateSignedIntToSignedShort(b.array[2]);
2839         r.ptr[7] = saturateSignedIntToSignedShort(b.array[3]);
2840         return cast(__m128i)r;
2841     }
2842 }
2843 unittest
2844 {
2845     __m128i A = _mm_setr_epi32(100000, -100000, 1000, 0);
2846     short8 R = cast(short8) _mm_packs_epi32(A, A);
2847     short[8] correct = [32767, -32768, 1000, 0, 32767, -32768, 1000, 0];
2848     assert(R.array == correct);
2849 }
2850 
2851 /// Convert packed signed 16-bit integers from `a` and `b` to packed 8-bit integers using signed saturation.
2852 __m128i _mm_packs_epi16 (__m128i a, __m128i b) pure @trusted
2853 {
2854     static if (GDC_with_SSE2)
2855     {
2856         return cast(__m128i) __builtin_ia32_packsswb128(cast(short8)a, cast(short8)b);
2857     }
2858     else static if (LDC_with_SSE2)
2859     {
2860         return cast(__m128i) __builtin_ia32_packsswb128(cast(short8)a, cast(short8)b);
2861     }
2862     else static if (LDC_with_ARM64)
2863     {
        // generates a nice pair of sqxtn.8b + sqxtn2 since LDC 1.5 -O2
2865         byte8 ra = vqmovn_s16(cast(short8)a);
2866         byte8 rb = vqmovn_s16(cast(short8)b);
2867         return cast(__m128i)vcombine_s8(ra, rb);
2868     }
2869     else
2870     {
2871         // PERF: ARM32 is missing
2872         byte16 r;
2873         short8 sa = cast(short8)a;
2874         short8 sb = cast(short8)b;
2875         foreach(i; 0..8)
2876             r.ptr[i] = saturateSignedWordToSignedByte(sa.array[i]);
2877         foreach(i; 0..8)
2878             r.ptr[i+8] = saturateSignedWordToSignedByte(sb.array[i]);
2879         return cast(__m128i)r;
2880     }
2881 }
2882 unittest
2883 {
2884     __m128i A = _mm_setr_epi16(1000, -1000, 1000, 0, 256, -129, 254, 0);
2885     byte16 R = cast(byte16) _mm_packs_epi16(A, A);
2886     byte[16] correct = [127, -128, 127, 0, 127, -128, 127, 0,
2887                         127, -128, 127, 0, 127, -128, 127, 0];
2888     assert(R.array == correct);
2889 }
2890 
2891 /// Convert packed signed 16-bit integers from `a` and `b` to packed 8-bit integers using unsigned saturation.
2892 __m128i _mm_packus_epi16 (__m128i a, __m128i b) pure @trusted
2893 {
2894     // PERF DMD catastrophic
2895     static if (GDC_with_SSE2)
2896     {
2897         return cast(__m128i) __builtin_ia32_packuswb128(cast(short8)a, cast(short8)b);
2898     }
2899     else static if (LDC_with_SSE2)
2900     {
2901         return cast(__m128i) __builtin_ia32_packuswb128(cast(short8)a, cast(short8)b);
2902     }
2903     else static if (LDC_with_ARM64)
2904     {
        // generates a nice pair of sqxtun + sqxtun2 since LDC 1.5 -O2
2906         byte8 ra = vqmovun_s16(cast(short8)a);
2907         byte8 rb = vqmovun_s16(cast(short8)b);
2908         return cast(__m128i)vcombine_s8(ra, rb);
2909     }
2910     else
2911     {
2912         short8 sa = cast(short8)a;
2913         short8 sb = cast(short8)b;
2914         align(16) ubyte[16] result = void;
2915         for (int i = 0; i < 8; ++i)
2916         {
2917             short s = sa[i];
2918             if (s < 0) s = 0;
2919             if (s > 255) s = 255;
2920             result[i] = cast(ubyte)s;
2921 
2922             s = sb[i];
2923             if (s < 0) s = 0;
2924             if (s > 255) s = 255;
2925             result[i+8] = cast(ubyte)s;
2926         }
2927         return *cast(__m128i*)(result.ptr);
2928     }
2929 }
2930 unittest
2931 {
2932     __m128i A = _mm_setr_epi16(-10, 400, 0, 256, 255, 2, 1, 0);
2933     byte16 AA = cast(byte16) _mm_packus_epi16(A, A);
2934     static immutable ubyte[16] correctResult = [0, 255, 0, 255, 255, 2, 1, 0,
2935                                                 0, 255, 0, 255, 255, 2, 1, 0];
2936     foreach(i; 0..16)
2937         assert(AA.array[i] == cast(byte)(correctResult[i]));
2938 }
2939 
2940 /// Provide a hint to the processor that the code sequence is a spin-wait loop. This can help improve the performance 
2941 /// and power consumption of spin-wait loops.
2942 void _mm_pause() @trusted
2943 {
2944     version(GNU)
2945     {
2946         static if (GDC_with_SSE2)
2947         {
2948             __builtin_ia32_pause();
2949         }
2950         else version(X86)
2951         {
2952             asm pure nothrow @nogc @trusted
2953             {
2954                 "pause;\n" : : : ;
2955             }
2956         }
2957         else
2958             static assert(false);
2959     }
2960     else static if (LDC_with_SSE2)
2961     {
2962         __builtin_ia32_pause();
2963     }
2964     else static if (DMD_with_asm)
2965     {
2966         asm nothrow @nogc pure @safe
2967         {
2968             rep; nop; // F3 90 =  pause
2969         }
2970     }
2971     else version (LDC)
2972     {
        // PERF: Do nothing currently, could be the "yield" instruction on ARM.
2974     }
2975     else
2976         static assert(false);
2977 }
2978 unittest
2979 {
2980     _mm_pause();
2981 }
2982 
2983 /// Compute the absolute differences of packed unsigned 8-bit integers in `a` and `b`, then horizontally sum each 
2984 /// consecutive 8 differences to produce two unsigned 16-bit integers, and pack these unsigned 16-bit integers in the 
2985 /// low 16 bits of 64-bit elements in result.
2986 __m128i _mm_sad_epu8 (__m128i a, __m128i b) pure @trusted
2987 {
2988     static if (GDC_with_SSE2)
2989     {
2990         return cast(__m128i) __builtin_ia32_psadbw128(cast(ubyte16)a, cast(ubyte16)b);
2991     }
2992     else static if (LDC_with_SSE2)
2993     {
2994         return cast(__m128i) __builtin_ia32_psadbw128(cast(byte16)a, cast(byte16)b);
2995     }
2996     else static if (LDC_with_ARM64)
2997     {
2998         ushort8 t = cast(ushort8) vpaddlq_u8(vabdq_u8(cast(byte16) a, cast(byte16) b));
2999 
3000         // PERF: Looks suboptimal vs addp
3001         ushort r0 = cast(ushort)(t[0] + t[1] + t[2] + t[3]);
3002         ushort r4 = cast(ushort)(t[4] + t[5] + t[6] + t[7]);
3003         ushort8 r = 0;
3004         r[0] = r0;
3005         r[4] = r4;
3006         return cast(__m128i) r;
3007     }
3008     else
3009     {
3010         // PERF: ARM32 is lacking
3011         byte16 ab = cast(byte16)a;
3012         byte16 bb = cast(byte16)b;
3013         ubyte[16] t;
3014         foreach(i; 0..16)
3015         {
3016             int diff = cast(ubyte)(ab.array[i]) - cast(ubyte)(bb.array[i]);
3017             if (diff < 0) diff = -diff;
3018             t[i] = cast(ubyte)(diff);
3019         }
3020         int4 r = _mm_setzero_si128();
3021         r.ptr[0] = t[0] + t[1] + t[2] + t[3] + t[4] + t[5] + t[6] + t[7];
3022         r.ptr[2] = t[8] + t[9] + t[10]+ t[11]+ t[12]+ t[13]+ t[14]+ t[15];
3023         return r;
3024     }
3025 }
3026 unittest
3027 {
3028     __m128i A = _mm_setr_epi8(3, 4, 6, 8, 12, 14, 18, 20, 24, 30, 32, 38, 42, 44, 48, 54); // primes + 1
3029     __m128i B = _mm_set1_epi8(1);
3030     __m128i R = _mm_sad_epu8(A, B);
3031     int[4] correct = [2 + 3 + 5 + 7 + 11 + 13 + 17 + 19,
3032                       0,
3033                       23 + 29 + 31 + 37 + 41 + 43 + 47 + 53,
3034                       0];
3035     assert(R.array == correct);
3036 }
3037 
3038 /// Set packed 16-bit integers with the supplied values.
3039 __m128i _mm_set_epi16 (short e7, short e6, short e5, short e4, short e3, short e2, short e1, short e0) pure @trusted
3040 {
3041     short8 r = void;
3042     r.ptr[0] = e0;
3043     r.ptr[1] = e1;
3044     r.ptr[2] = e2;
3045     r.ptr[3] = e3;
3046     r.ptr[4] = e4;
3047     r.ptr[5] = e5;
3048     r.ptr[6] = e6;
3049     r.ptr[7] = e7;
3050     return cast(__m128i) r;
3051 }
3052 unittest
3053 {
3054     __m128i A = _mm_set_epi16(7, 6, 5, 4, 3, 2, 1, 0);
3055     short8 B = cast(short8) A;
3056     foreach(i; 0..8)
3057         assert(B.array[i] == i);
3058 }
3059 
3060 /// Set packed 32-bit integers with the supplied values.
3061 __m128i _mm_set_epi32 (int e3, int e2, int e1, int e0) pure @trusted
3062 {
3063     align(16) int[4] r = [e0, e1, e2, e3];
3064     return *cast(int4*)&r;
3065 }
3066 unittest
3067 {
3068     __m128i A = _mm_set_epi32(3, 2, 1, 0);
3069     foreach(i; 0..4)
3070         assert(A.array[i] == i);
3071 }
3072 
3073 /// Set packed 64-bit integers with the supplied values.
3074 __m128i _mm_set_epi64(__m64 e1, __m64 e0) pure @trusted
3075 {
3076     pragma(inline, true);
3077     long2 r = void;
3078     r.ptr[0] = e0.array[0];
3079     r.ptr[1] = e1.array[0];
3080     return cast(__m128i)(r);
3081 }
3082 unittest
3083 {
3084     __m128i A = _mm_set_epi64(_mm_cvtsi64_m64(1234), _mm_cvtsi64_m64(5678));
3085     long2 B = cast(long2) A;
3086     assert(B.array[0] == 5678);
3087     assert(B.array[1] == 1234);
3088 }
3089 
3090 /// Set packed 64-bit integers with the supplied values.
3091 __m128i _mm_set_epi64x (long e1, long e0) pure @trusted
3092 {
3093     pragma(inline, true);
3094     long2 r = void;
3095     r.ptr[0] = e0;
3096     r.ptr[1] = e1;
3097     return cast(__m128i)(r);
3098 }
3099 unittest
3100 {
3101     __m128i A = _mm_set_epi64x(1234, -5678);
3102     long2 B = cast(long2) A;
3103     assert(B.array[0] == -5678);
3104     assert(B.array[1] == 1234);
3105 }
3106 
3107 /// Set packed 8-bit integers with the supplied values.
3108 __m128i _mm_set_epi8 (byte e15, byte e14, byte e13, byte e12,
3109                       byte e11, byte e10, byte e9, byte e8,
3110                       byte e7, byte e6, byte e5, byte e4,
3111                       byte e3, byte e2, byte e1, byte e0) pure @trusted
3112 {
3113     align(16) byte[16] result = [e0, e1,  e2,  e3,  e4,  e5,  e6, e7,
3114                                  e8, e9, e10, e11, e12, e13, e14, e15];
3115     return *cast(__m128i*)(result.ptr);
3116 }
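unittest
{
    // The last argument e0 lands in element 0.
    byte16 B = cast(byte16) _mm_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
    foreach(i; 0..16)
        assert(B.array[i] == i);
}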
3118 
3119 /// Set packed double-precision (64-bit) floating-point elements with the supplied values.
3120 __m128d _mm_set_pd (double e1, double e0) pure @trusted
3121 {
3122     pragma(inline, true);
3123     double2 r = void;
3124     r.ptr[0] = e0;
3125     r.ptr[1] = e1;
3126     return r;
3127 }
3128 unittest
3129 {
3130     __m128d A = _mm_set_pd(61.0, 55.0);
3131     double[2] correct = [55.0, 61.0];
3132     assert(A.array == correct);
3133 }
3134 
/// Broadcast double-precision (64-bit) floating-point value `a` to all elements.
3136 __m128d _mm_set_pd1 (double a) pure @trusted
3137 {
3138     pragma(inline, true);
3139     __m128d r = void;
3140     r.ptr[0] = a;
3141     r.ptr[1] = a;
3142     return r;
3143 }
3144 unittest
3145 {
3146     __m128d A = _mm_set_pd1(61.0);
3147     double[2] correct = [61.0, 61.0];
3148     assert(A.array == correct);
3149 }
3150 
3151 /// Copy double-precision (64-bit) floating-point element `a` to the lower element of result, 
3152 /// and zero the upper element.
3153 __m128d _mm_set_sd (double a) pure @trusted
3154 {
3155     double2 r = void;
3156     r.ptr[0] = a;
3157     r.ptr[1] = 0.0;
3158     return r;
3159 }
3160 unittest
3161 {
3162     __m128d A = _mm_set_sd(61.0);
3163     double[2] correct = [61.0, 0.0];
3164     assert(A.array == correct);
3165 }
3166 
/// Broadcast 16-bit integer `a` to all elements.
3168 __m128i _mm_set1_epi16 (short a) pure @trusted
3169 {
3170     version(DigitalMars) // workaround https://issues.dlang.org/show_bug.cgi?id=21469 
3171     {
3172         short8 v = a;
3173         return cast(__m128i) v;
3174     }
3175     else
3176     {
3177         pragma(inline, true);
3178         return cast(__m128i)(short8(a));
3179     }
3180 }
3181 unittest
3182 {
3183     short8 a = cast(short8) _mm_set1_epi16(31);
3184     for (int i = 0; i < 8; ++i)
3185         assert(a.array[i] == 31);
3186 }
3187 
3188 /// Broadcast 32-bit integer `a` to all elements.
3189 __m128i _mm_set1_epi32 (int a) pure @trusted
3190 {
3191     pragma(inline, true);
3192     return cast(__m128i)(int4(a));
3193 }
3194 unittest
3195 {
3196     int4 a = cast(int4) _mm_set1_epi32(31);
3197     for (int i = 0; i < 4; ++i)
3198         assert(a.array[i] == 31);
3199 }
3200 
3201 /// Broadcast 64-bit integer `a` to all elements.
3202 __m128i _mm_set1_epi64 (__m64 a) pure @safe
3203 {
3204     return _mm_set_epi64(a, a);
3205 }
3206 unittest
3207 {
3208     long b = 0x1DEADCAFE; 
3209     __m64 a;
3210     a.ptr[0] = b;
3211     long2 c = cast(long2) _mm_set1_epi64(a);
3212     assert(c.array[0] == b);
3213     assert(c.array[1] == b);
3214 }
3215 
/// Broadcast 64-bit integer `a` to all elements.
3217 __m128i _mm_set1_epi64x (long a) pure @trusted
3218 {
3219     long2 b = a; // Must be on its own line to workaround https://issues.dlang.org/show_bug.cgi?id=21470
3220     return cast(__m128i)(b);
3221 }
3222 unittest
3223 {
3224     long b = 0x1DEADCAFE;
3225     long2 c = cast(long2) _mm_set1_epi64x(b);
3226     for (int i = 0; i < 2; ++i)
3227         assert(c.array[i] == b);
3228 }
3229 
3230 /// Broadcast 8-bit integer `a` to all elements.
3231 __m128i _mm_set1_epi8 (byte a) pure @trusted
3232 {
3233     pragma(inline, true);
3234     byte16 b = a; // Must be on its own line to workaround https://issues.dlang.org/show_bug.cgi?id=21470
3235     return cast(__m128i)(b);
3236 }
3237 unittest
3238 {
3239     byte16 b = cast(byte16) _mm_set1_epi8(31);
3240     for (int i = 0; i < 16; ++i)
3241         assert(b.array[i] == 31);
3242 }
3243 
3244 alias _mm_set1_pd = _mm_set_pd1;
3245 
3246 /// Set packed 16-bit integers with the supplied values in reverse order.
3247 __m128i _mm_setr_epi16 (short e7, short e6, short e5, short e4, 
3248                         short e3, short e2, short e1, short e0) pure @trusted
3249 {
3250     short8 r = void;
3251     r.ptr[0] = e7;
3252     r.ptr[1] = e6;
3253     r.ptr[2] = e5;
3254     r.ptr[3] = e4;
3255     r.ptr[4] = e3;
3256     r.ptr[5] = e2;
3257     r.ptr[6] = e1;
3258     r.ptr[7] = e0;
3259     return cast(__m128i)(r);
3260 }
3261 unittest
3262 {
3263     short8 A = cast(short8) _mm_setr_epi16(7, 6, 5, -32768, 32767, 2, 1, 0);
3264     short[8] correct = [7, 6, 5, -32768, 32767, 2, 1, 0];
3265     assert(A.array == correct);
3266 }
3267 
3268 /// Set packed 32-bit integers with the supplied values in reverse order.
3269 __m128i _mm_setr_epi32 (int e3, int e2, int e1, int e0) pure @trusted
3270 {
3271     // Performs better than = void; with GDC
3272     pragma(inline, true);
3273     align(16) int[4] result = [e3, e2, e1, e0];
3274     return *cast(__m128i*)(result.ptr);
3275 }
3276 unittest
3277 {
3278     int4 A = cast(int4) _mm_setr_epi32(-1, 0, -2147483648, 2147483647);
3279     int[4] correct = [-1, 0, -2147483648, 2147483647];
3280     assert(A.array == correct);
3281 }
3282 
3283 /// Set packed 64-bit integers with the supplied values in reverse order.
3284 __m128i _mm_setr_epi64 (long e1, long e0) pure @trusted
3285 {
3286     long2 r = void;
3287     r.ptr[0] = e1;
3288     r.ptr[1] = e0;
3289     return cast(__m128i)(r);
3290 }
3291 unittest
3292 {
3293     long2 A = cast(long2) _mm_setr_epi64(-1, 0);
3294     long[2] correct = [-1, 0];
3295     assert(A.array == correct);
3296 }
3297 
3298 /// Set packed 8-bit integers with the supplied values in reverse order.
3299 __m128i _mm_setr_epi8 (byte e15, byte e14, byte e13, byte e12,
3300                        byte e11, byte e10, byte e9,  byte e8,
3301                        byte e7,  byte e6,  byte e5,  byte e4,
3302                        byte e3,  byte e2,  byte e1,  byte e0) pure @trusted
3303 {
3304     align(16) byte[16] result = [e15, e14, e13, e12, e11, e10, e9, e8,
3305                                  e7,  e6,  e5,  e4,  e3,  e2, e1, e0];
3306     return *cast(__m128i*)(result.ptr);
3307 }
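// The `setr` variants store their arguments in memory order.
unittest
{
    byte16 A = cast(byte16) _mm_setr_epi8(15, 14, 13, 12, 11, 10, 9, 8,
                                          7, 6, 5, 4, -3, 2, 1, 0);
    byte[16] correct = [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, -3, 2, 1, 0];
    assert(A.array == correct);
}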
3309 
3310 /// Set packed double-precision (64-bit) floating-point elements with the supplied values in reverse order.
3311 __m128d _mm_setr_pd (double e1, double e0) pure @trusted
3312 {
3313     pragma(inline, true);
3314     double2 result;
3315     result.ptr[0] = e1;
3316     result.ptr[1] = e0;
3317     return result;
3318 }
3319 unittest
3320 {
3321     __m128d A = _mm_setr_pd(61.0, 55.0);
3322     double[2] correct = [61.0, 55.0];
3323     assert(A.array == correct);
3324 }
3325 
3326 /// Return vector of type `__m128d` with all elements set to zero.
3327 __m128d _mm_setzero_pd() pure @trusted
3328 {
3329     pragma(inline, true);
3330     double2 r = void;
3331     r.ptr[0] = 0.0;
3332     r.ptr[1] = 0.0;
3333     return r;
3334 }
3335 unittest
3336 {
3337     __m128d A = _mm_setzero_pd();
3338     double[2] correct = [0.0, 0.0];
3339     assert(A.array == correct);
3340 }
3341 
3342 /// Return vector of type `__m128i` with all elements set to zero.
3343 __m128i _mm_setzero_si128() pure @trusted
3344 {
3345     pragma(inline, true);
3346     int4 r = void;
3347     r.ptr[0] = 0;
3348     r.ptr[1] = 0;
3349     r.ptr[2] = 0;
3350     r.ptr[3] = 0;
3351     return r;
3352 }
3353 unittest
3354 {
3355     __m128i A = _mm_setzero_si128();
3356     int[4] correct = [0, 0, 0, 0];
3357     assert(A.array == correct);
3358 }
3359 
3360 /// Shuffle 32-bit integers in a using the control in `imm8`.
3361 /// See_also: `_MM_SHUFFLE`.
3362 __m128i _mm_shuffle_epi32(int imm8)(__m128i a) pure @safe
3363 {
3364     static if (GDC_with_SSE2)
3365     {
3366         return __builtin_ia32_pshufd(a, imm8);
3367     }
3368     else
3369     {
3370         return shufflevector!(int4, (imm8 >> 0) & 3,
3371                                     (imm8 >> 2) & 3,
3372                                     (imm8 >> 4) & 3,
3373                                     (imm8 >> 6) & 3)(a, a); // TODO remove this use of shufflevector except for LDC
3374     }
3375 }
3376 unittest
3377 {
3378     __m128i A = _mm_setr_epi32(0, 1, 2, 3);
3379     enum int SHUFFLE = _MM_SHUFFLE(0, 1, 2, 3);
3380     int4 B = cast(int4) _mm_shuffle_epi32!SHUFFLE(A);
3381     int[4] expectedB = [ 3, 2, 1, 0 ];
3382     assert(B.array == expectedB);
3383 }
3384 
3385 /// Shuffle double-precision (64-bit) floating-point elements using the control in `imm8`.
3386 /// See_also: `_MM_SHUFFLE2`.
3387 __m128d _mm_shuffle_pd (int imm8)(__m128d a, __m128d b) pure @safe
3388 {
3389     static if (GDC_with_SSE2)
3390     {
3391         return __builtin_ia32_shufpd(a, b, imm8);
3392     }
3393     else
3394     {
3395         return shufflevector!(double2, 0 + ( imm8 & 1 ),
3396                                        2 + ( (imm8 >> 1) & 1 ))(a, b); // TODO remove this use of shufflevector except for LDC
3397     }
3398 }
3399 unittest
3400 {
3401     __m128d A = _mm_setr_pd(0.5, 2.0);
3402     __m128d B = _mm_setr_pd(4.0, 5.0);
3403     enum int SHUFFLE = _MM_SHUFFLE2(1, 1);
3404     __m128d R = _mm_shuffle_pd!SHUFFLE(A, B);
3405     double[2] correct = [ 2.0, 5.0 ];
3406     assert(R.array == correct);
3407 }
3408 
/// Shuffle 16-bit integers in the high 64 bits of `a` using the control in `imm8`. Store the results in the high 
/// 64 bits of result, with the low 64 bits being copied from `a` to result.
/// See_also: `_MM_SHUFFLE`.
3412 __m128i _mm_shufflehi_epi16(int imm8)(__m128i a) pure @safe
3413 {
3414     static if (GDC_with_SSE2)
3415     {
3416         return cast(__m128i) __builtin_ia32_pshufhw(cast(short8)a, imm8);
3417     }
3418     else
3419     {
3420         return cast(__m128i) shufflevector!(short8, 0, 1, 2, 3,
3421                                           4 + ( (imm8 >> 0) & 3 ),
3422                                           4 + ( (imm8 >> 2) & 3 ),
3423                                           4 + ( (imm8 >> 4) & 3 ),
3424                                           4 + ( (imm8 >> 6) & 3 ))(cast(short8)a, cast(short8)a); // TODO remove this use of shufflevector except for LDC
3425     }
3426 }
3427 unittest
3428 {
3429     __m128i A = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
3430     enum int SHUFFLE = _MM_SHUFFLE(0, 1, 2, 3);
3431     short8 C = cast(short8) _mm_shufflehi_epi16!SHUFFLE(A);
3432     short[8] expectedC = [ 0, 1, 2, 3, 7, 6, 5, 4 ];
3433     assert(C.array == expectedC);
3434 }
3435 
/// Shuffle 16-bit integers in the low 64 bits of `a` using the control in `imm8`. Store the results in the low 64 
/// bits of result, with the high 64 bits being copied from `a` to result.
3438 /// See_also: `_MM_SHUFFLE`.
3439 __m128i _mm_shufflelo_epi16(int imm8)(__m128i a) pure @safe
3440 {
3441     static if (GDC_with_SSE2)
3442     {
3443         return cast(__m128i) __builtin_ia32_pshuflw(cast(short8)a, imm8);
3444     }
3445     else
3446     {
3447         // TODO remove this use of shufflevector except for LDC
3448         return cast(__m128i) shufflevector!(short8, ( (imm8 >> 0) & 3 ),
3449                                                     ( (imm8 >> 2) & 3 ),
3450                                                     ( (imm8 >> 4) & 3 ),
3451                                                     ( (imm8 >> 6) & 3 ), 4, 5, 6, 7)(cast(short8)a, cast(short8)a);
3452     }
3453 }
3454 unittest
3455 {
3456     __m128i A = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
3457     enum int SHUFFLE = _MM_SHUFFLE(0, 1, 2, 3);
3458     short8 B = cast(short8) _mm_shufflelo_epi16!SHUFFLE(A);
3459     short[8] expectedB = [ 3, 2, 1, 0, 4, 5, 6, 7 ];
3460     assert(B.array == expectedB);
3461 }
3462 
3463 /// Shift packed 32-bit integers in `a` left by `count` while shifting in zeros.
3464 deprecated("Use _mm_slli_epi32 instead.") __m128i _mm_sll_epi32 (__m128i a, __m128i count) pure @trusted
3465 {
3466     static if (LDC_with_SSE2)
3467     {
3468         return __builtin_ia32_pslld128(a, count);
3469     }
3470     else static if (GDC_with_SSE2)
3471     {
3472         return __builtin_ia32_pslld128(a, count);
3473     }
3474     else static if (DMD_with_32bit_asm)
3475     {
3476         asm pure nothrow @nogc @trusted
3477         {
3478             movdqu XMM0, a;
3479             movdqu XMM1, count;
3480             pslld XMM0, XMM1;
3481             movdqu a, XMM0;
3482         }
3483         return a;
3484     }
3485     else
3486     {
3487         int4 r = void;
3488         long2 lc = cast(long2)count;
3489         int bits = cast(int)(lc.array[0]);
3490         foreach(i; 0..4)
            r.ptr[i] = cast(uint)(a.array[i]) << bits;
3492         return r;
3493     }
3494 }
3495 
3496 /// Shift packed 64-bit integers in `a` left by `count` while shifting in zeros.
3497 deprecated("Use _mm_slli_epi64 instead.") __m128i _mm_sll_epi64 (__m128i a, __m128i count) pure @trusted
3498 {
3499     static if (LDC_with_SSE2)
3500     {
3501         return cast(__m128i) __builtin_ia32_psllq128(cast(long2)a, cast(long2)count);
3502     }
3503     else static if (GDC_with_SSE2)
3504     {
3505         return cast(__m128i) __builtin_ia32_psllq128(cast(long2)a, cast(long2)count);
3506     }
3507     else static if (DMD_with_32bit_asm)
3508     {
3509         asm pure nothrow @nogc @trusted
3510         {
3511             movdqu XMM0, a;
3512             movdqu XMM1, count;
3513             psllq XMM0, XMM1;
3514             movdqu a, XMM0;
3515         }
3516         return a;
3517     }
3518     else
3519     {
3520         // ARM: good since LDC 1.12 -O2
        // but the -O0 version is catastrophic
3522         long2 r = void;
3523         long2 sa = cast(long2)a;
3524         long2 lc = cast(long2)count;
3525         int bits = cast(int)(lc.array[0]);
3526         foreach(i; 0..2)
3527             r.array[i] = cast(ulong)(sa.array[i]) << bits;
3528         return cast(__m128i)r;
3529     }
3530 }
3531 
3532 /// Shift packed 16-bit integers in `a` left by `count` while shifting in zeros.
3533 deprecated("Use _mm_slli_epi16 instead.") __m128i _mm_sll_epi16 (__m128i a, __m128i count) pure @trusted
3534 {
3535     static if (LDC_with_SSE2)
3536     {
        return cast(__m128i) __builtin_ia32_psllw128(cast(short8)a, cast(short8)count);
3538     }
3539     else static if (GDC_with_SSE2)
3540     {
        return cast(__m128i) __builtin_ia32_psllw128(cast(short8)a, cast(short8)count);
3542     }
3543     else static if (DMD_with_32bit_asm)
3544     {
        asm pure nothrow @nogc @trusted
3546         {
3547             movdqu XMM0, a;
3548             movdqu XMM1, count;
3549             psllw XMM0, XMM1;
3550             movdqu a, XMM0;
3551         }
3552         return a;
3553     }
3554     else
3555     {
3556         short8 sa = cast(short8)a;
3557         long2 lc = cast(long2)count;
3558         int bits = cast(int)(lc.array[0]);
3559         short8 r = void;
3560         foreach(i; 0..8)
3561             r.ptr[i] = cast(short)(cast(ushort)(sa.array[i]) << bits);
3562         return cast(int4)r;
3563     }
3564 }
3565 
3566 
3567 /// Shift packed 32-bit integers in `a` left by `imm8` while shifting in zeros.
3568 __m128i _mm_slli_epi32 (__m128i a, int imm8) pure @trusted
3569 {
3570     static if (GDC_with_SSE2)
3571     {
3572         return __builtin_ia32_pslldi128(a, cast(ubyte)imm8);
3573     }
3574     else static if (LDC_with_SSE2)
3575     {
3576         return __builtin_ia32_pslldi128(a, cast(ubyte)imm8);
3577     }
3578     else
3579     {
3580         // Note: the intrinsics guarantee imm8[0..7] is taken, however
3581         //       D says "It's illegal to shift by the same or more bits 
3582         //       than the size of the quantity being shifted"
3583         //       and it's UB instead.
3584         int4 r = _mm_setzero_si128();
3585 
3586         ubyte count = cast(ubyte) imm8;
3587         if (count > 31)
3588             return r;
3589         
3590         foreach(i; 0..4)
3591             r.array[i] = cast(uint)(a.array[i]) << count;
3592         return r;
3593     }
3594 }
3595 unittest
3596 {
3597     __m128i A = _mm_setr_epi32(0, 2, 3, -4);
3598     __m128i B = _mm_slli_epi32(A, 1);
3599     __m128i B2 = _mm_slli_epi32(A, 1 + 256);
3600     int[4] expectedB = [ 0, 4, 6, -8];
3601     assert(B.array == expectedB);
3602     assert(B2.array == expectedB);
3603 
3604     __m128i C = _mm_slli_epi32(A, 0);
3605     int[4] expectedC = [ 0, 2, 3, -4];
3606     assert(C.array == expectedC);
3607 
3608     __m128i D = _mm_slli_epi32(A, 65);
3609     int[4] expectedD = [ 0, 0, 0, 0];
3610     assert(D.array == expectedD);
3611 }
3612 
3613 /// Shift packed 64-bit integers in `a` left by `imm8` while shifting in zeros.
3614 __m128i _mm_slli_epi64 (__m128i a, int imm8) pure @trusted
3615 {
3616     static if (GDC_with_SSE2)
3617     {
3618         return cast(__m128i) __builtin_ia32_psllqi128(cast(long2)a, cast(ubyte)imm8);
3619     }
3620     else static if (LDC_with_SSE2)
3621     {
3622         return cast(__m128i) __builtin_ia32_psllqi128(cast(long2)a, cast(ubyte)imm8);
3623     }
3624     else
3625     {
3626         long2 sa = cast(long2)a;
3627 
3628         // Note: the intrinsics guarantee imm8[0..7] is taken, however
3629         //       D says "It's illegal to shift by the same or more bits 
3630         //       than the size of the quantity being shifted"
3631         //       and it's UB instead.
3632         long2 r = cast(long2) _mm_setzero_si128();
3633         ubyte count = cast(ubyte) imm8;
3634         if (count > 63)
3635             return cast(__m128i)r;
3636 
3637         r.ptr[0] = cast(ulong)(sa.array[0]) << count;
3638         r.ptr[1] = cast(ulong)(sa.array[1]) << count;
3639         return cast(__m128i)r;
3640     }
3641 }
3642 unittest
3643 {
3644     __m128i A = _mm_setr_epi64(8, -4);
3645     long2 B = cast(long2) _mm_slli_epi64(A, 1);
3646     long2 B2 = cast(long2) _mm_slli_epi64(A, 1 + 1024);
3647     long[2] expectedB = [ 16, -8];
3648     assert(B.array == expectedB);
3649     assert(B2.array == expectedB);
3650 
3651     long2 C = cast(long2) _mm_slli_epi64(A, 0);
3652     long[2] expectedC = [ 8, -4];
3653     assert(C.array == expectedC);
3654 
3655     long2 D = cast(long2) _mm_slli_epi64(A, 64);
    long[2] expectedD = [ 0, 0 ];
3657     assert(D.array == expectedD);
3658 }
3659 
3660 /// Shift packed 16-bit integers in `a` left by `imm8` while shifting in zeros.
3661 __m128i _mm_slli_epi16(__m128i a, int imm8) pure @trusted
3662 {
3663     static if (GDC_with_SSE2)
3664     {
3665         return cast(__m128i) __builtin_ia32_psllwi128(cast(short8)a, cast(ubyte)imm8);
3666     }
3667     else static if (LDC_with_SSE2)
3668     {
3669         return cast(__m128i) __builtin_ia32_psllwi128(cast(short8)a, cast(ubyte)imm8);
3670     }
3671     else static if (LDC_with_ARM64)
3672     {
3673         short8 sa = cast(short8)a;
3674         short8 r = cast(short8)_mm_setzero_si128();
3675         ubyte count = cast(ubyte) imm8;
3676         if (count > 15)
3677             return cast(__m128i)r;
3678         r = sa << short8(count);
3679         return cast(__m128i)r;
3680     }
3681     else
3682     {
3683         short8 sa = cast(short8)a;
3684         short8 r = cast(short8)_mm_setzero_si128();
3685         ubyte count = cast(ubyte) imm8;
3686         if (count > 15)
3687             return cast(__m128i)r;
3688         foreach(i; 0..8)
3689             r.ptr[i] = cast(short)(sa.array[i] << count);
3690         return cast(__m128i)r;
3691     }
3692 }
3693 unittest
3694 {
3695     __m128i A = _mm_setr_epi16(0, 1, 2, 3, -4, -5, 6, 7);
3696     short8 B = cast(short8)( _mm_slli_epi16(A, 1) );
3697     short8 B2 = cast(short8)( _mm_slli_epi16(A, 1 + 256) );
3698     short[8] expectedB = [ 0, 2, 4, 6, -8, -10, 12, 14 ];
3699     assert(B.array == expectedB);
3700     assert(B2.array == expectedB);
3701 
3702     short8 C = cast(short8)( _mm_slli_epi16(A, 16) );
3703     short[8] expectedC = [ 0, 0, 0, 0, 0, 0, 0, 0 ];
3704     assert(C.array == expectedC);
3705 }
3706 
3707 
/// Shift `op` left by `bytes` bytes while shifting in zeros.
3709 __m128i _mm_slli_si128(ubyte bytes)(__m128i op) pure @trusted
3710 {
3711     static if (bytes & 0xF0)
3712     {
3713         return _mm_setzero_si128();
3714     }
3715     else
3716     {
3717         static if (GDC_with_SSE2)
3718         {
3719             return cast(__m128i) __builtin_ia32_pslldqi128(cast(long2)op, cast(ubyte)(bytes * 8)); 
3720         }
3721         else version(DigitalMars)
3722         {
3723             version(D_InlineAsm_X86)
3724             {
3725                 asm pure nothrow @nogc @trusted // somehow doesn't work for x86_64
3726                 {
3727                     movdqu XMM0, op;
3728                     pslldq XMM0, bytes;
3729                     movdqu op, XMM0;
3730                 }
3731                 return op;
3732             }
3733             else
3734             {
3735                 byte16 A = cast(byte16)op;
3736                 byte16 R;
3737                 for (int n = 15; n >= bytes; --n)
3738                     R.ptr[n] = A.array[n-bytes];
3739                 for (int n = bytes-1; n >= 0; --n)
3740                     R.ptr[n] = 0;
3741                 return cast(__m128i)R;
3742             }
3743         }
3744         else
3745         {
3746             // TODO remove this use of shufflevector except for LDC
3747             return cast(__m128i) shufflevector!(byte16,
3748             16 - bytes, 17 - bytes, 18 - bytes, 19 - bytes, 20 - bytes, 21 - bytes,
3749             22 - bytes, 23 - bytes, 24 - bytes, 25 - bytes, 26 - bytes, 27 - bytes,
3750             28 - bytes, 29 - bytes, 30 - bytes, 31 - bytes)
3751             (cast(byte16)_mm_setzero_si128(), cast(byte16)op);
3752         }
3753     }
3754 }
3755 unittest
3756 {
3757     __m128i A = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
3758     short8 R = cast(short8) _mm_slli_si128!8(A); // shift 8 bytes to the left
3759     short[8] correct = [ 0, 0, 0, 0, 0, 1, 2, 3 ];
3760     assert(R.array == correct);
3761 
    __m128i B = _mm_slli_si128!16(_mm_set1_epi32(-1));
3763     int[4] expectedB = [0, 0, 0, 0];
3764     assert(B.array == expectedB);
3765 }
3766 
3767 /// Compute the square root of packed double-precision (64-bit) floating-point elements in `vec`.
3768 __m128d _mm_sqrt_pd(__m128d vec) pure @trusted
3769 {
3770     version(LDC)
3771     {
3772         // Disappeared with LDC 1.11
3773         static if (__VERSION__ < 2081)
3774             return __builtin_ia32_sqrtpd(vec);
3775         else
3776         {
3777             vec.array[0] = llvm_sqrt(vec.array[0]);
3778             vec.array[1] = llvm_sqrt(vec.array[1]);
3779             return vec;
3780         }
3781     }
3782     else static if (GDC_with_SSE2)    
3783     {
3784         return __builtin_ia32_sqrtpd(vec);
3785     }
3786     else
3787     {
3788         vec.ptr[0] = sqrt(vec.array[0]);
3789         vec.ptr[1] = sqrt(vec.array[1]);
3790         return vec;
3791     }
3792 }
3793 
3794 /// Compute the square root of the lower double-precision (64-bit) floating-point element in `b`, store the result in 
3795 /// the lower element of result, and copy the upper element from `a` to the upper element of result.
3796 __m128d _mm_sqrt_sd(__m128d a, __m128d b) pure @trusted
3797 {
3798     // Note: the builtin has one argument, since the legacy `sqrtsd` SSE2 instruction operates on the same register only.
3799     //       "128-bit Legacy SSE version: The first source operand and the destination operand are the same. 
3800     //        The quadword at bits 127:64 of the destination operand remains unchanged."
3801     version(LDC)
3802     {
3803         // Disappeared with LDC 1.11
3804         static if (__VERSION__ < 2081)
3805         {
3806             __m128d c = __builtin_ia32_sqrtsd(b);
3807             a[0] = c[0];
3808             return a;
3809         }
3810         else
3811         {
3812             a.array[0] = llvm_sqrt(b.array[0]);
3813             return a;
3814         }
3815     }
3816     else static if (GDC_with_SSE2)
3817     {
3818         __m128d c = __builtin_ia32_sqrtsd(b);
3819         a.ptr[0] = c.array[0];
3820         return a;
3821     }
3822     else
3823     {
3824         a.ptr[0] = sqrt(b.array[0]);
3825         return a;
3826     }
3827 }
3828 unittest
3829 {
3830     __m128d A = _mm_setr_pd(1.0, 3.0);
3831     __m128d B = _mm_setr_pd(4.0, 5.0);
3832     __m128d R = _mm_sqrt_sd(A, B);
3833     double[2] correct = [2.0, 3.0 ];
3834     assert(R.array == correct);
3835 }
3836 
3837 /// Shift packed 16-bit integers in `a` right by `count` while shifting in sign bits.
3838 deprecated("Use _mm_srai_epi16 instead.") __m128i _mm_sra_epi16 (__m128i a, __m128i count) pure @trusted
3839 {
3840     static if (GDC_with_SSE2)
3841     {
3842         return cast(__m128i) __builtin_ia32_psraw128(cast(short8)a, cast(short8)count);
3843     }
3844     else static if (LDC_with_SSE2)
3845     {
3846         return cast(__m128i) __builtin_ia32_psraw128(cast(short8)a, cast(short8)count);
3847     }
3848     else
3849     {
3850         short8 sa = cast(short8)a;
3851         long2 lc = cast(long2)count;
3852         int bits = cast(int)(lc.array[0]);
3853         short8 r = void;
3854         foreach(i; 0..8)
3855             r.ptr[i] = cast(short)(sa.array[i] >> bits);
3856         return cast(int4)r;
3857     }
3858 }
3859 
3860 /// Shift packed 32-bit integers in `a` right by `count` while shifting in sign bits.
3861 deprecated("Use _mm_srai_epi32 instead.") __m128i _mm_sra_epi32 (__m128i a, __m128i count) pure @trusted
3862 {
3863     static if (LDC_with_SSE2)
3864     {
3865         return __builtin_ia32_psrad128(a, count);
3866     }
3867     else static if (GDC_with_SSE2)
3868     {
3869         return __builtin_ia32_psrad128(a, count);
3870     }
3871     else
3872     {    
3873         int4 r = void;
3874         long2 lc = cast(long2)count;
3875         int bits = cast(int)(lc.array[0]);
3876         r.ptr[0] = (a.array[0] >> bits);
3877         r.ptr[1] = (a.array[1] >> bits);
3878         r.ptr[2] = (a.array[2] >> bits);
3879         r.ptr[3] = (a.array[3] >> bits);
3880         return r;
3881     }
3882 }
3883 
3884 
3885 /// Shift packed 16-bit integers in `a` right by `imm8` while shifting in sign bits.
3886 __m128i _mm_srai_epi16 (__m128i a, int imm8) pure @trusted
3887 {
3888     static if (GDC_with_SSE2)
3889     {
3890         return cast(__m128i) __builtin_ia32_psrawi128(cast(short8)a, cast(ubyte)imm8);
3891     }
3892     else static if (LDC_with_SSE2)
3893     {
3894         return cast(__m128i) __builtin_ia32_psrawi128(cast(short8)a, cast(ubyte)imm8);
3895     }
3896     else static if (LDC_with_ARM64)
3897     {
3898         short8 sa = cast(short8)a;
3899         ubyte count = cast(ubyte)imm8;
3900         if (count > 15) 
3901             count = 15;
3902         short8 r = sa >> short8(count);
3903         return cast(__m128i)r;
3904     }
3905     else
3906     {
3907         short8 sa = cast(short8)a;
3908         short8 r = void;
3909 
3910         // Note: the intrinsics guarantee imm8[0..7] is taken, however
3911         //       D says "It's illegal to shift by the same or more bits 
3912         //       than the size of the quantity being shifted"
3913         //       and it's UB instead.
3914         ubyte count = cast(ubyte)imm8;
3915         if (count > 15) 
3916             count = 15;
3917         foreach(i; 0..8)
3918             r.ptr[i] = cast(short)(sa.array[i] >> count);
3919         return cast(int4)r;
3920     }
3921 }
3922 unittest
3923 {
3924     __m128i A = _mm_setr_epi16(0, 1, 2, 3, -4, -5, 6, 7);
3925     short8 B = cast(short8)( _mm_srai_epi16(A, 1) );
3926     short8 B2 = cast(short8)( _mm_srai_epi16(A, 1 + 256) );
3927     short[8] expectedB = [ 0, 0, 1, 1, -2, -3, 3, 3 ];
3928     assert(B.array == expectedB);
3929     assert(B2.array == expectedB);
3930 
3931     short8 C = cast(short8)( _mm_srai_epi16(A, 18) );
3932     short[8] expectedC = [ 0, 0, 0, 0, -1, -1, 0, 0 ];
3933     assert(C.array == expectedC);
3934 }
3935 
3936 /// Shift packed 32-bit integers in `a` right by `imm8` while shifting in sign bits.
3937 __m128i _mm_srai_epi32 (__m128i a, int imm8) pure @trusted
3938 {
3939     static if (LDC_with_SSE2)
3940     {
3941         return __builtin_ia32_psradi128(a, cast(ubyte)imm8);
3942     }
3943     else static if (GDC_with_SSE2)
3944     {
3945         return __builtin_ia32_psradi128(a, cast(ubyte)imm8);
3946     }
3947     else
3948     {
3949         int4 r = void;
3950 
3951         // Note: the intrinsics guarantee imm8[0..7] is taken, however
3952         //       D says "It's illegal to shift by the same or more bits 
3953         //       than the size of the quantity being shifted"
3954         //       and it's UB instead.
3955         ubyte count = cast(ubyte) imm8;
3956         if (count > 31)
3957             count = 31;
3958 
3959         r.ptr[0] = (a.array[0] >> count);
3960         r.ptr[1] = (a.array[1] >> count);
3961         r.ptr[2] = (a.array[2] >> count);
3962         r.ptr[3] = (a.array[3] >> count);
3963         return r;
3964     }
3965 }
3966 unittest
3967 {
3968     __m128i A = _mm_setr_epi32(0, 2, 3, -4);
3969     __m128i B = _mm_srai_epi32(A, 1);
3970     __m128i B2 = _mm_srai_epi32(A, 1 + 256);
3971     int[4] expectedB = [ 0, 1, 1, -2];
3972     assert(B.array == expectedB);
3973     assert(B2.array == expectedB);
3974 
3975     __m128i C = _mm_srai_epi32(A, 32);
3976     int[4] expectedC = [ 0, 0, 0, -1];
3977     assert(C.array == expectedC);
3978 
3979     __m128i D = _mm_srai_epi32(A, 0);
3980     int[4] expectedD = [ 0, 2, 3, -4];
3981     assert(D.array == expectedD);
3982 }
3983 
3984 deprecated("Use _mm_srli_epi16 instead.") __m128i _mm_srl_epi16 (__m128i a, __m128i count) pure @trusted
3985 {
3986     static if (LDC_with_SSE2)
3987     {
3988         return cast(__m128i) __builtin_ia32_psrlw128(cast(short8)a, cast(short8)count);
3989     }
3990     else static if (GDC_with_SSE2)
3991     {
3992         return cast(__m128i) __builtin_ia32_psrlw128(cast(short8)a, cast(short8)count);
3993     }
3994     else
3995     {
3996         short8 sa = cast(short8)a;
3997         long2 lc = cast(long2)count;
3998         int bits = cast(int)(lc.array[0]);
3999         short8 r = void;
4000         foreach(i; 0..8)
4001             r.ptr[i] = cast(short)(cast(ushort)(sa.array[i]) >> bits);
4002         return cast(int4)r;
4003     }
4004 }
4005 
4006 deprecated("Use _mm_srli_epi32 instead.") __m128i _mm_srl_epi32 (__m128i a, __m128i count) pure @trusted
4007 {
4008     static if (LDC_with_SSE2)
4009     {
4010         return __builtin_ia32_psrld128(a, count);
4011     }
4012     else static if (GDC_with_SSE2)
4013     {
4014         return __builtin_ia32_psrld128(a, count);
4015     }
4016     else
4017     {
4018         int4 r = void;
4019         long2 lc = cast(long2)count;
4020         int bits = cast(int)(lc.array[0]);
4021         r.ptr[0] = cast(uint)(a.array[0]) >> bits;
4022         r.ptr[1] = cast(uint)(a.array[1]) >> bits;
4023         r.ptr[2] = cast(uint)(a.array[2]) >> bits;
4024         r.ptr[3] = cast(uint)(a.array[3]) >> bits;
4025         return r;
4026     }
4027 }
4028 
4029 deprecated("Use _mm_srli_epi64 instead.") __m128i _mm_srl_epi64 (__m128i a, __m128i count) pure @trusted
4030 {
4031     static if (LDC_with_SSE2)
4032     {
4033         return cast(__m128i) __builtin_ia32_psrlq128(cast(long2)a, cast(long2)count);
4034     }
4035     else static if (GDC_with_SSE2)
4036     {
4037         return cast(__m128i) __builtin_ia32_psrlq128(cast(long2)a, cast(long2)count);
4038     }
4039     else
4040     {
4041         // Workaround for https://issues.dlang.org/show_bug.cgi?id=23047
4042         // => avoid void initialization.
4043         long2 r;
4044         long2 sa = cast(long2)a;
4045         long2 lc = cast(long2)count;
4046         int bits = cast(int)(lc.array[0]);
4047         r.ptr[0] = cast(ulong)(sa.array[0]) >> bits;
4048         r.ptr[1] = cast(ulong)(sa.array[1]) >> bits;
4049         return cast(__m128i)r;
4050     }
4051 }
4052 
4053 /// Shift packed 16-bit integers in `a` right by `imm8` while shifting in zeros.
4054 __m128i _mm_srli_epi16 (__m128i a, int imm8) pure @trusted
4055 {
4056     static if (GDC_with_SSE2)
4057     {
4058         return cast(__m128i) __builtin_ia32_psrlwi128(cast(short8)a, cast(ubyte)imm8);
4059     }
4060     else static if (LDC_with_SSE2)
4061     {
4062         return cast(__m128i) __builtin_ia32_psrlwi128(cast(short8)a, cast(ubyte)imm8);
4063     }
4064     else static if (LDC_with_ARM64)
4065     {
4066         short8 sa = cast(short8)a;
4067         short8 r = cast(short8) _mm_setzero_si128();
4068 
4069         ubyte count = cast(ubyte)imm8;
4070         if (count >= 16)
4071             return cast(__m128i)r;
4072 
4073         r = sa >>> short8(count); // This facility offered with LDC, but not DMD.
4074         return cast(__m128i)r;
4075     }
4076     else
4077     {
4078         short8 sa = cast(short8)a;
4079         ubyte count = cast(ubyte)imm8;
4080 
4081         short8 r = cast(short8) _mm_setzero_si128();
4082         if (count >= 16)
4083             return cast(__m128i)r;
4084 
4085         foreach(i; 0..8)
4086             r.array[i] = cast(short)(cast(ushort)(sa.array[i]) >> count);
4087         return cast(__m128i)r;
4088     }
4089 }
4090 unittest
4091 {
4092     __m128i A = _mm_setr_epi16(0, 1, 2, 3, -4, -5, 6, 7);
4093     short8 B = cast(short8)( _mm_srli_epi16(A, 1) );
4094     short8 B2 = cast(short8)( _mm_srli_epi16(A, 1 + 256) );
4095     short[8] expectedB = [ 0, 0, 1, 1, 0x7FFE, 0x7FFD, 3, 3 ];
4096     assert(B.array == expectedB);
4097     assert(B2.array == expectedB);
4098 
4099     short8 C = cast(short8)( _mm_srli_epi16(A, 16) );
4100     short[8] expectedC = [ 0, 0, 0, 0, 0, 0, 0, 0];
4101     assert(C.array == expectedC);
4102 
4103     short8 D = cast(short8)( _mm_srli_epi16(A, 0) );
4104     short[8] expectedD = [ 0, 1, 2, 3, -4, -5, 6, 7 ];
4105     assert(D.array == expectedD);
4106 }
4107 
4108 
4109 /// Shift packed 32-bit integers in `a` right by `imm8` while shifting in zeros.
4110 __m128i _mm_srli_epi32 (__m128i a, int imm8) pure @trusted
4111 {
4112     static if (GDC_with_SSE2)
4113     {
4114         return __builtin_ia32_psrldi128(a, cast(ubyte)imm8);
4115     }
4116     else static if (LDC_with_SSE2)
4117     {
4118         return __builtin_ia32_psrldi128(a, cast(ubyte)imm8);
4119     }
4120     else
4121     {
4122         ubyte count = cast(ubyte) imm8;
4123 
4124         // Note: the intrinsics guarantee imm8[0..7] is taken, however
4125         //       D says "It's illegal to shift by the same or more bits 
4126         //       than the size of the quantity being shifted"
4127         //       and it's UB instead.
4128         int4 r = _mm_setzero_si128();
4129         if (count >= 32)
4130             return r;
4131         r.ptr[0] = a.array[0] >>> count;
4132         r.ptr[1] = a.array[1] >>> count;
4133         r.ptr[2] = a.array[2] >>> count;
4134         r.ptr[3] = a.array[3] >>> count;
4135         return r;
4136     }
4137 }
4138 unittest
4139 {
4140     __m128i A = _mm_setr_epi32(0, 2, 3, -4);
4141     __m128i B = _mm_srli_epi32(A, 1);
4142     __m128i B2 = _mm_srli_epi32(A, 1 + 256);
4143     int[4] expectedB = [ 0, 1, 1, 0x7FFFFFFE];
4144     assert(B.array == expectedB);
4145     assert(B2.array == expectedB);
4146  
4147     __m128i C = _mm_srli_epi32(A, 255);
4148     int[4] expectedC = [ 0, 0, 0, 0 ];
4149     assert(C.array == expectedC);
4150 }
4151 
4152 /// Shift packed 64-bit integers in `a` right by `imm8` while shifting in zeros.
4153 __m128i _mm_srli_epi64 (__m128i a, int imm8) pure @trusted
4154 {
4155     static if (GDC_with_SSE2)
4156     {
4157         return cast(__m128i) __builtin_ia32_psrlqi128(cast(long2)a, cast(ubyte)imm8);
4158     }
4159     else static if (LDC_with_SSE2)
4160     {
4161         return cast(__m128i) __builtin_ia32_psrlqi128(cast(long2)a, cast(ubyte)imm8);
4162     }
4163     else
4164     {
4165         long2 r = cast(long2) _mm_setzero_si128();
4166         long2 sa = cast(long2)a;
4167 
4168         ubyte count = cast(ubyte) imm8;
4169         if (count >= 64)
4170             return cast(__m128i)r;
4171 
4172         r.ptr[0] = sa.array[0] >>> count;
4173         r.ptr[1] = sa.array[1] >>> count;
4174         return cast(__m128i)r;
4175     }
4176 }
4177 unittest
4178 {
4179     __m128i A = _mm_setr_epi64(8, -4);
4180     long2 B = cast(long2) _mm_srli_epi64(A, 1);
4181     long2 B2 = cast(long2) _mm_srli_epi64(A, 1 + 512);
4182     long[2] expectedB = [ 4, 0x7FFFFFFFFFFFFFFE];
4183     assert(B.array == expectedB);
4184     assert(B2.array == expectedB);
4185 
4186     long2 C = cast(long2) _mm_srli_epi64(A, 64);
4187     long[2] expectedC = [ 0, 0 ];
4188     assert(C.array == expectedC);
4189 }
4190 
4191 /// Shift `v` right by `bytes` bytes while shifting in zeros.
4192 __m128i _mm_srli_si128(ubyte bytes)(__m128i v) pure @safe
4193 {
4194     static if (bytes & 0xF0)
4195     {
4196         return _mm_setzero_si128();
4197     }
4198     else static if (GDC_with_SSE2)
4199     {
4200         return cast(__m128i) __builtin_ia32_psrldqi128(cast(long2)v, cast(ubyte)(bytes * 8));
4201     }
4202     else static if (DMD_with_32bit_asm)
4203     {
4204         asm pure nothrow @nogc @trusted
4205         {
4206             movdqu XMM0, v;
4207             psrldq XMM0, bytes;
4208             movdqu v, XMM0;
4209         }
4210         return v;
4211     }
4212     else
4213     {
4214         // TODO remove this use of shufflevector except for LDC
4215         return cast(__m128i) shufflevector!(byte16,
4216                                             bytes+0, bytes+1, bytes+2, bytes+3, bytes+4, bytes+5, bytes+6, bytes+7,
4217                                             bytes+8, bytes+9, bytes+10, bytes+11, bytes+12, bytes+13, bytes+14, bytes+15)
4218                                            (cast(byte16) v, cast(byte16)_mm_setzero_si128());
4219     }
4220 }
4221 unittest
4222 {
4223     __m128i R = _mm_srli_si128!4(_mm_set_epi32(4, 3, 2, 1));
4224     int[4] correct = [2, 3, 4, 0];
4225     assert(R.array == correct);
4226 
4227     __m128i A = _mm_srli_si128!16(_mm_set1_epi32(-1));
4228     int[4] expectedA = [0, 0, 0, 0];
4229     assert(A.array == expectedA);
4230 }
4231 
4232 /// Shift `v` right by `bytes` bytes while shifting in zeros.
4233 /// #BONUS
4234 __m128 _mm_srli_ps(ubyte bytes)(__m128 v) pure @safe
4235 {
4236     return cast(__m128)_mm_srli_si128!bytes(cast(__m128i)v);
4237 }
4238 unittest
4239 {
4240     __m128 R = _mm_srli_ps!8(_mm_set_ps(4.0f, 3.0f, 2.0f, 1.0f));
4241     float[4] correct = [3.0f, 4.0f, 0, 0];
4242     assert(R.array == correct);
4243 }
4244 
4245 /// Shift `v` right by `bytes` bytes while shifting in zeros.
4246 /// #BONUS
4247 __m128d _mm_srli_pd(ubyte bytes)(__m128d v) pure @safe
4248 {
4249     return cast(__m128d) _mm_srli_si128!bytes(cast(__m128i)v);
4250 }
4251 
4252 /// Store 128-bits (composed of 2 packed double-precision (64-bit) floating-point elements) from `a` into memory. 
4253 /// `mem_addr` must be aligned on a 16-byte boundary or a general-protection exception may be generated.
4254 void _mm_store_pd (double* mem_addr, __m128d a) pure @trusted
4255 {
4256     pragma(inline, true);
4257     __m128d* aligned = cast(__m128d*)mem_addr;
4258     *aligned = a;
4259 }
4260 
4261 /// Store the lower double-precision (64-bit) floating-point element from `a` into 2 contiguous elements in memory. 
4262 /// `mem_addr` must be aligned on a 16-byte boundary or a general-protection exception may be generated.
4263 void _mm_store_pd1 (double* mem_addr, __m128d a) pure @trusted
4264 {
4265     __m128d* aligned = cast(__m128d*)mem_addr;
4266     __m128d r; // PERF =void;
4267     r.ptr[0] = a.array[0];
4268     r.ptr[1] = a.array[0];
4269     *aligned = r;
4270 }
4271 
4272 /// Store the lower double-precision (64-bit) floating-point element from `a` into memory. `mem_addr` does not need to 
4273 /// be aligned on any particular boundary.
4274 void _mm_store_sd (double* mem_addr, __m128d a) pure @safe
4275 {
4276     pragma(inline, true);
4277     *mem_addr = a.array[0];
4278 }
4279 
4280 /// Store 128-bits of integer data from `a` into memory. `mem_addr` must be aligned on a 16-byte boundary or a 
4281 /// general-protection exception may be generated.
4282 void _mm_store_si128 (__m128i* mem_addr, __m128i a) pure @safe
4283 {
4284     pragma(inline, true);
4285     *mem_addr = a;
4286 }
4287 
4288 alias _mm_store1_pd = _mm_store_pd1; ///
4289 
4290 /// Store the upper double-precision (64-bit) floating-point element from `a` into memory.
4291 void _mm_storeh_pd (double* mem_addr, __m128d a) pure @safe
4292 {
4293     pragma(inline, true);
4294     *mem_addr = a.array[1];
4295 }
4296 
// Note: `mem_addr` doesn't have to actually be aligned, which breaks
// expectations from the user point of view. This problem also exists in C++.
/// Store 64-bit integer from the first element of `a` into memory.
4299 void _mm_storel_epi64 (__m128i* mem_addr, __m128i a) pure @safe
4300 {
4301     pragma(inline, true);
4302     long* dest = cast(long*)mem_addr;
4303     long2 la = cast(long2)a;
4304     *dest = la.array[0];
4305 }
4306 unittest
4307 {
4308     long[3] A = [1, 2, 3];
4309     _mm_storel_epi64(cast(__m128i*)(&A[1]), _mm_set_epi64x(0x1_0000_0000, 0x1_0000_0000));
4310     long[3] correct = [1, 0x1_0000_0000, 3];
4311     assert(A == correct);
4312 }
4313 
4314 /// Store the lower double-precision (64-bit) floating-point element from `a` into memory.
4315 void _mm_storel_pd (double* mem_addr, __m128d a) pure @safe
4316 {
4317     pragma(inline, true);
4318     *mem_addr = a.array[0];
4319 }
4320 
4321 /// Store 2 double-precision (64-bit) floating-point elements from `a` into memory in reverse order. `mem_addr` must be 
4322 /// aligned on a 16-byte boundary or a general-protection exception may be generated.
4323 void _mm_storer_pd (double* mem_addr, __m128d a) pure @system
4324 {
4325     // TODO remove this use of shufflevector except for LDC
4326     __m128d* aligned = cast(__m128d*)mem_addr;
4327     *aligned = shufflevector!(double2, 1, 0)(a, a);
4328 }
4329 
4330 /// Store 128-bits (composed of 2 packed double-precision (64-bit) floating-point elements) from `a` into memory. 
4331 /// `mem_addr` does not need to be aligned on any particular boundary.
4332 void _mm_storeu_pd (double* mem_addr, __m128d a) pure @trusted // TODO: signature, should be system
4333 {
4334     // PERF DMD
4335     pragma(inline, true);
4336     static if (GDC_with_SSE2)
4337     {
4338         __builtin_ia32_storeupd(mem_addr, a);
4339     }
4340     else version(LDC)
4341     {
4342         storeUnaligned!double2(a, mem_addr);
4343     }
4344     else
4345     {
4346         mem_addr[0] = a.array[0];
4347         mem_addr[1] = a.array[1];
4348     }
4349 }
4350 unittest
4351 {
4352     __m128d A = _mm_setr_pd(3.0, 4.0);
4353     align(16) double[4] R = [0.0, 0, 0, 0];
4354     double[2] correct = [3.0, 4.0];
4355     _mm_storeu_pd(&R[1], A);
4356     assert(R[1..3] == correct);
4357 }
4358 
4359 /// Store 128-bits of integer data from `a` into memory. `mem_addr` does not need to be aligned on any particular 
4360 /// boundary.
4361 void _mm_storeu_si128 (__m128i* mem_addr, __m128i a) pure @trusted // TODO: signature is wrong, mem_addr is not aligned
4362 {
4363     // PERF: DMD
4364     pragma(inline, true);
4365     static if (GDC_with_SSE2)
4366     {
4367         __builtin_ia32_storedqu(cast(char*)mem_addr, cast(ubyte16)a);
4368     }
4369     else version(LDC)
4370     {
4371         storeUnaligned!__m128i(a, cast(int*)mem_addr);
4372     }
4373     else
4374     {
4375         int* p = cast(int*)mem_addr;
4376         p[0] = a.array[0];
4377         p[1] = a.array[1];
4378         p[2] = a.array[2];
4379         p[3] = a.array[3];
4380     }
4381 }
4382 unittest
4383 {
4384     __m128i A = _mm_setr_epi32(1, 2, 3, 4);
4385     align(16) int[6] R = [0, 0, 0, 0, 0, 0];
4386     int[4] correct = [1, 2, 3, 4];
4387     _mm_storeu_si128(cast(__m128i*)(&R[1]), A);
4388     assert(R[1..5] == correct);
4389 }
4390 
4391 /// Store 32-bit integer from the first element of `a` into memory. 
4392 /// `mem_addr` does not need to be aligned on any particular boundary.
4393 void _mm_storeu_si32 (void* mem_addr, __m128i a) pure @trusted
4394 {
4395     pragma(inline, true);
4396     int* dest = cast(int*)mem_addr;
4397     *dest = a.array[0];
4398 }
4399 unittest
4400 {
4401     int[2] arr = [-24, 12];
4402     _mm_storeu_si32(&arr[1], _mm_setr_epi32(-1, -2, -6, -7));
4403     assert(arr == [-24, -1]);
4404 }
4405 
4406 /// Store 128-bits (composed of 2 packed double-precision (64-bit) floating-point elements)
4407 /// from a into memory using a non-temporal memory hint. mem_addr must be aligned on a 16-byte
4408 /// boundary or a general-protection exception may be generated.
4409 void _mm_stream_pd (double* mem_addr, __m128d a)
4410 {
4411     // BUG see `_mm_stream_ps` for an explanation why we don't implement non-temporal moves
4412     __m128d* dest = cast(__m128d*)mem_addr;
4413     *dest = a;
4414 }
4415 
4416 /// Store 128-bits of integer data from a into memory using a non-temporal memory hint.
4417 /// mem_addr must be aligned on a 16-byte boundary or a general-protection exception
4418 /// may be generated.
4419 void _mm_stream_si128 (__m128i* mem_addr, __m128i a)
4420 {
4421     // BUG see `_mm_stream_ps` for an explanation why we don't implement non-temporal moves
4422     __m128i* dest = cast(__m128i*)mem_addr;
4423     *dest = a;
4424 }
4425 
4426 /// Store 32-bit integer a into memory using a non-temporal hint to minimize cache
4427 /// pollution. If the cache line containing address mem_addr is already in the cache,
4428 /// the cache will be updated.
4429 void _mm_stream_si32 (int* mem_addr, int a)
4430 {
4431     // BUG see `_mm_stream_ps` for an explanation why we don't implement non-temporal moves
4432     *mem_addr = a;
4433 }
4434 
4435 /// Store 64-bit integer a into memory using a non-temporal hint to minimize
4436 /// cache pollution. If the cache line containing address mem_addr is already
4437 /// in the cache, the cache will be updated.
4438 void _mm_stream_si64 (long* mem_addr, long a)
4439 {
    // BUG see `_mm_stream_ps` for an explanation why we don't implement non-temporal moves
4441     *mem_addr = a;
4442 }
4443 
4444 /// Subtract packed 16-bit integers in `b` from packed 16-bit integers in `a`.
4445 __m128i _mm_sub_epi16(__m128i a, __m128i b) pure @safe
4446 {
4447     pragma(inline, true);
4448     return cast(__m128i)(cast(short8)a - cast(short8)b);
4449 }
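// Overflow wraps around, matching the behaviour of `_mm_add_epi16`.
unittest
{
    __m128i A = _mm_setr_epi16(16, 32767, 1,  2, 3, 4, 6,  6);
    __m128i B = _mm_setr_epi16(15,    -1, 0,  8, 3, 4, 5,  7);
    short8 R = cast(short8) _mm_sub_epi16(A, B);
    short[8] correct =        [1, -32768, 1, -6, 0, 0, 1, -1];
    assert(R.array == correct);
}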
4451 
4452 /// Subtract packed 32-bit integers in `b` from packed 32-bit integers in `a`.
4453 __m128i _mm_sub_epi32(__m128i a, __m128i b) pure @safe
4454 {
4455     pragma(inline, true);
4456     return cast(__m128i)(cast(int4)a - cast(int4)b);
4457 }
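// Element-wise check for the 32-bit variant.
unittest
{
    __m128i A = _mm_setr_epi32(3, 4, 5, -6);
    __m128i B = _mm_setr_epi32(1, -2, 0, 6);
    int4 R = cast(int4) _mm_sub_epi32(A, B);
    int[4] correct = [2, 6, 5, -12];
    assert(R.array == correct);
}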
4459 
4460 /// Subtract packed 64-bit integers in `b` from packed 64-bit integers in `a`.
4461 __m128i _mm_sub_epi64(__m128i a, __m128i b) pure @safe
4462 {
4463     pragma(inline, true);
4464     return cast(__m128i)(cast(long2)a - cast(long2)b);
4465 }
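// Element-wise check for the 64-bit variant.
unittest
{
    __m128i A = _mm_setr_epi64(1234, 5678);
    __m128i B = _mm_setr_epi64(-1, 8);
    long2 R = cast(long2) _mm_sub_epi64(A, B);
    long[2] correct = [1235, 5670];
    assert(R.array == correct);
}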
4467 
4468 /// Subtract packed 8-bit integers in `b` from packed 8-bit integers in `a`.
4469 __m128i _mm_sub_epi8(__m128i a, __m128i b) pure @safe
4470 {
4471     pragma(inline, true);
4472     return cast(__m128i)(cast(byte16)a - cast(byte16)b);
4473 }
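// 8-bit lanes wrap on overflow as well.
unittest
{
    __m128i A = _mm_setr_epi8(5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, -128);
    __m128i B = _mm_setr_epi8(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 1);
    byte16 R = cast(byte16) _mm_sub_epi8(A, B);
    byte[16] correct = [4, 3, 2, 1, 0, -1, -2, -3, -4, -5, -6, -7, -8, -9, -10, 127];
    assert(R.array == correct);
}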
4475 
4476 /// Subtract packed double-precision (64-bit) floating-point elements in `b` from packed double-precision (64-bit) 
4477 /// floating-point elements in `a`.
4478 __m128d _mm_sub_pd(__m128d a, __m128d b) pure @safe
4479 {
4480     pragma(inline, true);
4481     return a - b;
4482 }
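// Exactly representable values, so exact comparison is safe.
unittest
{
    __m128d A = _mm_setr_pd(4000.0, -8.0);
    __m128d B = _mm_setr_pd(12.0, -8450.0);
    __m128d R = _mm_sub_pd(A, B);
    double[2] correct = [3988.0, 8442.0];
    assert(R.array == correct);
}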
4484 
4485 /// Subtract the lower double-precision (64-bit) floating-point element in `b` from the lower double-precision (64-bit) 
4486 /// floating-point element in `a`, store that in the lower element of result, and copy the upper element from `a` to the
4487 /// upper element of result.
4488 __m128d _mm_sub_sd(__m128d a, __m128d b) pure @trusted
4489 {
4490     version(DigitalMars)
4491     {
4492         // Work-around for https://issues.dlang.org/show_bug.cgi?id=19599
4493         // Note that this is unneeded since DMD >= 2.094.0 at least, haven't investigated again
4494         asm pure nothrow @nogc @trusted { nop;}
4495         a[0] = a[0] - b[0];
4496         return a;
4497     }
4498     else static if (GDC_with_SSE2)
4499     {
4500         return __builtin_ia32_subsd(a, b);
4501     }
4502     else
4503     {
4504         a.ptr[0] -= b.array[0];
4505         return a;
4506     }
4507 }
4508 unittest
4509 {
4510     __m128d a = [1.5, -2.0];
4511     a = _mm_sub_sd(a, a);
4512     assert(a.array == [0.0, -2.0]);
4513 }
4514 
4515 /// Subtract 64-bit integer `b` from 64-bit integer `a`.
4516 __m64 _mm_sub_si64 (__m64 a, __m64 b) pure @safe
4517 {
4518     pragma(inline, true);
4519     return a - b;
4520 }
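// Operands built with `_mm_cvtsi64_m64`, as in the `_mm_set_epi64` test above.
unittest
{
    __m64 R = _mm_sub_si64(_mm_cvtsi64_m64(-6), _mm_cvtsi64_m64(10));
    assert(R.array[0] == -16);
}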
4522 
/// Subtract packed 16-bit signed integers in `b` from packed 16-bit signed integers in `a` using signed saturation.
4524 __m128i _mm_subs_epi16(__m128i a, __m128i b) pure @trusted
4525 {
4526     version(LDC)
4527     {
4528         static if (__VERSION__ >= 2085) // saturation x86 intrinsics disappeared in LLVM 8
4529         {
4530             // Generates PSUBSW since LDC 1.15 -O0
4533             enum prefix = `declare <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> %a, <8 x i16> %b)`;
4534             enum ir = `
4535                 %r = call <8 x i16> @llvm.ssub.sat.v8i16( <8 x i16> %0, <8 x i16> %1)
4536                 ret <8 x i16> %r`;
4537             return cast(__m128i) LDCInlineIREx!(prefix, ir, "", short8, short8, short8)(cast(short8)a, cast(short8)b);
4538         }
4539         else static if (LDC_with_ARM) // Raspberry ships with LDC 1.12, no saturation 
4540         {
4542             short[8] res; // PERF: =void;
4543             short8 sa = cast(short8)a;
4544             short8 sb = cast(short8)b;
4545             foreach(i; 0..8)
4546                 res[i] = saturateSignedIntToSignedShort(sa.array[i] - sb.array[i]);
4547             return _mm_loadu_si128(cast(int4*)res.ptr);
4548         }
4549         else static if (LDC_with_SSE2)
4550         {
4551             return cast(__m128i) __builtin_ia32_psubsw128(cast(short8) a, cast(short8) b);
4552         }
4553         else
4554             static assert(false);
4555     }
4556     else static if (GDC_with_SSE2)
4557     {
4558         return cast(__m128i) __builtin_ia32_psubsw128(cast(short8) a, cast(short8) b);
4559     }
4560     else
4561     {
4562         short[8] res; // PERF =void;
4563         short8 sa = cast(short8)a;
4564         short8 sb = cast(short8)b;
4565         foreach(i; 0..8)
4566             res.ptr[i] = saturateSignedIntToSignedShort(sa.array[i] - sb.array[i]);
4567         return _mm_loadu_si128(cast(int4*)res.ptr);
4568     }
4569 }
4570 unittest
4571 {
4572     short8 res = cast(short8) _mm_subs_epi16(_mm_setr_epi16(32760, -32760, 5, 4, 3, 2, 1, 0),
4573                                              _mm_setr_epi16(-10  ,     16, 5, 4, 3, 2, 1, 0));
4574     static immutable short[8] correctResult =              [32767, -32768, 0, 0, 0, 0, 0, 0];
4575     assert(res.array == correctResult);
4576 }
4577 
/// Subtract packed 8-bit signed integers in `b` from packed 8-bit signed integers in `a` using signed saturation.
4579 __m128i _mm_subs_epi8(__m128i a, __m128i b) pure @trusted
4580 {
4581     version(LDC)
4582     {
4583         static if (__VERSION__ >= 2085) // saturation x86 intrinsics disappeared in LLVM 8
4584         {
4585             // x86: Generates PSUBSB since LDC 1.15 -O0
4586             // ARM: Generates sqsub.16b since LDC 1.21 -O0
4587             enum prefix = `declare <16 x i8> @llvm.ssub.sat.v16i8(<16 x i8> %a, <16 x i8> %b)`;
4588             enum ir = `
4589                 %r = call <16 x i8> @llvm.ssub.sat.v16i8( <16 x i8> %0, <16 x i8> %1)
4590                 ret <16 x i8> %r`;
4591             return cast(__m128i) LDCInlineIREx!(prefix, ir, "", byte16, byte16, byte16)(cast(byte16)a, cast(byte16)b);
4592         }
4593         else static if (LDC_with_ARM) // Raspberry ships with LDC 1.12, no saturation 
4594         {
4595             byte[16] res; // PERF =void;
4596             byte16 sa = cast(byte16)a;
4597             byte16 sb = cast(byte16)b;
4598             foreach(i; 0..16)
4599                 res[i] = saturateSignedWordToSignedByte(sa.array[i] - sb.array[i]);
4600             return _mm_loadu_si128(cast(int4*)res.ptr);
4601         }
4602         else static if (LDC_with_SSE2)
4603         {
4604             return cast(__m128i) __builtin_ia32_psubsb128(cast(byte16) a, cast(byte16) b);
4605         }
4606         else
4607             static assert(false);
4608     }
4609     else static if (GDC_with_SSE2)
4610     {
4611         return cast(__m128i) __builtin_ia32_psubsb128(cast(ubyte16) a, cast(ubyte16) b);
4612     }
4613     else
4614     {
4615         byte[16] res; // PERF =void;
4616         byte16 sa = cast(byte16)a;
4617         byte16 sb = cast(byte16)b;
4618         foreach(i; 0..16)
4619             res[i] = saturateSignedWordToSignedByte(sa.array[i] - sb.array[i]);
4620         return _mm_loadu_si128(cast(int4*)res.ptr);
4621     }
4622 }
4623 unittest
4624 {
4625     byte16 res = cast(byte16) _mm_subs_epi8(_mm_setr_epi8(-128, 127, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0),
4626                                             _mm_setr_epi8(  15, -14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0));
4627     static immutable byte[16] correctResult            = [-128, 127,  0,  0,  0,  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0];
4628     assert(res.array == correctResult);
4629 }
4630 
/// Subtract packed 16-bit unsigned integers in `b` from packed 16-bit unsigned integers in `a` using unsigned saturation.
4632 __m128i _mm_subs_epu16(__m128i a, __m128i b) pure @trusted
4633 {
4634     version(LDC)
4635     {
4636         static if (__VERSION__ >= 2085) // saturation x86 intrinsics disappeared in LLVM 8
4637         {
4638             // x86: Generates PSUBUSW since LDC 1.15 -O0
4639             // ARM: Generates uqsub.8h since LDC 1.21 -O0
4640             enum prefix = `declare <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> %a, <8 x i16> %b)`;
4641             enum ir = `
4642                 %r = call <8 x i16> @llvm.usub.sat.v8i16( <8 x i16> %0, <8 x i16> %1)
4643                 ret <8 x i16> %r`;
4644             return cast(__m128i) LDCInlineIREx!(prefix, ir, "", short8, short8, short8)(cast(short8)a, cast(short8)b);
4645         }
4646         else static if (LDC_with_ARM) // Raspberry ships with LDC 1.12, no saturation 
4647         {
4648             short[8] res; // PERF =void;
4649             short8 sa = cast(short8)a;
4650             short8 sb = cast(short8)b;
4651             foreach(i; 0..8)
4652             {
                int diff = cast(ushort)(sa.array[i]) - cast(ushort)(sb.array[i]);
                res[i] = saturateSignedIntToUnsignedShort(diff);
4655             }
4656             return _mm_loadu_si128(cast(int4*)res.ptr);
4657         }
4658         else static if (LDC_with_SSE2)
4659         {
4660             return cast(__m128i) __builtin_ia32_psubusw128(a, b);
4661         }
4662         else 
4663             static assert(false);
4664     }
4665     else static if (GDC_with_SSE2)
4666     {
4667         return cast(__m128i) __builtin_ia32_psubusw128(cast(short8)a, cast(short8)b);
4668     }
4669     else
4670     {
4671         short[8] res; // PERF =void;
4672         short8 sa = cast(short8)a;
4673         short8 sb = cast(short8)b;
4674         foreach(i; 0..8)
4675         {
            int diff = cast(ushort)(sa.array[i]) - cast(ushort)(sb.array[i]);
            res[i] = saturateSignedIntToUnsignedShort(diff);
4678         }
4679         return _mm_loadu_si128(cast(int4*)res.ptr);
4680     }
4681 }
4682 unittest
4683 {
4684     short8 R = cast(short8) _mm_subs_epu16(_mm_setr_epi16(cast(short)65534,  1, 5, 4, 3, 2, 1, 0),
4685                                            _mm_setr_epi16(cast(short)65535, 16, 4, 4, 3, 0, 1, 0));
4686     static immutable short[8] correct =                  [               0,  0, 1, 0, 0, 2, 0, 0];
4687     assert(R.array == correct);
4688 }
4689 
/// Subtract packed 8-bit unsigned integers in `b` from packed 8-bit unsigned integers in `a` using unsigned saturation.
4691 __m128i _mm_subs_epu8(__m128i a, __m128i b) pure @trusted
4692 {
4693     version(LDC)
4694     {
4695         static if (__VERSION__ >= 2085) // saturation x86 intrinsics disappeared in LLVM 8
4696         {
4697             // x86: Generates PSUBUSB since LDC 1.15 -O0
4698             // ARM: Generates uqsub.16b since LDC 1.21 -O0
4699             enum prefix = `declare <16 x i8> @llvm.usub.sat.v16i8(<16 x i8> %a, <16 x i8> %b)`;
4700             enum ir = `
4701                 %r = call <16 x i8> @llvm.usub.sat.v16i8( <16 x i8> %0, <16 x i8> %1)
4702                 ret <16 x i8> %r`;
4703             return cast(__m128i) LDCInlineIREx!(prefix, ir, "", byte16, byte16, byte16)(cast(byte16)a, cast(byte16)b);
4704         }
        else static if (LDC_with_ARM) // Raspberry ships with LDC 1.12, no saturation
        {
            ubyte[16] res; // PERF =void;
            byte16 sa = cast(byte16)a;
            byte16 sb = cast(byte16)b;
            foreach(i; 0..16)
                res[i] = saturateSignedWordToUnsignedByte(cast(ubyte)(sa.array[i]) - cast(ubyte)(sb.array[i]));
            return _mm_loadu_si128(cast(int4*)res.ptr);
        }
4718         else static if (LDC_with_SSE2)
4719         {
4720             return __builtin_ia32_psubusb128(a, b);
4721         }
4722         else 
4723             static assert(false);
4724     }
4725     else static if (GDC_with_SSE2)
4726     {
4727         return cast(__m128i) __builtin_ia32_psubusb128(cast(ubyte16) a, cast(ubyte16) b);
4728     }
4729     else
4730     {
4731         ubyte[16] res; // PERF =void;
4732         byte16 sa = cast(byte16)a;
4733         byte16 sb = cast(byte16)b;
4734         foreach(i; 0..16)
4735             res[i] = saturateSignedWordToUnsignedByte(cast(ubyte)(sa.array[i]) - cast(ubyte)(sb.array[i]));
4736         return _mm_loadu_si128(cast(int4*)res.ptr);
4737     }
4738 }
4739 unittest
4740 {
4741     byte16 res = cast(byte16) _mm_subs_epu8(_mm_setr_epi8(cast(byte)254, 127, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0),
4742                                             _mm_setr_epi8(cast(byte)255, 120, 14, 42, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0));
4743     static immutable byte[16] correctResult =            [            0,   7,  0,  0,  0,  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0];
4744     assert(res.array == correctResult);
4745 }
4746 
4747 // Note: the only difference between these intrinsics is the signalling
4748 //       behaviour of quiet NaNs. This is incorrect but the case where
4749 //       you would want to differentiate between qNaN and sNaN and then
4750 //       treat them differently on purpose seems extremely rare.
4751 alias _mm_ucomieq_sd = _mm_comieq_sd; ///
4752 alias _mm_ucomige_sd = _mm_comige_sd; ///
4753 alias _mm_ucomigt_sd = _mm_comigt_sd; ///
4754 alias _mm_ucomile_sd = _mm_comile_sd; ///
4755 alias _mm_ucomilt_sd = _mm_comilt_sd; ///
4756 alias _mm_ucomineq_sd = _mm_comineq_sd; ///
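unittest
{
    // Minimal check (added example, not from the original source): for
    // non-NaN operands the `ucomi` aliases compare exactly like their
    // `comi` counterparts.
    __m128d A = _mm_setr_pd(1.0, 2.0);
    __m128d B = _mm_setr_pd(1.0, 3.0);
    assert(_mm_ucomieq_sd(A, B) == 1);
    assert(_mm_ucomilt_sd(A, B) == 0);
    assert(_mm_ucomile_sd(A, B) == 1);
}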

/// Return vector of type `__m128d` with undefined elements.
__m128d _mm_undefined_pd() pure @safe
{
    pragma(inline, true);
    __m128d result = void;
    return result;
}

/// Return vector of type `__m128i` with undefined elements.
__m128i _mm_undefined_si128() pure @safe
{
    pragma(inline, true);
    __m128i result = void;
    return result;
}
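unittest
{
    // Smoke test (added example, not from the original source): the
    // undefined intrinsics only need to compile and return a value;
    // their contents are unspecified, so nothing is asserted about them.
    __m128d vd = _mm_undefined_pd();
    __m128i vi = _mm_undefined_si128();
}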

/// Unpack and interleave 16-bit integers from the high half of `a` and `b`.
__m128i _mm_unpackhi_epi16 (__m128i a, __m128i b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return cast(__m128i) __builtin_ia32_punpckhwd128(cast(short8) a, cast(short8) b);
    }
    else static if (DMD_with_32bit_asm)
    {
        asm pure nothrow @nogc @trusted
        {
            movdqu XMM0, a;
            movdqu XMM1, b;
            punpckhwd XMM0, XMM1;
            movdqu a, XMM0;
        }
        return a;
    }
    else
    {
        // TODO remove this use of shufflevector except for LDC
        return cast(__m128i) shufflevector!(short8, 4, 12, 5, 13, 6, 14, 7, 15)
                                           (cast(short8)a, cast(short8)b);
    }
}
unittest
{
    __m128i A = _mm_setr_epi16(4,   5,  6,  7,  8,  9, 10, 11);
    __m128i B = _mm_setr_epi16(12, 13, 14, 15, 16, 17, 18, 19);
    short8 C = cast(short8)(_mm_unpackhi_epi16(A, B));
    short[8] correct = [8, 16, 9, 17, 10, 18, 11, 19];
    assert(C.array == correct);
}

/// Unpack and interleave 32-bit integers from the high half of `a` and `b`.
__m128i _mm_unpackhi_epi32 (__m128i a, __m128i b) pure @trusted
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_punpckhdq128(a, b);
    }
    else version(DigitalMars)
    {
        __m128i r;
        r.ptr[0] = a.array[2];
        r.ptr[1] = b.array[2];
        r.ptr[2] = a.array[3];
        r.ptr[3] = b.array[3];
        return r;
    }
    else
    {
        // TODO remove this use of shufflevector except for LDC
        return shufflevector!(int4, 2, 6, 3, 7)(cast(int4)a, cast(int4)b);
    }
}
unittest
{
    __m128i A = _mm_setr_epi32(1, 2, 3, 4);
    __m128i B = _mm_setr_epi32(5, 6, 7, 8);
    __m128i C = _mm_unpackhi_epi32(A, B);
    int[4] correct = [3, 7, 4, 8];
    assert(C.array == correct);
}

/// Unpack and interleave 64-bit integers from the high half of `a` and `b`.
__m128i _mm_unpackhi_epi64 (__m128i a, __m128i b) pure @trusted
{
    static if (GDC_with_SSE2)
    {
        return cast(__m128i) __builtin_ia32_punpckhqdq128(cast(long2) a, cast(long2) b);
    }
    else
    {
        __m128i r = cast(__m128i)b;
        r[0] = a[2];
        r[1] = a[3];
        return r;
    }
}
unittest // Issue #36
{
    __m128i A = _mm_setr_epi64(0x22222222_22222222, 0x33333333_33333333);
    __m128i B = _mm_setr_epi64(0x44444444_44444444, 0x55555555_55555555);
    long2 C = cast(long2)(_mm_unpackhi_epi64(A, B));
    long[2] correct = [0x33333333_33333333, 0x55555555_55555555];
    assert(C.array == correct);
}

/// Unpack and interleave 8-bit integers from the high half of `a` and `b`.
__m128i _mm_unpackhi_epi8 (__m128i a, __m128i b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return cast(__m128i) __builtin_ia32_punpckhbw128(cast(ubyte16)a, cast(ubyte16)b);
    }
    else static if (DMD_with_32bit_asm)
    {
        asm pure nothrow @nogc @trusted
        {
            movdqu XMM0, a;
            movdqu XMM1, b;
            punpckhbw XMM0, XMM1;
            movdqu a, XMM0;
        }
        return a;
    }
    else
    {
        // TODO remove this use of shufflevector except for LDC
        return cast(__m128i)shufflevector!(byte16, 8,  24,  9, 25, 10, 26, 11, 27,
                                                   12, 28, 13, 29, 14, 30, 15, 31)
                                                   (cast(byte16)a, cast(byte16)b);
    }
}
unittest
{
    __m128i A = _mm_setr_epi8( 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15);
    __m128i B = _mm_setr_epi8(16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31);
    byte16 C = cast(byte16) _mm_unpackhi_epi8(A, B);
    byte[16] correct = [8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31];
    assert(C.array == correct);
}

/// Unpack and interleave double-precision (64-bit) floating-point elements from the high half of `a` and `b`.
__m128d _mm_unpackhi_pd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_unpckhpd(a, b);
    }
    else
    {
        return shufflevector!(__m128d, 1, 3)(a, b); // TODO remove this use of shufflevector except for LDC
    }
}
unittest
{
    __m128d A = _mm_setr_pd(4.0, 6.0);
    __m128d B = _mm_setr_pd(7.0, 9.0);
    __m128d C = _mm_unpackhi_pd(A, B);
    double[2] correct = [6.0, 9.0];
    assert(C.array == correct);
}

/// Unpack and interleave 16-bit integers from the low half of `a` and `b`.
__m128i _mm_unpacklo_epi16 (__m128i a, __m128i b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return cast(__m128i) __builtin_ia32_punpcklwd128(cast(short8) a, cast(short8) b);
    }
    else static if (DMD_with_32bit_asm)
    {
        asm pure nothrow @nogc @trusted
        {
            movdqu XMM0, a;
            movdqu XMM1, b;
            punpcklwd XMM0, XMM1;
            movdqu a, XMM0;
        }
        return a;
    }
    else
    {
        return cast(__m128i) shufflevector!(short8, 0, 8, 1, 9, 2, 10, 3, 11)
                                           (cast(short8)a, cast(short8)b); // TODO remove this use of shufflevector except for LDC
    }
}
unittest
{
    __m128i A = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
    __m128i B = _mm_setr_epi16(8, 9, 10, 11, 12, 13, 14, 15);
    short8 C = cast(short8) _mm_unpacklo_epi16(A, B);
    short[8] correct = [0, 8, 1, 9, 2, 10, 3, 11];
    assert(C.array == correct);
}

/// Unpack and interleave 32-bit integers from the low half of `a` and `b`.
__m128i _mm_unpacklo_epi32 (__m128i a, __m128i b) pure @trusted
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_punpckldq128(a, b);
    }
    else version(DigitalMars)
    {
        __m128i r;
        r.ptr[0] = a.array[0];
        r.ptr[1] = b.array[0];
        r.ptr[2] = a.array[1];
        r.ptr[3] = b.array[1];
        return r;
    }
    else
    {
        return shufflevector!(int4, 0, 4, 1, 5)(cast(int4)a, cast(int4)b); // TODO remove this use of shufflevector except for LDC
    }
}
unittest
{
    __m128i A = _mm_setr_epi32(1, 2, 3, 4);
    __m128i B = _mm_setr_epi32(5, 6, 7, 8);
    __m128i C = _mm_unpacklo_epi32(A, B);
    int[4] correct = [1, 5, 2, 6];
    assert(C.array == correct);
}

/// Unpack and interleave 64-bit integers from the low half of `a` and `b`.
__m128i _mm_unpacklo_epi64 (__m128i a, __m128i b) pure @trusted
{
    static if (GDC_with_SSE2)
    {
        return cast(__m128i) __builtin_ia32_punpcklqdq128(cast(long2) a, cast(long2) b);
    }
    else
    {
        long2 lA = cast(long2)a;
        long2 lB = cast(long2)b;
        long2 R; // PERF =void;
        R.ptr[0] = lA.array[0];
        R.ptr[1] = lB.array[0];
        return cast(__m128i)R;
    }
}
unittest // Issue #36
{
    __m128i A = _mm_setr_epi64(0x22222222_22222222, 0x33333333_33333333);
    __m128i B = _mm_setr_epi64(0x44444444_44444444, 0x55555555_55555555);
    long2 C = cast(long2)(_mm_unpacklo_epi64(A, B));
    long[2] correct = [0x22222222_22222222, 0x44444444_44444444];
    assert(C.array == correct);
}

/// Unpack and interleave 8-bit integers from the low half of `a` and `b`.
__m128i _mm_unpacklo_epi8 (__m128i a, __m128i b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return cast(__m128i) __builtin_ia32_punpcklbw128(cast(ubyte16) a, cast(ubyte16) b);
    }
    else static if (DMD_with_32bit_asm)
    {
        asm pure nothrow @nogc @trusted
        {
            movdqu XMM0, a;
            movdqu XMM1, b;
            punpcklbw XMM0, XMM1;
            movdqu a, XMM0;
        }
        return a;
    }
    else
    {
        return cast(__m128i) shufflevector!(byte16, 0, 16, 1, 17, 2, 18, 3, 19,
                                                    4, 20, 5, 21, 6, 22, 7, 23)
                                           (cast(byte16)a, cast(byte16)b); // TODO remove this use of shufflevector except for LDC
    }
}
unittest
{
    __m128i A = _mm_setr_epi8( 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15);
    __m128i B = _mm_setr_epi8(16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31);
    byte16 C = cast(byte16) _mm_unpacklo_epi8(A, B);
    byte[16] correct = [0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23];
    assert(C.array == correct);
}

/// Unpack and interleave double-precision (64-bit) floating-point elements from the low half of `a` and `b`.
__m128d _mm_unpacklo_pd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_unpcklpd(a, b);
    }
    else
    {
        return shufflevector!(__m128d, 0, 2)(a, b); // TODO remove this use of shufflevector except for LDC
    }
}
unittest
{
    __m128d A = _mm_setr_pd(4.0, 6.0);
    __m128d B = _mm_setr_pd(7.0, 9.0);
    __m128d C = _mm_unpacklo_pd(A, B);
    double[2] correct = [4.0, 7.0];
    assert(C.array == correct);
}

/// Compute the bitwise XOR of packed double-precision (64-bit) floating-point elements in `a` and `b`.
__m128d _mm_xor_pd (__m128d a, __m128d b) pure @safe
{
    return cast(__m128d)(cast(__m128i)a ^ cast(__m128i)b);
}
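unittest
{
    // Minimal check (added example, not from the original source):
    // XOR-ing a value with itself zeroes every bit, yielding +0.0 in
    // both lanes.
    __m128d A = _mm_setr_pd(4.0, -2.0);
    __m128d R = _mm_xor_pd(A, A);
    double[2] correct = [0.0, 0.0];
    assert(R.array == correct);
}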

/// Compute the bitwise XOR of 128 bits (representing integer data) in `a` and `b`.
__m128i _mm_xor_si128 (__m128i a, __m128i b) pure @safe
{
    return a ^ b;
}
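unittest
{
    // Minimal check (added example, not from the original source):
    // lane-wise XOR of integer data.
    __m128i A = _mm_setr_epi32(1, 2, 3, 4);
    __m128i B = _mm_setr_epi32(3, 2, 1, 0);
    __m128i R = _mm_xor_si128(A, B);
    int[4] correct = [1 ^ 3, 2 ^ 2, 3 ^ 1, 4 ^ 0];
    assert(R.array == correct);
}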

unittest
{
    // Euclidean distance between two 4D points.
    // `_mm_srli_ps!N` (a byte-shift helper of this library) is used to sum
    // the four lanes horizontally: shifting right by 8 then 4 bytes and
    // adding leaves the total of all four lanes in lane 0.
    float distance(float[4] a, float[4] b) nothrow @nogc
    {
        __m128 va = _mm_loadu_ps(a.ptr);
        __m128 vb = _mm_loadu_ps(b.ptr);
        __m128 diffSquared = _mm_sub_ps(va, vb);
        diffSquared = _mm_mul_ps(diffSquared, diffSquared);
        __m128 sum = _mm_add_ps(diffSquared, _mm_srli_ps!8(diffSquared));
        sum = _mm_add_ps(sum, _mm_srli_ps!4(sum));
        return _mm_cvtss_f32(_mm_sqrt_ss(sum));
    }
    assert(distance([0, 2, 0, 0], [0, 0, 0, 0]) == 2);
}