1 /**
2 * SSE2 intrinsics. 
3 * https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#techs=SSE2
4 *
5 * Copyright: Copyright Guillaume Piolat 2016-2020, Stefanos Baziotis 2019.
6 * License:   $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
7 */
8 module inteli.emmintrin;
9 
10 public import inteli.types;
11 public import inteli.xmmintrin; // SSE2 includes SSE1
12 import inteli.mmx;
13 import inteli.internals;
14 
15 nothrow @nogc:
16 
17 
18 // SSE2 instructions
19 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#techs=SSE2
20 
21 /// Add packed 16-bit integers in `a` and `b`.
22 __m128i _mm_add_epi16 (__m128i a, __m128i b) pure @safe
23 {
24     pragma(inline, true);
25     return cast(__m128i)(cast(short8)a + cast(short8)b);
26 }
27 unittest
28 {
29     __m128i A = _mm_setr_epi16(4, 8, 13, -7, -1, 0, 9, 77);
30     short8 R = cast(short8) _mm_add_epi16(A, A);
31     short[8] correct = [8, 16, 26, -14, -2, 0, 18, 154];
32     assert(R.array == correct);
33 }
34 
35 /// Add packed 32-bit integers in `a` and `b`.
36 __m128i _mm_add_epi32 (__m128i a, __m128i b) pure @safe
37 {
38     pragma(inline, true);
39     return cast(__m128i)(cast(int4)a + cast(int4)b);
40 }
41 unittest
42 {
43     __m128i A = _mm_setr_epi32( -7, -1, 0, 9);
44     int4 R = _mm_add_epi32(A, A);
45     int[4] correct = [ -14, -2, 0, 18 ];
46     assert(R.array == correct);
47 }
48 
49 /// Add packed 64-bit integers in `a` and `b`.
50 __m128i _mm_add_epi64 (__m128i a, __m128i b) pure @safe
51 {
52     pragma(inline, true);
53     return cast(__m128i)(cast(long2)a + cast(long2)b);
54 }
55 unittest
56 {
57     __m128i A = _mm_setr_epi64(-1, 0x8000_0000_0000_0000);
58     long2 R = cast(long2) _mm_add_epi64(A, A);
59     long[2] correct = [ -2, 0 ];
60     assert(R.array == correct);
61 }
62 
63 /// Add packed 8-bit integers in `a` and `b`.
64 __m128i _mm_add_epi8 (__m128i a, __m128i b) pure @safe
65 {
66     pragma(inline, true);
67     return cast(__m128i)(cast(byte16)a + cast(byte16)b);
68 }
69 unittest
70 {
71     __m128i A = _mm_setr_epi8(4, 8, 13, -7, -1, 0, 9, 77, 4, 8, 13, -7, -1, 0, 9, 78);
72     byte16 R = cast(byte16) _mm_add_epi8(A, A);
73     byte[16] correct = [8, 16, 26, -14, -2, 0, 18, -102, 8, 16, 26, -14, -2, 0, 18, -100];
74     assert(R.array == correct);
75 }
76 
/// Add the lower double-precision (64-bit) floating-point element 
/// in `a` and `b`, store the result in the lower element of result, 
/// and copy the upper element from `a` to the upper element of result. 
80 __m128d _mm_add_sd(__m128d a, __m128d b) pure @safe
81 {
82     static if (GDC_with_SSE2)
83     {
84         return __builtin_ia32_addsd(a, b);
85     }
86     else version(DigitalMars)
87     {
88         // Work-around for https://issues.dlang.org/show_bug.cgi?id=19599
        // Note: this has been unneeded since at least DMD 2.094.0; not investigated again since.
90         asm pure nothrow @nogc @trusted { nop;}
91         a[0] = a[0] + b[0];
92         return a;
93     }
94     else
95     {
96         a[0] += b[0];
97         return a;
98     }
99 }
100 unittest
101 {
102     __m128d a = [1.5, -2.0];
103     a = _mm_add_sd(a, a);
104     assert(a.array == [3.0, -2.0]);
105 }
106 
107 /// Add packed double-precision (64-bit) floating-point elements in `a` and `b`.
108 __m128d _mm_add_pd (__m128d a, __m128d b) pure @safe
109 {
110     pragma(inline, true);
111     return a + b;
112 }
113 unittest
114 {
115     __m128d a = [1.5, -2.0];
116     a = _mm_add_pd(a, a);
117     assert(a.array == [3.0, -4.0]);
118 }
119 
120 /// Add 64-bit integers `a` and `b`.
121 __m64 _mm_add_si64 (__m64 a, __m64 b) pure @safe
122 {
123     pragma(inline, true);
124     return a + b;
125 }
126 
127 /// Add packed 16-bit integers in `a` and `b` using signed saturation.
128 __m128i _mm_adds_epi16(__m128i a, __m128i b) pure @trusted
129 {
130     static if (GDC_with_SSE2)
131     {
132         return cast(__m128i)__builtin_ia32_paddsw128(cast(short8)a, cast(short8)b);
133     }
134     else version(LDC)
135     {
136         static if (__VERSION__ >= 2085) // saturation x86 intrinsics disappeared in LLVM 8
137         {
138             // x86: Generates PADDSW since LDC 1.15 -O0
139             // ARM: Generates sqadd.8h since LDC 1.21 -O1, really bad in <= 1.20            
140             enum prefix = `declare <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> %a, <8 x i16> %b)`;
141             enum ir = `
142                 %r = call <8 x i16> @llvm.sadd.sat.v8i16( <8 x i16> %0, <8 x i16> %1)
143                 ret <8 x i16> %r`;
144             return cast(__m128i) LDCInlineIREx!(prefix, ir, "", short8, short8, short8)(cast(short8)a, cast(short8)b);
145         }
        else static if (LDC_with_ARM) // Raspberry Pi ships with LDC 1.12, which lacks saturation intrinsics
147         {
148             short[8] res; // PERF =void;
149             short8 sa = cast(short8)a;
150             short8 sb = cast(short8)b;
151             foreach(i; 0..8)
152                 res[i] = saturateSignedIntToSignedShort(sa.array[i] + sb.array[i]);
153             return _mm_loadu_si128(cast(int4*)res.ptr);
154         }
155         else
156             return cast(__m128i) __builtin_ia32_paddsw128(cast(short8)a, cast(short8)b);
157     }
158     else
159     {
160         short[8] res; // PERF =void;
161         short8 sa = cast(short8)a;
162         short8 sb = cast(short8)b;
163         foreach(i; 0..8)
164             res[i] = saturateSignedIntToSignedShort(sa.array[i] + sb.array[i]);
165         return _mm_loadu_si128(cast(int4*)res.ptr);
166     }
167 }
168 unittest
169 {
170     short8 res = cast(short8) _mm_adds_epi16(_mm_set_epi16(7, 6, 5, 4, 3, 2, 1, 0),
171                                              _mm_set_epi16(7, 6, 5, 4, 3, 2, 1, 0));
172     static immutable short[8] correctResult = [0, 2, 4, 6, 8, 10, 12, 14];
173     assert(res.array == correctResult);
174 }
175 
176 /// Add packed 8-bit signed integers in `a` and `b` using signed saturation.
177 __m128i _mm_adds_epi8(__m128i a, __m128i b) pure @trusted
178 {
179     static if (GDC_with_SSE2)
180     {
181         return cast(__m128i) __builtin_ia32_paddsb128(cast(ubyte16)a, cast(ubyte16)b);
182     }
183     else version(LDC)
184     {
185         static if (__VERSION__ >= 2085) // saturation x86 intrinsics disappeared in LLVM 8
186         {
187             // x86: Generates PADDSB since LDC 1.15 -O0
188             // ARM: Generates sqadd.16b since LDC 1.21 -O1, really bad in <= 1.20
189             enum prefix = `declare <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8> %a, <16 x i8> %b)`;
190             enum ir = `
191                 %r = call <16 x i8> @llvm.sadd.sat.v16i8( <16 x i8> %0, <16 x i8> %1)
192                 ret <16 x i8> %r`;
193             return cast(__m128i) LDCInlineIREx!(prefix, ir, "", byte16, byte16, byte16)(cast(byte16)a, cast(byte16)b);
194         }
        else static if (LDC_with_ARM) // Raspberry Pi ships with LDC 1.12, which lacks saturation intrinsics
196         {
197             byte[16] res; // PERF =void;
198             byte16 sa = cast(byte16)a;
199             byte16 sb = cast(byte16)b;
200             foreach(i; 0..16)
                res[i] = saturateSignedWordToSignedByte(sa.array[i] + sb.array[i]);
202             return _mm_loadu_si128(cast(int4*)res.ptr);
203         }
204         else
205             return cast(__m128i) __builtin_ia32_paddsb128(cast(byte16)a, cast(byte16)b);
206     }
207     else
208     {
209         byte[16] res; // PERF =void;
210         byte16 sa = cast(byte16)a;
211         byte16 sb = cast(byte16)b;
212         foreach(i; 0..16)
            res[i] = saturateSignedWordToSignedByte(sa.array[i] + sb.array[i]);
214         return _mm_loadu_si128(cast(int4*)res.ptr);
215     }
216 }
217 unittest
218 {
219     byte16 res = cast(byte16) _mm_adds_epi8(_mm_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0),
220                                             _mm_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0));
221     static immutable byte[16] correctResult = [0, 2, 4, 6, 8, 10, 12, 14,
222                                                16, 18, 20, 22, 24, 26, 28, 30];
223     assert(res.array == correctResult);
224 }
225 
226 /// Add packed 8-bit unsigned integers in `a` and `b` using unsigned saturation.
227 // PERF: #GDC version?
228 __m128i _mm_adds_epu8(__m128i a, __m128i b) pure @trusted
229 {
230     version(LDC)
231     {
232         static if (__VERSION__ >= 2085) // saturation x86 intrinsics disappeared in LLVM 8
233         {
234             // x86: Generates PADDUSB since LDC 1.15 -O0
235             // ARM: Generates uqadd.16b since LDC 1.21 -O1
236             enum prefix = `declare <16 x i8> @llvm.uadd.sat.v16i8(<16 x i8> %a, <16 x i8> %b)`;
237             enum ir = `
238                 %r = call <16 x i8> @llvm.uadd.sat.v16i8( <16 x i8> %0, <16 x i8> %1)
239                 ret <16 x i8> %r`;
240             return cast(__m128i) LDCInlineIREx!(prefix, ir, "", byte16, byte16, byte16)(cast(byte16)a, cast(byte16)b);
241         }
        else static if (LDC_with_ARM) // Raspberry Pi ships with LDC 1.12, which lacks saturation intrinsics
243         {
244             ubyte[16] res; // PERF =void;
245             byte16 sa = cast(byte16)a;
246             byte16 sb = cast(byte16)b;
247             foreach(i; 0..16)
248                 res[i] = saturateSignedWordToUnsignedByte(cast(ubyte)(sa.array[i]) + cast(ubyte)(sb.array[i]));
249             return _mm_loadu_si128(cast(int4*)res.ptr);
250         }
251         else
252             return __builtin_ia32_paddusb128(a, b);
253     }
254     else
255     {
256         ubyte[16] res; // PERF =void;
257         byte16 sa = cast(byte16)a;
258         byte16 sb = cast(byte16)b;
259         foreach(i; 0..16)
260             res[i] = saturateSignedWordToUnsignedByte(cast(ubyte)(sa.array[i]) + cast(ubyte)(sb.array[i]));
261         return _mm_loadu_si128(cast(int4*)res.ptr);
262     }
263 }
264 unittest
265 {
266     byte16 res = cast(byte16) 
267         _mm_adds_epu8(_mm_set_epi8(7, 6, 5, 4, 3, 2, cast(byte)255, 0, 7, 6, 5, 4, 3, 2, cast(byte)255, 0),
268                       _mm_set_epi8(7, 6, 5, 4, 3, 2, 1, 0, 7, 6, 5, 4, 3, 2, 1, 0));
269     static immutable byte[16] correctResult = [0, cast(byte)255, 4, 6, 8, 10, 12, 14, 
270                                                0, cast(byte)255, 4, 6, 8, 10, 12, 14];
271     assert(res.array == correctResult);
272 }
273 
274 /// Add packed unsigned 16-bit integers in `a` and `b` using unsigned saturation.
275 // PERF: #GDC version?
276 __m128i _mm_adds_epu16(__m128i a, __m128i b) pure @trusted
277 {
278     version(LDC)
279     {
280         static if (__VERSION__ >= 2085) // saturation x86 intrinsics disappeared in LLVM 8
281         {
282             // x86: Generates PADDUSW since LDC 1.15 -O0
283             // ARM: Generates uqadd.8h since LDC 1.21 -O1
284             enum prefix = `declare <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> %a, <8 x i16> %b)`;
285             enum ir = `
286                 %r = call <8 x i16> @llvm.uadd.sat.v8i16( <8 x i16> %0, <8 x i16> %1)
287                 ret <8 x i16> %r`;
288             return cast(__m128i) LDCInlineIREx!(prefix, ir, "", short8, short8, short8)(cast(short8)a, cast(short8)b);
289         }
        else static if (LDC_with_ARM) // Raspberry Pi ships with LDC 1.12, which lacks saturation intrinsics
291         {
292             ushort[8] res; // PERF =void;
293             short8 sa = cast(short8)a;
294             short8 sb = cast(short8)b;
295             foreach(i; 0..8)
296                 res[i] = saturateSignedIntToUnsignedShort(cast(ushort)(sa.array[i]) + cast(ushort)(sb.array[i]));
297             return _mm_loadu_si128(cast(int4*)res.ptr);
298         }
299         else
300             return __builtin_ia32_paddusw128(a, b);
301     }
302     else
303     {
304         ushort[8] res; // PERF =void;
305         short8 sa = cast(short8)a;
306         short8 sb = cast(short8)b;
307         foreach(i; 0..8)
308             res[i] = saturateSignedIntToUnsignedShort(cast(ushort)(sa.array[i]) + cast(ushort)(sb.array[i]));
309         return _mm_loadu_si128(cast(int4*)res.ptr);
310     }
311 }
312 unittest
313 {
314     short8 res = cast(short8) _mm_adds_epu16(_mm_set_epi16(3, 2, cast(short)65535, 0, 3, 2, cast(short)65535, 0),
315                                              _mm_set_epi16(3, 2, 1, 0, 3, 2, 1, 0));
316     static immutable short[8] correctResult = [0, cast(short)65535, 4, 6, 0, cast(short)65535, 4, 6];
317     assert(res.array == correctResult);
318 }
319 
320 /// Compute the bitwise AND of packed double-precision (64-bit) 
321 /// floating-point elements in `a` and `b`.
322 __m128d _mm_and_pd (__m128d a, __m128d b) pure @safe
323 {
324     pragma(inline, true);
325     return cast(__m128d)( cast(long2)a & cast(long2)b );
326 }
327 unittest
328 {
329     double a = 4.32;
330     double b = -78.99;
331     long correct = (*cast(long*)(&a)) & (*cast(long*)(&b));
332     __m128d A = _mm_set_pd(a, b);
333     __m128d B = _mm_set_pd(b, a);
334     long2 R = cast(long2)( _mm_and_pd(A, B) );
335     assert(R.array[0] == correct);
336     assert(R.array[1] == correct);
337 }
338 
339 /// Compute the bitwise AND of 128 bits (representing integer data) in `a` and `b`.
340 __m128i _mm_and_si128 (__m128i a, __m128i b) pure @safe
341 {
342     pragma(inline, true);
343     return a & b;
344 }
345 unittest
346 {
347     __m128i A = _mm_set1_epi32(7);
348     __m128i B = _mm_set1_epi32(14);
349     __m128i R = _mm_and_si128(A, B);
350     int[4] correct = [6, 6, 6, 6];
351     assert(R.array == correct);
352 }
353 
354 /// Compute the bitwise NOT of packed double-precision (64-bit) 
355 /// floating-point elements in `a` and then AND with `b`.
356 __m128d _mm_andnot_pd (__m128d a, __m128d b) pure @safe
357 {
358     return cast(__m128d)( ~(cast(long2)a) & cast(long2)b);
359 }
360 unittest
361 {
362     double a = 4.32;
363     double b = -78.99;
364     long correct  = (~*cast(long*)(&a)) & ( *cast(long*)(&b));
365     long correct2 = ( *cast(long*)(&a)) & (~*cast(long*)(&b));
366     __m128d A = _mm_setr_pd(a, b);
367     __m128d B = _mm_setr_pd(b, a);
368     long2 R = cast(long2)( _mm_andnot_pd(A, B) );
369     assert(R.array[0] == correct);
370     assert(R.array[1] == correct2);
371 }
372 
373 /// Compute the bitwise NOT of 128 bits (representing integer data) 
374 /// in `a` and then AND with `b`.
375 __m128i _mm_andnot_si128 (__m128i a, __m128i b) pure @safe
376 {
377     return (~a) & b;
378 }
379 unittest
380 {
381     __m128i A = _mm_set1_epi32(7);
382     __m128i B = _mm_set1_epi32(14);
383     __m128i R = _mm_andnot_si128(A, B);
384     int[4] correct = [8, 8, 8, 8];
385     assert(R.array == correct);
386 }
387 
388 /// Average packed unsigned 16-bit integers in `a` and `b`.
389 __m128i _mm_avg_epu16 (__m128i a, __m128i b) pure @trusted
390 {
391     static if (GDC_with_SSE2)
392     {
393         return cast(__m128i) __builtin_ia32_pavgw128(cast(short8)a, cast(short8)b);
394     }
395     else static if (LDC_with_ARM64)
396     {
397         return cast(__m128i) vrhadd_u16(cast(short8)a, cast(short8)b);
398     }
399     else version(LDC)
400     {
401         // Generates pavgw even in LDC 1.0, even in -O0
        // But not on ARM.
403         enum ir = `
404             %ia = zext <8 x i16> %0 to <8 x i32>
405             %ib = zext <8 x i16> %1 to <8 x i32>
406             %isum = add <8 x i32> %ia, %ib
407             %isum1 = add <8 x i32> %isum, < i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
408             %isums = lshr <8 x i32> %isum1, < i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
409             %r = trunc <8 x i32> %isums to <8 x i16>
410             ret <8 x i16> %r`;
411         return cast(__m128i) LDCInlineIR!(ir, short8, short8, short8)(cast(short8)a, cast(short8)b);
412     }
413     else
414     {
415         short8 sa = cast(short8)a;
416         short8 sb = cast(short8)b;
417         short8 sr = void;
418         foreach(i; 0..8)
419         {
420             sr.ptr[i] = cast(ushort)( (cast(ushort)(sa.array[i]) + cast(ushort)(sb.array[i]) + 1) >> 1 );
421         }
422         return cast(int4)sr;
423     }
424 }
425 unittest
426 {
427     __m128i A = _mm_set1_epi16(31);
428     __m128i B = _mm_set1_epi16(64);
429     short8 avg = cast(short8)(_mm_avg_epu16(A, B));
430     foreach(i; 0..8)
431         assert(avg.array[i] == 48);
432 }
433 
434 /// Average packed unsigned 8-bit integers in `a` and `b`.
435 __m128i _mm_avg_epu8 (__m128i a, __m128i b) pure @trusted
436 {
437     static if (GDC_with_SSE2)
438     {
439         return cast(__m128i) __builtin_ia32_pavgb128(cast(ubyte16)a, cast(ubyte16)b);
440     }
441     else static if (LDC_with_ARM64)
442     {
443         return cast(__m128i) vrhadd_u8(cast(byte16)a, cast(byte16)b);
444     }
445     else version(LDC)
446     {
447         // Generates pavgb even in LDC 1.0, even in -O0
        // But not on ARM.
449         enum ir = `
450             %ia = zext <16 x i8> %0 to <16 x i16>
451             %ib = zext <16 x i8> %1 to <16 x i16>
452             %isum = add <16 x i16> %ia, %ib
453             %isum1 = add <16 x i16> %isum, < i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
454             %isums = lshr <16 x i16> %isum1, < i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
455             %r = trunc <16 x i16> %isums to <16 x i8>
456             ret <16 x i8> %r`;
457         return cast(__m128i) LDCInlineIR!(ir, byte16, byte16, byte16)(cast(byte16)a, cast(byte16)b);
458     }
459     else
460     {
461         byte16 sa = cast(byte16)a;
462         byte16 sb = cast(byte16)b;
463         byte16 sr = void;
464         foreach(i; 0..16)
465         {
            sr.ptr[i] = cast(ubyte)( (cast(ubyte)(sa.array[i]) + cast(ubyte)(sb.array[i]) + 1) >> 1 );
467         }
468         return cast(int4)sr;
469     }
470 }
471 unittest
472 {
473     __m128i A = _mm_set1_epi8(31);
474     __m128i B = _mm_set1_epi8(64);
475     byte16 avg = cast(byte16)(_mm_avg_epu8(A, B));
476     foreach(i; 0..16)
477         assert(avg.array[i] == 48);
478 }
479 
480 /// Shift `a` left by `bytes` bytes while shifting in zeros.
481 alias _mm_bslli_si128 = _mm_slli_si128;
482 unittest
483 {
484     __m128i toShift = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
485     byte[16] exact =               [0, 0, 0, 0, 0, 0, 1, 2, 3, 4,  5,  6,  7,  8,  9, 10];
486     __m128i result = _mm_bslli_si128!5(toShift);
487     assert( (cast(byte16)result).array == exact);
488 }
489 
490 /// Shift `v` right by `bytes` bytes while shifting in zeros.
491 alias _mm_bsrli_si128 = _mm_srli_si128;
492 unittest
493 {
494     __m128i toShift = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
495     byte[16] exact =               [5, 6, 7, 8, 9,10,11,12,13,14, 15,  0,  0,  0,  0,  0];
496     __m128i result = _mm_bsrli_si128!5(toShift);
497     assert( (cast(byte16)result).array == exact);
498 }
499 
500 /// Cast vector of type `__m128d` to type `__m128`. 
501 /// Note: Also possible with a regular `cast(__m128)(a)`.
502 __m128 _mm_castpd_ps (__m128d a) pure @safe
503 {
504     return cast(__m128)a;
505 }
506 
507 /// Cast vector of type `__m128d` to type `__m128i`. 
508 /// Note: Also possible with a regular `cast(__m128i)(a)`.
509 __m128i _mm_castpd_si128 (__m128d a) pure @safe
510 {
511     return cast(__m128i)a;
512 }
513 
514 /// Cast vector of type `__m128` to type `__m128d`. 
515 /// Note: Also possible with a regular `cast(__m128d)(a)`.
516 __m128d _mm_castps_pd (__m128 a) pure @safe
517 {
518     return cast(__m128d)a;
519 }
520 
521 /// Cast vector of type `__m128` to type `__m128i`. 
522 /// Note: Also possible with a regular `cast(__m128i)(a)`.
523 __m128i _mm_castps_si128 (__m128 a) pure @safe
524 {
525     return cast(__m128i)a;
526 }
527 
528 /// Cast vector of type `__m128i` to type `__m128d`. 
529 /// Note: Also possible with a regular `cast(__m128d)(a)`.
530 __m128d _mm_castsi128_pd (__m128i a) pure @safe
531 {
532     return cast(__m128d)a;
533 }
534 
535 /// Cast vector of type `__m128i` to type `__m128`. 
536 /// Note: Also possible with a regular `cast(__m128)(a)`.
537 __m128 _mm_castsi128_ps (__m128i a) pure @safe
538 {
539     return cast(__m128)a;
540 }
541 
542 /// Invalidate and flush the cache line that contains `p` 
543 /// from all levels of the cache hierarchy.
544 void _mm_clflush (const(void)* p) @trusted
545 {
546     static if (GDC_with_SSE2)
547     {
548         __builtin_ia32_clflush(p);
549     }
550     else static if (LDC_with_SSE2)
551     {
552         __builtin_ia32_clflush(cast(void*)p);
553     }
554     else version(D_InlineAsm_X86)
555     {
556         asm pure nothrow @nogc @safe
557         {
558             mov EAX, p;
559             clflush [EAX];
560         }
561     }
562     else version(D_InlineAsm_X86_64)
563     {
564         asm pure nothrow @nogc @safe
565         {
566             mov RAX, p;
567             clflush [RAX];
568         }
569     }
570     else 
571     {
572         // Do nothing. Invalidating cacheline does
573         // not affect correctness.
574     }
575 }
576 unittest
577 {
578     ubyte[64] cacheline;
579     _mm_clflush(cacheline.ptr);
580 }
581 
582 /// Compare packed 16-bit integers in `a` and `b` for equality.
583 __m128i _mm_cmpeq_epi16 (__m128i a, __m128i b) pure @safe
584 {
585     static if (GDC_with_SSE2)
586     {
587         return cast(__m128i) __builtin_ia32_pcmpeqw128(cast(short8)a, cast(short8)b);
588     }
589     else
590     {
591         return cast(__m128i) equalMask!short8(cast(short8)a, cast(short8)b);
592     }
593 }
594 unittest
595 {
596     short8   A = [-3, -2, -1,  0,  0,  1,  2,  3];
597     short8   B = [ 4,  3,  2,  1,  0, -1, -2, -3];
598     short[8] E = [ 0,  0,  0,  0, -1,  0,  0,  0];
599     short8   R = cast(short8)(_mm_cmpeq_epi16(cast(__m128i)A, cast(__m128i)B));
600     assert(R.array == E);
601 }
602 
603 /// Compare packed 32-bit integers in `a` and `b` for equality.
604 __m128i _mm_cmpeq_epi32 (__m128i a, __m128i b) pure @safe
605 {
606     static if (GDC_with_SSE2)
607     {
608         return __builtin_ia32_pcmpeqd128(a, b);
609     }
610     else
611     {
612         return equalMask!__m128i(a, b);
613     }
614 }
615 unittest
616 {
617     int4   A = [-3, -2, -1,  0];
618     int4   B = [ 4, -2,  2,  0];
619     int[4] E = [ 0, -1,  0, -1];
620     int4   R = cast(int4)(_mm_cmpeq_epi32(A, B));
621     assert(R.array == E);
622 }
623 
624 /// Compare packed 8-bit integers in `a` and `b` for equality.
625 __m128i _mm_cmpeq_epi8 (__m128i a, __m128i b) pure @safe
626 {
627     static if (GDC_with_SSE2)
628     {
629         return cast(__m128i) __builtin_ia32_pcmpeqb128(cast(ubyte16)a, cast(ubyte16)b);
630     }
631     else
632     {
633         return cast(__m128i) equalMask!byte16(cast(byte16)a, cast(byte16)b);
634     }
635 }
636 unittest
637 {
638     __m128i A = _mm_setr_epi8(1, 2, 3, 1, 2, 1, 1, 2, 3, 2, 1, 0, 0, 1, 2, 1);
639     __m128i B = _mm_setr_epi8(2, 2, 1, 2, 3, 1, 2, 3, 2, 1, 0, 0, 1, 2, 1, 1);
640     byte16 C = cast(byte16) _mm_cmpeq_epi8(A, B);
641     byte[16] correct =       [0,-1, 0, 0, 0,-1, 0, 0, 0, 0, 0,-1, 0, 0, 0, -1];
642     assert(C.array == correct);
643 }
644 
645 /// Compare packed double-precision (64-bit) floating-point elements 
646 /// in `a` and `b` for equality.
647 __m128d _mm_cmpeq_pd (__m128d a, __m128d b) pure @safe
648 {
649     static if (GDC_with_SSE2)
650     {
651         return __builtin_ia32_cmpeqpd(a, b);
652     }
653     else
654     {
655         return cast(__m128d) cmppd!(FPComparison.oeq)(a, b);
656     }
657 }
658 
659 /// Compare the lower double-precision (64-bit) floating-point elements
660 /// in `a` and `b` for equality, store the result in the lower element,
661 /// and copy the upper element from `a`.
662 __m128d _mm_cmpeq_sd (__m128d a, __m128d b) pure @safe
663 {
664     static if (GDC_with_SSE2)
665     {
666         return __builtin_ia32_cmpeqsd(a, b);
667     }
668     else
669     {
670         return cast(__m128d) cmpsd!(FPComparison.oeq)(a, b);
671     }
672 }
673 
674 /// Compare packed double-precision (64-bit) floating-point elements 
675 /// in `a` and `b` for greater-than-or-equal.
676 __m128d _mm_cmpge_pd (__m128d a, __m128d b) pure @safe
677 {
678     static if (GDC_with_SSE2)
679     {
680         return __builtin_ia32_cmpgepd(a, b);
681     }
682     else
683     {
684         return cast(__m128d) cmppd!(FPComparison.oge)(a, b);
685     }
686 }
687 
688 /// Compare the lower double-precision (64-bit) floating-point elements 
689 /// in `a` and `b` for greater-than-or-equal, store the result in the 
690 /// lower element, and copy the upper element from `a`.
691 __m128d _mm_cmpge_sd (__m128d a, __m128d b) pure @safe
692 {
693     // Note: There is no __builtin_ia32_cmpgesd builtin.
694     static if (GDC_with_SSE2)
695     {
        // a >= b is computed as b <= a; restore the upper element of `a` afterwards.
        return _mm_move_sd(a, __builtin_ia32_cmplesd(b, a));
697     }
698     else
699     {
700         return cast(__m128d) cmpsd!(FPComparison.oge)(a, b);
701     }
702 }
703 
704 /// Compare packed 16-bit integers in `a` and `b` for greater-than.
705 __m128i _mm_cmpgt_epi16 (__m128i a, __m128i b) pure @safe
706 {
707     static if (GDC_with_SSE2)
708     {
709         return cast(__m128i) __builtin_ia32_pcmpgtw128(cast(short8)a, cast(short8)b);
710     }
711     else
712     {
713         return cast(__m128i) greaterMask!short8(cast(short8)a, cast(short8)b);
714     }
715 }
716 unittest
717 {
718     short8   A = [-3, -2, -1,  0,  0,  1,  2,  3];
719     short8   B = [ 4,  3,  2,  1,  0, -1, -2, -3];
720     short[8] E = [ 0,  0,  0,  0,  0, -1, -1, -1];
721     short8   R = cast(short8)(_mm_cmpgt_epi16(cast(__m128i)A, cast(__m128i)B));
722     assert(R.array == E);
723 }
724 
725 /// Compare packed 32-bit integers in `a` and `b` for greater-than.
726 __m128i _mm_cmpgt_epi32 (__m128i a, __m128i b) pure @safe
727 {
728     static if (GDC_with_SSE2)
729     {
730         return __builtin_ia32_pcmpgtd128(a, b); 
731     }
732     else
733     {
734         return cast(__m128i)( greaterMask!int4(a, b));
735     }
736 }
737 unittest
738 {
739     int4   A = [-3,  2, -1,  0];
740     int4   B = [ 4, -2,  2,  0];
741     int[4] E = [ 0, -1,  0,  0];
742     int4   R = cast(int4)(_mm_cmpgt_epi32(A, B));
743     assert(R.array == E);
744 }
745 
746 /// Compare packed 8-bit integers in `a` and `b` for greater-than.
747 __m128i _mm_cmpgt_epi8 (__m128i a, __m128i b) pure @safe
748 {
    // Workaround for a GCC bug here.
    // The GCC builtin is buggy and generates a weird (and wrong) sequence
    // with __builtin_ia32_pcmpgtb128.
    // GCC's emmintrin.h instead uses comparison operators we don't have.
    // PERF: this is quite a severe GDC performance problem.
    // Could be worked around with inline assembly, or another algorithm.
755   
756   /*
757     static if (GDC_with_SSE2)
758     {
759         return cast(__m128i) __builtin_ia32_pcmpgtb128(cast(ubyte16)a, cast(ubyte16)b);
760     }
761     else */
762     {
763         return cast(__m128i) greaterMask!byte16(cast(byte16)a, cast(byte16)b);
764     }
765 }
766 unittest
767 {
768     __m128i A = _mm_setr_epi8(1, 2, 3, 1,  127, -80, 1, 2, 3, 2, 1, 0, 0, 1, 2, 1);
769     __m128i B = _mm_setr_epi8(2, 2, 1, 2, -128, -42, 2, 3, 2, 1, 0, 0, 1, 2, 1, 1);
770     byte16 C = cast(byte16) _mm_cmpgt_epi8(A, B);
771     byte[16] correct =       [0, 0,-1, 0,   -1,   0, 0, 0,-1,-1,-1, 0, 0, 0,-1, 0];
772     __m128i D = _mm_cmpeq_epi8(A, B);
773     assert(C.array == correct);
774 }
775 
776 /// Compare packed double-precision (64-bit) floating-point elements 
777 /// in `a` and `b` for greater-than.
778 __m128d _mm_cmpgt_pd (__m128d a, __m128d b) pure @safe
779 {
780     static if (GDC_with_SSE2)
781     {
782         return __builtin_ia32_cmpgtpd(a, b); 
783     }
784     else
785     {
786         return cast(__m128d) cmppd!(FPComparison.ogt)(a, b);
787     }
788 }
789 
790 /// Compare the lower double-precision (64-bit) floating-point elements 
791 /// in `a` and `b` for greater-than, store the result in the lower element,
792 /// and copy the upper element from `a`.
793 __m128d _mm_cmpgt_sd (__m128d a, __m128d b) pure @safe
794 {
795     // Note: There is no __builtin_ia32_cmpgtsd builtin.
796     static if (GDC_with_SSE2)
797     {
        // a > b is computed as b < a; restore the upper element of `a` afterwards.
        return _mm_move_sd(a, __builtin_ia32_cmpltsd(b, a));
799     }
800     else
801     {
802         return cast(__m128d) cmpsd!(FPComparison.ogt)(a, b);
803     }
804 }
805 
806 /// Compare packed double-precision (64-bit) floating-point elements 
807 /// in `a` and `b` for less-than-or-equal.
808 __m128d _mm_cmple_pd (__m128d a, __m128d b) pure @safe
809 {
810     static if (GDC_with_SSE2)
811     {
812         return __builtin_ia32_cmplepd(a, b); 
813     }
814     else
815     {
816         return cast(__m128d) cmppd!(FPComparison.ole)(a, b);
817     }
818 }
819 
820 /// Compare the lower double-precision (64-bit) floating-point elements 
821 /// in `a` and `b` for less-than-or-equal, store the result in the 
822 /// lower element, and copy the upper element from `a`.
823 __m128d _mm_cmple_sd (__m128d a, __m128d b) pure @safe
824 {
825     static if (GDC_with_SSE2)
826     {
827         return __builtin_ia32_cmplesd(a, b); 
828     }
829     else
830     {
831         return cast(__m128d) cmpsd!(FPComparison.ole)(a, b);
832     }
833 }
834 
835 /// Compare packed 16-bit integers in `a` and `b` for less-than.
836 __m128i _mm_cmplt_epi16 (__m128i a, __m128i b) pure @safe
837 {
838     return _mm_cmpgt_epi16(b, a);
839 }
840 
841 /// Compare packed 32-bit integers in `a` and `b` for less-than.
842 __m128i _mm_cmplt_epi32 (__m128i a, __m128i b) pure @safe
843 {
844     return _mm_cmpgt_epi32(b, a);
845 }
846 
847 /// Compare packed 8-bit integers in `a` and `b` for less-than.
848 __m128i _mm_cmplt_epi8 (__m128i a, __m128i b) pure @safe
849 {
850     return _mm_cmpgt_epi8(b, a);
851 }
852 
853 /// Compare packed double-precision (64-bit) floating-point elements
854 /// in `a` and `b` for less-than.
855 __m128d _mm_cmplt_pd (__m128d a, __m128d b) pure @safe
856 {
857     static if (GDC_with_SSE2)
858     {
859         return __builtin_ia32_cmpltpd(a, b); 
860     }
861     else
862     {
863         return cast(__m128d) cmppd!(FPComparison.olt)(a, b);
864     }
865 }
866 
867 /// Compare the lower double-precision (64-bit) floating-point elements
868 /// in `a` and `b` for less-than, store the result in the lower 
869 /// element, and copy the upper element from `a`.
870 __m128d _mm_cmplt_sd (__m128d a, __m128d b) pure @safe
871 {
872     static if (GDC_with_SSE2)
873     {
874         return __builtin_ia32_cmpltsd(a, b); 
875     }
876     else
877     {
878         return cast(__m128d) cmpsd!(FPComparison.olt)(a, b);
879     }
880 }
881 
882 /// Compare packed double-precision (64-bit) floating-point elements
883 /// in `a` and `b` for not-equal.
884 __m128d _mm_cmpneq_pd (__m128d a, __m128d b) pure @safe
885 {
886     static if (GDC_with_SSE2)
887     {
888         return __builtin_ia32_cmpneqpd(a, b); 
889     }
890     else
891     {
892         return cast(__m128d) cmppd!(FPComparison.une)(a, b);
893     }
894 }
895 
896 /// Compare the lower double-precision (64-bit) floating-point elements
897 /// in `a` and `b` for not-equal, store the result in the lower 
898 /// element, and copy the upper element from `a`.
899 __m128d _mm_cmpneq_sd (__m128d a, __m128d b) pure @safe
900 {
901     static if (GDC_with_SSE2)
902     {
903         return __builtin_ia32_cmpneqsd(a, b); 
904     }
905     else
906     {
907         return cast(__m128d) cmpsd!(FPComparison.une)(a, b);
908     }
909 }
910 
911 /// Compare packed double-precision (64-bit) floating-point elements 
912 /// in `a` and `b` for not-greater-than-or-equal.
913 __m128d _mm_cmpnge_pd (__m128d a, __m128d b) pure @safe
914 {
915     static if (GDC_with_SSE2)
916     {
917         return __builtin_ia32_cmpngepd(a, b); 
918     }
919     else
920     {
921         return cast(__m128d) cmppd!(FPComparison.ult)(a, b);
922     }
923 }
924 
925 /// Compare the lower double-precision (64-bit) floating-point elements 
926 /// in `a` and `b` for not-greater-than-or-equal, store the result in 
927 /// the lower element, and copy the upper element from `a`.
928 __m128d _mm_cmpnge_sd (__m128d a, __m128d b) pure @safe
929 {
930     // Note: There is no __builtin_ia32_cmpngesd builtin.
931     static if (GDC_with_SSE2)
932     {
        // !(a >= b) is computed as !(b <= a); restore the upper element of `a` afterwards.
        return _mm_move_sd(a, __builtin_ia32_cmpnlesd(b, a));
934     }
935     else
936     {
937         return cast(__m128d) cmpsd!(FPComparison.ult)(a, b);
938     }
939 }
940 
941 /// Compare packed double-precision (64-bit) floating-point elements 
942 /// in `a` and `b` for not-greater-than.
943 __m128d _mm_cmpngt_pd (__m128d a, __m128d b) pure @safe
944 {
945     static if (GDC_with_SSE2)
946     {
947         return __builtin_ia32_cmpngtpd(a, b);
948     }
949     else
950     {
951         return cast(__m128d) cmppd!(FPComparison.ule)(a, b);
952     }
953 }
954 
955 /// Compare the lower double-precision (64-bit) floating-point elements 
956 /// in `a` and `b` for not-greater-than, store the result in the 
957 /// lower element, and copy the upper element from `a`.
958 __m128d _mm_cmpngt_sd (__m128d a, __m128d b) pure @safe
959 {
960     // Note: There is no __builtin_ia32_cmpngtsd builtin.
961     static if (GDC_with_SSE2)
962     {
        // !(a > b) is computed as !(b < a); restore the upper element of `a` afterwards.
        return _mm_move_sd(a, __builtin_ia32_cmpnltsd(b, a));
964     }
965     else
966     {
967         return cast(__m128d) cmpsd!(FPComparison.ule)(a, b);
968     }
969 }
970 
971 /// Compare packed double-precision (64-bit) floating-point elements 
972 /// in `a` and `b` for not-less-than-or-equal.
973 __m128d _mm_cmpnle_pd (__m128d a, __m128d b) pure @safe
974 {
975     static if (GDC_with_SSE2)
976     {
977         return __builtin_ia32_cmpnlepd(a, b);
978     }
979     else
980     {
981         return cast(__m128d) cmppd!(FPComparison.ugt)(a, b);
982     }
983 }
984 
985 /// Compare the lower double-precision (64-bit) floating-point elements 
986 /// in `a` and `b` for not-less-than-or-equal, store the result in the 
987 /// lower element, and copy the upper element from `a`.
988 __m128d _mm_cmpnle_sd (__m128d a, __m128d b) pure @safe
989 {
990     static if (GDC_with_SSE2)
991     {
992         return __builtin_ia32_cmpnlesd(a, b);
993     }
994     else
995     {
996         return cast(__m128d) cmpsd!(FPComparison.ugt)(a, b);
997     }
998 }
999  
1000 /// Compare packed double-precision (64-bit) floating-point elements 
1001 /// in `a` and `b` for not-less-than.
1002 __m128d _mm_cmpnlt_pd (__m128d a, __m128d b) pure @safe
1003 {
1004     static if (GDC_with_SSE2)
1005     {
1006         return __builtin_ia32_cmpnltpd(a, b);
1007     }
1008     else
1009     {
1010         return cast(__m128d) cmppd!(FPComparison.uge)(a, b);
1011     }
1012 }
1013 
1014 /// Compare the lower double-precision (64-bit) floating-point elements 
1015 /// in `a` and `b` for not-less-than, store the result in the lower 
1016 /// element, and copy the upper element from `a`.
1017 __m128d _mm_cmpnlt_sd (__m128d a, __m128d b) pure @safe
1018 {
1019     static if (GDC_with_SSE2)
1020     {
1021         return __builtin_ia32_cmpnltsd(a, b);
1022     }
1023     else
1024     {
1025         return cast(__m128d) cmpsd!(FPComparison.uge)(a, b);
1026     }
1027 }
1028 
1029 /// Compare packed double-precision (64-bit) floating-point elements 
1030 /// in `a` and `b` to see if neither is NaN.
1031 __m128d _mm_cmpord_pd (__m128d a, __m128d b) pure @safe
1032 {
1033     static if (GDC_with_SSE2)
1034     {
1035         return __builtin_ia32_cmpordpd(a, b);
1036     }
1037     else
1038     {
1039         return cast(__m128d) cmppd!(FPComparison.ord)(a, b);
1040     }
1041 }
1042 
1043 /// Compare the lower double-precision (64-bit) floating-point elements 
1044 /// in `a` and `b` to see if neither is NaN, store the result in the 
1045 /// lower element, and copy the upper element from `a` to the upper element.
1046 __m128d _mm_cmpord_sd (__m128d a, __m128d b) pure @safe
1047 {
1048     static if (GDC_with_SSE2)
1049     {
1050         return __builtin_ia32_cmpordsd(a, b);
1051     }
1052     else
1053     {
1054         return cast(__m128d) cmpsd!(FPComparison.ord)(a, b);
1055     }
1056 }
1057 
1058 /// Compare packed double-precision (64-bit) floating-point elements 
1059 /// in `a` and `b` to see if either is NaN.
1060 __m128d _mm_cmpunord_pd (__m128d a, __m128d b) pure @safe
1061 {
1062     static if (GDC_with_SSE2)
1063     {
1064         return __builtin_ia32_cmpunordpd(a, b);
1065     }
1066     else
1067     {
1068         return cast(__m128d) cmppd!(FPComparison.uno)(a, b);
1069     }
1070 }
1071 
1072 /// Compare the lower double-precision (64-bit) floating-point elements 
1073 /// in `a` and `b` to see if either is NaN, store the result in the lower 
1074 /// element, and copy the upper element from `a` to the upper element.
1075 __m128d _mm_cmpunord_sd (__m128d a, __m128d b) pure @safe
1076 {
1077     static if (GDC_with_SSE2)
1078     {
1079         return __builtin_ia32_cmpunordsd(a, b);
1080     }
1081     else
1082     {
1083         return cast(__m128d) cmpsd!(FPComparison.uno)(a, b);
1084     }
1085 }
1086 
1087 /// Compare the lower double-precision (64-bit) floating-point element 
1088 /// in `a` and `b` for equality, and return the boolean result (0 or 1).
1089 int _mm_comieq_sd (__m128d a, __m128d b) pure @safe
1090 {
    // Note: For some of the _mm_comiXX_sd intrinsics, the NaN semantics are not the same
    // as those of the comisd instruction: they return false in the unordered case instead.
    //
    // C++ compilers actually disagree over the meaning of that instruction.
    // GCC manages NaNs like the comisd instruction (returns true if unordered),
    // but ICC, clang and MSVC deal with NaN like the Intel Intrinsics Guide says.
    // We choose to follow the majority; GCC seems to be buggy with NaNs.
1098     return a.array[0] == b.array[0];
1099 }
1100 unittest
1101 {
1102     assert(1 == _mm_comieq_sd(_mm_set_sd(78.0), _mm_set_sd(78.0)));
1103     assert(0 == _mm_comieq_sd(_mm_set_sd(78.0), _mm_set_sd(-78.0)));
1104     assert(0 == _mm_comieq_sd(_mm_set_sd(78.0), _mm_set_sd(double.nan)));
1105     assert(0 == _mm_comieq_sd(_mm_set_sd(double.nan), _mm_set_sd(-4.22)));
1106     assert(1 == _mm_comieq_sd(_mm_set_sd(0.0), _mm_set_sd(-0.0)));
1107 }
1108 
1109 /// Compare the lower double-precision (64-bit) floating-point element 
1110 /// in `a` and `b` for greater-than-or-equal, and return the boolean 
1111 /// result (0 or 1).
1112 int _mm_comige_sd (__m128d a, __m128d b) pure @safe
1113 {
1114     return a.array[0] >= b.array[0];
1115 }
1116 unittest
1117 {
1118     assert(1 == _mm_comige_sd(_mm_set_sd(78.0), _mm_set_sd(78.0)));
1119     assert(1 == _mm_comige_sd(_mm_set_sd(78.0), _mm_set_sd(-78.0)));
1120     assert(0 == _mm_comige_sd(_mm_set_sd(-78.0), _mm_set_sd(78.0)));
1121     assert(0 == _mm_comige_sd(_mm_set_sd(78.0), _mm_set_sd(double.nan)));
1122     assert(0 == _mm_comige_sd(_mm_set_sd(double.nan), _mm_set_sd(-4.22)));
1123     assert(1 == _mm_comige_sd(_mm_set_sd(-0.0), _mm_set_sd(0.0)));
1124 }
1125 
1126 /// Compare the lower double-precision (64-bit) floating-point element 
1127 /// in `a` and `b` for greater-than, and return the boolean result (0 or 1).
1128 int _mm_comigt_sd (__m128d a, __m128d b) pure @safe
1129 {
1130     return a.array[0] > b.array[0];
1131 }
1132 unittest
1133 {
1134     assert(0 == _mm_comigt_sd(_mm_set_sd(78.0), _mm_set_sd(78.0)));
1135     assert(1 == _mm_comigt_sd(_mm_set_sd(78.0), _mm_set_sd(-78.0)));
1136     assert(0 == _mm_comigt_sd(_mm_set_sd(78.0), _mm_set_sd(double.nan)));
1137     assert(0 == _mm_comigt_sd(_mm_set_sd(double.nan), _mm_set_sd(-4.22)));
1138     assert(0 == _mm_comigt_sd(_mm_set_sd(0.0), _mm_set_sd(-0.0)));
1139 }
1140 
1141 /// Compare the lower double-precision (64-bit) floating-point element 
/// in `a` and `b` for less-than-or-equal, and return the boolean result (0 or 1).
1143 int _mm_comile_sd (__m128d a, __m128d b) pure @safe
1144 {
1145     return a.array[0] <= b.array[0];
1146 }
1147 unittest
1148 {
1149     assert(1 == _mm_comile_sd(_mm_set_sd(78.0), _mm_set_sd(78.0)));
1150     assert(0 == _mm_comile_sd(_mm_set_sd(78.0), _mm_set_sd(-78.0)));
1151     assert(1 == _mm_comile_sd(_mm_set_sd(-78.0), _mm_set_sd(78.0)));
1152     assert(0 == _mm_comile_sd(_mm_set_sd(78.0), _mm_set_sd(double.nan)));
1153     assert(0 == _mm_comile_sd(_mm_set_sd(double.nan), _mm_set_sd(-4.22)));
1154     assert(1 == _mm_comile_sd(_mm_set_sd(0.0), _mm_set_sd(-0.0)));
1155 }
1156 
1157 /// Compare the lower double-precision (64-bit) floating-point element 
1158 /// in `a` and `b` for less-than, and return the boolean result (0 or 1).
1159 int _mm_comilt_sd (__m128d a, __m128d b) pure @safe
1160 {
1161     return a.array[0] < b.array[0];
1162 }
1163 unittest
1164 {
1165     assert(0 == _mm_comilt_sd(_mm_set_sd(78.0), _mm_set_sd(78.0)));
1166     assert(0 == _mm_comilt_sd(_mm_set_sd(78.0), _mm_set_sd(-78.0)));
1167     assert(1 == _mm_comilt_sd(_mm_set_sd(-78.0), _mm_set_sd(78.0)));
1168     assert(0 == _mm_comilt_sd(_mm_set_sd(78.0), _mm_set_sd(double.nan)));
1169     assert(0 == _mm_comilt_sd(_mm_set_sd(double.nan), _mm_set_sd(-4.22)));
1170     assert(0 == _mm_comilt_sd(_mm_set_sd(-0.0), _mm_set_sd(0.0)));
1171 }
1172 
1173 /// Compare the lower double-precision (64-bit) floating-point element
1174 /// in `a` and `b` for not-equal, and return the boolean result (0 or 1).
1175 int _mm_comineq_sd (__m128d a, __m128d b) pure @safe
1176 {
1177     return a.array[0] != b.array[0];
1178 }
1179 unittest
1180 {
1181     assert(0 == _mm_comineq_sd(_mm_set_sd(78.0), _mm_set_sd(78.0)));
1182     assert(1 == _mm_comineq_sd(_mm_set_sd(78.0), _mm_set_sd(-78.0)));
1183     assert(1 == _mm_comineq_sd(_mm_set_sd(78.0), _mm_set_sd(double.nan)));
1184     assert(1 == _mm_comineq_sd(_mm_set_sd(double.nan), _mm_set_sd(-4.22)));
1185     assert(0 == _mm_comineq_sd(_mm_set_sd(0.0), _mm_set_sd(-0.0)));
1186 }
1187 
1188 /// Convert packed 32-bit integers in `a` to packed double-precision (64-bit)
1189 /// floating-point elements.
1190 __m128d _mm_cvtepi32_pd (__m128i a) pure @trusted
1191 {
1192     version(LDC)
1193     {
1194         // Generates cvtdq2pd since LDC 1.0, even without optimizations
1195         enum ir = `
1196             %v = shufflevector <4 x i32> %0,<4 x i32> %0, <2 x i32> <i32 0, i32 1>
1197             %r = sitofp <2 x i32> %v to <2 x double>
1198             ret <2 x double> %r`;
1199         return cast(__m128d) LDCInlineIR!(ir, __m128d, __m128i)(a);
1200     }
1201     else static if (GDC_with_SSE2)
1202     {
1203         return __builtin_ia32_cvtdq2pd(a);
1204     }
1205     else
1206     {
1207         double2 r = void;
1208         r.ptr[0] = a.array[0];
1209         r.ptr[1] = a.array[1];
1210         return r;
1211     }
1212 }
1213 unittest
1214 {
1215     __m128d A = _mm_cvtepi32_pd(_mm_set1_epi32(54));
1216     assert(A.array[0] == 54.0);
1217     assert(A.array[1] == 54.0);
1218 }
1219 
1220 /// Convert packed 32-bit integers in `a` to packed single-precision (32-bit) 
1221 /// floating-point elements.
1222 __m128 _mm_cvtepi32_ps(__m128i a) pure @trusted
1223 {
1224     static if (GDC_with_SSE2)
1225     {
1226         return __builtin_ia32_cvtdq2ps(a);
1227     }
1228     else version(LDC)
1229     {
1230         // See #86 for why we had to resort to LLVM IR.
1231         // Plain code below was leading to catastrophic behaviour. 
1232         // x86: Generates cvtdq2ps since LDC 1.1.0 -O0
        // ARM: Generates scvtf.4s since LDC 1.8.0 -O0
1234         enum ir = `
1235             %r = sitofp <4 x i32> %0 to <4 x float>
1236             ret <4 x float> %r`;
1237         return cast(__m128) LDCInlineIR!(ir, float4, int4)(a);
1238     }
1239     else
1240     {
1241         __m128 res; // PERF =void;
1242         res.ptr[0] = cast(float)a.array[0];
1243         res.ptr[1] = cast(float)a.array[1];
1244         res.ptr[2] = cast(float)a.array[2];
1245         res.ptr[3] = cast(float)a.array[3];
1246         return res;
1247     }
1248 }
1249 unittest
1250 {
1251     __m128 a = _mm_cvtepi32_ps(_mm_setr_epi32(-1, 0, 1, 1000));
1252     assert(a.array == [-1.0f, 0.0f, 1.0f, 1000.0f]);
1253 }
1254 
1255 /// Convert packed double-precision (64-bit) floating-point elements 
1256 /// in `a` to packed 32-bit integers.
1257 __m128i _mm_cvtpd_epi32 (__m128d a) @trusted
1258 {
1259     // PERF ARM32
1260     static if (LDC_with_SSE2)
1261     {
1262         return __builtin_ia32_cvtpd2dq(a);
1263     }
1264     else static if (GDC_with_SSE2)
1265     {
1266         return __builtin_ia32_cvtpd2dq(a);
1267     }
1268     else static if (LDC_with_ARM64)
1269     {
1270         // Get current rounding mode.
1271         uint fpscr = arm_get_fpcr();
1272         long2 i;
1273         switch(fpscr & _MM_ROUND_MASK_ARM)
1274         {
1275             default:
1276             case _MM_ROUND_NEAREST_ARM:     i = vcvtnq_s64_f64(a); break;
1277             case _MM_ROUND_DOWN_ARM:        i = vcvtmq_s64_f64(a); break;
1278             case _MM_ROUND_UP_ARM:          i = vcvtpq_s64_f64(a); break;
1279             case _MM_ROUND_TOWARD_ZERO_ARM: i = vcvtzq_s64_f64(a); break;
1280         }
1281         int4 zero = 0;
1282         return cast(__m128i) shufflevectorLDC!(int4, 0, 2, 4, 6)(cast(int4)i, zero);
1283     }
1284     else
1285     {
1286         // PERF ARM32
1287         __m128i r = _mm_setzero_si128();
1288         r.ptr[0] = convertDoubleToInt32UsingMXCSR(a.array[0]);
1289         r.ptr[1] = convertDoubleToInt32UsingMXCSR(a.array[1]);
1290         return r;
1291     }
1292 }
1293 unittest
1294 {
1295     int4 A = _mm_cvtpd_epi32(_mm_set_pd(61.0, 55.0));
1296     assert(A.array[0] == 55 && A.array[1] == 61 && A.array[2] == 0 && A.array[3] == 0);
1297 }
1298 
1299 /// Convert packed double-precision (64-bit) floating-point elements in `v`
/// to packed 32-bit integers.
1301 __m64 _mm_cvtpd_pi32 (__m128d v) @safe
1302 {
1303     return to_m64(_mm_cvtpd_epi32(v));
1304 }
1305 unittest
1306 {
1307     int2 A = cast(int2) _mm_cvtpd_pi32(_mm_set_pd(61.0, 55.0));
1308     assert(A.array[0] == 55 && A.array[1] == 61);
1309 }
1310 
1311 /// Convert packed double-precision (64-bit) floating-point elements 
1312 /// in `a` to packed single-precision (32-bit) floating-point elements.
1313 __m128 _mm_cvtpd_ps (__m128d a) pure @trusted
1314 {
1315     static if (LDC_with_SSE2)
1316     {
1317         return __builtin_ia32_cvtpd2ps(a); // can't be done with IR unfortunately
1318     }
1319     else static if (GDC_with_SSE2)
1320     {
1321         return __builtin_ia32_cvtpd2ps(a);
1322     }
1323     else
1324     { 
1325         __m128 r = void;
1326         r.ptr[0] = a.array[0];
1327         r.ptr[1] = a.array[1];
1328         r.ptr[2] = 0;
1329         r.ptr[3] = 0;
1330         return r;
1331     }
1332 }
1333 unittest
1334 {
1335     __m128d A = _mm_set_pd(5.25, 4.0);
1336     __m128 B = _mm_cvtpd_ps(A);
1337     assert(B.array == [4.0f, 5.25f, 0, 0]);
1338 }
1339 
1340 /// Convert packed 32-bit integers in `v` to packed double-precision 
1341 /// (64-bit) floating-point elements.
1342 __m128d _mm_cvtpi32_pd (__m64 v) pure @safe
1343 {
1344     return _mm_cvtepi32_pd(to_m128i(v));
1345 }
1346 unittest
1347 {
1348     __m128d A = _mm_cvtpi32_pd(_mm_setr_pi32(4, -5));
1349     assert(A.array[0] == 4.0 && A.array[1] == -5.0);
1350 }
1351 
1352 /// Convert packed single-precision (32-bit) floating-point elements 
/// in `a` to packed 32-bit integers.
1354 __m128i _mm_cvtps_epi32 (__m128 a) @trusted
1355 {
1356     static if (LDC_with_SSE2)
1357     {
1358         return cast(__m128i) __builtin_ia32_cvtps2dq(a);
1359     }
1360     else static if (GDC_with_SSE2)
1361     {
1362         return __builtin_ia32_cvtps2dq(a);
1363     }
1364     else static if (LDC_with_ARM64)
1365     {
1366         // Get current rounding mode.
1367         uint fpscr = arm_get_fpcr();
1368         switch(fpscr & _MM_ROUND_MASK_ARM)
1369         {
1370             default:
1371             case _MM_ROUND_NEAREST_ARM:     return vcvtnq_s32_f32(a);
1372             case _MM_ROUND_DOWN_ARM:        return vcvtmq_s32_f32(a);
1373             case _MM_ROUND_UP_ARM:          return vcvtpq_s32_f32(a);
1374             case _MM_ROUND_TOWARD_ZERO_ARM: return vcvtzq_s32_f32(a);
1375         }
1376     }
1377     else
1378     {
1379         __m128i r = void;
1380         r.ptr[0] = convertFloatToInt32UsingMXCSR(a.array[0]);
1381         r.ptr[1] = convertFloatToInt32UsingMXCSR(a.array[1]);
1382         r.ptr[2] = convertFloatToInt32UsingMXCSR(a.array[2]);
1383         r.ptr[3] = convertFloatToInt32UsingMXCSR(a.array[3]);
1384         return r;
1385     }
1386 }
1387 unittest
1388 {
1389     // GDC bug #98607
1390     // https://gcc.gnu.org/bugzilla/show_bug.cgi?id=98607
1391     // GDC does not provide optimization barrier for rounding mode.
    // Worked around with different literals. This bug will likely only manifest in unittests.
1393     // GCC people provided no actual fix and instead say other compilers are buggy... when they aren't.
1394 
1395     uint savedRounding = _MM_GET_ROUNDING_MODE();
1396 
1397     _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST);
1398     __m128i A = _mm_cvtps_epi32(_mm_setr_ps(1.4f, -2.1f, 53.5f, -2.9f));
1399     assert(A.array == [1, -2, 54, -3]);
1400 
1401     _MM_SET_ROUNDING_MODE(_MM_ROUND_DOWN);
1402     A = _mm_cvtps_epi32(_mm_setr_ps(1.3f, -2.11f, 53.4f, -2.8f));
1403     assert(A.array == [1, -3, 53, -3]);
1404 
1405     _MM_SET_ROUNDING_MODE(_MM_ROUND_UP);
1406     A = _mm_cvtps_epi32(_mm_setr_ps(1.3f, -2.12f, 53.6f, -2.7f));
1407     assert(A.array == [2, -2, 54, -2]);
1408 
1409     _MM_SET_ROUNDING_MODE(_MM_ROUND_TOWARD_ZERO);
1410     A = _mm_cvtps_epi32(_mm_setr_ps(1.4f, -2.17f, 53.8f, -2.91f));
1411     assert(A.array == [1, -2, 53, -2]);
1412 
1413     _MM_SET_ROUNDING_MODE(savedRounding);
1414 }
1415 
1416 /// Convert packed single-precision (32-bit) floating-point elements 
1417 /// in `a` to packed double-precision (64-bit) floating-point elements.
1418 __m128d _mm_cvtps_pd (__m128 a) pure @trusted
1419 {
1420     version(LDC)
1421     {
1422         // Generates cvtps2pd since LDC 1.0 -O0
1423         enum ir = `
1424             %v = shufflevector <4 x float> %0,<4 x float> %0, <2 x i32> <i32 0, i32 1>
1425             %r = fpext <2 x float> %v to <2 x double>
1426             ret <2 x double> %r`;
1427         return cast(__m128d) LDCInlineIR!(ir, __m128d, __m128)(a);
1428     }
1429     else static if (GDC_with_SSE2)
1430     {
1431         return __builtin_ia32_cvtps2pd(a);
1432     }
1433     else
1434     {
1435         double2 r = void;
1436         r.ptr[0] = a.array[0];
1437         r.ptr[1] = a.array[1];
1438         return r;
1439     }
1440 }
1441 unittest
1442 {
1443     __m128d A = _mm_cvtps_pd(_mm_set1_ps(54.0f));
1444     assert(A.array[0] == 54.0);
1445     assert(A.array[1] == 54.0);
1446 }
1447 
1448 /// Copy the lower double-precision (64-bit) floating-point element of `a`.
1449 double _mm_cvtsd_f64 (__m128d a) pure @safe
1450 {
1451     return a.array[0];
1452 }
1453 
1454 /// Convert the lower double-precision (64-bit) floating-point element
1455 /// in `a` to a 32-bit integer.
1456 int _mm_cvtsd_si32 (__m128d a) @safe
1457 {
1458     static if (LDC_with_SSE2)
1459     {
1460         return __builtin_ia32_cvtsd2si(a);
1461     }
1462     else static if (GDC_with_SSE2)
1463     {
1464         return __builtin_ia32_cvtsd2si(a);
1465     }
1466     else
1467     {
1468         return convertDoubleToInt32UsingMXCSR(a[0]);
1469     }
1470 }
1471 unittest
1472 {
1473     assert(4 == _mm_cvtsd_si32(_mm_set1_pd(4.0)));
1474 }
1475 
1476 /// Convert the lower double-precision (64-bit) floating-point element in `a` to a 64-bit integer.
1477 long _mm_cvtsd_si64 (__m128d a) @trusted
1478 {
1479     version (LDC)
1480     {
1481         version (X86_64)
1482         {
1483             return __builtin_ia32_cvtsd2si64(a);
1484         }
1485         else
1486         {
            // Note: In 32-bit x86, there is no way to convert from float/double to a 64-bit integer
            // using SSE instructions only. So the builtin doesn't exist for this arch.
1489             return convertDoubleToInt64UsingMXCSR(a[0]);
1490         }
1491     }
1492     else
1493     {
1494         return convertDoubleToInt64UsingMXCSR(a.array[0]);
1495     }
1496 }
1497 unittest
1498 {
1499     assert(-4 == _mm_cvtsd_si64(_mm_set1_pd(-4.0)));
1500 
1501     uint savedRounding = _MM_GET_ROUNDING_MODE();
1502 
1503     _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST);
1504     assert(-56468486186 == _mm_cvtsd_si64(_mm_set1_pd(-56468486186.49)));
1505 
1506     _MM_SET_ROUNDING_MODE(_MM_ROUND_DOWN);
1507     assert(-56468486187 == _mm_cvtsd_si64(_mm_set1_pd(-56468486186.1)));
1508 
1509     _MM_SET_ROUNDING_MODE(_MM_ROUND_UP);
1510     assert(56468486187 == _mm_cvtsd_si64(_mm_set1_pd(56468486186.1)));
1511 
1512     _MM_SET_ROUNDING_MODE(_MM_ROUND_TOWARD_ZERO);
1513     assert(-56468486186 == _mm_cvtsd_si64(_mm_set1_pd(-56468486186.9)));
1514 
1515     _MM_SET_ROUNDING_MODE(savedRounding);
1516 }
1517 
1518 deprecated("Use _mm_cvtsd_si64 instead") alias _mm_cvtsd_si64x = _mm_cvtsd_si64; ///
1519 
1520 /// Convert the lower double-precision (64-bit) floating-point element in `b` to a single-precision (32-bit) 
1521 /// floating-point element, store that in the lower element of result, and copy the upper 3 packed elements from `a`
1522 /// to the upper elements of result.
1523 __m128 _mm_cvtsd_ss (__m128 a, __m128d b) pure @trusted
1524 {
1525     static if (GDC_with_SSE2)
1526     {
1527         return __builtin_ia32_cvtsd2ss(a, b); 
1528     }
1529     else
1530     {
1531         // Generates cvtsd2ss since LDC 1.3 -O0
1532         a.ptr[0] = b.array[0];
1533         return a;
1534     }
1535 }
1536 unittest
1537 {
1538     __m128 R = _mm_cvtsd_ss(_mm_set1_ps(4.0f), _mm_set1_pd(3.0));
1539     assert(R.array == [3.0f, 4.0f, 4.0f, 4.0f]);
1540 }
1541 
1542 /// Get the lower 32-bit integer in `a`.
1543 int _mm_cvtsi128_si32 (__m128i a) pure @safe
1544 {
1545     return a.array[0];
1546 }
1547 
1548 /// Get the lower 64-bit integer in `a`.
1549 long _mm_cvtsi128_si64 (__m128i a) pure @safe
1550 {
1551     long2 la = cast(long2)a;
1552     return la.array[0];
1553 }
1554 deprecated("Use _mm_cvtsi128_si64 instead") alias _mm_cvtsi128_si64x = _mm_cvtsi128_si64;
1555 
1556 /// Convert the signed 32-bit integer `b` to a double-precision (64-bit) floating-point element, store that in the 
1557 /// lower element of result, and copy the upper element from `a` to the upper element of result.
1558 __m128d _mm_cvtsi32_sd(__m128d a, int b) pure @trusted
1559 {
1560     a.ptr[0] = cast(double)b;
1561     return a;
1562 }
1563 unittest
1564 {
1565     __m128d a = _mm_cvtsi32_sd(_mm_set1_pd(0.0f), 42);
1566     assert(a.array == [42.0, 0]);
1567 }
1568 
1569 /// Copy 32-bit integer `a` to the lower element of result, and zero the upper elements.
1570 __m128i _mm_cvtsi32_si128 (int a) pure @trusted
1571 {
1572     int4 r = [0, 0, 0, 0];
1573     r.ptr[0] = a;
1574     return r;
1575 }
1576 unittest
1577 {
1578     __m128i a = _mm_cvtsi32_si128(65);
1579     assert(a.array == [65, 0, 0, 0]);
1580 }
1581 
/// Convert the signed 64-bit integer `b` to a double-precision (64-bit) floating-point element, store the result in 
/// the lower element of result, and copy the upper element from `a` to the upper element of result.
__m128d _mm_cvtsi64_sd(__m128d a, long b) pure @trusted
1586 {
1587     a.ptr[0] = cast(double)b;
1588     return a;
1589 }
1590 unittest
1591 {
1592     __m128d a = _mm_cvtsi64_sd(_mm_set1_pd(0.0f), 42);
1593     assert(a.array == [42.0, 0]);
1594 }
1595 
1596 /// Copy 64-bit integer `a` to the lower element of result, and zero the upper element.
1597 __m128i _mm_cvtsi64_si128 (long a) pure @trusted
1598 {
1599     long2 r = [0, 0];
1600     r.ptr[0] = a;
1601     return cast(__m128i)(r);
1602 }
1603 
1604 deprecated("Use _mm_cvtsi64_sd instead") alias _mm_cvtsi64x_sd = _mm_cvtsi64_sd; ///
1605 deprecated("Use _mm_cvtsi64_si128 instead") alias _mm_cvtsi64x_si128 = _mm_cvtsi64_si128; ///
1606 
1607 /// Convert the lower single-precision (32-bit) floating-point element in `b` to a double-precision (64-bit) 
1608 /// floating-point element, store that in the lower element of result, and copy the upper element from `a` to the upper 
/// element of result.
__m128d _mm_cvtss_sd (__m128d a, __m128 b) pure @trusted
1611 {
1612     a.ptr[0] = b.array[0];
1613     return a;
1614 }
1615 unittest
1616 {
1617     __m128d a = _mm_cvtss_sd(_mm_set1_pd(0.0f), _mm_set1_ps(42.0f));
1618     assert(a.array == [42.0, 0]);
1619 }
1620 
1621 /// Convert the lower single-precision (32-bit) floating-point element in `a` to a 64-bit integer with truncation.
1622 long _mm_cvttss_si64 (__m128 a) pure @safe
1623 {
1624     return cast(long)(a.array[0]); // Generates cvttss2si as expected
1625 }
1626 unittest
1627 {
1628     assert(1 == _mm_cvttss_si64(_mm_setr_ps(1.9f, 2.0f, 3.0f, 4.0f)));
1629 }
1630 
1631 /// Convert packed double-precision (64-bit) floating-point elements in `a` to packed 32-bit integers with truncation.
1632 /// Put zeroes in the upper elements of result.
1633 __m128i _mm_cvttpd_epi32 (__m128d a) pure @trusted
1634 {
1635     static if (LDC_with_SSE2)
1636     {
1637         return __builtin_ia32_cvttpd2dq(a);
1638     }
1639     else static if (GDC_with_SSE2)
1640     {
1641         return __builtin_ia32_cvttpd2dq(a);
1642     }
1643     else
1644     {
1645         // Note: doesn't generate cvttpd2dq as of LDC 1.13
1646         __m128i r; // PERF =void;
1647         r.ptr[0] = cast(int)a.array[0];
1648         r.ptr[1] = cast(int)a.array[1];
1649         r.ptr[2] = 0;
1650         r.ptr[3] = 0;
1651         return r;
1652     }
1653 }
1654 unittest
1655 {
1656     __m128i R = _mm_cvttpd_epi32(_mm_setr_pd(-4.9, 45641.5f));
1657     assert(R.array == [-4, 45641, 0, 0]);
1658 }
1659 
1660 /// Convert packed double-precision (64-bit) floating-point elements in `v` 
1661 /// to packed 32-bit integers with truncation.
1662 __m64 _mm_cvttpd_pi32 (__m128d v) pure @safe
1663 {
1664     return to_m64(_mm_cvttpd_epi32(v));
1665 }
1666 unittest
1667 {
1668     int2 R = cast(int2) _mm_cvttpd_pi32(_mm_setr_pd(-4.9, 45641.7f));
1669     int[2] correct = [-4, 45641];
1670     assert(R.array == correct);
1671 }
1672 
1673 /// Convert packed single-precision (32-bit) floating-point elements in `a` to packed 32-bit integers with truncation.
1674 __m128i _mm_cvttps_epi32 (__m128 a) pure @trusted
1675 {
1676     // x86: Generates cvttps2dq since LDC 1.3 -O2
1677     // ARM64: generates fcvtze since LDC 1.8 -O2
1678     __m128i r; // PERF = void;
1679     r.ptr[0] = cast(int)a.array[0];
1680     r.ptr[1] = cast(int)a.array[1];
1681     r.ptr[2] = cast(int)a.array[2];
1682     r.ptr[3] = cast(int)a.array[3];
1683     return r;
1684 }
1685 unittest
1686 {
1687     __m128i R = _mm_cvttps_epi32(_mm_setr_ps(-4.9, 45641.5f, 0.0f, 1.0f));
1688     assert(R.array == [-4, 45641, 0, 1]);
1689 }
1690 
1691 /// Convert the lower double-precision (64-bit) floating-point element in `a` to a 32-bit integer with truncation.
int _mm_cvttsd_si32 (__m128d a) pure @safe
1693 {
1694     // Generates cvttsd2si since LDC 1.3 -O0
1695     return cast(int)a.array[0];
1696 }
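unittest
{
    // Truncation rounds toward zero, regardless of the MXCSR rounding mode.
    assert(-4 == _mm_cvttsd_si32(_mm_set1_pd(-4.9)));
}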
1697 
1698 /// Convert the lower double-precision (64-bit) floating-point element in `a` to a 64-bit integer with truncation.
long _mm_cvttsd_si64 (__m128d a) pure @safe
1700 {
1701     // Generates cvttsd2si since LDC 1.3 -O0
    // but in 32-bit mode it's a long sequence that resorts to the FPU
1703     return cast(long)a.array[0];
1704 }
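unittest
{
    // Truncation toward zero, here on a negative input.
    assert(-86186 == _mm_cvttsd_si64(_mm_set1_pd(-86186.5)));
}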
1705 
1706 deprecated("Use _mm_cvttsd_si64 instead") alias _mm_cvttsd_si64x = _mm_cvttsd_si64; ///
1707 
1708 /// Divide packed double-precision (64-bit) floating-point elements in `a` by packed elements in `b`.
1709 __m128d _mm_div_pd(__m128d a, __m128d b) pure @safe
1710 {
1711     pragma(inline, true);
1712     return a / b;
1713 }
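unittest
{
    // Element-wise division; both quotients are exact in binary floating-point.
    __m128d a = [6.0, -2.5];
    __m128d b = [3.0, 0.5];
    a = _mm_div_pd(a, b);
    assert(a.array == [2.0, -5.0]);
}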
1714 
/// Divide the lower double-precision (64-bit) floating-point element in `a` by the lower double-precision (64-bit)
/// floating-point element in `b`, store the result in the lower element of result, and copy the upper element from
/// `a` to the upper element of result.
__m128d _mm_div_sd(__m128d a, __m128d b) pure @trusted
1716 {
1717     static if (GDC_with_SSE2)
1718     {
1719         return __builtin_ia32_divsd(a, b);
1720     }
1721     else version(DigitalMars)
1722     {
1723         // Work-around for https://issues.dlang.org/show_bug.cgi?id=19599
1724         // Note that this is unneeded since DMD >= 2.094.0 at least, haven't investigated again
1725         asm pure nothrow @nogc @trusted { nop;}
1726         a.array[0] = a.array[0] / b.array[0];
1727         return a;
1728     }
1729     else
1730     {
1731         a.ptr[0] /= b.array[0];
1732         return a;
1733     }
1734 }
1735 unittest
1736 {
1737     __m128d a = [2.0, 4.5];
1738     a = _mm_div_sd(a, a);
1739     assert(a.array == [1.0, 4.5]);
1740 }
1741 
1742 /// Extract a 16-bit integer from `v`, selected with `index`.
1743 /// Warning: the returned value is zero-extended to 32-bits.
1744 int _mm_extract_epi16(__m128i v, int index) pure @safe
1745 {
1746     short8 r = cast(short8)v;
1747     return cast(ushort)(r.array[index & 7]);
1748 }
1749 unittest
1750 {
1751     __m128i A = _mm_set_epi16(7, 6, 5, 4, 3, 2, 1, -1);
1752     assert(_mm_extract_epi16(A, 6) == 6);
1753     assert(_mm_extract_epi16(A, 0) == 65535);
1754     assert(_mm_extract_epi16(A, 5 + 8) == 5);
1755 }
1756 
1757 /// Copy `v`, and insert the 16-bit integer `i` at the location specified by `index`.
1758 __m128i _mm_insert_epi16 (__m128i v, int i, int index) @trusted
1759 {
1760     short8 r = cast(short8)v;
1761     r.ptr[index & 7] = cast(short)i;
1762     return cast(__m128i)r;
1763 }
1764 unittest
1765 {
1766     __m128i A = _mm_set_epi16(7, 6, 5, 4, 3, 2, 1, 0);
1767     short8 R = cast(short8) _mm_insert_epi16(A, 42, 6);
1768     short[8] correct = [0, 1, 2, 3, 4, 5, 42, 7];
1769     assert(R.array == correct);
1770 }
1771 
1772 /// Perform a serializing operation on all load-from-memory instructions that were issued prior 
/// to this instruction. Guarantees that every load instruction that precedes, in program order, the load-fence
/// instruction is globally visible before any load instruction which follows the fence in program order.
1775 void _mm_lfence() @trusted
1776 {
1777     version(GNU)
1778     {
1779         static if (GDC_with_SSE2)
1780         {
1781             __builtin_ia32_lfence();
1782         }
1783         else version(X86)
1784         {
1785             asm pure nothrow @nogc @trusted
1786             {
1787                 "lfence;\n" : : : ;
1788             }
1789         }
1790         else
1791             static assert(false);
1792     }
1793     else static if (LDC_with_SSE2)
1794     {
1795         __builtin_ia32_lfence();
1796     }
1797     else static if (LDC_with_ARM64)
1798     {
1799          __builtin_arm_dmb(9);  // dmb ishld
1800     }
1801     else static if (DMD_with_asm)
1802     {
1803         asm nothrow @nogc pure @safe
1804         {
1805             lfence;
1806         }
1807     }
1808     else version(LDC)
1809     {
1810         // When the architecture is unknown, generate a full memory barrier,
        // as the semantics of lfence do not really match those of atomics.
1812         llvm_memory_fence();
1813     }
1814     else
1815         static assert(false);
1816 }
1817 unittest
1818 {
1819     _mm_lfence();
1820 }
1821 
1822 /// Load 128-bits (composed of 2 packed double-precision (64-bit) floating-point elements) from memory.
1823 /// `mem_addr` must be aligned on a 16-byte boundary or a general-protection exception may be generated.
1824 __m128d _mm_load_pd (const(double) * mem_addr) pure
1825 {
1826     pragma(inline, true);
1827     __m128d* aligned = cast(__m128d*)mem_addr;
1828     return *aligned;
1829 }
1830 unittest
1831 {
1832     align(16) double[2] S = [-5.0, 7.0];
1833     __m128d R = _mm_load_pd(S.ptr);
1834     assert(R.array == S);
1835 }
1836 
1837 /// Load a double-precision (64-bit) floating-point element from memory into both elements of dst.
1838 /// `mem_addr` does not need to be aligned on any particular boundary.
1839 __m128d _mm_load_pd1 (const(double)* mem_addr) pure
1840 {
1841     double m = *mem_addr;
1842     __m128d r; // PERF =void;
1843     r.ptr[0] = m;
1844     r.ptr[1] = m;
1845     return r;
1846 }
1847 unittest
1848 {
1849     double what = 4;
1850     __m128d R = _mm_load_pd1(&what);
1851     double[2] correct = [4.0, 4];
1852     assert(R.array == correct);
1853 }
1854 
/// Load a double-precision (64-bit) floating-point element from memory into the lower element of result, and zero the upper 
1856 /// element. `mem_addr` does not need to be aligned on any particular boundary.
1857 __m128d _mm_load_sd (const(double)* mem_addr) pure @trusted
1858 {
1859     double2 r = [0, 0];
1860     r.ptr[0] = *mem_addr;
1861     return r;
1862 }
1863 unittest
1864 {
1865     double x = -42;
1866     __m128d a = _mm_load_sd(&x);
1867     assert(a.array == [-42.0, 0.0]);
1868 }
1869 
1870 /// Load 128-bits of integer data from memory into dst. 
1871 /// `mem_addr` must be aligned on a 16-byte boundary or a general-protection exception may be generated.
__m128i _mm_load_si128 (const(__m128i)* mem_addr) pure @trusted // TODO: shouldn't be trusted because of alignment, Issue #62
1873 {
1874     pragma(inline, true);
1875     return *mem_addr;
1876 }
1877 unittest
1878 {
1879     align(16) int[4] correct = [-1, 2, 3, 4];
1880     int4 A = cast(int4) _mm_load_si128(cast(__m128i*) correct.ptr);
1881     assert(A.array == correct);
1882 }
1883 
1884 alias _mm_load1_pd = _mm_load_pd1; ///
1885 
1886 /// Load a double-precision (64-bit) floating-point element from memory into the upper element of result, and copy the 
1887 /// lower element from `a` to result. `mem_addr` does not need to be aligned on any particular boundary.
1888 __m128d _mm_loadh_pd (__m128d a, const(double)* mem_addr) pure @trusted
1889 {
1890     pragma(inline, true);
1891     a.ptr[1] = *mem_addr;
1892     return a;
1893 }
1894 unittest
1895 {
1896     double A = 7.0;
1897     __m128d B = _mm_setr_pd(4.0, -5.0);
1898     __m128d R = _mm_loadh_pd(B, &A);
1899     double[2] correct = [ 4.0, 7.0 ];
1900     assert(R.array == correct);
1901 }
1902 
/// Load 64-bit integer from memory into the lower element of result, and zero the upper element.
// Note: strange signature since the memory doesn't have to be aligned (Issue #60), and doesn't have to be 128-bit
1905 __m128i _mm_loadl_epi64 (const(__m128i)* mem_addr) pure @trusted // TODO signature
1906 {
1907     pragma(inline, true);
1908     auto pLong = cast(const(long)*)mem_addr;
1909     long2 r = [0, 0];
1910     r.ptr[0] = *pLong;
1911     return cast(__m128i)(r);
1912 }
1913 unittest
1914 {
1915     long A = 0x7878787870707070;
1916     long2 R = cast(long2) _mm_loadl_epi64(cast(__m128i*)&A);
1917     long[2] correct = [0x7878787870707070, 0];
1918     assert(R.array == correct);
1919 }
1920 
1921 /// Load a double-precision (64-bit) floating-point element from memory into the lower element of result, and copy the 
/// upper element from `a` to result. `mem_addr` does not need to be aligned on any particular boundary.
1923 __m128d _mm_loadl_pd (__m128d a, const(double)* mem_addr) pure @trusted
1924 {
1925     a.ptr[0] = *mem_addr;
1926     return a;
1927 }
1928 unittest
1929 {
1930     double A = 7.0;
1931     __m128d B = _mm_setr_pd(4.0, -5.0);
1932     __m128d R = _mm_loadl_pd(B, &A);
1933     double[2] correct = [ 7.0, -5.0 ];
1934     assert(R.array == correct);
1935 }
1936 
1937 /// Load 2 double-precision (64-bit) floating-point elements from memory into result in reverse order. 
1938 /// `mem_addr` must be aligned on a 16-byte boundary or a general-protection exception may be generated.
1939 __m128d _mm_loadr_pd (const(double)* mem_addr) pure @trusted
1940 {
1941     __m128d a = *cast(__m128d*)(mem_addr);
1942     __m128d r; // PERF =void;
1943     r.ptr[0] = a.array[1];
1944     r.ptr[1] = a.array[0];
1945     return r;
1946 }
1947 unittest
1948 {
1949     align(16) double[2] A = [56.0, -74.0];
1950     __m128d R = _mm_loadr_pd(A.ptr);
1951     double[2] correct = [-74.0, 56.0];
1952     assert(R.array == correct);
1953 }
1954 
1955 /// Load 128-bits (composed of 2 packed double-precision (64-bit) floating-point elements) from memory. 
1956 /// `mem_addr` does not need to be aligned on any particular boundary.
1957 __m128d _mm_loadu_pd (const(double)* mem_addr) pure @trusted
1958 {
1959     pragma(inline, true);
1960     static if (GDC_with_SSE2)
1961     {
1962         return __builtin_ia32_loadupd(mem_addr); 
1963     }
1964     else version(LDC)
1965     {
1966         return loadUnaligned!(double2)(mem_addr);
1967     }
1968     else version(DigitalMars)
1969     {
1970         // Apparently inside __simd you can use aligned dereferences without fear.
1971         // That was issue 23048 on dlang's Bugzilla.
1972         static if (DMD_with_DSIMD)
1973         {
1974             return cast(__m128d)__simd(XMM.LODUPD, *cast(double2*)mem_addr);
1975         }
1976         else static if (SSESizedVectorsAreEmulated)
1977         {
            // Since this vector is emulated, it doesn't have alignment constraints
1979             // and as such we can just cast it.
1980             return *cast(__m128d*)(mem_addr);
1981         }
1982         else
1983         {
1984             __m128d result;
1985             result.ptr[0] = mem_addr[0];
1986             result.ptr[1] = mem_addr[1];
1987             return result;
1988         }
1989     }
1990     else
1991     {
1992         __m128d result;
1993         result.ptr[0] = mem_addr[0];
1994         result.ptr[1] = mem_addr[1];
1995         return result;
1996     }
1997 }
1998 unittest
1999 {
2000     double[2] A = [56.0, -75.0];
2001     __m128d R = _mm_loadu_pd(A.ptr);
2002     double[2] correct = [56.0, -75.0];
2003     assert(R.array == correct);
2004 }
2005 
2006 /// Load 128-bits of integer data from memory. `mem_addr` does not need to be aligned on any particular boundary.
2007 __m128i _mm_loadu_si128 (const(__m128i)* mem_addr) pure @trusted
2008 {
2009     // PERF DMD
2010     pragma(inline, true);
2011     static if (GDC_with_SSE2)
2012     {
2013         return cast(__m128i) __builtin_ia32_loaddqu(cast(const(char*))mem_addr);
2014     }
2015     else version(LDC)
2016     {
2017         return loadUnaligned!(__m128i)(cast(int*)mem_addr);
2018     }
2019     else
2020     {
2021         const(int)* p = cast(const(int)*)mem_addr;
2022         __m128i r = void;
2023         r.ptr[0] = p[0];
2024         r.ptr[1] = p[1];
2025         r.ptr[2] = p[2];
2026         r.ptr[3] = p[3];
2027         return r;
2028     }
2029 }
2030 unittest
2031 {
2032     align(16) int[4] correct = [-1, 2, -3, 4];
2033     int4 A = cast(int4) _mm_loadu_si128(cast(__m128i*) correct.ptr);
2034     assert(A.array == correct);
2035 }
2036 
2037 /// Load unaligned 32-bit integer from memory into the first element of result.
2038 __m128i _mm_loadu_si32 (const(void)* mem_addr) pure @trusted
2039 {
2040     pragma(inline, true);
2041     int r = *cast(int*)(mem_addr);
2042     int4 result = [0, 0, 0, 0];
2043     result.ptr[0] = r;
2044     return result;
2045 }
2046 unittest
2047 {
2048     int r = 42;
2049     __m128i A = _mm_loadu_si32(&r);
2050     int[4] correct = [42, 0, 0, 0];
2051     assert(A.array == correct);
2052 }
2053 
2054 /// Multiply packed signed 16-bit integers in `a` and `b`, producing intermediate
2055 /// signed 32-bit integers. Horizontally add adjacent pairs of intermediate 32-bit integers,
2056 /// and pack the results in destination.
2057 __m128i _mm_madd_epi16 (__m128i a, __m128i b) pure @trusted
2058 {
2059     static if (GDC_with_SSE2)
2060     {
2061         return cast(__m128i) __builtin_ia32_pmaddwd128(cast(short8)a, cast(short8)b);
2062     }
2063     else static if (LDC_with_SSE2)
2064     {
2065         return cast(__m128i) __builtin_ia32_pmaddwd128(cast(short8)a, cast(short8)b);
2066     }
2067     else static if (LDC_with_ARM64)
2068     {
2069         int4 pl = vmull_s16(vget_low_s16(cast(short8)a), vget_low_s16(cast(short8)b));
2070         int4 ph = vmull_s16(vget_high_s16(cast(short8)a), vget_high_s16(cast(short8)b));
2071         int2 rl = vpadd_s32(vget_low_s32(pl), vget_high_s32(pl));
2072         int2 rh = vpadd_s32(vget_low_s32(ph), vget_high_s32(ph));
2073         return vcombine_s32(rl, rh);
2074     }
2075     else
2076     {
2077         short8 sa = cast(short8)a;
2078         short8 sb = cast(short8)b;
2079         int4 r;
2080         foreach(i; 0..4)
2081         {
2082             r.ptr[i] = sa.array[2*i] * sb.array[2*i] + sa.array[2*i+1] * sb.array[2*i+1];
2083         }
2084         return r;
2085     }
2086 }
2087 unittest
2088 {
2089     short8 A = [0, 1, 2, 3, -32768, -32768, 32767, 32767];
2090     short8 B = [0, 1, 2, 3, -32768, -32768, 32767, 32767];
2091     int4 R = _mm_madd_epi16(cast(__m128i)A, cast(__m128i)B);
2092     int[4] correct = [1, 13, -2147483648, 2*32767*32767];
2093     assert(R.array == correct);
2094 }
2095 
2096 /// Conditionally store 8-bit integer elements from `a` into memory using `mask`
2097 /// (elements are not stored when the highest bit is not set in the corresponding element)
2098 /// and a non-temporal memory hint. `mem_addr` does not need to be aligned on any particular
2099 /// boundary.
2100 void _mm_maskmoveu_si128 (__m128i a, __m128i mask, void* mem_addr) @trusted
2101 {
2102     static if (GDC_with_SSE2)
2103     {    
2104         return __builtin_ia32_maskmovdqu(cast(ubyte16)a, cast(ubyte16)mask, cast(char*)mem_addr);
2105     }
2106     else static if (LDC_with_SSE2)
2107     {
2108         return __builtin_ia32_maskmovdqu(cast(byte16)a, cast(byte16)mask, cast(char*)mem_addr);
2109     }
2110     else static if (LDC_with_ARM64)
2111     {
2112         // PERF: catastrophic on ARM32
2113         byte16 bmask  = cast(byte16)mask;
2114         byte16 shift = 7;
2115         bmask = bmask >> shift; // sign-extend to have a 0xff or 0x00 mask
2116         mask = cast(__m128i) bmask;
2117         __m128i dest = loadUnaligned!__m128i(cast(int*)mem_addr);
2118         dest = (a & mask) | (dest & ~mask);
2119         storeUnaligned!__m128i(dest, cast(int*)mem_addr);
2120     }
2121     else
2122     {
2123         byte16 b = cast(byte16)a;
2124         byte16 m = cast(byte16)mask;
2125         byte* dest = cast(byte*)(mem_addr);
2126         foreach(j; 0..16)
2127         {
2128             if (m.array[j] & 128)
2129             {
2130                 dest[j] = b.array[j];
2131             }
2132         }
2133     }
2134 }
2135 unittest
2136 {
2137     ubyte[16] dest =           [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42];
2138     __m128i mask = _mm_setr_epi8(0,-1, 0,-1,-1, 1,-1,-1, 0,-1,-4,-1,-1, 0,-127, 0);
2139     __m128i A    = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15);
2140     _mm_maskmoveu_si128(A, mask, dest.ptr);
2141     ubyte[16] correct =        [42, 1,42, 3, 4,42, 6, 7,42, 9,10,11,12,42,14,42];
2142     assert(dest == correct);
2143 }
2144 
2145 /// Compare packed signed 16-bit integers in `a` and `b`, and return packed maximum values.
2146 __m128i _mm_max_epi16 (__m128i a, __m128i b) pure @safe
2147 {
2148     static if (GDC_with_SSE2)
2149     {
2150         return cast(__m128i) __builtin_ia32_pmaxsw128(cast(short8)a, cast(short8)b);
2151     }
2152     else version(LDC)
2153     {
2154         // x86: pmaxsw since LDC 1.0 -O1
        // ARM: smax.8h since LDC 1.5 -O1
2156         short8 sa = cast(short8)a;
2157         short8 sb = cast(short8)b;
2158         short8 greater = greaterMask!short8(sa, sb);
2159         return cast(__m128i)( (greater & sa) | (~greater & sb) );
2160     }
2161     else
2162     {
        __m128i greaterShorts = _mm_cmpgt_epi16(a, b); // ones where a should be selected, b otherwise
        __m128i aTob = _mm_xor_si128(a, b); // a ^ (a ^ b) == b
        __m128i mask = _mm_and_si128(aTob, greaterShorts);
2166         return _mm_xor_si128(b, mask);
2167     }
2168 }
2169 unittest
2170 {
2171     short8 R = cast(short8) _mm_max_epi16(_mm_setr_epi16(32767, 1, -4, -8, 9,  7, 0,-57),
2172                                           _mm_setr_epi16(-4,-8,  9,  7, 0,-32768, 0,  0));
2173     short[8] correct =                                  [32767, 1,  9,  7, 9,  7, 0,  0];
2174     assert(R.array == correct);
2175 }
2176 
/// Compare packed unsigned 8-bit integers in `a` and `b`, and return packed maximum values.
2178 __m128i _mm_max_epu8 (__m128i a, __m128i b) pure @safe
2179 {
2180     version(LDC)
2181     {
2182         // x86: pmaxub since LDC 1.0.0 -O1
2183         // ARM64: umax.16b since LDC 1.5.0 -O1
2184         // PERF: catastrophic on ARM32
2185         ubyte16 sa = cast(ubyte16)a;
2186         ubyte16 sb = cast(ubyte16)b;
2187         ubyte16 greater = cast(ubyte16) greaterMask!ubyte16(sa, sb);
2188         return cast(__m128i)( (greater & sa) | (~greater & sb) );
2189     }
2190     else
2191     {
2192         __m128i value128 = _mm_set1_epi8(-128);
2193         __m128i higher = _mm_cmpgt_epi8(_mm_add_epi8(a, value128), _mm_add_epi8(b, value128)); // signed comparison
2194         __m128i aTob = _mm_xor_si128(a, b); // a ^ (a ^ b) == b
2195         __m128i mask = _mm_and_si128(aTob, higher);
2196         return _mm_xor_si128(b, mask);
2197     }
2198 }
2199 unittest
2200 {
2201     byte16 R = cast(byte16) _mm_max_epu8(_mm_setr_epi8(45, 1, -4, -8, 9,  7, 0,-57, -4,-8,  9,  7, 0,-57, 0,  0),
2202                                          _mm_setr_epi8(-4,-8,  9,  7, 0,-57, 0,  0, 45, 1, -4, -8, 9,  7, 0,-57));
2203     byte[16] correct =                                [-4,-8, -4, -8, 9,-57, 0,-57, -4,-8, -4, -8, 9,-57, 0,-57];
2204     assert(R.array == correct);
2205 }
2206 
2207 /// Compare packed double-precision (64-bit) floating-point elements in `a` and `b`, and return 
2208 /// packed maximum values.
2209 __m128d _mm_max_pd (__m128d a, __m128d b) pure @trusted
2210 {
2211     static if (GDC_with_SSE2)
2212     {
2213         return __builtin_ia32_maxpd(a, b);
2214     }
2215     else
2216     {
2217         // x86: Generates maxpd starting with LDC 1.9 -O2
2218         a.ptr[0] = (a.array[0] > b.array[0]) ? a.array[0] : b.array[0];
2219         a.ptr[1] = (a.array[1] > b.array[1]) ? a.array[1] : b.array[1];
2220         return a;
2221     }
2222 }
2223 unittest
2224 {
2225     __m128d A = _mm_setr_pd(4.0, 1.0);
2226     __m128d B = _mm_setr_pd(1.0, 8.0);
2227     __m128d M = _mm_max_pd(A, B);
2228     assert(M.array[0] == 4.0);
2229     assert(M.array[1] == 8.0);
2230 }
2231 
2232 /// Compare the lower double-precision (64-bit) floating-point elements in `a` and `b`, store the maximum value in the 
2233 /// lower element of result, and copy the upper element from `a` to the upper element of result.
2234 __m128d _mm_max_sd (__m128d a, __m128d b) pure @trusted
2235 {
2236     static if (GDC_with_SSE2)
2237     {
2238         return __builtin_ia32_maxsd(a, b);
2239     }
2240     else
2241     {
2242          __m128d r = a;
2243         // Generates maxsd starting with LDC 1.3
2244         r.ptr[0] = (a.array[0] > b.array[0]) ? a.array[0] : b.array[0];
2245         return r;
2246     }
2247 }
2248 unittest
2249 {
2250     __m128d A = _mm_setr_pd(1.0, 1.0);
2251     __m128d B = _mm_setr_pd(4.0, 2.0);
2252     __m128d M = _mm_max_sd(A, B);
2253     assert(M.array[0] == 4.0);
2254     assert(M.array[1] == 1.0);
2255 }
2256 
2257 /// Perform a serializing operation on all load-from-memory and store-to-memory instructions that were issued prior to 
2258 /// this instruction. Guarantees that every memory access that precedes, in program order, the memory fence instruction 
2259 /// is globally visible before any memory instruction which follows the fence in program order.
2260 void _mm_mfence() @trusted // not pure!
2261 {
2262     version(GNU)
2263     {
2264         static if (GDC_with_SSE2)
2265         {
2266             __builtin_ia32_mfence();
2267         }
2268         else version(X86)
2269         {
2270             asm pure nothrow @nogc @trusted
2271             {
2272                 "mfence;\n" : : : ;
2273             }
2274         }
2275         else
2276             static assert(false);
2277     }
2278     else static if (LDC_with_SSE2)
2279     {
2280         __builtin_ia32_mfence();
2281     }
2282     else static if (DMD_with_asm)
2283     {
2284         asm nothrow @nogc pure @safe
2285         {
2286             mfence;
2287         }
2288     }
2289     else version(LDC)
2290     {
2291         // Note: will generate the DMB ish instruction on ARM
2292         llvm_memory_fence();
2293     }
2294     else
2295         static assert(false);
2296 }
2297 unittest
2298 {
2299     _mm_mfence();
2300 }
2301 
2302 /// Compare packed signed 16-bit integers in `a` and `b`, and return packed minimum values.
2303 __m128i _mm_min_epi16 (__m128i a, __m128i b) pure @safe
2304 {
2305     static if (GDC_with_SSE2)
2306     {
2307         return cast(__m128i) __builtin_ia32_pminsw128(cast(short8)a, cast(short8)b);
2308     }
2309     else version(LDC)
2310     {
2311         // x86: pminsw since LDC 1.0 -O1
        // ARM64: smin.8h since LDC 1.5 -O1
2313         short8 sa = cast(short8)a;
2314         short8 sb = cast(short8)b;
2315         short8 greater = greaterMask!short8(sa, sb);
2316         return cast(__m128i)( (~greater & sa) | (greater & sb) );
2317     }
2318     else
2319     {
        __m128i lowerShorts = _mm_cmplt_epi16(a, b); // ones where a should be selected, b otherwise
2321         __m128i aTob = _mm_xor_si128(a, b); // a ^ (a ^ b) == b
2322         __m128i mask = _mm_and_si128(aTob, lowerShorts);
2323         return _mm_xor_si128(b, mask);
2324     }
2325 }
2326 unittest
2327 {
2328     short8 R = cast(short8) _mm_min_epi16(_mm_setr_epi16(45, 1, -4, -8, 9,  7, 0,-32768),
2329                                           _mm_setr_epi16(-4,-8,  9,  7, 0,-57, 0,  0));
2330     short[8] correct =                                  [-4,-8, -4, -8, 0,-57, 0, -32768];
2331     assert(R.array == correct);
2332 }
2333 
2334 /// Compare packed unsigned 8-bit integers in `a` and `b`, and return packed minimum values.
2335 __m128i _mm_min_epu8 (__m128i a, __m128i b) pure @safe
2336 {
2337     version(LDC)
2338     {
2339         // x86: pminub since LDC 1.0.0 -O1
2340         // ARM: umin.16b since LDC 1.5.0 -O1
2341         // PERF: catastrophic on ARM32
2342         ubyte16 sa = cast(ubyte16)a;
2343         ubyte16 sb = cast(ubyte16)b;
2344         ubyte16 greater = cast(ubyte16) greaterMask!ubyte16(sa, sb);
2345         return cast(__m128i)( (~greater & sa) | (greater & sb) );
2346     }
2347     else
2348     {
2349         __m128i value128 = _mm_set1_epi8(-128);
2350         __m128i lower = _mm_cmplt_epi8(_mm_add_epi8(a, value128), _mm_add_epi8(b, value128)); // signed comparison
2351         __m128i aTob = _mm_xor_si128(a, b); // a ^ (a ^ b) == b
2352         __m128i mask = _mm_and_si128(aTob, lower);
2353         return _mm_xor_si128(b, mask);
2354     }
2355 }
2356 unittest
2357 {
2358     byte16 R = cast(byte16) _mm_min_epu8(_mm_setr_epi8(45, 1, -4, -8, 9,  7, 0,-57, -4,-8,  9,  7, 0,-57, 0,  0),
2359                                          _mm_setr_epi8(-4,-8,  9,  7, 0,-57, 0,  0, 45, 1, -4, -8, 9,  7, 0,-57));
2360     byte[16] correct =                                [45, 1,  9,  7, 0,  7, 0,  0, 45, 1,  9,  7, 0,  7, 0,  0];
2361     assert(R.array == correct);
2362 }
2363 
2364 /// Compare packed double-precision (64-bit) floating-point elements in `a` and `b`, and return packed minimum values.
2365 __m128d _mm_min_pd (__m128d a, __m128d b) pure @trusted
2366 {
2367     static if (GDC_with_SSE2)
2368     {
2369         return __builtin_ia32_minpd(a, b);
2370     }
2371     else
2372     {
2373         // Generates minpd starting with LDC 1.9
2374         a.ptr[0] = (a.array[0] < b.array[0]) ? a.array[0] : b.array[0];
2375         a.ptr[1] = (a.array[1] < b.array[1]) ? a.array[1] : b.array[1];
2376         return a;
2377     }
2378 }
2379 unittest
2380 {
2381     __m128d A = _mm_setr_pd(1.0, 2.0);
2382     __m128d B = _mm_setr_pd(4.0, 1.0);
2383     __m128d M = _mm_min_pd(A, B);
2384     assert(M.array[0] == 1.0);
2385     assert(M.array[1] == 1.0);
2386 }
2387 
2388 /// Compare the lower double-precision (64-bit) floating-point elements in `a` and `b`, store the minimum value in 
2389 /// the lower element of result, and copy the upper element from `a` to the upper element of result.
2390 __m128d _mm_min_sd (__m128d a, __m128d b) pure @safe
2391 {
2392     static if (GDC_with_SSE2)
2393     {
2394         return __builtin_ia32_minsd(a, b);
2395     }
2396     else
2397     {
2398         // Generates minsd starting with LDC 1.3
2399         __m128d r = a;
2400         r.array[0] = (a.array[0] < b.array[0]) ? a.array[0] : b.array[0];
2401         return r;
2402     }
2403 }
2404 unittest
2405 {
2406     __m128d A = _mm_setr_pd(1.0, 3.0);
2407     __m128d B = _mm_setr_pd(4.0, 2.0);
2408     __m128d M = _mm_min_sd(A, B);
2409     assert(M.array[0] == 1.0);
2410     assert(M.array[1] == 3.0);
2411 }
2412 
2413 /// Copy the lower 64-bit integer in `a` to the lower element of result, and zero the upper element.
2414 __m128i _mm_move_epi64 (__m128i a) pure @trusted
2415 {
2416     static if (GDC_with_SSE2)
2417     {
2418         // slightly better with GDC -O0
2419         return cast(__m128i) __builtin_ia32_movq128(cast(long2)a); 
2420     }
2421     else
2422     {
2423         long2 result = [ 0, 0 ];
2424         long2 la = cast(long2) a;
2425         result.ptr[0] = la.array[0];
2426         return cast(__m128i)(result);
2427     }
2428 }
2429 unittest
2430 {
2431     long2 A = [13, 47];
2432     long2 B = cast(long2) _mm_move_epi64( cast(__m128i)A );
2433     long[2] correct = [13, 0];
2434     assert(B.array == correct);
2435 }
2436 
2437 /// Move the lower double-precision (64-bit) floating-point element from `b` to the lower element of result, and copy 
2438 /// the upper element from `a` to the upper element of dst.
2439 __m128d _mm_move_sd (__m128d a, __m128d b) pure @trusted
2440 {
2441     static if (GDC_with_SSE2)
2442     {
2443         return __builtin_ia32_movsd(a, b); 
2444     }
2445     else
2446     {
2447         b.ptr[1] = a.array[1];
2448         return b;
2449     }
2450 }
2451 unittest
2452 {
2453     double2 A = [13.0, 47.0];
2454     double2 B = [34.0, 58.0];
2455     double2 C = _mm_move_sd(A, B);
2456     double[2] correct = [34.0, 47.0];
2457     assert(C.array == correct);
2458 }
2459 
/// Create mask from the most significant bit of each 8-bit element in `a`.
2461 int _mm_movemask_epi8 (__m128i a) pure @trusted
2462 {
2463     // PERF: Not possible in D_SIMD because of https://issues.dlang.org/show_bug.cgi?id=8047
2464     static if (GDC_with_SSE2)
2465     {
2466         return __builtin_ia32_pmovmskb128(cast(ubyte16)a);
2467     }
2468     else static if (LDC_with_SSE2)
2469     {
2470         return __builtin_ia32_pmovmskb128(cast(byte16)a);
2471     }
2472     else static if (LDC_with_ARM64)
2473     {
2474         // Solution from https://stackoverflow.com/questions/11870910/sse-mm-movemask-epi8-equivalent-method-for-arm-neon
        // The other two solutions there lead to intrinsics that LLVM doesn't know about, and that took a long time to diagnose.
        // So there might be something a bit faster, but this one is reasonable and branchless.
2477         byte8 mask_shift;
2478         mask_shift.ptr[0] = 7;
2479         mask_shift.ptr[1] = 6;
2480         mask_shift.ptr[2] = 5;
2481         mask_shift.ptr[3] = 4;
2482         mask_shift.ptr[4] = 3;
2483         mask_shift.ptr[5] = 2;
2484         mask_shift.ptr[6] = 1;
2485         mask_shift.ptr[7] = 0;
2486         byte8 mask_and = byte8(-128);
2487         byte8 lo = vget_low_u8(cast(byte16)a);
2488         byte8 hi = vget_high_u8(cast(byte16)a);
2489         lo = vand_u8(lo, mask_and);
2490         lo = vshr_u8(lo, mask_shift);
2491         hi = vand_u8(hi, mask_and);
2492         hi = vshr_u8(hi, mask_shift);
2493         lo = vpadd_u8(lo,lo);
2494         lo = vpadd_u8(lo,lo);
2495         lo = vpadd_u8(lo,lo);
2496         hi = vpadd_u8(hi,hi);
2497         hi = vpadd_u8(hi,hi);
2498         hi = vpadd_u8(hi,hi);
2499         return (cast(ubyte)(hi[0]) << 8) | cast(ubyte)(lo[0]);
2500     }
2501     else
2502     {
2503         byte16 ai = cast(byte16)a;
2504         int r = 0;
2505         foreach(bit; 0..16)
2506         {
2507             if (ai.array[bit] < 0) r += (1 << bit);
2508         }
2509         return r;
2510     }
2511 }
2512 unittest
2513 {
2514     assert(0x9C36 == _mm_movemask_epi8(_mm_set_epi8(-1, 1, 2, -3, -1, -1, 4, 8, 127, 0, -1, -1, 0, -1, -1, 0)));
2515 }
2516 
/// Create mask from the most significant bit of each 16-bit element in `a`. #BONUS
2518 int _mm_movemask_epi16 (__m128i a) pure @trusted
2519 {
2520     return _mm_movemask_epi8(_mm_packs_epi16(a, _mm_setzero_si128()));
2521 }
2522 unittest
2523 {
2524     assert(0x9C == _mm_movemask_epi16(_mm_set_epi16(-1, 1, 2, -3, -32768, -1, 32767, 8)));
2525 }
2526 
2527 /// Set each bit of mask result based on the most significant bit of the corresponding packed double-precision (64-bit) 
/// floating-point element in `v`.
2529 int _mm_movemask_pd(__m128d v) pure @safe
2530 {
2531     // PERF: Not possible in D_SIMD because of https://issues.dlang.org/show_bug.cgi?id=8047
2532     static if (GDC_with_SSE2)
2533     {
        return __builtin_ia32_movmskpd(v);
2537     }
2538     else static if (LDC_with_SSE2)
2539     {
        return __builtin_ia32_movmskpd(v);
2543     }
2544     else
2545     {
2546         long2 lv = cast(long2)v;
2547         int r = 0;
2548         if (lv.array[0] < 0) r += 1;
2549         if (lv.array[1] < 0) r += 2;
2550         return r;
2551     }
2552 }
2553 unittest
2554 {
2555     __m128d A = cast(__m128d) _mm_set_epi64x(-1, 0);
2556     assert(_mm_movemask_pd(A) == 2);
2557 }
2558 
2559 /// Copy the lower 64-bit integer in `v`.
2560 __m64 _mm_movepi64_pi64 (__m128i v) pure @safe
2561 {
2562     long2 lv = cast(long2)v;
2563     return long1(lv.array[0]);
2564 }
2565 unittest
2566 {
2567     __m128i A = _mm_set_epi64x(-1, -2);
2568     __m64 R = _mm_movepi64_pi64(A);
2569     assert(R.array[0] == -2);
2570 }
2571 
2572 /// Copy the 64-bit integer `a` to the lower element of dest, and zero the upper element.
2573 __m128i _mm_movpi64_epi64 (__m64 a) pure @trusted
2574 {
2575     long2 r;
2576     r.ptr[0] = a.array[0];
2577     r.ptr[1] = 0;
2578     return cast(__m128i)r;
2579 }
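unittest
{
    // The MMX value becomes the lower lane; the upper lane is zeroed.
    __m64 A = _mm_cvtsi64_m64(-42);
    long2 R = cast(long2) _mm_movpi64_epi64(A);
    long[2] correct = [-42, 0];
    assert(R.array == correct);
}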
2580 
2581 /// Multiply the low unsigned 32-bit integers from each packed 64-bit element in `a` and `b`, 
2582 /// and store the unsigned 64-bit results.
2583 __m128i _mm_mul_epu32 (__m128i a, __m128i b) pure @trusted
2584 {    
2585     // PERF DMD D_SIMD
2586     static if (GDC_with_SSE2)
2587     {
2588         return cast(__m128i) __builtin_ia32_pmuludq128 (a, b);
2589     }
2590     else
2591     {
2592         version(LDC)
2593         {
2594             static if (__VERSION__ >= 2088)
2595             {
2596                 // Need LLVM9 for proper optimization
2597                 long2 la, lb;
2598                 la.ptr[0] = cast(uint)a.array[0];
2599                 la.ptr[1] = cast(uint)a.array[2];
2600                 lb.ptr[0] = cast(uint)b.array[0];
2601                 lb.ptr[1] = cast(uint)b.array[2];
2602             }
2603             else
2604             {
2605                 __m128i zero;
2606                 zero = 0;
2607                 long2 la = cast(long2) shufflevectorLDC!(int4, 0, 4, 2, 6)(a, zero);
2608                 long2 lb = cast(long2) shufflevectorLDC!(int4, 0, 4, 2, 6)(b, zero);
2609             }
2610         }
2611         else
2612         {
2613             long2 la, lb;
2614             la.ptr[0] = cast(uint)a.array[0];
2615             la.ptr[1] = cast(uint)a.array[2];
2616             lb.ptr[0] = cast(uint)b.array[0];
2617             lb.ptr[1] = cast(uint)b.array[2];
2618         }
2619 
2620         version(DigitalMars)
2621         {
2622             // DMD has no long2 mul
2623             la.ptr[0] *= lb.array[0];
2624             la.ptr[1] *= lb.array[1];
2625             return cast(__m128i)(la);
2626         }
2627         else
2628         {
2629             static if (__VERSION__ >= 2076)
2630             {
2631                 return cast(__m128i)(la * lb);
2632             }
2633             else
2634             {
2635                 // long2 mul not supported before LDC 1.5
2636                 la.ptr[0] *= lb.array[0];
2637                 la.ptr[1] *= lb.array[1];
2638                 return cast(__m128i)(la);
2639             }
2640         }
2641     }
2642 }
2643 unittest
2644 {
2645     __m128i A = _mm_set_epi32(42, 0xDEADBEEF, 42, 0xffffffff);
2646     __m128i B = _mm_set_epi32(42, 0xCAFEBABE, 42, 0xffffffff);
2647     __m128i C = _mm_mul_epu32(A, B);
2648     long2 LC = cast(long2)C;
2649     assert(LC.array[0] == 18446744065119617025uL);
2650     assert(LC.array[1] == 12723420444339690338uL);
2651 }
2652 
2653 /// Multiply packed double-precision (64-bit) floating-point elements in `a` and `b`, and return the results. 
2654 __m128d _mm_mul_pd(__m128d a, __m128d b) pure @safe
2655 {
2656     pragma(inline, true);
2657     return a * b;
2658 }
2659 unittest
2660 {
2661     __m128d a = [-2.0, 1.5];
2662     a = _mm_mul_pd(a, a);
2663     assert(a.array == [4.0, 2.25]);
2664 }
2665 
2666 /// Multiply the lower double-precision (64-bit) floating-point element in `a` and `b`, store the result in the lower 
2667 /// element of result, and copy the upper element from `a` to the upper element of result.
2668 __m128d _mm_mul_sd(__m128d a, __m128d b) pure @trusted
2669 {
2670     version(DigitalMars)
2671     {    
2672         // Work-around for https://issues.dlang.org/show_bug.cgi?id=19599
2673         // Note that this is unneeded since DMD >= 2.094.0 at least, haven't investigated again
2674         asm pure nothrow @nogc @trusted { nop;}
2675         a.array[0] = a.array[0] * b.array[0];
2676         return a;
2677     }
2678     else static if (GDC_with_SSE2)
2679     {
2680         return __builtin_ia32_mulsd(a, b);
2681     }
2682     else
2683     {
2684         a.ptr[0] *= b.array[0];
2685         return a;
2686     }
2687 }
2688 unittest
2689 {
2690     __m128d a = [-2.0, 1.5];
2691     a = _mm_mul_sd(a, a);
2692     assert(a.array == [4.0, 1.5]);
2693 }
2694 
2695 /// Multiply the low unsigned 32-bit integers from `a` and `b`, 
2696 /// and get an unsigned 64-bit result.
2697 __m64 _mm_mul_su32 (__m64 a, __m64 b) pure @safe
2698 {
2699     return to_m64(_mm_mul_epu32(to_m128i(a), to_m128i(b)));
2700 }
2701 unittest
2702 {
2703     __m64 A = _mm_set_pi32(42, 0xDEADBEEF);
2704     __m64 B = _mm_set_pi32(42, 0xCAFEBABE);
2705     __m64 C = _mm_mul_su32(A, B);
2706     assert(C.array[0] == 0xDEADBEEFuL * 0xCAFEBABEuL);
2707 }
2708 
2709 /// Multiply the packed signed 16-bit integers in `a` and `b`, producing intermediate 32-bit integers, and return the 
2710 /// high 16 bits of the intermediate integers.
2711 __m128i _mm_mulhi_epi16 (__m128i a, __m128i b) pure @trusted
2712 {
2713     static if (GDC_with_SSE2)
2714     {
2715         return cast(__m128i) __builtin_ia32_pmulhw128(cast(short8)a, cast(short8)b);
2716     }
2717     else static if (LDC_with_SSE2)
2718     {
2719         return cast(__m128i) __builtin_ia32_pmulhw128(cast(short8)a, cast(short8)b);
2720     }
2721     else
2722     {
2723         // ARM64: LDC 1.5 -O2 or later gives a nice sequence with 2 x ext.16b, 2 x smull.4s and shrn.4h shrn2.8h
2724         //        PERF: it seems the simde solution has one less instruction in ARM64.
2725         // PERF: Catastrophic in ARM32.
2726         short8 sa = cast(short8)a;
2727         short8 sb = cast(short8)b;
2728         short8 r = void;
2729         r.ptr[0] = (sa.array[0] * sb.array[0]) >> 16;
2730         r.ptr[1] = (sa.array[1] * sb.array[1]) >> 16;
2731         r.ptr[2] = (sa.array[2] * sb.array[2]) >> 16;
2732         r.ptr[3] = (sa.array[3] * sb.array[3]) >> 16;
2733         r.ptr[4] = (sa.array[4] * sb.array[4]) >> 16;
2734         r.ptr[5] = (sa.array[5] * sb.array[5]) >> 16;
2735         r.ptr[6] = (sa.array[6] * sb.array[6]) >> 16;
2736         r.ptr[7] = (sa.array[7] * sb.array[7]) >> 16;
2737         return cast(__m128i)r;
2738     }
2739 }
2740 unittest
2741 {
2742     __m128i A = _mm_setr_epi16(0, -16, 2, 3, 4, 8, 16, 7);
2743     __m128i B = _mm_set1_epi16(16384);
2744     short8 R = cast(short8)_mm_mulhi_epi16(A, B);
2745     short[8] correct = [0, -4, 0, 0, 1, 2, 4, 1];
2746     assert(R.array == correct);
2747 }
2748 
2749 /// Multiply the packed unsigned 16-bit integers in `a` and `b`, producing intermediate 32-bit integers, and return the 
2750 /// high 16 bits of the intermediate integers.
2751 __m128i _mm_mulhi_epu16 (__m128i a, __m128i b) pure @trusted
2752 {
2753     static if (GDC_with_SSE2)
2754     {
2755         return cast(__m128i) __builtin_ia32_pmulhuw128(cast(short8)a, cast(short8)b);
2756     }
2757     else static if (LDC_with_SSE2)
2758     {
2759         return cast(__m128i) __builtin_ia32_pmulhuw128(cast(short8)a, cast(short8)b);
2760     }
2761     else
2762     {
2763         // ARM64: LDC 1.5 -O2 or later gives a nice sequence with 2 x ext.16b, 2 x umull.4s and shrn.4h shrn2.8h
2764         //      it seems the simde solution has one less instruction in ARM64
2765         // PERF: Catastrophic in ARM32.
2766         short8 sa = cast(short8)a;
2767         short8 sb = cast(short8)b;
2768         short8 r = void;
2769         r.ptr[0] = cast(short)( (cast(ushort)sa.array[0] * cast(ushort)sb.array[0]) >> 16 );
2770         r.ptr[1] = cast(short)( (cast(ushort)sa.array[1] * cast(ushort)sb.array[1]) >> 16 );
2771         r.ptr[2] = cast(short)( (cast(ushort)sa.array[2] * cast(ushort)sb.array[2]) >> 16 );
2772         r.ptr[3] = cast(short)( (cast(ushort)sa.array[3] * cast(ushort)sb.array[3]) >> 16 );
2773         r.ptr[4] = cast(short)( (cast(ushort)sa.array[4] * cast(ushort)sb.array[4]) >> 16 );
2774         r.ptr[5] = cast(short)( (cast(ushort)sa.array[5] * cast(ushort)sb.array[5]) >> 16 );
2775         r.ptr[6] = cast(short)( (cast(ushort)sa.array[6] * cast(ushort)sb.array[6]) >> 16 );
2776         r.ptr[7] = cast(short)( (cast(ushort)sa.array[7] * cast(ushort)sb.array[7]) >> 16 );
2777         return cast(__m128i)r;
2778     }
2779 }
2780 unittest
2781 {
2782     __m128i A = _mm_setr_epi16(0, -16, 2, 3, 4, 8, 16, 7);
2783     __m128i B = _mm_set1_epi16(16384);
2784     short8 R = cast(short8)_mm_mulhi_epu16(A, B);
2785     short[8] correct = [0, 0x3FFC, 0, 0, 1, 2, 4, 1];
2786     assert(R.array == correct);
2787 }
2788 
2789 /// Multiply the packed 16-bit integers in `a` and `b`, producing intermediate 32-bit integers, and return the low 16 
2790 /// bits of the intermediate integers.
2791 __m128i _mm_mullo_epi16 (__m128i a, __m128i b) pure @safe
2792 {
2793     return cast(__m128i)(cast(short8)a * cast(short8)b);
2794 }
2795 unittest
2796 {
2797     __m128i A = _mm_setr_epi16(16384, -16, 0,      3, 4, 1, 16, 7);
2798     __m128i B = _mm_set1_epi16(16384);
2799     short8 R = cast(short8)_mm_mullo_epi16(A, B);
2800     short[8] correct = [0, 0, 0, -16384, 0, 16384, 0, -16384];
2801     assert(R.array == correct);
2802 }
2803 
2804 /// Compute the bitwise NOT of 128 bits in `a`. #BONUS
2805 __m128i _mm_not_si128 (__m128i a) pure @safe
2806 {
2807     return ~a;
2808 }
2809 unittest
2810 {
2811     __m128i A = _mm_set1_epi32(-748);
2812     int4 notA = cast(int4) _mm_not_si128(A);
2813     int[4] correct = [747, 747, 747, 747];
2814     assert(notA.array == correct);
2815 }
2816 
2817 /// Compute the bitwise OR of packed double-precision (64-bit) floating-point elements in `a` and `b`.
2818 __m128d _mm_or_pd (__m128d a, __m128d b) pure @safe
2819 {
2820     pragma(inline, true);
2821     return cast(__m128d)( cast(__m128i)a | cast(__m128i)b );
2822 }
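unittest
{
    // A bitwise OR of the underlying bit patterns, checked through an integer view.
    __m128d A = cast(__m128d) _mm_set_epi64x(3, 1);
    __m128d B = cast(__m128d) _mm_set_epi64x(5, 2);
    long2 R = cast(long2) _mm_or_pd(A, B);
    long[2] correct = [1 | 2, 3 | 5];
    assert(R.array == correct);
}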
2823 
2824 /// Compute the bitwise OR of 128 bits (representing integer data) in `a` and `b`.
2825 __m128i _mm_or_si128 (__m128i a, __m128i b) pure @safe
2826 {
2827     pragma(inline, true);
2828     return a | b;
2829 }
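unittest
{
    __m128i A = _mm_setr_epi32(1, 2, 4, 8);
    __m128i B = _mm_setr_epi32(2, 2, 1, 8);
    int4 R = cast(int4) _mm_or_si128(A, B);
    int[4] correct = [3, 2, 5, 8];
    assert(R.array == correct);
}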
2830 
2831 /// Convert packed signed 32-bit integers from `a` and `b` to packed 16-bit integers using signed saturation.
2832 __m128i _mm_packs_epi32 (__m128i a, __m128i b) pure @trusted
2833 {
2834     static if (GDC_with_SSE2)
2835     {
2836         return cast(__m128i) __builtin_ia32_packssdw128(a, b);
2837     }    
2838     else static if (LDC_with_SSE2)
2839     {
2840         return cast(__m128i) __builtin_ia32_packssdw128(a, b);
2841     }
2842     else static if (LDC_with_ARM64)
2843     {
2844         short4 ra = vqmovn_s32(cast(int4)a);
2845         short4 rb = vqmovn_s32(cast(int4)b);
2846         return cast(__m128i)vcombine_s16(ra, rb);
2847     }
2848     else
2849     {
2850         // PERF: catastrophic on ARM32
2851         short8 r;
2852         r.ptr[0] = saturateSignedIntToSignedShort(a.array[0]);
2853         r.ptr[1] = saturateSignedIntToSignedShort(a.array[1]);
2854         r.ptr[2] = saturateSignedIntToSignedShort(a.array[2]);
2855         r.ptr[3] = saturateSignedIntToSignedShort(a.array[3]);
2856         r.ptr[4] = saturateSignedIntToSignedShort(b.array[0]);
2857         r.ptr[5] = saturateSignedIntToSignedShort(b.array[1]);
2858         r.ptr[6] = saturateSignedIntToSignedShort(b.array[2]);
2859         r.ptr[7] = saturateSignedIntToSignedShort(b.array[3]);
2860         return cast(__m128i)r;
2861     }
2862 }
2863 unittest
2864 {
2865     __m128i A = _mm_setr_epi32(100000, -100000, 1000, 0);
2866     short8 R = cast(short8) _mm_packs_epi32(A, A);
2867     short[8] correct = [32767, -32768, 1000, 0, 32767, -32768, 1000, 0];
2868     assert(R.array == correct);
2869 }
2870 
2871 /// Convert packed signed 16-bit integers from `a` and `b` to packed 8-bit integers using signed saturation.
2872 __m128i _mm_packs_epi16 (__m128i a, __m128i b) pure @trusted
2873 {
2874     static if (GDC_with_SSE2)
2875     {
2876         return cast(__m128i) __builtin_ia32_packsswb128(cast(short8)a, cast(short8)b);
2877     }
2878     else static if (LDC_with_SSE2)
2879     {
2880         return cast(__m128i) __builtin_ia32_packsswb128(cast(short8)a, cast(short8)b);
2881     }
2882     else static if (LDC_with_ARM64)
2883     {
        // Generates a nice pair of sqxtn.8b + sqxtn2 since LDC 1.5 -O2
2885         byte8 ra = vqmovn_s16(cast(short8)a);
2886         byte8 rb = vqmovn_s16(cast(short8)b);
2887         return cast(__m128i)vcombine_s8(ra, rb);
2888     }
2889     else
2890     {
2891         // PERF: ARM32 is missing
2892         byte16 r;
2893         short8 sa = cast(short8)a;
2894         short8 sb = cast(short8)b;
2895         foreach(i; 0..8)
2896             r.ptr[i] = saturateSignedWordToSignedByte(sa.array[i]);
2897         foreach(i; 0..8)
2898             r.ptr[i+8] = saturateSignedWordToSignedByte(sb.array[i]);
2899         return cast(__m128i)r;
2900     }
2901 }
2902 unittest
2903 {
2904     __m128i A = _mm_setr_epi16(1000, -1000, 1000, 0, 256, -129, 254, 0);
2905     byte16 R = cast(byte16) _mm_packs_epi16(A, A);
2906     byte[16] correct = [127, -128, 127, 0, 127, -128, 127, 0,
2907                         127, -128, 127, 0, 127, -128, 127, 0];
2908     assert(R.array == correct);
2909 }
2910 
2911 /// Convert packed signed 16-bit integers from `a` and `b` to packed 8-bit integers using unsigned saturation.
2912 __m128i _mm_packus_epi16 (__m128i a, __m128i b) pure @trusted
2913 {
2914     // PERF DMD catastrophic
2915     static if (GDC_with_SSE2)
2916     {
2917         return cast(__m128i) __builtin_ia32_packuswb128(cast(short8)a, cast(short8)b);
2918     }
2919     else static if (LDC_with_SSE2)
2920     {
2921         return cast(__m128i) __builtin_ia32_packuswb128(cast(short8)a, cast(short8)b);
2922     }
2923     else static if (LDC_with_ARM64)
2924     {
        // Generates a nice pair of sqxtun + sqxtun2 since LDC 1.5 -O2
2926         byte8 ra = vqmovun_s16(cast(short8)a);
2927         byte8 rb = vqmovun_s16(cast(short8)b);
2928         return cast(__m128i)vcombine_s8(ra, rb);
2929     }
2930     else
2931     {
2932         short8 sa = cast(short8)a;
2933         short8 sb = cast(short8)b;
2934         align(16) ubyte[16] result = void;
2935         for (int i = 0; i < 8; ++i)
2936         {
2937             short s = sa[i];
2938             if (s < 0) s = 0;
2939             if (s > 255) s = 255;
2940             result[i] = cast(ubyte)s;
2941 
2942             s = sb[i];
2943             if (s < 0) s = 0;
2944             if (s > 255) s = 255;
2945             result[i+8] = cast(ubyte)s;
2946         }
2947         return *cast(__m128i*)(result.ptr);
2948     }
2949 }
2950 unittest
2951 {
2952     __m128i A = _mm_setr_epi16(-10, 400, 0, 256, 255, 2, 1, 0);
2953     byte16 AA = cast(byte16) _mm_packus_epi16(A, A);
2954     static immutable ubyte[16] correctResult = [0, 255, 0, 255, 255, 2, 1, 0,
2955                                                 0, 255, 0, 255, 255, 2, 1, 0];
2956     foreach(i; 0..16)
2957         assert(AA.array[i] == cast(byte)(correctResult[i]));
2958 }
2959 
2960 /// Provide a hint to the processor that the code sequence is a spin-wait loop. This can help improve the performance 
2961 /// and power consumption of spin-wait loops.
2962 void _mm_pause() @trusted
2963 {
2964     version(GNU)
2965     {
2966         static if (GDC_with_SSE2)
2967         {
2968             __builtin_ia32_pause();
2969         }
2970         else version(X86)
2971         {
2972             asm pure nothrow @nogc @trusted
2973             {
2974                 "pause;\n" : : : ;
2975             }
2976         }
2977         else
2978             static assert(false);
2979     }
2980     else static if (LDC_with_SSE2)
2981     {
2982         __builtin_ia32_pause();
2983     }
2984     else static if (DMD_with_asm)
2985     {
2986         asm nothrow @nogc pure @safe
2987         {
2988             rep; nop; // F3 90 =  pause
2989         }
2990     }
2991     else version (LDC)
2992     {
        // PERF: Do nothing currently, could be the "yield" instruction on ARM.
2994     }
2995     else
2996         static assert(false);
2997 }
2998 unittest
2999 {
3000     _mm_pause();
3001 }
3002 
3003 /// Compute the absolute differences of packed unsigned 8-bit integers in `a` and `b`, then horizontally sum each 
3004 /// consecutive 8 differences to produce two unsigned 16-bit integers, and pack these unsigned 16-bit integers in the 
3005 /// low 16 bits of 64-bit elements in result.
3006 __m128i _mm_sad_epu8 (__m128i a, __m128i b) pure @trusted
3007 {
3008     static if (GDC_with_SSE2)
3009     {
3010         return cast(__m128i) __builtin_ia32_psadbw128(cast(ubyte16)a, cast(ubyte16)b);
3011     }
3012     else static if (LDC_with_SSE2)
3013     {
3014         return cast(__m128i) __builtin_ia32_psadbw128(cast(byte16)a, cast(byte16)b);
3015     }
3016     else static if (LDC_with_ARM64)
3017     {
3018         ushort8 t = cast(ushort8) vpaddlq_u8(vabdq_u8(cast(byte16) a, cast(byte16) b));
3019 
3020         // PERF: Looks suboptimal vs addp
3021         ushort r0 = cast(ushort)(t[0] + t[1] + t[2] + t[3]);
3022         ushort r4 = cast(ushort)(t[4] + t[5] + t[6] + t[7]);
3023         ushort8 r = 0;
3024         r[0] = r0;
3025         r[4] = r4;
3026         return cast(__m128i) r;
3027     }
3028     else
3029     {
3030         // PERF: ARM32 is lacking
3031         byte16 ab = cast(byte16)a;
3032         byte16 bb = cast(byte16)b;
3033         ubyte[16] t;
3034         foreach(i; 0..16)
3035         {
3036             int diff = cast(ubyte)(ab.array[i]) - cast(ubyte)(bb.array[i]);
3037             if (diff < 0) diff = -diff;
3038             t[i] = cast(ubyte)(diff);
3039         }
3040         int4 r = _mm_setzero_si128();
3041         r.ptr[0] = t[0] + t[1] + t[2] + t[3] + t[4] + t[5] + t[6] + t[7];
3042         r.ptr[2] = t[8] + t[9] + t[10]+ t[11]+ t[12]+ t[13]+ t[14]+ t[15];
3043         return r;
3044     }
3045 }
3046 unittest
3047 {
3048     __m128i A = _mm_setr_epi8(3, 4, 6, 8, 12, 14, 18, 20, 24, 30, 32, 38, 42, 44, 48, 54); // primes + 1
3049     __m128i B = _mm_set1_epi8(1);
3050     __m128i R = _mm_sad_epu8(A, B);
3051     int[4] correct = [2 + 3 + 5 + 7 + 11 + 13 + 17 + 19,
3052                       0,
3053                       23 + 29 + 31 + 37 + 41 + 43 + 47 + 53,
3054                       0];
3055     assert(R.array == correct);
3056 }
3057 
3058 /// Set packed 16-bit integers with the supplied values.
3059 __m128i _mm_set_epi16 (short e7, short e6, short e5, short e4, short e3, short e2, short e1, short e0) pure @trusted
3060 {
3061     short8 r = void;
3062     r.ptr[0] = e0;
3063     r.ptr[1] = e1;
3064     r.ptr[2] = e2;
3065     r.ptr[3] = e3;
3066     r.ptr[4] = e4;
3067     r.ptr[5] = e5;
3068     r.ptr[6] = e6;
3069     r.ptr[7] = e7;
3070     return cast(__m128i) r;
3071 }
3072 unittest
3073 {
3074     __m128i A = _mm_set_epi16(7, 6, 5, 4, 3, 2, 1, 0);
3075     short8 B = cast(short8) A;
3076     foreach(i; 0..8)
3077         assert(B.array[i] == i);
3078 }
3079 
3080 /// Set packed 32-bit integers with the supplied values.
3081 __m128i _mm_set_epi32 (int e3, int e2, int e1, int e0) pure @trusted
3082 {
3083     // PERF: does a constant inline correctly? vs int4 field assignment
3084     align(16) int[4] r = [e0, e1, e2, e3];
3085     return *cast(int4*)&r;
3086 }
3087 unittest
3088 {
3089     __m128i A = _mm_set_epi32(3, 2, 1, 0);
3090     foreach(i; 0..4)
3091         assert(A.array[i] == i);
3092 }
3093 
3094 /// Set packed 64-bit integers with the supplied values.
3095 __m128i _mm_set_epi64(__m64 e1, __m64 e0) pure @trusted
3096 {
3097     pragma(inline, true);
3098     long2 r = void;
3099     r.ptr[0] = e0.array[0];
3100     r.ptr[1] = e1.array[0];
3101     return cast(__m128i)(r);
3102 }
3103 unittest
3104 {
3105     __m128i A = _mm_set_epi64(_mm_cvtsi64_m64(1234), _mm_cvtsi64_m64(5678));
3106     long2 B = cast(long2) A;
3107     assert(B.array[0] == 5678);
3108     assert(B.array[1] == 1234);
3109 }
3110 
3111 /// Set packed 64-bit integers with the supplied values.
3112 __m128i _mm_set_epi64x (long e1, long e0) pure @trusted
3113 {
3114     pragma(inline, true);
3115     long2 r = void;
3116     r.ptr[0] = e0;
3117     r.ptr[1] = e1;
3118     return cast(__m128i)(r);
3119 }
3120 unittest
3121 {
3122     __m128i A = _mm_set_epi64x(1234, -5678);
3123     long2 B = cast(long2) A;
3124     assert(B.array[0] == -5678);
3125     assert(B.array[1] == 1234);
3126 }
3127 
3128 /// Set packed 8-bit integers with the supplied values.
3129 __m128i _mm_set_epi8 (byte e15, byte e14, byte e13, byte e12,
3130                       byte e11, byte e10, byte e9, byte e8,
3131                       byte e7, byte e6, byte e5, byte e4,
3132                       byte e3, byte e2, byte e1, byte e0) pure @trusted
3133 {
3134     align(16) byte[16] result = [e0, e1,  e2,  e3,  e4,  e5,  e6, e7,
3135                                  e8, e9, e10, e11, e12, e13, e14, e15];
3136     return *cast(__m128i*)(result.ptr);
3137 }
3138 unittest
3139 {
3140     byte16 R = cast(byte16) _mm_set_epi8(-1, 0, 56, 127, -128, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14);
3141     byte[16] correct = [14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, -128, 127, 56, 0, -1];
3142     assert(R.array == correct);
3143 }
3144 
3145 /// Set packed double-precision (64-bit) floating-point elements with the supplied values.
3146 __m128d _mm_set_pd (double e1, double e0) pure @trusted
3147 {
3148     pragma(inline, true);
3149     double2 r = void;
3150     r.ptr[0] = e0;
3151     r.ptr[1] = e1;
3152     return r;
3153 }
3154 unittest
3155 {
3156     __m128d A = _mm_set_pd(61.0, 55.0);
3157     double[2] correct = [55.0, 61.0];
3158     assert(A.array == correct);
3159 }
3160 
/// Broadcast double-precision (64-bit) floating-point value `a` to all elements.
3162 __m128d _mm_set_pd1 (double a) pure @trusted
3163 {
3164     pragma(inline, true);
3165     __m128d r = void;
3166     r.ptr[0] = a;
3167     r.ptr[1] = a;
3168     return r;
3169 }
3170 unittest
3171 {
3172     __m128d A = _mm_set_pd1(61.0);
3173     double[2] correct = [61.0, 61.0];
3174     assert(A.array == correct);
3175 }
3176 
3177 /// Copy double-precision (64-bit) floating-point element `a` to the lower element of result, 
3178 /// and zero the upper element.
3179 __m128d _mm_set_sd (double a) pure @trusted
3180 {
3181     double2 r = void;
3182     r.ptr[0] = a;
3183     r.ptr[1] = 0.0;
3184     return r;
3185 }
3186 unittest
3187 {
3188     __m128d A = _mm_set_sd(61.0);
3189     double[2] correct = [61.0, 0.0];
3190     assert(A.array == correct);
3191 }
3192 
/// Broadcast 16-bit integer `a` to all elements.
3194 __m128i _mm_set1_epi16 (short a) pure @trusted
3195 {
3196     version(DigitalMars) // workaround https://issues.dlang.org/show_bug.cgi?id=21469 
3197     {
3198         short8 v = a;
3199         return cast(__m128i) v;
3200     }
3201     else
3202     {
3203         pragma(inline, true);
3204         return cast(__m128i)(short8(a));
3205     }
3206 }
3207 unittest
3208 {
3209     short8 a = cast(short8) _mm_set1_epi16(31);
3210     for (int i = 0; i < 8; ++i)
3211         assert(a.array[i] == 31);
3212 }
3213 
3214 /// Broadcast 32-bit integer `a` to all elements.
3215 __m128i _mm_set1_epi32 (int a) pure @trusted
3216 {
3217     pragma(inline, true);
3218     return cast(__m128i)(int4(a));
3219 }
3220 unittest
3221 {
3222     int4 a = cast(int4) _mm_set1_epi32(31);
3223     for (int i = 0; i < 4; ++i)
3224         assert(a.array[i] == 31);
3225 }
3226 
3227 /// Broadcast 64-bit integer `a` to all elements.
3228 __m128i _mm_set1_epi64 (__m64 a) pure @safe
3229 {
3230     return _mm_set_epi64(a, a);
3231 }
3232 unittest
3233 {
3234     long b = 0x1DEADCAFE; 
3235     __m64 a;
3236     a.ptr[0] = b;
3237     long2 c = cast(long2) _mm_set1_epi64(a);
3238     assert(c.array[0] == b);
3239     assert(c.array[1] == b);
3240 }
3241 
/// Broadcast 64-bit integer `a` to all elements.
3243 __m128i _mm_set1_epi64x (long a) pure @trusted
3244 {
3245     long2 b = a; // Must be on its own line to workaround https://issues.dlang.org/show_bug.cgi?id=21470
3246     return cast(__m128i)(b);
3247 }
3248 unittest
3249 {
3250     long b = 0x1DEADCAFE;
3251     long2 c = cast(long2) _mm_set1_epi64x(b);
3252     for (int i = 0; i < 2; ++i)
3253         assert(c.array[i] == b);
3254 }
3255 
3256 /// Broadcast 8-bit integer `a` to all elements.
3257 __m128i _mm_set1_epi8 (byte a) pure @trusted
3258 {
3259     pragma(inline, true);
3260     byte16 b = a; // Must be on its own line to workaround https://issues.dlang.org/show_bug.cgi?id=21470
3261     return cast(__m128i)(b);
3262 }
3263 unittest
3264 {
3265     byte16 b = cast(byte16) _mm_set1_epi8(31);
3266     for (int i = 0; i < 16; ++i)
3267         assert(b.array[i] == 31);
3268 }
3269 
3270 alias _mm_set1_pd = _mm_set_pd1;
3271 
3272 /// Set packed 16-bit integers with the supplied values in reverse order.
3273 __m128i _mm_setr_epi16 (short e7, short e6, short e5, short e4, 
3274                         short e3, short e2, short e1, short e0) pure @trusted
3275 {
3276     short8 r = void;
3277     r.ptr[0] = e7;
3278     r.ptr[1] = e6;
3279     r.ptr[2] = e5;
3280     r.ptr[3] = e4;
3281     r.ptr[4] = e3;
3282     r.ptr[5] = e2;
3283     r.ptr[6] = e1;
3284     r.ptr[7] = e0;
3285     return cast(__m128i)(r);
3286 }
3287 unittest
3288 {
3289     short8 A = cast(short8) _mm_setr_epi16(7, 6, 5, -32768, 32767, 2, 1, 0);
3290     short[8] correct = [7, 6, 5, -32768, 32767, 2, 1, 0];
3291     assert(A.array == correct);
3292 }
3293 
3294 /// Set packed 32-bit integers with the supplied values in reverse order.
3295 __m128i _mm_setr_epi32 (int e3, int e2, int e1, int e0) pure @trusted
3296 {
3297     // Performs better than = void; with GDC
3298     pragma(inline, true);
3299     align(16) int[4] result = [e3, e2, e1, e0];
3300     return *cast(__m128i*)(result.ptr);
3301 }
3302 unittest
3303 {
3304     int4 A = cast(int4) _mm_setr_epi32(-1, 0, -2147483648, 2147483647);
3305     int[4] correct = [-1, 0, -2147483648, 2147483647];
3306     assert(A.array == correct);
3307 }
3308 
3309 /// Set packed 64-bit integers with the supplied values in reverse order.
3310 __m128i _mm_setr_epi64 (long e1, long e0) pure @trusted
3311 {
3312     long2 r = void;
3313     r.ptr[0] = e1;
3314     r.ptr[1] = e0;
3315     return cast(__m128i)(r);
3316 }
3317 unittest
3318 {
3319     long2 A = cast(long2) _mm_setr_epi64(-1, 0);
3320     long[2] correct = [-1, 0];
3321     assert(A.array == correct);
3322 }
3323 
3324 /// Set packed 8-bit integers with the supplied values in reverse order.
3325 __m128i _mm_setr_epi8 (byte e15, byte e14, byte e13, byte e12,
3326                        byte e11, byte e10, byte e9,  byte e8,
3327                        byte e7,  byte e6,  byte e5,  byte e4,
3328                        byte e3,  byte e2,  byte e1,  byte e0) pure @trusted
3329 {
3330     align(16) byte[16] result = [e15, e14, e13, e12, e11, e10, e9, e8,
3331                                  e7,  e6,  e5,  e4,  e3,  e2, e1, e0];
3332     return *cast(__m128i*)(result.ptr);
3333 }
3334 unittest
3335 {
3336     byte16 R = cast(byte16) _mm_setr_epi8(-1, 0, 56, 127, -128, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14);
3337     byte[16] correct = [-1, 0, 56, 127, -128, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14];
3338     assert(R.array == correct);
3339 }
3340 
3341 /// Set packed double-precision (64-bit) floating-point elements with the supplied values in reverse order.
3342 __m128d _mm_setr_pd (double e1, double e0) pure @trusted
3343 {
3344     pragma(inline, true);
3345     double2 result;
3346     result.ptr[0] = e1;
3347     result.ptr[1] = e0;
3348     return result;
3349 }
3350 unittest
3351 {
3352     __m128d A = _mm_setr_pd(61.0, 55.0);
3353     double[2] correct = [61.0, 55.0];
3354     assert(A.array == correct);
3355 }
3356 
3357 /// Return vector of type `__m128d` with all elements set to zero.
3358 __m128d _mm_setzero_pd() pure @trusted
3359 {
3360     pragma(inline, true);
3361     double2 r = void;
3362     r.ptr[0] = 0.0;
3363     r.ptr[1] = 0.0;
3364     return r;
3365 }
3366 unittest
3367 {
3368     __m128d A = _mm_setzero_pd();
3369     double[2] correct = [0.0, 0.0];
3370     assert(A.array == correct);
3371 }
3372 
3373 /// Return vector of type `__m128i` with all elements set to zero.
3374 __m128i _mm_setzero_si128() pure @trusted
3375 {
3376     pragma(inline, true);
3377     int4 r = void;
3378     r.ptr[0] = 0;
3379     r.ptr[1] = 0;
3380     r.ptr[2] = 0;
3381     r.ptr[3] = 0;
3382     return r;
3383 }
3384 unittest
3385 {
3386     __m128i A = _mm_setzero_si128();
3387     int[4] correct = [0, 0, 0, 0];
3388     assert(A.array == correct);
3389 }
3390 
3391 /// Shuffle 32-bit integers in `a` using the control in `imm8`.
3392 /// See_also: `_MM_SHUFFLE`.
3393 __m128i _mm_shuffle_epi32(int imm8)(__m128i a) pure @trusted
3394 {
3395     // PERF DMD D_SIMD
3396     static if (GDC_with_SSE2)
3397     {
3398         return __builtin_ia32_pshufd(a, imm8);
3399     }
3400     else version(LDC)
3401     {
3402         return shufflevectorLDC!(int4, (imm8 >> 0) & 3,
3403                                  (imm8 >> 2) & 3,
3404                                  (imm8 >> 4) & 3,
3405                                  (imm8 >> 6) & 3)(a, a);
3406     }
3407     else
3408     {
3409         int4 r = void;
3410         r.ptr[0] = a.ptr[(imm8 >> 0) & 3];
3411         r.ptr[1] = a.ptr[(imm8 >> 2) & 3];
3412         r.ptr[2] = a.ptr[(imm8 >> 4) & 3];
3413         r.ptr[3] = a.ptr[(imm8 >> 6) & 3];
3414         return r;
3415     }
3416 }
3417 unittest
3418 {
3419     __m128i A = _mm_setr_epi32(0, 1, 2, 3);
3420     enum int SHUFFLE = _MM_SHUFFLE(0, 1, 2, 3);
3421     int4 B = cast(int4) _mm_shuffle_epi32!SHUFFLE(A);
3422     int[4] expectedB = [ 3, 2, 1, 0 ];
3423     assert(B.array == expectedB);
3424 }
3425 
3426 /// Shuffle double-precision (64-bit) floating-point elements using the control in `imm8`.
3427 /// See_also: `_MM_SHUFFLE2`.
3428 __m128d _mm_shuffle_pd (int imm8)(__m128d a, __m128d b) pure @trusted
3429 {
3430     // PERF DMD D_SIMD
3431     static if (GDC_with_SSE2)
3432     {
3433         return __builtin_ia32_shufpd(a, b, imm8);
3434     }
3435     else version(LDC)
3436     {
3437         return shufflevectorLDC!(double2, 0 + ( imm8 & 1 ),
3438                                  2 + ( (imm8 >> 1) & 1 ))(a, b);
3439     }
3440     else
3441     {
3442         double2 r = void;
3443         r.ptr[0] = a.array[imm8 & 1];
3444         r.ptr[1] = b.array[(imm8 >> 1) & 1];
3445         return r;
3446     }
3447 }
3448 unittest
3449 {
3450     __m128d A = _mm_setr_pd(0.5, 2.0);
3451     __m128d B = _mm_setr_pd(4.0, 5.0);
3452     enum int SHUFFLE = _MM_SHUFFLE2(1, 1);
3453     __m128d R = _mm_shuffle_pd!SHUFFLE(A, B);
3454     double[2] correct = [ 2.0, 5.0 ];
3455     assert(R.array == correct);
3456 }
3457 
/// Shuffle 16-bit integers in the high 64 bits of `a` using the control in `imm8`. Store the results in the high 
/// 64 bits of result, with the low 64 bits being copied from `a` to result.
/// See_also: `_MM_SHUFFLE`.
3461 __m128i _mm_shufflehi_epi16(int imm8)(__m128i a) pure @trusted
3462 {
3463     // PERF DMD D_SIMD
3464     static if (GDC_with_SSE2)
3465     {
3466         return cast(__m128i) __builtin_ia32_pshufhw(cast(short8)a, imm8);
3467     }
3468     else version(LDC)
3469     {
3470         return cast(__m128i) shufflevectorLDC!(short8, 0, 1, 2, 3,
3471                                           4 + ( (imm8 >> 0) & 3 ),
3472                                           4 + ( (imm8 >> 2) & 3 ),
3473                                           4 + ( (imm8 >> 4) & 3 ),
3474                                           4 + ( (imm8 >> 6) & 3 ))(cast(short8)a, cast(short8)a);
3475     }
3476     else
3477     {
3478         short8 r = cast(short8)a;
3479         short8 sa = cast(short8)a;
3480         r.ptr[4] = sa.array[4 + ( (imm8 >> 0) & 3 ) ];
3481         r.ptr[5] = sa.array[4 + ( (imm8 >> 2) & 3 ) ];
3482         r.ptr[6] = sa.array[4 + ( (imm8 >> 4) & 3 ) ];
3483         r.ptr[7] = sa.array[4 + ( (imm8 >> 6) & 3 ) ];
3484         return cast(__m128i) r;
3485     }
3486 }
3487 unittest
3488 {
3489     __m128i A = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
3490     enum int SHUFFLE = _MM_SHUFFLE(0, 1, 2, 3);
3491     short8 C = cast(short8) _mm_shufflehi_epi16!SHUFFLE(A);
3492     short[8] expectedC = [ 0, 1, 2, 3, 7, 6, 5, 4 ];
3493     assert(C.array == expectedC);
3494 }
3495 
/// Shuffle 16-bit integers in the low 64 bits of `a` using the control in `imm8`. Store the results in the low 64 
/// bits of result, with the high 64 bits being copied from `a` to result.
3498 /// See_also: `_MM_SHUFFLE`.
3499 __m128i _mm_shufflelo_epi16(int imm8)(__m128i a) pure @trusted
3500 {
3501     // PERF DMD D_SIMD
3502     static if (GDC_with_SSE2)
3503     {
3504         return cast(__m128i) __builtin_ia32_pshuflw(cast(short8)a, imm8);
3505     }
3506     else version(LDC)
3507     {
3508         return cast(__m128i) shufflevectorLDC!(short8, ( (imm8 >> 0) & 3 ),
3509                                                        ( (imm8 >> 2) & 3 ),
3510                                                        ( (imm8 >> 4) & 3 ),
3511                                                        ( (imm8 >> 6) & 3 ), 4, 5, 6, 7)(cast(short8)a, cast(short8)a);
3512     }
3513     else
3514     {
3515         short8 r = cast(short8)a;
3516         short8 sa = cast(short8)a;
3517         r.ptr[0] = sa.array[(imm8 >> 0) & 3];
3518         r.ptr[1] = sa.array[(imm8 >> 2) & 3];
3519         r.ptr[2] = sa.array[(imm8 >> 4) & 3];
3520         r.ptr[3] = sa.array[(imm8 >> 6) & 3];
3521         return cast(__m128i) r;
3522     }
3523 }
3524 unittest
3525 {
3526     __m128i A = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
3527     enum int SHUFFLE = _MM_SHUFFLE(0, 1, 2, 3);
3528     short8 B = cast(short8) _mm_shufflelo_epi16!SHUFFLE(A);
3529     short[8] expectedB = [ 3, 2, 1, 0, 4, 5, 6, 7 ];
3530     assert(B.array == expectedB);
3531 }
3532 
3533 /// Shift packed 32-bit integers in `a` left by `count` while shifting in zeros.
3534 deprecated("Use _mm_slli_epi32 instead.") __m128i _mm_sll_epi32 (__m128i a, __m128i count) pure @trusted
3535 {
3536     static if (LDC_with_SSE2)
3537     {
3538         return __builtin_ia32_pslld128(a, count);
3539     }
3540     else static if (GDC_with_SSE2)
3541     {
3542         return __builtin_ia32_pslld128(a, count);
3543     }
3544     else static if (DMD_with_32bit_asm)
3545     {
3546         asm pure nothrow @nogc @trusted
3547         {
3548             movdqu XMM0, a;
3549             movdqu XMM1, count;
3550             pslld XMM0, XMM1;
3551             movdqu a, XMM0;
3552         }
3553         return a;
3554     }
3555     else
3556     {
3557         int4 r = void;
3558         long2 lc = cast(long2)count;
3559         int bits = cast(int)(lc.array[0]);
3560         foreach(i; 0..4)
3561             r[i] = cast(uint)(a[i]) << bits;
3562         return r;
3563     }
3564 }
3565 
3566 /// Shift packed 64-bit integers in `a` left by `count` while shifting in zeros.
3567 deprecated("Use _mm_slli_epi64 instead.") __m128i _mm_sll_epi64 (__m128i a, __m128i count) pure @trusted
3568 {
3569     static if (LDC_with_SSE2)
3570     {
3571         return cast(__m128i) __builtin_ia32_psllq128(cast(long2)a, cast(long2)count);
3572     }
3573     else static if (GDC_with_SSE2)
3574     {
3575         return cast(__m128i) __builtin_ia32_psllq128(cast(long2)a, cast(long2)count);
3576     }
3577     else static if (DMD_with_32bit_asm)
3578     {
3579         asm pure nothrow @nogc @trusted
3580         {
3581             movdqu XMM0, a;
3582             movdqu XMM1, count;
3583             psllq XMM0, XMM1;
3584             movdqu a, XMM0;
3585         }
3586         return a;
3587     }
3588     else
3589     {
3590         // ARM: good since LDC 1.12 -O2
        // but -O0 version is catastrophic
3592         long2 r = void;
3593         long2 sa = cast(long2)a;
3594         long2 lc = cast(long2)count;
3595         int bits = cast(int)(lc.array[0]);
3596         foreach(i; 0..2)
3597             r.array[i] = cast(ulong)(sa.array[i]) << bits;
3598         return cast(__m128i)r;
3599     }
3600 }
3601 
3602 /// Shift packed 16-bit integers in `a` left by `count` while shifting in zeros.
3603 deprecated("Use _mm_slli_epi16 instead.") __m128i _mm_sll_epi16 (__m128i a, __m128i count) pure @trusted
3604 {
    static if (LDC_with_SSE2)
    {
        return cast(__m128i) __builtin_ia32_psllw128(cast(short8)a, cast(short8)count);
    }
    else static if (GDC_with_SSE2)
    {
        return cast(__m128i) __builtin_ia32_psllw128(cast(short8)a, cast(short8)count);
    }
3613     else static if (DMD_with_32bit_asm)
3614     {
        asm pure nothrow @nogc @trusted
3616         {
3617             movdqu XMM0, a;
3618             movdqu XMM1, count;
3619             psllw XMM0, XMM1;
3620             movdqu a, XMM0;
3621         }
3622         return a;
3623     }
3624     else
3625     {
3626         short8 sa = cast(short8)a;
3627         long2 lc = cast(long2)count;
3628         int bits = cast(int)(lc.array[0]);
3629         short8 r = void;
3630         foreach(i; 0..8)
3631             r.ptr[i] = cast(short)(cast(ushort)(sa.array[i]) << bits);
3632         return cast(int4)r;
3633     }
3634 }
3635 
3636 
3637 /// Shift packed 32-bit integers in `a` left by `imm8` while shifting in zeros.
3638 __m128i _mm_slli_epi32 (__m128i a, int imm8) pure @trusted
3639 {
3640     static if (GDC_with_SSE2)
3641     {
3642         return __builtin_ia32_pslldi128(a, cast(ubyte)imm8);
3643     }
3644     else static if (LDC_with_SSE2)
3645     {
3646         return __builtin_ia32_pslldi128(a, cast(ubyte)imm8);
3647     }
3648     else
3649     {
3650         // Note: the intrinsics guarantee imm8[0..7] is taken, however
3651         //       D says "It's illegal to shift by the same or more bits 
3652         //       than the size of the quantity being shifted"
3653         //       and it's UB instead.
3654         int4 r = _mm_setzero_si128();
3655 
3656         ubyte count = cast(ubyte) imm8;
3657         if (count > 31)
3658             return r;
3659         
3660         foreach(i; 0..4)
3661             r.array[i] = cast(uint)(a.array[i]) << count;
3662         return r;
3663     }
3664 }
3665 unittest
3666 {
3667     __m128i A = _mm_setr_epi32(0, 2, 3, -4);
3668     __m128i B = _mm_slli_epi32(A, 1);
3669     __m128i B2 = _mm_slli_epi32(A, 1 + 256);
3670     int[4] expectedB = [ 0, 4, 6, -8];
3671     assert(B.array == expectedB);
3672     assert(B2.array == expectedB);
3673 
3674     __m128i C = _mm_slli_epi32(A, 0);
3675     int[4] expectedC = [ 0, 2, 3, -4];
3676     assert(C.array == expectedC);
3677 
3678     __m128i D = _mm_slli_epi32(A, 65);
3679     int[4] expectedD = [ 0, 0, 0, 0];
3680     assert(D.array == expectedD);
3681 }
3682 
3683 /// Shift packed 64-bit integers in `a` left by `imm8` while shifting in zeros.
3684 __m128i _mm_slli_epi64 (__m128i a, int imm8) pure @trusted
3685 {
3686     static if (GDC_with_SSE2)
3687     {
3688         return cast(__m128i) __builtin_ia32_psllqi128(cast(long2)a, cast(ubyte)imm8);
3689     }
3690     else static if (LDC_with_SSE2)
3691     {
3692         return cast(__m128i) __builtin_ia32_psllqi128(cast(long2)a, cast(ubyte)imm8);
3693     }
3694     else
3695     {
3696         long2 sa = cast(long2)a;
3697 
3698         // Note: the intrinsics guarantee imm8[0..7] is taken, however
3699         //       D says "It's illegal to shift by the same or more bits 
3700         //       than the size of the quantity being shifted"
3701         //       and it's UB instead.
3702         long2 r = cast(long2) _mm_setzero_si128();
3703         ubyte count = cast(ubyte) imm8;
3704         if (count > 63)
3705             return cast(__m128i)r;
3706 
3707         r.ptr[0] = cast(ulong)(sa.array[0]) << count;
3708         r.ptr[1] = cast(ulong)(sa.array[1]) << count;
3709         return cast(__m128i)r;
3710     }
3711 }
3712 unittest
3713 {
3714     __m128i A = _mm_setr_epi64(8, -4);
3715     long2 B = cast(long2) _mm_slli_epi64(A, 1);
3716     long2 B2 = cast(long2) _mm_slli_epi64(A, 1 + 1024);
3717     long[2] expectedB = [ 16, -8];
3718     assert(B.array == expectedB);
3719     assert(B2.array == expectedB);
3720 
3721     long2 C = cast(long2) _mm_slli_epi64(A, 0);
3722     long[2] expectedC = [ 8, -4];
3723     assert(C.array == expectedC);
3724 
3725     long2 D = cast(long2) _mm_slli_epi64(A, 64);
    long[2] expectedD = [ 0, 0 ];
3727     assert(D.array == expectedD);
3728 }
3729 
3730 /// Shift packed 16-bit integers in `a` left by `imm8` while shifting in zeros.
3731 __m128i _mm_slli_epi16(__m128i a, int imm8) pure @trusted
3732 {
3733     static if (GDC_with_SSE2)
3734     {
3735         return cast(__m128i) __builtin_ia32_psllwi128(cast(short8)a, cast(ubyte)imm8);
3736     }
3737     else static if (LDC_with_SSE2)
3738     {
3739         return cast(__m128i) __builtin_ia32_psllwi128(cast(short8)a, cast(ubyte)imm8);
3740     }
3741     else static if (LDC_with_ARM64)
3742     {
3743         short8 sa = cast(short8)a;
3744         short8 r = cast(short8)_mm_setzero_si128();
3745         ubyte count = cast(ubyte) imm8;
3746         if (count > 15)
3747             return cast(__m128i)r;
3748         r = sa << short8(count);
3749         return cast(__m128i)r;
3750     }
3751     else
3752     {
3753         short8 sa = cast(short8)a;
3754         short8 r = cast(short8)_mm_setzero_si128();
3755         ubyte count = cast(ubyte) imm8;
3756         if (count > 15)
3757             return cast(__m128i)r;
3758         foreach(i; 0..8)
3759             r.ptr[i] = cast(short)(sa.array[i] << count);
3760         return cast(__m128i)r;
3761     }
3762 }
3763 unittest
3764 {
3765     __m128i A = _mm_setr_epi16(0, 1, 2, 3, -4, -5, 6, 7);
3766     short8 B = cast(short8)( _mm_slli_epi16(A, 1) );
3767     short8 B2 = cast(short8)( _mm_slli_epi16(A, 1 + 256) );
3768     short[8] expectedB = [ 0, 2, 4, 6, -8, -10, 12, 14 ];
3769     assert(B.array == expectedB);
3770     assert(B2.array == expectedB);
3771 
3772     short8 C = cast(short8)( _mm_slli_epi16(A, 16) );
3773     short[8] expectedC = [ 0, 0, 0, 0, 0, 0, 0, 0 ];
3774     assert(C.array == expectedC);
3775 }
3776 
3777 
3778 /// Shift `a` left by `bytes` bytes while shifting in zeros.
3779 __m128i _mm_slli_si128(ubyte bytes)(__m128i op) pure @trusted
3780 {
3781     static if (bytes & 0xF0)
3782     {
3783         return _mm_setzero_si128();
3784     }
3785     else static if (GDC_with_SSE2)
3786     {
3787         return cast(__m128i) __builtin_ia32_pslldqi128(cast(long2)op, cast(ubyte)(bytes * 8)); 
3788     }
3789     else version(LDC)
3790     {
3791         return cast(__m128i) shufflevectorLDC!(byte16,
3792                                                16 - bytes, 17 - bytes, 18 - bytes, 19 - bytes, 20 - bytes, 21 - bytes,
3793                                                22 - bytes, 23 - bytes, 24 - bytes, 25 - bytes, 26 - bytes, 27 - bytes,
3794                                                28 - bytes, 29 - bytes, 30 - bytes, 31 - bytes)
3795                                                (cast(byte16)_mm_setzero_si128(), cast(byte16)op);
3796     }
3797     else static if (DMD_with_32bit_asm)
3798     {
3799         asm pure nothrow @nogc @trusted // somehow doesn't work for x86_64
3800         {
3801             movdqu XMM0, op;
3802             pslldq XMM0, bytes;
3803             movdqu op, XMM0;
3804         }
3805         return op;
3806     }
3807     else
3808     {
3809         byte16 A = cast(byte16)op;
3810         byte16 R = void;
3811         for (int n = 15; n >= bytes; --n)
3812             R.ptr[n] = A.array[n-bytes];
3813         for (int n = bytes-1; n >= 0; --n)
3814             R.ptr[n] = 0;
3815         return cast(__m128i)R;
3816     }
3817 }
3818 unittest
3819 {
3820     __m128i A = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
3821     short8 R = cast(short8) _mm_slli_si128!8(A); // shift 8 bytes to the left
3822     short[8] correct = [ 0, 0, 0, 0, 0, 1, 2, 3 ];
3823     assert(R.array == correct);
3824 
3825     __m128i B = _mm_slli_si128!16(_mm_set1_epi32(-1));
3826     int[4] expectedB = [0, 0, 0, 0];
3827     assert(B.array == expectedB);
3828 }
3829 
3830 /// Compute the square root of packed double-precision (64-bit) floating-point elements in `vec`.
3831 __m128d _mm_sqrt_pd(__m128d vec) pure @trusted
3832 {
3833     version(LDC)
3834     {
3835         // Disappeared with LDC 1.11
3836         static if (__VERSION__ < 2081)
3837             return __builtin_ia32_sqrtpd(vec);
3838         else
3839         {
3840             // PERF: use llvm_sqrt on the vector
3841             vec.array[0] = llvm_sqrt(vec.array[0]); 
3842             vec.array[1] = llvm_sqrt(vec.array[1]);
3843             return vec;
3844         }
3845     }
3846     else static if (GDC_with_SSE2)    
3847     {
3848         return __builtin_ia32_sqrtpd(vec);
3849     }
3850     else
3851     {
3852         vec.ptr[0] = sqrt(vec.array[0]);
3853         vec.ptr[1] = sqrt(vec.array[1]);
3854         return vec;
3855     }
3856 }
3857 
3858 /// Compute the square root of the lower double-precision (64-bit) floating-point element in `b`, store the result in 
3859 /// the lower element of result, and copy the upper element from `a` to the upper element of result.
3860 __m128d _mm_sqrt_sd(__m128d a, __m128d b) pure @trusted
3861 {
3862     // Note: the builtin has one argument, since the legacy `sqrtsd` SSE2 instruction operates on the same register only.
3863     //       "128-bit Legacy SSE version: The first source operand and the destination operand are the same. 
3864     //        The quadword at bits 127:64 of the destination operand remains unchanged."
3865     version(LDC)
3866     {
3867         // Disappeared with LDC 1.11
3868         static if (__VERSION__ < 2081)
3869         {
3870             __m128d c = __builtin_ia32_sqrtsd(b);
3871             a[0] = c[0];
3872             return a;
3873         }
3874         else
3875         {
3876             a.array[0] = llvm_sqrt(b.array[0]);
3877             return a;
3878         }
3879     }
3880     else static if (GDC_with_SSE2)
3881     {
3882         __m128d c = __builtin_ia32_sqrtsd(b);
3883         a.ptr[0] = c.array[0];
3884         return a;
3885     }
3886     else
3887     {
3888         a.ptr[0] = sqrt(b.array[0]);
3889         return a;
3890     }
3891 }
3892 unittest
3893 {
3894     __m128d A = _mm_setr_pd(1.0, 3.0);
3895     __m128d B = _mm_setr_pd(4.0, 5.0);
3896     __m128d R = _mm_sqrt_sd(A, B);
3897     double[2] correct = [2.0, 3.0 ];
3898     assert(R.array == correct);
3899 }
3900 
3901 /// Shift packed 16-bit integers in `a` right by `count` while shifting in sign bits.
3902 deprecated("Use _mm_srai_epi16 instead.") __m128i _mm_sra_epi16 (__m128i a, __m128i count) pure @trusted
3903 {
3904     static if (GDC_with_SSE2)
3905     {
3906         return cast(__m128i) __builtin_ia32_psraw128(cast(short8)a, cast(short8)count);
3907     }
3908     else static if (LDC_with_SSE2)
3909     {
3910         return cast(__m128i) __builtin_ia32_psraw128(cast(short8)a, cast(short8)count);
3911     }
3912     else
3913     {
3914         short8 sa = cast(short8)a;
3915         long2 lc = cast(long2)count;
3916         int bits = cast(int)(lc.array[0]);
3917         short8 r = void;
3918         foreach(i; 0..8)
3919             r.ptr[i] = cast(short)(sa.array[i] >> bits);
3920         return cast(int4)r;
3921     }
3922 }
3923 
3924 /// Shift packed 32-bit integers in `a` right by `count` while shifting in sign bits.
3925 deprecated("Use _mm_srai_epi32 instead.") __m128i _mm_sra_epi32 (__m128i a, __m128i count) pure @trusted
3926 {
3927     static if (LDC_with_SSE2)
3928     {
3929         return __builtin_ia32_psrad128(a, count);
3930     }
3931     else static if (GDC_with_SSE2)
3932     {
3933         return __builtin_ia32_psrad128(a, count);
3934     }
3935     else
3936     {    
3937         int4 r = void;
3938         long2 lc = cast(long2)count;
3939         int bits = cast(int)(lc.array[0]);
3940         r.ptr[0] = (a.array[0] >> bits);
3941         r.ptr[1] = (a.array[1] >> bits);
3942         r.ptr[2] = (a.array[2] >> bits);
3943         r.ptr[3] = (a.array[3] >> bits);
3944         return r;
3945     }
3946 }
3947 
3948 
3949 /// Shift packed 16-bit integers in `a` right by `imm8` while shifting in sign bits.
3950 __m128i _mm_srai_epi16 (__m128i a, int imm8) pure @trusted
3951 {
3952     static if (GDC_with_SSE2)
3953     {
3954         return cast(__m128i) __builtin_ia32_psrawi128(cast(short8)a, cast(ubyte)imm8);
3955     }
3956     else static if (LDC_with_SSE2)
3957     {
3958         return cast(__m128i) __builtin_ia32_psrawi128(cast(short8)a, cast(ubyte)imm8);
3959     }
3960     else static if (LDC_with_ARM64)
3961     {
3962         short8 sa = cast(short8)a;
3963         ubyte count = cast(ubyte)imm8;
3964         if (count > 15) 
3965             count = 15;
3966         short8 r = sa >> short8(count);
3967         return cast(__m128i)r;
3968     }
3969     else
3970     {
3971         short8 sa = cast(short8)a;
3972         short8 r = void;
3973 
3974         // Note: the intrinsics guarantee imm8[0..7] is taken, however
3975         //       D says "It's illegal to shift by the same or more bits 
3976         //       than the size of the quantity being shifted"
3977         //       and it's UB instead.
3978         ubyte count = cast(ubyte)imm8;
3979         if (count > 15) 
3980             count = 15;
3981         foreach(i; 0..8)
3982             r.ptr[i] = cast(short)(sa.array[i] >> count);
3983         return cast(int4)r;
3984     }
3985 }
3986 unittest
3987 {
3988     __m128i A = _mm_setr_epi16(0, 1, 2, 3, -4, -5, 6, 7);
3989     short8 B = cast(short8)( _mm_srai_epi16(A, 1) );
3990     short8 B2 = cast(short8)( _mm_srai_epi16(A, 1 + 256) );
3991     short[8] expectedB = [ 0, 0, 1, 1, -2, -3, 3, 3 ];
3992     assert(B.array == expectedB);
3993     assert(B2.array == expectedB);
3994 
3995     short8 C = cast(short8)( _mm_srai_epi16(A, 18) );
3996     short[8] expectedC = [ 0, 0, 0, 0, -1, -1, 0, 0 ];
3997     assert(C.array == expectedC);
3998 }
3999 
4000 /// Shift packed 32-bit integers in `a` right by `imm8` while shifting in sign bits.
4001 __m128i _mm_srai_epi32 (__m128i a, int imm8) pure @trusted
4002 {
4003     static if (LDC_with_SSE2)
4004     {
4005         return __builtin_ia32_psradi128(a, cast(ubyte)imm8);
4006     }
4007     else static if (GDC_with_SSE2)
4008     {
4009         return __builtin_ia32_psradi128(a, cast(ubyte)imm8);
4010     }
4011     else
4012     {
4013         int4 r = void;
4014 
4015         // Note: the intrinsics guarantee imm8[0..7] is taken, however
4016         //       D says "It's illegal to shift by the same or more bits 
4017         //       than the size of the quantity being shifted"
4018         //       and it's UB instead.
4019         ubyte count = cast(ubyte) imm8;
4020         if (count > 31)
4021             count = 31;
4022 
4023         r.ptr[0] = (a.array[0] >> count);
4024         r.ptr[1] = (a.array[1] >> count);
4025         r.ptr[2] = (a.array[2] >> count);
4026         r.ptr[3] = (a.array[3] >> count);
4027         return r;
4028     }
4029 }
4030 unittest
4031 {
4032     __m128i A = _mm_setr_epi32(0, 2, 3, -4);
4033     __m128i B = _mm_srai_epi32(A, 1);
4034     __m128i B2 = _mm_srai_epi32(A, 1 + 256);
4035     int[4] expectedB = [ 0, 1, 1, -2];
4036     assert(B.array == expectedB);
4037     assert(B2.array == expectedB);
4038 
4039     __m128i C = _mm_srai_epi32(A, 32);
4040     int[4] expectedC = [ 0, 0, 0, -1];
4041     assert(C.array == expectedC);
4042 
4043     __m128i D = _mm_srai_epi32(A, 0);
4044     int[4] expectedD = [ 0, 2, 3, -4];
4045     assert(D.array == expectedD);
4046 }
4047 
4048 deprecated("Use _mm_srli_epi16 instead.") __m128i _mm_srl_epi16 (__m128i a, __m128i count) pure @trusted
4049 {
4050     static if (LDC_with_SSE2)
4051     {
4052         return cast(__m128i) __builtin_ia32_psrlw128(cast(short8)a, cast(short8)count);
4053     }
4054     else static if (GDC_with_SSE2)
4055     {
4056         return cast(__m128i) __builtin_ia32_psrlw128(cast(short8)a, cast(short8)count);
4057     }
4058     else
4059     {
4060         short8 sa = cast(short8)a;
4061         long2 lc = cast(long2)count;
4062         int bits = cast(int)(lc.array[0]);
4063         short8 r = void;
4064         foreach(i; 0..8)
4065             r.ptr[i] = cast(short)(cast(ushort)(sa.array[i]) >> bits);
4066         return cast(int4)r;
4067     }
4068 }
4069 
4070 deprecated("Use _mm_srli_epi32 instead.") __m128i _mm_srl_epi32 (__m128i a, __m128i count) pure @trusted
4071 {
4072     static if (LDC_with_SSE2)
4073     {
4074         return __builtin_ia32_psrld128(a, count);
4075     }
4076     else static if (GDC_with_SSE2)
4077     {
4078         return __builtin_ia32_psrld128(a, count);
4079     }
4080     else
4081     {
4082         int4 r = void;
4083         long2 lc = cast(long2)count;
4084         int bits = cast(int)(lc.array[0]);
4085         r.ptr[0] = cast(uint)(a.array[0]) >> bits;
4086         r.ptr[1] = cast(uint)(a.array[1]) >> bits;
4087         r.ptr[2] = cast(uint)(a.array[2]) >> bits;
4088         r.ptr[3] = cast(uint)(a.array[3]) >> bits;
4089         return r;
4090     }
4091 }
4092 
4093 deprecated("Use _mm_srli_epi64 instead.") __m128i _mm_srl_epi64 (__m128i a, __m128i count) pure @trusted
4094 {
4095     static if (LDC_with_SSE2)
4096     {
4097         return cast(__m128i) __builtin_ia32_psrlq128(cast(long2)a, cast(long2)count);
4098     }
4099     else static if (GDC_with_SSE2)
4100     {
4101         return cast(__m128i) __builtin_ia32_psrlq128(cast(long2)a, cast(long2)count);
4102     }
4103     else
4104     {
4105         // Workaround for https://issues.dlang.org/show_bug.cgi?id=23047
4106         // => avoid void initialization.
4107         long2 r;
4108         long2 sa = cast(long2)a;
4109         long2 lc = cast(long2)count;
4110         int bits = cast(int)(lc.array[0]);
4111         r.ptr[0] = cast(ulong)(sa.array[0]) >> bits;
4112         r.ptr[1] = cast(ulong)(sa.array[1]) >> bits;
4113         return cast(__m128i)r;
4114     }
4115 }
4116 
4117 /// Shift packed 16-bit integers in `a` right by `imm8` while shifting in zeros.
4118 __m128i _mm_srli_epi16 (__m128i a, int imm8) pure @trusted
4119 {
4120     static if (GDC_with_SSE2)
4121     {
4122         return cast(__m128i) __builtin_ia32_psrlwi128(cast(short8)a, cast(ubyte)imm8);
4123     }
4124     else static if (LDC_with_SSE2)
4125     {
4126         return cast(__m128i) __builtin_ia32_psrlwi128(cast(short8)a, cast(ubyte)imm8);
4127     }
4128     else static if (LDC_with_ARM64)
4129     {
4130         short8 sa = cast(short8)a;
4131         short8 r = cast(short8) _mm_setzero_si128();
4132 
4133         ubyte count = cast(ubyte)imm8;
4134         if (count >= 16)
4135             return cast(__m128i)r;
4136 
        r = sa >>> short8(count); // This facility is offered by LDC, but not DMD.
4138         return cast(__m128i)r;
4139     }
4140     else
4141     {
4142         short8 sa = cast(short8)a;
4143         ubyte count = cast(ubyte)imm8;
4144 
4145         short8 r = cast(short8) _mm_setzero_si128();
4146         if (count >= 16)
4147             return cast(__m128i)r;
4148 
4149         foreach(i; 0..8)
4150             r.array[i] = cast(short)(cast(ushort)(sa.array[i]) >> count);
4151         return cast(__m128i)r;
4152     }
4153 }
4154 unittest
4155 {
4156     __m128i A = _mm_setr_epi16(0, 1, 2, 3, -4, -5, 6, 7);
4157     short8 B = cast(short8)( _mm_srli_epi16(A, 1) );
4158     short8 B2 = cast(short8)( _mm_srli_epi16(A, 1 + 256) );
4159     short[8] expectedB = [ 0, 0, 1, 1, 0x7FFE, 0x7FFD, 3, 3 ];
4160     assert(B.array == expectedB);
4161     assert(B2.array == expectedB);
4162 
4163     short8 C = cast(short8)( _mm_srli_epi16(A, 16) );
4164     short[8] expectedC = [ 0, 0, 0, 0, 0, 0, 0, 0];
4165     assert(C.array == expectedC);
4166 
4167     short8 D = cast(short8)( _mm_srli_epi16(A, 0) );
4168     short[8] expectedD = [ 0, 1, 2, 3, -4, -5, 6, 7 ];
4169     assert(D.array == expectedD);
4170 }
4171 
4172 
4173 /// Shift packed 32-bit integers in `a` right by `imm8` while shifting in zeros.
4174 __m128i _mm_srli_epi32 (__m128i a, int imm8) pure @trusted
4175 {
4176     static if (GDC_with_SSE2)
4177     {
4178         return __builtin_ia32_psrldi128(a, cast(ubyte)imm8);
4179     }
4180     else static if (LDC_with_SSE2)
4181     {
4182         return __builtin_ia32_psrldi128(a, cast(ubyte)imm8);
4183     }
4184     else
4185     {
4186         ubyte count = cast(ubyte) imm8;
4187 
4188         // Note: the intrinsics guarantee imm8[0..7] is taken, however
4189         //       D says "It's illegal to shift by the same or more bits 
4190         //       than the size of the quantity being shifted"
4191         //       and it's UB instead.
4192         int4 r = _mm_setzero_si128();
4193         if (count >= 32)
4194             return r;
4195         r.ptr[0] = a.array[0] >>> count;
4196         r.ptr[1] = a.array[1] >>> count;
4197         r.ptr[2] = a.array[2] >>> count;
4198         r.ptr[3] = a.array[3] >>> count;
4199         return r;
4200     }
4201 }
4202 unittest
4203 {
4204     __m128i A = _mm_setr_epi32(0, 2, 3, -4);
4205     __m128i B = _mm_srli_epi32(A, 1);
4206     __m128i B2 = _mm_srli_epi32(A, 1 + 256);
4207     int[4] expectedB = [ 0, 1, 1, 0x7FFFFFFE];
4208     assert(B.array == expectedB);
4209     assert(B2.array == expectedB);
4210  
4211     __m128i C = _mm_srli_epi32(A, 255);
4212     int[4] expectedC = [ 0, 0, 0, 0 ];
4213     assert(C.array == expectedC);
4214 }
4215 
4216 /// Shift packed 64-bit integers in `a` right by `imm8` while shifting in zeros.
4217 __m128i _mm_srli_epi64 (__m128i a, int imm8) pure @trusted
4218 {
4219     static if (GDC_with_SSE2)
4220     {
4221         return cast(__m128i) __builtin_ia32_psrlqi128(cast(long2)a, cast(ubyte)imm8);
4222     }
4223     else static if (LDC_with_SSE2)
4224     {
4225         return cast(__m128i) __builtin_ia32_psrlqi128(cast(long2)a, cast(ubyte)imm8);
4226     }
4227     else
4228     {
4229         long2 r = cast(long2) _mm_setzero_si128();
4230         long2 sa = cast(long2)a;
4231 
4232         ubyte count = cast(ubyte) imm8;
4233         if (count >= 64)
4234             return cast(__m128i)r;
4235 
4236         r.ptr[0] = sa.array[0] >>> count;
4237         r.ptr[1] = sa.array[1] >>> count;
4238         return cast(__m128i)r;
4239     }
4240 }
4241 unittest
4242 {
4243     __m128i A = _mm_setr_epi64(8, -4);
4244     long2 B = cast(long2) _mm_srli_epi64(A, 1);
4245     long2 B2 = cast(long2) _mm_srli_epi64(A, 1 + 512);
4246     long[2] expectedB = [ 4, 0x7FFFFFFFFFFFFFFE];
4247     assert(B.array == expectedB);
4248     assert(B2.array == expectedB);
4249 
4250     long2 C = cast(long2) _mm_srli_epi64(A, 64);
4251     long[2] expectedC = [ 0, 0 ];
4252     assert(C.array == expectedC);
4253 }
4254 
4255 /// Shift `v` right by `bytes` bytes while shifting in zeros.
4256 __m128i _mm_srli_si128(ubyte bytes)(__m128i v) pure @trusted
4257 {
4258     static if (bytes & 0xF0)
4259     {
4260         return _mm_setzero_si128();
4261     }
4262     else static if (GDC_with_SSE2)
4263     {
4264         return cast(__m128i) __builtin_ia32_psrldqi128(cast(long2)v, cast(ubyte)(bytes * 8));
4265     }
4266     else static if (DMD_with_32bit_asm)
4267     {
4268         asm pure nothrow @nogc @trusted
4269         {
4270             movdqu XMM0, v;
4271             psrldq XMM0, bytes;
4272             movdqu v, XMM0;
4273         }
4274         return v;
4275     }
4276     else version(LDC)
4277     {
4278         return cast(__m128i) shufflevectorLDC!(byte16,
4279                                                bytes+0, bytes+1, bytes+2, bytes+3, bytes+4, bytes+5, bytes+6, bytes+7,
4280                                                bytes+8, bytes+9, bytes+10, bytes+11, bytes+12, bytes+13, bytes+14, bytes+15)
4281                                                (cast(byte16) v, cast(byte16)_mm_setzero_si128());
4282     }
4283     else
4284     {
4285         byte16 A = cast(byte16)v;
4286         byte16 R = void;
4287         for (int n = 0; n < bytes; ++n)
4288             R.ptr[15-n] = 0;
4289         for (int n = bytes; n < 16; ++n)
4290             R.ptr[15-n] = A.array[15 - n + bytes];
4291         return cast(__m128i)R;
4292     }
4293 }
4294 unittest
4295 {
4296     __m128i R = _mm_srli_si128!4(_mm_set_epi32(4, 3, -2, 1));
4297     int[4] correct = [-2, 3, 4, 0];
4298     assert(R.array == correct);
4299 
4300     __m128i A = _mm_srli_si128!16(_mm_set1_epi32(-1));
4301     int[4] expectedA = [0, 0, 0, 0];
4302     assert(A.array == expectedA);
4303 }
4304 
4305 /// Shift `v` right by `bytes` bytes while shifting in zeros.
4306 /// #BONUS
4307 __m128 _mm_srli_ps(ubyte bytes)(__m128 v) pure @safe
4308 {
4309     return cast(__m128)_mm_srli_si128!bytes(cast(__m128i)v);
4310 }
4311 unittest
4312 {
4313     __m128 R = _mm_srli_ps!8(_mm_set_ps(4.0f, 3.0f, 2.0f, 1.0f));
4314     float[4] correct = [3.0f, 4.0f, 0, 0];
4315     assert(R.array == correct);
4316 }
4317 
4318 /// Shift `v` right by `bytes` bytes while shifting in zeros.
4319 /// #BONUS
4320 __m128d _mm_srli_pd(ubyte bytes)(__m128d v) pure @safe
4321 {
4322     return cast(__m128d) _mm_srli_si128!bytes(cast(__m128i)v);
4323 }
4324 
4325 /// Store 128-bits (composed of 2 packed double-precision (64-bit) floating-point elements) from `a` into memory. 
4326 /// `mem_addr` must be aligned on a 16-byte boundary or a general-protection exception may be generated.
4327 void _mm_store_pd (double* mem_addr, __m128d a) pure @trusted
4328 {
4329     pragma(inline, true);
4330     __m128d* aligned = cast(__m128d*)mem_addr;
4331     *aligned = a;
4332 }
4333 unittest
4334 {
4335     align(16) double[2] A;
4336     __m128d B = _mm_setr_pd(-8.0, 9.0);
4337     _mm_store_pd(A.ptr, B);
4338     assert(A == [-8.0, 9.0]);
4339 }
4340 
4341 /// Store the lower double-precision (64-bit) floating-point element from `a` into 2 contiguous elements in memory. 
4342 /// `mem_addr` must be aligned on a 16-byte boundary or a general-protection exception may be generated.
4343 void _mm_store_pd1 (double* mem_addr, __m128d a) pure @trusted
4344 {
4345     __m128d* aligned = cast(__m128d*)mem_addr;
4346     __m128d r; // PERF =void;
4347     r.ptr[0] = a.array[0];
4348     r.ptr[1] = a.array[0];
4349     *aligned = r;
4350 }
4351 
4352 /// Store the lower double-precision (64-bit) floating-point element from `a` into memory. `mem_addr` does not need to 
4353 /// be aligned on any particular boundary.
4354 void _mm_store_sd (double* mem_addr, __m128d a) pure @safe
4355 {
4356     pragma(inline, true);
4357     *mem_addr = a.array[0];
4358 }
4359 
4360 /// Store 128-bits of integer data from `a` into memory. `mem_addr` must be aligned on a 16-byte boundary or a 
4361 /// general-protection exception may be generated.
4362 void _mm_store_si128 (__m128i* mem_addr, __m128i a) pure @safe
4363 {
4364     pragma(inline, true);
4365     *mem_addr = a;
4366 }
4367 
4368 alias _mm_store1_pd = _mm_store_pd1; ///
4369 
4370 /// Store the upper double-precision (64-bit) floating-point element from `a` into memory.
4371 void _mm_storeh_pd (double* mem_addr, __m128d a) pure @safe
4372 {
4373     pragma(inline, true);
4374     *mem_addr = a.array[1];
4375 }
4376 
// Note: `mem_addr` doesn't have to actually be aligned, which breaks
// expectations from the user point of view. This problem also exists in C++.
/// Store 64-bit integer from the first element of `a` into memory.
4379 void _mm_storel_epi64 (__m128i* mem_addr, __m128i a) pure @safe
4380 {
4381     pragma(inline, true);
4382     long* dest = cast(long*)mem_addr;
4383     long2 la = cast(long2)a;
4384     *dest = la.array[0];
4385 }
4386 unittest
4387 {
4388     long[3] A = [1, 2, 3];
4389     _mm_storel_epi64(cast(__m128i*)(&A[1]), _mm_set_epi64x(0x1_0000_0000, 0x1_0000_0000));
4390     long[3] correct = [1, 0x1_0000_0000, 3];
4391     assert(A == correct);
4392 }
4393 
4394 /// Store the lower double-precision (64-bit) floating-point element from `a` into memory.
4395 void _mm_storel_pd (double* mem_addr, __m128d a) pure @safe
4396 {
4397     pragma(inline, true);
4398     *mem_addr = a.array[0];
4399 }
4400 
4401 /// Store 2 double-precision (64-bit) floating-point elements from `a` into memory in reverse 
4402 /// order. `mem_addr` must be aligned on a 16-byte boundary or a general-protection exception 
4403 /// may be generated.
4404 void _mm_storer_pd (double* mem_addr, __m128d a) pure @system
4405 {
4406     __m128d reversed = void;
4407     reversed.ptr[0] = a.array[1];
4408     reversed.ptr[1] = a.array[0];
4409     *cast(__m128d*)mem_addr = reversed;
4410 }
4411 unittest
4412 {
4413     align(16) double[2] A = [0.0, 1.0];
4414     _mm_storer_pd(A.ptr, _mm_setr_pd(2.0, 3.0));
4415     assert(A[0] == 3.0 && A[1] == 2.0);
4416 }
4417 
4418 /// Store 128-bits (composed of 2 packed double-precision (64-bit) floating-point elements) from 
4419 /// `a` into memory. `mem_addr` does not need to be aligned on any particular boundary.
4420 void _mm_storeu_pd (double* mem_addr, __m128d a) pure @trusted // TODO: signature, should be system
4421 {
4422     // PERF DMD
4423     pragma(inline, true);
4424     static if (GDC_with_SSE2)
4425     {
4426         __builtin_ia32_storeupd(mem_addr, a);
4427     }
4428     else version(LDC)
4429     {
4430         storeUnaligned!double2(a, mem_addr);
4431     }
4432     else
4433     {
4434         mem_addr[0] = a.array[0];
4435         mem_addr[1] = a.array[1];
4436     }
4437 }
4438 unittest
4439 {
4440     __m128d A = _mm_setr_pd(3.0, 4.0);
4441     align(16) double[4] R = [0.0, 0, 0, 0];
4442     double[2] correct = [3.0, 4.0];
4443     _mm_storeu_pd(&R[1], A);
4444     assert(R[1..3] == correct);
4445 }
4446 
4447 /// Store 128-bits of integer data from `a` into memory. `mem_addr` does not need to be aligned on any particular 
4448 /// boundary.
4449 void _mm_storeu_si128 (__m128i* mem_addr, __m128i a) pure @trusted // TODO: signature is wrong, mem_addr is not aligned
4450 {
4451     // PERF: DMD
4452     pragma(inline, true);
4453     static if (GDC_with_SSE2)
4454     {
4455         __builtin_ia32_storedqu(cast(char*)mem_addr, cast(ubyte16)a);
4456     }
4457     else version(LDC)
4458     {
4459         storeUnaligned!__m128i(a, cast(int*)mem_addr);
4460     }
4461     else
4462     {
4463         int* p = cast(int*)mem_addr;
4464         p[0] = a.array[0];
4465         p[1] = a.array[1];
4466         p[2] = a.array[2];
4467         p[3] = a.array[3];
4468     }
4469 }
4470 unittest
4471 {
4472     __m128i A = _mm_setr_epi32(1, 2, 3, 4);
4473     align(16) int[6] R = [0, 0, 0, 0, 0, 0];
4474     int[4] correct = [1, 2, 3, 4];
4475     _mm_storeu_si128(cast(__m128i*)(&R[1]), A);
4476     assert(R[1..5] == correct);
4477 }
4478 
4479 /// Store 32-bit integer from the first element of `a` into memory. 
4480 /// `mem_addr` does not need to be aligned on any particular boundary.
4481 void _mm_storeu_si32 (void* mem_addr, __m128i a) pure @trusted
4482 {
4483     pragma(inline, true);
4484     int* dest = cast(int*)mem_addr;
4485     *dest = a.array[0];
4486 }
4487 unittest
4488 {
4489     int[2] arr = [-24, 12];
4490     _mm_storeu_si32(&arr[1], _mm_setr_epi32(-1, -2, -6, -7));
4491     assert(arr == [-24, -1]);
4492 }
4493 
4494 /// Store 128-bits (composed of 2 packed double-precision (64-bit) floating-point elements)
4495 /// from `a` into memory using a non-temporal memory hint. `mem_addr` must be aligned on a 16-byte
4496 /// boundary or a general-protection exception may be generated.
4497 /// Note: non-temporal stores should be followed by `_mm_sfence()` for reader threads.
4498 void _mm_stream_pd (double* mem_addr, __m128d a) pure @system
4499 {
4500     // PERF DMD D_SIMD
4501     static if (GDC_with_SSE2)
4502     {
4503         return __builtin_ia32_movntpd(mem_addr, a); 
4504     }
4505     else version(LDC)
4506     {
4507         enum prefix = `!0 = !{ i32 1 }`;
4508         enum ir = `
4509             store <2 x double> %1, <2 x double>* %0, align 16, !nontemporal !0
4510             ret void`;
4511         LDCInlineIREx!(prefix, ir, "", void, double2*, double2)(cast(double2*)mem_addr, a);
4512     }
4513     else
4514     {
4515         // Regular store instead.
4516         __m128d* dest = cast(__m128d*)mem_addr;
4517         *dest = a;
4518     }
4519 }
4520 unittest
4521 {
4522     align(16) double[2] A;
4523     __m128d B = _mm_setr_pd(-8.0, 9.0);
4524     _mm_stream_pd(A.ptr, B);
4525     assert(A == [-8.0, 9.0]);
4526 }
4527 
/// Store 128-bits of integer data from `a` into memory using a non-temporal memory hint.
4529 /// `mem_addr` must be aligned on a 16-byte boundary or a general-protection exception
4530 /// may be generated.
4531 /// Note: non-temporal stores should be followed by `_mm_sfence()` for reader threads.
4532 void _mm_stream_si128 (__m128i* mem_addr, __m128i a) pure @trusted
4533 {
4534     // PERF DMD D_SIMD
4535     static if (GDC_with_SSE2)
4536     {
4537         return __builtin_ia32_movntdq (cast(long2*)mem_addr, cast(long2)a); 
4538     }
4539     else version(LDC)
4540     {
4541         enum prefix = `!0 = !{ i32 1 }`;
4542         enum ir = `
4543             store <4 x i32> %1, <4 x i32>* %0, align 16, !nontemporal !0
4544             ret void`;
4545         LDCInlineIREx!(prefix, ir, "", void, int4*, int4)(cast(int4*)mem_addr, a);
4546     }
4547     else
4548     {
4549         // Regular store instead.
4550         __m128i* dest = cast(__m128i*)mem_addr;
4551         *dest = a;
4552     }
4553 }
4554 unittest
4555 {
4556     align(16) int[4] A;
4557     __m128i B = _mm_setr_epi32(-8, 9, 10, -11);
4558     _mm_stream_si128(cast(__m128i*)A.ptr, B);
4559     assert(A == [-8, 9, 10, -11]);
4560 }
4561 
/// Store 32-bit integer `a` into memory using a non-temporal hint to minimize cache
4563 /// pollution. If the cache line containing address `mem_addr` is already in the cache,
4564 /// the cache will be updated.
4565 /// Note: non-temporal stores should be followed by `_mm_sfence()` for reader threads.
4566 void _mm_stream_si32 (int* mem_addr, int a) pure @trusted
4567 {
4568     // PERF DMD D_SIMD
4569     static if (GDC_with_SSE2)
4570     {
4571         return __builtin_ia32_movnti(mem_addr, a);
4572     }
4573     else version(LDC)
4574     {
4575         enum prefix = `!0 = !{ i32 1 }`;
4576         enum ir = `
4577             store i32 %1, i32* %0, !nontemporal !0
4578             ret void`;
4579         LDCInlineIREx!(prefix, ir, "", void, int*, int)(mem_addr, a);
4580     }
4581     else
4582     {
4583         // Regular store instead.
4584         *mem_addr = a;
4585     }
4586 }
4587 unittest
4588 {
4589     int A;
4590     _mm_stream_si32(&A, -34);
4591     assert(A == -34);
4592 }
4593 
/// Store 64-bit integer `a` into memory using a non-temporal hint to minimize
4595 /// cache pollution. If the cache line containing address `mem_addr` is already
4596 /// in the cache, the cache will be updated.
4597 /// Note: non-temporal stores should be followed by `_mm_sfence()` for reader threads.
4598 void _mm_stream_si64 (long* mem_addr, long a) pure @trusted
4599 {
4600     // PERF DMD D_SIMD
4601     static if (GDC_with_SSE2)
4602     {
4603         return __builtin_ia32_movnti64(mem_addr, a);
4604     }
4605     else version(LDC)
4606     {
4607         enum prefix = `!0 = !{ i32 1 }`;
4608         enum ir = `
4609             store i64 %1, i64* %0, !nontemporal !0
4610             ret void`;
4611         LDCInlineIREx!(prefix, ir, "", void, long*, long)(mem_addr, a);
4612 
4613     }
4614     else
4615     {
4616         // Regular store instead.
4617         *mem_addr = a;
4618     }
4619 }
4620 unittest
4621 {
4622     long A;
4623     _mm_stream_si64(&A, -46);
4624     assert(A == -46);
4625 }
4626 
4627 /// Subtract packed 16-bit integers in `b` from packed 16-bit integers in `a`.
4628 __m128i _mm_sub_epi16(__m128i a, __m128i b) pure @safe
4629 {
4630     pragma(inline, true);
4631     return cast(__m128i)(cast(short8)a - cast(short8)b);
4632 }
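unittest
{
    // Element 1 checks 16-bit wrap-around: 32767 - (-1) wraps to -32768.
    __m128i A = _mm_setr_epi16(16, 32767, 1, 2, -4, 4, 5, 6);
    __m128i B = _mm_setr_epi16(15,    -1, 1, 2,  3, 4, 5, 6);
    short8 R = cast(short8) _mm_sub_epi16(A, B);
    short[8] correct = [1, -32768, 0, 0, -7, 0, 0, 0];
    assert(R.array == correct);
}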
4634 
4635 /// Subtract packed 32-bit integers in `b` from packed 32-bit integers in `a`.
4636 __m128i _mm_sub_epi32(__m128i a, __m128i b) pure @safe
4637 {
4638     pragma(inline, true);
4639     return cast(__m128i)(cast(int4)a - cast(int4)b);
4640 }
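unittest
{
    // Element 1 checks 32-bit wrap-around: int.min - 1 wraps to int.max.
    __m128i A = _mm_setr_epi32(16, int.min, 2, -3);
    __m128i B = _mm_setr_epi32(15, 1, 2, 3);
    int4 R = cast(int4) _mm_sub_epi32(A, B);
    int[4] correct = [1, int.max, 0, -6];
    assert(R.array == correct);
}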
4642 
4643 /// Subtract packed 64-bit integers in `b` from packed 64-bit integers in `a`.
4644 __m128i _mm_sub_epi64(__m128i a, __m128i b) pure @safe
4645 {
4646     pragma(inline, true);
4647     return cast(__m128i)(cast(long2)a - cast(long2)b);
4648 }
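unittest
{
    // Element 1 checks 64-bit wrap-around: long.min - 1 wraps to long.max.
    __m128i A = _mm_setr_epi64(1000, long.min);
    __m128i B = _mm_setr_epi64(8, 1);
    long2 R = cast(long2) _mm_sub_epi64(A, B);
    long[2] correct = [992, long.max];
    assert(R.array == correct);
}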
4650 
4651 /// Subtract packed 8-bit integers in `b` from packed 8-bit integers in `a`.
4652 __m128i _mm_sub_epi8(__m128i a, __m128i b) pure @safe
4653 {
4654     pragma(inline, true);
4655     return cast(__m128i)(cast(byte16)a - cast(byte16)b);
4656 }
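unittest
{
    // Element 1 checks 8-bit wrap-around: -128 - 1 wraps to 127.
    __m128i A = _mm_setr_epi8(16, -128, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14);
    __m128i B = _mm_setr_epi8(15,    1, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14);
    byte16 R = cast(byte16) _mm_sub_epi8(A, B);
    byte[16] correct = [1, 127, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0];
    assert(R.array == correct);
}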
4658 
4659 /// Subtract packed double-precision (64-bit) floating-point elements in `b` from packed double-precision (64-bit) 
4660 /// floating-point elements in `a`.
4661 __m128d _mm_sub_pd(__m128d a, __m128d b) pure @safe
4662 {
4663     pragma(inline, true);
4664     return a - b;
4665 }
4666 unittest
4667 {
4668     __m128d A = _mm_setr_pd(4000.0, -8.0);
4669     __m128d B = _mm_setr_pd(12.0, -8450.0);
4670     __m128d C = _mm_sub_pd(A, B);
4671     double[2] correct =     [3988.0, 8442.0];
4672     assert(C.array == correct);
4673 }
4674 
4675 /// Subtract the lower double-precision (64-bit) floating-point element in `b` from the lower double-precision (64-bit) 
4676 /// floating-point element in `a`, store that in the lower element of result, and copy the upper element from `a` to the
4677 /// upper element of result.
4678 __m128d _mm_sub_sd(__m128d a, __m128d b) pure @trusted
4679 {
4680     version(DigitalMars)
4681     {
4682         // Work-around for https://issues.dlang.org/show_bug.cgi?id=19599
4683         // Note that this is unneeded since DMD >= 2.094.0 at least, haven't investigated again
4684         asm pure nothrow @nogc @trusted { nop;}
4685         a[0] = a[0] - b[0];
4686         return a;
4687     }
4688     else static if (GDC_with_SSE2)
4689     {
4690         return __builtin_ia32_subsd(a, b);
4691     }
4692     else
4693     {
4694         a.ptr[0] -= b.array[0];
4695         return a;
4696     }
4697 }
4698 unittest
4699 {
4700     __m128d a = [1.5, -2.0];
4701     a = _mm_sub_sd(a, a);
4702     assert(a.array == [0.0, -2.0]);
4703 }
4704 
4705 /// Subtract 64-bit integer `b` from 64-bit integer `a`.
4706 __m64 _mm_sub_si64 (__m64 a, __m64 b) pure @safe
4707 {
4708     pragma(inline, true);
4709     return a - b;
4710 }
4711 unittest
4712 {
4713     __m64 A, B;
4714     A = -1214;
4715     B = 489415;
4716     __m64 C = _mm_sub_si64(B, A);
4717     assert(C.array[0] == 489415 + 1214);
4718 }
4719 
/// Subtract packed 16-bit signed integers in `b` from packed 16-bit signed integers in `a` using signed saturation.
4721 __m128i _mm_subs_epi16(__m128i a, __m128i b) pure @trusted
4722 {
4723     version(LDC)
4724     {
4725         static if (__VERSION__ >= 2085) // saturation x86 intrinsics disappeared in LLVM 8
4726         {
4727             // Generates PSUBSW since LDC 1.15 -O0
4730             enum prefix = `declare <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> %a, <8 x i16> %b)`;
4731             enum ir = `
4732                 %r = call <8 x i16> @llvm.ssub.sat.v8i16( <8 x i16> %0, <8 x i16> %1)
4733                 ret <8 x i16> %r`;
4734             return cast(__m128i) LDCInlineIREx!(prefix, ir, "", short8, short8, short8)(cast(short8)a, cast(short8)b);
4735         }
4736         else static if (LDC_with_ARM) // Raspberry ships with LDC 1.12, no saturation 
4737         {
4739             short[8] res; // PERF: =void;
4740             short8 sa = cast(short8)a;
4741             short8 sb = cast(short8)b;
4742             foreach(i; 0..8)
4743                 res[i] = saturateSignedIntToSignedShort(sa.array[i] - sb.array[i]);
4744             return _mm_loadu_si128(cast(int4*)res.ptr);
4745         }
4746         else static if (LDC_with_SSE2)
4747         {
4748             return cast(__m128i) __builtin_ia32_psubsw128(cast(short8) a, cast(short8) b);
4749         }
4750         else
4751             static assert(false);
4752     }
4753     else static if (GDC_with_SSE2)
4754     {
4755         return cast(__m128i) __builtin_ia32_psubsw128(cast(short8) a, cast(short8) b);
4756     }
4757     else
4758     {
4759         short[8] res; // PERF =void;
4760         short8 sa = cast(short8)a;
4761         short8 sb = cast(short8)b;
4762         foreach(i; 0..8)
4763             res.ptr[i] = saturateSignedIntToSignedShort(sa.array[i] - sb.array[i]);
4764         return _mm_loadu_si128(cast(int4*)res.ptr);
4765     }
4766 }
4767 unittest
4768 {
4769     short8 res = cast(short8) _mm_subs_epi16(_mm_setr_epi16(32760, -32760, 5, 4, 3, 2, 1, 0),
4770                                              _mm_setr_epi16(-10  ,     16, 5, 4, 3, 2, 1, 0));
4771     static immutable short[8] correctResult =              [32767, -32768, 0, 0, 0, 0, 0, 0];
4772     assert(res.array == correctResult);
4773 }
4774 
/// Subtract packed 8-bit signed integers in `b` from packed 8-bit signed integers in `a` using signed saturation.
4776 __m128i _mm_subs_epi8(__m128i a, __m128i b) pure @trusted
4777 {
4778     version(LDC)
4779     {
4780         static if (__VERSION__ >= 2085) // saturation x86 intrinsics disappeared in LLVM 8
4781         {
4782             // x86: Generates PSUBSB since LDC 1.15 -O0
4783             // ARM: Generates sqsub.16b since LDC 1.21 -O0
4784             enum prefix = `declare <16 x i8> @llvm.ssub.sat.v16i8(<16 x i8> %a, <16 x i8> %b)`;
4785             enum ir = `
4786                 %r = call <16 x i8> @llvm.ssub.sat.v16i8( <16 x i8> %0, <16 x i8> %1)
4787                 ret <16 x i8> %r`;
4788             return cast(__m128i) LDCInlineIREx!(prefix, ir, "", byte16, byte16, byte16)(cast(byte16)a, cast(byte16)b);
4789         }
4790         else static if (LDC_with_ARM) // Raspberry ships with LDC 1.12, no saturation 
4791         {
4792             byte[16] res; // PERF =void;
4793             byte16 sa = cast(byte16)a;
4794             byte16 sb = cast(byte16)b;
4795             foreach(i; 0..16)
4796                 res[i] = saturateSignedWordToSignedByte(sa.array[i] - sb.array[i]);
4797             return _mm_loadu_si128(cast(int4*)res.ptr);
4798         }
4799         else static if (LDC_with_SSE2)
4800         {
4801             return cast(__m128i) __builtin_ia32_psubsb128(cast(byte16) a, cast(byte16) b);
4802         }
4803         else
4804             static assert(false);
4805     }
4806     else static if (GDC_with_SSE2)
4807     {
4808         return cast(__m128i) __builtin_ia32_psubsb128(cast(ubyte16) a, cast(ubyte16) b);
4809     }
4810     else
4811     {
4812         byte[16] res; // PERF =void;
4813         byte16 sa = cast(byte16)a;
4814         byte16 sb = cast(byte16)b;
4815         foreach(i; 0..16)
4816             res[i] = saturateSignedWordToSignedByte(sa.array[i] - sb.array[i]);
4817         return _mm_loadu_si128(cast(int4*)res.ptr);
4818     }
4819 }
4820 unittest
4821 {
4822     byte16 res = cast(byte16) _mm_subs_epi8(_mm_setr_epi8(-128, 127, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0),
4823                                             _mm_setr_epi8(  15, -14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0));
4824     static immutable byte[16] correctResult            = [-128, 127,  0,  0,  0,  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0];
4825     assert(res.array == correctResult);
4826 }
4827 
/// Subtract packed 16-bit unsigned integers in `b` from `a` using unsigned saturation.
4829 __m128i _mm_subs_epu16(__m128i a, __m128i b) pure @trusted
4830 {
4831     version(LDC)
4832     {
4833         static if (__VERSION__ >= 2085) // saturation x86 intrinsics disappeared in LLVM 8
4834         {
4835             // x86: Generates PSUBUSW since LDC 1.15 -O0
4836             // ARM: Generates uqsub.8h since LDC 1.21 -O0
4837             enum prefix = `declare <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> %a, <8 x i16> %b)`;
4838             enum ir = `
4839                 %r = call <8 x i16> @llvm.usub.sat.v8i16( <8 x i16> %0, <8 x i16> %1)
4840                 ret <8 x i16> %r`;
4841             return cast(__m128i) LDCInlineIREx!(prefix, ir, "", short8, short8, short8)(cast(short8)a, cast(short8)b);
4842         }
        else static if (LDC_with_ARM) // Raspberry Pi OS ships with LDC 1.12, which lacks saturation intrinsics
4844         {
4845             short[8] res; // PERF =void;
4846             short8 sa = cast(short8)a;
4847             short8 sb = cast(short8)b;
4848             foreach(i; 0..8)
4849             {
                int diff = cast(ushort)(sa.array[i]) - cast(ushort)(sb.array[i]);
                res[i] = saturateSignedIntToUnsignedShort(diff);
4852             }
4853             return _mm_loadu_si128(cast(int4*)res.ptr);
4854         }
4855         else static if (LDC_with_SSE2)
4856         {
            return cast(__m128i) __builtin_ia32_psubusw128(cast(short8)a, cast(short8)b);
4858         }
4859         else 
4860             static assert(false);
4861     }
4862     else static if (GDC_with_SSE2)
4863     {
4864         return cast(__m128i) __builtin_ia32_psubusw128(cast(short8)a, cast(short8)b);
4865     }
4866     else
4867     {
4868         short[8] res; // PERF =void;
4869         short8 sa = cast(short8)a;
4870         short8 sb = cast(short8)b;
4871         foreach(i; 0..8)
4872         {
            int diff = cast(ushort)(sa.array[i]) - cast(ushort)(sb.array[i]);
            res[i] = saturateSignedIntToUnsignedShort(diff);
4875         }
4876         return _mm_loadu_si128(cast(int4*)res.ptr);
4877     }
4878 }
4879 unittest
4880 {
4881     short8 R = cast(short8) _mm_subs_epu16(_mm_setr_epi16(cast(short)65534,  1, 5, 4, 3, 2, 1, 0),
4882                                            _mm_setr_epi16(cast(short)65535, 16, 4, 4, 3, 0, 1, 0));
4883     static immutable short[8] correct =                  [               0,  0, 1, 0, 0, 2, 0, 0];
4884     assert(R.array == correct);
4885 }
4886 
/// Subtract packed 8-bit unsigned integers in `b` from `a` using unsigned saturation.
4888 __m128i _mm_subs_epu8(__m128i a, __m128i b) pure @trusted
4889 {
4890     version(LDC)
4891     {
4892         static if (__VERSION__ >= 2085) // saturation x86 intrinsics disappeared in LLVM 8
4893         {
4894             // x86: Generates PSUBUSB since LDC 1.15 -O0
4895             // ARM: Generates uqsub.16b since LDC 1.21 -O0
4896             enum prefix = `declare <16 x i8> @llvm.usub.sat.v16i8(<16 x i8> %a, <16 x i8> %b)`;
4897             enum ir = `
4898                 %r = call <16 x i8> @llvm.usub.sat.v16i8( <16 x i8> %0, <16 x i8> %1)
4899                 ret <16 x i8> %r`;
4900             return cast(__m128i) LDCInlineIREx!(prefix, ir, "", byte16, byte16, byte16)(cast(byte16)a, cast(byte16)b);
4901         }
        else static if (LDC_with_ARM) // Raspberry Pi OS ships with LDC 1.12, which lacks saturation intrinsics
4903         {
4904             /// Add packed 8-bit unsigned integers in `a` and `b` using unsigned saturation.
4905             __m128i _mm_subs_epu8(__m128i a, __m128i b) pure @trusted
4906             {
4907                 ubyte[16] res; // PERF =void;
4908                 byte16 sa = cast(byte16)a;
4909                 byte16 sb = cast(byte16)b;
4910                 foreach(i; 0..16)
4911                     res[i] = saturateSignedWordToUnsignedByte(cast(ubyte)(sa.array[i]) - cast(ubyte)(sb.array[i]));
4912                 return _mm_loadu_si128(cast(int4*)res.ptr);
4913             }
4914         }
4915         else static if (LDC_with_SSE2)
4916         {
            return cast(__m128i) __builtin_ia32_psubusb128(cast(byte16)a, cast(byte16)b);
4918         }
4919         else 
4920             static assert(false);
4921     }
4922     else static if (GDC_with_SSE2)
4923     {
4924         return cast(__m128i) __builtin_ia32_psubusb128(cast(ubyte16) a, cast(ubyte16) b);
4925     }
4926     else
4927     {
4928         ubyte[16] res; // PERF =void;
4929         byte16 sa = cast(byte16)a;
4930         byte16 sb = cast(byte16)b;
4931         foreach(i; 0..16)
4932             res[i] = saturateSignedWordToUnsignedByte(cast(ubyte)(sa.array[i]) - cast(ubyte)(sb.array[i]));
4933         return _mm_loadu_si128(cast(int4*)res.ptr);
4934     }
4935 }
4936 unittest
4937 {
4938     byte16 res = cast(byte16) _mm_subs_epu8(_mm_setr_epi8(cast(byte)254, 127, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0),
4939                                             _mm_setr_epi8(cast(byte)255, 120, 14, 42, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0));
4940     static immutable byte[16] correctResult =            [            0,   7,  0,  0,  0,  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0];
4941     assert(res.array == correctResult);
4942 }
4943 
// Note: the only difference between the ucomi intrinsics and their comi
//       counterparts is the signalling behaviour on quiet NaNs. Aliasing
//       them is technically incorrect, but wanting to tell qNaN from sNaN
//       apart and treat them differently on purpose seems extremely rare.
4948 alias _mm_ucomieq_sd = _mm_comieq_sd; ///
4949 alias _mm_ucomige_sd = _mm_comige_sd; ///
4950 alias _mm_ucomigt_sd = _mm_comigt_sd; ///
4951 alias _mm_ucomile_sd = _mm_comile_sd; ///
4952 alias _mm_ucomilt_sd = _mm_comilt_sd; ///
4953 alias _mm_ucomineq_sd = _mm_comineq_sd; ///
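
unittest
{
    // For ordinary (non-NaN) inputs the ucomi aliases behave exactly like
    // their comi counterparts; only the lower element takes part.
    __m128d A = _mm_setr_pd(3.0, 0.0);
    __m128d B = _mm_setr_pd(3.0, 1.0);
    assert(_mm_ucomieq_sd(A, B) == 1);
    assert(_mm_ucomilt_sd(A, B) == 0);
}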
4954 
4955 /// Return vector of type `__m128d` with undefined elements.
4956 __m128d _mm_undefined_pd() pure @safe
4957 {
4958     pragma(inline, true);
4959     __m128d result = void;
4960     return result;
4961 }
4962 
4963 /// Return vector of type `__m128i` with undefined elements.
4964 __m128i _mm_undefined_si128() pure @safe
4965 {
4966     pragma(inline, true);
4967     __m128i result = void;
4968     return result;
4969 }
4970 
4971 /// Unpack and interleave 16-bit integers from the high half of `a` and `b`.
4972 __m128i _mm_unpackhi_epi16 (__m128i a, __m128i b) pure @trusted
4973 {
4974     // PERF DMD D_SIMD
4975     static if (GDC_with_SSE2)
4976     {
4977         return cast(__m128i) __builtin_ia32_punpckhwd128(cast(short8) a, cast(short8) b);
4978     }
4979     else version(LDC)
4980     {
4981         return cast(__m128i) shufflevectorLDC!(short8, 4, 12, 5, 13, 6, 14, 7, 15)
4982                                               (cast(short8)a, cast(short8)b);
4983     }
4984     else static if (DMD_with_32bit_asm)
4985     {
4986         asm pure nothrow @nogc @trusted
4987         {
4988             movdqu XMM0, a;
4989             movdqu XMM1, b;
4990             punpckhwd XMM0, XMM1;
4991             movdqu a, XMM0;
4992         }
4993         return a;
4994     }   
4995     else
4996     {
4997         short8 r = void;
4998         short8 sa = cast(short8)a;
4999         short8 sb = cast(short8)b;
5000         r.ptr[0] = sa.array[4];
5001         r.ptr[1] = sb.array[4];
5002         r.ptr[2] = sa.array[5];
5003         r.ptr[3] = sb.array[5];
5004         r.ptr[4] = sa.array[6];
5005         r.ptr[5] = sb.array[6];
5006         r.ptr[6] = sa.array[7];
5007         r.ptr[7] = sb.array[7];
5008         return cast(__m128i)r;
5009     }
5010 }
5011 unittest
5012 {
5013     __m128i A = _mm_setr_epi16(4,   5,  6,  7,  8,  9, 10, 11);
5014     __m128i B = _mm_setr_epi16(12, 13, 14, 15, 16, 17, 18, 19);
5015     short8 C = cast(short8)(_mm_unpackhi_epi16(A, B));
5016     short[8] correct = [8, 16, 9, 17, 10, 18, 11, 19];
5017     assert(C.array == correct);
5018 }
5019 
5020 /// Unpack and interleave 32-bit integers from the high half of `a` and `b`.
5021 __m128i _mm_unpackhi_epi32 (__m128i a, __m128i b) pure @trusted
5022 {
5023     static if (GDC_with_SSE2)
5024     {
5025         return __builtin_ia32_punpckhdq128(a, b);
5026     }
5027     else version(LDC)
5028     {
5029         return shufflevectorLDC!(int4, 2, 6, 3, 7)(cast(int4)a, cast(int4)b);
5030     }
5031     else
5032     {
5033         __m128i r = void;
5034         r.ptr[0] = a.array[2];
5035         r.ptr[1] = b.array[2];
5036         r.ptr[2] = a.array[3];
5037         r.ptr[3] = b.array[3];
5038         return r;
5039     }
5040 }
5041 unittest
5042 {
5043     __m128i A = _mm_setr_epi32(1, 2, 3, 4);
5044     __m128i B = _mm_setr_epi32(5, 6, 7, 8);
5045     __m128i C = _mm_unpackhi_epi32(A, B);
5046     int[4] correct = [3, 7, 4, 8];
5047     assert(C.array == correct);
5048 }
5049 
5050 /// Unpack and interleave 64-bit integers from the high half of `a` and `b`.
5051 __m128i _mm_unpackhi_epi64 (__m128i a, __m128i b) pure @trusted
5052 {
5053     static if (GDC_with_SSE2)
5054     {
5055         return cast(__m128i) __builtin_ia32_punpckhqdq128(cast(long2) a, cast(long2) b);
5056     }
5057     else
5058     {
        __m128i r = cast(__m128i)b;
        r.ptr[0] = a.array[2];
        r.ptr[1] = a.array[3];
        return r;
5063     }
5064 }
5065 unittest // Issue #36
5066 {
5067     __m128i A = _mm_setr_epi64(0x22222222_22222222, 0x33333333_33333333);
5068     __m128i B = _mm_setr_epi64(0x44444444_44444444, 0x55555555_55555555);
5069     long2 C = cast(long2)(_mm_unpackhi_epi64(A, B));
5070     long[2] correct = [0x33333333_33333333, 0x55555555_55555555];
5071     assert(C.array == correct);
5072 }
5073 
5074 /// Unpack and interleave 8-bit integers from the high half of `a` and `b`.
5075 __m128i _mm_unpackhi_epi8 (__m128i a, __m128i b) pure @trusted
5076 {
5077     // PERF DMD D_SIMD
5078     static if (GDC_with_SSE2)
5079     {
5080         return cast(__m128i) __builtin_ia32_punpckhbw128(cast(ubyte16)a, cast(ubyte16)b);
5081     }
5082     else static if (DMD_with_32bit_asm)
5083     {
5084         asm pure nothrow @nogc @trusted
5085         {
5086             movdqu XMM0, a;
5087             movdqu XMM1, b;
5088             punpckhbw XMM0, XMM1;
5089             movdqu a, XMM0;
5090         }
5091         return a;
5092     }
5093     else version(LDC)
5094     {
5095         return cast(__m128i)shufflevectorLDC!(byte16, 8,  24,  9, 25, 10, 26, 11, 27,
5096                                                       12, 28, 13, 29, 14, 30, 15, 31)
5097             (cast(byte16)a, cast(byte16)b);
5098     }
5099     else
5100     {
5101         byte16 r = void;
5102         byte16 ba = cast(byte16)a;
5103         byte16 bb = cast(byte16)b;
5104         r.ptr[0] = ba.array[8];
5105         r.ptr[1] = bb.array[8];
5106         r.ptr[2] = ba.array[9];
5107         r.ptr[3] = bb.array[9];
5108         r.ptr[4] = ba.array[10];
5109         r.ptr[5] = bb.array[10];
5110         r.ptr[6] = ba.array[11];
5111         r.ptr[7] = bb.array[11];
5112         r.ptr[8] = ba.array[12];
5113         r.ptr[9] = bb.array[12];
5114         r.ptr[10] = ba.array[13];
5115         r.ptr[11] = bb.array[13];
5116         r.ptr[12] = ba.array[14];
5117         r.ptr[13] = bb.array[14];
5118         r.ptr[14] = ba.array[15];
5119         r.ptr[15] = bb.array[15];
5120         return cast(__m128i)r;
5121     }
5122 }
5123 unittest
5124 {
5125     __m128i A = _mm_setr_epi8( 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15);
5126     __m128i B = _mm_setr_epi8(16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31);
5127     byte16 C = cast(byte16) _mm_unpackhi_epi8(A, B);
5128     byte[16] correct = [8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31];
5129     assert(C.array == correct);
5130 }
5131 
5132 /// Unpack and interleave double-precision (64-bit) floating-point elements from the high half of `a` and `b`.
5133 __m128d _mm_unpackhi_pd (__m128d a, __m128d b) pure @trusted
5134 {
5135     // PERF DMD D_SIMD
5136     static if (GDC_with_SSE2)
5137     {
5138         return __builtin_ia32_unpckhpd(a, b);
5139     }
5140     else version(LDC)
5141     {
5142         return shufflevectorLDC!(__m128d, 1, 3)(a, b);
5143     }
5144     else
5145     {
5146         double2 r = void;
5147         r.ptr[0] = a.array[1];
5148         r.ptr[1] = b.array[1];
5149         return r;
5150     }
5151 }
5152 unittest
5153 {
5154     __m128d A = _mm_setr_pd(4.0, 6.0);
5155     __m128d B = _mm_setr_pd(7.0, 9.0);
5156     __m128d C = _mm_unpackhi_pd(A, B);
5157     double[2] correct = [6.0, 9.0];
5158     assert(C.array == correct);
5159 }
5160 
5161 /// Unpack and interleave 16-bit integers from the low half of `a` and `b`.
5162 __m128i _mm_unpacklo_epi16 (__m128i a, __m128i b) pure @trusted
5163 {
    // PERF DMD D_SIMD
5165     static if (GDC_with_SSE2)
5166     {
5167         return cast(__m128i) __builtin_ia32_punpcklwd128(cast(short8) a, cast(short8) b);
5168     }
5169     else version(LDC)
5170     {
5171         return cast(__m128i) shufflevectorLDC!(short8, 0, 8, 1, 9, 2, 10, 3, 11)(cast(short8)a, cast(short8)b);
5172     }
5173     else static if (DMD_with_32bit_asm)
5174     {
5175         asm pure nothrow @nogc @trusted
5176         {
5177             movdqu XMM0, a;
5178             movdqu XMM1, b;
5179             punpcklwd XMM0, XMM1;
5180             movdqu a, XMM0;
5181         }
5182         return a;
5183     }
5184     else
5185     {
5186         short8 r = void;
5187         short8 sa = cast(short8)a;
5188         short8 sb = cast(short8)b;
5189         r.ptr[0] = sa.array[0];
5190         r.ptr[1] = sb.array[0];
5191         r.ptr[2] = sa.array[1];
5192         r.ptr[3] = sb.array[1];
5193         r.ptr[4] = sa.array[2];
5194         r.ptr[5] = sb.array[2];
5195         r.ptr[6] = sa.array[3];
5196         r.ptr[7] = sb.array[3];
5197         return cast(__m128i)r;
5198     }
5199 }
5200 unittest
5201 {
5202     __m128i A = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
5203     __m128i B = _mm_setr_epi16(8, 9, 10, 11, 12, 13, 14, 15);
5204     short8 C = cast(short8) _mm_unpacklo_epi16(A, B);
5205     short[8] correct = [0, 8, 1, 9, 2, 10, 3, 11];
5206     assert(C.array == correct);
5207 }
5208 
5209 /// Unpack and interleave 32-bit integers from the low half of `a` and `b`.
5210 __m128i _mm_unpacklo_epi32 (__m128i a, __m128i b) pure @trusted
5211 {
5212     // PERF DMD
5213     static if (GDC_with_SSE2)
5214     {
5215         return __builtin_ia32_punpckldq128(a, b);
5216     }
5217     else version(LDC)
5218     {
5219         return shufflevectorLDC!(int4, 0, 4, 1, 5)(cast(int4)a, cast(int4)b);
5220     }
5221     else
5222     {
5223         __m128i r;
5224         r.ptr[0] = a.array[0];
5225         r.ptr[1] = b.array[0];
5226         r.ptr[2] = a.array[1];
5227         r.ptr[3] = b.array[1];
5228         return r;
5229     }
5230 }
5231 unittest
5232 {
5233     __m128i A = _mm_setr_epi32(1, 2, 3, 4);
5234     __m128i B = _mm_setr_epi32(5, 6, 7, 8);
5235     __m128i C = _mm_unpacklo_epi32(A, B);
5236     int[4] correct = [1, 5, 2, 6];
5237     assert(C.array == correct);
5238 }
5239 
5240 /// Unpack and interleave 64-bit integers from the low half of `a` and `b`.
5241 __m128i _mm_unpacklo_epi64 (__m128i a, __m128i b) pure @trusted
5242 {
5243     static if (GDC_with_SSE2)
5244     {
5245         return cast(__m128i) __builtin_ia32_punpcklqdq128(cast(long2) a, cast(long2) b);
5246     }
5247     else
5248     {
5249         long2 lA = cast(long2)a;
5250         long2 lB = cast(long2)b;
5251         long2 R; // PERF =void;
5252         R.ptr[0] = lA.array[0];
5253         R.ptr[1] = lB.array[0];
5254         return cast(__m128i)R;
5255     }
5256 }
5257 unittest // Issue #36
5258 {
5259     __m128i A = _mm_setr_epi64(0x22222222_22222222, 0x33333333_33333333);
5260     __m128i B = _mm_setr_epi64(0x44444444_44444444, 0x55555555_55555555);
5261     long2 C = cast(long2)(_mm_unpacklo_epi64(A, B));
5262     long[2] correct = [0x22222222_22222222, 0x44444444_44444444];
5263     assert(C.array == correct);
5264 }
5265 
5266 /// Unpack and interleave 8-bit integers from the low half of `a` and `b`.
5267 __m128i _mm_unpacklo_epi8 (__m128i a, __m128i b) pure @trusted
5268 {
5269     // PERF DMD D_SIMD
5270     static if (GDC_with_SSE2)
5271     {
5272         return cast(__m128i) __builtin_ia32_punpcklbw128(cast(ubyte16) a, cast(ubyte16) b);
5273     }
5274     else static if (DMD_with_32bit_asm)
5275     {
5276         asm pure nothrow @nogc @trusted
5277         {
5278             movdqu XMM0, a;
5279             movdqu XMM1, b;
5280             punpcklbw XMM0, XMM1;
5281             movdqu a, XMM0;
5282         }
5283         return a;
5284     }
5285     else version(LDC)
5286     {
5287         return cast(__m128i) shufflevectorLDC!(byte16, 0, 16, 1, 17, 2, 18, 3, 19,
5288                                                        4, 20, 5, 21, 6, 22, 7, 23)
5289                                                        (cast(byte16)a, cast(byte16)b); 
5290     }
5291     else
5292     {
5293         byte16 r = void;
5294         byte16 ba = cast(byte16)a;
5295         byte16 bb = cast(byte16)b;
5296         r.ptr[0] = ba.array[0];
5297         r.ptr[1] = bb.array[0];
5298         r.ptr[2] = ba.array[1];
5299         r.ptr[3] = bb.array[1];
5300         r.ptr[4] = ba.array[2];
5301         r.ptr[5] = bb.array[2];
5302         r.ptr[6] = ba.array[3];
5303         r.ptr[7] = bb.array[3];
5304         r.ptr[8] = ba.array[4];
5305         r.ptr[9] = bb.array[4];
5306         r.ptr[10] = ba.array[5];
5307         r.ptr[11] = bb.array[5];
5308         r.ptr[12] = ba.array[6];
5309         r.ptr[13] = bb.array[6];
5310         r.ptr[14] = ba.array[7];
5311         r.ptr[15] = bb.array[7];
5312         return cast(__m128i)r;
5313     }
5314 }
5315 unittest
5316 {
5317     __m128i A = _mm_setr_epi8( 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15);
5318     __m128i B = _mm_setr_epi8(16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31);
5319     byte16 C = cast(byte16) _mm_unpacklo_epi8(A, B);
5320     byte[16] correct = [0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23];
5321     assert(C.array == correct);
5322 }
5323 
5324 /// Unpack and interleave double-precision (64-bit) floating-point elements from the low half of `a` and `b`.
5325 __m128d _mm_unpacklo_pd (__m128d a, __m128d b) pure @trusted
5326 {
5327     // PERF DMD D_SIMD
5328     static if (GDC_with_SSE2)
5329     {
5330         return __builtin_ia32_unpcklpd(a, b);
5331     }
5332     else version(LDC)
5333     {
5334         return shufflevectorLDC!(__m128d, 0, 2)(a, b);
5335     }
5336     else
5337     {
5338         double2 r = void;
5339         r.ptr[0] = a.array[0];
5340         r.ptr[1] = b.array[0];
5341         return r;
5342     }
5343 }
5344 unittest
5345 {
5346     __m128d A = _mm_setr_pd(4.0, 6.0);
5347     __m128d B = _mm_setr_pd(7.0, 9.0);
5348     __m128d C = _mm_unpacklo_pd(A, B);
5349     double[2] correct = [4.0, 7.0];
5350     assert(C.array == correct);
5351 }
5352 
5353 /// Compute the bitwise XOR of packed double-precision (64-bit) floating-point elements in `a` and `b`.
5354 __m128d _mm_xor_pd (__m128d a, __m128d b) pure @safe
5355 {
5356     return cast(__m128d)(cast(__m128i)a ^ cast(__m128i)b);
5357 }
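unittest
{
    // XOR of a double with its negation flips only the IEEE-754 sign bit.
    __m128d A = _mm_setr_pd( 4.0, -6.0);
    __m128d B = _mm_setr_pd(-4.0,  6.0);
    long2 R = cast(long2) _mm_xor_pd(A, B);
    long[2] correct = [0x8000_0000_0000_0000, 0x8000_0000_0000_0000];
    assert(R.array == correct);
}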
5359 
5360 /// Compute the bitwise XOR of 128 bits (representing integer data) in `a` and `b`.
5361 __m128i _mm_xor_si128 (__m128i a, __m128i b) pure @safe
5362 {
5363     return a ^ b;
5364 }
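unittest
{
    // XOR with -1 (all ones) is bitwise complement; XOR with 0 is identity.
    __m128i A = _mm_setr_epi32( 0, 1,  2, 3);
    __m128i B = _mm_setr_epi32(-1, 0, -1, 0);
    int4 R = _mm_xor_si128(A, B);
    int[4] correct = [~0, 1, ~2, 3];
    assert(R.array == correct);
}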
5366 
5367 unittest
5368 {
    // End-to-end smoke test: 4-D Euclidean distance via a manual horizontal sum.
    float distance(float[4] a, float[4] b) nothrow @nogc
    {
        __m128 va = _mm_loadu_ps(a.ptr);
        __m128 vb = _mm_loadu_ps(b.ptr);
        __m128 diffSquared = _mm_sub_ps(va, vb);
        diffSquared = _mm_mul_ps(diffSquared, diffSquared);
        // Horizontal sum: shift the register right by 8 then 4 bytes,
        // accumulating the upper lanes into the lowest one.
        __m128 sum = _mm_add_ps(diffSquared, _mm_srli_ps!8(diffSquared));
        sum = _mm_add_ps(sum, _mm_srli_ps!4(sum));
        return _mm_cvtss_f32(_mm_sqrt_ss(sum));
    }
5378     }
5379     assert(distance([0, 2, 0, 0], [0, 0, 0, 0]) == 2);
5380 }