1 /**
2 * SSE2 intrinsics. 
3 * https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#techs=SSE2
4 *
5 * Copyright: Copyright Guillaume Piolat 2016-2020, Stefanos Baziotis 2019.
6 * License:   $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
7 */
8 module inteli.emmintrin;
9 
10 public import inteli.types;
11 public import inteli.xmmintrin; // SSE2 includes SSE1
12 import inteli.mmx;
13 import inteli.internals;
14 
15 nothrow @nogc:
16 
17 
18 // SSE2 instructions
19 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#techs=SSE2
20 
21 /// Add packed 16-bit integers in `a` and `b`.
22 __m128i _mm_add_epi16 (__m128i a, __m128i b) pure @safe
23 {
24     pragma(inline, true);
25     return cast(__m128i)(cast(short8)a + cast(short8)b);
26 }
27 unittest
28 {
29     __m128i A = _mm_setr_epi16(4, 8, 13, -7, -1, 0, 9, 77);
30     short8 R = cast(short8) _mm_add_epi16(A, A);
31     short[8] correct = [8, 16, 26, -14, -2, 0, 18, 154];
32     assert(R.array == correct);
33 }
34 
35 /// Add packed 32-bit integers in `a` and `b`.
36 __m128i _mm_add_epi32 (__m128i a, __m128i b) pure @safe
37 {
38     pragma(inline, true);
39     return cast(__m128i)(cast(int4)a + cast(int4)b);
40 }
41 unittest
42 {
43     __m128i A = _mm_setr_epi32( -7, -1, 0, 9);
44     int4 R = _mm_add_epi32(A, A);
45     int[4] correct = [ -14, -2, 0, 18 ];
46     assert(R.array == correct);
47 }
48 
49 /// Add packed 64-bit integers in `a` and `b`.
50 __m128i _mm_add_epi64 (__m128i a, __m128i b) pure @safe
51 {
52     pragma(inline, true);
53     return cast(__m128i)(cast(long2)a + cast(long2)b);
54 }
55 unittest
56 {
57     __m128i A = _mm_setr_epi64(-1, 0x8000_0000_0000_0000);
58     long2 R = cast(long2) _mm_add_epi64(A, A);
59     long[2] correct = [ -2, 0 ];
60     assert(R.array == correct);
61 }
62 
63 /// Add packed 8-bit integers in `a` and `b`.
64 __m128i _mm_add_epi8 (__m128i a, __m128i b) pure @safe
65 {
66     pragma(inline, true);
67     return cast(__m128i)(cast(byte16)a + cast(byte16)b);
68 }
69 unittest
70 {
71     __m128i A = _mm_setr_epi8(4, 8, 13, -7, -1, 0, 9, 77, 4, 8, 13, -7, -1, 0, 9, 78);
72     byte16 R = cast(byte16) _mm_add_epi8(A, A);
73     byte[16] correct = [8, 16, 26, -14, -2, 0, 18, -102, 8, 16, 26, -14, -2, 0, 18, -100];
74     assert(R.array == correct);
75 }
76 
77 /// Add the lower double-precision (64-bit) floating-point element 
78 /// in `a` and `b`, store the result in the lower element, 
79 /// and copy the upper element from `a` to the upper element. 
80 __m128d _mm_add_sd(__m128d a, __m128d b) pure @safe
81 {
82     static if (GDC_with_SSE2)
83     {
84         return __builtin_ia32_addsd(a, b);
85     }
86     else version(DigitalMars)
87     {
88         // Work-around for https://issues.dlang.org/show_bug.cgi?id=19599
89         // Likely unneeded since DMD >= 2.094.0, but this hasn't been re-investigated.
90         asm pure nothrow @nogc @trusted { nop;}
91         a[0] = a[0] + b[0];
92         return a;
93     }
94     else
95     {
96         a[0] += b[0];
97         return a;
98     }
99 }
100 unittest
101 {
102     __m128d a = [1.5, -2.0];
103     a = _mm_add_sd(a, a);
104     assert(a.array == [3.0, -2.0]);
105 }
106 
107 /// Add packed double-precision (64-bit) floating-point elements in `a` and `b`.
108 __m128d _mm_add_pd (__m128d a, __m128d b) pure @safe
109 {
110     pragma(inline, true);
111     return a + b;
112 }
113 unittest
114 {
115     __m128d a = [1.5, -2.0];
116     a = _mm_add_pd(a, a);
117     assert(a.array == [3.0, -4.0]);
118 }
119 
120 /// Add 64-bit integers `a` and `b`.
121 __m64 _mm_add_si64 (__m64 a, __m64 b) pure @safe
122 {
123     pragma(inline, true);
124     return a + b;
125 }
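// Illustrative check, assuming `__m64` is the single-lane long vector `long1` from inteli.types.
unittest
{
    __m64 A, B;
    A.ptr[0] = 7;
    B.ptr[0] = -3;
    long1 R = _mm_add_si64(A, B);
    assert(R.array[0] == 4);
}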
126 
127 /// Add packed 16-bit integers in `a` and `b` using signed saturation.
128 __m128i _mm_adds_epi16(__m128i a, __m128i b) pure @trusted
129 {
130     static if (GDC_with_SSE2)
131     {
132         return cast(__m128i)__builtin_ia32_paddsw128(cast(short8)a, cast(short8)b);
133     }
134     else version(LDC)
135     {
136         static if (__VERSION__ >= 2085) // saturation x86 intrinsics disappeared in LLVM 8
137         {
138             // x86: Generates PADDSW since LDC 1.15 -O0
139             // ARM: Generates sqadd.8h since LDC 1.21 -O1, really bad in <= 1.20            
140             enum prefix = `declare <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> %a, <8 x i16> %b)`;
141             enum ir = `
142                 %r = call <8 x i16> @llvm.sadd.sat.v8i16( <8 x i16> %0, <8 x i16> %1)
143                 ret <8 x i16> %r`;
144             return cast(__m128i) LDCInlineIREx!(prefix, ir, "", short8, short8, short8)(cast(short8)a, cast(short8)b);
145         }
146         else static if (LDC_with_ARM) // Raspberry ships with LDC 1.12, no saturation 
147         {
148             short[8] res;
149             short8 sa = cast(short8)a;
150             short8 sb = cast(short8)b;
151             foreach(i; 0..8)
152                 res[i] = saturateSignedIntToSignedShort(sa.array[i] + sb.array[i]);
153             return _mm_loadu_si128(cast(int4*)res.ptr);
154         }
155         else
156             return cast(__m128i) __builtin_ia32_paddsw128(cast(short8)a, cast(short8)b);
157     }
158     else
159     {
160         short[8] res;
161         short8 sa = cast(short8)a;
162         short8 sb = cast(short8)b;
163         foreach(i; 0..8)
164             res[i] = saturateSignedIntToSignedShort(sa.array[i] + sb.array[i]);
165         return _mm_loadu_si128(cast(int4*)res.ptr);
166     }
167 }
168 unittest
169 {
170     short8 res = cast(short8) _mm_adds_epi16(_mm_set_epi16(7, 6, 5, 4, 3, 2, 1, 0),
171                                              _mm_set_epi16(7, 6, 5, 4, 3, 2, 1, 0));
172     static immutable short[8] correctResult = [0, 2, 4, 6, 8, 10, 12, 14];
173     assert(res.array == correctResult);
174 }
175 
176 /// Add packed 8-bit signed integers in `a` and `b` using signed saturation.
177 __m128i _mm_adds_epi8(__m128i a, __m128i b) pure @trusted
178 {
179     static if (GDC_with_SSE2)
180     {
181         return cast(__m128i) __builtin_ia32_paddsb128(cast(ubyte16)a, cast(ubyte16)b);
182     }
183     else version(LDC)
184     {
185         static if (__VERSION__ >= 2085) // saturation x86 intrinsics disappeared in LLVM 8
186         {
187             // x86: Generates PADDSB since LDC 1.15 -O0
188             // ARM: Generates sqadd.16b since LDC 1.21 -O1, really bad in <= 1.20
189             enum prefix = `declare <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8> %a, <16 x i8> %b)`;
190             enum ir = `
191                 %r = call <16 x i8> @llvm.sadd.sat.v16i8( <16 x i8> %0, <16 x i8> %1)
192                 ret <16 x i8> %r`;
193             return cast(__m128i) LDCInlineIREx!(prefix, ir, "", byte16, byte16, byte16)(cast(byte16)a, cast(byte16)b);
194         }
195         else static if (LDC_with_ARM) // Raspberry ships with LDC 1.12, no saturation 
196         {
197             byte[16] res;
198             byte16 sa = cast(byte16)a;
199             byte16 sb = cast(byte16)b;
200             foreach(i; 0..16)
201                 res[i] = saturateSignedWordToSignedByte(sa[i] + sb[i]);
202             return _mm_loadu_si128(cast(int4*)res.ptr);
203         }
204         else
205             return cast(__m128i) __builtin_ia32_paddsb128(cast(byte16)a, cast(byte16)b);
206     }
207     else
208     {
209         byte[16] res;
210         byte16 sa = cast(byte16)a;
211         byte16 sb = cast(byte16)b;
212         foreach(i; 0..16)
213             res[i] = saturateSignedWordToSignedByte(sa[i] + sb[i]);
214         return _mm_loadu_si128(cast(int4*)res.ptr);
215     }
216 }
217 unittest
218 {
219     byte16 res = cast(byte16) _mm_adds_epi8(_mm_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0),
220                                             _mm_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0));
221     static immutable byte[16] correctResult = [0, 2, 4, 6, 8, 10, 12, 14,
222                                                16, 18, 20, 22, 24, 26, 28, 30];
223     assert(res.array == correctResult);
224 }
225 
226 /// Add packed 8-bit unsigned integers in `a` and `b` using unsigned saturation.
227 // PERF: #GDC version?
228 __m128i _mm_adds_epu8(__m128i a, __m128i b) pure @trusted
229 {
230     version(LDC)
231     {
232         static if (__VERSION__ >= 2085) // saturation x86 intrinsics disappeared in LLVM 8
233         {
234             // x86: Generates PADDUSB since LDC 1.15 -O0
235             // ARM: Generates uqadd.16b since LDC 1.21 -O1
236             enum prefix = `declare <16 x i8> @llvm.uadd.sat.v16i8(<16 x i8> %a, <16 x i8> %b)`;
237             enum ir = `
238                 %r = call <16 x i8> @llvm.uadd.sat.v16i8( <16 x i8> %0, <16 x i8> %1)
239                 ret <16 x i8> %r`;
240             return cast(__m128i) LDCInlineIREx!(prefix, ir, "", byte16, byte16, byte16)(cast(byte16)a, cast(byte16)b);
241         }
242         else static if (LDC_with_ARM) // Raspberry ships with LDC 1.12, no saturation 
243         {
244             ubyte[16] res;
245             byte16 sa = cast(byte16)a;
246             byte16 sb = cast(byte16)b;
247             foreach(i; 0..16)
248                 res[i] = saturateSignedWordToUnsignedByte(cast(ubyte)(sa.array[i]) + cast(ubyte)(sb.array[i]));
249             return _mm_loadu_si128(cast(int4*)res.ptr);
250         }
251         else
252             return __builtin_ia32_paddusb128(a, b);
253     }
254     else
255     {
256         ubyte[16] res;
257         byte16 sa = cast(byte16)a;
258         byte16 sb = cast(byte16)b;
259         foreach(i; 0..16)
260             res[i] = saturateSignedWordToUnsignedByte(cast(ubyte)(sa.array[i]) + cast(ubyte)(sb.array[i]));
261         return _mm_loadu_si128(cast(int4*)res.ptr);
262     }
263 }
264 unittest
265 {
266     byte16 res = cast(byte16) 
267         _mm_adds_epu8(_mm_set_epi8(7, 6, 5, 4, 3, 2, cast(byte)255, 0, 7, 6, 5, 4, 3, 2, cast(byte)255, 0),
268                       _mm_set_epi8(7, 6, 5, 4, 3, 2, 1, 0, 7, 6, 5, 4, 3, 2, 1, 0));
269     static immutable byte[16] correctResult = [0, cast(byte)255, 4, 6, 8, 10, 12, 14, 
270                                                0, cast(byte)255, 4, 6, 8, 10, 12, 14];
271     assert(res.array == correctResult);
272 }
273 
274 /// Add packed unsigned 16-bit integers in `a` and `b` using unsigned saturation.
275 // PERF: #GDC version?
276 __m128i _mm_adds_epu16(__m128i a, __m128i b) pure @trusted
277 {
278     version(LDC)
279     {
280         static if (__VERSION__ >= 2085) // saturation x86 intrinsics disappeared in LLVM 8
281         {
282             // x86: Generates PADDUSW since LDC 1.15 -O0
283             // ARM: Generates uqadd.8h since LDC 1.21 -O1
284             enum prefix = `declare <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> %a, <8 x i16> %b)`;
285             enum ir = `
286                 %r = call <8 x i16> @llvm.uadd.sat.v8i16( <8 x i16> %0, <8 x i16> %1)
287                 ret <8 x i16> %r`;
288             return cast(__m128i) LDCInlineIREx!(prefix, ir, "", short8, short8, short8)(cast(short8)a, cast(short8)b);
289         }
290         else static if (LDC_with_ARM) // Raspberry ships with LDC 1.12, no saturation 
291         {
292             ushort[8] res;
293             short8 sa = cast(short8)a;
294             short8 sb = cast(short8)b;
295             foreach(i; 0..8)
296                 res[i] = saturateSignedIntToUnsignedShort(cast(ushort)(sa.array[i]) + cast(ushort)(sb.array[i]));
297             return _mm_loadu_si128(cast(int4*)res.ptr);
298         }
299         else
300             return __builtin_ia32_paddusw128(a, b);
301     }
302     else
303     {
304         ushort[8] res;
305         short8 sa = cast(short8)a;
306         short8 sb = cast(short8)b;
307         foreach(i; 0..8)
308             res[i] = saturateSignedIntToUnsignedShort(cast(ushort)(sa.array[i]) + cast(ushort)(sb.array[i]));
309         return _mm_loadu_si128(cast(int4*)res.ptr);
310     }
311 }
312 unittest
313 {
314     short8 res = cast(short8) _mm_adds_epu16(_mm_set_epi16(3, 2, cast(short)65535, 0, 3, 2, cast(short)65535, 0),
315                                              _mm_set_epi16(3, 2, 1, 0, 3, 2, 1, 0));
316     static immutable short[8] correctResult = [0, cast(short)65535, 4, 6, 0, cast(short)65535, 4, 6];
317     assert(res.array == correctResult);
318 }
319 
320 /// Compute the bitwise AND of packed double-precision (64-bit) 
321 /// floating-point elements in `a` and `b`.
322 __m128d _mm_and_pd (__m128d a, __m128d b) pure @safe
323 {
324     pragma(inline, true);
325     return cast(__m128d)( cast(long2)a & cast(long2)b );
326 }
327 unittest
328 {
329     double a = 4.32;
330     double b = -78.99;
331     long correct = (*cast(long*)(&a)) & (*cast(long*)(&b));
332     __m128d A = _mm_set_pd(a, b);
333     __m128d B = _mm_set_pd(b, a);
334     long2 R = cast(long2)( _mm_and_pd(A, B) );
335     assert(R.array[0] == correct);
336     assert(R.array[1] == correct);
337 }
338 
339 /// Compute the bitwise AND of 128 bits (representing integer data) in `a` and `b`.
340 __m128i _mm_and_si128 (__m128i a, __m128i b) pure @safe
341 {
342     pragma(inline, true);
343     return a & b;
344 }
345 unittest
346 {
347     __m128i A = _mm_set1_epi32(7);
348     __m128i B = _mm_set1_epi32(14);
349     __m128i R = _mm_and_si128(A, B);
350     int[4] correct = [6, 6, 6, 6];
351     assert(R.array == correct);
352 }
353 
354 /// Compute the bitwise NOT of packed double-precision (64-bit) 
355 /// floating-point elements in `a` and then AND with `b`.
356 __m128d _mm_andnot_pd (__m128d a, __m128d b) pure @safe
357 {
358     return cast(__m128d)( ~(cast(long2)a) & cast(long2)b);
359 }
360 unittest
361 {
362     double a = 4.32;
363     double b = -78.99;
364     long correct  = (~*cast(long*)(&a)) & ( *cast(long*)(&b));
365     long correct2 = ( *cast(long*)(&a)) & (~*cast(long*)(&b));
366     __m128d A = _mm_setr_pd(a, b);
367     __m128d B = _mm_setr_pd(b, a);
368     long2 R = cast(long2)( _mm_andnot_pd(A, B) );
369     assert(R.array[0] == correct);
370     assert(R.array[1] == correct2);
371 }
372 
373 /// Compute the bitwise NOT of 128 bits (representing integer data) 
374 /// in `a` and then AND with `b`.
375 __m128i _mm_andnot_si128 (__m128i a, __m128i b) pure @safe
376 {
377     return (~a) & b;
378 }
379 unittest
380 {
381     __m128i A = _mm_set1_epi32(7);
382     __m128i B = _mm_set1_epi32(14);
383     __m128i R = _mm_andnot_si128(A, B);
384     int[4] correct = [8, 8, 8, 8];
385     assert(R.array == correct);
386 }
387 
388 /// Average packed unsigned 16-bit integers in `a` and `b`.
389 __m128i _mm_avg_epu16 (__m128i a, __m128i b) pure @trusted
390 {
391     static if (GDC_with_SSE2)
392     {
393         return cast(__m128i) __builtin_ia32_pavgw128(cast(short8)a, cast(short8)b);
394     }
395     else static if (LDC_with_ARM64)
396     {
397         return cast(__m128i) vrhadd_u16(cast(short8)a, cast(short8)b);
398     }
399     else version(LDC)
400     {
401         // Generates pavgw since LDC 1.0, even at -O0,
402         // but not on ARM.
403         enum ir = `
404             %ia = zext <8 x i16> %0 to <8 x i32>
405             %ib = zext <8 x i16> %1 to <8 x i32>
406             %isum = add <8 x i32> %ia, %ib
407             %isum1 = add <8 x i32> %isum, < i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
408             %isums = lshr <8 x i32> %isum1, < i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
409             %r = trunc <8 x i32> %isums to <8 x i16>
410             ret <8 x i16> %r`;
411         return cast(__m128i) LDCInlineIR!(ir, short8, short8, short8)(cast(short8)a, cast(short8)b);
412     }
413     else
414     {
415         short8 sa = cast(short8)a;
416         short8 sb = cast(short8)b;
417         short8 sr = void;
418         foreach(i; 0..8)
419         {
420             sr.ptr[i] = cast(ushort)( (cast(ushort)(sa.array[i]) + cast(ushort)(sb.array[i]) + 1) >> 1 );
421         }
422         return cast(int4)sr;
423     }
424 }
425 unittest
426 {
427     __m128i A = _mm_set1_epi16(31);
428     __m128i B = _mm_set1_epi16(64);
429     short8 avg = cast(short8)(_mm_avg_epu16(A, B));
430     foreach(i; 0..8)
431         assert(avg.array[i] == 48);
432 }
433 
434 /// Average packed unsigned 8-bit integers in `a` and `b`.
435 __m128i _mm_avg_epu8 (__m128i a, __m128i b) pure @trusted
436 {
437     static if (GDC_with_SSE2)
438     {
439         return cast(__m128i) __builtin_ia32_pavgb128(cast(ubyte16)a, cast(ubyte16)b);
440     }
441     else static if (LDC_with_ARM64)
442     {
443         return cast(__m128i) vrhadd_u8(cast(byte16)a, cast(byte16)b);
444     }
445     else version(LDC)
446     {
447         // Generates pavgb since LDC 1.0, even at -O0,
448         // but not on ARM.
449         enum ir = `
450             %ia = zext <16 x i8> %0 to <16 x i16>
451             %ib = zext <16 x i8> %1 to <16 x i16>
452             %isum = add <16 x i16> %ia, %ib
453             %isum1 = add <16 x i16> %isum, < i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
454             %isums = lshr <16 x i16> %isum1, < i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
455             %r = trunc <16 x i16> %isums to <16 x i8>
456             ret <16 x i8> %r`;
457         return cast(__m128i) LDCInlineIR!(ir, byte16, byte16, byte16)(cast(byte16)a, cast(byte16)b);
458     }
459     else
460     {
461         byte16 sa = cast(byte16)a;
462         byte16 sb = cast(byte16)b;
463         byte16 sr = void;
464         foreach(i; 0..16)
465         {
466             sr[i] = cast(ubyte)( (cast(ubyte)(sa[i]) + cast(ubyte)(sb[i]) + 1) >> 1 );
467         }
468         return cast(int4)sr;
469     }
470 }
471 unittest
472 {
473     __m128i A = _mm_set1_epi8(31);
474     __m128i B = _mm_set1_epi8(64);
475     byte16 avg = cast(byte16)(_mm_avg_epu8(A, B));
476     foreach(i; 0..16)
477         assert(avg.array[i] == 48);
478 }
479 
480 /// Shift `a` left by `bytes` bytes while shifting in zeros.
481 alias _mm_bslli_si128 = _mm_slli_si128;
482 unittest
483 {
484     __m128i toShift = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
485     byte[16] exact =               [0, 0, 0, 0, 0, 0, 1, 2, 3, 4,  5,  6,  7,  8,  9, 10];
486     __m128i result = _mm_bslli_si128!5(toShift);
487     assert( (cast(byte16)result).array == exact);
488 }
489 
490 /// Shift `v` right by `bytes` bytes while shifting in zeros.
491 alias _mm_bsrli_si128 = _mm_srli_si128;
492 unittest
493 {
494     __m128i toShift = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
495     byte[16] exact =               [5, 6, 7, 8, 9,10,11,12,13,14, 15,  0,  0,  0,  0,  0];
496     __m128i result = _mm_bsrli_si128!5(toShift);
497     assert( (cast(byte16)result).array == exact);
498 }
499 
500 /// Cast vector of type `__m128d` to type `__m128`. 
501 /// Note: Also possible with a regular `cast(__m128)(a)`.
502 __m128 _mm_castpd_ps (__m128d a) pure @safe
503 {
504     return cast(__m128)a;
505 }
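// Illustrative check: the cast only reinterprets bits, so a round-trip through `_mm_castps_pd` preserves the values.
unittest
{
    __m128d A = _mm_setr_pd(1.0, -2.0);
    __m128 F = _mm_castpd_ps(A);
    __m128d back = _mm_castps_pd(F);
    assert(back.array[0] == 1.0);
    assert(back.array[1] == -2.0);
}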
506 
507 /// Cast vector of type `__m128d` to type `__m128i`. 
508 /// Note: Also possible with a regular `cast(__m128i)(a)`.
509 __m128i _mm_castpd_si128 (__m128d a) pure @safe
510 {
511     return cast(__m128i)a;
512 }
513 
514 /// Cast vector of type `__m128` to type `__m128d`. 
515 /// Note: Also possible with a regular `cast(__m128d)(a)`.
516 __m128d _mm_castps_pd (__m128 a) pure @safe
517 {
518     return cast(__m128d)a;
519 }
520 
521 /// Cast vector of type `__m128` to type `__m128i`. 
522 /// Note: Also possible with a regular `cast(__m128i)(a)`.
523 __m128i _mm_castps_si128 (__m128 a) pure @safe
524 {
525     return cast(__m128i)a;
526 }
527 
528 /// Cast vector of type `__m128i` to type `__m128d`. 
529 /// Note: Also possible with a regular `cast(__m128d)(a)`.
530 __m128d _mm_castsi128_pd (__m128i a) pure @safe
531 {
532     return cast(__m128d)a;
533 }
534 
535 /// Cast vector of type `__m128i` to type `__m128`. 
536 /// Note: Also possible with a regular `cast(__m128)(a)`.
537 __m128 _mm_castsi128_ps (__m128i a) pure @safe
538 {
539     return cast(__m128)a;
540 }
541 
542 /// Invalidate and flush the cache line that contains `p` 
543 /// from all levels of the cache hierarchy.
544 void _mm_clflush (const(void)* p) @trusted
545 {
546     static if (GDC_with_SSE2)
547     {
548         __builtin_ia32_clflush(p);
549     }
550     else static if (LDC_with_SSE2)
551     {
552         __builtin_ia32_clflush(cast(void*)p);
553     }
554     else version(D_InlineAsm_X86)
555     {
556         asm pure nothrow @nogc @safe
557         {
558             mov EAX, p;
559             clflush [EAX];
560         }
561     }
562     else version(D_InlineAsm_X86_64)
563     {
564         asm pure nothrow @nogc @safe
565         {
566             mov RAX, p;
567             clflush [RAX];
568         }
569     }
570     else 
571     {
572         // Do nothing. Not flushing the cache line
573         // does not affect correctness.
574     }
575 }
576 unittest
577 {
578     ubyte[64] cacheline;
579     _mm_clflush(cacheline.ptr);
580 }
581 
582 /// Compare packed 16-bit integers in `a` and `b` for equality.
583 __m128i _mm_cmpeq_epi16 (__m128i a, __m128i b) pure @safe
584 {
585     static if (GDC_with_SSE2)
586     {
587         return cast(__m128i) __builtin_ia32_pcmpeqw128(cast(short8)a, cast(short8)b);
588     }
589     else
590     {
591         return cast(__m128i) equalMask!short8(cast(short8)a, cast(short8)b);
592     }
593 }
594 unittest
595 {
596     short8   A = [-3, -2, -1,  0,  0,  1,  2,  3];
597     short8   B = [ 4,  3,  2,  1,  0, -1, -2, -3];
598     short[8] E = [ 0,  0,  0,  0, -1,  0,  0,  0];
599     short8   R = cast(short8)(_mm_cmpeq_epi16(cast(__m128i)A, cast(__m128i)B));
600     assert(R.array == E);
601 }
602 
603 /// Compare packed 32-bit integers in `a` and `b` for equality.
604 __m128i _mm_cmpeq_epi32 (__m128i a, __m128i b) pure @safe
605 {
606     static if (GDC_with_SSE2)
607     {
608         return __builtin_ia32_pcmpeqd128(a, b);
609     }
610     else
611     {
612         return equalMask!__m128i(a, b);
613     }
614 }
615 unittest
616 {
617     int4   A = [-3, -2, -1,  0];
618     int4   B = [ 4, -2,  2,  0];
619     int[4] E = [ 0, -1,  0, -1];
620     int4   R = cast(int4)(_mm_cmpeq_epi32(A, B));
621     assert(R.array == E);
622 }
623 
624 /// Compare packed 8-bit integers in `a` and `b` for equality.
625 __m128i _mm_cmpeq_epi8 (__m128i a, __m128i b) pure @safe
626 {
627     static if (GDC_with_SSE2)
628     {
629         return cast(__m128i) __builtin_ia32_pcmpeqb128(cast(ubyte16)a, cast(ubyte16)b);
630     }
631     else
632     {
633         return cast(__m128i) equalMask!byte16(cast(byte16)a, cast(byte16)b);
634     }
635 }
636 unittest
637 {
638     __m128i A = _mm_setr_epi8(1, 2, 3, 1, 2, 1, 1, 2, 3, 2, 1, 0, 0, 1, 2, 1);
639     __m128i B = _mm_setr_epi8(2, 2, 1, 2, 3, 1, 2, 3, 2, 1, 0, 0, 1, 2, 1, 1);
640     byte16 C = cast(byte16) _mm_cmpeq_epi8(A, B);
641     byte[16] correct =       [0,-1, 0, 0, 0,-1, 0, 0, 0, 0, 0,-1, 0, 0, 0, -1];
642     assert(C.array == correct);
643 }
644 
645 /// Compare packed double-precision (64-bit) floating-point elements 
646 /// in `a` and `b` for equality.
647 __m128d _mm_cmpeq_pd (__m128d a, __m128d b) pure @safe
648 {
649     static if (GDC_with_SSE2)
650     {
651         return __builtin_ia32_cmpeqpd(a, b);
652     }
653     else
654     {
655         return cast(__m128d) cmppd!(FPComparison.oeq)(a, b);
656     }
657 }
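// Illustrative check of the expected mask semantics: all-ones where equal, zero elsewhere.
unittest
{
    __m128d A = _mm_setr_pd(1.0, 2.0);
    __m128d B = _mm_setr_pd(1.0, 3.0);
    long2 R = cast(long2) _mm_cmpeq_pd(A, B);
    assert(R.array[0] == -1);
    assert(R.array[1] == 0);
}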
658 
659 /// Compare the lower double-precision (64-bit) floating-point elements
660 /// in `a` and `b` for equality, store the result in the lower element,
661 /// and copy the upper element from `a`.
662 __m128d _mm_cmpeq_sd (__m128d a, __m128d b) pure @safe
663 {
664     static if (GDC_with_SSE2)
665     {
666         return __builtin_ia32_cmpeqsd(a, b);
667     }
668     else
669     {
670         return cast(__m128d) cmpsd!(FPComparison.oeq)(a, b);
671     }
672 }
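// Illustrative check: only the lower lane holds a comparison mask; the upper lane is copied from `a`.
unittest
{
    __m128d A = _mm_setr_pd(1.0, 2.0);
    __m128d B = _mm_setr_pd(1.0, 3.0);
    long2 R = cast(long2) _mm_cmpeq_sd(A, B);
    assert(R.array[0] == -1);
    assert(R.array[1] == 0x4000_0000_0000_0000); // bit pattern of 2.0, taken from `a`
}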
673 
674 /// Compare packed double-precision (64-bit) floating-point elements 
675 /// in `a` and `b` for greater-than-or-equal.
676 __m128d _mm_cmpge_pd (__m128d a, __m128d b) pure @safe
677 {
678     static if (GDC_with_SSE2)
679     {
680         return __builtin_ia32_cmpgepd(a, b);
681     }
682     else
683     {
684         return cast(__m128d) cmppd!(FPComparison.oge)(a, b);
685     }
686 }
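// Illustrative check of the expected ordered greater-than-or-equal mask.
unittest
{
    __m128d A = _mm_setr_pd(1.0, 2.0);
    __m128d B = _mm_setr_pd(1.0, 3.0);
    long2 R = cast(long2) _mm_cmpge_pd(A, B);
    assert(R.array[0] == -1); // 1.0 >= 1.0
    assert(R.array[1] == 0);  // 2.0 <  3.0
}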
687 
688 /// Compare the lower double-precision (64-bit) floating-point elements 
689 /// in `a` and `b` for greater-than-or-equal, store the result in the 
690 /// lower element, and copy the upper element from `a`.
691 __m128d _mm_cmpge_sd (__m128d a, __m128d b) pure @safe
692 {
693     // Note: There is no __builtin_ia32_cmpgesd builtin.
694     static if (GDC_with_SSE2)
695     {
696         return __builtin_ia32_cmpnltsd(b, a);
697     }
698     else
699     {
700         return cast(__m128d) cmpsd!(FPComparison.oge)(a, b);
701     }
702 }
703 
704 /// Compare packed 16-bit integers in `a` and `b` for greater-than.
705 __m128i _mm_cmpgt_epi16 (__m128i a, __m128i b) pure @safe
706 {
707     static if (GDC_with_SSE2)
708     {
709         return cast(__m128i) __builtin_ia32_pcmpgtw128(cast(short8)a, cast(short8)b);
710     }
711     else
712     {
713         return cast(__m128i) greaterMask!short8(cast(short8)a, cast(short8)b);
714     }
715 }
716 unittest
717 {
718     short8   A = [-3, -2, -1,  0,  0,  1,  2,  3];
719     short8   B = [ 4,  3,  2,  1,  0, -1, -2, -3];
720     short[8] E = [ 0,  0,  0,  0,  0, -1, -1, -1];
721     short8   R = cast(short8)(_mm_cmpgt_epi16(cast(__m128i)A, cast(__m128i)B));
722     assert(R.array == E);
723 }
724 
725 /// Compare packed 32-bit integers in `a` and `b` for greater-than.
726 __m128i _mm_cmpgt_epi32 (__m128i a, __m128i b) pure @safe
727 {
728     static if (GDC_with_SSE2)
729     {
730         return __builtin_ia32_pcmpgtd128(a, b); 
731     }
732     else
733     {
734         return cast(__m128i)( greaterMask!int4(a, b));
735     }
736 }
737 unittest
738 {
739     int4   A = [-3,  2, -1,  0];
740     int4   B = [ 4, -2,  2,  0];
741     int[4] E = [ 0, -1,  0,  0];
742     int4   R = cast(int4)(_mm_cmpgt_epi32(A, B));
743     assert(R.array == E);
744 }
745 
746 /// Compare packed 8-bit integers in `a` and `b` for greater-than.
747 __m128i _mm_cmpgt_epi8 (__m128i a, __m128i b) pure @safe
748 {
749     static if (GDC_with_SSE2)
750     {
751         return cast(__m128i) __builtin_ia32_pcmpgtb128(cast(ubyte16)a, cast(ubyte16)b);
752     }
753     else
754     {
755         return cast(__m128i) greaterMask!byte16(cast(byte16)a, cast(byte16)b);
756     }
757 }
758 unittest
759 {
760     __m128i A = _mm_setr_epi8(1, 2, 3, 1, 2, 1, 1, 2, 3, 2, 1, 0, 0, 1, 2, 1);
761     __m128i B = _mm_setr_epi8(2, 2, 1, 2, 3, 1, 2, 3, 2, 1, 0, 0, 1, 2, 1, 1);
762     byte16 C = cast(byte16) _mm_cmpgt_epi8(A, B);
763     byte[16] correct =       [0, 0,-1, 0, 0, 0, 0, 0,-1,-1,-1, 0, 0, 0,-1, 0];
764     __m128i D = _mm_cmpeq_epi8(A, B);
765     assert(C.array == correct);
766 }
767 
768 /// Compare packed double-precision (64-bit) floating-point elements 
769 /// in `a` and `b` for greater-than.
770 __m128d _mm_cmpgt_pd (__m128d a, __m128d b) pure @safe
771 {
772     static if (GDC_with_SSE2)
773     {
774         return __builtin_ia32_cmpgtpd(a, b); 
775     }
776     else
777     {
778         return cast(__m128d) cmppd!(FPComparison.ogt)(a, b);
779     }
780 }
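// Illustrative check of the expected ordered greater-than mask.
unittest
{
    __m128d A = _mm_setr_pd(1.0, 4.0);
    __m128d B = _mm_setr_pd(1.0, 3.0);
    long2 R = cast(long2) _mm_cmpgt_pd(A, B);
    assert(R.array[0] == 0);  // 1.0 is not > 1.0
    assert(R.array[1] == -1); // 4.0 > 3.0
}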
781 
782 /// Compare the lower double-precision (64-bit) floating-point elements 
783 /// in `a` and `b` for greater-than, store the result in the lower element,
784 /// and copy the upper element from `a`.
785 __m128d _mm_cmpgt_sd (__m128d a, __m128d b) pure @safe
786 {
787     // Note: There is no __builtin_ia32_cmpgtsd builtin.
788     static if (GDC_with_SSE2)
789     {
790         return __builtin_ia32_cmpnlesd(b, a);
791     }
792     else
793     {
794         return cast(__m128d) cmpsd!(FPComparison.ogt)(a, b);
795     }
796 }
797 
798 /// Compare packed double-precision (64-bit) floating-point elements 
799 /// in `a` and `b` for less-than-or-equal.
800 __m128d _mm_cmple_pd (__m128d a, __m128d b) pure @safe
801 {
802     static if (GDC_with_SSE2)
803     {
804         return __builtin_ia32_cmplepd(a, b); 
805     }
806     else
807     {
808         return cast(__m128d) cmppd!(FPComparison.ole)(a, b);
809     }
810 }
811 
812 /// Compare the lower double-precision (64-bit) floating-point elements 
813 /// in `a` and `b` for less-than-or-equal, store the result in the 
814 /// lower element, and copy the upper element from `a`.
815 __m128d _mm_cmple_sd (__m128d a, __m128d b) pure @safe
816 {
817     static if (GDC_with_SSE2)
818     {
819         return __builtin_ia32_cmplesd(a, b); 
820     }
821     else
822     {
823         return cast(__m128d) cmpsd!(FPComparison.ole)(a, b);
824     }
825 }
826 
827 /// Compare packed 16-bit integers in `a` and `b` for less-than.
828 __m128i _mm_cmplt_epi16 (__m128i a, __m128i b) pure @safe
829 {
830     return _mm_cmpgt_epi16(b, a);
831 }
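// Illustrative check, mirroring the `_mm_cmpgt_epi16` test with the comparison reversed.
unittest
{
    short8   A = [-3, -2, -1,  0,  0,  1,  2,  3];
    short8   B = [ 4,  3,  2,  1,  0, -1, -2, -3];
    short[8] E = [-1, -1, -1, -1,  0,  0,  0,  0];
    short8   R = cast(short8)(_mm_cmplt_epi16(cast(__m128i)A, cast(__m128i)B));
    assert(R.array == E);
}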
832 
833 /// Compare packed 32-bit integers in `a` and `b` for less-than.
834 __m128i _mm_cmplt_epi32 (__m128i a, __m128i b) pure @safe
835 {
836     return _mm_cmpgt_epi32(b, a);
837 }
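// Illustrative check, mirroring the `_mm_cmpgt_epi32` test with the comparison reversed.
unittest
{
    int4   A = [-3,  2, -1,  0];
    int4   B = [ 4, -2,  2,  0];
    int[4] E = [-1,  0, -1,  0];
    int4   R = cast(int4)(_mm_cmplt_epi32(A, B));
    assert(R.array == E);
}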
838 
839 /// Compare packed 8-bit integers in `a` and `b` for less-than.
840 __m128i _mm_cmplt_epi8 (__m128i a, __m128i b) pure @safe
841 {
842     return _mm_cmpgt_epi8(b, a);
843 }
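// Illustrative check, mirroring the `_mm_cmpgt_epi8` test with the comparison reversed.
unittest
{
    __m128i A = _mm_setr_epi8(1, 2, 3, 1, 2, 1, 1, 2, 3, 2, 1, 0, 0, 1, 2, 1);
    __m128i B = _mm_setr_epi8(2, 2, 1, 2, 3, 1, 2, 3, 2, 1, 0, 0, 1, 2, 1, 1);
    byte16 C = cast(byte16) _mm_cmplt_epi8(A, B);
    byte[16] correct =      [-1, 0, 0,-1,-1, 0,-1,-1, 0, 0, 0, 0,-1,-1, 0, 0];
    assert(C.array == correct);
}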
844 
845 /// Compare packed double-precision (64-bit) floating-point elements
846 /// in `a` and `b` for less-than.
847 __m128d _mm_cmplt_pd (__m128d a, __m128d b) pure @safe
848 {
849     static if (GDC_with_SSE2)
850     {
851         return __builtin_ia32_cmpltpd(a, b); 
852     }
853     else
854     {
855         return cast(__m128d) cmppd!(FPComparison.olt)(a, b);
856     }
857 }
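// Illustrative check of the expected ordered less-than mask.
unittest
{
    __m128d A = _mm_setr_pd(1.0, 2.0);
    __m128d B = _mm_setr_pd(1.0, 3.0);
    long2 R = cast(long2) _mm_cmplt_pd(A, B);
    assert(R.array[0] == 0);
    assert(R.array[1] == -1);
}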
858 
859 /// Compare the lower double-precision (64-bit) floating-point elements
860 /// in `a` and `b` for less-than, store the result in the lower 
861 /// element, and copy the upper element from `a`.
862 __m128d _mm_cmplt_sd (__m128d a, __m128d b) pure @safe
863 {
864     static if (GDC_with_SSE2)
865     {
866         return __builtin_ia32_cmpltsd(a, b); 
867     }
868     else
869     {
870         return cast(__m128d) cmpsd!(FPComparison.olt)(a, b);
871     }
872 }
873 
874 /// Compare packed double-precision (64-bit) floating-point elements
875 /// in `a` and `b` for not-equal.
876 __m128d _mm_cmpneq_pd (__m128d a, __m128d b) pure @safe
877 {
878     static if (GDC_with_SSE2)
879     {
880         return __builtin_ia32_cmpneqpd(a, b); 
881     }
882     else
883     {
884         return cast(__m128d) cmppd!(FPComparison.une)(a, b);
885     }
886 }
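// Illustrative check: not-equal is an unordered comparison, so a NaN operand also yields all-ones.
unittest
{
    __m128d A = _mm_setr_pd(1.0, double.nan);
    __m128d B = _mm_setr_pd(1.0, 3.0);
    long2 R = cast(long2) _mm_cmpneq_pd(A, B);
    assert(R.array[0] == 0);
    assert(R.array[1] == -1);
}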
887 
888 /// Compare the lower double-precision (64-bit) floating-point elements
889 /// in `a` and `b` for not-equal, store the result in the lower 
890 /// element, and copy the upper element from `a`.
891 __m128d _mm_cmpneq_sd (__m128d a, __m128d b) pure @safe
892 {
893     static if (GDC_with_SSE2)
894     {
895         return __builtin_ia32_cmpneqsd(a, b); 
896     }
897     else
898     {
899         return cast(__m128d) cmpsd!(FPComparison.une)(a, b);
900     }
901 }
902 
903 /// Compare packed double-precision (64-bit) floating-point elements 
904 /// in `a` and `b` for not-greater-than-or-equal.
905 __m128d _mm_cmpnge_pd (__m128d a, __m128d b) pure @safe
906 {
907     static if (GDC_with_SSE2)
908     {
909         return __builtin_ia32_cmpngepd(a, b); 
910     }
911     else
912     {
913         return cast(__m128d) cmppd!(FPComparison.ult)(a, b);
914     }
915 }
916 
917 /// Compare the lower double-precision (64-bit) floating-point elements 
918 /// in `a` and `b` for not-greater-than-or-equal, store the result in 
919 /// the lower element, and copy the upper element from `a`.
920 __m128d _mm_cmpnge_sd (__m128d a, __m128d b) pure @safe
921 {
922     // Note: There is no __builtin_ia32_cmpngesd builtin.
923     static if (GDC_with_SSE2)
924     {
925         return __builtin_ia32_cmpltsd(b, a); 
926     }
927     else
928     {
929         return cast(__m128d) cmpsd!(FPComparison.ult)(a, b);
930     }
931 }
932 
933 /// Compare packed double-precision (64-bit) floating-point elements 
934 /// in `a` and `b` for not-greater-than.
935 __m128d _mm_cmpngt_pd (__m128d a, __m128d b) pure @safe
936 {
937     static if (GDC_with_SSE2)
938     {
939         return __builtin_ia32_cmpngtpd(a, b);
940     }
941     else
942     {
943         return cast(__m128d) cmppd!(FPComparison.ule)(a, b);
944     }
945 }
946 
947 /// Compare the lower double-precision (64-bit) floating-point elements 
948 /// in `a` and `b` for not-greater-than, store the result in the 
949 /// lower element, and copy the upper element from `a`.
950 __m128d _mm_cmpngt_sd (__m128d a, __m128d b) pure @safe
951 {
952     // Note: There is no __builtin_ia32_cmpngtsd builtin.
953     static if (GDC_with_SSE2)
954     {
955         return __builtin_ia32_cmplesd(b, a);
956     }
957     else
958     {
959         return cast(__m128d) cmpsd!(FPComparison.ule)(a, b);
960     }
961 }
962 
963 /// Compare packed double-precision (64-bit) floating-point elements 
964 /// in `a` and `b` for not-less-than-or-equal.
965 __m128d _mm_cmpnle_pd (__m128d a, __m128d b) pure @safe
966 {
967     static if (GDC_with_SSE2)
968     {
969         return __builtin_ia32_cmpnlepd(a, b);
970     }
971     else
972     {
973         return cast(__m128d) cmppd!(FPComparison.ugt)(a, b);
974     }
975 }
976 
977 /// Compare the lower double-precision (64-bit) floating-point elements 
978 /// in `a` and `b` for not-less-than-or-equal, store the result in the 
979 /// lower element, and copy the upper element from `a`.
980 __m128d _mm_cmpnle_sd (__m128d a, __m128d b) pure @safe
981 {
982     static if (GDC_with_SSE2)
983     {
984         return __builtin_ia32_cmpnlesd(a, b);
985     }
986     else
987     {
988         return cast(__m128d) cmpsd!(FPComparison.ugt)(a, b);
989     }
990 }
991  
992 /// Compare packed double-precision (64-bit) floating-point elements 
993 /// in `a` and `b` for not-less-than.
994 __m128d _mm_cmpnlt_pd (__m128d a, __m128d b) pure @safe
995 {
996     static if (GDC_with_SSE2)
997     {
998         return __builtin_ia32_cmpnltpd(a, b);
999     }
1000     else
1001     {
1002         return cast(__m128d) cmppd!(FPComparison.uge)(a, b);
1003     }
1004 }
1005 
1006 /// Compare the lower double-precision (64-bit) floating-point elements 
1007 /// in `a` and `b` for not-less-than, store the result in the lower 
1008 /// element, and copy the upper element from `a`.
1009 __m128d _mm_cmpnlt_sd (__m128d a, __m128d b) pure @safe
1010 {
1011     static if (GDC_with_SSE2)
1012     {
1013         return __builtin_ia32_cmpnltsd(a, b);
1014     }
1015     else
1016     {
1017         return cast(__m128d) cmpsd!(FPComparison.uge)(a, b);
1018     }
1019 }
1020 
1021 /// Compare packed double-precision (64-bit) floating-point elements 
1022 /// in `a` and `b` to see if neither is NaN.
1023 __m128d _mm_cmpord_pd (__m128d a, __m128d b) pure @safe
1024 {
1025     static if (GDC_with_SSE2)
1026     {
1027         return __builtin_ia32_cmpordpd(a, b);
1028     }
1029     else
1030     {
1031         return cast(__m128d) cmppd!(FPComparison.ord)(a, b);
1032     }
1033 }
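// Illustrative check: a lane is all-ones only when neither operand is NaN.
unittest
{
    __m128d A = _mm_setr_pd(1.0, double.nan);
    __m128d B = _mm_setr_pd(2.0, 3.0);
    long2 R = cast(long2) _mm_cmpord_pd(A, B);
    assert(R.array[0] == -1);
    assert(R.array[1] == 0);
}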
1034 
1035 /// Compare the lower double-precision (64-bit) floating-point elements 
1036 /// in `a` and `b` to see if neither is NaN, store the result in the 
1037 /// lower element, and copy the upper element from `a` to the upper element.
1038 __m128d _mm_cmpord_sd (__m128d a, __m128d b) pure @safe
1039 {
1040     static if (GDC_with_SSE2)
1041     {
1042         return __builtin_ia32_cmpordsd(a, b);
1043     }
1044     else
1045     {
1046         return cast(__m128d) cmpsd!(FPComparison.ord)(a, b);
1047     }
1048 }
1049 
1050 /// Compare packed double-precision (64-bit) floating-point elements 
1051 /// in `a` and `b` to see if either is NaN.
1052 __m128d _mm_cmpunord_pd (__m128d a, __m128d b) pure @safe
1053 {
1054     static if (GDC_with_SSE2)
1055     {
1056         return __builtin_ia32_cmpunordpd(a, b);
1057     }
1058     else
1059     {
1060         return cast(__m128d) cmppd!(FPComparison.uno)(a, b);
1061     }
1062 }
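// Illustrative check: a lane is all-ones when either operand is NaN.
unittest
{
    __m128d A = _mm_setr_pd(1.0, double.nan);
    __m128d B = _mm_setr_pd(2.0, 3.0);
    long2 R = cast(long2) _mm_cmpunord_pd(A, B);
    assert(R.array[0] == 0);
    assert(R.array[1] == -1);
}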
1063 
1064 /// Compare the lower double-precision (64-bit) floating-point elements 
1065 /// in `a` and `b` to see if either is NaN, store the result in the lower 
1066 /// element, and copy the upper element from `a` to the upper element.
1067 __m128d _mm_cmpunord_sd (__m128d a, __m128d b) pure @safe
1068 {
1069     static if (GDC_with_SSE2)
1070     {
1071         return __builtin_ia32_cmpunordsd(a, b);
1072     }
1073     else
1074     {
1075         return cast(__m128d) cmpsd!(FPComparison.uno)(a, b);
1076     }
1077 }
1078 
1079 /// Compare the lower double-precision (64-bit) floating-point element 
1080 /// in `a` and `b` for equality, and return the boolean result (0 or 1).
1081 int _mm_comieq_sd (__m128d a, __m128d b) pure @safe
1082 {
1083     // Note: For some of the _mm_comiXX_sd intrinsics, the NaN semantics of the intrinsic differ from
1084     // the comisd instruction: the intrinsic returns false for unordered operands instead.
1085     //
1086     // C++ compilers actually disagree over the meaning of these intrinsics.
1087     // GCC handles NaNs like the comisd instruction (returning true if unordered),
1088     // but ICC, clang and MSVC handle NaN as the Intel Intrinsics Guide describes.
1089     // We follow the majority behaviour; GCC appears to be the one buggy with NaNs.
1090     return a.array[0] == b.array[0];
1091 }
1092 unittest
1093 {
1094     assert(1 == _mm_comieq_sd(_mm_set_sd(78.0), _mm_set_sd(78.0)));
1095     assert(0 == _mm_comieq_sd(_mm_set_sd(78.0), _mm_set_sd(-78.0)));
1096     assert(0 == _mm_comieq_sd(_mm_set_sd(78.0), _mm_set_sd(double.nan)));
1097     assert(0 == _mm_comieq_sd(_mm_set_sd(double.nan), _mm_set_sd(-4.22)));
1098     assert(1 == _mm_comieq_sd(_mm_set_sd(0.0), _mm_set_sd(-0.0)));
1099 }
1100 
1101 /// Compare the lower double-precision (64-bit) floating-point element 
1102 /// in `a` and `b` for greater-than-or-equal, and return the boolean 
1103 /// result (0 or 1).
1104 int _mm_comige_sd (__m128d a, __m128d b) pure @safe
1105 {
1106     return a.array[0] >= b.array[0];
1107 }
1108 unittest
1109 {
1110     assert(1 == _mm_comige_sd(_mm_set_sd(78.0), _mm_set_sd(78.0)));
1111     assert(1 == _mm_comige_sd(_mm_set_sd(78.0), _mm_set_sd(-78.0)));
1112     assert(0 == _mm_comige_sd(_mm_set_sd(-78.0), _mm_set_sd(78.0)));
1113     assert(0 == _mm_comige_sd(_mm_set_sd(78.0), _mm_set_sd(double.nan)));
1114     assert(0 == _mm_comige_sd(_mm_set_sd(double.nan), _mm_set_sd(-4.22)));
1115     assert(1 == _mm_comige_sd(_mm_set_sd(-0.0), _mm_set_sd(0.0)));
1116 }
1117 
1118 /// Compare the lower double-precision (64-bit) floating-point element 
1119 /// in `a` and `b` for greater-than, and return the boolean result (0 or 1).
1120 int _mm_comigt_sd (__m128d a, __m128d b) pure @safe
1121 {
1122     return a.array[0] > b.array[0];
1123 }
1124 unittest
1125 {
1126     assert(0 == _mm_comigt_sd(_mm_set_sd(78.0), _mm_set_sd(78.0)));
1127     assert(1 == _mm_comigt_sd(_mm_set_sd(78.0), _mm_set_sd(-78.0)));
1128     assert(0 == _mm_comigt_sd(_mm_set_sd(78.0), _mm_set_sd(double.nan)));
1129     assert(0 == _mm_comigt_sd(_mm_set_sd(double.nan), _mm_set_sd(-4.22)));
1130     assert(0 == _mm_comigt_sd(_mm_set_sd(0.0), _mm_set_sd(-0.0)));
1131 }
1132 
1133 /// Compare the lower double-precision (64-bit) floating-point element 
1134 /// in `a` and `b` for less-than-or-equal, and return the boolean result (0 or 1).
1135 int _mm_comile_sd (__m128d a, __m128d b) pure @safe
1136 {
1137     return a.array[0] <= b.array[0];
1138 }
1139 unittest
1140 {
1141     assert(1 == _mm_comile_sd(_mm_set_sd(78.0), _mm_set_sd(78.0)));
1142     assert(0 == _mm_comile_sd(_mm_set_sd(78.0), _mm_set_sd(-78.0)));
1143     assert(1 == _mm_comile_sd(_mm_set_sd(-78.0), _mm_set_sd(78.0)));
1144     assert(0 == _mm_comile_sd(_mm_set_sd(78.0), _mm_set_sd(double.nan)));
1145     assert(0 == _mm_comile_sd(_mm_set_sd(double.nan), _mm_set_sd(-4.22)));
1146     assert(1 == _mm_comile_sd(_mm_set_sd(0.0), _mm_set_sd(-0.0)));
1147 }
1148 
1149 /// Compare the lower double-precision (64-bit) floating-point element 
1150 /// in `a` and `b` for less-than, and return the boolean result (0 or 1).
1151 int _mm_comilt_sd (__m128d a, __m128d b) pure @safe
1152 {
1153     return a.array[0] < b.array[0];
1154 }
1155 unittest
1156 {
1157     assert(0 == _mm_comilt_sd(_mm_set_sd(78.0), _mm_set_sd(78.0)));
1158     assert(0 == _mm_comilt_sd(_mm_set_sd(78.0), _mm_set_sd(-78.0)));
1159     assert(1 == _mm_comilt_sd(_mm_set_sd(-78.0), _mm_set_sd(78.0)));
1160     assert(0 == _mm_comilt_sd(_mm_set_sd(78.0), _mm_set_sd(double.nan)));
1161     assert(0 == _mm_comilt_sd(_mm_set_sd(double.nan), _mm_set_sd(-4.22)));
1162     assert(0 == _mm_comilt_sd(_mm_set_sd(-0.0), _mm_set_sd(0.0)));
1163 }
1164 
1165 /// Compare the lower double-precision (64-bit) floating-point element
1166 /// in `a` and `b` for not-equal, and return the boolean result (0 or 1).
1167 int _mm_comineq_sd (__m128d a, __m128d b) pure @safe
1168 {
1169     return a.array[0] != b.array[0];
1170 }
1171 unittest
1172 {
1173     assert(0 == _mm_comineq_sd(_mm_set_sd(78.0), _mm_set_sd(78.0)));
1174     assert(1 == _mm_comineq_sd(_mm_set_sd(78.0), _mm_set_sd(-78.0)));
1175     assert(1 == _mm_comineq_sd(_mm_set_sd(78.0), _mm_set_sd(double.nan)));
1176     assert(1 == _mm_comineq_sd(_mm_set_sd(double.nan), _mm_set_sd(-4.22)));
1177     assert(0 == _mm_comineq_sd(_mm_set_sd(0.0), _mm_set_sd(-0.0)));
1178 }
1179 
1180 /// Convert packed 32-bit integers in `a` to packed double-precision (64-bit)
1181 /// floating-point elements.
1182  __m128d _mm_cvtepi32_pd (__m128i a) pure @trusted
1183 {
1184     version(LDC)
1185     {
1186         // Generates cvtdq2pd since LDC 1.0, even without optimizations
1187         enum ir = `
1188             %v = shufflevector <4 x i32> %0,<4 x i32> %0, <2 x i32> <i32 0, i32 1>
1189             %r = sitofp <2 x i32> %v to <2 x double>
1190             ret <2 x double> %r`;
1191         return cast(__m128d) LDCInlineIR!(ir, __m128d, __m128i)(a);
1192     }
1193     else static if (GDC_with_SSE2)
1194     {
1195         return __builtin_ia32_cvtdq2pd(a);
1196     }
1197     else
1198     {
1199         double2 r = void;
1200         r.ptr[0] = a.array[0];
1201         r.ptr[1] = a.array[1];
1202         return r;
1203     }
1204 }
1205 unittest
1206 {
1207     __m128d A = _mm_cvtepi32_pd(_mm_set1_epi32(54));
1208     assert(A.array[0] == 54.0);
1209     assert(A.array[1] == 54.0);
1210 }
1211 
1212 /// Convert packed 32-bit integers in `a` to packed single-precision (32-bit) 
1213 /// floating-point elements.
1214 __m128 _mm_cvtepi32_ps(__m128i a) pure @trusted
1215 {
1216     static if (GDC_with_SSE2)
1217     {
1218         return __builtin_ia32_cvtdq2ps(a);
1219     }
1220     else version(LDC)
1221     {
1222         // See #86 for why we had to resort to LLVM IR.
1223         // The plain D code (as in the fallback below) led to catastrophic behaviour.
1224         // x86: Generates cvtdq2ps since LDC 1.1.0 -O0
1225         // ARM: Generates scvtf.4s since LDC 1.8.0 -O0
1226         enum ir = `
1227             %r = sitofp <4 x i32> %0 to <4 x float>
1228             ret <4 x float> %r`;
1229         return cast(__m128) LDCInlineIR!(ir, float4, int4)(a);
1230     }
1231     else
1232     {
1233         __m128 res;
1234         res.ptr[0] = cast(float)a.array[0];
1235         res.ptr[1] = cast(float)a.array[1];
1236         res.ptr[2] = cast(float)a.array[2];
1237         res.ptr[3] = cast(float)a.array[3];
1238         return res;
1239     }
1240 }
1241 unittest
1242 {
1243     __m128 a = _mm_cvtepi32_ps(_mm_setr_epi32(-1, 0, 1, 1000));
1244     assert(a.array == [-1.0f, 0.0f, 1.0f, 1000.0f]);
1245 }
1246 
1247 /// Convert packed double-precision (64-bit) floating-point elements 
1248 /// in `a` to packed 32-bit integers.
1249 __m128i _mm_cvtpd_epi32 (__m128d a) @trusted
1250 {
1251     // PERF ARM32
1252     static if (LDC_with_SSE2)
1253     {
1254         return __builtin_ia32_cvtpd2dq(a);
1255     }
1256     else static if (GDC_with_SSE2)
1257     {
1258         return __builtin_ia32_cvtpd2dq(a);
1259     }
1260     else static if (LDC_with_ARM64)
1261     {
1262         // Get current rounding mode.
1263         uint fpscr = arm_get_fpcr();
1264         long2 i;
1265         switch(fpscr & _MM_ROUND_MASK_ARM)
1266         {
1267             default:
1268             case _MM_ROUND_NEAREST_ARM:     i = vcvtnq_s64_f64(a); break;
1269             case _MM_ROUND_DOWN_ARM:        i = vcvtmq_s64_f64(a); break;
1270             case _MM_ROUND_UP_ARM:          i = vcvtpq_s64_f64(a); break;
1271             case _MM_ROUND_TOWARD_ZERO_ARM: i = vcvtzq_s64_f64(a); break;
1272         }
1273         int4 zero = 0;
1274         return cast(__m128i) shufflevector!(int4, 0, 2, 4, 6)(cast(int4)i, zero);
1275     }
1276     else
1277     {
1278         // PERF ARM32
1279         __m128i r = _mm_setzero_si128();
1280         r.ptr[0] = convertDoubleToInt32UsingMXCSR(a.array[0]);
1281         r.ptr[1] = convertDoubleToInt32UsingMXCSR(a.array[1]);
1282         return r;
1283     }
1284 }
1285 unittest
1286 {
1287     int4 A = _mm_cvtpd_epi32(_mm_set_pd(61.0, 55.0));
1288     assert(A.array[0] == 55 && A.array[1] == 61 && A.array[2] == 0 && A.array[3] == 0);
1289 }
1290 
1291 /// Convert packed double-precision (64-bit) floating-point elements in `v`
1292 /// to packed 32-bit integers.
1293 __m64 _mm_cvtpd_pi32 (__m128d v) @safe
1294 {
1295     return to_m64(_mm_cvtpd_epi32(v));
1296 }
1297 unittest
1298 {
1299     int2 A = cast(int2) _mm_cvtpd_pi32(_mm_set_pd(61.0, 55.0));
1300     assert(A.array[0] == 55 && A.array[1] == 61);
1301 }
1302 
1303 /// Convert packed double-precision (64-bit) floating-point elements 
1304 /// in `a` to packed single-precision (32-bit) floating-point elements.
1305 __m128 _mm_cvtpd_ps (__m128d a) pure @trusted
1306 {
1307     static if (LDC_with_SSE2)
1308     {
1309         return __builtin_ia32_cvtpd2ps(a); // can't be done with IR unfortunately
1310     }
1311     else static if (GDC_with_SSE2)
1312     {
1313         return __builtin_ia32_cvtpd2ps(a);
1314     }
1315     else
1316     { 
1317         __m128 r = void;
1318         r.ptr[0] = a.array[0];
1319         r.ptr[1] = a.array[1];
1320         r.ptr[2] = 0;
1321         r.ptr[3] = 0;
1322         return r;
1323     }
1324 }
1325 unittest
1326 {
1327     __m128d A = _mm_set_pd(5.25, 4.0);
1328     __m128 B = _mm_cvtpd_ps(A);
1329     assert(B.array == [4.0f, 5.25f, 0, 0]);
1330 }
1331 
1332 /// Convert packed 32-bit integers in `v` to packed double-precision 
1333 /// (64-bit) floating-point elements.
1334 __m128d _mm_cvtpi32_pd (__m64 v) pure @safe
1335 {
1336     return _mm_cvtepi32_pd(to_m128i(v));
1337 }
1338 unittest
1339 {
1340     __m128d A = _mm_cvtpi32_pd(_mm_setr_pi32(4, -5));
1341     assert(A.array[0] == 4.0 && A.array[1] == -5.0);
1342 }
1343 
1344 /// Convert packed single-precision (32-bit) floating-point elements 
1345 /// in `a` to packed 32-bit integers.
1346 __m128i _mm_cvtps_epi32 (__m128 a) @trusted
1347 {
1348     static if (LDC_with_SSE2)
1349     {
1350         return cast(__m128i) __builtin_ia32_cvtps2dq(a);
1351     }
1352     else static if (GDC_with_SSE2)
1353     {
1354         return __builtin_ia32_cvtps2dq(a);
1355     }
1356     else static if (LDC_with_ARM64)
1357     {
1358         // Get current rounding mode.
1359         uint fpscr = arm_get_fpcr();
1360         switch(fpscr & _MM_ROUND_MASK_ARM)
1361         {
1362             default:
1363             case _MM_ROUND_NEAREST_ARM:     return vcvtnq_s32_f32(a);
1364             case _MM_ROUND_DOWN_ARM:        return vcvtmq_s32_f32(a);
1365             case _MM_ROUND_UP_ARM:          return vcvtpq_s32_f32(a);
1366             case _MM_ROUND_TOWARD_ZERO_ARM: return vcvtzq_s32_f32(a);
1367         }
1368     }
1369     else
1370     {
1371         __m128i r = void;
1372         r.ptr[0] = convertFloatToInt32UsingMXCSR(a.array[0]);
1373         r.ptr[1] = convertFloatToInt32UsingMXCSR(a.array[1]);
1374         r.ptr[2] = convertFloatToInt32UsingMXCSR(a.array[2]);
1375         r.ptr[3] = convertFloatToInt32UsingMXCSR(a.array[3]);
1376         return r;
1377     }
1378 }
1379 unittest
1380 {
1381     // GDC bug #98607
1382     // https://gcc.gnu.org/bugzilla/show_bug.cgi?id=98607
1383     // GDC does not provide an optimization barrier for the rounding mode.
1384     // Worked around with different literals; this bug will likely only manifest in unittests.
1385     // GCC provided no actual fix, claiming instead that other compilers are buggy (they aren't).
1386 
1387     uint savedRounding = _MM_GET_ROUNDING_MODE();
1388 
1389     _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST);
1390     __m128i A = _mm_cvtps_epi32(_mm_setr_ps(1.4f, -2.1f, 53.5f, -2.9f));
1391     assert(A.array == [1, -2, 54, -3]);
1392 
1393     _MM_SET_ROUNDING_MODE(_MM_ROUND_DOWN);
1394     A = _mm_cvtps_epi32(_mm_setr_ps(1.3f, -2.11f, 53.4f, -2.8f));
1395     assert(A.array == [1, -3, 53, -3]);
1396 
1397     _MM_SET_ROUNDING_MODE(_MM_ROUND_UP);
1398     A = _mm_cvtps_epi32(_mm_setr_ps(1.3f, -2.12f, 53.6f, -2.7f));
1399     assert(A.array == [2, -2, 54, -2]);
1400 
1401     _MM_SET_ROUNDING_MODE(_MM_ROUND_TOWARD_ZERO);
1402     A = _mm_cvtps_epi32(_mm_setr_ps(1.4f, -2.17f, 53.8f, -2.91f));
1403     assert(A.array == [1, -2, 53, -2]);
1404 
1405     _MM_SET_ROUNDING_MODE(savedRounding);
1406 }
1407 
1408 /// Convert packed single-precision (32-bit) floating-point elements 
1409 /// in `a` to packed double-precision (64-bit) floating-point elements.
1410 __m128d _mm_cvtps_pd (__m128 a) pure @trusted
1411 {
1412     version(LDC)
1413     {
1414         // Generates cvtps2pd since LDC 1.0 -O0
1415         enum ir = `
1416             %v = shufflevector <4 x float> %0,<4 x float> %0, <2 x i32> <i32 0, i32 1>
1417             %r = fpext <2 x float> %v to <2 x double>
1418             ret <2 x double> %r`;
1419         return cast(__m128d) LDCInlineIR!(ir, __m128d, __m128)(a);
1420     }
1421     else static if (GDC_with_SSE2)
1422     {
1423         return __builtin_ia32_cvtps2pd(a);
1424     }
1425     else
1426     {
1427         double2 r = void;
1428         r.ptr[0] = a.array[0];
1429         r.ptr[1] = a.array[1];
1430         return r;
1431     }
1432 }
1433 unittest
1434 {
1435     __m128d A = _mm_cvtps_pd(_mm_set1_ps(54.0f));
1436     assert(A.array[0] == 54.0);
1437     assert(A.array[1] == 54.0);
1438 }
1439 
1440 /// Copy the lower double-precision (64-bit) floating-point element of `a`.
1441 double _mm_cvtsd_f64 (__m128d a) pure @safe
1442 {
1443     return a.array[0];
1444 }
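// Illustrative check: returns the lower lane as a scalar double.
unittest
{
    assert(_mm_cvtsd_f64(_mm_setr_pd(4.5, -2.0)) == 4.5);
}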
1445 
1446 /// Convert the lower double-precision (64-bit) floating-point element
1447 /// in `a` to a 32-bit integer.
1448 int _mm_cvtsd_si32 (__m128d a) @safe
1449 {
1450     static if (LDC_with_SSE2)
1451     {
1452         return __builtin_ia32_cvtsd2si(a);
1453     }
1454     else static if (GDC_with_SSE2)
1455     {
1456         return __builtin_ia32_cvtsd2si(a);
1457     }
1458     else
1459     {
1460         return convertDoubleToInt32UsingMXCSR(a[0]);
1461     }
1462 }
1463 unittest
1464 {
1465     assert(4 == _mm_cvtsd_si32(_mm_set1_pd(4.0)));
1466 }
1467 
1468 /// Convert the lower double-precision (64-bit) floating-point element in `a` to a 64-bit integer.
1469 long _mm_cvtsd_si64 (__m128d a) @trusted
1470 {
1471     version (LDC)
1472     {
1473         version (X86_64)
1474         {
1475             return __builtin_ia32_cvtsd2si64(a);
1476         }
1477         else
1478         {
1479             // Note: In 32-bit x86, there is no way to convert from float/double to a 64-bit integer
1480             // using SSE instructions only, so the builtin doesn't exist for this arch.
1481             return convertDoubleToInt64UsingMXCSR(a[0]);
1482         }
1483     }
1484     else
1485     {
1486         return convertDoubleToInt64UsingMXCSR(a.array[0]);
1487     }
1488 }
1489 unittest
1490 {
1491     assert(-4 == _mm_cvtsd_si64(_mm_set1_pd(-4.0)));
1492 
1493     uint savedRounding = _MM_GET_ROUNDING_MODE();
1494 
1495     _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST);
1496     assert(-56468486186 == _mm_cvtsd_si64(_mm_set1_pd(-56468486186.49)));
1497 
1498     _MM_SET_ROUNDING_MODE(_MM_ROUND_DOWN);
1499     assert(-56468486187 == _mm_cvtsd_si64(_mm_set1_pd(-56468486186.1)));
1500 
1501     _MM_SET_ROUNDING_MODE(_MM_ROUND_UP);
1502     assert(56468486187 == _mm_cvtsd_si64(_mm_set1_pd(56468486186.1)));
1503 
1504     _MM_SET_ROUNDING_MODE(_MM_ROUND_TOWARD_ZERO);
1505     assert(-56468486186 == _mm_cvtsd_si64(_mm_set1_pd(-56468486186.9)));
1506 
1507     _MM_SET_ROUNDING_MODE(savedRounding);
1508 }
1509 
1510 deprecated("Use _mm_cvtsd_si64 instead") alias _mm_cvtsd_si64x = _mm_cvtsd_si64; ///
1511 
1512 /// Convert the lower double-precision (64-bit) floating-point element in `b` to a single-precision (32-bit) 
1513 /// floating-point element, store that in the lower element of result, and copy the upper 3 packed elements from `a`
1514 /// to the upper elements of result.
1515 __m128 _mm_cvtsd_ss (__m128 a, __m128d b) pure @trusted
1516 {
1517     static if (GDC_with_SSE2)
1518     {
1519         return __builtin_ia32_cvtsd2ss(a, b); 
1520     }
1521     else
1522     {
1523         // Generates cvtsd2ss since LDC 1.3 -O0
1524         a.ptr[0] = b.array[0];
1525         return a;
1526     }
1527 }
1528 unittest
1529 {
1530     __m128 R = _mm_cvtsd_ss(_mm_set1_ps(4.0f), _mm_set1_pd(3.0));
1531     assert(R.array == [3.0f, 4.0f, 4.0f, 4.0f]);
1532 }
1533 
1534 /// Get the lower 32-bit integer in `a`.
1535 int _mm_cvtsi128_si32 (__m128i a) pure @safe
1536 {
1537     return a.array[0];
1538 }
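// Illustrative check: returns the lowest 32-bit lane.
unittest
{
    assert(-1 == _mm_cvtsi128_si32(_mm_setr_epi32(-1, 2, 3, 4)));
}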
1539 
1540 /// Get the lower 64-bit integer in `a`.
1541 long _mm_cvtsi128_si64 (__m128i a) pure @safe
1542 {
1543     long2 la = cast(long2)a;
1544     return la.array[0];
1545 }
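// Illustrative check: returns the lowest 64-bit lane.
unittest
{
    assert(-1 == _mm_cvtsi128_si64(_mm_setr_epi64(-1, 54)));
}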
1546 deprecated("Use _mm_cvtsi128_si64 instead") alias _mm_cvtsi128_si64x = _mm_cvtsi128_si64;
1547 
1548 /// Convert the signed 32-bit integer `b` to a double-precision (64-bit) floating-point element, store that in the 
1549 /// lower element of result, and copy the upper element from `a` to the upper element of result.
1550 __m128d _mm_cvtsi32_sd(__m128d a, int b) pure @trusted
1551 {
1552     a.ptr[0] = cast(double)b;
1553     return a;
1554 }
1555 unittest
1556 {
1557     __m128d a = _mm_cvtsi32_sd(_mm_set1_pd(0.0f), 42);
1558     assert(a.array == [42.0, 0]);
1559 }
1560 
1561 /// Copy 32-bit integer `a` to the lower element of result, and zero the upper elements.
1562 __m128i _mm_cvtsi32_si128 (int a) pure @trusted
1563 {
1564     int4 r = [0, 0, 0, 0];
1565     r.ptr[0] = a;
1566     return r;
1567 }
1568 unittest
1569 {
1570     __m128i a = _mm_cvtsi32_si128(65);
1571     assert(a.array == [65, 0, 0, 0]);
1572 }
1573 
1574 /// Convert the signed 64-bit integer `b` to a double-precision (64-bit) floating-point element, store the result in 
1575 /// the lower element of result, and copy the upper element from `a` to the upper element of result.
1577 __m128d _mm_cvtsi64_sd(__m128d a, long b) pure @trusted
1578 {
1579     a.ptr[0] = cast(double)b;
1580     return a;
1581 }
1582 unittest
1583 {
1584     __m128d a = _mm_cvtsi64_sd(_mm_set1_pd(0.0f), 42);
1585     assert(a.array == [42.0, 0]);
1586 }
1587 
1588 /// Copy 64-bit integer `a` to the lower element of result, and zero the upper element.
1589 __m128i _mm_cvtsi64_si128 (long a) pure @trusted
1590 {
1591     long2 r = [0, 0];
1592     r.ptr[0] = a;
1593     return cast(__m128i)(r);
1594 }
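// Illustrative check: the 64-bit value lands in the low lane, the high lane is zeroed.
unittest
{
    long2 R = cast(long2) _mm_cvtsi64_si128(0x123456789ABCDEF0);
    assert(R.array[0] == 0x123456789ABCDEF0);
    assert(R.array[1] == 0);
}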
1595 
1596 deprecated("Use _mm_cvtsi64_sd instead") alias _mm_cvtsi64x_sd = _mm_cvtsi64_sd; ///
1597 deprecated("Use _mm_cvtsi64_si128 instead") alias _mm_cvtsi64x_si128 = _mm_cvtsi64_si128; ///
1598 
1599 /// Convert the lower single-precision (32-bit) floating-point element in `b` to a double-precision (64-bit) 
1600 /// floating-point element, store that in the lower element of result, and copy the upper element from `a` to the upper 
1601 /// element of result.
1602 double2 _mm_cvtss_sd(double2 a, float4 b) pure @trusted
1603 {
1604     a.ptr[0] = b.array[0];
1605     return a;
1606 }
1607 unittest
1608 {
1609     __m128d a = _mm_cvtss_sd(_mm_set1_pd(0.0f), _mm_set1_ps(42.0f));
1610     assert(a.array == [42.0, 0]);
1611 }
1612 
1613 /// Convert the lower single-precision (32-bit) floating-point element in `a` to a 64-bit integer with truncation.
1614 long _mm_cvttss_si64 (__m128 a) pure @safe
1615 {
1616     return cast(long)(a.array[0]); // Generates cvttss2si as expected
1617 }
1618 unittest
1619 {
1620     assert(1 == _mm_cvttss_si64(_mm_setr_ps(1.9f, 2.0f, 3.0f, 4.0f)));
1621 }
1622 
1623 /// Convert packed double-precision (64-bit) floating-point elements in `a` to packed 32-bit integers with truncation.
1624 /// Put zeroes in the upper elements of result.
1625 __m128i _mm_cvttpd_epi32 (__m128d a) pure @trusted
1626 {
1627     static if (LDC_with_SSE2)
1628     {
1629         return __builtin_ia32_cvttpd2dq(a);
1630     }
1631     else static if (GDC_with_SSE2)
1632     {
1633         return __builtin_ia32_cvttpd2dq(a);
1634     }
1635     else
1636     {
1637         // Note: doesn't generate cvttpd2dq as of LDC 1.13
1638         __m128i r;
1639         r.ptr[0] = cast(int)a.array[0];
1640         r.ptr[1] = cast(int)a.array[1];
1641         r.ptr[2] = 0;
1642         r.ptr[3] = 0;
1643         return r;
1644     }
1645 }
1646 unittest
1647 {
1648     __m128i R = _mm_cvttpd_epi32(_mm_setr_pd(-4.9, 45641.5f));
1649     assert(R.array == [-4, 45641, 0, 0]);
1650 }
1651 
1652 /// Convert packed double-precision (64-bit) floating-point elements in `v` 
1653 /// to packed 32-bit integers with truncation.
1654 __m64 _mm_cvttpd_pi32 (__m128d v) pure @safe
1655 {
1656     return to_m64(_mm_cvttpd_epi32(v));
1657 }
1658 unittest
1659 {
1660     int2 R = cast(int2) _mm_cvttpd_pi32(_mm_setr_pd(-4.9, 45641.7f));
1661     int[2] correct = [-4, 45641];
1662     assert(R.array == correct);
1663 }
1664 
1665 /// Convert packed single-precision (32-bit) floating-point elements in `a` to packed 32-bit integers with truncation.
1666 __m128i _mm_cvttps_epi32 (__m128 a) pure @trusted
1667 {
1668     // x86: Generates cvttps2dq since LDC 1.3 -O2
1669     // ARM64: generates fcvtze since LDC 1.8 -O2
1670     __m128i r;
1671     r.ptr[0] = cast(int)a.array[0];
1672     r.ptr[1] = cast(int)a.array[1];
1673     r.ptr[2] = cast(int)a.array[2];
1674     r.ptr[3] = cast(int)a.array[3];
1675     return r;
1676 }
1677 unittest
1678 {
1679     __m128i R = _mm_cvttps_epi32(_mm_setr_ps(-4.9, 45641.5f, 0.0f, 1.0f));
1680     assert(R.array == [-4, 45641, 0, 1]);
1681 }
1682 
1683 /// Convert the lower double-precision (64-bit) floating-point element in `a` to a 32-bit integer with truncation.
1684 int _mm_cvttsd_si32 (__m128d a)
1685 {
1686     // Generates cvttsd2si since LDC 1.3 -O0
1687     return cast(int)a.array[0];
1688 }
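// Illustrative example (added): truncation rounds towards zero.
unittest
{
    assert(_mm_cvttsd_si32(_mm_setr_pd(-4.9, 45641.5)) == -4);
}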
1689 
1690 /// Convert the lower double-precision (64-bit) floating-point element in `a` to a 64-bit integer with truncation.
1691 long _mm_cvttsd_si64 (__m128d a)
1692 {
1693     // Generates cvttsd2si since LDC 1.3 -O0
    // but in 32-bit, it's a long sequence that resorts to the FPU
1695     return cast(long)a.array[0];
1696 }
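// Illustrative example (added): 64-bit variant, truncation rounds towards zero.
unittest
{
    assert(_mm_cvttsd_si64(_mm_setr_pd(-4.9, 45641.5)) == -4);
}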
1697 
1698 deprecated("Use _mm_cvttsd_si64 instead") alias _mm_cvttsd_si64x = _mm_cvttsd_si64; ///
1699 
1700 /// Divide packed double-precision (64-bit) floating-point elements in `a` by packed elements in `b`.
1701 __m128d _mm_div_pd(__m128d a, __m128d b) pure @safe
1702 {
1703     pragma(inline, true);
1704     return a / b;
1705 }
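// Illustrative example (added): element-wise division.
unittest
{
    __m128d a = [12.0, -3.0];
    __m128d b = [4.0, 2.0];
    a = _mm_div_pd(a, b);
    assert(a.array == [3.0, -1.5]);
}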
1706 
/// Divide the lower double-precision (64-bit) floating-point element in `a` by the lower double-precision (64-bit) 
/// floating-point element in `b`, store the result in the lower element of result, and copy the upper element from 
/// `a` to the upper element of result.
__m128d _mm_div_sd(__m128d a, __m128d b) pure @trusted
1708 {
1709     static if (GDC_with_SSE2)
1710     {
1711         return __builtin_ia32_divsd(a, b);
1712     }
1713     else version(DigitalMars)
1714     {
1715         // Work-around for https://issues.dlang.org/show_bug.cgi?id=19599
1716         // Note that this is unneeded since DMD >= 2.094.0 at least, haven't investigated again
1717         asm pure nothrow @nogc @trusted { nop;}
1718         a.array[0] = a.array[0] / b.array[0];
1719         return a;
1720     }
1721     else
1722     {
1723         a.ptr[0] /= b.array[0];
1724         return a;
1725     }
1726 }
1727 unittest
1728 {
1729     __m128d a = [2.0, 4.5];
1730     a = _mm_div_sd(a, a);
1731     assert(a.array == [1.0, 4.5]);
1732 }
1733 
1734 /// Extract a 16-bit integer from `v`, selected with `index`.
1735 /// Warning: the returned value is zero-extended to 32-bits.
1736 int _mm_extract_epi16(__m128i v, int index) pure @safe
1737 {
1738     short8 r = cast(short8)v;
1739     return cast(ushort)(r.array[index & 7]);
1740 }
1741 unittest
1742 {
1743     __m128i A = _mm_set_epi16(7, 6, 5, 4, 3, 2, 1, -1);
1744     assert(_mm_extract_epi16(A, 6) == 6);
1745     assert(_mm_extract_epi16(A, 0) == 65535);
1746     assert(_mm_extract_epi16(A, 5 + 8) == 5);
1747 }
1748 
1749 /// Copy `v`, and insert the 16-bit integer `i` at the location specified by `index`.
1750 __m128i _mm_insert_epi16 (__m128i v, int i, int index) @trusted
1751 {
1752     short8 r = cast(short8)v;
1753     r.ptr[index & 7] = cast(short)i;
1754     return cast(__m128i)r;
1755 }
1756 unittest
1757 {
1758     __m128i A = _mm_set_epi16(7, 6, 5, 4, 3, 2, 1, 0);
1759     short8 R = cast(short8) _mm_insert_epi16(A, 42, 6);
1760     short[8] correct = [0, 1, 2, 3, 4, 5, 42, 7];
1761     assert(R.array == correct);
1762 }
1763 
1764 
/// Perform a serializing operation on all load-from-memory instructions that were issued prior to this instruction. 
/// Guarantees that every load instruction that precedes, in program order, is globally visible before any load 
/// instruction which follows the fence in program order.
void _mm_lfence() @trusted
1766 {
1767     version(GNU)
1768     {
1770         static if (GDC_with_SSE2)
1771         {
1772             __builtin_ia32_lfence();
1773         }
1774         else version(X86)
1775         {
1776             asm pure nothrow @nogc @trusted
1777             {
1778                 "lfence;\n" : : : ;
1779             }
1780         }
1781         else
1782             static assert(false);
1783     }
1784     else static if (LDC_with_SSE2)
1785     {
1786         __builtin_ia32_lfence();
1787     }
1788     else static if (DMD_with_asm)
1789     {
1790         asm nothrow @nogc pure @safe
1791         {
1792             lfence;
1793         }
1794     }
1795     else version(LDC)
1796     {
1797         llvm_memory_fence(); // PERF actually generates mfence
1798     }
1799     else
1800         static assert(false);
1801 }
1802 unittest
1803 {
1804     _mm_lfence();
1805 }
1806 
1807 /// Load 128-bits (composed of 2 packed double-precision (64-bit) floating-point elements) from memory.
1808 /// `mem_addr` must be aligned on a 16-byte boundary or a general-protection exception may be generated.
1809 __m128d _mm_load_pd (const(double) * mem_addr) pure
1810 {
1811     pragma(inline, true);
1812     __m128d* aligned = cast(__m128d*)mem_addr;
1813     return *aligned;
1814 }
1815 unittest
1816 {
1817     align(16) double[2] S = [-5.0, 7.0];
1818     __m128d R = _mm_load_pd(S.ptr);
1819     assert(R.array == S);
1820 }
1821 
1822 /// Load a double-precision (64-bit) floating-point element from memory into both elements of dst.
1823 /// `mem_addr` does not need to be aligned on any particular boundary.
1824 __m128d _mm_load_pd1 (const(double)* mem_addr) pure
1825 {
1826     double m = *mem_addr;
1827     __m128d r;
1828     r.ptr[0] = m;
1829     r.ptr[1] = m;
1830     return r;
1831 }
1832 unittest
1833 {
1834     double what = 4;
1835     __m128d R = _mm_load_pd1(&what);
1836     double[2] correct = [4.0, 4];
1837     assert(R.array == correct);
1838 }
1839 
1840 /// Load a double-precision (64-bit) floating-point element from memory into the lower of result, and zero the upper 
1841 /// element. `mem_addr` does not need to be aligned on any particular boundary.
1842 __m128d _mm_load_sd (const(double)* mem_addr) pure @trusted
1843 {
1844     double2 r = [0, 0];
1845     r.ptr[0] = *mem_addr;
1846     return r;
1847 }
1848 unittest
1849 {
1850     double x = -42;
1851     __m128d a = _mm_load_sd(&x);
1852     assert(a.array == [-42.0, 0.0]);
1853 }
1854 
1855 /// Load 128-bits of integer data from memory into dst. 
1856 /// `mem_addr` must be aligned on a 16-byte boundary or a general-protection exception may be generated.
__m128i _mm_load_si128 (const(__m128i)* mem_addr) pure @trusted // TODO: shouldn't be trusted because of alignment, Issue #62
1858 {
1859     pragma(inline, true);
1860     return *mem_addr;
1861 }
1862 unittest
1863 {
1864     align(16) int[4] correct = [-1, 2, 3, 4];
1865     int4 A = cast(int4) _mm_load_si128(cast(__m128i*) correct.ptr);
1866     assert(A.array == correct);
1867 }
1868 
1869 alias _mm_load1_pd = _mm_load_pd1; ///
1870 
1871 /// Load a double-precision (64-bit) floating-point element from memory into the upper element of result, and copy the 
1872 /// lower element from `a` to result. `mem_addr` does not need to be aligned on any particular boundary.
1873 __m128d _mm_loadh_pd (__m128d a, const(double)* mem_addr) pure @trusted
1874 {
1875     pragma(inline, true);
1876     a.ptr[1] = *mem_addr;
1877     return a;
1878 }
1879 unittest
1880 {
1881     double A = 7.0;
1882     __m128d B = _mm_setr_pd(4.0, -5.0);
1883     __m128d R = _mm_loadh_pd(B, &A);
1884     double[2] correct = [ 4.0, 7.0 ];
1885     assert(R.array == correct);
1886 }
1887 
/// Load 64-bit integer from memory into the lower element of result, and zero the upper element.
// Note: strange signature since the memory doesn't have to be aligned (Issue #60)
__m128i _mm_loadl_epi64 (const(__m128i)* mem_addr) pure @trusted // TODO signature
1891 {
1892     pragma(inline, true);
1893     auto pLong = cast(const(long)*)mem_addr;
1894     long2 r = [0, 0];
1895     r.ptr[0] = *pLong;
1896     return cast(__m128i)(r);
1897 }
1898 unittest
1899 {
1900     long A = 0x7878787870707070;
1901     long2 R = cast(long2) _mm_loadl_epi64(cast(__m128i*)&A);
1902     long[2] correct = [0x7878787870707070, 0];
1903     assert(R.array == correct);
1904 }
1905 
1906 /// Load a double-precision (64-bit) floating-point element from memory into the lower element of result, and copy the 
1907 /// upper element from `a` to result. mem_addr does not need to be aligned on any particular boundary.
1908 __m128d _mm_loadl_pd (__m128d a, const(double)* mem_addr) pure @trusted
1909 {
1910     a.ptr[0] = *mem_addr;
1911     return a;
1912 }
1913 unittest
1914 {
1915     double A = 7.0;
1916     __m128d B = _mm_setr_pd(4.0, -5.0);
1917     __m128d R = _mm_loadl_pd(B, &A);
1918     double[2] correct = [ 7.0, -5.0 ];
1919     assert(R.array == correct);
1920 }
1921 
1922 /// Load 2 double-precision (64-bit) floating-point elements from memory into result in reverse order. 
1923 /// `mem_addr` must be aligned on a 16-byte boundary or a general-protection exception may be generated.
1924 __m128d _mm_loadr_pd (const(double)* mem_addr) pure @trusted
1925 {
1926     __m128d a = *cast(__m128d*)(mem_addr);
1927     __m128d r;
1928     r.ptr[0] = a.array[1];
1929     r.ptr[1] = a.array[0];
1930     return r;
1931 }
1932 unittest
1933 {
1934     align(16) double[2] A = [56.0, -74.0];
1935     __m128d R = _mm_loadr_pd(A.ptr);
1936     double[2] correct = [-74.0, 56.0];
1937     assert(R.array == correct);
1938 }
1939 
1940 /// Load 128-bits (composed of 2 packed double-precision (64-bit) floating-point elements) from memory. 
1941 /// `mem_addr` does not need to be aligned on any particular boundary.
1942 __m128d _mm_loadu_pd (const(double)* mem_addr) pure @trusted
1943 {
1944     pragma(inline, true);
1945     static if (GDC_with_SSE2)
1946     {
1947         return __builtin_ia32_loadupd(mem_addr); 
1948     }
1949     else version(LDC)
1950     {
1951         return loadUnaligned!(double2)(mem_addr);
1952     }
1953     else version(DigitalMars)
1954     {
1955         static if (DMD_with_DSIMD)
1956         {
1957             return cast(__m128d)__simd(XMM.LODUPD, *mem_addr);
1958         }
1959         else static if (SSESizedVectorsAreEmulated)
1960         {
            // Since this vector is emulated, it doesn't have alignment constraints
            // and as such we can just cast it.
1963             return *cast(__m128d*)(mem_addr);
1964         }
1965         else
1966         {
1967             __m128d result;
1968             result.ptr[0] = mem_addr[0];
1969             result.ptr[1] = mem_addr[1];
1970             return result;
1971         }
1972     }
1973     else
1974     {
1975         __m128d result;
1976         result.ptr[0] = mem_addr[0];
1977         result.ptr[1] = mem_addr[1];
1978         return result;
1979     }
1980 }
1981 unittest
1982 {
1983     double[2] A = [56.0, -75.0];
1984     __m128d R = _mm_loadu_pd(A.ptr);
1985     double[2] correct = [56.0, -75.0];
1986     assert(R.array == correct);
1987 }
1988 
1989 /// Load 128-bits of integer data from memory. `mem_addr` does not need to be aligned on any particular boundary.
1990 __m128i _mm_loadu_si128 (const(__m128i)* mem_addr) pure @trusted
1991 {
1992     pragma(inline, true);
1993     static if (GDC_with_SSE2)
1994     {
1995         return cast(__m128i) __builtin_ia32_loaddqu(cast(const(char*))mem_addr);
1996     }
1997     else
1998     {
1999         return loadUnaligned!(__m128i)(cast(int*)mem_addr);
2000     }
2001 }
2002 unittest
2003 {
2004     align(16) int[4] correct = [-1, 2, -3, 4];
2005     int4 A = cast(int4) _mm_loadu_si128(cast(__m128i*) correct.ptr);
2006     assert(A.array == correct);
2007 }
2008 
/// Load unaligned 32-bit integer from memory into the lower element of result, and zero the upper elements.
2010 __m128i _mm_loadu_si32 (const(void)* mem_addr) pure @trusted
2011 {
2012     pragma(inline, true);
2013     int r = *cast(int*)(mem_addr);
2014     int4 result = [0, 0, 0, 0];
2015     result.ptr[0] = r;
2016     return result;
2017 }
2018 unittest
2019 {
2020     int r = 42;
2021     __m128i A = _mm_loadu_si32(&r);
2022     int[4] correct = [42, 0, 0, 0];
2023     assert(A.array == correct);
2024 }
2025 
2026 /// Multiply packed signed 16-bit integers in `a` and `b`, producing intermediate
2027 /// signed 32-bit integers. Horizontally add adjacent pairs of intermediate 32-bit integers,
2028 /// and pack the results in destination.
2029 __m128i _mm_madd_epi16 (__m128i a, __m128i b) pure @trusted
2030 {
2031     static if (GDC_with_SSE2)
2032     {
2033         return cast(__m128i) __builtin_ia32_pmaddwd128(cast(short8)a, cast(short8)b);
2034     }
2035     else static if (LDC_with_SSE2)
2036     {
2037         return cast(__m128i) __builtin_ia32_pmaddwd128(cast(short8)a, cast(short8)b);
2038     }
2039     else static if (LDC_with_ARM64)
2040     {
2041         int4 pl = vmull_s16(vget_low_s16(cast(short8)a), vget_low_s16(cast(short8)b));
2042         int4 ph = vmull_s16(vget_high_s16(cast(short8)a), vget_high_s16(cast(short8)b));
2043         int2 rl = vpadd_s32(vget_low_s32(pl), vget_high_s32(pl));
2044         int2 rh = vpadd_s32(vget_low_s32(ph), vget_high_s32(ph));
2045         return vcombine_s32(rl, rh);
2046     }
2047     else
2048     {
2049         short8 sa = cast(short8)a;
2050         short8 sb = cast(short8)b;
2051         int4 r;
2052         foreach(i; 0..4)
2053         {
2054             r.ptr[i] = sa.array[2*i] * sb.array[2*i] + sa.array[2*i+1] * sb.array[2*i+1];
2055         }
2056         return r;
2057     }
2058 }
2059 unittest
2060 {
2061     short8 A = [0, 1, 2, 3, -32768, -32768, 32767, 32767];
2062     short8 B = [0, 1, 2, 3, -32768, -32768, 32767, 32767];
2063     int4 R = _mm_madd_epi16(cast(__m128i)A, cast(__m128i)B);
2064     int[4] correct = [1, 13, -2147483648, 2*32767*32767];
2065     assert(R.array == correct);
2066 }
2067 
2068 /// Conditionally store 8-bit integer elements from `a` into memory using `mask`
2069 /// (elements are not stored when the highest bit is not set in the corresponding element)
2070 /// and a non-temporal memory hint. `mem_addr` does not need to be aligned on any particular
2071 /// boundary.
2072 void _mm_maskmoveu_si128 (__m128i a, __m128i mask, void* mem_addr) @trusted
2073 {
2074     static if (GDC_with_SSE2)
2075     {    
2076         return __builtin_ia32_maskmovdqu(cast(ubyte16)a, cast(ubyte16)mask, cast(char*)mem_addr);
2077     }
2078     else static if (LDC_with_SSE2)
2079     {
2080         return __builtin_ia32_maskmovdqu(cast(byte16)a, cast(byte16)mask, cast(char*)mem_addr);
2081     }
2082     else static if (LDC_with_ARM64)
2083     {
2084         // PERF: catastrophic on ARM32
2085         byte16 bmask  = cast(byte16)mask;
2086         byte16 shift = 7;
2087         bmask = bmask >> shift; // sign-extend to have a 0xff or 0x00 mask
2088         mask = cast(__m128i) bmask;
2089         __m128i dest = loadUnaligned!__m128i(cast(int*)mem_addr);
2090         dest = (a & mask) | (dest & ~mask);
2091         storeUnaligned!__m128i(dest, cast(int*)mem_addr);
2092     }
2093     else
2094     {
2095         byte16 b = cast(byte16)a;
2096         byte16 m = cast(byte16)mask;
2097         byte* dest = cast(byte*)(mem_addr);
2098         foreach(j; 0..16)
2099         {
2100             if (m.array[j] & 128)
2101             {
2102                 dest[j] = b.array[j];
2103             }
2104         }
2105     }
2106 }
2107 unittest
2108 {
2109     ubyte[16] dest =           [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42];
2110     __m128i mask = _mm_setr_epi8(0,-1, 0,-1,-1, 1,-1,-1, 0,-1,-4,-1,-1, 0,-127, 0);
2111     __m128i A    = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15);
2112     _mm_maskmoveu_si128(A, mask, dest.ptr);
2113     ubyte[16] correct =        [42, 1,42, 3, 4,42, 6, 7,42, 9,10,11,12,42,14,42];
2114     assert(dest == correct);
2115 }
2116 
2117 /// Compare packed signed 16-bit integers in `a` and `b`, and return packed maximum values.
2118 __m128i _mm_max_epi16 (__m128i a, __m128i b) pure @safe
2119 {
2120     static if (GDC_with_SSE2)
2121     {
2122         return cast(__m128i) __builtin_ia32_pmaxsw128(cast(short8)a, cast(short8)b);
2123     }
2124     else version(LDC)
2125     {
2126         // x86: pmaxsw since LDC 1.0 -O1
        // ARM64: smax.8h since LDC 1.5 -O1
2128         short8 sa = cast(short8)a;
2129         short8 sb = cast(short8)b;
2130         short8 greater = greaterMask!short8(sa, sb);
2131         return cast(__m128i)( (greater & sa) | (~greater & sb) );
2132     }
2133     else
2134     {
2135         __m128i lowerShorts = _mm_cmpgt_epi16(a, b); // ones where a should be selected, b else
2136         __m128i aTob = _mm_xor_si128(a, b); // a ^ (a ^ b) == b
2137         __m128i mask = _mm_and_si128(aTob, lowerShorts);
2138         return _mm_xor_si128(b, mask);
2139     }
2140 }
2141 unittest
2142 {
2143     short8 R = cast(short8) _mm_max_epi16(_mm_setr_epi16(32767, 1, -4, -8, 9,  7, 0,-57),
2144                                           _mm_setr_epi16(-4,-8,  9,  7, 0,-32768, 0,  0));
2145     short[8] correct =                                  [32767, 1,  9,  7, 9,  7, 0,  0];
2146     assert(R.array == correct);
2147 }
2148 
/// Compare packed unsigned 8-bit integers in `a` and `b`, and return packed maximum values.
2150 __m128i _mm_max_epu8 (__m128i a, __m128i b) pure @safe
2151 {
2152     version(LDC)
2153     {
2154         // x86: pmaxub since LDC 1.0.0 -O1
2155         // ARM64: umax.16b since LDC 1.5.0 -O1
2156         // PERF: catastrophic on ARM32
2157         ubyte16 sa = cast(ubyte16)a;
2158         ubyte16 sb = cast(ubyte16)b;
2159         ubyte16 greater = cast(ubyte16) greaterMask!ubyte16(sa, sb);
2160         return cast(__m128i)( (greater & sa) | (~greater & sb) );
2161     }
2162     else
2163     {
2164         __m128i value128 = _mm_set1_epi8(-128);
2165         __m128i higher = _mm_cmpgt_epi8(_mm_add_epi8(a, value128), _mm_add_epi8(b, value128)); // signed comparison
2166         __m128i aTob = _mm_xor_si128(a, b); // a ^ (a ^ b) == b
2167         __m128i mask = _mm_and_si128(aTob, higher);
2168         return _mm_xor_si128(b, mask);
2169     }
2170 }
2171 unittest
2172 {
2173     byte16 R = cast(byte16) _mm_max_epu8(_mm_setr_epi8(45, 1, -4, -8, 9,  7, 0,-57, -4,-8,  9,  7, 0,-57, 0,  0),
2174                                          _mm_setr_epi8(-4,-8,  9,  7, 0,-57, 0,  0, 45, 1, -4, -8, 9,  7, 0,-57));
2175     byte[16] correct =                                [-4,-8, -4, -8, 9,-57, 0,-57, -4,-8, -4, -8, 9,-57, 0,-57];
2176     assert(R.array == correct);
2177 }
2178 
2179 /// Compare packed double-precision (64-bit) floating-point elements in `a` and `b`, and return packed maximum values.
2180 __m128d _mm_max_pd (__m128d a, __m128d b) pure @trusted
2181 {
2182     static if (GDC_with_SSE2)
2183     {
2184         return __builtin_ia32_maxpd(a, b);
2185     }
2186     else
2187     {
2188         // x86: Generates maxpd starting with LDC 1.9 -O2
2189         a.ptr[0] = (a.array[0] > b.array[0]) ? a.array[0] : b.array[0];
2190         a.ptr[1] = (a.array[1] > b.array[1]) ? a.array[1] : b.array[1];
2191         return a;
2192     }
2193 }
2194 unittest
2195 {
2196     __m128d A = _mm_setr_pd(4.0, 1.0);
2197     __m128d B = _mm_setr_pd(1.0, 8.0);
2198     __m128d M = _mm_max_pd(A, B);
2199     assert(M.array[0] == 4.0);
2200     assert(M.array[1] == 8.0);
2201 }
2202 
2203 /// Compare the lower double-precision (64-bit) floating-point elements in `a` and `b`, store the maximum value in the 
2204 /// lower element of result, and copy the upper element from `a` to the upper element of result.
2205 __m128d _mm_max_sd (__m128d a, __m128d b) pure @trusted
2206 {
2207     static if (GDC_with_SSE2)
2208     {
2209         return __builtin_ia32_maxsd(a, b);
2210     }
2211     else
2212     {
        __m128d r = a;
2214         // Generates maxsd starting with LDC 1.3
2215         r.ptr[0] = (a.array[0] > b.array[0]) ? a.array[0] : b.array[0];
2216         return r;
2217     }
2218 }
2219 unittest
2220 {
2221     __m128d A = _mm_setr_pd(1.0, 1.0);
2222     __m128d B = _mm_setr_pd(4.0, 2.0);
2223     __m128d M = _mm_max_sd(A, B);
2224     assert(M.array[0] == 4.0);
2225     assert(M.array[1] == 1.0);
2226 }
2227 
2228 /// Perform a serializing operation on all load-from-memory and store-to-memory instructions that were issued prior to 
2229 /// this instruction. Guarantees that every memory access that precedes, in program order, the memory fence instruction 
2230 /// is globally visible before any memory instruction which follows the fence in program order.
2231 void _mm_mfence() @trusted
2232 {
2233     version(GNU)
2234     {
2235         static if (GDC_with_SSE2)
2236         {
2237             __builtin_ia32_mfence();
2238         }
2239         else version(X86)
2240         {
2241             asm pure nothrow @nogc @trusted
2242             {
2243                 "mfence;\n" : : : ;
2244             }
2245         }
2246         else
2247             static assert(false);
2248     }
2249     else static if (LDC_with_SSE2)
2250     {
2251         __builtin_ia32_mfence();
2252     }
2253     else static if (DMD_with_asm)
2254     {
2255         asm nothrow @nogc pure @safe
2256         {
2257             mfence;
2258         }
2259     }
2260     else version(LDC)
2261     {
        // Note: will generate the DMB instruction on ARM
        llvm_memory_fence();
    }
2268     else
2269         static assert(false);
2270 }
2271 unittest
2272 {
2273     _mm_mfence();
2274 }
2275 
2276 /// Compare packed signed 16-bit integers in `a` and `b`, and return packed minimum values.
2277 __m128i _mm_min_epi16 (__m128i a, __m128i b) pure @safe
2278 {
2279     static if (GDC_with_SSE2)
2280     {
2281         return cast(__m128i) __builtin_ia32_pminsw128(cast(short8)a, cast(short8)b);
2282     }
2283     else version(LDC)
2284     {
2285         // x86: pminsw since LDC 1.0 -O1
        // ARM64: smin.8h since LDC 1.5 -O1
2287         short8 sa = cast(short8)a;
2288         short8 sb = cast(short8)b;
2289         short8 greater = greaterMask!short8(sa, sb);
2290         return cast(__m128i)( (~greater & sa) | (greater & sb) );
2291     }
2292     else
2293     {
2294         __m128i lowerShorts = _mm_cmplt_epi16(a, b); // ones where a should be selected, b else
2295         __m128i aTob = _mm_xor_si128(a, b); // a ^ (a ^ b) == b
2296         __m128i mask = _mm_and_si128(aTob, lowerShorts);
2297         return _mm_xor_si128(b, mask);
2298     }
2299 }
2300 unittest
2301 {
2302     short8 R = cast(short8) _mm_min_epi16(_mm_setr_epi16(45, 1, -4, -8, 9,  7, 0,-32768),
2303                                           _mm_setr_epi16(-4,-8,  9,  7, 0,-57, 0,  0));
2304     short[8] correct =                                  [-4,-8, -4, -8, 0,-57, 0, -32768];
2305     assert(R.array == correct);
2306 }
2307 
2308 /// Compare packed unsigned 8-bit integers in `a` and `b`, and return packed minimum values.
2309 __m128i _mm_min_epu8 (__m128i a, __m128i b) pure @safe
2310 {
2311     version(LDC)
2312     {
2313         // x86: pminub since LDC 1.0.0 -O1
2314         // ARM: umin.16b since LDC 1.5.0 -O1
2315         // PERF: catastrophic on ARM32
2316         ubyte16 sa = cast(ubyte16)a;
2317         ubyte16 sb = cast(ubyte16)b;
2318         ubyte16 greater = cast(ubyte16) greaterMask!ubyte16(sa, sb);
2319         return cast(__m128i)( (~greater & sa) | (greater & sb) );
2320     }
2321     else
2322     {
2323         __m128i value128 = _mm_set1_epi8(-128);
2324         __m128i lower = _mm_cmplt_epi8(_mm_add_epi8(a, value128), _mm_add_epi8(b, value128)); // signed comparison
2325         __m128i aTob = _mm_xor_si128(a, b); // a ^ (a ^ b) == b
2326         __m128i mask = _mm_and_si128(aTob, lower);
2327         return _mm_xor_si128(b, mask);
2328     }
2329 }
2330 unittest
2331 {
2332     byte16 R = cast(byte16) _mm_min_epu8(_mm_setr_epi8(45, 1, -4, -8, 9,  7, 0,-57, -4,-8,  9,  7, 0,-57, 0,  0),
2333                                          _mm_setr_epi8(-4,-8,  9,  7, 0,-57, 0,  0, 45, 1, -4, -8, 9,  7, 0,-57));
2334     byte[16] correct =                                [45, 1,  9,  7, 0,  7, 0,  0, 45, 1,  9,  7, 0,  7, 0,  0];
2335     assert(R.array == correct);
2336 }
2337 
2338 /// Compare packed double-precision (64-bit) floating-point elements in `a` and `b`, and return packed minimum values.
2339 __m128d _mm_min_pd (__m128d a, __m128d b) pure @trusted
2340 {
2341     static if (GDC_with_SSE2)
2342     {
2343         return __builtin_ia32_minpd(a, b);
2344     }
2345     else
2346     {
2347         // Generates minpd starting with LDC 1.9
2348         a.ptr[0] = (a.array[0] < b.array[0]) ? a.array[0] : b.array[0];
2349         a.ptr[1] = (a.array[1] < b.array[1]) ? a.array[1] : b.array[1];
2350         return a;
2351     }
2352 }
2353 unittest
2354 {
2355     __m128d A = _mm_setr_pd(1.0, 2.0);
2356     __m128d B = _mm_setr_pd(4.0, 1.0);
2357     __m128d M = _mm_min_pd(A, B);
2358     assert(M.array[0] == 1.0);
2359     assert(M.array[1] == 1.0);
2360 }
2361 
2362 /// Compare the lower double-precision (64-bit) floating-point elements in `a` and `b`, store the minimum value in 
2363 /// the lower element of result, and copy the upper element from `a` to the upper element of result.
2364 __m128d _mm_min_sd (__m128d a, __m128d b) pure @safe
2365 {
2366     static if (GDC_with_SSE2)
2367     {
2368         return __builtin_ia32_minsd(a, b);
2369     }
2370     else
2371     {
2372         // Generates minsd starting with LDC 1.3
2373         __m128d r = a;
2374         r.array[0] = (a.array[0] < b.array[0]) ? a.array[0] : b.array[0];
2375         return r;
2376     }
2377 }
2378 unittest
2379 {
2380     __m128d A = _mm_setr_pd(1.0, 3.0);
2381     __m128d B = _mm_setr_pd(4.0, 2.0);
2382     __m128d M = _mm_min_sd(A, B);
2383     assert(M.array[0] == 1.0);
2384     assert(M.array[1] == 3.0);
2385 }
2386 
2387 /// Copy the lower 64-bit integer in `a` to the lower element of result, and zero the upper element.
2388 __m128i _mm_move_epi64 (__m128i a) pure @trusted
2389 {
2390     static if (GDC_with_SSE2)
2391     {
2392         // slightly better with GDC -O0
2393         return cast(__m128i) __builtin_ia32_movq128(cast(long2)a); 
2394     }
2395     else
2396     {
2397         long2 result = [ 0, 0 ];
2398         long2 la = cast(long2) a;
2399         result.ptr[0] = la.array[0];
2400         return cast(__m128i)(result);
2401     }
2402 }
2403 unittest
2404 {
2405     long2 A = [13, 47];
2406     long2 B = cast(long2) _mm_move_epi64( cast(__m128i)A );
2407     long[2] correct = [13, 0];
2408     assert(B.array == correct);
2409 }
2410 
2411 /// Move the lower double-precision (64-bit) floating-point element from `b` to the lower element of result, and copy 
2412 /// the upper element from `a` to the upper element of dst.
2413 __m128d _mm_move_sd (__m128d a, __m128d b) pure @trusted
2414 {
2415     static if (GDC_with_SSE2)
2416     {
2417         return __builtin_ia32_movsd(a, b); 
2418     }
2419     else
2420     {
2421         b.ptr[1] = a.array[1];
2422         return b;
2423     }
2424 }
2425 unittest
2426 {
2427     double2 A = [13.0, 47.0];
2428     double2 B = [34.0, 58.0];
2429     double2 C = _mm_move_sd(A, B);
2430     double[2] correct = [34.0, 47.0];
2431     assert(C.array == correct);
2432 }
2433 
/// Create mask from the most significant bit of each 8-bit element in `a`.
2435 int _mm_movemask_epi8 (__m128i a) pure @trusted
2436 {
2437     // PERF: Not possible in D_SIMD because of https://issues.dlang.org/show_bug.cgi?id=8047
2438     static if (GDC_with_SSE2)
2439     {
2440         return __builtin_ia32_pmovmskb128(cast(ubyte16)a);
2441     }
2442     else static if (LDC_with_SSE2)
2443     {
2444         return __builtin_ia32_pmovmskb128(cast(byte16)a);
2445     }
2446     else static if (LDC_with_ARM64)
2447     {
2448         // Solution from https://stackoverflow.com/questions/11870910/sse-mm-movemask-epi8-equivalent-method-for-arm-neon
        // The other two solutions lead to intrinsics that LLVM could not find, and that took a long time to debug.
        // So there might be something a bit faster, but this one is reasonable and branchless.
2451         byte8 mask_shift;
2452         mask_shift.ptr[0] = 7;
2453         mask_shift.ptr[1] = 6;
2454         mask_shift.ptr[2] = 5;
2455         mask_shift.ptr[3] = 4;
2456         mask_shift.ptr[4] = 3;
2457         mask_shift.ptr[5] = 2;
2458         mask_shift.ptr[6] = 1;
2459         mask_shift.ptr[7] = 0;
2460         byte8 mask_and = byte8(-128);
2461         byte8 lo = vget_low_u8(cast(byte16)a);
2462         byte8 hi = vget_high_u8(cast(byte16)a);
2463         lo = vand_u8(lo, mask_and);
2464         lo = vshr_u8(lo, mask_shift);
2465         hi = vand_u8(hi, mask_and);
2466         hi = vshr_u8(hi, mask_shift);
2467         lo = vpadd_u8(lo,lo);
2468         lo = vpadd_u8(lo,lo);
2469         lo = vpadd_u8(lo,lo);
2470         hi = vpadd_u8(hi,hi);
2471         hi = vpadd_u8(hi,hi);
2472         hi = vpadd_u8(hi,hi);
2473         return (cast(ubyte)(hi[0]) << 8) | cast(ubyte)(lo[0]);
2474     }
2475     else
2476     {
2477         byte16 ai = cast(byte16)a;
2478         int r = 0;
2479         foreach(bit; 0..16)
2480         {
2481             if (ai.array[bit] < 0) r += (1 << bit);
2482         }
2483         return r;
2484     }
2485 }
2486 unittest
2487 {
2488     assert(0x9C36 == _mm_movemask_epi8(_mm_set_epi8(-1, 1, 2, -3, -1, -1, 4, 8, 127, 0, -1, -1, 0, -1, -1, 0)));
2489 }
2490 
/// Set each bit of mask result based on the most significant bit of the corresponding packed double-precision (64-bit) 
/// floating-point element in `v`.
2493 int _mm_movemask_pd(__m128d v) pure @safe
2494 {
2495     // PERF: Not possible in D_SIMD because of https://issues.dlang.org/show_bug.cgi?id=8047
2496     static if (GDC_with_SSE2)
2497     {
2500         return __builtin_ia32_movmskpd(v);
2501     }
2502     else static if (LDC_with_SSE2)
2503     {
2506         return __builtin_ia32_movmskpd(v);
2507     }
2508     else
2509     {
2510         long2 lv = cast(long2)v;
2511         int r = 0;
2512         if (lv.array[0] < 0) r += 1;
2513         if (lv.array[1] < 0) r += 2;
2514         return r;
2515     }
2516 }
2517 unittest
2518 {
2519     __m128d A = cast(__m128d) _mm_set_epi64x(-1, 0);
2520     assert(_mm_movemask_pd(A) == 2);
2521 }
2522 
2523 /// Copy the lower 64-bit integer in `v`.
2524 __m64 _mm_movepi64_pi64 (__m128i v) pure @safe
2525 {
2526     long2 lv = cast(long2)v;
2527     return long1(lv.array[0]);
2528 }
2529 unittest
2530 {
2531     __m128i A = _mm_set_epi64x(-1, -2);
2532     __m64 R = _mm_movepi64_pi64(A);
2533     assert(R.array[0] == -2);
2534 }
2535 
2536 /// Copy the 64-bit integer `a` to the lower element of dest, and zero the upper element.
2537 __m128i _mm_movpi64_epi64 (__m64 a) pure @trusted
2538 {
2539     long2 r;
2540     r.ptr[0] = a.array[0];
2541     r.ptr[1] = 0;
2542     return cast(__m128i)r;
2543 }
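// Illustrative example (added): the upper lane of the result is zeroed.
unittest
{
    __m64 A = _mm_cvtsi64_m64(-1024);
    long2 R = cast(long2) _mm_movpi64_epi64(A);
    long[2] correct = [-1024, 0];
    assert(R.array == correct);
}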
2544 
// Note: generates pmuludq in LDC with -O1
/// Multiply the low unsigned 32-bit integers from each packed 64-bit element in `a` and `b`, 
/// and return the unsigned 64-bit results.
__m128i _mm_mul_epu32 (__m128i a, __m128i b) pure @trusted
2547 {
2548     __m128i zero = _mm_setzero_si128();
2549 
2550     static if (__VERSION__ >= 2088)
2551     {
2552         // Need LLVM9 to avoid this shufflevector
2553         long2 la, lb;
2554         la.ptr[0] = cast(uint)a.array[0];
2555         la.ptr[1] = cast(uint)a.array[2];
2556         lb.ptr[0] = cast(uint)b.array[0];
2557         lb.ptr[1] = cast(uint)b.array[2];
2558     }
2559     else
2560     {
2561         long2 la = cast(long2) shufflevector!(int4, 0, 4, 2, 6)(a, zero);
2562         long2 lb = cast(long2) shufflevector!(int4, 0, 4, 2, 6)(b, zero);
2563     }
2564 
2565     version(DigitalMars)
2566     {
2567         // DMD has no long2 mul
2568         // long2 mul not supported before LDC 1.5
2569         la.ptr[0] *= lb.array[0];
2570         la.ptr[1] *= lb.array[1];
2571         return cast(__m128i)(la);
2572     }
2573     else
2574     {
2575         static if (__VERSION__ >= 2076)
2576         {
2577             return cast(__m128i)(la * lb);
2578         }
2579         else
2580         {
2581             // long2 mul not supported before LDC 1.5
2582             la.ptr[0] *= lb.array[0];
2583             la.ptr[1] *= lb.array[1];
2584             return cast(__m128i)(la);
2585         }
2586     }
2587 }
2588 unittest
2589 {
2590     __m128i A = _mm_set_epi32(42, 0xDEADBEEF, 42, 0xffffffff);
2591     __m128i B = _mm_set_epi32(42, 0xCAFEBABE, 42, 0xffffffff);
2592     __m128i C = _mm_mul_epu32(A, B);
2593     long2 LC = cast(long2)C;
2594     assert(LC.array[0] == 18446744065119617025uL);
2595     assert(LC.array[1] == 12723420444339690338uL);
2596 }
2597 
2598 /// Multiply packed double-precision (64-bit) floating-point elements in `a` and `b`, and return the results. 
2599 __m128d _mm_mul_pd(__m128d a, __m128d b) pure @safe
2600 {
2601     pragma(inline, true);
2602     return a * b;
2603 }
2604 unittest
2605 {
2606     __m128d a = [-2.0, 1.5];
2607     a = _mm_mul_pd(a, a);
2608     assert(a.array == [4.0, 2.25]);
2609 }
2610 
2611 /// Multiply the lower double-precision (64-bit) floating-point element in `a` and `b`, store the result in the lower 
2612 /// element of result, and copy the upper element from `a` to the upper element of result.
2613 __m128d _mm_mul_sd(__m128d a, __m128d b) pure @trusted
2614 {
2615     version(DigitalMars)
2616     {    
2617         // Work-around for https://issues.dlang.org/show_bug.cgi?id=19599
2618         // Note that this is unneeded since DMD >= 2.094.0 at least, haven't investigated again
2619         asm pure nothrow @nogc @trusted { nop;}
2620         a.array[0] = a.array[0] * b.array[0];
2621         return a;
2622     }
2623     else static if (GDC_with_SSE2)
2624     {
2625         return __builtin_ia32_mulsd(a, b);
2626     }
2627     else
2628     {
2629         a.ptr[0] *= b.array[0];
2630         return a;
2631     }
2632 }
2633 unittest
2634 {
2635     __m128d a = [-2.0, 1.5];
2636     a = _mm_mul_sd(a, a);
2637     assert(a.array == [4.0, 1.5]);
2638 }
2639 
2640 /// Multiply the low unsigned 32-bit integers from `a` and `b`, 
2641 /// and get an unsigned 64-bit result.
2642 __m64 _mm_mul_su32 (__m64 a, __m64 b) pure @safe
2643 {
2644     return to_m64(_mm_mul_epu32(to_m128i(a), to_m128i(b)));
2645 }
2646 unittest
2647 {
2648     __m64 A = _mm_set_pi32(42, 0xDEADBEEF);
2649     __m64 B = _mm_set_pi32(42, 0xCAFEBABE);
2650     __m64 C = _mm_mul_su32(A, B);
2651     assert(C.array[0] == 0xDEADBEEFuL * 0xCAFEBABEuL);
2652 }
2653 
2654 /// Multiply the packed signed 16-bit integers in `a` and `b`, producing intermediate 32-bit integers, and return the 
2655 /// high 16 bits of the intermediate integers.
2656 __m128i _mm_mulhi_epi16 (__m128i a, __m128i b) pure @trusted
2657 {
2658     static if (GDC_with_SSE2)
2659     {
2660         return cast(__m128i) __builtin_ia32_pmulhw128(cast(short8)a, cast(short8)b);
2661     }
2662     else static if (LDC_with_SSE2)
2663     {
2664         return cast(__m128i) __builtin_ia32_pmulhw128(cast(short8)a, cast(short8)b);
2665     }
2666     else
2667     {
2668         // ARM64: LDC 1.5 -O2 or later gives a nice sequence with 2 x ext.16b, 2 x smull.4s and shrn.4h shrn2.8h
2669         //        PERF: it seems the simde solution has one less instruction in ARM64.
2670         // PERF: Catastrophic in ARM32.
2671         short8 sa = cast(short8)a;
2672         short8 sb = cast(short8)b;
2673         short8 r = void;
2674         r.ptr[0] = (sa.array[0] * sb.array[0]) >> 16;
2675         r.ptr[1] = (sa.array[1] * sb.array[1]) >> 16;
2676         r.ptr[2] = (sa.array[2] * sb.array[2]) >> 16;
2677         r.ptr[3] = (sa.array[3] * sb.array[3]) >> 16;
2678         r.ptr[4] = (sa.array[4] * sb.array[4]) >> 16;
2679         r.ptr[5] = (sa.array[5] * sb.array[5]) >> 16;
2680         r.ptr[6] = (sa.array[6] * sb.array[6]) >> 16;
2681         r.ptr[7] = (sa.array[7] * sb.array[7]) >> 16;
2682         return cast(__m128i)r;
2683     }
2684 }
2685 unittest
2686 {
2687     __m128i A = _mm_setr_epi16(0, -16, 2, 3, 4, 8, 16, 7);
2688     __m128i B = _mm_set1_epi16(16384);
2689     short8 R = cast(short8)_mm_mulhi_epi16(A, B);
2690     short[8] correct = [0, -4, 0, 0, 1, 2, 4, 1];
2691     assert(R.array == correct);
2692 }
2693 
2694 /// Multiply the packed unsigned 16-bit integers in `a` and `b`, producing intermediate 32-bit integers, and return the 
2695 /// high 16 bits of the intermediate integers.
2696 __m128i _mm_mulhi_epu16 (__m128i a, __m128i b) pure @trusted
2697 {
2698     static if (GDC_with_SSE2)
2699     {
2700         return cast(__m128i) __builtin_ia32_pmulhuw128(cast(short8)a, cast(short8)b);
2701     }
2702     else static if (LDC_with_SSE2)
2703     {
2704         return cast(__m128i) __builtin_ia32_pmulhuw128(cast(short8)a, cast(short8)b);
2705     }
2706     else
2707     {
2708         // ARM64: LDC 1.5 -O2 or later gives a nice sequence with 2 x ext.16b, 2 x umull.4s and shrn.4h shrn2.8h
2709         //      it seems the simde solution has one less instruction in ARM64
2710         // PERF: Catastrophic in ARM32.
2711         short8 sa = cast(short8)a;
2712         short8 sb = cast(short8)b;
2713         short8 r = void;
2714         r.ptr[0] = cast(short)( (cast(ushort)sa.array[0] * cast(ushort)sb.array[0]) >> 16 );
2715         r.ptr[1] = cast(short)( (cast(ushort)sa.array[1] * cast(ushort)sb.array[1]) >> 16 );
2716         r.ptr[2] = cast(short)( (cast(ushort)sa.array[2] * cast(ushort)sb.array[2]) >> 16 );
2717         r.ptr[3] = cast(short)( (cast(ushort)sa.array[3] * cast(ushort)sb.array[3]) >> 16 );
2718         r.ptr[4] = cast(short)( (cast(ushort)sa.array[4] * cast(ushort)sb.array[4]) >> 16 );
2719         r.ptr[5] = cast(short)( (cast(ushort)sa.array[5] * cast(ushort)sb.array[5]) >> 16 );
2720         r.ptr[6] = cast(short)( (cast(ushort)sa.array[6] * cast(ushort)sb.array[6]) >> 16 );
2721         r.ptr[7] = cast(short)( (cast(ushort)sa.array[7] * cast(ushort)sb.array[7]) >> 16 );
2722         return cast(__m128i)r;
2723     }
2724 }
2725 unittest
2726 {
2727     __m128i A = _mm_setr_epi16(0, -16, 2, 3, 4, 8, 16, 7);
2728     __m128i B = _mm_set1_epi16(16384);
2729     short8 R = cast(short8)_mm_mulhi_epu16(A, B);
2730     short[8] correct = [0, 0x3FFC, 0, 0, 1, 2, 4, 1];
2731     assert(R.array == correct);
2732 }
2733 
2734 /// Multiply the packed 16-bit integers in `a` and `b`, producing intermediate 32-bit integers, and return the low 16 
2735 /// bits of the intermediate integers.
2736 __m128i _mm_mullo_epi16 (__m128i a, __m128i b) pure @safe
2737 {
2738     return cast(__m128i)(cast(short8)a * cast(short8)b);
2739 }
2740 unittest
2741 {
2742     __m128i A = _mm_setr_epi16(16384, -16, 0,      3, 4, 1, 16, 7);
2743     __m128i B = _mm_set1_epi16(16384);
2744     short8 R = cast(short8)_mm_mullo_epi16(A, B);
2745     short[8] correct = [0, 0, 0, -16384, 0, 16384, 0, -16384];
2746     assert(R.array == correct);
2747 }
2748 
2749 /// Compute the bitwise OR of packed double-precision (64-bit) floating-point elements in `a` and `b`.
2750 __m128d _mm_or_pd (__m128d a, __m128d b) pure @safe
2751 {
2752     pragma(inline, true);
2753     return cast(__m128d)( cast(__m128i)a | cast(__m128i)b );
2754 }
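// Illustrative example (added): OR with an all-zero bit pattern leaves the operand unchanged.
unittest
{
    __m128d A = _mm_setr_pd(4.0, -2.0);
    __m128d R = _mm_or_pd(A, _mm_setzero_pd());
    assert(R.array == A.array);
}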
2755 
2756 /// Compute the bitwise OR of 128 bits (representing integer data) in `a` and `b`.
2757 __m128i _mm_or_si128 (__m128i a, __m128i b) pure @safe
2758 {
2759     pragma(inline, true);
2760     return a | b;
2761 }
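// Illustrative example (added): bitwise OR, lane by lane.
unittest
{
    __m128i A = _mm_setr_epi32(0x0F0F0F0F, 0, -1, 4);
    __m128i B = _mm_setr_epi32(0x70707070, 3,  0, 8);
    int4 R = cast(int4) _mm_or_si128(A, B);
    int[4] correct = [0x7F7F7F7F, 3, -1, 12];
    assert(R.array == correct);
}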
2762 
2763 /// Convert packed signed 32-bit integers from `a` and `b` to packed 16-bit integers using signed saturation.
2764 __m128i _mm_packs_epi32 (__m128i a, __m128i b) pure @trusted
2765 {
2766     static if (GDC_with_SSE2)
2767     {
2768         return cast(__m128i) __builtin_ia32_packssdw128(a, b);
2769     }    
2770     else static if (LDC_with_SSE2)
2771     {
2772         return cast(__m128i) __builtin_ia32_packssdw128(a, b);
2773     }
2774     else static if (LDC_with_ARM64)
2775     {
2776         short4 ra = vqmovn_s32(cast(int4)a);
2777         short4 rb = vqmovn_s32(cast(int4)b);
2778         return cast(__m128i)vcombine_s16(ra, rb);
2779     }
2780     else
2781     {
2782         // PERF: catastrophic on ARM32
2783         short8 r;
2784         r.ptr[0] = saturateSignedIntToSignedShort(a.array[0]);
2785         r.ptr[1] = saturateSignedIntToSignedShort(a.array[1]);
2786         r.ptr[2] = saturateSignedIntToSignedShort(a.array[2]);
2787         r.ptr[3] = saturateSignedIntToSignedShort(a.array[3]);
2788         r.ptr[4] = saturateSignedIntToSignedShort(b.array[0]);
2789         r.ptr[5] = saturateSignedIntToSignedShort(b.array[1]);
2790         r.ptr[6] = saturateSignedIntToSignedShort(b.array[2]);
2791         r.ptr[7] = saturateSignedIntToSignedShort(b.array[3]);
2792         return cast(__m128i)r;
2793     }
2794 }
2795 unittest
2796 {
2797     __m128i A = _mm_setr_epi32(100000, -100000, 1000, 0);
2798     short8 R = cast(short8) _mm_packs_epi32(A, A);
2799     short[8] correct = [32767, -32768, 1000, 0, 32767, -32768, 1000, 0];
2800     assert(R.array == correct);
2801 }
2802 
2803 /// Convert packed signed 16-bit integers from `a` and `b` to packed 8-bit integers using signed saturation.
2804 __m128i _mm_packs_epi16 (__m128i a, __m128i b) pure @trusted
2805 {
2806     static if (GDC_with_SSE2)
2807     {
2808         return cast(__m128i) __builtin_ia32_packsswb128(cast(short8)a, cast(short8)b);
2809     }
2810     else static if (LDC_with_SSE2)
2811     {
2812         return cast(__m128i) __builtin_ia32_packsswb128(cast(short8)a, cast(short8)b);
2813     }
2814     else static if (LDC_with_ARM64)
2815     {
        // Generates a nice pair of sqxtn.8b + sqxtn2 since LDC 1.5 -O2
2817         byte8 ra = vqmovn_s16(cast(short8)a);
2818         byte8 rb = vqmovn_s16(cast(short8)b);
2819         return cast(__m128i)vcombine_s8(ra, rb);
2820     }
2821     else
2822     {
2823         // PERF: ARM32 is missing
2824         byte16 r;
2825         short8 sa = cast(short8)a;
2826         short8 sb = cast(short8)b;
2827         foreach(i; 0..8)
2828             r.ptr[i] = saturateSignedWordToSignedByte(sa.array[i]);
2829         foreach(i; 0..8)
2830             r.ptr[i+8] = saturateSignedWordToSignedByte(sb.array[i]);
2831         return cast(__m128i)r;
2832     }
2833 }
2834 unittest
2835 {
2836     __m128i A = _mm_setr_epi16(1000, -1000, 1000, 0, 256, -129, 254, 0);
2837     byte16 R = cast(byte16) _mm_packs_epi16(A, A);
2838     byte[16] correct = [127, -128, 127, 0, 127, -128, 127, 0,
2839                         127, -128, 127, 0, 127, -128, 127, 0];
2840     assert(R.array == correct);
2841 }
2842 
2843 /// Convert packed signed 16-bit integers from `a` and `b` to packed 8-bit integers using unsigned saturation.
2844 __m128i _mm_packus_epi16 (__m128i a, __m128i b) pure @trusted
2845 {
2846     static if (GDC_with_SSE2)
2847     {
2848         return cast(__m128i) __builtin_ia32_packuswb128(cast(short8)a, cast(short8)b);
2849     }
2850     else static if (LDC_with_SSE2)
2851     {
2852         return cast(__m128i) __builtin_ia32_packuswb128(cast(short8)a, cast(short8)b);
2853     }
2854     else static if (LDC_with_ARM64)
2855     {
        // Generates a nice pair of sqxtun + sqxtun2 since LDC 1.5 -O2
2857         byte8 ra = vqmovun_s16(cast(short8)a);
2858         byte8 rb = vqmovun_s16(cast(short8)b);
2859         return cast(__m128i)vcombine_s8(ra, rb);
2860     }
2861     else
2862     {
2863         short8 sa = cast(short8)a;
2864         short8 sb = cast(short8)b;
2865         ubyte[16] result = void;
2866         for (int i = 0; i < 8; ++i)
2867         {
2868             short s = sa[i];
2869             if (s < 0) s = 0;
2870             if (s > 255) s = 255;
2871             result[i] = cast(ubyte)s;
2872 
2873             s = sb[i];
2874             if (s < 0) s = 0;
2875             if (s > 255) s = 255;
2876             result[i+8] = cast(ubyte)s;
2877         }
2878         return cast(__m128i) loadUnaligned!(byte16)(cast(byte*)result.ptr);
2879     }
2880 }
2881 unittest
2882 {
2883     __m128i A = _mm_setr_epi16(-10, 400, 0, 256, 255, 2, 1, 0);
2884     byte16 AA = cast(byte16) _mm_packus_epi16(A, A);
2885     static immutable ubyte[16] correctResult = [0, 255, 0, 255, 255, 2, 1, 0,
2886                                                 0, 255, 0, 255, 255, 2, 1, 0];
2887     foreach(i; 0..16)
2888         assert(AA.array[i] == cast(byte)(correctResult[i]));
2889 }
2890 
2891 /// Provide a hint to the processor that the code sequence is a spin-wait loop. This can help improve the performance 
2892 /// and power consumption of spin-wait loops.
2893 void _mm_pause() @trusted
2894 {
2895     version(GNU)
2896     {
2897         static if (GDC_with_SSE2)
2898         {
2899             __builtin_ia32_pause();
2900         }
2901         else version(X86)
2902         {
2903             asm pure nothrow @nogc @trusted
2904             {
2905                 "pause;\n" : : : ;
2906             }
2907         }
2908         else
2909             static assert(false);
2910     }
2911     else static if (LDC_with_SSE2)
2912     {
2913         __builtin_ia32_pause();
2914     }
2915     else static if (DMD_with_asm)
2916     {
2917         asm nothrow @nogc pure @safe
2918         {
2919             rep; nop; // F3 90 =  pause
2920         }
2921     }
2922     else version (LDC)
2923     {
        // PERF: Does nothing currently; could be the "yield" instruction on ARM.
2925     }
2926     else
2927         static assert(false);
2928 }
2929 unittest
2930 {
2931     _mm_pause();
2932 }
2933 
2934 /// Compute the absolute differences of packed unsigned 8-bit integers in `a` and `b`, then horizontally sum each 
2935 /// consecutive 8 differences to produce two unsigned 16-bit integers, and pack these unsigned 16-bit integers in the 
2936 /// low 16 bits of 64-bit elements in result.
2937 __m128i _mm_sad_epu8 (__m128i a, __m128i b) pure @trusted
2938 {
2939     static if (GDC_with_SSE2)
2940     {
2941         return cast(__m128i) __builtin_ia32_psadbw128(cast(ubyte16)a, cast(ubyte16)b);
2942     }
2943     else static if (LDC_with_SSE2)
2944     {
2945         return cast(__m128i) __builtin_ia32_psadbw128(cast(byte16)a, cast(byte16)b);
2946     }
2947     else static if (LDC_with_ARM64)
2948     {
2949         ushort8 t = cast(ushort8) vpaddlq_u8(vabdq_u8(cast(byte16) a, cast(byte16) b));
2950 
2951         // PERF: Looks suboptimal vs addp
2952         ushort r0 = cast(ushort)(t[0] + t[1] + t[2] + t[3]);
2953         ushort r4 = cast(ushort)(t[4] + t[5] + t[6] + t[7]);
2954         ushort8 r = 0;
2955         r[0] = r0;
2956         r[4] = r4;
2957         return cast(__m128i) r;
2958     }
2959     else
2960     {
2961         // PERF: ARM32 is lacking
2962         byte16 ab = cast(byte16)a;
2963         byte16 bb = cast(byte16)b;
2964         ubyte[16] t;
2965         foreach(i; 0..16)
2966         {
2967             int diff = cast(ubyte)(ab.array[i]) - cast(ubyte)(bb.array[i]);
2968             if (diff < 0) diff = -diff;
2969             t[i] = cast(ubyte)(diff);
2970         }
2971         int4 r = _mm_setzero_si128();
2972         r.ptr[0] = t[0] + t[1] + t[2] + t[3] + t[4] + t[5] + t[6] + t[7];
2973         r.ptr[2] = t[8] + t[9] + t[10]+ t[11]+ t[12]+ t[13]+ t[14]+ t[15];
2974         return r;
2975     }
2976 }
2977 unittest
2978 {
2979     __m128i A = _mm_setr_epi8(3, 4, 6, 8, 12, 14, 18, 20, 24, 30, 32, 38, 42, 44, 48, 54); // primes + 1
2980     __m128i B = _mm_set1_epi8(1);
2981     __m128i R = _mm_sad_epu8(A, B);
2982     int[4] correct = [2 + 3 + 5 + 7 + 11 + 13 + 17 + 19,
2983                       0,
2984                       23 + 29 + 31 + 37 + 41 + 43 + 47 + 53,
2985                       0];
2986     assert(R.array == correct);
2987 }
2988 
2989 /// Set packed 16-bit integers with the supplied values.
2990 __m128i _mm_set_epi16 (short e7, short e6, short e5, short e4, short e3, short e2, short e1, short e0) pure @trusted
2991 {
2992     short[8] result = [e0, e1, e2, e3, e4, e5, e6, e7];
2993     return cast(__m128i) loadUnaligned!(short8)(result.ptr);
2994 }
2995 unittest
2996 {
2997     __m128i A = _mm_set_epi16(7, 6, 5, 4, 3, 2, 1, 0);
2998     short8 B = cast(short8) A;
2999     foreach(i; 0..8)
3000         assert(B.array[i] == i);
3001 }
3002 
3003 /// Set packed 32-bit integers with the supplied values.
3004 __m128i _mm_set_epi32 (int e3, int e2, int e1, int e0) pure @trusted
3005 {
3006     pragma(inline, true);
3007     int[4] result = [e0, e1, e2, e3];
3008     return loadUnaligned!(int4)(result.ptr);
3009 }
3010 unittest
3011 {
3012     __m128i A = _mm_set_epi32(3, 2, 1, 0);
3013     foreach(i; 0..4)
3014         assert(A.array[i] == i);
3015 }
3016 
3017 /// Set packed 64-bit integers with the supplied values.
3018 __m128i _mm_set_epi64(__m64 e1, __m64 e0) pure @trusted
3019 {
3020     pragma(inline, true);
3021     long[2] result = [e0.array[0], e1.array[0]];
3022     return cast(__m128i)( loadUnaligned!(long2)(result.ptr) );
3023 }
3024 unittest
3025 {
3026     __m128i A = _mm_set_epi64(_mm_cvtsi64_m64(1234), _mm_cvtsi64_m64(5678));
3027     long2 B = cast(long2) A;
3028     assert(B.array[0] == 5678);
3029     assert(B.array[1] == 1234);
3030 }
3031 
3032 /// Set packed 64-bit integers with the supplied values.
3033 __m128i _mm_set_epi64x (long e1, long e0) pure @trusted
3034 {
3035     pragma(inline, true);
3036     long[2] result = [e0, e1];
3037     return cast(__m128i)( loadUnaligned!(long2)(result.ptr) );
3038 }
3039 unittest
3040 {
3041     __m128i A = _mm_set_epi64x(1234, 5678);
3042     long2 B = cast(long2) A;
3043     assert(B.array[0] == 5678);
3044     assert(B.array[1] == 1234);
3045 }
3046 
3047 /// Set packed 8-bit integers with the supplied values.
3048 __m128i _mm_set_epi8 (byte e15, byte e14, byte e13, byte e12,
3049                       byte e11, byte e10, byte e9, byte e8,
3050                       byte e7, byte e6, byte e5, byte e4,
3051                       byte e3, byte e2, byte e1, byte e0) pure @trusted
3052 {
3053     byte[16] result = [e0, e1,  e2,  e3,  e4,  e5,  e6, e7,
3054                      e8, e9, e10, e11, e12, e13, e14, e15];
3055     return cast(__m128i)( loadUnaligned!(byte16)(result.ptr) );
3056 }
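// Illustrative example (added): e0 lands in lane 0, e15 in lane 15.
unittest
{
    byte16 A = cast(byte16) _mm_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
    foreach(i; 0..16)
        assert(A.array[i] == i);
}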
3057 
3058 /// Set packed double-precision (64-bit) floating-point elements with the supplied values.
3059 __m128d _mm_set_pd (double e1, double e0) pure @trusted
3060 {
3061     pragma(inline, true);
3062     double[2] result = [e0, e1];
3063     return loadUnaligned!(double2)(result.ptr);
3064 }
3065 unittest
3066 {
3067     __m128d A = _mm_set_pd(61.0, 55.0);
3068     double[2] correct = [55.0, 61.0];
3069     assert(A.array == correct);
3070 }
3071 
/// Broadcast double-precision (64-bit) floating-point value `a` to all elements.
3073 __m128d _mm_set_pd1 (double a) pure @trusted
3074 {
3075     pragma(inline, true);
3076     double[2] result = [a, a];
3077     return loadUnaligned!(double2)(result.ptr);
3078 }
3079 unittest
3080 {
3081     __m128d A = _mm_set_pd1(61.0);
3082     double[2] correct = [61.0, 61.0];
3083     assert(A.array == correct);
3084 }
3085 
3086 /// Copy double-precision (64-bit) floating-point element `a` to the lower element of result, 
3087 /// and zero the upper element.
3088 __m128d _mm_set_sd (double a) pure @trusted
3089 {
3090     double[2] result = [a, 0];
3091     return loadUnaligned!(double2)(result.ptr);
3092 }
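// Illustrative example (added): the upper lane is zeroed.
unittest
{
    __m128d A = _mm_set_sd(-2.0);
    double[2] correct = [-2.0, 0.0];
    assert(A.array == correct);
}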
3093 
/// Broadcast 16-bit integer `a` to all elements.
3095 __m128i _mm_set1_epi16 (short a) pure @trusted
3096 {
3097     version(DigitalMars) // workaround https://issues.dlang.org/show_bug.cgi?id=21469 
3098     {
3099         short8 v = a;
3100         return cast(__m128i) v;
3101     }
3102     else
3103     {
3104         pragma(inline, true);
3105         return cast(__m128i)(short8(a));
3106     }
3107 }
3108 unittest
3109 {
3110     short8 a = cast(short8) _mm_set1_epi16(31);
3111     for (int i = 0; i < 8; ++i)
3112         assert(a.array[i] == 31);
3113 }
3114 
3115 /// Broadcast 32-bit integer `a` to all elements.
3116 __m128i _mm_set1_epi32 (int a) pure @trusted
3117 {
3118     pragma(inline, true);
3119     return cast(__m128i)(int4(a));
3120 }
3121 unittest
3122 {
3123     int4 a = cast(int4) _mm_set1_epi32(31);
3124     for (int i = 0; i < 4; ++i)
3125         assert(a.array[i] == 31);
3126 }
3127 
3128 /// Broadcast 64-bit integer `a` to all elements.
3129 __m128i _mm_set1_epi64 (__m64 a) pure @safe
3130 {
3131     return _mm_set_epi64(a, a);
3132 }
3133 unittest
3134 {
3135     long b = 0x1DEADCAFE; 
3136     __m64 a;
3137     a.ptr[0] = b;
3138     long2 c = cast(long2) _mm_set1_epi64(a);
3139     assert(c.array[0] == b);
3140     assert(c.array[1] == b);
3141 }
3142 
/// Broadcast 64-bit integer `a` to all elements.
3144 __m128i _mm_set1_epi64x (long a) pure @trusted
3145 {
3146     long2 b = a; // Must be on its own line to workaround https://issues.dlang.org/show_bug.cgi?id=21470
3147     return cast(__m128i)(b);
3148 }
3149 unittest
3150 {
3151     long b = 0x1DEADCAFE;
3152     long2 c = cast(long2) _mm_set1_epi64x(b);
3153     for (int i = 0; i < 2; ++i)
3154         assert(c.array[i] == b);
3155 }
3156 
3157 /// Broadcast 8-bit integer `a` to all elements.
3158 __m128i _mm_set1_epi8 (byte a) pure @trusted
3159 {
3160     pragma(inline, true);
3161     byte16 b = a; // Must be on its own line to workaround https://issues.dlang.org/show_bug.cgi?id=21470
3162     return cast(__m128i)(b);
3163 }
3164 unittest
3165 {
3166     byte16 b = cast(byte16) _mm_set1_epi8(31);
3167     for (int i = 0; i < 16; ++i)
3168         assert(b.array[i] == 31);
3169 }
3170 
alias _mm_set1_pd = _mm_set_pd1; ///
3172 
3173 /// Set packed 16-bit integers with the supplied values in reverse order.
3174 __m128i _mm_setr_epi16 (short e7, short e6, short e5, short e4, 
3175                         short e3, short e2, short e1, short e0) pure @trusted
3176 {
3177     short[8] result = [e7, e6, e5, e4, e3, e2, e1, e0];
3178     return cast(__m128i)( loadUnaligned!(short8)(result.ptr) );
3179 }
3180 unittest
3181 {
3182     short8 A = cast(short8) _mm_setr_epi16(7, 6, 5, -32768, 32767, 2, 1, 0);
3183     short[8] correct = [7, 6, 5, -32768, 32767, 2, 1, 0];
3184     assert(A.array == correct);
3185 }
3186 
3187 /// Set packed 32-bit integers with the supplied values in reverse order.
3188 __m128i _mm_setr_epi32 (int e3, int e2, int e1, int e0) pure @trusted
3189 {
3190     pragma(inline, true);
3191     int[4] result = [e3, e2, e1, e0];
3192     return cast(__m128i)( loadUnaligned!(int4)(result.ptr) );
3193 }
3194 unittest
3195 {
3196     int4 A = cast(int4) _mm_setr_epi32(-1, 0, -2147483648, 2147483647);
3197     int[4] correct = [-1, 0, -2147483648, 2147483647];
3198     assert(A.array == correct);
3199 }
3200 
3201 /// Set packed 64-bit integers with the supplied values in reverse order.
3202 __m128i _mm_setr_epi64 (long e1, long e0) pure @trusted
3203 {
3204     long[2] result = [e1, e0];
3205     return cast(__m128i)( loadUnaligned!(long2)(result.ptr) );
3206 }
3207 unittest
3208 {
3209     long2 A = cast(long2) _mm_setr_epi64(-1, 0);
3210     long[2] correct = [-1, 0];
3211     assert(A.array == correct);
3212 }
3213 
3214 /// Set packed 8-bit integers with the supplied values in reverse order.
3215 __m128i _mm_setr_epi8 (byte e15, byte e14, byte e13, byte e12,
3216                        byte e11, byte e10, byte e9,  byte e8,
3217                        byte e7,  byte e6,  byte e5,  byte e4,
3218                        byte e3,  byte e2,  byte e1,  byte e0) pure @trusted
3219 {
3220     byte[16] result = [e15, e14, e13, e12, e11, e10, e9, e8,
3221                       e7,  e6,  e5,  e4,  e3,  e2, e1, e0];
3222     return cast(__m128i)( loadUnaligned!(byte16)(result.ptr) );
3223 }
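// Like the other setr unittests: the first argument lands in the lowest lane.
unittest
{
    byte16 A = cast(byte16) _mm_setr_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
    byte[16] correct = [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0];
    assert(A.array == correct);
}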
3224 
3225 /// Set packed double-precision (64-bit) floating-point elements with the supplied values in reverse order.
3226 __m128d _mm_setr_pd (double e1, double e0) pure @trusted
3227 {
3228     pragma(inline, true);
3229     double2 result;
3230     result.ptr[0] = e1;
3231     result.ptr[1] = e0;
3232     return result;
3233 }
3234 unittest
3235 {
3236     __m128d A = _mm_setr_pd(61.0, 55.0);
3237     double[2] correct = [61.0, 55.0];
3238     assert(A.array == correct);
3239 }
3240 
3241 /// Return vector of type `__m128d` with all elements set to zero.
3242 __m128d _mm_setzero_pd () pure @trusted
3243 {
3244     pragma(inline, true);
3245     // Note: using loadUnaligned has better -O0 codegen compared to .ptr
3246     double[2] result = [0.0, 0.0];
3247     return loadUnaligned!(double2)(result.ptr);
3248 }
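// Both lanes should read back as exactly 0.0.
unittest
{
    __m128d A = _mm_setzero_pd();
    double[2] correct = [0.0, 0.0];
    assert(A.array == correct);
}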
3249 
3250 /// Return vector of type `__m128i` with all elements set to zero.
3251 __m128i _mm_setzero_si128() pure @trusted
3252 {
3253     pragma(inline, true);
3254     // Note: using loadUnaligned has better -O0 codegen compared to .ptr
3255     int[4] result = [0, 0, 0, 0];
3256     return cast(__m128i)( loadUnaligned!(int4)(result.ptr) );
3257 }
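// All four lanes should read back as zero.
unittest
{
    __m128i A = _mm_setzero_si128();
    int[4] correct = [0, 0, 0, 0];
    assert(A.array == correct);
}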
3258 
3259 /// Shuffle 32-bit integers in a using the control in `imm8`.
3260 /// See_also: `_MM_SHUFFLE`.
3261 __m128i _mm_shuffle_epi32(int imm8)(__m128i a) pure @safe
3262 {
3263     static if (GDC_with_SSE2)
3264     {
3265         return __builtin_ia32_pshufd(a, imm8);
3266     }
3267     else
3268     {
3269         return shufflevector!(int4, (imm8 >> 0) & 3,
3270                                     (imm8 >> 2) & 3,
3271                                     (imm8 >> 4) & 3,
3272                                     (imm8 >> 6) & 3)(a, a);
3273     }
3274 }
3275 unittest
3276 {
3277     __m128i A = _mm_setr_epi32(0, 1, 2, 3);
3278     enum int SHUFFLE = _MM_SHUFFLE(0, 1, 2, 3);
3279     int4 B = cast(int4) _mm_shuffle_epi32!SHUFFLE(A);
3280     int[4] expectedB = [ 3, 2, 1, 0 ];
3281     assert(B.array == expectedB);
3282 }
3283 
3284 /// Shuffle double-precision (64-bit) floating-point elements using the control in `imm8`.
3285 /// See_also: `_MM_SHUFFLE2`.
3286 __m128d _mm_shuffle_pd (int imm8)(__m128d a, __m128d b) pure @safe
3287 {
3288     static if (GDC_with_SSE2)
3289     {
3290         return __builtin_ia32_shufpd(a, b, imm8);
3291     }
3292     else
3293     {
3294         return shufflevector!(double2, 0 + ( imm8 & 1 ),
3295                                        2 + ( (imm8 >> 1) & 1 ))(a, b);
3296     }
3297 }
3298 unittest
3299 {
3300     __m128d A = _mm_setr_pd(0.5, 2.0);
3301     __m128d B = _mm_setr_pd(4.0, 5.0);
3302     enum int SHUFFLE = _MM_SHUFFLE2(1, 1);
3303     __m128d R = _mm_shuffle_pd!SHUFFLE(A, B);
3304     double[2] correct = [ 2.0, 5.0 ];
3305     assert(R.array == correct);
3306 }
3307 
/// Shuffle 16-bit integers in the high 64 bits of `a` using the control in `imm8`. Store the results in the high 
/// 64 bits of result, with the low 64 bits being copied from `a` to result.
/// See_also: `_MM_SHUFFLE`.
3311 __m128i _mm_shufflehi_epi16(int imm8)(__m128i a) pure @safe
3312 {
3313     static if (GDC_with_SSE2)
3314     {
3315         return cast(__m128i) __builtin_ia32_pshufhw(cast(short8)a, imm8);
3316     }
3317     else
3318     {
3319         return cast(__m128i) shufflevector!(short8, 0, 1, 2, 3,
3320                                           4 + ( (imm8 >> 0) & 3 ),
3321                                           4 + ( (imm8 >> 2) & 3 ),
3322                                           4 + ( (imm8 >> 4) & 3 ),
3323                                           4 + ( (imm8 >> 6) & 3 ))(cast(short8)a, cast(short8)a);
3324     }
3325 }
3326 unittest
3327 {
3328     __m128i A = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
3329     enum int SHUFFLE = _MM_SHUFFLE(0, 1, 2, 3);
3330     short8 C = cast(short8) _mm_shufflehi_epi16!SHUFFLE(A);
3331     short[8] expectedC = [ 0, 1, 2, 3, 7, 6, 5, 4 ];
3332     assert(C.array == expectedC);
3333 }
3334 
/// Shuffle 16-bit integers in the low 64 bits of `a` using the control in `imm8`. Store the results in the low 64 
/// bits of result, with the high 64 bits being copied from `a` to result.
3337 /// See_also: `_MM_SHUFFLE`.
3338 __m128i _mm_shufflelo_epi16(int imm8)(__m128i a) pure @safe
3339 {
3340     static if (GDC_with_SSE2)
3341     {
3342         return cast(__m128i) __builtin_ia32_pshuflw(cast(short8)a, imm8);
3343     }
3344     else
3345     {
3346         return cast(__m128i) shufflevector!(short8, ( (imm8 >> 0) & 3 ),
3347                                                     ( (imm8 >> 2) & 3 ),
3348                                                     ( (imm8 >> 4) & 3 ),
3349                                                     ( (imm8 >> 6) & 3 ), 4, 5, 6, 7)(cast(short8)a, cast(short8)a);
3350     }
3351 }
3352 unittest
3353 {
3354     __m128i A = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
3355     enum int SHUFFLE = _MM_SHUFFLE(0, 1, 2, 3);
3356     short8 B = cast(short8) _mm_shufflelo_epi16!SHUFFLE(A);
3357     short[8] expectedB = [ 3, 2, 1, 0, 4, 5, 6, 7 ];
3358     assert(B.array == expectedB);
3359 }
3360 
3361 /// Shift packed 32-bit integers in `a` left by `count` while shifting in zeros.
3362 deprecated("Use _mm_slli_epi32 instead.") __m128i _mm_sll_epi32 (__m128i a, __m128i count) pure @trusted
3363 {
3364     static if (LDC_with_SSE2)
3365     {
3366         return __builtin_ia32_pslld128(a, count);
3367     }
3368     else static if (GDC_with_SSE2)
3369     {
3370         return __builtin_ia32_pslld128(a, count);
3371     }
3372     else static if (DMD_with_32bit_asm)
3373     {
3374         asm pure nothrow @nogc @trusted
3375         {
3376             movdqu XMM0, a;
3377             movdqu XMM1, count;
3378             pslld XMM0, XMM1;
3379             movdqu a, XMM0;
3380         }
3381         return a;
3382     }
3383     else
3384     {
3385         int4 r = void;
3386         long2 lc = cast(long2)count;
3387         int bits = cast(int)(lc.array[0]);
3388         foreach(i; 0..4)
3389             r[i] = cast(uint)(a[i]) << bits;
3390         return r;
3391     }
3392 }
3393 
3394 /// Shift packed 64-bit integers in `a` left by `count` while shifting in zeros.
3395 deprecated("Use _mm_slli_epi64 instead.") __m128i _mm_sll_epi64 (__m128i a, __m128i count) pure @trusted
3396 {
3397     static if (LDC_with_SSE2)
3398     {
3399         return cast(__m128i) __builtin_ia32_psllq128(cast(long2)a, cast(long2)count);
3400     }
3401     else static if (GDC_with_SSE2)
3402     {
3403         return cast(__m128i) __builtin_ia32_psllq128(cast(long2)a, cast(long2)count);
3404     }
3405     else static if (DMD_with_32bit_asm)
3406     {
3407         asm pure nothrow @nogc @trusted
3408         {
3409             movdqu XMM0, a;
3410             movdqu XMM1, count;
3411             psllq XMM0, XMM1;
3412             movdqu a, XMM0;
3413         }
3414         return a;
3415     }
3416     else
3417     {
        // ARM: good since LDC 1.12 -O2
        // but the -O0 version is catastrophic
3420         long2 r = void;
3421         long2 sa = cast(long2)a;
3422         long2 lc = cast(long2)count;
3423         int bits = cast(int)(lc.array[0]);
3424         foreach(i; 0..2)
3425             r.array[i] = cast(ulong)(sa.array[i]) << bits;
3426         return cast(__m128i)r;
3427     }
3428 }
3429 
3430 /// Shift packed 16-bit integers in `a` left by `count` while shifting in zeros.
3431 deprecated("Use _mm_slli_epi16 instead.") __m128i _mm_sll_epi16 (__m128i a, __m128i count) pure @trusted
3432 {
    static if (LDC_with_SSE2)
    {
        return cast(__m128i) __builtin_ia32_psllw128(cast(short8)a, cast(short8)count);
    }
    else static if (GDC_with_SSE2)
    {
        return cast(__m128i) __builtin_ia32_psllw128(cast(short8)a, cast(short8)count);
    }
3441     else static if (DMD_with_32bit_asm)
3442     {
        asm pure nothrow @nogc @trusted
3444         {
3445             movdqu XMM0, a;
3446             movdqu XMM1, count;
3447             psllw XMM0, XMM1;
3448             movdqu a, XMM0;
3449         }
3450         return a;
3451     }
3452     else
3453     {
3454         short8 sa = cast(short8)a;
3455         long2 lc = cast(long2)count;
3456         int bits = cast(int)(lc.array[0]);
3457         short8 r = void;
3458         foreach(i; 0..8)
3459             r.ptr[i] = cast(short)(cast(ushort)(sa.array[i]) << bits);
3460         return cast(int4)r;
3461     }
3462 }
3463 
3464 
3465 /// Shift packed 32-bit integers in `a` left by `imm8` while shifting in zeros.
3466 __m128i _mm_slli_epi32 (__m128i a, int imm8) pure @trusted
3467 {
3468     static if (GDC_with_SSE2)
3469     {
3470         return __builtin_ia32_pslldi128(a, cast(ubyte)imm8);
3471     }
3472     else static if (LDC_with_SSE2)
3473     {
3474         return __builtin_ia32_pslldi128(a, cast(ubyte)imm8);
3475     }
3476     else
3477     {
3478         // Note: the intrinsics guarantee imm8[0..7] is taken, however
3479         //       D says "It's illegal to shift by the same or more bits 
3480         //       than the size of the quantity being shifted"
3481         //       and it's UB instead.
3482         int4 r = _mm_setzero_si128();
3483 
3484         ubyte count = cast(ubyte) imm8;
3485         if (count > 31)
3486             return r;
3487         
3488         foreach(i; 0..4)
3489             r.array[i] = cast(uint)(a.array[i]) << count;
3490         return r;
3491     }
3492 }
3493 unittest
3494 {
3495     __m128i A = _mm_setr_epi32(0, 2, 3, -4);
3496     __m128i B = _mm_slli_epi32(A, 1);
3497     __m128i B2 = _mm_slli_epi32(A, 1 + 256);
3498     int[4] expectedB = [ 0, 4, 6, -8];
3499     assert(B.array == expectedB);
3500     assert(B2.array == expectedB);
3501 
3502     __m128i C = _mm_slli_epi32(A, 0);
3503     int[4] expectedC = [ 0, 2, 3, -4];
3504     assert(C.array == expectedC);
3505 
3506     __m128i D = _mm_slli_epi32(A, 65);
3507     int[4] expectedD = [ 0, 0, 0, 0];
3508     assert(D.array == expectedD);
3509 }
3510 
3511 /// Shift packed 64-bit integers in `a` left by `imm8` while shifting in zeros.
3512 __m128i _mm_slli_epi64 (__m128i a, int imm8) pure @trusted
3513 {
3514     static if (GDC_with_SSE2)
3515     {
3516         return cast(__m128i) __builtin_ia32_psllqi128(cast(long2)a, cast(ubyte)imm8);
3517     }
3518     else static if (LDC_with_SSE2)
3519     {
3520         return cast(__m128i) __builtin_ia32_psllqi128(cast(long2)a, cast(ubyte)imm8);
3521     }
3522     else
3523     {
3524         long2 sa = cast(long2)a;
3525 
3526         // Note: the intrinsics guarantee imm8[0..7] is taken, however
3527         //       D says "It's illegal to shift by the same or more bits 
3528         //       than the size of the quantity being shifted"
3529         //       and it's UB instead.
3530         long2 r = cast(long2) _mm_setzero_si128();
3531         ubyte count = cast(ubyte) imm8;
3532         if (count > 63)
3533             return cast(__m128i)r;
3534 
3535         r.ptr[0] = cast(ulong)(sa.array[0]) << count;
3536         r.ptr[1] = cast(ulong)(sa.array[1]) << count;
3537         return cast(__m128i)r;
3538     }
3539 }
3540 unittest
3541 {
3542     __m128i A = _mm_setr_epi64(8, -4);
3543     long2 B = cast(long2) _mm_slli_epi64(A, 1);
3544     long2 B2 = cast(long2) _mm_slli_epi64(A, 1 + 1024);
3545     long[2] expectedB = [ 16, -8];
3546     assert(B.array == expectedB);
3547     assert(B2.array == expectedB);
3548 
3549     long2 C = cast(long2) _mm_slli_epi64(A, 0);
3550     long[2] expectedC = [ 8, -4];
3551     assert(C.array == expectedC);
3552 
3553     long2 D = cast(long2) _mm_slli_epi64(A, 64);
    long[2] expectedD = [ 0, 0 ];
3555     assert(D.array == expectedD);
3556 }
3557 
3558 /// Shift packed 16-bit integers in `a` left by `imm8` while shifting in zeros.
3559 __m128i _mm_slli_epi16(__m128i a, int imm8) pure @trusted
3560 {
3561     static if (GDC_with_SSE2)
3562     {
3563         return cast(__m128i) __builtin_ia32_psllwi128(cast(short8)a, cast(ubyte)imm8);
3564     }
3565     else static if (LDC_with_SSE2)
3566     {
3567         return cast(__m128i) __builtin_ia32_psllwi128(cast(short8)a, cast(ubyte)imm8);
3568     }
3569     else static if (LDC_with_ARM64)
3570     {
3571         short8 sa = cast(short8)a;
3572         short8 r = cast(short8)_mm_setzero_si128();
3573         ubyte count = cast(ubyte) imm8;
3574         if (count > 15)
3575             return cast(__m128i)r;
3576         r = sa << short8(count);
3577         return cast(__m128i)r;
3578     }
3579     else
3580     {
3581         short8 sa = cast(short8)a;
3582         short8 r = cast(short8)_mm_setzero_si128();
3583         ubyte count = cast(ubyte) imm8;
3584         if (count > 15)
3585             return cast(__m128i)r;
3586         foreach(i; 0..8)
3587             r.ptr[i] = cast(short)(sa.array[i] << count);
3588         return cast(__m128i)r;
3589     }
3590 }
3591 unittest
3592 {
3593     __m128i A = _mm_setr_epi16(0, 1, 2, 3, -4, -5, 6, 7);
3594     short8 B = cast(short8)( _mm_slli_epi16(A, 1) );
3595     short8 B2 = cast(short8)( _mm_slli_epi16(A, 1 + 256) );
3596     short[8] expectedB = [ 0, 2, 4, 6, -8, -10, 12, 14 ];
3597     assert(B.array == expectedB);
3598     assert(B2.array == expectedB);
3599 
3600     short8 C = cast(short8)( _mm_slli_epi16(A, 16) );
3601     short[8] expectedC = [ 0, 0, 0, 0, 0, 0, 0, 0 ];
3602     assert(C.array == expectedC);
3603 }
3604 
3605 
3606 /// Shift `a` left by `bytes` bytes while shifting in zeros.
3607 __m128i _mm_slli_si128(ubyte bytes)(__m128i op) pure @trusted
3608 {
3609     static if (bytes & 0xF0)
3610     {
3611         return _mm_setzero_si128();
3612     }
3613     else
3614     {
3615         static if (GDC_with_SSE2)
3616         {
3617             return cast(__m128i) __builtin_ia32_pslldqi128(cast(long2)op, cast(ubyte)(bytes * 8)); 
3618         }
3619         else version(DigitalMars)
3620         {
3621             version(D_InlineAsm_X86)
3622             {
3623                 asm pure nothrow @nogc @trusted // somehow doesn't work for x86_64
3624                 {
3625                     movdqu XMM0, op;
3626                     pslldq XMM0, bytes;
3627                     movdqu op, XMM0;
3628                 }
3629                 return op;
3630             }
3631             else
3632             {
3633                 byte16 A = cast(byte16)op;
3634                 byte16 R;
3635                 for (int n = 15; n >= bytes; --n)
3636                     R.ptr[n] = A.array[n-bytes];
3637                 for (int n = bytes-1; n >= 0; --n)
3638                     R.ptr[n] = 0;
3639                 return cast(__m128i)R;
3640             }
3641         }
3642         else
3643         {
3644             return cast(__m128i) shufflevector!(byte16,
3645             16 - bytes, 17 - bytes, 18 - bytes, 19 - bytes, 20 - bytes, 21 - bytes,
3646             22 - bytes, 23 - bytes, 24 - bytes, 25 - bytes, 26 - bytes, 27 - bytes,
3647             28 - bytes, 29 - bytes, 30 - bytes, 31 - bytes)
3648             (cast(byte16)_mm_setzero_si128(), cast(byte16)op);
3649         }
3650     }
3651 }
3652 unittest
3653 {
3654     __m128i A = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
3655     short8 R = cast(short8) _mm_slli_si128!8(A); // shift 8 bytes to the left
3656     short[8] correct = [ 0, 0, 0, 0, 0, 1, 2, 3 ];
3657     assert(R.array == correct);
3658 
    __m128i B = _mm_slli_si128!16(_mm_set1_epi32(-1));
3660     int[4] expectedB = [0, 0, 0, 0];
3661     assert(B.array == expectedB);
3662 }
3663 
3664 /// Compute the square root of packed double-precision (64-bit) floating-point elements in `vec`.
3665 __m128d _mm_sqrt_pd(__m128d vec) pure @trusted
3666 {
3667     version(LDC)
3668     {
3669         // Disappeared with LDC 1.11
3670         static if (__VERSION__ < 2081)
3671             return __builtin_ia32_sqrtpd(vec);
3672         else
3673         {
3674             vec.array[0] = llvm_sqrt(vec.array[0]);
3675             vec.array[1] = llvm_sqrt(vec.array[1]);
3676             return vec;
3677         }
3678     }
3679     else static if (GDC_with_SSE2)    
3680     {
3681         return __builtin_ia32_sqrtpd(vec);
3682     }
3683     else
3684     {
3685         vec.ptr[0] = sqrt(vec.array[0]);
3686         vec.ptr[1] = sqrt(vec.array[1]);
3687         return vec;
3688     }
3689 }
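// Exact squares, so the comparison below is exact.
unittest
{
    __m128d A = _mm_setr_pd(4.0, 16.0);
    __m128d R = _mm_sqrt_pd(A);
    double[2] correct = [2.0, 4.0];
    assert(R.array == correct);
}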
3690 
3691 /// Compute the square root of the lower double-precision (64-bit) floating-point element in `b`, store the result in 
3692 /// the lower element of result, and copy the upper element from `a` to the upper element of result.
3693 __m128d _mm_sqrt_sd(__m128d a, __m128d b) pure @trusted
3694 {
3695     // Note: the builtin has one argument, since the legacy `sqrtsd` SSE2 instruction operates on the same register only.
3696     //       "128-bit Legacy SSE version: The first source operand and the destination operand are the same. 
3697     //        The quadword at bits 127:64 of the destination operand remains unchanged."
3698     version(LDC)
3699     {
3700         // Disappeared with LDC 1.11
3701         static if (__VERSION__ < 2081)
3702         {
3703             __m128d c = __builtin_ia32_sqrtsd(b);
3704             a[0] = c[0];
3705             return a;
3706         }
3707         else
3708         {
3709             a.array[0] = llvm_sqrt(b.array[0]);
3710             return a;
3711         }
3712     }
3713     else static if (GDC_with_SSE2)
3714     {
3715         __m128d c = __builtin_ia32_sqrtsd(b);
3716         a.ptr[0] = c.array[0];
3717         return a;
3718     }
3719     else
3720     {
3721         a.ptr[0] = sqrt(b.array[0]);
3722         return a;
3723     }
3724 }
3725 unittest
3726 {
3727     __m128d A = _mm_setr_pd(1.0, 3.0);
3728     __m128d B = _mm_setr_pd(4.0, 5.0);
3729     __m128d R = _mm_sqrt_sd(A, B);
3730     double[2] correct = [2.0, 3.0 ];
3731     assert(R.array == correct);
3732 }
3733 
3734 /// Shift packed 16-bit integers in `a` right by `count` while shifting in sign bits.
3735 deprecated("Use _mm_srai_epi16 instead.") __m128i _mm_sra_epi16 (__m128i a, __m128i count) pure @trusted
3736 {
3737     static if (GDC_with_SSE2)
3738     {
3739         return cast(__m128i) __builtin_ia32_psraw128(cast(short8)a, cast(short8)count);
3740     }
3741     else static if (LDC_with_SSE2)
3742     {
3743         return cast(__m128i) __builtin_ia32_psraw128(cast(short8)a, cast(short8)count);
3744     }
3745     else
3746     {
3747         short8 sa = cast(short8)a;
3748         long2 lc = cast(long2)count;
3749         int bits = cast(int)(lc.array[0]);
3750         short8 r = void;
3751         foreach(i; 0..8)
3752             r.ptr[i] = cast(short)(sa.array[i] >> bits);
3753         return cast(int4)r;
3754     }
3755 }
3756 
3757 /// Shift packed 32-bit integers in `a` right by `count` while shifting in sign bits.
3758 deprecated("Use _mm_srai_epi32 instead.") __m128i _mm_sra_epi32 (__m128i a, __m128i count) pure @trusted
3759 {
3760     static if (LDC_with_SSE2)
3761     {
3762         return __builtin_ia32_psrad128(a, count);
3763     }
3764     else static if (GDC_with_SSE2)
3765     {
3766         return __builtin_ia32_psrad128(a, count);
3767     }
3768     else
3769     {    
3770         int4 r = void;
3771         long2 lc = cast(long2)count;
3772         int bits = cast(int)(lc.array[0]);
3773         r.ptr[0] = (a.array[0] >> bits);
3774         r.ptr[1] = (a.array[1] >> bits);
3775         r.ptr[2] = (a.array[2] >> bits);
3776         r.ptr[3] = (a.array[3] >> bits);
3777         return r;
3778     }
3779 }
3780 
3781 
3782 /// Shift packed 16-bit integers in `a` right by `imm8` while shifting in sign bits.
3783 __m128i _mm_srai_epi16 (__m128i a, int imm8) pure @trusted
3784 {
3785     static if (GDC_with_SSE2)
3786     {
3787         return cast(__m128i) __builtin_ia32_psrawi128(cast(short8)a, cast(ubyte)imm8);
3788     }
3789     else static if (LDC_with_SSE2)
3790     {
3791         return cast(__m128i) __builtin_ia32_psrawi128(cast(short8)a, cast(ubyte)imm8);
3792     }
3793     else static if (LDC_with_ARM64)
3794     {
3795         short8 sa = cast(short8)a;
3796         ubyte count = cast(ubyte)imm8;
3797         if (count > 15) 
3798             count = 15;
3799         short8 r = sa >> short8(count);
3800         return cast(__m128i)r;
3801     }
3802     else
3803     {
3804         short8 sa = cast(short8)a;
3805         short8 r = void;
3806 
3807         // Note: the intrinsics guarantee imm8[0..7] is taken, however
3808         //       D says "It's illegal to shift by the same or more bits 
3809         //       than the size of the quantity being shifted"
3810         //       and it's UB instead.
3811         ubyte count = cast(ubyte)imm8;
3812         if (count > 15) 
3813             count = 15;
3814         foreach(i; 0..8)
3815             r.ptr[i] = cast(short)(sa.array[i] >> count);
3816         return cast(int4)r;
3817     }
3818 }
3819 unittest
3820 {
3821     __m128i A = _mm_setr_epi16(0, 1, 2, 3, -4, -5, 6, 7);
3822     short8 B = cast(short8)( _mm_srai_epi16(A, 1) );
3823     short8 B2 = cast(short8)( _mm_srai_epi16(A, 1 + 256) );
3824     short[8] expectedB = [ 0, 0, 1, 1, -2, -3, 3, 3 ];
3825     assert(B.array == expectedB);
3826     assert(B2.array == expectedB);
3827 
3828     short8 C = cast(short8)( _mm_srai_epi16(A, 18) );
3829     short[8] expectedC = [ 0, 0, 0, 0, -1, -1, 0, 0 ];
3830     assert(C.array == expectedC);
3831 }
3832 
3833 /// Shift packed 32-bit integers in `a` right by `imm8` while shifting in sign bits.
3834 __m128i _mm_srai_epi32 (__m128i a, int imm8) pure @trusted
3835 {
3836     static if (LDC_with_SSE2)
3837     {
3838         return __builtin_ia32_psradi128(a, cast(ubyte)imm8);
3839     }
3840     else static if (GDC_with_SSE2)
3841     {
3842         return __builtin_ia32_psradi128(a, cast(ubyte)imm8);
3843     }
3844     else
3845     {
3846         int4 r = void;
3847 
3848         // Note: the intrinsics guarantee imm8[0..7] is taken, however
3849         //       D says "It's illegal to shift by the same or more bits 
3850         //       than the size of the quantity being shifted"
3851         //       and it's UB instead.
3852         ubyte count = cast(ubyte) imm8;
3853         if (count > 31)
3854             count = 31;
3855 
3856         r.ptr[0] = (a.array[0] >> count);
3857         r.ptr[1] = (a.array[1] >> count);
3858         r.ptr[2] = (a.array[2] >> count);
3859         r.ptr[3] = (a.array[3] >> count);
3860         return r;
3861     }
3862 }
3863 unittest
3864 {
3865     __m128i A = _mm_setr_epi32(0, 2, 3, -4);
3866     __m128i B = _mm_srai_epi32(A, 1);
3867     __m128i B2 = _mm_srai_epi32(A, 1 + 256);
3868     int[4] expectedB = [ 0, 1, 1, -2];
3869     assert(B.array == expectedB);
3870     assert(B2.array == expectedB);
3871 
3872     __m128i C = _mm_srai_epi32(A, 32);
3873     int[4] expectedC = [ 0, 0, 0, -1];
3874     assert(C.array == expectedC);
3875 
3876     __m128i D = _mm_srai_epi32(A, 0);
3877     int[4] expectedD = [ 0, 2, 3, -4];
3878     assert(D.array == expectedD);
3879 }
3880 
/// Shift packed 16-bit integers in `a` right by `count` while shifting in zeros.
deprecated("Use _mm_srli_epi16 instead.") __m128i _mm_srl_epi16 (__m128i a, __m128i count) pure @trusted
3882 {
3883     static if (LDC_with_SSE2)
3884     {
3885         return cast(__m128i) __builtin_ia32_psrlw128(cast(short8)a, cast(short8)count);
3886     }
3887     else static if (GDC_with_SSE2)
3888     {
3889         return cast(__m128i) __builtin_ia32_psrlw128(cast(short8)a, cast(short8)count);
3890     }
3891     else
3892     {
3893         short8 sa = cast(short8)a;
3894         long2 lc = cast(long2)count;
3895         int bits = cast(int)(lc.array[0]);
3896         short8 r = void;
3897         foreach(i; 0..8)
3898             r.ptr[i] = cast(short)(cast(ushort)(sa.array[i]) >> bits);
3899         return cast(int4)r;
3900     }
3901 }
3902 
/// Shift packed 32-bit integers in `a` right by `count` while shifting in zeros.
deprecated("Use _mm_srli_epi32 instead.") __m128i _mm_srl_epi32 (__m128i a, __m128i count) pure @trusted
3904 {
3905     static if (LDC_with_SSE2)
3906     {
3907         return __builtin_ia32_psrld128(a, count);
3908     }
3909     else static if (GDC_with_SSE2)
3910     {
3911         return __builtin_ia32_psrld128(a, count);
3912     }
3913     else
3914     {
3915         int4 r = void;
3916         long2 lc = cast(long2)count;
3917         int bits = cast(int)(lc.array[0]);
3918         r.ptr[0] = cast(uint)(a.array[0]) >> bits;
3919         r.ptr[1] = cast(uint)(a.array[1]) >> bits;
3920         r.ptr[2] = cast(uint)(a.array[2]) >> bits;
3921         r.ptr[3] = cast(uint)(a.array[3]) >> bits;
3922         return r;
3923     }
3924 }
3925 
/// Shift packed 64-bit integers in `a` right by `count` while shifting in zeros.
deprecated("Use _mm_srli_epi64 instead.") __m128i _mm_srl_epi64 (__m128i a, __m128i count) pure @trusted
3927 {
3928     static if (LDC_with_SSE2)
3929     {
3930         return cast(__m128i) __builtin_ia32_psrlq128(cast(long2)a, cast(long2)count);
3931     }
3932     else static if (GDC_with_SSE2)
3933     {
3934         return cast(__m128i) __builtin_ia32_psrlq128(cast(long2)a, cast(long2)count);
3935     }
3936     else
3937     {
3938         long2 r = void;
3939         long2 sa = cast(long2)a;
3940         long2 lc = cast(long2)count;
3941         int bits = cast(int)(lc.array[0]);
3942         r.ptr[0] = cast(ulong)(sa.array[0]) >> bits;
3943         r.ptr[1] = cast(ulong)(sa.array[1]) >> bits;
3944         return cast(__m128i)r;
3945     }
3946 }
3947 
3948 /// Shift packed 16-bit integers in `a` right by `imm8` while shifting in zeros.
3949 __m128i _mm_srli_epi16 (__m128i a, int imm8) pure @trusted
3950 {
3951     static if (GDC_with_SSE2)
3952     {
3953         return cast(__m128i) __builtin_ia32_psrlwi128(cast(short8)a, cast(ubyte)imm8);
3954     }
3955     else static if (LDC_with_SSE2)
3956     {
3957         return cast(__m128i) __builtin_ia32_psrlwi128(cast(short8)a, cast(ubyte)imm8);
3958     }
3959     else static if (LDC_with_ARM64)
3960     {
3961         short8 sa = cast(short8)a;
3962         short8 r = cast(short8) _mm_setzero_si128();
3963 
3964         ubyte count = cast(ubyte)imm8;
3965         if (count >= 16)
3966             return cast(__m128i)r;
3967 
        r = sa >>> short8(count); // this unsigned vector shift is available with LDC, but not with DMD.
3969         return cast(__m128i)r;
3970     }
3971     else
3972     {
3973         short8 sa = cast(short8)a;
3974         ubyte count = cast(ubyte)imm8;
3975 
3976         short8 r = cast(short8) _mm_setzero_si128();
3977         if (count >= 16)
3978             return cast(__m128i)r;
3979 
3980         foreach(i; 0..8)
3981             r.array[i] = cast(short)(cast(ushort)(sa.array[i]) >> count);
3982         return cast(__m128i)r;
3983     }
3984 }
3985 unittest
3986 {
3987     __m128i A = _mm_setr_epi16(0, 1, 2, 3, -4, -5, 6, 7);
3988     short8 B = cast(short8)( _mm_srli_epi16(A, 1) );
3989     short8 B2 = cast(short8)( _mm_srli_epi16(A, 1 + 256) );
3990     short[8] expectedB = [ 0, 0, 1, 1, 0x7FFE, 0x7FFD, 3, 3 ];
3991     assert(B.array == expectedB);
3992     assert(B2.array == expectedB);
3993 
3994     short8 C = cast(short8)( _mm_srli_epi16(A, 16) );
3995     short[8] expectedC = [ 0, 0, 0, 0, 0, 0, 0, 0];
3996     assert(C.array == expectedC);
3997 
3998     short8 D = cast(short8)( _mm_srli_epi16(A, 0) );
3999     short[8] expectedD = [ 0, 1, 2, 3, -4, -5, 6, 7 ];
4000     assert(D.array == expectedD);
4001 }
4002 
4003 
4004 /// Shift packed 32-bit integers in `a` right by `imm8` while shifting in zeros.
4005 __m128i _mm_srli_epi32 (__m128i a, int imm8) pure @trusted
4006 {
4007     static if (GDC_with_SSE2)
4008     {
4009         return __builtin_ia32_psrldi128(a, cast(ubyte)imm8);
4010     }
4011     else static if (LDC_with_SSE2)
4012     {
4013         return __builtin_ia32_psrldi128(a, cast(ubyte)imm8);
4014     }
4015     else
4016     {
4017         ubyte count = cast(ubyte) imm8;
4018 
4019         // Note: the intrinsics guarantee imm8[0..7] is taken, however
4020         //       D says "It's illegal to shift by the same or more bits 
4021         //       than the size of the quantity being shifted"
4022         //       and it's UB instead.
4023         int4 r = _mm_setzero_si128();
4024         if (count >= 32)
4025             return r;
4026         r.ptr[0] = a.array[0] >>> count;
4027         r.ptr[1] = a.array[1] >>> count;
4028         r.ptr[2] = a.array[2] >>> count;
4029         r.ptr[3] = a.array[3] >>> count;
4030         return r;
4031     }
4032 }
4033 unittest
4034 {
4035     __m128i A = _mm_setr_epi32(0, 2, 3, -4);
4036     __m128i B = _mm_srli_epi32(A, 1);
4037     __m128i B2 = _mm_srli_epi32(A, 1 + 256);
4038     int[4] expectedB = [ 0, 1, 1, 0x7FFFFFFE];
4039     assert(B.array == expectedB);
4040     assert(B2.array == expectedB);
4041  
4042     __m128i C = _mm_srli_epi32(A, 255);
4043     int[4] expectedC = [ 0, 0, 0, 0 ];
4044     assert(C.array == expectedC);
4045 }
4046 
4047 /// Shift packed 64-bit integers in `a` right by `imm8` while shifting in zeros.
4048 __m128i _mm_srli_epi64 (__m128i a, int imm8) pure @trusted
4049 {
4050     static if (GDC_with_SSE2)
4051     {
4052         return cast(__m128i) __builtin_ia32_psrlqi128(cast(long2)a, cast(ubyte)imm8);
4053     }
4054     else static if (LDC_with_SSE2)
4055     {
4056         return cast(__m128i) __builtin_ia32_psrlqi128(cast(long2)a, cast(ubyte)imm8);
4057     }
4058     else
4059     {
4060         long2 r = cast(long2) _mm_setzero_si128();
4061         long2 sa = cast(long2)a;
4062 
4063         ubyte count = cast(ubyte) imm8;
4064         if (count >= 64)
4065             return cast(__m128i)r;
4066 
4067         r.ptr[0] = sa.array[0] >>> count;
4068         r.ptr[1] = sa.array[1] >>> count;
4069         return cast(__m128i)r;
4070     }
4071 }
4072 unittest
4073 {
4074     __m128i A = _mm_setr_epi64(8, -4);
4075     long2 B = cast(long2) _mm_srli_epi64(A, 1);
4076     long2 B2 = cast(long2) _mm_srli_epi64(A, 1 + 512);
4077     long[2] expectedB = [ 4, 0x7FFFFFFFFFFFFFFE];
4078     assert(B.array == expectedB);
4079     assert(B2.array == expectedB);
4080 
4081     long2 C = cast(long2) _mm_srli_epi64(A, 64);
4082     long[2] expectedC = [ 0, 0 ];
4083     assert(C.array == expectedC);
4084 }
4085 
4086 /// Shift `v` right by `bytes` bytes while shifting in zeros.
4087 __m128i _mm_srli_si128(ubyte bytes)(__m128i v) pure @safe
4088 {
4089     static if (bytes & 0xF0)
4090     {
4091         return _mm_setzero_si128();
4092     }
4093     else static if (GDC_with_SSE2)
4094     {
4095         return cast(__m128i) __builtin_ia32_psrldqi128(cast(long2)v, cast(ubyte)(bytes * 8));
4096     }
4097     else static if (DMD_with_32bit_asm)
4098     {
4099         asm pure nothrow @nogc @trusted
4100         {
4101             movdqu XMM0, v;
4102             psrldq XMM0, bytes;
4103             movdqu v, XMM0;
4104         }
4105         return v;
4106     }
4107     else
4108     {
4109         return cast(__m128i) shufflevector!(byte16,
4110                                             bytes+0, bytes+1, bytes+2, bytes+3, bytes+4, bytes+5, bytes+6, bytes+7,
4111                                             bytes+8, bytes+9, bytes+10, bytes+11, bytes+12, bytes+13, bytes+14, bytes+15)
4112                                            (cast(byte16) v, cast(byte16)_mm_setzero_si128());
4113     }
4114 }
4115 unittest
4116 {
4117     __m128i R = _mm_srli_si128!4(_mm_set_epi32(4, 3, 2, 1));
4118     int[4] correct = [2, 3, 4, 0];
4119     assert(R.array == correct);
4120 
4121     __m128i A = _mm_srli_si128!16(_mm_set1_epi32(-1));
4122     int[4] expectedA = [0, 0, 0, 0];
4123     assert(A.array == expectedA);
4124 }
4125 
4126 /// Shift `v` right by `bytes` bytes while shifting in zeros.
4127 /// #BONUS
4128 __m128 _mm_srli_ps(ubyte bytes)(__m128 v) pure @safe
4129 {
4130     return cast(__m128)_mm_srli_si128!bytes(cast(__m128i)v);
4131 }
4132 unittest
4133 {
4134     __m128 R = _mm_srli_ps!8(_mm_set_ps(4.0f, 3.0f, 2.0f, 1.0f));
4135     float[4] correct = [3.0f, 4.0f, 0, 0];
4136     assert(R.array == correct);
4137 }
4138 
4139 /// Shift `v` right by `bytes` bytes while shifting in zeros.
4140 /// #BONUS
4141 __m128d _mm_srli_pd(ubyte bytes)(__m128d v) pure @safe
4142 {
4143     return cast(__m128d) _mm_srli_si128!bytes(cast(__m128i)v);
4144 }
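// Shifting right by 8 bytes moves the upper double into the lower lane.
unittest
{
    __m128d R = _mm_srli_pd!8(_mm_setr_pd(2.0, 3.0));
    double[2] correct = [3.0, 0.0];
    assert(R.array == correct);
}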
4145 
4146 /// Store 128-bits (composed of 2 packed double-precision (64-bit) floating-point elements) from `a` into memory. 
4147 /// `mem_addr` must be aligned on a 16-byte boundary or a general-protection exception may be generated.
4148 void _mm_store_pd (double* mem_addr, __m128d a) pure @trusted
4149 {
4150     pragma(inline, true);
4151     __m128d* aligned = cast(__m128d*)mem_addr;
4152     *aligned = a;
4153 }
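// align(16) keeps the destination on a 16-byte boundary, as this intrinsic requires.
unittest
{
    align(16) double[2] A;
    _mm_store_pd(A.ptr, _mm_setr_pd(4.0, -8.0));
    assert(A == [4.0, -8.0]);
}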
4154 
4155 /// Store the lower double-precision (64-bit) floating-point element from `a` into 2 contiguous elements in memory. 
4156 /// `mem_addr` must be aligned on a 16-byte boundary or a general-protection exception may be generated.
4157 void _mm_store_pd1 (double* mem_addr, __m128d a) pure @trusted
4158 {
4159     __m128d* aligned = cast(__m128d*)mem_addr;
4160     __m128d r;
4161     r.ptr[0] = a.array[0];
4162     r.ptr[1] = a.array[0];
4163     *aligned = r;
4164 }
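// Only the lower element of the source is stored, twice.
unittest
{
    align(16) double[2] A;
    _mm_store_pd1(A.ptr, _mm_setr_pd(4.0, -8.0));
    assert(A == [4.0, 4.0]);
}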
4165 
4166 /// Store the lower double-precision (64-bit) floating-point element from `a` into memory. `mem_addr` does not need to 
4167 /// be aligned on any particular boundary.
4168 void _mm_store_sd (double* mem_addr, __m128d a) pure @safe
4169 {
4170     pragma(inline, true);
4171     *mem_addr = a.array[0];
4172 }
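// Stores only the lower element; no alignment requirement.
unittest
{
    double A;
    _mm_store_sd(&A, _mm_setr_pd(3.0, 55.0));
    assert(A == 3.0);
}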
4173 
4174 /// Store 128-bits of integer data from `a` into memory. `mem_addr` must be aligned on a 16-byte boundary or a 
4175 /// general-protection exception may be generated.
4176 void _mm_store_si128 (__m128i* mem_addr, __m128i a) pure @safe
4177 {
4178     pragma(inline, true);
4179     *mem_addr = a;
4180 }
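// A __m128i destination is naturally 16-byte aligned.
unittest
{
    __m128i A;
    _mm_store_si128(&A, _mm_setr_epi32(1, 2, 3, 4));
    int[4] correct = [1, 2, 3, 4];
    assert(A.array == correct);
}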
4181 
4182 alias _mm_store1_pd = _mm_store_pd1; ///
4183 
4184 /// Store the upper double-precision (64-bit) floating-point element from `a` into memory.
4185 void _mm_storeh_pd (double* mem_addr, __m128d a) pure @safe
4186 {
4187     pragma(inline, true);
4188     *mem_addr = a.array[1];
4189 }
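// The upper element (index 1) is the one stored.
unittest
{
    double A;
    _mm_storeh_pd(&A, _mm_setr_pd(3.0, 55.0));
    assert(A == 55.0);
}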
4190 
/// Store 64-bit integer from the first element of `a` into memory.
// Note: `mem_addr` doesn't have to actually be aligned, which breaks
// expectations from the user point of view. This problem also exists in C++.
void _mm_storel_epi64 (__m128i* mem_addr, __m128i a) pure @safe
4194 {
4195     pragma(inline, true);
4196     long* dest = cast(long*)mem_addr;
4197     long2 la = cast(long2)a;
4198     *dest = la.array[0];
4199 }
4200 unittest
4201 {
4202     long[3] A = [1, 2, 3];
4203     _mm_storel_epi64(cast(__m128i*)(&A[1]), _mm_set_epi64x(0x1_0000_0000, 0x1_0000_0000));
4204     long[3] correct = [1, 0x1_0000_0000, 3];
4205     assert(A == correct);
4206 }
4207 
4208 /// Store the lower double-precision (64-bit) floating-point element from `a` into memory.
4209 void _mm_storel_pd (double* mem_addr, __m128d a) pure @safe
4210 {
4211     pragma(inline, true);
4212     *mem_addr = a.array[0];
4213 }
4214 
4215 /// Store 2 double-precision (64-bit) floating-point elements from `a` into memory in reverse order. `mem_addr` must be 
4216 /// aligned on a 16-byte boundary or a general-protection exception may be generated.
void _mm_storer_pd (double* mem_addr, __m128d a) pure @trusted
4218 {
4219     __m128d* aligned = cast(__m128d*)mem_addr;
4220     *aligned = shufflevector!(double2, 1, 0)(a, a);
4221 }
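// Elements are stored swapped: mem_addr[0] receives a[1].
unittest
{
    align(16) double[2] A;
    _mm_storer_pd(A.ptr, _mm_setr_pd(2.0, 3.0));
    assert(A == [3.0, 2.0]);
}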
4222 
4223 /// Store 128-bits (composed of 2 packed double-precision (64-bit) floating-point elements) from `a` into memory. 
4224 /// `mem_addr` does not need to be aligned on any particular boundary.
4225 void _mm_storeu_pd (double* mem_addr, __m128d a) pure @safe
4226 {
4227     pragma(inline, true);
4228     storeUnaligned!double2(a, mem_addr);
4229 }
4230 
4231 /// Store 128-bits of integer data from `a` into memory. `mem_addr` does not need to be aligned on any particular 
4232 /// boundary.
4233 void _mm_storeu_si128 (__m128i* mem_addr, __m128i a) pure @safe
4234 {
4235     pragma(inline, true);
4236     storeUnaligned!__m128i(a, cast(int*)mem_addr);
4237 }
4238 
4239 /// Store 32-bit integer from the first element of `a` into memory. 
4240 /// `mem_addr` does not need to be aligned on any particular boundary.
4241 void _mm_storeu_si32 (void* mem_addr, __m128i a) pure @trusted
4242 {
4243     pragma(inline, true);
4244     int* dest = cast(int*)mem_addr;
4245     *dest = a.array[0];
4246 }
4247 unittest
4248 {
4249     int[2] arr = [-24, 12];
4250     _mm_storeu_si32(&arr[1], _mm_setr_epi32(-1, -2, -6, -7));
4251     assert(arr == [-24, -1]);
4252 }
4253 
4254 /// Store 128-bits (composed of 2 packed double-precision (64-bit) floating-point elements)
4255 /// from a into memory using a non-temporal memory hint. mem_addr must be aligned on a 16-byte
4256 /// boundary or a general-protection exception may be generated.
4257 void _mm_stream_pd (double* mem_addr, __m128d a)
4258 {
4259     // BUG see `_mm_stream_ps` for an explanation why we don't implement non-temporal moves
4260     __m128d* dest = cast(__m128d*)mem_addr;
4261     *dest = a;
4262 }
4263 
4264 /// Store 128-bits of integer data from a into memory using a non-temporal memory hint.
4265 /// mem_addr must be aligned on a 16-byte boundary or a general-protection exception
4266 /// may be generated.
4267 void _mm_stream_si128 (__m128i* mem_addr, __m128i a)
4268 {
4269     // BUG see `_mm_stream_ps` for an explanation why we don't implement non-temporal moves
4270     __m128i* dest = cast(__m128i*)mem_addr;
4271     *dest = a;
4272 }
4273 
4274 /// Store 32-bit integer a into memory using a non-temporal hint to minimize cache
4275 /// pollution. If the cache line containing address mem_addr is already in the cache,
4276 /// the cache will be updated.
4277 void _mm_stream_si32 (int* mem_addr, int a)
4278 {
4279     // BUG see `_mm_stream_ps` for an explanation why we don't implement non-temporal moves
4280     *mem_addr = a;
4281 }
4282 
4283 /// Store 64-bit integer a into memory using a non-temporal hint to minimize
4284 /// cache pollution. If the cache line containing address mem_addr is already
4285 /// in the cache, the cache will be updated.
4286 void _mm_stream_si64 (long* mem_addr, long a)
4287 {
4288     // BUG See `_mm_stream_ps` for an explanation why we don't implement non-temporal moves
4289     *mem_addr = a;
4290 }
4291 
4292 /// Subtract packed 16-bit integers in `b` from packed 16-bit integers in `a`.
4293 __m128i _mm_sub_epi16(__m128i a, __m128i b) pure @safe
4294 {
4295     pragma(inline, true);
4296     return cast(__m128i)(cast(short8)a - cast(short8)b);
4297 }
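// Element-wise subtraction with 16-bit wraparound: 32767 - (-1) wraps to -32768.
unittest
{
    __m128i A = _mm_setr_epi16(16, 32767, 1, 2, 3, 4, 6, 6);
    __m128i B = _mm_setr_epi16(15,    -1, 1, 2, 3, 4, 5, 6);
    short8 R = cast(short8) _mm_sub_epi16(A, B);
    short[8] correct = [1, -32768, 0, 0, 0, 0, 1, 0];
    assert(R.array == correct);
}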
4298 
4299 /// Subtract packed 32-bit integers in `b` from packed 32-bit integers in `a`.
4300 __m128i _mm_sub_epi32(__m128i a, __m128i b) pure @safe
4301 {
4302     pragma(inline, true);
4303     return cast(__m128i)(cast(int4)a - cast(int4)b);
4304 }
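// Plain element-wise 32-bit subtraction.
unittest
{
    __m128i A = _mm_setr_epi32(4,  8, -1, 0);
    __m128i B = _mm_setr_epi32(1, -2,  3, 0);
    int4 R = cast(int4) _mm_sub_epi32(A, B);
    int[4] correct = [3, 10, -4, 0];
    assert(R.array == correct);
}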
4305 
4306 /// Subtract packed 64-bit integers in `b` from packed 64-bit integers in `a`.
4307 __m128i _mm_sub_epi64(__m128i a, __m128i b) pure @safe
4308 {
4309     pragma(inline, true);
4310     return cast(__m128i)(cast(long2)a - cast(long2)b);
4311 }
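// Plain element-wise 64-bit subtraction.
unittest
{
    __m128i A = _mm_setr_epi64(0x22222222_22222222, 4);
    __m128i B = _mm_setr_epi64(0x11111111_11111111, -1);
    long2 R = cast(long2) _mm_sub_epi64(A, B);
    long[2] correct = [0x11111111_11111111, 5];
    assert(R.array == correct);
}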
4312 
4313 /// Subtract packed 8-bit integers in `b` from packed 8-bit integers in `a`.
4314 __m128i _mm_sub_epi8(__m128i a, __m128i b) pure @safe
4315 {
4316     pragma(inline, true);
4317     return cast(__m128i)(cast(byte16)a - cast(byte16)b);
4318 }
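// Plain element-wise 8-bit subtraction, no saturation.
unittest
{
    __m128i A = _mm_setr_epi8(4, 8, 13, -7, -1, 0, 9, 77, 4,  8, 13, -7, -1,  0,  9, 78);
    __m128i B = _mm_setr_epi8(1, 2,  3,  4,  5, 6, 7,  8, 9, 10, 11, 12, 13, 14, 15, 16);
    byte16 R = cast(byte16) _mm_sub_epi8(A, B);
    byte[16] correct = [3, 6, 10, -11, -6, -6, 2, 69, -5, -2, 2, -19, -14, -14, -6, 62];
    assert(R.array == correct);
}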
4319 
4320 /// Subtract packed double-precision (64-bit) floating-point elements in `b` from packed double-precision (64-bit) 
4321 /// floating-point elements in `a`.
4322 __m128d _mm_sub_pd(__m128d a, __m128d b) pure @safe
4323 {
4324     pragma(inline, true);
4325     return a - b;
4326 }
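// Element-wise double-precision subtraction.
unittest
{
    __m128d A = _mm_setr_pd(4000.0, -8.0);
    __m128d B = _mm_setr_pd(  12.0, -8.0);
    __m128d R = _mm_sub_pd(A, B);
    double[2] correct = [3988.0, 0.0];
    assert(R.array == correct);
}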
4327 
4328 /// Subtract the lower double-precision (64-bit) floating-point element in `b` from the lower double-precision (64-bit) 
4329 /// floating-point element in `a`, store that in the lower element of result, and copy the upper element from `a` to the
4330 /// upper element of result.
4331 __m128d _mm_sub_sd(__m128d a, __m128d b) pure @trusted
4332 {
4333     version(DigitalMars)
4334     {
4335         // Work-around for https://issues.dlang.org/show_bug.cgi?id=19599
4336         // Note that this is unneeded since DMD >= 2.094.0 at least, haven't investigated again
4337         asm pure nothrow @nogc @trusted { nop;}
4338         a[0] = a[0] - b[0];
4339         return a;
4340     }
4341     else static if (GDC_with_SSE2)
4342     {
4343         return __builtin_ia32_subsd(a, b);
4344     }
4345     else
4346     {
4347         a.ptr[0] -= b.array[0];
4348         return a;
4349     }
4350 }
4351 unittest
4352 {
4353     __m128d a = [1.5, -2.0];
4354     a = _mm_sub_sd(a, a);
4355     assert(a.array == [0.0, -2.0]);
4356 }
4357 
4358 /// Subtract 64-bit integer `b` from 64-bit integer `a`.
4359 __m64 _mm_sub_si64 (__m64 a, __m64 b) pure @safe
4360 {
4361     pragma(inline, true);
4362     return a - b;
4363 }
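// 64-bit scalar subtraction on MMX values.
unittest
{
    __m64 A, B;
    A.ptr[0] = 0x22222222_22222222;
    B.ptr[0] = 0x11111111_11111111;
    __m64 C = _mm_sub_si64(A, B);
    assert(C.array[0] == 0x11111111_11111111);
}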
4364 
/// Subtract packed 16-bit signed integers in `b` from packed 16-bit signed integers in `a` using signed saturation.
4366 __m128i _mm_subs_epi16(__m128i a, __m128i b) pure @trusted
4367 {
4368     version(LDC)
4369     {
4370         static if (__VERSION__ >= 2085) // saturation x86 intrinsics disappeared in LLVM 8
4371         {
4372             // Generates PSUBSW since LDC 1.15 -O0
4375             enum prefix = `declare <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> %a, <8 x i16> %b)`;
4376             enum ir = `
4377                 %r = call <8 x i16> @llvm.ssub.sat.v8i16( <8 x i16> %0, <8 x i16> %1)
4378                 ret <8 x i16> %r`;
4379             return cast(__m128i) LDCInlineIREx!(prefix, ir, "", short8, short8, short8)(cast(short8)a, cast(short8)b);
4380         }
4381         else static if (LDC_with_ARM) // Raspberry ships with LDC 1.12, no saturation 
4382         {
4384             short[8] res;
4385             short8 sa = cast(short8)a;
4386             short8 sb = cast(short8)b;
4387             foreach(i; 0..8)
4388                 res[i] = saturateSignedIntToSignedShort(sa.array[i] - sb.array[i]);
4389             return _mm_loadu_si128(cast(int4*)res.ptr);
4390         }
4391         else static if (LDC_with_SSE2)
4392         {
4393             return cast(__m128i) __builtin_ia32_psubsw128(cast(short8) a, cast(short8) b);
4394         }
4395         else
4396             static assert(false);
4397     }
4398     else static if (GDC_with_SSE2)
4399     {
4400         return cast(__m128i) __builtin_ia32_psubsw128(cast(short8) a, cast(short8) b);
4401     }
4402     else
4403     {
4404         short[8] res;
4405         short8 sa = cast(short8)a;
4406         short8 sb = cast(short8)b;
4407         foreach(i; 0..8)
4408             res.ptr[i] = saturateSignedIntToSignedShort(sa.array[i] - sb.array[i]);
4409         return _mm_loadu_si128(cast(int4*)res.ptr);
4410     }
4411 }
4412 unittest
4413 {
4414     short8 res = cast(short8) _mm_subs_epi16(_mm_setr_epi16(32760, -32760, 5, 4, 3, 2, 1, 0),
4415                                              _mm_setr_epi16(-10  ,     16, 5, 4, 3, 2, 1, 0));
4416     static immutable short[8] correctResult =              [32767, -32768, 0, 0, 0, 0, 0, 0];
4417     assert(res.array == correctResult);
4418 }
4419 
/// Subtract packed 8-bit signed integers in `b` from packed 8-bit signed integers in `a` using signed saturation.
4421 __m128i _mm_subs_epi8(__m128i a, __m128i b) pure @trusted
4422 {
4423     version(LDC)
4424     {
4425         static if (__VERSION__ >= 2085) // saturation x86 intrinsics disappeared in LLVM 8
4426         {
4427             // x86: Generates PSUBSB since LDC 1.15 -O0
4428             // ARM: Generates sqsub.16b since LDC 1.21 -O0
4429             enum prefix = `declare <16 x i8> @llvm.ssub.sat.v16i8(<16 x i8> %a, <16 x i8> %b)`;
4430             enum ir = `
4431                 %r = call <16 x i8> @llvm.ssub.sat.v16i8( <16 x i8> %0, <16 x i8> %1)
4432                 ret <16 x i8> %r`;
4433             return cast(__m128i) LDCInlineIREx!(prefix, ir, "", byte16, byte16, byte16)(cast(byte16)a, cast(byte16)b);
4434         }
4435         else static if (LDC_with_ARM) // Raspberry ships with LDC 1.12, no saturation 
4436         {
4437             byte[16] res;
4438             byte16 sa = cast(byte16)a;
4439             byte16 sb = cast(byte16)b;
4440             foreach(i; 0..16)
4441                 res[i] = saturateSignedWordToSignedByte(sa.array[i] - sb.array[i]);
4442             return _mm_loadu_si128(cast(int4*)res.ptr);
4443         }
4444         else static if (LDC_with_SSE2)
4445         {
4446             return cast(__m128i) __builtin_ia32_psubsb128(cast(byte16) a, cast(byte16) b);
4447         }
4448         else
4449             static assert(false);
4450     }
4451     else static if (GDC_with_SSE2)
4452     {
4453         return cast(__m128i) __builtin_ia32_psubsb128(cast(ubyte16) a, cast(ubyte16) b);
4454     }
4455     else
4456     {
4457         byte[16] res;
4458         byte16 sa = cast(byte16)a;
4459         byte16 sb = cast(byte16)b;
4460         foreach(i; 0..16)
4461             res[i] = saturateSignedWordToSignedByte(sa.array[i] - sb.array[i]);
4462         return _mm_loadu_si128(cast(int4*)res.ptr);
4463     }
4464 }
4465 unittest
4466 {
4467     byte16 res = cast(byte16) _mm_subs_epi8(_mm_setr_epi8(-128, 127, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0),
4468                                             _mm_setr_epi8(  15, -14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0));
4469     static immutable byte[16] correctResult            = [-128, 127,  0,  0,  0,  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0];
4470     assert(res.array == correctResult);
4471 }
4472 
/// Subtract packed 16-bit unsigned integers in `b` from packed 16-bit unsigned integers in `a` using unsigned saturation.
4474 __m128i _mm_subs_epu16(__m128i a, __m128i b) pure @trusted
4475 {
4476     version(LDC)
4477     {
4478         static if (__VERSION__ >= 2085) // saturation x86 intrinsics disappeared in LLVM 8
4479         {
4480             // x86: Generates PSUBUSW since LDC 1.15 -O0
4481             // ARM: Generates uqsub.8h since LDC 1.21 -O0
4482             enum prefix = `declare <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> %a, <8 x i16> %b)`;
4483             enum ir = `
4484                 %r = call <8 x i16> @llvm.usub.sat.v8i16( <8 x i16> %0, <8 x i16> %1)
4485                 ret <8 x i16> %r`;
4486             return cast(__m128i) LDCInlineIREx!(prefix, ir, "", short8, short8, short8)(cast(short8)a, cast(short8)b);
4487         }
4488         else static if (LDC_with_ARM) // Raspberry ships with LDC 1.12, no saturation 
4489         {
4490             short[8] res;
4491             short8 sa = cast(short8)a;
4492             short8 sb = cast(short8)b;
4493             foreach(i; 0..8)
4494             {
4495                 int sum = cast(ushort)(sa.array[i]) - cast(ushort)(sb.array[i]);
4496                 res[i] = saturateSignedIntToUnsignedShort(sum);
4497             }
4498             return _mm_loadu_si128(cast(int4*)res.ptr);
4499         }
4500         else static if (LDC_with_SSE2)
4501         {
            return cast(__m128i) __builtin_ia32_psubusw128(cast(short8)a, cast(short8)b);
4503         }
4504         else 
4505             static assert(false);
4506     }
4507     else static if (GDC_with_SSE2)
4508     {
4509         return cast(__m128i) __builtin_ia32_psubusw128(cast(short8)a, cast(short8)b);
4510     }
4511     else
4512     {
4513         short[8] res;
4514         short8 sa = cast(short8)a;
4515         short8 sb = cast(short8)b;
4516         foreach(i; 0..8)
4517         {
4518             int sum = cast(ushort)(sa.array[i]) - cast(ushort)(sb.array[i]);
4519             res[i] = saturateSignedIntToUnsignedShort(sum);
4520         }
4521         return _mm_loadu_si128(cast(int4*)res.ptr);
4522     }
4523 }
4524 unittest
4525 {
4526     short8 R = cast(short8) _mm_subs_epu16(_mm_setr_epi16(cast(short)65534,  1, 5, 4, 3, 2, 1, 0),
4527                                            _mm_setr_epi16(cast(short)65535, 16, 4, 4, 3, 0, 1, 0));
4528     static immutable short[8] correct =                  [               0,  0, 1, 0, 0, 2, 0, 0];
4529     assert(R.array == correct);
4530 }
4531 
/// Subtract packed 8-bit unsigned integers in `b` from packed 8-bit unsigned integers in `a` using unsigned saturation.
4533 __m128i _mm_subs_epu8(__m128i a, __m128i b) pure @trusted
4534 {
4535     version(LDC)
4536     {
4537         static if (__VERSION__ >= 2085) // saturation x86 intrinsics disappeared in LLVM 8
4538         {
4539             // x86: Generates PSUBUSB since LDC 1.15 -O0
4540             // ARM: Generates uqsub.16b since LDC 1.21 -O0
4541             enum prefix = `declare <16 x i8> @llvm.usub.sat.v16i8(<16 x i8> %a, <16 x i8> %b)`;
4542             enum ir = `
4543                 %r = call <16 x i8> @llvm.usub.sat.v16i8( <16 x i8> %0, <16 x i8> %1)
4544                 ret <16 x i8> %r`;
4545             return cast(__m128i) LDCInlineIREx!(prefix, ir, "", byte16, byte16, byte16)(cast(byte16)a, cast(byte16)b);
4546         }
        else static if (LDC_with_ARM) // Raspberry ships with LDC 1.12, no saturation
        {
            ubyte[16] res;
            byte16 sa = cast(byte16)a;
            byte16 sb = cast(byte16)b;
            foreach(i; 0..16)
                res[i] = saturateSignedWordToUnsignedByte(cast(ubyte)(sa.array[i]) - cast(ubyte)(sb.array[i]));
            return _mm_loadu_si128(cast(int4*)res.ptr);
        }
4560         else static if (LDC_with_SSE2)
4561         {
            return cast(__m128i) __builtin_ia32_psubusb128(cast(byte16)a, cast(byte16)b);
4563         }
4564         else 
4565             static assert(false);
4566     }
4567     else static if (GDC_with_SSE2)
4568     {
4569         return cast(__m128i) __builtin_ia32_psubusb128(cast(ubyte16) a, cast(ubyte16) b);
4570     }
4571     else
4572     {
4573         ubyte[16] res;
4574         byte16 sa = cast(byte16)a;
4575         byte16 sb = cast(byte16)b;
4576         foreach(i; 0..16)
4577             res[i] = saturateSignedWordToUnsignedByte(cast(ubyte)(sa.array[i]) - cast(ubyte)(sb.array[i]));
4578         return _mm_loadu_si128(cast(int4*)res.ptr);
4579     }
4580 }
4581 unittest
4582 {
4583     byte16 res = cast(byte16) _mm_subs_epu8(_mm_setr_epi8(cast(byte)254, 127, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0),
4584                                             _mm_setr_epi8(cast(byte)255, 120, 14, 42, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0));
4585     static immutable byte[16] correctResult =            [            0,   7,  0,  0,  0,  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0];
4586     assert(res.array == correctResult);
4587 }
4588 
// Note: the only difference between the `_mm_comi*_sd` and `_mm_ucomi*_sd` intrinsics is their
//       signalling behaviour on quiet NaNs (the `ucomi` forms do not raise an exception).
//       Aliasing them is slightly incorrect, but the case where you would want to differentiate
//       between qNaN and sNaN and then treat them differently on purpose seems extremely rare.
4593 alias _mm_ucomieq_sd = _mm_comieq_sd; ///
4594 alias _mm_ucomige_sd = _mm_comige_sd; ///
4595 alias _mm_ucomigt_sd = _mm_comigt_sd; ///
4596 alias _mm_ucomile_sd = _mm_comile_sd; ///
4597 alias _mm_ucomilt_sd = _mm_comilt_sd; ///
4598 alias _mm_ucomineq_sd = _mm_comineq_sd; ///
4599 
4600 /// Return vector of type `__m128d` with undefined elements.
4601 __m128d _mm_undefined_pd() pure @safe
4602 {
4603     pragma(inline, true);
4604     __m128d result = void;
4605     return result;
4606 }
4607 
4608 /// Return vector of type `__m128i` with undefined elements.
4609 __m128i _mm_undefined_si128() pure @safe
4610 {
4611     pragma(inline, true);
4612     __m128i result = void;
4613     return result;
4614 }
4615 
4616 /// Unpack and interleave 16-bit integers from the high half of `a` and `b`.
4617 __m128i _mm_unpackhi_epi16 (__m128i a, __m128i b) pure @safe
4618 {
4619     static if (GDC_with_SSE2)
4620     {
4621         return cast(__m128i) __builtin_ia32_punpckhwd128(cast(short8) a, cast(short8) b);
4622     }
4623     else static if (DMD_with_32bit_asm)
4624     {
4625         asm pure nothrow @nogc @trusted
4626         {
4627             movdqu XMM0, a;
4628             movdqu XMM1, b;
4629             punpckhwd XMM0, XMM1;
4630             movdqu a, XMM0;
4631         }
4632         return a;
4633     }
4634     else
4635     {
4636         return cast(__m128i) shufflevector!(short8, 4, 12, 5, 13, 6, 14, 7, 15)
4637                                            (cast(short8)a, cast(short8)b);
4638     }
4639 }
4640 unittest
4641 {
4642     __m128i A = _mm_setr_epi16(4,   5,  6,  7,  8,  9, 10, 11);
4643     __m128i B = _mm_setr_epi16(12, 13, 14, 15, 16, 17, 18, 19);
4644     short8 C = cast(short8)(_mm_unpackhi_epi16(A, B));
4645     short[8] correct = [8, 16, 9, 17, 10, 18, 11, 19];
4646     assert(C.array == correct);
4647 }
4648 
4649 /// Unpack and interleave 32-bit integers from the high half of `a` and `b`.
4650 __m128i _mm_unpackhi_epi32 (__m128i a, __m128i b) pure @trusted
4651 {
4652     static if (GDC_with_SSE2)
4653     {
4654         return __builtin_ia32_punpckhdq128(a, b);
4655     }
4656     else version(DigitalMars)
4657     {
4658         __m128i r;
4659         r.ptr[0] = a.array[2];
4660         r.ptr[1] = b.array[2];
4661         r.ptr[2] = a.array[3];
4662         r.ptr[3] = b.array[3];
4663         return r;
4664     }
4665     else
4666     {
4667         return shufflevector!(int4, 2, 6, 3, 7)(cast(int4)a, cast(int4)b);
4668     }
4669 }
4670 unittest
4671 {
4672     __m128i A = _mm_setr_epi32(1, 2, 3, 4);
4673     __m128i B = _mm_setr_epi32(5, 6, 7, 8);
4674     __m128i C = _mm_unpackhi_epi32(A, B);
4675     int[4] correct = [3, 7, 4, 8];
4676     assert(C.array == correct);
4677 }
4678 
4679 /// Unpack and interleave 64-bit integers from the high half of `a` and `b`.
4680 __m128i _mm_unpackhi_epi64 (__m128i a, __m128i b) pure @trusted
4681 {
4682     static if (GDC_with_SSE2)
4683     {
4684         return cast(__m128i) __builtin_ia32_punpckhqdq128(cast(long2) a, cast(long2) b);
4685     }
4686     else
4687     {
4688         __m128i r = cast(__m128i)b;
4689         r[0] = a[2];
4690         r[1] = a[3];
4691         return r; 
4692     }
4693 }
4694 unittest // Issue #36
4695 {
4696     __m128i A = _mm_setr_epi64(0x22222222_22222222, 0x33333333_33333333);
4697     __m128i B = _mm_setr_epi64(0x44444444_44444444, 0x55555555_55555555);
4698     long2 C = cast(long2)(_mm_unpackhi_epi64(A, B));
4699     long[2] correct = [0x33333333_33333333, 0x55555555_55555555];
4700     assert(C.array == correct);
4701 }
4702 
4703 /// Unpack and interleave 8-bit integers from the high half of `a` and `b`.
4704 __m128i _mm_unpackhi_epi8 (__m128i a, __m128i b) pure @safe
4705 {
4706     static if (GDC_with_SSE2)
4707     {
4708         return cast(__m128i) __builtin_ia32_punpckhbw128(cast(ubyte16)a, cast(ubyte16)b);
4709     }
4710     else static if (DMD_with_32bit_asm)
4711     {
4712         asm pure nothrow @nogc @trusted
4713         {
4714             movdqu XMM0, a;
4715             movdqu XMM1, b;
4716             punpckhbw XMM0, XMM1;
4717             movdqu a, XMM0;
4718         }
4719         return a;
4720     }
4721     else
4722     {
4723         return cast(__m128i)shufflevector!(byte16, 8,  24,  9, 25, 10, 26, 11, 27,
4724                                                    12, 28, 13, 29, 14, 30, 15, 31)
4725                                                    (cast(byte16)a, cast(byte16)b);
4726     }
4727 }
4728 unittest
4729 {
4730     __m128i A = _mm_setr_epi8( 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15);
4731     __m128i B = _mm_setr_epi8(16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31);
4732     byte16 C = cast(byte16) _mm_unpackhi_epi8(A, B);
4733     byte[16] correct = [8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31];
4734     assert(C.array == correct);
4735 }
4736 
4737 /// Unpack and interleave double-precision (64-bit) floating-point elements from the high half of `a` and `b`.
4738 __m128d _mm_unpackhi_pd (__m128d a, __m128d b) pure @safe
4739 {
4740     static if (GDC_with_SSE2)
4741     {
4742         return __builtin_ia32_unpckhpd(a, b);
4743     }
4744     else
4745     {
4746         return shufflevector!(__m128d, 1, 3)(a, b);
4747     }
4748 }
4749 unittest
4750 {
4751     __m128d A = _mm_setr_pd(4.0, 6.0);
4752     __m128d B = _mm_setr_pd(7.0, 9.0);
4753     __m128d C = _mm_unpackhi_pd(A, B);
4754     double[2] correct = [6.0, 9.0];
4755     assert(C.array == correct);
4756 }
4757 
4758 /// Unpack and interleave 16-bit integers from the low half of `a` and `b`.
4759 __m128i _mm_unpacklo_epi16 (__m128i a, __m128i b) pure @safe
4760 {
4761     static if (GDC_with_SSE2)
4762     {
4763         return cast(__m128i) __builtin_ia32_punpcklwd128(cast(short8) a, cast(short8) b);
4764     }
4765     else static if (DMD_with_32bit_asm)
4766     {
4767         asm pure nothrow @nogc @trusted
4768         {
4769             movdqu XMM0, a;
4770             movdqu XMM1, b;
4771             punpcklwd XMM0, XMM1;
4772             movdqu a, XMM0;
4773         }
4774         return a;
4775     }
4776     else
4777     {
4778         return cast(__m128i) shufflevector!(short8, 0, 8, 1, 9, 2, 10, 3, 11)
4779                                            (cast(short8)a, cast(short8)b);
4780     }
4781 }
4782 unittest
4783 {
4784     __m128i A = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
4785     __m128i B = _mm_setr_epi16(8, 9, 10, 11, 12, 13, 14, 15);
4786     short8 C = cast(short8) _mm_unpacklo_epi16(A, B);
4787     short[8] correct = [0, 8, 1, 9, 2, 10, 3, 11];
4788     assert(C.array == correct);
4789 }
4790 
4791 /// Unpack and interleave 32-bit integers from the low half of `a` and `b`.
4792 __m128i _mm_unpacklo_epi32 (__m128i a, __m128i b) pure @trusted
4793 {
4794     static if (GDC_with_SSE2)
4795     {
4796         return __builtin_ia32_punpckldq128(a, b);
4797     }
4798     else version(DigitalMars)
4799     {
4800         __m128i r;
4801         r.ptr[0] = a.array[0];
4802         r.ptr[1] = b.array[0];
4803         r.ptr[2] = a.array[1];
4804         r.ptr[3] = b.array[1];
4805         return r;
4806     }
4807     else
4808     {
4809         return shufflevector!(int4, 0, 4, 1, 5)(cast(int4)a, cast(int4)b);
4810     }
4811 }
4812 unittest
4813 {
4814     __m128i A = _mm_setr_epi32(1, 2, 3, 4);
4815     __m128i B = _mm_setr_epi32(5, 6, 7, 8);
4816     __m128i C = _mm_unpacklo_epi32(A, B);
4817     int[4] correct = [1, 5, 2, 6];
4818     assert(C.array == correct);
4819 }
4820 
4821 /// Unpack and interleave 64-bit integers from the low half of `a` and `b`.
4822 __m128i _mm_unpacklo_epi64 (__m128i a, __m128i b) pure @trusted
4823 {
4824     static if (GDC_with_SSE2)
4825     {
4826         return cast(__m128i) __builtin_ia32_punpcklqdq128(cast(long2) a, cast(long2) b);
4827     }
4828     else
4829     {
4830         long2 lA = cast(long2)a;
4831         long2 lB = cast(long2)b;
4832         long2 R;
4833         R.ptr[0] = lA.array[0];
4834         R.ptr[1] = lB.array[0];
4835         return cast(__m128i)R;
4836     }
4837 }
4838 unittest // Issue #36
4839 {
4840     __m128i A = _mm_setr_epi64(0x22222222_22222222, 0x33333333_33333333);
4841     __m128i B = _mm_setr_epi64(0x44444444_44444444, 0x55555555_55555555);
4842     long2 C = cast(long2)(_mm_unpacklo_epi64(A, B));
4843     long[2] correct = [0x22222222_22222222, 0x44444444_44444444];
4844     assert(C.array == correct);
4845 }
4846 
4847 /// Unpack and interleave 8-bit integers from the low half of `a` and `b`.
4848 __m128i _mm_unpacklo_epi8 (__m128i a, __m128i b) pure @safe
4849 {
4850     static if (GDC_with_SSE2)
4851     {
4852         return cast(__m128i) __builtin_ia32_punpcklbw128(cast(ubyte16) a, cast(ubyte16) b);
4853     }
4854     else static if (DMD_with_32bit_asm)
4855     {
4856         asm pure nothrow @nogc @trusted
4857         {
4858             movdqu XMM0, a;
4859             movdqu XMM1, b;
4860             punpcklbw XMM0, XMM1;
4861             movdqu a, XMM0;
4862         }
4863         return a;
4864     }
4865     else
4866     {
4867         return cast(__m128i) shufflevector!(byte16, 0, 16, 1, 17, 2, 18, 3, 19,
4868                                                     4, 20, 5, 21, 6, 22, 7, 23)
4869                                            (cast(byte16)a, cast(byte16)b);
4870     }
4871 }
4872 unittest
4873 {
4874     __m128i A = _mm_setr_epi8( 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15);
4875     __m128i B = _mm_setr_epi8(16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31);
4876     byte16 C = cast(byte16) _mm_unpacklo_epi8(A, B);
4877     byte[16] correct = [0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23];
4878     assert(C.array == correct);
4879 }
4880 
4881 /// Unpack and interleave double-precision (64-bit) floating-point elements from the low half of `a` and `b`.
4882 __m128d _mm_unpacklo_pd (__m128d a, __m128d b) pure @safe
4883 {
4884     static if (GDC_with_SSE2)
4885     {
4886         return __builtin_ia32_unpcklpd(a, b);
4887     }
4888     else
4889     {
4890         return shufflevector!(__m128d, 0, 2)(a, b);
4891     }
4892 }
4893 unittest
4894 {
4895     __m128d A = _mm_setr_pd(4.0, 6.0);
4896     __m128d B = _mm_setr_pd(7.0, 9.0);
4897     __m128d C = _mm_unpacklo_pd(A, B);
4898     double[2] correct = [4.0, 7.0];
4899     assert(C.array == correct);
4900 }
4901 
4902 /// Compute the bitwise XOR of packed double-precision (64-bit) floating-point elements in `a` and `b`.
4903 __m128d _mm_xor_pd (__m128d a, __m128d b) pure @safe
4904 {
4905     return cast(__m128d)(cast(__m128i)a ^ cast(__m128i)b);
4906 }
// TODO: add a unittest, then force inlining with pragma(inline, true).
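// A minimal sketch of a unit test for the TODO above; the values below are
// illustrative assumptions, not from the upstream test suite. XOR-ing with a
// vector whose lanes are -0.0 flips only the sign bit of each lane.
unittest
{
    __m128d A = _mm_setr_pd(-4.0, 6.0);
    __m128d signMask = _mm_setr_pd(-0.0, -0.0);
    __m128d R = _mm_xor_pd(A, signMask);
    double[2] correct = [4.0, -6.0];
    assert(R.array == correct);
}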
4908 
4909 /// Compute the bitwise XOR of 128 bits (representing integer data) in `a` and `b`.
4910 __m128i _mm_xor_si128 (__m128i a, __m128i b) pure @safe
4911 {
4912     return a ^ b;
4913 }
// TODO: add a unittest, then force inlining with pragma(inline, true).
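// A minimal sketch of a unit test for the TODO above; the values below are
// illustrative assumptions, not from the upstream test suite.
unittest
{
    __m128i A = _mm_setr_epi32( 0, -1, 0x5555_5555, 0x0123_4567);
    __m128i B = _mm_setr_epi32(-1, -1, 0x5555_5555, 0x0000_FFFF);
    __m128i C = _mm_xor_si128(A, B);
    int[4] correct = [-1, 0, 0, 0x0123_BA98];
    assert(C.array == correct);
}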
4915 
// Usage example: Euclidean distance between two 4-D points, summing the
// squared differences into the low lane and taking its square root.
unittest
4917 {
4918     float distance(float[4] a, float[4] b) nothrow @nogc
4919     {
4920         __m128 va = _mm_loadu_ps(a.ptr);
4921         __m128 vb = _mm_loadu_ps(b.ptr);
4922         __m128 diffSquared = _mm_sub_ps(va, vb);
4923         diffSquared = _mm_mul_ps(diffSquared, diffSquared);
4924         __m128 sum = _mm_add_ps(diffSquared, _mm_srli_ps!8(diffSquared));
4925         sum = _mm_add_ps(sum, _mm_srli_ps!4(sum));
4926         return _mm_cvtss_f32(_mm_sqrt_ss(sum));
4927     }
4928     assert(distance([0, 2, 0, 0], [0, 0, 0, 0]) == 2);
4929 }