1 /**
2 * SSE2 intrinsics. 
3 *
4 * Copyright: Copyright Guillaume Piolat 2016-2020, Stefanos Baziotis 2019.
5 * License:   $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
6 */
7 module inteli.emmintrin;
8 
9 public import inteli.types;
10 public import inteli.xmmintrin; // SSE2 includes SSE1
11 import inteli.mmx;
12 import inteli.internals;
13 
14 nothrow @nogc:
15 
16 
17 // SSE2 instructions
18 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#techs=SSE2
19 
20 /// Add packed 16-bit integers in `a` and `b`.
21 __m128i _mm_add_epi16 (__m128i a, __m128i b) pure @safe
22 {
23     return cast(__m128i)(cast(short8)a + cast(short8)b);
24 }
25 unittest
26 {
27     __m128i A = _mm_setr_epi16(4, 8, 13, -7, -1, 0, 9, 77);
28     short8 R = cast(short8) _mm_add_epi16(A, A);
29     short[8] correct = [8, 16, 26, -14, -2, 0, 18, 154];
30     assert(R.array == correct);
31 }
32 
33 /// Add packed 32-bit integers in `a` and `b`.
34 __m128i _mm_add_epi32 (__m128i a, __m128i b) pure @safe
35 {
36     return cast(__m128i)(cast(int4)a + cast(int4)b);
37 }
38 unittest
39 {
40     __m128i A = _mm_setr_epi32( -7, -1, 0, 9);
41     int4 R = _mm_add_epi32(A, A);
42     int[4] correct = [ -14, -2, 0, 18 ];
43     assert(R.array == correct);
44 }
45 
46 /// Add packed 64-bit integers in `a` and `b`.
47 __m128i _mm_add_epi64 (__m128i a, __m128i b) pure @safe
48 {
49     return cast(__m128i)(cast(long2)a + cast(long2)b);
50 }
51 unittest
52 {
53     __m128i A = _mm_setr_epi64(-1, 0x8000_0000_0000_0000);
54     long2 R = cast(long2) _mm_add_epi64(A, A);
55     long[2] correct = [ -2, 0 ];
56     assert(R.array == correct);
57 }
58 
59 /// Add packed 8-bit integers in `a` and `b`.
60 __m128i _mm_add_epi8 (__m128i a, __m128i b) pure @safe
61 {
62     return cast(__m128i)(cast(byte16)a + cast(byte16)b);
63 }
64 unittest
65 {
66     __m128i A = _mm_setr_epi8(4, 8, 13, -7, -1, 0, 9, 77, 4, 8, 13, -7, -1, 0, 9, 78);
67     byte16 R = cast(byte16) _mm_add_epi8(A, A);
68     byte[16] correct = [8, 16, 26, -14, -2, 0, 18, -102, 8, 16, 26, -14, -2, 0, 18, -100];
69     assert(R.array == correct);
70 }
71 
/// Add the lower double-precision (64-bit) floating-point elements 
/// in `a` and `b`, store the result in the lower element, 
/// and copy the upper element from `a` to the upper element of the result.
75 __m128d _mm_add_sd(__m128d a, __m128d b) pure @safe
76 {
77     static if (GDC_with_SSE2)
78     {
79         return __builtin_ia32_addsd(a, b);
80     }
81     else version(DigitalMars)
82     {
83         // Work-around for https://issues.dlang.org/show_bug.cgi?id=19599
        // Note: this seems unneeded since at least DMD 2.094.0, but that hasn't been re-investigated.
85         asm pure nothrow @nogc @trusted { nop;}
86         a[0] = a[0] + b[0];
87         return a;
88     }
89     else
90     {
91         a[0] += b[0];
92         return a;
93     }
94 }
95 unittest
96 {
97     __m128d a = [1.5, -2.0];
98     a = _mm_add_sd(a, a);
99     assert(a.array == [3.0, -2.0]);
100 }
101 
102 /// Add packed double-precision (64-bit) floating-point elements in `a` and `b`.
103 __m128d _mm_add_pd (__m128d a, __m128d b) pure @safe
104 {
105     return a + b;
106 }
107 unittest
108 {
109     __m128d a = [1.5, -2.0];
110     a = _mm_add_pd(a, a);
111     assert(a.array == [3.0, -4.0]);
112 }
113 
114 /// Add 64-bit integers `a` and `b`.
115 __m64 _mm_add_si64 (__m64 a, __m64 b) pure @safe
116 {
117     return a + b;
118 }
119 
120 /// Add packed 16-bit integers in `a` and `b` using signed saturation.
121 __m128i _mm_adds_epi16(__m128i a, __m128i b) pure @trusted
122 {
123     static if (GDC_with_SSE2)
124     {
125         return cast(__m128i)__builtin_ia32_paddsw128(cast(short8)a, cast(short8)b);
126     }
127     else version(LDC)
128     {
129         static if (__VERSION__ >= 2085) // saturation x86 intrinsics disappeared in LLVM 8
130         {
131             // x86: Generates PADDSW since LDC 1.15 -O0
132             // ARM: Generates sqadd.8h since LDC 1.21 -O1, really bad in <= 1.20            
133             enum prefix = `declare <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> %a, <8 x i16> %b)`;
134             enum ir = `
135                 %r = call <8 x i16> @llvm.sadd.sat.v8i16( <8 x i16> %0, <8 x i16> %1)
136                 ret <8 x i16> %r`;
137             return cast(__m128i) LDCInlineIREx!(prefix, ir, "", short8, short8, short8)(cast(short8)a, cast(short8)b);
138         }
139         else static if (LDC_with_ARM) // Raspberry ships with LDC 1.12, no saturation 
140         {
141             short[8] res;
142             short8 sa = cast(short8)a;
143             short8 sb = cast(short8)b;
144             foreach(i; 0..8)
145                 res[i] = saturateSignedIntToSignedShort(sa.array[i] + sb.array[i]);
146             return _mm_loadu_si128(cast(int4*)res.ptr);
147         }
148         else
149             return cast(__m128i) __builtin_ia32_paddsw128(cast(short8)a, cast(short8)b);
150     }
151     else
152     {
153         short[8] res;
154         short8 sa = cast(short8)a;
155         short8 sb = cast(short8)b;
156         foreach(i; 0..8)
157             res[i] = saturateSignedIntToSignedShort(sa.array[i] + sb.array[i]);
158         return _mm_loadu_si128(cast(int4*)res.ptr);
159     }
160 }
161 unittest
162 {
163     short8 res = cast(short8) _mm_adds_epi16(_mm_set_epi16(7, 6, 5, 4, 3, 2, 1, 0),
164                                              _mm_set_epi16(7, 6, 5, 4, 3, 2, 1, 0));
165     static immutable short[8] correctResult = [0, 2, 4, 6, 8, 10, 12, 14];
166     assert(res.array == correctResult);
167 }
168 
169 /// Add packed 8-bit signed integers in `a` and `b` using signed saturation.
170 __m128i _mm_adds_epi8(__m128i a, __m128i b) pure @trusted
171 {
172     static if (GDC_with_SSE2)
173     {
174         return cast(__m128i) __builtin_ia32_paddsb128(cast(ubyte16)a, cast(ubyte16)b);
175     }
176     else version(LDC)
177     {
178         static if (__VERSION__ >= 2085) // saturation x86 intrinsics disappeared in LLVM 8
179         {
180             // x86: Generates PADDSB since LDC 1.15 -O0
181             // ARM: Generates sqadd.16b since LDC 1.21 -O1, really bad in <= 1.20
182             enum prefix = `declare <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8> %a, <16 x i8> %b)`;
183             enum ir = `
184                 %r = call <16 x i8> @llvm.sadd.sat.v16i8( <16 x i8> %0, <16 x i8> %1)
185                 ret <16 x i8> %r`;
186             return cast(__m128i) LDCInlineIREx!(prefix, ir, "", byte16, byte16, byte16)(cast(byte16)a, cast(byte16)b);
187         }
188         else static if (LDC_with_ARM) // Raspberry ships with LDC 1.12, no saturation 
189         {
190             byte[16] res;
191             byte16 sa = cast(byte16)a;
192             byte16 sb = cast(byte16)b;
193             foreach(i; 0..16)
                res[i] = saturateSignedWordToSignedByte(sa.array[i] + sb.array[i]);
195             return _mm_loadu_si128(cast(int4*)res.ptr);
196         }
197         else
198             return cast(__m128i) __builtin_ia32_paddsb128(cast(byte16)a, cast(byte16)b);
199     }
200     else
201     {
202         byte[16] res;
203         byte16 sa = cast(byte16)a;
204         byte16 sb = cast(byte16)b;
205         foreach(i; 0..16)
            res[i] = saturateSignedWordToSignedByte(sa.array[i] + sb.array[i]);
207         return _mm_loadu_si128(cast(int4*)res.ptr);
208     }
209 }
210 unittest
211 {
212     byte16 res = cast(byte16) _mm_adds_epi8(_mm_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0),
213                                             _mm_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0));
214     static immutable byte[16] correctResult = [0, 2, 4, 6, 8, 10, 12, 14,
215                                                16, 18, 20, 22, 24, 26, 28, 30];
216     assert(res.array == correctResult);
217 }
218 
219 /// Add packed 8-bit unsigned integers in `a` and `b` using unsigned saturation.
220 // PERF: #GDC version?
221 __m128i _mm_adds_epu8(__m128i a, __m128i b) pure @trusted
222 {
223     version(LDC)
224     {
225         static if (__VERSION__ >= 2085) // saturation x86 intrinsics disappeared in LLVM 8
226         {
227             // x86: Generates PADDUSB since LDC 1.15 -O0
228             // ARM: Generates uqadd.16b since LDC 1.21 -O1
229             enum prefix = `declare <16 x i8> @llvm.uadd.sat.v16i8(<16 x i8> %a, <16 x i8> %b)`;
230             enum ir = `
231                 %r = call <16 x i8> @llvm.uadd.sat.v16i8( <16 x i8> %0, <16 x i8> %1)
232                 ret <16 x i8> %r`;
233             return cast(__m128i) LDCInlineIREx!(prefix, ir, "", byte16, byte16, byte16)(cast(byte16)a, cast(byte16)b);
234         }
235         else static if (LDC_with_ARM) // Raspberry ships with LDC 1.12, no saturation 
236         {
237             ubyte[16] res;
238             byte16 sa = cast(byte16)a;
239             byte16 sb = cast(byte16)b;
240             foreach(i; 0..16)
241                 res[i] = saturateSignedWordToUnsignedByte(cast(ubyte)(sa.array[i]) + cast(ubyte)(sb.array[i]));
242             return _mm_loadu_si128(cast(int4*)res.ptr);
243         }
244         else
            return cast(__m128i) __builtin_ia32_paddusb128(cast(byte16)a, cast(byte16)b);
246     }
247     else
248     {
249         ubyte[16] res;
250         byte16 sa = cast(byte16)a;
251         byte16 sb = cast(byte16)b;
252         foreach(i; 0..16)
253             res[i] = saturateSignedWordToUnsignedByte(cast(ubyte)(sa.array[i]) + cast(ubyte)(sb.array[i]));
254         return _mm_loadu_si128(cast(int4*)res.ptr);
255     }
256 }
257 unittest
258 {
259     byte16 res = cast(byte16) 
260         _mm_adds_epu8(_mm_set_epi8(7, 6, 5, 4, 3, 2, cast(byte)255, 0, 7, 6, 5, 4, 3, 2, cast(byte)255, 0),
261                       _mm_set_epi8(7, 6, 5, 4, 3, 2, 1, 0, 7, 6, 5, 4, 3, 2, 1, 0));
262     static immutable byte[16] correctResult = [0, cast(byte)255, 4, 6, 8, 10, 12, 14, 
263                                                0, cast(byte)255, 4, 6, 8, 10, 12, 14];
264     assert(res.array == correctResult);
265 }
266 
267 /// Add packed unsigned 16-bit integers in `a` and `b` using unsigned saturation.
268 // PERF: #GDC version?
269 __m128i _mm_adds_epu16(__m128i a, __m128i b) pure @trusted
270 {
271     version(LDC)
272     {
273         static if (__VERSION__ >= 2085) // saturation x86 intrinsics disappeared in LLVM 8
274         {
275             // x86: Generates PADDUSW since LDC 1.15 -O0
276             // ARM: Generates uqadd.8h since LDC 1.21 -O1
277             enum prefix = `declare <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> %a, <8 x i16> %b)`;
278             enum ir = `
279                 %r = call <8 x i16> @llvm.uadd.sat.v8i16( <8 x i16> %0, <8 x i16> %1)
280                 ret <8 x i16> %r`;
281             return cast(__m128i) LDCInlineIREx!(prefix, ir, "", short8, short8, short8)(cast(short8)a, cast(short8)b);
282         }
283         else static if (LDC_with_ARM) // Raspberry ships with LDC 1.12, no saturation 
284         {
285             ushort[8] res;
286             short8 sa = cast(short8)a;
287             short8 sb = cast(short8)b;
288             foreach(i; 0..8)
289                 res[i] = saturateSignedIntToUnsignedShort(cast(ushort)(sa.array[i]) + cast(ushort)(sb.array[i]));
290             return _mm_loadu_si128(cast(int4*)res.ptr);
291         }
292         else
            return cast(__m128i) __builtin_ia32_paddusw128(cast(short8)a, cast(short8)b);
294     }
295     else
296     {
297         ushort[8] res;
298         short8 sa = cast(short8)a;
299         short8 sb = cast(short8)b;
300         foreach(i; 0..8)
301             res[i] = saturateSignedIntToUnsignedShort(cast(ushort)(sa.array[i]) + cast(ushort)(sb.array[i]));
302         return _mm_loadu_si128(cast(int4*)res.ptr);
303     }
304 }
305 unittest
306 {
307     short8 res = cast(short8) _mm_adds_epu16(_mm_set_epi16(3, 2, cast(short)65535, 0, 3, 2, cast(short)65535, 0),
308                                              _mm_set_epi16(3, 2, 1, 0, 3, 2, 1, 0));
309     static immutable short[8] correctResult = [0, cast(short)65535, 4, 6, 0, cast(short)65535, 4, 6];
310     assert(res.array == correctResult);
311 }
312 
313 /// Compute the bitwise AND of packed double-precision (64-bit) 
314 /// floating-point elements in `a` and `b`.
315 __m128d _mm_and_pd (__m128d a, __m128d b) pure @safe
316 {
317     return cast(__m128d)( cast(long2)a & cast(long2)b );
318 }
319 unittest
320 {
321     double a = 4.32;
322     double b = -78.99;
323     long correct = (*cast(long*)(&a)) & (*cast(long*)(&b));
324     __m128d A = _mm_set_pd(a, b);
325     __m128d B = _mm_set_pd(b, a);
326     long2 R = cast(long2)( _mm_and_pd(A, B) );
327     assert(R.array[0] == correct);
328     assert(R.array[1] == correct);
329 }
330 
331 /// Compute the bitwise AND of 128 bits (representing integer data) in `a` and `b`.
332 __m128i _mm_and_si128 (__m128i a, __m128i b) pure @safe
333 {
334     return a & b;
335 }
336 unittest
337 {
338     __m128i A = _mm_set1_epi32(7);
339     __m128i B = _mm_set1_epi32(14);
340     __m128i R = _mm_and_si128(A, B);
341     int[4] correct = [6, 6, 6, 6];
342     assert(R.array == correct);
343 }
344 
345 /// Compute the bitwise NOT of packed double-precision (64-bit) 
346 /// floating-point elements in `a` and then AND with `b`.
347 __m128d _mm_andnot_pd (__m128d a, __m128d b) pure @safe
348 {
349     return cast(__m128d)( ~(cast(long2)a) & cast(long2)b);
350 }
351 unittest
352 {
353     double a = 4.32;
354     double b = -78.99;
355     long correct  = (~*cast(long*)(&a)) & ( *cast(long*)(&b));
356     long correct2 = ( *cast(long*)(&a)) & (~*cast(long*)(&b));
357     __m128d A = _mm_setr_pd(a, b);
358     __m128d B = _mm_setr_pd(b, a);
359     long2 R = cast(long2)( _mm_andnot_pd(A, B) );
360     assert(R.array[0] == correct);
361     assert(R.array[1] == correct2);
362 }
363 
364 /// Compute the bitwise NOT of 128 bits (representing integer data) 
365 /// in `a` and then AND with `b`.
366 __m128i _mm_andnot_si128 (__m128i a, __m128i b) pure @safe
367 {
368     return (~a) & b;
369 }
370 unittest
371 {
372     __m128i A = _mm_set1_epi32(7);
373     __m128i B = _mm_set1_epi32(14);
374     __m128i R = _mm_andnot_si128(A, B);
375     int[4] correct = [8, 8, 8, 8];
376     assert(R.array == correct);
377 }
378 
379 /// Average packed unsigned 16-bit integers in `a` and `b`.
380 __m128i _mm_avg_epu16 (__m128i a, __m128i b) pure @trusted
381 {
382     static if (GDC_with_SSE2)
383     {
384         return cast(__m128i) __builtin_ia32_pavgw128(cast(short8)a, cast(short8)b);
385     }
386     else static if (LDC_with_ARM64)
387     {
388         return cast(__m128i) vrhadd_u16(cast(short8)a, cast(short8)b);
389     }
390     else version(LDC)
391     {
392         // Generates pavgw even in LDC 1.0, even in -O0
        // But not on ARM
394         enum ir = `
395             %ia = zext <8 x i16> %0 to <8 x i32>
396             %ib = zext <8 x i16> %1 to <8 x i32>
397             %isum = add <8 x i32> %ia, %ib
398             %isum1 = add <8 x i32> %isum, < i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
399             %isums = lshr <8 x i32> %isum1, < i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
400             %r = trunc <8 x i32> %isums to <8 x i16>
401             ret <8 x i16> %r`;
402         return cast(__m128i) LDCInlineIR!(ir, short8, short8, short8)(cast(short8)a, cast(short8)b);
403     }
404     else
405     {
406         short8 sa = cast(short8)a;
407         short8 sb = cast(short8)b;
408         short8 sr = void;
409         foreach(i; 0..8)
410         {
411             sr.ptr[i] = cast(ushort)( (cast(ushort)(sa.array[i]) + cast(ushort)(sb.array[i]) + 1) >> 1 );
412         }
413         return cast(int4)sr;
414     }
415 }
416 unittest
417 {
418     __m128i A = _mm_set1_epi16(31);
419     __m128i B = _mm_set1_epi16(64);
420     short8 avg = cast(short8)(_mm_avg_epu16(A, B));
421     foreach(i; 0..8)
422         assert(avg.array[i] == 48);
423 }
424 
425 /// Average packed unsigned 8-bit integers in `a` and `b`.
426 __m128i _mm_avg_epu8 (__m128i a, __m128i b) pure @trusted
427 {
428     static if (GDC_with_SSE2)
429     {
430         return cast(__m128i) __builtin_ia32_pavgb128(cast(ubyte16)a, cast(ubyte16)b);
431     }
432     else static if (LDC_with_ARM64)
433     {
434         return cast(__m128i) vrhadd_u8(cast(byte16)a, cast(byte16)b);
435     }
436     else version(LDC)
437     {
438         // Generates pavgb even in LDC 1.0, even in -O0
        // But not on ARM
440         enum ir = `
441             %ia = zext <16 x i8> %0 to <16 x i16>
442             %ib = zext <16 x i8> %1 to <16 x i16>
443             %isum = add <16 x i16> %ia, %ib
444             %isum1 = add <16 x i16> %isum, < i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
445             %isums = lshr <16 x i16> %isum1, < i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
446             %r = trunc <16 x i16> %isums to <16 x i8>
447             ret <16 x i8> %r`;
448         return cast(__m128i) LDCInlineIR!(ir, byte16, byte16, byte16)(cast(byte16)a, cast(byte16)b);
449     }
450     else
451     {
452         byte16 sa = cast(byte16)a;
453         byte16 sb = cast(byte16)b;
454         byte16 sr = void;
455         foreach(i; 0..16)
456         {
            sr.ptr[i] = cast(ubyte)( (cast(ubyte)(sa.array[i]) + cast(ubyte)(sb.array[i]) + 1) >> 1 );
458         }
459         return cast(int4)sr;
460     }
461 }
462 unittest
463 {
464     __m128i A = _mm_set1_epi8(31);
465     __m128i B = _mm_set1_epi8(64);
466     byte16 avg = cast(byte16)(_mm_avg_epu8(A, B));
467     foreach(i; 0..16)
468         assert(avg.array[i] == 48);
469 }
470 
471 /// Shift `a` left by `bytes` bytes while shifting in zeros.
472 alias _mm_bslli_si128 = _mm_slli_si128;
473 unittest
474 {
475     __m128i toShift = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
476     byte[16] exact =               [0, 0, 0, 0, 0, 0, 1, 2, 3, 4,  5,  6,  7,  8,  9, 10];
477     __m128i result = _mm_bslli_si128!5(toShift);
478     assert( (cast(byte16)result).array == exact);
479 }
480 
481 /// Shift `v` right by `bytes` bytes while shifting in zeros.
482 alias _mm_bsrli_si128 = _mm_srli_si128;
483 unittest
484 {
485     __m128i toShift = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
486     byte[16] exact =               [5, 6, 7, 8, 9,10,11,12,13,14, 15,  0,  0,  0,  0,  0];
487     __m128i result = _mm_bsrli_si128!5(toShift);
488     assert( (cast(byte16)result).array == exact);
489 }
490 
491 /// Cast vector of type `__m128d` to type `__m128`. 
492 /// Note: Also possible with a regular `cast(__m128)(a)`.
493 __m128 _mm_castpd_ps (__m128d a) pure @safe
494 {
495     return cast(__m128)a;
496 }
497 
498 /// Cast vector of type `__m128d` to type `__m128i`. 
499 /// Note: Also possible with a regular `cast(__m128i)(a)`.
500 __m128i _mm_castpd_si128 (__m128d a) pure @safe
501 {
502     return cast(__m128i)a;
503 }
504 
505 /// Cast vector of type `__m128` to type `__m128d`. 
506 /// Note: Also possible with a regular `cast(__m128d)(a)`.
507 __m128d _mm_castps_pd (__m128 a) pure @safe
508 {
509     return cast(__m128d)a;
510 }
511 
512 /// Cast vector of type `__m128` to type `__m128i`. 
513 /// Note: Also possible with a regular `cast(__m128i)(a)`.
514 __m128i _mm_castps_si128 (__m128 a) pure @safe
515 {
516     return cast(__m128i)a;
517 }
518 
519 /// Cast vector of type `__m128i` to type `__m128d`. 
520 /// Note: Also possible with a regular `cast(__m128d)(a)`.
521 __m128d _mm_castsi128_pd (__m128i a) pure @safe
522 {
523     return cast(__m128d)a;
524 }
525 
526 /// Cast vector of type `__m128i` to type `__m128`. 
527 /// Note: Also possible with a regular `cast(__m128)(a)`.
528 __m128 _mm_castsi128_ps (__m128i a) pure @safe
529 {
530     return cast(__m128)a;
531 }
532 
533 /// Invalidate and flush the cache line that contains `p` 
534 /// from all levels of the cache hierarchy.
535 void _mm_clflush (const(void)* p) @trusted
536 {
537     static if (GDC_with_SSE2)
538     {
539         __builtin_ia32_clflush(p);
540     }
541     else static if (LDC_with_SSE2)
542     {
543         __builtin_ia32_clflush(cast(void*)p);
544     }
545     else version(D_InlineAsm_X86)
546     {
547         asm pure nothrow @nogc @safe
548         {
549             mov EAX, p;
550             clflush [EAX];
551         }
552     }
553     else version(D_InlineAsm_X86_64)
554     {
555         asm pure nothrow @nogc @safe
556         {
557             mov RAX, p;
558             clflush [RAX];
559         }
560     }
561     else 
562     {
563         // Do nothing. Invalidating cacheline does
564         // not affect correctness.
565     }
566 }
567 unittest
568 {
569     ubyte[64] cacheline;
570     _mm_clflush(cacheline.ptr);
571 }
572 
573 /// Compare packed 16-bit integers in `a` and `b` for equality.
574 __m128i _mm_cmpeq_epi16 (__m128i a, __m128i b) pure @safe
575 {
576     static if (GDC_with_SSE2)
577     {
578         return cast(__m128i) __builtin_ia32_pcmpeqw128(cast(short8)a, cast(short8)b);
579     }
580     else
581     {
582         return cast(__m128i) equalMask!short8(cast(short8)a, cast(short8)b);
583     }
584 }
585 unittest
586 {
587     short8   A = [-3, -2, -1,  0,  0,  1,  2,  3];
588     short8   B = [ 4,  3,  2,  1,  0, -1, -2, -3];
589     short[8] E = [ 0,  0,  0,  0, -1,  0,  0,  0];
590     short8   R = cast(short8)(_mm_cmpeq_epi16(cast(__m128i)A, cast(__m128i)B));
591     assert(R.array == E);
592 }
593 
594 /// Compare packed 32-bit integers in `a` and `b` for equality.
595 __m128i _mm_cmpeq_epi32 (__m128i a, __m128i b) pure @safe
596 {
597     static if (GDC_with_SSE2)
598     {
599         return __builtin_ia32_pcmpeqd128(a, b);
600     }
601     else
602     {
603         return equalMask!__m128i(a, b);
604     }
605 }
606 unittest
607 {
608     int4   A = [-3, -2, -1,  0];
609     int4   B = [ 4, -2,  2,  0];
610     int[4] E = [ 0, -1,  0, -1];
611     int4   R = cast(int4)(_mm_cmpeq_epi32(A, B));
612     assert(R.array == E);
613 }
614 
615 /// Compare packed 8-bit integers in `a` and `b` for equality.
616 __m128i _mm_cmpeq_epi8 (__m128i a, __m128i b) pure @safe
617 {
618     static if (GDC_with_SSE2)
619     {
620         return cast(__m128i) __builtin_ia32_pcmpeqb128(cast(ubyte16)a, cast(ubyte16)b);
621     }
622     else
623     {
624         return cast(__m128i) equalMask!byte16(cast(byte16)a, cast(byte16)b);
625     }
626 }
627 unittest
628 {
629     __m128i A = _mm_setr_epi8(1, 2, 3, 1, 2, 1, 1, 2, 3, 2, 1, 0, 0, 1, 2, 1);
630     __m128i B = _mm_setr_epi8(2, 2, 1, 2, 3, 1, 2, 3, 2, 1, 0, 0, 1, 2, 1, 1);
631     byte16 C = cast(byte16) _mm_cmpeq_epi8(A, B);
632     byte[16] correct =       [0,-1, 0, 0, 0,-1, 0, 0, 0, 0, 0,-1, 0, 0, 0, -1];
633     assert(C.array == correct);
634 }
635 
636 /// Compare packed double-precision (64-bit) floating-point elements 
637 /// in `a` and `b` for equality.
638 __m128d _mm_cmpeq_pd (__m128d a, __m128d b) pure @safe
639 {
640     static if (GDC_with_SSE2)
641     {
642         return __builtin_ia32_cmpeqpd(a, b);
643     }
644     else
645     {
646         return cast(__m128d) cmppd!(FPComparison.oeq)(a, b);
647     }
648 }
649 
650 /// Compare the lower double-precision (64-bit) floating-point elements
651 /// in `a` and `b` for equality, store the result in the lower element,
652 /// and copy the upper element from `a`.
653 __m128d _mm_cmpeq_sd (__m128d a, __m128d b) pure @safe
654 {
655     static if (GDC_with_SSE2)
656     {
657         return __builtin_ia32_cmpeqsd(a, b);
658     }
659     else
660     {
661         return cast(__m128d) cmpsd!(FPComparison.oeq)(a, b);
662     }
663 }
664 
665 /// Compare packed double-precision (64-bit) floating-point elements 
666 /// in `a` and `b` for greater-than-or-equal.
667 __m128d _mm_cmpge_pd (__m128d a, __m128d b) pure @safe
668 {
669     static if (GDC_with_SSE2)
670     {
671         return __builtin_ia32_cmpgepd(a, b);
672     }
673     else
674     {
675         return cast(__m128d) cmppd!(FPComparison.oge)(a, b);
676     }
677 }
678 
679 /// Compare the lower double-precision (64-bit) floating-point elements 
680 /// in `a` and `b` for greater-than-or-equal, store the result in the 
681 /// lower element, and copy the upper element from `a`.
682 __m128d _mm_cmpge_sd (__m128d a, __m128d b) pure @safe
683 {
684     // Note: There is no __builtin_ia32_cmpgesd builtin.
685     static if (GDC_with_SSE2)
686     {
687         return __builtin_ia32_cmpnltsd(b, a);
688     }
689     else
690     {
691         return cast(__m128d) cmpsd!(FPComparison.oge)(a, b);
692     }
693 }
694 
695 /// Compare packed 16-bit integers in `a` and `b` for greater-than.
696 __m128i _mm_cmpgt_epi16 (__m128i a, __m128i b) pure @safe
697 {
698     static if (GDC_with_SSE2)
699     {
700         return cast(__m128i) __builtin_ia32_pcmpgtw128(cast(short8)a, cast(short8)b);
701     }
702     else
703     {
704         return cast(__m128i) greaterMask!short8(cast(short8)a, cast(short8)b);
705     }
706 }
707 unittest
708 {
709     short8   A = [-3, -2, -1,  0,  0,  1,  2,  3];
710     short8   B = [ 4,  3,  2,  1,  0, -1, -2, -3];
711     short[8] E = [ 0,  0,  0,  0,  0, -1, -1, -1];
712     short8   R = cast(short8)(_mm_cmpgt_epi16(cast(__m128i)A, cast(__m128i)B));
713     assert(R.array == E);
714 }
715 
716 /// Compare packed 32-bit integers in `a` and `b` for greater-than.
717 __m128i _mm_cmpgt_epi32 (__m128i a, __m128i b) pure @safe
718 {
719     static if (GDC_with_SSE2)
720     {
721         return __builtin_ia32_pcmpgtd128(a, b); 
722     }
723     else
724     {
725         return cast(__m128i)( greaterMask!int4(a, b));
726     }
727 }
728 unittest
729 {
730     int4   A = [-3,  2, -1,  0];
731     int4   B = [ 4, -2,  2,  0];
732     int[4] E = [ 0, -1,  0,  0];
733     int4   R = cast(int4)(_mm_cmpgt_epi32(A, B));
734     assert(R.array == E);
735 }
736 
737 /// Compare packed 8-bit integers in `a` and `b` for greater-than.
738 __m128i _mm_cmpgt_epi8 (__m128i a, __m128i b) pure @safe
739 {
740     static if (GDC_with_SSE2)
741     {
742         return cast(__m128i) __builtin_ia32_pcmpgtb128(cast(ubyte16)a, cast(ubyte16)b);
743     }
744     else
745     {
746         return cast(__m128i) greaterMask!byte16(cast(byte16)a, cast(byte16)b);
747     }
748 }
749 unittest
750 {
751     __m128i A = _mm_setr_epi8(1, 2, 3, 1, 2, 1, 1, 2, 3, 2, 1, 0, 0, 1, 2, 1);
752     __m128i B = _mm_setr_epi8(2, 2, 1, 2, 3, 1, 2, 3, 2, 1, 0, 0, 1, 2, 1, 1);
753     byte16 C = cast(byte16) _mm_cmpgt_epi8(A, B);
754     byte[16] correct =       [0, 0,-1, 0, 0, 0, 0, 0,-1,-1,-1, 0, 0, 0,-1, 0];
    assert(C.array == correct);
757 }
758 
759 /// Compare packed double-precision (64-bit) floating-point elements 
760 /// in `a` and `b` for greater-than.
761 __m128d _mm_cmpgt_pd (__m128d a, __m128d b) pure @safe
762 {
763     static if (GDC_with_SSE2)
764     {
765         return __builtin_ia32_cmpgtpd(a, b); 
766     }
767     else
768     {
769         return cast(__m128d) cmppd!(FPComparison.ogt)(a, b);
770     }
771 }
772 
773 /// Compare the lower double-precision (64-bit) floating-point elements 
774 /// in `a` and `b` for greater-than, store the result in the lower element,
775 /// and copy the upper element from `a`.
776 __m128d _mm_cmpgt_sd (__m128d a, __m128d b) pure @safe
777 {
778     // Note: There is no __builtin_ia32_cmpgtsd builtin.
779     static if (GDC_with_SSE2)
780     {
781         return __builtin_ia32_cmpnlesd(b, a);
782     }
783     else
784     {
785         return cast(__m128d) cmpsd!(FPComparison.ogt)(a, b);
786     }
787 }
788 
789 /// Compare packed double-precision (64-bit) floating-point elements 
790 /// in `a` and `b` for less-than-or-equal.
791 __m128d _mm_cmple_pd (__m128d a, __m128d b) pure @safe
792 {
793     static if (GDC_with_SSE2)
794     {
795         return __builtin_ia32_cmplepd(a, b); 
796     }
797     else
798     {
799         return cast(__m128d) cmppd!(FPComparison.ole)(a, b);
800     }
801 }
802 
803 /// Compare the lower double-precision (64-bit) floating-point elements 
804 /// in `a` and `b` for less-than-or-equal, store the result in the 
805 /// lower element, and copy the upper element from `a`.
806 __m128d _mm_cmple_sd (__m128d a, __m128d b) pure @safe
807 {
808     static if (GDC_with_SSE2)
809     {
810         return __builtin_ia32_cmplesd(a, b); 
811     }
812     else
813     {
814         return cast(__m128d) cmpsd!(FPComparison.ole)(a, b);
815     }
816 }
817 
818 /// Compare packed 16-bit integers in `a` and `b` for less-than.
819 __m128i _mm_cmplt_epi16 (__m128i a, __m128i b) pure @safe
820 {
821     return _mm_cmpgt_epi16(b, a);
822 }
823 
824 /// Compare packed 32-bit integers in `a` and `b` for less-than.
825 __m128i _mm_cmplt_epi32 (__m128i a, __m128i b) pure @safe
826 {
827     return _mm_cmpgt_epi32(b, a);
828 }
829 
830 /// Compare packed 8-bit integers in `a` and `b` for less-than.
831 __m128i _mm_cmplt_epi8 (__m128i a, __m128i b) pure @safe
832 {
833     return _mm_cmpgt_epi8(b, a);
834 }
835 
836 /// Compare packed double-precision (64-bit) floating-point elements
837 /// in `a` and `b` for less-than.
838 __m128d _mm_cmplt_pd (__m128d a, __m128d b) pure @safe
839 {
840     static if (GDC_with_SSE2)
841     {
842         return __builtin_ia32_cmpltpd(a, b); 
843     }
844     else
845     {
846         return cast(__m128d) cmppd!(FPComparison.olt)(a, b);
847     }
848 }
849 
850 /// Compare the lower double-precision (64-bit) floating-point elements
851 /// in `a` and `b` for less-than, store the result in the lower 
852 /// element, and copy the upper element from `a`.
853 __m128d _mm_cmplt_sd (__m128d a, __m128d b) pure @safe
854 {
855     static if (GDC_with_SSE2)
856     {
857         return __builtin_ia32_cmpltsd(a, b); 
858     }
859     else
860     {
861         return cast(__m128d) cmpsd!(FPComparison.olt)(a, b);
862     }
863 }
864 
865 /// Compare packed double-precision (64-bit) floating-point elements
866 /// in `a` and `b` for not-equal.
867 __m128d _mm_cmpneq_pd (__m128d a, __m128d b) pure @safe
868 {
869     static if (GDC_with_SSE2)
870     {
871         return __builtin_ia32_cmpneqpd(a, b); 
872     }
873     else
874     {
875         return cast(__m128d) cmppd!(FPComparison.une)(a, b);
876     }
877 }
878 
879 /// Compare the lower double-precision (64-bit) floating-point elements
880 /// in `a` and `b` for not-equal, store the result in the lower 
881 /// element, and copy the upper element from `a`.
882 __m128d _mm_cmpneq_sd (__m128d a, __m128d b) pure @safe
883 {
884     static if (GDC_with_SSE2)
885     {
886         return __builtin_ia32_cmpneqsd(a, b); 
887     }
888     else
889     {
890         return cast(__m128d) cmpsd!(FPComparison.une)(a, b);
891     }
892 }
893 
894 /// Compare packed double-precision (64-bit) floating-point elements 
895 /// in `a` and `b` for not-greater-than-or-equal.
896 __m128d _mm_cmpnge_pd (__m128d a, __m128d b) pure @safe
897 {
898     static if (GDC_with_SSE2)
899     {
900         return __builtin_ia32_cmpngepd(a, b); 
901     }
902     else
903     {
904         return cast(__m128d) cmppd!(FPComparison.ult)(a, b);
905     }
906 }
907 
908 /// Compare the lower double-precision (64-bit) floating-point elements 
909 /// in `a` and `b` for not-greater-than-or-equal, store the result in 
910 /// the lower element, and copy the upper element from `a`.
911 __m128d _mm_cmpnge_sd (__m128d a, __m128d b) pure @safe
912 {
913     // Note: There is no __builtin_ia32_cmpngesd builtin.
914     static if (GDC_with_SSE2)
915     {
916         return __builtin_ia32_cmpltsd(b, a); 
917     }
918     else
919     {
920         return cast(__m128d) cmpsd!(FPComparison.ult)(a, b);
921     }
922 }
923 
924 /// Compare packed double-precision (64-bit) floating-point elements 
925 /// in `a` and `b` for not-greater-than.
926 __m128d _mm_cmpngt_pd (__m128d a, __m128d b) pure @safe
927 {
928     static if (GDC_with_SSE2)
929     {
930         return __builtin_ia32_cmpngtpd(a, b);
931     }
932     else
933     {
934         return cast(__m128d) cmppd!(FPComparison.ule)(a, b);
935     }
936 }
937 
938 /// Compare the lower double-precision (64-bit) floating-point elements 
939 /// in `a` and `b` for not-greater-than, store the result in the 
940 /// lower element, and copy the upper element from `a`.
941 __m128d _mm_cmpngt_sd (__m128d a, __m128d b) pure @safe
942 {
943     // Note: There is no __builtin_ia32_cmpngtsd builtin.
944     static if (GDC_with_SSE2)
945     {
946         return __builtin_ia32_cmplesd(b, a);
947     }
948     else
949     {
950         return cast(__m128d) cmpsd!(FPComparison.ule)(a, b);
951     }
952 }
953 
954 /// Compare packed double-precision (64-bit) floating-point elements 
955 /// in `a` and `b` for not-less-than-or-equal.
956 __m128d _mm_cmpnle_pd (__m128d a, __m128d b) pure @safe
957 {
958     static if (GDC_with_SSE2)
959     {
960         return __builtin_ia32_cmpnlepd(a, b);
961     }
962     else
963     {
964         return cast(__m128d) cmppd!(FPComparison.ugt)(a, b);
965     }
966 }
967 
968 /// Compare the lower double-precision (64-bit) floating-point elements 
969 /// in `a` and `b` for not-less-than-or-equal, store the result in the 
970 /// lower element, and copy the upper element from `a`.
971 __m128d _mm_cmpnle_sd (__m128d a, __m128d b) pure @safe
972 {
973     static if (GDC_with_SSE2)
974     {
975         return __builtin_ia32_cmpnlesd(a, b);
976     }
977     else
978     {
979         return cast(__m128d) cmpsd!(FPComparison.ugt)(a, b);
980     }
981 }
982  
983 /// Compare packed double-precision (64-bit) floating-point elements 
984 /// in `a` and `b` for not-less-than.
985 __m128d _mm_cmpnlt_pd (__m128d a, __m128d b) pure @safe
986 {
987     static if (GDC_with_SSE2)
988     {
989         return __builtin_ia32_cmpnltpd(a, b);
990     }
991     else
992     {
993         return cast(__m128d) cmppd!(FPComparison.uge)(a, b);
994     }
995 }
996 
997 /// Compare the lower double-precision (64-bit) floating-point elements 
998 /// in `a` and `b` for not-less-than, store the result in the lower 
999 /// element, and copy the upper element from `a`.
1000 __m128d _mm_cmpnlt_sd (__m128d a, __m128d b) pure @safe
1001 {
1002     static if (GDC_with_SSE2)
1003     {
1004         return __builtin_ia32_cmpnltsd(a, b);
1005     }
1006     else
1007     {
1008         return cast(__m128d) cmpsd!(FPComparison.uge)(a, b);
1009     }
1010 }
1011 
1012 /// Compare packed double-precision (64-bit) floating-point elements 
1013 /// in `a` and `b` to see if neither is NaN.
1014 __m128d _mm_cmpord_pd (__m128d a, __m128d b) pure @safe
1015 {
1016     static if (GDC_with_SSE2)
1017     {
1018         return __builtin_ia32_cmpordpd(a, b);
1019     }
1020     else
1021     {
1022         return cast(__m128d) cmppd!(FPComparison.ord)(a, b);
1023     }
1024 }
1025 
1026 /// Compare the lower double-precision (64-bit) floating-point elements 
1027 /// in `a` and `b` to see if neither is NaN, store the result in the 
1028 /// lower element, and copy the upper element from `a` to the upper element.
1029 __m128d _mm_cmpord_sd (__m128d a, __m128d b) pure @safe
1030 {
1031     static if (GDC_with_SSE2)
1032     {
1033         return __builtin_ia32_cmpordsd(a, b);
1034     }
1035     else
1036     {
1037         return cast(__m128d) cmpsd!(FPComparison.ord)(a, b);
1038     }
1039 }
1040 
1041 /// Compare packed double-precision (64-bit) floating-point elements 
1042 /// in `a` and `b` to see if either is NaN.
1043 __m128d _mm_cmpunord_pd (__m128d a, __m128d b) pure @safe
1044 {
1045     static if (GDC_with_SSE2)
1046     {
1047         return __builtin_ia32_cmpunordpd(a, b);
1048     }
1049     else
1050     {
1051         return cast(__m128d) cmppd!(FPComparison.uno)(a, b);
1052     }
1053 }
1054 
1055 /// Compare the lower double-precision (64-bit) floating-point elements 
1056 /// in `a` and `b` to see if either is NaN, store the result in the lower 
1057 /// element, and copy the upper element from `a` to the upper element.
1058 __m128d _mm_cmpunord_sd (__m128d a, __m128d b) pure @safe
1059 {
1060     static if (GDC_with_SSE2)
1061     {
1062         return __builtin_ia32_cmpunordsd(a, b);
1063     }
1064     else
1065     {
1066         return cast(__m128d) cmpsd!(FPComparison.uno)(a, b);
1067     }
1068 }
1069 
1070 /// Compare the lower double-precision (64-bit) floating-point element 
1071 /// in `a` and `b` for equality, and return the boolean result (0 or 1).
1072 int _mm_comieq_sd (__m128d a, __m128d b) pure @safe
1073 {
    // Note: For some of the _mm_comixx_sx intrinsics, the NaN semantics are not the same as
    // those of the comisd instruction: the intrinsic returns false for unordered operands instead.
    //
    // Actually C++ compilers disagree over the meaning of that instruction.
    // GCC will manage NaNs like the comisd instruction (return true if unordered), 
    // but ICC, clang and MSVC will deal with NaN like the Intel Intrinsics Guide says.
    // We choose to follow the majority; GCC seems to be buggy with NaNs.
1081     return a.array[0] == b.array[0];
1082 }
1083 unittest
1084 {
1085     assert(1 == _mm_comieq_sd(_mm_set_sd(78.0), _mm_set_sd(78.0)));
1086     assert(0 == _mm_comieq_sd(_mm_set_sd(78.0), _mm_set_sd(-78.0)));
1087     assert(0 == _mm_comieq_sd(_mm_set_sd(78.0), _mm_set_sd(double.nan)));
1088     assert(0 == _mm_comieq_sd(_mm_set_sd(double.nan), _mm_set_sd(-4.22)));
1089     assert(1 == _mm_comieq_sd(_mm_set_sd(0.0), _mm_set_sd(-0.0)));
1090 }
1091 
1092 /// Compare the lower double-precision (64-bit) floating-point element 
1093 /// in `a` and `b` for greater-than-or-equal, and return the boolean 
1094 /// result (0 or 1).
1095 int _mm_comige_sd (__m128d a, __m128d b) pure @safe
1096 {
1097     return a.array[0] >= b.array[0];
1098 }
1099 unittest
1100 {
1101     assert(1 == _mm_comige_sd(_mm_set_sd(78.0), _mm_set_sd(78.0)));
1102     assert(1 == _mm_comige_sd(_mm_set_sd(78.0), _mm_set_sd(-78.0)));
1103     assert(0 == _mm_comige_sd(_mm_set_sd(-78.0), _mm_set_sd(78.0)));
1104     assert(0 == _mm_comige_sd(_mm_set_sd(78.0), _mm_set_sd(double.nan)));
1105     assert(0 == _mm_comige_sd(_mm_set_sd(double.nan), _mm_set_sd(-4.22)));
1106     assert(1 == _mm_comige_sd(_mm_set_sd(-0.0), _mm_set_sd(0.0)));
1107 }
1108 
1109 /// Compare the lower double-precision (64-bit) floating-point element 
1110 /// in `a` and `b` for greater-than, and return the boolean result (0 or 1).
1111 int _mm_comigt_sd (__m128d a, __m128d b) pure @safe
1112 {
1113     return a.array[0] > b.array[0];
1114 }
1115 unittest
1116 {
1117     assert(0 == _mm_comigt_sd(_mm_set_sd(78.0), _mm_set_sd(78.0)));
1118     assert(1 == _mm_comigt_sd(_mm_set_sd(78.0), _mm_set_sd(-78.0)));
1119     assert(0 == _mm_comigt_sd(_mm_set_sd(78.0), _mm_set_sd(double.nan)));
1120     assert(0 == _mm_comigt_sd(_mm_set_sd(double.nan), _mm_set_sd(-4.22)));
1121     assert(0 == _mm_comigt_sd(_mm_set_sd(0.0), _mm_set_sd(-0.0)));
1122 }
1123 
1124 /// Compare the lower double-precision (64-bit) floating-point element 
/// in `a` and `b` for less-than-or-equal, and return the boolean result (0 or 1).
1126 int _mm_comile_sd (__m128d a, __m128d b) pure @safe
1127 {
1128     return a.array[0] <= b.array[0];
1129 }
1130 unittest
1131 {
1132     assert(1 == _mm_comile_sd(_mm_set_sd(78.0), _mm_set_sd(78.0)));
1133     assert(0 == _mm_comile_sd(_mm_set_sd(78.0), _mm_set_sd(-78.0)));
1134     assert(1 == _mm_comile_sd(_mm_set_sd(-78.0), _mm_set_sd(78.0)));
1135     assert(0 == _mm_comile_sd(_mm_set_sd(78.0), _mm_set_sd(double.nan)));
1136     assert(0 == _mm_comile_sd(_mm_set_sd(double.nan), _mm_set_sd(-4.22)));
1137     assert(1 == _mm_comile_sd(_mm_set_sd(0.0), _mm_set_sd(-0.0)));
1138 }
1139 
1140 /// Compare the lower double-precision (64-bit) floating-point element 
1141 /// in `a` and `b` for less-than, and return the boolean result (0 or 1).
1142 int _mm_comilt_sd (__m128d a, __m128d b) pure @safe
1143 {
1144     return a.array[0] < b.array[0];
1145 }
1146 unittest
1147 {
1148     assert(0 == _mm_comilt_sd(_mm_set_sd(78.0), _mm_set_sd(78.0)));
1149     assert(0 == _mm_comilt_sd(_mm_set_sd(78.0), _mm_set_sd(-78.0)));
1150     assert(1 == _mm_comilt_sd(_mm_set_sd(-78.0), _mm_set_sd(78.0)));
1151     assert(0 == _mm_comilt_sd(_mm_set_sd(78.0), _mm_set_sd(double.nan)));
1152     assert(0 == _mm_comilt_sd(_mm_set_sd(double.nan), _mm_set_sd(-4.22)));
1153     assert(0 == _mm_comilt_sd(_mm_set_sd(-0.0), _mm_set_sd(0.0)));
1154 }
1155 
1156 /// Compare the lower double-precision (64-bit) floating-point element
1157 /// in `a` and `b` for not-equal, and return the boolean result (0 or 1).
1158 int _mm_comineq_sd (__m128d a, __m128d b) pure @safe
1159 {
1160     return a.array[0] != b.array[0];
1161 }
1162 unittest
1163 {
1164     assert(0 == _mm_comineq_sd(_mm_set_sd(78.0), _mm_set_sd(78.0)));
1165     assert(1 == _mm_comineq_sd(_mm_set_sd(78.0), _mm_set_sd(-78.0)));
1166     assert(1 == _mm_comineq_sd(_mm_set_sd(78.0), _mm_set_sd(double.nan)));
1167     assert(1 == _mm_comineq_sd(_mm_set_sd(double.nan), _mm_set_sd(-4.22)));
1168     assert(0 == _mm_comineq_sd(_mm_set_sd(0.0), _mm_set_sd(-0.0)));
1169 }
1170 
1171 /// Convert packed 32-bit integers in `a` to packed double-precision (64-bit)
1172 /// floating-point elements.
__m128d _mm_cvtepi32_pd (__m128i a) pure @trusted
1174 {
1175     version(LDC)
1176     {
1177         // Generates cvtdq2pd since LDC 1.0, even without optimizations
1178         enum ir = `
1179             %v = shufflevector <4 x i32> %0,<4 x i32> %0, <2 x i32> <i32 0, i32 1>
1180             %r = sitofp <2 x i32> %v to <2 x double>
1181             ret <2 x double> %r`;
1182         return cast(__m128d) LDCInlineIR!(ir, __m128d, __m128i)(a);
1183     }
1184     else static if (GDC_with_SSE2)
1185     {
1186         return __builtin_ia32_cvtdq2pd(a);
1187     }
1188     else
1189     {
1190         double2 r = void;
1191         r.ptr[0] = a.array[0];
1192         r.ptr[1] = a.array[1];
1193         return r;
1194     }
1195 }
1196 unittest
1197 {
1198     __m128d A = _mm_cvtepi32_pd(_mm_set1_epi32(54));
1199     assert(A.array[0] == 54.0);
1200     assert(A.array[1] == 54.0);
1201 }
1202 
1203 /// Convert packed 32-bit integers in `a` to packed single-precision (32-bit) 
1204 /// floating-point elements.
1205 __m128 _mm_cvtepi32_ps(__m128i a) pure @trusted
1206 {
1207     static if (GDC_with_SSE2)
1208     {
1209         return __builtin_ia32_cvtdq2ps(a);
1210     }
1211     else
1212     {
1213         // x86: Generates cvtdq2ps since LDC 1.0.0 -O1
        // ARM: Generates scvtf.4s since LDC 1.8.0 -O2
1215         __m128 res;
1216         res.ptr[0] = cast(float)a.array[0];
1217         res.ptr[1] = cast(float)a.array[1];
1218         res.ptr[2] = cast(float)a.array[2];
1219         res.ptr[3] = cast(float)a.array[3];
1220         return res;
1221     }
1222 }
1223 unittest
1224 {
1225     __m128 a = _mm_cvtepi32_ps(_mm_setr_epi32(-1, 0, 1, 1000));
1226     assert(a.array == [-1.0f, 0.0f, 1.0f, 1000.0f]);
1227 }
1228 
1229 /// Convert packed double-precision (64-bit) floating-point elements 
1230 /// in `a` to packed 32-bit integers.
1231 __m128i _mm_cvtpd_epi32 (__m128d a) @trusted
1232 {
1234     static if (LDC_with_SSE2)
1235     {
1236         return __builtin_ia32_cvtpd2dq(a);
1237     }
1238     else static if (GDC_with_SSE2)
1239     {
1240         return __builtin_ia32_cvtpd2dq(a);
1241     }
1242     else static if (LDC_with_ARM64)
1243     {
1244         // Get current rounding mode.
1245         uint fpscr = arm_get_fpcr();
1246         long2 i;
1247         switch(fpscr & _MM_ROUND_MASK_ARM)
1248         {
1249             default:
1250             case _MM_ROUND_NEAREST_ARM:     i = vcvtnq_s64_f64(a); break;
1251             case _MM_ROUND_DOWN_ARM:        i = vcvtmq_s64_f64(a); break;
1252             case _MM_ROUND_UP_ARM:          i = vcvtpq_s64_f64(a); break;
1253             case _MM_ROUND_TOWARD_ZERO_ARM: i = vcvtzq_s64_f64(a); break;
1254         }
1255         int4 zero = 0;
1256         return cast(__m128i) shufflevector!(int4, 0, 2, 4, 6)(cast(int4)i, zero);
1257     }
1258     else
1259     {
1260         // PERF ARM32
1261         __m128i r = _mm_setzero_si128();
1262         r.ptr[0] = convertDoubleToInt32UsingMXCSR(a.array[0]);
1263         r.ptr[1] = convertDoubleToInt32UsingMXCSR(a.array[1]);
1264         return r;
1265     }
1266 }
1267 unittest
1268 {
1269     int4 A = _mm_cvtpd_epi32(_mm_set_pd(61.0, 55.0));
1270     assert(A.array[0] == 55 && A.array[1] == 61 && A.array[2] == 0 && A.array[3] == 0);
1271 }
1272 
1273 /// Convert packed double-precision (64-bit) floating-point elements in `v`
1274 /// to packed 32-bit integers
1275 __m64 _mm_cvtpd_pi32 (__m128d v) @safe
1276 {
1277     return to_m64(_mm_cvtpd_epi32(v));
1278 }
1279 unittest
1280 {
1281     int2 A = cast(int2) _mm_cvtpd_pi32(_mm_set_pd(61.0, 55.0));
1282     assert(A.array[0] == 55 && A.array[1] == 61);
1283 }
1284 
1285 /// Convert packed double-precision (64-bit) floating-point elements 
1286 /// in `a` to packed single-precision (32-bit) floating-point elements.
1287 __m128 _mm_cvtpd_ps (__m128d a) pure @trusted
1288 {
1289     static if (LDC_with_SSE2)
1290     {
1291         return __builtin_ia32_cvtpd2ps(a); // can't be done with IR unfortunately
1292     }
1293     else static if (GDC_with_SSE2)
1294     {
1295         return __builtin_ia32_cvtpd2ps(a);
1296     }
1297     else
1298     { 
1299         __m128 r = void;
1300         r.ptr[0] = a.array[0];
1301         r.ptr[1] = a.array[1];
1302         r.ptr[2] = 0;
1303         r.ptr[3] = 0;
1304         return r;
1305     }
1306 }
1307 unittest
1308 {
1309     __m128d A = _mm_set_pd(5.25, 4.0);
1310     __m128 B = _mm_cvtpd_ps(A);
1311     assert(B.array == [4.0f, 5.25f, 0, 0]);
1312 }
1313 
1314 /// Convert packed 32-bit integers in `v` to packed double-precision 
1315 /// (64-bit) floating-point elements.
1316 __m128d _mm_cvtpi32_pd (__m64 v) pure @safe
1317 {
1318     return _mm_cvtepi32_pd(to_m128i(v));
1319 }
1320 unittest
1321 {
1322     __m128d A = _mm_cvtpi32_pd(_mm_setr_pi32(4, -5));
1323     assert(A.array[0] == 4.0 && A.array[1] == -5.0);
1324 }
1325 
1326 /// Convert packed single-precision (32-bit) floating-point elements 
1327 /// in `a` to packed 32-bit integers
1328 __m128i _mm_cvtps_epi32 (__m128 a) @trusted
1329 {
1330     static if (LDC_with_SSE2)
1331     {
1332         return cast(__m128i) __builtin_ia32_cvtps2dq(a);
1333     }
1334     else static if (GDC_with_SSE2)
1335     {
1336         return __builtin_ia32_cvtps2dq(a);
1337     }
1338     else static if (LDC_with_ARM64)
1339     {
1340         // Get current rounding mode.
1341         uint fpscr = arm_get_fpcr();
1342         switch(fpscr & _MM_ROUND_MASK_ARM)
1343         {
1344             default:
1345             case _MM_ROUND_NEAREST_ARM:     return vcvtnq_s32_f32(a);
1346             case _MM_ROUND_DOWN_ARM:        return vcvtmq_s32_f32(a);
1347             case _MM_ROUND_UP_ARM:          return vcvtpq_s32_f32(a);
1348             case _MM_ROUND_TOWARD_ZERO_ARM: return vcvtzq_s32_f32(a);
1349         }
1350     }
1351     else
1352     {
1353         __m128i r = void;
1354         r.ptr[0] = convertFloatToInt32UsingMXCSR(a.array[0]);
1355         r.ptr[1] = convertFloatToInt32UsingMXCSR(a.array[1]);
1356         r.ptr[2] = convertFloatToInt32UsingMXCSR(a.array[2]);
1357         r.ptr[3] = convertFloatToInt32UsingMXCSR(a.array[3]);
1358         return r;
1359     }
1360 }
1361 unittest
1362 {
1363     // GDC bug #98607
1364     // https://gcc.gnu.org/bugzilla/show_bug.cgi?id=98607
1365     // GDC does not provide optimization barrier for rounding mode.
    // Worked around with different literals. This bug will likely only manifest in unittests.
1367     // GCC people provided no actual fix and instead say other compilers are buggy... when they aren't.
1368 
1369     uint savedRounding = _MM_GET_ROUNDING_MODE();
1370 
1371     _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST);
1372     __m128i A = _mm_cvtps_epi32(_mm_setr_ps(1.4f, -2.1f, 53.5f, -2.9f));
1373     assert(A.array == [1, -2, 54, -3]);
1374 
1375     _MM_SET_ROUNDING_MODE(_MM_ROUND_DOWN);
1376     A = _mm_cvtps_epi32(_mm_setr_ps(1.3f, -2.11f, 53.4f, -2.8f));
1377     assert(A.array == [1, -3, 53, -3]);
1378 
1379     _MM_SET_ROUNDING_MODE(_MM_ROUND_UP);
1380     A = _mm_cvtps_epi32(_mm_setr_ps(1.3f, -2.12f, 53.6f, -2.7f));
1381     assert(A.array == [2, -2, 54, -2]);
1382 
1383     _MM_SET_ROUNDING_MODE(_MM_ROUND_TOWARD_ZERO);
1384     A = _mm_cvtps_epi32(_mm_setr_ps(1.4f, -2.17f, 53.8f, -2.91f));
1385     assert(A.array == [1, -2, 53, -2]);
1386 
1387     _MM_SET_ROUNDING_MODE(savedRounding);
1388 }
1389 
1390 /// Convert packed single-precision (32-bit) floating-point elements 
1391 /// in `a` to packed double-precision (64-bit) floating-point elements.
1392 __m128d _mm_cvtps_pd (__m128 a) pure @trusted
1393 {
1394     version(LDC)
1395     {
1396         // Generates cvtps2pd since LDC 1.0 -O0
1397         enum ir = `
1398             %v = shufflevector <4 x float> %0,<4 x float> %0, <2 x i32> <i32 0, i32 1>
1399             %r = fpext <2 x float> %v to <2 x double>
1400             ret <2 x double> %r`;
1401         return cast(__m128d) LDCInlineIR!(ir, __m128d, __m128)(a);
1402     }
1403     else static if (GDC_with_SSE2)
1404     {
1405         return __builtin_ia32_cvtps2pd(a);
1406     }
1407     else
1408     {
1409         double2 r = void;
1410         r.ptr[0] = a.array[0];
1411         r.ptr[1] = a.array[1];
1412         return r;
1413     }
1414 }
1415 unittest
1416 {
1417     __m128d A = _mm_cvtps_pd(_mm_set1_ps(54.0f));
1418     assert(A.array[0] == 54.0);
1419     assert(A.array[1] == 54.0);
1420 }
1421 
1422 /// Copy the lower double-precision (64-bit) floating-point element of `a`.
1423 double _mm_cvtsd_f64 (__m128d a) pure @safe
1424 {
1425     return a.array[0];
1426 }
1427 
1428 /// Convert the lower double-precision (64-bit) floating-point element
1429 /// in `a` to a 32-bit integer.
1430 int _mm_cvtsd_si32 (__m128d a) @safe
1431 {
1432     static if (LDC_with_SSE2)
1433     {
1434         return __builtin_ia32_cvtsd2si(a);
1435     }
1436     else static if (GDC_with_SSE2)
1437     {
1438         return __builtin_ia32_cvtsd2si(a);
1439     }
1440     else
1441     {
1442         return convertDoubleToInt32UsingMXCSR(a[0]);
1443     }
1444 }
1445 unittest
1446 {
1447     assert(4 == _mm_cvtsd_si32(_mm_set1_pd(4.0)));
1448 }
1449 
1450 /// Convert the lower double-precision (64-bit) floating-point element in `a` to a 64-bit integer.
1451 long _mm_cvtsd_si64 (__m128d a) @trusted
1452 {
1453     version (LDC)
1454     {
1455         version (X86_64)
1456         {
1457             return __builtin_ia32_cvtsd2si64(a);
1458         }
1459         else
1460         {
1461             // Note: In 32-bit x86, there is no way to convert from float/double to 64-bit integer
            // using SSE instructions only. So the builtin doesn't exist for this arch.
1463             return convertDoubleToInt64UsingMXCSR(a[0]);
1464         }
1465     }
1466     else
1467     {
1468         return convertDoubleToInt64UsingMXCSR(a.array[0]);
1469     }
1470 }
1471 unittest
1472 {
1473     assert(-4 == _mm_cvtsd_si64(_mm_set1_pd(-4.0)));
1474 
1475     uint savedRounding = _MM_GET_ROUNDING_MODE();
1476 
1477     _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST);
1478     assert(-56468486186 == _mm_cvtsd_si64(_mm_set1_pd(-56468486186.49)));
1479 
1480     _MM_SET_ROUNDING_MODE(_MM_ROUND_DOWN);
1481     assert(-56468486187 == _mm_cvtsd_si64(_mm_set1_pd(-56468486186.1)));
1482 
1483     _MM_SET_ROUNDING_MODE(_MM_ROUND_UP);
1484     assert(56468486187 == _mm_cvtsd_si64(_mm_set1_pd(56468486186.1)));
1485 
1486     _MM_SET_ROUNDING_MODE(_MM_ROUND_TOWARD_ZERO);
1487     assert(-56468486186 == _mm_cvtsd_si64(_mm_set1_pd(-56468486186.9)));
1488 
1489     _MM_SET_ROUNDING_MODE(savedRounding);
1490 }
1491 
1492 deprecated("Use _mm_cvtsd_si64 instead") alias _mm_cvtsd_si64x = _mm_cvtsd_si64; ///
1493 
1494 /// Convert the lower double-precision (64-bit) floating-point element in `b` to a single-precision (32-bit) 
1495 /// floating-point element, store that in the lower element of result, and copy the upper 3 packed elements from `a`
1496 /// to the upper elements of result.
1497 __m128 _mm_cvtsd_ss (__m128 a, __m128d b) pure @trusted
1498 {
1499     static if (GDC_with_SSE2)
1500     {
1501         return __builtin_ia32_cvtsd2ss(a, b); 
1502     }
1503     else
1504     {
1505         // Generates cvtsd2ss since LDC 1.3 -O0
1506         a.ptr[0] = b.array[0];
1507         return a;
1508     }
1509 }
1510 unittest
1511 {
1512     __m128 R = _mm_cvtsd_ss(_mm_set1_ps(4.0f), _mm_set1_pd(3.0));
1513     assert(R.array == [3.0f, 4.0f, 4.0f, 4.0f]);
1514 }
1515 
1516 /// Get the lower 32-bit integer in `a`.
1517 int _mm_cvtsi128_si32 (__m128i a) pure @safe
1518 {
1519     return a.array[0];
1520 }
1521 
1522 /// Get the lower 64-bit integer in `a`.
1523 long _mm_cvtsi128_si64 (__m128i a) pure @safe
1524 {
1525     long2 la = cast(long2)a;
1526     return la.array[0];
1527 }
1528 deprecated("Use _mm_cvtsi128_si64 instead") alias _mm_cvtsi128_si64x = _mm_cvtsi128_si64;
1529 
1530 /// Convert the signed 32-bit integer `b` to a double-precision (64-bit) floating-point element, store that in the 
1531 /// lower element of result, and copy the upper element from `a` to the upper element of result.
1532 __m128d _mm_cvtsi32_sd(__m128d a, int b) pure @trusted
1533 {
1534     a.ptr[0] = cast(double)b;
1535     return a;
1536 }
1537 unittest
1538 {
1539     __m128d a = _mm_cvtsi32_sd(_mm_set1_pd(0.0f), 42);
1540     assert(a.array == [42.0, 0]);
1541 }
1542 
1543 /// Copy 32-bit integer `a` to the lower element of result, and zero the upper elements.
1544 __m128i _mm_cvtsi32_si128 (int a) pure @trusted
1545 {
1546     int4 r = [0, 0, 0, 0];
1547     r.ptr[0] = a;
1548     return r;
1549 }
1550 unittest
1551 {
1552     __m128i a = _mm_cvtsi32_si128(65);
1553     assert(a.array == [65, 0, 0, 0]);
1554 }
1555 
1556 /// Convert the signed 64-bit integer `b` to a double-precision (64-bit) floating-point element, store the result in 
/// the lower element of result, and copy the upper element from `a` to the upper element of result.
__m128d _mm_cvtsi64_sd(__m128d a, long b) pure @trusted
1560 {
1561     a.ptr[0] = cast(double)b;
1562     return a;
1563 }
1564 unittest
1565 {
1566     __m128d a = _mm_cvtsi64_sd(_mm_set1_pd(0.0f), 42);
1567     assert(a.array == [42.0, 0]);
1568 }
1569 
1570 /// Copy 64-bit integer `a` to the lower element of result, and zero the upper element.
1571 __m128i _mm_cvtsi64_si128 (long a) pure @trusted
1572 {
1573     long2 r = [0, 0];
1574     r.ptr[0] = a;
1575     return cast(__m128i)(r);
1576 }
1577 
1578 deprecated("Use _mm_cvtsi64_sd instead") alias _mm_cvtsi64x_sd = _mm_cvtsi64_sd; ///
1579 deprecated("Use _mm_cvtsi64_si128 instead") alias _mm_cvtsi64x_si128 = _mm_cvtsi64_si128; ///
1580 
1581 /// Convert the lower single-precision (32-bit) floating-point element in `b` to a double-precision (64-bit) 
1582 /// floating-point element, store that in the lower element of result, and copy the upper element from `a` to the upper 
/// element of result.
1584 double2 _mm_cvtss_sd(double2 a, float4 b) pure @trusted
1585 {
1586     a.ptr[0] = b.array[0];
1587     return a;
1588 }
1589 unittest
1590 {
1591     __m128d a = _mm_cvtss_sd(_mm_set1_pd(0.0f), _mm_set1_ps(42.0f));
1592     assert(a.array == [42.0, 0]);
1593 }
1594 
1595 /// Convert the lower single-precision (32-bit) floating-point element in `a` to a 64-bit integer with truncation.
1596 long _mm_cvttss_si64 (__m128 a) pure @safe
1597 {
1598     return cast(long)(a.array[0]); // Generates cvttss2si as expected
1599 }
1600 unittest
1601 {
1602     assert(1 == _mm_cvttss_si64(_mm_setr_ps(1.9f, 2.0f, 3.0f, 4.0f)));
1603 }
1604 
1605 /// Convert packed double-precision (64-bit) floating-point elements in `a` to packed 32-bit integers with truncation.
1606 /// Put zeroes in the upper elements of result.
1607 __m128i _mm_cvttpd_epi32 (__m128d a) pure @trusted
1608 {
1609     static if (LDC_with_SSE2)
1610     {
1611         return __builtin_ia32_cvttpd2dq(a);
1612     }
1613     else static if (GDC_with_SSE2)
1614     {
1615         return __builtin_ia32_cvttpd2dq(a);
1616     }
1617     else
1618     {
1619         // Note: doesn't generate cvttpd2dq as of LDC 1.13
1620         __m128i r;
1621         r.ptr[0] = cast(int)a.array[0];
1622         r.ptr[1] = cast(int)a.array[1];
1623         r.ptr[2] = 0;
1624         r.ptr[3] = 0;
1625         return r;
1626     }
1627 }
1628 unittest
1629 {
1630     __m128i R = _mm_cvttpd_epi32(_mm_setr_pd(-4.9, 45641.5f));
1631     assert(R.array == [-4, 45641, 0, 0]);
1632 }
1633 
1634 /// Convert packed double-precision (64-bit) floating-point elements in `v` 
1635 /// to packed 32-bit integers with truncation.
1636 __m64 _mm_cvttpd_pi32 (__m128d v) pure @safe
1637 {
1638     return to_m64(_mm_cvttpd_epi32(v));
1639 }
1640 unittest
1641 {
1642     int2 R = cast(int2) _mm_cvttpd_pi32(_mm_setr_pd(-4.9, 45641.7f));
1643     int[2] correct = [-4, 45641];
1644     assert(R.array == correct);
1645 }
1646 
1647 /// Convert packed single-precision (32-bit) floating-point elements in `a` to packed 32-bit integers with truncation.
1648 __m128i _mm_cvttps_epi32 (__m128 a) pure @trusted
1649 {
1650     // x86: Generates cvttps2dq since LDC 1.3 -O2
1651     // ARM64: generates fcvtze since LDC 1.8 -O2
1652     __m128i r;
1653     r.ptr[0] = cast(int)a.array[0];
1654     r.ptr[1] = cast(int)a.array[1];
1655     r.ptr[2] = cast(int)a.array[2];
1656     r.ptr[3] = cast(int)a.array[3];
1657     return r;
1658 }
1659 unittest
1660 {
1661     __m128i R = _mm_cvttps_epi32(_mm_setr_ps(-4.9, 45641.5f, 0.0f, 1.0f));
1662     assert(R.array == [-4, 45641, 0, 1]);
1663 }
1664 
1665 /// Convert the lower double-precision (64-bit) floating-point element in `a` to a 32-bit integer with truncation.
1666 int _mm_cvttsd_si32 (__m128d a)
1667 {
1668     // Generates cvttsd2si since LDC 1.3 -O0
1669     return cast(int)a.array[0];
1670 }
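// Added sanity test: truncation goes toward zero, for both signs.
unittest
{
    assert(-4 == _mm_cvttsd_si32(_mm_setr_pd(-4.9, 45641.5)));
    assert( 4 == _mm_cvttsd_si32(_mm_setr_pd( 4.9, 45641.5)));
}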
1671 
1672 /// Convert the lower double-precision (64-bit) floating-point element in `a` to a 64-bit integer with truncation.
1673 long _mm_cvttsd_si64 (__m128d a)
1674 {
1675     // Generates cvttsd2si since LDC 1.3 -O0
    // but on 32-bit x86 it instead generates a long sequence that resorts to the FPU
1677     return cast(long)a.array[0];
1678 }
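// Added sanity test: same truncation semantics as the 32-bit variant, on the low element.
unittest
{
    assert(-4 == _mm_cvttsd_si64(_mm_setr_pd(-4.9, 45641.5)));
}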
1679 
1680 deprecated("Use _mm_cvttsd_si64 instead") alias _mm_cvttsd_si64x = _mm_cvttsd_si64; ///
1681 
1682 /// Divide packed double-precision (64-bit) floating-point elements in `a` by packed elements in `b`.
1683 __m128d _mm_div_pd(__m128d a, __m128d b) pure @safe
1684 {
1685     return a / b;
1686 }
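// Added sanity test: element-wise division, exact for these operands.
unittest
{
    __m128d a = [6.0, -2.0];
    __m128d b = [3.0,  2.0];
    a = _mm_div_pd(a, b);
    assert(a.array == [2.0, -1.0]);
}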
1687 
/// Divide the lower double-precision (64-bit) floating-point element in `a` by the lower element of `b`, store the 
/// result in the lower element of result, and copy the upper element from `a` to the upper element of result.
__m128d _mm_div_sd(__m128d a, __m128d b) pure @trusted
1689 {
1690     static if (GDC_with_SSE2)
1691     {
1692         return __builtin_ia32_divsd(a, b);
1693     }
1694     else version(DigitalMars)
1695     {
1696         // Work-around for https://issues.dlang.org/show_bug.cgi?id=19599
1697         // Note that this is unneeded since DMD >= 2.094.0 at least, haven't investigated again
1698         asm pure nothrow @nogc @trusted { nop;}
1699         a.array[0] = a.array[0] / b.array[0];
1700         return a;
1701     }
1702     else
1703     {
1704         a.ptr[0] /= b.array[0];
1705         return a;
1706     }
1707 }
1708 unittest
1709 {
1710     __m128d a = [2.0, 4.5];
1711     a = _mm_div_sd(a, a);
1712     assert(a.array == [1.0, 4.5]);
1713 }
1714 
1715 /// Extract a 16-bit integer from `v`, selected with `index`.
/// Warning: the returned value is zero-extended to 32 bits.
1717 int _mm_extract_epi16(__m128i v, int index) pure @safe
1718 {
1719     short8 r = cast(short8)v;
1720     return cast(ushort)(r.array[index & 7]);
1721 }
1722 unittest
1723 {
1724     __m128i A = _mm_set_epi16(7, 6, 5, 4, 3, 2, 1, -1);
1725     assert(_mm_extract_epi16(A, 6) == 6);
1726     assert(_mm_extract_epi16(A, 0) == 65535);
1727     assert(_mm_extract_epi16(A, 5 + 8) == 5);
1728 }
1729 
1730 /// Copy `v`, and insert the 16-bit integer `i` at the location specified by `index`.
1731 __m128i _mm_insert_epi16 (__m128i v, int i, int index) @trusted
1732 {
1733     short8 r = cast(short8)v;
1734     r.ptr[index & 7] = cast(short)i;
1735     return cast(__m128i)r;
1736 }
1737 unittest
1738 {
1739     __m128i A = _mm_set_epi16(7, 6, 5, 4, 3, 2, 1, 0);
1740     short8 R = cast(short8) _mm_insert_epi16(A, 42, 6);
1741     short[8] correct = [0, 1, 2, 3, 4, 5, 42, 7];
1742     assert(R.array == correct);
1743 }
1744 
/// Perform a serializing operation on all load-from-memory instructions that were issued prior to this instruction. 
/// Guarantees that every load instruction that precedes, in program order, the load fence instruction is globally 
/// visible before any load instruction which follows the fence in program order.
void _mm_lfence() @trusted
1747 {
1748     version(GNU)
1749     {
1751         static if (GDC_with_SSE2)
1752         {
1753             __builtin_ia32_lfence();
1754         }
1755         else version(X86)
1756         {
1757             asm pure nothrow @nogc @trusted
1758             {
1759                 "lfence;\n" : : : ;
1760             }
1761         }
1762         else
1763             static assert(false);
1764     }
1765     else static if (LDC_with_SSE2)
1766     {
1767         __builtin_ia32_lfence();
1768     }
1769     else static if (DMD_with_asm)
1770     {
1771         asm nothrow @nogc pure @safe
1772         {
1773             lfence;
1774         }
1775     }
1776     else version(LDC)
1777     {
1778         llvm_memory_fence(); // PERF actually generates mfence
1779     }
1780     else
1781         static assert(false);
1782 }
1783 unittest
1784 {
1785     _mm_lfence();
1786 }
1787 
1788 /// Load 128-bits (composed of 2 packed double-precision (64-bit) floating-point elements) from memory.
1789 /// `mem_addr` must be aligned on a 16-byte boundary or a general-protection exception may be generated.
1790 __m128d _mm_load_pd (const(double) * mem_addr) pure
1791 {
1792     __m128d* aligned = cast(__m128d*)mem_addr;
1793     return *aligned;
1794 }
1795 unittest
1796 {
1797     align(16) double[2] S = [-5.0, 7.0];
1798     __m128d R = _mm_load_pd(S.ptr);
1799     assert(R.array == S);
1800 }
1801 
1802 /// Load a double-precision (64-bit) floating-point element from memory into both elements of dst.
1803 /// `mem_addr` does not need to be aligned on any particular boundary.
1804 __m128d _mm_load_pd1 (const(double)* mem_addr) pure
1805 {
1806     double m = *mem_addr;
1807     __m128d r;
1808     r.ptr[0] = m;
1809     r.ptr[1] = m;
1810     return r;
1811 }
1812 unittest
1813 {
1814     double what = 4;
1815     __m128d R = _mm_load_pd1(&what);
1816     double[2] correct = [4.0, 4];
1817     assert(R.array == correct);
1818 }
1819 
1820 /// Load a double-precision (64-bit) floating-point element from memory into the lower of result, and zero the upper 
1821 /// element. `mem_addr` does not need to be aligned on any particular boundary.
1822 __m128d _mm_load_sd (const(double)* mem_addr) pure @trusted
1823 {
1824     double2 r = [0, 0];
1825     r.ptr[0] = *mem_addr;
1826     return r;
1827 }
1828 unittest
1829 {
1830     double x = -42;
1831     __m128d a = _mm_load_sd(&x);
1832     assert(a.array == [-42.0, 0.0]);
1833 }
1834 
1835 /// Load 128-bits of integer data from memory into dst. 
1836 /// `mem_addr` must be aligned on a 16-byte boundary or a general-protection exception may be generated.
__m128i _mm_load_si128 (const(__m128i)* mem_addr) pure @trusted // TODO: shouldn't be trusted because of alignment, Issue #62
1838 {
1839     return *mem_addr;
1840 }
1841 unittest
1842 {
1843     align(16) int[4] correct = [-1, 2, 3, 4];
1844     int4 A = cast(int4) _mm_load_si128(cast(__m128i*) correct.ptr);
1845     assert(A.array == correct);
1846 }
1847 
1848 alias _mm_load1_pd = _mm_load_pd1; ///
1849 
1850 /// Load a double-precision (64-bit) floating-point element from memory into the upper element of result, and copy the 
1851 /// lower element from `a` to result. `mem_addr` does not need to be aligned on any particular boundary.
1852 __m128d _mm_loadh_pd (__m128d a, const(double)* mem_addr) pure @trusted
1853 {
1854     a.ptr[1] = *mem_addr;
1855     return a;
1856 }
1857 unittest
1858 {
1859     double A = 7.0;
1860     __m128d B = _mm_setr_pd(4.0, -5.0);
1861     __m128d R = _mm_loadh_pd(B, &A);
1862     double[2] correct = [ 4.0, 7.0 ];
1863     assert(R.array == correct);
1864 }
1865 
1866 /// Load 64-bit integer from memory into the first element of result. Zero out the other.
// Note: strange signature since the memory doesn't have to be aligned (Issue #60)
1868 __m128i _mm_loadl_epi64 (const(__m128i)* mem_addr) pure @trusted // TODO signature
1869 {
1870     auto pLong = cast(const(long)*)mem_addr;
1871     long2 r = [0, 0];
1872     r.ptr[0] = *pLong;
1873     return cast(__m128i)(r);
1874 }
1875 unittest
1876 {
1877     long A = 0x7878787870707070;
1878     long2 R = cast(long2) _mm_loadl_epi64(cast(__m128i*)&A);
1879     long[2] correct = [0x7878787870707070, 0];
1880     assert(R.array == correct);
1881 }
1882 
1883 /// Load a double-precision (64-bit) floating-point element from memory into the lower element of result, and copy the 
1884 /// upper element from `a` to result. mem_addr does not need to be aligned on any particular boundary.
1885 __m128d _mm_loadl_pd (__m128d a, const(double)* mem_addr) pure @trusted
1886 {
1887     a.ptr[0] = *mem_addr;
1888     return a;
1889 }
1890 unittest
1891 {
1892     double A = 7.0;
1893     __m128d B = _mm_setr_pd(4.0, -5.0);
1894     __m128d R = _mm_loadl_pd(B, &A);
1895     double[2] correct = [ 7.0, -5.0 ];
1896     assert(R.array == correct);
1897 }
1898 
1899 /// Load 2 double-precision (64-bit) floating-point elements from memory into result in reverse order. 
1900 /// `mem_addr` must be aligned on a 16-byte boundary or a general-protection exception may be generated.
1901 __m128d _mm_loadr_pd (const(double)* mem_addr) pure @trusted
1902 {
1903     __m128d a = *cast(__m128d*)(mem_addr);
1904     __m128d r;
1905     r.ptr[0] = a.array[1];
1906     r.ptr[1] = a.array[0];
1907     return r;
1908 }
1909 unittest
1910 {
1911     align(16) double[2] A = [56.0, -74.0];
1912     __m128d R = _mm_loadr_pd(A.ptr);
1913     double[2] correct = [-74.0, 56.0];
1914     assert(R.array == correct);
1915 }
1916 
1917 /// Load 128-bits (composed of 2 packed double-precision (64-bit) floating-point elements) from memory. 
1918 /// `mem_addr` does not need to be aligned on any particular boundary.
1919 __m128d _mm_loadu_pd (const(double)* mem_addr) pure @trusted
1920 {
1921     static if (GDC_with_SSE2)
1922     {
1923         return __builtin_ia32_loadupd(mem_addr); 
1924     }
1925     else version(LDC)
1926     {
1927         return loadUnaligned!(double2)(mem_addr);
1928     }
1929     else version(DigitalMars)
1930     {
1931         static if (DMD_with_DSIMD)
1932         {
1933             return cast(__m128d)__simd(XMM.LODUPD, *mem_addr);
1934         }
1935         else static if (SSESizedVectorsAreEmulated)
1936         {
            // Since this vector is emulated, it doesn't have alignment constraints
1938             // and as such we can just cast it.
1939             return *cast(__m128d*)(mem_addr);
1940         }
1941         else
1942         {
1943             __m128d result;
1944             result.ptr[0] = mem_addr[0];
1945             result.ptr[1] = mem_addr[1];
1946             return result;
1947         }
1948     }
1949     else
1950     {
1951         __m128d result;
1952         result.ptr[0] = mem_addr[0];
1953         result.ptr[1] = mem_addr[1];
1954         return result;
1955     }
1956 }
1957 unittest
1958 {
1959     double[2] A = [56.0, -75.0];
1960     __m128d R = _mm_loadu_pd(A.ptr);
1961     double[2] correct = [56.0, -75.0];
1962     assert(R.array == correct);
1963 }
1964 
1965 /// Load 128-bits of integer data from memory. `mem_addr` does not need to be aligned on any particular boundary.
1966 __m128i _mm_loadu_si128 (const(__m128i)* mem_addr) pure @trusted
1967 {
1968     static if (GDC_with_SSE2)
1969     {
1970         return cast(__m128i) __builtin_ia32_loaddqu(cast(const(char*))mem_addr);
1971     }
1972     else
1973     {
1974         return loadUnaligned!(__m128i)(cast(int*)mem_addr);
1975     }
1976 }
1977 unittest
1978 {
1979     align(16) int[4] correct = [-1, 2, -3, 4];
1980     int4 A = cast(int4) _mm_loadu_si128(cast(__m128i*) correct.ptr);
1981     assert(A.array == correct);
1982 }
1983 
1984 /// Load unaligned 32-bit integer from memory into the first element of result.
1985 __m128i _mm_loadu_si32 (const(void)* mem_addr) pure @trusted
1986 {
1987     int r = *cast(int*)(mem_addr);
1988     int4 result = [0, 0, 0, 0];
1989     result.ptr[0] = r;
1990     return result;
1991 }
1992 unittest
1993 {
1994     int r = 42;
1995     __m128i A = _mm_loadu_si32(&r);
1996     int[4] correct = [42, 0, 0, 0];
1997     assert(A.array == correct);
1998 }
1999 
2000 /// Multiply packed signed 16-bit integers in `a` and `b`, producing intermediate
2001 /// signed 32-bit integers. Horizontally add adjacent pairs of intermediate 32-bit integers,
2002 /// and pack the results in destination.
2003 __m128i _mm_madd_epi16 (__m128i a, __m128i b) pure @trusted
2004 {
2005     static if (GDC_with_SSE2)
2006     {
2007         return cast(__m128i) __builtin_ia32_pmaddwd128(cast(short8)a, cast(short8)b);
2008     }
2009     else static if (LDC_with_SSE2)
2010     {
2011         return cast(__m128i) __builtin_ia32_pmaddwd128(cast(short8)a, cast(short8)b);
2012     }
2013     else static if (LDC_with_ARM64)
2014     {
2015         int4 pl = vmull_s16(vget_low_s16(cast(short8)a), vget_low_s16(cast(short8)b));
2016         int4 ph = vmull_s16(vget_high_s16(cast(short8)a), vget_high_s16(cast(short8)b));
2017         int2 rl = vpadd_s32(vget_low_s32(pl), vget_high_s32(pl));
2018         int2 rh = vpadd_s32(vget_low_s32(ph), vget_high_s32(ph));
2019         return vcombine_s32(rl, rh);
2020     }
2021     else
2022     {
2023         short8 sa = cast(short8)a;
2024         short8 sb = cast(short8)b;
2025         int4 r;
2026         foreach(i; 0..4)
2027         {
2028             r.ptr[i] = sa.array[2*i] * sb.array[2*i] + sa.array[2*i+1] * sb.array[2*i+1];
2029         }
2030         return r;
2031     }
2032 }
2033 unittest
2034 {
2035     short8 A = [0, 1, 2, 3, -32768, -32768, 32767, 32767];
2036     short8 B = [0, 1, 2, 3, -32768, -32768, 32767, 32767];
2037     int4 R = _mm_madd_epi16(cast(__m128i)A, cast(__m128i)B);
2038     int[4] correct = [1, 13, -2147483648, 2*32767*32767];
2039     assert(R.array == correct);
2040 }
2041 
2042 /// Conditionally store 8-bit integer elements from `a` into memory using `mask`
2043 /// (elements are not stored when the highest bit is not set in the corresponding element)
2044 /// and a non-temporal memory hint. `mem_addr` does not need to be aligned on any particular
2045 /// boundary.
2046 void _mm_maskmoveu_si128 (__m128i a, __m128i mask, void* mem_addr) @trusted
2047 {
2048     static if (GDC_with_SSE2)
2049     {    
2050         return __builtin_ia32_maskmovdqu(cast(ubyte16)a, cast(ubyte16)mask, cast(char*)mem_addr);
2051     }
2052     else static if (LDC_with_SSE2)
2053     {
2054         return __builtin_ia32_maskmovdqu(cast(byte16)a, cast(byte16)mask, cast(char*)mem_addr);
2055     }
2056     else static if (LDC_with_ARM64)
2057     {
2058         // PERF: catastrophic on ARM32
2059         byte16 bmask  = cast(byte16)mask;
2060         byte16 shift = 7;
2061         bmask = bmask >> shift; // sign-extend to have a 0xff or 0x00 mask
2062         mask = cast(__m128i) bmask;
2063         __m128i dest = loadUnaligned!__m128i(cast(int*)mem_addr);
2064         dest = (a & mask) | (dest & ~mask);
2065         storeUnaligned!__m128i(dest, cast(int*)mem_addr);
2066     }
2067     else
2068     {
2069         byte16 b = cast(byte16)a;
2070         byte16 m = cast(byte16)mask;
2071         byte* dest = cast(byte*)(mem_addr);
2072         foreach(j; 0..16)
2073         {
2074             if (m.array[j] & 128)
2075             {
2076                 dest[j] = b.array[j];
2077             }
2078         }
2079     }
2080 }
2081 unittest
2082 {
2083     ubyte[16] dest =           [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42];
2084     __m128i mask = _mm_setr_epi8(0,-1, 0,-1,-1, 1,-1,-1, 0,-1,-4,-1,-1, 0,-127, 0);
2085     __m128i A    = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15);
2086     _mm_maskmoveu_si128(A, mask, dest.ptr);
2087     ubyte[16] correct =        [42, 1,42, 3, 4,42, 6, 7,42, 9,10,11,12,42,14,42];
2088     assert(dest == correct);
2089 }
2090 
2091 /// Compare packed signed 16-bit integers in `a` and `b`, and return packed maximum values.
2092 __m128i _mm_max_epi16 (__m128i a, __m128i b) pure @safe
2093 {
2094     static if (GDC_with_SSE2)
2095     {
2096         return cast(__m128i) __builtin_ia32_pmaxsw128(cast(short8)a, cast(short8)b);
2097     }
2098     else version(LDC)
2099     {
2100         // x86: pmaxsw since LDC 1.0 -O1
        // ARM: smax.8h since LDC 1.5 -O1
2102         short8 sa = cast(short8)a;
2103         short8 sb = cast(short8)b;
2104         short8 greater = greaterMask!short8(sa, sb);
2105         return cast(__m128i)( (greater & sa) | (~greater & sb) );
2106     }
2107     else
2108     {
2109         __m128i lowerShorts = _mm_cmpgt_epi16(a, b); // ones where a should be selected, b else
2110         __m128i aTob = _mm_xor_si128(a, b); // a ^ (a ^ b) == b
2111         __m128i mask = _mm_and_si128(aTob, lowerShorts);
2112         return _mm_xor_si128(b, mask);
2113     }
2114 }
2115 unittest
2116 {
2117     short8 R = cast(short8) _mm_max_epi16(_mm_setr_epi16(32767, 1, -4, -8, 9,  7, 0,-57),
2118                                           _mm_setr_epi16(-4,-8,  9,  7, 0,-32768, 0,  0));
2119     short[8] correct =                                  [32767, 1,  9,  7, 9,  7, 0,  0];
2120     assert(R.array == correct);
2121 }
2122 
2123 /// Compare packed unsigned 8-bit integers in a and b, and return packed maximum values.
2124 __m128i _mm_max_epu8 (__m128i a, __m128i b) pure @safe
2125 {
2126     version(LDC)
2127     {
2128         // x86: pmaxub since LDC 1.0.0 -O1
2129         // ARM64: umax.16b since LDC 1.5.0 -O1
2130         // PERF: catastrophic on ARM32
2131         ubyte16 sa = cast(ubyte16)a;
2132         ubyte16 sb = cast(ubyte16)b;
2133         ubyte16 greater = cast(ubyte16) greaterMask!ubyte16(sa, sb);
2134         return cast(__m128i)( (greater & sa) | (~greater & sb) );
2135     }
2136     else
2137     {
2138         __m128i value128 = _mm_set1_epi8(-128);
2139         __m128i higher = _mm_cmpgt_epi8(_mm_add_epi8(a, value128), _mm_add_epi8(b, value128)); // signed comparison
2140         __m128i aTob = _mm_xor_si128(a, b); // a ^ (a ^ b) == b
2141         __m128i mask = _mm_and_si128(aTob, higher);
2142         return _mm_xor_si128(b, mask);
2143     }
2144 }
2145 unittest
2146 {
2147     byte16 R = cast(byte16) _mm_max_epu8(_mm_setr_epi8(45, 1, -4, -8, 9,  7, 0,-57, -4,-8,  9,  7, 0,-57, 0,  0),
2148                                          _mm_setr_epi8(-4,-8,  9,  7, 0,-57, 0,  0, 45, 1, -4, -8, 9,  7, 0,-57));
2149     byte[16] correct =                                [-4,-8, -4, -8, 9,-57, 0,-57, -4,-8, -4, -8, 9,-57, 0,-57];
2150     assert(R.array == correct);
2151 }
2152 
2153 /// Compare packed double-precision (64-bit) floating-point elements in `a` and `b`, and return packed maximum values.
2154 __m128d _mm_max_pd (__m128d a, __m128d b) pure @trusted
2155 {
2156     static if (GDC_with_SSE2)
2157     {
2158         return __builtin_ia32_maxpd(a, b);
2159     }
2160     else
2161     {
2162         // x86: Generates maxpd starting with LDC 1.9 -O2
2163         a.ptr[0] = (a.array[0] > b.array[0]) ? a.array[0] : b.array[0];
2164         a.ptr[1] = (a.array[1] > b.array[1]) ? a.array[1] : b.array[1];
2165         return a;
2166     }
2167 }
2168 unittest
2169 {
2170     __m128d A = _mm_setr_pd(4.0, 1.0);
2171     __m128d B = _mm_setr_pd(1.0, 8.0);
2172     __m128d M = _mm_max_pd(A, B);
2173     assert(M.array[0] == 4.0);
2174     assert(M.array[1] == 8.0);
2175 }
2176 
2177 /// Compare the lower double-precision (64-bit) floating-point elements in `a` and `b`, store the maximum value in the 
2178 /// lower element of result, and copy the upper element from `a` to the upper element of result.
2179 __m128d _mm_max_sd (__m128d a, __m128d b) pure @trusted
2180 {
2181     static if (GDC_with_SSE2)
2182     {
2183         return __builtin_ia32_maxsd(a, b);
2184     }
2185     else
2186     {
        // Generates maxsd starting with LDC 1.3
        __m128d r = a;
2189         r.ptr[0] = (a.array[0] > b.array[0]) ? a.array[0] : b.array[0];
2190         return r;
2191     }
2192 }
2193 unittest
2194 {
2195     __m128d A = _mm_setr_pd(1.0, 1.0);
2196     __m128d B = _mm_setr_pd(4.0, 2.0);
2197     __m128d M = _mm_max_sd(A, B);
2198     assert(M.array[0] == 4.0);
2199     assert(M.array[1] == 1.0);
2200 }
2201 
2202 /// Perform a serializing operation on all load-from-memory and store-to-memory instructions that were issued prior to 
2203 /// this instruction. Guarantees that every memory access that precedes, in program order, the memory fence instruction 
2204 /// is globally visible before any memory instruction which follows the fence in program order.
2205 void _mm_mfence() @trusted
2206 {
2207     version(GNU)
2208     {
2209         static if (GDC_with_SSE2)
2210         {
2211             __builtin_ia32_mfence();
2212         }
2213         else version(X86)
2214         {
2215             asm pure nothrow @nogc @trusted
2216             {
2217                 "mfence;\n" : : : ;
2218             }
2219         }
2220         else
2221             static assert(false);
2222     }
2223     else static if (LDC_with_SSE2)
2224     {
2225         __builtin_ia32_mfence();
2226     }
2227     else static if (DMD_with_asm)
2228     {
2229         asm nothrow @nogc pure @safe
2230         {
2231             mfence;
2232         }
2233     }
    else version(LDC)
    {
        // Note: will generate the DMB instruction on ARM
        llvm_memory_fence();
    }
2242     else
2243         static assert(false);
2244 }
2245 unittest
2246 {
2247     _mm_mfence();
2248 }
2249 
2250 /// Compare packed signed 16-bit integers in `a` and `b`, and return packed minimum values.
2251 __m128i _mm_min_epi16 (__m128i a, __m128i b) pure @safe
2252 {
2253     static if (GDC_with_SSE2)
2254     {
2255         return cast(__m128i) __builtin_ia32_pminsw128(cast(short8)a, cast(short8)b);
2256     }
2257     else version(LDC)
2258     {
2259         // x86: pminsw since LDC 1.0 -O1
        // ARM64: smin.8h since LDC 1.5 -O1
2261         short8 sa = cast(short8)a;
2262         short8 sb = cast(short8)b;
2263         short8 greater = greaterMask!short8(sa, sb);
2264         return cast(__m128i)( (~greater & sa) | (greater & sb) );
2265     }
2266     else
2267     {
2268         __m128i lowerShorts = _mm_cmplt_epi16(a, b); // ones where a should be selected, b else
2269         __m128i aTob = _mm_xor_si128(a, b); // a ^ (a ^ b) == b
2270         __m128i mask = _mm_and_si128(aTob, lowerShorts);
2271         return _mm_xor_si128(b, mask);
2272     }
2273 }
2274 unittest
2275 {
2276     short8 R = cast(short8) _mm_min_epi16(_mm_setr_epi16(45, 1, -4, -8, 9,  7, 0,-32768),
2277                                           _mm_setr_epi16(-4,-8,  9,  7, 0,-57, 0,  0));
2278     short[8] correct =                                  [-4,-8, -4, -8, 0,-57, 0, -32768];
2279     assert(R.array == correct);
2280 }
2281 
2282 /// Compare packed unsigned 8-bit integers in `a` and `b`, and return packed minimum values.
2283 __m128i _mm_min_epu8 (__m128i a, __m128i b) pure @safe
2284 {
2285     version(LDC)
2286     {
2287         // x86: pminub since LDC 1.0.0 -O1
2288         // ARM: umin.16b since LDC 1.5.0 -O1
2289         // PERF: catastrophic on ARM32
2290         ubyte16 sa = cast(ubyte16)a;
2291         ubyte16 sb = cast(ubyte16)b;
2292         ubyte16 greater = cast(ubyte16) greaterMask!ubyte16(sa, sb);
2293         return cast(__m128i)( (~greater & sa) | (greater & sb) );
2294     }
2295     else
2296     {
2297         __m128i value128 = _mm_set1_epi8(-128);
2298         __m128i lower = _mm_cmplt_epi8(_mm_add_epi8(a, value128), _mm_add_epi8(b, value128)); // signed comparison
2299         __m128i aTob = _mm_xor_si128(a, b); // a ^ (a ^ b) == b
2300         __m128i mask = _mm_and_si128(aTob, lower);
2301         return _mm_xor_si128(b, mask);
2302     }
2303 }
2304 unittest
2305 {
2306     byte16 R = cast(byte16) _mm_min_epu8(_mm_setr_epi8(45, 1, -4, -8, 9,  7, 0,-57, -4,-8,  9,  7, 0,-57, 0,  0),
2307                                          _mm_setr_epi8(-4,-8,  9,  7, 0,-57, 0,  0, 45, 1, -4, -8, 9,  7, 0,-57));
2308     byte[16] correct =                                [45, 1,  9,  7, 0,  7, 0,  0, 45, 1,  9,  7, 0,  7, 0,  0];
2309     assert(R.array == correct);
2310 }
2311 
2312 /// Compare packed double-precision (64-bit) floating-point elements in `a` and `b`, and return packed minimum values.
2313 __m128d _mm_min_pd (__m128d a, __m128d b) pure @trusted
2314 {
2315     static if (GDC_with_SSE2)
2316     {
2317         return __builtin_ia32_minpd(a, b);
2318     }
2319     else
2320     {
2321         // Generates minpd starting with LDC 1.9
2322         a.ptr[0] = (a.array[0] < b.array[0]) ? a.array[0] : b.array[0];
2323         a.ptr[1] = (a.array[1] < b.array[1]) ? a.array[1] : b.array[1];
2324         return a;
2325     }
2326 }
2327 unittest
2328 {
2329     __m128d A = _mm_setr_pd(1.0, 2.0);
2330     __m128d B = _mm_setr_pd(4.0, 1.0);
2331     __m128d M = _mm_min_pd(A, B);
2332     assert(M.array[0] == 1.0);
2333     assert(M.array[1] == 1.0);
2334 }
2335 
2336 /// Compare the lower double-precision (64-bit) floating-point elements in `a` and `b`, store the minimum value in 
2337 /// the lower element of result, and copy the upper element from `a` to the upper element of result.
2338 __m128d _mm_min_sd (__m128d a, __m128d b) pure @safe
2339 {
2340     static if (GDC_with_SSE2)
2341     {
2342         return __builtin_ia32_minsd(a, b);
2343     }
2344     else
2345     {
2346         // Generates minsd starting with LDC 1.3
2347         __m128d r = a;
2348         r.array[0] = (a.array[0] < b.array[0]) ? a.array[0] : b.array[0];
2349         return r;
2350     }
2351 }
2352 unittest
2353 {
2354     __m128d A = _mm_setr_pd(1.0, 3.0);
2355     __m128d B = _mm_setr_pd(4.0, 2.0);
2356     __m128d M = _mm_min_sd(A, B);
2357     assert(M.array[0] == 1.0);
2358     assert(M.array[1] == 3.0);
2359 }
2360 
2361 /// Copy the lower 64-bit integer in `a` to the lower element of result, and zero the upper element.
2362 __m128i _mm_move_epi64 (__m128i a) pure @trusted
2363 {
2364     static if (GDC_with_SSE2)
2365     {
2366         // slightly better with GDC -O0
2367         return cast(__m128i) __builtin_ia32_movq128(cast(long2)a); 
2368     }
2369     else
2370     {
2371         long2 result = [ 0, 0 ];
2372         long2 la = cast(long2) a;
2373         result.ptr[0] = la.array[0];
2374         return cast(__m128i)(result);
2375     }
2376 }
2377 unittest
2378 {
2379     long2 A = [13, 47];
2380     long2 B = cast(long2) _mm_move_epi64( cast(__m128i)A );
2381     long[2] correct = [13, 0];
2382     assert(B.array == correct);
2383 }
2384 
2385 /// Move the lower double-precision (64-bit) floating-point element from `b` to the lower element of result, and copy 
2386 /// the upper element from `a` to the upper element of dst.
2387 __m128d _mm_move_sd (__m128d a, __m128d b) pure @trusted
2388 {
2389     static if (GDC_with_SSE2)
2390     {
2391         return __builtin_ia32_movsd(a, b); 
2392     }
2393     else
2394     {
2395         b.ptr[1] = a.array[1];
2396         return b;
2397     }
2398 }
2399 unittest
2400 {
2401     double2 A = [13.0, 47.0];
2402     double2 B = [34.0, 58.0];
2403     double2 C = _mm_move_sd(A, B);
2404     double[2] correct = [34.0, 47.0];
2405     assert(C.array == correct);
2406 }
2407 
2408 /// Create mask from the most significant bit of each 8-bit element in `v`.
2409 int _mm_movemask_epi8 (__m128i a) pure @trusted
2410 {
2411     // PERF: Not possible in D_SIMD because of https://issues.dlang.org/show_bug.cgi?id=8047
2412     static if (GDC_with_SSE2)
2413     {
2414         return __builtin_ia32_pmovmskb128(cast(ubyte16)a);
2415     }
2416     else static if (LDC_with_SSE2)
2417     {
2418         return __builtin_ia32_pmovmskb128(cast(byte16)a);
2419     }
2420     else static if (LDC_with_ARM64)
2421     {
2422         // Solution from https://stackoverflow.com/questions/11870910/sse-mm-movemask-epi8-equivalent-method-for-arm-neon
        // The other two solutions there lead to intrinsics that LLVM cannot resolve, which took a long time to diagnose.
        // So there might be something a bit faster, but this one is reasonable and branchless.
2425         byte8 mask_shift;
2426         mask_shift.ptr[0] = 7;
2427         mask_shift.ptr[1] = 6;
2428         mask_shift.ptr[2] = 5;
2429         mask_shift.ptr[3] = 4;
2430         mask_shift.ptr[4] = 3;
2431         mask_shift.ptr[5] = 2;
2432         mask_shift.ptr[6] = 1;
2433         mask_shift.ptr[7] = 0;
2434         byte8 mask_and = byte8(-128);
2435         byte8 lo = vget_low_u8(cast(byte16)a);
2436         byte8 hi = vget_high_u8(cast(byte16)a);
2437         lo = vand_u8(lo, mask_and);
2438         lo = vshr_u8(lo, mask_shift);
2439         hi = vand_u8(hi, mask_and);
2440         hi = vshr_u8(hi, mask_shift);
2441         lo = vpadd_u8(lo,lo);
2442         lo = vpadd_u8(lo,lo);
2443         lo = vpadd_u8(lo,lo);
2444         hi = vpadd_u8(hi,hi);
2445         hi = vpadd_u8(hi,hi);
2446         hi = vpadd_u8(hi,hi);
2447         return (cast(ubyte)(hi[0]) << 8) | cast(ubyte)(lo[0]);
2448     }
2449     else
2450     {
2451         byte16 ai = cast(byte16)a;
2452         int r = 0;
2453         foreach(bit; 0..16)
2454         {
2455             if (ai.array[bit] < 0) r += (1 << bit);
2456         }
2457         return r;
2458     }
2459 }
2460 unittest
2461 {
2462     assert(0x9C36 == _mm_movemask_epi8(_mm_set_epi8(-1, 1, 2, -3, -1, -1, 4, 8, 127, 0, -1, -1, 0, -1, -1, 0)));
2463 }
2464 
2465 /// Set each bit of mask result based on the most significant bit of the corresponding packed double-precision (64-bit) 
/// floating-point element in `v`.
2467 int _mm_movemask_pd(__m128d v) pure @safe
2468 {
2469     // PERF: Not possible in D_SIMD because of https://issues.dlang.org/show_bug.cgi?id=8047
2470     static if (GDC_with_SSE2)
2471     {
        return __builtin_ia32_movmskpd(v);
2475     }
2476     else static if (LDC_with_SSE2)
2477     {
        return __builtin_ia32_movmskpd(v);
2481     }
2482     else
2483     {
2484         long2 lv = cast(long2)v;
2485         int r = 0;
2486         if (lv.array[0] < 0) r += 1;
2487         if (lv.array[1] < 0) r += 2;
2488         return r;
2489     }
2490 }
2491 unittest
2492 {
2493     __m128d A = cast(__m128d) _mm_set_epi64x(-1, 0);
2494     assert(_mm_movemask_pd(A) == 2);
2495 }
2496 
2497 /// Copy the lower 64-bit integer in `v`.
2498 __m64 _mm_movepi64_pi64 (__m128i v) pure @safe
2499 {
2500     long2 lv = cast(long2)v;
2501     return long1(lv.array[0]);
2502 }
2503 unittest
2504 {
2505     __m128i A = _mm_set_epi64x(-1, -2);
2506     __m64 R = _mm_movepi64_pi64(A);
2507     assert(R.array[0] == -2);
2508 }
2509 
2510 /// Copy the 64-bit integer `a` to the lower element of dest, and zero the upper element.
2511 __m128i _mm_movpi64_epi64 (__m64 a) pure @trusted
2512 {
2513     long2 r;
2514     r.ptr[0] = a.array[0];
2515     r.ptr[1] = 0;
2516     return cast(__m128i)r;
2517 }
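// Basic check (added sketch): low lane is copied, upper lane is zeroed.
unittest
{
    __m64 A = _mm_cvtsi64_m64(-1);
    long2 R = cast(long2) _mm_movpi64_epi64(A);
    long[2] correct = [-1, 0];
    assert(R.array == correct);
}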
2518 
/// Multiply the low unsigned 32-bit integers from each packed 64-bit element in `a` and `b`, 
/// and return the unsigned 64-bit results.
__m128i _mm_mul_epu32 (__m128i a, __m128i b) pure @trusted
{
    // Note: generates pmuludq in LDC with -O1
2522     __m128i zero = _mm_setzero_si128();
2523 
2524     static if (__VERSION__ >= 2088)
2525     {
2526         // Need LLVM9 to avoid this shufflevector
2527         long2 la, lb;
2528         la.ptr[0] = cast(uint)a.array[0];
2529         la.ptr[1] = cast(uint)a.array[2];
2530         lb.ptr[0] = cast(uint)b.array[0];
2531         lb.ptr[1] = cast(uint)b.array[2];
2532     }
2533     else
2534     {
2535         long2 la = cast(long2) shufflevector!(int4, 0, 4, 2, 6)(a, zero);
2536         long2 lb = cast(long2) shufflevector!(int4, 0, 4, 2, 6)(b, zero);
2537     }
2538 
2539     version(DigitalMars)
2540     {
2541         // DMD has no long2 mul
2542         // long2 mul not supported before LDC 1.5
2543         la.ptr[0] *= lb.array[0];
2544         la.ptr[1] *= lb.array[1];
2545         return cast(__m128i)(la);
2546     }
2547     else
2548     {
2549         static if (__VERSION__ >= 2076)
2550         {
2551             return cast(__m128i)(la * lb);
2552         }
2553         else
2554         {
2555             // long2 mul not supported before LDC 1.5
2556             la.ptr[0] *= lb.array[0];
2557             la.ptr[1] *= lb.array[1];
2558             return cast(__m128i)(la);
2559         }
2560     }
2561 }
2562 unittest
2563 {
2564     __m128i A = _mm_set_epi32(42, 0xDEADBEEF, 42, 0xffffffff);
2565     __m128i B = _mm_set_epi32(42, 0xCAFEBABE, 42, 0xffffffff);
2566     __m128i C = _mm_mul_epu32(A, B);
2567     long2 LC = cast(long2)C;
2568     assert(LC.array[0] == 18446744065119617025uL);
2569     assert(LC.array[1] == 12723420444339690338uL);
2570 }
2571 
2572 /// Multiply packed double-precision (64-bit) floating-point elements in `a` and `b`, and return the results. 
2573 __m128d _mm_mul_pd(__m128d a, __m128d b) pure @safe
2574 {
2575     return a * b;
2576 }
2577 unittest
2578 {
2579     __m128d a = [-2.0, 1.5];
2580     a = _mm_mul_pd(a, a);
2581     assert(a.array == [4.0, 2.25]);
2582 }
2583 
2584 /// Multiply the lower double-precision (64-bit) floating-point element in `a` and `b`, store the result in the lower 
2585 /// element of result, and copy the upper element from `a` to the upper element of result.
2586 __m128d _mm_mul_sd(__m128d a, __m128d b) pure @trusted
2587 {
2588     version(DigitalMars)
2589     {    
2590         // Work-around for https://issues.dlang.org/show_bug.cgi?id=19599
2591         // Note that this is unneeded since DMD >= 2.094.0 at least, haven't investigated again
2592         asm pure nothrow @nogc @trusted { nop;}
2593         a.array[0] = a.array[0] * b.array[0];
2594         return a;
2595     }
2596     else static if (GDC_with_SSE2)
2597     {
2598         return __builtin_ia32_mulsd(a, b);
2599     }
2600     else
2601     {
2602         a.ptr[0] *= b.array[0];
2603         return a;
2604     }
2605 }
2606 unittest
2607 {
2608     __m128d a = [-2.0, 1.5];
2609     a = _mm_mul_sd(a, a);
2610     assert(a.array == [4.0, 1.5]);
2611 }
2612 
2613 /// Multiply the low unsigned 32-bit integers from `a` and `b`, 
2614 /// and get an unsigned 64-bit result.
2615 __m64 _mm_mul_su32 (__m64 a, __m64 b) pure @safe
2616 {
2617     return to_m64(_mm_mul_epu32(to_m128i(a), to_m128i(b)));
2618 }
2619 unittest
2620 {
2621     __m64 A = _mm_set_pi32(42, 0xDEADBEEF);
2622     __m64 B = _mm_set_pi32(42, 0xCAFEBABE);
2623     __m64 C = _mm_mul_su32(A, B);
2624     assert(C.array[0] == 0xDEADBEEFuL * 0xCAFEBABEuL);
2625 }
2626 
2627 /// Multiply the packed signed 16-bit integers in `a` and `b`, producing intermediate 32-bit integers, and return the 
2628 /// high 16 bits of the intermediate integers.
2629 __m128i _mm_mulhi_epi16 (__m128i a, __m128i b) pure @trusted
2630 {
2631     static if (GDC_with_SSE2)
2632     {
2633         return cast(__m128i) __builtin_ia32_pmulhw128(cast(short8)a, cast(short8)b);
2634     }
2635     else static if (LDC_with_SSE2)
2636     {
2637         return cast(__m128i) __builtin_ia32_pmulhw128(cast(short8)a, cast(short8)b);
2638     }
2639     else
2640     {
2641         // ARM64: LDC 1.5 -O2 or later gives a nice sequence with 2 x ext.16b, 2 x smull.4s and shrn.4h shrn2.8h
2642         //        PERF: it seems the simde solution has one less instruction in ARM64.
2643         // PERF: Catastrophic in ARM32.
2644         short8 sa = cast(short8)a;
2645         short8 sb = cast(short8)b;
2646         short8 r = void;
2647         r.ptr[0] = (sa.array[0] * sb.array[0]) >> 16;
2648         r.ptr[1] = (sa.array[1] * sb.array[1]) >> 16;
2649         r.ptr[2] = (sa.array[2] * sb.array[2]) >> 16;
2650         r.ptr[3] = (sa.array[3] * sb.array[3]) >> 16;
2651         r.ptr[4] = (sa.array[4] * sb.array[4]) >> 16;
2652         r.ptr[5] = (sa.array[5] * sb.array[5]) >> 16;
2653         r.ptr[6] = (sa.array[6] * sb.array[6]) >> 16;
2654         r.ptr[7] = (sa.array[7] * sb.array[7]) >> 16;
2655         return cast(__m128i)r;
2656     }
2657 }
2658 unittest
2659 {
2660     __m128i A = _mm_setr_epi16(0, -16, 2, 3, 4, 8, 16, 7);
2661     __m128i B = _mm_set1_epi16(16384);
2662     short8 R = cast(short8)_mm_mulhi_epi16(A, B);
2663     short[8] correct = [0, -4, 0, 0, 1, 2, 4, 1];
2664     assert(R.array == correct);
2665 }
2666 
2667 /// Multiply the packed unsigned 16-bit integers in `a` and `b`, producing intermediate 32-bit integers, and return the 
2668 /// high 16 bits of the intermediate integers.
2669 __m128i _mm_mulhi_epu16 (__m128i a, __m128i b) pure @trusted
2670 {
2671     static if (GDC_with_SSE2)
2672     {
2673         return cast(__m128i) __builtin_ia32_pmulhuw128(cast(short8)a, cast(short8)b);
2674     }
2675     else static if (LDC_with_SSE2)
2676     {
2677         return cast(__m128i) __builtin_ia32_pmulhuw128(cast(short8)a, cast(short8)b);
2678     }
2679     else
2680     {
2681         // ARM64: LDC 1.5 -O2 or later gives a nice sequence with 2 x ext.16b, 2 x umull.4s and shrn.4h shrn2.8h
        //        PERF: it seems the simde solution has one less instruction in ARM64.
2683         // PERF: Catastrophic in ARM32.
2684         short8 sa = cast(short8)a;
2685         short8 sb = cast(short8)b;
2686         short8 r = void;
2687         r.ptr[0] = cast(short)( (cast(ushort)sa.array[0] * cast(ushort)sb.array[0]) >> 16 );
2688         r.ptr[1] = cast(short)( (cast(ushort)sa.array[1] * cast(ushort)sb.array[1]) >> 16 );
2689         r.ptr[2] = cast(short)( (cast(ushort)sa.array[2] * cast(ushort)sb.array[2]) >> 16 );
2690         r.ptr[3] = cast(short)( (cast(ushort)sa.array[3] * cast(ushort)sb.array[3]) >> 16 );
2691         r.ptr[4] = cast(short)( (cast(ushort)sa.array[4] * cast(ushort)sb.array[4]) >> 16 );
2692         r.ptr[5] = cast(short)( (cast(ushort)sa.array[5] * cast(ushort)sb.array[5]) >> 16 );
2693         r.ptr[6] = cast(short)( (cast(ushort)sa.array[6] * cast(ushort)sb.array[6]) >> 16 );
2694         r.ptr[7] = cast(short)( (cast(ushort)sa.array[7] * cast(ushort)sb.array[7]) >> 16 );
2695         return cast(__m128i)r;
2696     }
2697 }
2698 unittest
2699 {
2700     __m128i A = _mm_setr_epi16(0, -16, 2, 3, 4, 8, 16, 7);
2701     __m128i B = _mm_set1_epi16(16384);
2702     short8 R = cast(short8)_mm_mulhi_epu16(A, B);
2703     short[8] correct = [0, 0x3FFC, 0, 0, 1, 2, 4, 1];
2704     assert(R.array == correct);
2705 }
2706 
2707 /// Multiply the packed 16-bit integers in `a` and `b`, producing intermediate 32-bit integers, and return the low 16 
2708 /// bits of the intermediate integers.
2709 __m128i _mm_mullo_epi16 (__m128i a, __m128i b) pure @safe
2710 {
2711     return cast(__m128i)(cast(short8)a * cast(short8)b);
2712 }
2713 unittest
2714 {
2715     __m128i A = _mm_setr_epi16(16384, -16, 0,      3, 4, 1, 16, 7);
2716     __m128i B = _mm_set1_epi16(16384);
2717     short8 R = cast(short8)_mm_mullo_epi16(A, B);
2718     short[8] correct = [0, 0, 0, -16384, 0, 16384, 0, -16384];
2719     assert(R.array == correct);
2720 }
2721 
2722 /// Compute the bitwise OR of packed double-precision (64-bit) floating-point elements in `a` and `b`.
2723 __m128d _mm_or_pd (__m128d a, __m128d b) pure @safe
2724 {
2725     return cast(__m128d)( cast(__m128i)a | cast(__m128i)b );
2726 }
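// Added sanity test: bitwise OR of the raw 64-bit lanes, checked through integer casts.
unittest
{
    long2 A = [1, 4];
    long2 B = [2, 4];
    long2 R = cast(long2) _mm_or_pd(cast(__m128d)A, cast(__m128d)B);
    long[2] correct = [3, 4];
    assert(R.array == correct);
}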
2727 
2728 /// Compute the bitwise OR of 128 bits (representing integer data) in `a` and `b`.
2729 __m128i _mm_or_si128 (__m128i a, __m128i b) pure @safe
2730 {
2731     return a | b;
2732 }
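// Added sanity test: simple lane-wise check of the bitwise OR.
unittest
{
    __m128i A = _mm_setr_epi32(1, 2, 4, -4);
    __m128i B = _mm_setr_epi32(2, 2, 1, -8);
    int4 R = cast(int4) _mm_or_si128(A, B);
    int[4] correct = [3, 2, 5, -4];
    assert(R.array == correct);
}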
2733 
2734 /// Convert packed signed 32-bit integers from `a` and `b` to packed 16-bit integers using signed saturation.
2735 __m128i _mm_packs_epi32 (__m128i a, __m128i b) pure @trusted
2736 {
2737     static if (GDC_with_SSE2)
2738     {
2739         return cast(__m128i) __builtin_ia32_packssdw128(a, b);
2740     }    
2741     else static if (LDC_with_SSE2)
2742     {
2743         return cast(__m128i) __builtin_ia32_packssdw128(a, b);
2744     }
2745     else static if (LDC_with_ARM64)
2746     {
2747         short4 ra = vqmovn_s32(cast(int4)a);
2748         short4 rb = vqmovn_s32(cast(int4)b);
2749         return cast(__m128i)vcombine_s16(ra, rb);
2750     }
2751     else
2752     {
2753         // PERF: catastrophic on ARM32
2754         short8 r;
2755         r.ptr[0] = saturateSignedIntToSignedShort(a.array[0]);
2756         r.ptr[1] = saturateSignedIntToSignedShort(a.array[1]);
2757         r.ptr[2] = saturateSignedIntToSignedShort(a.array[2]);
2758         r.ptr[3] = saturateSignedIntToSignedShort(a.array[3]);
2759         r.ptr[4] = saturateSignedIntToSignedShort(b.array[0]);
2760         r.ptr[5] = saturateSignedIntToSignedShort(b.array[1]);
2761         r.ptr[6] = saturateSignedIntToSignedShort(b.array[2]);
2762         r.ptr[7] = saturateSignedIntToSignedShort(b.array[3]);
2763         return cast(__m128i)r;
2764     }
2765 }
2766 unittest
2767 {
2768     __m128i A = _mm_setr_epi32(100000, -100000, 1000, 0);
2769     short8 R = cast(short8) _mm_packs_epi32(A, A);
2770     short[8] correct = [32767, -32768, 1000, 0, 32767, -32768, 1000, 0];
2771     assert(R.array == correct);
2772 }
2773 
2774 /// Convert packed signed 16-bit integers from `a` and `b` to packed 8-bit integers using signed saturation.
2775 __m128i _mm_packs_epi16 (__m128i a, __m128i b) pure @trusted
2776 {
2777     static if (GDC_with_SSE2)
2778     {
2779         return cast(__m128i) __builtin_ia32_packsswb128(cast(short8)a, cast(short8)b);
2780     }
2781     else static if (LDC_with_SSE2)
2782     {
2783         return cast(__m128i) __builtin_ia32_packsswb128(cast(short8)a, cast(short8)b);
2784     }
2785     else static if (LDC_with_ARM64)
2786     {
        // generates a nice pair of sqxtn.8b + sqxtn2 since LDC 1.5 -O2
2788         byte8 ra = vqmovn_s16(cast(short8)a);
2789         byte8 rb = vqmovn_s16(cast(short8)b);
2790         return cast(__m128i)vcombine_s8(ra, rb);
2791     }
2792     else
2793     {
2794         // PERF: ARM32 is missing
2795         byte16 r;
2796         short8 sa = cast(short8)a;
2797         short8 sb = cast(short8)b;
2798         foreach(i; 0..8)
2799             r.ptr[i] = saturateSignedWordToSignedByte(sa.array[i]);
2800         foreach(i; 0..8)
2801             r.ptr[i+8] = saturateSignedWordToSignedByte(sb.array[i]);
2802         return cast(__m128i)r;
2803     }
2804 }
2805 unittest
2806 {
2807     __m128i A = _mm_setr_epi16(1000, -1000, 1000, 0, 256, -129, 254, 0);
2808     byte16 R = cast(byte16) _mm_packs_epi16(A, A);
2809     byte[16] correct = [127, -128, 127, 0, 127, -128, 127, 0,
2810                         127, -128, 127, 0, 127, -128, 127, 0];
2811     assert(R.array == correct);
2812 }
2813 
2814 /// Convert packed signed 16-bit integers from `a` and `b` to packed 8-bit integers using unsigned saturation.
2815 __m128i _mm_packus_epi16 (__m128i a, __m128i b) pure @trusted
2816 {
2817     static if (GDC_with_SSE2)
2818     {
2819         return cast(__m128i) __builtin_ia32_packuswb128(cast(short8)a, cast(short8)b);
2820     }
2821     else static if (LDC_with_SSE2)
2822     {
2823         return cast(__m128i) __builtin_ia32_packuswb128(cast(short8)a, cast(short8)b);
2824     }
2825     else static if (LDC_with_ARM64)
2826     {
        // generates a nice pair of sqxtun + sqxtun2 since LDC 1.5 -O2
2828         byte8 ra = vqmovun_s16(cast(short8)a);
2829         byte8 rb = vqmovun_s16(cast(short8)b);
2830         return cast(__m128i)vcombine_s8(ra, rb);
2831     }
2832     else
2833     {
2834         short8 sa = cast(short8)a;
2835         short8 sb = cast(short8)b;
2836         ubyte[16] result = void;
2837         for (int i = 0; i < 8; ++i)
2838         {
2839             short s = sa[i];
2840             if (s < 0) s = 0;
2841             if (s > 255) s = 255;
2842             result[i] = cast(ubyte)s;
2843 
2844             s = sb[i];
2845             if (s < 0) s = 0;
2846             if (s > 255) s = 255;
2847             result[i+8] = cast(ubyte)s;
2848         }
2849         return cast(__m128i) loadUnaligned!(byte16)(cast(byte*)result.ptr);
2850     }
2851 }
2852 unittest
2853 {
2854     __m128i A = _mm_setr_epi16(-10, 400, 0, 256, 255, 2, 1, 0);
2855     byte16 AA = cast(byte16) _mm_packus_epi16(A, A);
2856     static immutable ubyte[16] correctResult = [0, 255, 0, 255, 255, 2, 1, 0,
2857                                                 0, 255, 0, 255, 255, 2, 1, 0];
2858     foreach(i; 0..16)
2859         assert(AA.array[i] == cast(byte)(correctResult[i]));
2860 }
2861 
2862 /// Provide a hint to the processor that the code sequence is a spin-wait loop. This can help improve the performance 
2863 /// and power consumption of spin-wait loops.
2864 void _mm_pause() @trusted
2865 {
2866     version(GNU)
2867     {
2868         static if (GDC_with_SSE2)
2869         {
2870             __builtin_ia32_pause();
2871         }
2872         else version(X86)
2873         {
2874             asm pure nothrow @nogc @trusted
2875             {
2876                 "pause;\n" : : : ;
2877             }
2878         }
2879         else
2880             static assert(false);
2881     }
2882     else static if (LDC_with_SSE2)
2883     {
2884         __builtin_ia32_pause();
2885     }
2886     else static if (DMD_with_asm)
2887     {
2888         asm nothrow @nogc pure @safe
2889         {
2890             rep; nop; // F3 90 =  pause
2891         }
2892     }
2893     else version (LDC)
2894     {
        // PERF: currently does nothing; could use the "yield" instruction on ARM.
2896     }
2897     else
2898         static assert(false);
2899 }
2900 unittest
2901 {
2902     _mm_pause();
2903 }
2904 
2905 /// Compute the absolute differences of packed unsigned 8-bit integers in `a` and `b`, then horizontally sum each 
2906 /// consecutive 8 differences to produce two unsigned 16-bit integers, and pack these unsigned 16-bit integers in the 
2907 /// low 16 bits of 64-bit elements in result.
2908 __m128i _mm_sad_epu8 (__m128i a, __m128i b) pure @trusted
2909 {
2910     static if (GDC_with_SSE2)
2911     {
2912         return cast(__m128i) __builtin_ia32_psadbw128(cast(ubyte16)a, cast(ubyte16)b);
2913     }
2914     else static if (LDC_with_SSE2)
2915     {
2916         return cast(__m128i) __builtin_ia32_psadbw128(cast(byte16)a, cast(byte16)b);
2917     }
2918     else static if (LDC_with_ARM64)
2919     {
2920         ushort8 t = cast(ushort8) vpaddlq_u8(vabdq_u8(cast(byte16) a, cast(byte16) b));
2921 
2922         // PERF: Looks suboptimal vs addp
2923         ushort r0 = cast(ushort)(t[0] + t[1] + t[2] + t[3]);
2924         ushort r4 = cast(ushort)(t[4] + t[5] + t[6] + t[7]);
2925         ushort8 r = 0;
2926         r[0] = r0;
2927         r[4] = r4;
2928         return cast(__m128i) r;
2929     }
2930     else
2931     {
2932         // PERF: ARM32 is lacking
2933         byte16 ab = cast(byte16)a;
2934         byte16 bb = cast(byte16)b;
2935         ubyte[16] t;
2936         foreach(i; 0..16)
2937         {
2938             int diff = cast(ubyte)(ab.array[i]) - cast(ubyte)(bb.array[i]);
2939             if (diff < 0) diff = -diff;
2940             t[i] = cast(ubyte)(diff);
2941         }
2942         int4 r = _mm_setzero_si128();
2943         r.ptr[0] = t[0] + t[1] + t[2] + t[3] + t[4] + t[5] + t[6] + t[7];
2944         r.ptr[2] = t[8] + t[9] + t[10]+ t[11]+ t[12]+ t[13]+ t[14]+ t[15];
2945         return r;
2946     }
2947 }
2948 unittest
2949 {
2950     __m128i A = _mm_setr_epi8(3, 4, 6, 8, 12, 14, 18, 20, 24, 30, 32, 38, 42, 44, 48, 54); // primes + 1
2951     __m128i B = _mm_set1_epi8(1);
2952     __m128i R = _mm_sad_epu8(A, B);
2953     int[4] correct = [2 + 3 + 5 + 7 + 11 + 13 + 17 + 19,
2954                       0,
2955                       23 + 29 + 31 + 37 + 41 + 43 + 47 + 53,
2956                       0];
2957     assert(R.array == correct);
2958 }
2959 
2960 /// Set packed 16-bit integers with the supplied values.
2961 __m128i _mm_set_epi16 (short e7, short e6, short e5, short e4, short e3, short e2, short e1, short e0) pure @trusted
2962 {
2963     short[8] result = [e0, e1, e2, e3, e4, e5, e6, e7];
2964     return cast(__m128i) loadUnaligned!(short8)(result.ptr);
2965 }
2966 unittest
2967 {
2968     __m128i A = _mm_set_epi16(7, 6, 5, 4, 3, 2, 1, 0);
2969     short8 B = cast(short8) A;
2970     foreach(i; 0..8)
2971         assert(B.array[i] == i);
2972 }
2973 
2974 /// Set packed 32-bit integers with the supplied values.
2975 __m128i _mm_set_epi32 (int e3, int e2, int e1, int e0) pure @trusted
2976 {
2977     int[4] result = [e0, e1, e2, e3];
2978     return loadUnaligned!(int4)(result.ptr);
2979 }
2980 unittest
2981 {
2982     __m128i A = _mm_set_epi32(3, 2, 1, 0);
2983     foreach(i; 0..4)
2984         assert(A.array[i] == i);
2985 }
2986 
2987 /// Set packed 64-bit integers with the supplied values.
2988 __m128i _mm_set_epi64(__m64 e1, __m64 e0) pure @trusted
2989 {
2990     long[2] result = [e0.array[0], e1.array[0]];
2991     return cast(__m128i)( loadUnaligned!(long2)(result.ptr) );
2992 }
2993 unittest
2994 {
2995     __m128i A = _mm_set_epi64(_mm_cvtsi64_m64(1234), _mm_cvtsi64_m64(5678));
2996     long2 B = cast(long2) A;
2997     assert(B.array[0] == 5678);
2998     assert(B.array[1] == 1234);
2999 }
3000 
3001 /// Set packed 64-bit integers with the supplied values.
3002 __m128i _mm_set_epi64x (long e1, long e0) pure @trusted
3003 {
3004     long[2] result = [e0, e1];
3005     return cast(__m128i)( loadUnaligned!(long2)(result.ptr) );
3006 }
3007 unittest
3008 {
3009     __m128i A = _mm_set_epi64x(1234, 5678);
3010     long2 B = cast(long2) A;
3011     assert(B.array[0] == 5678);
3012     assert(B.array[1] == 1234);
3013 }
3014 
3015 /// Set packed 8-bit integers with the supplied values.
3016 __m128i _mm_set_epi8 (byte e15, byte e14, byte e13, byte e12,
3017                       byte e11, byte e10, byte e9, byte e8,
3018                       byte e7, byte e6, byte e5, byte e4,
3019                       byte e3, byte e2, byte e1, byte e0) pure @trusted
3020 {
3021     byte[16] result = [e0, e1,  e2,  e3,  e4,  e5,  e6, e7,
3022                      e8, e9, e10, e11, e12, e13, e14, e15];
3023     return cast(__m128i)( loadUnaligned!(byte16)(result.ptr) );
3024 }
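// Added sanity test: arguments are given from e15 down to e0, so lane 0 receives e0.
unittest
{
    byte16 B = cast(byte16) _mm_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
    foreach(i; 0..16)
        assert(B.array[i] == i);
}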
3025 
3026 /// Set packed double-precision (64-bit) floating-point elements with the supplied values.
3027 __m128d _mm_set_pd (double e1, double e0) pure @trusted
3028 {
3029     double[2] result = [e0, e1];
3030     return loadUnaligned!(double2)(result.ptr);
3031 }
3032 unittest
3033 {
3034     __m128d A = _mm_set_pd(61.0, 55.0);
3035     double[2] correct = [55.0, 61.0];
3036     assert(A.array == correct);
3037 }
3038 
/// Broadcast double-precision (64-bit) floating-point value `a` to all elements.
3040 __m128d _mm_set_pd1 (double a) pure @trusted
3041 {
3042     double[2] result = [a, a];
3043     return loadUnaligned!(double2)(result.ptr);
3044 }
3045 unittest
3046 {
3047     __m128d A = _mm_set_pd1(61.0);
3048     double[2] correct = [61.0, 61.0];
3049     assert(A.array == correct);
3050 }
3051 
3052 /// Copy double-precision (64-bit) floating-point element `a` to the lower element of result, 
3053 /// and zero the upper element.
3054 __m128d _mm_set_sd (double a) pure @trusted
3055 {
3056     double[2] result = [a, 0];
3057     return loadUnaligned!(double2)(result.ptr);
3058 }
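// Added sanity test: low lane takes `a`, upper lane is zeroed.
unittest
{
    __m128d A = _mm_set_sd(61.0);
    double[2] correct = [61.0, 0.0];
    assert(A.array == correct);
}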
3059 
/// Broadcast 16-bit integer `a` to all elements.
3061 __m128i _mm_set1_epi16 (short a) pure @trusted
3062 {
3063     version(DigitalMars) // workaround https://issues.dlang.org/show_bug.cgi?id=21469 
3064     {
3065         short8 v = a;
3066         return cast(__m128i) v;
3067     }
3068     else
3069         return cast(__m128i)(short8(a));
3070 }
3071 unittest
3072 {
3073     short8 a = cast(short8) _mm_set1_epi16(31);
3074     for (int i = 0; i < 8; ++i)
3075         assert(a.array[i] == 31);
3076 }
3077 
3078 /// Broadcast 32-bit integer `a` to all elements.
3079 __m128i _mm_set1_epi32 (int a) pure @trusted
3080 {
3081     return cast(__m128i)(int4(a));
3082 }
3083 unittest
3084 {
3085     int4 a = cast(int4) _mm_set1_epi32(31);
3086     for (int i = 0; i < 4; ++i)
3087         assert(a.array[i] == 31);
3088 }
3089 
3090 /// Broadcast 64-bit integer `a` to all elements.
3091 __m128i _mm_set1_epi64 (__m64 a) pure @safe
3092 {
3093     return _mm_set_epi64(a, a);
3094 }
3095 unittest
3096 {
3097     long b = 0x1DEADCAFE; 
3098     __m64 a;
3099     a.ptr[0] = b;
3100     long2 c = cast(long2) _mm_set1_epi64(a);
3101     assert(c.array[0] == b);
3102     assert(c.array[1] == b);
3103 }
3104 
/// Broadcast 64-bit integer `a` to all elements.
3106 __m128i _mm_set1_epi64x (long a) pure @trusted
3107 {
3108     long2 b = a; // Must be on its own line to workaround https://issues.dlang.org/show_bug.cgi?id=21470
3109     return cast(__m128i)(b);
3110 }
3111 unittest
3112 {
3113     long b = 0x1DEADCAFE;
3114     long2 c = cast(long2) _mm_set1_epi64x(b);
3115     for (int i = 0; i < 2; ++i)
3116         assert(c.array[i] == b);
3117 }
3118 
3119 /// Broadcast 8-bit integer `a` to all elements.
3120 __m128i _mm_set1_epi8 (byte a) pure @trusted
3121 {
3122     byte16 b = a; // Must be on its own line to workaround https://issues.dlang.org/show_bug.cgi?id=21470
3123     return cast(__m128i)(b);
3124 }
3125 unittest
3126 {
3127     byte16 b = cast(byte16) _mm_set1_epi8(31);
3128     for (int i = 0; i < 16; ++i)
3129         assert(b.array[i] == 31);
3130 }
3131 
alias _mm_set1_pd = _mm_set_pd1; ///
3133 
3134 /// Set packed 16-bit integers with the supplied values in reverse order.
3135 __m128i _mm_setr_epi16 (short e7, short e6, short e5, short e4, 
3136                         short e3, short e2, short e1, short e0) pure @trusted
3137 {
3138     short[8] result = [e7, e6, e5, e4, e3, e2, e1, e0];
3139     return cast(__m128i)( loadUnaligned!(short8)(result.ptr) );
3140 }
3141 unittest
3142 {
3143     short8 A = cast(short8) _mm_setr_epi16(7, 6, 5, -32768, 32767, 2, 1, 0);
3144     short[8] correct = [7, 6, 5, -32768, 32767, 2, 1, 0];
3145     assert(A.array == correct);
3146 }
3147 
3148 /// Set packed 32-bit integers with the supplied values in reverse order.
3149 __m128i _mm_setr_epi32 (int e3, int e2, int e1, int e0) pure @trusted
3150 {
3151     int[4] result = [e3, e2, e1, e0];
3152     return cast(__m128i)( loadUnaligned!(int4)(result.ptr) );
3153 }
3154 unittest
3155 {
3156     int4 A = cast(int4) _mm_setr_epi32(-1, 0, -2147483648, 2147483647);
3157     int[4] correct = [-1, 0, -2147483648, 2147483647];
3158     assert(A.array == correct);
3159 }
3160 
3161 /// Set packed 64-bit integers with the supplied values in reverse order.
3162 __m128i _mm_setr_epi64 (long e1, long e0) pure @trusted
3163 {
3164     long[2] result = [e1, e0];
3165     return cast(__m128i)( loadUnaligned!(long2)(result.ptr) );
3166 }
3167 unittest
3168 {
3169     long2 A = cast(long2) _mm_setr_epi64(-1, 0);
3170     long[2] correct = [-1, 0];
3171     assert(A.array == correct);
3172 }
3173 
3174 /// Set packed 8-bit integers with the supplied values in reverse order.
3175 __m128i _mm_setr_epi8 (byte e15, byte e14, byte e13, byte e12,
3176                        byte e11, byte e10, byte e9,  byte e8,
3177                        byte e7,  byte e6,  byte e5,  byte e4,
3178                        byte e3,  byte e2,  byte e1,  byte e0) pure @trusted
3179 {
3180     byte[16] result = [e15, e14, e13, e12, e11, e10, e9, e8,
3181                       e7,  e6,  e5,  e4,  e3,  e2, e1, e0];
3182     return cast(__m128i)( loadUnaligned!(byte16)(result.ptr) );
3183 }
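// Added sanity test: reverse order, so the first argument lands in lane 0.
unittest
{
    byte16 B = cast(byte16) _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
    foreach(i; 0..16)
        assert(B.array[i] == i);
}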
3184 
3185 /// Set packed double-precision (64-bit) floating-point elements with the supplied values in reverse order.
3186 __m128d _mm_setr_pd (double e1, double e0) pure @trusted
3187 {
3188     double2 result;
3189     result.ptr[0] = e1;
3190     result.ptr[1] = e0;
3191     return result;
3192 }
3193 unittest
3194 {
3195     __m128d A = _mm_setr_pd(61.0, 55.0);
3196     double[2] correct = [61.0, 55.0];
3197     assert(A.array == correct);
3198 }
3199 
3200 /// Return vector of type `__m128d` with all elements set to zero.
3201 __m128d _mm_setzero_pd () pure @trusted
3202 {
3203     // Note: using loadUnaligned has better -O0 codegen compared to .ptr
3204     double[2] result = [0.0, 0.0];
3205     return loadUnaligned!(double2)(result.ptr);
3206 }
3207 
3208 /// Return vector of type `__m128i` with all elements set to zero.
3209 __m128i _mm_setzero_si128() pure @trusted
3210 {
3211     // Note: using loadUnaligned has better -O0 codegen compared to .ptr
3212     int[4] result = [0, 0, 0, 0];
3213     return cast(__m128i)( loadUnaligned!(int4)(result.ptr) );
3214 }
3215 
3216 /// Shuffle 32-bit integers in a using the control in `imm8`.
3217 /// See_also: `_MM_SHUFFLE`.
3218 __m128i _mm_shuffle_epi32(int imm8)(__m128i a) pure @safe
3219 {
3220     static if (GDC_with_SSE2)
3221     {
3222         return __builtin_ia32_pshufd(a, imm8);
3223     }
3224     else
3225     {
3226         return shufflevector!(int4, (imm8 >> 0) & 3,
3227                                     (imm8 >> 2) & 3,
3228                                     (imm8 >> 4) & 3,
3229                                     (imm8 >> 6) & 3)(a, a);
3230     }
3231 }
3232 unittest
3233 {
3234     __m128i A = _mm_setr_epi32(0, 1, 2, 3);
3235     enum int SHUFFLE = _MM_SHUFFLE(0, 1, 2, 3);
3236     int4 B = cast(int4) _mm_shuffle_epi32!SHUFFLE(A);
3237     int[4] expectedB = [ 3, 2, 1, 0 ];
3238     assert(B.array == expectedB);
3239 }
3240 
3241 /// Shuffle double-precision (64-bit) floating-point elements using the control in `imm8`.
3242 /// See_also: `_MM_SHUFFLE2`.
3243 __m128d _mm_shuffle_pd (int imm8)(__m128d a, __m128d b) pure @safe
3244 {
3245     static if (GDC_with_SSE2)
3246     {
3247         return __builtin_ia32_shufpd(a, b, imm8);
3248     }
3249     else
3250     {
3251         return shufflevector!(double2, 0 + ( imm8 & 1 ),
3252                                        2 + ( (imm8 >> 1) & 1 ))(a, b);
3253     }
3254 }
3255 unittest
3256 {
3257     __m128d A = _mm_setr_pd(0.5, 2.0);
3258     __m128d B = _mm_setr_pd(4.0, 5.0);
3259     enum int SHUFFLE = _MM_SHUFFLE2(1, 1);
3260     __m128d R = _mm_shuffle_pd!SHUFFLE(A, B);
3261     double[2] correct = [ 2.0, 5.0 ];
3262     assert(R.array == correct);
3263 }
3264 
/// Shuffle 16-bit integers in the high 64 bits of `a` using the control in `imm8`. Store the results in the high 
/// 64 bits of result, with the low 64 bits being copied from `a` to result.
/// See_also: `_MM_SHUFFLE`.
3268 __m128i _mm_shufflehi_epi16(int imm8)(__m128i a) pure @safe
3269 {
3270     static if (GDC_with_SSE2)
3271     {
3272         return cast(__m128i) __builtin_ia32_pshufhw(cast(short8)a, imm8);
3273     }
3274     else
3275     {
3276         return cast(__m128i) shufflevector!(short8, 0, 1, 2, 3,
3277                                           4 + ( (imm8 >> 0) & 3 ),
3278                                           4 + ( (imm8 >> 2) & 3 ),
3279                                           4 + ( (imm8 >> 4) & 3 ),
3280                                           4 + ( (imm8 >> 6) & 3 ))(cast(short8)a, cast(short8)a);
3281     }
3282 }
3283 unittest
3284 {
3285     __m128i A = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
3286     enum int SHUFFLE = _MM_SHUFFLE(0, 1, 2, 3);
3287     short8 C = cast(short8) _mm_shufflehi_epi16!SHUFFLE(A);
3288     short[8] expectedC = [ 0, 1, 2, 3, 7, 6, 5, 4 ];
3289     assert(C.array == expectedC);
3290 }
3291 
/// Shuffle 16-bit integers in the low 64 bits of `a` using the control in `imm8`. Store the results in the low 64 
/// bits of result, with the high 64 bits being copied from `a` to result.
3294 /// See_also: `_MM_SHUFFLE`.
3295 __m128i _mm_shufflelo_epi16(int imm8)(__m128i a) pure @safe
3296 {
3297     static if (GDC_with_SSE2)
3298     {
3299         return cast(__m128i) __builtin_ia32_pshuflw(cast(short8)a, imm8);
3300     }
3301     else
3302     {
3303         return cast(__m128i) shufflevector!(short8, ( (imm8 >> 0) & 3 ),
3304                                                     ( (imm8 >> 2) & 3 ),
3305                                                     ( (imm8 >> 4) & 3 ),
3306                                                     ( (imm8 >> 6) & 3 ), 4, 5, 6, 7)(cast(short8)a, cast(short8)a);
3307     }
3308 }
3309 unittest
3310 {
3311     __m128i A = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
3312     enum int SHUFFLE = _MM_SHUFFLE(0, 1, 2, 3);
3313     short8 B = cast(short8) _mm_shufflelo_epi16!SHUFFLE(A);
3314     short[8] expectedB = [ 3, 2, 1, 0, 4, 5, 6, 7 ];
3315     assert(B.array == expectedB);
3316 }
3317 
3318 /// Shift packed 32-bit integers in `a` left by `count` while shifting in zeros.
3319 deprecated("Use _mm_slli_epi32 instead.") __m128i _mm_sll_epi32 (__m128i a, __m128i count) pure @trusted
3320 {
3321     static if (LDC_with_SSE2)
3322     {
3323         return __builtin_ia32_pslld128(a, count);
3324     }
3325     else static if (GDC_with_SSE2)
3326     {
3327         return __builtin_ia32_pslld128(a, count);
3328     }
3329     else static if (DMD_with_32bit_asm)
3330     {
3331         asm pure nothrow @nogc @trusted
3332         {
3333             movdqu XMM0, a;
3334             movdqu XMM1, count;
3335             pslld XMM0, XMM1;
3336             movdqu a, XMM0;
3337         }
3338         return a;
3339     }
3340     else
3341     {
3342         int4 r = void;
3343         long2 lc = cast(long2)count;
3344         int bits = cast(int)(lc.array[0]);
        foreach(i; 0..4)
            r.ptr[i] = cast(uint)(a.array[i]) << bits;
3347         return r;
3348     }
3349 }
3350 
3351 /// Shift packed 64-bit integers in `a` left by `count` while shifting in zeros.
3352 deprecated("Use _mm_slli_epi64 instead.") __m128i _mm_sll_epi64 (__m128i a, __m128i count) pure @trusted
3353 {
3354     static if (LDC_with_SSE2)
3355     {
3356         return cast(__m128i) __builtin_ia32_psllq128(cast(long2)a, cast(long2)count);
3357     }
3358     else static if (GDC_with_SSE2)
3359     {
3360         return cast(__m128i) __builtin_ia32_psllq128(cast(long2)a, cast(long2)count);
3361     }
3362     else static if (DMD_with_32bit_asm)
3363     {
3364         asm pure nothrow @nogc @trusted
3365         {
3366             movdqu XMM0, a;
3367             movdqu XMM1, count;
3368             psllq XMM0, XMM1;
3369             movdqu a, XMM0;
3370         }
3371         return a;
3372     }
3373     else
3374     {
        // ARM: good since LDC 1.12 -O2,
        // but the -O0 version is catastrophic
3377         long2 r = void;
3378         long2 sa = cast(long2)a;
3379         long2 lc = cast(long2)count;
3380         int bits = cast(int)(lc.array[0]);
3381         foreach(i; 0..2)
3382             r.array[i] = cast(ulong)(sa.array[i]) << bits;
3383         return cast(__m128i)r;
3384     }
3385 }
3386 
3387 /// Shift packed 16-bit integers in `a` left by `count` while shifting in zeros.
3388 deprecated("Use _mm_slli_epi16 instead.") __m128i _mm_sll_epi16 (__m128i a, __m128i count) pure @trusted
3389 {
3390     static if (LDC_with_SSE2)
3391     {
        return cast(__m128i) __builtin_ia32_psllw128(cast(short8)a, cast(short8)count);
3393     }
3394     else static if (GDC_with_SSE2)
3395     {
        return cast(__m128i) __builtin_ia32_psllw128(cast(short8)a, cast(short8)count);
3397     }
3398     else static if (DMD_with_32bit_asm)
3399     {
        asm pure nothrow @nogc @trusted
3401         {
3402             movdqu XMM0, a;
3403             movdqu XMM1, count;
3404             psllw XMM0, XMM1;
3405             movdqu a, XMM0;
3406         }
3407         return a;
3408     }
3409     else
3410     {
3411         short8 sa = cast(short8)a;
3412         long2 lc = cast(long2)count;
3413         int bits = cast(int)(lc.array[0]);
3414         short8 r = void;
3415         foreach(i; 0..8)
3416             r.ptr[i] = cast(short)(cast(ushort)(sa.array[i]) << bits);
3417         return cast(int4)r;
3418     }
3419 }
3420 
3421 
3422 /// Shift packed 32-bit integers in `a` left by `imm8` while shifting in zeros.
3423 __m128i _mm_slli_epi32 (__m128i a, int imm8) pure @trusted
3424 {
3425     static if (GDC_with_SSE2)
3426     {
3427         return __builtin_ia32_pslldi128(a, cast(ubyte)imm8);
3428     }
3429     else static if (LDC_with_SSE2)
3430     {
3431         return __builtin_ia32_pslldi128(a, cast(ubyte)imm8);
3432     }
3433     else
3434     {
        // Note: the intrinsic considers only imm8[0..7] as the shift count,
        //       whereas in D shifting by the bit-width of the type or more
        //       is undefined behaviour, so large counts are handled explicitly.
3439         int4 r = _mm_setzero_si128();
3440 
3441         ubyte count = cast(ubyte) imm8;
3442         if (count > 31)
3443             return r;
3444         
3445         foreach(i; 0..4)
3446             r.array[i] = cast(uint)(a.array[i]) << count;
3447         return r;
3448     }
3449 }
3450 unittest
3451 {
3452     __m128i A = _mm_setr_epi32(0, 2, 3, -4);
3453     __m128i B = _mm_slli_epi32(A, 1);
3454     __m128i B2 = _mm_slli_epi32(A, 1 + 256);
3455     int[4] expectedB = [ 0, 4, 6, -8];
3456     assert(B.array == expectedB);
3457     assert(B2.array == expectedB);
3458 
3459     __m128i C = _mm_slli_epi32(A, 0);
3460     int[4] expectedC = [ 0, 2, 3, -4];
3461     assert(C.array == expectedC);
3462 
3463     __m128i D = _mm_slli_epi32(A, 65);
3464     int[4] expectedD = [ 0, 0, 0, 0];
3465     assert(D.array == expectedD);
3466 }
3467 
3468 /// Shift packed 64-bit integers in `a` left by `imm8` while shifting in zeros.
3469 __m128i _mm_slli_epi64 (__m128i a, int imm8) pure @trusted
3470 {
3471     static if (GDC_with_SSE2)
3472     {
3473         return cast(__m128i) __builtin_ia32_psllqi128(cast(long2)a, cast(ubyte)imm8);
3474     }
3475     else static if (LDC_with_SSE2)
3476     {
3477         return cast(__m128i) __builtin_ia32_psllqi128(cast(long2)a, cast(ubyte)imm8);
3478     }
3479     else
3480     {
3481         long2 sa = cast(long2)a;
3482 
        // Note: the intrinsic considers only imm8[0..7] as the shift count,
        //       whereas in D shifting by the bit-width of the type or more
        //       is undefined behaviour, so large counts are handled explicitly.
3487         long2 r = cast(long2) _mm_setzero_si128();
3488         ubyte count = cast(ubyte) imm8;
3489         if (count > 63)
3490             return cast(__m128i)r;
3491 
3492         r.ptr[0] = cast(ulong)(sa.array[0]) << count;
3493         r.ptr[1] = cast(ulong)(sa.array[1]) << count;
3494         return cast(__m128i)r;
3495     }
3496 }
3497 unittest
3498 {
3499     __m128i A = _mm_setr_epi64(8, -4);
3500     long2 B = cast(long2) _mm_slli_epi64(A, 1);
3501     long2 B2 = cast(long2) _mm_slli_epi64(A, 1 + 1024);
3502     long[2] expectedB = [ 16, -8];
3503     assert(B.array == expectedB);
3504     assert(B2.array == expectedB);
3505 
3506     long2 C = cast(long2) _mm_slli_epi64(A, 0);
3507     long[2] expectedC = [ 8, -4];
3508     assert(C.array == expectedC);
3509 
3510     long2 D = cast(long2) _mm_slli_epi64(A, 64);
    long[2] expectedD = [ 0, 0 ];
3512     assert(D.array == expectedD);
3513 }
3514 
3515 /// Shift packed 16-bit integers in `a` left by `imm8` while shifting in zeros.
3516 __m128i _mm_slli_epi16(__m128i a, int imm8) pure @trusted
3517 {
3518     static if (GDC_with_SSE2)
3519     {
3520         return cast(__m128i) __builtin_ia32_psllwi128(cast(short8)a, cast(ubyte)imm8);
3521     }
3522     else static if (LDC_with_SSE2)
3523     {
3524         return cast(__m128i) __builtin_ia32_psllwi128(cast(short8)a, cast(ubyte)imm8);
3525     }
3526     else static if (LDC_with_ARM64)
3527     {
3528         short8 sa = cast(short8)a;
3529         short8 r = cast(short8)_mm_setzero_si128();
3530         ubyte count = cast(ubyte) imm8;
3531         if (count > 15)
3532             return cast(__m128i)r;
3533         r = sa << short8(count);
3534         return cast(__m128i)r;
3535     }
3536     else
3537     {
3538         short8 sa = cast(short8)a;
3539         short8 r = cast(short8)_mm_setzero_si128();
3540         ubyte count = cast(ubyte) imm8;
3541         if (count > 15)
3542             return cast(__m128i)r;
3543         foreach(i; 0..8)
3544             r.ptr[i] = cast(short)(sa.array[i] << count);
3545         return cast(__m128i)r;
3546     }
3547 }
3548 unittest
3549 {
3550     __m128i A = _mm_setr_epi16(0, 1, 2, 3, -4, -5, 6, 7);
3551     short8 B = cast(short8)( _mm_slli_epi16(A, 1) );
3552     short8 B2 = cast(short8)( _mm_slli_epi16(A, 1 + 256) );
3553     short[8] expectedB = [ 0, 2, 4, 6, -8, -10, 12, 14 ];
3554     assert(B.array == expectedB);
3555     assert(B2.array == expectedB);
3556 
3557     short8 C = cast(short8)( _mm_slli_epi16(A, 16) );
3558     short[8] expectedC = [ 0, 0, 0, 0, 0, 0, 0, 0 ];
3559     assert(C.array == expectedC);
3560 }
3561 
3562 
/// Shift `op` left by `bytes` bytes while shifting in zeros.
3564 __m128i _mm_slli_si128(ubyte bytes)(__m128i op) pure @trusted
3565 {
3566     static if (bytes & 0xF0)
3567     {
3568         return _mm_setzero_si128();
3569     }
3570     else
3571     {
3572         static if (GDC_with_SSE2)
3573         {
3574             return cast(__m128i) __builtin_ia32_pslldqi128(cast(long2)op, cast(ubyte)(bytes * 8)); 
3575         }
3576         else version(DigitalMars)
3577         {
3578             version(D_InlineAsm_X86)
3579             {
3580                 asm pure nothrow @nogc @trusted // somehow doesn't work for x86_64
3581                 {
3582                     movdqu XMM0, op;
3583                     pslldq XMM0, bytes;
3584                     movdqu op, XMM0;
3585                 }
3586                 return op;
3587             }
3588             else
3589             {
3590                 byte16 A = cast(byte16)op;
3591                 byte16 R;
3592                 for (int n = 15; n >= bytes; --n)
3593                     R.ptr[n] = A.array[n-bytes];
3594                 for (int n = bytes-1; n >= 0; --n)
3595                     R.ptr[n] = 0;
3596                 return cast(__m128i)R;
3597             }
3598         }
3599         else
3600         {
3601             return cast(__m128i) shufflevector!(byte16,
3602             16 - bytes, 17 - bytes, 18 - bytes, 19 - bytes, 20 - bytes, 21 - bytes,
3603             22 - bytes, 23 - bytes, 24 - bytes, 25 - bytes, 26 - bytes, 27 - bytes,
3604             28 - bytes, 29 - bytes, 30 - bytes, 31 - bytes)
3605             (cast(byte16)_mm_setzero_si128(), cast(byte16)op);
3606         }
3607     }
3608 }
3609 unittest
3610 {
3611     __m128i A = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
3612     short8 R = cast(short8) _mm_slli_si128!8(A); // shift 8 bytes to the left
3613     short[8] correct = [ 0, 0, 0, 0, 0, 1, 2, 3 ];
3614     assert(R.array == correct);
3615 
    __m128i B = _mm_slli_si128!16(_mm_set1_epi32(-1));
3617     int[4] expectedB = [0, 0, 0, 0];
3618     assert(B.array == expectedB);
3619 }
3620 
3621 /// Compute the square root of packed double-precision (64-bit) floating-point elements in `vec`.
3622 __m128d _mm_sqrt_pd(__m128d vec) pure @trusted
3623 {
3624     version(LDC)
3625     {
3626         // Disappeared with LDC 1.11
3627         static if (__VERSION__ < 2081)
3628             return __builtin_ia32_sqrtpd(vec);
3629         else
3630         {
3631             vec.array[0] = llvm_sqrt(vec.array[0]);
3632             vec.array[1] = llvm_sqrt(vec.array[1]);
3633             return vec;
3634         }
3635     }
3636     else static if (GDC_with_SSE2)    
3637     {
3638         return __builtin_ia32_sqrtpd(vec);
3639     }
3640     else
3641     {
3642         vec.ptr[0] = sqrt(vec.array[0]);
3643         vec.ptr[1] = sqrt(vec.array[1]);
3644         return vec;
3645     }
3646 }
3647 
3648 /// Compute the square root of the lower double-precision (64-bit) floating-point element in `b`, store the result in 
3649 /// the lower element of result, and copy the upper element from `a` to the upper element of result.
3650 __m128d _mm_sqrt_sd(__m128d a, __m128d b) pure @trusted
3651 {
3652     // Note: the builtin has one argument, since the legacy `sqrtsd` SSE2 instruction operates on the same register only.
3653     //       "128-bit Legacy SSE version: The first source operand and the destination operand are the same. 
3654     //        The quadword at bits 127:64 of the destination operand remains unchanged."
3655     version(LDC)
3656     {
3657         // Disappeared with LDC 1.11
3658         static if (__VERSION__ < 2081)
3659         {
3660             __m128d c = __builtin_ia32_sqrtsd(b);
3661             a[0] = c[0];
3662             return a;
3663         }
3664         else
3665         {
3666             a.array[0] = llvm_sqrt(b.array[0]);
3667             return a;
3668         }
3669     }
3670     else static if (GDC_with_SSE2)
3671     {
3672         __m128d c = __builtin_ia32_sqrtsd(b);
3673         a.ptr[0] = c.array[0];
3674         return a;
3675     }
3676     else
3677     {
3678         a.ptr[0] = sqrt(b.array[0]);
3679         return a;
3680     }
3681 }
3682 unittest
3683 {
3684     __m128d A = _mm_setr_pd(1.0, 3.0);
3685     __m128d B = _mm_setr_pd(4.0, 5.0);
3686     __m128d R = _mm_sqrt_sd(A, B);
3687     double[2] correct = [2.0, 3.0 ];
3688     assert(R.array == correct);
3689 }
3690 
3691 /// Shift packed 16-bit integers in `a` right by `count` while shifting in sign bits.
3692 deprecated("Use _mm_srai_epi16 instead.") __m128i _mm_sra_epi16 (__m128i a, __m128i count) pure @trusted
3693 {
3694     static if (GDC_with_SSE2)
3695     {
3696         return cast(__m128i) __builtin_ia32_psraw128(cast(short8)a, cast(short8)count);
3697     }
3698     else static if (LDC_with_SSE2)
3699     {
3700         return cast(__m128i) __builtin_ia32_psraw128(cast(short8)a, cast(short8)count);
3701     }
3702     else
3703     {
3704         short8 sa = cast(short8)a;
3705         long2 lc = cast(long2)count;
3706         int bits = cast(int)(lc.array[0]);
3707         short8 r = void;
3708         foreach(i; 0..8)
3709             r.ptr[i] = cast(short)(sa.array[i] >> bits);
3710         return cast(int4)r;
3711     }
3712 }
3713 
3714 /// Shift packed 32-bit integers in `a` right by `count` while shifting in sign bits.
3715 deprecated("Use _mm_srai_epi32 instead.") __m128i _mm_sra_epi32 (__m128i a, __m128i count) pure @trusted
3716 {
3717     static if (LDC_with_SSE2)
3718     {
3719         return __builtin_ia32_psrad128(a, count);
3720     }
3721     else static if (GDC_with_SSE2)
3722     {
3723         return __builtin_ia32_psrad128(a, count);
3724     }
3725     else
3726     {    
3727         int4 r = void;
3728         long2 lc = cast(long2)count;
3729         int bits = cast(int)(lc.array[0]);
3730         r.ptr[0] = (a.array[0] >> bits);
3731         r.ptr[1] = (a.array[1] >> bits);
3732         r.ptr[2] = (a.array[2] >> bits);
3733         r.ptr[3] = (a.array[3] >> bits);
3734         return r;
3735     }
3736 }
3737 
3738 
3739 /// Shift packed 16-bit integers in `a` right by `imm8` while shifting in sign bits.
3740 __m128i _mm_srai_epi16 (__m128i a, int imm8) pure @trusted
3741 {
3742     static if (GDC_with_SSE2)
3743     {
3744         return cast(__m128i) __builtin_ia32_psrawi128(cast(short8)a, cast(ubyte)imm8);
3745     }
3746     else static if (LDC_with_SSE2)
3747     {
3748         return cast(__m128i) __builtin_ia32_psrawi128(cast(short8)a, cast(ubyte)imm8);
3749     }
3750     else static if (LDC_with_ARM64)
3751     {
3752         short8 sa = cast(short8)a;
3753         ubyte count = cast(ubyte)imm8;
3754         if (count > 15) 
3755             count = 15;
3756         short8 r = sa >> short8(count);
3757         return cast(__m128i)r;
3758     }
3759     else
3760     {
3761         short8 sa = cast(short8)a;
3762         short8 r = void;
3763 
        // Note: the intrinsic considers only imm8[0..7] as the shift count,
        //       whereas in D shifting by the bit-width of the type or more
        //       is undefined behaviour, so large counts are handled explicitly.
3768         ubyte count = cast(ubyte)imm8;
3769         if (count > 15) 
3770             count = 15;
3771         foreach(i; 0..8)
3772             r.ptr[i] = cast(short)(sa.array[i] >> count);
3773         return cast(int4)r;
3774     }
3775 }
3776 unittest
3777 {
3778     __m128i A = _mm_setr_epi16(0, 1, 2, 3, -4, -5, 6, 7);
3779     short8 B = cast(short8)( _mm_srai_epi16(A, 1) );
3780     short8 B2 = cast(short8)( _mm_srai_epi16(A, 1 + 256) );
3781     short[8] expectedB = [ 0, 0, 1, 1, -2, -3, 3, 3 ];
3782     assert(B.array == expectedB);
3783     assert(B2.array == expectedB);
3784 
3785     short8 C = cast(short8)( _mm_srai_epi16(A, 18) );
3786     short[8] expectedC = [ 0, 0, 0, 0, -1, -1, 0, 0 ];
3787     assert(C.array == expectedC);
3788 }
3789 
3790 /// Shift packed 32-bit integers in `a` right by `imm8` while shifting in sign bits.
3791 __m128i _mm_srai_epi32 (__m128i a, int imm8) pure @trusted
3792 {
3793     static if (LDC_with_SSE2)
3794     {
3795         return __builtin_ia32_psradi128(a, cast(ubyte)imm8);
3796     }
3797     else static if (GDC_with_SSE2)
3798     {
3799         return __builtin_ia32_psradi128(a, cast(ubyte)imm8);
3800     }
3801     else
3802     {
3803         int4 r = void;
3804 
        // Note: the intrinsic considers only imm8[0..7] as the shift count,
        //       whereas in D shifting by the bit-width of the type or more
        //       is undefined behaviour, so large counts are handled explicitly.
3809         ubyte count = cast(ubyte) imm8;
3810         if (count > 31)
3811             count = 31;
3812 
3813         r.ptr[0] = (a.array[0] >> count);
3814         r.ptr[1] = (a.array[1] >> count);
3815         r.ptr[2] = (a.array[2] >> count);
3816         r.ptr[3] = (a.array[3] >> count);
3817         return r;
3818     }
3819 }
3820 unittest
3821 {
3822     __m128i A = _mm_setr_epi32(0, 2, 3, -4);
3823     __m128i B = _mm_srai_epi32(A, 1);
3824     __m128i B2 = _mm_srai_epi32(A, 1 + 256);
3825     int[4] expectedB = [ 0, 1, 1, -2];
3826     assert(B.array == expectedB);
3827     assert(B2.array == expectedB);
3828 
3829     __m128i C = _mm_srai_epi32(A, 32);
3830     int[4] expectedC = [ 0, 0, 0, -1];
3831     assert(C.array == expectedC);
3832 
3833     __m128i D = _mm_srai_epi32(A, 0);
3834     int[4] expectedD = [ 0, 2, 3, -4];
3835     assert(D.array == expectedD);
3836 }
3837 
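/// Shift packed 16-bit integers in `a` right by `count` while shifting in zeros.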
3838 deprecated("Use _mm_srli_epi16 instead.") __m128i _mm_srl_epi16 (__m128i a, __m128i count) pure @trusted
3839 {
3840     static if (LDC_with_SSE2)
3841     {
3842         return cast(__m128i) __builtin_ia32_psrlw128(cast(short8)a, cast(short8)count);
3843     }
3844     else static if (GDC_with_SSE2)
3845     {
3846         return cast(__m128i) __builtin_ia32_psrlw128(cast(short8)a, cast(short8)count);
3847     }
3848     else
3849     {
3850         short8 sa = cast(short8)a;
3851         long2 lc = cast(long2)count;
3852         int bits = cast(int)(lc.array[0]);
3853         short8 r = void;
3854         foreach(i; 0..8)
3855             r.ptr[i] = cast(short)(cast(ushort)(sa.array[i]) >> bits);
3856         return cast(int4)r;
3857     }
3858 }
3859 
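/// Shift packed 32-bit integers in `a` right by `count` while shifting in zeros.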
3860 deprecated("Use _mm_srli_epi32 instead.") __m128i _mm_srl_epi32 (__m128i a, __m128i count) pure @trusted
3861 {
3862     static if (LDC_with_SSE2)
3863     {
3864         return __builtin_ia32_psrld128(a, count);
3865     }
3866     else static if (GDC_with_SSE2)
3867     {
3868         return __builtin_ia32_psrld128(a, count);
3869     }
3870     else
3871     {
3872         int4 r = void;
3873         long2 lc = cast(long2)count;
3874         int bits = cast(int)(lc.array[0]);
3875         r.ptr[0] = cast(uint)(a.array[0]) >> bits;
3876         r.ptr[1] = cast(uint)(a.array[1]) >> bits;
3877         r.ptr[2] = cast(uint)(a.array[2]) >> bits;
3878         r.ptr[3] = cast(uint)(a.array[3]) >> bits;
3879         return r;
3880     }
3881 }
3882 
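/// Shift packed 64-bit integers in `a` right by `count` while shifting in zeros.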
3883 deprecated("Use _mm_srli_epi64 instead.") __m128i _mm_srl_epi64 (__m128i a, __m128i count) pure @trusted
3884 {
3885     static if (LDC_with_SSE2)
3886     {
3887         return cast(__m128i) __builtin_ia32_psrlq128(cast(long2)a, cast(long2)count);
3888     }
3889     else static if (GDC_with_SSE2)
3890     {
3891         return cast(__m128i) __builtin_ia32_psrlq128(cast(long2)a, cast(long2)count);
3892     }
3893     else
3894     {
3895         long2 r = void;
3896         long2 sa = cast(long2)a;
3897         long2 lc = cast(long2)count;
3898         int bits = cast(int)(lc.array[0]);
3899         r.ptr[0] = cast(ulong)(sa.array[0]) >> bits;
3900         r.ptr[1] = cast(ulong)(sa.array[1]) >> bits;
3901         return cast(__m128i)r;
3902     }
3903 }
3904 
3905 /// Shift packed 16-bit integers in `a` right by `imm8` while shifting in zeros.
3906 __m128i _mm_srli_epi16 (__m128i a, int imm8) pure @trusted
3907 {
3908     static if (GDC_with_SSE2)
3909     {
3910         return cast(__m128i) __builtin_ia32_psrlwi128(cast(short8)a, cast(ubyte)imm8);
3911     }
3912     else static if (LDC_with_SSE2)
3913     {
3914         return cast(__m128i) __builtin_ia32_psrlwi128(cast(short8)a, cast(ubyte)imm8);
3915     }
3916     else static if (LDC_with_ARM64)
3917     {
3918         short8 sa = cast(short8)a;
3919         short8 r = cast(short8) _mm_setzero_si128();
3920 
3921         ubyte count = cast(ubyte)imm8;
3922         if (count >= 16)
3923             return cast(__m128i)r;
3924 
        r = sa >>> short8(count); // the vector >>> operator is available with LDC, but not with DMD
3926         return cast(__m128i)r;
3927     }
3928     else
3929     {
3930         short8 sa = cast(short8)a;
3931         ubyte count = cast(ubyte)imm8;
3932 
3933         short8 r = cast(short8) _mm_setzero_si128();
3934         if (count >= 16)
3935             return cast(__m128i)r;
3936 
3937         foreach(i; 0..8)
3938             r.array[i] = cast(short)(cast(ushort)(sa.array[i]) >> count);
3939         return cast(__m128i)r;
3940     }
3941 }
3942 unittest
3943 {
3944     __m128i A = _mm_setr_epi16(0, 1, 2, 3, -4, -5, 6, 7);
3945     short8 B = cast(short8)( _mm_srli_epi16(A, 1) );
3946     short8 B2 = cast(short8)( _mm_srli_epi16(A, 1 + 256) );
3947     short[8] expectedB = [ 0, 0, 1, 1, 0x7FFE, 0x7FFD, 3, 3 ];
3948     assert(B.array == expectedB);
3949     assert(B2.array == expectedB);
3950 
3951     short8 C = cast(short8)( _mm_srli_epi16(A, 16) );
3952     short[8] expectedC = [ 0, 0, 0, 0, 0, 0, 0, 0];
3953     assert(C.array == expectedC);
3954 
3955     short8 D = cast(short8)( _mm_srli_epi16(A, 0) );
3956     short[8] expectedD = [ 0, 1, 2, 3, -4, -5, 6, 7 ];
3957     assert(D.array == expectedD);
3958 }
3959 
3960 
3961 /// Shift packed 32-bit integers in `a` right by `imm8` while shifting in zeros.
3962 __m128i _mm_srli_epi32 (__m128i a, int imm8) pure @trusted
3963 {
3964     static if (GDC_with_SSE2)
3965     {
3966         return __builtin_ia32_psrldi128(a, cast(ubyte)imm8);
3967     }
3968     else static if (LDC_with_SSE2)
3969     {
3970         return __builtin_ia32_psrldi128(a, cast(ubyte)imm8);
3971     }
3972     else
3973     {
3974         ubyte count = cast(ubyte) imm8;
3975 
        // Note: the intrinsic considers only imm8[0..7] as the shift count,
        //       whereas in D shifting by the bit-width of the type or more
        //       is undefined behaviour, so large counts are handled explicitly.
3980         int4 r = _mm_setzero_si128();
3981         if (count >= 32)
3982             return r;
3983         r.ptr[0] = a.array[0] >>> count;
3984         r.ptr[1] = a.array[1] >>> count;
3985         r.ptr[2] = a.array[2] >>> count;
3986         r.ptr[3] = a.array[3] >>> count;
3987         return r;
3988     }
3989 }
3990 unittest
3991 {
3992     __m128i A = _mm_setr_epi32(0, 2, 3, -4);
3993     __m128i B = _mm_srli_epi32(A, 1);
3994     __m128i B2 = _mm_srli_epi32(A, 1 + 256);
3995     int[4] expectedB = [ 0, 1, 1, 0x7FFFFFFE];
3996     assert(B.array == expectedB);
3997     assert(B2.array == expectedB);
3998  
3999     __m128i C = _mm_srli_epi32(A, 255);
4000     int[4] expectedC = [ 0, 0, 0, 0 ];
4001     assert(C.array == expectedC);
4002 }
4003 
4004 /// Shift packed 64-bit integers in `a` right by `imm8` while shifting in zeros.
4005 __m128i _mm_srli_epi64 (__m128i a, int imm8) pure @trusted
4006 {
4007     static if (GDC_with_SSE2)
4008     {
4009         return cast(__m128i) __builtin_ia32_psrlqi128(cast(long2)a, cast(ubyte)imm8);
4010     }
4011     else static if (LDC_with_SSE2)
4012     {
4013         return cast(__m128i) __builtin_ia32_psrlqi128(cast(long2)a, cast(ubyte)imm8);
4014     }
4015     else
4016     {
4017         long2 r = cast(long2) _mm_setzero_si128();
4018         long2 sa = cast(long2)a;
4019 
4020         ubyte count = cast(ubyte) imm8;
4021         if (count >= 64)
4022             return cast(__m128i)r;
4023 
4024         r.ptr[0] = sa.array[0] >>> count;
4025         r.ptr[1] = sa.array[1] >>> count;
4026         return cast(__m128i)r;
4027     }
4028 }
4029 unittest
4030 {
4031     __m128i A = _mm_setr_epi64(8, -4);
4032     long2 B = cast(long2) _mm_srli_epi64(A, 1);
4033     long2 B2 = cast(long2) _mm_srli_epi64(A, 1 + 512);
4034     long[2] expectedB = [ 4, 0x7FFFFFFFFFFFFFFE];
4035     assert(B.array == expectedB);
4036     assert(B2.array == expectedB);
4037 
4038     long2 C = cast(long2) _mm_srli_epi64(A, 64);
4039     long[2] expectedC = [ 0, 0 ];
4040     assert(C.array == expectedC);
4041 }
4042 
4043 /// Shift `v` right by `bytes` bytes while shifting in zeros.
4044 __m128i _mm_srli_si128(ubyte bytes)(__m128i v) pure @safe
4045 {
4046     static if (bytes & 0xF0)
4047     {
4048         return _mm_setzero_si128();
4049     }
4050     else static if (GDC_with_SSE2)
4051     {
4052         return cast(__m128i) __builtin_ia32_psrldqi128(cast(long2)v, cast(ubyte)(bytes * 8));
4053     }
4054     else static if (DMD_with_32bit_asm)
4055     {
4056         asm pure nothrow @nogc @trusted
4057         {
4058             movdqu XMM0, v;
4059             psrldq XMM0, bytes;
4060             movdqu v, XMM0;
4061         }
4062         return v;
4063     }
4064     else
4065     {
4066         return cast(__m128i) shufflevector!(byte16,
4067                                             bytes+0, bytes+1, bytes+2, bytes+3, bytes+4, bytes+5, bytes+6, bytes+7,
4068                                             bytes+8, bytes+9, bytes+10, bytes+11, bytes+12, bytes+13, bytes+14, bytes+15)
4069                                            (cast(byte16) v, cast(byte16)_mm_setzero_si128());
4070     }
4071 }
4072 unittest
4073 {
4074     __m128i R = _mm_srli_si128!4(_mm_set_epi32(4, 3, 2, 1));
4075     int[4] correct = [2, 3, 4, 0];
4076     assert(R.array == correct);
4077 
4078     __m128i A = _mm_srli_si128!16(_mm_set1_epi32(-1));
4079     int[4] expectedA = [0, 0, 0, 0];
4080     assert(A.array == expectedA);
4081 }
4082 
4083 /// Shift `v` right by `bytes` bytes while shifting in zeros.
4084 /// #BONUS
4085 __m128 _mm_srli_ps(ubyte bytes)(__m128 v) pure @safe
4086 {
4087     return cast(__m128)_mm_srli_si128!bytes(cast(__m128i)v);
4088 }
4089 unittest
4090 {
4091     __m128 R = _mm_srli_ps!8(_mm_set_ps(4.0f, 3.0f, 2.0f, 1.0f));
4092     float[4] correct = [3.0f, 4.0f, 0, 0];
4093     assert(R.array == correct);
4094 }
4095 
4096 /// Shift `v` right by `bytes` bytes while shifting in zeros.
4097 /// #BONUS
4098 __m128d _mm_srli_pd(ubyte bytes)(__m128d v) pure @safe
4099 {
4100     return cast(__m128d) _mm_srli_si128!bytes(cast(__m128i)v);
4101 }
4102 
4103 /// Store 128-bits (composed of 2 packed double-precision (64-bit) floating-point elements) from `a` into memory. 
4104 /// `mem_addr` must be aligned on a 16-byte boundary or a general-protection exception may be generated.
4105 void _mm_store_pd (double* mem_addr, __m128d a) pure @trusted
4106 {
4107     __m128d* aligned = cast(__m128d*)mem_addr;
4108     *aligned = a;
4109 }
4110 
4111 /// Store the lower double-precision (64-bit) floating-point element from `a` into 2 contiguous elements in memory. 
4112 /// `mem_addr` must be aligned on a 16-byte boundary or a general-protection exception may be generated.
4113 void _mm_store_pd1 (double* mem_addr, __m128d a) pure @trusted
4114 {
4115     __m128d* aligned = cast(__m128d*)mem_addr;
4116     __m128d r;
4117     r.ptr[0] = a.array[0];
4118     r.ptr[1] = a.array[0];
4119     *aligned = r;
4120 }
4121 
4122 /// Store the lower double-precision (64-bit) floating-point element from `a` into memory. `mem_addr` does not need to 
4123 /// be aligned on any particular boundary.
4124 void _mm_store_sd (double* mem_addr, __m128d a) pure @safe
4125 {
4126     *mem_addr = a.array[0];
4127 }
4128 
4129 /// Store 128-bits of integer data from `a` into memory. `mem_addr` must be aligned on a 16-byte boundary or a 
4130 /// general-protection exception may be generated.
4131 void _mm_store_si128 (__m128i* mem_addr, __m128i a) pure @safe
4132 {
4133     *mem_addr = a;
4134 }
4135 
4136 alias _mm_store1_pd = _mm_store_pd1; ///
4137 
4138 /// Store the upper double-precision (64-bit) floating-point element from `a` into memory.
4139 void _mm_storeh_pd (double* mem_addr, __m128d a) pure @safe
4140 {
4141     *mem_addr = a.array[1];
4142 }
4143 
4144 // Note: `mem_addr` doesn't have to actually be aligned, which breaks
// expectations from the user point of view. This problem also exists in C++.
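/// Store 64-bit integer from the first element of `a` into memory.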
4146 void _mm_storel_epi64 (__m128i* mem_addr, __m128i a) pure @safe
4147 {
4148     long* dest = cast(long*)mem_addr;
4149     long2 la = cast(long2)a;
4150     *dest = la.array[0];
4151 }
4152 unittest
4153 {
4154     long[3] A = [1, 2, 3];
4155     _mm_storel_epi64(cast(__m128i*)(&A[1]), _mm_set_epi64x(0x1_0000_0000, 0x1_0000_0000));
4156     long[3] correct = [1, 0x1_0000_0000, 3];
4157     assert(A == correct);
4158 }
4159 
4160 /// Store the lower double-precision (64-bit) floating-point element from `a` into memory.
4161 void _mm_storel_pd (double* mem_addr, __m128d a) pure @safe
4162 {
4163     *mem_addr = a.array[0];
4164 }
4165 
4166 /// Store 2 double-precision (64-bit) floating-point elements from `a` into memory in reverse order. `mem_addr` must be 
4167 /// aligned on a 16-byte boundary or a general-protection exception may be generated.
void _mm_storer_pd (double* mem_addr, __m128d a) pure @trusted
4169 {
4170     __m128d* aligned = cast(__m128d*)mem_addr;
4171     *aligned = shufflevector!(double2, 1, 0)(a, a);
4172 }
4173 
4174 /// Store 128-bits (composed of 2 packed double-precision (64-bit) floating-point elements) from `a` into memory. 
4175 /// `mem_addr` does not need to be aligned on any particular boundary.
4176 void _mm_storeu_pd (double* mem_addr, __m128d a) pure @safe
4177 {
4178     storeUnaligned!double2(a, mem_addr);
4179 }
4180 
4181 /// Store 128-bits of integer data from `a` into memory. `mem_addr` does not need to be aligned on any particular 
4182 /// boundary.
4183 void _mm_storeu_si128 (__m128i* mem_addr, __m128i a) pure @safe
4184 {
4185     storeUnaligned!__m128i(a, cast(int*)mem_addr);
4186 }
4187 
4188 /// Store 32-bit integer from the first element of `a` into memory. 
4189 /// `mem_addr` does not need to be aligned on any particular boundary.
4190 void _mm_storeu_si32 (void* mem_addr, __m128i a) pure @trusted
4191 {
4192     int* dest = cast(int*)mem_addr;
4193     *dest = a.array[0];
4194 }
4195 unittest
4196 {
4197     int[2] arr = [-24, 12];
4198     _mm_storeu_si32(&arr[1], _mm_setr_epi32(-1, -2, -6, -7));
4199     assert(arr == [-24, -1]);
4200 }
4201 
4202 /// Store 128-bits (composed of 2 packed double-precision (64-bit) floating-point elements)
4203 /// from a into memory using a non-temporal memory hint. mem_addr must be aligned on a 16-byte
4204 /// boundary or a general-protection exception may be generated.
4205 void _mm_stream_pd (double* mem_addr, __m128d a)
4206 {
4207     // BUG see `_mm_stream_ps` for an explanation why we don't implement non-temporal moves
4208     __m128d* dest = cast(__m128d*)mem_addr;
4209     *dest = a;
4210 }
4211 
4212 /// Store 128-bits of integer data from a into memory using a non-temporal memory hint.
4213 /// mem_addr must be aligned on a 16-byte boundary or a general-protection exception
4214 /// may be generated.
4215 void _mm_stream_si128 (__m128i* mem_addr, __m128i a)
4216 {
4217     // BUG see `_mm_stream_ps` for an explanation why we don't implement non-temporal moves
4218     __m128i* dest = cast(__m128i*)mem_addr;
4219     *dest = a;
4220 }
4221 
4222 /// Store 32-bit integer a into memory using a non-temporal hint to minimize cache
4223 /// pollution. If the cache line containing address mem_addr is already in the cache,
4224 /// the cache will be updated.
4225 void _mm_stream_si32 (int* mem_addr, int a)
4226 {
4227     // BUG see `_mm_stream_ps` for an explanation why we don't implement non-temporal moves
4228     *mem_addr = a;
4229 }
4230 
4231 /// Store 64-bit integer a into memory using a non-temporal hint to minimize
4232 /// cache pollution. If the cache line containing address mem_addr is already
4233 /// in the cache, the cache will be updated.
4234 void _mm_stream_si64 (long* mem_addr, long a)
4235 {
4236     // BUG See `_mm_stream_ps` for an explanation why we don't implement non-temporal moves
4237     *mem_addr = a;
4238 }
4239 
4240 /// Subtract packed 16-bit integers in `b` from packed 16-bit integers in `a`.
4241 __m128i _mm_sub_epi16(__m128i a, __m128i b) pure @safe
4242 {
4243     return cast(__m128i)(cast(short8)a - cast(short8)b);
4244 }
4245 
4246 /// Subtract packed 32-bit integers in `b` from packed 32-bit integers in `a`.
4247 __m128i _mm_sub_epi32(__m128i a, __m128i b) pure @safe
4248 {
4249     return cast(__m128i)(cast(int4)a - cast(int4)b);
4250 }
4251 
4252 /// Subtract packed 64-bit integers in `b` from packed 64-bit integers in `a`.
4253 __m128i _mm_sub_epi64(__m128i a, __m128i b) pure @safe
4254 {
4255     return cast(__m128i)(cast(long2)a - cast(long2)b);
4256 }
4257 
4258 /// Subtract packed 8-bit integers in `b` from packed 8-bit integers in `a`.
4259 __m128i _mm_sub_epi8(__m128i a, __m128i b) pure @safe
4260 {
4261     return cast(__m128i)(cast(byte16)a - cast(byte16)b);
4262 }
4263 
4264 /// Subtract packed double-precision (64-bit) floating-point elements in `b` from packed double-precision (64-bit) 
4265 /// floating-point elements in `a`.
4266 __m128d _mm_sub_pd(__m128d a, __m128d b) pure @safe
4267 {
4268     return a - b;
4269 }
4270 
4271 /// Subtract the lower double-precision (64-bit) floating-point element in `b` from the lower double-precision (64-bit) 
4272 /// floating-point element in `a`, store that in the lower element of result, and copy the upper element from `a` to the
4273 /// upper element of result.
4274 __m128d _mm_sub_sd(__m128d a, __m128d b) pure @trusted
4275 {
4276     version(DigitalMars)
4277     {
4278         // Work-around for https://issues.dlang.org/show_bug.cgi?id=19599
4279         // Note that this is unneeded since DMD >= 2.094.0 at least, haven't investigated again
4280         asm pure nothrow @nogc @trusted { nop;}
4281         a[0] = a[0] - b[0];
4282         return a;
4283     }
4284     else static if (GDC_with_SSE2)
4285     {
4286         return __builtin_ia32_subsd(a, b);
4287     }
4288     else
4289     {
4290         a.ptr[0] -= b.array[0];
4291         return a;
4292     }
4293 }
4294 unittest
4295 {
4296     __m128d a = [1.5, -2.0];
4297     a = _mm_sub_sd(a, a);
4298     assert(a.array == [0.0, -2.0]);
4299 }
4300 
4301 /// Subtract 64-bit integer `b` from 64-bit integer `a`.
4302 __m64 _mm_sub_si64 (__m64 a, __m64 b) pure @safe
4303 {
4304     return a - b;
4305 }
4306 
/// Subtract packed 16-bit signed integers in `b` from packed 16-bit signed integers in `a` using signed saturation.
4308 __m128i _mm_subs_epi16(__m128i a, __m128i b) pure @trusted
4309 {
4310     version(LDC)
4311     {
4312         static if (__VERSION__ >= 2085) // saturation x86 intrinsics disappeared in LLVM 8
4313         {
4314             // Generates PSUBSW since LDC 1.15 -O0
4317             enum prefix = `declare <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> %a, <8 x i16> %b)`;
4318             enum ir = `
4319                 %r = call <8 x i16> @llvm.ssub.sat.v8i16( <8 x i16> %0, <8 x i16> %1)
4320                 ret <8 x i16> %r`;
4321             return cast(__m128i) LDCInlineIREx!(prefix, ir, "", short8, short8, short8)(cast(short8)a, cast(short8)b);
4322         }
4323         else static if (LDC_with_ARM) // Raspberry ships with LDC 1.12, no saturation 
4324         {
4326             short[8] res;
4327             short8 sa = cast(short8)a;
4328             short8 sb = cast(short8)b;
4329             foreach(i; 0..8)
4330                 res[i] = saturateSignedIntToSignedShort(sa.array[i] - sb.array[i]);
4331             return _mm_loadu_si128(cast(int4*)res.ptr);
4332         }
4333         else static if (LDC_with_SSE2)
4334         {
4335             return cast(__m128i) __builtin_ia32_psubsw128(cast(short8) a, cast(short8) b);
4336         }
4337         else
4338             static assert(false);
4339     }
4340     else static if (GDC_with_SSE2)
4341     {
4342         return cast(__m128i) __builtin_ia32_psubsw128(cast(short8) a, cast(short8) b);
4343     }
4344     else
4345     {
4346         short[8] res;
4347         short8 sa = cast(short8)a;
4348         short8 sb = cast(short8)b;
4349         foreach(i; 0..8)
4350             res.ptr[i] = saturateSignedIntToSignedShort(sa.array[i] - sb.array[i]);
4351         return _mm_loadu_si128(cast(int4*)res.ptr);
4352     }
4353 }
4354 unittest
4355 {
4356     short8 res = cast(short8) _mm_subs_epi16(_mm_setr_epi16(32760, -32760, 5, 4, 3, 2, 1, 0),
4357                                              _mm_setr_epi16(-10  ,     16, 5, 4, 3, 2, 1, 0));
4358     static immutable short[8] correctResult =              [32767, -32768, 0, 0, 0, 0, 0, 0];
4359     assert(res.array == correctResult);
4360 }
4361 
/// Subtract packed 8-bit signed integers in `b` from packed 8-bit signed integers in `a` using signed saturation.
4363 __m128i _mm_subs_epi8(__m128i a, __m128i b) pure @trusted
4364 {
4365     version(LDC)
4366     {
4367         static if (__VERSION__ >= 2085) // saturation x86 intrinsics disappeared in LLVM 8
4368         {
4369             // x86: Generates PSUBSB since LDC 1.15 -O0
4370             // ARM: Generates sqsub.16b since LDC 1.21 -O0
4371             enum prefix = `declare <16 x i8> @llvm.ssub.sat.v16i8(<16 x i8> %a, <16 x i8> %b)`;
4372             enum ir = `
4373                 %r = call <16 x i8> @llvm.ssub.sat.v16i8( <16 x i8> %0, <16 x i8> %1)
4374                 ret <16 x i8> %r`;
4375             return cast(__m128i) LDCInlineIREx!(prefix, ir, "", byte16, byte16, byte16)(cast(byte16)a, cast(byte16)b);
4376         }
4377         else static if (LDC_with_ARM) // Raspberry ships with LDC 1.12, no saturation 
4378         {
4379             byte[16] res;
4380             byte16 sa = cast(byte16)a;
4381             byte16 sb = cast(byte16)b;
4382             foreach(i; 0..16)
4383                 res[i] = saturateSignedWordToSignedByte(sa.array[i] - sb.array[i]);
4384             return _mm_loadu_si128(cast(int4*)res.ptr);
4385         }
4386         else static if (LDC_with_SSE2)
4387         {
4388             return cast(__m128i) __builtin_ia32_psubsb128(cast(byte16) a, cast(byte16) b);
4389         }
4390         else
4391             static assert(false);
4392     }
4393     else static if (GDC_with_SSE2)
4394     {
4395         return cast(__m128i) __builtin_ia32_psubsb128(cast(ubyte16) a, cast(ubyte16) b);
4396     }
4397     else
4398     {
4399         byte[16] res;
4400         byte16 sa = cast(byte16)a;
4401         byte16 sb = cast(byte16)b;
4402         foreach(i; 0..16)
4403             res[i] = saturateSignedWordToSignedByte(sa.array[i] - sb.array[i]);
4404         return _mm_loadu_si128(cast(int4*)res.ptr);
4405     }
4406 }
4407 unittest
4408 {
4409     byte16 res = cast(byte16) _mm_subs_epi8(_mm_setr_epi8(-128, 127, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0),
4410                                             _mm_setr_epi8(  15, -14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0));
4411     static immutable byte[16] correctResult            = [-128, 127,  0,  0,  0,  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0];
4412     assert(res.array == correctResult);
4413 }
4414 
/// Subtract packed 16-bit unsigned integers in `b` from packed 16-bit unsigned integers in `a` using unsigned saturation.
4416 __m128i _mm_subs_epu16(__m128i a, __m128i b) pure @trusted
4417 {
4418     version(LDC)
4419     {
4420         static if (__VERSION__ >= 2085) // saturation x86 intrinsics disappeared in LLVM 8
4421         {
4422             // x86: Generates PSUBUSW since LDC 1.15 -O0
4423             // ARM: Generates uqsub.8h since LDC 1.21 -O0
4424             enum prefix = `declare <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> %a, <8 x i16> %b)`;
4425             enum ir = `
4426                 %r = call <8 x i16> @llvm.usub.sat.v8i16( <8 x i16> %0, <8 x i16> %1)
4427                 ret <8 x i16> %r`;
4428             return cast(__m128i) LDCInlineIREx!(prefix, ir, "", short8, short8, short8)(cast(short8)a, cast(short8)b);
4429         }
4430         else static if (LDC_with_ARM) // Raspberry ships with LDC 1.12, no saturation 
4431         {
4432             short[8] res;
4433             short8 sa = cast(short8)a;
4434             short8 sb = cast(short8)b;
4435             foreach(i; 0..8)
4436             {
4437                 int sum = cast(ushort)(sa.array[i]) - cast(ushort)(sb.array[i]);
4438                 res[i] = saturateSignedIntToUnsignedShort(sum);
4439             }
4440             return _mm_loadu_si128(cast(int4*)res.ptr);
4441         }
4442         else static if (LDC_with_SSE2)
4443         {
            return cast(__m128i) __builtin_ia32_psubusw128(cast(short8)a, cast(short8)b);
4445         }
4446         else 
4447             static assert(false);
4448     }
4449     else static if (GDC_with_SSE2)
4450     {
4451         return cast(__m128i) __builtin_ia32_psubusw128(cast(short8)a, cast(short8)b);
4452     }
4453     else
4454     {
4455         short[8] res;
4456         short8 sa = cast(short8)a;
4457         short8 sb = cast(short8)b;
4458         foreach(i; 0..8)
4459         {
4460             int sum = cast(ushort)(sa.array[i]) - cast(ushort)(sb.array[i]);
4461             res[i] = saturateSignedIntToUnsignedShort(sum);
4462         }
4463         return _mm_loadu_si128(cast(int4*)res.ptr);
4464     }
4465 }
4466 unittest
4467 {
4468     short8 R = cast(short8) _mm_subs_epu16(_mm_setr_epi16(cast(short)65534,  1, 5, 4, 3, 2, 1, 0),
4469                                            _mm_setr_epi16(cast(short)65535, 16, 4, 4, 3, 0, 1, 0));
4470     static immutable short[8] correct =                  [               0,  0, 1, 0, 0, 2, 0, 0];
4471     assert(R.array == correct);
4472 }
4473 
/// Subtract packed 8-bit unsigned integers in `b` from packed 8-bit unsigned integers in `a` using unsigned saturation.
4475 __m128i _mm_subs_epu8(__m128i a, __m128i b) pure @trusted
4476 {
4477     version(LDC)
4478     {
4479         static if (__VERSION__ >= 2085) // saturation x86 intrinsics disappeared in LLVM 8
4480         {
4481             // x86: Generates PSUBUSB since LDC 1.15 -O0
4482             // ARM: Generates uqsub.16b since LDC 1.21 -O0
4483             enum prefix = `declare <16 x i8> @llvm.usub.sat.v16i8(<16 x i8> %a, <16 x i8> %b)`;
4484             enum ir = `
4485                 %r = call <16 x i8> @llvm.usub.sat.v16i8( <16 x i8> %0, <16 x i8> %1)
4486                 ret <16 x i8> %r`;
4487             return cast(__m128i) LDCInlineIREx!(prefix, ir, "", byte16, byte16, byte16)(cast(byte16)a, cast(byte16)b);
4488         }
        else static if (LDC_with_ARM) // Raspberry ships with LDC 1.12, no saturation
        {
            ubyte[16] res;
            byte16 sa = cast(byte16)a;
            byte16 sb = cast(byte16)b;
            foreach(i; 0..16)
                res[i] = saturateSignedWordToUnsignedByte(cast(ubyte)(sa.array[i]) - cast(ubyte)(sb.array[i]));
            return _mm_loadu_si128(cast(int4*)res.ptr);
        }
4502         else static if (LDC_with_SSE2)
4503         {
            return cast(__m128i) __builtin_ia32_psubusb128(cast(byte16)a, cast(byte16)b);
4505         }
4506         else 
4507             static assert(false);
4508     }
4509     else static if (GDC_with_SSE2)
4510     {
4511         return cast(__m128i) __builtin_ia32_psubusb128(cast(ubyte16) a, cast(ubyte16) b);
4512     }
4513     else
4514     {
4515         ubyte[16] res;
4516         byte16 sa = cast(byte16)a;
4517         byte16 sb = cast(byte16)b;
4518         foreach(i; 0..16)
4519             res[i] = saturateSignedWordToUnsignedByte(cast(ubyte)(sa.array[i]) - cast(ubyte)(sb.array[i]));
4520         return _mm_loadu_si128(cast(int4*)res.ptr);
4521     }
4522 }
4523 unittest
4524 {
4525     byte16 res = cast(byte16) _mm_subs_epu8(_mm_setr_epi8(cast(byte)254, 127, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0),
4526                                             _mm_setr_epi8(cast(byte)255, 120, 14, 42, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0));
4527     static immutable byte[16] correctResult =            [            0,   7,  0,  0,  0,  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0];
4528     assert(res.array == correctResult);
4529 }
4530 
// Note: the only difference between the `ucomi` and `comi` intrinsics is their
//       signalling behaviour on quiet NaNs. Aliasing them is therefore slightly
//       incorrect, but a case where you would want to distinguish qNaN from sNaN
//       and then treat them differently on purpose seems extremely rare.
4535 alias _mm_ucomieq_sd = _mm_comieq_sd; ///
4536 alias _mm_ucomige_sd = _mm_comige_sd; ///
4537 alias _mm_ucomigt_sd = _mm_comigt_sd; ///
4538 alias _mm_ucomile_sd = _mm_comile_sd; ///
4539 alias _mm_ucomilt_sd = _mm_comilt_sd; ///
4540 alias _mm_ucomineq_sd = _mm_comineq_sd; ///
4541 
4542 /// Return vector of type `__m128d` with undefined elements.
4543 __m128d _mm_undefined_pd() pure @safe
4544 {
4545     __m128d result = void;
4546     return result;
4547 }
4548 
4549 /// Return vector of type `__m128i` with undefined elements.
4550 __m128i _mm_undefined_si128() pure @safe
4551 {
4552     __m128i result = void;
4553     return result;
4554 }
4555 
4556 /// Unpack and interleave 16-bit integers from the high half of `a` and `b`.
4557 __m128i _mm_unpackhi_epi16 (__m128i a, __m128i b) pure @safe
4558 {
4559     static if (GDC_with_SSE2)
4560     {
4561         return cast(__m128i) __builtin_ia32_punpckhwd128(cast(short8) a, cast(short8) b);
4562     }
4563     else static if (DMD_with_32bit_asm)
4564     {
4565         asm pure nothrow @nogc @trusted
4566         {
4567             movdqu XMM0, a;
4568             movdqu XMM1, b;
4569             punpckhwd XMM0, XMM1;
4570             movdqu a, XMM0;
4571         }
4572         return a;
4573     }
4574     else
4575     {
4576         return cast(__m128i) shufflevector!(short8, 4, 12, 5, 13, 6, 14, 7, 15)
4577                                            (cast(short8)a, cast(short8)b);
4578     }
4579 }
4580 unittest
4581 {
4582     __m128i A = _mm_setr_epi16(4,   5,  6,  7,  8,  9, 10, 11);
4583     __m128i B = _mm_setr_epi16(12, 13, 14, 15, 16, 17, 18, 19);
4584     short8 C = cast(short8)(_mm_unpackhi_epi16(A, B));
4585     short[8] correct = [8, 16, 9, 17, 10, 18, 11, 19];
4586     assert(C.array == correct);
4587 }
4588 
4589 /// Unpack and interleave 32-bit integers from the high half of `a` and `b`.
4590 __m128i _mm_unpackhi_epi32 (__m128i a, __m128i b) pure @trusted
4591 {
4592     static if (GDC_with_SSE2)
4593     {
4594         return __builtin_ia32_punpckhdq128(a, b);
4595     }
4596     else version(DigitalMars)
4597     {
4598         __m128i r;
4599         r.ptr[0] = a.array[2];
4600         r.ptr[1] = b.array[2];
4601         r.ptr[2] = a.array[3];
4602         r.ptr[3] = b.array[3];
4603         return r;
4604     }
4605     else
4606     {
4607         return shufflevector!(int4, 2, 6, 3, 7)(cast(int4)a, cast(int4)b);
4608     }
4609 }
4610 unittest
4611 {
4612     __m128i A = _mm_setr_epi32(1, 2, 3, 4);
4613     __m128i B = _mm_setr_epi32(5, 6, 7, 8);
4614     __m128i C = _mm_unpackhi_epi32(A, B);
4615     int[4] correct = [3, 7, 4, 8];
4616     assert(C.array == correct);
4617 }
4618 
4619 /// Unpack and interleave 64-bit integers from the high half of `a` and `b`.
4620 __m128i _mm_unpackhi_epi64 (__m128i a, __m128i b) pure @trusted
4621 {
4622     static if (GDC_with_SSE2)
4623     {
4624         return cast(__m128i) __builtin_ia32_punpckhqdq128(cast(long2) a, cast(long2) b);
4625     }
4626     else
4627     {
        __m128i r = cast(__m128i)b;
        r.ptr[0] = a.array[2];
        r.ptr[1] = a.array[3];
        return r;
4632     }
4633 }
4634 unittest // Issue #36
4635 {
4636     __m128i A = _mm_setr_epi64(0x22222222_22222222, 0x33333333_33333333);
4637     __m128i B = _mm_setr_epi64(0x44444444_44444444, 0x55555555_55555555);
4638     long2 C = cast(long2)(_mm_unpackhi_epi64(A, B));
4639     long[2] correct = [0x33333333_33333333, 0x55555555_55555555];
4640     assert(C.array == correct);
4641 }
4642 
4643 /// Unpack and interleave 8-bit integers from the high half of `a` and `b`.
4644 __m128i _mm_unpackhi_epi8 (__m128i a, __m128i b) pure @safe
4645 {
4646     static if (GDC_with_SSE2)
4647     {
4648         return cast(__m128i) __builtin_ia32_punpckhbw128(cast(ubyte16)a, cast(ubyte16)b);
4649     }
4650     else static if (DMD_with_32bit_asm)
4651     {
4652         asm pure nothrow @nogc @trusted
4653         {
4654             movdqu XMM0, a;
4655             movdqu XMM1, b;
4656             punpckhbw XMM0, XMM1;
4657             movdqu a, XMM0;
4658         }
4659         return a;
4660     }
4661     else
4662     {
4663         return cast(__m128i)shufflevector!(byte16, 8,  24,  9, 25, 10, 26, 11, 27,
4664                                                    12, 28, 13, 29, 14, 30, 15, 31)
4665                                                    (cast(byte16)a, cast(byte16)b);
4666     }
4667 }
4668 unittest
4669 {
4670     __m128i A = _mm_setr_epi8( 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15);
4671     __m128i B = _mm_setr_epi8(16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31);
4672     byte16 C = cast(byte16) _mm_unpackhi_epi8(A, B);
4673     byte[16] correct = [8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31];
4674     assert(C.array == correct);
4675 }
4676 
4677 /// Unpack and interleave double-precision (64-bit) floating-point elements from the high half of `a` and `b`.
4678 __m128d _mm_unpackhi_pd (__m128d a, __m128d b) pure @safe
4679 {
4680     static if (GDC_with_SSE2)
4681     {
4682         return __builtin_ia32_unpckhpd(a, b);
4683     }
4684     else
4685     {
4686         return shufflevector!(__m128d, 1, 3)(a, b);
4687     }
4688 }
4689 unittest
4690 {
4691     __m128d A = _mm_setr_pd(4.0, 6.0);
4692     __m128d B = _mm_setr_pd(7.0, 9.0);
4693     __m128d C = _mm_unpackhi_pd(A, B);
4694     double[2] correct = [6.0, 9.0];
4695     assert(C.array == correct);
4696 }
4697 
4698 /// Unpack and interleave 16-bit integers from the low half of `a` and `b`.
4699 __m128i _mm_unpacklo_epi16 (__m128i a, __m128i b) pure @safe
4700 {
4701     static if (GDC_with_SSE2)
4702     {
4703         return cast(__m128i) __builtin_ia32_punpcklwd128(cast(short8) a, cast(short8) b);
4704     }
4705     else static if (DMD_with_32bit_asm)
4706     {
4707         asm pure nothrow @nogc @trusted
4708         {
4709             movdqu XMM0, a;
4710             movdqu XMM1, b;
4711             punpcklwd XMM0, XMM1;
4712             movdqu a, XMM0;
4713         }
4714         return a;
4715     }
4716     else
4717     {
4718         return cast(__m128i) shufflevector!(short8, 0, 8, 1, 9, 2, 10, 3, 11)
4719                                            (cast(short8)a, cast(short8)b);
4720     }
4721 }
4722 unittest
4723 {
4724     __m128i A = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
4725     __m128i B = _mm_setr_epi16(8, 9, 10, 11, 12, 13, 14, 15);
4726     short8 C = cast(short8) _mm_unpacklo_epi16(A, B);
4727     short[8] correct = [0, 8, 1, 9, 2, 10, 3, 11];
4728     assert(C.array == correct);
4729 }
4730 
4731 /// Unpack and interleave 32-bit integers from the low half of `a` and `b`.
4732 __m128i _mm_unpacklo_epi32 (__m128i a, __m128i b) pure @trusted
4733 {
4734     static if (GDC_with_SSE2)
4735     {
4736         return __builtin_ia32_punpckldq128(a, b);
4737     }
    else version(DigitalMars)
    {
        // DMD: build the result with element-wise stores (avoids the shufflevector path).
        __m128i r;
        r.ptr[0] = a.array[0];
        r.ptr[1] = b.array[0];
        r.ptr[2] = a.array[1];
        r.ptr[3] = b.array[1];
        return r;
    }
    else
    {
        return shufflevector!(int4, 0, 4, 1, 5)(cast(int4)a, cast(int4)b);
    }
}
unittest
{
    __m128i A = _mm_setr_epi32(1, 2, 3, 4);
    __m128i B = _mm_setr_epi32(5, 6, 7, 8);
    __m128i C = _mm_unpacklo_epi32(A, B);
    int[4] correct = [1, 5, 2, 6];
    assert(C.array == correct);
}

/// Unpack and interleave 64-bit integers from the low half of `a` and `b`.
__m128i _mm_unpacklo_epi64 (__m128i a, __m128i b) pure @trusted
{
    static if (GDC_with_SSE2)
    {
        return cast(__m128i) __builtin_ia32_punpcklqdq128(cast(long2) a, cast(long2) b);
    }
    else
    {
        long2 lA = cast(long2)a;
        long2 lB = cast(long2)b;
        long2 R;
        R.ptr[0] = lA.array[0];
        R.ptr[1] = lB.array[0];
        return cast(__m128i)R;
    }
}
unittest // Issue #36
{
    __m128i A = _mm_setr_epi64(0x22222222_22222222, 0x33333333_33333333);
    __m128i B = _mm_setr_epi64(0x44444444_44444444, 0x55555555_55555555);
    long2 C = cast(long2)(_mm_unpacklo_epi64(A, B));
    long[2] correct = [0x22222222_22222222, 0x44444444_44444444];
    assert(C.array == correct);
}

/// Unpack and interleave 8-bit integers from the low half of `a` and `b`.
__m128i _mm_unpacklo_epi8 (__m128i a, __m128i b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return cast(__m128i) __builtin_ia32_punpcklbw128(cast(ubyte16) a, cast(ubyte16) b);
    }
    else static if (DMD_with_32bit_asm)
    {
        asm pure nothrow @nogc @trusted
        {
            movdqu XMM0, a;
            movdqu XMM1, b;
            punpcklbw XMM0, XMM1;
            movdqu a, XMM0;
        }
        return a;
    }
    else
    {
        return cast(__m128i) shufflevector!(byte16, 0, 16, 1, 17, 2, 18, 3, 19,
                                                    4, 20, 5, 21, 6, 22, 7, 23)
                                           (cast(byte16)a, cast(byte16)b);
    }
}
unittest
{
    __m128i A = _mm_setr_epi8( 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15);
    __m128i B = _mm_setr_epi8(16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31);
    byte16 C = cast(byte16) _mm_unpacklo_epi8(A, B);
    byte[16] correct = [0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23];
    assert(C.array == correct);
}
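
// Interleaving with zero is a common way to zero-extend unsigned bytes:
// each of the low 8 bytes of `a` becomes the low byte of a 16-bit lane.
// A minimal sketch of that idiom:
unittest
{
    __m128i bytes = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
    short8 widened = cast(short8) _mm_unpacklo_epi8(bytes, _mm_setzero_si128());
    short[8] correct = [0, 1, 2, 3, 4, 5, 6, 7];
    assert(widened.array == correct);
}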

/// Unpack and interleave double-precision (64-bit) floating-point elements from the low half of `a` and `b`.
__m128d _mm_unpacklo_pd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_unpcklpd(a, b);
    }
    else
    {
        return shufflevector!(__m128d, 0, 2)(a, b);
    }
}
unittest
{
    __m128d A = _mm_setr_pd(4.0, 6.0);
    __m128d B = _mm_setr_pd(7.0, 9.0);
    __m128d C = _mm_unpacklo_pd(A, B);
    double[2] correct = [4.0, 7.0];
    assert(C.array == correct);
}

/// Compute the bitwise XOR of packed double-precision (64-bit) floating-point elements in `a` and `b`.
__m128d _mm_xor_pd (__m128d a, __m128d b) pure @safe
{
    return cast(__m128d)(cast(__m128i)a ^ cast(__m128i)b);
}
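unittest // a typical use: flip signs by XOR-ing with -0.0, which has only the sign bit set
{
    __m128d A = _mm_setr_pd(-4.0, 6.0);
    __m128d signMask = _mm_set1_pd(-0.0); // sign bit of each lane
    __m128d C = _mm_xor_pd(A, signMask);  // negates both lanes
    double[2] correct = [4.0, -6.0];
    assert(C.array == correct);
}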

/// Compute the bitwise XOR of 128 bits (representing integer data) in `a` and `b`.
__m128i _mm_xor_si128 (__m128i a, __m128i b) pure @safe
{
    return a ^ b;
}
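unittest // XOR against a constant mask flips the selected bits in every lane
{
    __m128i A = _mm_setr_epi32(0, 1, 2, 3);
    __m128i mask = _mm_set1_epi32(3); // flip the two lowest bits of each lane
    int4 C = cast(int4) _mm_xor_si128(A, mask);
    int[4] correct = [3, 2, 1, 0];
    assert(C.array == correct);
}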

unittest // example: Euclidean distance between two 4-D vectors, using a SIMD horizontal sum
{
    float distance(float[4] a, float[4] b) nothrow @nogc
    {
        __m128 va = _mm_loadu_ps(a.ptr);
        __m128 vb = _mm_loadu_ps(b.ptr);
        __m128 diffSquared = _mm_sub_ps(va, vb);
        diffSquared = _mm_mul_ps(diffSquared, diffSquared);
        // Horizontal sum: shift right by 8 bytes to add lanes 2 and 3 onto lanes 0 and 1,
        // then by 4 bytes to fold the remaining pair into lane 0.
        __m128 sum = _mm_add_ps(diffSquared, _mm_srli_ps!8(diffSquared));
        sum = _mm_add_ps(sum, _mm_srli_ps!4(sum));
        return _mm_cvtss_f32(_mm_sqrt_ss(sum)); // lane 0 holds the distance
    }
    assert(distance([0, 2, 0, 0], [0, 0, 0, 0]) == 2);
}