1 /**
2 * SSE2 intrinsics. 
3 *
4 * Copyright: Copyright Guillaume Piolat 2016-2020, Stefanos Baziotis 2019.
5 * License:   $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
6 */
7 module inteli.emmintrin;
8 
9 public import inteli.types;
10 public import inteli.xmmintrin; // SSE2 includes SSE1
11 import inteli.mmx;
12 import inteli.internals;
13 
14 nothrow @nogc:
15 
16 
17 // SSE2 instructions
18 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#techs=SSE2
19 
20 /// Add packed 16-bit integers in `a` and `b`.
21 __m128i _mm_add_epi16 (__m128i a, __m128i b) pure @safe
22 {
23     return cast(__m128i)(cast(short8)a + cast(short8)b);
24 }
25 unittest
26 {
27     __m128i A = _mm_setr_epi16(4, 8, 13, -7, -1, 0, 9, 77);
28     short8 R = cast(short8) _mm_add_epi16(A, A);
29     short[8] correct = [8, 16, 26, -14, -2, 0, 18, 154];
30     assert(R.array == correct);
31 }
32 
33 /// Add packed 32-bit integers in `a` and `b`.
34 __m128i _mm_add_epi32 (__m128i a, __m128i b) pure @safe
35 {
36     return cast(__m128i)(cast(int4)a + cast(int4)b);
37 }
38 unittest
39 {
40     __m128i A = _mm_setr_epi32( -7, -1, 0, 9);
41     int4 R = _mm_add_epi32(A, A);
42     int[4] correct = [ -14, -2, 0, 18 ];
43     assert(R.array == correct);
44 }
45 
46 /// Add packed 64-bit integers in `a` and `b`.
47 __m128i _mm_add_epi64 (__m128i a, __m128i b) pure @safe
48 {
49     return cast(__m128i)(cast(long2)a + cast(long2)b);
50 }
51 unittest
52 {
    __m128i A = _mm_setr_epi64(-1, cast(long)0x8000_0000_0000_0000);
54     long2 R = cast(long2) _mm_add_epi64(A, A);
55     long[2] correct = [ -2, 0 ];
56     assert(R.array == correct);
57 }
58 
59 /// Add packed 8-bit integers in `a` and `b`.
60 __m128i _mm_add_epi8 (__m128i a, __m128i b) pure @safe
61 {
62     return cast(__m128i)(cast(byte16)a + cast(byte16)b);
63 }
64 unittest
65 {
66     __m128i A = _mm_setr_epi8(4, 8, 13, -7, -1, 0, 9, 77, 4, 8, 13, -7, -1, 0, 9, 78);
67     byte16 R = cast(byte16) _mm_add_epi8(A, A);
68     byte[16] correct = [8, 16, 26, -14, -2, 0, 18, -102, 8, 16, 26, -14, -2, 0, 18, -100];
69     assert(R.array == correct);
70 }
71 
/// Add the lower double-precision (64-bit) floating-point elements 
/// in `a` and `b`, store the result in the lower element of the result, 
/// and copy the upper element from `a` to the upper element of the result.
75 __m128d _mm_add_sd(__m128d a, __m128d b) pure @safe
76 {
77     static if (GDC_with_SSE2)
78     {
79         return __builtin_ia32_addsd(a, b);
80     }
81     else version(DigitalMars)
82     {
83         // Work-around for https://issues.dlang.org/show_bug.cgi?id=19599
        // Note: this workaround seems unneeded since at least DMD 2.094.0; not investigated further.
85         asm pure nothrow @nogc @trusted { nop;}
86         a[0] = a[0] + b[0];
87         return a;
88     }
89     else
90     {
91         a[0] += b[0];
92         return a;
93     }
94 }
95 unittest
96 {
97     __m128d a = [1.5, -2.0];
98     a = _mm_add_sd(a, a);
99     assert(a.array == [3.0, -2.0]);
100 }
101 
102 /// Add packed double-precision (64-bit) floating-point elements in `a` and `b`.
103 __m128d _mm_add_pd (__m128d a, __m128d b) pure @safe
104 {
105     return a + b;
106 }
107 unittest
108 {
109     __m128d a = [1.5, -2.0];
110     a = _mm_add_pd(a, a);
111     assert(a.array == [3.0, -4.0]);
112 }
113 
114 /// Add 64-bit integers `a` and `b`.
115 __m64 _mm_add_si64 (__m64 a, __m64 b) pure @safe
116 {
117     return a + b;
118 }
119 
120 /// Add packed 16-bit integers in `a` and `b` using signed saturation.
121 __m128i _mm_adds_epi16(__m128i a, __m128i b) pure @trusted
122 {
123     static if (GDC_with_SSE2)
124     {
125         return cast(__m128i)__builtin_ia32_paddsw128(cast(short8)a, cast(short8)b);
126     }
127     else version(LDC)
128     {
129         static if (__VERSION__ >= 2085) // saturation x86 intrinsics disappeared in LLVM 8
130         {
131             // x86: Generates PADDSW since LDC 1.15 -O0
132             // ARM: Generates sqadd.8h since LDC 1.21 -O1, really bad in <= 1.20            
133             enum prefix = `declare <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> %a, <8 x i16> %b)`;
134             enum ir = `
135                 %r = call <8 x i16> @llvm.sadd.sat.v8i16( <8 x i16> %0, <8 x i16> %1)
136                 ret <8 x i16> %r`;
137             return cast(__m128i) LDCInlineIREx!(prefix, ir, "", short8, short8, short8)(cast(short8)a, cast(short8)b);
138         }
        else static if (LDC_with_ARM) // Raspberry Pi ships with LDC 1.12, which lacks saturation intrinsics
140         {
141             short[8] res;
142             short8 sa = cast(short8)a;
143             short8 sb = cast(short8)b;
144             foreach(i; 0..8)
145                 res[i] = saturateSignedIntToSignedShort(sa.array[i] + sb.array[i]);
146             return _mm_loadu_si128(cast(int4*)res.ptr);
147         }
148         else
149             return cast(__m128i) __builtin_ia32_paddsw128(cast(short8)a, cast(short8)b);
150     }
151     else
152     {
153         short[8] res;
154         short8 sa = cast(short8)a;
155         short8 sb = cast(short8)b;
156         foreach(i; 0..8)
157             res[i] = saturateSignedIntToSignedShort(sa.array[i] + sb.array[i]);
158         return _mm_loadu_si128(cast(int4*)res.ptr);
159     }
160 }
161 unittest
162 {
163     short8 res = cast(short8) _mm_adds_epi16(_mm_set_epi16(7, 6, 5, 4, 3, 2, 1, 0),
164                                              _mm_set_epi16(7, 6, 5, 4, 3, 2, 1, 0));
165     static immutable short[8] correctResult = [0, 2, 4, 6, 8, 10, 12, 14];
166     assert(res.array == correctResult);
167 }
168 
169 /// Add packed 8-bit signed integers in `a` and `b` using signed saturation.
170 __m128i _mm_adds_epi8(__m128i a, __m128i b) pure @trusted
171 {
172     static if (GDC_with_SSE2)
173     {
174         return cast(__m128i) __builtin_ia32_paddsb128(cast(ubyte16)a, cast(ubyte16)b);
175     }
176     else version(LDC)
177     {
178         static if (__VERSION__ >= 2085) // saturation x86 intrinsics disappeared in LLVM 8
179         {
180             // x86: Generates PADDSB since LDC 1.15 -O0
181             // ARM: Generates sqadd.16b since LDC 1.21 -O1, really bad in <= 1.20
182             enum prefix = `declare <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8> %a, <16 x i8> %b)`;
183             enum ir = `
184                 %r = call <16 x i8> @llvm.sadd.sat.v16i8( <16 x i8> %0, <16 x i8> %1)
185                 ret <16 x i8> %r`;
186             return cast(__m128i) LDCInlineIREx!(prefix, ir, "", byte16, byte16, byte16)(cast(byte16)a, cast(byte16)b);
187         }
        else static if (LDC_with_ARM) // Raspberry Pi ships with LDC 1.12, which lacks saturation intrinsics
189         {
190             byte[16] res;
191             byte16 sa = cast(byte16)a;
192             byte16 sb = cast(byte16)b;
193             foreach(i; 0..16)
194                 res[i] = saturateSignedWordToSignedByte(sa[i] + sb[i]);
195             return _mm_loadu_si128(cast(int4*)res.ptr);
196         }
197         else
198             return cast(__m128i) __builtin_ia32_paddsb128(cast(byte16)a, cast(byte16)b);
199     }
200     else
201     {
202         byte[16] res;
203         byte16 sa = cast(byte16)a;
204         byte16 sb = cast(byte16)b;
205         foreach(i; 0..16)
206             res[i] = saturateSignedWordToSignedByte(sa[i] + sb[i]);
207         return _mm_loadu_si128(cast(int4*)res.ptr);
208     }
209 }
210 unittest
211 {
212     byte16 res = cast(byte16) _mm_adds_epi8(_mm_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0),
213                                             _mm_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0));
214     static immutable byte[16] correctResult = [0, 2, 4, 6, 8, 10, 12, 14,
215                                                16, 18, 20, 22, 24, 26, 28, 30];
216     assert(res.array == correctResult);
217 }
218 
219 /// Add packed 8-bit unsigned integers in `a` and `b` using unsigned saturation.
220 // PERF: #GDC version?
221 __m128i _mm_adds_epu8(__m128i a, __m128i b) pure @trusted
222 {
223     version(LDC)
224     {
225         static if (__VERSION__ >= 2085) // saturation x86 intrinsics disappeared in LLVM 8
226         {
227             // x86: Generates PADDUSB since LDC 1.15 -O0
228             // ARM: Generates uqadd.16b since LDC 1.21 -O1
229             enum prefix = `declare <16 x i8> @llvm.uadd.sat.v16i8(<16 x i8> %a, <16 x i8> %b)`;
230             enum ir = `
231                 %r = call <16 x i8> @llvm.uadd.sat.v16i8( <16 x i8> %0, <16 x i8> %1)
232                 ret <16 x i8> %r`;
233             return cast(__m128i) LDCInlineIREx!(prefix, ir, "", byte16, byte16, byte16)(cast(byte16)a, cast(byte16)b);
234         }
        else static if (LDC_with_ARM) // Raspberry Pi ships with LDC 1.12, which lacks saturation intrinsics
236         {
237             ubyte[16] res;
238             byte16 sa = cast(byte16)a;
239             byte16 sb = cast(byte16)b;
240             foreach(i; 0..16)
241                 res[i] = saturateSignedWordToUnsignedByte(cast(ubyte)(sa.array[i]) + cast(ubyte)(sb.array[i]));
242             return _mm_loadu_si128(cast(int4*)res.ptr);
243         }
244         else
            return cast(__m128i) __builtin_ia32_paddusb128(cast(byte16)a, cast(byte16)b);
246     }
247     else
248     {
249         ubyte[16] res;
250         byte16 sa = cast(byte16)a;
251         byte16 sb = cast(byte16)b;
252         foreach(i; 0..16)
253             res[i] = saturateSignedWordToUnsignedByte(cast(ubyte)(sa.array[i]) + cast(ubyte)(sb.array[i]));
254         return _mm_loadu_si128(cast(int4*)res.ptr);
255     }
256 }
257 unittest
258 {
259     byte16 res = cast(byte16) 
260         _mm_adds_epu8(_mm_set_epi8(7, 6, 5, 4, 3, 2, cast(byte)255, 0, 7, 6, 5, 4, 3, 2, cast(byte)255, 0),
261                       _mm_set_epi8(7, 6, 5, 4, 3, 2, 1, 0, 7, 6, 5, 4, 3, 2, 1, 0));
262     static immutable byte[16] correctResult = [0, cast(byte)255, 4, 6, 8, 10, 12, 14, 
263                                                0, cast(byte)255, 4, 6, 8, 10, 12, 14];
264     assert(res.array == correctResult);
265 }
266 
267 /// Add packed unsigned 16-bit integers in `a` and `b` using unsigned saturation.
268 // PERF: #GDC version?
269 __m128i _mm_adds_epu16(__m128i a, __m128i b) pure @trusted
270 {
271     version(LDC)
272     {
273         static if (__VERSION__ >= 2085) // saturation x86 intrinsics disappeared in LLVM 8
274         {
275             // x86: Generates PADDUSW since LDC 1.15 -O0
276             // ARM: Generates uqadd.8h since LDC 1.21 -O1
277             enum prefix = `declare <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> %a, <8 x i16> %b)`;
278             enum ir = `
279                 %r = call <8 x i16> @llvm.uadd.sat.v8i16( <8 x i16> %0, <8 x i16> %1)
280                 ret <8 x i16> %r`;
281             return cast(__m128i) LDCInlineIREx!(prefix, ir, "", short8, short8, short8)(cast(short8)a, cast(short8)b);
282         }
        else static if (LDC_with_ARM) // Raspberry Pi ships with LDC 1.12, which lacks saturation intrinsics
284         {
285             ushort[8] res;
286             short8 sa = cast(short8)a;
287             short8 sb = cast(short8)b;
288             foreach(i; 0..8)
289                 res[i] = saturateSignedIntToUnsignedShort(cast(ushort)(sa.array[i]) + cast(ushort)(sb.array[i]));
290             return _mm_loadu_si128(cast(int4*)res.ptr);
291         }
292         else
            return cast(__m128i) __builtin_ia32_paddusw128(cast(short8)a, cast(short8)b);
294     }
295     else
296     {
297         ushort[8] res;
298         short8 sa = cast(short8)a;
299         short8 sb = cast(short8)b;
300         foreach(i; 0..8)
301             res[i] = saturateSignedIntToUnsignedShort(cast(ushort)(sa.array[i]) + cast(ushort)(sb.array[i]));
302         return _mm_loadu_si128(cast(int4*)res.ptr);
303     }
304 }
305 unittest
306 {
307     short8 res = cast(short8) _mm_adds_epu16(_mm_set_epi16(3, 2, cast(short)65535, 0, 3, 2, cast(short)65535, 0),
308                                              _mm_set_epi16(3, 2, 1, 0, 3, 2, 1, 0));
309     static immutable short[8] correctResult = [0, cast(short)65535, 4, 6, 0, cast(short)65535, 4, 6];
310     assert(res.array == correctResult);
311 }
312 
313 /// Compute the bitwise AND of packed double-precision (64-bit) 
314 /// floating-point elements in `a` and `b`.
315 __m128d _mm_and_pd (__m128d a, __m128d b) pure @safe
316 {
317     return cast(__m128d)( cast(long2)a & cast(long2)b );
318 }
319 unittest
320 {
321     double a = 4.32;
322     double b = -78.99;
323     long correct = (*cast(long*)(&a)) & (*cast(long*)(&b));
324     __m128d A = _mm_set_pd(a, b);
325     __m128d B = _mm_set_pd(b, a);
326     long2 R = cast(long2)( _mm_and_pd(A, B) );
327     assert(R.array[0] == correct);
328     assert(R.array[1] == correct);
329 }
330 
331 /// Compute the bitwise AND of 128 bits (representing integer data) in `a` and `b`.
332 __m128i _mm_and_si128 (__m128i a, __m128i b) pure @safe
333 {
334     return a & b;
335 }
336 unittest
337 {
338     __m128i A = _mm_set1_epi32(7);
339     __m128i B = _mm_set1_epi32(14);
340     __m128i R = _mm_and_si128(A, B);
341     int[4] correct = [6, 6, 6, 6];
342     assert(R.array == correct);
343 }
344 
345 /// Compute the bitwise NOT of packed double-precision (64-bit) 
346 /// floating-point elements in `a` and then AND with `b`.
347 __m128d _mm_andnot_pd (__m128d a, __m128d b) pure @safe
348 {
349     return cast(__m128d)( ~(cast(long2)a) & cast(long2)b);
350 }
351 unittest
352 {
353     double a = 4.32;
354     double b = -78.99;
355     long correct  = (~*cast(long*)(&a)) & ( *cast(long*)(&b));
356     long correct2 = ( *cast(long*)(&a)) & (~*cast(long*)(&b));
357     __m128d A = _mm_setr_pd(a, b);
358     __m128d B = _mm_setr_pd(b, a);
359     long2 R = cast(long2)( _mm_andnot_pd(A, B) );
360     assert(R.array[0] == correct);
361     assert(R.array[1] == correct2);
362 }
363 
364 /// Compute the bitwise NOT of 128 bits (representing integer data) 
365 /// in `a` and then AND with `b`.
366 __m128i _mm_andnot_si128 (__m128i a, __m128i b) pure @safe
367 {
368     return (~a) & b;
369 }
370 unittest
371 {
372     __m128i A = _mm_set1_epi32(7);
373     __m128i B = _mm_set1_epi32(14);
374     __m128i R = _mm_andnot_si128(A, B);
375     int[4] correct = [8, 8, 8, 8];
376     assert(R.array == correct);
377 }
378 
379 /// Average packed unsigned 16-bit integers in `a` and `b`.
380 __m128i _mm_avg_epu16 (__m128i a, __m128i b) pure @trusted
381 {
382     static if (GDC_with_SSE2)
383     {
384         return cast(__m128i) __builtin_ia32_pavgw128(cast(short8)a, cast(short8)b);
385     }
386     else static if (LDC_with_ARM64)
387     {
388         return cast(__m128i) vrhadd_u16(cast(short8)a, cast(short8)b);
389     }
390     else version(LDC)
391     {
        // Generates pavgw even in LDC 1.0, even at -O0
        // (but not on ARM)
394         enum ir = `
395             %ia = zext <8 x i16> %0 to <8 x i32>
396             %ib = zext <8 x i16> %1 to <8 x i32>
397             %isum = add <8 x i32> %ia, %ib
398             %isum1 = add <8 x i32> %isum, < i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
399             %isums = lshr <8 x i32> %isum1, < i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
400             %r = trunc <8 x i32> %isums to <8 x i16>
401             ret <8 x i16> %r`;
402         return cast(__m128i) LDCInlineIR!(ir, short8, short8, short8)(cast(short8)a, cast(short8)b);
403     }
404     else
405     {
406         short8 sa = cast(short8)a;
407         short8 sb = cast(short8)b;
408         short8 sr = void;
409         foreach(i; 0..8)
410         {
411             sr.ptr[i] = cast(ushort)( (cast(ushort)(sa.array[i]) + cast(ushort)(sb.array[i]) + 1) >> 1 );
412         }
413         return cast(int4)sr;
414     }
415 }
416 unittest
417 {
418     __m128i A = _mm_set1_epi16(31);
419     __m128i B = _mm_set1_epi16(64);
420     short8 avg = cast(short8)(_mm_avg_epu16(A, B));
421     foreach(i; 0..8)
422         assert(avg.array[i] == 48);
423 }
424 
425 /// Average packed unsigned 8-bit integers in `a` and `b`.
426 __m128i _mm_avg_epu8 (__m128i a, __m128i b) pure @trusted
427 {
428     static if (GDC_with_SSE2)
429     {
430         return cast(__m128i) __builtin_ia32_pavgb128(cast(ubyte16)a, cast(ubyte16)b);
431     }
432     else static if (LDC_with_ARM64)
433     {
434         return cast(__m128i) vrhadd_u8(cast(byte16)a, cast(byte16)b);
435     }
436     else version(LDC)
437     {
        // Generates pavgb even in LDC 1.0, even at -O0
        // (but not on ARM)
440         enum ir = `
441             %ia = zext <16 x i8> %0 to <16 x i16>
442             %ib = zext <16 x i8> %1 to <16 x i16>
443             %isum = add <16 x i16> %ia, %ib
444             %isum1 = add <16 x i16> %isum, < i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
445             %isums = lshr <16 x i16> %isum1, < i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
446             %r = trunc <16 x i16> %isums to <16 x i8>
447             ret <16 x i8> %r`;
448         return cast(__m128i) LDCInlineIR!(ir, byte16, byte16, byte16)(cast(byte16)a, cast(byte16)b);
449     }
450     else
451     {
452         byte16 sa = cast(byte16)a;
453         byte16 sb = cast(byte16)b;
454         byte16 sr = void;
455         foreach(i; 0..16)
456         {
457             sr[i] = cast(ubyte)( (cast(ubyte)(sa[i]) + cast(ubyte)(sb[i]) + 1) >> 1 );
458         }
459         return cast(int4)sr;
460     }
461 }
462 unittest
463 {
464     __m128i A = _mm_set1_epi8(31);
465     __m128i B = _mm_set1_epi8(64);
466     byte16 avg = cast(byte16)(_mm_avg_epu8(A, B));
467     foreach(i; 0..16)
468         assert(avg.array[i] == 48);
469 }
470 
471 /// Shift `a` left by `bytes` bytes while shifting in zeros.
472 alias _mm_bslli_si128 = _mm_slli_si128;
473 unittest
474 {
475     __m128i toShift = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
476     byte[16] exact =               [0, 0, 0, 0, 0, 0, 1, 2, 3, 4,  5,  6,  7,  8,  9, 10];
477     __m128i result = _mm_bslli_si128!5(toShift);
478     assert( (cast(byte16)result).array == exact);
479 }
480 
481 /// Shift `v` right by `bytes` bytes while shifting in zeros.
482 alias _mm_bsrli_si128 = _mm_srli_si128;
483 unittest
484 {
485     __m128i toShift = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
486     byte[16] exact =               [5, 6, 7, 8, 9,10,11,12,13,14, 15,  0,  0,  0,  0,  0];
487     __m128i result = _mm_bsrli_si128!5(toShift);
488     assert( (cast(byte16)result).array == exact);
489 }
490 
491 /// Cast vector of type `__m128d` to type `__m128`. 
492 /// Note: Also possible with a regular `cast(__m128)(a)`.
493 __m128 _mm_castpd_ps (__m128d a) pure @safe
494 {
495     return cast(__m128)a;
496 }
497 
498 /// Cast vector of type `__m128d` to type `__m128i`. 
499 /// Note: Also possible with a regular `cast(__m128i)(a)`.
500 __m128i _mm_castpd_si128 (__m128d a) pure @safe
501 {
502     return cast(__m128i)a;
503 }
504 
505 /// Cast vector of type `__m128` to type `__m128d`. 
506 /// Note: Also possible with a regular `cast(__m128d)(a)`.
507 __m128d _mm_castps_pd (__m128 a) pure @safe
508 {
509     return cast(__m128d)a;
510 }
511 
512 /// Cast vector of type `__m128` to type `__m128i`. 
513 /// Note: Also possible with a regular `cast(__m128i)(a)`.
514 __m128i _mm_castps_si128 (__m128 a) pure @safe
515 {
516     return cast(__m128i)a;
517 }
518 
519 /// Cast vector of type `__m128i` to type `__m128d`. 
520 /// Note: Also possible with a regular `cast(__m128d)(a)`.
521 __m128d _mm_castsi128_pd (__m128i a) pure @safe
522 {
523     return cast(__m128d)a;
524 }
525 
526 /// Cast vector of type `__m128i` to type `__m128`. 
527 /// Note: Also possible with a regular `cast(__m128)(a)`.
528 __m128 _mm_castsi128_ps (__m128i a) pure @safe
529 {
530     return cast(__m128)a;
531 }
532 
533 /// Invalidate and flush the cache line that contains `p` 
534 /// from all levels of the cache hierarchy.
535 void _mm_clflush (const(void)* p) @trusted
536 {
537     static if (GDC_with_SSE2)
538     {
539         __builtin_ia32_clflush(p);
540     }
541     else static if (LDC_with_SSE2)
542     {
543         __builtin_ia32_clflush(cast(void*)p);
544     }
545     else version(D_InlineAsm_X86)
546     {
547         asm pure nothrow @nogc @safe
548         {
549             mov EAX, p;
550             clflush [EAX];
551         }
552     }
553     else version(D_InlineAsm_X86_64)
554     {
555         asm pure nothrow @nogc @safe
556         {
557             mov RAX, p;
558             clflush [RAX];
559         }
560     }
561     else 
562     {
        // Do nothing. Skipping the cache-line invalidation
        // does not affect correctness.
565     }
566 }
567 unittest
568 {
569     ubyte[64] cacheline;
570     _mm_clflush(cacheline.ptr);
571 }
572 
573 /// Compare packed 16-bit integers in `a` and `b` for equality.
574 __m128i _mm_cmpeq_epi16 (__m128i a, __m128i b) pure @safe
575 {
576     static if (GDC_with_SSE2)
577     {
578         return cast(__m128i) __builtin_ia32_pcmpeqw128(cast(short8)a, cast(short8)b);
579     }
580     else
581     {
582         return cast(__m128i) equalMask!short8(cast(short8)a, cast(short8)b);
583     }
584 }
585 unittest
586 {
587     short8   A = [-3, -2, -1,  0,  0,  1,  2,  3];
588     short8   B = [ 4,  3,  2,  1,  0, -1, -2, -3];
589     short[8] E = [ 0,  0,  0,  0, -1,  0,  0,  0];
590     short8   R = cast(short8)(_mm_cmpeq_epi16(cast(__m128i)A, cast(__m128i)B));
591     assert(R.array == E);
592 }
593 
594 /// Compare packed 32-bit integers in `a` and `b` for equality.
595 __m128i _mm_cmpeq_epi32 (__m128i a, __m128i b) pure @safe
596 {
597     static if (GDC_with_SSE2)
598     {
599         return __builtin_ia32_pcmpeqd128(a, b);
600     }
601     else
602     {
603         return equalMask!__m128i(a, b);
604     }
605 }
606 unittest
607 {
608     int4   A = [-3, -2, -1,  0];
609     int4   B = [ 4, -2,  2,  0];
610     int[4] E = [ 0, -1,  0, -1];
611     int4   R = cast(int4)(_mm_cmpeq_epi32(A, B));
612     assert(R.array == E);
613 }
614 
615 /// Compare packed 8-bit integers in `a` and `b` for equality.
616 __m128i _mm_cmpeq_epi8 (__m128i a, __m128i b) pure @safe
617 {
618     static if (GDC_with_SSE2)
619     {
620         return cast(__m128i) __builtin_ia32_pcmpeqb128(cast(ubyte16)a, cast(ubyte16)b);
621     }
622     else
623     {
624         return cast(__m128i) equalMask!byte16(cast(byte16)a, cast(byte16)b);
625     }
626 }
627 unittest
628 {
629     __m128i A = _mm_setr_epi8(1, 2, 3, 1, 2, 1, 1, 2, 3, 2, 1, 0, 0, 1, 2, 1);
630     __m128i B = _mm_setr_epi8(2, 2, 1, 2, 3, 1, 2, 3, 2, 1, 0, 0, 1, 2, 1, 1);
631     byte16 C = cast(byte16) _mm_cmpeq_epi8(A, B);
632     byte[16] correct =       [0,-1, 0, 0, 0,-1, 0, 0, 0, 0, 0,-1, 0, 0, 0, -1];
633     assert(C.array == correct);
634 }
635 
636 /// Compare packed double-precision (64-bit) floating-point elements 
637 /// in `a` and `b` for equality.
638 __m128d _mm_cmpeq_pd (__m128d a, __m128d b) pure @safe
639 {
640     static if (GDC_with_SSE2)
641     {
642         return __builtin_ia32_cmpeqpd(a, b);
643     }
644     else
645     {
646         return cast(__m128d) cmppd!(FPComparison.oeq)(a, b);
647     }
648 }
649 
650 /// Compare the lower double-precision (64-bit) floating-point elements
651 /// in `a` and `b` for equality, store the result in the lower element,
652 /// and copy the upper element from `a`.
653 __m128d _mm_cmpeq_sd (__m128d a, __m128d b) pure @safe
654 {
655     static if (GDC_with_SSE2)
656     {
657         return __builtin_ia32_cmpeqsd(a, b);
658     }
659     else
660     {
661         return cast(__m128d) cmpsd!(FPComparison.oeq)(a, b);
662     }
663 }
664 
665 /// Compare packed double-precision (64-bit) floating-point elements 
666 /// in `a` and `b` for greater-than-or-equal.
667 __m128d _mm_cmpge_pd (__m128d a, __m128d b) pure @safe
668 {
669     static if (GDC_with_SSE2)
670     {
671         return __builtin_ia32_cmpgepd(a, b);
672     }
673     else
674     {
675         return cast(__m128d) cmppd!(FPComparison.oge)(a, b);
676     }
677 }
678 
679 /// Compare the lower double-precision (64-bit) floating-point elements 
680 /// in `a` and `b` for greater-than-or-equal, store the result in the 
681 /// lower element, and copy the upper element from `a`.
682 __m128d _mm_cmpge_sd (__m128d a, __m128d b) pure @safe
683 {
684     // Note: There is no __builtin_ia32_cmpgesd builtin.
685     static if (GDC_with_SSE2)
686     {
        return __builtin_ia32_cmplesd(b, a);
688     }
689     else
690     {
691         return cast(__m128d) cmpsd!(FPComparison.oge)(a, b);
692     }
693 }
694 
695 /// Compare packed 16-bit integers in `a` and `b` for greater-than.
696 __m128i _mm_cmpgt_epi16 (__m128i a, __m128i b) pure @safe
697 {
698     static if (GDC_with_SSE2)
699     {
700         return cast(__m128i) __builtin_ia32_pcmpgtw128(cast(short8)a, cast(short8)b);
701     }
702     else
703     {
704         return cast(__m128i) greaterMask!short8(cast(short8)a, cast(short8)b);
705     }
706 }
707 unittest
708 {
709     short8   A = [-3, -2, -1,  0,  0,  1,  2,  3];
710     short8   B = [ 4,  3,  2,  1,  0, -1, -2, -3];
711     short[8] E = [ 0,  0,  0,  0,  0, -1, -1, -1];
712     short8   R = cast(short8)(_mm_cmpgt_epi16(cast(__m128i)A, cast(__m128i)B));
713     assert(R.array == E);
714 }
715 
716 /// Compare packed 32-bit integers in `a` and `b` for greater-than.
717 __m128i _mm_cmpgt_epi32 (__m128i a, __m128i b) pure @safe
718 {
719     static if (GDC_with_SSE2)
720     {
721         return __builtin_ia32_pcmpgtd128(a, b); 
722     }
723     else
724     {
725         return cast(__m128i)( greaterMask!int4(a, b));
726     }
727 }
728 unittest
729 {
730     int4   A = [-3,  2, -1,  0];
731     int4   B = [ 4, -2,  2,  0];
732     int[4] E = [ 0, -1,  0,  0];
733     int4   R = cast(int4)(_mm_cmpgt_epi32(A, B));
734     assert(R.array == E);
735 }
736 
737 /// Compare packed 8-bit integers in `a` and `b` for greater-than.
738 __m128i _mm_cmpgt_epi8 (__m128i a, __m128i b) pure @safe
739 {
740     static if (GDC_with_SSE2)
741     {
742         return cast(__m128i) __builtin_ia32_pcmpgtb128(cast(ubyte16)a, cast(ubyte16)b);
743     }
744     else
745     {
746         return cast(__m128i) greaterMask!byte16(cast(byte16)a, cast(byte16)b);
747     }
748 }
749 unittest
750 {
751     __m128i A = _mm_setr_epi8(1, 2, 3, 1, 2, 1, 1, 2, 3, 2, 1, 0, 0, 1, 2, 1);
752     __m128i B = _mm_setr_epi8(2, 2, 1, 2, 3, 1, 2, 3, 2, 1, 0, 0, 1, 2, 1, 1);
753     byte16 C = cast(byte16) _mm_cmpgt_epi8(A, B);
754     byte[16] correct =       [0, 0,-1, 0, 0, 0, 0, 0,-1,-1,-1, 0, 0, 0,-1, 0];
756     assert(C.array == correct);
757 }
758 
759 /// Compare packed double-precision (64-bit) floating-point elements 
760 /// in `a` and `b` for greater-than.
761 __m128d _mm_cmpgt_pd (__m128d a, __m128d b) pure @safe
762 {
763     static if (GDC_with_SSE2)
764     {
765         return __builtin_ia32_cmpgtpd(a, b); 
766     }
767     else
768     {
769         return cast(__m128d) cmppd!(FPComparison.ogt)(a, b);
770     }
771 }
772 
773 /// Compare the lower double-precision (64-bit) floating-point elements 
774 /// in `a` and `b` for greater-than, store the result in the lower element,
775 /// and copy the upper element from `a`.
776 __m128d _mm_cmpgt_sd (__m128d a, __m128d b) pure @safe
777 {
778     // Note: There is no __builtin_ia32_cmpgtsd builtin.
779     static if (GDC_with_SSE2)
780     {
        return __builtin_ia32_cmpltsd(b, a);
782     }
783     else
784     {
785         return cast(__m128d) cmpsd!(FPComparison.ogt)(a, b);
786     }
787 }
788 
789 /// Compare packed double-precision (64-bit) floating-point elements 
790 /// in `a` and `b` for less-than-or-equal.
791 __m128d _mm_cmple_pd (__m128d a, __m128d b) pure @safe
792 {
793     static if (GDC_with_SSE2)
794     {
795         return __builtin_ia32_cmplepd(a, b); 
796     }
797     else
798     {
799         return cast(__m128d) cmppd!(FPComparison.ole)(a, b);
800     }
801 }
802 
803 /// Compare the lower double-precision (64-bit) floating-point elements 
804 /// in `a` and `b` for less-than-or-equal, store the result in the 
805 /// lower element, and copy the upper element from `a`.
806 __m128d _mm_cmple_sd (__m128d a, __m128d b) pure @safe
807 {
808     static if (GDC_with_SSE2)
809     {
810         return __builtin_ia32_cmplesd(a, b); 
811     }
812     else
813     {
814         return cast(__m128d) cmpsd!(FPComparison.ole)(a, b);
815     }
816 }
817 
818 /// Compare packed 16-bit integers in `a` and `b` for less-than.
819 __m128i _mm_cmplt_epi16 (__m128i a, __m128i b) pure @safe
820 {
821     return _mm_cmpgt_epi16(b, a);
822 }
823 
824 /// Compare packed 32-bit integers in `a` and `b` for less-than.
825 __m128i _mm_cmplt_epi32 (__m128i a, __m128i b) pure @safe
826 {
827     return _mm_cmpgt_epi32(b, a);
828 }
829 
830 /// Compare packed 8-bit integers in `a` and `b` for less-than.
831 __m128i _mm_cmplt_epi8 (__m128i a, __m128i b) pure @safe
832 {
833     return _mm_cmpgt_epi8(b, a);
834 }
835 
836 /// Compare packed double-precision (64-bit) floating-point elements
837 /// in `a` and `b` for less-than.
838 __m128d _mm_cmplt_pd (__m128d a, __m128d b) pure @safe
839 {
840     static if (GDC_with_SSE2)
841     {
842         return __builtin_ia32_cmpltpd(a, b); 
843     }
844     else
845     {
846         return cast(__m128d) cmppd!(FPComparison.olt)(a, b);
847     }
848 }
849 
850 /// Compare the lower double-precision (64-bit) floating-point elements
851 /// in `a` and `b` for less-than, store the result in the lower 
852 /// element, and copy the upper element from `a`.
853 __m128d _mm_cmplt_sd (__m128d a, __m128d b) pure @safe
854 {
855     static if (GDC_with_SSE2)
856     {
857         return __builtin_ia32_cmpltsd(a, b); 
858     }
859     else
860     {
861         return cast(__m128d) cmpsd!(FPComparison.olt)(a, b);
862     }
863 }
864 
865 /// Compare packed double-precision (64-bit) floating-point elements
866 /// in `a` and `b` for not-equal.
867 __m128d _mm_cmpneq_pd (__m128d a, __m128d b) pure @safe
868 {
869     static if (GDC_with_SSE2)
870     {
871         return __builtin_ia32_cmpneqpd(a, b); 
872     }
873     else
874     {
875         return cast(__m128d) cmppd!(FPComparison.une)(a, b);
876     }
877 }
878 
879 /// Compare the lower double-precision (64-bit) floating-point elements
880 /// in `a` and `b` for not-equal, store the result in the lower 
881 /// element, and copy the upper element from `a`.
882 __m128d _mm_cmpneq_sd (__m128d a, __m128d b) pure @safe
883 {
884     static if (GDC_with_SSE2)
885     {
886         return __builtin_ia32_cmpneqsd(a, b); 
887     }
888     else
889     {
890         return cast(__m128d) cmpsd!(FPComparison.une)(a, b);
891     }
892 }
893 
894 /// Compare packed double-precision (64-bit) floating-point elements 
895 /// in `a` and `b` for not-greater-than-or-equal.
896 __m128d _mm_cmpnge_pd (__m128d a, __m128d b) pure @safe
897 {
898     static if (GDC_with_SSE2)
899     {
900         return __builtin_ia32_cmpngepd(a, b); 
901     }
902     else
903     {
904         return cast(__m128d) cmppd!(FPComparison.ult)(a, b);
905     }
906 }
907 
908 /// Compare the lower double-precision (64-bit) floating-point elements 
909 /// in `a` and `b` for not-greater-than-or-equal, store the result in 
910 /// the lower element, and copy the upper element from `a`.
911 __m128d _mm_cmpnge_sd (__m128d a, __m128d b) pure @safe
912 {
913     // Note: There is no __builtin_ia32_cmpngesd builtin.
914     static if (GDC_with_SSE2)
915     {
        return __builtin_ia32_cmpnlesd(b, a); 
917     }
918     else
919     {
920         return cast(__m128d) cmpsd!(FPComparison.ult)(a, b);
921     }
922 }
923 
924 /// Compare packed double-precision (64-bit) floating-point elements 
925 /// in `a` and `b` for not-greater-than.
926 __m128d _mm_cmpngt_pd (__m128d a, __m128d b) pure @safe
927 {
928     static if (GDC_with_SSE2)
929     {
930         return __builtin_ia32_cmpngtpd(a, b);
931     }
932     else
933     {
934         return cast(__m128d) cmppd!(FPComparison.ule)(a, b);
935     }
936 }
937 
938 /// Compare the lower double-precision (64-bit) floating-point elements 
939 /// in `a` and `b` for not-greater-than, store the result in the 
940 /// lower element, and copy the upper element from `a`.
941 __m128d _mm_cmpngt_sd (__m128d a, __m128d b) pure @safe
942 {
943     // Note: There is no __builtin_ia32_cmpngtsd builtin.
944     static if (GDC_with_SSE2)
945     {
        return __builtin_ia32_cmpnltsd(b, a);
947     }
948     else
949     {
950         return cast(__m128d) cmpsd!(FPComparison.ule)(a, b);
951     }
952 }
953 
954 /// Compare packed double-precision (64-bit) floating-point elements 
955 /// in `a` and `b` for not-less-than-or-equal.
956 __m128d _mm_cmpnle_pd (__m128d a, __m128d b) pure @safe
957 {
958     static if (GDC_with_SSE2)
959     {
960         return __builtin_ia32_cmpnlepd(a, b);
961     }
962     else
963     {
964         return cast(__m128d) cmppd!(FPComparison.ugt)(a, b);
965     }
966 }
967 
968 /// Compare the lower double-precision (64-bit) floating-point elements 
969 /// in `a` and `b` for not-less-than-or-equal, store the result in the 
970 /// lower element, and copy the upper element from `a`.
971 __m128d _mm_cmpnle_sd (__m128d a, __m128d b) pure @safe
972 {
973     static if (GDC_with_SSE2)
974     {
975         return __builtin_ia32_cmpnlesd(a, b);
976     }
977     else
978     {
979         return cast(__m128d) cmpsd!(FPComparison.ugt)(a, b);
980     }
981 }
982  
983 /// Compare packed double-precision (64-bit) floating-point elements 
984 /// in `a` and `b` for not-less-than.
985 __m128d _mm_cmpnlt_pd (__m128d a, __m128d b) pure @safe
986 {
987     static if (GDC_with_SSE2)
988     {
989         return __builtin_ia32_cmpnltpd(a, b);
990     }
991     else
992     {
993         return cast(__m128d) cmppd!(FPComparison.uge)(a, b);
994     }
995 }
996 
997 /// Compare the lower double-precision (64-bit) floating-point elements 
998 /// in `a` and `b` for not-less-than, store the result in the lower 
999 /// element, and copy the upper element from `a`.
1000 __m128d _mm_cmpnlt_sd (__m128d a, __m128d b) pure @safe
1001 {
1002     static if (GDC_with_SSE2)
1003     {
1004         return __builtin_ia32_cmpnltsd(a, b);
1005     }
1006     else
1007     {
1008         return cast(__m128d) cmpsd!(FPComparison.uge)(a, b);
1009     }
1010 }
1011 
1012 /// Compare packed double-precision (64-bit) floating-point elements 
1013 /// in `a` and `b` to see if neither is NaN.
1014 __m128d _mm_cmpord_pd (__m128d a, __m128d b) pure @safe
1015 {
1016     static if (GDC_with_SSE2)
1017     {
1018         return __builtin_ia32_cmpordpd(a, b);
1019     }
1020     else
1021     {
1022         return cast(__m128d) cmppd!(FPComparison.ord)(a, b);
1023     }
1024 }
1025 
1026 /// Compare the lower double-precision (64-bit) floating-point elements 
1027 /// in `a` and `b` to see if neither is NaN, store the result in the 
1028 /// lower element, and copy the upper element from `a` to the upper element.
1029 __m128d _mm_cmpord_sd (__m128d a, __m128d b) pure @safe
1030 {
1031     static if (GDC_with_SSE2)
1032     {
1033         return __builtin_ia32_cmpordsd(a, b);
1034     }
1035     else
1036     {
1037         return cast(__m128d) cmpsd!(FPComparison.ord)(a, b);
1038     }
1039 }
1040 
1041 /// Compare packed double-precision (64-bit) floating-point elements 
1042 /// in `a` and `b` to see if either is NaN.
1043 __m128d _mm_cmpunord_pd (__m128d a, __m128d b) pure @safe
1044 {
1045     static if (GDC_with_SSE2)
1046     {
1047         return __builtin_ia32_cmpunordpd(a, b);
1048     }
1049     else
1050     {
1051         return cast(__m128d) cmppd!(FPComparison.uno)(a, b);
1052     }
1053 }
1054 
1055 /// Compare the lower double-precision (64-bit) floating-point elements 
1056 /// in `a` and `b` to see if either is NaN, store the result in the lower 
1057 /// element, and copy the upper element from `a` to the upper element.
1058 __m128d _mm_cmpunord_sd (__m128d a, __m128d b) pure @safe
1059 {
1060     static if (GDC_with_SSE2)
1061     {
1062         return __builtin_ia32_cmpunordsd(a, b);
1063     }
1064     else
1065     {
1066         return cast(__m128d) cmpsd!(FPComparison.uno)(a, b);
1067     }
1068 }
1069 
1070 /// Compare the lower double-precision (64-bit) floating-point element 
1071 /// in `a` and `b` for equality, and return the boolean result (0 or 1).
1072 int _mm_comieq_sd (__m128d a, __m128d b) pure @safe
1073 {
    // Note: For some of the _mm_comixx_sx intrinsics, the NaN semantics are not the same as
    // those of the comisd instruction: the intrinsic returns false on unordered operands instead.
    //
    // C++ compilers actually disagree over the meaning of that instruction.
    // GCC handles NaNs like the comisd instruction (returning true when unordered),
    // but ICC, clang and MSVC follow the Intel Intrinsics Guide.
    // We side with the majority; GCC seems to be the buggy one with NaNs.
1081     return a.array[0] == b.array[0];
1082 }
1083 unittest
1084 {
1085     assert(1 == _mm_comieq_sd(_mm_set_sd(78.0), _mm_set_sd(78.0)));
1086     assert(0 == _mm_comieq_sd(_mm_set_sd(78.0), _mm_set_sd(-78.0)));
1087     assert(0 == _mm_comieq_sd(_mm_set_sd(78.0), _mm_set_sd(double.nan)));
1088     assert(0 == _mm_comieq_sd(_mm_set_sd(double.nan), _mm_set_sd(-4.22)));
1089     assert(1 == _mm_comieq_sd(_mm_set_sd(0.0), _mm_set_sd(-0.0)));
1090 }
1091 
1092 /// Compare the lower double-precision (64-bit) floating-point element 
1093 /// in `a` and `b` for greater-than-or-equal, and return the boolean 
1094 /// result (0 or 1).
1095 int _mm_comige_sd (__m128d a, __m128d b) pure @safe
1096 {
1097     return a.array[0] >= b.array[0];
1098 }
1099 unittest
1100 {
1101     assert(1 == _mm_comige_sd(_mm_set_sd(78.0), _mm_set_sd(78.0)));
1102     assert(1 == _mm_comige_sd(_mm_set_sd(78.0), _mm_set_sd(-78.0)));
1103     assert(0 == _mm_comige_sd(_mm_set_sd(-78.0), _mm_set_sd(78.0)));
1104     assert(0 == _mm_comige_sd(_mm_set_sd(78.0), _mm_set_sd(double.nan)));
1105     assert(0 == _mm_comige_sd(_mm_set_sd(double.nan), _mm_set_sd(-4.22)));
1106     assert(1 == _mm_comige_sd(_mm_set_sd(-0.0), _mm_set_sd(0.0)));
1107 }
1108 
1109 /// Compare the lower double-precision (64-bit) floating-point element 
1110 /// in `a` and `b` for greater-than, and return the boolean result (0 or 1).
1111 int _mm_comigt_sd (__m128d a, __m128d b) pure @safe
1112 {
1113     return a.array[0] > b.array[0];
1114 }
1115 unittest
1116 {
1117     assert(0 == _mm_comigt_sd(_mm_set_sd(78.0), _mm_set_sd(78.0)));
1118     assert(1 == _mm_comigt_sd(_mm_set_sd(78.0), _mm_set_sd(-78.0)));
1119     assert(0 == _mm_comigt_sd(_mm_set_sd(78.0), _mm_set_sd(double.nan)));
1120     assert(0 == _mm_comigt_sd(_mm_set_sd(double.nan), _mm_set_sd(-4.22)));
1121     assert(0 == _mm_comigt_sd(_mm_set_sd(0.0), _mm_set_sd(-0.0)));
1122 }
1123 
/// Compare the lower double-precision (64-bit) floating-point element 
/// in `a` and `b` for less-than-or-equal, and return the boolean result (0 or 1).
1126 int _mm_comile_sd (__m128d a, __m128d b) pure @safe
1127 {
1128     return a.array[0] <= b.array[0];
1129 }
1130 unittest
1131 {
1132     assert(1 == _mm_comile_sd(_mm_set_sd(78.0), _mm_set_sd(78.0)));
1133     assert(0 == _mm_comile_sd(_mm_set_sd(78.0), _mm_set_sd(-78.0)));
1134     assert(1 == _mm_comile_sd(_mm_set_sd(-78.0), _mm_set_sd(78.0)));
1135     assert(0 == _mm_comile_sd(_mm_set_sd(78.0), _mm_set_sd(double.nan)));
1136     assert(0 == _mm_comile_sd(_mm_set_sd(double.nan), _mm_set_sd(-4.22)));
1137     assert(1 == _mm_comile_sd(_mm_set_sd(0.0), _mm_set_sd(-0.0)));
1138 }
1139 
1140 /// Compare the lower double-precision (64-bit) floating-point element 
1141 /// in `a` and `b` for less-than, and return the boolean result (0 or 1).
1142 int _mm_comilt_sd (__m128d a, __m128d b) pure @safe
1143 {
1144     return a.array[0] < b.array[0];
1145 }
1146 unittest
1147 {
1148     assert(0 == _mm_comilt_sd(_mm_set_sd(78.0), _mm_set_sd(78.0)));
1149     assert(0 == _mm_comilt_sd(_mm_set_sd(78.0), _mm_set_sd(-78.0)));
1150     assert(1 == _mm_comilt_sd(_mm_set_sd(-78.0), _mm_set_sd(78.0)));
1151     assert(0 == _mm_comilt_sd(_mm_set_sd(78.0), _mm_set_sd(double.nan)));
1152     assert(0 == _mm_comilt_sd(_mm_set_sd(double.nan), _mm_set_sd(-4.22)));
1153     assert(0 == _mm_comilt_sd(_mm_set_sd(-0.0), _mm_set_sd(0.0)));
1154 }
1155 
1156 /// Compare the lower double-precision (64-bit) floating-point element
1157 /// in `a` and `b` for not-equal, and return the boolean result (0 or 1).
1158 int _mm_comineq_sd (__m128d a, __m128d b) pure @safe
1159 {
1160     return a.array[0] != b.array[0];
1161 }
1162 unittest
1163 {
1164     assert(0 == _mm_comineq_sd(_mm_set_sd(78.0), _mm_set_sd(78.0)));
1165     assert(1 == _mm_comineq_sd(_mm_set_sd(78.0), _mm_set_sd(-78.0)));
1166     assert(1 == _mm_comineq_sd(_mm_set_sd(78.0), _mm_set_sd(double.nan)));
1167     assert(1 == _mm_comineq_sd(_mm_set_sd(double.nan), _mm_set_sd(-4.22)));
1168     assert(0 == _mm_comineq_sd(_mm_set_sd(0.0), _mm_set_sd(-0.0)));
1169 }
1170 
1171 /// Convert packed 32-bit integers in `a` to packed double-precision (64-bit)
1172 /// floating-point elements.
__m128d _mm_cvtepi32_pd (__m128i a) pure @trusted
1174 {
1175     version(LDC)
1176     {
1177         // Generates cvtdq2pd since LDC 1.0, even without optimizations
1178         enum ir = `
1179             %v = shufflevector <4 x i32> %0,<4 x i32> %0, <2 x i32> <i32 0, i32 1>
1180             %r = sitofp <2 x i32> %v to <2 x double>
1181             ret <2 x double> %r`;
1182         return cast(__m128d) LDCInlineIR!(ir, __m128d, __m128i)(a);
1183     }
1184     else static if (GDC_with_SSE2)
1185     {
1186         return __builtin_ia32_cvtdq2pd(a);
1187     }
1188     else
1189     {
1190         double2 r = void;
1191         r.ptr[0] = a.array[0];
1192         r.ptr[1] = a.array[1];
1193         return r;
1194     }
1195 }
1196 unittest
1197 {
1198     __m128d A = _mm_cvtepi32_pd(_mm_set1_epi32(54));
1199     assert(A.array[0] == 54.0);
1200     assert(A.array[1] == 54.0);
1201 }
1202 
1203 /// Convert packed 32-bit integers in `a` to packed single-precision (32-bit) 
1204 /// floating-point elements.
1205 __m128 _mm_cvtepi32_ps(__m128i a) pure @trusted
1206 {
1207     static if (GDC_with_SSE2)
1208     {
1209         return __builtin_ia32_cvtdq2ps(a);
1210     }
1211     else
1212     {
        // x86: Generates cvtdq2ps since LDC 1.0.0 -O1
        // ARM: Generates scvtf.4s since LDC 1.8.0 -O2
1215         __m128 res;
1216         res.ptr[0] = cast(float)a.array[0];
1217         res.ptr[1] = cast(float)a.array[1];
1218         res.ptr[2] = cast(float)a.array[2];
1219         res.ptr[3] = cast(float)a.array[3];
1220         return res;
1221     }
1222 }
1223 unittest
1224 {
1225     __m128 a = _mm_cvtepi32_ps(_mm_setr_epi32(-1, 0, 1, 1000));
1226     assert(a.array == [-1.0f, 0.0f, 1.0f, 1000.0f]);
1227 }
1228 
1229 /// Convert packed double-precision (64-bit) floating-point elements 
1230 /// in `a` to packed 32-bit integers.
1231 // PERF ARM32
1232 __m128i _mm_cvtpd_epi32 (__m128d a) @trusted
1233 {
1234     static if (LDC_with_SSE2)
1235     {
1236         return __builtin_ia32_cvtpd2dq(a);
1237     }
1238     else static if (GDC_with_SSE2)
1239     {
1240         return __builtin_ia32_cvtpd2dq(a);
1241     }
1242     else static if (LDC_with_ARM64)
1243     {
1244         // Get current rounding mode.
1245         uint fpscr = arm_get_fpcr();
1246         long2 i;
1247         switch(fpscr & _MM_ROUND_MASK_ARM)
1248         {
1249             default:
1250             case _MM_ROUND_NEAREST_ARM:     i = vcvtnq_s64_f64(a); break;
1251             case _MM_ROUND_DOWN_ARM:        i = vcvtmq_s64_f64(a); break;
1252             case _MM_ROUND_UP_ARM:          i = vcvtpq_s64_f64(a); break;
1253             case _MM_ROUND_TOWARD_ZERO_ARM: i = vcvtzq_s64_f64(a); break;
1254         }
1255         int4 zero = 0;
1256         return cast(__m128i) shufflevector!(int4, 0, 2, 4, 6)(cast(int4)i, zero);
1257     }
1258     else
1259     {
1261         __m128i r = _mm_setzero_si128();
1262         r.ptr[0] = convertDoubleToInt32UsingMXCSR(a.array[0]);
1263         r.ptr[1] = convertDoubleToInt32UsingMXCSR(a.array[1]);
1264         return r;
1265     }
1266 }
1267 unittest
1268 {
1269     int4 A = _mm_cvtpd_epi32(_mm_set_pd(61.0, 55.0));
1270     assert(A.array[0] == 55 && A.array[1] == 61 && A.array[2] == 0 && A.array[3] == 0);
1271 }
1272 
1273 /// Convert packed double-precision (64-bit) floating-point elements in `v`
1274 /// to packed 32-bit integers
1275 __m64 _mm_cvtpd_pi32 (__m128d v) @safe
1276 {
1277     return to_m64(_mm_cvtpd_epi32(v));
1278 }
1279 unittest
1280 {
1281     int2 A = cast(int2) _mm_cvtpd_pi32(_mm_set_pd(61.0, 55.0));
1282     assert(A.array[0] == 55 && A.array[1] == 61);
1283 }
1284 
1285 /// Convert packed double-precision (64-bit) floating-point elements 
1286 /// in `a` to packed single-precision (32-bit) floating-point elements.
1287 __m128 _mm_cvtpd_ps (__m128d a) pure @trusted
1288 {
1289     static if (LDC_with_SSE2)
1290     {
1291         return __builtin_ia32_cvtpd2ps(a); // can't be done with IR unfortunately
1292     }
1293     else static if (GDC_with_SSE2)
1294     {
1295         return __builtin_ia32_cvtpd2ps(a);
1296     }
1297     else
1298     { 
1299         __m128 r = void;
1300         r.ptr[0] = a.array[0];
1301         r.ptr[1] = a.array[1];
1302         r.ptr[2] = 0;
1303         r.ptr[3] = 0;
1304         return r;
1305     }
1306 }
1307 unittest
1308 {
1309     __m128d A = _mm_set_pd(5.25, 4.0);
1310     __m128 B = _mm_cvtpd_ps(A);
1311     assert(B.array == [4.0f, 5.25f, 0, 0]);
1312 }
1313 
1314 /// Convert packed 32-bit integers in `v` to packed double-precision 
1315 /// (64-bit) floating-point elements.
1316 __m128d _mm_cvtpi32_pd (__m64 v) pure @safe
1317 {
1318     return _mm_cvtepi32_pd(to_m128i(v));
1319 }
1320 unittest
1321 {
1322     __m128d A = _mm_cvtpi32_pd(_mm_setr_pi32(4, -5));
1323     assert(A.array[0] == 4.0 && A.array[1] == -5.0);
1324 }
1325 
1326 /// Convert packed single-precision (32-bit) floating-point elements 
1327 /// in `a` to packed 32-bit integers
1328 __m128i _mm_cvtps_epi32 (__m128 a) @trusted
1329 {
1330     static if (LDC_with_SSE2)
1331     {
        // Disabled, since it fails when optimizations are enabled, unfortunately.
1333         //alias _mm_cvtps_epi32 = __builtin_ia32_cvtps2dq;
1334         return __asm!__m128i("cvtps2dq $1,$0","=x,x",a);
1335     }
1336     else static if (GDC_with_SSE2)
1337     {
1338         return __builtin_ia32_cvtps2dq(a);
1339     }
1340     else static if (LDC_with_ARM64)
1341     {
1342         // Get current rounding mode.
1343         uint fpscr = arm_get_fpcr();
1344         switch(fpscr & _MM_ROUND_MASK_ARM)
1345         {
1346             default:
1347             case _MM_ROUND_NEAREST_ARM:     return vcvtnq_s32_f32(a);
1348             case _MM_ROUND_DOWN_ARM:        return vcvtmq_s32_f32(a);
1349             case _MM_ROUND_UP_ARM:          return vcvtpq_s32_f32(a);
1350             case _MM_ROUND_TOWARD_ZERO_ARM: return vcvtzq_s32_f32(a);
1351         }
1352     }
1353     else
1354     {
1355         __m128i r = void;
1356         r.ptr[0] = convertFloatToInt32UsingMXCSR(a.array[0]);
1357         r.ptr[1] = convertFloatToInt32UsingMXCSR(a.array[1]);
1358         r.ptr[2] = convertFloatToInt32UsingMXCSR(a.array[2]);
1359         r.ptr[3] = convertFloatToInt32UsingMXCSR(a.array[3]);
1360         return r;
1361     }
1362 }
1363 unittest
1364 {
1365     // GDC bug #98607
1366     // https://gcc.gnu.org/bugzilla/show_bug.cgi?id=98607
    // GDC does not provide an optimization barrier for the rounding mode, so this test
    // uses different literals for each mode as a workaround; the bug will likely only
    // manifest in unittests. GDC developers provided no actual fix, instead arguing that
    // the other compilers are buggy... when they aren't.
1370 
1371     uint savedRounding = _MM_GET_ROUNDING_MODE();
1372 
1373     _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST);
1374     __m128i A = _mm_cvtps_epi32(_mm_setr_ps(1.4f, -2.1f, 53.5f, -2.9f));
1375     assert(A.array == [1, -2, 54, -3]);
1376 
1377     _MM_SET_ROUNDING_MODE(_MM_ROUND_DOWN);
1378     A = _mm_cvtps_epi32(_mm_setr_ps(1.3f, -2.11f, 53.4f, -2.8f));
1379     assert(A.array == [1, -3, 53, -3]);
1380 
1381     _MM_SET_ROUNDING_MODE(_MM_ROUND_UP);
1382     A = _mm_cvtps_epi32(_mm_setr_ps(1.3f, -2.12f, 53.6f, -2.7f));
1383     assert(A.array == [2, -2, 54, -2]);
1384 
1385     _MM_SET_ROUNDING_MODE(_MM_ROUND_TOWARD_ZERO);
1386     A = _mm_cvtps_epi32(_mm_setr_ps(1.4f, -2.17f, 53.8f, -2.91f));
1387     assert(A.array == [1, -2, 53, -2]);
1388 
1389     _MM_SET_ROUNDING_MODE(savedRounding);
1390 }
1391 
1392 /// Convert packed single-precision (32-bit) floating-point elements 
1393 /// in `a` to packed double-precision (64-bit) floating-point elements.
1394 __m128d _mm_cvtps_pd (__m128 a) pure @trusted
1395 {
1396     version(LDC)
1397     {
1398         // Generates cvtps2pd since LDC 1.0 -O0
1399         enum ir = `
1400             %v = shufflevector <4 x float> %0,<4 x float> %0, <2 x i32> <i32 0, i32 1>
1401             %r = fpext <2 x float> %v to <2 x double>
1402             ret <2 x double> %r`;
1403         return cast(__m128d) LDCInlineIR!(ir, __m128d, __m128)(a);
1404     }
1405     else static if (GDC_with_SSE2)
1406     {
1407         return __builtin_ia32_cvtps2pd(a);
1408     }
1409     else
1410     {
1411         double2 r = void;
1412         r.ptr[0] = a.array[0];
1413         r.ptr[1] = a.array[1];
1414         return r;
1415     }
1416 }
1417 unittest
1418 {
1419     __m128d A = _mm_cvtps_pd(_mm_set1_ps(54.0f));
1420     assert(A.array[0] == 54.0);
1421     assert(A.array[1] == 54.0);
1422 }
1423 
1424 /// Copy the lower double-precision (64-bit) floating-point element of `a`.
1425 double _mm_cvtsd_f64 (__m128d a) pure @safe
1426 {
1427     return a.array[0];
1428 }
1429 
1430 /// Convert the lower double-precision (64-bit) floating-point element
1431 /// in `a` to a 32-bit integer.
1432 int _mm_cvtsd_si32 (__m128d a) @safe
1433 {
1434     static if (LDC_with_SSE2)
1435     {
1436         return __builtin_ia32_cvtsd2si(a);
1437     }
1438     else static if (GDC_with_SSE2)
1439     {
1440         return __builtin_ia32_cvtsd2si(a);
1441     }
1442     else
1443     {
1444         return convertDoubleToInt32UsingMXCSR(a[0]);
1445     }
1446 }
1447 unittest
1448 {
1449     assert(4 == _mm_cvtsd_si32(_mm_set1_pd(4.0)));
1450 }
1451 
1452 /// Convert the lower double-precision (64-bit) floating-point element in `a` to a 64-bit integer.
1453 long _mm_cvtsd_si64 (__m128d a) @trusted
1454 {
1455     version (LDC)
1456     {
1457         version (X86_64)
1458         {
1459             return __builtin_ia32_cvtsd2si64(a);
1460         }
1461         else
1462         {
            // Note: In 32-bit x86, there is no way to convert from float/double to 64-bit integer
            // using SSE instructions only, so the builtin doesn't exist for this arch.
1465             return convertDoubleToInt64UsingMXCSR(a[0]);
1466         }
1467     }
1468     else
1469     {
1470         return convertDoubleToInt64UsingMXCSR(a.array[0]);
1471     }
1472 }
1473 unittest
1474 {
1475     assert(-4 == _mm_cvtsd_si64(_mm_set1_pd(-4.0)));
1476 
1477     uint savedRounding = _MM_GET_ROUNDING_MODE();
1478 
1479     _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST);
1480     assert(-56468486186 == _mm_cvtsd_si64(_mm_set1_pd(-56468486186.49)));
1481 
1482     _MM_SET_ROUNDING_MODE(_MM_ROUND_DOWN);
1483     assert(-56468486187 == _mm_cvtsd_si64(_mm_set1_pd(-56468486186.1)));
1484 
1485     _MM_SET_ROUNDING_MODE(_MM_ROUND_UP);
1486     assert(56468486187 == _mm_cvtsd_si64(_mm_set1_pd(56468486186.1)));
1487 
1488     _MM_SET_ROUNDING_MODE(_MM_ROUND_TOWARD_ZERO);
1489     assert(-56468486186 == _mm_cvtsd_si64(_mm_set1_pd(-56468486186.9)));
1490 
1491     _MM_SET_ROUNDING_MODE(savedRounding);
1492 }
1493 
1494 deprecated("Use _mm_cvtsd_si64 instead") alias _mm_cvtsd_si64x = _mm_cvtsd_si64; ///
1495 
1496 /// Convert the lower double-precision (64-bit) floating-point element in `b` to a single-precision (32-bit) 
1497 /// floating-point element, store that in the lower element of result, and copy the upper 3 packed elements from `a`
1498 /// to the upper elements of result.
1499 __m128 _mm_cvtsd_ss (__m128 a, __m128d b) pure @trusted
1500 {
1501     static if (GDC_with_SSE2)
1502     {
1503         return __builtin_ia32_cvtsd2ss(a, b); 
1504     }
1505     else
1506     {
1507         // Generates cvtsd2ss since LDC 1.3 -O0
1508         a.ptr[0] = b.array[0];
1509         return a;
1510     }
1511 }
1512 unittest
1513 {
1514     __m128 R = _mm_cvtsd_ss(_mm_set1_ps(4.0f), _mm_set1_pd(3.0));
1515     assert(R.array == [3.0f, 4.0f, 4.0f, 4.0f]);
1516 }
1517 
1518 /// Get the lower 32-bit integer in `a`.
1519 int _mm_cvtsi128_si32 (__m128i a) pure @safe
1520 {
1521     return a.array[0];
1522 }
1523 
1524 /// Get the lower 64-bit integer in `a`.
1525 long _mm_cvtsi128_si64 (__m128i a) pure @safe
1526 {
1527     long2 la = cast(long2)a;
1528     return la.array[0];
1529 }
1530 deprecated("Use _mm_cvtsi128_si64 instead") alias _mm_cvtsi128_si64x = _mm_cvtsi128_si64;
1531 
1532 /// Convert the signed 32-bit integer `b` to a double-precision (64-bit) floating-point element, store that in the 
1533 /// lower element of result, and copy the upper element from `a` to the upper element of result.
1534 __m128d _mm_cvtsi32_sd(__m128d a, int b) pure @trusted
1535 {
1536     a.ptr[0] = cast(double)b;
1537     return a;
1538 }
1539 unittest
1540 {
1541     __m128d a = _mm_cvtsi32_sd(_mm_set1_pd(0.0f), 42);
1542     assert(a.array == [42.0, 0]);
1543 }
1544 
1545 /// Copy 32-bit integer `a` to the lower element of result, and zero the upper elements.
1546 __m128i _mm_cvtsi32_si128 (int a) pure @trusted
1547 {
1548     int4 r = [0, 0, 0, 0];
1549     r.ptr[0] = a;
1550     return r;
1551 }
1552 unittest
1553 {
1554     __m128i a = _mm_cvtsi32_si128(65);
1555     assert(a.array == [65, 0, 0, 0]);
1556 }
1557 
/// Convert the signed 64-bit integer `b` to a double-precision (64-bit) floating-point element, store the result in 
/// the lower element of result, and copy the upper element from `a` to the upper element of result.
__m128d _mm_cvtsi64_sd(__m128d a, long b) pure @trusted
1562 {
1563     a.ptr[0] = cast(double)b;
1564     return a;
1565 }
1566 unittest
1567 {
1568     __m128d a = _mm_cvtsi64_sd(_mm_set1_pd(0.0f), 42);
1569     assert(a.array == [42.0, 0]);
1570 }
1571 
1572 /// Copy 64-bit integer `a` to the lower element of result, and zero the upper element.
1573 __m128i _mm_cvtsi64_si128 (long a) pure @trusted
1574 {
1575     long2 r = [0, 0];
1576     r.ptr[0] = a;
1577     return cast(__m128i)(r);
1578 }
1579 
1580 deprecated("Use _mm_cvtsi64_sd instead") alias _mm_cvtsi64x_sd = _mm_cvtsi64_sd; ///
1581 deprecated("Use _mm_cvtsi64_si128 instead") alias _mm_cvtsi64x_si128 = _mm_cvtsi64_si128; ///
1582 
1583 /// Convert the lower single-precision (32-bit) floating-point element in `b` to a double-precision (64-bit) 
1584 /// floating-point element, store that in the lower element of result, and copy the upper element from `a` to the upper 
/// element of result.
1586 double2 _mm_cvtss_sd(double2 a, float4 b) pure @trusted
1587 {
1588     a.ptr[0] = b.array[0];
1589     return a;
1590 }
1591 unittest
1592 {
1593     __m128d a = _mm_cvtss_sd(_mm_set1_pd(0.0f), _mm_set1_ps(42.0f));
1594     assert(a.array == [42.0, 0]);
1595 }
1596 
1597 /// Convert the lower single-precision (32-bit) floating-point element in `a` to a 64-bit integer with truncation.
1598 long _mm_cvttss_si64 (__m128 a) pure @safe
1599 {
1600     return cast(long)(a.array[0]); // Generates cvttss2si as expected
1601 }
1602 unittest
1603 {
1604     assert(1 == _mm_cvttss_si64(_mm_setr_ps(1.9f, 2.0f, 3.0f, 4.0f)));
1605 }
1606 
1607 /// Convert packed double-precision (64-bit) floating-point elements in `a` to packed 32-bit integers with truncation.
1608 /// Put zeroes in the upper elements of result.
1609 __m128i _mm_cvttpd_epi32 (__m128d a) pure @trusted
1610 {
1611     static if (LDC_with_SSE2)
1612     {
1613         return __builtin_ia32_cvttpd2dq(a);
1614     }
1615     else static if (GDC_with_SSE2)
1616     {
1617         return __builtin_ia32_cvttpd2dq(a);
1618     }
1619     else
1620     {
1621         // Note: doesn't generate cvttpd2dq as of LDC 1.13
1622         __m128i r;
1623         r.ptr[0] = cast(int)a.array[0];
1624         r.ptr[1] = cast(int)a.array[1];
1625         r.ptr[2] = 0;
1626         r.ptr[3] = 0;
1627         return r;
1628     }
1629 }
1630 unittest
1631 {
1632     __m128i R = _mm_cvttpd_epi32(_mm_setr_pd(-4.9, 45641.5f));
1633     assert(R.array == [-4, 45641, 0, 0]);
1634 }
1635 
1636 /// Convert packed double-precision (64-bit) floating-point elements in `v` 
1637 /// to packed 32-bit integers with truncation.
1638 __m64 _mm_cvttpd_pi32 (__m128d v) pure @safe
1639 {
1640     return to_m64(_mm_cvttpd_epi32(v));
1641 }
1642 unittest
1643 {
1644     int2 R = cast(int2) _mm_cvttpd_pi32(_mm_setr_pd(-4.9, 45641.7f));
1645     int[2] correct = [-4, 45641];
1646     assert(R.array == correct);
1647 }
1648 
1649 /// Convert packed single-precision (32-bit) floating-point elements in `a` to packed 32-bit integers with truncation.
1650 __m128i _mm_cvttps_epi32 (__m128 a) pure @trusted
1651 {
    // x86: Generates cvttps2dq since LDC 1.3 -O2
    // ARM64: generates fcvtzs since LDC 1.8 -O2
1654     __m128i r;
1655     r.ptr[0] = cast(int)a.array[0];
1656     r.ptr[1] = cast(int)a.array[1];
1657     r.ptr[2] = cast(int)a.array[2];
1658     r.ptr[3] = cast(int)a.array[3];
1659     return r;
1660 }
1661 unittest
1662 {
1663     __m128i R = _mm_cvttps_epi32(_mm_setr_ps(-4.9, 45641.5f, 0.0f, 1.0f));
1664     assert(R.array == [-4, 45641, 0, 1]);
1665 }
1666 
1667 /// Convert the lower double-precision (64-bit) floating-point element in `a` to a 32-bit integer with truncation.
1668 int _mm_cvttsd_si32 (__m128d a)
1669 {
1670     // Generates cvttsd2si since LDC 1.3 -O0
1671     return cast(int)a.array[0];
1672 }
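// Truncation rounds toward zero, hence -4 for -4.9.
unittest
{
    assert(_mm_cvttsd_si32(_mm_setr_pd(-4.9, 45641.5)) == -4);
}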
1673 
1674 /// Convert the lower double-precision (64-bit) floating-point element in `a` to a 64-bit integer with truncation.
1675 long _mm_cvttsd_si64 (__m128d a)
1676 {
    // Generates cvttsd2si since LDC 1.3 -O0,
    // but in 32-bit mode it's a long sequence that resorts to the FPU.
1679     return cast(long)a.array[0];
1680 }
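// Same truncation semantics as above, for the 64-bit variant.
unittest
{
    assert(_mm_cvttsd_si64(_mm_setr_pd(-4.9, 45641.5)) == -4);
}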
1681 
1682 deprecated("Use _mm_cvttsd_si64 instead") alias _mm_cvttsd_si64x = _mm_cvttsd_si64; ///
1683 
1684 /// Divide packed double-precision (64-bit) floating-point elements in `a` by packed elements in `b`.
1685 __m128d _mm_div_pd(__m128d a, __m128d b) pure @safe
1686 {
1687     return a / b;
1688 }
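// Element-wise division; values are arbitrary.
unittest
{
    __m128d a = [6.0, -9.0];
    __m128d b = [3.0, 3.0];
    a = _mm_div_pd(a, b);
    assert(a.array == [2.0, -3.0]);
}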
1689 
/// Divide the lower double-precision (64-bit) floating-point element in `a` by the lower element in `b`, store the 
/// result in the lower element of result, and copy the upper element from `a` to the upper element of result.
__m128d _mm_div_sd(__m128d a, __m128d b) pure @trusted
1691 {
1692     static if (GDC_with_SSE2)
1693     {
1694         return __builtin_ia32_divsd(a, b);
1695     }
1696     else version(DigitalMars)
1697     {
1698         // Work-around for https://issues.dlang.org/show_bug.cgi?id=19599
1699         // Note that this is unneeded since DMD >= 2.094.0 at least, haven't investigated again
1700         asm pure nothrow @nogc @trusted { nop;}
1701         a.array[0] = a.array[0] / b.array[0];
1702         return a;
1703     }
1704     else
1705     {
1706         a.ptr[0] /= b.array[0];
1707         return a;
1708     }
1709 }
1710 unittest
1711 {
1712     __m128d a = [2.0, 4.5];
1713     a = _mm_div_sd(a, a);
1714     assert(a.array == [1.0, 4.5]);
1715 }
1716 
/// Extract a 16-bit integer from `v`, selected with `index`. The result is zero-extended.
1718 int _mm_extract_epi16(__m128i v, int index) pure @safe
1719 {
1720     short8 r = cast(short8)v;
1721     return cast(ushort)(r.array[index]);
1722 }
1723 unittest
1724 {
1725     __m128i A = _mm_set_epi16(7, 6, 5, 4, 3, 2, 1, -1);
1726     assert(_mm_extract_epi16(A, 6) == 6);
1727     assert(_mm_extract_epi16(A, 0) == 65535);
1728 }
1729 
1730 /// Copy `v`, and insert the 16-bit integer `i` at the location specified by `index`.
1731 __m128i _mm_insert_epi16 (__m128i v, int i, int index) @trusted
1732 {
1733     short8 r = cast(short8)v;
1734     r.ptr[index & 7] = cast(short)i;
1735     return cast(__m128i)r;
1736 }
1737 unittest
1738 {
1739     __m128i A = _mm_set_epi16(7, 6, 5, 4, 3, 2, 1, 0);
1740     short8 R = cast(short8) _mm_insert_epi16(A, 42, 6);
1741     short[8] correct = [0, 1, 2, 3, 4, 5, 42, 7];
1742     assert(R.array == correct);
1743 }
1744 
1745 
/// Perform a serializing operation on all load-from-memory instructions that were issued prior to this instruction. 
/// Guarantees that every load instruction that precedes, in program order, the load fence instruction is globally 
/// visible before any load instruction which follows the fence in program order.
void _mm_lfence() @trusted
{
    version(GNU)
    {
        static if (GDC_with_SSE2)
1752         {
1753             __builtin_ia32_lfence();
1754         }
1755         else version(X86)
1756         {
1757             asm pure nothrow @nogc @trusted
1758             {
1759                 "lfence;\n" : : : ;
1760             }
1761         }
1762         else
1763             static assert(false);
1764     }
1765     else static if (LDC_with_SSE2)
1766     {
1767         __builtin_ia32_lfence();
1768     }
1769     else static if (DMD_with_asm)
1770     {
1771         asm nothrow @nogc pure @safe
1772         {
1773             lfence;
1774         }
1775     }
1776     else version(LDC)
1777     {
1778         llvm_memory_fence(); // PERF actually generates mfence
1779     }
1780     else
1781         static assert(false);
1782 }
1783 unittest
1784 {
1785     _mm_lfence();
1786 }
1787 
1788 /// Load 128-bits (composed of 2 packed double-precision (64-bit) floating-point elements) from memory.
1789 /// `mem_addr` must be aligned on a 16-byte boundary or a general-protection exception may be generated.
1790 __m128d _mm_load_pd (const(double) * mem_addr) pure
1791 {
1792     __m128d* aligned = cast(__m128d*)mem_addr;
1793     return *aligned;
1794 }
1795 unittest
1796 {
1797     align(16) double[2] S = [-5.0, 7.0];
1798     __m128d R = _mm_load_pd(S.ptr);
1799     assert(R.array == S);
1800 }
1801 
1802 /// Load a double-precision (64-bit) floating-point element from memory into both elements of dst.
1803 /// `mem_addr` does not need to be aligned on any particular boundary.
1804 __m128d _mm_load_pd1 (const(double)* mem_addr) pure
1805 {
1806     double m = *mem_addr;
1807     __m128d r;
1808     r.ptr[0] = m;
1809     r.ptr[1] = m;
1810     return r;
1811 }
1812 unittest
1813 {
1814     double what = 4;
1815     __m128d R = _mm_load_pd1(&what);
1816     double[2] correct = [4.0, 4];
1817     assert(R.array == correct);
1818 }
1819 
1820 /// Load a double-precision (64-bit) floating-point element from memory into the lower of result, and zero the upper 
1821 /// element. `mem_addr` does not need to be aligned on any particular boundary.
1822 __m128d _mm_load_sd (const(double)* mem_addr) pure @trusted
1823 {
1824     double2 r = [0, 0];
1825     r.ptr[0] = *mem_addr;
1826     return r;
1827 }
1828 unittest
1829 {
1830     double x = -42;
1831     __m128d a = _mm_load_sd(&x);
1832     assert(a.array == [-42.0, 0.0]);
1833 }
1834 
1835 /// Load 128-bits of integer data from memory into dst. 
1836 /// `mem_addr` must be aligned on a 16-byte boundary or a general-protection exception may be generated.
__m128i _mm_load_si128 (const(__m128i)* mem_addr) pure @trusted // TODO: shouldn't be @trusted because of alignment, Issue #62
1838 {
1839     return *mem_addr;
1840 }
1841 unittest
1842 {
1843     align(16) int[4] correct = [-1, 2, 3, 4];
1844     int4 A = cast(int4) _mm_load_si128(cast(__m128i*) correct.ptr);
1845     assert(A.array == correct);
1846 }
1847 
1848 alias _mm_load1_pd = _mm_load_pd1; ///
1849 
1850 /// Load a double-precision (64-bit) floating-point element from memory into the upper element of result, and copy the 
1851 /// lower element from `a` to result. `mem_addr` does not need to be aligned on any particular boundary.
1852 __m128d _mm_loadh_pd (__m128d a, const(double)* mem_addr) pure @trusted
1853 {
1854     a.ptr[1] = *mem_addr;
1855     return a;
1856 }
1857 unittest
1858 {
1859     double A = 7.0;
1860     __m128d B = _mm_setr_pd(4.0, -5.0);
1861     __m128d R = _mm_loadh_pd(B, &A);
1862     double[2] correct = [ 4.0, 7.0 ];
1863     assert(R.array == correct);
1864 }
1865 
1866 /// Load 64-bit integer from memory into the first element of result. Zero out the other.
// Note: strange signature since the memory doesn't have to be aligned (Issue #60)
__m128i _mm_loadl_epi64 (const(__m128i)* mem_addr) pure @trusted // TODO signature
1869 {
1870     auto pLong = cast(const(long)*)mem_addr;
1871     long2 r = [0, 0];
1872     r.ptr[0] = *pLong;
1873     return cast(__m128i)(r);
1874 }
1875 unittest
1876 {
1877     long A = 0x7878787870707070;
1878     long2 R = cast(long2) _mm_loadl_epi64(cast(__m128i*)&A);
1879     long[2] correct = [0x7878787870707070, 0];
1880     assert(R.array == correct);
1881 }
1882 
1883 /// Load a double-precision (64-bit) floating-point element from memory into the lower element of result, and copy the 
/// upper element from `a` to result. `mem_addr` does not need to be aligned on any particular boundary.
1885 __m128d _mm_loadl_pd (__m128d a, const(double)* mem_addr) pure @trusted
1886 {
1887     a.ptr[0] = *mem_addr;
1888     return a;
1889 }
1890 unittest
1891 {
1892     double A = 7.0;
1893     __m128d B = _mm_setr_pd(4.0, -5.0);
1894     __m128d R = _mm_loadl_pd(B, &A);
1895     double[2] correct = [ 7.0, -5.0 ];
1896     assert(R.array == correct);
1897 }
1898 
1899 /// Load 2 double-precision (64-bit) floating-point elements from memory into result in reverse order. 
1900 /// `mem_addr` must be aligned on a 16-byte boundary or a general-protection exception may be generated.
1901 __m128d _mm_loadr_pd (const(double)* mem_addr) pure @trusted
1902 {
1903     __m128d a = *cast(__m128d*)(mem_addr);
1904     __m128d r;
1905     r.ptr[0] = a.array[1];
1906     r.ptr[1] = a.array[0];
1907     return r;
1908 }
1909 unittest
1910 {
1911     align(16) double[2] A = [56.0, -74.0];
1912     __m128d R = _mm_loadr_pd(A.ptr);
1913     double[2] correct = [-74.0, 56.0];
1914     assert(R.array == correct);
1915 }
1916 
1917 /// Load 128-bits (composed of 2 packed double-precision (64-bit) floating-point elements) from memory. 
1918 /// `mem_addr` does not need to be aligned on any particular boundary.
1919 __m128d _mm_loadu_pd (const(double)* mem_addr) pure @trusted
1920 {
1921     static if (GDC_with_SSE2)
1922     {
1923         return __builtin_ia32_loadupd(mem_addr); 
1924     }
1925     else version(LDC)
1926     {
1927         return loadUnaligned!(double2)(mem_addr);
1928     }
1929     else version(DigitalMars)
1930     {
1931         static if (DMD_with_DSIMD)
1932         {
1933             return cast(__m128d)__simd(XMM.LODUPD, *mem_addr);
1934         }
1935         else static if (SSESizedVectorsAreEmulated)
1936         {
            // Since this vector is emulated, it doesn't have alignment constraints
1938             // and as such we can just cast it.
1939             return *cast(__m128d*)(mem_addr);
1940         }
1941         else
1942         {
1943             __m128d result;
1944             result.ptr[0] = mem_addr[0];
1945             result.ptr[1] = mem_addr[1];
1946             return result;
1947         }
1948     }
1949     else
1950     {
1951         __m128d result;
1952         result.ptr[0] = mem_addr[0];
1953         result.ptr[1] = mem_addr[1];
1954         return result;
1955     }
1956 }
1957 unittest
1958 {
1959     double[2] A = [56.0, -75.0];
1960     __m128d R = _mm_loadu_pd(A.ptr);
1961     double[2] correct = [56.0, -75.0];
1962     assert(R.array == correct);
1963 }
1964 
1965 /// Load 128-bits of integer data from memory. `mem_addr` does not need to be aligned on any particular boundary.
1966 __m128i _mm_loadu_si128 (const(__m128i)* mem_addr) pure @trusted
1967 {
1968     static if (GDC_with_SSE2)
1969     {
1970         return cast(__m128i) __builtin_ia32_loaddqu(cast(const(char*))mem_addr);
1971     }
1972     else
1973     {
1974         return loadUnaligned!(__m128i)(cast(int*)mem_addr);
1975     }
1976 }
1977 unittest
1978 {
1979     align(16) int[4] correct = [-1, 2, -3, 4];
1980     int4 A = cast(int4) _mm_loadu_si128(cast(__m128i*) correct.ptr);
1981     assert(A.array == correct);
1982 }
1983 
1984 /// Load unaligned 32-bit integer from memory into the first element of result.
1985 __m128i _mm_loadu_si32 (const(void)* mem_addr) pure @trusted
1986 {
1987     int r = *cast(int*)(mem_addr);
1988     int4 result = [0, 0, 0, 0];
1989     result.ptr[0] = r;
1990     return result;
1991 }
1992 unittest
1993 {
1994     int r = 42;
1995     __m128i A = _mm_loadu_si32(&r);
1996     int[4] correct = [42, 0, 0, 0];
1997     assert(A.array == correct);
1998 }
1999 
2000 /// Multiply packed signed 16-bit integers in `a` and `b`, producing intermediate
2001 /// signed 32-bit integers. Horizontally add adjacent pairs of intermediate 32-bit integers,
2002 /// and pack the results in destination.
2003 __m128i _mm_madd_epi16 (__m128i a, __m128i b) pure @trusted
2004 {
2005     static if (GDC_with_SSE2)
2006     {
2007         return cast(__m128i) __builtin_ia32_pmaddwd128(cast(short8)a, cast(short8)b);
2008     }
2009     else static if (LDC_with_SSE2)
2010     {
2011         return cast(__m128i) __builtin_ia32_pmaddwd128(cast(short8)a, cast(short8)b);
2012     }
2013     else static if (LDC_with_ARM64)
2014     {
2015         int4 pl = vmull_s16(vget_low_s16(cast(short8)a), vget_low_s16(cast(short8)b));
2016         int4 ph = vmull_s16(vget_high_s16(cast(short8)a), vget_high_s16(cast(short8)b));
2017         int2 rl = vpadd_s32(vget_low_s32(pl), vget_high_s32(pl));
2018         int2 rh = vpadd_s32(vget_low_s32(ph), vget_high_s32(ph));
2019         return vcombine_s32(rl, rh);
2020     }
2021     else
2022     {
2023         short8 sa = cast(short8)a;
2024         short8 sb = cast(short8)b;
2025         int4 r;
2026         foreach(i; 0..4)
2027         {
2028             r.ptr[i] = sa.array[2*i] * sb.array[2*i] + sa.array[2*i+1] * sb.array[2*i+1];
2029         }
2030         return r;
2031     }
2032 }
2033 unittest
2034 {
2035     short8 A = [0, 1, 2, 3, -32768, -32768, 32767, 32767];
2036     short8 B = [0, 1, 2, 3, -32768, -32768, 32767, 32767];
2037     int4 R = _mm_madd_epi16(cast(__m128i)A, cast(__m128i)B);
2038     int[4] correct = [1, 13, -2147483648, 2*32767*32767];
2039     assert(R.array == correct);
2040 }
2041 
2042 /// Conditionally store 8-bit integer elements from `a` into memory using `mask`
2043 /// (elements are not stored when the highest bit is not set in the corresponding element)
2044 /// and a non-temporal memory hint. `mem_addr` does not need to be aligned on any particular
2045 /// boundary.
2046 void _mm_maskmoveu_si128 (__m128i a, __m128i mask, void* mem_addr) @trusted
2047 {
2048     static if (GDC_with_SSE2)
2049     {    
2050         return __builtin_ia32_maskmovdqu(cast(ubyte16)a, cast(ubyte16)mask, cast(char*)mem_addr);
2051     }
2052     else static if (LDC_with_SSE2)
2053     {
2054         return __builtin_ia32_maskmovdqu(cast(byte16)a, cast(byte16)mask, cast(char*)mem_addr);
2055     }
2056     else static if (LDC_with_ARM64)
2057     {
2058         // PERF: catastrophic on ARM32
2059         byte16 bmask  = cast(byte16)mask;
2060         byte16 shift = 7;
2061         bmask = bmask >> shift; // sign-extend to have a 0xff or 0x00 mask
2062         mask = cast(__m128i) bmask;
2063         __m128i dest = loadUnaligned!__m128i(cast(int*)mem_addr);
2064         dest = (a & mask) | (dest & ~mask);
2065         storeUnaligned!__m128i(dest, cast(int*)mem_addr);
2066     }
2067     else
2068     {
2069         byte16 b = cast(byte16)a;
2070         byte16 m = cast(byte16)mask;
2071         byte* dest = cast(byte*)(mem_addr);
2072         foreach(j; 0..16)
2073         {
2074             if (m.array[j] & 128)
2075             {
2076                 dest[j] = b.array[j];
2077             }
2078         }
2079     }
2080 }
2081 unittest
2082 {
2083     ubyte[16] dest =           [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42];
2084     __m128i mask = _mm_setr_epi8(0,-1, 0,-1,-1, 1,-1,-1, 0,-1,-4,-1,-1, 0,-127, 0);
2085     __m128i A    = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15);
2086     _mm_maskmoveu_si128(A, mask, dest.ptr);
2087     ubyte[16] correct =        [42, 1,42, 3, 4,42, 6, 7,42, 9,10,11,12,42,14,42];
2088     assert(dest == correct);
2089 }
2090 
2091 /// Compare packed signed 16-bit integers in `a` and `b`, and return packed maximum values.
2092 __m128i _mm_max_epi16 (__m128i a, __m128i b) pure @safe
2093 {
2094     version(GNU)
2095     {
2096         // PERF: not necessarily the best for GDC
2097         __m128i lowerShorts = _mm_cmpgt_epi16(a, b); // ones where a should be selected, b else
2098         __m128i aTob = _mm_xor_si128(a, b); // a ^ (a ^ b) == b
2099         __m128i mask = _mm_and_si128(aTob, lowerShorts);
2100         return _mm_xor_si128(b, mask);
2101     }
2102     else
2103     {
        // x86: pmaxsw since LDC 1.0 -O1
        // ARM64: smax.8h since LDC 1.5 -O1
2106         short8 sa = cast(short8)a;
2107         short8 sb = cast(short8)b;
2108         short8 greater = greaterMask!short8(sa, sb);
2109         return cast(__m128i)( (greater & sa) | (~greater & sb) );
2110     }
2111 }
2112 unittest
2113 {
2114     short8 R = cast(short8) _mm_max_epi16(_mm_setr_epi16(32767, 1, -4, -8, 9,  7, 0,-57),
2115                                           _mm_setr_epi16(-4,-8,  9,  7, 0,-32768, 0,  0));
2116     short[8] correct =                                  [32767, 1,  9,  7, 9,  7, 0,  0];
2117     assert(R.array == correct);
2118 }
2119 
2120 /// Compare packed unsigned 8-bit integers in a and b, and return packed maximum values.
2121 __m128i _mm_max_epu8 (__m128i a, __m128i b) pure @safe
2122 {
2123     version(LDC)
2124     {
2125         // x86: pmaxub since LDC 1.0.0 -O1
2126         // ARM64: umax.16b since LDC 1.5.0 -O1
2127         // PERF: catastrophic on ARM32
2128         ubyte16 sa = cast(ubyte16)a;
2129         ubyte16 sb = cast(ubyte16)b;
2130         ubyte16 greater = cast(ubyte16) greaterMask!ubyte16(sa, sb);
2131         return cast(__m128i)( (greater & sa) | (~greater & sb) );
2132     }
2133     else
2134     {
2135         __m128i value128 = _mm_set1_epi8(-128);
2136         __m128i higher = _mm_cmpgt_epi8(_mm_add_epi8(a, value128), _mm_add_epi8(b, value128)); // signed comparison
2137         __m128i aTob = _mm_xor_si128(a, b); // a ^ (a ^ b) == b
2138         __m128i mask = _mm_and_si128(aTob, higher);
2139         return _mm_xor_si128(b, mask);
2140     }
2141 }
2142 unittest
2143 {
2144     byte16 R = cast(byte16) _mm_max_epu8(_mm_setr_epi8(45, 1, -4, -8, 9,  7, 0,-57, -4,-8,  9,  7, 0,-57, 0,  0),
2145                                          _mm_setr_epi8(-4,-8,  9,  7, 0,-57, 0,  0, 45, 1, -4, -8, 9,  7, 0,-57));
2146     byte[16] correct =                                [-4,-8, -4, -8, 9,-57, 0,-57, -4,-8, -4, -8, 9,-57, 0,-57];
2147     assert(R.array == correct);
2148 }
2149 
2150 /// Compare packed double-precision (64-bit) floating-point elements in `a` and `b`, and return packed maximum values.
2151 __m128d _mm_max_pd (__m128d a, __m128d b) pure @trusted
2152 {
2153     static if (GDC_with_SSE2)
2154     {
2155         return __builtin_ia32_maxpd(a, b);
2156     }
2157     else
2158     {
2159         // x86: Generates maxpd starting with LDC 1.9 -O2
2160         a.ptr[0] = (a.array[0] > b.array[0]) ? a.array[0] : b.array[0];
2161         a.ptr[1] = (a.array[1] > b.array[1]) ? a.array[1] : b.array[1];
2162         return a;
2163     }
2164 }
2165 unittest
2166 {
2167     __m128d A = _mm_setr_pd(4.0, 1.0);
2168     __m128d B = _mm_setr_pd(1.0, 8.0);
2169     __m128d M = _mm_max_pd(A, B);
2170     assert(M.array[0] == 4.0);
2171     assert(M.array[1] == 8.0);
2172 }
2173 
2174 /// Compare the lower double-precision (64-bit) floating-point elements in `a` and `b`, store the maximum value in the 
2175 /// lower element of result, and copy the upper element from `a` to the upper element of result.
2176 __m128d _mm_max_sd (__m128d a, __m128d b) pure @trusted
2177 {
2178     static if (GDC_with_SSE2)
2179     {
2180         return __builtin_ia32_maxsd(a, b);
2181     }
2182     else
2183     {
        // Generates maxsd starting with LDC 1.3
        __m128d r = a;
        r.ptr[0] = (a.array[0] > b.array[0]) ? a.array[0] : b.array[0];
2187         return r;
2188     }
2189 }
2190 unittest
2191 {
2192     __m128d A = _mm_setr_pd(1.0, 1.0);
2193     __m128d B = _mm_setr_pd(4.0, 2.0);
2194     __m128d M = _mm_max_sd(A, B);
2195     assert(M.array[0] == 4.0);
2196     assert(M.array[1] == 1.0);
2197 }
2198 
2199 /// Perform a serializing operation on all load-from-memory and store-to-memory instructions that were issued prior to 
2200 /// this instruction. Guarantees that every memory access that precedes, in program order, the memory fence instruction 
2201 /// is globally visible before any memory instruction which follows the fence in program order.
2202 void _mm_mfence() @trusted
2203 {
2204     version(GNU)
2205     {
2206         static if (GDC_with_SSE2)
2207         {
2208             __builtin_ia32_mfence();
2209         }
2210         else version(X86)
2211         {
2212             asm pure nothrow @nogc @trusted
2213             {
2214                 "mfence;\n" : : : ;
2215             }
2216         }
2217         else
2218             static assert(false);
2219     }
2220     else static if (LDC_with_SSE2)
2221     {
2222         __builtin_ia32_mfence();
2223     }
2224     else static if (DMD_with_asm)
2225     {
2226         asm nothrow @nogc pure @safe
2227         {
2228             mfence;
2229         }
2230     }
    else version(LDC)
    {
        // Note: will generate the DMB instruction on ARM
        llvm_memory_fence();
    }
2239     else
2240         static assert(false);
2241 }
2242 unittest
2243 {
2244     _mm_mfence();
2245 }
2246 
2247 /// Compare packed signed 16-bit integers in `a` and `b`, and return packed minimum values.
2248 __m128i _mm_min_epi16 (__m128i a, __m128i b) pure @safe
2249 {
2250     version(GNU)
2251     {
2252         // PERF: not necessarily the best for GDC
2253         __m128i lowerShorts = _mm_cmplt_epi16(a, b); // ones where a should be selected, b else
2254         __m128i aTob = _mm_xor_si128(a, b); // a ^ (a ^ b) == b
2255         __m128i mask = _mm_and_si128(aTob, lowerShorts);
2256         return _mm_xor_si128(b, mask);
2257     }
2258     else
2259     {
        // x86: pminsw since LDC 1.0 -O1
        // ARM64: smin.8h since LDC 1.5 -O1
2262         short8 sa = cast(short8)a;
2263         short8 sb = cast(short8)b;
2264         short8 greater = greaterMask!short8(sa, sb);
2265         return cast(__m128i)( (~greater & sa) | (greater & sb) );
2266     }
2267 }
2268 unittest
2269 {
2270     short8 R = cast(short8) _mm_min_epi16(_mm_setr_epi16(45, 1, -4, -8, 9,  7, 0,-32768),
2271                                           _mm_setr_epi16(-4,-8,  9,  7, 0,-57, 0,  0));
2272     short[8] correct =                                  [-4,-8, -4, -8, 0,-57, 0, -32768];
2273     assert(R.array == correct);
2274 }
2275 
2276 /// Compare packed unsigned 8-bit integers in `a` and `b`, and return packed minimum values.
2277 __m128i _mm_min_epu8 (__m128i a, __m128i b) pure @safe
2278 {
2279     version(LDC)
2280     {
2281         // x86: pminub since LDC 1.0.0 -O1
        // ARM64: umin.16b since LDC 1.5.0 -O1
2283         // PERF: catastrophic on ARM32
2284         ubyte16 sa = cast(ubyte16)a;
2285         ubyte16 sb = cast(ubyte16)b;
2286         ubyte16 greater = cast(ubyte16) greaterMask!ubyte16(sa, sb);
2287         return cast(__m128i)( (~greater & sa) | (greater & sb) );
2288     }
2289     else
2290     {
2291         __m128i value128 = _mm_set1_epi8(-128);
2292         __m128i lower = _mm_cmplt_epi8(_mm_add_epi8(a, value128), _mm_add_epi8(b, value128)); // signed comparison
2293         __m128i aTob = _mm_xor_si128(a, b); // a ^ (a ^ b) == b
2294         __m128i mask = _mm_and_si128(aTob, lower);
2295         return _mm_xor_si128(b, mask);
2296     }
2297 }
2298 unittest
2299 {
2300     byte16 R = cast(byte16) _mm_min_epu8(_mm_setr_epi8(45, 1, -4, -8, 9,  7, 0,-57, -4,-8,  9,  7, 0,-57, 0,  0),
2301                                          _mm_setr_epi8(-4,-8,  9,  7, 0,-57, 0,  0, 45, 1, -4, -8, 9,  7, 0,-57));
2302     byte[16] correct =                                [45, 1,  9,  7, 0,  7, 0,  0, 45, 1,  9,  7, 0,  7, 0,  0];
2303     assert(R.array == correct);
2304 }
2305 
2306 /// Compare packed double-precision (64-bit) floating-point elements in `a` and `b`, and return packed minimum values.
2307 __m128d _mm_min_pd (__m128d a, __m128d b) pure @trusted
2308 {
2309     static if (GDC_with_SSE2)
2310     {
2311         return __builtin_ia32_minpd(a, b);
2312     }
2313     else
2314     {
2315         // Generates minpd starting with LDC 1.9
2316         a.ptr[0] = (a.array[0] < b.array[0]) ? a.array[0] : b.array[0];
2317         a.ptr[1] = (a.array[1] < b.array[1]) ? a.array[1] : b.array[1];
2318         return a;
2319     }
2320 }
2321 unittest
2322 {
2323     __m128d A = _mm_setr_pd(1.0, 2.0);
2324     __m128d B = _mm_setr_pd(4.0, 1.0);
2325     __m128d M = _mm_min_pd(A, B);
2326     assert(M.array[0] == 1.0);
2327     assert(M.array[1] == 1.0);
2328 }
2329 
2330 /// Compare the lower double-precision (64-bit) floating-point elements in `a` and `b`, store the minimum value in 
2331 /// the lower element of result, and copy the upper element from `a` to the upper element of result.
2332 __m128d _mm_min_sd (__m128d a, __m128d b) pure @safe
2333 {
2334     static if (GDC_with_SSE2)
2335     {
2336         return __builtin_ia32_minsd(a, b);
2337     }
2338     else
2339     {
2340         // Generates minsd starting with LDC 1.3
2341         __m128d r = a;
2342         r.array[0] = (a.array[0] < b.array[0]) ? a.array[0] : b.array[0];
2343         return r;
2344     }
2345 }
2346 unittest
2347 {
2348     __m128d A = _mm_setr_pd(1.0, 3.0);
2349     __m128d B = _mm_setr_pd(4.0, 2.0);
2350     __m128d M = _mm_min_sd(A, B);
2351     assert(M.array[0] == 1.0);
2352     assert(M.array[1] == 3.0);
2353 }
2354 
2355 /// Copy the lower 64-bit integer in `a` to the lower element of result, and zero the upper element.
2356 __m128i _mm_move_epi64 (__m128i a) pure @trusted
2357 {
2358     static if (GDC_with_SSE2)
2359     {
2360         // slightly better with GDC -O0
2361         return cast(__m128i) __builtin_ia32_movq128(cast(long2)a); 
2362     }
2363     else
2364     {
2365         long2 result = [ 0, 0 ];
2366         long2 la = cast(long2) a;
2367         result.ptr[0] = la.array[0];
2368         return cast(__m128i)(result);
2369     }
2370 }
2371 unittest
2372 {
2373     long2 A = [13, 47];
2374     long2 B = cast(long2) _mm_move_epi64( cast(__m128i)A );
2375     long[2] correct = [13, 0];
2376     assert(B.array == correct);
2377 }
2378 
2379 /// Move the lower double-precision (64-bit) floating-point element from `b` to the lower element of result, and copy 
2380 /// the upper element from `a` to the upper element of dst.
2381 __m128d _mm_move_sd (__m128d a, __m128d b) pure @trusted
2382 {
2383     static if (GDC_with_SSE2)
2384     {
2385         return __builtin_ia32_movsd(a, b); 
2386     }
2387     else
2388     {
2389         b.ptr[1] = a.array[1];
2390         return b;
2391     }
2392 }
2393 unittest
2394 {
2395     double2 A = [13.0, 47.0];
2396     double2 B = [34.0, 58.0];
2397     double2 C = _mm_move_sd(A, B);
2398     double[2] correct = [34.0, 47.0];
2399     assert(C.array == correct);
2400 }
2401 
/// Create mask from the most significant bit of each 8-bit element in `a`.
2403 int _mm_movemask_epi8 (__m128i a) pure @trusted
2404 {
2405     // PERF: Not possible in D_SIMD because of https://issues.dlang.org/show_bug.cgi?id=8047
2406     static if (GDC_with_SSE2)
2407     {
2408         return __builtin_ia32_pmovmskb128(cast(ubyte16)a);
2409     }
2410     else static if (LDC_with_SSE2)
2411     {
2412         return __builtin_ia32_pmovmskb128(cast(byte16)a);
2413     }
2414     else static if (LDC_with_ARM64)
2415     {
        // Solution from https://stackoverflow.com/questions/11870910/sse-mm-movemask-epi8-equivalent-method-for-arm-neon
        // The other solutions there lead to intrinsics that LLVM doesn't recognize, which took a long time to find out.
        // So there might be something a bit faster, but this one is reasonable and branchless.
2419         byte8 mask_shift;
2420         mask_shift.ptr[0] = 7;
2421         mask_shift.ptr[1] = 6;
2422         mask_shift.ptr[2] = 5;
2423         mask_shift.ptr[3] = 4;
2424         mask_shift.ptr[4] = 3;
2425         mask_shift.ptr[5] = 2;
2426         mask_shift.ptr[6] = 1;
2427         mask_shift.ptr[7] = 0;
2428         byte8 mask_and = byte8(-128);
2429         byte8 lo = vget_low_u8(cast(byte16)a);
2430         byte8 hi = vget_high_u8(cast(byte16)a);
2431         lo = vand_u8(lo, mask_and);
2432         lo = vshr_u8(lo, mask_shift);
2433         hi = vand_u8(hi, mask_and);
2434         hi = vshr_u8(hi, mask_shift);
2435         lo = vpadd_u8(lo,lo);
2436         lo = vpadd_u8(lo,lo);
2437         lo = vpadd_u8(lo,lo);
2438         hi = vpadd_u8(hi,hi);
2439         hi = vpadd_u8(hi,hi);
2440         hi = vpadd_u8(hi,hi);
2441         return (cast(ubyte)(hi[0]) << 8) | cast(ubyte)(lo[0]);
2442     }
2443     else
2444     {
2445         byte16 ai = cast(byte16)a;
2446         int r = 0;
2447         foreach(bit; 0..16)
2448         {
2449             if (ai.array[bit] < 0) r += (1 << bit);
2450         }
2451         return r;
2452     }
2453 }
2454 unittest
2455 {
2456     assert(0x9C36 == _mm_movemask_epi8(_mm_set_epi8(-1, 1, 2, -3, -1, -1, 4, 8, 127, 0, -1, -1, 0, -1, -1, 0)));
2457 }
2458 
/// Set each bit of mask result based on the most significant bit of the corresponding packed double-precision (64-bit) 
/// floating-point element in `v`.
2461 int _mm_movemask_pd(__m128d v) pure @safe
2462 {
2463     // PERF: Not possible in D_SIMD because of https://issues.dlang.org/show_bug.cgi?id=8047
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_movmskpd(v);
    }
    else static if (LDC_with_SSE2)
    {
        return __builtin_ia32_movmskpd(v);
    }
2476     else
2477     {
2478         long2 lv = cast(long2)v;
2479         int r = 0;
2480         if (lv.array[0] < 0) r += 1;
2481         if (lv.array[1] < 0) r += 2;
2482         return r;
2483     }
2484 }
2485 unittest
2486 {
2487     __m128d A = cast(__m128d) _mm_set_epi64x(-1, 0);
2488     assert(_mm_movemask_pd(A) == 2);
2489 }
2490 
/// Copy the lower 64-bit integer in `v` to result.
2492 __m64 _mm_movepi64_pi64 (__m128i v) pure @safe
2493 {
2494     long2 lv = cast(long2)v;
2495     return long1(lv.array[0]);
2496 }
2497 unittest
2498 {
2499     __m128i A = _mm_set_epi64x(-1, -2);
2500     __m64 R = _mm_movepi64_pi64(A);
2501     assert(R.array[0] == -2);
2502 }
2503 
2504 /// Copy the 64-bit integer `a` to the lower element of dest, and zero the upper element.
2505 __m128i _mm_movpi64_epi64 (__m64 a) pure @trusted
2506 {
2507     long2 r;
2508     r.ptr[0] = a.array[0];
2509     r.ptr[1] = 0;
2510     return cast(__m128i)r;
2511 }
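// Checks the copied lower element and the zeroed upper element.
unittest
{
    __m64 A = _mm_cvtsi64_m64(-1);
    long2 R = cast(long2) _mm_movpi64_epi64(A);
    long[2] correct = [-1, 0];
    assert(R.array == correct);
}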
2512 
/// Multiply the low unsigned 32-bit integers from each packed 64-bit element in `a` and `b`, 
/// and return the unsigned 64-bit results.
/// Note: generates pmuludq in LDC with -O1
__m128i _mm_mul_epu32 (__m128i a, __m128i b) pure @trusted
2515 {
2516     __m128i zero = _mm_setzero_si128();
2517 
2518     static if (__VERSION__ >= 2088)
2519     {
2520         // Need LLVM9 to avoid this shufflevector
2521         long2 la, lb;
2522         la.ptr[0] = cast(uint)a.array[0];
2523         la.ptr[1] = cast(uint)a.array[2];
2524         lb.ptr[0] = cast(uint)b.array[0];
2525         lb.ptr[1] = cast(uint)b.array[2];
2526     }
2527     else
2528     {
2529         long2 la = cast(long2) shufflevector!(int4, 0, 4, 2, 6)(a, zero);
2530         long2 lb = cast(long2) shufflevector!(int4, 0, 4, 2, 6)(b, zero);
2531     }
2532 
2533     version(DigitalMars)
2534     {
        // DMD has no long2 mul
2537         la.ptr[0] *= lb.array[0];
2538         la.ptr[1] *= lb.array[1];
2539         return cast(__m128i)(la);
2540     }
2541     else
2542     {
2543         static if (__VERSION__ >= 2076)
2544         {
2545             return cast(__m128i)(la * lb);
2546         }
2547         else
2548         {
2549             // long2 mul not supported before LDC 1.5
2550             la.ptr[0] *= lb.array[0];
2551             la.ptr[1] *= lb.array[1];
2552             return cast(__m128i)(la);
2553         }
2554     }
2555 }
2556 unittest
2557 {
2558     __m128i A = _mm_set_epi32(42, 0xDEADBEEF, 42, 0xffffffff);
2559     __m128i B = _mm_set_epi32(42, 0xCAFEBABE, 42, 0xffffffff);
2560     __m128i C = _mm_mul_epu32(A, B);
2561     long2 LC = cast(long2)C;
2562     assert(LC.array[0] == 18446744065119617025uL);
2563     assert(LC.array[1] == 12723420444339690338uL);
2564 }
2565 
2566 /// Multiply packed double-precision (64-bit) floating-point elements in `a` and `b`, and return the results. 
2567 __m128d _mm_mul_pd(__m128d a, __m128d b) pure @safe
2568 {
2569     return a * b;
2570 }
2571 unittest
2572 {
2573     __m128d a = [-2.0, 1.5];
2574     a = _mm_mul_pd(a, a);
2575     assert(a.array == [4.0, 2.25]);
2576 }
2577 
2578 /// Multiply the lower double-precision (64-bit) floating-point element in `a` and `b`, store the result in the lower 
2579 /// element of result, and copy the upper element from `a` to the upper element of result.
2580 __m128d _mm_mul_sd(__m128d a, __m128d b) pure @trusted
2581 {
2582     version(DigitalMars)
2583     {    
2584         // Work-around for https://issues.dlang.org/show_bug.cgi?id=19599
2585         // Note that this is unneeded since DMD >= 2.094.0 at least, haven't investigated again
2586         asm pure nothrow @nogc @trusted { nop;}
2587         a.array[0] = a.array[0] * b.array[0];
2588         return a;
2589     }
2590     else static if (GDC_with_SSE2)
2591     {
2592         return __builtin_ia32_mulsd(a, b);
2593     }
2594     else
2595     {
2596         a.ptr[0] *= b.array[0];
2597         return a;
2598     }
2599 }
2600 unittest
2601 {
2602     __m128d a = [-2.0, 1.5];
2603     a = _mm_mul_sd(a, a);
2604     assert(a.array == [4.0, 1.5]);
2605 }
2606 
2607 /// Multiply the low unsigned 32-bit integers from `a` and `b`, 
2608 /// and get an unsigned 64-bit result.
2609 __m64 _mm_mul_su32 (__m64 a, __m64 b) pure @safe
2610 {
2611     return to_m64(_mm_mul_epu32(to_m128i(a), to_m128i(b)));
2612 }
2613 unittest
2614 {
2615     __m64 A = _mm_set_pi32(42, 0xDEADBEEF);
2616     __m64 B = _mm_set_pi32(42, 0xCAFEBABE);
2617     __m64 C = _mm_mul_su32(A, B);
2618     assert(C.array[0] == 0xDEADBEEFuL * 0xCAFEBABEuL);
2619 }
2620 
2621 /// Multiply the packed signed 16-bit integers in `a` and `b`, producing intermediate 32-bit integers, and return the 
2622 /// high 16 bits of the intermediate integers.
2623 __m128i _mm_mulhi_epi16 (__m128i a, __m128i b) pure @trusted
2624 {
2625     static if (GDC_with_SSE2)
2626     {
2627         return cast(__m128i) __builtin_ia32_pmulhw128(cast(short8)a, cast(short8)b);
2628     }
2629     else static if (LDC_with_SSE2)
2630     {
2631         return cast(__m128i) __builtin_ia32_pmulhw128(cast(short8)a, cast(short8)b);
2632     }
2633     else
2634     {
2635         // ARM64: LDC 1.5 -O2 or later gives a nice sequence with 2 x ext.16b, 2 x smull.4s and shrn.4h shrn2.8h
2636         //        PERF: it seems the simde solution has one less instruction in ARM64.
2637         // PERF: Catastrophic in ARM32.
2638         short8 sa = cast(short8)a;
2639         short8 sb = cast(short8)b;
2640         short8 r = void;
2641         r.ptr[0] = (sa.array[0] * sb.array[0]) >> 16;
2642         r.ptr[1] = (sa.array[1] * sb.array[1]) >> 16;
2643         r.ptr[2] = (sa.array[2] * sb.array[2]) >> 16;
2644         r.ptr[3] = (sa.array[3] * sb.array[3]) >> 16;
2645         r.ptr[4] = (sa.array[4] * sb.array[4]) >> 16;
2646         r.ptr[5] = (sa.array[5] * sb.array[5]) >> 16;
2647         r.ptr[6] = (sa.array[6] * sb.array[6]) >> 16;
2648         r.ptr[7] = (sa.array[7] * sb.array[7]) >> 16;
2649         return cast(__m128i)r;
2650     }
2651 }
2652 unittest
2653 {
2654     __m128i A = _mm_setr_epi16(0, -16, 2, 3, 4, 8, 16, 7);
2655     __m128i B = _mm_set1_epi16(16384);
2656     short8 R = cast(short8)_mm_mulhi_epi16(A, B);
2657     short[8] correct = [0, -4, 0, 0, 1, 2, 4, 1];
2658     assert(R.array == correct);
2659 }
2660 
2661 /// Multiply the packed unsigned 16-bit integers in `a` and `b`, producing intermediate 32-bit integers, and return the 
2662 /// high 16 bits of the intermediate integers.
2663 __m128i _mm_mulhi_epu16 (__m128i a, __m128i b) pure @trusted
2664 {
2665     static if (GDC_with_SSE2)
2666     {
2667         return cast(__m128i) __builtin_ia32_pmulhuw128(cast(short8)a, cast(short8)b);
2668     }
2669     else static if (LDC_with_SSE2)
2670     {
2671         return cast(__m128i) __builtin_ia32_pmulhuw128(cast(short8)a, cast(short8)b);
2672     }
2673     else
2674     {
2675         // ARM64: LDC 1.5 -O2 or later gives a nice sequence with 2 x ext.16b, 2 x umull.4s and shrn.4h shrn2.8h
        //        PERF: it seems the simde solution has one less instruction in ARM64.
2677         // PERF: Catastrophic in ARM32.
2678         short8 sa = cast(short8)a;
2679         short8 sb = cast(short8)b;
2680         short8 r = void;
2681         r.ptr[0] = cast(short)( (cast(ushort)sa.array[0] * cast(ushort)sb.array[0]) >> 16 );
2682         r.ptr[1] = cast(short)( (cast(ushort)sa.array[1] * cast(ushort)sb.array[1]) >> 16 );
2683         r.ptr[2] = cast(short)( (cast(ushort)sa.array[2] * cast(ushort)sb.array[2]) >> 16 );
2684         r.ptr[3] = cast(short)( (cast(ushort)sa.array[3] * cast(ushort)sb.array[3]) >> 16 );
2685         r.ptr[4] = cast(short)( (cast(ushort)sa.array[4] * cast(ushort)sb.array[4]) >> 16 );
2686         r.ptr[5] = cast(short)( (cast(ushort)sa.array[5] * cast(ushort)sb.array[5]) >> 16 );
2687         r.ptr[6] = cast(short)( (cast(ushort)sa.array[6] * cast(ushort)sb.array[6]) >> 16 );
2688         r.ptr[7] = cast(short)( (cast(ushort)sa.array[7] * cast(ushort)sb.array[7]) >> 16 );
2689         return cast(__m128i)r;
2690     }
2691 }
2692 unittest
2693 {
2694     __m128i A = _mm_setr_epi16(0, -16, 2, 3, 4, 8, 16, 7);
2695     __m128i B = _mm_set1_epi16(16384);
2696     short8 R = cast(short8)_mm_mulhi_epu16(A, B);
2697     short[8] correct = [0, 0x3FFC, 0, 0, 1, 2, 4, 1];
2698     assert(R.array == correct);
2699 }
2700 
2701 /// Multiply the packed 16-bit integers in `a` and `b`, producing intermediate 32-bit integers, and return the low 16 
2702 /// bits of the intermediate integers.
2703 __m128i _mm_mullo_epi16 (__m128i a, __m128i b) pure @safe
2704 {
2705     return cast(__m128i)(cast(short8)a * cast(short8)b);
2706 }
2707 unittest
2708 {
2709     __m128i A = _mm_setr_epi16(16384, -16, 0,      3, 4, 1, 16, 7);
2710     __m128i B = _mm_set1_epi16(16384);
2711     short8 R = cast(short8)_mm_mullo_epi16(A, B);
2712     short[8] correct = [0, 0, 0, -16384, 0, 16384, 0, -16384];
2713     assert(R.array == correct);
2714 }
2715 
2716 /// Compute the bitwise OR of packed double-precision (64-bit) floating-point elements in `a` and `b`.
2717 __m128d _mm_or_pd (__m128d a, __m128d b) pure @safe
2718 {
2719     return cast(__m128d)( cast(__m128i)a | cast(__m128i)b );
2720 }
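// OR with an all-zero bit pattern is the identity.
unittest
{
    __m128d A = _mm_setr_pd(4.0, -2.0);
    __m128d R = _mm_or_pd(A, _mm_setzero_pd());
    assert(R.array == A.array);
}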
2721 
2722 /// Compute the bitwise OR of 128 bits (representing integer data) in `a` and `b`.
2723 __m128i _mm_or_si128 (__m128i a, __m128i b) pure @safe
2724 {
2725     return a | b;
2726 }
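// Bitwise OR on arbitrary lanes.
unittest
{
    __m128i A = _mm_setr_epi32(1, 2, 4, 8);
    __m128i B = _mm_setr_epi32(2, 2, 0, 8);
    int4 R = cast(int4) _mm_or_si128(A, B);
    int[4] correct = [3, 2, 4, 8];
    assert(R.array == correct);
}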
2727 
2728 /// Convert packed signed 32-bit integers from `a` and `b` to packed 16-bit integers using signed saturation.
2729 __m128i _mm_packs_epi32 (__m128i a, __m128i b) pure @trusted
2730 {
2731     static if (GDC_with_SSE2)
2732     {
2733         return cast(__m128i) __builtin_ia32_packssdw128(a, b);
2734     }    
2735     else static if (LDC_with_SSE2)
2736     {
2737         return cast(__m128i) __builtin_ia32_packssdw128(a, b);
2738     }
2739     else static if (LDC_with_ARM64)
2740     {
2741         short4 ra = vqmovn_s32(cast(int4)a);
2742         short4 rb = vqmovn_s32(cast(int4)b);
2743         return cast(__m128i)vcombine_s16(ra, rb);
2744     }
2745     else
2746     {
2747         // PERF: catastrophic on ARM
2748         short8 r;
2749         r.ptr[0] = saturateSignedIntToSignedShort(a.array[0]);
2750         r.ptr[1] = saturateSignedIntToSignedShort(a.array[1]);
2751         r.ptr[2] = saturateSignedIntToSignedShort(a.array[2]);
2752         r.ptr[3] = saturateSignedIntToSignedShort(a.array[3]);
2753         r.ptr[4] = saturateSignedIntToSignedShort(b.array[0]);
2754         r.ptr[5] = saturateSignedIntToSignedShort(b.array[1]);
2755         r.ptr[6] = saturateSignedIntToSignedShort(b.array[2]);
2756         r.ptr[7] = saturateSignedIntToSignedShort(b.array[3]);
2757         return cast(__m128i)r;
2758     }
2759 }
2760 unittest
2761 {
2762     __m128i A = _mm_setr_epi32(100000, -100000, 1000, 0);
2763     short8 R = cast(short8) _mm_packs_epi32(A, A);
2764     short[8] correct = [32767, -32768, 1000, 0, 32767, -32768, 1000, 0];
2765     assert(R.array == correct);
2766 }
2767 
2768 /// Convert packed signed 16-bit integers from `a` and `b` to packed 8-bit integers using signed saturation.
2769 __m128i _mm_packs_epi16 (__m128i a, __m128i b) pure @trusted
2770 {
2771     static if (GDC_with_SSE2)
2772     {
2773         return cast(__m128i) __builtin_ia32_packsswb128(cast(short8)a, cast(short8)b);
2774     }
2775     else static if (LDC_with_SSE2)
2776     {
2777         return cast(__m128i) __builtin_ia32_packsswb128(cast(short8)a, cast(short8)b);
2778     }
2779     else static if (LDC_with_ARM64)
2780     {
        // Generates a nice pair of sqxtn.8b + sqxtn2 since LDC 1.5 -O2
2782         byte8 ra = vqmovn_s16(cast(short8)a);
2783         byte8 rb = vqmovn_s16(cast(short8)b);
2784         return cast(__m128i)vcombine_s8(ra, rb);
2785     }
2786     else
2787     {
2788         // PERF: ARM32 is missing
2789         byte16 r;
2790         short8 sa = cast(short8)a;
2791         short8 sb = cast(short8)b;
2792         foreach(i; 0..8)
2793             r.ptr[i] = saturateSignedWordToSignedByte(sa.array[i]);
2794         foreach(i; 0..8)
2795             r.ptr[i+8] = saturateSignedWordToSignedByte(sb.array[i]);
2796         return cast(__m128i)r;
2797     }
2798 }
2799 unittest
2800 {
2801     __m128i A = _mm_setr_epi16(1000, -1000, 1000, 0, 256, -129, 254, 0);
2802     byte16 R = cast(byte16) _mm_packs_epi16(A, A);
2803     byte[16] correct = [127, -128, 127, 0, 127, -128, 127, 0,
2804                         127, -128, 127, 0, 127, -128, 127, 0];
2805     assert(R.array == correct);
2806 }
2807 
2808 /// Convert packed signed 16-bit integers from `a` and `b` to packed 8-bit integers using unsigned saturation.
2809 __m128i _mm_packus_epi16 (__m128i a, __m128i b) pure @trusted
2810 {
2811     static if (GDC_with_SSE2)
2812     {
2813         return cast(__m128i) __builtin_ia32_packuswb128(cast(short8)a, cast(short8)b);
2814     }
2815     else static if (LDC_with_SSE2)
2816     {
2817         return cast(__m128i) __builtin_ia32_packuswb128(cast(short8)a, cast(short8)b);
2818     }
2819     else static if (LDC_with_ARM64)
2820     {
        // Generates a nice pair of sqxtun + sqxtun2 since LDC 1.5 -O2
2822         byte8 ra = vqmovun_s16(cast(short8)a);
2823         byte8 rb = vqmovun_s16(cast(short8)b);
2824         return cast(__m128i)vcombine_s8(ra, rb);
2825     }
2826     else
2827     {
2828         short8 sa = cast(short8)a;
2829         short8 sb = cast(short8)b;
2830         ubyte[16] result = void;
2831         for (int i = 0; i < 8; ++i)
2832         {
2833             short s = sa[i];
2834             if (s < 0) s = 0;
2835             if (s > 255) s = 255;
2836             result[i] = cast(ubyte)s;
2837 
2838             s = sb[i];
2839             if (s < 0) s = 0;
2840             if (s > 255) s = 255;
2841             result[i+8] = cast(ubyte)s;
2842         }
2843         return cast(__m128i) loadUnaligned!(byte16)(cast(byte*)result.ptr);
2844     }
2845 }
2846 unittest
2847 {
2848     __m128i A = _mm_setr_epi16(-10, 400, 0, 256, 255, 2, 1, 0);
2849     byte16 AA = cast(byte16) _mm_packus_epi16(A, A);
2850     static immutable ubyte[16] correctResult = [0, 255, 0, 255, 255, 2, 1, 0,
2851                                                 0, 255, 0, 255, 255, 2, 1, 0];
2852     foreach(i; 0..16)
2853         assert(AA.array[i] == cast(byte)(correctResult[i]));
2854 }
2855 
2856 /// Provide a hint to the processor that the code sequence is a spin-wait loop. This can help improve the performance 
2857 /// and power consumption of spin-wait loops.
2858 void _mm_pause() @trusted
2859 {
2860     version(GNU)
2861     {
2862         static if (GDC_with_SSE2)
2863         {
2864             __builtin_ia32_pause();
2865         }
2866         else version(X86)
2867         {
2868             asm pure nothrow @nogc @trusted
2869             {
2870                 "pause;\n" : : : ;
2871             }
2872         }
2873         else
2874             static assert(false);
2875     }
2876     else static if (LDC_with_SSE2)
2877     {
2878         __builtin_ia32_pause();
2879     }
2880     else static if (DMD_with_asm)
2881     {
2882         asm nothrow @nogc pure @safe
2883         {
2884             rep; nop; // F3 90 =  pause
2885         }
2886     }
2887     else version (LDC)
2888     {
        // PERF: Do nothing currently, could be the "yield" instruction on ARM.
2890     }
2891     else
2892         static assert(false);
2893 }
2894 unittest
2895 {
2896     _mm_pause();
2897 }
2898 
2899 /// Compute the absolute differences of packed unsigned 8-bit integers in `a` and `b`, then horizontally sum each 
2900 /// consecutive 8 differences to produce two unsigned 16-bit integers, and pack these unsigned 16-bit integers in the 
2901 /// low 16 bits of 64-bit elements in result.
2902 __m128i _mm_sad_epu8 (__m128i a, __m128i b) pure @trusted
2903 {
2904     static if (GDC_with_SSE2)
2905     {
2906         return cast(__m128i) __builtin_ia32_psadbw128(cast(ubyte16)a, cast(ubyte16)b);
2907     }
2908     else static if (LDC_with_SSE2)
2909     {
2910         return cast(__m128i) __builtin_ia32_psadbw128(cast(byte16)a, cast(byte16)b);
2911     }
2912     else static if (LDC_with_ARM64)
2913     {
2914         ushort8 t = cast(ushort8) vpaddlq_u8(vabdq_u8(cast(byte16) a, cast(byte16) b));
2915 
2916         // PERF: Looks suboptimal vs addp
2917         ushort r0 = cast(ushort)(t[0] + t[1] + t[2] + t[3]);
2918         ushort r4 = cast(ushort)(t[4] + t[5] + t[6] + t[7]);
2919         ushort8 r = 0;
2920         r[0] = r0;
2921         r[4] = r4;
2922         return cast(__m128i) r;
2923     }
2924     else
2925     {
2926         // PERF: ARM32 is lacking
2927         byte16 ab = cast(byte16)a;
2928         byte16 bb = cast(byte16)b;
2929         ubyte[16] t;
2930         foreach(i; 0..16)
2931         {
2932             int diff = cast(ubyte)(ab.array[i]) - cast(ubyte)(bb.array[i]);
2933             if (diff < 0) diff = -diff;
2934             t[i] = cast(ubyte)(diff);
2935         }
2936         int4 r = _mm_setzero_si128();
2937         r.ptr[0] = t[0] + t[1] + t[2] + t[3] + t[4] + t[5] + t[6] + t[7];
2938         r.ptr[2] = t[8] + t[9] + t[10]+ t[11]+ t[12]+ t[13]+ t[14]+ t[15];
2939         return r;
2940     }
2941 }
2942 unittest
2943 {
2944     __m128i A = _mm_setr_epi8(3, 4, 6, 8, 12, 14, 18, 20, 24, 30, 32, 38, 42, 44, 48, 54); // primes + 1
2945     __m128i B = _mm_set1_epi8(1);
2946     __m128i R = _mm_sad_epu8(A, B);
2947     int[4] correct = [2 + 3 + 5 + 7 + 11 + 13 + 17 + 19,
2948                       0,
2949                       23 + 29 + 31 + 37 + 41 + 43 + 47 + 53,
2950                       0];
2951     assert(R.array == correct);
2952 }
2953 
2954 /// Set packed 16-bit integers with the supplied values.
2955 __m128i _mm_set_epi16 (short e7, short e6, short e5, short e4, short e3, short e2, short e1, short e0) pure @trusted
2956 {
2957     short[8] result = [e0, e1, e2, e3, e4, e5, e6, e7];
2958     return cast(__m128i) loadUnaligned!(short8)(result.ptr);
2959 }
2960 unittest
2961 {
2962     __m128i A = _mm_set_epi16(7, 6, 5, 4, 3, 2, 1, 0);
2963     short8 B = cast(short8) A;
2964     foreach(i; 0..8)
2965         assert(B.array[i] == i);
2966 }
2967 
2968 /// Set packed 32-bit integers with the supplied values.
2969 __m128i _mm_set_epi32 (int e3, int e2, int e1, int e0) pure @trusted
2970 {
2971     int[4] result = [e0, e1, e2, e3];
2972     return loadUnaligned!(int4)(result.ptr);
2973 }
2974 unittest
2975 {
2976     __m128i A = _mm_set_epi32(3, 2, 1, 0);
2977     foreach(i; 0..4)
2978         assert(A.array[i] == i);
2979 }
2980 
2981 /// Set packed 64-bit integers with the supplied values.
2982 __m128i _mm_set_epi64(__m64 e1, __m64 e0) pure @trusted
2983 {
2984     long[2] result = [e0.array[0], e1.array[0]];
2985     return cast(__m128i)( loadUnaligned!(long2)(result.ptr) );
2986 }
2987 unittest
2988 {
2989     __m128i A = _mm_set_epi64(_mm_cvtsi64_m64(1234), _mm_cvtsi64_m64(5678));
2990     long2 B = cast(long2) A;
2991     assert(B.array[0] == 5678);
2992     assert(B.array[1] == 1234);
2993 }
2994 
2995 /// Set packed 64-bit integers with the supplied values.
2996 __m128i _mm_set_epi64x (long e1, long e0) pure @trusted
2997 {
2998     long[2] result = [e0, e1];
2999     return cast(__m128i)( loadUnaligned!(long2)(result.ptr) );
3000 }
3001 unittest
3002 {
3003     __m128i A = _mm_set_epi64x(1234, 5678);
3004     long2 B = cast(long2) A;
3005     assert(B.array[0] == 5678);
3006     assert(B.array[1] == 1234);
3007 }
3008 
3009 /// Set packed 8-bit integers with the supplied values.
3010 __m128i _mm_set_epi8 (byte e15, byte e14, byte e13, byte e12,
3011                       byte e11, byte e10, byte e9, byte e8,
3012                       byte e7, byte e6, byte e5, byte e4,
3013                       byte e3, byte e2, byte e1, byte e0) pure @trusted
3014 {
3015     byte[16] result = [e0, e1,  e2,  e3,  e4,  e5,  e6, e7,
3016                      e8, e9, e10, e11, e12, e13, e14, e15];
3017     return cast(__m128i)( loadUnaligned!(byte16)(result.ptr) );
3018 }
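// Arguments are given from e15 down to e0, so the array reads back in increasing order.
unittest
{
    byte16 A = cast(byte16) _mm_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
    foreach(i; 0..16)
        assert(A.array[i] == i);
}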
3019 
3020 /// Set packed double-precision (64-bit) floating-point elements with the supplied values.
3021 __m128d _mm_set_pd (double e1, double e0) pure @trusted
3022 {
3023     double[2] result = [e0, e1];
3024     return loadUnaligned!(double2)(result.ptr);
3025 }
3026 unittest
3027 {
3028     __m128d A = _mm_set_pd(61.0, 55.0);
3029     double[2] correct = [55.0, 61.0];
3030     assert(A.array == correct);
3031 }
3032 
/// Broadcast double-precision (64-bit) floating-point value `a` to all elements.
3034 __m128d _mm_set_pd1 (double a) pure @trusted
3035 {
3036     double[2] result = [a, a];
3037     return loadUnaligned!(double2)(result.ptr);
3038 }
3039 unittest
3040 {
3041     __m128d A = _mm_set_pd1(61.0);
3042     double[2] correct = [61.0, 61.0];
3043     assert(A.array == correct);
3044 }
3045 
3046 /// Copy double-precision (64-bit) floating-point element `a` to the lower element of result, 
3047 /// and zero the upper element.
3048 __m128d _mm_set_sd (double a) pure @trusted
3049 {
3050     double[2] result = [a, 0];
3051     return loadUnaligned!(double2)(result.ptr);
3052 }
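// Checks the lower element and the zeroed upper element.
unittest
{
    __m128d A = _mm_set_sd(-2.0);
    double[2] correct = [-2.0, 0.0];
    assert(A.array == correct);
}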
3053 
/// Broadcast 16-bit integer `a` to all elements.
3055 __m128i _mm_set1_epi16 (short a) pure @trusted
3056 {
3057     version(DigitalMars) // workaround https://issues.dlang.org/show_bug.cgi?id=21469 
3058     {
3059         short8 v = a;
3060         return cast(__m128i) v;
3061     }
3062     else
3063         return cast(__m128i)(short8(a));
3064 }
3065 unittest
3066 {
3067     short8 a = cast(short8) _mm_set1_epi16(31);
3068     for (int i = 0; i < 8; ++i)
3069         assert(a.array[i] == 31);
3070 }
3071 
3072 /// Broadcast 32-bit integer `a` to all elements.
3073 __m128i _mm_set1_epi32 (int a) pure @trusted
3074 {
3075     return cast(__m128i)(int4(a));
3076 }
3077 unittest
3078 {
3079     int4 a = cast(int4) _mm_set1_epi32(31);
3080     for (int i = 0; i < 4; ++i)
3081         assert(a.array[i] == 31);
3082 }
3083 
3084 /// Broadcast 64-bit integer `a` to all elements.
3085 __m128i _mm_set1_epi64 (__m64 a) pure @safe
3086 {
3087     return _mm_set_epi64(a, a);
3088 }
3089 unittest
3090 {
3091     long b = 0x1DEADCAFE; 
3092     __m64 a;
3093     a.ptr[0] = b;
3094     long2 c = cast(long2) _mm_set1_epi64(a);
3095     assert(c.array[0] == b);
3096     assert(c.array[1] == b);
3097 }
3098 
/// Broadcast 64-bit integer `a` to all elements.
3100 __m128i _mm_set1_epi64x (long a) pure @trusted
3101 {
3102     long2 b = a; // Must be on its own line to workaround https://issues.dlang.org/show_bug.cgi?id=21470
3103     return cast(__m128i)(b);
3104 }
3105 unittest
3106 {
3107     long b = 0x1DEADCAFE;
3108     long2 c = cast(long2) _mm_set1_epi64x(b);
3109     for (int i = 0; i < 2; ++i)
3110         assert(c.array[i] == b);
3111 }
3112 
3113 /// Broadcast 8-bit integer `a` to all elements.
3114 __m128i _mm_set1_epi8 (byte a) pure @trusted
3115 {
3116     byte16 b = a; // Must be on its own line to workaround https://issues.dlang.org/show_bug.cgi?id=21470
3117     return cast(__m128i)(b);
3118 }
3119 unittest
3120 {
3121     byte16 b = cast(byte16) _mm_set1_epi8(31);
3122     for (int i = 0; i < 16; ++i)
3123         assert(b.array[i] == 31);
3124 }
3125 
alias _mm_set1_pd = _mm_set_pd1; ///
3127 
3128 /// Set packed 16-bit integers with the supplied values in reverse order.
3129 __m128i _mm_setr_epi16 (short e7, short e6, short e5, short e4, 
3130                         short e3, short e2, short e1, short e0) pure @trusted
3131 {
3132     short[8] result = [e7, e6, e5, e4, e3, e2, e1, e0];
3133     return cast(__m128i)( loadUnaligned!(short8)(result.ptr) );
3134 }
3135 unittest
3136 {
3137     short8 A = cast(short8) _mm_setr_epi16(7, 6, 5, -32768, 32767, 2, 1, 0);
3138     short[8] correct = [7, 6, 5, -32768, 32767, 2, 1, 0];
3139     assert(A.array == correct);
3140 }
3141 
3142 /// Set packed 32-bit integers with the supplied values in reverse order.
3143 __m128i _mm_setr_epi32 (int e3, int e2, int e1, int e0) pure @trusted
3144 {
3145     int[4] result = [e3, e2, e1, e0];
3146     return cast(__m128i)( loadUnaligned!(int4)(result.ptr) );
3147 }
3148 unittest
3149 {
3150     int4 A = cast(int4) _mm_setr_epi32(-1, 0, -2147483648, 2147483647);
3151     int[4] correct = [-1, 0, -2147483648, 2147483647];
3152     assert(A.array == correct);
3153 }
3154 
3155 /// Set packed 64-bit integers with the supplied values in reverse order.
3156 __m128i _mm_setr_epi64 (long e1, long e0) pure @trusted
3157 {
3158     long[2] result = [e1, e0];
3159     return cast(__m128i)( loadUnaligned!(long2)(result.ptr) );
3160 }
3161 unittest
3162 {
3163     long2 A = cast(long2) _mm_setr_epi64(-1, 0);
3164     long[2] correct = [-1, 0];
3165     assert(A.array == correct);
3166 }
3167 
3168 /// Set packed 8-bit integers with the supplied values in reverse order.
3169 __m128i _mm_setr_epi8 (byte e15, byte e14, byte e13, byte e12,
3170                        byte e11, byte e10, byte e9,  byte e8,
3171                        byte e7,  byte e6,  byte e5,  byte e4,
3172                        byte e3,  byte e2,  byte e1,  byte e0) pure @trusted
3173 {
3174     byte[16] result = [e15, e14, e13, e12, e11, e10, e9, e8,
3175                       e7,  e6,  e5,  e4,  e3,  e2, e1, e0];
3176     return cast(__m128i)( loadUnaligned!(byte16)(result.ptr) );
3177 }
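// In the reversed variant, arguments appear in memory order.
unittest
{
    byte16 A = cast(byte16) _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
    foreach(i; 0..16)
        assert(A.array[i] == i);
}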
3178 
3179 /// Set packed double-precision (64-bit) floating-point elements with the supplied values in reverse order.
3180 __m128d _mm_setr_pd (double e1, double e0) pure @trusted
3181 {
3182     double2 result;
3183     result.ptr[0] = e1;
3184     result.ptr[1] = e0;
3185     return result;
3186 }
3187 unittest
3188 {
3189     __m128d A = _mm_setr_pd(61.0, 55.0);
3190     double[2] correct = [61.0, 55.0];
3191     assert(A.array == correct);
3192 }
3193 
3194 /// Return vector of type `__m128d` with all elements set to zero.
3195 __m128d _mm_setzero_pd () pure @trusted
3196 {
3197     // Note: using loadUnaligned has better -O0 codegen compared to .ptr
3198     double[2] result = [0.0, 0.0];
3199     return loadUnaligned!(double2)(result.ptr);
3200 }
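// All lanes must read back as zero.
unittest
{
    __m128d Z = _mm_setzero_pd();
    double[2] correct = [0.0, 0.0];
    assert(Z.array == correct);
}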
3201 
3202 /// Return vector of type `__m128i` with all elements set to zero.
3203 __m128i _mm_setzero_si128() pure @trusted
3204 {
3205     // Note: using loadUnaligned has better -O0 codegen compared to .ptr
3206     int[4] result = [0, 0, 0, 0];
3207     return cast(__m128i)( loadUnaligned!(int4)(result.ptr) );
3208 }
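// All lanes must read back as zero.
unittest
{
    __m128i Z = _mm_setzero_si128();
    int[4] correct = [0, 0, 0, 0];
    assert(Z.array == correct);
}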
3209 
3210 /// Shuffle 32-bit integers in a using the control in `imm8`.
3211 /// See_also: `_MM_SHUFFLE`.
3212 __m128i _mm_shuffle_epi32(int imm8)(__m128i a) pure @safe
3213 {
3214     static if (GDC_with_SSE2)
3215     {
3216         return __builtin_ia32_pshufd(a, imm8);
3217     }
3218     else
3219     {
3220         return shufflevector!(int4, (imm8 >> 0) & 3,
3221                                     (imm8 >> 2) & 3,
3222                                     (imm8 >> 4) & 3,
3223                                     (imm8 >> 6) & 3)(a, a);
3224     }
3225 }
3226 unittest
3227 {
3228     __m128i A = _mm_setr_epi32(0, 1, 2, 3);
3229     enum int SHUFFLE = _MM_SHUFFLE(0, 1, 2, 3);
3230     int4 B = cast(int4) _mm_shuffle_epi32!SHUFFLE(A);
3231     int[4] expectedB = [ 3, 2, 1, 0 ];
3232     assert(B.array == expectedB);
3233 }
3234 
3235 /// Shuffle double-precision (64-bit) floating-point elements using the control in `imm8`.
3236 /// See_also: `_MM_SHUFFLE2`.
3237 __m128d _mm_shuffle_pd (int imm8)(__m128d a, __m128d b) pure @safe
3238 {
3239     static if (GDC_with_SSE2)
3240     {
3241         return __builtin_ia32_shufpd(a, b, imm8);
3242     }
3243     else
3244     {
3245         return shufflevector!(double2, 0 + ( imm8 & 1 ),
3246                                        2 + ( (imm8 >> 1) & 1 ))(a, b);
3247     }
3248 }
3249 unittest
3250 {
3251     __m128d A = _mm_setr_pd(0.5, 2.0);
3252     __m128d B = _mm_setr_pd(4.0, 5.0);
3253     enum int SHUFFLE = _MM_SHUFFLE2(1, 1);
3254     __m128d R = _mm_shuffle_pd!SHUFFLE(A, B);
3255     double[2] correct = [ 2.0, 5.0 ];
3256     assert(R.array == correct);
3257 }
3258 
/// Shuffle 16-bit integers in the high 64 bits of `a` using the control in `imm8`. Store the results in the high
/// 64 bits of result, with the low 64 bits being copied from `a` to result.
/// See_also: `_MM_SHUFFLE`.
3262 __m128i _mm_shufflehi_epi16(int imm8)(__m128i a) pure @safe
3263 {
3264     static if (GDC_with_SSE2)
3265     {
3266         return cast(__m128i) __builtin_ia32_pshufhw(cast(short8)a, imm8);
3267     }
3268     else
3269     {
3270         return cast(__m128i) shufflevector!(short8, 0, 1, 2, 3,
3271                                           4 + ( (imm8 >> 0) & 3 ),
3272                                           4 + ( (imm8 >> 2) & 3 ),
3273                                           4 + ( (imm8 >> 4) & 3 ),
3274                                           4 + ( (imm8 >> 6) & 3 ))(cast(short8)a, cast(short8)a);
3275     }
3276 }
3277 unittest
3278 {
3279     __m128i A = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
3280     enum int SHUFFLE = _MM_SHUFFLE(0, 1, 2, 3);
3281     short8 C = cast(short8) _mm_shufflehi_epi16!SHUFFLE(A);
3282     short[8] expectedC = [ 0, 1, 2, 3, 7, 6, 5, 4 ];
3283     assert(C.array == expectedC);
3284 }
3285 
/// Shuffle 16-bit integers in the low 64 bits of `a` using the control in `imm8`. Store the results in the low 64
/// bits of result, with the high 64 bits being copied from `a` to result.
3288 /// See_also: `_MM_SHUFFLE`.
3289 __m128i _mm_shufflelo_epi16(int imm8)(__m128i a) pure @safe
3290 {
3291     static if (GDC_with_SSE2)
3292     {
3293         return cast(__m128i) __builtin_ia32_pshuflw(cast(short8)a, imm8);
3294     }
3295     else
3296     {
3297         return cast(__m128i) shufflevector!(short8, ( (imm8 >> 0) & 3 ),
3298                                                     ( (imm8 >> 2) & 3 ),
3299                                                     ( (imm8 >> 4) & 3 ),
3300                                                     ( (imm8 >> 6) & 3 ), 4, 5, 6, 7)(cast(short8)a, cast(short8)a);
3301     }
3302 }
3303 unittest
3304 {
3305     __m128i A = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
3306     enum int SHUFFLE = _MM_SHUFFLE(0, 1, 2, 3);
3307     short8 B = cast(short8) _mm_shufflelo_epi16!SHUFFLE(A);
3308     short[8] expectedB = [ 3, 2, 1, 0, 4, 5, 6, 7 ];
3309     assert(B.array == expectedB);
3310 }
3311 
3312 /// Shift packed 32-bit integers in `a` left by `count` while shifting in zeros.
3313 deprecated("Use _mm_slli_epi32 instead.") __m128i _mm_sll_epi32 (__m128i a, __m128i count) pure @trusted
3314 {
3315     static if (LDC_with_SSE2)
3316     {
3317         return __builtin_ia32_pslld128(a, count);
3318     }
3319     else static if (GDC_with_SSE2)
3320     {
3321         return __builtin_ia32_pslld128(a, count);
3322     }
3323     else static if (DMD_with_32bit_asm)
3324     {
3325         asm pure nothrow @nogc @trusted
3326         {
3327             movdqu XMM0, a;
3328             movdqu XMM1, count;
3329             pslld XMM0, XMM1;
3330             movdqu a, XMM0;
3331         }
3332         return a;
3333     }
3334     else
3335     {
3336         int4 r = void;
3337         long2 lc = cast(long2)count;
3338         int bits = cast(int)(lc.array[0]);
3339         foreach(i; 0..4)
3340             r[i] = cast(uint)(a[i]) << bits;
3341         return r;
3342     }
3343 }
3344 
3345 /// Shift packed 64-bit integers in `a` left by `count` while shifting in zeros.
3346 deprecated("Use _mm_slli_epi64 instead.") __m128i _mm_sll_epi64 (__m128i a, __m128i count) pure @trusted
3347 {
3348     static if (LDC_with_SSE2)
3349     {
3350         return cast(__m128i) __builtin_ia32_psllq128(cast(long2)a, cast(long2)count);
3351     }
3352     else static if (GDC_with_SSE2)
3353     {
3354         return cast(__m128i) __builtin_ia32_psllq128(cast(long2)a, cast(long2)count);
3355     }
3356     else static if (DMD_with_32bit_asm)
3357     {
3358         asm pure nothrow @nogc @trusted
3359         {
3360             movdqu XMM0, a;
3361             movdqu XMM1, count;
3362             psllq XMM0, XMM1;
3363             movdqu a, XMM0;
3364         }
3365         return a;
3366     }
3367     else
3368     {
        // ARM: good since LDC 1.12 -O2,
        // but the -O0 version is catastrophic
3371         long2 r = void;
3372         long2 sa = cast(long2)a;
3373         long2 lc = cast(long2)count;
3374         int bits = cast(int)(lc.array[0]);
3375         foreach(i; 0..2)
3376             r.array[i] = cast(ulong)(sa.array[i]) << bits;
3377         return cast(__m128i)r;
3378     }
3379 }
3380 
3381 /// Shift packed 16-bit integers in `a` left by `count` while shifting in zeros.
3382 deprecated("Use _mm_slli_epi16 instead.") __m128i _mm_sll_epi16 (__m128i a, __m128i count) pure @trusted
3383 {
3384     static if (LDC_with_SSE2)
3385     {
        return cast(__m128i) __builtin_ia32_psllw128(cast(short8)a, cast(short8)count);
3387     }
3388     else static if (GDC_with_SSE2)
3389     {
        return cast(__m128i) __builtin_ia32_psllw128(cast(short8)a, cast(short8)count);
3391     }
3392     else static if (DMD_with_32bit_asm)
3393     {
3394         asm pure nothrow @nogc
3395         {
3396             movdqu XMM0, a;
3397             movdqu XMM1, count;
3398             psllw XMM0, XMM1;
3399             movdqu a, XMM0;
3400         }
3401         return a;
3402     }
3403     else
3404     {
3405         short8 sa = cast(short8)a;
3406         long2 lc = cast(long2)count;
3407         int bits = cast(int)(lc.array[0]);
3408         short8 r = void;
3409         foreach(i; 0..8)
3410             r.ptr[i] = cast(short)(cast(ushort)(sa.array[i]) << bits);
3411         return cast(int4)r;
3412     }
3413 }
3414 
3415 
3416 /// Shift packed 32-bit integers in `a` left by `imm8` while shifting in zeros.
3417 __m128i _mm_slli_epi32 (__m128i a, int imm8) pure @trusted
3418 {
3419     static if (GDC_with_SSE2)
3420     {
3421         return __builtin_ia32_pslldi128(a, cast(ubyte)imm8);
3422     }
3423     else static if (LDC_with_SSE2)
3424     {
3425         return __builtin_ia32_pslldi128(a, cast(ubyte)imm8);
3426     }
3427     else
3428     {
        // Note: the intrinsic guarantees only imm8[0..7] is taken into account,
        //       but in D, shifting by as many or more bits than the width of
        //       the operand is undefined behaviour, hence the explicit guard below.
3433         int4 r = _mm_setzero_si128();
3434 
3435         ubyte count = cast(ubyte) imm8;
3436         if (count > 31)
3437             return r;
3438         
3439         foreach(i; 0..4)
3440             r.array[i] = cast(uint)(a.array[i]) << count;
3441         return r;
3442     }
3443 }
3444 unittest
3445 {
3446     __m128i A = _mm_setr_epi32(0, 2, 3, -4);
3447     __m128i B = _mm_slli_epi32(A, 1);
3448     __m128i B2 = _mm_slli_epi32(A, 1 + 256);
3449     int[4] expectedB = [ 0, 4, 6, -8];
3450     assert(B.array == expectedB);
3451     assert(B2.array == expectedB);
3452 
3453     __m128i C = _mm_slli_epi32(A, 0);
3454     int[4] expectedC = [ 0, 2, 3, -4];
3455     assert(C.array == expectedC);
3456 
3457     __m128i D = _mm_slli_epi32(A, 65);
3458     int[4] expectedD = [ 0, 0, 0, 0];
3459     assert(D.array == expectedD);
3460 }
3461 
3462 /// Shift packed 64-bit integers in `a` left by `imm8` while shifting in zeros.
3463 __m128i _mm_slli_epi64 (__m128i a, int imm8) pure @trusted
3464 {
3465     static if (GDC_with_SSE2)
3466     {
3467         return cast(__m128i) __builtin_ia32_psllqi128(cast(long2)a, cast(ubyte)imm8);
3468     }
3469     else static if (LDC_with_SSE2)
3470     {
3471         return cast(__m128i) __builtin_ia32_psllqi128(cast(long2)a, cast(ubyte)imm8);
3472     }
3473     else
3474     {
3475         long2 sa = cast(long2)a;
3476 
        // Note: the intrinsic guarantees only imm8[0..7] is taken into account,
        //       but in D, shifting by as many or more bits than the width of
        //       the operand is undefined behaviour, hence the explicit guard below.
3481         long2 r = cast(long2) _mm_setzero_si128();
3482         ubyte count = cast(ubyte) imm8;
3483         if (count > 63)
3484             return cast(__m128i)r;
3485 
3486         r.ptr[0] = cast(ulong)(sa.array[0]) << count;
3487         r.ptr[1] = cast(ulong)(sa.array[1]) << count;
3488         return cast(__m128i)r;
3489     }
3490 }
3491 unittest
3492 {
3493     __m128i A = _mm_setr_epi64(8, -4);
3494     long2 B = cast(long2) _mm_slli_epi64(A, 1);
3495     long2 B2 = cast(long2) _mm_slli_epi64(A, 1 + 1024);
3496     long[2] expectedB = [ 16, -8];
3497     assert(B.array == expectedB);
3498     assert(B2.array == expectedB);
3499 
3500     long2 C = cast(long2) _mm_slli_epi64(A, 0);
3501     long[2] expectedC = [ 8, -4];
3502     assert(C.array == expectedC);
3503 
3504     long2 D = cast(long2) _mm_slli_epi64(A, 64);
    long[2] expectedD = [ 0, 0 ];
3506     assert(D.array == expectedD);
3507 }
3508 
3509 /// Shift packed 16-bit integers in `a` left by `imm8` while shifting in zeros.
3510 __m128i _mm_slli_epi16(__m128i a, int imm8) pure @trusted
3511 {
3512     static if (GDC_with_SSE2)
3513     {
3514         return cast(__m128i) __builtin_ia32_psllwi128(cast(short8)a, cast(ubyte)imm8);
3515     }
3516     else static if (LDC_with_SSE2)
3517     {
3518         return cast(__m128i) __builtin_ia32_psllwi128(cast(short8)a, cast(ubyte)imm8);
3519     }
3520     else static if (LDC_with_ARM64)
3521     {
3522         short8 sa = cast(short8)a;
3523         short8 r = cast(short8)_mm_setzero_si128();
3524         ubyte count = cast(ubyte) imm8;
3525         if (count > 15)
3526             return cast(__m128i)r;
3527         r = sa << short8(count);
3528         return cast(__m128i)r;
3529     }
3530     else
3531     {
3532         short8 sa = cast(short8)a;
3533         short8 r = cast(short8)_mm_setzero_si128();
3534         ubyte count = cast(ubyte) imm8;
3535         if (count > 15)
3536             return cast(__m128i)r;
3537         foreach(i; 0..8)
3538             r.ptr[i] = cast(short)(sa.array[i] << count);
3539         return cast(__m128i)r;
3540     }
3541 }
3542 unittest
3543 {
3544     __m128i A = _mm_setr_epi16(0, 1, 2, 3, -4, -5, 6, 7);
3545     short8 B = cast(short8)( _mm_slli_epi16(A, 1) );
3546     short8 B2 = cast(short8)( _mm_slli_epi16(A, 1 + 256) );
3547     short[8] expectedB = [ 0, 2, 4, 6, -8, -10, 12, 14 ];
3548     assert(B.array == expectedB);
3549     assert(B2.array == expectedB);
3550 
3551     short8 C = cast(short8)( _mm_slli_epi16(A, 16) );
3552     short[8] expectedC = [ 0, 0, 0, 0, 0, 0, 0, 0 ];
3553     assert(C.array == expectedC);
3554 }
3555 
3556 
3557 /// Shift `a` left by `bytes` bytes while shifting in zeros.
3558 __m128i _mm_slli_si128(ubyte bytes)(__m128i op) pure @trusted
3559 {
3560     static if (bytes & 0xF0)
3561     {
3562         return _mm_setzero_si128();
3563     }
3564     else
3565     {
3566         static if (GDC_with_SSE2)
3567         {
3568             return cast(__m128i) __builtin_ia32_pslldqi128(cast(long2)op, cast(ubyte)(bytes * 8)); 
3569         }
3570         else version(DigitalMars)
3571         {
3572             version(D_InlineAsm_X86)
3573             {
3574                 asm pure nothrow @nogc @trusted // somehow doesn't work for x86_64
3575                 {
3576                     movdqu XMM0, op;
3577                     pslldq XMM0, bytes;
3578                     movdqu op, XMM0;
3579                 }
3580                 return op;
3581             }
3582             else
3583             {
3584                 byte16 A = cast(byte16)op;
3585                 byte16 R;
3586                 for (int n = 15; n >= bytes; --n)
3587                     R.ptr[n] = A.array[n-bytes];
3588                 for (int n = bytes-1; n >= 0; --n)
3589                     R.ptr[n] = 0;
3590                 return cast(__m128i)R;
3591             }
3592         }
3593         else
3594         {
3595             return cast(__m128i) shufflevector!(byte16,
3596             16 - bytes, 17 - bytes, 18 - bytes, 19 - bytes, 20 - bytes, 21 - bytes,
3597             22 - bytes, 23 - bytes, 24 - bytes, 25 - bytes, 26 - bytes, 27 - bytes,
3598             28 - bytes, 29 - bytes, 30 - bytes, 31 - bytes)
3599             (cast(byte16)_mm_setzero_si128(), cast(byte16)op);
3600         }
3601     }
3602 }
3603 unittest
3604 {
3605     __m128i A = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
3606     short8 R = cast(short8) _mm_slli_si128!8(A); // shift 8 bytes to the left
3607     short[8] correct = [ 0, 0, 0, 0, 0, 1, 2, 3 ];
3608     assert(R.array == correct);
3609 
    __m128i B = _mm_slli_si128!16(_mm_set1_epi32(-1));
3611     int[4] expectedB = [0, 0, 0, 0];
3612     assert(B.array == expectedB);
3613 }
3614 
3615 /// Compute the square root of packed double-precision (64-bit) floating-point elements in `vec`.
3616 __m128d _mm_sqrt_pd(__m128d vec) pure @trusted
3617 {
3618     version(LDC)
3619     {
3620         // Disappeared with LDC 1.11
3621         static if (__VERSION__ < 2081)
3622             return __builtin_ia32_sqrtpd(vec);
3623         else
3624         {
3625             vec.array[0] = llvm_sqrt(vec.array[0]);
3626             vec.array[1] = llvm_sqrt(vec.array[1]);
3627             return vec;
3628         }
3629     }
3630     else static if (GDC_with_SSE2)    
3631     {
3632         return __builtin_ia32_sqrtpd(vec);
3633     }
3634     else
3635     {
3636         vec.ptr[0] = sqrt(vec.array[0]);
3637         vec.ptr[1] = sqrt(vec.array[1]);
3638         return vec;
3639     }
3640 }
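// Exact squares, so strict equality holds in IEEE arithmetic.
unittest
{
    __m128d A = _mm_setr_pd(4.0, 9.0);
    __m128d R = _mm_sqrt_pd(A);
    double[2] correct = [2.0, 3.0];
    assert(R.array == correct);
}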
3641 
3642 /// Compute the square root of the lower double-precision (64-bit) floating-point element in `b`, store the result in 
3643 /// the lower element of result, and copy the upper element from `a` to the upper element of result.
3644 __m128d _mm_sqrt_sd(__m128d a, __m128d b) pure @trusted
3645 {
3646     // Note: the builtin has one argument, since the legacy `sqrtsd` SSE2 instruction operates on the same register only.
3647     //       "128-bit Legacy SSE version: The first source operand and the destination operand are the same. 
3648     //        The quadword at bits 127:64 of the destination operand remains unchanged."
3649     version(LDC)
3650     {
3651         // Disappeared with LDC 1.11
3652         static if (__VERSION__ < 2081)
3653         {
3654             __m128d c = __builtin_ia32_sqrtsd(b);
3655             a[0] = c[0];
3656             return a;
3657         }
3658         else
3659         {
3660             a.array[0] = llvm_sqrt(b.array[0]);
3661             return a;
3662         }
3663     }
3664     else static if (GDC_with_SSE2)
3665     {
3666         __m128d c = __builtin_ia32_sqrtsd(b);
3667         a.ptr[0] = c.array[0];
3668         return a;
3669     }
3670     else
3671     {
3672         a.ptr[0] = sqrt(b.array[0]);
3673         return a;
3674     }
3675 }
3676 unittest
3677 {
3678     __m128d A = _mm_setr_pd(1.0, 3.0);
3679     __m128d B = _mm_setr_pd(4.0, 5.0);
3680     __m128d R = _mm_sqrt_sd(A, B);
3681     double[2] correct = [2.0, 3.0 ];
3682     assert(R.array == correct);
3683 }
3684 
3685 /// Shift packed 16-bit integers in `a` right by `count` while shifting in sign bits.
3686 deprecated("Use _mm_srai_epi16 instead.") __m128i _mm_sra_epi16 (__m128i a, __m128i count) pure @trusted
3687 {
3688     static if (GDC_with_SSE2)
3689     {
3690         return cast(__m128i) __builtin_ia32_psraw128(cast(short8)a, cast(short8)count);
3691     }
3692     else static if (LDC_with_SSE2)
3693     {
3694         return cast(__m128i) __builtin_ia32_psraw128(cast(short8)a, cast(short8)count);
3695     }
3696     else
3697     {
3698         short8 sa = cast(short8)a;
3699         long2 lc = cast(long2)count;
3700         int bits = cast(int)(lc.array[0]);
3701         short8 r = void;
3702         foreach(i; 0..8)
3703             r.ptr[i] = cast(short)(sa.array[i] >> bits);
3704         return cast(int4)r;
3705     }
3706 }
3707 
3708 /// Shift packed 32-bit integers in `a` right by `count` while shifting in sign bits.
3709 deprecated("Use _mm_srai_epi32 instead.") __m128i _mm_sra_epi32 (__m128i a, __m128i count) pure @trusted
3710 {
3711     static if (LDC_with_SSE2)
3712     {
3713         return __builtin_ia32_psrad128(a, count);
3714     }
3715     else static if (GDC_with_SSE2)
3716     {
3717         return __builtin_ia32_psrad128(a, count);
3718     }
3719     else
3720     {    
3721         int4 r = void;
3722         long2 lc = cast(long2)count;
3723         int bits = cast(int)(lc.array[0]);
3724         r.ptr[0] = (a.array[0] >> bits);
3725         r.ptr[1] = (a.array[1] >> bits);
3726         r.ptr[2] = (a.array[2] >> bits);
3727         r.ptr[3] = (a.array[3] >> bits);
3728         return r;
3729     }
3730 }
3731 
3732 
3733 /// Shift packed 16-bit integers in `a` right by `imm8` while shifting in sign bits.
3734 __m128i _mm_srai_epi16 (__m128i a, int imm8) pure @trusted
3735 {
3736     static if (GDC_with_SSE2)
3737     {
3738         return cast(__m128i) __builtin_ia32_psrawi128(cast(short8)a, cast(ubyte)imm8);
3739     }
3740     else static if (LDC_with_SSE2)
3741     {
3742         return cast(__m128i) __builtin_ia32_psrawi128(cast(short8)a, cast(ubyte)imm8);
3743     }
3744     else static if (LDC_with_ARM64)
3745     {
3746         short8 sa = cast(short8)a;
3747         ubyte count = cast(ubyte)imm8;
3748         if (count > 15) 
3749             count = 15;
3750         short8 r = sa >> short8(count);
3751         return cast(__m128i)r;
3752     }
3753     else
3754     {
3755         short8 sa = cast(short8)a;
3756         short8 r = void;
3757 
        // Note: the intrinsic guarantees only imm8[0..7] is taken into account,
        //       but in D, shifting by as many or more bits than the width of
        //       the operand is undefined behaviour, hence the explicit guard below.
3762         ubyte count = cast(ubyte)imm8;
3763         if (count > 15) 
3764             count = 15;
3765         foreach(i; 0..8)
3766             r.ptr[i] = cast(short)(sa.array[i] >> count);
3767         return cast(int4)r;
3768     }
3769 }
3770 unittest
3771 {
3772     __m128i A = _mm_setr_epi16(0, 1, 2, 3, -4, -5, 6, 7);
3773     short8 B = cast(short8)( _mm_srai_epi16(A, 1) );
3774     short8 B2 = cast(short8)( _mm_srai_epi16(A, 1 + 256) );
3775     short[8] expectedB = [ 0, 0, 1, 1, -2, -3, 3, 3 ];
3776     assert(B.array == expectedB);
3777     assert(B2.array == expectedB);
3778 
3779     short8 C = cast(short8)( _mm_srai_epi16(A, 18) );
3780     short[8] expectedC = [ 0, 0, 0, 0, -1, -1, 0, 0 ];
3781     assert(C.array == expectedC);
3782 }
3783 
3784 /// Shift packed 32-bit integers in `a` right by `imm8` while shifting in sign bits.
3785 __m128i _mm_srai_epi32 (__m128i a, int imm8) pure @trusted
3786 {
3787     static if (LDC_with_SSE2)
3788     {
3789         return __builtin_ia32_psradi128(a, cast(ubyte)imm8);
3790     }
3791     else static if (GDC_with_SSE2)
3792     {
3793         return __builtin_ia32_psradi128(a, cast(ubyte)imm8);
3794     }
3795     else
3796     {
3797         int4 r = void;
3798 
        // Note: the intrinsic guarantees only imm8[0..7] is taken into account,
        //       but in D, shifting by as many or more bits than the width of
        //       the operand is undefined behaviour, hence the explicit guard below.
3803         ubyte count = cast(ubyte) imm8;
3804         if (count > 31)
3805             count = 31;
3806 
3807         r.ptr[0] = (a.array[0] >> count);
3808         r.ptr[1] = (a.array[1] >> count);
3809         r.ptr[2] = (a.array[2] >> count);
3810         r.ptr[3] = (a.array[3] >> count);
3811         return r;
3812     }
3813 }
3814 unittest
3815 {
3816     __m128i A = _mm_setr_epi32(0, 2, 3, -4);
3817     __m128i B = _mm_srai_epi32(A, 1);
3818     __m128i B2 = _mm_srai_epi32(A, 1 + 256);
3819     int[4] expectedB = [ 0, 1, 1, -2];
3820     assert(B.array == expectedB);
3821     assert(B2.array == expectedB);
3822 
3823     __m128i C = _mm_srai_epi32(A, 32);
3824     int[4] expectedC = [ 0, 0, 0, -1];
3825     assert(C.array == expectedC);
3826 
3827     __m128i D = _mm_srai_epi32(A, 0);
3828     int[4] expectedD = [ 0, 2, 3, -4];
3829     assert(D.array == expectedD);
3830 }
3831 
3832 deprecated("Use _mm_srli_epi16 instead.") __m128i _mm_srl_epi16 (__m128i a, __m128i count) pure @trusted
3833 {
3834     static if (LDC_with_SSE2)
3835     {
3836         return cast(__m128i) __builtin_ia32_psrlw128(cast(short8)a, cast(short8)count);
3837     }
3838     else static if (GDC_with_SSE2)
3839     {
3840         return cast(__m128i) __builtin_ia32_psrlw128(cast(short8)a, cast(short8)count);
3841     }
3842     else
3843     {
3844         short8 sa = cast(short8)a;
3845         long2 lc = cast(long2)count;
3846         int bits = cast(int)(lc.array[0]);
3847         short8 r = void;
3848         foreach(i; 0..8)
3849             r.ptr[i] = cast(short)(cast(ushort)(sa.array[i]) >> bits);
3850         return cast(int4)r;
3851     }
3852 }
3853 
3854 deprecated("Use _mm_srli_epi32 instead.") __m128i _mm_srl_epi32 (__m128i a, __m128i count) pure @trusted
3855 {
3856     static if (LDC_with_SSE2)
3857     {
3858         return __builtin_ia32_psrld128(a, count);
3859     }
3860     else static if (GDC_with_SSE2)
3861     {
3862         return __builtin_ia32_psrld128(a, count);
3863     }
3864     else
3865     {
3866         int4 r = void;
3867         long2 lc = cast(long2)count;
3868         int bits = cast(int)(lc.array[0]);
3869         r.ptr[0] = cast(uint)(a.array[0]) >> bits;
3870         r.ptr[1] = cast(uint)(a.array[1]) >> bits;
3871         r.ptr[2] = cast(uint)(a.array[2]) >> bits;
3872         r.ptr[3] = cast(uint)(a.array[3]) >> bits;
3873         return r;
3874     }
3875 }
3876 
3877 deprecated("Use _mm_srli_epi64 instead.") __m128i _mm_srl_epi64 (__m128i a, __m128i count) pure @trusted
3878 {
3879     static if (LDC_with_SSE2)
3880     {
3881         return cast(__m128i) __builtin_ia32_psrlq128(cast(long2)a, cast(long2)count);
3882     }
3883     else static if (GDC_with_SSE2)
3884     {
3885         return cast(__m128i) __builtin_ia32_psrlq128(cast(long2)a, cast(long2)count);
3886     }
3887     else
3888     {
3889         long2 r = void;
3890         long2 sa = cast(long2)a;
3891         long2 lc = cast(long2)count;
3892         int bits = cast(int)(lc.array[0]);
3893         r.ptr[0] = cast(ulong)(sa.array[0]) >> bits;
3894         r.ptr[1] = cast(ulong)(sa.array[1]) >> bits;
3895         return cast(__m128i)r;
3896     }
3897 }
3898 
3899 /// Shift packed 16-bit integers in `a` right by `imm8` while shifting in zeros.
3900 __m128i _mm_srli_epi16 (__m128i a, int imm8) pure @trusted
3901 {
3902     static if (GDC_with_SSE2)
3903     {
3904         return cast(__m128i) __builtin_ia32_psrlwi128(cast(short8)a, cast(ubyte)imm8);
3905     }
3906     else static if (LDC_with_SSE2)
3907     {
3908         return cast(__m128i) __builtin_ia32_psrlwi128(cast(short8)a, cast(ubyte)imm8);
3909     }
3910     else static if (LDC_with_ARM64)
3911     {
3912         short8 sa = cast(short8)a;
3913         short8 r = cast(short8) _mm_setzero_si128();
3914 
3915         ubyte count = cast(ubyte)imm8;
3916         if (count >= 16)
3917             return cast(__m128i)r;
3918 
        r = sa >>> short8(count); // the vector >>> operator is available in LDC, but not in DMD
3920         return cast(__m128i)r;
3921     }
3922     else
3923     {
3924         short8 sa = cast(short8)a;
3925         ubyte count = cast(ubyte)imm8;
3926 
3927         short8 r = cast(short8) _mm_setzero_si128();
3928         if (count >= 16)
3929             return cast(__m128i)r;
3930 
3931         foreach(i; 0..8)
3932             r.array[i] = cast(short)(cast(ushort)(sa.array[i]) >> count);
3933         return cast(__m128i)r;
3934     }
3935 }
3936 unittest
3937 {
3938     __m128i A = _mm_setr_epi16(0, 1, 2, 3, -4, -5, 6, 7);
3939     short8 B = cast(short8)( _mm_srli_epi16(A, 1) );
3940     short8 B2 = cast(short8)( _mm_srli_epi16(A, 1 + 256) );
3941     short[8] expectedB = [ 0, 0, 1, 1, 0x7FFE, 0x7FFD, 3, 3 ];
3942     assert(B.array == expectedB);
3943     assert(B2.array == expectedB);
3944 
3945     short8 C = cast(short8)( _mm_srli_epi16(A, 16) );
3946     short[8] expectedC = [ 0, 0, 0, 0, 0, 0, 0, 0];
3947     assert(C.array == expectedC);
3948 
3949     short8 D = cast(short8)( _mm_srli_epi16(A, 0) );
3950     short[8] expectedD = [ 0, 1, 2, 3, -4, -5, 6, 7 ];
3951     assert(D.array == expectedD);
3952 }
3953 
3954 
3955 /// Shift packed 32-bit integers in `a` right by `imm8` while shifting in zeros.
3956 __m128i _mm_srli_epi32 (__m128i a, int imm8) pure @trusted
3957 {
3958     static if (GDC_with_SSE2)
3959     {
3960         return __builtin_ia32_psrldi128(a, cast(ubyte)imm8);
3961     }
3962     else static if (LDC_with_SSE2)
3963     {
3964         return __builtin_ia32_psrldi128(a, cast(ubyte)imm8);
3965     }
3966     else
3967     {
3968         ubyte count = cast(ubyte) imm8;
3969 
        // Note: the intrinsic guarantees only imm8[0..7] is taken into account,
        //       but in D, shifting by as many or more bits than the width of
        //       the operand is undefined behaviour, hence the explicit guard below.
3974         int4 r = _mm_setzero_si128();
3975         if (count >= 32)
3976             return r;
3977         r.ptr[0] = a.array[0] >>> count;
3978         r.ptr[1] = a.array[1] >>> count;
3979         r.ptr[2] = a.array[2] >>> count;
3980         r.ptr[3] = a.array[3] >>> count;
3981         return r;
3982     }
3983 }
3984 unittest
3985 {
3986     __m128i A = _mm_setr_epi32(0, 2, 3, -4);
3987     __m128i B = _mm_srli_epi32(A, 1);
3988     __m128i B2 = _mm_srli_epi32(A, 1 + 256);
3989     int[4] expectedB = [ 0, 1, 1, 0x7FFFFFFE];
3990     assert(B.array == expectedB);
3991     assert(B2.array == expectedB);
3992  
3993     __m128i C = _mm_srli_epi32(A, 255);
3994     int[4] expectedC = [ 0, 0, 0, 0 ];
3995     assert(C.array == expectedC);
3996 }
3997 
3998 /// Shift packed 64-bit integers in `a` right by `imm8` while shifting in zeros.
3999 __m128i _mm_srli_epi64 (__m128i a, int imm8) pure @trusted
4000 {
4001     static if (GDC_with_SSE2)
4002     {
4003         return cast(__m128i) __builtin_ia32_psrlqi128(cast(long2)a, cast(ubyte)imm8);
4004     }
4005     else static if (LDC_with_SSE2)
4006     {
4007         return cast(__m128i) __builtin_ia32_psrlqi128(cast(long2)a, cast(ubyte)imm8);
4008     }
4009     else
4010     {
4011         long2 r = cast(long2) _mm_setzero_si128();
4012         long2 sa = cast(long2)a;
4013 
4014         ubyte count = cast(ubyte) imm8;
4015         if (count >= 64)
4016             return cast(__m128i)r;
4017 
4018         r.ptr[0] = sa.array[0] >>> count;
4019         r.ptr[1] = sa.array[1] >>> count;
4020         return cast(__m128i)r;
4021     }
4022 }
4023 unittest
4024 {
4025     __m128i A = _mm_setr_epi64(8, -4);
4026     long2 B = cast(long2) _mm_srli_epi64(A, 1);
4027     long2 B2 = cast(long2) _mm_srli_epi64(A, 1 + 512);
4028     long[2] expectedB = [ 4, 0x7FFFFFFFFFFFFFFE];
4029     assert(B.array == expectedB);
4030     assert(B2.array == expectedB);
4031 
4032     long2 C = cast(long2) _mm_srli_epi64(A, 64);
4033     long[2] expectedC = [ 0, 0 ];
4034     assert(C.array == expectedC);
4035 }
4036 
4037 /// Shift `v` right by `bytes` bytes while shifting in zeros.
4038 __m128i _mm_srli_si128(ubyte bytes)(__m128i v) pure @safe
4039 {
4040     static if (bytes & 0xF0)
4041     {
4042         return _mm_setzero_si128();
4043     }
4044     else static if (GDC_with_SSE2)
4045     {
4046         return cast(__m128i) __builtin_ia32_psrldqi128(cast(long2)v, cast(ubyte)(bytes * 8));
4047     }
4048     else static if (DMD_with_32bit_asm)
4049     {
4050         asm pure nothrow @nogc @trusted
4051         {
4052             movdqu XMM0, v;
4053             psrldq XMM0, bytes;
4054             movdqu v, XMM0;
4055         }
4056         return v;
4057     }
4058     else
4059     {
4060         return cast(__m128i) shufflevector!(byte16,
4061                                             bytes+0, bytes+1, bytes+2, bytes+3, bytes+4, bytes+5, bytes+6, bytes+7,
4062                                             bytes+8, bytes+9, bytes+10, bytes+11, bytes+12, bytes+13, bytes+14, bytes+15)
4063                                            (cast(byte16) v, cast(byte16)_mm_setzero_si128());
4064     }
4065 }
4066 unittest
4067 {
4068     __m128i R = _mm_srli_si128!4(_mm_set_epi32(4, 3, 2, 1));
4069     int[4] correct = [2, 3, 4, 0];
4070     assert(R.array == correct);
4071 
4072     __m128i A = _mm_srli_si128!16(_mm_set1_epi32(-1));
4073     int[4] expectedA = [0, 0, 0, 0];
4074     assert(A.array == expectedA);
4075 }
4076 
4077 /// Shift `v` right by `bytes` bytes while shifting in zeros.
4078 /// #BONUS
4079 __m128 _mm_srli_ps(ubyte bytes)(__m128 v) pure @safe
4080 {
4081     return cast(__m128)_mm_srli_si128!bytes(cast(__m128i)v);
4082 }
4083 unittest
4084 {
4085     __m128 R = _mm_srli_ps!8(_mm_set_ps(4.0f, 3.0f, 2.0f, 1.0f));
4086     float[4] correct = [3.0f, 4.0f, 0, 0];
4087     assert(R.array == correct);
4088 }
4089 
4090 /// Shift `v` right by `bytes` bytes while shifting in zeros.
4091 /// #BONUS
4092 __m128d _mm_srli_pd(ubyte bytes)(__m128d v) pure @safe
4093 {
4094     return cast(__m128d) _mm_srli_si128!bytes(cast(__m128i)v);
4095 }
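// Shifting right by 8 bytes moves the upper lane down and zero-fills the upper lane.
unittest
{
    __m128d R = _mm_srli_pd!8(_mm_setr_pd(1.0, 2.0));
    double[2] correct = [2.0, 0.0];
    assert(R.array == correct);
}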
4096 
4097 /// Store 128-bits (composed of 2 packed double-precision (64-bit) floating-point elements) from `a` into memory. 
4098 /// `mem_addr` must be aligned on a 16-byte boundary or a general-protection exception may be generated.
4099 void _mm_store_pd (double* mem_addr, __m128d a) pure @trusted
4100 {
4101     __m128d* aligned = cast(__m128d*)mem_addr;
4102     *aligned = a;
4103 }
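// Round-trip through a 16-byte aligned stack buffer.
unittest
{
    align(16) double[2] mem;
    _mm_store_pd(mem.ptr, _mm_setr_pd(1.5, -2.0));
    double[2] correct = [1.5, -2.0];
    assert(mem == correct);
}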
4104 
4105 /// Store the lower double-precision (64-bit) floating-point element from `a` into 2 contiguous elements in memory. 
4106 /// `mem_addr` must be aligned on a 16-byte boundary or a general-protection exception may be generated.
4107 void _mm_store_pd1 (double* mem_addr, __m128d a) pure @trusted
4108 {
4109     __m128d* aligned = cast(__m128d*)mem_addr;
4110     __m128d r;
4111     r.ptr[0] = a.array[0];
4112     r.ptr[1] = a.array[0];
4113     *aligned = r;
4114 }
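// The lower lane is broadcast to both memory slots.
unittest
{
    align(16) double[2] mem;
    _mm_store_pd1(mem.ptr, _mm_setr_pd(42.0, -8.0));
    double[2] correct = [42.0, 42.0];
    assert(mem == correct);
}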
4115 
4116 /// Store the lower double-precision (64-bit) floating-point element from `a` into memory. `mem_addr` does not need to 
4117 /// be aligned on any particular boundary.
4118 void _mm_store_sd (double* mem_addr, __m128d a) pure @safe
4119 {
4120     *mem_addr = a.array[0];
4121 }
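// Writes only the lower lane, with no alignment requirement.
unittest
{
    double d = 0.0;
    _mm_store_sd(&d, _mm_setr_pd(61.0, 55.0));
    assert(d == 61.0);
}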
4122 
4123 /// Store 128-bits of integer data from `a` into memory. `mem_addr` must be aligned on a 16-byte boundary or a 
4124 /// general-protection exception may be generated.
4125 void _mm_store_si128 (__m128i* mem_addr, __m128i a) pure @safe
4126 {
4127     *mem_addr = a;
4128 }
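// Round-trip through a naturally-aligned __m128i variable.
unittest
{
    __m128i A = _mm_setr_epi32(1, 2, 3, 4);
    __m128i B;
    _mm_store_si128(&B, A);
    int[4] correct = [1, 2, 3, 4];
    assert(B.array == correct);
}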
4129 
4130 alias _mm_store1_pd = _mm_store_pd1; ///
4131 
4132 /// Store the upper double-precision (64-bit) floating-point element from `a` into memory.
4133 void _mm_storeh_pd (double* mem_addr, __m128d a) pure @safe
4134 {
4135     *mem_addr = a.array[1];
4136 }
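// Only the upper lane is written to memory.
unittest
{
    double d = 0.0;
    _mm_storeh_pd(&d, _mm_setr_pd(61.0, 55.0));
    assert(d == 55.0);
}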
4137 
// Note: `mem_addr` doesn't have to actually be aligned, which breaks
// expectations from the user's point of view. This problem also exists in C++.
4140 void _mm_storel_epi64 (__m128i* mem_addr, __m128i a) pure @safe
4141 {
4142     long* dest = cast(long*)mem_addr;
4143     long2 la = cast(long2)a;
4144     *dest = la.array[0];
4145 }
4146 unittest
4147 {
4148     long[3] A = [1, 2, 3];
4149     _mm_storel_epi64(cast(__m128i*)(&A[1]), _mm_set_epi64x(0x1_0000_0000, 0x1_0000_0000));
4150     long[3] correct = [1, 0x1_0000_0000, 3];
4151     assert(A == correct);
4152 }
4153 
4154 /// Store the lower double-precision (64-bit) floating-point element from `a` into memory.
4155 void _mm_storel_pd (double* mem_addr, __m128d a) pure @safe
4156 {
4157     *mem_addr = a.array[0];
4158 }
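// Only the lower lane is written to memory.
unittest
{
    double d = 0.0;
    _mm_storel_pd(&d, _mm_setr_pd(61.0, 55.0));
    assert(d == 61.0);
}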
4159 
4160 /// Store 2 double-precision (64-bit) floating-point elements from `a` into memory in reverse order. `mem_addr` must be 
4161 /// aligned on a 16-byte boundary or a general-protection exception may be generated.
void _mm_storer_pd (double* mem_addr, __m128d a) pure @trusted
4163 {
4164     __m128d* aligned = cast(__m128d*)mem_addr;
4165     *aligned = shufflevector!(double2, 1, 0)(a, a);
4166 }
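// The two lanes land in memory in reversed order (16-byte aligned stack buffer assumed).
unittest
{
    align(16) double[2] mem;
    _mm_storer_pd(mem.ptr, _mm_setr_pd(2.0, 4.0));
    double[2] correct = [4.0, 2.0];
    assert(mem == correct);
}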
4167 
4168 /// Store 128-bits (composed of 2 packed double-precision (64-bit) floating-point elements) from `a` into memory. 
4169 /// `mem_addr` does not need to be aligned on any particular boundary.
4170 void _mm_storeu_pd (double* mem_addr, __m128d a) pure @safe
4171 {
4172     storeUnaligned!double2(a, mem_addr);
4173 }
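// No alignment requirement on the destination.
unittest
{
    double[2] mem;
    _mm_storeu_pd(mem.ptr, _mm_setr_pd(3.0, 4.0));
    double[2] correct = [3.0, 4.0];
    assert(mem == correct);
}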
4174 
4175 /// Store 128-bits of integer data from `a` into memory. `mem_addr` does not need to be aligned on any particular 
4176 /// boundary.
4177 void _mm_storeu_si128 (__m128i* mem_addr, __m128i a) pure @safe
4178 {
4179     storeUnaligned!__m128i(a, cast(int*)mem_addr);
4180 }
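// No alignment requirement on the destination.
unittest
{
    int[4] mem;
    _mm_storeu_si128(cast(__m128i*) mem.ptr, _mm_setr_epi32(5, 6, 7, 8));
    int[4] correct = [5, 6, 7, 8];
    assert(mem == correct);
}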
4181 
4182 /// Store 32-bit integer from the first element of `a` into memory. 
4183 /// `mem_addr` does not need to be aligned on any particular boundary.
4184 void _mm_storeu_si32 (void* mem_addr, __m128i a) pure @trusted
4185 {
4186     int* dest = cast(int*)mem_addr;
4187     *dest = a.array[0];
4188 }
4189 unittest
4190 {
4191     int[2] arr = [-24, 12];
4192     _mm_storeu_si32(&arr[1], _mm_setr_epi32(-1, -2, -6, -7));
4193     assert(arr == [-24, -1]);
4194 }
4195 
/// Store 128-bits (composed of 2 packed double-precision (64-bit) floating-point elements)
/// from `a` into memory using a non-temporal memory hint. `mem_addr` must be aligned on a 16-byte
/// boundary or a general-protection exception may be generated.
4199 void _mm_stream_pd (double* mem_addr, __m128d a)
4200 {
4201     // BUG see `_mm_stream_ps` for an explanation why we don't implement non-temporal moves
4202     __m128d* dest = cast(__m128d*)mem_addr;
4203     *dest = a;
4204 }
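// Behaves like a plain aligned store in this implementation (see BUG note above).
unittest
{
    align(16) double[2] mem;
    _mm_stream_pd(mem.ptr, _mm_setr_pd(1.0, 2.0));
    double[2] correct = [1.0, 2.0];
    assert(mem == correct);
}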
4205 
/// Store 128-bits of integer data from `a` into memory using a non-temporal memory hint.
/// `mem_addr` must be aligned on a 16-byte boundary or a general-protection exception
/// may be generated.
4209 void _mm_stream_si128 (__m128i* mem_addr, __m128i a)
4210 {
4211     // BUG see `_mm_stream_ps` for an explanation why we don't implement non-temporal moves
4212     __m128i* dest = cast(__m128i*)mem_addr;
4213     *dest = a;
4214 }
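// Behaves like a plain aligned store in this implementation (see BUG note above).
unittest
{
    __m128i dest;
    _mm_stream_si128(&dest, _mm_setr_epi32(9, 10, 11, 12));
    int[4] correct = [9, 10, 11, 12];
    assert(dest.array == correct);
}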
4215 
/// Store 32-bit integer `a` into memory using a non-temporal hint to minimize cache
/// pollution. If the cache line containing address `mem_addr` is already in the cache,
/// the cache will be updated.
4219 void _mm_stream_si32 (int* mem_addr, int a)
4220 {
4221     // BUG see `_mm_stream_ps` for an explanation why we don't implement non-temporal moves
4222     *mem_addr = a;
4223 }
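// Semantically a plain store in this implementation.
unittest
{
    int a = 0;
    _mm_stream_si32(&a, 42);
    assert(a == 42);
}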
4224 
/// Store 64-bit integer `a` into memory using a non-temporal hint to minimize
/// cache pollution. If the cache line containing address `mem_addr` is already
/// in the cache, the cache will be updated.
4228 void _mm_stream_si64 (long* mem_addr, long a)
4229 {
4230     // BUG See `_mm_stream_ps` for an explanation why we don't implement non-temporal moves
4231     *mem_addr = a;
4232 }
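// Semantically a plain store in this implementation.
unittest
{
    long a = 0;
    _mm_stream_si64(&a, -42);
    assert(a == -42);
}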
4233 
4234 /// Subtract packed 16-bit integers in `b` from packed 16-bit integers in `a`.
4235 __m128i _mm_sub_epi16(__m128i a, __m128i b) pure @safe
4236 {
4237     return cast(__m128i)(cast(short8)a - cast(short8)b);
4238 }
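// Element-wise wrapping subtraction.
unittest
{
    __m128i A = _mm_setr_epi16(4, 8, 13, -7, -1, 0, 9, 77);
    __m128i B = _mm_setr_epi16(1, 2, 3,  4,  5, 6, 7,  8);
    short8 R = cast(short8) _mm_sub_epi16(A, B);
    short[8] correct = [3, 6, 10, -11, -6, -6, 2, 69];
    assert(R.array == correct);
}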
4239 
4240 /// Subtract packed 32-bit integers in `b` from packed 32-bit integers in `a`.
4241 __m128i _mm_sub_epi32(__m128i a, __m128i b) pure @safe
4242 {
4243     return cast(__m128i)(cast(int4)a - cast(int4)b);
4244 }
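// Signs must come out right from the lane-wise difference.
unittest
{
    __m128i A = _mm_setr_epi32(-7, -1, 0, 9);
    __m128i B = _mm_setr_epi32( 1,  2, 3, 4);
    int4 R = _mm_sub_epi32(A, B);
    int[4] correct = [-8, -3, -3, 5];
    assert(R.array == correct);
}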
4245 
4246 /// Subtract packed 64-bit integers in `b` from packed 64-bit integers in `a`.
4247 __m128i _mm_sub_epi64(__m128i a, __m128i b) pure @safe
4248 {
4249     return cast(__m128i)(cast(long2)a - cast(long2)b);
4250 }
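// 64-bit lane-wise difference, including negative operands.
unittest
{
    __m128i A = _mm_setr_epi64(42, -4);
    __m128i B = _mm_setr_epi64( 2, -6);
    long2 R = cast(long2) _mm_sub_epi64(A, B);
    long[2] correct = [40, 2];
    assert(R.array == correct);
}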
4251 
4252 /// Subtract packed 8-bit integers in `b` from packed 8-bit integers in `a`.
4253 __m128i _mm_sub_epi8(__m128i a, __m128i b) pure @safe
4254 {
4255     return cast(__m128i)(cast(byte16)a - cast(byte16)b);
4256 }
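// 8-bit lane-wise wrapping difference.
unittest
{
    __m128i A = _mm_setr_epi8(4, 8, 13, -7, -1, 0, 9, 77, 4, 8, 13, -7, -1, 0, 9, 78);
    __m128i B = _mm_setr_epi8(1, 1,  1,  1,  1, 1, 1,  1, 2, 2,  2,  2,  2, 2, 2,  2);
    byte16 R = cast(byte16) _mm_sub_epi8(A, B);
    byte[16] correct = [3, 7, 12, -8, -2, -1, 8, 76, 2, 6, 11, -9, -3, -2, 7, 76];
    assert(R.array == correct);
}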
4257 
4258 /// Subtract packed double-precision (64-bit) floating-point elements in `b` from packed double-precision (64-bit) 
4259 /// floating-point elements in `a`.
4260 __m128d _mm_sub_pd(__m128d a, __m128d b) pure @safe
4261 {
4262     return a - b;
4263 }
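// Exactly representable operands, so strict equality holds.
unittest
{
    __m128d A = _mm_setr_pd(4000.0, -8.0);
    __m128d B = _mm_setr_pd(  12.0, -8450.0);
    __m128d R = _mm_sub_pd(A, B);
    double[2] correct = [3988.0, 8442.0];
    assert(R.array == correct);
}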
4264 
4265 /// Subtract the lower double-precision (64-bit) floating-point element in `b` from the lower double-precision (64-bit) 
4266 /// floating-point element in `a`, store that in the lower element of result, and copy the upper element from `a` to the
4267 /// upper element of result.
4268 __m128d _mm_sub_sd(__m128d a, __m128d b) pure @trusted
4269 {
4270     version(DigitalMars)
4271     {
4272         // Work-around for https://issues.dlang.org/show_bug.cgi?id=19599
4273         // Note that this is unneeded since DMD >= 2.094.0 at least, haven't investigated again
4274         asm pure nothrow @nogc @trusted { nop;}
4275         a[0] = a[0] - b[0];
4276         return a;
4277     }
4278     else static if (GDC_with_SSE2)
4279     {
4280         return __builtin_ia32_subsd(a, b);
4281     }
4282     else
4283     {
4284         a.ptr[0] -= b.array[0];
4285         return a;
4286     }
4287 }
4288 unittest
4289 {
4290     __m128d a = [1.5, -2.0];
4291     a = _mm_sub_sd(a, a);
4292     assert(a.array == [0.0, -2.0]);
4293 }
4294 
4295 /// Subtract 64-bit integer `b` from 64-bit integer `a`.
4296 __m64 _mm_sub_si64 (__m64 a, __m64 b) pure @safe
4297 {
4298     return a - b;
4299 }
4300 
/// Subtract packed 16-bit integers in `b` from packed 16-bit integers in `a` using signed saturation.
4302 __m128i _mm_subs_epi16(__m128i a, __m128i b) pure @trusted
4303 {
4304     version(LDC)
4305     {
4306         static if (__VERSION__ >= 2085) // saturation x86 intrinsics disappeared in LLVM 8
4307         {
4308             // Generates PSUBSW since LDC 1.15 -O0
4311             enum prefix = `declare <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> %a, <8 x i16> %b)`;
4312             enum ir = `
4313                 %r = call <8 x i16> @llvm.ssub.sat.v8i16( <8 x i16> %0, <8 x i16> %1)
4314                 ret <8 x i16> %r`;
4315             return cast(__m128i) LDCInlineIREx!(prefix, ir, "", short8, short8, short8)(cast(short8)a, cast(short8)b);
4316         }
4317         else static if (LDC_with_ARM) // Raspberry ships with LDC 1.12, no saturation 
4318         {
4320             short[8] res;
4321             short8 sa = cast(short8)a;
4322             short8 sb = cast(short8)b;
4323             foreach(i; 0..8)
4324                 res[i] = saturateSignedIntToSignedShort(sa.array[i] - sb.array[i]);
4325             return _mm_loadu_si128(cast(int4*)res.ptr);
4326         }
4327         else static if (LDC_with_SSE2)
4328         {
4329             return cast(__m128i) __builtin_ia32_psubsw128(cast(short8) a, cast(short8) b);
4330         }
4331         else
4332             static assert(false);
4333     }
4334     else static if (GDC_with_SSE2)
4335     {
4336         return cast(__m128i) __builtin_ia32_psubsw128(cast(short8) a, cast(short8) b);
4337     }
4338     else
4339     {
4340         short[8] res;
4341         short8 sa = cast(short8)a;
4342         short8 sb = cast(short8)b;
4343         foreach(i; 0..8)
4344             res.ptr[i] = saturateSignedIntToSignedShort(sa.array[i] - sb.array[i]);
4345         return _mm_loadu_si128(cast(int4*)res.ptr);
4346     }
4347 }
4348 unittest
4349 {
4350     short8 res = cast(short8) _mm_subs_epi16(_mm_setr_epi16(32760, -32760, 5, 4, 3, 2, 1, 0),
4351                                              _mm_setr_epi16(-10  ,     16, 5, 4, 3, 2, 1, 0));
4352     static immutable short[8] correctResult =              [32767, -32768, 0, 0, 0, 0, 0, 0];
4353     assert(res.array == correctResult);
4354 }
4355 
/// Subtract packed 8-bit integers in `b` from packed 8-bit integers in `a` using signed saturation.
4357 __m128i _mm_subs_epi8(__m128i a, __m128i b) pure @trusted
4358 {
4359     version(LDC)
4360     {
4361         static if (__VERSION__ >= 2085) // saturation x86 intrinsics disappeared in LLVM 8
4362         {
4363             // x86: Generates PSUBSB since LDC 1.15 -O0
4364             // ARM: Generates sqsub.16b since LDC 1.21 -O0
4365             enum prefix = `declare <16 x i8> @llvm.ssub.sat.v16i8(<16 x i8> %a, <16 x i8> %b)`;
4366             enum ir = `
4367                 %r = call <16 x i8> @llvm.ssub.sat.v16i8( <16 x i8> %0, <16 x i8> %1)
4368                 ret <16 x i8> %r`;
4369             return cast(__m128i) LDCInlineIREx!(prefix, ir, "", byte16, byte16, byte16)(cast(byte16)a, cast(byte16)b);
4370         }
4371         else static if (LDC_with_ARM) // Raspberry ships with LDC 1.12, no saturation 
4372         {
4373             byte[16] res;
4374             byte16 sa = cast(byte16)a;
4375             byte16 sb = cast(byte16)b;
4376             foreach(i; 0..16)
4377                 res[i] = saturateSignedWordToSignedByte(sa.array[i] - sb.array[i]);
4378             return _mm_loadu_si128(cast(int4*)res.ptr);
4379         }
4380         else static if (LDC_with_SSE2)
4381         {
4382             return cast(__m128i) __builtin_ia32_psubsb128(cast(byte16) a, cast(byte16) b);
4383         }
4384         else
4385             static assert(false);
4386     }
4387     else static if (GDC_with_SSE2)
4388     {
4389         return cast(__m128i) __builtin_ia32_psubsb128(cast(ubyte16) a, cast(ubyte16) b);
4390     }
4391     else
4392     {
4393         byte[16] res;
4394         byte16 sa = cast(byte16)a;
4395         byte16 sb = cast(byte16)b;
4396         foreach(i; 0..16)
4397             res[i] = saturateSignedWordToSignedByte(sa.array[i] - sb.array[i]);
4398         return _mm_loadu_si128(cast(int4*)res.ptr);
4399     }
4400 }
4401 unittest
4402 {
4403     byte16 res = cast(byte16) _mm_subs_epi8(_mm_setr_epi8(-128, 127, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0),
4404                                             _mm_setr_epi8(  15, -14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0));
4405     static immutable byte[16] correctResult            = [-128, 127,  0,  0,  0,  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0];
4406     assert(res.array == correctResult);
4407 }
4408 
/// Subtract packed 16-bit unsigned integers in `b` from packed 16-bit unsigned integers in `a` using unsigned saturation.
4410 __m128i _mm_subs_epu16(__m128i a, __m128i b) pure @trusted
4411 {
4412     version(LDC)
4413     {
4414         static if (__VERSION__ >= 2085) // saturation x86 intrinsics disappeared in LLVM 8
4415         {
4416             // x86: Generates PSUBUSW since LDC 1.15 -O0
4417             // ARM: Generates uqsub.8h since LDC 1.21 -O0
4418             enum prefix = `declare <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> %a, <8 x i16> %b)`;
4419             enum ir = `
4420                 %r = call <8 x i16> @llvm.usub.sat.v8i16( <8 x i16> %0, <8 x i16> %1)
4421                 ret <8 x i16> %r`;
4422             return cast(__m128i) LDCInlineIREx!(prefix, ir, "", short8, short8, short8)(cast(short8)a, cast(short8)b);
4423         }
4424         else static if (LDC_with_ARM) // Raspberry ships with LDC 1.12, no saturation 
4425         {
4426             short[8] res;
4427             short8 sa = cast(short8)a;
4428             short8 sb = cast(short8)b;
4429             foreach(i; 0..8)
4430             {
4431                 int sum = cast(ushort)(sa.array[i]) - cast(ushort)(sb.array[i]);
4432                 res[i] = saturateSignedIntToUnsignedShort(sum);
4433             }
4434             return _mm_loadu_si128(cast(int4*)res.ptr);
4435         }
4436         else static if (LDC_with_SSE2)
4437         {
            return cast(__m128i) __builtin_ia32_psubusw128(cast(short8)a, cast(short8)b);
4439         }
4440         else 
4441             static assert(false);
4442     }
4443     else static if (GDC_with_SSE2)
4444     {
4445         return cast(__m128i) __builtin_ia32_psubusw128(cast(short8)a, cast(short8)b);
4446     }
4447     else
4448     {
4449         short[8] res;
4450         short8 sa = cast(short8)a;
4451         short8 sb = cast(short8)b;
4452         foreach(i; 0..8)
4453         {
4454             int sum = cast(ushort)(sa.array[i]) - cast(ushort)(sb.array[i]);
4455             res[i] = saturateSignedIntToUnsignedShort(sum);
4456         }
4457         return _mm_loadu_si128(cast(int4*)res.ptr);
4458     }
4459 }
4460 unittest
4461 {
4462     short8 R = cast(short8) _mm_subs_epu16(_mm_setr_epi16(cast(short)65534,  1, 5, 4, 3, 2, 1, 0),
4463                                            _mm_setr_epi16(cast(short)65535, 16, 4, 4, 3, 0, 1, 0));
4464     static immutable short[8] correct =                  [               0,  0, 1, 0, 0, 2, 0, 0];
4465     assert(R.array == correct);
4466 }
4467 
/// Subtract packed 8-bit unsigned integers in `b` from packed 8-bit unsigned integers in `a` using unsigned saturation.
4469 __m128i _mm_subs_epu8(__m128i a, __m128i b) pure @trusted
4470 {
4471     version(LDC)
4472     {
4473         static if (__VERSION__ >= 2085) // saturation x86 intrinsics disappeared in LLVM 8
4474         {
4475             // x86: Generates PSUBUSB since LDC 1.15 -O0
4476             // ARM: Generates uqsub.16b since LDC 1.21 -O0
4477             enum prefix = `declare <16 x i8> @llvm.usub.sat.v16i8(<16 x i8> %a, <16 x i8> %b)`;
4478             enum ir = `
4479                 %r = call <16 x i8> @llvm.usub.sat.v16i8( <16 x i8> %0, <16 x i8> %1)
4480                 ret <16 x i8> %r`;
4481             return cast(__m128i) LDCInlineIREx!(prefix, ir, "", byte16, byte16, byte16)(cast(byte16)a, cast(byte16)b);
4482         }
4483         else static if (LDC_with_ARM) // Raspberry ships with LDC 1.12, no saturation
4484         {
            ubyte[16] res;
            byte16 sa = cast(byte16)a;
            byte16 sb = cast(byte16)b;
            foreach(i; 0..16)
                res[i] = saturateSignedWordToUnsignedByte(cast(ubyte)(sa.array[i]) - cast(ubyte)(sb.array[i]));
            return _mm_loadu_si128(cast(int4*)res.ptr);
4495         }
4496         else static if (LDC_with_SSE2)
4497         {
            return cast(__m128i) __builtin_ia32_psubusb128(cast(byte16)a, cast(byte16)b);
4499         }
4500         else 
4501             static assert(false);
4502     }
4503     else static if (GDC_with_SSE2)
4504     {
4505         return cast(__m128i) __builtin_ia32_psubusb128(cast(ubyte16) a, cast(ubyte16) b);
4506     }
4507     else
4508     {
4509         ubyte[16] res;
4510         byte16 sa = cast(byte16)a;
4511         byte16 sb = cast(byte16)b;
4512         foreach(i; 0..16)
4513             res[i] = saturateSignedWordToUnsignedByte(cast(ubyte)(sa.array[i]) - cast(ubyte)(sb.array[i]));
4514         return _mm_loadu_si128(cast(int4*)res.ptr);
4515     }
4516 }
4517 unittest
4518 {
4519     byte16 res = cast(byte16) _mm_subs_epu8(_mm_setr_epi8(cast(byte)254, 127, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0),
4520                                             _mm_setr_epi8(cast(byte)255, 120, 14, 42, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0));
4521     static immutable byte[16] correctResult =            [            0,   7,  0,  0,  0,  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0];
4522     assert(res.array == correctResult);
4523 }
4524 
// Note: the only difference between the ucomi and comi intrinsics is their
//       signalling behaviour with quiet NaNs. Aliasing them is technically
//       incorrect, but wanting to distinguish qNaN from sNaN and treat them
//       differently on purpose seems extremely rare in practice.
4529 alias _mm_ucomieq_sd = _mm_comieq_sd; ///
4530 alias _mm_ucomige_sd = _mm_comige_sd; ///
4531 alias _mm_ucomigt_sd = _mm_comigt_sd; ///
4532 alias _mm_ucomile_sd = _mm_comile_sd; ///
4533 alias _mm_ucomilt_sd = _mm_comilt_sd; ///
4534 alias _mm_ucomineq_sd = _mm_comineq_sd; ///
4535 
4536 /// Return vector of type `__m128d` with undefined elements.
4537 __m128d _mm_undefined_pd() pure @safe
4538 {
4539     __m128d result = void;
4540     return result;
4541 }
4542 
4543 /// Return vector of type `__m128i` with undefined elements.
4544 __m128i _mm_undefined_si128() pure @safe
4545 {
4546     __m128i result = void;
4547     return result;
4548 }
4549 
4550 /// Unpack and interleave 16-bit integers from the high half of `a` and `b`.
4551 __m128i _mm_unpackhi_epi16 (__m128i a, __m128i b) pure @safe
4552 {
4553     static if (GDC_with_SSE2)
4554     {
4555         return cast(__m128i) __builtin_ia32_punpckhwd128(cast(short8) a, cast(short8) b);
4556     }
4557     else static if (DMD_with_32bit_asm)
4558     {
4559         asm pure nothrow @nogc @trusted
4560         {
4561             movdqu XMM0, a;
4562             movdqu XMM1, b;
4563             punpckhwd XMM0, XMM1;
4564             movdqu a, XMM0;
4565         }
4566         return a;
4567     }
4568     else
4569     {
4570         return cast(__m128i) shufflevector!(short8, 4, 12, 5, 13, 6, 14, 7, 15)
4571                                            (cast(short8)a, cast(short8)b);
4572     }
4573 }
4574 unittest
4575 {
4576     __m128i A = _mm_setr_epi16(4,   5,  6,  7,  8,  9, 10, 11);
4577     __m128i B = _mm_setr_epi16(12, 13, 14, 15, 16, 17, 18, 19);
4578     short8 C = cast(short8)(_mm_unpackhi_epi16(A, B));
4579     short[8] correct = [8, 16, 9, 17, 10, 18, 11, 19];
4580     assert(C.array == correct);
4581 }
4582 
4583 /// Unpack and interleave 32-bit integers from the high half of `a` and `b`.
4584 __m128i _mm_unpackhi_epi32 (__m128i a, __m128i b) pure @trusted
4585 {
4586     static if (GDC_with_SSE2)
4587     {
4588         return __builtin_ia32_punpckhdq128(a, b);
4589     }
4590     else version(DigitalMars)
4591     {
4592         __m128i r;
4593         r.ptr[0] = a.array[2];
4594         r.ptr[1] = b.array[2];
4595         r.ptr[2] = a.array[3];
4596         r.ptr[3] = b.array[3];
4597         return r;
4598     }
4599     else
4600     {
4601         return shufflevector!(int4, 2, 6, 3, 7)(cast(int4)a, cast(int4)b);
4602     }
4603 }
4604 unittest
4605 {
4606     __m128i A = _mm_setr_epi32(1, 2, 3, 4);
4607     __m128i B = _mm_setr_epi32(5, 6, 7, 8);
4608     __m128i C = _mm_unpackhi_epi32(A, B);
4609     int[4] correct = [3, 7, 4, 8];
4610     assert(C.array == correct);
4611 }
4612 
4613 /// Unpack and interleave 64-bit integers from the high half of `a` and `b`.
4614 __m128i _mm_unpackhi_epi64 (__m128i a, __m128i b) pure @trusted
4615 {
4616     static if (GDC_with_SSE2)
4617     {
4618         return cast(__m128i) __builtin_ia32_punpckhqdq128(cast(long2) a, cast(long2) b);
4619     }
4620     else
4621     {
4622         __m128i r = cast(__m128i)b;
4623         r[0] = a[2];
4624         r[1] = a[3];
4625         return r; 
4626     }
4627 }
4628 unittest // Issue #36
4629 {
4630     __m128i A = _mm_setr_epi64(0x22222222_22222222, 0x33333333_33333333);
4631     __m128i B = _mm_setr_epi64(0x44444444_44444444, 0x55555555_55555555);
4632     long2 C = cast(long2)(_mm_unpackhi_epi64(A, B));
4633     long[2] correct = [0x33333333_33333333, 0x55555555_55555555];
4634     assert(C.array == correct);
4635 }
4636 
4637 /// Unpack and interleave 8-bit integers from the high half of `a` and `b`.
4638 __m128i _mm_unpackhi_epi8 (__m128i a, __m128i b) pure @safe
4639 {
4640     static if (GDC_with_SSE2)
4641     {
4642         return cast(__m128i) __builtin_ia32_punpckhbw128(cast(ubyte16)a, cast(ubyte16)b);
4643     }
4644     else static if (DMD_with_32bit_asm)
4645     {
4646         asm pure nothrow @nogc @trusted
4647         {
4648             movdqu XMM0, a;
4649             movdqu XMM1, b;
4650             punpckhbw XMM0, XMM1;
4651             movdqu a, XMM0;
4652         }
4653         return a;
4654     }
4655     else
4656     {
4657         return cast(__m128i)shufflevector!(byte16, 8,  24,  9, 25, 10, 26, 11, 27,
4658                                                    12, 28, 13, 29, 14, 30, 15, 31)
4659                                                    (cast(byte16)a, cast(byte16)b);
4660     }
4661 }
4662 unittest
4663 {
4664     __m128i A = _mm_setr_epi8( 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15);
4665     __m128i B = _mm_setr_epi8(16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31);
4666     byte16 C = cast(byte16) _mm_unpackhi_epi8(A, B);
4667     byte[16] correct = [8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31];
4668     assert(C.array == correct);
4669 }
4670 
4671 /// Unpack and interleave double-precision (64-bit) floating-point elements from the high half of `a` and `b`.
4672 __m128d _mm_unpackhi_pd (__m128d a, __m128d b) pure @safe
4673 {
4674     static if (GDC_with_SSE2)
4675     {
4676         return __builtin_ia32_unpckhpd(a, b);
4677     }
4678     else
4679     {
4680         return shufflevector!(__m128d, 1, 3)(a, b);
4681     }
4682 }
4683 unittest
4684 {
4685     __m128d A = _mm_setr_pd(4.0, 6.0);
4686     __m128d B = _mm_setr_pd(7.0, 9.0);
4687     __m128d C = _mm_unpackhi_pd(A, B);
4688     double[2] correct = [6.0, 9.0];
4689     assert(C.array == correct);
4690 }
4691 
4692 /// Unpack and interleave 16-bit integers from the low half of `a` and `b`.
4693 __m128i _mm_unpacklo_epi16 (__m128i a, __m128i b) pure @safe
4694 {
4695     static if (GDC_with_SSE2)
4696     {
4697         return cast(__m128i) __builtin_ia32_punpcklwd128(cast(short8) a, cast(short8) b);
4698     }
4699     else static if (DMD_with_32bit_asm)
4700     {
4701         asm pure nothrow @nogc @trusted
4702         {
4703             movdqu XMM0, a;
4704             movdqu XMM1, b;
4705             punpcklwd XMM0, XMM1;
4706             movdqu a, XMM0;
4707         }
4708         return a;
4709     }
4710     else
4711     {
4712         return cast(__m128i) shufflevector!(short8, 0, 8, 1, 9, 2, 10, 3, 11)
4713                                            (cast(short8)a, cast(short8)b);
4714     }
4715 }
4716 unittest
4717 {
4718     __m128i A = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
4719     __m128i B = _mm_setr_epi16(8, 9, 10, 11, 12, 13, 14, 15);
4720     short8 C = cast(short8) _mm_unpacklo_epi16(A, B);
4721     short[8] correct = [0, 8, 1, 9, 2, 10, 3, 11];
4722     assert(C.array == correct);
4723 }
4724 
4725 /// Unpack and interleave 32-bit integers from the low half of `a` and `b`.
4726 __m128i _mm_unpacklo_epi32 (__m128i a, __m128i b) pure @trusted
4727 {
4728     static if (GDC_with_SSE2)
4729     {
4730         return __builtin_ia32_punpckldq128(a, b);
4731     }
    else version(DigitalMars)
    {
        // DMD: assemble the interleaved result element by element,
        // rather than going through the generic shufflevector path.
        __m128i r;
        r.ptr[0] = a.array[0];
        r.ptr[1] = b.array[0];
        r.ptr[2] = a.array[1];
        r.ptr[3] = b.array[1];
        return r;
    }
4741     else
4742     {
4743         return shufflevector!(int4, 0, 4, 1, 5)(cast(int4)a, cast(int4)b);
4744     }
4745 }
4746 unittest
4747 {
4748     __m128i A = _mm_setr_epi32(1, 2, 3, 4);
4749     __m128i B = _mm_setr_epi32(5, 6, 7, 8);
4750     __m128i C = _mm_unpacklo_epi32(A, B);
4751     int[4] correct = [1, 5, 2, 6];
4752     assert(C.array == correct);
4753 }
4754 
4755 /// Unpack and interleave 64-bit integers from the low half of `a` and `b`.
4756 __m128i _mm_unpacklo_epi64 (__m128i a, __m128i b) pure @trusted
4757 {
4758     static if (GDC_with_SSE2)
4759     {
4760         return cast(__m128i) __builtin_ia32_punpcklqdq128(cast(long2) a, cast(long2) b);
4761     }
4762     else
4763     {
4764         long2 lA = cast(long2)a;
4765         long2 lB = cast(long2)b;
4766         long2 R;
4767         R.ptr[0] = lA.array[0];
4768         R.ptr[1] = lB.array[0];
4769         return cast(__m128i)R;
4770     }
4771 }
4772 unittest // Issue #36
4773 {
4774     __m128i A = _mm_setr_epi64(0x22222222_22222222, 0x33333333_33333333);
4775     __m128i B = _mm_setr_epi64(0x44444444_44444444, 0x55555555_55555555);
4776     long2 C = cast(long2)(_mm_unpacklo_epi64(A, B));
4777     long[2] correct = [0x22222222_22222222, 0x44444444_44444444];
4778     assert(C.array == correct);
4779 }
4780 
4781 /// Unpack and interleave 8-bit integers from the low half of `a` and `b`.
4782 __m128i _mm_unpacklo_epi8 (__m128i a, __m128i b) pure @safe
4783 {
4784     static if (GDC_with_SSE2)
4785     {
4786         return cast(__m128i) __builtin_ia32_punpcklbw128(cast(ubyte16) a, cast(ubyte16) b);
4787     }
4788     else static if (DMD_with_32bit_asm)
4789     {
4790         asm pure nothrow @nogc @trusted
4791         {
4792             movdqu XMM0, a;
4793             movdqu XMM1, b;
4794             punpcklbw XMM0, XMM1;
4795             movdqu a, XMM0;
4796         }
4797         return a;
4798     }
4799     else
4800     {
4801         return cast(__m128i) shufflevector!(byte16, 0, 16, 1, 17, 2, 18, 3, 19,
4802                                                     4, 20, 5, 21, 6, 22, 7, 23)
4803                                            (cast(byte16)a, cast(byte16)b);
4804     }
4805 }
4806 unittest
4807 {
4808     __m128i A = _mm_setr_epi8( 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15);
4809     __m128i B = _mm_setr_epi8(16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31);
4810     byte16 C = cast(byte16) _mm_unpacklo_epi8(A, B);
4811     byte[16] correct = [0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23];
4812     assert(C.array == correct);
4813 }
4814 
4815 /// Unpack and interleave double-precision (64-bit) floating-point elements from the low half of `a` and `b`.
4816 __m128d _mm_unpacklo_pd (__m128d a, __m128d b) pure @safe
4817 {
4818     static if (GDC_with_SSE2)
4819     {
4820         return __builtin_ia32_unpcklpd(a, b);
4821     }
4822     else
4823     {
4824         return shufflevector!(__m128d, 0, 2)(a, b);
4825     }
4826 }
4827 unittest
4828 {
4829     __m128d A = _mm_setr_pd(4.0, 6.0);
4830     __m128d B = _mm_setr_pd(7.0, 9.0);
4831     __m128d C = _mm_unpacklo_pd(A, B);
4832     double[2] correct = [4.0, 7.0];
4833     assert(C.array == correct);
4834 }
4835 
4836 /// Compute the bitwise XOR of packed double-precision (64-bit) floating-point elements in `a` and `b`.
4837 __m128d _mm_xor_pd (__m128d a, __m128d b) pure @safe
4838 {
4839     return cast(__m128d)(cast(__m128i)a ^ cast(__m128i)b);
4840 }
4841 
4842 /// Compute the bitwise XOR of 128 bits (representing integer data) in `a` and `b`.
4843 __m128i _mm_xor_si128 (__m128i a, __m128i b) pure @safe
4844 {
4845     return a ^ b;
4846 }
4847 
unittest // SSE1 and SSE2 intrinsics composing: Euclidean distance between two 4D points
{
    float distance(float[4] a, float[4] b) nothrow @nogc
    {
        __m128 va = _mm_loadu_ps(a.ptr);
        __m128 vb = _mm_loadu_ps(b.ptr);
        __m128 diffSquared = _mm_sub_ps(va, vb);
        diffSquared = _mm_mul_ps(diffSquared, diffSquared);
        // Horizontal add: fold the upper half onto the lower half,
        // then lane 1 onto lane 0; the total ends up in lane 0.
        __m128 sum = _mm_add_ps(diffSquared, _mm_srli_ps!8(diffSquared));
        sum = _mm_add_ps(sum, _mm_srli_ps!4(sum));
        return _mm_cvtss_f32(_mm_sqrt_ss(sum));
    }
    assert(distance([0, 2, 0, 0], [0, 0, 0, 0]) == 2);
}