/**
* SSE2 intrinsics.
*
* Copyright: Copyright Guillaume Piolat 2016-2020, Stefanos Baziotis 2019.
* License:   $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
*/
module inteli.emmintrin;

public import inteli.types;
public import inteli.xmmintrin; // SSE2 includes SSE1
import inteli.mmx;
import inteli.internals;

nothrow @nogc:


// SSE2 instructions
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#techs=SSE2

/// Add packed 16-bit integers in `a` and `b`.
__m128i _mm_add_epi16 (__m128i a, __m128i b) pure @safe
{
    return cast(__m128i)(cast(short8)a + cast(short8)b);
}
unittest
{
    __m128i A = _mm_setr_epi16(4, 8, 13, -7, -1, 0, 9, 77);
    short8 R = cast(short8) _mm_add_epi16(A, A);
    short[8] correct = [8, 16, 26, -14, -2, 0, 18, 154];
    assert(R.array == correct);
}

/// Add packed 32-bit integers in `a` and `b`.
__m128i _mm_add_epi32 (__m128i a, __m128i b) pure @safe
{
    return cast(__m128i)(cast(int4)a + cast(int4)b);
}
unittest
{
    __m128i A = _mm_setr_epi32( -7, -1, 0, 9);
    int4 R = _mm_add_epi32(A, A);
    int[4] correct = [ -14, -2, 0, 18 ];
    assert(R.array == correct);
}

/// Add packed 64-bit integers in `a` and `b`.
__m128i _mm_add_epi64 (__m128i a, __m128i b) pure @safe
{
    return cast(__m128i)(cast(long2)a + cast(long2)b);
}
unittest
{
    __m128i A = _mm_setr_epi64(-1, 0x8000_0000_0000_0000);
    long2 R = cast(long2) _mm_add_epi64(A, A);
    long[2] correct = [ -2, 0 ];
    assert(R.array == correct);
}

/// Add packed 8-bit integers in `a` and `b`.
__m128i _mm_add_epi8 (__m128i a, __m128i b) pure @safe
{
    return cast(__m128i)(cast(byte16)a + cast(byte16)b);
}
unittest
{
    __m128i A = _mm_setr_epi8(4, 8, 13, -7, -1, 0, 9, 77, 4, 8, 13, -7, -1, 0, 9, 78);
    byte16 R = cast(byte16) _mm_add_epi8(A, A);
    byte[16] correct = [8, 16, 26, -14, -2, 0, 18, -102, 8, 16, 26, -14, -2, 0, 18, -100];
    assert(R.array == correct);
}

/// Add the lower double-precision (64-bit) floating-point element
/// in `a` and `b`, store the result in the lower element of result,
/// and copy the upper element from `a` to the upper element of result.
__m128d _mm_add_sd(__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_addsd(a, b);
    }
    else version(DigitalMars)
    {
        // Work-around for https://issues.dlang.org/show_bug.cgi?id=19599
        // This work-around is probably unneeded since DMD >= 2.094.0, but it hasn't been re-verified.
        asm pure nothrow @nogc @trusted { nop;}
        a[0] = a[0] + b[0];
        return a;
    }
    else
    {
        a[0] += b[0];
        return a;
    }
}
unittest
{
    __m128d a = [1.5, -2.0];
    a = _mm_add_sd(a, a);
    assert(a.array == [3.0, -2.0]);
}

/// Add packed double-precision (64-bit) floating-point elements in `a` and `b`.
__m128d _mm_add_pd (__m128d a, __m128d b) pure @safe
{
    return a + b;
}
unittest
{
    __m128d a = [1.5, -2.0];
    a = _mm_add_pd(a, a);
    assert(a.array == [3.0, -4.0]);
}

/// Add 64-bit integers `a` and `b`.
__m64 _mm_add_si64 (__m64 a, __m64 b) pure @safe
{
    return a + b;
}
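
// Usage sketch added for illustration (not in the original tests); it builds
// the `__m64` operands with `to_m64`, as other tests in this module do.
unittest
{
    __m64 A = to_m64(_mm_cvtsi64_si128(1590));
    __m64 B = to_m64(_mm_cvtsi64_si128(-10));
    __m64 R = _mm_add_si64(A, B);
    assert(R.array[0] == 1580);
}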

/// Add packed 16-bit integers in `a` and `b` using signed saturation.
__m128i _mm_adds_epi16(__m128i a, __m128i b) pure @trusted
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_paddsw128(a, b);
    }
    else version(LDC)
    {
        static if (__VERSION__ >= 2085) // saturation x86 intrinsics disappeared in LLVM 8
        {
            // x86: Generates PADDSW since LDC 1.15 -O0
            // ARM: Generates sqadd.8h since LDC 1.21 -O1; poor codegen in <= 1.20
            enum prefix = `declare <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> %a, <8 x i16> %b)`;
            enum ir = `
                %r = call <8 x i16> @llvm.sadd.sat.v8i16( <8 x i16> %0, <8 x i16> %1)
                ret <8 x i16> %r`;
            return cast(__m128i) LDCInlineIREx!(prefix, ir, "", short8, short8, short8)(cast(short8)a, cast(short8)b);
        }
        else static if (LDC_with_ARM) // Raspberry Pi OS ships with LDC 1.12, which lacks the saturation intrinsics
        {
            short[8] res;
            short8 sa = cast(short8)a;
            short8 sb = cast(short8)b;
            foreach(i; 0..8)
                res[i] = saturateSignedIntToSignedShort(sa.array[i] + sb.array[i]);
            return _mm_loadu_si128(cast(int4*)res.ptr);
        }
        else
            return __builtin_ia32_paddsw128(a, b);
    }
    else
    {
        short[8] res;
        short8 sa = cast(short8)a;
        short8 sb = cast(short8)b;
        foreach(i; 0..8)
            res[i] = saturateSignedIntToSignedShort(sa.array[i] + sb.array[i]);
        return _mm_loadu_si128(cast(int4*)res.ptr);
    }
}
unittest
{
    short8 res = cast(short8) _mm_adds_epi16(_mm_set_epi16(7, 6, 5, 4, 3, 2, 1, 0),
                                             _mm_set_epi16(7, 6, 5, 4, 3, 2, 1, 0));
    static immutable short[8] correctResult = [0, 2, 4, 6, 8, 10, 12, 14];
    assert(res.array == correctResult);
}

/// Add packed 8-bit signed integers in `a` and `b` using signed saturation.
__m128i _mm_adds_epi8(__m128i a, __m128i b) pure @trusted
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_paddsb128(a, b);
    }
    else version(LDC)
    {
        static if (__VERSION__ >= 2085) // saturation x86 intrinsics disappeared in LLVM 8
        {
            // x86: Generates PADDSB since LDC 1.15 -O0
            // ARM: Generates sqadd.16b since LDC 1.21 -O1; poor codegen in <= 1.20
            enum prefix = `declare <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8> %a, <16 x i8> %b)`;
            enum ir = `
                %r = call <16 x i8> @llvm.sadd.sat.v16i8( <16 x i8> %0, <16 x i8> %1)
                ret <16 x i8> %r`;
            return cast(__m128i) LDCInlineIREx!(prefix, ir, "", byte16, byte16, byte16)(cast(byte16)a, cast(byte16)b);
        }
        else static if (LDC_with_ARM) // Raspberry Pi OS ships with LDC 1.12, which lacks the saturation intrinsics
        {
            byte[16] res;
            byte16 sa = cast(byte16)a;
            byte16 sb = cast(byte16)b;
            foreach(i; 0..16)
                res[i] = saturateSignedWordToSignedByte(sa[i] + sb[i]);
            return _mm_loadu_si128(cast(int4*)res.ptr);
        }
        else
            return __builtin_ia32_paddsb128(a, b);
    }
    else
    {
        byte[16] res;
        byte16 sa = cast(byte16)a;
        byte16 sb = cast(byte16)b;
        foreach(i; 0..16)
            res[i] = saturateSignedWordToSignedByte(sa[i] + sb[i]);
        return _mm_loadu_si128(cast(int4*)res.ptr);
    }
}
unittest
{
    byte16 res = cast(byte16) _mm_adds_epi8(_mm_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0),
                                            _mm_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0));
    static immutable byte[16] correctResult = [0, 2, 4, 6, 8, 10, 12, 14,
                                               16, 18, 20, 22, 24, 26, 28, 30];
    assert(res.array == correctResult);
}

/// Add packed 8-bit unsigned integers in `a` and `b` using unsigned saturation.
// PERF: #GDC version?
__m128i _mm_adds_epu8(__m128i a, __m128i b) pure @trusted
{
    version(LDC)
    {
        static if (__VERSION__ >= 2085) // saturation x86 intrinsics disappeared in LLVM 8
        {
            // x86: Generates PADDUSB since LDC 1.15 -O0
            // ARM: Generates uqadd.16b since LDC 1.21 -O1
            enum prefix = `declare <16 x i8> @llvm.uadd.sat.v16i8(<16 x i8> %a, <16 x i8> %b)`;
            enum ir = `
                %r = call <16 x i8> @llvm.uadd.sat.v16i8( <16 x i8> %0, <16 x i8> %1)
                ret <16 x i8> %r`;
            return cast(__m128i) LDCInlineIREx!(prefix, ir, "", byte16, byte16, byte16)(cast(byte16)a, cast(byte16)b);
        }
        else static if (LDC_with_ARM) // Raspberry Pi OS ships with LDC 1.12, which lacks the saturation intrinsics
        {
            ubyte[16] res;
            byte16 sa = cast(byte16)a;
            byte16 sb = cast(byte16)b;
            foreach(i; 0..16)
                res[i] = saturateSignedWordToUnsignedByte(cast(ubyte)(sa.array[i]) + cast(ubyte)(sb.array[i]));
            return _mm_loadu_si128(cast(int4*)res.ptr);
        }
        else
            return __builtin_ia32_paddusb128(a, b);
    }
    else
    {
        ubyte[16] res;
        byte16 sa = cast(byte16)a;
        byte16 sb = cast(byte16)b;
        foreach(i; 0..16)
            res[i] = saturateSignedWordToUnsignedByte(cast(ubyte)(sa.array[i]) + cast(ubyte)(sb.array[i]));
        return _mm_loadu_si128(cast(int4*)res.ptr);
    }
}
unittest
{
    byte16 res = cast(byte16)
        _mm_adds_epu8(_mm_set_epi8(7, 6, 5, 4, 3, 2, cast(byte)255, 0, 7, 6, 5, 4, 3, 2, cast(byte)255, 0),
                      _mm_set_epi8(7, 6, 5, 4, 3, 2, 1, 0, 7, 6, 5, 4, 3, 2, 1, 0));
    static immutable byte[16] correctResult = [0, cast(byte)255, 4, 6, 8, 10, 12, 14,
                                               0, cast(byte)255, 4, 6, 8, 10, 12, 14];
    assert(res.array == correctResult);
}

/// Add packed unsigned 16-bit integers in `a` and `b` using unsigned saturation.
// PERF: #GDC version?
__m128i _mm_adds_epu16(__m128i a, __m128i b) pure @trusted
{
    version(LDC)
    {
        static if (__VERSION__ >= 2085) // saturation x86 intrinsics disappeared in LLVM 8
        {
            // x86: Generates PADDUSW since LDC 1.15 -O0
            // ARM: Generates uqadd.8h since LDC 1.21 -O1
            enum prefix = `declare <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> %a, <8 x i16> %b)`;
            enum ir = `
                %r = call <8 x i16> @llvm.uadd.sat.v8i16( <8 x i16> %0, <8 x i16> %1)
                ret <8 x i16> %r`;
            return cast(__m128i) LDCInlineIREx!(prefix, ir, "", short8, short8, short8)(cast(short8)a, cast(short8)b);
        }
        else static if (LDC_with_ARM) // Raspberry Pi OS ships with LDC 1.12, which lacks the saturation intrinsics
        {
            ushort[8] res;
            short8 sa = cast(short8)a;
            short8 sb = cast(short8)b;
            foreach(i; 0..8)
                res[i] = saturateSignedIntToUnsignedShort(cast(ushort)(sa.array[i]) + cast(ushort)(sb.array[i]));
            return _mm_loadu_si128(cast(int4*)res.ptr);
        }
        else
            return __builtin_ia32_paddusw128(a, b);
    }
    else
    {
        ushort[8] res;
        short8 sa = cast(short8)a;
        short8 sb = cast(short8)b;
        foreach(i; 0..8)
            res[i] = saturateSignedIntToUnsignedShort(cast(ushort)(sa.array[i]) + cast(ushort)(sb.array[i]));
        return _mm_loadu_si128(cast(int4*)res.ptr);
    }
}
unittest
{
    short8 res = cast(short8) _mm_adds_epu16(_mm_set_epi16(3, 2, cast(short)65535, 0, 3, 2, cast(short)65535, 0),
                                             _mm_set_epi16(3, 2, 1, 0, 3, 2, 1, 0));
    static immutable short[8] correctResult = [0, cast(short)65535, 4, 6, 0, cast(short)65535, 4, 6];
    assert(res.array == correctResult);
}

/// Compute the bitwise AND of packed double-precision (64-bit)
/// floating-point elements in `a` and `b`.
__m128d _mm_and_pd (__m128d a, __m128d b) pure @safe
{
    return cast(__m128d)( cast(long2)a & cast(long2)b );
}
unittest
{
    double a = 4.32;
    double b = -78.99;
    long correct = (*cast(long*)(&a)) & (*cast(long*)(&b));
    __m128d A = _mm_set_pd(a, b);
    __m128d B = _mm_set_pd(b, a);
    long2 R = cast(long2)( _mm_and_pd(A, B) );
    assert(R.array[0] == correct);
    assert(R.array[1] == correct);
}

/// Compute the bitwise AND of 128 bits (representing integer data) in `a` and `b`.
__m128i _mm_and_si128 (__m128i a, __m128i b) pure @safe
{
    return a & b;
}
unittest
{
    __m128i A = _mm_set1_epi32(7);
    __m128i B = _mm_set1_epi32(14);
    __m128i R = _mm_and_si128(A, B);
    int[4] correct = [6, 6, 6, 6];
    assert(R.array == correct);
}

/// Compute the bitwise NOT of packed double-precision (64-bit)
/// floating-point elements in `a` and then AND with `b`.
__m128d _mm_andnot_pd (__m128d a, __m128d b) pure @safe
{
    return cast(__m128d)( ~(cast(long2)a) & cast(long2)b);
}
unittest
{
    double a = 4.32;
    double b = -78.99;
    long correct  = (~*cast(long*)(&a)) & ( *cast(long*)(&b));
    long correct2 = ( *cast(long*)(&a)) & (~*cast(long*)(&b));
    __m128d A = _mm_setr_pd(a, b);
    __m128d B = _mm_setr_pd(b, a);
    long2 R = cast(long2)( _mm_andnot_pd(A, B) );
    assert(R.array[0] == correct);
    assert(R.array[1] == correct2);
}

/// Compute the bitwise NOT of 128 bits (representing integer data)
/// in `a` and then AND with `b`.
__m128i _mm_andnot_si128 (__m128i a, __m128i b) pure @safe
{
    return (~a) & b;
}
unittest
{
    __m128i A = _mm_set1_epi32(7);
    __m128i B = _mm_set1_epi32(14);
    __m128i R = _mm_andnot_si128(A, B);
    int[4] correct = [8, 8, 8, 8];
    assert(R.array == correct);
}

/// Average packed unsigned 16-bit integers in `a` and `b`.
__m128i _mm_avg_epu16 (__m128i a, __m128i b) pure @trusted
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_pavgw128(a, b);
    }
    else static if (LDC_with_ARM64)
    {
        return cast(__m128i) vrhadd_u16(cast(short8)a, cast(short8)b);
    }
    else version(LDC)
    {
        // Generates pavgw even in LDC 1.0, even in -O0, but not on ARM.
        enum ir = `
            %ia = zext <8 x i16> %0 to <8 x i32>
            %ib = zext <8 x i16> %1 to <8 x i32>
            %isum = add <8 x i32> %ia, %ib
            %isum1 = add <8 x i32> %isum, < i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
            %isums = lshr <8 x i32> %isum1, < i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
            %r = trunc <8 x i32> %isums to <8 x i16>
            ret <8 x i16> %r`;
        return cast(__m128i) LDCInlineIR!(ir, short8, short8, short8)(cast(short8)a, cast(short8)b);
    }
    else
    {
        short8 sa = cast(short8)a;
        short8 sb = cast(short8)b;
        short8 sr = void;
        foreach(i; 0..8)
        {
            sr.ptr[i] = cast(ushort)( (cast(ushort)(sa.array[i]) + cast(ushort)(sb.array[i]) + 1) >> 1 );
        }
        return cast(int4)sr;
    }
}
unittest
{
    __m128i A = _mm_set1_epi16(31);
    __m128i B = _mm_set1_epi16(64);
    short8 avg = cast(short8)(_mm_avg_epu16(A, B));
    foreach(i; 0..8)
        assert(avg.array[i] == 48);
}

/// Average packed unsigned 8-bit integers in `a` and `b`.
__m128i _mm_avg_epu8 (__m128i a, __m128i b) pure @trusted
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_pavgb128(a, b);
    }
    else static if (LDC_with_ARM64)
    {
        return cast(__m128i) vrhadd_u8(cast(byte16)a, cast(byte16)b);
    }
    else version(LDC)
    {
        // Generates pavgb even in LDC 1.0, even in -O0, but not on ARM.
        enum ir = `
            %ia = zext <16 x i8> %0 to <16 x i16>
            %ib = zext <16 x i8> %1 to <16 x i16>
            %isum = add <16 x i16> %ia, %ib
            %isum1 = add <16 x i16> %isum, < i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
            %isums = lshr <16 x i16> %isum1, < i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
            %r = trunc <16 x i16> %isums to <16 x i8>
            ret <16 x i8> %r`;
        return cast(__m128i) LDCInlineIR!(ir, byte16, byte16, byte16)(cast(byte16)a, cast(byte16)b);
    }
    else
    {
        byte16 sa = cast(byte16)a;
        byte16 sb = cast(byte16)b;
        byte16 sr = void;
        foreach(i; 0..16)
        {
            sr[i] = cast(ubyte)( (cast(ubyte)(sa[i]) + cast(ubyte)(sb[i]) + 1) >> 1 );
        }
        return cast(int4)sr;
    }
}
unittest
{
    __m128i A = _mm_set1_epi8(31);
    __m128i B = _mm_set1_epi8(64);
    byte16 avg = cast(byte16)(_mm_avg_epu8(A, B));
    foreach(i; 0..16)
        assert(avg.array[i] == 48);
}

/// Shift `a` left by `bytes` bytes while shifting in zeros.
alias _mm_bslli_si128 = _mm_slli_si128;
unittest
{
    __m128i toShift = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
    byte[16] exact =               [0, 0, 0, 0, 0, 0, 1, 2, 3, 4,  5,  6,  7,  8,  9, 10];
    __m128i result = _mm_bslli_si128!5(toShift);
    assert( (cast(byte16)result).array == exact);
}

/// Shift `v` right by `bytes` bytes while shifting in zeros.
alias _mm_bsrli_si128 = _mm_srli_si128;
unittest
{
    __m128i toShift = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
    byte[16] exact =               [5, 6, 7, 8, 9,10,11,12,13,14, 15,  0,  0,  0,  0,  0];
    __m128i result = _mm_bsrli_si128!5(toShift);
    assert( (cast(byte16)result).array == exact);
}

/// Cast vector of type `__m128d` to type `__m128`.
/// Note: Also possible with a regular `cast(__m128)(a)`.
__m128 _mm_castpd_ps (__m128d a) pure @safe
{
    return cast(__m128)a;
}

/// Cast vector of type `__m128d` to type `__m128i`.
/// Note: Also possible with a regular `cast(__m128i)(a)`.
__m128i _mm_castpd_si128 (__m128d a) pure @safe
{
    return cast(__m128i)a;
}

/// Cast vector of type `__m128` to type `__m128d`.
/// Note: Also possible with a regular `cast(__m128d)(a)`.
__m128d _mm_castps_pd (__m128 a) pure @safe
{
    return cast(__m128d)a;
}

/// Cast vector of type `__m128` to type `__m128i`.
/// Note: Also possible with a regular `cast(__m128i)(a)`.
__m128i _mm_castps_si128 (__m128 a) pure @safe
{
    return cast(__m128i)a;
}

/// Cast vector of type `__m128i` to type `__m128d`.
/// Note: Also possible with a regular `cast(__m128d)(a)`.
__m128d _mm_castsi128_pd (__m128i a) pure @safe
{
    return cast(__m128d)a;
}

/// Cast vector of type `__m128i` to type `__m128`.
/// Note: Also possible with a regular `cast(__m128)(a)`.
__m128 _mm_castsi128_ps (__m128i a) pure @safe
{
    return cast(__m128)a;
}
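
// Illustrative sketch added here (not in the original tests): these casts
// reinterpret bits and never convert values.
unittest
{
    __m128i A = _mm_setr_epi32(0, 0, 0, 0x40100000); // bit pattern of 4.0 in the upper lane
    __m128d D = _mm_castsi128_pd(A);
    assert(D.array[1] == 4.0);
    __m128i back = _mm_castpd_si128(D); // round-trips exactly
    assert(back.array == A.array);
}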

/// Invalidate and flush the cache line that contains `p`
/// from all levels of the cache hierarchy.
void _mm_clflush (const(void)* p) @trusted
{
    static if (GDC_with_SSE2)
    {
        __builtin_ia32_clflush(p);
    }
    else static if (LDC_with_SSE2)
    {
        __builtin_ia32_clflush(cast(void*)p);
    }
    else version(D_InlineAsm_X86)
    {
        asm pure nothrow @nogc @safe
        {
            mov EAX, p;
            clflush [EAX];
        }
    }
    else version(D_InlineAsm_X86_64)
    {
        asm pure nothrow @nogc @safe
        {
            mov RAX, p;
            clflush [RAX];
        }
    }
    else
    {
        // Do nothing. Skipping the cache-line flush does not affect correctness.
    }
}
unittest
{
    ubyte[64] cacheline;
    _mm_clflush(cacheline.ptr);
}

/// Compare packed 16-bit integers in `a` and `b` for equality.
__m128i _mm_cmpeq_epi16 (__m128i a, __m128i b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_pcmpeqw128(a, b);
    }
    else
    {
        return cast(__m128i) equalMask!short8(cast(short8)a, cast(short8)b);
    }
}
unittest
{
    short8   A = [-3, -2, -1,  0,  0,  1,  2,  3];
    short8   B = [ 4,  3,  2,  1,  0, -1, -2, -3];
    short[8] E = [ 0,  0,  0,  0, -1,  0,  0,  0];
    short8   R = cast(short8)(_mm_cmpeq_epi16(cast(__m128i)A, cast(__m128i)B));
    assert(R.array == E);
}

/// Compare packed 32-bit integers in `a` and `b` for equality.
__m128i _mm_cmpeq_epi32 (__m128i a, __m128i b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_pcmpeqd128(a, b);
    }
    else
    {
        return equalMask!__m128i(a, b);
    }
}
unittest
{
    int4   A = [-3, -2, -1,  0];
    int4   B = [ 4, -2,  2,  0];
    int[4] E = [ 0, -1,  0, -1];
    int4   R = cast(int4)(_mm_cmpeq_epi32(A, B));
    assert(R.array == E);
}

/// Compare packed 8-bit integers in `a` and `b` for equality.
__m128i _mm_cmpeq_epi8 (__m128i a, __m128i b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_pcmpeqb128(a, b);
    }
    else
    {
        return cast(__m128i) equalMask!byte16(cast(byte16)a, cast(byte16)b);
    }
}
unittest
{
    __m128i A = _mm_setr_epi8(1, 2, 3, 1, 2, 1, 1, 2, 3, 2, 1, 0, 0, 1, 2, 1);
    __m128i B = _mm_setr_epi8(2, 2, 1, 2, 3, 1, 2, 3, 2, 1, 0, 0, 1, 2, 1, 1);
    byte16 C = cast(byte16) _mm_cmpeq_epi8(A, B);
    byte[16] correct =       [0,-1, 0, 0, 0,-1, 0, 0, 0, 0, 0,-1, 0, 0, 0, -1];
    assert(C.array == correct);
}

/// Compare packed double-precision (64-bit) floating-point elements
/// in `a` and `b` for equality.
__m128d _mm_cmpeq_pd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmpeqpd(a, b);
    }
    else
    {
        return cast(__m128d) cmppd!(FPComparison.oeq)(a, b);
    }
}
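
// Usage sketch added for illustration (not in the original tests):
// equal lanes yield an all-ones mask, unequal lanes yield zero.
unittest
{
    __m128d A = _mm_setr_pd(1.0, 2.0);
    __m128d B = _mm_setr_pd(1.0, 3.0);
    long2 R = cast(long2) _mm_cmpeq_pd(A, B);
    long[2] correct = [-1, 0];
    assert(R.array == correct);
}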

/// Compare the lower double-precision (64-bit) floating-point elements
/// in `a` and `b` for equality, store the result in the lower element,
/// and copy the upper element from `a`.
__m128d _mm_cmpeq_sd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmpeqsd(a, b);
    }
    else
    {
        return cast(__m128d) cmpsd!(FPComparison.oeq)(a, b);
    }
}
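
// Usage sketch added for illustration (not in the original tests): the mask
// lands in the lower lane, the upper lane is copied from `a`.
unittest
{
    __m128d A = _mm_setr_pd(1.0, 2.0);
    __m128d B = _mm_setr_pd(1.0, 3.0);
    __m128d R = _mm_cmpeq_sd(A, B);
    assert((cast(long2)R).array[0] == -1); // 1.0 == 1.0 => all-ones mask
    assert(R.array[1] == 2.0);             // upper element copied from a
}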

/// Compare packed double-precision (64-bit) floating-point elements
/// in `a` and `b` for greater-than-or-equal.
__m128d _mm_cmpge_pd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmpgepd(a, b);
    }
    else
    {
        return cast(__m128d) cmppd!(FPComparison.oge)(a, b);
    }
}
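
// Illustrative sketch (added): this is an ordered comparison, so a NaN
// operand yields a zero mask.
unittest
{
    __m128d A = _mm_setr_pd(2.0, double.nan);
    __m128d B = _mm_setr_pd(1.0, 1.0);
    long2 R = cast(long2) _mm_cmpge_pd(A, B);
    long[2] correct = [-1, 0];
    assert(R.array == correct);
}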

/// Compare the lower double-precision (64-bit) floating-point elements
/// in `a` and `b` for greater-than-or-equal, store the result in the
/// lower element, and copy the upper element from `a`.
__m128d _mm_cmpge_sd (__m128d a, __m128d b) pure @safe
{
    // Note: There is no __builtin_ia32_cmpgesd builtin.
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmpnltsd(b, a);
    }
    else
    {
        return cast(__m128d) cmpsd!(FPComparison.oge)(a, b);
    }
}

/// Compare packed 16-bit integers in `a` and `b` for greater-than.
__m128i _mm_cmpgt_epi16 (__m128i a, __m128i b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_pcmpgtw128(a, b);
    }
    else
    {
        return cast(__m128i)( greaterMask!short8(cast(short8)a, cast(short8)b));
    }
}
unittest
{
    short8   A = [-3, -2, -1,  0,  0,  1,  2,  3];
    short8   B = [ 4,  3,  2,  1,  0, -1, -2, -3];
    short[8] E = [ 0,  0,  0,  0,  0, -1, -1, -1];
    short8   R = cast(short8)(_mm_cmpgt_epi16(cast(__m128i)A, cast(__m128i)B));
    assert(R.array == E);
}

/// Compare packed 32-bit integers in `a` and `b` for greater-than.
__m128i _mm_cmpgt_epi32 (__m128i a, __m128i b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_pcmpgtd128(a, b);
    }
    else
    {
        return cast(__m128i)( greaterMask!int4(a, b));
    }
}
unittest
{
    int4   A = [-3,  2, -1,  0];
    int4   B = [ 4, -2,  2,  0];
    int[4] E = [ 0, -1,  0,  0];
    int4   R = cast(int4)(_mm_cmpgt_epi32(A, B));
    assert(R.array == E);
}

/// Compare packed 8-bit integers in `a` and `b` for greater-than.
__m128i _mm_cmpgt_epi8 (__m128i a, __m128i b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_pcmpgtb128(a, b);
    }
    else
    {
        return cast(__m128i)( greaterMask!byte16(cast(byte16)a, cast(byte16)b));
    }
}
unittest
{
    __m128i A = _mm_setr_epi8(1, 2, 3, 1, 2, 1, 1, 2, 3, 2, 1, 0, 0, 1, 2, 1);
    __m128i B = _mm_setr_epi8(2, 2, 1, 2, 3, 1, 2, 3, 2, 1, 0, 0, 1, 2, 1, 1);
    byte16 C = cast(byte16) _mm_cmpgt_epi8(A, B);
    byte[16] correct =       [0, 0,-1, 0, 0, 0, 0, 0,-1,-1,-1, 0, 0, 0,-1, 0];
    assert(C.array == correct);
}

/// Compare packed double-precision (64-bit) floating-point elements
/// in `a` and `b` for greater-than.
__m128d _mm_cmpgt_pd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmpgtpd(a, b);
    }
    else
    {
        return cast(__m128d) cmppd!(FPComparison.ogt)(a, b);
    }
}
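
// Usage sketch added for illustration (not in the original tests).
unittest
{
    __m128d A = _mm_setr_pd(2.0, 1.0);
    __m128d B = _mm_setr_pd(1.0, 1.0);
    long2 R = cast(long2) _mm_cmpgt_pd(A, B);
    long[2] correct = [-1, 0];
    assert(R.array == correct);
}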

/// Compare the lower double-precision (64-bit) floating-point elements
/// in `a` and `b` for greater-than, store the result in the lower element,
/// and copy the upper element from `a`.
__m128d _mm_cmpgt_sd (__m128d a, __m128d b) pure @safe
{
    // Note: There is no __builtin_ia32_cmpgtsd builtin.
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmpnlesd(b, a);
    }
    else
    {
        return cast(__m128d) cmpsd!(FPComparison.ogt)(a, b);
    }
}

/// Compare packed double-precision (64-bit) floating-point elements
/// in `a` and `b` for less-than-or-equal.
__m128d _mm_cmple_pd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmplepd(a, b);
    }
    else
    {
        return cast(__m128d) cmppd!(FPComparison.ole)(a, b);
    }
}

/// Compare the lower double-precision (64-bit) floating-point elements
/// in `a` and `b` for less-than-or-equal, store the result in the
/// lower element, and copy the upper element from `a`.
__m128d _mm_cmple_sd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmplesd(a, b);
    }
    else
    {
        return cast(__m128d) cmpsd!(FPComparison.ole)(a, b);
    }
}

/// Compare packed 16-bit integers in `a` and `b` for less-than.
__m128i _mm_cmplt_epi16 (__m128i a, __m128i b) pure @safe
{
    return _mm_cmpgt_epi16(b, a);
}
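
// Sanity check added for illustration (not in the original tests).
unittest
{
    short8   A = [-3, -2, -1,  0,  0,  1,  2,  3];
    short8   B = [ 4,  3,  2,  1,  0, -1, -2, -3];
    short[8] E = [-1, -1, -1, -1,  0,  0,  0,  0];
    short8   R = cast(short8) _mm_cmplt_epi16(cast(__m128i)A, cast(__m128i)B);
    assert(R.array == E);
}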

/// Compare packed 32-bit integers in `a` and `b` for less-than.
__m128i _mm_cmplt_epi32 (__m128i a, __m128i b) pure @safe
{
    return _mm_cmpgt_epi32(b, a);
}

/// Compare packed 8-bit integers in `a` and `b` for less-than.
__m128i _mm_cmplt_epi8 (__m128i a, __m128i b) pure @safe
{
    return _mm_cmpgt_epi8(b, a);
}

/// Compare packed double-precision (64-bit) floating-point elements
/// in `a` and `b` for less-than.
__m128d _mm_cmplt_pd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmpltpd(a, b);
    }
    else
    {
        return cast(__m128d) cmppd!(FPComparison.olt)(a, b);
    }
}
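
// Usage sketch added for illustration (not in the original tests).
unittest
{
    __m128d A = _mm_setr_pd(1.0, 2.0);
    __m128d B = _mm_setr_pd(2.0, 2.0);
    long2 R = cast(long2) _mm_cmplt_pd(A, B);
    long[2] correct = [-1, 0];
    assert(R.array == correct);
}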

/// Compare the lower double-precision (64-bit) floating-point elements
/// in `a` and `b` for less-than, store the result in the lower
/// element, and copy the upper element from `a`.
__m128d _mm_cmplt_sd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmpltsd(a, b);
    }
    else
    {
        return cast(__m128d) cmpsd!(FPComparison.olt)(a, b);
    }
}

/// Compare packed double-precision (64-bit) floating-point elements
/// in `a` and `b` for not-equal.
__m128d _mm_cmpneq_pd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmpneqpd(a, b);
    }
    else
    {
        return cast(__m128d) cmppd!(FPComparison.une)(a, b);
    }
}
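
// Illustrative sketch (added): this is an unordered comparison, so a NaN
// operand yields an all-ones mask.
unittest
{
    __m128d A = _mm_setr_pd(1.0, double.nan);
    __m128d B = _mm_setr_pd(1.0, 1.0);
    long2 R = cast(long2) _mm_cmpneq_pd(A, B);
    long[2] correct = [0, -1];
    assert(R.array == correct);
}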

/// Compare the lower double-precision (64-bit) floating-point elements
/// in `a` and `b` for not-equal, store the result in the lower
/// element, and copy the upper element from `a`.
__m128d _mm_cmpneq_sd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmpneqsd(a, b);
    }
    else
    {
        return cast(__m128d) cmpsd!(FPComparison.une)(a, b);
    }
}

/// Compare packed double-precision (64-bit) floating-point elements
/// in `a` and `b` for not-greater-than-or-equal.
__m128d _mm_cmpnge_pd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmpngepd(a, b);
    }
    else
    {
        return cast(__m128d) cmppd!(FPComparison.ult)(a, b);
    }
}

/// Compare the lower double-precision (64-bit) floating-point elements
/// in `a` and `b` for not-greater-than-or-equal, store the result in
/// the lower element, and copy the upper element from `a`.
__m128d _mm_cmpnge_sd (__m128d a, __m128d b) pure @safe
{
    // Note: There is no __builtin_ia32_cmpngesd builtin.
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmpltsd(b, a);
    }
    else
    {
        return cast(__m128d) cmpsd!(FPComparison.ult)(a, b);
    }
}

/// Compare packed double-precision (64-bit) floating-point elements
/// in `a` and `b` for not-greater-than.
__m128d _mm_cmpngt_pd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmpngtpd(a, b);
    }
    else
    {
        return cast(__m128d) cmppd!(FPComparison.ule)(a, b);
    }
}

/// Compare the lower double-precision (64-bit) floating-point elements
/// in `a` and `b` for not-greater-than, store the result in the
/// lower element, and copy the upper element from `a`.
__m128d _mm_cmpngt_sd (__m128d a, __m128d b) pure @safe
{
    // Note: There is no __builtin_ia32_cmpngtsd builtin.
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmplesd(b, a);
    }
    else
    {
        return cast(__m128d) cmpsd!(FPComparison.ule)(a, b);
    }
}

/// Compare packed double-precision (64-bit) floating-point elements
/// in `a` and `b` for not-less-than-or-equal.
__m128d _mm_cmpnle_pd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmpnlepd(a, b);
    }
    else
    {
        return cast(__m128d) cmppd!(FPComparison.ugt)(a, b);
    }
}

/// Compare the lower double-precision (64-bit) floating-point elements
/// in `a` and `b` for not-less-than-or-equal, store the result in the
/// lower element, and copy the upper element from `a`.
__m128d _mm_cmpnle_sd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmpnlesd(a, b);
    }
    else
    {
        return cast(__m128d) cmpsd!(FPComparison.ugt)(a, b);
    }
}

/// Compare packed double-precision (64-bit) floating-point elements
/// in `a` and `b` for not-less-than.
__m128d _mm_cmpnlt_pd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmpnltpd(a, b);
    }
    else
    {
        return cast(__m128d) cmppd!(FPComparison.uge)(a, b);
    }
}

/// Compare the lower double-precision (64-bit) floating-point elements
/// in `a` and `b` for not-less-than, store the result in the lower
/// element, and copy the upper element from `a`.
__m128d _mm_cmpnlt_sd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmpnltsd(a, b);
    }
    else
    {
        return cast(__m128d) cmpsd!(FPComparison.uge)(a, b);
    }
}

/// Compare packed double-precision (64-bit) floating-point elements
/// in `a` and `b` to see if neither is NaN.
__m128d _mm_cmpord_pd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmpordpd(a, b);
    }
    else
    {
        return cast(__m128d) cmppd!(FPComparison.ord)(a, b);
    }
}
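
// Usage sketch added for illustration: all-ones where both lanes are non-NaN.
unittest
{
    __m128d A = _mm_setr_pd(1.0, double.nan);
    __m128d B = _mm_setr_pd(3.0, 4.0);
    long2 R = cast(long2) _mm_cmpord_pd(A, B);
    long[2] correct = [-1, 0];
    assert(R.array == correct);
}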

/// Compare the lower double-precision (64-bit) floating-point elements
/// in `a` and `b` to see if neither is NaN, store the result in the
/// lower element, and copy the upper element from `a` to the upper element.
__m128d _mm_cmpord_sd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmpordsd(a, b);
    }
    else
    {
        return cast(__m128d) cmpsd!(FPComparison.ord)(a, b);
    }
}

/// Compare packed double-precision (64-bit) floating-point elements
/// in `a` and `b` to see if either is NaN.
__m128d _mm_cmpunord_pd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmpunordpd(a, b);
    }
    else
    {
        return cast(__m128d) cmppd!(FPComparison.uno)(a, b);
    }
}
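
// Usage sketch added for illustration: all-ones where at least one lane is NaN.
unittest
{
    __m128d A = _mm_setr_pd(1.0, double.nan);
    __m128d B = _mm_setr_pd(3.0, 4.0);
    long2 R = cast(long2) _mm_cmpunord_pd(A, B);
    long[2] correct = [0, -1];
    assert(R.array == correct);
}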

/// Compare the lower double-precision (64-bit) floating-point elements
/// in `a` and `b` to see if either is NaN, store the result in the lower
/// element, and copy the upper element from `a` to the upper element.
__m128d _mm_cmpunord_sd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmpunordsd(a, b);
    }
    else
    {
        return cast(__m128d) cmpsd!(FPComparison.uno)(a, b);
    }
}


// Note: we've matched clang and GCC behaviour with regards to EFLAGS.
// Some of these comparisons yield true for NaNs, others don't.

/// Compare the lower double-precision (64-bit) floating-point element
/// in `a` and `b` for equality, and return the boolean result (0 or 1).
int _mm_comieq_sd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_comieq(a, b);
    }
    else
    {
        return comsd!(FPComparison.ueq)(a, b); // yields true for NaN, same as GCC
    }
}
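
// Usage sketch added for illustration: as noted above, this predicate
// yields true for NaN operands.
unittest
{
    assert(1 == _mm_comieq_sd(_mm_set1_pd(42.0), _mm_set1_pd(42.0)));
    assert(0 == _mm_comieq_sd(_mm_set1_pd(42.0), _mm_set1_pd(-42.0)));
    assert(1 == _mm_comieq_sd(_mm_set1_pd(double.nan), _mm_set1_pd(42.0)));
}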

/// Compare the lower double-precision (64-bit) floating-point element
/// in `a` and `b` for greater-than-or-equal, and return the boolean
/// result (0 or 1).
int _mm_comige_sd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_comige(a, b);
    }
    else
    {
        return comsd!(FPComparison.oge)(a, b);
    }
}
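
// Usage sketch added for illustration: an ordered predicate, so NaN operands yield 0.
unittest
{
    assert(1 == _mm_comige_sd(_mm_set1_pd(4.0), _mm_set1_pd(3.0)));
    assert(1 == _mm_comige_sd(_mm_set1_pd(3.0), _mm_set1_pd(3.0)));
    assert(0 == _mm_comige_sd(_mm_set1_pd(double.nan), _mm_set1_pd(3.0)));
}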

/// Compare the lower double-precision (64-bit) floating-point element
/// in `a` and `b` for greater-than, and return the boolean result (0 or 1).
int _mm_comigt_sd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_comigt(a, b);
    }
    else
    {
        return comsd!(FPComparison.ogt)(a, b);
    }
}

/// Compare the lower double-precision (64-bit) floating-point element
/// in `a` and `b` for less-than-or-equal, and return the boolean result (0 or 1).
int _mm_comile_sd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_comile(a, b);
    }
    else
    {
        return comsd!(FPComparison.ule)(a, b); // yields true for NaN, same as GCC
    }
}

/// Compare the lower double-precision (64-bit) floating-point element
/// in `a` and `b` for less-than, and return the boolean result (0 or 1).
int _mm_comilt_sd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_comilt(a, b);
    }
    else
    {
        return comsd!(FPComparison.ult)(a, b); // yields true for NaN, same as GCC
    }
}

/// Compare the lower double-precision (64-bit) floating-point element
/// in `a` and `b` for not-equal, and return the boolean result (0 or 1).
int _mm_comineq_sd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_comineq(a, b);
    }
    else
    {
        return comsd!(FPComparison.one)(a, b);
    }
}

/// Convert packed 32-bit integers in `a` to packed double-precision (64-bit)
/// floating-point elements.
__m128d _mm_cvtepi32_pd (__m128i a) pure @trusted
{
    version(LDC)
    {
        // Generates cvtdq2pd since LDC 1.0, even without optimizations
        enum ir = `
            %v = shufflevector <4 x i32> %0,<4 x i32> %0, <2 x i32> <i32 0, i32 1>
            %r = sitofp <2 x i32> %v to <2 x double>
            ret <2 x double> %r`;
        return cast(__m128d) LDCInlineIR!(ir, __m128d, __m128i)(a);
    }
    else static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cvtdq2pd(a);
    }
    else
    {
        double2 r = void;
        r.ptr[0] = a.array[0];
        r.ptr[1] = a.array[1];
        return r;
    }
}
unittest
{
    __m128d A = _mm_cvtepi32_pd(_mm_set1_epi32(54));
    assert(A.array[0] == 54.0);
    assert(A.array[1] == 54.0);
}

/// Convert packed 32-bit integers in `a` to packed single-precision (32-bit)
/// floating-point elements.
__m128 _mm_cvtepi32_ps(__m128i a) pure @trusted
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cvtdq2ps(a);
    }
    else
    {
        // x86: Generates cvtdq2ps since LDC 1.0.0 -O1
        // ARM: Generates scvtf.4s since LDC 1.8.0 -O2
        __m128 res;
        res.ptr[0] = cast(float)a.array[0];
        res.ptr[1] = cast(float)a.array[1];
        res.ptr[2] = cast(float)a.array[2];
        res.ptr[3] = cast(float)a.array[3];
        return res;
    }
}
unittest
{
    __m128 a = _mm_cvtepi32_ps(_mm_setr_epi32(-1, 0, 1, 1000));
    assert(a.array == [-1.0f, 0.0f, 1.0f, 1000.0f]);
}

/// Convert packed double-precision (64-bit) floating-point elements
/// in `a` to packed 32-bit integers.
// PERF: #ARM
__m128i _mm_cvtpd_epi32 (__m128d a) @trusted
{
    static if (LDC_with_SSE2)
    {
        // Like in clang, implemented with a magic intrinsic right now
        return __builtin_ia32_cvtpd2dq(a);
    }
    else static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cvtpd2dq(a);
    }
    else
    {
        __m128i r = _mm_setzero_si128();
        r.ptr[0] = convertDoubleToInt32UsingMXCSR(a.array[0]);
        r.ptr[1] = convertDoubleToInt32UsingMXCSR(a.array[1]);
        return r;
    }
}
unittest
{
    int4 A = _mm_cvtpd_epi32(_mm_set_pd(61.0, 55.0));
    assert(A.array[0] == 55 && A.array[1] == 61 && A.array[2] == 0 && A.array[3] == 0);
}

/// Convert packed double-precision (64-bit) floating-point elements in `v`
/// to packed 32-bit integers.
__m64 _mm_cvtpd_pi32 (__m128d v) @safe
{
    return to_m64(_mm_cvtpd_epi32(v));
}
unittest
{
    int2 A = cast(int2) _mm_cvtpd_pi32(_mm_set_pd(61.0, 55.0));
    assert(A.array[0] == 55 && A.array[1] == 61);
}

/// Convert packed double-precision (64-bit) floating-point elements
/// in `a` to packed single-precision (32-bit) floating-point elements.
__m128 _mm_cvtpd_ps (__m128d a) pure @trusted
{
    static if (LDC_with_SSE2)
    {
        return __builtin_ia32_cvtpd2ps(a); // can't be done with IR unfortunately
    }
    else static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cvtpd2ps(a);
    }
    else
    {
        __m128 r = void;
        r.ptr[0] = a.array[0];
        r.ptr[1] = a.array[1];
        r.ptr[2] = 0;
        r.ptr[3] = 0;
        return r;
    }
}
unittest
{
    __m128d A = _mm_set_pd(5.25, 4.0);
    __m128 B = _mm_cvtpd_ps(A);
    assert(B.array == [4.0f, 5.25f, 0, 0]);
}

/// Convert packed 32-bit integers in `v` to packed double-precision
/// (64-bit) floating-point elements.
__m128d _mm_cvtpi32_pd (__m64 v) pure @safe
{
    return _mm_cvtepi32_pd(to_m128i(v));
}
unittest
{
    __m128d A = _mm_cvtpi32_pd(_mm_setr_pi32(4, -5));
    assert(A.array[0] == 4.0 && A.array[1] == -5.0);
}

/// Convert packed single-precision (32-bit) floating-point elements
/// in `a` to packed 32-bit integers.
__m128i _mm_cvtps_epi32 (__m128 a) @trusted
{
    static if (LDC_with_SSE2)
    {
        // Disabled, since it fails with optimizations unfortunately
        //alias _mm_cvtps_epi32 = __builtin_ia32_cvtps2dq;
        return __asm!__m128i("cvtps2dq $1,$0","=x,x",a);
    }
    else static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cvtps2dq(a);
    }
    else static if (LDC_with_ARM64)
    {
        // Get current rounding mode.
        uint fpscr = arm_get_fpcr();
        switch(fpscr & _MM_ROUND_MASK_ARM)
        {
            default:
            case _MM_ROUND_NEAREST_ARM:     return vcvtnq_s32_f32(a);
            case _MM_ROUND_DOWN_ARM:        return vcvtmq_s32_f32(a);
            case _MM_ROUND_UP_ARM:          return vcvtpq_s32_f32(a);
            case _MM_ROUND_TOWARD_ZERO_ARM: return vcvtzq_s32_f32(a);
        }
    }
    else
    {
        __m128i r = void;
        r.ptr[0] = convertFloatToInt32UsingMXCSR(a.array[0]);
        r.ptr[1] = convertFloatToInt32UsingMXCSR(a.array[1]);
        r.ptr[2] = convertFloatToInt32UsingMXCSR(a.array[2]);
        r.ptr[3] = convertFloatToInt32UsingMXCSR(a.array[3]);
        return r;
    }
}
unittest
{
    uint savedRounding = _MM_GET_ROUNDING_MODE();

    _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST);
    __m128i A = _mm_cvtps_epi32(_mm_setr_ps(1.4f, -2.1f, 53.5f, -2.9f));
    assert(A.array == [1, -2, 54, -3]);

    _MM_SET_ROUNDING_MODE(_MM_ROUND_DOWN);
    A = _mm_cvtps_epi32(_mm_setr_ps(1.4f, -2.1f, 53.5f, -2.9f));
    assert(A.array == [1, -3, 53, -3]);

    _MM_SET_ROUNDING_MODE(_MM_ROUND_UP);
    A = _mm_cvtps_epi32(_mm_setr_ps(1.4f, -2.1f, 53.5f, -2.9f));
    assert(A.array == [2, -2, 54, -2]);

    _MM_SET_ROUNDING_MODE(_MM_ROUND_TOWARD_ZERO);
    A = _mm_cvtps_epi32(_mm_setr_ps(1.4f, -2.1f, 53.5f, -2.9f));
    assert(A.array == [1, -2, 53, -2]);

    _MM_SET_ROUNDING_MODE(savedRounding);
}

/// Convert packed single-precision (32-bit) floating-point elements
/// in `a` to packed double-precision (64-bit) floating-point elements.
__m128d _mm_cvtps_pd (__m128 a) pure @trusted
{
    version(LDC)
    {
        // Generates cvtps2pd since LDC 1.0 -O0
        enum ir = `
            %v = shufflevector <4 x float> %0,<4 x float> %0, <2 x i32> <i32 0, i32 1>
            %r = fpext <2 x float> %v to <2 x double>
            ret <2 x double> %r`;
        return cast(__m128d) LDCInlineIR!(ir, __m128d, __m128)(a);
    }
    else static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cvtps2pd(a);
    }
    else
    {
        double2 r = void;
        r.ptr[0] = a.array[0];
        r.ptr[1] = a.array[1];
        return r;
    }
}
unittest
{
    __m128d A = _mm_cvtps_pd(_mm_set1_ps(54.0f));
    assert(A.array[0] == 54.0);
    assert(A.array[1] == 54.0);
}

/// Copy the lower double-precision (64-bit) floating-point element of `a`.
double _mm_cvtsd_f64 (__m128d a) pure @safe
{
    return a.array[0];
}
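
// Usage sketch added for illustration (not in the original tests).
unittest
{
    assert(_mm_cvtsd_f64(_mm_setr_pd(42.0, -1.0)) == 42.0);
}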

/// Convert the lower double-precision (64-bit) floating-point element
/// in `a` to a 32-bit integer.
int _mm_cvtsd_si32 (__m128d a) @safe
{
    static if (LDC_with_SSE2)
    {
        return __builtin_ia32_cvtsd2si(a);
    }
    else static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cvtsd2si(a);
    }
    else
    {
        return convertDoubleToInt32UsingMXCSR(a[0]);
    }
}
unittest
{
    assert(4 == _mm_cvtsd_si32(_mm_set1_pd(4.0)));
}

/// Convert the lower double-precision (64-bit) floating-point element in `a` to a 64-bit integer.
long _mm_cvtsd_si64 (__m128d a) @trusted
{
    version(LDC)
    {
        // Unfortunately this builtin crashes in 32-bit
        version(X86_64)
            return __builtin_ia32_cvtsd2si64(a);
        else
        {
            return convertDoubleToInt64UsingMXCSR(a[0]);
        }
    }
    else
    {
        return convertDoubleToInt64UsingMXCSR(a.array[0]);
    }
}
unittest
{
    assert(-4 == _mm_cvtsd_si64(_mm_set1_pd(-4.0)));

    uint savedRounding = _MM_GET_ROUNDING_MODE();

    _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST);
    assert(-56468486186 == _mm_cvtsd_si64(_mm_set1_pd(-56468486186.49)));

    _MM_SET_ROUNDING_MODE(_MM_ROUND_DOWN);
    assert(-56468486187 == _mm_cvtsd_si64(_mm_set1_pd(-56468486186.1)));

    _MM_SET_ROUNDING_MODE(_MM_ROUND_UP);
    assert(56468486187 == _mm_cvtsd_si64(_mm_set1_pd(56468486186.1)));

    _MM_SET_ROUNDING_MODE(_MM_ROUND_TOWARD_ZERO);
    assert(-56468486186 == _mm_cvtsd_si64(_mm_set1_pd(-56468486186.9)));

    _MM_SET_ROUNDING_MODE(savedRounding);
}

deprecated("Use _mm_cvtsd_si64 instead") alias _mm_cvtsd_si64x = _mm_cvtsd_si64; ///

/// Convert the lower double-precision (64-bit) floating-point element in `b` to a single-precision (32-bit)
/// floating-point element, store that in the lower element of result, and copy the upper 3 packed elements from `a`
/// to the upper elements of result.
__m128 _mm_cvtsd_ss (__m128 a, __m128d b) pure @trusted
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cvtsd2ss(a, b);
    }
    else
    {
        // Generates cvtsd2ss since LDC 1.3 -O0
        a.ptr[0] = b.array[0];
        return a;
    }
}
unittest
{
    __m128 R = _mm_cvtsd_ss(_mm_set1_ps(4.0f), _mm_set1_pd(3.0));
    assert(R.array == [3.0f, 4.0f, 4.0f, 4.0f]);
}

/// Get the lower 32-bit integer in `a`.
int _mm_cvtsi128_si32 (__m128i a) pure @safe
{
    return a.array[0];
}

/// Get the lower 64-bit integer in `a`.
long _mm_cvtsi128_si64 (__m128i a) pure @safe
{
    long2 la = cast(long2)a;
    return la.array[0];
}
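
// Usage sketch added for illustration (not in the original tests).
unittest
{
    __m128i A = _mm_cvtsi64_si128(-42);
    assert(_mm_cvtsi128_si64(A) == -42);
}
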
deprecated("Use _mm_cvtsi128_si64 instead") alias _mm_cvtsi128_si64x = _mm_cvtsi128_si64; ///

/// Convert the signed 32-bit integer `b` to a double-precision (64-bit) floating-point element, store that in the
/// lower element of result, and copy the upper element from `a` to the upper element of result.
__m128d _mm_cvtsi32_sd(__m128d a, int b) pure @trusted
{
    a.ptr[0] = cast(double)b;
    return a;
}
unittest
{
    __m128d a = _mm_cvtsi32_sd(_mm_set1_pd(0.0f), 42);
    assert(a.array == [42.0, 0]);
}

/// Copy 32-bit integer `a` to the lower element of result, and zero the upper elements.
__m128i _mm_cvtsi32_si128 (int a) pure @trusted
{
    int4 r = [0, 0, 0, 0];
    r.ptr[0] = a;
    return r;
}
unittest
{
    __m128i a = _mm_cvtsi32_si128(65);
    assert(a.array == [65, 0, 0, 0]);
}

/// Convert the signed 64-bit integer `b` to a double-precision (64-bit) floating-point element, store that in the
/// lower element of result, and copy the upper element from `a` to the upper element of result.
__m128d _mm_cvtsi64_sd(__m128d a, long b) pure @trusted
{
    a.ptr[0] = cast(double)b;
    return a;
}
unittest
{
    __m128d a = _mm_cvtsi64_sd(_mm_set1_pd(0.0f), 42);
    assert(a.array == [42.0, 0]);
}

/// Copy 64-bit integer `a` to the lower element of result, and zero the upper element.
__m128i _mm_cvtsi64_si128 (long a) pure @trusted
{
    long2 r = [0, 0];
    r.ptr[0] = a;
    return cast(__m128i)(r);
}
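
// Usage sketch added for illustration (not in the original tests).
unittest
{
    long2 R = cast(long2) _mm_cvtsi64_si128(0x1234_5678_9ABC_DEF0);
    long[2] correct = [0x1234_5678_9ABC_DEF0, 0];
    assert(R.array == correct);
}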

deprecated("Use _mm_cvtsi64_sd instead") alias _mm_cvtsi64x_sd = _mm_cvtsi64_sd; ///
deprecated("Use _mm_cvtsi64_si128 instead") alias _mm_cvtsi64x_si128 = _mm_cvtsi64_si128; ///

/// Convert the lower single-precision (32-bit) floating-point element in `b` to a double-precision (64-bit)
/// floating-point element, store that in the lower element of result, and copy the upper element from `a` to the upper
/// element of result.
double2 _mm_cvtss_sd(double2 a, float4 b) pure @trusted
{
    a.ptr[0] = b.array[0];
    return a;
}
unittest
{
    __m128d a = _mm_cvtss_sd(_mm_set1_pd(0.0f), _mm_set1_ps(42.0f));
    assert(a.array == [42.0, 0]);
}

/// Convert the lower single-precision (32-bit) floating-point element in `a` to a 64-bit integer with truncation.
long _mm_cvttss_si64 (__m128 a) pure @safe
{
    return cast(long)(a.array[0]); // Generates cvttss2si as expected
}
unittest
{
    assert(1 == _mm_cvttss_si64(_mm_setr_ps(1.9f, 2.0f, 3.0f, 4.0f)));
}

/// Convert packed double-precision (64-bit) floating-point elements in `a` to packed 32-bit integers with truncation.
/// Put zeroes in the upper elements of result.
__m128i _mm_cvttpd_epi32 (__m128d a) pure @trusted
{
    static if (LDC_with_SSE2)
    {
        return __builtin_ia32_cvttpd2dq(a);
    }
    else static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cvttpd2dq(a);
    }
    else
    {
        // Note: doesn't generate cvttpd2dq as of LDC 1.13
        __m128i r;
        r.ptr[0] = cast(int)a.array[0];
        r.ptr[1] = cast(int)a.array[1];
        r.ptr[2] = 0;
        r.ptr[3] = 0;
        return r;
    }
}
unittest
{
    __m128i R = _mm_cvttpd_epi32(_mm_setr_pd(-4.9, 45641.5f));
    assert(R.array == [-4, 45641, 0, 0]);
}

/// Convert packed double-precision (64-bit) floating-point elements in `v`
/// to packed 32-bit integers with truncation.
__m64 _mm_cvttpd_pi32 (__m128d v) pure @safe
{
    return to_m64(_mm_cvttpd_epi32(v));
}
unittest
{
    int2 R = cast(int2) _mm_cvttpd_pi32(_mm_setr_pd(-4.9, 45641.7f));
    int[2] correct = [-4, 45641];
    assert(R.array == correct);
}

/// Convert packed single-precision (32-bit) floating-point elements in `a` to packed 32-bit integers with truncation.
__m128i _mm_cvttps_epi32 (__m128 a) pure @trusted
{
    // x86: Generates cvttps2dq since LDC 1.3 -O2
    // ARM64: Generates fcvtzs since LDC 1.8 -O2
    __m128i r;
    r.ptr[0] = cast(int)a.array[0];
    r.ptr[1] = cast(int)a.array[1];
    r.ptr[2] = cast(int)a.array[2];
    r.ptr[3] = cast(int)a.array[3];
    return r;
}
unittest
{
    __m128i R = _mm_cvttps_epi32(_mm_setr_ps(-4.9, 45641.5f, 0.0f, 1.0f));
    assert(R.array == [-4, 45641, 0, 1]);
}

/// Convert the lower double-precision (64-bit) floating-point element in `a` to a 32-bit integer with truncation.
int _mm_cvttsd_si32 (__m128d a)
{
    // Generates cvttsd2si since LDC 1.3 -O0
    return cast(int)a.array[0];
}
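
// Usage sketch added for illustration: truncation rounds toward zero,
// regardless of the current MXCSR rounding mode.
unittest
{
    assert(-4 == _mm_cvttsd_si32(_mm_set1_pd(-4.9)));
    assert( 4 == _mm_cvttsd_si32(_mm_set1_pd( 4.9)));
}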

/// Convert the lower double-precision (64-bit) floating-point element in `a` to a 64-bit integer with truncation.
long _mm_cvttsd_si64 (__m128d a)
{
    // Generates cvttsd2si since LDC 1.3 -O0,
    // but in 32-bit it's a long sequence that resorts to the FPU
    return cast(long)a.array[0];
}

deprecated("Use _mm_cvttsd_si64 instead") alias _mm_cvttsd_si64x = _mm_cvttsd_si64; ///

/// Divide packed double-precision (64-bit) floating-point elements in `a` by packed elements in `b`.
__m128d _mm_div_pd(__m128d a, __m128d b) pure @safe
{
    return a / b;
}
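
// Usage sketch added for illustration (not in the original tests).
unittest
{
    __m128d A = _mm_setr_pd(6.0, -9.0);
    __m128d B = _mm_setr_pd(3.0, 3.0);
    __m128d R = _mm_div_pd(A, B);
    assert(R.array == [2.0, -3.0]);
}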

/// Divide the lower double-precision (64-bit) floating-point element in `a` by the lower element in `b`, store the
/// result in the lower element, and copy the upper element from `a`.
__m128d _mm_div_sd(__m128d a, __m128d b) pure @trusted
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_divsd(a, b);
    }
    else version(DigitalMars)
    {
        // Work-around for https://issues.dlang.org/show_bug.cgi?id=19599
        // This work-around is probably unneeded since DMD >= 2.094.0, but it hasn't been re-verified.
        asm pure nothrow @nogc @trusted { nop;}
        a.array[0] = a.array[0] / b.array[0];
        return a;
    }
    else
    {
        a.ptr[0] /= b.array[0];
        return a;
    }
}
unittest
{
    __m128d a = [2.0, 4.5];
    a = _mm_div_sd(a, a);
    assert(a.array == [1.0, 4.5]);
}

/// Extract a 16-bit integer from `v`, selected with `index`.
// PERF: ARM version has array bound check
int _mm_extract_epi16(__m128i v, int index) pure @safe
{
    short8 r = cast(short8)v;
    return cast(ushort)(r.array[index]);
}
unittest
{
    __m128i A = _mm_set_epi16(7, 6, 5, 4, 3, 2, 1, -1);
    assert(_mm_extract_epi16(A, 6) == 6);
    assert(_mm_extract_epi16(A, 0) == 65535);
}

/// Copy `v`, and insert the 16-bit integer `i` at the location specified by `index`.
__m128i _mm_insert_epi16 (__m128i v, int i, int index) @trusted
{
    short8 r = cast(short8)v;
    r.ptr[index & 7] = cast(short)i;
    return cast(__m128i)r;
}
unittest
{
    __m128i A = _mm_set_epi16(7, 6, 5, 4, 3, 2, 1, 0);
    short8 R = cast(short8) _mm_insert_epi16(A, 42, 6);
    short[8] correct = [0, 1, 2, 3, 4, 5, 42, 7];
    assert(R.array == correct);
}
1708 
1709 
/// Perform a serializing operation on all load-from-memory instructions that were issued prior to this instruction. 
/// Guarantees that every load instruction that precedes, in program order, the load fence instruction is globally 
/// visible before any load instruction which follows the fence in program order.
void _mm_lfence() @trusted
1711 {
    version(GNU)
    {
        static if (GDC_with_SSE2)
1716         {
1717             __builtin_ia32_lfence();
1718         }
1719         else version(X86)
1720         {
1721             asm pure nothrow @nogc @trusted
1722             {
1723                 "lfence;\n" : : : ;
1724             }
1725         }
1726         else
1727             static assert(false);
1728     }
1729     else static if (LDC_with_SSE2)
1730     {
1731         __builtin_ia32_lfence();
1732     }
1733     else static if (DMD_with_asm)
1734     {
1735         asm nothrow @nogc pure @safe
1736         {
1737             lfence;
1738         }
1739     }
1740     else version(LDC)
1741     {
1742         llvm_memory_fence(); // PERF actually generates mfence
1743     }
1744     else
1745         static assert(false);
1746 }
1747 unittest
1748 {
1749     _mm_lfence();
1750 }
1751 
1752 /// Load 128-bits (composed of 2 packed double-precision (64-bit) floating-point elements) from memory.
1753 /// `mem_addr` must be aligned on a 16-byte boundary or a general-protection exception may be generated.
1754 __m128d _mm_load_pd (const(double) * mem_addr) pure
1755 {
1756     __m128d* aligned = cast(__m128d*)mem_addr;
1757     return *aligned;
1758 }
1759 unittest
1760 {
1761     align(16) double[2] S = [-5.0, 7.0];
1762     __m128d R = _mm_load_pd(S.ptr);
1763     assert(R.array == S);
1764 }
1765 
1766 /// Load a double-precision (64-bit) floating-point element from memory into both elements of dst.
1767 /// `mem_addr` does not need to be aligned on any particular boundary.
1768 __m128d _mm_load_pd1 (const(double)* mem_addr) pure
1769 {
1770     double[2] arr = [*mem_addr, *mem_addr];
1771     return loadUnaligned!(double2)(&arr[0]);
1772 }
1773 unittest
1774 {
1775     double what = 4;
1776     __m128d R = _mm_load_pd1(&what);
1777     double[2] correct = [4.0, 4];
1778     assert(R.array == correct);
1779 }
1780 
1781 /// Load a double-precision (64-bit) floating-point element from memory into the lower of result, and zero the upper 
1782 /// element. `mem_addr` does not need to be aligned on any particular boundary.
1783 __m128d _mm_load_sd (const(double)* mem_addr) pure @trusted
1784 {
1785     double2 r = [0, 0];
1786     r.ptr[0] = *mem_addr;
1787     return r;
1788 }
1789 unittest
1790 {
1791     double x = -42;
1792     __m128d a = _mm_load_sd(&x);
1793     assert(a.array == [-42.0, 0.0]);
1794 }
1795 
1796 /// Load 128-bits of integer data from memory into dst. 
1797 /// `mem_addr` must be aligned on a 16-byte boundary or a general-protection exception may be generated.
__m128i _mm_load_si128 (const(__m128i)* mem_addr) pure @trusted // TODO: shouldn't be trusted because alignment, Issue #62
1799 {
1800     return *mem_addr;
1801 }
1802 unittest
1803 {
1804     align(16) int[4] correct = [-1, 2, 3, 4];
1805     int4 A = cast(int4) _mm_load_si128(cast(__m128i*) correct.ptr);
1806     assert(A.array == correct);
1807 }
1808 
1809 alias _mm_load1_pd = _mm_load_pd1; ///
1810 
1811 /// Load a double-precision (64-bit) floating-point element from memory into the upper element of result, and copy the 
1812 /// lower element from `a` to result. `mem_addr` does not need to be aligned on any particular boundary.
1813 __m128d _mm_loadh_pd (__m128d a, const(double)* mem_addr) pure @trusted
1814 {
1815     a.ptr[1] = *mem_addr;
1816     return a;
1817 }
1818 unittest
1819 {
1820     double A = 7.0;
1821     __m128d B = _mm_setr_pd(4.0, -5.0);
1822     __m128d R = _mm_loadh_pd(B, &A);
1823     double[2] correct = [ 4.0, 7.0 ];
1824     assert(R.array == correct);
1825 }
1826 
1827 /// Load 64-bit integer from memory into the first element of result. Zero out the other.
// Note: strange signature since the memory doesn't have to be aligned (Issue #60)
1829 __m128i _mm_loadl_epi64 (const(__m128i)* mem_addr) pure @trusted // TODO signature
1830 {
1831     auto pLong = cast(const(long)*)mem_addr;
1832     long2 r = [0, 0];
1833     r.ptr[0] = *pLong;
1834     return cast(__m128i)(r);
1835 }
1836 unittest
1837 {
1838     long A = 0x7878787870707070;
1839     long2 R = cast(long2) _mm_loadl_epi64(cast(__m128i*)&A);
1840     long[2] correct = [0x7878787870707070, 0];
1841     assert(R.array == correct);
1842 }
1843 
1844 /// Load a double-precision (64-bit) floating-point element from memory into the lower element of result, and copy the 
/// upper element from `a` to result. `mem_addr` does not need to be aligned on any particular boundary.
1846 __m128d _mm_loadl_pd (__m128d a, const(double)* mem_addr) pure @trusted
1847 {
1848     a.ptr[0] = *mem_addr;
1849     return a;
1850 }
1851 unittest
1852 {
1853     double A = 7.0;
1854     __m128d B = _mm_setr_pd(4.0, -5.0);
1855     __m128d R = _mm_loadl_pd(B, &A);
1856     double[2] correct = [ 7.0, -5.0 ];
1857     assert(R.array == correct);
1858 }
1859 
1860 /// Load 2 double-precision (64-bit) floating-point elements from memory into result in reverse order. 
1861 /// `mem_addr` must be aligned on a 16-byte boundary or a general-protection exception may be generated.
1862 __m128d _mm_loadr_pd (const(double)* mem_addr) pure @trusted // TODO: shouldn't be trusted
1863 {
1864     __m128d a = *cast(__m128d*)(mem_addr);
1865     __m128d r;
1866     r.ptr[0] = a.array[1];
1867     r.ptr[1] = a.array[0];
1868     return r;
1869 }
1870 unittest
1871 {
1872     align(16) double[2] A = [56.0, -74.0];
1873     __m128d R = _mm_loadr_pd(A.ptr);
1874     double[2] correct = [-74.0, 56.0];
1875     assert(R.array == correct);
1876 }
1877 
1878 /// Load 128-bits (composed of 2 packed double-precision (64-bit) floating-point elements) from memory. 
1879 /// `mem_addr` does not need to be aligned on any particular boundary.
1880 __m128d _mm_loadu_pd (const(double)* mem_addr) pure @safe
1881 {
1882     static if (GDC_with_SSE2)
1883     {
1884         return __builtin_ia32_loadupd(mem_addr); 
1885     }
1886     else
1887     {
1888         return loadUnaligned!(double2)(mem_addr);
1889     }
1890 }
1891 unittest
1892 {
1893     double[2] A = [56.0, -75.0];
1894     __m128d R = _mm_loadu_pd(A.ptr);
1895     double[2] correct = [56.0, -75.0];
1896     assert(R.array == correct);
1897 }
1898 
1899 /// Load 128-bits of integer data from memory. `mem_addr` does not need to be aligned on any particular boundary.
1900 __m128i _mm_loadu_si128 (const(__m128i)* mem_addr) pure @trusted
1901 {
1902     static if (GDC_with_SSE2)
1903     {
1904         return __builtin_ia32_loaddqu(cast(const(char*))mem_addr);
1905     }
1906     else
1907     {
1908         return loadUnaligned!(__m128i)(cast(int*)mem_addr);
1909     }
1910 }
1911 unittest
1912 {
1913     align(16) int[4] correct = [-1, 2, -3, 4];
1914     int4 A = cast(int4) _mm_loadu_si128(cast(__m128i*) correct.ptr);
1915     assert(A.array == correct);
1916 }
1917 
1918 /// Load unaligned 32-bit integer from memory into the first element of result.
1919 __m128i _mm_loadu_si32 (const(void)* mem_addr) pure @trusted
1920 {
1921     int r = *cast(int*)(mem_addr);
1922     int4 result = [0, 0, 0, 0];
1923     result.ptr[0] = r;
1924     return result;
1925 }
1926 unittest
1927 {
1928     int r = 42;
1929     __m128i A = _mm_loadu_si32(&r);
1930     int[4] correct = [42, 0, 0, 0];
1931     assert(A.array == correct);
1932 }
1933 
1934 /// Multiply packed signed 16-bit integers in `a` and `b`, producing intermediate
1935 /// signed 32-bit integers. Horizontally add adjacent pairs of intermediate 32-bit integers,
1936 /// and pack the results in destination.
1937 __m128i _mm_madd_epi16 (__m128i a, __m128i b) pure @trusted
1938 {
1939     static if (GDC_with_SSE2)
1940     {
1941         return cast(__m128i) __builtin_ia32_pmaddwd128(cast(short8)a, cast(short8)b);
1942     }
1943     else static if (LDC_with_SSE2)
1944     {
1945         return cast(__m128i) __builtin_ia32_pmaddwd128(cast(short8)a, cast(short8)b);
1946     }
1947     else static if (LDC_with_ARM64)
1948     {
1949         int4 pl = vmull_s16(vget_low_s16(cast(short8)a), vget_low_s16(cast(short8)b));
1950         int4 ph = vmull_s16(vget_high_s16(cast(short8)a), vget_high_s16(cast(short8)b));
1951         int2 rl = vpadd_s32(vget_low_s32(pl), vget_high_s32(pl));
1952         int2 rh = vpadd_s32(vget_low_s32(ph), vget_high_s32(ph));
1953         return vcombine_s32(rl, rh);
1954     }
1955     else
1956     {
1957         short8 sa = cast(short8)a;
1958         short8 sb = cast(short8)b;
1959         int4 r;
1960         foreach(i; 0..4)
1961         {
1962             r.ptr[i] = sa.array[2*i] * sb.array[2*i] + sa.array[2*i+1] * sb.array[2*i+1];
1963         }
1964         return r;
1965     }
1966 }
1967 unittest
1968 {
1969     short8 A = [0, 1, 2, 3, -32768, -32768, 32767, 32767];
1970     short8 B = [0, 1, 2, 3, -32768, -32768, 32767, 32767];
1971     int4 R = _mm_madd_epi16(cast(__m128i)A, cast(__m128i)B);
1972     int[4] correct = [1, 13, -2147483648, 2*32767*32767];
1973     assert(R.array == correct);
1974 }
1975 
1976 /// Conditionally store 8-bit integer elements from `a` into memory using `mask`
1977 /// (elements are not stored when the highest bit is not set in the corresponding element)
1978 /// and a non-temporal memory hint. `mem_addr` does not need to be aligned on any particular
1979 /// boundary.
1980 void _mm_maskmoveu_si128 (__m128i a, __m128i mask, void* mem_addr) @trusted
1981 {
1982     static if (GDC_with_SSE2)
1983     {    
1984         return __builtin_ia32_maskmovdqu(cast(ubyte16)a, cast(ubyte16)mask, cast(char*)mem_addr);
1985     }
1986     else static if (LDC_with_SSE2)
1987     {
1988         return __builtin_ia32_maskmovdqu(cast(byte16)a, cast(byte16)mask, cast(char*)mem_addr);
1989     }
1990     else
1991     {
1992         // PERF: catastrophic on ARM
1993         byte16 b = cast(byte16)a;
1994         byte16 m = cast(byte16)mask;
1995         byte* dest = cast(byte*)(mem_addr);
1996         foreach(j; 0..16)
1997         {
1998             if (m.array[j] & 128)
1999             {
2000                 dest[j] = b.array[j];
2001             }
2002         }
2003     }
2004 }
2005 unittest
2006 {
2007     ubyte[16] dest =           [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42];
2008     __m128i mask = _mm_setr_epi8(0,-1, 0,-1,-1, 1,-1,-1, 0,-1,-4,-1,-1, 0,-127, 0);
2009     __m128i A    = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15);
2010     _mm_maskmoveu_si128(A, mask, dest.ptr);
2011     ubyte[16] correct =        [42, 1,42, 3, 4,42, 6, 7,42, 9,10,11,12,42,14,42];
2012     assert(dest == correct);
2013 }
2014 
2015 /// Compare packed signed 16-bit integers in `a` and `b`, and return packed maximum values.
2016 __m128i _mm_max_epi16 (__m128i a, __m128i b) pure @safe
2017 {
2018     version(GNU)
2019     {
2020         // PERF: not necessarily the best for GDC
2021         __m128i lowerShorts = _mm_cmpgt_epi16(a, b); // ones where a should be selected, b else
2022         __m128i aTob = _mm_xor_si128(a, b); // a ^ (a ^ b) == b
2023         __m128i mask = _mm_and_si128(aTob, lowerShorts);
2024         return _mm_xor_si128(b, mask);
2025     }
2026     else
2027     {
2028         // x86: pmaxsw since LDC 1.0 -O1
        // ARM: smax.8h since LDC 1.5 -O1
2030         short8 sa = cast(short8)a;
2031         short8 sb = cast(short8)b;
2032         short8 greater = greaterMask!short8(sa, sb);
2033         return cast(__m128i)( (greater & sa) | (~greater & sb) );
2034     }
2035 }
2036 unittest
2037 {
2038     short8 R = cast(short8) _mm_max_epi16(_mm_setr_epi16(32767, 1, -4, -8, 9,  7, 0,-57),
2039                                           _mm_setr_epi16(-4,-8,  9,  7, 0,-32768, 0,  0));
2040     short[8] correct =                                  [32767, 1,  9,  7, 9,  7, 0,  0];
2041     assert(R.array == correct);
2042 }
2043 
2044 /// Compare packed unsigned 8-bit integers in a and b, and return packed maximum values.
2045 __m128i _mm_max_epu8 (__m128i a, __m128i b) pure @safe
2046 {
2047     version(LDC)
2048     {
2049         // x86: pmaxub since LDC 1.0.0 -O1
2050         // ARM64: umax.16b since LDC 1.5.0 -O1
2051         // PERF: catastrophic on ARM32
2052         alias ubyte16 = Vector!(ubyte[16]);
2053         ubyte16 sa = cast(ubyte16)a;
2054         ubyte16 sb = cast(ubyte16)b;
2055         ubyte16 greater = cast(ubyte16) greaterMask!ubyte16(sa, sb);
2056         return cast(__m128i)( (greater & sa) | (~greater & sb) );
2057     }
2058     else
2059     {
2060         __m128i value128 = _mm_set1_epi8(-128);
2061         __m128i higher = _mm_cmpgt_epi8(_mm_add_epi8(a, value128), _mm_add_epi8(b, value128)); // signed comparison
2062         __m128i aTob = _mm_xor_si128(a, b); // a ^ (a ^ b) == b
2063         __m128i mask = _mm_and_si128(aTob, higher);
2064         return _mm_xor_si128(b, mask);
2065     }
2066 }
2067 unittest
2068 {
2069     byte16 R = cast(byte16) _mm_max_epu8(_mm_setr_epi8(45, 1, -4, -8, 9,  7, 0,-57, -4,-8,  9,  7, 0,-57, 0,  0),
2070                                          _mm_setr_epi8(-4,-8,  9,  7, 0,-57, 0,  0, 45, 1, -4, -8, 9,  7, 0,-57));
2071     byte[16] correct =                                [-4,-8, -4, -8, 9,-57, 0,-57, -4,-8, -4, -8, 9,-57, 0,-57];
2072     assert(R.array == correct);
2073 }
2074 
2075 /// Compare packed double-precision (64-bit) floating-point elements in `a` and `b`, and return packed maximum values.
2076 __m128d _mm_max_pd (__m128d a, __m128d b) pure @trusted
2077 {
2078     static if (GDC_with_SSE2)
2079     {
2080         return __builtin_ia32_maxpd(a, b);
2081     }
2082     else
2083     {
2084         // x86: Generates maxpd starting with LDC 1.9 -O2
2085         a.ptr[0] = (a.array[0] > b.array[0]) ? a.array[0] : b.array[0];
2086         a.ptr[1] = (a.array[1] > b.array[1]) ? a.array[1] : b.array[1];
2087         return a;
2088     }
2089 }
2090 unittest
2091 {
2092     __m128d A = _mm_setr_pd(4.0, 1.0);
2093     __m128d B = _mm_setr_pd(1.0, 8.0);
2094     __m128d M = _mm_max_pd(A, B);
2095     assert(M.array[0] == 4.0);
2096     assert(M.array[1] == 8.0);
2097 }
2098 
2099 /// Compare the lower double-precision (64-bit) floating-point elements in `a` and `b`, store the maximum value in the 
2100 /// lower element of result, and copy the upper element from `a` to the upper element of result.
2101 __m128d _mm_max_sd (__m128d a, __m128d b) pure @trusted
2102 {
2103     static if (GDC_with_SSE2)
2104     {
2105         return __builtin_ia32_maxsd(a, b);
2106     }
2107     else
2108     {
2109          __m128d r = a;
2110         // Generates maxsd starting with LDC 1.3
2111         r.ptr[0] = (a.array[0] > b.array[0]) ? a.array[0] : b.array[0];
2112         return r;
2113     }
2114 }
2115 unittest
2116 {
2117     __m128d A = _mm_setr_pd(1.0, 1.0);
2118     __m128d B = _mm_setr_pd(4.0, 2.0);
2119     __m128d M = _mm_max_sd(A, B);
2120     assert(M.array[0] == 4.0);
2121     assert(M.array[1] == 1.0);
2122 }
2123 
2124 /// Perform a serializing operation on all load-from-memory and store-to-memory instructions that were issued prior to 
2125 /// this instruction. Guarantees that every memory access that precedes, in program order, the memory fence instruction 
2126 /// is globally visible before any memory instruction which follows the fence in program order.
2127 void _mm_mfence() @trusted
2128 {
2129     version(GNU)
2130     {
2131         static if (GDC_with_SSE2)
2132         {
2133             __builtin_ia32_mfence();
2134         }
2135         else version(X86)
2136         {
2137             asm pure nothrow @nogc @trusted
2138             {
2139                 "mfence;\n" : : : ;
2140             }
2141         }
2142         else
2143             static assert(false);
2144     }
2145     else static if (LDC_with_SSE2)
2146     {
2147         __builtin_ia32_mfence();
2148     }
2149     else static if (DMD_with_asm)
2150     {
2151         asm nothrow @nogc pure @safe
2152         {
2153             mfence;
2154         }
2155     }
    else version(LDC)
    {
        // Note: will generate the DMB instruction on ARM
        llvm_memory_fence();
    }
2164     else
2165         static assert(false);
2166 }
2167 unittest
2168 {
2169     _mm_mfence();
2170 }
2171 
2172 /// Compare packed signed 16-bit integers in `a` and `b`, and return packed minimum values.
2173 __m128i _mm_min_epi16 (__m128i a, __m128i b) pure @safe
2174 {
2175     version(GNU)
2176     {
2177         // PERF: not necessarily the best for GDC
2178         __m128i lowerShorts = _mm_cmplt_epi16(a, b); // ones where a should be selected, b else
2179         __m128i aTob = _mm_xor_si128(a, b); // a ^ (a ^ b) == b
2180         __m128i mask = _mm_and_si128(aTob, lowerShorts);
2181         return _mm_xor_si128(b, mask);
2182     }
2183     else
2184     {
2185         // x86: pminsw since LDC 1.0 -O1
        // ARM: smin.8h since LDC 1.5 -O1
2187         short8 sa = cast(short8)a;
2188         short8 sb = cast(short8)b;
2189         short8 greater = greaterMask!short8(sa, sb);
2190         return cast(__m128i)( (~greater & sa) | (greater & sb) );
2191     }
2192 }
2193 unittest
2194 {
2195     short8 R = cast(short8) _mm_min_epi16(_mm_setr_epi16(45, 1, -4, -8, 9,  7, 0,-32768),
2196                                           _mm_setr_epi16(-4,-8,  9,  7, 0,-57, 0,  0));
2197     short[8] correct =                                  [-4,-8, -4, -8, 0,-57, 0, -32768];
2198     assert(R.array == correct);
2199 }
2200 
2201 /// Compare packed unsigned 8-bit integers in `a` and `b`, and return packed minimum values.
2202 __m128i _mm_min_epu8 (__m128i a, __m128i b) pure @safe
2203 {
2204     version(LDC)
2205     {
2206         // x86: pminub since LDC 1.0.0 -O1
2207         // ARM: umin.16b since LDC 1.5.0 -O1
2208         // PERF: catastrophic on ARM32
2209         alias ubyte16 = Vector!(ubyte[16]);
2210         ubyte16 sa = cast(ubyte16)a;
2211         ubyte16 sb = cast(ubyte16)b;
2212         ubyte16 greater = cast(ubyte16) greaterMask!ubyte16(sa, sb);
2213         return cast(__m128i)( (~greater & sa) | (greater & sb) );
2214     }
2215     else
2216     {
2217         __m128i value128 = _mm_set1_epi8(-128);
2218         __m128i lower = _mm_cmplt_epi8(_mm_add_epi8(a, value128), _mm_add_epi8(b, value128)); // signed comparison
2219         __m128i aTob = _mm_xor_si128(a, b); // a ^ (a ^ b) == b
2220         __m128i mask = _mm_and_si128(aTob, lower);
2221         return _mm_xor_si128(b, mask);
2222     }
2223 }
2224 unittest
2225 {
2226     byte16 R = cast(byte16) _mm_min_epu8(_mm_setr_epi8(45, 1, -4, -8, 9,  7, 0,-57, -4,-8,  9,  7, 0,-57, 0,  0),
2227                                          _mm_setr_epi8(-4,-8,  9,  7, 0,-57, 0,  0, 45, 1, -4, -8, 9,  7, 0,-57));
2228     byte[16] correct =                                [45, 1,  9,  7, 0,  7, 0,  0, 45, 1,  9,  7, 0,  7, 0,  0];
2229     assert(R.array == correct);
2230 }
2231 
2232 /// Compare packed double-precision (64-bit) floating-point elements in `a` and `b`, and return packed minimum values.
2233 __m128d _mm_min_pd (__m128d a, __m128d b) pure @trusted
2234 {
2235     static if (GDC_with_SSE2)
2236     {
2237         return __builtin_ia32_minpd(a, b);
2238     }
2239     else
2240     {
2241         // Generates minpd starting with LDC 1.9
2242         a.ptr[0] = (a.array[0] < b.array[0]) ? a.array[0] : b.array[0];
2243         a.ptr[1] = (a.array[1] < b.array[1]) ? a.array[1] : b.array[1];
2244         return a;
2245     }
2246 }
2247 unittest
2248 {
2249     __m128d A = _mm_setr_pd(1.0, 2.0);
2250     __m128d B = _mm_setr_pd(4.0, 1.0);
2251     __m128d M = _mm_min_pd(A, B);
2252     assert(M.array[0] == 1.0);
2253     assert(M.array[1] == 1.0);
2254 }
2255 
2256 /// Compare the lower double-precision (64-bit) floating-point elements in `a` and `b`, store the minimum value in 
2257 /// the lower element of result, and copy the upper element from `a` to the upper element of result.
2258 __m128d _mm_min_sd (__m128d a, __m128d b) pure @safe
2259 {
2260     static if (GDC_with_SSE2)
2261     {
2262         return __builtin_ia32_minsd(a, b);
2263     }
2264     else
2265     {
2266         // Generates minsd starting with LDC 1.3
2267         __m128d r = a;
2268         r.array[0] = (a.array[0] < b.array[0]) ? a.array[0] : b.array[0];
2269         return r;
2270     }
2271 }
2272 unittest
2273 {
2274     __m128d A = _mm_setr_pd(1.0, 3.0);
2275     __m128d B = _mm_setr_pd(4.0, 2.0);
2276     __m128d M = _mm_min_sd(A, B);
2277     assert(M.array[0] == 1.0);
2278     assert(M.array[1] == 3.0);
2279 }
2280 
2281 /// Copy the lower 64-bit integer in `a` to the lower element of result, and zero the upper element.
2282 __m128i _mm_move_epi64 (__m128i a) pure @trusted
2283 {
2284     static if (GDC_with_SSE2)
2285     {
2286         return __builtin_ia32_movq128(a);
2287     }
2288     else
2289     {
2290         long2 result = [ 0, 0 ];
2291         long2 la = cast(long2) a;
2292         result.ptr[0] = la.array[0];
2293         return cast(__m128i)(result);
2294     }
2295 }
2296 unittest
2297 {
2298     long2 A = [13, 47];
2299     long2 B = cast(long2) _mm_move_epi64( cast(__m128i)A );
2300     long[2] correct = [13, 0];
2301     assert(B.array == correct);
2302 }
2303 
2304 /// Move the lower double-precision (64-bit) floating-point element from `b` to the lower element of result, and copy 
2305 /// the upper element from `a` to the upper element of dst.
2306 __m128d _mm_move_sd (__m128d a, __m128d b) pure @trusted
2307 {
2308     static if (GDC_with_SSE2)
2309     {
2310         return __builtin_ia32_movsd(a, b); 
2311     }
2312     else
2313     {
2314         b.ptr[1] = a.array[1];
2315         return b;
2316     }
2317 }
2318 unittest
2319 {
2320     double2 A = [13.0, 47.0];
2321     double2 B = [34.0, 58.0];
2322     double2 C = _mm_move_sd(A, B);
2323     double[2] correct = [34.0, 47.0];
2324     assert(C.array == correct);
2325 }
2326 
2327 /// Create mask from the most significant bit of each 8-bit element in `v`.
2328 int _mm_movemask_epi8 (__m128i a) pure @trusted
2329 {
2330     static if (GDC_with_SSE2)
2331     {
2333         return __builtin_ia32_pmovmskb128(cast(byte16)a);
2334     }
2335     else static if (LDC_with_SSE2)
2336     {
2337         return __builtin_ia32_pmovmskb128(cast(byte16)a);
2338     }
2339     else static if (LDC_with_ARM64)
2340     {
        // Solution from https://stackoverflow.com/questions/11870910/sse-mm-movemask-epi8-equivalent-method-for-arm-neon
        // The other solutions there rely on intrinsics that LLVM doesn't expose, which took a long time to find out.
        // So there might be something a bit faster, but this one is reasonable and branchless.
2344         byte8 mask_shift;
2345         mask_shift.ptr[0] = 7;
2346         mask_shift.ptr[1] = 6;
2347         mask_shift.ptr[2] = 5;
2348         mask_shift.ptr[3] = 4;
2349         mask_shift.ptr[4] = 3;
2350         mask_shift.ptr[5] = 2;
2351         mask_shift.ptr[6] = 1;
2352         mask_shift.ptr[7] = 0;
2353         byte8 mask_and = byte8(-128);
2354         byte8 lo = vget_low_u8(cast(byte16)a);
2355         byte8 hi = vget_high_u8(cast(byte16)a);
2356         lo = vand_u8(lo, mask_and);
2357         lo = vshr_u8(lo, mask_shift);
2358         hi = vand_u8(hi, mask_and);
2359         hi = vshr_u8(hi, mask_shift);
2360         lo = vpadd_u8(lo,lo);
2361         lo = vpadd_u8(lo,lo);
2362         lo = vpadd_u8(lo,lo);
2363         hi = vpadd_u8(hi,hi);
2364         hi = vpadd_u8(hi,hi);
2365         hi = vpadd_u8(hi,hi);
2366         return (cast(ubyte)(hi[0]) << 8) | cast(ubyte)(lo[0]);
2367     }
2368     else
2369     {
2370         byte16 ai = cast(byte16)a;
2371         int r = 0;
2372         foreach(bit; 0..16)
2373         {
2374             if (ai.array[bit] < 0) r += (1 << bit);
2375         }
2376         return r;
2377     }
2378 }
2379 unittest
2380 {
2381     assert(0x9C36 == _mm_movemask_epi8(_mm_set_epi8(-1, 1, 2, -3, -1, -1, 4, 8, 127, 0, -1, -1, 0, -1, -1, 0)));
2382 }
2383 
/// Set each bit of mask result based on the most significant bit of the corresponding packed double-precision (64-bit) 
/// floating-point element in `v`.
2386 int _mm_movemask_pd(__m128d v) pure @safe
2387 {
2388     static if (GDC_with_SSE2)
2389     {
2392         return __builtin_ia32_movmskpd(v);
2393     }
2394     else static if (LDC_with_SSE2)
2395     {
2398         return __builtin_ia32_movmskpd(v);
2399     }
2400     else
2401     {
2402         long2 lv = cast(long2)v;
2403         int r = 0;
2404         if (lv.array[0] < 0) r += 1;
2405         if (lv.array[1] < 0) r += 2;
2406         return r;
2407     }
2408 }
2409 unittest
2410 {
2411     __m128d A = cast(__m128d) _mm_set_epi64x(-1, 0);
2412     assert(_mm_movemask_pd(A) == 2);
2413 }
2414 
2415 /// Copy the lower 64-bit integer in `v`.
2416 __m64 _mm_movepi64_pi64 (__m128i v) pure @safe
2417 {
2418     long2 lv = cast(long2)v;
2419     return long1(lv.array[0]);
2420 }
2421 unittest
2422 {
2423     __m128i A = _mm_set_epi64x(-1, -2);
2424     __m64 R = _mm_movepi64_pi64(A);
2425     assert(R.array[0] == -2);
2426 }
2427 
2428 /// Copy the 64-bit integer `a` to the lower element of dest, and zero the upper element.
2429 __m128i _mm_movpi64_epi64 (__m64 a) pure @trusted
2430 {
2431     long2 r;
2432     r.ptr[0] = a.array[0];
2433     r.ptr[1] = 0;
2434     return cast(__m128i)r;
2435 }
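unittest
{
    // Checks that the upper 64-bit lane is zeroed (arbitrary input value).
    __m64 A = _mm_cvtsi64_m64(-1);
    long2 R = cast(long2) _mm_movpi64_epi64(A);
    long[2] correct = [-1, 0];
    assert(R.array == correct);
}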
2436 
/// Multiply the low unsigned 32-bit integers from each packed 64-bit element in `a` and `b`, 
/// and return the unsigned 64-bit results.
// Note: generates pmuludq in LDC with -O1
__m128i _mm_mul_epu32 (__m128i a, __m128i b) pure @trusted
2439 {
2440     __m128i zero = _mm_setzero_si128();
2441 
2442     static if (__VERSION__ >= 2088)
2443     {
2444         // Need LLVM9 to avoid this shufflevector
2445         long2 la, lb;
2446         la.ptr[0] = cast(uint)a.array[0];
2447         la.ptr[1] = cast(uint)a.array[2];
2448         lb.ptr[0] = cast(uint)b.array[0];
2449         lb.ptr[1] = cast(uint)b.array[2];
2450     }
2451     else
2452     {
2453         long2 la = cast(long2) shufflevector!(int4, 0, 4, 2, 6)(a, zero);
2454         long2 lb = cast(long2) shufflevector!(int4, 0, 4, 2, 6)(b, zero);
2455     }
2456 
2457     version(DigitalMars)
2458     {
        // DMD has no long2 mul
2461         la.ptr[0] *= lb.array[0];
2462         la.ptr[1] *= lb.array[1];
2463         return cast(__m128i)(la);
2464     }
2465     else
2466     {
2467         static if (__VERSION__ >= 2076)
2468         {
2469             return cast(__m128i)(la * lb);
2470         }
2471         else
2472         {
2473             // long2 mul not supported before LDC 1.5
2474             la.ptr[0] *= lb.array[0];
2475             la.ptr[1] *= lb.array[1];
2476             return cast(__m128i)(la);
2477         }
2478     }
2479 }
2480 unittest
2481 {
2482     __m128i A = _mm_set_epi32(42, 0xDEADBEEF, 42, 0xffffffff);
2483     __m128i B = _mm_set_epi32(42, 0xCAFEBABE, 42, 0xffffffff);
2484     __m128i C = _mm_mul_epu32(A, B);
2485     long2 LC = cast(long2)C;
2486     assert(LC.array[0] == 18446744065119617025uL);
2487     assert(LC.array[1] == 12723420444339690338uL);
2488 }
2489 
2490 /// Multiply packed double-precision (64-bit) floating-point elements in `a` and `b`, and return the results. 
2491 __m128d _mm_mul_pd(__m128d a, __m128d b) pure @safe
2492 {
2493     return a * b;
2494 }
2495 unittest
2496 {
2497     __m128d a = [-2.0, 1.5];
2498     a = _mm_mul_pd(a, a);
2499     assert(a.array == [4.0, 2.25]);
2500 }
2501 
2502 /// Multiply the lower double-precision (64-bit) floating-point element in `a` and `b`, store the result in the lower 
2503 /// element of result, and copy the upper element from `a` to the upper element of result.
2504 __m128d _mm_mul_sd(__m128d a, __m128d b) pure @trusted
2505 {
2506     version(DigitalMars)
2507     {    
2508         // Work-around for https://issues.dlang.org/show_bug.cgi?id=19599
2509         // Note that this is unneeded since DMD >= 2.094.0 at least, haven't investigated again
2510         asm pure nothrow @nogc @trusted { nop;}
2511         a.array[0] = a.array[0] * b.array[0];
2512         return a;
2513     }
2514     else static if (GDC_with_SSE2)
2515     {
2516         return __builtin_ia32_mulsd(a, b);
2517     }
2518     else
2519     {
2520         a.ptr[0] *= b.array[0];
2521         return a;
2522     }
2523 }
2524 unittest
2525 {
2526     __m128d a = [-2.0, 1.5];
2527     a = _mm_mul_sd(a, a);
2528     assert(a.array == [4.0, 1.5]);
2529 }
2530 
2531 /// Multiply the low unsigned 32-bit integers from `a` and `b`, 
2532 /// and get an unsigned 64-bit result.
2533 __m64 _mm_mul_su32 (__m64 a, __m64 b) pure @safe
2534 {
2535     return to_m64(_mm_mul_epu32(to_m128i(a), to_m128i(b)));
2536 }
2537 unittest
2538 {
2539     __m64 A = _mm_set_pi32(42, 0xDEADBEEF);
2540     __m64 B = _mm_set_pi32(42, 0xCAFEBABE);
2541     __m64 C = _mm_mul_su32(A, B);
2542     assert(C.array[0] == 0xDEADBEEFuL * 0xCAFEBABEuL);
2543 }
2544 
2545 /// Multiply the packed signed 16-bit integers in `a` and `b`, producing intermediate 32-bit integers, and return the 
2546 /// high 16 bits of the intermediate integers.
2547 __m128i _mm_mulhi_epi16 (__m128i a, __m128i b) pure @trusted
2548 {
2549     static if (GDC_with_SSE2)
2550     {
2551         return cast(__m128i) __builtin_ia32_pmulhw128(cast(short8)a, cast(short8)b);
2552     }
2553     else static if (LDC_with_SSE2)
2554     {
2555         return cast(__m128i) __builtin_ia32_pmulhw128(cast(short8)a, cast(short8)b);
2556     }
2557     else
2558     {
2559         // PERF ARM?
2560         short8 sa = cast(short8)a;
2561         short8 sb = cast(short8)b;
2562         short8 r = void;
2563         r.ptr[0] = (sa.array[0] * sb.array[0]) >> 16;
2564         r.ptr[1] = (sa.array[1] * sb.array[1]) >> 16;
2565         r.ptr[2] = (sa.array[2] * sb.array[2]) >> 16;
2566         r.ptr[3] = (sa.array[3] * sb.array[3]) >> 16;
2567         r.ptr[4] = (sa.array[4] * sb.array[4]) >> 16;
2568         r.ptr[5] = (sa.array[5] * sb.array[5]) >> 16;
2569         r.ptr[6] = (sa.array[6] * sb.array[6]) >> 16;
2570         r.ptr[7] = (sa.array[7] * sb.array[7]) >> 16;
2571         return cast(__m128i)r;
2572     }
2573 }
2574 unittest
2575 {
2576     __m128i A = _mm_setr_epi16(0, -16, 2, 3, 4, 8, 16, 7);
2577     __m128i B = _mm_set1_epi16(16384);
2578     short8 R = cast(short8)_mm_mulhi_epi16(A, B);
2579     short[8] correct = [0, -4, 0, 0, 1, 2, 4, 1];
2580     assert(R.array == correct);
2581 }
2582 
2583 /// Multiply the packed unsigned 16-bit integers in `a` and `b`, producing intermediate 32-bit integers, and return the 
2584 /// high 16 bits of the intermediate integers.
2585 __m128i _mm_mulhi_epu16 (__m128i a, __m128i b) pure @trusted
2586 {
2587     static if (GDC_with_SSE2)
2588     {
2589         return cast(__m128i) __builtin_ia32_pmulhuw128(cast(short8)a, cast(short8)b);
2590     }
2591     else static if (LDC_with_SSE2)
2592     {
2593         return cast(__m128i) __builtin_ia32_pmulhuw128(cast(short8)a, cast(short8)b);
2594     }
2595     else
2596     {
2597         // PERF ARM??
2598         short8 sa = cast(short8)a;
2599         short8 sb = cast(short8)b;
2600         short8 r = void;
2601         r.ptr[0] = cast(short)( (cast(ushort)sa.array[0] * cast(ushort)sb.array[0]) >> 16 );
2602         r.ptr[1] = cast(short)( (cast(ushort)sa.array[1] * cast(ushort)sb.array[1]) >> 16 );
2603         r.ptr[2] = cast(short)( (cast(ushort)sa.array[2] * cast(ushort)sb.array[2]) >> 16 );
2604         r.ptr[3] = cast(short)( (cast(ushort)sa.array[3] * cast(ushort)sb.array[3]) >> 16 );
2605         r.ptr[4] = cast(short)( (cast(ushort)sa.array[4] * cast(ushort)sb.array[4]) >> 16 );
2606         r.ptr[5] = cast(short)( (cast(ushort)sa.array[5] * cast(ushort)sb.array[5]) >> 16 );
2607         r.ptr[6] = cast(short)( (cast(ushort)sa.array[6] * cast(ushort)sb.array[6]) >> 16 );
2608         r.ptr[7] = cast(short)( (cast(ushort)sa.array[7] * cast(ushort)sb.array[7]) >> 16 );
2609         return cast(__m128i)r;
2610     }
2611 }
2612 unittest
2613 {
2614     __m128i A = _mm_setr_epi16(0, -16, 2, 3, 4, 8, 16, 7);
2615     __m128i B = _mm_set1_epi16(16384);
2616     short8 R = cast(short8)_mm_mulhi_epu16(A, B);
2617     short[8] correct = [0, 0x3FFC, 0, 0, 1, 2, 4, 1];
2618     assert(R.array == correct);
2619 }
2620 
2621 /// Multiply the packed 16-bit integers in `a` and `b`, producing intermediate 32-bit integers, and return the low 16 
2622 /// bits of the intermediate integers.
2623 __m128i _mm_mullo_epi16 (__m128i a, __m128i b) pure @safe
2624 {
2625     return cast(__m128i)(cast(short8)a * cast(short8)b);
2626 }
2627 unittest
2628 {
2629     __m128i A = _mm_setr_epi16(16384, -16, 0,      3, 4, 1, 16, 7);
2630     __m128i B = _mm_set1_epi16(16384);
2631     short8 R = cast(short8)_mm_mullo_epi16(A, B);
2632     short[8] correct = [0, 0, 0, -16384, 0, 16384, 0, -16384];
2633     assert(R.array == correct);
2634 }
2635 
2636 /// Compute the bitwise OR of packed double-precision (64-bit) floating-point elements in `a` and `b`.
2637 __m128d _mm_or_pd (__m128d a, __m128d b) pure @safe
2638 {
2639     return cast(__m128d)( cast(__m128i)a | cast(__m128i)b );
2640 }
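unittest
{
    // The OR acts on the raw 64-bit patterns; arbitrary bit patterns are used here.
    __m128d A = cast(__m128d) _mm_set_epi64x(3, 1);
    __m128d B = cast(__m128d) _mm_set_epi64x(5, 4);
    long2 R = cast(long2) _mm_or_pd(A, B);
    long[2] correct = [5, 7];
    assert(R.array == correct);
}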
2641 
2642 /// Compute the bitwise OR of 128 bits (representing integer data) in `a` and `b`.
2643 __m128i _mm_or_si128 (__m128i a, __m128i b) pure @safe
2644 {
2645     return a | b;
2646 }
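unittest
{
    // Minimal example of the 128-bit bitwise OR (arbitrary values).
    __m128i A = _mm_setr_epi32(1, 2, 4, 8);
    __m128i B = _mm_setr_epi32(2, 2, 1, 8);
    int4 R = cast(int4) _mm_or_si128(A, B);
    int[4] correct = [3, 2, 5, 8];
    assert(R.array == correct);
}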
2647 
2648 /// Convert packed signed 32-bit integers from `a` and `b` to packed 16-bit integers using signed saturation.
2649 __m128i _mm_packs_epi32 (__m128i a, __m128i b) pure @trusted
2650 {
2651     static if (GDC_with_SSE2)
2652     {
2653         return cast(__m128i) __builtin_ia32_packssdw128(a, b);
2654     }    
2655     else static if (LDC_with_SSE2)
2656     {
2657         return cast(__m128i) __builtin_ia32_packssdw128(a, b);
2658     }
2659     else static if (LDC_with_ARM64)
2660     {
2661         short4 ra = vqmovn_s32(cast(int4)a);
2662         short4 rb = vqmovn_s32(cast(int4)b);
2663         return cast(__m128i)vcombine_s16(ra, rb);
2664     }
2665     else
2666     {
2667         // PERF: catastrophic on ARM
2668         short8 r;
2669         r.ptr[0] = saturateSignedIntToSignedShort(a.array[0]);
2670         r.ptr[1] = saturateSignedIntToSignedShort(a.array[1]);
2671         r.ptr[2] = saturateSignedIntToSignedShort(a.array[2]);
2672         r.ptr[3] = saturateSignedIntToSignedShort(a.array[3]);
2673         r.ptr[4] = saturateSignedIntToSignedShort(b.array[0]);
2674         r.ptr[5] = saturateSignedIntToSignedShort(b.array[1]);
2675         r.ptr[6] = saturateSignedIntToSignedShort(b.array[2]);
2676         r.ptr[7] = saturateSignedIntToSignedShort(b.array[3]);
2677         return cast(__m128i)r;
2678     }
2679 }
2680 unittest
2681 {
2682     __m128i A = _mm_setr_epi32(100000, -100000, 1000, 0);
2683     short8 R = cast(short8) _mm_packs_epi32(A, A);
2684     short[8] correct = [32767, -32768, 1000, 0, 32767, -32768, 1000, 0];
2685     assert(R.array == correct);
2686 }
2687 
2688 /// Convert packed signed 16-bit integers from `a` and `b` to packed 8-bit integers using signed saturation.
2689 __m128i _mm_packs_epi16 (__m128i a, __m128i b) pure @trusted
2690 {
2691     static if (GDC_with_SSE2)
2692     {
2693         return cast(__m128i) __builtin_ia32_packsswb128(cast(short8)a, cast(short8)b);
2694     }
2695     else static if (LDC_with_SSE2)
2696     {
2697         return cast(__m128i) __builtin_ia32_packsswb128(cast(short8)a, cast(short8)b);
2698     }
2699     else static if (LDC_with_ARM64)
2700     {
        // Generates a nice pair of sqxtn.8b + sqxtn2 since LDC 1.5 -O2
2702         byte8 ra = vqmovn_s16(cast(short8)a);
2703         byte8 rb = vqmovn_s16(cast(short8)b);
2704         return cast(__m128i)vcombine_s8(ra, rb);
2705     }
2706     else
2707     {
2708         // PERF: ARM32 is missing
2709         byte16 r;
2710         short8 sa = cast(short8)a;
2711         short8 sb = cast(short8)b;
2712         foreach(i; 0..8)
2713             r.ptr[i] = saturateSignedWordToSignedByte(sa.array[i]);
2714         foreach(i; 0..8)
2715             r.ptr[i+8] = saturateSignedWordToSignedByte(sb.array[i]);
2716         return cast(__m128i)r;
2717     }
2718 }
2719 unittest
2720 {
2721     __m128i A = _mm_setr_epi16(1000, -1000, 1000, 0, 256, -129, 254, 0);
2722     byte16 R = cast(byte16) _mm_packs_epi16(A, A);
2723     byte[16] correct = [127, -128, 127, 0, 127, -128, 127, 0,
2724                         127, -128, 127, 0, 127, -128, 127, 0];
2725     assert(R.array == correct);
2726 }
2727 
2728 /// Convert packed signed 16-bit integers from `a` and `b` to packed 8-bit integers using unsigned saturation.
2729 __m128i _mm_packus_epi16 (__m128i a, __m128i b) pure @trusted
2730 {
2731     static if (GDC_with_SSE2)
2732     {
2733         return cast(__m128i) __builtin_ia32_packuswb128(cast(short8)a, cast(short8)b);
2734     }
2735     else static if (LDC_with_SSE2)
2736     {
2737         return cast(__m128i) __builtin_ia32_packuswb128(cast(short8)a, cast(short8)b);
2738     }
2739     else static if (LDC_with_ARM64)
2740     {
        // Generates a nice pair of sqxtun + sqxtun2 since LDC 1.5 -O2
2742         byte8 ra = vqmovun_s16(cast(short8)a);
2743         byte8 rb = vqmovun_s16(cast(short8)b);
2744         return cast(__m128i)vcombine_s8(ra, rb);
2745     }
2746     else
2747     {
2748         short8 sa = cast(short8)a;
2749         short8 sb = cast(short8)b;
2750         ubyte[16] result = void;
2751         for (int i = 0; i < 8; ++i)
2752         {
2753             short s = sa[i];
2754             if (s < 0) s = 0;
2755             if (s > 255) s = 255;
2756             result[i] = cast(ubyte)s;
2757 
2758             s = sb[i];
2759             if (s < 0) s = 0;
2760             if (s > 255) s = 255;
2761             result[i+8] = cast(ubyte)s;
2762         }
2763         return cast(__m128i) loadUnaligned!(byte16)(cast(byte*)result.ptr);
2764     }
2765 }
2766 unittest
2767 {
2768     __m128i A = _mm_setr_epi16(-10, 400, 0, 256, 255, 2, 1, 0);
2769     byte16 AA = cast(byte16) _mm_packus_epi16(A, A);
2770     static immutable ubyte[16] correctResult = [0, 255, 0, 255, 255, 2, 1, 0,
2771                                                 0, 255, 0, 255, 255, 2, 1, 0];
2772     foreach(i; 0..16)
2773         assert(AA.array[i] == cast(byte)(correctResult[i]));
2774 }
2775 
2776 /// Provide a hint to the processor that the code sequence is a spin-wait loop. This can help improve the performance 
2777 /// and power consumption of spin-wait loops.
2778 void _mm_pause() @trusted
2779 {
2780     version(GNU)
2781     {
2782         static if (GDC_with_SSE2)
2783         {
2784             __builtin_ia32_pause();
2785         }
2786         else version(X86)
2787         {
2788             asm pure nothrow @nogc @trusted
2789             {
2790                 "pause;\n" : : : ;
2791             }
2792         }
2793         else
2794             static assert(false);
2795     }
2796     else static if (LDC_with_SSE2)
2797     {
2798         __builtin_ia32_pause();
2799     }
2800     else static if (DMD_with_asm)
2801     {
2802         asm nothrow @nogc pure @safe
2803         {
2804             rep; nop; // F3 90 =  pause
2805         }
2806     }
2807     else version (LDC)
2808     {
        // PERF: currently does nothing; could use the "yield" instruction on ARM.
2810     }
2811     else
2812         static assert(false);
2813 }
2814 unittest
2815 {
2816     _mm_pause();
2817 }
2818 
2819 /// Compute the absolute differences of packed unsigned 8-bit integers in `a` and `b`, then horizontally sum each 
2820 /// consecutive 8 differences to produce two unsigned 16-bit integers, and pack these unsigned 16-bit integers in the 
2821 /// low 16 bits of 64-bit elements in result.
2822 __m128i _mm_sad_epu8 (__m128i a, __m128i b) pure @trusted
2823 {
2824     static if (GDC_with_SSE2)
2825     {
2826         return cast(__m128i) __builtin_ia32_psadbw128(cast(byte16)a, cast(byte16)b);
2827     }
2828     else static if (LDC_with_SSE2)
2829     {
2830         return cast(__m128i) __builtin_ia32_psadbw128(cast(byte16)a, cast(byte16)b);
2831     }
2832     else
2833     {
2834         // PERF: ARM??
2835         byte16 ab = cast(byte16)a;
2836         byte16 bb = cast(byte16)b;
2837         ubyte[16] t;
2838         foreach(i; 0..16)
2839         {
2840             int diff = cast(ubyte)(ab.array[i]) - cast(ubyte)(bb.array[i]);
2841             if (diff < 0) diff = -diff;
2842             t[i] = cast(ubyte)(diff);
2843         }
2844         int4 r = _mm_setzero_si128();
2845         r.ptr[0] = t[0] + t[1] + t[2] + t[3] + t[4] + t[5] + t[6] + t[7];
2846         r.ptr[2] = t[8] + t[9] + t[10]+ t[11]+ t[12]+ t[13]+ t[14]+ t[15];
2847         return r;
2848     }
2849 }
2850 unittest
2851 {
2852     __m128i A = _mm_setr_epi8(3, 4, 6, 8, 12, 14, 18, 20, 24, 30, 32, 38, 42, 44, 48, 54); // primes + 1
2853     __m128i B = _mm_set1_epi8(1);
2854     __m128i R = _mm_sad_epu8(A, B);
2855     int[4] correct = [2 + 3 + 5 + 7 + 11 + 13 + 17 + 19,
2856                       0,
2857                       23 + 29 + 31 + 37 + 41 + 43 + 47 + 53,
2858                       0];
2859     assert(R.array == correct);
2860 }
2861 
2862 /// Set packed 16-bit integers with the supplied values.
2863 __m128i _mm_set_epi16 (short e7, short e6, short e5, short e4, short e3, short e2, short e1, short e0) pure @trusted
2864 {
2865     short[8] result = [e0, e1, e2, e3, e4, e5, e6, e7];
2866     return cast(__m128i) loadUnaligned!(short8)(result.ptr);
2867 }
2868 unittest
2869 {
2870     __m128i A = _mm_set_epi16(7, 6, 5, 4, 3, 2, 1, 0);
2871     short8 B = cast(short8) A;
2872     foreach(i; 0..8)
2873         assert(B.array[i] == i);
2874 }
2875 
2876 /// Set packed 32-bit integers with the supplied values.
2877 __m128i _mm_set_epi32 (int e3, int e2, int e1, int e0) pure @trusted
2878 {
2879     int[4] result = [e0, e1, e2, e3];
2880     return loadUnaligned!(int4)(result.ptr);
2881 }
2882 unittest
2883 {
2884     __m128i A = _mm_set_epi32(3, 2, 1, 0);
2885     foreach(i; 0..4)
2886         assert(A.array[i] == i);
2887 }
2888 
2889 /// Set packed 64-bit integers with the supplied values.
2890 __m128i _mm_set_epi64(__m64 e1, __m64 e0) pure @trusted
2891 {
2892     long[2] result = [e0.array[0], e1.array[0]];
2893     return cast(__m128i)( loadUnaligned!(long2)(result.ptr) );
2894 }
2895 unittest
2896 {
2897     __m128i A = _mm_set_epi64(_mm_cvtsi64_m64(1234), _mm_cvtsi64_m64(5678));
2898     long2 B = cast(long2) A;
2899     assert(B.array[0] == 5678);
2900     assert(B.array[1] == 1234);
2901 }
2902 
2903 /// Set packed 64-bit integers with the supplied values.
2904 __m128i _mm_set_epi64x (long e1, long e0) pure @trusted
2905 {
2906     long[2] result = [e0, e1];
2907     return cast(__m128i)( loadUnaligned!(long2)(result.ptr) );
2908 }
2909 unittest
2910 {
2911     __m128i A = _mm_set_epi64x(1234, 5678);
2912     long2 B = cast(long2) A;
2913     assert(B.array[0] == 5678);
2914     assert(B.array[1] == 1234);
2915 }
2916 
2917 /// Set packed 8-bit integers with the supplied values.
2918 __m128i _mm_set_epi8 (byte e15, byte e14, byte e13, byte e12,
2919                       byte e11, byte e10, byte e9, byte e8,
2920                       byte e7, byte e6, byte e5, byte e4,
2921                       byte e3, byte e2, byte e1, byte e0) pure @trusted
2922 {
    byte[16] result = [e0, e1,  e2,  e3,  e4,  e5,  e6, e7,
                       e8, e9, e10, e11, e12, e13, e14, e15];
2925     return cast(__m128i)( loadUnaligned!(byte16)(result.ptr) );
2926 }
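unittest
{
    // Arguments go from e15 down to e0, so the last argument lands in the lowest lane.
    byte16 A = cast(byte16) _mm_set_epi8(15, 14, 13, 12, 11, 10, 9, 8,
                                          7,  6,  5,  4,  3,  2, 1, 0);
    foreach(i; 0..16)
        assert(A.array[i] == i);
}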
2927 
2928 /// Set packed double-precision (64-bit) floating-point elements with the supplied values.
2929 __m128d _mm_set_pd (double e1, double e0) pure @trusted
2930 {
2931     double[2] result = [e0, e1];
2932     return loadUnaligned!(double2)(result.ptr);
2933 }
2934 unittest
2935 {
2936     __m128d A = _mm_set_pd(61.0, 55.0);
2937     double[2] correct = [55.0, 61.0];
2938     assert(A.array == correct);
2939 }
2940 
/// Broadcast double-precision (64-bit) floating-point value `a` to all elements.
2942 __m128d _mm_set_pd1 (double a) pure @trusted
2943 {
2944     double[2] result = [a, a];
2945     return loadUnaligned!(double2)(result.ptr);
2946 }
2947 unittest
2948 {
2949     __m128d A = _mm_set_pd1(61.0);
2950     double[2] correct = [61.0, 61.0];
2951     assert(A.array == correct);
2952 }
2953 
2954 /// Copy double-precision (64-bit) floating-point element `a` to the lower element of result, 
2955 /// and zero the upper element.
2956 __m128d _mm_set_sd (double a) pure @trusted
2957 {
2958     double[2] result = [a, 0];
2959     return loadUnaligned!(double2)(result.ptr);
2960 }
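unittest
{
    // The upper element must be zeroed, not left undefined.
    __m128d A = _mm_set_sd(42.0);
    double[2] correct = [42.0, 0.0];
    assert(A.array == correct);
}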
2961 
/// Broadcast 16-bit integer `a` to all elements.
2963 __m128i _mm_set1_epi16 (short a) pure @trusted
2964 {
2965     version(DigitalMars) // workaround https://issues.dlang.org/show_bug.cgi?id=21469 
2966     {
2967         short8 v = a;
2968         return cast(__m128i) v;
2969     }
2970     else
2971         return cast(__m128i)(short8(a));
2972 }
2973 unittest
2974 {
2975     short8 a = cast(short8) _mm_set1_epi16(31);
2976     for (int i = 0; i < 8; ++i)
2977         assert(a.array[i] == 31);
2978 }
2979 
2980 /// Broadcast 32-bit integer `a` to all elements.
2981 __m128i _mm_set1_epi32 (int a) pure @trusted
2982 {
2983     return cast(__m128i)(int4(a));
2984 }
2985 unittest
2986 {
2987     __m128 a = _mm_set1_ps(-1.0f);
2988     __m128 b = cast(__m128) _mm_set1_epi32(0x7fffffff);
2989     assert(_mm_and_ps(a, b).array == [1.0f, 1, 1, 1]);
2990 }
2991 
2992 /// Broadcast 64-bit integer `a` to all elements.
2993 __m128i _mm_set1_epi64 (__m64 a) pure @safe
2994 {
2995     return _mm_set_epi64(a, a);
2996 }
2997 unittest
2998 {
2999     long b = 0x1DEADCAFE; 
3000     __m64 a;
3001     a.ptr[0] = b;
3002     long2 c = cast(long2) _mm_set1_epi64(a);
3003     assert(c.array[0] == b);
3004     assert(c.array[1] == b);
3005 }
3006 
/// Broadcast 64-bit integer `a` to all elements.
3008 __m128i _mm_set1_epi64x (long a) pure @trusted
3009 {
3010     long2 b = a; // Must be on its own line to workaround https://issues.dlang.org/show_bug.cgi?id=21470
3011     return cast(__m128i)(b);
3012 }
3013 unittest
3014 {
3015     long b = 0x1DEADCAFE;
3016     long2 c = cast(long2) _mm_set1_epi64x(b);
3017     for (int i = 0; i < 2; ++i)
3018         assert(c.array[i] == b);
3019 }
3020 
3021 /// Broadcast 8-bit integer `a` to all elements.
3022 __m128i _mm_set1_epi8 (byte a) pure @trusted
3023 {
3024     byte16 b = a; // Must be on its own line to workaround https://issues.dlang.org/show_bug.cgi?id=21470
3025     return cast(__m128i)(b);
3026 }
3027 unittest
3028 {
3029     byte16 b = cast(byte16) _mm_set1_epi8(31);
3030     for (int i = 0; i < 16; ++i)
3031         assert(b.array[i] == 31);
3032 }
3033 
alias _mm_set1_pd = _mm_set_pd1; ///
3035 
3036 /// Set packed 16-bit integers with the supplied values in reverse order.
3037 __m128i _mm_setr_epi16 (short e7, short e6, short e5, short e4, 
3038                         short e3, short e2, short e1, short e0) pure @trusted
3039 {
3040     short[8] result = [e7, e6, e5, e4, e3, e2, e1, e0];
3041     return cast(__m128i)( loadUnaligned!(short8)(result.ptr) );
3042 }
3043 unittest
3044 {
3045     short8 A = cast(short8) _mm_setr_epi16(7, 6, 5, -32768, 32767, 2, 1, 0);
3046     short[8] correct = [7, 6, 5, -32768, 32767, 2, 1, 0];
3047     assert(A.array == correct);
3048 }
3049 
3050 /// Set packed 32-bit integers with the supplied values in reverse order.
3051 __m128i _mm_setr_epi32 (int e3, int e2, int e1, int e0) pure @trusted
3052 {
3053     int[4] result = [e3, e2, e1, e0];
3054     return cast(__m128i)( loadUnaligned!(int4)(result.ptr) );
3055 }
3056 unittest
3057 {
3058     int4 A = cast(int4) _mm_setr_epi32(-1, 0, -2147483648, 2147483647);
3059     int[4] correct = [-1, 0, -2147483648, 2147483647];
3060     assert(A.array == correct);
3061 }
3062 
3063 /// Set packed 64-bit integers with the supplied values in reverse order.
3064 __m128i _mm_setr_epi64 (long e1, long e0) pure @trusted
3065 {
3066     long[2] result = [e1, e0];
3067     return cast(__m128i)( loadUnaligned!(long2)(result.ptr) );
3068 }
3069 unittest
3070 {
3071     long2 A = cast(long2) _mm_setr_epi64(-1, 0);
3072     long[2] correct = [-1, 0];
3073     assert(A.array == correct);
3074 }
3075 
3076 /// Set packed 8-bit integers with the supplied values in reverse order.
3077 __m128i _mm_setr_epi8 (byte e15, byte e14, byte e13, byte e12,
3078                        byte e11, byte e10, byte e9,  byte e8,
3079                        byte e7,  byte e6,  byte e5,  byte e4,
3080                        byte e3,  byte e2,  byte e1,  byte e0) pure @trusted
3081 {
    byte[16] result = [e15, e14, e13, e12, e11, e10, e9, e8,
                       e7,  e6,  e5,  e4,  e3,  e2,  e1, e0];
3084     return cast(__m128i)( loadUnaligned!(byte16)(result.ptr) );
3085 }
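unittest
{
    // With the "r" (reversed) variant, the first argument lands in the lowest lane.
    byte16 A = cast(byte16) _mm_setr_epi8(0, 1,  2,  3,  4,  5,  6,  7,
                                          8, 9, 10, 11, 12, 13, 14, 15);
    foreach(i; 0..16)
        assert(A.array[i] == i);
}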
3086 
3087 /// Set packed double-precision (64-bit) floating-point elements with the supplied values in reverse order.
3088 __m128d _mm_setr_pd (double e1, double e0) pure @trusted
3089 {
3090     double[2] result = [e1, e0];
3091     return loadUnaligned!(double2)(result.ptr);
3092 }
3093 unittest
3094 {
3095     __m128d A = _mm_setr_pd(61.0, 55.0);
3096     double[2] correct = [61.0, 55.0];
3097     assert(A.array == correct);
3098 }
3099 
3100 /// Return vector of type `__m128d` with all elements set to zero.
3101 __m128d _mm_setzero_pd () pure @trusted
3102 {
3103     // Note: using loadUnaligned has better -O0 codegen compared to .ptr
3104     double[2] result = [0.0, 0.0];
3105     return loadUnaligned!(double2)(result.ptr);
3106 }
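unittest
{
    // Straightforward check that both lanes come out zero.
    __m128d A = _mm_setzero_pd();
    double[2] correct = [0.0, 0.0];
    assert(A.array == correct);
}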
3107 
3108 /// Return vector of type `__m128i` with all elements set to zero.
3109 __m128i _mm_setzero_si128() pure @trusted
3110 {
3111     // Note: using loadUnaligned has better -O0 codegen compared to .ptr
3112     int[4] result = [0, 0, 0, 0];
3113     return cast(__m128i)( loadUnaligned!(int4)(result.ptr) );
3114 }
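unittest
{
    // Straightforward check that all lanes come out zero.
    __m128i A = _mm_setzero_si128();
    int[4] correct = [0, 0, 0, 0];
    assert(A.array == correct);
}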
3115 
/// Shuffle 32-bit integers in `a` using the control in `imm8`.
3117 /// See_also: `_MM_SHUFFLE`.
3118 __m128i _mm_shuffle_epi32(int imm8)(__m128i a) pure @safe
3119 {
3120     static if (GDC_with_SSE2)
3121     {
3122         return __builtin_ia32_pshufd(a, imm8);
3123     }
3124     else
3125     {
3126         return shufflevector!(int4, (imm8 >> 0) & 3,
3127                                     (imm8 >> 2) & 3,
3128                                     (imm8 >> 4) & 3,
3129                                     (imm8 >> 6) & 3)(a, a);
3130     }
3131 }
3132 unittest
3133 {
3134     __m128i A = _mm_setr_epi32(0, 1, 2, 3);
3135     enum int SHUFFLE = _MM_SHUFFLE(0, 1, 2, 3);
3136     int4 B = cast(int4) _mm_shuffle_epi32!SHUFFLE(A);
3137     int[4] expectedB = [ 3, 2, 1, 0 ];
3138     assert(B.array == expectedB);
3139 }
3140 
3141 /// Shuffle double-precision (64-bit) floating-point elements using the control in `imm8`.
3142 /// See_also: `_MM_SHUFFLE2`.
3143 __m128d _mm_shuffle_pd (int imm8)(__m128d a, __m128d b) pure @safe
3144 {
3145     static if (GDC_with_SSE2)
3146     {
3147         return __builtin_ia32_shufpd(a, b, imm8);
3148     }
3149     else
3150     {
3151         return shufflevector!(double2, 0 + ( imm8 & 1 ),
3152                                        2 + ( (imm8 >> 1) & 1 ))(a, b);
3153     }
3154 }
3155 unittest
3156 {
3157     __m128d A = _mm_setr_pd(0.5, 2.0);
3158     __m128d B = _mm_setr_pd(4.0, 5.0);
3159     enum int SHUFFLE = _MM_SHUFFLE2(1, 1);
3160     __m128d R = _mm_shuffle_pd!SHUFFLE(A, B);
3161     double[2] correct = [ 2.0, 5.0 ];
3162     assert(R.array == correct);
3163 }
3164 
/// Shuffle 16-bit integers in the high 64 bits of `a` using the control in `imm8`. Store the results in the high 
/// 64 bits of result, with the low 64 bits being copied from `a` to result.
/// See_also: `_MM_SHUFFLE`.
3168 __m128i _mm_shufflehi_epi16(int imm8)(__m128i a) pure @safe
3169 {
3170     static if (GDC_with_SSE2)
3171     {
3172         return __builtin_ia32_pshufhw(a, imm8);
3173     }
3174     else
3175     {
3176         return cast(__m128i) shufflevector!(short8, 0, 1, 2, 3,
3177                                           4 + ( (imm8 >> 0) & 3 ),
3178                                           4 + ( (imm8 >> 2) & 3 ),
3179                                           4 + ( (imm8 >> 4) & 3 ),
3180                                           4 + ( (imm8 >> 6) & 3 ))(cast(short8)a, cast(short8)a);
3181     }
3182 }
3183 unittest
3184 {
3185     __m128i A = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
3186     enum int SHUFFLE = _MM_SHUFFLE(0, 1, 2, 3);
3187     short8 C = cast(short8) _mm_shufflehi_epi16!SHUFFLE(A);
3188     short[8] expectedC = [ 0, 1, 2, 3, 7, 6, 5, 4 ];
3189     assert(C.array == expectedC);
3190 }
3191 
/// Shuffle 16-bit integers in the low 64 bits of `a` using the control in `imm8`. Store the results in the low 64 
/// bits of result, with the high 64 bits being copied from `a` to result.
3194 /// See_also: `_MM_SHUFFLE`.
3195 __m128i _mm_shufflelo_epi16(int imm8)(__m128i a) pure @safe
3196 {
3197     static if (GDC_with_SSE2)
3198     {
3199         return __builtin_ia32_pshuflw(a, imm8);
3200     }
3201     else
3202     {
3203         return cast(__m128i) shufflevector!(short8, ( (imm8 >> 0) & 3 ),
3204                                                     ( (imm8 >> 2) & 3 ),
3205                                                     ( (imm8 >> 4) & 3 ),
3206                                                     ( (imm8 >> 6) & 3 ), 4, 5, 6, 7)(cast(short8)a, cast(short8)a);
3207     }
3208 }
3209 unittest
3210 {
3211     __m128i A = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
3212     enum int SHUFFLE = _MM_SHUFFLE(0, 1, 2, 3);
3213     short8 B = cast(short8) _mm_shufflelo_epi16!SHUFFLE(A);
3214     short[8] expectedB = [ 3, 2, 1, 0, 4, 5, 6, 7 ];
3215     assert(B.array == expectedB);
3216 }
3217 
3218 /// Shift packed 32-bit integers in `a` left by `count` while shifting in zeros.
3219 deprecated("Use _mm_slli_epi32 instead.") __m128i _mm_sll_epi32 (__m128i a, __m128i count) pure @trusted
3220 {
3221     static if (LDC_with_SSE2)
3222     {
3223         return __builtin_ia32_pslld128(a, count);
3224     }
3225     else static if (GDC_with_SSE2)
3226     {
3227         return __builtin_ia32_pslld128(a, count);
3228     }
3229     else static if (DMD_with_32bit_asm)
3230     {
3231         asm pure nothrow @nogc @trusted
3232         {
3233             movdqu XMM0, a;
3234             movdqu XMM1, count;
3235             pslld XMM0, XMM1;
3236             movdqu a, XMM0;
3237         }
3238         return a;
3239     }
3240     else
3241     {
3242         int4 r = void;
3243         long2 lc = cast(long2)count;
3244         int bits = cast(int)(lc.array[0]);
3245         foreach(i; 0..4)
3246             r[i] = cast(uint)(a[i]) << bits;
3247         return r;
3248     }
3249 }
3250 
3251 /// Shift packed 64-bit integers in `a` left by `count` while shifting in zeros.
3252 deprecated("Use _mm_slli_epi64 instead.") __m128i _mm_sll_epi64 (__m128i a, __m128i count) pure @trusted
3253 {
3254     static if (LDC_with_SSE2)
3255     {
3256         return cast(__m128i) __builtin_ia32_psllq128(cast(long2)a, cast(long2)count);
3257     }
3258     else static if (GDC_with_SSE2)
3259     {
3260         return cast(__m128i) __builtin_ia32_psllq128(cast(long2)a, cast(long2)count);
3261     }
3262     else static if (DMD_with_32bit_asm)
3263     {
3264         asm pure nothrow @nogc @trusted
3265         {
3266             movdqu XMM0, a;
3267             movdqu XMM1, count;
3268             psllq XMM0, XMM1;
3269             movdqu a, XMM0;
3270         }
3271         return a;
3272     }
3273     else
3274     {
        // ARM: good since LDC 1.12 -O2
        //      (but the -O0 codegen is catastrophic)
3277         long2 r = void;
3278         long2 sa = cast(long2)a;
3279         long2 lc = cast(long2)count;
3280         int bits = cast(int)(lc.array[0]);
3281         foreach(i; 0..2)
3282             r.array[i] = cast(ulong)(sa.array[i]) << bits;
3283         return cast(__m128i)r;
3284     }
3285 }
3286 
3287 /// Shift packed 16-bit integers in `a` left by `count` while shifting in zeros.
3288 deprecated("Use _mm_slli_epi16 instead.") __m128i _mm_sll_epi16 (__m128i a, __m128i count) pure @trusted
3289 {
3290     static if (LDC_with_SSE2)
3291     {
        return cast(__m128i) __builtin_ia32_psllw128(cast(short8)a, cast(short8)count);
3293     }
3294     else static if (GDC_with_SSE2)
3295     {
        return cast(__m128i) __builtin_ia32_psllw128(cast(short8)a, cast(short8)count);
3297     }
3298     else static if (DMD_with_32bit_asm)
3299     {
        asm pure nothrow @nogc @trusted
3301         {
3302             movdqu XMM0, a;
3303             movdqu XMM1, count;
3304             psllw XMM0, XMM1;
3305             movdqu a, XMM0;
3306         }
3307         return a;
3308     }
3309     else
3310     {
3311         short8 sa = cast(short8)a;
3312         long2 lc = cast(long2)count;
3313         int bits = cast(int)(lc.array[0]);
3314         short8 r = void;
3315         foreach(i; 0..8)
3316             r.ptr[i] = cast(short)(cast(ushort)(sa.array[i]) << bits);
3317         return cast(int4)r;
3318     }
3319 }
3320 
3321 
3322 /// Shift packed 32-bit integers in `a` left by `imm8` while shifting in zeros.
3323 __m128i _mm_slli_epi32 (__m128i a, int imm8) pure @trusted
3324 {
3325     static if (GDC_with_SSE2)
3326     {
3327         return __builtin_ia32_pslldi128(a, cast(ubyte)imm8);
3328     }
3329     else static if (LDC_with_SSE2)
3330     {
3331         return __builtin_ia32_pslldi128(a, cast(ubyte)imm8);
3332     }
3333     else
3334     {
        // Note: the intrinsic guarantees imm8[0..7] is taken, but in D
        //       shifting by the bit width or more is undefined behaviour:
        //       "It's illegal to shift by the same or more bits than the
        //       size of the quantity being shifted". Hence the guard below.
3339         int4 r = _mm_setzero_si128();
3340 
3341         ubyte count = cast(ubyte) imm8;
3342         if (count > 31)
3343             return r;
3344         
3345         foreach(i; 0..4)
3346             r.array[i] = cast(uint)(a.array[i]) << count;
3347         return r;
3348     }
3349 }
3350 unittest
3351 {
3352     __m128i A = _mm_setr_epi32(0, 2, 3, -4);
3353     __m128i B = _mm_slli_epi32(A, 1);
3354     __m128i B2 = _mm_slli_epi32(A, 1 + 256);
3355     int[4] expectedB = [ 0, 4, 6, -8];
3356     assert(B.array == expectedB);
3357     assert(B2.array == expectedB);
3358 
3359     __m128i C = _mm_slli_epi32(A, 0);
3360     int[4] expectedC = [ 0, 2, 3, -4];
3361     assert(C.array == expectedC);
3362 
3363     __m128i D = _mm_slli_epi32(A, 65);
3364     int[4] expectedD = [ 0, 0, 0, 0];
3365     assert(D.array == expectedD);
3366 }
3367 
3368 /// Shift packed 64-bit integers in `a` left by `imm8` while shifting in zeros.
3369 __m128i _mm_slli_epi64 (__m128i a, int imm8) pure @trusted
3370 {
3371     static if (GDC_with_SSE2)
3372     {
3373         return cast(__m128i) __builtin_ia32_psllqi128(cast(long2)a, cast(ubyte)imm8);
3374     }
3375     else static if (LDC_with_SSE2)
3376     {
3377         return cast(__m128i) __builtin_ia32_psllqi128(cast(long2)a, cast(ubyte)imm8);
3378     }
3379     else
3380     {
3381         long2 sa = cast(long2)a;
3382 
        // Note: the intrinsic guarantees imm8[0..7] is taken, but in D
        //       shifting by the bit width or more is undefined behaviour:
        //       "It's illegal to shift by the same or more bits than the
        //       size of the quantity being shifted". Hence the guard below.
3387         long2 r = cast(long2) _mm_setzero_si128();
3388         ubyte count = cast(ubyte) imm8;
3389         if (count > 63)
3390             return cast(__m128i)r;
3391 
3392         r.ptr[0] = cast(ulong)(sa.array[0]) << count;
3393         r.ptr[1] = cast(ulong)(sa.array[1]) << count;
3394         return cast(__m128i)r;
3395     }
3396 }
3397 unittest
3398 {
3399     __m128i A = _mm_setr_epi64(8, -4);
3400     long2 B = cast(long2) _mm_slli_epi64(A, 1);
3401     long2 B2 = cast(long2) _mm_slli_epi64(A, 1 + 1024);
3402     long[2] expectedB = [ 16, -8];
3403     assert(B.array == expectedB);
3404     assert(B2.array == expectedB);
3405 
3406     long2 C = cast(long2) _mm_slli_epi64(A, 0);
3407     long[2] expectedC = [ 8, -4];
3408     assert(C.array == expectedC);
3409 
3410     long2 D = cast(long2) _mm_slli_epi64(A, 64);
    long[2] expectedD = [ 0, 0 ];
3412     assert(D.array == expectedD);
3413 }
3414 
3415 /// Shift packed 16-bit integers in `a` left by `imm8` while shifting in zeros.
3416 __m128i _mm_slli_epi16(__m128i a, int imm8) pure @trusted
3417 {
3418     static if (GDC_with_SSE2)
3419     {
3420         return cast(__m128i) __builtin_ia32_psllwi128(cast(short8)a, cast(ubyte)imm8);
3421     }
3422     else static if (LDC_with_SSE2)
3423     {
3424         return cast(__m128i) __builtin_ia32_psllwi128(cast(short8)a, cast(ubyte)imm8);
3425     }
3426     else static if (LDC_with_ARM64)
3427     {
3428         short8 sa = cast(short8)a;
3429         short8 r = cast(short8)_mm_setzero_si128();
3430         ubyte count = cast(ubyte) imm8;
3431         if (count > 15)
3432             return cast(__m128i)r;
3433         r = sa << short8(count);
3434         return cast(__m128i)r;
3435     }
3436     else
3437     {
3438         short8 sa = cast(short8)a;
3439         short8 r = cast(short8)_mm_setzero_si128();
3440         ubyte count = cast(ubyte) imm8;
3441         if (count > 15)
3442             return cast(__m128i)r;
3443         foreach(i; 0..8)
3444             r.ptr[i] = cast(short)(sa.array[i] << count);
3445         return cast(__m128i)r;
3446     }
3447 }
3448 unittest
3449 {
3450     __m128i A = _mm_setr_epi16(0, 1, 2, 3, -4, -5, 6, 7);
3451     short8 B = cast(short8)( _mm_slli_epi16(A, 1) );
3452     short8 B2 = cast(short8)( _mm_slli_epi16(A, 1 + 256) );
3453     short[8] expectedB = [ 0, 2, 4, 6, -8, -10, 12, 14 ];
3454     assert(B.array == expectedB);
3455     assert(B2.array == expectedB);
3456 
3457     short8 C = cast(short8)( _mm_slli_epi16(A, 16) );
3458     short[8] expectedC = [ 0, 0, 0, 0, 0, 0, 0, 0 ];
3459     assert(C.array == expectedC);
3460 }
3461 
3462 
3463 /// Shift `a` left by `bytes` bytes while shifting in zeros.
3464 __m128i _mm_slli_si128(ubyte bytes)(__m128i op) pure @trusted
3465 {
3466     static if (bytes & 0xF0)
3467     {
3468         return _mm_setzero_si128();
3469     }
3470     else
3471     {
3472         static if (GDC_with_SSE2)
3473         {
3474             return __builtin_ia32_pslldqi128(op, cast(ubyte)(bytes * 8)); 
3475         }
3476         else version(DigitalMars)
3477         {
3478             version(D_InlineAsm_X86)
3479             {
3480                 asm pure nothrow @nogc @trusted // somehow doesn't work for x86_64
3481                 {
3482                     movdqu XMM0, op;
3483                     pslldq XMM0, bytes;
3484                     movdqu op, XMM0;
3485                 }
3486                 return op;
3487             }
3488             else
3489             {
3490                 byte16 A = cast(byte16)op;
3491                 byte16 R;
3492                 for (int n = 15; n >= bytes; --n)
3493                     R.ptr[n] = A.array[n-bytes];
3494                 for (int n = bytes-1; n >= 0; --n)
3495                     R.ptr[n] = 0;
3496                 return cast(__m128i)R;
3497             }
3498         }
3499         else
3500         {
3501             return cast(__m128i) shufflevector!(byte16,
3502             16 - bytes, 17 - bytes, 18 - bytes, 19 - bytes, 20 - bytes, 21 - bytes,
3503             22 - bytes, 23 - bytes, 24 - bytes, 25 - bytes, 26 - bytes, 27 - bytes,
3504             28 - bytes, 29 - bytes, 30 - bytes, 31 - bytes)
3505             (cast(byte16)_mm_setzero_si128(), cast(byte16)op);
3506         }
3507     }
3508 }
3509 unittest
3510 {
3511     __m128i A = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
3512     short8 R = cast(short8) _mm_slli_si128!8(A); // shift 8 bytes to the left
3513     short[8] correct = [ 0, 0, 0, 0, 0, 1, 2, 3 ];
3514     assert(R.array == correct);
3515 
    __m128i B = _mm_slli_si128!16(_mm_set1_epi32(-1)); // shifting by 16 bytes or more yields zero
3517     int[4] expectedB = [0, 0, 0, 0];
3518     assert(B.array == expectedB);
3519 }
3520 
3521 /// Compute the square root of packed double-precision (64-bit) floating-point elements in `vec`.
3522 __m128d _mm_sqrt_pd(__m128d vec) pure @trusted
3523 {
3524     version(LDC)
3525     {
3526         // Disappeared with LDC 1.11
3527         static if (__VERSION__ < 2081)
3528             return __builtin_ia32_sqrtpd(vec);
3529         else
3530         {
3531             vec.array[0] = llvm_sqrt(vec.array[0]);
3532             vec.array[1] = llvm_sqrt(vec.array[1]);
3533             return vec;
3534         }
3535     }
3536     else static if (GDC_with_SSE2)    
3537     {
3538         return __builtin_ia32_sqrtpd(vec);
3539     }
3540     else
3541     {
3542         vec.ptr[0] = sqrt(vec.array[0]);
3543         vec.ptr[1] = sqrt(vec.array[1]);
3544         return vec;
3545     }
3546 }
3547 
3548 /// Compute the square root of the lower double-precision (64-bit) floating-point element in `b`, store the result in 
3549 /// the lower element of result, and copy the upper element from `a` to the upper element of result.
3550 __m128d _mm_sqrt_sd(__m128d a, __m128d b) pure @trusted
3551 {
3552     // Note: the builtin has one argument, since the legacy `sqrtsd` SSE2 instruction operates on the same register only.
3553     //       "128-bit Legacy SSE version: The first source operand and the destination operand are the same. 
3554     //        The quadword at bits 127:64 of the destination operand remains unchanged."
3555     version(LDC)
3556     {
3557         // Disappeared with LDC 1.11
3558         static if (__VERSION__ < 2081)
3559         {
3560             __m128d c = __builtin_ia32_sqrtsd(b);
3561             a[0] = c[0];
3562             return a;
3563         }
3564         else
3565         {
3566             a.array[0] = llvm_sqrt(b.array[0]);
3567             return a;
3568         }
3569     }
3570     else static if (GDC_with_SSE2)
3571     {
3572         __m128d c = __builtin_ia32_sqrtsd(b);
3573         a.ptr[0] = c.array[0];
3574         return a;
3575     }
3576     else
3577     {
3578         a.ptr[0] = sqrt(b.array[0]);
3579         return a;
3580     }
3581 }
3582 unittest
3583 {
3584     __m128d A = _mm_setr_pd(1.0, 3.0);
3585     __m128d B = _mm_setr_pd(4.0, 5.0);
3586     __m128d R = _mm_sqrt_sd(A, B);
3587     double[2] correct = [2.0, 3.0 ];
3588     assert(R.array == correct);
3589 }
3590 
3591 /// Shift packed 16-bit integers in `a` right by `count` while shifting in sign bits.
3592 deprecated("Use _mm_srai_epi16 instead.") __m128i _mm_sra_epi16 (__m128i a, __m128i count) pure @trusted
3593 {
3594     static if (GDC_with_SSE2)
3595     {
3596         return cast(__m128i) __builtin_ia32_psraw128(cast(short8)a, cast(short8)count);
3597     }
3598     else static if (LDC_with_SSE2)
3599     {
3600         return cast(__m128i) __builtin_ia32_psraw128(cast(short8)a, cast(short8)count);
3601     }
3602     else
3603     {
3604         short8 sa = cast(short8)a;
3605         long2 lc = cast(long2)count;
3606         int bits = cast(int)(lc.array[0]);
3607         short8 r = void;
3608         foreach(i; 0..8)
3609             r.ptr[i] = cast(short)(sa.array[i] >> bits);
3610         return cast(int4)r;
3611     }
3612 }
3613 
3614 /// Shift packed 32-bit integers in `a` right by `count` while shifting in sign bits.
3615 deprecated("Use _mm_srai_epi32 instead.") __m128i _mm_sra_epi32 (__m128i a, __m128i count) pure @trusted
3616 {
3617     static if (LDC_with_SSE2)
3618     {
3619         return __builtin_ia32_psrad128(a, count);
3620     }
3621     else static if (GDC_with_SSE2)
3622     {
3623         return __builtin_ia32_psrad128(a, count);
3624     }
3625     else
3626     {    
3627         int4 r = void;
3628         long2 lc = cast(long2)count;
3629         int bits = cast(int)(lc.array[0]);
3630         r.ptr[0] = (a.array[0] >> bits);
3631         r.ptr[1] = (a.array[1] >> bits);
3632         r.ptr[2] = (a.array[2] >> bits);
3633         r.ptr[3] = (a.array[3] >> bits);
3634         return r;
3635     }
3636 }
3637 
3638 
3639 /// Shift packed 16-bit integers in `a` right by `imm8` while shifting in sign bits.
3640 __m128i _mm_srai_epi16 (__m128i a, int imm8) pure @trusted
3641 {
3642     static if (GDC_with_SSE2)
3643     {
3644         return cast(__m128i) __builtin_ia32_psrawi128(cast(short8)a, cast(ubyte)imm8);
3645     }
3646     else static if (LDC_with_SSE2)
3647     {
3648         return cast(__m128i) __builtin_ia32_psrawi128(cast(short8)a, cast(ubyte)imm8);
3649     }
3650     else static if (LDC_with_ARM64)
3651     {
3652         short8 sa = cast(short8)a;
3653         ubyte count = cast(ubyte)imm8;
3654         if (count > 15) 
3655             count = 15;
3656         short8 r = sa >> short8(count);
3657         return cast(__m128i)r;
3658     }
3659     else
3660     {
3661         short8 sa = cast(short8)a;
3662         short8 r = void;
3663 
        // Note: the intrinsic guarantees imm8[0..7] is taken, but in D
        //       shifting by the bit width or more is undefined behaviour:
        //       "It's illegal to shift by the same or more bits than the
        //       size of the quantity being shifted". Hence the guard below.
3668         ubyte count = cast(ubyte)imm8;
3669         if (count > 15) 
3670             count = 15;
3671         foreach(i; 0..8)
3672             r.ptr[i] = cast(short)(sa.array[i] >> count);
3673         return cast(int4)r;
3674     }
3675 }
3676 unittest
3677 {
3678     __m128i A = _mm_setr_epi16(0, 1, 2, 3, -4, -5, 6, 7);
3679     short8 B = cast(short8)( _mm_srai_epi16(A, 1) );
3680     short8 B2 = cast(short8)( _mm_srai_epi16(A, 1 + 256) );
3681     short[8] expectedB = [ 0, 0, 1, 1, -2, -3, 3, 3 ];
3682     assert(B.array == expectedB);
3683     assert(B2.array == expectedB);
3684 
3685     short8 C = cast(short8)( _mm_srai_epi16(A, 18) );
3686     short[8] expectedC = [ 0, 0, 0, 0, -1, -1, 0, 0 ];
3687     assert(C.array == expectedC);
3688 }
3689 
3690 /// Shift packed 32-bit integers in `a` right by `imm8` while shifting in sign bits.
3691 __m128i _mm_srai_epi32 (__m128i a, int imm8) pure @trusted
3692 {
3693     static if (LDC_with_SSE2)
3694     {
3695         return __builtin_ia32_psradi128(a, cast(ubyte)imm8);
3696     }
3697     else static if (GDC_with_SSE2)
3698     {
3699         return __builtin_ia32_psradi128(a, cast(ubyte)imm8);
3700     }
3701     else
3702     {
3703         int4 r = void;
3704 
        // Note: the intrinsic guarantees imm8[0..7] is taken, but in D
        //       shifting by the bit width or more is undefined behaviour:
        //       "It's illegal to shift by the same or more bits than the
        //       size of the quantity being shifted". Hence the guard below.
3709         ubyte count = cast(ubyte) imm8;
3710         if (count > 31)
3711             count = 31;
3712 
3713         r.ptr[0] = (a.array[0] >> count);
3714         r.ptr[1] = (a.array[1] >> count);
3715         r.ptr[2] = (a.array[2] >> count);
3716         r.ptr[3] = (a.array[3] >> count);
3717         return r;
3718     }
3719 }
3720 unittest
3721 {
3722     __m128i A = _mm_setr_epi32(0, 2, 3, -4);
3723     __m128i B = _mm_srai_epi32(A, 1);
3724     __m128i B2 = _mm_srai_epi32(A, 1 + 256);
3725     int[4] expectedB = [ 0, 1, 1, -2];
3726     assert(B.array == expectedB);
3727     assert(B2.array == expectedB);
3728 
3729     __m128i C = _mm_srai_epi32(A, 32);
3730     int[4] expectedC = [ 0, 0, 0, -1];
3731     assert(C.array == expectedC);
3732 
3733     __m128i D = _mm_srai_epi32(A, 0);
3734     int[4] expectedD = [ 0, 2, 3, -4];
3735     assert(D.array == expectedD);
3736 }
3737 
3738 deprecated("Use _mm_srli_epi16 instead.") __m128i _mm_srl_epi16 (__m128i a, __m128i count) pure @trusted
3739 {
3740     static if (LDC_with_SSE2)
3741     {
3742         return cast(__m128i) __builtin_ia32_psrlw128(cast(short8)a, cast(short8)count);
3743     }
3744     else static if (GDC_with_SSE2)
3745     {
3746         return cast(__m128i) __builtin_ia32_psrlw128(cast(short8)a, cast(short8)count);
3747     }
3748     else
3749     {
3750         short8 sa = cast(short8)a;
3751         long2 lc = cast(long2)count;
3752         int bits = cast(int)(lc.array[0]);
3753         short8 r = void;
3754         foreach(i; 0..8)
3755             r.ptr[i] = cast(short)(cast(ushort)(sa.array[i]) >> bits);
3756         return cast(int4)r;
3757     }
3758 }
3759 
3760 deprecated("Use _mm_srli_epi32 instead.") __m128i _mm_srl_epi32 (__m128i a, __m128i count) pure @trusted
3761 {
3762     static if (LDC_with_SSE2)
3763     {
3764         return __builtin_ia32_psrld128(a, count);
3765     }
3766     else static if (GDC_with_SSE2)
3767     {
3768         return __builtin_ia32_psrld128(a, count);
3769     }
3770     else
3771     {
3772         int4 r = void;
3773         long2 lc = cast(long2)count;
3774         int bits = cast(int)(lc.array[0]);
3775         r.ptr[0] = cast(uint)(a.array[0]) >> bits;
3776         r.ptr[1] = cast(uint)(a.array[1]) >> bits;
3777         r.ptr[2] = cast(uint)(a.array[2]) >> bits;
3778         r.ptr[3] = cast(uint)(a.array[3]) >> bits;
3779         return r;
3780     }
3781 }
3782 
3783 deprecated("Use _mm_srli_epi64 instead.") __m128i _mm_srl_epi64 (__m128i a, __m128i count) pure @trusted
3784 {
3785     static if (LDC_with_SSE2)
3786     {
3787         return cast(__m128i) __builtin_ia32_psrlq128(cast(long2)a, cast(long2)count);
3788     }
3789     else static if (GDC_with_SSE2)
3790     {
3791         return cast(__m128i) __builtin_ia32_psrlq128(cast(long2)a, cast(long2)count);
3792     }
3793     else
3794     {
3795         long2 r = void;
3796         long2 sa = cast(long2)a;
3797         long2 lc = cast(long2)count;
3798         int bits = cast(int)(lc.array[0]);
3799         r.ptr[0] = cast(ulong)(sa.array[0]) >> bits;
3800         r.ptr[1] = cast(ulong)(sa.array[1]) >> bits;
3801         return cast(__m128i)r;
3802     }
3803 }
3804 
3805 /// Shift packed 16-bit integers in `a` right by `imm8` while shifting in zeros.
3806 __m128i _mm_srli_epi16 (__m128i a, int imm8) pure @trusted
3807 {
3808     static if (GDC_with_SSE2)
3809     {
3810         return cast(__m128i) __builtin_ia32_psrlwi128(cast(short8)a, cast(ubyte)imm8);
3811     }
3812     else static if (LDC_with_SSE2)
3813     {
3814         return cast(__m128i) __builtin_ia32_psrlwi128(cast(short8)a, cast(ubyte)imm8);
3815     }
3816     else static if (LDC_with_ARM64)
3817     {
3818         short8 sa = cast(short8)a;
3819         short8 r = cast(short8) _mm_setzero_si128();
3820 
3821         ubyte count = cast(ubyte)imm8;
3822         if (count >= 16)
3823             return cast(__m128i)r;
3824 
        r = sa >>> short8(count); // the vector >>> operator is available with LDC, but not with DMD
3826         return cast(__m128i)r;
3827     }
3828     else
3829     {
3830         short8 sa = cast(short8)a;
3831         ubyte count = cast(ubyte)imm8;
3832 
3833         short8 r = cast(short8) _mm_setzero_si128();
3834         if (count >= 16)
3835             return cast(__m128i)r;
3836 
3837         foreach(i; 0..8)
3838             r.array[i] = cast(short)(cast(ushort)(sa.array[i]) >> count);
3839         return cast(__m128i)r;
3840     }
3841 }
3842 unittest
3843 {
3844     __m128i A = _mm_setr_epi16(0, 1, 2, 3, -4, -5, 6, 7);
3845     short8 B = cast(short8)( _mm_srli_epi16(A, 1) );
3846     short8 B2 = cast(short8)( _mm_srli_epi16(A, 1 + 256) );
3847     short[8] expectedB = [ 0, 0, 1, 1, 0x7FFE, 0x7FFD, 3, 3 ];
3848     assert(B.array == expectedB);
3849     assert(B2.array == expectedB);
3850 
3851     short8 C = cast(short8)( _mm_srli_epi16(A, 16) );
3852     short[8] expectedC = [ 0, 0, 0, 0, 0, 0, 0, 0];
3853     assert(C.array == expectedC);
3854 
3855     short8 D = cast(short8)( _mm_srli_epi16(A, 0) );
3856     short[8] expectedD = [ 0, 1, 2, 3, -4, -5, 6, 7 ];
3857     assert(D.array == expectedD);
3858 }
3859 
3860 
3861 /// Shift packed 32-bit integers in `a` right by `imm8` while shifting in zeros.
3862 __m128i _mm_srli_epi32 (__m128i a, int imm8) pure @trusted
3863 {
3864     static if (GDC_with_SSE2)
3865     {
3866         return __builtin_ia32_psrldi128(a, cast(ubyte)imm8);
3867     }
3868     else static if (LDC_with_SSE2)
3869     {
3870         return __builtin_ia32_psrldi128(a, cast(ubyte)imm8);
3871     }
3872     else
3873     {
3874         ubyte count = cast(ubyte) imm8;
3875 
        // Note: the intrinsic guarantees imm8[0..7] is taken, but in D
        //       shifting by the bit width or more is undefined behaviour:
        //       "It's illegal to shift by the same or more bits than the
        //       size of the quantity being shifted". Hence the guard below.
3880         int4 r = _mm_setzero_si128();
3881         if (count >= 32)
3882             return r;
3883         r.ptr[0] = a.array[0] >>> count;
3884         r.ptr[1] = a.array[1] >>> count;
3885         r.ptr[2] = a.array[2] >>> count;
3886         r.ptr[3] = a.array[3] >>> count;
3887         return r;
3888     }
3889 }
3890 unittest
3891 {
3892     __m128i A = _mm_setr_epi32(0, 2, 3, -4);
3893     __m128i B = _mm_srli_epi32(A, 1);
3894     __m128i B2 = _mm_srli_epi32(A, 1 + 256);
3895     int[4] expectedB = [ 0, 1, 1, 0x7FFFFFFE];
3896     assert(B.array == expectedB);
3897     assert(B2.array == expectedB);
3898  
3899     __m128i C = _mm_srli_epi32(A, 255);
3900     int[4] expectedC = [ 0, 0, 0, 0 ];
3901     assert(C.array == expectedC);
3902 }
3903 
3904 /// Shift packed 64-bit integers in `a` right by `imm8` while shifting in zeros.
3905 __m128i _mm_srli_epi64 (__m128i a, int imm8) pure @trusted
3906 {
3907     static if (GDC_with_SSE2)
3908     {
3909         return cast(__m128i) __builtin_ia32_psrlqi128(cast(long2)a, cast(ubyte)imm8);
3910     }
3911     else static if (LDC_with_SSE2)
3912     {
3913         return cast(__m128i) __builtin_ia32_psrlqi128(cast(long2)a, cast(ubyte)imm8);
3914     }
3915     else
3916     {
3917         long2 r = cast(long2) _mm_setzero_si128();
3918         long2 sa = cast(long2)a;
3919 
3920         ubyte count = cast(ubyte) imm8;
3921         if (count >= 64)
3922             return cast(__m128i)r;
3923 
3924         r.ptr[0] = sa.array[0] >>> count;
3925         r.ptr[1] = sa.array[1] >>> count;
3926         return cast(__m128i)r;
3927     }
3928 }
3929 unittest
3930 {
3931     __m128i A = _mm_setr_epi64(8, -4);
3932     long2 B = cast(long2) _mm_srli_epi64(A, 1);
3933     long2 B2 = cast(long2) _mm_srli_epi64(A, 1 + 512);
3934     long[2] expectedB = [ 4, 0x7FFFFFFFFFFFFFFE];
3935     assert(B.array == expectedB);
3936     assert(B2.array == expectedB);
3937 
3938     long2 C = cast(long2) _mm_srli_epi64(A, 64);
3939     long[2] expectedC = [ 0, 0 ];
3940     assert(C.array == expectedC);
3941 }
3942 
3943 /// Shift `v` right by `bytes` bytes while shifting in zeros.
3944 __m128i _mm_srli_si128(ubyte bytes)(__m128i v) pure @safe
3945 {
3946     static if (bytes & 0xF0)
3947     {
3948         return _mm_setzero_si128();
3949     }
3950     else static if (GDC_with_SSE2)
3951     {
3952         return cast(__m128i) __builtin_ia32_psrldqi128(v, cast(ubyte)(bytes * 8));
3953     }
3954     else static if (DMD_with_32bit_asm)
3955     {
3956         asm pure nothrow @nogc @trusted
3957         {
3958             movdqu XMM0, v;
3959             psrldq XMM0, bytes;
3960             movdqu v, XMM0;
3961         }
3962         return v;
3963     }
3964     else
3965     {
3966         return cast(__m128i) shufflevector!(byte16,
3967                                             bytes+0, bytes+1, bytes+2, bytes+3, bytes+4, bytes+5, bytes+6, bytes+7,
3968                                             bytes+8, bytes+9, bytes+10, bytes+11, bytes+12, bytes+13, bytes+14, bytes+15)
3969                                            (cast(byte16) v, cast(byte16)_mm_setzero_si128());
3970     }
3971 }
3972 unittest
3973 {
3974     __m128i R = _mm_srli_si128!4(_mm_set_epi32(4, 3, 2, 1));
3975     int[4] correct = [2, 3, 4, 0];
3976     assert(R.array == correct);
3977 
3978     __m128i A = _mm_srli_si128!16(_mm_set1_epi32(-1));
3979     int[4] expectedA = [0, 0, 0, 0];
3980     assert(A.array == expectedA);
3981 }
3982 
3983 /// Shift `v` right by `bytes` bytes while shifting in zeros.
3984 /// #BONUS
3985 __m128 _mm_srli_ps(ubyte bytes)(__m128 v) pure @safe
3986 {
3987     return cast(__m128)_mm_srli_si128!bytes(cast(__m128i)v);
3988 }
3989 unittest
3990 {
3991     __m128 R = _mm_srli_ps!8(_mm_set_ps(4.0f, 3.0f, 2.0f, 1.0f));
3992     float[4] correct = [3.0f, 4.0f, 0, 0];
3993     assert(R.array == correct);
3994 }
3995 
3996 /// Shift `v` right by `bytes` bytes while shifting in zeros.
3997 /// #BONUS
3998 __m128d _mm_srli_pd(ubyte bytes)(__m128d v) pure @safe
3999 {
4000     return cast(__m128d) _mm_srli_si128!bytes(cast(__m128i)v);
4001 }
4002 
4003 /// Store 128-bits (composed of 2 packed double-precision (64-bit) floating-point elements) from `a` into memory. 
4004 /// `mem_addr` must be aligned on a 16-byte boundary or a general-protection exception may be generated.
4005 void _mm_store_pd (double* mem_addr, __m128d a) pure @trusted
4006 {
4007     __m128d* aligned = cast(__m128d*)mem_addr;
4008     *aligned = a;
4009 }
4010 
4011 /// Store the lower double-precision (64-bit) floating-point element from `a` into 2 contiguous elements in memory. 
4012 /// `mem_addr` must be aligned on a 16-byte boundary or a general-protection exception may be generated.
4013 void _mm_store_pd1 (double* mem_addr, __m128d a) pure @trusted
4014 {
4015     __m128d* aligned = cast(__m128d*)mem_addr;
4016     __m128d r;
4017     r.ptr[0] = a.array[0];
4018     r.ptr[1] = a.array[0];
4019     *aligned = r;
4020 }
4021 
4022 /// Store the lower double-precision (64-bit) floating-point element from `a` into memory. `mem_addr` does not need to 
4023 /// be aligned on any particular boundary.
4024 void _mm_store_sd (double* mem_addr, __m128d a) pure @safe
4025 {
4026     *mem_addr = a.array[0];
4027 }
4028 
4029 /// Store 128-bits of integer data from `a` into memory. `mem_addr` must be aligned on a 16-byte boundary or a 
4030 /// general-protection exception may be generated.
4031 void _mm_store_si128 (__m128i* mem_addr, __m128i a) pure @safe
4032 {
4033     *mem_addr = a;
4034 }
4035 
4036 alias _mm_store1_pd = _mm_store_pd1; ///
4037 
4038 /// Store the upper double-precision (64-bit) floating-point element from `a` into memory.
4039 void _mm_storeh_pd (double* mem_addr, __m128d a) pure @safe
4040 {
4041     *mem_addr = a.array[1];
4042 }
4043 
/// Store 64-bit integer from the first element of `a` into memory.
/// Note: `mem_addr` doesn't have to actually be aligned, which breaks
/// expectations from the user point of view. This problem also exists in C++.
void _mm_storel_epi64 (__m128i* mem_addr, __m128i a) pure @safe
4047 {
4048     long* dest = cast(long*)mem_addr;
4049     long2 la = cast(long2)a;
4050     *dest = la.array[0];
4051 }
4052 unittest
4053 {
4054     long[3] A = [1, 2, 3];
4055     _mm_storel_epi64(cast(__m128i*)(&A[1]), _mm_set_epi64x(0x1_0000_0000, 0x1_0000_0000));
4056     long[3] correct = [1, 0x1_0000_0000, 3];
4057     assert(A == correct);
4058 }
4059 
4060 /// Store the lower double-precision (64-bit) floating-point element from `a` into memory.
4061 void _mm_storel_pd (double* mem_addr, __m128d a) pure @safe
4062 {
4063     *mem_addr = a.array[0];
4064 }
4065 
4066 /// Store 2 double-precision (64-bit) floating-point elements from `a` into memory in reverse order. `mem_addr` must be 
4067 /// aligned on a 16-byte boundary or a general-protection exception may be generated.
void _mm_storer_pd (double* mem_addr, __m128d a) pure @trusted
4069 {
4070     __m128d* aligned = cast(__m128d*)mem_addr;
4071     *aligned = shufflevector!(double2, 1, 0)(a, a);
4072 }
4073 
4074 /// Store 128-bits (composed of 2 packed double-precision (64-bit) floating-point elements) from `a` into memory. 
4075 /// `mem_addr` does not need to be aligned on any particular boundary.
4076 void _mm_storeu_pd (double* mem_addr, __m128d a) pure @safe
4077 {
4078     storeUnaligned!double2(a, mem_addr);
4079 }
4080 
4081 /// Store 128-bits of integer data from `a` into memory. `mem_addr` does not need to be aligned on any particular 
4082 /// boundary.
4083 void _mm_storeu_si128 (__m128i* mem_addr, __m128i a) pure @safe
4084 {
4085     storeUnaligned!__m128i(a, cast(int*)mem_addr);
4086 }
4087 
4088 /// Store 32-bit integer from the first element of `a` into memory. 
4089 /// `mem_addr` does not need to be aligned on any particular boundary.
4090 void _mm_storeu_si32 (void* mem_addr, __m128i a) pure @trusted
4091 {
4092     int* dest = cast(int*)mem_addr;
4093     *dest = a.array[0];
4094 }
4095 unittest
4096 {
4097     int[2] arr = [-24, 12];
4098     _mm_storeu_si32(&arr[1], _mm_setr_epi32(-1, -2, -6, -7));
4099     assert(arr == [-24, -1]);
4100 }
4101 
4102 /// Store 128-bits (composed of 2 packed double-precision (64-bit) floating-point elements)
4103 /// from a into memory using a non-temporal memory hint. mem_addr must be aligned on a 16-byte
4104 /// boundary or a general-protection exception may be generated.
4105 void _mm_stream_pd (double* mem_addr, __m128d a)
4106 {
4107     // BUG see `_mm_stream_ps` for an explanation why we don't implement non-temporal moves
4108     __m128d* dest = cast(__m128d*)mem_addr;
4109     *dest = a;
4110 }
4111 
4112 /// Store 128-bits of integer data from a into memory using a non-temporal memory hint.
4113 /// mem_addr must be aligned on a 16-byte boundary or a general-protection exception
4114 /// may be generated.
4115 void _mm_stream_si128 (__m128i* mem_addr, __m128i a)
4116 {
4117     // BUG see `_mm_stream_ps` for an explanation why we don't implement non-temporal moves
4118     __m128i* dest = cast(__m128i*)mem_addr;
4119     *dest = a;
4120 }
4121 
4122 /// Store 32-bit integer a into memory using a non-temporal hint to minimize cache
4123 /// pollution. If the cache line containing address mem_addr is already in the cache,
4124 /// the cache will be updated.
4125 void _mm_stream_si32 (int* mem_addr, int a)
4126 {
4127     // BUG see `_mm_stream_ps` for an explanation why we don't implement non-temporal moves
4128     *mem_addr = a;
4129 }
4130 
4131 /// Store 64-bit integer a into memory using a non-temporal hint to minimize
4132 /// cache pollution. If the cache line containing address mem_addr is already
4133 /// in the cache, the cache will be updated.
4134 void _mm_stream_si64 (long* mem_addr, long a)
4135 {
4136     // BUG See `_mm_stream_ps` for an explanation why we don't implement non-temporal moves
4137     *mem_addr = a;
4138 }
4139 
4140 /// Subtract packed 16-bit integers in `b` from packed 16-bit integers in `a`.
4141 __m128i _mm_sub_epi16(__m128i a, __m128i b) pure @safe
4142 {
4143     return cast(__m128i)(cast(short8)a - cast(short8)b);
4144 }
4145 
4146 /// Subtract packed 32-bit integers in `b` from packed 32-bit integers in `a`.
4147 __m128i _mm_sub_epi32(__m128i a, __m128i b) pure @safe
4148 {
4149     return cast(__m128i)(cast(int4)a - cast(int4)b);
4150 }
4151 
4152 /// Subtract packed 64-bit integers in `b` from packed 64-bit integers in `a`.
4153 __m128i _mm_sub_epi64(__m128i a, __m128i b) pure @safe
4154 {
4155     return cast(__m128i)(cast(long2)a - cast(long2)b);
4156 }
4157 
4158 /// Subtract packed 8-bit integers in `b` from packed 8-bit integers in `a`.
4159 __m128i _mm_sub_epi8(__m128i a, __m128i b) pure @safe
4160 {
4161     return cast(__m128i)(cast(byte16)a - cast(byte16)b);
4162 }
4163 
4164 /// Subtract packed double-precision (64-bit) floating-point elements in `b` from packed double-precision (64-bit) 
4165 /// floating-point elements in `a`.
4166 __m128d _mm_sub_pd(__m128d a, __m128d b) pure @safe
4167 {
4168     return a - b;
4169 }
4170 
4171 /// Subtract the lower double-precision (64-bit) floating-point element in `b` from the lower double-precision (64-bit) 
4172 /// floating-point element in `a`, store that in the lower element of result, and copy the upper element from `a` to the
4173 /// upper element of result.
4174 __m128d _mm_sub_sd(__m128d a, __m128d b) pure @trusted
4175 {
4176     version(DigitalMars)
4177     {
4178         // Work-around for https://issues.dlang.org/show_bug.cgi?id=19599
4179         // Note that this is unneeded since DMD >= 2.094.0 at least, haven't investigated again
4180         asm pure nothrow @nogc @trusted { nop;}
4181         a[0] = a[0] - b[0];
4182         return a;
4183     }
4184     else static if (GDC_with_SSE2)
4185     {
4186         return __builtin_ia32_subsd(a, b);
4187     }
4188     else
4189     {
4190         a.ptr[0] -= b.array[0];
4191         return a;
4192     }
4193 }
4194 unittest
4195 {
4196     __m128d a = [1.5, -2.0];
4197     a = _mm_sub_sd(a, a);
4198     assert(a.array == [0.0, -2.0]);
4199 }
4200 
4201 /// Subtract 64-bit integer `b` from 64-bit integer `a`.
4202 __m64 _mm_sub_si64 (__m64 a, __m64 b) pure @safe
4203 {
4204     return a - b;
4205 }
4206 
/// Subtract packed 16-bit integers in `b` from packed 16-bit integers in `a` using signed saturation.
4208 __m128i _mm_subs_epi16(__m128i a, __m128i b) pure @trusted
4209 {
4210     version(LDC)
4211     {
4212         static if (__VERSION__ >= 2085) // saturation x86 intrinsics disappeared in LLVM 8
4213         {
4214             // Generates PSUBSW since LDC 1.15 -O0
4217             enum prefix = `declare <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> %a, <8 x i16> %b)`;
4218             enum ir = `
4219                 %r = call <8 x i16> @llvm.ssub.sat.v8i16( <8 x i16> %0, <8 x i16> %1)
4220                 ret <8 x i16> %r`;
4221             return cast(__m128i) LDCInlineIREx!(prefix, ir, "", short8, short8, short8)(cast(short8)a, cast(short8)b);
4222         }
4223         else static if (LDC_with_ARM) // Raspberry ships with LDC 1.12, no saturation 
4224         {
4226             short[8] res;
4227             short8 sa = cast(short8)a;
4228             short8 sb = cast(short8)b;
4229             foreach(i; 0..8)
4230                 res[i] = saturateSignedIntToSignedShort(sa.array[i] - sb.array[i]);
4231             return _mm_loadu_si128(cast(int4*)res.ptr);
4232         }
4233         else static if (LDC_with_SSE2)
4234         {
4235             return __builtin_ia32_psubsw128(a, b);
4236         }
4237         else
4238             static assert(false);
4239     }
4240     else static if (GDC_with_SSE2)
4241     {
4242         return __builtin_ia32_psubsw128(a, b);
4243     }
4244     else
4245     {
4246         short[8] res;
4247         short8 sa = cast(short8)a;
4248         short8 sb = cast(short8)b;
4249         foreach(i; 0..8)
4250             res.ptr[i] = saturateSignedIntToSignedShort(sa.array[i] - sb.array[i]);
4251         return _mm_loadu_si128(cast(int4*)res.ptr);
4252     }
4253 }
4254 unittest
4255 {
4256     short8 res = cast(short8) _mm_subs_epi16(_mm_setr_epi16(32760, -32760, 5, 4, 3, 2, 1, 0),
4257                                              _mm_setr_epi16(-10  ,     16, 5, 4, 3, 2, 1, 0));
4258     static immutable short[8] correctResult =              [32767, -32768, 0, 0, 0, 0, 0, 0];
4259     assert(res.array == correctResult);
4260 }
4261 
/// Subtract packed 8-bit integers in `b` from packed 8-bit integers in `a` using signed saturation.
4263 __m128i _mm_subs_epi8(__m128i a, __m128i b) pure @trusted
4264 {
4265     version(LDC)
4266     {
4267         static if (__VERSION__ >= 2085) // saturation x86 intrinsics disappeared in LLVM 8
4268         {
4269             // x86: Generates PSUBSB since LDC 1.15 -O0
4270             // ARM: Generates sqsub.16b since LDC 1.21 -O0
4271             enum prefix = `declare <16 x i8> @llvm.ssub.sat.v16i8(<16 x i8> %a, <16 x i8> %b)`;
4272             enum ir = `
4273                 %r = call <16 x i8> @llvm.ssub.sat.v16i8( <16 x i8> %0, <16 x i8> %1)
4274                 ret <16 x i8> %r`;
4275             return cast(__m128i) LDCInlineIREx!(prefix, ir, "", byte16, byte16, byte16)(cast(byte16)a, cast(byte16)b);
4276         }
4277         else static if (LDC_with_ARM) // Raspberry ships with LDC 1.12, no saturation 
4278         {
4279             byte[16] res;
4280             byte16 sa = cast(byte16)a;
4281             byte16 sb = cast(byte16)b;
4282             foreach(i; 0..16)
4283                 res[i] = saturateSignedWordToSignedByte(sa.array[i] - sb.array[i]);
4284             return _mm_loadu_si128(cast(int4*)res.ptr);
4285         }
4286         else static if (LDC_with_SSE2)
4287         {
4288             return __builtin_ia32_psubsb128(a, b);
4289         }
4290         else
4291             static assert(false);
4292     }
4293     else static if (GDC_with_SSE2)
4294     {
4295         return __builtin_ia32_psubsb128(a, b);
4296     }
4297     else
4298     {
4299         byte[16] res;
4300         byte16 sa = cast(byte16)a;
4301         byte16 sb = cast(byte16)b;
4302         foreach(i; 0..16)
4303             res[i] = saturateSignedWordToSignedByte(sa.array[i] - sb.array[i]);
4304         return _mm_loadu_si128(cast(int4*)res.ptr);
4305     }
4306 }
4307 unittest
4308 {
4309     byte16 res = cast(byte16) _mm_subs_epi8(_mm_setr_epi8(-128, 127, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0),
4310                                             _mm_setr_epi8(  15, -14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0));
4311     static immutable byte[16] correctResult            = [-128, 127,  0,  0,  0,  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0];
4312     assert(res.array == correctResult);
4313 }
4314 
/// Subtract packed 16-bit unsigned integers in `b` from packed 16-bit unsigned integers in `a` using unsigned saturation.
4316 __m128i _mm_subs_epu16(__m128i a, __m128i b) pure @trusted
4317 {
4318     version(LDC)
4319     {
4320         static if (__VERSION__ >= 2085) // saturation x86 intrinsics disappeared in LLVM 8
4321         {
4322             // x86: Generates PSUBUSW since LDC 1.15 -O0
4323             // ARM: Generates uqsub.8h since LDC 1.21 -O0
4324             enum prefix = `declare <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> %a, <8 x i16> %b)`;
4325             enum ir = `
4326                 %r = call <8 x i16> @llvm.usub.sat.v8i16( <8 x i16> %0, <8 x i16> %1)
4327                 ret <8 x i16> %r`;
4328             return cast(__m128i) LDCInlineIREx!(prefix, ir, "", short8, short8, short8)(cast(short8)a, cast(short8)b);
4329         }
4330         else static if (LDC_with_ARM) // Raspberry ships with LDC 1.12, no saturation 
4331         {
4332             short[8] res;
4333             short8 sa = cast(short8)a;
4334             short8 sb = cast(short8)b;
4335             foreach(i; 0..8)
4336             {
4337                 int sum = cast(ushort)(sa.array[i]) - cast(ushort)(sb.array[i]);
4338                 res[i] = saturateSignedIntToUnsignedShort(sum);
4339             }
4340             return _mm_loadu_si128(cast(int4*)res.ptr);
4341         }
4342         else static if (LDC_with_SSE2)
4343         {
4344             return __builtin_ia32_psubusw128(a, b);
4345         }
4346         else 
4347             static assert(false);
4348     }
4349     else static if (GDC_with_SSE2)
4350     {
4351         return __builtin_ia32_psubusw128(a, b);
4352     }
4353     else
4354     {
4355         short[8] res;
4356         short8 sa = cast(short8)a;
4357         short8 sb = cast(short8)b;
4358         foreach(i; 0..8)
4359         {
4360             int sum = cast(ushort)(sa.array[i]) - cast(ushort)(sb.array[i]);
4361             res[i] = saturateSignedIntToUnsignedShort(sum);
4362         }
4363         return _mm_loadu_si128(cast(int4*)res.ptr);
4364     }
4365 }
4366 unittest
4367 {
4368     short8 R = cast(short8) _mm_subs_epu16(_mm_setr_epi16(cast(short)65534,  1, 5, 4, 3, 2, 1, 0),
4369                                            _mm_setr_epi16(cast(short)65535, 16, 4, 4, 3, 0, 1, 0));
4370     static immutable short[8] correct =                  [               0,  0, 1, 0, 0, 2, 0, 0];
4371     assert(R.array == correct);
4372 }
4373 
/// Subtract packed 8-bit unsigned integers in `b` from packed 8-bit unsigned integers in `a` using unsigned saturation.
4375 __m128i _mm_subs_epu8(__m128i a, __m128i b) pure @trusted
4376 {
4377     version(LDC)
4378     {
4379         static if (__VERSION__ >= 2085) // saturation x86 intrinsics disappeared in LLVM 8
4380         {
4381             // x86: Generates PSUBUSB since LDC 1.15 -O0
4382             // ARM: Generates uqsub.16b since LDC 1.21 -O0
4383             enum prefix = `declare <16 x i8> @llvm.usub.sat.v16i8(<16 x i8> %a, <16 x i8> %b)`;
4384             enum ir = `
4385                 %r = call <16 x i8> @llvm.usub.sat.v16i8( <16 x i8> %0, <16 x i8> %1)
4386                 ret <16 x i8> %r`;
4387             return cast(__m128i) LDCInlineIREx!(prefix, ir, "", byte16, byte16, byte16)(cast(byte16)a, cast(byte16)b);
4388         }
        else static if (LDC_with_ARM) // Raspberry ships with LDC 1.12, no saturation
        {
            ubyte[16] res;
            byte16 sa = cast(byte16)a;
            byte16 sb = cast(byte16)b;
            foreach(i; 0..16)
                res[i] = saturateSignedWordToUnsignedByte(cast(ubyte)(sa.array[i]) - cast(ubyte)(sb.array[i]));
            return _mm_loadu_si128(cast(int4*)res.ptr);
        }
4402         else static if (LDC_with_SSE2)
4403         {
4404             return __builtin_ia32_psubusb128(a, b);
4405         }
4406         else 
4407             static assert(false);
4408     }
4409     else static if (GDC_with_SSE2)
4410     {
4411         return __builtin_ia32_psubusb128(a, b);
4412     }
4413     else
4414     {
4415         ubyte[16] res;
4416         byte16 sa = cast(byte16)a;
4417         byte16 sb = cast(byte16)b;
4418         foreach(i; 0..16)
4419             res[i] = saturateSignedWordToUnsignedByte(cast(ubyte)(sa.array[i]) - cast(ubyte)(sb.array[i]));
4420         return _mm_loadu_si128(cast(int4*)res.ptr);
4421     }
4422 }
4423 unittest
4424 {
4425     byte16 res = cast(byte16) _mm_subs_epu8(_mm_setr_epi8(cast(byte)254, 127, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0),
4426                                             _mm_setr_epi8(cast(byte)255, 120, 14, 42, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0));
4427     static immutable byte[16] correctResult =            [            0,   7,  0,  0,  0,  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0];
4428     assert(res.array == correctResult);
4429 }
4430 
// Note: the only difference between these intrinsics is the signalling
//       behaviour with quiet NaNs. Aliasing them is therefore slightly
//       incorrect, but wanting to distinguish qNaN from sNaN and treat
//       them differently on purpose seems extremely rare.
4435 alias _mm_ucomieq_sd = _mm_comieq_sd; ///
4436 alias _mm_ucomige_sd = _mm_comige_sd; ///
4437 alias _mm_ucomigt_sd = _mm_comigt_sd; ///
4438 alias _mm_ucomile_sd = _mm_comile_sd; ///
4439 alias _mm_ucomilt_sd = _mm_comilt_sd; ///
4440 alias _mm_ucomineq_sd = _mm_comineq_sd; ///
4441 
4442 /// Return vector of type `__m128d` with undefined elements.
4443 __m128d _mm_undefined_pd() pure @safe
4444 {
4445     __m128d result = void;
4446     return result;
4447 }
4448 
4449 /// Return vector of type `__m128i` with undefined elements.
4450 __m128i _mm_undefined_si128() pure @safe
4451 {
4452     __m128i result = void;
4453     return result;
4454 }
4455 
4456 /// Unpack and interleave 16-bit integers from the high half of `a` and `b`.
4457 __m128i _mm_unpackhi_epi16 (__m128i a, __m128i b) pure @safe
4458 {
4459     static if (GDC_with_SSE2)
4460     {
4461         return __builtin_ia32_punpckhwd128(a, b);
4462     }
4463     else static if (DMD_with_32bit_asm)
4464     {
4465         asm pure nothrow @nogc @trusted
4466         {
4467             movdqu XMM0, a;
4468             movdqu XMM1, b;
4469             punpckhwd XMM0, XMM1;
4470             movdqu a, XMM0;
4471         }
4472         return a;
4473     }
4474     else
4475     {
4476         return cast(__m128i) shufflevector!(short8, 4, 12, 5, 13, 6, 14, 7, 15)
4477                                            (cast(short8)a, cast(short8)b);
4478     }
4479 }
4480 unittest
4481 {
4482     __m128i A = _mm_setr_epi16(4,   5,  6,  7,  8,  9, 10, 11);
4483     __m128i B = _mm_setr_epi16(12, 13, 14, 15, 16, 17, 18, 19);
4484     short8 C = cast(short8)(_mm_unpackhi_epi16(A, B));
4485     short[8] correct = [8, 16, 9, 17, 10, 18, 11, 19];
4486     assert(C.array == correct);
4487 }
4488 
4489 /// Unpack and interleave 32-bit integers from the high half of `a` and `b`.
4490 __m128i _mm_unpackhi_epi32 (__m128i a, __m128i b) pure @safe
4491 {
4492     static if (GDC_with_SSE2)
4493     {
4494         return __builtin_ia32_punpckhdq128(a, b);
4495     }
4496     else
4497     {
4498         return shufflevector!(int4, 2, 6, 3, 7)(cast(int4)a, cast(int4)b);
4499     }
4500 }
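unittest
{
    __m128i A = _mm_setr_epi32(1, 2, 3, 4);
    __m128i B = _mm_setr_epi32(5, 6, 7, 8);
    int4 C = _mm_unpackhi_epi32(A, B);
    int[4] correct = [3, 7, 4, 8]; // high halves interleaved: a2, b2, a3, b3
    assert(C.array == correct);
}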
4502 
4503 /// Unpack and interleave 64-bit integers from the high half of `a` and `b`.
4504 __m128i _mm_unpackhi_epi64 (__m128i a, __m128i b) pure @trusted
4505 {
4506     static if (GDC_with_SSE2)
4507     {
4508         return __builtin_ia32_punpckhqdq128(a, b);
4509     }
4510     else
4511     {
4512         __m128i r = cast(__m128i)b;
4513         r[0] = a[2];
4514         r[1] = a[3];
4515         return r; 
4516     }
4517 }
4518 unittest // Issue #36
4519 {
4520     __m128i A = _mm_setr_epi64(0x22222222_22222222, 0x33333333_33333333);
4521     __m128i B = _mm_setr_epi64(0x44444444_44444444, 0x55555555_55555555);
4522     long2 C = cast(long2)(_mm_unpackhi_epi64(A, B));
4523     long[2] correct = [0x33333333_33333333, 0x55555555_55555555];
4524     assert(C.array == correct);
4525 }
4526 
4527 /// Unpack and interleave 8-bit integers from the high half of `a` and `b`.
4528 __m128i _mm_unpackhi_epi8 (__m128i a, __m128i b) pure @safe
4529 {
4530     static if (GDC_with_SSE2)
4531     {
4532         return __builtin_ia32_punpckhbw128(a, b);
4533     }
4534     else static if (DMD_with_32bit_asm)
4535     {
4536         asm pure nothrow @nogc @trusted
4537         {
4538             movdqu XMM0, a;
4539             movdqu XMM1, b;
4540             punpckhbw XMM0, XMM1;
4541             movdqu a, XMM0;
4542         }
4543         return a;
4544     }
4545     else
4546     {
4547         return cast(__m128i)shufflevector!(byte16, 8,  24,  9, 25, 10, 26, 11, 27,
4548                                                    12, 28, 13, 29, 14, 30, 15, 31)
4549                                                    (cast(byte16)a, cast(byte16)b);
4550     }
4551 }
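unittest
{
    __m128i A = _mm_setr_epi8( 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16);
    __m128i B = _mm_setr_epi8(17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32);
    byte16 C = cast(byte16) _mm_unpackhi_epi8(A, B);
    byte[16] correct = [9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31, 16, 32];
    assert(C.array == correct);
}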
4553 
4554 /// Unpack and interleave double-precision (64-bit) floating-point elements from the high half of `a` and `b`.
4555 __m128d _mm_unpackhi_pd (__m128d a, __m128d b) pure @safe
4556 {
4557     static if (GDC_with_SSE2)
4558     {
4559         return __builtin_ia32_unpckhpd(a, b);
4560     }
4561     else
4562     {
4563         return shufflevector!(__m128d, 1, 3)(a, b);
4564     }
4565 }
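unittest
{
    __m128d A = _mm_setr_pd(4.0, 6.0);
    __m128d B = _mm_setr_pd(7.0, 9.0);
    __m128d C = _mm_unpackhi_pd(A, B);
    double[2] correct = [6.0, 9.0];
    assert(C.array == correct);
}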
4567 
4568 /// Unpack and interleave 16-bit integers from the low half of `a` and `b`.
4569 __m128i _mm_unpacklo_epi16 (__m128i a, __m128i b) pure @safe
4570 {
4571     static if (GDC_with_SSE2)
4572     {
4573         return __builtin_ia32_punpcklwd128(a, b);
4574     }
4575     else static if (DMD_with_32bit_asm)
4576     {
4577         asm pure nothrow @nogc @trusted
4578         {
4579             movdqu XMM0, a;
4580             movdqu XMM1, b;
4581             punpcklwd XMM0, XMM1;
4582             movdqu a, XMM0;
4583         }
4584         return a;
4585     }
4586     else
4587     {
4588         return cast(__m128i) shufflevector!(short8, 0, 8, 1, 9, 2, 10, 3, 11)
4589                                            (cast(short8)a, cast(short8)b);
4590     }
4591 }
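unittest
{
    __m128i A = _mm_setr_epi16(0, 1,  2,  3,  4,  5,  6,  7);
    __m128i B = _mm_setr_epi16(8, 9, 10, 11, 12, 13, 14, 15);
    short8 C = cast(short8) _mm_unpacklo_epi16(A, B);
    short[8] correct = [0, 8, 1, 9, 2, 10, 3, 11];
    assert(C.array == correct);
}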
4593 
4594 /// Unpack and interleave 32-bit integers from the low half of `a` and `b`.
4595 __m128i _mm_unpacklo_epi32 (__m128i a, __m128i b) pure @safe
4596 {
4597     static if (GDC_with_SSE2)
4598     {
4599         return __builtin_ia32_punpckldq128(a, b);
4600     }
4601     else
4602     {
4603         return shufflevector!(int4, 0, 4, 1, 5)
4604                              (cast(int4)a, cast(int4)b);
4605     }
4606 }
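unittest
{
    __m128i A = _mm_setr_epi32(1, 2, 3, 4);
    __m128i B = _mm_setr_epi32(5, 6, 7, 8);
    int4 C = _mm_unpacklo_epi32(A, B);
    int[4] correct = [1, 5, 2, 6]; // low halves interleaved: a0, b0, a1, b1
    assert(C.array == correct);
}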
4608 
4609 /// Unpack and interleave 64-bit integers from the low half of `a` and `b`.
4610 __m128i _mm_unpacklo_epi64 (__m128i a, __m128i b) pure @trusted
4611 {
4612     static if (GDC_with_SSE2)
4613     {
4614         return __builtin_ia32_punpcklqdq128(a, b);
4615     }
4616     else
4617     {
4618         long2 lA = cast(long2)a;
4619         long2 lB = cast(long2)b;
4620         long2 R;
4621         R.ptr[0] = lA.array[0];
4622         R.ptr[1] = lB.array[0];
4623         return cast(__m128i)R;
4624     }
4625 }
4626 unittest // Issue #36
4627 {
4628     __m128i A = _mm_setr_epi64(0x22222222_22222222, 0x33333333_33333333);
4629     __m128i B = _mm_setr_epi64(0x44444444_44444444, 0x55555555_55555555);
4630     long2 C = cast(long2)(_mm_unpacklo_epi64(A, B));
4631     long[2] correct = [0x22222222_22222222, 0x44444444_44444444];
4632     assert(C.array == correct);
4633 }
4634 
4635 /// Unpack and interleave 8-bit integers from the low half of `a` and `b`.
4636 __m128i _mm_unpacklo_epi8 (__m128i a, __m128i b) pure @safe
4637 {
4638     static if (GDC_with_SSE2)
4639     {
4640         return __builtin_ia32_punpcklbw128(a, b);
4641     }
4642     else static if (DMD_with_32bit_asm)
4643     {
4644         asm pure nothrow @nogc @trusted
4645         {
4646             movdqu XMM0, a;
4647             movdqu XMM1, b;
4648             punpcklbw XMM0, XMM1;
4649             movdqu a, XMM0;
4650         }
4651         return a;
4652     }
4653     else
4654     {
4655         return cast(__m128i) shufflevector!(byte16, 0, 16, 1, 17, 2, 18, 3, 19,
4656                                                     4, 20, 5, 21, 6, 22, 7, 23)
4657                                            (cast(byte16)a, cast(byte16)b);
4658     }
4659 }
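unittest
{
    __m128i A = _mm_setr_epi8( 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16);
    __m128i B = _mm_setr_epi8(17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32);
    byte16 C = cast(byte16) _mm_unpacklo_epi8(A, B);
    byte[16] correct = [1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23, 8, 24];
    assert(C.array == correct);
}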
4661 
4662 /// Unpack and interleave double-precision (64-bit) floating-point elements from the low half of `a` and `b`.
4663 __m128d _mm_unpacklo_pd (__m128d a, __m128d b) pure @safe
4664 {
4665     static if (GDC_with_SSE2)
4666     {
4667         return __builtin_ia32_unpcklpd(a, b);
4668     }
4669     else
4670     {
4671         return shufflevector!(__m128d, 0, 2)(a, b);
4672     }
4673 }
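unittest
{
    __m128d A = _mm_setr_pd(4.0, 6.0);
    __m128d B = _mm_setr_pd(7.0, 9.0);
    __m128d C = _mm_unpacklo_pd(A, B);
    double[2] correct = [4.0, 7.0];
    assert(C.array == correct);
}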
4675 
4676 /// Compute the bitwise XOR of packed double-precision (64-bit) floating-point elements in `a` and `b`.
4677 __m128d _mm_xor_pd (__m128d a, __m128d b) pure @safe
4678 {
4679     return cast(__m128d)(cast(__m128i)a ^ cast(__m128i)b);
4680 }
4681 
4682 /// Compute the bitwise XOR of 128 bits (representing integer data) in `a` and `b`.
4683 __m128i _mm_xor_si128 (__m128i a, __m128i b) pure @safe
4684 {
4685     return a ^ b;
4686 }
4687 
4688 unittest
4689 {
4690     float distance(float[4] a, float[4] b) nothrow @nogc
4691     {
4692         __m128 va = _mm_loadu_ps(a.ptr);
4693         __m128 vb = _mm_loadu_ps(b.ptr);
4694         __m128 diffSquared = _mm_sub_ps(va, vb);
4695         diffSquared = _mm_mul_ps(diffSquared, diffSquared);
4696         __m128 sum = _mm_add_ps(diffSquared, _mm_srli_ps!8(diffSquared));
4697         sum = _mm_add_ps(sum, _mm_srli_ps!4(sum));
4698         return _mm_cvtss_f32(_mm_sqrt_ss(sum));
4699     }
4700     assert(distance([0, 2, 0, 0], [0, 0, 0, 0]) == 2);
4701 }