1 /**
2 * SSE2 intrinsics. 
3 * https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#techs=SSE2
4 *
5 * Copyright: Copyright Guillaume Piolat 2016-2020, Stefanos Baziotis 2019.
6 * License:   $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
7 */
8 module inteli.emmintrin;
9 
10 public import inteli.types;
11 public import inteli.xmmintrin; // SSE2 includes SSE1
12 import inteli.mmx;
13 import inteli.internals;
14 
15 nothrow @nogc:
16 
17 
18 // SSE2 instructions
19 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#techs=SSE2
20 
21 /// Add packed 16-bit integers in `a` and `b`.
22 __m128i _mm_add_epi16 (__m128i a, __m128i b) pure @safe
23 {
24     pragma(inline, true);
25     return cast(__m128i)(cast(short8)a + cast(short8)b);
26 }
27 unittest
28 {
29     __m128i A = _mm_setr_epi16(4, 8, 13, -7, -1, 0, 9, 77);
30     short8 R = cast(short8) _mm_add_epi16(A, A);
31     short[8] correct = [8, 16, 26, -14, -2, 0, 18, 154];
32     assert(R.array == correct);
33 }
34 
35 /// Add packed 32-bit integers in `a` and `b`.
36 __m128i _mm_add_epi32 (__m128i a, __m128i b) pure @safe
37 {
38     pragma(inline, true);
39     return cast(__m128i)(cast(int4)a + cast(int4)b);
40 }
41 unittest
42 {
43     __m128i A = _mm_setr_epi32( -7, -1, 0, 9);
44     int4 R = _mm_add_epi32(A, A);
45     int[4] correct = [ -14, -2, 0, 18 ];
46     assert(R.array == correct);
47 }
48 
49 /// Add packed 64-bit integers in `a` and `b`.
50 __m128i _mm_add_epi64 (__m128i a, __m128i b) pure @safe
51 {
52     pragma(inline, true);
53     return cast(__m128i)(cast(long2)a + cast(long2)b);
54 }
55 unittest
56 {
57     __m128i A = _mm_setr_epi64(-1, 0x8000_0000_0000_0000);
58     long2 R = cast(long2) _mm_add_epi64(A, A);
59     long[2] correct = [ -2, 0 ];
60     assert(R.array == correct);
61 }
62 
63 /// Add packed 8-bit integers in `a` and `b`.
64 __m128i _mm_add_epi8 (__m128i a, __m128i b) pure @safe
65 {
66     pragma(inline, true);
67     return cast(__m128i)(cast(byte16)a + cast(byte16)b);
68 }
69 unittest
70 {
71     __m128i A = _mm_setr_epi8(4, 8, 13, -7, -1, 0, 9, 77, 4, 8, 13, -7, -1, 0, 9, 78);
72     byte16 R = cast(byte16) _mm_add_epi8(A, A);
73     byte[16] correct = [8, 16, 26, -14, -2, 0, 18, -102, 8, 16, 26, -14, -2, 0, 18, -100];
74     assert(R.array == correct);
75 }
76 
/// Add the lower double-precision (64-bit) floating-point element
/// in `a` and `b`, store the result in the lower element of result,
/// and copy the upper element from `a` to the upper element of result.
80 __m128d _mm_add_sd(__m128d a, __m128d b) pure @safe
81 {
82     static if (GDC_with_SSE2)
83     {
84         return __builtin_ia32_addsd(a, b);
85     }
86     else version(DigitalMars)
87     {
88         // Work-around for https://issues.dlang.org/show_bug.cgi?id=19599
        // Note: this workaround is unneeded since DMD >= 2.094.0 at least; it hasn't been re-investigated since then.
90         asm pure nothrow @nogc @trusted { nop;}
91         a[0] = a[0] + b[0];
92         return a;
93     }
94     else
95     {
96         a[0] += b[0];
97         return a;
98     }
99 }
100 unittest
101 {
102     __m128d a = [1.5, -2.0];
103     a = _mm_add_sd(a, a);
104     assert(a.array == [3.0, -2.0]);
105 }
106 
107 /// Add packed double-precision (64-bit) floating-point elements in `a` and `b`.
108 __m128d _mm_add_pd (__m128d a, __m128d b) pure @safe
109 {
110     pragma(inline, true);
111     return a + b;
112 }
113 unittest
114 {
115     __m128d a = [1.5, -2.0];
116     a = _mm_add_pd(a, a);
117     assert(a.array == [3.0, -4.0]);
118 }
119 
120 /// Add 64-bit integers `a` and `b`.
121 __m64 _mm_add_si64 (__m64 a, __m64 b) pure @safe
122 {
123     pragma(inline, true);
124     return a + b;
125 }
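// Minimal check; `to_m64` is the same helper used by other MMX-returning intrinsics in this module.
unittest
{
    __m64 A = to_m64(_mm_setr_epi64(5678, 0));
    __m64 B = to_m64(_mm_setr_epi64(-1234, 0));
    __m64 R = _mm_add_si64(A, B);
    assert(R.array[0] == 4444);
}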
126 
127 /// Add packed 16-bit integers in `a` and `b` using signed saturation.
128 __m128i _mm_adds_epi16(__m128i a, __m128i b) pure @trusted
129 {
130     static if (GDC_with_SSE2)
131     {
132         return cast(__m128i)__builtin_ia32_paddsw128(cast(short8)a, cast(short8)b);
133     }
134     else version(LDC)
135     {
136         static if (__VERSION__ >= 2085) // saturation x86 intrinsics disappeared in LLVM 8
137         {
138             // x86: Generates PADDSW since LDC 1.15 -O0
139             // ARM: Generates sqadd.8h since LDC 1.21 -O1, really bad in <= 1.20            
140             enum prefix = `declare <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> %a, <8 x i16> %b)`;
141             enum ir = `
142                 %r = call <8 x i16> @llvm.sadd.sat.v8i16( <8 x i16> %0, <8 x i16> %1)
143                 ret <8 x i16> %r`;
144             return cast(__m128i) LDCInlineIREx!(prefix, ir, "", short8, short8, short8)(cast(short8)a, cast(short8)b);
145         }
146         else static if (LDC_with_ARM) // Raspberry ships with LDC 1.12, no saturation 
147         {
148             short[8] res;
149             short8 sa = cast(short8)a;
150             short8 sb = cast(short8)b;
151             foreach(i; 0..8)
152                 res[i] = saturateSignedIntToSignedShort(sa.array[i] + sb.array[i]);
153             return _mm_loadu_si128(cast(int4*)res.ptr);
154         }
155         else
156             return cast(__m128i) __builtin_ia32_paddsw128(cast(short8)a, cast(short8)b);
157     }
158     else
159     {
160         short[8] res;
161         short8 sa = cast(short8)a;
162         short8 sb = cast(short8)b;
163         foreach(i; 0..8)
164             res[i] = saturateSignedIntToSignedShort(sa.array[i] + sb.array[i]);
165         return _mm_loadu_si128(cast(int4*)res.ptr);
166     }
167 }
168 unittest
169 {
170     short8 res = cast(short8) _mm_adds_epi16(_mm_set_epi16(7, 6, 5, 4, 3, 2, 1, 0),
171                                              _mm_set_epi16(7, 6, 5, 4, 3, 2, 1, 0));
172     static immutable short[8] correctResult = [0, 2, 4, 6, 8, 10, 12, 14];
173     assert(res.array == correctResult);
174 }
175 
176 /// Add packed 8-bit signed integers in `a` and `b` using signed saturation.
177 __m128i _mm_adds_epi8(__m128i a, __m128i b) pure @trusted
178 {
179     static if (GDC_with_SSE2)
180     {
181         return cast(__m128i) __builtin_ia32_paddsb128(cast(ubyte16)a, cast(ubyte16)b);
182     }
183     else version(LDC)
184     {
185         static if (__VERSION__ >= 2085) // saturation x86 intrinsics disappeared in LLVM 8
186         {
187             // x86: Generates PADDSB since LDC 1.15 -O0
188             // ARM: Generates sqadd.16b since LDC 1.21 -O1, really bad in <= 1.20
189             enum prefix = `declare <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8> %a, <16 x i8> %b)`;
190             enum ir = `
191                 %r = call <16 x i8> @llvm.sadd.sat.v16i8( <16 x i8> %0, <16 x i8> %1)
192                 ret <16 x i8> %r`;
193             return cast(__m128i) LDCInlineIREx!(prefix, ir, "", byte16, byte16, byte16)(cast(byte16)a, cast(byte16)b);
194         }
195         else static if (LDC_with_ARM) // Raspberry ships with LDC 1.12, no saturation 
196         {
197             byte[16] res;
198             byte16 sa = cast(byte16)a;
199             byte16 sb = cast(byte16)b;
200             foreach(i; 0..16)
201                 res[i] = saturateSignedWordToSignedByte(sa[i] + sb[i]);
202             return _mm_loadu_si128(cast(int4*)res.ptr);
203         }
204         else
205             return cast(__m128i) __builtin_ia32_paddsb128(cast(byte16)a, cast(byte16)b);
206     }
207     else
208     {
209         byte[16] res;
210         byte16 sa = cast(byte16)a;
211         byte16 sb = cast(byte16)b;
212         foreach(i; 0..16)
213             res[i] = saturateSignedWordToSignedByte(sa[i] + sb[i]);
214         return _mm_loadu_si128(cast(int4*)res.ptr);
215     }
216 }
217 unittest
218 {
219     byte16 res = cast(byte16) _mm_adds_epi8(_mm_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0),
220                                             _mm_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0));
221     static immutable byte[16] correctResult = [0, 2, 4, 6, 8, 10, 12, 14,
222                                                16, 18, 20, 22, 24, 26, 28, 30];
223     assert(res.array == correctResult);
224 }
225 
226 /// Add packed 8-bit unsigned integers in `a` and `b` using unsigned saturation.
227 // PERF: #GDC version?
228 __m128i _mm_adds_epu8(__m128i a, __m128i b) pure @trusted
229 {
230     version(LDC)
231     {
232         static if (__VERSION__ >= 2085) // saturation x86 intrinsics disappeared in LLVM 8
233         {
234             // x86: Generates PADDUSB since LDC 1.15 -O0
235             // ARM: Generates uqadd.16b since LDC 1.21 -O1
236             enum prefix = `declare <16 x i8> @llvm.uadd.sat.v16i8(<16 x i8> %a, <16 x i8> %b)`;
237             enum ir = `
238                 %r = call <16 x i8> @llvm.uadd.sat.v16i8( <16 x i8> %0, <16 x i8> %1)
239                 ret <16 x i8> %r`;
240             return cast(__m128i) LDCInlineIREx!(prefix, ir, "", byte16, byte16, byte16)(cast(byte16)a, cast(byte16)b);
241         }
242         else static if (LDC_with_ARM) // Raspberry ships with LDC 1.12, no saturation 
243         {
244             ubyte[16] res;
245             byte16 sa = cast(byte16)a;
246             byte16 sb = cast(byte16)b;
247             foreach(i; 0..16)
248                 res[i] = saturateSignedWordToUnsignedByte(cast(ubyte)(sa.array[i]) + cast(ubyte)(sb.array[i]));
249             return _mm_loadu_si128(cast(int4*)res.ptr);
250         }
251         else
252             return __builtin_ia32_paddusb128(a, b);
253     }
254     else
255     {
256         ubyte[16] res;
257         byte16 sa = cast(byte16)a;
258         byte16 sb = cast(byte16)b;
259         foreach(i; 0..16)
260             res[i] = saturateSignedWordToUnsignedByte(cast(ubyte)(sa.array[i]) + cast(ubyte)(sb.array[i]));
261         return _mm_loadu_si128(cast(int4*)res.ptr);
262     }
263 }
264 unittest
265 {
266     byte16 res = cast(byte16) 
267         _mm_adds_epu8(_mm_set_epi8(7, 6, 5, 4, 3, 2, cast(byte)255, 0, 7, 6, 5, 4, 3, 2, cast(byte)255, 0),
268                       _mm_set_epi8(7, 6, 5, 4, 3, 2, 1, 0, 7, 6, 5, 4, 3, 2, 1, 0));
269     static immutable byte[16] correctResult = [0, cast(byte)255, 4, 6, 8, 10, 12, 14, 
270                                                0, cast(byte)255, 4, 6, 8, 10, 12, 14];
271     assert(res.array == correctResult);
272 }
273 
274 /// Add packed unsigned 16-bit integers in `a` and `b` using unsigned saturation.
275 // PERF: #GDC version?
276 __m128i _mm_adds_epu16(__m128i a, __m128i b) pure @trusted
277 {
278     version(LDC)
279     {
280         static if (__VERSION__ >= 2085) // saturation x86 intrinsics disappeared in LLVM 8
281         {
282             // x86: Generates PADDUSW since LDC 1.15 -O0
283             // ARM: Generates uqadd.8h since LDC 1.21 -O1
284             enum prefix = `declare <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> %a, <8 x i16> %b)`;
285             enum ir = `
286                 %r = call <8 x i16> @llvm.uadd.sat.v8i16( <8 x i16> %0, <8 x i16> %1)
287                 ret <8 x i16> %r`;
288             return cast(__m128i) LDCInlineIREx!(prefix, ir, "", short8, short8, short8)(cast(short8)a, cast(short8)b);
289         }
290         else static if (LDC_with_ARM) // Raspberry ships with LDC 1.12, no saturation 
291         {
292             ushort[8] res;
293             short8 sa = cast(short8)a;
294             short8 sb = cast(short8)b;
295             foreach(i; 0..8)
296                 res[i] = saturateSignedIntToUnsignedShort(cast(ushort)(sa.array[i]) + cast(ushort)(sb.array[i]));
297             return _mm_loadu_si128(cast(int4*)res.ptr);
298         }
299         else
300             return __builtin_ia32_paddusw128(a, b);
301     }
302     else
303     {
304         ushort[8] res;
305         short8 sa = cast(short8)a;
306         short8 sb = cast(short8)b;
307         foreach(i; 0..8)
308             res[i] = saturateSignedIntToUnsignedShort(cast(ushort)(sa.array[i]) + cast(ushort)(sb.array[i]));
309         return _mm_loadu_si128(cast(int4*)res.ptr);
310     }
311 }
312 unittest
313 {
314     short8 res = cast(short8) _mm_adds_epu16(_mm_set_epi16(3, 2, cast(short)65535, 0, 3, 2, cast(short)65535, 0),
315                                              _mm_set_epi16(3, 2, 1, 0, 3, 2, 1, 0));
316     static immutable short[8] correctResult = [0, cast(short)65535, 4, 6, 0, cast(short)65535, 4, 6];
317     assert(res.array == correctResult);
318 }
319 
320 /// Compute the bitwise AND of packed double-precision (64-bit) 
321 /// floating-point elements in `a` and `b`.
322 __m128d _mm_and_pd (__m128d a, __m128d b) pure @safe
323 {
324     pragma(inline, true);
325     return cast(__m128d)( cast(long2)a & cast(long2)b );
326 }
327 unittest
328 {
329     double a = 4.32;
330     double b = -78.99;
331     long correct = (*cast(long*)(&a)) & (*cast(long*)(&b));
332     __m128d A = _mm_set_pd(a, b);
333     __m128d B = _mm_set_pd(b, a);
334     long2 R = cast(long2)( _mm_and_pd(A, B) );
335     assert(R.array[0] == correct);
336     assert(R.array[1] == correct);
337 }
338 
339 /// Compute the bitwise AND of 128 bits (representing integer data) in `a` and `b`.
340 __m128i _mm_and_si128 (__m128i a, __m128i b) pure @safe
341 {
342     pragma(inline, true);
343     return a & b;
344 }
345 unittest
346 {
347     __m128i A = _mm_set1_epi32(7);
348     __m128i B = _mm_set1_epi32(14);
349     __m128i R = _mm_and_si128(A, B);
350     int[4] correct = [6, 6, 6, 6];
351     assert(R.array == correct);
352 }
353 
354 /// Compute the bitwise NOT of packed double-precision (64-bit) 
355 /// floating-point elements in `a` and then AND with `b`.
356 __m128d _mm_andnot_pd (__m128d a, __m128d b) pure @safe
357 {
358     return cast(__m128d)( ~(cast(long2)a) & cast(long2)b);
359 }
360 unittest
361 {
362     double a = 4.32;
363     double b = -78.99;
364     long correct  = (~*cast(long*)(&a)) & ( *cast(long*)(&b));
365     long correct2 = ( *cast(long*)(&a)) & (~*cast(long*)(&b));
366     __m128d A = _mm_setr_pd(a, b);
367     __m128d B = _mm_setr_pd(b, a);
368     long2 R = cast(long2)( _mm_andnot_pd(A, B) );
369     assert(R.array[0] == correct);
370     assert(R.array[1] == correct2);
371 }
372 
373 /// Compute the bitwise NOT of 128 bits (representing integer data) 
374 /// in `a` and then AND with `b`.
375 __m128i _mm_andnot_si128 (__m128i a, __m128i b) pure @safe
376 {
377     return (~a) & b;
378 }
379 unittest
380 {
381     __m128i A = _mm_set1_epi32(7);
382     __m128i B = _mm_set1_epi32(14);
383     __m128i R = _mm_andnot_si128(A, B);
384     int[4] correct = [8, 8, 8, 8];
385     assert(R.array == correct);
386 }
387 
388 /// Average packed unsigned 16-bit integers in `a` and `b`.
389 __m128i _mm_avg_epu16 (__m128i a, __m128i b) pure @trusted
390 {
391     static if (GDC_with_SSE2)
392     {
393         return cast(__m128i) __builtin_ia32_pavgw128(cast(short8)a, cast(short8)b);
394     }
395     else static if (LDC_with_ARM64)
396     {
397         return cast(__m128i) vrhadd_u16(cast(short8)a, cast(short8)b);
398     }
399     else version(LDC)
400     {
401         // Generates pavgw even in LDC 1.0, even in -O0
402         // But not in ARM
403         enum ir = `
404             %ia = zext <8 x i16> %0 to <8 x i32>
405             %ib = zext <8 x i16> %1 to <8 x i32>
406             %isum = add <8 x i32> %ia, %ib
407             %isum1 = add <8 x i32> %isum, < i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
408             %isums = lshr <8 x i32> %isum1, < i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
409             %r = trunc <8 x i32> %isums to <8 x i16>
410             ret <8 x i16> %r`;
411         return cast(__m128i) LDCInlineIR!(ir, short8, short8, short8)(cast(short8)a, cast(short8)b);
412     }
413     else
414     {
415         short8 sa = cast(short8)a;
416         short8 sb = cast(short8)b;
417         short8 sr = void;
418         foreach(i; 0..8)
419         {
420             sr.ptr[i] = cast(ushort)( (cast(ushort)(sa.array[i]) + cast(ushort)(sb.array[i]) + 1) >> 1 );
421         }
422         return cast(int4)sr;
423     }
424 }
425 unittest
426 {
427     __m128i A = _mm_set1_epi16(31);
428     __m128i B = _mm_set1_epi16(64);
429     short8 avg = cast(short8)(_mm_avg_epu16(A, B));
430     foreach(i; 0..8)
431         assert(avg.array[i] == 48);
432 }
433 
434 /// Average packed unsigned 8-bit integers in `a` and `b`.
435 __m128i _mm_avg_epu8 (__m128i a, __m128i b) pure @trusted
436 {
437     static if (GDC_with_SSE2)
438     {
439         return cast(__m128i) __builtin_ia32_pavgb128(cast(ubyte16)a, cast(ubyte16)b);
440     }
441     else static if (LDC_with_ARM64)
442     {
443         return cast(__m128i) vrhadd_u8(cast(byte16)a, cast(byte16)b);
444     }
445     else version(LDC)
446     {
447         // Generates pavgb even in LDC 1.0, even in -O0
448         // But not in ARM
449         enum ir = `
450             %ia = zext <16 x i8> %0 to <16 x i16>
451             %ib = zext <16 x i8> %1 to <16 x i16>
452             %isum = add <16 x i16> %ia, %ib
453             %isum1 = add <16 x i16> %isum, < i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
454             %isums = lshr <16 x i16> %isum1, < i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
455             %r = trunc <16 x i16> %isums to <16 x i8>
456             ret <16 x i8> %r`;
457         return cast(__m128i) LDCInlineIR!(ir, byte16, byte16, byte16)(cast(byte16)a, cast(byte16)b);
458     }
459     else
460     {
461         byte16 sa = cast(byte16)a;
462         byte16 sb = cast(byte16)b;
463         byte16 sr = void;
464         foreach(i; 0..16)
465         {
466             sr[i] = cast(ubyte)( (cast(ubyte)(sa[i]) + cast(ubyte)(sb[i]) + 1) >> 1 );
467         }
468         return cast(int4)sr;
469     }
470 }
471 unittest
472 {
473     __m128i A = _mm_set1_epi8(31);
474     __m128i B = _mm_set1_epi8(64);
475     byte16 avg = cast(byte16)(_mm_avg_epu8(A, B));
476     foreach(i; 0..16)
477         assert(avg.array[i] == 48);
478 }
479 
480 /// Shift `a` left by `bytes` bytes while shifting in zeros.
481 alias _mm_bslli_si128 = _mm_slli_si128;
482 unittest
483 {
484     __m128i toShift = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
485     byte[16] exact =               [0, 0, 0, 0, 0, 0, 1, 2, 3, 4,  5,  6,  7,  8,  9, 10];
486     __m128i result = _mm_bslli_si128!5(toShift);
487     assert( (cast(byte16)result).array == exact);
488 }
489 
490 /// Shift `v` right by `bytes` bytes while shifting in zeros.
491 alias _mm_bsrli_si128 = _mm_srli_si128;
492 unittest
493 {
494     __m128i toShift = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
495     byte[16] exact =               [5, 6, 7, 8, 9,10,11,12,13,14, 15,  0,  0,  0,  0,  0];
496     __m128i result = _mm_bsrli_si128!5(toShift);
497     assert( (cast(byte16)result).array == exact);
498 }
499 
500 /// Cast vector of type `__m128d` to type `__m128`. 
501 /// Note: Also possible with a regular `cast(__m128)(a)`.
502 __m128 _mm_castpd_ps (__m128d a) pure @safe
503 {
504     return cast(__m128)a;
505 }
506 
507 /// Cast vector of type `__m128d` to type `__m128i`. 
508 /// Note: Also possible with a regular `cast(__m128i)(a)`.
509 __m128i _mm_castpd_si128 (__m128d a) pure @safe
510 {
511     return cast(__m128i)a;
512 }
513 
514 /// Cast vector of type `__m128` to type `__m128d`. 
515 /// Note: Also possible with a regular `cast(__m128d)(a)`.
516 __m128d _mm_castps_pd (__m128 a) pure @safe
517 {
518     return cast(__m128d)a;
519 }
520 
521 /// Cast vector of type `__m128` to type `__m128i`. 
522 /// Note: Also possible with a regular `cast(__m128i)(a)`.
523 __m128i _mm_castps_si128 (__m128 a) pure @safe
524 {
525     return cast(__m128i)a;
526 }
527 
528 /// Cast vector of type `__m128i` to type `__m128d`. 
529 /// Note: Also possible with a regular `cast(__m128d)(a)`.
530 __m128d _mm_castsi128_pd (__m128i a) pure @safe
531 {
532     return cast(__m128d)a;
533 }
534 
535 /// Cast vector of type `__m128i` to type `__m128`. 
536 /// Note: Also possible with a regular `cast(__m128)(a)`.
537 __m128 _mm_castsi128_ps (__m128i a) pure @safe
538 {
539     return cast(__m128)a;
540 }
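// The cast intrinsics only reinterpret bits, so a round-trip must preserve values exactly.
unittest
{
    __m128d A = _mm_setr_pd(1.0, -2.0);
    __m128i B = _mm_castpd_si128(A);
    __m128d C = _mm_castsi128_pd(B);
    assert(C.array[0] == 1.0 && C.array[1] == -2.0);
}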
541 
542 /// Invalidate and flush the cache line that contains `p` 
543 /// from all levels of the cache hierarchy.
544 void _mm_clflush (const(void)* p) @trusted
545 {
546     static if (GDC_with_SSE2)
547     {
548         __builtin_ia32_clflush(p);
549     }
550     else static if (LDC_with_SSE2)
551     {
552         __builtin_ia32_clflush(cast(void*)p);
553     }
554     else version(D_InlineAsm_X86)
555     {
556         asm pure nothrow @nogc @safe
557         {
558             mov EAX, p;
559             clflush [EAX];
560         }
561     }
562     else version(D_InlineAsm_X86_64)
563     {
564         asm pure nothrow @nogc @safe
565         {
566             mov RAX, p;
567             clflush [RAX];
568         }
569     }
570     else 
571     {
572         // Do nothing. Invalidating cacheline does
573         // not affect correctness.
574     }
575 }
576 unittest
577 {
578     ubyte[64] cacheline;
579     _mm_clflush(cacheline.ptr);
580 }
581 
582 /// Compare packed 16-bit integers in `a` and `b` for equality.
583 __m128i _mm_cmpeq_epi16 (__m128i a, __m128i b) pure @safe
584 {
585     static if (GDC_with_SSE2)
586     {
587         return cast(__m128i) __builtin_ia32_pcmpeqw128(cast(short8)a, cast(short8)b);
588     }
589     else
590     {
591         return cast(__m128i) equalMask!short8(cast(short8)a, cast(short8)b);
592     }
593 }
594 unittest
595 {
596     short8   A = [-3, -2, -1,  0,  0,  1,  2,  3];
597     short8   B = [ 4,  3,  2,  1,  0, -1, -2, -3];
598     short[8] E = [ 0,  0,  0,  0, -1,  0,  0,  0];
599     short8   R = cast(short8)(_mm_cmpeq_epi16(cast(__m128i)A, cast(__m128i)B));
600     assert(R.array == E);
601 }
602 
603 /// Compare packed 32-bit integers in `a` and `b` for equality.
604 __m128i _mm_cmpeq_epi32 (__m128i a, __m128i b) pure @safe
605 {
606     static if (GDC_with_SSE2)
607     {
608         return __builtin_ia32_pcmpeqd128(a, b);
609     }
610     else
611     {
612         return equalMask!__m128i(a, b);
613     }
614 }
615 unittest
616 {
617     int4   A = [-3, -2, -1,  0];
618     int4   B = [ 4, -2,  2,  0];
619     int[4] E = [ 0, -1,  0, -1];
620     int4   R = cast(int4)(_mm_cmpeq_epi32(A, B));
621     assert(R.array == E);
622 }
623 
624 /// Compare packed 8-bit integers in `a` and `b` for equality.
625 __m128i _mm_cmpeq_epi8 (__m128i a, __m128i b) pure @safe
626 {
627     static if (GDC_with_SSE2)
628     {
629         return cast(__m128i) __builtin_ia32_pcmpeqb128(cast(ubyte16)a, cast(ubyte16)b);
630     }
631     else
632     {
633         return cast(__m128i) equalMask!byte16(cast(byte16)a, cast(byte16)b);
634     }
635 }
636 unittest
637 {
638     __m128i A = _mm_setr_epi8(1, 2, 3, 1, 2, 1, 1, 2, 3, 2, 1, 0, 0, 1, 2, 1);
639     __m128i B = _mm_setr_epi8(2, 2, 1, 2, 3, 1, 2, 3, 2, 1, 0, 0, 1, 2, 1, 1);
640     byte16 C = cast(byte16) _mm_cmpeq_epi8(A, B);
641     byte[16] correct =       [0,-1, 0, 0, 0,-1, 0, 0, 0, 0, 0,-1, 0, 0, 0, -1];
642     assert(C.array == correct);
643 }
644 
645 /// Compare packed double-precision (64-bit) floating-point elements 
646 /// in `a` and `b` for equality.
647 __m128d _mm_cmpeq_pd (__m128d a, __m128d b) pure @safe
648 {
649     static if (GDC_with_SSE2)
650     {
651         return __builtin_ia32_cmpeqpd(a, b);
652     }
653     else
654     {
655         return cast(__m128d) cmppd!(FPComparison.oeq)(a, b);
656     }
657 }
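// The result is a per-lane mask: all ones where equal, zero otherwise.
unittest
{
    __m128d A = _mm_setr_pd(1.0, 2.0);
    __m128d B = _mm_setr_pd(1.0, 3.0);
    long2 R = cast(long2) _mm_cmpeq_pd(A, B);
    assert(R.array[0] == -1);
    assert(R.array[1] == 0);
}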
658 
659 /// Compare the lower double-precision (64-bit) floating-point elements
660 /// in `a` and `b` for equality, store the result in the lower element,
661 /// and copy the upper element from `a`.
662 __m128d _mm_cmpeq_sd (__m128d a, __m128d b) pure @safe
663 {
664     static if (GDC_with_SSE2)
665     {
666         return __builtin_ia32_cmpeqsd(a, b);
667     }
668     else
669     {
670         return cast(__m128d) cmpsd!(FPComparison.oeq)(a, b);
671     }
672 }
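// Lower lane holds the comparison mask; the upper lane is copied from `a`.
unittest
{
    __m128d A = _mm_setr_pd(1.0, 2.0);
    __m128d B = _mm_setr_pd(1.0, 3.0);
    __m128d R = _mm_cmpeq_sd(A, B);
    assert((cast(long2)R).array[0] == -1);
    assert(R.array[1] == 2.0);
}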
673 
674 /// Compare packed double-precision (64-bit) floating-point elements 
675 /// in `a` and `b` for greater-than-or-equal.
676 __m128d _mm_cmpge_pd (__m128d a, __m128d b) pure @safe
677 {
678     static if (GDC_with_SSE2)
679     {
680         return __builtin_ia32_cmpgepd(a, b);
681     }
682     else
683     {
684         return cast(__m128d) cmppd!(FPComparison.oge)(a, b);
685     }
686 }
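// All ones where `a >= b` (ordered), zero otherwise.
unittest
{
    __m128d A = _mm_setr_pd(1.0, 2.0);
    __m128d B = _mm_setr_pd(1.0, 3.0);
    long2 R = cast(long2) _mm_cmpge_pd(A, B);
    assert(R.array[0] == -1);
    assert(R.array[1] == 0);
}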
687 
688 /// Compare the lower double-precision (64-bit) floating-point elements 
689 /// in `a` and `b` for greater-than-or-equal, store the result in the 
690 /// lower element, and copy the upper element from `a`.
691 __m128d _mm_cmpge_sd (__m128d a, __m128d b) pure @safe
692 {
693     // Note: There is no __builtin_ia32_cmpgesd builtin.
694     static if (GDC_with_SSE2)
695     {
696         return __builtin_ia32_cmpnltsd(b, a);
697     }
698     else
699     {
700         return cast(__m128d) cmpsd!(FPComparison.oge)(a, b);
701     }
702 }
703 
704 /// Compare packed 16-bit integers in `a` and `b` for greater-than.
705 __m128i _mm_cmpgt_epi16 (__m128i a, __m128i b) pure @safe
706 {
707     static if (GDC_with_SSE2)
708     {
709         return cast(__m128i) __builtin_ia32_pcmpgtw128(cast(short8)a, cast(short8)b);
710     }
711     else
712     {
713         return cast(__m128i) greaterMask!short8(cast(short8)a, cast(short8)b);
714     }
715 }
716 unittest
717 {
718     short8   A = [-3, -2, -1,  0,  0,  1,  2,  3];
719     short8   B = [ 4,  3,  2,  1,  0, -1, -2, -3];
720     short[8] E = [ 0,  0,  0,  0,  0, -1, -1, -1];
721     short8   R = cast(short8)(_mm_cmpgt_epi16(cast(__m128i)A, cast(__m128i)B));
722     assert(R.array == E);
723 }
724 
725 /// Compare packed 32-bit integers in `a` and `b` for greater-than.
726 __m128i _mm_cmpgt_epi32 (__m128i a, __m128i b) pure @safe
727 {
728     static if (GDC_with_SSE2)
729     {
730         return __builtin_ia32_pcmpgtd128(a, b); 
731     }
732     else
733     {
734         return cast(__m128i)( greaterMask!int4(a, b));
735     }
736 }
737 unittest
738 {
739     int4   A = [-3,  2, -1,  0];
740     int4   B = [ 4, -2,  2,  0];
741     int[4] E = [ 0, -1,  0,  0];
742     int4   R = cast(int4)(_mm_cmpgt_epi32(A, B));
743     assert(R.array == E);
744 }
745 
746 /// Compare packed 8-bit integers in `a` and `b` for greater-than.
747 __m128i _mm_cmpgt_epi8 (__m128i a, __m128i b) pure @safe
748 {
749     static if (GDC_with_SSE2)
750     {
751         return cast(__m128i) __builtin_ia32_pcmpgtb128(cast(ubyte16)a, cast(ubyte16)b);
752     }
753     else
754     {
755         return cast(__m128i) greaterMask!byte16(cast(byte16)a, cast(byte16)b);
756     }
757 }
758 unittest
759 {
760     __m128i A = _mm_setr_epi8(1, 2, 3, 1, 2, 1, 1, 2, 3, 2, 1, 0, 0, 1, 2, 1);
761     __m128i B = _mm_setr_epi8(2, 2, 1, 2, 3, 1, 2, 3, 2, 1, 0, 0, 1, 2, 1, 1);
762     byte16 C = cast(byte16) _mm_cmpgt_epi8(A, B);
763     byte[16] correct =       [0, 0,-1, 0, 0, 0, 0, 0,-1,-1,-1, 0, 0, 0,-1, 0];
764     __m128i D = _mm_cmpeq_epi8(A, B);
765     assert(C.array == correct);
766 }
767 
768 /// Compare packed double-precision (64-bit) floating-point elements 
769 /// in `a` and `b` for greater-than.
770 __m128d _mm_cmpgt_pd (__m128d a, __m128d b) pure @safe
771 {
772     static if (GDC_with_SSE2)
773     {
774         return __builtin_ia32_cmpgtpd(a, b); 
775     }
776     else
777     {
778         return cast(__m128d) cmppd!(FPComparison.ogt)(a, b);
779     }
780 }
781 
782 /// Compare the lower double-precision (64-bit) floating-point elements 
783 /// in `a` and `b` for greater-than, store the result in the lower element,
784 /// and copy the upper element from `a`.
785 __m128d _mm_cmpgt_sd (__m128d a, __m128d b) pure @safe
786 {
787     // Note: There is no __builtin_ia32_cmpgtsd builtin.
788     static if (GDC_with_SSE2)
789     {
790         return __builtin_ia32_cmpnlesd(b, a);
791     }
792     else
793     {
794         return cast(__m128d) cmpsd!(FPComparison.ogt)(a, b);
795     }
796 }
797 
798 /// Compare packed double-precision (64-bit) floating-point elements 
799 /// in `a` and `b` for less-than-or-equal.
800 __m128d _mm_cmple_pd (__m128d a, __m128d b) pure @safe
801 {
802     static if (GDC_with_SSE2)
803     {
804         return __builtin_ia32_cmplepd(a, b); 
805     }
806     else
807     {
808         return cast(__m128d) cmppd!(FPComparison.ole)(a, b);
809     }
810 }
811 
812 /// Compare the lower double-precision (64-bit) floating-point elements 
813 /// in `a` and `b` for less-than-or-equal, store the result in the 
814 /// lower element, and copy the upper element from `a`.
815 __m128d _mm_cmple_sd (__m128d a, __m128d b) pure @safe
816 {
817     static if (GDC_with_SSE2)
818     {
819         return __builtin_ia32_cmplesd(a, b); 
820     }
821     else
822     {
823         return cast(__m128d) cmpsd!(FPComparison.ole)(a, b);
824     }
825 }
826 
827 /// Compare packed 16-bit integers in `a` and `b` for less-than.
828 __m128i _mm_cmplt_epi16 (__m128i a, __m128i b) pure @safe
829 {
830     return _mm_cmpgt_epi16(b, a);
831 }
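// Mirrors the _mm_cmpgt_epi16 test above: less-than is greater-than with operands swapped.
unittest
{
    short8   A = [1, 2, 3, 4, -5, 6, 7, 8];
    short8   B = [2, 2, 2, 2,  2, 2, 2, 2];
    short[8] E = [-1, 0, 0, 0, -1, 0, 0, 0];
    short8   R = cast(short8)(_mm_cmplt_epi16(cast(__m128i)A, cast(__m128i)B));
    assert(R.array == E);
}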
832 
833 /// Compare packed 32-bit integers in `a` and `b` for less-than.
834 __m128i _mm_cmplt_epi32 (__m128i a, __m128i b) pure @safe
835 {
836     return _mm_cmpgt_epi32(b, a);
837 }
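// Same swapped-operand relationship with _mm_cmpgt_epi32.
unittest
{
    int4   A = [-1, 0, 1, 2];
    int4   B = [ 0, 0, 0, 0];
    int[4] E = [-1, 0, 0, 0];
    int4   R = cast(int4)(_mm_cmplt_epi32(A, B));
    assert(R.array == E);
}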
838 
839 /// Compare packed 8-bit integers in `a` and `b` for less-than.
840 __m128i _mm_cmplt_epi8 (__m128i a, __m128i b) pure @safe
841 {
842     return _mm_cmpgt_epi8(b, a);
843 }
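// Same inputs as the _mm_cmpgt_epi8 test above; the expected mask is for the swapped comparison.
unittest
{
    __m128i A = _mm_setr_epi8(1, 2, 3, 1, 2, 1, 1, 2, 3, 2, 1, 0, 0, 1, 2, 1);
    __m128i B = _mm_setr_epi8(2, 2, 1, 2, 3, 1, 2, 3, 2, 1, 0, 0, 1, 2, 1, 1);
    byte16 C = cast(byte16) _mm_cmplt_epi8(A, B);
    byte[16] correct =      [-1, 0, 0,-1,-1, 0,-1,-1, 0, 0, 0, 0,-1,-1, 0, 0];
    assert(C.array == correct);
}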
844 
845 /// Compare packed double-precision (64-bit) floating-point elements
846 /// in `a` and `b` for less-than.
847 __m128d _mm_cmplt_pd (__m128d a, __m128d b) pure @safe
848 {
849     static if (GDC_with_SSE2)
850     {
851         return __builtin_ia32_cmpltpd(a, b); 
852     }
853     else
854     {
855         return cast(__m128d) cmppd!(FPComparison.olt)(a, b);
856     }
857 }
858 
859 /// Compare the lower double-precision (64-bit) floating-point elements
860 /// in `a` and `b` for less-than, store the result in the lower 
861 /// element, and copy the upper element from `a`.
862 __m128d _mm_cmplt_sd (__m128d a, __m128d b) pure @safe
863 {
864     static if (GDC_with_SSE2)
865     {
866         return __builtin_ia32_cmpltsd(a, b); 
867     }
868     else
869     {
870         return cast(__m128d) cmpsd!(FPComparison.olt)(a, b);
871     }
872 }
873 
874 /// Compare packed double-precision (64-bit) floating-point elements
875 /// in `a` and `b` for not-equal.
876 __m128d _mm_cmpneq_pd (__m128d a, __m128d b) pure @safe
877 {
878     static if (GDC_with_SSE2)
879     {
880         return __builtin_ia32_cmpneqpd(a, b); 
881     }
882     else
883     {
884         return cast(__m128d) cmppd!(FPComparison.une)(a, b);
885     }
886 }
887 
888 /// Compare the lower double-precision (64-bit) floating-point elements
889 /// in `a` and `b` for not-equal, store the result in the lower 
890 /// element, and copy the upper element from `a`.
891 __m128d _mm_cmpneq_sd (__m128d a, __m128d b) pure @safe
892 {
893     static if (GDC_with_SSE2)
894     {
895         return __builtin_ia32_cmpneqsd(a, b); 
896     }
897     else
898     {
899         return cast(__m128d) cmpsd!(FPComparison.une)(a, b);
900     }
901 }
902 
903 /// Compare packed double-precision (64-bit) floating-point elements 
904 /// in `a` and `b` for not-greater-than-or-equal.
905 __m128d _mm_cmpnge_pd (__m128d a, __m128d b) pure @safe
906 {
907     static if (GDC_with_SSE2)
908     {
909         return __builtin_ia32_cmpngepd(a, b); 
910     }
911     else
912     {
913         return cast(__m128d) cmppd!(FPComparison.ult)(a, b);
914     }
915 }
916 
917 /// Compare the lower double-precision (64-bit) floating-point elements 
918 /// in `a` and `b` for not-greater-than-or-equal, store the result in 
919 /// the lower element, and copy the upper element from `a`.
920 __m128d _mm_cmpnge_sd (__m128d a, __m128d b) pure @safe
921 {
922     // Note: There is no __builtin_ia32_cmpngesd builtin.
923     static if (GDC_with_SSE2)
924     {
925         return __builtin_ia32_cmpltsd(b, a); 
926     }
927     else
928     {
929         return cast(__m128d) cmpsd!(FPComparison.ult)(a, b);
930     }
931 }
932 
933 /// Compare packed double-precision (64-bit) floating-point elements 
934 /// in `a` and `b` for not-greater-than.
935 __m128d _mm_cmpngt_pd (__m128d a, __m128d b) pure @safe
936 {
937     static if (GDC_with_SSE2)
938     {
939         return __builtin_ia32_cmpngtpd(a, b);
940     }
941     else
942     {
943         return cast(__m128d) cmppd!(FPComparison.ule)(a, b);
944     }
945 }
946 
947 /// Compare the lower double-precision (64-bit) floating-point elements 
948 /// in `a` and `b` for not-greater-than, store the result in the 
949 /// lower element, and copy the upper element from `a`.
950 __m128d _mm_cmpngt_sd (__m128d a, __m128d b) pure @safe
951 {
952     // Note: There is no __builtin_ia32_cmpngtsd builtin.
953     static if (GDC_with_SSE2)
954     {
955         return __builtin_ia32_cmplesd(b, a);
956     }
957     else
958     {
959         return cast(__m128d) cmpsd!(FPComparison.ule)(a, b);
960     }
961 }
962 
963 /// Compare packed double-precision (64-bit) floating-point elements 
964 /// in `a` and `b` for not-less-than-or-equal.
965 __m128d _mm_cmpnle_pd (__m128d a, __m128d b) pure @safe
966 {
967     static if (GDC_with_SSE2)
968     {
969         return __builtin_ia32_cmpnlepd(a, b);
970     }
971     else
972     {
973         return cast(__m128d) cmppd!(FPComparison.ugt)(a, b);
974     }
975 }
976 
977 /// Compare the lower double-precision (64-bit) floating-point elements 
978 /// in `a` and `b` for not-less-than-or-equal, store the result in the 
979 /// lower element, and copy the upper element from `a`.
980 __m128d _mm_cmpnle_sd (__m128d a, __m128d b) pure @safe
981 {
982     static if (GDC_with_SSE2)
983     {
984         return __builtin_ia32_cmpnlesd(a, b);
985     }
986     else
987     {
988         return cast(__m128d) cmpsd!(FPComparison.ugt)(a, b);
989     }
990 }
991  
992 /// Compare packed double-precision (64-bit) floating-point elements 
993 /// in `a` and `b` for not-less-than.
994 __m128d _mm_cmpnlt_pd (__m128d a, __m128d b) pure @safe
995 {
996     static if (GDC_with_SSE2)
997     {
998         return __builtin_ia32_cmpnltpd(a, b);
999     }
1000     else
1001     {
1002         return cast(__m128d) cmppd!(FPComparison.uge)(a, b);
1003     }
1004 }
1005 
1006 /// Compare the lower double-precision (64-bit) floating-point elements 
1007 /// in `a` and `b` for not-less-than, store the result in the lower 
1008 /// element, and copy the upper element from `a`.
1009 __m128d _mm_cmpnlt_sd (__m128d a, __m128d b) pure @safe
1010 {
1011     static if (GDC_with_SSE2)
1012     {
1013         return __builtin_ia32_cmpnltsd(a, b);
1014     }
1015     else
1016     {
1017         return cast(__m128d) cmpsd!(FPComparison.uge)(a, b);
1018     }
1019 }
1020 
1021 /// Compare packed double-precision (64-bit) floating-point elements 
1022 /// in `a` and `b` to see if neither is NaN.
1023 __m128d _mm_cmpord_pd (__m128d a, __m128d b) pure @safe
1024 {
1025     static if (GDC_with_SSE2)
1026     {
1027         return __builtin_ia32_cmpordpd(a, b);
1028     }
1029     else
1030     {
1031         return cast(__m128d) cmppd!(FPComparison.ord)(a, b);
1032     }
1033 }
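// All ones only where neither lane is NaN.
unittest
{
    __m128d A = _mm_setr_pd(double.nan, 1.0);
    __m128d B = _mm_setr_pd(2.0, 3.0);
    long2 R = cast(long2) _mm_cmpord_pd(A, B);
    assert(R.array[0] == 0);
    assert(R.array[1] == -1);
}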
1034 
1035 /// Compare the lower double-precision (64-bit) floating-point elements 
1036 /// in `a` and `b` to see if neither is NaN, store the result in the 
1037 /// lower element, and copy the upper element from `a` to the upper element.
1038 __m128d _mm_cmpord_sd (__m128d a, __m128d b) pure @safe
1039 {
1040     static if (GDC_with_SSE2)
1041     {
1042         return __builtin_ia32_cmpordsd(a, b);
1043     }
1044     else
1045     {
1046         return cast(__m128d) cmpsd!(FPComparison.ord)(a, b);
1047     }
1048 }
1049 
1050 /// Compare packed double-precision (64-bit) floating-point elements 
1051 /// in `a` and `b` to see if either is NaN.
1052 __m128d _mm_cmpunord_pd (__m128d a, __m128d b) pure @safe
1053 {
1054     static if (GDC_with_SSE2)
1055     {
1056         return __builtin_ia32_cmpunordpd(a, b);
1057     }
1058     else
1059     {
1060         return cast(__m128d) cmppd!(FPComparison.uno)(a, b);
1061     }
1062 }
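// All ones where at least one lane is NaN.
unittest
{
    __m128d A = _mm_setr_pd(double.nan, 1.0);
    __m128d B = _mm_setr_pd(2.0, 3.0);
    long2 R = cast(long2) _mm_cmpunord_pd(A, B);
    assert(R.array[0] == -1);
    assert(R.array[1] == 0);
}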
1063 
1064 /// Compare the lower double-precision (64-bit) floating-point elements 
1065 /// in `a` and `b` to see if either is NaN, store the result in the lower 
1066 /// element, and copy the upper element from `a` to the upper element.
1067 __m128d _mm_cmpunord_sd (__m128d a, __m128d b) pure @safe
1068 {
1069     static if (GDC_with_SSE2)
1070     {
1071         return __builtin_ia32_cmpunordsd(a, b);
1072     }
1073     else
1074     {
1075         return cast(__m128d) cmpsd!(FPComparison.uno)(a, b);
1076     }
1077 }
1078 
1079 /// Compare the lower double-precision (64-bit) floating-point element 
1080 /// in `a` and `b` for equality, and return the boolean result (0 or 1).
1081 int _mm_comieq_sd (__m128d a, __m128d b) pure @safe
1082 {
    // Note: For some of the _mm_comixx_sd intrinsics, the NaN semantics are not the same as the
    // comisd instruction: the intrinsic returns false for unordered operands instead.
    //
    // C++ compilers actually disagree over the meaning of that instruction.
    // GCC handles NaNs like the comisd instruction (returns true if unordered),
    // but ICC, clang and MSVC deal with NaN like the Intel Intrinsics Guide says.
    // We choose to follow the majority; it seems GCC is buggy with NaNs.
1090     return a.array[0] == b.array[0];
1091 }
1092 unittest
1093 {
1094     assert(1 == _mm_comieq_sd(_mm_set_sd(78.0), _mm_set_sd(78.0)));
1095     assert(0 == _mm_comieq_sd(_mm_set_sd(78.0), _mm_set_sd(-78.0)));
1096     assert(0 == _mm_comieq_sd(_mm_set_sd(78.0), _mm_set_sd(double.nan)));
1097     assert(0 == _mm_comieq_sd(_mm_set_sd(double.nan), _mm_set_sd(-4.22)));
1098     assert(1 == _mm_comieq_sd(_mm_set_sd(0.0), _mm_set_sd(-0.0)));
1099 }
1100 
1101 /// Compare the lower double-precision (64-bit) floating-point element 
1102 /// in `a` and `b` for greater-than-or-equal, and return the boolean 
1103 /// result (0 or 1).
1104 int _mm_comige_sd (__m128d a, __m128d b) pure @safe
1105 {
1106     return a.array[0] >= b.array[0];
1107 }
1108 unittest
1109 {
1110     assert(1 == _mm_comige_sd(_mm_set_sd(78.0), _mm_set_sd(78.0)));
1111     assert(1 == _mm_comige_sd(_mm_set_sd(78.0), _mm_set_sd(-78.0)));
1112     assert(0 == _mm_comige_sd(_mm_set_sd(-78.0), _mm_set_sd(78.0)));
1113     assert(0 == _mm_comige_sd(_mm_set_sd(78.0), _mm_set_sd(double.nan)));
1114     assert(0 == _mm_comige_sd(_mm_set_sd(double.nan), _mm_set_sd(-4.22)));
1115     assert(1 == _mm_comige_sd(_mm_set_sd(-0.0), _mm_set_sd(0.0)));
1116 }
1117 
1118 /// Compare the lower double-precision (64-bit) floating-point element 
1119 /// in `a` and `b` for greater-than, and return the boolean result (0 or 1).
1120 int _mm_comigt_sd (__m128d a, __m128d b) pure @safe
1121 {
1122     return a.array[0] > b.array[0];
1123 }
1124 unittest
1125 {
1126     assert(0 == _mm_comigt_sd(_mm_set_sd(78.0), _mm_set_sd(78.0)));
1127     assert(1 == _mm_comigt_sd(_mm_set_sd(78.0), _mm_set_sd(-78.0)));
1128     assert(0 == _mm_comigt_sd(_mm_set_sd(78.0), _mm_set_sd(double.nan)));
1129     assert(0 == _mm_comigt_sd(_mm_set_sd(double.nan), _mm_set_sd(-4.22)));
1130     assert(0 == _mm_comigt_sd(_mm_set_sd(0.0), _mm_set_sd(-0.0)));
1131 }
1132 
1133 /// Compare the lower double-precision (64-bit) floating-point element 
1134 /// in `a` and `b` for less-than-or-equal.
1135 int _mm_comile_sd (__m128d a, __m128d b) pure @safe
1136 {
1137     return a.array[0] <= b.array[0];
1138 }
1139 unittest
1140 {
1141     assert(1 == _mm_comile_sd(_mm_set_sd(78.0), _mm_set_sd(78.0)));
1142     assert(0 == _mm_comile_sd(_mm_set_sd(78.0), _mm_set_sd(-78.0)));
1143     assert(1 == _mm_comile_sd(_mm_set_sd(-78.0), _mm_set_sd(78.0)));
1144     assert(0 == _mm_comile_sd(_mm_set_sd(78.0), _mm_set_sd(double.nan)));
1145     assert(0 == _mm_comile_sd(_mm_set_sd(double.nan), _mm_set_sd(-4.22)));
1146     assert(1 == _mm_comile_sd(_mm_set_sd(0.0), _mm_set_sd(-0.0)));
1147 }
1148 
1149 /// Compare the lower double-precision (64-bit) floating-point element 
1150 /// in `a` and `b` for less-than, and return the boolean result (0 or 1).
1151 int _mm_comilt_sd (__m128d a, __m128d b) pure @safe
1152 {
1153     return a.array[0] < b.array[0];
1154 }
1155 unittest
1156 {
1157     assert(0 == _mm_comilt_sd(_mm_set_sd(78.0), _mm_set_sd(78.0)));
1158     assert(0 == _mm_comilt_sd(_mm_set_sd(78.0), _mm_set_sd(-78.0)));
1159     assert(1 == _mm_comilt_sd(_mm_set_sd(-78.0), _mm_set_sd(78.0)));
1160     assert(0 == _mm_comilt_sd(_mm_set_sd(78.0), _mm_set_sd(double.nan)));
1161     assert(0 == _mm_comilt_sd(_mm_set_sd(double.nan), _mm_set_sd(-4.22)));
1162     assert(0 == _mm_comilt_sd(_mm_set_sd(-0.0), _mm_set_sd(0.0)));
1163 }
1164 
1165 /// Compare the lower double-precision (64-bit) floating-point element
1166 /// in `a` and `b` for not-equal, and return the boolean result (0 or 1).
1167 int _mm_comineq_sd (__m128d a, __m128d b) pure @safe
1168 {
1169     return a.array[0] != b.array[0];
1170 }
1171 unittest
1172 {
1173     assert(0 == _mm_comineq_sd(_mm_set_sd(78.0), _mm_set_sd(78.0)));
1174     assert(1 == _mm_comineq_sd(_mm_set_sd(78.0), _mm_set_sd(-78.0)));
1175     assert(1 == _mm_comineq_sd(_mm_set_sd(78.0), _mm_set_sd(double.nan)));
1176     assert(1 == _mm_comineq_sd(_mm_set_sd(double.nan), _mm_set_sd(-4.22)));
1177     assert(0 == _mm_comineq_sd(_mm_set_sd(0.0), _mm_set_sd(-0.0)));
1178 }
1179 
1180 /// Convert packed 32-bit integers in `a` to packed double-precision (64-bit)
1181 /// floating-point elements.
1182  __m128d _mm_cvtepi32_pd (__m128i a) pure @trusted
1183 {
1184     version(LDC)
1185     {
1186         // Generates cvtdq2pd since LDC 1.0, even without optimizations
1187         enum ir = `
1188             %v = shufflevector <4 x i32> %0,<4 x i32> %0, <2 x i32> <i32 0, i32 1>
1189             %r = sitofp <2 x i32> %v to <2 x double>
1190             ret <2 x double> %r`;
1191         return cast(__m128d) LDCInlineIR!(ir, __m128d, __m128i)(a);
1192     }
1193     else static if (GDC_with_SSE2)
1194     {
1195         return __builtin_ia32_cvtdq2pd(a);
1196     }
1197     else
1198     {
1199         double2 r = void;
1200         r.ptr[0] = a.array[0];
1201         r.ptr[1] = a.array[1];
1202         return r;
1203     }
1204 }
1205 unittest
1206 {
1207     __m128d A = _mm_cvtepi32_pd(_mm_set1_epi32(54));
1208     assert(A.array[0] == 54.0);
1209     assert(A.array[1] == 54.0);
1210 }
1211 
1212 /// Convert packed 32-bit integers in `a` to packed single-precision (32-bit) 
1213 /// floating-point elements.
1214 __m128 _mm_cvtepi32_ps(__m128i a) pure @trusted
1215 {
1216     static if (GDC_with_SSE2)
1217     {
1218         return __builtin_ia32_cvtdq2ps(a);
1219     }
1220     else version(LDC)
1221     {
1222         // See #86 for why we had to resort to LLVM IR.
1223         // Plain code below was leading to catastrophic behaviour. 
1224         // x86: Generates cvtdq2ps since LDC 1.1.0 -O0
        // ARM: Generates scvtf.4s since LDC 1.8.0 -O0
1226         enum ir = `
1227             %r = sitofp <4 x i32> %0 to <4 x float>
1228             ret <4 x float> %r`;
1229         return cast(__m128) LDCInlineIR!(ir, float4, int4)(a);
1230     }
1231     else
1232     {
1233         __m128 res;
1234         res.ptr[0] = cast(float)a.array[0];
1235         res.ptr[1] = cast(float)a.array[1];
1236         res.ptr[2] = cast(float)a.array[2];
1237         res.ptr[3] = cast(float)a.array[3];
1238         return res;
1239     }
1240 }
1241 unittest
1242 {
1243     __m128 a = _mm_cvtepi32_ps(_mm_setr_epi32(-1, 0, 1, 1000));
1244     assert(a.array == [-1.0f, 0.0f, 1.0f, 1000.0f]);
1245 }
1246 
1247 /// Convert packed double-precision (64-bit) floating-point elements 
1248 /// in `a` to packed 32-bit integers.
1249 __m128i _mm_cvtpd_epi32 (__m128d a) @trusted
1250 {
1251     // PERF ARM32
1252     static if (LDC_with_SSE2)
1253     {
1254         return __builtin_ia32_cvtpd2dq(a);
1255     }
1256     else static if (GDC_with_SSE2)
1257     {
1258         return __builtin_ia32_cvtpd2dq(a);
1259     }
1260     else static if (LDC_with_ARM64)
1261     {
1262         // Get current rounding mode.
1263         uint fpscr = arm_get_fpcr();
1264         long2 i;
1265         switch(fpscr & _MM_ROUND_MASK_ARM)
1266         {
1267             default:
1268             case _MM_ROUND_NEAREST_ARM:     i = vcvtnq_s64_f64(a); break;
1269             case _MM_ROUND_DOWN_ARM:        i = vcvtmq_s64_f64(a); break;
1270             case _MM_ROUND_UP_ARM:          i = vcvtpq_s64_f64(a); break;
1271             case _MM_ROUND_TOWARD_ZERO_ARM: i = vcvtzq_s64_f64(a); break;
1272         }
1273         int4 zero = 0;
1274         return cast(__m128i) shufflevector!(int4, 0, 2, 4, 6)(cast(int4)i, zero);
1275     }
1276     else
1277     {
1278         // PERF ARM32
1279         __m128i r = _mm_setzero_si128();
1280         r.ptr[0] = convertDoubleToInt32UsingMXCSR(a.array[0]);
1281         r.ptr[1] = convertDoubleToInt32UsingMXCSR(a.array[1]);
1282         return r;
1283     }
1284 }
1285 unittest
1286 {
1287     int4 A = _mm_cvtpd_epi32(_mm_set_pd(61.0, 55.0));
1288     assert(A.array[0] == 55 && A.array[1] == 61 && A.array[2] == 0 && A.array[3] == 0);
1289 }
1290 
1291 /// Convert packed double-precision (64-bit) floating-point elements in `v`
1292 /// to packed 32-bit integers
1293 __m64 _mm_cvtpd_pi32 (__m128d v) @safe
1294 {
1295     return to_m64(_mm_cvtpd_epi32(v));
1296 }
1297 unittest
1298 {
1299     int2 A = cast(int2) _mm_cvtpd_pi32(_mm_set_pd(61.0, 55.0));
1300     assert(A.array[0] == 55 && A.array[1] == 61);
1301 }
1302 
1303 /// Convert packed double-precision (64-bit) floating-point elements 
1304 /// in `a` to packed single-precision (32-bit) floating-point elements.
1305 __m128 _mm_cvtpd_ps (__m128d a) pure @trusted
1306 {
1307     static if (LDC_with_SSE2)
1308     {
1309         return __builtin_ia32_cvtpd2ps(a); // can't be done with IR unfortunately
1310     }
1311     else static if (GDC_with_SSE2)
1312     {
1313         return __builtin_ia32_cvtpd2ps(a);
1314     }
1315     else
1316     { 
1317         __m128 r = void;
1318         r.ptr[0] = a.array[0];
1319         r.ptr[1] = a.array[1];
1320         r.ptr[2] = 0;
1321         r.ptr[3] = 0;
1322         return r;
1323     }
1324 }
1325 unittest
1326 {
1327     __m128d A = _mm_set_pd(5.25, 4.0);
1328     __m128 B = _mm_cvtpd_ps(A);
1329     assert(B.array == [4.0f, 5.25f, 0, 0]);
1330 }
1331 
1332 /// Convert packed 32-bit integers in `v` to packed double-precision 
1333 /// (64-bit) floating-point elements.
1334 __m128d _mm_cvtpi32_pd (__m64 v) pure @safe
1335 {
1336     return _mm_cvtepi32_pd(to_m128i(v));
1337 }
1338 unittest
1339 {
1340     __m128d A = _mm_cvtpi32_pd(_mm_setr_pi32(4, -5));
1341     assert(A.array[0] == 4.0 && A.array[1] == -5.0);
1342 }
1343 
1344 /// Convert packed single-precision (32-bit) floating-point elements 
1345 /// in `a` to packed 32-bit integers
1346 __m128i _mm_cvtps_epi32 (__m128 a) @trusted
1347 {
1348     static if (LDC_with_SSE2)
1349     {
1350         return cast(__m128i) __builtin_ia32_cvtps2dq(a);
1351     }
1352     else static if (GDC_with_SSE2)
1353     {
1354         return __builtin_ia32_cvtps2dq(a);
1355     }
1356     else static if (LDC_with_ARM64)
1357     {
1358         // Get current rounding mode.
1359         uint fpscr = arm_get_fpcr();
1360         switch(fpscr & _MM_ROUND_MASK_ARM)
1361         {
1362             default:
1363             case _MM_ROUND_NEAREST_ARM:     return vcvtnq_s32_f32(a);
1364             case _MM_ROUND_DOWN_ARM:        return vcvtmq_s32_f32(a);
1365             case _MM_ROUND_UP_ARM:          return vcvtpq_s32_f32(a);
1366             case _MM_ROUND_TOWARD_ZERO_ARM: return vcvtzq_s32_f32(a);
1367         }
1368     }
1369     else
1370     {
1371         __m128i r = void;
1372         r.ptr[0] = convertFloatToInt32UsingMXCSR(a.array[0]);
1373         r.ptr[1] = convertFloatToInt32UsingMXCSR(a.array[1]);
1374         r.ptr[2] = convertFloatToInt32UsingMXCSR(a.array[2]);
1375         r.ptr[3] = convertFloatToInt32UsingMXCSR(a.array[3]);
1376         return r;
1377     }
1378 }
1379 unittest
1380 {
1381     // GDC bug #98607
1382     // https://gcc.gnu.org/bugzilla/show_bug.cgi?id=98607
1383     // GDC does not provide optimization barrier for rounding mode.
    // Worked around with different literals. This bug will likely only manifest in unittest.
1385     // GCC people provided no actual fix and instead say other compilers are buggy... when they aren't.
1386 
1387     uint savedRounding = _MM_GET_ROUNDING_MODE();
1388 
1389     _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST);
1390     __m128i A = _mm_cvtps_epi32(_mm_setr_ps(1.4f, -2.1f, 53.5f, -2.9f));
1391     assert(A.array == [1, -2, 54, -3]);
1392 
1393     _MM_SET_ROUNDING_MODE(_MM_ROUND_DOWN);
1394     A = _mm_cvtps_epi32(_mm_setr_ps(1.3f, -2.11f, 53.4f, -2.8f));
1395     assert(A.array == [1, -3, 53, -3]);
1396 
1397     _MM_SET_ROUNDING_MODE(_MM_ROUND_UP);
1398     A = _mm_cvtps_epi32(_mm_setr_ps(1.3f, -2.12f, 53.6f, -2.7f));
1399     assert(A.array == [2, -2, 54, -2]);
1400 
1401     _MM_SET_ROUNDING_MODE(_MM_ROUND_TOWARD_ZERO);
1402     A = _mm_cvtps_epi32(_mm_setr_ps(1.4f, -2.17f, 53.8f, -2.91f));
1403     assert(A.array == [1, -2, 53, -2]);
1404 
1405     _MM_SET_ROUNDING_MODE(savedRounding);
1406 }
1407 
1408 /// Convert packed single-precision (32-bit) floating-point elements 
1409 /// in `a` to packed double-precision (64-bit) floating-point elements.
1410 __m128d _mm_cvtps_pd (__m128 a) pure @trusted
1411 {
1412     version(LDC)
1413     {
1414         // Generates cvtps2pd since LDC 1.0 -O0
1415         enum ir = `
1416             %v = shufflevector <4 x float> %0,<4 x float> %0, <2 x i32> <i32 0, i32 1>
1417             %r = fpext <2 x float> %v to <2 x double>
1418             ret <2 x double> %r`;
1419         return cast(__m128d) LDCInlineIR!(ir, __m128d, __m128)(a);
1420     }
1421     else static if (GDC_with_SSE2)
1422     {
1423         return __builtin_ia32_cvtps2pd(a);
1424     }
1425     else
1426     {
1427         double2 r = void;
1428         r.ptr[0] = a.array[0];
1429         r.ptr[1] = a.array[1];
1430         return r;
1431     }
1432 }
1433 unittest
1434 {
1435     __m128d A = _mm_cvtps_pd(_mm_set1_ps(54.0f));
1436     assert(A.array[0] == 54.0);
1437     assert(A.array[1] == 54.0);
1438 }
1439 
1440 /// Copy the lower double-precision (64-bit) floating-point element of `a`.
1441 double _mm_cvtsd_f64 (__m128d a) pure @safe
1442 {
1443     return a.array[0];
1444 }
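// Returns the lower lane as a scalar double.
unittest
{
    assert(_mm_cvtsd_f64(_mm_setr_pd(-4.25, 1.0)) == -4.25);
}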
1445 
1446 /// Convert the lower double-precision (64-bit) floating-point element
1447 /// in `a` to a 32-bit integer.
1448 int _mm_cvtsd_si32 (__m128d a) @safe
1449 {
1450     static if (LDC_with_SSE2)
1451     {
1452         return __builtin_ia32_cvtsd2si(a);
1453     }
1454     else static if (GDC_with_SSE2)
1455     {
1456         return __builtin_ia32_cvtsd2si(a);
1457     }
1458     else
1459     {
1460         return convertDoubleToInt32UsingMXCSR(a[0]);
1461     }
1462 }
1463 unittest
1464 {
1465     assert(4 == _mm_cvtsd_si32(_mm_set1_pd(4.0)));
1466 }
1467 
1468 /// Convert the lower double-precision (64-bit) floating-point element in `a` to a 64-bit integer.
1469 long _mm_cvtsd_si64 (__m128d a) @trusted
1470 {
1471     version (LDC)
1472     {
1473         version (X86_64)
1474         {
1475             return __builtin_ia32_cvtsd2si64(a);
1476         }
1477         else
1478         {
            // Note: In 32-bit x86, there is no way to convert from float/double to 64-bit integer
            // using SSE instructions only. So the builtin doesn't exist for this arch.
1481             return convertDoubleToInt64UsingMXCSR(a[0]);
1482         }
1483     }
1484     else
1485     {
1486         return convertDoubleToInt64UsingMXCSR(a.array[0]);
1487     }
1488 }
1489 unittest
1490 {
1491     assert(-4 == _mm_cvtsd_si64(_mm_set1_pd(-4.0)));
1492 
1493     uint savedRounding = _MM_GET_ROUNDING_MODE();
1494 
1495     _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST);
1496     assert(-56468486186 == _mm_cvtsd_si64(_mm_set1_pd(-56468486186.49)));
1497 
1498     _MM_SET_ROUNDING_MODE(_MM_ROUND_DOWN);
1499     assert(-56468486187 == _mm_cvtsd_si64(_mm_set1_pd(-56468486186.1)));
1500 
1501     _MM_SET_ROUNDING_MODE(_MM_ROUND_UP);
1502     assert(56468486187 == _mm_cvtsd_si64(_mm_set1_pd(56468486186.1)));
1503 
1504     _MM_SET_ROUNDING_MODE(_MM_ROUND_TOWARD_ZERO);
1505     assert(-56468486186 == _mm_cvtsd_si64(_mm_set1_pd(-56468486186.9)));
1506 
1507     _MM_SET_ROUNDING_MODE(savedRounding);
1508 }
1509 
1510 deprecated("Use _mm_cvtsd_si64 instead") alias _mm_cvtsd_si64x = _mm_cvtsd_si64; ///
1511 
1512 /// Convert the lower double-precision (64-bit) floating-point element in `b` to a single-precision (32-bit) 
1513 /// floating-point element, store that in the lower element of result, and copy the upper 3 packed elements from `a`
1514 /// to the upper elements of result.
1515 __m128 _mm_cvtsd_ss (__m128 a, __m128d b) pure @trusted
1516 {
1517     static if (GDC_with_SSE2)
1518     {
1519         return __builtin_ia32_cvtsd2ss(a, b); 
1520     }
1521     else
1522     {
1523         // Generates cvtsd2ss since LDC 1.3 -O0
1524         a.ptr[0] = b.array[0];
1525         return a;
1526     }
1527 }
1528 unittest
1529 {
1530     __m128 R = _mm_cvtsd_ss(_mm_set1_ps(4.0f), _mm_set1_pd(3.0));
1531     assert(R.array == [3.0f, 4.0f, 4.0f, 4.0f]);
1532 }
1533 
1534 /// Get the lower 32-bit integer in `a`.
1535 int _mm_cvtsi128_si32 (__m128i a) pure @safe
1536 {
1537     return a.array[0];
1538 }
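// Returns the lowest 32-bit lane.
unittest
{
    assert(_mm_cvtsi128_si32(_mm_setr_epi32(-1, 2, 3, 4)) == -1);
}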
1539 
1540 /// Get the lower 64-bit integer in `a`.
1541 long _mm_cvtsi128_si64 (__m128i a) pure @safe
1542 {
1543     long2 la = cast(long2)a;
1544     return la.array[0];
1545 }
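// Returns the lowest 64-bit lane.
unittest
{
    assert(_mm_cvtsi128_si64(_mm_setr_epi64(-1, 456)) == -1);
}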
deprecated("Use _mm_cvtsi128_si64 instead") alias _mm_cvtsi128_si64x = _mm_cvtsi128_si64; ///
1547 
1548 /// Convert the signed 32-bit integer `b` to a double-precision (64-bit) floating-point element, store that in the 
1549 /// lower element of result, and copy the upper element from `a` to the upper element of result.
1550 __m128d _mm_cvtsi32_sd(__m128d a, int b) pure @trusted
1551 {
1552     a.ptr[0] = cast(double)b;
1553     return a;
1554 }
1555 unittest
1556 {
1557     __m128d a = _mm_cvtsi32_sd(_mm_set1_pd(0.0f), 42);
1558     assert(a.array == [42.0, 0]);
1559 }
1560 
1561 /// Copy 32-bit integer `a` to the lower element of result, and zero the upper elements.
1562 __m128i _mm_cvtsi32_si128 (int a) pure @trusted
1563 {
1564     int4 r = [0, 0, 0, 0];
1565     r.ptr[0] = a;
1566     return r;
1567 }
1568 unittest
1569 {
1570     __m128i a = _mm_cvtsi32_si128(65);
1571     assert(a.array == [65, 0, 0, 0]);
1572 }
1573 
1574 /// Convert the signed 64-bit integer `b` to a double-precision (64-bit) floating-point element, store the result in 
1575 /// the lower element of result, and copy the upper element from `a` to the upper element of result.
1577 __m128d _mm_cvtsi64_sd(__m128d a, long b) pure @trusted
1578 {
1579     a.ptr[0] = cast(double)b;
1580     return a;
1581 }
1582 unittest
1583 {
1584     __m128d a = _mm_cvtsi64_sd(_mm_set1_pd(0.0f), 42);
1585     assert(a.array == [42.0, 0]);
1586 }
1587 
1588 /// Copy 64-bit integer `a` to the lower element of result, and zero the upper element.
1589 __m128i _mm_cvtsi64_si128 (long a) pure @trusted
1590 {
1591     long2 r = [0, 0];
1592     r.ptr[0] = a;
1593     return cast(__m128i)(r);
1594 }
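// The upper 64-bit lane must be zeroed.
unittest
{
    long2 R = cast(long2) _mm_cvtsi64_si128(-42);
    assert(R.array[0] == -42);
    assert(R.array[1] == 0);
}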
1595 
1596 deprecated("Use _mm_cvtsi64_sd instead") alias _mm_cvtsi64x_sd = _mm_cvtsi64_sd; ///
1597 deprecated("Use _mm_cvtsi64_si128 instead") alias _mm_cvtsi64x_si128 = _mm_cvtsi64_si128; ///
1598 
1599 /// Convert the lower single-precision (32-bit) floating-point element in `b` to a double-precision (64-bit) 
1600 /// floating-point element, store that in the lower element of result, and copy the upper element from `a` to the upper 
/// element of result.
1602 double2 _mm_cvtss_sd(double2 a, float4 b) pure @trusted
1603 {
1604     a.ptr[0] = b.array[0];
1605     return a;
1606 }
1607 unittest
1608 {
1609     __m128d a = _mm_cvtss_sd(_mm_set1_pd(0.0f), _mm_set1_ps(42.0f));
1610     assert(a.array == [42.0, 0]);
1611 }
1612 
1613 /// Convert the lower single-precision (32-bit) floating-point element in `a` to a 64-bit integer with truncation.
1614 long _mm_cvttss_si64 (__m128 a) pure @safe
1615 {
1616     return cast(long)(a.array[0]); // Generates cvttss2si as expected
1617 }
1618 unittest
1619 {
1620     assert(1 == _mm_cvttss_si64(_mm_setr_ps(1.9f, 2.0f, 3.0f, 4.0f)));
1621 }
1622 
1623 /// Convert packed double-precision (64-bit) floating-point elements in `a` to packed 32-bit integers with truncation.
1624 /// Put zeroes in the upper elements of result.
1625 __m128i _mm_cvttpd_epi32 (__m128d a) pure @trusted
1626 {
1627     static if (LDC_with_SSE2)
1628     {
1629         return __builtin_ia32_cvttpd2dq(a);
1630     }
1631     else static if (GDC_with_SSE2)
1632     {
1633         return __builtin_ia32_cvttpd2dq(a);
1634     }
1635     else
1636     {
1637         // Note: doesn't generate cvttpd2dq as of LDC 1.13
1638         __m128i r;
1639         r.ptr[0] = cast(int)a.array[0];
1640         r.ptr[1] = cast(int)a.array[1];
1641         r.ptr[2] = 0;
1642         r.ptr[3] = 0;
1643         return r;
1644     }
1645 }
1646 unittest
1647 {
1648     __m128i R = _mm_cvttpd_epi32(_mm_setr_pd(-4.9, 45641.5f));
1649     assert(R.array == [-4, 45641, 0, 0]);
1650 }
1651 
1652 /// Convert packed double-precision (64-bit) floating-point elements in `v` 
1653 /// to packed 32-bit integers with truncation.
1654 __m64 _mm_cvttpd_pi32 (__m128d v) pure @safe
1655 {
1656     return to_m64(_mm_cvttpd_epi32(v));
1657 }
1658 unittest
1659 {
1660     int2 R = cast(int2) _mm_cvttpd_pi32(_mm_setr_pd(-4.9, 45641.7f));
1661     int[2] correct = [-4, 45641];
1662     assert(R.array == correct);
1663 }
1664 
1665 /// Convert packed single-precision (32-bit) floating-point elements in `a` to packed 32-bit integers with truncation.
1666 __m128i _mm_cvttps_epi32 (__m128 a) pure @trusted
1667 {
1668     // x86: Generates cvttps2dq since LDC 1.3 -O2
1669     // ARM64: generates fcvtze since LDC 1.8 -O2
1670     __m128i r;
1671     r.ptr[0] = cast(int)a.array[0];
1672     r.ptr[1] = cast(int)a.array[1];
1673     r.ptr[2] = cast(int)a.array[2];
1674     r.ptr[3] = cast(int)a.array[3];
1675     return r;
1676 }
1677 unittest
1678 {
1679     __m128i R = _mm_cvttps_epi32(_mm_setr_ps(-4.9, 45641.5f, 0.0f, 1.0f));
1680     assert(R.array == [-4, 45641, 0, 1]);
1681 }
1682 
1683 /// Convert the lower double-precision (64-bit) floating-point element in `a` to a 32-bit integer with truncation.
1684 int _mm_cvttsd_si32 (__m128d a)
1685 {
1686     // Generates cvttsd2si since LDC 1.3 -O0
1687     return cast(int)a.array[0];
1688 }
1689 
1690 /// Convert the lower double-precision (64-bit) floating-point element in `a` to a 64-bit integer with truncation.
1691 long _mm_cvttsd_si64 (__m128d a)
1692 {
1693     // Generates cvttsd2si since LDC 1.3 -O0
    // but on 32-bit x86, it's a long sequence that resorts to the FPU
1695     return cast(long)a.array[0];
1696 }
1697 
1698 deprecated("Use _mm_cvttsd_si64 instead") alias _mm_cvttsd_si64x = _mm_cvttsd_si64; ///
1699 
1700 /// Divide packed double-precision (64-bit) floating-point elements in `a` by packed elements in `b`.
1701 __m128d _mm_div_pd(__m128d a, __m128d b) pure @safe
1702 {
1703     pragma(inline, true);
1704     return a / b;
1705 }
1706 
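/// Divide the lower double-precision (64-bit) floating-point element in `a` by the lower double-precision (64-bit) 
/// floating-point element in `b`, store the result in the lower element of result, and copy the upper element from 
/// `a` to the upper element of result.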
1707 __m128d _mm_div_sd(__m128d a, __m128d b) pure @trusted
1708 {
1709     static if (GDC_with_SSE2)
1710     {
1711         return __builtin_ia32_divsd(a, b);
1712     }
1713     else version(DigitalMars)
1714     {
1715         // Work-around for https://issues.dlang.org/show_bug.cgi?id=19599
1716         // Note that this is unneeded since DMD >= 2.094.0 at least, haven't investigated again
1717         asm pure nothrow @nogc @trusted { nop;}
1718         a.array[0] = a.array[0] / b.array[0];
1719         return a;
1720     }
1721     else
1722     {
1723         a.ptr[0] /= b.array[0];
1724         return a;
1725     }
1726 }
1727 unittest
1728 {
1729     __m128d a = [2.0, 4.5];
1730     a = _mm_div_sd(a, a);
1731     assert(a.array == [1.0, 4.5]);
1732 }
1733 
1734 /// Extract a 16-bit integer from `v`, selected with `index`.
1735 /// Warning: the returned value is zero-extended to 32-bits.
1736 int _mm_extract_epi16(__m128i v, int index) pure @safe
1737 {
1738     short8 r = cast(short8)v;
1739     return cast(ushort)(r.array[index & 7]);
1740 }
1741 unittest
1742 {
1743     __m128i A = _mm_set_epi16(7, 6, 5, 4, 3, 2, 1, -1);
1744     assert(_mm_extract_epi16(A, 6) == 6);
1745     assert(_mm_extract_epi16(A, 0) == 65535);
1746     assert(_mm_extract_epi16(A, 5 + 8) == 5);
1747 }
1748 
1749 /// Copy `v`, and insert the 16-bit integer `i` at the location specified by `index`.
1750 __m128i _mm_insert_epi16 (__m128i v, int i, int index) @trusted
1751 {
1752     short8 r = cast(short8)v;
1753     r.ptr[index & 7] = cast(short)i;
1754     return cast(__m128i)r;
1755 }
1756 unittest
1757 {
1758     __m128i A = _mm_set_epi16(7, 6, 5, 4, 3, 2, 1, 0);
1759     short8 R = cast(short8) _mm_insert_epi16(A, 42, 6);
1760     short[8] correct = [0, 1, 2, 3, 4, 5, 42, 7];
1761     assert(R.array == correct);
1762 }
1763 
1764 
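/// Perform a serializing operation on all load-from-memory instructions that were issued prior to this instruction. 
/// Guarantees that every load instruction that precedes, in program order, the load fence instruction is globally 
/// visible before any load instruction which follows the fence in program order.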
1765 void _mm_lfence() @trusted
1766 {
1767     version(GNU)
1768     {
1770         static if (GDC_with_SSE2)
1771         {
1772             __builtin_ia32_lfence();
1773         }
1774         else version(X86)
1775         {
1776             asm pure nothrow @nogc @trusted
1777             {
1778                 "lfence;\n" : : : ;
1779             }
1780         }
1781         else
1782             static assert(false);
1783     }
1784     else static if (LDC_with_SSE2)
1785     {
1786         __builtin_ia32_lfence();
1787     }
1788     else static if (DMD_with_asm)
1789     {
1790         asm nothrow @nogc pure @safe
1791         {
1792             lfence;
1793         }
1794     }
1795     else version(LDC)
1796     {
1797         llvm_memory_fence(); // PERF actually generates mfence
1798     }
1799     else
1800         static assert(false);
1801 }
1802 unittest
1803 {
1804     _mm_lfence();
1805 }
1806 
1807 /// Load 128-bits (composed of 2 packed double-precision (64-bit) floating-point elements) from memory.
1808 /// `mem_addr` must be aligned on a 16-byte boundary or a general-protection exception may be generated.
1809 __m128d _mm_load_pd (const(double) * mem_addr) pure
1810 {
1811     pragma(inline, true);
1812     __m128d* aligned = cast(__m128d*)mem_addr;
1813     return *aligned;
1814 }
1815 unittest
1816 {
1817     align(16) double[2] S = [-5.0, 7.0];
1818     __m128d R = _mm_load_pd(S.ptr);
1819     assert(R.array == S);
1820 }
1821 
1822 /// Load a double-precision (64-bit) floating-point element from memory into both elements of dst.
1823 /// `mem_addr` does not need to be aligned on any particular boundary.
1824 __m128d _mm_load_pd1 (const(double)* mem_addr) pure
1825 {
1826     double m = *mem_addr;
1827     __m128d r;
1828     r.ptr[0] = m;
1829     r.ptr[1] = m;
1830     return r;
1831 }
1832 unittest
1833 {
1834     double what = 4;
1835     __m128d R = _mm_load_pd1(&what);
1836     double[2] correct = [4.0, 4];
1837     assert(R.array == correct);
1838 }
1839 
/// Load a double-precision (64-bit) floating-point element from memory into the lower element of result, and zero 
/// the upper element. `mem_addr` does not need to be aligned on any particular boundary.
1842 __m128d _mm_load_sd (const(double)* mem_addr) pure @trusted
1843 {
1844     double2 r = [0, 0];
1845     r.ptr[0] = *mem_addr;
1846     return r;
1847 }
1848 unittest
1849 {
1850     double x = -42;
1851     __m128d a = _mm_load_sd(&x);
1852     assert(a.array == [-42.0, 0.0]);
1853 }
1854 
1855 /// Load 128-bits of integer data from memory into dst. 
1856 /// `mem_addr` must be aligned on a 16-byte boundary or a general-protection exception may be generated.
__m128i _mm_load_si128 (const(__m128i)* mem_addr) pure @trusted // TODO: shouldn't be @trusted because of alignment, Issue #62
1858 {
1859     pragma(inline, true);
1860     return *mem_addr;
1861 }
1862 unittest
1863 {
1864     align(16) int[4] correct = [-1, 2, 3, 4];
1865     int4 A = cast(int4) _mm_load_si128(cast(__m128i*) correct.ptr);
1866     assert(A.array == correct);
1867 }
1868 
1869 alias _mm_load1_pd = _mm_load_pd1; ///
1870 
1871 /// Load a double-precision (64-bit) floating-point element from memory into the upper element of result, and copy the 
1872 /// lower element from `a` to result. `mem_addr` does not need to be aligned on any particular boundary.
1873 __m128d _mm_loadh_pd (__m128d a, const(double)* mem_addr) pure @trusted
1874 {
1875     pragma(inline, true);
1876     a.ptr[1] = *mem_addr;
1877     return a;
1878 }
1879 unittest
1880 {
1881     double A = 7.0;
1882     __m128d B = _mm_setr_pd(4.0, -5.0);
1883     __m128d R = _mm_loadh_pd(B, &A);
1884     double[2] correct = [ 4.0, 7.0 ];
1885     assert(R.array == correct);
1886 }
1887 
1888 /// Load 64-bit integer from memory into the first element of result. Zero out the other.
// Note: strange signature since the memory doesn't have to be aligned (Issue #60)
1890 __m128i _mm_loadl_epi64 (const(__m128i)* mem_addr) pure @trusted // TODO signature
1891 {
1892     pragma(inline, true);
1893     auto pLong = cast(const(long)*)mem_addr;
1894     long2 r = [0, 0];
1895     r.ptr[0] = *pLong;
1896     return cast(__m128i)(r);
1897 }
1898 unittest
1899 {
1900     long A = 0x7878787870707070;
1901     long2 R = cast(long2) _mm_loadl_epi64(cast(__m128i*)&A);
1902     long[2] correct = [0x7878787870707070, 0];
1903     assert(R.array == correct);
1904 }
1905 
1906 /// Load a double-precision (64-bit) floating-point element from memory into the lower element of result, and copy the 
1907 /// upper element from `a` to result. mem_addr does not need to be aligned on any particular boundary.
1908 __m128d _mm_loadl_pd (__m128d a, const(double)* mem_addr) pure @trusted
1909 {
1910     a.ptr[0] = *mem_addr;
1911     return a;
1912 }
1913 unittest
1914 {
1915     double A = 7.0;
1916     __m128d B = _mm_setr_pd(4.0, -5.0);
1917     __m128d R = _mm_loadl_pd(B, &A);
1918     double[2] correct = [ 7.0, -5.0 ];
1919     assert(R.array == correct);
1920 }
1921 
1922 /// Load 2 double-precision (64-bit) floating-point elements from memory into result in reverse order. 
1923 /// `mem_addr` must be aligned on a 16-byte boundary or a general-protection exception may be generated.
1924 __m128d _mm_loadr_pd (const(double)* mem_addr) pure @trusted
1925 {
1926     __m128d a = *cast(__m128d*)(mem_addr);
1927     __m128d r;
1928     r.ptr[0] = a.array[1];
1929     r.ptr[1] = a.array[0];
1930     return r;
1931 }
1932 unittest
1933 {
1934     align(16) double[2] A = [56.0, -74.0];
1935     __m128d R = _mm_loadr_pd(A.ptr);
1936     double[2] correct = [-74.0, 56.0];
1937     assert(R.array == correct);
1938 }
1939 
1940 /// Load 128-bits (composed of 2 packed double-precision (64-bit) floating-point elements) from memory. 
1941 /// `mem_addr` does not need to be aligned on any particular boundary.
1942 __m128d _mm_loadu_pd (const(double)* mem_addr) pure @trusted
1943 {
1944     pragma(inline, true);
1945     static if (GDC_with_SSE2)
1946     {
1947         return __builtin_ia32_loadupd(mem_addr); 
1948     }
1949     else version(LDC)
1950     {
1951         return loadUnaligned!(double2)(mem_addr);
1952     }
1953     else version(DigitalMars)
1954     {
1955         // Apparently inside __simd you can use aligned dereferences without fear.
1956         // That was issue 23048 on dlang's Bugzilla.
1957         static if (DMD_with_DSIMD)
1958         {
1959             return cast(__m128d)__simd(XMM.LODUPD, *cast(double2*)mem_addr);
1960         }
1961         else static if (SSESizedVectorsAreEmulated)
1962         {
            // Since this vector is emulated, it doesn't have alignment constraints
1964             // and as such we can just cast it.
1965             return *cast(__m128d*)(mem_addr);
1966         }
1967         else
1968         {
1969             __m128d result;
1970             result.ptr[0] = mem_addr[0];
1971             result.ptr[1] = mem_addr[1];
1972             return result;
1973         }
1974     }
1975     else
1976     {
1977         __m128d result;
1978         result.ptr[0] = mem_addr[0];
1979         result.ptr[1] = mem_addr[1];
1980         return result;
1981     }
1982 }
1983 unittest
1984 {
1985     double[2] A = [56.0, -75.0];
1986     __m128d R = _mm_loadu_pd(A.ptr);
1987     double[2] correct = [56.0, -75.0];
1988     assert(R.array == correct);
1989 }
1990 
1991 /// Load 128-bits of integer data from memory. `mem_addr` does not need to be aligned on any particular boundary.
1992 __m128i _mm_loadu_si128 (const(__m128i)* mem_addr) pure @trusted
1993 {
1994     pragma(inline, true);
1995     static if (GDC_with_SSE2)
1996     {
1997         return cast(__m128i) __builtin_ia32_loaddqu(cast(const(char*))mem_addr);
1998     }
1999     else
2000     {
2001         return loadUnaligned!(__m128i)(cast(int*)mem_addr);
2002     }
2003 }
2004 unittest
2005 {
2006     align(16) int[4] correct = [-1, 2, -3, 4];
2007     int4 A = cast(int4) _mm_loadu_si128(cast(__m128i*) correct.ptr);
2008     assert(A.array == correct);
2009 }
2010 
2011 /// Load unaligned 32-bit integer from memory into the first element of result.
2012 __m128i _mm_loadu_si32 (const(void)* mem_addr) pure @trusted
2013 {
2014     pragma(inline, true);
2015     int r = *cast(int*)(mem_addr);
2016     int4 result = [0, 0, 0, 0];
2017     result.ptr[0] = r;
2018     return result;
2019 }
2020 unittest
2021 {
2022     int r = 42;
2023     __m128i A = _mm_loadu_si32(&r);
2024     int[4] correct = [42, 0, 0, 0];
2025     assert(A.array == correct);
2026 }
2027 
2028 /// Multiply packed signed 16-bit integers in `a` and `b`, producing intermediate
2029 /// signed 32-bit integers. Horizontally add adjacent pairs of intermediate 32-bit integers,
2030 /// and pack the results in destination.
2031 __m128i _mm_madd_epi16 (__m128i a, __m128i b) pure @trusted
2032 {
2033     static if (GDC_with_SSE2)
2034     {
2035         return cast(__m128i) __builtin_ia32_pmaddwd128(cast(short8)a, cast(short8)b);
2036     }
2037     else static if (LDC_with_SSE2)
2038     {
2039         return cast(__m128i) __builtin_ia32_pmaddwd128(cast(short8)a, cast(short8)b);
2040     }
2041     else static if (LDC_with_ARM64)
2042     {
2043         int4 pl = vmull_s16(vget_low_s16(cast(short8)a), vget_low_s16(cast(short8)b));
2044         int4 ph = vmull_s16(vget_high_s16(cast(short8)a), vget_high_s16(cast(short8)b));
2045         int2 rl = vpadd_s32(vget_low_s32(pl), vget_high_s32(pl));
2046         int2 rh = vpadd_s32(vget_low_s32(ph), vget_high_s32(ph));
2047         return vcombine_s32(rl, rh);
2048     }
2049     else
2050     {
2051         short8 sa = cast(short8)a;
2052         short8 sb = cast(short8)b;
2053         int4 r;
2054         foreach(i; 0..4)
2055         {
2056             r.ptr[i] = sa.array[2*i] * sb.array[2*i] + sa.array[2*i+1] * sb.array[2*i+1];
2057         }
2058         return r;
2059     }
2060 }
2061 unittest
2062 {
2063     short8 A = [0, 1, 2, 3, -32768, -32768, 32767, 32767];
2064     short8 B = [0, 1, 2, 3, -32768, -32768, 32767, 32767];
2065     int4 R = _mm_madd_epi16(cast(__m128i)A, cast(__m128i)B);
2066     int[4] correct = [1, 13, -2147483648, 2*32767*32767];
2067     assert(R.array == correct);
2068 }
2069 
2070 /// Conditionally store 8-bit integer elements from `a` into memory using `mask`
2071 /// (elements are not stored when the highest bit is not set in the corresponding element)
2072 /// and a non-temporal memory hint. `mem_addr` does not need to be aligned on any particular
2073 /// boundary.
2074 void _mm_maskmoveu_si128 (__m128i a, __m128i mask, void* mem_addr) @trusted
2075 {
2076     static if (GDC_with_SSE2)
2077     {    
2078         return __builtin_ia32_maskmovdqu(cast(ubyte16)a, cast(ubyte16)mask, cast(char*)mem_addr);
2079     }
2080     else static if (LDC_with_SSE2)
2081     {
2082         return __builtin_ia32_maskmovdqu(cast(byte16)a, cast(byte16)mask, cast(char*)mem_addr);
2083     }
2084     else static if (LDC_with_ARM64)
2085     {
2086         // PERF: catastrophic on ARM32
2087         byte16 bmask  = cast(byte16)mask;
2088         byte16 shift = 7;
2089         bmask = bmask >> shift; // sign-extend to have a 0xff or 0x00 mask
2090         mask = cast(__m128i) bmask;
2091         __m128i dest = loadUnaligned!__m128i(cast(int*)mem_addr);
2092         dest = (a & mask) | (dest & ~mask);
2093         storeUnaligned!__m128i(dest, cast(int*)mem_addr);
2094     }
2095     else
2096     {
2097         byte16 b = cast(byte16)a;
2098         byte16 m = cast(byte16)mask;
2099         byte* dest = cast(byte*)(mem_addr);
2100         foreach(j; 0..16)
2101         {
2102             if (m.array[j] & 128)
2103             {
2104                 dest[j] = b.array[j];
2105             }
2106         }
2107     }
2108 }
2109 unittest
2110 {
2111     ubyte[16] dest =           [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42];
2112     __m128i mask = _mm_setr_epi8(0,-1, 0,-1,-1, 1,-1,-1, 0,-1,-4,-1,-1, 0,-127, 0);
2113     __m128i A    = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15);
2114     _mm_maskmoveu_si128(A, mask, dest.ptr);
2115     ubyte[16] correct =        [42, 1,42, 3, 4,42, 6, 7,42, 9,10,11,12,42,14,42];
2116     assert(dest == correct);
2117 }
2118 
2119 /// Compare packed signed 16-bit integers in `a` and `b`, and return packed maximum values.
2120 __m128i _mm_max_epi16 (__m128i a, __m128i b) pure @safe
2121 {
2122     static if (GDC_with_SSE2)
2123     {
2124         return cast(__m128i) __builtin_ia32_pmaxsw128(cast(short8)a, cast(short8)b);
2125     }
2126     else version(LDC)
2127     {
2128         // x86: pmaxsw since LDC 1.0 -O1
        // ARM: smax.8h since LDC 1.5 -O1
2130         short8 sa = cast(short8)a;
2131         short8 sb = cast(short8)b;
2132         short8 greater = greaterMask!short8(sa, sb);
2133         return cast(__m128i)( (greater & sa) | (~greater & sb) );
2134     }
2135     else
2136     {
2137         __m128i lowerShorts = _mm_cmpgt_epi16(a, b); // ones where a should be selected, b else
2138         __m128i aTob = _mm_xor_si128(a, b); // a ^ (a ^ b) == b
2139         __m128i mask = _mm_and_si128(aTob, lowerShorts);
2140         return _mm_xor_si128(b, mask);
2141     }
2142 }
2143 unittest
2144 {
2145     short8 R = cast(short8) _mm_max_epi16(_mm_setr_epi16(32767, 1, -4, -8, 9,  7, 0,-57),
2146                                           _mm_setr_epi16(-4,-8,  9,  7, 0,-32768, 0,  0));
2147     short[8] correct =                                  [32767, 1,  9,  7, 9,  7, 0,  0];
2148     assert(R.array == correct);
2149 }
2150 
/// Compare packed unsigned 8-bit integers in `a` and `b`, and return packed maximum values.
2152 __m128i _mm_max_epu8 (__m128i a, __m128i b) pure @safe
2153 {
2154     version(LDC)
2155     {
2156         // x86: pmaxub since LDC 1.0.0 -O1
2157         // ARM64: umax.16b since LDC 1.5.0 -O1
2158         // PERF: catastrophic on ARM32
2159         ubyte16 sa = cast(ubyte16)a;
2160         ubyte16 sb = cast(ubyte16)b;
2161         ubyte16 greater = cast(ubyte16) greaterMask!ubyte16(sa, sb);
2162         return cast(__m128i)( (greater & sa) | (~greater & sb) );
2163     }
2164     else
2165     {
2166         __m128i value128 = _mm_set1_epi8(-128);
2167         __m128i higher = _mm_cmpgt_epi8(_mm_add_epi8(a, value128), _mm_add_epi8(b, value128)); // signed comparison
2168         __m128i aTob = _mm_xor_si128(a, b); // a ^ (a ^ b) == b
2169         __m128i mask = _mm_and_si128(aTob, higher);
2170         return _mm_xor_si128(b, mask);
2171     }
2172 }
2173 unittest
2174 {
2175     byte16 R = cast(byte16) _mm_max_epu8(_mm_setr_epi8(45, 1, -4, -8, 9,  7, 0,-57, -4,-8,  9,  7, 0,-57, 0,  0),
2176                                          _mm_setr_epi8(-4,-8,  9,  7, 0,-57, 0,  0, 45, 1, -4, -8, 9,  7, 0,-57));
2177     byte[16] correct =                                [-4,-8, -4, -8, 9,-57, 0,-57, -4,-8, -4, -8, 9,-57, 0,-57];
2178     assert(R.array == correct);
2179 }
2180 
2181 /// Compare packed double-precision (64-bit) floating-point elements in `a` and `b`, and return packed maximum values.
2182 __m128d _mm_max_pd (__m128d a, __m128d b) pure @trusted
2183 {
2184     static if (GDC_with_SSE2)
2185     {
2186         return __builtin_ia32_maxpd(a, b);
2187     }
2188     else
2189     {
2190         // x86: Generates maxpd starting with LDC 1.9 -O2
2191         a.ptr[0] = (a.array[0] > b.array[0]) ? a.array[0] : b.array[0];
2192         a.ptr[1] = (a.array[1] > b.array[1]) ? a.array[1] : b.array[1];
2193         return a;
2194     }
2195 }
2196 unittest
2197 {
2198     __m128d A = _mm_setr_pd(4.0, 1.0);
2199     __m128d B = _mm_setr_pd(1.0, 8.0);
2200     __m128d M = _mm_max_pd(A, B);
2201     assert(M.array[0] == 4.0);
2202     assert(M.array[1] == 8.0);
2203 }
2204 
2205 /// Compare the lower double-precision (64-bit) floating-point elements in `a` and `b`, store the maximum value in the 
2206 /// lower element of result, and copy the upper element from `a` to the upper element of result.
2207 __m128d _mm_max_sd (__m128d a, __m128d b) pure @trusted
2208 {
2209     static if (GDC_with_SSE2)
2210     {
2211         return __builtin_ia32_maxsd(a, b);
2212     }
2213     else
2214     {
2215          __m128d r = a;
2216         // Generates maxsd starting with LDC 1.3
2217         r.ptr[0] = (a.array[0] > b.array[0]) ? a.array[0] : b.array[0];
2218         return r;
2219     }
2220 }
2221 unittest
2222 {
2223     __m128d A = _mm_setr_pd(1.0, 1.0);
2224     __m128d B = _mm_setr_pd(4.0, 2.0);
2225     __m128d M = _mm_max_sd(A, B);
2226     assert(M.array[0] == 4.0);
2227     assert(M.array[1] == 1.0);
2228 }
2229 
2230 /// Perform a serializing operation on all load-from-memory and store-to-memory instructions that were issued prior to 
2231 /// this instruction. Guarantees that every memory access that precedes, in program order, the memory fence instruction 
2232 /// is globally visible before any memory instruction which follows the fence in program order.
2233 void _mm_mfence() @trusted
2234 {
2235     version(GNU)
2236     {
2237         static if (GDC_with_SSE2)
2238         {
2239             __builtin_ia32_mfence();
2240         }
2241         else version(X86)
2242         {
2243             asm pure nothrow @nogc @trusted
2244             {
2245                 "mfence;\n" : : : ;
2246             }
2247         }
2248         else
2249             static assert(false);
2250     }
2251     else static if (LDC_with_SSE2)
2252     {
2253         __builtin_ia32_mfence();
2254     }
2255     else static if (DMD_with_asm)
2256     {
2257         asm nothrow @nogc pure @safe
2258         {
2259             mfence;
2260         }
2261     }
    else version(LDC)
    {
        // Note: will generate the DMB instruction on ARM
        llvm_memory_fence();
    }
2270     else
2271         static assert(false);
2272 }
2273 unittest
2274 {
2275     _mm_mfence();
2276 }
2277 
2278 /// Compare packed signed 16-bit integers in `a` and `b`, and return packed minimum values.
2279 __m128i _mm_min_epi16 (__m128i a, __m128i b) pure @safe
2280 {
2281     static if (GDC_with_SSE2)
2282     {
2283         return cast(__m128i) __builtin_ia32_pminsw128(cast(short8)a, cast(short8)b);
2284     }
2285     else version(LDC)
2286     {
2287         // x86: pminsw since LDC 1.0 -O1
        // ARM64: smin.8h since LDC 1.5 -O1
2289         short8 sa = cast(short8)a;
2290         short8 sb = cast(short8)b;
2291         short8 greater = greaterMask!short8(sa, sb);
2292         return cast(__m128i)( (~greater & sa) | (greater & sb) );
2293     }
2294     else
2295     {
2296         __m128i lowerShorts = _mm_cmplt_epi16(a, b); // ones where a should be selected, b else
2297         __m128i aTob = _mm_xor_si128(a, b); // a ^ (a ^ b) == b
2298         __m128i mask = _mm_and_si128(aTob, lowerShorts);
2299         return _mm_xor_si128(b, mask);
2300     }
2301 }
2302 unittest
2303 {
2304     short8 R = cast(short8) _mm_min_epi16(_mm_setr_epi16(45, 1, -4, -8, 9,  7, 0,-32768),
2305                                           _mm_setr_epi16(-4,-8,  9,  7, 0,-57, 0,  0));
2306     short[8] correct =                                  [-4,-8, -4, -8, 0,-57, 0, -32768];
2307     assert(R.array == correct);
2308 }
2309 
2310 /// Compare packed unsigned 8-bit integers in `a` and `b`, and return packed minimum values.
2311 __m128i _mm_min_epu8 (__m128i a, __m128i b) pure @safe
2312 {
2313     version(LDC)
2314     {
2315         // x86: pminub since LDC 1.0.0 -O1
2316         // ARM: umin.16b since LDC 1.5.0 -O1
2317         // PERF: catastrophic on ARM32
2318         ubyte16 sa = cast(ubyte16)a;
2319         ubyte16 sb = cast(ubyte16)b;
2320         ubyte16 greater = cast(ubyte16) greaterMask!ubyte16(sa, sb);
2321         return cast(__m128i)( (~greater & sa) | (greater & sb) );
2322     }
2323     else
2324     {
2325         __m128i value128 = _mm_set1_epi8(-128);
2326         __m128i lower = _mm_cmplt_epi8(_mm_add_epi8(a, value128), _mm_add_epi8(b, value128)); // signed comparison
2327         __m128i aTob = _mm_xor_si128(a, b); // a ^ (a ^ b) == b
2328         __m128i mask = _mm_and_si128(aTob, lower);
2329         return _mm_xor_si128(b, mask);
2330     }
2331 }
2332 unittest
2333 {
2334     byte16 R = cast(byte16) _mm_min_epu8(_mm_setr_epi8(45, 1, -4, -8, 9,  7, 0,-57, -4,-8,  9,  7, 0,-57, 0,  0),
2335                                          _mm_setr_epi8(-4,-8,  9,  7, 0,-57, 0,  0, 45, 1, -4, -8, 9,  7, 0,-57));
2336     byte[16] correct =                                [45, 1,  9,  7, 0,  7, 0,  0, 45, 1,  9,  7, 0,  7, 0,  0];
2337     assert(R.array == correct);
2338 }
2339 
2340 /// Compare packed double-precision (64-bit) floating-point elements in `a` and `b`, and return packed minimum values.
2341 __m128d _mm_min_pd (__m128d a, __m128d b) pure @trusted
2342 {
2343     static if (GDC_with_SSE2)
2344     {
2345         return __builtin_ia32_minpd(a, b);
2346     }
2347     else
2348     {
2349         // Generates minpd starting with LDC 1.9
2350         a.ptr[0] = (a.array[0] < b.array[0]) ? a.array[0] : b.array[0];
2351         a.ptr[1] = (a.array[1] < b.array[1]) ? a.array[1] : b.array[1];
2352         return a;
2353     }
2354 }
2355 unittest
2356 {
2357     __m128d A = _mm_setr_pd(1.0, 2.0);
2358     __m128d B = _mm_setr_pd(4.0, 1.0);
2359     __m128d M = _mm_min_pd(A, B);
2360     assert(M.array[0] == 1.0);
2361     assert(M.array[1] == 1.0);
2362 }
2363 
2364 /// Compare the lower double-precision (64-bit) floating-point elements in `a` and `b`, store the minimum value in 
2365 /// the lower element of result, and copy the upper element from `a` to the upper element of result.
2366 __m128d _mm_min_sd (__m128d a, __m128d b) pure @safe
2367 {
2368     static if (GDC_with_SSE2)
2369     {
2370         return __builtin_ia32_minsd(a, b);
2371     }
2372     else
2373     {
2374         // Generates minsd starting with LDC 1.3
2375         __m128d r = a;
2376         r.array[0] = (a.array[0] < b.array[0]) ? a.array[0] : b.array[0];
2377         return r;
2378     }
2379 }
2380 unittest
2381 {
2382     __m128d A = _mm_setr_pd(1.0, 3.0);
2383     __m128d B = _mm_setr_pd(4.0, 2.0);
2384     __m128d M = _mm_min_sd(A, B);
2385     assert(M.array[0] == 1.0);
2386     assert(M.array[1] == 3.0);
2387 }
2388 
2389 /// Copy the lower 64-bit integer in `a` to the lower element of result, and zero the upper element.
2390 __m128i _mm_move_epi64 (__m128i a) pure @trusted
2391 {
2392     static if (GDC_with_SSE2)
2393     {
2394         // slightly better with GDC -O0
2395         return cast(__m128i) __builtin_ia32_movq128(cast(long2)a); 
2396     }
2397     else
2398     {
2399         long2 result = [ 0, 0 ];
2400         long2 la = cast(long2) a;
2401         result.ptr[0] = la.array[0];
2402         return cast(__m128i)(result);
2403     }
2404 }
2405 unittest
2406 {
2407     long2 A = [13, 47];
2408     long2 B = cast(long2) _mm_move_epi64( cast(__m128i)A );
2409     long[2] correct = [13, 0];
2410     assert(B.array == correct);
2411 }
2412 
2413 /// Move the lower double-precision (64-bit) floating-point element from `b` to the lower element of result, and copy 
2414 /// the upper element from `a` to the upper element of dst.
2415 __m128d _mm_move_sd (__m128d a, __m128d b) pure @trusted
2416 {
2417     static if (GDC_with_SSE2)
2418     {
2419         return __builtin_ia32_movsd(a, b); 
2420     }
2421     else
2422     {
2423         b.ptr[1] = a.array[1];
2424         return b;
2425     }
2426 }
2427 unittest
2428 {
2429     double2 A = [13.0, 47.0];
2430     double2 B = [34.0, 58.0];
2431     double2 C = _mm_move_sd(A, B);
2432     double[2] correct = [34.0, 47.0];
2433     assert(C.array == correct);
2434 }
2435 
/// Create mask from the most significant bit of each 8-bit element in `a`.
2437 int _mm_movemask_epi8 (__m128i a) pure @trusted
2438 {
2439     // PERF: Not possible in D_SIMD because of https://issues.dlang.org/show_bug.cgi?id=8047
2440     static if (GDC_with_SSE2)
2441     {
2442         return __builtin_ia32_pmovmskb128(cast(ubyte16)a);
2443     }
2444     else static if (LDC_with_SSE2)
2445     {
2446         return __builtin_ia32_pmovmskb128(cast(byte16)a);
2447     }
2448     else static if (LDC_with_ARM64)
2449     {
2450         // Solution from https://stackoverflow.com/questions/11870910/sse-mm-movemask-epi8-equivalent-method-for-arm-neon
        // The other two solutions led to intrinsics LLVM could not find, and that took a long time to debug.
        // So there might be something a bit faster, but this one is reasonable and branchless.
2453         byte8 mask_shift;
2454         mask_shift.ptr[0] = 7;
2455         mask_shift.ptr[1] = 6;
2456         mask_shift.ptr[2] = 5;
2457         mask_shift.ptr[3] = 4;
2458         mask_shift.ptr[4] = 3;
2459         mask_shift.ptr[5] = 2;
2460         mask_shift.ptr[6] = 1;
2461         mask_shift.ptr[7] = 0;
2462         byte8 mask_and = byte8(-128);
2463         byte8 lo = vget_low_u8(cast(byte16)a);
2464         byte8 hi = vget_high_u8(cast(byte16)a);
2465         lo = vand_u8(lo, mask_and);
2466         lo = vshr_u8(lo, mask_shift);
2467         hi = vand_u8(hi, mask_and);
2468         hi = vshr_u8(hi, mask_shift);
2469         lo = vpadd_u8(lo,lo);
2470         lo = vpadd_u8(lo,lo);
2471         lo = vpadd_u8(lo,lo);
2472         hi = vpadd_u8(hi,hi);
2473         hi = vpadd_u8(hi,hi);
2474         hi = vpadd_u8(hi,hi);
2475         return (cast(ubyte)(hi[0]) << 8) | cast(ubyte)(lo[0]);
2476     }
2477     else
2478     {
2479         byte16 ai = cast(byte16)a;
2480         int r = 0;
2481         foreach(bit; 0..16)
2482         {
2483             if (ai.array[bit] < 0) r += (1 << bit);
2484         }
2485         return r;
2486     }
2487 }
2488 unittest
2489 {
2490     assert(0x9C36 == _mm_movemask_epi8(_mm_set_epi8(-1, 1, 2, -3, -1, -1, 4, 8, 127, 0, -1, -1, 0, -1, -1, 0)));
2491 }
2492 
/// Create mask from the most significant bit of each 16-bit element in `a`. #BONUS
2494 int _mm_movemask_epi16 (__m128i a) pure @trusted
2495 {
2496     return _mm_movemask_epi8(_mm_packs_epi16(a, _mm_setzero_si128()));
2497 }
2498 unittest
2499 {
2500     assert(0x9C == _mm_movemask_epi16(_mm_set_epi16(-1, 1, 2, -3, -32768, -1, 32767, 8)));
2501 }
2502 
2503 /// Set each bit of mask result based on the most significant bit of the corresponding packed double-precision (64-bit) 
/// floating-point element in `v`.
2505 int _mm_movemask_pd(__m128d v) pure @safe
2506 {
2507     // PERF: Not possible in D_SIMD because of https://issues.dlang.org/show_bug.cgi?id=8047
2508     static if (GDC_with_SSE2)
2509     {
2512         return __builtin_ia32_movmskpd(v);
2513     }
2514     else static if (LDC_with_SSE2)
2515     {
2518         return __builtin_ia32_movmskpd(v);
2519     }
2520     else
2521     {
2522         long2 lv = cast(long2)v;
2523         int r = 0;
2524         if (lv.array[0] < 0) r += 1;
2525         if (lv.array[1] < 0) r += 2;
2526         return r;
2527     }
2528 }
2529 unittest
2530 {
2531     __m128d A = cast(__m128d) _mm_set_epi64x(-1, 0);
2532     assert(_mm_movemask_pd(A) == 2);
2533 }
2534 
2535 /// Copy the lower 64-bit integer in `v`.
2536 __m64 _mm_movepi64_pi64 (__m128i v) pure @safe
2537 {
2538     long2 lv = cast(long2)v;
2539     return long1(lv.array[0]);
2540 }
2541 unittest
2542 {
2543     __m128i A = _mm_set_epi64x(-1, -2);
2544     __m64 R = _mm_movepi64_pi64(A);
2545     assert(R.array[0] == -2);
2546 }
2547 
2548 /// Copy the 64-bit integer `a` to the lower element of dest, and zero the upper element.
2549 __m128i _mm_movpi64_epi64 (__m64 a) pure @trusted
2550 {
2551     long2 r;
2552     r.ptr[0] = a.array[0];
2553     r.ptr[1] = 0;
2554     return cast(__m128i)r;
2555 }
2556 
2557 // Note: generates pmuludq in LDC with -O1
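/// Multiply the low unsigned 32-bit integers from each packed 64-bit element in `a` and `b`, 
/// and return the unsigned 64-bit results.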
2558 __m128i _mm_mul_epu32 (__m128i a, __m128i b) pure @trusted
2559 {
2560     __m128i zero = _mm_setzero_si128();
2561 
2562     static if (__VERSION__ >= 2088)
2563     {
2564         // Need LLVM9 to avoid this shufflevector
2565         long2 la, lb;
2566         la.ptr[0] = cast(uint)a.array[0];
2567         la.ptr[1] = cast(uint)a.array[2];
2568         lb.ptr[0] = cast(uint)b.array[0];
2569         lb.ptr[1] = cast(uint)b.array[2];
2570     }
2571     else
2572     {
2573         long2 la = cast(long2) shufflevector!(int4, 0, 4, 2, 6)(a, zero);
2574         long2 lb = cast(long2) shufflevector!(int4, 0, 4, 2, 6)(b, zero);
2575     }
2576 
2577     version(DigitalMars)
2578     {
        // DMD has no long2 multiplication
2581         la.ptr[0] *= lb.array[0];
2582         la.ptr[1] *= lb.array[1];
2583         return cast(__m128i)(la);
2584     }
2585     else
2586     {
2587         static if (__VERSION__ >= 2076)
2588         {
2589             return cast(__m128i)(la * lb);
2590         }
2591         else
2592         {
2593             // long2 mul not supported before LDC 1.5
2594             la.ptr[0] *= lb.array[0];
2595             la.ptr[1] *= lb.array[1];
2596             return cast(__m128i)(la);
2597         }
2598     }
2599 }
2600 unittest
2601 {
2602     __m128i A = _mm_set_epi32(42, 0xDEADBEEF, 42, 0xffffffff);
2603     __m128i B = _mm_set_epi32(42, 0xCAFEBABE, 42, 0xffffffff);
2604     __m128i C = _mm_mul_epu32(A, B);
2605     long2 LC = cast(long2)C;
2606     assert(LC.array[0] == 18446744065119617025uL);
2607     assert(LC.array[1] == 12723420444339690338uL);
2608 }
2609 
2610 /// Multiply packed double-precision (64-bit) floating-point elements in `a` and `b`, and return the results. 
2611 __m128d _mm_mul_pd(__m128d a, __m128d b) pure @safe
2612 {
2613     pragma(inline, true);
2614     return a * b;
2615 }
2616 unittest
2617 {
2618     __m128d a = [-2.0, 1.5];
2619     a = _mm_mul_pd(a, a);
2620     assert(a.array == [4.0, 2.25]);
2621 }
2622 
2623 /// Multiply the lower double-precision (64-bit) floating-point element in `a` and `b`, store the result in the lower 
2624 /// element of result, and copy the upper element from `a` to the upper element of result.
2625 __m128d _mm_mul_sd(__m128d a, __m128d b) pure @trusted
2626 {
2627     version(DigitalMars)
2628     {    
2629         // Work-around for https://issues.dlang.org/show_bug.cgi?id=19599
2630         // Note that this is unneeded since DMD >= 2.094.0 at least, haven't investigated again
2631         asm pure nothrow @nogc @trusted { nop;}
2632         a.array[0] = a.array[0] * b.array[0];
2633         return a;
2634     }
2635     else static if (GDC_with_SSE2)
2636     {
2637         return __builtin_ia32_mulsd(a, b);
2638     }
2639     else
2640     {
2641         a.ptr[0] *= b.array[0];
2642         return a;
2643     }
2644 }
2645 unittest
2646 {
2647     __m128d a = [-2.0, 1.5];
2648     a = _mm_mul_sd(a, a);
2649     assert(a.array == [4.0, 1.5]);
2650 }
2651 
2652 /// Multiply the low unsigned 32-bit integers from `a` and `b`, 
2653 /// and get an unsigned 64-bit result.
2654 __m64 _mm_mul_su32 (__m64 a, __m64 b) pure @safe
2655 {
2656     return to_m64(_mm_mul_epu32(to_m128i(a), to_m128i(b)));
2657 }
2658 unittest
2659 {
2660     __m64 A = _mm_set_pi32(42, 0xDEADBEEF);
2661     __m64 B = _mm_set_pi32(42, 0xCAFEBABE);
2662     __m64 C = _mm_mul_su32(A, B);
2663     assert(C.array[0] == 0xDEADBEEFuL * 0xCAFEBABEuL);
2664 }
2665 
2666 /// Multiply the packed signed 16-bit integers in `a` and `b`, producing intermediate 32-bit integers, and return the 
2667 /// high 16 bits of the intermediate integers.
2668 __m128i _mm_mulhi_epi16 (__m128i a, __m128i b) pure @trusted
2669 {
2670     static if (GDC_with_SSE2)
2671     {
2672         return cast(__m128i) __builtin_ia32_pmulhw128(cast(short8)a, cast(short8)b);
2673     }
2674     else static if (LDC_with_SSE2)
2675     {
2676         return cast(__m128i) __builtin_ia32_pmulhw128(cast(short8)a, cast(short8)b);
2677     }
2678     else
2679     {
2680         // ARM64: LDC 1.5 -O2 or later gives a nice sequence with 2 x ext.16b, 2 x smull.4s and shrn.4h shrn2.8h
2681         //        PERF: it seems the simde solution has one less instruction in ARM64.
2682         // PERF: Catastrophic in ARM32.
2683         short8 sa = cast(short8)a;
2684         short8 sb = cast(short8)b;
2685         short8 r = void;
2686         r.ptr[0] = (sa.array[0] * sb.array[0]) >> 16;
2687         r.ptr[1] = (sa.array[1] * sb.array[1]) >> 16;
2688         r.ptr[2] = (sa.array[2] * sb.array[2]) >> 16;
2689         r.ptr[3] = (sa.array[3] * sb.array[3]) >> 16;
2690         r.ptr[4] = (sa.array[4] * sb.array[4]) >> 16;
2691         r.ptr[5] = (sa.array[5] * sb.array[5]) >> 16;
2692         r.ptr[6] = (sa.array[6] * sb.array[6]) >> 16;
2693         r.ptr[7] = (sa.array[7] * sb.array[7]) >> 16;
2694         return cast(__m128i)r;
2695     }
2696 }
2697 unittest
2698 {
2699     __m128i A = _mm_setr_epi16(0, -16, 2, 3, 4, 8, 16, 7);
2700     __m128i B = _mm_set1_epi16(16384);
2701     short8 R = cast(short8)_mm_mulhi_epi16(A, B);
2702     short[8] correct = [0, -4, 0, 0, 1, 2, 4, 1];
2703     assert(R.array == correct);
2704 }
2705 
2706 /// Multiply the packed unsigned 16-bit integers in `a` and `b`, producing intermediate 32-bit integers, and return the 
2707 /// high 16 bits of the intermediate integers.
2708 __m128i _mm_mulhi_epu16 (__m128i a, __m128i b) pure @trusted
2709 {
2710     static if (GDC_with_SSE2)
2711     {
2712         return cast(__m128i) __builtin_ia32_pmulhuw128(cast(short8)a, cast(short8)b);
2713     }
2714     else static if (LDC_with_SSE2)
2715     {
2716         return cast(__m128i) __builtin_ia32_pmulhuw128(cast(short8)a, cast(short8)b);
2717     }
2718     else
2719     {
2720         // ARM64: LDC 1.5 -O2 or later gives a nice sequence with 2 x ext.16b, 2 x umull.4s and shrn.4h shrn2.8h
2721         //      it seems the simde solution has one less instruction in ARM64
2722         // PERF: Catastrophic in ARM32.
2723         short8 sa = cast(short8)a;
2724         short8 sb = cast(short8)b;
2725         short8 r = void;
2726         r.ptr[0] = cast(short)( (cast(ushort)sa.array[0] * cast(ushort)sb.array[0]) >> 16 );
2727         r.ptr[1] = cast(short)( (cast(ushort)sa.array[1] * cast(ushort)sb.array[1]) >> 16 );
2728         r.ptr[2] = cast(short)( (cast(ushort)sa.array[2] * cast(ushort)sb.array[2]) >> 16 );
2729         r.ptr[3] = cast(short)( (cast(ushort)sa.array[3] * cast(ushort)sb.array[3]) >> 16 );
2730         r.ptr[4] = cast(short)( (cast(ushort)sa.array[4] * cast(ushort)sb.array[4]) >> 16 );
2731         r.ptr[5] = cast(short)( (cast(ushort)sa.array[5] * cast(ushort)sb.array[5]) >> 16 );
2732         r.ptr[6] = cast(short)( (cast(ushort)sa.array[6] * cast(ushort)sb.array[6]) >> 16 );
2733         r.ptr[7] = cast(short)( (cast(ushort)sa.array[7] * cast(ushort)sb.array[7]) >> 16 );
2734         return cast(__m128i)r;
2735     }
2736 }
2737 unittest
2738 {
2739     __m128i A = _mm_setr_epi16(0, -16, 2, 3, 4, 8, 16, 7);
2740     __m128i B = _mm_set1_epi16(16384);
2741     short8 R = cast(short8)_mm_mulhi_epu16(A, B);
2742     short[8] correct = [0, 0x3FFC, 0, 0, 1, 2, 4, 1];
2743     assert(R.array == correct);
2744 }
2745 
2746 /// Multiply the packed 16-bit integers in `a` and `b`, producing intermediate 32-bit integers, and return the low 16 
2747 /// bits of the intermediate integers.
2748 __m128i _mm_mullo_epi16 (__m128i a, __m128i b) pure @safe
2749 {
2750     return cast(__m128i)(cast(short8)a * cast(short8)b);
2751 }
2752 unittest
2753 {
2754     __m128i A = _mm_setr_epi16(16384, -16, 0,      3, 4, 1, 16, 7);
2755     __m128i B = _mm_set1_epi16(16384);
2756     short8 R = cast(short8)_mm_mullo_epi16(A, B);
2757     short[8] correct = [0, 0, 0, -16384, 0, 16384, 0, -16384];
2758     assert(R.array == correct);
2759 }
2760 
2761 /// Compute the bitwise NOT of 128 bits in `a`. #BONUS
2762 __m128i _mm_not_si128 (__m128i a) pure @safe
2763 {
2764     return ~a;
2765 }
2766 unittest
2767 {
2768     __m128i A = _mm_set1_epi32(-748);
2769     int4 notA = cast(int4) _mm_not_si128(A);
2770     int[4] correct = [747, 747, 747, 747];
2771     assert(notA.array == correct);
2772 }
2773 
2774 /// Compute the bitwise OR of packed double-precision (64-bit) floating-point elements in `a` and `b`.
2775 __m128d _mm_or_pd (__m128d a, __m128d b) pure @safe
2776 {
2777     pragma(inline, true);
2778     return cast(__m128d)( cast(__m128i)a | cast(__m128i)b );
2779 }
2780 
2781 /// Compute the bitwise OR of 128 bits (representing integer data) in `a` and `b`.
2782 __m128i _mm_or_si128 (__m128i a, __m128i b) pure @safe
2783 {
2784     pragma(inline, true);
2785     return a | b;
2786 }
2787 
2788 /// Convert packed signed 32-bit integers from `a` and `b` to packed 16-bit integers using signed saturation.
2789 __m128i _mm_packs_epi32 (__m128i a, __m128i b) pure @trusted
2790 {
2791     static if (GDC_with_SSE2)
2792     {
2793         return cast(__m128i) __builtin_ia32_packssdw128(a, b);
2794     }    
2795     else static if (LDC_with_SSE2)
2796     {
2797         return cast(__m128i) __builtin_ia32_packssdw128(a, b);
2798     }
2799     else static if (LDC_with_ARM64)
2800     {
2801         short4 ra = vqmovn_s32(cast(int4)a);
2802         short4 rb = vqmovn_s32(cast(int4)b);
2803         return cast(__m128i)vcombine_s16(ra, rb);
2804     }
2805     else
2806     {
2807         // PERF: catastrophic on ARM32
2808         short8 r;
2809         r.ptr[0] = saturateSignedIntToSignedShort(a.array[0]);
2810         r.ptr[1] = saturateSignedIntToSignedShort(a.array[1]);
2811         r.ptr[2] = saturateSignedIntToSignedShort(a.array[2]);
2812         r.ptr[3] = saturateSignedIntToSignedShort(a.array[3]);
2813         r.ptr[4] = saturateSignedIntToSignedShort(b.array[0]);
2814         r.ptr[5] = saturateSignedIntToSignedShort(b.array[1]);
2815         r.ptr[6] = saturateSignedIntToSignedShort(b.array[2]);
2816         r.ptr[7] = saturateSignedIntToSignedShort(b.array[3]);
2817         return cast(__m128i)r;
2818     }
2819 }
2820 unittest
2821 {
2822     __m128i A = _mm_setr_epi32(100000, -100000, 1000, 0);
2823     short8 R = cast(short8) _mm_packs_epi32(A, A);
2824     short[8] correct = [32767, -32768, 1000, 0, 32767, -32768, 1000, 0];
2825     assert(R.array == correct);
2826 }
2827 
2828 /// Convert packed signed 16-bit integers from `a` and `b` to packed 8-bit integers using signed saturation.
2829 __m128i _mm_packs_epi16 (__m128i a, __m128i b) pure @trusted
2830 {
2831     static if (GDC_with_SSE2)
2832     {
2833         return cast(__m128i) __builtin_ia32_packsswb128(cast(short8)a, cast(short8)b);
2834     }
2835     else static if (LDC_with_SSE2)
2836     {
2837         return cast(__m128i) __builtin_ia32_packsswb128(cast(short8)a, cast(short8)b);
2838     }
2839     else static if (LDC_with_ARM64)
2840     {
        // generates a nice pair of sqxtn.8b + sqxtn2 since LDC 1.5 -O2
2842         byte8 ra = vqmovn_s16(cast(short8)a);
2843         byte8 rb = vqmovn_s16(cast(short8)b);
2844         return cast(__m128i)vcombine_s8(ra, rb);
2845     }
2846     else
2847     {
2848         // PERF: ARM32 is missing
2849         byte16 r;
2850         short8 sa = cast(short8)a;
2851         short8 sb = cast(short8)b;
2852         foreach(i; 0..8)
2853             r.ptr[i] = saturateSignedWordToSignedByte(sa.array[i]);
2854         foreach(i; 0..8)
2855             r.ptr[i+8] = saturateSignedWordToSignedByte(sb.array[i]);
2856         return cast(__m128i)r;
2857     }
2858 }
2859 unittest
2860 {
2861     __m128i A = _mm_setr_epi16(1000, -1000, 1000, 0, 256, -129, 254, 0);
2862     byte16 R = cast(byte16) _mm_packs_epi16(A, A);
2863     byte[16] correct = [127, -128, 127, 0, 127, -128, 127, 0,
2864                         127, -128, 127, 0, 127, -128, 127, 0];
2865     assert(R.array == correct);
2866 }
2867 
2868 /// Convert packed signed 16-bit integers from `a` and `b` to packed 8-bit integers using unsigned saturation.
2869 __m128i _mm_packus_epi16 (__m128i a, __m128i b) pure @trusted
2870 {
2871     static if (GDC_with_SSE2)
2872     {
2873         return cast(__m128i) __builtin_ia32_packuswb128(cast(short8)a, cast(short8)b);
2874     }
2875     else static if (LDC_with_SSE2)
2876     {
2877         return cast(__m128i) __builtin_ia32_packuswb128(cast(short8)a, cast(short8)b);
2878     }
2879     else static if (LDC_with_ARM64)
2880     {
        // generates a nice pair of sqxtun + sqxtun2 since LDC 1.5 -O2
2882         byte8 ra = vqmovun_s16(cast(short8)a);
2883         byte8 rb = vqmovun_s16(cast(short8)b);
2884         return cast(__m128i)vcombine_s8(ra, rb);
2885     }
2886     else
2887     {
2888         short8 sa = cast(short8)a;
2889         short8 sb = cast(short8)b;
2890         ubyte[16] result = void;
2891         for (int i = 0; i < 8; ++i)
2892         {
2893             short s = sa[i];
2894             if (s < 0) s = 0;
2895             if (s > 255) s = 255;
2896             result[i] = cast(ubyte)s;
2897 
2898             s = sb[i];
2899             if (s < 0) s = 0;
2900             if (s > 255) s = 255;
2901             result[i+8] = cast(ubyte)s;
2902         }
2903         return cast(__m128i) loadUnaligned!(byte16)(cast(byte*)result.ptr);
2904     }
2905 }
2906 unittest
2907 {
2908     __m128i A = _mm_setr_epi16(-10, 400, 0, 256, 255, 2, 1, 0);
2909     byte16 AA = cast(byte16) _mm_packus_epi16(A, A);
2910     static immutable ubyte[16] correctResult = [0, 255, 0, 255, 255, 2, 1, 0,
2911                                                 0, 255, 0, 255, 255, 2, 1, 0];
2912     foreach(i; 0..16)
2913         assert(AA.array[i] == cast(byte)(correctResult[i]));
2914 }
2915 
2916 /// Provide a hint to the processor that the code sequence is a spin-wait loop. This can help improve the performance 
2917 /// and power consumption of spin-wait loops.
2918 void _mm_pause() @trusted
2919 {
2920     version(GNU)
2921     {
2922         static if (GDC_with_SSE2)
2923         {
2924             __builtin_ia32_pause();
2925         }
2926         else version(X86)
2927         {
2928             asm pure nothrow @nogc @trusted
2929             {
2930                 "pause;\n" : : : ;
2931             }
2932         }
2933         else
2934             static assert(false);
2935     }
2936     else static if (LDC_with_SSE2)
2937     {
2938         __builtin_ia32_pause();
2939     }
2940     else static if (DMD_with_asm)
2941     {
2942         asm nothrow @nogc pure @safe
2943         {
2944             rep; nop; // F3 90 =  pause
2945         }
2946     }
2947     else version (LDC)
2948     {
        // PERF: Does nothing currently; could be the "yield" instruction on ARM.
2950     }
2951     else
2952         static assert(false);
2953 }
2954 unittest
2955 {
2956     _mm_pause();
2957 }
2958 
2959 /// Compute the absolute differences of packed unsigned 8-bit integers in `a` and `b`, then horizontally sum each 
2960 /// consecutive 8 differences to produce two unsigned 16-bit integers, and pack these unsigned 16-bit integers in the 
2961 /// low 16 bits of 64-bit elements in result.
2962 __m128i _mm_sad_epu8 (__m128i a, __m128i b) pure @trusted
2963 {
2964     static if (GDC_with_SSE2)
2965     {
2966         return cast(__m128i) __builtin_ia32_psadbw128(cast(ubyte16)a, cast(ubyte16)b);
2967     }
2968     else static if (LDC_with_SSE2)
2969     {
2970         return cast(__m128i) __builtin_ia32_psadbw128(cast(byte16)a, cast(byte16)b);
2971     }
2972     else static if (LDC_with_ARM64)
2973     {
2974         ushort8 t = cast(ushort8) vpaddlq_u8(vabdq_u8(cast(byte16) a, cast(byte16) b));
2975 
2976         // PERF: Looks suboptimal vs addp
2977         ushort r0 = cast(ushort)(t[0] + t[1] + t[2] + t[3]);
2978         ushort r4 = cast(ushort)(t[4] + t[5] + t[6] + t[7]);
2979         ushort8 r = 0;
2980         r[0] = r0;
2981         r[4] = r4;
2982         return cast(__m128i) r;
2983     }
2984     else
2985     {
2986         // PERF: ARM32 is lacking
2987         byte16 ab = cast(byte16)a;
2988         byte16 bb = cast(byte16)b;
2989         ubyte[16] t;
2990         foreach(i; 0..16)
2991         {
2992             int diff = cast(ubyte)(ab.array[i]) - cast(ubyte)(bb.array[i]);
2993             if (diff < 0) diff = -diff;
2994             t[i] = cast(ubyte)(diff);
2995         }
2996         int4 r = _mm_setzero_si128();
2997         r.ptr[0] = t[0] + t[1] + t[2] + t[3] + t[4] + t[5] + t[6] + t[7];
2998         r.ptr[2] = t[8] + t[9] + t[10]+ t[11]+ t[12]+ t[13]+ t[14]+ t[15];
2999         return r;
3000     }
3001 }
3002 unittest
3003 {
3004     __m128i A = _mm_setr_epi8(3, 4, 6, 8, 12, 14, 18, 20, 24, 30, 32, 38, 42, 44, 48, 54); // primes + 1
3005     __m128i B = _mm_set1_epi8(1);
3006     __m128i R = _mm_sad_epu8(A, B);
3007     int[4] correct = [2 + 3 + 5 + 7 + 11 + 13 + 17 + 19,
3008                       0,
3009                       23 + 29 + 31 + 37 + 41 + 43 + 47 + 53,
3010                       0];
3011     assert(R.array == correct);
3012 }
3013 
3014 /// Set packed 16-bit integers with the supplied values.
3015 __m128i _mm_set_epi16 (short e7, short e6, short e5, short e4, short e3, short e2, short e1, short e0) pure @trusted
3016 {
3017     short[8] result = [e0, e1, e2, e3, e4, e5, e6, e7];
3018     return cast(__m128i) loadUnaligned!(short8)(result.ptr);
3019 }
3020 unittest
3021 {
3022     __m128i A = _mm_set_epi16(7, 6, 5, 4, 3, 2, 1, 0);
3023     short8 B = cast(short8) A;
3024     foreach(i; 0..8)
3025         assert(B.array[i] == i);
3026 }
3027 
3028 /// Set packed 32-bit integers with the supplied values.
3029 __m128i _mm_set_epi32 (int e3, int e2, int e1, int e0) pure @trusted
3030 {
3031     pragma(inline, true);
3032     int[4] result = [e0, e1, e2, e3];
3033     return loadUnaligned!(int4)(result.ptr);
3034 }
3035 unittest
3036 {
3037     __m128i A = _mm_set_epi32(3, 2, 1, 0);
3038     foreach(i; 0..4)
3039         assert(A.array[i] == i);
3040 }
3041 
3042 /// Set packed 64-bit integers with the supplied values.
3043 __m128i _mm_set_epi64(__m64 e1, __m64 e0) pure @trusted
3044 {
3045     pragma(inline, true);
3046     long[2] result = [e0.array[0], e1.array[0]];
3047     return cast(__m128i)( loadUnaligned!(long2)(result.ptr) );
3048 }
3049 unittest
3050 {
3051     __m128i A = _mm_set_epi64(_mm_cvtsi64_m64(1234), _mm_cvtsi64_m64(5678));
3052     long2 B = cast(long2) A;
3053     assert(B.array[0] == 5678);
3054     assert(B.array[1] == 1234);
3055 }
3056 
3057 /// Set packed 64-bit integers with the supplied values.
3058 __m128i _mm_set_epi64x (long e1, long e0) pure @trusted
3059 {
3060     pragma(inline, true);
3061     long[2] result = [e0, e1];
3062     return cast(__m128i)( loadUnaligned!(long2)(result.ptr) );
3063 }
3064 unittest
3065 {
3066     __m128i A = _mm_set_epi64x(1234, 5678);
3067     long2 B = cast(long2) A;
3068     assert(B.array[0] == 5678);
3069     assert(B.array[1] == 1234);
3070 }
3071 
3072 /// Set packed 8-bit integers with the supplied values.
3073 __m128i _mm_set_epi8 (byte e15, byte e14, byte e13, byte e12,
3074                       byte e11, byte e10, byte e9, byte e8,
3075                       byte e7, byte e6, byte e5, byte e4,
3076                       byte e3, byte e2, byte e1, byte e0) pure @trusted
3077 {
3078     byte[16] result = [e0, e1,  e2,  e3,  e4,  e5,  e6, e7,
3079                      e8, e9, e10, e11, e12, e13, e14, e15];
3080     return cast(__m128i)( loadUnaligned!(byte16)(result.ptr) );
3081 }
3082 
3083 /// Set packed double-precision (64-bit) floating-point elements with the supplied values.
3084 __m128d _mm_set_pd (double e1, double e0) pure @trusted
3085 {
3086     pragma(inline, true);
3087     double[2] result = [e0, e1];
3088     return loadUnaligned!(double2)(result.ptr);
3089 }
3090 unittest
3091 {
3092     __m128d A = _mm_set_pd(61.0, 55.0);
3093     double[2] correct = [55.0, 61.0];
3094     assert(A.array == correct);
3095 }
3096 
/// Broadcast double-precision (64-bit) floating-point value `a` to all elements.
3098 __m128d _mm_set_pd1 (double a) pure @trusted
3099 {
3100     pragma(inline, true);
3101     double[2] result = [a, a];
3102     return loadUnaligned!(double2)(result.ptr);
3103 }
3104 unittest
3105 {
3106     __m128d A = _mm_set_pd1(61.0);
3107     double[2] correct = [61.0, 61.0];
3108     assert(A.array == correct);
3109 }
3110 
3111 /// Copy double-precision (64-bit) floating-point element `a` to the lower element of result, 
3112 /// and zero the upper element.
3113 __m128d _mm_set_sd (double a) pure @trusted
3114 {
3115     double[2] result = [a, 0];
3116     return loadUnaligned!(double2)(result.ptr);
3117 }
3118 
/// Broadcast 16-bit integer `a` to all elements.
3120 __m128i _mm_set1_epi16 (short a) pure @trusted
3121 {
3122     version(DigitalMars) // workaround https://issues.dlang.org/show_bug.cgi?id=21469 
3123     {
3124         short8 v = a;
3125         return cast(__m128i) v;
3126     }
3127     else
3128     {
3129         pragma(inline, true);
3130         return cast(__m128i)(short8(a));
3131     }
3132 }
3133 unittest
3134 {
3135     short8 a = cast(short8) _mm_set1_epi16(31);
3136     for (int i = 0; i < 8; ++i)
3137         assert(a.array[i] == 31);
3138 }
3139 
3140 /// Broadcast 32-bit integer `a` to all elements.
3141 __m128i _mm_set1_epi32 (int a) pure @trusted
3142 {
3143     pragma(inline, true);
3144     return cast(__m128i)(int4(a));
3145 }
3146 unittest
3147 {
3148     int4 a = cast(int4) _mm_set1_epi32(31);
3149     for (int i = 0; i < 4; ++i)
3150         assert(a.array[i] == 31);
3151 }
3152 
3153 /// Broadcast 64-bit integer `a` to all elements.
3154 __m128i _mm_set1_epi64 (__m64 a) pure @safe
3155 {
3156     return _mm_set_epi64(a, a);
3157 }
3158 unittest
3159 {
3160     long b = 0x1DEADCAFE; 
3161     __m64 a;
3162     a.ptr[0] = b;
3163     long2 c = cast(long2) _mm_set1_epi64(a);
3164     assert(c.array[0] == b);
3165     assert(c.array[1] == b);
3166 }
3167 
/// Broadcast 64-bit integer `a` to all elements.
3169 __m128i _mm_set1_epi64x (long a) pure @trusted
3170 {
3171     long2 b = a; // Must be on its own line to workaround https://issues.dlang.org/show_bug.cgi?id=21470
3172     return cast(__m128i)(b);
3173 }
3174 unittest
3175 {
3176     long b = 0x1DEADCAFE;
3177     long2 c = cast(long2) _mm_set1_epi64x(b);
3178     for (int i = 0; i < 2; ++i)
3179         assert(c.array[i] == b);
3180 }
3181 
3182 /// Broadcast 8-bit integer `a` to all elements.
3183 __m128i _mm_set1_epi8 (byte a) pure @trusted
3184 {
3185     pragma(inline, true);
3186     byte16 b = a; // Must be on its own line to workaround https://issues.dlang.org/show_bug.cgi?id=21470
3187     return cast(__m128i)(b);
3188 }
3189 unittest
3190 {
3191     byte16 b = cast(byte16) _mm_set1_epi8(31);
3192     for (int i = 0; i < 16; ++i)
3193         assert(b.array[i] == 31);
3194 }
3195 
alias _mm_set1_pd = _mm_set_pd1; ///
3197 
3198 /// Set packed 16-bit integers with the supplied values in reverse order.
3199 __m128i _mm_setr_epi16 (short e7, short e6, short e5, short e4, 
3200                         short e3, short e2, short e1, short e0) pure @trusted
3201 {
3202     short[8] result = [e7, e6, e5, e4, e3, e2, e1, e0];
3203     return cast(__m128i)( loadUnaligned!(short8)(result.ptr) );
3204 }
3205 unittest
3206 {
3207     short8 A = cast(short8) _mm_setr_epi16(7, 6, 5, -32768, 32767, 2, 1, 0);
3208     short[8] correct = [7, 6, 5, -32768, 32767, 2, 1, 0];
3209     assert(A.array == correct);
3210 }
3211 
3212 /// Set packed 32-bit integers with the supplied values in reverse order.
3213 __m128i _mm_setr_epi32 (int e3, int e2, int e1, int e0) pure @trusted
3214 {
3215     pragma(inline, true);
3216     int[4] result = [e3, e2, e1, e0];
3217     return cast(__m128i)( loadUnaligned!(int4)(result.ptr) );
3218 }
3219 unittest
3220 {
3221     int4 A = cast(int4) _mm_setr_epi32(-1, 0, -2147483648, 2147483647);
3222     int[4] correct = [-1, 0, -2147483648, 2147483647];
3223     assert(A.array == correct);
3224 }
3225 
3226 /// Set packed 64-bit integers with the supplied values in reverse order.
3227 __m128i _mm_setr_epi64 (long e1, long e0) pure @trusted
3228 {
3229     long[2] result = [e1, e0];
3230     return cast(__m128i)( loadUnaligned!(long2)(result.ptr) );
3231 }
3232 unittest
3233 {
3234     long2 A = cast(long2) _mm_setr_epi64(-1, 0);
3235     long[2] correct = [-1, 0];
3236     assert(A.array == correct);
3237 }
3238 
3239 /// Set packed 8-bit integers with the supplied values in reverse order.
3240 __m128i _mm_setr_epi8 (byte e15, byte e14, byte e13, byte e12,
3241                        byte e11, byte e10, byte e9,  byte e8,
3242                        byte e7,  byte e6,  byte e5,  byte e4,
3243                        byte e3,  byte e2,  byte e1,  byte e0) pure @trusted
3244 {
3245     byte[16] result = [e15, e14, e13, e12, e11, e10, e9, e8,
3246                       e7,  e6,  e5,  e4,  e3,  e2, e1, e0];
3247     return cast(__m128i)( loadUnaligned!(byte16)(result.ptr) );
3248 }
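// Elements land in memory order: the first argument ends up in the lowest lane.
unittest
{
    byte16 A = cast(byte16) _mm_setr_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
    byte[16] correct = [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0];
    assert(A.array == correct);
}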
3249 
3250 /// Set packed double-precision (64-bit) floating-point elements with the supplied values in reverse order.
3251 __m128d _mm_setr_pd (double e1, double e0) pure @trusted
3252 {
3253     pragma(inline, true);
3254     double2 result;
3255     result.ptr[0] = e1;
3256     result.ptr[1] = e0;
3257     return result;
3258 }
3259 unittest
3260 {
3261     __m128d A = _mm_setr_pd(61.0, 55.0);
3262     double[2] correct = [61.0, 55.0];
3263     assert(A.array == correct);
3264 }
3265 
3266 /// Return vector of type `__m128d` with all elements set to zero.
3267 __m128d _mm_setzero_pd () pure @trusted
3268 {
3269     pragma(inline, true);
3270     // Note: using loadUnaligned has better -O0 codegen compared to .ptr
3271     double[2] result = [0.0, 0.0];
3272     return loadUnaligned!(double2)(result.ptr);
3273 }
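// Both lanes must read back as zero.
unittest
{
    __m128d A = _mm_setzero_pd();
    double[2] correct = [0.0, 0.0];
    assert(A.array == correct);
}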
3274 
3275 /// Return vector of type `__m128i` with all elements set to zero.
3276 __m128i _mm_setzero_si128() pure @trusted
3277 {
3278     pragma(inline, true);
3279     // Note: using loadUnaligned has better -O0 codegen compared to .ptr
3280     int[4] result = [0, 0, 0, 0];
3281     return cast(__m128i)( loadUnaligned!(int4)(result.ptr) );
3282 }
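// All four 32-bit lanes must read back as zero.
unittest
{
    __m128i A = _mm_setzero_si128();
    int[4] correct = [0, 0, 0, 0];
    assert(A.array == correct);
}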
3283 
3284 /// Shuffle 32-bit integers in a using the control in `imm8`.
3285 /// See_also: `_MM_SHUFFLE`.
3286 __m128i _mm_shuffle_epi32(int imm8)(__m128i a) pure @safe
3287 {
3288     static if (GDC_with_SSE2)
3289     {
3290         return __builtin_ia32_pshufd(a, imm8);
3291     }
3292     else
3293     {
3294         return shufflevector!(int4, (imm8 >> 0) & 3,
3295                                     (imm8 >> 2) & 3,
3296                                     (imm8 >> 4) & 3,
3297                                     (imm8 >> 6) & 3)(a, a);
3298     }
3299 }
3300 unittest
3301 {
3302     __m128i A = _mm_setr_epi32(0, 1, 2, 3);
3303     enum int SHUFFLE = _MM_SHUFFLE(0, 1, 2, 3);
3304     int4 B = cast(int4) _mm_shuffle_epi32!SHUFFLE(A);
3305     int[4] expectedB = [ 3, 2, 1, 0 ];
3306     assert(B.array == expectedB);
3307 }
3308 
3309 /// Shuffle double-precision (64-bit) floating-point elements using the control in `imm8`.
3310 /// See_also: `_MM_SHUFFLE2`.
3311 __m128d _mm_shuffle_pd (int imm8)(__m128d a, __m128d b) pure @safe
3312 {
3313     static if (GDC_with_SSE2)
3314     {
3315         return __builtin_ia32_shufpd(a, b, imm8);
3316     }
3317     else
3318     {
3319         return shufflevector!(double2, 0 + ( imm8 & 1 ),
3320                                        2 + ( (imm8 >> 1) & 1 ))(a, b);
3321     }
3322 }
3323 unittest
3324 {
3325     __m128d A = _mm_setr_pd(0.5, 2.0);
3326     __m128d B = _mm_setr_pd(4.0, 5.0);
3327     enum int SHUFFLE = _MM_SHUFFLE2(1, 1);
3328     __m128d R = _mm_shuffle_pd!SHUFFLE(A, B);
3329     double[2] correct = [ 2.0, 5.0 ];
3330     assert(R.array == correct);
3331 }
3332 
3333 /// Shuffle 16-bit integers in the high 64 bits of `a` using the control in `imm8`. Store the results in the high 
/// 64 bits of result, with the low 64 bits being copied from `a` to result.
/// See_also: `_MM_SHUFFLE`.
3336 __m128i _mm_shufflehi_epi16(int imm8)(__m128i a) pure @safe
3337 {
3338     static if (GDC_with_SSE2)
3339     {
3340         return cast(__m128i) __builtin_ia32_pshufhw(cast(short8)a, imm8);
3341     }
3342     else
3343     {
3344         return cast(__m128i) shufflevector!(short8, 0, 1, 2, 3,
3345                                           4 + ( (imm8 >> 0) & 3 ),
3346                                           4 + ( (imm8 >> 2) & 3 ),
3347                                           4 + ( (imm8 >> 4) & 3 ),
3348                                           4 + ( (imm8 >> 6) & 3 ))(cast(short8)a, cast(short8)a);
3349     }
3350 }
3351 unittest
3352 {
3353     __m128i A = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
3354     enum int SHUFFLE = _MM_SHUFFLE(0, 1, 2, 3);
3355     short8 C = cast(short8) _mm_shufflehi_epi16!SHUFFLE(A);
3356     short[8] expectedC = [ 0, 1, 2, 3, 7, 6, 5, 4 ];
3357     assert(C.array == expectedC);
3358 }
3359 
3360 /// Shuffle 16-bit integers in the low 64 bits of `a` using the control in `imm8`. Store the results in the low 64 
/// bits of result, with the high 64 bits being copied from `a` to result.
3362 /// See_also: `_MM_SHUFFLE`.
3363 __m128i _mm_shufflelo_epi16(int imm8)(__m128i a) pure @safe
3364 {
3365     static if (GDC_with_SSE2)
3366     {
3367         return cast(__m128i) __builtin_ia32_pshuflw(cast(short8)a, imm8);
3368     }
3369     else
3370     {
3371         return cast(__m128i) shufflevector!(short8, ( (imm8 >> 0) & 3 ),
3372                                                     ( (imm8 >> 2) & 3 ),
3373                                                     ( (imm8 >> 4) & 3 ),
3374                                                     ( (imm8 >> 6) & 3 ), 4, 5, 6, 7)(cast(short8)a, cast(short8)a);
3375     }
3376 }
3377 unittest
3378 {
3379     __m128i A = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
3380     enum int SHUFFLE = _MM_SHUFFLE(0, 1, 2, 3);
3381     short8 B = cast(short8) _mm_shufflelo_epi16!SHUFFLE(A);
3382     short[8] expectedB = [ 3, 2, 1, 0, 4, 5, 6, 7 ];
3383     assert(B.array == expectedB);
3384 }
3385 
3386 /// Shift packed 32-bit integers in `a` left by `count` while shifting in zeros.
3387 deprecated("Use _mm_slli_epi32 instead.") __m128i _mm_sll_epi32 (__m128i a, __m128i count) pure @trusted
3388 {
3389     static if (LDC_with_SSE2)
3390     {
3391         return __builtin_ia32_pslld128(a, count);
3392     }
3393     else static if (GDC_with_SSE2)
3394     {
3395         return __builtin_ia32_pslld128(a, count);
3396     }
3397     else static if (DMD_with_32bit_asm)
3398     {
3399         asm pure nothrow @nogc @trusted
3400         {
3401             movdqu XMM0, a;
3402             movdqu XMM1, count;
3403             pslld XMM0, XMM1;
3404             movdqu a, XMM0;
3405         }
3406         return a;
3407     }
3408     else
3409     {
3410         int4 r = void;
3411         long2 lc = cast(long2)count;
3412         int bits = cast(int)(lc.array[0]);
3413         foreach(i; 0..4)
3414             r[i] = cast(uint)(a[i]) << bits;
3415         return r;
3416     }
3417 }
3418 
3419 /// Shift packed 64-bit integers in `a` left by `count` while shifting in zeros.
3420 deprecated("Use _mm_slli_epi64 instead.") __m128i _mm_sll_epi64 (__m128i a, __m128i count) pure @trusted
3421 {
3422     static if (LDC_with_SSE2)
3423     {
3424         return cast(__m128i) __builtin_ia32_psllq128(cast(long2)a, cast(long2)count);
3425     }
3426     else static if (GDC_with_SSE2)
3427     {
3428         return cast(__m128i) __builtin_ia32_psllq128(cast(long2)a, cast(long2)count);
3429     }
3430     else static if (DMD_with_32bit_asm)
3431     {
3432         asm pure nothrow @nogc @trusted
3433         {
3434             movdqu XMM0, a;
3435             movdqu XMM1, count;
3436             psllq XMM0, XMM1;
3437             movdqu a, XMM0;
3438         }
3439         return a;
3440     }
3441     else
3442     {
3443         // ARM: good since LDC 1.12 -O2
        // but the -O0 version is catastrophic
3445         long2 r = void;
3446         long2 sa = cast(long2)a;
3447         long2 lc = cast(long2)count;
3448         int bits = cast(int)(lc.array[0]);
3449         foreach(i; 0..2)
3450             r.array[i] = cast(ulong)(sa.array[i]) << bits;
3451         return cast(__m128i)r;
3452     }
3453 }
3454 
3455 /// Shift packed 16-bit integers in `a` left by `count` while shifting in zeros.
3456 deprecated("Use _mm_slli_epi16 instead.") __m128i _mm_sll_epi16 (__m128i a, __m128i count) pure @trusted
3457 {
3458     static if (LDC_with_SSE2)
3459     {
        return cast(__m128i) __builtin_ia32_psllw128(cast(short8)a, cast(short8)count);
3461     }
3462     else static if (GDC_with_SSE2)
3463     {
        return cast(__m128i) __builtin_ia32_psllw128(cast(short8)a, cast(short8)count);
3465     }
3466     else static if (DMD_with_32bit_asm)
3467     {
3468         asm pure nothrow @nogc
3469         {
3470             movdqu XMM0, a;
3471             movdqu XMM1, count;
3472             psllw XMM0, XMM1;
3473             movdqu a, XMM0;
3474         }
3475         return a;
3476     }
3477     else
3478     {
3479         short8 sa = cast(short8)a;
3480         long2 lc = cast(long2)count;
3481         int bits = cast(int)(lc.array[0]);
3482         short8 r = void;
3483         foreach(i; 0..8)
3484             r.ptr[i] = cast(short)(cast(ushort)(sa.array[i]) << bits);
3485         return cast(int4)r;
3486     }
3487 }
3488 
3489 
3490 /// Shift packed 32-bit integers in `a` left by `imm8` while shifting in zeros.
3491 __m128i _mm_slli_epi32 (__m128i a, int imm8) pure @trusted
3492 {
3493     static if (GDC_with_SSE2)
3494     {
3495         return __builtin_ia32_pslldi128(a, cast(ubyte)imm8);
3496     }
3497     else static if (LDC_with_SSE2)
3498     {
3499         return __builtin_ia32_pslldi128(a, cast(ubyte)imm8);
3500     }
3501     else
3502     {
3503         // Note: the intrinsics guarantee imm8[0..7] is taken, however
3504         //       D says "It's illegal to shift by the same or more bits 
3505         //       than the size of the quantity being shifted"
3506         //       and it's UB instead.
3507         int4 r = _mm_setzero_si128();
3508 
3509         ubyte count = cast(ubyte) imm8;
3510         if (count > 31)
3511             return r;
3512         
3513         foreach(i; 0..4)
3514             r.array[i] = cast(uint)(a.array[i]) << count;
3515         return r;
3516     }
3517 }
3518 unittest
3519 {
3520     __m128i A = _mm_setr_epi32(0, 2, 3, -4);
3521     __m128i B = _mm_slli_epi32(A, 1);
3522     __m128i B2 = _mm_slli_epi32(A, 1 + 256);
3523     int[4] expectedB = [ 0, 4, 6, -8];
3524     assert(B.array == expectedB);
3525     assert(B2.array == expectedB);
3526 
3527     __m128i C = _mm_slli_epi32(A, 0);
3528     int[4] expectedC = [ 0, 2, 3, -4];
3529     assert(C.array == expectedC);
3530 
3531     __m128i D = _mm_slli_epi32(A, 65);
3532     int[4] expectedD = [ 0, 0, 0, 0];
3533     assert(D.array == expectedD);
3534 }
3535 
3536 /// Shift packed 64-bit integers in `a` left by `imm8` while shifting in zeros.
3537 __m128i _mm_slli_epi64 (__m128i a, int imm8) pure @trusted
3538 {
3539     static if (GDC_with_SSE2)
3540     {
3541         return cast(__m128i) __builtin_ia32_psllqi128(cast(long2)a, cast(ubyte)imm8);
3542     }
3543     else static if (LDC_with_SSE2)
3544     {
3545         return cast(__m128i) __builtin_ia32_psllqi128(cast(long2)a, cast(ubyte)imm8);
3546     }
3547     else
3548     {
3549         long2 sa = cast(long2)a;
3550 
3551         // Note: the intrinsics guarantee imm8[0..7] is taken, however
3552         //       D says "It's illegal to shift by the same or more bits 
3553         //       than the size of the quantity being shifted"
3554         //       and it's UB instead.
3555         long2 r = cast(long2) _mm_setzero_si128();
3556         ubyte count = cast(ubyte) imm8;
3557         if (count > 63)
3558             return cast(__m128i)r;
3559 
3560         r.ptr[0] = cast(ulong)(sa.array[0]) << count;
3561         r.ptr[1] = cast(ulong)(sa.array[1]) << count;
3562         return cast(__m128i)r;
3563     }
3564 }
3565 unittest
3566 {
3567     __m128i A = _mm_setr_epi64(8, -4);
3568     long2 B = cast(long2) _mm_slli_epi64(A, 1);
3569     long2 B2 = cast(long2) _mm_slli_epi64(A, 1 + 1024);
3570     long[2] expectedB = [ 16, -8];
3571     assert(B.array == expectedB);
3572     assert(B2.array == expectedB);
3573 
3574     long2 C = cast(long2) _mm_slli_epi64(A, 0);
3575     long[2] expectedC = [ 8, -4];
3576     assert(C.array == expectedC);
3577 
3578     long2 D = cast(long2) _mm_slli_epi64(A, 64);
    long[2] expectedD = [ 0, 0 ];
3580     assert(D.array == expectedD);
3581 }
3582 
3583 /// Shift packed 16-bit integers in `a` left by `imm8` while shifting in zeros.
3584 __m128i _mm_slli_epi16(__m128i a, int imm8) pure @trusted
3585 {
3586     static if (GDC_with_SSE2)
3587     {
3588         return cast(__m128i) __builtin_ia32_psllwi128(cast(short8)a, cast(ubyte)imm8);
3589     }
3590     else static if (LDC_with_SSE2)
3591     {
3592         return cast(__m128i) __builtin_ia32_psllwi128(cast(short8)a, cast(ubyte)imm8);
3593     }
3594     else static if (LDC_with_ARM64)
3595     {
3596         short8 sa = cast(short8)a;
3597         short8 r = cast(short8)_mm_setzero_si128();
3598         ubyte count = cast(ubyte) imm8;
3599         if (count > 15)
3600             return cast(__m128i)r;
3601         r = sa << short8(count);
3602         return cast(__m128i)r;
3603     }
3604     else
3605     {
3606         short8 sa = cast(short8)a;
3607         short8 r = cast(short8)_mm_setzero_si128();
3608         ubyte count = cast(ubyte) imm8;
3609         if (count > 15)
3610             return cast(__m128i)r;
3611         foreach(i; 0..8)
3612             r.ptr[i] = cast(short)(sa.array[i] << count);
3613         return cast(__m128i)r;
3614     }
3615 }
3616 unittest
3617 {
3618     __m128i A = _mm_setr_epi16(0, 1, 2, 3, -4, -5, 6, 7);
3619     short8 B = cast(short8)( _mm_slli_epi16(A, 1) );
3620     short8 B2 = cast(short8)( _mm_slli_epi16(A, 1 + 256) );
3621     short[8] expectedB = [ 0, 2, 4, 6, -8, -10, 12, 14 ];
3622     assert(B.array == expectedB);
3623     assert(B2.array == expectedB);
3624 
3625     short8 C = cast(short8)( _mm_slli_epi16(A, 16) );
3626     short[8] expectedC = [ 0, 0, 0, 0, 0, 0, 0, 0 ];
3627     assert(C.array == expectedC);
3628 }
3629 
3630 
3631 /// Shift `a` left by `bytes` bytes while shifting in zeros.
3632 __m128i _mm_slli_si128(ubyte bytes)(__m128i op) pure @trusted
3633 {
3634     static if (bytes & 0xF0)
3635     {
3636         return _mm_setzero_si128();
3637     }
3638     else
3639     {
3640         static if (GDC_with_SSE2)
3641         {
3642             return cast(__m128i) __builtin_ia32_pslldqi128(cast(long2)op, cast(ubyte)(bytes * 8)); 
3643         }
3644         else version(DigitalMars)
3645         {
3646             version(D_InlineAsm_X86)
3647             {
3648                 asm pure nothrow @nogc @trusted // somehow doesn't work for x86_64
3649                 {
3650                     movdqu XMM0, op;
3651                     pslldq XMM0, bytes;
3652                     movdqu op, XMM0;
3653                 }
3654                 return op;
3655             }
3656             else
3657             {
3658                 byte16 A = cast(byte16)op;
3659                 byte16 R;
3660                 for (int n = 15; n >= bytes; --n)
3661                     R.ptr[n] = A.array[n-bytes];
3662                 for (int n = bytes-1; n >= 0; --n)
3663                     R.ptr[n] = 0;
3664                 return cast(__m128i)R;
3665             }
3666         }
3667         else
3668         {
3669             return cast(__m128i) shufflevector!(byte16,
3670             16 - bytes, 17 - bytes, 18 - bytes, 19 - bytes, 20 - bytes, 21 - bytes,
3671             22 - bytes, 23 - bytes, 24 - bytes, 25 - bytes, 26 - bytes, 27 - bytes,
3672             28 - bytes, 29 - bytes, 30 - bytes, 31 - bytes)
3673             (cast(byte16)_mm_setzero_si128(), cast(byte16)op);
3674         }
3675     }
3676 }
3677 unittest
3678 {
3679     __m128i A = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
3680     short8 R = cast(short8) _mm_slli_si128!8(A); // shift 8 bytes to the left
3681     short[8] correct = [ 0, 0, 0, 0, 0, 1, 2, 3 ];
3682     assert(R.array == correct);
3683 
    __m128i B = _mm_slli_si128!16(_mm_set1_epi32(-1));
3685     int[4] expectedB = [0, 0, 0, 0];
3686     assert(B.array == expectedB);
3687 }
3688 
3689 /// Compute the square root of packed double-precision (64-bit) floating-point elements in `vec`.
3690 __m128d _mm_sqrt_pd(__m128d vec) pure @trusted
3691 {
3692     version(LDC)
3693     {
3694         // Disappeared with LDC 1.11
3695         static if (__VERSION__ < 2081)
3696             return __builtin_ia32_sqrtpd(vec);
3697         else
3698         {
3699             vec.array[0] = llvm_sqrt(vec.array[0]);
3700             vec.array[1] = llvm_sqrt(vec.array[1]);
3701             return vec;
3702         }
3703     }
3704     else static if (GDC_with_SSE2)    
3705     {
3706         return __builtin_ia32_sqrtpd(vec);
3707     }
3708     else
3709     {
3710         vec.ptr[0] = sqrt(vec.array[0]);
3711         vec.ptr[1] = sqrt(vec.array[1]);
3712         return vec;
3713     }
3714 }
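// Perfect squares keep the comparison exact, whichever code path computed the root.
unittest
{
    __m128d A = _mm_setr_pd(4.0, 16.0);
    __m128d R = _mm_sqrt_pd(A);
    double[2] correct = [2.0, 4.0];
    assert(R.array == correct);
}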
3715 
3716 /// Compute the square root of the lower double-precision (64-bit) floating-point element in `b`, store the result in 
3717 /// the lower element of result, and copy the upper element from `a` to the upper element of result.
3718 __m128d _mm_sqrt_sd(__m128d a, __m128d b) pure @trusted
3719 {
3720     // Note: the builtin has one argument, since the legacy `sqrtsd` SSE2 instruction operates on the same register only.
3721     //       "128-bit Legacy SSE version: The first source operand and the destination operand are the same. 
3722     //        The quadword at bits 127:64 of the destination operand remains unchanged."
3723     version(LDC)
3724     {
3725         // Disappeared with LDC 1.11
3726         static if (__VERSION__ < 2081)
3727         {
3728             __m128d c = __builtin_ia32_sqrtsd(b);
3729             a[0] = c[0];
3730             return a;
3731         }
3732         else
3733         {
3734             a.array[0] = llvm_sqrt(b.array[0]);
3735             return a;
3736         }
3737     }
3738     else static if (GDC_with_SSE2)
3739     {
3740         __m128d c = __builtin_ia32_sqrtsd(b);
3741         a.ptr[0] = c.array[0];
3742         return a;
3743     }
3744     else
3745     {
3746         a.ptr[0] = sqrt(b.array[0]);
3747         return a;
3748     }
3749 }
3750 unittest
3751 {
3752     __m128d A = _mm_setr_pd(1.0, 3.0);
3753     __m128d B = _mm_setr_pd(4.0, 5.0);
3754     __m128d R = _mm_sqrt_sd(A, B);
3755     double[2] correct = [2.0, 3.0 ];
3756     assert(R.array == correct);
3757 }
3758 
3759 /// Shift packed 16-bit integers in `a` right by `count` while shifting in sign bits.
3760 deprecated("Use _mm_srai_epi16 instead.") __m128i _mm_sra_epi16 (__m128i a, __m128i count) pure @trusted
3761 {
3762     static if (GDC_with_SSE2)
3763     {
3764         return cast(__m128i) __builtin_ia32_psraw128(cast(short8)a, cast(short8)count);
3765     }
3766     else static if (LDC_with_SSE2)
3767     {
3768         return cast(__m128i) __builtin_ia32_psraw128(cast(short8)a, cast(short8)count);
3769     }
3770     else
3771     {
3772         short8 sa = cast(short8)a;
3773         long2 lc = cast(long2)count;
3774         int bits = cast(int)(lc.array[0]);
3775         short8 r = void;
3776         foreach(i; 0..8)
3777             r.ptr[i] = cast(short)(sa.array[i] >> bits);
3778         return cast(int4)r;
3779     }
3780 }
3781 
3782 /// Shift packed 32-bit integers in `a` right by `count` while shifting in sign bits.
3783 deprecated("Use _mm_srai_epi32 instead.") __m128i _mm_sra_epi32 (__m128i a, __m128i count) pure @trusted
3784 {
3785     static if (LDC_with_SSE2)
3786     {
3787         return __builtin_ia32_psrad128(a, count);
3788     }
3789     else static if (GDC_with_SSE2)
3790     {
3791         return __builtin_ia32_psrad128(a, count);
3792     }
3793     else
3794     {    
3795         int4 r = void;
3796         long2 lc = cast(long2)count;
3797         int bits = cast(int)(lc.array[0]);
3798         r.ptr[0] = (a.array[0] >> bits);
3799         r.ptr[1] = (a.array[1] >> bits);
3800         r.ptr[2] = (a.array[2] >> bits);
3801         r.ptr[3] = (a.array[3] >> bits);
3802         return r;
3803     }
3804 }
3805 
3806 
3807 /// Shift packed 16-bit integers in `a` right by `imm8` while shifting in sign bits.
3808 __m128i _mm_srai_epi16 (__m128i a, int imm8) pure @trusted
3809 {
3810     static if (GDC_with_SSE2)
3811     {
3812         return cast(__m128i) __builtin_ia32_psrawi128(cast(short8)a, cast(ubyte)imm8);
3813     }
3814     else static if (LDC_with_SSE2)
3815     {
3816         return cast(__m128i) __builtin_ia32_psrawi128(cast(short8)a, cast(ubyte)imm8);
3817     }
3818     else static if (LDC_with_ARM64)
3819     {
3820         short8 sa = cast(short8)a;
3821         ubyte count = cast(ubyte)imm8;
3822         if (count > 15) 
3823             count = 15;
3824         short8 r = sa >> short8(count);
3825         return cast(__m128i)r;
3826     }
3827     else
3828     {
3829         short8 sa = cast(short8)a;
3830         short8 r = void;
3831 
3832         // Note: the intrinsics guarantee imm8[0..7] is taken, however
3833         //       D says "It's illegal to shift by the same or more bits 
3834         //       than the size of the quantity being shifted"
3835         //       and it's UB instead.
3836         ubyte count = cast(ubyte)imm8;
3837         if (count > 15) 
3838             count = 15;
3839         foreach(i; 0..8)
3840             r.ptr[i] = cast(short)(sa.array[i] >> count);
3841         return cast(int4)r;
3842     }
3843 }
3844 unittest
3845 {
3846     __m128i A = _mm_setr_epi16(0, 1, 2, 3, -4, -5, 6, 7);
3847     short8 B = cast(short8)( _mm_srai_epi16(A, 1) );
3848     short8 B2 = cast(short8)( _mm_srai_epi16(A, 1 + 256) );
3849     short[8] expectedB = [ 0, 0, 1, 1, -2, -3, 3, 3 ];
3850     assert(B.array == expectedB);
3851     assert(B2.array == expectedB);
3852 
3853     short8 C = cast(short8)( _mm_srai_epi16(A, 18) );
3854     short[8] expectedC = [ 0, 0, 0, 0, -1, -1, 0, 0 ];
3855     assert(C.array == expectedC);
3856 }
3857 
3858 /// Shift packed 32-bit integers in `a` right by `imm8` while shifting in sign bits.
3859 __m128i _mm_srai_epi32 (__m128i a, int imm8) pure @trusted
3860 {
3861     static if (LDC_with_SSE2)
3862     {
3863         return __builtin_ia32_psradi128(a, cast(ubyte)imm8);
3864     }
3865     else static if (GDC_with_SSE2)
3866     {
3867         return __builtin_ia32_psradi128(a, cast(ubyte)imm8);
3868     }
3869     else
3870     {
3871         int4 r = void;
3872 
3873         // Note: the intrinsics guarantee imm8[0..7] is taken, however
3874         //       D says "It's illegal to shift by the same or more bits 
3875         //       than the size of the quantity being shifted"
3876         //       and it's UB instead.
3877         ubyte count = cast(ubyte) imm8;
3878         if (count > 31)
3879             count = 31;
3880 
3881         r.ptr[0] = (a.array[0] >> count);
3882         r.ptr[1] = (a.array[1] >> count);
3883         r.ptr[2] = (a.array[2] >> count);
3884         r.ptr[3] = (a.array[3] >> count);
3885         return r;
3886     }
3887 }
3888 unittest
3889 {
3890     __m128i A = _mm_setr_epi32(0, 2, 3, -4);
3891     __m128i B = _mm_srai_epi32(A, 1);
3892     __m128i B2 = _mm_srai_epi32(A, 1 + 256);
3893     int[4] expectedB = [ 0, 1, 1, -2];
3894     assert(B.array == expectedB);
3895     assert(B2.array == expectedB);
3896 
3897     __m128i C = _mm_srai_epi32(A, 32);
3898     int[4] expectedC = [ 0, 0, 0, -1];
3899     assert(C.array == expectedC);
3900 
3901     __m128i D = _mm_srai_epi32(A, 0);
3902     int[4] expectedD = [ 0, 2, 3, -4];
3903     assert(D.array == expectedD);
3904 }
3905 
/// Shift packed 16-bit integers in `a` right by `count` while shifting in zeros.
deprecated("Use _mm_srli_epi16 instead.") __m128i _mm_srl_epi16 (__m128i a, __m128i count) pure @trusted
3907 {
3908     static if (LDC_with_SSE2)
3909     {
3910         return cast(__m128i) __builtin_ia32_psrlw128(cast(short8)a, cast(short8)count);
3911     }
3912     else static if (GDC_with_SSE2)
3913     {
3914         return cast(__m128i) __builtin_ia32_psrlw128(cast(short8)a, cast(short8)count);
3915     }
3916     else
3917     {
3918         short8 sa = cast(short8)a;
3919         long2 lc = cast(long2)count;
3920         int bits = cast(int)(lc.array[0]);
3921         short8 r = void;
3922         foreach(i; 0..8)
3923             r.ptr[i] = cast(short)(cast(ushort)(sa.array[i]) >> bits);
3924         return cast(int4)r;
3925     }
3926 }
3927 
/// Shift packed 32-bit integers in `a` right by `count` while shifting in zeros.
deprecated("Use _mm_srli_epi32 instead.") __m128i _mm_srl_epi32 (__m128i a, __m128i count) pure @trusted
3929 {
3930     static if (LDC_with_SSE2)
3931     {
3932         return __builtin_ia32_psrld128(a, count);
3933     }
3934     else static if (GDC_with_SSE2)
3935     {
3936         return __builtin_ia32_psrld128(a, count);
3937     }
3938     else
3939     {
3940         int4 r = void;
3941         long2 lc = cast(long2)count;
3942         int bits = cast(int)(lc.array[0]);
3943         r.ptr[0] = cast(uint)(a.array[0]) >> bits;
3944         r.ptr[1] = cast(uint)(a.array[1]) >> bits;
3945         r.ptr[2] = cast(uint)(a.array[2]) >> bits;
3946         r.ptr[3] = cast(uint)(a.array[3]) >> bits;
3947         return r;
3948     }
3949 }
3950 
/// Shift packed 64-bit integers in `a` right by `count` while shifting in zeros.
deprecated("Use _mm_srli_epi64 instead.") __m128i _mm_srl_epi64 (__m128i a, __m128i count) pure @trusted
3952 {
3953     static if (LDC_with_SSE2)
3954     {
3955         return cast(__m128i) __builtin_ia32_psrlq128(cast(long2)a, cast(long2)count);
3956     }
3957     else static if (GDC_with_SSE2)
3958     {
3959         return cast(__m128i) __builtin_ia32_psrlq128(cast(long2)a, cast(long2)count);
3960     }
3961     else
3962     {
3963         // Workaround for https://issues.dlang.org/show_bug.cgi?id=23047
3964         // => avoid void initialization.
3965         long2 r;
3966         long2 sa = cast(long2)a;
3967         long2 lc = cast(long2)count;
3968         int bits = cast(int)(lc.array[0]);
3969         r.ptr[0] = cast(ulong)(sa.array[0]) >> bits;
3970         r.ptr[1] = cast(ulong)(sa.array[1]) >> bits;
3971         return cast(__m128i)r;
3972     }
3973 }
3974 
3975 /// Shift packed 16-bit integers in `a` right by `imm8` while shifting in zeros.
3976 __m128i _mm_srli_epi16 (__m128i a, int imm8) pure @trusted
3977 {
3978     static if (GDC_with_SSE2)
3979     {
3980         return cast(__m128i) __builtin_ia32_psrlwi128(cast(short8)a, cast(ubyte)imm8);
3981     }
3982     else static if (LDC_with_SSE2)
3983     {
3984         return cast(__m128i) __builtin_ia32_psrlwi128(cast(short8)a, cast(ubyte)imm8);
3985     }
3986     else static if (LDC_with_ARM64)
3987     {
3988         short8 sa = cast(short8)a;
3989         short8 r = cast(short8) _mm_setzero_si128();
3990 
3991         ubyte count = cast(ubyte)imm8;
3992         if (count >= 16)
3993             return cast(__m128i)r;
3994 
        r = sa >>> short8(count); // The unsigned vector shift `>>>` is available with LDC, but not with DMD.
3996         return cast(__m128i)r;
3997     }
3998     else
3999     {
4000         short8 sa = cast(short8)a;
4001         ubyte count = cast(ubyte)imm8;
4002 
4003         short8 r = cast(short8) _mm_setzero_si128();
4004         if (count >= 16)
4005             return cast(__m128i)r;
4006 
4007         foreach(i; 0..8)
4008             r.array[i] = cast(short)(cast(ushort)(sa.array[i]) >> count);
4009         return cast(__m128i)r;
4010     }
4011 }
4012 unittest
4013 {
4014     __m128i A = _mm_setr_epi16(0, 1, 2, 3, -4, -5, 6, 7);
4015     short8 B = cast(short8)( _mm_srli_epi16(A, 1) );
4016     short8 B2 = cast(short8)( _mm_srli_epi16(A, 1 + 256) );
4017     short[8] expectedB = [ 0, 0, 1, 1, 0x7FFE, 0x7FFD, 3, 3 ];
4018     assert(B.array == expectedB);
4019     assert(B2.array == expectedB);
4020 
4021     short8 C = cast(short8)( _mm_srli_epi16(A, 16) );
4022     short[8] expectedC = [ 0, 0, 0, 0, 0, 0, 0, 0];
4023     assert(C.array == expectedC);
4024 
4025     short8 D = cast(short8)( _mm_srli_epi16(A, 0) );
4026     short[8] expectedD = [ 0, 1, 2, 3, -4, -5, 6, 7 ];
4027     assert(D.array == expectedD);
4028 }
4029 
4030 
4031 /// Shift packed 32-bit integers in `a` right by `imm8` while shifting in zeros.
4032 __m128i _mm_srli_epi32 (__m128i a, int imm8) pure @trusted
4033 {
4034     static if (GDC_with_SSE2)
4035     {
4036         return __builtin_ia32_psrldi128(a, cast(ubyte)imm8);
4037     }
4038     else static if (LDC_with_SSE2)
4039     {
4040         return __builtin_ia32_psrldi128(a, cast(ubyte)imm8);
4041     }
4042     else
4043     {
4044         ubyte count = cast(ubyte) imm8;
4045 
4046         // Note: the intrinsics guarantee imm8[0..7] is taken, however
4047         //       D says "It's illegal to shift by the same or more bits 
4048         //       than the size of the quantity being shifted"
4049         //       and it's UB instead.
4050         int4 r = _mm_setzero_si128();
4051         if (count >= 32)
4052             return r;
4053         r.ptr[0] = a.array[0] >>> count;
4054         r.ptr[1] = a.array[1] >>> count;
4055         r.ptr[2] = a.array[2] >>> count;
4056         r.ptr[3] = a.array[3] >>> count;
4057         return r;
4058     }
4059 }
4060 unittest
4061 {
4062     __m128i A = _mm_setr_epi32(0, 2, 3, -4);
4063     __m128i B = _mm_srli_epi32(A, 1);
4064     __m128i B2 = _mm_srli_epi32(A, 1 + 256);
4065     int[4] expectedB = [ 0, 1, 1, 0x7FFFFFFE];
4066     assert(B.array == expectedB);
4067     assert(B2.array == expectedB);
4068  
4069     __m128i C = _mm_srli_epi32(A, 255);
4070     int[4] expectedC = [ 0, 0, 0, 0 ];
4071     assert(C.array == expectedC);
4072 }
4073 
4074 /// Shift packed 64-bit integers in `a` right by `imm8` while shifting in zeros.
4075 __m128i _mm_srli_epi64 (__m128i a, int imm8) pure @trusted
4076 {
4077     static if (GDC_with_SSE2)
4078     {
4079         return cast(__m128i) __builtin_ia32_psrlqi128(cast(long2)a, cast(ubyte)imm8);
4080     }
4081     else static if (LDC_with_SSE2)
4082     {
4083         return cast(__m128i) __builtin_ia32_psrlqi128(cast(long2)a, cast(ubyte)imm8);
4084     }
4085     else
4086     {
4087         long2 r = cast(long2) _mm_setzero_si128();
4088         long2 sa = cast(long2)a;
4089 
4090         ubyte count = cast(ubyte) imm8;
4091         if (count >= 64)
4092             return cast(__m128i)r;
4093 
4094         r.ptr[0] = sa.array[0] >>> count;
4095         r.ptr[1] = sa.array[1] >>> count;
4096         return cast(__m128i)r;
4097     }
4098 }
4099 unittest
4100 {
4101     __m128i A = _mm_setr_epi64(8, -4);
4102     long2 B = cast(long2) _mm_srli_epi64(A, 1);
4103     long2 B2 = cast(long2) _mm_srli_epi64(A, 1 + 512);
4104     long[2] expectedB = [ 4, 0x7FFFFFFFFFFFFFFE];
4105     assert(B.array == expectedB);
4106     assert(B2.array == expectedB);
4107 
4108     long2 C = cast(long2) _mm_srli_epi64(A, 64);
4109     long[2] expectedC = [ 0, 0 ];
4110     assert(C.array == expectedC);
4111 }
4112 
4113 /// Shift `v` right by `bytes` bytes while shifting in zeros.
4114 __m128i _mm_srli_si128(ubyte bytes)(__m128i v) pure @safe
4115 {
4116     static if (bytes & 0xF0)
4117     {
4118         return _mm_setzero_si128();
4119     }
4120     else static if (GDC_with_SSE2)
4121     {
4122         return cast(__m128i) __builtin_ia32_psrldqi128(cast(long2)v, cast(ubyte)(bytes * 8));
4123     }
4124     else static if (DMD_with_32bit_asm)
4125     {
4126         asm pure nothrow @nogc @trusted
4127         {
4128             movdqu XMM0, v;
4129             psrldq XMM0, bytes;
4130             movdqu v, XMM0;
4131         }
4132         return v;
4133     }
4134     else
4135     {
4136         return cast(__m128i) shufflevector!(byte16,
4137                                             bytes+0, bytes+1, bytes+2, bytes+3, bytes+4, bytes+5, bytes+6, bytes+7,
4138                                             bytes+8, bytes+9, bytes+10, bytes+11, bytes+12, bytes+13, bytes+14, bytes+15)
4139                                            (cast(byte16) v, cast(byte16)_mm_setzero_si128());
4140     }
4141 }
4142 unittest
4143 {
4144     __m128i R = _mm_srli_si128!4(_mm_set_epi32(4, 3, 2, 1));
4145     int[4] correct = [2, 3, 4, 0];
4146     assert(R.array == correct);
4147 
4148     __m128i A = _mm_srli_si128!16(_mm_set1_epi32(-1));
4149     int[4] expectedA = [0, 0, 0, 0];
4150     assert(A.array == expectedA);
4151 }
4152 
4153 /// Shift `v` right by `bytes` bytes while shifting in zeros.
4154 /// #BONUS
4155 __m128 _mm_srli_ps(ubyte bytes)(__m128 v) pure @safe
4156 {
4157     return cast(__m128)_mm_srli_si128!bytes(cast(__m128i)v);
4158 }
4159 unittest
4160 {
4161     __m128 R = _mm_srli_ps!8(_mm_set_ps(4.0f, 3.0f, 2.0f, 1.0f));
4162     float[4] correct = [3.0f, 4.0f, 0, 0];
4163     assert(R.array == correct);
4164 }
4165 
4166 /// Shift `v` right by `bytes` bytes while shifting in zeros.
4167 /// #BONUS
4168 __m128d _mm_srli_pd(ubyte bytes)(__m128d v) pure @safe
4169 {
4170     return cast(__m128d) _mm_srli_si128!bytes(cast(__m128i)v);
4171 }
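// Shifting right by 8 bytes moves the upper double into the lower lane and zeroes the upper lane.
unittest
{
    __m128d A = _mm_setr_pd(2.5, -16.0);
    __m128d R = _mm_srli_pd!8(A);
    double[2] correct = [-16.0, 0.0];
    assert(R.array == correct);
}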
4172 
4173 /// Store 128-bits (composed of 2 packed double-precision (64-bit) floating-point elements) from `a` into memory. 
4174 /// `mem_addr` must be aligned on a 16-byte boundary or a general-protection exception may be generated.
4175 void _mm_store_pd (double* mem_addr, __m128d a) pure @trusted
4176 {
4177     pragma(inline, true);
4178     __m128d* aligned = cast(__m128d*)mem_addr;
4179     *aligned = a;
4180 }
4181 
4182 /// Store the lower double-precision (64-bit) floating-point element from `a` into 2 contiguous elements in memory. 
4183 /// `mem_addr` must be aligned on a 16-byte boundary or a general-protection exception may be generated.
4184 void _mm_store_pd1 (double* mem_addr, __m128d a) pure @trusted
4185 {
4186     __m128d* aligned = cast(__m128d*)mem_addr;
4187     __m128d r;
4188     r.ptr[0] = a.array[0];
4189     r.ptr[1] = a.array[0];
4190     *aligned = r;
4191 }
4192 
4193 /// Store the lower double-precision (64-bit) floating-point element from `a` into memory. `mem_addr` does not need to 
4194 /// be aligned on any particular boundary.
4195 void _mm_store_sd (double* mem_addr, __m128d a) pure @safe
4196 {
4197     pragma(inline, true);
4198     *mem_addr = a.array[0];
4199 }
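// Only the lower element is stored; no particular alignment is required.
unittest
{
    double x = 0.0;
    _mm_store_sd(&x, _mm_setr_pd(55.0, 61.0));
    assert(x == 55.0);
}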
4200 
4201 /// Store 128-bits of integer data from `a` into memory. `mem_addr` must be aligned on a 16-byte boundary or a 
4202 /// general-protection exception may be generated.
4203 void _mm_store_si128 (__m128i* mem_addr, __m128i a) pure @safe
4204 {
4205     pragma(inline, true);
4206     *mem_addr = a;
4207 }
4208 
4209 alias _mm_store1_pd = _mm_store_pd1; ///
4210 
4211 /// Store the upper double-precision (64-bit) floating-point element from `a` into memory.
4212 void _mm_storeh_pd (double* mem_addr, __m128d a) pure @safe
4213 {
4214     pragma(inline, true);
4215     *mem_addr = a.array[1];
4216 }
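// Only the upper element is stored.
unittest
{
    double x = 0.0;
    _mm_storeh_pd(&x, _mm_setr_pd(55.0, 61.0));
    assert(x == 61.0);
}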
4217 
// Note: `mem_addr` doesn't have to actually be aligned, which breaks
// expectations from the user point of view. This problem also exists in C++.
/// Store 64-bit integer from the first element of `a` into memory.
void _mm_storel_epi64 (__m128i* mem_addr, __m128i a) pure @safe
4221 {
4222     pragma(inline, true);
4223     long* dest = cast(long*)mem_addr;
4224     long2 la = cast(long2)a;
4225     *dest = la.array[0];
4226 }
4227 unittest
4228 {
4229     long[3] A = [1, 2, 3];
4230     _mm_storel_epi64(cast(__m128i*)(&A[1]), _mm_set_epi64x(0x1_0000_0000, 0x1_0000_0000));
4231     long[3] correct = [1, 0x1_0000_0000, 3];
4232     assert(A == correct);
4233 }
4234 
4235 /// Store the lower double-precision (64-bit) floating-point element from `a` into memory.
4236 void _mm_storel_pd (double* mem_addr, __m128d a) pure @safe
4237 {
4238     pragma(inline, true);
4239     *mem_addr = a.array[0];
4240 }
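// Only the lower element is stored.
unittest
{
    double x = 0.0;
    _mm_storel_pd(&x, _mm_setr_pd(55.0, 61.0));
    assert(x == 55.0);
}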
4241 
4242 /// Store 2 double-precision (64-bit) floating-point elements from `a` into memory in reverse order. `mem_addr` must be 
4243 /// aligned on a 16-byte boundary or a general-protection exception may be generated.
void _mm_storer_pd (double* mem_addr, __m128d a) pure @trusted
4245 {
4246     __m128d* aligned = cast(__m128d*)mem_addr;
4247     *aligned = shufflevector!(double2, 1, 0)(a, a);
4248 }
4249 
4250 /// Store 128-bits (composed of 2 packed double-precision (64-bit) floating-point elements) from `a` into memory. 
4251 /// `mem_addr` does not need to be aligned on any particular boundary.
4252 void _mm_storeu_pd (double* mem_addr, __m128d a) pure @safe
4253 {
4254     pragma(inline, true);
4255     storeUnaligned!double2(a, mem_addr);
4256 }
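// Unaligned store of both elements.
unittest
{
    double[2] R;
    _mm_storeu_pd(R.ptr, _mm_setr_pd(55.0, 61.0));
    double[2] correct = [55.0, 61.0];
    assert(R == correct);
}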
4257 
4258 /// Store 128-bits of integer data from `a` into memory. `mem_addr` does not need to be aligned on any particular 
4259 /// boundary.
4260 void _mm_storeu_si128 (__m128i* mem_addr, __m128i a) pure @safe
4261 {
4262     pragma(inline, true);
4263     storeUnaligned!__m128i(a, cast(int*)mem_addr);
4264 }
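// Unaligned store of all 128 bits.
unittest
{
    int[4] R;
    _mm_storeu_si128(cast(__m128i*) R.ptr, _mm_setr_epi32(1, 2, 3, 4));
    int[4] correct = [1, 2, 3, 4];
    assert(R == correct);
}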
4265 
4266 /// Store 32-bit integer from the first element of `a` into memory. 
4267 /// `mem_addr` does not need to be aligned on any particular boundary.
4268 void _mm_storeu_si32 (void* mem_addr, __m128i a) pure @trusted
4269 {
4270     pragma(inline, true);
4271     int* dest = cast(int*)mem_addr;
4272     *dest = a.array[0];
4273 }
4274 unittest
4275 {
4276     int[2] arr = [-24, 12];
4277     _mm_storeu_si32(&arr[1], _mm_setr_epi32(-1, -2, -6, -7));
4278     assert(arr == [-24, -1]);
4279 }
4280 
4281 /// Store 128-bits (composed of 2 packed double-precision (64-bit) floating-point elements)
4282 /// from a into memory using a non-temporal memory hint. mem_addr must be aligned on a 16-byte
4283 /// boundary or a general-protection exception may be generated.
4284 void _mm_stream_pd (double* mem_addr, __m128d a)
4285 {
4286     // BUG see `_mm_stream_ps` for an explanation why we don't implement non-temporal moves
4287     __m128d* dest = cast(__m128d*)mem_addr;
4288     *dest = a;
4289 }
4290 
4291 /// Store 128-bits of integer data from a into memory using a non-temporal memory hint.
4292 /// mem_addr must be aligned on a 16-byte boundary or a general-protection exception
4293 /// may be generated.
4294 void _mm_stream_si128 (__m128i* mem_addr, __m128i a)
4295 {
4296     // BUG see `_mm_stream_ps` for an explanation why we don't implement non-temporal moves
4297     __m128i* dest = cast(__m128i*)mem_addr;
4298     *dest = a;
4299 }
4300 
4301 /// Store 32-bit integer a into memory using a non-temporal hint to minimize cache
4302 /// pollution. If the cache line containing address mem_addr is already in the cache,
4303 /// the cache will be updated.
4304 void _mm_stream_si32 (int* mem_addr, int a)
4305 {
4306     // BUG see `_mm_stream_ps` for an explanation why we don't implement non-temporal moves
4307     *mem_addr = a;
4308 }
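// As noted above, this currently degrades to a regular store; the value must still land.
unittest
{
    int dest = 0;
    _mm_stream_si32(&dest, 42);
    assert(dest == 42);
}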
4309 
4310 /// Store 64-bit integer a into memory using a non-temporal hint to minimize
4311 /// cache pollution. If the cache line containing address mem_addr is already
4312 /// in the cache, the cache will be updated.
4313 void _mm_stream_si64 (long* mem_addr, long a)
4314 {
4315     // BUG See `_mm_stream_ps` for an explanation why we don't implement non-temporal moves
4316     *mem_addr = a;
4317 }
4318 
4319 /// Subtract packed 16-bit integers in `b` from packed 16-bit integers in `a`.
4320 __m128i _mm_sub_epi16(__m128i a, __m128i b) pure @safe
4321 {
4322     pragma(inline, true);
4323     return cast(__m128i)(cast(short8)a - cast(short8)b);
4324 }
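// Subtraction is modular: 32767 - (-1) wraps around to -32768.
unittest
{
    __m128i A = _mm_setr_epi16(16, 32767, 1, 2, 3, 4, 5, 6);
    __m128i B = _mm_setr_epi16(15,    -1, 1, 2, 3, 4, 5, 6);
    short8 R = cast(short8) _mm_sub_epi16(A, B);
    short[8] correct = [1, -32768, 0, 0, 0, 0, 0, 0];
    assert(R.array == correct);
}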
4325 
4326 /// Subtract packed 32-bit integers in `b` from packed 32-bit integers in `a`.
4327 __m128i _mm_sub_epi32(__m128i a, __m128i b) pure @safe
4328 {
4329     pragma(inline, true);
4330     return cast(__m128i)(cast(int4)a - cast(int4)b);
4331 }
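// Element-wise 32-bit difference.
unittest
{
    __m128i A = _mm_setr_epi32(4, 8, -10, 0);
    __m128i B = _mm_setr_epi32(1, 2,   3, 4);
    int4 R = cast(int4) _mm_sub_epi32(A, B);
    int[4] correct = [3, 6, -13, -4];
    assert(R.array == correct);
}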
4332 
4333 /// Subtract packed 64-bit integers in `b` from packed 64-bit integers in `a`.
4334 __m128i _mm_sub_epi64(__m128i a, __m128i b) pure @safe
4335 {
4336     pragma(inline, true);
4337     return cast(__m128i)(cast(long2)a - cast(long2)b);
4338 }
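// Element-wise 64-bit difference.
unittest
{
    __m128i A = _mm_setr_epi64(1234567890123, -1);
    __m128i B = _mm_setr_epi64( 234567890123,  1);
    long2 R = cast(long2) _mm_sub_epi64(A, B);
    long[2] correct = [1000000000000, -2];
    assert(R.array == correct);
}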
4339 
4340 /// Subtract packed 8-bit integers in `b` from packed 8-bit integers in `a`.
4341 __m128i _mm_sub_epi8(__m128i a, __m128i b) pure @safe
4342 {
4343     pragma(inline, true);
4344     return cast(__m128i)(cast(byte16)a - cast(byte16)b);
4345 }
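// Element-wise 8-bit difference.
unittest
{
    __m128i A = _mm_setr_epi8(5, 4, 3, 2, 1, 0, -1, -2, 5, 4, 3, 2, 1, 0, -1, -2);
    __m128i B = _mm_set1_epi8(1);
    byte16 R = cast(byte16) _mm_sub_epi8(A, B);
    byte[16] correct = [4, 3, 2, 1, 0, -1, -2, -3, 4, 3, 2, 1, 0, -1, -2, -3];
    assert(R.array == correct);
}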
4346 
4347 /// Subtract packed double-precision (64-bit) floating-point elements in `b` from packed double-precision (64-bit) 
4348 /// floating-point elements in `a`.
4349 __m128d _mm_sub_pd(__m128d a, __m128d b) pure @safe
4350 {
4351     pragma(inline, true);
4352     return a - b;
4353 }
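// Element-wise double-precision difference.
unittest
{
    __m128d A = _mm_setr_pd(4000.0, -8.0);
    __m128d B = _mm_setr_pd(  12.0, -8.0);
    __m128d R = _mm_sub_pd(A, B);
    double[2] correct = [3988.0, 0.0];
    assert(R.array == correct);
}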
4354 
4355 /// Subtract the lower double-precision (64-bit) floating-point element in `b` from the lower double-precision (64-bit) 
4356 /// floating-point element in `a`, store that in the lower element of result, and copy the upper element from `a` to the
4357 /// upper element of result.
4358 __m128d _mm_sub_sd(__m128d a, __m128d b) pure @trusted
4359 {
4360     version(DigitalMars)
4361     {
4362         // Work-around for https://issues.dlang.org/show_bug.cgi?id=19599
4363         // Note that this is unneeded since DMD >= 2.094.0 at least, haven't investigated again
4364         asm pure nothrow @nogc @trusted { nop;}
4365         a[0] = a[0] - b[0];
4366         return a;
4367     }
4368     else static if (GDC_with_SSE2)
4369     {
4370         return __builtin_ia32_subsd(a, b);
4371     }
4372     else
4373     {
4374         a.ptr[0] -= b.array[0];
4375         return a;
4376     }
4377 }
4378 unittest
4379 {
4380     __m128d a = [1.5, -2.0];
4381     a = _mm_sub_sd(a, a);
4382     assert(a.array == [0.0, -2.0]);
4383 }
4384 
4385 /// Subtract 64-bit integer `b` from 64-bit integer `a`.
4386 __m64 _mm_sub_si64 (__m64 a, __m64 b) pure @safe
4387 {
4388     pragma(inline, true);
4389     return a - b;
4390 }
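// `__m64` holds a single 64-bit lane, set here through `.ptr` as elsewhere in this module.
unittest
{
    __m64 A, B;
    A.ptr[0] = 100;
    B.ptr[0] = 30;
    __m64 C = _mm_sub_si64(A, B);
    assert(C.array[0] == 70);
}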
4391 
/// Subtract packed signed 16-bit integers in `b` from packed 16-bit integers in `a` using saturation.
4393 __m128i _mm_subs_epi16(__m128i a, __m128i b) pure @trusted
4394 {
4395     version(LDC)
4396     {
4397         static if (__VERSION__ >= 2085) // saturation x86 intrinsics disappeared in LLVM 8
4398         {
4399             // Generates PSUBSW since LDC 1.15 -O0
4402             enum prefix = `declare <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> %a, <8 x i16> %b)`;
4403             enum ir = `
4404                 %r = call <8 x i16> @llvm.ssub.sat.v8i16( <8 x i16> %0, <8 x i16> %1)
4405                 ret <8 x i16> %r`;
4406             return cast(__m128i) LDCInlineIREx!(prefix, ir, "", short8, short8, short8)(cast(short8)a, cast(short8)b);
4407         }
4408         else static if (LDC_with_ARM) // Raspberry ships with LDC 1.12, no saturation 
4409         {
4411             short[8] res;
4412             short8 sa = cast(short8)a;
4413             short8 sb = cast(short8)b;
4414             foreach(i; 0..8)
4415                 res[i] = saturateSignedIntToSignedShort(sa.array[i] - sb.array[i]);
4416             return _mm_loadu_si128(cast(int4*)res.ptr);
4417         }
4418         else static if (LDC_with_SSE2)
4419         {
4420             return cast(__m128i) __builtin_ia32_psubsw128(cast(short8) a, cast(short8) b);
4421         }
4422         else
4423             static assert(false);
4424     }
4425     else static if (GDC_with_SSE2)
4426     {
4427         return cast(__m128i) __builtin_ia32_psubsw128(cast(short8) a, cast(short8) b);
4428     }
4429     else
4430     {
4431         short[8] res;
4432         short8 sa = cast(short8)a;
4433         short8 sb = cast(short8)b;
4434         foreach(i; 0..8)
4435             res.ptr[i] = saturateSignedIntToSignedShort(sa.array[i] - sb.array[i]);
4436         return _mm_loadu_si128(cast(int4*)res.ptr);
4437     }
4438 }
4439 unittest
4440 {
4441     short8 res = cast(short8) _mm_subs_epi16(_mm_setr_epi16(32760, -32760, 5, 4, 3, 2, 1, 0),
4442                                              _mm_setr_epi16(-10  ,     16, 5, 4, 3, 2, 1, 0));
4443     static immutable short[8] correctResult =              [32767, -32768, 0, 0, 0, 0, 0, 0];
4444     assert(res.array == correctResult);
4445 }
4446 
/// Subtract packed signed 8-bit integers in `b` from packed 8-bit integers in `a` using saturation.
4448 __m128i _mm_subs_epi8(__m128i a, __m128i b) pure @trusted
4449 {
4450     version(LDC)
4451     {
4452         static if (__VERSION__ >= 2085) // saturation x86 intrinsics disappeared in LLVM 8
4453         {
4454             // x86: Generates PSUBSB since LDC 1.15 -O0
4455             // ARM: Generates sqsub.16b since LDC 1.21 -O0
4456             enum prefix = `declare <16 x i8> @llvm.ssub.sat.v16i8(<16 x i8> %a, <16 x i8> %b)`;
4457             enum ir = `
4458                 %r = call <16 x i8> @llvm.ssub.sat.v16i8( <16 x i8> %0, <16 x i8> %1)
4459                 ret <16 x i8> %r`;
4460             return cast(__m128i) LDCInlineIREx!(prefix, ir, "", byte16, byte16, byte16)(cast(byte16)a, cast(byte16)b);
4461         }
4462         else static if (LDC_with_ARM) // Raspberry ships with LDC 1.12, no saturation 
4463         {
4464             byte[16] res;
4465             byte16 sa = cast(byte16)a;
4466             byte16 sb = cast(byte16)b;
4467             foreach(i; 0..16)
4468                 res[i] = saturateSignedWordToSignedByte(sa.array[i] - sb.array[i]);
4469             return _mm_loadu_si128(cast(int4*)res.ptr);
4470         }
4471         else static if (LDC_with_SSE2)
4472         {
4473             return cast(__m128i) __builtin_ia32_psubsb128(cast(byte16) a, cast(byte16) b);
4474         }
4475         else
4476             static assert(false);
4477     }
4478     else static if (GDC_with_SSE2)
4479     {
4480         return cast(__m128i) __builtin_ia32_psubsb128(cast(ubyte16) a, cast(ubyte16) b);
4481     }
4482     else
4483     {
4484         byte[16] res;
4485         byte16 sa = cast(byte16)a;
4486         byte16 sb = cast(byte16)b;
4487         foreach(i; 0..16)
4488             res[i] = saturateSignedWordToSignedByte(sa.array[i] - sb.array[i]);
4489         return _mm_loadu_si128(cast(int4*)res.ptr);
4490     }
4491 }
4492 unittest
4493 {
4494     byte16 res = cast(byte16) _mm_subs_epi8(_mm_setr_epi8(-128, 127, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0),
4495                                             _mm_setr_epi8(  15, -14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0));
4496     static immutable byte[16] correctResult            = [-128, 127,  0,  0,  0,  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0];
4497     assert(res.array == correctResult);
4498 }
4499 
/// Subtract packed unsigned 16-bit integers in `b` from packed unsigned 16-bit integers in `a` using saturation.
4501 __m128i _mm_subs_epu16(__m128i a, __m128i b) pure @trusted
4502 {
4503     version(LDC)
4504     {
4505         static if (__VERSION__ >= 2085) // saturation x86 intrinsics disappeared in LLVM 8
4506         {
4507             // x86: Generates PSUBUSW since LDC 1.15 -O0
4508             // ARM: Generates uqsub.8h since LDC 1.21 -O0
4509             enum prefix = `declare <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> %a, <8 x i16> %b)`;
4510             enum ir = `
4511                 %r = call <8 x i16> @llvm.usub.sat.v8i16( <8 x i16> %0, <8 x i16> %1)
4512                 ret <8 x i16> %r`;
4513             return cast(__m128i) LDCInlineIREx!(prefix, ir, "", short8, short8, short8)(cast(short8)a, cast(short8)b);
4514         }
4515         else static if (LDC_with_ARM) // Raspberry ships with LDC 1.12, no saturation 
4516         {
4517             short[8] res;
4518             short8 sa = cast(short8)a;
4519             short8 sb = cast(short8)b;
4520             foreach(i; 0..8)
4521             {
4522                 int sum = cast(ushort)(sa.array[i]) - cast(ushort)(sb.array[i]);
4523                 res[i] = saturateSignedIntToUnsignedShort(sum);
4524             }
4525             return _mm_loadu_si128(cast(int4*)res.ptr);
4526         }
4527         else static if (LDC_with_SSE2)
4528         {
4529             return cast(__m128i) __builtin_ia32_psubusw128(a, b);
4530         }
4531         else 
4532             static assert(false);
4533     }
4534     else static if (GDC_with_SSE2)
4535     {
4536         return cast(__m128i) __builtin_ia32_psubusw128(cast(short8)a, cast(short8)b);
4537     }
4538     else
4539     {
4540         short[8] res;
4541         short8 sa = cast(short8)a;
4542         short8 sb = cast(short8)b;
4543         foreach(i; 0..8)
4544         {
4545             int sum = cast(ushort)(sa.array[i]) - cast(ushort)(sb.array[i]);
4546             res[i] = saturateSignedIntToUnsignedShort(sum);
4547         }
4548         return _mm_loadu_si128(cast(int4*)res.ptr);
4549     }
4550 }
4551 unittest
4552 {
4553     short8 R = cast(short8) _mm_subs_epu16(_mm_setr_epi16(cast(short)65534,  1, 5, 4, 3, 2, 1, 0),
4554                                            _mm_setr_epi16(cast(short)65535, 16, 4, 4, 3, 0, 1, 0));
4555     static immutable short[8] correct =                  [               0,  0, 1, 0, 0, 2, 0, 0];
4556     assert(R.array == correct);
4557 }
4558 
/// Subtract packed unsigned 8-bit integers in `b` from packed unsigned 8-bit integers in `a` using saturation.
4560 __m128i _mm_subs_epu8(__m128i a, __m128i b) pure @trusted
4561 {
4562     version(LDC)
4563     {
4564         static if (__VERSION__ >= 2085) // saturation x86 intrinsics disappeared in LLVM 8
4565         {
4566             // x86: Generates PSUBUSB since LDC 1.15 -O0
4567             // ARM: Generates uqsub.16b since LDC 1.21 -O0
4568             enum prefix = `declare <16 x i8> @llvm.usub.sat.v16i8(<16 x i8> %a, <16 x i8> %b)`;
4569             enum ir = `
4570                 %r = call <16 x i8> @llvm.usub.sat.v16i8( <16 x i8> %0, <16 x i8> %1)
4571                 ret <16 x i8> %r`;
4572             return cast(__m128i) LDCInlineIREx!(prefix, ir, "", byte16, byte16, byte16)(cast(byte16)a, cast(byte16)b);
4573         }
        else static if (LDC_with_ARM) // Raspberry ships with LDC 1.12, no saturation
        {
            ubyte[16] res;
            byte16 sa = cast(byte16)a;
            byte16 sb = cast(byte16)b;
            foreach(i; 0..16)
                res[i] = saturateSignedWordToUnsignedByte(cast(ubyte)(sa.array[i]) - cast(ubyte)(sb.array[i]));
            return _mm_loadu_si128(cast(int4*)res.ptr);
        }
4587         else static if (LDC_with_SSE2)
4588         {
4589             return __builtin_ia32_psubusb128(a, b);
4590         }
4591         else 
4592             static assert(false);
4593     }
4594     else static if (GDC_with_SSE2)
4595     {
4596         return cast(__m128i) __builtin_ia32_psubusb128(cast(ubyte16) a, cast(ubyte16) b);
4597     }
4598     else
4599     {
4600         ubyte[16] res;
4601         byte16 sa = cast(byte16)a;
4602         byte16 sb = cast(byte16)b;
4603         foreach(i; 0..16)
4604             res[i] = saturateSignedWordToUnsignedByte(cast(ubyte)(sa.array[i]) - cast(ubyte)(sb.array[i]));
4605         return _mm_loadu_si128(cast(int4*)res.ptr);
4606     }
4607 }
4608 unittest
4609 {
4610     byte16 res = cast(byte16) _mm_subs_epu8(_mm_setr_epi8(cast(byte)254, 127, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0),
4611                                             _mm_setr_epi8(cast(byte)255, 120, 14, 42, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0));
4612     static immutable byte[16] correctResult =            [            0,   7,  0,  0,  0,  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0];
4613     assert(res.array == correctResult);
4614 }
4615 
// Note: the only difference between the comi and ucomi comparisons is whether a quiet
//       NaN raises the floating-point invalid exception. Aliasing them is therefore
//       slightly incorrect, but wanting to distinguish qNaN from sNaN and treat them
//       differently on purpose seems extremely rare.
4620 alias _mm_ucomieq_sd = _mm_comieq_sd; ///
4621 alias _mm_ucomige_sd = _mm_comige_sd; ///
4622 alias _mm_ucomigt_sd = _mm_comigt_sd; ///
4623 alias _mm_ucomile_sd = _mm_comile_sd; ///
4624 alias _mm_ucomilt_sd = _mm_comilt_sd; ///
4625 alias _mm_ucomineq_sd = _mm_comineq_sd; ///
4626 
4627 /// Return vector of type `__m128d` with undefined elements.
4628 __m128d _mm_undefined_pd() pure @safe
4629 {
4630     pragma(inline, true);
4631     __m128d result = void;
4632     return result;
4633 }
4634 
4635 /// Return vector of type `__m128i` with undefined elements.
4636 __m128i _mm_undefined_si128() pure @safe
4637 {
4638     pragma(inline, true);
4639     __m128i result = void;
4640     return result;
4641 }
4642 
4643 /// Unpack and interleave 16-bit integers from the high half of `a` and `b`.
4644 __m128i _mm_unpackhi_epi16 (__m128i a, __m128i b) pure @safe
4645 {
4646     static if (GDC_with_SSE2)
4647     {
4648         return cast(__m128i) __builtin_ia32_punpckhwd128(cast(short8) a, cast(short8) b);
4649     }
4650     else static if (DMD_with_32bit_asm)
4651     {
4652         asm pure nothrow @nogc @trusted
4653         {
4654             movdqu XMM0, a;
4655             movdqu XMM1, b;
4656             punpckhwd XMM0, XMM1;
4657             movdqu a, XMM0;
4658         }
4659         return a;
4660     }
4661     else
4662     {
4663         return cast(__m128i) shufflevector!(short8, 4, 12, 5, 13, 6, 14, 7, 15)
4664                                            (cast(short8)a, cast(short8)b);
4665     }
4666 }
4667 unittest
4668 {
4669     __m128i A = _mm_setr_epi16(4,   5,  6,  7,  8,  9, 10, 11);
4670     __m128i B = _mm_setr_epi16(12, 13, 14, 15, 16, 17, 18, 19);
4671     short8 C = cast(short8)(_mm_unpackhi_epi16(A, B));
4672     short[8] correct = [8, 16, 9, 17, 10, 18, 11, 19];
4673     assert(C.array == correct);
4674 }
4675 
4676 /// Unpack and interleave 32-bit integers from the high half of `a` and `b`.
4677 __m128i _mm_unpackhi_epi32 (__m128i a, __m128i b) pure @trusted
4678 {
4679     static if (GDC_with_SSE2)
4680     {
4681         return __builtin_ia32_punpckhdq128(a, b);
4682     }
4683     else version(DigitalMars)
4684     {
4685         __m128i r;
4686         r.ptr[0] = a.array[2];
4687         r.ptr[1] = b.array[2];
4688         r.ptr[2] = a.array[3];
4689         r.ptr[3] = b.array[3];
4690         return r;
4691     }
4692     else
4693     {
4694         return shufflevector!(int4, 2, 6, 3, 7)(cast(int4)a, cast(int4)b);
4695     }
4696 }
4697 unittest
4698 {
4699     __m128i A = _mm_setr_epi32(1, 2, 3, 4);
4700     __m128i B = _mm_setr_epi32(5, 6, 7, 8);
4701     __m128i C = _mm_unpackhi_epi32(A, B);
4702     int[4] correct = [3, 7, 4, 8];
4703     assert(C.array == correct);
4704 }
4705 
4706 /// Unpack and interleave 64-bit integers from the high half of `a` and `b`.
4707 __m128i _mm_unpackhi_epi64 (__m128i a, __m128i b) pure @trusted
4708 {
4709     static if (GDC_with_SSE2)
4710     {
4711         return cast(__m128i) __builtin_ia32_punpckhqdq128(cast(long2) a, cast(long2) b);
4712     }
4713     else
4714     {
4715         __m128i r = cast(__m128i)b;
4716         r[0] = a[2];
4717         r[1] = a[3];
4718         return r; 
4719     }
4720 }
4721 unittest // Issue #36
4722 {
4723     __m128i A = _mm_setr_epi64(0x22222222_22222222, 0x33333333_33333333);
4724     __m128i B = _mm_setr_epi64(0x44444444_44444444, 0x55555555_55555555);
4725     long2 C = cast(long2)(_mm_unpackhi_epi64(A, B));
4726     long[2] correct = [0x33333333_33333333, 0x55555555_55555555];
4727     assert(C.array == correct);
4728 }
4729 
4730 /// Unpack and interleave 8-bit integers from the high half of `a` and `b`.
4731 __m128i _mm_unpackhi_epi8 (__m128i a, __m128i b) pure @safe
4732 {
4733     static if (GDC_with_SSE2)
4734     {
4735         return cast(__m128i) __builtin_ia32_punpckhbw128(cast(ubyte16)a, cast(ubyte16)b);
4736     }
4737     else static if (DMD_with_32bit_asm)
4738     {
4739         asm pure nothrow @nogc @trusted
4740         {
4741             movdqu XMM0, a;
4742             movdqu XMM1, b;
4743             punpckhbw XMM0, XMM1;
4744             movdqu a, XMM0;
4745         }
4746         return a;
4747     }
4748     else
4749     {
4750         return cast(__m128i)shufflevector!(byte16, 8,  24,  9, 25, 10, 26, 11, 27,
4751                                                    12, 28, 13, 29, 14, 30, 15, 31)
4752                                                    (cast(byte16)a, cast(byte16)b);
4753     }
4754 }
4755 unittest
4756 {
4757     __m128i A = _mm_setr_epi8( 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15);
4758     __m128i B = _mm_setr_epi8(16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31);
4759     byte16 C = cast(byte16) _mm_unpackhi_epi8(A, B);
4760     byte[16] correct = [8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31];
4761     assert(C.array == correct);
4762 }
4763 
4764 /// Unpack and interleave double-precision (64-bit) floating-point elements from the high half of `a` and `b`.
4765 __m128d _mm_unpackhi_pd (__m128d a, __m128d b) pure @safe
4766 {
4767     static if (GDC_with_SSE2)
4768     {
4769         return __builtin_ia32_unpckhpd(a, b);
4770     }
4771     else
4772     {
4773         return shufflevector!(__m128d, 1, 3)(a, b);
4774     }
4775 }
4776 unittest
4777 {
4778     __m128d A = _mm_setr_pd(4.0, 6.0);
4779     __m128d B = _mm_setr_pd(7.0, 9.0);
4780     __m128d C = _mm_unpackhi_pd(A, B);
4781     double[2] correct = [6.0, 9.0];
4782     assert(C.array == correct);
4783 }
4784 
4785 /// Unpack and interleave 16-bit integers from the low half of `a` and `b`.
4786 __m128i _mm_unpacklo_epi16 (__m128i a, __m128i b) pure @safe
4787 {
4788     static if (GDC_with_SSE2)
4789     {
4790         return cast(__m128i) __builtin_ia32_punpcklwd128(cast(short8) a, cast(short8) b);
4791     }
4792     else static if (DMD_with_32bit_asm)
4793     {
4794         asm pure nothrow @nogc @trusted
4795         {
4796             movdqu XMM0, a;
4797             movdqu XMM1, b;
4798             punpcklwd XMM0, XMM1;
4799             movdqu a, XMM0;
4800         }
4801         return a;
4802     }
4803     else
4804     {
4805         return cast(__m128i) shufflevector!(short8, 0, 8, 1, 9, 2, 10, 3, 11)
4806                                            (cast(short8)a, cast(short8)b);
4807     }
4808 }
4809 unittest
4810 {
4811     __m128i A = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
4812     __m128i B = _mm_setr_epi16(8, 9, 10, 11, 12, 13, 14, 15);
4813     short8 C = cast(short8) _mm_unpacklo_epi16(A, B);
4814     short[8] correct = [0, 8, 1, 9, 2, 10, 3, 11];
4815     assert(C.array == correct);
4816 }
4817 
4818 /// Unpack and interleave 32-bit integers from the low half of `a` and `b`.
4819 __m128i _mm_unpacklo_epi32 (__m128i a, __m128i b) pure @trusted
4820 {
4821     static if (GDC_with_SSE2)
4822     {
4823         return __builtin_ia32_punpckldq128(a, b);
4824     }
4825     else version(DigitalMars)
4826     {
4827         __m128i r;
4828         r.ptr[0] = a.array[0];
4829         r.ptr[1] = b.array[0];
4830         r.ptr[2] = a.array[1];
4831         r.ptr[3] = b.array[1];
4832         return r;
4833     }
4834     else
4835     {
4836         return shufflevector!(int4, 0, 4, 1, 5)(cast(int4)a, cast(int4)b);
4837     }
4838 }
4839 unittest
4840 {
4841     __m128i A = _mm_setr_epi32(1, 2, 3, 4);
4842     __m128i B = _mm_setr_epi32(5, 6, 7, 8);
4843     __m128i C = _mm_unpacklo_epi32(A, B);
4844     int[4] correct = [1, 5, 2, 6];
4845     assert(C.array == correct);
4846 }
4847 
4848 /// Unpack and interleave 64-bit integers from the low half of `a` and `b`.
4849 __m128i _mm_unpacklo_epi64 (__m128i a, __m128i b) pure @trusted
4850 {
4851     static if (GDC_with_SSE2)
4852     {
4853         return cast(__m128i) __builtin_ia32_punpcklqdq128(cast(long2) a, cast(long2) b);
4854     }
4855     else
4856     {
4857         long2 lA = cast(long2)a;
4858         long2 lB = cast(long2)b;
4859         long2 R;
4860         R.ptr[0] = lA.array[0];
4861         R.ptr[1] = lB.array[0];
4862         return cast(__m128i)R;
4863     }
4864 }
4865 unittest // Issue #36
4866 {
4867     __m128i A = _mm_setr_epi64(0x22222222_22222222, 0x33333333_33333333);
4868     __m128i B = _mm_setr_epi64(0x44444444_44444444, 0x55555555_55555555);
4869     long2 C = cast(long2)(_mm_unpacklo_epi64(A, B));
4870     long[2] correct = [0x22222222_22222222, 0x44444444_44444444];
4871     assert(C.array == correct);
4872 }
4873 
4874 /// Unpack and interleave 8-bit integers from the low half of `a` and `b`.
4875 __m128i _mm_unpacklo_epi8 (__m128i a, __m128i b) pure @safe
4876 {
4877     static if (GDC_with_SSE2)
4878     {
4879         return cast(__m128i) __builtin_ia32_punpcklbw128(cast(ubyte16) a, cast(ubyte16) b);
4880     }
4881     else static if (DMD_with_32bit_asm)
4882     {
4883         asm pure nothrow @nogc @trusted
4884         {
4885             movdqu XMM0, a;
4886             movdqu XMM1, b;
4887             punpcklbw XMM0, XMM1;
4888             movdqu a, XMM0;
4889         }
4890         return a;
4891     }
4892     else
4893     {
4894         return cast(__m128i) shufflevector!(byte16, 0, 16, 1, 17, 2, 18, 3, 19,
4895                                                     4, 20, 5, 21, 6, 22, 7, 23)
4896                                            (cast(byte16)a, cast(byte16)b);
4897     }
4898 }
4899 unittest
4900 {
4901     __m128i A = _mm_setr_epi8( 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15);
4902     __m128i B = _mm_setr_epi8(16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31);
4903     byte16 C = cast(byte16) _mm_unpacklo_epi8(A, B);
4904     byte[16] correct = [0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23];
4905     assert(C.array == correct);
4906 }
4907 
4908 /// Unpack and interleave double-precision (64-bit) floating-point elements from the low half of `a` and `b`.
4909 __m128d _mm_unpacklo_pd (__m128d a, __m128d b) pure @safe
4910 {
4911     static if (GDC_with_SSE2)
4912     {
4913         return __builtin_ia32_unpcklpd(a, b);
4914     }
4915     else
4916     {
4917         return shufflevector!(__m128d, 0, 2)(a, b);
4918     }
4919 }
4920 unittest
4921 {
4922     __m128d A = _mm_setr_pd(4.0, 6.0);
4923     __m128d B = _mm_setr_pd(7.0, 9.0);
4924     __m128d C = _mm_unpacklo_pd(A, B);
4925     double[2] correct = [4.0, 7.0];
4926     assert(C.array == correct);
4927 }
4928 
4929 /// Compute the bitwise XOR of packed double-precision (64-bit) floating-point elements in `a` and `b`.
4930 __m128d _mm_xor_pd (__m128d a, __m128d b) pure @safe
4931 {
4932     return cast(__m128d)(cast(__m128i)a ^ cast(__m128i)b);
4933 }
// TODO: force inlining with pragma(inline, true), like the other intrinsics in this module.
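// A minimal unittest sketch for _mm_xor_pd, with illustrative values:
// XOR-ing with a sign-bit-only mask flips the sign of both lanes.
unittest
{
    __m128d A = _mm_setr_pd(-4.0, 7.0);
    __m128d signBits = _mm_setr_pd(-0.0, -0.0); // only the sign bit set in each lane
    __m128d R = _mm_xor_pd(A, signBits);
    double[2] correct = [4.0, -7.0];
    assert(R.array == correct);
}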
4935 
4936 /// Compute the bitwise XOR of 128 bits (representing integer data) in `a` and `b`.
4937 __m128i _mm_xor_si128 (__m128i a, __m128i b) pure @safe
4938 {
4939     return a ^ b;
4940 }
// TODO: force inlining with pragma(inline, true), like the other intrinsics in this module.
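// A minimal unittest sketch for _mm_xor_si128, with illustrative values.
unittest
{
    __m128i A = _mm_setr_epi32( 5,  0, -1, 0x0F0F_0F0F);
    __m128i B = _mm_setr_epi32( 3, -1, -1, 0);
    __m128i R = _mm_xor_si128(A, B);
    int[4] correct = [6, -1, 0, 0x0F0F_0F0F];
    assert(R.array == correct);
}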
4942 
// End-to-end smoke test: Euclidean distance of two 4-float vectors,
// using a shift-and-add horizontal sum and _mm_sqrt_ss.
unittest
4944 {
4945     float distance(float[4] a, float[4] b) nothrow @nogc
4946     {
4947         __m128 va = _mm_loadu_ps(a.ptr);
4948         __m128 vb = _mm_loadu_ps(b.ptr);
4949         __m128 diffSquared = _mm_sub_ps(va, vb);
4950         diffSquared = _mm_mul_ps(diffSquared, diffSquared);
4951         __m128 sum = _mm_add_ps(diffSquared, _mm_srli_ps!8(diffSquared));
4952         sum = _mm_add_ps(sum, _mm_srli_ps!4(sum));
4953         return _mm_cvtss_f32(_mm_sqrt_ss(sum));
4954     }
4955     assert(distance([0, 2, 0, 0], [0, 0, 0, 0]) == 2);
4956 }