inteli.emmintrin source code

1 /**
2 * SSE2 intrinsics. 
3 * https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#techs=SSE2
4 *
5 * Copyright: Copyright Guillaume Piolat 2016-2020, Stefanos Baziotis 2019.
6 * License:   $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
7 */
8 module inteli.emmintrin;
9 
10 public import inteli.types;
11 public import inteli.xmmintrin; // SSE2 includes SSE1
12 import inteli.mmx;
13 import inteli.internals;
14 
15 nothrow @nogc:
16 
17 
18 // SSE2 instructions
19 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#techs=SSE2
20 
/// Lane-wise addition of the eight packed 16-bit integers held in `a` and `b`.
/// Returns: the eight 16-bit sums, reinterpreted as `__m128i`.
__m128i _mm_add_epi16 (__m128i a, __m128i b) pure @safe
{
    pragma(inline, true);
    short8 lhs = cast(short8) a;
    short8 rhs = cast(short8) b;
    return cast(__m128i)(lhs + rhs);
}
unittest
{
    // Adding a vector to itself doubles every lane.
    __m128i input = _mm_setr_epi16(4, 8, 13, -7, -1, 0, 9, 77);
    short8 doubled = cast(short8) _mm_add_epi16(input, input);
    short[8] expected = [8, 16, 26, -14, -2, 0, 18, 154];
    assert(doubled.array == expected);
}
34 
/// Lane-wise addition of the four packed 32-bit integers held in `a` and `b`.
/// Returns: the four 32-bit sums as `__m128i`.
__m128i _mm_add_epi32 (__m128i a, __m128i b) pure @safe
{
    pragma(inline, true);
    int4 lhs = cast(int4) a;
    int4 rhs = cast(int4) b;
    return cast(__m128i)(lhs + rhs);
}
unittest
{
    // Adding a vector to itself doubles every lane.
    __m128i input = _mm_setr_epi32( -7, -1, 0, 9);
    int4 doubled = _mm_add_epi32(input, input);
    int[4] expected = [ -14, -2, 0, 18 ];
    assert(doubled.array == expected);
}
48 
/// Lane-wise addition of the two packed 64-bit integers held in `a` and `b`.
/// Returns: the two 64-bit sums, reinterpreted as `__m128i`.
__m128i _mm_add_epi64 (__m128i a, __m128i b) pure @safe
{
    pragma(inline, true);
    long2 lhs = cast(long2) a;
    long2 rhs = cast(long2) b;
    return cast(__m128i)(lhs + rhs);
}
unittest
{
    // The second lane checks that the sum wraps around instead of saturating.
    __m128i input = _mm_setr_epi64(-1, 0x8000_0000_0000_0000);
    long2 doubled = cast(long2) _mm_add_epi64(input, input);
    long[2] expected = [ -2, 0 ];
    assert(doubled.array == expected);
}
62 
/// Lane-wise addition of the sixteen packed 8-bit integers held in `a` and `b`.
/// Returns: the sixteen 8-bit sums, reinterpreted as `__m128i`.
__m128i _mm_add_epi8 (__m128i a, __m128i b) pure @safe
{
    pragma(inline, true);
    byte16 lhs = cast(byte16) a;
    byte16 rhs = cast(byte16) b;
    return cast(__m128i)(lhs + rhs);
}
unittest
{
    // 77 + 77 and 78 + 78 do not fit in a byte: the expected values show wrap-around.
    __m128i input = _mm_setr_epi8(4, 8, 13, -7, -1, 0, 9, 77, 4, 8, 13, -7, -1, 0, 9, 78);
    byte16 doubled = cast(byte16) _mm_add_epi8(input, input);
    byte[16] expected = [8, 16, 26, -14, -2, 0, 18, -102, 8, 16, 26, -14, -2, 0, 18, -100];
    assert(doubled.array == expected);
}
76 
/// Adds the lower double-precision (64-bit) floating-point elements of `a` and `b`.
/// Returns: a vector whose lower element is that sum and whose upper element
/// is copied unchanged from `a`.
__m128d _mm_add_sd(__m128d a, __m128d b) pure @safe
{
    static if (DMD_with_DSIMD)
    {
        // DMD with D_SIMD: emit ADDSD directly.
        return cast(__m128d) __simd(XMM.ADDSD, a, b);
    }
    else static if (GDC_with_SSE2)
    {
        return __builtin_ia32_addsd(a, b);
    }
    else version(DigitalMars)
    {
        // The empty asm statement works around https://issues.dlang.org/show_bug.cgi?id=19599
        // It is reportedly unneeded since DMD >= 2.094.0 at least, but this was not investigated again.
        asm pure nothrow @nogc @trusted { nop;}
        a[0] = a[0] + b[0];
        return a;
    }
    else
    {
        // Portable path: only the lower element is modified, `a` keeps its upper element.
        a[0] = a[0] + b[0];
        return a;
    }
}
unittest
{
    // Only the lower element is doubled, the upper one stays -2.0.
    __m128d v = [1.5, -2.0];
    v = _mm_add_sd(v, v);
    assert(v.array == [3.0, -2.0]);
}
110 
/// Element-wise addition of the two packed double-precision (64-bit)
/// floating-point elements of `a` and `b`.
__m128d _mm_add_pd (__m128d a, __m128d b) pure @safe
{
    pragma(inline, true);
    __m128d sum = a + b;
    return sum;
}
unittest
{
    // Both elements are doubled.
    __m128d v = [1.5, -2.0];
    v = _mm_add_pd(v, v);
    assert(v.array == [3.0, -4.0]);
}
123 
/// Adds the 64-bit integers `a` and `b`.
__m64 _mm_add_si64 (__m64 a, __m64 b) pure @safe
{
    // PERF DMD
    pragma(inline, true);
    __m64 sum = a + b;
    return sum;
}
131 
/// Adds the packed 16-bit integers of `a` and `b` with signed saturation.
__m128i _mm_adds_epi16(__m128i a, __m128i b) pure @trusted
{
    static if (DMD_with_DSIMD)
    {
        return cast(__m128i) __simd(XMM.PADDSW, a, b);
    }
    else static if (GDC_with_SSE2)
    {
        return cast(__m128i) __builtin_ia32_paddsw128(cast(short8)a, cast(short8)b);
    }
    else version(LDC)
    {
        static if (__VERSION__ >= 2085) // saturation x86 intrinsics disappeared in LLVM 8
        {
            // Go through the generic LLVM saturating-add intrinsic instead.
            // x86: Generates PADDSW since LDC 1.15 -O0
            // ARM: Generates sqadd.8h since LDC 1.21 -O1, really bad in <= 1.20
            enum declaration = `declare <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> %a, <8 x i16> %b)`;
            enum code = `
                %r = call <8 x i16> @llvm.sadd.sat.v8i16( <8 x i16> %0, <8 x i16> %1)
                ret <8 x i16> %r`;
            short8 lhs = cast(short8) a;
            short8 rhs = cast(short8) b;
            return cast(__m128i) LDCInlineIREx!(declaration, code, "", short8, short8, short8)(lhs, rhs);
        }
        else static if (LDC_with_ARM) // Raspberry ships with LDC 1.12, no saturation
        {
            // Scalar emulation: widen each lane pair to int, add, then clamp.
            short8 lhs = cast(short8) a;
            short8 rhs = cast(short8) b;
            short[8] sums; // PERF =void;
            foreach(lane, ref dst; sums)
                dst = saturateSignedIntToSignedShort(lhs.array[lane] + rhs.array[lane]);
            return _mm_loadu_si128(cast(int4*) sums.ptr);
        }
        else
            return cast(__m128i) __builtin_ia32_paddsw128(cast(short8)a, cast(short8)b);
    }
    else
    {
        // Scalar emulation: widen each lane pair to int, add, then clamp.
        short8 lhs = cast(short8) a;
        short8 rhs = cast(short8) b;
        short[8] sums; // PERF =void;
        foreach(lane, ref dst; sums)
            dst = saturateSignedIntToSignedShort(lhs.array[lane] + rhs.array[lane]);
        return _mm_loadu_si128(cast(int4*) sums.ptr);
    }
}
unittest
{
    // Note: _mm_set_epi16 takes its arguments from the highest lane to the lowest.
    __m128i operand = _mm_set_epi16(7, 6, 5, 4, 3, 2, 1, 0);
    short8 sums = cast(short8) _mm_adds_epi16(operand,
                                              _mm_set_epi16(7, 6, 5, 4, 3, 2, 1, 0));
    static immutable short[8] expected = [0, 2, 4, 6, 8, 10, 12, 14];
    assert(sums.array == expected);
}
184 
/// Add packed 8-bit signed integers in `a` and `b` using signed saturation.
/// Sums above 127 are clamped to 127 and sums below -128 are clamped to -128.
__m128i _mm_adds_epi8(__m128i a, __m128i b) pure @trusted
{
    static if (DMD_with_DSIMD)
    {
        return cast(__m128i) __simd(XMM.PADDSB, a, b);
    }
    else static if (GDC_with_SSE2)
    {
        return cast(__m128i) __builtin_ia32_paddsb128(cast(ubyte16)a, cast(ubyte16)b);
    }
    else version(LDC)
    {
        static if (__VERSION__ >= 2085) // saturation x86 intrinsics disappeared in LLVM 8
        {
            // x86: Generates PADDSB since LDC 1.15 -O0
            // ARM: Generates sqadd.16b since LDC 1.21 -O1, really bad in <= 1.20
            enum prefix = `declare <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8> %a, <16 x i8> %b)`;
            enum ir = `
                %r = call <16 x i8> @llvm.sadd.sat.v16i8( <16 x i8> %0, <16 x i8> %1)
                ret <16 x i8> %r`;
            return cast(__m128i) LDCInlineIREx!(prefix, ir, "", byte16, byte16, byte16)(cast(byte16)a, cast(byte16)b);
        }
        else static if (LDC_with_ARM) // Raspberry ships with LDC 1.12, no saturation 
        {
            // Scalar emulation. Lanes are read through `.array`, like the other
            // saturating adds of this module, instead of indexing the vector directly.
            byte[16] res; // PERF =void;
            byte16 sa = cast(byte16)a;
            byte16 sb = cast(byte16)b;
            foreach(i; 0..16)
                res[i] = saturateSignedWordToSignedByte(sa.array[i] + sb.array[i]);
            return _mm_loadu_si128(cast(int4*)res.ptr);
        }
        else
            return cast(__m128i) __builtin_ia32_paddsb128(cast(byte16)a, cast(byte16)b);
    }
    else
    {
        // Scalar emulation. Lanes are read through `.array`, like the other
        // saturating adds of this module, instead of indexing the vector directly.
        byte[16] res; // PERF =void;
        byte16 sa = cast(byte16)a;
        byte16 sb = cast(byte16)b;
        foreach(i; 0..16)
            res[i] = saturateSignedWordToSignedByte(sa.array[i] + sb.array[i]);
        return _mm_loadu_si128(cast(int4*)res.ptr);
    }
}
unittest
{
    byte16 res = cast(byte16) _mm_adds_epi8(_mm_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0),
                                            _mm_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0));
    static immutable byte[16] correctResult = [0, 2, 4, 6, 8, 10, 12, 14,
                                               16, 18, 20, 22, 24, 26, 28, 30];
    assert(res.array == correctResult);

    // Saturation at both ends of the signed 8-bit range.
    byte16 clampedHigh = cast(byte16) _mm_adds_epi8(_mm_set1_epi8(127), _mm_set1_epi8(1));
    byte16 clampedLow  = cast(byte16) _mm_adds_epi8(_mm_set1_epi8(-128), _mm_set1_epi8(-1));
    foreach(i; 0..16)
    {
        assert(clampedHigh.array[i] == 127);
        assert(clampedLow.array[i] == -128);
    }
}
238 
/// Adds the packed unsigned 8-bit integers of `a` and `b` with unsigned saturation.
__m128i _mm_adds_epu8(__m128i a, __m128i b) pure @trusted
{
    static if (DMD_with_DSIMD)
    {
        return cast(__m128i) __simd(XMM.PADDUSB, a, b);
    }
    else static if (GDC_with_SSE2)
    {
        return cast(__m128i) __builtin_ia32_paddusb128(cast(ubyte16)a, cast(ubyte16)b);
    }
    else version(LDC)
    {
        static if (__VERSION__ >= 2085) // saturation x86 intrinsics disappeared in LLVM 8
        {
            // Go through the generic LLVM unsigned saturating-add intrinsic instead.
            // x86: Generates PADDUSB since LDC 1.15 -O0
            // ARM: Generates uqadd.16b since LDC 1.21 -O1
            enum declaration = `declare <16 x i8> @llvm.uadd.sat.v16i8(<16 x i8> %a, <16 x i8> %b)`;
            enum code = `
                %r = call <16 x i8> @llvm.uadd.sat.v16i8( <16 x i8> %0, <16 x i8> %1)
                ret <16 x i8> %r`;
            byte16 lhs = cast(byte16) a;
            byte16 rhs = cast(byte16) b;
            return cast(__m128i) LDCInlineIREx!(declaration, code, "", byte16, byte16, byte16)(lhs, rhs);
        }
        else static if (LDC_with_ARM) // Raspberry ships with LDC 1.12, no saturation
        {
            // Scalar emulation: reinterpret each lane as unsigned, add, then clamp.
            byte16 lhs = cast(byte16) a;
            byte16 rhs = cast(byte16) b;
            ubyte[16] sums; // PERF =void;
            foreach(lane, ref dst; sums)
                dst = saturateSignedWordToUnsignedByte(cast(ubyte)(lhs.array[lane]) + cast(ubyte)(rhs.array[lane]));
            return _mm_loadu_si128(cast(int4*) sums.ptr);
        }
        else
            return __builtin_ia32_paddusb128(a, b);
    }
    else
    {
        // Scalar emulation: reinterpret each lane as unsigned, add, then clamp.
        byte16 lhs = cast(byte16) a;
        byte16 rhs = cast(byte16) b;
        ubyte[16] sums; // PERF =void;
        foreach(lane, ref dst; sums)
            dst = saturateSignedWordToUnsignedByte(cast(ubyte)(lhs.array[lane]) + cast(ubyte)(rhs.array[lane]));
        return _mm_loadu_si128(cast(int4*) sums.ptr);
    }
}
unittest
{
    // The 255 + 1 lanes must clamp to 255 instead of wrapping to 0.
    __m128i withMax = _mm_set_epi8(7, 6, 5, 4, 3, 2, cast(byte)255, 0, 7, 6, 5, 4, 3, 2, cast(byte)255, 0);
    __m128i ramp    = _mm_set_epi8(7, 6, 5, 4, 3, 2, 1, 0, 7, 6, 5, 4, 3, 2, 1, 0);
    byte16 sums = cast(byte16) _mm_adds_epu8(withMax, ramp);
    static immutable byte[16] expected = [0, cast(byte)255, 4, 6, 8, 10, 12, 14, 
                                          0, cast(byte)255, 4, 6, 8, 10, 12, 14];
    assert(sums.array == expected);
}
293 
/// Adds the packed unsigned 16-bit integers of `a` and `b` with unsigned saturation.
__m128i _mm_adds_epu16(__m128i a, __m128i b) pure @trusted
{
    static if (DMD_with_DSIMD)
    {
        // Note: DMD generates a reverted paddusw vs LDC and GDC, but that doesn't change the result anyway
        return cast(__m128i) __simd(XMM.PADDUSW, a, b);
    }
    else static if (GDC_with_SSE2)
    {
        return cast(__m128i) __builtin_ia32_paddusw128(cast(short8)a, cast(short8)b);
    }
    else version(LDC)
    {
        static if (__VERSION__ >= 2085) // saturation x86 intrinsics disappeared in LLVM 8
        {
            // Go through the generic LLVM unsigned saturating-add intrinsic instead.
            // x86: Generates PADDUSW since LDC 1.15 -O0
            // ARM: Generates uqadd.8h since LDC 1.21 -O1
            enum declaration = `declare <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> %a, <8 x i16> %b)`;
            enum code = `
                %r = call <8 x i16> @llvm.uadd.sat.v8i16( <8 x i16> %0, <8 x i16> %1)
                ret <8 x i16> %r`;
            short8 lhs = cast(short8) a;
            short8 rhs = cast(short8) b;
            return cast(__m128i) LDCInlineIREx!(declaration, code, "", short8, short8, short8)(lhs, rhs);
        }
        else static if (LDC_with_ARM) // Raspberry ships with LDC 1.12, no saturation
        {
            // Scalar emulation: reinterpret each lane as unsigned, add, then clamp.
            short8 lhs = cast(short8) a;
            short8 rhs = cast(short8) b;
            ushort[8] sums; // PERF =void;
            foreach(lane, ref dst; sums)
                dst = saturateSignedIntToUnsignedShort(cast(ushort)(lhs.array[lane]) + cast(ushort)(rhs.array[lane]));
            return _mm_loadu_si128(cast(int4*) sums.ptr);
        }
        else
            return __builtin_ia32_paddusw128(a, b);
    }
    else
    {
        // Scalar emulation: reinterpret each lane as unsigned, add, then clamp.
        short8 lhs = cast(short8) a;
        short8 rhs = cast(short8) b;
        ushort[8] sums; // PERF =void;
        foreach(lane, ref dst; sums)
            dst = saturateSignedIntToUnsignedShort(cast(ushort)(lhs.array[lane]) + cast(ushort)(rhs.array[lane]));
        return _mm_loadu_si128(cast(int4*) sums.ptr);
    }
}
unittest
{
    // The 65535 + 1 lanes must clamp to 65535 instead of wrapping to 0.
    __m128i withMax = _mm_set_epi16(3, 2, cast(short)65535, 0, 3, 2, cast(short)65535, 0);
    __m128i ramp    = _mm_set_epi16(3, 2, 1, 0, 3, 2, 1, 0);
    short8 sums = cast(short8) _mm_adds_epu16(withMax, ramp);
    static immutable short[8] expected = [0, cast(short)65535, 4, 6, 0, cast(short)65535, 4, 6];
    assert(sums.array == expected);
}
347 
/// Bitwise AND of the packed double-precision (64-bit) floating-point
/// elements of `a` and `b`, performed on their raw bit patterns.
__m128d _mm_and_pd (__m128d a, __m128d b) pure @safe
{
    pragma(inline, true);
    long2 bitsA = cast(long2) a;
    long2 bitsB = cast(long2) b;
    return cast(__m128d)(bitsA & bitsB);
}
unittest
{
    double x = 4.32;
    double y = -78.99;
    // Reference result computed on the raw 64-bit patterns of the two doubles.
    long bitsX = *cast(long*)(&x);
    long bitsY = *cast(long*)(&y);
    long expected = bitsX & bitsY;
    __m128d first  = _mm_set_pd(x, y);
    __m128d second = _mm_set_pd(y, x);
    long2 masked = cast(long2)( _mm_and_pd(first, second) );
    assert(masked.array[0] == expected);
    assert(masked.array[1] == expected);
}
366 
/// Bitwise AND of the 128 bits (representing integer data) of `a` and `b`.
__m128i _mm_and_si128 (__m128i a, __m128i b) pure @safe
{
    pragma(inline, true);
    __m128i masked = a & b;
    return masked;
}
unittest
{
    // 7 & 14 == 6 in every lane.
    __m128i sevens    = _mm_set1_epi32(7);
    __m128i fourteens = _mm_set1_epi32(14);
    __m128i masked = _mm_and_si128(sevens, fourteens);
    int[4] expected = [6, 6, 6, 6];
    assert(masked.array == expected);
}
381 
/// Computes `(~a) & b` on the raw bits of the packed double-precision
/// (64-bit) floating-point elements: `a` is inverted first, then ANDed with `b`.
__m128d _mm_andnot_pd (__m128d a, __m128d b) pure @safe
{
    static if (DMD_with_DSIMD)
    {
        return cast(__m128d) __simd(XMM.ANDNPD, a, b);
    }
    else
    {
        long2 invertedA = ~(cast(long2) a);
        long2 bitsB = cast(long2) b;
        return cast(__m128d)(invertedA & bitsB);
    }
}
unittest
{
    double x = 4.32;
    double y = -78.99;
    long bitsX = *cast(long*)(&x);
    long bitsY = *cast(long*)(&y);
    // Lane 0 holds (x, y) and lane 1 holds (y, x), so the inverted operand differs per lane.
    long expectedLow  = (~bitsX) & bitsY;
    long expectedHigh = bitsX & (~bitsY);
    __m128d first  = _mm_setr_pd(x, y);
    __m128d second = _mm_setr_pd(y, x);
    long2 result = cast(long2)( _mm_andnot_pd(first, second) );
    assert(result.array[0] == expectedLow);
    assert(result.array[1] == expectedHigh);
}
407 
/// Computes `(~a) & b` over 128 bits of integer data:
/// `a` is inverted first, then ANDed with `b`.
__m128i _mm_andnot_si128 (__m128i a, __m128i b) pure @safe
{
    static if (DMD_with_DSIMD)
    {
        return cast(__m128i) __simd(XMM.PANDN, a, b);
    }
    else
    {
        __m128i invertedA = ~a;
        return invertedA & b;
    }
}
unittest
{
    __m128i toInvert = _mm_setr_epi32(7, -2, 9, 54654);
    __m128i mask     = _mm_setr_epi32(14, 78, 111, -256);
    __m128i result = _mm_andnot_si128(toInvert, mask);
    int[4] expected = [8, 0, 102, -54784];
    assert(result.array == expected);
}
429 
/// Averages the packed unsigned 16-bit integers of `a` and `b`.
/// Each lane is computed as `(a + b + 1) >> 1`.
__m128i _mm_avg_epu16 (__m128i a, __m128i b) pure @trusted
{
    static if (DMD_with_DSIMD)
    {
        return cast(__m128i) __simd(XMM.PAVGW, a, b);
    }
    else static if (GDC_with_SSE2)
    {
        return cast(__m128i) __builtin_ia32_pavgw128(cast(short8)a, cast(short8)b);
    }
    else static if (LDC_with_ARM64)
    {
        return cast(__m128i) vrhadd_u16(cast(short8)a, cast(short8)b);
    }
    else version(LDC)
    {
        // Widen to 32-bit, add, add 1, shift right by one, narrow back.
        // Generates pavgw even in LDC 1.0, even in -O0
        // But not in ARM
        enum code = `
            %ia = zext <8 x i16> %0 to <8 x i32>
            %ib = zext <8 x i16> %1 to <8 x i32>
            %isum = add <8 x i32> %ia, %ib
            %isum1 = add <8 x i32> %isum, < i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
            %isums = lshr <8 x i32> %isum1, < i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
            %r = trunc <8 x i32> %isums to <8 x i16>
            ret <8 x i16> %r`;
        short8 lhs = cast(short8) a;
        short8 rhs = cast(short8) b;
        return cast(__m128i) LDCInlineIR!(code, short8, short8, short8)(lhs, rhs);
    }
    else
    {
        // Scalar emulation, one lane at a time.
        short8 lhs = cast(short8) a;
        short8 rhs = cast(short8) b;
        short8 averaged = void; // every lane is written below
        foreach(lane; 0..8)
            averaged.ptr[lane] = cast(ushort)( (cast(ushort)(lhs.array[lane]) + cast(ushort)(rhs.array[lane]) + 1) >> 1 );
        return cast(int4) averaged;
    }
}
unittest
{
    // (31 + 64 + 1) >> 1 == 48
    __m128i low  = _mm_set1_epi16(31);
    __m128i high = _mm_set1_epi16(64);
    short8 averaged = cast(short8)(_mm_avg_epu16(low, high));
    foreach(lane; 0..8)
        assert(averaged.array[lane] == 48);
}
479 
/// Average packed unsigned 8-bit integers in `a` and `b`.
/// Each lane is computed as `(a + b + 1) >> 1` on the unsigned values.
__m128i _mm_avg_epu8 (__m128i a, __m128i b) pure @trusted
{
    static if (DMD_with_DSIMD)
    {
        return cast(__m128i) __simd(XMM.PAVGB, a, b);
    }
    else static if (GDC_with_SSE2)
    {
        return cast(__m128i) __builtin_ia32_pavgb128(cast(ubyte16)a, cast(ubyte16)b);
    }
    else static if (LDC_with_ARM64)
    {
        return cast(__m128i) vrhadd_u8(cast(byte16)a, cast(byte16)b);
    }
    else version(LDC)
    {
        // Generates pavgb even in LDC 1.0, even in -O0
        // But not in ARM
        enum ir = `
            %ia = zext <16 x i8> %0 to <16 x i16>
            %ib = zext <16 x i8> %1 to <16 x i16>
            %isum = add <16 x i16> %ia, %ib
            %isum1 = add <16 x i16> %isum, < i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
            %isums = lshr <16 x i16> %isum1, < i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
            %r = trunc <16 x i16> %isums to <16 x i8>
            ret <16 x i8> %r`;
        return cast(__m128i) LDCInlineIR!(ir, byte16, byte16, byte16)(cast(byte16)a, cast(byte16)b);
    }
    else
    {
        // Scalar emulation. Lanes are accessed through `.array` / `.ptr`, as in
        // `_mm_avg_epu16`, instead of indexing the vector directly.
        byte16 sa = cast(byte16)a;
        byte16 sb = cast(byte16)b;
        byte16 sr = void; // every lane is written below
        foreach(i; 0..16)
        {
            sr.ptr[i] = cast(ubyte)( (cast(ubyte)(sa.array[i]) + cast(ubyte)(sb.array[i]) + 1) >> 1 );
        }
        return cast(int4)sr;
    }
}
unittest
{
    // (31 + 64 + 1) >> 1 == 48
    __m128i A = _mm_set1_epi8(31);
    __m128i B = _mm_set1_epi8(64);
    byte16 avg = cast(byte16)(_mm_avg_epu8(A, B));
    foreach(i; 0..16)
        assert(avg.array[i] == 48);

    // Lanes must be treated as unsigned: (255 + 253 + 1) >> 1 == 254
    __m128i C = _mm_set1_epi8(cast(byte)255);
    __m128i D = _mm_set1_epi8(cast(byte)253);
    byte16 avgHigh = cast(byte16)(_mm_avg_epu8(C, D));
    foreach(i; 0..16)
        assert(avgHigh.array[i] == cast(byte)254);
}
529 
/// Shift `a` left by `bytes` bytes while shifting in zeros.
/// Alternative name for `_mm_slli_si128`.
alias _mm_bslli_si128 = _mm_slli_si128;
unittest
{
    // A 5-byte left shift moves byte 0 to position 5 and zero-fills the first five bytes.
    __m128i input = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
    byte[16] expected =          [0, 0, 0, 0, 0, 0, 1, 2, 3, 4,  5,  6,  7,  8,  9, 10];
    __m128i shifted = _mm_bslli_si128!5(input);
    assert( (cast(byte16)shifted).array == expected);
}
539 
/// Shift `v` right by `bytes` bytes while shifting in zeros.
/// Alternative name for `_mm_srli_si128`.
alias _mm_bsrli_si128 = _mm_srli_si128;
unittest
{
    // A 5-byte right shift moves byte 5 to position 0 and zero-fills the last five bytes.
    __m128i input = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
    byte[16] expected =          [5, 6, 7, 8, 9,10,11,12,13,14, 15,  0,  0,  0,  0,  0];
    __m128i shifted = _mm_bsrli_si128!5(input);
    assert( (cast(byte16)shifted).array == expected);
}
549 
/// Reinterprets a `__m128d` vector as a `__m128` vector.
/// Note: a plain `cast(__m128)(a)` does the same thing.
__m128 _mm_castpd_ps (__m128d a) pure @safe
{
    __m128 reinterpreted = cast(__m128) a;
    return reinterpreted;
}
556 
/// Reinterprets a `__m128d` vector as a `__m128i` vector.
/// Note: a plain `cast(__m128i)(a)` does the same thing.
__m128i _mm_castpd_si128 (__m128d a) pure @safe
{
    __m128i reinterpreted = cast(__m128i) a;
    return reinterpreted;
}
563 
/// Reinterprets a `__m128` vector as a `__m128d` vector.
/// Note: a plain `cast(__m128d)(a)` does the same thing.
__m128d _mm_castps_pd (__m128 a) pure @safe
{
    __m128d reinterpreted = cast(__m128d) a;
    return reinterpreted;
}
570 
/// Reinterprets a `__m128` vector as a `__m128i` vector.
/// Note: a plain `cast(__m128i)(a)` does the same thing.
__m128i _mm_castps_si128 (__m128 a) pure @safe
{
    __m128i reinterpreted = cast(__m128i) a;
    return reinterpreted;
}
577 
/// Reinterprets a `__m128i` vector as a `__m128d` vector.
/// Note: a plain `cast(__m128d)(a)` does the same thing.
__m128d _mm_castsi128_pd (__m128i a) pure @safe
{
    __m128d reinterpreted = cast(__m128d) a;
    return reinterpreted;
}
584 
/// Reinterprets a `__m128i` vector as a `__m128` vector.
/// Note: a plain `cast(__m128)(a)` does the same thing.
__m128 _mm_castsi128_ps (__m128i a) pure @safe
{
    __m128 reinterpreted = cast(__m128) a;
    return reinterpreted;
}
591 
/// Invalidates the cache line that contains `p` and flushes it
/// from all levels of the cache hierarchy.
void _mm_clflush (const(void)* p) @trusted
{
    static if (GDC_with_SSE2)
    {
        __builtin_ia32_clflush(p);
    }
    else static if (LDC_with_SSE2)
    {
        // Same builtin name as GDC, but here the const qualifier is cast away first.
        void* line = cast(void*) p;
        __builtin_ia32_clflush(line);
    }
    else version(D_InlineAsm_X86)
    {
        // 32-bit inline assembly: load the address, then flush its cache line.
        asm pure nothrow @nogc @safe
        {
            mov EAX, p;
            clflush [EAX];
        }
    }
    else version(D_InlineAsm_X86_64)
    {
        // 64-bit inline assembly: load the address, then flush its cache line.
        asm pure nothrow @nogc @safe
        {
            mov RAX, p;
            clflush [RAX];
        }
    }
    else 
    {
        // Deliberately a no-op: not invalidating the cache line
        // does not affect correctness.
    }
}
unittest
{
    // Smoke test only: the call must not crash.
    ubyte[64] buffer;
    _mm_clflush(buffer.ptr);
}
631 
/// Compares the packed 16-bit integers of `a` and `b` for equality.
/// Equal lanes become -1 (all bits set), other lanes become 0.
__m128i _mm_cmpeq_epi16 (__m128i a, __m128i b) pure @safe
{
    short8 lhs = cast(short8) a;
    short8 rhs = cast(short8) b;
    static if (GDC_with_SSE2)
    {
        return cast(__m128i) __builtin_ia32_pcmpeqw128(lhs, rhs);
    }
    else
    {
        return cast(__m128i) equalMask!short8(lhs, rhs);
    }
}
unittest
{
    short8   left     = [-3, -2, -1,  0,  0,  1,  2,  3];
    short8   right    = [ 4,  3,  2,  1,  0, -1, -2, -3];
    short[8] expected = [ 0,  0,  0,  0, -1,  0,  0,  0];
    short8   mask = cast(short8)(_mm_cmpeq_epi16(cast(__m128i)left, cast(__m128i)right));
    assert(mask.array == expected);
}
652 
/// Compares the packed 32-bit integers of `a` and `b` for equality.
/// Equal lanes become -1 (all bits set), other lanes become 0.
__m128i _mm_cmpeq_epi32 (__m128i a, __m128i b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        __m128i mask = __builtin_ia32_pcmpeqd128(a, b);
        return mask;
    }
    else
    {
        __m128i mask = equalMask!__m128i(a, b);
        return mask;
    }
}
unittest
{
    int4   left     = [-3, -2, -1,  0];
    int4   right    = [ 4, -2,  2,  0];
    int[4] expected = [ 0, -1,  0, -1];
    int4   mask = cast(int4)(_mm_cmpeq_epi32(left, right));
    assert(mask.array == expected);
}
673 
/// Compares the packed 8-bit integers of `a` and `b` for equality.
/// Equal lanes become -1 (all bits set), other lanes become 0.
__m128i _mm_cmpeq_epi8 (__m128i a, __m128i b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        // The GDC builtin is called with `ubyte16` operands.
        ubyte16 lhs = cast(ubyte16) a;
        ubyte16 rhs = cast(ubyte16) b;
        return cast(__m128i) __builtin_ia32_pcmpeqb128(lhs, rhs);
    }
    else
    {
        byte16 lhs = cast(byte16) a;
        byte16 rhs = cast(byte16) b;
        return cast(__m128i) equalMask!byte16(lhs, rhs);
    }
}
unittest
{
    __m128i left  = _mm_setr_epi8(1, 2, 3, 1, 2, 1, 1, 2, 3, 2, 1, 0, 0, 1, 2, 1);
    __m128i right = _mm_setr_epi8(2, 2, 1, 2, 3, 1, 2, 3, 2, 1, 0, 0, 1, 2, 1, 1);
    byte16 mask = cast(byte16) _mm_cmpeq_epi8(left, right);
    byte[16] expected =          [0,-1, 0, 0, 0,-1, 0, 0, 0, 0, 0,-1, 0, 0, 0, -1];
    assert(mask.array == expected);
}
694 
/// Compares the packed double-precision (64-bit) floating-point elements
/// of `a` and `b` for equality (`FPComparison.oeq` predicate on the generic path).
__m128d _mm_cmpeq_pd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmpeqpd(a, b);
    }
    else
    {
        auto mask = cmppd!(FPComparison.oeq)(a, b);
        return cast(__m128d) mask;
    }
}
708 
/// Compares the lower double-precision (64-bit) floating-point elements
/// of `a` and `b` for equality. The result goes in the lower element,
/// and the upper element is copied from `a`.
__m128d _mm_cmpeq_sd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmpeqsd(a, b);
    }
    else
    {
        auto mask = cmpsd!(FPComparison.oeq)(a, b);
        return cast(__m128d) mask;
    }
}
723 
/// Compares the packed 16-bit integers of `a` and `b` for greater-than-or-equal.
/// Lanes where `a >= b` become -1 (all bits set), other lanes become 0.
/// #BONUS
__m128i _mm_cmpge_epi16 (__m128i a, __m128i b) pure @safe
{
    version (LDC)
    {
        // LDC ARM64: generates cmge since -O1
        short8 lhs = cast(short8) a;
        short8 rhs = cast(short8) b;
        return cast(__m128i) greaterOrEqualMask!short8(lhs, rhs);
    }
    else
    {
        // "equal" and "greater" can never both be set in a lane,
        // so XOR-ing the two masks combines them.
        __m128i equal   = _mm_cmpeq_epi16(a, b);
        __m128i greater = _mm_cmpgt_epi16(a, b);
        return _mm_xor_si128(equal, greater);
    }
}
unittest
{
    short8   left     = [-3, -2, -32768,  0,  0,  1,  2,  3];
    short8   right    = [ 4,  3,  32767,  1,  0, -1, -2, -3];
    short[8] expected = [ 0,  0,      0,  0,  -1, -1, -1, -1];
    short8   mask = cast(short8)(_mm_cmpge_epi16(cast(__m128i)left, cast(__m128i)right));
    assert(mask.array == expected);
}
746 
/// Compares the packed double-precision (64-bit) floating-point elements
/// of `a` and `b` for greater-than-or-equal
/// (`FPComparison.oge` predicate on the generic path).
__m128d _mm_cmpge_pd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmpgepd(a, b);
    }
    else
    {
        auto mask = cmppd!(FPComparison.oge)(a, b);
        return cast(__m128d) mask;
    }
}
760 
/// Compare the lower double-precision (64-bit) floating-point elements 
/// in `a` and `b` for greater-than-or-equal, store the result in the 
/// lower element, and copy the upper element from `a`.
__m128d _mm_cmpge_sd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        // Note: There is no __builtin_ia32_cmpgesd builtin.
        // `a >= b` is the same ordered predicate as `b <= a`, so compare with swapped
        // operands. The scalar compare keeps the upper element of its first operand
        // (`b` here), hence the final movsd, which puts only the low mask element
        // into `a`. This is the sequence GCC's own emmintrin.h uses.
        // The previous `cmpnltsd(b, a)` computed `!(b < a)`, which is true when
        // a < b or when an operand is NaN, and returned the upper element of `b`.
        return __builtin_ia32_movsd(a, __builtin_ia32_cmplesd(b, a));
    }
    else
    {
        return cast(__m128d) cmpsd!(FPComparison.oge)(a, b);
    }
}
776 
/// Compares the packed 16-bit integers of `a` and `b` for greater-than.
/// Lanes where `a > b` become -1 (all bits set), other lanes become 0.
__m128i _mm_cmpgt_epi16 (__m128i a, __m128i b) pure @safe
{
    short8 lhs = cast(short8) a;
    short8 rhs = cast(short8) b;
    static if (GDC_with_SSE2)
    {
        return cast(__m128i) __builtin_ia32_pcmpgtw128(lhs, rhs);
    }
    else
    {
        return cast(__m128i) greaterMask!short8(lhs, rhs);
    }
}
unittest
{
    short8   left     = [-3, -2, -1,  0,  0,  1,  2,  3];
    short8   right    = [ 4,  3,  2,  1,  0, -1, -2, -3];
    short[8] expected = [ 0,  0,  0,  0,  0, -1, -1, -1];
    short8   mask = cast(short8)(_mm_cmpgt_epi16(cast(__m128i)left, cast(__m128i)right));
    assert(mask.array == expected);
}
797 
/// Compares the packed 32-bit integers of `a` and `b` for greater-than.
/// Lanes where `a > b` become -1 (all bits set), other lanes become 0.
__m128i _mm_cmpgt_epi32 (__m128i a, __m128i b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        __m128i mask = __builtin_ia32_pcmpgtd128(a, b);
        return mask;
    }
    else
    {
        auto mask = greaterMask!int4(a, b);
        return cast(__m128i)(mask);
    }
}
unittest
{
    int4   left     = [-3,  2, -1,  0];
    int4   right    = [ 4, -2,  2,  0];
    int[4] expected = [ 0, -1,  0,  0];
    int4   mask = cast(int4)(_mm_cmpgt_epi32(left, right));
    assert(mask.array == expected);
}
818 
/// Compare packed 8-bit integers in `a` and `b` for greater-than.
/// Lanes are compared as signed bytes; lanes where `a > b` become -1, others 0.
__m128i _mm_cmpgt_epi8 (__m128i a, __m128i b) pure @safe
{
    // Workaround of a GCC bug: the builtin is deliberately NOT used for GDC here,
    // because __builtin_ia32_pcmpgtb128 generates a weird (and wrong) sequence.
    // GCC's emmintrin.h uses comparison operators we don't have instead.
    // PERF: this is a quite severe GDC performance problem.
    // Could be worked around with inline assembly, or another algorithm.
    //
    // Disabled GDC path, kept for reference:
    //     static if (GDC_with_SSE2)
    //         return cast(__m128i) __builtin_ia32_pcmpgtb128(cast(ubyte16)a, cast(ubyte16)b);
    return cast(__m128i) greaterMask!byte16(cast(byte16)a, cast(byte16)b);
}
unittest
{
    // Includes the signed extremes (127 > -128) and a negative pair (-80 > -42 is false).
    __m128i A = _mm_setr_epi8(1, 2, 3, 1,  127, -80, 1, 2, 3, 2, 1, 0, 0, 1, 2, 1);
    __m128i B = _mm_setr_epi8(2, 2, 1, 2, -128, -42, 2, 3, 2, 1, 0, 0, 1, 2, 1, 1);
    byte16 C = cast(byte16) _mm_cmpgt_epi8(A, B);
    byte[16] correct =       [0, 0,-1, 0,   -1,   0, 0, 0,-1,-1,-1, 0, 0, 0,-1, 0];
    assert(C.array == correct);
}
848 
/// Compares the packed double-precision (64-bit) floating-point elements
/// of `a` and `b` for greater-than
/// (`FPComparison.ogt` predicate on the generic path).
__m128d _mm_cmpgt_pd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmpgtpd(a, b); 
    }
    else
    {
        auto mask = cmppd!(FPComparison.ogt)(a, b);
        return cast(__m128d) mask;
    }
}
862 
/// Compare the lower double-precision (64-bit) floating-point elements 
/// in `a` and `b` for greater-than, store the result in the lower element,
/// and copy the upper element from `a`.
__m128d _mm_cmpgt_sd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        // Note: There is no __builtin_ia32_cmpgtsd builtin.
        // `a > b` is the same ordered predicate as `b < a`, so compare with swapped
        // operands. The scalar compare keeps the upper element of its first operand
        // (`b` here), hence the final movsd, which puts only the low mask element
        // into `a`. This is the sequence GCC's own emmintrin.h uses.
        // The previous `cmpnlesd(b, a)` computed `!(b <= a)`, which is true when
        // a < b or when an operand is NaN, and returned the upper element of `b`.
        return __builtin_ia32_movsd(a, __builtin_ia32_cmpltsd(b, a));
    }
    else
    {
        return cast(__m128d) cmpsd!(FPComparison.ogt)(a, b);
    }
}
878 
/// Compare packed 16-bit integers elements in `a` and `b` for less-than-or-equal.
/// Lanes where `a <= b` become -1 (all bits set), other lanes become 0.
/// #BONUS
__m128i _mm_cmple_epi16 (__m128i a, __m128i b) pure @safe
{
    // `a <= b` is evaluated as `b >= a`: both paths below swap the operands.
    version (LDC)
    {
        // LDC ARM64: generates cmge since -O1
        return cast(__m128i) greaterOrEqualMask!short8(cast(short8)b, cast(short8)a);
    }
    else
    {
        return _mm_xor_si128(_mm_cmpeq_epi16(b, a), _mm_cmpgt_epi16(b, a));
    }
}
unittest
{
    short8   A = [-3, -2, -32768,  1,  0,  1,  2,  3];
    short8   B = [ 4,  3,  32767,  0,  0, -1, -2, -3];
    short[8] E = [-1, -1,     -1,  0,  -1, 0,  0,  0];
    short8   R = cast(short8)(_mm_cmple_epi16(cast(__m128i)A, cast(__m128i)B));
    assert(R.array == E);
}
901 
/// Compares the packed double-precision (64-bit) floating-point elements
/// of `a` and `b` for less-than-or-equal
/// (`FPComparison.ole` predicate on the generic path).
__m128d _mm_cmple_pd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmplepd(a, b); 
    }
    else
    {
        auto mask = cmppd!(FPComparison.ole)(a, b);
        return cast(__m128d) mask;
    }
}
915 
/// Less-than-or-equal comparison of the lower double-precision (64-bit)
/// floating-point elements of `a` and `b`. The result goes in the lower
/// element; the upper element is copied from `a`.
__m128d _mm_cmple_sd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
        return __builtin_ia32_cmplesd(a, b);
    else
    {
        // Portable path: `cmpsd` helper instantiated with the `ole` predicate.
        auto mask = cmpsd!(FPComparison.ole)(a, b);
        return cast(__m128d) mask;
    }
}
930 
/// Less-than comparison of the packed 16-bit integers of `a` against those of `b`.
__m128i _mm_cmplt_epi16 (__m128i a, __m128i b) pure @safe
{
    // `a < b` is the same test as `b > a`: reuse the greater-than intrinsic
    // with the operands swapped.
    __m128i lessThan = _mm_cmpgt_epi16(b, a);
    return lessThan;
}
936 
/// Less-than comparison of the packed 32-bit integers of `a` against those of `b`.
__m128i _mm_cmplt_epi32 (__m128i a, __m128i b) pure @safe
{
    // `a < b` is the same test as `b > a`: reuse the greater-than intrinsic
    // with the operands swapped.
    __m128i lessThan = _mm_cmpgt_epi32(b, a);
    return lessThan;
}
942 
/// Less-than comparison of the packed 8-bit integers of `a` against those of `b`.
__m128i _mm_cmplt_epi8 (__m128i a, __m128i b) pure @safe
{
    // `a < b` is the same test as `b > a`: reuse the greater-than intrinsic
    // with the operands swapped.
    __m128i lessThan = _mm_cmpgt_epi8(b, a);
    return lessThan;
}
948 
/// Less-than comparison of the packed double-precision (64-bit)
/// floating-point elements of `a` against those of `b`.
__m128d _mm_cmplt_pd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
        return __builtin_ia32_cmpltpd(a, b);
    else
    {
        // Portable path: `cmppd` helper instantiated with the `olt` predicate.
        auto mask = cmppd!(FPComparison.olt)(a, b);
        return cast(__m128d) mask;
    }
}
962 
/// Less-than comparison of the lower double-precision (64-bit)
/// floating-point elements of `a` and `b`. The result goes in the lower
/// element; the upper element is copied from `a`.
__m128d _mm_cmplt_sd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
        return __builtin_ia32_cmpltsd(a, b);
    else
    {
        // Portable path: `cmpsd` helper instantiated with the `olt` predicate.
        auto mask = cmpsd!(FPComparison.olt)(a, b);
        return cast(__m128d) mask;
    }
}
977 
/// Not-equal comparison of the packed double-precision (64-bit)
/// floating-point elements of `a` against those of `b`.
__m128d _mm_cmpneq_pd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
        return __builtin_ia32_cmpneqpd(a, b);
    else
    {
        // Portable path: `cmppd` helper instantiated with the `une` predicate.
        auto mask = cmppd!(FPComparison.une)(a, b);
        return cast(__m128d) mask;
    }
}
991 
/// Not-equal comparison of the lower double-precision (64-bit)
/// floating-point elements of `a` and `b`. The result goes in the lower
/// element; the upper element is copied from `a`.
__m128d _mm_cmpneq_sd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
        return __builtin_ia32_cmpneqsd(a, b);
    else
    {
        // Portable path: `cmpsd` helper instantiated with the `une` predicate.
        auto mask = cmpsd!(FPComparison.une)(a, b);
        return cast(__m128d) mask;
    }
}
1006 
/// Not-greater-than-or-equal comparison of the packed double-precision
/// (64-bit) floating-point elements of `a` against those of `b`.
__m128d _mm_cmpnge_pd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
        return __builtin_ia32_cmpngepd(a, b);
    else
    {
        // Portable path: `cmppd` helper instantiated with the `ult` predicate.
        auto mask = cmppd!(FPComparison.ult)(a, b);
        return cast(__m128d) mask;
    }
}
1020 
/// Compare the lower double-precision (64-bit) floating-point elements 
/// in `a` and `b` for not-greater-than-or-equal, store the result in 
/// the lower element, and copy the upper element from `a`.
__m128d _mm_cmpnge_sd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        // Note: There is no __builtin_ia32_cmpngesd builtin.
        // `!(a >= b)` is evaluated as `!(b <= a)`. The scalar compare keeps the
        // upper element of its FIRST operand (`b` here), so movsd is used to put
        // the lower comparison result back on top of `a`'s upper element.
        return __builtin_ia32_movsd(a, __builtin_ia32_cmpnlesd(b, a));
    }
    else
    {
        return cast(__m128d) cmpsd!(FPComparison.ult)(a, b);
    }
}
1036 
/// Not-greater-than comparison of the packed double-precision (64-bit)
/// floating-point elements of `a` against those of `b`.
__m128d _mm_cmpngt_pd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
        return __builtin_ia32_cmpngtpd(a, b);
    else
    {
        // Portable path: `cmppd` helper instantiated with the `ule` predicate.
        auto mask = cmppd!(FPComparison.ule)(a, b);
        return cast(__m128d) mask;
    }
}
1050 
/// Compare the lower double-precision (64-bit) floating-point elements 
/// in `a` and `b` for not-greater-than, store the result in the 
/// lower element, and copy the upper element from `a`.
__m128d _mm_cmpngt_sd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        // Note: There is no __builtin_ia32_cmpngtsd builtin.
        // `!(a > b)` is evaluated as `!(b < a)`. The scalar compare keeps the
        // upper element of its FIRST operand (`b` here), so movsd is used to put
        // the lower comparison result back on top of `a`'s upper element.
        return __builtin_ia32_movsd(a, __builtin_ia32_cmpnltsd(b, a));
    }
    else
    {
        return cast(__m128d) cmpsd!(FPComparison.ule)(a, b);
    }
}
1066 
/// Not-less-than-or-equal comparison of the packed double-precision
/// (64-bit) floating-point elements of `a` against those of `b`.
__m128d _mm_cmpnle_pd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
        return __builtin_ia32_cmpnlepd(a, b);
    else
    {
        // Portable path: `cmppd` helper instantiated with the `ugt` predicate.
        auto mask = cmppd!(FPComparison.ugt)(a, b);
        return cast(__m128d) mask;
    }
}
1080 
/// Not-less-than-or-equal comparison of the lower double-precision (64-bit)
/// floating-point elements of `a` and `b`. The result goes in the lower
/// element; the upper element is copied from `a`.
__m128d _mm_cmpnle_sd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
        return __builtin_ia32_cmpnlesd(a, b);
    else
    {
        // Portable path: `cmpsd` helper instantiated with the `ugt` predicate.
        auto mask = cmpsd!(FPComparison.ugt)(a, b);
        return cast(__m128d) mask;
    }
}
1095  
/// Not-less-than comparison of the packed double-precision (64-bit)
/// floating-point elements of `a` against those of `b`.
__m128d _mm_cmpnlt_pd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
        return __builtin_ia32_cmpnltpd(a, b);
    else
    {
        // Portable path: `cmppd` helper instantiated with the `uge` predicate.
        auto mask = cmppd!(FPComparison.uge)(a, b);
        return cast(__m128d) mask;
    }
}
1109 
/// Not-less-than comparison of the lower double-precision (64-bit)
/// floating-point elements of `a` and `b`. The result goes in the lower
/// element; the upper element is copied from `a`.
__m128d _mm_cmpnlt_sd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
        return __builtin_ia32_cmpnltsd(a, b);
    else
    {
        // Portable path: `cmpsd` helper instantiated with the `uge` predicate.
        auto mask = cmpsd!(FPComparison.uge)(a, b);
        return cast(__m128d) mask;
    }
}
1124 
/// Test the packed double-precision (64-bit) floating-point elements
/// of `a` and `b` to see if neither is NaN.
__m128d _mm_cmpord_pd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
        return __builtin_ia32_cmpordpd(a, b);
    else
    {
        // Portable path: `cmppd` helper instantiated with the `ord` predicate.
        auto mask = cmppd!(FPComparison.ord)(a, b);
        return cast(__m128d) mask;
    }
}
1138 
/// Test the lower double-precision (64-bit) floating-point elements of
/// `a` and `b` to see if neither is NaN. The result goes in the lower
/// element; the upper element is copied from `a`.
__m128d _mm_cmpord_sd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
        return __builtin_ia32_cmpordsd(a, b);
    else
    {
        // Portable path: `cmpsd` helper instantiated with the `ord` predicate.
        auto mask = cmpsd!(FPComparison.ord)(a, b);
        return cast(__m128d) mask;
    }
}
1153 
/// Test the packed double-precision (64-bit) floating-point elements
/// of `a` and `b` to see if either is NaN.
__m128d _mm_cmpunord_pd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
        return __builtin_ia32_cmpunordpd(a, b);
    else
    {
        // Portable path: `cmppd` helper instantiated with the `uno` predicate.
        auto mask = cmppd!(FPComparison.uno)(a, b);
        return cast(__m128d) mask;
    }
}
1167 
/// Test the lower double-precision (64-bit) floating-point elements of
/// `a` and `b` to see if either is NaN. The result goes in the lower
/// element; the upper element is copied from `a`.
__m128d _mm_cmpunord_sd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
        return __builtin_ia32_cmpunordsd(a, b);
    else
    {
        // Portable path: `cmpsd` helper instantiated with the `uno` predicate.
        auto mask = cmpsd!(FPComparison.uno)(a, b);
        return cast(__m128d) mask;
    }
}
1182 
/// Equality test on the lower double-precision (64-bit) floating-point
/// element of `a` and `b`. Returns the boolean result as an `int` (0 or 1).
int _mm_comieq_sd (__m128d a, __m128d b) pure @safe
{
    // Note on NaN: for some of the _mm_comixx_sx intrinsics the intrinsic's NaN
    // semantics differ from the comisd instruction; the intrinsic returns false
    // when the operands are unordered.
    //
    // C++ compilers actually disagree here. GCC handles NaN like the comisd
    // instruction (returns true if unordered), while ICC, clang and MSVC follow
    // the Intel Intrinsics Guide. This implementation sides with the majority;
    // GCC appears to be buggy with NaNs.
    const double lhs = a.array[0];
    const double rhs = b.array[0];
    return lhs == rhs;
}
unittest
{
    assert(1 == _mm_comieq_sd(_mm_set_sd(78.0), _mm_set_sd(78.0)));
    assert(0 == _mm_comieq_sd(_mm_set_sd(78.0), _mm_set_sd(-78.0)));
    // Any NaN operand compares as "not equal".
    assert(0 == _mm_comieq_sd(_mm_set_sd(78.0), _mm_set_sd(double.nan)));
    assert(0 == _mm_comieq_sd(_mm_set_sd(double.nan), _mm_set_sd(-4.22)));
    // Signed zeroes are equal.
    assert(1 == _mm_comieq_sd(_mm_set_sd(0.0), _mm_set_sd(-0.0)));
}
1204 
/// Greater-than-or-equal test on the lower double-precision (64-bit)
/// floating-point element of `a` and `b`. Returns the boolean result
/// as an `int` (0 or 1).
int _mm_comige_sd (__m128d a, __m128d b) pure @safe
{
    const double lhs = a.array[0];
    const double rhs = b.array[0];
    return lhs >= rhs;
}
unittest
{
    assert(1 == _mm_comige_sd(_mm_set_sd(78.0), _mm_set_sd(78.0)));
    assert(1 == _mm_comige_sd(_mm_set_sd(78.0), _mm_set_sd(-78.0)));
    assert(0 == _mm_comige_sd(_mm_set_sd(-78.0), _mm_set_sd(78.0)));
    // Any NaN operand yields 0.
    assert(0 == _mm_comige_sd(_mm_set_sd(78.0), _mm_set_sd(double.nan)));
    assert(0 == _mm_comige_sd(_mm_set_sd(double.nan), _mm_set_sd(-4.22)));
    // Signed zeroes are equal.
    assert(1 == _mm_comige_sd(_mm_set_sd(-0.0), _mm_set_sd(0.0)));
}
1221 
/// Greater-than test on the lower double-precision (64-bit) floating-point
/// element of `a` and `b`. Returns the boolean result as an `int` (0 or 1).
int _mm_comigt_sd (__m128d a, __m128d b) pure @safe
{
    const double lhs = a.array[0];
    const double rhs = b.array[0];
    return lhs > rhs;
}
unittest
{
    assert(0 == _mm_comigt_sd(_mm_set_sd(78.0), _mm_set_sd(78.0)));
    assert(1 == _mm_comigt_sd(_mm_set_sd(78.0), _mm_set_sd(-78.0)));
    // Any NaN operand yields 0.
    assert(0 == _mm_comigt_sd(_mm_set_sd(78.0), _mm_set_sd(double.nan)));
    assert(0 == _mm_comigt_sd(_mm_set_sd(double.nan), _mm_set_sd(-4.22)));
    // Signed zeroes are equal, hence not greater.
    assert(0 == _mm_comigt_sd(_mm_set_sd(0.0), _mm_set_sd(-0.0)));
}
1236 
/// Less-than-or-equal test on the lower double-precision (64-bit)
/// floating-point element of `a` and `b`. Returns the boolean result
/// as an `int` (0 or 1).
int _mm_comile_sd (__m128d a, __m128d b) pure @safe
{
    const double lhs = a.array[0];
    const double rhs = b.array[0];
    return lhs <= rhs;
}
unittest
{
    assert(1 == _mm_comile_sd(_mm_set_sd(78.0), _mm_set_sd(78.0)));
    assert(0 == _mm_comile_sd(_mm_set_sd(78.0), _mm_set_sd(-78.0)));
    assert(1 == _mm_comile_sd(_mm_set_sd(-78.0), _mm_set_sd(78.0)));
    // Any NaN operand yields 0.
    assert(0 == _mm_comile_sd(_mm_set_sd(78.0), _mm_set_sd(double.nan)));
    assert(0 == _mm_comile_sd(_mm_set_sd(double.nan), _mm_set_sd(-4.22)));
    // Signed zeroes are equal.
    assert(1 == _mm_comile_sd(_mm_set_sd(0.0), _mm_set_sd(-0.0)));
}
1252 
/// Less-than test on the lower double-precision (64-bit) floating-point
/// element of `a` and `b`. Returns the boolean result as an `int` (0 or 1).
int _mm_comilt_sd (__m128d a, __m128d b) pure @safe
{
    const double lhs = a.array[0];
    const double rhs = b.array[0];
    return lhs < rhs;
}
unittest
{
    assert(0 == _mm_comilt_sd(_mm_set_sd(78.0), _mm_set_sd(78.0)));
    assert(0 == _mm_comilt_sd(_mm_set_sd(78.0), _mm_set_sd(-78.0)));
    assert(1 == _mm_comilt_sd(_mm_set_sd(-78.0), _mm_set_sd(78.0)));
    // Any NaN operand yields 0.
    assert(0 == _mm_comilt_sd(_mm_set_sd(78.0), _mm_set_sd(double.nan)));
    assert(0 == _mm_comilt_sd(_mm_set_sd(double.nan), _mm_set_sd(-4.22)));
    // Signed zeroes are equal, hence not less.
    assert(0 == _mm_comilt_sd(_mm_set_sd(-0.0), _mm_set_sd(0.0)));
}
1268 
/// Not-equal test on the lower double-precision (64-bit) floating-point
/// element of `a` and `b`. Returns the boolean result as an `int` (0 or 1).
int _mm_comineq_sd (__m128d a, __m128d b) pure @safe
{
    const double lhs = a.array[0];
    const double rhs = b.array[0];
    return lhs != rhs;
}
unittest
{
    assert(0 == _mm_comineq_sd(_mm_set_sd(78.0), _mm_set_sd(78.0)));
    assert(1 == _mm_comineq_sd(_mm_set_sd(78.0), _mm_set_sd(-78.0)));
    // Any NaN operand compares as "not equal".
    assert(1 == _mm_comineq_sd(_mm_set_sd(78.0), _mm_set_sd(double.nan)));
    assert(1 == _mm_comineq_sd(_mm_set_sd(double.nan), _mm_set_sd(-4.22)));
    // Signed zeroes are equal.
    assert(0 == _mm_comineq_sd(_mm_set_sd(0.0), _mm_set_sd(-0.0)));
}
1283 
/// Convert the two lowest packed 32-bit integers of `a` to packed
/// double-precision (64-bit) floating-point elements.
__m128d _mm_cvtepi32_pd (__m128i a) pure @trusted
{
    version(LDC)
    {
        // Inline IR: keep lanes 0 and 1, then signed int -> double.
        // Generates cvtdq2pd since LDC 1.0, even without optimizations
        enum ir = `
            %v = shufflevector <4 x i32> %0,<4 x i32> %0, <2 x i32> <i32 0, i32 1>
            %r = sitofp <2 x i32> %v to <2 x double>
            ret <2 x double> %r`;
        return cast(__m128d) LDCInlineIR!(ir, __m128d, __m128i)(a);
    }
    else static if (GDC_with_SSE2)
        return __builtin_ia32_cvtdq2pd(a);
    else
    {
        // Scalar fallback: both lanes are written, so no initialization is needed.
        double2 converted = void;
        converted.ptr[0] = a.array[0];
        converted.ptr[1] = a.array[1];
        return converted;
    }
}
unittest
{
    __m128d A = _mm_cvtepi32_pd(_mm_set1_epi32(54));
    assert(A.array[0] == 54.0);
    assert(A.array[1] == 54.0);
}
1315 
/// Convert the four packed 32-bit integers of `a` to packed
/// single-precision (32-bit) floating-point elements.
__m128 _mm_cvtepi32_ps(__m128i a) pure @trusted
{
    static if (DMD_with_DSIMD)
        return cast(__m128)__simd(XMM.CVTDQ2PS, cast(void16) a);
    else static if (GDC_with_SSE2)
        return __builtin_ia32_cvtdq2ps(a);
    else version(LDC)
    {
        // See #86 for why we had to resort to LLVM IR.
        // Plain code below was leading to catastrophic behaviour. 
        // x86: Generates cvtdq2ps since LDC 1.1.0 -O0
        // ARM: Generates scvtf.4s since LDC 1.8.0 -O0
        enum ir = `
            %r = sitofp <4 x i32> %0 to <4 x float>
            ret <4 x float> %r`;
        return cast(__m128) LDCInlineIR!(ir, float4, int4)(a);
    }
    else
    {
        // Scalar fallback, one lane at a time.
        __m128 converted; // PERF =void;
        converted.ptr[0] = cast(float)a.array[0];
        converted.ptr[1] = cast(float)a.array[1];
        converted.ptr[2] = cast(float)a.array[2];
        converted.ptr[3] = cast(float)a.array[3];
        return converted;
    }
}
unittest
{
    __m128 a = _mm_cvtepi32_ps(_mm_setr_epi32(-1, 0, 1, 1000));
    assert(a.array == [-1.0f, 0.0f, 1.0f, 1000.0f]);
}
1354 
/// Convert the packed double-precision (64-bit) floating-point elements
/// of `a` to packed 32-bit integers. The two results occupy the lower
/// lanes; the two upper lanes are zero.
__m128i _mm_cvtpd_epi32 (__m128d a) @trusted
{
    // PERF ARM32
    static if (LDC_with_SSE2 || GDC_with_SSE2)
    {
        // Same builtin name on both compilers.
        return __builtin_ia32_cvtpd2dq(a);
    }
    else static if (LDC_with_ARM64)
    {
        // Pick the conversion that matches the current rounding mode.
        uint fpcr = arm_get_fpcr();
        long2 wide;
        switch(fpcr & _MM_ROUND_MASK_ARM)
        {
            default:
            case _MM_ROUND_NEAREST_ARM:     wide = vcvtnq_s64_f64(a); break;
            case _MM_ROUND_DOWN_ARM:        wide = vcvtmq_s64_f64(a); break;
            case _MM_ROUND_UP_ARM:          wide = vcvtpq_s64_f64(a); break;
            case _MM_ROUND_TOWARD_ZERO_ARM: wide = vcvtzq_s64_f64(a); break;
        }
        // Keep 32-bit lanes 0 and 2 of the 64-bit results, pad with zeroes.
        int4 zero = 0;
        return cast(__m128i) shufflevectorLDC!(int4, 0, 2, 4, 6)(cast(int4)wide, zero);
    }
    else
    {
        // PERF ARM32
        __m128i result = _mm_setzero_si128();
        result.ptr[0] = convertDoubleToInt32UsingMXCSR(a.array[0]);
        result.ptr[1] = convertDoubleToInt32UsingMXCSR(a.array[1]);
        return result;
    }
}
unittest
{
    int4 A = _mm_cvtpd_epi32(_mm_set_pd(61.0, 55.0));
    assert(A.array[0] == 55 && A.array[1] == 61 && A.array[2] == 0 && A.array[3] == 0);
}
1398 
/// Convert the packed double-precision (64-bit) floating-point elements of `v`
/// to packed 32-bit integers, returned as a `__m64`.
__m64 _mm_cvtpd_pi32 (__m128d v) @safe
{
    // Convert in 128-bit form, then narrow with `to_m64`.
    __m128i wide = _mm_cvtpd_epi32(v);
    return to_m64(wide);
}
unittest
{
    int2 A = cast(int2) _mm_cvtpd_pi32(_mm_set_pd(61.0, 55.0));
    assert(A.array[0] == 55 && A.array[1] == 61);
}
1410 
/// Convert the packed double-precision (64-bit) floating-point elements
/// of `a` to packed single-precision (32-bit) floating-point elements.
/// The two results occupy the lower lanes; the two upper lanes are zero.
__m128 _mm_cvtpd_ps (__m128d a) pure @trusted
{
    static if (LDC_with_SSE2 || GDC_with_SSE2)
    {
        // Same builtin name on both compilers.
        // On LDC: can't be done with IR unfortunately
        return __builtin_ia32_cvtpd2ps(a);
    }
    else
    { 
        // Scalar fallback: all four lanes are written, so no initialization is needed.
        __m128 narrowed = void;
        narrowed.ptr[0] = a.array[0];
        narrowed.ptr[1] = a.array[1];
        narrowed.ptr[2] = 0;
        narrowed.ptr[3] = 0;
        return narrowed;
    }
}
unittest
{
    __m128d A = _mm_set_pd(5.25, 4.0);
    __m128 B = _mm_cvtpd_ps(A);
    assert(B.array == [4.0f, 5.25f, 0, 0]);
}
1439 
/// Convert the packed 32-bit integers of `v` to packed double-precision
/// (64-bit) floating-point elements.
__m128d _mm_cvtpi32_pd (__m64 v) pure @safe
{
    // Widen to 128-bit with `to_m128i`, then reuse the 128-bit conversion.
    __m128i wide = to_m128i(v);
    return _mm_cvtepi32_pd(wide);
}
unittest
{
    __m128d A = _mm_cvtpi32_pd(_mm_setr_pi32(4, -5));
    assert(A.array[0] == 4.0 && A.array[1] == -5.0);
}
1451 
/// Convert the packed single-precision (32-bit) floating-point elements
/// of `a` to packed 32-bit integers, honoring the current rounding mode.
__m128i _mm_cvtps_epi32 (__m128 a) @trusted
{
    static if (LDC_with_SSE2)
        return cast(__m128i) __builtin_ia32_cvtps2dq(a);
    else static if (GDC_with_SSE2)
        return __builtin_ia32_cvtps2dq(a);
    else static if (LDC_with_ARM64)
    {
        // Pick the conversion that matches the current rounding mode.
        uint fpcr = arm_get_fpcr();
        switch(fpcr & _MM_ROUND_MASK_ARM)
        {
            default:
            case _MM_ROUND_NEAREST_ARM:     return vcvtnq_s32_f32(a);
            case _MM_ROUND_DOWN_ARM:        return vcvtmq_s32_f32(a);
            case _MM_ROUND_UP_ARM:          return vcvtpq_s32_f32(a);
            case _MM_ROUND_TOWARD_ZERO_ARM: return vcvtzq_s32_f32(a);
        }
    }
    else
    {
        // Scalar fallback: all four lanes are written, so no initialization is needed.
        __m128i converted = void;
        converted.ptr[0] = convertFloatToInt32UsingMXCSR(a.array[0]);
        converted.ptr[1] = convertFloatToInt32UsingMXCSR(a.array[1]);
        converted.ptr[2] = convertFloatToInt32UsingMXCSR(a.array[2]);
        converted.ptr[3] = convertFloatToInt32UsingMXCSR(a.array[3]);
        return converted;
    }
}
unittest
{
    // GDC bug #98607
    // https://gcc.gnu.org/bugzilla/show_bug.cgi?id=98607
    // GDC does not provide an optimization barrier for the rounding mode.
    // Worked around by using different literals for each rounding mode below.
    // This bug will likely only manifest in unittests.

    uint savedRounding = _MM_GET_ROUNDING_MODE();

    _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST);
    __m128i A = _mm_cvtps_epi32(_mm_setr_ps(1.4f, -2.1f, 53.5f, -2.9f));
    assert(A.array == [1, -2, 54, -3]);

    _MM_SET_ROUNDING_MODE(_MM_ROUND_DOWN);
    A = _mm_cvtps_epi32(_mm_setr_ps(1.3f, -2.11f, 53.4f, -2.8f));
    assert(A.array == [1, -3, 53, -3]);

    _MM_SET_ROUNDING_MODE(_MM_ROUND_UP);
    A = _mm_cvtps_epi32(_mm_setr_ps(1.3f, -2.12f, 53.6f, -2.7f));
    assert(A.array == [2, -2, 54, -2]);

    _MM_SET_ROUNDING_MODE(_MM_ROUND_TOWARD_ZERO);
    A = _mm_cvtps_epi32(_mm_setr_ps(1.4f, -2.17f, 53.8f, -2.91f));
    assert(A.array == [1, -2, 53, -2]);

    // Leave the rounding mode as it was found.
    _MM_SET_ROUNDING_MODE(savedRounding);
}
1515 
/// Convert the two lowest packed single-precision (32-bit) floating-point
/// elements of `a` to packed double-precision (64-bit) floating-point elements.
__m128d _mm_cvtps_pd (__m128 a) pure @trusted
{
    version(LDC)
    {
        // Inline IR: keep lanes 0 and 1, then extend float -> double.
        // Generates cvtps2pd since LDC 1.0 -O0
        enum ir = `
            %v = shufflevector <4 x float> %0,<4 x float> %0, <2 x i32> <i32 0, i32 1>
            %r = fpext <2 x float> %v to <2 x double>
            ret <2 x double> %r`;
        return cast(__m128d) LDCInlineIR!(ir, __m128d, __m128)(a);
    }
    else static if (GDC_with_SSE2)
        return __builtin_ia32_cvtps2pd(a);
    else
    {
        // Scalar fallback: both lanes are written, so no initialization is needed.
        double2 widened = void;
        widened.ptr[0] = a.array[0];
        widened.ptr[1] = a.array[1];
        return widened;
    }
}
unittest
{
    __m128d A = _mm_cvtps_pd(_mm_set1_ps(54.0f));
    assert(A.array[0] == 54.0);
    assert(A.array[1] == 54.0);
}
1547 
/// Return the lower double-precision (64-bit) floating-point element of `a`.
double _mm_cvtsd_f64 (__m128d a) pure @safe
{
    const double lowest = a.array[0];
    return lowest;
}
1553 
/// Convert the lower double-precision (64-bit) floating-point element
/// of `a` to a 32-bit integer.
int _mm_cvtsd_si32 (__m128d a) @safe
{
    static if (LDC_with_SSE2 || GDC_with_SSE2)
    {
        // Same builtin name on both compilers.
        return __builtin_ia32_cvtsd2si(a);
    }
    else
    {
        const double lowest = a.array[0];
        return convertDoubleToInt32UsingMXCSR(lowest);
    }
}
unittest
{
    assert(4 == _mm_cvtsd_si32(_mm_set1_pd(4.0)));
}
1575 
/// Convert the lower double-precision (64-bit) floating-point element of `a`
/// to a 64-bit integer, honoring the current rounding mode.
long _mm_cvtsd_si64 (__m128d a) @trusted
{
    version (LDC)
    {
        version (X86_64)
            return __builtin_ia32_cvtsd2si64(a);
        else
        {
            // Note: In 32-bit x86, there is no way to convert from float/double to 64-bit integer
            // using SSE instructions only. So the builtin doesn't exist for this arch.
            return convertDoubleToInt64UsingMXCSR(a.array[0]);
        }
    }
    else
        return convertDoubleToInt64UsingMXCSR(a.array[0]);
}
unittest
{
    assert(-4 == _mm_cvtsd_si64(_mm_set1_pd(-4.0)));

    uint savedRounding = _MM_GET_ROUNDING_MODE();

    _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST);
    assert(-56468486186 == _mm_cvtsd_si64(_mm_set1_pd(-56468486186.49)));

    _MM_SET_ROUNDING_MODE(_MM_ROUND_DOWN);
    assert(-56468486187 == _mm_cvtsd_si64(_mm_set1_pd(-56468486186.1)));

    _MM_SET_ROUNDING_MODE(_MM_ROUND_UP);
    assert(56468486187 == _mm_cvtsd_si64(_mm_set1_pd(56468486186.1)));

    _MM_SET_ROUNDING_MODE(_MM_ROUND_TOWARD_ZERO);
    assert(-56468486186 == _mm_cvtsd_si64(_mm_set1_pd(-56468486186.9)));

    // Leave the rounding mode as it was found.
    _MM_SET_ROUNDING_MODE(savedRounding);
}

deprecated("Use _mm_cvtsd_si64 instead") alias _mm_cvtsd_si64x = _mm_cvtsd_si64; ///
1619 
/// Convert the lower double-precision (64-bit) floating-point element of `b` to single precision (32-bit),
/// store it in the lower element of the result, and copy the upper 3 packed elements from `a`.
__m128 _mm_cvtsd_ss (__m128 a, __m128d b) pure @trusted
{
    static if (GDC_with_SSE2)
        return __builtin_ia32_cvtsd2ss(a, b);
    else
    {
        // `a` is a by-value copy: overwrite its lowest lane and return it.
        // Generates cvtsd2ss since LDC 1.3 -O0
        a.ptr[0] = cast(float) b.array[0];
        return a;
    }
}
unittest
{
    __m128 R = _mm_cvtsd_ss(_mm_set1_ps(4.0f), _mm_set1_pd(3.0));
    assert(R.array == [3.0f, 4.0f, 4.0f, 4.0f]);
}
1641 
/// Return the lowest 32-bit integer of `a`.
int _mm_cvtsi128_si32 (__m128i a) pure @safe
{
    const int lowest = a.array[0];
    return lowest;
}
1647 
/// Return the lowest 64-bit integer of `a`.
long _mm_cvtsi128_si64 (__m128i a) pure @safe
{
    // Reinterpret the 128 bits as two 64-bit lanes and read the first one.
    long2 halves = cast(long2)a;
    const long lowest = halves.array[0];
    return lowest;
}
deprecated("Use _mm_cvtsi128_si64 instead") alias _mm_cvtsi128_si64x = _mm_cvtsi128_si64;
1655 
/// Convert the signed 32-bit integer `b` to double precision (64-bit), store it in the lower element
/// of the result, and copy the upper element from `a`.
__m128d _mm_cvtsi32_sd(__m128d a, int b) pure @trusted
{
    // `a` is a by-value copy: overwrite its lowest lane and return it.
    const double converted = cast(double)b;
    a.ptr[0] = converted;
    return a;
}
unittest
{
    __m128d a = _mm_cvtsi32_sd(_mm_set1_pd(0.0f), 42);
    assert(a.array == [42.0, 0]);
}
1668 
/// Build a vector whose lowest 32-bit element is `a` and whose upper elements are zero.
__m128i _mm_cvtsi32_si128 (int a) pure @trusted
{
    // Start from all zeroes, then set lane 0.
    int4 result = [0, 0, 0, 0];
    result.ptr[0] = a;
    return result;
}
unittest
{
    __m128i a = _mm_cvtsi32_si128(65);
    assert(a.array == [65, 0, 0, 0]);
}
1681 
/// Convert the signed 64-bit integer `b` to double precision (64-bit), store it in the lower element
/// of the result, and copy the upper element from `a`.
__m128d _mm_cvtsi64_sd(__m128d a, long b) pure @trusted
{
    // `a` is a by-value copy: overwrite its lowest lane and return it.
    const double converted = cast(double)b;
    a.ptr[0] = converted;
    return a;
}
unittest
{
    __m128d a = _mm_cvtsi64_sd(_mm_set1_pd(0.0f), 42);
    assert(a.array == [42.0, 0]);
}
1695 
/// Build a vector whose lowest 64-bit element is `a` and whose upper element is zero.
__m128i _mm_cvtsi64_si128 (long a) pure @trusted
{
    // Start from all zeroes, then set lane 0.
    long2 result = [0, 0];
    result.ptr[0] = a;
    return cast(__m128i) result;
}

deprecated("Use _mm_cvtsi64_sd instead") alias _mm_cvtsi64x_sd = _mm_cvtsi64_sd; ///
deprecated("Use _mm_cvtsi64_si128 instead") alias _mm_cvtsi64x_si128 = _mm_cvtsi64_si128; ///
1706 
/// Convert the lower single-precision (32-bit) floating-point element in `b` to a double-precision (64-bit) 
/// floating-point element, store that in the lower element of result, and copy the upper element from `a` to the upper 
/// element of result.
double2 _mm_cvtss_sd(double2 a, float4 b) pure @trusted
{
    // `a` is a by-value copy: overwrite its lowest lane (float widens to
    // double implicitly) and return it.
    a.ptr[0] = b.array[0];
    return a;
}
unittest
{
    __m128d a = _mm_cvtss_sd(_mm_set1_pd(0.0f), _mm_set1_ps(42.0f));
    assert(a.array == [42.0, 0]);
}
1720 
/// Convert the lower single-precision (32-bit) floating-point element of `a` to a 64-bit integer,
/// truncating the fractional part.
long _mm_cvttss_si64 (__m128 a) pure @safe
{
    const float lowest = a.array[0];
    return cast(long) lowest; // Generates cvttss2si as expected
}
unittest
{
    assert(1 == _mm_cvttss_si64(_mm_setr_ps(1.9f, 2.0f, 3.0f, 4.0f)));
}
1730 
/// Convert the packed double-precision (64-bit) floating-point elements of `a` to packed 32-bit integers,
/// truncating the fractional part. The upper elements of the result are zero.
__m128i _mm_cvttpd_epi32 (__m128d a) pure @trusted
{
    static if (LDC_with_SSE2 || GDC_with_SSE2)
    {
        // Same builtin name on both compilers.
        return __builtin_ia32_cvttpd2dq(a);
    }
    else
    {
        // Scalar fallback.
        // Note: doesn't generate cvttpd2dq as of LDC 1.13
        __m128i truncated; // PERF =void;
        truncated.ptr[0] = cast(int)a.array[0];
        truncated.ptr[1] = cast(int)a.array[1];
        truncated.ptr[2] = 0;
        truncated.ptr[3] = 0;
        return truncated;
    }
}
unittest
{
    __m128i R = _mm_cvttpd_epi32(_mm_setr_pd(-4.9, 45641.5f));
    assert(R.array == [-4, 45641, 0, 0]);
}
1759 
/// Convert the packed double-precision (64-bit) floating-point elements of `v`
/// to packed 32-bit integers with truncation, returned as a `__m64`.
__m64 _mm_cvttpd_pi32 (__m128d v) pure @safe
{
    // Convert in 128-bit form, then narrow with `to_m64`.
    __m128i wide = _mm_cvttpd_epi32(v);
    return to_m64(wide);
}
unittest
{
    int2 R = cast(int2) _mm_cvttpd_pi32(_mm_setr_pd(-4.9, 45641.7f));
    int[2] correct = [-4, 45641];
    assert(R.array == correct);
}
1772 
/// Convert the packed single-precision (32-bit) floating-point elements of `a` to packed 32-bit integers,
/// truncating the fractional part.
__m128i _mm_cvttps_epi32 (__m128 a) pure @trusted
{
    // Lane-by-lane cast, kept unrolled:
    // x86: Generates cvttps2dq since LDC 1.3 -O2
    // ARM64: generates fcvtze since LDC 1.8 -O2
    __m128i truncated; // PERF = void;
    truncated.ptr[0] = cast(int)a.array[0];
    truncated.ptr[1] = cast(int)a.array[1];
    truncated.ptr[2] = cast(int)a.array[2];
    truncated.ptr[3] = cast(int)a.array[3];
    return truncated;
}
unittest
{
    __m128i R = _mm_cvttps_epi32(_mm_setr_ps(-4.9, 45641.5f, 0.0f, 1.0f));
    assert(R.array == [-4, 45641, 0, 1]);
}
1790 
/// Convert the lower double-precision (64-bit) floating-point element in `a` to a 32-bit integer with truncation.
int _mm_cvttsd_si32 (__m128d a) pure @safe
{
    // Only reads a lane of the by-value argument, so the function can carry the
    // same `pure @safe` attributes as the other scalar truncating conversions.
    // Generates cvttsd2si since LDC 1.3 -O0
    return cast(int)a.array[0];
}
1797 
/// Convert the lower double-precision (64-bit) floating-point element in `a` to a 64-bit integer with truncation.
long _mm_cvttsd_si64 (__m128d a) pure @safe
{
    // Only reads a lane of the by-value argument, so the function can carry the
    // same `pure @safe` attributes as the other scalar truncating conversions.
    // Generates cvttsd2si since LDC 1.3 -O0
    // but in 32-bit instead, it's a long sequence that resort to FPU
    return cast(long)a.array[0];
}

deprecated("Use _mm_cvttsd_si64 instead") alias _mm_cvttsd_si64x = _mm_cvttsd_si64; ///
1807 
/// Element-wise division of the packed double-precision (64-bit) floating-point elements of `a`
/// by the corresponding elements of `b`.
__m128d _mm_div_pd(__m128d a, __m128d b) pure @safe
{
    pragma(inline, true);
    __m128d quotient = a / b;
    return quotient;
}
1814 
/// Divide the lower double-precision (64-bit) floating-point element of `a` by the lower element of `b`,
/// store the quotient in the lower element of the result, and copy the upper element from `a`.
__m128d _mm_div_sd(__m128d a, __m128d b) pure @trusted
{
    static if (GDC_with_SSE2)
        return __builtin_ia32_divsd(a, b);
    else version(DigitalMars)
    {
        // Work-around for https://issues.dlang.org/show_bug.cgi?id=19599
        // Note that this is unneeded since DMD >= 2.094.0 at least, haven't investigated again
        asm pure nothrow @nogc @trusted { nop;}
        a.array[0] = a.array[0] / b.array[0];
        return a;
    }
    else
    {
        // `a` is a by-value copy: replace its lowest lane with the quotient.
        a.ptr[0] = a.array[0] / b.array[0];
        return a;
    }
}
unittest
{
    __m128d a = [2.0, 4.5];
    a = _mm_div_sd(a, a);
    assert(a.array == [1.0, 4.5]);
}
1841 
/// Return the 16-bit lane of `v` selected by `index` (only the low 3 bits of `index` are used).
/// Warning: the lane is zero-extended, not sign-extended, to 32-bits.
int _mm_extract_epi16(__m128i v, int index) pure @safe
{
    immutable int lane = index & 7;
    short8 lanes = cast(short8)v;
    ushort picked = cast(ushort) lanes.array[lane];
    return picked;
}
unittest
{
    __m128i A = _mm_set_epi16(7, 6, 5, 4, 3, 2, 1, -1);
    assert(_mm_extract_epi16(A, 6) == 6);
    assert(_mm_extract_epi16(A, 0) == 65535);   // -1 comes back zero-extended
    assert(_mm_extract_epi16(A, 5 + 8) == 5);   // index wraps modulo 8
}
1856 
/// Copy `v`, and insert the 16-bit integer `i` at the location specified by `index`.
/// Only the low 3 bits of `index` are used, and `i` is truncated to 16 bits.
__m128i _mm_insert_epi16 (__m128i v, int i, int index) pure @trusted
{
    short8 r = cast(short8)v;
    r.ptr[index & 7] = cast(short)i;
    return cast(__m128i)r;
}
unittest
{
    __m128i A = _mm_set_epi16(7, 6, 5, 4, 3, 2, 1, 0);
    short8 R = cast(short8) _mm_insert_epi16(A, 42, 6);
    short[8] correct = [0, 1, 2, 3, 4, 5, 42, 7];
    assert(R.array == correct);

    // The index wraps modulo 8, mirroring _mm_extract_epi16.
    short8 W = cast(short8) _mm_insert_epi16(A, -1, 1 + 8);
    short[8] correctWrapped = [0, -1, 2, 3, 4, 5, 6, 7];
    assert(W.array == correctWrapped);
}
1871 
/// Perform a serializing operation on all load-from-memory instructions that were issued prior 
/// to this instruction. Guarantees that every load instruction that precedes, in program order, 
/// is globally visible before any load instruction which follows the fence in program order.
void _mm_lfence() @trusted
{
    // One implementation is selected at compile time depending on compiler and target.
    version(GNU)
    {
        static if (GDC_with_SSE2)
        {
            __builtin_ia32_lfence();
        }
        else version(X86)
        {
            // GDC extended inline assembly, no operands and no clobbers.
            asm pure nothrow @nogc @trusted
            {
                "lfence;\n" : : : ;
            }
        }
        else
            static assert(false);
    }
    else static if (LDC_with_SSE2)
    {
        __builtin_ia32_lfence();
    }
    else static if (LDC_with_ARM64)
    {
         __builtin_arm_dmb(9);  // dmb ishld
    }
    else static if (DMD_with_asm)
    {
        asm nothrow @nogc pure @safe
        {
            lfence;
        }
    }
    else version(LDC)
    {
        // When the architecture is unknown, generate a full memory barrier,
        // as the semantics of lfence do not really match those of atomics.
        llvm_memory_fence();
    }
    else
        static assert(false);
}
unittest
{
    // Smoke test: only checks that the fence compiles and executes.
    _mm_lfence();
}
1921 
/// Load two packed double-precision (64-bit) floating-point elements (128 bits) from memory.
/// `mem_addr` must be aligned on a 16-byte boundary or a general-protection exception may be generated.
__m128d _mm_load_pd (const(double) * mem_addr) pure
{
    pragma(inline, true);
    // Reinterpret the address as a vector pointer and read through it.
    return *cast(__m128d*)mem_addr;
}
unittest
{
    align(16) double[2] source = [-5.0, 7.0];
    __m128d R = _mm_load_pd(source.ptr);
    assert(R.array == source);
}
1936 
/// Read one double-precision (64-bit) floating-point value from memory and broadcast it to both 
/// elements of the result. `mem_addr` does not need to be aligned on any particular boundary.
__m128d _mm_load_pd1 (const(double)* mem_addr) pure
{
    immutable double value = *mem_addr;
    __m128d broadcast; // PERF =void;
    foreach (lane; 0 .. 2)
    {
        broadcast.ptr[lane] = value;
    }
    return broadcast;
}
unittest
{
    double what = 4;
    __m128d R = _mm_load_pd1(&what);
    double[2] correct = [4.0, 4];
    assert(R.array == correct);
}
1954 
/// Read one double-precision (64-bit) floating-point value from memory into the lower element of 
/// the result; the upper element is zero. `mem_addr` does not need to be aligned on any particular boundary.
__m128d _mm_load_sd (const(double)* mem_addr) pure @trusted
{
    immutable double low = *mem_addr;
    double2 result = [0, 0];
    result.ptr[0] = low;
    return result;
}
unittest
{
    double x = -42;
    __m128d a = _mm_load_sd(&x);
    double[2] correct = [-42.0, 0.0];
    assert(a.array == correct);
}
1969 
/// Load 128 bits of integer data from memory. 
/// `mem_addr` must be aligned on a 16-byte boundary or a general-protection exception may be generated.
__m128i _mm_load_si128 (const(__m128i)* mem_addr) pure @trusted // TODO: shouldn't be trusted because alignment, Issue #62
{
    pragma(inline, true);
    __m128i loaded = *mem_addr;
    return loaded;
}
unittest
{
    align(16) int[4] correct = [-1, 2, 3, 4];
    const(__m128i)* source = cast(__m128i*) correct.ptr;
    int4 A = cast(int4) _mm_load_si128(source);
    assert(A.array == correct);
}
1983 
alias _mm_load1_pd = _mm_load_pd1; /// Alias of `_mm_load_pd1`.
1985 
/// Build a vector whose lower element comes from `a` and whose upper element is the 
/// double-precision (64-bit) floating-point value read from `mem_addr`. 
/// `mem_addr` does not need to be aligned on any particular boundary.
__m128d _mm_loadh_pd (__m128d a, const(double)* mem_addr) pure @trusted
{
    pragma(inline, true);
    immutable double high = *mem_addr;
    a.ptr[1] = high;
    return a;
}
unittest
{
    double upper = 7.0;
    __m128d base = _mm_setr_pd(4.0, -5.0);
    __m128d R = _mm_loadh_pd(base, &upper);
    double[2] correct = [ 4.0, 7.0 ];
    assert(R.array == correct);
}
2002 
/// Load a 64-bit integer from memory into the first element of result. The other element is zero.
// Note: strange signature since the memory doesn't have to be aligned (Issue #60), and doesn't have to be 128-bit
__m128i _mm_loadl_epi64 (const(__m128i)* mem_addr) pure @trusted // TODO signature
{
    pragma(inline, true);
    static if (DMD_with_DSIMD)
    {
        return cast(__m128i) __simd(XMM.LODQ, *cast(__m128i*)mem_addr);
    }
    else
    {
        // Only the first 8 bytes behind mem_addr are read.
        const(long)* source = cast(const(long)*)mem_addr;
        long2 widened = [0, 0];
        widened.ptr[0] = source[0];
        return cast(__m128i) widened;
    }
}
unittest
{
    long A = 0x7878787870707070;
    long2 R = cast(long2) _mm_loadl_epi64(cast(__m128i*)&A);
    long[2] correct = [0x7878787870707070, 0];
    assert(R.array == correct);
}
2027 
/// Build a vector whose upper element comes from `a` and whose lower element is the 
/// double-precision (64-bit) floating-point value read from `mem_addr`. 
/// `mem_addr` does not need to be aligned on any particular boundary.
__m128d _mm_loadl_pd (__m128d a, const(double)* mem_addr) pure @trusted
{
    immutable double low = *mem_addr;
    a.ptr[0] = low;
    return a;
}
unittest
{
    double lower = 7.0;
    __m128d base = _mm_setr_pd(4.0, -5.0);
    __m128d R = _mm_loadl_pd(base, &lower);
    double[2] correct = [ 7.0, -5.0 ];
    assert(R.array == correct);
}
2043 
/// Load 2 double-precision (64-bit) floating-point elements from memory and return them swapped 
/// (memory element 1 in lane 0, memory element 0 in lane 1). 
/// `mem_addr` must be aligned on a 16-byte boundary or a general-protection exception may be generated.
__m128d _mm_loadr_pd (const(double)* mem_addr) pure @trusted
{
    __m128d inOrder = *cast(__m128d*)(mem_addr);
    __m128d swapped; // PERF =void;
    swapped.ptr[1] = inOrder.array[0];
    swapped.ptr[0] = inOrder.array[1];
    return swapped;
}
unittest
{
    align(16) double[2] source = [56.0, -74.0];
    __m128d R = _mm_loadr_pd(source.ptr);
    double[2] correct = [-74.0, 56.0];
    assert(R.array == correct);
}
2061 
/// Load two packed double-precision (64-bit) floating-point elements (128 bits) from memory. 
/// `mem_addr` does not need to be aligned on any particular boundary.
__m128d _mm_loadu_pd (const(double)* mem_addr) pure @trusted
{
    pragma(inline, true);
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_loadupd(mem_addr); 
    }
    else version(LDC)
    {
        return loadUnaligned!(double2)(mem_addr);
    }
    else version(DigitalMars)
    {
        // Apparently inside __simd you can use aligned dereferences without fear.
        // That was issue 23048 on dlang's Bugzilla.
        static if (DMD_with_DSIMD)
        {
            return cast(__m128d)__simd(XMM.LODUPD, *cast(double2*)mem_addr);
        }
        else static if (SSESizedVectorsAreEmulated)
        {
            // An emulated vector carries no alignment constraint,
            // so a plain reinterpreting read is enough.
            return *cast(__m128d*)(mem_addr);
        }
        else
        {
            // Scalar fallback: copy the two doubles one at a time.
            __m128d loaded;
            foreach (lane; 0 .. 2)
            {
                loaded.ptr[lane] = mem_addr[lane];
            }
            return loaded;
        }
    }
    else
    {
        // Scalar fallback: copy the two doubles one at a time.
        __m128d loaded;
        foreach (lane; 0 .. 2)
        {
            loaded.ptr[lane] = mem_addr[lane];
        }
        return loaded;
    }
}
unittest
{
    double[2] source = [56.0, -75.0];
    __m128d R = _mm_loadu_pd(source.ptr);
    double[2] correct = [56.0, -75.0];
    assert(R.array == correct);
}
2112 
/// Load 128 bits of integer data from memory. `mem_addr` does not need to be aligned on any particular boundary.
__m128i _mm_loadu_si128 (const(__m128i)* mem_addr) pure @trusted
{
    // PERF DMD
    pragma(inline, true);
    static if (GDC_with_SSE2)
    {
        return cast(__m128i) __builtin_ia32_loaddqu(cast(const(char*))mem_addr);
    }
    else version(LDC)
    {
        return loadUnaligned!(__m128i)(cast(int*)mem_addr);
    }
    else
    {
        // Scalar fallback: copy the four 32-bit words one at a time.
        const(int)* words = cast(const(int)*)mem_addr;
        __m128i loaded = void;
        foreach (lane; 0 .. 4)
        {
            loaded.ptr[lane] = words[lane];
        }
        return loaded;
    }
}
unittest
{
    align(16) int[4] correct = [-1, 2, -3, 4];
    const(__m128i)* source = cast(__m128i*) correct.ptr;
    int4 A = cast(int4) _mm_loadu_si128(source);
    assert(A.array == correct);
}
2143 
/// Load unaligned 16-bit integer from memory into the first element, fill with zeroes otherwise.
/// Only 2 bytes are read from `mem_addr`; all other 16-bit lanes of the result are zero,
/// including when the loaded value is negative.
__m128i _mm_loadu_si16(const(void)* mem_addr) pure @trusted // TODO: should be @system actually
{
    static if (DMD_with_DSIMD)
    {
        // The value must be zero-extended to 32 bits here: reading it through a signed
        // `short` would sign-extend negative inputs and leak 0xFFFF into the second lane.
        int r = *cast(const(ushort)*)(mem_addr);
        return cast(__m128i) __simd(XMM.LODD, *cast(__m128i*)&r);
    }
    else version(DigitalMars)
    {
        // Workaround issue: https://issues.dlang.org/show_bug.cgi?id=21672
        // DMD cannot handle the below code...
        align(16) short[8] r = [0, 0, 0, 0, 0, 0, 0, 0];
        r[0] = *cast(short*)(mem_addr);
        return *cast(int4*)(r.ptr);
    }
    else
    {
        short r = *cast(short*)(mem_addr);
        short8 result = [0, 0, 0, 0, 0, 0, 0, 0];
        result.ptr[0] = r;
        return cast(__m128i)result;
    }
}
unittest
{
    short r = 13;
    short8 A = cast(short8) _mm_loadu_si16(&r);
    short[8] correct = [13, 0, 0, 0, 0, 0, 0, 0];
    assert(A.array == correct);

    // A negative value must not spill its sign bits into the upper lanes.
    short negative = -2;
    short8 B = cast(short8) _mm_loadu_si16(&negative);
    short[8] correctNegative = [-2, 0, 0, 0, 0, 0, 0, 0];
    assert(B.array == correctNegative);
}
2175 
/// Load an unaligned 32-bit integer from memory into the first element of result; 
/// the three other elements are zero.
__m128i _mm_loadu_si32 (const(void)* mem_addr) pure @trusted // TODO: should be @system actually
{
    pragma(inline, true);
    int4 result = [0, 0, 0, 0];
    result.ptr[0] = *cast(int*)(mem_addr);
    return result;
}
unittest
{
    int value = 42;
    __m128i A = _mm_loadu_si32(&value);
    int[4] correct = [42, 0, 0, 0];
    assert(A.array == correct);
}
2192 
/// Load an unaligned 64-bit integer from memory into the first element of result.
/// The upper 64 bits are zero.
__m128i _mm_loadu_si64 (const(void)* mem_addr) pure @system
{
    pragma(inline, true);
    static if (DMD_with_DSIMD)
    {
        return cast(__m128i) __simd(XMM.LODQ, *cast(__m128i*)mem_addr);
    }
    else
    {
        // Only the first 8 bytes behind mem_addr are read.
        const(long)* source = cast(const(long)*)mem_addr;
        long2 widened = [0, 0];
        widened.ptr[0] = source[0];
        return cast(__m128i) widened;
    }
}
unittest
{
    long value = 446446446446;
    long2 A = cast(long2) _mm_loadu_si64(&value);
    long[2] correct = [446446446446, 0];
    assert(A.array == correct);
}
2217 
/// Multiply the packed signed 16-bit integers of `a` and `b` lane by lane into signed 32-bit 
/// products, then add each adjacent pair of products, yielding four 32-bit sums.
__m128i _mm_madd_epi16 (__m128i a, __m128i b) pure @trusted
{
    static if (GDC_with_SSE2 || LDC_with_SSE2)
    {
        return cast(__m128i) __builtin_ia32_pmaddwd128(cast(short8)a, cast(short8)b);
    }
    else static if (LDC_with_ARM64)
    {
        short8 lhs = cast(short8)a;
        short8 rhs = cast(short8)b;
        // Widening multiplies of the low and high halves, then pairwise adds.
        int4 productsLow  = vmull_s16(vget_low_s16(lhs),  vget_low_s16(rhs));
        int4 productsHigh = vmull_s16(vget_high_s16(lhs), vget_high_s16(rhs));
        int2 sumsLow  = vpadd_s32(vget_low_s32(productsLow),  vget_high_s32(productsLow));
        int2 sumsHigh = vpadd_s32(vget_low_s32(productsHigh), vget_high_s32(productsHigh));
        return vcombine_s32(sumsLow, sumsHigh);
    }
    else
    {
        short8 lhs = cast(short8)a;
        short8 rhs = cast(short8)b;
        int4 sums;
        foreach(pair; 0..4)
        {
            // short * short is computed in int; the final addition wraps on overflow.
            immutable int even = lhs.array[2*pair]   * rhs.array[2*pair];
            immutable int odd  = lhs.array[2*pair+1] * rhs.array[2*pair+1];
            sums.ptr[pair] = even + odd;
        }
        return sums;
    }
}
unittest
{
    short8 A = [0, 1, 2, 3, -32768, -32768, 32767, 32767];
    short8 B = [0, 1, 2, 3, -32768, -32768, 32767, 32767];
    int4 R = _mm_madd_epi16(cast(__m128i)A, cast(__m128i)B);
    int[4] correct = [1, 13, -2147483648, 2*32767*32767];
    assert(R.array == correct);
}
2259 
/// Conditionally store 8-bit integer elements from `a` into memory using `mask`
/// (elements are not stored when the highest bit is not set in the corresponding element)
/// and a non-temporal memory hint. `mem_addr` does not need to be aligned on any particular
/// boundary.
void _mm_maskmoveu_si128 (__m128i a, __m128i mask, void* mem_addr) @trusted
{
    static if (GDC_with_SSE2)
    {    
        return __builtin_ia32_maskmovdqu(cast(ubyte16)a, cast(ubyte16)mask, cast(char*)mem_addr);
    }
    else static if (LDC_with_SSE2)
    {
        return __builtin_ia32_maskmovdqu(cast(byte16)a, cast(byte16)mask, cast(char*)mem_addr);
    }
    else static if (LDC_with_ARM64)
    {
        // PERF: catastrophic on ARM32
        // Note: this path reads all 16 destination bytes, blends, and writes all 16 bytes back,
        // so bytes whose mask bit is clear are rewritten with their previous value.
        byte16 bmask  = cast(byte16)mask;
        byte16 shift = 7;
        bmask = bmask >> shift; // sign-extend to have a 0xff or 0x00 mask
        mask = cast(__m128i) bmask;
        __m128i dest = loadUnaligned!__m128i(cast(int*)mem_addr);
        dest = (a & mask) | (dest & ~mask);
        storeUnaligned!__m128i(dest, cast(int*)mem_addr);
    }
    else
    {
        // Scalar fallback: write byte j only when bit 7 of the matching mask byte is set.
        byte16 b = cast(byte16)a;
        byte16 m = cast(byte16)mask;
        byte* dest = cast(byte*)(mem_addr);
        foreach(j; 0..16)
        {
            if (m.array[j] & 128)
            {
                dest[j] = b.array[j];
            }
        }
    }
}
unittest
{
    // Bytes whose mask element is non-negative (0 or 1 here) must keep the value 42.
    ubyte[16] dest =           [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42];
    __m128i mask = _mm_setr_epi8(0,-1, 0,-1,-1, 1,-1,-1, 0,-1,-4,-1,-1, 0,-127, 0);
    __m128i A    = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15);
    _mm_maskmoveu_si128(A, mask, dest.ptr);
    ubyte[16] correct =        [42, 1,42, 3, 4,42, 6, 7,42, 9,10,11,12,42,14,42];
    assert(dest == correct);
}
2308 
/// Lane-wise maximum of the packed signed 16-bit integers in `a` and `b`.
__m128i _mm_max_epi16 (__m128i a, __m128i b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return cast(__m128i) __builtin_ia32_pmaxsw128(cast(short8)a, cast(short8)b);
    }
    else version(LDC)
    {
        // x86: pmaxsw since LDC 1.0 -O1
        // ARM: smax.8h since LDC 1.5 -O1
        short8 lhs = cast(short8)a;
        short8 rhs = cast(short8)b;
        short8 pickLhs = greaterMask!short8(lhs, rhs);
        short8 fromLhs = pickLhs & lhs;
        short8 fromRhs = ~pickLhs & rhs;
        return cast(__m128i)(fromLhs | fromRhs);
    }
    else
    {
        // All-ones in the lanes where `a` wins, zero elsewhere.
        __m128i aIsGreater = _mm_cmpgt_epi16(a, b);
        // b ^ (a ^ b) == a, so xor-ing the masked difference into b swaps in a's lanes.
        __m128i difference = _mm_xor_si128(a, b);
        __m128i toFlip = _mm_and_si128(difference, aIsGreater);
        return _mm_xor_si128(b, toFlip);
    }
}
unittest
{
    short8 R = cast(short8) _mm_max_epi16(_mm_setr_epi16(32767, 1, -4, -8, 9,  7, 0,-57),
                                          _mm_setr_epi16(-4,-8,  9,  7, 0,-32768, 0,  0));
    short[8] correct =                                  [32767, 1,  9,  7, 9,  7, 0,  0];
    assert(R.array == correct);
}
2340 
/// Lane-wise maximum of the packed unsigned 8-bit integers in `a` and `b`.
__m128i _mm_max_epu8 (__m128i a, __m128i b) pure @safe
{
    version(LDC)
    {
        // x86: pmaxub since LDC 1.0.0 -O1
        // ARM64: umax.16b since LDC 1.5.0 -O1
        // PERF: catastrophic on ARM32
        ubyte16 lhs = cast(ubyte16)a;
        ubyte16 rhs = cast(ubyte16)b;
        ubyte16 pickLhs = cast(ubyte16) greaterMask!ubyte16(lhs, rhs);
        ubyte16 fromLhs = pickLhs & lhs;
        ubyte16 fromRhs = ~pickLhs & rhs;
        return cast(__m128i)(fromLhs | fromRhs);
    }
    else
    {
        // Adding -128 to both sides lets the signed comparison order the bytes as unsigned.
        __m128i bias = _mm_set1_epi8(-128);
        __m128i biasedA = _mm_add_epi8(a, bias);
        __m128i biasedB = _mm_add_epi8(b, bias);
        __m128i aIsHigher = _mm_cmpgt_epi8(biasedA, biasedB);
        // b ^ (a ^ b) == a, so xor-ing the masked difference into b swaps in a's lanes.
        __m128i difference = _mm_xor_si128(a, b);
        __m128i toFlip = _mm_and_si128(difference, aIsHigher);
        return _mm_xor_si128(b, toFlip);
    }
}
unittest
{
    byte16 R = cast(byte16) _mm_max_epu8(_mm_setr_epi8(45, 1, -4, -8, 9,  7, 0,-57, -4,-8,  9,  7, 0,-57, 0,  0),
                                         _mm_setr_epi8(-4,-8,  9,  7, 0,-57, 0,  0, 45, 1, -4, -8, 9,  7, 0,-57));
    byte[16] correct =                                [-4,-8, -4, -8, 9,-57, 0,-57, -4,-8, -4, -8, 9,-57, 0,-57];
    assert(R.array == correct);
}
2370 
/// Lane-wise maximum of the packed double-precision (64-bit) floating-point elements 
/// in `a` and `b`.
__m128d _mm_max_pd (__m128d a, __m128d b) pure @trusted
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_maxpd(a, b);
    }
    else
    {
        // x86: Generates maxpd starting with LDC 1.9 -O2
        foreach (lane; 0 .. 2)
        {
            immutable double x = a.array[lane];
            immutable double y = b.array[lane];
            // When the comparison is false (including unordered), the element of `b` is kept.
            a.ptr[lane] = (x > y) ? x : y;
        }
        return a;
    }
}
unittest
{
    __m128d A = _mm_setr_pd(4.0, 1.0);
    __m128d B = _mm_setr_pd(1.0, 8.0);
    __m128d M = _mm_max_pd(A, B);
    assert(M.array[0] == 4.0);
    assert(M.array[1] == 8.0);
}
2395 
/// Take the maximum of the lower double-precision (64-bit) floating-point elements of `a` and `b` 
/// as the lower element of result; the upper element of result is copied from `a`.
__m128d _mm_max_sd (__m128d a, __m128d b) pure @trusted
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_maxsd(a, b);
    }
    else
    {
        // Generates maxsd starting with LDC 1.3
        immutable double x = a.array[0];
        immutable double y = b.array[0];
        __m128d result = a;
        result.ptr[0] = (x > y) ? x : y;
        return result;
    }
}
unittest
{
    __m128d A = _mm_setr_pd(1.0, 1.0);
    __m128d B = _mm_setr_pd(4.0, 2.0);
    __m128d M = _mm_max_sd(A, B);
    assert(M.array[0] == 4.0);
    assert(M.array[1] == 1.0);
}
2420 
/// Perform a serializing operation on all load-from-memory and store-to-memory instructions that were issued prior to 
/// this instruction. Guarantees that every memory access that precedes, in program order, the memory fence instruction 
/// is globally visible before any memory instruction which follows the fence in program order.
void _mm_mfence() @trusted // not pure!
{
    // One implementation is selected at compile time depending on compiler and target.
    version(GNU)
    {
        static if (GDC_with_SSE2)
        {
            __builtin_ia32_mfence();
        }
        else version(X86)
        {
            // GDC extended inline assembly, no operands and no clobbers.
            asm pure nothrow @nogc @trusted
            {
                "mfence;\n" : : : ;
            }
        }
        else
            static assert(false);
    }
    else static if (LDC_with_SSE2)
    {
        __builtin_ia32_mfence();
    }
    else static if (DMD_with_asm)
    {
        asm nothrow @nogc pure @safe
        {
            mfence;
        }
    }
    else version(LDC)
    {
        // Note: will generate the DMB ish instruction on ARM
        llvm_memory_fence();
    }
    else
        static assert(false);
}
unittest
{
    // Smoke test: only checks that the fence compiles and executes.
    _mm_mfence();
}
2465 
/// Lane-wise minimum of the packed signed 16-bit integers in `a` and `b`.
__m128i _mm_min_epi16 (__m128i a, __m128i b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return cast(__m128i) __builtin_ia32_pminsw128(cast(short8)a, cast(short8)b);
    }
    else version(LDC)
    {
        // x86: pminsw since LDC 1.0 -O1
        // ARM64: smin.8h since LDC 1.5 -O1
        short8 lhs = cast(short8)a;
        short8 rhs = cast(short8)b;
        short8 lhsIsGreater = greaterMask!short8(lhs, rhs);
        short8 fromLhs = ~lhsIsGreater & lhs;
        short8 fromRhs = lhsIsGreater & rhs;
        return cast(__m128i)(fromLhs | fromRhs);
    }
    else
    {
        // All-ones in the lanes where `a` is the smaller value, zero elsewhere.
        __m128i aIsLower = _mm_cmplt_epi16(a, b);
        // b ^ (a ^ b) == a, so xor-ing the masked difference into b swaps in a's lanes.
        __m128i difference = _mm_xor_si128(a, b);
        __m128i toFlip = _mm_and_si128(difference, aIsLower);
        return _mm_xor_si128(b, toFlip);
    }
}
unittest
{
    short8 R = cast(short8) _mm_min_epi16(_mm_setr_epi16(45, 1, -4, -8, 9,  7, 0,-32768),
                                          _mm_setr_epi16(-4,-8,  9,  7, 0,-57, 0,  0));
    short[8] correct =                                  [-4,-8, -4, -8, 0,-57, 0, -32768];
    assert(R.array == correct);
}
2497 
/// Lane-wise minimum of the packed unsigned 8-bit integers in `a` and `b`.
__m128i _mm_min_epu8 (__m128i a, __m128i b) pure @safe
{
    version(LDC)
    {
        // x86: pminub since LDC 1.0.0 -O1
        // ARM: umin.16b since LDC 1.5.0 -O1
        // PERF: catastrophic on ARM32
        ubyte16 lhs = cast(ubyte16)a;
        ubyte16 rhs = cast(ubyte16)b;
        ubyte16 lhsIsGreater = cast(ubyte16) greaterMask!ubyte16(lhs, rhs);
        ubyte16 fromLhs = ~lhsIsGreater & lhs;
        ubyte16 fromRhs = lhsIsGreater & rhs;
        return cast(__m128i)(fromLhs | fromRhs);
    }
    else
    {
        // Adding -128 to both sides lets the signed comparison order the bytes as unsigned.
        __m128i bias = _mm_set1_epi8(-128);
        __m128i biasedA = _mm_add_epi8(a, bias);
        __m128i biasedB = _mm_add_epi8(b, bias);
        __m128i aIsLower = _mm_cmplt_epi8(biasedA, biasedB);
        // b ^ (a ^ b) == a, so xor-ing the masked difference into b swaps in a's lanes.
        __m128i difference = _mm_xor_si128(a, b);
        __m128i toFlip = _mm_and_si128(difference, aIsLower);
        return _mm_xor_si128(b, toFlip);
    }
}
unittest
{
    byte16 R = cast(byte16) _mm_min_epu8(_mm_setr_epi8(45, 1, -4, -8, 9,  7, 0,-57, -4,-8,  9,  7, 0,-57, 0,  0),
                                         _mm_setr_epi8(-4,-8,  9,  7, 0,-57, 0,  0, 45, 1, -4, -8, 9,  7, 0,-57));
    byte[16] correct =                                [45, 1,  9,  7, 0,  7, 0,  0, 45, 1,  9,  7, 0,  7, 0,  0];
    assert(R.array == correct);
}
2527 
/// Lane-wise minimum of the packed double-precision (64-bit) floating-point elements in `a` and `b`.
__m128d _mm_min_pd (__m128d a, __m128d b) pure @trusted
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_minpd(a, b);
    }
    else
    {
        // Generates minpd starting with LDC 1.9
        foreach (lane; 0 .. 2)
        {
            immutable double x = a.array[lane];
            immutable double y = b.array[lane];
            // When the comparison is false (including unordered), the element of `b` is kept.
            a.ptr[lane] = (x < y) ? x : y;
        }
        return a;
    }
}
unittest
{
    __m128d A = _mm_setr_pd(1.0, 2.0);
    __m128d B = _mm_setr_pd(4.0, 1.0);
    __m128d M = _mm_min_pd(A, B);
    assert(M.array[0] == 1.0);
    assert(M.array[1] == 1.0);
}
2551 
/// Take the minimum of the lower double-precision (64-bit) floating-point elements of `a` and `b` 
/// as the lower element of result; the upper element of result is copied from `a`.
__m128d _mm_min_sd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_minsd(a, b);
    }
    else
    {
        // Generates minsd starting with LDC 1.3
        immutable double x = a.array[0];
        immutable double y = b.array[0];
        __m128d result = a;
        result.array[0] = (x < y) ? x : y;
        return result;
    }
}
unittest
{
    __m128d A = _mm_setr_pd(1.0, 3.0);
    __m128d B = _mm_setr_pd(4.0, 2.0);
    __m128d M = _mm_min_sd(A, B);
    assert(M.array[0] == 1.0);
    assert(M.array[1] == 3.0);
}
2576 
/// Return a vector whose lower 64-bit integer is the lower 64-bit integer of `a` and whose 
/// upper 64-bit integer is zero.
__m128i _mm_move_epi64 (__m128i a) pure @trusted
{
    static if (GDC_with_SSE2)
    {
        // slightly better with GDC -O0
        return cast(__m128i) __builtin_ia32_movq128(cast(long2)a); 
    }
    else
    {
        long2 source = cast(long2) a;
        long2 lowOnly = [ 0, 0 ];
        lowOnly.ptr[0] = source.array[0];
        return cast(__m128i) lowOnly;
    }
}
unittest
{
    long2 A = [13, 47];
    long2 B = cast(long2) _mm_move_epi64( cast(__m128i)A );
    long[2] correct = [13, 0];
    assert(B.array == correct);
}
2600 
/// Combine the lower double-precision (64-bit) floating-point element of `b` with the upper 
/// element of `a`: result is `[b[0], a[1]]`.
__m128d _mm_move_sd (__m128d a, __m128d b) pure @trusted
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_movsd(a, b); 
    }
    else
    {
        // Start from `b` (which supplies lane 0) and patch lane 1 from `a`.
        __m128d merged = b;
        merged.ptr[1] = a.array[1];
        return merged;
    }
}
unittest
{
    double2 A = [13.0, 47.0];
    double2 B = [34.0, 58.0];
    double2 C = _mm_move_sd(A, B);
    double[2] correct = [34.0, 47.0];
    assert(C.array == correct);
}
2623 
/// Create mask from the most significant bit of each 8-bit element in `a`.
/// Bit `i` of the result is the sign bit of byte `i`; the result uses the low 16 bits.
int _mm_movemask_epi8 (__m128i a) pure @trusted
{
    // PERF: Not possible in D_SIMD because of https://issues.dlang.org/show_bug.cgi?id=8047
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_pmovmskb128(cast(ubyte16)a);
    }
    else static if (LDC_with_SSE2)
    {
        return __builtin_ia32_pmovmskb128(cast(byte16)a);
    }
    else static if (LDC_with_ARM64)
    {
        // Solution from https://stackoverflow.com/questions/11870910/sse-mm-movemask-epi8-equivalent-method-for-arm-neon
        // The other two solutions lead to unfound intrinsics in LLVM and that took a long time.
        // SO there might be something a bit faster, but this one is reasonable and branchless.
        byte8 mask_shift;
        mask_shift.ptr[0] = 7;
        mask_shift.ptr[1] = 6;
        mask_shift.ptr[2] = 5;
        mask_shift.ptr[3] = 4;
        mask_shift.ptr[4] = 3;
        mask_shift.ptr[5] = 2;
        mask_shift.ptr[6] = 1;
        mask_shift.ptr[7] = 0;
        byte8 mask_and = byte8(-128);
        byte8 lo = vget_low_u8(cast(byte16)a);
        byte8 hi = vget_high_u8(cast(byte16)a);
        lo = vand_u8(lo, mask_and);
        lo = vshr_u8(lo, mask_shift);
        hi = vand_u8(hi, mask_and);
        hi = vshr_u8(hi, mask_shift);
        // Three pairwise adds per half fold the eight per-byte bits into element 0.
        lo = vpadd_u8(lo,lo);
        lo = vpadd_u8(lo,lo);
        lo = vpadd_u8(lo,lo);
        hi = vpadd_u8(hi,hi);
        hi = vpadd_u8(hi,hi);
        hi = vpadd_u8(hi,hi);
        return (cast(ubyte)(hi[0]) << 8) | cast(ubyte)(lo[0]);
    }
    else
    {
        // Scalar fallback: a negative signed byte has its most significant bit set.
        byte16 ai = cast(byte16)a;
        int r = 0;
        foreach(bit; 0..16)
        {
            if (ai.array[bit] < 0) r += (1 << bit);
        }
        return r;
    }
}
unittest
{
    // _mm_set_epi8 lists the bytes from the highest lane down to lane 0.
    assert(0x9C36 == _mm_movemask_epi8(_mm_set_epi8(-1, 1, 2, -3, -1, -1, 4, 8, 127, 0, -1, -1, 0, -1, -1, 0)));
}
2680 
/// Create mask from the most significant bit of each 16-bit element in `a`. #BONUS
int _mm_movemask_epi16 (__m128i a) pure @trusted
{
    // Pack the 16-bit lanes down to bytes (with zeroes as the second operand),
    // then reuse the byte-wise movemask.
    __m128i zero = _mm_setzero_si128();
    __m128i packed = _mm_packs_epi16(a, zero);
    return _mm_movemask_epi8(packed);
}
unittest
{
    __m128i A = _mm_set_epi16(-1, 1, 2, -3, -32768, -1, 32767, 8);
    assert(0x9C == _mm_movemask_epi16(A));
}
2690 
/// Set each bit of mask result based on the most significant bit of the corresponding packed 
/// double-precision (64-bit) floating-point element in `v`.
int _mm_movemask_pd(__m128d v) pure @safe
{
    // PERF: Not possible in D_SIMD because of https://issues.dlang.org/show_bug.cgi?id=8047
    static if (GDC_or_LDC_with_SSE2)
    {
        return __builtin_ia32_movmskpd(v);
    }
    else
    {
        // Viewed as signed 64-bit integers, a lane is negative exactly when its top bit is set.
        long2 bits = cast(long2)v;
        immutable int lowBit  = (bits.array[0] < 0) ? 1 : 0;
        immutable int highBit = (bits.array[1] < 0) ? 2 : 0;
        return lowBit | highBit;
    }
}
unittest
{
    __m128d A = cast(__m128d) _mm_set_epi64x(-1, 0);
    assert(_mm_movemask_pd(A) == 2);
}
2714 
/// Return the lower 64-bit integer of `v` as an `__m64`.
__m64 _mm_movepi64_pi64 (__m128i v) pure @safe
{
    long2 halves = cast(long2)v;
    long low = halves.array[0];
    return long1(low);
}
unittest
{
    __m128i source = _mm_set_epi64x(-1, -2);
    __m64 low = _mm_movepi64_pi64(source);
    assert(low.array[0] == -2);
}
2727 
/// Copy the 64-bit integer `a` to the lower element of dest, and zero the upper element.
__m128i _mm_movpi64_epi64 (__m64 a) pure @trusted
{
    long2 r;
    r.ptr[0] = a.array[0]; // lower 64 bits: the value carried by `a`
    r.ptr[1] = 0;          // upper 64 bits: explicitly cleared
    return cast(__m128i)r;
}
unittest
{
    __m128i A = _mm_movpi64_epi64(_mm_cvtsi64_m64(-2));
    long2 R = cast(long2) A;
    long[2] correct = [-2, 0];
    assert(R.array == correct);
}
2736 
/// Multiply the low unsigned 32-bit integers from each packed 64-bit element in `a` and `b`, 
/// and store the unsigned 64-bit results.
/// Only 32-bit elements 0 and 2 of `a` and `b` are read; elements 1 and 3 do not affect the result.
__m128i _mm_mul_epu32 (__m128i a, __m128i b) pure @trusted
{    
    // PERF DMD D_SIMD
    static if (GDC_with_SSE2)
    {
        return cast(__m128i) __builtin_ia32_pmuludq128 (a, b);
    }
    else
    {
        // Step 1: build `la` and `lb`, which hold 32-bit elements 0 and 2 of `a` and `b`
        // zero-extended to 64 bits. `version` and `static if` do not open a new scope, so the
        // branch that gets compiled is the one declaring the `la`/`lb` used by step 2.
        version(LDC)
        {
            static if (__VERSION__ >= 2088)
            {
                // Need LLVM9 for proper optimization
                long2 la, lb;
                // cast(uint) makes the widening to long a zero-extension rather than a sign-extension.
                la.ptr[0] = cast(uint)a.array[0];
                la.ptr[1] = cast(uint)a.array[2];
                lb.ptr[0] = cast(uint)b.array[0];
                lb.ptr[1] = cast(uint)b.array[2];
            }
            else
            {
                __m128i zero;
                zero = 0;
                // NOTE(review): assuming shufflevectorLDC follows the LLVM convention where indices
                // 4..7 select from the second operand, this yields [a0, 0, a2, 0] - confirm.
                long2 la = cast(long2) shufflevectorLDC!(int4, 0, 4, 2, 6)(a, zero);
                long2 lb = cast(long2) shufflevectorLDC!(int4, 0, 4, 2, 6)(b, zero);
            }
        }
        else
        {
            long2 la, lb;
            // Same zero-extension as the LDC scalar path above.
            la.ptr[0] = cast(uint)a.array[0];
            la.ptr[1] = cast(uint)a.array[2];
            lb.ptr[0] = cast(uint)b.array[0];
            lb.ptr[1] = cast(uint)b.array[2];
        }

        // Step 2: multiply the two 64-bit lanes, as a vector operation where the compiler
        // supports it and lane by lane otherwise.
        version(DigitalMars)
        {
            // DMD has no long2 mul
            la.ptr[0] *= lb.array[0];
            la.ptr[1] *= lb.array[1];
            return cast(__m128i)(la);
        }
        else
        {
            static if (__VERSION__ >= 2076)
            {
                return cast(__m128i)(la * lb);
            }
            else
            {
                // long2 mul not supported before LDC 1.5
                la.ptr[0] *= lb.array[0];
                la.ptr[1] *= lb.array[1];
                return cast(__m128i)(la);
            }
        }
    }
}
unittest
{
    // The 42s sit in the odd 32-bit elements, which must not influence the result.
    __m128i A = _mm_set_epi32(42, 0xDEADBEEF, 42, 0xffffffff);
    __m128i B = _mm_set_epi32(42, 0xCAFEBABE, 42, 0xffffffff);
    __m128i C = _mm_mul_epu32(A, B);
    long2 LC = cast(long2)C;
    assert(LC.array[0] == 18446744065119617025uL); // 0xffffffff * 0xffffffff
    assert(LC.array[1] == 12723420444339690338uL); // 0xDEADBEEF * 0xCAFEBABE
}
2808 
/// Element-wise product of the two double-precision (64-bit) floating-point elements of `a` and `b`.
__m128d _mm_mul_pd(__m128d a, __m128d b) pure @safe
{
    pragma(inline, true);
    __m128d product = a * b;
    return product;
}
unittest
{
    __m128d V = [-2.0, 1.5];
    __m128d squared = _mm_mul_pd(V, V);
    double[2] correct = [4.0, 2.25];
    assert(squared.array == correct);
}
2821 
/// Multiply the lower double-precision (64-bit) floating-point element of `a` by the lower element of `b`.
/// The product becomes the lower element of the result; the upper element is copied from `a`.
__m128d _mm_mul_sd(__m128d a, __m128d b) pure @trusted
{
    version(DigitalMars)
    {
        // Work-around for https://issues.dlang.org/show_bug.cgi?id=19599
        // Note that this is unneeded since DMD >= 2.094.0 at least, haven't investigated again
        asm pure nothrow @nogc @trusted { nop;}
        a.array[0] = a.array[0] * b.array[0];
        return a;
    }
    else static if (GDC_with_SSE2)
    {
        return __builtin_ia32_mulsd(a, b);
    }
    else
    {
        // `a` is a by-value copy: overwrite its lower lane and hand it back.
        double low = a.array[0] * b.array[0];
        a.ptr[0] = low;
        return a;
    }
}
unittest
{
    __m128d V = [-2.0, 1.5];
    __m128d R = _mm_mul_sd(V, V);
    double[2] correct = [4.0, 1.5];
    assert(R.array == correct);
}
2850 
/// Multiply the low unsigned 32-bit integers of `a` and `b` and return the unsigned 64-bit product.
/// Implemented by widening both operands and delegating to `_mm_mul_epu32`.
__m64 _mm_mul_su32 (__m64 a, __m64 b) pure @safe
{
    auto wideA = to_m128i(a);
    auto wideB = to_m128i(b);
    auto product = _mm_mul_epu32(wideA, wideB);
    return to_m64(product);
}
unittest
{
    __m64 lhs = _mm_set_pi32(42, 0xDEADBEEF);
    __m64 rhs = _mm_set_pi32(42, 0xCAFEBABE);
    __m64 product = _mm_mul_su32(lhs, rhs);
    assert(product.array[0] == 0xDEADBEEFuL * 0xCAFEBABEuL);
}
2864 
/// Multiply the packed signed 16-bit integers in `a` and `b` into 32-bit intermediates and return
/// the high 16 bits of each intermediate.
__m128i _mm_mulhi_epi16 (__m128i a, __m128i b) pure @trusted
{
    // GDC and LDC expose the same builtin when SSE2 is available.
    static if (GDC_with_SSE2 || LDC_with_SSE2)
    {
        return cast(__m128i) __builtin_ia32_pmulhw128(cast(short8)a, cast(short8)b);
    }
    else
    {
        // ARM64: LDC 1.5 -O2 or later gives a nice sequence with 2 x ext.16b, 2 x smull.4s and shrn.4h shrn2.8h
        //        PERF: it seems the simde solution has one less instruction in ARM64.
        // PERF: Catastrophic in ARM32.
        short8 wordsA = cast(short8)a;
        short8 wordsB = cast(short8)b;
        short8 high = void; // all eight lanes are written below
        // The short operands are promoted to int, so each product is a full 32-bit value.
        high.ptr[0] = (wordsA.array[0] * wordsB.array[0]) >> 16;
        high.ptr[1] = (wordsA.array[1] * wordsB.array[1]) >> 16;
        high.ptr[2] = (wordsA.array[2] * wordsB.array[2]) >> 16;
        high.ptr[3] = (wordsA.array[3] * wordsB.array[3]) >> 16;
        high.ptr[4] = (wordsA.array[4] * wordsB.array[4]) >> 16;
        high.ptr[5] = (wordsA.array[5] * wordsB.array[5]) >> 16;
        high.ptr[6] = (wordsA.array[6] * wordsB.array[6]) >> 16;
        high.ptr[7] = (wordsA.array[7] * wordsB.array[7]) >> 16;
        return cast(__m128i)high;
    }
}
unittest
{
    __m128i values = _mm_setr_epi16(0, -16, 2, 3, 4, 8, 16, 7);
    __m128i quarter = _mm_set1_epi16(16384);
    short8 R = cast(short8)_mm_mulhi_epi16(values, quarter);
    short[8] correct = [0, -4, 0, 0, 1, 2, 4, 1];
    assert(R.array == correct);
}
2904 
/// Multiply the packed unsigned 16-bit integers in `a` and `b` into 32-bit intermediates and return
/// the high 16 bits of each intermediate.
__m128i _mm_mulhi_epu16 (__m128i a, __m128i b) pure @trusted
{
    // GDC and LDC expose the same builtin when SSE2 is available.
    static if (GDC_with_SSE2 || LDC_with_SSE2)
    {
        return cast(__m128i) __builtin_ia32_pmulhuw128(cast(short8)a, cast(short8)b);
    }
    else
    {
        // ARM64: LDC 1.5 -O2 or later gives a nice sequence with 2 x ext.16b, 2 x umull.4s and shrn.4h shrn2.8h
        //      it seems the simde solution has one less instruction in ARM64
        // PERF: Catastrophic in ARM32.
        short8 wordsA = cast(short8)a;
        short8 wordsB = cast(short8)b;
        short8 high = void; // all eight lanes are written below
        // Each lane is reinterpreted as ushort before multiplying; the final cast keeps the
        // 16 bits that the shift moved into the low half.
        high.ptr[0] = cast(short)( (cast(ushort)wordsA.array[0] * cast(ushort)wordsB.array[0]) >> 16 );
        high.ptr[1] = cast(short)( (cast(ushort)wordsA.array[1] * cast(ushort)wordsB.array[1]) >> 16 );
        high.ptr[2] = cast(short)( (cast(ushort)wordsA.array[2] * cast(ushort)wordsB.array[2]) >> 16 );
        high.ptr[3] = cast(short)( (cast(ushort)wordsA.array[3] * cast(ushort)wordsB.array[3]) >> 16 );
        high.ptr[4] = cast(short)( (cast(ushort)wordsA.array[4] * cast(ushort)wordsB.array[4]) >> 16 );
        high.ptr[5] = cast(short)( (cast(ushort)wordsA.array[5] * cast(ushort)wordsB.array[5]) >> 16 );
        high.ptr[6] = cast(short)( (cast(ushort)wordsA.array[6] * cast(ushort)wordsB.array[6]) >> 16 );
        high.ptr[7] = cast(short)( (cast(ushort)wordsA.array[7] * cast(ushort)wordsB.array[7]) >> 16 );
        return cast(__m128i)high;
    }
}
unittest
{
    __m128i values = _mm_setr_epi16(0, -16, 2, 3, 4, 8, 16, 7);
    __m128i quarter = _mm_set1_epi16(16384);
    short8 R = cast(short8)_mm_mulhi_epu16(values, quarter);
    short[8] correct = [0, 0x3FFC, 0, 0, 1, 2, 4, 1];
    assert(R.array == correct);
}
2944 
/// Multiply the packed 16-bit integers in `a` and `b` and keep the low 16 bits of each product.
__m128i _mm_mullo_epi16 (__m128i a, __m128i b) pure @safe
{
    short8 wordsA = cast(short8)a;
    short8 wordsB = cast(short8)b;
    return cast(__m128i)(wordsA * wordsB);
}
unittest
{
    __m128i values = _mm_setr_epi16(16384, -16, 0,      3, 4, 1, 16, 7);
    __m128i factor = _mm_set1_epi16(16384);
    short8 R = cast(short8)_mm_mullo_epi16(values, factor);
    short[8] correct = [0, 0, 0, -16384, 0, 16384, 0, -16384];
    assert(R.array == correct);
}
2959 
/// Flip every one of the 128 bits in `a` (bitwise NOT). #BONUS
__m128i _mm_not_si128 (__m128i a) pure @safe
{
    __m128i inverted = ~a;
    return inverted;
}
unittest
{
    __m128i input = _mm_set1_epi32(-748);
    int4 inverted = cast(int4) _mm_not_si128(input);
    int[4] correct = [747, 747, 747, 747];
    assert(inverted.array == correct);
}
2972 
/// Compute the bitwise OR of packed double-precision (64-bit) floating-point elements in `a` and `b`.
/// The operation works on the raw bit patterns of the elements, not on their numeric values.
__m128d _mm_or_pd (__m128d a, __m128d b) pure @safe
{
    pragma(inline, true);
    return cast(__m128d)( cast(__m128i)a | cast(__m128i)b );
}
unittest
{
    // OR-ing in the sign bit of each 64-bit lane negates both doubles.
    __m128d A = _mm_setr_pd(1.0, 4.0);
    __m128d signBits = cast(__m128d) _mm_set1_epi64x(long.min);
    __m128d R = _mm_or_pd(A, signBits);
    double[2] correct = [-1.0, -4.0];
    assert(R.array == correct);
}
2979 
/// Compute the bitwise OR of 128 bits (representing integer data) in `a` and `b`.
__m128i _mm_or_si128 (__m128i a, __m128i b) pure @safe
{
    pragma(inline, true);
    return a | b;
}
unittest
{
    __m128i A = _mm_setr_epi32(1, 2, 4, 8);
    __m128i B = _mm_set1_epi32(1);
    __m128i R = _mm_or_si128(A, B);
    int[4] correct = [1, 3, 5, 9];
    assert(R.array == correct);
}
2986 
/// Narrow the signed 32-bit integers of `a` (result elements 0..3) and `b` (result elements 4..7)
/// to 16-bit integers using signed saturation.
__m128i _mm_packs_epi32 (__m128i a, __m128i b) pure @trusted
{
    // GDC and LDC expose the same builtin when SSE2 is available.
    static if (GDC_with_SSE2 || LDC_with_SSE2)
    {
        return cast(__m128i) __builtin_ia32_packssdw128(a, b);
    }
    else static if (LDC_with_ARM64)
    {
        short4 lowHalf  = vqmovn_s32(cast(int4)a);
        short4 highHalf = vqmovn_s32(cast(int4)b);
        return cast(__m128i)vcombine_s16(lowHalf, highHalf);
    }
    else
    {
        // PERF: catastrophic on ARM32
        short8 packed;
        foreach (i; 0 .. 4)
        {
            packed.ptr[i]     = saturateSignedIntToSignedShort(a.array[i]);
            packed.ptr[i + 4] = saturateSignedIntToSignedShort(b.array[i]);
        }
        return cast(__m128i)packed;
    }
}
unittest
{
    __m128i input = _mm_setr_epi32(100000, -100000, 1000, 0);
    short8 packed = cast(short8) _mm_packs_epi32(input, input);
    short[8] correct = [32767, -32768, 1000, 0, 32767, -32768, 1000, 0];
    assert(packed.array == correct);
}
3026 
/// Narrow the signed 16-bit integers of `a` (result elements 0..7) and `b` (result elements 8..15)
/// to 8-bit integers using signed saturation.
__m128i _mm_packs_epi16 (__m128i a, __m128i b) pure @trusted
{
    // GDC and LDC expose the same builtin when SSE2 is available.
    static if (GDC_with_SSE2 || LDC_with_SSE2)
    {
        return cast(__m128i) __builtin_ia32_packsswb128(cast(short8)a, cast(short8)b);
    }
    else static if (LDC_with_ARM64)
    {
        // generate a nice pair of sqxtn.8b + sqxtn2 since LDC 1.5 -O2
        byte8 lowHalf  = vqmovn_s16(cast(short8)a);
        byte8 highHalf = vqmovn_s16(cast(short8)b);
        return cast(__m128i)vcombine_s8(lowHalf, highHalf);
    }
    else
    {
        // PERF: ARM32 is missing
        short8 wordsA = cast(short8)a;
        short8 wordsB = cast(short8)b;
        byte16 packed;
        foreach (i; 0 .. 8)
        {
            packed.ptr[i]     = saturateSignedWordToSignedByte(wordsA.array[i]);
            packed.ptr[i + 8] = saturateSignedWordToSignedByte(wordsB.array[i]);
        }
        return cast(__m128i)packed;
    }
}
unittest
{
    __m128i input = _mm_setr_epi16(1000, -1000, 1000, 0, 256, -129, 254, 0);
    byte16 packed = cast(byte16) _mm_packs_epi16(input, input);
    byte[16] correct = [127, -128, 127, 0, 127, -128, 127, 0,
                        127, -128, 127, 0, 127, -128, 127, 0];
    assert(packed.array == correct);
}
3066 
/// Narrow the signed 16-bit integers of `a` (result elements 0..7) and `b` (result elements 8..15)
/// to 8-bit integers using unsigned saturation: values are clamped to 0 .. 255.
__m128i _mm_packus_epi16 (__m128i a, __m128i b) pure @trusted
{
    // PERF DMD catastrophic
    // GDC and LDC expose the same builtin when SSE2 is available.
    static if (GDC_with_SSE2 || LDC_with_SSE2)
    {
        return cast(__m128i) __builtin_ia32_packuswb128(cast(short8)a, cast(short8)b);
    }
    else static if (LDC_with_ARM64)
    {
        // generate a nice pair of sqxtun + sqxtun2 since LDC 1.5 -O2
        byte8 lowHalf  = vqmovun_s16(cast(short8)a);
        byte8 highHalf = vqmovun_s16(cast(short8)b);
        return cast(__m128i)vcombine_s8(lowHalf, highHalf);
    }
    else
    {
        short8 wordsA = cast(short8)a;
        short8 wordsB = cast(short8)b;
        align(16) ubyte[16] packed = void; // every byte is written by the loop below
        foreach (i; 0 .. 8)
        {
            int fromA = wordsA.array[i];
            int fromB = wordsB.array[i];
            fromA = (fromA < 0) ? 0 : ((fromA > 255) ? 255 : fromA);
            fromB = (fromB < 0) ? 0 : ((fromB > 255) ? 255 : fromB);
            packed[i]     = cast(ubyte)fromA;
            packed[i + 8] = cast(ubyte)fromB;
        }
        return *cast(__m128i*)(packed.ptr);
    }
}
unittest
{
    __m128i input = _mm_setr_epi16(-10, 400, 0, 256, 255, 2, 1, 0);
    byte16 packed = cast(byte16) _mm_packus_epi16(input, input);
    static immutable ubyte[16] correctResult = [0, 255, 0, 255, 255, 2, 1, 0,
                                                0, 255, 0, 255, 255, 2, 1, 0];
    // The result is viewed as signed bytes, so the expected values are cast the same way.
    foreach(i; 0..16)
        assert(packed.array[i] == cast(byte)(correctResult[i]));
}
3115 
/// Provide a hint to the processor that the code sequence is a spin-wait loop. This can help improve the performance 
/// and power consumption of spin-wait loops.
/// Takes no argument and returns nothing.
void _mm_pause() @trusted
{
    // Each compiler/target combination gets its own way of emitting the hint. Combinations that
    // match none of the branches are rejected at compile time by `static assert(false)`.
    version(GNU)
    {
        static if (GDC_with_SSE2)
        {
            __builtin_ia32_pause();
        }
        else version(X86)
        {
            // Extended inline assembly with empty output, input and clobber lists.
            asm pure nothrow @nogc @trusted
            {
                "pause;\n" : : : ;
            }
        }
        else
            static assert(false);
    }
    else static if (LDC_with_SSE2)
    {
        __builtin_ia32_pause();
    }
    else static if (DMD_with_asm)
    {
        asm nothrow @nogc pure @safe
        {
            rep; nop; // F3 90 =  pause
        }
    }
    else version (LDC)
    {
        // PERF: Do nothing currently, could be the "yield" instruction on ARM.
    }
    else
        static assert(false);
}
unittest
{
    // Smoke test: only checks that the call compiles and returns.
    _mm_pause();
}
3158 
/// Sum of absolute differences of packed unsigned 8-bit integers in `a` and `b`.
/// The eight differences of bytes 0..7 are summed into the low 16 bits of the first 64-bit element of the
/// result, the eight differences of bytes 8..15 into the low 16 bits of the second; other bits are zero.
__m128i _mm_sad_epu8 (__m128i a, __m128i b) pure @trusted
{
    static if (GDC_with_SSE2)
    {
        return cast(__m128i) __builtin_ia32_psadbw128(cast(ubyte16)a, cast(ubyte16)b);
    }
    else static if (LDC_with_SSE2)
    {
        return cast(__m128i) __builtin_ia32_psadbw128(cast(byte16)a, cast(byte16)b);
    }
    else static if (LDC_with_ARM64)
    {
        ushort8 pairSums = cast(ushort8) vpaddlq_u8(vabdq_u8(cast(byte16) a, cast(byte16) b));

        // PERF: Looks suboptimal vs addp
        ushort lowSum  = cast(ushort)(pairSums[0] + pairSums[1] + pairSums[2] + pairSums[3]);
        ushort highSum = cast(ushort)(pairSums[4] + pairSums[5] + pairSums[6] + pairSums[7]);
        ushort8 result = 0;
        result[0] = lowSum;
        result[4] = highSum;
        return cast(__m128i) result;
    }
    else
    {
        // PERF: ARM32 is lacking
        byte16 bytesA = cast(byte16)a;
        byte16 bytesB = cast(byte16)b;
        int[2] sums = [0, 0]; // sums[0]: bytes 0..7, sums[1]: bytes 8..15
        foreach (i; 0 .. 16)
        {
            int delta = cast(ubyte)(bytesA.array[i]) - cast(ubyte)(bytesB.array[i]);
            sums[i >> 3] += (delta < 0) ? -delta : delta;
        }
        int4 result = _mm_setzero_si128();
        result.ptr[0] = sums[0];
        result.ptr[2] = sums[1];
        return result;
    }
}
unittest
{
    __m128i primesPlusOne = _mm_setr_epi8(3, 4, 6, 8, 12, 14, 18, 20, 24, 30, 32, 38, 42, 44, 48, 54); // primes + 1
    __m128i ones = _mm_set1_epi8(1);
    __m128i sums = _mm_sad_epu8(primesPlusOne, ones);
    int[4] correct = [2 + 3 + 5 + 7 + 11 + 13 + 17 + 19,
                      0,
                      23 + 29 + 31 + 37 + 41 + 43 + 47 + 53,
                      0];
    assert(sums.array == correct);
}
3213 
/// Build a vector of eight 16-bit integers: `e0` becomes the lowest element, `e7` the highest.
__m128i _mm_set_epi16 (short e7, short e6, short e5, short e4, short e3, short e2, short e1, short e0) pure @trusted
{
    short8 lanes = void; // all eight lanes are assigned below
    lanes.ptr[7] = e7;
    lanes.ptr[6] = e6;
    lanes.ptr[5] = e5;
    lanes.ptr[4] = e4;
    lanes.ptr[3] = e3;
    lanes.ptr[2] = e2;
    lanes.ptr[1] = e1;
    lanes.ptr[0] = e0;
    return cast(__m128i) lanes;
}
unittest
{
    short8 lanes = cast(short8) _mm_set_epi16(7, 6, 5, 4, 3, 2, 1, 0);
    foreach(i; 0..8)
        assert(lanes.array[i] == i);
}
3235 
/// Build a vector of four 32-bit integers: `e0` becomes the lowest element, `e3` the highest.
__m128i _mm_set_epi32 (int e3, int e2, int e1, int e0) pure @trusted
{
    // PERF: does a constant inline correctly? vs int4 field assignment
    align(16) int[4] lanes = [e0, e1, e2, e3];
    return *cast(int4*)(lanes.ptr);
}
unittest
{
    __m128i lanes = _mm_set_epi32(3, 2, 1, 0);
    foreach(i; 0..4)
        assert(lanes.array[i] == i);
}
3249 
/// Build a vector of two 64-bit integers from two `__m64` values: `e0` is the lower element, `e1` the upper.
__m128i _mm_set_epi64(__m64 e1, __m64 e0) pure @trusted
{
    pragma(inline, true);
    long2 lanes = void; // both lanes are assigned below
    lanes.ptr[1] = e1.array[0];
    lanes.ptr[0] = e0.array[0];
    return cast(__m128i)(lanes);
}
unittest
{
    __m64 upper = _mm_cvtsi64_m64(1234);
    __m64 lower = _mm_cvtsi64_m64(5678);
    long2 lanes = cast(long2) _mm_set_epi64(upper, lower);
    assert(lanes.array[0] == 5678);
    assert(lanes.array[1] == 1234);
}
3266 
/// Build a vector of two 64-bit integers: `e0` is the lower element, `e1` the upper.
__m128i _mm_set_epi64x (long e1, long e0) pure @trusted
{
    pragma(inline, true);
    long2 lanes = void; // both lanes are assigned below
    lanes.ptr[1] = e1;
    lanes.ptr[0] = e0;
    return cast(__m128i)(lanes);
}
unittest
{
    long2 lanes = cast(long2) _mm_set_epi64x(1234, -5678);
    assert(lanes.array[0] == -5678);
    assert(lanes.array[1] == 1234);
}
3283 
/// Build a vector of sixteen 8-bit integers: `e0` becomes the lowest element, `e15` the highest.
__m128i _mm_set_epi8 (byte e15, byte e14, byte e13, byte e12,
                      byte e11, byte e10, byte e9, byte e8,
                      byte e7, byte e6, byte e5, byte e4,
                      byte e3, byte e2, byte e1, byte e0) pure @trusted
{
    // Stored lowest element first, i.e. in the opposite order of the parameter list.
    align(16) byte[16] lanes = [ e0,  e1,  e2,  e3,
                                 e4,  e5,  e6,  e7,
                                 e8,  e9, e10, e11,
                                e12, e13, e14, e15];
    return *cast(__m128i*)(lanes.ptr);
}
unittest
{
    __m128i V = _mm_set_epi8(-1, 0, 56, 127, -128, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14);
    byte16 lanes = cast(byte16) V;
    byte[16] correct = [14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, -128, 127, 56, 0, -1];
    assert(lanes.array == correct);
}
3300 
/// Build a vector of two double-precision (64-bit) floating-point values: `e0` is the lower element,
/// `e1` the upper.
__m128d _mm_set_pd (double e1, double e0) pure @trusted
{
    pragma(inline, true);
    double2 lanes = void; // both lanes are assigned below
    lanes.ptr[1] = e1;
    lanes.ptr[0] = e0;
    return lanes;
}
unittest
{
    __m128d V = _mm_set_pd(61.0, 55.0);
    double[2] correct = [55.0, 61.0];
    assert(V.array == correct);
}
3316 
/// Broadcast double-precision (64-bit) floating-point value `a` to all elements.
__m128d _mm_set_pd1 (double a) pure @trusted
{
    pragma(inline, true);
    __m128d lanes = void; // both lanes are assigned below
    lanes.ptr[1] = a;
    lanes.ptr[0] = a;
    return lanes;
}
unittest
{
    __m128d V = _mm_set_pd1(61.0);
    double[2] correct = [61.0, 61.0];
    assert(V.array == correct);
}
3332 
/// Place double-precision (64-bit) floating-point value `a` in the lower element of the result
/// and set the upper element to zero.
__m128d _mm_set_sd (double a) pure @trusted
{
    double2 lanes = void; // both lanes are assigned below
    lanes.ptr[1] = 0.0;
    lanes.ptr[0] = a;
    return lanes;
}
unittest
{
    __m128d V = _mm_set_sd(61.0);
    double[2] correct = [61.0, 0.0];
    assert(V.array == correct);
}
3348 
/// Broadcast 16-bit integer `a` to all elements.
__m128i _mm_set1_epi16 (short a) pure @trusted
{
    version(DigitalMars) // workaround https://issues.dlang.org/show_bug.cgi?id=21469 
    {
        short8 lanes = a;
        return cast(__m128i) lanes;
    }
    else
    {
        pragma(inline, true);
        short8 lanes = short8(a);
        return cast(__m128i)(lanes);
    }
}
unittest
{
    short8 lanes = cast(short8) _mm_set1_epi16(31);
    foreach (i; 0 .. 8)
        assert(lanes.array[i] == 31);
}
3369 
/// Broadcast 32-bit integer `a` to all four elements.
__m128i _mm_set1_epi32 (int a) pure @trusted
{
    pragma(inline, true);
    int4 lanes = int4(a);
    return cast(__m128i)(lanes);
}
unittest
{
    int4 lanes = cast(int4) _mm_set1_epi32(31);
    foreach (i; 0 .. 4)
        assert(lanes.array[i] == 31);
}
3382 
/// Broadcast the 64-bit integer held in `a` to both elements.
__m128i _mm_set1_epi64 (__m64 a) pure @safe
{
    // Same value for the upper and the lower element.
    __m128i both = _mm_set_epi64(a, a);
    return both;
}
unittest
{
    long value = 0x1DEADCAFE; 
    __m64 source;
    source.ptr[0] = value;
    long2 lanes = cast(long2) _mm_set1_epi64(source);
    assert(lanes.array[0] == value);
    assert(lanes.array[1] == value);
}
3397 
/// Broadcast 64-bit integer `a` to both elements.
__m128i _mm_set1_epi64x (long a) pure @trusted
{
    long2 lanes = a; // Must be on its own line to workaround https://issues.dlang.org/show_bug.cgi?id=21470
    return cast(__m128i)(lanes);
}
unittest
{
    long value = 0x1DEADCAFE;
    long2 lanes = cast(long2) _mm_set1_epi64x(value);
    foreach (i; 0 .. 2)
        assert(lanes.array[i] == value);
}
3411 
/// Broadcast 8-bit integer `a` to all sixteen elements.
__m128i _mm_set1_epi8 (byte a) pure @trusted
{
    pragma(inline, true);
    byte16 lanes = a; // Must be on its own line to workaround https://issues.dlang.org/show_bug.cgi?id=21470
    return cast(__m128i)(lanes);
}
unittest
{
    byte16 lanes = cast(byte16) _mm_set1_epi8(31);
    foreach (i; 0 .. 16)
        assert(lanes.array[i] == 31);
}
3425 
/// Broadcast double-precision (64-bit) floating-point value `a` to all elements.
/// Alias of `_mm_set_pd1`.
alias _mm_set1_pd = _mm_set_pd1;
3427 
/// Build a vector of eight 16-bit integers in argument order: the first argument (`e7`) becomes
/// the lowest element and the last argument (`e0`) the highest.
__m128i _mm_setr_epi16 (short e7, short e6, short e5, short e4, 
                        short e3, short e2, short e1, short e0) pure @trusted
{
    short8 lanes = void; // all eight lanes are assigned below
    lanes.ptr[7] = e0;
    lanes.ptr[6] = e1;
    lanes.ptr[5] = e2;
    lanes.ptr[4] = e3;
    lanes.ptr[3] = e4;
    lanes.ptr[2] = e5;
    lanes.ptr[1] = e6;
    lanes.ptr[0] = e7;
    return cast(__m128i)(lanes);
}
unittest
{
    __m128i V = _mm_setr_epi16(7, 6, 5, -32768, 32767, 2, 1, 0);
    short8 lanes = cast(short8) V;
    short[8] correct = [7, 6, 5, -32768, 32767, 2, 1, 0];
    assert(lanes.array == correct);
}
3449 
/// Build a vector of four 32-bit integers in argument order: the first argument (`e3`) becomes
/// the lowest element and the last argument (`e0`) the highest.
__m128i _mm_setr_epi32 (int e3, int e2, int e1, int e0) pure @trusted
{
    // Performs better than = void; with GDC
    pragma(inline, true);
    align(16) int[4] lanes = [e3, e2, e1, e0];
    return *cast(__m128i*)(lanes.ptr);
}
unittest
{
    __m128i V = _mm_setr_epi32(-1, 0, -2147483648, 2147483647);
    int4 lanes = cast(int4) V;
    int[4] correct = [-1, 0, -2147483648, 2147483647];
    assert(lanes.array == correct);
}
3464 
/// Build a vector of two 64-bit integers in argument order: `e1` (first argument) is the lower
/// element and `e0` the upper.
__m128i _mm_setr_epi64 (long e1, long e0) pure @trusted
{
    long2 lanes = void; // both lanes are assigned below
    lanes.ptr[1] = e0;
    lanes.ptr[0] = e1;
    return cast(__m128i)(lanes);
}
unittest
{
    __m128i V = _mm_setr_epi64(-1, 0);
    long2 lanes = cast(long2) V;
    long[2] correct = [-1, 0];
    assert(lanes.array == correct);
}
3479 
/// Build a vector of sixteen 8-bit integers in argument order: the first argument (`e15`) becomes
/// the lowest element and the last argument (`e0`) the highest.
__m128i _mm_setr_epi8 (byte e15, byte e14, byte e13, byte e12,
                       byte e11, byte e10, byte e9,  byte e8,
                       byte e7,  byte e6,  byte e5,  byte e4,
                       byte e3,  byte e2,  byte e1,  byte e0) pure @trusted
{
    // Stored exactly in parameter order.
    align(16) byte[16] lanes = [e15, e14, e13, e12,
                                e11, e10,  e9,  e8,
                                 e7,  e6,  e5,  e4,
                                 e3,  e2,  e1,  e0];
    return *cast(__m128i*)(lanes.ptr);
}
unittest
{
    __m128i V = _mm_setr_epi8(-1, 0, 56, 127, -128, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14);
    byte16 lanes = cast(byte16) V;
    byte[16] correct = [-1, 0, 56, 127, -128, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14];
    assert(lanes.array == correct);
}
3496 
/// Build a vector of two double-precision (64-bit) floating-point values in argument order:
/// `e1` (first argument) is the lower element and `e0` the upper.
__m128d _mm_setr_pd (double e1, double e0) pure @trusted
{
    pragma(inline, true);
    double2 lanes;
    lanes.ptr[1] = e0;
    lanes.ptr[0] = e1;
    return lanes;
}
unittest
{
    __m128d V = _mm_setr_pd(61.0, 55.0);
    double[2] correct = [61.0, 55.0];
    assert(V.array == correct);
}
3512 
/// Return a `__m128d` whose two elements are both 0.0.
__m128d _mm_setzero_pd() pure @trusted
{
    pragma(inline, true);
    double2 zeros = void; // both lanes are assigned below
    zeros.ptr[1] = 0.0;
    zeros.ptr[0] = 0.0;
    return zeros;
}
unittest
{
    __m128d Z = _mm_setzero_pd();
    double[2] correct = [0.0, 0.0];
    assert(Z.array == correct);
}
3528 
/// Return a `__m128i` whose four 32-bit elements are all 0.
__m128i _mm_setzero_si128() pure @trusted
{
    pragma(inline, true);
    int4 zeros = void; // all four lanes are assigned below
    zeros.ptr[3] = 0;
    zeros.ptr[2] = 0;
    zeros.ptr[1] = 0;
    zeros.ptr[0] = 0;
    return zeros;
}
unittest
{
    __m128i Z = _mm_setzero_si128();
    int[4] correct = [0, 0, 0, 0];
    assert(Z.array == correct);
}
3546 
/// Rearrange the four 32-bit integers of `a` as directed by `imm8`: result element `i` is the
/// element of `a` whose index is stored in bits `2*i` and `2*i+1` of `imm8`.
/// See_also: `_MM_SHUFFLE`.
__m128i _mm_shuffle_epi32(int imm8)(__m128i a) pure @trusted
{
    // PERF DMD D_SIMD
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_pshufd(a, imm8);
    }
    else
    {
        // Decode the four 2-bit source indices once, at compile time.
        enum int src0 = (imm8 >> 0) & 3;
        enum int src1 = (imm8 >> 2) & 3;
        enum int src2 = (imm8 >> 4) & 3;
        enum int src3 = (imm8 >> 6) & 3;

        version(LDC)
        {
            return shufflevectorLDC!(int4, src0, src1, src2, src3)(a, a);
        }
        else
        {
            int4 shuffled = void; // all four lanes are assigned below
            shuffled.ptr[0] = a.ptr[src0];
            shuffled.ptr[1] = a.ptr[src1];
            shuffled.ptr[2] = a.ptr[src2];
            shuffled.ptr[3] = a.ptr[src3];
            return shuffled;
        }
    }
}
unittest
{
    __m128i ascending = _mm_setr_epi32(0, 1, 2, 3);
    enum int REVERSE = _MM_SHUFFLE(0, 1, 2, 3);
    int4 reversed = cast(int4) _mm_shuffle_epi32!REVERSE(ascending);
    int[4] expected = [ 3, 2, 1, 0 ];
    assert(reversed.array == expected);
}
3581 
/// Select double-precision (64-bit) floating-point elements using the control in `imm8`:
/// bit 0 picks which element of `a` becomes the lower element of the result, bit 1 picks
/// which element of `b` becomes the upper element.
/// See_also: `_MM_SHUFFLE2`.
__m128d _mm_shuffle_pd (int imm8)(__m128d a, __m128d b) pure @trusted
{
    // PERF DMD D_SIMD
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_shufpd(a, b, imm8);
    }
    else
    {
        // Decode the two 1-bit selectors once, at compile time.
        enum int fromA = imm8 & 1;
        enum int fromB = (imm8 >> 1) & 1;

        version(LDC)
        {
            return shufflevectorLDC!(double2, 0 + fromA, 2 + fromB)(a, b);
        }
        else
        {
            double2 shuffled = void; // both lanes are assigned below
            shuffled.ptr[0] = a.array[fromA];
            shuffled.ptr[1] = b.array[fromB];
            return shuffled;
        }
    }
}
unittest
{
    __m128d first  = _mm_setr_pd(0.5, 2.0);
    __m128d second = _mm_setr_pd(4.0, 5.0);
    enum int UPPER_OF_BOTH = _MM_SHUFFLE2(1, 1);
    __m128d R = _mm_shuffle_pd!UPPER_OF_BOTH(first, second);
    double[2] correct = [ 2.0, 5.0 ];
    assert(R.array == correct);
}
3613 
/// Shuffle 16-bit integers in the high 64 bits of `a` using the control in `imm8`. Store the results in the high 
/// 64 bits of result, with the low 64 bits being copied from `a` to result.
/// Result element `4 + i` is element `4 + k` of `a`, where `k` is stored in bits `2*i` and `2*i+1` of `imm8`.
/// See_also: `_MM_SHUFFLE`.
__m128i _mm_shufflehi_epi16(int imm8)(__m128i a) pure @trusted
{
    // PERF DMD D_SIMD
    static if (GDC_with_SSE2)
    {
        return cast(__m128i) __builtin_ia32_pshufhw(cast(short8)a, imm8);
    }
    else version(LDC)
    {
        // Indices 0..3 keep the low half in place; the high half is selected from elements 4..7.
        return cast(__m128i) shufflevectorLDC!(short8, 0, 1, 2, 3,
                                          4 + ( (imm8 >> 0) & 3 ),
                                          4 + ( (imm8 >> 2) & 3 ),
                                          4 + ( (imm8 >> 4) & 3 ),
                                          4 + ( (imm8 >> 6) & 3 ))(cast(short8)a, cast(short8)a);
    }
    else
    {
        // `r` starts as a full copy of `a`, so elements 0..3 are already in place;
        // `sa` stays untouched so every read sees the original high half.
        short8 r = cast(short8)a;
        short8 sa = cast(short8)a;
        r.ptr[4] = sa.array[4 + ( (imm8 >> 0) & 3 ) ];
        r.ptr[5] = sa.array[4 + ( (imm8 >> 2) & 3 ) ];
        r.ptr[6] = sa.array[4 + ( (imm8 >> 4) & 3 ) ];
        r.ptr[7] = sa.array[4 + ( (imm8 >> 6) & 3 ) ];
        return cast(__m128i) r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
    enum int SHUFFLE = _MM_SHUFFLE(0, 1, 2, 3);
    short8 C = cast(short8) _mm_shufflehi_epi16!SHUFFLE(A);
    short[8] expectedC = [ 0, 1, 2, 3, 7, 6, 5, 4 ];
    assert(C.array == expectedC);
}
3651 
/// Shuffle 16-bit integers in the low 64 bits of `a` using the control in `imm8`. Store the results in the low 64 
/// bits of result, with the high 64 bits being copied from `a` to result.
/// See_also: `_MM_SHUFFLE`.
__m128i _mm_shufflelo_epi16(int imm8)(__m128i a) pure @trusted
{
    // Each 2-bit field of `imm8` picks one source lane among lanes 0..3.
    enum int pick0 = (imm8 >> 0) & 3;
    enum int pick1 = (imm8 >> 2) & 3;
    enum int pick2 = (imm8 >> 4) & 3;
    enum int pick3 = (imm8 >> 6) & 3;

    // PERF DMD D_SIMD
    static if (GDC_with_SSE2)
    {
        return cast(__m128i) __builtin_ia32_pshuflw(cast(short8)a, imm8);
    }
    else version(LDC)
    {
        short8 lanes = cast(short8)a;
        return cast(__m128i) shufflevectorLDC!(short8, pick0, pick1, pick2, pick3,
                                               4, 5, 6, 7)(lanes, lanes);
    }
    else
    {
        short8 src = cast(short8)a;
        short8 dst = src; // high four lanes are kept untouched
        dst.ptr[0] = src.array[pick0];
        dst.ptr[1] = src.array[pick1];
        dst.ptr[2] = src.array[pick2];
        dst.ptr[3] = src.array[pick3];
        return cast(__m128i) dst;
    }
}
unittest
{
    __m128i input = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
    enum int CONTROL = _MM_SHUFFLE(0, 1, 2, 3);
    short8 shuffled = cast(short8) _mm_shufflelo_epi16!CONTROL(input);
    short[8] expected = [ 3, 2, 1, 0, 4, 5, 6, 7 ];
    assert(shuffled.array == expected);
}
3688 
/// Shift packed 32-bit integers in `a` left by `count` while shifting in zeros.
/// The shift amount is the low 64 bits of `count`, taken as an unsigned integer; 
/// when it is greater than 31 every element of the result is zero.
deprecated("Use _mm_slli_epi32 instead.") __m128i _mm_sll_epi32 (__m128i a, __m128i count) pure @trusted
{
    static if (LDC_with_SSE2 || GDC_with_SSE2)
    {
        return __builtin_ia32_pslld128(a, count);
    }
    else static if (DMD_with_32bit_asm)
    {
        asm pure nothrow @nogc @trusted
        {
            movdqu XMM0, a;
            movdqu XMM1, count;
            pslld XMM0, XMM1;
            movdqu a, XMM0;
        }
        return a;
    }
    else
    {
        // Note: D says it is illegal to shift by the same or more bits than the
        //       size of the quantity being shifted, so the out-of-range case
        //       must be handled explicitly to match the `pslld` instruction.
        //       The count is compared as a 64-bit unsigned value so that high 
        //       bits are not lost by truncation.
        int4 r = _mm_setzero_si128();
        long2 lc = cast(long2)count;
        ulong bits = cast(ulong)(lc.array[0]);
        if (bits > 31)
            return r;
        foreach(i; 0..4)
            r.ptr[i] = cast(uint)(a.array[i]) << cast(int)bits;
        return r;
    }
}
3721 
/// Shift packed 64-bit integers in `a` left by `count` while shifting in zeros.
/// The shift amount is the low 64 bits of `count`, taken as an unsigned integer; 
/// when it is greater than 63 every element of the result is zero.
deprecated("Use _mm_slli_epi64 instead.") __m128i _mm_sll_epi64 (__m128i a, __m128i count) pure @trusted
{
    static if (LDC_with_SSE2 || GDC_with_SSE2)
    {
        return cast(__m128i) __builtin_ia32_psllq128(cast(long2)a, cast(long2)count);
    }
    else static if (DMD_with_32bit_asm)
    {
        asm pure nothrow @nogc @trusted
        {
            movdqu XMM0, a;
            movdqu XMM1, count;
            psllq XMM0, XMM1;
            movdqu a, XMM0;
        }
        return a;
    }
    else
    {
        // Note: D says it is illegal to shift by the same or more bits than the
        //       size of the quantity being shifted, so the out-of-range case
        //       must be handled explicitly to match the `psllq` instruction.
        // PERF: the unguarded loop was noted as good on ARM since LDC 1.12 -O2 
        //       (catastrophic at -O0); to be re-checked with the range guard.
        long2 r = cast(long2) _mm_setzero_si128();
        long2 sa = cast(long2)a;
        long2 lc = cast(long2)count;
        ulong bits = cast(ulong)(lc.array[0]);
        if (bits > 63)
            return cast(__m128i)r;
        foreach(i; 0..2)
            r.ptr[i] = cast(ulong)(sa.array[i]) << cast(int)bits;
        return cast(__m128i)r;
    }
}
3757 
/// Shift packed 16-bit integers in `a` left by `count` while shifting in zeros.
/// The shift amount is the low 64 bits of `count`, taken as an unsigned integer; 
/// when it is greater than 15 every element of the result is zero.
deprecated("Use _mm_slli_epi16 instead.") __m128i _mm_sll_epi16 (__m128i a, __m128i count) pure @trusted
{
    static if (LDC_with_SSE2 || GDC_with_SSE2)
    {
        // Must go through the builtin: calling `_mm_sll_epi16` here would 
        // call this very function again.
        return cast(__m128i) __builtin_ia32_psllw128(cast(short8)a, cast(short8)count);
    }
    else static if (DMD_with_32bit_asm)
    {
        asm pure nothrow @nogc @trusted
        {
            movdqu XMM0, a;
            movdqu XMM1, count;
            psllw XMM0, XMM1;
            movdqu a, XMM0;
        }
        return a;
    }
    else
    {
        // Note: D says it is illegal to shift by the same or more bits than the
        //       size of the quantity being shifted, so the out-of-range case
        //       must be handled explicitly to match the `psllw` instruction.
        short8 sa = cast(short8)a;
        short8 r = cast(short8) _mm_setzero_si128();
        long2 lc = cast(long2)count;
        ulong bits = cast(ulong)(lc.array[0]);
        if (bits > 15)
            return cast(int4)r;
        foreach(i; 0..8)
            r.ptr[i] = cast(short)(cast(ushort)(sa.array[i]) << cast(int)bits);
        return cast(int4)r;
    }
}
3791 
3792 
/// Shift packed 32-bit integers in `a` left by `imm8` while shifting in zeros.
/// Only the low 8 bits of `imm8` are used; a count above 31 yields an all-zero result.
__m128i _mm_slli_epi32 (__m128i a, int imm8) pure @trusted
{
    static if (GDC_with_SSE2 || LDC_with_SSE2)
    {
        return __builtin_ia32_pslldi128(a, cast(ubyte)imm8);
    }
    else
    {
        // Note: the intrinsics guarantee imm8[0..7] is taken, however
        //       D says "It's illegal to shift by the same or more bits 
        //       than the size of the quantity being shifted"
        //       and it's UB instead.
        ubyte shift = cast(ubyte) imm8;
        int4 result = _mm_setzero_si128();

        if (shift <= 31)
        {
            result.ptr[0] = cast(uint)(a.array[0]) << shift;
            result.ptr[1] = cast(uint)(a.array[1]) << shift;
            result.ptr[2] = cast(uint)(a.array[2]) << shift;
            result.ptr[3] = cast(uint)(a.array[3]) << shift;
        }
        return result;
    }
}
unittest
{
    __m128i src = _mm_setr_epi32(0, 2, 3, -4);

    // 1 + 256 has the same low 8 bits as 1.
    int[4] doubled = [ 0, 4, 6, -8];
    __m128i byOne = _mm_slli_epi32(src, 1);
    __m128i byOneWrapped = _mm_slli_epi32(src, 1 + 256);
    assert(byOne.array == doubled);
    assert(byOneWrapped.array == doubled);

    int[4] unchanged = [ 0, 2, 3, -4];
    __m128i byZero = _mm_slli_epi32(src, 0);
    assert(byZero.array == unchanged);

    int[4] zeroes = [ 0, 0, 0, 0];
    __m128i tooFar = _mm_slli_epi32(src, 65);
    assert(tooFar.array == zeroes);
}
3838 
/// Shift packed 64-bit integers in `a` left by `imm8` while shifting in zeros.
/// Only the low 8 bits of `imm8` are used; a count above 63 yields an all-zero result.
__m128i _mm_slli_epi64 (__m128i a, int imm8) pure @trusted
{
    static if (GDC_with_SSE2 || LDC_with_SSE2)
    {
        return cast(__m128i) __builtin_ia32_psllqi128(cast(long2)a, cast(ubyte)imm8);
    }
    else
    {
        // Note: the intrinsics guarantee imm8[0..7] is taken, however
        //       D says "It's illegal to shift by the same or more bits 
        //       than the size of the quantity being shifted"
        //       and it's UB instead.
        ubyte shift = cast(ubyte) imm8;
        long2 src = cast(long2)a;
        long2 result = cast(long2) _mm_setzero_si128();

        if (shift < 64)
        {
            foreach(lane; 0..2)
                result.ptr[lane] = cast(ulong)(src.array[lane]) << shift;
        }
        return cast(__m128i)result;
    }
}
unittest
{
    __m128i src = _mm_setr_epi64(8, -4);

    // 1 + 1024 has the same low 8 bits as 1.
    long[2] doubled = [ 16, -8];
    long2 byOne = cast(long2) _mm_slli_epi64(src, 1);
    long2 byOneWrapped = cast(long2) _mm_slli_epi64(src, 1 + 1024);
    assert(byOne.array == doubled);
    assert(byOneWrapped.array == doubled);

    long[2] unchanged = [ 8, -4];
    long2 byZero = cast(long2) _mm_slli_epi64(src, 0);
    assert(byZero.array == unchanged);

    long[2] zeroes = [ 0, 0];
    long2 tooFar = cast(long2) _mm_slli_epi64(src, 64);
    assert(tooFar.array == zeroes);
}
3885 
/// Shift packed 16-bit integers in `a` left by `imm8` while shifting in zeros.
/// Only the low 8 bits of `imm8` are used; a count above 15 yields an all-zero result.
__m128i _mm_slli_epi16(__m128i a, int imm8) pure @trusted
{
    static if (GDC_with_SSE2 || LDC_with_SSE2)
    {
        return cast(__m128i) __builtin_ia32_psllwi128(cast(short8)a, cast(ubyte)imm8);
    }
    else static if (LDC_with_ARM64)
    {
        ubyte shift = cast(ubyte) imm8;
        if (shift > 15)
            return _mm_setzero_si128();

        // Whole-vector shift by a broadcast count.
        short8 lanes = cast(short8)a;
        short8 shifted = lanes << short8(shift);
        return cast(__m128i)shifted;
    }
    else
    {
        ubyte shift = cast(ubyte) imm8;
        short8 src = cast(short8)a;
        short8 result = cast(short8)_mm_setzero_si128();

        if (shift <= 15)
        {
            foreach(lane; 0..8)
                result.ptr[lane] = cast(short)(src.array[lane] << shift);
        }
        return cast(__m128i)result;
    }
}
unittest
{
    __m128i src = _mm_setr_epi16(0, 1, 2, 3, -4, -5, 6, 7);

    // 1 + 256 has the same low 8 bits as 1.
    short[8] doubled = [ 0, 2, 4, 6, -8, -10, 12, 14 ];
    short8 byOne = cast(short8)( _mm_slli_epi16(src, 1) );
    short8 byOneWrapped = cast(short8)( _mm_slli_epi16(src, 1 + 256) );
    assert(byOne.array == doubled);
    assert(byOneWrapped.array == doubled);

    short[8] zeroes = [ 0, 0, 0, 0, 0, 0, 0, 0 ];
    short8 tooFar = cast(short8)( _mm_slli_epi16(src, 16) );
    assert(tooFar.array == zeroes);
}
3932 
3933 
/// Shift the 128-bit value `op` left by `bytes` bytes while shifting in zeros.
/// Any `bytes` value with one of its four high bits set (i.e. 16 or more) gives zero.
__m128i _mm_slli_si128(ubyte bytes)(__m128i op) pure @trusted
{
    static if (bytes & 0xF0)
    {
        // Everything is shifted out.
        return _mm_setzero_si128();
    }
    else static if (DMD_with_DSIMD)
    {
        return cast(__m128i) __simd_ib(XMM.PSLLDQ, op, bytes);
    }
    else static if (GDC_with_SSE2)
    {
        // This builtin is given the amount multiplied by 8.
        enum ubyte amount = cast(ubyte)(bytes * 8);
        return cast(__m128i) __builtin_ia32_pslldqi128(cast(long2)op, amount); 
    }
    else version(LDC)
    {
        byte16 zero = cast(byte16)_mm_setzero_si128();
        byte16 src  = cast(byte16)op;
        return cast(__m128i) shufflevectorLDC!(byte16,
                                               16 - bytes, 17 - bytes, 18 - bytes, 19 - bytes,
                                               20 - bytes, 21 - bytes, 22 - bytes, 23 - bytes,
                                               24 - bytes, 25 - bytes, 26 - bytes, 27 - bytes,
                                               28 - bytes, 29 - bytes, 30 - bytes, 31 - bytes)
                                               (zero, src);
    }
    else static if (DMD_with_32bit_asm)
    {
        asm pure nothrow @nogc @trusted // somehow doesn't work for x86_64
        {
            movdqu XMM0, op;
            pslldq XMM0, bytes;
            movdqu op, XMM0;
        }
        return op;
    }
    else
    {
        byte16 src = cast(byte16)op;
        byte16 dst = void;
        // Byte n of the result is byte (n - bytes) of the input, or zero below `bytes`.
        foreach (n; 0 .. 16)
            dst.ptr[n] = (n < bytes) ? cast(byte)0 : src.array[n - bytes];
        return cast(__m128i)dst;
    }
}
unittest
{
    __m128i words = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
    short8 moved = cast(short8) _mm_slli_si128!8(words); // shift 8 bytes to the left
    short[8] expectedMoved = [ 0, 0, 0, 0, 0, 1, 2, 3 ];
    assert(moved.array == expectedMoved);

    __m128i cleared = _mm_slli_si128!16(_mm_set1_epi32(-1));
    int[4] expectedCleared = [0, 0, 0, 0];
    assert(cleared.array == expectedCleared);
}
3989 
/// Compute the square root of both packed double-precision (64-bit) floating-point elements of `vec`.
__m128d _mm_sqrt_pd(__m128d vec) pure @trusted
{
    version(LDC)
    {
        // Disappeared with LDC 1.11
        static if (__VERSION__ >= 2081)
        {
            // PERF: use llvm_sqrt on the vector
            double lo = llvm_sqrt(vec.array[0]);
            double hi = llvm_sqrt(vec.array[1]);
            vec.array[0] = lo;
            vec.array[1] = hi;
            return vec;
        }
        else
        {
            return __builtin_ia32_sqrtpd(vec);
        }
    }
    else static if (GDC_with_SSE2)    
    {
        return __builtin_ia32_sqrtpd(vec);
    }
    else
    {
        double lo = sqrt(vec.array[0]);
        double hi = sqrt(vec.array[1]);
        vec.ptr[0] = lo;
        vec.ptr[1] = hi;
        return vec;
    }
}
4017 
/// Compute the square root of the lower double-precision (64-bit) floating-point element of `b`, and 
/// return `a` with its lower element replaced by that result (the upper element of `a` is kept).
__m128d _mm_sqrt_sd(__m128d a, __m128d b) pure @trusted
{
    // Note: the builtin has one argument, since the legacy `sqrtsd` SSE2 instruction operates on the same register only.
    //       "128-bit Legacy SSE version: The first source operand and the destination operand are the same. 
    //        The quadword at bits 127:64 of the destination operand remains unchanged."
    version(LDC)
    {
        // Disappeared with LDC 1.11
        static if (__VERSION__ >= 2081)
        {
            double root = llvm_sqrt(b.array[0]);
            a.array[0] = root;
            return a;
        }
        else
        {
            __m128d rooted = __builtin_ia32_sqrtsd(b);
            a[0] = rooted[0];
            return a;
        }
    }
    else static if (GDC_with_SSE2)
    {
        __m128d rooted = __builtin_ia32_sqrtsd(b);
        a.ptr[0] = rooted.array[0];
        return a;
    }
    else
    {
        double root = sqrt(b.array[0]);
        a.ptr[0] = root;
        return a;
    }
}
unittest
{
    __m128d keepUpper = _mm_setr_pd(1.0, 3.0);
    __m128d rootLower = _mm_setr_pd(4.0, 5.0);
    __m128d result = _mm_sqrt_sd(keepUpper, rootLower);
    double[2] expected = [2.0, 3.0 ];
    assert(result.array == expected);
}
4060 
/// Shift packed 16-bit integers in `a` right by `count` while shifting in sign bits.
/// The shift amount is the low 64 bits of `count`, taken as an unsigned integer; 
/// when it is greater than 15 every element is filled with its sign bit.
deprecated("Use _mm_srai_epi16 instead.") __m128i _mm_sra_epi16 (__m128i a, __m128i count) pure @trusted
{
    static if (GDC_with_SSE2 || LDC_with_SSE2)
    {
        return cast(__m128i) __builtin_ia32_psraw128(cast(short8)a, cast(short8)count);
    }
    else
    {
        // Note: D says it is illegal to shift by the same or more bits than the
        //       size of the quantity being shifted. Clamping to 15 gives the
        //       same result as the `psraw` instruction for out-of-range counts,
        //       and comparing as 64-bit unsigned avoids losing high bits.
        short8 sa = cast(short8)a;
        long2 lc = cast(long2)count;
        ulong bits = cast(ulong)(lc.array[0]);
        if (bits > 15)
            bits = 15;
        short8 r = void;
        foreach(i; 0..8)
            r.ptr[i] = cast(short)(sa.array[i] >> cast(int)bits);
        return cast(int4)r;
    }
}
4083 
/// Shift packed 32-bit integers in `a` right by `count` while shifting in sign bits.
/// The shift amount is the low 64 bits of `count`, taken as an unsigned integer; 
/// when it is greater than 31 every element is filled with its sign bit.
deprecated("Use _mm_srai_epi32 instead.") __m128i _mm_sra_epi32 (__m128i a, __m128i count) pure @trusted
{
    static if (LDC_with_SSE2 || GDC_with_SSE2)
    {
        return __builtin_ia32_psrad128(a, count);
    }
    else
    {    
        // Note: D says it is illegal to shift by the same or more bits than the
        //       size of the quantity being shifted. Clamping to 31 gives the
        //       same result as the `psrad` instruction for out-of-range counts,
        //       and comparing as 64-bit unsigned avoids losing high bits.
        int4 r = void;
        long2 lc = cast(long2)count;
        ulong bits = cast(ulong)(lc.array[0]);
        if (bits > 31)
            bits = 31;
        int shift = cast(int)bits;
        r.ptr[0] = (a.array[0] >> shift);
        r.ptr[1] = (a.array[1] >> shift);
        r.ptr[2] = (a.array[2] >> shift);
        r.ptr[3] = (a.array[3] >> shift);
        return r;
    }
}
4107 
4108 
/// Shift packed 16-bit integers in `a` right by `imm8` while shifting in sign bits.
/// Only the low 8 bits of `imm8` are used; a count above 15 behaves like a count of 15.
__m128i _mm_srai_epi16 (__m128i a, int imm8) pure @trusted
{
    static if (GDC_with_SSE2 || LDC_with_SSE2)
    {
        return cast(__m128i) __builtin_ia32_psrawi128(cast(short8)a, cast(ubyte)imm8);
    }
    else static if (LDC_with_ARM64)
    {
        ubyte shift = cast(ubyte)imm8;
        if (shift >= 16) 
            shift = 15;
        short8 lanes = cast(short8)a;
        short8 shifted = lanes >> short8(shift);
        return cast(__m128i)shifted;
    }
    else
    {
        // Note: the intrinsics guarantee imm8[0..7] is taken, however
        //       D says "It's illegal to shift by the same or more bits 
        //       than the size of the quantity being shifted"
        //       and it's UB instead.
        ubyte shift = cast(ubyte)imm8;
        if (shift >= 16) 
            shift = 15;

        short8 src = cast(short8)a;
        short8 result = void;
        foreach(lane; 0..8)
            result.ptr[lane] = cast(short)(src.array[lane] >> shift);
        return cast(int4)result;
    }
}
unittest
{
    __m128i src = _mm_setr_epi16(0, 1, 2, 3, -4, -5, 6, 7);

    // 1 + 256 has the same low 8 bits as 1.
    short[8] halved = [ 0, 0, 1, 1, -2, -3, 3, 3 ];
    short8 byOne = cast(short8)( _mm_srai_epi16(src, 1) );
    short8 byOneWrapped = cast(short8)( _mm_srai_epi16(src, 1 + 256) );
    assert(byOne.array == halved);
    assert(byOneWrapped.array == halved);

    // Out-of-range count: only the sign remains.
    short[8] signsOnly = [ 0, 0, 0, 0, -1, -1, 0, 0 ];
    short8 tooFar = cast(short8)( _mm_srai_epi16(src, 18) );
    assert(tooFar.array == signsOnly);
}
4159 
/// Shift packed 32-bit integers in `a` right by `imm8` while shifting in sign bits.
/// Only the low 8 bits of `imm8` are used; a count above 31 behaves like a count of 31.
__m128i _mm_srai_epi32 (__m128i a, int imm8) pure @trusted
{
    static if (LDC_with_SSE2 || GDC_with_SSE2)
    {
        return __builtin_ia32_psradi128(a, cast(ubyte)imm8);
    }
    else
    {
        // Note: the intrinsics guarantee imm8[0..7] is taken, however
        //       D says "It's illegal to shift by the same or more bits 
        //       than the size of the quantity being shifted"
        //       and it's UB instead.
        ubyte shift = cast(ubyte) imm8;
        if (shift >= 32)
            shift = 31;

        int4 result = void;
        foreach(lane; 0..4)
            result.ptr[lane] = (a.array[lane] >> shift);
        return result;
    }
}
unittest
{
    __m128i src = _mm_setr_epi32(0, 2, 3, -4);

    // 1 + 256 has the same low 8 bits as 1.
    int[4] halved = [ 0, 1, 1, -2];
    __m128i byOne = _mm_srai_epi32(src, 1);
    __m128i byOneWrapped = _mm_srai_epi32(src, 1 + 256);
    assert(byOne.array == halved);
    assert(byOneWrapped.array == halved);

    // Out-of-range count: only the sign remains.
    int[4] signsOnly = [ 0, 0, 0, -1];
    __m128i tooFar = _mm_srai_epi32(src, 32);
    assert(tooFar.array == signsOnly);

    int[4] unchanged = [ 0, 2, 3, -4];
    __m128i byZero = _mm_srai_epi32(src, 0);
    assert(byZero.array == unchanged);
}
4207 
/// Shift packed 16-bit integers in `a` right by `count` while shifting in zeros.
/// The shift amount is the low 64 bits of `count`, taken as an unsigned integer; 
/// when it is greater than 15 every element of the result is zero.
deprecated("Use _mm_srli_epi16 instead.") __m128i _mm_srl_epi16 (__m128i a, __m128i count) pure @trusted
{
    static if (LDC_with_SSE2 || GDC_with_SSE2)
    {
        return cast(__m128i) __builtin_ia32_psrlw128(cast(short8)a, cast(short8)count);
    }
    else
    {
        // Note: D says it is illegal to shift by the same or more bits than the
        //       size of the quantity being shifted, so the out-of-range case
        //       must be handled explicitly to match the `psrlw` instruction.
        short8 sa = cast(short8)a;
        short8 r = cast(short8) _mm_setzero_si128();
        long2 lc = cast(long2)count;
        ulong bits = cast(ulong)(lc.array[0]);
        if (bits > 15)
            return cast(int4)r;
        foreach(i; 0..8)
            r.ptr[i] = cast(short)(cast(ushort)(sa.array[i]) >> cast(int)bits);
        return cast(int4)r;
    }
}
4229 
/// Shift packed 32-bit integers in `a` right by `count` while shifting in zeros.
/// The shift amount is the low 64 bits of `count`, taken as an unsigned integer; 
/// when it is greater than 31 every element of the result is zero.
deprecated("Use _mm_srli_epi32 instead.") __m128i _mm_srl_epi32 (__m128i a, __m128i count) pure @trusted
{
    static if (LDC_with_SSE2 || GDC_with_SSE2)
    {
        return __builtin_ia32_psrld128(a, count);
    }
    else
    {
        // Note: D says it is illegal to shift by the same or more bits than the
        //       size of the quantity being shifted, so the out-of-range case
        //       must be handled explicitly to match the `psrld` instruction.
        int4 r = _mm_setzero_si128();
        long2 lc = cast(long2)count;
        ulong bits = cast(ulong)(lc.array[0]);
        if (bits > 31)
            return r;
        int shift = cast(int)bits;
        r.ptr[0] = cast(uint)(a.array[0]) >> shift;
        r.ptr[1] = cast(uint)(a.array[1]) >> shift;
        r.ptr[2] = cast(uint)(a.array[2]) >> shift;
        r.ptr[3] = cast(uint)(a.array[3]) >> shift;
        return r;
    }
}
4252 
/// Shift packed 64-bit integers in `a` right by `count` while shifting in zeros.
/// The shift amount is the low 64 bits of `count`, taken as an unsigned integer; 
/// when it is greater than 63 every element of the result is zero.
deprecated("Use _mm_srli_epi64 instead.") __m128i _mm_srl_epi64 (__m128i a, __m128i count) pure @trusted
{
    static if (LDC_with_SSE2 || GDC_with_SSE2)
    {
        return cast(__m128i) __builtin_ia32_psrlq128(cast(long2)a, cast(long2)count);
    }
    else
    {
        // Workaround for https://issues.dlang.org/show_bug.cgi?id=23047
        // => avoid void initialization.
        // Note: D says it is illegal to shift by the same or more bits than the
        //       size of the quantity being shifted, so the out-of-range case
        //       must be handled explicitly to match the `psrlq` instruction.
        long2 r = cast(long2) _mm_setzero_si128();
        long2 sa = cast(long2)a;
        long2 lc = cast(long2)count;
        ulong bits = cast(ulong)(lc.array[0]);
        if (bits > 63)
            return cast(__m128i)r;
        int shift = cast(int)bits;
        r.ptr[0] = cast(ulong)(sa.array[0]) >> shift;
        r.ptr[1] = cast(ulong)(sa.array[1]) >> shift;
        return cast(__m128i)r;
    }
}
4276 
/// Shift packed 16-bit integers in `a` right by `imm8` while shifting in zeros.
/// Only the low 8 bits of `imm8` are used; a count of 16 or more yields an all-zero result.
__m128i _mm_srli_epi16 (__m128i a, int imm8) pure @trusted
{
    static if (GDC_with_SSE2 || LDC_with_SSE2)
    {
        return cast(__m128i) __builtin_ia32_psrlwi128(cast(short8)a, cast(ubyte)imm8);
    }
    else static if (LDC_with_ARM64)
    {
        ubyte shift = cast(ubyte)imm8;
        if (shift > 15)
            return _mm_setzero_si128();

        short8 lanes = cast(short8)a;
        short8 shifted = lanes >>> short8(shift); // This facility offered with LDC, but not DMD.
        return cast(__m128i)shifted;
    }
    else
    {
        ubyte shift = cast(ubyte)imm8;
        short8 src = cast(short8)a;
        short8 result = cast(short8) _mm_setzero_si128();

        if (shift < 16)
        {
            foreach(lane; 0..8)
                result.ptr[lane] = cast(short)(cast(ushort)(src.array[lane]) >> shift);
        }
        return cast(__m128i)result;
    }
}
unittest
{
    __m128i src = _mm_setr_epi16(0, 1, 2, 3, -4, -5, 6, 7);

    // 1 + 256 has the same low 8 bits as 1.
    short[8] halved = [ 0, 0, 1, 1, 0x7FFE, 0x7FFD, 3, 3 ];
    short8 byOne = cast(short8)( _mm_srli_epi16(src, 1) );
    short8 byOneWrapped = cast(short8)( _mm_srli_epi16(src, 1 + 256) );
    assert(byOne.array == halved);
    assert(byOneWrapped.array == halved);

    short[8] zeroes = [ 0, 0, 0, 0, 0, 0, 0, 0];
    short8 tooFar = cast(short8)( _mm_srli_epi16(src, 16) );
    assert(tooFar.array == zeroes);

    short[8] unchanged = [ 0, 1, 2, 3, -4, -5, 6, 7 ];
    short8 byZero = cast(short8)( _mm_srli_epi16(src, 0) );
    assert(byZero.array == unchanged);
}
4331 
4332 
/// Shift packed 32-bit integers in `a` right by `imm8` while shifting in zeros.
/// Only the low 8 bits of `imm8` are used; a count of 32 or more yields an all-zero result.
__m128i _mm_srli_epi32 (__m128i a, int imm8) pure @trusted
{
    static if (GDC_with_SSE2 || LDC_with_SSE2)
    {
        return __builtin_ia32_psrldi128(a, cast(ubyte)imm8);
    }
    else
    {
        // Note: the intrinsics guarantee imm8[0..7] is taken, however
        //       D says "It's illegal to shift by the same or more bits 
        //       than the size of the quantity being shifted"
        //       and it's UB instead.
        ubyte shift = cast(ubyte) imm8;
        int4 result = _mm_setzero_si128();

        if (shift < 32)
        {
            foreach(lane; 0..4)
                result.ptr[lane] = a.array[lane] >>> shift;
        }
        return result;
    }
}
unittest
{
    __m128i src = _mm_setr_epi32(0, 2, 3, -4);

    // 1 + 256 has the same low 8 bits as 1.
    int[4] halved = [ 0, 1, 1, 0x7FFFFFFE];
    __m128i byOne = _mm_srli_epi32(src, 1);
    __m128i byOneWrapped = _mm_srli_epi32(src, 1 + 256);
    assert(byOne.array == halved);
    assert(byOneWrapped.array == halved);
 
    int[4] zeroes = [ 0, 0, 0, 0 ];
    __m128i tooFar = _mm_srli_epi32(src, 255);
    assert(tooFar.array == zeroes);
}
4375 
/// Shift packed 64-bit integers in `a` right by `imm8` while shifting in zeros.
/// Only the low 8 bits of `imm8` are used; a count of 64 or more yields an all-zero result.
__m128i _mm_srli_epi64 (__m128i a, int imm8) pure @trusted
{
    static if (GDC_with_SSE2 || LDC_with_SSE2)
    {
        return cast(__m128i) __builtin_ia32_psrlqi128(cast(long2)a, cast(ubyte)imm8);
    }
    else
    {
        ubyte shift = cast(ubyte) imm8;
        long2 src = cast(long2)a;
        long2 result = cast(long2) _mm_setzero_si128();

        // Shifting by 64 or more is not allowed in D; the zero result is returned as is.
        if (shift < 64)
        {
            foreach(lane; 0..2)
                result.ptr[lane] = src.array[lane] >>> shift;
        }
        return cast(__m128i)result;
    }
}
unittest
{
    __m128i src = _mm_setr_epi64(8, -4);

    // 1 + 512 has the same low 8 bits as 1.
    long[2] halved = [ 4, 0x7FFFFFFFFFFFFFFE];
    long2 byOne = cast(long2) _mm_srli_epi64(src, 1);
    long2 byOneWrapped = cast(long2) _mm_srli_epi64(src, 1 + 512);
    assert(byOne.array == halved);
    assert(byOneWrapped.array == halved);

    long[2] zeroes = [ 0, 0 ];
    long2 tooFar = cast(long2) _mm_srli_epi64(src, 64);
    assert(tooFar.array == zeroes);
}
4414 
/// Shift the 128-bit value `v` right by `bytes` bytes while shifting in zeros.
/// Any `bytes` value with one of its four high bits set (i.e. 16 or more) gives zero.
__m128i _mm_srli_si128(ubyte bytes)(__m128i v) pure @trusted
{
    static if (bytes & 0xF0)
    {
        // Everything is shifted out.
        return _mm_setzero_si128();
    }
    else static if (DMD_with_DSIMD)
    {
        return cast(__m128i) __simd_ib(XMM.PSRLDQ, v, bytes);
    }
    else static if (GDC_with_SSE2)
    {
        // This builtin is given the amount multiplied by 8.
        enum ubyte amount = cast(ubyte)(bytes * 8);
        return cast(__m128i) __builtin_ia32_psrldqi128(cast(long2)v, amount);
    }
    else static if (DMD_with_32bit_asm)
    {
        asm pure nothrow @nogc @trusted
        {
            movdqu XMM0, v;
            psrldq XMM0, bytes;
            movdqu v, XMM0;
        }
        return v;
    }
    else version(LDC)
    {
        byte16 src  = cast(byte16) v;
        byte16 zero = cast(byte16)_mm_setzero_si128();
        return cast(__m128i) shufflevectorLDC!(byte16,
                                               bytes+0, bytes+1, bytes+2,  bytes+3,
                                               bytes+4, bytes+5, bytes+6,  bytes+7,
                                               bytes+8, bytes+9, bytes+10, bytes+11,
                                               bytes+12, bytes+13, bytes+14, bytes+15)
                                               (src, zero);
    }
    else
    {
        byte16 src = cast(byte16)v;
        byte16 dst = void;
        // Byte n of the result is byte (n + bytes) of the input, or zero past the end.
        foreach (n; 0 .. 16)
        {
            int from = n + bytes;
            dst.ptr[n] = (from < 16) ? src.array[from] : cast(byte)0;
        }
        return cast(__m128i)dst;
    }
}
unittest
{
    __m128i moved = _mm_srli_si128!4(_mm_set_epi32(4, 3, -2, 1));
    int[4] expectedMoved = [-2, 3, 4, 0];
    assert(moved.array == expectedMoved);

    __m128i cleared = _mm_srli_si128!16(_mm_set1_epi32(-1));
    int[4] expectedCleared = [0, 0, 0, 0];
    assert(cleared.array == expectedCleared);
}
4468 
/// Shift `v` right by `bytes` bytes while shifting in zeros.
/// Same operation as `_mm_srli_si128`, applied to a `__m128`.
/// #BONUS
__m128 _mm_srli_ps(ubyte bytes)(__m128 v) pure @safe
{
    __m128i asInts = cast(__m128i)v;
    __m128i shifted = _mm_srli_si128!bytes(asInts);
    return cast(__m128)shifted;
}
unittest
{
    __m128 shifted = _mm_srli_ps!8(_mm_set_ps(4.0f, 3.0f, 2.0f, 1.0f));
    float[4] expected = [3.0f, 4.0f, 0, 0];
    assert(shifted.array == expected);
}
4481 
/// Shift `v` right by `bytes` bytes while shifting in zeros.
/// Same operation as `_mm_srli_si128`, applied to a `__m128d`.
/// #BONUS
__m128d _mm_srli_pd(ubyte bytes)(__m128d v) pure @safe
{
    __m128i asInts = cast(__m128i)v;
    __m128i shifted = _mm_srli_si128!bytes(asInts);
    return cast(__m128d)shifted;
}
4488 
/// Store 128-bits (composed of 2 packed double-precision (64-bit) floating-point elements) from `a` into memory. 
/// `mem_addr` must be aligned on a 16-byte boundary or a general-protection exception may be generated.
void _mm_store_pd (double* mem_addr, __m128d a) pure @trusted
{
    pragma(inline, true);
    // Written as one whole-vector store through a vector pointer.
    *cast(__m128d*)mem_addr = a;
}
unittest
{
    align(16) double[2] dest;
    __m128d value = _mm_setr_pd(-8.0, 9.0);
    _mm_store_pd(dest.ptr, value);
    assert(dest == [-8.0, 9.0]);
}
4504 
/// Store the lower double-precision (64-bit) floating-point element from `a` into 2 contiguous elements in memory. 
/// `mem_addr` must be aligned on a 16-byte boundary or a general-protection exception may be generated.
void _mm_store_pd1 (double* mem_addr, __m128d a) pure @trusted
{
    double low = a.array[0];
    __m128d duplicated; // PERF =void;
    duplicated.ptr[0] = low;
    duplicated.ptr[1] = low;
    *cast(__m128d*)mem_addr = duplicated;
}
4515 
/// Store the lower double-precision (64-bit) floating-point element from `a` into memory. `mem_addr` does not need to 
/// be aligned on any particular boundary.
void _mm_store_sd (double* mem_addr, __m128d a) pure @safe
{
    pragma(inline, true);
    double low = a.array[0];
    *mem_addr = low;
}
4523 
/// Store 128-bits of integer data from `a` into memory. `mem_addr` must be aligned on a 16-byte boundary or a 
/// general-protection exception may be generated.
void _mm_store_si128 (__m128i* mem_addr, __m128i a) pure @safe
{
    pragma(inline, true);
    // Plain whole-vector assignment through the pointer.
    (*mem_addr) = a;
}
4531 
alias _mm_store1_pd = _mm_store_pd1; /// Alternate name for `_mm_store_pd1`.
4533 
/// Write the upper double-precision (64-bit) floating-point element of `a` to memory.
void _mm_storeh_pd (double* mem_addr, __m128d a) pure @safe
{
    pragma(inline, true);
    const double high = a.array[1];
    *mem_addr = high;
}
4540 
/// Write the lower 64-bit integer of `a` to memory.
// Note: `mem_addr` doesn't actually have to be aligned, which breaks
// expectations from the user's point of view. This problem also exists in C++.
void _mm_storel_epi64 (__m128i* mem_addr, __m128i a) pure @safe
{
    pragma(inline, true);
    // Only the first 8 bytes of the destination are written.
    const long low = (cast(long2)a).array[0];
    *(cast(long*)mem_addr) = low;
}
unittest
{
    long[3] buf = [1, 2, 3];
    __m128i src = _mm_set_epi64x(0x1_0000_0000, 0x1_0000_0000);
    _mm_storel_epi64(cast(__m128i*)(&buf[1]), src);
    long[3] expected = [1, 0x1_0000_0000, 3];
    assert(buf == expected);
}
4557 
/// Write the lower double-precision (64-bit) floating-point element of `a` to memory.
void _mm_storel_pd (double* mem_addr, __m128d a) pure @safe
{
    pragma(inline, true);
    const double low = a.array[0];
    *mem_addr = low;
}
4564 
/// Write the 2 double-precision (64-bit) floating-point elements of `a` to memory with their
/// order swapped. `mem_addr` must be aligned on a 16-byte boundary or a general-protection
/// exception may be generated.
void _mm_storer_pd (double* mem_addr, __m128d a) pure @system
{
    const double lo = a.array[0];
    const double hi = a.array[1];
    __m128d swapped = void;
    swapped.ptr[0] = hi;
    swapped.ptr[1] = lo;
    __m128d* dest = cast(__m128d*)mem_addr;
    *dest = swapped;
}
unittest
{
    align(16) double[2] dst = [0.0, 1.0];
    __m128d src = _mm_setr_pd(2.0, 3.0);
    _mm_storer_pd(dst.ptr, src);
    assert(dst[0] == 3.0 && dst[1] == 2.0);
}
4581 
/// Write the two packed double-precision (64-bit) floating-point elements of `a` (128 bits)
/// to memory. `mem_addr` does not need to be aligned on any particular boundary.
void _mm_storeu_pd (double* mem_addr, __m128d a) pure @trusted // TODO: signature, should be system
{
    // PERF DMD
    pragma(inline, true);
    static if (GDC_with_SSE2)
    {
        __builtin_ia32_storeupd(mem_addr, a);
    }
    else version(LDC)
    {
        storeUnaligned!double2(a, mem_addr);
    }
    else
    {
        // Generic path: copy the lanes one scalar at a time.
        foreach(lane; 0..2)
            mem_addr[lane] = a.array[lane];
    }
}
unittest
{
    align(16) double[4] dst = [0.0, 0, 0, 0];
    __m128d src = _mm_setr_pd(3.0, 4.0);
    // &dst[1] is only 8-byte aligned.
    _mm_storeu_pd(&dst[1], src);
    double[2] expected = [3.0, 4.0];
    assert(dst[1..3] == expected);
}
4610 
/// Write the 128 bits of integer data held in `a` to memory. `mem_addr` does not need to be
/// aligned on any particular boundary.
void _mm_storeu_si128 (__m128i* mem_addr, __m128i a) pure @trusted // TODO: signature is wrong, mem_addr is not aligned. Make it @system
{
    // PERF: DMD
    pragma(inline, true);
    static if (GDC_with_SSE2)
    {
        __builtin_ia32_storedqu(cast(char*)mem_addr, cast(ubyte16)a);
    }
    else version(LDC)
    {
        storeUnaligned!__m128i(a, cast(int*)mem_addr);
    }
    else
    {
        // Generic path: copy the four 32-bit lanes one by one.
        int* dest = cast(int*)mem_addr;
        foreach(lane; 0..4)
            dest[lane] = a.array[lane];
    }
}
unittest
{
    align(16) int[6] dst = [0, 0, 0, 0, 0, 0];
    __m128i src = _mm_setr_epi32(1, 2, 3, 4);
    // &dst[1] is only 4-byte aligned.
    _mm_storeu_si128(cast(__m128i*)(&dst[1]), src);
    int[4] expected = [1, 2, 3, 4];
    assert(dst[1..5] == expected);
}
4642 
/// Write the first 16-bit integer element of `a` to memory.
/// `mem_addr` does not need to be aligned on any particular boundary.
void _mm_storeu_si16 (void* mem_addr, __m128i a) pure @system
{
    const short low = (cast(short8)a).array[0];
    *(cast(short*)mem_addr) = low;
}
unittest
{
    short[2] buf = [-24, 12];
    __m128i src = _mm_set1_epi16(26);
    _mm_storeu_si16(&buf[1], src);
    short[2] expected = [-24, 26];
    assert(buf == expected);
}
4657 
/// Write the first 32-bit integer element of `a` to memory.
/// `mem_addr` does not need to be aligned on any particular boundary.
void _mm_storeu_si32 (void* mem_addr, __m128i a) pure @trusted // TODO should really be @system
{
    pragma(inline, true);
    const int low = a.array[0];
    *(cast(int*)mem_addr) = low;
}
unittest
{
    int[2] buf = [-24, 12];
    __m128i src = _mm_setr_epi32(-1, -2, -6, -7);
    _mm_storeu_si32(&buf[1], src);
    int[2] expected = [-24, -1];
    assert(buf == expected);
}
4672 
/// Write the first 64-bit integer element of `a` to memory.
/// `mem_addr` does not need to be aligned on any particular boundary.
void _mm_storeu_si64 (void* mem_addr, __m128i a) pure @system
{
    pragma(inline, true);
    const long low = (cast(long2)a).array[0];
    *(cast(long*)mem_addr) = low;
}
unittest
{
    long[3] buf = [1, 2, 3];
    __m128i src = _mm_set_epi64x(0x1_0000_0000, 0x1_0000_0000);
    _mm_storeu_si64(&buf[1], src);
    long[3] expected = [1, 0x1_0000_0000, 3];
    assert(buf == expected);
}
4689 
/// Write the two packed double-precision (64-bit) floating-point elements of `a` (128 bits)
/// to memory using a non-temporal memory hint. `mem_addr` must be aligned on a 16-byte
/// boundary or a general-protection exception may be generated.
/// Note: non-temporal stores should be followed by `_mm_sfence()` for reader threads.
void _mm_stream_pd (double* mem_addr, __m128d a) pure @system
{
    // PERF DMD D_SIMD
    static if (GDC_with_SSE2)
    {
        __builtin_ia32_movntpd(mem_addr, a);
    }
    else version(LDC)
    {
        // Inline LLVM IR: an aligned vector store tagged with !nontemporal metadata.
        enum metadata = `!0 = !{ i32 1 }`;
        enum storeIR = `
            store <2 x double> %1, <2 x double>* %0, align 16, !nontemporal !0
            ret void`;
        LDCInlineIREx!(metadata, storeIR, "", void, double2*, double2)(cast(double2*)mem_addr, a);
    }
    else
    {
        // Regular store instead.
        *cast(__m128d*)mem_addr = a;
    }
}
unittest
{
    align(16) double[2] dst;
    __m128d src = _mm_setr_pd(-8.0, 9.0);
    _mm_stream_pd(dst.ptr, src);
    double[2] expected = [-8.0, 9.0];
    assert(dst == expected);
}
4723 
/// Write the 128 bits of integer data held in `a` to memory using a non-temporal memory hint.
/// `mem_addr` must be aligned on a 16-byte boundary or a general-protection exception
/// may be generated.
/// Note: non-temporal stores should be followed by `_mm_sfence()` for reader threads.
void _mm_stream_si128 (__m128i* mem_addr, __m128i a) pure @trusted
{
    // PERF DMD D_SIMD
    static if (GDC_with_SSE2)
    {
        __builtin_ia32_movntdq (cast(long2*)mem_addr, cast(long2)a);
    }
    else version(LDC)
    {
        // Inline LLVM IR: an aligned vector store tagged with !nontemporal metadata.
        enum metadata = `!0 = !{ i32 1 }`;
        enum storeIR = `
            store <4 x i32> %1, <4 x i32>* %0, align 16, !nontemporal !0
            ret void`;
        LDCInlineIREx!(metadata, storeIR, "", void, int4*, int4)(cast(int4*)mem_addr, a);
    }
    else
    {
        // Regular store instead.
        *cast(__m128i*)mem_addr = a;
    }
}
unittest
{
    align(16) int[4] dst;
    __m128i src = _mm_setr_epi32(-8, 9, 10, -11);
    _mm_stream_si128(cast(__m128i*)dst.ptr, src);
    int[4] expected = [-8, 9, 10, -11];
    assert(dst == expected);
}
4757 
/// Write the 32-bit integer `a` to memory using a non-temporal hint to minimize cache
/// pollution. If the cache line containing address `mem_addr` is already in the cache,
/// the cache will be updated.
/// Note: non-temporal stores should be followed by `_mm_sfence()` for reader threads.
void _mm_stream_si32 (int* mem_addr, int a) pure @trusted
{
    // PERF DMD D_SIMD
    static if (GDC_with_SSE2)
    {
        __builtin_ia32_movnti(mem_addr, a);
    }
    else version(LDC)
    {
        // Inline LLVM IR: a scalar store tagged with !nontemporal metadata.
        enum metadata = `!0 = !{ i32 1 }`;
        enum storeIR = `
            store i32 %1, i32* %0, !nontemporal !0
            ret void`;
        LDCInlineIREx!(metadata, storeIR, "", void, int*, int)(mem_addr, a);
    }
    else
    {
        // Regular store instead.
        *mem_addr = a;
    }
}
unittest
{
    int dst;
    _mm_stream_si32(&dst, -34);
    assert(dst == -34);
}
4789 
/// Write the 64-bit integer `a` to memory using a non-temporal hint to minimize
/// cache pollution. If the cache line containing address `mem_addr` is already
/// in the cache, the cache will be updated.
/// Note: non-temporal stores should be followed by `_mm_sfence()` for reader threads.
void _mm_stream_si64 (long* mem_addr, long a) pure @trusted
{
    // PERF DMD D_SIMD
    static if (GDC_with_SSE2)
    {
        __builtin_ia32_movnti64(mem_addr, a);
    }
    else version(LDC)
    {
        // Inline LLVM IR: a scalar store tagged with !nontemporal metadata.
        enum metadata = `!0 = !{ i32 1 }`;
        enum storeIR = `
            store i64 %1, i64* %0, !nontemporal !0
            ret void`;
        LDCInlineIREx!(metadata, storeIR, "", void, long*, long)(mem_addr, a);
    }
    else
    {
        // Regular store instead.
        *mem_addr = a;
    }
}
unittest
{
    long dst;
    _mm_stream_si64(&dst, -46);
    assert(dst == -46);
}
4822 
/// Lane-wise `a - b` on packed 16-bit integers.
__m128i _mm_sub_epi16(__m128i a, __m128i b) pure @safe
{
    pragma(inline, true);
    short8 lhs = cast(short8)a;
    short8 rhs = cast(short8)b;
    return cast(__m128i)(lhs - rhs);
}
unittest
{
    __m128i lhs = _mm_setr_epi16(16,  32767, 1, 2,    3, 4, 6, 6);
    __m128i rhs = _mm_setr_epi16(15, -32768, 6, 8, 1000, 1, 5, 6);
    short[8] expected =         [ 1,     -1,-5,-6, -997, 3, 1, 0];
    short8 diff = cast(short8) _mm_sub_epi16(lhs, rhs);
    assert(diff.array == expected);
}
4837 
/// Lane-wise `a - b` on packed 32-bit integers.
__m128i _mm_sub_epi32(__m128i a, __m128i b) pure @safe
{
    pragma(inline, true);
    int4 lhs = cast(int4)a;
    int4 rhs = cast(int4)b;
    return cast(__m128i)(lhs - rhs);
}
unittest
{
    __m128i lhs = _mm_setr_epi32(16, int.max, 1, 8);
    __m128i rhs = _mm_setr_epi32(15, int.min, 6, 2);
    int[4] expected =           [ 1,      -1,-5, 6];
    int4 diff = cast(int4) _mm_sub_epi32(lhs, rhs);
    assert(diff.array == expected);
}
4852 
/// Lane-wise `a - b` on packed 64-bit integers.
__m128i _mm_sub_epi64(__m128i a, __m128i b) pure @safe
{
    pragma(inline, true);
    long2 lhs = cast(long2)a;
    long2 rhs = cast(long2)b;
    return cast(__m128i)(lhs - rhs);
}
unittest
{
    __m128i lhs = _mm_setr_epi64(  16, long.max);
    __m128i rhs = _mm_setr_epi64( 199, long.min);
    long[2] expected =          [-183,       -1];
    long2 diff = cast(long2) _mm_sub_epi64(lhs, rhs);
    assert(diff.array == expected);
}
4867 
/// Lane-wise `a - b` on packed 8-bit integers.
__m128i _mm_sub_epi8(__m128i a, __m128i b) pure @safe
{
    pragma(inline, true);
    byte16 lhs = cast(byte16)a;
    byte16 rhs = cast(byte16)b;
    return cast(__m128i)(lhs - rhs);
}
unittest
{
    __m128i lhs = _mm_setr_epi8(16,  127, 1, 2, 3, 4, 6, 6, 16,  127, 1, 2, 3, 4, 6, 6);
    __m128i rhs = _mm_setr_epi8(15, -128, 6, 8, 3, 1, 5, 6, 16,  127, 1, 2, 3, 4, 6, 6);
    byte[16] expected =        [ 1,   -1,-5,-6, 0, 3, 1, 0,  0,    0, 0, 0, 0, 0, 0, 0];
    byte16 diff = cast(byte16) _mm_sub_epi8(lhs, rhs);
    assert(diff.array == expected);
}
4882 
/// Lane-wise `a - b` on packed double-precision (64-bit) floating-point elements.
__m128d _mm_sub_pd(__m128d a, __m128d b) pure @safe
{
    pragma(inline, true);
    __m128d difference = a - b;
    return difference;
}
unittest
{
    __m128d lhs = _mm_setr_pd(4000.0, -8.0);
    __m128d rhs = _mm_setr_pd(12.0, -8450.0);
    double[2] expected =     [3988.0, 8442.0];
    __m128d diff = _mm_sub_pd(lhs, rhs);
    assert(diff.array == expected);
}
4898 
/// Compute `a[0] - b[0]` on the lower double-precision (64-bit) floating-point elements and
/// return it in the lower element of the result; the upper element of the result is copied
/// from `a`.
__m128d _mm_sub_sd(__m128d a, __m128d b) pure @trusted
{
    version(DigitalMars)
    {
        // Work-around for https://issues.dlang.org/show_bug.cgi?id=19599
        // Note that this is unneeded since DMD >= 2.094.0 at least, haven't investigated again
        asm pure nothrow @nogc @trusted { nop;}
        a[0] = a[0] - b[0];
        return a;
    }
    else static if (GDC_with_SSE2)
    {
        return __builtin_ia32_subsd(a, b);
    }
    else
    {
        // Only lane 0 is touched; lane 1 of `a` passes through unchanged.
        a.ptr[0] = a.array[0] - b.array[0];
        return a;
    }
}
unittest
{
    __m128d v = [1.5, -2.0];
    v = _mm_sub_sd(v, v);
    double[2] expected = [0.0, -2.0];
    assert(v.array == expected);
}
4928 
/// Compute `a - b` on 64-bit integers.
__m64 _mm_sub_si64 (__m64 a, __m64 b) pure @safe
{
    pragma(inline, true);
    __m64 difference = a - b;
    return difference;
}
unittest
{
    __m64 small, big;
    small = -1214;
    big = 489415;
    __m64 diff = _mm_sub_si64(big, small);
    assert(diff.array[0] == 489415 + 1214);
}
4943 
/// Subtract packed 16-bit signed integers in `b` from packed 16-bit signed integers in `a`
/// using signed saturation: each lane result is clamped to the `short` range instead of wrapping.
__m128i _mm_subs_epi16(__m128i a, __m128i b) pure @trusted
{
    version(LDC)
    {
        static if (__VERSION__ >= 2085) // saturation x86 intrinsics disappeared in LLVM 8
        {
            // Generates PSUBSW since LDC 1.15 -O0
            enum prefix = `declare <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> %a, <8 x i16> %b)`;
            enum ir = `
                %r = call <8 x i16> @llvm.ssub.sat.v8i16( <8 x i16> %0, <8 x i16> %1)
                ret <8 x i16> %r`;
            return cast(__m128i) LDCInlineIREx!(prefix, ir, "", short8, short8, short8)(cast(short8)a, cast(short8)b);
        }
        else static if (LDC_with_ARM) // Raspberry ships with LDC 1.12, no saturation 
        {
            // Scalar path: subtract in `int` precision, then clamp each lane.
            short[8] res; // PERF: =void;
            short8 sa = cast(short8)a;
            short8 sb = cast(short8)b;
            foreach(i; 0..8)
                res[i] = saturateSignedIntToSignedShort(sa.array[i] - sb.array[i]);
            return _mm_loadu_si128(cast(int4*)res.ptr);
        }
        else static if (LDC_with_SSE2)
        {
            return cast(__m128i) __builtin_ia32_psubsw128(cast(short8) a, cast(short8) b);
        }
        else
            static assert(false);
    }
    else static if (GDC_with_SSE2)
    {
        return cast(__m128i) __builtin_ia32_psubsw128(cast(short8) a, cast(short8) b);
    }
    else
    {
        // Scalar path: subtract in `int` precision, then clamp each lane.
        short[8] res; // PERF =void;
        short8 sa = cast(short8)a;
        short8 sb = cast(short8)b;
        foreach(i; 0..8)
            res.ptr[i] = saturateSignedIntToSignedShort(sa.array[i] - sb.array[i]);
        return _mm_loadu_si128(cast(int4*)res.ptr);
    }
}
unittest
{
    // 32760 - (-10) saturates to short.max, -32760 - 16 saturates to short.min.
    short8 res = cast(short8) _mm_subs_epi16(_mm_setr_epi16(32760, -32760, 5, 4, 3, 2, 1, 0),
                                             _mm_setr_epi16(-10  ,     16, 5, 4, 3, 2, 1, 0));
    static immutable short[8] correctResult =              [32767, -32768, 0, 0, 0, 0, 0, 0];
    assert(res.array == correctResult);
}
4998 
/// Subtract packed 8-bit signed integers in `b` from packed 8-bit signed integers in `a`
/// using signed saturation: each lane result is clamped to the `byte` range instead of wrapping.
__m128i _mm_subs_epi8(__m128i a, __m128i b) pure @trusted
{
    version(LDC)
    {
        static if (__VERSION__ >= 2085) // saturation x86 intrinsics disappeared in LLVM 8
        {
            // x86: Generates PSUBSB since LDC 1.15 -O0
            // ARM: Generates sqsub.16b since LDC 1.21 -O0
            enum prefix = `declare <16 x i8> @llvm.ssub.sat.v16i8(<16 x i8> %a, <16 x i8> %b)`;
            enum ir = `
                %r = call <16 x i8> @llvm.ssub.sat.v16i8( <16 x i8> %0, <16 x i8> %1)
                ret <16 x i8> %r`;
            return cast(__m128i) LDCInlineIREx!(prefix, ir, "", byte16, byte16, byte16)(cast(byte16)a, cast(byte16)b);
        }
        else static if (LDC_with_ARM) // Raspberry ships with LDC 1.12, no saturation 
        {
            // Scalar path: subtract in wider precision, then clamp each lane.
            byte[16] res; // PERF =void;
            byte16 sa = cast(byte16)a;
            byte16 sb = cast(byte16)b;
            foreach(i; 0..16)
                res[i] = saturateSignedWordToSignedByte(sa.array[i] - sb.array[i]);
            return _mm_loadu_si128(cast(int4*)res.ptr);
        }
        else static if (LDC_with_SSE2)
        {
            return cast(__m128i) __builtin_ia32_psubsb128(cast(byte16) a, cast(byte16) b);
        }
        else
            static assert(false);
    }
    else static if (GDC_with_SSE2)
    {
        return cast(__m128i) __builtin_ia32_psubsb128(cast(ubyte16) a, cast(ubyte16) b);
    }
    else
    {
        // Scalar path: subtract in wider precision, then clamp each lane.
        byte[16] res; // PERF =void;
        byte16 sa = cast(byte16)a;
        byte16 sb = cast(byte16)b;
        foreach(i; 0..16)
            res[i] = saturateSignedWordToSignedByte(sa.array[i] - sb.array[i]);
        return _mm_loadu_si128(cast(int4*)res.ptr);
    }
}
unittest
{
    // -128 - 15 saturates to byte.min, 127 - (-14) saturates to byte.max.
    byte16 res = cast(byte16) _mm_subs_epi8(_mm_setr_epi8(-128, 127, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0),
                                            _mm_setr_epi8(  15, -14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0));
    static immutable byte[16] correctResult            = [-128, 127,  0,  0,  0,  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0];
    assert(res.array == correctResult);
}
5051 
/// Subtract packed 16-bit unsigned integers in `b` from packed 16-bit unsigned integers in `a`
/// using unsigned saturation: a lane whose difference would be negative yields 0.
__m128i _mm_subs_epu16(__m128i a, __m128i b) pure @trusted
{
    version(LDC)
    {
        static if (__VERSION__ >= 2085) // saturation x86 intrinsics disappeared in LLVM 8
        {
            // x86: Generates PSUBUSW since LDC 1.15 -O0
            // ARM: Generates uqsub.8h since LDC 1.21 -O0
            enum prefix = `declare <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> %a, <8 x i16> %b)`;
            enum ir = `
                %r = call <8 x i16> @llvm.usub.sat.v8i16( <8 x i16> %0, <8 x i16> %1)
                ret <8 x i16> %r`;
            return cast(__m128i) LDCInlineIREx!(prefix, ir, "", short8, short8, short8)(cast(short8)a, cast(short8)b);
        }
        else static if (LDC_with_ARM) // Raspberry ships with LDC 1.12, no saturation 
        {
            // Scalar path: reinterpret lanes as unsigned, subtract in `int`, then clamp.
            short[8] res; // PERF =void;
            short8 sa = cast(short8)a;
            short8 sb = cast(short8)b;
            foreach(i; 0..8)
            {
                int diff = cast(ushort)(sa.array[i]) - cast(ushort)(sb.array[i]);
                res[i] = saturateSignedIntToUnsignedShort(diff);
            }
            return _mm_loadu_si128(cast(int4*)res.ptr);
        }
        else static if (LDC_with_SSE2)
        {
            return cast(__m128i) __builtin_ia32_psubusw128(a, b);
        }
        else 
            static assert(false);
    }
    else static if (GDC_with_SSE2)
    {
        return cast(__m128i) __builtin_ia32_psubusw128(cast(short8)a, cast(short8)b);
    }
    else
    {
        // Scalar path: reinterpret lanes as unsigned, subtract in `int`, then clamp.
        short[8] res; // PERF =void;
        short8 sa = cast(short8)a;
        short8 sb = cast(short8)b;
        foreach(i; 0..8)
        {
            int diff = cast(ushort)(sa.array[i]) - cast(ushort)(sb.array[i]);
            res[i] = saturateSignedIntToUnsignedShort(diff);
        }
        return _mm_loadu_si128(cast(int4*)res.ptr);
    }
}
unittest
{
    // 65534 - 65535 and 1 - 16 would be negative, so both lanes yield 0.
    short8 R = cast(short8) _mm_subs_epu16(_mm_setr_epi16(cast(short)65534,  1, 5, 4, 3, 2, 1, 0),
                                           _mm_setr_epi16(cast(short)65535, 16, 4, 4, 3, 0, 1, 0));
    static immutable short[8] correct =                  [               0,  0, 1, 0, 0, 2, 0, 0];
    assert(R.array == correct);
}
5110 
/// Subtract packed 8-bit unsigned integers in `b` from packed 8-bit unsigned integers in `a`
/// using unsigned saturation: a lane whose difference would be negative yields 0.
__m128i _mm_subs_epu8(__m128i a, __m128i b) pure @trusted
{
    version(LDC)
    {
        static if (__VERSION__ >= 2085) // saturation x86 intrinsics disappeared in LLVM 8
        {
            // x86: Generates PSUBUSB since LDC 1.15 -O0
            // ARM: Generates uqsub.16b since LDC 1.21 -O0
            enum prefix = `declare <16 x i8> @llvm.usub.sat.v16i8(<16 x i8> %a, <16 x i8> %b)`;
            enum ir = `
                %r = call <16 x i8> @llvm.usub.sat.v16i8( <16 x i8> %0, <16 x i8> %1)
                ret <16 x i8> %r`;
            return cast(__m128i) LDCInlineIREx!(prefix, ir, "", byte16, byte16, byte16)(cast(byte16)a, cast(byte16)b);
        }
        else static if (LDC_with_ARM) // Raspberry ships with LDC 1.12, no saturation
        {
            // Scalar path, written inline. It used to be wrapped in a nested function
            // declaration that was never called, leaving this branch without a return.
            ubyte[16] res; // PERF =void;
            byte16 sa = cast(byte16)a;
            byte16 sb = cast(byte16)b;
            foreach(i; 0..16)
                res[i] = saturateSignedWordToUnsignedByte(cast(ubyte)(sa.array[i]) - cast(ubyte)(sb.array[i]));
            return _mm_loadu_si128(cast(int4*)res.ptr);
        }
        else static if (LDC_with_SSE2)
        {
            return __builtin_ia32_psubusb128(a, b);
        }
        else 
            static assert(false);
    }
    else static if (GDC_with_SSE2)
    {
        return cast(__m128i) __builtin_ia32_psubusb128(cast(ubyte16) a, cast(ubyte16) b);
    }
    else
    {
        // Scalar path: reinterpret lanes as unsigned, subtract in wider precision, then clamp.
        ubyte[16] res; // PERF =void;
        byte16 sa = cast(byte16)a;
        byte16 sb = cast(byte16)b;
        foreach(i; 0..16)
            res[i] = saturateSignedWordToUnsignedByte(cast(ubyte)(sa.array[i]) - cast(ubyte)(sb.array[i]));
        return _mm_loadu_si128(cast(int4*)res.ptr);
    }
}
unittest
{
    // 254 - 255, 13 - 14 and 12 - 42 would be negative, so those lanes yield 0.
    byte16 res = cast(byte16) _mm_subs_epu8(_mm_setr_epi8(cast(byte)254, 127, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0),
                                            _mm_setr_epi8(cast(byte)255, 120, 14, 42, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0));
    static immutable byte[16] correctResult =            [            0,   7,  0,  0,  0,  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0];
    assert(res.array == correctResult);
}
5167 
// Note: each `_mm_ucomi*_sd` intrinsic is aliased to its `_mm_comi*_sd` counterpart.
//       The only difference between the two families is the signalling behaviour
//       for quiet NaNs, so the aliasing is not strictly correct. However, the case
//       where you would want to differentiate between qNaN and sNaN and then
//       treat them differently on purpose seems extremely rare.
alias _mm_ucomieq_sd = _mm_comieq_sd; ///
alias _mm_ucomige_sd = _mm_comige_sd; ///
alias _mm_ucomigt_sd = _mm_comigt_sd; ///
alias _mm_ucomile_sd = _mm_comile_sd; ///
alias _mm_ucomilt_sd = _mm_comilt_sd; ///
alias _mm_ucomineq_sd = _mm_comineq_sd; ///
5178 
/// Produce a `__m128d` whose elements are left uninitialized (undefined contents).
__m128d _mm_undefined_pd() pure @safe
{
    pragma(inline, true);
    // Deliberately void-initialized: no lane is ever written.
    __m128d garbage = void;
    return garbage;
}
5186 
/// Produce a `__m128i` whose elements are left uninitialized (undefined contents).
__m128i _mm_undefined_si128() pure @safe
{
    pragma(inline, true);
    // Deliberately void-initialized: no lane is ever written.
    __m128i garbage = void;
    return garbage;
}
5194 
/// Interleave the 16-bit integers from the high half of `a` and `b`:
/// result is `a[4], b[4], a[5], b[5], a[6], b[6], a[7], b[7]`.
__m128i _mm_unpackhi_epi16 (__m128i a, __m128i b) pure @trusted
{
    // PERF DMD D_SIMD
    static if (GDC_with_SSE2)
    {
        short8 sa = cast(short8) a;
        short8 sb = cast(short8) b;
        return cast(__m128i) __builtin_ia32_punpckhwd128(sa, sb);
    }
    else version(LDC)
    {
        // Shuffle indices 0..7 select from `a`, 8..15 select from `b`.
        short8 sa = cast(short8) a;
        short8 sb = cast(short8) b;
        return cast(__m128i) shufflevectorLDC!(short8, 4, 12, 5, 13, 6, 14, 7, 15)(sa, sb);
    }
    else static if (DMD_with_32bit_asm)
    {
        asm pure nothrow @nogc @trusted
        {
            movdqu XMM0, a;
            movdqu XMM1, b;
            punpckhwd XMM0, XMM1;
            movdqu a, XMM0;
        }
        return a;
    }   
    else
    {
        // Scalar path: lanes 4..7 of each input, alternating a/b.
        short8 sa = cast(short8)a;
        short8 sb = cast(short8)b;
        short8 interleaved = void;
        foreach(k; 0..4)
        {
            interleaved.ptr[2 * k]     = sa.array[4 + k];
            interleaved.ptr[2 * k + 1] = sb.array[4 + k];
        }
        return cast(__m128i)interleaved;
    }
}
unittest
{
    __m128i lhs = _mm_setr_epi16(4,   5,  6,  7,  8,  9, 10, 11);
    __m128i rhs = _mm_setr_epi16(12, 13, 14, 15, 16, 17, 18, 19);
    short[8] expected = [8, 16, 9, 17, 10, 18, 11, 19];
    short8 got = cast(short8)(_mm_unpackhi_epi16(lhs, rhs));
    assert(got.array == expected);
}
5243 
/// Interleave the 32-bit integers from the high half of `a` and `b`:
/// result is `a[2], b[2], a[3], b[3]`.
__m128i _mm_unpackhi_epi32 (__m128i a, __m128i b) pure @trusted
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_punpckhdq128(a, b);
    }
    else version(LDC)
    {
        // Shuffle indices 0..3 select from `a`, 4..7 select from `b`.
        int4 ia = cast(int4)a;
        int4 ib = cast(int4)b;
        return shufflevectorLDC!(int4, 2, 6, 3, 7)(ia, ib);
    }
    else
    {
        // Scalar path: lanes 2..3 of each input, alternating a/b.
        __m128i interleaved = void;
        foreach(k; 0..2)
        {
            interleaved.ptr[2 * k]     = a.array[2 + k];
            interleaved.ptr[2 * k + 1] = b.array[2 + k];
        }
        return interleaved;
    }
}
unittest
{
    __m128i lhs = _mm_setr_epi32(1, 2, 3, 4);
    __m128i rhs = _mm_setr_epi32(5, 6, 7, 8);
    int[4] expected = [3, 7, 4, 8];
    __m128i got = _mm_unpackhi_epi32(lhs, rhs);
    assert(got.array == expected);
}
5273 
/// Interleave the 64-bit integers from the high half of `a` and `b`:
/// the result holds the upper 64 bits of `a` followed by the upper 64 bits of `b`.
__m128i _mm_unpackhi_epi64 (__m128i a, __m128i b) pure @trusted
{
    static if (GDC_with_SSE2)
    {
        long2 la = cast(long2) a;
        long2 lb = cast(long2) b;
        return cast(__m128i) __builtin_ia32_punpckhqdq128(la, lb);
    }
    else
    {
        // Start from `b` so its upper half is already in place, then overwrite
        // the lower half (32-bit lanes 0 and 1) with the upper half of `a`.
        __m128i result = b;
        result[0] = a[2];
        result[1] = a[3];
        return result;
    }
}
unittest // Issue #36
{
    __m128i lhs = _mm_setr_epi64(0x22222222_22222222, 0x33333333_33333333);
    __m128i rhs = _mm_setr_epi64(0x44444444_44444444, 0x55555555_55555555);
    long[2] expected = [0x33333333_33333333, 0x55555555_55555555];
    long2 got = cast(long2)(_mm_unpackhi_epi64(lhs, rhs));
    assert(got.array == expected);
}
5297 
/// Interleave the 8-bit integers from the high half of `a` and `b`:
/// result is `a[8], b[8], a[9], b[9], ..., a[15], b[15]`.
__m128i _mm_unpackhi_epi8 (__m128i a, __m128i b) pure @trusted
{
    // PERF DMD D_SIMD
    static if (GDC_with_SSE2)
    {
        ubyte16 ua = cast(ubyte16)a;
        ubyte16 ub = cast(ubyte16)b;
        return cast(__m128i) __builtin_ia32_punpckhbw128(ua, ub);
    }
    else static if (DMD_with_32bit_asm)
    {
        asm pure nothrow @nogc @trusted
        {
            movdqu XMM0, a;
            movdqu XMM1, b;
            punpckhbw XMM0, XMM1;
            movdqu a, XMM0;
        }
        return a;
    }
    else version(LDC)
    {
        // Shuffle indices 0..15 select from `a`, 16..31 select from `b`.
        byte16 ba = cast(byte16)a;
        byte16 bb = cast(byte16)b;
        return cast(__m128i)shufflevectorLDC!(byte16, 8,  24,  9, 25, 10, 26, 11, 27,
                                                      12, 28, 13, 29, 14, 30, 15, 31)(ba, bb);
    }
    else
    {
        // Scalar path: lanes 8..15 of each input, alternating a/b.
        byte16 ba = cast(byte16)a;
        byte16 bb = cast(byte16)b;
        byte16 interleaved = void;
        foreach(k; 0..8)
        {
            interleaved.ptr[2 * k]     = ba.array[8 + k];
            interleaved.ptr[2 * k + 1] = bb.array[8 + k];
        }
        return cast(__m128i)interleaved;
    }
}
unittest
{
    __m128i lhs = _mm_setr_epi8( 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15);
    __m128i rhs = _mm_setr_epi8(16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31);
    byte[16] expected = [8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31];
    byte16 got = cast(byte16) _mm_unpackhi_epi8(lhs, rhs);
    assert(got.array == expected);
}
5355 
/// Interleave the double-precision (64-bit) floating-point elements from the high half of
/// `a` and `b`: result is `a[1], b[1]`.
__m128d _mm_unpackhi_pd (__m128d a, __m128d b) pure @trusted
{
    // PERF DMD D_SIMD
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_unpckhpd(a, b);
    }
    else version(LDC)
    {
        // Shuffle indices 0..1 select from `a`, 2..3 select from `b`.
        return shufflevectorLDC!(__m128d, 1, 3)(a, b);
    }
    else
    {
        const double highA = a.array[1];
        const double highB = b.array[1];
        double2 result = void;
        result.ptr[0] = highA;
        result.ptr[1] = highB;
        return result;
    }
}
unittest
{
    __m128d lhs = _mm_setr_pd(4.0, 6.0);
    __m128d rhs = _mm_setr_pd(7.0, 9.0);
    double[2] expected = [6.0, 9.0];
    __m128d got = _mm_unpackhi_pd(lhs, rhs);
    assert(got.array == expected);
}
5384 
/// Interleave the 16-bit integers from the low half of `a` and `b`:
/// result is `a[0], b[0], a[1], b[1], a[2], b[2], a[3], b[3]`.
__m128i _mm_unpacklo_epi16 (__m128i a, __m128i b) pure @trusted
{
    // PERF DMD SIMD
    static if (GDC_with_SSE2)
    {
        short8 sa = cast(short8) a;
        short8 sb = cast(short8) b;
        return cast(__m128i) __builtin_ia32_punpcklwd128(sa, sb);
    }
    else version(LDC)
    {
        // Shuffle indices 0..7 select from `a`, 8..15 select from `b`.
        short8 sa = cast(short8)a;
        short8 sb = cast(short8)b;
        return cast(__m128i) shufflevectorLDC!(short8, 0, 8, 1, 9, 2, 10, 3, 11)(sa, sb);
    }
    else static if (DMD_with_32bit_asm)
    {
        asm pure nothrow @nogc @trusted
        {
            movdqu XMM0, a;
            movdqu XMM1, b;
            punpcklwd XMM0, XMM1;
            movdqu a, XMM0;
        }
        return a;
    }
    else
    {
        // Scalar path: lanes 0..3 of each input, alternating a/b.
        short8 sa = cast(short8)a;
        short8 sb = cast(short8)b;
        short8 interleaved = void;
        foreach(k; 0..4)
        {
            interleaved.ptr[2 * k]     = sa.array[k];
            interleaved.ptr[2 * k + 1] = sb.array[k];
        }
        return cast(__m128i)interleaved;
    }
}
unittest
{
    __m128i lhs = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
    __m128i rhs = _mm_setr_epi16(8, 9, 10, 11, 12, 13, 14, 15);
    short[8] expected = [0, 8, 1, 9, 2, 10, 3, 11];
    short8 got = cast(short8) _mm_unpacklo_epi16(lhs, rhs);
    assert(got.array == expected);
}
5432 
/// Interleave the two low 32-bit integers of `a` with the two low 32-bit integers of `b`.
/// Output lanes are `a[0], b[0], a[1], b[1]`.
__m128i _mm_unpacklo_epi32 (__m128i a, __m128i b) pure @trusted
{
    // PERF DMD
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_punpckldq128(a, b);
    }
    else version(LDC)
    {
        return shufflevectorLDC!(int4, 0, 4, 1, 5)(cast(int4)a, cast(int4)b);
    }
    else
    {
        // Portable path: copy element i of each operand into the lane pair (2*i, 2*i + 1).
        __m128i result;
        foreach (i; 0 .. 2)
        {
            result.ptr[2 * i]     = a.array[i];
            result.ptr[2 * i + 1] = b.array[i];
        }
        return result;
    }
}
unittest
{
    __m128i lhs = _mm_setr_epi32(1, 2, 3, 4);
    __m128i rhs = _mm_setr_epi32(5, 6, 7, 8);
    __m128i interleaved = _mm_unpacklo_epi32(lhs, rhs);

    // Lower two dwords of each operand, alternating.
    int[4] expected = [1, 5, 2, 6];
    assert(interleaved.array == expected);
}
5463 
/// Interleave the low 64-bit integer of `a` with the low 64-bit integer of `b`.
/// The returned vector holds `a[0]` in lane 0 and `b[0]` in lane 1.
__m128i _mm_unpacklo_epi64 (__m128i a, __m128i b) pure @trusted
{
    static if (GDC_with_SSE2)
    {
        return cast(__m128i) __builtin_ia32_punpcklqdq128(cast(long2) a, cast(long2) b);
    }
    else
    {
        // Portable path: view both operands as two 64-bit lanes and pick lane 0 of each.
        long2 lowA = cast(long2)a;
        long2 lowB = cast(long2)b;
        long2 result; // PERF =void;
        result.ptr[0] = lowA.array[0];
        result.ptr[1] = lowB.array[0];
        return cast(__m128i)result;
    }
}
unittest // Issue #36
{
    __m128i lhs = _mm_setr_epi64(0x22222222_22222222, 0x33333333_33333333);
    __m128i rhs = _mm_setr_epi64(0x44444444_44444444, 0x55555555_55555555);
    long2 lows = cast(long2)(_mm_unpacklo_epi64(lhs, rhs));

    // Lane 0 of each operand, first operand first.
    long[2] expected = [0x22222222_22222222, 0x44444444_44444444];
    assert(lows.array == expected);
}
5489 
/// Interleave the eight low 8-bit integers of `a` with the eight low 8-bit integers of `b`.
/// Output lanes are `a[0], b[0], a[1], b[1], ..., a[7], b[7]`.
__m128i _mm_unpacklo_epi8 (__m128i a, __m128i b) pure @trusted
{
    // PERF DMD D_SIMD
    static if (GDC_with_SSE2)
    {
        return cast(__m128i) __builtin_ia32_punpcklbw128(cast(ubyte16) a, cast(ubyte16) b);
    }
    else static if (DMD_with_32bit_asm)
    {
        // Inline-assembly path: the result is written back into the parameter `a`.
        asm pure nothrow @nogc @trusted
        {
            movdqu XMM0, a;
            movdqu XMM1, b;
            punpcklbw XMM0, XMM1;
            movdqu a, XMM0;
        }
        return a;
    }
    else version(LDC)
    {
        return cast(__m128i) shufflevectorLDC!(byte16,
                                               0, 16, 1, 17, 2, 18, 3, 19,
                                               4, 20, 5, 21, 6, 22, 7, 23)
                                              (cast(byte16)a, cast(byte16)b);
    }
    else
    {
        // Portable path: every lane is written by the loop, hence no initialization.
        byte16 lhs = cast(byte16)a;
        byte16 rhs = cast(byte16)b;
        byte16 result = void;
        foreach (i; 0 .. 8)
        {
            result.ptr[2 * i]     = lhs.array[i];
            result.ptr[2 * i + 1] = rhs.array[i];
        }
        return cast(__m128i)result;
    }
}
unittest
{
    // The two inputs hold the byte values 0..15 and 16..31, so every output byte
    // identifies both the operand and the lane it was taken from.
    __m128i lhs = _mm_setr_epi8( 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15);
    __m128i rhs = _mm_setr_epi8(16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31);
    byte16 interleaved = cast(byte16) _mm_unpacklo_epi8(lhs, rhs);

    // Lower eight bytes of `lhs` alternate with the lower eight bytes of `rhs`.
    byte[16] expected = [0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23];
    assert(interleaved.array == expected);
}
5547 
/// Interleave the low double-precision (64-bit) floating-point elements of `a` and `b`.
/// The returned vector holds `a[0]` in lane 0 and `b[0]` in lane 1.
__m128d _mm_unpacklo_pd (__m128d a, __m128d b) pure @trusted
{
    // PERF DMD D_SIMD
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_unpcklpd(a, b);
    }
    else version(LDC)
    {
        return shufflevectorLDC!(__m128d, 0, 2)(a, b);
    }
    else
    {
        // Portable path: both lanes are written below, hence no initialization.
        double2 result = void;
        result.ptr[0] = a.array[0];
        result.ptr[1] = b.array[0];
        return result;
    }
}
unittest
{
    __m128d lhs = _mm_setr_pd(4.0, 6.0);
    __m128d rhs = _mm_setr_pd(7.0, 9.0);
    __m128d lows = _mm_unpacklo_pd(lhs, rhs);

    // Lane 0 of each operand, first operand first.
    double[2] expected = [4.0, 7.0];
    assert(lows.array == expected);
}
5576 
/// Bitwise XOR of the packed double-precision (64-bit) floating-point elements in `a` and `b`.
/// The operands are reinterpreted as integer vectors, XORed, and the bits are
/// reinterpreted back as `__m128d`.
__m128d _mm_xor_pd (__m128d a, __m128d b) pure @safe
{
    __m128i bitsA = cast(__m128i)a;
    __m128i bitsB = cast(__m128i)b;
    return cast(__m128d)(bitsA ^ bitsB);
}
unittest
{
    // Each lane pairs a value with its negation, which the test expects to XOR
    // down to the top bit only (`long.min`).
    __m128d lhs = _mm_setr_pd(-4.0, 6.0);
    __m128d rhs = _mm_setr_pd(4.0, -6.0);
    long2 bits = cast(long2) _mm_xor_pd(lhs, rhs);
    long[2] expected = [long.min, long.min];
    assert(bits.array == expected);
}
5590 
/// Bitwise XOR of the 128 bits (treated as integer data) held in `a` and `b`.
__m128i _mm_xor_si128 (__m128i a, __m128i b) pure @safe
{
    __m128i mixed = a ^ b;
    return mixed;
}
unittest
{
    // Reference values are computed with the scalar `^` on the same 64-bit inputs.
    enum long a0 = 975394;
    enum long a1 = 619809709;
    enum long b0 = -920275025;
    enum long b1 = -6;

    __m128i lhs = _mm_setr_epi64(a0, a1);
    __m128i rhs = _mm_setr_epi64(b0, b1);
    long2 mixed = cast(long2) _mm_xor_si128(lhs, rhs);
    long[2] expected = [a0 ^ b0, a1 ^ b1];
    assert(mixed.array == expected);
}
5604 
unittest
{
    // Small end-to-end example: subtract two 4-float arrays, square the differences,
    // add the four squares together and take the square root of the total.
    float distance(float[4] a, float[4] b) nothrow @nogc
    {
        __m128 delta = _mm_sub_ps(_mm_loadu_ps(a.ptr), _mm_loadu_ps(b.ptr));
        __m128 squares = _mm_mul_ps(delta, delta);

        // Two shift-and-add steps (by 8, then by 4) gather the sum of all lanes
        // into lane 0; this relies on `_mm_srli_ps`, which is defined elsewhere.
        __m128 total = _mm_add_ps(squares, _mm_srli_ps!8(squares));
        total = _mm_add_ps(total, _mm_srli_ps!4(total));

        return _mm_cvtss_f32(_mm_sqrt_ss(total));
    }
    assert(distance([0, 2, 0, 0], [0, 0, 0, 0]) == 2);
}