1 /**
2 * Copyright: Copyright Auburn Sounds 2016-2019, Stefanos Baziotis 2019.
3 * License:   $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
4 * Authors:   Guillaume Piolat
5 */
6 module inteli.emmintrin;
7 
8 public import inteli.types;
9 public import inteli.xmmintrin; // SSE2 includes SSE1
10 import inteli.mmx;
11 import inteli.internals;
12 
13 nothrow @nogc:
14 
15 
16 // SSE2 instructions
17 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#techs=SSE2
18 
/// Add packed 16-bit integers in `a` and `b` (lane-wise wrapping addition).
__m128i _mm_add_epi16 (__m128i a, __m128i b) pure @safe
{
    short8 sa = cast(short8)a;
    short8 sb = cast(short8)b;
    short8 sum = sa + sb;
    return cast(__m128i)sum;
}
23 
/// Add packed 32-bit integers in `a` and `b` (lane-wise wrapping addition).
__m128i _mm_add_epi32 (__m128i a, __m128i b) pure @safe
{
    int4 ia = cast(int4)a;
    int4 ib = cast(int4)b;
    return cast(__m128i)(ia + ib);
}
28 
/// Add packed 64-bit integers in `a` and `b` (lane-wise wrapping addition).
__m128i _mm_add_epi64 (__m128i a, __m128i b) pure @safe
{
    long2 la = cast(long2)a;
    long2 lb = cast(long2)b;
    return cast(__m128i)(la + lb);
}
33 
/// Add packed 8-bit integers in `a` and `b` (lane-wise wrapping addition).
__m128i _mm_add_epi8 (__m128i a, __m128i b) pure @safe
{
    byte16 ba = cast(byte16)a;
    byte16 bb = cast(byte16)b;
    return cast(__m128i)(ba + bb);
}
38 
// _mm_add_sd: add the lower double of `a` and `b`; the upper double of `a`
// passes through unchanged.
static if (GDC_with_SSE2)
{
    alias _mm_add_sd = __builtin_ia32_addsd;
}
else version(DigitalMars)
{
    // Work-around for https://issues.dlang.org/show_bug.cgi?id=19599
    // The `nop` asm block is part of the workaround (see linked issue).
    __m128d _mm_add_sd(__m128d a, __m128d b) pure @safe
    {
        asm pure nothrow @nogc @trusted { nop;}
        a[0] = a[0] + b[0];
        return a;
    }
}
else
{
    __m128d _mm_add_sd(__m128d a, __m128d b) pure @safe
    {
        a[0] += b[0];
        return a;
    }
}
unittest
{
    __m128d a = [1.5, -2.0];
    a = _mm_add_sd(a, a);
    assert(a.array == [3.0, -2.0]); // lower lane doubled, upper lane untouched
}
67 
68 
/// Add packed double-precision (64-bit) floating-point elements in `a` and `b`.
__m128d _mm_add_pd (__m128d a, __m128d b) pure @safe
{
    return a + b;
}
unittest
{
    __m128d a = [1.5, -2.0];
    a = _mm_add_pd(a, a);
    assert(a.array == [3.0, -4.0]);
}
79 
/// Add the 64-bit integers `a` and `b` (wrapping addition).
__m64 _mm_add_si64 (__m64 a, __m64 b) pure @safe
{
    return a + b;
}
84 
// _mm_adds_epi16: add packed signed 16-bit integers with signed saturation.
static if (GDC_with_SSE2)
{
    alias _mm_adds_epi16 = __builtin_ia32_paddsw128;
}
else version(LDC)
{
    static if (__VERSION__ >= 2085) // saturation x86 intrinsics disappeared in LLVM 8
    {
        // Generates PADDSW since LDC 1.15 -O0
        __m128i _mm_adds_epi16(__m128i a, __m128i b) pure @trusted
        {
            enum prefix = `declare <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> %a, <8 x i16> %b)`;
            enum ir = `
                %r = call <8 x i16> @llvm.sadd.sat.v8i16( <8 x i16> %0, <8 x i16> %1)
                ret <8 x i16> %r`;
            return cast(__m128i) LDCInlineIREx!(prefix, ir, "", short8, short8, short8)(cast(short8)a, cast(short8)b);
        }
    }
    else
        alias _mm_adds_epi16 = __builtin_ia32_paddsw128;
}
else
{    
    // Portable fallback: add lanes in `int` width, then saturate back to short.
    __m128i _mm_adds_epi16(__m128i a, __m128i b) pure @trusted
    {
        short[8] res;
        short8 sa = cast(short8)a;
        short8 sb = cast(short8)b;
        foreach(i; 0..8)
            res[i] = saturateSignedIntToSignedShort(sa.array[i] + sb.array[i]);
        return _mm_loadu_si128(cast(int4*)res.ptr);
    }
}
unittest
{
    short8 res = cast(short8) _mm_adds_epi16(_mm_set_epi16(7, 6, 5, 4, 3, 2, 1, 0),
                                             _mm_set_epi16(7, 6, 5, 4, 3, 2, 1, 0));
    static immutable short[8] correctResult = [0, 2, 4, 6, 8, 10, 12, 14];
    assert(res.array == correctResult);
}
125 
// _mm_adds_epi8: add packed signed 8-bit integers with signed saturation.
static if (GDC_with_SSE2)
{
    alias _mm_adds_epi8 = __builtin_ia32_paddsb128;
}
else version(LDC)
{
    static if (__VERSION__ >= 2085) // saturation x86 intrinsics disappeared in LLVM 8
    {
        // Generates PADDSB since LDC 1.15 -O0
        __m128i _mm_adds_epi8(__m128i a, __m128i b) pure @trusted
        {
            enum prefix = `declare <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8> %a, <16 x i8> %b)`;
            enum ir = `
                %r = call <16 x i8> @llvm.sadd.sat.v16i8( <16 x i8> %0, <16 x i8> %1)
                ret <16 x i8> %r`;
            return cast(__m128i) LDCInlineIREx!(prefix, ir, "", byte16, byte16, byte16)(cast(byte16)a, cast(byte16)b);
        }
    }
    else
        alias _mm_adds_epi8 = __builtin_ia32_paddsb128;
}
else
{
    // Portable fallback: add lanes in `int` width, then saturate back to byte.
    __m128i _mm_adds_epi8(__m128i a, __m128i b) pure @trusted
    {
        byte[16] res;
        byte16 sa = cast(byte16)a;
        byte16 sb = cast(byte16)b;
        foreach(i; 0..16)
            res[i] = saturateSignedWordToSignedByte(sa[i] + sb[i]);
        return _mm_loadu_si128(cast(int4*)res.ptr);
    }
}
unittest
{
    byte16 res = cast(byte16) _mm_adds_epi8(_mm_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0),
                                            _mm_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0));
    static immutable byte[16] correctResult = [0, 2, 4, 6, 8, 10, 12, 14,
                                               16, 18, 20, 22, 24, 26, 28, 30];
    assert(res.array == correctResult);
}
167 
// _mm_adds_epu8: add packed unsigned 8-bit integers with unsigned saturation.
// Consistency fix: every sibling saturating-add above has a GDC_with_SSE2
// branch; this one was missing it, so GDC fell through to the slow scalar
// fallback. __builtin_ia32_paddusb128 is the same builtin the pre-LLVM-8
// LDC path below already uses.
static if (GDC_with_SSE2)
{
    alias _mm_adds_epu8 = __builtin_ia32_paddusb128;
}
else version(LDC)
{
    static if (__VERSION__ >= 2085) // saturation x86 intrinsics disappeared in LLVM 8
    {
        // Generates PADDUSB since LDC 1.15 -O0
        __m128i _mm_adds_epu8(__m128i a, __m128i b) pure @trusted
        {
            enum prefix = `declare <16 x i8> @llvm.uadd.sat.v16i8(<16 x i8> %a, <16 x i8> %b)`;
            enum ir = `
                %r = call <16 x i8> @llvm.uadd.sat.v16i8( <16 x i8> %0, <16 x i8> %1)
                ret <16 x i8> %r`;
            return cast(__m128i) LDCInlineIREx!(prefix, ir, "", byte16, byte16, byte16)(cast(byte16)a, cast(byte16)b);
        }
    }
    else
        alias _mm_adds_epu8 = __builtin_ia32_paddusb128;
}
else
{
    // Portable fallback: reinterpret lanes as unsigned, add in `int` width,
    // then saturate back to ubyte.
    __m128i _mm_adds_epu8(__m128i a, __m128i b) pure @trusted
    {
        ubyte[16] res;
        byte16 sa = cast(byte16)a;
        byte16 sb = cast(byte16)b;
        foreach(i; 0..16)
            res[i] = saturateSignedWordToUnsignedByte(cast(ubyte)(sa.array[i]) + cast(ubyte)(sb.array[i]));
        return _mm_loadu_si128(cast(int4*)res.ptr);
    }
}
197 
// _mm_adds_epu16: add packed unsigned 16-bit integers with unsigned saturation.
// Consistency fix: add the missing GDC_with_SSE2 branch (see _mm_adds_epi16);
// __builtin_ia32_paddusw128 is the same builtin the pre-LLVM-8 LDC path uses.
static if (GDC_with_SSE2)
{
    alias _mm_adds_epu16 = __builtin_ia32_paddusw128;
}
else version(LDC)
{
    static if (__VERSION__ >= 2085) // saturation x86 intrinsics disappeared in LLVM 8
    {
        // Generates PADDUSW since LDC 1.15 -O0
        __m128i _mm_adds_epu16(__m128i a, __m128i b) pure @trusted
        {
            enum prefix = `declare <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> %a, <8 x i16> %b)`;
            enum ir = `
                %r = call <8 x i16> @llvm.uadd.sat.v8i16( <8 x i16> %0, <8 x i16> %1)
                ret <8 x i16> %r`;
            return cast(__m128i) LDCInlineIREx!(prefix, ir, "", short8, short8, short8)(cast(short8)a, cast(short8)b);
        }
    }
    else
        alias _mm_adds_epu16 = __builtin_ia32_paddusw128;
}
else
{
    // Portable fallback: reinterpret lanes as unsigned, add in `int` width,
    // then saturate back to ushort.
    __m128i _mm_adds_epu16(__m128i a, __m128i b) pure @trusted
    {
        ushort[8] res;
        short8 sa = cast(short8)a;
        short8 sb = cast(short8)b;
        foreach(i; 0..8)
            res[i] = saturateSignedIntToUnsignedShort(cast(ushort)(sa.array[i]) + cast(ushort)(sb.array[i]));
        return _mm_loadu_si128(cast(int4*)res.ptr);
    }
}
227 
/// Bitwise AND of `a` and `b`, reinterpreting the doubles as raw 128 bits.
__m128d _mm_and_pd (__m128d a, __m128d b) pure @safe
{
    return cast(__m128d)( cast(__m128i)a & cast(__m128i)b );
}

/// Bitwise AND of 128 bits in `a` and `b`.
__m128i _mm_and_si128 (__m128i a, __m128i b) pure @safe
{
    return a & b;
}
unittest
{
    __m128i A = _mm_set1_epi32(7);
    __m128i B = _mm_set1_epi32(14);
    __m128i R = _mm_and_si128(A, B);
    int[4] correct = [6, 6, 6, 6];
    assert(R.array == correct);
}
245 
/// Compute `(NOT a) AND b` bitwise, reinterpreting the doubles as raw 128 bits.
__m128d _mm_andnot_pd (__m128d a, __m128d b) pure @safe
{
    return cast(__m128d)( (~cast(__m128i)a) & cast(__m128i)b );
}

/// Compute `(NOT a) AND b` on 128 bits.
__m128i _mm_andnot_si128 (__m128i a, __m128i b) pure @safe
{
    return (~a) & b;
}
unittest
{
    __m128i A = _mm_set1_epi32(7);
    __m128i B = _mm_set1_epi32(14);
    __m128i R = _mm_andnot_si128(A, B);
    int[4] correct = [8, 8, 8, 8];
    assert(R.array == correct);
}
263 
// _mm_avg_epu16: rounded average of packed unsigned 16-bit integers,
// i.e. (a + b + 1) >> 1 computed without overflow.
static if (GDC_with_SSE2)
{
    alias _mm_avg_epu16 = __builtin_ia32_pavgw128;
}
else version(LDC)
{
    __m128i _mm_avg_epu16 (__m128i a, __m128i b) pure @safe
    {
        // Generates pavgw even in LDC 1.0, even in -O0
        // Widen to 32-bit, add, add 1, shift right, narrow back.
        enum ir = `
            %ia = zext <8 x i16> %0 to <8 x i32>
            %ib = zext <8 x i16> %1 to <8 x i32>
            %isum = add <8 x i32> %ia, %ib
            %isum1 = add <8 x i32> %isum, < i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
            %isums = lshr <8 x i32> %isum1, < i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
            %r = trunc <8 x i32> %isums to <8 x i16>
            ret <8 x i16> %r`;
        return cast(__m128i) LDCInlineIR!(ir, short8, short8, short8)(cast(short8)a, cast(short8)b);
    }
}
else
{
    __m128i _mm_avg_epu16 (__m128i a, __m128i b) pure @safe
    {
        short8 sa = cast(short8)a;
        short8 sb = cast(short8)b;
        short8 sr = void;
        foreach(i; 0..8)
        {
            // ushort + ushort + 1 is computed in `int`, so no overflow here.
            sr[i] = cast(ushort)( (cast(ushort)(sa[i]) + cast(ushort)(sb[i]) + 1) >> 1 );
        }
        return cast(int4)sr;
    }
}
unittest
{
    __m128i A = _mm_set1_epi16(31);
    __m128i B = _mm_set1_epi16(64);
    short8 avg = cast(short8)(_mm_avg_epu16(A, B));
    foreach(i; 0..8)
        assert(avg.array[i] == 48); // (31 + 64 + 1) >> 1
}
306 
// _mm_avg_epu8: rounded average of packed unsigned 8-bit integers,
// i.e. (a + b + 1) >> 1 computed without overflow.
static if (GDC_with_SSE2)
{
    alias _mm_avg_epu8 = __builtin_ia32_pavgb128;
}
else version(LDC)
{
    __m128i _mm_avg_epu8 (__m128i a, __m128i b) pure @safe
    {
        // Generates pavgb even in LDC 1.0, even in -O0
        // Widen to 16-bit, add, add 1, shift right, narrow back.
        enum ir = `
            %ia = zext <16 x i8> %0 to <16 x i16>
            %ib = zext <16 x i8> %1 to <16 x i16>
            %isum = add <16 x i16> %ia, %ib
            %isum1 = add <16 x i16> %isum, < i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
            %isums = lshr <16 x i16> %isum1, < i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
            %r = trunc <16 x i16> %isums to <16 x i8>
            ret <16 x i8> %r`;
        return cast(__m128i) LDCInlineIR!(ir, byte16, byte16, byte16)(cast(byte16)a, cast(byte16)b);
    }
}
else
{
    __m128i _mm_avg_epu8 (__m128i a, __m128i b) pure @safe
    {
        byte16 sa = cast(byte16)a;
        byte16 sb = cast(byte16)b;
        byte16 sr = void;
        foreach(i; 0..16)
        {
            // ubyte + ubyte + 1 is computed in `int`, so no overflow here.
            sr[i] = cast(ubyte)( (cast(ubyte)(sa[i]) + cast(ubyte)(sb[i]) + 1) >> 1 );
        }
        return cast(int4)sr;
    }
}
unittest
{
    __m128i A = _mm_set1_epi8(31);
    __m128i B = _mm_set1_epi8(64);
    byte16 avg = cast(byte16)(_mm_avg_epu8(A, B));
    foreach(i; 0..16)
        assert(avg.array[i] == 48); // (31 + 64 + 1) >> 1
}
349 
350 
/// Shift the whole 128-bit value left by a compile-time byte count,
/// shifting in zeroes (alias of `_mm_slli_si128`).
alias _mm_bslli_si128 = _mm_slli_si128;

unittest
{
    __m128i toShift = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
    byte[16] exact =               [0, 0, 0, 0, 0, 0, 1, 2, 3, 4,  5,  6,  7,  8,  9, 10];
    __m128i result = _mm_bslli_si128!5(toShift);
    assert( (cast(byte16)result).array == exact);
}
360 
/// Shift the whole 128-bit value right by a compile-time byte count,
/// shifting in zeroes (alias of `_mm_srli_si128`).
alias _mm_bsrli_si128 = _mm_srli_si128;

unittest
{
    __m128i toShift = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
    byte[16] exact =               [5, 6, 7, 8, 9,10,11,12,13,14, 15,  0,  0,  0,  0,  0];
    __m128i result = _mm_bsrli_si128!5(toShift);
    assert( (cast(byte16)result).array == exact);
}
370 
// The _mm_cast* family: bit-for-bit reinterpretation between the three
// 128-bit vector types. No conversion is performed, only a type change.

/// Reinterpret the bits of `a` as packed floats.
__m128 _mm_castpd_ps (__m128d a) pure @safe
{
    return cast(__m128)a;
}

/// Reinterpret the bits of `a` as a 128-bit integer vector.
__m128i _mm_castpd_si128 (__m128d a) pure @safe
{
    return cast(__m128i)a;
}

/// Reinterpret the bits of `a` as packed doubles.
__m128d _mm_castps_pd (__m128 a) pure @safe
{
    return cast(__m128d)a;
}

/// Reinterpret the bits of `a` as a 128-bit integer vector.
__m128i _mm_castps_si128 (__m128 a) pure @safe
{
    return cast(__m128i)a;
}

/// Reinterpret the bits of `a` as packed doubles.
__m128d _mm_castsi128_pd (__m128i a) pure @safe
{
    return cast(__m128d)a;
}

/// Reinterpret the bits of `a` as packed floats.
__m128 _mm_castsi128_ps (__m128i a) pure @safe
{
    return cast(__m128)a;
}
400 
// _mm_clflush: flush the cache line containing `p` from all cache levels
// (CLFLUSH instruction).
static if (GDC_with_SSE2)
{
    void _mm_clflush (const(void)* p) pure @safe
    {
        return __builtin_ia32_clflush(p);
    }
}
else version(LDC)
{
    alias _mm_clflush = __builtin_ia32_clflush;
}
else
{
    void _mm_clflush (const(void)* p) pure @safe
    {
        version(D_InlineAsm_X86)
        {
            asm pure nothrow @nogc @safe
            {
                mov EAX, p;
                clflush [EAX];
            }
        }
        else version(D_InlineAsm_X86_64)
        {
            asm pure nothrow @nogc @safe
            {
                mov RAX, p;
                clflush [RAX];
            }
        }
        else 
        {
            // Do nothing. Invalidating cacheline does
            // not affect correctness.            
        }
    }
}
unittest
{
    ubyte[64] cacheline;
    _mm_clflush(cacheline.ptr);
}
444 
// _mm_cmpeq_epi16: lane-wise 16-bit equality; equal lanes become 0xFFFF,
// others become 0.
static if (GDC_with_SSE2)
{
    alias _mm_cmpeq_epi16 = __builtin_ia32_pcmpeqw128;
}
else
{
    __m128i _mm_cmpeq_epi16 (__m128i a, __m128i b) pure @safe
    {
        return cast(__m128i) equalMask!short8(cast(short8)a, cast(short8)b);
    }
}
unittest
{
    short8   A = [-3, -2, -1,  0,  0,  1,  2,  3];
    short8   B = [ 4,  3,  2,  1,  0, -1, -2, -3];
    short[8] E = [ 0,  0,  0,  0, -1,  0,  0,  0];
    short8   R = cast(short8)(_mm_cmpeq_epi16(cast(__m128i)A, cast(__m128i)B));
    assert(R.array == E);
}
464 
/// Lane-wise 32-bit equality; equal lanes become 0xFFFFFFFF, others become 0.
__m128i _mm_cmpeq_epi32 (__m128i a, __m128i b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_pcmpeqd128(a, b);
    }
    else
    {
        return equalMask!__m128i(a, b);
    }
}
unittest
{
    int4   A = [-3, -2, -1,  0];
    int4   B = [ 4, -2,  2,  0];
    int[4] E = [ 0, -1,  0, -1];
    // Fixed: this unittest previously called _mm_cmpeq_epi16, so the
    // 32-bit comparison was never exercised (the values passed by accident).
    int4   R = cast(int4)(_mm_cmpeq_epi32(A, B));
    assert(R.array == E);
}
484 
/// Lane-wise 8-bit equality; equal lanes become 0xFF, others become 0.
__m128i _mm_cmpeq_epi8 (__m128i a, __m128i b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_pcmpeqb128(a, b); 
    }
    else
    {
        return cast(__m128i) equalMask!byte16(cast(byte16)a, cast(byte16)b);
    }
}
unittest
{
    __m128i A = _mm_setr_epi8(1, 2, 3, 1, 2, 1, 1, 2, 3, 2, 1, 0, 0, 1, 2, 1);
    __m128i B = _mm_setr_epi8(2, 2, 1, 2, 3, 1, 2, 3, 2, 1, 0, 0, 1, 2, 1, 1);
    byte16 C = cast(byte16) _mm_cmpeq_epi8(A, B);
    byte[16] correct =       [0,-1, 0, 0, 0,-1, 0, 0, 0, 0, 0,-1, 0, 0, 0, -1];
    assert(C.array == correct);
}
504 
/// Packed double compare for equality, ordered (NaN lanes compare false;
/// `oeq` = ordered-equal). Matching lanes become all-ones masks.
__m128d _mm_cmpeq_pd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmpeqpd(a, b);
    }
    else
    {
        return cast(__m128d) cmppd!(FPComparison.oeq)(a, b);
    }
}

/// Scalar double compare for equality on the lower lane (ordered);
/// the upper lane of the result comes from `a`.
__m128d _mm_cmpeq_sd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmpeqsd(a, b);
    }
    else
    {
        return cast(__m128d) cmpsd!(FPComparison.oeq)(a, b);
    }
}
528 
/// Packed double compare for greater-or-equal, ordered
/// (`oge`; NaN lanes compare false).
__m128d _mm_cmpge_pd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmpgepd(a, b);
    }
    else
    {
        return cast(__m128d) cmppd!(FPComparison.oge)(a, b);
    }
}
540 
/// Scalar double compare: lower lane is the all-ones/zero mask of `a[0] >= b[0]`
/// (ordered), upper lane of the result is `a[1]`.
__m128d _mm_cmpge_sd (__m128d a, __m128d b) pure @safe
{
    // Note: There is no __builtin_ia32_cmpgesd builtin.
    static if (GDC_with_SSE2)
    {
        // Fixed: this previously returned __builtin_ia32_cmpnltsd(b, a),
        // which computes !(b < a) == (a <= b or NaN) — that is _mm_cmpngt_sd,
        // not >= — and also left b[1] in the upper lane.
        // a >= b  ⟺  b <= a (ordered), as GCC's emmintrin.h does; then
        // restore the upper lane from `a`.
        __m128d r = __builtin_ia32_cmplesd(b, a);
        r[1] = a[1];
        return r;
    }
    else
    {
        return cast(__m128d) cmpsd!(FPComparison.oge)(a, b);
    }
}
553 
/// Lane-wise signed 16-bit greater-than; true lanes become 0xFFFF.
__m128i _mm_cmpgt_epi16 (__m128i a, __m128i b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_pcmpgtw128(a, b); 
    }
    else
    {
        return cast(__m128i)( greaterMask!short8(cast(short8)a, cast(short8)b));
    }
}
unittest
{
    short8   A = [-3, -2, -1,  0,  0,  1,  2,  3];
    short8   B = [ 4,  3,  2,  1,  0, -1, -2, -3];
    short[8] E = [ 0,  0,  0,  0,  0, -1, -1, -1];
    short8   R = cast(short8)(_mm_cmpgt_epi16(cast(__m128i)A, cast(__m128i)B));
    assert(R.array == E);
}
573 
/// Lane-wise signed 32-bit greater-than; true lanes become 0xFFFFFFFF.
__m128i _mm_cmpgt_epi32 (__m128i a, __m128i b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_pcmpgtd128(a, b); 
    }
    else
    {
        return cast(__m128i)( greaterMask!int4(a, b));
    }
}
unittest
{
    int4   A = [-3,  2, -1,  0];
    int4   B = [ 4, -2,  2,  0];
    int[4] E = [ 0, -1,  0,  0];
    int4   R = cast(int4)(_mm_cmpgt_epi32(A, B));
    assert(R.array == E);
}
593 
/// Lane-wise signed 8-bit greater-than; true lanes become 0xFF.
__m128i _mm_cmpgt_epi8 (__m128i a, __m128i b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_pcmpgtb128(a, b); 
    }
    else
    {
        return cast(__m128i)( greaterMask!byte16(cast(byte16)a, cast(byte16)b));
    }
}
unittest
{
    __m128i A = _mm_setr_epi8(1, 2, 3, 1, 2, 1, 1, 2, 3, 2, 1, 0, 0, 1, 2, 1);
    __m128i B = _mm_setr_epi8(2, 2, 1, 2, 3, 1, 2, 3, 2, 1, 0, 0, 1, 2, 1, 1);
    byte16 C = cast(byte16) _mm_cmpgt_epi8(A, B);
    byte[16] correct =       [0, 0,-1, 0, 0, 0, 0, 0,-1,-1,-1, 0, 0, 0,-1, 0];
    // Removed an unused local (`__m128i D = _mm_cmpeq_epi8(A, B);`) that
    // was computed but never asserted on.
    assert(C.array == correct);
}
614 
/// Packed double compare for greater-than, ordered
/// (`ogt`; NaN lanes compare false).
__m128d _mm_cmpgt_pd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmpgtpd(a, b); 
    }
    else
    {
        return cast(__m128d) cmppd!(FPComparison.ogt)(a, b);
    }
}
626 
/// Scalar double compare: lower lane is the all-ones/zero mask of `a[0] > b[0]`
/// (ordered), upper lane of the result is `a[1]`.
__m128d _mm_cmpgt_sd (__m128d a, __m128d b) pure @safe
{
    // Note: There is no __builtin_ia32_cmpgtsd builtin.
    static if (GDC_with_SSE2)
    {
        // Fixed: this previously returned __builtin_ia32_cmpnlesd(b, a),
        // which computes !(b <= a) == (a < b or NaN) — that is _mm_cmpnge_sd,
        // not > — and also left b[1] in the upper lane.
        // a > b  ⟺  b < a (ordered), as GCC's emmintrin.h does; then
        // restore the upper lane from `a`.
        __m128d r = __builtin_ia32_cmpltsd(b, a);
        r[1] = a[1];
        return r;
    }
    else
    {
        return cast(__m128d) cmpsd!(FPComparison.ogt)(a, b);
    }
}
639 
/// Packed double compare for less-or-equal, ordered
/// (`ole`; NaN lanes compare false).
__m128d _mm_cmple_pd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmplepd(a, b); 
    }
    else
    {
        return cast(__m128d) cmppd!(FPComparison.ole)(a, b);
    }
}

/// Scalar double compare for less-or-equal on the lower lane (ordered);
/// the upper lane of the result comes from `a`.
__m128d _mm_cmple_sd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmplesd(a, b); 
    }
    else
    {
        return cast(__m128d) cmpsd!(FPComparison.ole)(a, b);
    }
}
663 
/// Lane-wise signed 16-bit less-than, implemented as the swapped greater-than.
__m128i _mm_cmplt_epi16 (__m128i a, __m128i b) pure @safe
{
    return _mm_cmpgt_epi16(b, a);
}

/// Lane-wise signed 32-bit less-than, implemented as the swapped greater-than.
__m128i _mm_cmplt_epi32 (__m128i a, __m128i b) pure @safe
{
    return _mm_cmpgt_epi32(b, a);
}

/// Lane-wise signed 8-bit less-than, implemented as the swapped greater-than.
__m128i _mm_cmplt_epi8 (__m128i a, __m128i b) pure @safe
{
    return _mm_cmpgt_epi8(b, a);
}
678 
/// Packed double compare for less-than, ordered
/// (`olt`; NaN lanes compare false).
__m128d _mm_cmplt_pd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmpltpd(a, b); 
    }
    else
    {
        return cast(__m128d) cmppd!(FPComparison.olt)(a, b);
    }
}

/// Scalar double compare for less-than on the lower lane (ordered);
/// the upper lane of the result comes from `a`.
__m128d _mm_cmplt_sd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmpltsd(a, b); 
    }
    else
    {
        return cast(__m128d) cmpsd!(FPComparison.olt)(a, b);
    }
}
702 
/// Packed double compare for not-equal, unordered
/// (`une`; NaN lanes compare true).
__m128d _mm_cmpneq_pd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmpneqpd(a, b); 
    }
    else
    {
        return cast(__m128d) cmppd!(FPComparison.une)(a, b);
    }
}

/// Scalar double compare for not-equal on the lower lane (unordered);
/// the upper lane of the result comes from `a`.
__m128d _mm_cmpneq_sd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmpneqsd(a, b); 
    }
    else
    {
        return cast(__m128d) cmpsd!(FPComparison.une)(a, b);
    }
}
726 
/// Packed double compare for not-greater-or-equal, i.e. less-than-or-unordered
/// (`ult`; NaN lanes compare true).
__m128d _mm_cmpnge_pd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmpngepd(a, b); 
    }
    else
    {
        return cast(__m128d) cmppd!(FPComparison.ult)(a, b);
    }
}
738 
/// Scalar double compare: lower lane is the mask of NOT(`a[0] >= b[0]`)
/// (i.e. less-than-or-unordered, true for NaN); upper lane of the result is `a[1]`.
__m128d _mm_cmpnge_sd (__m128d a, __m128d b) pure @safe
{
    // Note: There is no __builtin_ia32_cmpngesd builtin.
    static if (GDC_with_SSE2)
    {
        // Fixed: this previously returned __builtin_ia32_cmpltsd(b, a),
        // which computes b < a == (a > b, ordered) — that is _mm_cmpgt_sd —
        // and also left b[1] in the upper lane.
        // !(a >= b)  ⟺  !(b <= a), as GCC's emmintrin.h does; then
        // restore the upper lane from `a`.
        __m128d r = __builtin_ia32_cmpnlesd(b, a);
        r[1] = a[1];
        return r;
    }
    else
    {
        return cast(__m128d) cmpsd!(FPComparison.ult)(a, b);
    }
}
751 
/// Packed double compare for not-greater-than, i.e. less-or-equal-or-unordered
/// (`ule`; NaN lanes compare true).
__m128d _mm_cmpngt_pd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmpngtpd(a, b);
    }
    else
    {
        return cast(__m128d) cmppd!(FPComparison.ule)(a, b);
    }
}
763 
/// Scalar double compare: lower lane is the mask of NOT(`a[0] > b[0]`)
/// (i.e. less-or-equal-or-unordered, true for NaN); upper lane of the result is `a[1]`.
__m128d _mm_cmpngt_sd (__m128d a, __m128d b) pure @safe
{
    // Note: There is no __builtin_ia32_cmpngtsd builtin.
    static if (GDC_with_SSE2)
    {
        // Fixed: this previously returned __builtin_ia32_cmplesd(b, a),
        // which computes b <= a == (a >= b, ordered) — that is _mm_cmpge_sd —
        // and also left b[1] in the upper lane.
        // !(a > b)  ⟺  !(b < a), as GCC's emmintrin.h does; then
        // restore the upper lane from `a`.
        __m128d r = __builtin_ia32_cmpnltsd(b, a);
        r[1] = a[1];
        return r;
    }
    else
    {
        return cast(__m128d) cmpsd!(FPComparison.ule)(a, b);
    }
}
776 
/// Packed double compare for not-less-or-equal, i.e. greater-than-or-unordered
/// (`ugt`; NaN lanes compare true).
__m128d _mm_cmpnle_pd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmpnlepd(a, b);
    }
    else
    {
        return cast(__m128d) cmppd!(FPComparison.ugt)(a, b);
    }
}

/// Scalar not-less-or-equal on the lower lane (true for NaN);
/// the upper lane of the result comes from `a`.
__m128d _mm_cmpnle_sd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmpnlesd(a, b);
    }
    else
    {
        return cast(__m128d) cmpsd!(FPComparison.ugt)(a, b);
    }
}

/// Packed double compare for not-less-than, i.e. greater-or-equal-or-unordered
/// (`uge`; NaN lanes compare true).
__m128d _mm_cmpnlt_pd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmpnltpd(a, b);
    }
    else
    {
        return cast(__m128d) cmppd!(FPComparison.uge)(a, b);
    }
}

/// Scalar not-less-than on the lower lane (true for NaN);
/// the upper lane of the result comes from `a`.
__m128d _mm_cmpnlt_sd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmpnltsd(a, b);
    }
    else
    {
        return cast(__m128d) cmpsd!(FPComparison.uge)(a, b);
    }
}
824 
/// Packed double "ordered" check: lanes become all-ones when neither
/// operand is NaN (`ord`).
__m128d _mm_cmpord_pd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmpordpd(a, b);
    }
    else
    {
        return cast(__m128d) cmppd!(FPComparison.ord)(a, b);
    }
}

/// Scalar "ordered" check on the lower lane; the upper lane of the result
/// comes from `a`.
__m128d _mm_cmpord_sd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmpordsd(a, b);
    }
    else
    {
        return cast(__m128d) cmpsd!(FPComparison.ord)(a, b);
    }
}

/// Packed double "unordered" check: lanes become all-ones when either
/// operand is NaN (`uno`).
__m128d _mm_cmpunord_pd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmpunordpd(a, b);
    }
    else
    {
        return cast(__m128d) cmppd!(FPComparison.uno)(a, b);
    }
}

/// Scalar "unordered" check on the lower lane; the upper lane of the result
/// comes from `a`.
__m128d _mm_cmpunord_sd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmpunordsd(a, b);
    }
    else
    {
        return cast(__m128d) cmpsd!(FPComparison.uno)(a, b);
    }
}
872 
873 
874 // Note: we've reverted clang and GCC behaviour with regards to EFLAGS
875 // Some such comparisons yields true for NaNs, other don't.
876 
/// COMISD-style equality of the lower doubles, returned as an int.
int _mm_comieq_sd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_comieq(a, b);
    }
    else
    {
        return comsd!(FPComparison.ueq)(a, b); // yields true for NaN, same as GCC
    }
}

/// COMISD-style greater-or-equal of the lower doubles (ordered: false for NaN).
int _mm_comige_sd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_comige(a, b);
    }
    else
    {
        return comsd!(FPComparison.oge)(a, b);
    }
}

/// COMISD-style greater-than of the lower doubles (ordered: false for NaN).
int _mm_comigt_sd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_comigt(a, b);
    }
    else
    {
        return comsd!(FPComparison.ogt)(a, b);
    }
}

/// COMISD-style less-or-equal of the lower doubles.
int _mm_comile_sd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_comile(a, b);
    }
    else
    {
        return comsd!(FPComparison.ule)(a, b); // yields true for NaN, same as GCC
    }
}

/// COMISD-style less-than of the lower doubles.
int _mm_comilt_sd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_comilt(a, b);
    }
    else
    {
        return comsd!(FPComparison.ult)(a, b); // yields true for NaN, same as GCC
    }
}

/// COMISD-style not-equal of the lower doubles (`one`: ordered, false for NaN).
int _mm_comineq_sd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_comineq(a, b);
    }
    else
    {
        return comsd!(FPComparison.one)(a, b);
    }
}
948 
// _mm_cvtepi32_pd: convert the two low 32-bit integers of `a` to doubles.
version(LDC)
{
     __m128d _mm_cvtepi32_pd (__m128i a) pure  @safe
    {
        // Generates cvtdq2pd since LDC 1.0, even without optimizations
        // Take lanes 0 and 1, then sitofp to double.
        enum ir = `
            %v = shufflevector <4 x i32> %0,<4 x i32> %0, <2 x i32> <i32 0, i32 1>
            %r = sitofp <2 x i32> %v to <2 x double>
            ret <2 x double> %r`;
        return cast(__m128d) LDCInlineIR!(ir, __m128d, __m128i)(a);
    }
}
else
{
    static if (GDC_with_SSE2)
    {

        __m128d _mm_cvtepi32_pd (__m128i a) pure  @safe
        {
            return __builtin_ia32_cvtdq2pd(a); 
        }
    }
    else
    {
        // Portable fallback: int -> double is exact, no rounding concerns.
        __m128d _mm_cvtepi32_pd (__m128i a) pure  @safe
        {
            double2 r = void;
            r[0] = a[0];
            r[1] = a[1];
            return r;
        }
    }
}
unittest
{
    __m128d A = _mm_cvtepi32_pd(_mm_set1_epi32(54));
    assert(A.array[0] == 54.0);
    assert(A.array[1] == 54.0);
}
988 
/// Convert packed 32-bit integers in `a` to packed floats.
__m128 _mm_cvtepi32_ps(__m128i a) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cvtdq2ps(a);
    }
    else
    {
        // Generates cvtdq2ps since LDC 1.0.0 -O1
        __m128 res;
        res.array[0] = cast(float)a.array[0];
        res.array[1] = cast(float)a.array[1];
        res.array[2] = cast(float)a.array[2];
        res.array[3] = cast(float)a.array[3];
        return res;
    }
}
unittest
{
    __m128 a = _mm_cvtepi32_ps(_mm_setr_epi32(-1, 0, 1, 1000));
    assert(a.array == [-1.0f, 0.0f, 1.0f, 1000.0f]);
}
1011 
1012 
// _mm_cvtpd_epi32: convert the two doubles of `a` to 32-bit integers using
// the current rounding mode; the two upper result lanes are zero.
version(LDC)
{
    // Like in clang, implemented with a magic intrinsic right now
    alias _mm_cvtpd_epi32 = __builtin_ia32_cvtpd2dq;

/* Unfortunately this generates a cvttpd2dq instruction
    __m128i _mm_cvtpd_epi32 (__m128d a) pure  @safe
    {
        enum ir = `
            %i = fptosi <2 x double> %0 to <2 x i32>
            %r = shufflevector <2 x i32> %i,<2 x i32> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
            ret <4 x i32> %r`;

        return cast(__m128i) inlineIR!(ir, __m128i, __m128d)(a);
    } */
}
else
{
    static if (GDC_with_SSE2)
    {
        alias _mm_cvtpd_epi32 = __builtin_ia32_cvtpd2dq;
    }
    else
    {
        // Portable fallback honouring the MXCSR rounding mode via helper.
        __m128i _mm_cvtpd_epi32 (__m128d a) pure @safe
        {
            __m128i r = _mm_setzero_si128();
            r[0] = convertDoubleToInt32UsingMXCSR(a[0]);
            r[1] = convertDoubleToInt32UsingMXCSR(a[1]);
            return r;
        }
    }
}
unittest
{
    int4 A = _mm_cvtpd_epi32(_mm_set_pd(61.0, 55.0));
    assert(A.array[0] == 55 && A.array[1] == 61 && A.array[2] == 0 && A.array[3] == 0);
}
1051 
/// Convert packed double-precision (64-bit) floating-point elements in `v`
/// to packed 32-bit integers (returned in an `__m64`).
__m64 _mm_cvtpd_pi32 (__m128d v) pure @safe
{
    return to_m64(_mm_cvtpd_epi32(v));
}
unittest
{
    int2 A = cast(int2) _mm_cvtpd_pi32(_mm_set_pd(61.0, 55.0));
    assert(A.array[0] == 55 && A.array[1] == 61);
}
1063 
// _mm_cvtpd_ps: convert the two doubles of `a` to floats in the two lower
// lanes; the two upper lanes are zero.
version(LDC)
{
    alias _mm_cvtpd_ps = __builtin_ia32_cvtpd2ps; // can't be done with IR unfortunately
}
else
{
    static if (GDC_with_SSE2)
    {
        alias _mm_cvtpd_ps = __builtin_ia32_cvtpd2ps; // can't be done with IR unfortunately
    }
    else
    {
         __m128 _mm_cvtpd_ps (__m128d a) pure @safe
        {
            __m128 r = void;
            r[0] = a[0];
            r[1] = a[1];
            r[2] = 0;
            r[3] = 0;
            return r;
        }    
    }
}
unittest
{
    __m128d A = _mm_set_pd(5.25, 4.0);
    __m128 B = _mm_cvtpd_ps(A);
    assert(B.array == [4.0f, 5.25f, 0, 0]);
}
1093 
/// Convert packed 32-bit integers in `v` to packed double-precision 
/// (64-bit) floating-point elements.
__m128d _mm_cvtpi32_pd (__m64 v) pure @safe
{
    return _mm_cvtepi32_pd(to_m128i(v));
}
unittest
{
    __m128d A = _mm_cvtpi32_pd(_mm_setr_pi32(4, -5));
    assert(A.array[0] == 4.0 && A.array[1] == -5.0);
}
1105 
// _mm_cvtps_epi32: convert packed floats to packed 32-bit integers using
// the current MXCSR rounding mode (see the unittest below).
version(LDC)
{
    // Disabled, since it fail with optimizations unfortunately
    //alias _mm_cvtps_epi32 = __builtin_ia32_cvtps2dq;

     __m128i _mm_cvtps_epi32 (__m128 a) pure @trusted
    {
        return __asm!__m128i("cvtps2dq $1,$0","=x,x",a);
    }
}
else
{
    static if (GDC_with_SSE2)
    {
        alias _mm_cvtps_epi32 = __builtin_ia32_cvtps2dq;
    }
    else
    {
        // Portable fallback honouring the MXCSR rounding mode via helper.
        __m128i _mm_cvtps_epi32 (__m128 a) pure @safe
        {
            __m128i r = void;
            r[0] = convertFloatToInt32UsingMXCSR(a[0]);
            r[1] = convertFloatToInt32UsingMXCSR(a[1]);
            r[2] = convertFloatToInt32UsingMXCSR(a[2]);
            r[3] = convertFloatToInt32UsingMXCSR(a[3]);
            return r;
        }
    }
}
unittest
{
    // Check all four MXCSR rounding modes, then restore the caller's mode.
    uint savedRounding = _MM_GET_ROUNDING_MODE();

    _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST);
    __m128i A = _mm_cvtps_epi32(_mm_setr_ps(1.4f, -2.1f, 53.5f, -2.9f));
    assert(A.array == [1, -2, 54, -3]);

    _MM_SET_ROUNDING_MODE(_MM_ROUND_DOWN);
    A = _mm_cvtps_epi32(_mm_setr_ps(1.4f, -2.1f, 53.5f, -2.9f));
    assert(A.array == [1, -3, 53, -3]);

    _MM_SET_ROUNDING_MODE(_MM_ROUND_UP);
    A = _mm_cvtps_epi32(_mm_setr_ps(1.4f, -2.1f, 53.5f, -2.9f));
    assert(A.array == [2, -2, 54, -2]);

    _MM_SET_ROUNDING_MODE(_MM_ROUND_TOWARD_ZERO);
    A = _mm_cvtps_epi32(_mm_setr_ps(1.4f, -2.1f, 53.5f, -2.9f));
    assert(A.array == [1, -2, 53, -2]);

    _MM_SET_ROUNDING_MODE(savedRounding);
}
1157 
1158 
/// Convert the two single-precision floats in the low half of `a` to
/// packed double-precision (64-bit) floating-point elements.
version(LDC)
{
    __m128d _mm_cvtps_pd (__m128 a) pure  @safe
    {
        // Generates cvtps2pd since LDC 1.0, no opt
        // Extract lanes 0 and 1, then widen float -> double.
        enum ir = `
            %v = shufflevector <4 x float> %0,<4 x float> %0, <2 x i32> <i32 0, i32 1>
            %r = fpext <2 x float> %v to <2 x double>
            ret <2 x double> %r`;
        return cast(__m128d) LDCInlineIR!(ir, __m128d, __m128)(a);
    }
}
else
{
    static if (GDC_with_SSE2)
    {
        alias _mm_cvtps_pd = __builtin_ia32_cvtps2pd;
    }
    else
    {
        __m128d _mm_cvtps_pd (__m128 a) pure  @safe
        {
            double2 r = void; // both lanes assigned below
            r[0] = a[0]; // implicit float -> double widening, per lane
            r[1] = a[1];
            return r;
        }
    }
}
unittest
{
    __m128d A = _mm_cvtps_pd(_mm_set1_ps(54.0f));
    assert(A.array[0] == 54.0);
    assert(A.array[1] == 54.0);
}
1194 
/// Copy the lower double-precision (64-bit) floating-point element of `a`.
double _mm_cvtsd_f64 (__m128d a) pure @safe
{
    double lowLane = a.array[0];
    return lowLane;
}
1199 
/// Convert the lower double-precision element of `a` to a 32-bit integer,
/// honouring the current MXCSR rounding mode.
version(LDC)
{
    alias _mm_cvtsd_si32 = __builtin_ia32_cvtsd2si;
}
else
{
    static if (GDC_with_SSE2)
    {
        alias _mm_cvtsd_si32 = __builtin_ia32_cvtsd2si;
    }
    else
    {
        int _mm_cvtsd_si32 (__m128d a) pure @safe
        {
            // Software fallback, reads the MXCSR rounding mode.
            return convertDoubleToInt32UsingMXCSR(a[0]);
        }
    }
}
unittest
{
    assert(4 == _mm_cvtsd_si32(_mm_set1_pd(4.0)));
}
1222 
/// Convert the lower double-precision element of `a` to a 64-bit integer,
/// honouring the current MXCSR rounding mode (exercised by the unittest).
version(LDC)
{
    // Unfortunately this builtin crashes in 32-bit
    version(X86_64)
        alias _mm_cvtsd_si64 = __builtin_ia32_cvtsd2si64;
    else
    {
        long _mm_cvtsd_si64 (__m128d a) pure @safe
        {
            // 32-bit LDC: software fallback, reads MXCSR.
            return convertDoubleToInt64UsingMXCSR(a[0]);
        }
    }
}
else
{
    long _mm_cvtsd_si64 (__m128d a) pure @safe
    {
        // Software fallback, reads the MXCSR rounding mode.
        return convertDoubleToInt64UsingMXCSR(a.array[0]);
    }
}
unittest
{
    assert(-4 == _mm_cvtsd_si64(_mm_set1_pd(-4.0)));

    uint savedRounding = _MM_GET_ROUNDING_MODE();

    _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST);
    assert(-56468486186 == _mm_cvtsd_si64(_mm_set1_pd(-56468486186.5)));

    _MM_SET_ROUNDING_MODE(_MM_ROUND_DOWN);
    assert(-56468486187 == _mm_cvtsd_si64(_mm_set1_pd(-56468486186.1)));

    _MM_SET_ROUNDING_MODE(_MM_ROUND_UP);
    assert(56468486187 == _mm_cvtsd_si64(_mm_set1_pd(56468486186.1)));

    _MM_SET_ROUNDING_MODE(_MM_ROUND_TOWARD_ZERO);
    assert(-56468486186 == _mm_cvtsd_si64(_mm_set1_pd(-56468486186.9)));

    _MM_SET_ROUNDING_MODE(savedRounding);
}

/// Same as `_mm_cvtsd_si64` (alternative Intel name).
alias _mm_cvtsd_si64x = _mm_cvtsd_si64;
1265 
/// Convert the lower double-precision element of `b` to single precision,
/// store it in the lower lane of the result, and copy the upper three
/// lanes from `a`.
__m128 _mm_cvtsd_ss (__m128 a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cvtsd2ss(a, b); 
    }
    else
    {
        // Generates cvtsd2ss since LDC 1.3 -O0
        a[0] = b[0]; // implicit double -> float narrowing
        return a;
    }
}
unittest
{
    __m128 R = _mm_cvtsd_ss(_mm_set1_ps(4.0f), _mm_set1_pd(3.0));
    assert(R.array == [3.0f, 4.0f, 4.0f, 4.0f]);
}
1284 
/// Copy the lower 32-bit integer of `a`.
int _mm_cvtsi128_si32 (__m128i a) pure @safe
{
    int lowLane = a.array[0];
    return lowLane;
}
1289 
/// Copy the lower 64-bit integer of `a`.
long _mm_cvtsi128_si64 (__m128i a) pure @safe
{
    // Reinterpret the 4 x int32 vector as 2 x int64 and take lane 0.
    return (cast(long2)a).array[0];
}
/// Same as `_mm_cvtsi128_si64` (alternative Intel name).
alias _mm_cvtsi128_si64x = _mm_cvtsi128_si64;
1296 
/// Copy `v`, replacing the lower lane with `x` converted to double.
__m128d _mm_cvtsi32_sd(__m128d v, int x) pure @trusted
{
    double converted = x;
    v.ptr[0] = converted;
    return v;
}
unittest
{
    __m128d a = _mm_cvtsi32_sd(_mm_set1_pd(0.0f), 42);
    assert(a.array == [42.0, 0]);
}
1307 
/// Copy 32-bit integer `a` to the lowest element of the result,
/// zeroing the other three lanes.
__m128i _mm_cvtsi32_si128 (int a) pure @trusted
{
    int4 result = [0, 0, 0, 0];
    result.ptr[0] = a;
    return result;
}
unittest
{
    __m128i a = _mm_cvtsi32_si128(65);
    assert(a.array == [65, 0, 0, 0]);
}
1319 
1320 
1321 // Note: on macOS, using "llvm.x86.sse2.cvtsi642sd" was buggy
1322 __m128d _mm_cvtsi64_sd(__m128d v, long x) pure @trusted
1323 {
1324     v.ptr[0] = cast(double)x;
1325     return v;
1326 }
1327 unittest
1328 {
1329     __m128d a = _mm_cvtsi64_sd(_mm_set1_pd(0.0f), 42);
1330     assert(a.array == [42.0, 0]);
1331 }
1332 
/// Copy 64-bit integer `a` to the lower element of the result,
/// zeroing the upper element.
__m128i _mm_cvtsi64_si128 (long a) pure @trusted
{
    long2 result = [0, 0];
    result.ptr[0] = a;
    return cast(__m128i) result;
}

/// Alternative Intel names for the two intrinsics above.
alias _mm_cvtsi64x_sd = _mm_cvtsi64_sd;
alias _mm_cvtsi64x_si128 = _mm_cvtsi64_si128;
1342 
/// Convert the lower single-precision element of `x` to double precision
/// and store it in the lower lane of `v`; the upper lane of `v` is kept.
double2 _mm_cvtss_sd(double2 v, float4 x) pure @trusted
{
    double widened = x.array[0]; // implicit float -> double widening
    v.ptr[0] = widened;
    return v;
}
unittest
{
    __m128d a = _mm_cvtss_sd(_mm_set1_pd(0.0f), _mm_set1_ps(42.0f));
    assert(a.array == [42.0, 0]);
}
1353 
/// Convert the lower single-precision element of `a` to a 64-bit integer
/// with truncation toward zero.
long _mm_cvttss_si64 (__m128 a) pure @safe
{
    float low = a.array[0];
    return cast(long) low; // Generates cvttss2si as expected
}
unittest
{
    assert(1 == _mm_cvttss_si64(_mm_setr_ps(1.9f, 2.0f, 3.0f, 4.0f)));
}
1362 
/// Convert packed double-precision elements in `a` to packed 32-bit
/// integers with truncation toward zero; the upper two result lanes are zero.
version(LDC)
{
    alias _mm_cvttpd_epi32 = __builtin_ia32_cvttpd2dq;
}
else
{
    static if (GDC_with_SSE2)
    {
        alias _mm_cvttpd_epi32 = __builtin_ia32_cvttpd2dq;
    }
    else
    {
        __m128i _mm_cvttpd_epi32 (__m128d a) pure @safe
        {
            // Note: doesn't generate cvttpd2dq as of LDC 1.13
            __m128i r;
            r.array[0] = cast(int)a.array[0]; // cast truncates toward zero
            r.array[1] = cast(int)a.array[1];
            r.array[2] = 0;
            r.array[3] = 0;
            return r;
        }
    }
}
unittest
{
    __m128i R = _mm_cvttpd_epi32(_mm_setr_pd(-4.9, 45641.5f));
    assert(R.array == [-4, 45641, 0, 0]);
}
1392 
1393 
/// Convert packed double-precision (64-bit) floating-point elements in `v` 
/// to packed 32-bit integers with truncation.
__m64 _mm_cvttpd_pi32 (__m128d v) pure @safe
{
    // Truncate into a 128-bit vector first, then narrow to MMX width.
    __m128i truncated = _mm_cvttpd_epi32(v);
    return to_m64(truncated);
}
unittest
{
    int2 R = cast(int2) _mm_cvttpd_pi32(_mm_setr_pd(-4.9, 45641.7f));
    int[2] correct = [-4, 45641];
    assert(R.array == correct);
}
1406 
/// Convert packed single-precision floats in `a` to packed 32-bit integers
/// with truncation toward zero.
__m128i _mm_cvttps_epi32 (__m128 a) pure @trusted
{
    // Note: Generates cvttps2dq since LDC 1.3 -O2
    __m128i r;
    foreach (lane; 0 .. 4)
        r.ptr[lane] = cast(int) a.array[lane];
    return r;
}
unittest
{
    __m128i R = _mm_cvttps_epi32(_mm_setr_ps(-4.9, 45641.5f, 0.0f, 1.0f));
    assert(R.array == [-4, 45641, 0, 1]);
}
1422 
/// Convert the lower double of `a` to a 32-bit integer with truncation
/// toward zero.
int _mm_cvttsd_si32 (__m128d a)
{
    // Generates cvttsd2si since LDC 1.3 -O0
    double low = a.array[0];
    return cast(int) low;
}
1428 
/// Convert the lower double of `a` to a 64-bit integer with truncation
/// toward zero.
long _mm_cvttsd_si64 (__m128d a)
{
    // Generates cvttsd2si since LDC 1.3 -O0,
    // but in 32-bit instead, it's a long sequence that resort to FPU
    double low = a.array[0];
    return cast(long) low;
}

/// Same as `_mm_cvttsd_si64` (alternative Intel name).
alias _mm_cvttsd_si64x = _mm_cvttsd_si64;
1437 
/// Divide packed double-precision elements in `a` by those in `b`.
__m128d _mm_div_pd(__m128d a, __m128d b) pure @safe
{
    __m128d quotient = a / b;
    return quotient;
}
1442 
/// Divide the lower double of `a` by the lower double of `b`; the upper
/// lane of the result is copied from `a`.
static if (GDC_with_SSE2)
{
    __m128d _mm_div_sd(__m128d a, __m128d b) pure @trusted
    {
        return __builtin_ia32_divsd(a, b);
    }
}
else version(DigitalMars)
{
    // Work-around for https://issues.dlang.org/show_bug.cgi?id=19599
    __m128d _mm_div_sd(__m128d a, __m128d b) pure @safe
    {
        asm pure nothrow @nogc @trusted { nop;} // inhibits the miscompiling optimization
        a.array[0] = a.array[0] / b.array[0];
        return a;
    }
}
else
{
    __m128d _mm_div_sd(__m128d a, __m128d b) pure @safe
    {
        a.array[0] /= b.array[0];
        return a;
    }
}
unittest
{
    __m128d a = [2.0, 4.5];
    a = _mm_div_sd(a, a);
    assert(a.array == [1.0, 4.5]);
}
1474 
/// Extract a 16-bit integer from `v`, selected with `index`, zero-extended
/// to 32 bits. Like the hardware `pextrw` instruction only the low 3 bits
/// of `index` are used, which also matches the masking already performed by
/// `_mm_insert_epi16` and avoids an out-of-bounds vector access for
/// indices greater than 7.
int _mm_extract_epi16(__m128i v, int index) pure @safe
{
    short8 r = cast(short8)v;
    return cast(ushort)(r.array[index & 7]);
}
unittest
{
    __m128i A = _mm_set_epi16(7, 6, 5, 4, 3, 2, 1, -1);
    assert(_mm_extract_epi16(A, 6) == 6);
    assert(_mm_extract_epi16(A, 0) == 65535);
    assert(_mm_extract_epi16(A, 8) == 65535); // index is taken modulo 8
}
1487 
/// Copy `v`, and insert the 16-bit integer `i` at the location specified by `index`.
__m128i _mm_insert_epi16 (__m128i v, int i, int index) @trusted
{
    short8 lanes = cast(short8)v;
    lanes.ptr[index & 7] = cast(short)i; // index taken modulo 8, like the hardware
    return cast(__m128i)lanes;
}
unittest
{
    __m128i A = _mm_set_epi16(7, 6, 5, 4, 3, 2, 1, 0);
    short8 R = cast(short8) _mm_insert_epi16(A, 42, 6);
    short[8] correct = [0, 1, 2, 3, 4, 5, 42, 7];
    assert(R.array == correct);
}
1502 
/// Perform a serializing operation on all load-from-memory instructions
/// issued prior to this call (emits the `lfence` instruction).
version(GNU)
{
    void _mm_lfence() pure @trusted
    {
        static if (GDC_with_SSE2)
        {
            __builtin_ia32_lfence();
        }
        else version(X86)
        {
            // GDC without SSE2: inline asm fallback (32-bit x86 only).
            asm pure nothrow @nogc @trusted
            {
                "lfence;\n" : : : ;
            }
        }
        else
            static assert(false); // no way to emit lfence on this target
    }
}
else version(LDC)
{
    alias _mm_lfence = __builtin_ia32_lfence;
}
else static if (DMD_with_asm)
{
    void _mm_lfence() pure @safe
    {
        asm nothrow @nogc pure @safe
        {
            lfence;
        }
    }
}
else
    static assert(false);
unittest
{
    _mm_lfence();
}
1542 
1543 
/// Load 128 bits (two doubles) from memory.
/// `mem_addr` must be 16-byte aligned and point to 16 readable bytes.
/// Marked `@trusted` for consistency with `_mm_load_si128`/`_mm_load_sd`,
/// which carry the same caller-side contract.
__m128d _mm_load_pd (const(double) * mem_addr) pure @trusted
{
    __m128d* aligned = cast(__m128d*)mem_addr;
    return *aligned;
}
1549 
/// Load a double from memory and broadcast it to both lanes of the result.
__m128d _mm_load_pd1 (const(double)* mem_addr) pure
{
    double value = *mem_addr;
    double[2] pair = [value, value];
    return loadUnaligned!(double2)(&pair[0]);
}
1555 
/// Load a double from memory into the lower lane; zero the upper lane.
__m128d _mm_load_sd (const(double)* mem_addr) pure @trusted
{
    double2 result = [0, 0];
    result.ptr[0] = *mem_addr;
    return result;
}
unittest
{
    double x = -42;
    __m128d a = _mm_load_sd(&x);
    assert(a.array == [-42.0, 0.0]);
}
1568 
/// Load 128 bits of integer data from memory.
/// `mem_addr` must be 16-byte aligned.
__m128i _mm_load_si128 (const(__m128i)* mem_addr) pure @trusted
{
    return *mem_addr;
}

/// `_mm_load1_pd` is the same as `_mm_load_pd1`: broadcast-load one double.
alias _mm_load1_pd = _mm_load_pd1;
1575 
/// Load a double from memory into the upper lane of the result;
/// the lower lane is copied from `a`.
__m128d _mm_loadh_pd (__m128d a, const(double)* mem_addr) pure @trusted
{
    double high = *mem_addr;
    a.ptr[1] = high;
    return a;
}
1581 
// Note: strange signature since the memory doesn't have to be aligned
/// Load a 64-bit integer from memory into the lower element of the
/// result; the upper element is zeroed.
__m128i _mm_loadl_epi64 (const(__m128i)* mem_addr) pure @trusted
{
    long2 result = [0, 0];
    result.ptr[0] = *cast(const(long)*)mem_addr;
    return cast(__m128i) result;
}
1590 
/// Load a double from memory into the lower lane of the result;
/// the upper lane is copied from `a`.
__m128d _mm_loadl_pd (__m128d a, const(double)* mem_addr) pure @trusted
{
    double low = *mem_addr;
    a.ptr[0] = low;
    return a;
}
1596 
/// Load two doubles from memory in reverse order: `mem_addr[1]` goes to
/// lane 0 and `mem_addr[0]` to lane 1. The load is done through a
/// `__m128d*`, so `mem_addr` is presumably required to be 16-byte
/// aligned — TODO confirm with callers.
/// NOTE(review): the Intel intrinsic is named `_mm_loadr_pd`; the `2`
/// suffix here looks project-specific — verify before renaming.
__m128d _mm_loadr_pd2 (const(double)* mem_addr) pure @trusted
{
    __m128d a = *cast(__m128d*)(mem_addr); // 128-bit load
    __m128d r;
    r.ptr[0] = a.array[1]; // swap the two lanes
    r.ptr[1] = a.array[0];
    return r;
}
1605 
/// Load 128 bits (two doubles) from memory; no alignment requirement.
__m128d _mm_loadu_pd (const(double)* mem_addr) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_loadupd(mem_addr); 
    }
    else
    {
        return loadUnaligned!(double2)(mem_addr);
    }
}
1617 
/// Load 128 bits of integer data from memory; no alignment requirement.
__m128i _mm_loadu_si128 (const(__m128i)* mem_addr) pure @trusted
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_loaddqu(cast(const(char*))mem_addr);
    }
    else
    {
        return loadUnaligned!(__m128i)(cast(int*)mem_addr);
    }
}
1629 
/// Load a 32-bit value from (possibly unaligned) memory into the lowest
/// lane of the result; the other three lanes are zeroed.
__m128i _mm_loadu_si32 (const(void)* mem_addr) pure @trusted
{
    int4 result = [0, 0, 0, 0];
    result.ptr[0] = *cast(int*)(mem_addr);
    return result;
}
unittest
{
    int r = 42;
    __m128i A = _mm_loadu_si32(&r);
    int[4] correct = [42, 0, 0, 0];
    assert(A.array == correct);
}
1644 
static if (GDC_with_SSE2)
{
    /// Multiply packed signed 16-bit integers in `a` and `b`, producing intermediate
    /// signed 32-bit integers. Horizontally add adjacent pairs of intermediate 32-bit integers,
    /// and pack the results in destination.
    alias _mm_madd_epi16 = __builtin_ia32_pmaddwd128;
}
else version(LDC)
{
    /// Multiply packed signed 16-bit integers in `a` and `b`, producing intermediate
    /// signed 32-bit integers. Horizontally add adjacent pairs of intermediate 32-bit integers,
    /// and pack the results in destination.
    alias _mm_madd_epi16 = __builtin_ia32_pmaddwd128;
}
else
{
    /// Multiply packed signed 16-bit integers in `a` and `b`, producing intermediate
    /// signed 32-bit integers. Horizontally add adjacent pairs of intermediate 32-bit integers,
    /// and pack the results in destination.
    __m128i _mm_madd_epi16 (__m128i a, __m128i b) pure @safe
    {
        short8 sa = cast(short8)a;
        short8 sb = cast(short8)b;

        int4 r;
        foreach(i; 0..4)
        {
            // Products and sum are computed in 32 bits; wrap-around on
            // overflow (e.g. 2 * (-32768 * -32768) -> int.min) matches
            // what the unittest below expects from the hardware.
            r.array[i] = sa.array[2*i] * sb.array[2*i] + sa.array[2*i+1] * sb.array[2*i+1];
        }
        return r;
    }
}
unittest
{
    short8 A = [0, 1, 2, 3, -32768, -32768, 32767, 32767];
    short8 B = [0, 1, 2, 3, -32768, -32768, 32767, 32767];
    int4 R = _mm_madd_epi16(cast(__m128i)A, cast(__m128i)B);
    int[4] correct = [1, 13, -2147483648, 2*32767*32767];
    assert(R.array == correct);
}
1685 
version(LDC)
{
    /// Conditionally store 8-bit integer elements from `a` into memory using `mask`
    /// (elements are not stored when the highest bit is not set in the corresponding element)
    /// and a non-temporal memory hint. `mem_addr` does not need to be aligned on any particular
    /// boundary.
    alias _mm_maskmoveu_si128 = __builtin_ia32_maskmovdqu; // can't do it with pure IR
}
else
{
    static if (GDC_with_SSE2)
    {
        ///ditto
        void _mm_maskmoveu_si128 (__m128i a, __m128i mask, void* mem_addr) pure @trusted
        {
            return __builtin_ia32_maskmovdqu(cast(ubyte16)a, cast(ubyte16)mask, cast(char*)mem_addr);
        }
    }
    else
    {
        ///ditto
        void _mm_maskmoveu_si128 (__m128i a, __m128i mask, void* mem_addr) pure @trusted
        {
            byte16 b = cast(byte16)a;
            byte16 m = cast(byte16)mask;
            byte* dest = cast(byte*)(mem_addr);
            foreach(j; 0..16)
            {
                if (m.array[j] & 128) // test the sign bit of the mask byte
                {
                    dest[j] = b.array[j];
                }
            }
        }
    }
}
unittest
{
    ubyte[16] dest =           [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42];
    __m128i mask = _mm_setr_epi8(0,-1, 0,-1,-1, 1,-1,-1, 0,-1,-4,-1,-1, 0,-127, 0);
    __m128i A    = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15);
    _mm_maskmoveu_si128(A, mask, dest.ptr);
    ubyte[16] correct =        [42, 1,42, 3, 4,42, 6, 7,42, 9,10,11,12,42,14,42];
    assert(dest == correct);
}
1731 
/// Compare packed signed 16-bit integers in `a` and `b`, returning
/// the packed maximum values.
__m128i _mm_max_epi16 (__m128i a, __m128i b) pure @safe
{
    // Same remark as with _mm_min_epi16: clang uses mystery intrinsics we don't have.
    // Branch-free select: max = b ^ ((a ^ b) & (a > b)).
    __m128i takeA = _mm_cmpgt_epi16(a, b);  // all-ones where a should be selected
    __m128i diff  = _mm_xor_si128(a, b);    // b ^ diff == a
    __m128i blend = _mm_and_si128(diff, takeA);
    return _mm_xor_si128(b, blend);
}
unittest
{
    short8 R = cast(short8) _mm_max_epi16(_mm_setr_epi16(45, 1, -4, -8, 9,  7, 0,-57),
                                          _mm_setr_epi16(-4,-8,  9,  7, 0,-57, 0,  0));
    short[8] correct =                                  [45, 1,  9,  7, 9,  7, 0,  0];
    assert(R.array == correct);
}
1747 
1748 
/// Compare packed unsigned 8-bit integers in `a` and `b`, returning
/// the packed maximum values.
__m128i _mm_max_epu8 (__m128i a, __m128i b) pure @safe
{
    // Same remark as with _mm_min_epi16: clang uses mystery intrinsics we don't have.
    // SSE2 has no unsigned byte compare, so bias both operands by -128 and
    // use the signed compare; then blend: max = b ^ ((a ^ b) & (a > b)).
    __m128i bias  = _mm_set1_epi8(-128);
    __m128i takeA = _mm_cmpgt_epi8(_mm_add_epi8(a, bias), _mm_add_epi8(b, bias));
    __m128i diff  = _mm_xor_si128(a, b); // b ^ diff == a
    __m128i blend = _mm_and_si128(diff, takeA);
    return _mm_xor_si128(b, blend);
}
unittest
{
    byte16 R = cast(byte16) _mm_max_epu8(_mm_setr_epi8(45, 1, -4, -8, 9,  7, 0,-57, -4,-8,  9,  7, 0,-57, 0,  0),
                                         _mm_setr_epi8(-4,-8,  9,  7, 0,-57, 0,  0, 45, 1, -4, -8, 9,  7, 0,-57));
    byte[16] correct =                                [-4,-8, -4, -8, 9,-57, 0,-57, -4,-8, -4, -8, 9,-57, 0,-57];
    assert(R.array == correct);
}
1766 
/// Compare packed doubles in `a` and `b`, returning packed maximum values.
__m128d _mm_max_pd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_maxpd(a, b);
    }
    else
    {
        // Generates maxpd starting with LDC 1.9
        // When the comparison is false (including equal or unordered/NaN
        // operands) the ternary takes the element from `b`.
        a[0] = (a[0] > b[0]) ? a[0] : b[0];
        a[1] = (a[1] > b[1]) ? a[1] : b[1];
        return a;
    }
}
unittest
{
    __m128d A = _mm_setr_pd(4.0, 1.0);
    __m128d B = _mm_setr_pd(1.0, 8.0);
    __m128d M = _mm_max_pd(A, B);
    assert(M.array[0] == 4.0);
    assert(M.array[1] == 8.0);
}
1789 
/// Return the maximum of the lower doubles of `a` and `b` in the lower
/// lane of the result; the upper lane is copied from `a`.
__m128d _mm_max_sd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_maxsd(a, b);
    }
    else
    {
         __m128d r = a;
        // Generates maxsd starting with LDC 1.3
        // When the comparison is false (equal or unordered) the element
        // from `b` is selected.
        r.array[0] = (a.array[0] > b.array[0]) ? a.array[0] : b.array[0];
        return r;
    }
}
unittest
{
    __m128d A = _mm_setr_pd(1.0, 1.0);
    __m128d B = _mm_setr_pd(4.0, 2.0);
    __m128d M = _mm_max_sd(A, B);
    assert(M.array[0] == 4.0);
    assert(M.array[1] == 1.0);
}
1812 
/// Perform a serializing operation on all load and store instructions
/// issued prior to this call (emits the `mfence` instruction).
version(GNU)
{
    void _mm_mfence() pure @trusted
    {
        static if (GDC_with_SSE2)
        {
            __builtin_ia32_mfence();
        }
        else version(X86)
        {
            // GDC without SSE2: inline asm fallback (32-bit x86 only).
            asm pure nothrow @nogc @trusted
            {
                "mfence;\n" : : : ;
            }
        }
        else
            static assert(false); // no way to emit mfence on this target
    }
}
else version(LDC)
{
    alias _mm_mfence = __builtin_ia32_mfence;
}
else static if (DMD_with_asm)
{
    void _mm_mfence() pure @safe
    {
        asm nothrow @nogc pure @safe
        {
            mfence;
        }
    }
}
else
    static assert(false);
unittest
{
    _mm_mfence();
}
1852 
/// Compare packed signed 16-bit integers in `a` and `b`, returning
/// the packed minimum values.
__m128i _mm_min_epi16 (__m128i a, __m128i b) pure @safe
{
    // Note: clang uses a __builtin_ia32_pminsw128 which has disappeared from LDC LLVM (?)
    // Branch-free select: min = b ^ ((a ^ b) & (a < b)).
    __m128i takeA = _mm_cmplt_epi16(a, b);  // all-ones where a should be selected
    __m128i diff  = _mm_xor_si128(a, b);    // b ^ diff == a
    __m128i blend = _mm_and_si128(diff, takeA);
    return _mm_xor_si128(b, blend);
}
unittest
{
    short8 R = cast(short8) _mm_min_epi16(_mm_setr_epi16(45, 1, -4, -8, 9,  7, 0,-57),
                                          _mm_setr_epi16(-4,-8,  9,  7, 0,-57, 0,  0));
    short[8] correct =  [-4,-8, -4, -8, 0,-57, 0, -57];
    assert(R.array == correct);
}
1869 
1870 
/// Compare packed unsigned 8-bit integers in `a` and `b`, returning
/// the packed minimum values.
__m128i _mm_min_epu8 (__m128i a, __m128i b) pure @safe
{
    // Same remark as with _mm_min_epi16: clang uses mystery intrinsics we don't have.
    // SSE2 has no unsigned byte compare, so bias both operands by -128 and
    // use the signed compare; then blend: min = b ^ ((a ^ b) & (a < b)).
    __m128i bias  = _mm_set1_epi8(-128);
    __m128i takeA = _mm_cmplt_epi8(_mm_add_epi8(a, bias), _mm_add_epi8(b, bias));
    __m128i diff  = _mm_xor_si128(a, b); // b ^ diff == a
    __m128i blend = _mm_and_si128(diff, takeA);
    return _mm_xor_si128(b, blend);
}
unittest
{
    byte16 R = cast(byte16) _mm_min_epu8(_mm_setr_epi8(45, 1, -4, -8, 9,  7, 0,-57, -4,-8,  9,  7, 0,-57, 0,  0),
                                         _mm_setr_epi8(-4,-8,  9,  7, 0,-57, 0,  0, 45, 1, -4, -8, 9,  7, 0,-57));
    byte[16] correct =                                [45, 1,  9,  7, 0,  7, 0,  0, 45, 1,  9,  7, 0,  7, 0,  0];
    assert(R.array == correct);
}
1887 
/// Compare packed doubles in `a` and `b`, returning packed minimum values.
__m128d _mm_min_pd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_minpd(a, b);
    }
    else
    {
        // Generates minpd starting with LDC 1.9
        // When the comparison is false (equal or unordered/NaN) the
        // ternary takes the element from `b`.
        a.array[0] = (a.array[0] < b.array[0]) ? a.array[0] : b.array[0];
        a.array[1] = (a.array[1] < b.array[1]) ? a.array[1] : b.array[1];
        return a;
    }
}
unittest
{
    __m128d A = _mm_setr_pd(1.0, 2.0);
    __m128d B = _mm_setr_pd(4.0, 1.0);
    __m128d M = _mm_min_pd(A, B);
    assert(M.array[0] == 1.0);
    assert(M.array[1] == 1.0);
}
1910 
/// Return the minimum of the lower doubles of `a` and `b` in the lower
/// lane of the result; the upper lane is copied from `a`.
__m128d _mm_min_sd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_minsd(a, b);
    }
    else
    {
        // Generates minsd starting with LDC 1.3
        // When the comparison is false (equal or unordered) the element
        // from `b` is selected.
        __m128d r = a;
        r.array[0] = (a.array[0] < b.array[0]) ? a.array[0] : b.array[0];
        return r;
    }
}
unittest
{
    __m128d A = _mm_setr_pd(1.0, 3.0);
    __m128d B = _mm_setr_pd(4.0, 2.0);
    __m128d M = _mm_min_sd(A, B);
    assert(M.array[0] == 1.0);
    assert(M.array[1] == 3.0);
}
1933 
/// Copy the lower 64-bit integer of `a` to the lower element of the
/// result, zeroing the upper element.
__m128i _mm_move_epi64 (__m128i a) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_movq128(a);
    }
    else
    {
        long2 result = [ 0, 0 ];
        long2 la = cast(long2) a;
        result.array[0] = la.array[0];
        return cast(__m128i)(result);
    }
}
unittest
{
    long2 A = [13, 47];
    long2 B = cast(long2) _mm_move_epi64( cast(__m128i)A );
    long[2] correct = [13, 0];
    assert(B.array == correct);
}
1955 
/// Build a vector from the lower double of `b` and the upper double
/// of `a`: result = [ b[0], a[1] ].
__m128d _mm_move_sd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_movsd(a, b); 
    }
    else
    {
        b.array[1] = a.array[1];
        return b;
    }
}
unittest
{
    double2 A = [13.0, 47.0];
    double2 B = [34.0, 58.0];
    double2 C = _mm_move_sd(A, B);
    double[2] correct = [34.0, 47.0];
    assert(C.array == correct);
}
1976 
version(LDC)
{
    /// Create mask from the most significant bit of each 8-bit element in `v`.
    alias _mm_movemask_epi8 = __builtin_ia32_pmovmskb128;
}
else
{
    static if (GDC_with_SSE2)
    {
        /// Create mask from the most significant bit of each 8-bit element in `v`.
        alias _mm_movemask_epi8 = __builtin_ia32_pmovmskb128;
    }
    else
    {
        /// Create mask from the most significant bit of each 8-bit element in `v`.
        int _mm_movemask_epi8(__m128i v) pure @safe
        {
            byte16 ai = cast(byte16)v;
            int r = 0;
            foreach(bit; 0..16)
            {
                // A negative signed byte has its top bit set.
                if (ai.array[bit] < 0) r += (1 << bit);
            }
            return r;
        }
    }
}
unittest
{
    assert(0x9C36 == _mm_movemask_epi8(_mm_set_epi8(-1, 0, 0, -1, -1, -1, 0, 0, 0, 0, -1, -1, 0, -1, -1, 0)));
}
2008 
version(LDC)
{
    /// Set each bit of mask `dst` based on the most significant bit of the corresponding
    /// packed double-precision (64-bit) floating-point element in `v`.
    alias _mm_movemask_pd = __builtin_ia32_movmskpd;
}
else
{
    static if (GDC_with_SSE2)
    {
        /// Set each bit of mask `dst` based on the most significant bit of the corresponding
        /// packed double-precision (64-bit) floating-point element in `v`.
        alias _mm_movemask_pd = __builtin_ia32_movmskpd;
    }
    else
    {
        /// Set each bit of mask `dst` based on the most significant bit of the corresponding
        /// packed double-precision (64-bit) floating-point element in `v`.
        int _mm_movemask_pd(__m128d v) pure @safe
        {
            // The sign bit of the double is the sign bit of the
            // same lane reinterpreted as a signed 64-bit integer.
            long2 lv = cast(long2)v;
            int r = 0;
            if (lv.array[0] < 0) r += 1;
            if (lv.array[1] < 0) r += 2;
            return r;
        }
    }
}
unittest
{
    __m128d A = cast(__m128d) _mm_set_epi64x(-1, 0);
    assert(_mm_movemask_pd(A) == 2);
}
2042 
/// Copy the lower 64-bit integer in `v`.
__m64 _mm_movepi64_pi64 (__m128i v) pure @safe
{
    long low = (cast(long2)v).array[0];
    return long1(low);
}
unittest
{
    __m128i A = _mm_set_epi64x(-1, -2);
    __m64 R = _mm_movepi64_pi64(A);
    assert(R.array[0] == -2);
}
2055 
/// Copy the 64-bit integer `a` to the lower element of dest, and zero the upper element.
__m128i _mm_movpi64_epi64 (__m64 a) pure @trusted
{
    long2 widened = [0, 0];
    widened.ptr[0] = a.array[0];
    return cast(__m128i) widened;
}
2064 
// PERF: unfortunately, __builtin_ia32_pmuludq128 disappeared from LDC
// and is SSE4.1 in GDC
// but seems there in clang
/// Multiply the low unsigned 32-bit integer of each 64-bit lane of `a`
/// and `b`, producing two unsigned 64-bit products.
__m128i _mm_mul_epu32 (__m128i a, __m128i b) pure @trusted
{
    __m128i zero = _mm_setzero_si128();
    // Interleave each low int32 with zero to zero-extend lanes 0 and 2
    // to 64 bits before multiplying.
    long2 la = cast(long2) shufflevector!(int4, 0, 4, 2, 6)(a, zero);
    long2 lb = cast(long2) shufflevector!(int4, 0, 4, 2, 6)(b, zero);
    static if (__VERSION__ >= 2076)
    {
        return cast(__m128i)(la * lb);
    }
    else
    {
        // long2 mul not supported before LDC 1.5
        la.ptr[0] *= lb.array[0];
        la.ptr[1] *= lb.array[1];
        return cast(__m128i)(la);
    }
}
unittest
{
    __m128i A = _mm_set_epi32(42, 0xDEADBEEF, 42, 0xffffffff);
    __m128i B = _mm_set_epi32(42, 0xCAFEBABE, 42, 0xffffffff);
    __m128i C = _mm_mul_epu32(A, B);
    long2 LC = cast(long2)C;
    assert(LC.array[0] == 18446744065119617025uL);
    assert(LC.array[1] == 12723420444339690338uL);
}
2094 
2095 
/// Multiply packed double-precision elements in `a` and `b`.
__m128d _mm_mul_pd(__m128d a, __m128d b) pure @safe
{
    __m128d product = a * b;
    return product;
}
unittest
{
    __m128d a = [-2.0, 1.5];
    a = _mm_mul_pd(a, a);
    assert(a.array == [4.0, 2.25]);
}
2106 
/// Multiply the lower doubles of `a` and `b`; the upper lane of the
/// result is copied from `a`.
version(DigitalMars)
{
    // Work-around for https://issues.dlang.org/show_bug.cgi?id=19599
    __m128d _mm_mul_sd(__m128d a, __m128d b) pure @safe
    {
        asm pure nothrow @nogc @trusted { nop;} // inhibits the miscompiling optimization
        a.array[0] = a.array[0] * b.array[0];
        return a;
    }
}
else
{
    static if (GDC_with_SSE2)
    {
        alias _mm_mul_sd = __builtin_ia32_mulsd;
    }
    else
    {
        __m128d _mm_mul_sd(__m128d a, __m128d b) pure @safe
        {
            a.array[0] *= b.array[0];
            return a;
        }
    }
}
unittest
{
    __m128d a = [-2.0, 1.5];
    a = _mm_mul_sd(a, a);
    assert(a.array == [4.0, 1.5]);
}
2138 
/// Multiply the low unsigned 32-bit integers from `a` and `b`, 
/// and get an unsigned 64-bit result.
__m64 _mm_mul_su32 (__m64 a, __m64 b) pure @safe
{
    // Widen both operands to 128 bits, multiply, then narrow back to MMX.
    __m128i wideProduct = _mm_mul_epu32(to_m128i(a), to_m128i(b));
    return to_m64(wideProduct);
}
unittest
{
    __m64 A = _mm_set_pi32(42, 0xDEADBEEF);
    __m64 B = _mm_set_pi32(42, 0xCAFEBABE);
    __m64 C = _mm_mul_su32(A, B);
    assert(C.array[0] == 0xDEADBEEFuL * 0xCAFEBABEuL);
}
2152 
/// Multiply packed signed 16-bit integers, returning the high 16 bits of
/// each 32-bit intermediate product.
version(LDC)
{
    alias _mm_mulhi_epi16 = __builtin_ia32_pmulhw128;
}
else
{
    static if (GDC_with_SSE2)
    {
        alias _mm_mulhi_epi16 = __builtin_ia32_pmulhw128;
    }
    else
    {
        __m128i _mm_mulhi_epi16 (__m128i a, __m128i b) pure @safe
        {
            short8 sa = cast(short8)a;
            short8 sb = cast(short8)b;
            short8 r = void; // every lane is assigned below
            // short * short promotes to int; keep the high half.
            r.array[0] = (sa.array[0] * sb.array[0]) >> 16;
            r.array[1] = (sa.array[1] * sb.array[1]) >> 16;
            r.array[2] = (sa.array[2] * sb.array[2]) >> 16;
            r.array[3] = (sa.array[3] * sb.array[3]) >> 16;
            r.array[4] = (sa.array[4] * sb.array[4]) >> 16;
            r.array[5] = (sa.array[5] * sb.array[5]) >> 16;
            r.array[6] = (sa.array[6] * sb.array[6]) >> 16;
            r.array[7] = (sa.array[7] * sb.array[7]) >> 16;
            return cast(__m128i)r;
        }
    }
}
unittest
{
    __m128i A = _mm_setr_epi16(0, -16, 2, 3, 4, 8, 16, 7);
    __m128i B = _mm_set1_epi16(16384);
    short8 R = cast(short8)_mm_mulhi_epi16(A, B);
    short[8] correct = [0, -4, 0, 0, 1, 2, 4, 1];
    assert(R.array == correct);
}
2190 
/// Multiply packed unsigned 16-bit integers, returning the high 16 bits
/// of each 32-bit intermediate product.
version(LDC)
{
    alias _mm_mulhi_epu16 = __builtin_ia32_pmulhuw128;
}
else
{
    static if (GDC_with_SSE2)
    {
        alias _mm_mulhi_epu16 = __builtin_ia32_pmulhuw128;
    }
    else
    {
        __m128i _mm_mulhi_epu16 (__m128i a, __m128i b) pure @safe
        {
            short8 sa = cast(short8)a;
            short8 sb = cast(short8)b;
            short8 r = void; // every lane is assigned below
            // ushort * ushort promotes to (signed) int; the cast(short)
            // keeps only the low 16 bits of the shifted value, so a sign
            // bit produced by int overflow does not affect the result.
            r.array[0] = cast(short)( (cast(ushort)sa.array[0] * cast(ushort)sb.array[0]) >> 16 );
            r.array[1] = cast(short)( (cast(ushort)sa.array[1] * cast(ushort)sb.array[1]) >> 16 );
            r.array[2] = cast(short)( (cast(ushort)sa.array[2] * cast(ushort)sb.array[2]) >> 16 );
            r.array[3] = cast(short)( (cast(ushort)sa.array[3] * cast(ushort)sb.array[3]) >> 16 );
            r.array[4] = cast(short)( (cast(ushort)sa.array[4] * cast(ushort)sb.array[4]) >> 16 );
            r.array[5] = cast(short)( (cast(ushort)sa.array[5] * cast(ushort)sb.array[5]) >> 16 );
            r.array[6] = cast(short)( (cast(ushort)sa.array[6] * cast(ushort)sb.array[6]) >> 16 );
            r.array[7] = cast(short)( (cast(ushort)sa.array[7] * cast(ushort)sb.array[7]) >> 16 );
            return cast(__m128i)r;
        }
    }
}
unittest
{
    __m128i A = _mm_setr_epi16(0, -16, 2, 3, 4, 8, 16, 7);
    __m128i B = _mm_set1_epi16(16384);
    short8 R = cast(short8)_mm_mulhi_epu16(A, B);
    short[8] correct = [0, 0x3FFC, 0, 0, 1, 2, 4, 1];
    assert(R.array == correct);
}
2228 
/// Multiply packed 16-bit integers in `a` and `b`, keeping only the low
/// 16 bits of each 32-bit product (PMULLW).
__m128i _mm_mullo_epi16 (__m128i a, __m128i b) pure @safe
{
    short8 sa = cast(short8)a;
    short8 sb = cast(short8)b;
    short8 product = sa * sb; // lane-wise truncating multiply
    return cast(__m128i)product;
}
unittest
{
    __m128i A = _mm_setr_epi16(16384, -16, 0,      3, 4, 1, 16, 7);
    __m128i B = _mm_set1_epi16(16384);
    short8 R = cast(short8)_mm_mullo_epi16(A, B);
    short[8] correct = [0, 0, 0, -16384, 0, 16384, 0, -16384];
    assert(R.array == correct);
}
2241 
/// Compute the bitwise OR of the packed double-precision elements
/// in `a` and `b`.
__m128d _mm_or_pd (__m128d a, __m128d b) pure @safe
{
    // OR the raw bit patterns, then reinterpret back as doubles.
    __m128i bitsA = cast(__m128i)a;
    __m128i bitsB = cast(__m128i)b;
    return cast(__m128d)(bitsA | bitsB);
}
2246 
/// Compute the bitwise OR of the 128 bits in `a` and `b`.
__m128i _mm_or_si128 (__m128i a, __m128i b) pure @safe
{
    __m128i r = a | b;
    return r;
}
2251 
/// Pack the signed 32-bit integers of `a` then `b` into signed 16-bit
/// integers using signed saturation (PACKSSDW).
version(LDC)
{
    alias _mm_packs_epi32 = __builtin_ia32_packssdw128;
}
else
{
    static if (GDC_with_SSE2)
    {
        alias _mm_packs_epi32 = __builtin_ia32_packssdw128;
    }
    else
    {
        // Scalar fallback: lanes of `a` fill the low half, lanes of `b` the high half.
        __m128i _mm_packs_epi32 (__m128i a, __m128i b) pure @safe
        {
            short8 r;
            r.array[0] = saturateSignedIntToSignedShort(a.array[0]);
            r.array[1] = saturateSignedIntToSignedShort(a.array[1]);
            r.array[2] = saturateSignedIntToSignedShort(a.array[2]);
            r.array[3] = saturateSignedIntToSignedShort(a.array[3]);
            r.array[4] = saturateSignedIntToSignedShort(b.array[0]);
            r.array[5] = saturateSignedIntToSignedShort(b.array[1]);
            r.array[6] = saturateSignedIntToSignedShort(b.array[2]);
            r.array[7] = saturateSignedIntToSignedShort(b.array[3]);
            return cast(__m128i)r;
        }
    }
}
unittest
{
    // +-100000 are out of short range and must clamp to +-32767/-32768.
    __m128i A = _mm_setr_epi32(100000, -100000, 1000, 0);
    short8 R = cast(short8) _mm_packs_epi32(A, A);
    short[8] correct = [32767, -32768, 1000, 0, 32767, -32768, 1000, 0];
    assert(R.array == correct);
}
2286 
/// Pack the signed 16-bit integers of `a` then `b` into signed 8-bit
/// integers using signed saturation (PACKSSWB).
version(LDC)
{
    alias _mm_packs_epi16 = __builtin_ia32_packsswb128;
}
else
{
    static if (GDC_with_SSE2)
    {
        alias _mm_packs_epi16 = __builtin_ia32_packsswb128;
    }
    else
    {
        // Scalar fallback: lanes of `a` fill bytes 0..7, lanes of `b` bytes 8..15.
        __m128i _mm_packs_epi16 (__m128i a, __m128i b) pure @safe
        {
            byte16 r;
            short8 sa = cast(short8)a;
            short8 sb = cast(short8)b;
            foreach(i; 0..8)
                r.array[i] = saturateSignedWordToSignedByte(sa.array[i]);
            foreach(i; 0..8)
                r.array[i+8] = saturateSignedWordToSignedByte(sb.array[i]);
            return cast(__m128i)r;
        }
    }
}
unittest
{
    // 1000/-1000/256/-129 are out of byte range and clamp to 127/-128.
    __m128i A = _mm_setr_epi16(1000, -1000, 1000, 0, 256, -129, 254, 0);
    byte16 R = cast(byte16) _mm_packs_epi16(A, A);
    byte[16] correct = [127, -128, 127, 0, 127, -128, 127, 0,
                        127, -128, 127, 0, 127, -128, 127, 0];
    assert(R.array == correct);
}
2320 
/// Pack the signed 16-bit integers of `a` then `b` into unsigned 8-bit
/// integers using unsigned saturation (PACKUSWB).
version(LDC)
{
    alias _mm_packus_epi16 = __builtin_ia32_packuswb128;
}
else
{
    static if (GDC_with_SSE2)
    {
        alias _mm_packus_epi16 = __builtin_ia32_packuswb128;
    }
    else
    {
        // Scalar fallback. @trusted because of the raw-pointer load at the end.
        __m128i _mm_packus_epi16 (__m128i a, __m128i b) pure @trusted
        {
            short8 sa = cast(short8)a;
            short8 sb = cast(short8)b;
            ubyte[16] result = void;
            for (int i = 0; i < 8; ++i)
            {
                // Clamp each signed word into [0, 255].
                short s = sa[i];
                if (s < 0) s = 0;
                if (s > 255) s = 255;
                result[i] = cast(ubyte)s;

                s = sb[i];
                if (s < 0) s = 0;
                if (s > 255) s = 255;
                result[i+8] = cast(ubyte)s;
            }
            return cast(__m128i) loadUnaligned!(byte16)(cast(byte*)result.ptr);
        }
    }
}
unittest
{
    // -10 clamps to 0; 400 and 256 clamp to 255.
    __m128i A = _mm_setr_epi16(-10, 400, 0, 256, 255, 2, 1, 0);
    byte16 AA = cast(byte16) _mm_packus_epi16(A, A);
    static immutable ubyte[16] correctResult = [0, 255, 0, 255, 255, 2, 1, 0,
                                                0, 255, 0, 255, 255, 2, 1, 0];
    foreach(i; 0..16)
        assert(AA.array[i] == cast(byte)(correctResult[i]));
}
2363 
2364 
/// Hint to the processor that the calling thread is in a spin-wait loop (PAUSE).
version(GNU)
{
    void _mm_pause() pure @trusted
    {
        static if (GDC_with_SSE2)
        {
            __builtin_ia32_pause();
        }
        else version(X86)
        {
            // GDC without SSE2: emit the instruction with extended inline asm.
            asm pure nothrow @nogc @trusted
            {
                "pause;\n" : : : ;
            }
        }
        else
            static assert(false);
    }
}
else version(LDC)
{
    alias _mm_pause = __builtin_ia32_pause;
}
else static if (DMD_with_asm)
{
    void _mm_pause() pure @safe
    {
        asm nothrow @nogc pure @safe
        {
            rep; nop; // F3 90 =  pause
        }
    }
}
else
    static assert(false);
unittest
{
    // Smoke test: just make sure the hint is callable.
    _mm_pause();
}
2404 
2405 
/// Compute the sum of absolute differences of the unsigned 8-bit lanes of
/// `a` and `b` (PSADBW): one 16-bit sum per 8-byte half, each stored in the
/// low 16 bits of a 64-bit lane.
version(LDC)
{
    alias _mm_sad_epu8 = __builtin_ia32_psadbw128;
}
else
{
    static if (GDC_with_SSE2)
    {
        alias _mm_sad_epu8 = __builtin_ia32_psadbw128;
    }
    else
    {
        // Scalar fallback.
        __m128i _mm_sad_epu8 (__m128i a, __m128i b) pure @safe
        {
            byte16 ab = cast(byte16)a;
            byte16 bb = cast(byte16)b;
            ubyte[16] t;
            foreach(i; 0..16)
            {
                // Absolute difference of the lanes, treated as unsigned bytes.
                int diff = cast(ubyte)(ab.array[i]) - cast(ubyte)(bb.array[i]);
                if (diff < 0) diff = -diff;
                t[i] = cast(ubyte)(diff);
            }
            // Sums land in int lanes 0 and 2, i.e. the bottom of each 64-bit lane.
            int4 r = _mm_setzero_si128();
            r.array[0] = t[0] + t[1] + t[2] + t[3] + t[4] + t[5] + t[6] + t[7];
            r.array[2] = t[8] + t[9] + t[10]+ t[11]+ t[12]+ t[13]+ t[14]+ t[15];
            return r;
        }
    }
}
unittest
{
    __m128i A = _mm_setr_epi8(3, 4, 6, 8, 12, 14, 18, 20, 24, 30, 32, 38, 42, 44, 48, 54); // primes + 1
    __m128i B = _mm_set1_epi8(1);
    __m128i R = _mm_sad_epu8(A, B);
    int[4] correct = [2 + 3 + 5 + 7 + 11 + 13 + 17 + 19,
                      0,
                      23 + 29 + 31 + 37 + 41 + 43 + 47 + 53,
                      0];
    assert(R.array == correct);
}
2447 
/// Set packed 16-bit integers with the supplied values; `e0` becomes the
/// lowest lane.
__m128i _mm_set_epi16 (short e7, short e6, short e5, short e4, short e3, short e2, short e1, short e0) pure @trusted
{
    short[8] result = [e0, e1, e2, e3, e4, e5, e6, e7];
    return cast(__m128i) loadUnaligned!(short8)(result.ptr);
}
unittest
{
    __m128i A = _mm_set_epi16(7, 6, 5, 4, 3, 2, 1, 0);
    short8 B = cast(short8) A;
    foreach(i; 0..8)
        assert(B.array[i] == i);
}
2460 
/// Set packed 32-bit integers with the supplied values; `e0` becomes the
/// lowest lane.
__m128i _mm_set_epi32 (int e3, int e2, int e1, int e0) pure @trusted
{
    int[4] result = [e0, e1, e2, e3];
    return loadUnaligned!(int4)(result.ptr);
}
unittest
{
    __m128i A = _mm_set_epi32(3, 2, 1, 0);
    foreach(i; 0..4)
        assert(A.array[i] == i);
}
2472 
/// Set packed 64-bit integers from two `__m64` values; `e0` becomes the
/// low lane.
__m128i _mm_set_epi64(__m64 e1, __m64 e0) pure @trusted
{
    long[2] result = [e0.array[0], e1.array[0]];
    return cast(__m128i)( loadUnaligned!(long2)(result.ptr) );
}
unittest
{
    __m128i A = _mm_set_epi64(_mm_cvtsi64_m64(1234), _mm_cvtsi64_m64(5678));
    long2 B = cast(long2) A;
    assert(B.array[0] == 5678);
    assert(B.array[1] == 1234);
}
2485 
/// Set packed 64-bit integers from two `long` values; `e0` becomes the
/// low lane.
__m128i _mm_set_epi64x (long e1, long e0) pure @trusted
{
    long[2] result = [e0, e1];
    return cast(__m128i)( loadUnaligned!(long2)(result.ptr) );
}
unittest
{
    __m128i A = _mm_set_epi64x(1234, 5678);
    long2 B = cast(long2) A;
    assert(B.array[0] == 5678);
    assert(B.array[1] == 1234);
}
2498 
/// Set packed 8-bit integers with the supplied values; `e0` becomes the
/// lowest lane.
__m128i _mm_set_epi8 (byte e15, byte e14, byte e13, byte e12,
                      byte e11, byte e10, byte e9, byte e8,
                      byte e7, byte e6, byte e5, byte e4,
                      byte e3, byte e2, byte e1, byte e0) pure @trusted
{
    byte[16] result = [e0, e1,  e2,  e3,  e4,  e5,  e6, e7,
                     e8, e9, e10, e11, e12, e13, e14, e15];
    return cast(__m128i)( loadUnaligned!(byte16)(result.ptr) );
}
2508 
/// Set packed doubles with the supplied values; `e0` becomes the low lane.
__m128d _mm_set_pd (double e1, double e0) pure @trusted
{
    double[2] result = [e0, e1];
    return loadUnaligned!(double2)(result.ptr);
}
unittest
{
    __m128d A = _mm_set_pd(61.0, 55.0);
    double[2] correct = [55.0, 61.0];
    assert(A.array == correct);
}
2520 
/// Broadcast double `a` to both lanes.
__m128d _mm_set_pd1 (double a) pure @trusted
{
    double[2] result = [a, a];
    return loadUnaligned!(double2)(result.ptr);
}
unittest
{
    __m128d A = _mm_set_pd1(61.0);
    double[2] correct = [61.0, 61.0];
    assert(A.array == correct);
}
2532 
/// Set the low lane to `a` and zero the upper lane.
__m128d _mm_set_sd (double a) pure @trusted
{
    double[2] result = [a, 0];
    return loadUnaligned!(double2)(result.ptr);
}
2538 
/// Broadcast 16-bit integer `a` to all lanes.
__m128i _mm_set1_epi16 (short a) pure @trusted
{
    return cast(__m128i)(short8(a));
}
2543 
/// Broadcast 32-bit integer `a` to all lanes.
__m128i _mm_set1_epi32 (int a) pure @trusted
{
    return cast(__m128i)(int4(a));
}
unittest
{
    // Masking the sign bit of -1.0f with 0x7fffffff yields +1.0f.
    __m128 a = _mm_set1_ps(-1.0f);
    __m128 b = cast(__m128) _mm_set1_epi32(0x7fffffff);
    assert(_mm_and_ps(a, b).array == [1.0f, 1, 1, 1]);
}
2554 
/// Broadcast 64-bit integer `a` to all elements of `dst`.
__m128i _mm_set1_epi64 (__m64 a) pure @safe
{
    return _mm_set_epi64(a, a);
}
2560 
/// Broadcast 64-bit integer `a` to both lanes.
__m128i _mm_set1_epi64x (long a) pure @trusted
{
    return cast(__m128i)(long2(a));
}
2565 
/// Broadcast 8-bit integer `a` to all lanes.
__m128i _mm_set1_epi8 (byte a) pure @trusted
{
    return cast(__m128i)(byte16(a));
}
2570 
/// Broadcast double `a` to both lanes (same operation as `_mm_set_pd1`).
alias _mm_set1_pd = _mm_set_pd1;
2572 
/// Set packed 16-bit integers in reverse order: the first argument becomes
/// the lowest lane.
__m128i _mm_setr_epi16 (short e7, short e6, short e5, short e4, 
                        short e3, short e2, short e1, short e0) pure @trusted
{
    short[8] result = [e7, e6, e5, e4, e3, e2, e1, e0];
    return cast(__m128i)( loadUnaligned!(short8)(result.ptr) );
}
2579 
/// Set packed 32-bit integers in reverse order: the first argument becomes
/// the lowest lane.
__m128i _mm_setr_epi32 (int e3, int e2, int e1, int e0) pure @trusted
{
    int[4] result = [e3, e2, e1, e0];
    return cast(__m128i)( loadUnaligned!(int4)(result.ptr) );
}
2585 
/// Set packed 64-bit integers in reverse order: the first argument becomes
/// the low lane.
__m128i _mm_setr_epi64 (long e1, long e0) pure @trusted
{
    long[2] result = [e1, e0];
    return cast(__m128i)( loadUnaligned!(long2)(result.ptr) );
}
2591 
/// Set packed 8-bit integers in reverse order: the first argument becomes
/// the lowest lane.
__m128i _mm_setr_epi8 (byte e15, byte e14, byte e13, byte e12,
                       byte e11, byte e10, byte e9,  byte e8,
                       byte e7,  byte e6,  byte e5,  byte e4,
                       byte e3,  byte e2,  byte e1,  byte e0) pure @trusted
{
    byte[16] result = [e15, e14, e13, e12, e11, e10, e9, e8,
                      e7,  e6,  e5,  e4,  e3,  e2, e1, e0];
    return cast(__m128i)( loadUnaligned!(byte16)(result.ptr) );
}
2601 
/// Set packed doubles in reverse order: the first argument becomes the
/// low lane.
__m128d _mm_setr_pd (double e1, double e0) pure @trusted
{
    double[2] result = [e1, e0];
    return loadUnaligned!(double2)(result.ptr);
}
unittest
{
    __m128d A = _mm_setr_pd(61.0, 55.0);
    double[2] correct = [61.0, 55.0];
    assert(A.array == correct);
}
2613 
/// Return a vector of doubles with both lanes zero.
__m128d _mm_setzero_pd () pure @trusted
{
    double[2] result = [0.0, 0.0];
    return loadUnaligned!(double2)(result.ptr);
}
2619 
/// Return a 128-bit integer vector with all bits zero.
__m128i _mm_setzero_si128() pure @trusted
{
    int[4] result = [0, 0, 0, 0];
    return cast(__m128i)( loadUnaligned!(int4)(result.ptr) );
}
2625 
/// Shuffle 32-bit lanes of `a` according to the compile-time immediate
/// `imm8` (PSHUFD): each 2-bit field of `imm8` selects a source lane.
__m128i _mm_shuffle_epi32(int imm8)(__m128i a) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_pshufd(a, imm8);
    }
    else
    {
        return shufflevector!(int4, (imm8 >> 0) & 3,
                                    (imm8 >> 2) & 3,
                                    (imm8 >> 4) & 3,
                                    (imm8 >> 6) & 3)(a, a);
    }
}
unittest
{
    __m128i A = _mm_setr_epi32(0, 1, 2, 3);
    enum int SHUFFLE = _MM_SHUFFLE(0, 1, 2, 3);
    int4 B = cast(int4) _mm_shuffle_epi32!SHUFFLE(A);
    int[4] expectedB = [ 3, 2, 1, 0 ];
    assert(B.array == expectedB);
}
2648 
/// Shuffle doubles using the compile-time immediate `imm8` (SHUFPD):
/// bit 0 picks the low lane from `a`, bit 1 picks the high lane from `b`.
__m128d _mm_shuffle_pd (int imm8)(__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_shufpd(a, b, imm8);
    }
    else
    {
        // Indices 0-1 refer to `a`, 2-3 to `b` in the concatenated shuffle.
        return shufflevector!(double2, 0 + ( imm8 & 1 ),
                                       2 + ( (imm8 >> 1) & 1 ))(a, b);
    }
}
unittest
{
    __m128d A = _mm_setr_pd(0.5, 2.0);
    __m128d B = _mm_setr_pd(4.0, 5.0);
    enum int SHUFFLE = _MM_SHUFFLE2(1, 1);
    __m128d R = _mm_shuffle_pd!SHUFFLE(A, B);
    double[2] correct = [ 2.0, 5.0 ];
    assert(R.array == correct);
}
2670 
/// Shuffle the upper four 16-bit lanes of `a` by the compile-time immediate
/// `imm8` (PSHUFHW); the lower four lanes pass through unchanged.
__m128i _mm_shufflehi_epi16(int imm8)(__m128i a) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_pshufhw(a, imm8);
    }
    else
    {
        // Lanes 0-3 are copied as-is; each 2-bit field of imm8 picks among lanes 4-7.
        return cast(__m128i) shufflevector!(short8, 0, 1, 2, 3,
                                          4 + ( (imm8 >> 0) & 3 ),
                                          4 + ( (imm8 >> 2) & 3 ),
                                          4 + ( (imm8 >> 4) & 3 ),
                                          4 + ( (imm8 >> 6) & 3 ))(cast(short8)a, cast(short8)a);
    }
}
unittest
{
    __m128i A = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
    enum int SHUFFLE = _MM_SHUFFLE(0, 1, 2, 3);
    short8 C = cast(short8) _mm_shufflehi_epi16!SHUFFLE(A);
    short[8] expectedC = [ 0, 1, 2, 3, 7, 6, 5, 4 ];
    assert(C.array == expectedC);
}
2694 
/// Shuffle the lower four 16-bit lanes of `a` by the compile-time immediate
/// `imm8` (PSHUFLW); the upper four lanes pass through unchanged.
__m128i _mm_shufflelo_epi16(int imm8)(__m128i a) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_pshuflw(a, imm8);
    }
    else
    {
        // Each 2-bit field of imm8 picks among lanes 0-3; lanes 4-7 are copied as-is.
        return cast(__m128i) shufflevector!(short8, ( (imm8 >> 0) & 3 ),
                                                    ( (imm8 >> 2) & 3 ),
                                                    ( (imm8 >> 4) & 3 ),
                                                    ( (imm8 >> 6) & 3 ), 4, 5, 6, 7)(cast(short8)a, cast(short8)a);
    }
}
unittest
{
    __m128i A = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
    enum int SHUFFLE = _MM_SHUFFLE(0, 1, 2, 3);
    short8 B = cast(short8) _mm_shufflelo_epi16!SHUFFLE(A);
    short[8] expectedB = [ 3, 2, 1, 0, 4, 5, 6, 7 ];
    assert(B.array == expectedB);
}
2717 
/// Shift packed 32-bit integers in `a` left by the amount in the low 64 bits
/// of `count`, shifting in zeros (PSLLD).
version(LDC)
{
    alias _mm_sll_epi32 = __builtin_ia32_pslld128;
}
else static if (GDC_with_SSE2)
{
    alias _mm_sll_epi32 = __builtin_ia32_pslld128;
}
else static if (DMD_with_32bit_asm)
{
    __m128i _mm_sll_epi32 (__m128i a, __m128i count) pure @safe
    {
        asm pure nothrow @nogc @trusted
        {
            movdqu XMM0, a;
            movdqu XMM1, count;
            pslld XMM0, XMM1;
            movdqu a, XMM0;
        }
        return a;
    }
}
else
{

    // Scalar fallback.
    __m128i _mm_sll_epi32 (__m128i a, __m128i count) pure @safe
    {
        int4 r = void;
        long2 lc = cast(long2)count;
        // Only the low 64-bit lane of `count` selects the shift amount.
        int bits = cast(int)(lc.array[0]);
        foreach(i; 0..4)
            r[i] = cast(uint)(a[i]) << bits;
        return r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi32(0, 2, 3, -4);
    __m128i B = _mm_sll_epi32(A, _mm_cvtsi32_si128(1));
    int[4] expectedB = [ 0, 4, 6, -8];
    assert(B.array == expectedB);
}
2760 
/// Shift packed 64-bit integers in `a` left by the amount in the low 64 bits
/// of `count`, shifting in zeros (PSLLQ).
version(LDC)
{
    alias _mm_sll_epi64  = __builtin_ia32_psllq128;
}
else static if (GDC_with_SSE2)
{
    alias _mm_sll_epi64  = __builtin_ia32_psllq128;
}
else static if (DMD_with_32bit_asm)
{
    __m128i _mm_sll_epi64 (__m128i a, __m128i count) pure @safe
    {
        asm pure nothrow @nogc @trusted
        {
            movdqu XMM0, a;
            movdqu XMM1, count;
            psllq XMM0, XMM1;
            movdqu a, XMM0;
        }
        return a;
    }
}
else
{
    // Scalar fallback.
    __m128i _mm_sll_epi64 (__m128i a, __m128i count) pure @safe
    {
        long2 r = void;
        long2 sa = cast(long2)a;
        long2 lc = cast(long2)count;
        // Only the low 64-bit lane of `count` selects the shift amount.
        int bits = cast(int)(lc.array[0]);
        foreach(i; 0..2)
            r.array[i] = cast(ulong)(sa.array[i]) << bits;
        return cast(__m128i)r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi64(8, -4);
    long2 B = cast(long2) _mm_sll_epi64(A, _mm_cvtsi32_si128(1));
    long[2] expectedB = [ 16, -8];
    assert(B.array == expectedB);
}
2803 
/// Shift packed 16-bit integers in `a` left by the amount in the low 64 bits
/// of `count`, shifting in zeros (PSLLW).
version(LDC)
{
    alias _mm_sll_epi16 = __builtin_ia32_psllw128;
}
else static if (GDC_with_SSE2)
{
    alias _mm_sll_epi16 = __builtin_ia32_psllw128;
}
else static if (DMD_with_32bit_asm)
{
    __m128i _mm_sll_epi16 (__m128i a, __m128i count) pure @trusted
    {
        asm pure nothrow @nogc
        {
            movdqu XMM0, a;
            movdqu XMM1, count;
            psllw XMM0, XMM1;
            movdqu a, XMM0;
        }
        return a;
    }
}
else
{
    // Scalar fallback. @trusted because of the r.ptr writes.
    __m128i _mm_sll_epi16 (__m128i a, __m128i count) pure @trusted
    {
        short8 sa = cast(short8)a;
        long2 lc = cast(long2)count;
        // Only the low 64-bit lane of `count` selects the shift amount.
        int bits = cast(int)(lc.array[0]);
        short8 r = void;
        foreach(i; 0..8)
            r.ptr[i] = cast(short)(cast(ushort)(sa.array[i]) << bits);
        return cast(int4)r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi16(0, 1, 2, 3, -4, -5, 6, 7);
    short8 B = cast(short8)( _mm_sll_epi16(A, _mm_cvtsi32_si128(1)) );
    short[8] expectedB =     [ 0, 2, 4, 6, -8, -10, 12, 14 ];
    assert(B.array == expectedB);
}
2846 
/// Shift packed 32-bit integers in `a` left by `imm8`, shifting in zeros.
version(LDC)
{
    alias _mm_slli_epi32 = __builtin_ia32_pslldi128;
}
else
{
    static if (GDC_with_SSE2)
    {
        alias _mm_slli_epi32 = __builtin_ia32_pslldi128;
    }
    else
    {
        // Scalar fallback.
        __m128i _mm_slli_epi32 (__m128i a, int imm8) pure @safe
        {
            int4 r = void;
            foreach(i; 0..4)
                r.array[i] = cast(uint)(a.array[i]) << imm8;
            return r;
        }
    }
}
unittest
{
    __m128i A = _mm_setr_epi32(0, 2, 3, -4);
    __m128i B = _mm_slli_epi32(A, 1);
    int[4] expectedB = [ 0, 4, 6, -8];
    assert(B.array == expectedB);
}
2875 
/// Shift packed 64-bit integers in `a` left by `imm8`, shifting in zeros.
version(LDC)
{
    alias _mm_slli_epi64  = __builtin_ia32_psllqi128;
}
else
{
    static if (GDC_with_SSE2)
    {
        alias _mm_slli_epi64  = __builtin_ia32_psllqi128;
    }
    else
    {
        // Scalar fallback.
        __m128i _mm_slli_epi64 (__m128i a, int imm8) pure @safe
        {
            long2 r = void;
            long2 sa = cast(long2)a;
            foreach(i; 0..2)
                r.array[i] = cast(ulong)(sa.array[i]) << imm8;
            return cast(__m128i)r;
        }
    }
}
unittest
{
    __m128i A = _mm_setr_epi64(8, -4);
    long2 B = cast(long2) _mm_slli_epi64(A, 1);
    long[2] expectedB = [ 16, -8];
    assert(B.array == expectedB);
}
2905 
/// Shift packed 16-bit integers in `a` left by `imm8`, shifting in zeros.
version(LDC)
{
    alias _mm_slli_epi16 = __builtin_ia32_psllwi128;
}
else
{
    static if (GDC_with_SSE2)
    {
        alias _mm_slli_epi16 = __builtin_ia32_psllwi128;
    }
    else
    {
        // Scalar fallback.
        __m128i _mm_slli_epi16 (__m128i a, int imm8) pure @safe
        {
            short8 sa = cast(short8)a;
            short8 r = void;
            foreach(i; 0..8)
                r.array[i] = cast(short)(cast(ushort)(sa.array[i]) << imm8);
            return cast(int4)r;
        }
    }
}
unittest
{
    __m128i A = _mm_setr_epi16(0, 1, 2, 3, -4, -5, 6, 7);
    short8 B = cast(short8)( _mm_slli_epi16(A, 1) );
    short[8] expectedB = [ 0, 2, 4, 6, -8, -10, 12, 14 ];
    assert(B.array == expectedB);
}
2935 
2936 
/// Shift `a` left by `bytes` bytes while shifting in zeros.
__m128i _mm_slli_si128(ubyte bytes)(__m128i op) pure @trusted
{
    // Shifting by 16 bytes or more produces all zeros (PSLLDQ semantics).
    static if (bytes & 0xF0)
    {
        return _mm_setzero_si128();
    }
    else
    {
        static if (GDC_with_SSE2)
        {
            // BUGFIX: this branch previously called the non-existent
            // `__builtin_ia32_i128`. The GCC builtin for PSLLDQ is
            // `__builtin_ia32_pslldqi128`, which takes the shift amount
            // in *bits*, hence `bytes * 8`.
            return cast(__m128i) __builtin_ia32_pslldqi128(op, cast(ubyte)(bytes * 8));
        }
        else version(DigitalMars)
        {
            version(D_InlineAsm_X86)
            {
                asm pure nothrow @nogc @trusted // somehow doesn't work for x86_64
                {
                    movdqu XMM0, op;
                    pslldq XMM0, bytes;
                    movdqu op, XMM0;
                }
                return op;
            }
            else
            {
                // Scalar fallback: move each byte up by `bytes`, zero-fill the bottom.
                byte16 A = cast(byte16)op;
                byte16 R;
                for (int n = 15; n >= bytes; --n)
                    R.ptr[n] = A.array[n-bytes];
                for (int n = bytes-1; n >= 0; --n)
                    R.ptr[n] = 0;
                return cast(__m128i)R;
            }
        }
        else
        {
            // Generic path: a sliding 16-byte window over (zero vector, op).
            return cast(__m128i) shufflevector!(byte16,
            16 - bytes, 17 - bytes, 18 - bytes, 19 - bytes, 20 - bytes, 21 - bytes,
            22 - bytes, 23 - bytes, 24 - bytes, 25 - bytes, 26 - bytes, 27 - bytes,
            28 - bytes, 29 - bytes, 30 - bytes, 31 - bytes)
            (cast(byte16)_mm_setzero_si128(), cast(byte16)op);
        }
    }
}
unittest
{
    __m128i A = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
    short8 R = cast(short8) _mm_slli_si128!8(A); // shift 8 bytes to the left
    short[8] correct = [ 0, 0, 0, 0, 0, 1, 2, 3 ];
    assert(R.array == correct);
}
2990 
/// Compute the square root of each double in `vec` (SQRTPD).
version(LDC)
{
    // Disappeared with LDC 1.11
    static if (__VERSION__ < 2081)
        alias _mm_sqrt_pd = __builtin_ia32_sqrtpd;
    else
    {
        __m128d _mm_sqrt_pd(__m128d vec) pure @safe
        {
            vec.array[0] = llvm_sqrt(vec.array[0]);
            vec.array[1] = llvm_sqrt(vec.array[1]);
            return vec;
        }
    }
}
else
{
    static if (GDC_with_SSE2)
    {
        alias _mm_sqrt_pd = __builtin_ia32_sqrtpd;
    }
    else
    {
        // Scalar fallback using the sqrt available in this module's imports.
        __m128d _mm_sqrt_pd(__m128d vec) pure @safe
        {
            vec.array[0] = sqrt(vec.array[0]);
            vec.array[1] = sqrt(vec.array[1]);
            return vec;
        }
    }
}
3022 
3023 
/// Compute the square root of the low double of `vec`; the upper lane is
/// passed through unchanged.
/// NOTE(review): Intel's `_mm_sqrt_sd` takes two arguments (upper lane comes
/// from the first one) — this one-argument form is a deliberate deviation of
/// this module; confirm against callers before changing.
version(LDC)
{
    // Disappeared with LDC 1.11
    static if (__VERSION__ < 2081)
        alias _mm_sqrt_sd = __builtin_ia32_sqrtsd;
    else
    {
        __m128d _mm_sqrt_sd(__m128d vec) pure @safe
        {
            vec.array[0] = llvm_sqrt(vec.array[0]);
            vec.array[1] = vec.array[1];
            return vec;
        }
    }
}
else
{
    static if (GDC_with_SSE2)
    {
        alias _mm_sqrt_sd = __builtin_ia32_sqrtsd;
    }
    else
    {
        // Scalar fallback.
        __m128d _mm_sqrt_sd(__m128d vec) pure @safe
        {
            vec.array[0] = sqrt(vec.array[0]);
            vec.array[1] = vec.array[1];
            return vec;
        }
    }
}
3055 
3056 
/// Arithmetic-shift packed 16-bit integers in `a` right by the amount in the
/// low 64 bits of `count`, shifting in sign bits (PSRAW).
version(LDC)
{
    alias _mm_sra_epi16 = __builtin_ia32_psraw128;
}
else
{
    static if (GDC_with_SSE2)
    {
        alias _mm_sra_epi16 = __builtin_ia32_psraw128;
    }
    else
    {
        // Scalar fallback.
        __m128i _mm_sra_epi16 (__m128i a, __m128i count) pure @safe
        {
            short8 sa = cast(short8)a;
            long2 lc = cast(long2)count;
            // Only the low 64-bit lane of `count` selects the shift amount.
            int bits = cast(int)(lc.array[0]);
            short8 r = void;
            foreach(i; 0..8)
                r.array[i] = cast(short)(sa.array[i] >> bits);
            return cast(int4)r;
        }
    }
}
unittest
{
    __m128i A = _mm_setr_epi16(0, 1, 2, 3, -4, -5, 6, 7);
    short8 B = cast(short8)( _mm_sra_epi16(A, _mm_cvtsi32_si128(1)) );
    short[8] expectedB = [ 0, 0, 1, 1, -2, -3, 3, 3 ];
    assert(B.array == expectedB);
}
3088 
/// Arithmetic-shift packed 32-bit integers in `a` right by the amount in the
/// low 64 bits of `count`, shifting in sign bits (PSRAD).
version(LDC)
{
    alias _mm_sra_epi32  = __builtin_ia32_psrad128;
}
else
{
    static if (GDC_with_SSE2)
    {
        alias _mm_sra_epi32  = __builtin_ia32_psrad128;
    }
    else
    {
        // Scalar fallback.
        __m128i _mm_sra_epi32 (__m128i a, __m128i count) pure @safe
        {
            int4 r = void;
            long2 lc = cast(long2)count;
            // Only the low 64-bit lane of `count` selects the shift amount.
            int bits = cast(int)(lc.array[0]);
            foreach(i; 0..4)
                r.array[i] = (a.array[i] >> bits);
            return r;
        }
    }
}
unittest
{
    __m128i A = _mm_setr_epi32(0, 2, 3, -4);
    __m128i B = _mm_sra_epi32(A, _mm_cvtsi32_si128(1));
    int[4] expectedB = [ 0, 1, 1, -2];
    assert(B.array == expectedB);
}
3119 
3120 
/// Arithmetic-shift packed 16-bit integers in `a` right by `imm8`,
/// shifting in sign bits.
version(LDC)
{
    alias _mm_srai_epi16 = __builtin_ia32_psrawi128;
}
else
{
    static if (GDC_with_SSE2)
    {
        alias _mm_srai_epi16 = __builtin_ia32_psrawi128;
    }
    else
    {
        // Scalar fallback.
        __m128i _mm_srai_epi16 (__m128i a, int imm8) pure @safe
        {
            short8 sa = cast(short8)a;
            short8 r = void;
            foreach(i; 0..8)
                r.array[i] = cast(short)(sa.array[i] >> imm8);
            return cast(int4)r;
        }
    }
}
unittest
{
    __m128i A = _mm_setr_epi16(0, 1, 2, 3, -4, -5, 6, 7);
    short8 B = cast(short8)( _mm_srai_epi16(A, 1) );
    short[8] expectedB = [ 0, 0, 1, 1, -2, -3, 3, 3 ];
    assert(B.array == expectedB);
}
3150 
/// Arithmetic-shift packed 32-bit integers in `a` right by `imm8`,
/// shifting in sign bits.
version(LDC)
{
    alias _mm_srai_epi32  = __builtin_ia32_psradi128;
}
else
{
    static if (GDC_with_SSE2)
    {
        alias _mm_srai_epi32  = __builtin_ia32_psradi128;
    }
    else
    {
        // Scalar fallback.
        __m128i _mm_srai_epi32 (__m128i a, int imm8) pure @safe
        {
            int4 r = void;
            foreach(i; 0..4)
                r.array[i] = (a.array[i] >> imm8);
            return r;
        }
    }
}
unittest
{
    __m128i A = _mm_setr_epi32(0, 2, 3, -4);
    __m128i B = _mm_srai_epi32(A, 1);
    int[4] expectedB = [ 0, 1, 1, -2];
    assert(B.array == expectedB);
}
3179 
/// Logical-shift packed 16-bit integers in `a` right by the amount in the
/// low 64 bits of `count`, shifting in zeros (PSRLW).
version(LDC)
{
    alias _mm_srl_epi16 = __builtin_ia32_psrlw128;
}
else
{
    static if (GDC_with_SSE2)
    {
        alias _mm_srl_epi16 = __builtin_ia32_psrlw128;
    }
    else
    {
        // Scalar fallback.
        __m128i _mm_srl_epi16 (__m128i a, __m128i count) pure @safe
        {
            short8 sa = cast(short8)a;
            long2 lc = cast(long2)count;
            // Only the low 64-bit lane of `count` selects the shift amount.
            int bits = cast(int)(lc.array[0]);
            short8 r = void;
            foreach(i; 0..8)
                r.array[i] = cast(short)(cast(ushort)(sa.array[i]) >> bits);
            return cast(int4)r;
        }
    }
}
unittest
{
    __m128i A = _mm_setr_epi16(0, 1, 2, 3, -4, -5, 6, 7);
    short8 B = cast(short8)( _mm_srl_epi16(A, _mm_cvtsi32_si128(1)) );
    short[8] expectedB = [ 0, 0, 1, 1, 0x7FFE, 0x7FFD, 3, 3 ];
    assert(B.array == expectedB);
}
3211 
/// Logical-shift packed 32-bit integers in `a` right by the amount in the
/// low 64 bits of `count`, shifting in zeros (PSRLD).
version(LDC)
{
    alias _mm_srl_epi32  = __builtin_ia32_psrld128;
}
else
{
    static if (GDC_with_SSE2)
    {
        alias _mm_srl_epi32  = __builtin_ia32_psrld128;
    }
    else
    {
        // Scalar fallback.
        __m128i _mm_srl_epi32 (__m128i a, __m128i count) pure @safe
        {
            int4 r = void;
            long2 lc = cast(long2)count;
            // Only the low 64-bit lane of `count` selects the shift amount.
            int bits = cast(int)(lc.array[0]);
            foreach(i; 0..4)
                r.array[i] = cast(uint)(a.array[i]) >> bits;
            return r;
        }
    }
}
unittest
{
    __m128i A = _mm_setr_epi32(0, 2, 3, -4);
    __m128i B = _mm_srl_epi32(A, _mm_cvtsi32_si128(1));
    int[4] expectedB = [ 0, 1, 1, 0x7FFFFFFE];
    assert(B.array == expectedB);
}
3242 
/// Logical-shift packed 64-bit integers in `a` right by the amount in the
/// low 64 bits of `count`, shifting in zeros (PSRLQ).
version(LDC)
{
    alias _mm_srl_epi64  = __builtin_ia32_psrlq128;
}
else
{
    static if (GDC_with_SSE2)
    {
        alias _mm_srl_epi64  = __builtin_ia32_psrlq128;
    }
    else
    {
        // Scalar fallback.
        __m128i _mm_srl_epi64 (__m128i a, __m128i count) pure @safe
        {
            long2 r = void;
            long2 sa = cast(long2)a;
            long2 lc = cast(long2)count;
            // Only the low 64-bit lane of `count` selects the shift amount.
            int bits = cast(int)(lc.array[0]);
            foreach(i; 0..2)
                r.array[i] = cast(ulong)(sa.array[i]) >> bits;
            return cast(__m128i)r;
        }
    }
}
unittest
{
    __m128i A = _mm_setr_epi64(8, -4);
    long2 B = cast(long2) _mm_srl_epi64(A, _mm_cvtsi32_si128(1));
    long[2] expectedB = [ 4, 0x7FFFFFFFFFFFFFFE];
    assert(B.array == expectedB);
}
3274 
/// Shift packed 16-bit integers in `a` right by immediate `imm8` while shifting in zeros.
version(LDC)
{
    alias _mm_srli_epi16 = __builtin_ia32_psrlwi128;
}
else
{
    static if (GDC_with_SSE2)
    {
        alias _mm_srli_epi16 = __builtin_ia32_psrlwi128;
    }
    else
    {
        // Portable fallback: per-lane logical (unsigned) right shift.
        __m128i _mm_srli_epi16 (__m128i a, int imm8) pure @safe
        {
            short8 sa = cast(short8)a;
            short8 r = void;
            foreach(i; 0..8)
                r.array[i] = cast(short)(cast(ushort)(sa.array[i]) >> imm8);
            return cast(int4)r;
        }
    }
}
unittest
{
    __m128i A = _mm_setr_epi16(0, 1, 2, 3, -4, -5, 6, 7);
    short8 B = cast(short8)( _mm_srli_epi16(A, 1) );
    short[8] expectedB = [ 0, 0, 1, 1, 0x7FFE, 0x7FFD, 3, 3 ];
    assert(B.array == expectedB);
}
3304 
/// Shift packed 32-bit integers in `a` right by immediate `imm8` while shifting in zeros.
version(LDC)
{
    alias _mm_srli_epi32  = __builtin_ia32_psrldi128;
}
else
{
    static if (GDC_with_SSE2)
    {
        alias _mm_srli_epi32  = __builtin_ia32_psrldi128;
    }
    else
    {
        // Portable fallback: per-lane logical (unsigned) right shift.
        __m128i _mm_srli_epi32 (__m128i a, int imm8) pure @safe
        {
            int4 r = void;
            foreach(i; 0..4)
                r.array[i] = cast(uint)(a.array[i]) >> imm8;
            return r;
        }
    }
}
unittest
{
    __m128i A = _mm_setr_epi32(0, 2, 3, -4);
    __m128i B = _mm_srli_epi32(A, 1);
    int[4] expectedB = [ 0, 1, 1, 0x7FFFFFFE];
    assert(B.array == expectedB);
}
3333 
/// Shift packed 64-bit integers in `a` right by immediate `imm8` while shifting in zeros.
version(LDC)
{
    alias _mm_srli_epi64  = __builtin_ia32_psrlqi128;
}
else
{
    static if (GDC_with_SSE2)
    {
        alias _mm_srli_epi64  = __builtin_ia32_psrlqi128;
    }
    else
    {
        // Portable fallback: per-lane logical (unsigned) right shift.
        __m128i _mm_srli_epi64 (__m128i a, int imm8) pure @safe
        {
            long2 r = void;
            long2 sa = cast(long2)a;
            foreach(i; 0..2)
                r.array[i] = cast(ulong)(sa.array[i]) >> imm8;
            return cast(__m128i)r;
        }
    }
}
unittest
{
    __m128i A = _mm_setr_epi64(8, -4);
    long2 B = cast(long2) _mm_srli_epi64(A, 1);
    long[2] expectedB = [ 4, 0x7FFFFFFFFFFFFFFE];
    assert(B.array == expectedB);
}
3363 
/// Shift `v` right by `bytes` bytes while shifting in zeros.
__m128i _mm_srli_si128(ubyte bytes)(__m128i v) pure @safe
{
    static if (bytes & 0xF0)
    {
        // Shifting by 16 bytes or more always yields zero.
        return _mm_setzero_si128();
    }
    else
    {
        static if (GDC_with_SSE2)
        {
            // GDC's builtin takes the shift amount in bits, hence the * 8.
            return cast(__m128i) __builtin_ia32_psrldqi128(v, cast(ubyte)(bytes * 8));
        }
        else static if (DMD_with_32bit_asm)
        {
            asm pure nothrow @nogc @trusted
            {
                movdqu XMM0, v;
                psrldq XMM0, bytes;
                movdqu v, XMM0;
            }
            return v;
        }
        else
        {
            // Select bytes [bytes .. bytes+15] of the concatenation (v, zero).
            return cast(__m128i) shufflevector!(byte16,
                                                bytes+0, bytes+1, bytes+2, bytes+3, bytes+4, bytes+5, bytes+6, bytes+7,
                                                bytes+8, bytes+9, bytes+10, bytes+11, bytes+12, bytes+13, bytes+14, bytes+15)
                                               (cast(byte16) v, cast(byte16)_mm_setzero_si128());
        }
    }

}

unittest
{
    __m128i R = _mm_srli_si128!4(_mm_set_epi32(4, 3, 2, 1));
    int[4] correct = [2, 3, 4, 0];
    assert(R.array == correct);
}
3404 
/// Shift `v` right by `bytes` bytes while shifting in zeros.
/// #BONUS
__m128 _mm_srli_ps(ubyte bytes)(__m128 v) pure @safe
{
    __m128i asInt = cast(__m128i)v;
    __m128i shifted = _mm_srli_si128!bytes(asInt);
    return cast(__m128)shifted;
}
unittest
{
    __m128 result = _mm_srli_ps!8(_mm_set_ps(4.0f, 3.0f, 2.0f, 1.0f));
    float[4] expected = [3.0f, 4.0f, 0, 0];
    assert(result.array == expected);
}
3417 
/// Shift `v` right by `bytes` bytes while shifting in zeros.
/// #BONUS
__m128d _mm_srli_pd(ubyte bytes)(__m128d v) pure @safe
{
    __m128i shifted = _mm_srli_si128!bytes(cast(__m128i)v);
    return cast(__m128d)shifted;
}
3424 
/// Store 128 bits (2 doubles) from `a` into memory.
/// `mem_addr` must be 16-byte aligned.
void _mm_store_pd (double* mem_addr, __m128d a) pure @trusted
{
    __m128d* dest = cast(__m128d*)mem_addr;
    *dest = a;
}

/// Store the lower double of `a` into both slots of 16-byte-aligned memory.
void _mm_store_pd1 (double* mem_addr, __m128d a) pure @trusted
{
    double lo = a.array[0];
    __m128d pair;
    pair.ptr[0] = lo;
    pair.ptr[1] = lo;
    *cast(__m128d*)mem_addr = pair;
}

/// Store the lower double of `a` into memory.
void _mm_store_sd (double* mem_addr, __m128d a) pure @safe
{
    double lo = a.array[0];
    *mem_addr = lo;
}

/// Store 128 bits of integer data into memory.
/// `mem_addr` must be 16-byte aligned.
void _mm_store_si128 (__m128i* mem_addr, __m128i a) pure @safe
{
    *mem_addr = a;
}
3449 
/// Same as `_mm_store_pd1`.
alias _mm_store1_pd = _mm_store_pd1;

/// Store the upper double of `a` into memory.
void _mm_storeh_pd (double* mem_addr, __m128d a) pure @safe
{
    double hi = a.array[1];
    *mem_addr = hi;
}
3456 
// Note: `mem_addr` doesn't have to actually be aligned, which breaks
// expectations from the user point of view. This problem also exists in C++.
/// Store the low 64-bit integer lane of `a` into memory.
void _mm_storel_epi64 (__m128i* mem_addr, __m128i a) pure @safe
{
    long* dest = cast(long*)mem_addr;
    long2 la = cast(long2)a;
    *dest = la.array[0];
}
unittest
{
    long[3] A = [1, 2, 3];
    _mm_storel_epi64(cast(__m128i*)(&A[1]), _mm_set_epi64x(0x1_0000_0000, 0x1_0000_0000));
    long[3] correct = [1, 0x1_0000_0000, 3];
    assert(A == correct);
}
3472 
/// Store the lower double-precision element of `a` into memory.
void _mm_storel_pd (double* mem_addr, __m128d a) pure @safe
{
    *mem_addr = a.array[0];
}

/// Store the two double-precision elements of `a` into memory in reverse order.
/// `mem_addr` must be 16-byte aligned.
void _mm_storer_pd (double* mem_addr, __m128d a) pure
{
    __m128d* aligned = cast(__m128d*)mem_addr;
    *aligned = shufflevector!(double2, 1, 0)(a, a);
}

/// Store 128 bits (2 doubles) from `a` into memory; `mem_addr` need not be aligned.
void _mm_storeu_pd (double* mem_addr, __m128d a) pure @safe
{
    storeUnaligned!double2(a, mem_addr);
}

/// Store 128 bits of integer data from `a` into memory; `mem_addr` need not be aligned.
void _mm_storeu_si128 (__m128i* mem_addr, __m128i a) pure @safe
{
    storeUnaligned!__m128i(a, cast(int*)mem_addr);
}
3493 
/// Store 128-bits (composed of 2 packed double-precision (64-bit) floating-point elements)
/// from a into memory using a non-temporal memory hint. mem_addr must be aligned on a 16-byte
/// boundary or a general-protection exception may be generated.
void _mm_stream_pd (double* mem_addr, __m128d a)
{
    // BUG see `_mm_stream_ps` for an explanation why we don't implement non-temporal moves
    // Currently implemented as a regular aligned store.
    __m128d* dest = cast(__m128d*)mem_addr;
    *dest = a;
}

/// Store 128-bits of integer data from a into memory using a non-temporal memory hint.
/// mem_addr must be aligned on a 16-byte boundary or a general-protection exception
/// may be generated.
void _mm_stream_si128 (__m128i* mem_addr, __m128i a)
{
    // BUG see `_mm_stream_ps` for an explanation why we don't implement non-temporal moves
    // Currently implemented as a regular aligned store.
    __m128i* dest = cast(__m128i*)mem_addr;
    *dest = a;
}

/// Store 32-bit integer a into memory using a non-temporal hint to minimize cache
/// pollution. If the cache line containing address mem_addr is already in the cache,
/// the cache will be updated.
void _mm_stream_si32 (int* mem_addr, int a)
{
    // BUG see `_mm_stream_ps` for an explanation why we don't implement non-temporal moves
    *mem_addr = a;
}

/// Store 64-bit integer a into memory using a non-temporal hint to minimize
/// cache pollution. If the cache line containing address mem_addr is already
/// in the cache, the cache will be updated.
void _mm_stream_si64 (long* mem_addr, long a)
{
    // BUG See `_mm_stream_ps` for an explanation why we don't implement non-temporal moves
    *mem_addr = a;
}
3531 
/// Subtract packed 16-bit integers in `b` from packed 16-bit integers in `a`.
__m128i _mm_sub_epi16(__m128i a, __m128i b) pure @safe
{
    short8 minuend = cast(short8)a;
    short8 subtrahend = cast(short8)b;
    return cast(__m128i)(minuend - subtrahend);
}

/// Subtract packed 32-bit integers in `b` from packed 32-bit integers in `a`.
__m128i _mm_sub_epi32(__m128i a, __m128i b) pure @safe
{
    int4 minuend = cast(int4)a;
    int4 subtrahend = cast(int4)b;
    return cast(__m128i)(minuend - subtrahend);
}

/// Subtract packed 64-bit integers in `b` from packed 64-bit integers in `a`.
__m128i _mm_sub_epi64(__m128i a, __m128i b) pure @safe
{
    long2 minuend = cast(long2)a;
    long2 subtrahend = cast(long2)b;
    return cast(__m128i)(minuend - subtrahend);
}

/// Subtract packed 8-bit integers in `b` from packed 8-bit integers in `a`.
__m128i _mm_sub_epi8(__m128i a, __m128i b) pure @safe
{
    byte16 minuend = cast(byte16)a;
    byte16 subtrahend = cast(byte16)b;
    return cast(__m128i)(minuend - subtrahend);
}

/// Subtract packed double-precision elements in `b` from those in `a`.
__m128d _mm_sub_pd(__m128d a, __m128d b) pure @safe
{
    __m128d difference = a - b;
    return difference;
}
3556 
/// Subtract the lower double-precision element of `b` from the lower element of `a`;
/// the upper element of `a` passes through unchanged.
version(DigitalMars)
{
    // Work-around for https://issues.dlang.org/show_bug.cgi?id=19599
    __m128d _mm_sub_sd(__m128d a, __m128d b) pure @safe
    {
        asm pure nothrow @nogc @trusted { nop;}
        a[0] = a[0] - b[0];
        return a;
    }
}
else static if (GDC_with_SSE2)
{
    alias _mm_sub_sd = __builtin_ia32_subsd;
}
else
{
    __m128d _mm_sub_sd(__m128d a, __m128d b) pure @safe
    {
        a.array[0] -= b.array[0];
        return a;
    }
}
unittest
{
    __m128d a = [1.5, -2.0];
    a = _mm_sub_sd(a, a);
    assert(a.array == [0.0, -2.0]);
}
3585 
/// Subtract 64-bit integer `b` from 64-bit integer `a`.
__m64 _mm_sub_si64 (__m64 a, __m64 b) pure @safe
{
    __m64 difference = a - b;
    return difference;
}
3590 
/// Subtract packed 16-bit integers in `b` from `a` using signed saturation.
version(LDC)
{
    static if (__VERSION__ >= 2085) // saturation x86 intrinsics disappeared in LLVM 8
    {
        // Generates PSUBSW since LDC 1.15 -O0
        __m128i _mm_subs_epi16(__m128i a, __m128i b) pure @trusted
        {
            enum prefix = `declare <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> %a, <8 x i16> %b)`;
            enum ir = `
                %r = call <8 x i16> @llvm.ssub.sat.v8i16( <8 x i16> %0, <8 x i16> %1)
                ret <8 x i16> %r`;
            return cast(__m128i) LDCInlineIREx!(prefix, ir, "", short8, short8, short8)(cast(short8)a, cast(short8)b);
        }
    }
    else
        alias _mm_subs_epi16 = __builtin_ia32_psubsw128;
}
else
{
    static if (GDC_with_SSE2)
    {
        alias _mm_subs_epi16 = __builtin_ia32_psubsw128;
    }
    else
    {
        // Portable fallback: widen to int, subtract, then saturate to short range.
        __m128i _mm_subs_epi16(__m128i a, __m128i b) pure @trusted
        {
            short[8] res;
            short8 sa = cast(short8)a;
            short8 sb = cast(short8)b;
            foreach(i; 0..8)
                res[i] = saturateSignedIntToSignedShort(sa.array[i] - sb.array[i]);
            return _mm_loadu_si128(cast(int4*)res.ptr);
        }
    }
}
unittest
{
    short8 res = cast(short8) _mm_subs_epi16(_mm_setr_epi16(32760, -32760, 5, 4, 3, 2, 1, 0),
                                             _mm_setr_epi16(-10  ,     16, 5, 4, 3, 2, 1, 0));
    static immutable short[8] correctResult =              [32767, -32768, 0, 0, 0, 0, 0, 0];
    assert(res.array == correctResult);
}
3634 
/// Subtract packed 8-bit integers in `b` from `a` using signed saturation.
version(LDC)
{
    static if (__VERSION__ >= 2085) // saturation x86 intrinsics disappeared in LLVM 8
    {
        // Generates PSUBSB since LDC 1.15 -O0
        __m128i _mm_subs_epi8(__m128i a, __m128i b) pure @trusted
        {
            enum prefix = `declare <16 x i8> @llvm.ssub.sat.v16i8(<16 x i8> %a, <16 x i8> %b)`;
            enum ir = `
                %r = call <16 x i8> @llvm.ssub.sat.v16i8( <16 x i8> %0, <16 x i8> %1)
                ret <16 x i8> %r`;
            return cast(__m128i) LDCInlineIREx!(prefix, ir, "", byte16, byte16, byte16)(cast(byte16)a, cast(byte16)b);
        }
    }
    else
        alias _mm_subs_epi8 = __builtin_ia32_psubsb128;
}
else
{
    static if (GDC_with_SSE2)
    {
        alias _mm_subs_epi8 = __builtin_ia32_psubsb128;
    }
    else
    {
        // Portable fallback: widen, subtract, then saturate to byte range.
        __m128i _mm_subs_epi8(__m128i a, __m128i b) pure @trusted
        {
            byte[16] res;
            byte16 sa = cast(byte16)a;
            byte16 sb = cast(byte16)b;
            foreach(i; 0..16)
                res[i] = saturateSignedWordToSignedByte(sa.array[i] - sb.array[i]);
            return _mm_loadu_si128(cast(int4*)res.ptr);
        }
    }
}
unittest
{
    byte16 res = cast(byte16) _mm_subs_epi8(_mm_setr_epi8(-128, 127, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0),
                                            _mm_setr_epi8(  15, -14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0));
    static immutable byte[16] correctResult            = [-128, 127,  0,  0,  0,  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0];
    assert(res.array == correctResult);
}
3678 
/// Subtract packed unsigned 16-bit integers in `b` from `a` using unsigned saturation.
version(LDC)
{
    static if (__VERSION__ >= 2085) // saturation x86 intrinsics disappeared in LLVM 8
    {
        // Generates PSUBUSW since LDC 1.15 -O0
        __m128i _mm_subs_epu16(__m128i a, __m128i b) pure @trusted
        {
            enum prefix = `declare <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> %a, <8 x i16> %b)`;
            enum ir = `
                %r = call <8 x i16> @llvm.usub.sat.v8i16( <8 x i16> %0, <8 x i16> %1)
                ret <8 x i16> %r`;
            return cast(__m128i) LDCInlineIREx!(prefix, ir, "", short8, short8, short8)(cast(short8)a, cast(short8)b);
        }
    }
    else
        alias _mm_subs_epu16 = __builtin_ia32_psubusw128;
}
else
{
    static if (GDC_with_SSE2)
    {
        alias _mm_subs_epu16 = __builtin_ia32_psubusw128;
    }
    else
    {
        // Portable fallback: subtract as unsigned, clamp negative results to 0.
        __m128i _mm_subs_epu16(__m128i a, __m128i b) pure @trusted
        {
            short[8] res;
            short8 sa = cast(short8)a;
            short8 sb = cast(short8)b;
            foreach(i; 0..8)
            {
                int sum = cast(ushort)(sa.array[i]) - cast(ushort)(sb.array[i]);
                res[i] = saturateSignedIntToUnsignedShort(sum);
            }
            return _mm_loadu_si128(cast(int4*)res.ptr);
        }
    }
}
unittest
{
    // Lane 0 saturates to 0 (65534 - 65535 < 0); other lanes subtract normally.
    // (A previously-declared local `A` was unused and has been removed.)
    short8 R = cast(short8) _mm_subs_epu16(_mm_setr_epi16(cast(short)65534,  1, 5, 4, 3, 2, 1, 0),
                                           _mm_setr_epi16(cast(short)65535, 16, 4, 4, 3, 0, 1, 0));
    static immutable short[8] correct =                  [               0,  0, 1, 0, 0, 2, 0, 0];
    assert(R.array == correct);
}
3726 
/// Subtract packed unsigned 8-bit integers in `b` from `a` using unsigned saturation.
version(LDC)
{
    static if (__VERSION__ >= 2085) // saturation x86 intrinsics disappeared in LLVM 8
    {
        // Generates PSUBUSB since LDC 1.15 -O0
        __m128i _mm_subs_epu8(__m128i a, __m128i b) pure @trusted
        {
            enum prefix = `declare <16 x i8> @llvm.usub.sat.v16i8(<16 x i8> %a, <16 x i8> %b)`;
            enum ir = `
                %r = call <16 x i8> @llvm.usub.sat.v16i8( <16 x i8> %0, <16 x i8> %1)
                ret <16 x i8> %r`;
            return cast(__m128i) LDCInlineIREx!(prefix, ir, "", byte16, byte16, byte16)(cast(byte16)a, cast(byte16)b);
        }
    }
    else
        alias _mm_subs_epu8 = __builtin_ia32_psubusb128;
}
else
{
    static if (GDC_with_SSE2)
    {
        alias _mm_subs_epu8 = __builtin_ia32_psubusb128;
    }
    else
    {
        // Portable fallback: subtract as unsigned, clamp negative results to 0.
        __m128i _mm_subs_epu8(__m128i a, __m128i b) pure @trusted
        {
            ubyte[16] res;
            byte16 sa = cast(byte16)a;
            byte16 sb = cast(byte16)b;
            foreach(i; 0..16)
                res[i] = saturateSignedWordToUnsignedByte(cast(ubyte)(sa.array[i]) - cast(ubyte)(sb.array[i]));
            return _mm_loadu_si128(cast(int4*)res.ptr);
        }
    }
}
unittest
{
    byte16 res = cast(byte16) _mm_subs_epu8(_mm_setr_epi8(cast(byte)254, 127, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0),
                                            _mm_setr_epi8(cast(byte)255, 120, 14, 42, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0));
    static immutable byte[16] correctResult =            [            0,   7,  0,  0,  0,  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0];
    assert(res.array == correctResult);
}
3770 
// Note: the only difference between these intrinsics and their `_mm_comi*_sd`
//       counterparts is the signalling behaviour on quiet NaNs. Aliasing them
//       is technically incorrect, but the case where you would want to
//       differentiate between qNaN and sNaN and then treat them differently
//       on purpose seems extremely rare.
alias _mm_ucomieq_sd = _mm_comieq_sd;
alias _mm_ucomige_sd = _mm_comige_sd;
alias _mm_ucomigt_sd = _mm_comigt_sd;
alias _mm_ucomile_sd = _mm_comile_sd;
alias _mm_ucomilt_sd = _mm_comilt_sd;
alias _mm_ucomineq_sd = _mm_comineq_sd;
3781 
/// Return a `__m128d` with undefined contents.
__m128d _mm_undefined_pd() pure @safe
{
    __m128d r = void;
    return r;
}

/// Return a `__m128i` with undefined contents.
__m128i _mm_undefined_si128() pure @safe
{
    __m128i r = void;
    return r;
}
3792 
/// Interleave the upper four 16-bit lanes of `a` and `b`:
/// result = [a4, b4, a5, b5, a6, b6, a7, b7].
__m128i _mm_unpackhi_epi16 (__m128i a, __m128i b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_punpckhwd128(a, b);
    }
    else static if (DMD_with_32bit_asm)
    {
        asm pure nothrow @nogc @trusted
        {
            movdqu XMM0, a;
            movdqu XMM1, b;
            punpckhwd XMM0, XMM1;
            movdqu a, XMM0;
        }
        return a;
    }
    else
    {
        return cast(__m128i) shufflevector!(short8, 4, 12, 5, 13, 6, 14, 7, 15)
                                           (cast(short8)a, cast(short8)b);
    }
}
unittest
{
    __m128i A = _mm_setr_epi16(4,   5,  6,  7,  8,  9, 10, 11);
    __m128i B = _mm_setr_epi16(12, 13, 14, 15, 16, 17, 18, 19);
    short8 C = cast(short8)(_mm_unpackhi_epi16(A, B));
    short[8] correct = [8, 16, 9, 17, 10, 18, 11, 19];
    assert(C.array == correct);
}
3824 
/// Interleave the upper two 32-bit lanes of `a` and `b`:
/// result = [a2, b2, a3, b3].
__m128i _mm_unpackhi_epi32 (__m128i a, __m128i b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_punpckhdq128(a, b);
    }
    else
    {
        return shufflevector!(int4, 2, 6, 3, 7)(cast(int4)a, cast(int4)b);
    }
}
3836 
/// Interleave the upper 64-bit lanes of `a` and `b`:
/// result = [a.hi64, b.hi64].
__m128i _mm_unpackhi_epi64 (__m128i a, __m128i b) pure @trusted
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_punpckhqdq128(a, b);
    }
    else
    {
        // Overwrite the low 64 bits of b (int lanes 0..1) with the high
        // 64 bits of a (int lanes 2..3).
        __m128i r = cast(__m128i)b;
        r[0] = a[2];
        r[1] = a[3];
        return r; 
    }
}
unittest // Issue #36
{
    __m128i A = _mm_setr_epi64(0x22222222_22222222, 0x33333333_33333333);
    __m128i B = _mm_setr_epi64(0x44444444_44444444, 0x55555555_55555555);
    long2 C = cast(long2)(_mm_unpackhi_epi64(A, B));
    long[2] correct = [0x33333333_33333333, 0x55555555_55555555];
    assert(C.array == correct);
}
3859 
/// Interleave the upper eight 8-bit lanes of `a` and `b`:
/// result = [a8, b8, a9, b9, ..., a15, b15].
__m128i _mm_unpackhi_epi8 (__m128i a, __m128i b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_punpckhbw128(a, b);
    }
    else
    {
        return cast(__m128i)shufflevector!(byte16, 8,  24,  9, 25, 10, 26, 11, 27,
                                                   12, 28, 13, 29, 14, 30, 15, 31)
                                                   (cast(byte16)a, cast(byte16)b);
    }
}

/// Interleave the upper double-precision elements of `a` and `b`:
/// result = [a1, b1].
__m128d _mm_unpackhi_pd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_unpckhpd(a, b);
    }
    else
    {
        return shufflevector!(__m128d, 1, 3)(a, b);
    }
}
3885 
/// Interleave the lower four 16-bit lanes of `a` and `b`:
/// result = [a0, b0, a1, b1, a2, b2, a3, b3].
__m128i _mm_unpacklo_epi16 (__m128i a, __m128i b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_punpcklwd128(a, b);
    }
    else
    {
        return cast(__m128i) shufflevector!(short8, 0, 8, 1, 9, 2, 10, 3, 11)
                                           (cast(short8)a, cast(short8)b);
    }
}

/// Interleave the lower two 32-bit lanes of `a` and `b`:
/// result = [a0, b0, a1, b1].
__m128i _mm_unpacklo_epi32 (__m128i a, __m128i b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_punpckldq128(a, b);
    }
    else
    {
        return shufflevector!(int4, 0, 4, 1, 5)
                             (cast(int4)a, cast(int4)b);
    }
}
3911 
/// Interleave the lower 64-bit lanes of `a` and `b`:
/// result = [a.lo64, b.lo64].
__m128i _mm_unpacklo_epi64 (__m128i a, __m128i b) pure @trusted
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_punpcklqdq128(a, b);
    }
    else
    {
        long2 lA = cast(long2)a;
        long2 lB = cast(long2)b;
        long2 R;
        R.ptr[0] = lA.array[0];
        R.ptr[1] = lB.array[0];
        return cast(__m128i)R;        
    }
}
unittest // Issue #36
{
    __m128i A = _mm_setr_epi64(0x22222222_22222222, 0x33333333_33333333);
    __m128i B = _mm_setr_epi64(0x44444444_44444444, 0x55555555_55555555);
    long2 C = cast(long2)(_mm_unpacklo_epi64(A, B));
    long[2] correct = [0x22222222_22222222, 0x44444444_44444444];
    assert(C.array == correct);
}
3936 
3937 
/// Interleave the lower eight 8-bit lanes of `a` and `b`:
/// result = [a0, b0, a1, b1, ..., a7, b7].
__m128i _mm_unpacklo_epi8 (__m128i a, __m128i b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_punpcklbw128(a, b);
    }
    else
    {
        return cast(__m128i) shufflevector!(byte16, 0, 16, 1, 17, 2, 18, 3, 19,
                                                    4, 20, 5, 21, 6, 22, 7, 23)
                                           (cast(byte16)a, cast(byte16)b);
    }
}

/// Interleave the lower double-precision elements of `a` and `b`:
/// result = [a0, b0].
__m128d _mm_unpacklo_pd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_unpcklpd(a, b);
    }
    else
    {
        return shufflevector!(__m128d, 0, 2)(a, b);
    }
}
3963 
/// Compute the bitwise XOR of the 128 bits in `a` and `b`, as doubles.
__m128d _mm_xor_pd (__m128d a, __m128d b) pure @safe
{
    __m128i ia = cast(__m128i)a;
    __m128i ib = cast(__m128i)b;
    return cast(__m128d)(ia ^ ib);
}

/// Compute the bitwise XOR of the 128 bits in `a` and `b`.
__m128i _mm_xor_si128 (__m128i a, __m128i b) pure @safe
{
    __m128i result = a ^ b;
    return result;
}
3973 
unittest
{
    // Euclidean distance between two points in 4D, built from SSE intrinsics.
    float distance(float[4] a, float[4] b) nothrow @nogc
    {
        __m128 pa = _mm_loadu_ps(a.ptr);
        __m128 pb = _mm_loadu_ps(b.ptr);
        __m128 d = _mm_sub_ps(pa, pb);
        d = _mm_mul_ps(d, d);                           // squared differences
        __m128 partial = _mm_add_ps(d, _mm_srli_ps!8(d));
        partial = _mm_add_ps(partial, _mm_srli_ps!4(partial)); // horizontal sum in lane 0
        return _mm_cvtss_f32(_mm_sqrt_ss(partial));
    }
    assert(distance([0, 2, 0, 0], [0, 0, 0, 0]) == 2);
}