1 /**
2 * Copyright: Copyright Auburn Sounds 2016-2019.
3 * License:   $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
4 * Authors:   Guillaume Piolat
5 */
6 module inteli.emmintrin;
7 
8 public import inteli.types;
9 public import inteli.xmmintrin; // SSE2 includes SSE1
10 import inteli.mmx;
11 import inteli.internals;
12 
13 nothrow @nogc:
14 
15 // SSE2 instructions
16 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#techs=SSE2
17 
/// Add packed 16-bit integers in `a` and `b`.
__m128i _mm_add_epi16 (__m128i a, __m128i b) pure @safe
{
    short8 sa = cast(short8)a;
    short8 sb = cast(short8)b;
    return cast(__m128i)(sa + sb);
}
22 
/// Add packed 32-bit integers in `a` and `b`.
__m128i _mm_add_epi32 (__m128i a, __m128i b) pure @safe
{
    int4 ia = cast(int4)a;
    int4 ib = cast(int4)b;
    return cast(__m128i)(ia + ib);
}
27 
/// Add packed 64-bit integers in `a` and `b`.
__m128i _mm_add_epi64 (__m128i a, __m128i b) pure @safe
{
    long2 la = cast(long2)a;
    long2 lb = cast(long2)b;
    return cast(__m128i)(la + lb);
}
32 
/// Add packed 8-bit integers in `a` and `b`.
__m128i _mm_add_epi8 (__m128i a, __m128i b) pure @safe
{
    byte16 ba = cast(byte16)a;
    byte16 bb = cast(byte16)b;
    return cast(__m128i)(ba + bb);
}
37 
version(DigitalMars)
{
    // Work-around for https://issues.dlang.org/show_bug.cgi?id=19599
    /// Add the lower double-precision (64-bit) element of `a` and `b`,
    /// store it in the lower element of the result, and copy the upper
    /// element from `a` to the upper element of the result.
    __m128d _mm_add_sd(__m128d a, __m128d b) pure @safe
    {
        // The lack of inlining is part of the bug work-around above:
        // keep it until the DMD issue is resolved.
        pragma(inline, false);
        a[0] = a[0] + b[0];
        return a;
    }
}
else
{
    /// ditto
    __m128d _mm_add_sd(__m128d a, __m128d b) pure @safe
    {
        a[0] += b[0];
        return a;
    }
}
unittest
{
    __m128d a = [1.5, -2.0];
    a = _mm_add_sd(a, a);
    assert(a.array == [3.0, -2.0]); // upper lane untouched
}
62 
63 
/// Add packed double-precision (64-bit) floating-point elements in `a` and `b`.
__m128d _mm_add_pd (__m128d a, __m128d b) pure @safe
{
    return a + b;
}
unittest
{
    __m128d a = [1.5, -2.0];
    a = _mm_add_pd(a, a);
    assert(a.array == [3.0, -4.0]);
}
74 
75 // TODO: _mm_add_si64
76 
version(LDC)
{
    /// Add packed signed 16-bit integers in `a` and `b` using saturation.
    alias _mm_adds_epi16 = __builtin_ia32_paddsw128;
}
else
{
    /// ditto
    __m128i _mm_adds_epi16(__m128i a, __m128i b) pure @trusted
    {
        short[8] res;
        short8 sa = cast(short8)a;
        short8 sb = cast(short8)b;
        foreach(i; 0..8)
            res[i] = saturateSignedIntToSignedShort(sa.array[i] + sb.array[i]);
        return _mm_loadu_si128(cast(int4*)res.ptr);
    }
}
unittest
{
    short8 res = cast(short8) _mm_adds_epi16(_mm_set_epi16(7, 6, 5, 4, 3, 2, 1, 0),
                                             _mm_set_epi16(7, 6, 5, 4, 3, 2, 1, 0));
    static immutable short[8] correctResult = [0, 2, 4, 6, 8, 10, 12, 14];
    assert(res.array == correctResult);
}
100 
version(LDC)
{
    /// Add packed signed 8-bit integers in `a` and `b` using saturation.
    alias _mm_adds_epi8 = __builtin_ia32_paddsb128;
}
else
{
    /// ditto
    __m128i _mm_adds_epi8(__m128i a, __m128i b) pure @trusted
    {
        byte[16] res;
        byte16 sa = cast(byte16)a;
        byte16 sb = cast(byte16)b;
        foreach(i; 0..16)
            res[i] = saturateSignedWordToSignedByte(sa.array[i] + sb.array[i]);
        return _mm_loadu_si128(cast(int4*)res.ptr);
    }
}
unittest
{
    byte16 res = cast(byte16) _mm_adds_epi8(_mm_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0),
                                            _mm_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0));
    static immutable byte[16] correctResult = [0, 2, 4, 6, 8, 10, 12, 14,
                                               16, 18, 20, 22, 24, 26, 28, 30];
    assert(res.array == correctResult);
}
125 
version(LDC)
{
    /// Add packed unsigned 8-bit integers in `a` and `b` using saturation.
    alias _mm_adds_epu8 = __builtin_ia32_paddusb128;
}
else
{
    /// ditto
    __m128i _mm_adds_epu8(__m128i a, __m128i b) pure @trusted
    {
        ubyte[16] res;
        byte16 sa = cast(byte16)a;
        byte16 sb = cast(byte16)b;
        foreach(i; 0..16)
            res[i] = saturateSignedWordToUnsignedByte(cast(ubyte)(sa.array[i]) + cast(ubyte)(sb.array[i]));
        return _mm_loadu_si128(cast(int4*)res.ptr);
    }
}
unittest
{
    // 200 + 100 saturates to 255 in every lane (stored as byte -1)
    byte16 res = cast(byte16) _mm_adds_epu8(_mm_set1_epi8(cast(byte)200),
                                            _mm_set1_epi8(cast(byte)100));
    foreach(i; 0..16)
        assert(res.array[i] == cast(byte)255);
}
142 
version(LDC)
{
    /// Add packed unsigned 16-bit integers in `a` and `b` using saturation.
    alias _mm_adds_epu16 = __builtin_ia32_paddusw128;
}
else
{
    /// ditto
    __m128i _mm_adds_epu16(__m128i a, __m128i b) pure @trusted
    {
        ushort[8] res;
        short8 sa = cast(short8)a;
        short8 sb = cast(short8)b;
        foreach(i; 0..8)
            res[i] = saturateSignedIntToUnsignedShort(cast(ushort)(sa.array[i]) + cast(ushort)(sb.array[i]));
        return _mm_loadu_si128(cast(int4*)res.ptr);
    }
}
unittest
{
    // 60000 + 10000 saturates to 65535 in every lane (stored as short -1)
    short8 res = cast(short8) _mm_adds_epu16(_mm_set1_epi16(cast(short)60000),
                                             _mm_set1_epi16(cast(short)10000));
    foreach(i; 0..8)
        assert(res.array[i] == cast(short)65535);
}
159 
/// Compute the bitwise AND of packed double-precision (64-bit) elements in `a` and `b`.
__m128d _mm_and_pd (__m128d a, __m128d b) pure @safe
{
    return cast(__m128d)( cast(__m128i)a & cast(__m128i)b );
}

/// Compute the bitwise AND of 128 bits (representing integer data) in `a` and `b`.
__m128i _mm_and_si128 (__m128i a, __m128i b) pure @safe
{
    return a & b;
}
unittest
{
    __m128i A = _mm_set1_epi32(7);
    __m128i B = _mm_set1_epi32(14);
    __m128i R = _mm_and_si128(A, B);
    int[4] correct = [6, 6, 6, 6];
    assert(R.array == correct);
}

/// Compute the bitwise NOT of `a` and then AND with `b`, on packed
/// double-precision (64-bit) elements.
__m128d _mm_andnot_pd (__m128d a, __m128d b) pure @safe
{
    return cast(__m128d)( (~cast(__m128i)a) & cast(__m128i)b );
}

/// Compute the bitwise NOT of 128 bits in `a` and then AND with `b`.
__m128i _mm_andnot_si128 (__m128i a, __m128i b) pure @safe
{
    return (~a) & b;
}
unittest
{
    __m128i A = _mm_set1_epi32(7);
    __m128i B = _mm_set1_epi32(14);
    __m128i R = _mm_andnot_si128(A, B);
    int[4] correct = [8, 8, 8, 8];
    assert(R.array == correct);
}
195 
version(LDC)
{
    /// Average packed unsigned 16-bit integers in `a` and `b`
    /// (rounding up: `(a + b + 1) >> 1` in 32-bit precision).
    __m128i _mm_avg_epu16 (__m128i a, __m128i b) pure @safe
    {
        // Generates pavgw even in LDC 1.0, even in -O0
        enum ir = `
            %ia = zext <8 x i16> %0 to <8 x i32>
            %ib = zext <8 x i16> %1 to <8 x i32>
            %isum = add <8 x i32> %ia, %ib
            %isum1 = add <8 x i32> %isum, < i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
            %isums = lshr <8 x i32> %isum1, < i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
            %r = trunc <8 x i32> %isums to <8 x i16>
            ret <8 x i16> %r`;
        return cast(__m128i) LDCInlineIR!(ir, short8, short8, short8)(cast(short8)a, cast(short8)b);        
    }
}
else
{
    /// ditto
    __m128i _mm_avg_epu16 (__m128i a, __m128i b) pure @safe
    {
        short8 sa = cast(short8)a;
        short8 sb = cast(short8)b;
        short8 sr = void;
        foreach(i; 0..8)
        {
            // widen to int via ushort casts so the +1 cannot overflow 16 bits
            sr[i] = cast(ushort)( (cast(ushort)(sa[i]) + cast(ushort)(sb[i]) + 1) >> 1 );
        }
        return cast(int4)sr;
    }
}
unittest
{
    __m128i A = _mm_set1_epi16(31);
    __m128i B = _mm_set1_epi16(64);
    short8 avg = cast(short8)(_mm_avg_epu16(A, B));
    foreach(i; 0..8)
        assert(avg[i] == 48); // (31 + 64 + 1) >> 1
}
234 
version(LDC)
{
    /// Average packed unsigned 8-bit integers in `a` and `b`
    /// (rounding up: `(a + b + 1) >> 1` in 16-bit precision).
    __m128i _mm_avg_epu8 (__m128i a, __m128i b) pure @safe
    {
        // Generates pavgb even in LDC 1.0, even in -O0
        enum ir = `
            %ia = zext <16 x i8> %0 to <16 x i16>
            %ib = zext <16 x i8> %1 to <16 x i16>
            %isum = add <16 x i16> %ia, %ib
            %isum1 = add <16 x i16> %isum, < i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
            %isums = lshr <16 x i16> %isum1, < i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
            %r = trunc <16 x i16> %isums to <16 x i8>
            ret <16 x i8> %r`;
        return cast(__m128i) LDCInlineIR!(ir, byte16, byte16, byte16)(cast(byte16)a, cast(byte16)b);        
    }
}
else
{
    /// ditto
    /// Note: `pure @safe` added for consistency with the LDC overload above
    /// and with `_mm_avg_epu16`; the body only does vector arithmetic.
    __m128i _mm_avg_epu8 (__m128i a, __m128i b) pure @safe
    {
        byte16 sa = cast(byte16)a;
        byte16 sb = cast(byte16)b;
        byte16 sr = void;
        foreach(i; 0..16)
        {
            // widen via ubyte casts so the +1 cannot overflow 8 bits
            sr[i] = cast(ubyte)( (cast(ubyte)(sa[i]) + cast(ubyte)(sb[i]) + 1) >> 1 );
        }
        return cast(int4)sr;
    }
}
unittest
{
    __m128i A = _mm_set1_epi8(31);
    __m128i B = _mm_set1_epi8(64);
    byte16 avg = cast(byte16)(_mm_avg_epu8(A, B));
    foreach(i; 0..16)
        assert(avg[i] == 48); // (31 + 64 + 1) >> 1
}
273 
/// Shift `a` left by `bits` bytes while shifting in zeros.
/// Note: unlike Intel API, shift amount is a compile-time parameter.
__m128i _mm_bslli_si128(int bits)(__m128i a) pure @safe
{
    // Generates pslldq starting with LDC 1.1 -O2
    // Implemented as a byte shuffle of (zero, a): indices 16..31 select `a`,
    // indices below 16 select zero bytes.
    __m128i zero = _mm_setzero_si128();
    return cast(__m128i) 
        shufflevector!(byte16, 16 - bits, 17 - bits, 18 - bits, 19 - bits,
                               20 - bits, 21 - bits, 22 - bits, 23 - bits,
                               24 - bits, 25 - bits, 26 - bits, 27 - bits,
                               28 - bits, 29 - bits, 30 - bits, 31 - bits)
        (cast(byte16)zero, cast(byte16)a);
}
unittest
{
    __m128i toShift = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
    byte[16] exact =              [0, 0, 0, 0, 0, 0, 1, 2, 3, 4,  5,  6,  7,  8,  9, 10];
    __m128i result = _mm_bslli_si128!5(toShift);
    assert(  (cast(byte16)result).array == exact);
}
293 
/// Shift `a` right by `bits` bytes while shifting in zeros.
/// Note: unlike Intel API, shift amount is a compile-time parameter.
__m128i _mm_bsrli_si128(int bits)(__m128i a) pure @safe
{
    // Generates psrldq starting with LDC 1.1 -O2
    // Byte shuffle of (a, zero): indices past 15 select zero bytes.
    __m128i zero = _mm_setzero_si128();
    return  cast(__m128i) 
        shufflevector!(byte16, 0 + bits, 1 + bits, 2 + bits, 3 + bits,
                               4 + bits, 5 + bits, 6 + bits, 7 + bits,
                               8 + bits, 9 + bits, 10 + bits, 11 + bits,
                               12 + bits, 13 + bits, 14 + bits, 15 + bits)
        (cast(byte16)a, cast(byte16)zero);
}
unittest
{
    __m128i toShift = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
    byte[16] exact =               [5, 6, 7, 8, 9,10,11,12,13,14, 15,  0,  0,  0,  0,  0];
    __m128i result = _mm_bsrli_si128!5(toShift);
    assert( (cast(byte16)result).array == exact);
}
313 
/// Reinterpret `__m128d` as `__m128` (no instruction generated, bit pattern preserved).
__m128 _mm_castpd_ps (__m128d a) pure @safe
{
    return cast(__m128)a;
}

/// Reinterpret `__m128d` as `__m128i`.
__m128i _mm_castpd_si128 (__m128d a) pure @safe
{
    return cast(__m128i)a;
}

/// Reinterpret `__m128` as `__m128d`.
__m128d _mm_castps_pd (__m128 a) pure @safe
{
    return cast(__m128d)a;
}

/// Reinterpret `__m128` as `__m128i`.
__m128i _mm_castps_si128 (__m128 a) pure @safe
{
    return cast(__m128i)a;
}

/// Reinterpret `__m128i` as `__m128d`.
__m128d _mm_castsi128_pd (__m128i a) pure @safe
{
    return cast(__m128d)a;
}

/// Reinterpret `__m128i` as `__m128`.
__m128 _mm_castsi128_ps (__m128i a) pure @safe
{
    return cast(__m128)a;
}
343 
version(LDC)
{
    /// Invalidate and flush the cache line containing `p` from all levels
    /// of the cache hierarchy.
    alias _mm_clflush = __builtin_ia32_clflush;
}
else
{
    /// ditto
    void _mm_clflush (const(void)* p) pure @safe
    {
        version(D_InlineAsm_X86)
        {
            asm pure nothrow @nogc @safe
            {
                mov EAX, p;
                clflush [EAX];
            }
        }
        else version(D_InlineAsm_X86_64)
        {
            asm pure nothrow @nogc @safe
            {
                mov RAX, p;
                clflush [RAX];
            }
        }
        // NOTE(review): with neither inline-asm version available this is a
        // silent no-op — confirm that is the intended fallback.
    }
}
unittest
{
    ubyte[64] cacheline;
    _mm_clflush(cacheline.ptr);
}
375 
376 
/// Compare packed 16-bit integers in `a` and `b` for equality;
/// each result lane is all-ones (-1) on equality, zero otherwise.
__m128i _mm_cmpeq_epi16 (__m128i a, __m128i b) pure @safe
{
    return cast(__m128i) equalMask!short8(cast(short8)a, cast(short8)b);
}
unittest
{
    short8   A = [-3, -2, -1,  0,  0,  1,  2,  3];
    short8   B = [ 4,  3,  2,  1,  0, -1, -2, -3];
    short[8] E = [ 0,  0,  0,  0, -1,  0,  0,  0];
    short8   R = cast(short8)(_mm_cmpeq_epi16(cast(__m128i)A, cast(__m128i)B));
    assert(R.array == E);
}
389 
/// Compare packed 32-bit integers in `a` and `b` for equality;
/// each result lane is all-ones (-1) on equality, zero otherwise.
__m128i _mm_cmpeq_epi32 (__m128i a, __m128i b) pure @safe
{
    return equalMask!__m128i(a, b);
}
unittest
{
    int4   A = [-3, -2, -1,  0];
    int4   B = [ 4, -2,  2,  0];
    int[4] E = [ 0, -1,  0, -1];
    // Bug fix: this test previously called _mm_cmpeq_epi16 and only passed
    // by coincidence; it now exercises _mm_cmpeq_epi32 itself.
    int4   R = cast(int4)(_mm_cmpeq_epi32(A, B));
    assert(R.array == E);
}
402 
/// Compare packed 8-bit integers in `a` and `b` for equality;
/// each result lane is all-ones (-1) on equality, zero otherwise.
__m128i _mm_cmpeq_epi8 (__m128i a, __m128i b) pure @safe
{
    return cast(__m128i) equalMask!byte16(cast(byte16)a, cast(byte16)b);
}
unittest
{
    __m128i A = _mm_setr_epi8(1, 2, 3, 1, 2, 1, 1, 2, 3, 2, 1, 0, 0, 1, 2, 1);
    __m128i B = _mm_setr_epi8(2, 2, 1, 2, 3, 1, 2, 3, 2, 1, 0, 0, 1, 2, 1, 1);
    byte16 C = cast(byte16) _mm_cmpeq_epi8(A, B);
    byte[16] correct =       [0,-1, 0, 0, 0,-1, 0, 0, 0, 0, 0,-1, 0, 0, 0, -1];
    assert(C.array == correct);
}
415 
/// Compare packed doubles in `a` and `b` for equality (ordered: false on NaN).
__m128d _mm_cmpeq_pd (__m128d a, __m128d b) pure @safe
{
    return cast(__m128d) cmppd!(FPComparison.oeq)(a, b);
}

/// Compare the lower doubles for equality (ordered); upper lane copied from `a`.
__m128d _mm_cmpeq_sd (__m128d a, __m128d b) pure @safe
{
    return cast(__m128d) cmpsd!(FPComparison.oeq)(a, b);
}

/// Compare packed doubles in `a` and `b` for greater-or-equal (ordered).
__m128d _mm_cmpge_pd (__m128d a, __m128d b) pure @safe
{
    return cast(__m128d) cmppd!(FPComparison.oge)(a, b);
}

/// Compare the lower doubles for greater-or-equal (ordered); upper lane from `a`.
__m128d _mm_cmpge_sd (__m128d a, __m128d b) pure @safe
{
    return cast(__m128d) cmpsd!(FPComparison.oge)(a, b);
}
435 
/// Compare packed signed 16-bit integers in `a` and `b` for greater-than;
/// each result lane is all-ones (-1) when `a > b`, zero otherwise.
__m128i _mm_cmpgt_epi16 (__m128i a, __m128i b) pure @safe
{
    return cast(__m128i)( greaterMask!short8(cast(short8)a, cast(short8)b));
}
unittest
{
    short8   A = [-3, -2, -1,  0,  0,  1,  2,  3];
    short8   B = [ 4,  3,  2,  1,  0, -1, -2, -3];
    short[8] E = [ 0,  0,  0,  0,  0, -1, -1, -1];
    short8   R = cast(short8)(_mm_cmpgt_epi16(cast(__m128i)A, cast(__m128i)B));
    assert(R.array == E);
}
448 
/// Compare packed signed 32-bit integers in `a` and `b` for greater-than;
/// each result lane is all-ones (-1) when `a > b`, zero otherwise.
__m128i _mm_cmpgt_epi32 (__m128i a, __m128i b) pure @safe
{
    return cast(__m128i)( greaterMask!int4(a, b));
}
unittest
{
    int4   A = [-3,  2, -1,  0];
    int4   B = [ 4, -2,  2,  0];
    int[4] E = [ 0, -1,  0,  0];
    int4   R = cast(int4)(_mm_cmpgt_epi32(A, B));
    assert(R.array == E);
}
461 
/// Compare packed signed 8-bit integers in `a` and `b` for greater-than;
/// each result lane is all-ones (-1) when `a > b`, zero otherwise.
__m128i _mm_cmpgt_epi8 (__m128i a, __m128i b) pure @safe
{
    return cast(__m128i)( greaterMask!byte16(cast(byte16)a, cast(byte16)b));
}
unittest
{
    __m128i A = _mm_setr_epi8(1, 2, 3, 1, 2, 1, 1, 2, 3, 2, 1, 0, 0, 1, 2, 1);
    __m128i B = _mm_setr_epi8(2, 2, 1, 2, 3, 1, 2, 3, 2, 1, 0, 0, 1, 2, 1, 1);
    byte16 C = cast(byte16) _mm_cmpgt_epi8(A, B);
    byte[16] correct =       [0, 0,-1, 0, 0, 0, 0, 0,-1,-1,-1, 0, 0, 0,-1, 0];
    // (removed an unused call to _mm_cmpeq_epi8 that was dead code here)
    assert(C.array == correct);
}
475 
/// Compare packed doubles in `a` and `b` for greater-than (ordered: false on NaN).
__m128d _mm_cmpgt_pd (__m128d a, __m128d b) pure @safe
{
    return cast(__m128d) cmppd!(FPComparison.ogt)(a, b);
}

/// Compare the lower doubles for greater-than (ordered); upper lane from `a`.
__m128d _mm_cmpgt_sd (__m128d a, __m128d b) pure @safe
{
    return cast(__m128d) cmpsd!(FPComparison.ogt)(a, b);
}

/// Compare packed doubles in `a` and `b` for less-or-equal (ordered).
__m128d _mm_cmple_pd (__m128d a, __m128d b) pure @safe
{
    return cast(__m128d) cmppd!(FPComparison.ole)(a, b);
}

/// Compare the lower doubles for less-or-equal (ordered); upper lane from `a`.
__m128d _mm_cmple_sd (__m128d a, __m128d b) pure @safe
{
    return cast(__m128d) cmpsd!(FPComparison.ole)(a, b);
}
495 
/// Compare packed signed 16-bit integers for less-than (implemented as
/// greater-than with the operands swapped).
__m128i _mm_cmplt_epi16 (__m128i a, __m128i b) pure @safe
{
    return _mm_cmpgt_epi16(b, a);
}

/// Compare packed signed 32-bit integers for less-than.
__m128i _mm_cmplt_epi32 (__m128i a, __m128i b) pure @safe
{
    return _mm_cmpgt_epi32(b, a);
}

/// Compare packed signed 8-bit integers for less-than.
__m128i _mm_cmplt_epi8 (__m128i a, __m128i b) pure @safe
{
    return _mm_cmpgt_epi8(b, a);
}

/// Compare packed doubles in `a` and `b` for less-than (ordered: false on NaN).
__m128d _mm_cmplt_pd (__m128d a, __m128d b) pure @safe
{
    return cast(__m128d) cmppd!(FPComparison.olt)(a, b);
}

/// Compare the lower doubles for less-than (ordered); upper lane from `a`.
__m128d _mm_cmplt_sd (__m128d a, __m128d b) pure @safe
{
    return cast(__m128d) cmpsd!(FPComparison.olt)(a, b);
}
520 
/// Compare packed doubles for not-equal (unordered: true when either operand is NaN).
__m128d _mm_cmpneq_pd (__m128d a, __m128d b) pure @safe
{
    return cast(__m128d) cmppd!(FPComparison.une)(a, b);
}

/// Compare the lower doubles for not-equal (unordered); upper lane from `a`.
__m128d _mm_cmpneq_sd (__m128d a, __m128d b) pure @safe
{
    return cast(__m128d) cmpsd!(FPComparison.une)(a, b);
}

/// Compare packed doubles for not-greater-or-equal (i.e. unordered less-than).
__m128d _mm_cmpnge_pd (__m128d a, __m128d b) pure @safe
{
    return cast(__m128d) cmppd!(FPComparison.ult)(a, b);
}

/// ditto, lower lane only; upper lane from `a`.
__m128d _mm_cmpnge_sd (__m128d a, __m128d b) pure @safe
{
    return cast(__m128d) cmpsd!(FPComparison.ult)(a, b);
}

/// Compare packed doubles for not-greater-than (i.e. unordered less-or-equal).
__m128d _mm_cmpngt_pd (__m128d a, __m128d b) pure @safe
{
    return cast(__m128d) cmppd!(FPComparison.ule)(a, b);
}

/// ditto, lower lane only; upper lane from `a`.
__m128d _mm_cmpngt_sd (__m128d a, __m128d b) pure @safe
{
    return cast(__m128d) cmpsd!(FPComparison.ule)(a, b);
}

/// Compare packed doubles for not-less-or-equal (i.e. unordered greater-than).
__m128d _mm_cmpnle_pd (__m128d a, __m128d b) pure @safe
{
    return cast(__m128d) cmppd!(FPComparison.ugt)(a, b);
}

/// ditto, lower lane only; upper lane from `a`.
__m128d _mm_cmpnle_sd (__m128d a, __m128d b) pure @safe
{
    return cast(__m128d) cmpsd!(FPComparison.ugt)(a, b);
}

/// Compare packed doubles for not-less-than (i.e. unordered greater-or-equal).
__m128d _mm_cmpnlt_pd (__m128d a, __m128d b) pure @safe
{
    return cast(__m128d) cmppd!(FPComparison.uge)(a, b);
}

/// ditto, lower lane only; upper lane from `a`.
__m128d _mm_cmpnlt_sd (__m128d a, __m128d b) pure @safe
{
    return cast(__m128d) cmpsd!(FPComparison.uge)(a, b);
}

/// True (all-ones) where neither operand lane is NaN.
__m128d _mm_cmpord_pd (__m128d a, __m128d b) pure @safe
{
    return cast(__m128d) cmppd!(FPComparison.ord)(a, b);
}

/// ditto, lower lane only; upper lane from `a`.
__m128d _mm_cmpord_sd (__m128d a, __m128d b) pure @safe
{
    return cast(__m128d) cmpsd!(FPComparison.ord)(a, b);
}

/// True (all-ones) where at least one operand lane is NaN.
__m128d _mm_cmpunord_pd (__m128d a, __m128d b) pure @safe
{
    return cast(__m128d) cmppd!(FPComparison.uno)(a, b);
}

/// ditto, lower lane only; upper lane from `a`.
__m128d _mm_cmpunord_sd (__m128d a, __m128d b) pure @safe
{
    return cast(__m128d) cmpsd!(FPComparison.uno)(a, b);
}
590 
591 
// Note: we've reverted clang and GCC behaviour with regards to EFLAGS.
// Some of these comparisons yield true for NaNs, others don't.

/// Compare the lower doubles for equality; returns 1 on match.
int _mm_comieq_sd (__m128d a, __m128d b) pure @safe
{
    return comsd!(FPComparison.ueq)(a, b); // yields true for NaN, same as GCC
}

/// Compare the lower doubles for greater-or-equal; returns 1 when `a[0] >= b[0]`.
int _mm_comige_sd (__m128d a, __m128d b) pure @safe
{
    return comsd!(FPComparison.oge)(a, b);
}

/// Compare the lower doubles for greater-than; returns 1 when `a[0] > b[0]`.
int _mm_comigt_sd (__m128d a, __m128d b) pure @safe
{
    return comsd!(FPComparison.ogt)(a, b);
}

/// Compare the lower doubles for less-or-equal; returns 1 when `a[0] <= b[0]`.
int _mm_comile_sd (__m128d a, __m128d b) pure @safe
{
    return comsd!(FPComparison.ule)(a, b); // yields true for NaN, same as GCC
}

/// Compare the lower doubles for less-than; returns 1 when `a[0] < b[0]`.
int _mm_comilt_sd (__m128d a, __m128d b) pure @safe
{
    return comsd!(FPComparison.ult)(a, b); // yields true for NaN, same as GCC
}

/// Compare the lower doubles for not-equal; returns 1 when `a[0] != b[0]` (ordered).
int _mm_comineq_sd (__m128d a, __m128d b) pure @safe
{
    return comsd!(FPComparison.one)(a, b);
}
624 
version(LDC)
{
    /// Convert the lower two packed 32-bit integers in `a` to packed doubles.
     __m128d _mm_cvtepi32_pd (__m128i a) pure  @safe
    {
        // Generates cvtdq2pd since LDC 1.0, even without optimizations
        enum ir = `
            %v = shufflevector <4 x i32> %0,<4 x i32> %0, <2 x i32> <i32 0, i32 1>
            %r = sitofp <2 x i32> %v to <2 x double>
            ret <2 x double> %r`;
        return cast(__m128d) LDCInlineIR!(ir, __m128d, __m128i)(a);
    }
}
else
{
    /// ditto
    __m128d _mm_cvtepi32_pd (__m128i a) pure  @safe
    {
        double2 r = void;
        r[0] = a[0];
        r[1] = a[1];
        return r;
    }
}
unittest
{
    __m128d A = _mm_cvtepi32_pd(_mm_set1_epi32(54));
    assert(A[0] == 54.0);
    assert(A[1] == 54.0);
}
653 
/// Convert packed 32-bit integers in `a` to packed single-precision floats.
__m128 _mm_cvtepi32_ps(__m128i a) pure @safe
{
    // Generates cvtdq2ps since LDC 1.0.0 -O1
    __m128 converted;
    foreach(lane; 0..4)
        converted.array[lane] = cast(float)a.array[lane];
    return converted;
}
unittest
{
    __m128 a = _mm_cvtepi32_ps(_mm_setr_epi32(-1, 0, 1, 1000));
    assert(a.array == [-1.0f, 0.0f, 1.0f, 1000.0f]);
}
669 
670 
version(LDC) 
{
    /// Convert packed doubles in `a` to packed 32-bit integers, rounding
    /// according to the current MXCSR rounding mode; upper two lanes are zero.
    // Like in clang, implemented with a magic intrinsic right now
    alias _mm_cvtpd_epi32 = __builtin_ia32_cvtpd2dq;

/* Unfortunately this generates a cvttpd2dq instruction
    __m128i _mm_cvtpd_epi32 (__m128d a) pure  @safe
    {
        enum ir = `
            %i = fptosi <2 x double> %0 to <2 x i32>
            %r = shufflevector <2 x i32> %i,<2 x i32> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 2, i32 3>        
            ret <4 x i32> %r`;

        return cast(__m128i) inlineIR!(ir, __m128i, __m128d)(a);
    } */
}
else
{
    /// ditto
    __m128i _mm_cvtpd_epi32 (__m128d a) pure @safe
    {
        __m128i r = _mm_setzero_si128();
        r[0] = convertDoubleToInt32UsingMXCSR(a[0]);
        r[1] = convertDoubleToInt32UsingMXCSR(a[1]);
        return r; 
    }
}
unittest
{
    int4 A = _mm_cvtpd_epi32(_mm_set_pd(61.0, 55.0));
    assert(A[0] == 55 && A[1] == 61 && A[2] == 0 && A[3] == 0);
}
702 
703 // TODO: _mm_cvtpd_pi32
704 
version(LDC)
{
    /// Convert packed doubles in `a` to packed floats; upper two lanes are zero.
    alias _mm_cvtpd_ps = __builtin_ia32_cvtpd2ps; // can't be done with IR unfortunately
}
else
{
    /// ditto
    __m128 _mm_cvtpd_ps (__m128d a) pure @safe
    {
        __m128 r = void;
        r[0] = a[0];
        r[1] = a[1];
        r[2] = 0;
        r[3] = 0;
        return r;
    }
}
unittest
{
    __m128d A = _mm_set_pd(5.25, 4.0);
    __m128 B = _mm_cvtpd_ps(A);
    assert(B.array == [4.0f, 5.25f, 0, 0]);
}
727 
728 // TODO: _mm_cvtpi32_pd
729 
version(LDC)
{
    // Disabled, since it fail with optimizations unfortunately
    //alias _mm_cvtps_epi32 = __builtin_ia32_cvtps2dq;

    /// Convert packed floats in `a` to packed 32-bit integers, rounding
    /// according to the current MXCSR rounding mode.
     __m128i _mm_cvtps_epi32 (__m128 a) pure @trusted
    {
        return __asm!__m128i("cvtps2dq $1,$0","=x,x",a);
    }
}
else
{
    /// ditto
    __m128i _mm_cvtps_epi32 (__m128 a) pure @safe
    {
        __m128i r = void;
        r[0] = convertFloatToInt32UsingMXCSR(a[0]);
        r[1] = convertFloatToInt32UsingMXCSR(a[1]);
        r[2] = convertFloatToInt32UsingMXCSR(a[2]);
        r[3] = convertFloatToInt32UsingMXCSR(a[3]);
        return r; 
    }
}
unittest
{
    // Exercise all four MXCSR rounding modes, restoring the original at the end.
    uint savedRounding = _MM_GET_ROUNDING_MODE();

    _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST);
    __m128i A = _mm_cvtps_epi32(_mm_setr_ps(1.4f, -2.1f, 53.5f, -2.9f));
    assert(A.array == [1, -2, 54, -3]);

    _MM_SET_ROUNDING_MODE(_MM_ROUND_DOWN);
    A = _mm_cvtps_epi32(_mm_setr_ps(1.4f, -2.1f, 53.5f, -2.9f));
    assert(A.array == [1, -3, 53, -3]);

    _MM_SET_ROUNDING_MODE(_MM_ROUND_UP);
    A = _mm_cvtps_epi32(_mm_setr_ps(1.4f, -2.1f, 53.5f, -2.9f));
    assert(A.array == [2, -2, 54, -2]);

    _MM_SET_ROUNDING_MODE(_MM_ROUND_TOWARD_ZERO);
    A = _mm_cvtps_epi32(_mm_setr_ps(1.4f, -2.1f, 53.5f, -2.9f));
    assert(A.array == [1, -2, 53, -2]);

    _MM_SET_ROUNDING_MODE(savedRounding);
}
774 
775 
version(LDC)
{
    /// Convert the lower two packed floats in `a` to packed doubles.
    __m128d _mm_cvtps_pd (__m128 a) pure  @safe
    {
        // Generates cvtps2pd since LDC 1.0, no opt
        enum ir = `
            %v = shufflevector <4 x float> %0,<4 x float> %0, <2 x i32> <i32 0, i32 1>
            %r = fpext <2 x float> %v to <2 x double>
            ret <2 x double> %r`;
        return cast(__m128d) LDCInlineIR!(ir, __m128d, __m128)(a);
    }
}
else
{
    /// ditto
     __m128d _mm_cvtps_pd (__m128 a) pure  @safe
    {
        double2 r = void;
        r[0] = a[0];
        r[1] = a[1];
        return r;
    }
}
unittest
{
    __m128d A = _mm_cvtps_pd(_mm_set1_ps(54.0f));
    assert(A[0] == 54.0);
    assert(A[1] == 54.0);
}
804 
/// Return the lower double-precision element of `a`.
double _mm_cvtsd_f64 (__m128d a) pure @safe
{
    return a[0];
}

version(LDC)
{
    /// Convert the lower double in `a` to a 32-bit integer, rounding
    /// according to the current MXCSR rounding mode.
    alias _mm_cvtsd_si32 = __builtin_ia32_cvtsd2si;
}
else
{
    /// ditto
    int _mm_cvtsd_si32 (__m128d a) pure @safe
    {
        return convertDoubleToInt32UsingMXCSR(a[0]);
    }
}
unittest
{
    assert(4 == _mm_cvtsd_si32(_mm_set1_pd(4.0)));
}
825 
version(LDC)
{
    /// Convert the lower double in `a` to a 64-bit integer, rounding
    /// according to the current MXCSR rounding mode.
    // Unfortunately this builtin crashes in 32-bit
    version(X86_64)
        alias _mm_cvtsd_si64 = __builtin_ia32_cvtsd2si64;
    else
    {
        /// ditto
        long _mm_cvtsd_si64 (__m128d a) pure @safe
        {
            return convertDoubleToInt64UsingMXCSR(a[0]);
        }
    }
}
else
{
    /// ditto
    long _mm_cvtsd_si64 (__m128d a) pure @safe
    {
        return convertDoubleToInt64UsingMXCSR(a[0]);
    }
}
unittest
{
    assert(-4 == _mm_cvtsd_si64(_mm_set1_pd(-4.0)));

    // Check all four MXCSR rounding modes, restoring the original at the end.
    uint savedRounding = _MM_GET_ROUNDING_MODE();

    _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST);
    assert(-56468486186 == _mm_cvtsd_si64(_mm_set1_pd(-56468486186.5)));

    _MM_SET_ROUNDING_MODE(_MM_ROUND_DOWN);
    assert(-56468486187 == _mm_cvtsd_si64(_mm_set1_pd(-56468486186.1)));

    _MM_SET_ROUNDING_MODE(_MM_ROUND_UP);
    assert(56468486187 == _mm_cvtsd_si64(_mm_set1_pd(56468486186.1)));

    _MM_SET_ROUNDING_MODE(_MM_ROUND_TOWARD_ZERO);
    assert(-56468486186 == _mm_cvtsd_si64(_mm_set1_pd(-56468486186.9)));

    _MM_SET_ROUNDING_MODE(savedRounding);    
}

/// ditto
alias _mm_cvtsd_si64x = _mm_cvtsd_si64;
868 
/// Convert the lower double of `b` to a float stored in the lower lane of the
/// result; the upper three lanes are copied from `a`.
__m128 _mm_cvtsd_ss (__m128 a, __m128d b) pure @safe
{
    // Generates cvtsd2ss since LDC 1.3 -O0
    a[0] = b[0];
    return a;
}
unittest
{
    __m128 R = _mm_cvtsd_ss(_mm_set1_ps(4.0f), _mm_set1_pd(3.0));
    assert(R.array == [3.0f, 4.0f, 4.0f, 4.0f]);
}
880 
/// Return the lowest 32-bit integer lane of `a`.
int _mm_cvtsi128_si32 (__m128i a) pure @safe
{
    return a[0];
}

/// Return the lowest 64-bit integer lane of `a`.
long _mm_cvtsi128_si64 (__m128i a) pure @safe
{
    long2 la = cast(long2)a;
    return la[0];
}
/// ditto
alias _mm_cvtsi128_si64x = _mm_cvtsi128_si64;
892 
/// Convert `x` to a double stored in the lower lane of the result;
/// the upper lane is copied from `v`.
__m128d _mm_cvtsi32_sd(__m128d v, int x) pure @safe
{
    v[0] = cast(double)x;
    return v;
}
unittest
{
    __m128d a = _mm_cvtsi32_sd(_mm_set1_pd(0.0f), 42);
    assert(a.array == [42.0, 0]);
}

/// Return a vector with `a` in the lowest 32-bit lane and zeros elsewhere.
__m128i _mm_cvtsi32_si128 (int a) pure @safe
{
    int4 r = [0, 0, 0, 0];
    r[0] = a;
    return r;
}
unittest
{
    __m128i a = _mm_cvtsi32_si128(65);
    assert(a.array == [65, 0, 0, 0]);
}
915 
916 
// Note: on macOS, using "llvm.x86.sse2.cvtsi642sd" was buggy
/// Convert `x` to a double stored in the lower lane of the result;
/// the upper lane is copied from `v`.
__m128d _mm_cvtsi64_sd(__m128d v, long x) pure @safe
{
    v[0] = cast(double)x;
    return v;
}
unittest
{
    __m128d a = _mm_cvtsi64_sd(_mm_set1_pd(0.0f), 42);
    assert(a.array == [42.0, 0]);
}

/// Return a vector with `a` in the lowest 64-bit lane and zero in the upper lane.
__m128i _mm_cvtsi64_si128 (long a) pure @safe
{
    long2 r = [0, 0];
    r[0] = a;
    return cast(__m128i)(r);
}

/// ditto
alias _mm_cvtsi64x_sd = _mm_cvtsi64_sd;
/// ditto
alias _mm_cvtsi64x_si128 = _mm_cvtsi64_si128;

/// Convert the lower float of `x` to a double stored in the lower lane of the
/// result; the upper lane is copied from `v`.
/// (`double2`/`float4` are the underlying types of `__m128d`/`__m128`.)
double2 _mm_cvtss_sd(double2 v, float4 x) pure @safe
{
    v[0] = x[0];
    return v;
}
unittest
{
    __m128d a = _mm_cvtss_sd(_mm_set1_pd(0.0f), _mm_set1_ps(42.0f));
    assert(a.array == [42.0, 0]);
}
949 
/// Convert the lower float in `a` to a 64-bit integer with truncation.
/// NOTE(review): this operates on floats and in Intel's headers belongs to
/// SSE (xmmintrin) rather than SSE2 — confirm its placement here is deliberate.
long _mm_cvttss_si64 (__m128 a) pure @safe
{
    return cast(long)(a[0]); // Generates cvttss2si as expected
}
unittest
{
    assert(1 == _mm_cvttss_si64(_mm_setr_ps(1.9f, 2.0f, 3.0f, 4.0f)));
}
958 
version(LDC)
{
    /// Convert packed doubles in `a` to packed 32-bit integers with
    /// truncation; upper two lanes are zero.
    alias _mm_cvttpd_epi32 = __builtin_ia32_cvttpd2dq;
}
else
{
    /// ditto
    __m128i _mm_cvttpd_epi32 (__m128d a) pure @safe
    {
        // Note: doesn't generate cvttpd2dq as of LDC 1.13
        __m128i r;
        r[0] = cast(int)a[0];
        r[1] = cast(int)a[1];
        r[2] = 0;
        r[3] = 0;
        return r;
    }
}
unittest
{
    __m128i R = _mm_cvttpd_epi32(_mm_setr_pd(-4.9, 45641.5f));
    assert(R.array == [-4, 45641, 0, 0]);
}
981 
982 //TODO: _mm_cvttpd_pi32
983 
/// Convert packed floats in `a` to packed 32-bit integers with truncation.
__m128i _mm_cvttps_epi32 (__m128 a) pure @safe
{
    // Note: Generates cvttps2dq since LDC 1.3 -O2
    __m128i r;
    r[0] = cast(int)a[0];
    r[1] = cast(int)a[1];
    r[2] = cast(int)a[2];
    r[3] = cast(int)a[3];
    return r;
}
unittest
{
    __m128i R = _mm_cvttps_epi32(_mm_setr_ps(-4.9, 45641.5f, 0.0f, 1.0f));
    assert(R.array == [-4, 45641, 0, 1]);
}
999 
/// Convert the lower double in `a` to a 32-bit integer with truncation.
/// (`pure @safe` added for consistency with the other conversion intrinsics.)
int _mm_cvttsd_si32 (__m128d a) pure @safe
{
    // Generates cvttsd2si since LDC 1.3 -O0
    return cast(int)a[0];
}

/// Convert the lower double in `a` to a 64-bit integer with truncation.
long _mm_cvttsd_si64 (__m128d a) pure @safe
{
    // Generates cvttsd2si since LDC 1.3 -O0
    // but in 32-bit instead, it's a long sequence that resort to FPU
    return cast(long)a[0];
}

/// ditto
alias _mm_cvttsd_si64x = _mm_cvttsd_si64;
1014 
/// Divide packed double-precision (64-bit) floating-point elements in `a` by `b`.
__m128d _mm_div_pd (__m128d a, __m128d b) pure @safe
{
    return a / b;
}
unittest
{
    __m128d a = [6.0, 9.0];
    __m128d b = [3.0, 3.0];
    __m128d r = _mm_div_pd(a, b);
    assert(r.array == [2.0, 3.0]);
}

/// Deprecated misnamed overload: SSE2's packed-double division is
/// `_mm_div_pd`; this double-taking `_mm_div_ps` is kept only for
/// backward compatibility with earlier releases.
deprecated("Use _mm_div_pd instead")
__m128d _mm_div_ps(__m128d a, __m128d b) pure @safe
{
    return a / b;
}
1019 
version(DigitalMars)
{
    // Work-around for https://issues.dlang.org/show_bug.cgi?id=19599
    /// Divide the lower double of `a` by the lower double of `b`, store it in
    /// the lower lane of the result; the upper lane is copied from `a`.
    __m128d _mm_div_sd(__m128d a, __m128d b) pure @safe
    {
        // The lack of inlining is part of the DMD bug work-around above.
        pragma(inline, false);
        a[0] = a[0] / b[0];
        return a;
    }
}
else
{
    /// ditto
    __m128d _mm_div_sd(__m128d a, __m128d b) pure @safe
    {
        a[0] /= b[0];
        return a;
    }
}
unittest
{
    __m128d a = [2.0, 4.5];
    a = _mm_div_sd(a, a);
    assert(a.array == [1.0, 4.5]); // upper lane untouched
}
1044 
/// Extract a 16-bit integer from `v`, selected with `index`.
/// The result is zero-extended to 32 bits, matching the PEXTRW instruction
/// (previously this sign-extended negative lanes, which disagreed with Intel).
int _mm_extract_epi16(__m128i v, int index) pure @safe
{
    short8 r = cast(short8)v;
    return cast(ushort)(r.array[index]);
}
unittest
{
    __m128i A = _mm_set_epi16(7, 6, 5, 4, 3, 2, 1, -1);
    assert(_mm_extract_epi16(A, 6) == 6);
    assert(_mm_extract_epi16(A, 0) == 65535); // zero-extended, not -1
}
1056 
/// Copy `v`, and insert the 16-bit integer `i` at the location specified by `index`. 
__m128i _mm_insert_epi16 (__m128i v, int i, int index) @trusted
{
    short8 r = cast(short8)v;
    r[index] = cast(short)i; // only the low 16 bits of `i` are kept
    return cast(__m128i)r;
}
unittest
{
    __m128i A = _mm_set_epi16(7, 6, 5, 4, 3, 2, 1, 0);
    short8 R = cast(short8) _mm_insert_epi16(A, 42, 6);
    short[8] correct = [0, 1, 2, 3, 4, 5, 42, 7];
    assert(R.array == correct);
}
1071 
version(LDC)
{
    /// Perform a serializing operation on all load-from-memory instructions
    /// issued prior to this one (LFENCE).
    alias _mm_lfence = __builtin_ia32_lfence;
}
else
{
    /// ditto
    void _mm_lfence() pure @safe
    {
        asm nothrow @nogc pure @safe
        {
            lfence;
        }
    }
}
unittest
{
    _mm_lfence();
}
1090 
1091 
/// Load two doubles from memory into the result.
/// `mem_addr` must be 16-byte aligned.
__m128d _mm_load_pd (const(double) * mem_addr) pure
{
    __m128d* aligned = cast(__m128d*)mem_addr;
    return *aligned;
}

/// Load one double from memory and broadcast it to both lanes.
/// No alignment requirement.
__m128d _mm_load_pd1 (const(double)* mem_addr) pure
{
    double[2] arr = [*mem_addr, *mem_addr];
    return loadUnaligned!(double2)(&arr[0]);
}

/// Load one double into the lower lane; the upper lane is zeroed.
__m128d _mm_load_sd (const(double)* mem_addr) pure @safe
{
    double2 r = [0, 0];
    r[0] = *mem_addr;
    return r;
}
unittest
{
    double x = -42;
    __m128d a = _mm_load_sd(&x);
    assert(a.array == [-42.0, 0.0]);
}
1116 
/// Load 128 bits of integer data from memory. `mem_addr` must be aligned on
/// a 16-byte boundary.
__m128i _mm_load_si128 (const(__m128i)* mem_addr) pure @trusted
{
    return *mem_addr;
}

/// Load a double-precision float from memory into both elements of the result.
alias _mm_load1_pd = _mm_load_pd1;

/// Load a double-precision float from memory into the upper element of the
/// result, and copy the lower element from `a`.
__m128d _mm_loadh_pd (__m128d a, const(double)* mem_addr) pure @safe
{
    a[1] = *mem_addr;
    return a;
}
1129 
// Note: strange signature since the memory doesn't have to aligned
/// Load a 64-bit integer from memory into the lower element of the result,
/// zeroing the upper element.
__m128i _mm_loadl_epi64 (const(__m128i)* mem_addr) pure @safe
{
    auto pLong = cast(const(long)*)mem_addr;
    long2 r = [0, 0];
    r[0] = *pLong;
    return cast(__m128i)(r);
}

/// Load a double-precision float from memory into the lower element of the
/// result, and copy the upper element from `a`.
__m128d _mm_loadl_pd (__m128d a, const(double)* mem_addr) pure @safe
{
    a[0] = *mem_addr;
    return a;
}
1144 
/// Load two doubles from memory in reverse order. `mem_addr` must be aligned
/// on a 16-byte boundary (it goes through `_mm_load_pd`).
__m128d _mm_loadr_pd (const(double)* mem_addr) pure @trusted
{
    __m128d a = _mm_load_pd(mem_addr);
    return shufflevector!(__m128d, 1, 0)(a, a);
}

/// Load two doubles from memory; no alignment requirement.
__m128d _mm_loadu_pd (const(double)* mem_addr) pure @safe
{
    return loadUnaligned!(double2)(mem_addr);
}

/// Load 128 bits of integer data from memory; no alignment requirement.
__m128i _mm_loadu_si128 (const(__m128i)* mem_addr) pure @trusted
{
    return loadUnaligned!(__m128i)(cast(int*)mem_addr);
}
1160 
/// Load an unaligned 32-bit integer from memory into the lowest element of
/// the result, zeroing the other three.
__m128i _mm_loadu_si32 (const(void)* mem_addr) pure @trusted
{
    int4 r = [0, 0, 0, 0];
    r[0] = *cast(int*)(mem_addr);
    return r;
}
unittest
{
    int x = 42;
    __m128i R = _mm_loadu_si32(&x);
    int[4] correct = [42, 0, 0, 0];
    assert(R.array == correct);
}
1175 
version(LDC)
{
    /// Multiply packed signed 16-bit integers in `a` and `b`, producing
    /// intermediate signed 32-bit integers. Horizontally add adjacent pairs
    /// of intermediate 32-bit integers, and pack the results.
    alias _mm_madd_epi16 = __builtin_ia32_pmaddwd128;
}
else
{
    /// Multiply packed signed 16-bit integers in `a` and `b`, producing
    /// intermediate signed 32-bit integers. Horizontally add adjacent pairs
    /// of intermediate 32-bit integers, and pack the results.
    /// (Previously there was no non-LDC fallback at all, so this intrinsic
    /// did not exist on DMD builds.)
    __m128i _mm_madd_epi16 (__m128i a, __m128i b) pure @safe
    {
        short8 sa = cast(short8)a;
        short8 sb = cast(short8)b;
        int4 r;
        foreach(i; 0..4)
        {
            // shorts promote to int, so each product is exact; the pair sum
            // wraps like the hardware does for the single overflowing case.
            r[i] = sa[2*i] * sb[2*i] + sa[2*i+1] * sb[2*i+1];
        }
        return r;
    }
}
unittest
{
    short8 A = [0, 1, 2, 3, -32768, -32768, 32767, 32767];
    short8 B = [0, 1, 2, 3, -32768, -32768, 32767, 32767];
    int4 R = _mm_madd_epi16(cast(__m128i)A, cast(__m128i)B);
    int[4] correct = [1, 13, -2147483648, 2 * 32767 * 32767];
    assert(R.array == correct);
}
1180 
version(LDC)
{
    /// Conditionally store 8-bit integer elements from `a` into memory using `mask` 
    /// (elements are not stored when the highest bit is not set in the corresponding element) 
    /// and a non-temporal memory hint. `mem_addr` does not need to be aligned on any particular 
    /// boundary.
    alias _mm_maskmoveu_si128 = __builtin_ia32_maskmovdqu; // can't do it with pure IR
}
else
{
    ///ditto
    void _mm_maskmoveu_si128 (__m128i a, __m128i mask, void* mem_addr) pure @trusted
    {
        byte16 b = cast(byte16)a;
        byte16 m = cast(byte16)mask;
        byte* dest = cast(byte*)(mem_addr);
        foreach(j; 0..16)
        {
            // `& 128` tests the sign (most significant) bit of the mask byte.
            if (m[j] & 128)
            {
                dest[j] = b[j];
            }
        }
    }
}
unittest
{
    ubyte[16] dest =           [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42];
    __m128i mask = _mm_setr_epi8(0,-1, 0,-1,-1, 1,-1,-1, 0,-1,-4,-1,-1, 0,-127, 0);
    __m128i A    = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15);
    _mm_maskmoveu_si128(A, mask, dest.ptr);
    ubyte[16] correct =        [42, 1,42, 3, 4,42, 6, 7,42, 9,10,11,12,42,14,42];
    assert(dest == correct);
}
1215 
/// Compare packed signed 16-bit integers in `a` and `b`, returning maxima.
__m128i _mm_max_epi16 (__m128i a, __m128i b) pure @safe
{
    // No PMAXSW builtin is available here (clang uses intrinsics we don't
    // have), so select with a compare mask instead:
    // b ^ ((a ^ b) & (a > b)) yields a where a > b, and b elsewhere.
    __m128i gt   = _mm_cmpgt_epi16(a, b);
    __m128i diff = _mm_xor_si128(a, b);
    return _mm_xor_si128(b, _mm_and_si128(diff, gt));
}
unittest
{
    short8 R = cast(short8) _mm_max_epi16(_mm_setr_epi16(45, 1, -4, -8, 9,  7, 0,-57),
                                          _mm_setr_epi16(-4,-8,  9,  7, 0,-57, 0,  0));
    short[8] correct =                                  [45, 1,  9,  7, 9,  7, 0,  0];
    assert(R.array == correct);
}
1231 
1232 
/// Compare packed unsigned 8-bit integers in `a` and `b`, returning maxima.
__m128i _mm_max_epu8 (__m128i a, __m128i b) pure @safe
{
    // Only a signed byte compare is available, so bias both operands by
    // -128 first: an unsigned compare becomes a signed one. Then select
    // with b ^ ((a ^ b) & mask), which picks a where the mask is set.
    __m128i bias = _mm_set1_epi8(-128);
    __m128i gt   = _mm_cmpgt_epi8(_mm_add_epi8(a, bias), _mm_add_epi8(b, bias));
    __m128i diff = _mm_xor_si128(a, b);
    return _mm_xor_si128(b, _mm_and_si128(diff, gt));
}
unittest
{
    byte16 R = cast(byte16) _mm_max_epu8(_mm_setr_epi8(45, 1, -4, -8, 9,  7, 0,-57, -4,-8,  9,  7, 0,-57, 0,  0),
                                         _mm_setr_epi8(-4,-8,  9,  7, 0,-57, 0,  0, 45, 1, -4, -8, 9,  7, 0,-57));
    byte[16] correct =                                [-4,-8, -4, -8, 9,-57, 0,-57, -4,-8, -4, -8, 9,-57, 0,-57];
    assert(R.array == correct);
}
1250 
/// Compare packed doubles in `a` and `b`, returning maxima.
/// NOTE(review): when an operand is NaN, `>` is false, so `b` is returned —
/// presumably matching MAXPD's "return second source on unordered" rule;
/// confirm against the instruction reference before relying on it.
__m128d _mm_max_pd (__m128d a, __m128d b) pure @safe
{
    // Generates maxpd starting with LDC 1.9
    a[0] = (a[0] > b[0]) ? a[0] : b[0];
    a[1] = (a[1] > b[1]) ? a[1] : b[1];
    return a;
}
unittest
{
    __m128d A = _mm_setr_pd(4.0, 1.0);
    __m128d B = _mm_setr_pd(1.0, 8.0);
    __m128d M = _mm_max_pd(A, B);
    assert(M[0] == 4.0);
    assert(M[1] == 8.0);
}

/// Compare the lower doubles of `a` and `b`, storing the maximum in the
/// lower result element; the upper element is copied from `a`.
__m128d _mm_max_sd (__m128d a, __m128d b) pure @safe
{
     __m128d r = a;
    // Generates maxsd starting with LDC 1.3
    r[0] = (a[0] > b[0]) ? a[0] : b[0];
    return r;
}
unittest
{
    __m128d A = _mm_setr_pd(1.0, 1.0);
    __m128d B = _mm_setr_pd(4.0, 2.0);
    __m128d M = _mm_max_sd(A, B);
    assert(M[0] == 4.0);
    assert(M[1] == 1.0);
}
1282 
version(LDC)
{
    /// Perform a serializing operation on all load and store instructions
    /// issued prior to this call (MFENCE).
    alias _mm_mfence = __builtin_ia32_mfence;
}
else
{
    /// Perform a serializing operation on all load and store instructions
    /// issued prior to this call (MFENCE).
    void _mm_mfence() pure @safe
    {
        asm nothrow @nogc pure @safe
        {
            mfence;
        }
    }
}
unittest
{
    _mm_mfence();
}
1301 
/// Compare packed signed 16-bit integers in `a` and `b`, returning minima.
__m128i _mm_min_epi16 (__m128i a, __m128i b) pure @safe
{
    // clang's __builtin_ia32_pminsw128 is gone from LDC's LLVM, so build
    // the selection by hand: b ^ ((a ^ b) & (a < b)) is a where a < b,
    // b elsewhere.
    __m128i lt   = _mm_cmplt_epi16(a, b);
    __m128i diff = _mm_xor_si128(a, b);
    return _mm_xor_si128(b, _mm_and_si128(diff, lt));
}
unittest
{
    short8 R = cast(short8) _mm_min_epi16(_mm_setr_epi16(45, 1, -4, -8, 9,  7, 0,-57),
                                          _mm_setr_epi16(-4,-8,  9,  7, 0,-57, 0,  0));
    short[8] correct =  [-4,-8, -4, -8, 0,-57, 0, -57];
    assert(R.array == correct);
}
1318 
1319 
/// Compare packed unsigned 8-bit integers in `a` and `b`, returning minima.
__m128i _mm_min_epu8 (__m128i a, __m128i b) pure @safe
{
    // Only a signed byte compare is available; biasing both sides by -128
    // turns the unsigned ordering into the signed one. Then select with
    // b ^ ((a ^ b) & mask), which yields a where the mask is set.
    __m128i bias = _mm_set1_epi8(-128);
    __m128i lt   = _mm_cmplt_epi8(_mm_add_epi8(a, bias), _mm_add_epi8(b, bias));
    __m128i diff = _mm_xor_si128(a, b);
    return _mm_xor_si128(b, _mm_and_si128(diff, lt));
}
unittest
{
    byte16 R = cast(byte16) _mm_min_epu8(_mm_setr_epi8(45, 1, -4, -8, 9,  7, 0,-57, -4,-8,  9,  7, 0,-57, 0,  0),
                                         _mm_setr_epi8(-4,-8,  9,  7, 0,-57, 0,  0, 45, 1, -4, -8, 9,  7, 0,-57));
    byte[16] correct =                                [45, 1,  9,  7, 0,  7, 0,  0, 45, 1,  9,  7, 0,  7, 0,  0];
    assert(R.array == correct);
}
1336 
/// Compare packed doubles in `a` and `b`, returning minima.
/// NOTE(review): when an operand is NaN, `<` is false, so `b` is returned —
/// presumably matching MINPD's "return second source on unordered" rule;
/// confirm against the instruction reference before relying on it.
__m128d _mm_min_pd (__m128d a, __m128d b) pure @safe
{
    // Generates minpd starting with LDC 1.9
    a[0] = (a[0] < b[0]) ? a[0] : b[0];
    a[1] = (a[1] < b[1]) ? a[1] : b[1];
    return a;
}
unittest
{
    __m128d A = _mm_setr_pd(1.0, 2.0);
    __m128d B = _mm_setr_pd(4.0, 1.0);
    __m128d M = _mm_min_pd(A, B);
    assert(M[0] == 1.0);
    assert(M[1] == 1.0);
}

/// Compare the lower doubles of `a` and `b`, storing the minimum in the
/// lower result element; the upper element is copied from `a`.
__m128d _mm_min_sd (__m128d a, __m128d b) pure @safe
{
    // Generates minsd starting with LDC 1.3
    __m128d r = a;
    r[0] = (a[0] < b[0]) ? a[0] : b[0];
    return r;
}
unittest
{
    __m128d A = _mm_setr_pd(1.0, 3.0);
    __m128d B = _mm_setr_pd(4.0, 2.0);
    __m128d M = _mm_min_sd(A, B);
    assert(M[0] == 1.0);
    assert(M[1] == 3.0);
}
1368 
/// Copy the lower 64-bit integer of `a` to the result, zeroing the upper half.
__m128i _mm_move_epi64 (__m128i a) pure @safe
{
    long2 la = cast(long2) a;
    long2 r = [0, 0];
    r[0] = la[0];
    return cast(__m128i) r;
}
unittest
{
    long2 A = [13, 47];
    long2 R = cast(long2) _mm_move_epi64( cast(__m128i)A );
    long[2] expected = [13, 0];
    assert(R.array == expected);
}
1383 
/// Build a vector with the lower double taken from `b` and the upper double
/// taken from `a`.
__m128d _mm_move_sd (__m128d a, __m128d b) pure @safe
{
    __m128d r = b;
    r[1] = a[1];
    return r;
}
unittest
{
    double2 A = [13.0, 47.0];
    double2 B = [34.0, 58.0];
    double2 R = _mm_move_sd(A, B);
    double[2] expected = [34.0, 47.0];
    assert(R.array == expected);
}
1397 
version(LDC)
{
    /// Create mask from the most significant bit of each 8-bit element in `v`.
    alias _mm_movemask_epi8 = __builtin_ia32_pmovmskb128;
}
else
{
    /// Create mask from the most significant bit of each 8-bit element in `v`.
    int _mm_movemask_epi8(__m128i v) pure @safe
    {
        byte16 ai = cast(byte16)v;
        int r = 0;
        foreach(bit; 0..16)
        {
            // A negative byte has its sign (most significant) bit set.
            if (ai[bit] < 0) r += (1 << bit);
        }
        return r;
    }
}
unittest
{
    assert(0x9C36 == _mm_movemask_epi8(_mm_set_epi8(-1, 0, 0, -1, -1, -1, 0, 0, 0, 0, -1, -1, 0, -1, -1, 0)));
}
1421 
version(LDC)
{
    /// Set each bit of mask `dst` based on the most significant bit of the corresponding 
    /// packed double-precision (64-bit) floating-point element in `v`.
    alias _mm_movemask_pd = __builtin_ia32_movmskpd;
}
else
{
    /// Set each bit of mask `dst` based on the most significant bit of the corresponding 
    /// packed double-precision (64-bit) floating-point element in `v`.
    int _mm_movemask_pd(__m128d v) pure @safe
    {
        // Reinterpreting as long2 lets the sign test read each double's MSB.
        long2 lv = cast(long2)v;
        int r = 0;
        if (lv[0] < 0) r += 1;
        if (lv[1] < 0) r += 2;
        return r;
    }
}
unittest
{
    __m128d A = cast(__m128d) _mm_set_epi64x(-1, 0);
    assert(_mm_movemask_pd(A) == 2);
}
1446 
1447 
1448 // TODO: _mm_movepi64_pi64
1449 // TODO: __m128i _mm_movpi64_epi64 (__m64 a)
1450 
// PERF: unfortunately, __builtin_ia32_pmuludq128 disappeared from LDC
// but seems there in clang
/// Multiply the low unsigned 32-bit integers of each 64-bit lane of `a` and
/// `b`, returning the two unsigned 64-bit products.
__m128i _mm_mul_epu32(__m128i a, __m128i b) pure @safe
{
    __m128i zero = _mm_setzero_si128();
    // Interleaving each low 32-bit lane with zero zero-extends it to 64 bits.
    long2 la = cast(long2) shufflevector!(int4, 0, 4, 2, 6)(a, zero);
    long2 lb = cast(long2) shufflevector!(int4, 0, 4, 2, 6)(b, zero);
    static if (__VERSION__ >= 2076)
    {
        return cast(__m128i)(la * lb);
    }
    else
    {
        // long2 mul not supported before LDC 1.5
        la[0] *= lb[0];
        la[1] *= lb[1];
        return cast(__m128i)(la);
    }
}
unittest
{
    __m128i A = _mm_set_epi32(0, 0xDEADBEEF, 0, 0xffffffff);
    __m128i B = _mm_set_epi32(0, 0xCAFEBABE, 0, 0xffffffff);
    __m128i C = _mm_mul_epu32(A, B);
    long2 LC = cast(long2)C;
    assert(LC.array[0] == 18446744065119617025uL);
    assert(LC.array[1] == 12723420444339690338uL);
}
1479 
1480 
/// Multiply packed double-precision floats in `a` and `b` element-wise.
__m128d _mm_mul_pd(__m128d a, __m128d b) pure @safe
{
    return a * b;
}
unittest
{
    __m128d a = [-2.0, 1.5];
    a = _mm_mul_pd(a, a);
    assert(a.array == [4.0, 2.25]);
}
1491 
version(DigitalMars)
{
    // Work-around for https://issues.dlang.org/show_bug.cgi?id=19599
    // The pragma(inline, false) is part of the work-around; do not remove.
    /// Multiply the lower doubles of `a` and `b`; the upper element is
    /// copied from `a`.
    __m128d _mm_mul_sd(__m128d a, __m128d b) pure @safe
    {
        pragma(inline, false);
        a[0] = a[0] * b[0];
        return a;
    }  
}
else
{
    /// Multiply the lower doubles of `a` and `b`; the upper element is
    /// copied from `a`.
    __m128d _mm_mul_sd(__m128d a, __m128d b) pure @safe
    {
        a[0] *= b[0];
        return a;
    }
}
unittest
{
    __m128d a = [-2.0, 1.5];
    a = _mm_mul_sd(a, a);
    assert(a.array == [4.0, 1.5]);
}
1516 
1517 
1518 // TODO: _mm_mul_su32
1519 
version(LDC)
{
    /// Multiply packed signed 16-bit integers, returning the high 16 bits
    /// of each 32-bit product.
    alias _mm_mulhi_epi16 = __builtin_ia32_pmulhw128;
}
else
{
    /// Multiply packed signed 16-bit integers, returning the high 16 bits
    /// of each 32-bit product.
    __m128i _mm_mulhi_epi16 (__m128i a, __m128i b) pure @safe
    {
        short8 sa = cast(short8)a;
        short8 sb = cast(short8)b;
        short8 r = void;
        // Each product promotes to int; after >> 16 the value fits in
        // 16 bits, so the narrowing assignment is accepted.
        r[0] = (sa[0] * sb[0]) >> 16;
        r[1] = (sa[1] * sb[1]) >> 16;
        r[2] = (sa[2] * sb[2]) >> 16;
        r[3] = (sa[3] * sb[3]) >> 16;
        r[4] = (sa[4] * sb[4]) >> 16;
        r[5] = (sa[5] * sb[5]) >> 16;
        r[6] = (sa[6] * sb[6]) >> 16;
        r[7] = (sa[7] * sb[7]) >> 16;
        return cast(__m128i)r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi16(0, -16, 2, 3, 4, 8, 16, 7);
    __m128i B = _mm_set1_epi16(16384);
    short8 R = cast(short8)_mm_mulhi_epi16(A, B);
    short[8] correct = [0, -4, 0, 0, 1, 2, 4, 1];
    assert(R.array == correct);
}
1550 
version(LDC)
{
    /// Multiply packed unsigned 16-bit integers, returning the high 16 bits
    /// of each 32-bit product.
    alias _mm_mulhi_epu16 = __builtin_ia32_pmulhuw128;
}
else
{
    /// Multiply packed unsigned 16-bit integers, returning the high 16 bits
    /// of each 32-bit product.
    __m128i _mm_mulhi_epu16 (__m128i a, __m128i b) pure @safe
    {
        short8 sa = cast(short8)a;
        short8 sb = cast(short8)b;
        short8 r = void;
        // The ushort casts force an unsigned 16x16->32 multiply before
        // taking the top half of each product.
        r[0] = cast(short)( (cast(ushort)sa[0] * cast(ushort)sb[0]) >> 16 );
        r[1] = cast(short)( (cast(ushort)sa[1] * cast(ushort)sb[1]) >> 16 );
        r[2] = cast(short)( (cast(ushort)sa[2] * cast(ushort)sb[2]) >> 16 );
        r[3] = cast(short)( (cast(ushort)sa[3] * cast(ushort)sb[3]) >> 16 );
        r[4] = cast(short)( (cast(ushort)sa[4] * cast(ushort)sb[4]) >> 16 );
        r[5] = cast(short)( (cast(ushort)sa[5] * cast(ushort)sb[5]) >> 16 );
        r[6] = cast(short)( (cast(ushort)sa[6] * cast(ushort)sb[6]) >> 16 );
        r[7] = cast(short)( (cast(ushort)sa[7] * cast(ushort)sb[7]) >> 16 );
        return cast(__m128i)r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi16(0, -16, 2, 3, 4, 8, 16, 7);
    __m128i B = _mm_set1_epi16(16384);
    short8 R = cast(short8)_mm_mulhi_epu16(A, B);
    short[8] correct = [0, 0x3FFC, 0, 0, 1, 2, 4, 1];
    assert(R.array == correct);
}
1581 
/// Multiply packed 16-bit integers in `a` and `b`, returning the low 16 bits
/// of each 32-bit product.
/// `pure @safe` added: the function had no attributes, inconsistent with
/// every sibling intrinsic and unusable from pure code.
__m128i _mm_mullo_epi16 (__m128i a, __m128i b) pure @safe
{
    return cast(__m128i)(cast(short8)a * cast(short8)b);
}
1586 
/// Bitwise OR of packed double-precision floats in `a` and `b`.
__m128d _mm_or_pd (__m128d a, __m128d b) pure @safe
{
    return cast(__m128d)( cast(__m128i)a | cast(__m128i)b );
}

/// Bitwise OR of 128 bits of integer data in `a` and `b`.
__m128i _mm_or_si128 (__m128i a, __m128i b) pure @safe
{
    return a | b;
}
1596 
version(LDC)
{
    /// Convert packed signed 32-bit integers from `a` and `b` to packed
    /// 16-bit integers using signed saturation.
    alias _mm_packs_epi32 = __builtin_ia32_packssdw128;
}
else
{
    /// Convert packed signed 32-bit integers from `a` and `b` to packed
    /// 16-bit integers using signed saturation.
    __m128i _mm_packs_epi32 (__m128i a, __m128i b) pure @safe
    {
        // saturateSignedIntToSignedShort comes from inteli.internals.
        short8 r;
        r[0] = saturateSignedIntToSignedShort(a[0]);
        r[1] = saturateSignedIntToSignedShort(a[1]);
        r[2] = saturateSignedIntToSignedShort(a[2]);
        r[3] = saturateSignedIntToSignedShort(a[3]);
        r[4] = saturateSignedIntToSignedShort(b[0]);
        r[5] = saturateSignedIntToSignedShort(b[1]);
        r[6] = saturateSignedIntToSignedShort(b[2]);
        r[7] = saturateSignedIntToSignedShort(b[3]);
        return cast(__m128i)r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi32(100000, -100000, 1000, 0);
    short8 R = cast(short8) _mm_packs_epi32(A, A);
    short[8] correct = [32767, -32768, 1000, 0, 32767, -32768, 1000, 0];
    assert(R.array == correct);
}
1624 
version(LDC)
{
    /// Convert packed signed 16-bit integers from `a` and `b` to packed
    /// 8-bit integers using signed saturation.
    alias _mm_packs_epi16 = __builtin_ia32_packsswb128;
}
else
{
    /// Convert packed signed 16-bit integers from `a` and `b` to packed
    /// 8-bit integers using signed saturation.
    __m128i _mm_packs_epi16 (__m128i a, __m128i b) pure @safe
    {
        // saturateSignedWordToSignedByte comes from inteli.internals.
        byte16 r;
        short8 sa = cast(short8)a;
        short8 sb = cast(short8)b;
        foreach(i; 0..8)
            r[i] = saturateSignedWordToSignedByte(sa[i]);
        foreach(i; 0..8)
            r[i+8] = saturateSignedWordToSignedByte(sb[i]);
        return cast(__m128i)r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi16(1000, -1000, 1000, 0, 256, -129, 254, 0);
    byte16 R = cast(byte16) _mm_packs_epi16(A, A);
    byte[16] correct = [127, -128, 127, 0, 127, -128, 127, 0,
                        127, -128, 127, 0, 127, -128, 127, 0];
    assert(R.array == correct);
}
1651 
version(LDC)
{
    /// Convert packed signed 16-bit integers from `a` and `b` to packed
    /// 8-bit integers using unsigned saturation.
    alias _mm_packus_epi16 = __builtin_ia32_packuswb128;
}
else
{
    /// Convert packed signed 16-bit integers from `a` and `b` to packed
    /// 8-bit integers using unsigned saturation.
    /// `@trusted` added (the function was implicitly @system, inconsistent
    /// with the other intrinsics): the only pointer use is `result.ptr` of
    /// a fully-written local array.
    __m128i _mm_packus_epi16 (__m128i a, __m128i b) pure @trusted
    {
        short8 sa = cast(short8)a;
        short8 sb = cast(short8)b;
        ubyte[16] result = void;
        for (int i = 0; i < 8; ++i)
        {
            // Clamp each signed word to [0, 255] (unsigned saturation).
            short s = sa[i];
            if (s < 0) s = 0;
            if (s > 255) s = 255;
            result[i] = cast(ubyte)s;

            s = sb[i];
            if (s < 0) s = 0;
            if (s > 255) s = 255;
            result[i+8] = cast(ubyte)s;
        }
        return cast(__m128i) loadUnaligned!(byte16)(cast(byte*)result.ptr);
    }
}
unittest
{
    __m128i A = _mm_setr_epi16(-10, 400, 0, 256, 255, 2, 1, 0);
    byte16 AA = cast(byte16) _mm_packus_epi16(A, A);
    static immutable ubyte[16] correctResult = [0, 255, 0, 255, 255, 2, 1, 0,
                                                0, 255, 0, 255, 255, 2, 1, 0];
    foreach(i; 0..16)
        assert(AA[i] == cast(byte)(correctResult[i]));
}
1687 
version(LDC)
{
    /// Provide a hint to the processor that the code sequence is a spin-wait
    /// loop (PAUSE).
    alias _mm_pause = __builtin_ia32_pause;
}
else
{
    /// Provide a hint to the processor that the code sequence is a spin-wait
    /// loop (PAUSE).
    void _mm_pause() pure @safe
    {
        asm nothrow @nogc pure @safe
        {
            rep; nop; // F3 90 =  pause
        }
    }
}
unittest
{
    _mm_pause();
}
1706 
1707 
version(LDC)
{
    /// Compute the absolute differences of packed unsigned 8-bit integers,
    /// then horizontally sum each group of 8 differences into the low
    /// 16 bits of the two 64-bit lanes of the result.
    alias _mm_sad_epu8 = __builtin_ia32_psadbw128;
}
else
{
    /// Compute the absolute differences of packed unsigned 8-bit integers,
    /// then horizontally sum each group of 8 differences into the low
    /// 16 bits of the two 64-bit lanes of the result.
    __m128i _mm_sad_epu8 (__m128i a, __m128i b) pure @safe
    {
        byte16 ab = cast(byte16)a;
        byte16 bb = cast(byte16)b;
        ubyte[16] t;
        foreach(i; 0..16)
        {
            // Absolute difference of the bytes taken as unsigned values.
            int diff = cast(ubyte)(ab[i]) - cast(ubyte)(bb[i]);
            if (diff < 0) diff = -diff;
            t[i] = cast(ubyte)(diff);
        }
        // r[0] and r[2] hold the two 8-byte group sums; r[1] and r[3]
        // stay zero, matching the 64-bit lane layout of PSADBW.
        int4 r = _mm_setzero_si128();
        r[0] = t[0] + t[1] + t[2] + t[3] + t[4] + t[5] + t[6] + t[7];
        r[2] = t[8] + t[9] + t[10]+ t[11]+ t[12]+ t[13]+ t[14]+ t[15];
        return r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi8(3, 4, 6, 8, 12, 14, 18, 20, 24, 30, 32, 38, 42, 44, 48, 54); // primes + 1
    __m128i B = _mm_set1_epi8(1);
    __m128i R = _mm_sad_epu8(A, B);
    int[4] correct = [2 + 3 + 5 + 7 + 11 + 13 + 17 + 19,
                      0,
                      23 + 29 + 31 + 37 + 41 + 43 + 47 + 53,
                      0];
    assert(R.array == correct);
}
1742 
/// Set packed 16-bit integers with the supplied values; `e0` becomes the
/// lowest element (arguments are given highest-element first).
__m128i _mm_set_epi16 (short e7, short e6, short e5, short e4, short e3, short e2, short e1, short e0) pure @trusted
{
    short[8] result = [e0, e1, e2, e3, e4, e5, e6, e7];
    return cast(__m128i) loadUnaligned!(short8)(result.ptr);
}
unittest
{
    __m128i A = _mm_set_epi16(7, 6, 5, 4, 3, 2, 1, 0);
    short8 B = cast(short8) A;
    foreach(i; 0..8)
        assert(B.array[i] == i);
}

/// Set packed 32-bit integers with the supplied values; `e0` becomes the
/// lowest element.
__m128i _mm_set_epi32 (int e3, int e2, int e1, int e0) pure @trusted
{
    int[4] result = [e0, e1, e2, e3];
    return loadUnaligned!(int4)(result.ptr);
}
unittest
{
    __m128i A = _mm_set_epi32(3, 2, 1, 0);
    foreach(i; 0..4)
        assert(A.array[i] == i);
}
1767 
/// Set packed 64-bit integers from two `__m64` values; `e0` becomes the
/// lower element.
__m128i _mm_set_epi64(__m64 e1, __m64 e0) pure @trusted
{
    long[2] result = [e0[0], e1[0]];
    return cast(__m128i)( loadUnaligned!(long2)(result.ptr) );
}
unittest
{
    __m128i A = _mm_set_epi64(_mm_cvtsi64_m64(1234), _mm_cvtsi64_m64(5678));
    long2 B = cast(long2) A;
    assert(B.array[0] == 5678);
    assert(B.array[1] == 1234);
}

/// Set packed 64-bit integers from two `long` values; `e0` becomes the
/// lower element.
__m128i _mm_set_epi64x (long e1, long e0) pure @trusted
{
    long[2] result = [e0, e1];
    return cast(__m128i)( loadUnaligned!(long2)(result.ptr) );
}
unittest
{
    __m128i A = _mm_set_epi64x(1234, 5678);
    long2 B = cast(long2) A;
    assert(B.array[0] == 5678);
    assert(B.array[1] == 1234);
}
1793 
/// Set packed 8-bit integers with the supplied values; `e0` becomes the
/// lowest element (arguments are given highest-element first).
__m128i _mm_set_epi8 (byte e15, byte e14, byte e13, byte e12,
                      byte e11, byte e10, byte e9, byte e8,
                      byte e7, byte e6, byte e5, byte e4,
                      byte e3, byte e2, byte e1, byte e0) pure @trusted
{
    byte[16] result = [e0, e1,  e2,  e3,  e4,  e5,  e6, e7,
                     e8, e9, e10, e11, e12, e13, e14, e15];
    return cast(__m128i)( loadUnaligned!(byte16)(result.ptr) );
}

/// Set packed doubles with the supplied values; `e0` becomes the lower
/// element.
__m128d _mm_set_pd (double e1, double e0) pure @trusted
{
    double[2] result = [e0, e1];
    return loadUnaligned!(double2)(result.ptr);
}

/// Broadcast the double `a` to both elements of the result.
__m128d _mm_set_pd1 (double a) pure @trusted
{
    double[2] result = [a, a];
    return loadUnaligned!(double2)(result.ptr);
}

/// Set the lower element to `a` and zero the upper element.
__m128d _mm_set_sd (double a) pure @trusted
{
    double[2] result = [a, 0];
    return loadUnaligned!(double2)(result.ptr);
}
1821 
/// Broadcast the 16-bit integer `a` to all elements of the result.
__m128i _mm_set1_epi16 (short a) pure @trusted
{
    short[8] result = [a, a, a, a, a, a, a, a];
    return cast(__m128i)( loadUnaligned!(short8)(result.ptr) );
}

/// Broadcast the 32-bit integer `a` to all elements of the result.
__m128i _mm_set1_epi32 (int a) pure @trusted
{
    int[4] result = [a, a, a, a];
    return loadUnaligned!(int4)(result.ptr);
}
unittest
{
    __m128 a = _mm_set1_ps(-1.0f);
    __m128 b = cast(__m128) _mm_set1_epi32(0x7fffffff);
    assert(_mm_and_ps(a, b).array == [1.0f, 1, 1, 1]);
}

/// Broadcast 64-bit integer `a` to all elements of `dst`.
__m128i _mm_set1_epi64 (__m64 a) pure @safe
{
    return _mm_set_epi64(a, a);
}

/// Broadcast the 64-bit integer `a` to both elements of the result.
__m128i _mm_set1_epi64x (long a) pure @trusted
{
    long[2] result = [a, a];
    return cast(__m128i)( loadUnaligned!(long2)(result.ptr) );
}

/// Broadcast the 8-bit integer `a` to all elements of the result.
__m128i _mm_set1_epi8 (byte a) pure @trusted
{
    byte[16] result = [a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a];
    return cast(__m128i)( loadUnaligned!(byte16)(result.ptr) );
}

/// Broadcast the double `a` to both elements of the result.
alias _mm_set1_pd = _mm_set_pd1;
1859 
/// Set packed 16-bit integers in reverse order: the first argument becomes
/// the lowest element. (Parameter names follow the Intel prototype and do
/// not reflect final element positions.)
__m128i _mm_setr_epi16 (short e7, short e6, short e5, short e4, short e3, short e2, short e1, short e0) pure @trusted
{
    short[8] result = [e7, e6, e5, e4, e3, e2, e1, e0];
    return cast(__m128i)( loadUnaligned!(short8)(result.ptr) );
}

/// Set packed 32-bit integers in reverse order: the first argument becomes
/// the lowest element.
__m128i _mm_setr_epi32 (int e3, int e2, int e1, int e0) pure @trusted
{
    int[4] result = [e3, e2, e1, e0];
    return cast(__m128i)( loadUnaligned!(int4)(result.ptr) );
}

/// Set packed 64-bit integers in reverse order: the first argument becomes
/// the lower element.
/// NOTE(review): takes `long` parameters where the Intel prototype takes
/// `__m64` — presumably a deliberate convenience; confirm before changing.
__m128i _mm_setr_epi64 (long e1, long e0) pure @trusted
{
    long[2] result = [e1, e0];
    return cast(__m128i)( loadUnaligned!(long2)(result.ptr) );
}

/// Set packed 8-bit integers in reverse order: the first argument becomes
/// the lowest element.
__m128i _mm_setr_epi8 (byte e15, byte e14, byte e13, byte e12,
                       byte e11, byte e10, byte e9,  byte e8,
                       byte e7,  byte e6,  byte e5,  byte e4,
                       byte e3,  byte e2,  byte e1,  byte e0) pure @trusted
{
    byte[16] result = [e15, e14, e13, e12, e11, e10, e9, e8,
                      e7,  e6,  e5,  e4,  e3,  e2, e1, e0];
    return cast(__m128i)( loadUnaligned!(byte16)(result.ptr) );
}

/// Set packed doubles in reverse order: the first argument becomes the
/// lower element.
__m128d _mm_setr_pd (double e1, double e0) pure @trusted
{
    double[2] result = [e1, e0];
    return loadUnaligned!(double2)(result.ptr);
}

/// Return a vector of two zero doubles.
__m128d _mm_setzero_pd () pure @trusted
{
    double[2] result = [0.0, 0.0];
    return loadUnaligned!(double2)(result.ptr);
}

/// Return 128 bits of zeroed integer data.
__m128i _mm_setzero_si128() pure @trusted
{
    int[4] result = [0, 0, 0, 0];
    return cast(__m128i)( loadUnaligned!(int4)(result.ptr) );
}
1905 
/// Shuffle 32-bit integers in `a` using the control `imm8`.
/// Note: `imm8` is a template (compile-time) parameter here, unlike the C
/// intrinsic where it is an immediate argument.
__m128i _mm_shuffle_epi32(int imm8)(__m128i a) pure @safe
{
    // Each 2-bit field of imm8 selects the source lane for one element.
    return shufflevector!(int4, (imm8 >> 0) & 3,
                                (imm8 >> 2) & 3,
                                (imm8 >> 4) & 3,
                                (imm8 >> 6) & 3)(a, a);
}
unittest
{
    __m128i A = _mm_setr_epi32(0, 1, 2, 3);
    enum int SHUFFLE = _MM_SHUFFLE(0, 1, 2, 3);
    int4 B = cast(int4) _mm_shuffle_epi32!SHUFFLE(A);
    int[4] expectedB = [ 3, 2, 1, 0 ];
    assert(B.array == expectedB);
}
1921 
/// Shuffle doubles using the control `imm8`: bit 0 selects the lower result
/// element from `a`, bit 1 selects the upper from `b`.
/// Note: `imm8` is a template (compile-time) parameter here.
__m128d _mm_shuffle_pd (int imm8)(__m128d a, __m128d b) pure @safe
{
    return shufflevector!(double2, 0 + ( imm8 & 1 ),
                                   2 + ( (imm8 >> 1) & 1 ))(a, b);
}
unittest
{
    __m128d A = _mm_setr_pd(0.5, 2.0);
    __m128d B = _mm_setr_pd(4.0, 5.0);
    enum int SHUFFLE = _MM_SHUFFLE2(1, 1);
    __m128d R = _mm_shuffle_pd!SHUFFLE(A, B);
    double[2] correct = [ 2.0, 5.0 ];
    assert(R.array == correct);
}
1936 
/// Shuffle the upper four 16-bit integers of `a` using the control `imm8`;
/// the lower four are passed through. `imm8` is a compile-time parameter.
__m128i _mm_shufflehi_epi16(int imm8)(__m128i a) pure @safe
{
    return cast(__m128i) shufflevector!(short8, 0, 1, 2, 3,
                                      4 + ( (imm8 >> 0) & 3 ),
                                      4 + ( (imm8 >> 2) & 3 ),
                                      4 + ( (imm8 >> 4) & 3 ),
                                      4 + ( (imm8 >> 6) & 3 ))(cast(short8)a, cast(short8)a);
}
unittest
{
    __m128i A = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
    enum int SHUFFLE = _MM_SHUFFLE(0, 1, 2, 3);
    short8 C = cast(short8) _mm_shufflehi_epi16!SHUFFLE(A);
    short[8] expectedC = [ 0, 1, 2, 3, 7, 6, 5, 4 ];
    assert(C.array == expectedC);
}

/// Shuffle the lower four 16-bit integers of `a` using the control `imm8`;
/// the upper four are passed through. `imm8` is a compile-time parameter.
__m128i _mm_shufflelo_epi16(int imm8)(__m128i a) pure @safe
{
    return cast(__m128i) shufflevector!(short8, ( (imm8 >> 0) & 3 ),
                                                ( (imm8 >> 2) & 3 ),
                                                ( (imm8 >> 4) & 3 ),
                                                ( (imm8 >> 6) & 3 ), 4, 5, 6, 7)(cast(short8)a, cast(short8)a);
}
unittest
{
    __m128i A = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
    enum int SHUFFLE = _MM_SHUFFLE(0, 1, 2, 3);
    short8 B = cast(short8) _mm_shufflelo_epi16!SHUFFLE(A);
    short[8] expectedB = [ 3, 2, 1, 0, 4, 5, 6, 7 ];
    assert(B.array == expectedB);
}
1969 
version(LDC)
{
    /// Shift packed 32-bit integers in `a` left by the count in the low
    /// 64 bits of `count`, shifting in zeros.
    alias _mm_sll_epi32 = __builtin_ia32_pslld128;
}
else
{
    /// Shift packed 32-bit integers in `a` left by the count in the low
    /// 64 bits of `count`, shifting in zeros.
    /// NOTE(review): counts >= 32 do not zero the result like the hardware
    /// instruction; D shift behavior for such counts is not portable here.
    __m128i _mm_sll_epi32 (__m128i a, __m128i count) pure @safe
    {
        int4 r = void;
        long2 lc = cast(long2)count;
        int bits = cast(int)(lc[0]);
        foreach(i; 0..4)
            r[i] = cast(uint)(a[i]) << bits;
        return r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi32(0, 2, 3, -4);
    __m128i B = _mm_sll_epi32(A, _mm_cvtsi32_si128(1));
    int[4] expectedB = [ 0, 4, 6, -8];
    assert(B.array == expectedB);
}
1993 
version(LDC)
{
    /// Shift packed 64-bit integers in `a` left by the count in the low
    /// 64 bits of `count`, shifting in zeros.
    alias _mm_sll_epi64  = __builtin_ia32_psllq128;
}
else
{
    /// Shift packed 64-bit integers in `a` left by the count in the low
    /// 64 bits of `count`, shifting in zeros.
    /// NOTE(review): counts >= 64 do not zero the result like the hardware
    /// instruction; D shift behavior for such counts is not portable here.
    __m128i _mm_sll_epi64 (__m128i a, __m128i count) pure @safe
    {
        long2 r = void;
        long2 sa = cast(long2)a;
        long2 lc = cast(long2)count;
        int bits = cast(int)(lc[0]);
        foreach(i; 0..2)
            r[i] = cast(ulong)(sa[i]) << bits;
        return cast(__m128i)r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi64(8, -4);
    long2 B = cast(long2) _mm_sll_epi64(A, _mm_cvtsi32_si128(1));
    long[2] expectedB = [ 16, -8];
    assert(B.array == expectedB);
}
2018 
version(LDC)
{
    /// Shift packed 16-bit integers in `a` left by the count in the low
    /// 64 bits of `count`, shifting in zeros.
    alias _mm_sll_epi16 = __builtin_ia32_psllw128;
}
else
{
    /// Shift packed 16-bit integers in `a` left by the count in the low
    /// 64 bits of `count`, shifting in zeros.
    /// NOTE(review): counts >= 16 do not zero the result like the hardware
    /// instruction; confirm callers only pass small counts.
    __m128i _mm_sll_epi16 (__m128i a, __m128i count) pure @safe
    {
        short8 sa = cast(short8)a;
        long2 lc = cast(long2)count;
        int bits = cast(int)(lc[0]);
        short8 r = void;
        foreach(i; 0..8)
            r[i] = cast(short)(cast(ushort)(sa[i]) << bits);
        return cast(int4)r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi16(0, 1, 2, 3, -4, -5, 6, 7);
    short8 B = cast(short8)( _mm_sll_epi16(A, _mm_cvtsi32_si128(1)) );
    short[8] expectedB =     [ 0, 2, 4, 6, -8, -10, 12, 14 ];
    assert(B.array == expectedB);
}
2043 
version(LDC)
{
    /// Shift packed 32-bit integers in `a` left by `imm8`, shifting in zeros.
    alias _mm_slli_epi32 = __builtin_ia32_pslldi128;
}
else
{
    /// Shift packed 32-bit integers in `a` left by `imm8`, shifting in zeros.
    /// NOTE(review): imm8 >= 32 does not zero the result like the hardware
    /// instruction; confirm callers only pass 0..31.
    __m128i _mm_slli_epi32 (__m128i a, int imm8) pure @safe
    {
        int4 r = void;
        foreach(i; 0..4)
            r[i] = cast(uint)(a[i]) << imm8;
        return r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi32(0, 2, 3, -4);
    __m128i B = _mm_slli_epi32(A, 1);
    int[4] expectedB = [ 0, 4, 6, -8];
    assert(B.array == expectedB);
}
2065 
version(LDC)
{
    /// Shift packed 64-bit integers in `a` left by `imm8`, shifting in zeros.
    alias _mm_slli_epi64  = __builtin_ia32_psllqi128;
}
else
{
    /// Shift packed 64-bit integers in `a` left by `imm8`, shifting in zeros.
    /// NOTE(review): imm8 >= 64 does not zero the result like the hardware
    /// instruction; confirm callers only pass 0..63.
    __m128i _mm_slli_epi64 (__m128i a, int imm8) pure @safe
    {
        long2 r = void;
        long2 sa = cast(long2)a;
        foreach(i; 0..2)
            r[i] = cast(ulong)(sa[i]) << imm8;
        return cast(__m128i)r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi64(8, -4);
    long2 B = cast(long2) _mm_slli_epi64(A, 1);
    long[2] expectedB = [ 16, -8];
    assert(B.array == expectedB);
}
2088 
version(LDC)
{
    /// Shift packed 16-bit integers in `a` left by `imm8`, shifting in zeros.
    alias _mm_slli_epi16 = __builtin_ia32_psllwi128;
}
else
{
    /// Shift packed 16-bit integers in `a` left by `imm8`, shifting in zeros.
    /// NOTE(review): imm8 >= 16 does not zero the result like the hardware
    /// instruction; confirm callers only pass small counts.
    __m128i _mm_slli_epi16 (__m128i a, int imm8) pure @safe
    {
        short8 sa = cast(short8)a;
        short8 r = void;
        foreach(i; 0..8)
            r[i] = cast(short)(cast(ushort)(sa[i]) << imm8);
        return cast(int4)r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi16(0, 1, 2, 3, -4, -5, 6, 7);
    short8 B = cast(short8)( _mm_slli_epi16(A, 1) );
    short[8] expectedB = [ 0, 2, 4, 6, -8, -10, 12, 14 ];
    assert(B.array == expectedB);
}
2111 
/// Shift `a` left by `imm8` bytes while shifting in zeros.
__m128i _mm_slli_si128(ubyte imm8)(__m128i op) pure @safe
{
    // Counts of 16 or more clear the whole register, like the hardware.
    static if (imm8 & 0xF0)
        return _mm_setzero_si128();
    else
        // Concatenate a zero vector with `op` and select a 16-byte window
        // starting `imm8` bytes before `op`; the leading bytes come from
        // the zero vector.
        return cast(__m128i) shufflevector!(byte16,
        16 - imm8, 17 - imm8, 18 - imm8, 19 - imm8, 20 - imm8, 21 - imm8, 22 - imm8, 23 - imm8,
        24 - imm8, 25 - imm8, 26 - imm8, 27 - imm8, 28 - imm8, 29 - imm8, 30 - imm8, 31 - imm8)
        (cast(byte16)_mm_setzero_si128(), cast(byte16)op);
}
unittest
{
    __m128i A = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
    short8 R = cast(short8) _mm_slli_si128!8(A); // shift 8 bytes to the left
    short[8] correct = [ 0, 0, 0, 0, 0, 1, 2, 3 ];
    assert(R.array == correct);
}
2130 
version(LDC)
{
    // Disappeared with LDC 1.11
    static if (__VERSION__ < 2081)
        alias _mm_sqrt_pd = __builtin_ia32_sqrtpd;
    else
    {
        /// Compute the square root of both double-precision lanes of `vec`.
        __m128d _mm_sqrt_pd(__m128d vec) pure @safe
        {
            vec.array[0] = llvm_sqrt(vec.array[0]);
            vec.array[1] = llvm_sqrt(vec.array[1]);
            return vec;
        }
    }
}
else
{
    /// Compute the square root of both double-precision lanes of `vec`.
    __m128d _mm_sqrt_pd(__m128d vec) pure @safe
    {
        vec.array[0] = sqrt(vec.array[0]);
        vec.array[1] = sqrt(vec.array[1]);
        return vec;
    }
}
2155 
2156 
version(LDC)
{
    // Disappeared with LDC 1.11
    static if (__VERSION__ < 2081)
        alias _mm_sqrt_sd = __builtin_ia32_sqrtsd;
    else
    {
        /// Compute the square root of the lower double-precision lane of `vec`;
        /// the upper lane is passed through unchanged.
        __m128d _mm_sqrt_sd(__m128d vec) pure @safe
        {
            vec.array[0] = llvm_sqrt(vec.array[0]);
            vec.array[1] = vec.array[1]; // intentional no-op: upper lane preserved
            return vec;
        }
    }
}
else
{
    /// Compute the square root of the lower double-precision lane of `vec`;
    /// the upper lane is passed through unchanged.
    __m128d _mm_sqrt_sd(__m128d vec) pure @safe
    {
        vec.array[0] = sqrt(vec.array[0]);
        vec.array[1] = vec.array[1]; // intentional no-op: upper lane preserved
        return vec;
    }
}
2181 
2182 
version(LDC)
{
    alias _mm_sra_epi16 = __builtin_ia32_psraw128;
}
else
{
    /// Shift packed 16-bit integers in `a` right by the amount held in the
    /// low 64 bits of `count`, shifting in sign bits (arithmetic shift).
    __m128i _mm_sra_epi16 (__m128i a, __m128i count) pure @safe
    {
        short8 elems = cast(short8)a;
        long2 countVec = cast(long2)count;
        int shift = cast(int)(countVec[0]);
        short8 result = void;
        for (int i = 0; i < 8; ++i)
            result[i] = cast(short)(elems[i] >> shift);
        return cast(int4)result;
    }
}
unittest
{
    __m128i input = _mm_setr_epi16(0, 1, 2, 3, -4, -5, 6, 7);
    short8 shifted = cast(short8)( _mm_sra_epi16(input, _mm_cvtsi32_si128(1)) );
    short[8] expected = [ 0, 0, 1, 1, -2, -3, 3, 3 ];
    assert(shifted.array == expected);
}
2207 
version(LDC)
{
    alias _mm_sra_epi32  = __builtin_ia32_psrad128;
}
else
{
    /// Shift packed 32-bit integers in `a` right by the amount held in the
    /// low 64 bits of `count`, shifting in sign bits (arithmetic shift).
    __m128i _mm_sra_epi32 (__m128i a, __m128i count) pure @safe
    {
        long2 countVec = cast(long2)count;
        int shift = cast(int)(countVec[0]);
        int4 result = void;
        for (int i = 0; i < 4; ++i)
            result[i] = (a[i] >> shift);
        return result;
    }
}
unittest
{
    __m128i input = _mm_setr_epi32(0, 2, 3, -4);
    __m128i shifted = _mm_sra_epi32(input, _mm_cvtsi32_si128(1));
    int[4] expected = [ 0, 1, 1, -2 ];
    assert(shifted.array == expected);
}
2231 
2232 
version(LDC)
{
    alias _mm_srai_epi16 = __builtin_ia32_psrawi128;
}
else
{
    /// Shift packed 16-bit integers in `a` right by `imm8` bits,
    /// shifting in sign bits (arithmetic shift).
    __m128i _mm_srai_epi16 (__m128i a, int imm8) pure @safe
    {
        short8 elems = cast(short8)a;
        short8 result = void;
        for (int i = 0; i < 8; ++i)
            result[i] = cast(short)(elems[i] >> imm8);
        return cast(int4)result;
    }
}
unittest
{
    __m128i input = _mm_setr_epi16(0, 1, 2, 3, -4, -5, 6, 7);
    short8 shifted = cast(short8)( _mm_srai_epi16(input, 1) );
    short[8] expected = [ 0, 0, 1, 1, -2, -3, 3, 3 ];
    assert(shifted.array == expected);
}
2255 
version(LDC)
{
    alias _mm_srai_epi32  = __builtin_ia32_psradi128;
}
else
{
    /// Shift packed 32-bit integers in `a` right by `imm8` bits,
    /// shifting in sign bits (arithmetic shift).
    __m128i _mm_srai_epi32 (__m128i a, int imm8) pure @safe
    {
        int4 result = void;
        for (int i = 0; i < 4; ++i)
            result[i] = (a[i] >> imm8);
        return result;
    }
}
unittest
{
    __m128i input = _mm_setr_epi32(0, 2, 3, -4);
    __m128i shifted = _mm_srai_epi32(input, 1);
    int[4] expected = [ 0, 1, 1, -2 ];
    assert(shifted.array == expected);
}
2277 
version(LDC)
{
    alias _mm_srl_epi16 = __builtin_ia32_psrlw128;
}
else
{
    /// Shift packed 16-bit integers in `a` right by the amount held in the
    /// low 64 bits of `count`, shifting in zeros (logical shift).
    __m128i _mm_srl_epi16 (__m128i a, __m128i count) pure @safe
    {
        short8 elems = cast(short8)a;
        long2 countVec = cast(long2)count;
        int shift = cast(int)(countVec[0]);
        short8 result = void;
        // Shift as ushort so zeros are shifted in regardless of sign.
        for (int i = 0; i < 8; ++i)
            result[i] = cast(short)(cast(ushort)(elems[i]) >> shift);
        return cast(int4)result;
    }
}
unittest
{
    __m128i input = _mm_setr_epi16(0, 1, 2, 3, -4, -5, 6, 7);
    short8 shifted = cast(short8)( _mm_srl_epi16(input, _mm_cvtsi32_si128(1)) );
    short[8] expected = [ 0, 0, 1, 1, 0x7FFE, 0x7FFD, 3, 3 ];
    assert(shifted.array == expected);
}
2302 
version(LDC)
{
    alias _mm_srl_epi32  = __builtin_ia32_psrld128;
}
else
{
    /// Shift packed 32-bit integers in `a` right by the amount held in the
    /// low 64 bits of `count`, shifting in zeros (logical shift).
    __m128i _mm_srl_epi32 (__m128i a, __m128i count) pure @safe
    {
        long2 countVec = cast(long2)count;
        int shift = cast(int)(countVec[0]);
        int4 result = void;
        // Shift as uint so zeros are shifted in regardless of sign.
        for (int i = 0; i < 4; ++i)
            result[i] = cast(uint)(a[i]) >> shift;
        return result;
    }
}
unittest
{
    __m128i input = _mm_setr_epi32(0, 2, 3, -4);
    __m128i shifted = _mm_srl_epi32(input, _mm_cvtsi32_si128(1));
    int[4] expected = [ 0, 1, 1, 0x7FFFFFFE ];
    assert(shifted.array == expected);
}
2326 
version(LDC)
{
    alias _mm_srl_epi64  = __builtin_ia32_psrlq128;
}
else
{
    /// Shift packed 64-bit integers in `a` right by the amount held in the
    /// low 64 bits of `count`, shifting in zeros (logical shift).
    __m128i _mm_srl_epi64 (__m128i a, __m128i count) pure @safe
    {
        long2 elems = cast(long2)a;
        long2 countVec = cast(long2)count;
        int shift = cast(int)(countVec[0]);
        long2 result = void;
        // Shift as ulong so zeros are shifted in regardless of sign.
        for (int i = 0; i < 2; ++i)
            result[i] = cast(ulong)(elems[i]) >> shift;
        return cast(__m128i)result;
    }
}
unittest
{
    __m128i input = _mm_setr_epi64(8, -4);
    long2 shifted = cast(long2) _mm_srl_epi64(input, _mm_cvtsi32_si128(1));
    long[2] expected = [ 4, 0x7FFFFFFFFFFFFFFE ];
    assert(shifted.array == expected);
}
2351 
version(LDC)
{
    alias _mm_srli_epi16 = __builtin_ia32_psrlwi128;
}
else
{
    /// Shift packed 16-bit integers in `a` right by `imm8` bits,
    /// shifting in zeros (logical shift).
    __m128i _mm_srli_epi16 (__m128i a, int imm8) pure @safe
    {
        short8 elems = cast(short8)a;
        short8 result = void;
        // Shift as ushort so zeros are shifted in regardless of sign.
        for (int i = 0; i < 8; ++i)
            result[i] = cast(short)(cast(ushort)(elems[i]) >> imm8);
        return cast(int4)result;
    }
}
unittest
{
    __m128i input = _mm_setr_epi16(0, 1, 2, 3, -4, -5, 6, 7);
    short8 shifted = cast(short8)( _mm_srli_epi16(input, 1) );
    short[8] expected = [ 0, 0, 1, 1, 0x7FFE, 0x7FFD, 3, 3 ];
    assert(shifted.array == expected);
}
2374 
version(LDC)
{
    alias _mm_srli_epi32  = __builtin_ia32_psrldi128;
}
else
{
    /// Shift packed 32-bit integers in `a` right by `imm8` bits,
    /// shifting in zeros (logical shift).
    __m128i _mm_srli_epi32 (__m128i a, int imm8) pure @safe
    {
        int4 result = void;
        // Shift as uint so zeros are shifted in regardless of sign.
        for (int i = 0; i < 4; ++i)
            result[i] = cast(uint)(a[i]) >> imm8;
        return result;
    }
}
unittest
{
    __m128i input = _mm_setr_epi32(0, 2, 3, -4);
    __m128i shifted = _mm_srli_epi32(input, 1);
    int[4] expected = [ 0, 1, 1, 0x7FFFFFFE ];
    assert(shifted.array == expected);
}
2396 
version(LDC)
{
    alias _mm_srli_epi64  = __builtin_ia32_psrlqi128;
}
else
{
    /// Shift packed 64-bit integers in `a` right by `imm8` bits,
    /// shifting in zeros (logical shift).
    __m128i _mm_srli_epi64 (__m128i a, int imm8) pure @safe
    {
        long2 elems = cast(long2)a;
        long2 result = void;
        // Shift as ulong so zeros are shifted in regardless of sign.
        for (int i = 0; i < 2; ++i)
            result[i] = cast(ulong)(elems[i]) >> imm8;
        return cast(__m128i)result;
    }
}
unittest
{
    __m128i input = _mm_setr_epi64(8, -4);
    long2 shifted = cast(long2) _mm_srli_epi64(input, 1);
    long[2] expected = [ 4, 0x7FFFFFFFFFFFFFFE ];
    assert(shifted.array == expected);
}
2419 
/// Shift `v` right by `bytes` bytes while shifting in zeros.
__m128i _mm_srli_si128(ubyte bytes)(__m128i v) pure @safe
{
    // A byte shift of 16 or more clears the whole register
    // (matches PSRLDQ semantics for out-of-range counts).
    static if (bytes & 0xF0)
        return _mm_setzero_si128();
    else
        // Select 16 bytes out of the 32-byte concatenation (v, zero).
        // Starting the indices at `bytes` drops the low bytes of `v` and
        // pulls zero bytes in at the high end of the result.
        return cast(__m128i) shufflevector!(byte16,
                                            bytes+0, bytes+1, bytes+2, bytes+3, bytes+4, bytes+5, bytes+6, bytes+7,
                                            bytes+8, bytes+9, bytes+10, bytes+11, bytes+12, bytes+13, bytes+14, bytes+15)
                                           (cast(byte16) v, cast(byte16)_mm_setzero_si128());
}
unittest
{
    __m128i R = _mm_srli_si128!4(_mm_set_epi32(4, 3, 2, 1));
    int[4] correct = [2, 3, 4, 0];
    assert(R.array == correct);
}
2437 
/// Shift `v` right by `bytes` bytes while shifting in zeros.
/// #BONUS
__m128 _mm_srli_ps(ubyte bytes)(__m128 v) pure @safe
{
    // Reuse the integer byte-shift, then view the bits as floats again.
    __m128i shifted = _mm_srli_si128!bytes(cast(__m128i)v);
    return cast(__m128)shifted;
}
unittest
{    
    __m128 R = _mm_srli_ps!8(_mm_set_ps(4.0f, 3.0f, 2.0f, 1.0f));
    float[4] expected = [3.0f, 4.0f, 0, 0];
    assert(R.array == expected);
}
2450 
/// Shift `v` right by `bytes` bytes while shifting in zeros.
/// #BONUS
__m128d _mm_srli_pd(ubyte bytes)(__m128d v) pure @safe
{
    // Reuse the integer byte-shift, then view the bits as doubles again.
    __m128i shifted = _mm_srli_si128!bytes(cast(__m128i)v);
    return cast(__m128d)shifted;
}
2457 
/// Store 128 bits (2 packed doubles) from `a` into memory.
/// `mem_addr` must be 16-byte aligned.
void _mm_store_pd (double* mem_addr, __m128d a) pure
{
    __m128d* aligned = cast(__m128d*)mem_addr;
    *aligned = a;
}

/// Store the lower double of `a` into both lanes of memory.
/// `mem_addr` must be 16-byte aligned.
void _mm_store_pd1 (double* mem_addr, __m128d a) pure
{
    __m128d* aligned = cast(__m128d*)mem_addr;
    *aligned = shufflevector!(double2, 0, 0)(a, a);
}
2469 
/// Store the lower double-precision element of `a` into memory.
void _mm_store_sd (double* mem_addr, __m128d a) pure @safe
{
    *mem_addr = a[0];
}

/// Store 128 bits of integer data from `a` into memory.
/// `mem_addr` must be 16-byte aligned.
void _mm_store_si128 (__m128i* mem_addr, __m128i a) pure @safe
{
    *mem_addr = a;
}

/// Same operation as `_mm_store_pd1`.
alias _mm_store1_pd = _mm_store_pd1;

/// Store the upper double-precision element of `a` into memory.
void _mm_storeh_pd (double* mem_addr, __m128d a) pure @safe
{
    *mem_addr = a[1];
}
2486 
/// Store the lower 64-bit integer element of `a` into memory.
void _mm_storel_epi64 (__m128i* mem_addr, __m128i a) pure @safe
{
    long* dest = cast(long*)mem_addr;
    long2 la = cast(long2)a;
    // BUGFIX: previously stored `a[0]`, i.e. only the sign-extended low
    // 32-bit lane of the int4 view, instead of the full low 64 bits.
    *dest = la[0];
}
unittest
{
    long[2] buffer = [0, 0];
    __m128i A = _mm_setr_epi64(0x1_0000_0001, 0x2_0000_0002);
    _mm_storel_epi64(cast(__m128i*)(buffer.ptr), A);
    long[2] correct = [0x1_0000_0001, 0];
    assert(buffer == correct);
}
2493 
/// Store the lower double-precision element of `a` into memory.
void _mm_storel_pd (double* mem_addr, __m128d a) pure @safe
{
    *mem_addr = a[0];
}

/// Store the two doubles of `a` into memory in reverse order.
/// `mem_addr` must be 16-byte aligned.
void _mm_storer_pd (double* mem_addr, __m128d a) pure
{
    __m128d* aligned = cast(__m128d*)mem_addr;
    *aligned = shufflevector!(double2, 1, 0)(a, a);
}

/// Store 128 bits (2 packed doubles) from `a` into memory.
/// `mem_addr` does not need to be aligned.
void _mm_storeu_pd (double* mem_addr, __m128d a) pure @safe
{
    storeUnaligned!double2(a, mem_addr);
}

/// Store 128 bits of integer data from `a` into memory.
/// `mem_addr` does not need to be aligned.
void _mm_storeu_si128 (__m128i* mem_addr, __m128i a) pure @safe
{
    storeUnaligned!__m128i(a, cast(int*)mem_addr);
}
2514 
/// Store 128-bits (composed of 2 packed double-precision (64-bit) floating-point elements) 
/// from a into memory using a non-temporal memory hint. mem_addr must be aligned on a 16-byte 
/// boundary or a general-protection exception may be generated.
void _mm_stream_pd (double* mem_addr, __m128d a)
{
    // BUG see `_mm_stream_ps` for an explanation why we don't implement non-temporal moves
    // Implemented as a plain aligned store; no non-temporal hint is emitted.
    __m128d* dest = cast(__m128d*)mem_addr;
    *dest = a;
}

/// Store 128-bits of integer data from a into memory using a non-temporal memory hint. 
/// mem_addr must be aligned on a 16-byte boundary or a general-protection exception 
/// may be generated.
void _mm_stream_si128 (__m128i* mem_addr, __m128i a)
{
    // BUG see `_mm_stream_ps` for an explanation why we don't implement non-temporal moves
    // Implemented as a plain aligned store; no non-temporal hint is emitted.
    __m128i* dest = cast(__m128i*)mem_addr;
    *dest = a;
}

/// Store 32-bit integer a into memory using a non-temporal hint to minimize cache 
/// pollution. If the cache line containing address mem_addr is already in the cache, 
/// the cache will be updated.
void _mm_stream_si32 (int* mem_addr, int a)
{
    // BUG see `_mm_stream_ps` for an explanation why we don't implement non-temporal moves
    // Implemented as a plain store; no non-temporal hint is emitted.
    *mem_addr = a;
}

/// Store 64-bit integer a into memory using a non-temporal hint to minimize 
/// cache pollution. If the cache line containing address mem_addr is already 
/// in the cache, the cache will be updated.
void _mm_stream_si64 (long* mem_addr, long a)
{
    // BUG See `_mm_stream_ps` for an explanation why we don't implement non-temporal moves
    // Implemented as a plain store; no non-temporal hint is emitted.
    *mem_addr = a;
}
2552 
/// Subtract packed 16-bit integers in `b` from packed 16-bit integers in `a`.
__m128i _mm_sub_epi16(__m128i a, __m128i b) pure @safe
{
    return cast(__m128i)(cast(short8)a - cast(short8)b);
}

/// Subtract packed 32-bit integers in `b` from packed 32-bit integers in `a`.
__m128i _mm_sub_epi32(__m128i a, __m128i b) pure @safe
{
    return cast(__m128i)(cast(int4)a - cast(int4)b);
}

/// Subtract packed 64-bit integers in `b` from packed 64-bit integers in `a`.
__m128i _mm_sub_epi64(__m128i a, __m128i b) pure @safe
{
    return cast(__m128i)(cast(long2)a - cast(long2)b);
}

/// Subtract packed 8-bit integers in `b` from packed 8-bit integers in `a`.
__m128i _mm_sub_epi8(__m128i a, __m128i b) pure @safe
{
    return cast(__m128i)(cast(byte16)a - cast(byte16)b);
}

/// Subtract packed double-precision elements in `b` from those in `a`.
__m128d _mm_sub_pd(__m128d a, __m128d b) pure @safe
{
    return a - b;
}
2577 
version(DigitalMars)
{
    // Work-around for https://issues.dlang.org/show_bug.cgi?id=19599
    /// Subtract the lower double of `b` from the lower double of `a`;
    /// the upper lane of `a` is passed through unchanged.
    __m128d _mm_sub_sd(__m128d a, __m128d b) pure @safe
    {
        // Inlining must stay disabled for the DMD codegen work-around above.
        pragma(inline, false);
        a[0] = a[0] - b[0];
        return a;
    }
}
else
{
    /// Subtract the lower double of `b` from the lower double of `a`;
    /// the upper lane of `a` is passed through unchanged.
    __m128d _mm_sub_sd(__m128d a, __m128d b) pure @safe
    {
        a[0] -= b[0];
        return a;
    }
}
unittest
{
    __m128d a = [1.5, -2.0];
    a = _mm_sub_sd(a, a);
    assert(a.array == [0.0, -2.0]);
}
2602 
2603 
2604 // TODO: _mm_sub_si64
2605 
version(LDC)
{
    alias _mm_subs_epi16 = __builtin_ia32_psubsw128;
}
else
{
    /// Subtract packed 16-bit signed integers in `b` from `a` using saturation.
    __m128i _mm_subs_epi16(__m128i a, __m128i b) pure @trusted
    {
        short8 sa = cast(short8)a;
        short8 sb = cast(short8)b;
        short[8] tmp;
        // Compute each difference at int width, then clamp to short range.
        for (int i = 0; i < 8; ++i)
            tmp[i] = saturateSignedIntToSignedShort(sa.array[i] - sb.array[i]);
        return _mm_loadu_si128(cast(int4*)tmp.ptr);
    }
}
unittest
{
    short8 R = cast(short8) _mm_subs_epi16(_mm_setr_epi16(32760, -32760, 5, 4, 3, 2, 1, 0),
                                           _mm_setr_epi16(-10  ,     16, 5, 4, 3, 2, 1, 0));
    static immutable short[8] expected =                [32767, -32768, 0, 0, 0, 0, 0, 0];
    assert(R.array == expected);
}
2629 
version(LDC)
{
    alias _mm_subs_epi8 = __builtin_ia32_psubsb128;
}
else
{
    /// Subtract packed 8-bit signed integers in `b` from `a` using saturation.
    __m128i _mm_subs_epi8(__m128i a, __m128i b) pure @trusted
    {
        byte16 ba = cast(byte16)a;
        byte16 bb = cast(byte16)b;
        byte[16] tmp;
        // Compute each difference at int width, then clamp to byte range.
        for (int i = 0; i < 16; ++i)
            tmp[i] = saturateSignedWordToSignedByte(ba.array[i] - bb.array[i]);
        return _mm_loadu_si128(cast(int4*)tmp.ptr);
    }
}
unittest
{
    byte16 R = cast(byte16) _mm_subs_epi8(_mm_setr_epi8(-128, 127, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0),
                                          _mm_setr_epi8(  15, -14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0));
    static immutable byte[16] expected =               [-128, 127,  0,  0,  0,  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0];
    assert(R.array == expected);
}
2653 
version(LDC)
{
    alias _mm_subs_epu16 = __builtin_ia32_psubusw128;
}
else
{
    /// Subtract packed 16-bit unsigned integers in `b` from `a`, saturating at zero.
    __m128i _mm_subs_epu16(__m128i a, __m128i b) pure @trusted
    {
        short8 sa = cast(short8)a;
        short8 sb = cast(short8)b;
        short[8] tmp;
        // Reinterpret lanes as ushort, subtract at int width, clamp to [0, 65535].
        for (int i = 0; i < 8; ++i)
        {
            int diff = cast(ushort)(sa.array[i]) - cast(ushort)(sb.array[i]);
            tmp[i] = saturateSignedIntToUnsignedShort(diff);
        }
        return _mm_loadu_si128(cast(int4*)tmp.ptr);
    }
}
unittest
{
    // FIX: removed an unused local `__m128i A` that was declared but never read.
    short8 R = cast(short8) _mm_subs_epu16(_mm_setr_epi16(cast(short)65534,  1, 5, 4, 3, 2, 1, 0),
                                           _mm_setr_epi16(cast(short)65535, 16, 4, 4, 3, 0, 1, 0));
    static immutable short[8] correct =                  [               0,  0, 1, 0, 0, 2, 0, 0];
    assert(R.array == correct);
}
2681 
version(LDC)
{
    alias _mm_subs_epu8 = __builtin_ia32_psubusb128;
}
else
{
    /// Subtract packed 8-bit unsigned integers in `b` from `a`, saturating at zero.
    __m128i _mm_subs_epu8(__m128i a, __m128i b) pure @trusted
    {
        byte16 ba = cast(byte16)a;
        byte16 bb = cast(byte16)b;
        ubyte[16] tmp;
        // Reinterpret lanes as ubyte, subtract, then clamp to [0, 255].
        for (int i = 0; i < 16; ++i)
            tmp[i] = saturateSignedWordToUnsignedByte(cast(ubyte)(ba.array[i]) - cast(ubyte)(bb.array[i]));
        return _mm_loadu_si128(cast(int4*)tmp.ptr);
    }
}
unittest
{
    byte16 R = cast(byte16) _mm_subs_epu8(_mm_setr_epi8(cast(byte)254, 127, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0),
                                          _mm_setr_epi8(cast(byte)255, 120, 14, 42, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0));
    static immutable byte[16] expected =               [            0,   7,  0,  0,  0,  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0];
    assert(R.array == expected);
}
2705 
2706 // Note: the only difference between these intrinsics is the signalling 
2707 //       behaviour of quiet NaNs. This is incorrect but the case where
2708 //       you would want to differentiate between qNaN and sNaN and then 
2709 //       treat them differently on purpose seems extremely rare.
/// Unordered comparisons forwarded to their ordered counterparts
/// (see the note above about qNaN signalling behaviour).
alias _mm_ucomieq_sd = _mm_comieq_sd;
alias _mm_ucomige_sd = _mm_comige_sd;
alias _mm_ucomigt_sd = _mm_comigt_sd;
alias _mm_ucomile_sd = _mm_comile_sd;
alias _mm_ucomilt_sd = _mm_comilt_sd;
alias _mm_ucomineq_sd = _mm_comineq_sd;

/// Return a `__m128d` with indeterminate contents (uninitialized).
__m128d _mm_undefined_pd() pure @safe
{
    __m128d result = void;
    return result;
}
/// Return a `__m128i` with indeterminate contents (uninitialized).
__m128i _mm_undefined_si128() pure @safe
{
    __m128i result = void;
    return result;
}
2727 
/// Interleave 16-bit integers from the high halves of `a` and `b`.
__m128i _mm_unpackhi_epi16 (__m128i a, __m128i b) pure @safe
{
    return cast(__m128i) shufflevector!(short8, 4, 12, 5, 13, 6, 14, 7, 15)
                                       (cast(short8)a, cast(short8)b);
}

/// Interleave 32-bit integers from the high halves of `a` and `b`.
__m128i _mm_unpackhi_epi32 (__m128i a, __m128i b) pure @safe
{
    return shufflevector!(int4, 2, 6, 3, 7)(cast(int4)a, cast(int4)b);
}

/// Interleave the high 64-bit elements of `a` and `b`.
__m128i _mm_unpackhi_epi64 (__m128i a, __m128i b) pure @safe
{
    return cast(__m128i) shufflevector!(long2, 1, 3)(cast(long2)a, cast(long2)b);
}

/// Interleave 8-bit integers from the high halves of `a` and `b`.
__m128i _mm_unpackhi_epi8 (__m128i a, __m128i b) pure @safe
{
    return cast(__m128i)shufflevector!(byte16, 8,  24,  9, 25, 10, 26, 11, 27,
                                               12, 28, 13, 29, 14, 30, 15, 31)
                                               (cast(byte16)a, cast(byte16)b);
}

/// Interleave the upper doubles of `a` and `b`.
__m128d _mm_unpackhi_pd (__m128d a, __m128d b) pure @safe
{
    return shufflevector!(__m128d, 1, 3)(a, b);
}
2755 
/// Interleave 16-bit integers from the low halves of `a` and `b`.
__m128i _mm_unpacklo_epi16 (__m128i a, __m128i b) pure @safe
{
    return cast(__m128i) shufflevector!(short8, 0, 8, 1, 9, 2, 10, 3, 11)
                                       (cast(short8)a, cast(short8)b);
}

/// Interleave 32-bit integers from the low halves of `a` and `b`.
__m128i _mm_unpacklo_epi32 (__m128i a, __m128i b) pure @safe
{
    return shufflevector!(int4, 0, 4, 1, 5)
                         (cast(int4)a, cast(int4)b);
}

/// Interleave the low 64-bit elements of `a` and `b`.
__m128i _mm_unpacklo_epi64 (__m128i a, __m128i b) pure @safe
{
    return cast(__m128i) shufflevector!(long2, 0, 2)
                                       (cast(long2)a, cast(long2)b);
}

/// Interleave 8-bit integers from the low halves of `a` and `b`.
__m128i _mm_unpacklo_epi8 (__m128i a, __m128i b) pure @safe
{
    return cast(__m128i) shufflevector!(byte16, 0, 16, 1, 17, 2, 18, 3, 19,
                                                4, 20, 5, 21, 6, 22, 7, 23)
                                       (cast(byte16)a, cast(byte16)b);
}

/// Interleave the lower doubles of `a` and `b`.
__m128d _mm_unpacklo_pd (__m128d a, __m128d b) pure @safe
{
    return shufflevector!(__m128d, 0, 2)(a, b);
}
2785 
/// Bitwise XOR of the 128 bits of `a` and `b`, viewed as doubles.
__m128d _mm_xor_pd (__m128d a, __m128d b) pure @safe
{
    return cast(__m128d)(cast(__m128i)a ^ cast(__m128i)b);
}

/// Bitwise XOR of the 128 bits of `a` and `b`.
__m128i _mm_xor_si128 (__m128i a, __m128i b) pure @safe
{
    return a ^ b;
}
2795 
unittest
{
    // Euclidean distance between two points in 4D.
    float distance(float[4] p, float[4] q) nothrow @nogc
    {
        __m128 vp = _mm_loadu_ps(p.ptr);
        __m128 vq = _mm_loadu_ps(q.ptr);
        __m128 delta = _mm_sub_ps(vp, vq);
        __m128 squared = _mm_mul_ps(delta, delta);
        // Horizontal sum of the four lanes via two shift-and-add steps.
        __m128 partial = _mm_add_ps(squared, _mm_srli_ps!8(squared));
        partial = _mm_add_ps(partial, _mm_srli_ps!4(partial));
        return _mm_cvtss_f32(_mm_sqrt_ss(partial));
    }
    assert(distance([0, 2, 0, 0], [0, 0, 0, 0]) == 2);
}