1 /**
2 * Copyright: Copyright Auburn Sounds 2016-2019.
3 * License:   $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
4 * Authors:   Guillaume Piolat
5 */
6 module inteli.emmintrin;
7 
8 public import inteli.types;
9 public import inteli.xmmintrin; // SSE2 includes SSE1
10 
11 import inteli.internals;
12 
13 nothrow @nogc:
14 
15 // SSE2 instructions
16 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#techs=SSE2
17 
/// Add packed 16-bit integers in `a` and `b`.
__m128i _mm_add_epi16 (__m128i a, __m128i b) pure @safe
{
    short8 sa = cast(short8)a;
    short8 sb = cast(short8)b;
    return cast(__m128i)(sa + sb);
}

/// Add packed 32-bit integers in `a` and `b`.
__m128i _mm_add_epi32 (__m128i a, __m128i b) pure @safe
{
    int4 sa = cast(int4)a;
    int4 sb = cast(int4)b;
    return cast(__m128i)(sa + sb);
}

/// Add packed 64-bit integers in `a` and `b`.
__m128i _mm_add_epi64 (__m128i a, __m128i b) pure @safe
{
    long2 sa = cast(long2)a;
    long2 sb = cast(long2)b;
    return cast(__m128i)(sa + sb);
}

/// Add packed 8-bit integers in `a` and `b`.
__m128i _mm_add_epi8 (__m128i a, __m128i b) pure @safe
{
    byte16 sa = cast(byte16)a;
    byte16 sb = cast(byte16)b;
    return cast(__m128i)(sa + sb);
}
37 
/// Add the lower double-precision elements of `a` and `b`; the upper element
/// of the result is copied from `a`.
__m128d _mm_add_sd(__m128d a, __m128d b) pure @safe
{
    a[0] += b[0];
    return a;
}
unittest
{
    __m128d a = [1.5, -2.0];
    a = _mm_add_sd(a, a);
    assert(a.array == [3.0, -2.0]);
}
49 
50 
/// Add packed double-precision elements in `a` and `b`.
__m128d _mm_add_pd (__m128d a, __m128d b) pure @safe
{
    return a + b;
}
unittest
{
    __m128d a = [1.5, -2.0];
    a = _mm_add_pd(a, a);
    assert(a.array == [3.0, -4.0]);
}
61 
62 // MMXREG: _mm_add_si64
63 
version(LDC)
{
    /// Add packed signed 16-bit integers in `a` and `b` using signed saturation.
    alias _mm_adds_epi16 = __builtin_ia32_paddsw128;
}
else
{
    /// Add packed signed 16-bit integers in `a` and `b` using signed saturation.
    __m128i _mm_adds_epi16(__m128i a, __m128i b) pure @trusted
    {
        short[8] res;
        short8 sa = cast(short8)a;
        short8 sb = cast(short8)b;
        foreach(i; 0..8)
            res[i] = saturateSignedIntToSignedShort(sa.array[i] + sb.array[i]);
        // @trusted: res.ptr points into a live 16-byte stack array.
        return _mm_loadu_si128(cast(int4*)res.ptr);
    }
}
unittest
{
    short8 res = cast(short8) _mm_adds_epi16(_mm_set_epi16(7, 6, 5, 4, 3, 2, 1, 0),
                                             _mm_set_epi16(7, 6, 5, 4, 3, 2, 1, 0));
    static immutable short[8] correctResult = [0, 2, 4, 6, 8, 10, 12, 14];
    assert(res.array == correctResult);
}
87 
version(LDC)
{
    /// Add packed signed 8-bit integers in `a` and `b` using signed saturation.
    alias _mm_adds_epi8 = __builtin_ia32_paddsb128;
}
else
{
    /// Add packed signed 8-bit integers in `a` and `b` using signed saturation.
    __m128i _mm_adds_epi8(__m128i a, __m128i b) pure @trusted
    {
        byte[16] res;
        byte16 sa = cast(byte16)a;
        byte16 sb = cast(byte16)b;
        foreach(i; 0..16)
            res[i] = saturateSignedWordToSignedByte(sa.array[i] + sb.array[i]);
        // @trusted: res.ptr points into a live 16-byte stack array.
        return _mm_loadu_si128(cast(int4*)res.ptr);
    }
}
unittest
{
    byte16 res = cast(byte16) _mm_adds_epi8(_mm_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0),
                                            _mm_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0));
    static immutable byte[16] correctResult = [0, 2, 4, 6, 8, 10, 12, 14,
                                               16, 18, 20, 22, 24, 26, 28, 30];
    assert(res.array == correctResult);
}
112 
version(LDC)
{
    /// Add packed unsigned 8-bit integers in `a` and `b` using unsigned saturation.
    alias _mm_adds_epu8 = __builtin_ia32_paddusb128;
}
else
{
    /// Add packed unsigned 8-bit integers in `a` and `b` using unsigned saturation.
    __m128i _mm_adds_epu8(__m128i a, __m128i b) pure @trusted
    {
        ubyte[16] res;
        byte16 sa = cast(byte16)a;
        byte16 sb = cast(byte16)b;
        foreach(i; 0..16)
            // Lanes are reinterpreted as unsigned before the widened addition.
            res[i] = saturateSignedWordToUnsignedByte(cast(ubyte)(sa.array[i]) + cast(ubyte)(sb.array[i]));
        // @trusted: res.ptr points into a live 16-byte stack array.
        return _mm_loadu_si128(cast(int4*)res.ptr);
    }
}
129 
version(LDC)
{
    /// Add packed unsigned 16-bit integers in `a` and `b` using unsigned saturation.
    alias _mm_adds_epu16 = __builtin_ia32_paddusw128;
}
else
{
    /// Add packed unsigned 16-bit integers in `a` and `b` using unsigned saturation.
    __m128i _mm_adds_epu16(__m128i a, __m128i b) pure @trusted
    {
        ushort[8] res;
        short8 sa = cast(short8)a;
        short8 sb = cast(short8)b;
        foreach(i; 0..8)
            // Lanes are reinterpreted as unsigned before the widened addition.
            res[i] = saturateSignedIntToUnsignedShort(cast(ushort)(sa.array[i]) + cast(ushort)(sb.array[i]));
        // @trusted: res.ptr points into a live 16-byte stack array.
        return _mm_loadu_si128(cast(int4*)res.ptr);
    }
}
146 
/// Compute the bitwise AND of packed double-precision elements in `a` and `b`.
__m128d _mm_and_pd (__m128d a, __m128d b) pure @safe
{
    return cast(__m128d)( cast(__m128i)a & cast(__m128i)b );
}

/// Compute the bitwise AND of 128 bits of integer data in `a` and `b`.
__m128i _mm_and_si128 (__m128i a, __m128i b) pure @safe
{
    return a & b;
}

/// Compute the bitwise NOT of `a`, then AND with `b` ((~a) & b), on packed doubles.
__m128d _mm_andnot_pd (__m128d a, __m128d b) pure @safe
{
    return cast(__m128d)( (~cast(__m128i)a) & cast(__m128i)b );
}

/// Compute the bitwise NOT of `a`, then AND with `b` ((~a) & b), on 128 integer bits.
__m128i _mm_andnot_si128 (__m128i a, __m128i b) pure @safe
{
    return (~a) & b;
}
166 
version(LDC)
{
    /// Average packed unsigned 16-bit integers in `a` and `b`, rounding up:
    /// (a + b + 1) >> 1 computed without intermediate overflow.
    __m128i _mm_avg_epu16 (__m128i a, __m128i b) pure @safe
    {
        // Generates pavgw even in LDC 1.0, even in -O0
        enum ir = `
            %ia = zext <8 x i16> %0 to <8 x i32>
            %ib = zext <8 x i16> %1 to <8 x i32>
            %isum = add <8 x i32> %ia, %ib
            %isum1 = add <8 x i32> %isum, < i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
            %isums = lshr <8 x i32> %isum1, < i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
            %r = trunc <8 x i32> %isums to <8 x i16>
            ret <8 x i16> %r`;
        return cast(__m128i) LDCInlineIR!(ir, short8, short8, short8)(cast(short8)a, cast(short8)b);        
    }
}
else
{
    /// Average packed unsigned 16-bit integers in `a` and `b`, rounding up:
    /// (a + b + 1) >> 1 computed without intermediate overflow.
    __m128i _mm_avg_epu16 (__m128i a, __m128i b) pure @safe
    {
        short8 sa = cast(short8)a;
        short8 sb = cast(short8)b;
        short8 sr = void;
        foreach(i; 0..8)
        {
            // ushort + ushort promotes to int, so the +1 cannot overflow.
            sr[i] = cast(ushort)( (cast(ushort)(sa[i]) + cast(ushort)(sb[i]) + 1) >> 1 );
        }
        return cast(int4)sr;
    }
}
unittest
{
    __m128i A = _mm_set1_epi16(31);
    __m128i B = _mm_set1_epi16(64);
    short8 avg = cast(short8)(_mm_avg_epu16(A, B));
    foreach(i; 0..8)
        assert(avg[i] == 48);
}
205 
version(LDC)
{
    /// Average packed unsigned 8-bit integers in `a` and `b`, rounding up:
    /// (a + b + 1) >> 1 computed without intermediate overflow.
    __m128i _mm_avg_epu8 (__m128i a, __m128i b) pure @safe
    {
        // Generates pavgb even in LDC 1.0, even in -O0
        enum ir = `
            %ia = zext <16 x i8> %0 to <16 x i16>
            %ib = zext <16 x i8> %1 to <16 x i16>
            %isum = add <16 x i16> %ia, %ib
            %isum1 = add <16 x i16> %isum, < i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
            %isums = lshr <16 x i16> %isum1, < i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
            %r = trunc <16 x i16> %isums to <16 x i8>
            ret <16 x i8> %r`;
        return cast(__m128i) LDCInlineIR!(ir, byte16, byte16, byte16)(cast(byte16)a, cast(byte16)b);        
    }
}
else
{
    /// Average packed unsigned 8-bit integers in `a` and `b`, rounding up:
    /// (a + b + 1) >> 1 computed without intermediate overflow.
    /// Fix: added `pure @safe`, which was missing here although present on the
    /// LDC branch and on `_mm_avg_epu16`, keeping attributes consistent for callers.
    __m128i _mm_avg_epu8 (__m128i a, __m128i b) pure @safe
    {
        byte16 sa = cast(byte16)a;
        byte16 sb = cast(byte16)b;
        byte16 sr = void;
        foreach(i; 0..16)
        {
            // ubyte + ubyte promotes to int, so the +1 cannot overflow.
            sr[i] = cast(ubyte)( (cast(ubyte)(sa[i]) + cast(ubyte)(sb[i]) + 1) >> 1 );
        }
        return cast(int4)sr;
    }
}
unittest
{
    __m128i A = _mm_set1_epi8(31);
    __m128i B = _mm_set1_epi8(64);
    byte16 avg = cast(byte16)(_mm_avg_epu8(A, B));
    foreach(i; 0..16)
        assert(avg[i] == 48);
}
244 
// Note: unlike Intel API, shift amount is a compile-time parameter.
/// Shift `a` left by `bits` bytes while shifting in zeros.
__m128i _mm_bslli_si128(int bits)(__m128i a) pure @safe
{
    // Generates pslldq starting with LDC 1.1 -O2
    // Shuffling a zero vector with `a` at byte offsets 16-bits..31-bits
    // selects zeros for the low `bits` bytes and shifted bytes of `a` above.
    __m128i zero = _mm_setzero_si128();
    return cast(__m128i) 
        shufflevector!(byte16, 16 - bits, 17 - bits, 18 - bits, 19 - bits,
                               20 - bits, 21 - bits, 22 - bits, 23 - bits,
                               24 - bits, 25 - bits, 26 - bits, 27 - bits,
                               28 - bits, 29 - bits, 30 - bits, 31 - bits)
        (cast(byte16)zero, cast(byte16)a);
}
unittest
{
    __m128i toShift = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
    byte[16] exact =              [0, 0, 0, 0, 0, 0, 1, 2, 3, 4,  5,  6,  7,  8,  9, 10];
    __m128i result = _mm_bslli_si128!5(toShift);
    assert(  (cast(byte16)result).array == exact);
}
264 
// Note: unlike Intel API, shift amount is a compile-time parameter.
/// Shift `a` right by `bits` bytes while shifting in zeros.
__m128i _mm_bsrli_si128(int bits)(__m128i a) pure @safe
{
    // Generates psrldq starting with LDC 1.1 -O2
    // Shuffling `a` with a zero vector at byte offsets bits..15+bits
    // selects shifted bytes of `a` and zeros above index 15.
    __m128i zero = _mm_setzero_si128();
    return  cast(__m128i) 
        shufflevector!(byte16, 0 + bits, 1 + bits, 2 + bits, 3 + bits,
                               4 + bits, 5 + bits, 6 + bits, 7 + bits,
                               8 + bits, 9 + bits, 10 + bits, 11 + bits,
                               12 + bits, 13 + bits, 14 + bits, 15 + bits)
        (cast(byte16)a, cast(byte16)zero);
}
unittest
{
    __m128i toShift = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
    byte[16] exact =               [5, 6, 7, 8, 9,10,11,12,13,14, 15,  0,  0,  0,  0,  0];
    __m128i result = _mm_bsrli_si128!5(toShift);
    assert( (cast(byte16)result).array == exact);
}
284 
/// Reinterpret vector of type `__m128d` as `__m128` (no conversion, bits unchanged).
__m128 _mm_castpd_ps (__m128d a) pure @safe
{
    return cast(__m128)a;
}

/// Reinterpret vector of type `__m128d` as `__m128i` (no conversion, bits unchanged).
__m128i _mm_castpd_si128 (__m128d a) pure @safe
{
    return cast(__m128i)a;
}

/// Reinterpret vector of type `__m128` as `__m128d` (no conversion, bits unchanged).
__m128d _mm_castps_pd (__m128 a) pure @safe
{
    return cast(__m128d)a;
}

/// Reinterpret vector of type `__m128` as `__m128i` (no conversion, bits unchanged).
__m128i _mm_castps_si128 (__m128 a) pure @safe
{
    return cast(__m128i)a;
}

/// Reinterpret vector of type `__m128i` as `__m128d` (no conversion, bits unchanged).
__m128d _mm_castsi128_pd (__m128i a) pure @safe
{
    return cast(__m128d)a;
}

/// Reinterpret vector of type `__m128i` as `__m128` (no conversion, bits unchanged).
__m128 _mm_castsi128_ps (__m128i a) pure @safe
{
    return cast(__m128)a;
}
314 
version(LDC)
{
    /// Invalidate and flush the cache line containing `p` from all cache levels.
    alias _mm_clflush = __builtin_ia32_clflush;
}
else
{
    /// Invalidate and flush the cache line containing `p` from all cache levels.
    // NOTE(review): marked pure @safe although clflush affects caches; no D-visible
    // state changes, which is presumably why the attributes are acceptable here.
    void _mm_clflush (const(void)* p) pure @safe
    {
        version(D_InlineAsm_X86)
        {
            asm pure nothrow @nogc @safe
            {
                mov EAX, p;
                clflush [EAX];
            }
        }
        else version(D_InlineAsm_X86_64)
        {
            asm pure nothrow @nogc @safe
            {
                mov RAX, p;
                clflush [RAX];
            }
        }
        else
            static assert(false, "Should implement clflush for this compiler");
    }
}
unittest
{
    ubyte[64] cacheline;
    _mm_clflush(cacheline.ptr);
}
348 
349 
/// Compare packed 16-bit integers in `a` and `b` for equality.
/// Each equal lane yields all ones (-1), others yield zero.
__m128i _mm_cmpeq_epi16 (__m128i a, __m128i b) pure @safe
{
    return cast(__m128i) equalMask!short8(cast(short8)a, cast(short8)b);
}
unittest
{
    short8   A = [-3, -2, -1,  0,  0,  1,  2,  3];
    short8   B = [ 4,  3,  2,  1,  0, -1, -2, -3];
    short[8] E = [ 0,  0,  0,  0, -1,  0,  0,  0];
    short8   R = cast(short8)(_mm_cmpeq_epi16(cast(__m128i)A, cast(__m128i)B));
    assert(R.array == E);
}
362 
/// Compare packed 32-bit integers in `a` and `b` for equality.
/// Each equal lane yields all ones (-1), others yield zero.
__m128i _mm_cmpeq_epi32 (__m128i a, __m128i b) pure @safe
{
    return equalMask!__m128i(a, b);
}
unittest
{
    int4   A = [-3, -2, -1,  0];
    int4   B = [ 4, -2,  2,  0];
    int[4] E = [ 0, -1,  0, -1];
    // Fix: this test previously called _mm_cmpeq_epi16 and only passed by
    // coincidence; it must exercise _mm_cmpeq_epi32.
    int4   R = cast(int4)(_mm_cmpeq_epi32(A, B));
    assert(R.array == E);
}
375 
/// Compare packed 8-bit integers in `a` and `b` for equality.
/// Each equal lane yields all ones (-1), others yield zero.
__m128i _mm_cmpeq_epi8 (__m128i a, __m128i b) pure @safe
{
    return cast(__m128i) equalMask!byte16(cast(byte16)a, cast(byte16)b);
}
unittest
{
    __m128i A = _mm_setr_epi8(1, 2, 3, 1, 2, 1, 1, 2, 3, 2, 1, 0, 0, 1, 2, 1);
    __m128i B = _mm_setr_epi8(2, 2, 1, 2, 3, 1, 2, 3, 2, 1, 0, 0, 1, 2, 1, 1);
    byte16 C = cast(byte16) _mm_cmpeq_epi8(A, B);
    byte[16] correct =       [0,-1, 0, 0, 0,-1, 0, 0, 0, 0, 0,-1, 0, 0, 0, -1];
    // Fix: removed an unused duplicate call (`__m128i D = _mm_cmpeq_epi8(A, B);`)
    // that asserted nothing.
    assert(C.array == correct);
}
389 
390 
391 
/// Compare packed doubles in `a` and `b` for equality (ordered comparison).
__m128d _mm_cmpeq_pd (__m128d a, __m128d b) pure @safe
{
    return cast(__m128d) cmppd!(FPComparison.oeq)(a, b);
}

/// Compare the lower doubles in `a` and `b` for equality (ordered comparison).
__m128d _mm_cmpeq_sd (__m128d a, __m128d b) pure @safe
{
    return cast(__m128d) cmpsd!(FPComparison.oeq)(a, b);
}

/// Compare packed doubles in `a` and `b` for greater-than-or-equal (ordered).
__m128d _mm_cmpge_pd (__m128d a, __m128d b) pure @safe
{
    return cast(__m128d) cmppd!(FPComparison.oge)(a, b);
}

/// Compare the lower doubles in `a` and `b` for greater-than-or-equal (ordered).
__m128d _mm_cmpge_sd (__m128d a, __m128d b) pure @safe
{
    return cast(__m128d) cmpsd!(FPComparison.oge)(a, b);
}
411 
/// Compare packed signed 16-bit integers in `a` and `b` for greater-than.
/// Each lane where a > b yields all ones (-1), others yield zero.
__m128i _mm_cmpgt_epi16 (__m128i a, __m128i b) pure @safe
{
    return cast(__m128i)( greaterMask!short8(cast(short8)a, cast(short8)b));
}
unittest
{
    short8   A = [-3, -2, -1,  0,  0,  1,  2,  3];
    short8   B = [ 4,  3,  2,  1,  0, -1, -2, -3];
    short[8] E = [ 0,  0,  0,  0,  0, -1, -1, -1];
    short8   R = cast(short8)(_mm_cmpgt_epi16(cast(__m128i)A, cast(__m128i)B));
    assert(R.array == E);
}
424 
/// Compare packed signed 32-bit integers in `a` and `b` for greater-than.
/// Each lane where a > b yields all ones (-1), others yield zero.
__m128i _mm_cmpgt_epi32 (__m128i a, __m128i b) pure @safe
{
    return cast(__m128i)( greaterMask!int4(a, b));
}
unittest
{
    int4   A = [-3,  2, -1,  0];
    int4   B = [ 4, -2,  2,  0];
    int[4] E = [ 0, -1,  0,  0];
    int4   R = cast(int4)(_mm_cmpgt_epi32(A, B));
    assert(R.array == E);
}
437 
/// Compare packed signed 8-bit integers in `a` and `b` for greater-than.
/// Each lane where a > b yields all ones (-1), others yield zero.
__m128i _mm_cmpgt_epi8 (__m128i a, __m128i b) pure @safe
{
    return cast(__m128i)( greaterMask!byte16(cast(byte16)a, cast(byte16)b));
}
unittest
{
    __m128i A = _mm_setr_epi8(1, 2, 3, 1, 2, 1, 1, 2, 3, 2, 1, 0, 0, 1, 2, 1);
    __m128i B = _mm_setr_epi8(2, 2, 1, 2, 3, 1, 2, 3, 2, 1, 0, 0, 1, 2, 1, 1);
    byte16 C = cast(byte16) _mm_cmpgt_epi8(A, B);
    byte[16] correct =       [0, 0,-1, 0, 0, 0, 0, 0,-1,-1,-1, 0, 0, 0,-1, 0];
    // Fix: removed an unused `__m128i D = _mm_cmpeq_epi8(A, B);` leftover,
    // copy-pasted from the _mm_cmpeq_epi8 test, which asserted nothing.
    assert(C.array == correct);
}
451 
/// Compare packed doubles in `a` and `b` for greater-than (ordered).
__m128d _mm_cmpgt_pd (__m128d a, __m128d b) pure @safe
{
    return cast(__m128d) cmppd!(FPComparison.ogt)(a, b);
}

/// Compare the lower doubles in `a` and `b` for greater-than (ordered).
__m128d _mm_cmpgt_sd (__m128d a, __m128d b) pure @safe
{
    return cast(__m128d) cmpsd!(FPComparison.ogt)(a, b);
}

/// Compare packed doubles in `a` and `b` for less-than-or-equal (ordered).
__m128d _mm_cmple_pd (__m128d a, __m128d b) pure @safe
{
    return cast(__m128d) cmppd!(FPComparison.ole)(a, b);
}

/// Compare the lower doubles in `a` and `b` for less-than-or-equal (ordered).
__m128d _mm_cmple_sd (__m128d a, __m128d b) pure @safe
{
    return cast(__m128d) cmpsd!(FPComparison.ole)(a, b);
}

/// Compare packed signed 16-bit integers in `a` and `b` for less-than.
__m128i _mm_cmplt_epi16 (__m128i a, __m128i b) pure @safe
{
    return _mm_cmpgt_epi16(b, a);
}

/// Compare packed signed 32-bit integers in `a` and `b` for less-than.
__m128i _mm_cmplt_epi32 (__m128i a, __m128i b) pure @safe
{
    return _mm_cmpgt_epi32(b, a);
}

/// Compare packed signed 8-bit integers in `a` and `b` for less-than.
__m128i _mm_cmplt_epi8 (__m128i a, __m128i b) pure @safe
{
    return _mm_cmpgt_epi8(b, a);
}

/// Compare packed doubles in `a` and `b` for less-than (ordered).
__m128d _mm_cmplt_pd (__m128d a, __m128d b) pure @safe
{
    return cast(__m128d) cmppd!(FPComparison.olt)(a, b);
}

/// Compare the lower doubles in `a` and `b` for less-than (ordered).
__m128d _mm_cmplt_sd (__m128d a, __m128d b) pure @safe
{
    return cast(__m128d) cmpsd!(FPComparison.olt)(a, b);
}

/// Compare packed doubles in `a` and `b` for not-equal (unordered: true on NaN).
__m128d _mm_cmpneq_pd (__m128d a, __m128d b) pure @safe
{
    return cast(__m128d) cmppd!(FPComparison.une)(a, b);
}

/// Compare the lower doubles in `a` and `b` for not-equal (unordered: true on NaN).
__m128d _mm_cmpneq_sd (__m128d a, __m128d b) pure @safe
{
    return cast(__m128d) cmpsd!(FPComparison.une)(a, b);
}

/// Compare packed doubles in `a` and `b` for not-greater-than-or-equal (true on NaN).
__m128d _mm_cmpnge_pd (__m128d a, __m128d b) pure @safe
{
    return cast(__m128d) cmppd!(FPComparison.ult)(a, b);
}

/// Compare the lower doubles in `a` and `b` for not-greater-than-or-equal (true on NaN).
__m128d _mm_cmpnge_sd (__m128d a, __m128d b) pure @safe
{
    return cast(__m128d) cmpsd!(FPComparison.ult)(a, b);
}

/// Compare packed doubles in `a` and `b` for not-greater-than (true on NaN).
__m128d _mm_cmpngt_pd (__m128d a, __m128d b) pure @safe
{
    return cast(__m128d) cmppd!(FPComparison.ule)(a, b);
}

/// Compare the lower doubles in `a` and `b` for not-greater-than (true on NaN).
__m128d _mm_cmpngt_sd (__m128d a, __m128d b) pure @safe
{
    return cast(__m128d) cmpsd!(FPComparison.ule)(a, b);
}

/// Compare packed doubles in `a` and `b` for not-less-than-or-equal (true on NaN).
__m128d _mm_cmpnle_pd (__m128d a, __m128d b) pure @safe
{
    return cast(__m128d) cmppd!(FPComparison.ugt)(a, b);
}

/// Compare the lower doubles in `a` and `b` for not-less-than-or-equal (true on NaN).
__m128d _mm_cmpnle_sd (__m128d a, __m128d b) pure @safe
{
    return cast(__m128d) cmpsd!(FPComparison.ugt)(a, b);
}

/// Compare packed doubles in `a` and `b` for not-less-than (true on NaN).
__m128d _mm_cmpnlt_pd (__m128d a, __m128d b) pure @safe
{
    return cast(__m128d) cmppd!(FPComparison.uge)(a, b);
}

/// Compare the lower doubles in `a` and `b` for not-less-than (true on NaN).
__m128d _mm_cmpnlt_sd (__m128d a, __m128d b) pure @safe
{
    return cast(__m128d) cmpsd!(FPComparison.uge)(a, b);
}

/// Check packed doubles in `a` and `b` for orderedness (neither lane is NaN).
__m128d _mm_cmpord_pd (__m128d a, __m128d b) pure @safe
{
    return cast(__m128d) cmppd!(FPComparison.ord)(a, b);
}

/// Check the lower doubles in `a` and `b` for orderedness (neither is NaN).
__m128d _mm_cmpord_sd (__m128d a, __m128d b) pure @safe
{
    return cast(__m128d) cmpsd!(FPComparison.ord)(a, b);
}

/// Check packed doubles in `a` and `b` for unorderedness (either lane is NaN).
__m128d _mm_cmpunord_pd (__m128d a, __m128d b) pure @safe
{
    return cast(__m128d) cmppd!(FPComparison.uno)(a, b);
}

/// Check the lower doubles in `a` and `b` for unorderedness (either is NaN).
__m128d _mm_cmpunord_sd (__m128d a, __m128d b) pure @safe
{
    return cast(__m128d) cmpsd!(FPComparison.uno)(a, b);
}
566 
567 
// Note: we've reverted clang and GCC behaviour with regards to EFLAGS.
// Some such comparisons yield true for NaNs, others don't.
570 
/// Compare the lower doubles for equality; returns 1 on equality or NaN, else 0.
int _mm_comieq_sd (__m128d a, __m128d b) pure @safe
{
    return comsd!(FPComparison.ueq)(a, b); // yields true for NaN, same as GCC
}

/// Compare the lower doubles for greater-or-equal; returns 0 on NaN.
int _mm_comige_sd (__m128d a, __m128d b) pure @safe
{
    return comsd!(FPComparison.oge)(a, b);
}

/// Compare the lower doubles for greater-than; returns 0 on NaN.
int _mm_comigt_sd (__m128d a, __m128d b) pure @safe
{
    return comsd!(FPComparison.ogt)(a, b);
}

/// Compare the lower doubles for less-or-equal; returns 1 on NaN.
int _mm_comile_sd (__m128d a, __m128d b) pure @safe
{
    return comsd!(FPComparison.ule)(a, b); // yields true for NaN, same as GCC
}

/// Compare the lower doubles for less-than; returns 1 on NaN.
int _mm_comilt_sd (__m128d a, __m128d b) pure @safe
{
    return comsd!(FPComparison.ult)(a, b); // yields true for NaN, same as GCC
}

/// Compare the lower doubles for inequality; returns 0 on NaN.
int _mm_comineq_sd (__m128d a, __m128d b) pure @safe
{
    return comsd!(FPComparison.one)(a, b);
}
600 
version(LDC)
{
    /// Convert the two lower packed 32-bit integers in `a` to packed doubles.
     __m128d _mm_cvtepi32_pd (__m128i a) pure  @safe
    {
        // Generates cvtdq2pd since LDC 1.0, even without optimizations
        enum ir = `
            %v = shufflevector <4 x i32> %0,<4 x i32> %0, <2 x i32> <i32 0, i32 1>
            %r = sitofp <2 x i32> %v to <2 x double>
            ret <2 x double> %r`;
        return cast(__m128d) LDCInlineIR!(ir, __m128d, __m128i)(a);
    }
}
else
{
    /// Convert the two lower packed 32-bit integers in `a` to packed doubles.
    __m128d _mm_cvtepi32_pd (__m128i a) pure  @safe
    {
        double2 r = void;
        r[0] = a[0];
        r[1] = a[1];
        return r;
    }
}
unittest
{
    __m128d A = _mm_cvtepi32_pd(_mm_set1_epi32(54));
    assert(A[0] == 54.0);
    assert(A[1] == 54.0);
}
629 
630 // PERF: verify the instruction generated
/// Convert packed 32-bit integers in `a` to packed single-precision floats.
__m128 _mm_cvtepi32_ps(__m128i a) pure @safe
{
    __m128 res;
    res.array[0] = cast(float)a.array[0];
    res.array[1] = cast(float)a.array[1];
    res.array[2] = cast(float)a.array[2];
    res.array[3] = cast(float)a.array[3];
    return res;
}
unittest
{
    __m128 a = _mm_cvtepi32_ps(_mm_setr_epi32(-1, 0, 1, 1000));
    assert(a.array == [-1.0f, 0.0f, 1.0f, 1000.0f]);
}
645 
646 
version(LDC) 
{
    /// Convert packed doubles in `a` to packed 32-bit integers, rounding
    /// according to the current MXCSR rounding mode; upper two lanes are zero.
    // Like in clang, implemented with a magic intrinsic right now
    alias _mm_cvtpd_epi32 = __builtin_ia32_cvtpd2dq;

/* Unfortunately this generates a cvttpd2dq instruction
    __m128i _mm_cvtpd_epi32 (__m128d a) pure  @safe
    {
        enum ir = `
            %i = fptosi <2 x double> %0 to <2 x i32>
            %r = shufflevector <2 x i32> %i,<2 x i32> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 2, i32 3>        
            ret <4 x i32> %r`;

        return cast(__m128i) inlineIR!(ir, __m128i, __m128d)(a);
    } */
}
else
{
    /// Convert packed doubles in `a` to packed 32-bit integers, rounding
    /// according to the current MXCSR rounding mode; upper two lanes are zero.
    __m128i _mm_cvtpd_epi32 (__m128d a) pure @safe
    {
        __m128i r = _mm_setzero_si128();
        r[0] = convertDoubleToInt32UsingMXCSR(a[0]);
        r[1] = convertDoubleToInt32UsingMXCSR(a[1]);
        return r; 
    }
}
unittest
{
    int4 A = _mm_cvtpd_epi32(_mm_set_pd(61.0, 55.0));
    assert(A[0] == 55 && A[1] == 61 && A[2] == 0 && A[3] == 0);
}
678 
679 // MMXREG: _mm_cvtpd_pi32
680 
version(LDC)
{
    /// Convert packed doubles in `a` to packed floats; upper two lanes are zero.
    alias _mm_cvtpd_ps = __builtin_ia32_cvtpd2ps; // can't be done with IR unfortunately
}
else
{
    /// Convert packed doubles in `a` to packed floats; upper two lanes are zero.
    __m128 _mm_cvtpd_ps (__m128d a) pure @safe
    {
        __m128 r = void;
        r[0] = a[0];
        r[1] = a[1];
        r[2] = 0;
        r[3] = 0;
        return r;
    }
}
unittest
{
    __m128d A = _mm_set_pd(5.25, 4.0);
    __m128 B = _mm_cvtpd_ps(A);
    assert(B.array == [4.0f, 5.25f, 0, 0]);
}
703 
704 // MMXREG: _mm_cvtpi32_pd
705 
version(LDC)
{
    /// Convert packed floats in `a` to packed 32-bit integers,
    /// rounding according to the current MXCSR rounding mode.
    alias _mm_cvtps_epi32 = __builtin_ia32_cvtps2dq;
}
else
{
    /// Convert packed floats in `a` to packed 32-bit integers,
    /// rounding according to the current MXCSR rounding mode.
    __m128i _mm_cvtps_epi32 (__m128 a) pure @safe
    {
        __m128i r = void;
        r[0] = convertFloatToInt32UsingMXCSR(a[0]);
        r[1] = convertFloatToInt32UsingMXCSR(a[1]);
        r[2] = convertFloatToInt32UsingMXCSR(a[2]);
        r[3] = convertFloatToInt32UsingMXCSR(a[3]);
        return r; 
    }
}
unittest
{
    // Exercise all four MXCSR rounding modes, restoring the saved mode at the end.
    uint savedRounding = _MM_GET_ROUNDING_MODE();

    _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST);
    __m128i A = _mm_cvtps_epi32(_mm_setr_ps(1.4f, -2.1f, 53.5f, -2.9f));
    assert(A.array == [1, -2, 54, -3]);

    _MM_SET_ROUNDING_MODE(_MM_ROUND_DOWN);
    A = _mm_cvtps_epi32(_mm_setr_ps(1.4f, -2.1f, 53.5f, -2.9f));
    assert(A.array == [1, -3, 53, -3]);

    _MM_SET_ROUNDING_MODE(_MM_ROUND_UP);
    A = _mm_cvtps_epi32(_mm_setr_ps(1.4f, -2.1f, 53.5f, -2.9f));
    assert(A.array == [2, -2, 54, -2]);

    _MM_SET_ROUNDING_MODE(_MM_ROUND_TOWARD_ZERO);
    A = _mm_cvtps_epi32(_mm_setr_ps(1.4f, -2.1f, 53.5f, -2.9f));
    assert(A.array == [1, -2, 53, -2]);

    _MM_SET_ROUNDING_MODE(savedRounding);
}
744 
745 
version(LDC)
{
    /// Convert the two lower packed floats in `a` to packed doubles.
    __m128d _mm_cvtps_pd (__m128 a) pure  @safe
    {
        // Generates cvtps2pd since LDC 1.0, no opt
        enum ir = `
            %v = shufflevector <4 x float> %0,<4 x float> %0, <2 x i32> <i32 0, i32 1>
            %r = fpext <2 x float> %v to <2 x double>
            ret <2 x double> %r`;
        return cast(__m128d) LDCInlineIR!(ir, __m128d, __m128)(a);
    }
}
else
{
    /// Convert the two lower packed floats in `a` to packed doubles.
     __m128d _mm_cvtps_pd (__m128 a) pure  @safe
    {
        double2 r = void;
        r[0] = a[0];
        r[1] = a[1];
        return r;
    }
}
unittest
{
    __m128d A = _mm_cvtps_pd(_mm_set1_ps(54.0f));
    assert(A[0] == 54.0);
    assert(A[1] == 54.0);
}
774 
/// Copy the lower double-precision element of `a` to a scalar double.
double _mm_cvtsd_f64 (__m128d a) pure @safe
{
    return extractelement!(double2, 0)(a);
}
779 
version(LDC)
{
    /// Convert the lower double in `a` to a 32-bit integer,
    /// rounding according to the current MXCSR rounding mode.
    alias _mm_cvtsd_si32 = __builtin_ia32_cvtsd2si;
}
else
{
    /// Convert the lower double in `a` to a 32-bit integer,
    /// rounding according to the current MXCSR rounding mode.
    int _mm_cvtsd_si32 (__m128d a) pure @safe
    {
        return convertDoubleToInt32UsingMXCSR(a[0]);
    }
}
unittest
{
    assert(4 == _mm_cvtsd_si32(_mm_set1_pd(4.0)));
}
795 
version(LDC)
{
    /// Convert the lower double in `a` to a 64-bit integer,
    /// rounding according to the current MXCSR rounding mode.
    // Unfortunately this builtin crashes in 32-bit
    version(X86_64)
        alias _mm_cvtsd_si64 = __builtin_ia32_cvtsd2si64;
    else
    {
        ///ditto
        long _mm_cvtsd_si64 (__m128d a) pure @safe
        {
            return convertDoubleToInt64UsingMXCSR(a[0]);
        }
    }
}
else
{
    /// Convert the lower double in `a` to a 64-bit integer,
    /// rounding according to the current MXCSR rounding mode.
    long _mm_cvtsd_si64 (__m128d a) pure @safe
    {
        return convertDoubleToInt64UsingMXCSR(a[0]);
    }
}
unittest
{
    assert(-4 == _mm_cvtsd_si64(_mm_set1_pd(-4.0)));

    // Exercise all four MXCSR rounding modes, restoring the saved mode at the end.
    uint savedRounding = _MM_GET_ROUNDING_MODE();

    _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST);
    assert(-56468486186 == _mm_cvtsd_si64(_mm_set1_pd(-56468486186.5)));

    _MM_SET_ROUNDING_MODE(_MM_ROUND_DOWN);
    assert(-56468486187 == _mm_cvtsd_si64(_mm_set1_pd(-56468486186.1)));

    _MM_SET_ROUNDING_MODE(_MM_ROUND_UP);
    assert(56468486187 == _mm_cvtsd_si64(_mm_set1_pd(56468486186.1)));

    _MM_SET_ROUNDING_MODE(_MM_ROUND_TOWARD_ZERO);
    assert(-56468486186 == _mm_cvtsd_si64(_mm_set1_pd(-56468486186.9)));

    _MM_SET_ROUNDING_MODE(savedRounding);    
}

/// Same as `_mm_cvtsd_si64`.
alias _mm_cvtsd_si64x = _mm_cvtsd_si64;
838 
/// Convert the lower double in `b` to a float stored in the lower element of
/// the result; upper three floats are copied from `a`.
__m128 _mm_cvtsd_ss (__m128 a, __m128d b) pure @safe
{
    // Generates cvtsd2ss since LDC 1.3 -O0
    a[0] = b[0];
    return a;
}
unittest
{
    __m128 R = _mm_cvtsd_ss(_mm_set1_ps(4.0f), _mm_set1_pd(3.0));
    assert(R.array == [3.0f, 4.0f, 4.0f, 4.0f]);
}
850 
/// Copy the lowest 32-bit integer of `a` to an int.
int _mm_cvtsi128_si32 (__m128i a) pure @safe
{
    return a[0];
}

/// Copy the lowest 64-bit integer of `a` to a long.
long _mm_cvtsi128_si64 (__m128i a) pure @safe
{
    long2 la = cast(long2)a;
    return la[0];
}
/// Same as `_mm_cvtsi128_si64`.
alias _mm_cvtsi128_si64x = _mm_cvtsi128_si64;
862 
/// Convert integer `x` to a double stored in the lower element of the result;
/// the upper element is copied from `v`.
__m128d _mm_cvtsi32_sd(__m128d v, int x) pure @safe
{
    v[0] = cast(double)x;
    return v;
}
unittest
{
    __m128d a = _mm_cvtsi32_sd(_mm_set1_pd(0.0f), 42);
    assert(a.array == [42.0, 0]);
}
873 
/// Copy 32-bit integer `a` to the lowest element of the result,
/// zeroing the upper three elements.
__m128i _mm_cvtsi32_si128 (int a) pure @safe
{
    int4 r = [a, 0, 0, 0];
    return r;
}
unittest
{
    __m128i a = _mm_cvtsi32_si128(65);
    assert(a.array == [65, 0, 0, 0]);
}
885 
886 
// Note: on macOS, using "llvm.x86.sse2.cvtsi642sd" was buggy
/// Convert 64-bit integer `x` to a double stored in the lower element of the
/// result; the upper element is copied from `v`.
__m128d _mm_cvtsi64_sd(__m128d v, long x) pure @safe
{
    v[0] = cast(double)x;
    return v;
}
unittest
{
    __m128d a = _mm_cvtsi64_sd(_mm_set1_pd(0.0f), 42);
    assert(a.array == [42.0, 0]);
}
898 
/// Copy 64-bit integer `a` to the lower element of the result,
/// zeroing the upper element.
__m128i _mm_cvtsi64_si128 (long a) pure @safe
{
    long2 r = [a, 0];
    return cast(__m128i) r;
}

/// Same as `_mm_cvtsi64_sd`.
alias _mm_cvtsi64x_sd = _mm_cvtsi64_sd;
/// Same as `_mm_cvtsi64_si128`.
alias _mm_cvtsi64x_si128 = _mm_cvtsi64_si128;
908 
/// Convert the lower float in `x` to a double stored in the lower element of
/// the result; the upper element is copied from `v`.
double2 _mm_cvtss_sd(double2 v, float4 x) pure @safe
{
    v[0] = x[0];
    return v;
}
unittest
{
    __m128d a = _mm_cvtss_sd(_mm_set1_pd(0.0f), _mm_set1_ps(42.0f));
    assert(a.array == [42.0, 0]);
}
919 
/// Convert the lower float in `a` to a 64-bit integer with truncation.
long _mm_cvttss_si64 (__m128 a) pure @safe
{
    return cast(long)(a[0]); // Generates cvttss2si as expected
}
unittest
{
    assert(1 == _mm_cvttss_si64(_mm_setr_ps(1.9f, 2.0f, 3.0f, 4.0f)));
}
928 
version(LDC)
{
    /// Convert packed doubles in `a` to packed 32-bit integers with truncation;
    /// upper two lanes of the result are zero.
    alias _mm_cvttpd_epi32 = __builtin_ia32_cvttpd2dq;
}
else
{
    /// Convert packed doubles in `a` to packed 32-bit integers with truncation;
    /// upper two lanes of the result are zero.
    __m128i _mm_cvttpd_epi32 (__m128d a) pure @safe
    {
        // Note: doesn't generate cvttpd2dq as of LDC 1.13
        __m128i r;
        r[0] = cast(int)a[0];
        r[1] = cast(int)a[1];
        r[2] = 0;
        r[3] = 0;
        return r;
    }
}
unittest
{
    __m128i R = _mm_cvttpd_epi32(_mm_setr_pd(-4.9, 45641.5f));
    assert(R.array == [-4, 45641, 0, 0]);
}
951 
952 //MMXREG: _mm_cvttpd_pi32
953 
/// Convert packed floats in `a` to packed 32-bit integers with truncation.
__m128i _mm_cvttps_epi32 (__m128 a) pure @safe
{
    // Note: Generates cvttps2dq since LDC 1.3 -O2
    __m128i r;
    r[0] = cast(int)a[0];
    r[1] = cast(int)a[1];
    r[2] = cast(int)a[2];
    r[3] = cast(int)a[3];
    return r;
}
unittest
{
    __m128i R = _mm_cvttps_epi32(_mm_setr_ps(-4.9, 45641.5f, 0.0f, 1.0f));
    assert(R.array == [-4, 45641, 0, 1]);
}
969 
/// Convert the lower double in `a` to a 32-bit integer with truncation.
/// Fix: added `pure @safe`, missing here although the same cast-based pattern
/// is attributed elsewhere in this module (e.g. `_mm_cvttps_epi32`).
int _mm_cvttsd_si32 (__m128d a) pure @safe
{
    // Generates cvttsd2si since LDC 1.3 -O0
    return cast(int)a[0];
}
unittest
{
    assert(-4 == _mm_cvttsd_si32(_mm_set1_pd(-4.9)));
}
975 
/// Convert the lower double in `a` to a 64-bit integer with truncation.
/// Fix: added `pure @safe`, missing here although the same cast-based pattern
/// is attributed elsewhere in this module (e.g. `_mm_cvttss_si64`).
long _mm_cvttsd_si64 (__m128d a) pure @safe
{
    // Generates cvttsd2si since LDC 1.3 -O0
    // but in 32-bit instead, it's a long sequence that resort to FPU
    return cast(long)a[0];
}

/// Same as `_mm_cvttsd_si64`.
alias _mm_cvttsd_si64x = _mm_cvttsd_si64;
984 
/// Divide packed double-precision elements in `a` by packed elements in `b`.
/// Fix: this intrinsic was misnamed `_mm_div_ps` despite operating on `__m128d`;
/// the SSE2 double-precision divide is `_mm_div_pd` (`_mm_div_ps` is the SSE1
/// float version already publicly imported from inteli.xmmintrin, with which the
/// misnamed symbol clashed). Also added `pure @safe` for consistency.
__m128d _mm_div_pd (__m128d a, __m128d b) pure @safe
{
    return a / b;
}
unittest
{
    __m128d a = [1.5, -2.0];
    __m128d b = [0.5, -4.0];
    a = _mm_div_pd(a, b);
    assert(a.array == [3.0, 0.5]);
}
989 
/// Divide the lower double in `a` by the lower double in `b`;
/// the upper element of the result is copied from `a`.
__m128d _mm_div_sd(__m128d a, __m128d b) pure @safe
{
    a[0] /= b[0];
    return a;
}
unittest
{
    __m128d a = [2.0, 4.5];
    a = _mm_div_sd(a, a);
    assert(a.array == [1.0, 4.5]);
}
1001 
// Note: unlike Intel API, the lane index is a compile-time parameter.
/// Extract the 16-bit integer at lane `imm8` of `a`.
int _mm_extract_epi16(int imm8)(__m128i a) pure @safe
{
    return extractelement!(short8, imm8)(a);
}

/// Copy `a`, replacing the 16-bit integer at lane `imm8` with `i`.
__m128i _mm_insert_epi16(int imm8)(__m128i a, int i) pure @safe
{
    return insertelement!(short8, imm8)(a, i);
}
1011 
version(LDC)
{
    /// Perform a serializing operation on all load-from-memory instructions
    /// issued prior to this instruction (lfence).
    alias _mm_lfence = __builtin_ia32_lfence;
}
else
{
    /// Perform a serializing operation on all load-from-memory instructions
    /// issued prior to this instruction (lfence).
    void _mm_lfence() pure @safe
    {
        asm nothrow @nogc pure @safe
        {
            lfence;
        }
    }
}
unittest
{
    _mm_lfence();
}
1030 
1031 
/// Load 128 bits (two doubles) from memory.
/// `mem_addr` must be aligned on a 16-byte boundary.
__m128d _mm_load_pd (const(double) * mem_addr) pure
{
    __m128d* aligned = cast(__m128d*)mem_addr;
    return *aligned;
}

/// Load a double from memory into both elements of the result.
__m128d _mm_load_pd1 (const(double)* mem_addr) pure
{
    double[2] arr = [*mem_addr, *mem_addr];
    return loadUnaligned!(double2)(&arr[0]);
}

/// Load a double from memory into the lower element; zero the upper element.
__m128d _mm_load_sd (const(double)* mem_addr) pure @safe
{
    double2 r = [0, 0];
    r[0] = *mem_addr;
    return r;
}
unittest
{
    double x = -42;
    __m128d a = _mm_load_sd(&x);
    assert(a.array == [-42.0, 0.0]);
}

/// Load 128 bits of integer data from memory.
/// `mem_addr` must be aligned on a 16-byte boundary.
__m128i _mm_load_si128 (const(__m128i)* mem_addr) pure @trusted
{
    return *mem_addr;
}

/// Load a double into both elements of the result (same as `_mm_load_pd1`).
alias _mm_load1_pd = _mm_load_pd1;
1063 
/// Load a double from memory into the upper element; lower element copied from `a`.
__m128d _mm_loadh_pd (__m128d a, const(double)* mem_addr) pure @safe
{
    a[1] = *mem_addr;
    return a;
}

// Note: strange signature since the memory doesn't have to be aligned
/// Load 64 bits of integer data from memory into the lower element;
/// zero the upper element.
__m128i _mm_loadl_epi64 (const(__m128i)* mem_addr) pure @safe
{
    auto pLong = cast(const(long)*)mem_addr;
    long2 r = [0, 0];
    r[0] = *pLong;
    return cast(__m128i)(r);
}

/// Load a double from memory into the lower element; upper element copied from `a`.
__m128d _mm_loadl_pd (__m128d a, const(double)* mem_addr) pure @safe
{
    a[0] = *mem_addr;
    return a;
}

/// Load two doubles from memory in reverse order.
/// `mem_addr` must be aligned on a 16-byte boundary.
__m128d _mm_loadr_pd (const(double)* mem_addr) pure @trusted
{
    __m128d a = _mm_load_pd(mem_addr);
    return shufflevector!(__m128d, 1, 0)(a, a);
}

/// Load 128 bits (two doubles) from memory; no alignment required.
__m128d _mm_loadu_pd (const(double)* mem_addr) pure @safe
{
    return loadUnaligned!(double2)(mem_addr);
}

/// Load 128 bits of integer data from memory; no alignment required.
__m128i _mm_loadu_si128 (const(__m128i)* mem_addr) pure @trusted
{
    return loadUnaligned!(__m128i)(cast(int*)mem_addr);
}

/// Load 32 bits of integer data from memory into the lowest lane;
/// zero the upper three lanes.
__m128i _mm_loadu_si32 (const(void)* mem_addr) pure @trusted
{
    int r = *cast(int*)(mem_addr);
    int4 result = [0, 0, 0, 0];
    result[0] = r;
    return result;
}
unittest
{
    int r = 42;
    __m128i A = _mm_loadu_si32(&r);
    int[4] correct = [42, 0, 0, 0];
    assert(A.array == correct);
}
1115 
version(LDC)
{
    /// Multiply packed signed 16-bit integers in `a` and `b`, producing
    /// intermediate signed 32-bit products, then horizontally add adjacent
    /// pairs of products (pmaddwd).
    alias _mm_madd_epi16 = __builtin_ia32_pmaddwd128;
}
else
{
    /// Fix: this intrinsic had no non-LDC fallback, unlike every other builtin
    /// alias in this module, so `_mm_madd_epi16` did not exist on other compilers.
    ///ditto
    __m128i _mm_madd_epi16 (__m128i a, __m128i b) pure @safe
    {
        short8 sa = cast(short8)a;
        short8 sb = cast(short8)b;
        int4 r;
        foreach(i; 0..4)
        {
            // short * short promotes to int; the sum of the two products can
            // only wrap for (-32768 * -32768) * 2, matching pmaddwd's
            // 0x80000000 result in that single special case.
            r[i] = sa[2*i] * sb[2*i] + sa[2*i+1] * sb[2*i+1];
        }
        return r;
    }
}
unittest
{
    short8 A = [0, 1, 2, 3, -32768, -32768, 32767, 32767];
    int4 R = cast(int4) _mm_madd_epi16(cast(__m128i)A, cast(__m128i)A);
    int[4] correct = [1, 13, -2147483648, 2147352578];
    assert(R.array == correct);
}
1120 
version(LDC)
{
    /// Conditionally store 8-bit integer elements from `a` into memory using `mask` 
    /// (elements are not stored when the highest bit is not set in the corresponding element) 
    /// and a non-temporal memory hint. `mem_addr` does not need to be aligned on any particular 
    /// boundary.
    alias _mm_maskmoveu_si128 = __builtin_ia32_maskmovdqu; // can't do it with pure IR
}
else
{
    ///ditto
    void _mm_maskmoveu_si128 (__m128i a, __m128i mask, void* mem_addr) pure @trusted
    {
        byte16 b = cast(byte16)a;
        byte16 m = cast(byte16)mask;
        byte* dest = cast(byte*)(mem_addr);
        foreach(j; 0..16)
        {
            // The high bit of each mask byte selects whether that byte is stored.
            if (m[j] & 128)
            {
                dest[j] = b[j];
            }
        }
    }
}
unittest
{
    ubyte[16] dest =           [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42];
    __m128i mask = _mm_setr_epi8(0,-1, 0,-1,-1, 1,-1,-1, 0,-1,-4,-1,-1, 0,-127, 0);
    __m128i A    = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15);
    _mm_maskmoveu_si128(A, mask, dest.ptr);
    ubyte[16] correct =        [42, 1,42, 3, 4,42, 6, 7,42, 9,10,11,12,42,14,42];
    assert(dest == correct);
}
1155 
/// Compute the per-lane maximum of packed signed 16-bit integers.
__m128i _mm_max_epi16 (__m128i a, __m128i b) pure @safe
{
    // Same remark as with _mm_min_epi16: clang uses mystery intrinsics we don't have
    // Select trick: b ^ ((a ^ b) & mask) yields a where mask is all-ones, b where zero.
    __m128i lowerShorts = _mm_cmpgt_epi16(a, b); // ones where a should be selected, b else
    __m128i aTob = _mm_xor_si128(a, b); // a ^ (a ^ b) == b
    __m128i mask = _mm_and_si128(aTob, lowerShorts);
    return _mm_xor_si128(b, mask);
}
unittest
{
    short8 R = cast(short8) _mm_max_epi16(_mm_setr_epi16(45, 1, -4, -8, 9,  7, 0,-57),
                                          _mm_setr_epi16(-4,-8,  9,  7, 0,-57, 0,  0));
    short[8] correct =                                  [45, 1,  9,  7, 9,  7, 0,  0];
    assert(R.array == correct);
}
1171 
1172 
1173 // Same remark as with _mm_min_epi16: clang uses mystery intrinsics we don't have
1174 __m128i _mm_max_epu8 (__m128i a, __m128i b) pure @safe
1175 {
1176     // Same remark as with _mm_min_epi16: clang uses mystery intrinsics we don't have
1177     __m128i value128 = _mm_set1_epi8(-128);
1178     __m128i higher = _mm_cmpgt_epi8(_mm_add_epi8(a, value128), _mm_add_epi8(b, value128)); // signed comparison
1179     __m128i aTob = _mm_xor_si128(a, b); // a ^ (a ^ b) == b
1180     __m128i mask = _mm_and_si128(aTob, higher);
1181     return _mm_xor_si128(b, mask);
1182 }
1183 unittest
1184 {
1185     byte16 R = cast(byte16) _mm_max_epu8(_mm_setr_epi8(45, 1, -4, -8, 9,  7, 0,-57, -4,-8,  9,  7, 0,-57, 0,  0),
1186                                          _mm_setr_epi8(-4,-8,  9,  7, 0,-57, 0,  0, 45, 1, -4, -8, 9,  7, 0,-57));
1187     byte[16] correct =                                [-4,-8, -4, -8, 9,-57, 0,-57, -4,-8, -4, -8, 9,-57, 0,-57];
1188     assert(R.array == correct);
1189 }
1190 
/// Per-lane maximum of packed doubles. Like `maxpd`, when a lane compares
/// unordered the second operand's value is returned.
__m128d _mm_max_pd (__m128d a, __m128d b) pure @safe
{
    // Generates maxpd starting with LDC 1.9
    __m128d r = void;
    r[0] = (a[0] > b[0]) ? a[0] : b[0];
    r[1] = (a[1] > b[1]) ? a[1] : b[1];
    return r;
}
unittest
{
    __m128d M = _mm_max_pd(_mm_setr_pd(4.0, 1.0), _mm_setr_pd(1.0, 8.0));
    assert(M[0] == 4.0);
    assert(M[1] == 8.0);
}
1206 
/// Maximum of the lower lanes of `a` and `b`; upper lane copied from `a`.
__m128d _mm_max_sd (__m128d a, __m128d b) pure @safe
{
    // Generates maxsd starting with LDC 1.3
    double lower = (a[0] > b[0]) ? a[0] : b[0];
    __m128d r = a;
    r[0] = lower;
    return r;
}
unittest
{
    __m128d M = _mm_max_sd(_mm_setr_pd(1.0, 1.0), _mm_setr_pd(4.0, 2.0));
    assert(M[0] == 4.0);
    assert(M[1] == 1.0);
}
1222 
version(LDC)
{
    alias _mm_mfence = __builtin_ia32_mfence;
}
else
{
    /// Memory fence: emits the `mfence` instruction via inline assembly.
    void _mm_mfence() pure @safe
    {
        asm nothrow @nogc pure @safe
        {
            mfence;
        }
    }
}
unittest
{
    _mm_mfence();
}
1241 
/// Compute the per-lane minimum of packed signed 16-bit integers.
__m128i _mm_min_epi16 (__m128i a, __m128i b) pure @safe
{
    // Note: clang uses a __builtin_ia32_pminsw128 which has disappeared from LDC LLVM (?)
    // Implemented using masks and XOR
    // Select trick: b ^ ((a ^ b) & mask) yields a where mask is all-ones, b where zero.
    __m128i lowerShorts = _mm_cmplt_epi16(a, b); // ones where a should be selected, b else
    __m128i aTob = _mm_xor_si128(a, b); // a ^ (a ^ b) == b
    __m128i mask = _mm_and_si128(aTob, lowerShorts);
    return _mm_xor_si128(b, mask);
}
unittest
{
    short8 R = cast(short8) _mm_min_epi16(_mm_setr_epi16(45, 1, -4, -8, 9,  7, 0,-57),
                                          _mm_setr_epi16(-4,-8,  9,  7, 0,-57, 0,  0));
    short[8] correct =  [-4,-8, -4, -8, 0,-57, 0, -57];
    assert(R.array == correct);
}
1258 
1259 
1260 __m128i _mm_min_epu8 (__m128i a, __m128i b) pure @safe
1261 {
1262     // Same remark as with _mm_min_epi16: clang uses mystery intrinsics we don't have
1263     __m128i value128 = _mm_set1_epi8(-128);
1264     __m128i lower = _mm_cmplt_epi8(_mm_add_epi8(a, value128), _mm_add_epi8(b, value128)); // signed comparison
1265     __m128i aTob = _mm_xor_si128(a, b); // a ^ (a ^ b) == b
1266     __m128i mask = _mm_and_si128(aTob, lower);
1267     return _mm_xor_si128(b, mask);
1268 }
1269 unittest
1270 {
1271     byte16 R = cast(byte16) _mm_min_epu8(_mm_setr_epi8(45, 1, -4, -8, 9,  7, 0,-57, -4,-8,  9,  7, 0,-57, 0,  0),
1272                                          _mm_setr_epi8(-4,-8,  9,  7, 0,-57, 0,  0, 45, 1, -4, -8, 9,  7, 0,-57));
1273     byte[16] correct =                                [45, 1,  9,  7, 0,  7, 0,  0, 45, 1,  9,  7, 0,  7, 0,  0];
1274     assert(R.array == correct);
1275 }
1276 
/// Per-lane minimum of packed doubles. Like `minpd`, when a lane compares
/// unordered the second operand's value is returned.
__m128d _mm_min_pd (__m128d a, __m128d b) pure @safe
{
    // Generates minpd starting with LDC 1.9
    __m128d r = void;
    r[0] = (a[0] < b[0]) ? a[0] : b[0];
    r[1] = (a[1] < b[1]) ? a[1] : b[1];
    return r;
}
unittest
{
    __m128d M = _mm_min_pd(_mm_setr_pd(1.0, 2.0), _mm_setr_pd(4.0, 1.0));
    assert(M[0] == 1.0);
    assert(M[1] == 1.0);
}
1292 
/// Minimum of the lower lanes of `a` and `b`; upper lane copied from `a`.
__m128d _mm_min_sd (__m128d a, __m128d b) pure @safe
{
    // Generates minsd starting with LDC 1.3
    double lower = (a[0] < b[0]) ? a[0] : b[0];
    __m128d r = a;
    r[0] = lower;
    return r;
}
unittest
{
    __m128d M = _mm_min_sd(_mm_setr_pd(1.0, 3.0), _mm_setr_pd(4.0, 2.0));
    assert(M[0] == 1.0);
    assert(M[1] == 3.0);
}
1308 
/// Copy the lower 64-bit integer of `a`, zeroing the upper half.
__m128i _mm_move_epi64 (__m128i a) pure @safe
{
    long2 la = cast(long2) a;
    long2 r = [0, 0];
    r[0] = la[0];
    return cast(__m128i) r;
}
unittest
{
    long2 input = [13, 47];
    long2 R = cast(long2) _mm_move_epi64( cast(__m128i)input );
    long[2] correct = [13, 0];
    assert(R.array == correct);
}
1323 
/// Result takes its lower lane from `b` and its upper lane from `a`.
__m128d _mm_move_sd (__m128d a, __m128d b) pure @safe
{
    __m128d r = b;
    r[1] = a[1];
    return r;
}
unittest
{
    double2 A = [13.0, 47.0];
    double2 B = [34.0, 58.0];
    double2 R = _mm_move_sd(A, B);
    double[2] correct = [34.0, 47.0];
    assert(R.array == correct);
}
1337 
version(LDC)
{
    alias _mm_movemask_epi8 = __builtin_ia32_pmovmskb128;
    alias _mm_movemask_pd = __builtin_ia32_movmskpd;
}
else
{
    /// Gather the sign bits of the 16 bytes of `a` into the low 16 bits of the result.
    /// Previously there was no non-LDC fallback, leaving these symbols undefined
    /// on other compilers.
    int _mm_movemask_epi8 (__m128i a) pure @safe
    {
        byte16 ai = cast(byte16)a;
        int r = 0;
        foreach(i; 0..16)
        {
            if (ai[i] < 0) // sign bit set
                r |= (1 << i);
        }
        return r;
    }

    /// Gather the sign bits of the 2 doubles of `v` into the low 2 bits of the result.
    int _mm_movemask_pd (__m128d v) pure @safe
    {
        // Reinterpret as integers: bit 63 of each lane is the IEEE sign bit.
        long2 lv = cast(long2)v;
        int r = 0;
        if (lv[0] < 0) r |= 1;
        if (lv[1] < 0) r |= 2;
        return r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi8(-1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1);
    assert(_mm_movemask_epi8(A) == 0x8001);
    __m128d B = _mm_setr_pd(-1.0, 1.0);
    assert(_mm_movemask_pd(B) == 1);
}
1343 
1344 // MMXREG: _mm_movepi64_pi64
1345 // MMXREG: __m128i _mm_movpi64_epi64 (__m64 a)
1346 
// PERF: unfortunately, __builtin_ia32_pmuludq128 disappeared from LDC
// but seems there in clang
/// Multiply the low unsigned 32-bit integers of each 64-bit half of `a` and `b`,
/// producing two unsigned 64-bit products.
__m128i _mm_mul_epu32(__m128i a, __m128i b) pure @safe
{
    __m128i zero = _mm_setzero_si128();
    // Interleaving lanes 0 and 2 with zeros zero-extends them to 64 bits.
    long2 la = cast(long2) shufflevector!(int4, 0, 4, 2, 6)(a, zero);
    long2 lb = cast(long2) shufflevector!(int4, 0, 4, 2, 6)(b, zero);
    static if (__VERSION__ >= 2076)
    {
        return cast(__m128i)(la * lb);
    }
    else
    {
        // long2 mul not supported before LDC 1.5
        la[0] *= lb[0];
        la[1] *= lb[1];
        return cast(__m128i)(la);
    }
}
unittest
{
    __m128i A = _mm_set_epi32(0, 0xDEADBEEF, 0, 0xffffffff);
    __m128i B = _mm_set_epi32(0, 0xCAFEBABE, 0, 0xffffffff);
    __m128i C = _mm_mul_epu32(A, B);
    long2 LC = cast(long2)C;
    assert(LC.array[0] == 18446744065119617025uL);
    assert(LC.array[1] == 12723420444339690338uL);
}
1375 
1376 
/// Lane-wise multiply of packed doubles.
__m128d _mm_mul_pd(__m128d a, __m128d b) pure @safe
{
    __m128d product = a * b;
    return product;
}
unittest
{
    __m128d v = _mm_setr_pd(-2.0, 1.5);
    v = _mm_mul_pd(v, v);
    assert(v.array == [4.0, 2.25]);
}
1387 
/// Multiply the lower lanes; upper lane copied from `a`.
__m128d _mm_mul_sd(__m128d a, __m128d b) pure @safe
{
    __m128d r = a;
    r[0] = a[0] * b[0];
    return r;
}
unittest
{
    __m128d v = _mm_setr_pd(-2.0, 1.5);
    v = _mm_mul_sd(v, v);
    assert(v.array == [4.0, 1.5]);
}
1399 
1400 
1401 // MMXREG: _mm_mul_su32
1402 
version(LDC)
{
    alias _mm_mulhi_epi16 = __builtin_ia32_pmulhw128;
}
else
{
    /// Multiply packed signed 16-bit integers, keeping the high 16 bits
    /// of each 32-bit product.
    __m128i _mm_mulhi_epi16 (__m128i a, __m128i b) pure @safe
    {
        short8 sa = cast(short8)a;
        short8 sb = cast(short8)b;
        short8 r = void;
        foreach(i; 0..8)
            r[i] = cast(short)( (sa[i] * sb[i]) >> 16 );
        return cast(__m128i)r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi16(0, -16, 2, 3, 4, 8, 16, 7);
    __m128i B = _mm_set1_epi16(16384);
    short8 R = cast(short8)_mm_mulhi_epi16(A, B);
    short[8] correct = [0, -4, 0, 0, 1, 2, 4, 1];
    assert(R.array == correct);
}
1433 
version(LDC)
{
    alias _mm_mulhi_epu16 = __builtin_ia32_pmulhuw128;
}
else
{
    /// Multiply packed unsigned 16-bit integers, keeping the high 16 bits
    /// of each 32-bit product.
    __m128i _mm_mulhi_epu16 (__m128i a, __m128i b) pure @safe
    {
        short8 sa = cast(short8)a;
        short8 sb = cast(short8)b;
        short8 r = void;
        foreach(i; 0..8)
            r[i] = cast(short)( (cast(ushort)sa[i] * cast(ushort)sb[i]) >> 16 );
        return cast(__m128i)r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi16(0, -16, 2, 3, 4, 8, 16, 7);
    __m128i B = _mm_set1_epi16(16384);
    short8 R = cast(short8)_mm_mulhi_epu16(A, B);
    short[8] correct = [0, 0x3FFC, 0, 0, 1, 2, 4, 1];
    assert(R.array == correct);
}
1464 
/// Multiply packed 16-bit integers, keeping the low 16 bits of each product.
__m128i _mm_mullo_epi16 (__m128i a, __m128i b) pure @safe
{
    // Added `pure @safe`: every other arithmetic intrinsic in this module
    // carries these attributes; omitting them made this one uncallable
    // from @safe/pure callers.
    return cast(__m128i)(cast(short8)a * cast(short8)b);
}
1469 
/// Bitwise OR of packed doubles, performed on their bit patterns.
__m128d _mm_or_pd (__m128d a, __m128d b) pure @safe
{
    __m128i ia = cast(__m128i)a;
    __m128i ib = cast(__m128i)b;
    return cast(__m128d)(ia | ib);
}
1474 
/// Bitwise OR of 128-bit integer vectors.
__m128i _mm_or_si128 (__m128i a, __m128i b) pure @safe
{
    __m128i r = a | b;
    return r;
}
1479 
version(LDC)
{
    alias _mm_packs_epi32 = __builtin_ia32_packssdw128;
}
else
{
    /// Convert packed 32-bit integers from `a` then `b` to packed 16-bit
    /// integers using signed saturation.
    __m128i _mm_packs_epi32 (__m128i a, __m128i b) pure @safe
    {
        short8 r;
        foreach(i; 0..4)
        {
            r[i]     = saturateSignedIntToSignedShort(a[i]);
            r[i + 4] = saturateSignedIntToSignedShort(b[i]);
        }
        return cast(__m128i)r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi32(100000, -100000, 1000, 0);
    short8 R = cast(short8) _mm_packs_epi32(A, A);
    short[8] correct = [32767, -32768, 1000, 0, 32767, -32768, 1000, 0];
    assert(R.array == correct);
}
1507 
version(LDC)
{
    alias _mm_packs_epi16 = __builtin_ia32_packsswb128;
}
else
{
    /// Convert packed 16-bit integers from `a` then `b` to packed 8-bit
    /// integers using signed saturation.
    __m128i _mm_packs_epi16 (__m128i a, __m128i b) pure @safe
    {
        byte16 r;
        short8 sa = cast(short8)a;
        short8 sb = cast(short8)b;
        foreach(i; 0..8)
        {
            r[i]     = saturateSignedWordToSignedByte(sa[i]);
            r[i + 8] = saturateSignedWordToSignedByte(sb[i]);
        }
        return cast(__m128i)r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi16(1000, -1000, 1000, 0, 256, -129, 254, 0);
    byte16 R = cast(byte16) _mm_packs_epi16(A, A);
    byte[16] correct = [127, -128, 127, 0, 127, -128, 127, 0,
                        127, -128, 127, 0, 127, -128, 127, 0];
    assert(R.array == correct);
}
1534 
version(LDC)
{
    alias _mm_packus_epi16 = __builtin_ia32_packuswb128;
}
else
{
    /// Convert packed 16-bit integers from `a` then `b` to packed 8-bit
    /// integers using unsigned saturation (clamp to [0, 255]).
    __m128i _mm_packus_epi16 (__m128i a, __m128i b) pure @safe
    {
        // Previously missing a safety attribute and round-tripping through a
        // stack array + unaligned load; writing the byte16 directly is @safe
        // and avoids the copy.
        short8 sa = cast(short8)a;
        short8 sb = cast(short8)b;
        byte16 r;
        for (int i = 0; i < 8; ++i)
        {
            short s = sa[i];
            if (s < 0) s = 0;
            if (s > 255) s = 255;
            r[i] = cast(byte)s;

            s = sb[i];
            if (s < 0) s = 0;
            if (s > 255) s = 255;
            r[i + 8] = cast(byte)s;
        }
        return cast(__m128i) r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi16(-10, 400, 0, 256, 255, 2, 1, 0);
    byte16 AA = cast(byte16) _mm_packus_epi16(A, A);
    static immutable ubyte[16] correctResult = [0, 255, 0, 255, 255, 2, 1, 0,
                                                0, 255, 0, 255, 255, 2, 1, 0];
    foreach(i; 0..16)
        assert(AA[i] == cast(byte)(correctResult[i]));
}
1570 
version(LDC)
{
    alias _mm_pause = __builtin_ia32_pause;
}
else
{
    /// Spin-loop hint: emits the `pause` instruction (encoded F3 90).
    void _mm_pause() pure @safe
    {
        asm nothrow @nogc pure @safe
        {
            rep; nop; // F3 90 =  pause
        }
    }
}
unittest
{
    _mm_pause();
}
1589 
1590 
version(LDC)
{
    alias _mm_sad_epu8 = __builtin_ia32_psadbw128;
}
else
{
    /// Sum of absolute differences of packed unsigned 8-bit integers.
    /// Each 64-bit half of the result receives the sum over its 8 bytes,
    /// hence only int lanes 0 and 2 are non-zero.
    __m128i _mm_sad_epu8 (__m128i a, __m128i b) pure @safe
    {
        byte16 ab = cast(byte16)a;
        byte16 bb = cast(byte16)b;
        ubyte[16] t;
        foreach(i; 0..16)
        {
            int diff = cast(ubyte)(ab[i]) - cast(ubyte)(bb[i]);
            if (diff < 0) diff = -diff;
            t[i] = cast(ubyte)(diff);
        }
        int4 r = _mm_setzero_si128();
        // Max possible sum is 8 * 255, so each sum fits in the low 16 bits of its lane.
        r[0] = t[0] + t[1] + t[2] + t[3] + t[4] + t[5] + t[6] + t[7];
        r[2] = t[8] + t[9] + t[10]+ t[11]+ t[12]+ t[13]+ t[14]+ t[15];
        return r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi8(3, 4, 6, 8, 12, 14, 18, 20, 24, 30, 32, 38, 42, 44, 48, 54); // primes + 1
    __m128i B = _mm_set1_epi8(1);
    __m128i R = _mm_sad_epu8(A, B);
    int[4] correct = [2 + 3 + 5 + 7 + 11 + 13 + 17 + 19,
                      0,
                      23 + 29 + 31 + 37 + 41 + 43 + 47 + 53,
                      0];
    assert(R.array == correct);
}
1625 
/// Set packed 16-bit integers with the supplied values (`e7` is the highest lane).
__m128i _mm_set_epi16 (short e7, short e6, short e5, short e4, short e3, short e2, short e1, short e0) pure @trusted
{
    short[8] lanes = [e0, e1, e2, e3, e4, e5, e6, e7];
    return cast(__m128i) loadUnaligned!(short8)(lanes.ptr);
}
unittest
{
    short8 R = cast(short8) _mm_set_epi16(7, 6, 5, 4, 3, 2, 1, 0);
    foreach(i; 0..8)
        assert(R.array[i] == i);
}
1638 
/// Set packed 32-bit integers with the supplied values (`e3` is the highest lane).
__m128i _mm_set_epi32 (int e3, int e2, int e1, int e0) pure @trusted
{
    int[4] lanes = [e0, e1, e2, e3];
    return loadUnaligned!(int4)(lanes.ptr);
}
unittest
{
    __m128i R = _mm_set_epi32(3, 2, 1, 0);
    foreach(i; 0..4)
        assert(R.array[i] == i);
}
1650 
/// Set packed 64-bit integers: `e1` goes to the high lane, `e0` to the low lane.
__m128i _mm_set_epi64x (long e1, long e0) pure @trusted
{
    long[2] lanes = [e0, e1];
    return cast(__m128i)( loadUnaligned!(long2)(lanes.ptr) );
}
unittest
{
    long2 R = cast(long2) _mm_set_epi64x(1234, 5678);
    assert(R.array[0] == 5678);
    assert(R.array[1] == 1234);
}
1663 
/// Set packed 8-bit integers with the supplied values (`e15` is the highest lane).
__m128i _mm_set_epi8 (byte e15, byte e14, byte e13, byte e12,
                      byte e11, byte e10, byte e9, byte e8,
                      byte e7, byte e6, byte e5, byte e4,
                      byte e3, byte e2, byte e1, byte e0) pure @trusted
{
    byte[16] result = [e0, e1,  e2,  e3,  e4,  e5,  e6, e7,
                     e8, e9, e10, e11, e12, e13, e14, e15];
    return cast(__m128i)( loadUnaligned!(byte16)(result.ptr) );
}
1673 
/// Set packed doubles: `e1` goes to the high lane, `e0` to the low lane.
__m128d _mm_set_pd (double e1, double e0) pure @trusted
{
    double[2] lanes = [e0, e1];
    return loadUnaligned!(double2)(lanes.ptr);
}
1679 
/// Broadcast double `a` to both lanes.
__m128d _mm_set_pd1 (double a) pure @trusted
{
    double[2] lanes = [a, a];
    return loadUnaligned!(double2)(lanes.ptr);
}
1685 
/// Set the lower lane to `a` and zero the upper lane.
__m128d _mm_set_sd (double a) pure @trusted
{
    double[2] lanes = [a, 0];
    return loadUnaligned!(double2)(lanes.ptr);
}
1691 
/// Broadcast 16-bit integer `a` to all 8 lanes.
__m128i _mm_set1_epi16 (short a) pure @trusted
{
    short[8] lanes = [a, a, a, a, a, a, a, a];
    return cast(__m128i)( loadUnaligned!(short8)(lanes.ptr) );
}
1697 
/// Broadcast 32-bit integer `a` to all 4 lanes.
__m128i _mm_set1_epi32 (int a) pure @trusted
{
    int[4] lanes = [a, a, a, a];
    return loadUnaligned!(int4)(lanes.ptr);
}
unittest
{
    __m128 a = _mm_set1_ps(-1.0f);
    __m128 b = cast(__m128) _mm_set1_epi32(0x7fffffff);
    assert(_mm_and_ps(a, b).array == [1.0f, 1, 1, 1]);
}
1709 
/// Broadcast 64-bit integer `a` to both lanes.
__m128i _mm_set1_epi64x (long a) pure @trusted
{
    long[2] lanes = [a, a];
    return cast(__m128i)( loadUnaligned!(long2)(lanes.ptr) );
}
1715 
/// Broadcast 8-bit integer `a` to all 16 lanes.
__m128i _mm_set1_epi8 (byte a) pure @trusted
{
    byte[16] lanes = [a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a];
    return cast(__m128i)( loadUnaligned!(byte16)(lanes.ptr) );
}
1721 
/// Broadcast double `a` to both lanes (same as `_mm_set_pd1`).
alias _mm_set1_pd = _mm_set_pd1;
1723 
/// Set packed 16-bit integers in reverse order: `e7` is the LOWEST lane.
__m128i _mm_setr_epi16 (short e7, short e6, short e5, short e4, short e3, short e2, short e1, short e0) pure @trusted
{
    short[8] lanes = [e7, e6, e5, e4, e3, e2, e1, e0];
    return cast(__m128i)( loadUnaligned!(short8)(lanes.ptr) );
}
1729 
/// Set packed 32-bit integers in reverse order: `e3` is the LOWEST lane.
__m128i _mm_setr_epi32 (int e3, int e2, int e1, int e0) pure @trusted
{
    int[4] lanes = [e3, e2, e1, e0];
    return cast(__m128i)( loadUnaligned!(int4)(lanes.ptr) );
}
1735 
/// Set packed 64-bit integers in reverse order: `e1` is the LOWEST lane.
__m128i _mm_setr_epi64 (long e1, long e0) pure @trusted
{
    long[2] lanes = [e1, e0];
    return cast(__m128i)( loadUnaligned!(long2)(lanes.ptr) );
}
1741 
/// Set packed 8-bit integers in reverse order: `e15` is the LOWEST lane.
__m128i _mm_setr_epi8 (byte e15, byte e14, byte e13, byte e12,
                       byte e11, byte e10, byte e9,  byte e8,
                       byte e7,  byte e6,  byte e5,  byte e4,
                       byte e3,  byte e2,  byte e1,  byte e0) pure @trusted
{
    byte[16] result = [e15, e14, e13, e12, e11, e10, e9, e8,
                      e7,  e6,  e5,  e4,  e3,  e2, e1, e0];
    return cast(__m128i)( loadUnaligned!(byte16)(result.ptr) );
}
1751 
/// Set packed doubles in reverse order: `e1` is the LOWEST lane.
__m128d _mm_setr_pd (double e1, double e0) pure @trusted
{
    double[2] lanes = [e1, e0];
    return loadUnaligned!(double2)(lanes.ptr);
}
1757 
/// Return a vector of doubles with both lanes zeroed.
__m128d _mm_setzero_pd () pure @trusted
{
    double[2] zeros = [0.0, 0.0];
    return loadUnaligned!(double2)(zeros.ptr);
}
1763 
/// Return a 128-bit integer vector with all bits zeroed.
__m128i _mm_setzero_si128() pure @trusted
{
    int[4] zeros = [0, 0, 0, 0];
    return cast(__m128i)( loadUnaligned!(int4)(zeros.ptr) );
}
1769 
/// Shuffle the 32-bit lanes of `a` using the compile-time constant `imm8`
/// (2 bits per destination lane, lowest bits select lane 0).
__m128i _mm_shuffle_epi32(int imm8)(__m128i a) pure @safe
{
    return shufflevector!(int4, (imm8 >> 0) & 3,
                                (imm8 >> 2) & 3,
                                (imm8 >> 4) & 3,
                                (imm8 >> 6) & 3)(a, a);
}
unittest
{
    __m128i A = _mm_setr_epi32(0, 1, 2, 3);
    enum int SHUFFLE = _MM_SHUFFLE(0, 1, 2, 3);
    int4 B = cast(int4) _mm_shuffle_epi32!SHUFFLE(A);
    int[4] expectedB = [ 3, 2, 1, 0 ];
    assert(B.array == expectedB);
}
1785 
/// Shuffle doubles: bit 0 of `imm8` picks the lane of `a` for the low result
/// lane, bit 1 picks the lane of `b` for the high result lane.
__m128d _mm_shuffle_pd (int imm8)(__m128d a, __m128d b) pure @safe
{
    return shufflevector!(double2, 0 + ( imm8 & 1 ),
                                   2 + ( (imm8 >> 1) & 1 ))(a, b);
}
unittest
{
    __m128d A = _mm_setr_pd(0.5, 2.0);
    __m128d B = _mm_setr_pd(4.0, 5.0);
    enum int SHUFFLE = _MM_SHUFFLE2(1, 1);
    __m128d R = _mm_shuffle_pd!SHUFFLE(A, B);
    double[2] correct = [ 2.0, 5.0 ];
    assert(R.array == correct);
}
1800 
/// Shuffle the upper four 16-bit lanes of `a` using `imm8` (2 bits per lane);
/// the lower four lanes pass through unchanged.
__m128i _mm_shufflehi_epi16(int imm8)(__m128i a) pure @safe
{
    return cast(__m128i) shufflevector!(short8, 0, 1, 2, 3,
                                      4 + ( (imm8 >> 0) & 3 ),
                                      4 + ( (imm8 >> 2) & 3 ),
                                      4 + ( (imm8 >> 4) & 3 ),
                                      4 + ( (imm8 >> 6) & 3 ))(cast(short8)a, cast(short8)a);
}
unittest
{
    __m128i A = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
    enum int SHUFFLE = _MM_SHUFFLE(0, 1, 2, 3);
    short8 C = cast(short8) _mm_shufflehi_epi16!SHUFFLE(A);
    short[8] expectedC = [ 0, 1, 2, 3, 7, 6, 5, 4 ];
    assert(C.array == expectedC);
}
1817 
/// Shuffle the lower four 16-bit lanes of `a` using `imm8` (2 bits per lane);
/// the upper four lanes pass through unchanged.
__m128i _mm_shufflelo_epi16(int imm8)(__m128i a) pure @safe
{
    return cast(__m128i) shufflevector!(short8, ( (imm8 >> 0) & 3 ),
                                                ( (imm8 >> 2) & 3 ),
                                                ( (imm8 >> 4) & 3 ),
                                                ( (imm8 >> 6) & 3 ), 4, 5, 6, 7)(cast(short8)a, cast(short8)a);
}
unittest
{
    __m128i A = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
    enum int SHUFFLE = _MM_SHUFFLE(0, 1, 2, 3);
    short8 B = cast(short8) _mm_shufflelo_epi16!SHUFFLE(A);
    short[8] expectedB = [ 3, 2, 1, 0, 4, 5, 6, 7 ];
    assert(B.array == expectedB);
}
1833 
version(LDC)
{
    alias _mm_sll_epi32 = __builtin_ia32_pslld128;
}
else
{
    /// Shift packed 32-bit integers left by the count in the low 64 bits of `count`.
    /// NOTE(review): hardware zeroes lanes for counts >= 32; this fallback relies
    /// on D's shift semantics instead — confirm if large counts matter.
    __m128i _mm_sll_epi32 (__m128i a, __m128i count) pure @safe
    {
        int4 r = void;
        long2 lc = cast(long2)count;
        int bits = cast(int)(lc[0]);
        foreach(i; 0..4)
            r[i] = cast(uint)(a[i]) << bits;
        return r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi32(0, 2, 3, -4);
    __m128i B = _mm_sll_epi32(A, _mm_cvtsi32_si128(1));
    int[4] expectedB = [ 0, 4, 6, -8];
    assert(B.array == expectedB);
}
1857 
version(LDC)
{
    alias _mm_sll_epi64  = __builtin_ia32_psllq128;
}
else
{
    /// Shift packed 64-bit integers left by the count in the low 64 bits of `count`.
    __m128i _mm_sll_epi64 (__m128i a, __m128i count) pure @safe
    {
        long2 r = void;
        long2 sa = cast(long2)a;
        long2 lc = cast(long2)count;
        int bits = cast(int)(lc[0]);
        foreach(i; 0..2)
            r[i] = cast(ulong)(sa[i]) << bits;
        return cast(__m128i)r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi64(8, -4);
    long2 B = cast(long2) _mm_sll_epi64(A, _mm_cvtsi32_si128(1));
    long[2] expectedB = [ 16, -8];
    assert(B.array == expectedB);
}
1882 
version(LDC)
{
    alias _mm_sll_epi16 = __builtin_ia32_psllw128;
}
else
{
    /// Shift packed 16-bit integers left by the count in the low 64 bits of `count`.
    __m128i _mm_sll_epi16 (__m128i a, __m128i count) pure @safe
    {
        short8 sa = cast(short8)a;
        long2 lc = cast(long2)count;
        int bits = cast(int)(lc[0]);
        short8 r = void;
        foreach(i; 0..8)
            r[i] = cast(short)(cast(ushort)(sa[i]) << bits);
        return cast(int4)r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi16(0, 1, 2, 3, -4, -5, 6, 7);
    short8 B = cast(short8)( _mm_sll_epi16(A, _mm_cvtsi32_si128(1)) );
    short[8] expectedB =     [ 0, 2, 4, 6, -8, -10, 12, 14 ];
    assert(B.array == expectedB);
}
1907 
version(LDC)
{
    alias _mm_slli_epi32 = __builtin_ia32_pslldi128;
}
else
{
    /// Shift packed 32-bit integers left by the immediate count `imm8`.
    __m128i _mm_slli_epi32 (__m128i a, int imm8) pure @safe
    {
        int4 r = void;
        foreach(i; 0..4)
            r[i] = cast(uint)(a[i]) << imm8;
        return r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi32(0, 2, 3, -4);
    __m128i B = _mm_slli_epi32(A, 1);
    int[4] expectedB = [ 0, 4, 6, -8];
    assert(B.array == expectedB);
}
1929 
version(LDC)
{
    alias _mm_slli_epi64  = __builtin_ia32_psllqi128;
}
else
{
    /// Shift packed 64-bit integers left by the immediate count `imm8`.
    __m128i _mm_slli_epi64 (__m128i a, int imm8) pure @safe
    {
        long2 r = void;
        long2 sa = cast(long2)a;
        foreach(i; 0..2)
            r[i] = cast(ulong)(sa[i]) << imm8;
        return cast(__m128i)r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi64(8, -4);
    long2 B = cast(long2) _mm_slli_epi64(A, 1);
    long[2] expectedB = [ 16, -8];
    assert(B.array == expectedB);
}
1952 
version(LDC)
{
    alias _mm_slli_epi16 = __builtin_ia32_psllwi128;
}
else
{
    /// Shift packed 16-bit integers left by the immediate count `imm8`.
    __m128i _mm_slli_epi16 (__m128i a, int imm8) pure @safe
    {
        short8 sa = cast(short8)a;
        short8 r = void;
        foreach(i; 0..8)
            r[i] = cast(short)(cast(ushort)(sa[i]) << imm8);
        return cast(int4)r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi16(0, 1, 2, 3, -4, -5, 6, 7);
    short8 B = cast(short8)( _mm_slli_epi16(A, 1) );
    short[8] expectedB = [ 0, 2, 4, 6, -8, -10, 12, 14 ];
    assert(B.array == expectedB);
}
1975 
/// Shift `a` left by `imm8` bytes while shifting in zeros.
__m128i _mm_slli_si128(ubyte imm8)(__m128i op) pure @safe
{
    // Counts of 16 or more shift out everything: result is all zeros.
    static if (imm8 & 0xF0)
        return _mm_setzero_si128();
    else
        // Concatenate [zero | op] and take a 16-byte window ending `imm8`
        // bytes before the top of `op`.
        return cast(__m128i) shufflevector!(byte16,
        16 - imm8, 17 - imm8, 18 - imm8, 19 - imm8, 20 - imm8, 21 - imm8, 22 - imm8, 23 - imm8,
        24 - imm8, 25 - imm8, 26 - imm8, 27 - imm8, 28 - imm8, 29 - imm8, 30 - imm8, 31 - imm8)
        (cast(byte16)_mm_setzero_si128(), cast(byte16)op);
}
unittest
{
    __m128i A = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
    short8 R = cast(short8) _mm_slli_si128!8(A); // shift 8 bytes to the left
    short[8] correct = [ 0, 0, 0, 0, 0, 1, 2, 3 ];
    assert(R.array == correct);
}
1994 
version(LDC)
{
    // Disappeared with LDC 1.11
    static if (__VERSION__ < 2081)
        alias _mm_sqrt_pd = __builtin_ia32_sqrtpd;
    else
    {
        /// Compute the square root of both lanes of `vec`.
        __m128d _mm_sqrt_pd(__m128d vec) pure @safe
        {
            vec.array[0] = llvm_sqrt(vec.array[0]);
            vec.array[1] = llvm_sqrt(vec.array[1]);
            return vec;
        }
    }
}
else
{
    /// Compute the square root of both lanes of `vec`.
    __m128d _mm_sqrt_pd(__m128d vec) pure @safe
    {
        vec.array[0] = sqrt(vec.array[0]);
        vec.array[1] = sqrt(vec.array[1]);
        return vec;
    }
}
2019 
2020 
version(LDC)
{
    // Disappeared with LDC 1.11
    static if (__VERSION__ < 2081)
        alias _mm_sqrt_sd = __builtin_ia32_sqrtsd;
    else
    {
        /// Square root of the lower lane; upper lane kept.
        /// NOTE(review): Intel's _mm_sqrt_sd takes two operands (upper lane
        /// comes from the first) — this one-arg form deviates; confirm intent.
        __m128d _mm_sqrt_sd(__m128d vec) pure @safe
        {
            vec.array[0] = llvm_sqrt(vec.array[0]);
            vec.array[1] = vec.array[1];
            return vec;
        }
    }
}
else
{
    /// Square root of the lower lane; upper lane kept.
    __m128d _mm_sqrt_sd(__m128d vec) pure @safe
    {
        vec.array[0] = sqrt(vec.array[0]);
        vec.array[1] = vec.array[1];
        return vec;
    }
}
2045 
2046 
version(LDC)
{
    alias _mm_sra_epi16 = __builtin_ia32_psraw128;
}
else
{
    /// Arithmetic right shift of packed 16-bit integers by the count in the
    /// low 64 bits of `count` (sign bits shifted in).
    __m128i _mm_sra_epi16 (__m128i a, __m128i count) pure @safe
    {
        short8 sa = cast(short8)a;
        long2 lc = cast(long2)count;
        int bits = cast(int)(lc[0]);
        short8 r = void;
        foreach(i; 0..8)
            r[i] = cast(short)(sa[i] >> bits);
        return cast(int4)r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi16(0, 1, 2, 3, -4, -5, 6, 7);
    short8 B = cast(short8)( _mm_sra_epi16(A, _mm_cvtsi32_si128(1)) );
    short[8] expectedB = [ 0, 0, 1, 1, -2, -3, 3, 3 ];
    assert(B.array == expectedB);
}
2071 
version(LDC)
{
    alias _mm_sra_epi32  = __builtin_ia32_psrad128;
}
else
{
    /// Arithmetic right shift of packed 32-bit integers by the count in the
    /// low 64 bits of `count` (sign bits shifted in).
    __m128i _mm_sra_epi32 (__m128i a, __m128i count) pure @safe
    {
        int4 r = void;
        long2 lc = cast(long2)count;
        int bits = cast(int)(lc[0]);
        foreach(i; 0..4)
            r[i] = (a[i] >> bits);
        return r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi32(0, 2, 3, -4);
    __m128i B = _mm_sra_epi32(A, _mm_cvtsi32_si128(1));
    int[4] expectedB = [ 0, 1, 1, -2];
    assert(B.array == expectedB);
}
2095 
2096 
version(LDC)
{
    alias _mm_srai_epi16 = __builtin_ia32_psrawi128;
}
else
{
    /// Arithmetic right shift of packed 16-bit integers by the immediate `imm8`.
    __m128i _mm_srai_epi16 (__m128i a, int imm8) pure @safe
    {
        short8 sa = cast(short8)a;
        short8 r = void;
        foreach(i; 0..8)
            r[i] = cast(short)(sa[i] >> imm8);
        return cast(int4)r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi16(0, 1, 2, 3, -4, -5, 6, 7);
    short8 B = cast(short8)( _mm_srai_epi16(A, 1) );
    short[8] expectedB = [ 0, 0, 1, 1, -2, -3, 3, 3 ];
    assert(B.array == expectedB);
}
2119 
version(LDC)
{
    alias _mm_srai_epi32  = __builtin_ia32_psradi128;
}
else
{
    /// Arithmetic right shift of packed 32-bit integers by the immediate `imm8`.
    __m128i _mm_srai_epi32 (__m128i a, int imm8) pure @safe
    {
        int4 r = void;
        foreach(i; 0..4)
            r[i] = (a[i] >> imm8);
        return r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi32(0, 2, 3, -4);
    __m128i B = _mm_srai_epi32(A, 1);
    int[4] expectedB = [ 0, 1, 1, -2];
    assert(B.array == expectedB);
}
2141 
version(LDC)
{
    alias _mm_srl_epi16 = __builtin_ia32_psrlw128;
}
else
{
    /// Shift packed 16-bit integers in `a` right by `count` while shifting in zeros.
    __m128i _mm_srl_epi16 (__m128i a, __m128i count) pure @safe
    {
        long2 lc = cast(long2)count;
        ulong shift = cast(ulong)(lc[0]);

        // PSRLW: any count above 15 zeroes the whole result.
        // The early-out also avoids an illegal D shift for counts >= 32.
        if (shift > 15)
            return _mm_setzero_si128();
        int bits = cast(int)shift;

        short8 sa = cast(short8)a;
        short8 r = void;
        foreach(i; 0..8)
            r[i] = cast(short)(cast(ushort)(sa[i]) >> bits);
        return cast(int4)r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi16(0, 1, 2, 3, -4, -5, 6, 7);
    short8 B = cast(short8)( _mm_srl_epi16(A, _mm_cvtsi32_si128(1)) );
    short[8] expectedB = [ 0, 0, 1, 1, 0x7FFE, 0x7FFD, 3, 3 ];
    assert(B.array == expectedB);
}
2166 
version(LDC)
{
    alias _mm_srl_epi32  = __builtin_ia32_psrld128;
}
else
{
    /// Shift packed 32-bit integers in `a` right by `count` while shifting in zeros.
    __m128i _mm_srl_epi32 (__m128i a, __m128i count) pure @safe
    {
        long2 lc = cast(long2)count;
        ulong shift = cast(ulong)(lc[0]);

        // PSRLD: any count above 31 zeroes the whole result.
        // The early-out also avoids the illegal D shift `uint >> 32`.
        if (shift > 31)
            return _mm_setzero_si128();
        int bits = cast(int)shift;

        int4 r = void;
        foreach(i; 0..4)
            r[i] = cast(uint)(a[i]) >> bits;
        return r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi32(0, 2, 3, -4);
    __m128i B = _mm_srl_epi32(A, _mm_cvtsi32_si128(1));
    int[4] expectedB = [ 0, 1, 1, 0x7FFFFFFE];
    assert(B.array == expectedB);

    // A count larger than 31 zeroes the result.
    __m128i C = _mm_srl_epi32(A, _mm_cvtsi32_si128(100));
    int[4] expectedC = [ 0, 0, 0, 0];
    assert(C.array == expectedC);
}
2190 
version(LDC)
{
    alias _mm_srl_epi64  = __builtin_ia32_psrlq128;
}
else
{
    /// Shift packed 64-bit integers in `a` right by `count` while shifting in zeros.
    __m128i _mm_srl_epi64 (__m128i a, __m128i count) pure @safe
    {
        long2 lc = cast(long2)count;
        ulong shift = cast(ulong)(lc[0]);

        // PSRLQ: any count above 63 zeroes the whole result.
        // The early-out also avoids the illegal D shift `ulong >> 64`.
        if (shift > 63)
            return _mm_setzero_si128();
        int bits = cast(int)shift;

        long2 sa = cast(long2)a;
        long2 r = void;
        foreach(i; 0..2)
            r[i] = cast(ulong)(sa[i]) >> bits;
        return cast(__m128i)r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi64(8, -4);
    long2 B = cast(long2) _mm_srl_epi64(A, _mm_cvtsi32_si128(1));
    long[2] expectedB = [ 4, 0x7FFFFFFFFFFFFFFE];
    assert(B.array == expectedB);
}
2215 
version(LDC)
{
    alias _mm_srli_epi16 = __builtin_ia32_psrlwi128;
}
else
{
    /// Shift packed 16-bit integers in `a` right by `imm8` while shifting in zeros.
    __m128i _mm_srli_epi16 (__m128i a, int imm8) pure @safe
    {
        // PSRLW: an immediate above 15 zeroes the whole result.
        // The early-out also avoids an illegal D shift for imm8 >= 32.
        uint count = cast(uint)imm8;
        if (count > 15)
            return _mm_setzero_si128();
        int bits = cast(int)count;

        short8 sa = cast(short8)a;
        short8 r = void;
        foreach(i; 0..8)
            r[i] = cast(short)(cast(ushort)(sa[i]) >> bits);
        return cast(int4)r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi16(0, 1, 2, 3, -4, -5, 6, 7);
    short8 B = cast(short8)( _mm_srli_epi16(A, 1) );
    short[8] expectedB = [ 0, 0, 1, 1, 0x7FFE, 0x7FFD, 3, 3 ];
    assert(B.array == expectedB);

    // An immediate larger than 15 zeroes the result.
    short8 C = cast(short8)( _mm_srli_epi16(A, 16) );
    short[8] expectedC = [ 0, 0, 0, 0, 0, 0, 0, 0 ];
    assert(C.array == expectedC);
}
2238 
version(LDC)
{
    alias _mm_srli_epi32  = __builtin_ia32_psrldi128;
}
else
{
    /// Shift packed 32-bit integers in `a` right by `imm8` while shifting in zeros.
    __m128i _mm_srli_epi32 (__m128i a, int imm8) pure @safe
    {
        // PSRLD: an immediate above 31 zeroes the whole result.
        // The early-out also avoids the illegal D shift `uint >> 32`.
        uint count = cast(uint)imm8;
        if (count > 31)
            return _mm_setzero_si128();
        int bits = cast(int)count;

        int4 r = void;
        foreach(i; 0..4)
            r[i] = cast(uint)(a[i]) >> bits;
        return r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi32(0, 2, 3, -4);
    __m128i B = _mm_srli_epi32(A, 1);
    int[4] expectedB = [ 0, 1, 1, 0x7FFFFFFE];
    assert(B.array == expectedB);
}
2260 
version(LDC)
{
    alias _mm_srli_epi64  = __builtin_ia32_psrlqi128;
}
else
{
    /// Shift packed 64-bit integers in `a` right by `imm8` while shifting in zeros.
    __m128i _mm_srli_epi64 (__m128i a, int imm8) pure @safe
    {
        // PSRLQ: an immediate above 63 zeroes the whole result.
        // The early-out also avoids the illegal D shift `ulong >> 64`.
        uint count = cast(uint)imm8;
        if (count > 63)
            return _mm_setzero_si128();
        int bits = cast(int)count;

        long2 sa = cast(long2)a;
        long2 r = void;
        foreach(i; 0..2)
            r[i] = cast(ulong)(sa[i]) >> bits;
        return cast(__m128i)r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi64(8, -4);
    long2 B = cast(long2) _mm_srli_epi64(A, 1);
    long[2] expectedB = [ 4, 0x7FFFFFFFFFFFFFFE];
    assert(B.array == expectedB);
}
2283 
/// Shift `v` right by `bytes` bytes while shifting in zeros.
__m128i _mm_srli_si128(ubyte bytes)(__m128i v) pure @safe
{
    // A byte count of 16 or more shifts everything out.
    static if (bytes >= 16)
    {
        return _mm_setzero_si128();
    }
    else
    {
        byte16 zero = cast(byte16)_mm_setzero_si128();
        return cast(__m128i) shufflevector!(byte16,
                                            bytes+0, bytes+1, bytes+2, bytes+3, bytes+4, bytes+5, bytes+6, bytes+7,
                                            bytes+8, bytes+9, bytes+10, bytes+11, bytes+12, bytes+13, bytes+14, bytes+15)
                                           (cast(byte16)v, zero);
    }
}
unittest
{
    __m128i r = _mm_srli_si128!4(_mm_set_epi32(4, 3, 2, 1));
    int[4] correct = [2, 3, 4, 0];
    assert(r.array == correct);
}
2301 
/// Shift `v` right by `bytes` bytes while shifting in zeros.
/// #BONUS
__m128 _mm_srli_ps(ubyte bytes)(__m128 v) pure @safe
{
    __m128i shifted = _mm_srli_si128!bytes(cast(__m128i)v);
    return cast(__m128)shifted;
}
unittest
{
    __m128 r = _mm_srli_ps!8(_mm_set_ps(4.0f, 3.0f, 2.0f, 1.0f));
    float[4] correct = [3.0f, 4.0f, 0, 0];
    assert(r.array == correct);
}
2314 
/// Shift `v` right by `bytes` bytes while shifting in zeros.
/// #BONUS
__m128d _mm_srli_pd(ubyte bytes)(__m128d v) pure @safe
{
    __m128i shifted = _mm_srli_si128!bytes(cast(__m128i)v);
    return cast(__m128d)shifted;
}
2321 
/// Store 128-bits (composed of 2 packed double-precision (64-bit) floating-point elements)
/// from `a` into memory. `mem_addr` is expected to be 16-byte aligned, per the
/// intrinsic's contract.
void _mm_store_pd (double* mem_addr, __m128d a) pure
{
    __m128d* aligned = cast(__m128d*)mem_addr;
    *aligned = a;
}
2327 
/// Store the lower double-precision (64-bit) floating-point element from `a` into
/// 2 contiguous elements in memory. `mem_addr` is expected to be 16-byte aligned.
void _mm_store_pd1 (double* mem_addr, __m128d a) pure
{
    __m128d* aligned = cast(__m128d*)mem_addr;
    *aligned = shufflevector!(double2, 0, 0)(a, a);
}
2333 
/// Store the lower double-precision (64-bit) floating-point element from `a` into memory.
void _mm_store_sd (double* mem_addr, __m128d a) pure @safe
{
    *mem_addr = extractelement!(double2, 0)(a);
}
2338 
/// Store 128-bits of integer data from `a` into memory. `mem_addr` is expected
/// to be 16-byte aligned, per the intrinsic's contract.
void _mm_store_si128 (__m128i* mem_addr, __m128i a) pure @safe
{
    *mem_addr = a;
}
2343 
/// Store the lower double-precision (64-bit) floating-point element from `a` into
/// 2 contiguous elements in memory (same operation as `_mm_store_pd1`).
alias _mm_store1_pd = _mm_store_pd1;
2345 
/// Store the upper double-precision (64-bit) floating-point element from `a` into memory.
void _mm_storeh_pd (double* mem_addr, __m128d a) pure @safe
{
    *mem_addr = extractelement!(double2, 1)(a);
}
2350 
/// Store the lower 64-bit integer from `a` into memory.
void _mm_storel_epi64 (__m128i* mem_addr, __m128i a) pure @safe
{
    long* dest = cast(long*)mem_addr;
    *dest = extractelement!(long2, 0)(cast(long2)a);
}
2356 
/// Store the lower double-precision (64-bit) floating-point element from `a` into memory.
void _mm_storel_pd (double* mem_addr, __m128d a) pure @safe
{
    *mem_addr = extractelement!(double2, 0)(a);
}
2361 
/// Store 2 double-precision (64-bit) floating-point elements from `a` into memory
/// in reverse order. `mem_addr` is expected to be 16-byte aligned.
void _mm_storer_pd (double* mem_addr, __m128d a) pure
{
    __m128d* aligned = cast(__m128d*)mem_addr;
    *aligned = shufflevector!(double2, 1, 0)(a, a);
}
2367 
/// Store 128-bits (composed of 2 packed double-precision (64-bit) floating-point elements)
/// from `a` into memory. `mem_addr` does not need to be aligned on any particular boundary.
void _mm_storeu_pd (double* mem_addr, __m128d a) pure @safe
{
    storeUnaligned!double2(a, mem_addr);
}
2372 
/// Store 128-bits of integer data from `a` into memory. `mem_addr` does not need
/// to be aligned on any particular boundary.
void _mm_storeu_si128 (__m128i* mem_addr, __m128i a) pure @safe
{
    storeUnaligned!__m128i(a, cast(int*)mem_addr);
}
2377 
/// Store 128-bits (composed of 2 packed double-precision (64-bit) floating-point elements)
/// from a into memory using a non-temporal memory hint. mem_addr must be aligned on a 16-byte
/// boundary or a general-protection exception may be generated.
void _mm_stream_pd (double* mem_addr, __m128d a)
{
    // BUG see `_mm_stream_ps` for an explanation why we don't implement non-temporal moves
    *cast(__m128d*)mem_addr = a;
}
2387 
/// Store 128-bits of integer data from a into memory using a non-temporal memory hint.
/// mem_addr must be aligned on a 16-byte boundary or a general-protection exception
/// may be generated.
void _mm_stream_si128 (__m128i* mem_addr, __m128i a)
{
    // BUG see `_mm_stream_ps` for an explanation why we don't implement non-temporal moves
    mem_addr[0] = a;
}
2397 
/// Store 32-bit integer a into memory using a non-temporal hint to minimize cache
/// pollution. If the cache line containing address mem_addr is already in the cache,
/// the cache will be updated.
void _mm_stream_si32 (int* mem_addr, int a)
{
    // BUG see `_mm_stream_ps` for an explanation why we don't implement non-temporal moves
    mem_addr[0] = a;
}
2406 
/// Store 64-bit integer a into memory using a non-temporal hint to minimize
/// cache pollution. If the cache line containing address mem_addr is already
/// in the cache, the cache will be updated.
void _mm_stream_si64 (long* mem_addr, long a)
{
    // BUG See `_mm_stream_ps` for an explanation why we don't implement non-temporal moves
    mem_addr[0] = a;
}
2415 
/// Subtract packed 16-bit integers in `b` from packed 16-bit integers in `a`.
__m128i _mm_sub_epi16(__m128i a, __m128i b) pure @safe
{
    short8 sa = cast(short8)a;
    short8 sb = cast(short8)b;
    return cast(__m128i)(sa - sb);
}
2420 
/// Subtract packed 32-bit integers in `b` from packed 32-bit integers in `a`.
__m128i _mm_sub_epi32(__m128i a, __m128i b) pure @safe
{
    int4 sa = cast(int4)a;
    int4 sb = cast(int4)b;
    return cast(__m128i)(sa - sb);
}
2425 
/// Subtract packed 64-bit integers in `b` from packed 64-bit integers in `a`.
__m128i _mm_sub_epi64(__m128i a, __m128i b) pure @safe
{
    long2 sa = cast(long2)a;
    long2 sb = cast(long2)b;
    return cast(__m128i)(sa - sb);
}
2430 
/// Subtract packed 8-bit integers in `b` from packed 8-bit integers in `a`.
__m128i _mm_sub_epi8(__m128i a, __m128i b) pure @safe
{
    byte16 sa = cast(byte16)a;
    byte16 sb = cast(byte16)b;
    return cast(__m128i)(sa - sb);
}
2435 
/// Subtract packed double-precision (64-bit) floating-point elements in `b` from `a`.
__m128d _mm_sub_pd(__m128d a, __m128d b) pure @safe
{
    return a - b;
}
2440 
/// Subtract the lower double-precision (64-bit) floating-point element in `b` from
/// the lower element in `a`; the upper element of `a` is passed through.
__m128d _mm_sub_sd(__m128d a, __m128d b) pure @safe
{
    a[0] = a[0] - b[0];
    return a;
}
unittest
{
    __m128d v = [1.5, -2.0];
    v = _mm_sub_sd(v, v);
    assert(v.array == [0.0, -2.0]);
}
2452 
2453 
2454 // MMXREG: _mm_sub_si64
2455 
version(LDC)
{
    alias _mm_subs_epi16 = __builtin_ia32_psubsw128;
    alias _mm_subs_epi8 = __builtin_ia32_psubsb128;
    alias _mm_subs_epu16 = __builtin_ia32_psubusw128;
    alias _mm_subs_epu8 = __builtin_ia32_psubusb128;
}
else
{
    // Fallback implementations so these intrinsics also exist for non-LDC
    // compilers (mirrors the `_mm_adds_epi16` polyfill pattern above).

    /// Subtract packed signed 16-bit integers in `b` from `a` using saturation.
    __m128i _mm_subs_epi16(__m128i a, __m128i b) pure @trusted
    {
        short[8] res;
        short8 sa = cast(short8)a;
        short8 sb = cast(short8)b;
        foreach(i; 0..8)
        {
            int sum = sa.array[i] - sb.array[i];
            if (sum > 32767) sum = 32767;   // clamp to short.max
            if (sum < -32768) sum = -32768; // clamp to short.min
            res[i] = cast(short)sum;
        }
        return _mm_loadu_si128(cast(int4*)res.ptr);
    }

    /// Subtract packed signed 8-bit integers in `b` from `a` using saturation.
    __m128i _mm_subs_epi8(__m128i a, __m128i b) pure @trusted
    {
        byte[16] res;
        byte16 sa = cast(byte16)a;
        byte16 sb = cast(byte16)b;
        foreach(i; 0..16)
        {
            int sum = sa.array[i] - sb.array[i];
            if (sum > 127) sum = 127;   // clamp to byte.max
            if (sum < -128) sum = -128; // clamp to byte.min
            res[i] = cast(byte)sum;
        }
        return _mm_loadu_si128(cast(int4*)res.ptr);
    }

    /// Subtract packed unsigned 16-bit integers in `b` from `a` using saturation.
    __m128i _mm_subs_epu16(__m128i a, __m128i b) pure @trusted
    {
        short[8] res;
        short8 sa = cast(short8)a;
        short8 sb = cast(short8)b;
        foreach(i; 0..8)
        {
            int sum = cast(ushort)(sa.array[i]) - cast(ushort)(sb.array[i]);
            if (sum < 0) sum = 0; // unsigned saturation floors at zero
            res[i] = cast(short)sum;
        }
        return _mm_loadu_si128(cast(int4*)res.ptr);
    }

    /// Subtract packed unsigned 8-bit integers in `b` from `a` using saturation.
    __m128i _mm_subs_epu8(__m128i a, __m128i b) pure @trusted
    {
        byte[16] res;
        byte16 sa = cast(byte16)a;
        byte16 sb = cast(byte16)b;
        foreach(i; 0..16)
        {
            int sum = cast(ubyte)(sa.array[i]) - cast(ubyte)(sb.array[i]);
            if (sum < 0) sum = 0; // unsigned saturation floors at zero
            res[i] = cast(byte)sum;
        }
        return _mm_loadu_si128(cast(int4*)res.ptr);
    }
}
unittest
{
    short8 R = cast(short8) _mm_subs_epi16(_mm_setr_epi16(32760, -32760, 5, 4, 3, 2, 1, 0),
                                           _mm_setr_epi16(  -10,     10, 5, 4, 3, 2, 1, 0));
    short[8] correct = [32767, -32768, 0, 0, 0, 0, 0, 0];
    assert(R.array == correct);
}
2463 
// Note: the only difference between the `comi` and `ucomi` comparisons is their
//       signalling behaviour on quiet NaNs (COMISD raises #IA on any NaN, while
//       UCOMISD only does so on signalling NaNs). Aliasing them is therefore
//       technically incorrect, but code that deliberately distinguishes qNaN
//       from sNaN and treats them differently seems extremely rare.
alias _mm_ucomieq_sd = _mm_comieq_sd;
alias _mm_ucomige_sd = _mm_comige_sd;
alias _mm_ucomigt_sd = _mm_comigt_sd;
alias _mm_ucomile_sd = _mm_comile_sd;
alias _mm_ucomilt_sd = _mm_comilt_sd;
alias _mm_ucomineq_sd = _mm_comineq_sd;
2474 
/// Return a vector of 2 doubles with undefined (uninitialized) contents.
__m128d _mm_undefined_pd() pure @safe
{
    __m128d result = void;
    return result;
}
/// Return a 128-bit integer vector with undefined (uninitialized) contents.
__m128i _mm_undefined_si128() pure @safe
{
    __m128i result = void;
    return result;
}
2485 
/// Unpack and interleave 16-bit integers from the high half of `a` and `b`.
__m128i _mm_unpackhi_epi16 (__m128i a, __m128i b) pure @safe
{
    short8 sa = cast(short8)a;
    short8 sb = cast(short8)b;
    return cast(__m128i) shufflevector!(short8, 4, 12, 5, 13, 6, 14, 7, 15)(sa, sb);
}
2491 
/// Unpack and interleave 32-bit integers from the high half of `a` and `b`.
__m128i _mm_unpackhi_epi32 (__m128i a, __m128i b) pure @safe
{
    // `__m128i` is already `int4`, no cast needed.
    return shufflevector!(int4, 2, 6, 3, 7)(a, b);
}
2496 
/// Unpack and interleave 64-bit integers from the high half of `a` and `b`.
__m128i _mm_unpackhi_epi64 (__m128i a, __m128i b) pure @safe
{
    long2 la = cast(long2)a;
    long2 lb = cast(long2)b;
    return cast(__m128i) shufflevector!(long2, 1, 3)(la, lb);
}
2501 
/// Unpack and interleave 8-bit integers from the high half of `a` and `b`.
__m128i _mm_unpackhi_epi8 (__m128i a, __m128i b) pure @safe
{
    byte16 ba = cast(byte16)a;
    byte16 bb = cast(byte16)b;
    return cast(__m128i) shufflevector!(byte16,  8, 24,  9, 25, 10, 26, 11, 27,
                                                12, 28, 13, 29, 14, 30, 15, 31)(ba, bb);
}
2508 
/// Unpack and interleave double-precision elements from the high half of `a` and `b`.
__m128d _mm_unpackhi_pd (__m128d a, __m128d b) pure @safe
{
    // Result = [a1, b1]
    return shufflevector!(__m128d, 1, 3)(a, b);
}
2513 
/// Unpack and interleave 16-bit integers from the low half of `a` and `b`.
__m128i _mm_unpacklo_epi16 (__m128i a, __m128i b) pure @safe
{
    short8 sa = cast(short8)a;
    short8 sb = cast(short8)b;
    return cast(__m128i) shufflevector!(short8, 0, 8, 1, 9, 2, 10, 3, 11)(sa, sb);
}
2519 
/// Unpack and interleave 32-bit integers from the low half of `a` and `b`.
__m128i _mm_unpacklo_epi32 (__m128i a, __m128i b) pure @safe
{
    // `__m128i` is already `int4`, no cast needed.
    return shufflevector!(int4, 0, 4, 1, 5)(a, b);
}
2525 
/// Unpack and interleave 64-bit integers from the low half of `a` and `b`.
__m128i _mm_unpacklo_epi64 (__m128i a, __m128i b) pure @safe
{
    long2 la = cast(long2)a;
    long2 lb = cast(long2)b;
    return cast(__m128i) shufflevector!(long2, 0, 2)(la, lb);
}
2531 
/// Unpack and interleave 8-bit integers from the low half of `a` and `b`.
__m128i _mm_unpacklo_epi8 (__m128i a, __m128i b) pure @safe
{
    byte16 ba = cast(byte16)a;
    byte16 bb = cast(byte16)b;
    return cast(__m128i) shufflevector!(byte16, 0, 16, 1, 17, 2, 18, 3, 19,
                                                4, 20, 5, 21, 6, 22, 7, 23)(ba, bb);
}
2538 
/// Unpack and interleave double-precision elements from the low half of `a` and `b`.
__m128d _mm_unpacklo_pd (__m128d a, __m128d b) pure @safe
{
    // Result = [a0, b0]
    return shufflevector!(__m128d, 0, 2)(a, b);
}
2543 
/// Compute the bitwise XOR of packed double-precision (64-bit) floating-point
/// elements in `a` and `b`.
__m128d _mm_xor_pd (__m128d a, __m128d b) pure @safe
{
    __m128i ia = cast(__m128i)a;
    __m128i ib = cast(__m128i)b;
    return cast(__m128d)(ia ^ ib);
}
2548 
/// Compute the bitwise XOR of 128 bits (representing integer data) in `a` and `b`.
__m128i _mm_xor_si128 (__m128i a, __m128i b) pure @safe
{
    return a ^ b;
}
2553 
unittest
{
    // Euclidean distance between two points in 4D.
    float distance(float[4] p, float[4] q) nothrow @nogc
    {
        __m128 vp = _mm_loadu_ps(p.ptr);
        __m128 vq = _mm_loadu_ps(q.ptr);
        __m128 d = _mm_sub_ps(vp, vq);
        d = _mm_mul_ps(d, d);
        // Horizontal sum of the four squared components.
        __m128 s = _mm_add_ps(d, _mm_srli_ps!8(d));
        s = _mm_add_ps(s, _mm_srli_ps!4(s));
        return _mm_cvtss_f32(_mm_sqrt_ss(s));
    }
    assert(distance([0, 2, 0, 0], [0, 0, 0, 0]) == 2);
}