1 /**
2 * Copyright: Copyright Auburn Sounds 2016-2018.
3 * License:   $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
4 * Authors:   Guillaume Piolat
5 */
6 module inteli.emmintrin;
7 
8 public import inteli.types;
9 public import inteli.xmmintrin; // SSE2 includes SSE1
10 
11 import inteli.internals;
12 
13 nothrow @nogc:
14 
15 // SSE2 instructions
16 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#techs=SSE2
17 
/// Lane-wise sum of `a` and `b`, both reinterpreted as eight 16-bit integers.
__m128i _mm_add_epi16 (__m128i a, __m128i b) pure @safe
{
    short8 lhs = cast(short8) a;
    short8 rhs = cast(short8) b;
    return cast(__m128i)(lhs + rhs);
}
22 
/// Lane-wise sum of `a` and `b`, both viewed as four 32-bit integers.
__m128i _mm_add_epi32 (__m128i a, __m128i b) pure @safe
{
    int4 lhs = cast(int4) a;
    int4 rhs = cast(int4) b;
    return cast(__m128i)(lhs + rhs);
}
27 
/// Lane-wise sum of `a` and `b`, both reinterpreted as two 64-bit integers.
__m128i _mm_add_epi64 (__m128i a, __m128i b) pure @safe
{
    long2 lhs = cast(long2) a;
    long2 rhs = cast(long2) b;
    return cast(__m128i)(lhs + rhs);
}
32 
/// Lane-wise sum of `a` and `b`, both reinterpreted as sixteen 8-bit integers.
__m128i _mm_add_epi8 (__m128i a, __m128i b) pure @safe
{
    byte16 lhs = cast(byte16) a;
    byte16 rhs = cast(byte16) b;
    return cast(__m128i)(lhs + rhs);
}
37 
/// Adds the lower double of `b` to the lower double of `a`.
/// The upper double of the result is the upper double of `a`.
__m128d _mm_add_sd(__m128d a, __m128d b) pure @safe
{
    a[0] = a[0] + b[0];
    return a;
}
unittest
{
    __m128d v = [1.5, -2.0];
    __m128d sum = _mm_add_sd(v, v);
    assert(sum.array == [3.0, -2.0]);
}
49 
50 
/// Adds both doubles of `a` to the matching doubles of `b`.
__m128d _mm_add_pd (__m128d a, __m128d b) pure @safe
{
    __m128d sum = a + b;
    return sum;
}
unittest
{
    __m128d v = [1.5, -2.0];
    __m128d doubled = _mm_add_pd(v, v);
    assert(doubled.array == [3.0, -4.0]);
}
61 
62 // MMXREG: _mm_add_si64
63 
version(LDC)
{
    // LDC: forward directly to the compiler builtin.
    alias _mm_adds_epi16 = __builtin_ia32_paddsw128;
}
else
{
    /// Adds the eight signed 16-bit lanes of `a` and `b`; every sum is passed
    /// through `saturateSignedIntToSignedShort` before being stored.
    __m128i _mm_adds_epi16(__m128i a, __m128i b) pure @trusted
    {
        short8 lhs = cast(short8)a;
        short8 rhs = cast(short8)b;
        short[8] sums;
        for (int lane = 0; lane < 8; ++lane)
        {
            sums[lane] = saturateSignedIntToSignedShort(lhs.array[lane] + rhs.array[lane]);
        }
        return _mm_loadu_si128(cast(int4*)sums.ptr);
    }
}
unittest
{
    __m128i ramp = _mm_set_epi16(7, 6, 5, 4, 3, 2, 1, 0);
    __m128i sameRamp = _mm_set_epi16(7, 6, 5, 4, 3, 2, 1, 0);
    short8 sums = cast(short8) _mm_adds_epi16(ramp, sameRamp);
    static immutable short[8] expected = [0, 2, 4, 6, 8, 10, 12, 14];
    assert(sums.array == expected);
}
87 
version(LDC)
{
    // LDC: forward directly to the compiler builtin.
    alias _mm_adds_epi8 = __builtin_ia32_paddsb128;
}
else
{
    /// Adds the sixteen signed 8-bit lanes of `a` and `b`; every sum is passed
    /// through `saturateSignedWordToSignedByte` before being stored.
    __m128i _mm_adds_epi8(__m128i a, __m128i b) pure @trusted
    {
        byte16 lhs = cast(byte16)a;
        byte16 rhs = cast(byte16)b;
        byte[16] sums;
        for (int lane = 0; lane < 16; ++lane)
        {
            sums[lane] = saturateSignedWordToSignedByte(lhs.array[lane] + rhs.array[lane]);
        }
        return _mm_loadu_si128(cast(int4*)sums.ptr);
    }
}
unittest
{
    __m128i ramp = _mm_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
    __m128i sameRamp = _mm_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
    byte16 sums = cast(byte16) _mm_adds_epi8(ramp, sameRamp);
    static immutable byte[16] expected = [0, 2, 4, 6, 8, 10, 12, 14,
                                          16, 18, 20, 22, 24, 26, 28, 30];
    assert(sums.array == expected);
}
112 
version(LDC)
{
    // LDC: forward directly to the compiler builtin.
    alias _mm_adds_epu8 = __builtin_ia32_paddusb128;
}
else
{
    /// Adds the sixteen 8-bit lanes of `a` and `b` as unsigned values; every sum
    /// is passed through `saturateSignedWordToUnsignedByte` before being stored.
    __m128i _mm_adds_epu8(__m128i a, __m128i b) pure @trusted
    {
        byte16 lhs = cast(byte16)a;
        byte16 rhs = cast(byte16)b;
        ubyte[16] sums;
        for (int lane = 0; lane < 16; ++lane)
        {
            // Lanes are stored as signed bytes; reinterpret them as unsigned before adding.
            sums[lane] = saturateSignedWordToUnsignedByte(cast(ubyte)(lhs.array[lane]) + cast(ubyte)(rhs.array[lane]));
        }
        return _mm_loadu_si128(cast(int4*)sums.ptr);
    }
}
129 
version(LDC)
{
    // LDC: forward directly to the compiler builtin.
    alias _mm_adds_epu16 = __builtin_ia32_paddusw128;
}
else
{
    /// Adds the eight 16-bit lanes of `a` and `b` as unsigned values; every sum
    /// is passed through `saturateSignedIntToUnsignedShort` before being stored.
    __m128i _mm_adds_epu16(__m128i a, __m128i b) pure @trusted
    {
        short8 lhs = cast(short8)a;
        short8 rhs = cast(short8)b;
        ushort[8] sums;
        for (int lane = 0; lane < 8; ++lane)
        {
            // Lanes are stored as signed shorts; reinterpret them as unsigned before adding.
            sums[lane] = saturateSignedIntToUnsignedShort(cast(ushort)(lhs.array[lane]) + cast(ushort)(rhs.array[lane]));
        }
        return _mm_loadu_si128(cast(int4*)sums.ptr);
    }
}
146 
/// Bitwise AND of `a` and `b`, performed on their integer reinterpretation.
__m128d _mm_and_pd (__m128d a, __m128d b) pure @safe
{
    __m128i bitsA = cast(__m128i) a;
    __m128i bitsB = cast(__m128i) b;
    return cast(__m128d)(bitsA & bitsB);
}
151 
/// Bitwise AND of the 128 bits of `a` and `b`.
__m128i _mm_and_si128 (__m128i a, __m128i b) pure @safe
{
    __m128i masked = a & b;
    return masked;
}
156 
/// Computes `(~a) & b` on the integer reinterpretation of both operands.
__m128d _mm_andnot_pd (__m128d a, __m128d b) pure @safe
{
    __m128i invertedA = ~cast(__m128i) a;
    __m128i bitsB = cast(__m128i) b;
    return cast(__m128d)(invertedA & bitsB);
}
161 
/// Computes `(~a) & b` over all 128 bits.
__m128i _mm_andnot_si128 (__m128i a, __m128i b) pure @safe
{
    __m128i invertedA = ~a;
    return invertedA & b;
}
166 
version(LDC)
{
    // Declared straight from the LLVM SSE2 "pavg" intrinsics.
    // Note these take and return short8 / byte16 rather than __m128i.

    /// Binding for `llvm.x86.sse2.pavg.w` (16-bit lanes).
    pragma(LDC_intrinsic, "llvm.x86.sse2.pavg.w")
    short8 _mm_avg_epu16(short8, short8) pure @safe;

    /// Binding for `llvm.x86.sse2.pavg.b` (8-bit lanes).
    pragma(LDC_intrinsic, "llvm.x86.sse2.pavg.b")
    byte16 _mm_avg_epu8(byte16, byte16) pure @safe;
}
175 // TODO
176 
177 
178 // TODO: __m128i _mm_bslli_si128 (__m128i a, int imm8)
179 // TODO: __m128i _mm_bsrli_si128 (__m128i a, int imm8)
180 
/// Reinterprets the bits of a double vector as a float vector (no conversion).
__m128 _mm_castpd_ps (__m128d a) pure @safe
{
    __m128 asFloats = cast(__m128) a;
    return asFloats;
}
185 
/// Reinterprets the bits of a double vector as an integer vector (no conversion).
__m128i _mm_castpd_si128 (__m128d a) pure @safe
{
    __m128i asInts = cast(__m128i) a;
    return asInts;
}
190 
/// Reinterprets the bits of a float vector as a double vector (no conversion).
__m128d _mm_castps_pd (__m128 a) pure @safe
{
    __m128d asDoubles = cast(__m128d) a;
    return asDoubles;
}
195 
/// Reinterprets the bits of a float vector as an integer vector (no conversion).
__m128i _mm_castps_si128 (__m128 a) pure @safe
{
    __m128i asInts = cast(__m128i) a;
    return asInts;
}
200 
/// Reinterprets the bits of an integer vector as a double vector (no conversion).
__m128d _mm_castsi128_pd (__m128i a) pure @safe
{
    __m128d asDoubles = cast(__m128d) a;
    return asDoubles;
}
205 
/// Reinterprets the bits of an integer vector as a float vector (no conversion).
__m128 _mm_castsi128_ps (__m128i a) pure @safe
{
    __m128 asFloats = cast(__m128) a;
    return asFloats;
}
210 
version(LDC)
{
    // Only available on LDC, where it maps onto the compiler builtin.
    alias _mm_clflush = __builtin_ia32_clflush;
}
215 // TODO
216 
version(LDC)
{
    // Local binding of the LLVM packed-double compare intrinsic.
    // The third argument is the comparison predicate.
    // Originally introduced just for the "ord" intrinsics.
    pragma(LDC_intrinsic, "llvm.x86.sse2.cmp.pd")
    double2 __builtin_ia32_cmppd(double2, double2, byte) pure @safe;
}
223 
/// Compares the eight 16-bit lanes of `a` and `b` for equality via `equalMask`.
/// As the unittest shows, equal lanes yield -1 and different lanes yield 0.
__m128i _mm_cmpeq_epi16 (__m128i a, __m128i b) pure @safe
{
    short8 lhs = cast(short8) a;
    short8 rhs = cast(short8) b;
    return cast(__m128i) equalMask!short8(lhs, rhs);
}
unittest
{
    short8   lhs      = [-3, -2, -1,  0,  0,  1,  2,  3];
    short8   rhs      = [ 4,  3,  2,  1,  0, -1, -2, -3];
    short[8] expected = [ 0,  0,  0,  0, -1,  0,  0,  0];
    short8   mask     = cast(short8)(_mm_cmpeq_epi16(cast(__m128i)lhs, cast(__m128i)rhs));
    assert(mask.array == expected);
}
236 
/// Compares the four 32-bit lanes of `a` and `b` for equality via `equalMask`.
/// Equal lanes yield -1 (all bits set) and different lanes yield 0.
__m128i _mm_cmpeq_epi32 (__m128i a, __m128i b) pure @safe
{
    return equalMask!__m128i(a, b);
}
unittest
{
    int4   A = [-3, -2, -1,  0];
    int4   B = [ 4, -2,  2,  0];
    int[4] E = [ 0, -1,  0, -1];
    // Exercise the 32-bit comparison itself (this test used to call the 16-bit variant).
    int4   R = cast(int4)(_mm_cmpeq_epi32(A, B));
    assert(R.array == E);
}
249 
/// Compares the sixteen 8-bit lanes of `a` and `b` for equality via `equalMask`.
/// Equal lanes yield -1 and different lanes yield 0.
__m128i _mm_cmpeq_epi8 (__m128i a, __m128i b) pure @safe
{
    return cast(__m128i) equalMask!byte16(cast(byte16)a, cast(byte16)b);
}
unittest
{
    __m128i A = _mm_setr_epi8(1, 2, 3, 1, 2, 1, 1, 2, 3, 2, 1, 0, 0, 1, 2, 1);
    __m128i B = _mm_setr_epi8(2, 2, 1, 2, 3, 1, 2, 3, 2, 1, 0, 0, 1, 2, 1, 1);
    byte16 C = cast(byte16) _mm_cmpeq_epi8(A, B);
    byte[16] correct =       [0,-1, 0, 0, 0,-1, 0, 0, 0, 0, 0,-1, 0, 0, 0, -1];
    assert(C.array == correct);
}
263 
264 
version(LDC)
{
    /// Per-lane `a == b` on both doubles, via `equalMask`.
    __m128d _mm_cmpeq_pd (__m128d a, __m128d b) pure @safe // TODO
    {
        return cast(__m128d) equalMask!double2(a, b);
    }

    /// Lower lane: `a == b` (cmpsd predicate 0). Upper lane comes from `a`.
    __m128d _mm_cmpeq_sd (__m128d a, __m128d b) pure @safe // TODO
    {
        return __builtin_ia32_cmpsd(a, b, 0);
    }

    /// Per-lane `a >= b` on both doubles, via `greaterOrEqualMask`.
    __m128d _mm_cmpge_pd (__m128d a, __m128d b) pure @safe // TODO
    {
        return cast(__m128d) greaterOrEqualMask!double2(a, b);
    }

    /// Lower lane: `a >= b`. Upper lane is copied from `a`.
    __m128d _mm_cmpge_sd (__m128d a, __m128d b) pure @safe // TODO
    {
        // `a >= b` is evaluated as `b <= a` (predicate 2) with swapped operands.
        // cmpsd passes through the upper lane of its FIRST operand, which is `b`
        // here, so the upper lane of `a` has to be restored afterwards.
        __m128d r = __builtin_ia32_cmpsd(b, a, 2);
        r[1] = a[1];
        return r;
    }
    unittest
    {
        __m128d A = _mm_setr_pd(3.0, 10.0);
        __m128d B = _mm_setr_pd(2.0, 20.0);
        __m128d R = _mm_cmpge_sd(A, B);
        long2 M = cast(long2) R;
        assert(M.array[0] == -1);   // 3.0 >= 2.0
        assert(R.array[1] == 10.0); // upper lane taken from A, not B
    }
}
287 
288 
/// Signed per-lane `a > b` over eight 16-bit lanes, via `greaterMask`.
/// As the unittest shows, true lanes yield -1 and false lanes yield 0.
__m128i _mm_cmpgt_epi16 (__m128i a, __m128i b) pure @safe
{
    short8 lhs = cast(short8) a;
    short8 rhs = cast(short8) b;
    return cast(__m128i)( greaterMask!short8(lhs, rhs) );
}
unittest
{
    short8   lhs      = [-3, -2, -1,  0,  0,  1,  2,  3];
    short8   rhs      = [ 4,  3,  2,  1,  0, -1, -2, -3];
    short[8] expected = [ 0,  0,  0,  0,  0, -1, -1, -1];
    short8   mask     = cast(short8)(_mm_cmpgt_epi16(cast(__m128i)lhs, cast(__m128i)rhs));
    assert(mask.array == expected);
}
301 
/// Signed per-lane `a > b` over four 32-bit lanes, via `greaterMask`.
/// As the unittest shows, true lanes yield -1 and false lanes yield 0.
__m128i _mm_cmpgt_epi32 (__m128i a, __m128i b) pure @safe
{
    auto mask = greaterMask!int4(a, b);
    return cast(__m128i)( mask );
}
unittest
{
    int4   lhs      = [-3,  2, -1,  0];
    int4   rhs      = [ 4, -2,  2,  0];
    int[4] expected = [ 0, -1,  0,  0];
    int4   mask     = cast(int4)(_mm_cmpgt_epi32(lhs, rhs));
    assert(mask.array == expected);
}
314 
/// Signed per-lane `a > b` over sixteen 8-bit lanes, via `greaterMask`.
/// True lanes yield -1 and false lanes yield 0.
__m128i _mm_cmpgt_epi8 (__m128i a, __m128i b) pure @safe
{
    return cast(__m128i)( greaterMask!byte16(cast(byte16)a, cast(byte16)b));
}
unittest
{
    __m128i A = _mm_setr_epi8(1, 2, 3, 1, 2, 1, 1, 2, 3, 2, 1, 0, 0, 1, 2, 1);
    __m128i B = _mm_setr_epi8(2, 2, 1, 2, 3, 1, 2, 3, 2, 1, 0, 0, 1, 2, 1, 1);
    byte16 C = cast(byte16) _mm_cmpgt_epi8(A, B);
    byte[16] correct =       [0, 0,-1, 0, 0, 0, 0, 0,-1,-1,-1, 0, 0, 0,-1, 0];
    assert(C.array == correct);
}
328 
version(LDC)
{
    /// Per-lane `a > b` on both doubles, via `greaterMask`.
    __m128d _mm_cmpgt_pd (__m128d a, __m128d b) pure @safe // TODO
    {
        return cast(__m128d) greaterMask!double2(a, b);
    }

    /// Lower lane: `a > b`. Upper lane is copied from `a`.
    __m128d _mm_cmpgt_sd (__m128d a, __m128d b) pure @safe // TODO
    {
        // `a > b` is evaluated as `b < a` (predicate 1) with swapped operands.
        // cmpsd passes through the upper lane of its FIRST operand, which is `b`
        // here, so the upper lane of `a` has to be restored afterwards.
        __m128d r = __builtin_ia32_cmpsd(b, a, 1);
        r[1] = a[1];
        return r;
    }
    unittest
    {
        __m128d A = _mm_setr_pd(3.0, 10.0);
        __m128d B = _mm_setr_pd(2.0, 20.0);
        __m128d R = _mm_cmpgt_sd(A, B);
        long2 M = cast(long2) R;
        assert(M.array[0] == -1);   // 3.0 > 2.0
        assert(R.array[1] == 10.0); // upper lane taken from A, not B
    }

    /// Per-lane `a <= b`, evaluated as `b >= a` through `greaterOrEqualMask`.
    __m128d _mm_cmple_pd (__m128d a, __m128d b) pure @safe // TODO
    {
        return cast(__m128d) greaterOrEqualMask!double2(b, a);
    }

    /// Lower lane: `a <= b` (cmpsd predicate 2). Upper lane comes from `a`.
    __m128d _mm_cmple_sd (__m128d a, __m128d b) pure @safe // TODO
    {
        return __builtin_ia32_cmpsd(a, b, 2);
    }
}
352 
353 
/// Signed per-lane `a < b` over 16-bit lanes, computed as `b > a`.
__m128i _mm_cmplt_epi16 (__m128i a, __m128i b) pure @safe
{
    __m128i mask = _mm_cmpgt_epi16(b, a);
    return mask;
}
358 
/// Signed per-lane `a < b` over 32-bit lanes, computed as `b > a`.
__m128i _mm_cmplt_epi32 (__m128i a, __m128i b) pure @safe
{
    __m128i mask = _mm_cmpgt_epi32(b, a);
    return mask;
}
363 
/// Signed per-lane `a < b` over 8-bit lanes, computed as `b > a`.
__m128i _mm_cmplt_epi8 (__m128i a, __m128i b) pure @safe
{
    __m128i mask = _mm_cmpgt_epi8(b, a);
    return mask;
}
368 
version(LDC)
{
    // Predicate immediates used below follow the SSE CMPPD/CMPSD encoding:
    //   0 = EQ, 1 = LT, 2 = LE, 3 = UNORD, 4 = NEQ, 5 = NLT, 6 = NLE, 7 = ORD.
    // The scalar (_sd) compare passes through the upper lane of its FIRST operand,
    // so whenever the operands are swapped the upper lane of `a` is restored by hand.

    /// Per-lane `a < b`, evaluated as `b > a` through `greaterMask`.
    __m128d _mm_cmplt_pd (__m128d a, __m128d b) pure @safe // TODO
    {
        return cast(__m128d) greaterMask!double2(b, a);
    }

    /// Lower lane: `a < b`. Upper lane comes from `a`.
    __m128d _mm_cmplt_sd (__m128d a, __m128d b) pure @safe // TODO
    {
        return __builtin_ia32_cmpsd(a, b, 1);
    }

    /// Per-lane `a != b`, via `notEqualMask`.
    __m128d _mm_cmpneq_pd (__m128d a, __m128d b) pure @safe // TODO
    {
        return cast(__m128d) notEqualMask!double2(a, b);
    }

    /// Lower lane: `a != b`. Upper lane comes from `a`.
    __m128d _mm_cmpneq_sd (__m128d a, __m128d b) pure @safe // TODO
    {
        return __builtin_ia32_cmpsd(a, b, 4);
    }

    /// Per-lane `!(a >= b)`: true when `a < b` or either operand is NaN.
    __m128d _mm_cmpnge_pd (__m128d a, __m128d b) pure @safe // TODO
    {
        // !(a >= b) == !(b <= a) == NLE(b, a)
        return __builtin_ia32_cmppd(b, a, 6);
    }

    /// Lower lane: `!(a >= b)`. Upper lane is copied from `a`.
    __m128d _mm_cmpnge_sd (__m128d a, __m128d b) pure @safe // TODO
    {
        __m128d r = __builtin_ia32_cmpsd(b, a, 6);
        r[1] = a[1]; // operands were swapped: put back the upper lane of `a`
        return r;
    }

    /// Per-lane `!(a > b)`: true when `a <= b` or either operand is NaN.
    __m128d _mm_cmpngt_pd (__m128d a, __m128d b) pure @safe // TODO
    {
        // !(a > b) == !(b < a) == NLT(b, a)
        return __builtin_ia32_cmppd(b, a, 5);
    }

    /// Lower lane: `!(a > b)`. Upper lane is copied from `a`.
    __m128d _mm_cmpngt_sd (__m128d a, __m128d b) pure @safe // TODO
    {
        __m128d r = __builtin_ia32_cmpsd(b, a, 5);
        r[1] = a[1]; // operands were swapped: put back the upper lane of `a`
        return r;
    }

    /// Per-lane `!(a <= b)`: true when `a > b` or either operand is NaN.
    __m128d _mm_cmpnle_pd (__m128d a, __m128d b) pure @safe // TODO
    {
        return __builtin_ia32_cmppd(a, b, 6);
    }

    /// Lower lane: `!(a <= b)`. Upper lane comes from `a`.
    __m128d _mm_cmpnle_sd (__m128d a, __m128d b) pure @safe // TODO
    {
        return __builtin_ia32_cmpsd(a, b, 6);
    }

    /// Per-lane `!(a < b)`: true when `a >= b` or either operand is NaN.
    __m128d _mm_cmpnlt_pd (__m128d a, __m128d b) pure @safe // TODO
    {
        return __builtin_ia32_cmppd(a, b, 5);
    }

    /// Lower lane: `!(a < b)`. Upper lane comes from `a`.
    __m128d _mm_cmpnlt_sd (__m128d a, __m128d b) pure @safe // TODO
    {
        return __builtin_ia32_cmpsd(a, b, 5);
    }

    /// Per-lane "ordered" test: true when neither operand is NaN.
    __m128d _mm_cmpord_pd (__m128d a, __m128d b) pure @safe // TODO
    {
        return __builtin_ia32_cmppd(a, b, 7);
    }

    /// Lower lane: "ordered" test. Upper lane comes from `a`.
    __m128d _mm_cmpord_sd (__m128d a, __m128d b) pure @safe // TODO
    {
        return __builtin_ia32_cmpsd(a, b, 7);
    }

    /// Per-lane "unordered" test: true when at least one operand is NaN.
    __m128d _mm_cmpunord_pd (__m128d a, __m128d b) pure @safe // TODO
    {
        return __builtin_ia32_cmppd(a, b, 3);
    }

    /// Lower lane: "unordered" test. Upper lane comes from `a`.
    __m128d _mm_cmpunord_sd (__m128d a, __m128d b) pure @safe // TODO
    {
        return __builtin_ia32_cmpsd(a, b, 3);
    }

    unittest
    {
        // lane 0: 1.0 vs 2.0 (a < b), lane 1: 3.0 vs 2.0 (a > b)
        __m128d A = _mm_setr_pd(1.0, 3.0);
        __m128d B = _mm_setr_pd(2.0, 2.0);
        long[2] lowOnly  = [-1,  0];
        long[2] highOnly = [ 0, -1];

        long2 nge = cast(long2) _mm_cmpnge_pd(A, B);
        long2 ngt = cast(long2) _mm_cmpngt_pd(A, B);
        long2 nle = cast(long2) _mm_cmpnle_pd(A, B);
        long2 nlt = cast(long2) _mm_cmpnlt_pd(A, B);
        assert(nge.array == lowOnly);
        assert(ngt.array == lowOnly);
        assert(nle.array == highOnly);
        assert(nlt.array == highOnly);
    }

    unittest
    {
        __m128d A = _mm_setr_pd(1.0, 10.0);
        __m128d B = _mm_setr_pd(2.0, 20.0);

        __m128d nge = _mm_cmpnge_sd(A, B);
        long2 ngeBits = cast(long2) nge;
        assert(ngeBits.array[0] == -1); // !(1.0 >= 2.0)
        assert(nge.array[1] == 10.0);   // upper lane taken from A

        __m128d ngt = _mm_cmpngt_sd(A, B);
        long2 ngtBits = cast(long2) ngt;
        assert(ngtBits.array[0] == -1); // !(1.0 > 2.0)
        assert(ngt.array[1] == 10.0);   // upper lane taken from A
    }
}
451 
version(LDC)
{
    // Scalar-double "comi" comparisons, each mapped onto the matching LDC builtin.
    // TODO: these have no non-LDC implementation yet.
    alias _mm_comieq_sd  = __builtin_ia32_comisdeq;
    alias _mm_comige_sd  = __builtin_ia32_comisdge;
    alias _mm_comigt_sd  = __builtin_ia32_comisdgt;
    alias _mm_comile_sd  = __builtin_ia32_comisdle;
    alias _mm_comilt_sd  = __builtin_ia32_comisdlt;
    alias _mm_comineq_sd = __builtin_ia32_comisdneq;
}
461 
462 // TODO: alias _mm_cvtepi32_pd = __builtin_ia32_cvtdq2pd;
463 
464 // PERF: replace with __builtin_convertvector when available
/// Converts each of the four 32-bit integers of `a` to a float, lane by lane.
__m128 _mm_cvtepi32_ps(__m128i a) pure @safe
{
    const int[4] ints = a.array;
    __m128 floats;
    floats.array[0] = cast(float) ints[0];
    floats.array[1] = cast(float) ints[1];
    floats.array[2] = cast(float) ints[2];
    floats.array[3] = cast(float) ints[3];
    return floats;
}
unittest
{
    __m128i ints = _mm_setr_epi32(-1, 0, 1, 1000);
    __m128 floats = _mm_cvtepi32_ps(ints);
    assert(floats.array == [-1.0f, 0.0f, 1.0f, 1000.0f]);
}
479 
480 
version(LDC) // TODO
{
    // Packed double -> 32-bit integer conversion, mapped onto the LDC builtin.
    alias _mm_cvtpd_epi32 = __builtin_ia32_cvtpd2dq;
}
485 
486 // MMXREG: _mm_cvtpd_pi32
version(LDC)
{
    // Packed double -> packed float conversion, mapped onto the LDC builtin.
    alias _mm_cvtpd_ps = __builtin_ia32_cvtpd2ps;

    // MMXREG: _mm_cvtpi32_pd

    // Packed float -> 32-bit integer conversion, mapped onto the LDC builtin.
    alias _mm_cvtps_epi32 = __builtin_ia32_cvtps2dq;
}
493 // TODO
494 
495 // TODO: alias _mm_cvtps_pd = __builtin_ia32_cvtps2pd;
496 
/// Returns the lower double (element 0) of `a`.
double _mm_cvtsd_f64 (__m128d a) pure @safe
{
    double low = extractelement!(double2, 0)(a);
    return low;
}
501 
version(LDC)
{
    // Scalar double -> integer conversions, mapped onto the LDC builtins.
    alias _mm_cvtsd_si32 = __builtin_ia32_cvtsd2si;
    alias _mm_cvtsd_si64 = __builtin_ia32_cvtsd2si64;

    // Alternate spelling of the 64-bit variant.
    alias _mm_cvtsd_si64x = _mm_cvtsd_si64;
}
508 // TODO
509 
version(LDC)
{
    // Scalar double -> scalar float conversion, mapped onto the LDC builtin.
    alias _mm_cvtsd_ss = __builtin_ia32_cvtsd2ss;
}
514 // TODO
515 
/// Returns the lowest 32-bit integer (element 0) of `a`.
int _mm_cvtsi128_si32 (__m128i a) pure @safe
{
    int low = a[0];
    return low;
}
520 
/// Returns the lower 64-bit integer of `a`, after viewing it as two longs.
long _mm_cvtsi128_si64 (__m128i a) pure @safe
{
    long2 asLongs = cast(long2) a;
    long low = asLongs[0];
    return low;
}
/// Alternate spelling of `_mm_cvtsi128_si64`.
alias _mm_cvtsi128_si64x = _mm_cvtsi128_si64;
527 
/// Replaces the lower double of `v` with `x` converted to double;
/// the upper double of `v` is returned unchanged.
__m128d _mm_cvtsi32_sd(__m128d v, int x) pure @safe
{
    const double converted = cast(double) x;
    v[0] = converted;
    return v;
}
unittest
{
    __m128d zeros = _mm_set1_pd(0.0f);
    __m128d result = _mm_cvtsi32_sd(zeros, 42);
    assert(result.array == [42.0, 0]);
}
538 
/// Builds a vector whose element 0 is `a` and whose other three elements are zero.
__m128i _mm_cvtsi32_si128 (int a) pure @safe
{
    int4 lanes = [0, 0, 0, 0];
    lanes[0] = a;
    return lanes;
}
545 
546 // Note: on macOS, using "llvm.x86.sse2.cvtsi642sd" was buggy
/// Replaces the lower double of `v` with `x` converted to double;
/// the upper double of `v` is returned unchanged.
__m128d _mm_cvtsi64_sd(__m128d v, long x) pure @safe
{
    const double converted = cast(double) x;
    v[0] = converted;
    return v;
}
unittest
{
    __m128d zeros = _mm_set1_pd(0.0f);
    __m128d result = _mm_cvtsi64_sd(zeros, 42);
    assert(result.array == [42.0, 0]);
}
557 
/// Builds a vector whose lower 64 bits hold `a` and whose upper 64 bits are zero.
__m128i _mm_cvtsi64_si128 (long a) pure @safe
{
    long2 lanes = [0, 0];
    lanes[0] = a;
    return cast(__m128i)(lanes);
}

// Alternate "64x" spellings of the two 64-bit conversions.
alias _mm_cvtsi64x_sd = _mm_cvtsi64_sd;
alias _mm_cvtsi64x_si128 = _mm_cvtsi64_si128;
567 
/// Replaces the lower double of `v` with the lowest float of `x` widened to double;
/// the upper double of `v` is returned unchanged.
double2 _mm_cvtss_sd(double2 v, float4 x) pure @safe
{
    const double widened = x[0];
    v[0] = widened;
    return v;
}
unittest
{
    __m128d zeros = _mm_set1_pd(0.0f);
    __m128d result = _mm_cvtss_sd(zeros, _mm_set1_ps(42.0f));
    assert(result.array == [42.0, 0]);
}
578 
version(LDC)
{
    // "cvtt" conversions, each mapped onto the matching LDC builtin.
    alias _mm_cvttpd_epi32 = __builtin_ia32_cvttpd2dq;
    //MMXREG: _mm_cvttpd_pi32
    alias _mm_cvttps_epi32 = __builtin_ia32_cvttps2dq;
    alias _mm_cvttsd_si32  = __builtin_ia32_cvttsd2si;
    alias _mm_cvttsd_si64  = __builtin_ia32_cvttsd2si64;

    // Alternate spelling of the 64-bit variant.
    alias _mm_cvttsd_si64x = _mm_cvttsd_si64;
}
588 // TODO
589 
590 
591 
/// Divides both doubles of `a` by the matching doubles of `b`.
__m128d _mm_div_pd(__m128d a, __m128d b) pure @safe
{
    return a / b;
}
unittest
{
    __m128d a = [6.0, 4.5];
    __m128d b = [2.0, 1.5];
    a = _mm_div_pd(a, b);
    assert(a.array == [3.0, 3.0]);
}

/// Double-precision division under its historical (misspelled) name.
/// Kept only so existing callers keep compiling; prefer `_mm_div_pd`.
__m128d _mm_div_ps(__m128d a, __m128d b) pure @safe
{
    return _mm_div_pd(a, b);
}
596 
/// Divides the lower double of `a` by the lower double of `b`;
/// the upper double of `a` is returned unchanged.
__m128d _mm_div_sd(__m128d a, __m128d b) pure @safe
{
    a[0] = a[0] / b[0];
    return a;
}
unittest
{
    __m128d v = [2.0, 4.5];
    __m128d quotient = _mm_div_sd(v, v);
    assert(quotient.array == [1.0, 4.5]);
}
608 
/// Returns the 16-bit lane of `a` selected by the low 3 bits of `imm8`,
/// zero-extended to `int` (the upper 16 bits of the result are always 0).
int _mm_extract_epi16(int imm8)(__m128i a) pure @safe
{
    short8 lanes = cast(short8) a;
    // Go through ushort so that negative lanes are zero-extended, not sign-extended.
    return cast(ushort)(lanes.array[imm8 & 7]);
}
unittest
{
    __m128i A = _mm_set_epi16(7, 6, 5, 4, 3, 2, 1, -1);
    assert(_mm_extract_epi16!0(A) == 65535);
    assert(_mm_extract_epi16!7(A) == 7);
}
613 
/// Returns a copy of `a` in which the 16-bit lane selected by the low 3 bits of
/// `imm8` is replaced by the low 16 bits of `i`.
__m128i _mm_insert_epi16(int imm8)(__m128i a, int i) pure @safe
{
    short8 lanes = cast(short8) a;
    lanes.array[imm8 & 7] = cast(short) i; // explicit truncation to 16 bits
    return cast(__m128i) lanes;
}
unittest
{
    __m128i A = _mm_set_epi16(7, 6, 5, 4, 3, 2, 1, 0);
    short8 R = cast(short8) _mm_insert_epi16!3(A, 42);
    short[8] correct = [0, 1, 2, 42, 4, 5, 6, 7];
    assert(R.array == correct);
}
618 
version(LDC)
{
    // Only available on LDC, where it maps onto the compiler builtin.
    alias _mm_lfence = __builtin_ia32_lfence;
}
623 // TODO
624 
625 
/// Loads two doubles by reinterpreting `mem_addr` as a pointer to `__m128d`
/// and dereferencing it.
/// NOTE(review): presumably `mem_addr` must be 16-byte aligned - confirm.
__m128d _mm_load_pd (const(double) * mem_addr) pure
{
    return *cast(__m128d*) mem_addr;
}
631 
/// Reads one double from `mem_addr` and returns it in both elements.
__m128d _mm_load_pd1 (const(double)* mem_addr) pure
{
    double[2] both = [*mem_addr, *mem_addr];
    double* first = &both[0];
    return loadUnaligned!(double2)(first);
}
637 
/// Reads one double from `mem_addr` into element 0; element 1 is zero.
__m128d _mm_load_sd (const(double)* mem_addr) pure @safe
{
    double2 result = [0, 0];
    const double low = *mem_addr;
    result[0] = low;
    return result;
}
unittest
{
    double value = -42;
    __m128d loaded = _mm_load_sd(&value);
    assert(loaded.array == [-42.0, 0.0]);
}
650 
/// Loads 128 bits by dereferencing `mem_addr` directly.
__m128i _mm_load_si128 (const(__m128i)* mem_addr) pure @trusted
{
    __m128i loaded = *mem_addr;
    return loaded;
}
655 
/// Alternate spelling of `_mm_load_pd1`.
alias _mm_load1_pd = _mm_load_pd1;
657 
/// Returns `a` with its upper double (element 1) replaced by the double at `mem_addr`.
__m128d _mm_loadh_pd (__m128d a, const(double)* mem_addr) pure @safe
{
    const double high = *mem_addr;
    a[1] = high;
    return a;
}
663 
// Note: strange signature since the memory doesn't have to be aligned
/// Reads one 64-bit integer from `mem_addr` into the lower half of the result;
/// the upper half is zero.
__m128i _mm_loadl_epi64 (const(__m128i)* mem_addr) pure @safe
{
    auto source = cast(const(long)*) mem_addr;
    long2 lanes = [0, 0];
    lanes[0] = *source;
    return cast(__m128i)(lanes);
}
672 
/// Returns `a` with its lower double (element 0) replaced by the double at `mem_addr`.
__m128d _mm_loadl_pd (__m128d a, const(double)* mem_addr) pure @safe
{
    const double low = *mem_addr;
    a[0] = low;
    return a;
}
678 
/// Loads two doubles with `_mm_load_pd` and returns them with their order swapped.
__m128d _mm_loadr_pd (const(double)* mem_addr) pure @trusted
{
    __m128d inOrder = _mm_load_pd(mem_addr);
    __m128d swapped = shufflevector!(__m128d, 1, 0)(inOrder, inOrder);
    return swapped;
}
684 
/// Loads two doubles from `mem_addr` through `loadUnaligned`.
__m128d _mm_loadu_pd (const(double)* mem_addr) pure @safe
{
    double2 loaded = loadUnaligned!(double2)(mem_addr);
    return loaded;
}
689 
/// Loads 128 bits from `mem_addr` through `loadUnaligned`.
__m128i _mm_loadu_si128 (const(__m128i)* mem_addr) pure @trusted
{
    int* source = cast(int*) mem_addr;
    return loadUnaligned!(__m128i)(source);
}
694 
/// Reads one 32-bit integer from `mem_addr` into element 0; the other elements are zero.
__m128i _mm_loadu_si32 (const(void)* mem_addr) pure @trusted
{
    const int low = *cast(int*)(mem_addr);
    int4 lanes = [0, 0, 0, 0];
    lanes[0] = low;
    return lanes;
}
unittest
{
    int value = 42;
    __m128i loaded = _mm_loadu_si32(&value);
    int[4] expected = [42, 0, 0, 0];
    assert(loaded.array == expected);
}
709 
version(LDC)
{
    // --- builtins that are available directly -------------------------------

    alias _mm_madd_epi16 = __builtin_ia32_pmaddwd128;

    alias _mm_maskmoveu_si128 = __builtin_ia32_maskmovdqu;

    // --- max ----------------------------------------------------------------

    // Declared here from the LLVM intrinsic, then exposed under the Intel name.
    pragma(LDC_intrinsic, "llvm.x86.sse2.pmaxs.w")
    short8 __builtin_ia32_pmaxsw128(short8, short8) pure @safe;
    alias _mm_max_epi16 = __builtin_ia32_pmaxsw128;

    pragma(LDC_intrinsic, "llvm.x86.sse2.pmaxu.b")
    byte16 __builtin_ia32_pmaxub128(byte16, byte16) pure @safe;
    alias _mm_max_epu8 = __builtin_ia32_pmaxub128;

    alias _mm_max_pd = __builtin_ia32_maxpd;
    alias _mm_max_sd = __builtin_ia32_maxsd;

    // --- fence --------------------------------------------------------------

    alias _mm_mfence = __builtin_ia32_mfence;

    // --- min ----------------------------------------------------------------

    // Declared here from the LLVM intrinsic, then exposed under the Intel name.
    pragma(LDC_intrinsic, "llvm.x86.sse2.pmins.w")
    short8 __builtin_ia32_pminsw128(short8, short8) pure @safe;
    alias _mm_min_epi16 = __builtin_ia32_pminsw128;

    pragma(LDC_intrinsic, "llvm.x86.sse2.pminu.b")
    byte16 __builtin_ia32_pminub128(byte16, byte16) pure @safe;
    alias _mm_min_epu8 = __builtin_ia32_pminub128;

    alias _mm_min_pd = __builtin_ia32_minpd;
    alias _mm_min_sd = __builtin_ia32_minsd;
}
740 // TODO
741 
/// Keeps the lower 64 bits of `a` and zeroes the upper 64 bits.
__m128i _mm_move_epi64 (__m128i a) pure @safe
{
    long2 source = cast(long2) a;
    long2 lowOnly = [ 0, 0 ];
    lowOnly[0] = source[0];
    return cast(__m128i)(lowOnly);
}
unittest
{
    long2 input = [13, 47];
    long2 moved = cast(long2) _mm_move_epi64( cast(__m128i)input );
    long[2] expected = [13, 0];
    assert(moved.array == expected);
}
756 
/// Returns a vector whose lower double comes from `b` and whose upper double comes from `a`.
__m128d _mm_move_sd (__m128d a, __m128d b) pure @safe
{
    const double upperOfA = a[1];
    b[1] = upperOfA;
    return b;
}
unittest
{
    double2 first  = [13.0, 47.0];
    double2 second = [34.0, 58.0];
    double2 merged = _mm_move_sd(first, second);
    double[2] expected = [34.0, 47.0];
    assert(merged.array == expected);
}
770 
version(LDC)
{
    // Mask-extraction intrinsics, each mapped onto the matching LDC builtin.
    alias _mm_movemask_epi8 = __builtin_ia32_pmovmskb128;
    alias _mm_movemask_pd   = __builtin_ia32_movmskpd;
}
776 
777 // MMXREG: _mm_movepi64_pi64
778 // MMXREG: __m128i _mm_movpi64_epi64 (__m64 a)
779 
780 // PERF: unfortunately, __builtin_ia32_pmuludq128 disappeared from LDC
781 // but seems there in clang
/// Multiplies elements 0 and 2 of `a` by elements 0 and 2 of `b`, producing
/// two 64-bit products (see the unittest for the expected unsigned results).
__m128i _mm_mul_epu32(__m128i a, __m128i b) pure @safe
{
    __m128i zeros = _mm_setzero_si128();

    // Interleave elements 0 and 2 with zeros, so each 64-bit lane holds
    // one 32-bit input in its low half and 0 in its high half.
    long2 wideA = cast(long2) shufflevector!(int4, 0, 4, 2, 6)(a, zeros);
    long2 wideB = cast(long2) shufflevector!(int4, 0, 4, 2, 6)(b, zeros);

    static if (__VERSION__ >= 2076)
    {
        long2 product = wideA * wideB;
        return cast(__m128i)(product);
    }
    else
    {
        // long2 mul not supported before LDC 1.5
        wideA[0] = wideA[0] * wideB[0];
        wideA[1] = wideA[1] * wideB[1];
        return cast(__m128i)(wideA);
    }
}
unittest
{
    __m128i lhs = _mm_set_epi32(0, 0xDEADBEEF, 0, 0xffffffff);
    __m128i rhs = _mm_set_epi32(0, 0xCAFEBABE, 0, 0xffffffff);
    long2 products = cast(long2) _mm_mul_epu32(lhs, rhs);
    assert(products.array[0] == 18446744065119617025uL);
    assert(products.array[1] == 12723420444339690338uL);
}
808 
809 
/// Multiplies both doubles of `a` by the matching doubles of `b`.
__m128d _mm_mul_pd(__m128d a, __m128d b) pure @safe
{
    __m128d product = a * b;
    return product;
}
unittest
{
    __m128d v = [-2.0, 1.5];
    __m128d squared = _mm_mul_pd(v, v);
    assert(squared.array == [4.0, 2.25]);
}
820 
/// Multiplies the lower double of `a` by the lower double of `b`;
/// the upper double of `a` is returned unchanged.
__m128d _mm_mul_sd(__m128d a, __m128d b) pure @safe
{
    a[0] = a[0] * b[0];
    return a;
}
unittest
{
    __m128d v = [-2.0, 1.5];
    __m128d result = _mm_mul_sd(v, v);
    assert(result.array == [4.0, 1.5]);
}
832 
833 
834 // MMXREG: _mm_mul_su32
835 
version(LDC)
{
    // "mulhi" intrinsics, each mapped onto the matching LDC builtin.
    alias _mm_mulhi_epi16 = __builtin_ia32_pmulhw128;
    alias _mm_mulhi_epu16 = __builtin_ia32_pmulhuw128;
}
841 // TODO
842 
/// Lane-wise product of `a` and `b`, both reinterpreted as eight 16-bit integers.
/// Marked `pure @safe` like the other arithmetic intrinsics in this module, so it
/// can be called from `pure` / `@safe` code.
__m128i _mm_mullo_epi16 (__m128i a, __m128i b) pure @safe
{
    return cast(__m128i)(cast(short8)a * cast(short8)b);
}
847 
/// Bitwise OR of `a` and `b`, performed on their integer reinterpretation.
__m128d _mm_or_pd (__m128d a, __m128d b) pure @safe
{
    __m128i bitsA = cast(__m128i) a;
    __m128i bitsB = cast(__m128i) b;
    return cast(__m128d)(bitsA | bitsB);
}
852 
/// Bitwise OR of the 128 bits of `a` and `b`.
__m128i _mm_or_si128 (__m128i a, __m128i b) pure @safe
{
    __m128i combined = a | b;
    return combined;
}
857 
version(LDC)
{
    // "packs" intrinsics, each mapped onto the matching LDC builtin.
    alias _mm_packs_epi32 = __builtin_ia32_packssdw128;
    alias _mm_packs_epi16 = __builtin_ia32_packsswb128;
}
version(LDC)
{
    // LDC: forward directly to the compiler builtin.
    alias _mm_packus_epi16 = __builtin_ia32_packuswb128;
}
else
{
    /// Packs the sixteen signed 16-bit lanes of `a` and `b` into sixteen bytes,
    /// clamping every lane to 0 .. 255. Lanes of `a` fill bytes 0-7 and lanes
    /// of `b` fill bytes 8-15.
    __m128i _mm_packus_epi16 (__m128i a, __m128i b) pure
    {
        short8 lowSource  = cast(short8)a;
        short8 highSource = cast(short8)b;
        ubyte[16] packedBytes = void; // every byte is written by the loop below
        for (int lane = 0; lane < 8; ++lane)
        {
            short fromA = lowSource[lane];
            if (fromA < 0) fromA = 0;
            else if (fromA > 255) fromA = 255;
            packedBytes[lane] = cast(ubyte)fromA;

            short fromB = highSource[lane];
            if (fromB < 0) fromB = 0;
            else if (fromB > 255) fromB = 255;
            packedBytes[lane+8] = cast(ubyte)fromB;
        }
        return cast(__m128i) loadUnaligned!(byte16)(cast(byte*)packedBytes.ptr);
    }
}
unittest
{
    __m128i input = _mm_setr_epi16(-10, 400, 0, 256, 255, 2, 1, 0);
    byte16 packedBytes = cast(byte16) _mm_packus_epi16(input, input);
    static immutable ubyte[16] expected = [0, 255, 0, 255, 255, 2, 1, 0,
                                           0, 255, 0, 255, 255, 2, 1, 0];
    foreach(idx; 0..16)
        assert(packedBytes[idx] == cast(byte)(expected[idx]));
}
898 
899 // TODO
version(LDC)
{
    // Only available on LDC, where it maps onto the compiler builtin.
    alias _mm_pause = __builtin_ia32_pause;
}
904 // TODO
905 
version(LDC)
{
    // Only available on LDC, where it maps onto the compiler builtin.
    alias _mm_sad_epu8 = __builtin_ia32_psadbw128;
}
910 // TODO
911 
/// Builds a vector of eight 16-bit integers. Arguments are given from the highest
/// element to the lowest: `e0` ends up in element 0 and `e7` in element 7.
__m128i _mm_set_epi16 (short e7, short e6, short e5, short e4, short e3, short e2, short e1, short e0) pure @trusted
{
    short[8] lanes = [e0, e1, e2, e3, e4, e5, e6, e7];
    short8 loaded = loadUnaligned!(short8)(lanes.ptr);
    return cast(__m128i) loaded;
}
unittest
{
    short8 ramp = cast(short8) _mm_set_epi16(7, 6, 5, 4, 3, 2, 1, 0);
    foreach(idx; 0..8)
        assert(ramp.array[idx] == idx);
}
924 
/// Builds a vector of four 32-bit integers. Arguments are given from the highest
/// element to the lowest: `e0` ends up in element 0 and `e3` in element 3.
__m128i _mm_set_epi32 (int e3, int e2, int e1, int e0) pure @trusted
{
    int[4] lanes = [e0, e1, e2, e3];
    int4 loaded = loadUnaligned!(int4)(lanes.ptr);
    return loaded;
}
unittest
{
    __m128i ramp = _mm_set_epi32(3, 2, 1, 0);
    foreach(idx; 0..4)
        assert(ramp.array[idx] == idx);
}
936 
/// Builds a vector of two 64-bit integers: `e0` in the lower half, `e1` in the upper half.
__m128i _mm_set_epi64x (long e1, long e0) pure @trusted
{
    long[2] lanes = [e0, e1];
    long2 loaded = loadUnaligned!(long2)(lanes.ptr);
    return cast(__m128i)( loaded );
}
unittest
{
    long2 pair = cast(long2) _mm_set_epi64x(1234, 5678);
    assert(pair.array[0] == 5678);
    assert(pair.array[1] == 1234);
}
949 
/// Builds a vector of sixteen 8-bit integers. Arguments are given from the highest
/// element to the lowest: `e0` ends up in element 0 and `e15` in element 15.
__m128i _mm_set_epi8 (byte e15, byte e14, byte e13, byte e12,
                      byte e11, byte e10, byte e9, byte e8,
                      byte e7, byte e6, byte e5, byte e4,
                      byte e3, byte e2, byte e1, byte e0) pure @trusted
{
    byte[16] lanes = [e0, e1,  e2,  e3,  e4,  e5,  e6, e7,
                      e8, e9, e10, e11, e12, e13, e14, e15];
    byte16 loaded = loadUnaligned!(byte16)(lanes.ptr);
    return cast(__m128i)( loaded );
}
959 
/// Builds a vector of two doubles: `e0` in element 0, `e1` in element 1.
__m128d _mm_set_pd (double e1, double e0) pure @trusted
{
    double[2] lanes = [e0, e1];
    double2 loaded = loadUnaligned!(double2)(lanes.ptr);
    return loaded;
}
965 
/// Builds a vector with `a` in both elements.
__m128d _mm_set_pd1 (double a) pure @trusted
{
    double[2] lanes = [a, a];
    double2 loaded = loadUnaligned!(double2)(lanes.ptr);
    return loaded;
}
971 
/// Builds a vector with `a` in element 0 and zero in element 1.
__m128d _mm_set_sd (double a) pure @trusted
{
    double[2] lanes = [a, 0];
    double2 loaded = loadUnaligned!(double2)(lanes.ptr);
    return loaded;
}
977 
/// Builds a vector with `a` in all eight 16-bit elements.
__m128i _mm_set1_epi16 (short a) pure @trusted
{
    short[8] lanes = [a, a, a, a, a, a, a, a];
    short8 loaded = loadUnaligned!(short8)(lanes.ptr);
    return cast(__m128i)( loaded );
}
983 
/// Builds a vector with `a` in all four 32-bit elements.
__m128i _mm_set1_epi32 (int a) pure @trusted
{
    int[4] lanes = [a, a, a, a];
    int4 loaded = loadUnaligned!(int4)(lanes.ptr);
    return loaded;
}
unittest
{
    // Masking away the top bit of -1.0f must give 1.0f in every lane.
    __m128 minusOnes = _mm_set1_ps(-1.0f);
    __m128 clearTopBit = cast(__m128) _mm_set1_epi32(0x7fffffff);
    assert(_mm_and_ps(minusOnes, clearTopBit).array == [1.0f, 1, 1, 1]);
}
995 
/// Builds a vector with `a` in both 64-bit elements.
__m128i _mm_set1_epi64x (long a) pure @trusted
{
    long[2] lanes = [a, a];
    long2 loaded = loadUnaligned!(long2)(lanes.ptr);
    return cast(__m128i)( loaded );
}
1001 
/// Builds a vector with `a` in all sixteen 8-bit elements.
__m128i _mm_set1_epi8 (char a) pure @trusted
{
    byte[16] lanes = [a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a];
    byte16 loaded = loadUnaligned!(byte16)(lanes.ptr);
    return cast(__m128i)( loaded );
}
1007 
/// Alternate spelling of `_mm_set_pd1`.
alias _mm_set1_pd = _mm_set_pd1;
1009 
/// Builds a vector of eight 16-bit integers in argument order: the first argument
/// ends up in element 0 and the last in element 7.
/// Note that the parameter names count down, so `e7` is the one stored in element 0.
__m128i _mm_setr_epi16 (short e7, short e6, short e5, short e4, short e3, short e2, short e1, short e0) pure @trusted
{
    short[8] lanes = [e7, e6, e5, e4, e3, e2, e1, e0];
    short8 loaded = loadUnaligned!(short8)(lanes.ptr);
    return cast(__m128i)( loaded );
}
1015 
/// Builds a vector of four 32-bit integers in argument order: the first argument
/// ends up in element 0 and the last in element 3.
/// Note that the parameter names count down, so `e3` is the one stored in element 0.
__m128i _mm_setr_epi32 (int e3, int e2, int e1, int e0) pure @trusted
{
    int[4] lanes = [e3, e2, e1, e0];
    int4 loaded = loadUnaligned!(int4)(lanes.ptr);
    return cast(__m128i)( loaded );
}
1021 
/// Builds a vector of two 64-bit integers in argument order: the first argument
/// ends up in the lower half and the second in the upper half.
__m128i _mm_setr_epi64 (long e1, long e0) pure @trusted
{
    long[2] lanes = [e1, e0];
    long2 loaded = loadUnaligned!(long2)(lanes.ptr);
    return cast(__m128i)( loaded );
}
1027 
/// Builds a vector of sixteen 8-bit integers in argument order: the first argument
/// ends up in element 0 and the last in element 15.
/// Note that the parameter names count down, so `e15` is the one stored in element 0.
__m128i _mm_setr_epi8 (char e15, char e14, char e13, char e12,
                       char e11, char e10, char e9, char e8,
                       char e7, char e6, char e5, char e4,
                       char e3, char e2, char e1, char e0) pure @trusted
{
    byte[16] lanes = [e15, e14, e13, e12, e11, e10, e9, e8,
                      e7,  e6,  e5,  e4,  e3,  e2, e1, e0];
    byte16 loaded = loadUnaligned!(byte16)(lanes.ptr);
    return cast(__m128i)( loaded );
}
1037 
/// Builds a vector of two doubles in argument order: the first argument ends up
/// in element 0 and the second in element 1.
__m128d _mm_setr_pd (double e1, double e0) pure @trusted
{
    double[2] lanes = [e1, e0];
    double2 loaded = loadUnaligned!(double2)(lanes.ptr);
    return loaded;
}
1043 
/// Returns a vector of two doubles, both 0.0.
__m128d _mm_setzero_pd () pure @trusted
{
    double[2] zeros = [0.0, 0.0];
    double2 loaded = loadUnaligned!(double2)(zeros.ptr);
    return loaded;
}
1049 
/// Returns an integer vector with all four 32-bit elements set to 0.
__m128i _mm_setzero_si128() pure @trusted
{
    int[4] zeros = [0, 0, 0, 0];
    int4 loaded = loadUnaligned!(int4)(zeros.ptr);
    return cast(__m128i)( loaded );
}
1055 
/// Rearranges the four 32-bit elements of `a`. Each 2-bit field of `imm8`
/// (starting from the lowest bits) selects the source element for result
/// elements 0, 1, 2 and 3 respectively.
__m128i _mm_shuffle_epi32(int imm8)(__m128i a) pure @safe
{
    enum int sel0 = (imm8 >> 0) & 3;
    enum int sel1 = (imm8 >> 2) & 3;
    enum int sel2 = (imm8 >> 4) & 3;
    enum int sel3 = (imm8 >> 6) & 3;
    return shufflevector!(int4, sel0, sel1, sel2, sel3)(a, a);
}
unittest
{
    __m128i input = _mm_setr_epi32(0, 1, 2, 3);
    enum int REVERSE = _MM_SHUFFLE(0, 1, 2, 3);
    int4 reversed = cast(int4) _mm_shuffle_epi32!REVERSE(input);
    int[4] expected = [ 3, 2, 1, 0 ];
    assert(reversed.array == expected);
}
1071 
/// Two-operand shuffle, matching the Intel signature: bit 0 of `imm8` picks which
/// double of `a` goes to element 0, bit 1 picks which double of `b` goes to element 1.
__m128d _mm_shuffle_pd (int imm8)(__m128d a, __m128d b) pure @safe
{
    return shufflevector!(double2, 0 + ( imm8 & 1 ),
                                   2 + ( (imm8 >> 1) & 1 ))(a, b);
}

/// Single-operand form kept for existing callers; behaves like the two-operand
/// shuffle with `a` used for both inputs.
__m128d _mm_shuffle_pd (int imm8)(__m128d a) pure @safe
{
    return shufflevector!(double2, 0 + ( imm8 & 1 ),
                                   2 + ( (imm8 >> 1) & 1 ))(a, a);
}
unittest
{
    __m128d A = _mm_setr_pd(0.5f, 2.0f);
    enum int SHUFFLE = _MM_SHUFFLE2(1, 1);
    __m128d B = _mm_shuffle_pd!SHUFFLE(A);
    double[2] expectedB = [ 2.0f, 2.0f ];
    assert(B.array == expectedB);
}
unittest
{
    __m128d A = _mm_setr_pd(0.5, 2.0);
    __m128d B = _mm_setr_pd(4.0, 8.0);
    enum int SHUFFLE = _MM_SHUFFLE2(1, 0); // element 0 <- A[0], element 1 <- B[1]
    __m128d C = _mm_shuffle_pd!SHUFFLE(A, B);
    double[2] expectedC = [ 0.5, 8.0 ];
    assert(C.array == expectedC);
}
1085 
/// Rearranges the upper four 16-bit elements of `a` according to the 2-bit
/// fields of `imm8`; the lower four elements are passed through unchanged.
__m128i _mm_shufflehi_epi16(int imm8)(__m128i a) pure @safe
{
    // Selectors are relative to element 4, the first element of the upper half.
    enum int sel4 = 4 + ( (imm8 >> 0) & 3 );
    enum int sel5 = 4 + ( (imm8 >> 2) & 3 );
    enum int sel6 = 4 + ( (imm8 >> 4) & 3 );
    enum int sel7 = 4 + ( (imm8 >> 6) & 3 );
    short8 lanes = cast(short8) a;
    return cast(__m128i) shufflevector!(short8, 0, 1, 2, 3,
                                        sel4, sel5, sel6, sel7)(lanes, lanes);
}
unittest
{
    __m128i input = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
    enum int REVERSE = _MM_SHUFFLE(0, 1, 2, 3);
    short8 shuffled = cast(short8) _mm_shufflehi_epi16!REVERSE(input);
    short[8] expected = [ 0, 1, 2, 3, 7, 6, 5, 4 ];
    assert(shuffled.array == expected);
}
1102 
/// Rearranges the lower four 16-bit elements of `a` according to the 2-bit
/// fields of `imm8`; the upper four elements are passed through unchanged.
__m128i _mm_shufflelo_epi16(int imm8)(__m128i a) pure @safe
{
    enum int sel0 = ( (imm8 >> 0) & 3 );
    enum int sel1 = ( (imm8 >> 2) & 3 );
    enum int sel2 = ( (imm8 >> 4) & 3 );
    enum int sel3 = ( (imm8 >> 6) & 3 );
    short8 lanes = cast(short8) a;
    return cast(__m128i) shufflevector!(short8, sel0, sel1, sel2, sel3,
                                        4, 5, 6, 7)(lanes, lanes);
}
unittest
{
    __m128i input = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
    enum int REVERSE = _MM_SHUFFLE(0, 1, 2, 3);
    short8 shuffled = cast(short8) _mm_shufflelo_epi16!REVERSE(input);
    short[8] expected = [ 3, 2, 1, 0, 4, 5, 6, 7 ];
    assert(shuffled.array == expected);
}
1118 
version(LDC)
{
    // Left shifts, each mapped onto the matching LDC builtin.
    alias _mm_sll_epi16 = __builtin_ia32_psllw128;
    alias _mm_sll_epi32 = __builtin_ia32_pslld128;
    alias _mm_sll_epi64 = __builtin_ia32_psllq128;

    // "slli" variants of the same shifts.
    alias _mm_slli_epi16 = __builtin_ia32_psllwi128;
    alias _mm_slli_epi32 = __builtin_ia32_pslldi128;
    alias _mm_slli_epi64 = __builtin_ia32_psllqi128;
}
// TODO: provide implementations of the shift intrinsics aliased above for compilers other than LDC
1129 
/// Shifts the whole 128-bit value `op` left by `imm8` bytes, filling the
/// vacated low bytes with zeros. Any `imm8` with a bit set in the upper
/// nibble (i.e. 16 or more) yields an all-zero result.
__m128i _mm_slli_si128(ubyte imm8)(__m128i op) pure @safe
{
    static if (imm8 & 0xF0)
        return _mm_setzero_si128();
    else
        // Operands are (zero, op): mask indices below 16 select a zero byte,
        // indices 16..31 select bytes of `op`. The shuffle works on byte16,
        // so both operands and the result need explicit casts.
        return cast(__m128i) shufflevector!(byte16,
        16 - imm8, 17 - imm8, 18 - imm8, 19 - imm8, 20 - imm8, 21 - imm8, 22 - imm8, 23 - imm8,
        24 - imm8, 25 - imm8, 26 - imm8, 27 - imm8, 28 - imm8, 29 - imm8, 30 - imm8, 31 - imm8)
        (cast(byte16) _mm_setzero_si128(), cast(byte16) op);
}
1140 
version(LDC)
{
    static if (__VERSION__ >= 2081)
    {
        /// Square root of both double lanes of `vec`.
        __m128d _mm_sqrt_pd(__m128d vec) pure @safe
        {
            immutable double root0 = llvm_sqrt(vec.array[0]);
            immutable double root1 = llvm_sqrt(vec.array[1]);
            vec.array[0] = root0;
            vec.array[1] = root1;
            return vec;
        }
    }
    else
    {
        // The builtin disappeared with LDC 1.11, hence the version check.
        alias _mm_sqrt_pd = __builtin_ia32_sqrtpd;
    }
}
else
{
    /// Square root of both double lanes of `vec`.
    __m128d _mm_sqrt_pd(__m128d vec) pure @safe
    {
        import std.math: sqrt;
        immutable double root0 = sqrt(vec.array[0]);
        immutable double root1 = sqrt(vec.array[1]);
        vec.array[0] = root0;
        vec.array[1] = root1;
        return vec;
    }
}
1166 
1167 
// NOTE(review): Intel documents `_mm_sqrt_sd` with two operands; this module
// exposes a one-operand form (same shape as the aliased builtin below), which
// is kept here so existing callers are unaffected.
version(LDC)
{
    // Disappeared with LDC 1.11
    static if (__VERSION__ < 2081)
        alias _mm_sqrt_sd = __builtin_ia32_sqrtsd;
    else
    {
        /// Replaces lane 0 of `vec` by its square root; lane 1 is returned unchanged.
        __m128d _mm_sqrt_sd(__m128d vec) pure @safe
        {
            // Only the low lane is written; the upper lane already holds its value.
            vec.array[0] = llvm_sqrt(vec.array[0]);
            return vec;
        }
    }
}
else
{
    /// Replaces lane 0 of `vec` by its square root; lane 1 is returned unchanged.
    __m128d _mm_sqrt_sd(__m128d vec) pure @safe
    {
        import std.math: sqrt;
        // Only the low lane is written; the upper lane already holds its value.
        vec.array[0] = sqrt(vec.array[0]);
        return vec;
    }
}
1193 
1194 
version(LDC)
{
    // The right-shift intrinsics are plain aliases of the LDC builtins.

    // `_mm_sra_*` / `_mm_srai_*` families
    alias _mm_sra_epi16  = __builtin_ia32_psraw128;
    alias _mm_srai_epi16 = __builtin_ia32_psrawi128;
    alias _mm_sra_epi32  = __builtin_ia32_psrad128;
    alias _mm_srai_epi32 = __builtin_ia32_psradi128;

    // `_mm_srl_*` / `_mm_srli_*` families
    alias _mm_srl_epi16  = __builtin_ia32_psrlw128;
    alias _mm_srli_epi16 = __builtin_ia32_psrlwi128;
    alias _mm_srl_epi32  = __builtin_ia32_psrld128;
    alias _mm_srli_epi32 = __builtin_ia32_psrldi128;
    alias _mm_srl_epi64  = __builtin_ia32_psrlq128;
    alias _mm_srli_epi64 = __builtin_ia32_psrlqi128;
}
// TODO: provide implementations of the shift intrinsics aliased above for compilers other than LDC
1210 
/// Shifts the whole 128-bit value `op` right by `imm8` bytes, filling the
/// vacated high bytes with zeros. Any `imm8` with a bit set in the upper
/// nibble (i.e. 16 or more) yields an all-zero result.
__m128i _mm_srli_si128(ubyte imm8)(__m128i op) pure @safe
{
    static if ((imm8 & 0xF0) == 0)
    {
        // Operands are (op, zero): mask indices 0..15 select bytes of `op`,
        // indices 16 and above select a zero byte.
        byte16 source = cast(byte16) op;
        byte16 zeros = cast(byte16) _mm_setzero_si128();
        return cast(__m128i) shufflevector!(byte16,
            imm8+0, imm8+1, imm8+2,  imm8+3,  imm8+4,  imm8+5,  imm8+6,  imm8+7,
            imm8+8, imm8+9, imm8+10, imm8+11, imm8+12, imm8+13, imm8+14, imm8+15)
            (source, zeros);
    }
    else
        return _mm_setzero_si128();
}
1221 
// Note: this is a bonus intrinsic
/// Byte-wise right shift of a float vector: the value is reinterpreted as
/// `__m128i`, shifted by the integer overload, and reinterpreted back.
__m128 _mm_srli_si128(ubyte imm8)(__m128 op) @safe
{
    __m128i bits = cast(__m128i) op;
    __m128i shifted = _mm_srli_si128!imm8(bits);
    return cast(__m128) shifted;
}
unittest
{
    // sanity check: the int -> float vector cast is a bit reinterpretation
    __m128 ones = cast(__m128) _mm_set1_epi32(0x3F800000);
    float[4] wantedOnes = [1.0f, 1.0f, 1.0f, 1.0f];
    assert(ones.array == wantedOnes);

    // __m128i overload, 4-byte shift
    __m128i shiftedInts = _mm_srli_si128!4(_mm_set_epi32(4, 3, 2, 1));
    int[4] wantedInts = [2, 3, 4, 0];
    assert(shiftedInts.array == wantedInts);

    // __m128 overload, 8-byte shift
    __m128 shiftedFloats = _mm_srli_si128!8(_mm_set_ps(4.0f, 3.0f, 2.0f, 1.0f));
    float[4] wantedFloats = [3.0f, 4.0f, 0, 0];
    assert(shiftedFloats.array == wantedFloats);
}
1237 
/// Byte-wise right shift of a double vector: the value is reinterpreted as
/// `__m128i`, shifted by the integer overload, and reinterpreted back.
__m128d _mm_srli_si128(ubyte imm8)(__m128d op) pure @safe
{
    __m128i bits = cast(__m128i) op;
    __m128i shifted = _mm_srli_si128!imm8(bits);
    return cast(__m128d) shifted;
}
1242 
/// Stores both lanes of `a` at `mem_addr`.
/// The pointer is reinterpreted as `__m128d*` and written with a plain vector
/// assignment (presumably this requires suitable alignment - confirm).
void _mm_store_pd (double* mem_addr, __m128d a) pure
{
    *(cast(__m128d*) mem_addr) = a;
}
1248 
/// Stores lane 0 of `a` into both doubles at `mem_addr`.
/// The pointer is reinterpreted as `__m128d*` and written with a plain vector
/// assignment (presumably this requires suitable alignment - confirm).
void _mm_store_pd1 (double* mem_addr, __m128d a) pure
{
    __m128d lowTwice = shufflevector!(double2, 0, 0)(a, a);
    *(cast(__m128d*) mem_addr) = lowTwice;
}
1254 
/// Writes lane 0 of `a` to `*mem_addr`.
void _mm_store_sd (double* mem_addr, __m128d a) pure @safe
{
    double low = extractelement!(double2, 0)(a);
    *mem_addr = low;
}
1259 
/// Writes the whole integer vector `a` to `*mem_addr` by plain assignment.
void _mm_store_si128 (__m128i* mem_addr, __m128i a) pure @safe
{
    __m128i value = a;
    *mem_addr = value;
}
1264 
// `_mm_store1_pd` is another name for `_mm_store_pd1`.
alias _mm_store1_pd = _mm_store_pd1;
1266 
/// Writes lane 1 (the upper double) of `a` to `*mem_addr`.
void _mm_storeh_pd (double* mem_addr, __m128d a) pure @safe
{
    double high = extractelement!(double2, 1)(a);
    *mem_addr = high;
}
1271 
/// Writes the low 64-bit lane of `a` to the first 8 bytes at `mem_addr`.
void _mm_storel_epi64 (__m128i* mem_addr, __m128i a) pure @safe
{
    long2 halves = cast(long2) a;
    *(cast(long*) mem_addr) = extractelement!(long2, 0)(halves);
}
1277 
/// Writes lane 0 (the lower double) of `a` to `*mem_addr`.
void _mm_storel_pd (double* mem_addr, __m128d a) pure @safe
{
    double low = extractelement!(double2, 0)(a);
    *mem_addr = low;
}
1282 
/// Stores the two lanes of `a` at `mem_addr` in swapped order.
/// The pointer is reinterpreted as `__m128d*` and written with a plain vector
/// assignment (presumably this requires suitable alignment - confirm).
void _mm_storer_pd (double* mem_addr, __m128d a) pure
{
    __m128d swapped = shufflevector!(double2, 1, 0)(a, a);
    *(cast(__m128d*) mem_addr) = swapped;
}
1288 
/// Stores both lanes of `a` at `mem_addr` through the `storeUnaligned` helper.
void _mm_storeu_pd (double* mem_addr, __m128d a) pure @safe
{
    double* target = mem_addr;
    storeUnaligned!double2(a, target);
}
1293 
/// Stores the integer vector `a` at `mem_addr` through the `storeUnaligned` helper.
void _mm_storeu_si128 (__m128i* mem_addr, __m128i a) pure @safe
{
    // The helper is given the destination as a pointer to int.
    int* target = cast(int*) mem_addr;
    storeUnaligned!__m128i(a, target);
}
1298 
1299 // TODO: _mm_stream_pd
1300 // TODO: _mm_stream_si128
1301 // TODO: _mm_stream_si32
1302 // TODO: _mm_stream_si64
1303 
/// Lane-wise `a - b` on eight 16-bit integers.
__m128i _mm_sub_epi16(__m128i a, __m128i b) pure @safe
{
    short8 lhs = cast(short8) a;
    short8 rhs = cast(short8) b;
    return cast(__m128i)(lhs - rhs);
}
1308 
/// Lane-wise `a - b` on four 32-bit integers.
__m128i _mm_sub_epi32(__m128i a, __m128i b) pure @safe
{
    int4 lhs = cast(int4) a;
    int4 rhs = cast(int4) b;
    return cast(__m128i)(lhs - rhs);
}
1313 
/// Lane-wise `a - b` on two 64-bit integers.
__m128i _mm_sub_epi64(__m128i a, __m128i b) pure @safe
{
    long2 lhs = cast(long2) a;
    long2 rhs = cast(long2) b;
    return cast(__m128i)(lhs - rhs);
}
1318 
/// Lane-wise `a - b` on sixteen 8-bit integers.
__m128i _mm_sub_epi8(__m128i a, __m128i b) pure @safe
{
    byte16 lhs = cast(byte16) a;
    byte16 rhs = cast(byte16) b;
    return cast(__m128i)(lhs - rhs);
}
1323 
/// Lane-wise `a - b` on two doubles.
__m128d _mm_sub_pd(__m128d a, __m128d b) pure @safe
{
    __m128d difference = a - b;
    return difference;
}
1328 
/// Subtracts lane 0 of `b` from lane 0 of `a`; lane 1 of `a` is returned unchanged.
__m128d _mm_sub_sd(__m128d a, __m128d b) pure @safe
{
    a[0] = a[0] - b[0];
    return a;
}
unittest
{
    // Subtracting a vector from itself zeroes lane 0 only.
    __m128d v = [1.5, -2.0];
    __m128d difference = _mm_sub_sd(v, v);
    double[2] wanted = [0.0, -2.0];
    assert(difference.array == wanted);
}
1340 
1341 
1342 // MMXREG: _mm_sub_si64
1343 
version(LDC)
{
    // These intrinsics are plain aliases of the LDC builtins.

    // `_mm_subs_*` family
    alias _mm_subs_epi8   = __builtin_ia32_psubsb128;
    alias _mm_subs_epi16  = __builtin_ia32_psubsw128;
    alias _mm_subs_epu8   = __builtin_ia32_psubusb128;
    alias _mm_subs_epu16  = __builtin_ia32_psubusw128;

    // `_mm_ucomi*_sd` family
    alias _mm_ucomieq_sd  = __builtin_ia32_ucomisdeq;
    alias _mm_ucomineq_sd = __builtin_ia32_ucomisdneq;
    alias _mm_ucomilt_sd  = __builtin_ia32_ucomisdlt;
    alias _mm_ucomile_sd  = __builtin_ia32_ucomisdle;
    alias _mm_ucomigt_sd  = __builtin_ia32_ucomisdgt;
    alias _mm_ucomige_sd  = __builtin_ia32_ucomisdge;
}
// TODO: provide implementations of the intrinsics aliased above for compilers other than LDC
1359 
/// Returns a double vector whose contents are deliberately left uninitialized.
__m128d _mm_undefined_pd() pure @safe
{
    __m128d unspecified = void; // `= void` skips default initialization
    return unspecified;
}
/// Returns an integer vector whose contents are deliberately left uninitialized.
__m128i _mm_undefined_si128() pure @safe
{
    __m128i unspecified = void; // `= void` skips default initialization
    return unspecified;
}
1370 
/// Interleaves the upper four 16-bit lanes of `a` and `b`
/// (a4, b4, a5, b5, a6, b6, a7, b7).
__m128i _mm_unpackhi_epi16 (__m128i a, __m128i b) pure @safe
{
    short8 first = cast(short8) a;
    short8 second = cast(short8) b;
    return cast(__m128i) shufflevector!(short8, 4, 12, 5, 13, 6, 14, 7, 15)(first, second);
}
1376 
/// Interleaves the upper two 32-bit lanes of `a` and `b` (a2, b2, a3, b3).
__m128i _mm_unpackhi_epi32 (__m128i a, __m128i b) pure @safe
{
    int4 first = cast(int4) a;
    int4 second = cast(int4) b;
    return shufflevector!(int4, 2, 6, 3, 7)(first, second);
}
1381 
/// Combines the upper 64-bit lanes of `a` and `b` (a1, b1).
__m128i _mm_unpackhi_epi64 (__m128i a, __m128i b) pure @safe
{
    long2 first = cast(long2) a;
    long2 second = cast(long2) b;
    return cast(__m128i) shufflevector!(long2, 1, 3)(first, second);
}
1386 
/// Interleaves the upper eight bytes of `a` and `b` (a8, b8, a9, b9, ...).
__m128i _mm_unpackhi_epi8 (__m128i a, __m128i b) pure @safe
{
    byte16 first = cast(byte16) a;
    byte16 second = cast(byte16) b;
    return cast(__m128i) shufflevector!(byte16,
                                         8, 24,  9, 25, 10, 26, 11, 27,
                                        12, 28, 13, 29, 14, 30, 15, 31)(first, second);
}
1393 
/// Combines the upper double lanes of `a` and `b` (a1, b1).
__m128d _mm_unpackhi_pd (__m128d a, __m128d b) pure @safe
{
    __m128d highs = shufflevector!(__m128d, 1, 3)(a, b);
    return highs;
}
1398 
/// Interleaves the lower four 16-bit lanes of `a` and `b`
/// (a0, b0, a1, b1, a2, b2, a3, b3).
__m128i _mm_unpacklo_epi16 (__m128i a, __m128i b) pure @safe
{
    short8 first = cast(short8) a;
    short8 second = cast(short8) b;
    return cast(__m128i) shufflevector!(short8, 0, 8, 1, 9, 2, 10, 3, 11)(first, second);
}
1404 
/// Interleaves the lower two 32-bit lanes of `a` and `b`: the result is
/// (a0, b0, a1, b1).
__m128i _mm_unpacklo_epi32 (__m128i a, __m128i b) pure @safe
{
    // Mask indices 0..3 address `a`, 4..7 address `b`; the last lane must be
    // index 5 (b1) - index 6 would wrongly pick b2.
    return shufflevector!(int4, 0, 4, 1, 5)
                         (cast(int4)a, cast(int4)b);
}
1410 
/// Combines the lower 64-bit lanes of `a` and `b` (a0, b0).
__m128i _mm_unpacklo_epi64 (__m128i a, __m128i b) pure @safe
{
    long2 first = cast(long2) a;
    long2 second = cast(long2) b;
    return cast(__m128i) shufflevector!(long2, 0, 2)(first, second);
}
1416 
/// Interleaves the lower eight bytes of `a` and `b` (a0, b0, a1, b1, ...).
__m128i _mm_unpacklo_epi8 (__m128i a, __m128i b) pure @safe
{
    byte16 first = cast(byte16) a;
    byte16 second = cast(byte16) b;
    return cast(__m128i) shufflevector!(byte16,
                                        0, 16, 1, 17, 2, 18, 3, 19,
                                        4, 20, 5, 21, 6, 22, 7, 23)(first, second);
}
1423 
/// Combines the lower double lanes of `a` and `b` (a0, b0).
__m128d _mm_unpacklo_pd (__m128d a, __m128d b) pure @safe
{
    __m128d lows = shufflevector!(__m128d, 0, 2)(a, b);
    return lows;
}
1428 
/// Bitwise XOR of two double vectors, performed on their integer reinterpretation.
__m128d _mm_xor_pd (__m128d a, __m128d b) pure @safe
{
    __m128i lhsBits = cast(__m128i) a;
    __m128i rhsBits = cast(__m128i) b;
    return cast(__m128d)(lhsBits ^ rhsBits);
}
1433 
/// Bitwise XOR of two integer vectors.
__m128i _mm_xor_si128 (__m128i a, __m128i b) pure @safe
{
    __m128i mixed = a ^ b;
    return mixed;
}
1438 
unittest
{
    // Euclidean distance between two points in 4D
    float distance(float[4] a, float[4] b) nothrow @nogc
    {
        __m128 delta = _mm_sub_ps(_mm_loadu_ps(a.ptr), _mm_loadu_ps(b.ptr));
        __m128 squares = _mm_mul_ps(delta, delta);

        // Horizontal sum: add the upper half onto the lower half,
        // then add lane 1 onto lane 0.
        __m128 folded = _mm_add_ps(squares, _mm_srli_si128!8(squares));
        folded = _mm_add_ps(folded, _mm_srli_si128!4(folded));

        return _mm_cvtss_f32(_mm_sqrt_ss(folded));
    }
    assert(distance([0, 2, 0, 0], [0, 0, 0, 0]) == 2);
}