1 /**
2 * Copyright: Copyright Auburn Sounds 2016-2018.
3 * License:   $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
4 * Authors:   Guillaume Piolat
5 */
6 
7 module inteli.xmmintrin;
8 
9 public import inteli.types;
10 
11 import inteli.internals;
12 
13 // SSE1
// Note: intrinsics marked MMXREG operate on MMX registers and were not
// translated. They correspond to instructions that were introduced with
// SSE1 but also work on MMX registers.
17 
18 nothrow @nogc:
19 
// Element-wise sum of the four single-precision lanes of `a` and `b`.
__m128 _mm_add_ps(__m128 a, __m128 b) pure @safe
{
    __m128 sum = a + b;
    return sum;
}

unittest
{
    __m128 v = [1, 2, 3, 4];
    v = _mm_add_ps(v, v);
    float[4] expected = [2.0f, 4.0f, 6.0f, 8.0f];
    assert(v.array == expected);
}
34 
// Adds lane 0 of `b` to lane 0 of `a`; lanes 1..3 of `a` are returned untouched.
__m128 _mm_add_ss(__m128 a, __m128 b) pure @safe
{
    a[0] = a[0] + b[0];
    return a;
}
unittest
{
    __m128 v = [1, 2, 3, 4];
    v = _mm_add_ss(v, v);
    float[4] expected = [2.0f, 2.0f, 3.0f, 4.0f];
    assert(v.array == expected);
}
46 
// Bitwise AND of two float vectors, performed on their integer
// reinterpretation and cast back to `__m128`.
__m128 _mm_and_ps (__m128 a, __m128 b) pure @safe
{
    __m128i bitsA = cast(__m128i)a;
    __m128i bitsB = cast(__m128i)b;
    return cast(__m128)(bitsA & bitsB);
}
unittest
{
    // Covered by the tests in emmintrin.d.
}
55 
// Bitwise (NOT a) AND b on float vectors.
// The operation is carried out on the integer reinterpretation of the
// operands, mirroring how `_mm_and_ps` and `_mm_or_ps` are written.
__m128 _mm_andnot_ps (__m128 a, __m128 b) pure @safe
{
    return cast(__m128)((~cast(__m128i)a) & cast(__m128i)b);
}

// Integer-vector overload, kept so that existing callers passing
// `__m128i` operands keep compiling and get the same result as before.
__m128i _mm_andnot_ps (__m128i a, __m128i b) pure @safe
{
    return (~a) & b;
}
60 
61 
62 // MMXREG: _mm_avg_pu16
63 // MMXREG: _mm_avg_pu8
64 
// Binding for the packed single-precision compare intrinsic. The third
// argument is the comparison predicate immediate.
version(LDC)
{
    pragma(LDC_intrinsic, "llvm.x86.sse.cmp.ps")
        __m128 __builtin_ia32_cmpps(__m128, __m128, byte) pure @safe;
}
// TODO: no `__builtin_ia32_cmpps` equivalent for non-LDC compilers yet.
78 
version(LDC)
{
    // Packed (_ps) and scalar (_ss) comparisons built on the cmpps/cmpss
    // builtins. The immediate selects the predicate:
    //   0 = EQ, 1 = LT, 2 = LE, 3 = UNORD, 4 = NEQ, 5 = NLT, 6 = NLE, 7 = ORD
    // There is no GT/GE encoding, so those predicates are obtained by
    // swapping the operands of LT/LE.
    //
    // For the scalar forms only lane 0 is compared and lanes 1..3 of the
    // result must be copied from `a`. The builtin takes them from its first
    // operand, so the operand-swapped scalar variants merge lane 0 of the
    // comparison back into `a` with `_mm_move_ss`.

    // a == b, per lane.
    __m128 _mm_cmpeq_ps (__m128 a, __m128 b) pure @safe
    {
        return __builtin_ia32_cmpps(a, b, 0); // CMPEQPS
    }

    // a[0] == b[0]; upper lanes from `a`.
    __m128 _mm_cmpeq_ss (__m128 a, __m128 b) pure @safe
    {
        return __builtin_ia32_cmpss(a, b, 0); // CMPEQSS
    }

    // a >= b, per lane.
    __m128 _mm_cmpge_ps (__m128 a, __m128 b) pure @safe
    {
        return __builtin_ia32_cmpps(b, a, 2); // CMPLEPS reversed
    }

    // a[0] >= b[0]; upper lanes from `a`.
    __m128 _mm_cmpge_ss (__m128 a, __m128 b) pure @safe
    {
        // CMPLESS reversed; restore the upper lanes of `a`.
        return _mm_move_ss(a, __builtin_ia32_cmpss(b, a, 2));
    }

    // a > b, per lane.
    __m128 _mm_cmpgt_ps (__m128 a, __m128 b) pure @safe
    {
        return __builtin_ia32_cmpps(b, a, 1); // CMPLTPS reversed
    }

    // a[0] > b[0]; upper lanes from `a`.
    __m128 _mm_cmpgt_ss (__m128 a, __m128 b) pure @safe
    {
        // CMPLTSS reversed; restore the upper lanes of `a`.
        return _mm_move_ss(a, __builtin_ia32_cmpss(b, a, 1));
    }

    // a <= b, per lane.
    __m128 _mm_cmple_ps (__m128 a, __m128 b) pure @safe
    {
        return __builtin_ia32_cmpps(a, b, 2); // CMPLEPS
    }

    // a[0] <= b[0]; upper lanes from `a`.
    __m128 _mm_cmple_ss (__m128 a, __m128 b) pure @safe
    {
        return __builtin_ia32_cmpss(a, b, 2); // CMPLESS
    }

    // a < b, per lane.
    __m128 _mm_cmplt_ps (__m128 a, __m128 b) pure @safe
    {
        return __builtin_ia32_cmpps(a, b, 1); // CMPLTPS
    }

    // a[0] < b[0]; upper lanes from `a`.
    __m128 _mm_cmplt_ss (__m128 a, __m128 b) pure @safe
    {
        return __builtin_ia32_cmpss(a, b, 1); // CMPLTSS
    }

    // a != b, per lane.
    __m128 _mm_cmpneq_ps (__m128 a, __m128 b) pure @safe
    {
        return __builtin_ia32_cmpps(a, b, 4); // CMPNEQPS
    }

    // a[0] != b[0]; upper lanes from `a`.
    __m128 _mm_cmpneq_ss (__m128 a, __m128 b) pure @safe
    {
        return __builtin_ia32_cmpss(a, b, 4); // CMPNEQSS
    }

    // NOT (a >= b), per lane.
    __m128 _mm_cmpnge_ps (__m128 a, __m128 b) pure @safe
    {
        return __builtin_ia32_cmpps(b, a, 6); // CMPNLEPS reversed
    }

    // NOT (a[0] >= b[0]); upper lanes from `a`.
    __m128 _mm_cmpnge_ss (__m128 a, __m128 b) pure @safe
    {
        // CMPNLESS reversed; restore the upper lanes of `a`.
        return _mm_move_ss(a, __builtin_ia32_cmpss(b, a, 6));
    }

    // NOT (a > b), per lane.
    __m128 _mm_cmpngt_ps (__m128 a, __m128 b) pure @safe
    {
        return __builtin_ia32_cmpps(b, a, 5); // CMPNLTPS reversed
    }

    // NOT (a[0] > b[0]); upper lanes from `a`.
    __m128 _mm_cmpngt_ss (__m128 a, __m128 b) pure @safe
    {
        // CMPNLTSS reversed; restore the upper lanes of `a`.
        return _mm_move_ss(a, __builtin_ia32_cmpss(b, a, 5));
    }

    // NOT (a <= b), per lane.
    __m128 _mm_cmpnle_ps (__m128 a, __m128 b) pure @safe
    {
        return __builtin_ia32_cmpps(a, b, 6); // CMPNLEPS
    }

    // NOT (a[0] <= b[0]); upper lanes from `a`.
    __m128 _mm_cmpnle_ss (__m128 a, __m128 b) pure @safe
    {
        return __builtin_ia32_cmpss(a, b, 6); // CMPNLESS
    }

    // NOT (a < b), per lane.
    __m128 _mm_cmpnlt_ps (__m128 a, __m128 b) pure @safe
    {
        return __builtin_ia32_cmpps(a, b, 5); // CMPNLTPS
    }

    // NOT (a[0] < b[0]); upper lanes from `a`.
    __m128 _mm_cmpnlt_ss (__m128 a, __m128 b) pure @safe
    {
        return __builtin_ia32_cmpss(a, b, 5); // CMPNLTSS
    }

    // Ordered test (neither operand is NaN), per lane.
    __m128 _mm_cmpord_ps (__m128 a, __m128 b) pure @safe
    {
        return __builtin_ia32_cmpps(a, b, 7); // CMPORDPS
    }

    // Ordered test on lane 0; upper lanes from `a`.
    __m128 _mm_cmpord_ss (__m128 a, __m128 b) pure @safe
    {
        return __builtin_ia32_cmpss(a, b, 7); // CMPORDSS
    }

    // Unordered test (at least one operand is NaN), per lane.
    __m128 _mm_cmpunord_ps (__m128 a, __m128 b) pure @safe
    {
        return __builtin_ia32_cmpps(a, b, 3); // CMPUNORDPS
    }

    // Unordered test on lane 0; upper lanes from `a`.
    __m128 _mm_cmpunord_ss (__m128 a, __m128 b) pure @safe
    {
        return __builtin_ia32_cmpss(a, b, 3); // CMPUNORDSS
    }
}
else
{
    // TODO: comparison intrinsics are not implemented for non-LDC compilers.
}
205 
// Scalar "comi" comparisons. With LDC each intrinsic is a direct alias of
// the identically-purposed compiler builtin; no other compiler is
// supported yet.

version(LDC)
    alias _mm_comieq_ss = __builtin_ia32_comieq;
// TODO: non-LDC _mm_comieq_ss(__m128, __m128)

version(LDC)
    alias _mm_comige_ss = __builtin_ia32_comige;
// TODO: non-LDC _mm_comige_ss(__m128, __m128)

version(LDC)
    alias _mm_comigt_ss = __builtin_ia32_comigt;
// TODO: non-LDC _mm_comigt_ss(__m128, __m128)

version(LDC)
    alias _mm_comile_ss = __builtin_ia32_comile;
// TODO: non-LDC _mm_comile_ss(__m128, __m128)

version(LDC)
    alias _mm_comilt_ss = __builtin_ia32_comilt;
// TODO: non-LDC _mm_comilt_ss(__m128, __m128)

version(LDC)
    alias _mm_comineq_ss = __builtin_ia32_comineq;
// TODO: non-LDC _mm_comineq_ss(__m128, __m128)
298 
299 // MMXREG: __m128 _mm_cvt_pi2ps (__m128 a, __m64 b)
300 // MMXREG: __m64 _mm_cvt_ps2pi (__m128 a)
301 
302 
// Converts the signed 32-bit integer `x` to single precision and stores it
// in lane 0 of `v`; lanes 1..3 of `v` are returned unchanged.
// This is the same operation as `_mm_cvtsi32_ss`. It was previously bound to
// the SSE2 int-to-double intrinsic on `double2`, which is not what this
// SSE1 intrinsic does.
__m128 _mm_cvt_si2ss(__m128 v, int x) pure @safe
{
    v[0] = cast(float)x;
    return v;
}

version(LDC)
{
    // Legacy overload: earlier versions exposed this name with `double2`
    // operands (int -> double in lane 0). Kept so existing callers still
    // compile; new code should use the `__m128` overload above.
    double2 _mm_cvt_si2ss(double2 v, int x) pure @safe
    {
        v[0] = cast(double)x;
        return v;
    }
}
318 
// With LDC, `_mm_cvt_ss2si` is the `__builtin_ia32_cvtss2si` builtin.
version(LDC)
    alias _mm_cvt_ss2si = __builtin_ia32_cvtss2si;
// TODO: non-LDC `int _mm_cvt_ss2si(__m128 v)`
333 
334 // MMXREG: __m128 _mm_cvtpi16_ps (__m64 a)
335 // MMXREG: __m128 _mm_cvtpi32_ps (__m128 a, __m64 b)
336 // MMXREG: __m128 _mm_cvtpi32x2_ps (__m64 a, __m64 b)
337 // MMXREG: __m128 _mm_cvtpi8_ps (__m64 a)
338 // MMXREG: __m64 _mm_cvtps_pi16 (__m128 a)
339 // MMXREG: __m64 _mm_cvtps_pi32 (__m128 a)
340 // MMXREG: __m64 _mm_cvtps_pi8 (__m128 a)
341 // MMXREG: __m128 _mm_cvtpu16_ps (__m64 a)
342 // MMXREG: __m128 _mm_cvtpu8_ps (__m64 a)
343 
version(LDC)
{
    // This LLVM intrinsic still appears to be available, so bind to it directly.
    pragma(LDC_intrinsic, "llvm.x86.sse.cvtsi2ss")
        float4 _mm_cvtsi32_ss(float4, int) pure @safe;
}
else
{
    // Portable fallback: write `x`, converted to float, into lane 0 of `v`
    // and return the vector with its other lanes unchanged.
    __m128 _mm_cvtsi32_ss(__m128 v, int x) pure @safe
    {
        immutable float converted = cast(float)x;
        v[0] = converted;
        return v;
    }
}
unittest
{
    __m128 r = _mm_cvtsi32_ss(_mm_set1_ps(0.0f), 42);
    float[4] expected = [42.0f, 0.0f, 0.0f, 0.0f];
    assert(r.array == expected);
}
363 
// Writes the 64-bit integer `x`, converted to float, into lane 0 of `v`.
// Implemented in plain D on every compiler: on macOS the
// "llvm.x86.sse.cvtsi642ss" intrinsic was found to be buggy.
__m128 _mm_cvtsi64_ss(__m128 v, long x) pure @safe
{
    immutable float converted = cast(float)x;
    v[0] = converted;
    return v;
}
unittest
{
    __m128 r = _mm_cvtsi64_ss(_mm_set1_ps(0.0f), 42);
    float[4] expected = [42.0f, 0.0f, 0.0f, 0.0f];
    assert(r.array == expected);
}
375 
// Extracts lane 0 of `a` as a scalar float.
float _mm_cvtss_f32(__m128 a) pure @safe
{
    immutable float lowest = a[0];
    return lowest;
}
380 
// With LDC, `_mm_cvtss_si32` is the `__builtin_ia32_cvtss2si` builtin.
version(LDC)
    alias _mm_cvtss_si32 = __builtin_ia32_cvtss2si;
// TODO: non-LDC _mm_cvtss_si32

// With LDC, `_mm_cvtss_si64` is the `__builtin_ia32_cvtss2si64` builtin.
version(LDC)
    alias _mm_cvtss_si64 = __builtin_ia32_cvtss2si64;
// TODO: non-LDC _mm_cvtss_si64
398 
399 // MMXREG: __m64 _mm_cvtt_ps2pi (__m128 a)
400 
// With LDC, `_mm_cvtt_ss2si` is the `__builtin_ia32_cvttss2si` builtin.
version(LDC)
    alias _mm_cvtt_ss2si = __builtin_ia32_cvttss2si;

// `_mm_cvttss_si32` is a second name for the very same operation.
version(LDC)
    alias _mm_cvttss_si32 = _mm_cvtt_ss2si;
// TODO: non-LDC _mm_cvtt_ss2si / _mm_cvttss_si32
410 
411 // MMXREG: _mm_cvttps_pi32
412 
// With LDC, `_mm_cvttss_si64` is the `__builtin_ia32_cvttss2si64` builtin.
version(LDC)
    alias _mm_cvttss_si64 = __builtin_ia32_cvttss2si64;
// TODO: non-LDC _mm_cvttss_si64
421 
// Element-wise quotient `a / b` over the four single-precision lanes.
__m128 _mm_div_ps(__m128 a, __m128 b) pure @safe
{
    __m128 quotient = a / b;
    return quotient;
}
unittest
{
    __m128 v = [1.5f, -2.0f, 3.0f, 1.0f];
    v = _mm_div_ps(v, v);
    float[4] expected = [1.0f, 1.0f, 1.0f, 1.0f];
    assert(v.array == expected);
}
433 
// Divides lane 0 of `a` by lane 0 of `b`; lanes 1..3 of `a` pass through.
__m128 _mm_div_ss(__m128 a, __m128 b) pure @safe
{
    a[0] = a[0] / b[0];
    return a;
}
unittest
{
    __m128 v = [1.5f, -2.0f, 3.0f, 1.0f];
    v = _mm_div_ss(v, v);
    float[4] expected = [1.0f, -2.0, 3.0f, 1.0f];
    assert(v.array == expected);
}
446 
447 // MMXREG: int _mm_extract_pi16 (__m64 a, int imm8)
448 
449 // TODO: unsigned int _MM_GET_EXCEPTION_MASK ()
450 // TODO: unsigned int _MM_GET_EXCEPTION_STATE ()
451 // TODO: unsigned int _MM_GET_FLUSH_ZERO_MODE ()
452 // TODO: unsigned int _MM_GET_ROUNDING_MODE ()
453 // TODO: stmxcsr
454 // TODO: unsigned int _mm_getcsr (void)
455 
456 // MMXREG: __m64 _mm_insert_pi16 (__m64 a, int i, int imm8)
457 
// Loads four floats from `p` by reinterpreting it as a pointer to `__m128`.
// NOTE(review): presumably `p` must be suitably aligned for a vector load;
// nothing here checks it.
__m128 _mm_load_ps(const(float)*p) pure @trusted
{
    auto vecPtr = cast(__m128*)p;
    return *vecPtr;
}
462 
// Reads a single float from `p` and replicates it into all four lanes.
__m128 _mm_load_ps1(const(float)*p) pure @trusted
{
    float[4] lanes = [ *p, *p, *p, *p ];
    __m128 broadcast = loadUnaligned!(float4)(lanes.ptr);
    return broadcast;
}
468 
// Reads a single float from `mem_addr` into lane 0; lanes 1..3 are zero.
__m128 _mm_load_ss (const(float)* mem_addr) pure @trusted
{
    float[4] lanes = [ *mem_addr, 0.0f, 0.0f, 0.0f ];
    __m128 loaded = loadUnaligned!(float4)(lanes.ptr);
    return loaded;
}

// `_mm_load1_ps` is another name for `_mm_load_ps1`.
alias _mm_load1_ps = _mm_load_ps1;
476 
// Replaces the upper 64 bits of `a` with the 64-bit value at `mem_addr`;
// the lower 64 bits of `a` are preserved.
__m128 _mm_loadh_pi (__m128 a, const(__m64)* mem_addr) pure @safe
{
    long2 halves = cast(long2)a;
    halves[1] = *mem_addr;
    return cast(__m128)halves;
}
483 
// Replaces the lower 64 bits of `a` with the 64-bit value at `mem_addr`;
// the upper 64 bits of `a` are preserved.
__m128 _mm_loadl_pi (__m128 a, const(__m64)* mem_addr) pure @safe
{
    long2 halves = cast(long2)a;
    halves[0] = *mem_addr;
    return cast(__m128)halves;
}
490 
// Loads four floats through a `__m128*` view of `mem_addr` and returns them
// with the lane order reversed (lane 0 <- element 3, ..., lane 3 <- element 0).
__m128 _mm_loadr_ps (const(float)* mem_addr) pure @trusted
{
    __m128 forward = *cast(__m128*)mem_addr;
    return shufflevector!(__m128, 3, 2, 1, 0)(forward, forward);
}
497 
// Loads four floats from `p` through the `loadUnaligned` helper.
__m128 _mm_loadu_ps(float*p) pure @safe
{
    __m128 loaded = loadUnaligned!(__m128)(p);
    return loaded;
}
502 
503 // MMXREG: _mm_maskmove_si64
504 // MMXREG: _m_maskmovq
505 
506 // MMXREG: _mm_max_pi16
// Packed / scalar min and max. With LDC each intrinsic is an alias of the
// corresponding builtin; other compilers are not supported yet.

version(LDC)
    alias _mm_max_ps = __builtin_ia32_maxps;
// TODO: non-LDC _mm_max_ps

// MMXREG: _mm_max_pu8
version(LDC)
    alias _mm_max_ss = __builtin_ia32_maxss;
// TODO: non-LDC _mm_max_ss

// MMXREG: _mm_min_pi16
version(LDC)
    alias _mm_min_ps = __builtin_ia32_minps;
// TODO: non-LDC _mm_min_ps

// MMXREG: _mm_min_pu8

version(LDC)
    alias _mm_min_ss = __builtin_ia32_minss;
542 
// Returns `a` with its lane 0 replaced by lane 0 of `b`
// (shuffle index 4 selects the first lane of the second operand).
__m128 _mm_move_ss (__m128 a, __m128 b) pure @safe
{
    __m128 merged = shufflevector!(__m128, 4, 1, 2, 3)(a, b);
    return merged;
}
547 
// Moves the upper two lanes of `b` into the lower two lanes of the result;
// the upper two lanes of the result are the upper two lanes of `a`.
//   result = [ b[2], b[3], a[2], a[3] ]
// In the shuffle, indices 0..3 address `a` and 4..7 address `b`, so the
// selection is (6, 7, 2, 3). The previous (2, 3, 6, 7) produced
// [ a[2], a[3], b[2], b[3] ], i.e. the two halves were swapped.
__m128 _mm_movehl_ps (__m128 a, __m128 b) pure @safe
{
    return shufflevector!(float4, 6, 7, 2, 3)(a, b);
}
552 
// Result is [ a[0], a[1], b[0], b[1] ]: the lower two lanes of `a` followed
// by the lower two lanes of `b`.
__m128 _mm_movelh_ps (__m128 a, __m128 b) pure @safe
{
    __m128 combined = shufflevector!(float4, 0, 1, 4, 5)(a, b);
    return combined;
}
557 
558 // TODO: int _mm_movemask_pi8
// With LDC, `_mm_movemask_ps` is the `__builtin_ia32_movmskps` builtin.
version(LDC)
    alias _mm_movemask_ps = __builtin_ia32_movmskps;
563 
// Element-wise product of the four single-precision lanes of `a` and `b`.
__m128 _mm_mul_ps(__m128 a, __m128 b) pure @safe
{
    __m128 product = a * b;
    return product;
}
unittest
{
    __m128 v = [1.5f, -2.0f, 3.0f, 1.0f];
    v = _mm_mul_ps(v, v);
    float[4] expected = [2.25f, 4.0f, 9.0f, 1.0f];
    assert(v.array == expected);
}
575 
// Multiplies lane 0 of `a` by lane 0 of `b`; lanes 1..3 of `a` pass through.
__m128 _mm_mul_ss(__m128 a, __m128 b) pure @safe
{
    a[0] = a[0] * b[0];
    return a;
}
unittest
{
    __m128 v = [1.5f, -2.0f, 3.0f, 1.0f];
    v = _mm_mul_ss(v, v);
    float[4] expected = [2.25f, -2.0f, 3.0f, 1.0f];
    assert(v.array == expected);
}
588 
589 // MMXREG: _mm_mulhi_pu16
590 
// Bitwise OR of two float vectors, performed on their integer
// reinterpretation and cast back to `__m128`.
__m128 _mm_or_ps (__m128 a, __m128 b) pure @safe
{
    __m128i bitsA = cast(__m128i)a;
    __m128i bitsB = cast(__m128i)b;
    return cast(__m128)(bitsA | bitsB);
}
595 
596 // MMXREG: __m64 _m_pavgb (__m64 a, __m64 b)
597 // MMXREG: __m64 _m_pavgw (__m64 a, __m64 b)
598 // MMXREG: int _m_pextrw (__m64 a, int imm8)
599 // MMXREG: __m64 _m_pinsrw (__m64 a, int i, int imm8)
600 // MMXREG: __m64 _m_pmaxsw (__m64 a, __m64 b)
601 // MMXREG: __m64 _m_pmaxub (__m64 a, __m64 b)
602 // MMXREG: __m64 _m_pminsw (__m64 a, __m64 b)
603 // MMXREG: __m64 _m_pminub (__m64 a, __m64 b)
604 // MMXREG: int _m_pmovmskb (__m64 a)
605 
606 // MMXREG: __m64 _m_pmulhuw (__m64 a, __m64 b)
607 
// Locality hints for `_mm_prefetch`.
// The values follow Intel's <xmmintrin.h> (T0 = 3, T1 = 2, T2 = 1, NTA = 0).
// `_mm_prefetch` forwards the hint unchanged as the `locality` argument of
// `llvm_prefetch`, where 0 means "no temporal locality" and 3 means "keep in
// every cache level"; T0 and T2 were previously swapped.
enum _MM_HINT_T0 = 3;
enum _MM_HINT_T1 = 2;
enum _MM_HINT_T2 = 1;
enum _MM_HINT_NTA = 0;
612 
// Issues a prefetch for the memory at `p`.
// `locality` is a template argument because it has to be known at compile
// time; it is forwarded as-is to `llvm_prefetch`.
void _mm_prefetch(int locality)(void* p) pure @safe
{
    enum int rwArg = 0;        // second argument, passed as 0 exactly as before
    enum int cacheTypeArg = 1; // fourth argument, passed as 1 exactly as before
    llvm_prefetch(p, rwArg, locality, cacheTypeArg);
}
618 
619 // MMXREG: __m64 _m_psadbw (__m64 a, __m64 b)
620 // MMXREG: __m64 _m_pshufw (__m64 a, int imm8)
621 
// Reciprocal and reciprocal-square-root intrinsics. With LDC each one is an
// alias of the corresponding builtin; other compilers are not supported yet.

version(LDC)
    alias _mm_rcp_ps = __builtin_ia32_rcpps;
// TODO: non-LDC _mm_rcp_ps

version(LDC)
    alias _mm_rcp_ss = __builtin_ia32_rcpss;
// TODO: non-LDC _mm_rcp_ss

version(LDC)
    alias _mm_rsqrt_ps = __builtin_ia32_rsqrtps;
// TODO: non-LDC _mm_rsqrt_ps

version(LDC)
    alias _mm_rsqrt_ss = __builtin_ia32_rsqrtss;
// TODO: non-LDC _mm_rsqrt_ss
645 
646 // TODO: _mm_sad_pu8
647 // TODO: void _MM_SET_EXCEPTION_MASK (unsigned int a)
648 // TODO: void _MM_SET_EXCEPTION_STATE (unsigned int a)
649 // TODO: void _MM_SET_FLUSH_ZERO_MODE (unsigned int a)
650 
// Builds a vector from four scalars given highest lane first:
// lane 0 = e0, lane 1 = e1, lane 2 = e2, lane 3 = e3.
__m128 _mm_set_ps (float e3, float e2, float e1, float e0) pure @trusted
{
    float[4] lanes = [e0, e1, e2, e3];
    __m128 packed = loadUnaligned!(float4)(lanes.ptr);
    return packed;
}

// `_mm_set_ps1` is another name for `_mm_set1_ps`.
alias _mm_set_ps1 = _mm_set1_ps;
658 
659 // TODO: _MM_SET_ROUNDING_MODE
660 
// Builds a vector whose lane 0 is `a` and whose lanes 1..3 are zero.
__m128 _mm_set_ss (float a) pure @trusted
{
    float[4] lanes = [a, 0.0f, 0.0f, 0.0f];
    __m128 packed = loadUnaligned!(float4)(lanes.ptr);
    return packed;
}
666 
// Builds a vector with `a` replicated in all four lanes.
__m128 _mm_set1_ps (float a) pure @trusted
{
    float[4] lanes = [a, a, a, a];
    __m128 broadcast = loadUnaligned!(float4)(lanes.ptr);
    return broadcast;
}
672 
673 // TODO: _mm_setcsr
674 
// Builds a vector from four scalars in argument order: the first argument
// lands in lane 0 and the last in lane 3.
// Note that the parameter names do not match lane numbers here: the
// parameter called `e3` is lane 0 and the one called `e0` is lane 3.
__m128 _mm_setr_ps (float e3, float e2, float e1, float e0) pure @trusted
{
    float[4] lanes = [e3, e2, e1, e0];
    __m128 packed = loadUnaligned!(float4)(lanes.ptr);
    return packed;
}
680 
// Returns a vector with all four lanes set to 0.0f.
__m128 _mm_setzero_ps() pure @trusted
{
    float[4] zeros = [0.0f, 0.0f, 0.0f, 0.0f];
    __m128 cleared = loadUnaligned!(float4)(zeros.ptr);
    return cleared;
}
686 
// With LDC, `_mm_sfence` is the `__builtin_ia32_sfence` builtin.
version(LDC)
    alias _mm_sfence = __builtin_ia32_sfence;
// TODO: non-LDC _mm_sfence
692 
// MMXREG: _mm_shuffle_pi16
694 
// Selects two lanes from `a` (result lanes 0 and 1) and two lanes from `b`
// (result lanes 2 and 3); each selector is a 2-bit field of `imm`.
// Note: the immediate is a template argument, i.e. it is supplied at
// compile time rather than at runtime.
__m128 _mm_shuffle_ps(ubyte imm)(__m128 a, __m128 b) pure @safe
{
    enum int sel0 = imm & 3;               // bits 1:0 -> lane of `a`
    enum int sel1 = (imm >> 2) & 3;        // bits 3:2 -> lane of `a`
    enum int sel2 = 4 + ((imm >> 4) & 3);  // bits 5:4 -> lane of `b`
    enum int sel3 = 4 + ((imm >> 6) & 3);  // bits 7:6 -> lane of `b`
    return shufflevector!(__m128, sel0, sel1, sel2, sel3)(a, b);
}
700 
version(LDC)
{
    // With LDC the builtin is used directly.
    alias _mm_sqrt_ps = __builtin_ia32_sqrtps;
}
else
{
    // Portable fallback: square root of each of the four lanes.
    __m128 _mm_sqrt_ps(__m128 vec) pure @safe
    {
        import std.math: sqrt;
        foreach (lane; 0 .. 4)
            vec.array[lane] = sqrt(vec.array[lane]);
        return vec;
    }
}
717 
version(LDC)
{
    // With LDC the builtin is used directly.
    alias _mm_sqrt_ss = __builtin_ia32_sqrtss;
}
else
{
    // Portable fallback: square root of lane 0 only; lanes 1..3 pass through.
    __m128 _mm_sqrt_ss(__m128 vec) pure @safe
    {
        import std.math: sqrt;
        immutable float root = sqrt(vec.array[0]);
        vec.array[0] = root;
        return vec;
    }
}

unittest
{
    __m128 roots = _mm_sqrt_ps(_mm_set1_ps(4.0f));
    assert(roots.array[0] == 2.0f);
}
737 
// Stores all four lanes of `a` at `mem_addr` through a `__m128*` view.
// Deliberately not @safe: nothing guarantees that `mem_addr` is aligned.
void _mm_store_ps (float* mem_addr, __m128 a) pure
{
    auto dest = cast(__m128*)mem_addr;
    *dest = a;
}

// `_mm_store_ps1` is another name for `_mm_store1_ps`.
alias _mm_store_ps1 = _mm_store1_ps;
745 
// Stores lane 0 of `a` into the float pointed to by `mem_addr`.
void _mm_store_ss (float* mem_addr, __m128 a) pure @safe
{
    immutable float lowest = a[0];
    *mem_addr = lowest;
}
750 
// Replicates lane 0 of `a` into all four lanes and stores the result at
// `mem_addr` through a `__m128*` view.
// Deliberately not @safe: nothing guarantees that `mem_addr` is aligned.
void _mm_store1_ps (float* mem_addr, __m128 a) pure
{
    __m128 broadcast = shufflevector!(__m128, 0, 0, 0, 0)(a, a);
    auto dest = cast(__m128*)mem_addr;
    *dest = broadcast;
}
756 
// Stores the upper 64 bits of `a` (element 1 of its `long2` view) into `*p`.
void _mm_storeh_pi(__m64* p, __m128 a) pure @safe
{
    auto upperHalf = extractelement!(long2, 1)(a);
    *p = upperHalf;
}
761 
// Stores the lower 64 bits of `a` (element 0 of its `long2` view) into `*p`.
void _mm_storel_pi(__m64* p, __m128 a) pure @safe
{
    auto lowerHalf = extractelement!(long2, 0)(a);
    *p = lowerHalf;
}
766 
// Stores the four lanes of `a` at `mem_addr` in reversed order, through a
// `__m128*` view.
// Deliberately not @safe: nothing guarantees that `mem_addr` is aligned.
void _mm_storer_ps(float* mem_addr, __m128 a) pure
{
    __m128 reversed = shufflevector!(__m128, 3, 2, 1, 0)(a, a);
    auto dest = cast(__m128*)mem_addr;
    *dest = reversed;
}
772 
// Stores the four lanes of `a` at `mem_addr` through the `storeUnaligned`
// helper.
void _mm_storeu_ps(float* mem_addr, __m128 a) pure @safe
{
    alias storeFloat4 = storeUnaligned!(float4);
    storeFloat4(a, mem_addr);
}
777 
778 // TODO: _mm_stream_pi, does not seem possible
779 // TODO: _mm_stream_ps, does not seem possible
780 
781 
// Element-wise difference `a - b` over the four single-precision lanes.
__m128 _mm_sub_ps(__m128 a, __m128 b) pure @safe
{
    __m128 difference = a - b;
    return difference;
}
unittest
{
    __m128 v = [1.5f, -2.0f, 3.0f, 1.0f];
    v = _mm_sub_ps(v, v);
    float[4] expected = [0.0f, 0.0f, 0.0f, 0.0f];
    assert(v.array == expected);
}
793 
// Subtracts lane 0 of `b` from lane 0 of `a`; lanes 1..3 of `a` pass through.
__m128 _mm_sub_ss(__m128 a, __m128 b) pure @safe
{
    a[0] = a[0] - b[0];
    return a;
}
unittest
{
    __m128 v = [1.5f, -2.0f, 3.0f, 1.0f];
    v = _mm_sub_ss(v, v);
    float[4] expected = [0.0f, -2.0, 3.0f, 1.0f];
    assert(v.array == expected);
}
806 
807 
// Rewrites the four rows in place from interleaved pairs:
// rows (0,1) and (2,3) are interleaved with unpacklo/unpackhi, then the
// 64-bit halves of those intermediates are recombined with movelh/movehl.
void _MM_TRANSPOSE4_PS (ref __m128 row0, ref __m128 row1, ref __m128 row2, ref __m128 row3) pure @safe
{
    __m128 lo01 = _mm_unpacklo_ps(row0, row1);
    __m128 lo23 = _mm_unpacklo_ps(row2, row3);
    __m128 hi01 = _mm_unpackhi_ps(row0, row1);
    __m128 hi23 = _mm_unpackhi_ps(row2, row3);

    row0 = _mm_movelh_ps(lo01, lo23);
    row1 = _mm_movehl_ps(lo23, lo01);
    row2 = _mm_movelh_ps(hi01, hi23);
    row3 = _mm_movehl_ps(hi23, hi01);
}
820 
// Scalar "ucomi" comparisons. With LDC each intrinsic is a direct alias of
// the corresponding builtin; other compilers are not supported yet.

version(LDC)
    alias _mm_ucomieq_ss = __builtin_ia32_ucomieq;
// TODO: non-LDC _mm_ucomieq_ss

version(LDC)
    alias _mm_ucomige_ss = __builtin_ia32_ucomige;
// TODO: non-LDC _mm_ucomige_ss

version(LDC)
    alias _mm_ucomigt_ss = __builtin_ia32_ucomigt;
// TODO: non-LDC _mm_ucomigt_ss

version(LDC)
    alias _mm_ucomile_ss = __builtin_ia32_ucomile;
// TODO: non-LDC _mm_ucomile_ss

version(LDC)
    alias _mm_ucomilt_ss = __builtin_ia32_ucomilt;
// TODO: non-LDC _mm_ucomilt_ss

version(LDC)
    alias _mm_ucomineq_ss = __builtin_ia32_ucomineq;
// TODO: non-LDC _mm_ucomineq_ss
856 
857 
// Returns a vector that is deliberately left uninitialised (`= void`);
// its contents must not be relied upon.
__m128 _mm_undefined_ps() pure @safe
{
    __m128 uninitialised = void;
    return uninitialised;
}
863 
// Interleaves the upper two lanes of `a` and `b`:
// result = [ a[2], b[2], a[3], b[3] ].
__m128 _mm_unpackhi_ps (__m128 a, __m128 b) pure @safe
{
    __m128 interleaved = shufflevector!(float4, 2, 6, 3, 7)(a, b);
    return interleaved;
}
868 
// Interleaves the lower two lanes of `a` and `b`:
// result = [ a[0], b[0], a[1], b[1] ].
__m128 _mm_unpacklo_ps (__m128 a, __m128 b) pure @safe
{
    __m128 interleaved = shufflevector!(float4, 0, 4, 1, 5)(a, b);
    return interleaved;
}
873 
// Bitwise XOR of two float vectors.
// The operation is carried out on the integer reinterpretation of the
// operands, mirroring how `_mm_and_ps` and `_mm_or_ps` are written.
__m128 _mm_xor_ps (__m128 a, __m128 b) pure @safe
{
    return cast(__m128)(cast(__m128i)a ^ cast(__m128i)b);
}

// Integer-vector overload, kept so that existing callers passing
// `__m128i` operands keep compiling and get the same result as before.
__m128i _mm_xor_ps (__m128i a, __m128i b) pure @safe
{
    return a ^ b;
}