1 /**
2 * Copyright: Copyright Auburn Sounds 2016-2018.
3 * License:   $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
4 * Authors:   Guillaume Piolat
5 */
6 
7 module inteli.xmmintrin;
8 
9 public import inteli.types;
10 
11 import inteli.internals;
12 
// SSE1
// Note: intrinsics marked MMXREG operate on MMX registers and were not
// translated. They correspond to instructions introduced with SSE1 that
// also work on MMX registers.
17 
18 nothrow @nogc:
19 
/// Returns the lane-wise sum `a + b` of two float vectors.
__m128 _mm_add_ps(__m128 a, __m128 b) pure @safe
{
    __m128 sum = a + b;
    return sum;
}

unittest
{
    __m128 v = [1, 2, 3, 4];
    v = _mm_add_ps(v, v);
    float[4] expected = [2.0f, 4.0f, 6.0f, 8.0f];
    foreach (i; 0 .. 4)
        assert(v.array[i] == expected[i]);
}
34 
/// Adds lane 0 of `b` to lane 0 of `a`; lanes 1..3 are returned unchanged from `a`.
__m128 _mm_add_ss(__m128 a, __m128 b) pure @safe
{
    a[0] = a[0] + b[0];
    return a;
}
unittest
{
    __m128 v = [1, 2, 3, 4];
    v = _mm_add_ss(v, v);
    float[4] expected = [2.0f, 2.0f, 3.0f, 4.0f];
    assert(v.array == expected);
}
46 
/// Bitwise AND of `a` and `b`: both operands are reinterpreted as `__m128i`,
/// AND-ed, and the result is reinterpreted back as `__m128`.
__m128 _mm_and_ps (__m128 a, __m128 b) pure @safe
{
    __m128i bitsA = cast(__m128i)a;
    __m128i bitsB = cast(__m128i)b;
    return cast(__m128)(bitsA & bitsB);
}
unittest
{
    // Note: tested in emmintrin.d
}
55 
/// Computes `(~a) & b` on the raw bits of two float vectors.
///
/// This is a float (`_ps`) intrinsic, so it takes and returns `__m128` like
/// `_mm_and_ps` and `_mm_or_ps`; the integer view is only used internally
/// to perform the bitwise operations.
__m128 _mm_andnot_ps (__m128 a, __m128 b) pure @safe
{
    return cast(__m128)((~cast(__m128i)a) & cast(__m128i)b);
}
60 
61 
62 // MMXREG: _mm_avg_pu16
63 // MMXREG: _mm_avg_pu8
64 
// Packed-float compare builtin. With LDC it is bound directly to the LLVM
// intrinsic "llvm.x86.sse.cmp.ps"; the third argument is the comparison
// predicate. No fallback exists for other compilers yet.
version(LDC)
{
    pragma(LDC_intrinsic, "llvm.x86.sse.cmp.ps")
        __m128 __builtin_ia32_cmpps(__m128, __m128, byte) pure @safe;
}
else
{
    // unimplemented: kept as a commented-out stub so the intended signature
    // stays visible.
    /*__m128 __builtin_ia32_cmpps(__m128, __m128, byte) pure @safe
    {
        assert(false, "unimplemented");
    }*/
}
78 
version(LDC)
{
    // Predicate immediates handed to the CMPPS / CMPSS builtins.
    // "Reversed" variants below swap the operands instead of using a
    // dedicated predicate.
    private enum : byte
    {
        cmpImmEQ    = 0,
        cmpImmLT    = 1,
        cmpImmLE    = 2,
        cmpImmUNORD = 3,
        cmpImmNEQ   = 4,
        cmpImmNLT   = 5,
        cmpImmNLE   = 6,
        cmpImmORD   = 7,
    }

    /// CMPEQPS
    __m128 _mm_cmpeq_ps (__m128 a, __m128 b) pure @safe
    {
        return __builtin_ia32_cmpps(a, b, cmpImmEQ);
    }

    /// CMPEQSS
    __m128 _mm_cmpeq_ss (__m128 a, __m128 b) pure @safe
    {
        return __builtin_ia32_cmpss(a, b, cmpImmEQ);
    }

    /// CMPLEPS with operands reversed
    __m128 _mm_cmpge_ps (__m128 a, __m128 b) pure @safe
    {
        return __builtin_ia32_cmpps(b, a, cmpImmLE);
    }

    /// CMPLESS with operands reversed
    __m128 _mm_cmpge_ss (__m128 a, __m128 b) pure @safe
    {
        return __builtin_ia32_cmpss(b, a, cmpImmLE);
    }

    /// CMPLTPS with operands reversed
    __m128 _mm_cmpgt_ps (__m128 a, __m128 b) pure @safe
    {
        return __builtin_ia32_cmpps(b, a, cmpImmLT);
    }

    /// CMPLTSS with operands reversed
    __m128 _mm_cmpgt_ss (__m128 a, __m128 b) pure @safe
    {
        return __builtin_ia32_cmpss(b, a, cmpImmLT);
    }

    /// CMPLEPS
    __m128 _mm_cmple_ps (__m128 a, __m128 b) pure @safe
    {
        return __builtin_ia32_cmpps(a, b, cmpImmLE);
    }

    /// CMPLESS
    __m128 _mm_cmple_ss (__m128 a, __m128 b) pure @safe
    {
        return __builtin_ia32_cmpss(a, b, cmpImmLE);
    }

    /// CMPLTPS
    __m128 _mm_cmplt_ps (__m128 a, __m128 b) pure @safe
    {
        return __builtin_ia32_cmpps(a, b, cmpImmLT);
    }

    /// CMPLTSS
    __m128 _mm_cmplt_ss (__m128 a, __m128 b) pure @safe
    {
        return __builtin_ia32_cmpss(a, b, cmpImmLT);
    }

    /// CMPNEQPS
    __m128 _mm_cmpneq_ps (__m128 a, __m128 b) pure @safe
    {
        return __builtin_ia32_cmpps(a, b, cmpImmNEQ);
    }

    /// CMPNEQSS
    __m128 _mm_cmpneq_ss (__m128 a, __m128 b) pure @safe
    {
        return __builtin_ia32_cmpss(a, b, cmpImmNEQ);
    }

    /// CMPNLEPS with operands reversed
    __m128 _mm_cmpnge_ps (__m128 a, __m128 b) pure @safe
    {
        return __builtin_ia32_cmpps(b, a, cmpImmNLE);
    }

    /// CMPNLESS with operands reversed
    __m128 _mm_cmpnge_ss (__m128 a, __m128 b) pure @safe
    {
        return __builtin_ia32_cmpss(b, a, cmpImmNLE);
    }

    /// CMPNLTPS with operands reversed
    __m128 _mm_cmpngt_ps (__m128 a, __m128 b) pure @safe
    {
        return __builtin_ia32_cmpps(b, a, cmpImmNLT);
    }

    /// CMPNLTSS with operands reversed
    __m128 _mm_cmpngt_ss (__m128 a, __m128 b) pure @safe
    {
        return __builtin_ia32_cmpss(b, a, cmpImmNLT);
    }

    /// CMPNLEPS
    __m128 _mm_cmpnle_ps (__m128 a, __m128 b) pure @safe
    {
        return __builtin_ia32_cmpps(a, b, cmpImmNLE);
    }

    /// CMPNLESS
    __m128 _mm_cmpnle_ss (__m128 a, __m128 b) pure @safe
    {
        return __builtin_ia32_cmpss(a, b, cmpImmNLE);
    }

    /// CMPNLTPS
    __m128 _mm_cmpnlt_ps (__m128 a, __m128 b) pure @safe
    {
        return __builtin_ia32_cmpps(a, b, cmpImmNLT);
    }

    /// CMPNLTSS
    __m128 _mm_cmpnlt_ss (__m128 a, __m128 b) pure @safe
    {
        return __builtin_ia32_cmpss(a, b, cmpImmNLT);
    }

    /// CMPORDPS
    __m128 _mm_cmpord_ps (__m128 a, __m128 b) pure @safe
    {
        return __builtin_ia32_cmpps(a, b, cmpImmORD);
    }

    /// CMPORDSS
    __m128 _mm_cmpord_ss (__m128 a, __m128 b) pure @safe
    {
        return __builtin_ia32_cmpss(a, b, cmpImmORD);
    }

    /// CMPUNORDPS
    __m128 _mm_cmpunord_ps (__m128 a, __m128 b) pure @safe
    {
        return __builtin_ia32_cmpps(a, b, cmpImmUNORD);
    }

    /// CMPUNORDSS
    __m128 _mm_cmpunord_ss (__m128 a, __m128 b) pure @safe
    {
        return __builtin_ia32_cmpss(a, b, cmpImmUNORD);
    }
}
else
{
    // TODO: comparison intrinsics are not implemented for non-LDC compilers
}
205 
// Scalar COMISS-style comparisons. With LDC each intrinsic is a plain alias
// of the matching builtin; other compilers have no implementation yet.
// NOTE(review): the commented-out stubs below declare a `__m128i` return
// type; the Intel intrinsics of these names return `int` — confirm before
// enabling any of them.
version(LDC)
{
    alias _mm_comieq_ss = __builtin_ia32_comieq;
}
else
{
    // TODO: non-LDC fallback
    /*__m128i _mm_comieq_ss(__m128, __m128) pure @safe
    {
        assert(false, "unimplemented");
    }
    */
}


version(LDC)
{
    alias _mm_comige_ss = __builtin_ia32_comige;
}
else
{
    // TODO: non-LDC fallback
    /*
    __m128i _mm_comige_ss(__m128, __m128) pure @safe
    {
        assert(false, "unimplemented");
    }
    */
}


version(LDC)
{
    alias _mm_comigt_ss = __builtin_ia32_comigt;
}
else
{
    // TODO: non-LDC fallback
    /*
    __m128i _mm_comigt_ss(__m128, __m128) pure @safe
    {
        assert(false, "unimplemented");
    }
    */
}


version(LDC)
{
    alias _mm_comile_ss = __builtin_ia32_comile;
}
else
{
    // TODO: non-LDC fallback
    /*
    __m128i _mm_comile_ss(__m128, __m128) pure @safe
    {
        assert(false, "unimplemented");
    }
    */
}


version(LDC)
{
    alias _mm_comilt_ss = __builtin_ia32_comilt;
}
else
{
    // TODO: non-LDC fallback
    /*
    __m128i _mm_comilt_ss(__m128, __m128) pure @safe
    {
        assert(false, "unimplemented");
    }
    */
}

version(LDC)
{
    alias _mm_comineq_ss = __builtin_ia32_comineq;
}
else
{
    // TODO: non-LDC fallback
    /*
    __m128i _mm_comineq_ss(__m128, __m128) pure @safe
    {
        assert(false, "unimplemented");
    }
    */
}
298 
299 // MMXREG: __m128 _mm_cvt_pi2ps (__m128 a, __m64 b)
300 // MMXREG: __m64 _mm_cvt_ps2pi (__m128 a)
301 
302 
/// Converts `x` to float and stores it in lane 0 of `v`; lanes 1..3 of the
/// returned vector are those of `v`.
__m128 _mm_cvt_si2ss(__m128 v, int x) pure @safe
{
    immutable float converted = cast(float)x;
    v[0] = converted;
    return v;
}
unittest
{
    __m128 r = _mm_cvt_si2ss(_mm_set1_ps(0.0f), 42);
    float[4] correct = [42.0f, 0.0f, 0.0f, 0.0f];
    assert(r.array == correct);
}
313 
// Float-to-int conversion of the lowest lane: with LDC this is an alias of
// the `__builtin_ia32_cvtss2si` builtin. No fallback for other compilers yet.
version(LDC)
{
    alias _mm_cvt_ss2si = __builtin_ia32_cvtss2si;
}
else
{
    // TODO: non-LDC fallback
    /*
    int _mm_cvt_ss2si(__m128 v) pure @safe
    {
        assert(false, "unimplemented");
    }
    */
}
328 
329 // MMXREG: __m128 _mm_cvtpi16_ps (__m64 a)
330 // MMXREG: __m128 _mm_cvtpi32_ps (__m128 a, __m64 b)
331 // MMXREG: __m128 _mm_cvtpi32x2_ps (__m64 a, __m64 b)
332 // MMXREG: __m128 _mm_cvtpi8_ps (__m64 a)
333 // MMXREG: __m64 _mm_cvtps_pi16 (__m128 a)
334 // MMXREG: __m64 _mm_cvtps_pi32 (__m128 a)
335 // MMXREG: __m64 _mm_cvtps_pi8 (__m128 a)
336 // MMXREG: __m128 _mm_cvtpu16_ps (__m64 a)
337 // MMXREG: __m128 _mm_cvtpu8_ps (__m64 a)
338 
/// Converts the 32-bit integer `x` to float and writes it to lane 0 of `v`;
/// the other lanes of `v` are returned untouched.
__m128 _mm_cvtsi32_ss(__m128 v, int x) pure @safe
{
    immutable float converted = cast(float)x;
    v[0] = converted;
    return v;
}
unittest
{
    __m128 r = _mm_cvtsi32_ss(_mm_set1_ps(0.0f), 42);
    float[4] correct = [42.0f, 0.0f, 0.0f, 0.0f];
    assert(r.array == correct);
}
349 
/// Converts the 64-bit integer `x` to float and writes it to lane 0 of `v`;
/// the other lanes of `v` are returned untouched.
// Note: on macOS, using "llvm.x86.sse.cvtsi642ss" was buggy, hence the
// plain D conversion below.
__m128 _mm_cvtsi64_ss(__m128 v, long x) pure @safe
{
    immutable float converted = cast(float)x;
    v[0] = converted;
    return v;
}
unittest
{
    __m128 r = _mm_cvtsi64_ss(_mm_set1_ps(0.0f), 42);
    float[4] correct = [42.0f, 0.0f, 0.0f, 0.0f];
    assert(r.array == correct);
}
361 
/// Returns lane 0 of `a` as a scalar float.
float _mm_cvtss_f32(__m128 a) pure @safe
{
    immutable float lowest = a[0];
    return lowest;
}
366 
// Scalar float-to-integer conversions. All of them are LDC-only aliases of
// the corresponding builtins; non-LDC fallbacks are still to be written.
version(LDC)
{
    alias _mm_cvtss_si32 = __builtin_ia32_cvtss2si;
}
else
{
    // TODO: non-LDC fallback
}

version(LDC)
{
    alias _mm_cvtss_si64 = __builtin_ia32_cvtss2si64;
}
else
{
    // TODO: non-LDC fallback
}

// MMXREG: __m64 _mm_cvtt_ps2pi (__m128 a)

version(LDC)
{
    alias _mm_cvtt_ss2si = __builtin_ia32_cvttss2si;
    alias _mm_cvttss_si32 = _mm_cvtt_ss2si; // it's actually the same op
}
else
{
    // TODO: non-LDC fallback
}

// MMXREG: _mm_cvttps_pi32

version(LDC)
{
    alias _mm_cvttss_si64 = __builtin_ia32_cvttss2si64;
}
else
{
    // TODO: non-LDC fallback
}
407 
/// Returns the lane-wise quotient `a / b`.
__m128 _mm_div_ps(__m128 a, __m128 b) pure @safe
{
    __m128 quotient = a / b;
    return quotient;
}
unittest
{
    __m128 v = [1.5f, -2.0f, 3.0f, 1.0f];
    v = _mm_div_ps(v, v);
    float[4] expected = [1.0f, 1.0f, 1.0f, 1.0f];
    assert(v.array == expected);
}
419 
/// Divides lane 0 of `a` by lane 0 of `b`; lanes 1..3 are returned unchanged from `a`.
__m128 _mm_div_ss(__m128 a, __m128 b) pure @safe
{
    a[0] = a[0] / b[0];
    return a;
}
unittest
{
    __m128 v = [1.5f, -2.0f, 3.0f, 1.0f];
    v = _mm_div_ss(v, v);
    float[4] expected = [1.0f, -2.0, 3.0f, 1.0f];
    assert(v.array == expected);
}
432 
433 // MMXREG: int _mm_extract_pi16 (__m64 a, int imm8)
434 
435 // TODO: unsigned int _MM_GET_EXCEPTION_MASK ()
436 // TODO: unsigned int _MM_GET_EXCEPTION_STATE ()
437 // TODO: unsigned int _MM_GET_FLUSH_ZERO_MODE ()
438 // TODO: unsigned int _MM_GET_ROUNDING_MODE ()
439 // TODO: stmxcsr
440 // TODO: unsigned int _mm_getcsr (void)
441 
442 // MMXREG: __m64 _mm_insert_pi16 (__m64 a, int i, int imm8)
443 
/// Loads a whole vector by reinterpreting `p` as a pointer to `__m128`.
/// The pointer is dereferenced as a vector, so the caller is responsible
/// for it being suitable for such an access.
__m128 _mm_load_ps(const(float)*p) pure @trusted
{
    __m128* vecPtr = cast(__m128*)p;
    return *vecPtr;
}
448 
/// Reads one float from `p` and returns a vector with that value in all four lanes.
__m128 _mm_load_ps1(const(float)*p) pure @trusted
{
    immutable float value = *p;
    float[4] lanes = value; // broadcast to every element
    return loadUnaligned!(float4)(lanes.ptr);
}
454 
/// Reads one float from `mem_addr` into lane 0; lanes 1..3 are zero.
__m128 _mm_load_ss (const(float)* mem_addr) pure @trusted
{
    float[4] lanes = 0.0f;
    lanes[0] = *mem_addr;
    return loadUnaligned!(float4)(lanes.ptr);
}
460 
// Alternate name for _mm_load_ps1 (same function).
alias _mm_load1_ps = _mm_load_ps1;
462 
/// Replaces the upper 64 bits of `a` with the value read from `mem_addr`;
/// the lower 64 bits are kept.
__m128 _mm_loadh_pi (__m128 a, const(__m64)* mem_addr) pure @safe
{
    long2 halves = cast(long2)a; // view the vector as two 64-bit halves
    halves[1] = *mem_addr;
    return cast(__m128)halves;
}
469 
/// Replaces the lower 64 bits of `a` with the value read from `mem_addr`;
/// the upper 64 bits are kept.
__m128 _mm_loadl_pi (__m128 a, const(__m64)* mem_addr) pure @safe
{
    long2 halves = cast(long2)a; // view the vector as two 64-bit halves
    halves[0] = *mem_addr;
    return cast(__m128)halves;
}
476 
/// Loads a vector through `mem_addr` (reinterpreted as `__m128*`) and
/// returns it with the lane order reversed.
__m128 _mm_loadr_ps (const(float)* mem_addr) pure @trusted
{
    __m128 loaded = *cast(__m128*)mem_addr;
    return shufflevector!(__m128, 3, 2, 1, 0)(loaded, loaded);
}
483 
/// Loads four floats from `p` using an unaligned load.
///
/// Takes `const(float)*` like every other load in this module, so data that
/// is const or immutable can be loaded too; existing callers passing a
/// mutable `float*` keep compiling through the implicit conversion.
/// The function only reads through `p`; the cast merely adapts to the
/// pointer type expected by `loadUnaligned`.
__m128 _mm_loadu_ps(const(float)* p) pure @trusted
{
    return loadUnaligned!(__m128)(cast(float*)p);
}
488 
489 // MMXREG: _mm_maskmove_si64
490 // MMXREG: _m_maskmovq
491 
492 // MMXREG: _mm_max_pi16
// Packed and scalar max/min. With LDC each intrinsic is an alias of the
// matching builtin; other compilers have no implementation yet.
version(LDC)
{
    alias _mm_max_ps = __builtin_ia32_maxps;
}
else
{
    // TODO: non-LDC fallback
}

// MMXREG: _mm_max_pu8
version(LDC)
{
    alias _mm_max_ss = __builtin_ia32_maxss;
}
else
{
    // TODO: non-LDC fallback
}

// MMXREG: _mm_min_pi16
version(LDC)
{
    alias _mm_min_ps = __builtin_ia32_minps;
}
else
{
    // TODO: non-LDC fallback
}

// MMXREG: _mm_min_pu8

// Note: unlike its siblings, _mm_min_ss has no `else` branch at all.
version(LDC)
{
    alias _mm_min_ss = __builtin_ia32_minss;
}
528 
/// Returns `a` with its lane 0 replaced by lane 0 of `b`.
__m128 _mm_move_ss (__m128 a, __m128 b) pure @safe
{
    a[0] = b[0];
    return a;
}
533 
/// Moves the upper two lanes of `b` into the lower two lanes of the result;
/// the upper two lanes of the result are the upper two lanes of `a`.
/// Result: [b[2], b[3], a[2], a[3]] (MOVHLPS).
__m128 _mm_movehl_ps (__m128 a, __m128 b) pure @safe
{
    // Shuffle indices 0..3 select lanes of `a`, 4..7 select lanes of `b`.
    // The previous mask (2, 3, 6, 7) put the halves in the wrong order.
    return shufflevector!(float4, 6, 7, 2, 3)(a, b);
}
unittest
{
    __m128 a = [0.0f, 1.0f, 2.0f, 3.0f];
    __m128 b = [4.0f, 5.0f, 6.0f, 7.0f];
    __m128 r = _mm_movehl_ps(a, b);
    float[4] correct = [6.0f, 7.0f, 2.0f, 3.0f];
    assert(r.array == correct);
}
538 
/// Returns [a[0], a[1], b[0], b[1]]: the lower halves of `a` and `b`
/// concatenated (indices 4 and 5 select lanes 0 and 1 of `b`).
__m128 _mm_movelh_ps (__m128 a, __m128 b) pure @safe
{
    __m128 lowHalves = shufflevector!(float4, 0, 1, 4, 5)(a, b);
    return lowHalves;
}
543 
// TODO: int _mm_movemask_pi8
// _mm_movemask_ps is an LDC-only alias of the MOVMSKPS builtin; there is no
// fallback for other compilers.
version(LDC)
{
    alias _mm_movemask_ps = __builtin_ia32_movmskps;
}
549 
/// Returns the lane-wise product `a * b`.
__m128 _mm_mul_ps(__m128 a, __m128 b) pure @safe
{
    __m128 product = a * b;
    return product;
}
unittest
{
    __m128 v = [1.5f, -2.0f, 3.0f, 1.0f];
    v = _mm_mul_ps(v, v);
    float[4] expected = [2.25f, 4.0f, 9.0f, 1.0f];
    assert(v.array == expected);
}
561 
/// Multiplies lane 0 of `a` by lane 0 of `b`; lanes 1..3 are returned unchanged from `a`.
__m128 _mm_mul_ss(__m128 a, __m128 b) pure @safe
{
    a[0] = a[0] * b[0];
    return a;
}
unittest
{
    __m128 v = [1.5f, -2.0f, 3.0f, 1.0f];
    v = _mm_mul_ss(v, v);
    float[4] expected = [2.25f, -2.0f, 3.0f, 1.0f];
    assert(v.array == expected);
}
574 
575 // MMXREG: _mm_mulhi_pu16
576 
/// Bitwise OR of `a` and `b`: both operands are reinterpreted as `__m128i`,
/// OR-ed, and the result is reinterpreted back as `__m128`.
__m128 _mm_or_ps (__m128 a, __m128 b) pure @safe
{
    __m128i bitsA = cast(__m128i)a;
    __m128i bitsB = cast(__m128i)b;
    return cast(__m128)(bitsA | bitsB);
}
581 
582 // MMXREG: __m64 _m_pavgb (__m64 a, __m64 b)
583 // MMXREG: __m64 _m_pavgw (__m64 a, __m64 b)
584 // MMXREG: int _m_pextrw (__m64 a, int imm8)
585 // MMXREG: __m64 _m_pinsrw (__m64 a, int i, int imm8)
586 // MMXREG: __m64 _m_pmaxsw (__m64 a, __m64 b)
587 // MMXREG: __m64 _m_pmaxub (__m64 a, __m64 b)
588 // MMXREG: __m64 _m_pminsw (__m64 a, __m64 b)
589 // MMXREG: __m64 _m_pminub (__m64 a, __m64 b)
590 // MMXREG: int _m_pmovmskb (__m64 a)
591 
592 // MMXREG: __m64 _m_pmulhuw (__m64 a, __m64 b)
593 
// Prefetch hints. The values follow the standard xmmintrin.h encoding
// (T0 = 3 ... NTA = 0). Because `_mm_prefetch` forwards the hint unchanged as
// the `locality` argument of `llvm_prefetch`, where 3 means "most local" and
// 0 means "no temporal locality", T0 must be the largest value. The previous
// values had T0 and T2 swapped, so _MM_HINT_T0 requested the weakest
// temporal hint instead of the strongest.
enum _MM_HINT_NTA = 0;
enum _MM_HINT_T0 = 3;
enum _MM_HINT_T1 = 2;
enum _MM_HINT_T2 = 1;

/// Issues a prefetch for the address `p`.
/// Params:
///     locality = one of the `_MM_HINT_*` constants; must be known at
///                compile time since it is a template argument.
///     p = address to prefetch.
// The remaining `llvm_prefetch` arguments are fixed: 0 as second argument
// and 1 as last argument (read access and data cache, per the llvm.prefetch
// documentation).
void _mm_prefetch(int locality)(void* p) pure @safe
{
    llvm_prefetch(p, 0, locality, 1);
}
604 
605 // MMXREG: __m64 _m_psadbw (__m64 a, __m64 b)
606 // MMXREG: __m64 _m_pshufw (__m64 a, int imm8)
607 
// Reciprocal and reciprocal-square-root intrinsics. Each is an LDC-only
// alias of the matching builtin; non-LDC fallbacks are still TODO.
version(LDC)
{
    alias _mm_rcp_ps = __builtin_ia32_rcpps;
}
// TODO: non-LDC fallback

version(LDC)
{
    alias _mm_rcp_ss = __builtin_ia32_rcpss;
}
// TODO: non-LDC fallback

version(LDC)
{
    alias _mm_rsqrt_ps = __builtin_ia32_rsqrtps;
}
// TODO: non-LDC fallback

version(LDC)
{
    alias _mm_rsqrt_ss = __builtin_ia32_rsqrtss;
}
// TODO: non-LDC fallback
631 
632 // TODO: _mm_sad_pu8
633 // TODO: void _MM_SET_EXCEPTION_MASK (unsigned int a)
634 // TODO: void _MM_SET_EXCEPTION_STATE (unsigned int a)
635 // TODO: void _MM_SET_FLUSH_ZERO_MODE (unsigned int a)
636 
/// Builds a vector whose lane 0 is `e0`, lane 1 is `e1`, lane 2 is `e2`
/// and lane 3 is `e3` (arguments are given highest lane first).
__m128 _mm_set_ps (float e3, float e2, float e1, float e0) pure @trusted
{
    float[4] lanes;
    lanes[0] = e0;
    lanes[1] = e1;
    lanes[2] = e2;
    lanes[3] = e3;
    return loadUnaligned!(float4)(lanes.ptr);
}
642 
// Alternate name for _mm_set1_ps (same function).
alias _mm_set_ps1 = _mm_set1_ps;
644 
645 // TODO: _MM_SET_ROUNDING_MODE
646 
/// Builds a vector with `a` in lane 0 and zero in lanes 1..3.
__m128 _mm_set_ss (float a) pure @trusted
{
    float[4] lanes = 0.0f;
    lanes[0] = a;
    return loadUnaligned!(float4)(lanes.ptr);
}
652 
/// Builds a vector with `a` in all four lanes.
__m128 _mm_set1_ps (float a) pure @trusted
{
    float[4] lanes = a; // broadcast to every element
    return loadUnaligned!(float4)(lanes.ptr);
}
658 
659 // TODO: _mm_setcsr
660 
/// Builds a vector from the arguments in the order given: the first
/// argument (`e3`) goes to lane 0 and the last (`e0`) to lane 3.
__m128 _mm_setr_ps (float e3, float e2, float e1, float e0) pure @trusted
{
    float[4] lanes;
    lanes[0] = e3;
    lanes[1] = e2;
    lanes[2] = e1;
    lanes[3] = e0;
    return loadUnaligned!(float4)(lanes.ptr);
}
666 
/// Returns a vector with all four lanes set to 0.0f.
__m128 _mm_setzero_ps() pure @trusted
{
    float[4] zeros = 0.0f;
    return loadUnaligned!(float4)(zeros.ptr);
}
672 
// _mm_sfence is an LDC-only alias of the SFENCE builtin.
version(LDC)
{
    alias _mm_sfence = __builtin_ia32_sfence;
}
// TODO: non-LDC fallback
678 
// MMXREG: _mm_shuffle_pi16
680 
/// Shuffles lanes of `a` and `b` according to `imm`: result lanes 0 and 1
/// are picked from `a` by bits 0-1 and 2-3 of `imm`, result lanes 2 and 3
/// are picked from `b` by bits 4-5 and 6-7.
// Note: the immediate shuffle value is given at compile-time instead of runtime.
__m128 _mm_shuffle_ps(ubyte imm)(__m128 a, __m128 b) pure @safe
{
    // Indices 0..3 address `a`, 4..7 address `b`.
    enum int sel0 = imm & 3;
    enum int sel1 = (imm >> 2) & 3;
    enum int sel2 = 4 + ((imm >> 4) & 3);
    enum int sel3 = 4 + ((imm >> 6) & 3);
    return shufflevector!(__m128, sel0, sel1, sel2, sel3)(a, b);
}
686 
// Square root of each of the four lanes.
version(LDC)
{
    // Disappeared with LDC 1.11
    static if (__VERSION__ < 2081)
    {
        alias _mm_sqrt_ps = __builtin_ia32_sqrtps;
    }
    else
    {
        __m128 _mm_sqrt_ps(__m128 vec) pure @safe
        {
            foreach (lane; 0 .. 4)
                vec.array[lane] = llvm_sqrt(vec.array[lane]);
            return vec;
        }
    }
}
else
{
    __m128 _mm_sqrt_ps(__m128 vec) pure @safe
    {
        import std.math: sqrt;
        foreach (lane; 0 .. 4)
            vec.array[lane] = sqrt(vec.array[lane]);
        return vec;
    }
}
unittest
{
    __m128 A = _mm_sqrt_ps(_mm_set1_ps(4.0f));
    foreach (lane; 0 .. 4)
        assert(A.array[lane] == 2.0f);
}
724 
// Square root of lane 0 only; lanes 1..3 are passed through unchanged.
version(LDC)
{
    // Disappeared with LDC 1.11
    static if (__VERSION__ < 2081)
    {
        alias _mm_sqrt_ss = __builtin_ia32_sqrtss;
    }
    else
    {
        __m128 _mm_sqrt_ss(__m128 vec) pure @safe
        {
            immutable float root = llvm_sqrt(vec.array[0]);
            vec.array[0] = root;
            return vec;
        }
    }
}
else
{
    __m128 _mm_sqrt_ss(__m128 vec) pure @safe
    {
        import std.math: sqrt;
        immutable float root = sqrt(vec.array[0]);
        vec.array[0] = root;
        return vec;
    }
}
unittest
{
    __m128 A = _mm_sqrt_ss(_mm_set1_ps(4.0f));
    float[4] correct = [2.0f, 4.0f, 4.0f, 4.0f];
    foreach (lane; 0 .. 4)
        assert(A.array[lane] == correct[lane]);
}
759 
/// Stores the whole vector `a` through `mem_addr`, reinterpreted as a
/// pointer to `__m128`.
void _mm_store_ps (float* mem_addr, __m128 a) pure // not safe since nothing guarantees alignment
{
    *cast(__m128*)mem_addr = a;
}
765 
// Alternate name for _mm_store1_ps (same function).
alias _mm_store_ps1 = _mm_store1_ps;
767 
/// Writes lane 0 of `a` to `*mem_addr`.
void _mm_store_ss (float* mem_addr, __m128 a) pure @safe
{
    immutable float lowest = a[0];
    *mem_addr = lowest;
}
772 
/// Broadcasts lane 0 of `a` to all four lanes and stores the result through
/// `mem_addr`, reinterpreted as a pointer to `__m128`.
void _mm_store1_ps (float* mem_addr, __m128 a) pure // not safe since nothing guarantees alignment
{
    __m128 broadcast = shufflevector!(__m128, 0, 0, 0, 0)(a, a);
    *cast(__m128*)mem_addr = broadcast;
}
778 
/// Writes the upper 64 bits of `a` (element 1 when viewed as `long2`) to `*p`.
void _mm_storeh_pi(__m64* p, __m128 a) pure @safe
{
    auto upperHalf = extractelement!(long2, 1)(a);
    *p = upperHalf;
}
783 
/// Writes the lower 64 bits of `a` (element 0 when viewed as `long2`) to `*p`.
void _mm_storel_pi(__m64* p, __m128 a) pure @safe
{
    auto lowerHalf = extractelement!(long2, 0)(a);
    *p = lowerHalf;
}
788 
/// Stores `a` with its lane order reversed through `mem_addr`,
/// reinterpreted as a pointer to `__m128`.
void _mm_storer_ps(float* mem_addr, __m128 a) pure // not safe since nothing guarantees alignment
{
    __m128 reversed = shufflevector!(__m128, 3, 2, 1, 0)(a, a);
    *cast(__m128*)mem_addr = reversed;
}
794 
/// Stores the four lanes of `a` to `mem_addr` via `storeUnaligned`.
void _mm_storeu_ps(float* mem_addr, __m128 a) pure @safe
{
    storeUnaligned!float4(a, mem_addr);
}
799 
800 // TODO: _mm_stream_pi, does not seem possible
801 // TODO: _mm_stream_ps, does not seem possible
802 
803 
/// Returns the lane-wise difference `a - b`.
__m128 _mm_sub_ps(__m128 a, __m128 b) pure @safe
{
    __m128 difference = a - b;
    return difference;
}
unittest
{
    __m128 v = [1.5f, -2.0f, 3.0f, 1.0f];
    v = _mm_sub_ps(v, v);
    float[4] expected = [0.0f, 0.0f, 0.0f, 0.0f];
    assert(v.array == expected);
}
815 
/// Subtracts lane 0 of `b` from lane 0 of `a`; lanes 1..3 are returned unchanged from `a`.
__m128 _mm_sub_ss(__m128 a, __m128 b) pure @safe
{
    a[0] = a[0] - b[0];
    return a;
}
unittest
{
    __m128 v = [1.5f, -2.0f, 3.0f, 1.0f];
    v = _mm_sub_ss(v, v);
    float[4] expected = [0.0f, -2.0, 3.0f, 1.0f];
    assert(v.array == expected);
}
828 
829 
/// Transposes, in place, the 4x4 float matrix whose rows are `row0`..`row3`.
///
/// The final recombination uses explicit shuffles (indices 0..3 select from
/// the first operand, 4..7 from the second) so that each output row takes
/// its first two lanes from the row0/row1 interleave and its last two lanes
/// from the row2/row3 interleave. The previous version went through
/// `_mm_movehl_ps`, whose half ordering made rows 1 and 3 come out with
/// their halves swapped.
void _MM_TRANSPOSE4_PS (ref __m128 row0, ref __m128 row1, ref __m128 row2, ref __m128 row3) pure @safe
{
    // Interleave pairs of rows.
    __m128 lo01 = _mm_unpacklo_ps(row0, row1); // [r0[0], r1[0], r0[1], r1[1]]
    __m128 lo23 = _mm_unpacklo_ps(row2, row3); // [r2[0], r3[0], r2[1], r3[1]]
    __m128 hi01 = _mm_unpackhi_ps(row0, row1); // [r0[2], r1[2], r0[3], r1[3]]
    __m128 hi23 = _mm_unpackhi_ps(row2, row3); // [r2[2], r3[2], r2[3], r3[3]]

    // Combine matching halves to form the columns of the original matrix.
    row0 = shufflevector!(float4, 0, 1, 4, 5)(lo01, lo23); // column 0
    row1 = shufflevector!(float4, 2, 3, 6, 7)(lo01, lo23); // column 1
    row2 = shufflevector!(float4, 0, 1, 4, 5)(hi01, hi23); // column 2
    row3 = shufflevector!(float4, 2, 3, 6, 7)(hi01, hi23); // column 3
}
unittest
{
    __m128 r0 = [ 0.0f,  1.0f,  2.0f,  3.0f];
    __m128 r1 = [ 4.0f,  5.0f,  6.0f,  7.0f];
    __m128 r2 = [ 8.0f,  9.0f, 10.0f, 11.0f];
    __m128 r3 = [12.0f, 13.0f, 14.0f, 15.0f];
    _MM_TRANSPOSE4_PS(r0, r1, r2, r3);
    float[4] c0 = [0.0f, 4.0f,  8.0f, 12.0f];
    float[4] c1 = [1.0f, 5.0f,  9.0f, 13.0f];
    float[4] c2 = [2.0f, 6.0f, 10.0f, 14.0f];
    float[4] c3 = [3.0f, 7.0f, 11.0f, 15.0f];
    assert(r0.array == c0);
    assert(r1.array == c1);
    assert(r2.array == c2);
    assert(r3.array == c3);
}
842 
// Scalar UCOMISS-style comparisons. Each is an LDC-only alias of the
// matching builtin; non-LDC fallbacks are still TODO.
version(LDC)
{
    alias _mm_ucomieq_ss = __builtin_ia32_ucomieq;
}
// TODO: non-LDC fallback

version(LDC)
{
    alias _mm_ucomige_ss = __builtin_ia32_ucomige;
}
// TODO: non-LDC fallback

version(LDC)
{
    alias _mm_ucomigt_ss = __builtin_ia32_ucomigt;
}
// TODO: non-LDC fallback

version(LDC)
{
    alias _mm_ucomile_ss = __builtin_ia32_ucomile;
}
// TODO: non-LDC fallback

version(LDC)
{
    alias _mm_ucomilt_ss = __builtin_ia32_ucomilt;
}
// TODO: non-LDC fallback

version(LDC)
{
    alias _mm_ucomineq_ss = __builtin_ia32_ucomineq;
}
// TODO: non-LDC fallback
878 
879 
/// Returns a vector that is deliberately left uninitialized (`= void`);
/// its contents must not be relied upon.
__m128 _mm_undefined_ps() pure @safe
{
    __m128 uninitialized = void;
    return uninitialized;
}
885 
/// Interleaves the upper two lanes of `a` and `b`:
/// returns [a[2], b[2], a[3], b[3]].
__m128 _mm_unpackhi_ps (__m128 a, __m128 b) pure @safe
{
    __m128 interleaved = shufflevector!(float4, 2, 6, 3, 7)(a, b);
    return interleaved;
}
890 
/// Interleaves the lower two lanes of `a` and `b`:
/// returns [a[0], b[0], a[1], b[1]].
__m128 _mm_unpacklo_ps (__m128 a, __m128 b) pure @safe
{
    __m128 interleaved = shufflevector!(float4, 0, 4, 1, 5)(a, b);
    return interleaved;
}
895 
/// Bitwise XOR of the raw bits of two float vectors.
///
/// This is a float (`_ps`) intrinsic, so it takes and returns `__m128` like
/// `_mm_and_ps` and `_mm_or_ps`; the integer view is only used internally
/// to perform the bitwise operation.
__m128 _mm_xor_ps (__m128 a, __m128 b) pure @safe
{
    return cast(__m128)(cast(__m128i)a ^ cast(__m128i)b);
}