1 /**
2 * Copyright: Copyright Auburn Sounds 2016-2018.
3 * License:   $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
4 * Authors:   Guillaume Piolat
5 */
6 
7 module inteli.xmmintrin;
8 
9 public import inteli.types;
10 
11 import inteli.internals;
12 
13 // SSE1
14 // Note: intrinsics noted MMXREG are actually using MMX registers,
15 // and were not translated. These intrinsics are for instruction
16 // introduced with SSE1, that also work on MMX registers.
17 
18 nothrow @nogc:
19 
/// Adds the four single-precision lanes of `a` and `b` (ADDPS).
__m128 _mm_add_ps(__m128 a, __m128 b) pure @safe
{
    __m128 sum = a + b;
    return sum;
}

unittest
{
    __m128 x = [1, 2, 3, 4];
    x = _mm_add_ps(x, x);
    float[4] expected = [2.0f, 4.0f, 6.0f, 8.0f];
    assert(x.array == expected);
}
34 
/// Adds the lowest lanes of `a` and `b` (ADDSS); upper lanes of `a` pass through.
__m128 _mm_add_ss(__m128 a, __m128 b) pure @safe
{
    __m128 r = a;
    r[0] = a[0] + b[0];
    return r;
}
unittest
{
    __m128 x = [1, 2, 3, 4];
    x = _mm_add_ss(x, x);
    assert(x.array == [2.0f, 2, 3, 4]);
}
46 
/// Bitwise AND of all 128 bits of `a` and `b`, reinterpreted through integers.
__m128 _mm_and_ps (__m128 a, __m128 b) pure @safe
{
    __m128i ia = cast(__m128i)a;
    __m128i ib = cast(__m128i)b;
    return cast(__m128)(ia & ib);
}
unittest
{
    // Note: tested in emmintrin.d
}
55 
/// Computes `(~a) & b` over all 128 bits.
/// NOTE(review): Intel's `_mm_andnot_ps` takes and returns `__m128` (floats);
/// this declaration uses `__m128i` instead, so callers holding `__m128`
/// values must cast. Confirm whether the integer signature is relied upon
/// before changing it.
__m128i _mm_andnot_ps (__m128i a, __m128i b) pure @safe
{
    return (~a) & b;
}
60 
61 
62 // MMXREG: _mm_avg_pu16
63 // MMXREG: _mm_avg_pu8
64 
// Declaration of the raw CMPPS intrinsic (LDC only). The third argument is
// the comparison predicate: 0=EQ, 1=LT, 2=LE, 3=UNORD, 4=NEQ, 5=NLT, 6=NLE, 7=ORD.
version(LDC)
{
    pragma(LDC_intrinsic, "llvm.x86.sse.cmp.ps")
        __m128 __builtin_ia32_cmpps(__m128, __m128, byte) pure @safe;
}
else
{
    // unimplemented
    /*__m128 __builtin_ia32_cmpps(__m128, __m128, byte) pure @safe
    {
        assert(false, "unimplemented");
    }*/
}
78 
// SSE comparison wrappers (LDC only for now). Each maps to CMPPS/CMPSS with a
// fixed predicate byte. x86 only encodes LT/LE/NLT/NLE (not GT/GE), so the
// "reversed" variants swap the operand order to synthesize GT/GE/NGT/NGE.
// Per lane, the result is all-ones where the comparison holds, all-zeroes
// otherwise.
// NOTE(review): `__builtin_ia32_cmpss` is not declared in this file —
// presumably it comes from inteli.internals; verify.
version(LDC)
{
    __m128 _mm_cmpeq_ps (__m128 a, __m128 b) pure @safe
    {
        return __builtin_ia32_cmpps(a, b, 0);
    }

    __m128 _mm_cmpeq_ss (__m128 a, __m128 b) pure @safe
    {
        return __builtin_ia32_cmpss(a, b, 0);
    }

    __m128 _mm_cmpge_ps (__m128 a, __m128 b) pure @safe
    {
        return __builtin_ia32_cmpps(b, a, 2); // CMPLEPS reversed
    }

    __m128 _mm_cmpge_ss (__m128 a, __m128 b) pure @safe
    {
        return __builtin_ia32_cmpss(b, a, 2); // CMPLESS reversed
    }

    __m128 _mm_cmpgt_ps (__m128 a, __m128 b) pure @safe
    {
        return __builtin_ia32_cmpps(b, a, 1); // CMPLTPS reversed
    }

    __m128 _mm_cmpgt_ss (__m128 a, __m128 b) pure @safe
    {
        return __builtin_ia32_cmpss(b, a, 1); // CMPLTSS reversed
    }

    __m128 _mm_cmple_ps (__m128 a, __m128 b) pure @safe
    {
        return __builtin_ia32_cmpps(a, b, 2); // CMPLEPS
    }

    __m128 _mm_cmple_ss (__m128 a, __m128 b) pure @safe
    {
        return __builtin_ia32_cmpss(a, b, 2); // CMPLESS
    }

    __m128 _mm_cmplt_ps (__m128 a, __m128 b) pure @safe
    {
        return __builtin_ia32_cmpps(a, b, 1); // CMPLTPS
    }

    __m128 _mm_cmplt_ss (__m128 a, __m128 b) pure @safe
    {
        return __builtin_ia32_cmpss(a, b, 1); // CMPLTSS
    }

    __m128 _mm_cmpneq_ps (__m128 a, __m128 b) pure @safe
    {
        return __builtin_ia32_cmpps(a, b, 4); // CMPNEQPS
    }

    __m128 _mm_cmpneq_ss (__m128 a, __m128 b) pure @safe
    {
        return __builtin_ia32_cmpss(a, b, 4); // CMPNEQSS
    }

    __m128 _mm_cmpnge_ps (__m128 a, __m128 b) pure @safe
    {
        return __builtin_ia32_cmpps(b, a, 6); // CMPNLEPS reversed
    }

    __m128 _mm_cmpnge_ss (__m128 a, __m128 b) pure @safe
    {
        return __builtin_ia32_cmpss(b, a, 6); // CMPNLESS reversed
    }

    __m128 _mm_cmpngt_ps (__m128 a, __m128 b) pure @safe
    {
        return __builtin_ia32_cmpps(b, a, 5); // CMPNLTPS reversed
    }

    __m128 _mm_cmpngt_ss (__m128 a, __m128 b) pure @safe
    {
        return __builtin_ia32_cmpss(b, a, 5); // CMPNLTSS reversed
    }

    __m128 _mm_cmpnle_ps (__m128 a, __m128 b) pure @safe
    {
        return __builtin_ia32_cmpps(a, b, 6); // CMPNLEPS
    }

    __m128 _mm_cmpnle_ss (__m128 a, __m128 b) pure @safe
    {
        return __builtin_ia32_cmpss(a, b, 6); // CMPNLESS
    }

    __m128 _mm_cmpnlt_ps (__m128 a, __m128 b) pure @safe
    {
        return __builtin_ia32_cmpps(a, b, 5); // CMPNLTPS
    }

    __m128 _mm_cmpnlt_ss (__m128 a, __m128 b) pure @safe
    {
        return __builtin_ia32_cmpss(a, b, 5); // CMPNLTSS
    }

    __m128 _mm_cmpord_ps (__m128 a, __m128 b) pure @safe
    {
        return __builtin_ia32_cmpps(a, b, 7); // CMPORDPS
    }

    __m128 _mm_cmpord_ss (__m128 a, __m128 b) pure @safe
    {
        return __builtin_ia32_cmpss(a, b, 7); // CMPORDSS
    }

    __m128 _mm_cmpunord_ps (__m128 a, __m128 b) pure @safe
    {
        return __builtin_ia32_cmpps(a, b, 3); // CMPUNORDPS
    }

    __m128 _mm_cmpunord_ss (__m128 a, __m128 b) pure @safe
    {
        return __builtin_ia32_cmpss(a, b, 3); // CMPUNORDSS
    }
}
else
{
    // TODO
}
205 
// COMISS-based ordered scalar comparisons, returning an integer (0 or 1).
// LDC builds alias the compiler builtins; other compilers are unimplemented.
// NOTE(review): the `__builtin_ia32_comi*` names are presumably declared in
// inteli.internals — they are not declared in this file; verify.
version(LDC)
{
    alias _mm_comieq_ss = __builtin_ia32_comieq;
}
else
{
    // TODO
    /*__m128i _mm_comieq_ss(__m128, __m128) pure @safe
    {
        assert(false, "unimplemented");
    }
    */
}


version(LDC)
{
    alias _mm_comige_ss = __builtin_ia32_comige;
}
else
{
    // TODO
    /*
    __m128i _mm_comige_ss(__m128, __m128) pure @safe
    {
        assert(false, "unimplemented");
    }
    */
}


version(LDC)
{
    alias _mm_comigt_ss = __builtin_ia32_comigt;
}
else
{
    // TODO
    /*
    __m128i _mm_comigt_ss(__m128, __m128) pure @safe
    {
        assert(false, "unimplemented");
    }
    */
}


version(LDC)
{
    alias _mm_comile_ss = __builtin_ia32_comile;
}
else
{
    // TODO
    /*
    __m128i _mm_comile_ss(__m128, __m128) pure @safe
    {
        assert(false, "unimplemented");
    }
    */
}


version(LDC)
{
    alias _mm_comilt_ss = __builtin_ia32_comilt;
}
else
{
    // TODO
    /*
    __m128i _mm_comilt_ss(__m128, __m128) pure @safe
    {
        assert(false, "unimplemented");
    }
    */
}

version(LDC)
{
    alias _mm_comineq_ss = __builtin_ia32_comineq;
}
else
{
    // TODO
    /*
    __m128i _mm_comineq_ss(__m128, __m128) pure @safe
    {
        assert(false, "unimplemented");
    }
    */
}
298 
299 // MMXREG: __m128 _mm_cvt_pi2ps (__m128 a, __m64 b)
300 // MMXREG: __m64 _mm_cvt_ps2pi (__m128 a)
301 
302 
/// Converts `x` to float and writes it to the lowest lane of `v`;
/// upper lanes are unchanged.
__m128 _mm_cvt_si2ss(__m128 v, int x) pure @safe
{
    __m128 r = v;
    r[0] = cast(float)x;
    return r;
}
unittest
{
    __m128 r = _mm_cvt_si2ss(_mm_set1_ps(0.0f), 42);
    assert(r.array == [42f, 0, 0, 0]);
}
313 
// Converts the lowest float lane to int32 (CVTSS2SI), rounding according to
// the current MXCSR rounding mode. LDC only for now.
version(LDC)
{
    alias _mm_cvt_ss2si = __builtin_ia32_cvtss2si;
}
else
{
    // TODO
    /*
    int _mm_cvt_ss2si(__m128 v) pure @safe
    {
        assert(false, "unimplemented");
    }
    */
}
328 
329 // MMXREG: __m128 _mm_cvtpi16_ps (__m64 a)
330 // MMXREG: __m128 _mm_cvtpi32_ps (__m128 a, __m64 b)
331 // MMXREG: __m128 _mm_cvtpi32x2_ps (__m64 a, __m64 b)
332 // MMXREG: __m128 _mm_cvtpi8_ps (__m64 a)
333 // MMXREG: __m64 _mm_cvtps_pi16 (__m128 a)
334 // MMXREG: __m64 _mm_cvtps_pi32 (__m128 a)
335 // MMXREG: __m64 _mm_cvtps_pi8 (__m128 a)
336 // MMXREG: __m128 _mm_cvtpu16_ps (__m64 a)
337 // MMXREG: __m128 _mm_cvtpu8_ps (__m64 a)
338 
/// Converts the 32-bit integer `x` to float in the lowest lane of `v`;
/// upper lanes are unchanged.
__m128 _mm_cvtsi32_ss(__m128 v, int x) pure @safe
{
    __m128 r = v;
    r[0] = cast(float)x;
    return r;
}
unittest
{
    __m128 r = _mm_cvtsi32_ss(_mm_set1_ps(0.0f), 42);
    assert(r.array == [42.0f, 0, 0, 0]);
}
349 
// Implemented as a plain conversion: on macOS, using the
// "llvm.x86.sse.cvtsi642ss" intrinsic was buggy.
/// Converts the 64-bit integer `x` to float in the lowest lane of `v`;
/// upper lanes are unchanged.
__m128 _mm_cvtsi64_ss(__m128 v, long x) pure @safe
{
    __m128 r = v;
    r[0] = cast(float)x;
    return r;
}
unittest
{
    __m128 r = _mm_cvtsi64_ss(_mm_set1_ps(0.0f), 42);
    assert(r.array == [42.0f, 0, 0, 0]);
}
361 
/// Extracts the lowest lane of `a` as a scalar float.
float _mm_cvtss_f32(__m128 a) pure @safe
{
    float lo = a[0];
    return lo;
}
366 
// Scalar float -> integer conversions (LDC builtins only for now).
// cvtss2si* round per the current MXCSR rounding mode; cvttss2si* truncate
// toward zero (the extra 't').
version(LDC)
{
    alias _mm_cvtss_si32 = __builtin_ia32_cvtss2si;
}
else
{
    // TODO
}

version(LDC)
{
    alias _mm_cvtss_si64 = __builtin_ia32_cvtss2si64;
}
else
{
    // TODO
}

// MMXREG: __m64 _mm_cvtt_ps2pi (__m128 a)

version(LDC)
{
    alias _mm_cvtt_ss2si = __builtin_ia32_cvttss2si;
    alias _mm_cvttss_si32 = _mm_cvtt_ss2si; // it's actually the same op
}
else
{
    // TODO
}

// MMXREG: _mm_cvttps_pi32

version(LDC)
{
    alias _mm_cvttss_si64 = __builtin_ia32_cvttss2si64;
}
else
{
    // TODO
}
407 
/// Divides the four lanes of `a` by the corresponding lanes of `b` (DIVPS).
__m128 _mm_div_ps(__m128 a, __m128 b) pure @safe
{
    __m128 quotient = a / b;
    return quotient;
}
unittest
{
    __m128 x = [1.5f, -2.0f, 3.0f, 1.0f];
    x = _mm_div_ps(x, x);
    float[4] expected = [1.0f, 1.0f, 1.0f, 1.0f];
    assert(x.array == expected);
}
419 
/// Divides the lowest lane of `a` by the lowest lane of `b` (DIVSS);
/// upper lanes of `a` pass through.
__m128 _mm_div_ss(__m128 a, __m128 b) pure @safe
{
    __m128 r = a;
    r[0] = a[0] / b[0];
    return r;
}
unittest
{
    __m128 x = [1.5f, -2.0f, 3.0f, 1.0f];
    x = _mm_div_ss(x, x);
    float[4] expected = [1.0f, -2.0, 3.0f, 1.0f];
    assert(x.array == expected);
}
432 
433 // MMXREG: int _mm_extract_pi16 (__m64 a, int imm8)
434 
435 // TODO: unsigned int _MM_GET_EXCEPTION_MASK ()
436 // TODO: unsigned int _MM_GET_EXCEPTION_STATE ()
437 // TODO: unsigned int _MM_GET_FLUSH_ZERO_MODE ()
438 // TODO: unsigned int _MM_GET_ROUNDING_MODE ()
439 // TODO: stmxcsr
440 // TODO: unsigned int _mm_getcsr (void)
441 
442 // MMXREG: __m64 _mm_insert_pi16 (__m64 a, int i, int imm8)
443 
/// Loads four floats from `p`, which must be 16-byte aligned.
__m128 _mm_load_ps(const(float)*p) pure @trusted
{
    __m128* aligned = cast(__m128*)p;
    return *aligned;
}
448 
/// Loads one float from `p` and broadcasts it to all four lanes.
__m128 _mm_load_ps1(const(float)*p) pure @trusted
{
    float scalar = *p;
    float[4] f = [ scalar, scalar, scalar, scalar ];
    return loadUnaligned!(float4)(f.ptr);
}
454 
/// Loads one float into the lowest lane; the three upper lanes are zeroed.
__m128 _mm_load_ss (const(float)* mem_addr) pure @trusted
{
    float lo = *mem_addr;
    float[4] f = [ lo, 0.0f, 0.0f, 0.0f ];
    return loadUnaligned!(float4)(f.ptr);
}

/// Broadcast-load alias; identical to `_mm_load_ps1`.
alias _mm_load1_ps = _mm_load_ps1;
462 
/// Loads 64 bits from `mem_addr` into the upper half of `a`; lower half unchanged.
/// NOTE(review): assumes `__m64` (from inteli.types) is a 64-bit integer value
/// assignable to a `long2` lane — confirm against inteli.types.
__m128 _mm_loadh_pi (__m128 a, const(__m64)* mem_addr) pure @safe
{
    long2 la = cast(long2)a;
    la[1] = *mem_addr;
    return cast(__m128)la;
}

/// Loads 64 bits from `mem_addr` into the lower half of `a`; upper half unchanged.
__m128 _mm_loadl_pi (__m128 a, const(__m64)* mem_addr) pure @safe
{
    long2 la = cast(long2)a;
    la[0] = *mem_addr;
    return cast(__m128)la;
}
476 
/// Loads four floats from `mem_addr` in reversed order (highest address ends
/// up in the lowest lane).
/// NOTE(review): `mem_addr` must be 16-byte aligned (aligned dereference),
/// yet this is marked `@trusted` while the aligned stores in this file are
/// deliberately not `@safe` — inconsistent; confirm intended.
__m128 _mm_loadr_ps (const(float)* mem_addr) pure @trusted
{
    __m128* aligned = cast(__m128*)mem_addr;
    __m128 a = *aligned;
    return shufflevector!(__m128, 3, 2, 1, 0)(a, a);
}
483 
/// Loads four floats from an unaligned address.
/// Generalized to accept `const(float)*` since the memory is only read;
/// mutable `float*` arguments still convert implicitly, so existing callers
/// are unaffected (matches Intel's `_mm_loadu_ps(float const*)` signature).
__m128 _mm_loadu_ps(const(float)*p) pure @safe
{
    return loadUnaligned!(__m128)(p);
}
488 
/// Loads one 16-bit value from unaligned memory into the lowest short lane;
/// all other lanes are zero.
__m128i _mm_loadu_si16(const(void)* mem_addr)
{
    short value = *cast(short*)(mem_addr);
    short8 zeroed = [0, 0, 0, 0, 0, 0, 0, 0];
    zeroed[0] = value;
    return cast(__m128i)zeroed;
}
unittest
{
    short x = 13;
    short8 v = cast(short8) _mm_loadu_si16(&x);
    short[8] expected = [13, 0, 0, 0, 0, 0, 0, 0];
    assert(v.array == expected);
}
503 
/// Loads one 64-bit value from unaligned memory into the lowest long lane;
/// the upper lane is zero.
__m128i _mm_loadu_si64(const(void)* mem_addr)
{
    // BUGFIX: previously read through `cast(int*)`, which loaded only the
    // low 32 bits (sign-extended) instead of the full 64-bit value.
    long r = *cast(long*)(mem_addr);
    long2 result = [0, 0];
    result[0] = r;
    return cast(__m128i)result;
}
unittest
{
    long r = 0x1_0000_0001; // high bits set: would be lost by a 32-bit load
    long2 A = cast(long2) _mm_loadu_si64(&r);
    long[2] correct = [0x1_0000_0001, 0];
    assert(A.array == correct);
}
518 
519 // MMXREG: _mm_maskmove_si64
520 // MMXREG: _m_maskmovq
521 
522 // MMXREG: _mm_max_pi16
// Min/max intrinsics (LDC builtins only for now). These follow x86
// MINPS/MAXPS NaN semantics, not IEEE minNum/maxNum.
// NOTE(review): `_mm_min_ss` below has no `else` branch, unlike its
// siblings, so non-LDC builds get no symbol at all — confirm intended.
version(LDC)
{
    alias _mm_max_ps = __builtin_ia32_maxps;
}
else
{
    // TODO
}

// MMXREG: _mm_max_pu8
version(LDC)
{
    alias _mm_max_ss = __builtin_ia32_maxss;
}
else
{
    // TODO
}

// MMXREG: _mm_min_pi16
version(LDC)
{
    alias _mm_min_ps = __builtin_ia32_minps;
}
else
{
    // TODO
}

// MMXREG: _mm_min_pi8

version(LDC)
{
    alias _mm_min_ss = __builtin_ia32_minss;
}
558 
/// Returns [b0, a1, a2, a3]: shuffle index 4 selects lane 0 of the second operand.
__m128 _mm_move_ss (__m128 a, __m128 b) pure @safe
{
    return shufflevector!(__m128, 4, 1, 2, 3)(a, b);
}

/// Returns [a2, a3, b2, b3] (MOVHLPS: high halves of both operands).
__m128 _mm_movehl_ps (__m128 a, __m128 b) pure @safe
{
    return shufflevector!(float4, 2, 3, 6, 7)(a, b);
}

/// Returns [a0, a1, b0, b1] (MOVLHPS: low halves of both operands).
__m128 _mm_movelh_ps (__m128 a, __m128 b) pure @safe
{
    return shufflevector!(float4, 0, 1, 4, 5)(a, b);
}

// TODO: int _mm_movemask_pi8
// Packs the sign bit of each lane into the low 4 bits of an int (MOVMSKPS).
version(LDC)
{
    alias _mm_movemask_ps = __builtin_ia32_movmskps;
}
579 
/// Multiplies the four lanes of `a` and `b` (MULPS).
__m128 _mm_mul_ps(__m128 a, __m128 b) pure @safe
{
    __m128 product = a * b;
    return product;
}
unittest
{
    __m128 x = [1.5f, -2.0f, 3.0f, 1.0f];
    x = _mm_mul_ps(x, x);
    float[4] expected = [2.25f, 4.0f, 9.0f, 1.0f];
    assert(x.array == expected);
}
591 
/// Multiplies the lowest lanes of `a` and `b` (MULSS);
/// upper lanes of `a` pass through.
__m128 _mm_mul_ss(__m128 a, __m128 b) pure @safe
{
    __m128 r = a;
    r[0] = a[0] * b[0];
    return r;
}
unittest
{
    __m128 x = [1.5f, -2.0f, 3.0f, 1.0f];
    x = _mm_mul_ss(x, x);
    float[4] expected = [2.25f, -2.0f, 3.0f, 1.0f];
    assert(x.array == expected);
}
604 
605 // MMXREG: _mm_mulhi_pu16
606 
/// Bitwise OR of all 128 bits of `a` and `b`, reinterpreted through integers.
__m128 _mm_or_ps (__m128 a, __m128 b) pure @safe
{
    __m128i ia = cast(__m128i)a;
    __m128i ib = cast(__m128i)b;
    return cast(__m128)(ia | ib);
}
611 
612 // MMXREG: __m64 _m_pavgb (__m64 a, __m64 b)
613 // MMXREG: __m64 _m_pavgw (__m64 a, __m64 b)
614 // MMXREG: int _m_pextrw (__m64 a, int imm8)
615 // MMXREG: __m64 _m_pinsrw (__m64 a, int i, int imm8)
616 // MMXREG: __m64 _m_pmaxsw (__m64 a, __m64 b)
617 // MMXREG: __m64 _m_pmaxub (__m64 a, __m64 b)
618 // MMXREG: __m64 _m_pminsw (__m64 a, __m64 b)
619 // MMXREG: __m64 _m_pminub (__m64 a, __m64 b)
620 // MMXREG: int _m_pmovmskb (__m64 a)
621 
622 // MMXREG: __m64 _m_pmulhuw (__m64 a, __m64 b)
623 
// Prefetch locality hints, with the same numeric values as Intel's headers.
enum _MM_HINT_NTA = 0;
enum _MM_HINT_T0 = 1;
enum _MM_HINT_T1 = 2;
enum _MM_HINT_T2 = 3;

// Note: locality must be compile-time
/// Prefetches the cache line containing `p` with the given locality hint.
/// llvm_prefetch args: (address, rw=0 meaning read, locality, 1 = data cache).
void _mm_prefetch(int locality)(void* p) pure @safe
{
    llvm_prefetch(p, 0, locality, 1);
}
634 
635 // MMXREG: __m64 _m_psadbw (__m64 a, __m64 b)
636 // MMXREG: __m64 _m_pshufw (__m64 a, int imm8)
637 
// Approximate reciprocal (RCPPS/RCPSS) and reciprocal square root
// (RSQRTPS/RSQRTSS). These are low-precision hardware approximations,
// not exact 1/x or 1/sqrt(x). LDC builtins only for now.
version(LDC)
{
    alias _mm_rcp_ps = __builtin_ia32_rcpps;
}
// TODO

version(LDC)
{
    alias _mm_rcp_ss = __builtin_ia32_rcpss;
}
// TODO

version(LDC)
{
    alias _mm_rsqrt_ps = __builtin_ia32_rsqrtps;
}
// TODO

version(LDC)
{
    alias _mm_rsqrt_ss = __builtin_ia32_rsqrtss;
}
// TODO
661 
662 // TODO: _mm_sad_pu8
663 // TODO: void _MM_SET_EXCEPTION_MASK (unsigned int a)
664 // TODO: void _MM_SET_EXCEPTION_STATE (unsigned int a)
665 // TODO: void _MM_SET_FLUSH_ZERO_MODE (unsigned int a)
666 
/// Sets the four lanes; `e0` becomes the lowest lane, `e3` the highest
/// (Intel argument order).
__m128 _mm_set_ps (float e3, float e2, float e1, float e0) pure @trusted
{
    float[4] lanes = [e0, e1, e2, e3];
    return loadUnaligned!(float4)(lanes.ptr);
}

/// Broadcast alias; identical to `_mm_set1_ps`.
alias _mm_set_ps1 = _mm_set1_ps;

// TODO: _MM_SET_ROUNDING_MODE
676 
/// Sets the lowest lane to `a` and zeroes the three upper lanes.
__m128 _mm_set_ss (float a) pure @trusted
{
    float[4] lanes = [a, 0.0f, 0.0f, 0.0f];
    return loadUnaligned!(float4)(lanes.ptr);
}
682 
/// Broadcasts `a` to all four lanes.
__m128 _mm_set1_ps (float a) pure @trusted
{
    float[4] lanes = [a, a, a, a];
    return loadUnaligned!(float4)(lanes.ptr);
}
688 
689 // TODO: _mm_setcsr
690 
/// Sets lanes in memory order: the FIRST argument becomes the lowest lane
/// (reverse of `_mm_set_ps`).
__m128 _mm_setr_ps (float e3, float e2, float e1, float e0) pure @trusted
{
    float[4] lanes = [e3, e2, e1, e0];
    return loadUnaligned!(float4)(lanes.ptr);
}
696 
/// Returns a vector with all four lanes set to zero.
__m128 _mm_setzero_ps() pure @trusted
{
    float[4] zeros = [0.0f, 0.0f, 0.0f, 0.0f];
    return loadUnaligned!(float4)(zeros.ptr);
}
702 
// Store fence (SFENCE): orders earlier stores before later ones. LDC only.
version(LDC)
{
    alias _mm_sfence = __builtin_ia32_sfence;
}
// TODO

// MMXREG: mm_shuffle_pi16

// Note: the immediate shuffle value is given at compile-time instead of runtime.
/// Shuffles lanes per the SHUFPS immediate encoding: the two low result
/// lanes come from `a` (bits 0-1 and 2-3 of `imm`), the two high result
/// lanes from `b` (bits 4-5 and 6-7; offset by 4 to index the second operand).
__m128 _mm_shuffle_ps(ubyte imm)(__m128 a, __m128 b) pure @safe
{
    return shufflevector!(__m128, imm & 3, (imm>>2) & 3, 4 + ((imm>>4) & 3), 4 + ((imm>>6) & 3) )(a, b);
}
716 
/// Lane-wise square root (SQRTPS).
version(LDC)
{
    // Disappeared with LDC 1.11
    static if (__VERSION__ < 2081)
        alias _mm_sqrt_ps = __builtin_ia32_sqrtps;
    else
    {
        // Fallback for newer LDC: per-lane llvm_sqrt on each element.
        __m128 _mm_sqrt_ps(__m128 vec) pure @safe
        {
            vec.array[0] = llvm_sqrt(vec.array[0]);
            vec.array[1] = llvm_sqrt(vec.array[1]);
            vec.array[2] = llvm_sqrt(vec.array[2]);
            vec.array[3] = llvm_sqrt(vec.array[3]);
            return vec;
        }
    }
}
else
{
    // Non-LDC fallback using the standard library's scalar sqrt.
    __m128 _mm_sqrt_ps(__m128 vec) pure @safe
    {
        import std.math: sqrt;
        vec.array[0] = sqrt(vec.array[0]);
        vec.array[1] = sqrt(vec.array[1]);
        vec.array[2] = sqrt(vec.array[2]);
        vec.array[3] = sqrt(vec.array[3]);
        return vec;
    }
}
unittest
{
    __m128 A = _mm_sqrt_ps(_mm_set1_ps(4.0f));
    assert(A.array[0] == 2.0f);
    assert(A.array[1] == 2.0f);
    assert(A.array[2] == 2.0f);
    assert(A.array[3] == 2.0f);
}
754 
/// Square root of the lowest lane only (SQRTSS); upper lanes pass through.
version(LDC)
{
    // Disappeared with LDC 1.11
    static if (__VERSION__ < 2081)
        alias _mm_sqrt_ss = __builtin_ia32_sqrtss;
    else
    {
        __m128 _mm_sqrt_ss(__m128 vec) pure @safe
        {
            vec.array[0] = llvm_sqrt(vec.array[0]);
            // The three self-assignments below are no-ops kept for symmetry;
            // lanes 1-3 are already unchanged.
            vec.array[1] = vec.array[1];
            vec.array[2] = vec.array[2];
            vec.array[3] = vec.array[3];
            return vec;
        }
    }
}
else
{
    // Non-LDC fallback using the standard library's scalar sqrt.
    __m128 _mm_sqrt_ss(__m128 vec) pure @safe
    {
        import std.math: sqrt;
        vec.array[0] = sqrt(vec.array[0]);
        return vec;
    }
}
unittest
{
    __m128 A = _mm_sqrt_ss(_mm_set1_ps(4.0f));
    assert(A.array[0] == 2.0f);
    assert(A.array[1] == 4.0f);
    assert(A.array[2] == 4.0f);
    assert(A.array[3] == 4.0f);
}
789 
/// Stores `a` to `mem_addr`, which must be 16-byte aligned.
void _mm_store_ps (float* mem_addr, __m128 a) pure // not safe since nothing guarantees alignment
{
    __m128* dest = cast(__m128*)mem_addr;
    *dest = a;
}

/// Broadcast-store alias; identical to `_mm_store1_ps`.
alias _mm_store_ps1 = _mm_store1_ps;
797 
/// Stores the lowest lane of `a` to `mem_addr`.
void _mm_store_ss (float* mem_addr, __m128 a) pure @safe
{
    float lo = a[0];
    *mem_addr = lo;
}
802 
/// Broadcasts the lowest lane of `a` to all four slots at `mem_addr`
/// (must be 16-byte aligned).
void _mm_store1_ps (float* mem_addr, __m128 a) pure // not safe since nothing guarantees alignment
{
    __m128* aligned = cast(__m128*)mem_addr;
    *aligned = shufflevector!(__m128, 0, 0, 0, 0)(a, a);
}

/// Stores the upper 64 bits of `a` to `*p` (MOVHPS store form).
/// NOTE(review): passes a `__m128` where `extractelement!(long2, ...)`
/// is instantiated with `long2` — presumably the helper casts internally;
/// confirm against inteli.internals.
void _mm_storeh_pi(__m64* p, __m128 a) pure @safe
{
    *p = extractelement!(long2, 1)(a);
}

/// Stores the lower 64 bits of `a` to `*p` (MOVLPS store form).
void _mm_storel_pi(__m64* p, __m128 a) pure @safe
{
    *p = extractelement!(long2, 0)(a);
}

/// Stores the lanes of `a` in reversed order to `mem_addr`
/// (must be 16-byte aligned).
void _mm_storer_ps(float* mem_addr, __m128 a) pure // not safe since nothing guarantees alignment
{
    __m128* aligned = cast(__m128*)mem_addr;
    *aligned = shufflevector!(__m128, 3, 2, 1, 0)(a, a);
}
824 
/// Stores `a` to `mem_addr` with no alignment requirement.
void _mm_storeu_ps(float* mem_addr, __m128 a) pure @safe
{
    storeUnaligned!(float4)(a, mem_addr);
}
829 
830 // TODO: _mm_stream_pi, does not seem possible
831 // TODO: _mm_stream_ps, does not seem possible
832 
833 
/// Subtracts the four lanes of `b` from `a` (SUBPS).
__m128 _mm_sub_ps(__m128 a, __m128 b) pure @safe
{
    __m128 difference = a - b;
    return difference;
}
unittest
{
    __m128 x = [1.5f, -2.0f, 3.0f, 1.0f];
    x = _mm_sub_ps(x, x);
    float[4] expected = [0.0f, 0.0f, 0.0f, 0.0f];
    assert(x.array == expected);
}
845 
/// Subtracts the lowest lane of `b` from `a` (SUBSS);
/// upper lanes of `a` pass through.
__m128 _mm_sub_ss(__m128 a, __m128 b) pure @safe
{
    __m128 r = a;
    r[0] = a[0] - b[0];
    return r;
}
unittest
{
    __m128 x = [1.5f, -2.0f, 3.0f, 1.0f];
    x = _mm_sub_ss(x, x);
    float[4] expected = [0.0f, -2.0, 3.0f, 1.0f];
    assert(x.array == expected);
}
858 
859 
/// Transposes the 4x4 matrix held in `row0..row3`, in place.
void _MM_TRANSPOSE4_PS (ref __m128 row0, ref __m128 row1, ref __m128 row2, ref __m128 row3) pure @safe
{
    // Interleave row pairs, then recombine low/high halves.
    __m128 lo01 = _mm_unpacklo_ps(row0, row1);
    __m128 hi01 = _mm_unpackhi_ps(row0, row1);
    __m128 lo23 = _mm_unpacklo_ps(row2, row3);
    __m128 hi23 = _mm_unpackhi_ps(row2, row3);
    row0 = _mm_movelh_ps(lo01, lo23);
    row1 = _mm_movehl_ps(lo23, lo01);
    row2 = _mm_movelh_ps(hi01, hi23);
    row3 = _mm_movehl_ps(hi23, hi01);
}
872 
// UCOMISS-based unordered scalar comparisons returning an integer.
// Unlike the COMI family these do not signal on quiet NaN operands.
// NOTE(review): the `__builtin_ia32_ucomi*` names are presumably declared in
// inteli.internals; only LDC is covered, the TODO markers track the rest.
version(LDC)
{
    alias _mm_ucomieq_ss = __builtin_ia32_ucomieq;
}
// TODO

version(LDC)
{
    alias _mm_ucomige_ss = __builtin_ia32_ucomige;
}
// TODO

version(LDC)
{
    alias _mm_ucomigt_ss = __builtin_ia32_ucomigt;
}
// TODO

version(LDC)
{
    alias _mm_ucomile_ss = __builtin_ia32_ucomile;
}
// TODO

version(LDC)
{
    alias _mm_ucomilt_ss = __builtin_ia32_ucomilt;
}
// TODO

version(LDC)
{
    alias _mm_ucomineq_ss = __builtin_ia32_ucomineq;
}
// TODO
908 
909 
/// Returns a vector with unspecified contents (void-initialized).
__m128 _mm_undefined_ps() pure @safe
{
    __m128 result = void;
    return result;
}
915 
/// Interleaves the upper halves of `a` and `b`: result is [a2, b2, a3, b3].
__m128 _mm_unpackhi_ps (__m128 a, __m128 b) pure @safe
{
    return shufflevector!(float4, 2, 6, 3, 7)(a, b);
}

/// Interleaves the lower halves of `a` and `b`: result is [a0, b0, a1, b1].
__m128 _mm_unpacklo_ps (__m128 a, __m128 b) pure @safe
{
    return shufflevector!(float4, 0, 4, 1, 5)(a, b);
}
925 
/// Bitwise XOR of all 128 bits of `a` and `b`.
/// NOTE(review): Intel's `_mm_xor_ps` takes and returns `__m128` (floats);
/// this declaration uses `__m128i` — same inconsistency as `_mm_andnot_ps`
/// above; confirm callers before changing the signature.
__m128i _mm_xor_ps (__m128i a, __m128i b) pure @safe
{
    return a ^ b;
}