1 /**
2 * Copyright: Copyright Auburn Sounds 2016-2019.
3 * License:   $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
4 * Authors:   Guillaume Piolat
5 */
6 module inteli.xmmintrin;
7 
8 public import inteli.types;
9 
10 import inteli.internals;
11 
12 import inteli.mmx;
13 
14 import core.stdc.stdlib: malloc, free;
15 import core.exception: onOutOfMemoryError;
16 
// Define the `InlineX86Asm` version identifier whenever the compiler offers
// inline assembly for 32-bit or 64-bit x86. `_mm_getcsr` and `_mm_setcsr`
// below are only implemented when it is set.
version(D_InlineAsm_X86)
    version = InlineX86Asm;
else version(D_InlineAsm_X86_64)
    version = InlineX86Asm;
21 
22 
23 // SSE1
24 
25 nothrow @nogc:
26 
27 
// Bit masks applied to the value returned by `_mm_getcsr` / passed to `_mm_setcsr`.

/// Exception state bits, read and written by `_MM_GET_EXCEPTION_STATE` / `_MM_SET_EXCEPTION_STATE`.
enum : int
{
    _MM_EXCEPT_INVALID    = 0x0001,
    _MM_EXCEPT_DENORM     = 0x0002,
    _MM_EXCEPT_DIV_ZERO   = 0x0004,
    _MM_EXCEPT_OVERFLOW   = 0x0008,
    _MM_EXCEPT_UNDERFLOW  = 0x0010,
    _MM_EXCEPT_INEXACT    = 0x0020,
    _MM_EXCEPT_MASK       = 0x003f, // union of all the exception state bits above
}

/// Exception mask bits, read and written by `_MM_GET_EXCEPTION_MASK` / `_MM_SET_EXCEPTION_MASK`.
enum : int
{
    _MM_MASK_INVALID      = 0x0080,
    _MM_MASK_DENORM       = 0x0100,
    _MM_MASK_DIV_ZERO     = 0x0200,
    _MM_MASK_OVERFLOW     = 0x0400,
    _MM_MASK_UNDERFLOW    = 0x0800,
    _MM_MASK_INEXACT      = 0x1000,
    _MM_MASK_MASK         = 0x1f80, // union of all the exception mask bits above
}

/// Rounding mode field, read and written by `_MM_GET_ROUNDING_MODE` / `_MM_SET_ROUNDING_MODE`.
enum : int
{
    _MM_ROUND_NEAREST     = 0x0000,
    _MM_ROUND_DOWN        = 0x2000,
    _MM_ROUND_UP          = 0x4000,
    _MM_ROUND_TOWARD_ZERO = 0x6000,
    _MM_ROUND_MASK        = 0x6000, // both bits of the rounding field
}

/// Flush-to-zero bit, read and written by `_MM_GET_FLUSH_ZERO_MODE` / `_MM_SET_FLUSH_ZERO_MODE`.
enum : int
{
    _MM_FLUSH_ZERO_MASK   = 0x8000,
    _MM_FLUSH_ZERO_ON     = 0x8000,
    _MM_FLUSH_ZERO_OFF    = 0x0000,
}
53 
/// Returns the vector sum `a + b`.
__m128 _mm_add_ps(__m128 a, __m128 b) pure @safe
{
    __m128 sum = a + b;
    return sum;
}

unittest
{
    __m128 v = [1, 2, 3, 4];
    v = _mm_add_ps(v, v);
    float[4] expected = [2.0f, 4.0f, 6.0f, 8.0f];
    foreach (i; 0..4)
        assert(v.array[i] == expected[i]);
}
68 
/// Adds element 0 of `b` to element 0 of `a`; the other elements of `a`
/// are returned unchanged.
__m128 _mm_add_ss(__m128 a, __m128 b) pure @safe
{
    const float lowSum = a[0] + b[0];
    a[0] = lowSum;
    return a;
}
unittest
{
    __m128 v = [1, 2, 3, 4];
    v = _mm_add_ss(v, v);
    float[4] expected = [2.0f, 2.0f, 3.0f, 4.0f];
    assert(v.array == expected);
}
80 
/// Bitwise AND of `a` and `b`, computed on their `__m128i` reinterpretation.
__m128 _mm_and_ps (__m128 a, __m128 b) pure @safe
{
    __m128i bitsA = cast(__m128i)a;
    __m128i bitsB = cast(__m128i)b;
    return cast(__m128)(bitsA & bitsB);
}
unittest
{
    // Intentionally empty: the test lives in emmintrin.d
}
89 
/// Bitwise AND-NOT: returns `(~a) & b`, computed on the integer
/// reinterpretation of both operands.
///
/// This is the overload with the `__m128` parameter and return types that the
/// Intel API documents for `_mm_andnot_ps`.
__m128 _mm_andnot_ps (__m128 a, __m128 b) pure @safe
{
    return cast(__m128)( (~cast(__m128i)a) & cast(__m128i)b );
}

/// Integer-vector overload of `_mm_andnot_ps`, returning `(~a) & b`.
/// Kept so that existing callers which pass `__m128i` values keep compiling.
__m128i _mm_andnot_ps (__m128i a, __m128i b) pure @safe
{
    return (~a) & b;
}
94 
95 
96 // TODO: _mm_avg_pu16
97 // TODO: _mm_avg_pu8
98 
/// Equality comparison: calls `cmpps` with `FPComparison.oeq` and returns
/// the result reinterpreted as `__m128`.
__m128 _mm_cmpeq_ps (__m128 a, __m128 b) pure @safe
{
    auto mask = cmpps!(FPComparison.oeq)(a, b);
    return cast(__m128) mask;
}
103 
/// Equality comparison, scalar form: calls `cmpss` with `FPComparison.oeq`
/// and returns the result reinterpreted as `__m128`.
__m128 _mm_cmpeq_ss (__m128 a, __m128 b) pure @safe
{
    auto mask = cmpss!(FPComparison.oeq)(a, b);
    return cast(__m128) mask;
}
108 
/// Greater-or-equal comparison: calls `cmpps` with `FPComparison.oge` and
/// returns the result reinterpreted as `__m128`.
__m128 _mm_cmpge_ps (__m128 a, __m128 b) pure @safe
{
    auto mask = cmpps!(FPComparison.oge)(a, b);
    return cast(__m128) mask;
}
113 
/// Greater-or-equal comparison, scalar form: calls `cmpss` with
/// `FPComparison.oge` and returns the result reinterpreted as `__m128`.
__m128 _mm_cmpge_ss (__m128 a, __m128 b) pure @safe
{
    auto mask = cmpss!(FPComparison.oge)(a, b);
    return cast(__m128) mask;
}
118 
/// Greater-than comparison: calls `cmpps` with `FPComparison.ogt` and
/// returns the result reinterpreted as `__m128`.
__m128 _mm_cmpgt_ps (__m128 a, __m128 b) pure @safe
{
    auto mask = cmpps!(FPComparison.ogt)(a, b);
    return cast(__m128) mask;
}
123 
/// Greater-than comparison, scalar form: calls `cmpss` with
/// `FPComparison.ogt` and returns the result reinterpreted as `__m128`.
__m128 _mm_cmpgt_ss (__m128 a, __m128 b) pure @safe
{
    auto mask = cmpss!(FPComparison.ogt)(a, b);
    return cast(__m128) mask;
}
128 
/// Less-or-equal comparison: calls `cmpps` with `FPComparison.ole` and
/// returns the result reinterpreted as `__m128`.
__m128 _mm_cmple_ps (__m128 a, __m128 b) pure @safe
{
    auto mask = cmpps!(FPComparison.ole)(a, b);
    return cast(__m128) mask;
}
133 
/// Less-or-equal comparison, scalar form: calls `cmpss` with
/// `FPComparison.ole` and returns the result reinterpreted as `__m128`.
__m128 _mm_cmple_ss (__m128 a, __m128 b) pure @safe
{
    auto mask = cmpss!(FPComparison.ole)(a, b);
    return cast(__m128) mask;
}
138 
/// Less-than comparison: calls `cmpps` with `FPComparison.olt` and returns
/// the result reinterpreted as `__m128`.
__m128 _mm_cmplt_ps (__m128 a, __m128 b) pure @safe
{
    auto mask = cmpps!(FPComparison.olt)(a, b);
    return cast(__m128) mask;
}
143 
/// Less-than comparison, scalar form: calls `cmpss` with `FPComparison.olt`
/// and returns the result reinterpreted as `__m128`.
__m128 _mm_cmplt_ss (__m128 a, __m128 b) pure @safe
{
    auto mask = cmpss!(FPComparison.olt)(a, b);
    return cast(__m128) mask;
}
148 
/// Not-equal comparison: calls `cmpps` with `FPComparison.une` and returns
/// the result reinterpreted as `__m128`.
__m128 _mm_cmpneq_ps (__m128 a, __m128 b) pure @safe
{
    auto mask = cmpps!(FPComparison.une)(a, b);
    return cast(__m128) mask;
}
153 
/// Not-equal comparison, scalar form: calls `cmpss` with `FPComparison.une`
/// and returns the result reinterpreted as `__m128`.
__m128 _mm_cmpneq_ss (__m128 a, __m128 b) pure @safe
{
    auto mask = cmpss!(FPComparison.une)(a, b);
    return cast(__m128) mask;
}
158 
/// Not-greater-or-equal comparison: calls `cmpps` with `FPComparison.ult`
/// and returns the result reinterpreted as `__m128`.
__m128 _mm_cmpnge_ps (__m128 a, __m128 b) pure @safe
{
    auto mask = cmpps!(FPComparison.ult)(a, b);
    return cast(__m128) mask;
}
163 
/// Not-greater-or-equal comparison, scalar form: calls `cmpss` with
/// `FPComparison.ult` and returns the result reinterpreted as `__m128`.
__m128 _mm_cmpnge_ss (__m128 a, __m128 b) pure @safe
{
    auto mask = cmpss!(FPComparison.ult)(a, b);
    return cast(__m128) mask;
}
168 
/// Not-greater-than comparison: calls `cmpps` with `FPComparison.ule` and
/// returns the result reinterpreted as `__m128`.
__m128 _mm_cmpngt_ps (__m128 a, __m128 b) pure @safe
{
    auto mask = cmpps!(FPComparison.ule)(a, b);
    return cast(__m128) mask;
}
173 
/// Not-greater-than comparison, scalar form: calls `cmpss` with
/// `FPComparison.ule` and returns the result reinterpreted as `__m128`.
__m128 _mm_cmpngt_ss (__m128 a, __m128 b) pure @safe
{
    auto mask = cmpss!(FPComparison.ule)(a, b);
    return cast(__m128) mask;
}
178 
/// Not-less-or-equal comparison: calls `cmpps` with `FPComparison.ugt` and
/// returns the result reinterpreted as `__m128`.
__m128 _mm_cmpnle_ps (__m128 a, __m128 b) pure @safe
{
    auto mask = cmpps!(FPComparison.ugt)(a, b);
    return cast(__m128) mask;
}
183 
/// Not-less-or-equal comparison, scalar form: calls `cmpss` with
/// `FPComparison.ugt` and returns the result reinterpreted as `__m128`.
__m128 _mm_cmpnle_ss (__m128 a, __m128 b) pure @safe
{
    auto mask = cmpss!(FPComparison.ugt)(a, b);
    return cast(__m128) mask;
}
188 
/// Not-less-than comparison: calls `cmpps` with `FPComparison.uge` and
/// returns the result reinterpreted as `__m128`.
__m128 _mm_cmpnlt_ps (__m128 a, __m128 b) pure @safe
{
    auto mask = cmpps!(FPComparison.uge)(a, b);
    return cast(__m128) mask;
}
193 
/// Not-less-than comparison, scalar form: calls `cmpss` with
/// `FPComparison.uge` and returns the result reinterpreted as `__m128`.
__m128 _mm_cmpnlt_ss (__m128 a, __m128 b) pure @safe
{
    auto mask = cmpss!(FPComparison.uge)(a, b);
    return cast(__m128) mask;
}
198 
/// Ordered test: calls `cmpps` with `FPComparison.ord` and returns the
/// result reinterpreted as `__m128`.
__m128 _mm_cmpord_ps (__m128 a, __m128 b) pure @safe
{
    auto mask = cmpps!(FPComparison.ord)(a, b);
    return cast(__m128) mask;
}
203 
/// Ordered test, scalar form: calls `cmpss` with `FPComparison.ord` and
/// returns the result reinterpreted as `__m128`.
__m128 _mm_cmpord_ss (__m128 a, __m128 b) pure @safe
{
    auto mask = cmpss!(FPComparison.ord)(a, b);
    return cast(__m128) mask;
}
208 
/// Unordered test: calls `cmpps` with `FPComparison.uno` and returns the
/// result reinterpreted as `__m128`.
__m128 _mm_cmpunord_ps (__m128 a, __m128 b) pure @safe
{
    auto mask = cmpps!(FPComparison.uno)(a, b);
    return cast(__m128) mask;
}
213 
/// Unordered test, scalar form: calls `cmpss` with `FPComparison.uno` and
/// returns the result reinterpreted as `__m128`.
__m128 _mm_cmpunord_ss (__m128 a, __m128 b) pure @safe
{
    auto mask = cmpss!(FPComparison.uno)(a, b);
    return cast(__m128) mask;
}
218 
// Note: we've reverted clang and GCC behaviour with regard to EFLAGS.
// Some of these comparisons yield true for NaNs, others don't.
221 
/// Scalar equality test returning an `int`, evaluated by `comss` with the
/// `FPComparison.ueq` predicate. Yields true when an operand is NaN.
int _mm_comieq_ss (__m128 a, __m128 b) pure @safe // comiss + sete
{
    int outcome = comss!(FPComparison.ueq)(a, b); // yields true for NaN!
    return outcome;
}
226 
/// Scalar greater-or-equal test returning an `int`, evaluated by `comss`
/// with the `FPComparison.oge` predicate.
int _mm_comige_ss (__m128 a, __m128 b) pure @safe // comiss + setae
{
    int outcome = comss!(FPComparison.oge)(a, b);
    return outcome;
}
231 
/// Scalar greater-than test returning an `int`, evaluated by `comss` with
/// the `FPComparison.ogt` predicate.
int _mm_comigt_ss (__m128 a, __m128 b) pure @safe // comiss + seta
{
    int outcome = comss!(FPComparison.ogt)(a, b);
    return outcome;
}
236 
/// Scalar less-or-equal test returning an `int`, evaluated by `comss` with
/// the `FPComparison.ule` predicate. Yields true when an operand is NaN.
int _mm_comile_ss (__m128 a, __m128 b) pure @safe // comiss + setbe
{
    int outcome = comss!(FPComparison.ule)(a, b); // yields true for NaN!
    return outcome;
}
241 
/// Scalar less-than test returning an `int`, evaluated by `comss` with the
/// `FPComparison.ult` predicate. Yields true when an operand is NaN.
int _mm_comilt_ss (__m128 a, __m128 b) pure @safe // comiss + setb
{
    int outcome = comss!(FPComparison.ult)(a, b); // yields true for NaN!
    return outcome;
}
246 
/// Scalar not-equal test returning an `int`, evaluated by `comss` with the
/// `FPComparison.one` predicate.
int _mm_comineq_ss (__m128 a, __m128 b) pure @safe // comiss + setne
{
    int outcome = comss!(FPComparison.one)(a, b);
    return outcome;
}
251 
252 
253 // TODO: __m128 _mm_cvt_pi2ps (__m128 a, __m64 b)
254 // TODO: __m64 _mm_cvt_ps2pi (__m128 a)
255 
256 
/// Converts `x` to `float` and stores it in element 0 of `v`; the other
/// elements of `v` are returned unchanged.
__m128 _mm_cvt_si2ss(__m128 v, int x) pure @safe
{
    const float converted = cast(float)x;
    v[0] = converted;
    return v;
}
unittest
{
    __m128 r = _mm_cvt_si2ss(_mm_set1_ps(0.0f), 42);
    float[4] expected = [42.0f, 0.0f, 0.0f, 0.0f];
    assert(r.array == expected);
}
267 
// Note: `_mm_cvt_ss2si` is just another name for `_mm_cvtss_si32`,
//       so both resolve to the same implementation.
alias _mm_cvt_ss2si = _mm_cvtss_si32;
270 
271 
272 // TODO: __m128 _mm_cvtpi16_ps (__m64 a)
273 // TODO: __m128 _mm_cvtpi32_ps (__m128 a, __m64 b)
274 // TODO: __m128 _mm_cvtpi32x2_ps (__m64 a, __m64 b)
275 // TODO: __m128 _mm_cvtpi8_ps (__m64 a)
276 // TODO: __m64 _mm_cvtps_pi16 (__m128 a)
277 // TODO: __m64 _mm_cvtps_pi32 (__m128 a)
278 // TODO: __m64 _mm_cvtps_pi8 (__m128 a)
279 // TODO: __m128 _mm_cvtpu16_ps (__m64 a)
280 // TODO: __m128 _mm_cvtpu8_ps (__m64 a)
281 
/// Converts the 32-bit integer `x` to `float` and stores it in element 0 of
/// `v`; the other elements of `v` are returned unchanged.
__m128 _mm_cvtsi32_ss(__m128 v, int x) pure @safe
{
    const float converted = cast(float)x;
    v[0] = converted;
    return v;
}
unittest
{
    __m128 r = _mm_cvtsi32_ss(_mm_set1_ps(0.0f), 42);
    float[4] expected = [42.0f, 0.0f, 0.0f, 0.0f];
    assert(r.array == expected);
}
292 
293 // Note: on macOS, using "llvm.x86.sse.cvtsi642ss" was buggy
/// Converts the 64-bit integer `x` to `float` and stores it in element 0 of
/// `v`; the other elements of `v` are returned unchanged.
/// Implemented with a plain cast rather than an LLVM intrinsic.
__m128 _mm_cvtsi64_ss(__m128 v, long x) pure @safe
{
    const float converted = cast(float)x;
    v[0] = converted;
    return v;
}
unittest
{
    __m128 r = _mm_cvtsi64_ss(_mm_set1_ps(0.0f), 42);
    float[4] expected = [42.0f, 0.0f, 0.0f, 0.0f];
    assert(r.array == expected);
}
304 
/// Returns element 0 of `a` as a scalar `float`.
float _mm_cvtss_f32(__m128 a) pure @safe
{
    const float lowest = a[0];
    return lowest;
}
309 
version(LDC)
{
    // With LDC the name is bound directly to the compiler builtin.
    alias _mm_cvtss_si32 = __builtin_ia32_cvtss2si;
}
else
{
    /// Converts element 0 of `a` to a 32-bit integer through
    /// `convertFloatToInt32UsingMXCSR`.
    int _mm_cvtss_si32 (__m128 a) pure @safe
    {
        const float lowest = a[0];
        return convertFloatToInt32UsingMXCSR(lowest);
    }
}
unittest
{
    __m128 input = _mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f);
    assert(_mm_cvtss_si32(input) == 1);
}
325 
version(LDC)
{
    version(X86_64)
    {
        // 64-bit LDC: bind directly to the compiler builtin.
        alias _mm_cvtss_si64 = __builtin_ia32_cvtss2si64;
    }
    else
    {
        // Note: __builtin_ia32_cvtss2si64 crashes LDC in 32-bit
        /// Converts element 0 of `a` to a 64-bit integer through
        /// `convertFloatToInt64UsingMXCSR`.
        long _mm_cvtss_si64 (__m128 a) pure @safe
        {
            const float lowest = a[0];
            return convertFloatToInt64UsingMXCSR(lowest);
        }
    }
}
else
{
    /// Converts element 0 of `a` to a 64-bit integer through
    /// `convertFloatToInt64UsingMXCSR`.
    long _mm_cvtss_si64 (__m128 a) pure @safe
    {
        const float lowest = a[0];
        return convertFloatToInt64UsingMXCSR(lowest);
    }
}
unittest
{
    assert(_mm_cvtss_si64(_mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f)) == 1);

    // Remember the current rounding mode so that it can be put back at the end.
    uint previousMode = _MM_GET_ROUNDING_MODE();

    _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST);
    assert(_mm_cvtss_si64(_mm_set1_ps(-86186.5f)) == -86186);

    _MM_SET_ROUNDING_MODE(_MM_ROUND_DOWN);
    assert(_mm_cvtss_si64(_mm_set1_ps(-86186.1f)) == -86187);

    _MM_SET_ROUNDING_MODE(_MM_ROUND_UP);
    assert(_mm_cvtss_si64(_mm_set1_ps(86186.1f)) == 86187);

    _MM_SET_ROUNDING_MODE(_MM_ROUND_TOWARD_ZERO);
    assert(_mm_cvtss_si64(_mm_set1_ps(-86186.9f)) == -86186);

    _MM_SET_ROUNDING_MODE(previousMode);
}
366 
367 
version(LDC)
{
    // With LDC the name is bound directly to the compiler builtin.
    alias _mm_cvtt_ss2si = __builtin_ia32_cvttss2si;
}
else
{
    /// Converts element 0 of `a` to a 32-bit integer with a plain D cast.
    int _mm_cvtt_ss2si (__m128 a) pure @safe
    {
        const float lowest = a[0];
        return cast(int) lowest;
    }
}
unittest
{
    __m128 input = _mm_setr_ps(1.9f, 2.0f, 3.0f, 4.0f);
    assert(_mm_cvtt_ss2si(input) == 1);
}
383 
384 // TODO: __m64 _mm_cvtt_ps2pi (__m128 a)
385 
/// `_mm_cvttss_si32` is a second name for `_mm_cvtt_ss2si`.
alias _mm_cvttss_si32 = _mm_cvtt_ss2si; // it's actually the same op
387 
388 // Note: __builtin_ia32_cvttss2si64 crashes LDC when generating 32-bit x86 code.
/// Converts element 0 of `a` to a 64-bit integer with a plain D cast.
long _mm_cvttss_si64 (__m128 a) pure @safe
{
    const float lowest = a[0];
    return cast(long) lowest; // Generates cvttss2si as expected
}
unittest
{
    __m128 input = _mm_setr_ps(1.9f, 2.0f, 3.0f, 4.0f);
    assert(_mm_cvttss_si64(input) == 1);
}
397 
/// Returns the vector quotient `a / b`.
__m128 _mm_div_ps(__m128 a, __m128 b) pure @safe
{
    __m128 quotient = a / b;
    return quotient;
}
unittest
{
    __m128 v = [1.5f, -2.0f, 3.0f, 1.0f];
    v = _mm_div_ps(v, v);
    float[4] expected = [1.0f, 1.0f, 1.0f, 1.0f];
    assert(v.array == expected);
}
409 
/// Divides element 0 of `a` by element 0 of `b`; the other elements of `a`
/// are returned unchanged.
__m128 _mm_div_ss(__m128 a, __m128 b) pure @safe
{
    const float lowQuotient = a[0] / b[0];
    a[0] = lowQuotient;
    return a;
}
unittest
{
    __m128 v = [1.5f, -2.0f, 3.0f, 1.0f];
    v = _mm_div_ss(v, v);
    float[4] expected = [1.0f, -2.0, 3.0f, 1.0f];
    assert(v.array == expected);
}
422 
423 // TODO: int _mm_extract_pi16 (__m64 a, int imm8)
424 
/// Release a chunk of aligned memory previously obtained from `_mm_malloc`.
/// Passing `null` is allowed and does nothing.
void _mm_free(void * mem_addr) @trusted
{
    if (mem_addr !is null)
    {
        // Layout written by `storeRawPointerPlusInfo` just before the user
        // pointer: [alignment][size][raw malloc pointer]. Size and alignment
        // are not strictly needed here; they are stored in case `_mm_realloc`
        // has to be implemented.
        enum size_t slot = (void*).sizeof;
        char* user = cast(char*)mem_addr;

        size_t storedAlignment = *cast(size_t*)(user - 3 * slot);
        assert(storedAlignment != 0);
        assert(isPointerAligned(mem_addr, storedAlignment));

        // The pointer originally returned by malloc sits right below the user pointer.
        void* raw = *cast(void**)(user - size_t.sizeof);
        free(raw);
    }
}
443 
/// Returns the bits of `_mm_getcsr()` selected by `_MM_MASK_MASK`.
uint _MM_GET_EXCEPTION_MASK() pure @safe
{
    const uint csr = _mm_getcsr();
    return csr & _MM_MASK_MASK;
}
448 
/// Returns the bits of `_mm_getcsr()` selected by `_MM_EXCEPT_MASK`.
uint _MM_GET_EXCEPTION_STATE() pure @safe
{
    const uint csr = _mm_getcsr();
    return csr & _MM_EXCEPT_MASK;
}
453 
/// Returns the bits of `_mm_getcsr()` selected by `_MM_FLUSH_ZERO_MASK`.
uint _MM_GET_FLUSH_ZERO_MODE() pure @safe
{
    const uint csr = _mm_getcsr();
    return csr & _MM_FLUSH_ZERO_MASK;
}
458 
/// Returns the bits of `_mm_getcsr()` selected by `_MM_ROUND_MASK`.
uint _MM_GET_ROUNDING_MODE() pure @safe
{
    const uint csr = _mm_getcsr();
    return csr & _MM_ROUND_MASK;
}
463 
/// Reads the control/status word with the `stmxcsr` instruction and returns it.
/// Only implemented when x86 inline assembly is available.
// NOTE(review): declared `pure` although the value comes from processor
// state that `_mm_setcsr` can change - confirm this is intended.
uint _mm_getcsr() pure @safe
{
    version (InlineX86Asm)
    {
        uint csrValue;
        asm nothrow @nogc pure @safe
        {
            stmxcsr csrValue;
        }
        return csrValue;
    }
    else
    {
        static assert(0, "Not yet supported");
    }
}
478 
479 // TODO: __m64 _mm_insert_pi16 (__m64 a, int i, int imm8)
480 
/// Loads a whole `__m128` from `p` by dereferencing it as a `__m128*`.
__m128 _mm_load_ps(const(float)*p) pure @trusted
{
    __m128* asVector = cast(__m128*)p;
    return *asVector;
}
485 
/// Reads one float at `p` and returns a vector with that value in all
/// four elements.
__m128 _mm_load_ps1(const(float)*p) pure @trusted
{
    const float value = *p;
    float[4] lanes = [ value, value, value, value ];
    return loadUnaligned!(float4)(lanes.ptr);
}
491 
/// Reads one float at `mem_addr` into element 0; the three other elements
/// of the result are zero.
__m128 _mm_load_ss (const(float)* mem_addr) pure @trusted
{
    const float value = *mem_addr;
    float[4] lanes = [ value, 0.0f, 0.0f, 0.0f ];
    return loadUnaligned!(float4)(lanes.ptr);
}
497 
/// `_mm_load1_ps` is a second name for `_mm_load_ps1`.
alias _mm_load1_ps = _mm_load_ps1;
499 
/// Replaces the upper 64 bits of `a` with the 64-bit value read from
/// `mem_addr`; the lower 64 bits of `a` are kept.
__m128 _mm_loadh_pi (__m128 a, const(__m64)* mem_addr) pure @safe
{
    long2 halves = cast(long2)a;
    const incoming = (*mem_addr)[0];
    halves[1] = incoming;
    return cast(__m128)halves;
}
506 
/// Replaces the lower 64 bits of `a` with the 64-bit value read from
/// `mem_addr`; the upper 64 bits of `a` are kept.
__m128 _mm_loadl_pi (__m128 a, const(__m64)* mem_addr) pure @safe
{
    long2 halves = cast(long2)a;
    const incoming = (*mem_addr)[0];
    halves[0] = incoming;
    return cast(__m128)halves;
}
513 
/// Loads a whole `__m128` from `mem_addr` (dereferenced as a `__m128*`) and
/// returns it with its four elements in reverse order.
__m128 _mm_loadr_ps (const(float)* mem_addr) pure @trusted
{
    __m128 loaded = *cast(__m128*)mem_addr;
    return shufflevector!(__m128, 3, 2, 1, 0)(loaded, loaded);
}
520 
/// Loads a `__m128` from `p` through `loadUnaligned`.
__m128 _mm_loadu_ps(const(float)*p) pure @safe
{
    __m128 loaded = loadUnaligned!(__m128)(p);
    return loaded;
}
525 
/// Reads a 16-bit integer at `mem_addr` into the first 16-bit element of
/// the result; the seven other 16-bit elements are zero.
__m128i _mm_loadu_si16(const(void)* mem_addr) pure @trusted
{
    short8 lanes = [0, 0, 0, 0, 0, 0, 0, 0];
    lanes[0] = *cast(short*)(mem_addr);
    return cast(__m128i)lanes;
}
unittest
{
    short source = 13;
    short8 loaded = cast(short8) _mm_loadu_si16(&source);
    short[8] expected = [13, 0, 0, 0, 0, 0, 0, 0];
    assert(loaded.array == expected);
}
540 
/// Reads a 64-bit integer at `mem_addr` into the first 64-bit element of
/// the result; the other 64-bit element is zero.
__m128i _mm_loadu_si64(const(void)* mem_addr) pure @trusted
{
    long2 lanes = [0, 0];
    lanes[0] = *cast(long*)(mem_addr);
    return cast(__m128i)lanes;
}
unittest
{
    long source = 446446446446;
    long2 loaded = cast(long2) _mm_loadu_si64(&source);
    long[2] expected = [446446446446, 0];
    assert(loaded.array == expected);
}
555 
/// Allocate `size` bytes of memory aligned to `alignment` bytes and return a
/// pointer to the allocated memory. `_mm_free` should be used to free memory
/// that is allocated with `_mm_malloc`.
///
/// Params:
///     size      = number of usable bytes requested.
///     alignment = required alignment; must be a non-zero power of two.
///
/// Returns: a pointer aligned to `alignment`. Allocation failure, including a
/// request so large that the padded size does not fit in `size_t`, is reported
/// through `onOutOfMemoryError`.
void* _mm_malloc(size_t size, size_t alignment) @trusted
{
    assert(alignment != 0);
    assert((alignment & (alignment - 1)) == 0); // same requirement as nextMultipleOf

    size_t request = requestedSize(size, alignment);

    // `requestedSize` adds padding and bookkeeping with wrapping arithmetic.
    // A wrapped result would make malloc return a block smaller than `size`,
    // so treat it as an allocation failure.
    if (request < size)
        onOutOfMemoryError();

    void* raw = malloc(request);
    if (request > 0 && raw is null) // malloc(0) can validly return anything
        onOutOfMemoryError();
    return storeRawPointerPlusInfo(raw, size, alignment); // PERF: no need to store size
}
568 
569 // TODO: _mm_maskmove_si64
570 // TODO: _m_maskmovq
571 
572 // TODO: _mm_max_pi16
version(LDC)
{
    // With LDC the name is bound directly to the compiler builtin.
    alias _mm_max_ps = __builtin_ia32_maxps;
}
else
{
    /// Element-wise maximum of `a` and `b`. Whenever `a[i] > b[i]` is false
    /// (which includes a NaN on either side) the element of `b` is returned.
    __m128 _mm_max_ps(__m128 a, __m128 b) pure @safe
    {
        __m128 r;
        r[0] = (b[0] < a[0]) ? a[0] : b[0];
        r[1] = (b[1] < a[1]) ? a[1] : b[1];
        r[2] = (b[2] < a[2]) ? a[2] : b[2];
        r[3] = (b[3] < a[3]) ? a[3] : b[3];
        return r;
    }
}
unittest
{
    __m128 first  = _mm_setr_ps(1, 2, float.nan, 4);
    __m128 second = _mm_setr_ps(4, 1, 4, float.nan);
    __m128 maxima = _mm_max_ps(first, second);
    assert(maxima[0] == 4);
    assert(maxima[1] == 2);
    assert(maxima[2] == 4);         // in case of NaN, second operand prevails (as it seems)
    assert(maxima[3] != maxima[3]); // in case of NaN, second operand prevails (as it seems)
}
599 
600 // TODO: _mm_max_pu8
601 
version(LDC)
{
    // With LDC the name is bound directly to the compiler builtin.
    alias _mm_max_ss = __builtin_ia32_maxss;
}
else
{
    /// Maximum of element 0 of `a` and `b`; elements 1..3 are copied from `a`.
    /// When `a[0] > b[0]` is false (which includes a NaN on either side),
    /// element 0 of `b` is returned.
    __m128 _mm_max_ss(__m128 a, __m128 b) pure @safe
    {
        __m128 r = a;
        r[0] = (a[0] > b[0]) ? a[0] : b[0];
        return r;
    }
}
unittest
{
    __m128 A = _mm_setr_ps(1, 2, 3, 4);
    __m128 B = _mm_setr_ps(4, 1, 4, 1);
    __m128 C = _mm_setr_ps(float.nan, 1, 4, 1);
    __m128 M = _mm_max_ss(A, B);
    assert(M[0] == 4);
    assert(M[1] == 2);
    assert(M[2] == 3);
    assert(M[3] == 4);
    // The NaN cases exercise the scalar function under test, not _mm_max_ps.
    M = _mm_max_ss(A, C); // in case of NaN, second operand prevails
    assert(M[0] != M[0]);
    M = _mm_max_ss(C, A); // in case of NaN, second operand prevails
    assert(M[0] == 1);
}
630 
631 // TODO: _mm_min_pi16
632 
version(LDC)
{
    // With LDC the name is bound directly to the compiler builtin.
    alias _mm_min_ps = __builtin_ia32_minps;
}
else
{
    /// Element-wise minimum of `a` and `b`. Whenever `a[i] < b[i]` is false
    /// (which includes a NaN on either side) the element of `b` is returned.
    __m128 _mm_min_ps(__m128 a, __m128 b) pure @safe
    {
        __m128 r;
        r[0] = (b[0] > a[0]) ? a[0] : b[0];
        r[1] = (b[1] > a[1]) ? a[1] : b[1];
        r[2] = (b[2] > a[2]) ? a[2] : b[2];
        r[3] = (b[3] > a[3]) ? a[3] : b[3];
        return r;
    }
}
unittest
{
    __m128 first  = _mm_setr_ps(1, 2, float.nan, 4);
    __m128 second = _mm_setr_ps(4, 1, 4, float.nan);
    __m128 minima = _mm_min_ps(first, second);
    assert(minima[0] == 1);
    assert(minima[1] == 1);
    assert(minima[2] == 4);         // in case of NaN, second operand prevails (as it seems)
    assert(minima[3] != minima[3]); // in case of NaN, second operand prevails (as it seems)
}
659 
// TODO: _mm_min_pu8
661 
version(LDC)
{
    // With LDC the name is bound directly to the compiler builtin.
    alias _mm_min_ss = __builtin_ia32_minss;
}
else
{
    /// Minimum of element 0 of `a` and `b`; elements 1..3 are copied from `a`.
    /// When `a[0] < b[0]` is false (which includes a NaN on either side),
    /// element 0 of `b` is returned.
    __m128 _mm_min_ss(__m128 a, __m128 b) pure @safe
    {
        __m128 r = a;
        r[0] = (a[0] < b[0]) ? a[0] : b[0];
        return r;
    }
}
unittest
{
    __m128 A = _mm_setr_ps(1, 2, 3, 4);
    __m128 B = _mm_setr_ps(4, 1, 4, 1);
    __m128 C = _mm_setr_ps(float.nan, 1, 4, 1);
    __m128 M = _mm_min_ss(A, B);
    assert(M[0] == 1);
    assert(M[1] == 2);
    assert(M[2] == 3);
    assert(M[3] == 4);
    // The NaN cases exercise the scalar function under test, not _mm_min_ps.
    M = _mm_min_ss(A, C); // in case of NaN, second operand prevails
    assert(M[0] != M[0]);
    M = _mm_min_ss(C, A); // in case of NaN, second operand prevails
    assert(M[0] == 1);
}
690 
/// Returns `a` with its element 0 replaced by element 0 of `b`
/// (shuffle indices 4, 1, 2, 3 over the pair `a`, `b`).
__m128 _mm_move_ss (__m128 a, __m128 b) pure @safe
{
    __m128 merged = shufflevector!(__m128, 4, 1, 2, 3)(a, b);
    return merged;
}
695 
/// Moves the upper two elements of `b` into the lower two elements of the
/// result, and keeps the upper two elements of `a` in the upper half:
/// the result is `[b[2], b[3], a[2], a[3]]`.
///
/// The previous shuffle indices (2, 3, 6, 7) produced `[a[2], a[3], b[2], b[3]]`,
/// i.e. the two halves were swapped with respect to the Intel definition.
__m128 _mm_movehl_ps (__m128 a, __m128 b) pure @safe
{
    // Indices 0..3 select from `a`, 4..7 select from `b`.
    return shufflevector!(float4, 6, 7, 2, 3)(a, b);
}
unittest
{
    __m128 A = _mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f);
    __m128 B = _mm_setr_ps(5.0f, 6.0f, 7.0f, 8.0f);
    __m128 R = _mm_movehl_ps(A, B);
    float[4] correct = [7.0f, 8.0f, 3.0f, 4.0f];
    assert(R.array == correct);
}
700 
/// Returns `[a[0], a[1], b[0], b[1]]`: the lower halves of `a` and `b`
/// concatenated (shuffle indices 0, 1, 4, 5 over the pair `a`, `b`).
__m128 _mm_movelh_ps (__m128 a, __m128 b) pure @safe
{
    __m128 merged = shufflevector!(float4, 0, 1, 4, 5)(a, b);
    return merged;
}
705 
706 
version(LDC)
{
    // With LDC the name is bound directly to the compiler builtin.
    alias _mm_movemask_ps = __builtin_ia32_movmskps;
}
else
{
    /// Builds a 4-bit mask: bit `i` is set when element `i` of `a`,
    /// reinterpreted as a 32-bit integer, is negative.
    int _mm_movemask_ps (__m128 a) pure @safe
    {
        int4 bits = cast(int4)a;
        int mask = 0;
        if (bits[0] < 0) mask |= 1;
        if (bits[1] < 0) mask |= 2;
        if (bits[2] < 0) mask |= 4;
        if (bits[3] < 0) mask |= 8;
        return mask;
    }
}
unittest
{
    int4 source = [-1, 0, -43, 0];
    assert(_mm_movemask_ps(cast(float4)source) == 5);
}
729 
/// Returns the vector product `a * b`.
__m128 _mm_mul_ps(__m128 a, __m128 b) pure @safe
{
    __m128 product = a * b;
    return product;
}
unittest
{
    __m128 v = [1.5f, -2.0f, 3.0f, 1.0f];
    v = _mm_mul_ps(v, v);
    float[4] expected = [2.25f, 4.0f, 9.0f, 1.0f];
    assert(v.array == expected);
}
741 
/// Multiplies element 0 of `a` by element 0 of `b`; the other elements of
/// `a` are returned unchanged.
__m128 _mm_mul_ss(__m128 a, __m128 b) pure @safe
{
    const float lowProduct = a[0] * b[0];
    a[0] = lowProduct;
    return a;
}
unittest
{
    __m128 v = [1.5f, -2.0f, 3.0f, 1.0f];
    v = _mm_mul_ss(v, v);
    float[4] expected = [2.25f, -2.0f, 3.0f, 1.0f];
    assert(v.array == expected);
}
754 
755 // TODO: _mm_mulhi_pu16
756 
/// Bitwise OR of `a` and `b`, computed on their `__m128i` reinterpretation.
__m128 _mm_or_ps (__m128 a, __m128 b) pure @safe
{
    __m128i bitsA = cast(__m128i)a;
    __m128i bitsB = cast(__m128i)b;
    return cast(__m128)(bitsA | bitsB);
}
761 
762 // TODO: __m64 _m_pavgb (__m64 a, __m64 b)
763 // TODO: __m64 _m_pavgw (__m64 a, __m64 b)
764 // TODO: int _m_pextrw (__m64 a, int imm8)
765 // TODO: __m64 _m_pinsrw (__m64 a, int i, int imm8)
766 // TODO: __m64 _m_pmaxsw (__m64 a, __m64 b)
767 // TODO: __m64 _m_pmaxub (__m64 a, __m64 b)
768 // TODO: __m64 _m_pminsw (__m64 a, __m64 b)
769 // TODO: __m64 _m_pminub (__m64 a, __m64 b)
770 // TODO: int _m_pmovmskb (__m64 a)
771 
772 // TODO: __m64 _m_pmulhuw (__m64 a, __m64 b)
773 
// Locality hints for `_mm_prefetch`.
// The values follow the encoding of the Intel/GCC/clang headers
// (NTA = 0, T2 = 1, T1 = 2, T0 = 3). `_mm_prefetch` forwards the value
// unchanged as the `locality` argument of `llvm_prefetch`, where a larger
// number means more temporal locality, so T0 must be the largest value.
// T0 and T2 were previously swapped.
enum _MM_HINT_NTA = 0;
enum _MM_HINT_T0 = 3;
enum _MM_HINT_T1 = 2;
enum _MM_HINT_T2 = 1;
778 
/// Issues a prefetch for the address `p` through `llvm_prefetch`.
/// Note: `locality` must be known at compile-time, unlike in the Intel
/// Intrinsics API, which is why it is a template parameter here.
void _mm_prefetch(int locality)(void* p) pure @safe
{
    // Second and fourth arguments of llvm_prefetch; presumably "read access"
    // and "data cache" - TODO confirm against the LLVM documentation.
    enum int rw = 0;
    enum int cacheType = 1;
    llvm_prefetch(p, rw, locality, cacheType);
}
784 
785 // TODO: __m64 _m_psadbw (__m64 a, __m64 b)
786 // TODO: __m64 _m_pshufw (__m64 a, int imm8)
787 
version(LDC)
{
    // With LDC the name is bound directly to the compiler builtin.
    alias _mm_rcp_ps = __builtin_ia32_rcpps;
}
else
{
    /// Reciprocal of each of the four elements of `a`, computed with a
    /// regular division.
    __m128 _mm_rcp_ps (__m128 a) pure @safe
    {
        __m128 r = a;
        r[0] = 1.0f / a[0];
        r[1] = 1.0f / a[1];
        r[2] = 1.0f / a[2];
        r[3] = 1.0f / a[3];
        return r;
    }
}
803 
version(LDC)
{
    // With LDC the name is bound directly to the compiler builtin.
    alias _mm_rcp_ss = __builtin_ia32_rcpss;
}
else
{
    /// Reciprocal of element 0 of `a`, computed with a regular division;
    /// the other elements of `a` are returned unchanged.
    __m128 _mm_rcp_ss (__m128 a) pure @safe
    {
        const float reciprocal = 1.0f / a[0];
        a[0] = reciprocal;
        return a;
    }
}
816 
version(LDC)
{
    // With LDC the name is bound directly to the compiler builtin.
    alias _mm_rsqrt_ps = __builtin_ia32_rsqrtps;
}
else
{
    /// Reciprocal square root of each of the four elements of `a`,
    /// computed as `1.0f / sqrt(x)`.
    __m128 _mm_rsqrt_ps (__m128 a) pure @safe
    {
        __m128 r = a;
        r[0] = 1.0f / sqrt(a[0]);
        r[1] = 1.0f / sqrt(a[1]);
        r[2] = 1.0f / sqrt(a[2]);
        r[3] = 1.0f / sqrt(a[3]);
        return r;
    }
}
832 
version(LDC)
{
    // With LDC the name is bound directly to the compiler builtin.
    alias _mm_rsqrt_ss = __builtin_ia32_rsqrtss;
}
else
{
    /// Reciprocal square root of element 0 of `a`, computed as
    /// `1.0f / sqrt(x)`; the other elements of `a` are returned unchanged.
    __m128 _mm_rsqrt_ss (__m128 a) pure @safe
    {
        const float reciprocalRoot = 1.0f / sqrt(a[0]);
        a[0] = reciprocalRoot;
        return a;
    }
}
845 
// Checks _mm_rcp_ps, _mm_rcp_ss, _mm_rsqrt_ps and _mm_rsqrt_ss against the
// exact results, within a relative tolerance.
unittest
{
    double maxRelativeError = 0.000245; // -72 dB

    // Asserts that approx / exact is within maxRelativeError of 1.
    void expectClose(double approx, double exact)
    {
        double ratio = approx / exact;
        assert(fabs(ratio - 1) <= maxRelativeError);
    }

    void testInvSqrt(float number)
    {
        __m128 input = _mm_set1_ps(number);
        __m128 result;

        // test _mm_rcp_ps
        result = _mm_rcp_ps(input);
        foreach(i; 0..4)
        {
            double exact = 1.0f / input[i];
            expectClose(cast(double)(result[i]), exact);
        }

        // test _mm_rcp_ss
        {
            result = _mm_rcp_ss(input);
            double exact = 1.0f / input[0];
            expectClose(cast(double)(result[0]), exact);
        }

        // test _mm_rsqrt_ps
        result = _mm_rsqrt_ps(input);
        foreach(i; 0..4)
        {
            double exact = 1.0f / sqrt(input[i]);
            expectClose(cast(double)(result[i]), exact);
        }

        // test _mm_rsqrt_ss
        {
            result = _mm_rsqrt_ss(input);
            double exact = 1.0f / sqrt(input[0]);
            expectClose(cast(double)(result[0]), exact);
        }
    }

    testInvSqrt(1.1f);
    testInvSqrt(2.45674864151f);
    testInvSqrt(27841456468.0f);
}
892 
893 // TODO: _mm_sad_pu8
894 
/// Clears the `_MM_MASK_MASK` bits of the current control word, ORs in
/// `_MM_MASK_xxxx` and writes the result back with `_mm_setcsr`.
void _MM_SET_EXCEPTION_MASK(int _MM_MASK_xxxx) pure @safe
{
    const uint cleared = _mm_getcsr() & ~_MM_MASK_MASK;
    _mm_setcsr(cleared | _MM_MASK_xxxx);
}
899 
/// Clears the `_MM_EXCEPT_MASK` bits of the current control word, ORs in
/// `_MM_EXCEPT_xxxx` and writes the result back with `_mm_setcsr`.
void _MM_SET_EXCEPTION_STATE(int _MM_EXCEPT_xxxx) pure @safe
{
    const uint cleared = _mm_getcsr() & ~_MM_EXCEPT_MASK;
    _mm_setcsr(cleared | _MM_EXCEPT_xxxx);
}
904 
/// Clears the `_MM_FLUSH_ZERO_MASK` bit of the current control word, ORs in
/// `_MM_FLUSH_xxxx` and writes the result back with `_mm_setcsr`.
void _MM_SET_FLUSH_ZERO_MODE(int _MM_FLUSH_xxxx) pure @safe
{
    const uint cleared = _mm_getcsr() & ~_MM_FLUSH_ZERO_MASK;
    _mm_setcsr(cleared | _MM_FLUSH_xxxx);
}
909 
/// Builds a vector whose elements 0, 1, 2, 3 are `e0`, `e1`, `e2`, `e3`
/// (arguments are given from the highest element down to the lowest).
__m128 _mm_set_ps (float e3, float e2, float e1, float e0) pure @trusted
{
    // Note: despite appearances, generates sensible code,
    //       inlines correctly and is constant folded
    float[4] lanes = [e0, e1, e2, e3];
    __m128 packed = loadUnaligned!(float4)(lanes.ptr);
    return packed;
}
917 
/// `_mm_set_ps1` is a second name for `_mm_set1_ps`.
alias _mm_set_ps1 = _mm_set1_ps;
919 
/// Clears the `_MM_ROUND_MASK` bits of the current control word, ORs in
/// `_MM_ROUND_xxxx` and writes the result back with `_mm_setcsr`.
void _MM_SET_ROUNDING_MODE(int _MM_ROUND_xxxx) pure @safe
{
    const uint cleared = _mm_getcsr() & ~_MM_ROUND_MASK;
    _mm_setcsr(cleared | _MM_ROUND_xxxx);
}
924 
/// Returns a vector with `a` in element 0 and zero in the three other elements.
__m128 _mm_set_ss (float a) pure @trusted
{
    __m128 lowOnly = _mm_setzero_ps();
    lowOnly[0] = a;
    return lowOnly;
}
931 
/// Returns a vector with `a` in all four elements.
__m128 _mm_set1_ps (float a) pure @trusted
{
    // Note: despite appearances, generates sensible code,
    //       inlines correctly and is constant folded
    float[4] lanes = [a, a, a, a];
    __m128 broadcast = loadUnaligned!(float4)(lanes.ptr);
    return broadcast;
}
939 
/// Loads `controlWord` into the control/status register with the `ldmxcsr`
/// instruction. Only implemented when x86 inline assembly is available;
/// on other targets compilation fails with a static assert.
// NOTE(review): declared `pure` although its only effect is to change
// processor state - confirm that compilers cannot elide calls to it.
void _mm_setcsr(uint controlWord) pure @safe
{
    version (InlineX86Asm)
    {
        asm pure nothrow @nogc @safe
        { 
            ldmxcsr controlWord;
        }
    }
    else
        static assert(0, "Not yet supported");
}
952 
/// Builds a vector whose elements 0, 1, 2, 3 are the arguments in the order
/// they are written: `e3`, `e2`, `e1`, `e0`.
__m128 _mm_setr_ps (float e3, float e2, float e1, float e0) pure @trusted
{
    float[4] lanes = [e3, e2, e1, e0];
    __m128 packed = loadUnaligned!(float4)(lanes.ptr);
    return packed;
}
958 
/// Returns a vector whose four elements are `0.0f`.
__m128 _mm_setzero_ps() pure @trusted
{
    // Compiles to xorps without problems
    float[4] zeroes = [0.0f, 0.0f, 0.0f, 0.0f];
    __m128 cleared = loadUnaligned!(float4)(zeroes.ptr);
    return cleared;
}
965 
version(LDC)
{
    // With LDC the name is bound directly to the compiler builtin.
    alias _mm_sfence = __builtin_ia32_sfence;
}
else
{
    /// Executes the `sfence` instruction through inline assembly.
    void _mm_sfence() pure @safe
    {
        asm nothrow @nogc pure @safe
        {
            sfence;
        }
    }
}
unittest
{
    // Smoke test: only checks that the call compiles and runs.
    _mm_sfence();
}
984 
// TODO: _mm_shuffle_pi16
986 
/// Shuffle selecting two elements of `a` (result elements 0 and 1) and two
/// elements of `b` (result elements 2 and 3); each 2-bit field of `imm`
/// picks the source element.
/// Note: the immediate shuffle value is given at compile-time instead of runtime.
__m128 _mm_shuffle_ps(ubyte imm)(__m128 a, __m128 b) pure @safe
{
    // Indices 0..3 address `a`, indices 4..7 address `b`.
    enum int sel0 = imm & 3;
    enum int sel1 = (imm>>2) & 3;
    enum int sel2 = 4 + ((imm>>4) & 3);
    enum int sel3 = 4 + ((imm>>6) & 3);
    return shufflevector!(__m128, sel0, sel1, sel2, sel3)(a, b);
}
992 
version(LDC)
{
    // Disappeared with LDC 1.11
    static if (__VERSION__ < 2081)
    {
        alias _mm_sqrt_ps = __builtin_ia32_sqrtps;
    }
    else
    {
        /// Square root of each of the four elements of `vec`, via `llvm_sqrt`.
        __m128 _mm_sqrt_ps(__m128 vec) pure @safe
        {
            __m128 roots = vec;
            roots.array[0] = llvm_sqrt(vec.array[0]);
            roots.array[1] = llvm_sqrt(vec.array[1]);
            roots.array[2] = llvm_sqrt(vec.array[2]);
            roots.array[3] = llvm_sqrt(vec.array[3]);
            return roots;
        }
    }
}
else
{
    /// Square root of each of the four elements of `vec`, via `sqrt`.
    __m128 _mm_sqrt_ps(__m128 vec) pure @safe
    {
        __m128 roots = vec;
        roots.array[0] = sqrt(vec.array[0]);
        roots.array[1] = sqrt(vec.array[1]);
        roots.array[2] = sqrt(vec.array[2]);
        roots.array[3] = sqrt(vec.array[3]);
        return roots;
    }
}
unittest
{
    __m128 roots = _mm_sqrt_ps(_mm_set1_ps(4.0f));
    foreach (i; 0..4)
        assert(roots.array[i] == 2.0f);
}
1029 
version(LDC)
{
    // Disappeared with LDC 1.11
    static if (__VERSION__ < 2081)
        alias _mm_sqrt_ss = __builtin_ia32_sqrtss;
    else
    {
        /// Square root of element 0 of `vec`, via `llvm_sqrt`; elements 1..3
        /// are returned unchanged.
        __m128 _mm_sqrt_ss(__m128 vec) pure @safe
        {
            // Only element 0 is written. The former self-assignments of
            // elements 1..3 had no effect and were removed.
            vec.array[0] = llvm_sqrt(vec.array[0]);
            return vec;
        }
    }
}
else
{
    /// Square root of element 0 of `vec`, via `sqrt`; elements 1..3 are
    /// returned unchanged.
    __m128 _mm_sqrt_ss(__m128 vec) pure @safe
    {
        vec.array[0] = sqrt(vec.array[0]);
        return vec;
    }
}
unittest
{
    __m128 A = _mm_sqrt_ss(_mm_set1_ps(4.0f));
    assert(A.array[0] == 2.0f);
    assert(A.array[1] == 4.0f);
    assert(A.array[2] == 4.0f);
    assert(A.array[3] == 4.0f);
}
1063 
/// Stores `a` to `mem_addr`, written through a `__m128*`.
void _mm_store_ps (float* mem_addr, __m128 a) pure // not safe since nothing guarantees alignment
{
    *cast(__m128*)mem_addr = a;
}
1069 
/// `_mm_store_ps1` is a second name for `_mm_store1_ps`.
alias _mm_store_ps1 = _mm_store1_ps;
1071 
/// Writes element 0 of `a` to the float at `mem_addr`.
void _mm_store_ss (float* mem_addr, __m128 a) pure @safe
{
    const float lowest = a[0];
    *mem_addr = lowest;
}
1076 
/// Stores four copies of element 0 of `a` to `mem_addr`, written through a
/// `__m128*`.
void _mm_store1_ps (float* mem_addr, __m128 a) pure // not safe since nothing guarantees alignment
{
    __m128 broadcast = shufflevector!(__m128, 0, 0, 0, 0)(a, a);
    *cast(__m128*)mem_addr = broadcast;
}
1082 
/// Writes the upper 64 bits of `a` to `*p`.
void _mm_storeh_pi(__m64* p, __m128 a) pure @safe
{
    long2 halves = cast(long2)a;
    const long upper = halves[1];
    (*p)[0] = upper;
}
unittest
{
    __m64 destination = _mm_setzero_si64();
    long2 source = [13, 25];
    _mm_storeh_pi(&destination, cast(__m128)source);
    assert(destination[0] == 25);
}
1095 
/// Writes the lower 64 bits of `a` to `*p`.
void _mm_storel_pi(__m64* p, __m128 a) pure @safe
{
    long2 halves = cast(long2)a;
    const long lower = halves[0];
    (*p)[0] = lower;
}
unittest
{
    __m64 destination = _mm_setzero_si64();
    long2 source = [13, 25];
    _mm_storel_pi(&destination, cast(__m128)source);
    assert(destination[0] == 13);
}
1108 
/// Stores the four elements of `a` in reverse order to `mem_addr`, written
/// through a `__m128*`.
void _mm_storer_ps(float* mem_addr, __m128 a) pure // not safe since nothing guarantees alignment
{
    __m128 reversed = shufflevector!(__m128, 3, 2, 1, 0)(a, a);
    *cast(__m128*)mem_addr = reversed;
}
1114 
/// Stores `a` to `mem_addr` through `storeUnaligned`.
void _mm_storeu_ps(float* mem_addr, __m128 a) pure @safe
{
    float* destination = mem_addr;
    storeUnaligned!(float4)(a, destination);
}
1119 
1120 // TODO: _mm_stream_pi, does not seem possible
1121 
1122 // BUG: can't implement non-temporal store with LDC inlineIR since !nontemporal
1123 // needs some IR outside this function that would say: 
1124 //
1125 //  !0 = !{ i32 1 }
1126 //
1127 // It's a LLVM IR metadata description.
1128 // Regardless, non-temporal moves are really dangerous for performance...
/// Stores `a` to `mem_addr`, written through a `__m128*`.
/// This is a regular store rather than a non-temporal one.
void _mm_stream_ps (float* mem_addr, __m128 a)
{
    *cast(__m128*)mem_addr = a; // it's a regular move instead
}
unittest
{
    align(16) float[4] destination;
    _mm_stream_ps(destination.ptr, _mm_set1_ps(78.0f));
    foreach (value; destination)
        assert(value == 78.0f);
}
1140 
/// Returns the vector difference `a - b`.
__m128 _mm_sub_ps(__m128 a, __m128 b) pure @safe
{
    __m128 difference = a - b;
    return difference;
}
unittest
{
    __m128 v = [1.5f, -2.0f, 3.0f, 1.0f];
    v = _mm_sub_ps(v, v);
    float[4] expected = [0.0f, 0.0f, 0.0f, 0.0f];
    assert(v.array == expected);
}
1152 
/// Subtracts element 0 of `b` from element 0 of `a`; the other elements of
/// `a` are returned unchanged.
__m128 _mm_sub_ss(__m128 a, __m128 b) pure @safe
{
    const float lowDifference = a[0] - b[0];
    a[0] = lowDifference;
    return a;
}
unittest
{
    __m128 v = [1.5f, -2.0f, 3.0f, 1.0f];
    v = _mm_sub_ss(v, v);
    float[4] expected = [0.0f, -2.0, 3.0f, 1.0f];
    assert(v.array == expected);
}
1165 
1166 
/// Rewrites the four rows in place from the interleaved low and high halves
/// of row pairs (0,1) and (2,3), recombined with `_mm_movelh_ps` and
/// `_mm_movehl_ps`.
void _MM_TRANSPOSE4_PS (ref __m128 row0, ref __m128 row1, ref __m128 row2, ref __m128 row3) pure @safe
{
    // Interleave the low and high halves of each pair of rows.
    __m128 low01  = _mm_unpacklo_ps(row0, row1);
    __m128 low23  = _mm_unpacklo_ps(row2, row3);
    __m128 high01 = _mm_unpackhi_ps(row0, row1);
    __m128 high23 = _mm_unpackhi_ps(row2, row3);

    // Recombine the interleaved halves into the output rows.
    row0 = _mm_movelh_ps(low01, low23);
    row1 = _mm_movehl_ps(low23, low01);
    row2 = _mm_movelh_ps(high01, high23);
    row3 = _mm_movehl_ps(high23, high01);
}
1179 
// Note: the `ucomi` intrinsics are plain aliases of their `comi` counterparts.
//       The only difference between the two families is the signalling
//       behaviour of quiet NaNs, so aliasing them is not strictly correct.
//       However, the case where you would want to differentiate between qNaN
//       and sNaN and then treat them differently on purpose seems extremely rare.
alias _mm_ucomieq_ss = _mm_comieq_ss;
alias _mm_ucomige_ss = _mm_comige_ss;
alias _mm_ucomigt_ss = _mm_comigt_ss;
alias _mm_ucomile_ss = _mm_comile_ss;
alias _mm_ucomilt_ss = _mm_comilt_ss;
alias _mm_ucomineq_ss = _mm_comineq_ss;
1190 
1191 
/// Returns a vector that is deliberately left uninitialized (`= void`);
/// its contents must not be relied upon.
__m128 _mm_undefined_ps() pure @safe
{
    __m128 uninitialized = void;
    return uninitialized;
}
1197 
/// Interleaves the upper halves of `a` and `b`:
/// returns `[a[2], b[2], a[3], b[3]]`.
__m128 _mm_unpackhi_ps (__m128 a, __m128 b) pure @safe
{
    __m128 interleaved = shufflevector!(float4, 2, 6, 3, 7)(a, b);
    return interleaved;
}
1202 
/// Interleaves the lower halves of `a` and `b`:
/// returns `[a[0], b[0], a[1], b[1]]`.
__m128 _mm_unpacklo_ps (__m128 a, __m128 b) pure @safe
{
    __m128 interleaved = shufflevector!(float4, 0, 4, 1, 5)(a, b);
    return interleaved;
}
1207 
/// Bitwise XOR of `a` and `b`, computed on their integer reinterpretation.
///
/// This is the overload with the `__m128` parameter and return types that the
/// Intel API documents for `_mm_xor_ps`.
__m128 _mm_xor_ps (__m128 a, __m128 b) pure @safe
{
    return cast(__m128)(cast(__m128i)a ^ cast(__m128i)b);
}

/// Integer-vector overload of `_mm_xor_ps`, returning `a ^ b`.
/// Kept so that existing callers which pass `__m128i` values keep compiling.
__m128i _mm_xor_ps (__m128i a, __m128i b) pure @safe
{
    return a ^ b;
}
1212 
1213 
// Helpers for `_mm_malloc` / `_mm_free`.
//
// Memory layout of an allocation, from low to high addresses:
//   [padding][alignment : size_t][size : size_t][raw malloc pointer][user data...]
// The user pointer is the first suitably aligned address that leaves room
// for the three bookkeeping slots.
private
{
    /// Returns: `true` if the low bits of `p` selected by `alignment - 1`
    /// are all zero, i.e. `p` is aligned when `alignment` is a power of two.
    bool isPointerAligned(void* p, size_t alignment) pure
    {
        assert(alignment != 0);
        const size_t lowBits = cast(size_t)p & (alignment - 1);
        return lowBits == 0;
    }

    /// Returns: the first address greater than or equal to `start` that is
    /// a multiple of `alignment`.
    void* nextAlignedPointer(void* start, size_t alignment) pure
    {
        const size_t address = cast(size_t)(start);
        return cast(void*)nextMultipleOf(address, alignment);
    }

    // Returns the number of bytes to actually request from malloc so that
    // `askedSize` aligned bytes plus the three bookkeeping slots always fit.
    @nogc size_t requestedSize(size_t askedSize, size_t alignment) pure
    {
        enum size_t headerBytes = 3 * size_t.sizeof;
        const size_t worstCasePadding = alignment - 1;
        return askedSize + worstCasePadding + headerBytes;
    }

    // Computes the aligned user pointer inside the `raw` malloc block and
    // records alignment, size and the raw pointer just below it.
    @nogc void* storeRawPointerPlusInfo(void* raw, size_t size, size_t alignment) pure
    {
        enum size_t slot = size_t.sizeof;

        // Leave room for the three slots, then round up to the alignment.
        char* firstUsable = cast(char*)raw + 3 * slot;
        void* aligned = nextAlignedPointer(firstUsable, alignment);
        char* user = cast(char*)aligned;

        *cast(size_t*)(user - 3 * slot) = alignment;
        *cast(size_t*)(user - 2 * slot) = size;
        *cast(void**)(user - slot) = raw;

        assert( isPointerAligned(aligned, alignment) );
        return aligned;
    }

    // Returns: x, multiple of powerOfTwo, so that x >= n.
    @nogc size_t nextMultipleOf(size_t n, size_t powerOfTwo) pure nothrow
    {
        // check power-of-two
        assert( (powerOfTwo != 0) && ((powerOfTwo & (powerOfTwo - 1)) == 0));

        const size_t lowBits = powerOfTwo - 1;
        return (n + lowBits) & ~lowBits;
    }
}
1263 
// Tests for nextMultipleOf, _mm_malloc and _mm_free.
unittest
{
    // nextMultipleOf(n, 4) for n = 0..5
    size_t[6] expected = [0, 4, 4, 4, 4, 8];
    foreach (n, want; expected)
        assert(nextMultipleOf(n, 4) == want);

    // A 16-byte aligned allocation must come back non-null and aligned.
    {
        void* block = _mm_malloc(23, 16);
        assert(block !is null);
        assert((cast(size_t)block % 16) == 0);
        _mm_free(block);
    }

    // A zero-sized request still yields a pointer that can be freed.
    void* emptyBlock = _mm_malloc(0, 32);
    assert(emptyBlock !is null);
    _mm_free(emptyBlock);
}