1 /**
2 * Copyright: Copyright Auburn Sounds 2016-2019.
3 * License:   $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
4 * Authors:   Guillaume Piolat
5 */
6 module inteli.xmmintrin;
7 
8 public import inteli.types;
9 
10 import inteli.internals;
11 
12 import inteli.mmx;
13 import inteli.emmintrin;
14 
15 import core.stdc.stdlib: malloc, free;
16 import core.exception: onOutOfMemoryError;
17 
18 version(D_InlineAsm_X86)
19     version = InlineX86Asm;
20 else version(D_InlineAsm_X86_64)
21     version = InlineX86Asm;
22 
23 
24 // SSE1
25 
26 nothrow @nogc:
27 
28 
29 enum int _MM_EXCEPT_INVALID    = 0x0001;
30 enum int _MM_EXCEPT_DENORM     = 0x0002;
31 enum int _MM_EXCEPT_DIV_ZERO   = 0x0004;
32 enum int _MM_EXCEPT_OVERFLOW   = 0x0008;
33 enum int _MM_EXCEPT_UNDERFLOW  = 0x0010;
34 enum int _MM_EXCEPT_INEXACT    = 0x0020;
35 enum int _MM_EXCEPT_MASK       = 0x003f;
36 
37 enum int _MM_MASK_INVALID      = 0x0080;
38 enum int _MM_MASK_DENORM       = 0x0100;
39 enum int _MM_MASK_DIV_ZERO     = 0x0200;
40 enum int _MM_MASK_OVERFLOW     = 0x0400;
41 enum int _MM_MASK_UNDERFLOW    = 0x0800;
42 enum int _MM_MASK_INEXACT      = 0x1000;
43 enum int _MM_MASK_MASK         = 0x1f80;
44 
45 enum int _MM_ROUND_NEAREST     = 0x0000;
46 enum int _MM_ROUND_DOWN        = 0x2000;
47 enum int _MM_ROUND_UP          = 0x4000;
48 enum int _MM_ROUND_TOWARD_ZERO = 0x6000;
49 enum int _MM_ROUND_MASK        = 0x6000;
50 
51 enum int _MM_FLUSH_ZERO_MASK   = 0x8000;
52 enum int _MM_FLUSH_ZERO_ON     = 0x8000;
53 enum int _MM_FLUSH_ZERO_OFF    = 0x0000;
54 
55 __m128 _mm_add_ps(__m128 a, __m128 b) pure @safe
56 {
57     return a + b;
58 }
59 
60 unittest
61 {
62     __m128 a = [1, 2, 3, 4];
63     a = _mm_add_ps(a, a);
64     assert(a.array[0] == 2);
65     assert(a.array[1] == 4);
66     assert(a.array[2] == 6);
67     assert(a.array[3] == 8);
68 }
69 
70 __m128 _mm_add_ss(__m128 a, __m128 b) pure @safe
71 {
72     a[0] += b[0];
73     return a;
74 }
75 unittest
76 {
77     __m128 a = [1, 2, 3, 4];
78     a = _mm_add_ss(a, a);
79     assert(a.array == [2.0f, 2, 3, 4]);
80 }
81 
82 __m128 _mm_and_ps (__m128 a, __m128 b) pure @safe
83 {
84     return cast(__m128)(cast(__m128i)a & cast(__m128i)b);
85 }
86 unittest
87 {
88     // Note: tested in emmintrin.d
89 }
90 
91 __m128 _mm_andnot_ps (__m128 a, __m128 b) pure @safe
92 {
93     return cast(__m128)( (~cast(__m128i)a) & cast(__m128i)b );
94 }
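// Bitwise check of (~a) & b on the raw lane bits, mirroring the cast-based implementation above.
unittest
{
    __m128i A = _mm_setr_epi32(0x0F0F_0F0F, 0, -1, 0x5555_5555);
    __m128i B = _mm_setr_epi32(0x3333_3333, -1, -1, 0x5555_5555);
    __m128i R = cast(__m128i) _mm_andnot_ps(cast(__m128)A, cast(__m128)B);
    int[4] correct = [0x3030_3030, -1, 0, 0];
    assert(R.array == correct);
}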
95 
/// Average packed unsigned 16-bit integers in `a` and `b`.
97 __m64 _mm_avg_pu16 (__m64 a, __m64 b) pure @safe
98 {
99     return to_m64(_mm_avg_epu16(to_m128i(a), to_m128i(b)));
100 }
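// Rounded unsigned average: (a + b + 1) >> 1 per 16-bit lane.
unittest
{
    __m64 A = _mm_setr_pi16(16, 32, 64, -1);
    __m64 B = _mm_setr_pi16(32, 32, 64, -1);
    short4 R = cast(short4) _mm_avg_pu16(A, B);
    short[4] correct = [24, 32, 64, -1];
    assert(R.array == correct);
}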
101 
/// Average packed unsigned 8-bit integers in `a` and `b`.
103 __m64 _mm_avg_pu8 (__m64 a, __m64 b) pure @safe
104 {
105     return to_m64(_mm_avg_epu8(to_m128i(a), to_m128i(b)));
106 }
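// Rounded unsigned byte average, same (a + b + 1) >> 1 formula as above.
unittest
{
    __m64 A = _mm_setr_pi8(0, 1, 2, 3, 4, 5, 6, 7);
    __m64 B = _mm_setr_pi8(4, 5, 6, 7, 8, 9, 10, 11);
    byte8 R = cast(byte8) _mm_avg_pu8(A, B);
    byte[8] correct = [2, 3, 4, 5, 6, 7, 8, 9];
    assert(R.array == correct);
}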
107 
108 __m128 _mm_cmpeq_ps (__m128 a, __m128 b) pure @safe
109 {
110     return cast(__m128) cmpps!(FPComparison.oeq)(a, b);
111 }
112 
113 __m128 _mm_cmpeq_ss (__m128 a, __m128 b) pure @safe
114 {
115     return cast(__m128) cmpss!(FPComparison.oeq)(a, b);
116 }
117 
118 __m128 _mm_cmpge_ps (__m128 a, __m128 b) pure @safe
119 {
120     return cast(__m128) cmpps!(FPComparison.oge)(a, b);
121 }
122 unittest
123 {
124     __m128i R = cast(__m128i) _mm_cmpge_ps(_mm_setr_ps(0, 1, -1, float.nan),
125                                            _mm_setr_ps(0, 0, 0, 0));
126     int[4] correct = [-1, -1, 0, 0];
127     assert(R.array == correct);
128 }
129 
130 __m128 _mm_cmpge_ss (__m128 a, __m128 b) pure @safe
131 {
132     return cast(__m128) cmpss!(FPComparison.oge)(a, b);
133 }
134 
135 __m128 _mm_cmpgt_ps (__m128 a, __m128 b) pure @safe
136 {
137     return cast(__m128) cmpps!(FPComparison.ogt)(a, b);
138 }
139 
140 __m128 _mm_cmpgt_ss (__m128 a, __m128 b) pure @safe
141 {
142     return cast(__m128) cmpss!(FPComparison.ogt)(a, b);
143 }
144 
145 __m128 _mm_cmple_ps (__m128 a, __m128 b) pure @safe
146 {
147     return cast(__m128) cmpps!(FPComparison.ole)(a, b);
148 }
149 
150 __m128 _mm_cmple_ss (__m128 a, __m128 b) pure @safe
151 {
152     return cast(__m128) cmpss!(FPComparison.ole)(a, b);
153 }
154 
155 __m128 _mm_cmplt_ps (__m128 a, __m128 b) pure @safe
156 {
157     return cast(__m128) cmpps!(FPComparison.olt)(a, b);
158 }
159 
160 __m128 _mm_cmplt_ss (__m128 a, __m128 b) pure @safe
161 {
162     return cast(__m128) cmpss!(FPComparison.olt)(a, b);
163 }
164 
165 __m128 _mm_cmpneq_ps (__m128 a, __m128 b) pure @safe
166 {
167     return cast(__m128) cmpps!(FPComparison.une)(a, b);
168 }
169 
170 __m128 _mm_cmpneq_ss (__m128 a, __m128 b) pure @safe
171 {
172     return cast(__m128) cmpss!(FPComparison.une)(a, b);
173 }
174 
175 __m128 _mm_cmpnge_ps (__m128 a, __m128 b) pure @safe
176 {
177     return cast(__m128) cmpps!(FPComparison.ult)(a, b);
178 }
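// "Not greater-or-equal" is an unordered predicate, so NaN lanes compare true.
unittest
{
    __m128i R = cast(__m128i) _mm_cmpnge_ps(_mm_setr_ps(0, 1, -1, float.nan),
                                            _mm_setr_ps(0, 0, 0, 0));
    int[4] correct = [0, 0, -1, -1];
    assert(R.array == correct);
}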
179 
180 __m128 _mm_cmpnge_ss (__m128 a, __m128 b) pure @safe
181 {
182     return cast(__m128) cmpss!(FPComparison.ult)(a, b);
183 }
184 
185 __m128 _mm_cmpngt_ps (__m128 a, __m128 b) pure @safe
186 {
187     return cast(__m128) cmpps!(FPComparison.ule)(a, b);
188 }
189 
190 __m128 _mm_cmpngt_ss (__m128 a, __m128 b) pure @safe
191 {
192     return cast(__m128) cmpss!(FPComparison.ule)(a, b);
193 }
194 
195 __m128 _mm_cmpnle_ps (__m128 a, __m128 b) pure @safe
196 {
197     return cast(__m128) cmpps!(FPComparison.ugt)(a, b);
198 }
199 
200 __m128 _mm_cmpnle_ss (__m128 a, __m128 b) pure @safe
201 {
202     return cast(__m128) cmpss!(FPComparison.ugt)(a, b);
203 }
204 
205 __m128 _mm_cmpnlt_ps (__m128 a, __m128 b) pure @safe
206 {
207     return cast(__m128) cmpps!(FPComparison.uge)(a, b);
208 }
209 
210 __m128 _mm_cmpnlt_ss (__m128 a, __m128 b) pure @safe
211 {
212     return cast(__m128) cmpss!(FPComparison.uge)(a, b);
213 }
214 
215 __m128 _mm_cmpord_ps (__m128 a, __m128 b) pure @safe
216 {
217     return cast(__m128) cmpps!(FPComparison.ord)(a, b);
218 }
219 
220 __m128 _mm_cmpord_ss (__m128 a, __m128 b) pure @safe
221 {
222     return cast(__m128) cmpss!(FPComparison.ord)(a, b);
223 }
224 
225 __m128 _mm_cmpunord_ps (__m128 a, __m128 b) pure @safe
226 {
227     return cast(__m128) cmpps!(FPComparison.uno)(a, b);
228 }
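// Unordered compare: a lane is true when either input is NaN.
unittest
{
    __m128i R = cast(__m128i) _mm_cmpunord_ps(_mm_setr_ps(0, 1, float.nan, float.nan),
                                              _mm_setr_ps(0, float.nan, 0, float.nan));
    int[4] correct = [0, -1, -1, -1];
    assert(R.array == correct);
}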
229 
230 __m128 _mm_cmpunord_ss (__m128 a, __m128 b) pure @safe
231 {
232     return cast(__m128) cmpss!(FPComparison.uno)(a, b);
233 }
234 
// Note: we've reverse-engineered clang and GCC behaviour with regards to EFLAGS.
// Some of these comparisons yield true for NaNs, others don't.
237 
238 int _mm_comieq_ss (__m128 a, __m128 b) pure @safe // comiss + sete
239 {
240     return comss!(FPComparison.ueq)(a, b); // yields true for NaN!
241 }
242 
243 int _mm_comige_ss (__m128 a, __m128 b) pure @safe // comiss + setae
244 {
245     return comss!(FPComparison.oge)(a, b);
246 }
247 
248 int _mm_comigt_ss (__m128 a, __m128 b) pure @safe // comiss + seta
249 {
250     return comss!(FPComparison.ogt)(a, b);
251 }
252 
253 int _mm_comile_ss (__m128 a, __m128 b) pure @safe // comiss + setbe
254 {
255     return comss!(FPComparison.ule)(a, b); // yields true for NaN!
256 }
257 
258 int _mm_comilt_ss (__m128 a, __m128 b) pure @safe // comiss + setb
259 {
260     return comss!(FPComparison.ult)(a, b); // yields true for NaN!
261 }
262 
263 int _mm_comineq_ss (__m128 a, __m128 b) pure @safe // comiss + setne
264 {
265     return comss!(FPComparison.one)(a, b);
266 }
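// Small check of the scalar comparisons, relying on the NaN behaviour noted in the comments above.
unittest
{
    assert(1 == _mm_comige_ss(_mm_set_ss(2.0f), _mm_set_ss(2.0f)));
    assert(0 == _mm_comige_ss(_mm_set_ss(1.0f), _mm_set_ss(2.0f)));
    assert(1 == _mm_comieq_ss(_mm_set_ss(float.nan), _mm_set_ss(1.0f))); // ueq: yields true for NaN
    assert(0 == _mm_comigt_ss(_mm_set_ss(float.nan), _mm_set_ss(1.0f))); // ogt: yields false for NaN
}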
267 
268 alias _mm_cvt_pi2ps = _mm_cvtpi32_ps;
269 
270 __m64 _mm_cvt_ps2pi (__m128 a) pure @safe
271 {
272     return to_m64(_mm_cvtps_epi32(a));
273 }
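// Converts the two lower lanes with rounding controlled by MXCSR (round-to-nearest by default).
unittest
{
    int2 R = cast(int2) _mm_cvt_ps2pi(_mm_setr_ps(1.0f, -2.0f, 3.0f, 4.0f));
    int[2] correct = [1, -2];
    assert(R.array == correct);
}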
274 
275 __m128 _mm_cvt_si2ss(__m128 v, int x) pure @safe
276 {
277     v[0] = cast(float)x;
278     return v;
279 }
280 unittest
281 {
282     __m128 a = _mm_cvt_si2ss(_mm_set1_ps(0.0f), 42);
283     assert(a.array == [42f, 0, 0, 0]);
284 }
285 
// Note: this is just another name for _mm_cvtss_si32.
287 alias _mm_cvt_ss2si = _mm_cvtss_si32;
288 
289 
290 __m128 _mm_cvtpi16_ps (__m64 a) pure @safe
291 {
292     __m128i ma = to_m128i(a);
293     ma = _mm_unpacklo_epi16(ma, _mm_setzero_si128()); // Zero-extend to 32-bit
294     ma = _mm_srai_epi32(_mm_slli_epi32(ma, 16), 16); // Replicate sign bit
295     return _mm_cvtepi32_ps(ma);
296 }
297 unittest
298 {
299     __m64 A = _mm_setr_pi16(-1, 2, -3, 4);
300     __m128 R = _mm_cvtpi16_ps(A);
301     float[4] correct = [-1.0f, 2.0f, -3.0f, 4.0f];
302     assert(R.array == correct);
303 }
304 
305 __m128 _mm_cvtpi32_ps (__m128 a, __m64 b)
306 {
307     __m128 fb = _mm_cvtepi32_ps(to_m128i(b));
308     a[0] = fb[0];
309     a[1] = fb[1];
310     return a;
311 }
312 unittest
313 {
314     __m128 R = _mm_cvtpi32_ps(_mm_set1_ps(4.0f), _mm_setr_pi32(1, 2));
315     float[4] correct = [1.0f, 2.0f, 4.0f, 4.0f];
316     assert(R.array == correct);
317 }
318 
319 
320 __m128 _mm_cvtpi32x2_ps (__m64 a, __m64 b) pure @safe
321 {
322     long2 l;
323     l[0] = a[0];
324     l[1] = b[0];
325     return _mm_cvtepi32_ps(cast(__m128i)l);
326 }
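// The two 32-bit lanes of `a` land in the low half of the result, those of `b` in the high half.
unittest
{
    __m128 R = _mm_cvtpi32x2_ps(_mm_setr_pi32(-45, 128), _mm_setr_pi32(0, 1000));
    float[4] correct = [-45.0f, 128.0f, 0.0f, 1000.0f];
    assert(R.array == correct);
}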
327 
328 __m128 _mm_cvtpi8_ps (__m64 a) pure @safe
329 {
330     __m128i b = to_m128i(a); 
331 
332     // Zero extend to 32-bit
333     b = _mm_unpacklo_epi8(b, _mm_setzero_si128());
334     b = _mm_unpacklo_epi16(b, _mm_setzero_si128());
335 
336     // Replicate sign bit
337     b = _mm_srai_epi32(_mm_slli_epi32(b, 24), 24); // Replicate sign bit
338     return _mm_cvtepi32_ps(b);
339 }
340 unittest
341 {
342     __m64 A = _mm_setr_pi8(-1, 2, -3, 4, 0, 0, 0, 0);
343     __m128 R = _mm_cvtpi8_ps(A);
344     float[4] correct = [-1.0f, 2.0f, -3.0f, 4.0f];
345     assert(R.array == correct);
346 }
347 
348 __m64 _mm_cvtps_pi16 (__m128 a) pure @safe
349 {
    // The C++ version of this intrinsic converts the floats to 32-bit integers, then uses packssdw,
    // which means the 16-bit results are saturated.
352     __m128i b = _mm_cvtps_epi32(a);
353     b = _mm_packs_epi32(b, b);
354     return to_m64(b);
355 }
356 unittest
357 {
358     __m128 A = _mm_setr_ps(-1.0f, 2.0f, -33000.0f, 70000.0f);
359     short4 R = cast(short4) _mm_cvtps_pi16(A);
360     short[4] correct = [-1, 2, -32768, 32767];
361     assert(R.array == correct);
362 }
363 
364 __m64 _mm_cvtps_pi32 (__m128 a) pure @safe
365 {
366     return to_m64(_mm_cvtps_epi32(a));
367 }
368 unittest
369 {
    __m128 A = _mm_setr_ps(-33000.0f, 70000.0f, -1.0f, 2.0f);
371     int2 R = cast(int2) _mm_cvtps_pi32(A);
372     int[2] correct = [-33000, 70000];
373     assert(R.array == correct);
374 }
375 
376 __m64 _mm_cvtps_pi8 (__m128 a) pure @safe
377 {
    // The C++ version of this intrinsic converts the floats to 32-bit integers, then uses packssdw + packsswb,
    // which means the 8-bit results are saturated.
380     __m128i b = _mm_cvtps_epi32(a);
381     b = _mm_packs_epi32(b, _mm_setzero_si128());
382     b = _mm_packs_epi16(b, _mm_setzero_si128());
383     return to_m64(b);
384 }
385 unittest
386 {
387     __m128 A = _mm_setr_ps(-1.0f, 2.0f, -129.0f, 128.0f);
388     byte8 R = cast(byte8) _mm_cvtps_pi8(A);
389     byte[8] correct = [-1, 2, -128, 127, 0, 0, 0, 0];
390     assert(R.array == correct);
391 }
392 
393 __m128 _mm_cvtpu16_ps (__m64 a) pure @safe
394 {
395     __m128i ma = to_m128i(a);
396     ma = _mm_unpacklo_epi16(ma, _mm_setzero_si128()); // Zero-extend to 32-bit
397     return _mm_cvtepi32_ps(ma);
398 }
399 unittest
400 {
401     __m64 A = _mm_setr_pi16(-1, 2, -3, 4);
402     __m128 R = _mm_cvtpu16_ps(A);
403     float[4] correct = [65535.0f, 2.0f, 65533.0f, 4.0f];
404     assert(R.array == correct);
405 }
406 
407 __m128 _mm_cvtpu8_ps (__m64 a) pure @safe
408 {
409     __m128i b = to_m128i(a); 
410 
411     // Zero extend to 32-bit
412     b = _mm_unpacklo_epi8(b, _mm_setzero_si128());
413     b = _mm_unpacklo_epi16(b, _mm_setzero_si128());
414     return _mm_cvtepi32_ps(b);
415 }
416 unittest
417 {
418     __m64 A = _mm_setr_pi8(-1, 2, -3, 4, 0, 0, 0, 0);
419     __m128 R = _mm_cvtpu8_ps(A);
420     float[4] correct = [255.0f, 2.0f, 253.0f, 4.0f];
421     assert(R.array == correct);
422 }
423 
424 __m128 _mm_cvtsi32_ss(__m128 v, int x) pure @safe
425 {
426     v[0] = cast(float)x;
427     return v;
428 }
429 unittest
430 {
431     __m128 a = _mm_cvtsi32_ss(_mm_set1_ps(0.0f), 42);
432     assert(a.array == [42.0f, 0, 0, 0]);
433 }
434 
435 // Note: on macOS, using "llvm.x86.sse.cvtsi642ss" was buggy
436 __m128 _mm_cvtsi64_ss(__m128 v, long x) pure @safe
437 {
438     v[0] = cast(float)x;
439     return v;
440 }
441 unittest
442 {
443     __m128 a = _mm_cvtsi64_ss(_mm_set1_ps(0.0f), 42);
444     assert(a.array == [42.0f, 0, 0, 0]);
445 }
446 
447 float _mm_cvtss_f32(__m128 a) pure @safe
448 {
449     return a[0];
450 }
451 
452 version(LDC)
453 {
454     alias _mm_cvtss_si32 = __builtin_ia32_cvtss2si;
455 }
456 else
457 {
458     int _mm_cvtss_si32 (__m128 a) pure @safe
459     {
460         return convertFloatToInt32UsingMXCSR(a[0]);
461     }
462 }
463 unittest
464 {
465     assert(1 == _mm_cvtss_si32(_mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f)));
466 }
467 
468 version(LDC)
469 {
470     version(X86_64)
471         alias _mm_cvtss_si64 = __builtin_ia32_cvtss2si64;
472     else
473     {
474         // Note: __builtin_ia32_cvtss2si64 crashes LDC in 32-bit
475         long _mm_cvtss_si64 (__m128 a) pure @safe
476         {
477             return convertFloatToInt64UsingMXCSR(a[0]);
478         }
479     }
480 }
481 else
482 {
483     long _mm_cvtss_si64 (__m128 a) pure @safe
484     {
485         return convertFloatToInt64UsingMXCSR(a[0]);
486     }
487 }
488 unittest
489 {
490     assert(1 == _mm_cvtss_si64(_mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f)));
491 
492     uint savedRounding = _MM_GET_ROUNDING_MODE();
493 
494     _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST);
495     assert(-86186 == _mm_cvtss_si64(_mm_set1_ps(-86186.5f)));
496 
497     _MM_SET_ROUNDING_MODE(_MM_ROUND_DOWN);
498     assert(-86187 == _mm_cvtss_si64(_mm_set1_ps(-86186.1f)));
499 
500     _MM_SET_ROUNDING_MODE(_MM_ROUND_UP);
501     assert(86187 == _mm_cvtss_si64(_mm_set1_ps(86186.1f)));
502 
503     _MM_SET_ROUNDING_MODE(_MM_ROUND_TOWARD_ZERO);
504     assert(-86186 == _mm_cvtss_si64(_mm_set1_ps(-86186.9f)));
505 
506     _MM_SET_ROUNDING_MODE(savedRounding);
507 }
508 
509 
510 version(LDC)
511 {
512     alias _mm_cvtt_ss2si = __builtin_ia32_cvttss2si;
513 }
514 else
515 {
516     int _mm_cvtt_ss2si (__m128 a) pure @safe
517     {
518         return cast(int)(a[0]);
519     }
520 }
521 unittest
522 {
523     assert(1 == _mm_cvtt_ss2si(_mm_setr_ps(1.9f, 2.0f, 3.0f, 4.0f)));
524 }
525 
526 __m64 _mm_cvtt_ps2pi (__m128 a) pure @safe
527 {
528     return to_m64(_mm_cvttps_epi32(a));
529 }
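// Truncating conversion of the two lower lanes (toward zero, regardless of MXCSR).
unittest
{
    int2 R = cast(int2) _mm_cvtt_ps2pi(_mm_setr_ps(1.9f, -2.9f, 3.0f, 4.0f));
    int[2] correct = [1, -2];
    assert(R.array == correct);
}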
530 
531 alias _mm_cvttss_si32 = _mm_cvtt_ss2si; // it's actually the same op
532 
533 // Note: __builtin_ia32_cvttss2si64 crashes LDC when generating 32-bit x86 code.
534 long _mm_cvttss_si64 (__m128 a) pure @safe
535 {
536     return cast(long)(a[0]); // Generates cvttss2si as expected
537 }
538 unittest
539 {
540     assert(1 == _mm_cvttss_si64(_mm_setr_ps(1.9f, 2.0f, 3.0f, 4.0f)));
541 }
542 
543 __m128 _mm_div_ps(__m128 a, __m128 b) pure @safe
544 {
545     return a / b;
546 }
547 unittest
548 {
549     __m128 a = [1.5f, -2.0f, 3.0f, 1.0f];
550     a = _mm_div_ps(a, a);
551     float[4] correct = [1.0f, 1.0f, 1.0f, 1.0f];
552     assert(a.array == correct);
553 }
554 
555 __m128 _mm_div_ss(__m128 a, __m128 b) pure @safe
556 {
557     a[0] /= b[0];
558     return a;
559 }
560 unittest
561 {
562     __m128 a = [1.5f, -2.0f, 3.0f, 1.0f];
563     a = _mm_div_ss(a, a);
564     float[4] correct = [1.0f, -2.0, 3.0f, 1.0f];
565     assert(a.array == correct);
566 }
567 
568 int _mm_extract_pi16 (__m64 a, int imm8)
569 {
570     short4 sa = cast(short4)a;
571     return cast(ushort)(sa[imm8]);
572 }
573 unittest
574 {
575     __m64 A = _mm_setr_pi16(-1, 6, 0, 4);
576     assert(_mm_extract_pi16(A, 0) == 65535);
577     assert(_mm_extract_pi16(A, 1) == 6);
578     assert(_mm_extract_pi16(A, 2) == 0);
579     assert(_mm_extract_pi16(A, 3) == 4);
580 }
581 
582 /// Free aligned memory that was allocated with `_mm_malloc`.
583 void _mm_free(void * mem_addr) @trusted
584 {
585     // support for free(NULL)
586     if (mem_addr is null)
587         return;
588 
    // Technically we don't need to store the size and alignment in the chunk, but we do in case we
    // have to implement _mm_realloc
591 
    size_t pointerSize = (void*).sizeof;
    void** rawLocation = cast(void**)(cast(char*)mem_addr - pointerSize);
    size_t* alignmentLocation = cast(size_t*)(cast(char*)mem_addr - 3 * pointerSize);
595     size_t alignment = *alignmentLocation;
596     assert(alignment != 0);
597     assert(isPointerAligned(mem_addr, alignment));
598     free(*rawLocation);
599 }
600 
601 uint _MM_GET_EXCEPTION_MASK() pure @safe
602 {
603     return _mm_getcsr() & _MM_MASK_MASK;
604 }
605 
606 uint _MM_GET_EXCEPTION_STATE() pure @safe
607 {
608     return _mm_getcsr() & _MM_EXCEPT_MASK;
609 }
610 
611 uint _MM_GET_FLUSH_ZERO_MODE() pure @safe
612 {
613     return _mm_getcsr() & _MM_FLUSH_ZERO_MASK;
614 }
615 
616 uint _MM_GET_ROUNDING_MODE() pure @safe
617 {
618     return _mm_getcsr() & _MM_ROUND_MASK;
619 }
620 
621 uint _mm_getcsr() pure @safe
622 {
623     version (InlineX86Asm)
624     {
625         uint controlWord;
626         asm nothrow @nogc pure @safe
627         {
628             stmxcsr controlWord;
629         }
630         return controlWord;
631     }
632     else
633         static assert(0, "Not yet supported");
634 }
635 
636 __m64 _mm_insert_pi16 (__m64 v, int i, int index)
637 {
638     short4 r = cast(short4)v;
639     r[index & 3] = cast(short)i;
640     return cast(__m64)r;
641 }
642 unittest
643 {
644     __m64 A = _mm_set_pi16(3, 2, 1, 0);
645     short4 R = cast(short4) _mm_insert_pi16(A, 42, 1 | 4);
646     short[4] correct = [0, 42, 2, 3];
647     assert(R.array == correct);
648 }
649 
650 __m128 _mm_load_ps(const(float)*p) pure @trusted
651 {
652     return *cast(__m128*)p;
653 }
654 
655 __m128 _mm_load_ps1(const(float)*p) pure @trusted
656 {
657     return __m128(*p);
658 }
659 
660 __m128 _mm_load_ss (const(float)* mem_addr) pure @trusted
661 {
662     float[4] f = [ *mem_addr, 0.0f, 0.0f, 0.0f ];
663     return loadUnaligned!(float4)(f.ptr);
664 }
665 
666 alias _mm_load1_ps = _mm_load_ps1;
667 
668 __m128 _mm_loadh_pi (__m128 a, const(__m64)* mem_addr) pure @safe
669 {
670     long2 la = cast(long2)a;
671     la[1] = (*mem_addr)[0];
672     return cast(__m128)la;
673 }
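// The upper half of `a` is replaced by the 64 bits at `mem_addr`; here that memory holds two floats.
unittest
{
    __m128 A = _mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f);
    __m64 M = _mm_setzero_si64();
    _mm_storel_pi(&M, _mm_setr_ps(5.0f, 6.0f, 7.0f, 8.0f)); // M holds the floats 5 and 6
    __m128 R = _mm_loadh_pi(A, &M);
    float[4] correct = [1.0f, 2.0f, 5.0f, 6.0f];
    assert(R.array == correct);
}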
674 
675 __m128 _mm_loadl_pi (__m128 a, const(__m64)* mem_addr) pure @safe
676 {
677     long2 la = cast(long2)a;
678     la[0] = (*mem_addr)[0];
679     return cast(__m128)la;
680 }
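// Same idea, but for the lower half of `a`.
unittest
{
    __m128 A = _mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f);
    __m64 M = _mm_setzero_si64();
    _mm_storeh_pi(&M, _mm_setr_ps(5.0f, 6.0f, 7.0f, 8.0f)); // M holds the floats 7 and 8
    __m128 R = _mm_loadl_pi(A, &M);
    float[4] correct = [7.0f, 8.0f, 3.0f, 4.0f];
    assert(R.array == correct);
}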
681 
682 __m128 _mm_loadr_ps (const(float)* mem_addr) pure @trusted
683 {
684     __m128* aligned = cast(__m128*)mem_addr;
685     __m128 a = *aligned;
686     return shufflevector!(__m128, 3, 2, 1, 0)(a, a);
687 }
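// Aligned load with the four lanes reversed.
unittest
{
    align(16) float[4] A = [1.0f, 2.0f, 3.0f, 4.0f];
    __m128 R = _mm_loadr_ps(A.ptr);
    float[4] correct = [4.0f, 3.0f, 2.0f, 1.0f];
    assert(R.array == correct);
}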
688 
689 __m128 _mm_loadu_ps(const(float)*p) pure @safe
690 {
691     return loadUnaligned!(__m128)(p);
692 }
693 
694 __m128i _mm_loadu_si16(const(void)* mem_addr) pure @trusted
695 {
696     short r = *cast(short*)(mem_addr);
697     short8 result = [0, 0, 0, 0, 0, 0, 0, 0];
698     result[0] = r;
699     return cast(__m128i)result;
700 }
701 unittest
702 {
703     short r = 13;
704     short8 A = cast(short8) _mm_loadu_si16(&r);
705     short[8] correct = [13, 0, 0, 0, 0, 0, 0, 0];
706     assert(A.array == correct);
707 }
708 
709 __m128i _mm_loadu_si64(const(void)* mem_addr) pure @trusted
710 {
711     long r = *cast(long*)(mem_addr);
712     long2 result = [0, 0];
713     result[0] = r;
714     return cast(__m128i)result;
715 }
716 unittest
717 {
718     long r = 446446446446;
719     long2 A = cast(long2) _mm_loadu_si64(&r);
720     long[2] correct = [446446446446, 0];
721     assert(A.array == correct);
722 }
723 
/// Allocate `size` bytes of memory, aligned to the alignment specified in `alignment`,
725 /// and return a pointer to the allocated memory. `_mm_free` should be used to free
726 /// memory that is allocated with `_mm_malloc`.
727 void* _mm_malloc(size_t size, size_t alignment) @trusted
728 {
729     assert(alignment != 0);
730     size_t request = requestedSize(size, alignment);
731     void* raw = malloc(request);
732     if (request > 0 && raw == null) // malloc(0) can validly return anything
733         onOutOfMemoryError();
734     return storeRawPointerPlusInfo(raw, size, alignment); // PERF: no need to store size
735 }
736 
737 void _mm_maskmove_si64 (__m64 a, __m64 mask, char* mem_addr) @trusted
738 {
739     // this works since mask is zero-extended
740     return _mm_maskmoveu_si128 (to_m128i(a), to_m128i(mask), mem_addr);
741 }
742 
743 deprecated alias _m_maskmovq = _mm_maskmove_si64;
744 
745 __m64 _mm_max_pi16 (__m64 a, __m64 b) pure @safe
746 {
747     return to_m64(_mm_max_epi16(to_m128i(a), to_m128i(b)));
748 }
749 
750 version(LDC)
751 {
752     alias _mm_max_ps = __builtin_ia32_maxps;
753 }
754 else
755 {
756     __m128 _mm_max_ps(__m128 a, __m128 b) pure @safe
757     {
758         __m128 r;
759         r[0] = (a[0] > b[0]) ? a[0] : b[0];
760         r[1] = (a[1] > b[1]) ? a[1] : b[1];
761         r[2] = (a[2] > b[2]) ? a[2] : b[2];
762         r[3] = (a[3] > b[3]) ? a[3] : b[3];
763         return r;
764     }
765 }
766 unittest
767 {
768     __m128 A = _mm_setr_ps(1, 2, float.nan, 4);
769     __m128 B = _mm_setr_ps(4, 1, 4, float.nan);
770     __m128 M = _mm_max_ps(A, B);
771     assert(M[0] == 4);
772     assert(M[1] == 2);
773     assert(M[2] == 4);    // in case of NaN, second operand prevails (as it seems)
774     assert(M[3] != M[3]); // in case of NaN, second operand prevails (as it seems)
775 }
776 
777 __m64 _mm_max_pu8 (__m64 a, __m64 b) pure @safe
778 {
779     return to_m64(_mm_max_epu8(to_m128i(a), to_m128i(b)));
780 }
781 
782 version(LDC)
783 {
784     alias _mm_max_ss = __builtin_ia32_maxss;
785 }
786 else
787 {
788     __m128 _mm_max_ss(__m128 a, __m128 b) pure @safe
789     {
790         __m128 r = a;
791         r[0] = (a[0] > b[0]) ? a[0] : b[0];
792         return r;
793     }
794 }
795 unittest
796 {
797     __m128 A = _mm_setr_ps(1, 2, 3, 4);
798     __m128 B = _mm_setr_ps(4, 1, 4, 1);
799     __m128 C = _mm_setr_ps(float.nan, 1, 4, 1);
800     __m128 M = _mm_max_ss(A, B);
801     assert(M[0] == 4);
802     assert(M[1] == 2);
803     assert(M[2] == 3);
804     assert(M[3] == 4);
805     M = _mm_max_ps(A, C); // in case of NaN, second operand prevails
806     assert(M[0] != M[0]);
807     M = _mm_max_ps(C, A); // in case of NaN, second operand prevails
808     assert(M[0] == 1);
809 }
810 
811 __m64 _mm_min_pi16 (__m64 a, __m64 b) pure @safe
812 {
813     return to_m64(_mm_min_epi16(to_m128i(a), to_m128i(b)));
814 }
815 
816 version(LDC)
817 {
818     alias _mm_min_ps = __builtin_ia32_minps;
819 }
820 else
821 {
822     __m128 _mm_min_ps(__m128 a, __m128 b) pure @safe
823     {
824         __m128 r;
825         r[0] = (a[0] < b[0]) ? a[0] : b[0];
826         r[1] = (a[1] < b[1]) ? a[1] : b[1];
827         r[2] = (a[2] < b[2]) ? a[2] : b[2];
828         r[3] = (a[3] < b[3]) ? a[3] : b[3];
829         return r;
830     }
831 }
832 unittest
833 {
834     __m128 A = _mm_setr_ps(1, 2, float.nan, 4);
835     __m128 B = _mm_setr_ps(4, 1, 4, float.nan);
836     __m128 M = _mm_min_ps(A, B);
837     assert(M[0] == 1);
838     assert(M[1] == 1);
839     assert(M[2] == 4);    // in case of NaN, second operand prevails (as it seems)
840     assert(M[3] != M[3]); // in case of NaN, second operand prevails (as it seems)
841 }
842 
843 __m64 _mm_min_pu8 (__m64 a, __m64 b) pure @safe
844 {
845     return to_m64(_mm_min_epu8(to_m128i(a), to_m128i(b)));
846 }
847 
848 version(LDC)
849 {
850     alias _mm_min_ss = __builtin_ia32_minss;
851 }
852 else
853 {
854     __m128 _mm_min_ss(__m128 a, __m128 b) pure @safe
855     {
856         __m128 r = a;
857         r[0] = (a[0] < b[0]) ? a[0] : b[0];
858         return r;
859     }
860 }
861 unittest
862 {
863     __m128 A = _mm_setr_ps(1, 2, 3, 4);
864     __m128 B = _mm_setr_ps(4, 1, 4, 1);
865     __m128 C = _mm_setr_ps(float.nan, 1, 4, 1);
866     __m128 M = _mm_min_ss(A, B);
867     assert(M[0] == 1);
868     assert(M[1] == 2);
869     assert(M[2] == 3);
870     assert(M[3] == 4);
871     M = _mm_min_ps(A, C); // in case of NaN, second operand prevails
872     assert(M[0] != M[0]);
873     M = _mm_min_ps(C, A); // in case of NaN, second operand prevails
874     assert(M[0] == 1);
875 }
876 
877 __m128 _mm_move_ss (__m128 a, __m128 b) pure @safe
878 {
879     return shufflevector!(__m128, 4, 1, 2, 3)(a, b);
880 }
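// Lane 0 comes from `b`, the other lanes are kept from `a`.
unittest
{
    __m128 A = _mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f);
    __m128 B = _mm_setr_ps(5.0f, 6.0f, 7.0f, 8.0f);
    __m128 R = _mm_move_ss(A, B);
    float[4] correct = [5.0f, 2.0f, 3.0f, 4.0f];
    assert(R.array == correct);
}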
881 
__m128 _mm_movehl_ps (__m128 a, __m128 b) pure @safe
{
    // MOVHLPS semantics: the lower half of the result is the upper half of `b`,
    // the upper half of the result is the upper half of `a`.
    return shufflevector!(float4, 6, 7, 2, 3)(a, b);
}
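// Checks the MOVHLPS lane placement described above.
unittest
{
    __m128 A = _mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f);
    __m128 B = _mm_setr_ps(5.0f, 6.0f, 7.0f, 8.0f);
    __m128 R = _mm_movehl_ps(A, B);
    float[4] correct = [7.0f, 8.0f, 3.0f, 4.0f];
    assert(R.array == correct);
}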
886 
887 __m128 _mm_movelh_ps (__m128 a, __m128 b) pure @safe
888 {
889     return shufflevector!(float4, 0, 1, 4, 5)(a, b);
890 }
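// MOVLHPS semantics: the upper half of the result is the lower half of `b`.
unittest
{
    __m128 A = _mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f);
    __m128 B = _mm_setr_ps(5.0f, 6.0f, 7.0f, 8.0f);
    __m128 R = _mm_movelh_ps(A, B);
    float[4] correct = [1.0f, 2.0f, 5.0f, 6.0f];
    assert(R.array == correct);
}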
891 
892 int _mm_movemask_pi8 (__m64 a) pure @safe
893 {
894     return _mm_movemask_epi8(to_m128i(a));
895 }
896 unittest
897 {
898     assert(0x9C == _mm_movemask_pi8(_mm_set_pi8(-1, 0, 0, -1, -1, -1, 0, 0)));
899 }
900 
901 version(LDC)
902 {
903     alias _mm_movemask_ps = __builtin_ia32_movmskps;
904 }
905 else
906 {
907     int _mm_movemask_ps (__m128 a) pure @safe
908     {
909         int4 ai = cast(int4)a;
910         int r = 0;
911         if (ai[0] < 0) r += 1;
912         if (ai[1] < 0) r += 2;
913         if (ai[2] < 0) r += 4;
914         if (ai[3] < 0) r += 8;
915         return r;
916     }
917 }
918 unittest
919 {
920     int4 A = [-1, 0, -43, 0];
921     assert(5 == _mm_movemask_ps(cast(float4)A));
922 }
923 
924 __m128 _mm_mul_ps(__m128 a, __m128 b) pure @safe
925 {
926     return a * b;
927 }
928 unittest
929 {
930     __m128 a = [1.5f, -2.0f, 3.0f, 1.0f];
931     a = _mm_mul_ps(a, a);
932     float[4] correct = [2.25f, 4.0f, 9.0f, 1.0f];
933     assert(a.array == correct);
934 }
935 
936 __m128 _mm_mul_ss(__m128 a, __m128 b) pure @safe
937 {
938     a[0] *= b[0];
939     return a;
940 }
941 unittest
942 {
943     __m128 a = [1.5f, -2.0f, 3.0f, 1.0f];
944     a = _mm_mul_ss(a, a);
945     float[4] correct = [2.25f, -2.0f, 3.0f, 1.0f];
946     assert(a.array == correct);
947 }
948 
949 __m64 _mm_mulhi_pu16 (__m64 a, __m64 b) pure @safe
950 {
951     return to_m64(_mm_mulhi_epu16(to_m128i(a), to_m128i(b)));
952 }
953 unittest
954 {
955     __m64 A = _mm_setr_pi16(0, -16, 2, 3);
956     __m64 B = _mm_set1_pi16(16384);
957     short4 R = cast(short4)_mm_mulhi_pu16(A, B);
958     short[4] correct = [0, 0x3FFC, 0, 0];
959     assert(R.array == correct);
960 }
961 
962 __m128 _mm_or_ps (__m128 a, __m128 b) pure @safe
963 {
964     return cast(__m128)(cast(__m128i)a | cast(__m128i)b);
965 }
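// Bitwise OR on the raw lane bits.
unittest
{
    __m128i A = _mm_setr_epi32(0x0F0F_0F0F, 0, -1, 0);
    __m128i B = _mm_setr_epi32(0x3030_3030, 0, 0, -1);
    __m128i R = cast(__m128i) _mm_or_ps(cast(__m128)A, cast(__m128)B);
    int[4] correct = [0x3F3F_3F3F, 0, -1, -1];
    assert(R.array == correct);
}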
966 
967 deprecated alias 
968     _m_pavgb = _mm_avg_pu8,
969     _m_pavgw = _mm_avg_pu16,
970     _m_pextrw = _mm_extract_pi16,
971     _m_pinsrw = _mm_insert_pi16,
972     _m_pmaxsw = _mm_max_pi16,
973     _m_pmaxub = _mm_max_pu8,
974     _m_pminsw = _mm_min_pi16,
975     _m_pminub = _mm_min_pu8,
976     _m_pmovmskb = _mm_movemask_pi8,
977     _m_pmulhuw = _mm_mulhi_pu16;
978 
979 enum _MM_HINT_NTA = 0;
980 enum _MM_HINT_T0 = 1;
981 enum _MM_HINT_T1 = 2;
982 enum _MM_HINT_T2 = 3;
983 
// Note: the locality hint must be a compile-time parameter, unlike in the Intel Intrinsics API.
985 void _mm_prefetch(int locality)(void* p) pure @safe
986 {
987     llvm_prefetch(p, 0, locality, 1);
988 }
989 
990 deprecated alias
991     _m_psadbw = _mm_sad_pu8,
992     _m_pshufw = _mm_shuffle_pi16;
993 
994 version(LDC)
995 {
996     alias _mm_rcp_ps = __builtin_ia32_rcpps;
997 }
998 else
999 {
1000     __m128 _mm_rcp_ps (__m128 a) pure @safe
1001     {
1002         a[0] = 1.0f / a[0];
1003         a[1] = 1.0f / a[1];
1004         a[2] = 1.0f / a[2];
1005         a[3] = 1.0f / a[3];
1006         return a;
1007     }
1008 }
1009 
1010 version(LDC)
1011 {
1012     alias _mm_rcp_ss = __builtin_ia32_rcpss;
1013 }
1014 else
1015 {
1016     __m128 _mm_rcp_ss (__m128 a) pure @safe
1017     {
1018         a[0] = 1.0f / a[0];
1019         return a;
1020     }
1021 }
1022 
1023 version(LDC)
1024 {
1025     alias _mm_rsqrt_ps = __builtin_ia32_rsqrtps;
1026 }
1027 else
1028 {
1029     __m128 _mm_rsqrt_ps (__m128 a) pure @safe
1030     {
1031         a[0] = 1.0f / sqrt(a[0]);
1032         a[1] = 1.0f / sqrt(a[1]);
1033         a[2] = 1.0f / sqrt(a[2]);
1034         a[3] = 1.0f / sqrt(a[3]);
1035         return a;
1036     }
1037 }
1038 
1039 version(LDC)
1040 {
1041     alias _mm_rsqrt_ss = __builtin_ia32_rsqrtss;
1042 }
1043 else
1044 {
1045     __m128 _mm_rsqrt_ss (__m128 a) pure @safe
1046     {
1047         a[0] = 1.0f / sqrt(a[0]);
1048         return a;
1049     }
1050 }
1051 
1052 unittest
1053 {
1054     double maxRelativeError = 0.000245; // -72 dB
1055     void testInvSqrt(float number)
1056     {
1057         __m128 A = _mm_set1_ps(number);
1058 
1059         // test _mm_rcp_ps
1060         __m128 B = _mm_rcp_ps(A);
1061         foreach(i; 0..4)
1062         {
1063             double exact = 1.0f / A[i];
1064             double ratio = cast(double)(B[i]) / cast(double)(exact);
1065             assert(fabs(ratio - 1) <= maxRelativeError);
1066         }
1067 
1068         // test _mm_rcp_ss
1069         {
1070             B = _mm_rcp_ss(A);
1071             double exact = 1.0f / A[0];
1072             double ratio = cast(double)(B[0]) / cast(double)(exact);
1073             assert(fabs(ratio - 1) <= maxRelativeError);
1074         }
1075 
1076         // test _mm_rsqrt_ps
1077         B = _mm_rsqrt_ps(A);
1078         foreach(i; 0..4)
1079         {
1080             double exact = 1.0f / sqrt(A[i]);
1081             double ratio = cast(double)(B[i]) / cast(double)(exact);
1082             assert(fabs(ratio - 1) <= maxRelativeError);
1083         }
1084 
1085         // test _mm_rsqrt_ss
1086         {
1087             B = _mm_rsqrt_ss(A);
1088             double exact = 1.0f / sqrt(A[0]);
1089             double ratio = cast(double)(B[0]) / cast(double)(exact);
1090             assert(fabs(ratio - 1) <= maxRelativeError);
1091         }
1092     }
1093 
1094     testInvSqrt(1.1f);
1095     testInvSqrt(2.45674864151f);
1096     testInvSqrt(27841456468.0f);
1097 }
1098 
1099 __m64 _mm_sad_pu8 (__m64 a, __m64 b) pure @safe
1100 {
1101     return to_m64(_mm_sad_epu8(to_m128i(a), to_m128i(b)));
1102 }
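// Sum of absolute differences of the 8 unsigned bytes, stored in the low 16 bits of the result.
unittest
{
    __m64 A = _mm_setr_pi8(2, 4, 8, 0, 0, 0, 0, 0);
    __m64 B = _mm_setr_pi8(1, 6, 8, 0, 0, 0, 0, 0);
    short4 R = cast(short4) _mm_sad_pu8(A, B); // |2-1| + |4-6| + |8-8| = 3
    short[4] correct = [3, 0, 0, 0];
    assert(R.array == correct);
}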
1103 
1104 void _MM_SET_EXCEPTION_MASK(int _MM_MASK_xxxx) pure @safe
1105 {
1106     _mm_setcsr((_mm_getcsr() & ~_MM_MASK_MASK) | _MM_MASK_xxxx);
1107 }
1108 
1109 void _MM_SET_EXCEPTION_STATE(int _MM_EXCEPT_xxxx) pure @safe
1110 {
1111     _mm_setcsr((_mm_getcsr() & ~_MM_EXCEPT_MASK) | _MM_EXCEPT_xxxx);
1112 }
1113 
1114 void _MM_SET_FLUSH_ZERO_MODE(int _MM_FLUSH_xxxx) pure @safe
1115 {
1116     _mm_setcsr((_mm_getcsr() & ~_MM_FLUSH_ZERO_MASK) | _MM_FLUSH_xxxx);
1117 }
1118 
1119 __m128 _mm_set_ps (float e3, float e2, float e1, float e0) pure @trusted
1120 {
1121     // Note: despite appearances, generates sensible code,
1122     //       inlines correctly and is constant folded
1123     float[4] result = [e0, e1, e2, e3];
1124     return loadUnaligned!(float4)(result.ptr);
1125 }
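// `e0` is the lowest lane, so the arguments appear reversed in memory order.
unittest
{
    __m128 A = _mm_set_ps(3.0f, 2.0f, 1.0f, 546.0f);
    float[4] correct = [546.0f, 1.0f, 2.0f, 3.0f];
    assert(A.array == correct);
}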
1126 
1127 alias _mm_set_ps1 = _mm_set1_ps;
1128 
1129 void _MM_SET_ROUNDING_MODE(int _MM_ROUND_xxxx) pure @safe
1130 {
1131     _mm_setcsr((_mm_getcsr() & ~_MM_ROUND_MASK) | _MM_ROUND_xxxx);
1132 }
1133 
1134 __m128 _mm_set_ss (float a) pure @trusted
1135 {
1136     __m128 r = _mm_setzero_ps();
1137     r[0] = a;
1138     return r;
1139 }
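// The scalar goes into lane 0, the other lanes are zeroed.
unittest
{
    __m128 A = _mm_set_ss(42.0f);
    float[4] correct = [42.0f, 0.0f, 0.0f, 0.0f];
    assert(A.array == correct);
}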
1140 
1141 __m128 _mm_set1_ps (float a) pure @trusted
1142 {
1143     return __m128(a);
1144 }
1145 
1146 void _mm_setcsr(uint controlWord) pure @safe
1147 {
1148     version (InlineX86Asm)
1149     {
1150         asm pure nothrow @nogc @safe
1151         {
1152             ldmxcsr controlWord;
1153         }
1154     }
1155     else
1156         static assert(0, "Not yet supported");
1157 }
1158 
1159 __m128 _mm_setr_ps (float e3, float e2, float e1, float e0) pure @trusted
1160 {
1161     float[4] result = [e3, e2, e1, e0];
1162     return loadUnaligned!(float4)(result.ptr);
1163 }
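// Unlike `_mm_set_ps`, the first argument ends up in the lowest lane.
unittest
{
    __m128 A = _mm_setr_ps(3.0f, 2.0f, 1.0f, 546.0f);
    float[4] correct = [3.0f, 2.0f, 1.0f, 546.0f];
    assert(A.array == correct);
}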
1164 
1165 __m128 _mm_setzero_ps() pure @trusted
1166 {
1167     // Compiles to xorps without problems
1168     float[4] result = [0.0f, 0.0f, 0.0f, 0.0f];
1169     return loadUnaligned!(float4)(result.ptr);
1170 }
1171 
1172 version(LDC)
1173 {
1174     alias _mm_sfence = __builtin_ia32_sfence;
1175 }
1176 else
1177 {
1178     void _mm_sfence() pure @safe
1179     {
1180         asm nothrow @nogc pure @safe
1181         {
1182             sfence;
1183         }
1184     }
1185 }
1186 unittest
1187 {
1188     _mm_sfence();
1189 }
1190 
1191 __m64 _mm_shuffle_pi16(int imm8)(__m64 a) pure @safe
1192 {
1193     return cast(__m64) shufflevector!(short4, ( (imm8 >> 0) & 3 ),
1194                                               ( (imm8 >> 2) & 3 ),
1195                                               ( (imm8 >> 4) & 3 ),
1196                                               ( (imm8 >> 6) & 3 ))(cast(short4)a, cast(short4)a);
1197 }
1198 unittest
1199 {
1200     __m64 A = _mm_setr_pi16(0, 1, 2, 3);
1201     enum int SHUFFLE = _MM_SHUFFLE(0, 1, 2, 3);
1202     short4 B = cast(short4) _mm_shuffle_pi16!SHUFFLE(A);
1203     short[4] expectedB = [ 3, 2, 1, 0 ];
1204     assert(B.array == expectedB);
1205 }
1206 
1207 // Note: the immediate shuffle value is given at compile-time instead of runtime.
1208 __m128 _mm_shuffle_ps(ubyte imm)(__m128 a, __m128 b) pure @safe
1209 {
1210     return shufflevector!(__m128, imm & 3, (imm>>2) & 3, 4 + ((imm>>4) & 3), 4 + ((imm>>6) & 3) )(a, b);
1211 }
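// 0xC9 is _MM_SHUFFLE(3, 0, 2, 1): the low two lanes come from `a`, the high two from `b`.
unittest
{
    __m128 A = _mm_setr_ps(0.0f, 1.0f, 2.0f, 3.0f);
    __m128 B = _mm_setr_ps(4.0f, 5.0f, 6.0f, 7.0f);
    __m128 R = _mm_shuffle_ps!0xC9(A, B); // a[1], a[2], b[0], b[3]
    float[4] correct = [1.0f, 2.0f, 4.0f, 7.0f];
    assert(R.array == correct);
}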
1212 
1213 version(LDC)
1214 {
1215     // Disappeared with LDC 1.11
1216     static if (__VERSION__ < 2081)
1217         alias _mm_sqrt_ps = __builtin_ia32_sqrtps;
1218     else
1219     {
1220         __m128 _mm_sqrt_ps(__m128 vec) pure @safe
1221         {
1222             vec.array[0] = llvm_sqrt(vec.array[0]);
1223             vec.array[1] = llvm_sqrt(vec.array[1]);
1224             vec.array[2] = llvm_sqrt(vec.array[2]);
1225             vec.array[3] = llvm_sqrt(vec.array[3]);
1226             return vec;
1227         }
1228     }
1229 }
1230 else
1231 {
1232     __m128 _mm_sqrt_ps(__m128 vec) pure @safe
1233     {
1234         vec.array[0] = sqrt(vec.array[0]);
1235         vec.array[1] = sqrt(vec.array[1]);
1236         vec.array[2] = sqrt(vec.array[2]);
1237         vec.array[3] = sqrt(vec.array[3]);
1238         return vec;
1239     }
1240 }
1241 unittest
1242 {
1243     __m128 A = _mm_sqrt_ps(_mm_set1_ps(4.0f));
1244     assert(A.array[0] == 2.0f);
1245     assert(A.array[1] == 2.0f);
1246     assert(A.array[2] == 2.0f);
1247     assert(A.array[3] == 2.0f);
1248 }
1249 
1250 version(LDC)
1251 {
1252     // Disappeared with LDC 1.11
1253     static if (__VERSION__ < 2081)
1254         alias _mm_sqrt_ss = __builtin_ia32_sqrtss;
1255     else
1256     {
1257         __m128 _mm_sqrt_ss(__m128 vec) pure @safe
1258         {
1259             vec.array[0] = llvm_sqrt(vec.array[0]);
1260             vec.array[1] = vec.array[1];
1261             vec.array[2] = vec.array[2];
1262             vec.array[3] = vec.array[3];
1263             return vec;
1264         }
1265     }
1266 }
1267 else
1268 {
1269     __m128 _mm_sqrt_ss(__m128 vec) pure @safe
1270     {
1271         vec.array[0] = sqrt(vec.array[0]);
1272         return vec;
1273     }
1274 }
1275 unittest
1276 {
1277     __m128 A = _mm_sqrt_ss(_mm_set1_ps(4.0f));
1278     assert(A.array[0] == 2.0f);
1279     assert(A.array[1] == 4.0f);
1280     assert(A.array[2] == 4.0f);
1281     assert(A.array[3] == 4.0f);
1282 }
1283 
1284 void _mm_store_ps (float* mem_addr, __m128 a) pure // not safe since nothing guarantees alignment
1285 {
1286     __m128* aligned = cast(__m128*)mem_addr;
1287     *aligned = a;
1288 }
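// Aligned store into a 16-byte aligned buffer.
unittest
{
    align(16) float[4] A;
    _mm_store_ps(A.ptr, _mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f));
    float[4] correct = [1.0f, 2.0f, 3.0f, 4.0f];
    assert(A == correct);
}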
1289 
1290 alias _mm_store_ps1 = _mm_store1_ps;
1291 
1292 void _mm_store_ss (float* mem_addr, __m128 a) pure @safe
1293 {
1294     *mem_addr = a[0];
1295 }
1296 unittest
1297 {
1298     float a;
1299     _mm_store_ss(&a, _mm_set_ps(3, 2, 1, 546));
1300     assert(a == 546);
1301 }
1302 
1303 void _mm_store1_ps (float* mem_addr, __m128 a) pure // not safe since nothing guarantees alignment
1304 {
1305     __m128* aligned = cast(__m128*)mem_addr;
1306     *aligned = shufflevector!(__m128, 0, 0, 0, 0)(a, a);
1307 }
1308 
1309 void _mm_storeh_pi(__m64* p, __m128 a) pure @safe
1310 {
1311     long2 la = cast(long2)a;
1312     (*p)[0] = la[1];
1313 }
1314 unittest
1315 {
1316     __m64 R = _mm_setzero_si64();
1317     long2 A = [13, 25];
1318     _mm_storeh_pi(&R, cast(__m128)A);
1319     assert(R[0] == 25);
1320 }
1321 
1322 void _mm_storel_pi(__m64* p, __m128 a) pure @safe
1323 {
1324     long2 la = cast(long2)a;
1325     (*p)[0] = la[0];
1326 }
1327 unittest
1328 {
1329     __m64 R = _mm_setzero_si64();
1330     long2 A = [13, 25];
1331     _mm_storel_pi(&R, cast(__m128)A);
1332     assert(R[0] == 13);
1333 }
1334 
1335 void _mm_storer_ps(float* mem_addr, __m128 a) pure // not safe since nothing guarantees alignment
1336 {
1337     __m128* aligned = cast(__m128*)mem_addr;
1338     *aligned = shufflevector!(__m128, 3, 2, 1, 0)(a, a);
1339 }
1340 
1341 void _mm_storeu_ps(float* mem_addr, __m128 a) pure @safe
1342 {
1343     storeUnaligned!(float4)(a, mem_addr);
1344 }
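// Unaligned store, no alignment requirement on the destination.
unittest
{
    float[4] A;
    _mm_storeu_ps(A.ptr, _mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f));
    float[4] correct = [1.0f, 2.0f, 3.0f, 4.0f];
    assert(A == correct);
}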
1345 
1346 void _mm_stream_pi (__m64* mem_addr, __m64 a)
1347 {
    // BUG: see `_mm_stream_ps` for an explanation of why we don't implement non-temporal moves
1349     *mem_addr = a; // it's a regular move instead
1350 }
1351 
1352 // BUG: can't implement non-temporal store with LDC inlineIR since !nontemporal
1353 // needs some IR outside this function that would say:
1354 //
1355 //  !0 = !{ i32 1 }
1356 //
1357 // It's a LLVM IR metadata description.
1358 // Regardless, non-temporal moves are really dangerous for performance...
1359 void _mm_stream_ps (float* mem_addr, __m128 a)
1360 {
1361     __m128* dest = cast(__m128*)mem_addr;
1362     *dest = a; // it's a regular move instead
1363 }
1364 unittest
1365 {
1366     align(16) float[4] A;
1367     _mm_stream_ps(A.ptr, _mm_set1_ps(78.0f));
1368     assert(A[0] == 78.0f && A[1] == 78.0f && A[2] == 78.0f && A[3] == 78.0f);
1369 }
1370 
1371 __m128 _mm_sub_ps(__m128 a, __m128 b) pure @safe
1372 {
1373     return a - b;
1374 }
1375 unittest
1376 {
1377     __m128 a = [1.5f, -2.0f, 3.0f, 1.0f];
1378     a = _mm_sub_ps(a, a);
1379     float[4] correct = [0.0f, 0.0f, 0.0f, 0.0f];
1380     assert(a.array == correct);
1381 }
1382 
1383 __m128 _mm_sub_ss(__m128 a, __m128 b) pure @safe
1384 {
1385     a[0] -= b[0];
1386     return a;
1387 }
1388 unittest
1389 {
1390     __m128 a = [1.5f, -2.0f, 3.0f, 1.0f];
1391     a = _mm_sub_ss(a, a);
1392     float[4] correct = [0.0f, -2.0, 3.0f, 1.0f];
1393     assert(a.array == correct);
1394 }
1395 
1396 
1397 void _MM_TRANSPOSE4_PS (ref __m128 row0, ref __m128 row1, ref __m128 row2, ref __m128 row3) pure @safe
1398 {
1399     __m128 tmp3, tmp2, tmp1, tmp0;
1400     tmp0 = _mm_unpacklo_ps(row0, row1);
1401     tmp2 = _mm_unpacklo_ps(row2, row3);
1402     tmp1 = _mm_unpackhi_ps(row0, row1);
1403     tmp3 = _mm_unpackhi_ps(row2, row3);
1404     row0 = _mm_movelh_ps(tmp0, tmp2);
1405     row1 = _mm_movehl_ps(tmp2, tmp0);
1406     row2 = _mm_movelh_ps(tmp1, tmp3);
1407     row3 = _mm_movehl_ps(tmp3, tmp1);
1408 }
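// Sanity check of the classic 4x4 transpose.
unittest
{
    __m128 l0 = _mm_setr_ps( 0.0f,  1.0f,  2.0f,  3.0f);
    __m128 l1 = _mm_setr_ps( 4.0f,  5.0f,  6.0f,  7.0f);
    __m128 l2 = _mm_setr_ps( 8.0f,  9.0f, 10.0f, 11.0f);
    __m128 l3 = _mm_setr_ps(12.0f, 13.0f, 14.0f, 15.0f);
    _MM_TRANSPOSE4_PS(l0, l1, l2, l3);
    float[4] c0 = [0.0f, 4.0f,  8.0f, 12.0f];
    float[4] c1 = [1.0f, 5.0f,  9.0f, 13.0f];
    float[4] c2 = [2.0f, 6.0f, 10.0f, 14.0f];
    float[4] c3 = [3.0f, 7.0f, 11.0f, 15.0f];
    assert(l0.array == c0 && l1.array == c1 && l2.array == c2 && l3.array == c3);
}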
1409 
// Note: the only difference between these intrinsics and their `comi` counterparts
//       is the signalling behaviour on quiet NaNs. Aliasing them is therefore incorrect,
//       but wanting to distinguish qNaN from sNaN and treat them differently on purpose
//       seems extremely rare.
1414 alias _mm_ucomieq_ss = _mm_comieq_ss;
1415 alias _mm_ucomige_ss = _mm_comige_ss;
1416 alias _mm_ucomigt_ss = _mm_comigt_ss;
1417 alias _mm_ucomile_ss = _mm_comile_ss;
1418 alias _mm_ucomilt_ss = _mm_comilt_ss;
1419 alias _mm_ucomineq_ss = _mm_comineq_ss;
1420 
1421 
1422 __m128 _mm_undefined_ps() pure @safe
1423 {
1424     __m128 undef = void;
1425     return undef;
1426 }
1427 
1428 __m128 _mm_unpackhi_ps (__m128 a, __m128 b) pure @safe
1429 {
1430     return shufflevector!(float4, 2, 6, 3, 7)(a, b);
1431 }
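// Interleaves the upper two lanes of `a` and `b`.
unittest
{
    __m128 A = _mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f);
    __m128 B = _mm_setr_ps(5.0f, 6.0f, 7.0f, 8.0f);
    __m128 R = _mm_unpackhi_ps(A, B);
    float[4] correct = [3.0f, 7.0f, 4.0f, 8.0f];
    assert(R.array == correct);
}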
1432 
1433 __m128 _mm_unpacklo_ps (__m128 a, __m128 b) pure @safe
1434 {
1435     return shufflevector!(float4, 0, 4, 1, 5)(a, b);
1436 }
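// Interleaves the lower two lanes of `a` and `b`.
unittest
{
    __m128 A = _mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f);
    __m128 B = _mm_setr_ps(5.0f, 6.0f, 7.0f, 8.0f);
    __m128 R = _mm_unpacklo_ps(A, B);
    float[4] correct = [1.0f, 5.0f, 2.0f, 6.0f];
    assert(R.array == correct);
}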
1437 
1438 __m128 _mm_xor_ps (__m128 a, __m128 b) pure @safe
1439 {
1440     return cast(__m128)(cast(__m128i)a ^ cast(__m128i)b);
1441 }
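// Bitwise XOR on the raw lane bits; XOR-ing a value with itself gives an all-zero lane.
unittest
{
    __m128i A = _mm_setr_epi32(0x0F0F_0F0F, -1, 0, 0x5555_5555);
    __m128i B = _mm_setr_epi32(0x3333_3333, -1, -1, 0x5555_5555);
    __m128i R = cast(__m128i) _mm_xor_ps(cast(__m128)A, cast(__m128)B);
    int[4] correct = [0x3C3C_3C3C, 0, -1, 0];
    assert(R.array == correct);
}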
1442 
1443 
1444 private
1445 {
1446     /// Returns: `true` if the pointer is suitably aligned.
1447     bool isPointerAligned(void* p, size_t alignment) pure
1448     {
1449         assert(alignment != 0);
1450         return ( cast(size_t)p & (alignment - 1) ) == 0;
1451     }
1452 
1453     /// Returns: next pointer aligned with alignment bytes.
1454     void* nextAlignedPointer(void* start, size_t alignment) pure
1455     {
1456         return cast(void*)nextMultipleOf(cast(size_t)(start), alignment);
1457     }
1458 
1459     // Returns number of bytes to actually allocate when asking
1460     // for a particular alignment
1461     @nogc size_t requestedSize(size_t askedSize, size_t alignment) pure
1462     {
1463         enum size_t pointerSize = size_t.sizeof;
1464         return askedSize + alignment - 1 + pointerSize * 3;
1465     }
1466 
    // Store the pointer given by malloc, plus the size and alignment
1468     @nogc void* storeRawPointerPlusInfo(void* raw, size_t size, size_t alignment) pure
1469     {
1470         enum size_t pointerSize = size_t.sizeof;
1471         char* start = cast(char*)raw + pointerSize * 3;
1472         void* aligned = nextAlignedPointer(start, alignment);
1473         void** rawLocation = cast(void**)(cast(char*)aligned - pointerSize);
1474         *rawLocation = raw;
1475         size_t* sizeLocation = cast(size_t*)(cast(char*)aligned - 2 * pointerSize);
1476         *sizeLocation = size;
1477         size_t* alignmentLocation = cast(size_t*)(cast(char*)aligned - 3 * pointerSize);
1478         *alignmentLocation = alignment;
1479         assert( isPointerAligned(aligned, alignment) );
1480         return aligned;
1481     }
1482 
1483     // Returns: x, multiple of powerOfTwo, so that x >= n.
1484     @nogc size_t nextMultipleOf(size_t n, size_t powerOfTwo) pure nothrow
1485     {
1486         // check power-of-two
1487         assert( (powerOfTwo != 0) && ((powerOfTwo & (powerOfTwo - 1)) == 0));
1488 
1489         size_t mask = ~(powerOfTwo - 1);
1490         return (n + powerOfTwo - 1) & mask;
1491     }
1492 }
1493 
1494 unittest
1495 {
1496     assert(nextMultipleOf(0, 4) == 0);
1497     assert(nextMultipleOf(1, 4) == 4);
1498     assert(nextMultipleOf(2, 4) == 4);
1499     assert(nextMultipleOf(3, 4) == 4);
1500     assert(nextMultipleOf(4, 4) == 4);
1501     assert(nextMultipleOf(5, 4) == 8);
1502 
1503     {
1504         void* p = _mm_malloc(23, 16);
1505         assert(p !is null);
1506         assert(((cast(size_t)p) & 0xf) == 0);
1507         _mm_free(p);
1508     }
1509 
1510     void* nullAlloc = _mm_malloc(0, 32);
1511     assert(nullAlloc != null);
1512     _mm_free(nullAlloc);
1513 }