1 /**
2 * Copyright: Copyright Auburn Sounds 2016-2019.
3 * License:   $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
4 * Authors:   Guillaume Piolat
5 */
6 module inteli.xmmintrin;
7 
8 public import inteli.types;
9 
10 import inteli.internals;
11 
12 import inteli.mmx;
13 import inteli.emmintrin;
14 
15 import core.stdc.stdlib: malloc, free;
16 import core.exception: onOutOfMemoryError;
17 
18 version(D_InlineAsm_X86)
19     version = InlineX86Asm;
20 else version(D_InlineAsm_X86_64)
21     version = InlineX86Asm;
22 
23 
24 // SSE1
25 
26 nothrow @nogc:
27 
28 
29 enum int _MM_EXCEPT_INVALID    = 0x0001;
30 enum int _MM_EXCEPT_DENORM     = 0x0002;
31 enum int _MM_EXCEPT_DIV_ZERO   = 0x0004;
32 enum int _MM_EXCEPT_OVERFLOW   = 0x0008;
33 enum int _MM_EXCEPT_UNDERFLOW  = 0x0010;
34 enum int _MM_EXCEPT_INEXACT    = 0x0020;
35 enum int _MM_EXCEPT_MASK       = 0x003f;
36 
37 enum int _MM_MASK_INVALID      = 0x0080;
38 enum int _MM_MASK_DENORM       = 0x0100;
39 enum int _MM_MASK_DIV_ZERO     = 0x0200;
40 enum int _MM_MASK_OVERFLOW     = 0x0400;
41 enum int _MM_MASK_UNDERFLOW    = 0x0800;
42 enum int _MM_MASK_INEXACT      = 0x1000;
43 enum int _MM_MASK_MASK         = 0x1f80;
44 
45 enum int _MM_ROUND_NEAREST     = 0x0000;
46 enum int _MM_ROUND_DOWN        = 0x2000;
47 enum int _MM_ROUND_UP          = 0x4000;
48 enum int _MM_ROUND_TOWARD_ZERO = 0x6000;
49 enum int _MM_ROUND_MASK        = 0x6000;
50 
51 enum int _MM_FLUSH_ZERO_MASK   = 0x8000;
52 enum int _MM_FLUSH_ZERO_ON     = 0x8000;
53 enum int _MM_FLUSH_ZERO_OFF    = 0x0000;
54 
55 __m128 _mm_add_ps(__m128 a, __m128 b) pure @safe
56 {
57     return a + b;
58 }
59 
60 unittest
61 {
62     __m128 a = [1, 2, 3, 4];
63     a = _mm_add_ps(a, a);
64     assert(a.array[0] == 2);
65     assert(a.array[1] == 4);
66     assert(a.array[2] == 6);
67     assert(a.array[3] == 8);
68 }
69 
70 __m128 _mm_add_ss(__m128 a, __m128 b) pure @safe
71 {
72     static if (GDC_with_SSE)
73         return __builtin_ia32_addss(a, b);
74     else
75     {
76         a[0] += b[0];
77         return a;
78     }
79 }
80 unittest
81 {
82     __m128 a = [1, 2, 3, 4];
83     a = _mm_add_ss(a, a);
84     assert(a.array == [2.0f, 2, 3, 4]);
85 }
86 
87 __m128 _mm_and_ps (__m128 a, __m128 b) pure @safe
88 {
89     return cast(__m128)(cast(__m128i)a & cast(__m128i)b);
90 }
91 unittest
92 {
93     // Note: tested in emmintrin.d
94 }
95 
96 __m128 _mm_andnot_ps (__m128 a, __m128 b) pure @safe
97 {
98     return cast(__m128)( (~cast(__m128i)a) & cast(__m128i)b );
99 }
100 
/// Average packed unsigned 16-bit integers in `a` and `b`.
102 __m64 _mm_avg_pu16 (__m64 a, __m64 b) pure @safe
103 {
104     return to_m64(_mm_avg_epu16(to_m128i(a), to_m128i(b)));
105 }
106 
/// Average packed unsigned 8-bit integers in `a` and `b`.
108 __m64 _mm_avg_pu8 (__m64 a, __m64 b) pure @safe
109 {
110     return to_m64(_mm_avg_epu8(to_m128i(a), to_m128i(b)));
111 }
112 
113 __m128 _mm_cmpeq_ps (__m128 a, __m128 b) pure @safe
114 {
115     return cast(__m128) cmpps!(FPComparison.oeq)(a, b);
116 }
117 
118 __m128 _mm_cmpeq_ss (__m128 a, __m128 b) pure @safe
119 {
120     return cast(__m128) cmpss!(FPComparison.oeq)(a, b);
121 }
122 
123 __m128 _mm_cmpge_ps (__m128 a, __m128 b) pure @safe
124 {
125     return cast(__m128) cmpps!(FPComparison.oge)(a, b);
126 }
127 unittest
128 {
129     __m128i R = cast(__m128i) _mm_cmpge_ps(_mm_setr_ps(0, 1, -1, float.nan),
130                                            _mm_setr_ps(0, 0, 0, 0));
131     int[4] correct = [-1, -1, 0, 0];
132     assert(R.array == correct);
133 }
134 
135 __m128 _mm_cmpge_ss (__m128 a, __m128 b) pure @safe
136 {
137     return cast(__m128) cmpss!(FPComparison.oge)(a, b);
138 }
139 
140 __m128 _mm_cmpgt_ps (__m128 a, __m128 b) pure @safe
141 {
142     return cast(__m128) cmpps!(FPComparison.ogt)(a, b);
143 }
144 
145 __m128 _mm_cmpgt_ss (__m128 a, __m128 b) pure @safe
146 {
147     return cast(__m128) cmpss!(FPComparison.ogt)(a, b);
148 }
149 
150 __m128 _mm_cmple_ps (__m128 a, __m128 b) pure @safe
151 {
152     return cast(__m128) cmpps!(FPComparison.ole)(a, b);
153 }
154 
155 __m128 _mm_cmple_ss (__m128 a, __m128 b) pure @safe
156 {
157     return cast(__m128) cmpss!(FPComparison.ole)(a, b);
158 }
159 
160 __m128 _mm_cmplt_ps (__m128 a, __m128 b) pure @safe
161 {
162     return cast(__m128) cmpps!(FPComparison.olt)(a, b);
163 }
164 
165 __m128 _mm_cmplt_ss (__m128 a, __m128 b) pure @safe
166 {
167     return cast(__m128) cmpss!(FPComparison.olt)(a, b);
168 }
169 
170 __m128 _mm_cmpneq_ps (__m128 a, __m128 b) pure @safe
171 {
172     return cast(__m128) cmpps!(FPComparison.une)(a, b);
173 }
174 
175 __m128 _mm_cmpneq_ss (__m128 a, __m128 b) pure @safe
176 {
177     return cast(__m128) cmpss!(FPComparison.une)(a, b);
178 }
179 
180 __m128 _mm_cmpnge_ps (__m128 a, __m128 b) pure @safe
181 {
182     return cast(__m128) cmpps!(FPComparison.ult)(a, b);
183 }
184 
185 __m128 _mm_cmpnge_ss (__m128 a, __m128 b) pure @safe
186 {
187     return cast(__m128) cmpss!(FPComparison.ult)(a, b);
188 }
189 
190 __m128 _mm_cmpngt_ps (__m128 a, __m128 b) pure @safe
191 {
192     return cast(__m128) cmpps!(FPComparison.ule)(a, b);
193 }
194 
195 __m128 _mm_cmpngt_ss (__m128 a, __m128 b) pure @safe
196 {
197     return cast(__m128) cmpss!(FPComparison.ule)(a, b);
198 }
199 
200 __m128 _mm_cmpnle_ps (__m128 a, __m128 b) pure @safe
201 {
202     return cast(__m128) cmpps!(FPComparison.ugt)(a, b);
203 }
204 
205 __m128 _mm_cmpnle_ss (__m128 a, __m128 b) pure @safe
206 {
207     return cast(__m128) cmpss!(FPComparison.ugt)(a, b);
208 }
209 
210 __m128 _mm_cmpnlt_ps (__m128 a, __m128 b) pure @safe
211 {
212     return cast(__m128) cmpps!(FPComparison.uge)(a, b);
213 }
214 
215 __m128 _mm_cmpnlt_ss (__m128 a, __m128 b) pure @safe
216 {
217     return cast(__m128) cmpss!(FPComparison.uge)(a, b);
218 }
219 
220 __m128 _mm_cmpord_ps (__m128 a, __m128 b) pure @safe
221 {
222     return cast(__m128) cmpps!(FPComparison.ord)(a, b);
223 }
224 
225 __m128 _mm_cmpord_ss (__m128 a, __m128 b) pure @safe
226 {
227     return cast(__m128) cmpss!(FPComparison.ord)(a, b);
228 }
229 
230 __m128 _mm_cmpunord_ps (__m128 a, __m128 b) pure @safe
231 {
232     return cast(__m128) cmpps!(FPComparison.uno)(a, b);
233 }
234 
235 __m128 _mm_cmpunord_ss (__m128 a, __m128 b) pure @safe
236 {
237     return cast(__m128) cmpss!(FPComparison.uno)(a, b);
238 }
239 
// Note: we've reverted clang and GCC behaviour with regards to EFLAGS.
// Some of these comparisons yield true for NaNs, others don't.
242 
243 int _mm_comieq_ss (__m128 a, __m128 b) pure @safe // comiss + sete
244 {
245     return comss!(FPComparison.ueq)(a, b); // yields true for NaN!
246 }
247 
248 int _mm_comige_ss (__m128 a, __m128 b) pure @safe // comiss + setae
249 {
250     return comss!(FPComparison.oge)(a, b);
251 }
252 
253 int _mm_comigt_ss (__m128 a, __m128 b) pure @safe // comiss + seta
254 {
255     return comss!(FPComparison.ogt)(a, b);
256 }
257 
258 int _mm_comile_ss (__m128 a, __m128 b) pure @safe // comiss + setbe
259 {
260     return comss!(FPComparison.ule)(a, b); // yields true for NaN!
261 }
262 
263 int _mm_comilt_ss (__m128 a, __m128 b) pure @safe // comiss + setb
264 {
265     return comss!(FPComparison.ult)(a, b); // yields true for NaN!
266 }
267 
268 int _mm_comineq_ss (__m128 a, __m128 b) pure @safe // comiss + setne
269 {
270     return comss!(FPComparison.one)(a, b);
271 }
272 
273 alias _mm_cvt_pi2ps = _mm_cvtpi32_ps;
274 
275 __m64 _mm_cvt_ps2pi (__m128 a) @safe
276 {
277     return to_m64(_mm_cvtps_epi32(a));
278 }
279 
280 __m128 _mm_cvt_si2ss(__m128 v, int x) pure @trusted
281 {
282     v.ptr[0] = cast(float)x;
283     return v;
284 }
285 unittest
286 {
287     __m128 a = _mm_cvt_si2ss(_mm_set1_ps(0.0f), 42);
288     assert(a.array == [42f, 0, 0, 0]);
289 }
290 
291 
292 __m128 _mm_cvtpi16_ps (__m64 a) pure @safe
293 {
294     __m128i ma = to_m128i(a);
295     ma = _mm_unpacklo_epi16(ma, _mm_setzero_si128()); // Zero-extend to 32-bit
296     ma = _mm_srai_epi32(_mm_slli_epi32(ma, 16), 16); // Replicate sign bit
297     return _mm_cvtepi32_ps(ma);
298 }
299 unittest
300 {
301     __m64 A = _mm_setr_pi16(-1, 2, -3, 4);
302     __m128 R = _mm_cvtpi16_ps(A);
303     float[4] correct = [-1.0f, 2.0f, -3.0f, 4.0f];
304     assert(R.array == correct);
305 }
306 
307 __m128 _mm_cvtpi32_ps (__m128 a, __m64 b) pure @trusted
308 {
309     __m128 fb = _mm_cvtepi32_ps(to_m128i(b));
310     a.ptr[0] = fb.array[0];
311     a.ptr[1] = fb.array[1];
312     return a;
313 }
314 unittest
315 {
316     __m128 R = _mm_cvtpi32_ps(_mm_set1_ps(4.0f), _mm_setr_pi32(1, 2));
317     float[4] correct = [1.0f, 2.0f, 4.0f, 4.0f];
318     assert(R.array == correct);
319 }
320 
321 
322 __m128 _mm_cvtpi32x2_ps (__m64 a, __m64 b) pure @trusted
323 {
324     long2 l;
325     l.ptr[0] = a.array[0];
326     l.ptr[1] = b.array[0];
327     return _mm_cvtepi32_ps(cast(__m128i)l);
328 }
329 
330 __m128 _mm_cvtpi8_ps (__m64 a) pure @safe
331 {
332     __m128i b = to_m128i(a); 
333 
334     // Zero extend to 32-bit
335     b = _mm_unpacklo_epi8(b, _mm_setzero_si128());
336     b = _mm_unpacklo_epi16(b, _mm_setzero_si128());
337 
    // Replicate sign bit
    b = _mm_srai_epi32(_mm_slli_epi32(b, 24), 24);
340     return _mm_cvtepi32_ps(b);
341 }
342 unittest
343 {
344     __m64 A = _mm_setr_pi8(-1, 2, -3, 4, 0, 0, 0, 0);
345     __m128 R = _mm_cvtpi8_ps(A);
346     float[4] correct = [-1.0f, 2.0f, -3.0f, 4.0f];
347     assert(R.array == correct);
348 }
349 
350 __m64 _mm_cvtps_pi16 (__m128 a) @safe
351 {
    // The C++ version of this intrinsic converts the floats to 32-bit integers first,
    // then uses packssdw, so the 16-bit results are saturated.
354     __m128i b = _mm_cvtps_epi32(a);
355     b = _mm_packs_epi32(b, b);
356     return to_m64(b);
357 }
358 unittest
359 {
360     __m128 A = _mm_setr_ps(-1.0f, 2.0f, -33000.0f, 70000.0f);
361     short4 R = cast(short4) _mm_cvtps_pi16(A);
362     short[4] correct = [-1, 2, -32768, 32767];
363     assert(R.array == correct);
364 }
365 
366 __m64 _mm_cvtps_pi32 (__m128 a) @safe
367 {
368     return to_m64(_mm_cvtps_epi32(a));
369 }
370 unittest
371 {
    __m128 A = _mm_setr_ps(-33000.0f, 70000.0f, -1.0f, 2.0f);
373     int2 R = cast(int2) _mm_cvtps_pi32(A);
374     int[2] correct = [-33000, 70000];
375     assert(R.array == correct);
376 }
377 
378 __m64 _mm_cvtps_pi8 (__m128 a) @safe
379 {
    // The C++ version of this intrinsic converts the floats to 32-bit integers first,
    // then uses packssdw + packsswb, so the 8-bit results are saturated.
382     __m128i b = _mm_cvtps_epi32(a);
383     b = _mm_packs_epi32(b, _mm_setzero_si128());
384     b = _mm_packs_epi16(b, _mm_setzero_si128());
385     return to_m64(b);
386 }
387 unittest
388 {
389     __m128 A = _mm_setr_ps(-1.0f, 2.0f, -129.0f, 128.0f);
390     byte8 R = cast(byte8) _mm_cvtps_pi8(A);
391     byte[8] correct = [-1, 2, -128, 127, 0, 0, 0, 0];
392     assert(R.array == correct);
393 }
394 
395 __m128 _mm_cvtpu16_ps (__m64 a) pure @safe
396 {
397     __m128i ma = to_m128i(a);
398     ma = _mm_unpacklo_epi16(ma, _mm_setzero_si128()); // Zero-extend to 32-bit
399     return _mm_cvtepi32_ps(ma);
400 }
401 unittest
402 {
403     __m64 A = _mm_setr_pi16(-1, 2, -3, 4);
404     __m128 R = _mm_cvtpu16_ps(A);
405     float[4] correct = [65535.0f, 2.0f, 65533.0f, 4.0f];
406     assert(R.array == correct);
407 }
408 
409 __m128 _mm_cvtpu8_ps (__m64 a) pure @safe
410 {
411     __m128i b = to_m128i(a); 
412 
413     // Zero extend to 32-bit
414     b = _mm_unpacklo_epi8(b, _mm_setzero_si128());
415     b = _mm_unpacklo_epi16(b, _mm_setzero_si128());
416     return _mm_cvtepi32_ps(b);
417 }
418 unittest
419 {
420     __m64 A = _mm_setr_pi8(-1, 2, -3, 4, 0, 0, 0, 0);
421     __m128 R = _mm_cvtpu8_ps(A);
422     float[4] correct = [255.0f, 2.0f, 253.0f, 4.0f];
423     assert(R.array == correct);
424 }
425 
426 __m128 _mm_cvtsi32_ss(__m128 v, int x) pure @trusted
427 {
428     v.ptr[0] = cast(float)x;
429     return v;
430 }
431 unittest
432 {
433     __m128 a = _mm_cvtsi32_ss(_mm_set1_ps(0.0f), 42);
434     assert(a.array == [42.0f, 0, 0, 0]);
435 }
436 
437 // Note: on macOS, using "llvm.x86.sse.cvtsi642ss" was buggy
438 __m128 _mm_cvtsi64_ss(__m128 v, long x) pure @trusted
439 {
440     v.ptr[0] = cast(float)x;
441     return v;
442 }
443 unittest
444 {
445     __m128 a = _mm_cvtsi64_ss(_mm_set1_ps(0.0f), 42);
446     assert(a.array == [42.0f, 0, 0, 0]);
447 }
448 
449 float _mm_cvtss_f32(__m128 a) pure @safe
450 {
451     return a.array[0];
452 }
453 
454 static if (LDC_with_SSE1)
455 {
456     alias _mm_cvtss_si32 = __builtin_ia32_cvtss2si;
457 }
458 else
459 {
460     int _mm_cvtss_si32 (__m128 a) @safe
461     {
462         return convertFloatToInt32UsingMXCSR(a.array[0]);
463     }
464 }
465 unittest
466 {
467     assert(1 == _mm_cvtss_si32(_mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f)));
468 }
469 
470 version(LDC)
471 {
472     version(X86_64)
473         alias _mm_cvtss_si64 = __builtin_ia32_cvtss2si64;
474     else
475     {
476         // Note: __builtin_ia32_cvtss2si64 crashes LDC in 32-bit
477         long _mm_cvtss_si64 (__m128 a) @safe
478         {
479             return convertFloatToInt64UsingMXCSR(a.array[0]);
480         }
481     }
482 }
483 else
484 {
485     long _mm_cvtss_si64 (__m128 a) @safe
486     {
487         return convertFloatToInt64UsingMXCSR(a.array[0]);
488     }
489 }
490 unittest
491 {
492     assert(1 == _mm_cvtss_si64(_mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f)));
493 
494     uint savedRounding = _MM_GET_ROUNDING_MODE();
495 
496     _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST);
497     assert(-86186 == _mm_cvtss_si64(_mm_set1_ps(-86186.49f)));
498 
499     _MM_SET_ROUNDING_MODE(_MM_ROUND_DOWN);
500     assert(-86187 == _mm_cvtss_si64(_mm_set1_ps(-86186.1f)));
501 
502     _MM_SET_ROUNDING_MODE(_MM_ROUND_UP);
503     assert(86187 == _mm_cvtss_si64(_mm_set1_ps(86186.1f)));
504 
505     _MM_SET_ROUNDING_MODE(_MM_ROUND_TOWARD_ZERO);
506     assert(-86186 == _mm_cvtss_si64(_mm_set1_ps(-86186.9f)));
507 
508     _MM_SET_ROUNDING_MODE(savedRounding);
509 }
510 
511 
512 static if(LDC_with_SSE1)
513 {
514     alias _mm_cvtt_ss2si = __builtin_ia32_cvttss2si;
515 }
516 else
517 {
518     int _mm_cvtt_ss2si (__m128 a) pure @safe
519     {
520         return cast(int)(a.array[0]);
521     }
522 }
523 unittest
524 {
525     assert(1 == _mm_cvtt_ss2si(_mm_setr_ps(1.9f, 2.0f, 3.0f, 4.0f)));
526 }
527 
528 __m64 _mm_cvtt_ps2pi (__m128 a) pure @safe
529 {
530     return to_m64(_mm_cvttps_epi32(a));
531 }
532 
533 alias _mm_cvttss_si32 = _mm_cvtt_ss2si; // it's actually the same op
534 
535 // Note: __builtin_ia32_cvttss2si64 crashes LDC when generating 32-bit x86 code.
536 long _mm_cvttss_si64 (__m128 a) pure @safe
537 {
538     return cast(long)(a.array[0]); // Generates cvttss2si as expected
539 }
540 unittest
541 {
542     assert(1 == _mm_cvttss_si64(_mm_setr_ps(1.9f, 2.0f, 3.0f, 4.0f)));
543 }
544 
545 __m128 _mm_div_ps(__m128 a, __m128 b) pure @safe
546 {
547     return a / b;
548 }
549 unittest
550 {
551     __m128 a = [1.5f, -2.0f, 3.0f, 1.0f];
552     a = _mm_div_ps(a, a);
553     float[4] correct = [1.0f, 1.0f, 1.0f, 1.0f];
554     assert(a.array == correct);
555 }
556 
557 __m128 _mm_div_ss(__m128 a, __m128 b) pure @safe
558 {
559     static if (GDC_with_SSE)
560         return __builtin_ia32_divss(a, b);
561     else
562     {
563         a[0] /= b[0];
564         return a;
565     }
566 }
567 unittest
568 {
569     __m128 a = [1.5f, -2.0f, 3.0f, 1.0f];
570     a = _mm_div_ss(a, a);
571     float[4] correct = [1.0f, -2.0, 3.0f, 1.0f];
572     assert(a.array == correct);
573 }
574 
575 int _mm_extract_pi16 (__m64 a, int imm8)
576 {
577     short4 sa = cast(short4)a;
578     return cast(ushort)(sa.array[imm8]);
579 }
580 unittest
581 {
582     __m64 A = _mm_setr_pi16(-1, 6, 0, 4);
583     assert(_mm_extract_pi16(A, 0) == 65535);
584     assert(_mm_extract_pi16(A, 1) == 6);
585     assert(_mm_extract_pi16(A, 2) == 0);
586     assert(_mm_extract_pi16(A, 3) == 4);
587 }
588 
589 /// Free aligned memory that was allocated with `_mm_malloc`.
590 void _mm_free(void * mem_addr) @trusted
591 {
592     // support for free(NULL)
593     if (mem_addr is null)
594         return;
595 
    // Technically we don't need to store size and alignment in the chunk, but we do
    // in case we ever have to implement `_mm_realloc`.
598 
    size_t pointerSize = (void*).sizeof;
    void** rawLocation = cast(void**)(cast(char*)mem_addr - pointerSize);
    size_t* alignmentLocation = cast(size_t*)(cast(char*)mem_addr - 3 * pointerSize);
602     size_t alignment = *alignmentLocation;
603     assert(alignment != 0);
604     assert(isPointerAligned(mem_addr, alignment));
605     free(*rawLocation);
606 }
607 
608 uint _MM_GET_EXCEPTION_MASK() @safe
609 {
610     return _mm_getcsr() & _MM_MASK_MASK;
611 }
612 
613 uint _MM_GET_EXCEPTION_STATE() @safe
614 {
615     return _mm_getcsr() & _MM_EXCEPT_MASK;
616 }
617 
618 uint _MM_GET_FLUSH_ZERO_MODE() @safe
619 {
620     return _mm_getcsr() & _MM_FLUSH_ZERO_MASK;
621 }
622 
623 uint _MM_GET_ROUNDING_MODE() @safe
624 {
625     return _mm_getcsr() & _MM_ROUND_MASK;
626 }
627 
628 uint _mm_getcsr() @trusted
629 {
630     static if (LDC_with_ARM)
631     {
632         // Note: we convert the ARM FPSCR into a x86 SSE control word.
633         // However, only rounding mode and flush to zero are actually set.
634         // The returned control word will have all exceptions masked, and no exception detected.
635 
636         uint fpscr = arm_get_fpcr();
637 
638         uint cw = 0; // No exception detected
639         if (fpscr & _MM_FLUSH_ZERO_MASK_ARM)
640         {
            // ARM has a single flush-to-zero flag,
            // which corresponds to both x86 bits (FTZ and DAZ).
            // https://developer.arm.com/documentation/dui0473/c/neon-and-vfp-programming/the-effects-of-using-flush-to-zero-mode
            cw |= _MM_FLUSH_ZERO_ON;
            cw |= 0x40; // set "denormals are zeros"
        }
        cw |= _MM_MASK_MASK; // All exceptions masked
648 
649         // Rounding mode
650         switch(fpscr & _MM_ROUND_MASK_ARM)
651         {
652             default:
653             case _MM_ROUND_NEAREST_ARM:     cw |= _MM_ROUND_NEAREST;     break;
654             case _MM_ROUND_DOWN_ARM:        cw |= _MM_ROUND_DOWN;        break;
655             case _MM_ROUND_UP_ARM:          cw |= _MM_ROUND_UP;          break;
656             case _MM_ROUND_TOWARD_ZERO_ARM: cw |= _MM_ROUND_TOWARD_ZERO; break;
657         }
658         return cw;
659     }
660     else version(GNU)
661     {
662         static if (GDC_with_SSE)
663         {
664             return __builtin_ia32_stmxcsr();
665         }
666         else version(X86)
667         {
668             uint sseRounding = 0;
669             asm pure nothrow @nogc @trusted
670             {
671                 "stmxcsr %0;\n" 
672                   : "=m" (sseRounding)
673                   : 
674                   : ;
675             }
676             return sseRounding;
677         }
678         else
679             static assert(false);
680     }
681     else version (InlineX86Asm)
682     {
683         uint controlWord;
684         asm nothrow @nogc pure @safe
685         {
686             stmxcsr controlWord;
687         }
688         return controlWord;
689     }
690     else
691         static assert(0, "Not yet supported");
692 }
693 unittest
694 {
695     uint csr = _mm_getcsr();
696 }
697 
698 __m64 _mm_insert_pi16 (__m64 v, int i, int index) pure @trusted
699 {
700     short4 r = cast(short4)v;
701     r.ptr[index & 3] = cast(short)i;
702     return cast(__m64)r;
703 }
704 unittest
705 {
706     __m64 A = _mm_set_pi16(3, 2, 1, 0);
707     short4 R = cast(short4) _mm_insert_pi16(A, 42, 1 | 4);
708     short[4] correct = [0, 42, 2, 3];
709     assert(R.array == correct);
710 }
711 
712 __m128 _mm_load_ps(const(float)*p) pure @trusted
713 {
714     return *cast(__m128*)p;
715 }
716 
717 __m128 _mm_load_ps1(const(float)*p) pure @trusted
718 {
719     return __m128(*p);
720 }
721 
722 __m128 _mm_load_ss (const(float)* mem_addr) pure @trusted
723 {
724     __m128 r;
725     r.ptr[0] = *mem_addr;
726     r.ptr[1] = 0;
727     r.ptr[2] = 0;
728     r.ptr[3] = 0;
729     return r;
730 }
731 
732 alias _mm_load1_ps = _mm_load_ps1;
733 
734 __m128 _mm_loadh_pi (__m128 a, const(__m64)* mem_addr) pure @trusted
735 {
736     long2 la = cast(long2)a;
737     la.ptr[1] = (*mem_addr).array[0];
738     return cast(__m128)la;
739 }
740 
741 __m128 _mm_loadl_pi (__m128 a, const(__m64)* mem_addr) pure @trusted
742 {
743     long2 la = cast(long2)a;
744     la.ptr[0] = (*mem_addr).array[0];
745     return cast(__m128)la;
746 }
747 
748 __m128 _mm_loadr_ps (const(float)* mem_addr) pure @trusted
749 {
750     __m128* aligned = cast(__m128*)mem_addr;
751     __m128 a = *aligned;
752     __m128 r;
753     r.ptr[0] = a.array[3];
754     r.ptr[1] = a.array[2];
755     r.ptr[2] = a.array[1];
756     r.ptr[3] = a.array[0];
757     return r;
758 }
759 
760 __m128 _mm_loadu_ps(const(float)*p) pure @safe
761 {
762     return loadUnaligned!(__m128)(p);
763 }
764 
765 __m128i _mm_loadu_si16(const(void)* mem_addr) pure @trusted
766 {
767     short r = *cast(short*)(mem_addr);
768     short8 result = [0, 0, 0, 0, 0, 0, 0, 0];
769     result.ptr[0] = r;
770     return cast(__m128i)result;
771 }
772 unittest
773 {
774     short r = 13;
775     short8 A = cast(short8) _mm_loadu_si16(&r);
776     short[8] correct = [13, 0, 0, 0, 0, 0, 0, 0];
777     assert(A.array == correct);
778 }
779 
780 __m128i _mm_loadu_si64(const(void)* mem_addr) pure @trusted
781 {
782     long r = *cast(long*)(mem_addr);
783     long2 result = [0, 0];
784     result.ptr[0] = r;
785     return cast(__m128i)result;
786 }
787 unittest
788 {
789     long r = 446446446446;
790     long2 A = cast(long2) _mm_loadu_si64(&r);
791     long[2] correct = [446446446446, 0];
792     assert(A.array == correct);
793 }
794 
/// Allocate `size` bytes of memory, aligned to the alignment specified in `alignment`,
/// and return a pointer to the allocated memory. `_mm_free` should be used to free
/// memory that is allocated with `_mm_malloc`.
798 void* _mm_malloc(size_t size, size_t alignment) @trusted
799 {
800     assert(alignment != 0);
801     size_t request = requestedSize(size, alignment);
802     void* raw = malloc(request);
803     if (request > 0 && raw == null) // malloc(0) can validly return anything
804         onOutOfMemoryError();
805     return storeRawPointerPlusInfo(raw, size, alignment); // PERF: no need to store size
806 }
807 
808 void _mm_maskmove_si64 (__m64 a, __m64 mask, char* mem_addr) @trusted
809 {
    // This works because the 64-bit mask is zero-extended: the upper mask bytes
    // are zero, so no byte beyond the first 8 is ever stored.
811     return _mm_maskmoveu_si128 (to_m128i(a), to_m128i(mask), mem_addr);
812 }
813 
814 deprecated alias _m_maskmovq = _mm_maskmove_si64;
815 
816 __m64 _mm_max_pi16 (__m64 a, __m64 b) pure @safe
817 {
818     return to_m64(_mm_max_epi16(to_m128i(a), to_m128i(b)));
819 }
820 
821 static if (GDC_with_SSE)
822 {
823     alias _mm_max_ps = __builtin_ia32_maxps;
824 }
825 else static if (LDC_with_SSE1)
826 {
827     alias _mm_max_ps = __builtin_ia32_maxps;
828 }
829 else
830 {
831     __m128 _mm_max_ps(__m128 a, __m128 b) pure @safe
832     {
        // ARM: Optimized into fcmgt + bsl since LDC 1.8 -O2
834         __m128 r;
835         r[0] = (a[0] > b[0]) ? a[0] : b[0];
836         r[1] = (a[1] > b[1]) ? a[1] : b[1];
837         r[2] = (a[2] > b[2]) ? a[2] : b[2];
838         r[3] = (a[3] > b[3]) ? a[3] : b[3];
839         return r;
840     }
841 }
842 unittest
843 {
844     __m128 A = _mm_setr_ps(1, 2, float.nan, 4);
845     __m128 B = _mm_setr_ps(4, 1, 4, float.nan);
846     __m128 M = _mm_max_ps(A, B);
847     assert(M.array[0] == 4);
848     assert(M.array[1] == 2);
849     assert(M.array[2] == 4);    // in case of NaN, second operand prevails (as it seems)
850     assert(M.array[3] != M.array[3]); // in case of NaN, second operand prevails (as it seems)
851 }
852 
853 __m64 _mm_max_pu8 (__m64 a, __m64 b) pure @safe
854 {
855     return to_m64(_mm_max_epu8(to_m128i(a), to_m128i(b)));
856 }
857 
858 static if (GDC_with_SSE)
859 {
860     alias _mm_max_ss = __builtin_ia32_maxss;
861 }
862 else static if (LDC_with_SSE1)
863 {
864     alias _mm_max_ss = __builtin_ia32_maxss;
865 }
866 else
867 {
868     __m128 _mm_max_ss(__m128 a, __m128 b) pure @safe
869     {
870         __m128 r = a;
871         r[0] = (a[0] > b[0]) ? a[0] : b[0];
872         return r;
873     }
874 }
875 unittest
876 {
877     __m128 A = _mm_setr_ps(1, 2, 3, 4);
878     __m128 B = _mm_setr_ps(4, 1, 4, 1);
879     __m128 C = _mm_setr_ps(float.nan, 1, 4, 1);
880     __m128 M = _mm_max_ss(A, B);
881     assert(M.array[0] == 4);
882     assert(M.array[1] == 2);
883     assert(M.array[2] == 3);
884     assert(M.array[3] == 4);
    M = _mm_max_ss(A, C); // in case of NaN, second operand prevails
    assert(M.array[0] != M.array[0]);
    M = _mm_max_ss(C, A); // in case of NaN, second operand prevails
888     assert(M.array[0] == 1);
889 }
890 
891 __m64 _mm_min_pi16 (__m64 a, __m64 b) pure @safe
892 {
893     return to_m64(_mm_min_epi16(to_m128i(a), to_m128i(b)));
894 }
895 
896 static if (GDC_with_SSE)
897 {
898     alias _mm_min_ps = __builtin_ia32_minps;
899 }
900 else static if (LDC_with_SSE1)
901 {
902     alias _mm_min_ps = __builtin_ia32_minps;
903 }
904 else
905 {
906     __m128 _mm_min_ps(__m128 a, __m128 b) pure @safe
907     {
        // ARM: Optimized into fcmgt + bsl since LDC 1.8 -O2
909         __m128 r;
910         r[0] = (a[0] < b[0]) ? a[0] : b[0];
911         r[1] = (a[1] < b[1]) ? a[1] : b[1];
912         r[2] = (a[2] < b[2]) ? a[2] : b[2];
913         r[3] = (a[3] < b[3]) ? a[3] : b[3];
914         return r;
915     }
916 }
917 unittest
918 {
919     __m128 A = _mm_setr_ps(1, 2, float.nan, 4);
920     __m128 B = _mm_setr_ps(4, 1, 4, float.nan);
921     __m128 M = _mm_min_ps(A, B);
922     assert(M.array[0] == 1);
923     assert(M.array[1] == 1);
924     assert(M.array[2] == 4);    // in case of NaN, second operand prevails (as it seems)
925     assert(M.array[3] != M.array[3]); // in case of NaN, second operand prevails (as it seems)
926 }
927 
928 __m64 _mm_min_pu8 (__m64 a, __m64 b) pure @safe
929 {
930     return to_m64(_mm_min_epu8(to_m128i(a), to_m128i(b)));
931 }
932 
933 static if (GDC_with_SSE)
934 {
935     alias _mm_min_ss = __builtin_ia32_minss;
936 }
937 else static if (LDC_with_SSE1)
938 {
939     alias _mm_min_ss = __builtin_ia32_minss;
940 }
941 else
942 {
943     __m128 _mm_min_ss(__m128 a, __m128 b) pure @safe
944     {
945         // Generates minss since LDC 1.3 -O1
946         __m128 r = a;
947         r[0] = (a[0] < b[0]) ? a[0] : b[0];
948         return r;
949     }
950 }
951 unittest
952 {
953     __m128 A = _mm_setr_ps(1, 2, 3, 4);
954     __m128 B = _mm_setr_ps(4, 1, 4, 1);
955     __m128 C = _mm_setr_ps(float.nan, 1, 4, 1);
956     __m128 M = _mm_min_ss(A, B);
957     assert(M.array[0] == 1);
958     assert(M.array[1] == 2);
959     assert(M.array[2] == 3);
960     assert(M.array[3] == 4);
    M = _mm_min_ss(A, C); // in case of NaN, second operand prevails
    assert(M.array[0] != M.array[0]);
    M = _mm_min_ss(C, A); // in case of NaN, second operand prevails
964     assert(M.array[0] == 1);
965 }
966 
967 __m128 _mm_move_ss (__m128 a, __m128 b) pure @trusted
968 {
969     a.ptr[0] = b.array[0];
970     return a;
971 }
972 
__m128 _mm_movehl_ps (__m128 a, __m128 b) pure @trusted
{
    // Lower two lanes of the result come from the upper half of `b`,
    // upper two lanes from the upper half of `a` (MOVHLPS semantics).
    a.ptr[0] = b.array[2];
    a.ptr[1] = b.array[3];
    return a;
}
979 
980 __m128 _mm_movelh_ps (__m128 a, __m128 b) pure @trusted
981 {
982     a.ptr[2] = b.array[0];
983     a.ptr[3] = b.array[1];
984     return a;
985 }
986 
987 int _mm_movemask_pi8 (__m64 a) pure @safe
988 {
989     return _mm_movemask_epi8(to_m128i(a));
990 }
991 unittest
992 {
993     assert(0x9C == _mm_movemask_pi8(_mm_set_pi8(-1, 0, 0, -1, -1, -1, 0, 0)));
994 }
995 
996 static if (GDC_with_SSE)
997 {
998     alias _mm_movemask_ps = __builtin_ia32_movmskps;
999 }
1000 else static if (LDC_with_SSE1)
1001 {
1002     alias _mm_movemask_ps = __builtin_ia32_movmskps;
1003 }
1004 else static if (LDC_with_ARM)
1005 {
1006     int _mm_movemask_ps (__m128 a) pure @safe
1007     {
1008         int4 ai = cast(int4)a;
1009         int4 shift31 = [31, 31, 31, 31]; 
1010         ai = ai >>> shift31;
1011         int4 shift = [0, 1, 2, 3]; 
1012         ai = ai << shift; // 4-way shift, only efficient on ARM.
1013         int r = ai.array[0] + (ai.array[1]) + (ai.array[2]) + (ai.array[3]);
1014         return r;
1015     }
1016 }
1017 else
1018 {
1019     int _mm_movemask_ps (__m128 a) pure @safe
1020     {
1021         int4 ai = cast(int4)a;
1022         int r = 0;
1023         if (ai[0] < 0) r += 1;
1024         if (ai[1] < 0) r += 2;
1025         if (ai[2] < 0) r += 4;
1026         if (ai[3] < 0) r += 8;
1027         return r;
1028     }
1029 }
1030 unittest
1031 {
1032     int4 A = [-1, 0, -43, 0];
1033     assert(5 == _mm_movemask_ps(cast(float4)A));
1034 }
1035 
1036 __m128 _mm_mul_ps(__m128 a, __m128 b) pure @safe
1037 {
1038     return a * b;
1039 }
1040 unittest
1041 {
1042     __m128 a = [1.5f, -2.0f, 3.0f, 1.0f];
1043     a = _mm_mul_ps(a, a);
1044     float[4] correct = [2.25f, 4.0f, 9.0f, 1.0f];
1045     assert(a.array == correct);
1046 }
1047 
1048 __m128 _mm_mul_ss(__m128 a, __m128 b) pure @safe
1049 {
1050     static if (GDC_with_SSE)
1051         return __builtin_ia32_mulss(a, b);
1052     else
1053     {
1054         a[0] *= b[0];
1055         return a;
1056     }
1057 }
1058 unittest
1059 {
1060     __m128 a = [1.5f, -2.0f, 3.0f, 1.0f];
1061     a = _mm_mul_ss(a, a);
1062     float[4] correct = [2.25f, -2.0f, 3.0f, 1.0f];
1063     assert(a.array == correct);
1064 }
1065 
1066 __m64 _mm_mulhi_pu16 (__m64 a, __m64 b) pure @safe
1067 {
1068     return to_m64(_mm_mulhi_epu16(to_m128i(a), to_m128i(b)));
1069 }
1070 unittest
1071 {
1072     __m64 A = _mm_setr_pi16(0, -16, 2, 3);
1073     __m64 B = _mm_set1_pi16(16384);
1074     short4 R = cast(short4)_mm_mulhi_pu16(A, B);
1075     short[4] correct = [0, 0x3FFC, 0, 0];
1076     assert(R.array == correct);
1077 }
1078 
1079 __m128 _mm_or_ps (__m128 a, __m128 b) pure @safe
1080 {
1081     return cast(__m128)(cast(__m128i)a | cast(__m128i)b);
1082 }
1083 
1084 deprecated alias 
1085     _m_pavgb = _mm_avg_pu8,
1086     _m_pavgw = _mm_avg_pu16,
1087     _m_pextrw = _mm_extract_pi16,
1088     _m_pinsrw = _mm_insert_pi16,
1089     _m_pmaxsw = _mm_max_pi16,
1090     _m_pmaxub = _mm_max_pu8,
1091     _m_pminsw = _mm_min_pi16,
1092     _m_pminub = _mm_min_pu8,
1093     _m_pmovmskb = _mm_movemask_pi8,
1094     _m_pmulhuw = _mm_mulhi_pu16;
1095 
1096 enum _MM_HINT_T0  = 3; ///
1097 enum _MM_HINT_T1  = 2; ///
1098 enum _MM_HINT_T2  = 1; ///
1099 enum _MM_HINT_NTA = 0; ///
1100 
1101 
1102 version(LDC)
1103 {
1104     // Starting with LLVM 10, it seems llvm.prefetch has changed its name.
1105     // Was reported at: https://github.com/ldc-developers/ldc/issues/3397
1106     static if (__VERSION__ >= 2091) 
1107     {
1108         pragma(LDC_intrinsic, "llvm.prefetch.p0i8") // was "llvm.prefetch"
1109             void llvm_prefetch_fixed(void* ptr, uint rw, uint locality, uint cachetype) pure @safe;
1110     }
1111 }
1112 
1113 /// Fetch the line of data from memory that contains address `p` to a location in the 
1114 /// cache hierarchy specified by the locality hint i.
1115 ///
1116 /// Warning: `locality` is a compile-time parameter, unlike in Intel Intrinsics API.
1117 void _mm_prefetch(int locality)(const(void)* p) pure @trusted
1118 {
1119     static if (GDC_with_SSE)
1120     {
1121         return __builtin_prefetch(p, (locality & 0x4) >> 2, locality & 0x3);
1122     }
1123     else version(LDC)
1124     {
1125         static if (__VERSION__ >= 2091)
1126         {
1127             // const_cast here. `llvm_prefetch` wants a mutable pointer
1128             llvm_prefetch_fixed( cast(void*)p, 0, locality, 1);
1129         }
1130         else
1131         {
1132             // const_cast here. `llvm_prefetch` wants a mutable pointer
1133             llvm_prefetch( cast(void*)p, 0, locality, 1);
1134         }
1135     }
1136     else version(D_InlineAsm_X86_64)
1137     {
1138         static if (locality == _MM_HINT_NTA)
1139         {
1140             asm pure nothrow @nogc @trusted
1141             {
1142                 mov RAX, p;
1143                 prefetchnta [RAX];
1144             }
1145         }
1146         else static if (locality == _MM_HINT_T0)
1147         {
1148             asm pure nothrow @nogc @trusted
1149             {
1150                 mov RAX, p;
1151                 prefetcht0 [RAX];
1152             }
1153         }
1154         else static if (locality == _MM_HINT_T1)
1155         {
1156             asm pure nothrow @nogc @trusted
1157             {
1158                 mov RAX, p;
1159                 prefetcht1 [RAX];
1160             }
1161         }
1162         else static if (locality == _MM_HINT_T2)
1163         {
1164             asm pure nothrow @nogc @trusted
1165             {
1166                 mov RAX, p;
1167                 prefetcht2 [RAX];
1168             }
1169         }
1170         else
1171             assert(false); // invalid locality hint
1172     }
1173     else version(D_InlineAsm_X86)
1174     {
1175         static if (locality == _MM_HINT_NTA)
1176         {
1177             asm pure nothrow @nogc @trusted
1178             {
1179                 mov EAX, p;
1180                 prefetchnta [EAX];
1181             }
1182         }
1183         else static if (locality == _MM_HINT_T0)
1184         {
1185             asm pure nothrow @nogc @trusted
1186             {
1187                 mov EAX, p;
1188                 prefetcht0 [EAX];
1189             }
1190         }
1191         else static if (locality == _MM_HINT_T1)
1192         {
1193             asm pure nothrow @nogc @trusted
1194             {
1195                 mov EAX, p;
1196                 prefetcht1 [EAX];
1197             }
1198         }
1199         else static if (locality == _MM_HINT_T2)
1200         {
1201             asm pure nothrow @nogc @trusted
1202             {
1203                 mov EAX, p;
1204                 prefetcht2 [EAX];
1205             }
1206         }
1207         else 
1208             assert(false); // invalid locality hint
1209     }
1210     else
1211     {
1212         // Generic version: do nothing. From bitter experience, 
1213         // it's unlikely you get ANY speed-up with manual prefetching.
1214         // Prefetching or not doesn't change program behaviour.
1215     }
1216 }
1217 unittest
1218 {
1219     // From Intel documentation:
1220     // "The amount of data prefetched is also processor implementation-dependent. It will, however, be a minimum of 32 bytes."
    ubyte[256] cacheline; // prefetching never generates a fault, whatever the address
1222     _mm_prefetch!_MM_HINT_T0(cacheline.ptr); 
1223     _mm_prefetch!_MM_HINT_T1(cacheline.ptr); 
1224     _mm_prefetch!_MM_HINT_T2(cacheline.ptr); 
1225     _mm_prefetch!_MM_HINT_NTA(cacheline.ptr); 
1226 }
1227 
1228 deprecated alias
1229     _m_psadbw = _mm_sad_pu8,
1230     _m_pshufw = _mm_shuffle_pi16;
1231 
1232 static if (GDC_with_SSE)
1233 {
1234     alias _mm_rcp_ps = __builtin_ia32_rcpps;
1235 }
1236 else static if (LDC_with_SSE1)
1237 {
1238     alias _mm_rcp_ps = __builtin_ia32_rcpps;
1239 }
1240 else
1241 {
1242     __m128 _mm_rcp_ps (__m128 a) pure @safe
1243     {
1244         a[0] = 1.0f / a[0];
1245         a[1] = 1.0f / a[1];
1246         a[2] = 1.0f / a[2];
1247         a[3] = 1.0f / a[3];
1248         return a;
1249     }
1250 }
1251 
1252 static if (GDC_with_SSE)
1253 {
1254     alias _mm_rcp_ss = __builtin_ia32_rcpss;
1255 }
1256 else static if (LDC_with_SSE1)
1257 {
1258     alias _mm_rcp_ss = __builtin_ia32_rcpss;
1259 }
1260 else
1261 {
1262     __m128 _mm_rcp_ss (__m128 a) pure @safe
1263     {
1264         a[0] = 1.0f / a[0];
1265         return a;
1266     }
1267 }
1268 
1269 static if (GDC_with_SSE)
1270 {
1271     alias _mm_rsqrt_ps = __builtin_ia32_rsqrtps;
1272 }
1273 else static if (LDC_with_SSE1)
1274 {
1275     alias _mm_rsqrt_ps = __builtin_ia32_rsqrtps;
1276 }
1277 else version(LDC)
1278 {
1279     __m128 _mm_rsqrt_ps (__m128 a) pure @safe
1280     {
1281         a[0] = 1.0f / llvm_sqrt(a[0]);
1282         a[1] = 1.0f / llvm_sqrt(a[1]);
1283         a[2] = 1.0f / llvm_sqrt(a[2]);
1284         a[3] = 1.0f / llvm_sqrt(a[3]);
1285         return a;
1286     }
1287 }
1288 else
1289 {
1290     __m128 _mm_rsqrt_ps (__m128 a) pure @safe
1291     {
1292         a[0] = 1.0f / sqrt(a[0]);
1293         a[1] = 1.0f / sqrt(a[1]);
1294         a[2] = 1.0f / sqrt(a[2]);
1295         a[3] = 1.0f / sqrt(a[3]);
1296         return a;
1297     }
1298 }
1299 
1300 static if (GDC_with_SSE)
1301 {
1302     alias _mm_rsqrt_ss = __builtin_ia32_rsqrtss;
1303 }
1304 else static if (LDC_with_SSE1)
1305 {
1306     alias _mm_rsqrt_ss = __builtin_ia32_rsqrtss;
1307 }
1308 else version(LDC)
1309 {
1310     __m128 _mm_rsqrt_ss (__m128 a) pure @safe
1311     {
1312         a[0] = 1.0f / llvm_sqrt(a[0]);
1313         return a;
1314     }
1315 }
1316 else
1317 {
1318     __m128 _mm_rsqrt_ss (__m128 a) pure @safe
1319     {
1320         a[0] = 1.0f / sqrt(a[0]);
1321         return a;
1322     }
1323 }
1324 
1325 unittest
1326 {
1327     double maxRelativeError = 0.000245; // -72 dB
1328     void testInvSqrt(float number) nothrow @nogc
1329     {
1330         __m128 A = _mm_set1_ps(number);
1331 
1332         // test _mm_rcp_ps
1333         __m128 B = _mm_rcp_ps(A);
1334         foreach(i; 0..4)
1335         {
1336             double exact = 1.0f / A.array[i];
1337             double ratio = cast(double)(B.array[i]) / cast(double)(exact);
1338             assert(abs(ratio - 1) <= maxRelativeError);
1339         }
1340 
1341         // test _mm_rcp_ss
1342         {
1343             B = _mm_rcp_ss(A);
1344             double exact = 1.0f / A.array[0];
1345             double ratio = cast(double)(B.array[0]) / cast(double)(exact);
1346             assert(abs(ratio - 1) <= maxRelativeError);
1347         }
1348 
1349         // test _mm_rsqrt_ps
1350         B = _mm_rsqrt_ps(A);
1351         foreach(i; 0..4)
1352         {
1353             double exact = 1.0f / sqrt(A.array[i]);
1354             double ratio = cast(double)(B.array[i]) / cast(double)(exact);
1355             assert(abs(ratio - 1) <= maxRelativeError);
1356         }
1357 
1358         // test _mm_rsqrt_ss
1359         {
1360             B = _mm_rsqrt_ss(A);
1361             double exact = 1.0f / sqrt(A.array[0]);
1362             double ratio = cast(double)(B.array[0]) / cast(double)(exact);
1363             assert(abs(ratio - 1) <= maxRelativeError);
1364         }
1365     }
1366 
1367     testInvSqrt(1.1f);
1368     testInvSqrt(2.45674864151f);
1369     testInvSqrt(27841456468.0f);
1370 }
1371 
1372 __m64 _mm_sad_pu8 (__m64 a, __m64 b) pure @safe
1373 {
1374     return to_m64(_mm_sad_epu8(to_m128i(a), to_m128i(b)));
1375 }
1376 
1377 void _MM_SET_EXCEPTION_MASK(int _MM_MASK_xxxx) @safe
1378 {
1379     // Note: unsupported on ARM
1380     _mm_setcsr((_mm_getcsr() & ~_MM_MASK_MASK) | _MM_MASK_xxxx);
1381 }
1382 
1383 void _MM_SET_EXCEPTION_STATE(int _MM_EXCEPT_xxxx) @safe
1384 {
1385     // Note: unsupported on ARM
1386     _mm_setcsr((_mm_getcsr() & ~_MM_EXCEPT_MASK) | _MM_EXCEPT_xxxx);
1387 }
1388 
1389 void _MM_SET_FLUSH_ZERO_MODE(int _MM_FLUSH_xxxx) @safe
1390 {
1391     _mm_setcsr((_mm_getcsr() & ~_MM_FLUSH_ZERO_MASK) | _MM_FLUSH_xxxx);
1392 }
1393 
1394 __m128 _mm_set_ps (float e3, float e2, float e1, float e0) pure @trusted
1395 {
1396     // Note: despite appearances, generates sensible code,
1397     //       inlines correctly and is constant folded
1398     float[4] result = [e0, e1, e2, e3];
1399     return loadUnaligned!(float4)(result.ptr);
1400 }
1401 unittest
1402 {
1403     __m128 A = _mm_set_ps(3, 2, 1, 546);
1404     float[4] correct = [546.0f, 1.0f, 2.0f, 3.0f];
1405     assert(A.array == correct);
1406     assert(A.array[0] == 546.0f);
1407     assert(A.array[1] == 1.0f);
1408     assert(A.array[2] == 2.0f);
1409     assert(A.array[3] == 3.0f);
1410 }
1411 
1412 alias _mm_set_ps1 = _mm_set1_ps;
1413 
1414 void _MM_SET_ROUNDING_MODE(int _MM_ROUND_xxxx) @safe
1415 {
1416     _mm_setcsr((_mm_getcsr() & ~_MM_ROUND_MASK) | _MM_ROUND_xxxx);
1417 }
1418 
1419 __m128 _mm_set_ss (float a) pure @trusted
1420 {
1421     __m128 r = _mm_setzero_ps();
1422     r.ptr[0] = a;
1423     return r;
1424 }
1425 unittest
1426 {
1427     float[4] correct = [42.0f, 0.0f, 0.0f, 0.0f];
1428     __m128 A = _mm_set_ss(42.0f);
1429     assert(A.array == correct);
1430 }
1431 
1432 __m128 _mm_set1_ps (float a) pure @trusted
1433 {
1434     __m128 r = void;
1435     r.ptr[0] = a;
1436     r.ptr[1] = a;
1437     r.ptr[2] = a;
1438     r.ptr[3] = a;
1439     return r;
1440 }
1441 unittest
1442 {
1443     float[4] correct = [42.0f, 42.0f, 42.0f, 42.0f];
1444     __m128 A = _mm_set1_ps(42.0f);
1445     assert(A.array == correct);
1446 }
1447 
1448 void _mm_setcsr(uint controlWord) @trusted
1449 {
1450     static if (LDC_with_ARM)
1451     {
        // Convert the x86 SSE control word to the ARM FPSCR. This is done only
        // partially: just the rounding mode and flush-to-zero bits are translated.
1454 
1455         // "To alter some bits of a VFP system register without 
1456         // affecting other bits, use a read-modify-write procedure"
1457         uint fpscr = arm_get_fpcr();
1458         
        // Bits 23 and 22 hold the rounding mode (not used by NEON instructions)
1460         fpscr = fpscr & ~_MM_ROUND_MASK_ARM;
1461         switch(controlWord & _MM_ROUND_MASK)
1462         {
1463             default:
1464             case _MM_ROUND_NEAREST:     fpscr |= _MM_ROUND_NEAREST_ARM;     break;
1465             case _MM_ROUND_DOWN:        fpscr |= _MM_ROUND_DOWN_ARM;        break;
1466             case _MM_ROUND_UP:          fpscr |= _MM_ROUND_UP_ARM;          break;
1467             case _MM_ROUND_TOWARD_ZERO: fpscr |= _MM_ROUND_TOWARD_ZERO_ARM; break;
1468         }
1469         fpscr = fpscr & ~_MM_FLUSH_ZERO_MASK_ARM;
1470         if (controlWord & _MM_FLUSH_ZERO_MASK)
1471             fpscr |= _MM_FLUSH_ZERO_MASK_ARM;
1472         arm_set_fpcr(fpscr);
1473     }
1474     else version(GNU)
1475     {
1476         static if (GDC_with_SSE)
1477         {
1478             __builtin_ia32_ldmxcsr(controlWord);
1479         }
1480         else version(X86)
1481         {
1482             asm pure nothrow @nogc @trusted
1483             {
1484                 "ldmxcsr %0;\n" 
1485                   : 
1486                   : "m" (controlWord)
1487                   : ;
1488             }
1489         }
1490         else
1491             static assert(false);
1492     }
1493     else version (InlineX86Asm)
1494     {
1495         asm pure nothrow @nogc @safe
1496         {
1497             ldmxcsr controlWord;
1498         }
1499     }
1500     else
1501         static assert(0, "Not yet supported");
1502 }
1503 unittest
1504 {
1505     _mm_setcsr(_mm_getcsr());
1506 }
1507 
1508 __m128 _mm_setr_ps (float e3, float e2, float e1, float e0) pure @trusted
1509 {
1510     float[4] result = [e3, e2, e1, e0];
1511     return loadUnaligned!(float4)(result.ptr);
1512 }
1513 unittest
1514 {
1515     __m128 A = _mm_setr_ps(3, 2, 1, 546);
1516     float[4] correct = [3.0f, 2.0f, 1.0f, 546.0f];
1517     assert(A.array == correct);
1518     assert(A.array[0] == 3.0f);
1519     assert(A.array[1] == 2.0f);
1520     assert(A.array[2] == 1.0f);
1521     assert(A.array[3] == 546.0f);
1522 }
1523 
1524 __m128 _mm_setzero_ps() pure @trusted
1525 {
1526     // Compiles to xorps without problems
1527     float[4] result = [0.0f, 0.0f, 0.0f, 0.0f];
1528     return loadUnaligned!(float4)(result.ptr);
1529 }
1530 
1531 version(GNU)
1532 {
1533     void _mm_sfence() pure @trusted
1534     {
1535         static if (GDC_with_SSE)
1536         {
1537             __builtin_ia32_sfence();
1538         }
1539         else version(X86)
1540         {
1541             asm pure nothrow @nogc @trusted
1542             {
1543                 "sfence;\n" : : : ;
1544             }
1545         }
1546         else
1547             static assert(false);
1548     }
1549 }
1550 else static if (LDC_with_SSE1)
1551 {
1552     alias _mm_sfence = __builtin_ia32_sfence;
1553 }
1554 else static if (DMD_with_asm)
1555 {
1556     void _mm_sfence() pure @safe
1557     {
1558         asm nothrow @nogc pure @safe
1559         {
1560             sfence;
1561         }
1562     }
1563 }
1564 else version(LDC)
1565 {
1566     void _mm_sfence() pure @safe
1567     {
1568         llvm_memory_fence(); // PERF: generates mfence
1569     }
1570 }
1571 else
1572     static assert(false);
1573 unittest
1574 {
1575     _mm_sfence();
1576 }
1577 
1578 __m64 _mm_shuffle_pi16(int imm8)(__m64 a) pure @safe
1579 {
1580     return cast(__m64) shufflevector!(short4, ( (imm8 >> 0) & 3 ),
1581                                               ( (imm8 >> 2) & 3 ),
1582                                               ( (imm8 >> 4) & 3 ),
1583                                               ( (imm8 >> 6) & 3 ))(cast(short4)a, cast(short4)a);
1584 }
1585 unittest
1586 {
1587     __m64 A = _mm_setr_pi16(0, 1, 2, 3);
1588     enum int SHUFFLE = _MM_SHUFFLE(0, 1, 2, 3);
1589     short4 B = cast(short4) _mm_shuffle_pi16!SHUFFLE(A);
1590     short[4] expectedB = [ 3, 2, 1, 0 ];
1591     assert(B.array == expectedB);
1592 }
1593 
// Note: the shuffle immediate is a compile-time parameter, unlike in the Intel Intrinsics API.
1595 __m128 _mm_shuffle_ps(ubyte imm)(__m128 a, __m128 b) pure @safe
1596 {
1597     return shufflevector!(__m128, imm & 3, (imm>>2) & 3, 4 + ((imm>>4) & 3), 4 + ((imm>>6) & 3) )(a, b);
1598 }
1599 
1600 static if (GDC_with_SSE)
1601 {
1602     alias _mm_sqrt_ps = __builtin_ia32_sqrtps;
1603 }
1604 else version(LDC)
1605 {
1606     // Disappeared with LDC 1.11
1607     static if (__VERSION__ < 2081)
1608         alias _mm_sqrt_ps = __builtin_ia32_sqrtps;
1609     else
1610     {
1611         __m128 _mm_sqrt_ps(__m128 vec) pure @safe
1612         {
1613             vec.array[0] = llvm_sqrt(vec.array[0]);
1614             vec.array[1] = llvm_sqrt(vec.array[1]);
1615             vec.array[2] = llvm_sqrt(vec.array[2]);
1616             vec.array[3] = llvm_sqrt(vec.array[3]);
1617             return vec;
1618         }
1619     }
1620 }
1621 else
1622 {
1623     __m128 _mm_sqrt_ps(__m128 vec) pure @trusted
1624     {
1625         vec.ptr[0] = sqrt(vec.array[0]);
1626         vec.ptr[1] = sqrt(vec.array[1]);
1627         vec.ptr[2] = sqrt(vec.array[2]);
1628         vec.ptr[3] = sqrt(vec.array[3]);
1629         return vec;
1630     }
1631 }
1632 unittest
1633 {
1634     __m128 A = _mm_sqrt_ps(_mm_set1_ps(4.0f));
1635     assert(A.array[0] == 2.0f);
1636     assert(A.array[1] == 2.0f);
1637     assert(A.array[2] == 2.0f);
1638     assert(A.array[3] == 2.0f);
1639 }
1640 
1641 static if (GDC_with_SSE)
1642 {
1643     alias _mm_sqrt_ss = __builtin_ia32_sqrtss;
1644 }
1645 else version(LDC)
1646 {
1647     // Disappeared with LDC 1.11
1648     static if (__VERSION__ < 2081)
1649         alias _mm_sqrt_ss = __builtin_ia32_sqrtss;
1650     else
1651     {
1652         __m128 _mm_sqrt_ss(__m128 vec) pure @safe
1653         {
1654             vec.array[0] = llvm_sqrt(vec.array[0]);
1655             vec.array[1] = vec.array[1];
1656             vec.array[2] = vec.array[2];
1657             vec.array[3] = vec.array[3];
1658             return vec;
1659         }
1660     }
1661 }
1662 else
1663 {
1664     __m128 _mm_sqrt_ss(__m128 vec) pure @trusted
1665     {
1666         vec.ptr[0] = sqrt(vec.array[0]);
1667         return vec;
1668     }
1669 }
1670 unittest
1671 {
1672     __m128 A = _mm_sqrt_ss(_mm_set1_ps(4.0f));
1673     assert(A.array[0] == 2.0f);
1674     assert(A.array[1] == 4.0f);
1675     assert(A.array[2] == 4.0f);
1676     assert(A.array[3] == 4.0f);
1677 }
1678 
1679 void _mm_store_ps (float* mem_addr, __m128 a) pure // not safe since nothing guarantees alignment
1680 {
1681     __m128* aligned = cast(__m128*)mem_addr;
1682     *aligned = a;
1683 }
1684 
1685 alias _mm_store_ps1 = _mm_store1_ps;
1686 
1687 void _mm_store_ss (float* mem_addr, __m128 a) pure @safe
1688 {
1689     *mem_addr = a.array[0];
1690 }
1691 unittest
1692 {
1693     float a;
1694     _mm_store_ss(&a, _mm_set_ps(3, 2, 1, 546));
1695     assert(a == 546);
1696 }
1697 
1698 void _mm_store1_ps(float* mem_addr, __m128 a) pure @trusted // not safe since nothing guarantees alignment
1699 {
1700     __m128* aligned = cast(__m128*)mem_addr;
1701     __m128 r;
1702     r.ptr[0] = a.array[0];
1703     r.ptr[1] = a.array[0];
1704     r.ptr[2] = a.array[0];
1705     r.ptr[3] = a.array[0];
1706     *aligned = r;
1707 }
1708 
1709 void _mm_storeh_pi(__m64* p, __m128 a) pure @trusted
1710 {
1711     long2 la = cast(long2)a;
1712     (*p).ptr[0] = la.array[1];
1713 }
1714 unittest
1715 {
1716     __m64 R = _mm_setzero_si64();
1717     long2 A = [13, 25];
1718     _mm_storeh_pi(&R, cast(__m128)A);
1719     assert(R.array[0] == 25);
1720 }
1721 
1722 void _mm_storel_pi(__m64* p, __m128 a) pure @trusted
1723 {
1724     long2 la = cast(long2)a;
1725     (*p).ptr[0] = la.array[0];
1726 }
1727 unittest
1728 {
1729     __m64 R = _mm_setzero_si64();
1730     long2 A = [13, 25];
1731     _mm_storel_pi(&R, cast(__m128)A);
1732     assert(R.array[0] == 13);
1733 }
1734 
1735 void _mm_storer_ps(float* mem_addr, __m128 a) pure @trusted // not safe since nothing guarantees alignment
1736 {
1737     __m128* aligned = cast(__m128*)mem_addr;
1738     __m128 r;
1739     r.ptr[0] = a.array[3];
1740     r.ptr[1] = a.array[2];
1741     r.ptr[2] = a.array[1];
1742     r.ptr[3] = a.array[0];
1743     *aligned = r;
1744 }
1745 
1746 void _mm_storeu_ps(float* mem_addr, __m128 a) pure @safe
1747 {
1748     storeUnaligned!(float4)(a, mem_addr);
1749 }
1750 
1751 void _mm_stream_pi (__m64* mem_addr, __m64 a)
1752 {
    // BUG: see `_mm_stream_ps` for an explanation of why we don't implement non-temporal moves
1754     *mem_addr = a; // it's a regular move instead
1755 }
1756 
1757 // BUG: can't implement non-temporal store with LDC inlineIR since !nontemporal
1758 // needs some IR outside this function that would say:
1759 //
1760 //  !0 = !{ i32 1 }
1761 //
1762 // It's a LLVM IR metadata description.
1763 // Regardless, non-temporal moves are really dangerous for performance...
1764 void _mm_stream_ps (float* mem_addr, __m128 a)
1765 {
1766     __m128* dest = cast(__m128*)mem_addr;
1767     *dest = a; // it's a regular move instead
1768 }
1769 unittest
1770 {
1771     align(16) float[4] A;
1772     _mm_stream_ps(A.ptr, _mm_set1_ps(78.0f));
1773     assert(A[0] == 78.0f && A[1] == 78.0f && A[2] == 78.0f && A[3] == 78.0f);
1774 }
1775 
1776 __m128 _mm_sub_ps(__m128 a, __m128 b) pure @safe
1777 {
1778     return a - b;
1779 }
1780 unittest
1781 {
1782     __m128 a = [1.5f, -2.0f, 3.0f, 1.0f];
1783     a = _mm_sub_ps(a, a);
1784     float[4] correct = [0.0f, 0.0f, 0.0f, 0.0f];
1785     assert(a.array == correct);
1786 }
1787 
1788 __m128 _mm_sub_ss(__m128 a, __m128 b) pure @safe
1789 {
1790     static if (GDC_with_SSE)
1791         return __builtin_ia32_subss(a, b);
1792     else
1793     {
1794         a[0] -= b[0];
1795         return a;
1796     }
1797 }
1798 unittest
1799 {
1800     __m128 a = [1.5f, -2.0f, 3.0f, 1.0f];
1801     a = _mm_sub_ss(a, a);
1802     float[4] correct = [0.0f, -2.0, 3.0f, 1.0f];
1803     assert(a.array == correct);
1804 }
1805 
1806 
1807 void _MM_TRANSPOSE4_PS (ref __m128 row0, ref __m128 row1, ref __m128 row2, ref __m128 row3) pure @safe
1808 {
1809     __m128 tmp3, tmp2, tmp1, tmp0;
1810     tmp0 = _mm_unpacklo_ps(row0, row1);
1811     tmp2 = _mm_unpacklo_ps(row2, row3);
1812     tmp1 = _mm_unpackhi_ps(row0, row1);
1813     tmp3 = _mm_unpackhi_ps(row2, row3);
1814     row0 = _mm_movelh_ps(tmp0, tmp2);
1815     row1 = _mm_movehl_ps(tmp2, tmp0);
1816     row2 = _mm_movelh_ps(tmp1, tmp3);
1817     row3 = _mm_movehl_ps(tmp3, tmp1);
1818 }
1819 
// Note: the only difference between the `comi` and `ucomi` intrinsics is whether
//       a quiet NaN raises the invalid-operation exception. Aliasing them is thus
//       technically incorrect, but the case where you would want to differentiate
//       between qNaN and sNaN and treat them differently on purpose seems extremely rare.
1824 alias _mm_ucomieq_ss = _mm_comieq_ss;
1825 alias _mm_ucomige_ss = _mm_comige_ss;
1826 alias _mm_ucomigt_ss = _mm_comigt_ss;
1827 alias _mm_ucomile_ss = _mm_comile_ss;
1828 alias _mm_ucomilt_ss = _mm_comilt_ss;
1829 alias _mm_ucomineq_ss = _mm_comineq_ss;
1830 
1831 
1832 __m128 _mm_undefined_ps() pure @safe
1833 {
1834     __m128 undef = void;
1835     return undef;
1836 }
1837 
1838 __m128 _mm_unpackhi_ps (__m128 a, __m128 b) pure @trusted
1839 {
1840     __m128 r;
1841     r.ptr[0] = a.array[2];
1842     r.ptr[1] = b.array[2];
1843     r.ptr[2] = a.array[3];
1844     r.ptr[3] = b.array[3];
1845     return r;
1846 }
1847 
1848 __m128 _mm_unpacklo_ps (__m128 a, __m128 b) pure @trusted
1849 {
1850     __m128 r;
1851     r.ptr[0] = a.array[0];
1852     r.ptr[1] = b.array[0];
1853     r.ptr[2] = a.array[1];
1854     r.ptr[3] = b.array[1];
1855     return r;
1856 }
1857 
1858 __m128 _mm_xor_ps (__m128 a, __m128 b) pure @safe
1859 {
1860     return cast(__m128)(cast(__m128i)a ^ cast(__m128i)b);
1861 }
1862 
1863 
1864 private
1865 {
1866     /// Returns: `true` if the pointer is suitably aligned.
1867     bool isPointerAligned(void* p, size_t alignment) pure
1868     {
1869         assert(alignment != 0);
1870         return ( cast(size_t)p & (alignment - 1) ) == 0;
1871     }
1872 
1873     /// Returns: next pointer aligned with alignment bytes.
1874     void* nextAlignedPointer(void* start, size_t alignment) pure
1875     {
1876         return cast(void*)nextMultipleOf(cast(size_t)(start), alignment);
1877     }
1878 
1879     // Returns number of bytes to actually allocate when asking
1880     // for a particular alignment
1881     @nogc size_t requestedSize(size_t askedSize, size_t alignment) pure
1882     {
1883         enum size_t pointerSize = size_t.sizeof;
1884         return askedSize + alignment - 1 + pointerSize * 3;
1885     }
1886 
    // Store the pointer given by malloc, along with size and alignment.
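    //
    // Layout of an `_mm_malloc` chunk, as implied by the offsets used below
    // and read back in `_mm_free` (a sketch, addresses grow to the right):
    //
    //   [ padding ][ alignment ][ size ][ raw ptr ][ user data ... ]
    //                                              ^--- aligned pointer returned to the caller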
1888     @nogc void* storeRawPointerPlusInfo(void* raw, size_t size, size_t alignment) pure
1889     {
1890         enum size_t pointerSize = size_t.sizeof;
1891         char* start = cast(char*)raw + pointerSize * 3;
1892         void* aligned = nextAlignedPointer(start, alignment);
1893         void** rawLocation = cast(void**)(cast(char*)aligned - pointerSize);
1894         *rawLocation = raw;
1895         size_t* sizeLocation = cast(size_t*)(cast(char*)aligned - 2 * pointerSize);
1896         *sizeLocation = size;
1897         size_t* alignmentLocation = cast(size_t*)(cast(char*)aligned - 3 * pointerSize);
1898         *alignmentLocation = alignment;
1899         assert( isPointerAligned(aligned, alignment) );
1900         return aligned;
1901     }
1902 
1903     // Returns: x, multiple of powerOfTwo, so that x >= n.
1904     @nogc size_t nextMultipleOf(size_t n, size_t powerOfTwo) pure nothrow
1905     {
1906         // check power-of-two
1907         assert( (powerOfTwo != 0) && ((powerOfTwo & (powerOfTwo - 1)) == 0));
1908 
1909         size_t mask = ~(powerOfTwo - 1);
1910         return (n + powerOfTwo - 1) & mask;
1911     }
1912 }
1913 
1914 unittest
1915 {
1916     assert(nextMultipleOf(0, 4) == 0);
1917     assert(nextMultipleOf(1, 4) == 4);
1918     assert(nextMultipleOf(2, 4) == 4);
1919     assert(nextMultipleOf(3, 4) == 4);
1920     assert(nextMultipleOf(4, 4) == 4);
1921     assert(nextMultipleOf(5, 4) == 8);
1922 
1923     {
1924         void* p = _mm_malloc(23, 16);
1925         assert(p !is null);
1926         assert(((cast(size_t)p) & 0xf) == 0);
1927         _mm_free(p);
1928     }
1929 
1930     void* nullAlloc = _mm_malloc(0, 32);
1931     assert(nullAlloc != null);
1932     _mm_free(nullAlloc);
1933 }
1934 
// For some reason, order of declaration is important for this one,
// so it is misplaced at the end of the module.
// Note: it is just another name for _mm_cvtss_si32.
1938 alias _mm_cvt_ss2si = _mm_cvtss_si32;