1 /**
2 * SSE intrinsics.
3 *
4 * Copyright: Copyright Guillaume Piolat 2016-2020.
5 * License:   $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
6 */
7 module inteli.xmmintrin;
8 
9 public import inteli.types;
10 
11 import inteli.internals;
12 
13 import inteli.mmx;
14 import inteli.emmintrin;
15 
16 import core.stdc.stdlib: malloc, free;
17 import core.exception: onOutOfMemoryError;
18 
19 version(D_InlineAsm_X86)
20     version = InlineX86Asm;
21 else version(D_InlineAsm_X86_64)
22     version = InlineX86Asm;
23 
24 
25 // SSE1
26 
27 nothrow @nogc:
28 
29 
30 enum int _MM_EXCEPT_INVALID    = 0x0001; /// MXCSR Exception states.
31 enum int _MM_EXCEPT_DENORM     = 0x0002; ///ditto
32 enum int _MM_EXCEPT_DIV_ZERO   = 0x0004; ///ditto
33 enum int _MM_EXCEPT_OVERFLOW   = 0x0008; ///ditto
34 enum int _MM_EXCEPT_UNDERFLOW  = 0x0010; ///ditto
35 enum int _MM_EXCEPT_INEXACT    = 0x0020; ///ditto
36 enum int _MM_EXCEPT_MASK       = 0x003f; /// MXCSR Exception states mask.
37 
38 enum int _MM_MASK_INVALID      = 0x0080; /// MXCSR Exception masks.
39 enum int _MM_MASK_DENORM       = 0x0100; ///ditto
40 enum int _MM_MASK_DIV_ZERO     = 0x0200; ///ditto
41 enum int _MM_MASK_OVERFLOW     = 0x0400; ///ditto
42 enum int _MM_MASK_UNDERFLOW    = 0x0800; ///ditto
43 enum int _MM_MASK_INEXACT      = 0x1000; ///ditto
44 enum int _MM_MASK_MASK         = 0x1f80; /// MXCSR Exception masks mask.
45 
46 enum int _MM_ROUND_NEAREST     = 0x0000; /// MXCSR Rounding mode.
47 enum int _MM_ROUND_DOWN        = 0x2000; ///ditto
48 enum int _MM_ROUND_UP          = 0x4000; ///ditto
49 enum int _MM_ROUND_TOWARD_ZERO = 0x6000; ///ditto
50 enum int _MM_ROUND_MASK        = 0x6000; /// MXCSR Rounding mode mask.
51 
52 enum int _MM_FLUSH_ZERO_MASK   = 0x8000; /// MXCSR Denormal flush to zero mask.
53 enum int _MM_FLUSH_ZERO_ON     = 0x8000; /// MXCSR Denormal flush to zero modes.
54 enum int _MM_FLUSH_ZERO_OFF    = 0x0000; ///ditto
55 
56 /// Add packed single-precision (32-bit) floating-point elements in `a` and `b`.
57 __m128 _mm_add_ps(__m128 a, __m128 b) pure @safe
58 {
59     return a + b;
60 }
61 unittest
62 {
63     __m128 a = [1, 2, 3, 4];
64     a = _mm_add_ps(a, a);
65     assert(a.array[0] == 2);
66     assert(a.array[1] == 4);
67     assert(a.array[2] == 6);
68     assert(a.array[3] == 8);
69 }
70 
71 /// Add the lower single-precision (32-bit) floating-point element 
72 /// in `a` and `b`, store the result in the lower element of result, 
73 /// and copy the upper 3 packed elements from `a` to the upper elements of result.
74 __m128 _mm_add_ss(__m128 a, __m128 b) pure @safe
75 {
76     static if (GDC_with_SSE)
77     {
78         return __builtin_ia32_addss(a, b);
79     }
80     else static if (DMD_with_DSIMD)
81     {
82         return cast(__m128) __simd(XMM.ADDSS, a, b);
83     }
84     else
85     {
86         a[0] += b[0];
87         return a;
88     }
89 }
90 unittest
91 {
92     __m128 a = [1, 2, 3, 4];
93     a = _mm_add_ss(a, a);
94     assert(a.array == [2.0f, 2, 3, 4]);
95 }
96 
97 /// Compute the bitwise AND of packed single-precision (32-bit) floating-point elements in `a` and `b`.
98 __m128 _mm_and_ps (__m128 a, __m128 b) pure @safe
99 {
100     return cast(__m128)(cast(__m128i)a & cast(__m128i)b);
101 }
102 unittest
103 {
104     float a = 4.32f;
105     float b = -78.99f;
106     int correct = (*cast(int*)(&a)) & (*cast(int*)(&b));
107     __m128 A = _mm_set_ps(a, b, a, b);
108     __m128 B = _mm_set_ps(b, a, b, a);
109     int4 R = cast(int4)( _mm_and_ps(A, B) );
110     assert(R.array[0] == correct);
111     assert(R.array[1] == correct);
112     assert(R.array[2] == correct);
113     assert(R.array[3] == correct);
114 }
115 
116 /// Compute the bitwise NOT of packed single-precision (32-bit) floating-point elements in `a` and then AND with `b`.
117 __m128 _mm_andnot_ps (__m128 a, __m128 b) pure @safe
118 {
119     return cast(__m128)( (~cast(__m128i)a) & cast(__m128i)b );
120 }
121 unittest
122 {
123     float a = 4.32f;
124     float b = -78.99f;
125     int correct  = ~(*cast(int*)(&a)) &  (*cast(int*)(&b));
126     int correct2 =  (*cast(int*)(&a)) & ~(*cast(int*)(&b));
127     __m128 A = _mm_set_ps(a, b, a, b);
128     __m128 B = _mm_set_ps(b, a, b, a);
129     int4 R = cast(int4)( _mm_andnot_ps(A, B) );
130     assert(R.array[0] == correct2);
131     assert(R.array[1] == correct);
132     assert(R.array[2] == correct2);
133     assert(R.array[3] == correct);
134 }
135 
/// Average packed unsigned 16-bit integers in `a` and `b`.
137 __m64 _mm_avg_pu16 (__m64 a, __m64 b) pure @safe
138 {
139     return to_m64(_mm_avg_epu16(to_m128i(a), to_m128i(b)));
140 }
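// Illustrative sketch: the average rounds up on ties, i.e. (a + b + 1) >> 1.
unittest
{
    __m64 A = _mm_setr_pi16(1, 2, 3, 4);
    __m64 B = _mm_setr_pi16(2, 2, 3, 5);
    short4 R = cast(short4) _mm_avg_pu16(A, B);
    short[4] correct = [2, 2, 3, 5];
    assert(R.array == correct);
}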
141 
/// Average packed unsigned 8-bit integers in `a` and `b`.
143 __m64 _mm_avg_pu8 (__m64 a, __m64 b) pure @safe
144 {
145     return to_m64(_mm_avg_epu8(to_m128i(a), to_m128i(b)));
146 }
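// Illustrative sketch: 8-bit averaging also rounds up on ties.
unittest
{
    __m64 A = _mm_setr_pi8(0, 1, 2, 3, 4, 5, 6, 7);
    __m64 B = _mm_setr_pi8(1, 1, 2, 4, 4, 5, 6, 8);
    byte8 R = cast(byte8) _mm_avg_pu8(A, B);
    byte[8] correct = [1, 1, 2, 4, 4, 5, 6, 8];
    assert(R.array == correct);
}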
147 
148 /// Compare packed single-precision (32-bit) floating-point elements in `a` and `b` for equality.
149 __m128 _mm_cmpeq_ps (__m128 a, __m128 b) pure @safe
150 {
151     return cast(__m128) cmpps!(FPComparison.oeq)(a, b);
152 }
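// Illustrative sketch: equal elements yield an all-ones mask; NaN never compares equal (ordered comparison).
unittest
{
    __m128i R = cast(__m128i) _mm_cmpeq_ps(_mm_setr_ps(0, 1, 2, float.nan),
                                           _mm_setr_ps(0, 2, 2, float.nan));
    int[4] correct = [-1, 0, -1, 0];
    assert(R.array == correct);
}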
153 
154 /// Compare the lower single-precision (32-bit) floating-point elements in `a` and `b` for equality, 
155 /// and copy the upper 3 packed elements from `a` to the upper elements of result.
156 __m128 _mm_cmpeq_ss (__m128 a, __m128 b) pure @safe
157 {
158     return cast(__m128) cmpss!(FPComparison.oeq)(a, b);
159 }
160 
161 /// Compare packed single-precision (32-bit) floating-point elements in `a` and `b` for greater-than-or-equal.
162 __m128 _mm_cmpge_ps (__m128 a, __m128 b) pure @safe
163 {
164     return cast(__m128) cmpps!(FPComparison.oge)(a, b);
165 }
166 unittest
167 {
168     __m128i R = cast(__m128i) _mm_cmpge_ps(_mm_setr_ps(0, 1, -1, float.nan),
169                                            _mm_setr_ps(0, 0, 0, 0));
170     int[4] correct = [-1, -1, 0, 0];
171     assert(R.array == correct);
172 }
173 
174 /// Compare the lower single-precision (32-bit) floating-point elements in `a` and `b` for greater-than-or-equal, 
175 /// and copy the upper 3 packed elements from `a` to the upper elements of result.
176 __m128 _mm_cmpge_ss (__m128 a, __m128 b) pure @safe
177 {
178     return cast(__m128) cmpss!(FPComparison.oge)(a, b);
179 }
180 
181 /// Compare packed single-precision (32-bit) floating-point elements in `a` and `b` for greater-than.
182 __m128 _mm_cmpgt_ps (__m128 a, __m128 b) pure @safe
183 {
184     return cast(__m128) cmpps!(FPComparison.ogt)(a, b);
185 }
186 
187 /// Compare the lower single-precision (32-bit) floating-point elements in `a` and `b` for greater-than, 
188 /// and copy the upper 3 packed elements from `a` to the upper elements of result.
189 __m128 _mm_cmpgt_ss (__m128 a, __m128 b) pure @safe
190 {
191     return cast(__m128) cmpss!(FPComparison.ogt)(a, b);
192 }
193 
194 /// Compare packed single-precision (32-bit) floating-point elements in `a` and `b` for less-than-or-equal.
195 __m128 _mm_cmple_ps (__m128 a, __m128 b) pure @safe
196 {
197     return cast(__m128) cmpps!(FPComparison.ole)(a, b);
198 }
199 
200 /// Compare the lower single-precision (32-bit) floating-point elements in `a` and `b` for less-than-or-equal, 
201 /// and copy the upper 3 packed elements from `a` to the upper elements of result.
202 __m128 _mm_cmple_ss (__m128 a, __m128 b) pure @safe
203 {
204     return cast(__m128) cmpss!(FPComparison.ole)(a, b);
205 }
206 
207 /// Compare packed single-precision (32-bit) floating-point elements in `a` and `b` for less-than.
208 __m128 _mm_cmplt_ps (__m128 a, __m128 b) pure @safe
209 {
210     return cast(__m128) cmpps!(FPComparison.olt)(a, b);
211 }
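// Illustrative sketch: ordered less-than, so NaN yields 0.
unittest
{
    __m128i R = cast(__m128i) _mm_cmplt_ps(_mm_setr_ps(0, 1, -1, float.nan),
                                           _mm_setr_ps(0, 0, 0, 0));
    int[4] correct = [0, 0, -1, 0];
    assert(R.array == correct);
}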
212 
213 /// Compare the lower single-precision (32-bit) floating-point elements in `a` and `b` for less-than, 
214 /// and copy the upper 3 packed elements from `a` to the upper elements of result.
215 __m128 _mm_cmplt_ss (__m128 a, __m128 b) pure @safe
216 {
217     return cast(__m128) cmpss!(FPComparison.olt)(a, b);
218 }
219 
220 /// Compare packed single-precision (32-bit) floating-point elements in `a` and `b` for not-equal.
221 __m128 _mm_cmpneq_ps (__m128 a, __m128 b) pure @safe
222 {
223     return cast(__m128) cmpps!(FPComparison.une)(a, b);
224 }
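// Illustrative sketch: not-equal is an unordered comparison, so NaN yields an all-ones mask.
unittest
{
    __m128i R = cast(__m128i) _mm_cmpneq_ps(_mm_setr_ps(0, 1, 2, float.nan),
                                            _mm_setr_ps(0, 2, 2, float.nan));
    int[4] correct = [0, -1, 0, -1];
    assert(R.array == correct);
}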
225 
226 /// Compare the lower single-precision (32-bit) floating-point elements in `a` and `b` for not-equal, 
227 /// and copy the upper 3 packed elements from `a` to the upper elements of result.
228 __m128 _mm_cmpneq_ss (__m128 a, __m128 b) pure @safe
229 {
230     return cast(__m128) cmpss!(FPComparison.une)(a, b);
231 }
232 
233 /// Compare packed single-precision (32-bit) floating-point elements in `a` and `b` for not-greater-than-or-equal.
234 __m128 _mm_cmpnge_ps (__m128 a, __m128 b) pure @safe
235 {
236     return cast(__m128) cmpps!(FPComparison.ult)(a, b);
237 }
238 
239 /// Compare the lower single-precision (32-bit) floating-point elements in `a` and `b` for not-greater-than-or-equal, 
240 /// and copy the upper 3 packed elements from `a` to the upper elements of result.
241 __m128 _mm_cmpnge_ss (__m128 a, __m128 b) pure @safe
242 {
243     return cast(__m128) cmpss!(FPComparison.ult)(a, b);
244 }
245 
246 /// Compare packed single-precision (32-bit) floating-point elements in `a` and `b` for not-greater-than.
247 __m128 _mm_cmpngt_ps (__m128 a, __m128 b) pure @safe
248 {
249     return cast(__m128) cmpps!(FPComparison.ule)(a, b);
250 }
251 
252 /// Compare the lower single-precision (32-bit) floating-point elements in `a` and `b` for not-greater-than, 
253 /// and copy the upper 3 packed elements from `a` to the upper elements of result.
254 __m128 _mm_cmpngt_ss (__m128 a, __m128 b) pure @safe
255 {
256     return cast(__m128) cmpss!(FPComparison.ule)(a, b);
257 }
258 
259 /// Compare packed single-precision (32-bit) floating-point elements in `a` and `b` for not-less-than-or-equal.
260 __m128 _mm_cmpnle_ps (__m128 a, __m128 b) pure @safe
261 {
262     return cast(__m128) cmpps!(FPComparison.ugt)(a, b);
263 }
264 
265 /// Compare the lower single-precision (32-bit) floating-point elements in `a` and `b` for not-less-than-or-equal, 
266 /// and copy the upper 3 packed elements from `a` to the upper elements of result.
267 __m128 _mm_cmpnle_ss (__m128 a, __m128 b) pure @safe
268 {
269     return cast(__m128) cmpss!(FPComparison.ugt)(a, b);
270 }
271 
272 /// Compare packed single-precision (32-bit) floating-point elements in `a` and `b` for not-less-than.
273 __m128 _mm_cmpnlt_ps (__m128 a, __m128 b) pure @safe
274 {
275     return cast(__m128) cmpps!(FPComparison.uge)(a, b);
276 }
277 
278 /// Compare the lower single-precision (32-bit) floating-point elements in `a` and `b` for not-less-than, 
279 /// and copy the upper 3 packed elements from `a` to the upper elements of result.
280 __m128 _mm_cmpnlt_ss (__m128 a, __m128 b) pure @safe
281 {
282     return cast(__m128) cmpss!(FPComparison.uge)(a, b);
283 }
284 
285 /// Compare packed single-precision (32-bit) floating-point elements in `a` and `b` to see if neither is NaN.
286 __m128 _mm_cmpord_ps (__m128 a, __m128 b) pure @safe
287 {
288     return cast(__m128) cmpps!(FPComparison.ord)(a, b);
289 }
290 
291 /// Compare the lower single-precision (32-bit) floating-point elements in `a` and `b` to see if neither is NaN, 
292 /// and copy the upper 3 packed elements from `a` to the upper elements of result.
293 __m128 _mm_cmpord_ss (__m128 a, __m128 b) pure @safe
294 {
295     return cast(__m128) cmpss!(FPComparison.ord)(a, b);
296 }
297 
298 /// Compare packed single-precision (32-bit) floating-point elements in `a` and `b` to see if either is NaN.
299 __m128 _mm_cmpunord_ps (__m128 a, __m128 b) pure @safe
300 {
301     return cast(__m128) cmpps!(FPComparison.uno)(a, b);
302 }
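// Illustrative sketch: ordered is true when neither operand is NaN, unordered when either is.
unittest
{
    __m128 A = _mm_setr_ps(0, 1, float.nan, float.nan);
    __m128 B = _mm_setr_ps(0, float.nan, 2, float.nan);
    __m128i ord = cast(__m128i) _mm_cmpord_ps(A, B);
    __m128i uno = cast(__m128i) _mm_cmpunord_ps(A, B);
    int[4] correctOrd = [-1, 0, 0, 0];
    int[4] correctUno = [0, -1, -1, -1];
    assert(ord.array == correctOrd);
    assert(uno.array == correctUno);
}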
303 
/// Compare the lower single-precision (32-bit) floating-point elements in `a` and `b` to see if either is NaN, 
305 /// and copy the upper 3 packed elements from `a` to the upper elements of result.
306 __m128 _mm_cmpunord_ss (__m128 a, __m128 b) pure @safe
307 {
308     return cast(__m128) cmpss!(FPComparison.uno)(a, b);
309 }
310 
// Note: we've reversed clang and GCC behaviour with regard to EFLAGS.
// Some of these comparisons yield true for NaNs, others don't.
313 
314 /// Compare the lower single-precision (32-bit) floating-point element in `a` and `b` for equality, 
315 /// and return the boolean result (0 or 1).
316 int _mm_comieq_ss (__m128 a, __m128 b) pure @safe // comiss + sete
317 {
318     return comss!(FPComparison.ueq)(a, b); // yields true for NaN!
319 }
320 
321 /// Compare the lower single-precision (32-bit) floating-point element in `a` and `b` for greater-than-or-equal, 
322 /// and return the boolean result (0 or 1).
323 int _mm_comige_ss (__m128 a, __m128 b) pure @safe // comiss + setae
324 {
325     return comss!(FPComparison.oge)(a, b);
326 }
327 
328 /// Compare the lower single-precision (32-bit) floating-point element in `a` and `b` for greater-than, 
329 /// and return the boolean result (0 or 1).
330 int _mm_comigt_ss (__m128 a, __m128 b) pure @safe // comiss + seta
331 {
332     return comss!(FPComparison.ogt)(a, b);
333 }
334 
335 /// Compare the lower single-precision (32-bit) floating-point element in `a` and `b` for less-than-or-equal, 
336 /// and return the boolean result (0 or 1).
337 int _mm_comile_ss (__m128 a, __m128 b) pure @safe // comiss + setbe
338 {
339     return comss!(FPComparison.ule)(a, b); // yields true for NaN!
340 }
341 
342 /// Compare the lower single-precision (32-bit) floating-point element in `a` and `b` for less-than, 
343 /// and return the boolean result (0 or 1).
344 int _mm_comilt_ss (__m128 a, __m128 b) pure @safe // comiss + setb
345 {
346     return comss!(FPComparison.ult)(a, b); // yields true for NaN!
347 }
348 
349 /// Compare the lower single-precision (32-bit) floating-point element in `a` and `b` for not-equal, 
350 /// and return the boolean result (0 or 1).
351 int _mm_comineq_ss (__m128 a, __m128 b) pure @safe // comiss + setne
352 {
353     return comss!(FPComparison.one)(a, b);
354 }
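// Illustrative sketch of the comi family on ordinary (non-NaN) operands; NaN behaviour is
// intentionally not asserted here, see the note above about EFLAGS.
unittest
{
    __m128 A = _mm_setr_ps(2.0f, 0, 0, 0);
    __m128 B = _mm_setr_ps(3.0f, 0, 0, 0);
    assert(_mm_comieq_ss(A, A) == 1);
    assert(_mm_comieq_ss(A, B) == 0);
    assert(_mm_comige_ss(B, A) == 1);
    assert(_mm_comigt_ss(B, A) == 1);
    assert(_mm_comile_ss(A, B) == 1);
    assert(_mm_comilt_ss(A, B) == 1);
    assert(_mm_comineq_ss(A, B) == 1);
}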
355 
356 /// Convert packed signed 32-bit integers in `b` to packed single-precision (32-bit) 
357 /// floating-point elements, store the results in the lower 2 elements, 
358 /// and copy the upper 2 packed elements from `a` to the upper elements of result.
359 alias _mm_cvt_pi2ps = _mm_cvtpi32_ps;
360 
361 /// Convert 2 lower packed single-precision (32-bit) floating-point elements in `a` 
362 /// to packed 32-bit integers.
363 __m64 _mm_cvt_ps2pi (__m128 a) @safe
364 {
365     return to_m64(_mm_cvtps_epi32(a));
366 }
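// Illustrative sketch: converts the two lower lanes, assuming the default round-to-nearest mode.
unittest
{
    int2 R = cast(int2) _mm_cvt_ps2pi(_mm_setr_ps(1.0f, -2.25f, 3.0f, 4.0f));
    int[2] correct = [1, -2];
    assert(R.array == correct);
}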
367 
368 /// Convert the signed 32-bit integer `b` to a single-precision (32-bit) floating-point element, 
369 /// store the result in the lower element, and copy the upper 3 packed elements from `a` to the 
370 /// upper elements of the result.
371 __m128 _mm_cvt_si2ss (__m128 v, int x) pure @trusted
372 {
373     v.ptr[0] = cast(float)x;
374     return v;
375 }
376 unittest
377 {
378     __m128 a = _mm_cvt_si2ss(_mm_set1_ps(0.0f), 42);
379     assert(a.array == [42f, 0, 0, 0]);
380 }
381 
382 /// Convert packed 16-bit integers in `a` to packed single-precision (32-bit) floating-point elements.
383 __m128 _mm_cvtpi16_ps (__m64 a) pure @safe
384 {
385     __m128i ma = to_m128i(a);
386     ma = _mm_unpacklo_epi16(ma, _mm_setzero_si128()); // Zero-extend to 32-bit
387     ma = _mm_srai_epi32(_mm_slli_epi32(ma, 16), 16); // Replicate sign bit
388     return _mm_cvtepi32_ps(ma);
389 }
390 unittest
391 {
392     __m64 A = _mm_setr_pi16(-1, 2, -3, 4);
393     __m128 R = _mm_cvtpi16_ps(A);
394     float[4] correct = [-1.0f, 2.0f, -3.0f, 4.0f];
395     assert(R.array == correct);
396 }
397 
398 /// Convert packed signed 32-bit integers in `b` to packed single-precision (32-bit) 
399 /// floating-point elements, store the results in the lower 2 elements, 
400 /// and copy the upper 2 packed elements from `a` to the upper elements of result.
401 __m128 _mm_cvtpi32_ps (__m128 a, __m64 b) pure @trusted
402 {
403     __m128 fb = _mm_cvtepi32_ps(to_m128i(b));
404     a.ptr[0] = fb.array[0];
405     a.ptr[1] = fb.array[1];
406     return a;
407 }
408 unittest
409 {
410     __m128 R = _mm_cvtpi32_ps(_mm_set1_ps(4.0f), _mm_setr_pi32(1, 2));
411     float[4] correct = [1.0f, 2.0f, 4.0f, 4.0f];
412     assert(R.array == correct);
413 }
414 
415 /// Convert packed signed 32-bit integers in `a` to packed single-precision (32-bit) floating-point elements, 
/// store the results in the lower 2 elements, then convert the packed signed 32-bit integers in `b` to 
/// single-precision (32-bit) floating-point elements, and store the results in the upper 2 elements.
418 __m128 _mm_cvtpi32x2_ps (__m64 a, __m64 b) pure @trusted
419 {
420     long2 l;
421     l.ptr[0] = a.array[0];
422     l.ptr[1] = b.array[0];
423     return _mm_cvtepi32_ps(cast(__m128i)l);
424 }
425 unittest
426 {
427     __m64 A = _mm_setr_pi32(-45, 128);
428     __m64 B = _mm_setr_pi32(0, 1000);
429     __m128 R = _mm_cvtpi32x2_ps(A, B);
430     float[4] correct = [-45.0f, 128.0f, 0.0f, 1000.0f];
431     assert(R.array == correct);
432 }
433 
434 /// Convert the lower packed 8-bit integers in `a` to packed single-precision (32-bit) floating-point elements.
435 __m128 _mm_cvtpi8_ps (__m64 a) pure @safe
436 {
437     __m128i b = to_m128i(a); 
438 
439     // Zero extend to 32-bit
440     b = _mm_unpacklo_epi8(b, _mm_setzero_si128());
441     b = _mm_unpacklo_epi16(b, _mm_setzero_si128());
442 
443     // Replicate sign bit
444     b = _mm_srai_epi32(_mm_slli_epi32(b, 24), 24); // Replicate sign bit
445     return _mm_cvtepi32_ps(b);
446 }
447 unittest
448 {
449     __m64 A = _mm_setr_pi8(-1, 2, -3, 4, 0, 0, 0, 0);
450     __m128 R = _mm_cvtpi8_ps(A);
451     float[4] correct = [-1.0f, 2.0f, -3.0f, 4.0f];
452     assert(R.array == correct);
453 }
454 
455 /// Convert packed single-precision (32-bit) floating-point elements in `a` to packed 16-bit integers.
456 /// Note: this intrinsic will generate 0x7FFF, rather than 0x8000, for input values between 0x7FFF and 0x7FFFFFFF.
457 __m64 _mm_cvtps_pi16 (__m128 a) @safe
458 {
    // The C++ version of this intrinsic converts to 32-bit integers, then uses packssdw,
    // which means the 16-bit integers are saturated.
461     __m128i b = _mm_cvtps_epi32(a);
462     b = _mm_packs_epi32(b, b);
463     return to_m64(b);
464 }
465 unittest
466 {
467     __m128 A = _mm_setr_ps(-1.0f, 2.0f, -33000.0f, 70000.0f);
468     short4 R = cast(short4) _mm_cvtps_pi16(A);
469     short[4] correct = [-1, 2, -32768, 32767];
470     assert(R.array == correct);
471 }
472 
473 /// Convert packed single-precision (32-bit) floating-point elements in `a` to packed 32-bit integers.
474 __m64 _mm_cvtps_pi32 (__m128 a) @safe
475 {
476     return to_m64(_mm_cvtps_epi32(a));
477 }
478 unittest
479 {
    __m128 A = _mm_setr_ps(-33000.0f, 70000.0f, -1.0f, 2.0f);
481     int2 R = cast(int2) _mm_cvtps_pi32(A);
482     int[2] correct = [-33000, 70000];
483     assert(R.array == correct);
484 }
485 
486 /// Convert packed single-precision (32-bit) floating-point elements in `a` to packed 8-bit integers, 
487 /// and store the results in lower 4 elements. 
488 /// Note: this intrinsic will generate 0x7F, rather than 0x80, for input values between 0x7F and 0x7FFFFFFF.
489 __m64 _mm_cvtps_pi8 (__m128 a) @safe
490 {
    // The C++ version of this intrinsic converts to 32-bit integers, then uses packssdw + packsswb,
    // which means the 8-bit integers are saturated.
493     __m128i b = _mm_cvtps_epi32(a);
494     b = _mm_packs_epi32(b, _mm_setzero_si128());
495     b = _mm_packs_epi16(b, _mm_setzero_si128());
496     return to_m64(b);
497 }
498 unittest
499 {
500     __m128 A = _mm_setr_ps(-1.0f, 2.0f, -129.0f, 128.0f);
501     byte8 R = cast(byte8) _mm_cvtps_pi8(A);
502     byte[8] correct = [-1, 2, -128, 127, 0, 0, 0, 0];
503     assert(R.array == correct);
504 }
505 
506 /// Convert packed unsigned 16-bit integers in `a` to packed single-precision (32-bit) floating-point elements.
507 __m128 _mm_cvtpu16_ps (__m64 a) pure @safe
508 {
509     __m128i ma = to_m128i(a);
510     ma = _mm_unpacklo_epi16(ma, _mm_setzero_si128()); // Zero-extend to 32-bit
511     return _mm_cvtepi32_ps(ma);
512 }
513 unittest
514 {
515     __m64 A = _mm_setr_pi16(-1, 2, -3, 4);
516     __m128 R = _mm_cvtpu16_ps(A);
517     float[4] correct = [65535.0f, 2.0f, 65533.0f, 4.0f];
518     assert(R.array == correct);
519 }
520 
521 /// Convert the lower packed unsigned 8-bit integers in `a` to packed single-precision (32-bit) floating-point element.
522 __m128 _mm_cvtpu8_ps (__m64 a) pure @safe
523 {
524     __m128i b = to_m128i(a); 
525 
526     // Zero extend to 32-bit
527     b = _mm_unpacklo_epi8(b, _mm_setzero_si128());
528     b = _mm_unpacklo_epi16(b, _mm_setzero_si128());
529     return _mm_cvtepi32_ps(b);
530 }
531 unittest
532 {
533     __m64 A = _mm_setr_pi8(-1, 2, -3, 4, 0, 0, 0, 0);
534     __m128 R = _mm_cvtpu8_ps(A);
535     float[4] correct = [255.0f, 2.0f, 253.0f, 4.0f];
536     assert(R.array == correct);
537 }
538 
539 /// Convert the signed 32-bit integer `b` to a single-precision (32-bit) floating-point element, 
540 /// store the result in the lower element, and copy the upper 3 packed elements from `a` to the 
541 /// upper elements of result.
542 __m128 _mm_cvtsi32_ss(__m128 v, int x) pure @trusted
543 {
544     v.ptr[0] = cast(float)x;
545     return v;
546 }
547 unittest
548 {
549     __m128 a = _mm_cvtsi32_ss(_mm_set1_ps(0.0f), 42);
550     assert(a.array == [42.0f, 0, 0, 0]);
551 }
552 
553 
554 /// Convert the signed 64-bit integer `b` to a single-precision (32-bit) floating-point element, 
555 /// store the result in the lower element, and copy the upper 3 packed elements from `a` to the 
556 /// upper elements of result.
557 __m128 _mm_cvtsi64_ss(__m128 v, long x) pure @trusted
558 {
559     v.ptr[0] = cast(float)x;
560     return v;
561 }
562 unittest
563 {
564     __m128 a = _mm_cvtsi64_ss(_mm_set1_ps(0.0f), 42);
565     assert(a.array == [42.0f, 0, 0, 0]);
566 }
567 
568 /// Take the lower single-precision (32-bit) floating-point element of `a`.
569 float _mm_cvtss_f32(__m128 a) pure @safe
570 {
571     return a.array[0];
572 }
573 
574 static if (LDC_with_SSE1)
575 {
576     /// Convert the lower single-precision (32-bit) floating-point element in `a` to a 32-bit integer.
577     int _mm_cvtss_si32 (__m128 a) @safe // PERF GDC
578     {
579         return __builtin_ia32_cvtss2si(a);
580     }
581 }
582 else
583 {
584     /// Convert the lower single-precision (32-bit) floating-point element in `a` to a 32-bit integer.
585     int _mm_cvtss_si32 (__m128 a) @safe
586     {
587         return convertFloatToInt32UsingMXCSR(a.array[0]);
588     }
589 }
590 unittest
591 {
592     assert(1 == _mm_cvtss_si32(_mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f)));
593 }
594 
595 version(LDC)
596 {
597     version(X86_64)
598     {
599         /// Convert the lower single-precision (32-bit) floating-point element in `a` to a 64-bit integer.
600         long _mm_cvtss_si64 (__m128 a) @safe
601         {
602             return __builtin_ia32_cvtss2si64(a);
603         }
604     }
605     else
606     {
607         /// Convert the lower single-precision (32-bit) floating-point element in `a` to a 64-bit integer.
608         long _mm_cvtss_si64 (__m128 a) @safe
609         {
610             // Note: __builtin_ia32_cvtss2si64 crashes LDC in 32-bit
611             return convertFloatToInt64UsingMXCSR(a.array[0]);
612         }
613     }
614 }
615 else
616 {
617     /// Convert the lower single-precision (32-bit) floating-point element in `a` to a 64-bit integer.
618     long _mm_cvtss_si64 (__m128 a) @safe
619     {
620         return convertFloatToInt64UsingMXCSR(a.array[0]);
621     }
622 }
623 unittest
624 {
625     assert(1 == _mm_cvtss_si64(_mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f)));
626 
627     uint savedRounding = _MM_GET_ROUNDING_MODE();
628 
629     _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST);
630     assert(-86186 == _mm_cvtss_si64(_mm_set1_ps(-86186.49f)));
631 
632     _MM_SET_ROUNDING_MODE(_MM_ROUND_DOWN);
633     assert(-86187 == _mm_cvtss_si64(_mm_set1_ps(-86186.1f)));
634 
635     _MM_SET_ROUNDING_MODE(_MM_ROUND_UP);
636     assert(86187 == _mm_cvtss_si64(_mm_set1_ps(86186.1f)));
637 
638     _MM_SET_ROUNDING_MODE(_MM_ROUND_TOWARD_ZERO);
639     assert(-86186 == _mm_cvtss_si64(_mm_set1_ps(-86186.9f)));
640 
641     _MM_SET_ROUNDING_MODE(savedRounding);
642 }
643 
644 
645 /// Convert the lower single-precision (32-bit) floating-point element in `a` to a 32-bit 
646 /// integer with truncation.
647 int _mm_cvtt_ss2si (__m128 a) pure @safe
648 {
649     // x86: cvttss2si always generated, even in -O0
650     return cast(int)(a.array[0]);
651 }
652 alias _mm_cvttss_si32 = _mm_cvtt_ss2si; ///ditto
653 unittest
654 {
655     assert(1 == _mm_cvtt_ss2si(_mm_setr_ps(1.9f, 2.0f, 3.0f, 4.0f)));
656 }
657 
658 
659 /// Convert packed single-precision (32-bit) floating-point elements in `a` to packed 32-bit 
660 /// integers with truncation.
661 __m64 _mm_cvtt_ps2pi (__m128 a) pure @safe
662 {
663     return to_m64(_mm_cvttps_epi32(a));
664 }
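// Illustrative sketch: truncation ignores the MXCSR rounding mode.
unittest
{
    int2 R = cast(int2) _mm_cvtt_ps2pi(_mm_setr_ps(1.9f, -2.9f, 3.0f, 4.0f));
    int[2] correct = [1, -2];
    assert(R.array == correct);
}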
665 
666 /// Convert the lower single-precision (32-bit) floating-point element in `a` to a 64-bit 
667 /// integer with truncation.
668 long _mm_cvttss_si64 (__m128 a) pure @safe
669 {
670     return cast(long)(a.array[0]);
671 }
672 unittest
673 {
674     assert(1 == _mm_cvttss_si64(_mm_setr_ps(1.9f, 2.0f, 3.0f, 4.0f)));
675 }
676 
677 /// Divide packed single-precision (32-bit) floating-point elements in `a` by packed elements in `b`.
678 __m128 _mm_div_ps(__m128 a, __m128 b) pure @safe
679 {
680     return a / b;
681 }
682 unittest
683 {
684     __m128 a = [1.5f, -2.0f, 3.0f, 1.0f];
685     a = _mm_div_ps(a, a);
686     float[4] correct = [1.0f, 1.0f, 1.0f, 1.0f];
687     assert(a.array == correct);
688 }
689 
690 /// Divide the lower single-precision (32-bit) floating-point element in `a` by the lower 
691 /// single-precision (32-bit) floating-point element in `b`, store the result in the lower 
692 /// element of result, and copy the upper 3 packed elements from `a` to the upper elements of result.
693 __m128 _mm_div_ss(__m128 a, __m128 b) pure @safe
694 {
695     static if (GDC_with_SSE)
696         return __builtin_ia32_divss(a, b);
697     else
698     {
699         a[0] /= b[0];
700         return a;
701     }
702 }
703 unittest
704 {
705     __m128 a = [1.5f, -2.0f, 3.0f, 1.0f];
706     a = _mm_div_ss(a, a);
707     float[4] correct = [1.0f, -2.0, 3.0f, 1.0f];
708     assert(a.array == correct);
709 }
710 
711 /// Extract a 16-bit unsigned integer from `a`, selected with `imm8`. Zero-extended.
712 int _mm_extract_pi16 (__m64 a, int imm8)
713 {
714     short4 sa = cast(short4)a;
715     return cast(ushort)(sa.array[imm8]);
716 }
717 unittest
718 {
719     __m64 A = _mm_setr_pi16(-1, 6, 0, 4);
720     assert(_mm_extract_pi16(A, 0) == 65535);
721     assert(_mm_extract_pi16(A, 1) == 6);
722     assert(_mm_extract_pi16(A, 2) == 0);
723     assert(_mm_extract_pi16(A, 3) == 4);
724 }
725 
726 /// Free aligned memory that was allocated with `_mm_malloc`.
727 void _mm_free(void * mem_addr) @trusted
728 {
729     // support for free(NULL)
730     if (mem_addr is null)
731         return;
732 
    // Technically we don't need to store size and alignment in the chunk, but we do in case we
734     // have to implement _mm_realloc
735 
736     size_t pointerSize = (void*).sizeof;
737     void** rawLocation = cast(void**)(cast(char*)mem_addr - size_t.sizeof);
738     size_t* alignmentLocation = cast(size_t*)(cast(char*)mem_addr - 3 * pointerSize);
739     size_t alignment = *alignmentLocation;
740     assert(alignment != 0);
741     assert(isPointerAligned(mem_addr, alignment));
742     free(*rawLocation);
743 }
744 
745 /// Get the exception mask bits from the MXCSR control and status register. 
746 /// The exception mask may contain any of the following flags: `_MM_MASK_INVALID`, 
747 /// `_MM_MASK_DIV_ZERO`, `_MM_MASK_DENORM`, `_MM_MASK_OVERFLOW`, `_MM_MASK_UNDERFLOW`, `_MM_MASK_INEXACT`.
/// Note: won't correspond to reality on non-x86, where MXCSR is emulated.
749 uint _MM_GET_EXCEPTION_MASK() @safe
750 {
751     return _mm_getcsr() & _MM_MASK_MASK;
752 }
753 
754 /// Get the exception state bits from the MXCSR control and status register. 
755 /// The exception state may contain any of the following flags: `_MM_EXCEPT_INVALID`, 
756 /// `_MM_EXCEPT_DIV_ZERO`, `_MM_EXCEPT_DENORM`, `_MM_EXCEPT_OVERFLOW`, `_MM_EXCEPT_UNDERFLOW`, `_MM_EXCEPT_INEXACT`.
/// Note: won't correspond to reality on non-x86, where MXCSR is emulated; no exception is ever reported there.
758 uint _MM_GET_EXCEPTION_STATE() @safe
759 {
760     return _mm_getcsr() & _MM_EXCEPT_MASK;
761 }
762 
/// Get the flush-to-zero bits from the MXCSR control and status register. 
/// The flush-to-zero mode is one of the following flags: `_MM_FLUSH_ZERO_ON` or `_MM_FLUSH_ZERO_OFF`.
765 uint _MM_GET_FLUSH_ZERO_MODE() @safe
766 {
767     return _mm_getcsr() & _MM_FLUSH_ZERO_MASK;
768 }
769 
/// Get the rounding mode bits from the MXCSR control and status register. The rounding mode is 
/// one of the following values: `_MM_ROUND_NEAREST`, `_MM_ROUND_DOWN`, `_MM_ROUND_UP`, `_MM_ROUND_TOWARD_ZERO`.
772 uint _MM_GET_ROUNDING_MODE() @safe
773 {
774     return _mm_getcsr() & _MM_ROUND_MASK;
775 }
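// Illustrative sketch: the masked value is always one of the four rounding-mode constants.
unittest
{
    uint roundingMode = _MM_GET_ROUNDING_MODE();
    assert(roundingMode == _MM_ROUND_NEAREST
        || roundingMode == _MM_ROUND_DOWN
        || roundingMode == _MM_ROUND_UP
        || roundingMode == _MM_ROUND_TOWARD_ZERO);
}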
776 
777 /// Get the unsigned 32-bit value of the MXCSR control and status register.
778 /// Note: this is emulated on ARM, because there is no MXCSR register then.
779 uint _mm_getcsr() @trusted
780 {
781     static if (LDC_with_ARM)
782     {
783         // Note: we convert the ARM FPSCR into a x86 SSE control word.
784         // However, only rounding mode and flush to zero are actually set.
785         // The returned control word will have all exceptions masked, and no exception detected.
786 
787         uint fpscr = arm_get_fpcr();
788 
789         uint cw = 0; // No exception detected
790         if (fpscr & _MM_FLUSH_ZERO_MASK_ARM)
791         {
            // ARM has a single flush-to-zero flag, which maps to both x86 bits
            // (flush-to-zero and "denormals are zeros").
794             // https://developer.arm.com/documentation/dui0473/c/neon-and-vfp-programming/the-effects-of-using-flush-to-zero-mode
795             cw |= _MM_FLUSH_ZERO_ON;
796             cw |= 0x40; // set "denormals are zeros"
797         } 
        cw |= _MM_MASK_MASK; // all exceptions masked
799 
800         // Rounding mode
801         switch(fpscr & _MM_ROUND_MASK_ARM)
802         {
803             default:
804             case _MM_ROUND_NEAREST_ARM:     cw |= _MM_ROUND_NEAREST;     break;
805             case _MM_ROUND_DOWN_ARM:        cw |= _MM_ROUND_DOWN;        break;
806             case _MM_ROUND_UP_ARM:          cw |= _MM_ROUND_UP;          break;
807             case _MM_ROUND_TOWARD_ZERO_ARM: cw |= _MM_ROUND_TOWARD_ZERO; break;
808         }
809         return cw;
810     }
811     else version(GNU)
812     {
813         static if (GDC_with_SSE)
814         {
815             return __builtin_ia32_stmxcsr();
816         }
817         else version(X86)
818         {
819             uint sseRounding = 0;
820             asm pure nothrow @nogc @trusted
821             {
822                 "stmxcsr %0;\n" 
823                   : "=m" (sseRounding)
824                   : 
825                   : ;
826             }
827             return sseRounding;
828         }
829         else
830             static assert(false);
831     }
832     else version (InlineX86Asm)
833     {
834         uint controlWord;
835         asm nothrow @nogc pure @safe
836         {
837             stmxcsr controlWord;
838         }
839         return controlWord;
840     }
841     else
842         static assert(0, "Not yet supported");
843 }
844 unittest
845 {
846     uint csr = _mm_getcsr();
847 }
848 
849 /// Insert a 16-bit integer `i` inside `a` at the location specified by `imm8`.
850 __m64 _mm_insert_pi16 (__m64 v, int i, int imm8) pure @trusted
851 {
852     short4 r = cast(short4)v;
853     r.ptr[imm8 & 3] = cast(short)i;
854     return cast(__m64)r;
855 }
856 unittest
857 {
858     __m64 A = _mm_set_pi16(3, 2, 1, 0);
859     short4 R = cast(short4) _mm_insert_pi16(A, 42, 1 | 4);
860     short[4] correct = [0, 42, 2, 3];
861     assert(R.array == correct);
862 }
863 
864 /// Load 128-bits (composed of 4 packed single-precision (32-bit) floating-point elements) from memory.
/// `p` must be aligned on a 16-byte boundary or a general-protection exception may be generated.
866 __m128 _mm_load_ps(const(float)*p) pure @trusted // TODO shouldn't be trusted
867 {
868     return *cast(__m128*)p;
869 }
870 unittest
871 {
872     static immutable align(16) float[4] correct = [1.0f, 2.0f, 3.0f, 4.0f];
873     __m128 A = _mm_load_ps(correct.ptr);
874     assert(A.array == correct);
875 }
876 
877 /// Load a single-precision (32-bit) floating-point element from memory into all elements.
878 __m128 _mm_load_ps1(const(float)*p) pure @trusted
879 {
880     return __m128(*p);
881 }
882 unittest
883 {
884     float n = 2.5f;
885     float[4] correct = [2.5f, 2.5f, 2.5f, 2.5f];
886     __m128 A = _mm_load_ps1(&n);
887     assert(A.array == correct);
888 }
889 
/// Load a single-precision (32-bit) floating-point element from memory into the lower element of result, and zero the upper 3 
891 /// elements. `mem_addr` does not need to be aligned on any particular boundary.
892 __m128 _mm_load_ss (const(float)* mem_addr) pure @trusted
893 {
894     __m128 r;
895     r.ptr[0] = *mem_addr;
896     r.ptr[1] = 0;
897     r.ptr[2] = 0;
898     r.ptr[3] = 0;
899     return r;
900 }
901 unittest
902 {
903     float n = 2.5f;
904     float[4] correct = [2.5f, 0.0f, 0.0f, 0.0f];
905     __m128 A = _mm_load_ss(&n);
906     assert(A.array == correct);
907 }
908 
909 /// Load a single-precision (32-bit) floating-point element from memory into all elements.
910 alias _mm_load1_ps = _mm_load_ps1;
911 
912 /// Load 2 single-precision (32-bit) floating-point elements from memory into the upper 2 elements of result, 
/// and copy the lower 2 elements from `a` to result. `mem_addr` does not need to be aligned on any particular boundary.
914 __m128 _mm_loadh_pi (__m128 a, const(__m64)* mem_addr) pure @trusted
915 {
916     // x86: movlhps generated since LDC 1.9.0 -O1
917     long2 la = cast(long2)a;
918     la.ptr[1] = (*mem_addr).array[0];
919     return cast(__m128)la;
920 }
921 unittest
922 {
923     __m128 A = _mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f);
924     __m128 B = _mm_setr_ps(5.0f, 6.0f, 7.0f, 8.0f);
925     __m64 M = to_m64(cast(__m128i)B);
926      __m128 R = _mm_loadh_pi(A, &M);
927     float[4] correct = [1.0f, 2.0f, 5.0f, 6.0f];
928     assert(R.array == correct);
929 }
930 
931 /// Load 2 single-precision (32-bit) floating-point elements from memory into the lower 2 elements of result, 
932 /// and copy the upper 2 elements from `a` to result. `mem_addr` does not need to be aligned on any particular boundary.
933 __m128 _mm_loadl_pi (__m128 a, const(__m64)* mem_addr) pure @trusted
934 {
    // x86: movlpd/movlps generated with all LDC -O1
936     long2 la = cast(long2)a;
937     la.ptr[0] = (*mem_addr).array[0];
938     return cast(__m128)la;
939 }
940 unittest
941 {
942     __m128 A = _mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f);
943     __m128 B = _mm_setr_ps(5.0f, 6.0f, 7.0f, 8.0f);
944     __m64 M = to_m64(cast(__m128i)B);
945      __m128 R = _mm_loadl_pi(A, &M);
946     float[4] correct = [5.0f, 6.0f, 3.0f, 4.0f];
947     assert(R.array == correct);
948 }
949 
950 /// Load 4 single-precision (32-bit) floating-point elements from memory in reverse order. 
951 /// `mem_addr` must be aligned on a 16-byte boundary or a general-protection exception may be generated.
952 __m128 _mm_loadr_ps (const(float)* mem_addr) pure @trusted // TODO shouldn't be trusted
953 {
    __m128* aligned = cast(__m128*)mem_addr; // x86: movaps + shufps since LDC 1.0.0 -O1
955     __m128 a = *aligned;
956     __m128 r;
957     r.ptr[0] = a.array[3];
958     r.ptr[1] = a.array[2];
959     r.ptr[2] = a.array[1];
960     r.ptr[3] = a.array[0];
961     return r;
962 }
963 unittest
964 {
965     align(16) static immutable float[4] arr = [ 1.0f, 2.0f, 3.0f, 8.0f ];
966     __m128 A = _mm_loadr_ps(arr.ptr);
967     float[4] correct = [ 8.0f, 3.0f, 2.0f, 1.0f ];
968     assert(A.array == correct);
969 }
970 
971 /// Load 128-bits (composed of 4 packed single-precision (32-bit) floating-point elements) from memory into dst. 
972 /// `p` does not need to be aligned on any particular boundary.
973 __m128 _mm_loadu_ps(const(float)*p) pure @safe
974 {
975     return loadUnaligned!(__m128)(p);
976 }
977 unittest
978 {
979     align(16) static immutable float[5] arr = [ 1.0f, 2.0f, 3.0f, 8.0f, 9.0f ];  // force unaligned load
980     __m128 A = _mm_loadu_ps(&arr[1]);
981     float[4] correct = [ 2.0f, 3.0f, 8.0f, 9.0f ];
982     assert(A.array == correct);
983 }
984 
985 /// Load unaligned 16-bit integer from memory into the first element, fill with zeroes otherwise.
986 __m128i _mm_loadu_si16(const(void)* mem_addr) pure @trusted
987 {
988     short r = *cast(short*)(mem_addr);
989     short8 result = [0, 0, 0, 0, 0, 0, 0, 0];
990     result.ptr[0] = r;
991     return cast(__m128i)result;
992 }
993 unittest
994 {
995     short r = 13;
996     short8 A = cast(short8) _mm_loadu_si16(&r);
997     short[8] correct = [13, 0, 0, 0, 0, 0, 0, 0];
998     assert(A.array == correct);
999 }
1000 
1001 /// Load unaligned 64-bit integer from memory into the first element of result.
1002 /// Upper 64-bit is zeroed.
1003 __m128i _mm_loadu_si64(const(void)* mem_addr) pure @trusted
1004 {
1005     long r = *cast(long*)(mem_addr);
1006     long2 result = [0, 0];
1007     result.ptr[0] = r;
1008     return cast(__m128i)result;
1009 }
1010 unittest
1011 {
1012     long r = 446446446446;
1013     long2 A = cast(long2) _mm_loadu_si64(&r);
1014     long[2] correct = [446446446446, 0];
1015     assert(A.array == correct);
1016 }
1017 
/// Allocate `size` bytes of memory, aligned to the alignment specified in `alignment`,
1019 /// and return a pointer to the allocated memory. `_mm_free` should be used to free
1020 /// memory that is allocated with `_mm_malloc`.
1021 void* _mm_malloc(size_t size, size_t alignment) @trusted
1022 {
1023     assert(alignment != 0);
1024     size_t request = requestedSize(size, alignment);
1025     void* raw = malloc(request);
1026     if (request > 0 && raw == null) // malloc(0) can validly return anything
1027         onOutOfMemoryError();
1028     return storeRawPointerPlusInfo(raw, size, alignment); // PERF: no need to store size
1029 }
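// Illustrative sketch: the returned pointer honours the requested alignment, and `_mm_free(null)` is a no-op.
unittest
{
    void* p = _mm_malloc(64, 16);
    assert(p !is null);
    assert((cast(size_t)p & 15) == 0); // 16-byte aligned
    (cast(ubyte*)p)[0..64] = 0;        // the whole allocation is writable
    _mm_free(p);
    _mm_free(null);
}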
1030 
/// Conditionally store 8-bit integer elements from `a` into memory using `mask` (elements are not stored when the highest 
1032 /// bit is not set in the corresponding element) and a non-temporal memory hint.
1033 void _mm_maskmove_si64 (__m64 a, __m64 mask, char* mem_addr) @trusted
1034 {
1035     // this works since mask is zero-extended
1036     return _mm_maskmoveu_si128 (to_m128i(a), to_m128i(mask), mem_addr);
1037 }
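// Illustrative sketch: only bytes whose mask element has its most significant bit set are stored.
unittest
{
    ubyte[8] dst = [42, 42, 42, 42, 42, 42, 42, 42];
    __m64 A = _mm_setr_pi8(1, 2, 3, 4, 5, 6, 7, 8);
    __m64 M = _mm_setr_pi8(cast(byte)0x80, 0, cast(byte)0x80, 0, 0, 0, 0, 0);
    _mm_maskmove_si64(A, M, cast(char*) dst.ptr);
    ubyte[8] correct = [1, 42, 3, 42, 42, 42, 42, 42];
    assert(dst == correct);
}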
1038 
1039 deprecated("Use _mm_maskmove_si64 instead") alias _m_maskmovq = _mm_maskmove_si64;///
1040 
1041 /// Compare packed signed 16-bit integers in `a` and `b`, and return packed maximum value.
1042 __m64 _mm_max_pi16 (__m64 a, __m64 b) pure @safe
1043 {
1044     return to_m64(_mm_max_epi16(to_m128i(a), to_m128i(b)));
1045 }
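// Illustrative sketch: element-wise signed 16-bit maximum.
unittest
{
    __m64 A = _mm_setr_pi16(-1, 6, -32768, 32767);
    __m64 B = _mm_setr_pi16( 0, 4,      5,    -4);
    short4 R = cast(short4) _mm_max_pi16(A, B);
    short[4] correct = [0, 6, 5, 32767];
    assert(R.array == correct);
}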
1046 
1047 /// Compare packed single-precision (32-bit) floating-point elements in `a` and `b`, and return packed maximum values.
1048 __m128 _mm_max_ps(__m128 a, __m128 b) pure @safe
1049 {
1050     static if (GDC_with_SSE)
1051     {
1052         return __builtin_ia32_maxps(a, b);
1053     }
1054     else static if (LDC_with_SSE1)
1055     {
1056         return __builtin_ia32_maxps(a, b);
1057     }
1058     else
1059     {
        // ARM: Optimized into fcmgt + bsl since LDC 1.8 -O2
1061         __m128 r;
1062         r[0] = (a[0] > b[0]) ? a[0] : b[0];
1063         r[1] = (a[1] > b[1]) ? a[1] : b[1];
1064         r[2] = (a[2] > b[2]) ? a[2] : b[2];
1065         r[3] = (a[3] > b[3]) ? a[3] : b[3];
1066         return r;    
1067     }
1068 }
1069 unittest
1070 {
1071     __m128 A = _mm_setr_ps(1, 2, float.nan, 4);
1072     __m128 B = _mm_setr_ps(4, 1, 4, float.nan);
1073     __m128 M = _mm_max_ps(A, B);
1074     assert(M.array[0] == 4);
1075     assert(M.array[1] == 2);
1076     assert(M.array[2] == 4);    // in case of NaN, second operand prevails (as it seems)
1077     assert(M.array[3] != M.array[3]); // in case of NaN, second operand prevails (as it seems)
1078 }
1079 
1080 /// Compare packed unsigned 8-bit integers in `a` and `b`, and return packed maximum values.
1081 __m64 _mm_max_pu8 (__m64 a, __m64 b) pure @safe
1082 {
1083     return to_m64(_mm_max_epu8(to_m128i(a), to_m128i(b)));
1084 }
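// Illustrative sketch: the comparison is unsigned, so 0xFF wins over any smaller value.
unittest
{
    __m64 A = _mm_setr_pi8(0, 1, 4, cast(byte)255, 7, 8,  9, 10);
    __m64 B = _mm_setr_pi8(1, 1, 3,             2, 7, 6, 11,  9);
    byte8 R = cast(byte8) _mm_max_pu8(A, B);
    byte[8] correct = [1, 1, 4, cast(byte)255, 7, 8, 11, 10];
    assert(R.array == correct);
}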
1085 
1086 /// Compare the lower single-precision (32-bit) floating-point elements in `a` and `b`, store the maximum value in the 
/// lower element of result, and copy the upper 3 packed elements from `a` to the upper elements of result.
1088  __m128 _mm_max_ss(__m128 a, __m128 b) pure @safe
1089 {
1090     static if (GDC_with_SSE)
1091     {
1092         return __builtin_ia32_maxss(a, b);
1093     }
1094     else static if (LDC_with_SSE1)
1095     {
1096         return __builtin_ia32_maxss(a, b); 
1097     }
1098     else
1099     {  
1100         __m128 r = a;
1101         r[0] = (a[0] > b[0]) ? a[0] : b[0];
1102         return r;
1103     }
1104 }
1105 unittest
1106 {
1107     __m128 A = _mm_setr_ps(1, 2, 3, 4);
1108     __m128 B = _mm_setr_ps(4, 1, 4, 1);
1109     __m128 C = _mm_setr_ps(float.nan, 1, 4, 1);
1110     __m128 M = _mm_max_ss(A, B);
1111     assert(M.array[0] == 4);
1112     assert(M.array[1] == 2);
1113     assert(M.array[2] == 3);
1114     assert(M.array[3] == 4);
1115     M = _mm_max_ps(A, C); // in case of NaN, second operand prevails
1116     assert(M.array[0] != M.array[0]);
1117     M = _mm_max_ps(C, A); // in case of NaN, second operand prevails
1118     assert(M.array[0] == 1);
1119 }
1120 
/// Compare packed signed 16-bit integers in `a` and `b`, and return packed minimum values.
1122 __m64 _mm_min_pi16 (__m64 a, __m64 b) pure @safe
1123 {
1124     return to_m64(_mm_min_epi16(to_m128i(a), to_m128i(b)));
1125 }
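// Illustrative sketch: element-wise signed 16-bit minimum.
unittest
{
    __m64 A = _mm_setr_pi16(-1, 6, -32768, 32767);
    __m64 B = _mm_setr_pi16( 0, 4,      5,    -4);
    short4 R = cast(short4) _mm_min_pi16(A, B);
    short[4] correct = [-1, 4, -32768, -4];
    assert(R.array == correct);
}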
1126 
/// Compare packed single-precision (32-bit) floating-point elements in `a` and `b`, and return packed minimum values.
1128 __m128 _mm_min_ps(__m128 a, __m128 b) pure @safe
1129 {
1130     static if (GDC_with_SSE)
1131     {
1132         return __builtin_ia32_minps(a, b);
1133     }
1134     else static if (LDC_with_SSE1)
1135     {
1136         // not technically needed, but better perf in debug mode
1137         return __builtin_ia32_minps(a, b);
1138     }
1139     else
1140     {
        // ARM: Optimized into fcmgt + bsl since LDC 1.8 -O2
1142         __m128 r;
1143         r[0] = (a[0] < b[0]) ? a[0] : b[0];
1144         r[1] = (a[1] < b[1]) ? a[1] : b[1];
1145         r[2] = (a[2] < b[2]) ? a[2] : b[2];
1146         r[3] = (a[3] < b[3]) ? a[3] : b[3];
1147         return r;
1148     }
1149 }
1150 unittest
1151 {
1152     __m128 A = _mm_setr_ps(1, 2, float.nan, 4);
1153     __m128 B = _mm_setr_ps(4, 1, 4, float.nan);
1154     __m128 M = _mm_min_ps(A, B);
1155     assert(M.array[0] == 1);
1156     assert(M.array[1] == 1);
1157     assert(M.array[2] == 4);    // in case of NaN, second operand prevails (as it seems)
1158     assert(M.array[3] != M.array[3]); // in case of NaN, second operand prevails (as it seems)
1159 }
1160 
1161 /// Compare packed unsigned 8-bit integers in `a` and `b`, and return packed minimum values.
1162 __m64 _mm_min_pu8 (__m64 a, __m64 b) pure @safe
1163 {
1164     return to_m64(_mm_min_epu8(to_m128i(a), to_m128i(b)));
1165 }
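// Illustrative sketch: the comparison is unsigned, so 0xFF loses against any smaller value.
unittest
{
    __m64 A = _mm_setr_pi8(0, 1, 4, cast(byte)255, 7, 8,  9, 10);
    __m64 B = _mm_setr_pi8(1, 1, 3,             2, 7, 6, 11,  9);
    byte8 R = cast(byte8) _mm_min_pu8(A, B);
    byte[8] correct = [0, 1, 3, 2, 7, 6, 9, 9];
    assert(R.array == correct);
}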
1166 
1167 /// Compare the lower single-precision (32-bit) floating-point elements in `a` and `b`, store the minimum value in the 
/// lower element of result, and copy the upper 3 packed elements from `a` to the upper elements of result.
1169 __m128 _mm_min_ss(__m128 a, __m128 b) pure @safe
1170 {
1171     static if (GDC_with_SSE)
1172     {
1173         return __builtin_ia32_minss(a, b);
1174     }
1175     else static if (LDC_with_SSE1)
1176     {
1177         return __builtin_ia32_minss(a, b);
1178     }
1179     else
1180     {
1181         // Generates minss since LDC 1.3 -O1
1182         __m128 r = a;
1183         r[0] = (a[0] < b[0]) ? a[0] : b[0];
1184         return r;
1185     }
1186 }
1187 unittest
1188 {
1189     __m128 A = _mm_setr_ps(1, 2, 3, 4);
1190     __m128 B = _mm_setr_ps(4, 1, 4, 1);
1191     __m128 C = _mm_setr_ps(float.nan, 1, 4, 1);
1192     __m128 M = _mm_min_ss(A, B);
1193     assert(M.array[0] == 1);
1194     assert(M.array[1] == 2);
1195     assert(M.array[2] == 3);
1196     assert(M.array[3] == 4);
1197     M = _mm_min_ps(A, C); // in case of NaN, second operand prevails
1198     assert(M.array[0] != M.array[0]);
1199     M = _mm_min_ps(C, A); // in case of NaN, second operand prevails
1200     assert(M.array[0] == 1);
1201 }
1202 
1203 /// Move the lower single-precision (32-bit) floating-point element from `b` to the lower element of result, and copy 
1204 /// the upper 3 packed elements from `a` to the upper elements of result.
1205 __m128 _mm_move_ss (__m128 a, __m128 b) pure @trusted
1206 {
1207     a.ptr[0] = b.array[0];
1208     return a;
1209 }
1210 unittest
1211 {
1212     __m128 A = _mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f);
1213     __m128 B = _mm_setr_ps(5.0f, 6.0f, 7.0f, 8.0f);
1214     __m128 R = _mm_move_ss(A, B);
1215     float[4] correct = [5.0f, 2.0f, 3.0f, 4.0f];
1216     assert(R.array == correct);
1217 }
1218 
1219 /// Move the upper 2 single-precision (32-bit) floating-point elements from `b` to the lower 2 elements of result, and 
/// copy the upper 2 elements from `a` to the upper 2 elements of result.
1221 __m128 _mm_movehl_ps (__m128 a, __m128 b) pure @trusted
1222 {
1223     a.ptr[0] = b.array[2];
1224     a.ptr[1] = b.array[3];
1225     return a;
1226 }
1227 unittest
1228 {
1229     __m128 A = _mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f);
1230     __m128 B = _mm_setr_ps(5.0f, 6.0f, 7.0f, 8.0f);
1231     __m128 R = _mm_movehl_ps(A, B);
1232     float[4] correct = [7.0f, 8.0f, 3.0f, 4.0f];
1233     assert(R.array == correct);
1234 }
1235 
1236 /// Move the lower 2 single-precision (32-bit) floating-point elements from `b` to the upper 2 elements of result, and 
1237 /// copy the lower 2 elements from `a` to the lower 2 elements of result
1238 __m128 _mm_movelh_ps (__m128 a, __m128 b) pure @trusted
1239 {
1240     a.ptr[2] = b.array[0];
1241     a.ptr[3] = b.array[1];
1242     return a;
1243 }
1244 unittest
1245 {
1246     __m128 A = _mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f);
1247     __m128 B = _mm_setr_ps(5.0f, 6.0f, 7.0f, 8.0f);
1248     __m128 R = _mm_movelh_ps(A, B);
1249     float[4] correct = [1.0f, 2.0f, 5.0f, 6.0f];
1250     assert(R.array == correct);
1251 }
1252 
1253 /// Create mask from the most significant bit of each 8-bit element in `a`.
1254 int _mm_movemask_pi8 (__m64 a) pure @safe
1255 {
1256     return _mm_movemask_epi8(to_m128i(a));
1257 }
1258 unittest
1259 {
1260     assert(0x9C == _mm_movemask_pi8(_mm_set_pi8(-1, 0, 0, -1, -1, -1, 0, 0)));
1261 }
1262 
1263 /// Set each bit of result based on the most significant bit of the corresponding packed single-precision (32-bit) 
1264 /// floating-point element in `a`.
1265 int _mm_movemask_ps (__m128 a) pure @trusted
1266 {
1267     static if (GDC_with_SSE)
1268     {
1269         return __builtin_ia32_movmskps(a);
1270     }
1271     else static if (LDC_with_SSE1)
1272     {
1273         return __builtin_ia32_movmskps(a);
1274     }
1275     else static if (LDC_with_ARM)
1276     {
1277         int4 ai = cast(int4)a;
1278         int4 shift31 = [31, 31, 31, 31]; 
1279         ai = ai >>> shift31;
1280         int4 shift = [0, 1, 2, 3]; 
1281         ai = ai << shift; // 4-way shift, only efficient on ARM.
1282         int r = ai.array[0] + (ai.array[1]) + (ai.array[2]) + (ai.array[3]);
1283         return r;
1284     }
1285     else
1286     {
1287         int4 ai = cast(int4)a;
1288         int r = 0;
1289         if (ai.array[0] < 0) r += 1;
1290         if (ai.array[1] < 0) r += 2;
1291         if (ai.array[2] < 0) r += 4;
1292         if (ai.array[3] < 0) r += 8;
1293         return r;
1294     }
1295 }
1296 unittest
1297 {
1298     int4 A = [-1, 0, -43, 0];
1299     assert(5 == _mm_movemask_ps(cast(float4)A));
1300 }
1301 
1302 /// Multiply packed single-precision (32-bit) floating-point elements in `a` and `b`.
1303 __m128 _mm_mul_ps(__m128 a, __m128 b) pure @safe
1304 {
1305     return a * b;
1306 }
1307 unittest
1308 {
1309     __m128 a = [1.5f, -2.0f, 3.0f, 1.0f];
1310     a = _mm_mul_ps(a, a);
1311     float[4] correct = [2.25f, 4.0f, 9.0f, 1.0f];
1312     assert(a.array == correct);
1313 }
1314 
1315 /// Multiply the lower single-precision (32-bit) floating-point element in `a` and `b`, store the result in the lower 
1316 /// element of result, and copy the upper 3 packed elements from `a` to the upper elements of result.
1317 __m128 _mm_mul_ss(__m128 a, __m128 b) pure @safe
1318 {
1319     static if (GDC_with_SSE)
1320         return __builtin_ia32_mulss(a, b);
1321     else
1322     {
1323         a[0] *= b[0];
1324         return a;
1325     }
1326 }
1327 unittest
1328 {
1329     __m128 a = [1.5f, -2.0f, 3.0f, 1.0f];
1330     a = _mm_mul_ss(a, a);
1331     float[4] correct = [2.25f, -2.0f, 3.0f, 1.0f];
1332     assert(a.array == correct);
1333 }
1334 
1335 /// Multiply the packed unsigned 16-bit integers in `a` and `b`, producing intermediate 32-bit integers, 
1336 /// and return the high 16 bits of the intermediate integers.
1337 __m64 _mm_mulhi_pu16 (__m64 a, __m64 b) pure @safe
1338 {
1339     return to_m64(_mm_mulhi_epu16(to_m128i(a), to_m128i(b)));
1340 }
1341 unittest
1342 {
1343     __m64 A = _mm_setr_pi16(0, -16, 2, 3);
1344     __m64 B = _mm_set1_pi16(16384);
1345     short4 R = cast(short4)_mm_mulhi_pu16(A, B);
1346     short[4] correct = [0, 0x3FFC, 0, 0];
1347     assert(R.array == correct);
1348 }
1349 
1350 /// Compute the bitwise OR of packed single-precision (32-bit) floating-point elements in `a` and `b`, and 
1351 /// return the result.
1352 __m128 _mm_or_ps (__m128 a, __m128 b) pure @safe
1353 {
1354     return cast(__m128)(cast(__m128i)a | cast(__m128i)b);
1355 }
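// Illustrative sketch, mirroring the `_mm_and_ps` test above: OR of the raw bit patterns.
unittest
{
    float a = 4.32f;
    float b = -78.99f;
    int correct = (*cast(int*)(&a)) | (*cast(int*)(&b));
    __m128 A = _mm_set_ps(a, b, a, b);
    __m128 B = _mm_set_ps(b, a, b, a);
    int4 R = cast(int4)( _mm_or_ps(A, B) );
    assert(R.array[0] == correct);
    assert(R.array[1] == correct);
    assert(R.array[2] == correct);
    assert(R.array[3] == correct);
}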
1356 
1357 deprecated("Use _mm_avg_pu8 instead") alias _m_pavgb = _mm_avg_pu8;///
1358 deprecated("Use _mm_avg_pu16 instead") alias _m_pavgw = _mm_avg_pu16;///
1359 deprecated("Use _mm_extract_pi16 instead") alias _m_pextrw = _mm_extract_pi16;///
1360 deprecated("Use _mm_insert_pi16 instead") alias _m_pinsrw = _mm_insert_pi16;///
1361 deprecated("Use _mm_max_pi16 instead") alias _m_pmaxsw = _mm_max_pi16;///
1362 deprecated("Use _mm_max_pu8 instead") alias _m_pmaxub = _mm_max_pu8;///
1363 deprecated("Use _mm_min_pi16 instead") alias _m_pminsw = _mm_min_pi16;///
1364 deprecated("Use _mm_min_pu8 instead") alias _m_pminub = _mm_min_pu8;///
1365 deprecated("Use _mm_movemask_pi8 instead") alias _m_pmovmskb = _mm_movemask_pi8;///
1366 deprecated("Use _mm_mulhi_pu16 instead") alias _m_pmulhuw = _mm_mulhi_pu16;///
1367 
1368 enum _MM_HINT_T0  = 3; ///
1369 enum _MM_HINT_T1  = 2; ///
1370 enum _MM_HINT_T2  = 1; ///
1371 enum _MM_HINT_NTA = 0; ///
1372 
1373 
1374 version(LDC)
1375 {
1376     // Starting with LLVM 10, it seems llvm.prefetch has changed its name.
1377     // Was reported at: https://github.com/ldc-developers/ldc/issues/3397
1378     static if (__VERSION__ >= 2091) 
1379     {
1380         pragma(LDC_intrinsic, "llvm.prefetch.p0i8") // was "llvm.prefetch"
1381             void llvm_prefetch_fixed(void* ptr, uint rw, uint locality, uint cachetype) pure @safe;
1382     }
1383 }
1384 
1385 /// Fetch the line of data from memory that contains address `p` to a location in the 
/// cache hierarchy specified by the locality hint `locality`.
1387 ///
/// Warning: `locality` is a compile-time parameter, unlike in the Intel Intrinsics API.
1389 void _mm_prefetch(int locality)(const(void)* p) pure @trusted
1390 {
1391     static if (GDC_with_SSE)
1392     {
1393         return __builtin_prefetch(p, (locality & 0x4) >> 2, locality & 0x3);
1394     }
1395     else version(LDC)
1396     {
1397         static if (__VERSION__ >= 2091)
1398         {
1399             // const_cast here. `llvm_prefetch` wants a mutable pointer
1400             llvm_prefetch_fixed( cast(void*)p, 0, locality, 1);
1401         }
1402         else
1403         {
1404             // const_cast here. `llvm_prefetch` wants a mutable pointer
1405             llvm_prefetch( cast(void*)p, 0, locality, 1);
1406         }
1407     }
1408     else version(D_InlineAsm_X86_64)
1409     {
1410         static if (locality == _MM_HINT_NTA)
1411         {
1412             asm pure nothrow @nogc @trusted
1413             {
1414                 mov RAX, p;
1415                 prefetchnta [RAX];
1416             }
1417         }
1418         else static if (locality == _MM_HINT_T0)
1419         {
1420             asm pure nothrow @nogc @trusted
1421             {
1422                 mov RAX, p;
1423                 prefetcht0 [RAX];
1424             }
1425         }
1426         else static if (locality == _MM_HINT_T1)
1427         {
1428             asm pure nothrow @nogc @trusted
1429             {
1430                 mov RAX, p;
1431                 prefetcht1 [RAX];
1432             }
1433         }
1434         else static if (locality == _MM_HINT_T2)
1435         {
1436             asm pure nothrow @nogc @trusted
1437             {
1438                 mov RAX, p;
1439                 prefetcht2 [RAX];
1440             }
1441         }
1442         else
1443             assert(false); // invalid locality hint
1444     }
1445     else version(D_InlineAsm_X86)
1446     {
1447         static if (locality == _MM_HINT_NTA)
1448         {
1449             asm pure nothrow @nogc @trusted
1450             {
1451                 mov EAX, p;
1452                 prefetchnta [EAX];
1453             }
1454         }
1455         else static if (locality == _MM_HINT_T0)
1456         {
1457             asm pure nothrow @nogc @trusted
1458             {
1459                 mov EAX, p;
1460                 prefetcht0 [EAX];
1461             }
1462         }
1463         else static if (locality == _MM_HINT_T1)
1464         {
1465             asm pure nothrow @nogc @trusted
1466             {
1467                 mov EAX, p;
1468                 prefetcht1 [EAX];
1469             }
1470         }
1471         else static if (locality == _MM_HINT_T2)
1472         {
1473             asm pure nothrow @nogc @trusted
1474             {
1475                 mov EAX, p;
1476                 prefetcht2 [EAX];
1477             }
1478         }
1479         else 
1480             assert(false); // invalid locality hint
1481     }
1482     else
1483     {
1484         // Generic version: do nothing. From bitter experience, 
1485         // it's unlikely you get ANY speed-up with manual prefetching.
1486         // Prefetching or not doesn't change program behaviour.
1487     }
1488 }
1489 unittest
1490 {
1491     // From Intel documentation:
1492     // "The amount of data prefetched is also processor implementation-dependent. It will, however, be a minimum of 
1493     // 32 bytes."
1494     ubyte[256] cacheline; // though it seems it cannot generate GP fault
1495     _mm_prefetch!_MM_HINT_T0(cacheline.ptr); 
1496     _mm_prefetch!_MM_HINT_T1(cacheline.ptr); 
1497     _mm_prefetch!_MM_HINT_T2(cacheline.ptr); 
1498     _mm_prefetch!_MM_HINT_NTA(cacheline.ptr); 
1499 }
1500 
1501 deprecated("Use _mm_sad_pu8 instead") alias _m_psadbw = _mm_sad_pu8;///
1502 deprecated("Use _mm_shuffle_pi16 instead") alias _m_pshufw = _mm_shuffle_pi16;///
1503 
1504 
/// Compute the approximate reciprocal of packed single-precision (32-bit) floating-point elements in `a`, 
1506 /// and return the results. The maximum relative error for this approximation is less than 1.5*2^-12.
1507 __m128 _mm_rcp_ps (__m128 a) pure @trusted
1508 {
1509     static if (GDC_with_SSE)
1510     {
1511         return __builtin_ia32_rcpps(a);
1512     }
1513     else static if (LDC_with_SSE1)
1514     {
1515         return __builtin_ia32_rcpps(a);
1516     }
1517     else
1518     {        
1519         a.ptr[0] = 1.0f / a.array[0];
1520         a.ptr[1] = 1.0f / a.array[1];
1521         a.ptr[2] = 1.0f / a.array[2];
1522         a.ptr[3] = 1.0f / a.array[3];
1523         return a;
1524     }
1525 }
1526 unittest
1527 {
1528     __m128 A = _mm_setr_ps(2.34f, -70000.0f, 0.00001f, 345.5f);
1529     __m128 groundTruth = _mm_set1_ps(1.0f) / A;
1530     __m128 result = _mm_rcp_ps(A);
1531     foreach(i; 0..4)
1532     {
1533         double relError = (cast(double)(groundTruth.array[i]) / result.array[i]) - 1;
1534         assert(abs(relError) < 0.00037); // 1.5*2^-12 is 0.00036621093
1535     }
1536 }
1537 
1538 /// Compute the approximate reciprocal of the lower single-precision (32-bit) floating-point element in `a`, store it 
1539 /// in the lower element of the result, and copy the upper 3 packed elements from `a` to the upper elements of result. 
1540 /// The maximum relative error for this approximation is less than 1.5*2^-12.
1541 __m128 _mm_rcp_ss (__m128 a) pure @trusted
1542 {
1543     static if (GDC_with_SSE)
1544     {
1545         return __builtin_ia32_rcpss(a);
1546     }
1547     else static if (LDC_with_SSE1)
1548     {
1549         return __builtin_ia32_rcpss(a);
1550     }
1551     else
1552     {
1553         a.ptr[0] = 1.0f / a.array[0];
1554         return a;
1555     }
1556 }
1557 unittest
1558 {
1559     __m128 A = _mm_setr_ps(2.34f, -70000.0f, 0.00001f, 345.5f);
1560     __m128 correct = _mm_setr_ps(1 / 2.34f, -70000.0f, 0.00001f, 345.5f);
1561     __m128 R = _mm_rcp_ss(A);
1562     double relError = (cast(double)(correct.array[0]) / R.array[0]) - 1;
1563     assert(abs(relError) < 0.00037); // 1.5*2^-12 is 0.00036621093
1564     assert(R.array[1] == correct.array[1]);
1565     assert(R.array[2] == correct.array[2]);
1566     assert(R.array[3] == correct.array[3]);
1567 }
1568 
1569 /// Compute the approximate reciprocal square root of packed single-precision (32-bit) floating-point elements in `a`. 
1570 /// The maximum relative error for this approximation is less than 1.5*2^-12.
1571 __m128 _mm_rsqrt_ps (__m128 a) pure @trusted
1572 {
1573     static if (GDC_with_SSE)
1574     {
1575         return __builtin_ia32_rsqrtps(a);
1576     }
1577     else static if (LDC_with_SSE1)
1578     {
1579         return __builtin_ia32_rsqrtps(a);
1580     }
1581     else version(LDC)
1582     {
1583         a[0] = 1.0f / llvm_sqrt(a[0]);
1584         a[1] = 1.0f / llvm_sqrt(a[1]);
1585         a[2] = 1.0f / llvm_sqrt(a[2]);
1586         a[3] = 1.0f / llvm_sqrt(a[3]);
1587         return a;
1588     }
1589     else
1590     {
1591         a.ptr[0] = 1.0f / sqrt(a.array[0]);
1592         a.ptr[1] = 1.0f / sqrt(a.array[1]);
1593         a.ptr[2] = 1.0f / sqrt(a.array[2]);
1594         a.ptr[3] = 1.0f / sqrt(a.array[3]);
1595         return a;
1596     }
1597 }
1598 unittest
1599 {
1600     __m128 A = _mm_setr_ps(2.34f, 70000.0f, 0.00001f, 345.5f);
1601     __m128 groundTruth = _mm_setr_ps(0.65372045f, 0.00377964473f, 316.227766f, 0.05379921937f);
1602     __m128 result = _mm_rsqrt_ps(A);
1603     foreach(i; 0..4)
1604     {
1605         double relError = (cast(double)(groundTruth.array[i]) / result.array[i]) - 1;
1606         assert(abs(relError) < 0.00037); // 1.5*2^-12 is 0.00036621093
1607     }
1608 }
1609 
1610 /// Compute the approximate reciprocal square root of the lower single-precision (32-bit) floating-point element in `a`,
1611 /// store the result in the lower element. Copy the upper 3 packed elements from `a` to the upper elements of result. 
1612 /// The maximum relative error for this approximation is less than 1.5*2^-12.
1613 __m128 _mm_rsqrt_ss (__m128 a) pure @trusted
1614 {   
1615     static if (GDC_with_SSE)
1616     {
1617         return __builtin_ia32_rsqrtss(a);
1618     }
1619     else static if (LDC_with_SSE1)
1620     {
1621         return __builtin_ia32_rsqrtss(a);
1622     }
1623     else version(LDC)
1624     {
1625         a[0] = 1.0f / llvm_sqrt(a[0]);
1626         return a;
1627     }
1628     else
1629     {
1630         a[0] = 1.0f / sqrt(a[0]);
1631         return a;
1632     }
1633 }
unittest // this one tests 4 different intrinsics: _mm_rsqrt_ss, _mm_rsqrt_ps, _mm_rcp_ps, _mm_rcp_ss
1635 {
    double maxRelativeError = 0.000245; // -72 dB; in practice more precise than the documented bound
1637     void testApproximateSSE(float number) nothrow @nogc
1638     {
1639         __m128 A = _mm_set1_ps(number);
1640 
1641         // test _mm_rcp_ps
1642         __m128 B = _mm_rcp_ps(A);
1643         foreach(i; 0..4)
1644         {
1645             double exact = 1.0f / A.array[i];
1646             double ratio = cast(double)(B.array[i]) / cast(double)(exact);
1647             assert(abs(ratio - 1) <= maxRelativeError);
1648         }
1649 
1650         // test _mm_rcp_ss
1651         {
1652             B = _mm_rcp_ss(A);
1653             double exact = 1.0f / A.array[0];
1654             double ratio = cast(double)(B.array[0]) / cast(double)(exact);
1655             assert(abs(ratio - 1) <= maxRelativeError);
1656         }
1657 
1658         // test _mm_rsqrt_ps
1659         B = _mm_rsqrt_ps(A);
1660         foreach(i; 0..4)
1661         {
1662             double exact = 1.0f / sqrt(A.array[i]);
1663             double ratio = cast(double)(B.array[i]) / cast(double)(exact);
1664             assert(abs(ratio - 1) <= maxRelativeError);
1665         }
1666 
1667         // test _mm_rsqrt_ss
1668         {
1669             B = _mm_rsqrt_ss(A);
1670             double exact = 1.0f / sqrt(A.array[0]);
1671             double ratio = cast(double)(B.array[0]) / cast(double)(exact);
1672             assert(abs(ratio - 1) <= maxRelativeError);
1673         }
1674     }
1675 
1676     testApproximateSSE(0.00001f);
1677     testApproximateSSE(1.1f);
1678     testApproximateSSE(345.0f);
1679     testApproximateSSE(2.45674864151f);
1680     testApproximateSSE(700000.0f);
1681     testApproximateSSE(10000000.0f);
1682     testApproximateSSE(27841456468.0f);
1683 }
1684 
1685 /// Compute the absolute differences of packed unsigned 8-bit integers in `a` and `b`, then horizontally sum each 
1686 /// consecutive 8 differences to produce four unsigned 16-bit integers, and pack these unsigned 16-bit integers in the 
1687 /// low 16 bits of result.
1688 __m64 _mm_sad_pu8 (__m64 a, __m64 b) pure @safe
1689 {
1690     return to_m64(_mm_sad_epu8(to_m128i(a), to_m128i(b)));
1691 }
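// A usage sketch for `_mm_sad_pu8`, assuming `_mm_setr_pi8` from inteli.mmx is available;
// the expected value is simply the hand-computed sum of absolute differences.
unittest
{
    __m64 A = _mm_setr_pi8(2, 4, 6, 8, 10, 12, 14, 16);
    __m64 B = _mm_setr_pi8(1, 5, 5, 9, 9, 13, 13, 17);
    // Each pair differs by 1, so the sum of absolute differences is 8,
    // stored in the low 16 bits of the result.
    short4 R = cast(short4) _mm_sad_pu8(A, B);
    assert(R.array[0] == 8);
    assert(R.array[1] == 0);
}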
1692 
1693 /// Set the exception mask bits of the MXCSR control and status register to the value in unsigned 32-bit integer 
1694 /// `_MM_MASK_xxxx`. The exception mask may contain any of the following flags: `_MM_MASK_INVALID`, `_MM_MASK_DIV_ZERO`,
1695 /// `_MM_MASK_DENORM`, `_MM_MASK_OVERFLOW`, `_MM_MASK_UNDERFLOW`, `_MM_MASK_INEXACT`.
1696 void _MM_SET_EXCEPTION_MASK(int _MM_MASK_xxxx) @safe
1697 {
1698     // Note: unsupported on ARM
1699     _mm_setcsr((_mm_getcsr() & ~_MM_MASK_MASK) | _MM_MASK_xxxx);
1700 }
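// A usage sketch: read-modify-write the exception mask, then restore it.
// Assumes `_MM_GET_EXCEPTION_MASK` is defined earlier in this module; guarded to
// x86/x86_64 since the mask bits are not emulated on ARM.
unittest
{
    version(X86)         enum bool canTestMXCSRMask = true;
    else version(X86_64) enum bool canTestMXCSRMask = true;
    else                 enum bool canTestMXCSRMask = false;

    static if (canTestMXCSRMask)
    {
        int saved = _MM_GET_EXCEPTION_MASK();
        _MM_SET_EXCEPTION_MASK(saved | _MM_MASK_UNDERFLOW | _MM_MASK_DENORM);
        assert(_MM_GET_EXCEPTION_MASK() & _MM_MASK_UNDERFLOW);
        _MM_SET_EXCEPTION_MASK(saved); // restore the previous mask
    }
}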
1701 
1702 /// Set the exception state bits of the MXCSR control and status register to the value in unsigned 32-bit integer 
1703 /// `_MM_EXCEPT_xxxx`. The exception state may contain any of the following flags: `_MM_EXCEPT_INVALID`, 
1704 /// `_MM_EXCEPT_DIV_ZERO`, `_MM_EXCEPT_DENORM`, `_MM_EXCEPT_OVERFLOW`, `_MM_EXCEPT_UNDERFLOW`, `_MM_EXCEPT_INEXACT`.
1705 void _MM_SET_EXCEPTION_STATE(int _MM_EXCEPT_xxxx) @safe
1706 {
1707     // Note: unsupported on ARM
1708     _mm_setcsr((_mm_getcsr() & ~_MM_EXCEPT_MASK) | _MM_EXCEPT_xxxx);
1709 }
1710 
1711 /// Set the flush zero bits of the MXCSR control and status register to the value in unsigned 32-bit integer 
1712 /// `_MM_FLUSH_xxxx`. The flush zero may contain any of the following flags: `_MM_FLUSH_ZERO_ON` or `_MM_FLUSH_ZERO_OFF`.
1713 void _MM_SET_FLUSH_ZERO_MODE(int _MM_FLUSH_xxxx) @safe
1714 {
1715     _mm_setcsr((_mm_getcsr() & ~_MM_FLUSH_ZERO_MASK) | _MM_FLUSH_xxxx);
1716 }
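// A usage sketch: enable flush-to-zero, check it took effect, then restore the
// previous mode. Assumes `_MM_GET_FLUSH_ZERO_MODE` is defined earlier in this module.
unittest
{
    int saved = _MM_GET_FLUSH_ZERO_MODE();
    _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON);
    assert(_MM_GET_FLUSH_ZERO_MODE() == _MM_FLUSH_ZERO_ON);
    _MM_SET_FLUSH_ZERO_MODE(saved); // restore the previous mode
}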
1717 
1718 /// Set packed single-precision (32-bit) floating-point elements with the supplied values.
1719 __m128 _mm_set_ps (float e3, float e2, float e1, float e0) pure @trusted
1720 {
1721     // Note: despite appearances, generates sensible code,
1722     //       inlines correctly and is constant folded
1723     float[4] result = [e0, e1, e2, e3];
1724     return loadUnaligned!(float4)(result.ptr);
1725 }
1726 unittest
1727 {
1728     __m128 A = _mm_set_ps(3, 2, 1, 546);
1729     float[4] correct = [546.0f, 1.0f, 2.0f, 3.0f];
1730     assert(A.array == correct);
1731 }
1732 
1733 deprecated("Use _mm_set1_ps instead") alias _mm_set_ps1 = _mm_set1_ps; ///
1734 
1735 /// Set the rounding mode bits of the MXCSR control and status register to the value in unsigned 32-bit integer 
1736 /// `_MM_ROUND_xxxx`. The rounding mode may contain any of the following flags: `_MM_ROUND_NEAREST`, `_MM_ROUND_DOWN`, 
1737 /// `_MM_ROUND_UP`, `_MM_ROUND_TOWARD_ZERO`.
1738 void _MM_SET_ROUNDING_MODE(int _MM_ROUND_xxxx) @safe
1739 {
1740     _mm_setcsr((_mm_getcsr() & ~_MM_ROUND_MASK) | _MM_ROUND_xxxx);
1741 }
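// A usage sketch: switch to round-toward-zero, then restore the saved rounding mode.
// Assumes `_MM_GET_ROUNDING_MODE` is defined earlier in this module.
unittest
{
    int saved = _MM_GET_ROUNDING_MODE();
    _MM_SET_ROUNDING_MODE(_MM_ROUND_TOWARD_ZERO);
    assert(_MM_GET_ROUNDING_MODE() == _MM_ROUND_TOWARD_ZERO);
    _MM_SET_ROUNDING_MODE(saved); // leave the FPU state as we found it
}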
1742 
1743 /// Copy single-precision (32-bit) floating-point element `a` to the lower element of result, and zero the upper 3 elements.
1744 __m128 _mm_set_ss (float a) pure @trusted
1745 {
1746     __m128 r = _mm_setzero_ps();
1747     r.ptr[0] = a;
1748     return r;
1749 }
1750 unittest
1751 {
1752     float[4] correct = [42.0f, 0.0f, 0.0f, 0.0f];
1753     __m128 A = _mm_set_ss(42.0f);
1754     assert(A.array == correct);
1755 }
1756 
1757 /// Broadcast single-precision (32-bit) floating-point value `a` to all elements.
1758 __m128 _mm_set1_ps (float a) pure @trusted
1759 {
1760     __m128 r = void;
1761     r.ptr[0] = a;
1762     r.ptr[1] = a;
1763     r.ptr[2] = a;
1764     r.ptr[3] = a;
1765     return r;
1766 }
1767 unittest
1768 {
1769     float[4] correct = [42.0f, 42.0f, 42.0f, 42.0f];
1770     __m128 A = _mm_set1_ps(42.0f);
1771     assert(A.array == correct);
1772 }
1773 
1774 /// Set the MXCSR control and status register with the value in unsigned 32-bit integer `controlWord`.
1775 void _mm_setcsr(uint controlWord) @trusted
1776 {
1777     static if (LDC_with_ARM)
1778     {
1779         // Convert from SSE to ARM control word. This is done _partially_
        // and only supports rounding-mode and flush-to-zero changes.
1781 
1782         // "To alter some bits of a VFP system register without 
1783         // affecting other bits, use a read-modify-write procedure"
1784         uint fpscr = arm_get_fpcr();
1785         
        // Bits 23 to 22 hold the rounding mode (not used by NEON itself)
1787         fpscr = fpscr & ~_MM_ROUND_MASK_ARM;
1788         switch(controlWord & _MM_ROUND_MASK)
1789         {
1790             default:
1791             case _MM_ROUND_NEAREST:     fpscr |= _MM_ROUND_NEAREST_ARM;     break;
1792             case _MM_ROUND_DOWN:        fpscr |= _MM_ROUND_DOWN_ARM;        break;
1793             case _MM_ROUND_UP:          fpscr |= _MM_ROUND_UP_ARM;          break;
1794             case _MM_ROUND_TOWARD_ZERO: fpscr |= _MM_ROUND_TOWARD_ZERO_ARM; break;
1795         }
1796         fpscr = fpscr & ~_MM_FLUSH_ZERO_MASK_ARM;
1797         if (controlWord & _MM_FLUSH_ZERO_MASK)
1798             fpscr |= _MM_FLUSH_ZERO_MASK_ARM;
1799         arm_set_fpcr(fpscr);
1800     }
1801     else version(GNU)
1802     {
1803         static if (GDC_with_SSE)
1804         {
1805             __builtin_ia32_ldmxcsr(controlWord);
1806         }
1807         else version(X86)
1808         {
1809             asm pure nothrow @nogc @trusted
1810             {
1811                 "ldmxcsr %0;\n" 
1812                   : 
1813                   : "m" (controlWord)
1814                   : ;
1815             }
1816         }
1817         else
1818             static assert(false);
1819     }
1820     else version (InlineX86Asm)
1821     {
1822         asm pure nothrow @nogc @safe
1823         {
1824             ldmxcsr controlWord;
1825         }
1826     }
1827     else
1828         static assert(0, "Not yet supported");
1829 }
1830 unittest
1831 {
1832     _mm_setcsr(_mm_getcsr());
1833 }
1834 
1835 /// Set packed single-precision (32-bit) floating-point elements with the supplied values in reverse order.
1836 __m128 _mm_setr_ps (float e3, float e2, float e1, float e0) pure @trusted
1837 {
1838     float[4] result = [e3, e2, e1, e0];
1839     return loadUnaligned!(float4)(result.ptr);
1840 }
1841 unittest
1842 {
1843     __m128 A = _mm_setr_ps(3, 2, 1, 546);
1844     float[4] correct = [3.0f, 2.0f, 1.0f, 546.0f];
1845     assert(A.array == correct);
1846     assert(A.array[0] == 3.0f);
1847     assert(A.array[1] == 2.0f);
1848     assert(A.array[2] == 1.0f);
1849     assert(A.array[3] == 546.0f);
1850 }
1851 
1852 /// Return vector of type `__m128` with all elements set to zero.
1853 __m128 _mm_setzero_ps() pure @trusted
1854 {
1855     // Compiles to xorps without problems
1856     float[4] result = [0.0f, 0.0f, 0.0f, 0.0f];
1857     return loadUnaligned!(float4)(result.ptr);
1858 }
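// A quick check that all four lanes are zeroed.
unittest
{
    __m128 Z = _mm_setzero_ps();
    float[4] correct = [0.0f, 0.0f, 0.0f, 0.0f];
    assert(Z.array == correct);
}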
1859 
/// Perform a serializing operation on all store-to-memory instructions that were issued prior 
/// to this instruction. Guarantees that every store instruction that precedes the fence in 
/// program order is globally visible before any store instruction that follows it.
1863 void _mm_sfence() @trusted
1864 {
1865     version(GNU)
1866     {
1867         static if (GDC_with_SSE)
1868         {
1869             __builtin_ia32_sfence();
1870         }
1871         else version(X86)
1872         {
1873             asm pure nothrow @nogc @trusted
1874             {
1875                 "sfence;\n" : : : ;
1876             }
1877         }
1878         else
1879             static assert(false);
1880     }
1881     else static if (LDC_with_SSE1)
1882     {
1883         __builtin_ia32_sfence();
1884     }
1885     else static if (DMD_with_asm)
1886     {
1887         asm nothrow @nogc pure @safe
1888         {
1889             sfence;
1890         }
1891     }
1892     else version(LDC)
1893     {
1894         llvm_memory_fence(); // PERF: this generates mfence instead of sfence
1895     }
1896     else
1897         static assert(false);
1898 }
1899 unittest
1900 {
1901     _mm_sfence();
1902 }
1903 
/// Shuffle 16-bit integers in `a` using the control in `imm8`.
/// Warning: the immediate shuffle value `imm8` is given at compile-time instead of runtime.
1905 __m64 _mm_shuffle_pi16(int imm8)(__m64 a) pure @safe
1906 {
1907     return cast(__m64) shufflevector!(short4, ( (imm8 >> 0) & 3 ),
1908                                               ( (imm8 >> 2) & 3 ),
1909                                               ( (imm8 >> 4) & 3 ),
1910                                               ( (imm8 >> 6) & 3 ))(cast(short4)a, cast(short4)a);
1911 }
1912 unittest
1913 {
1914     __m64 A = _mm_setr_pi16(0, 1, 2, 3);
1915     enum int SHUFFLE = _MM_SHUFFLE(0, 1, 2, 3);
1916     short4 B = cast(short4) _mm_shuffle_pi16!SHUFFLE(A);
1917     short[4] expectedB = [ 3, 2, 1, 0 ];
1918     assert(B.array == expectedB);
1919 }
1920 
/// Shuffle single-precision (32-bit) floating-point elements in `a` and `b` using the control in `imm`.
/// The two lowest elements of the result come from `a`, the two highest from `b`.
/// Warning: the immediate shuffle value `imm` is given at compile-time instead of runtime.
1922 __m128 _mm_shuffle_ps(ubyte imm)(__m128 a, __m128 b) pure @safe
1923 {
1924     return shufflevector!(__m128, imm & 3, (imm>>2) & 3, 4 + ((imm>>4) & 3), 4 + ((imm>>6) & 3) )(a, b);
1925 }
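// A usage sketch for `_mm_shuffle_ps`: with _MM_SHUFFLE(2, 3, 0, 1) the low half
// picks a[1], a[0] and the high half picks b[3], b[2]. Uses `_MM_SHUFFLE` as
// already done for `_mm_shuffle_pi16` above.
unittest
{
    __m128 A = _mm_setr_ps(0.0f, 1.0f, 2.0f, 3.0f);
    __m128 B = _mm_setr_ps(4.0f, 5.0f, 6.0f, 7.0f);
    enum ubyte SHUF = _MM_SHUFFLE(2, 3, 0, 1);
    __m128 R = _mm_shuffle_ps!SHUF(A, B);
    float[4] correct = [1.0f, 0.0f, 7.0f, 6.0f];
    assert(R.array == correct);
}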
1926 
1927 /// Compute the square root of packed single-precision (32-bit) floating-point elements in `a`.
1928 __m128 _mm_sqrt_ps(__m128 a) @trusted
1929 {
1930     static if (GDC_with_SSE)
1931     {
1932         return __builtin_ia32_sqrtps(a);
1933     }
1934     else version(LDC)
1935     {
1936         // Disappeared with LDC 1.11
1937         static if (__VERSION__ < 2081)
1938             return __builtin_ia32_sqrtps(a);
1939         else
1940         {
1941             a[0] = llvm_sqrt(a[0]);
1942             a[1] = llvm_sqrt(a[1]);
1943             a[2] = llvm_sqrt(a[2]);
1944             a[3] = llvm_sqrt(a[3]);
1945             return a;
1946         }
1947     }
1948     else
1949     {
1950         a.ptr[0] = sqrt(a.array[0]);
1951         a.ptr[1] = sqrt(a.array[1]);
1952         a.ptr[2] = sqrt(a.array[2]);
1953         a.ptr[3] = sqrt(a.array[3]);
1954         return a;
1955     }
1956 }
1957 unittest
1958 {
1959     __m128 A = _mm_sqrt_ps(_mm_set1_ps(4.0f));
1960     assert(A.array[0] == 2.0f);
1961     assert(A.array[1] == 2.0f);
1962     assert(A.array[2] == 2.0f);
1963     assert(A.array[3] == 2.0f);
1964 }
1965 
1966 /// Compute the square root of the lower single-precision (32-bit) floating-point element in `a`, store it in the lower
1967 /// element, and copy the upper 3 packed elements from `a` to the upper elements of result.
1968 __m128 _mm_sqrt_ss(__m128 a) @trusted
1969 {
1970     static if (GDC_with_SSE)
1971     {
1972         return __builtin_ia32_sqrtss(a);
1973     }
1974     else version(LDC)
1975     {
1976         a.ptr[0] = llvm_sqrt(a.array[0]);
1977         return a;
1978     }
1979     else
1980     {   
1981         a.ptr[0] = sqrt(a.array[0]);
1982         return a;
1983     }
1984 }
1985 unittest
1986 {
1987     __m128 A = _mm_sqrt_ss(_mm_set1_ps(4.0f));
1988     assert(A.array[0] == 2.0f);
1989     assert(A.array[1] == 4.0f);
1990     assert(A.array[2] == 4.0f);
1991     assert(A.array[3] == 4.0f);
1992 }
1993 
1994 /// Store 128-bits (composed of 4 packed single-precision (32-bit) floating-point elements) from `a` into memory. 
1995 /// `mem_addr` must be aligned on a 16-byte boundary or a general-protection exception may be generated.
1996 void _mm_store_ps (float* mem_addr, __m128 a) pure
1997 {
1998     __m128* aligned = cast(__m128*)mem_addr;
1999     *aligned = a;
2000 }
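// An aligned-store check, mirroring the other store unittests in this file.
unittest
{
    align(16) float[4] A;
    _mm_store_ps(A.ptr, _mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f));
    float[4] correct = [1.0f, 2.0f, 3.0f, 4.0f];
    assert(A == correct);
}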
2001 
2002 deprecated("Use _mm_store1_ps instead") alias _mm_store_ps1 = _mm_store1_ps; ///
2003 
2004 /// Store the lower single-precision (32-bit) floating-point element from `a` into memory. 
2005 /// `mem_addr` does not need to be aligned on any particular boundary.
2006 void _mm_store_ss (float* mem_addr, __m128 a) pure @safe
2007 {
2008     *mem_addr = a.array[0];
2009 }
2010 unittest
2011 {
2012     float a;
2013     _mm_store_ss(&a, _mm_set_ps(3, 2, 1, 546));
2014     assert(a == 546);
2015 }
2016 
2017 /// Store the lower single-precision (32-bit) floating-point element from `a` into 4 contiguous elements in memory. 
2018 /// `mem_addr` must be aligned on a 16-byte boundary or a general-protection exception may be generated.
2019 void _mm_store1_ps(float* mem_addr, __m128 a) pure @trusted // TODO: shouldn't be trusted
2020 {
2021     __m128* aligned = cast(__m128*)mem_addr;
2022     __m128 r;
2023     r.ptr[0] = a.array[0];
2024     r.ptr[1] = a.array[0];
2025     r.ptr[2] = a.array[0];
2026     r.ptr[3] = a.array[0];
2027     *aligned = r;
2028 }
2029 unittest
2030 {
2031     align(16) float[4] A;
2032     _mm_store1_ps(A.ptr, _mm_set_ss(42.0f));
2033     float[4] correct = [42.0f, 42, 42, 42];
2034     assert(A == correct);
2035 }
2036 
2037 /// Store the upper 2 single-precision (32-bit) floating-point elements from `a` into memory.
2038 void _mm_storeh_pi(__m64* p, __m128 a) pure @trusted
2039 {
2040     long2 la = cast(long2)a;
2041     (*p).ptr[0] = la.array[1];
2042 }
2043 unittest
2044 {
2045     __m64 R = _mm_setzero_si64();
2046     long2 A = [13, 25];
2047     _mm_storeh_pi(&R, cast(__m128)A);
2048     assert(R.array[0] == 25);
2049 }
2050 
2051 /// Store the lower 2 single-precision (32-bit) floating-point elements from `a` into memory.
2052 void _mm_storel_pi(__m64* p, __m128 a) pure @trusted
2053 {
2054     long2 la = cast(long2)a;
2055     (*p).ptr[0] = la.array[0];
2056 }
2057 unittest
2058 {
2059     __m64 R = _mm_setzero_si64();
2060     long2 A = [13, 25];
2061     _mm_storel_pi(&R, cast(__m128)A);
2062     assert(R.array[0] == 13);
2063 }
2064 
2065 /// Store 4 single-precision (32-bit) floating-point elements from `a` into memory in reverse order. 
2066 /// `mem_addr` must be aligned on a 16-byte boundary or a general-protection exception may be generated.
2067 void _mm_storer_ps(float* mem_addr, __m128 a) pure @trusted // TODO should not be trusted
2068 {
2069     __m128* aligned = cast(__m128*)mem_addr;
2070     __m128 r;
2071     r.ptr[0] = a.array[3];
2072     r.ptr[1] = a.array[2];
2073     r.ptr[2] = a.array[1];
2074     r.ptr[3] = a.array[0];
2075     *aligned = r;
2076 }
2077 unittest
2078 {
2079     align(16) float[4] A;
2080     _mm_storer_ps(A.ptr, _mm_setr_ps(1.0f, 2, 3, 4));
2081     float[4] correct = [4.0f, 3.0f, 2.0f, 1.0f];
2082     assert(A == correct);
2083 }
2084 
2085 /// Store 128-bits (composed of 4 packed single-precision (32-bit) floating-point elements) from `a` into memory. 
2086 /// `mem_addr` does not need to be aligned on any particular boundary.
2087 void _mm_storeu_ps(float* mem_addr, __m128 a) pure @safe // TODO should not be trusted
2088 {
2089     storeUnaligned!(float4)(a, mem_addr);
2090 }
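// An unaligned-store check: the destination is deliberately offset by one float.
unittest
{
    float[5] buffer = 0.0f;
    _mm_storeu_ps(buffer.ptr + 1, _mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f));
    float[5] correct = [0.0f, 1.0f, 2.0f, 3.0f, 4.0f];
    assert(buffer == correct);
}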
2091 
2092 /// Store 64-bits of integer data from `a` into memory using a non-temporal memory hint.
2093 void _mm_stream_pi (__m64* mem_addr, __m64 a)
2094 {
2095     // BUG see `_mm_stream_ps` for an explanation why we don't implement non-temporal moves
2096     *mem_addr = a; // it's a regular move instead
2097 }
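// Since the non-temporal hint is not implemented, this only checks that the store happens.
unittest
{
    __m64 A = _mm_setr_pi16(1, 2, 3, 4);
    __m64 R = _mm_setzero_si64();
    _mm_stream_pi(&R, A);
    assert(R.array[0] == A.array[0]);
}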
2098 
/// Store 128-bits (composed of 4 packed single-precision (32-bit) floating-point elements) from `a` into memory using
/// a non-temporal memory hint. `mem_addr` must be aligned on a 16-byte boundary or a general-protection exception may be
2101 /// generated.
2102 void _mm_stream_ps (float* mem_addr, __m128 a)
2103 {
2104     // BUG: can't implement non-temporal store with LDC inlineIR since !nontemporal
2105     // needs some IR outside this function that would say:
2106     //
2107     //  !0 = !{ i32 1 }
2108     //
2109     // It's a LLVM IR metadata description.
2110     __m128* dest = cast(__m128*)mem_addr;
2111     *dest = a; // it's a regular move instead
2112 }
2113 unittest
2114 {
2115     align(16) float[4] A;
2116     _mm_stream_ps(A.ptr, _mm_set1_ps(78.0f));
2117     assert(A[0] == 78.0f && A[1] == 78.0f && A[2] == 78.0f && A[3] == 78.0f);
2118 }
2119 
2120 /// Subtract packed single-precision (32-bit) floating-point elements in `b` from packed single-precision (32-bit) 
2121 /// floating-point elements in `a`.
2122 __m128 _mm_sub_ps(__m128 a, __m128 b) pure @safe
2123 {
2124     return a - b;
2125 }
2126 unittest
2127 {
2128     __m128 a = [1.5f, -2.0f, 3.0f, 1.0f];
2129     a = _mm_sub_ps(a, a);
2130     float[4] correct = [0.0f, 0.0f, 0.0f, 0.0f];
2131     assert(a.array == correct);
2132 }
2133 
2134 /// Subtract the lower single-precision (32-bit) floating-point element in `b` from the lower single-precision (32-bit)
/// floating-point element in `a`, store the subtraction result in the lower element of result, and copy the upper 3 
/// packed elements from `a` to the upper elements of result.
2137 __m128 _mm_sub_ss(__m128 a, __m128 b) pure @safe
2138 {
2139     static if (GDC_with_SSE)
2140         return __builtin_ia32_subss(a, b);
2141     else
2142     {
2143         a[0] -= b[0];
2144         return a;
2145     }
2146 }
2147 unittest
2148 {
2149     __m128 a = [1.5f, -2.0f, 3.0f, 1.0f];
2150     a = _mm_sub_ss(a, a);
2151     float[4] correct = [0.0f, -2.0, 3.0f, 1.0f];
2152     assert(a.array == correct);
2153 }
2154 
2155 /// Transpose the 4x4 matrix formed by the 4 rows of single-precision (32-bit) floating-point elements in row0, row1, 
2156 /// row2, and row3, and store the transposed matrix in these vectors (row0 now contains column 0, etc.).
2157 void _MM_TRANSPOSE4_PS (ref __m128 row0, ref __m128 row1, ref __m128 row2, ref __m128 row3) pure @safe
2158 {
2159     __m128 tmp3, tmp2, tmp1, tmp0;
2160     tmp0 = _mm_unpacklo_ps(row0, row1);
2161     tmp2 = _mm_unpacklo_ps(row2, row3);
2162     tmp1 = _mm_unpackhi_ps(row0, row1);
2163     tmp3 = _mm_unpackhi_ps(row2, row3);
2164     row0 = _mm_movelh_ps(tmp0, tmp2);
2165     row1 = _mm_movehl_ps(tmp2, tmp0);
2166     row2 = _mm_movelh_ps(tmp1, tmp3);
2167     row3 = _mm_movehl_ps(tmp3, tmp1);
2168 }
2169 unittest
2170 {
2171     __m128 l0 = _mm_setr_ps(0, 1, 2, 3);
2172     __m128 l1 = _mm_setr_ps(4, 5, 6, 7);
2173     __m128 l2 = _mm_setr_ps(8, 9, 10, 11);
2174     __m128 l3 = _mm_setr_ps(12, 13, 14, 15);
2175     _MM_TRANSPOSE4_PS(l0, l1, l2, l3);
2176     float[4] r0 = [0.0f, 4, 8, 12];
2177     float[4] r1 = [1.0f, 5, 9, 13];
2178     float[4] r2 = [2.0f, 6, 10, 14];
2179     float[4] r3 = [3.0f, 7, 11, 15];
2180     assert(l0.array == r0);
2181     assert(l1.array == r1);
2182     assert(l2.array == r2);
2183     assert(l3.array == r3);
2184 }
2185 
// Note: the only difference between the `ucomi` and `comi` intrinsics is whether a quiet NaN
//       raises the invalid-operation exception. Aliasing them is technically incorrect, but the
//       case where you would want to differentiate between qNaN and sNaN and then treat them
//       differently on purpose seems extremely rare.
2190 alias _mm_ucomieq_ss = _mm_comieq_ss;
2191 alias _mm_ucomige_ss = _mm_comige_ss;
2192 alias _mm_ucomigt_ss = _mm_comigt_ss;
2193 alias _mm_ucomile_ss = _mm_comile_ss;
2194 alias _mm_ucomilt_ss = _mm_comilt_ss;
2195 alias _mm_ucomineq_ss = _mm_comineq_ss;
2196 
2197 /// Return vector of type `__m128` with undefined elements.
2198 __m128 _mm_undefined_ps() pure @safe
2199 {
2200     __m128 undef = void;
2201     return undef;
2202 }
2203 
/// Unpack and interleave single-precision (32-bit) floating-point elements from the high half of `a` and `b`.
2205 __m128 _mm_unpackhi_ps (__m128 a, __m128 b) pure @trusted
2206 {
2207     version(LDC)
2208     {
        // x86: the plain version generates unpckhps with LDC 1.0.0 at -O1, but shufflevector uses 8 fewer instructions at -O0
2210         return shufflevectorLDC!(__m128, 2, 6, 3, 7)(a, b);
2211     }
2212     else
2213     {
2214         __m128 r;
2215         r.ptr[0] = a.array[2];
2216         r.ptr[1] = b.array[2];
2217         r.ptr[2] = a.array[3];
2218         r.ptr[3] = b.array[3];
2219         return r;
2220     }
2221 }
2222 unittest
2223 {
2224     __m128 A = _mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f);
2225     __m128 B = _mm_setr_ps(5.0f, 6.0f, 7.0f, 8.0f);
2226     __m128 R = _mm_unpackhi_ps(A, B);
2227     float[4] correct = [3.0f, 7.0f, 4.0f, 8.0f];
2228     assert(R.array == correct);
2229 }
2230 
2231 /// Unpack and interleave single-precision (32-bit) floating-point elements from the low half of `a` and `b`.
2232 __m128 _mm_unpacklo_ps (__m128 a, __m128 b) pure @trusted
2233 {
2234     version(LDC)
2235     {
        // x86: the plain version generates unpcklps with LDC 1.0.0 at -O1, but shufflevector uses 8 fewer instructions at -O0
2237         return shufflevectorLDC!(__m128, 0, 4, 1, 5)(a, b);
2238     }
2239     else
2240     {
2241         __m128 r;
2242         r.ptr[0] = a.array[0];
2243         r.ptr[1] = b.array[0];
2244         r.ptr[2] = a.array[1];
2245         r.ptr[3] = b.array[1];
2246         return r;
2247     }
2248 }
2249 unittest
2250 {
2251     __m128 A = _mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f);
2252     __m128 B = _mm_setr_ps(5.0f, 6.0f, 7.0f, 8.0f);
2253     __m128 R = _mm_unpacklo_ps(A, B);
2254     float[4] correct = [1.0f, 5.0f, 2.0f, 6.0f];
2255     assert(R.array == correct);
2256 }
2257 
2258 /// Compute the bitwise XOR of packed single-precision (32-bit) floating-point elements in `a` and `b`.
2259 __m128 _mm_xor_ps (__m128 a, __m128 b) pure @safe
2260 {
2261     return cast(__m128)(cast(__m128i)a ^ cast(__m128i)b);
2262 }
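// A usage sketch: XOR-ing with the sign-bit mask negates every lane, a common use of
// `_mm_xor_ps`. Assumes `_mm_set1_epi32` from inteli.emmintrin (imported by this module).
unittest
{
    __m128 A = _mm_setr_ps(1.0f, -2.0f, 3.0f, -4.0f);
    __m128 signBits = cast(__m128) _mm_set1_epi32(cast(int)0x80000000);
    __m128 R = _mm_xor_ps(A, signBits); // flips the sign of every lane
    float[4] correct = [-1.0f, 2.0f, -3.0f, 4.0f];
    assert(R.array == correct);
}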
2263 
2264 
2265 private
2266 {
2267     // Returns: `true` if the pointer is suitably aligned.
2268     bool isPointerAligned(void* p, size_t alignment) pure
2269     {
2270         assert(alignment != 0);
2271         return ( cast(size_t)p & (alignment - 1) ) == 0;
2272     }
2273 
2274     // Returns: next pointer aligned with alignment bytes.
2275     void* nextAlignedPointer(void* start, size_t alignment) pure
2276     {
2277         return cast(void*)nextMultipleOf(cast(size_t)(start), alignment);
2278     }
2279 
2280     // Returns number of bytes to actually allocate when asking
2281     // for a particular alignment
2282     @nogc size_t requestedSize(size_t askedSize, size_t alignment) pure
2283     {
2284         enum size_t pointerSize = size_t.sizeof;
2285         return askedSize + alignment - 1 + pointerSize * 3;
2286     }
2287 
    // Store the pointer given by malloc, plus size and alignment
2289     @nogc void* storeRawPointerPlusInfo(void* raw, size_t size, size_t alignment) pure
2290     {
2291         enum size_t pointerSize = size_t.sizeof;
2292         char* start = cast(char*)raw + pointerSize * 3;
2293         void* aligned = nextAlignedPointer(start, alignment);
2294         void** rawLocation = cast(void**)(cast(char*)aligned - pointerSize);
2295         *rawLocation = raw;
2296         size_t* sizeLocation = cast(size_t*)(cast(char*)aligned - 2 * pointerSize);
2297         *sizeLocation = size;
2298         size_t* alignmentLocation = cast(size_t*)(cast(char*)aligned - 3 * pointerSize);
2299         *alignmentLocation = alignment;
2300         assert( isPointerAligned(aligned, alignment) );
2301         return aligned;
2302     }
2303 
2304     // Returns: x, multiple of powerOfTwo, so that x >= n.
2305     @nogc size_t nextMultipleOf(size_t n, size_t powerOfTwo) pure nothrow
2306     {
2307         // check power-of-two
2308         assert( (powerOfTwo != 0) && ((powerOfTwo & (powerOfTwo - 1)) == 0));
2309 
2310         size_t mask = ~(powerOfTwo - 1);
2311         return (n + powerOfTwo - 1) & mask;
2312     }
2313 }
2314 
2315 unittest
2316 {
2317     assert(nextMultipleOf(0, 4) == 0);
2318     assert(nextMultipleOf(1, 4) == 4);
2319     assert(nextMultipleOf(2, 4) == 4);
2320     assert(nextMultipleOf(3, 4) == 4);
2321     assert(nextMultipleOf(4, 4) == 4);
2322     assert(nextMultipleOf(5, 4) == 8);
2323 
2324     {
2325         void* p = _mm_malloc(23, 16);
2326         assert(p !is null);
2327         assert(((cast(size_t)p) & 0xf) == 0);
2328         _mm_free(p);
2329     }
2330 
2331     void* nullAlloc = _mm_malloc(0, 32);
2332     assert(nullAlloc != null);
2333     _mm_free(nullAlloc);
2334 }
2335 
2336 // For some reason, order of declaration is important for this one
2337 // so it is misplaced.
// Note: this is just another name for _mm_cvtss_si32
2339 alias _mm_cvt_ss2si = _mm_cvtss_si32;