1 /**
2 * SSE intrinsics.
3 *
4 * Copyright: Copyright Auburn Sounds 2016-2019.
5 * License:   $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
6 * Authors:   Guillaume Piolat
7 */
8 module inteli.xmmintrin;
9 
10 public import inteli.types;
11 
12 import inteli.internals;
13 
14 import inteli.mmx;
15 import inteli.emmintrin;
16 
17 import core.stdc.stdlib: malloc, free;
18 import core.exception: onOutOfMemoryError;
19 
20 version(D_InlineAsm_X86)
21     version = InlineX86Asm;
22 else version(D_InlineAsm_X86_64)
23     version = InlineX86Asm;
24 
25 
26 // SSE1
27 
28 nothrow @nogc:
29 
30 
31 enum int _MM_EXCEPT_INVALID    = 0x0001; /// MXCSR Exception states.
32 enum int _MM_EXCEPT_DENORM     = 0x0002; ///ditto
33 enum int _MM_EXCEPT_DIV_ZERO   = 0x0004; ///ditto
34 enum int _MM_EXCEPT_OVERFLOW   = 0x0008; ///ditto
35 enum int _MM_EXCEPT_UNDERFLOW  = 0x0010; ///ditto
36 enum int _MM_EXCEPT_INEXACT    = 0x0020; ///ditto
37 enum int _MM_EXCEPT_MASK       = 0x003f; /// MXCSR Exception states mask.
38 
39 enum int _MM_MASK_INVALID      = 0x0080; /// MXCSR Exception masks.
40 enum int _MM_MASK_DENORM       = 0x0100; ///ditto
41 enum int _MM_MASK_DIV_ZERO     = 0x0200; ///ditto
42 enum int _MM_MASK_OVERFLOW     = 0x0400; ///ditto
43 enum int _MM_MASK_UNDERFLOW    = 0x0800; ///ditto
44 enum int _MM_MASK_INEXACT      = 0x1000; ///ditto
45 enum int _MM_MASK_MASK         = 0x1f80; /// MXCSR Exception masks mask.
46 
47 enum int _MM_ROUND_NEAREST     = 0x0000; /// MXCSR Rounding mode.
48 enum int _MM_ROUND_DOWN        = 0x2000; ///ditto
49 enum int _MM_ROUND_UP          = 0x4000; ///ditto
50 enum int _MM_ROUND_TOWARD_ZERO = 0x6000; ///ditto
51 enum int _MM_ROUND_MASK        = 0x6000; /// MXCSR Rounding mode mask.
52 
53 enum int _MM_FLUSH_ZERO_MASK   = 0x8000; /// MXCSR Denormal flush to zero mask.
54 enum int _MM_FLUSH_ZERO_ON     = 0x8000; /// MXCSR Denormal flush to zero modes.
55 enum int _MM_FLUSH_ZERO_OFF    = 0x0000; ///ditto
56 
57 /// Add packed single-precision (32-bit) floating-point elements in `a` and `b`.
58 __m128 _mm_add_ps(__m128 a, __m128 b) pure @safe
59 {
60     return a + b;
61 }
62 unittest
63 {
64     __m128 a = [1, 2, 3, 4];
65     a = _mm_add_ps(a, a);
66     assert(a.array[0] == 2);
67     assert(a.array[1] == 4);
68     assert(a.array[2] == 6);
69     assert(a.array[3] == 8);
70 }
71 
72 /// Add the lower single-precision (32-bit) floating-point element 
73 /// in `a` and `b`, store the result in the lower element of result, 
74 /// and copy the upper 3 packed elements from `a` to the upper elements of result.
75 __m128 _mm_add_ss(__m128 a, __m128 b) pure @safe
76 {
77     static if (GDC_with_SSE)
78         return __builtin_ia32_addss(a, b);
79     else
80     {
81         a[0] += b[0];
82         return a;
83     }
84 }
85 unittest
86 {
87     __m128 a = [1, 2, 3, 4];
88     a = _mm_add_ss(a, a);
89     assert(a.array == [2.0f, 2, 3, 4]);
90 }
91 
92 /// Compute the bitwise AND of packed single-precision (32-bit) floating-point elements in `a` and `b`.
93 __m128 _mm_and_ps (__m128 a, __m128 b) pure @safe
94 {
95     return cast(__m128)(cast(__m128i)a & cast(__m128i)b);
96 }
97 unittest
98 {
99     float a = 4.32f;
100     float b = -78.99f;
101     int correct = (*cast(int*)(&a)) & (*cast(int*)(&b));
102     __m128 A = _mm_set_ps(a, b, a, b);
103     __m128 B = _mm_set_ps(b, a, b, a);
104     int4 R = cast(int4)( _mm_and_ps(A, B) );
105     assert(R.array[0] == correct);
106     assert(R.array[1] == correct);
107     assert(R.array[2] == correct);
108     assert(R.array[3] == correct);
109 }
110 
111 /// Compute the bitwise NOT of packed single-precision (32-bit) floating-point elements in `a` and then AND with `b`.
112 __m128 _mm_andnot_ps (__m128 a, __m128 b) pure @safe
113 {
114     return cast(__m128)( (~cast(__m128i)a) & cast(__m128i)b );
115 }
116 unittest
117 {
118     float a = 4.32f;
119     float b = -78.99f;
120     int correct  = ~(*cast(int*)(&a)) &  (*cast(int*)(&b));
121     int correct2 =  (*cast(int*)(&a)) & ~(*cast(int*)(&b));
122     __m128 A = _mm_set_ps(a, b, a, b);
123     __m128 B = _mm_set_ps(b, a, b, a);
124     int4 R = cast(int4)( _mm_andnot_ps(A, B) );
125     assert(R.array[0] == correct2);
126     assert(R.array[1] == correct);
127     assert(R.array[2] == correct2);
128     assert(R.array[3] == correct);
129 }
130 
/// Average packed unsigned 16-bit integers in `a` and `b`.
132 __m64 _mm_avg_pu16 (__m64 a, __m64 b) pure @safe
133 {
134     return to_m64(_mm_avg_epu16(to_m128i(a), to_m128i(b)));
135 }
136 
/// Average packed unsigned 8-bit integers in `a` and `b`.
138 __m64 _mm_avg_pu8 (__m64 a, __m64 b) pure @safe
139 {
140     return to_m64(_mm_avg_epu8(to_m128i(a), to_m128i(b)));
141 }
142 
143 /// Compare packed single-precision (32-bit) floating-point elements in `a` and `b` for equality.
144 __m128 _mm_cmpeq_ps (__m128 a, __m128 b) pure @safe
145 {
146     return cast(__m128) cmpps!(FPComparison.oeq)(a, b);
147 }
148 
149 /// Compare the lower single-precision (32-bit) floating-point elements in `a` and `b` for equality, 
150 /// and copy the upper 3 packed elements from `a` to the upper elements of result.
151 __m128 _mm_cmpeq_ss (__m128 a, __m128 b) pure @safe
152 {
153     return cast(__m128) cmpss!(FPComparison.oeq)(a, b);
154 }
155 
156 /// Compare packed single-precision (32-bit) floating-point elements in `a` and `b` for greater-than-or-equal.
157 __m128 _mm_cmpge_ps (__m128 a, __m128 b) pure @safe
158 {
159     return cast(__m128) cmpps!(FPComparison.oge)(a, b);
160 }
161 unittest
162 {
163     __m128i R = cast(__m128i) _mm_cmpge_ps(_mm_setr_ps(0, 1, -1, float.nan),
164                                            _mm_setr_ps(0, 0, 0, 0));
165     int[4] correct = [-1, -1, 0, 0];
166     assert(R.array == correct);
167 }
168 
169 /// Compare the lower single-precision (32-bit) floating-point elements in `a` and `b` for greater-than-or-equal, 
170 /// and copy the upper 3 packed elements from `a` to the upper elements of result.
171 __m128 _mm_cmpge_ss (__m128 a, __m128 b) pure @safe
172 {
173     return cast(__m128) cmpss!(FPComparison.oge)(a, b);
174 }
175 
176 /// Compare packed single-precision (32-bit) floating-point elements in `a` and `b` for greater-than.
177 __m128 _mm_cmpgt_ps (__m128 a, __m128 b) pure @safe
178 {
179     return cast(__m128) cmpps!(FPComparison.ogt)(a, b);
180 }
181 
182 /// Compare the lower single-precision (32-bit) floating-point elements in `a` and `b` for greater-than, 
183 /// and copy the upper 3 packed elements from `a` to the upper elements of result.
184 __m128 _mm_cmpgt_ss (__m128 a, __m128 b) pure @safe
185 {
186     return cast(__m128) cmpss!(FPComparison.ogt)(a, b);
187 }
188 
189 /// Compare packed single-precision (32-bit) floating-point elements in `a` and `b` for less-than-or-equal.
190 __m128 _mm_cmple_ps (__m128 a, __m128 b) pure @safe
191 {
192     return cast(__m128) cmpps!(FPComparison.ole)(a, b);
193 }
194 
195 /// Compare the lower single-precision (32-bit) floating-point elements in `a` and `b` for less-than-or-equal, 
196 /// and copy the upper 3 packed elements from `a` to the upper elements of result.
197 __m128 _mm_cmple_ss (__m128 a, __m128 b) pure @safe
198 {
199     return cast(__m128) cmpss!(FPComparison.ole)(a, b);
200 }
201 
202 /// Compare packed single-precision (32-bit) floating-point elements in `a` and `b` for less-than.
203 __m128 _mm_cmplt_ps (__m128 a, __m128 b) pure @safe
204 {
205     return cast(__m128) cmpps!(FPComparison.olt)(a, b);
206 }
207 
208 /// Compare the lower single-precision (32-bit) floating-point elements in `a` and `b` for less-than, 
209 /// and copy the upper 3 packed elements from `a` to the upper elements of result.
210 __m128 _mm_cmplt_ss (__m128 a, __m128 b) pure @safe
211 {
212     return cast(__m128) cmpss!(FPComparison.olt)(a, b);
213 }
214 
215 /// Compare packed single-precision (32-bit) floating-point elements in `a` and `b` for not-equal.
216 __m128 _mm_cmpneq_ps (__m128 a, __m128 b) pure @safe
217 {
218     return cast(__m128) cmpps!(FPComparison.une)(a, b);
219 }
220 
221 /// Compare the lower single-precision (32-bit) floating-point elements in `a` and `b` for not-equal, 
222 /// and copy the upper 3 packed elements from `a` to the upper elements of result.
223 __m128 _mm_cmpneq_ss (__m128 a, __m128 b) pure @safe
224 {
225     return cast(__m128) cmpss!(FPComparison.une)(a, b);
226 }
227 
228 /// Compare packed single-precision (32-bit) floating-point elements in `a` and `b` for not-greater-than-or-equal.
229 __m128 _mm_cmpnge_ps (__m128 a, __m128 b) pure @safe
230 {
231     return cast(__m128) cmpps!(FPComparison.ult)(a, b);
232 }
233 
234 /// Compare the lower single-precision (32-bit) floating-point elements in `a` and `b` for not-greater-than-or-equal, 
235 /// and copy the upper 3 packed elements from `a` to the upper elements of result.
236 __m128 _mm_cmpnge_ss (__m128 a, __m128 b) pure @safe
237 {
238     return cast(__m128) cmpss!(FPComparison.ult)(a, b);
239 }
240 
241 /// Compare packed single-precision (32-bit) floating-point elements in `a` and `b` for not-greater-than.
242 __m128 _mm_cmpngt_ps (__m128 a, __m128 b) pure @safe
243 {
244     return cast(__m128) cmpps!(FPComparison.ule)(a, b);
245 }
246 
247 /// Compare the lower single-precision (32-bit) floating-point elements in `a` and `b` for not-greater-than, 
248 /// and copy the upper 3 packed elements from `a` to the upper elements of result.
249 __m128 _mm_cmpngt_ss (__m128 a, __m128 b) pure @safe
250 {
251     return cast(__m128) cmpss!(FPComparison.ule)(a, b);
252 }
253 
254 /// Compare packed single-precision (32-bit) floating-point elements in `a` and `b` for not-less-than-or-equal.
255 __m128 _mm_cmpnle_ps (__m128 a, __m128 b) pure @safe
256 {
257     return cast(__m128) cmpps!(FPComparison.ugt)(a, b);
258 }
259 
260 /// Compare the lower single-precision (32-bit) floating-point elements in `a` and `b` for not-less-than-or-equal, 
261 /// and copy the upper 3 packed elements from `a` to the upper elements of result.
262 __m128 _mm_cmpnle_ss (__m128 a, __m128 b) pure @safe
263 {
264     return cast(__m128) cmpss!(FPComparison.ugt)(a, b);
265 }
266 
267 /// Compare packed single-precision (32-bit) floating-point elements in `a` and `b` for not-less-than.
268 __m128 _mm_cmpnlt_ps (__m128 a, __m128 b) pure @safe
269 {
270     return cast(__m128) cmpps!(FPComparison.uge)(a, b);
271 }
272 
273 /// Compare the lower single-precision (32-bit) floating-point elements in `a` and `b` for not-less-than, 
274 /// and copy the upper 3 packed elements from `a` to the upper elements of result.
275 __m128 _mm_cmpnlt_ss (__m128 a, __m128 b) pure @safe
276 {
277     return cast(__m128) cmpss!(FPComparison.uge)(a, b);
278 }
279 
280 /// Compare packed single-precision (32-bit) floating-point elements in `a` and `b` to see if neither is NaN.
281 __m128 _mm_cmpord_ps (__m128 a, __m128 b) pure @safe
282 {
283     return cast(__m128) cmpps!(FPComparison.ord)(a, b);
284 }
285 
286 /// Compare the lower single-precision (32-bit) floating-point elements in `a` and `b` to see if neither is NaN, 
287 /// and copy the upper 3 packed elements from `a` to the upper elements of result.
288 __m128 _mm_cmpord_ss (__m128 a, __m128 b) pure @safe
289 {
290     return cast(__m128) cmpss!(FPComparison.ord)(a, b);
291 }
292 
293 /// Compare packed single-precision (32-bit) floating-point elements in `a` and `b` to see if either is NaN.
294 __m128 _mm_cmpunord_ps (__m128 a, __m128 b) pure @safe
295 {
296     return cast(__m128) cmpps!(FPComparison.uno)(a, b);
297 }
298 
/// Compare the lower single-precision (32-bit) floating-point elements in `a` and `b` to see if either is NaN, 
300 /// and copy the upper 3 packed elements from `a` to the upper elements of result.
301 __m128 _mm_cmpunord_ss (__m128 a, __m128 b) pure @safe
302 {
303     return cast(__m128) cmpss!(FPComparison.uno)(a, b);
304 }
305 
// Note: we've reversed clang and GCC behaviour with regard to EFLAGS.
// Some of these comparisons yield true for NaNs, others don't.
308 
309 /// Compare the lower single-precision (32-bit) floating-point element in `a` and `b` for equality, 
310 /// and return the boolean result (0 or 1).
311 int _mm_comieq_ss (__m128 a, __m128 b) pure @safe // comiss + sete
312 {
313     return comss!(FPComparison.ueq)(a, b); // yields true for NaN!
314 }
315 
316 /// Compare the lower single-precision (32-bit) floating-point element in `a` and `b` for greater-than-or-equal, 
317 /// and return the boolean result (0 or 1).
318 int _mm_comige_ss (__m128 a, __m128 b) pure @safe // comiss + setae
319 {
320     return comss!(FPComparison.oge)(a, b);
321 }
322 
323 /// Compare the lower single-precision (32-bit) floating-point element in `a` and `b` for greater-than, 
324 /// and return the boolean result (0 or 1).
325 int _mm_comigt_ss (__m128 a, __m128 b) pure @safe // comiss + seta
326 {
327     return comss!(FPComparison.ogt)(a, b);
328 }
329 
330 /// Compare the lower single-precision (32-bit) floating-point element in `a` and `b` for less-than-or-equal, 
331 /// and return the boolean result (0 or 1).
332 int _mm_comile_ss (__m128 a, __m128 b) pure @safe // comiss + setbe
333 {
334     return comss!(FPComparison.ule)(a, b); // yields true for NaN!
335 }
336 
337 /// Compare the lower single-precision (32-bit) floating-point element in `a` and `b` for less-than, 
338 /// and return the boolean result (0 or 1).
339 int _mm_comilt_ss (__m128 a, __m128 b) pure @safe // comiss + setb
340 {
341     return comss!(FPComparison.ult)(a, b); // yields true for NaN!
342 }
343 
344 /// Compare the lower single-precision (32-bit) floating-point element in `a` and `b` for not-equal, 
345 /// and return the boolean result (0 or 1).
346 int _mm_comineq_ss (__m128 a, __m128 b) pure @safe // comiss + setne
347 {
348     return comss!(FPComparison.one)(a, b);
349 }
350 
351 /// Convert packed signed 32-bit integers in `b` to packed single-precision (32-bit) 
352 /// floating-point elements, store the results in the lower 2 elements, 
353 /// and copy the upper 2 packed elements from `a` to the upper elements of result.
354 alias _mm_cvt_pi2ps = _mm_cvtpi32_ps;
355 
356 /// Convert 2 lower packed single-precision (32-bit) floating-point elements in `a` 
357 /// to packed 32-bit integers.
358 __m64 _mm_cvt_ps2pi (__m128 a) @safe
359 {
360     return to_m64(_mm_cvtps_epi32(a));
361 }
362 
363 /// Convert the signed 32-bit integer `b` to a single-precision (32-bit) floating-point element, 
364 /// store the result in the lower element, and copy the upper 3 packed elements from `a` to the 
365 /// upper elements of the result.
366 __m128 _mm_cvt_si2ss (__m128 v, int x) pure @trusted
367 {
368     v.ptr[0] = cast(float)x;
369     return v;
370 }
371 unittest
372 {
373     __m128 a = _mm_cvt_si2ss(_mm_set1_ps(0.0f), 42);
374     assert(a.array == [42f, 0, 0, 0]);
375 }
376 
377 /// Convert packed 16-bit integers in `a` to packed single-precision (32-bit) floating-point elements.
378 __m128 _mm_cvtpi16_ps (__m64 a) pure @safe
379 {
380     __m128i ma = to_m128i(a);
381     ma = _mm_unpacklo_epi16(ma, _mm_setzero_si128()); // Zero-extend to 32-bit
382     ma = _mm_srai_epi32(_mm_slli_epi32(ma, 16), 16); // Replicate sign bit
383     return _mm_cvtepi32_ps(ma);
384 }
385 unittest
386 {
387     __m64 A = _mm_setr_pi16(-1, 2, -3, 4);
388     __m128 R = _mm_cvtpi16_ps(A);
389     float[4] correct = [-1.0f, 2.0f, -3.0f, 4.0f];
390     assert(R.array == correct);
391 }
392 
393 /// Convert packed signed 32-bit integers in `b` to packed single-precision (32-bit) 
394 /// floating-point elements, store the results in the lower 2 elements, 
395 /// and copy the upper 2 packed elements from `a` to the upper elements of result.
396 __m128 _mm_cvtpi32_ps (__m128 a, __m64 b) pure @trusted
397 {
398     __m128 fb = _mm_cvtepi32_ps(to_m128i(b));
399     a.ptr[0] = fb.array[0];
400     a.ptr[1] = fb.array[1];
401     return a;
402 }
403 unittest
404 {
405     __m128 R = _mm_cvtpi32_ps(_mm_set1_ps(4.0f), _mm_setr_pi32(1, 2));
406     float[4] correct = [1.0f, 2.0f, 4.0f, 4.0f];
407     assert(R.array == correct);
408 }
409 
410 /// Convert packed signed 32-bit integers in `a` to packed single-precision (32-bit) floating-point elements, 
/// store the results in the lower 2 elements, then convert the packed signed 32-bit integers in `b` to 
/// single-precision (32-bit) floating-point elements, and store the results in the upper 2 elements.
413 __m128 _mm_cvtpi32x2_ps (__m64 a, __m64 b) pure @trusted
414 {
415     long2 l;
416     l.ptr[0] = a.array[0];
417     l.ptr[1] = b.array[0];
418     return _mm_cvtepi32_ps(cast(__m128i)l);
419 }
420 unittest
421 {
422     __m64 A = _mm_setr_pi32(-45, 128);
423     __m64 B = _mm_setr_pi32(0, 1000);
424     __m128 R = _mm_cvtpi32x2_ps(A, B);
425     float[4] correct = [-45.0f, 128.0f, 0.0f, 1000.0f];
426     assert(R.array == correct);
427 }
428 
429 /// Convert the lower packed 8-bit integers in `a` to packed single-precision (32-bit) floating-point elements.
430 __m128 _mm_cvtpi8_ps (__m64 a) pure @safe
431 {
432     __m128i b = to_m128i(a); 
433 
434     // Zero extend to 32-bit
435     b = _mm_unpacklo_epi8(b, _mm_setzero_si128());
436     b = _mm_unpacklo_epi16(b, _mm_setzero_si128());
437 
438     // Replicate sign bit
439     b = _mm_srai_epi32(_mm_slli_epi32(b, 24), 24); // Replicate sign bit
440     return _mm_cvtepi32_ps(b);
441 }
442 unittest
443 {
444     __m64 A = _mm_setr_pi8(-1, 2, -3, 4, 0, 0, 0, 0);
445     __m128 R = _mm_cvtpi8_ps(A);
446     float[4] correct = [-1.0f, 2.0f, -3.0f, 4.0f];
447     assert(R.array == correct);
448 }
449 
450 /// Convert packed single-precision (32-bit) floating-point elements in `a` to packed 16-bit integers.
451 /// Note: this intrinsic will generate 0x7FFF, rather than 0x8000, for input values between 0x7FFF and 0x7FFFFFFF.
452 __m64 _mm_cvtps_pi16 (__m128 a) @safe
453 {
    // The C++ version of this intrinsic converts to 32-bit integers, then uses packssdw,
    // which means the 16-bit integers are saturated.
456     __m128i b = _mm_cvtps_epi32(a);
457     b = _mm_packs_epi32(b, b);
458     return to_m64(b);
459 }
460 unittest
461 {
462     __m128 A = _mm_setr_ps(-1.0f, 2.0f, -33000.0f, 70000.0f);
463     short4 R = cast(short4) _mm_cvtps_pi16(A);
464     short[4] correct = [-1, 2, -32768, 32767];
465     assert(R.array == correct);
466 }
467 
468 /// Convert packed single-precision (32-bit) floating-point elements in `a` to packed 32-bit integers.
469 __m64 _mm_cvtps_pi32 (__m128 a) @safe
470 {
471     return to_m64(_mm_cvtps_epi32(a));
472 }
473 unittest
474 {
    __m128 A = _mm_setr_ps(-33000.0f, 70000.0f, -1.0f, 2.0f);
476     int2 R = cast(int2) _mm_cvtps_pi32(A);
477     int[2] correct = [-33000, 70000];
478     assert(R.array == correct);
479 }
480 
481 /// Convert packed single-precision (32-bit) floating-point elements in `a` to packed 8-bit integers, 
/// and store the results in the lower 4 elements. 
483 /// Note: this intrinsic will generate 0x7F, rather than 0x80, for input values between 0x7F and 0x7FFFFFFF.
484 __m64 _mm_cvtps_pi8 (__m128 a) @safe
485 {
    // The C++ version of this intrinsic converts to 32-bit integers, then uses packssdw + packsswb,
    // which means the 8-bit integers are saturated.
488     __m128i b = _mm_cvtps_epi32(a);
489     b = _mm_packs_epi32(b, _mm_setzero_si128());
490     b = _mm_packs_epi16(b, _mm_setzero_si128());
491     return to_m64(b);
492 }
493 unittest
494 {
495     __m128 A = _mm_setr_ps(-1.0f, 2.0f, -129.0f, 128.0f);
496     byte8 R = cast(byte8) _mm_cvtps_pi8(A);
497     byte[8] correct = [-1, 2, -128, 127, 0, 0, 0, 0];
498     assert(R.array == correct);
499 }
500 
501 /// Convert packed unsigned 16-bit integers in `a` to packed single-precision (32-bit) floating-point elements.
502 __m128 _mm_cvtpu16_ps (__m64 a) pure @safe
503 {
504     __m128i ma = to_m128i(a);
505     ma = _mm_unpacklo_epi16(ma, _mm_setzero_si128()); // Zero-extend to 32-bit
506     return _mm_cvtepi32_ps(ma);
507 }
508 unittest
509 {
510     __m64 A = _mm_setr_pi16(-1, 2, -3, 4);
511     __m128 R = _mm_cvtpu16_ps(A);
512     float[4] correct = [65535.0f, 2.0f, 65533.0f, 4.0f];
513     assert(R.array == correct);
514 }
515 
/// Convert the lower packed unsigned 8-bit integers in `a` to packed single-precision (32-bit) floating-point elements.
517 __m128 _mm_cvtpu8_ps (__m64 a) pure @safe
518 {
519     __m128i b = to_m128i(a); 
520 
521     // Zero extend to 32-bit
522     b = _mm_unpacklo_epi8(b, _mm_setzero_si128());
523     b = _mm_unpacklo_epi16(b, _mm_setzero_si128());
524     return _mm_cvtepi32_ps(b);
525 }
526 unittest
527 {
528     __m64 A = _mm_setr_pi8(-1, 2, -3, 4, 0, 0, 0, 0);
529     __m128 R = _mm_cvtpu8_ps(A);
530     float[4] correct = [255.0f, 2.0f, 253.0f, 4.0f];
531     assert(R.array == correct);
532 }
533 
534 /// Convert the signed 32-bit integer `b` to a single-precision (32-bit) floating-point element, 
535 /// store the result in the lower element, and copy the upper 3 packed elements from `a` to the 
536 /// upper elements of result.
537 __m128 _mm_cvtsi32_ss(__m128 v, int x) pure @trusted
538 {
539     v.ptr[0] = cast(float)x;
540     return v;
541 }
542 unittest
543 {
544     __m128 a = _mm_cvtsi32_ss(_mm_set1_ps(0.0f), 42);
545     assert(a.array == [42.0f, 0, 0, 0]);
546 }
547 
548 
549 /// Convert the signed 64-bit integer `b` to a single-precision (32-bit) floating-point element, 
550 /// store the result in the lower element, and copy the upper 3 packed elements from `a` to the 
551 /// upper elements of result.
552 __m128 _mm_cvtsi64_ss(__m128 v, long x) pure @trusted
553 {
554     v.ptr[0] = cast(float)x;
555     return v;
556 }
557 unittest
558 {
559     __m128 a = _mm_cvtsi64_ss(_mm_set1_ps(0.0f), 42);
560     assert(a.array == [42.0f, 0, 0, 0]);
561 }
562 
563 /// Take the lower single-precision (32-bit) floating-point element of `a`.
564 float _mm_cvtss_f32(__m128 a) pure @safe
565 {
566     return a.array[0];
567 }
568 
569 static if (LDC_with_SSE1)
570 {
571     /// Convert the lower single-precision (32-bit) floating-point element in `a` to a 32-bit integer.
572     int _mm_cvtss_si32 (__m128 a) @safe // PERF GDC
573     {
574         return __builtin_ia32_cvtss2si(a);
575     }
576 }
577 else
578 {
579     /// Convert the lower single-precision (32-bit) floating-point element in `a` to a 32-bit integer.
580     int _mm_cvtss_si32 (__m128 a) @safe
581     {
582         return convertFloatToInt32UsingMXCSR(a.array[0]);
583     }
584 }
585 unittest
586 {
587     assert(1 == _mm_cvtss_si32(_mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f)));
588 }
589 
590 version(LDC)
591 {
592     version(X86_64)
593     {
594         /// Convert the lower single-precision (32-bit) floating-point element in `a` to a 64-bit integer.
595         long _mm_cvtss_si64 (__m128 a) @safe
596         {
597             return __builtin_ia32_cvtss2si64(a);
598         }
599     }
600     else
601     {
602         /// Convert the lower single-precision (32-bit) floating-point element in `a` to a 64-bit integer.
603         long _mm_cvtss_si64 (__m128 a) @safe
604         {
605             // Note: __builtin_ia32_cvtss2si64 crashes LDC in 32-bit
606             return convertFloatToInt64UsingMXCSR(a.array[0]);
607         }
608     }
609 }
610 else
611 {
612     /// Convert the lower single-precision (32-bit) floating-point element in `a` to a 64-bit integer.
613     long _mm_cvtss_si64 (__m128 a) @safe
614     {
615         return convertFloatToInt64UsingMXCSR(a.array[0]);
616     }
617 }
618 unittest
619 {
620     assert(1 == _mm_cvtss_si64(_mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f)));
621 
622     uint savedRounding = _MM_GET_ROUNDING_MODE();
623 
624     _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST);
625     assert(-86186 == _mm_cvtss_si64(_mm_set1_ps(-86186.49f)));
626 
627     _MM_SET_ROUNDING_MODE(_MM_ROUND_DOWN);
628     assert(-86187 == _mm_cvtss_si64(_mm_set1_ps(-86186.1f)));
629 
630     _MM_SET_ROUNDING_MODE(_MM_ROUND_UP);
631     assert(86187 == _mm_cvtss_si64(_mm_set1_ps(86186.1f)));
632 
633     _MM_SET_ROUNDING_MODE(_MM_ROUND_TOWARD_ZERO);
634     assert(-86186 == _mm_cvtss_si64(_mm_set1_ps(-86186.9f)));
635 
636     _MM_SET_ROUNDING_MODE(savedRounding);
637 }
638 
639 
640 /// Convert the lower single-precision (32-bit) floating-point element in `a` to a 32-bit 
641 /// integer with truncation.
642 int _mm_cvtt_ss2si (__m128 a) pure @safe
643 {
644     // x86: cvttss2si always generated, even in -O0
645     return cast(int)(a.array[0]);
646 }
647 alias _mm_cvttss_si32 = _mm_cvtt_ss2si; ///ditto
648 unittest
649 {
650     assert(1 == _mm_cvtt_ss2si(_mm_setr_ps(1.9f, 2.0f, 3.0f, 4.0f)));
651 }
652 
653 
654 /// Convert packed single-precision (32-bit) floating-point elements in `a` to packed 32-bit 
655 /// integers with truncation.
656 __m64 _mm_cvtt_ps2pi (__m128 a) pure @safe
657 {
658     return to_m64(_mm_cvttps_epi32(a));
659 }
660 
661 
/// Convert the lower single-precision (32-bit) floating-point element in `a` to a 64-bit 
/// integer with truncation.
// Note: __builtin_ia32_cvttss2si64 crashes LDC when generating 32-bit x86 code.
long _mm_cvttss_si64 (__m128 a) pure @safe
664 {
665     return cast(long)(a.array[0]); // Generates cvttss2si as expected
666 }
667 unittest
668 {
669     assert(1 == _mm_cvttss_si64(_mm_setr_ps(1.9f, 2.0f, 3.0f, 4.0f)));
670 }
671 
/// Divide packed single-precision (32-bit) floating-point elements in `a` by packed elements in `b`.
__m128 _mm_div_ps(__m128 a, __m128 b) pure @safe
673 {
674     return a / b;
675 }
676 unittest
677 {
678     __m128 a = [1.5f, -2.0f, 3.0f, 1.0f];
679     a = _mm_div_ps(a, a);
680     float[4] correct = [1.0f, 1.0f, 1.0f, 1.0f];
681     assert(a.array == correct);
682 }
683 
/// Divide the lower single-precision (32-bit) floating-point element in `a` by the lower element in `b`, 
/// store the result in the lower element of result, 
/// and copy the upper 3 packed elements from `a` to the upper elements of result.
__m128 _mm_div_ss(__m128 a, __m128 b) pure @safe
685 {
686     static if (GDC_with_SSE)
687         return __builtin_ia32_divss(a, b);
688     else
689     {
690         a[0] /= b[0];
691         return a;
692     }
693 }
694 unittest
695 {
696     __m128 a = [1.5f, -2.0f, 3.0f, 1.0f];
697     a = _mm_div_ss(a, a);
698     float[4] correct = [1.0f, -2.0, 3.0f, 1.0f];
699     assert(a.array == correct);
700 }
701 
702 int _mm_extract_pi16 (__m64 a, int imm8)
703 {
704     short4 sa = cast(short4)a;
705     return cast(ushort)(sa.array[imm8]);
706 }
707 unittest
708 {
709     __m64 A = _mm_setr_pi16(-1, 6, 0, 4);
710     assert(_mm_extract_pi16(A, 0) == 65535);
711     assert(_mm_extract_pi16(A, 1) == 6);
712     assert(_mm_extract_pi16(A, 2) == 0);
713     assert(_mm_extract_pi16(A, 3) == 4);
714 }
715 
716 /// Free aligned memory that was allocated with `_mm_malloc`.
717 void _mm_free(void * mem_addr) @trusted
718 {
719     // support for free(NULL)
720     if (mem_addr is null)
721         return;
722 
    // Technically we don't need to store size and alignment in the chunk, but we do in case we
724     // have to implement _mm_realloc
725 
726     size_t pointerSize = (void*).sizeof;
727     void** rawLocation = cast(void**)(cast(char*)mem_addr - size_t.sizeof);
728     size_t* alignmentLocation = cast(size_t*)(cast(char*)mem_addr - 3 * pointerSize);
729     size_t alignment = *alignmentLocation;
730     assert(alignment != 0);
731     assert(isPointerAligned(mem_addr, alignment));
732     free(*rawLocation);
733 }
734 
735 uint _MM_GET_EXCEPTION_MASK() @safe
736 {
737     return _mm_getcsr() & _MM_MASK_MASK;
738 }
739 
740 uint _MM_GET_EXCEPTION_STATE() @safe
741 {
742     return _mm_getcsr() & _MM_EXCEPT_MASK;
743 }
744 
745 uint _MM_GET_FLUSH_ZERO_MODE() @safe
746 {
747     return _mm_getcsr() & _MM_FLUSH_ZERO_MASK;
748 }
749 
750 uint _MM_GET_ROUNDING_MODE() @safe
751 {
752     return _mm_getcsr() & _MM_ROUND_MASK;
753 }
754 
755 uint _mm_getcsr() @trusted
756 {
757     static if (LDC_with_ARM)
758     {
        // Note: we convert the ARM FPSCR into an x86 SSE control word.
760         // However, only rounding mode and flush to zero are actually set.
761         // The returned control word will have all exceptions masked, and no exception detected.
762 
763         uint fpscr = arm_get_fpcr();
764 
765         uint cw = 0; // No exception detected
766         if (fpscr & _MM_FLUSH_ZERO_MASK_ARM)
767         {
            // ARM has a single flush-to-zero flag, which covers both x86 bits
            // (flush-to-zero and denormals-are-zero).
770             // https://developer.arm.com/documentation/dui0473/c/neon-and-vfp-programming/the-effects-of-using-flush-to-zero-mode
771             cw |= _MM_FLUSH_ZERO_ON;
772             cw |= 0x40; // set "denormals are zeros"
773         } 
        cw |= _MM_MASK_MASK; // All exceptions masked
775 
776         // Rounding mode
777         switch(fpscr & _MM_ROUND_MASK_ARM)
778         {
779             default:
780             case _MM_ROUND_NEAREST_ARM:     cw |= _MM_ROUND_NEAREST;     break;
781             case _MM_ROUND_DOWN_ARM:        cw |= _MM_ROUND_DOWN;        break;
782             case _MM_ROUND_UP_ARM:          cw |= _MM_ROUND_UP;          break;
783             case _MM_ROUND_TOWARD_ZERO_ARM: cw |= _MM_ROUND_TOWARD_ZERO; break;
784         }
785         return cw;
786     }
787     else version(GNU)
788     {
789         static if (GDC_with_SSE)
790         {
791             return __builtin_ia32_stmxcsr();
792         }
793         else version(X86)
794         {
795             uint sseRounding = 0;
796             asm pure nothrow @nogc @trusted
797             {
798                 "stmxcsr %0;\n" 
799                   : "=m" (sseRounding)
800                   : 
801                   : ;
802             }
803             return sseRounding;
804         }
805         else
806             static assert(false);
807     }
808     else version (InlineX86Asm)
809     {
810         uint controlWord;
811         asm nothrow @nogc pure @safe
812         {
813             stmxcsr controlWord;
814         }
815         return controlWord;
816     }
817     else
818         static assert(0, "Not yet supported");
819 }
820 unittest
821 {
822     uint csr = _mm_getcsr();
823 }
824 
825 __m64 _mm_insert_pi16 (__m64 v, int i, int index) pure @trusted
826 {
827     short4 r = cast(short4)v;
828     r.ptr[index & 3] = cast(short)i;
829     return cast(__m64)r;
830 }
831 unittest
832 {
833     __m64 A = _mm_set_pi16(3, 2, 1, 0);
834     short4 R = cast(short4) _mm_insert_pi16(A, 42, 1 | 4);
835     short[4] correct = [0, 42, 2, 3];
836     assert(R.array == correct);
837 }
838 
/// Load 128 bits (composed of 4 packed single-precision (32-bit) floating-point elements) from memory.
/// `p` must be aligned on a 16-byte boundary.
__m128 _mm_load_ps(const(float)*p) pure @trusted
840 {
841     return *cast(__m128*)p;
842 }
843 
844 __m128 _mm_load_ps1(const(float)*p) pure @trusted
845 {
846     return __m128(*p);
847 }
848 
/// Load a single-precision (32-bit) floating-point element from memory into the lower element of result,
/// and zero the upper 3 elements.
__m128 _mm_load_ss (const(float)* mem_addr) pure @trusted
850 {
851     __m128 r;
852     r.ptr[0] = *mem_addr;
853     r.ptr[1] = 0;
854     r.ptr[2] = 0;
855     r.ptr[3] = 0;
856     return r;
857 }
858 
859 alias _mm_load1_ps = _mm_load_ps1;
860 
861 __m128 _mm_loadh_pi (__m128 a, const(__m64)* mem_addr) pure @trusted
862 {
863     long2 la = cast(long2)a;
864     la.ptr[1] = (*mem_addr).array[0];
865     return cast(__m128)la;
866 }
867 
868 __m128 _mm_loadl_pi (__m128 a, const(__m64)* mem_addr) pure @trusted
869 {
870     long2 la = cast(long2)a;
871     la.ptr[0] = (*mem_addr).array[0];
872     return cast(__m128)la;
873 }
874 
875 __m128 _mm_loadr_ps (const(float)* mem_addr) pure @trusted
876 {
877     __m128* aligned = cast(__m128*)mem_addr;
878     __m128 a = *aligned;
879     __m128 r;
880     r.ptr[0] = a.array[3];
881     r.ptr[1] = a.array[2];
882     r.ptr[2] = a.array[1];
883     r.ptr[3] = a.array[0];
884     return r;
885 }
886 
/// Load 128 bits (composed of 4 packed single-precision (32-bit) floating-point elements) from memory.
/// `p` does not need to be aligned on any particular boundary.
__m128 _mm_loadu_ps(const(float)*p) pure @safe
888 {
889     return loadUnaligned!(__m128)(p);
890 }
891 
892 __m128i _mm_loadu_si16(const(void)* mem_addr) pure @trusted
893 {
894     short r = *cast(short*)(mem_addr);
895     short8 result = [0, 0, 0, 0, 0, 0, 0, 0];
896     result.ptr[0] = r;
897     return cast(__m128i)result;
898 }
899 unittest
900 {
901     short r = 13;
902     short8 A = cast(short8) _mm_loadu_si16(&r);
903     short[8] correct = [13, 0, 0, 0, 0, 0, 0, 0];
904     assert(A.array == correct);
905 }
906 
907 __m128i _mm_loadu_si64(const(void)* mem_addr) pure @trusted
908 {
909     long r = *cast(long*)(mem_addr);
910     long2 result = [0, 0];
911     result.ptr[0] = r;
912     return cast(__m128i)result;
913 }
914 unittest
915 {
916     long r = 446446446446;
917     long2 A = cast(long2) _mm_loadu_si64(&r);
918     long[2] correct = [446446446446, 0];
919     assert(A.array == correct);
920 }
921 
/// Allocate `size` bytes of memory, aligned to the alignment specified in `alignment`,
923 /// and return a pointer to the allocated memory. `_mm_free` should be used to free
924 /// memory that is allocated with `_mm_malloc`.
925 void* _mm_malloc(size_t size, size_t alignment) @trusted
926 {
927     assert(alignment != 0);
928     size_t request = requestedSize(size, alignment);
929     void* raw = malloc(request);
930     if (request > 0 && raw == null) // malloc(0) can validly return anything
931         onOutOfMemoryError();
932     return storeRawPointerPlusInfo(raw, size, alignment); // PERF: no need to store size
933 }
934 
935 void _mm_maskmove_si64 (__m64 a, __m64 mask, char* mem_addr) @trusted
936 {
937     // this works since mask is zero-extended
938     return _mm_maskmoveu_si128 (to_m128i(a), to_m128i(mask), mem_addr);
939 }
940 
941 deprecated alias _m_maskmovq = _mm_maskmove_si64;
942 
/// Compare packed signed 16-bit integers in `a` and `b`, and return packed maximum values.
__m64 _mm_max_pi16 (__m64 a, __m64 b) pure @safe
944 {
945     return to_m64(_mm_max_epi16(to_m128i(a), to_m128i(b)));
946 }
947 
948 static if (GDC_with_SSE)
949 {
950     alias _mm_max_ps = __builtin_ia32_maxps;
951 }
952 else static if (LDC_with_SSE1)
953 {
954     alias _mm_max_ps = __builtin_ia32_maxps;
955 }
956 else
957 {
958     __m128 _mm_max_ps(__m128 a, __m128 b) pure @safe
959     {
        // ARM: Optimized into fcmgt + bsl since LDC 1.8 -O2
961         __m128 r;
962         r[0] = (a[0] > b[0]) ? a[0] : b[0];
963         r[1] = (a[1] > b[1]) ? a[1] : b[1];
964         r[2] = (a[2] > b[2]) ? a[2] : b[2];
965         r[3] = (a[3] > b[3]) ? a[3] : b[3];
966         return r;
967     }
968 }
969 unittest
970 {
971     __m128 A = _mm_setr_ps(1, 2, float.nan, 4);
972     __m128 B = _mm_setr_ps(4, 1, 4, float.nan);
973     __m128 M = _mm_max_ps(A, B);
974     assert(M.array[0] == 4);
975     assert(M.array[1] == 2);
976     assert(M.array[2] == 4);    // in case of NaN, second operand prevails (as it seems)
977     assert(M.array[3] != M.array[3]); // in case of NaN, second operand prevails (as it seems)
978 }
979 
/// Compare packed unsigned 8-bit integers in `a` and `b`, and return packed maximum values.
__m64 _mm_max_pu8 (__m64 a, __m64 b) pure @safe
981 {
982     return to_m64(_mm_max_epu8(to_m128i(a), to_m128i(b)));
983 }
984 
985 static if (GDC_with_SSE)
986 {
987     alias _mm_max_ss = __builtin_ia32_maxss;
988 }
989 else static if (LDC_with_SSE1)
990 {
991     alias _mm_max_ss = __builtin_ia32_maxss;
992 }
993 else
994 {
995     __m128 _mm_max_ss(__m128 a, __m128 b) pure @safe
996     {
997         __m128 r = a;
998         r[0] = (a[0] > b[0]) ? a[0] : b[0];
999         return r;
1000     }
1001 }
1002 unittest
1003 {
1004     __m128 A = _mm_setr_ps(1, 2, 3, 4);
1005     __m128 B = _mm_setr_ps(4, 1, 4, 1);
1006     __m128 C = _mm_setr_ps(float.nan, 1, 4, 1);
1007     __m128 M = _mm_max_ss(A, B);
1008     assert(M.array[0] == 4);
1009     assert(M.array[1] == 2);
1010     assert(M.array[2] == 3);
1011     assert(M.array[3] == 4);
    M = _mm_max_ss(A, C); // in case of NaN, second operand prevails
    assert(M.array[0] != M.array[0]);
    M = _mm_max_ss(C, A); // in case of NaN, second operand prevails
1015     assert(M.array[0] == 1);
1016 }
1017 
/// Compare packed signed 16-bit integers in `a` and `b`, and return packed minimum values.
__m64 _mm_min_pi16 (__m64 a, __m64 b) pure @safe
1019 {
1020     return to_m64(_mm_min_epi16(to_m128i(a), to_m128i(b)));
1021 }
1022 
1023 static if (GDC_with_SSE)
1024 {
1025     alias _mm_min_ps = __builtin_ia32_minps;
1026 }
1027 else static if (LDC_with_SSE1)
1028 {
1029     alias _mm_min_ps = __builtin_ia32_minps;
1030 }
1031 else
1032 {
1033     __m128 _mm_min_ps(__m128 a, __m128 b) pure @safe
1034     {
        // ARM: Optimized into fcmgt + bsl since LDC 1.8 -O2
1036         __m128 r;
1037         r[0] = (a[0] < b[0]) ? a[0] : b[0];
1038         r[1] = (a[1] < b[1]) ? a[1] : b[1];
1039         r[2] = (a[2] < b[2]) ? a[2] : b[2];
1040         r[3] = (a[3] < b[3]) ? a[3] : b[3];
1041         return r;
1042     }
1043 }
1044 unittest
1045 {
1046     __m128 A = _mm_setr_ps(1, 2, float.nan, 4);
1047     __m128 B = _mm_setr_ps(4, 1, 4, float.nan);
1048     __m128 M = _mm_min_ps(A, B);
1049     assert(M.array[0] == 1);
1050     assert(M.array[1] == 1);
1051     assert(M.array[2] == 4);    // in case of NaN, second operand prevails (as it seems)
1052     assert(M.array[3] != M.array[3]); // in case of NaN, second operand prevails (as it seems)
1053 }
1054 
/// Compare packed unsigned 8-bit integers in `a` and `b`, and return packed minimum values.
__m64 _mm_min_pu8 (__m64 a, __m64 b) pure @safe
1056 {
1057     return to_m64(_mm_min_epu8(to_m128i(a), to_m128i(b)));
1058 }
1059 
1060 static if (GDC_with_SSE)
1061 {
1062     alias _mm_min_ss = __builtin_ia32_minss;
1063 }
1064 else static if (LDC_with_SSE1)
1065 {
1066     alias _mm_min_ss = __builtin_ia32_minss;
1067 }
1068 else
1069 {
1070     __m128 _mm_min_ss(__m128 a, __m128 b) pure @safe
1071     {
1072         // Generates minss since LDC 1.3 -O1
1073         __m128 r = a;
1074         r[0] = (a[0] < b[0]) ? a[0] : b[0];
1075         return r;
1076     }
1077 }
1078 unittest
1079 {
1080     __m128 A = _mm_setr_ps(1, 2, 3, 4);
1081     __m128 B = _mm_setr_ps(4, 1, 4, 1);
1082     __m128 C = _mm_setr_ps(float.nan, 1, 4, 1);
1083     __m128 M = _mm_min_ss(A, B);
1084     assert(M.array[0] == 1);
1085     assert(M.array[1] == 2);
1086     assert(M.array[2] == 3);
1087     assert(M.array[3] == 4);
    M = _mm_min_ss(A, C); // in case of NaN, second operand prevails
    assert(M.array[0] != M.array[0]);
    M = _mm_min_ss(C, A); // in case of NaN, second operand prevails
1091     assert(M.array[0] == 1);
1092 }
1093 
/// Move the lower single-precision (32-bit) floating-point element from `b` to the lower element of result,
/// and copy the upper 3 packed elements from `a` to the upper elements of result.
__m128 _mm_move_ss (__m128 a, __m128 b) pure @trusted
1095 {
1096     a.ptr[0] = b.array[0];
1097     return a;
1098 }
1099 
/// Move the upper 2 single-precision (32-bit) floating-point elements from `b` to the lower 2 elements of result,
/// and copy the upper 2 elements from `a` to the upper 2 elements of result.
__m128 _mm_movehl_ps (__m128 a, __m128 b) pure @trusted
{
    // MOVHLPS semantics: result is [b2, b3, a2, a3].
    a.ptr[0] = b.array[2];
    a.ptr[1] = b.array[3];
    return a;
}
1106 
/// Move the lower 2 single-precision (32-bit) floating-point elements from `b` to the upper 2 elements of result,
/// and copy the lower 2 elements from `a` to the lower 2 elements of result.
__m128 _mm_movelh_ps (__m128 a, __m128 b) pure @trusted
1108 {
1109     a.ptr[2] = b.array[0];
1110     a.ptr[3] = b.array[1];
1111     return a;
1112 }
1113 
1114 int _mm_movemask_pi8 (__m64 a) pure @safe
1115 {
1116     return _mm_movemask_epi8(to_m128i(a));
1117 }
1118 unittest
1119 {
1120     assert(0x9C == _mm_movemask_pi8(_mm_set_pi8(-1, 0, 0, -1, -1, -1, 0, 0)));
1121 }
1122 
1123 static if (GDC_with_SSE)
1124 {
1125     alias _mm_movemask_ps = __builtin_ia32_movmskps;
1126 }
1127 else static if (LDC_with_SSE1)
1128 {
1129     alias _mm_movemask_ps = __builtin_ia32_movmskps;
1130 }
1131 else static if (LDC_with_ARM)
1132 {
1133     int _mm_movemask_ps (__m128 a) pure @safe
1134     {
1135         int4 ai = cast(int4)a;
1136         int4 shift31 = [31, 31, 31, 31]; 
1137         ai = ai >>> shift31;
1138         int4 shift = [0, 1, 2, 3]; 
1139         ai = ai << shift; // 4-way shift, only efficient on ARM.
1140         int r = ai.array[0] + (ai.array[1]) + (ai.array[2]) + (ai.array[3]);
1141         return r;
1142     }
1143 }
1144 else
1145 {
1146     int _mm_movemask_ps (__m128 a) pure @safe
1147     {
1148         int4 ai = cast(int4)a;
1149         int r = 0;
1150         if (ai[0] < 0) r += 1;
1151         if (ai[1] < 0) r += 2;
1152         if (ai[2] < 0) r += 4;
1153         if (ai[3] < 0) r += 8;
1154         return r;
1155     }
1156 }
1157 unittest
1158 {
1159     int4 A = [-1, 0, -43, 0];
1160     assert(5 == _mm_movemask_ps(cast(float4)A));
1161 }
1162 
/// Multiply packed single-precision (32-bit) floating-point elements in `a` and `b`.
__m128 _mm_mul_ps(__m128 a, __m128 b) pure @safe
1164 {
1165     return a * b;
1166 }
1167 unittest
1168 {
1169     __m128 a = [1.5f, -2.0f, 3.0f, 1.0f];
1170     a = _mm_mul_ps(a, a);
1171     float[4] correct = [2.25f, 4.0f, 9.0f, 1.0f];
1172     assert(a.array == correct);
1173 }
1174 
/// Multiply the lower single-precision (32-bit) floating-point element in `a` and `b`, 
/// store the result in the lower element of result, 
/// and copy the upper 3 packed elements from `a` to the upper elements of result.
__m128 _mm_mul_ss(__m128 a, __m128 b) pure @safe
1176 {
1177     static if (GDC_with_SSE)
1178         return __builtin_ia32_mulss(a, b);
1179     else
1180     {
1181         a[0] *= b[0];
1182         return a;
1183     }
1184 }
1185 unittest
1186 {
1187     __m128 a = [1.5f, -2.0f, 3.0f, 1.0f];
1188     a = _mm_mul_ss(a, a);
1189     float[4] correct = [2.25f, -2.0f, 3.0f, 1.0f];
1190     assert(a.array == correct);
1191 }
1192 
1193 __m64 _mm_mulhi_pu16 (__m64 a, __m64 b) pure @safe
1194 {
1195     return to_m64(_mm_mulhi_epu16(to_m128i(a), to_m128i(b)));
1196 }
1197 unittest
1198 {
1199     __m64 A = _mm_setr_pi16(0, -16, 2, 3);
1200     __m64 B = _mm_set1_pi16(16384);
1201     short4 R = cast(short4)_mm_mulhi_pu16(A, B);
1202     short[4] correct = [0, 0x3FFC, 0, 0];
1203     assert(R.array == correct);
1204 }
1205 
/// Compute the bitwise OR of packed single-precision (32-bit) floating-point elements in `a` and `b`.
__m128 _mm_or_ps (__m128 a, __m128 b) pure @safe
1207 {
1208     return cast(__m128)(cast(__m128i)a | cast(__m128i)b);
1209 }
1210 
1211 deprecated alias 
1212     _m_pavgb = _mm_avg_pu8,
1213     _m_pavgw = _mm_avg_pu16,
1214     _m_pextrw = _mm_extract_pi16,
1215     _m_pinsrw = _mm_insert_pi16,
1216     _m_pmaxsw = _mm_max_pi16,
1217     _m_pmaxub = _mm_max_pu8,
1218     _m_pminsw = _mm_min_pi16,
1219     _m_pminub = _mm_min_pu8,
1220     _m_pmovmskb = _mm_movemask_pi8,
1221     _m_pmulhuw = _mm_mulhi_pu16;
1222 
1223 enum _MM_HINT_T0  = 3; ///
1224 enum _MM_HINT_T1  = 2; ///
1225 enum _MM_HINT_T2  = 1; ///
1226 enum _MM_HINT_NTA = 0; ///
1227 
1228 
1229 version(LDC)
1230 {
1231     // Starting with LLVM 10, it seems llvm.prefetch has changed its name.
1232     // Was reported at: https://github.com/ldc-developers/ldc/issues/3397
1233     static if (__VERSION__ >= 2091) 
1234     {
1235         pragma(LDC_intrinsic, "llvm.prefetch.p0i8") // was "llvm.prefetch"
1236             void llvm_prefetch_fixed(void* ptr, uint rw, uint locality, uint cachetype) pure @safe;
1237     }
1238 }
1239 
1240 /// Fetch the line of data from memory that contains address `p` to a location in the 
/// cache hierarchy specified by the locality hint `locality`.
1242 ///
1243 /// Warning: `locality` is a compile-time parameter, unlike in Intel Intrinsics API.
1244 void _mm_prefetch(int locality)(const(void)* p) pure @trusted
1245 {
1246     static if (GDC_with_SSE)
1247     {
1248         return __builtin_prefetch(p, (locality & 0x4) >> 2, locality & 0x3);
1249     }
1250     else version(LDC)
1251     {
1252         static if (__VERSION__ >= 2091)
1253         {
1254             // const_cast here. `llvm_prefetch` wants a mutable pointer
1255             llvm_prefetch_fixed( cast(void*)p, 0, locality, 1);
1256         }
1257         else
1258         {
1259             // const_cast here. `llvm_prefetch` wants a mutable pointer
1260             llvm_prefetch( cast(void*)p, 0, locality, 1);
1261         }
1262     }
1263     else version(D_InlineAsm_X86_64)
1264     {
1265         static if (locality == _MM_HINT_NTA)
1266         {
1267             asm pure nothrow @nogc @trusted
1268             {
1269                 mov RAX, p;
1270                 prefetchnta [RAX];
1271             }
1272         }
1273         else static if (locality == _MM_HINT_T0)
1274         {
1275             asm pure nothrow @nogc @trusted
1276             {
1277                 mov RAX, p;
1278                 prefetcht0 [RAX];
1279             }
1280         }
1281         else static if (locality == _MM_HINT_T1)
1282         {
1283             asm pure nothrow @nogc @trusted
1284             {
1285                 mov RAX, p;
1286                 prefetcht1 [RAX];
1287             }
1288         }
1289         else static if (locality == _MM_HINT_T2)
1290         {
1291             asm pure nothrow @nogc @trusted
1292             {
1293                 mov RAX, p;
1294                 prefetcht2 [RAX];
1295             }
1296         }
1297         else
1298             assert(false); // invalid locality hint
1299     }
1300     else version(D_InlineAsm_X86)
1301     {
1302         static if (locality == _MM_HINT_NTA)
1303         {
1304             asm pure nothrow @nogc @trusted
1305             {
1306                 mov EAX, p;
1307                 prefetchnta [EAX];
1308             }
1309         }
1310         else static if (locality == _MM_HINT_T0)
1311         {
1312             asm pure nothrow @nogc @trusted
1313             {
1314                 mov EAX, p;
1315                 prefetcht0 [EAX];
1316             }
1317         }
1318         else static if (locality == _MM_HINT_T1)
1319         {
1320             asm pure nothrow @nogc @trusted
1321             {
1322                 mov EAX, p;
1323                 prefetcht1 [EAX];
1324             }
1325         }
1326         else static if (locality == _MM_HINT_T2)
1327         {
1328             asm pure nothrow @nogc @trusted
1329             {
1330                 mov EAX, p;
1331                 prefetcht2 [EAX];
1332             }
1333         }
1334         else 
1335             assert(false); // invalid locality hint
1336     }
1337     else
1338     {
1339         // Generic version: do nothing. From bitter experience, 
1340         // it's unlikely you get ANY speed-up with manual prefetching.
1341         // Prefetching or not doesn't change program behaviour.
1342     }
1343 }
1344 unittest
1345 {
1346     // From Intel documentation:
1347     // "The amount of data prefetched is also processor implementation-dependent. It will, however, be a minimum of 32 bytes."
1348     ubyte[256] cacheline; // though it seems it cannot generate GP fault
1349     _mm_prefetch!_MM_HINT_T0(cacheline.ptr); 
1350     _mm_prefetch!_MM_HINT_T1(cacheline.ptr); 
1351     _mm_prefetch!_MM_HINT_T2(cacheline.ptr); 
1352     _mm_prefetch!_MM_HINT_NTA(cacheline.ptr); 
1353 }
1354 
1355 deprecated alias
1356     _m_psadbw = _mm_sad_pu8,
1357     _m_pshufw = _mm_shuffle_pi16;
1358 
1359 static if (GDC_with_SSE)
1360 {
1361     alias _mm_rcp_ps = __builtin_ia32_rcpps;
1362 }
1363 else static if (LDC_with_SSE1)
1364 {
1365     alias _mm_rcp_ps = __builtin_ia32_rcpps;
1366 }
1367 else
1368 {
1369     __m128 _mm_rcp_ps (__m128 a) pure @safe
1370     {
1371         a[0] = 1.0f / a[0];
1372         a[1] = 1.0f / a[1];
1373         a[2] = 1.0f / a[2];
1374         a[3] = 1.0f / a[3];
1375         return a;
1376     }
1377 }
1378 
1379 static if (GDC_with_SSE)
1380 {
1381     alias _mm_rcp_ss = __builtin_ia32_rcpss;
1382 }
1383 else static if (LDC_with_SSE1)
1384 {
1385     alias _mm_rcp_ss = __builtin_ia32_rcpss;
1386 }
1387 else
1388 {
1389     __m128 _mm_rcp_ss (__m128 a) pure @safe
1390     {
1391         a[0] = 1.0f / a[0];
1392         return a;
1393     }
1394 }
1395 
1396 static if (GDC_with_SSE)
1397 {
1398     alias _mm_rsqrt_ps = __builtin_ia32_rsqrtps;
1399 }
1400 else static if (LDC_with_SSE1)
1401 {
1402     alias _mm_rsqrt_ps = __builtin_ia32_rsqrtps;
1403 }
1404 else version(LDC)
1405 {
1406     __m128 _mm_rsqrt_ps (__m128 a) pure @safe
1407     {
1408         a[0] = 1.0f / llvm_sqrt(a[0]);
1409         a[1] = 1.0f / llvm_sqrt(a[1]);
1410         a[2] = 1.0f / llvm_sqrt(a[2]);
1411         a[3] = 1.0f / llvm_sqrt(a[3]);
1412         return a;
1413     }
1414 }
1415 else
1416 {
1417     __m128 _mm_rsqrt_ps (__m128 a) pure @safe
1418     {
1419         a[0] = 1.0f / sqrt(a[0]);
1420         a[1] = 1.0f / sqrt(a[1]);
1421         a[2] = 1.0f / sqrt(a[2]);
1422         a[3] = 1.0f / sqrt(a[3]);
1423         return a;
1424     }
1425 }
1426 
1427 static if (GDC_with_SSE)
1428 {
1429     alias _mm_rsqrt_ss = __builtin_ia32_rsqrtss;
1430 }
1431 else static if (LDC_with_SSE1)
1432 {
1433     alias _mm_rsqrt_ss = __builtin_ia32_rsqrtss;
1434 }
1435 else version(LDC)
1436 {
1437     __m128 _mm_rsqrt_ss (__m128 a) pure @safe
1438     {
1439         a[0] = 1.0f / llvm_sqrt(a[0]);
1440         return a;
1441     }
1442 }
1443 else
1444 {
1445     __m128 _mm_rsqrt_ss (__m128 a) pure @safe
1446     {
1447         a[0] = 1.0f / sqrt(a[0]);
1448         return a;
1449     }
1450 }
1451 
1452 unittest
1453 {
1454     double maxRelativeError = 0.000245; // -72 dB
1455     void testInvSqrt(float number) nothrow @nogc
1456     {
1457         __m128 A = _mm_set1_ps(number);
1458 
1459         // test _mm_rcp_ps
1460         __m128 B = _mm_rcp_ps(A);
1461         foreach(i; 0..4)
1462         {
1463             double exact = 1.0f / A.array[i];
1464             double ratio = cast(double)(B.array[i]) / cast(double)(exact);
1465             assert(abs(ratio - 1) <= maxRelativeError);
1466         }
1467 
1468         // test _mm_rcp_ss
1469         {
1470             B = _mm_rcp_ss(A);
1471             double exact = 1.0f / A.array[0];
1472             double ratio = cast(double)(B.array[0]) / cast(double)(exact);
1473             assert(abs(ratio - 1) <= maxRelativeError);
1474         }
1475 
1476         // test _mm_rsqrt_ps
1477         B = _mm_rsqrt_ps(A);
1478         foreach(i; 0..4)
1479         {
1480             double exact = 1.0f / sqrt(A.array[i]);
1481             double ratio = cast(double)(B.array[i]) / cast(double)(exact);
1482             assert(abs(ratio - 1) <= maxRelativeError);
1483         }
1484 
1485         // test _mm_rsqrt_ss
1486         {
1487             B = _mm_rsqrt_ss(A);
1488             double exact = 1.0f / sqrt(A.array[0]);
1489             double ratio = cast(double)(B.array[0]) / cast(double)(exact);
1490             assert(abs(ratio - 1) <= maxRelativeError);
1491         }
1492     }
1493 
1494     testInvSqrt(1.1f);
1495     testInvSqrt(2.45674864151f);
1496     testInvSqrt(27841456468.0f);
1497 }
1498 
/// Compute the absolute differences of packed unsigned 8-bit integers in `a` and `b`, then horizontally sum 
/// the 8 differences and store the 16-bit sum in the low 16 bits of result; the rest of result is zeroed.
__m64 _mm_sad_pu8 (__m64 a, __m64 b) pure @safe
1500 {
1501     return to_m64(_mm_sad_epu8(to_m128i(a), to_m128i(b)));
1502 }
1503 
1504 void _MM_SET_EXCEPTION_MASK(int _MM_MASK_xxxx) @safe
1505 {
1506     // Note: unsupported on ARM
1507     _mm_setcsr((_mm_getcsr() & ~_MM_MASK_MASK) | _MM_MASK_xxxx);
1508 }
1509 
1510 void _MM_SET_EXCEPTION_STATE(int _MM_EXCEPT_xxxx) @safe
1511 {
1512     // Note: unsupported on ARM
1513     _mm_setcsr((_mm_getcsr() & ~_MM_EXCEPT_MASK) | _MM_EXCEPT_xxxx);
1514 }
1515 
1516 void _MM_SET_FLUSH_ZERO_MODE(int _MM_FLUSH_xxxx) @safe
1517 {
1518     _mm_setcsr((_mm_getcsr() & ~_MM_FLUSH_ZERO_MASK) | _MM_FLUSH_xxxx);
1519 }
1520 
1521 __m128 _mm_set_ps (float e3, float e2, float e1, float e0) pure @trusted
1522 {
1523     // Note: despite appearances, generates sensible code,
1524     //       inlines correctly and is constant folded
1525     float[4] result = [e0, e1, e2, e3];
1526     return loadUnaligned!(float4)(result.ptr);
1527 }
1528 unittest
1529 {
1530     __m128 A = _mm_set_ps(3, 2, 1, 546);
1531     float[4] correct = [546.0f, 1.0f, 2.0f, 3.0f];
1532     assert(A.array == correct);
1533     assert(A.array[0] == 546.0f);
1534     assert(A.array[1] == 1.0f);
1535     assert(A.array[2] == 2.0f);
1536     assert(A.array[3] == 3.0f);
1537 }
1538 
1539 alias _mm_set_ps1 = _mm_set1_ps;
1540 
1541 void _MM_SET_ROUNDING_MODE(int _MM_ROUND_xxxx) @safe
1542 {
1543     _mm_setcsr((_mm_getcsr() & ~_MM_ROUND_MASK) | _MM_ROUND_xxxx);
1544 }
1545 
1546 __m128 _mm_set_ss (float a) pure @trusted
1547 {
1548     __m128 r = _mm_setzero_ps();
1549     r.ptr[0] = a;
1550     return r;
1551 }
1552 unittest
1553 {
1554     float[4] correct = [42.0f, 0.0f, 0.0f, 0.0f];
1555     __m128 A = _mm_set_ss(42.0f);
1556     assert(A.array == correct);
1557 }
1558 
1559 __m128 _mm_set1_ps (float a) pure @trusted
1560 {
1561     __m128 r = void;
1562     r.ptr[0] = a;
1563     r.ptr[1] = a;
1564     r.ptr[2] = a;
1565     r.ptr[3] = a;
1566     return r;
1567 }
1568 unittest
1569 {
1570     float[4] correct = [42.0f, 42.0f, 42.0f, 42.0f];
1571     __m128 A = _mm_set1_ps(42.0f);
1572     assert(A.array == correct);
1573 }
1574 
1575 void _mm_setcsr(uint controlWord) @trusted
1576 {
1577     static if (LDC_with_ARM)
1578     {
        // Convert from SSE to ARM control word. This is done _partially_,
        // and only supports rounding-mode and flush-to-zero changes.
1581 
1582         // "To alter some bits of a VFP system register without 
1583         // affecting other bits, use a read-modify-write procedure"
1584         uint fpscr = arm_get_fpcr();
1585         
        // Bits 23 to 22 hold the rounding mode; NEON instructions do not use them, however.
1587         fpscr = fpscr & ~_MM_ROUND_MASK_ARM;
1588         switch(controlWord & _MM_ROUND_MASK)
1589         {
1590             default:
1591             case _MM_ROUND_NEAREST:     fpscr |= _MM_ROUND_NEAREST_ARM;     break;
1592             case _MM_ROUND_DOWN:        fpscr |= _MM_ROUND_DOWN_ARM;        break;
1593             case _MM_ROUND_UP:          fpscr |= _MM_ROUND_UP_ARM;          break;
1594             case _MM_ROUND_TOWARD_ZERO: fpscr |= _MM_ROUND_TOWARD_ZERO_ARM; break;
1595         }
1596         fpscr = fpscr & ~_MM_FLUSH_ZERO_MASK_ARM;
1597         if (controlWord & _MM_FLUSH_ZERO_MASK)
1598             fpscr |= _MM_FLUSH_ZERO_MASK_ARM;
1599         arm_set_fpcr(fpscr);
1600     }
1601     else version(GNU)
1602     {
1603         static if (GDC_with_SSE)
1604         {
1605             __builtin_ia32_ldmxcsr(controlWord);
1606         }
1607         else version(X86)
1608         {
1609             asm pure nothrow @nogc @trusted
1610             {
1611                 "ldmxcsr %0;\n" 
1612                   : 
1613                   : "m" (controlWord)
1614                   : ;
1615             }
1616         }
1617         else
1618             static assert(false);
1619     }
1620     else version (InlineX86Asm)
1621     {
1622         asm pure nothrow @nogc @safe
1623         {
1624             ldmxcsr controlWord;
1625         }
1626     }
1627     else
1628         static assert(0, "Not yet supported");
1629 }
1630 unittest
1631 {
1632     _mm_setcsr(_mm_getcsr());
1633 }
1634 
1635 __m128 _mm_setr_ps (float e3, float e2, float e1, float e0) pure @trusted
1636 {
1637     float[4] result = [e3, e2, e1, e0];
1638     return loadUnaligned!(float4)(result.ptr);
1639 }
1640 unittest
1641 {
1642     __m128 A = _mm_setr_ps(3, 2, 1, 546);
1643     float[4] correct = [3.0f, 2.0f, 1.0f, 546.0f];
1644     assert(A.array == correct);
1645     assert(A.array[0] == 3.0f);
1646     assert(A.array[1] == 2.0f);
1647     assert(A.array[2] == 1.0f);
1648     assert(A.array[3] == 546.0f);
1649 }
1650 
1651 __m128 _mm_setzero_ps() pure @trusted
1652 {
1653     // Compiles to xorps without problems
1654     float[4] result = [0.0f, 0.0f, 0.0f, 0.0f];
1655     return loadUnaligned!(float4)(result.ptr);
1656 }
1657 
1658 version(GNU)
1659 {
1660     void _mm_sfence() pure @trusted
1661     {
1662         static if (GDC_with_SSE)
1663         {
1664             __builtin_ia32_sfence();
1665         }
1666         else version(X86)
1667         {
1668             asm pure nothrow @nogc @trusted
1669             {
1670                 "sfence;\n" : : : ;
1671             }
1672         }
1673         else
1674             static assert(false);
1675     }
1676 }
1677 else static if (LDC_with_SSE1)
1678 {
1679     alias _mm_sfence = __builtin_ia32_sfence;
1680 }
1681 else static if (DMD_with_asm)
1682 {
1683     void _mm_sfence() pure @safe
1684     {
1685         asm nothrow @nogc pure @safe
1686         {
1687             sfence;
1688         }
1689     }
1690 }
1691 else version(LDC)
1692 {
1693     void _mm_sfence() pure @safe
1694     {
1695         llvm_memory_fence(); // PERF: generates mfence
1696     }
1697 }
1698 else
1699     static assert(false);
1700 unittest
1701 {
1702     _mm_sfence();
1703 }
1704 
1705 /// Warning: the immediate shuffle value `imm8` is given at compile-time instead of runtime.
1706 __m64 _mm_shuffle_pi16(int imm8)(__m64 a) pure @safe
1707 {
1708     return cast(__m64) shufflevector!(short4, ( (imm8 >> 0) & 3 ),
1709                                               ( (imm8 >> 2) & 3 ),
1710                                               ( (imm8 >> 4) & 3 ),
1711                                               ( (imm8 >> 6) & 3 ))(cast(short4)a, cast(short4)a);
1712 }
1713 unittest
1714 {
1715     __m64 A = _mm_setr_pi16(0, 1, 2, 3);
1716     enum int SHUFFLE = _MM_SHUFFLE(0, 1, 2, 3);
1717     short4 B = cast(short4) _mm_shuffle_pi16!SHUFFLE(A);
1718     short[4] expectedB = [ 3, 2, 1, 0 ];
1719     assert(B.array == expectedB);
1720 }
1721 
1722 /// Warning: the immediate shuffle value `imm8` is given at compile-time instead of runtime.
1723 __m128 _mm_shuffle_ps(ubyte imm)(__m128 a, __m128 b) pure @safe
1724 {
1725     return shufflevector!(__m128, imm & 3, (imm>>2) & 3, 4 + ((imm>>4) & 3), 4 + ((imm>>6) & 3) )(a, b);
1726 }
1727 
1728 static if (GDC_with_SSE)
1729 {
1730     alias _mm_sqrt_ps = __builtin_ia32_sqrtps;
1731 }
1732 else version(LDC)
1733 {
1734     // Disappeared with LDC 1.11
1735     static if (__VERSION__ < 2081)
1736         alias _mm_sqrt_ps = __builtin_ia32_sqrtps;
1737     else
1738     {
1739         __m128 _mm_sqrt_ps(__m128 vec) pure @safe
1740         {
1741             vec.array[0] = llvm_sqrt(vec.array[0]);
1742             vec.array[1] = llvm_sqrt(vec.array[1]);
1743             vec.array[2] = llvm_sqrt(vec.array[2]);
1744             vec.array[3] = llvm_sqrt(vec.array[3]);
1745             return vec;
1746         }
1747     }
1748 }
1749 else
1750 {
1751     __m128 _mm_sqrt_ps(__m128 vec) pure @trusted
1752     {
1753         vec.ptr[0] = sqrt(vec.array[0]);
1754         vec.ptr[1] = sqrt(vec.array[1]);
1755         vec.ptr[2] = sqrt(vec.array[2]);
1756         vec.ptr[3] = sqrt(vec.array[3]);
1757         return vec;
1758     }
1759 }
1760 unittest
1761 {
1762     __m128 A = _mm_sqrt_ps(_mm_set1_ps(4.0f));
1763     assert(A.array[0] == 2.0f);
1764     assert(A.array[1] == 2.0f);
1765     assert(A.array[2] == 2.0f);
1766     assert(A.array[3] == 2.0f);
1767 }
1768 
// _mm_sqrt_ss: compute the square root of the lower single-precision (32-bit) floating-point element,
// copying the upper 3 packed elements of the input to the upper elements of the result.
static if (GDC_with_SSE)
1770 {
1771     alias _mm_sqrt_ss = __builtin_ia32_sqrtss;
1772 }
1773 else version(LDC)
1774 {
1775     // Disappeared with LDC 1.11
1776     static if (__VERSION__ < 2081)
1777         alias _mm_sqrt_ss = __builtin_ia32_sqrtss;
1778     else
1779     {
1780         __m128 _mm_sqrt_ss(__m128 vec) pure @safe
1781         {
1782             vec.array[0] = llvm_sqrt(vec.array[0]);
1783             vec.array[1] = vec.array[1];
1784             vec.array[2] = vec.array[2];
1785             vec.array[3] = vec.array[3];
1786             return vec;
1787         }
1788     }
1789 }
1790 else
1791 {
1792     __m128 _mm_sqrt_ss(__m128 vec) pure @trusted
1793     {
1794         vec.ptr[0] = sqrt(vec.array[0]);
1795         return vec;
1796     }
1797 }
1798 unittest
1799 {
1800     __m128 A = _mm_sqrt_ss(_mm_set1_ps(4.0f));
1801     assert(A.array[0] == 2.0f);
1802     assert(A.array[1] == 4.0f);
1803     assert(A.array[2] == 4.0f);
1804     assert(A.array[3] == 4.0f);
1805 }
1806 
/// Store 128-bits (composed of 4 packed single-precision (32-bit) floating-point elements) from `a` into memory.
/// `mem_addr` must be aligned on a 16-byte boundary or a general-protection exception may be generated.
void _mm_store_ps (float* mem_addr, __m128 a) pure // not safe since nothing guarantees alignment
1808 {
1809     __m128* aligned = cast(__m128*)mem_addr;
1810     *aligned = a;
1811 }
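// Basic check with a 16-byte aligned destination.
unittest
{
    align(16) float[4] A;
    _mm_store_ps(A.ptr, _mm_set1_ps(13.0f));
    float[4] correct = [13.0f, 13.0f, 13.0f, 13.0f];
    assert(A == correct);
}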
1812 
/// Store the lower single-precision (32-bit) floating-point element from `a` into 4 contiguous elements in memory.
/// Same as `_mm_store1_ps`.
alias _mm_store_ps1 = _mm_store1_ps;
1814 
/// Store the lower single-precision (32-bit) floating-point element from `a` into memory.
/// `mem_addr` does not need to be aligned on any particular boundary.
void _mm_store_ss (float* mem_addr, __m128 a) pure @safe
1816 {
1817     *mem_addr = a.array[0];
1818 }
1819 unittest
1820 {
1821     float a;
1822     _mm_store_ss(&a, _mm_set_ps(3, 2, 1, 546));
1823     assert(a == 546);
1824 }
1825 
/// Store the lower single-precision (32-bit) floating-point element from `a` into 4 contiguous elements in memory.
/// `mem_addr` must be aligned on a 16-byte boundary or a general-protection exception may be generated.
void _mm_store1_ps(float* mem_addr, __m128 a) pure @trusted // not safe since nothing guarantees alignment
1827 {
1828     __m128* aligned = cast(__m128*)mem_addr;
1829     __m128 r;
1830     r.ptr[0] = a.array[0];
1831     r.ptr[1] = a.array[0];
1832     r.ptr[2] = a.array[0];
1833     r.ptr[3] = a.array[0];
1834     *aligned = r;
1835 }
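// Basic check: the lowest lane of `a` should be broadcast to all 4 stored elements.
unittest
{
    align(16) float[4] A;
    _mm_store1_ps(A.ptr, _mm_setr_ps(42.0f, 1.0f, 2.0f, 3.0f));
    float[4] correct = [42.0f, 42.0f, 42.0f, 42.0f];
    assert(A == correct);
}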
1836 
/// Store the upper 2 single-precision (32-bit) floating-point elements from `a` into memory.
void _mm_storeh_pi(__m64* p, __m128 a) pure @trusted
1838 {
1839     long2 la = cast(long2)a;
1840     (*p).ptr[0] = la.array[1];
1841 }
1842 unittest
1843 {
1844     __m64 R = _mm_setzero_si64();
1845     long2 A = [13, 25];
1846     _mm_storeh_pi(&R, cast(__m128)A);
1847     assert(R.array[0] == 25);
1848 }
1849 
/// Store the lower 2 single-precision (32-bit) floating-point elements from `a` into memory.
void _mm_storel_pi(__m64* p, __m128 a) pure @trusted
1851 {
1852     long2 la = cast(long2)a;
1853     (*p).ptr[0] = la.array[0];
1854 }
1855 unittest
1856 {
1857     __m64 R = _mm_setzero_si64();
1858     long2 A = [13, 25];
1859     _mm_storel_pi(&R, cast(__m128)A);
1860     assert(R.array[0] == 13);
1861 }
1862 
/// Store 4 single-precision (32-bit) floating-point elements from `a` into memory in reverse order.
/// `mem_addr` must be aligned on a 16-byte boundary or a general-protection exception may be generated.
void _mm_storer_ps(float* mem_addr, __m128 a) pure @trusted // not safe since nothing guarantees alignment
1864 {
1865     __m128* aligned = cast(__m128*)mem_addr;
1866     __m128 r;
1867     r.ptr[0] = a.array[3];
1868     r.ptr[1] = a.array[2];
1869     r.ptr[2] = a.array[1];
1870     r.ptr[3] = a.array[0];
1871     *aligned = r;
1872 }
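// Basic check: elements should land in memory in reverse order.
unittest
{
    align(16) float[4] A;
    _mm_storer_ps(A.ptr, _mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f));
    float[4] correct = [4.0f, 3.0f, 2.0f, 1.0f];
    assert(A == correct);
}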
1873 
/// Store 128-bits (composed of 4 packed single-precision (32-bit) floating-point elements) from `a` into memory.
/// `mem_addr` does not need to be aligned on any particular boundary.
void _mm_storeu_ps(float* mem_addr, __m128 a) pure @safe
1875 {
1876     storeUnaligned!(float4)(a, mem_addr);
1877 }
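// Basic check; the destination has no particular alignment requirement.
unittest
{
    float[4] A;
    _mm_storeu_ps(A.ptr, _mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f));
    float[4] correct = [1.0f, 2.0f, 3.0f, 4.0f];
    assert(A == correct);
}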
1878 
/// Store 64-bits of integer data from `a` into memory using a non-temporal memory hint.
void _mm_stream_pi (__m64* mem_addr, __m64 a)
1880 {
1881     // BUG see `_mm_stream_ps` for an explanation why we don't implement non-temporal moves
1882     *mem_addr = a; // it's a regular move instead
1883 }
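// Basic check: since this is implemented as a regular move, the destination
// should simply receive the 64-bit value of `a`.
unittest
{
    __m64 A = _mm_setzero_si64();
    __m64 B = _mm_setr_pi16(1, 2, 3, 4);
    _mm_stream_pi(&A, B);
    assert(A.array[0] == B.array[0]);
}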
1884 
1885 // BUG: can't implement non-temporal store with LDC inlineIR since !nontemporal
1886 // needs some IR outside this function that would say:
1887 //
1888 //  !0 = !{ i32 1 }
1889 //
1890 // It's a LLVM IR metadata description.
1891 // Regardless, non-temporal moves are really dangerous for performance...
/// Store 128-bits (composed of 4 packed single-precision (32-bit) floating-point elements) from `a` into memory
/// using a non-temporal memory hint. `mem_addr` must be aligned on a 16-byte boundary.
void _mm_stream_ps (float* mem_addr, __m128 a)
1893 {
1894     __m128* dest = cast(__m128*)mem_addr;
1895     *dest = a; // it's a regular move instead
1896 }
1897 unittest
1898 {
1899     align(16) float[4] A;
1900     _mm_stream_ps(A.ptr, _mm_set1_ps(78.0f));
1901     assert(A[0] == 78.0f && A[1] == 78.0f && A[2] == 78.0f && A[3] == 78.0f);
1902 }
1903 
/// Subtract packed single-precision (32-bit) floating-point elements in `b` from packed single-precision (32-bit) floating-point elements in `a`.
__m128 _mm_sub_ps(__m128 a, __m128 b) pure @safe
1905 {
1906     return a - b;
1907 }
1908 unittest
1909 {
1910     __m128 a = [1.5f, -2.0f, 3.0f, 1.0f];
1911     a = _mm_sub_ps(a, a);
1912     float[4] correct = [0.0f, 0.0f, 0.0f, 0.0f];
1913     assert(a.array == correct);
1914 }
1915 
/// Subtract the lower single-precision (32-bit) floating-point element in `b` from the lower
/// single-precision (32-bit) floating-point element in `a`, store the result in the lower element of result,
/// and copy the upper 3 packed elements from `a` to the upper elements of result.
__m128 _mm_sub_ss(__m128 a, __m128 b) pure @safe
1917 {
1918     static if (GDC_with_SSE)
1919         return __builtin_ia32_subss(a, b);
1920     else
1921     {
1922         a[0] -= b[0];
1923         return a;
1924     }
1925 }
1926 unittest
1927 {
1928     __m128 a = [1.5f, -2.0f, 3.0f, 1.0f];
1929     a = _mm_sub_ss(a, a);
1930     float[4] correct = [0.0f, -2.0, 3.0f, 1.0f];
1931     assert(a.array == correct);
1932 }
1933 
1934 
/// Transpose the 4x4 matrix formed by the 4 rows of single-precision (32-bit) floating-point elements
/// in `row0`, `row1`, `row2`, and `row3`, in place.
void _MM_TRANSPOSE4_PS (ref __m128 row0, ref __m128 row1, ref __m128 row2, ref __m128 row3) pure @safe
1936 {
1937     __m128 tmp3, tmp2, tmp1, tmp0;
1938     tmp0 = _mm_unpacklo_ps(row0, row1);
1939     tmp2 = _mm_unpacklo_ps(row2, row3);
1940     tmp1 = _mm_unpackhi_ps(row0, row1);
1941     tmp3 = _mm_unpackhi_ps(row2, row3);
1942     row0 = _mm_movelh_ps(tmp0, tmp2);
1943     row1 = _mm_movehl_ps(tmp2, tmp0);
1944     row2 = _mm_movelh_ps(tmp1, tmp3);
1945     row3 = _mm_movehl_ps(tmp3, tmp1);
1946 }
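// Basic check: transposing a 4x4 matrix of distinct values should move
// element (i, j) to position (j, i).
unittest
{
    __m128 l0 = _mm_setr_ps( 0.0f,  1.0f,  2.0f,  3.0f);
    __m128 l1 = _mm_setr_ps( 4.0f,  5.0f,  6.0f,  7.0f);
    __m128 l2 = _mm_setr_ps( 8.0f,  9.0f, 10.0f, 11.0f);
    __m128 l3 = _mm_setr_ps(12.0f, 13.0f, 14.0f, 15.0f);
    _MM_TRANSPOSE4_PS(l0, l1, l2, l3);
    float[4] r0 = [0.0f, 4.0f,  8.0f, 12.0f];
    float[4] r1 = [1.0f, 5.0f,  9.0f, 13.0f];
    float[4] r2 = [2.0f, 6.0f, 10.0f, 14.0f];
    float[4] r3 = [3.0f, 7.0f, 11.0f, 15.0f];
    assert(l0.array == r0);
    assert(l1.array == r1);
    assert(l2.array == r2);
    assert(l3.array == r3);
}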
1947 
// Note: the only difference between these intrinsics and their _mm_comi*_ss
//       counterparts is the exception behaviour on quiet NaNs. Aliasing them
//       is technically incorrect, but wanting to distinguish qNaN from sNaN
//       and treat them differently on purpose seems extremely rare.
1952 alias _mm_ucomieq_ss = _mm_comieq_ss;
1953 alias _mm_ucomige_ss = _mm_comige_ss;
1954 alias _mm_ucomigt_ss = _mm_comigt_ss;
1955 alias _mm_ucomile_ss = _mm_comile_ss;
1956 alias _mm_ucomilt_ss = _mm_comilt_ss;
1957 alias _mm_ucomineq_ss = _mm_comineq_ss;
1958 
1959 
/// Return vector of type `__m128` with undefined elements.
__m128 _mm_undefined_ps() pure @safe
1961 {
1962     __m128 undef = void;
1963     return undef;
1964 }
1965 
/// Unpack and interleave single-precision (32-bit) floating-point elements from the high half of `a` and `b`.
__m128 _mm_unpackhi_ps (__m128 a, __m128 b) pure @trusted
1967 {
1968     __m128 r;
1969     r.ptr[0] = a.array[2];
1970     r.ptr[1] = b.array[2];
1971     r.ptr[2] = a.array[3];
1972     r.ptr[3] = b.array[3];
1973     return r;
1974 }
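// Basic check: the result should interleave the upper halves of `a` and `b`.
unittest
{
    __m128 A = _mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f);
    __m128 B = _mm_setr_ps(5.0f, 6.0f, 7.0f, 8.0f);
    __m128 R = _mm_unpackhi_ps(A, B);
    float[4] correct = [3.0f, 7.0f, 4.0f, 8.0f];
    assert(R.array == correct);
}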
1975 
/// Unpack and interleave single-precision (32-bit) floating-point elements from the low half of `a` and `b`.
__m128 _mm_unpacklo_ps (__m128 a, __m128 b) pure @trusted
1977 {
1978     __m128 r;
1979     r.ptr[0] = a.array[0];
1980     r.ptr[1] = b.array[0];
1981     r.ptr[2] = a.array[1];
1982     r.ptr[3] = b.array[1];
1983     return r;
1984 }
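// Basic check: the result should interleave the lower halves of `a` and `b`.
unittest
{
    __m128 A = _mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f);
    __m128 B = _mm_setr_ps(5.0f, 6.0f, 7.0f, 8.0f);
    __m128 R = _mm_unpacklo_ps(A, B);
    float[4] correct = [1.0f, 5.0f, 2.0f, 6.0f];
    assert(R.array == correct);
}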
1985 
/// Compute the bitwise XOR of packed single-precision (32-bit) floating-point elements in `a` and `b`.
__m128 _mm_xor_ps (__m128 a, __m128 b) pure @safe
1987 {
1988     return cast(__m128)(cast(__m128i)a ^ cast(__m128i)b);
1989 }
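// Basic check: XOR-ing a value with itself should clear every bit.
unittest
{
    __m128 A = _mm_set1_ps(4.32f);
    int4 R = cast(int4) _mm_xor_ps(A, A);
    assert(R.array[0] == 0);
    assert(R.array[1] == 0);
    assert(R.array[2] == 0);
    assert(R.array[3] == 0);
}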
1990 
1991 
1992 private
1993 {
1994     /// Returns: `true` if the pointer is suitably aligned.
1995     bool isPointerAligned(void* p, size_t alignment) pure
1996     {
1997         assert(alignment != 0);
1998         return ( cast(size_t)p & (alignment - 1) ) == 0;
1999     }
2000 
2001     /// Returns: next pointer aligned with alignment bytes.
2002     void* nextAlignedPointer(void* start, size_t alignment) pure
2003     {
2004         return cast(void*)nextMultipleOf(cast(size_t)(start), alignment);
2005     }
2006 
2007     // Returns number of bytes to actually allocate when asking
2008     // for a particular alignment
2009     @nogc size_t requestedSize(size_t askedSize, size_t alignment) pure
2010     {
2011         enum size_t pointerSize = size_t.sizeof;
2012         return askedSize + alignment - 1 + pointerSize * 3;
2013     }
2014 
    // Store the pointer returned by malloc, plus the size and alignment,
    // just before the aligned pointer.
2016     @nogc void* storeRawPointerPlusInfo(void* raw, size_t size, size_t alignment) pure
2017     {
2018         enum size_t pointerSize = size_t.sizeof;
2019         char* start = cast(char*)raw + pointerSize * 3;
2020         void* aligned = nextAlignedPointer(start, alignment);
2021         void** rawLocation = cast(void**)(cast(char*)aligned - pointerSize);
2022         *rawLocation = raw;
2023         size_t* sizeLocation = cast(size_t*)(cast(char*)aligned - 2 * pointerSize);
2024         *sizeLocation = size;
2025         size_t* alignmentLocation = cast(size_t*)(cast(char*)aligned - 3 * pointerSize);
2026         *alignmentLocation = alignment;
2027         assert( isPointerAligned(aligned, alignment) );
2028         return aligned;
2029     }
2030 
2031     // Returns: x, multiple of powerOfTwo, so that x >= n.
2032     @nogc size_t nextMultipleOf(size_t n, size_t powerOfTwo) pure nothrow
2033     {
2034         // check power-of-two
2035         assert( (powerOfTwo != 0) && ((powerOfTwo & (powerOfTwo - 1)) == 0));
2036 
2037         size_t mask = ~(powerOfTwo - 1);
2038         return (n + powerOfTwo - 1) & mask;
2039     }
2040 }
2041 
2042 unittest
2043 {
2044     assert(nextMultipleOf(0, 4) == 0);
2045     assert(nextMultipleOf(1, 4) == 4);
2046     assert(nextMultipleOf(2, 4) == 4);
2047     assert(nextMultipleOf(3, 4) == 4);
2048     assert(nextMultipleOf(4, 4) == 4);
2049     assert(nextMultipleOf(5, 4) == 8);
2050 
2051     {
2052         void* p = _mm_malloc(23, 16);
2053         assert(p !is null);
2054         assert(((cast(size_t)p) & 0xf) == 0);
2055         _mm_free(p);
2056     }
2057 
    void* nullAlloc = _mm_malloc(0, 32);
    assert(nullAlloc !is null);
2060     _mm_free(nullAlloc);
2061 }
2062 
// For some reason, order of declaration is important for this one,
// so it is misplaced.
// Note: this is just another name for `_mm_cvtss_si32`.
2066 alias _mm_cvt_ss2si = _mm_cvtss_si32;