1 /**
2 * SSE intrinsics.
3 * https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#techs=SSE
4 * 
5 * Copyright: Copyright Guillaume Piolat 2016-2020.
6 * License:   $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
7 */
8 module inteli.xmmintrin;
9 
10 public import inteli.types;
11 
12 import inteli.internals;
13 
14 import inteli.mmx;
15 import inteli.emmintrin;
16 
// Define `InlineX86Asm` when either flavour (32-bit or 64-bit) of the
// D inline assembler is available.
version(D_InlineAsm_X86)
    version = InlineX86Asm;
else version(D_InlineAsm_X86_64)
    version = InlineX86Asm;


// SSE1

// Every declaration below is non-throwing and does not use the GC.
nothrow @nogc:


// Bit masks matching the layout of the x86 MXCSR control/status register.

enum int _MM_EXCEPT_INVALID    = 0x0001; /// MXCSR Exception states.
enum int _MM_EXCEPT_DENORM     = 0x0002; ///ditto
enum int _MM_EXCEPT_DIV_ZERO   = 0x0004; ///ditto
enum int _MM_EXCEPT_OVERFLOW   = 0x0008; ///ditto
enum int _MM_EXCEPT_UNDERFLOW  = 0x0010; ///ditto
enum int _MM_EXCEPT_INEXACT    = 0x0020; ///ditto
enum int _MM_EXCEPT_MASK       = 0x003f; /// MXCSR Exception states mask.

enum int _MM_MASK_INVALID      = 0x0080; /// MXCSR Exception masks.
enum int _MM_MASK_DENORM       = 0x0100; ///ditto
enum int _MM_MASK_DIV_ZERO     = 0x0200; ///ditto
enum int _MM_MASK_OVERFLOW     = 0x0400; ///ditto
enum int _MM_MASK_UNDERFLOW    = 0x0800; ///ditto
enum int _MM_MASK_INEXACT      = 0x1000; ///ditto
enum int _MM_MASK_MASK         = 0x1f80; /// MXCSR Exception masks mask.

enum int _MM_ROUND_NEAREST     = 0x0000; /// MXCSR Rounding mode.
enum int _MM_ROUND_DOWN        = 0x2000; ///ditto
enum int _MM_ROUND_UP          = 0x4000; ///ditto
enum int _MM_ROUND_TOWARD_ZERO = 0x6000; ///ditto
enum int _MM_ROUND_MASK        = 0x6000; /// MXCSR Rounding mode mask.

enum int _MM_FLUSH_ZERO_MASK   = 0x8000; /// MXCSR Denormal flush to zero mask.
enum int _MM_FLUSH_ZERO_ON     = 0x8000; /// MXCSR Denormal flush to zero modes.
enum int _MM_FLUSH_ZERO_OFF    = 0x0000; ///ditto
53 
/// Add packed single-precision (32-bit) floating-point elements in `a` and `b`.
__m128 _mm_add_ps(__m128 a, __m128 b) pure @safe
{
    pragma(inline, true); // force inlining even in non-optimized builds
    return a + b;         // vector `+` maps directly to ADDPS on x86 targets
}
unittest
{
    __m128 a = [1, 2, 3, 4];
    a = _mm_add_ps(a, a);
    assert(a.array[0] == 2);
    assert(a.array[1] == 4);
    assert(a.array[2] == 6);
    assert(a.array[3] == 8);
}

/// Add the lower single-precision (32-bit) floating-point element 
/// in `a` and `b`, store the result in the lower element of result, 
/// and copy the upper 3 packed elements from `a` to the upper elements of result.
__m128 _mm_add_ss(__m128 a, __m128 b) pure @safe
{
    static if (GDC_with_SSE)
    {
        // GDC with SSE enabled: use the GCC builtin directly (ADDSS).
        return __builtin_ia32_addss(a, b);
    }
    else static if (DMD_with_DSIMD)
    {
        // DMD with D_SIMD: emit ADDSS through core.simd.
        return cast(__m128) __simd(XMM.ADDSS, a, b);
    }
    else
    {
        // Generic fallback: scalar add on lane 0, lanes 1-3 kept from `a`.
        a[0] += b[0];
        return a;
    }
}
unittest
{
    __m128 a = [1, 2, 3, 4];
    a = _mm_add_ss(a, a);
    assert(a.array == [2.0f, 2, 3, 4]);
}
95 
/// Compute the bitwise AND of packed single-precision (32-bit) floating-point elements in `a` and `b`.
__m128 _mm_and_ps (__m128 a, __m128 b) pure @safe
{
    pragma(inline, true); // force inlining even in non-optimized builds
    // Bitwise ops are only defined on integer vectors, so reinterpret to
    // __m128i, AND, and reinterpret back (no value conversion happens).
    return cast(__m128)(cast(__m128i)a & cast(__m128i)b);
}
unittest
{
    float a = 4.32f;
    float b = -78.99f;
    // Expected result: AND of the raw IEEE-754 bit patterns of a and b.
    int correct = (*cast(int*)(&a)) & (*cast(int*)(&b));
    __m128 A = _mm_set_ps(a, b, a, b);
    __m128 B = _mm_set_ps(b, a, b, a);
    int4 R = cast(int4)( _mm_and_ps(A, B) );
    assert(R.array[0] == correct);
    assert(R.array[1] == correct);
    assert(R.array[2] == correct);
    assert(R.array[3] == correct);
}

/// Compute the bitwise NOT of packed single-precision (32-bit) floating-point elements in `a` and then AND with `b`.
__m128 _mm_andnot_ps (__m128 a, __m128 b) pure @safe
{
    static if (DMD_with_DSIMD)
        return cast(__m128) __simd(XMM.ANDNPS, a, b); // ANDNPS computes (~a) & b
    else
        return cast(__m128)( (~cast(__m128i)a) & cast(__m128i)b );
}
unittest
{
    float a = 4.32f;
    float b = -78.99f;
    // Note the asymmetry: only the first operand is complemented.
    int correct  = ~(*cast(int*)(&a)) &  (*cast(int*)(&b));
    int correct2 =  (*cast(int*)(&a)) & ~(*cast(int*)(&b));
    __m128 A = _mm_set_ps(a, b, a, b);
    __m128 B = _mm_set_ps(b, a, b, a);
    int4 R = cast(int4)( _mm_andnot_ps(A, B) );
    assert(R.array[0] == correct2);
    assert(R.array[1] == correct);
    assert(R.array[2] == correct2);
    assert(R.array[3] == correct);
}
138 
/// Average packed unsigned 16-bit integers in `a` and `b`.
__m64 _mm_avg_pu16 (__m64 a, __m64 b) pure @safe
{
    // Widen to 128-bit, reuse the SSE2 implementation, truncate back to 64-bit.
    return to_m64(_mm_avg_epu16(to_m128i(a), to_m128i(b)));
}

/// Average packed unsigned 8-bit integers in `a` and `b`.
__m64 _mm_avg_pu8 (__m64 a, __m64 b) pure @safe
{
    // Widen to 128-bit, reuse the SSE2 implementation, truncate back to 64-bit.
    return to_m64(_mm_avg_epu8(to_m128i(a), to_m128i(b)));
}
150 
/// Compare packed single-precision (32-bit) floating-point elements in `a` and `b` for equality.
/// Each lane yields all-ones (-1 as int) on match, all-zeroes otherwise; NaN lanes compare false.
__m128 _mm_cmpeq_ps (__m128 a, __m128 b) pure @safe
{
    static if (DMD_with_DSIMD)
        return cast(__m128) __simd(XMM.CMPPS, a, b, 0); // CMPPS predicate 0 = EQ (ordered)
    else
        return cast(__m128) cmpps!(FPComparison.oeq)(a, b); // ordered equal
}
unittest
{
    __m128 A = _mm_setr_ps(1.0f, 2.0f, 3.0f, float.nan);
    __m128 B = _mm_setr_ps(3.0f, 2.0f, float.nan, float.nan);
    __m128i R = cast(__m128i) _mm_cmpeq_ps(A, B);
    int[4] correct = [0, -1, 0, 0];
    assert(R.array == correct);
}

/// Compare the lower single-precision (32-bit) floating-point elements in `a` and `b` for equality, 
/// and copy the upper 3 packed elements from `a` to the upper elements of result.
__m128 _mm_cmpeq_ss (__m128 a, __m128 b) pure @safe
{
    static if (DMD_with_DSIMD)
        return cast(__m128) __simd(XMM.CMPSS, a, b, 0); // CMPSS predicate 0 = EQ (ordered)
    else
        return cast(__m128) cmpss!(FPComparison.oeq)(a, b);
}
unittest
{
    __m128 A = _mm_setr_ps(3.0f, 0, 0, 0);
    __m128 B = _mm_setr_ps(3.0f, float.nan, float.nan, float.nan);
    __m128 C = _mm_setr_ps(2.0f, float.nan, float.nan, float.nan);
    __m128 D = _mm_setr_ps(float.nan, float.nan, float.nan, float.nan);
    __m128 E = _mm_setr_ps(4.0f, float.nan, float.nan, float.nan);
    __m128i R1 = cast(__m128i) _mm_cmpeq_ss(A, B);
    __m128i R2 = cast(__m128i) _mm_cmpeq_ss(A, C);
    __m128i R3 = cast(__m128i) _mm_cmpeq_ss(A, D);
    __m128i R4 = cast(__m128i) _mm_cmpeq_ss(A, E);
    int[4] correct1 = [-1, 0, 0, 0];
    int[4] correct2 = [0, 0, 0, 0];
    int[4] correct3 = [0, 0, 0, 0];
    int[4] correct4 = [0, 0, 0, 0];
    assert(R1.array == correct1 && R2.array == correct2 && R3.array == correct3 && R4.array == correct4);
}
194 
/// Compare packed single-precision (32-bit) floating-point elements in `a` and `b` for greater-than-or-equal.
/// NaN lanes compare false (ordered comparison).
__m128 _mm_cmpge_ps (__m128 a, __m128 b) pure @safe
{
    static if (DMD_with_DSIMD)
        // CMPPS has no GE predicate: compute b <= a (predicate 2 = LE) with
        // the operands swapped, which is equivalent to a >= b.
        return cast(__m128) __simd(XMM.CMPPS, b, a, 2);
    else
        return cast(__m128) cmpps!(FPComparison.oge)(a, b); // ordered greater-equal
}
unittest
{
    __m128 A = _mm_setr_ps(1.0f, 2.0f, 3.0f, float.nan);
    __m128 B = _mm_setr_ps(3.0f, 2.0f, 1.0f, float.nan);
    __m128i R = cast(__m128i) _mm_cmpge_ps(A, B);
    int[4] correct = [0, -1,-1, 0];
    assert(R.array == correct);
}

/// Compare the lower single-precision (32-bit) floating-point elements in `a` and `b` for greater-than-or-equal, 
/// and copy the upper 3 packed elements from `a` to the upper elements of result.
__m128 _mm_cmpge_ss (__m128 a, __m128 b) pure @safe
{
    static if (DMD_with_DSIMD)
    {
        // b <= a (predicate 2 = LE) with swapped operands gives a >= b in lane 0,
        // but lanes 1-3 of `c` then come from `b`; restore them from `a`.
        __m128 c = cast(__m128) __simd(XMM.CMPSS, b, a, 2);
        a[0] = c[0];
        return a;
    }
    else
        return cast(__m128) cmpss!(FPComparison.oge)(a, b);
}
unittest
{
    __m128 A = _mm_setr_ps(3.0f, 0, 0, 0);
    __m128 B = _mm_setr_ps(3.0f, float.nan, float.nan, float.nan);
    __m128 C = _mm_setr_ps(2.0f, float.nan, float.nan, float.nan);
    __m128 D = _mm_setr_ps(float.nan, float.nan, float.nan, float.nan);
    __m128 E = _mm_setr_ps(4.0f, float.nan, float.nan, float.nan);
    __m128i R1 = cast(__m128i) _mm_cmpge_ss(A, B);
    __m128i R2 = cast(__m128i) _mm_cmpge_ss(A, C);
    __m128i R3 = cast(__m128i) _mm_cmpge_ss(A, D);
    __m128i R4 = cast(__m128i) _mm_cmpge_ss(A, E);
    int[4] correct1 = [-1, 0, 0, 0];
    int[4] correct2 = [-1, 0, 0, 0];
    int[4] correct3 = [0, 0, 0, 0];
    int[4] correct4 = [0, 0, 0, 0];
    assert(R1.array == correct1 && R2.array == correct2 && R3.array == correct3 && R4.array == correct4);
}
242 
/// Compare packed single-precision (32-bit) floating-point elements in `a` and `b` for greater-than.
/// NaN lanes compare false (ordered comparison).
__m128 _mm_cmpgt_ps (__m128 a, __m128 b) pure @safe
{
    static if (DMD_with_DSIMD)
        // CMPPS has no GT predicate: compute b < a (predicate 1 = LT) with
        // the operands swapped, which is equivalent to a > b.
        return cast(__m128) __simd(XMM.CMPPS, b, a, 1);
    else
        return cast(__m128) cmpps!(FPComparison.ogt)(a, b); // ordered greater-than
}
unittest
{
    __m128 A = _mm_setr_ps(1.0f, 2.0f, 3.0f, float.nan);
    __m128 B = _mm_setr_ps(3.0f, 2.0f, 1.0f, float.nan);
    __m128i R = cast(__m128i) _mm_cmpgt_ps(A, B);
    int[4] correct = [0, 0,-1, 0];
    assert(R.array == correct);
}

/// Compare the lower single-precision (32-bit) floating-point elements in `a` and `b` for greater-than, 
/// and copy the upper 3 packed elements from `a` to the upper elements of result.
__m128 _mm_cmpgt_ss (__m128 a, __m128 b) pure @safe
{
    static if (DMD_with_DSIMD)
    {
        // b < a (predicate 1 = LT) with swapped operands gives a > b in lane 0,
        // but lanes 1-3 of `c` then come from `b`; restore them from `a`.
        __m128 c = cast(__m128) __simd(XMM.CMPSS, b, a, 1);
        a[0] = c[0];
        return a;
    }
    else
        return cast(__m128) cmpss!(FPComparison.ogt)(a, b);
}
unittest
{
    __m128 A = _mm_setr_ps(3.0f, 0, 0, 0);
    __m128 B = _mm_setr_ps(3.0f, float.nan, float.nan, float.nan);
    __m128 C = _mm_setr_ps(2.0f, float.nan, float.nan, float.nan);
    __m128 D = _mm_setr_ps(float.nan, float.nan, float.nan, float.nan);
    __m128 E = _mm_setr_ps(4.0f, float.nan, float.nan, float.nan);
    __m128i R1 = cast(__m128i) _mm_cmpgt_ss(A, B);
    __m128i R2 = cast(__m128i) _mm_cmpgt_ss(A, C);
    __m128i R3 = cast(__m128i) _mm_cmpgt_ss(A, D);
    __m128i R4 = cast(__m128i) _mm_cmpgt_ss(A, E);
    int[4] correct1 = [0, 0, 0, 0];
    int[4] correct2 = [-1, 0, 0, 0];
    int[4] correct3 = [0, 0, 0, 0];
    int[4] correct4 = [0, 0, 0, 0];
    assert(R1.array == correct1 && R2.array == correct2 && R3.array == correct3 && R4.array == correct4);
}
290 
/// Compare packed single-precision (32-bit) floating-point elements in `a` and `b` for less-than-or-equal.
/// NaN lanes compare false (ordered comparison).
__m128 _mm_cmple_ps (__m128 a, __m128 b) pure @safe
{
    static if (DMD_with_DSIMD)
        return cast(__m128) __simd(XMM.CMPPS, a, b, 2); // CMPPS predicate 2 = LE
    else
        return cast(__m128) cmpps!(FPComparison.ole)(a, b); // ordered less-equal
}
unittest
{
    __m128 A = _mm_setr_ps(1.0f, 2.0f, 3.0f, float.nan);
    __m128 B = _mm_setr_ps(3.0f, 2.0f, 1.0f, float.nan);
    __m128i R = cast(__m128i) _mm_cmple_ps(A, B);
    int[4] correct = [-1, -1, 0, 0];
    assert(R.array == correct);
}

/// Compare the lower single-precision (32-bit) floating-point elements in `a` and `b` for less-than-or-equal, 
/// and copy the upper 3 packed elements from `a` to the upper elements of result.
__m128 _mm_cmple_ss (__m128 a, __m128 b) pure @safe
{
    static if (DMD_with_DSIMD)
        return cast(__m128) __simd(XMM.CMPSS, a, b, 2); // CMPSS predicate 2 = LE
    else
        return cast(__m128) cmpss!(FPComparison.ole)(a, b);
}
unittest
{
    __m128 A = _mm_setr_ps(3.0f, 0, 0, 0);
    __m128 B = _mm_setr_ps(3.0f, float.nan, float.nan, float.nan);
    __m128 C = _mm_setr_ps(2.0f, float.nan, float.nan, float.nan);
    __m128 D = _mm_setr_ps(float.nan, float.nan, float.nan, float.nan);
    __m128 E = _mm_setr_ps(4.0f, float.nan, float.nan, float.nan);
    __m128i R1 = cast(__m128i) _mm_cmple_ss(A, B);
    __m128i R2 = cast(__m128i) _mm_cmple_ss(A, C);
    __m128i R3 = cast(__m128i) _mm_cmple_ss(A, D);
    __m128i R4 = cast(__m128i) _mm_cmple_ss(A, E);
    int[4] correct1 = [-1, 0, 0, 0];
    int[4] correct2 = [0, 0, 0, 0];
    int[4] correct3 = [0, 0, 0, 0];
    int[4] correct4 = [-1, 0, 0, 0];
    assert(R1.array == correct1 && R2.array == correct2 && R3.array == correct3 && R4.array == correct4);
}
334 
/// Compare packed single-precision (32-bit) floating-point elements in `a` and `b` for less-than.
/// NaN lanes compare false (ordered comparison).
__m128 _mm_cmplt_ps (__m128 a, __m128 b) pure @safe
{
    static if (DMD_with_DSIMD)
        return cast(__m128) __simd(XMM.CMPPS, a, b, 1); // CMPPS predicate 1 = LT
    else
        return cast(__m128) cmpps!(FPComparison.olt)(a, b); // ordered less-than
}
unittest
{
    __m128 A = _mm_setr_ps(1.0f, 2.0f, 3.0f, float.nan);
    __m128 B = _mm_setr_ps(3.0f, 2.0f, 1.0f, float.nan);
    __m128i R = cast(__m128i) _mm_cmplt_ps(A, B);
    int[4] correct = [-1, 0, 0, 0];
    assert(R.array == correct);
}

/// Compare the lower single-precision (32-bit) floating-point elements in `a` and `b` for less-than, 
/// and copy the upper 3 packed elements from `a` to the upper elements of result.
__m128 _mm_cmplt_ss (__m128 a, __m128 b) pure @safe
{
    static if (DMD_with_DSIMD)
        return cast(__m128) __simd(XMM.CMPSS, a, b, 1); // CMPSS predicate 1 = LT
    else
        return cast(__m128) cmpss!(FPComparison.olt)(a, b);
}
unittest
{
    __m128 A = _mm_setr_ps(3.0f, 0, 0, 0);
    __m128 B = _mm_setr_ps(3.0f, float.nan, float.nan, float.nan);
    __m128 C = _mm_setr_ps(2.0f, float.nan, float.nan, float.nan);
    __m128 D = _mm_setr_ps(float.nan, float.nan, float.nan, float.nan);
    __m128 E = _mm_setr_ps(4.0f, float.nan, float.nan, float.nan);
    __m128i R1 = cast(__m128i) _mm_cmplt_ss(A, B);
    __m128i R2 = cast(__m128i) _mm_cmplt_ss(A, C);
    __m128i R3 = cast(__m128i) _mm_cmplt_ss(A, D);
    __m128i R4 = cast(__m128i) _mm_cmplt_ss(A, E);
    int[4] correct1 = [0, 0, 0, 0];
    int[4] correct2 = [0, 0, 0, 0];
    int[4] correct3 = [0, 0, 0, 0];
    int[4] correct4 = [-1, 0, 0, 0];
    assert(R1.array == correct1 && R2.array == correct2 && R3.array == correct3 && R4.array == correct4);
}
378 
/// Compare packed single-precision (32-bit) floating-point elements in `a` and `b` for not-equal.
/// This is an unordered comparison: NaN lanes compare true (all-ones).
__m128 _mm_cmpneq_ps (__m128 a, __m128 b) pure @safe
{
    static if (DMD_with_DSIMD)
        return cast(__m128) __simd(XMM.CMPPS, a, b, 4); // CMPPS predicate 4 = NEQ (unordered)
    else
        return cast(__m128) cmpps!(FPComparison.une)(a, b); // unordered-or-not-equal
}
unittest
{
    __m128 A = _mm_setr_ps(1.0f, 2.0f, 3.0f, float.nan);
    __m128 B = _mm_setr_ps(3.0f, 2.0f, 1.0f, float.nan);
    __m128i R = cast(__m128i) _mm_cmpneq_ps(A, B);
    int[4] correct = [-1, 0, -1, -1];
    assert(R.array == correct);
}

/// Compare the lower single-precision (32-bit) floating-point elements in `a` and `b` for not-equal, 
/// and copy the upper 3 packed elements from `a` to the upper elements of result.
__m128 _mm_cmpneq_ss (__m128 a, __m128 b) pure @safe
{
    static if (DMD_with_DSIMD)
        return cast(__m128) __simd(XMM.CMPSS, a, b, 4); // CMPSS predicate 4 = NEQ (unordered)
    else
        return cast(__m128) cmpss!(FPComparison.une)(a, b);
}
unittest
{
    __m128 A = _mm_setr_ps(3.0f, 0, 0, 0);
    __m128 B = _mm_setr_ps(3.0f, float.nan, float.nan, float.nan);
    __m128 C = _mm_setr_ps(2.0f, float.nan, float.nan, float.nan);
    __m128 D = _mm_setr_ps(float.nan, float.nan, float.nan, float.nan);
    __m128 E = _mm_setr_ps(4.0f, float.nan, float.nan, float.nan);
    __m128i R1 = cast(__m128i) _mm_cmpneq_ss(A, B);
    __m128i R2 = cast(__m128i) _mm_cmpneq_ss(A, C);
    __m128i R3 = cast(__m128i) _mm_cmpneq_ss(A, D);
    __m128i R4 = cast(__m128i) _mm_cmpneq_ss(A, E);
    int[4] correct1 = [0, 0, 0, 0];
    int[4] correct2 = [-1, 0, 0, 0];
    int[4] correct3 = [-1, 0, 0, 0];
    int[4] correct4 = [-1, 0, 0, 0];
    assert(R1.array == correct1 && R2.array == correct2 && R3.array == correct3 && R4.array == correct4);
}
422 
/// Compare packed single-precision (32-bit) floating-point elements in `a` and `b` for not-greater-than-or-equal.
/// NaN lanes compare true (unordered comparison).
__m128 _mm_cmpnge_ps (__m128 a, __m128 b) pure @safe
{
    static if (DMD_with_DSIMD)
        // b NLE a (predicate 6) with swapped operands == !(a >= b) == a NGE b.
        return cast(__m128) __simd(XMM.CMPPS, b, a, 6);
    else
        return cast(__m128) cmpps!(FPComparison.ult)(a, b); // unordered-or-less-than
}
unittest
{
    __m128 A = _mm_setr_ps(1.0f, 2.0f, 3.0f, float.nan);
    __m128 B = _mm_setr_ps(3.0f, 2.0f, 1.0f, float.nan);
    __m128i R = cast(__m128i) _mm_cmpnge_ps(A, B);
    int[4] correct = [-1, 0, 0, -1];
    assert(R.array == correct);
}

/// Compare the lower single-precision (32-bit) floating-point elements in `a` and `b` for not-greater-than-or-equal, 
/// and copy the upper 3 packed elements from `a` to the upper elements of result.
__m128 _mm_cmpnge_ss (__m128 a, __m128 b) pure @safe
{
    static if (DMD_with_DSIMD)
    {
        // Swapped-operand NLE gives NGE in lane 0, but lanes 1-3 of `c` then
        // come from `b`; restore them from `a`.
        __m128 c = cast(__m128) __simd(XMM.CMPSS, b, a, 6);
        a[0] = c[0];
        return a;
    }
    else
        return cast(__m128) cmpss!(FPComparison.ult)(a, b);
}
unittest
{
    __m128 A = _mm_setr_ps(3.0f, 0, 0, 0);
    __m128 B = _mm_setr_ps(3.0f, float.nan, float.nan, float.nan);
    __m128 C = _mm_setr_ps(2.0f, float.nan, float.nan, float.nan);
    __m128 D = _mm_setr_ps(float.nan, float.nan, float.nan, float.nan);
    __m128 E = _mm_setr_ps(4.0f, float.nan, float.nan, float.nan);
    __m128i R1 = cast(__m128i) _mm_cmpnge_ss(A, B);
    __m128i R2 = cast(__m128i) _mm_cmpnge_ss(A, C);
    __m128i R3 = cast(__m128i) _mm_cmpnge_ss(A, D);
    __m128i R4 = cast(__m128i) _mm_cmpnge_ss(A, E);
    int[4] correct1 = [0, 0, 0, 0];
    int[4] correct2 = [0, 0, 0, 0];
    int[4] correct3 = [-1, 0, 0, 0];
    int[4] correct4 = [-1, 0, 0, 0];
    assert(R1.array == correct1 && R2.array == correct2 && R3.array == correct3 && R4.array == correct4);
}
470 
/// Compare packed single-precision (32-bit) floating-point elements in `a` and `b` for not-greater-than.
/// NaN lanes compare true (unordered comparison).
__m128 _mm_cmpngt_ps (__m128 a, __m128 b) pure @safe
{
    static if (DMD_with_DSIMD)
        // b NLT a (predicate 5) with swapped operands == !(a > b) == a NGT b.
        return cast(__m128) __simd(XMM.CMPPS, b, a, 5);
    else
        return cast(__m128) cmpps!(FPComparison.ule)(a, b); // unordered-or-less-equal
}
unittest
{
    __m128 A = _mm_setr_ps(1.0f, 2.0f, 3.0f, float.nan);
    __m128 B = _mm_setr_ps(3.0f, 2.0f, 1.0f, float.nan);
    __m128i R = cast(__m128i) _mm_cmpngt_ps(A, B);
    int[4] correct = [-1, -1, 0, -1];
    assert(R.array == correct);
}

/// Compare the lower single-precision (32-bit) floating-point elements in `a` and `b` for not-greater-than, 
/// and copy the upper 3 packed elements from `a` to the upper elements of result.
__m128 _mm_cmpngt_ss (__m128 a, __m128 b) pure @safe
{
    static if (DMD_with_DSIMD)
    {
        // Swapped-operand NLT gives NGT in lane 0, but lanes 1-3 of `c` then
        // come from `b`; restore them from `a`.
        __m128 c = cast(__m128) __simd(XMM.CMPSS, b, a, 5);
        a[0] = c[0];
        return a;
    }
    else
        return cast(__m128) cmpss!(FPComparison.ule)(a, b);
}
unittest
{
    __m128 A = _mm_setr_ps(3.0f, 0, 0, 0);
    __m128 B = _mm_setr_ps(3.0f, float.nan, float.nan, float.nan);
    __m128 C = _mm_setr_ps(2.0f, float.nan, float.nan, float.nan);
    __m128 D = _mm_setr_ps(float.nan, float.nan, float.nan, float.nan);
    __m128 E = _mm_setr_ps(4.0f, float.nan, float.nan, float.nan);
    __m128i R1 = cast(__m128i) _mm_cmpngt_ss(A, B);
    __m128i R2 = cast(__m128i) _mm_cmpngt_ss(A, C);
    __m128i R3 = cast(__m128i) _mm_cmpngt_ss(A, D);
    __m128i R4 = cast(__m128i) _mm_cmpngt_ss(A, E);
    int[4] correct1 = [-1, 0, 0, 0];
    int[4] correct2 = [0, 0, 0, 0];
    int[4] correct3 = [-1, 0, 0, 0];
    int[4] correct4 = [-1, 0, 0, 0];
    assert(R1.array == correct1 && R2.array == correct2 && R3.array == correct3 && R4.array == correct4);
}
518 
/// Compare packed single-precision (32-bit) floating-point elements in `a` and `b` for not-less-than-or-equal.
/// NaN lanes compare true (unordered comparison).
__m128 _mm_cmpnle_ps (__m128 a, __m128 b) pure @safe
{
    static if (DMD_with_DSIMD)
        return cast(__m128) __simd(XMM.CMPPS, a, b, 6); // CMPPS predicate 6 = NLE
    else
        return cast(__m128) cmpps!(FPComparison.ugt)(a, b); // unordered-or-greater-than
}
unittest
{
    __m128 A = _mm_setr_ps(1.0f, 2.0f, 3.0f, float.nan);
    __m128 B = _mm_setr_ps(3.0f, 2.0f, 1.0f, float.nan);
    __m128i R = cast(__m128i) _mm_cmpnle_ps(A, B);
    int[4] correct = [0, 0, -1, -1];
    assert(R.array == correct);
}


/// Compare the lower single-precision (32-bit) floating-point elements in `a` and `b` for not-less-than-or-equal, 
/// and copy the upper 3 packed elements from `a` to the upper elements of result.
__m128 _mm_cmpnle_ss (__m128 a, __m128 b) pure @safe
{
    static if (DMD_with_DSIMD)
        return cast(__m128) __simd(XMM.CMPSS, a, b, 6); // CMPSS predicate 6 = NLE
    else
        return cast(__m128) cmpss!(FPComparison.ugt)(a, b);
}
unittest
{
    __m128 A = _mm_setr_ps(3.0f, 0, 0, 0);
    __m128 B = _mm_setr_ps(3.0f, float.nan, float.nan, float.nan);
    __m128 C = _mm_setr_ps(2.0f, float.nan, float.nan, float.nan);
    __m128 D = _mm_setr_ps(float.nan, float.nan, float.nan, float.nan);
    __m128 E = _mm_setr_ps(4.0f, float.nan, float.nan, float.nan);
    __m128i R1 = cast(__m128i) _mm_cmpnle_ss(A, B);
    __m128i R2 = cast(__m128i) _mm_cmpnle_ss(A, C);
    __m128i R3 = cast(__m128i) _mm_cmpnle_ss(A, D);
    __m128i R4 = cast(__m128i) _mm_cmpnle_ss(A, E);
    int[4] correct1 = [0, 0, 0, 0];
    int[4] correct2 = [-1, 0, 0, 0];
    int[4] correct3 = [-1, 0, 0, 0];
    int[4] correct4 = [0, 0, 0, 0];
    assert(R1.array == correct1 && R2.array == correct2 && R3.array == correct3 && R4.array == correct4);
}
563 
/// Compare packed single-precision (32-bit) floating-point elements in `a` and `b` for not-less-than.
/// NaN lanes compare true (unordered comparison).
__m128 _mm_cmpnlt_ps (__m128 a, __m128 b) pure @safe
{
    static if (DMD_with_DSIMD)
        return cast(__m128) __simd(XMM.CMPPS, a, b, 5); // CMPPS predicate 5 = NLT
    else
        return cast(__m128) cmpps!(FPComparison.uge)(a, b); // unordered-or-greater-equal
}
unittest
{
    __m128 A = _mm_setr_ps(1.0f, 2.0f, 3.0f, float.nan);
    __m128 B = _mm_setr_ps(3.0f, 2.0f, 1.0f, float.nan);
    __m128i R = cast(__m128i) _mm_cmpnlt_ps(A, B);
    int[4] correct = [0, -1, -1, -1];
    assert(R.array == correct);
}

/// Compare the lower single-precision (32-bit) floating-point elements in `a` and `b` for not-less-than, 
/// and copy the upper 3 packed elements from `a` to the upper elements of result.
__m128 _mm_cmpnlt_ss (__m128 a, __m128 b) pure @safe
{
    static if (DMD_with_DSIMD)
        return cast(__m128) __simd(XMM.CMPSS, a, b, 5); // CMPSS predicate 5 = NLT
    else
        return cast(__m128) cmpss!(FPComparison.uge)(a, b);
}
unittest
{
    __m128 A = _mm_setr_ps(3.0f, 0, 0, 0);
    __m128 B = _mm_setr_ps(3.0f, float.nan, float.nan, float.nan);
    __m128 C = _mm_setr_ps(2.0f, float.nan, float.nan, float.nan);
    __m128 D = _mm_setr_ps(float.nan, float.nan, float.nan, float.nan);
    __m128 E = _mm_setr_ps(4.0f, float.nan, float.nan, float.nan);
    __m128i R1 = cast(__m128i) _mm_cmpnlt_ss(A, B);
    __m128i R2 = cast(__m128i) _mm_cmpnlt_ss(A, C);
    __m128i R3 = cast(__m128i) _mm_cmpnlt_ss(A, D);
    __m128i R4 = cast(__m128i) _mm_cmpnlt_ss(A, E);
    int[4] correct1 = [-1, 0, 0, 0];
    int[4] correct2 = [-1, 0, 0, 0];
    int[4] correct3 = [-1, 0, 0, 0];
    int[4] correct4 = [0, 0, 0, 0];
    assert(R1.array == correct1 && R2.array == correct2 && R3.array == correct3 && R4.array == correct4);
}
607 
/// Compare packed single-precision (32-bit) floating-point elements in `a` and `b` to see if neither is NaN.
/// A lane yields all-ones when both inputs are ordered (non-NaN), all-zeroes otherwise.
__m128 _mm_cmpord_ps (__m128 a, __m128 b) pure @safe
{
    static if (DMD_with_DSIMD)
        return cast(__m128) __simd(XMM.CMPPS, a, b, 7); // CMPPS predicate 7 = ORD
    else
        return cast(__m128) cmpps!(FPComparison.ord)(a, b); // ordered
}
unittest
{
    __m128 A = _mm_setr_ps(1.0f, 2.0f, 3.0f, float.nan);
    __m128 B = _mm_setr_ps(3.0f, 2.0f, 1.0f, float.nan);
    __m128i R = cast(__m128i) _mm_cmpord_ps(A, B);
    int[4] correct = [-1, -1, -1, 0];
    assert(R.array == correct);
}

/// Compare the lower single-precision (32-bit) floating-point elements in `a` and `b` to see if neither is NaN, 
/// and copy the upper 3 packed elements from `a` to the upper elements of result.
__m128 _mm_cmpord_ss (__m128 a, __m128 b) pure @safe
{
    static if (DMD_with_DSIMD)
        return cast(__m128) __simd(XMM.CMPSS, a, b, 7); // CMPSS predicate 7 = ORD
    else
        return cast(__m128) cmpss!(FPComparison.ord)(a, b);
}
unittest
{
    __m128 A = _mm_setr_ps(3.0f, 0, 0, 0);
    __m128 B = _mm_setr_ps(3.0f, float.nan, float.nan, float.nan);
    __m128 C = _mm_setr_ps(2.0f, float.nan, float.nan, float.nan);
    __m128 D = _mm_setr_ps(float.nan, float.nan, float.nan, float.nan);
    __m128 E = _mm_setr_ps(4.0f, float.nan, float.nan, float.nan);
    __m128i R1 = cast(__m128i) _mm_cmpord_ss(A, B);
    __m128i R2 = cast(__m128i) _mm_cmpord_ss(A, C);
    __m128i R3 = cast(__m128i) _mm_cmpord_ss(A, D);
    __m128i R4 = cast(__m128i) _mm_cmpord_ss(A, E);
    int[4] correct1 = [-1, 0, 0, 0];
    int[4] correct2 = [-1, 0, 0, 0];
    int[4] correct3 = [0, 0, 0, 0];
    int[4] correct4 = [-1, 0, 0, 0];
    assert(R1.array == correct1 && R2.array == correct2 && R3.array == correct3 && R4.array == correct4);
}
651 
/// Compare packed single-precision (32-bit) floating-point elements in `a` and `b` to see if either is NaN.
/// A lane yields all-ones when at least one input is NaN, all-zeroes otherwise.
__m128 _mm_cmpunord_ps (__m128 a, __m128 b) pure @safe
{
    static if (DMD_with_DSIMD)
        return cast(__m128) __simd(XMM.CMPPS, a, b, 3); // CMPPS predicate 3 = UNORD
    else
        return cast(__m128) cmpps!(FPComparison.uno)(a, b); // unordered
}
unittest
{
    __m128 A = _mm_setr_ps(1.0f, 2.0f, 3.0f, float.nan);
    __m128 B = _mm_setr_ps(3.0f, 2.0f, 1.0f, float.nan);
    __m128i R = cast(__m128i) _mm_cmpunord_ps(A, B);
    int[4] correct = [0, 0, 0, -1];
    assert(R.array == correct);
}
668 
/// Compare the lower single-precision (32-bit) floating-point elements in `a` and `b` to see if either is NaN,
/// and copy the upper 3 packed elements from `a` to the upper elements of result.
__m128 _mm_cmpunord_ss (__m128 a, __m128 b) pure @safe
{
    static if (DMD_with_DSIMD)
    {
        // CMPSS predicate 3 = UNORD: lane 0 is all-ones if either input is NaN.
        return cast(__m128) __simd(XMM.CMPSS, a, b, 3);
    }
    else
    {
        return cast(__m128) cmpss!(FPComparison.uno)(a, b);
    }
}
unittest
{
    __m128 A = _mm_setr_ps(3.0f, 0, 0, 0);
    __m128 B = _mm_setr_ps(3.0f, float.nan, float.nan, float.nan);
    __m128 C = _mm_setr_ps(2.0f, float.nan, float.nan, float.nan);
    __m128 D = _mm_setr_ps(float.nan, float.nan, float.nan, float.nan);
    __m128 E = _mm_setr_ps(4.0f, float.nan, float.nan, float.nan);
    __m128i R1 = cast(__m128i) _mm_cmpunord_ss(A, B);
    __m128i R2 = cast(__m128i) _mm_cmpunord_ss(A, C);
    __m128i R3 = cast(__m128i) _mm_cmpunord_ss(A, D);
    __m128i R4 = cast(__m128i) _mm_cmpunord_ss(A, E);
    int[4] correct1 = [0, 0, 0, 0];
    int[4] correct2 = [0, 0, 0, 0];
    int[4] correct3 = [-1, 0, 0, 0];
    int[4] correct4 = [0, 0, 0, 0];
    assert(R1.array == correct1);
    assert(R2.array == correct2);
    assert(R3.array == correct3);
    assert(R4.array == correct4);
}
694 
695 
/// Compare the lower single-precision (32-bit) floating-point element in `a` and `b` for equality, 
/// and return the boolean result (0 or 1).
int _mm_comieq_ss (__m128 a, __m128 b) pure @safe
{
    // A NaN operand compares not-equal, so the result is then 0.
    return (a.array[0] == b.array[0]) ? 1 : 0;
}
unittest
{
    __m128 x    = _mm_set_ss(78.0f);
    __m128 negx = _mm_set_ss(-78.0f);
    __m128 qnan = _mm_set_ss(float.nan);
    assert(_mm_comieq_ss(x, x)    == 1);
    assert(_mm_comieq_ss(x, negx) == 0);
    assert(_mm_comieq_ss(x, qnan) == 0);
    assert(_mm_comieq_ss(qnan, _mm_set_ss(-4.22f)) == 0);
    assert(_mm_comieq_ss(_mm_set_ss(0.0), _mm_set_ss(-0.0)) == 1); // +0.0 equals -0.0
}
710 
/// Compare the lower single-precision (32-bit) floating-point element in `a` and `b` for greater-than-or-equal, 
/// and return the boolean result (0 or 1).
int _mm_comige_ss (__m128 a, __m128 b) pure @safe
{
    // A NaN operand makes the ordered comparison false, so the result is then 0.
    return (a.array[0] >= b.array[0]) ? 1 : 0;
}
unittest
{
    __m128 x    = _mm_set_ss(78.0f);
    __m128 negx = _mm_set_ss(-78.0f);
    __m128 qnan = _mm_set_ss(float.nan);
    assert(_mm_comige_ss(x, x)    == 1);
    assert(_mm_comige_ss(x, negx) == 1);
    assert(_mm_comige_ss(negx, x) == 0);
    assert(_mm_comige_ss(x, qnan) == 0);
    assert(_mm_comige_ss(qnan, _mm_set_ss(-4.22f)) == 0);
    assert(_mm_comige_ss(_mm_set_ss(-0.0f), _mm_set_ss(0.0f)) == 1); // -0.0 >= +0.0
}
726 
/// Compare the lower single-precision (32-bit) floating-point element in `a` and `b` for greater-than, 
/// and return the boolean result (0 or 1).
int _mm_comigt_ss (__m128 a, __m128 b) pure @safe // comiss + seta
{
    // A NaN operand makes the ordered comparison false, so the result is then 0.
    return (a.array[0] > b.array[0]) ? 1 : 0;
}
unittest
{
    __m128 x    = _mm_set_ss(78.0f);
    __m128 negx = _mm_set_ss(-78.0f);
    __m128 qnan = _mm_set_ss(float.nan);
    assert(_mm_comigt_ss(x, x)    == 0);
    assert(_mm_comigt_ss(x, negx) == 1);
    assert(_mm_comigt_ss(x, qnan) == 0);
    assert(_mm_comigt_ss(qnan, _mm_set_ss(-4.22f)) == 0);
    assert(_mm_comigt_ss(_mm_set_ss(0.0f), _mm_set_ss(-0.0f)) == 0); // +0.0 not > -0.0
}
741 
/// Compare the lower single-precision (32-bit) floating-point element in `a` and `b` for less-than-or-equal, 
/// and return the boolean result (0 or 1).
int _mm_comile_ss (__m128 a, __m128 b) pure @safe // comiss + setbe
{
    // A NaN operand makes the ordered comparison false, so the result is then 0.
    return (a.array[0] <= b.array[0]) ? 1 : 0;
}
unittest
{
    __m128 x    = _mm_set_ss(78.0f);
    __m128 negx = _mm_set_ss(-78.0f);
    __m128 qnan = _mm_set_ss(float.nan);
    assert(_mm_comile_ss(x, x)    == 1);
    assert(_mm_comile_ss(x, negx) == 0);
    assert(_mm_comile_ss(negx, x) == 1);
    assert(_mm_comile_ss(x, qnan) == 0);
    assert(_mm_comile_ss(qnan, _mm_set_ss(-4.22f)) == 0);
    assert(_mm_comile_ss(_mm_set_ss(0.0f), _mm_set_ss(-0.0f)) == 1); // +0.0 <= -0.0
}
757 
/// Compare the lower single-precision (32-bit) floating-point element in `a` and `b` for less-than, 
/// and return the boolean result (0 or 1).
int _mm_comilt_ss (__m128 a, __m128 b) pure @safe // comiss + setb
{
    // A NaN operand makes the ordered comparison false, so the result is then 0.
    return (a.array[0] < b.array[0]) ? 1 : 0;
}
unittest
{
    __m128 x    = _mm_set_ss(78.0f);
    __m128 negx = _mm_set_ss(-78.0f);
    __m128 qnan = _mm_set_ss(float.nan);
    assert(_mm_comilt_ss(x, x)    == 0);
    assert(_mm_comilt_ss(x, negx) == 0);
    assert(_mm_comilt_ss(negx, x) == 1);
    assert(_mm_comilt_ss(x, qnan) == 0);
    assert(_mm_comilt_ss(qnan, _mm_set_ss(-4.22f)) == 0);
    assert(_mm_comilt_ss(_mm_set_ss(-0.0f), _mm_set_ss(0.0f)) == 0); // -0.0 not < +0.0
}
773 
/// Compare the lower single-precision (32-bit) floating-point element in `a` and `b` for not-equal, 
/// and return the boolean result (0 or 1).
int _mm_comineq_ss (__m128 a, __m128 b) pure @safe // comiss + setne
{
    // Unordered inequality: yields 1 whenever either operand is NaN.
    return (a.array[0] != b.array[0]) ? 1 : 0;
}
unittest
{
    assert(_mm_comineq_ss(_mm_set_ss(78.0f), _mm_set_ss(78.0f)) == 0);
    assert(_mm_comineq_ss(_mm_set_ss(78.0f), _mm_set_ss(-78.0f)) == 1);
    assert(_mm_comineq_ss(_mm_set_ss(78.0f), _mm_set_ss(float.nan)) == 1);
    assert(_mm_comineq_ss(_mm_set_ss(float.nan), _mm_set_ss(-4.22f)) == 1);
    assert(_mm_comineq_ss(_mm_set_ss(0.0f), _mm_set_ss(-0.0f)) == 0);
}
788 
/// Convert packed signed 32-bit integers in `b` to packed single-precision (32-bit) 
/// floating-point elements, store the results in the lower 2 elements, 
/// and copy the upper 2 packed elements from `a` to the upper elements of result.
alias _mm_cvt_pi2ps = _mm_cvtpi32_ps;

/// Convert 2 lower packed single-precision (32-bit) floating-point elements in `a` 
/// to packed 32-bit integers.
__m64 _mm_cvt_ps2pi (__m128 a) @safe
{
    // Converts all 4 lanes through the 128-bit path, then keeps only the low 64 bits.
    // NOTE(review): not marked `pure`, presumably because rounding follows the
    // current MXCSR rounding mode inside _mm_cvtps_epi32 — confirm.
    return to_m64(_mm_cvtps_epi32(a));
}
800 
/// Convert the signed 32-bit integer `x` to a single-precision (32-bit) floating-point element, 
/// store the result in the lower element, and copy the upper 3 packed elements from `v` to the 
/// upper elements of the result.
__m128 _mm_cvt_si2ss (__m128 v, int x) pure @trusted
{
    __m128 r = v;            // upper 3 lanes are taken unchanged from `v`
    r.ptr[0] = cast(float)x; // lane 0 receives the converted integer
    return r;
}
unittest
{
    __m128 res = _mm_cvt_si2ss(_mm_set1_ps(0.0f), 42);
    float[4] expected = [42.0f, 0.0f, 0.0f, 0.0f];
    assert(res.array == expected);
}
814 
/// Convert packed 16-bit integers in `a` to packed single-precision (32-bit) floating-point elements.
__m128 _mm_cvtpi16_ps (__m64 a) pure @safe
{
    __m128i ma = to_m128i(a);
    ma = _mm_unpacklo_epi16(ma, _mm_setzero_si128()); // Zero-extend each short to 32-bit
    ma = _mm_srai_epi32(_mm_slli_epi32(ma, 16), 16); // Shift left then arithmetic-shift right: replicates each sign bit, i.e. sign-extends
    return _mm_cvtepi32_ps(ma);
}
unittest
{
    __m64 A = _mm_setr_pi16(-1, 2, -3, 4);
    __m128 R = _mm_cvtpi16_ps(A);
    float[4] correct = [-1.0f, 2.0f, -3.0f, 4.0f];
    assert(R.array == correct);
}
830 
/// Convert packed signed 32-bit integers in `b` to packed single-precision (32-bit) 
/// floating-point elements, store the results in the lower 2 elements, 
/// and copy the upper 2 packed elements from `a` to the upper elements of result.
__m128 _mm_cvtpi32_ps (__m128 a, __m64 b) pure @trusted
{
    // Widen `b` to 128 bits and convert all 4 lanes; only the low 2 results are kept.
    __m128 lo = _mm_cvtepi32_ps(to_m128i(b));
    a.ptr[0] = lo.array[0];
    a.ptr[1] = lo.array[1];
    return a;
}
unittest
{
    __m128 res = _mm_cvtpi32_ps(_mm_set1_ps(4.0f), _mm_setr_pi32(1, 2));
    float[4] expected = [1.0f, 2.0f, 4.0f, 4.0f];
    assert(res.array == expected);
}
847 
/// Convert packed signed 32-bit integers in `a` to packed single-precision (32-bit) floating-point elements, 
/// store the results in the lower 2 elements, then convert the packed signed 32-bit integers in `b` to 
/// single-precision (32-bit) floating-point element, and store the results in the upper 2 elements.
__m128 _mm_cvtpi32x2_ps (__m64 a, __m64 b) pure @trusted
{
    long2 l;
    l.ptr[0] = a.array[0]; // low 64 bits: both 32-bit integers of `a`
    l.ptr[1] = b.array[0]; // high 64 bits: both 32-bit integers of `b`
    return _mm_cvtepi32_ps(cast(__m128i)l);
}
unittest
{
    __m64 A = _mm_setr_pi32(-45, 128);
    __m64 B = _mm_setr_pi32(0, 1000);
    __m128 R = _mm_cvtpi32x2_ps(A, B);
    float[4] correct = [-45.0f, 128.0f, 0.0f, 1000.0f];
    assert(R.array == correct);
}
866 
/// Convert the lower packed 8-bit integers in `a` to packed single-precision (32-bit) floating-point elements.
__m128 _mm_cvtpi8_ps (__m64 a) pure @safe
{
    __m128i b = to_m128i(a); 

    // Zero extend each byte to 32-bit
    b = _mm_unpacklo_epi8(b, _mm_setzero_si128());
    b = _mm_unpacklo_epi16(b, _mm_setzero_si128());

    // Shift left then arithmetic-shift right replicates each byte's sign bit (sign extension)
    b = _mm_srai_epi32(_mm_slli_epi32(b, 24), 24);
    return _mm_cvtepi32_ps(b);
}
unittest
{
    __m64 A = _mm_setr_pi8(-1, 2, -3, 4, 0, 0, 0, 0);
    __m128 R = _mm_cvtpi8_ps(A);
    float[4] correct = [-1.0f, 2.0f, -3.0f, 4.0f];
    assert(R.array == correct);
}
887 
/// Convert packed single-precision (32-bit) floating-point elements in `a` to packed 16-bit integers.
/// Note: this intrinsic will generate 0x7FFF, rather than 0x8000, for input values between 0x7FFF and 0x7FFFFFFF.
__m64 _mm_cvtps_pi16 (__m128 a) @safe
{
    // The C++ version of this intrinsic converts to 32-bit integers, then uses packssdw,
    // which means the 16-bit results are saturated.
    __m128i b = _mm_cvtps_epi32(a);
    b = _mm_packs_epi32(b, b);
    return to_m64(b);
}
unittest
{
    __m128 A = _mm_setr_ps(-1.0f, 2.0f, -33000.0f, 70000.0f);
    short4 R = cast(short4) _mm_cvtps_pi16(A);
    short[4] correct = [-1, 2, -32768, 32767];
    assert(R.array == correct);
}
905 
/// Convert packed single-precision (32-bit) floating-point elements in `a` to packed 32-bit integers.
__m64 _mm_cvtps_pi32 (__m128 a) @safe
{
    // Convert all 4 lanes, then keep only the low 2.
    __m128i converted = _mm_cvtps_epi32(a);
    return to_m64(converted);
}
unittest
{
    __m128 input = _mm_setr_ps(-33000.0f, 70000.0f, -1.0f, 2.0f);
    int2 res = cast(int2) _mm_cvtps_pi32(input);
    int[2] expected = [-33000, 70000];
    assert(res.array == expected);
}
918 
/// Convert packed single-precision (32-bit) floating-point elements in `a` to packed 8-bit integers, 
/// and store the results in lower 4 elements. 
/// Note: this intrinsic will generate 0x7F, rather than 0x80, for input values between 0x7F and 0x7FFFFFFF.
__m64 _mm_cvtps_pi8 (__m128 a) @safe
{
    // The C++ version of this intrinsic converts to 32-bit integers, then uses packssdw + packsswb,
    // which means the 8-bit results are saturated.
    __m128i b = _mm_cvtps_epi32(a);
    b = _mm_packs_epi32(b, _mm_setzero_si128());
    b = _mm_packs_epi16(b, _mm_setzero_si128());
    return to_m64(b);
}
unittest
{
    __m128 A = _mm_setr_ps(-1.0f, 2.0f, -129.0f, 128.0f);
    byte8 R = cast(byte8) _mm_cvtps_pi8(A);
    byte[8] correct = [-1, 2, -128, 127, 0, 0, 0, 0];
    assert(R.array == correct);
}
938 
/// Convert packed unsigned 16-bit integers in `a` to packed single-precision (32-bit) floating-point elements.
__m128 _mm_cvtpu16_ps (__m64 a) pure @safe
{
    // Interleaving with zeroes zero-extends each ushort to a 32-bit integer.
    __m128i widened = _mm_unpacklo_epi16(to_m128i(a), _mm_setzero_si128());
    return _mm_cvtepi32_ps(widened);
}
unittest
{
    __m64 input = _mm_setr_pi16(-1, 2, -3, 4);
    __m128 res = _mm_cvtpu16_ps(input);
    float[4] expected = [65535.0f, 2.0f, 65533.0f, 4.0f];
    assert(res.array == expected);
}
953 
/// Convert the lower packed unsigned 8-bit integers in `a` to packed single-precision (32-bit) floating-point element.
__m128 _mm_cvtpu8_ps (__m64 a) pure @safe
{
    __m128i zero = _mm_setzero_si128();
    __m128i widened = _mm_unpacklo_epi8(to_m128i(a), zero); // u8  -> u16 (zero-extend)
    widened = _mm_unpacklo_epi16(widened, zero);            // u16 -> u32 (zero-extend)
    return _mm_cvtepi32_ps(widened);
}
unittest
{
    __m64 input = _mm_setr_pi8(-1, 2, -3, 4, 0, 0, 0, 0);
    __m128 res = _mm_cvtpu8_ps(input);
    float[4] expected = [255.0f, 2.0f, 253.0f, 4.0f];
    assert(res.array == expected);
}
971 
/// Convert the signed 32-bit integer `x` to a single-precision (32-bit) floating-point element, 
/// store the result in the lower element, and copy the upper 3 packed elements from `v` to the 
/// upper elements of result.
__m128 _mm_cvtsi32_ss(__m128 v, int x) pure @trusted
{
    __m128 r = v;            // preserve upper 3 lanes
    r.ptr[0] = cast(float)x; // converted integer in lane 0
    return r;
}
unittest
{
    __m128 res = _mm_cvtsi32_ss(_mm_set1_ps(0.0f), 42);
    float[4] expected = [42.0f, 0.0f, 0.0f, 0.0f];
    assert(res.array == expected);
}
985 
986 
/// Convert the signed 64-bit integer `x` to a single-precision (32-bit) floating-point element, 
/// store the result in the lower element, and copy the upper 3 packed elements from `v` to the 
/// upper elements of result.
__m128 _mm_cvtsi64_ss(__m128 v, long x) pure @trusted
{
    __m128 r = v;            // preserve upper 3 lanes
    r.ptr[0] = cast(float)x; // converted integer in lane 0 (may lose precision for large |x|)
    return r;
}
unittest
{
    __m128 res = _mm_cvtsi64_ss(_mm_set1_ps(0.0f), 42);
    float[4] expected = [42.0f, 0.0f, 0.0f, 0.0f];
    assert(res.array == expected);
}
1000 
/// Take the lower single-precision (32-bit) floating-point element of `a`.
float _mm_cvtss_f32(__m128 a) pure @safe
{
    float lowest = a.array[0];
    return lowest;
}
1006 
/// Convert the lower single-precision (32-bit) floating-point element in `a` to a 32-bit integer.
/// Rounding follows the current MXCSR rounding mode (emulated on the fallback path).
int _mm_cvtss_si32 (__m128 a) @safe // PERF GDC
{
    static if (GDC_with_SSE)
    {
        return __builtin_ia32_cvtss2si(a);
    }
    else static if (LDC_with_SSE)
    {
        return __builtin_ia32_cvtss2si(a);
    }
    else static if (DMD_with_DSIMD)
    {
        // Fix: removed an unused local `__m128 b;` that was never read or written.
        __m128i r = cast(__m128i) __simd(XMM.CVTPS2DQ, a); // Note: converts 4 integers; only lane 0 is returned.
        return r.array[0];
    }
    else
    {
        return convertFloatToInt32UsingMXCSR(a.array[0]);
    }
}
unittest
{
    assert(1 == _mm_cvtss_si32(_mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f)));
}
1033 
/// Convert the lower single-precision (32-bit) floating-point element in `a` to a 64-bit integer.
long _mm_cvtss_si64 (__m128 a) @safe
{
    static if (LDC_with_SSE2)
    {
        version(X86_64)
        {
            return __builtin_ia32_cvtss2si64(a);
        }
        else
        {
            // Note: In 32-bit x86, there is no way to convert from float/double to 64-bit integer
            // using SSE instructions only. So the builtin doesn't exist for this arch.
            return convertFloatToInt64UsingMXCSR(a.array[0]);
        }
    }
    else
    {
        // Fallback honors the current MXCSR rounding mode, like the hardware instruction would.
        return convertFloatToInt64UsingMXCSR(a.array[0]);
    }
}
unittest
{
    assert(1 == _mm_cvtss_si64(_mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f)));

    uint savedRounding = _MM_GET_ROUNDING_MODE();

    _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST);
    assert(-86186 == _mm_cvtss_si64(_mm_set1_ps(-86186.49f)));

    _MM_SET_ROUNDING_MODE(_MM_ROUND_DOWN);
    assert(-86187 == _mm_cvtss_si64(_mm_set1_ps(-86186.1f)));

    _MM_SET_ROUNDING_MODE(_MM_ROUND_UP);
    assert(86187 == _mm_cvtss_si64(_mm_set1_ps(86186.1f)));

    _MM_SET_ROUNDING_MODE(_MM_ROUND_TOWARD_ZERO);
    assert(-86186 == _mm_cvtss_si64(_mm_set1_ps(-86186.9f)));

    _MM_SET_ROUNDING_MODE(savedRounding);
}
1075 
1076 
/// Convert the lower single-precision (32-bit) floating-point element in `a` to a 32-bit 
/// integer with truncation.
int _mm_cvtt_ss2si (__m128 a) pure @safe
{
    // x86: cvttss2si always generated, even in -O0
    // D's cast(int) truncates toward zero, independent of the MXCSR rounding mode,
    // matching cvttss2si semantics.
    return cast(int)(a.array[0]);
}
alias _mm_cvttss_si32 = _mm_cvtt_ss2si; ///ditto
unittest
{
    assert(1 == _mm_cvtt_ss2si(_mm_setr_ps(1.9f, 2.0f, 3.0f, 4.0f)));
}
1089 
1090 
/// Convert packed single-precision (32-bit) floating-point elements in `a` to packed 32-bit 
/// integers with truncation.
__m64 _mm_cvtt_ps2pi (__m128 a) pure @safe
{
    // Truncating conversion of all 4 lanes; the result keeps only the low 2.
    __m128i truncated = _mm_cvttps_epi32(a);
    return to_m64(truncated);
}
1097 
/// Convert the lower single-precision (32-bit) floating-point element in `a` to a 64-bit 
/// integer with truncation.
long _mm_cvttss_si64 (__m128 a) pure @safe
{
    // cast(long) truncates toward zero, matching cvttss2si64 semantics.
    float lane0 = a.array[0];
    return cast(long)lane0;
}
unittest
{
    assert(_mm_cvttss_si64(_mm_setr_ps(1.9f, 2.0f, 3.0f, 4.0f)) == 1);
}
1108 
/// Divide packed single-precision (32-bit) floating-point elements in `a` by packed elements in `b`.
__m128 _mm_div_ps(__m128 a, __m128 b) pure @safe
{
    pragma(inline, true);
    return a / b; // element-wise vector division
}
unittest
{
    __m128 v = _mm_setr_ps(1.5f, -2.0f, 3.0f, 1.0f);
    __m128 q = _mm_div_ps(v, v);
    float[4] expected = [1.0f, 1.0f, 1.0f, 1.0f];
    assert(q.array == expected);
}
1122 
/// Divide the lower single-precision (32-bit) floating-point element in `a` by the lower 
/// single-precision (32-bit) floating-point element in `b`, store the result in the lower 
/// element of result, and copy the upper 3 packed elements from `a` to the upper elements of result.
__m128 _mm_div_ss(__m128 a, __m128 b) pure @safe
{
    static if (DMD_with_DSIMD)
        return cast(__m128) __simd(XMM.DIVSS, a, b); // DMD: emit divss directly
    else static if (GDC_with_SSE)
        return __builtin_ia32_divss(a, b);           // GDC: hardware builtin
    else
    {
        // Generic fallback: divide lane 0 in place, keep the other lanes of `a`.
        a[0] /= b[0];
        return a;
    }
}
unittest
{
    __m128 a = [1.5f, -2.0f, 3.0f, 1.0f];
    a = _mm_div_ss(a, a);
    float[4] correct = [1.0f, -2.0, 3.0f, 1.0f];
    assert(a.array == correct);
}
1145 
/// Extract a 16-bit unsigned integer from `a`, selected with `imm8`. Zero-extended.
/// The selector is taken modulo 4, consistent with `_mm_insert_pi16`.
int _mm_extract_pi16 (__m64 a, int imm8)
{
    short4 sa = cast(short4)a;
    // Fix: mask the selector to 0..3 so an out-of-range `imm8` cannot index out of
    // bounds; this mirrors the `imm8 & 3` masking already done in _mm_insert_pi16.
    return cast(ushort)(sa.array[imm8 & 3]);
}
unittest
{
    __m64 A = _mm_setr_pi16(-1, 6, 0, 4);
    assert(_mm_extract_pi16(A, 0) == 65535);
    assert(_mm_extract_pi16(A, 1) == 6);
    assert(_mm_extract_pi16(A, 2) == 0);
    assert(_mm_extract_pi16(A, 3) == 4);
    assert(_mm_extract_pi16(A, 4 | 1) == 6); // selector wraps modulo 4
}
1160 
/// Free aligned memory that was allocated with `_mm_malloc` or `_mm_realloc`.
void _mm_free(void * mem_addr) @trusted
{
    // support for free(NULL)
    if (mem_addr is null)
        return;

    // Technically we don't need to store size and alignment in the chunk, but we do in case we
    // have to implement _mm_realloc

    // Layout written by _mm_malloc's storeRawPointerPlusInfo (not visible here — TODO confirm):
    // the raw malloc pointer sits one pointer-size before `mem_addr`, and the alignment
    // three pointer-sizes before it.
    size_t pointerSize = (void*).sizeof;
    void** rawLocation = cast(void**)(cast(char*)mem_addr - size_t.sizeof);
    size_t* alignmentLocation = cast(size_t*)(cast(char*)mem_addr - 3 * pointerSize);
    size_t alignment = *alignmentLocation;
    assert(alignment != 0);
    assert(isPointerAligned(mem_addr, alignment));
    free(*rawLocation); // free the original, unaligned allocation
}
1179 
/// Get the exception mask bits from the MXCSR control and status register. 
/// The exception mask may contain any of the following flags: `_MM_MASK_INVALID`, 
/// `_MM_MASK_DIV_ZERO`, `_MM_MASK_DENORM`, `_MM_MASK_OVERFLOW`, `_MM_MASK_UNDERFLOW`, `_MM_MASK_INEXACT`.
/// Note: won't correspond to reality on non-x86, where MXCSR this is emulated.
uint _MM_GET_EXCEPTION_MASK() @safe
{
    uint csr = _mm_getcsr();
    return csr & _MM_MASK_MASK;
}
1188 
/// Get the exception state bits from the MXCSR control and status register. 
/// The exception state may contain any of the following flags: `_MM_EXCEPT_INVALID`, 
/// `_MM_EXCEPT_DIV_ZERO`, `_MM_EXCEPT_DENORM`, `_MM_EXCEPT_OVERFLOW`, `_MM_EXCEPT_UNDERFLOW`, `_MM_EXCEPT_INEXACT`.
/// Note: won't correspond to reality on non-x86, where MXCSR this is emulated. No exception reported.
uint _MM_GET_EXCEPTION_STATE() @safe
{
    uint csr = _mm_getcsr();
    return csr & _MM_EXCEPT_MASK;
}
1197 
/// Get the flush zero bits from the MXCSR control and status register. 
/// The flush zero may contain any of the following flags: `_MM_FLUSH_ZERO_ON` or `_MM_FLUSH_ZERO_OFF`
uint _MM_GET_FLUSH_ZERO_MODE() @safe
{
    uint csr = _mm_getcsr();
    return csr & _MM_FLUSH_ZERO_MASK;
}
1204 
/// Get the rounding mode bits from the MXCSR control and status register. The rounding mode may 
/// contain any of the following flags: `_MM_ROUND_NEAREST`, `_MM_ROUND_DOWN`, `_MM_ROUND_UP`, `_MM_ROUND_TOWARD_ZERO`.
uint _MM_GET_ROUNDING_MODE() @safe
{
    uint csr = _mm_getcsr();
    return csr & _MM_ROUND_MASK;
}
1211 
/// Get the unsigned 32-bit value of the MXCSR control and status register.
/// Note: this is emulated on ARM, because there is no MXCSR register then.
uint _mm_getcsr() @trusted
{
    static if (LDC_with_ARM)
    {
        // Note: we convert the ARM FPSCR into a x86 SSE control word.
        // However, only rounding mode and flush to zero are actually set.
        // The returned control word will have all exceptions masked, and no exception detected.

        uint fpscr = arm_get_fpcr();

        uint cw = 0; // No exception detected
        if (fpscr & _MM_FLUSH_ZERO_MASK_ARM)
        {
            // ARM has a single flush-to-zero flag; it maps to both
            // x86 bits (flush-to-zero and denormals-are-zero).
            // https://developer.arm.com/documentation/dui0473/c/neon-and-vfp-programming/the-effects-of-using-flush-to-zero-mode
            cw |= _MM_FLUSH_ZERO_ON;
            cw |= 0x40; // set "denormals are zeros"
        } 
        cw |= _MM_MASK_MASK; // All exceptions masked

        // Rounding mode: translate ARM FPSCR rounding bits to the x86 encoding
        switch(fpscr & _MM_ROUND_MASK_ARM)
        {
            default:
            case _MM_ROUND_NEAREST_ARM:     cw |= _MM_ROUND_NEAREST;     break;
            case _MM_ROUND_DOWN_ARM:        cw |= _MM_ROUND_DOWN;        break;
            case _MM_ROUND_UP_ARM:          cw |= _MM_ROUND_UP;          break;
            case _MM_ROUND_TOWARD_ZERO_ARM: cw |= _MM_ROUND_TOWARD_ZERO; break;
        }
        return cw;
    }
    else version(GNU)
    {
        static if (GDC_with_SSE)
        {
            return __builtin_ia32_stmxcsr();
        }
        else version(X86)
        {
            // GDC without SSE builtins: read MXCSR with inline assembly
            uint sseRounding = 0;
            asm pure nothrow @nogc @trusted
            {
                "stmxcsr %0;\n" 
                  : "=m" (sseRounding)
                  : 
                  : ;
            }
            return sseRounding;
        }
        else return __warn_noop_ret!uint();
    }
    else version (InlineX86Asm)
    {
        uint controlWord;
        asm nothrow @nogc pure @trusted
        {
            stmxcsr controlWord;
        }
        return controlWord;
    }
    else
        static assert(0, "Not yet supported");
}
unittest
{
    uint csr = _mm_getcsr();
}
1282 
/// Insert a 16-bit integer `i` inside `v` at the location specified by `imm8`.
__m64 _mm_insert_pi16 (__m64 v, int i, int imm8) pure @trusted
{
    short4 lanes = cast(short4)v;
    lanes.ptr[imm8 & 3] = cast(short)i; // selector taken modulo 4
    return cast(__m64)lanes;
}
unittest
{
    __m64 src = _mm_set_pi16(3, 2, 1, 0);
    short4 res = cast(short4) _mm_insert_pi16(src, 42, 1 | 4);
    short[4] expected = [0, 42, 2, 3];
    assert(res.array == expected);
}
1297 
/// Load 128-bits (composed of 4 packed single-precision (32-bit) floating-point elements) from memory.
/// `p` must be aligned on a 16-byte boundary or a general-protection exception may be generated.
__m128 _mm_load_ps(const(float)*p) pure @trusted // FUTURE shouldn't be trusted, see #62
{
    pragma(inline, true);
    return *cast(__m128*)p; // aligned 16-byte load
}
unittest
{
    static immutable align(16) float[4] correct = [1.0f, 2.0f, 3.0f, 4.0f];
    __m128 A = _mm_load_ps(correct.ptr);
    assert(A.array == correct);
}
1311 
/// Load a single-precision (32-bit) floating-point element from memory into all elements.
__m128 _mm_load_ps1(const(float)*p) pure @trusted
{
    float value = *p;
    return __m128(value); // broadcast constructor fills all 4 lanes
}
unittest
{
    float n = 2.5f;
    __m128 res = _mm_load_ps1(&n);
    float[4] expected = [2.5f, 2.5f, 2.5f, 2.5f];
    assert(res.array == expected);
}
1324 
/// Load a single-precision (32-bit) floating-point element from memory into the lower of dst, and zero the upper 3 
/// elements. `mem_addr` does not need to be aligned on any particular boundary.
__m128 _mm_load_ss (const(float)* mem_addr) pure @trusted
{
    pragma(inline, true);
    static if (DMD_with_DSIMD)
    {
        return cast(__m128)__simd(XMM.LODSS, *cast(__m128*)mem_addr);
    }
    else
    {
        __m128 r; // PERF =void;
        r.ptr[0] = *mem_addr;
        r.ptr[1] = 0;
        r.ptr[2] = 0;
        r.ptr[3] = 0;
        return r;
    }
}
unittest
{
    float n = 2.5f;
    float[4] correct = [2.5f, 0.0f, 0.0f, 0.0f];
    __m128 A = _mm_load_ss(&n);
    assert(A.array == correct);
}
1351 
/// Load a single-precision (32-bit) floating-point element from memory into all elements of result.
alias _mm_load1_ps = _mm_load_ps1;
1354 
/// Load 2 single-precision (32-bit) floating-point elements from memory into the upper 2 elements of result, 
/// and copy the lower 2 elements from `a` to result. `mem_addr` does not need to be aligned on any particular boundary.
__m128 _mm_loadh_pi (__m128 a, const(__m64)* mem_addr) pure @trusted
{
    pragma(inline, true);
    static if (DMD_with_DSIMD)
    {
        return cast(__m128) __simd(XMM.LODHPS, a, *cast(const(__m128)*)mem_addr); 
    }
    else
    {
        // x86: movlhps generated since LDC 1.9.0 -O1
        // Overwrite the high 64 bits (upper 2 floats) with the loaded value.
        long2 la = cast(long2)a;
        la.ptr[1] = (*mem_addr).array[0];
        return cast(__m128)la;
    }
}
unittest
{
    __m128 A = _mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f);
    __m128 B = _mm_setr_ps(5.0f, 6.0f, 7.0f, 8.0f);
    __m64 M = to_m64(cast(__m128i)B);
     __m128 R = _mm_loadh_pi(A, &M);
    float[4] correct = [1.0f, 2.0f, 5.0f, 6.0f];
    assert(R.array == correct);
}
1381 
/// Load 2 single-precision (32-bit) floating-point elements from memory into the lower 2 elements of result, 
/// and copy the upper 2 elements from `a` to result. `mem_addr` does not need to be aligned on any particular boundary.
__m128 _mm_loadl_pi (__m128 a, const(__m64)* mem_addr) pure @trusted
{
    pragma(inline, true);

    // Disabled because of https://issues.dlang.org/show_bug.cgi?id=23046
    /*
    static if (DMD_with_DSIMD)
    {
        return cast(__m128) __simd(XMM.LODLPS, a, *cast(const(__m128)*)mem_addr); 
    }
    else */
    {
        // x86: movlpd/movlps generated with all LDC -O1
        // Overwrite the low 64 bits (lower 2 floats) with the loaded value.
        long2 la = cast(long2)a;
        la.ptr[0] = (*mem_addr).array[0];
        return cast(__m128)la;
    }
}
unittest
{
    __m128 A = _mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f);
    __m128 B = _mm_setr_ps(5.0f, 6.0f, 7.0f, 8.0f);
    __m64 M = to_m64(cast(__m128i)B);
     __m128 R = _mm_loadl_pi(A, &M);
    float[4] correct = [5.0f, 6.0f, 3.0f, 4.0f];
    assert(R.array == correct);
}
1411 
/// Load 4 single-precision (32-bit) floating-point elements from memory in reverse order. 
/// `mem_addr` must be aligned on a 16-byte boundary or a general-protection exception may be generated.
__m128 _mm_loadr_ps (const(float)* mem_addr) pure @trusted // FUTURE shouldn't be trusted, see #62
{
    __m128* aligned = cast(__m128*)mem_addr; // x86: movaps + shufps since LDC 1.0.0 -O1
    __m128 a = *aligned;
    static if (DMD_with_DSIMD)
    {
        // 27 == 0b00_01_10_11: shuffle selector that reverses the 4 lanes.
        return cast(__m128) __simd(XMM.SHUFPS, a, a, 27);
    }
    else
    {
        __m128 r; // PERF =void;
        r.ptr[0] = a.array[3];
        r.ptr[1] = a.array[2];
        r.ptr[2] = a.array[1];
        r.ptr[3] = a.array[0];
        return r;
    }
}
unittest
{
    align(16) static immutable float[4] arr = [ 1.0f, 2.0f, 3.0f, 8.0f ];
    __m128 A = _mm_loadr_ps(arr.ptr);
    float[4] correct = [ 8.0f, 3.0f, 2.0f, 1.0f ];
    assert(A.array == correct);
}
1439 
/// Load 128-bits (composed of 4 packed single-precision (32-bit) floating-point elements) from memory. 
/// `mem_addr` does not need to be aligned on any particular boundary.
__m128 _mm_loadu_ps(const(float)* mem_addr) pure @trusted
{
    pragma(inline, true);
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_loadups(mem_addr);
    }
    else static if (LDC_with_optimizations)
    {
        // Fix: removed a redundant nested `static if (LDC_with_optimizations)` whose
        // identical condition made its scalar `else` branch unreachable dead code.
        return loadUnaligned!(__m128)(mem_addr);
    }
    else version(DigitalMars)
    {
        static if (DMD_with_DSIMD)
        {
            return cast(__m128)__simd(XMM.LODUPS, *cast(const(float4*))mem_addr);
        }
        else static if (SSESizedVectorsAreEmulated)
        {
            // Since this vector is emulated, it doesn't have alignment constraints
            // and as such we can just cast it.
            return *cast(__m128*)(mem_addr);
        }
        else
        {
            // Scalar fallback: copy the four floats one by one.
            __m128 result;
            result.ptr[0] = mem_addr[0];
            result.ptr[1] = mem_addr[1];
            result.ptr[2] = mem_addr[2];
            result.ptr[3] = mem_addr[3];
            return result;
        }
    }
    else
    {
        // Scalar fallback for any other compiler/configuration.
        __m128 result;
        result.ptr[0] = mem_addr[0];
        result.ptr[1] = mem_addr[1];
        result.ptr[2] = mem_addr[2];
        result.ptr[3] = mem_addr[3];
        return result;
    }
}
unittest
{
    align(16) static immutable float[5] arr = [ 1.0f, 2.0f, 3.0f, 8.0f, 9.0f ];  // force unaligned load
    __m128 A = _mm_loadu_ps(&arr[1]);
    float[4] correct = [ 2.0f, 3.0f, 8.0f, 9.0f ];
    assert(A.array == correct);
}
1504 
/// Allocate `size` bytes of memory, aligned to the alignment specified in `alignment`,
/// and return a pointer to the allocated memory. `_mm_free` should be used to free
/// memory that is allocated with `_mm_malloc`.
void* _mm_malloc(size_t size, size_t alignment) @trusted
{
    assert(alignment != 0);
    // Over-allocate so the aligned pointer plus bookkeeping fits (see _mm_free).
    size_t request = requestedSize(size, alignment);
    void* raw = malloc(request);
    if (request > 0 && raw == null) // malloc(0) can validly return anything
        onOutOfMemoryError();
    return storeRawPointerPlusInfo(raw, size, alignment); // PERF: no need to store size
}
1517 
/// Conditionally store 8-bit integer elements from `a` into memory using `mask` (elements are not stored when the highest 
/// bit is not set in the corresponding element) and a non-temporal memory hint.
void _mm_maskmove_si64 (__m64 a, __m64 mask, char* mem_addr) @trusted
{
    // this works since mask is zero-extended: the upper 8 mask bytes of the widened
    // 128-bit mask are zero, so those bytes are never stored.
    return _mm_maskmoveu_si128 (to_m128i(a), to_m128i(mask), mem_addr);
}

deprecated("Use _mm_maskmove_si64 instead") alias _m_maskmovq = _mm_maskmove_si64;///
1527 
/// Compare packed signed 16-bit integers in `a` and `b`, and return packed maximum value.
__m64 _mm_max_pi16 (__m64 a, __m64 b) pure @safe
{
    // Widen to 128-bit, reuse the SSE2 implementation, keep the low 64 bits.
    __m128i wa = to_m128i(a);
    __m128i wb = to_m128i(b);
    return to_m64(_mm_max_epi16(wa, wb));
}
1533 
/// Compare packed single-precision (32-bit) floating-point elements in `a` and `b`, and return packed maximum values.
__m128 _mm_max_ps(__m128 a, __m128 b) pure @safe
{
    static if (DMD_with_DSIMD)
    {
        return cast(__m128) __simd(XMM.MAXPS, a, b);
    }
    else static if (GDC_with_SSE)
    {
        return __builtin_ia32_maxps(a, b);
    }
    else static if (LDC_with_SSE)
    {
        return __builtin_ia32_maxps(a, b);
    }
    else
    {
        // ARM: Optimized into fcmgt + bsl since LDC 1.8 -02
        // Matches maxps semantics: when the comparison is false (including NaN),
        // the second operand `b` is selected.
        __m128 r; // PERF =void;
        r[0] = (a[0] > b[0]) ? a[0] : b[0];
        r[1] = (a[1] > b[1]) ? a[1] : b[1];
        r[2] = (a[2] > b[2]) ? a[2] : b[2];
        r[3] = (a[3] > b[3]) ? a[3] : b[3];
        return r;    
    }
}
unittest
{
    __m128 A = _mm_setr_ps(1, 2, float.nan, 4);
    __m128 B = _mm_setr_ps(4, 1, 4, float.nan);
    __m128 M = _mm_max_ps(A, B);
    assert(M.array[0] == 4);
    assert(M.array[1] == 2);
    assert(M.array[2] == 4);    // in case of NaN, second operand prevails (as it seems)
    assert(M.array[3] != M.array[3]); // in case of NaN, second operand prevails (as it seems)
}
1570 
/// Compare packed unsigned 8-bit integers in `a` and `b`, and return packed maximum values.
__m64 _mm_max_pu8 (__m64 a, __m64 b) pure @safe
{
    // Widen to 128-bit, reuse the SSE2 implementation, keep the low 64 bits.
    __m128i wa = to_m128i(a);
    __m128i wb = to_m128i(b);
    return to_m64(_mm_max_epu8(wa, wb));
}
1576 
/// Compare the lower single-precision (32-bit) floating-point elements in `a` and `b`, store the maximum value in the 
/// lower element of result, and copy the upper 3 packed elements from `a` to the upper element of result.
 __m128 _mm_max_ss(__m128 a, __m128 b) pure @safe
{
    static if (DMD_with_DSIMD)
    {
        return cast(__m128) __simd(XMM.MAXSS, a, b);
    }
    else static if (GDC_with_SSE)
    {
        return __builtin_ia32_maxss(a, b);
    }
    else static if (LDC_with_SSE)
    {
        return __builtin_ia32_maxss(a, b); 
    }
    else
    {  
        // Matches maxss semantics: on NaN, the second operand's lane 0 is selected.
        __m128 r = a;
        r[0] = (a[0] > b[0]) ? a[0] : b[0];
        return r;
    }
}
unittest
{
    __m128 A = _mm_setr_ps(1, 2, 3, 4);
    __m128 B = _mm_setr_ps(4, 1, 4, 1);
    __m128 C = _mm_setr_ps(float.nan, 1, 4, 1);
    __m128 M = _mm_max_ss(A, B);
    assert(M.array[0] == 4);
    assert(M.array[1] == 2);
    assert(M.array[2] == 3);
    assert(M.array[3] == 4);
    // NOTE(review): the two calls below use _mm_max_ps; probably _mm_max_ss was
    // intended — lane-0 semantics are identical either way, so the asserts hold.
    M = _mm_max_ps(A, C); // in case of NaN, second operand prevails
    assert(M.array[0] != M.array[0]);
    M = _mm_max_ps(C, A); // in case of NaN, second operand prevails
    assert(M.array[0] == 1);
}
1615 
/// Compare packed signed 16-bit integers in `a` and `b`, and return packed minimum values.
__m64 _mm_min_pi16 (__m64 a, __m64 b) pure @safe
{
    // Widen to 128-bit, reuse the SSE2 implementation, keep the low 64 bits.
    __m128i wa = to_m128i(a);
    __m128i wb = to_m128i(b);
    return to_m64(_mm_min_epi16(wa, wb));
}
1621 
/// Compare packed single-precision (32-bit) floating-point elements in `a` and `b`, and return packed minimum values.
__m128 _mm_min_ps(__m128 a, __m128 b) pure @safe
{
    static if (DMD_with_DSIMD)
    {
        return cast(__m128) __simd(XMM.MINPS, a, b);
    }
    else static if (GDC_with_SSE)
    {
        return __builtin_ia32_minps(a, b);
    }
    else static if (LDC_with_SSE)
    {
        // not technically needed, but better perf in debug mode
        return __builtin_ia32_minps(a, b);
    }
    else
    {
        // ARM: Optimized into fcmgt + bsl since LDC 1.8 -02
        // Matches minps semantics: when the comparison is false (including NaN),
        // the second operand `b` is selected.
        __m128 r; // PERF =void;
        r[0] = (a[0] < b[0]) ? a[0] : b[0];
        r[1] = (a[1] < b[1]) ? a[1] : b[1];
        r[2] = (a[2] < b[2]) ? a[2] : b[2];
        r[3] = (a[3] < b[3]) ? a[3] : b[3];
        return r;
    }
}
unittest
{
    __m128 A = _mm_setr_ps(1, 2, float.nan, 4);
    __m128 B = _mm_setr_ps(4, 1, 4, float.nan);
    __m128 M = _mm_min_ps(A, B);
    assert(M.array[0] == 1);
    assert(M.array[1] == 1);
    assert(M.array[2] == 4);    // in case of NaN, second operand prevails (as it seems)
    assert(M.array[3] != M.array[3]); // in case of NaN, second operand prevails (as it seems)
}
1659 
/// Compare packed unsigned 8-bit integers in `a` and `b`, and return packed minimum values.
__m64 _mm_min_pu8 (__m64 a, __m64 b) pure @safe
{
    // Widen both operands to 128-bit, reuse the SSE2 implementation,
    // then narrow the result back to 64-bit.
    __m128i wideA = to_m128i(a);
    __m128i wideB = to_m128i(b);
    return to_m64(_mm_min_epu8(wideA, wideB));
}
1665 
/// Compare the lower single-precision (32-bit) floating-point elements in `a` and `b`, store the minimum value in the 
/// lower element of result, and copy the upper 3 packed elements from `a` to the upper elements of result.
__m128 _mm_min_ss(__m128 a, __m128 b) pure @safe
{
    static if (DMD_with_DSIMD)
    {
        return cast(__m128) __simd(XMM.MINSS, a, b);
    }
    else static if (GDC_with_SSE)
    {
        return __builtin_ia32_minss(a, b);
    }
    else static if (LDC_with_SSE)
    {
        return __builtin_ia32_minss(a, b);
    }
    else
    {
        // Generic fallback, matches MINSS semantics: when the comparison is
        // unordered (a NaN is involved), the second operand b[0] is returned.
        // Generates minss since LDC 1.3 -O1
        __m128 r = a;
        r[0] = (a[0] < b[0]) ? a[0] : b[0];
        return r;
    }
}
unittest
{
    __m128 A = _mm_setr_ps(1, 2, 3, 4);
    __m128 B = _mm_setr_ps(4, 1, 4, 1);
    __m128 C = _mm_setr_ps(float.nan, 1, 4, 1);
    __m128 M = _mm_min_ss(A, B);
    assert(M.array[0] == 1);
    assert(M.array[1] == 2);
    assert(M.array[2] == 3);
    assert(M.array[3] == 4);
    // BUGFIX: this unittest previously called _mm_min_ps here, so the scalar
    // NaN behaviour of _mm_min_ss was never actually tested.
    M = _mm_min_ss(A, C); // in case of NaN, second operand prevails
    assert(M.array[0] != M.array[0]);
    M = _mm_min_ss(C, A); // in case of NaN, second operand prevails
    assert(M.array[0] == 1);
}
1705 
/// Move the lower single-precision (32-bit) floating-point element from `b` to the lower element of result, and copy 
/// the upper 3 packed elements from `a` to the upper elements of result.
__m128 _mm_move_ss (__m128 a, __m128 b) pure @trusted
{
    // Workaround https://issues.dlang.org/show_bug.cgi?id=21673
    // inlining of this function fails.
    version(DigitalMars) asm nothrow @nogc pure { nop; }

    a.ptr[0] = b.array[0]; // overwrite lane 0 only; lanes 1..3 of `a` are kept
    return a;
}
unittest
{
    __m128 A = _mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f);
    __m128 B = _mm_setr_ps(5.0f, 6.0f, 7.0f, 8.0f);
    __m128 R = _mm_move_ss(A, B);
    float[4] correct = [5.0f, 2.0f, 3.0f, 4.0f];
    assert(R.array == correct);
}
1725 
/// Move the upper 2 single-precision (32-bit) floating-point elements from `b` to the lower 2 elements of result, and 
/// copy the upper 2 elements from `a` to the upper 2 elements of result.
__m128 _mm_movehl_ps (__m128 a, __m128 b) pure @trusted
{
    // PERF DMD
    // Disabled because of https://issues.dlang.org/show_bug.cgi?id=19443
    /*
    static if (DMD_with_DSIMD)
    {
        
        return cast(__m128) __simd(XMM.MOVHLPS, a, b);
    }
    else */
    {
        // Scalar fallback: result = [b2, b3, a2, a3]
        a.ptr[0] = b.array[2];
        a.ptr[1] = b.array[3];
        return a;
    }
}
unittest
{
    __m128 A = _mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f);
    __m128 B = _mm_setr_ps(5.0f, 6.0f, 7.0f, 8.0f);
    __m128 R = _mm_movehl_ps(A, B);
    float[4] correct = [7.0f, 8.0f, 3.0f, 4.0f];
    assert(R.array == correct);
}
1753 
/// Move the upper 2 32-bit integer elements from `b` to the lower 2 elements of result, and 
/// copy the upper 2 elements from `a` to the upper 2 elements of result.
__m128i _mm_movehl_epi32 (__m128i a, __m128i b) pure @trusted
{
    // result = [b2, b3, a2, a3]
    __m128i r = a;
    r.ptr[0] = b.array[2];
    r.ptr[1] = b.array[3];
    return r;
}
unittest
{
    __m128i A = _mm_setr_epi32(1, 2, 3, 4);
    __m128i B = _mm_setr_epi32(5, 6, 7, 8);
    int[4] expected = [7, 8, 3, 4];
    __m128i R = _mm_movehl_epi32(A, B);
    assert(R.array == expected);
}
1770 
/// Move the lower 2 single-precision (32-bit) floating-point elements from `b` to the upper 2 elements of result, and 
/// copy the lower 2 elements from `a` to the lower 2 elements of result
__m128 _mm_movelh_ps (__m128 a, __m128 b) pure @trusted
{    
    // Was disabled because of https://issues.dlang.org/show_bug.cgi?id=19443
    static if (DMD_with_DSIMD && __VERSION__ >= 2101)
    {
        return cast(__m128) __simd(XMM.MOVLHPS, a, b);
    }
    else
    {
        // Scalar fallback: result = [a0, a1, b0, b1]
        a.ptr[2] = b.array[0];
        a.ptr[3] = b.array[1];
        return a;
    }    
}
unittest
{
    __m128 A = _mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f);
    __m128 B = _mm_setr_ps(5.0f, 6.0f, 7.0f, 8.0f);
    __m128 R = _mm_movelh_ps(A, B);
    float[4] correct = [1.0f, 2.0f, 5.0f, 6.0f];
    assert(R.array == correct);
}
1795 
/// Move the lower 2 32-bit integers `b` to the upper 2 elements of result, and 
/// copy the lower 2 elements from `a` to the lower 2 elements of result
__m128i _mm_movelh_epi32 (__m128i a, __m128i b) pure @trusted // #BONUS
{
    // Both branches compute the same thing; the DigitalMars one merely
    // disables inlining as a compiler-crash workaround.
    version(DigitalMars)
    {
        // Crash in DMD 2.098 with -O -inline -a x86
        // not sure when it was fixed
        pragma(inline, false);
        a.ptr[2] = b.array[0];
        a.ptr[3] = b.array[1];
    }
    else
    {
        a.ptr[2] = b.array[0];
        a.ptr[3] = b.array[1];
    }
    return a;
}
unittest
{
    __m128i A = _mm_setr_epi32(1, 2, 3, 4);
    __m128i B = _mm_setr_epi32(5, 6, 7, 8);
    __m128i R = _mm_movelh_epi32(A, B);
    int[4] correct = [1, 2, 5, 6];
    assert(R.array == correct);
}
1823 
/// Create mask from the most significant bit of each 8-bit element in `a`.
int _mm_movemask_pi8 (__m64 a) pure @safe
{
    // Promote to 128-bit and reuse the SSE2 byte movemask; only the low
    // 8 bits of the result correspond to `a` (assumes to_m128i zero-extends
    // the upper 64 bits — the unittest below relies on that).
    __m128i wide = to_m128i(a);
    return _mm_movemask_epi8(wide);
}
unittest
{
    __m64 v = _mm_set_pi8(-1, 0, 0, -1, -1, -1, 0, 0);
    assert(_mm_movemask_pi8(v) == 0x9C);
}
1833 
/// Set each bit of result based on the most significant bit of the corresponding packed single-precision (32-bit) 
/// floating-point element in `a`.
int _mm_movemask_ps (__m128 a) pure @trusted
{
    // PERF: Not possible in D_SIMD because of https://issues.dlang.org/show_bug.cgi?id=8047
    static if (GDC_with_SSE)
    {
        return __builtin_ia32_movmskps(a);
    }
    else static if (LDC_with_SSE)
    {
        return __builtin_ia32_movmskps(a);
    }
    else static if (LDC_with_ARM)
    {
        // Isolate each sign bit, move it to its bit position, then sum.
        int4 ai = cast(int4)a;
        int4 shift31 = [31, 31, 31, 31]; 
        ai = ai >>> shift31;
        int4 shift = [0, 1, 2, 3]; 
        ai = ai << shift; // 4-way shift, only efficient on ARM.
        int r = ai.array[0] + (ai.array[1]) + (ai.array[2]) + (ai.array[3]);
        return r;
    }
    else
    {
        // Generic fallback: the MSB of a float is its sign bit, so test the
        // lanes as signed integers.
        int4 ai = cast(int4)a;
        int r = 0;
        if (ai.array[0] < 0) r += 1;
        if (ai.array[1] < 0) r += 2;
        if (ai.array[2] < 0) r += 4;
        if (ai.array[3] < 0) r += 8;
        return r;
    }
}
unittest
{
    int4 A = [-1, 0, -43, 0];
    assert(5 == _mm_movemask_ps(cast(float4)A));
}
1873 
/// Multiply packed single-precision (32-bit) floating-point elements in `a` and `b`.
__m128 _mm_mul_ps(__m128 a, __m128 b) pure @safe
{
    pragma(inline, true);
    a *= b;
    return a;
}
unittest
{
    __m128 v = [1.5f, -2.0f, 3.0f, 1.0f];
    v = _mm_mul_ps(v, v);
    float[4] expected = [2.25f, 4.0f, 9.0f, 1.0f];
    assert(v.array == expected);
}
1887 
/// Multiply the lower single-precision (32-bit) floating-point element in `a` and `b`, store the result in the lower 
/// element of result, and copy the upper 3 packed elements from `a` to the upper elements of result.
__m128 _mm_mul_ss(__m128 a, __m128 b) pure @safe
{
    static if (DMD_with_DSIMD)
        return cast(__m128) __simd(XMM.MULSS, a, b);
    else static if (GDC_with_SSE)
        return __builtin_ia32_mulss(a, b);
    else
    {
        // Scalar fallback: only lane 0 is multiplied, lanes 1..3 come from `a`.
        a[0] *= b[0];
        return a;
    }
}
unittest
{
    __m128 a = [1.5f, -2.0f, 3.0f, 1.0f];
    a = _mm_mul_ss(a, a);
    float[4] correct = [2.25f, -2.0f, 3.0f, 1.0f];
    assert(a.array == correct);
}
1909 
/// Multiply the packed unsigned 16-bit integers in `a` and `b`, producing intermediate 32-bit integers, 
/// and return the high 16 bits of the intermediate integers.
__m64 _mm_mulhi_pu16 (__m64 a, __m64 b) pure @safe
{
    // Widen to 128-bit, reuse the SSE2 implementation, then narrow back.
    __m128i wideA = to_m128i(a);
    __m128i wideB = to_m128i(b);
    return to_m64(_mm_mulhi_epu16(wideA, wideB));
}
unittest
{
    __m64 A = _mm_setr_pi16(0, -16, 2, 3);
    __m64 B = _mm_set1_pi16(16384);
    short4 R = cast(short4)_mm_mulhi_pu16(A, B);
    short[4] expected = [0, 0x3FFC, 0, 0];
    assert(R.array == expected);
}
1924 
/// Compute the bitwise OR of packed single-precision (32-bit) floating-point elements in `a` and `b`, and 
/// return the result.
__m128 _mm_or_ps (__m128 a, __m128 b) pure @safe
{
    static if (DMD_with_DSIMD)
        return cast(__m128)__simd(XMM.ORPS, a, b);
    else
        // D has no bitwise ops on float vectors, so OR through integer vectors.
        return cast(__m128)(cast(__m128i)a | cast(__m128i)b);
}
unittest
{
    // OR-ing in the sign bit should negate every finite/infinite value.
    __m128 A = cast(__m128) _mm_set1_epi32(0x80000000);
    __m128 B = _mm_setr_ps(4.0f, -5.0, -9.5f, float.infinity);
    __m128 C = _mm_or_ps(A, B);
    float[4] correct = [-4.0f, -5.0, -9.5f, -float.infinity];
    assert(C.array == correct);
}
1942 
// Deprecated MMX-era names, kept as aliases for compatibility with the
// original Intel intrinsics API.
deprecated("Use _mm_avg_pu8 instead") alias _m_pavgb = _mm_avg_pu8;///
deprecated("Use _mm_avg_pu16 instead") alias _m_pavgw = _mm_avg_pu16;///
deprecated("Use _mm_extract_pi16 instead") alias _m_pextrw = _mm_extract_pi16;///
deprecated("Use _mm_insert_pi16 instead") alias _m_pinsrw = _mm_insert_pi16;///
deprecated("Use _mm_max_pi16 instead") alias _m_pmaxsw = _mm_max_pi16;///
deprecated("Use _mm_max_pu8 instead") alias _m_pmaxub = _mm_max_pu8;///
deprecated("Use _mm_min_pi16 instead") alias _m_pminsw = _mm_min_pi16;///
deprecated("Use _mm_min_pu8 instead") alias _m_pminub = _mm_min_pu8;///
deprecated("Use _mm_movemask_pi8 instead") alias _m_pmovmskb = _mm_movemask_pi8;///
deprecated("Use _mm_mulhi_pu16 instead") alias _m_pmulhuw = _mm_mulhi_pu16;///

// Locality hints for _mm_prefetch. Higher values prefetch into cache levels
// closer to the core; NTA requests non-temporal (minimally polluting) access.
enum _MM_HINT_T0  = 3; ///
enum _MM_HINT_T1  = 2; ///
enum _MM_HINT_T2  = 1; ///
enum _MM_HINT_NTA = 0; ///
1958 
1959 
version(LDC)
{
    // Starting with LLVM 10, it seems llvm.prefetch has changed its name.
    // Was reported at: https://github.com/ldc-developers/ldc/issues/3397
    static if (__VERSION__ >= 2091) 
    {
        pragma(LDC_intrinsic, "llvm.prefetch.p0i8") // was "llvm.prefetch"
            void llvm_prefetch_fixed(void* ptr, uint rw, uint locality, uint cachetype) pure @safe;
    }
}

/// Fetch the line of data from memory that contains address `p` to a location in the 
/// cache hierarchy specified by the locality hint i.
///
/// Warning: `locality` is a compile-time parameter, unlike in Intel Intrinsics API.
void _mm_prefetch(int locality)(const(void)* p) pure @trusted
{
    static if (GDC_with_SSE)
    {
        // GCC builtin: 2nd arg is read(0)/write(1), 3rd is locality 0..3.
        return __builtin_prefetch(p, (locality & 0x4) >> 2, locality & 0x3);
    }
    else static if (DMD_with_DSIMD)
    {
        enum bool isWrite = (locality & 0x4) != 0;
        enum level = locality & 3;
        return prefetch!(isWrite, level)(p);
    }
    else version(LDC)
    {
        static if ((__VERSION__ >= 2091) && (__VERSION__ < 2106))
        {
            // const_cast here. `llvm_prefetch` wants a mutable pointer
            llvm_prefetch_fixed( cast(void*)p, 0, locality, 1);
        }
        else
        {
            // const_cast here. `llvm_prefetch` wants a mutable pointer
            llvm_prefetch( cast(void*)p, 0, locality, 1);
        }
    }
    else version(D_InlineAsm_X86_64)
    {
        // One asm block per hint: the prefetch instruction itself encodes
        // the locality, so it must be selected at compile time.
        static if (locality == _MM_HINT_NTA)
        {
            asm pure nothrow @nogc @trusted
            {
                mov RAX, p;
                prefetchnta [RAX];
            }
        }
        else static if (locality == _MM_HINT_T0)
        {
            asm pure nothrow @nogc @trusted
            {
                mov RAX, p;
                prefetcht0 [RAX];
            }
        }
        else static if (locality == _MM_HINT_T1)
        {
            asm pure nothrow @nogc @trusted
            {
                mov RAX, p;
                prefetcht1 [RAX];
            }
        }
        else static if (locality == _MM_HINT_T2)
        {
            asm pure nothrow @nogc @trusted
            {
                mov RAX, p;
                prefetcht2 [RAX];
            }
        }
        else
            assert(false); // invalid locality hint
    }
    else version(D_InlineAsm_X86)
    {
        static if (locality == _MM_HINT_NTA)
        {
            asm pure nothrow @nogc @trusted
            {
                mov EAX, p;
                prefetchnta [EAX];
            }
        }
        else static if (locality == _MM_HINT_T0)
        {
            asm pure nothrow @nogc @trusted
            {
                mov EAX, p;
                prefetcht0 [EAX];
            }
        }
        else static if (locality == _MM_HINT_T1)
        {
            asm pure nothrow @nogc @trusted
            {
                mov EAX, p;
                prefetcht1 [EAX];
            }
        }
        else static if (locality == _MM_HINT_T2)
        {
            asm pure nothrow @nogc @trusted
            {
                mov EAX, p;
                prefetcht2 [EAX];
            }
        }
        else 
            assert(false); // invalid locality hint
    }
    else
    {
        // Generic version: do nothing. From bitter experience, 
        // it's unlikely you get ANY speed-up with manual prefetching.
        // Prefetching or not doesn't change program behaviour.
    }
}
unittest
{
    // From Intel documentation:
    // "The amount of data prefetched is also processor implementation-dependent. It will, however, be a minimum of 
    // 32 bytes."
    ubyte[256] cacheline; // though it seems it cannot generate GP fault
    _mm_prefetch!_MM_HINT_T0(cacheline.ptr); 
    _mm_prefetch!_MM_HINT_T1(cacheline.ptr); 
    _mm_prefetch!_MM_HINT_T2(cacheline.ptr); 
    _mm_prefetch!_MM_HINT_NTA(cacheline.ptr); 
}
2092 
// Deprecated MMX-era names, kept as aliases for compatibility.
deprecated("Use _mm_sad_pu8 instead") alias _m_psadbw = _mm_sad_pu8;///
deprecated("Use _mm_shuffle_pi16 instead") alias _m_pshufw = _mm_shuffle_pi16;///
2095 
2096 
/// Compute the approximate reciprocal of packed single-precision (32-bit) floating-point elements in `a`, 
/// and return the results. The maximum relative error for this approximation is less than 1.5*2^-12.
__m128 _mm_rcp_ps (__m128 a) pure @trusted
{
    static if (DMD_with_DSIMD)
    {
        return cast(__m128) __simd(XMM.RCPPS, a);
    }
    else static if (GDC_with_SSE)
    {
        return __builtin_ia32_rcpps(a);
    }
    else static if (LDC_with_SSE)
    {
        return __builtin_ia32_rcpps(a);
    }
    else
    {        
        // Fallback: exact division, which exceeds the precision the
        // intrinsic promises.
        a.ptr[0] = 1.0f / a.array[0];
        a.ptr[1] = 1.0f / a.array[1];
        a.ptr[2] = 1.0f / a.array[2];
        a.ptr[3] = 1.0f / a.array[3];
        return a;
    }
}
unittest
{
    __m128 A = _mm_setr_ps(2.34f, -70000.0f, 0.00001f, 345.5f);
    __m128 groundTruth = _mm_set1_ps(1.0f) / A;
    __m128 result = _mm_rcp_ps(A);
    foreach(i; 0..4)
    {
        double relError = (cast(double)(groundTruth.array[i]) / result.array[i]) - 1;
        assert(abs_double(relError) < 0.00037); // 1.5*2^-12 is 0.00036621093
    }
}
2133 
/// Compute the approximate reciprocal of the lower single-precision (32-bit) floating-point element in `a`, store it 
/// in the lower element of the result, and copy the upper 3 packed elements from `a` to the upper elements of result. 
/// The maximum relative error for this approximation is less than 1.5*2^-12.
__m128 _mm_rcp_ss (__m128 a) pure @trusted
{
    // Disabled, see https://issues.dlang.org/show_bug.cgi?id=23049
    /*static if (DMD_with_DSIMD)
    {
        return cast(__m128) __simd(XMM.RCPSS, a);
    }
    else*/
    static if (GDC_with_SSE)
    {
        return __builtin_ia32_rcpss(a);
    }
    else static if (LDC_with_SSE)
    {
        return __builtin_ia32_rcpss(a);
    }
    else
    {
        // Fallback: exact division on lane 0, lanes 1..3 kept from `a`.
        a.ptr[0] = 1.0f / a.array[0];
        return a;
    }
}
unittest
{
    __m128 A = _mm_setr_ps(2.34f, -70000.0f, 0.00001f, 345.5f);
    __m128 correct = _mm_setr_ps(1 / 2.34f, -70000.0f, 0.00001f, 345.5f);
    __m128 R = _mm_rcp_ss(A);
    double relError = (cast(double)(correct.array[0]) / R.array[0]) - 1;
    assert(abs_double(relError) < 0.00037); // 1.5*2^-12 is 0.00036621093
    assert(R.array[1] == correct.array[1]);
    assert(R.array[2] == correct.array[2]);
    assert(R.array[3] == correct.array[3]);
}
2170 
/// Reallocate `size` bytes of memory, aligned to the alignment specified in `alignment`, and 
/// return a pointer to the newly allocated memory. 
/// Previous data is preserved if any.
///
/// IMPORTANT: `size` MUST be > 0.
///
/// `_mm_free` MUST be used to free memory that is allocated with `_mm_malloc` or `_mm_realloc`.
/// Do NOT call _mm_realloc with size = 0.
void* _mm_realloc(void* aligned, size_t size, size_t alignment) nothrow @nogc // #BONUS
{
    // `!true` = preserve existing data across the reallocation.
    return alignedReallocImpl!true(aligned, size, alignment);
}
unittest
{
    // Stress-test: grow/shrink 8 allocations with varying alignments and
    // pseudo-random sizes, writing every byte to catch bad bounds.
    enum NALLOC = 8;
    enum size_t[8] ALIGNMENTS = [1, 2, 4, 8, 16, 32, 64, 128];
    
    void*[NALLOC] alloc;

    foreach(t; 0..100)
    {
        foreach(n; 0..NALLOC)
        {
            size_t alignment = ALIGNMENTS[n];
            size_t s = 1 + ( (n + t * 69096) & 0xffff );
            alloc[n] = _mm_realloc(alloc[n], s, alignment);
            assert(isPointerAligned(alloc[n], alignment));
            foreach(b; 0..s)
                (cast(ubyte*)alloc[n])[b] = cast(ubyte)n;
        }
    }
    foreach(n; 0..NALLOC)
    {        
        _mm_free(alloc[n]);
    }
}
2207 
/// Reallocate `size` bytes of memory, aligned to the alignment specified in `alignment`, and 
/// return a pointer to the newly allocated memory. 
/// Previous data is discarded.
///
/// IMPORTANT: `size` MUST be > 0.
///
/// `_mm_free` MUST be used to free memory that is allocated with `_mm_malloc` or `_mm_realloc`.
void* _mm_realloc_discard(void* aligned, size_t size, size_t alignment) nothrow @nogc // #BONUS
{
    // `!false` = previous contents need not survive the reallocation.
    return alignedReallocImpl!false(aligned, size, alignment);
}
2219 
/// Compute the approximate reciprocal square root of packed single-precision (32-bit) floating-point elements in `a`. 
/// The maximum relative error for this approximation is less than 1.5*2^-12.
__m128 _mm_rsqrt_ps (__m128 a) pure @trusted
{
    static if (DMD_with_DSIMD)
    {
        return cast(__m128) __simd(XMM.RSQRTPS, a);
    }
    else static if (GDC_with_SSE)
    {
        return __builtin_ia32_rsqrtps(a);
    }
    else static if (LDC_with_SSE)
    {
        return __builtin_ia32_rsqrtps(a);
    }
    else version(LDC)
    {
        // Fallback: exact computation (more precise than the intrinsic promises).
        a[0] = 1.0f / llvm_sqrt(a[0]);
        a[1] = 1.0f / llvm_sqrt(a[1]);
        a[2] = 1.0f / llvm_sqrt(a[2]);
        a[3] = 1.0f / llvm_sqrt(a[3]);
        return a;
    }
    else
    {
        // Fallback: exact computation (more precise than the intrinsic promises).
        a.ptr[0] = 1.0f / sqrt(a.array[0]);
        a.ptr[1] = 1.0f / sqrt(a.array[1]);
        a.ptr[2] = 1.0f / sqrt(a.array[2]);
        a.ptr[3] = 1.0f / sqrt(a.array[3]);
        return a;
    }
}
unittest
{
    __m128 A = _mm_setr_ps(2.34f, 70000.0f, 0.00001f, 345.5f);
    __m128 groundTruth = _mm_setr_ps(0.65372045f, 0.00377964473f, 316.227766f, 0.05379921937f);
    __m128 result = _mm_rsqrt_ps(A);
    foreach(i; 0..4)
    {
        double relError = (cast(double)(groundTruth.array[i]) / result.array[i]) - 1;
        assert(abs_double(relError) < 0.00037); // 1.5*2^-12 is 0.00036621093
    }
}
2264 
/// Compute the approximate reciprocal square root of the lower single-precision (32-bit) floating-point element in `a`,
/// store the result in the lower element. Copy the upper 3 packed elements from `a` to the upper elements of result. 
/// The maximum relative error for this approximation is less than 1.5*2^-12.
__m128 _mm_rsqrt_ss (__m128 a) pure @trusted
{   
    static if (DMD_with_DSIMD)
    {
        return cast(__m128) __simd(XMM.RSQRTSS, a);
    }
    else static if (GDC_with_SSE)
    {
        return __builtin_ia32_rsqrtss(a);
    }
    else static if (LDC_with_SSE)
    {
        return __builtin_ia32_rsqrtss(a);
    }
    else version(LDC)
    {
        // Fallback: exact computation on lane 0, lanes 1..3 kept from `a`.
        a[0] = 1.0f / llvm_sqrt(a[0]);
        return a;
    }
    else
    {
        // Fallback: exact computation on lane 0, lanes 1..3 kept from `a`.
        a[0] = 1.0f / sqrt(a[0]);
        return a;
    }
}
unittest // this one test 4 different intrinsics: _mm_rsqrt_ss, _mm_rsqrt_ps, _mm_rcp_ps, _mm_rcp_ss
{
    // Checks relative error of the approximate intrinsics against exact
    // double-precision references, over a spread of magnitudes.
    double maxRelativeError = 0.000245; // -72 dB, stuff is apparently more precise than said in the doc?
    void testApproximateSSE(float number) nothrow @nogc
    {
        __m128 A = _mm_set1_ps(number);

        // test _mm_rcp_ps
        __m128 B = _mm_rcp_ps(A);
        foreach(i; 0..4)
        {
            double exact = 1.0f / A.array[i];
            double ratio = cast(double)(B.array[i]) / cast(double)(exact);
            assert(abs_double(ratio - 1) <= maxRelativeError);
        }

        // test _mm_rcp_ss
        {
            B = _mm_rcp_ss(A);
            double exact = 1.0f / A.array[0];
            double ratio = cast(double)(B.array[0]) / cast(double)(exact);
            assert(abs_double(ratio - 1) <= maxRelativeError);
        }

        // test _mm_rsqrt_ps
        B = _mm_rsqrt_ps(A);
        foreach(i; 0..4)
        {
            double exact = 1.0f / sqrt(A.array[i]);
            double ratio = cast(double)(B.array[i]) / cast(double)(exact);
            assert(abs_double(ratio - 1) <= maxRelativeError);
        }

        // test _mm_rsqrt_ss
        {
            B = _mm_rsqrt_ss(A);
            double exact = 1.0f / sqrt(A.array[0]);
            double ratio = cast(double)(B.array[0]) / cast(double)(exact);
            assert(abs_double(ratio - 1) <= maxRelativeError);
        }
    }

    testApproximateSSE(0.00001f);
    testApproximateSSE(1.1f);
    testApproximateSSE(345.0f);
    testApproximateSSE(2.45674864151f);
    testApproximateSSE(700000.0f);
    testApproximateSSE(10000000.0f);
    testApproximateSSE(27841456468.0f);
}
2343 
/// Compute the absolute differences of packed unsigned 8-bit integers in `a` and `b`, then horizontally sum each 
/// consecutive 8 differences to produce four unsigned 16-bit integers, and pack these unsigned 16-bit integers in the 
/// low 16 bits of result.
__m64 _mm_sad_pu8 (__m64 a, __m64 b) pure @safe
{
    // Widen both operands to 128-bit, reuse the SSE2 PSADBW, narrow back.
    __m128i wideA = to_m128i(a);
    __m128i wideB = to_m128i(b);
    return to_m64(_mm_sad_epu8(wideA, wideB));
}
2351 
/// Set the exception mask bits of the MXCSR control and status register to the value in unsigned 32-bit integer 
/// `_MM_MASK_xxxx`. The exception mask may contain any of the following flags: `_MM_MASK_INVALID`, `_MM_MASK_DIV_ZERO`,
/// `_MM_MASK_DENORM`, `_MM_MASK_OVERFLOW`, `_MM_MASK_UNDERFLOW`, `_MM_MASK_INEXACT`.
void _MM_SET_EXCEPTION_MASK(int _MM_MASK_xxxx) @safe
{
    // Note: unsupported on ARM
    // Read-modify-write: clear the mask field, then set the requested bits.
    uint csr = _mm_getcsr();
    csr &= ~_MM_MASK_MASK;
    csr |= _MM_MASK_xxxx;
    _mm_setcsr(csr);
}
2360 
/// Set the exception state bits of the MXCSR control and status register to the value in unsigned 32-bit integer 
/// `_MM_EXCEPT_xxxx`. The exception state may contain any of the following flags: `_MM_EXCEPT_INVALID`, 
/// `_MM_EXCEPT_DIV_ZERO`, `_MM_EXCEPT_DENORM`, `_MM_EXCEPT_OVERFLOW`, `_MM_EXCEPT_UNDERFLOW`, `_MM_EXCEPT_INEXACT`.
void _MM_SET_EXCEPTION_STATE(int _MM_EXCEPT_xxxx) @safe
{
    // Note: unsupported on ARM
    // Read-modify-write: clear the exception-state field, then set the requested bits.
    uint csr = _mm_getcsr();
    csr &= ~_MM_EXCEPT_MASK;
    csr |= _MM_EXCEPT_xxxx;
    _mm_setcsr(csr);
}
2369 
/// Set the flush zero bits of the MXCSR control and status register to the value in unsigned 32-bit integer 
/// `_MM_FLUSH_xxxx`. The flush zero may contain any of the following flags: `_MM_FLUSH_ZERO_ON` or `_MM_FLUSH_ZERO_OFF`.
void _MM_SET_FLUSH_ZERO_MODE(int _MM_FLUSH_xxxx) @safe
{
    // Read-modify-write: clear the FTZ bit, then set the requested mode.
    uint csr = _mm_getcsr();
    csr = (csr & ~_MM_FLUSH_ZERO_MASK) | _MM_FLUSH_xxxx;
    _mm_setcsr(csr);
}
2376 
/// Set packed single-precision (32-bit) floating-point elements with the supplied values.
/// Note: `e0` lands in the lowest lane, following Intel argument order.
__m128 _mm_set_ps (float e3, float e2, float e1, float e0) pure @trusted
{
    __m128 result;
    result.ptr[3] = e3;
    result.ptr[2] = e2;
    result.ptr[1] = e1;
    result.ptr[0] = e0;
    return result;
}
unittest
{
    __m128 A = _mm_set_ps(3, 2, 1, 546);
    float[4] expected = [546.0f, 1.0f, 2.0f, 3.0f];
    assert(A.array == expected);

    // Very old LDC, like 1.17, cannot cast __vector at CT
    static if (__VERSION__ >= 2094)
    {
        static immutable B = _mm_set_ps(3, 2, 1, 546);
        enum C = _mm_set_ps(3, 2, 1, 546);
    }
}
2400 
/// Older name from the original Intel API; identical to `_mm_set1_ps`.
deprecated("Use _mm_set1_ps instead") alias _mm_set_ps1 = _mm_set1_ps; ///
2402 
/// Set the rounding mode bits of the MXCSR control and status register to the value in unsigned 32-bit integer 
/// `_MM_ROUND_xxxx`. The rounding mode may contain any of the following flags: `_MM_ROUND_NEAREST`, `_MM_ROUND_DOWN`, 
/// `_MM_ROUND_UP`, `_MM_ROUND_TOWARD_ZERO`.
void _MM_SET_ROUNDING_MODE(int _MM_ROUND_xxxx) @safe
{
    // Work-around for https://gcc.gnu.org/bugzilla/show_bug.cgi?id=98607
    version(GNU) asm nothrow @nogc @trusted { "" : : : "memory"; }
    // Read-modify-write: clear the rounding field, then set the requested mode.
    uint csr = _mm_getcsr();
    csr = (csr & ~_MM_ROUND_MASK) | _MM_ROUND_xxxx;
    _mm_setcsr(csr);
}
2412 
/// Copy single-precision (32-bit) floating-point element `a` to the lower element of result, and zero the upper 3 elements.
__m128 _mm_set_ss (float a) pure @trusted
{
    static if (DMD_with_DSIMD)
    {
        return cast(__m128) __simd(XMM.LODSS, a);
    }
    else
    {
        // Start from an all-zero vector, then write lane 0.
        __m128 r = _mm_setzero_ps();
        r.ptr[0] = a;
        return r;
    }
}
unittest
{
    float[4] correct = [42.0f, 0.0f, 0.0f, 0.0f];
    __m128 A = _mm_set_ss(42.0f);
    assert(A.array == correct);
}
2433 
/// Broadcast single-precision (32-bit) floating-point value `a` to all elements.
__m128 _mm_set1_ps (float a) pure @trusted
{
    pragma(inline, true);
    // D vector initialization from a scalar broadcasts to all lanes.
    __m128 r = a;
    return r;
}
unittest
{
    float[4] correct = [42.0f, 42.0f, 42.0f, 42.0f];
    __m128 A = _mm_set1_ps(42.0f);
    assert(A.array == correct);
    
    static if (__VERSION__ >= 2094)
    {
        enum __m128 B = _mm_set1_ps(2.4f);
    }
}
2452 
/// Set the MXCSR control and status register with the value in unsigned 32-bit integer `controlWord`.
void _mm_setcsr(uint controlWord) @trusted
{
    static if (LDC_with_ARM)
    {
        // Convert from SSE to ARM control word. This is done _partially_
        // and only support rounding mode changes.

        // "To alter some bits of a VFP system register without 
        // affecting other bits, use a read-modify-write procedure"
        uint fpscr = arm_get_fpcr();
        
        // Bits 23 to 22 are rounding modes, however not used in NEON
        fpscr = fpscr & ~_MM_ROUND_MASK_ARM;
        switch(controlWord & _MM_ROUND_MASK)
        {
            default:
            case _MM_ROUND_NEAREST:     fpscr |= _MM_ROUND_NEAREST_ARM;     break;
            case _MM_ROUND_DOWN:        fpscr |= _MM_ROUND_DOWN_ARM;        break;
            case _MM_ROUND_UP:          fpscr |= _MM_ROUND_UP_ARM;          break;
            case _MM_ROUND_TOWARD_ZERO: fpscr |= _MM_ROUND_TOWARD_ZERO_ARM; break;
        }
        // Translate the flush-to-zero (FTZ) bit as well.
        fpscr = fpscr & ~_MM_FLUSH_ZERO_MASK_ARM;
        if (controlWord & _MM_FLUSH_ZERO_MASK)
            fpscr |= _MM_FLUSH_ZERO_MASK_ARM;
        arm_set_fpcr(fpscr);
    }
    else version(GNU)
    {
        static if (GDC_with_SSE)
        {
            // Work-around for https://gcc.gnu.org/bugzilla/show_bug.cgi?id=98607
            version(GNU) asm nothrow @nogc @trusted { "" : : : "memory"; }
            __builtin_ia32_ldmxcsr(controlWord);
        }
        else version(X86)
        {
            asm nothrow @nogc @trusted
            {
                "ldmxcsr %0;\n" 
                  : 
                  : "m" (controlWord)
                  : ;
            }
        }
        else return __warn_noop(); // no MXCSR on this target: silently do nothing
    }
    else version (InlineX86Asm)
    {
        asm nothrow @nogc @trusted
        {
            ldmxcsr controlWord;
        }
    }
    else
        static assert(0, "Not yet supported");
}
unittest
{
    // Round-trip the current value; must not trap or alter FP behaviour.
    _mm_setcsr(_mm_getcsr());
}
2514 
/// Set packed single-precision (32-bit) floating-point elements with the supplied values in reverse order.
__m128 _mm_setr_ps (float e3, float e2, float e1, float e0) pure @trusted
{
    pragma(inline, true);

    if (__ctfe)
    {  
        // CTFE path: plain (zero) initialization, no `= void`.
        __m128 r;
        r.ptr[0] = e3;
        r.ptr[1] = e2;
        r.ptr[2] = e1;
        r.ptr[3] = e0;
        return r;
    }
    else
    {
        // This small = void here wins a bit in all optimization levels in GDC
        // and in -O0 in LDC.
        __m128 r = void;
        r.ptr[0] = e3;
        r.ptr[1] = e2;
        r.ptr[2] = e1;
        r.ptr[3] = e0;
        return r;
    }
}
unittest
{
    __m128 A = _mm_setr_ps(3, 2, 1, 546);
    float[4] correct = [3.0f, 2.0f, 1.0f, 546.0f];
    assert(A.array == correct);

    // Very old LDC, like 1.17, cannot cast __vector at CT
    static if (__VERSION__ >= 2094)
    {
        static immutable B = _mm_setr_ps(3, 2, 1, 546);
        enum C = _mm_setr_ps(3, 2, 1, 546);
    }
}
2554 
/// Return vector of type `__m128` with all elements set to zero.
__m128 _mm_setzero_ps() pure @trusted
{
    pragma(inline, true);

    // Note: for all compilers, this works best in debug builds, and in DMD -O
    // Zero-initialize as an integer vector, then reinterpret as floats.
    int4 r; 
    return cast(__m128)r;
}
unittest
{
    __m128 R = _mm_setzero_ps();
    float[4] correct = [0.0f, 0, 0, 0];
    assert(R.array == correct);
}
2570 
/// Do a serializing operation on all store-to-memory instructions that were issued prior 
/// to this instruction. Guarantees that every store instruction that precedes, in program order, 
/// is globally visible before any store instruction which follows the fence in program order.
void _mm_sfence() @trusted
{
    version(GNU)
    {
        static if (GDC_with_SSE)
        {
            __builtin_ia32_sfence();
        }
        else version(X86)
        {
            asm pure nothrow @nogc @trusted
            {
                "sfence;\n" : : : ;
            }
        }
        else return __warn_noop(); // unknown GNU target: warn and do nothing
    }
    else static if (LDC_with_SSE)
    {
        __builtin_ia32_sfence();
    }
    else static if (DMD_with_asm)
    {
        // PERF: can't be inlined in DMD, probably because of that assembly.
        asm nothrow @nogc pure @trusted
        {
            sfence;
        }
    }
    else static if (LDC_with_ARM64)
    {
        __builtin_arm_dmb(10); // dmb ishst
    }
    else version(LDC)
    {
        // When the architecture is unknown, generate a full memory barrier,
        // as the semantics of sfence do not really match those of atomics.
        llvm_memory_fence();
    }
    else
        static assert(false);
}
unittest
{
    _mm_sfence();
}
2620 
2621 
/// Shuffle 16-bit integers in `a` using the control in `imm8`.
/// Warning: the immediate shuffle value `imm8` is given at compile-time instead of runtime.
__m64 _mm_shuffle_pi16(int imm8)(__m64 a) pure @trusted
{
    // PERF DMD + D_SIMD
    version(LDC)
    {
        // Each 2-bit field of imm8 selects one of the 4 source lanes.
        return cast(__m64) shufflevectorLDC!(short4, ( (imm8 >> 0) & 3 ),
                                                     ( (imm8 >> 2) & 3 ),
                                                     ( (imm8 >> 4) & 3 ),
                                                     ( (imm8 >> 6) & 3 ))(cast(short4)a, cast(short4)a);
    }
    else
    {
        // GDC optimizes that correctly starting with -O2
        short4 sa = cast(short4)a;
        short4 r = void;
        r.ptr[0] = sa.array[ (imm8 >> 0) & 3 ];
        r.ptr[1] = sa.array[ (imm8 >> 2) & 3 ];
        r.ptr[2] = sa.array[ (imm8 >> 4) & 3 ];
        r.ptr[3] = sa.array[ (imm8 >> 6) & 3 ];
        return cast(__m64)r;
    }
}
unittest
{
    __m64 A = _mm_setr_pi16(0, 1, 2, 3);
    enum int SHUFFLE = _MM_SHUFFLE(0, 1, 2, 3);
    short4 B = cast(short4) _mm_shuffle_pi16!SHUFFLE(A);
    short[4] expectedB = [ 3, 2, 1, 0 ];
    assert(B.array == expectedB);
}
2652 
/// Shuffle single-precision (32-bit) floating-point elements in `a` and `b` using the control in `imm8`.
/// The two low result lanes come from `a`, the two high result lanes come from `b`.
/// Warning: the immediate shuffle value `imm8` is given at compile-time instead of runtime.
__m128 _mm_shuffle_ps(ubyte imm8)(__m128 a, __m128 b) pure @trusted
{
    static if (GDC_with_SSE)
    {
        return __builtin_ia32_shufps(a, b, imm8);
    }
    else static if (DMD_with_DSIMD)
    {
        return cast(__m128) __simd(XMM.SHUFPS, a, b, imm8);
    }
    else static if (LDC_with_optimizations)
    {
        // Indices 4..7 address lanes of `b` in LLVM's shufflevector.
        return shufflevectorLDC!(__m128, imm8 & 3, (imm8>>2) & 3, 
                                 4 + ((imm8>>4) & 3), 4 + ((imm8>>6) & 3) )(a, b);
    }
    else
    {
        // Generic scalar fallback.
        float4 r = void;
        r.ptr[0] = a.array[ (imm8 >> 0) & 3 ];
        r.ptr[1] = a.array[ (imm8 >> 2) & 3 ];
        r.ptr[2] = b.array[ (imm8 >> 4) & 3 ];
        r.ptr[3] = b.array[ (imm8 >> 6) & 3 ];
        return r;
    }
}
unittest
{
    __m128 A = _mm_setr_ps(0, 1, 2, 3);
    __m128 B = _mm_setr_ps(4, 5, 6, 7);
    __m128 C = _mm_shuffle_ps!0x9c(A, B);
    float[4] correct = [0.0f, 3, 5, 6];
    assert(C.array == correct);
}
2688 
/// Compute the square root of packed single-precision (32-bit) floating-point elements in `a`.
__m128 _mm_sqrt_ps(__m128 a) @trusted
{
    static if (GDC_with_SSE)
    {
        return __builtin_ia32_sqrtps(a);
    }
    else static if (DMD_with_DSIMD)
    {
        return cast(__m128) __simd(XMM.SQRTPS, a);
    }
    else version(LDC)
    {
        // Disappeared with LDC 1.11
        static if (__VERSION__ < 2081)
            return __builtin_ia32_sqrtps(a);
        else
        {
            // PERF: use llvm_sqrt on the vector, works better
            // Scalar per-lane square roots.
            a[0] = llvm_sqrt(a[0]);
            a[1] = llvm_sqrt(a[1]);
            a[2] = llvm_sqrt(a[2]);
            a[3] = llvm_sqrt(a[3]);
            return a;
        }
    }
    else
    {
        a.ptr[0] = sqrt(a.array[0]);
        a.ptr[1] = sqrt(a.array[1]);
        a.ptr[2] = sqrt(a.array[2]);
        a.ptr[3] = sqrt(a.array[3]);
        return a;
    }
}
unittest
{
    __m128 A = _mm_sqrt_ps(_mm_set1_ps(4.0f));
    assert(A.array[0] == 2.0f);
    assert(A.array[1] == 2.0f);
    assert(A.array[2] == 2.0f);
    assert(A.array[3] == 2.0f);
}
2732 
/// Compute the square root of the lower single-precision (32-bit) floating-point element in `a`, store it in the lower
/// element, and copy the upper 3 packed elements from `a` to the upper elements of result.
__m128 _mm_sqrt_ss(__m128 a) @trusted
{
    static if (GDC_with_SSE)
    {
        return __builtin_ia32_sqrtss(a);
    }
    // PERF DMD
    // TODO: enable when https://issues.dlang.org/show_bug.cgi?id=23437 is fixed for good
    /*else static if (DMD_with_DSIMD)
    {
        return cast(__m128) __simd(XMM.SQRTSS, a);
    }*/
    else version(LDC)
    {
        // Only lane 0 is modified; lanes 1..3 pass through unchanged.
        a.ptr[0] = llvm_sqrt(a.array[0]);
        return a;
    }
    else
    {   
        a.ptr[0] = sqrt(a.array[0]);
        return a;
    }
}
unittest
{
    __m128 A = _mm_sqrt_ss(_mm_set1_ps(4.0f));
    assert(A.array[0] == 2.0f);
    assert(A.array[1] == 4.0f);
    assert(A.array[2] == 4.0f);
    assert(A.array[3] == 4.0f);
}
2766 
/// Store 128-bits (composed of 4 packed single-precision (32-bit) floating-point elements) from `a` into memory. 
/// `mem_addr` must be aligned on a 16-byte boundary or a general-protection exception may be generated.
void _mm_store_ps (float* mem_addr, __m128 a) pure
{
    pragma(inline, true);
    // One aligned 128-bit store.
    *(cast(__m128*) mem_addr) = a;
}
2775 
deprecated("Use _mm_store1_ps instead") alias _mm_store_ps1 = _mm_store1_ps; /// Deprecated synonym of `_mm_store1_ps`.
2777 
/// Store the lower single-precision (32-bit) floating-point element from `a` into memory. 
/// `mem_addr` does not need to be aligned on any particular boundary.
void _mm_store_ss (float* mem_addr, __m128 a) pure @safe
{
    pragma(inline, true);
    // Only lane 0 is written.
    float lowest = a.array[0];
    *mem_addr = lowest;
}
unittest
{
    float dest;
    _mm_store_ss(&dest, _mm_set_ps(3, 2, 1, 546));
    assert(dest == 546);
}
2791 
/// Store the lower single-precision (32-bit) floating-point element from `a` into 4 contiguous elements in memory. 
/// `mem_addr` must be aligned on a 16-byte boundary or a general-protection exception may be generated.
void _mm_store1_ps(float* mem_addr, __m128 a) pure @trusted // FUTURE: shouldn't be trusted, see #62
{
    __m128* aligned = cast(__m128*)mem_addr;
    // Note: `static if` does not open a scope in D, so `r` declared in either
    // branch is still visible for the store below.
    static if (DMD_with_DSIMD)
    {
        // Broadcast lane 0 with a shuffle (control 0 selects lane 0 four times).
        __m128 r = cast(__m128) __simd(XMM.SHUFPS, a, a, 0);
    }
    else
    {
        __m128 r; // PERF =void;
        r.ptr[0] = a.array[0];
        r.ptr[1] = a.array[0];
        r.ptr[2] = a.array[0];
        r.ptr[3] = a.array[0];
    }
    *aligned = r;
}
unittest
{
    align(16) float[4] A;
    _mm_store1_ps(A.ptr, _mm_set_ss(42.0f));
    float[4] correct = [42.0f, 42, 42, 42];
    assert(A == correct);
}
2818 
/// Store the upper 2 single-precision (32-bit) floating-point elements from `a` into memory.
void _mm_storeh_pi(__m64* p, __m128 a) pure @trusted
{
    pragma(inline, true);
    // Reinterpret `a` as two 64-bit lanes and store the high one.
    long2 lanes = cast(long2)a;
    (*p).ptr[0] = lanes.array[1];
}
unittest
{
    __m64 dest = _mm_setzero_si64();
    long2 src = [13, 25];
    _mm_storeh_pi(&dest, cast(__m128)src);
    assert(dest.array[0] == 25);
}
2833 
/// Store the lower 2 single-precision (32-bit) floating-point elements from `a` into memory.
void _mm_storel_pi(__m64* p, __m128 a) pure @trusted
{
    pragma(inline, true);
    // Reinterpret `a` as two 64-bit lanes and store the low one.
    long2 lanes = cast(long2)a;
    (*p).ptr[0] = lanes.array[0];
}
unittest
{
    __m64 dest = _mm_setzero_si64();
    long2 src = [13, 25];
    _mm_storel_pi(&dest, cast(__m128)src);
    assert(dest.array[0] == 13);
}
2848 
/// Store 4 single-precision (32-bit) floating-point elements from `a` into memory in reverse order. 
/// `mem_addr` must be aligned on a 16-byte boundary or a general-protection exception may be generated.
void _mm_storer_ps(float* mem_addr, __m128 a) pure @trusted // FUTURE should not be trusted
{
    __m128* aligned = cast(__m128*)mem_addr;
    // Note: `static if` does not open a scope in D, so `r` declared in either
    // branch is still visible for the store below.
    static if (DMD_with_DSIMD)
    {
        // 27 == _MM_SHUFFLE(0,1,2,3): full lane reversal.
        __m128 r = cast(__m128) __simd(XMM.SHUFPS, a, a, 27);
    }
    else
    {
        __m128 r; // PERF =void;
        r.ptr[0] = a.array[3];
        r.ptr[1] = a.array[2];
        r.ptr[2] = a.array[1];
        r.ptr[3] = a.array[0];
    }
    *aligned = r;
}
unittest
{
    align(16) float[4] A;
    _mm_storer_ps(A.ptr, _mm_setr_ps(1.0f, 2, 3, 4));
    float[4] correct = [4.0f, 3.0f, 2.0f, 1.0f];
    assert(A == correct);
}
2875 
/// Store 128-bits (composed of 4 packed single-precision (32-bit) floating-point elements) from `a` into memory. 
/// `mem_addr` does not need to be aligned on any particular boundary.
void _mm_storeu_ps(float* mem_addr, __m128 a) pure @trusted // FUTURE should not be trusted, see #62
{
    pragma(inline, true);
    static if (DMD_with_DSIMD)
    {
        cast(void) __simd_sto(XMM.STOUPS, *cast(void16*)(cast(float*)mem_addr), a);
    }
    else static if (GDC_with_SSE)
    {
        __builtin_ia32_storeups(mem_addr, a); // better in -O0
    }
    else static if (LDC_with_optimizations)
    {
        storeUnaligned!(float4)(a, mem_addr);
    }
    else
    {
        // Scalar fallback: four unaligned 32-bit stores.
        mem_addr[0] = a.array[0];
        mem_addr[1] = a.array[1];
        mem_addr[2] = a.array[2];
        mem_addr[3] = a.array[3];
    }
}
unittest
{
    // Store to a deliberately misaligned address (offset by one float).
    __m128 A = _mm_setr_ps(1.0f, 2, 3, 4);
    align(16) float[6] R = [0.0f, 0, 0, 0, 0, 0];
    float[4] correct = [1.0f, 2, 3, 4];
    _mm_storeu_ps(&R[1], A);
    assert(R[1..5] == correct);
}
2909 
/// Store 64-bits of integer data from `a` into memory using a non-temporal memory hint.
/// Note: non-temporal stores should be followed by `_mm_sfence()` for reader threads.
void _mm_stream_pi (__m64* mem_addr, __m64 a) pure @trusted
{
    // Defer to the 64-bit integer non-temporal store.
    _mm_stream_si64(cast(long*)mem_addr, a.array[0]);
}
2916 
/// Store 128-bits (composed of 4 packed single-precision (32-bit) floating-point elements) from 
/// `a`s into memory using a non-temporal memory hint. `mem_addr` must be aligned on a 16-byte 
/// boundary or a general-protection exception may be generated.
/// Note: non-temporal stores should be followed by `_mm_sfence()` for reader threads.
void _mm_stream_ps (float* mem_addr, __m128 a)
{
    // TODO report this bug: DMD generates no stream instruction when using D_SIMD
    static if (GDC_with_SSE)
    {
        return __builtin_ia32_movntps(mem_addr, a); 
    }
    else static if (LDC_with_InlineIREx && LDC_with_optimizations)
    {
        // Emit an LLVM store with !nontemporal metadata (movnt on x86).
        enum prefix = `!0 = !{ i32 1 }`;
        enum ir = `
            store <4 x float> %1, <4 x float>* %0, align 16, !nontemporal !0
            ret void`;
        LDCInlineIREx!(prefix, ir, "", void, __m128*, float4)(cast(__m128*)mem_addr, a);

    }
    else
    {
        // Regular store instead.
        __m128* dest = cast(__m128*)mem_addr;
        *dest = a; // it's a regular move instead
    }
}
unittest
{
    align(16) float[4] A;
    _mm_stream_ps(A.ptr, _mm_set1_ps(78.0f));
    assert(A[0] == 78.0f && A[1] == 78.0f && A[2] == 78.0f && A[3] == 78.0f);
}
2950 
/// Subtract packed single-precision (32-bit) floating-point elements in `b` from packed single-precision (32-bit) 
/// floating-point elements in `a`.
__m128 _mm_sub_ps(__m128 a, __m128 b) pure @safe
{
    pragma(inline, true);
    __m128 difference = a - b;
    return difference;
}
unittest
{
    __m128 v = [1.5f, -2.0f, 3.0f, 1.0f];
    v = _mm_sub_ps(v, v);
    float[4] zeros = [0.0f, 0.0f, 0.0f, 0.0f];
    assert(v.array == zeros);
}
2965 
/// Subtract the lower single-precision (32-bit) floating-point element in `b` from the lower single-precision (32-bit)
/// floating-point element in `a`, store the subtraction result in the lower element of result, and copy the upper 3 
/// packed elements from a to the upper elements of result.
__m128 _mm_sub_ss(__m128 a, __m128 b) pure @safe
{
    static if (DMD_with_DSIMD)
        return cast(__m128) __simd(XMM.SUBSS, a, b);
    else static if (GDC_with_SSE)
        return __builtin_ia32_subss(a, b);
    else
    {
        // Scalar fallback: only lane 0 changes.
        a[0] -= b[0];
        return a;
    }
}
unittest
{
    __m128 a = [1.5f, -2.0f, 3.0f, 1.0f];
    a = _mm_sub_ss(a, a);
    float[4] correct = [0.0f, -2.0, 3.0f, 1.0f];
    assert(a.array == correct);
}
2988 
/// Transpose the 4x4 matrix formed by the 4 rows of single-precision (32-bit) floating-point elements in row0, row1, 
/// row2, and row3, and store the transposed matrix in these vectors (row0 now contains column 0, etc.).
void _MM_TRANSPOSE4_PS (ref __m128 row0, ref __m128 row1, ref __m128 row2, ref __m128 row3) pure @safe
{
    // Classic two-stage transpose: interleave row pairs, then recombine halves.
    __m128 lo01 = _mm_unpacklo_ps(row0, row1);
    __m128 lo23 = _mm_unpacklo_ps(row2, row3);
    __m128 hi01 = _mm_unpackhi_ps(row0, row1);
    __m128 hi23 = _mm_unpackhi_ps(row2, row3);
    row0 = _mm_movelh_ps(lo01, lo23);
    row1 = _mm_movehl_ps(lo23, lo01);
    row2 = _mm_movelh_ps(hi01, hi23);
    row3 = _mm_movehl_ps(hi23, hi01);
}
unittest
{
    __m128 row0 = _mm_setr_ps(0, 1, 2, 3);
    __m128 row1 = _mm_setr_ps(4, 5, 6, 7);
    __m128 row2 = _mm_setr_ps(8, 9, 10, 11);
    __m128 row3 = _mm_setr_ps(12, 13, 14, 15);
    _MM_TRANSPOSE4_PS(row0, row1, row2, row3);
    float[4] col0 = [0.0f, 4, 8, 12];
    float[4] col1 = [1.0f, 5, 9, 13];
    float[4] col2 = [2.0f, 6, 10, 14];
    float[4] col3 = [3.0f, 7, 11, 15];
    assert(row0.array == col0);
    assert(row1.array == col1);
    assert(row2.array == col2);
    assert(row3.array == col3);
}
3019 
/// Transpose the 4x4 matrix formed by the 4 rows of 32-bit integer elements in row0, row1, 
/// row2, and row3, and store the transposed matrix in these vectors (row0 now contains column 0, etc.).
void _MM_TRANSPOSE4_EPI32 (ref __m128i row0, ref __m128i row1, ref __m128i row2, ref __m128i row3) pure @safe // #BONUS
{
    // Same two-stage transpose as _MM_TRANSPOSE4_PS, on 32-bit integers.
    __m128i lo01 = _mm_unpacklo_epi32(row0, row1);
    __m128i lo23 = _mm_unpacklo_epi32(row2, row3);
    __m128i hi01 = _mm_unpackhi_epi32(row0, row1);
    __m128i hi23 = _mm_unpackhi_epi32(row2, row3);
    row0 = _mm_movelh_epi32(lo01, lo23);
    row1 = _mm_movehl_epi32(lo23, lo01);
    row2 = _mm_movelh_epi32(hi01, hi23);
    row3 = _mm_movehl_epi32(hi23, hi01);
}
unittest
{
    __m128i row0 = _mm_setr_epi32(0, 1, 2, 3);
    __m128i row1 = _mm_set_epi32(7, 6, 5, 4);
    __m128i row2 = _mm_setr_epi32(8, 9, 10, 11);
    __m128i row3 = _mm_setr_epi32(12, 13, 14, 15);
    _MM_TRANSPOSE4_EPI32(row0, row1, row2, row3);
    int[4] col0 = [0, 4, 8, 12];
    int[4] col1 = [1, 5, 9, 13];
    int[4] col2 = [2, 6, 10, 14];
    int[4] col3 = [3, 7, 11, 15];
    assert(row0.array == col0);
    assert(row1.array == col1);
    assert(row2.array == col2);
    assert(row3.array == col3);
}
3050 
// Note: the only difference between these intrinsics is the signalling
//       behaviour of quiet NaNs. This is incorrect but the case where
//       you would want to differentiate between qNaN and sNaN and then
//       treat them differently on purpose seems extremely rare.
//       (The ucomi* forms should not raise on qNaN, the comi* forms should.)
alias _mm_ucomieq_ss = _mm_comieq_ss;   ///
alias _mm_ucomige_ss = _mm_comige_ss;   ///
alias _mm_ucomigt_ss = _mm_comigt_ss;   ///
alias _mm_ucomile_ss = _mm_comile_ss;   ///
alias _mm_ucomilt_ss = _mm_comilt_ss;   ///
alias _mm_ucomineq_ss = _mm_comineq_ss; ///
3061 
/// Return vector of type `__m128` with undefined elements.
__m128 _mm_undefined_ps() pure @safe
{
    pragma(inline, true);
    // Deliberately left uninitialized, mirroring the intrinsic's contract.
    __m128 result = void;
    return result;
}
3069 
/// Unpack and interleave single-precision (32-bit) floating-point elements from the high half `a` and `b`.
/// Result is `[a2, b2, a3, b3]`.
__m128 _mm_unpackhi_ps (__m128 a, __m128 b) pure @trusted
{
    // PERF GDC use intrinsic
    static if (DMD_with_DSIMD)
    {
        return cast(__m128) __simd(XMM.UNPCKHPS, a, b);
    }
    else static if (LDC_with_optimizations)
    {
        enum ir = `%r = shufflevector <4 x float> %0, <4 x float> %1, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
                  ret <4 x float> %r`;
        return LDCInlineIR!(ir, float4, float4, float4)(a, b);
    }
    else
    {
        __m128 r; // PERF =void;
        r.ptr[0] = a.array[2];
        r.ptr[1] = b.array[2];
        r.ptr[2] = a.array[3];
        r.ptr[3] = b.array[3];
        return r;
    }
}
unittest
{
    __m128 A = _mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f);
    __m128 B = _mm_setr_ps(5.0f, 6.0f, 7.0f, 8.0f);
    __m128 R = _mm_unpackhi_ps(A, B);
    float[4] correct = [3.0f, 7.0f, 4.0f, 8.0f];
    assert(R.array == correct);
}
3102 
/// Unpack and interleave single-precision (32-bit) floating-point elements from the low half of `a` and `b`.
/// Result is `[a0, b0, a1, b1]`.
__m128 _mm_unpacklo_ps (__m128 a, __m128 b) pure @trusted
{
    // PERF GDC use intrinsic
    static if (DMD_with_DSIMD)
    {
        return cast(__m128) __simd(XMM.UNPCKLPS, a, b);
    }
    else static if (LDC_with_optimizations)
    {
        enum ir = `%r = shufflevector <4 x float> %0, <4 x float> %1, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
                   ret <4 x float> %r`;
        return LDCInlineIR!(ir, float4, float4, float4)(a, b);
    }
    else
    {
        __m128 r; // PERF =void;
        r.ptr[0] = a.array[0];
        r.ptr[1] = b.array[0];
        r.ptr[2] = a.array[1];
        r.ptr[3] = b.array[1];
        return r;
    }
}
unittest
{
    __m128 A = _mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f);
    __m128 B = _mm_setr_ps(5.0f, 6.0f, 7.0f, 8.0f);
    __m128 R = _mm_unpacklo_ps(A, B);
    float[4] correct = [1.0f, 5.0f, 2.0f, 6.0f];
    assert(R.array == correct);
}
3135 
/// Compute the bitwise XOR of packed single-precision (32-bit) floating-point elements in `a` and `b`.
__m128 _mm_xor_ps (__m128 a, __m128 b) pure @safe
{
    static if (DMD_with_DSIMD)
    {
        return cast(__m128) __simd(XMM.XORPS, cast(void16) a, cast(void16) b);
    }
    else
    {
        // XOR the raw bit patterns through an integer-vector reinterpretation.
        return cast(__m128)(cast(__m128i)a ^ cast(__m128i)b);
    }
}
unittest
{
    // XOR with the sign-bit mask flips the sign of each lane.
    __m128 A = cast(__m128) _mm_set1_epi32(0x80000000);
    __m128 B = _mm_setr_ps(4.0f, -5.0, -9.5f, float.infinity);
    __m128 C = _mm_xor_ps(A, B);
    float[4] correct = [-4.0f, 5.0, 9.5f, -float.infinity];
    assert(C.array == correct);
}
3156 
3157 private
3158 {
3159     // Returns: `true` if the pointer is suitably aligned.
3160     bool isPointerAligned(void* p, size_t alignment) pure
3161     {
3162         assert(alignment != 0);
3163         return ( cast(size_t)p & (alignment - 1) ) == 0;
3164     }
3165 
3166     // Returns: next pointer aligned with alignment bytes.
3167     void* nextAlignedPointer(void* start, size_t alignment) pure
3168     {
3169         return cast(void*)nextMultipleOf(cast(size_t)(start), alignment);
3170     }
3171 
3172     // Returns number of bytes to actually allocate when asking
3173     // for a particular alignment
3174     @nogc size_t requestedSize(size_t askedSize, size_t alignment) pure
3175     {
3176         enum size_t pointerSize = size_t.sizeof;
3177         return askedSize + alignment - 1 + pointerSize * 3;
3178     }
3179 
    // Store pointer given by malloc + size + alignment
    // Layout, growing downward from the returned aligned pointer:
    //   aligned - 1*word : raw malloc pointer
    //   aligned - 2*word : user-visible size
    //   aligned - 3*word : alignment
    @nogc void* storeRawPointerPlusInfo(void* raw, size_t size, size_t alignment) pure
    {
        enum size_t pointerSize = size_t.sizeof;
        // Skip past the three bookkeeping words before aligning up.
        char* start = cast(char*)raw + pointerSize * 3;
        void* aligned = nextAlignedPointer(start, alignment);
        void** rawLocation = cast(void**)(cast(char*)aligned - pointerSize);
        *rawLocation = raw;
        size_t* sizeLocation = cast(size_t*)(cast(char*)aligned - 2 * pointerSize);
        *sizeLocation = size;
        size_t* alignmentLocation = cast(size_t*)(cast(char*)aligned - 3 * pointerSize);
        *alignmentLocation = alignment;
        assert( isPointerAligned(aligned, alignment) );
        return aligned;
    }
3195 
3196     // Returns: x, multiple of powerOfTwo, so that x >= n.
3197     @nogc size_t nextMultipleOf(size_t n, size_t powerOfTwo) pure nothrow
3198     {
3199         // check power-of-two
3200         assert( (powerOfTwo != 0) && ((powerOfTwo & (powerOfTwo - 1)) == 0));
3201 
3202         size_t mask = ~(powerOfTwo - 1);
3203         return (n + powerOfTwo - 1) & mask;
3204     }
3205 
3206     void* alignedReallocImpl(bool PreserveDataIfResized)(void* aligned, size_t size, size_t alignment)
3207     {
3208         // Calling `_mm_realloc`, `_mm_realloc_discard` or `realloc`  with size 0 is 
3209         // Undefined Behavior, and not only since C23.
3210         // Moreover, alignedReallocImpl was buggy about it.
3211         assert(size != 0);
3212 
3213         if (aligned is null)
3214             return _mm_malloc(size, alignment);
3215 
3216         assert(alignment != 0);
3217         assert(isPointerAligned(aligned, alignment));
3218 
3219         size_t previousSize = *cast(size_t*)(cast(char*)aligned - size_t.sizeof * 2);
3220         size_t prevAlignment = *cast(size_t*)(cast(char*)aligned - size_t.sizeof * 3);
3221 
3222         // It is illegal to change the alignment across calls.
3223         assert(prevAlignment == alignment);
3224 
3225         void* raw = *cast(void**)(cast(char*)aligned - size_t.sizeof);
3226         size_t request = requestedSize(size, alignment);
3227         size_t previousRequest = requestedSize(previousSize, alignment);
3228         assert(previousRequest - request == previousSize - size);
3229 
3230         // Heuristic: if a requested size is within 50% to 100% of what is already allocated
3231         //            then exit with the same pointer
3232         // PERF it seems like `realloc` should do that, not us.
3233         if ( (previousRequest < request * 4) && (request <= previousRequest) )
3234             return aligned;
3235 
3236         void* newRaw = malloc(request);
3237         if (request > 0 && newRaw == null) // realloc(0) can validly return anything
3238             onOutOfMemoryError();
3239 
3240         void* newAligned = storeRawPointerPlusInfo(newRaw, size, alignment);
3241 
3242         static if (PreserveDataIfResized)
3243         {
3244             size_t minSize = size < previousSize ? size : previousSize;
3245             memcpy(newAligned, aligned, minSize); // ok to use memcpy: newAligned is into new memory, always different from aligned
3246         }
3247 
3248         // Free previous data
3249         _mm_free(aligned);
3250         assert(isPointerAligned(newAligned, alignment));
3251         return newAligned;
3252     }
3253 }
3254 
unittest
{
    // nextMultipleOf rounds up to the next multiple of a power of two.
    assert(nextMultipleOf(0, 4) == 0);
    assert(nextMultipleOf(1, 4) == 4);
    assert(nextMultipleOf(2, 4) == 4);
    assert(nextMultipleOf(3, 4) == 4);
    assert(nextMultipleOf(4, 4) == 4);
    assert(nextMultipleOf(5, 4) == 8);

    {
        void* p = _mm_malloc(23, 16);
        assert(p !is null);
        assert(((cast(size_t)p) & 0xf) == 0); // must be 16-byte aligned
        _mm_free(p);
    }

    // A zero-size allocation must still return a non-null, freeable pointer.
    void* nullAlloc = _mm_malloc(0, 32);
    assert(nullAlloc != null);
    _mm_free(nullAlloc);
}
3275 
unittest
{
    // In C23, it is UB to call realloc with 0 size.
    // Ensure this is not the case, ever: release with _mm_free, never _mm_realloc(p, 0).

    int alignment = 1;
    void* alloc = _mm_malloc(18, alignment);

    // DO NOT DO THAT:
    //_mm_realloc(alloc, 0, alignment);

    // DO THAT:
    _mm_free(alloc);
}
3290 
3291 
3292 // For some reason, order of declaration is important for this one
3293 // so it is misplaced.
3294 // Note: is just another name for _mm_cvtss_si32
3295 alias _mm_cvt_ss2si = _mm_cvtss_si32;