1 /**
2 * SSE intrinsics.
3 *
4 * Copyright: Copyright Guillaume Piolat 2016-2020.
5 * License:   $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
6 */
7 module inteli.xmmintrin;
8 
9 public import inteli.types;
10 
11 import inteli.internals;
12 
13 import inteli.mmx;
14 import inteli.emmintrin;
15 
16 import core.stdc.stdlib: malloc, free;
17 import core.exception: onOutOfMemoryError;
18 
19 version(D_InlineAsm_X86)
20     version = InlineX86Asm;
21 else version(D_InlineAsm_X86_64)
22     version = InlineX86Asm;
23 
24 
25 // SSE1
26 
27 nothrow @nogc:
28 
29 
30 enum int _MM_EXCEPT_INVALID    = 0x0001; /// MXCSR Exception states.
31 enum int _MM_EXCEPT_DENORM     = 0x0002; ///ditto
32 enum int _MM_EXCEPT_DIV_ZERO   = 0x0004; ///ditto
33 enum int _MM_EXCEPT_OVERFLOW   = 0x0008; ///ditto
34 enum int _MM_EXCEPT_UNDERFLOW  = 0x0010; ///ditto
35 enum int _MM_EXCEPT_INEXACT    = 0x0020; ///ditto
36 enum int _MM_EXCEPT_MASK       = 0x003f; /// MXCSR Exception states mask.
37 
38 enum int _MM_MASK_INVALID      = 0x0080; /// MXCSR Exception masks.
39 enum int _MM_MASK_DENORM       = 0x0100; ///ditto
40 enum int _MM_MASK_DIV_ZERO     = 0x0200; ///ditto
41 enum int _MM_MASK_OVERFLOW     = 0x0400; ///ditto
42 enum int _MM_MASK_UNDERFLOW    = 0x0800; ///ditto
43 enum int _MM_MASK_INEXACT      = 0x1000; ///ditto
44 enum int _MM_MASK_MASK         = 0x1f80; /// MXCSR Exception masks mask.
45 
46 enum int _MM_ROUND_NEAREST     = 0x0000; /// MXCSR Rounding mode.
47 enum int _MM_ROUND_DOWN        = 0x2000; ///ditto
48 enum int _MM_ROUND_UP          = 0x4000; ///ditto
49 enum int _MM_ROUND_TOWARD_ZERO = 0x6000; ///ditto
50 enum int _MM_ROUND_MASK        = 0x6000; /// MXCSR Rounding mode mask.
51 
52 enum int _MM_FLUSH_ZERO_MASK   = 0x8000; /// MXCSR Denormal flush to zero mask.
53 enum int _MM_FLUSH_ZERO_ON     = 0x8000; /// MXCSR Denormal flush to zero modes.
54 enum int _MM_FLUSH_ZERO_OFF    = 0x0000; ///ditto
55 
56 /// Add packed single-precision (32-bit) floating-point elements in `a` and `b`.
57 __m128 _mm_add_ps(__m128 a, __m128 b) pure @safe
58 {
59     return a + b;
60 }
61 unittest
62 {
63     __m128 a = [1, 2, 3, 4];
64     a = _mm_add_ps(a, a);
65     assert(a.array[0] == 2);
66     assert(a.array[1] == 4);
67     assert(a.array[2] == 6);
68     assert(a.array[3] == 8);
69 }
70 
71 /// Add the lower single-precision (32-bit) floating-point element 
72 /// in `a` and `b`, store the result in the lower element of result, 
73 /// and copy the upper 3 packed elements from `a` to the upper elements of result.
74 __m128 _mm_add_ss(__m128 a, __m128 b) pure @safe
75 {
76     static if (GDC_with_SSE)
77     {
78         return __builtin_ia32_addss(a, b);
79     }
80     else static if (DMD_with_DSIMD)
81     {
82         return cast(__m128) __simd(XMM.ADDSS, a, b);
83     }
84     else
85     {
86         a[0] += b[0];
87         return a;
88     }
89 }
90 unittest
91 {
92     __m128 a = [1, 2, 3, 4];
93     a = _mm_add_ss(a, a);
94     assert(a.array == [2.0f, 2, 3, 4]);
95 }
96 
97 /// Compute the bitwise AND of packed single-precision (32-bit) floating-point elements in `a` and `b`.
98 __m128 _mm_and_ps (__m128 a, __m128 b) pure @safe
99 {
100     return cast(__m128)(cast(__m128i)a & cast(__m128i)b);
101 }
102 unittest
103 {
104     float a = 4.32f;
105     float b = -78.99f;
106     int correct = (*cast(int*)(&a)) & (*cast(int*)(&b));
107     __m128 A = _mm_set_ps(a, b, a, b);
108     __m128 B = _mm_set_ps(b, a, b, a);
109     int4 R = cast(int4)( _mm_and_ps(A, B) );
110     assert(R.array[0] == correct);
111     assert(R.array[1] == correct);
112     assert(R.array[2] == correct);
113     assert(R.array[3] == correct);
114 }
115 
116 /// Compute the bitwise NOT of packed single-precision (32-bit) floating-point elements in `a` and then AND with `b`.
117 __m128 _mm_andnot_ps (__m128 a, __m128 b) pure @safe
118 {
119     static if (DMD_with_DSIMD)
120         return cast(__m128) __simd(XMM.ANDNPS, a, b);
121     else
122         return cast(__m128)( (~cast(__m128i)a) & cast(__m128i)b );
123 }
124 unittest
125 {
126     float a = 4.32f;
127     float b = -78.99f;
128     int correct  = ~(*cast(int*)(&a)) &  (*cast(int*)(&b));
129     int correct2 =  (*cast(int*)(&a)) & ~(*cast(int*)(&b));
130     __m128 A = _mm_set_ps(a, b, a, b);
131     __m128 B = _mm_set_ps(b, a, b, a);
132     int4 R = cast(int4)( _mm_andnot_ps(A, B) );
133     assert(R.array[0] == correct2);
134     assert(R.array[1] == correct);
135     assert(R.array[2] == correct2);
136     assert(R.array[3] == correct);
137 }
138 
/// Average packed unsigned 16-bit integers in `a` and `b`.
140 __m64 _mm_avg_pu16 (__m64 a, __m64 b) pure @safe
141 {
142     return to_m64(_mm_avg_epu16(to_m128i(a), to_m128i(b)));
143 }
144 
/// Average packed unsigned 8-bit integers in `a` and `b`.
146 __m64 _mm_avg_pu8 (__m64 a, __m64 b) pure @safe
147 {
148     return to_m64(_mm_avg_epu8(to_m128i(a), to_m128i(b)));
149 }
150 
151 /// Compare packed single-precision (32-bit) floating-point elements in `a` and `b` for equality.
152 __m128 _mm_cmpeq_ps (__m128 a, __m128 b) pure @safe
153 {
154     static if (DMD_with_DSIMD)
155         return cast(__m128) __simd(XMM.CMPPS, a, b, 0);
156     else
157         return cast(__m128) cmpps!(FPComparison.oeq)(a, b);
158 }
159 unittest
160 {
161     __m128 A = _mm_setr_ps(1.0f, 2.0f, 3.0f, float.nan);
162     __m128 B = _mm_setr_ps(3.0f, 2.0f, float.nan, float.nan);
163     __m128i R = cast(__m128i) _mm_cmpeq_ps(A, B);
164     int[4] correct = [0, -1, 0, 0];
165     assert(R.array == correct);
166 }
167 
168 /// Compare the lower single-precision (32-bit) floating-point elements in `a` and `b` for equality, 
169 /// and copy the upper 3 packed elements from `a` to the upper elements of result.
170 __m128 _mm_cmpeq_ss (__m128 a, __m128 b) pure @safe
171 {
172     static if (DMD_with_DSIMD)
173         return cast(__m128) __simd(XMM.CMPSS, a, b, 0);
174     else
175         return cast(__m128) cmpss!(FPComparison.oeq)(a, b);
176 }
177 unittest
178 {
179     __m128 A = _mm_setr_ps(3.0f, 0, 0, 0);
180     __m128 B = _mm_setr_ps(3.0f, float.nan, float.nan, float.nan);
181     __m128 C = _mm_setr_ps(2.0f, float.nan, float.nan, float.nan);
182     __m128 D = _mm_setr_ps(float.nan, float.nan, float.nan, float.nan);
183     __m128 E = _mm_setr_ps(4.0f, float.nan, float.nan, float.nan);
184     __m128i R1 = cast(__m128i) _mm_cmpeq_ss(A, B);
185     __m128i R2 = cast(__m128i) _mm_cmpeq_ss(A, C);
186     __m128i R3 = cast(__m128i) _mm_cmpeq_ss(A, D);
187     __m128i R4 = cast(__m128i) _mm_cmpeq_ss(A, E);
188     int[4] correct1 = [-1, 0, 0, 0];
189     int[4] correct2 = [0, 0, 0, 0];
190     int[4] correct3 = [0, 0, 0, 0];
191     int[4] correct4 = [0, 0, 0, 0];
192     assert(R1.array == correct1 && R2.array == correct2 && R3.array == correct3 && R4.array == correct4);
193 }
194 
195 /// Compare packed single-precision (32-bit) floating-point elements in `a` and `b` for greater-than-or-equal.
196 __m128 _mm_cmpge_ps (__m128 a, __m128 b) pure @safe
197 {
198     static if (DMD_with_DSIMD)
199         return cast(__m128) __simd(XMM.CMPPS, b, a, 2);
200     else
201         return cast(__m128) cmpps!(FPComparison.oge)(a, b);
202 }
203 unittest
204 {
205     __m128 A = _mm_setr_ps(1.0f, 2.0f, 3.0f, float.nan);
206     __m128 B = _mm_setr_ps(3.0f, 2.0f, 1.0f, float.nan);
207     __m128i R = cast(__m128i) _mm_cmpge_ps(A, B);
208     int[4] correct = [0, -1,-1, 0];
209     assert(R.array == correct);
210 }
211 
212 /// Compare the lower single-precision (32-bit) floating-point elements in `a` and `b` for greater-than-or-equal, 
213 /// and copy the upper 3 packed elements from `a` to the upper elements of result.
214 __m128 _mm_cmpge_ss (__m128 a, __m128 b) pure @safe
215 {
216     static if (DMD_with_DSIMD)
217     {
218         __m128 c = cast(__m128) __simd(XMM.CMPSS, b, a, 2);
219         a[0] = c[0];
220         return a;
221     }
222     else
223         return cast(__m128) cmpss!(FPComparison.oge)(a, b);
224 }
225 unittest
226 {
227     __m128 A = _mm_setr_ps(3.0f, 0, 0, 0);
228     __m128 B = _mm_setr_ps(3.0f, float.nan, float.nan, float.nan);
229     __m128 C = _mm_setr_ps(2.0f, float.nan, float.nan, float.nan);
230     __m128 D = _mm_setr_ps(float.nan, float.nan, float.nan, float.nan);
231     __m128 E = _mm_setr_ps(4.0f, float.nan, float.nan, float.nan);
232     __m128i R1 = cast(__m128i) _mm_cmpge_ss(A, B);
233     __m128i R2 = cast(__m128i) _mm_cmpge_ss(A, C);
234     __m128i R3 = cast(__m128i) _mm_cmpge_ss(A, D);
235     __m128i R4 = cast(__m128i) _mm_cmpge_ss(A, E);
236     int[4] correct1 = [-1, 0, 0, 0];
237     int[4] correct2 = [-1, 0, 0, 0];
238     int[4] correct3 = [0, 0, 0, 0];
239     int[4] correct4 = [0, 0, 0, 0];
240     assert(R1.array == correct1 && R2.array == correct2 && R3.array == correct3 && R4.array == correct4);
241 }
242 
243 /// Compare packed single-precision (32-bit) floating-point elements in `a` and `b` for greater-than.
244 __m128 _mm_cmpgt_ps (__m128 a, __m128 b) pure @safe
245 {
246     static if (DMD_with_DSIMD)
247         return cast(__m128) __simd(XMM.CMPPS, b, a, 1);
248     else
249         return cast(__m128) cmpps!(FPComparison.ogt)(a, b);
250 }
251 unittest
252 {
253     __m128 A = _mm_setr_ps(1.0f, 2.0f, 3.0f, float.nan);
254     __m128 B = _mm_setr_ps(3.0f, 2.0f, 1.0f, float.nan);
255     __m128i R = cast(__m128i) _mm_cmpgt_ps(A, B);
256     int[4] correct = [0, 0,-1, 0];
257     assert(R.array == correct);
258 }
259 
260 /// Compare the lower single-precision (32-bit) floating-point elements in `a` and `b` for greater-than, 
261 /// and copy the upper 3 packed elements from `a` to the upper elements of result.
262 __m128 _mm_cmpgt_ss (__m128 a, __m128 b) pure @safe
263 {
264     static if (DMD_with_DSIMD)
265     {
266         __m128 c = cast(__m128) __simd(XMM.CMPSS, b, a, 1);
267         a[0] = c[0];
268         return a;
269     }
270     else
271         return cast(__m128) cmpss!(FPComparison.ogt)(a, b);
272 }
273 unittest
274 {
275     __m128 A = _mm_setr_ps(3.0f, 0, 0, 0);
276     __m128 B = _mm_setr_ps(3.0f, float.nan, float.nan, float.nan);
277     __m128 C = _mm_setr_ps(2.0f, float.nan, float.nan, float.nan);
278     __m128 D = _mm_setr_ps(float.nan, float.nan, float.nan, float.nan);
279     __m128 E = _mm_setr_ps(4.0f, float.nan, float.nan, float.nan);
280     __m128i R1 = cast(__m128i) _mm_cmpgt_ss(A, B);
281     __m128i R2 = cast(__m128i) _mm_cmpgt_ss(A, C);
282     __m128i R3 = cast(__m128i) _mm_cmpgt_ss(A, D);
283     __m128i R4 = cast(__m128i) _mm_cmpgt_ss(A, E);
284     int[4] correct1 = [0, 0, 0, 0];
285     int[4] correct2 = [-1, 0, 0, 0];
286     int[4] correct3 = [0, 0, 0, 0];
287     int[4] correct4 = [0, 0, 0, 0];
288     assert(R1.array == correct1 && R2.array == correct2 && R3.array == correct3 && R4.array == correct4);
289 }
290 
291 /// Compare packed single-precision (32-bit) floating-point elements in `a` and `b` for less-than-or-equal.
292 __m128 _mm_cmple_ps (__m128 a, __m128 b) pure @safe
293 {
294     static if (DMD_with_DSIMD)
295         return cast(__m128) __simd(XMM.CMPPS, a, b, 2);
296     else
297         return cast(__m128) cmpps!(FPComparison.ole)(a, b);
298 }
299 unittest
300 {
301     __m128 A = _mm_setr_ps(1.0f, 2.0f, 3.0f, float.nan);
302     __m128 B = _mm_setr_ps(3.0f, 2.0f, 1.0f, float.nan);
303     __m128i R = cast(__m128i) _mm_cmple_ps(A, B);
304     int[4] correct = [-1, -1, 0, 0];
305     assert(R.array == correct);
306 }
307 
308 /// Compare the lower single-precision (32-bit) floating-point elements in `a` and `b` for less-than-or-equal, 
309 /// and copy the upper 3 packed elements from `a` to the upper elements of result.
310 __m128 _mm_cmple_ss (__m128 a, __m128 b) pure @safe
311 {
312     static if (DMD_with_DSIMD)
313         return cast(__m128) __simd(XMM.CMPSS, a, b, 2);
314     else
315         return cast(__m128) cmpss!(FPComparison.ole)(a, b);
316 }
317 unittest
318 {
319     __m128 A = _mm_setr_ps(3.0f, 0, 0, 0);
320     __m128 B = _mm_setr_ps(3.0f, float.nan, float.nan, float.nan);
321     __m128 C = _mm_setr_ps(2.0f, float.nan, float.nan, float.nan);
322     __m128 D = _mm_setr_ps(float.nan, float.nan, float.nan, float.nan);
323     __m128 E = _mm_setr_ps(4.0f, float.nan, float.nan, float.nan);
324     __m128i R1 = cast(__m128i) _mm_cmple_ss(A, B);
325     __m128i R2 = cast(__m128i) _mm_cmple_ss(A, C);
326     __m128i R3 = cast(__m128i) _mm_cmple_ss(A, D);
327     __m128i R4 = cast(__m128i) _mm_cmple_ss(A, E);
328     int[4] correct1 = [-1, 0, 0, 0];
329     int[4] correct2 = [0, 0, 0, 0];
330     int[4] correct3 = [0, 0, 0, 0];
331     int[4] correct4 = [-1, 0, 0, 0];
332     assert(R1.array == correct1 && R2.array == correct2 && R3.array == correct3 && R4.array == correct4);
333 }
334 
335 /// Compare packed single-precision (32-bit) floating-point elements in `a` and `b` for less-than.
336 __m128 _mm_cmplt_ps (__m128 a, __m128 b) pure @safe
337 {
338     static if (DMD_with_DSIMD)
339         return cast(__m128) __simd(XMM.CMPPS, a, b, 1);
340     else
341         return cast(__m128) cmpps!(FPComparison.olt)(a, b);
342 }
343 unittest
344 {
345     __m128 A = _mm_setr_ps(1.0f, 2.0f, 3.0f, float.nan);
346     __m128 B = _mm_setr_ps(3.0f, 2.0f, 1.0f, float.nan);
347     __m128i R = cast(__m128i) _mm_cmplt_ps(A, B);
348     int[4] correct = [-1, 0, 0, 0];
349     assert(R.array == correct);
350 }
351 
352 /// Compare the lower single-precision (32-bit) floating-point elements in `a` and `b` for less-than, 
353 /// and copy the upper 3 packed elements from `a` to the upper elements of result.
354 __m128 _mm_cmplt_ss (__m128 a, __m128 b) pure @safe
355 {
356     static if (DMD_with_DSIMD)
357         return cast(__m128) __simd(XMM.CMPSS, a, b, 1);
358     else
359         return cast(__m128) cmpss!(FPComparison.olt)(a, b);
360 }
361 unittest
362 {
363     __m128 A = _mm_setr_ps(3.0f, 0, 0, 0);
364     __m128 B = _mm_setr_ps(3.0f, float.nan, float.nan, float.nan);
365     __m128 C = _mm_setr_ps(2.0f, float.nan, float.nan, float.nan);
366     __m128 D = _mm_setr_ps(float.nan, float.nan, float.nan, float.nan);
367     __m128 E = _mm_setr_ps(4.0f, float.nan, float.nan, float.nan);
368     __m128i R1 = cast(__m128i) _mm_cmplt_ss(A, B);
369     __m128i R2 = cast(__m128i) _mm_cmplt_ss(A, C);
370     __m128i R3 = cast(__m128i) _mm_cmplt_ss(A, D);
371     __m128i R4 = cast(__m128i) _mm_cmplt_ss(A, E);
372     int[4] correct1 = [0, 0, 0, 0];
373     int[4] correct2 = [0, 0, 0, 0];
374     int[4] correct3 = [0, 0, 0, 0];
375     int[4] correct4 = [-1, 0, 0, 0];
376     assert(R1.array == correct1 && R2.array == correct2 && R3.array == correct3 && R4.array == correct4);
377 }
378 
379 /// Compare packed single-precision (32-bit) floating-point elements in `a` and `b` for not-equal.
380 __m128 _mm_cmpneq_ps (__m128 a, __m128 b) pure @safe
381 {
382     static if (DMD_with_DSIMD)
383         return cast(__m128) __simd(XMM.CMPPS, a, b, 4);
384     else
385         return cast(__m128) cmpps!(FPComparison.une)(a, b);
386 }
387 unittest
388 {
389     __m128 A = _mm_setr_ps(1.0f, 2.0f, 3.0f, float.nan);
390     __m128 B = _mm_setr_ps(3.0f, 2.0f, 1.0f, float.nan);
391     __m128i R = cast(__m128i) _mm_cmpneq_ps(A, B);
392     int[4] correct = [-1, 0, -1, -1];
393     assert(R.array == correct);
394 }
395 
396 /// Compare the lower single-precision (32-bit) floating-point elements in `a` and `b` for not-equal, 
397 /// and copy the upper 3 packed elements from `a` to the upper elements of result.
398 __m128 _mm_cmpneq_ss (__m128 a, __m128 b) pure @safe
399 {
400     static if (DMD_with_DSIMD)
401         return cast(__m128) __simd(XMM.CMPSS, a, b, 4);
402     else
403         return cast(__m128) cmpss!(FPComparison.une)(a, b);
404 }
405 unittest
406 {
407     __m128 A = _mm_setr_ps(3.0f, 0, 0, 0);
408     __m128 B = _mm_setr_ps(3.0f, float.nan, float.nan, float.nan);
409     __m128 C = _mm_setr_ps(2.0f, float.nan, float.nan, float.nan);
410     __m128 D = _mm_setr_ps(float.nan, float.nan, float.nan, float.nan);
411     __m128 E = _mm_setr_ps(4.0f, float.nan, float.nan, float.nan);
412     __m128i R1 = cast(__m128i) _mm_cmpneq_ss(A, B);
413     __m128i R2 = cast(__m128i) _mm_cmpneq_ss(A, C);
414     __m128i R3 = cast(__m128i) _mm_cmpneq_ss(A, D);
415     __m128i R4 = cast(__m128i) _mm_cmpneq_ss(A, E);
416     int[4] correct1 = [0, 0, 0, 0];
417     int[4] correct2 = [-1, 0, 0, 0];
418     int[4] correct3 = [-1, 0, 0, 0];
419     int[4] correct4 = [-1, 0, 0, 0];
420     assert(R1.array == correct1 && R2.array == correct2 && R3.array == correct3 && R4.array == correct4);
421 }
422 
423 /// Compare packed single-precision (32-bit) floating-point elements in `a` and `b` for not-greater-than-or-equal.
424 __m128 _mm_cmpnge_ps (__m128 a, __m128 b) pure @safe
425 {
426     static if (DMD_with_DSIMD)
427         return cast(__m128) __simd(XMM.CMPPS, b, a, 6);
428     else
429         return cast(__m128) cmpps!(FPComparison.ult)(a, b);
430 }
431 unittest
432 {
433     __m128 A = _mm_setr_ps(1.0f, 2.0f, 3.0f, float.nan);
434     __m128 B = _mm_setr_ps(3.0f, 2.0f, 1.0f, float.nan);
435     __m128i R = cast(__m128i) _mm_cmpnge_ps(A, B);
436     int[4] correct = [-1, 0, 0, -1];
437     assert(R.array == correct);
438 }
439 
440 /// Compare the lower single-precision (32-bit) floating-point elements in `a` and `b` for not-greater-than-or-equal, 
441 /// and copy the upper 3 packed elements from `a` to the upper elements of result.
442 __m128 _mm_cmpnge_ss (__m128 a, __m128 b) pure @safe
443 {
444     static if (DMD_with_DSIMD)
445     {
446         __m128 c = cast(__m128) __simd(XMM.CMPSS, b, a, 6);
447         a[0] = c[0];
448         return a;
449     }
450     else
451         return cast(__m128) cmpss!(FPComparison.ult)(a, b);
452 }
453 unittest
454 {
455     __m128 A = _mm_setr_ps(3.0f, 0, 0, 0);
456     __m128 B = _mm_setr_ps(3.0f, float.nan, float.nan, float.nan);
457     __m128 C = _mm_setr_ps(2.0f, float.nan, float.nan, float.nan);
458     __m128 D = _mm_setr_ps(float.nan, float.nan, float.nan, float.nan);
459     __m128 E = _mm_setr_ps(4.0f, float.nan, float.nan, float.nan);
460     __m128i R1 = cast(__m128i) _mm_cmpnge_ss(A, B);
461     __m128i R2 = cast(__m128i) _mm_cmpnge_ss(A, C);
462     __m128i R3 = cast(__m128i) _mm_cmpnge_ss(A, D);
463     __m128i R4 = cast(__m128i) _mm_cmpnge_ss(A, E);
464     int[4] correct1 = [0, 0, 0, 0];
465     int[4] correct2 = [0, 0, 0, 0];
466     int[4] correct3 = [-1, 0, 0, 0];
467     int[4] correct4 = [-1, 0, 0, 0];
468     assert(R1.array == correct1 && R2.array == correct2 && R3.array == correct3 && R4.array == correct4);
469 }
470 
471 /// Compare packed single-precision (32-bit) floating-point elements in `a` and `b` for not-greater-than.
472 __m128 _mm_cmpngt_ps (__m128 a, __m128 b) pure @safe
473 {
474     static if (DMD_with_DSIMD)
475         return cast(__m128) __simd(XMM.CMPPS, b, a, 5);
476     else
477         return cast(__m128) cmpps!(FPComparison.ule)(a, b);
478 }
479 unittest
480 {
481     __m128 A = _mm_setr_ps(1.0f, 2.0f, 3.0f, float.nan);
482     __m128 B = _mm_setr_ps(3.0f, 2.0f, 1.0f, float.nan);
483     __m128i R = cast(__m128i) _mm_cmpngt_ps(A, B);
484     int[4] correct = [-1, -1, 0, -1];
485     assert(R.array == correct);
486 }
487 
488 /// Compare the lower single-precision (32-bit) floating-point elements in `a` and `b` for not-greater-than, 
489 /// and copy the upper 3 packed elements from `a` to the upper elements of result.
490 __m128 _mm_cmpngt_ss (__m128 a, __m128 b) pure @safe
491 {
492     static if (DMD_with_DSIMD)
493     {
494         __m128 c = cast(__m128) __simd(XMM.CMPSS, b, a, 5);
495         a[0] = c[0];
496         return a;
497     }
498     else
499         return cast(__m128) cmpss!(FPComparison.ule)(a, b);
500 }
501 unittest
502 {
503     __m128 A = _mm_setr_ps(3.0f, 0, 0, 0);
504     __m128 B = _mm_setr_ps(3.0f, float.nan, float.nan, float.nan);
505     __m128 C = _mm_setr_ps(2.0f, float.nan, float.nan, float.nan);
506     __m128 D = _mm_setr_ps(float.nan, float.nan, float.nan, float.nan);
507     __m128 E = _mm_setr_ps(4.0f, float.nan, float.nan, float.nan);
508     __m128i R1 = cast(__m128i) _mm_cmpngt_ss(A, B);
509     __m128i R2 = cast(__m128i) _mm_cmpngt_ss(A, C);
510     __m128i R3 = cast(__m128i) _mm_cmpngt_ss(A, D);
511     __m128i R4 = cast(__m128i) _mm_cmpngt_ss(A, E);
512     int[4] correct1 = [-1, 0, 0, 0];
513     int[4] correct2 = [0, 0, 0, 0];
514     int[4] correct3 = [-1, 0, 0, 0];
515     int[4] correct4 = [-1, 0, 0, 0];
516     assert(R1.array == correct1 && R2.array == correct2 && R3.array == correct3 && R4.array == correct4);
517 }
518 
519 /// Compare packed single-precision (32-bit) floating-point elements in `a` and `b` for not-less-than-or-equal.
520 __m128 _mm_cmpnle_ps (__m128 a, __m128 b) pure @safe
521 {
522     static if (DMD_with_DSIMD)
523         return cast(__m128) __simd(XMM.CMPPS, a, b, 6);
524     else
525         return cast(__m128) cmpps!(FPComparison.ugt)(a, b);
526 }
527 unittest
528 {
529     __m128 A = _mm_setr_ps(1.0f, 2.0f, 3.0f, float.nan);
530     __m128 B = _mm_setr_ps(3.0f, 2.0f, 1.0f, float.nan);
531     __m128i R = cast(__m128i) _mm_cmpnle_ps(A, B);
532     int[4] correct = [0, 0, -1, -1];
533     assert(R.array == correct);
534 }
535 
536 
537 /// Compare the lower single-precision (32-bit) floating-point elements in `a` and `b` for not-less-than-or-equal, 
538 /// and copy the upper 3 packed elements from `a` to the upper elements of result.
539 __m128 _mm_cmpnle_ss (__m128 a, __m128 b) pure @safe
540 {
541     static if (DMD_with_DSIMD)
542         return cast(__m128) __simd(XMM.CMPSS, a, b, 6);
543     else
544         return cast(__m128) cmpss!(FPComparison.ugt)(a, b);
545 }
546 unittest
547 {
548     __m128 A = _mm_setr_ps(3.0f, 0, 0, 0);
549     __m128 B = _mm_setr_ps(3.0f, float.nan, float.nan, float.nan);
550     __m128 C = _mm_setr_ps(2.0f, float.nan, float.nan, float.nan);
551     __m128 D = _mm_setr_ps(float.nan, float.nan, float.nan, float.nan);
552     __m128 E = _mm_setr_ps(4.0f, float.nan, float.nan, float.nan);
553     __m128i R1 = cast(__m128i) _mm_cmpnle_ss(A, B);
554     __m128i R2 = cast(__m128i) _mm_cmpnle_ss(A, C);
555     __m128i R3 = cast(__m128i) _mm_cmpnle_ss(A, D);
556     __m128i R4 = cast(__m128i) _mm_cmpnle_ss(A, E);
557     int[4] correct1 = [0, 0, 0, 0];
558     int[4] correct2 = [-1, 0, 0, 0];
559     int[4] correct3 = [-1, 0, 0, 0];
560     int[4] correct4 = [0, 0, 0, 0];
561     assert(R1.array == correct1 && R2.array == correct2 && R3.array == correct3 && R4.array == correct4);
562 }
563 
564 /// Compare packed single-precision (32-bit) floating-point elements in `a` and `b` for not-less-than.
565 __m128 _mm_cmpnlt_ps (__m128 a, __m128 b) pure @safe
566 {
567     static if (DMD_with_DSIMD)
568         return cast(__m128) __simd(XMM.CMPPS, a, b, 5);
569     else
570         return cast(__m128) cmpps!(FPComparison.uge)(a, b);
571 }
572 unittest
573 {
574     __m128 A = _mm_setr_ps(1.0f, 2.0f, 3.0f, float.nan);
575     __m128 B = _mm_setr_ps(3.0f, 2.0f, 1.0f, float.nan);
576     __m128i R = cast(__m128i) _mm_cmpnlt_ps(A, B);
577     int[4] correct = [0, -1, -1, -1];
578     assert(R.array == correct);
579 }
580 
581 /// Compare the lower single-precision (32-bit) floating-point elements in `a` and `b` for not-less-than, 
582 /// and copy the upper 3 packed elements from `a` to the upper elements of result.
583 __m128 _mm_cmpnlt_ss (__m128 a, __m128 b) pure @safe
584 {
585     static if (DMD_with_DSIMD)
586         return cast(__m128) __simd(XMM.CMPSS, a, b, 5);
587     else
588         return cast(__m128) cmpss!(FPComparison.uge)(a, b);
589 }
590 unittest
591 {
592     __m128 A = _mm_setr_ps(3.0f, 0, 0, 0);
593     __m128 B = _mm_setr_ps(3.0f, float.nan, float.nan, float.nan);
594     __m128 C = _mm_setr_ps(2.0f, float.nan, float.nan, float.nan);
595     __m128 D = _mm_setr_ps(float.nan, float.nan, float.nan, float.nan);
596     __m128 E = _mm_setr_ps(4.0f, float.nan, float.nan, float.nan);
597     __m128i R1 = cast(__m128i) _mm_cmpnlt_ss(A, B);
598     __m128i R2 = cast(__m128i) _mm_cmpnlt_ss(A, C);
599     __m128i R3 = cast(__m128i) _mm_cmpnlt_ss(A, D);
600     __m128i R4 = cast(__m128i) _mm_cmpnlt_ss(A, E);
601     int[4] correct1 = [-1, 0, 0, 0];
602     int[4] correct2 = [-1, 0, 0, 0];
603     int[4] correct3 = [-1, 0, 0, 0];
604     int[4] correct4 = [0, 0, 0, 0];
605     assert(R1.array == correct1 && R2.array == correct2 && R3.array == correct3 && R4.array == correct4);
606 }
607 
608 /// Compare packed single-precision (32-bit) floating-point elements in `a` and `b` to see if neither is NaN.
609 __m128 _mm_cmpord_ps (__m128 a, __m128 b) pure @safe
610 {
611     static if (DMD_with_DSIMD)
612         return cast(__m128) __simd(XMM.CMPPS, a, b, 7);
613     else
614         return cast(__m128) cmpps!(FPComparison.ord)(a, b);
615 }
616 unittest
617 {
618     __m128 A = _mm_setr_ps(1.0f, 2.0f, 3.0f, float.nan);
619     __m128 B = _mm_setr_ps(3.0f, 2.0f, 1.0f, float.nan);
620     __m128i R = cast(__m128i) _mm_cmpord_ps(A, B);
621     int[4] correct = [-1, -1, -1, 0];
622     assert(R.array == correct);
623 }
624 
625 /// Compare the lower single-precision (32-bit) floating-point elements in `a` and `b` to see if neither is NaN, 
626 /// and copy the upper 3 packed elements from `a` to the upper elements of result.
627 __m128 _mm_cmpord_ss (__m128 a, __m128 b) pure @safe
628 {
629     static if (DMD_with_DSIMD)
630         return cast(__m128) __simd(XMM.CMPSS, a, b, 7);
631     else
632         return cast(__m128) cmpss!(FPComparison.ord)(a, b);
633 }
634 unittest
635 {
636     __m128 A = _mm_setr_ps(3.0f, 0, 0, 0);
637     __m128 B = _mm_setr_ps(3.0f, float.nan, float.nan, float.nan);
638     __m128 C = _mm_setr_ps(2.0f, float.nan, float.nan, float.nan);
639     __m128 D = _mm_setr_ps(float.nan, float.nan, float.nan, float.nan);
640     __m128 E = _mm_setr_ps(4.0f, float.nan, float.nan, float.nan);
641     __m128i R1 = cast(__m128i) _mm_cmpord_ss(A, B);
642     __m128i R2 = cast(__m128i) _mm_cmpord_ss(A, C);
643     __m128i R3 = cast(__m128i) _mm_cmpord_ss(A, D);
644     __m128i R4 = cast(__m128i) _mm_cmpord_ss(A, E);
645     int[4] correct1 = [-1, 0, 0, 0];
646     int[4] correct2 = [-1, 0, 0, 0];
647     int[4] correct3 = [0, 0, 0, 0];
648     int[4] correct4 = [-1, 0, 0, 0];
649     assert(R1.array == correct1 && R2.array == correct2 && R3.array == correct3 && R4.array == correct4);
650 }
651 
652 /// Compare packed single-precision (32-bit) floating-point elements in `a` and `b` to see if either is NaN.
653 __m128 _mm_cmpunord_ps (__m128 a, __m128 b) pure @safe
654 {
655     static if (DMD_with_DSIMD)
656         return cast(__m128) __simd(XMM.CMPPS, a, b, 3);
657     else
658         return cast(__m128) cmpps!(FPComparison.uno)(a, b);
659 }
660 unittest
661 {
662     __m128 A = _mm_setr_ps(1.0f, 2.0f, 3.0f, float.nan);
663     __m128 B = _mm_setr_ps(3.0f, 2.0f, 1.0f, float.nan);
664     __m128i R = cast(__m128i) _mm_cmpunord_ps(A, B);
665     int[4] correct = [0, 0, 0, -1];
666     assert(R.array == correct);
667 }
668 
/// Compare the lower single-precision (32-bit) floating-point elements in `a` and `b` to see if either is NaN,
670 /// and copy the upper 3 packed elements from `a` to the upper elements of result.
671 __m128 _mm_cmpunord_ss (__m128 a, __m128 b) pure @safe
672 {
673     static if (DMD_with_DSIMD)
674         return cast(__m128) __simd(XMM.CMPSS, a, b, 3);
675     else return cast(__m128) cmpss!(FPComparison.uno)(a, b);
676 }
677 unittest
678 {
679     __m128 A = _mm_setr_ps(3.0f, 0, 0, 0);
680     __m128 B = _mm_setr_ps(3.0f, float.nan, float.nan, float.nan);
681     __m128 C = _mm_setr_ps(2.0f, float.nan, float.nan, float.nan);
682     __m128 D = _mm_setr_ps(float.nan, float.nan, float.nan, float.nan);
683     __m128 E = _mm_setr_ps(4.0f, float.nan, float.nan, float.nan);
684     __m128i R1 = cast(__m128i) _mm_cmpunord_ss(A, B);
685     __m128i R2 = cast(__m128i) _mm_cmpunord_ss(A, C);
686     __m128i R3 = cast(__m128i) _mm_cmpunord_ss(A, D);
687     __m128i R4 = cast(__m128i) _mm_cmpunord_ss(A, E);
688     int[4] correct1 = [0, 0, 0, 0];
689     int[4] correct2 = [0, 0, 0, 0];
690     int[4] correct3 = [-1, 0, 0, 0];
691     int[4] correct4 = [0, 0, 0, 0];
692     assert(R1.array == correct1 && R2.array == correct2 && R3.array == correct3 && R4.array == correct4);
693 }
694 
695 
696 /// Compare the lower single-precision (32-bit) floating-point element in `a` and `b` for equality, 
697 /// and return the boolean result (0 or 1).
698 int _mm_comieq_ss (__m128 a, __m128 b) pure @safe
699 {
700     return a.array[0] == b.array[0];
701 }
702 unittest
703 {
704     assert(1 == _mm_comieq_ss(_mm_set_ss(78.0f), _mm_set_ss(78.0f)));
705     assert(0 == _mm_comieq_ss(_mm_set_ss(78.0f), _mm_set_ss(-78.0f)));
706     assert(0 == _mm_comieq_ss(_mm_set_ss(78.0f), _mm_set_ss(float.nan)));
707     assert(0 == _mm_comieq_ss(_mm_set_ss(float.nan), _mm_set_ss(-4.22f)));
708     assert(1 == _mm_comieq_ss(_mm_set_ss(0.0), _mm_set_ss(-0.0)));
709 }
710 
711 /// Compare the lower single-precision (32-bit) floating-point element in `a` and `b` for greater-than-or-equal, 
712 /// and return the boolean result (0 or 1).
713 int _mm_comige_ss (__m128 a, __m128 b) pure @safe
714 {
715     return a.array[0] >= b.array[0];
716 }
717 unittest
718 {
719     assert(1 == _mm_comige_ss(_mm_set_ss(78.0f), _mm_set_ss(78.0f)));
720     assert(1 == _mm_comige_ss(_mm_set_ss(78.0f), _mm_set_ss(-78.0f)));
721     assert(0 == _mm_comige_ss(_mm_set_ss(-78.0f), _mm_set_ss(78.0f)));
722     assert(0 == _mm_comige_ss(_mm_set_ss(78.0f), _mm_set_ss(float.nan)));
723     assert(0 == _mm_comige_ss(_mm_set_ss(float.nan), _mm_set_ss(-4.22f)));
724     assert(1 == _mm_comige_ss(_mm_set_ss(-0.0f), _mm_set_ss(0.0f)));
725 }
726 
727 /// Compare the lower single-precision (32-bit) floating-point element in `a` and `b` for greater-than, 
728 /// and return the boolean result (0 or 1).
729 int _mm_comigt_ss (__m128 a, __m128 b) pure @safe // comiss + seta
730 {
731     return a.array[0] > b.array[0];
732 }
733 unittest
734 {
735     assert(0 == _mm_comigt_ss(_mm_set_ss(78.0f), _mm_set_ss(78.0f)));
736     assert(1 == _mm_comigt_ss(_mm_set_ss(78.0f), _mm_set_ss(-78.0f)));
737     assert(0 == _mm_comigt_ss(_mm_set_ss(78.0f), _mm_set_ss(float.nan)));
738     assert(0 == _mm_comigt_ss(_mm_set_ss(float.nan), _mm_set_ss(-4.22f)));
739     assert(0 == _mm_comigt_ss(_mm_set_ss(0.0f), _mm_set_ss(-0.0f)));
740 }
741 
742 /// Compare the lower single-precision (32-bit) floating-point element in `a` and `b` for less-than-or-equal, 
743 /// and return the boolean result (0 or 1).
744 int _mm_comile_ss (__m128 a, __m128 b) pure @safe // comiss + setbe
745 {
746     return a.array[0] <= b.array[0];
747 }
748 unittest
749 {
750     assert(1 == _mm_comile_ss(_mm_set_ss(78.0f), _mm_set_ss(78.0f)));
751     assert(0 == _mm_comile_ss(_mm_set_ss(78.0f), _mm_set_ss(-78.0f)));
752     assert(1 == _mm_comile_ss(_mm_set_ss(-78.0f), _mm_set_ss(78.0f)));
753     assert(0 == _mm_comile_ss(_mm_set_ss(78.0f), _mm_set_ss(float.nan)));
754     assert(0 == _mm_comile_ss(_mm_set_ss(float.nan), _mm_set_ss(-4.22f)));
755     assert(1 == _mm_comile_ss(_mm_set_ss(0.0f), _mm_set_ss(-0.0f)));
756 }
757 
758 /// Compare the lower single-precision (32-bit) floating-point element in `a` and `b` for less-than, 
759 /// and return the boolean result (0 or 1).
760 int _mm_comilt_ss (__m128 a, __m128 b) pure @safe // comiss + setb
761 {
762     return a.array[0] < b.array[0];
763 }
764 unittest
765 {
766     assert(0 == _mm_comilt_ss(_mm_set_ss(78.0f), _mm_set_ss(78.0f)));
767     assert(0 == _mm_comilt_ss(_mm_set_ss(78.0f), _mm_set_ss(-78.0f)));
768     assert(1 == _mm_comilt_ss(_mm_set_ss(-78.0f), _mm_set_ss(78.0f)));
769     assert(0 == _mm_comilt_ss(_mm_set_ss(78.0f), _mm_set_ss(float.nan)));
770     assert(0 == _mm_comilt_ss(_mm_set_ss(float.nan), _mm_set_ss(-4.22f)));
771     assert(0 == _mm_comilt_ss(_mm_set_ss(-0.0f), _mm_set_ss(0.0f)));
772 }
773 
774 /// Compare the lower single-precision (32-bit) floating-point element in `a` and `b` for not-equal, 
775 /// and return the boolean result (0 or 1).
776 int _mm_comineq_ss (__m128 a, __m128 b) pure @safe // comiss + setne
777 {
778     return a.array[0] != b.array[0];
779 }
780 unittest
781 {
782     assert(0 == _mm_comineq_ss(_mm_set_ss(78.0f), _mm_set_ss(78.0f)));
783     assert(1 == _mm_comineq_ss(_mm_set_ss(78.0f), _mm_set_ss(-78.0f)));
784     assert(1 == _mm_comineq_ss(_mm_set_ss(78.0f), _mm_set_ss(float.nan)));
785     assert(1 == _mm_comineq_ss(_mm_set_ss(float.nan), _mm_set_ss(-4.22f)));
786     assert(0 == _mm_comineq_ss(_mm_set_ss(0.0f), _mm_set_ss(-0.0f)));
787 }
788 
789 /// Convert packed signed 32-bit integers in `b` to packed single-precision (32-bit) 
790 /// floating-point elements, store the results in the lower 2 elements, 
791 /// and copy the upper 2 packed elements from `a` to the upper elements of result.
792 alias _mm_cvt_pi2ps = _mm_cvtpi32_ps;
793 
794 /// Convert 2 lower packed single-precision (32-bit) floating-point elements in `a` 
795 /// to packed 32-bit integers.
796 __m64 _mm_cvt_ps2pi (__m128 a) @safe
797 {
798     return to_m64(_mm_cvtps_epi32(a));
799 }
800 
801 /// Convert the signed 32-bit integer `b` to a single-precision (32-bit) floating-point element, 
802 /// store the result in the lower element, and copy the upper 3 packed elements from `a` to the 
803 /// upper elements of the result.
804 __m128 _mm_cvt_si2ss (__m128 v, int x) pure @trusted
805 {
806     v.ptr[0] = cast(float)x;
807     return v;
808 }
809 unittest
810 {
811     __m128 a = _mm_cvt_si2ss(_mm_set1_ps(0.0f), 42);
812     assert(a.array == [42f, 0, 0, 0]);
813 }
814 
815 /// Convert packed 16-bit integers in `a` to packed single-precision (32-bit) floating-point elements.
816 __m128 _mm_cvtpi16_ps (__m64 a) pure @safe
817 {
818     __m128i ma = to_m128i(a);
819     ma = _mm_unpacklo_epi16(ma, _mm_setzero_si128()); // Zero-extend to 32-bit
820     ma = _mm_srai_epi32(_mm_slli_epi32(ma, 16), 16); // Replicate sign bit
821     return _mm_cvtepi32_ps(ma);
822 }
823 unittest
824 {
825     __m64 A = _mm_setr_pi16(-1, 2, -3, 4);
826     __m128 R = _mm_cvtpi16_ps(A);
827     float[4] correct = [-1.0f, 2.0f, -3.0f, 4.0f];
828     assert(R.array == correct);
829 }
830 
831 /// Convert packed signed 32-bit integers in `b` to packed single-precision (32-bit) 
832 /// floating-point elements, store the results in the lower 2 elements, 
833 /// and copy the upper 2 packed elements from `a` to the upper elements of result.
834 __m128 _mm_cvtpi32_ps (__m128 a, __m64 b) pure @trusted
835 {
836     __m128 fb = _mm_cvtepi32_ps(to_m128i(b));
837     a.ptr[0] = fb.array[0];
838     a.ptr[1] = fb.array[1];
839     return a;
840 }
841 unittest
842 {
843     __m128 R = _mm_cvtpi32_ps(_mm_set1_ps(4.0f), _mm_setr_pi32(1, 2));
844     float[4] correct = [1.0f, 2.0f, 4.0f, 4.0f];
845     assert(R.array == correct);
846 }
847 
848 /// Convert packed signed 32-bit integers in `a` to packed single-precision (32-bit) floating-point elements, 
/// store the results in the lower 2 elements, then convert the packed signed 32-bit integers in `b` to 
/// single-precision (32-bit) floating-point elements, and store the results in the upper 2 elements.
851 __m128 _mm_cvtpi32x2_ps (__m64 a, __m64 b) pure @trusted
852 {
853     long2 l;
854     l.ptr[0] = a.array[0];
855     l.ptr[1] = b.array[0];
856     return _mm_cvtepi32_ps(cast(__m128i)l);
857 }
858 unittest
859 {
860     __m64 A = _mm_setr_pi32(-45, 128);
861     __m64 B = _mm_setr_pi32(0, 1000);
862     __m128 R = _mm_cvtpi32x2_ps(A, B);
863     float[4] correct = [-45.0f, 128.0f, 0.0f, 1000.0f];
864     assert(R.array == correct);
865 }
866 
867 /// Convert the lower packed 8-bit integers in `a` to packed single-precision (32-bit) floating-point elements.
868 __m128 _mm_cvtpi8_ps (__m64 a) pure @safe
869 {
870     __m128i b = to_m128i(a); 
871 
872     // Zero extend to 32-bit
873     b = _mm_unpacklo_epi8(b, _mm_setzero_si128());
874     b = _mm_unpacklo_epi16(b, _mm_setzero_si128());
875 
876     // Replicate sign bit
877     b = _mm_srai_epi32(_mm_slli_epi32(b, 24), 24); // Replicate sign bit
878     return _mm_cvtepi32_ps(b);
879 }
880 unittest
881 {
882     __m64 A = _mm_setr_pi8(-1, 2, -3, 4, 0, 0, 0, 0);
883     __m128 R = _mm_cvtpi8_ps(A);
884     float[4] correct = [-1.0f, 2.0f, -3.0f, 4.0f];
885     assert(R.array == correct);
886 }
887 
888 /// Convert packed single-precision (32-bit) floating-point elements in `a` to packed 16-bit integers.
889 /// Note: this intrinsic will generate 0x7FFF, rather than 0x8000, for input values between 0x7FFF and 0x7FFFFFFF.
890 __m64 _mm_cvtps_pi16 (__m128 a) @safe
891 {
    // The C++ version of this intrinsic converts to 32-bit integers, then uses packssdw,
    // which means the 16-bit results are saturated.
894     __m128i b = _mm_cvtps_epi32(a);
895     b = _mm_packs_epi32(b, b);
896     return to_m64(b);
897 }
898 unittest
899 {
900     __m128 A = _mm_setr_ps(-1.0f, 2.0f, -33000.0f, 70000.0f);
901     short4 R = cast(short4) _mm_cvtps_pi16(A);
902     short[4] correct = [-1, 2, -32768, 32767];
903     assert(R.array == correct);
904 }
905 
906 /// Convert packed single-precision (32-bit) floating-point elements in `a` to packed 32-bit integers.
907 __m64 _mm_cvtps_pi32 (__m128 a) @safe
908 {
909     return to_m64(_mm_cvtps_epi32(a));
910 }
911 unittest
912 {
    __m128 A = _mm_setr_ps(-33000.0f, 70000.0f, -1.0f, 2.0f);
914     int2 R = cast(int2) _mm_cvtps_pi32(A);
915     int[2] correct = [-33000, 70000];
916     assert(R.array == correct);
917 }
918 
919 /// Convert packed single-precision (32-bit) floating-point elements in `a` to packed 8-bit integers, 
920 /// and store the results in lower 4 elements. 
921 /// Note: this intrinsic will generate 0x7F, rather than 0x80, for input values between 0x7F and 0x7FFFFFFF.
922 __m64 _mm_cvtps_pi8 (__m128 a) @safe
923 {
    // The C++ version of this intrinsic converts to 32-bit integers, then uses packssdw + packsswb,
    // which means the 8-bit results are saturated.
926     __m128i b = _mm_cvtps_epi32(a);
927     b = _mm_packs_epi32(b, _mm_setzero_si128());
928     b = _mm_packs_epi16(b, _mm_setzero_si128());
929     return to_m64(b);
930 }
931 unittest
932 {
933     __m128 A = _mm_setr_ps(-1.0f, 2.0f, -129.0f, 128.0f);
934     byte8 R = cast(byte8) _mm_cvtps_pi8(A);
935     byte[8] correct = [-1, 2, -128, 127, 0, 0, 0, 0];
936     assert(R.array == correct);
937 }
938 
939 /// Convert packed unsigned 16-bit integers in `a` to packed single-precision (32-bit) floating-point elements.
940 __m128 _mm_cvtpu16_ps (__m64 a) pure @safe
941 {
942     __m128i ma = to_m128i(a);
943     ma = _mm_unpacklo_epi16(ma, _mm_setzero_si128()); // Zero-extend to 32-bit
944     return _mm_cvtepi32_ps(ma);
945 }
946 unittest
947 {
948     __m64 A = _mm_setr_pi16(-1, 2, -3, 4);
949     __m128 R = _mm_cvtpu16_ps(A);
950     float[4] correct = [65535.0f, 2.0f, 65533.0f, 4.0f];
951     assert(R.array == correct);
952 }
953 
954 /// Convert the lower packed unsigned 8-bit integers in `a` to packed single-precision (32-bit) floating-point element.
955 __m128 _mm_cvtpu8_ps (__m64 a) pure @safe
956 {
957     __m128i b = to_m128i(a); 
958 
959     // Zero extend to 32-bit
960     b = _mm_unpacklo_epi8(b, _mm_setzero_si128());
961     b = _mm_unpacklo_epi16(b, _mm_setzero_si128());
962     return _mm_cvtepi32_ps(b);
963 }
964 unittest
965 {
966     __m64 A = _mm_setr_pi8(-1, 2, -3, 4, 0, 0, 0, 0);
967     __m128 R = _mm_cvtpu8_ps(A);
968     float[4] correct = [255.0f, 2.0f, 253.0f, 4.0f];
969     assert(R.array == correct);
970 }
971 
972 /// Convert the signed 32-bit integer `b` to a single-precision (32-bit) floating-point element, 
973 /// store the result in the lower element, and copy the upper 3 packed elements from `a` to the 
974 /// upper elements of result.
975 __m128 _mm_cvtsi32_ss(__m128 v, int x) pure @trusted
976 {
977     v.ptr[0] = cast(float)x;
978     return v;
979 }
980 unittest
981 {
982     __m128 a = _mm_cvtsi32_ss(_mm_set1_ps(0.0f), 42);
983     assert(a.array == [42.0f, 0, 0, 0]);
984 }
985 
986 
987 /// Convert the signed 64-bit integer `b` to a single-precision (32-bit) floating-point element, 
988 /// store the result in the lower element, and copy the upper 3 packed elements from `a` to the 
989 /// upper elements of result.
990 __m128 _mm_cvtsi64_ss(__m128 v, long x) pure @trusted
991 {
992     v.ptr[0] = cast(float)x;
993     return v;
994 }
995 unittest
996 {
997     __m128 a = _mm_cvtsi64_ss(_mm_set1_ps(0.0f), 42);
998     assert(a.array == [42.0f, 0, 0, 0]);
999 }
1000 
1001 /// Take the lower single-precision (32-bit) floating-point element of `a`.
1002 float _mm_cvtss_f32(__m128 a) pure @safe
1003 {
1004     return a.array[0];
1005 }
1006 
1007 /// Convert the lower single-precision (32-bit) floating-point element in `a` to a 32-bit integer.
1008 int _mm_cvtss_si32 (__m128 a) @safe // PERF GDC
1009 {
1010     static if (GDC_with_SSE)
1011     {
1012         return __builtin_ia32_cvtss2si(a);
1013     }
1014     else static if (LDC_with_SSE1)
1015     {
1016         return __builtin_ia32_cvtss2si(a);
1017     }
1018     else static if (DMD_with_DSIMD)
1019     {
        __m128i r = cast(__m128i) __simd(XMM.CVTPS2DQ, a); // Note: converts all 4 lanes.
1022         return r.array[0];
1023     }
1024     else
1025     {
1026         return convertFloatToInt32UsingMXCSR(a.array[0]);
1027     }
1028 }
1029 unittest
1030 {
1031     assert(1 == _mm_cvtss_si32(_mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f)));
1032 }
1033 
1034 /// Convert the lower single-precision (32-bit) floating-point element in `a` to a 64-bit integer.
1035 long _mm_cvtss_si64 (__m128 a) @safe
1036 {
1037     version(LDC)
1038     {
1039         version(X86_64)
1040         {
1041             return __builtin_ia32_cvtss2si64(a);
1042         }
1043         else
1044         {
1045             // Note: In 32-bit x86, there is no way to convert from float/double to 64-bit integer
            // using SSE instructions only. So the builtin doesn't exist for this arch.
1047             return convertFloatToInt64UsingMXCSR(a.array[0]);
1048         }
1049     }
1050     else
1051     {
1052         return convertFloatToInt64UsingMXCSR(a.array[0]);
1053     }
1054 }
1055 unittest
1056 {
1057     assert(1 == _mm_cvtss_si64(_mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f)));
1058 
1059     uint savedRounding = _MM_GET_ROUNDING_MODE();
1060 
1061     _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST);
1062     assert(-86186 == _mm_cvtss_si64(_mm_set1_ps(-86186.49f)));
1063 
1064     _MM_SET_ROUNDING_MODE(_MM_ROUND_DOWN);
1065     assert(-86187 == _mm_cvtss_si64(_mm_set1_ps(-86186.1f)));
1066 
1067     _MM_SET_ROUNDING_MODE(_MM_ROUND_UP);
1068     assert(86187 == _mm_cvtss_si64(_mm_set1_ps(86186.1f)));
1069 
1070     _MM_SET_ROUNDING_MODE(_MM_ROUND_TOWARD_ZERO);
1071     assert(-86186 == _mm_cvtss_si64(_mm_set1_ps(-86186.9f)));
1072 
1073     _MM_SET_ROUNDING_MODE(savedRounding);
1074 }
1075 
1076 
1077 /// Convert the lower single-precision (32-bit) floating-point element in `a` to a 32-bit 
1078 /// integer with truncation.
1079 int _mm_cvtt_ss2si (__m128 a) pure @safe
1080 {
1081     // x86: cvttss2si always generated, even in -O0
1082     return cast(int)(a.array[0]);
1083 }
1084 alias _mm_cvttss_si32 = _mm_cvtt_ss2si; ///ditto
1085 unittest
1086 {
1087     assert(1 == _mm_cvtt_ss2si(_mm_setr_ps(1.9f, 2.0f, 3.0f, 4.0f)));
1088 }
1089 
1090 
1091 /// Convert packed single-precision (32-bit) floating-point elements in `a` to packed 32-bit 
1092 /// integers with truncation.
1093 __m64 _mm_cvtt_ps2pi (__m128 a) pure @safe
1094 {
1095     return to_m64(_mm_cvttps_epi32(a));
1096 }
1097 
1098 /// Convert the lower single-precision (32-bit) floating-point element in `a` to a 64-bit 
1099 /// integer with truncation.
1100 long _mm_cvttss_si64 (__m128 a) pure @safe
1101 {
1102     return cast(long)(a.array[0]);
1103 }
1104 unittest
1105 {
1106     assert(1 == _mm_cvttss_si64(_mm_setr_ps(1.9f, 2.0f, 3.0f, 4.0f)));
1107 }
1108 
1109 /// Divide packed single-precision (32-bit) floating-point elements in `a` by packed elements in `b`.
1110 __m128 _mm_div_ps(__m128 a, __m128 b) pure @safe
1111 {
1112     return a / b;
1113 }
1114 unittest
1115 {
1116     __m128 a = [1.5f, -2.0f, 3.0f, 1.0f];
1117     a = _mm_div_ps(a, a);
1118     float[4] correct = [1.0f, 1.0f, 1.0f, 1.0f];
1119     assert(a.array == correct);
1120 }
1121 
1122 /// Divide the lower single-precision (32-bit) floating-point element in `a` by the lower 
1123 /// single-precision (32-bit) floating-point element in `b`, store the result in the lower 
1124 /// element of result, and copy the upper 3 packed elements from `a` to the upper elements of result.
1125 __m128 _mm_div_ss(__m128 a, __m128 b) pure @safe
1126 {
1127     static if (DMD_with_DSIMD)
1128         return cast(__m128) __simd(XMM.DIVSS, a, b);
1129     else static if (GDC_with_SSE)
1130         return __builtin_ia32_divss(a, b);
1131     else
1132     {
1133         a[0] /= b[0];
1134         return a;
1135     }
1136 }
1137 unittest
1138 {
1139     __m128 a = [1.5f, -2.0f, 3.0f, 1.0f];
1140     a = _mm_div_ss(a, a);
1141     float[4] correct = [1.0f, -2.0, 3.0f, 1.0f];
1142     assert(a.array == correct);
1143 }
1144 
1145 /// Extract a 16-bit unsigned integer from `a`, selected with `imm8`. Zero-extended.
1146 int _mm_extract_pi16 (__m64 a, int imm8)
1147 {
1148     short4 sa = cast(short4)a;
1149     return cast(ushort)(sa.array[imm8]);
1150 }
1151 unittest
1152 {
1153     __m64 A = _mm_setr_pi16(-1, 6, 0, 4);
1154     assert(_mm_extract_pi16(A, 0) == 65535);
1155     assert(_mm_extract_pi16(A, 1) == 6);
1156     assert(_mm_extract_pi16(A, 2) == 0);
1157     assert(_mm_extract_pi16(A, 3) == 4);
1158 }
1159 
1160 /// Free aligned memory that was allocated with `_mm_malloc`.
1161 void _mm_free(void * mem_addr) @trusted
1162 {
1163     // support for free(NULL)
1164     if (mem_addr is null)
1165         return;
1166 
    // Technically we don't need to store size and alignment in the chunk, but we do in case we
1168     // have to implement _mm_realloc
1169 
1170     size_t pointerSize = (void*).sizeof;
1171     void** rawLocation = cast(void**)(cast(char*)mem_addr - size_t.sizeof);
1172     size_t* alignmentLocation = cast(size_t*)(cast(char*)mem_addr - 3 * pointerSize);
1173     size_t alignment = *alignmentLocation;
1174     assert(alignment != 0);
1175     assert(isPointerAligned(mem_addr, alignment));
1176     free(*rawLocation);
1177 }
1178 
1179 /// Get the exception mask bits from the MXCSR control and status register. 
1180 /// The exception mask may contain any of the following flags: `_MM_MASK_INVALID`, 
1181 /// `_MM_MASK_DIV_ZERO`, `_MM_MASK_DENORM`, `_MM_MASK_OVERFLOW`, `_MM_MASK_UNDERFLOW`, `_MM_MASK_INEXACT`.
/// Note: won't correspond to reality on non-x86, where MXCSR is emulated.
1183 uint _MM_GET_EXCEPTION_MASK() @safe
1184 {
1185     return _mm_getcsr() & _MM_MASK_MASK;
1186 }
1187 
1188 /// Get the exception state bits from the MXCSR control and status register. 
1189 /// The exception state may contain any of the following flags: `_MM_EXCEPT_INVALID`, 
1190 /// `_MM_EXCEPT_DIV_ZERO`, `_MM_EXCEPT_DENORM`, `_MM_EXCEPT_OVERFLOW`, `_MM_EXCEPT_UNDERFLOW`, `_MM_EXCEPT_INEXACT`.
/// Note: won't correspond to reality on non-x86, where MXCSR is emulated. No exception is ever reported.
1192 uint _MM_GET_EXCEPTION_STATE() @safe
1193 {
1194     return _mm_getcsr() & _MM_EXCEPT_MASK;
1195 }
1196 
/// Get the flush-to-zero bits from the MXCSR control and status register. 
/// The flush-to-zero mode may be one of the following flags: `_MM_FLUSH_ZERO_ON` or `_MM_FLUSH_ZERO_OFF`.
1199 uint _MM_GET_FLUSH_ZERO_MODE() @safe
1200 {
1201     return _mm_getcsr() & _MM_FLUSH_ZERO_MASK;
1202 }
1203 
1204 /// Get the rounding mode bits from the MXCSR control and status register. The rounding mode may 
/// contain any of the following flags: `_MM_ROUND_NEAREST`, `_MM_ROUND_DOWN`, `_MM_ROUND_UP`, `_MM_ROUND_TOWARD_ZERO`.
1206 uint _MM_GET_ROUNDING_MODE() @safe
1207 {
1208     return _mm_getcsr() & _MM_ROUND_MASK;
1209 }
1210 
1211 /// Get the unsigned 32-bit value of the MXCSR control and status register.
/// Note: this is emulated on ARM, since there is no MXCSR register there.
1213 uint _mm_getcsr() @trusted
1214 {
1215     static if (LDC_with_ARM)
1216     {
1217         // Note: we convert the ARM FPSCR into a x86 SSE control word.
1218         // However, only rounding mode and flush to zero are actually set.
1219         // The returned control word will have all exceptions masked, and no exception detected.
1220 
1221         uint fpscr = arm_get_fpcr();
1222 
1223         uint cw = 0; // No exception detected
1224         if (fpscr & _MM_FLUSH_ZERO_MASK_ARM)
1225         {
            // ARM has a single flush-to-zero flag,
            // which covers both x86 bits.
1228             // https://developer.arm.com/documentation/dui0473/c/neon-and-vfp-programming/the-effects-of-using-flush-to-zero-mode
1229             cw |= _MM_FLUSH_ZERO_ON;
1230             cw |= 0x40; // set "denormals are zeros"
1231         } 
        cw |= _MM_MASK_MASK; // all exceptions masked
1233 
1234         // Rounding mode
1235         switch(fpscr & _MM_ROUND_MASK_ARM)
1236         {
1237             default:
1238             case _MM_ROUND_NEAREST_ARM:     cw |= _MM_ROUND_NEAREST;     break;
1239             case _MM_ROUND_DOWN_ARM:        cw |= _MM_ROUND_DOWN;        break;
1240             case _MM_ROUND_UP_ARM:          cw |= _MM_ROUND_UP;          break;
1241             case _MM_ROUND_TOWARD_ZERO_ARM: cw |= _MM_ROUND_TOWARD_ZERO; break;
1242         }
1243         return cw;
1244     }
1245     else version(GNU)
1246     {
1247         static if (GDC_with_SSE)
1248         {
1249             return __builtin_ia32_stmxcsr();
1250         }
1251         else version(X86)
1252         {
1253             uint sseRounding = 0;
1254             asm pure nothrow @nogc @trusted
1255             {
1256                 "stmxcsr %0;\n" 
1257                   : "=m" (sseRounding)
1258                   : 
1259                   : ;
1260             }
1261             return sseRounding;
1262         }
1263         else
1264             static assert(false);
1265     }
1266     else version (InlineX86Asm)
1267     {
1268         uint controlWord;
1269         asm nothrow @nogc pure @safe
1270         {
1271             stmxcsr controlWord;
1272         }
1273         return controlWord;
1274     }
1275     else
1276         static assert(0, "Not yet supported");
1277 }
1278 unittest
1279 {
1280     uint csr = _mm_getcsr();
1281 }
1282 
1283 /// Insert a 16-bit integer `i` inside `a` at the location specified by `imm8`.
1284 __m64 _mm_insert_pi16 (__m64 v, int i, int imm8) pure @trusted
1285 {
1286     short4 r = cast(short4)v;
1287     r.ptr[imm8 & 3] = cast(short)i;
1288     return cast(__m64)r;
1289 }
1290 unittest
1291 {
1292     __m64 A = _mm_set_pi16(3, 2, 1, 0);
1293     short4 R = cast(short4) _mm_insert_pi16(A, 42, 1 | 4);
1294     short[4] correct = [0, 42, 2, 3];
1295     assert(R.array == correct);
1296 }
1297 
1298 /// Load 128-bits (composed of 4 packed single-precision (32-bit) floating-point elements) from memory.
/// `p` must be aligned on a 16-byte boundary or a general-protection exception may be generated.
1300 __m128 _mm_load_ps(const(float)*p) pure @trusted // TODO shouldn't be trusted
1301 {
1302     return *cast(__m128*)p;
1303 }
1304 unittest
1305 {
1306     static immutable align(16) float[4] correct = [1.0f, 2.0f, 3.0f, 4.0f];
1307     __m128 A = _mm_load_ps(correct.ptr);
1308     assert(A.array == correct);
1309 }
1310 
1311 /// Load a single-precision (32-bit) floating-point element from memory into all elements.
1312 __m128 _mm_load_ps1(const(float)*p) pure @trusted
1313 {
1314     return __m128(*p);
1315 }
1316 unittest
1317 {
1318     float n = 2.5f;
1319     float[4] correct = [2.5f, 2.5f, 2.5f, 2.5f];
1320     __m128 A = _mm_load_ps1(&n);
1321     assert(A.array == correct);
1322 }
1323 
1324 /// Load a single-precision (32-bit) floating-point element from memory into the lower of dst, and zero the upper 3 
1325 /// elements. `mem_addr` does not need to be aligned on any particular boundary.
1326 __m128 _mm_load_ss (const(float)* mem_addr) pure @trusted
1327 {
1328     static if (DMD_with_DSIMD)
1329     {
1330         return cast(__m128)__simd(XMM.LODSS, *cast(__m128*)mem_addr);
1331     }
1332     else
1333     {
1334         __m128 r;
1335         r.ptr[0] = *mem_addr;
1336         r.ptr[1] = 0;
1337         r.ptr[2] = 0;
1338         r.ptr[3] = 0;
1339         return r;
1340     }
1341 }
1342 unittest
1343 {
1344     float n = 2.5f;
1345     float[4] correct = [2.5f, 0.0f, 0.0f, 0.0f];
1346     __m128 A = _mm_load_ss(&n);
1347     assert(A.array == correct);
1348 }
1349 
1350 /// Load a single-precision (32-bit) floating-point element from memory into all elements.
1351 alias _mm_load1_ps = _mm_load_ps1;
1352 
1353 /// Load 2 single-precision (32-bit) floating-point elements from memory into the upper 2 elements of result, 
/// and copy the lower 2 elements from `a` to result. `mem_addr` does not need to be aligned on any particular boundary.
1355 __m128 _mm_loadh_pi (__m128 a, const(__m64)* mem_addr) pure @trusted
1356 {
1357     static if (DMD_with_DSIMD)
1358     {
1359         return cast(__m128) __simd(XMM.LODHPS, a, *cast(const(__m128)*)mem_addr); 
1360     }
1361     else
1362     {
1363         // x86: movlhps generated since LDC 1.9.0 -O1
1364         long2 la = cast(long2)a;
1365         la.ptr[1] = (*mem_addr).array[0];
1366         return cast(__m128)la;
1367     }
1368 }
1369 unittest
1370 {
1371     __m128 A = _mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f);
1372     __m128 B = _mm_setr_ps(5.0f, 6.0f, 7.0f, 8.0f);
1373     __m64 M = to_m64(cast(__m128i)B);
    __m128 R = _mm_loadh_pi(A, &M);
1375     float[4] correct = [1.0f, 2.0f, 5.0f, 6.0f];
1376     assert(R.array == correct);
1377 }
1378 
1379 /// Load 2 single-precision (32-bit) floating-point elements from memory into the lower 2 elements of result, 
1380 /// and copy the upper 2 elements from `a` to result. `mem_addr` does not need to be aligned on any particular boundary.
1381 __m128 _mm_loadl_pi (__m128 a, const(__m64)* mem_addr) pure @trusted
1382 {
1383     static if (DMD_with_DSIMD)
1384     {
1385         return cast(__m128) __simd(XMM.LODLPS, a, *cast(const(__m128)*)mem_addr); 
1386     }
1387     else
1388     {
1389         // x86: movlpd/movlps generated with all LDC -01
1390         long2 la = cast(long2)a;
1391         la.ptr[0] = (*mem_addr).array[0];
1392         return cast(__m128)la;
1393     }
1394 }
1395 unittest
1396 {
1397     __m128 A = _mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f);
1398     __m128 B = _mm_setr_ps(5.0f, 6.0f, 7.0f, 8.0f);
1399     __m64 M = to_m64(cast(__m128i)B);
    __m128 R = _mm_loadl_pi(A, &M);
1401     float[4] correct = [5.0f, 6.0f, 3.0f, 4.0f];
1402     assert(R.array == correct);
1403 }
1404 
1405 /// Load 4 single-precision (32-bit) floating-point elements from memory in reverse order. 
1406 /// `mem_addr` must be aligned on a 16-byte boundary or a general-protection exception may be generated.
1407 __m128 _mm_loadr_ps (const(float)* mem_addr) pure @trusted // TODO shouldn't be trusted
1408 {
1409     __m128* aligned = cast(__m128*)mem_addr; // x86: movaps + shups since LDC 1.0.0 -O1
1410     __m128 a = *aligned;
1411     static if (DMD_with_DSIMD)
1412     {
1413         return cast(__m128) __simd(XMM.SHUFPS, a, a, 27);
1414     }
1415     else
1416     {
1417         __m128 r;
1418         r.ptr[0] = a.array[3];
1419         r.ptr[1] = a.array[2];
1420         r.ptr[2] = a.array[1];
1421         r.ptr[3] = a.array[0];
1422         return r;
1423     }
1424 }
1425 unittest
1426 {
1427     align(16) static immutable float[4] arr = [ 1.0f, 2.0f, 3.0f, 8.0f ];
1428     __m128 A = _mm_loadr_ps(arr.ptr);
1429     float[4] correct = [ 8.0f, 3.0f, 2.0f, 1.0f ];
1430     assert(A.array == correct);
1431 }
1432 
1433 /// Load 128-bits (composed of 4 packed single-precision (32-bit) floating-point elements) from memory. 
1434 /// `mem_addr` does not need to be aligned on any particular boundary.
1435 __m128 _mm_loadu_ps(const(float)* mem_addr) pure @trusted
1436 {
1437     static if (GDC_with_SSE2)
1438     {
1439         return __builtin_ia32_loadups(mem_addr);
1440     }
1441     else version(LDC)
1442     {
1443         return loadUnaligned!(__m128)(mem_addr);
1444     }
1445     else version(DigitalMars)
1446     {
1447         static if (DMD_with_DSIMD)
1448         {
1449             return cast(__m128)__simd(XMM.LODUPS, *mem_addr);
1450         }
1451         else static if (SSESizedVectorsAreEmulated)
1452         {
            // Since this vector is emulated, it doesn't have alignment constraints
1454             // and as such we can just cast it.
1455             return *cast(__m128*)(mem_addr);
1456         }
1457         else
1458         {
1459             __m128 result;
1460             result.ptr[0] = mem_addr[0];
1461             result.ptr[1] = mem_addr[1];
1462             result.ptr[2] = mem_addr[2];
1463             result.ptr[3] = mem_addr[3];
1464             return result;
1465         }
1466     }
1467     else
1468     {
1469         __m128 result;
1470         result.ptr[0] = mem_addr[0];
1471         result.ptr[1] = mem_addr[1];
1472         result.ptr[2] = mem_addr[2];
1473         result.ptr[3] = mem_addr[3];
1474         return result;
1475     }
1476 }
1477 unittest
1478 {
1479     align(16) static immutable float[5] arr = [ 1.0f, 2.0f, 3.0f, 8.0f, 9.0f ];  // force unaligned load
1480     __m128 A = _mm_loadu_ps(&arr[1]);
1481     float[4] correct = [ 2.0f, 3.0f, 8.0f, 9.0f ];
1482     assert(A.array == correct);
1483 }
1484 
1485 /// Load unaligned 16-bit integer from memory into the first element, fill with zeroes otherwise.
1486 __m128i _mm_loadu_si16(const(void)* mem_addr) pure @trusted
1487 {
1488     static if (DMD_with_DSIMD)
1489     {
1490         int r = *cast(short*)(mem_addr);
1491         return cast(__m128i) __simd(XMM.LODD, *cast(__m128i*)&r);
1492     }
1493     else version(DigitalMars)
1494     {
1495         // Workaround issue: https://issues.dlang.org/show_bug.cgi?id=21672
1496         // DMD cannot handle the below code...
1497         align(16) short[8] r = [0, 0, 0, 0, 0, 0, 0, 0];
1498         r[0] = *cast(short*)(mem_addr);
1499         return *cast(int4*)(r.ptr);
1500     }
1501     else
1502     {
1503         short r = *cast(short*)(mem_addr);
1504         short8 result = [0, 0, 0, 0, 0, 0, 0, 0];
1505         result.ptr[0] = r;
1506         return cast(__m128i)result;
1507     }
1508 }
1509 unittest
1510 {
1511     short r = 13;
1512     short8 A = cast(short8) _mm_loadu_si16(&r);
1513     short[8] correct = [13, 0, 0, 0, 0, 0, 0, 0];
1514     assert(A.array == correct);
1515 }
1516 
1517 /// Load unaligned 64-bit integer from memory into the first element of result.
1518 /// Upper 64-bit is zeroed.
1519 __m128i _mm_loadu_si64(const(void)* mem_addr) pure @trusted
1520 {
1521     static if (DMD_with_DSIMD)
1522     {
1523         return cast(__m128i) __simd(XMM.LODQ, *cast(__m128i*)mem_addr);
1524     }
1525     else
1526     {
1527         long r = *cast(long*)(mem_addr);
1528         long2 result = [0, 0];
1529         result.ptr[0] = r;
1530         return cast(__m128i)result;
1531     }
1532 }
1533 unittest
1534 {
1535     long r = 446446446446;
1536     long2 A = cast(long2) _mm_loadu_si64(&r);
1537     long[2] correct = [446446446446, 0];
1538     assert(A.array == correct);
1539 }
1540 
/// Allocate `size` bytes of memory, aligned to the boundary specified in `alignment`,
/// and return a pointer to the allocated memory. `_mm_free` should be used to free
/// memory that is allocated with `_mm_malloc`.
1544 void* _mm_malloc(size_t size, size_t alignment) @trusted
1545 {
1546     assert(alignment != 0);
1547     size_t request = requestedSize(size, alignment);
1548     void* raw = malloc(request);
1549     if (request > 0 && raw == null) // malloc(0) can validly return anything
1550         onOutOfMemoryError();
1551     return storeRawPointerPlusInfo(raw, size, alignment); // PERF: no need to store size
1552 }
1553 
/// Conditionally store 8-bit integer elements from `a` into memory using `mask` (elements are not stored when the highest 
/// bit is not set in the corresponding mask element) and a non-temporal memory hint.
1556 void _mm_maskmove_si64 (__m64 a, __m64 mask, char* mem_addr) @trusted
1557 {
1558     // this works since mask is zero-extended
1559     return _mm_maskmoveu_si128 (to_m128i(a), to_m128i(mask), mem_addr);
1560 }
1561 
1562 deprecated("Use _mm_maskmove_si64 instead") alias _m_maskmovq = _mm_maskmove_si64;///
1563 
1564 /// Compare packed signed 16-bit integers in `a` and `b`, and return packed maximum value.
1565 __m64 _mm_max_pi16 (__m64 a, __m64 b) pure @safe
1566 {
1567     return to_m64(_mm_max_epi16(to_m128i(a), to_m128i(b)));
1568 }
1569 
1570 /// Compare packed single-precision (32-bit) floating-point elements in `a` and `b`, and return packed maximum values.
1571 __m128 _mm_max_ps(__m128 a, __m128 b) pure @safe
1572 {
1573     static if (DMD_with_DSIMD)
1574     {
1575         return cast(__m128) __simd(XMM.MAXPS, a, b);
1576     }
1577     else static if (GDC_with_SSE)
1578     {
1579         return __builtin_ia32_maxps(a, b);
1580     }
1581     else static if (LDC_with_SSE1)
1582     {
1583         return __builtin_ia32_maxps(a, b);
1584     }
1585     else
1586     {
1587         // ARM: Optimized into fcmgt + bsl since LDC 1.8 -02
1588         __m128 r;
1589         r[0] = (a[0] > b[0]) ? a[0] : b[0];
1590         r[1] = (a[1] > b[1]) ? a[1] : b[1];
1591         r[2] = (a[2] > b[2]) ? a[2] : b[2];
1592         r[3] = (a[3] > b[3]) ? a[3] : b[3];
1593         return r;    
1594     }
1595 }
1596 unittest
1597 {
1598     __m128 A = _mm_setr_ps(1, 2, float.nan, 4);
1599     __m128 B = _mm_setr_ps(4, 1, 4, float.nan);
1600     __m128 M = _mm_max_ps(A, B);
1601     assert(M.array[0] == 4);
1602     assert(M.array[1] == 2);
1603     assert(M.array[2] == 4);    // in case of NaN, second operand prevails (as it seems)
1604     assert(M.array[3] != M.array[3]); // in case of NaN, second operand prevails (as it seems)
1605 }
1606 
1607 /// Compare packed unsigned 8-bit integers in `a` and `b`, and return packed maximum values.
1608 __m64 _mm_max_pu8 (__m64 a, __m64 b) pure @safe
1609 {
1610     return to_m64(_mm_max_epu8(to_m128i(a), to_m128i(b)));
1611 }
1612 
1613 /// Compare the lower single-precision (32-bit) floating-point elements in `a` and `b`, store the maximum value in the 
/// lower element of result, and copy the upper 3 packed elements from `a` to the upper elements of result.
__m128 _mm_max_ss(__m128 a, __m128 b) pure @safe
1616 {
1617     static if (DMD_with_DSIMD)
1618     {
1619         return cast(__m128) __simd(XMM.MAXSS, a, b);
1620     }
1621     else static if (GDC_with_SSE)
1622     {
1623         return __builtin_ia32_maxss(a, b);
1624     }
1625     else static if (LDC_with_SSE1)
1626     {
1627         return __builtin_ia32_maxss(a, b); 
1628     }
1629     else
1630     {  
1631         __m128 r = a;
1632         r[0] = (a[0] > b[0]) ? a[0] : b[0];
1633         return r;
1634     }
1635 }
1636 unittest
1637 {
1638     __m128 A = _mm_setr_ps(1, 2, 3, 4);
1639     __m128 B = _mm_setr_ps(4, 1, 4, 1);
1640     __m128 C = _mm_setr_ps(float.nan, 1, 4, 1);
1641     __m128 M = _mm_max_ss(A, B);
1642     assert(M.array[0] == 4);
1643     assert(M.array[1] == 2);
1644     assert(M.array[2] == 3);
1645     assert(M.array[3] == 4);
    M = _mm_max_ss(A, C); // in case of NaN, second operand prevails
    assert(M.array[0] != M.array[0]);
    M = _mm_max_ss(C, A); // in case of NaN, second operand prevails
1649     assert(M.array[0] == 1);
1650 }
1651 
1652 /// Compare packed signed 16-bit integers in a and b, and return packed minimum values.
1653 __m64 _mm_min_pi16 (__m64 a, __m64 b) pure @safe
1654 {
1655     return to_m64(_mm_min_epi16(to_m128i(a), to_m128i(b)));
1656 }
1657 
/// Compare packed single-precision (32-bit) floating-point elements in `a` and `b`, and return packed minimum values.
1659 __m128 _mm_min_ps(__m128 a, __m128 b) pure @safe
1660 {
1661     static if (DMD_with_DSIMD)
1662     {
1663         return cast(__m128) __simd(XMM.MINPS, a, b);
1664     }
1665     else static if (GDC_with_SSE)
1666     {
1667         return __builtin_ia32_minps(a, b);
1668     }
1669     else static if (LDC_with_SSE1)
1670     {
1671         // not technically needed, but better perf in debug mode
1672         return __builtin_ia32_minps(a, b);
1673     }
1674     else
1675     {
1676         // ARM: Optimized into fcmgt + bsl since LDC 1.8 -02
1677         __m128 r;
1678         r[0] = (a[0] < b[0]) ? a[0] : b[0];
1679         r[1] = (a[1] < b[1]) ? a[1] : b[1];
1680         r[2] = (a[2] < b[2]) ? a[2] : b[2];
1681         r[3] = (a[3] < b[3]) ? a[3] : b[3];
1682         return r;
1683     }
1684 }
1685 unittest
1686 {
1687     __m128 A = _mm_setr_ps(1, 2, float.nan, 4);
1688     __m128 B = _mm_setr_ps(4, 1, 4, float.nan);
1689     __m128 M = _mm_min_ps(A, B);
1690     assert(M.array[0] == 1);
1691     assert(M.array[1] == 1);
1692     assert(M.array[2] == 4);    // in case of NaN, second operand prevails (as it seems)
1693     assert(M.array[3] != M.array[3]); // in case of NaN, second operand prevails (as it seems)
1694 }
1695 
1696 /// Compare packed unsigned 8-bit integers in `a` and `b`, and return packed minimum values.
1697 __m64 _mm_min_pu8 (__m64 a, __m64 b) pure @safe
1698 {
1699     return to_m64(_mm_min_epu8(to_m128i(a), to_m128i(b)));
1700 }
1701 
1702 /// Compare the lower single-precision (32-bit) floating-point elements in `a` and `b`, store the minimum value in the 
/// lower element of result, and copy the upper 3 packed elements from `a` to the upper elements of result.
1704 __m128 _mm_min_ss(__m128 a, __m128 b) pure @safe
1705 {
1706     static if (DMD_with_DSIMD)
1707     {
1708         return cast(__m128) __simd(XMM.MINSS, a, b);
1709     }
1710     else static if (GDC_with_SSE)
1711     {
1712         return __builtin_ia32_minss(a, b);
1713     }
1714     else static if (LDC_with_SSE1)
1715     {
1716         return __builtin_ia32_minss(a, b);
1717     }
1718     else
1719     {
1720         // Generates minss since LDC 1.3 -O1
1721         __m128 r = a;
1722         r[0] = (a[0] < b[0]) ? a[0] : b[0];
1723         return r;
1724     }
1725 }
1726 unittest
1727 {
1728     __m128 A = _mm_setr_ps(1, 2, 3, 4);
1729     __m128 B = _mm_setr_ps(4, 1, 4, 1);
1730     __m128 C = _mm_setr_ps(float.nan, 1, 4, 1);
1731     __m128 M = _mm_min_ss(A, B);
1732     assert(M.array[0] == 1);
1733     assert(M.array[1] == 2);
1734     assert(M.array[2] == 3);
1735     assert(M.array[3] == 4);
    M = _mm_min_ss(A, C); // in case of NaN, second operand prevails
    assert(M.array[0] != M.array[0]);
    M = _mm_min_ss(C, A); // in case of NaN, second operand prevails
1739     assert(M.array[0] == 1);
1740 }
1741 
1742 /// Move the lower single-precision (32-bit) floating-point element from `b` to the lower element of result, and copy 
1743 /// the upper 3 packed elements from `a` to the upper elements of result.
1744 __m128 _mm_move_ss (__m128 a, __m128 b) pure @trusted
1745 {
1746     // Workaround https://issues.dlang.org/show_bug.cgi?id=21673
1747     // inlining of this function fails.
1748     version(DigitalMars) asm nothrow @nogc pure { nop; }
1749 
1750     a.ptr[0] = b.array[0];
1751     return a;
1752 }
1753 unittest
1754 {
1755     __m128 A = _mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f);
1756     __m128 B = _mm_setr_ps(5.0f, 6.0f, 7.0f, 8.0f);
1757     __m128 R = _mm_move_ss(A, B);
1758     float[4] correct = [5.0f, 2.0f, 3.0f, 4.0f];
1759     assert(R.array == correct);
1760 }
1761 
1762 /// Move the upper 2 single-precision (32-bit) floating-point elements from `b` to the lower 2 elements of result, and 
/// copy the upper 2 elements from `a` to the upper 2 elements of result.
1764 __m128 _mm_movehl_ps (__m128 a, __m128 b) pure @trusted
1765 {
1766     // Disabled because of https://issues.dlang.org/show_bug.cgi?id=19443
1767     /*
1768     static if (DMD_with_DSIMD)
1769     {
1770         
1771         return cast(__m128) __simd(XMM.MOVHLPS, a, b);
1772     }
1773     else */
1774     {
1775         a.ptr[0] = b.array[2];
1776         a.ptr[1] = b.array[3];
1777         return a;
1778     }
1779 }
1780 unittest
1781 {
1782     __m128 A = _mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f);
1783     __m128 B = _mm_setr_ps(5.0f, 6.0f, 7.0f, 8.0f);
1784     __m128 R = _mm_movehl_ps(A, B);
1785     float[4] correct = [7.0f, 8.0f, 3.0f, 4.0f];
1786     assert(R.array == correct);
1787 }
1788 
1789 /// Move the lower 2 single-precision (32-bit) floating-point elements from `b` to the upper 2 elements of result, and 
/// copy the lower 2 elements from `a` to the lower 2 elements of result.
1791 __m128 _mm_movelh_ps (__m128 a, __m128 b) pure @trusted
1792 {    
1793     // Disabled because of https://issues.dlang.org/show_bug.cgi?id=19443
1794     /*
1795     static if (DMD_with_DSIMD)
1796     {
1797         return cast(__m128) __simd(XMM.MOVLHPS, a, b);
1798     }
1799     else
1800     */
1801     {
1802         a.ptr[2] = b.array[0];
1803         a.ptr[3] = b.array[1];
1804         return a;
1805     }    
1806 }
1807 unittest
1808 {
1809     __m128 A = _mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f);
1810     __m128 B = _mm_setr_ps(5.0f, 6.0f, 7.0f, 8.0f);
1811     __m128 R = _mm_movelh_ps(A, B);
1812     float[4] correct = [1.0f, 2.0f, 5.0f, 6.0f];
1813     assert(R.array == correct);
1814 }
1815 
1816 /// Create mask from the most significant bit of each 8-bit element in `a`.
1817 int _mm_movemask_pi8 (__m64 a) pure @safe
1818 {
1819     return _mm_movemask_epi8(to_m128i(a));
1820 }
1821 unittest
1822 {
1823     assert(0x9C == _mm_movemask_pi8(_mm_set_pi8(-1, 0, 0, -1, -1, -1, 0, 0)));
1824 }
1825 
1826 /// Set each bit of result based on the most significant bit of the corresponding packed single-precision (32-bit) 
1827 /// floating-point element in `a`.
1828 int _mm_movemask_ps (__m128 a) pure @trusted
1829 {
1830     // PERF: Not possible in D_SIMD because of https://issues.dlang.org/show_bug.cgi?id=8047
1831     static if (GDC_with_SSE)
1832     {
1833         return __builtin_ia32_movmskps(a);
1834     }
1835     else static if (LDC_with_SSE1)
1836     {
1837         return __builtin_ia32_movmskps(a);
1838     }
1839     else static if (LDC_with_ARM)
1840     {
1841         int4 ai = cast(int4)a;
1842         int4 shift31 = [31, 31, 31, 31]; 
1843         ai = ai >>> shift31;
1844         int4 shift = [0, 1, 2, 3]; 
1845         ai = ai << shift; // 4-way shift, only efficient on ARM.
1846         int r = ai.array[0] + (ai.array[1]) + (ai.array[2]) + (ai.array[3]);
1847         return r;
1848     }
1849     else
1850     {
1851         int4 ai = cast(int4)a;
1852         int r = 0;
1853         if (ai.array[0] < 0) r += 1;
1854         if (ai.array[1] < 0) r += 2;
1855         if (ai.array[2] < 0) r += 4;
1856         if (ai.array[3] < 0) r += 8;
1857         return r;
1858     }
1859 }
1860 unittest
1861 {
1862     int4 A = [-1, 0, -43, 0];
1863     assert(5 == _mm_movemask_ps(cast(float4)A));
1864 }
1865 
1866 /// Multiply packed single-precision (32-bit) floating-point elements in `a` and `b`.
1867 __m128 _mm_mul_ps(__m128 a, __m128 b) pure @safe
1868 {
1869     return a * b;
1870 }
1871 unittest
1872 {
1873     __m128 a = [1.5f, -2.0f, 3.0f, 1.0f];
1874     a = _mm_mul_ps(a, a);
1875     float[4] correct = [2.25f, 4.0f, 9.0f, 1.0f];
1876     assert(a.array == correct);
1877 }
1878 
1879 /// Multiply the lower single-precision (32-bit) floating-point element in `a` and `b`, store the result in the lower 
1880 /// element of result, and copy the upper 3 packed elements from `a` to the upper elements of result.
1881 __m128 _mm_mul_ss(__m128 a, __m128 b) pure @safe
1882 {
1883     static if (DMD_with_DSIMD)
1884         return cast(__m128) __simd(XMM.MULSS, a, b);
1885     else static if (GDC_with_SSE)
1886         return __builtin_ia32_mulss(a, b);
1887     else
1888     {
1889         a[0] *= b[0];
1890         return a;
1891     }
1892 }
1893 unittest
1894 {
1895     __m128 a = [1.5f, -2.0f, 3.0f, 1.0f];
1896     a = _mm_mul_ss(a, a);
1897     float[4] correct = [2.25f, -2.0f, 3.0f, 1.0f];
1898     assert(a.array == correct);
1899 }
1900 
1901 /// Multiply the packed unsigned 16-bit integers in `a` and `b`, producing intermediate 32-bit integers, 
1902 /// and return the high 16 bits of the intermediate integers.
1903 __m64 _mm_mulhi_pu16 (__m64 a, __m64 b) pure @safe
1904 {
1905     return to_m64(_mm_mulhi_epu16(to_m128i(a), to_m128i(b)));
1906 }
1907 unittest
1908 {
1909     __m64 A = _mm_setr_pi16(0, -16, 2, 3);
1910     __m64 B = _mm_set1_pi16(16384);
1911     short4 R = cast(short4)_mm_mulhi_pu16(A, B);
1912     short[4] correct = [0, 0x3FFC, 0, 0];
1913     assert(R.array == correct);
1914 }
1915 
1916 /// Compute the bitwise OR of packed single-precision (32-bit) floating-point elements in `a` and `b`, and 
1917 /// return the result.
1918 __m128 _mm_or_ps (__m128 a, __m128 b) pure @safe
1919 {
1920     static if (DMD_with_DSIMD)
1921         return cast(__m128)__simd(XMM.ORPS, a, b);
1922     else
1923         return cast(__m128)(cast(__m128i)a | cast(__m128i)b);
1924 }
1925 
1926 deprecated("Use _mm_avg_pu8 instead") alias _m_pavgb = _mm_avg_pu8;///
1927 deprecated("Use _mm_avg_pu16 instead") alias _m_pavgw = _mm_avg_pu16;///
1928 deprecated("Use _mm_extract_pi16 instead") alias _m_pextrw = _mm_extract_pi16;///
1929 deprecated("Use _mm_insert_pi16 instead") alias _m_pinsrw = _mm_insert_pi16;///
1930 deprecated("Use _mm_max_pi16 instead") alias _m_pmaxsw = _mm_max_pi16;///
1931 deprecated("Use _mm_max_pu8 instead") alias _m_pmaxub = _mm_max_pu8;///
1932 deprecated("Use _mm_min_pi16 instead") alias _m_pminsw = _mm_min_pi16;///
1933 deprecated("Use _mm_min_pu8 instead") alias _m_pminub = _mm_min_pu8;///
1934 deprecated("Use _mm_movemask_pi8 instead") alias _m_pmovmskb = _mm_movemask_pi8;///
1935 deprecated("Use _mm_mulhi_pu16 instead") alias _m_pmulhuw = _mm_mulhi_pu16;///
1936 
1937 enum _MM_HINT_T0  = 3; ///
1938 enum _MM_HINT_T1  = 2; ///
1939 enum _MM_HINT_T2  = 1; ///
1940 enum _MM_HINT_NTA = 0; ///
1941 
1942 
1943 version(LDC)
1944 {
1945     // Starting with LLVM 10, it seems llvm.prefetch has changed its name.
1946     // Was reported at: https://github.com/ldc-developers/ldc/issues/3397
1947     static if (__VERSION__ >= 2091) 
1948     {
1949         pragma(LDC_intrinsic, "llvm.prefetch.p0i8") // was "llvm.prefetch"
1950             void llvm_prefetch_fixed(void* ptr, uint rw, uint locality, uint cachetype) pure @safe;
1951     }
1952 }
1953 
1954 /// Fetch the line of data from memory that contains address `p` to a location in the 
/// cache hierarchy specified by the `locality` hint.
///
/// Warning: `locality` is a compile-time parameter, unlike in the Intel Intrinsics API.
1958 void _mm_prefetch(int locality)(const(void)* p) pure @trusted
1959 {
1960     static if (GDC_with_SSE)
1961     {
1962         return __builtin_prefetch(p, (locality & 0x4) >> 2, locality & 0x3);
1963     }
1964     else static if (DMD_with_DSIMD)
1965     {
1966         enum bool isWrite = (locality & 0x4) != 0;
1967         enum level = locality & 3;
1968         return prefetch!(isWrite, level)(p);
1969     }
1970     else version(LDC)
1971     {
1972         static if (__VERSION__ >= 2091)
1973         {
1974             // const_cast here. `llvm_prefetch` wants a mutable pointer
1975             llvm_prefetch_fixed( cast(void*)p, 0, locality, 1);
1976         }
1977         else
1978         {
1979             // const_cast here. `llvm_prefetch` wants a mutable pointer
1980             llvm_prefetch( cast(void*)p, 0, locality, 1);
1981         }
1982     }
1983     else version(D_InlineAsm_X86_64)
1984     {
1985         static if (locality == _MM_HINT_NTA)
1986         {
1987             asm pure nothrow @nogc @trusted
1988             {
1989                 mov RAX, p;
1990                 prefetchnta [RAX];
1991             }
1992         }
1993         else static if (locality == _MM_HINT_T0)
1994         {
1995             asm pure nothrow @nogc @trusted
1996             {
1997                 mov RAX, p;
1998                 prefetcht0 [RAX];
1999             }
2000         }
2001         else static if (locality == _MM_HINT_T1)
2002         {
2003             asm pure nothrow @nogc @trusted
2004             {
2005                 mov RAX, p;
2006                 prefetcht1 [RAX];
2007             }
2008         }
2009         else static if (locality == _MM_HINT_T2)
2010         {
2011             asm pure nothrow @nogc @trusted
2012             {
2013                 mov RAX, p;
2014                 prefetcht2 [RAX];
2015             }
2016         }
2017         else
2018             assert(false); // invalid locality hint
2019     }
2020     else version(D_InlineAsm_X86)
2021     {
2022         static if (locality == _MM_HINT_NTA)
2023         {
2024             asm pure nothrow @nogc @trusted
2025             {
2026                 mov EAX, p;
2027                 prefetchnta [EAX];
2028             }
2029         }
2030         else static if (locality == _MM_HINT_T0)
2031         {
2032             asm pure nothrow @nogc @trusted
2033             {
2034                 mov EAX, p;
2035                 prefetcht0 [EAX];
2036             }
2037         }
2038         else static if (locality == _MM_HINT_T1)
2039         {
2040             asm pure nothrow @nogc @trusted
2041             {
2042                 mov EAX, p;
2043                 prefetcht1 [EAX];
2044             }
2045         }
2046         else static if (locality == _MM_HINT_T2)
2047         {
2048             asm pure nothrow @nogc @trusted
2049             {
2050                 mov EAX, p;
2051                 prefetcht2 [EAX];
2052             }
2053         }
2054         else 
2055             assert(false); // invalid locality hint
2056     }
2057     else
2058     {
2059         // Generic version: do nothing. From bitter experience, 
2060         // it's unlikely you get ANY speed-up with manual prefetching.
2061         // Prefetching or not doesn't change program behaviour.
2062     }
2063 }
2064 unittest
2065 {
2066     // From Intel documentation:
2067     // "The amount of data prefetched is also processor implementation-dependent. It will, however, be a minimum of 
2068     // 32 bytes."
2069     ubyte[256] cacheline; // though it seems it cannot generate GP fault
2070     _mm_prefetch!_MM_HINT_T0(cacheline.ptr); 
2071     _mm_prefetch!_MM_HINT_T1(cacheline.ptr); 
2072     _mm_prefetch!_MM_HINT_T2(cacheline.ptr); 
2073     _mm_prefetch!_MM_HINT_NTA(cacheline.ptr); 
2074 }
2075 
2076 deprecated("Use _mm_sad_pu8 instead") alias _m_psadbw = _mm_sad_pu8;///
2077 deprecated("Use _mm_shuffle_pi16 instead") alias _m_pshufw = _mm_shuffle_pi16;///
2078 
2079 
/// Compute the approximate reciprocal of packed single-precision (32-bit) floating-point elements in `a`, 
2081 /// and return the results. The maximum relative error for this approximation is less than 1.5*2^-12.
2082 __m128 _mm_rcp_ps (__m128 a) pure @trusted
2083 {
2084     static if (DMD_with_DSIMD)
2085     {
2086         return cast(__m128) __simd(XMM.RCPPS, a);
2087     }
2088     else static if (GDC_with_SSE)
2089     {
2090         return __builtin_ia32_rcpps(a);
2091     }
2092     else static if (LDC_with_SSE1)
2093     {
2094         return __builtin_ia32_rcpps(a);
2095     }
2096     else
2097     {        
2098         a.ptr[0] = 1.0f / a.array[0];
2099         a.ptr[1] = 1.0f / a.array[1];
2100         a.ptr[2] = 1.0f / a.array[2];
2101         a.ptr[3] = 1.0f / a.array[3];
2102         return a;
2103     }
2104 }
2105 unittest
2106 {
2107     __m128 A = _mm_setr_ps(2.34f, -70000.0f, 0.00001f, 345.5f);
2108     __m128 groundTruth = _mm_set1_ps(1.0f) / A;
2109     __m128 result = _mm_rcp_ps(A);
2110     foreach(i; 0..4)
2111     {
2112         double relError = (cast(double)(groundTruth.array[i]) / result.array[i]) - 1;
2113         assert(abs_double(relError) < 0.00037); // 1.5*2^-12 is 0.00036621093
2114     }
2115 }
2116 
2117 /// Compute the approximate reciprocal of the lower single-precision (32-bit) floating-point element in `a`, store it 
2118 /// in the lower element of the result, and copy the upper 3 packed elements from `a` to the upper elements of result. 
2119 /// The maximum relative error for this approximation is less than 1.5*2^-12.
2120 __m128 _mm_rcp_ss (__m128 a) pure @trusted
2121 {
2122     static if (DMD_with_DSIMD)
2123     {
2124         return cast(__m128) __simd(XMM.RCPSS, a);
2125     }
2126     else static if (GDC_with_SSE)
2127     {
2128         return __builtin_ia32_rcpss(a);
2129     }
2130     else static if (LDC_with_SSE1)
2131     {
2132         return __builtin_ia32_rcpss(a);
2133     }
2134     else
2135     {
2136         a.ptr[0] = 1.0f / a.array[0];
2137         return a;
2138     }
2139 }
2140 unittest
2141 {
2142     __m128 A = _mm_setr_ps(2.34f, -70000.0f, 0.00001f, 345.5f);
2143     __m128 correct = _mm_setr_ps(1 / 2.34f, -70000.0f, 0.00001f, 345.5f);
2144     __m128 R = _mm_rcp_ss(A);
2145     double relError = (cast(double)(correct.array[0]) / R.array[0]) - 1;
2146     assert(abs_double(relError) < 0.00037); // 1.5*2^-12 is 0.00036621093
2147     assert(R.array[1] == correct.array[1]);
2148     assert(R.array[2] == correct.array[2]);
2149     assert(R.array[3] == correct.array[3]);
2150 }
2151 
2152 /// Compute the approximate reciprocal square root of packed single-precision (32-bit) floating-point elements in `a`. 
2153 /// The maximum relative error for this approximation is less than 1.5*2^-12.
2154 __m128 _mm_rsqrt_ps (__m128 a) pure @trusted
2155 {
2156     static if (DMD_with_DSIMD)
2157     {
2158         return cast(__m128) __simd(XMM.RSQRTPS, a);
2159     }
2160     else static if (GDC_with_SSE)
2161     {
2162         return __builtin_ia32_rsqrtps(a);
2163     }
2164     else static if (LDC_with_SSE1)
2165     {
2166         return __builtin_ia32_rsqrtps(a);
2167     }
2168     else version(LDC)
2169     {
2170         a[0] = 1.0f / llvm_sqrt(a[0]);
2171         a[1] = 1.0f / llvm_sqrt(a[1]);
2172         a[2] = 1.0f / llvm_sqrt(a[2]);
2173         a[3] = 1.0f / llvm_sqrt(a[3]);
2174         return a;
2175     }
2176     else
2177     {
2178         a.ptr[0] = 1.0f / sqrt(a.array[0]);
2179         a.ptr[1] = 1.0f / sqrt(a.array[1]);
2180         a.ptr[2] = 1.0f / sqrt(a.array[2]);
2181         a.ptr[3] = 1.0f / sqrt(a.array[3]);
2182         return a;
2183     }
2184 }
2185 unittest
2186 {
2187     __m128 A = _mm_setr_ps(2.34f, 70000.0f, 0.00001f, 345.5f);
2188     __m128 groundTruth = _mm_setr_ps(0.65372045f, 0.00377964473f, 316.227766f, 0.05379921937f);
2189     __m128 result = _mm_rsqrt_ps(A);
2190     foreach(i; 0..4)
2191     {
2192         double relError = (cast(double)(groundTruth.array[i]) / result.array[i]) - 1;
2193         assert(abs_double(relError) < 0.00037); // 1.5*2^-12 is 0.00036621093
2194     }
2195 }
2196 
2197 /// Compute the approximate reciprocal square root of the lower single-precision (32-bit) floating-point element in `a`,
2198 /// store the result in the lower element. Copy the upper 3 packed elements from `a` to the upper elements of result. 
2199 /// The maximum relative error for this approximation is less than 1.5*2^-12.
2200 __m128 _mm_rsqrt_ss (__m128 a) pure @trusted
2201 {   
2202     static if (DMD_with_DSIMD)
2203     {
2204         return cast(__m128) __simd(XMM.RSQRTSS, a);
2205     }
2206     else static if (GDC_with_SSE)
2207     {
2208         return __builtin_ia32_rsqrtss(a);
2209     }
2210     else static if (LDC_with_SSE1)
2211     {
2212         return __builtin_ia32_rsqrtss(a);
2213     }
2214     else version(LDC)
2215     {
2216         a[0] = 1.0f / llvm_sqrt(a[0]);
2217         return a;
2218     }
2219     else
2220     {
2221         a[0] = 1.0f / sqrt(a[0]);
2222         return a;
2223     }
2224 }
2225 unittest // this one test 4 different intrinsics: _mm_rsqrt_ss, _mm_rsqrt_ps, _mm_rcp_ps, _mm_rcp_ss
2226 {
    double maxRelativeError = 0.000245; // -72 dB; in practice the approximation seems more precise than the documented bound
2228     void testApproximateSSE(float number) nothrow @nogc
2229     {
2230         __m128 A = _mm_set1_ps(number);
2231 
2232         // test _mm_rcp_ps
2233         __m128 B = _mm_rcp_ps(A);
2234         foreach(i; 0..4)
2235         {
2236             double exact = 1.0f / A.array[i];
2237             double ratio = cast(double)(B.array[i]) / cast(double)(exact);
2238             assert(abs_double(ratio - 1) <= maxRelativeError);
2239         }
2240 
2241         // test _mm_rcp_ss
2242         {
2243             B = _mm_rcp_ss(A);
2244             double exact = 1.0f / A.array[0];
2245             double ratio = cast(double)(B.array[0]) / cast(double)(exact);
2246             assert(abs_double(ratio - 1) <= maxRelativeError);
2247         }
2248 
2249         // test _mm_rsqrt_ps
2250         B = _mm_rsqrt_ps(A);
2251         foreach(i; 0..4)
2252         {
2253             double exact = 1.0f / sqrt(A.array[i]);
2254             double ratio = cast(double)(B.array[i]) / cast(double)(exact);
2255             assert(abs_double(ratio - 1) <= maxRelativeError);
2256         }
2257 
2258         // test _mm_rsqrt_ss
2259         {
2260             B = _mm_rsqrt_ss(A);
2261             double exact = 1.0f / sqrt(A.array[0]);
2262             double ratio = cast(double)(B.array[0]) / cast(double)(exact);
2263             assert(abs_double(ratio - 1) <= maxRelativeError);
2264         }
2265     }
2266 
2267     testApproximateSSE(0.00001f);
2268     testApproximateSSE(1.1f);
2269     testApproximateSSE(345.0f);
2270     testApproximateSSE(2.45674864151f);
2271     testApproximateSSE(700000.0f);
2272     testApproximateSSE(10000000.0f);
2273     testApproximateSSE(27841456468.0f);
2274 }
2275 
2276 /// Compute the absolute differences of packed unsigned 8-bit integers in `a` and `b`, then horizontally sum each 
2277 /// consecutive 8 differences to produce four unsigned 16-bit integers, and pack these unsigned 16-bit integers in the 
2278 /// low 16 bits of result.
2279 __m64 _mm_sad_pu8 (__m64 a, __m64 b) pure @safe
2280 {
2281     return to_m64(_mm_sad_epu8(to_m128i(a), to_m128i(b)));
2282 }
2283 
2284 /// Set the exception mask bits of the MXCSR control and status register to the value in unsigned 32-bit integer 
2285 /// `_MM_MASK_xxxx`. The exception mask may contain any of the following flags: `_MM_MASK_INVALID`, `_MM_MASK_DIV_ZERO`,
2286 /// `_MM_MASK_DENORM`, `_MM_MASK_OVERFLOW`, `_MM_MASK_UNDERFLOW`, `_MM_MASK_INEXACT`.
2287 void _MM_SET_EXCEPTION_MASK(int _MM_MASK_xxxx) @safe
2288 {
2289     // Note: unsupported on ARM
2290     _mm_setcsr((_mm_getcsr() & ~_MM_MASK_MASK) | _MM_MASK_xxxx);
2291 }
2292 
2293 /// Set the exception state bits of the MXCSR control and status register to the value in unsigned 32-bit integer 
2294 /// `_MM_EXCEPT_xxxx`. The exception state may contain any of the following flags: `_MM_EXCEPT_INVALID`, 
2295 /// `_MM_EXCEPT_DIV_ZERO`, `_MM_EXCEPT_DENORM`, `_MM_EXCEPT_OVERFLOW`, `_MM_EXCEPT_UNDERFLOW`, `_MM_EXCEPT_INEXACT`.
2296 void _MM_SET_EXCEPTION_STATE(int _MM_EXCEPT_xxxx) @safe
2297 {
2298     // Note: unsupported on ARM
2299     _mm_setcsr((_mm_getcsr() & ~_MM_EXCEPT_MASK) | _MM_EXCEPT_xxxx);
2300 }
2301 
2302 /// Set the flush zero bits of the MXCSR control and status register to the value in unsigned 32-bit integer 
2303 /// `_MM_FLUSH_xxxx`. The flush zero may contain any of the following flags: `_MM_FLUSH_ZERO_ON` or `_MM_FLUSH_ZERO_OFF`.
2304 void _MM_SET_FLUSH_ZERO_MODE(int _MM_FLUSH_xxxx) @safe
2305 {
2306     _mm_setcsr((_mm_getcsr() & ~_MM_FLUSH_ZERO_MASK) | _MM_FLUSH_xxxx);
2307 }
2308 
2309 /// Set packed single-precision (32-bit) floating-point elements with the supplied values.
2310 __m128 _mm_set_ps (float e3, float e2, float e1, float e0) pure @trusted
2311 {
2312     // Note: despite appearances, generates sensible code,
2313     //       inlines correctly and is constant folded
2314     float[4] result = [e0, e1, e2, e3];
2315     return loadUnaligned!(float4)(result.ptr);
2316 }
2317 unittest
2318 {
2319     __m128 A = _mm_set_ps(3, 2, 1, 546);
2320     float[4] correct = [546.0f, 1.0f, 2.0f, 3.0f];
2321     assert(A.array == correct);
2322 }
2323 
2324 deprecated("Use _mm_set1_ps instead") alias _mm_set_ps1 = _mm_set1_ps; ///
2325 
2326 /// Set the rounding mode bits of the MXCSR control and status register to the value in unsigned 32-bit integer 
2327 /// `_MM_ROUND_xxxx`. The rounding mode may contain any of the following flags: `_MM_ROUND_NEAREST`, `_MM_ROUND_DOWN`, 
2328 /// `_MM_ROUND_UP`, `_MM_ROUND_TOWARD_ZERO`.
2329 void _MM_SET_ROUNDING_MODE(int _MM_ROUND_xxxx) @safe
2330 {
2331     // Work-around for https://gcc.gnu.org/bugzilla/show_bug.cgi?id=98607
2332     version(GNU) asm @trusted { "" : : : "memory"; }
2333     _mm_setcsr((_mm_getcsr() & ~_MM_ROUND_MASK) | _MM_ROUND_xxxx);
2334 }
2335 
2336 /// Copy single-precision (32-bit) floating-point element `a` to the lower element of result, and zero the upper 3 elements.
2337 __m128 _mm_set_ss (float a) pure @trusted
2338 {
2339     static if (DMD_with_DSIMD)
2340     {
2341         return cast(__m128) __simd(XMM.LODSS, a);
2342     }
2343     else
2344     {
2345         __m128 r = _mm_setzero_ps();
2346         r.ptr[0] = a;
2347         return r;
2348     }
2349 }
2350 unittest
2351 {
2352     float[4] correct = [42.0f, 0.0f, 0.0f, 0.0f];
2353     __m128 A = _mm_set_ss(42.0f);
2354     assert(A.array == correct);
2355 }
2356 
2357 /// Broadcast single-precision (32-bit) floating-point value `a` to all elements.
2358 __m128 _mm_set1_ps (float a) pure @trusted
2359 {
2360     __m128 r = a;
2361     return r;
2362 }
2363 unittest
2364 {
2365     float[4] correct = [42.0f, 42.0f, 42.0f, 42.0f];
2366     __m128 A = _mm_set1_ps(42.0f);
2367     assert(A.array == correct);
2368 }
2369 
2370 /// Set the MXCSR control and status register with the value in unsigned 32-bit integer `controlWord`.
2371 void _mm_setcsr(uint controlWord) @trusted
2372 {
2373     static if (LDC_with_ARM)
2374     {
        // Convert the SSE control word to the ARM one. This is only done partially,
        // and supports just the rounding mode and flush-to-zero bits.
2377 
2378         // "To alter some bits of a VFP system register without 
2379         // affecting other bits, use a read-modify-write procedure"
2380         uint fpscr = arm_get_fpcr();
2381         
        // Bits 23:22 hold the rounding mode; NEON itself does not use them.
2383         fpscr = fpscr & ~_MM_ROUND_MASK_ARM;
2384         switch(controlWord & _MM_ROUND_MASK)
2385         {
2386             default:
2387             case _MM_ROUND_NEAREST:     fpscr |= _MM_ROUND_NEAREST_ARM;     break;
2388             case _MM_ROUND_DOWN:        fpscr |= _MM_ROUND_DOWN_ARM;        break;
2389             case _MM_ROUND_UP:          fpscr |= _MM_ROUND_UP_ARM;          break;
2390             case _MM_ROUND_TOWARD_ZERO: fpscr |= _MM_ROUND_TOWARD_ZERO_ARM; break;
2391         }
2392         fpscr = fpscr & ~_MM_FLUSH_ZERO_MASK_ARM;
2393         if (controlWord & _MM_FLUSH_ZERO_MASK)
2394             fpscr |= _MM_FLUSH_ZERO_MASK_ARM;
2395         arm_set_fpcr(fpscr);
2396     }
2397     else version(GNU)
2398     {
2399         static if (GDC_with_SSE)
2400         {
2401             // Work-around for https://gcc.gnu.org/bugzilla/show_bug.cgi?id=98607
2402             version(GNU) asm @trusted { "" : : : "memory"; }
2403             __builtin_ia32_ldmxcsr(controlWord);
2404         }
2405         else version(X86)
2406         {
2407             asm pure nothrow @nogc @trusted
2408             {
2409                 "ldmxcsr %0;\n" 
2410                   : 
2411                   : "m" (controlWord)
2412                   : ;
2413             }
2414         }
2415         else
2416             static assert(false);
2417     }
2418     else version (InlineX86Asm)
2419     {
2420         asm pure nothrow @nogc @safe
2421         {
2422             ldmxcsr controlWord;
2423         }
2424     }
2425     else
2426         static assert(0, "Not yet supported");
2427 }
2428 unittest
2429 {
2430     _mm_setcsr(_mm_getcsr());
2431 }
2432 
2433 /// Set packed single-precision (32-bit) floating-point elements with the supplied values in reverse order.
2434 __m128 _mm_setr_ps (float e3, float e2, float e1, float e0) pure @trusted
2435 {
2436     float[4] result = [e3, e2, e1, e0];
2437     return loadUnaligned!(float4)(result.ptr);
2438 }
2439 unittest
2440 {
2441     __m128 A = _mm_setr_ps(3, 2, 1, 546);
2442     float[4] correct = [3.0f, 2.0f, 1.0f, 546.0f];
2443     assert(A.array == correct);
2444     assert(A.array[0] == 3.0f);
2445     assert(A.array[1] == 2.0f);
2446     assert(A.array[2] == 1.0f);
2447     assert(A.array[3] == 546.0f);
2448 }
2449 
2450 /// Return vector of type `__m128` with all elements set to zero.
2451 __m128 _mm_setzero_ps() pure @trusted
2452 {
2453     float4 r;
2454     r = 0.0f;
2455     return r;
2456 }
2457 unittest
2458 {
2459     __m128 R = _mm_setzero_ps();
2460     float[4] correct = [0.0f, 0, 0, 0];
2461     assert(R.array == correct);
2462 }
2463 
2464 /// Perform a serializing operation on all store-to-memory instructions that were issued prior 
2465 /// to this instruction. Guarantees that every store instruction that precedes, in program order, 
2466 /// is globally visible before any store instruction which follows the fence in program order.
2467 void _mm_sfence() @trusted
2468 {
2469     version(GNU)
2470     {
2471         static if (GDC_with_SSE)
2472         {
2473             __builtin_ia32_sfence();
2474         }
2475         else version(X86)
2476         {
2477             asm pure nothrow @nogc @trusted
2478             {
2479                 "sfence;\n" : : : ;
2480             }
2481         }
2482         else
2483             static assert(false);
2484     }
2485     else static if (LDC_with_SSE1)
2486     {
2487         __builtin_ia32_sfence();
2488     }
2489     else static if (DMD_with_asm)
2490     {
2491         asm nothrow @nogc pure @safe
2492         {
2493             sfence;
2494         }
2495     }
2496     else version(LDC)
2497     {
2498         llvm_memory_fence(); // PERF: this generates mfence instead of sfence
2499     }
2500     else
2501         static assert(false);
2502 }
2503 unittest
2504 {
2505     _mm_sfence();
2506 }
2507 
/// Shuffle 16-bit integers in `a` using the control in `imm8`.
/// Warning: the shuffle value `imm8` is given at compile-time instead of runtime.
2509 __m64 _mm_shuffle_pi16(int imm8)(__m64 a) pure @safe
2510 {
2511     return cast(__m64) shufflevector!(short4, ( (imm8 >> 0) & 3 ),
2512                                               ( (imm8 >> 2) & 3 ),
2513                                               ( (imm8 >> 4) & 3 ),
2514                                               ( (imm8 >> 6) & 3 ))(cast(short4)a, cast(short4)a);
2515 }
2516 unittest
2517 {
2518     __m64 A = _mm_setr_pi16(0, 1, 2, 3);
2519     enum int SHUFFLE = _MM_SHUFFLE(0, 1, 2, 3);
2520     short4 B = cast(short4) _mm_shuffle_pi16!SHUFFLE(A);
2521     short[4] expectedB = [ 3, 2, 1, 0 ];
2522     assert(B.array == expectedB);
2523 }
2524 
/// Shuffle single-precision (32-bit) floating-point elements in `a` and `b` using the control in `imm`.
/// Warning: the shuffle value `imm` is given at compile-time instead of runtime.
2526 __m128 _mm_shuffle_ps(ubyte imm)(__m128 a, __m128 b) pure @safe
2527 {
2528     return shufflevector!(__m128, imm & 3, (imm>>2) & 3, 4 + ((imm>>4) & 3), 4 + ((imm>>6) & 3) )(a, b);
2529 }
2530 
2531 /// Compute the square root of packed single-precision (32-bit) floating-point elements in `a`.
2532 __m128 _mm_sqrt_ps(__m128 a) @trusted
2533 {
2534     static if (GDC_with_SSE)
2535     {
2536         return __builtin_ia32_sqrtps(a);
2537     }
2538     else version(LDC)
2539     {
        // __builtin_ia32_sqrtps disappeared with LDC 1.11
2541         static if (__VERSION__ < 2081)
2542             return __builtin_ia32_sqrtps(a);
2543         else
2544         {
2545             a[0] = llvm_sqrt(a[0]);
2546             a[1] = llvm_sqrt(a[1]);
2547             a[2] = llvm_sqrt(a[2]);
2548             a[3] = llvm_sqrt(a[3]);
2549             return a;
2550         }
2551     }
2552     else
2553     {
2554         a.ptr[0] = sqrt(a.array[0]);
2555         a.ptr[1] = sqrt(a.array[1]);
2556         a.ptr[2] = sqrt(a.array[2]);
2557         a.ptr[3] = sqrt(a.array[3]);
2558         return a;
2559     }
2560 }
2561 unittest
2562 {
2563     __m128 A = _mm_sqrt_ps(_mm_set1_ps(4.0f));
2564     assert(A.array[0] == 2.0f);
2565     assert(A.array[1] == 2.0f);
2566     assert(A.array[2] == 2.0f);
2567     assert(A.array[3] == 2.0f);
2568 }
2569 
2570 /// Compute the square root of the lower single-precision (32-bit) floating-point element in `a`, store it in the lower
2571 /// element, and copy the upper 3 packed elements from `a` to the upper elements of result.
2572 __m128 _mm_sqrt_ss(__m128 a) @trusted
2573 {
2574     static if (GDC_with_SSE)
2575     {
2576         return __builtin_ia32_sqrtss(a);
2577     }
2578     else version(LDC)
2579     {
2580         a.ptr[0] = llvm_sqrt(a.array[0]);
2581         return a;
2582     }
2583     else
2584     {   
2585         a.ptr[0] = sqrt(a.array[0]);
2586         return a;
2587     }
2588 }
2589 unittest
2590 {
2591     __m128 A = _mm_sqrt_ss(_mm_set1_ps(4.0f));
2592     assert(A.array[0] == 2.0f);
2593     assert(A.array[1] == 4.0f);
2594     assert(A.array[2] == 4.0f);
2595     assert(A.array[3] == 4.0f);
2596 }
2597 
2598 /// Store 128-bits (composed of 4 packed single-precision (32-bit) floating-point elements) from `a` into memory. 
2599 /// `mem_addr` must be aligned on a 16-byte boundary or a general-protection exception may be generated.
2600 void _mm_store_ps (float* mem_addr, __m128 a) pure
2601 {
2602     __m128* aligned = cast(__m128*)mem_addr;
2603     *aligned = a;
2604 }
2605 
2606 deprecated("Use _mm_store1_ps instead") alias _mm_store_ps1 = _mm_store1_ps; ///
2607 
2608 /// Store the lower single-precision (32-bit) floating-point element from `a` into memory. 
2609 /// `mem_addr` does not need to be aligned on any particular boundary.
2610 void _mm_store_ss (float* mem_addr, __m128 a) pure @safe
2611 {
2612     *mem_addr = a.array[0];
2613 }
2614 unittest
2615 {
2616     float a;
2617     _mm_store_ss(&a, _mm_set_ps(3, 2, 1, 546));
2618     assert(a == 546);
2619 }
2620 
2621 /// Store the lower single-precision (32-bit) floating-point element from `a` into 4 contiguous elements in memory. 
2622 /// `mem_addr` must be aligned on a 16-byte boundary or a general-protection exception may be generated.
2623 void _mm_store1_ps(float* mem_addr, __m128 a) pure @trusted // TODO: shouldn't be trusted
2624 {
2625     __m128* aligned = cast(__m128*)mem_addr;
2626     __m128 r;
2627     r.ptr[0] = a.array[0];
2628     r.ptr[1] = a.array[0];
2629     r.ptr[2] = a.array[0];
2630     r.ptr[3] = a.array[0];
2631     *aligned = r;
2632 }
2633 unittest
2634 {
2635     align(16) float[4] A;
2636     _mm_store1_ps(A.ptr, _mm_set_ss(42.0f));
2637     float[4] correct = [42.0f, 42, 42, 42];
2638     assert(A == correct);
2639 }
2640 
2641 /// Store the upper 2 single-precision (32-bit) floating-point elements from `a` into memory.
2642 void _mm_storeh_pi(__m64* p, __m128 a) pure @trusted
2643 {
2644     long2 la = cast(long2)a;
2645     (*p).ptr[0] = la.array[1];
2646 }
2647 unittest
2648 {
2649     __m64 R = _mm_setzero_si64();
2650     long2 A = [13, 25];
2651     _mm_storeh_pi(&R, cast(__m128)A);
2652     assert(R.array[0] == 25);
2653 }
2654 
2655 /// Store the lower 2 single-precision (32-bit) floating-point elements from `a` into memory.
2656 void _mm_storel_pi(__m64* p, __m128 a) pure @trusted
2657 {
2658     long2 la = cast(long2)a;
2659     (*p).ptr[0] = la.array[0];
2660 }
2661 unittest
2662 {
2663     __m64 R = _mm_setzero_si64();
2664     long2 A = [13, 25];
2665     _mm_storel_pi(&R, cast(__m128)A);
2666     assert(R.array[0] == 13);
2667 }
2668 
2669 /// Store 4 single-precision (32-bit) floating-point elements from `a` into memory in reverse order. 
2670 /// `mem_addr` must be aligned on a 16-byte boundary or a general-protection exception may be generated.
2671 void _mm_storer_ps(float* mem_addr, __m128 a) pure @trusted // TODO should not be trusted
2672 {
2673     __m128* aligned = cast(__m128*)mem_addr;
2674     __m128 r;
2675     r.ptr[0] = a.array[3];
2676     r.ptr[1] = a.array[2];
2677     r.ptr[2] = a.array[1];
2678     r.ptr[3] = a.array[0];
2679     *aligned = r;
2680 }
2681 unittest
2682 {
2683     align(16) float[4] A;
2684     _mm_storer_ps(A.ptr, _mm_setr_ps(1.0f, 2, 3, 4));
2685     float[4] correct = [4.0f, 3.0f, 2.0f, 1.0f];
2686     assert(A == correct);
2687 }
2688 
2689 /// Store 128-bits (composed of 4 packed single-precision (32-bit) floating-point elements) from `a` into memory. 
2690 /// `mem_addr` does not need to be aligned on any particular boundary.
void _mm_storeu_ps(float* mem_addr, __m128 a) pure @safe // TODO: should not be @safe
2692 {
2693     storeUnaligned!(float4)(a, mem_addr);
2694 }
2695 
2696 /// Store 64-bits of integer data from `a` into memory using a non-temporal memory hint.
2697 void _mm_stream_pi (__m64* mem_addr, __m64 a)
2698 {
    // BUG: see `_mm_stream_ps` for an explanation of why we don't implement non-temporal moves
2700     *mem_addr = a; // it's a regular move instead
2701 }
2702 
/// Store 128-bits (composed of 4 packed single-precision (32-bit) floating-point elements) from `a` into memory using
/// a non-temporal memory hint. `mem_addr` must be aligned on a 16-byte boundary or a general-protection exception may be
2705 /// generated.
2706 void _mm_stream_ps (float* mem_addr, __m128 a)
2707 {
2708     // BUG: can't implement non-temporal store with LDC inlineIR since !nontemporal
2709     // needs some IR outside this function that would say:
2710     //
2711     //  !0 = !{ i32 1 }
2712     //
2713     // It's a LLVM IR metadata description.
2714     __m128* dest = cast(__m128*)mem_addr;
2715     *dest = a; // it's a regular move instead
2716 }
2717 unittest
2718 {
2719     align(16) float[4] A;
2720     _mm_stream_ps(A.ptr, _mm_set1_ps(78.0f));
2721     assert(A[0] == 78.0f && A[1] == 78.0f && A[2] == 78.0f && A[3] == 78.0f);
2722 }
2723 
2724 /// Subtract packed single-precision (32-bit) floating-point elements in `b` from packed single-precision (32-bit) 
2725 /// floating-point elements in `a`.
2726 __m128 _mm_sub_ps(__m128 a, __m128 b) pure @safe
2727 {
2728     return a - b;
2729 }
2730 unittest
2731 {
2732     __m128 a = [1.5f, -2.0f, 3.0f, 1.0f];
2733     a = _mm_sub_ps(a, a);
2734     float[4] correct = [0.0f, 0.0f, 0.0f, 0.0f];
2735     assert(a.array == correct);
2736 }
2737 
2738 /// Subtract the lower single-precision (32-bit) floating-point element in `b` from the lower single-precision (32-bit)
/// floating-point element in `a`, store the subtraction result in the lower element of result, and copy the upper 3 
/// packed elements from `a` to the upper elements of result.
2741 __m128 _mm_sub_ss(__m128 a, __m128 b) pure @safe
2742 {
2743     static if (DMD_with_DSIMD)
2744         return cast(__m128) __simd(XMM.SUBSS, a, b);
2745     else static if (GDC_with_SSE)
2746         return __builtin_ia32_subss(a, b);
2747     else
2748     {
2749         a[0] -= b[0];
2750         return a;
2751     }
2752 }
2753 unittest
2754 {
2755     __m128 a = [1.5f, -2.0f, 3.0f, 1.0f];
2756     a = _mm_sub_ss(a, a);
2757     float[4] correct = [0.0f, -2.0, 3.0f, 1.0f];
2758     assert(a.array == correct);
2759 }
2760 
/// Transpose the 4x4 matrix formed by the 4 rows of single-precision (32-bit) floating-point elements in `row0`, `row1`, 
/// `row2`, and `row3`, and store the transposed matrix in these vectors (`row0` now contains column 0, etc.).
2763 void _MM_TRANSPOSE4_PS (ref __m128 row0, ref __m128 row1, ref __m128 row2, ref __m128 row3) pure @safe
2764 {
2765     __m128 tmp3, tmp2, tmp1, tmp0;
2766     tmp0 = _mm_unpacklo_ps(row0, row1);
2767     tmp2 = _mm_unpacklo_ps(row2, row3);
2768     tmp1 = _mm_unpackhi_ps(row0, row1);
2769     tmp3 = _mm_unpackhi_ps(row2, row3);
2770     row0 = _mm_movelh_ps(tmp0, tmp2);
2771     row1 = _mm_movehl_ps(tmp2, tmp0);
2772     row2 = _mm_movelh_ps(tmp1, tmp3);
2773     row3 = _mm_movehl_ps(tmp3, tmp1);
2774 }
2775 unittest
2776 {
2777     __m128 l0 = _mm_setr_ps(0, 1, 2, 3);
2778     __m128 l1 = _mm_setr_ps(4, 5, 6, 7);
2779     __m128 l2 = _mm_setr_ps(8, 9, 10, 11);
2780     __m128 l3 = _mm_setr_ps(12, 13, 14, 15);
2781     _MM_TRANSPOSE4_PS(l0, l1, l2, l3);
2782     float[4] r0 = [0.0f, 4, 8, 12];
2783     float[4] r1 = [1.0f, 5, 9, 13];
2784     float[4] r2 = [2.0f, 6, 10, 14];
2785     float[4] r3 = [3.0f, 7, 11, 15];
2786     assert(l0.array == r0);
2787     assert(l1.array == r1);
2788     assert(l2.array == r2);
2789     assert(l3.array == r3);
2790 }
2791 
// Note: the only difference between the `ucomi` and `comi` intrinsics is the signalling
//       behaviour on quiet NaNs. Aliasing them is therefore slightly incorrect, but the
//       case where you would want to differentiate between qNaN and sNaN and treat them
//       differently on purpose seems extremely rare.
2796 alias _mm_ucomieq_ss = _mm_comieq_ss;
2797 alias _mm_ucomige_ss = _mm_comige_ss;
2798 alias _mm_ucomigt_ss = _mm_comigt_ss;
2799 alias _mm_ucomile_ss = _mm_comile_ss;
2800 alias _mm_ucomilt_ss = _mm_comilt_ss;
2801 alias _mm_ucomineq_ss = _mm_comineq_ss;
2802 
2803 /// Return vector of type `__m128` with undefined elements.
2804 __m128 _mm_undefined_ps() pure @safe
2805 {
2806     __m128 undef = void;
2807     return undef;
2808 }
2809 
/// Unpack and interleave single-precision (32-bit) floating-point elements from the high half of `a` and `b`.
2811 __m128 _mm_unpackhi_ps (__m128 a, __m128 b) pure @trusted
2812 {
2813     version(LDC)
2814     {
        // x86: the plain version generates unpckhps with LDC 1.0.0 -O1, but shufflevector needs 8 fewer instructions at -O0
2816         return shufflevectorLDC!(__m128, 2, 6, 3, 7)(a, b);
2817     }
2818     else
2819     {
2820         __m128 r;
2821         r.ptr[0] = a.array[2];
2822         r.ptr[1] = b.array[2];
2823         r.ptr[2] = a.array[3];
2824         r.ptr[3] = b.array[3];
2825         return r;
2826     }
2827 }
2828 unittest
2829 {
2830     __m128 A = _mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f);
2831     __m128 B = _mm_setr_ps(5.0f, 6.0f, 7.0f, 8.0f);
2832     __m128 R = _mm_unpackhi_ps(A, B);
2833     float[4] correct = [3.0f, 7.0f, 4.0f, 8.0f];
2834     assert(R.array == correct);
2835 }
2836 
2837 /// Unpack and interleave single-precision (32-bit) floating-point elements from the low half of `a` and `b`.
2838 __m128 _mm_unpacklo_ps (__m128 a, __m128 b) pure @trusted
2839 {
2840     version(LDC)
2841     {
        // x86: the plain version generates unpcklps with LDC 1.0.0 -O1, but shufflevector needs 8 fewer instructions at -O0
2843         return shufflevectorLDC!(__m128, 0, 4, 1, 5)(a, b);
2844     }
2845     else
2846     {
2847         __m128 r;
2848         r.ptr[0] = a.array[0];
2849         r.ptr[1] = b.array[0];
2850         r.ptr[2] = a.array[1];
2851         r.ptr[3] = b.array[1];
2852         return r;
2853     }
2854 }
2855 unittest
2856 {
2857     __m128 A = _mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f);
2858     __m128 B = _mm_setr_ps(5.0f, 6.0f, 7.0f, 8.0f);
2859     __m128 R = _mm_unpacklo_ps(A, B);
2860     float[4] correct = [1.0f, 5.0f, 2.0f, 6.0f];
2861     assert(R.array == correct);
2862 }
2863 
2864 /// Compute the bitwise XOR of packed single-precision (32-bit) floating-point elements in `a` and `b`.
2865 __m128 _mm_xor_ps (__m128 a, __m128 b) pure @safe
2866 {
2867     return cast(__m128)(cast(__m128i)a ^ cast(__m128i)b);
2868 }
2869 
2870 
2871 private
2872 {
2873     // Returns: `true` if the pointer is suitably aligned.
2874     bool isPointerAligned(void* p, size_t alignment) pure
2875     {
2876         assert(alignment != 0);
2877         return ( cast(size_t)p & (alignment - 1) ) == 0;
2878     }
2879 
2880     // Returns: next pointer aligned with alignment bytes.
2881     void* nextAlignedPointer(void* start, size_t alignment) pure
2882     {
2883         return cast(void*)nextMultipleOf(cast(size_t)(start), alignment);
2884     }
2885 
    // Returns: the number of bytes to actually allocate when asking
    // for a particular alignment.
2888     @nogc size_t requestedSize(size_t askedSize, size_t alignment) pure
2889     {
2890         enum size_t pointerSize = size_t.sizeof;
2891         return askedSize + alignment - 1 + pointerSize * 3;
2892     }
2893 
    // Store the raw pointer given by malloc, along with size and alignment
2895     @nogc void* storeRawPointerPlusInfo(void* raw, size_t size, size_t alignment) pure
2896     {
2897         enum size_t pointerSize = size_t.sizeof;
2898         char* start = cast(char*)raw + pointerSize * 3;
2899         void* aligned = nextAlignedPointer(start, alignment);
2900         void** rawLocation = cast(void**)(cast(char*)aligned - pointerSize);
2901         *rawLocation = raw;
2902         size_t* sizeLocation = cast(size_t*)(cast(char*)aligned - 2 * pointerSize);
2903         *sizeLocation = size;
2904         size_t* alignmentLocation = cast(size_t*)(cast(char*)aligned - 3 * pointerSize);
2905         *alignmentLocation = alignment;
2906         assert( isPointerAligned(aligned, alignment) );
2907         return aligned;
2908     }
2909 
2910     // Returns: x, multiple of powerOfTwo, so that x >= n.
2911     @nogc size_t nextMultipleOf(size_t n, size_t powerOfTwo) pure nothrow
2912     {
2913         // check power-of-two
2914         assert( (powerOfTwo != 0) && ((powerOfTwo & (powerOfTwo - 1)) == 0));
2915 
2916         size_t mask = ~(powerOfTwo - 1);
2917         return (n + powerOfTwo - 1) & mask;
2918     }
2919 }
2920 
2921 unittest
2922 {
2923     assert(nextMultipleOf(0, 4) == 0);
2924     assert(nextMultipleOf(1, 4) == 4);
2925     assert(nextMultipleOf(2, 4) == 4);
2926     assert(nextMultipleOf(3, 4) == 4);
2927     assert(nextMultipleOf(4, 4) == 4);
2928     assert(nextMultipleOf(5, 4) == 8);
2929 
2930     {
2931         void* p = _mm_malloc(23, 16);
2932         assert(p !is null);
2933         assert(((cast(size_t)p) & 0xf) == 0);
2934         _mm_free(p);
2935     }
2936 
2937     void* nullAlloc = _mm_malloc(0, 32);
2938     assert(nullAlloc != null);
2939     _mm_free(nullAlloc);
2940 }
2941 
2942 // For some reason, order of declaration is important for this one
2943 // so it is misplaced.
// Note: this is just another name for `_mm_cvtss_si32`.
2945 alias _mm_cvt_ss2si = _mm_cvtss_si32;