1 /**
2 * SSE intrinsics.
3 *
4 * Copyright: Copyright Guillaume Piolat 2016-2020.
5 * License:   $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
6 */
7 module inteli.xmmintrin;
8 
9 public import inteli.types;
10 
11 import inteli.internals;
12 
13 import inteli.mmx;
14 import inteli.emmintrin;
15 
16 import core.stdc.stdlib: malloc, free;
17 import core.stdc.string: memcpy;
18 import core.exception: onOutOfMemoryError;
19 
20 version(D_InlineAsm_X86)
21     version = InlineX86Asm;
22 else version(D_InlineAsm_X86_64)
23     version = InlineX86Asm;
24 
25 
26 // SSE1
27 
28 nothrow @nogc:
29 
30 
31 enum int _MM_EXCEPT_INVALID    = 0x0001; /// MXCSR Exception states.
32 enum int _MM_EXCEPT_DENORM     = 0x0002; ///ditto
33 enum int _MM_EXCEPT_DIV_ZERO   = 0x0004; ///ditto
34 enum int _MM_EXCEPT_OVERFLOW   = 0x0008; ///ditto
35 enum int _MM_EXCEPT_UNDERFLOW  = 0x0010; ///ditto
36 enum int _MM_EXCEPT_INEXACT    = 0x0020; ///ditto
37 enum int _MM_EXCEPT_MASK       = 0x003f; /// MXCSR Exception states mask.
38 
39 enum int _MM_MASK_INVALID      = 0x0080; /// MXCSR Exception masks.
40 enum int _MM_MASK_DENORM       = 0x0100; ///ditto
41 enum int _MM_MASK_DIV_ZERO     = 0x0200; ///ditto
42 enum int _MM_MASK_OVERFLOW     = 0x0400; ///ditto
43 enum int _MM_MASK_UNDERFLOW    = 0x0800; ///ditto
44 enum int _MM_MASK_INEXACT      = 0x1000; ///ditto
45 enum int _MM_MASK_MASK         = 0x1f80; /// MXCSR Exception masks mask.
46 
47 enum int _MM_ROUND_NEAREST     = 0x0000; /// MXCSR Rounding mode.
48 enum int _MM_ROUND_DOWN        = 0x2000; ///ditto
49 enum int _MM_ROUND_UP          = 0x4000; ///ditto
50 enum int _MM_ROUND_TOWARD_ZERO = 0x6000; ///ditto
51 enum int _MM_ROUND_MASK        = 0x6000; /// MXCSR Rounding mode mask.
52 
53 enum int _MM_FLUSH_ZERO_MASK   = 0x8000; /// MXCSR Denormal flush to zero mask.
54 enum int _MM_FLUSH_ZERO_ON     = 0x8000; /// MXCSR Denormal flush to zero modes.
55 enum int _MM_FLUSH_ZERO_OFF    = 0x0000; ///ditto
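
// Sanity check: the four rounding-mode values tile exactly `_MM_ROUND_MASK`,
// and the six exception-mask bits tile exactly `_MM_MASK_MASK`.
unittest
{
    static assert((_MM_ROUND_NEAREST | _MM_ROUND_DOWN | _MM_ROUND_UP | _MM_ROUND_TOWARD_ZERO) == _MM_ROUND_MASK);
    static assert((_MM_MASK_INVALID | _MM_MASK_DENORM | _MM_MASK_DIV_ZERO
                   | _MM_MASK_OVERFLOW | _MM_MASK_UNDERFLOW | _MM_MASK_INEXACT) == _MM_MASK_MASK);
}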
56 
57 /// Add packed single-precision (32-bit) floating-point elements in `a` and `b`.
58 __m128 _mm_add_ps(__m128 a, __m128 b) pure @safe
59 {
60     return a + b;
61 }
62 unittest
63 {
64     __m128 a = [1, 2, 3, 4];
65     a = _mm_add_ps(a, a);
66     assert(a.array[0] == 2);
67     assert(a.array[1] == 4);
68     assert(a.array[2] == 6);
69     assert(a.array[3] == 8);
70 }
71 
72 /// Add the lower single-precision (32-bit) floating-point element 
73 /// in `a` and `b`, store the result in the lower element of result, 
74 /// and copy the upper 3 packed elements from `a` to the upper elements of result.
75 __m128 _mm_add_ss(__m128 a, __m128 b) pure @safe
76 {
77     static if (GDC_with_SSE)
78     {
79         return __builtin_ia32_addss(a, b);
80     }
81     else static if (DMD_with_DSIMD)
82     {
83         return cast(__m128) __simd(XMM.ADDSS, a, b);
84     }
85     else
86     {
87         a[0] += b[0];
88         return a;
89     }
90 }
91 unittest
92 {
93     __m128 a = [1, 2, 3, 4];
94     a = _mm_add_ss(a, a);
95     assert(a.array == [2.0f, 2, 3, 4]);
96 }
97 
98 /// Compute the bitwise AND of packed single-precision (32-bit) floating-point elements in `a` and `b`.
99 __m128 _mm_and_ps (__m128 a, __m128 b) pure @safe
100 {
101     return cast(__m128)(cast(__m128i)a & cast(__m128i)b);
102 }
103 unittest
104 {
105     float a = 4.32f;
106     float b = -78.99f;
107     int correct = (*cast(int*)(&a)) & (*cast(int*)(&b));
108     __m128 A = _mm_set_ps(a, b, a, b);
109     __m128 B = _mm_set_ps(b, a, b, a);
110     int4 R = cast(int4)( _mm_and_ps(A, B) );
111     assert(R.array[0] == correct);
112     assert(R.array[1] == correct);
113     assert(R.array[2] == correct);
114     assert(R.array[3] == correct);
115 }
116 
117 /// Compute the bitwise NOT of packed single-precision (32-bit) floating-point elements in `a` and then AND with `b`.
118 __m128 _mm_andnot_ps (__m128 a, __m128 b) pure @safe
119 {
120     static if (DMD_with_DSIMD)
121         return cast(__m128) __simd(XMM.ANDNPS, a, b);
122     else
123         return cast(__m128)( (~cast(__m128i)a) & cast(__m128i)b );
124 }
125 unittest
126 {
127     float a = 4.32f;
128     float b = -78.99f;
129     int correct  = ~(*cast(int*)(&a)) &  (*cast(int*)(&b));
130     int correct2 =  (*cast(int*)(&a)) & ~(*cast(int*)(&b));
131     __m128 A = _mm_set_ps(a, b, a, b);
132     __m128 B = _mm_set_ps(b, a, b, a);
133     int4 R = cast(int4)( _mm_andnot_ps(A, B) );
134     assert(R.array[0] == correct2);
135     assert(R.array[1] == correct);
136     assert(R.array[2] == correct2);
137     assert(R.array[3] == correct);
138 }
139 
/// Average packed unsigned 16-bit integers in `a` and `b`.
141 __m64 _mm_avg_pu16 (__m64 a, __m64 b) pure @safe
142 {
143     return to_m64(_mm_avg_epu16(to_m128i(a), to_m128i(b)));
144 }
145 
/// Average packed unsigned 8-bit integers in `a` and `b`.
147 __m64 _mm_avg_pu8 (__m64 a, __m64 b) pure @safe
148 {
149     return to_m64(_mm_avg_epu8(to_m128i(a), to_m128i(b)));
150 }
151 
152 /// Compare packed single-precision (32-bit) floating-point elements in `a` and `b` for equality.
153 __m128 _mm_cmpeq_ps (__m128 a, __m128 b) pure @safe
154 {
155     static if (DMD_with_DSIMD)
156         return cast(__m128) __simd(XMM.CMPPS, a, b, 0);
157     else
158         return cast(__m128) cmpps!(FPComparison.oeq)(a, b);
159 }
160 unittest
161 {
162     __m128 A = _mm_setr_ps(1.0f, 2.0f, 3.0f, float.nan);
163     __m128 B = _mm_setr_ps(3.0f, 2.0f, float.nan, float.nan);
164     __m128i R = cast(__m128i) _mm_cmpeq_ps(A, B);
165     int[4] correct = [0, -1, 0, 0];
166     assert(R.array == correct);
167 }
168 
169 /// Compare the lower single-precision (32-bit) floating-point elements in `a` and `b` for equality, 
170 /// and copy the upper 3 packed elements from `a` to the upper elements of result.
171 __m128 _mm_cmpeq_ss (__m128 a, __m128 b) pure @safe
172 {
173     static if (DMD_with_DSIMD)
174         return cast(__m128) __simd(XMM.CMPSS, a, b, 0);
175     else
176         return cast(__m128) cmpss!(FPComparison.oeq)(a, b);
177 }
178 unittest
179 {
180     __m128 A = _mm_setr_ps(3.0f, 0, 0, 0);
181     __m128 B = _mm_setr_ps(3.0f, float.nan, float.nan, float.nan);
182     __m128 C = _mm_setr_ps(2.0f, float.nan, float.nan, float.nan);
183     __m128 D = _mm_setr_ps(float.nan, float.nan, float.nan, float.nan);
184     __m128 E = _mm_setr_ps(4.0f, float.nan, float.nan, float.nan);
185     __m128i R1 = cast(__m128i) _mm_cmpeq_ss(A, B);
186     __m128i R2 = cast(__m128i) _mm_cmpeq_ss(A, C);
187     __m128i R3 = cast(__m128i) _mm_cmpeq_ss(A, D);
188     __m128i R4 = cast(__m128i) _mm_cmpeq_ss(A, E);
189     int[4] correct1 = [-1, 0, 0, 0];
190     int[4] correct2 = [0, 0, 0, 0];
191     int[4] correct3 = [0, 0, 0, 0];
192     int[4] correct4 = [0, 0, 0, 0];
193     assert(R1.array == correct1 && R2.array == correct2 && R3.array == correct3 && R4.array == correct4);
194 }
195 
196 /// Compare packed single-precision (32-bit) floating-point elements in `a` and `b` for greater-than-or-equal.
197 __m128 _mm_cmpge_ps (__m128 a, __m128 b) pure @safe
198 {
199     static if (DMD_with_DSIMD)
200         return cast(__m128) __simd(XMM.CMPPS, b, a, 2);
201     else
202         return cast(__m128) cmpps!(FPComparison.oge)(a, b);
203 }
204 unittest
205 {
206     __m128 A = _mm_setr_ps(1.0f, 2.0f, 3.0f, float.nan);
207     __m128 B = _mm_setr_ps(3.0f, 2.0f, 1.0f, float.nan);
208     __m128i R = cast(__m128i) _mm_cmpge_ps(A, B);
209     int[4] correct = [0, -1,-1, 0];
210     assert(R.array == correct);
211 }
212 
213 /// Compare the lower single-precision (32-bit) floating-point elements in `a` and `b` for greater-than-or-equal, 
214 /// and copy the upper 3 packed elements from `a` to the upper elements of result.
215 __m128 _mm_cmpge_ss (__m128 a, __m128 b) pure @safe
216 {
217     static if (DMD_with_DSIMD)
218     {
219         __m128 c = cast(__m128) __simd(XMM.CMPSS, b, a, 2);
220         a[0] = c[0];
221         return a;
222     }
223     else
224         return cast(__m128) cmpss!(FPComparison.oge)(a, b);
225 }
226 unittest
227 {
228     __m128 A = _mm_setr_ps(3.0f, 0, 0, 0);
229     __m128 B = _mm_setr_ps(3.0f, float.nan, float.nan, float.nan);
230     __m128 C = _mm_setr_ps(2.0f, float.nan, float.nan, float.nan);
231     __m128 D = _mm_setr_ps(float.nan, float.nan, float.nan, float.nan);
232     __m128 E = _mm_setr_ps(4.0f, float.nan, float.nan, float.nan);
233     __m128i R1 = cast(__m128i) _mm_cmpge_ss(A, B);
234     __m128i R2 = cast(__m128i) _mm_cmpge_ss(A, C);
235     __m128i R3 = cast(__m128i) _mm_cmpge_ss(A, D);
236     __m128i R4 = cast(__m128i) _mm_cmpge_ss(A, E);
237     int[4] correct1 = [-1, 0, 0, 0];
238     int[4] correct2 = [-1, 0, 0, 0];
239     int[4] correct3 = [0, 0, 0, 0];
240     int[4] correct4 = [0, 0, 0, 0];
241     assert(R1.array == correct1 && R2.array == correct2 && R3.array == correct3 && R4.array == correct4);
242 }
243 
244 /// Compare packed single-precision (32-bit) floating-point elements in `a` and `b` for greater-than.
245 __m128 _mm_cmpgt_ps (__m128 a, __m128 b) pure @safe
246 {
247     static if (DMD_with_DSIMD)
248         return cast(__m128) __simd(XMM.CMPPS, b, a, 1);
249     else
250         return cast(__m128) cmpps!(FPComparison.ogt)(a, b);
251 }
252 unittest
253 {
254     __m128 A = _mm_setr_ps(1.0f, 2.0f, 3.0f, float.nan);
255     __m128 B = _mm_setr_ps(3.0f, 2.0f, 1.0f, float.nan);
256     __m128i R = cast(__m128i) _mm_cmpgt_ps(A, B);
257     int[4] correct = [0, 0,-1, 0];
258     assert(R.array == correct);
259 }
260 
261 /// Compare the lower single-precision (32-bit) floating-point elements in `a` and `b` for greater-than, 
262 /// and copy the upper 3 packed elements from `a` to the upper elements of result.
263 __m128 _mm_cmpgt_ss (__m128 a, __m128 b) pure @safe
264 {
265     static if (DMD_with_DSIMD)
266     {
267         __m128 c = cast(__m128) __simd(XMM.CMPSS, b, a, 1);
268         a[0] = c[0];
269         return a;
270     }
271     else
272         return cast(__m128) cmpss!(FPComparison.ogt)(a, b);
273 }
274 unittest
275 {
276     __m128 A = _mm_setr_ps(3.0f, 0, 0, 0);
277     __m128 B = _mm_setr_ps(3.0f, float.nan, float.nan, float.nan);
278     __m128 C = _mm_setr_ps(2.0f, float.nan, float.nan, float.nan);
279     __m128 D = _mm_setr_ps(float.nan, float.nan, float.nan, float.nan);
280     __m128 E = _mm_setr_ps(4.0f, float.nan, float.nan, float.nan);
281     __m128i R1 = cast(__m128i) _mm_cmpgt_ss(A, B);
282     __m128i R2 = cast(__m128i) _mm_cmpgt_ss(A, C);
283     __m128i R3 = cast(__m128i) _mm_cmpgt_ss(A, D);
284     __m128i R4 = cast(__m128i) _mm_cmpgt_ss(A, E);
285     int[4] correct1 = [0, 0, 0, 0];
286     int[4] correct2 = [-1, 0, 0, 0];
287     int[4] correct3 = [0, 0, 0, 0];
288     int[4] correct4 = [0, 0, 0, 0];
289     assert(R1.array == correct1 && R2.array == correct2 && R3.array == correct3 && R4.array == correct4);
290 }
291 
292 /// Compare packed single-precision (32-bit) floating-point elements in `a` and `b` for less-than-or-equal.
293 __m128 _mm_cmple_ps (__m128 a, __m128 b) pure @safe
294 {
295     static if (DMD_with_DSIMD)
296         return cast(__m128) __simd(XMM.CMPPS, a, b, 2);
297     else
298         return cast(__m128) cmpps!(FPComparison.ole)(a, b);
299 }
300 unittest
301 {
302     __m128 A = _mm_setr_ps(1.0f, 2.0f, 3.0f, float.nan);
303     __m128 B = _mm_setr_ps(3.0f, 2.0f, 1.0f, float.nan);
304     __m128i R = cast(__m128i) _mm_cmple_ps(A, B);
305     int[4] correct = [-1, -1, 0, 0];
306     assert(R.array == correct);
307 }
308 
309 /// Compare the lower single-precision (32-bit) floating-point elements in `a` and `b` for less-than-or-equal, 
310 /// and copy the upper 3 packed elements from `a` to the upper elements of result.
311 __m128 _mm_cmple_ss (__m128 a, __m128 b) pure @safe
312 {
313     static if (DMD_with_DSIMD)
314         return cast(__m128) __simd(XMM.CMPSS, a, b, 2);
315     else
316         return cast(__m128) cmpss!(FPComparison.ole)(a, b);
317 }
318 unittest
319 {
320     __m128 A = _mm_setr_ps(3.0f, 0, 0, 0);
321     __m128 B = _mm_setr_ps(3.0f, float.nan, float.nan, float.nan);
322     __m128 C = _mm_setr_ps(2.0f, float.nan, float.nan, float.nan);
323     __m128 D = _mm_setr_ps(float.nan, float.nan, float.nan, float.nan);
324     __m128 E = _mm_setr_ps(4.0f, float.nan, float.nan, float.nan);
325     __m128i R1 = cast(__m128i) _mm_cmple_ss(A, B);
326     __m128i R2 = cast(__m128i) _mm_cmple_ss(A, C);
327     __m128i R3 = cast(__m128i) _mm_cmple_ss(A, D);
328     __m128i R4 = cast(__m128i) _mm_cmple_ss(A, E);
329     int[4] correct1 = [-1, 0, 0, 0];
330     int[4] correct2 = [0, 0, 0, 0];
331     int[4] correct3 = [0, 0, 0, 0];
332     int[4] correct4 = [-1, 0, 0, 0];
333     assert(R1.array == correct1 && R2.array == correct2 && R3.array == correct3 && R4.array == correct4);
334 }
335 
336 /// Compare packed single-precision (32-bit) floating-point elements in `a` and `b` for less-than.
337 __m128 _mm_cmplt_ps (__m128 a, __m128 b) pure @safe
338 {
339     static if (DMD_with_DSIMD)
340         return cast(__m128) __simd(XMM.CMPPS, a, b, 1);
341     else
342         return cast(__m128) cmpps!(FPComparison.olt)(a, b);
343 }
344 unittest
345 {
346     __m128 A = _mm_setr_ps(1.0f, 2.0f, 3.0f, float.nan);
347     __m128 B = _mm_setr_ps(3.0f, 2.0f, 1.0f, float.nan);
348     __m128i R = cast(__m128i) _mm_cmplt_ps(A, B);
349     int[4] correct = [-1, 0, 0, 0];
350     assert(R.array == correct);
351 }
352 
353 /// Compare the lower single-precision (32-bit) floating-point elements in `a` and `b` for less-than, 
354 /// and copy the upper 3 packed elements from `a` to the upper elements of result.
355 __m128 _mm_cmplt_ss (__m128 a, __m128 b) pure @safe
356 {
357     static if (DMD_with_DSIMD)
358         return cast(__m128) __simd(XMM.CMPSS, a, b, 1);
359     else
360         return cast(__m128) cmpss!(FPComparison.olt)(a, b);
361 }
362 unittest
363 {
364     __m128 A = _mm_setr_ps(3.0f, 0, 0, 0);
365     __m128 B = _mm_setr_ps(3.0f, float.nan, float.nan, float.nan);
366     __m128 C = _mm_setr_ps(2.0f, float.nan, float.nan, float.nan);
367     __m128 D = _mm_setr_ps(float.nan, float.nan, float.nan, float.nan);
368     __m128 E = _mm_setr_ps(4.0f, float.nan, float.nan, float.nan);
369     __m128i R1 = cast(__m128i) _mm_cmplt_ss(A, B);
370     __m128i R2 = cast(__m128i) _mm_cmplt_ss(A, C);
371     __m128i R3 = cast(__m128i) _mm_cmplt_ss(A, D);
372     __m128i R4 = cast(__m128i) _mm_cmplt_ss(A, E);
373     int[4] correct1 = [0, 0, 0, 0];
374     int[4] correct2 = [0, 0, 0, 0];
375     int[4] correct3 = [0, 0, 0, 0];
376     int[4] correct4 = [-1, 0, 0, 0];
377     assert(R1.array == correct1 && R2.array == correct2 && R3.array == correct3 && R4.array == correct4);
378 }
379 
380 /// Compare packed single-precision (32-bit) floating-point elements in `a` and `b` for not-equal.
381 __m128 _mm_cmpneq_ps (__m128 a, __m128 b) pure @safe
382 {
383     static if (DMD_with_DSIMD)
384         return cast(__m128) __simd(XMM.CMPPS, a, b, 4);
385     else
386         return cast(__m128) cmpps!(FPComparison.une)(a, b);
387 }
388 unittest
389 {
390     __m128 A = _mm_setr_ps(1.0f, 2.0f, 3.0f, float.nan);
391     __m128 B = _mm_setr_ps(3.0f, 2.0f, 1.0f, float.nan);
392     __m128i R = cast(__m128i) _mm_cmpneq_ps(A, B);
393     int[4] correct = [-1, 0, -1, -1];
394     assert(R.array == correct);
395 }
396 
397 /// Compare the lower single-precision (32-bit) floating-point elements in `a` and `b` for not-equal, 
398 /// and copy the upper 3 packed elements from `a` to the upper elements of result.
399 __m128 _mm_cmpneq_ss (__m128 a, __m128 b) pure @safe
400 {
401     static if (DMD_with_DSIMD)
402         return cast(__m128) __simd(XMM.CMPSS, a, b, 4);
403     else
404         return cast(__m128) cmpss!(FPComparison.une)(a, b);
405 }
406 unittest
407 {
408     __m128 A = _mm_setr_ps(3.0f, 0, 0, 0);
409     __m128 B = _mm_setr_ps(3.0f, float.nan, float.nan, float.nan);
410     __m128 C = _mm_setr_ps(2.0f, float.nan, float.nan, float.nan);
411     __m128 D = _mm_setr_ps(float.nan, float.nan, float.nan, float.nan);
412     __m128 E = _mm_setr_ps(4.0f, float.nan, float.nan, float.nan);
413     __m128i R1 = cast(__m128i) _mm_cmpneq_ss(A, B);
414     __m128i R2 = cast(__m128i) _mm_cmpneq_ss(A, C);
415     __m128i R3 = cast(__m128i) _mm_cmpneq_ss(A, D);
416     __m128i R4 = cast(__m128i) _mm_cmpneq_ss(A, E);
417     int[4] correct1 = [0, 0, 0, 0];
418     int[4] correct2 = [-1, 0, 0, 0];
419     int[4] correct3 = [-1, 0, 0, 0];
420     int[4] correct4 = [-1, 0, 0, 0];
421     assert(R1.array == correct1 && R2.array == correct2 && R3.array == correct3 && R4.array == correct4);
422 }
423 
424 /// Compare packed single-precision (32-bit) floating-point elements in `a` and `b` for not-greater-than-or-equal.
425 __m128 _mm_cmpnge_ps (__m128 a, __m128 b) pure @safe
426 {
427     static if (DMD_with_DSIMD)
428         return cast(__m128) __simd(XMM.CMPPS, b, a, 6);
429     else
430         return cast(__m128) cmpps!(FPComparison.ult)(a, b);
431 }
432 unittest
433 {
434     __m128 A = _mm_setr_ps(1.0f, 2.0f, 3.0f, float.nan);
435     __m128 B = _mm_setr_ps(3.0f, 2.0f, 1.0f, float.nan);
436     __m128i R = cast(__m128i) _mm_cmpnge_ps(A, B);
437     int[4] correct = [-1, 0, 0, -1];
438     assert(R.array == correct);
439 }
440 
441 /// Compare the lower single-precision (32-bit) floating-point elements in `a` and `b` for not-greater-than-or-equal, 
442 /// and copy the upper 3 packed elements from `a` to the upper elements of result.
443 __m128 _mm_cmpnge_ss (__m128 a, __m128 b) pure @safe
444 {
445     static if (DMD_with_DSIMD)
446     {
447         __m128 c = cast(__m128) __simd(XMM.CMPSS, b, a, 6);
448         a[0] = c[0];
449         return a;
450     }
451     else
452         return cast(__m128) cmpss!(FPComparison.ult)(a, b);
453 }
454 unittest
455 {
456     __m128 A = _mm_setr_ps(3.0f, 0, 0, 0);
457     __m128 B = _mm_setr_ps(3.0f, float.nan, float.nan, float.nan);
458     __m128 C = _mm_setr_ps(2.0f, float.nan, float.nan, float.nan);
459     __m128 D = _mm_setr_ps(float.nan, float.nan, float.nan, float.nan);
460     __m128 E = _mm_setr_ps(4.0f, float.nan, float.nan, float.nan);
461     __m128i R1 = cast(__m128i) _mm_cmpnge_ss(A, B);
462     __m128i R2 = cast(__m128i) _mm_cmpnge_ss(A, C);
463     __m128i R3 = cast(__m128i) _mm_cmpnge_ss(A, D);
464     __m128i R4 = cast(__m128i) _mm_cmpnge_ss(A, E);
465     int[4] correct1 = [0, 0, 0, 0];
466     int[4] correct2 = [0, 0, 0, 0];
467     int[4] correct3 = [-1, 0, 0, 0];
468     int[4] correct4 = [-1, 0, 0, 0];
469     assert(R1.array == correct1 && R2.array == correct2 && R3.array == correct3 && R4.array == correct4);
470 }
471 
472 /// Compare packed single-precision (32-bit) floating-point elements in `a` and `b` for not-greater-than.
473 __m128 _mm_cmpngt_ps (__m128 a, __m128 b) pure @safe
474 {
475     static if (DMD_with_DSIMD)
476         return cast(__m128) __simd(XMM.CMPPS, b, a, 5);
477     else
478         return cast(__m128) cmpps!(FPComparison.ule)(a, b);
479 }
480 unittest
481 {
482     __m128 A = _mm_setr_ps(1.0f, 2.0f, 3.0f, float.nan);
483     __m128 B = _mm_setr_ps(3.0f, 2.0f, 1.0f, float.nan);
484     __m128i R = cast(__m128i) _mm_cmpngt_ps(A, B);
485     int[4] correct = [-1, -1, 0, -1];
486     assert(R.array == correct);
487 }
488 
489 /// Compare the lower single-precision (32-bit) floating-point elements in `a` and `b` for not-greater-than, 
490 /// and copy the upper 3 packed elements from `a` to the upper elements of result.
491 __m128 _mm_cmpngt_ss (__m128 a, __m128 b) pure @safe
492 {
493     static if (DMD_with_DSIMD)
494     {
495         __m128 c = cast(__m128) __simd(XMM.CMPSS, b, a, 5);
496         a[0] = c[0];
497         return a;
498     }
499     else
500         return cast(__m128) cmpss!(FPComparison.ule)(a, b);
501 }
502 unittest
503 {
504     __m128 A = _mm_setr_ps(3.0f, 0, 0, 0);
505     __m128 B = _mm_setr_ps(3.0f, float.nan, float.nan, float.nan);
506     __m128 C = _mm_setr_ps(2.0f, float.nan, float.nan, float.nan);
507     __m128 D = _mm_setr_ps(float.nan, float.nan, float.nan, float.nan);
508     __m128 E = _mm_setr_ps(4.0f, float.nan, float.nan, float.nan);
509     __m128i R1 = cast(__m128i) _mm_cmpngt_ss(A, B);
510     __m128i R2 = cast(__m128i) _mm_cmpngt_ss(A, C);
511     __m128i R3 = cast(__m128i) _mm_cmpngt_ss(A, D);
512     __m128i R4 = cast(__m128i) _mm_cmpngt_ss(A, E);
513     int[4] correct1 = [-1, 0, 0, 0];
514     int[4] correct2 = [0, 0, 0, 0];
515     int[4] correct3 = [-1, 0, 0, 0];
516     int[4] correct4 = [-1, 0, 0, 0];
517     assert(R1.array == correct1 && R2.array == correct2 && R3.array == correct3 && R4.array == correct4);
518 }
519 
520 /// Compare packed single-precision (32-bit) floating-point elements in `a` and `b` for not-less-than-or-equal.
521 __m128 _mm_cmpnle_ps (__m128 a, __m128 b) pure @safe
522 {
523     static if (DMD_with_DSIMD)
524         return cast(__m128) __simd(XMM.CMPPS, a, b, 6);
525     else
526         return cast(__m128) cmpps!(FPComparison.ugt)(a, b);
527 }
528 unittest
529 {
530     __m128 A = _mm_setr_ps(1.0f, 2.0f, 3.0f, float.nan);
531     __m128 B = _mm_setr_ps(3.0f, 2.0f, 1.0f, float.nan);
532     __m128i R = cast(__m128i) _mm_cmpnle_ps(A, B);
533     int[4] correct = [0, 0, -1, -1];
534     assert(R.array == correct);
535 }
536 
537 
538 /// Compare the lower single-precision (32-bit) floating-point elements in `a` and `b` for not-less-than-or-equal, 
539 /// and copy the upper 3 packed elements from `a` to the upper elements of result.
540 __m128 _mm_cmpnle_ss (__m128 a, __m128 b) pure @safe
541 {
542     static if (DMD_with_DSIMD)
543         return cast(__m128) __simd(XMM.CMPSS, a, b, 6);
544     else
545         return cast(__m128) cmpss!(FPComparison.ugt)(a, b);
546 }
547 unittest
548 {
549     __m128 A = _mm_setr_ps(3.0f, 0, 0, 0);
550     __m128 B = _mm_setr_ps(3.0f, float.nan, float.nan, float.nan);
551     __m128 C = _mm_setr_ps(2.0f, float.nan, float.nan, float.nan);
552     __m128 D = _mm_setr_ps(float.nan, float.nan, float.nan, float.nan);
553     __m128 E = _mm_setr_ps(4.0f, float.nan, float.nan, float.nan);
554     __m128i R1 = cast(__m128i) _mm_cmpnle_ss(A, B);
555     __m128i R2 = cast(__m128i) _mm_cmpnle_ss(A, C);
556     __m128i R3 = cast(__m128i) _mm_cmpnle_ss(A, D);
557     __m128i R4 = cast(__m128i) _mm_cmpnle_ss(A, E);
558     int[4] correct1 = [0, 0, 0, 0];
559     int[4] correct2 = [-1, 0, 0, 0];
560     int[4] correct3 = [-1, 0, 0, 0];
561     int[4] correct4 = [0, 0, 0, 0];
562     assert(R1.array == correct1 && R2.array == correct2 && R3.array == correct3 && R4.array == correct4);
563 }
564 
565 /// Compare packed single-precision (32-bit) floating-point elements in `a` and `b` for not-less-than.
566 __m128 _mm_cmpnlt_ps (__m128 a, __m128 b) pure @safe
567 {
568     static if (DMD_with_DSIMD)
569         return cast(__m128) __simd(XMM.CMPPS, a, b, 5);
570     else
571         return cast(__m128) cmpps!(FPComparison.uge)(a, b);
572 }
573 unittest
574 {
575     __m128 A = _mm_setr_ps(1.0f, 2.0f, 3.0f, float.nan);
576     __m128 B = _mm_setr_ps(3.0f, 2.0f, 1.0f, float.nan);
577     __m128i R = cast(__m128i) _mm_cmpnlt_ps(A, B);
578     int[4] correct = [0, -1, -1, -1];
579     assert(R.array == correct);
580 }
581 
582 /// Compare the lower single-precision (32-bit) floating-point elements in `a` and `b` for not-less-than, 
583 /// and copy the upper 3 packed elements from `a` to the upper elements of result.
584 __m128 _mm_cmpnlt_ss (__m128 a, __m128 b) pure @safe
585 {
586     static if (DMD_with_DSIMD)
587         return cast(__m128) __simd(XMM.CMPSS, a, b, 5);
588     else
589         return cast(__m128) cmpss!(FPComparison.uge)(a, b);
590 }
591 unittest
592 {
593     __m128 A = _mm_setr_ps(3.0f, 0, 0, 0);
594     __m128 B = _mm_setr_ps(3.0f, float.nan, float.nan, float.nan);
595     __m128 C = _mm_setr_ps(2.0f, float.nan, float.nan, float.nan);
596     __m128 D = _mm_setr_ps(float.nan, float.nan, float.nan, float.nan);
597     __m128 E = _mm_setr_ps(4.0f, float.nan, float.nan, float.nan);
598     __m128i R1 = cast(__m128i) _mm_cmpnlt_ss(A, B);
599     __m128i R2 = cast(__m128i) _mm_cmpnlt_ss(A, C);
600     __m128i R3 = cast(__m128i) _mm_cmpnlt_ss(A, D);
601     __m128i R4 = cast(__m128i) _mm_cmpnlt_ss(A, E);
602     int[4] correct1 = [-1, 0, 0, 0];
603     int[4] correct2 = [-1, 0, 0, 0];
604     int[4] correct3 = [-1, 0, 0, 0];
605     int[4] correct4 = [0, 0, 0, 0];
606     assert(R1.array == correct1 && R2.array == correct2 && R3.array == correct3 && R4.array == correct4);
607 }
608 
609 /// Compare packed single-precision (32-bit) floating-point elements in `a` and `b` to see if neither is NaN.
610 __m128 _mm_cmpord_ps (__m128 a, __m128 b) pure @safe
611 {
612     static if (DMD_with_DSIMD)
613         return cast(__m128) __simd(XMM.CMPPS, a, b, 7);
614     else
615         return cast(__m128) cmpps!(FPComparison.ord)(a, b);
616 }
617 unittest
618 {
619     __m128 A = _mm_setr_ps(1.0f, 2.0f, 3.0f, float.nan);
620     __m128 B = _mm_setr_ps(3.0f, 2.0f, 1.0f, float.nan);
621     __m128i R = cast(__m128i) _mm_cmpord_ps(A, B);
622     int[4] correct = [-1, -1, -1, 0];
623     assert(R.array == correct);
624 }
625 
626 /// Compare the lower single-precision (32-bit) floating-point elements in `a` and `b` to see if neither is NaN, 
627 /// and copy the upper 3 packed elements from `a` to the upper elements of result.
628 __m128 _mm_cmpord_ss (__m128 a, __m128 b) pure @safe
629 {
630     static if (DMD_with_DSIMD)
631         return cast(__m128) __simd(XMM.CMPSS, a, b, 7);
632     else
633         return cast(__m128) cmpss!(FPComparison.ord)(a, b);
634 }
635 unittest
636 {
637     __m128 A = _mm_setr_ps(3.0f, 0, 0, 0);
638     __m128 B = _mm_setr_ps(3.0f, float.nan, float.nan, float.nan);
639     __m128 C = _mm_setr_ps(2.0f, float.nan, float.nan, float.nan);
640     __m128 D = _mm_setr_ps(float.nan, float.nan, float.nan, float.nan);
641     __m128 E = _mm_setr_ps(4.0f, float.nan, float.nan, float.nan);
642     __m128i R1 = cast(__m128i) _mm_cmpord_ss(A, B);
643     __m128i R2 = cast(__m128i) _mm_cmpord_ss(A, C);
644     __m128i R3 = cast(__m128i) _mm_cmpord_ss(A, D);
645     __m128i R4 = cast(__m128i) _mm_cmpord_ss(A, E);
646     int[4] correct1 = [-1, 0, 0, 0];
647     int[4] correct2 = [-1, 0, 0, 0];
648     int[4] correct3 = [0, 0, 0, 0];
649     int[4] correct4 = [-1, 0, 0, 0];
650     assert(R1.array == correct1 && R2.array == correct2 && R3.array == correct3 && R4.array == correct4);
651 }
652 
653 /// Compare packed single-precision (32-bit) floating-point elements in `a` and `b` to see if either is NaN.
654 __m128 _mm_cmpunord_ps (__m128 a, __m128 b) pure @safe
655 {
656     static if (DMD_with_DSIMD)
657         return cast(__m128) __simd(XMM.CMPPS, a, b, 3);
658     else
659         return cast(__m128) cmpps!(FPComparison.uno)(a, b);
660 }
661 unittest
662 {
663     __m128 A = _mm_setr_ps(1.0f, 2.0f, 3.0f, float.nan);
664     __m128 B = _mm_setr_ps(3.0f, 2.0f, 1.0f, float.nan);
665     __m128i R = cast(__m128i) _mm_cmpunord_ps(A, B);
666     int[4] correct = [0, 0, 0, -1];
667     assert(R.array == correct);
668 }
669 
670 /// Compare the lower single-precision (32-bit) floating-point elements in `a` and `b` to see if either is NaN.
671 /// and copy the upper 3 packed elements from `a` to the upper elements of result.
672 __m128 _mm_cmpunord_ss (__m128 a, __m128 b) pure @safe
673 {
674     static if (DMD_with_DSIMD)
675         return cast(__m128) __simd(XMM.CMPSS, a, b, 3);
676     else return cast(__m128) cmpss!(FPComparison.uno)(a, b);
677 }
678 unittest
679 {
680     __m128 A = _mm_setr_ps(3.0f, 0, 0, 0);
681     __m128 B = _mm_setr_ps(3.0f, float.nan, float.nan, float.nan);
682     __m128 C = _mm_setr_ps(2.0f, float.nan, float.nan, float.nan);
683     __m128 D = _mm_setr_ps(float.nan, float.nan, float.nan, float.nan);
684     __m128 E = _mm_setr_ps(4.0f, float.nan, float.nan, float.nan);
685     __m128i R1 = cast(__m128i) _mm_cmpunord_ss(A, B);
686     __m128i R2 = cast(__m128i) _mm_cmpunord_ss(A, C);
687     __m128i R3 = cast(__m128i) _mm_cmpunord_ss(A, D);
688     __m128i R4 = cast(__m128i) _mm_cmpunord_ss(A, E);
689     int[4] correct1 = [0, 0, 0, 0];
690     int[4] correct2 = [0, 0, 0, 0];
691     int[4] correct3 = [-1, 0, 0, 0];
692     int[4] correct4 = [0, 0, 0, 0];
693     assert(R1.array == correct1 && R2.array == correct2 && R3.array == correct3 && R4.array == correct4);
694 }
695 
696 
697 /// Compare the lower single-precision (32-bit) floating-point element in `a` and `b` for equality, 
698 /// and return the boolean result (0 or 1).
699 int _mm_comieq_ss (__m128 a, __m128 b) pure @safe
700 {
701     return a.array[0] == b.array[0];
702 }
703 unittest
704 {
705     assert(1 == _mm_comieq_ss(_mm_set_ss(78.0f), _mm_set_ss(78.0f)));
706     assert(0 == _mm_comieq_ss(_mm_set_ss(78.0f), _mm_set_ss(-78.0f)));
707     assert(0 == _mm_comieq_ss(_mm_set_ss(78.0f), _mm_set_ss(float.nan)));
708     assert(0 == _mm_comieq_ss(_mm_set_ss(float.nan), _mm_set_ss(-4.22f)));
    assert(1 == _mm_comieq_ss(_mm_set_ss(0.0f), _mm_set_ss(-0.0f)));
710 }
711 
712 /// Compare the lower single-precision (32-bit) floating-point element in `a` and `b` for greater-than-or-equal, 
713 /// and return the boolean result (0 or 1).
714 int _mm_comige_ss (__m128 a, __m128 b) pure @safe
715 {
716     return a.array[0] >= b.array[0];
717 }
718 unittest
719 {
720     assert(1 == _mm_comige_ss(_mm_set_ss(78.0f), _mm_set_ss(78.0f)));
721     assert(1 == _mm_comige_ss(_mm_set_ss(78.0f), _mm_set_ss(-78.0f)));
722     assert(0 == _mm_comige_ss(_mm_set_ss(-78.0f), _mm_set_ss(78.0f)));
723     assert(0 == _mm_comige_ss(_mm_set_ss(78.0f), _mm_set_ss(float.nan)));
724     assert(0 == _mm_comige_ss(_mm_set_ss(float.nan), _mm_set_ss(-4.22f)));
725     assert(1 == _mm_comige_ss(_mm_set_ss(-0.0f), _mm_set_ss(0.0f)));
726 }
727 
728 /// Compare the lower single-precision (32-bit) floating-point element in `a` and `b` for greater-than, 
729 /// and return the boolean result (0 or 1).
730 int _mm_comigt_ss (__m128 a, __m128 b) pure @safe // comiss + seta
731 {
732     return a.array[0] > b.array[0];
733 }
734 unittest
735 {
736     assert(0 == _mm_comigt_ss(_mm_set_ss(78.0f), _mm_set_ss(78.0f)));
737     assert(1 == _mm_comigt_ss(_mm_set_ss(78.0f), _mm_set_ss(-78.0f)));
738     assert(0 == _mm_comigt_ss(_mm_set_ss(78.0f), _mm_set_ss(float.nan)));
739     assert(0 == _mm_comigt_ss(_mm_set_ss(float.nan), _mm_set_ss(-4.22f)));
740     assert(0 == _mm_comigt_ss(_mm_set_ss(0.0f), _mm_set_ss(-0.0f)));
741 }
742 
743 /// Compare the lower single-precision (32-bit) floating-point element in `a` and `b` for less-than-or-equal, 
744 /// and return the boolean result (0 or 1).
745 int _mm_comile_ss (__m128 a, __m128 b) pure @safe // comiss + setbe
746 {
747     return a.array[0] <= b.array[0];
748 }
749 unittest
750 {
751     assert(1 == _mm_comile_ss(_mm_set_ss(78.0f), _mm_set_ss(78.0f)));
752     assert(0 == _mm_comile_ss(_mm_set_ss(78.0f), _mm_set_ss(-78.0f)));
753     assert(1 == _mm_comile_ss(_mm_set_ss(-78.0f), _mm_set_ss(78.0f)));
754     assert(0 == _mm_comile_ss(_mm_set_ss(78.0f), _mm_set_ss(float.nan)));
755     assert(0 == _mm_comile_ss(_mm_set_ss(float.nan), _mm_set_ss(-4.22f)));
756     assert(1 == _mm_comile_ss(_mm_set_ss(0.0f), _mm_set_ss(-0.0f)));
757 }
758 
759 /// Compare the lower single-precision (32-bit) floating-point element in `a` and `b` for less-than, 
760 /// and return the boolean result (0 or 1).
761 int _mm_comilt_ss (__m128 a, __m128 b) pure @safe // comiss + setb
762 {
763     return a.array[0] < b.array[0];
764 }
765 unittest
766 {
767     assert(0 == _mm_comilt_ss(_mm_set_ss(78.0f), _mm_set_ss(78.0f)));
768     assert(0 == _mm_comilt_ss(_mm_set_ss(78.0f), _mm_set_ss(-78.0f)));
769     assert(1 == _mm_comilt_ss(_mm_set_ss(-78.0f), _mm_set_ss(78.0f)));
770     assert(0 == _mm_comilt_ss(_mm_set_ss(78.0f), _mm_set_ss(float.nan)));
771     assert(0 == _mm_comilt_ss(_mm_set_ss(float.nan), _mm_set_ss(-4.22f)));
772     assert(0 == _mm_comilt_ss(_mm_set_ss(-0.0f), _mm_set_ss(0.0f)));
773 }
774 
775 /// Compare the lower single-precision (32-bit) floating-point element in `a` and `b` for not-equal, 
776 /// and return the boolean result (0 or 1).
777 int _mm_comineq_ss (__m128 a, __m128 b) pure @safe // comiss + setne
778 {
779     return a.array[0] != b.array[0];
780 }
781 unittest
782 {
783     assert(0 == _mm_comineq_ss(_mm_set_ss(78.0f), _mm_set_ss(78.0f)));
784     assert(1 == _mm_comineq_ss(_mm_set_ss(78.0f), _mm_set_ss(-78.0f)));
785     assert(1 == _mm_comineq_ss(_mm_set_ss(78.0f), _mm_set_ss(float.nan)));
786     assert(1 == _mm_comineq_ss(_mm_set_ss(float.nan), _mm_set_ss(-4.22f)));
787     assert(0 == _mm_comineq_ss(_mm_set_ss(0.0f), _mm_set_ss(-0.0f)));
788 }
789 
790 /// Convert packed signed 32-bit integers in `b` to packed single-precision (32-bit) 
791 /// floating-point elements, store the results in the lower 2 elements, 
792 /// and copy the upper 2 packed elements from `a` to the upper elements of result.
793 alias _mm_cvt_pi2ps = _mm_cvtpi32_ps;
794 
795 /// Convert 2 lower packed single-precision (32-bit) floating-point elements in `a` 
796 /// to packed 32-bit integers.
797 __m64 _mm_cvt_ps2pi (__m128 a) @safe
798 {
799     return to_m64(_mm_cvtps_epi32(a));
800 }
801 
/// Convert the signed 32-bit integer `x` to a single-precision (32-bit) floating-point element, 
/// store the result in the lower element, and copy the upper 3 packed elements from `v` to the 
/// upper elements of the result.
805 __m128 _mm_cvt_si2ss (__m128 v, int x) pure @trusted
806 {
807     v.ptr[0] = cast(float)x;
808     return v;
809 }
810 unittest
811 {
812     __m128 a = _mm_cvt_si2ss(_mm_set1_ps(0.0f), 42);
813     assert(a.array == [42f, 0, 0, 0]);
814 }
815 
816 /// Convert packed 16-bit integers in `a` to packed single-precision (32-bit) floating-point elements.
817 __m128 _mm_cvtpi16_ps (__m64 a) pure @safe
818 {
819     __m128i ma = to_m128i(a);
820     ma = _mm_unpacklo_epi16(ma, _mm_setzero_si128()); // Zero-extend to 32-bit
821     ma = _mm_srai_epi32(_mm_slli_epi32(ma, 16), 16); // Replicate sign bit
822     return _mm_cvtepi32_ps(ma);
823 }
824 unittest
825 {
826     __m64 A = _mm_setr_pi16(-1, 2, -3, 4);
827     __m128 R = _mm_cvtpi16_ps(A);
828     float[4] correct = [-1.0f, 2.0f, -3.0f, 4.0f];
829     assert(R.array == correct);
830 }
831 
832 /// Convert packed signed 32-bit integers in `b` to packed single-precision (32-bit) 
833 /// floating-point elements, store the results in the lower 2 elements, 
834 /// and copy the upper 2 packed elements from `a` to the upper elements of result.
835 __m128 _mm_cvtpi32_ps (__m128 a, __m64 b) pure @trusted
836 {
837     __m128 fb = _mm_cvtepi32_ps(to_m128i(b));
838     a.ptr[0] = fb.array[0];
839     a.ptr[1] = fb.array[1];
840     return a;
841 }
842 unittest
843 {
844     __m128 R = _mm_cvtpi32_ps(_mm_set1_ps(4.0f), _mm_setr_pi32(1, 2));
845     float[4] correct = [1.0f, 2.0f, 4.0f, 4.0f];
846     assert(R.array == correct);
847 }
848 
849 /// Convert packed signed 32-bit integers in `a` to packed single-precision (32-bit) floating-point elements, 
/// store the results in the lower 2 elements, then convert the packed signed 32-bit integers in `b` to 
/// single-precision (32-bit) floating-point elements, and store the results in the upper 2 elements.
852 __m128 _mm_cvtpi32x2_ps (__m64 a, __m64 b) pure @trusted
853 {
854     long2 l;
855     l.ptr[0] = a.array[0];
856     l.ptr[1] = b.array[0];
857     return _mm_cvtepi32_ps(cast(__m128i)l);
858 }
859 unittest
860 {
861     __m64 A = _mm_setr_pi32(-45, 128);
862     __m64 B = _mm_setr_pi32(0, 1000);
863     __m128 R = _mm_cvtpi32x2_ps(A, B);
864     float[4] correct = [-45.0f, 128.0f, 0.0f, 1000.0f];
865     assert(R.array == correct);
866 }
867 
868 /// Convert the lower packed 8-bit integers in `a` to packed single-precision (32-bit) floating-point elements.
869 __m128 _mm_cvtpi8_ps (__m64 a) pure @safe
870 {
871     __m128i b = to_m128i(a); 
872 
873     // Zero extend to 32-bit
874     b = _mm_unpacklo_epi8(b, _mm_setzero_si128());
875     b = _mm_unpacklo_epi16(b, _mm_setzero_si128());
876 
    // Replicate sign bit
    b = _mm_srai_epi32(_mm_slli_epi32(b, 24), 24);
879     return _mm_cvtepi32_ps(b);
880 }
881 unittest
882 {
883     __m64 A = _mm_setr_pi8(-1, 2, -3, 4, 0, 0, 0, 0);
884     __m128 R = _mm_cvtpi8_ps(A);
885     float[4] correct = [-1.0f, 2.0f, -3.0f, 4.0f];
886     assert(R.array == correct);
887 }
888 
889 /// Convert packed single-precision (32-bit) floating-point elements in `a` to packed 16-bit integers.
890 /// Note: this intrinsic will generate 0x7FFF, rather than 0x8000, for input values between 0x7FFF and 0x7FFFFFFF.
891 __m64 _mm_cvtps_pi16 (__m128 a) @safe
892 {
    // The C++ version of this intrinsic converts to 32-bit integers, then uses packssdw,
    // which means the 16-bit results are saturated.
895     __m128i b = _mm_cvtps_epi32(a);
896     b = _mm_packs_epi32(b, b);
897     return to_m64(b);
898 }
899 unittest
900 {
901     __m128 A = _mm_setr_ps(-1.0f, 2.0f, -33000.0f, 70000.0f);
902     short4 R = cast(short4) _mm_cvtps_pi16(A);
903     short[4] correct = [-1, 2, -32768, 32767];
904     assert(R.array == correct);
905 }
906 
907 /// Convert packed single-precision (32-bit) floating-point elements in `a` to packed 32-bit integers.
908 __m64 _mm_cvtps_pi32 (__m128 a) @safe
909 {
910     return to_m64(_mm_cvtps_epi32(a));
911 }
912 unittest
913 {
    __m128 A = _mm_setr_ps(-33000.0f, 70000.0f, -1.0f, 2.0f);
915     int2 R = cast(int2) _mm_cvtps_pi32(A);
916     int[2] correct = [-33000, 70000];
917     assert(R.array == correct);
918 }
919 
920 /// Convert packed single-precision (32-bit) floating-point elements in `a` to packed 8-bit integers, 
/// and store the results in the lower 4 elements. 
922 /// Note: this intrinsic will generate 0x7F, rather than 0x80, for input values between 0x7F and 0x7FFFFFFF.
923 __m64 _mm_cvtps_pi8 (__m128 a) @safe
924 {
    // The C++ version of this intrinsic converts to 32-bit integers, then uses packssdw + packsswb,
    // which means the 8-bit results are saturated.
927     __m128i b = _mm_cvtps_epi32(a);
928     b = _mm_packs_epi32(b, _mm_setzero_si128());
929     b = _mm_packs_epi16(b, _mm_setzero_si128());
930     return to_m64(b);
931 }
932 unittest
933 {
934     __m128 A = _mm_setr_ps(-1.0f, 2.0f, -129.0f, 128.0f);
935     byte8 R = cast(byte8) _mm_cvtps_pi8(A);
936     byte[8] correct = [-1, 2, -128, 127, 0, 0, 0, 0];
937     assert(R.array == correct);
938 }
939 
940 /// Convert packed unsigned 16-bit integers in `a` to packed single-precision (32-bit) floating-point elements.
941 __m128 _mm_cvtpu16_ps (__m64 a) pure @safe
942 {
943     __m128i ma = to_m128i(a);
944     ma = _mm_unpacklo_epi16(ma, _mm_setzero_si128()); // Zero-extend to 32-bit
945     return _mm_cvtepi32_ps(ma);
946 }
947 unittest
948 {
949     __m64 A = _mm_setr_pi16(-1, 2, -3, 4);
950     __m128 R = _mm_cvtpu16_ps(A);
951     float[4] correct = [65535.0f, 2.0f, 65533.0f, 4.0f];
952     assert(R.array == correct);
953 }
954 
/// Convert the lower packed unsigned 8-bit integers in `a` to packed single-precision (32-bit) floating-point elements.
956 __m128 _mm_cvtpu8_ps (__m64 a) pure @safe
957 {
958     __m128i b = to_m128i(a); 
959 
960     // Zero extend to 32-bit
961     b = _mm_unpacklo_epi8(b, _mm_setzero_si128());
962     b = _mm_unpacklo_epi16(b, _mm_setzero_si128());
963     return _mm_cvtepi32_ps(b);
964 }
965 unittest
966 {
967     __m64 A = _mm_setr_pi8(-1, 2, -3, 4, 0, 0, 0, 0);
968     __m128 R = _mm_cvtpu8_ps(A);
969     float[4] correct = [255.0f, 2.0f, 253.0f, 4.0f];
970     assert(R.array == correct);
971 }
972 
/// Convert the signed 32-bit integer `x` to a single-precision (32-bit) floating-point element, 
/// store the result in the lower element, and copy the upper 3 packed elements from `v` to the 
/// upper elements of result.
976 __m128 _mm_cvtsi32_ss(__m128 v, int x) pure @trusted
977 {
978     v.ptr[0] = cast(float)x;
979     return v;
980 }
981 unittest
982 {
983     __m128 a = _mm_cvtsi32_ss(_mm_set1_ps(0.0f), 42);
984     assert(a.array == [42.0f, 0, 0, 0]);
985 }
986 
987 
/// Convert the signed 64-bit integer `x` to a single-precision (32-bit) floating-point element, 
/// store the result in the lower element, and copy the upper 3 packed elements from `v` to the 
/// upper elements of result.
991 __m128 _mm_cvtsi64_ss(__m128 v, long x) pure @trusted
992 {
993     v.ptr[0] = cast(float)x;
994     return v;
995 }
996 unittest
997 {
998     __m128 a = _mm_cvtsi64_ss(_mm_set1_ps(0.0f), 42);
999     assert(a.array == [42.0f, 0, 0, 0]);
1000 }
1001 
1002 /// Take the lower single-precision (32-bit) floating-point element of `a`.
1003 float _mm_cvtss_f32(__m128 a) pure @safe
1004 {
1005     return a.array[0];
1006 }
1007 
1008 /// Convert the lower single-precision (32-bit) floating-point element in `a` to a 32-bit integer.
1009 int _mm_cvtss_si32 (__m128 a) @safe // PERF GDC
1010 {
1011     static if (GDC_with_SSE)
1012     {
1013         return __builtin_ia32_cvtss2si(a);
1014     }
1015     else static if (LDC_with_SSE1)
1016     {
1017         return __builtin_ia32_cvtss2si(a);
1018     }
1019     else static if (DMD_with_DSIMD)
1020     {
        __m128i r = cast(__m128i) __simd(XMM.CVTPS2DQ, a); // Note: converts all 4 lanes.
1023         return r.array[0];
1024     }
1025     else
1026     {
1027         return convertFloatToInt32UsingMXCSR(a.array[0]);
1028     }
1029 }
1030 unittest
1031 {
1032     assert(1 == _mm_cvtss_si32(_mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f)));
1033 }
1034 
1035 /// Convert the lower single-precision (32-bit) floating-point element in `a` to a 64-bit integer.
1036 long _mm_cvtss_si64 (__m128 a) @safe
1037 {
1038     version(LDC)
1039     {
1040         version(X86_64)
1041         {
1042             return __builtin_ia32_cvtss2si64(a);
1043         }
1044         else
1045         {
            // Note: on 32-bit x86, there is no way to convert a float/double to a 64-bit integer
            // using SSE instructions only, so the builtin doesn't exist for this arch.
1048             return convertFloatToInt64UsingMXCSR(a.array[0]);
1049         }
1050     }
1051     else
1052     {
1053         return convertFloatToInt64UsingMXCSR(a.array[0]);
1054     }
1055 }
1056 unittest
1057 {
1058     assert(1 == _mm_cvtss_si64(_mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f)));
1059 
1060     uint savedRounding = _MM_GET_ROUNDING_MODE();
1061 
1062     _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST);
1063     assert(-86186 == _mm_cvtss_si64(_mm_set1_ps(-86186.49f)));
1064 
1065     _MM_SET_ROUNDING_MODE(_MM_ROUND_DOWN);
1066     assert(-86187 == _mm_cvtss_si64(_mm_set1_ps(-86186.1f)));
1067 
1068     _MM_SET_ROUNDING_MODE(_MM_ROUND_UP);
1069     assert(86187 == _mm_cvtss_si64(_mm_set1_ps(86186.1f)));
1070 
1071     _MM_SET_ROUNDING_MODE(_MM_ROUND_TOWARD_ZERO);
1072     assert(-86186 == _mm_cvtss_si64(_mm_set1_ps(-86186.9f)));
1073 
1074     _MM_SET_ROUNDING_MODE(savedRounding);
1075 }
1076 
1077 
1078 /// Convert the lower single-precision (32-bit) floating-point element in `a` to a 32-bit 
1079 /// integer with truncation.
1080 int _mm_cvtt_ss2si (__m128 a) pure @safe
1081 {
1082     // x86: cvttss2si always generated, even in -O0
1083     return cast(int)(a.array[0]);
1084 }
1085 alias _mm_cvttss_si32 = _mm_cvtt_ss2si; ///ditto
1086 unittest
1087 {
1088     assert(1 == _mm_cvtt_ss2si(_mm_setr_ps(1.9f, 2.0f, 3.0f, 4.0f)));
1089 }
1090 
1091 
1092 /// Convert packed single-precision (32-bit) floating-point elements in `a` to packed 32-bit 
1093 /// integers with truncation.
1094 __m64 _mm_cvtt_ps2pi (__m128 a) pure @safe
1095 {
1096     return to_m64(_mm_cvttps_epi32(a));
1097 }
1098 
1099 /// Convert the lower single-precision (32-bit) floating-point element in `a` to a 64-bit 
1100 /// integer with truncation.
1101 long _mm_cvttss_si64 (__m128 a) pure @safe
1102 {
1103     return cast(long)(a.array[0]);
1104 }
1105 unittest
1106 {
1107     assert(1 == _mm_cvttss_si64(_mm_setr_ps(1.9f, 2.0f, 3.0f, 4.0f)));
1108 }
1109 
1110 /// Divide packed single-precision (32-bit) floating-point elements in `a` by packed elements in `b`.
1111 __m128 _mm_div_ps(__m128 a, __m128 b) pure @safe
1112 {
1113     return a / b;
1114 }
1115 unittest
1116 {
1117     __m128 a = [1.5f, -2.0f, 3.0f, 1.0f];
1118     a = _mm_div_ps(a, a);
1119     float[4] correct = [1.0f, 1.0f, 1.0f, 1.0f];
1120     assert(a.array == correct);
1121 }
1122 
1123 /// Divide the lower single-precision (32-bit) floating-point element in `a` by the lower 
1124 /// single-precision (32-bit) floating-point element in `b`, store the result in the lower 
1125 /// element of result, and copy the upper 3 packed elements from `a` to the upper elements of result.
1126 __m128 _mm_div_ss(__m128 a, __m128 b) pure @safe
1127 {
1128     static if (DMD_with_DSIMD)
1129         return cast(__m128) __simd(XMM.DIVSS, a, b);
1130     else static if (GDC_with_SSE)
1131         return __builtin_ia32_divss(a, b);
1132     else
1133     {
1134         a[0] /= b[0];
1135         return a;
1136     }
1137 }
1138 unittest
1139 {
1140     __m128 a = [1.5f, -2.0f, 3.0f, 1.0f];
1141     a = _mm_div_ss(a, a);
1142     float[4] correct = [1.0f, -2.0, 3.0f, 1.0f];
1143     assert(a.array == correct);
1144 }
1145 
1146 /// Extract a 16-bit unsigned integer from `a`, selected with `imm8`. Zero-extended.
1147 int _mm_extract_pi16 (__m64 a, int imm8)
1148 {
1149     short4 sa = cast(short4)a;
1150     return cast(ushort)(sa.array[imm8]);
1151 }
1152 unittest
1153 {
1154     __m64 A = _mm_setr_pi16(-1, 6, 0, 4);
1155     assert(_mm_extract_pi16(A, 0) == 65535);
1156     assert(_mm_extract_pi16(A, 1) == 6);
1157     assert(_mm_extract_pi16(A, 2) == 0);
1158     assert(_mm_extract_pi16(A, 3) == 4);
1159 }
1160 
1161 /// Free aligned memory that was allocated with `_mm_malloc`.
1162 void _mm_free(void * mem_addr) @trusted
1163 {
1164     // support for free(NULL)
1165     if (mem_addr is null)
1166         return;
1167 
    // Technically we don't need to store the size and alignment in the chunk, but we do in case we
    // have to implement _mm_realloc.
1170 
1171     size_t pointerSize = (void*).sizeof;
1172     void** rawLocation = cast(void**)(cast(char*)mem_addr - size_t.sizeof);
1173     size_t* alignmentLocation = cast(size_t*)(cast(char*)mem_addr - 3 * pointerSize);
1174     size_t alignment = *alignmentLocation;
1175     assert(alignment != 0);
1176     assert(isPointerAligned(mem_addr, alignment));
1177     free(*rawLocation);
1178 }
1179 
1180 /// Get the exception mask bits from the MXCSR control and status register. 
1181 /// The exception mask may contain any of the following flags: `_MM_MASK_INVALID`, 
1182 /// `_MM_MASK_DIV_ZERO`, `_MM_MASK_DENORM`, `_MM_MASK_OVERFLOW`, `_MM_MASK_UNDERFLOW`, `_MM_MASK_INEXACT`.
/// Note: won't correspond to reality on non-x86, where MXCSR is emulated.
1184 uint _MM_GET_EXCEPTION_MASK() @safe
1185 {
1186     return _mm_getcsr() & _MM_MASK_MASK;
1187 }
1188 
1189 /// Get the exception state bits from the MXCSR control and status register. 
1190 /// The exception state may contain any of the following flags: `_MM_EXCEPT_INVALID`, 
1191 /// `_MM_EXCEPT_DIV_ZERO`, `_MM_EXCEPT_DENORM`, `_MM_EXCEPT_OVERFLOW`, `_MM_EXCEPT_UNDERFLOW`, `_MM_EXCEPT_INEXACT`.
/// Note: won't correspond to reality on non-x86, where MXCSR is emulated. No exception is ever reported.
1193 uint _MM_GET_EXCEPTION_STATE() @safe
1194 {
1195     return _mm_getcsr() & _MM_EXCEPT_MASK;
1196 }
1197 
/// Get the flush-to-zero bits from the MXCSR control and status register. 
/// The flush-to-zero mode is either `_MM_FLUSH_ZERO_ON` or `_MM_FLUSH_ZERO_OFF`.
1200 uint _MM_GET_FLUSH_ZERO_MODE() @safe
1201 {
1202     return _mm_getcsr() & _MM_FLUSH_ZERO_MASK;
1203 }
1204 
1205 /// Get the rounding mode bits from the MXCSR control and status register. The rounding mode may 
/// contain any of the following flags: `_MM_ROUND_NEAREST`, `_MM_ROUND_DOWN`, `_MM_ROUND_UP`, `_MM_ROUND_TOWARD_ZERO`.
1207 uint _MM_GET_ROUNDING_MODE() @safe
1208 {
1209     return _mm_getcsr() & _MM_ROUND_MASK;
1210 }
1211 
1212 /// Get the unsigned 32-bit value of the MXCSR control and status register.
/// Note: this is emulated on ARM, because ARM has no MXCSR register.
1214 uint _mm_getcsr() @trusted
1215 {
1216     static if (LDC_with_ARM)
1217     {
        // Note: we convert the ARM FPSCR into an x86 SSE control word.
1219         // However, only rounding mode and flush to zero are actually set.
1220         // The returned control word will have all exceptions masked, and no exception detected.
1221 
1222         uint fpscr = arm_get_fpcr();
1223 
1224         uint cw = 0; // No exception detected
1225         if (fpscr & _MM_FLUSH_ZERO_MASK_ARM)
1226         {
            // ARM has a single flush-to-zero flag, which covers both x86 bits (FTZ and DAZ).
1229             // https://developer.arm.com/documentation/dui0473/c/neon-and-vfp-programming/the-effects-of-using-flush-to-zero-mode
1230             cw |= _MM_FLUSH_ZERO_ON;
1231             cw |= 0x40; // set "denormals are zeros"
1232         } 
        cw |= _MM_MASK_MASK; // All exceptions masked
1234 
1235         // Rounding mode
1236         switch(fpscr & _MM_ROUND_MASK_ARM)
1237         {
1238             default:
1239             case _MM_ROUND_NEAREST_ARM:     cw |= _MM_ROUND_NEAREST;     break;
1240             case _MM_ROUND_DOWN_ARM:        cw |= _MM_ROUND_DOWN;        break;
1241             case _MM_ROUND_UP_ARM:          cw |= _MM_ROUND_UP;          break;
1242             case _MM_ROUND_TOWARD_ZERO_ARM: cw |= _MM_ROUND_TOWARD_ZERO; break;
1243         }
1244         return cw;
1245     }
1246     else version(GNU)
1247     {
1248         static if (GDC_with_SSE)
1249         {
1250             return __builtin_ia32_stmxcsr();
1251         }
1252         else version(X86)
1253         {
1254             uint sseRounding = 0;
1255             asm pure nothrow @nogc @trusted
1256             {
1257                 "stmxcsr %0;\n" 
1258                   : "=m" (sseRounding)
1259                   : 
1260                   : ;
1261             }
1262             return sseRounding;
1263         }
1264         else
1265             static assert(false);
1266     }
1267     else version (InlineX86Asm)
1268     {
1269         uint controlWord;
1270         asm nothrow @nogc pure @safe
1271         {
1272             stmxcsr controlWord;
1273         }
1274         return controlWord;
1275     }
1276     else
1277         static assert(0, "Not yet supported");
1278 }
1279 unittest
1280 {
1281     uint csr = _mm_getcsr();
1282 }
1283 
/// Insert a 16-bit integer `i` inside `v` at the location specified by `imm8`.
1285 __m64 _mm_insert_pi16 (__m64 v, int i, int imm8) pure @trusted
1286 {
1287     short4 r = cast(short4)v;
1288     r.ptr[imm8 & 3] = cast(short)i;
1289     return cast(__m64)r;
1290 }
1291 unittest
1292 {
1293     __m64 A = _mm_set_pi16(3, 2, 1, 0);
1294     short4 R = cast(short4) _mm_insert_pi16(A, 42, 1 | 4);
1295     short[4] correct = [0, 42, 2, 3];
1296     assert(R.array == correct);
1297 }
1298 
1299 /// Load 128-bits (composed of 4 packed single-precision (32-bit) floating-point elements) from memory.
/// `p` must be aligned on a 16-byte boundary or a general-protection exception may be generated.
1301 __m128 _mm_load_ps(const(float)*p) pure @trusted // TODO shouldn't be trusted
1302 {
1303     return *cast(__m128*)p;
1304 }
1305 unittest
1306 {
1307     static immutable align(16) float[4] correct = [1.0f, 2.0f, 3.0f, 4.0f];
1308     __m128 A = _mm_load_ps(correct.ptr);
1309     assert(A.array == correct);
1310 }
1311 
1312 /// Load a single-precision (32-bit) floating-point element from memory into all elements.
1313 __m128 _mm_load_ps1(const(float)*p) pure @trusted
1314 {
1315     return __m128(*p);
1316 }
1317 unittest
1318 {
1319     float n = 2.5f;
1320     float[4] correct = [2.5f, 2.5f, 2.5f, 2.5f];
1321     __m128 A = _mm_load_ps1(&n);
1322     assert(A.array == correct);
1323 }
1324 
1325 /// Load a single-precision (32-bit) floating-point element from memory into the lower of dst, and zero the upper 3 
1326 /// elements. `mem_addr` does not need to be aligned on any particular boundary.
1327 __m128 _mm_load_ss (const(float)* mem_addr) pure @trusted
1328 {
1329     static if (DMD_with_DSIMD)
1330     {
1331         return cast(__m128)__simd(XMM.LODSS, *cast(__m128*)mem_addr);
1332     }
1333     else
1334     {
1335         __m128 r;
1336         r.ptr[0] = *mem_addr;
1337         r.ptr[1] = 0;
1338         r.ptr[2] = 0;
1339         r.ptr[3] = 0;
1340         return r;
1341     }
1342 }
1343 unittest
1344 {
1345     float n = 2.5f;
1346     float[4] correct = [2.5f, 0.0f, 0.0f, 0.0f];
1347     __m128 A = _mm_load_ss(&n);
1348     assert(A.array == correct);
1349 }
1350 
1351 /// Load a single-precision (32-bit) floating-point element from memory into all elements.
1352 alias _mm_load1_ps = _mm_load_ps1;
1353 
1354 /// Load 2 single-precision (32-bit) floating-point elements from memory into the upper 2 elements of result, 
/// and copy the lower 2 elements from `a` to result. `mem_addr` does not need to be aligned on any particular boundary.
1356 __m128 _mm_loadh_pi (__m128 a, const(__m64)* mem_addr) pure @trusted
1357 {
1358     static if (DMD_with_DSIMD)
1359     {
1360         return cast(__m128) __simd(XMM.LODHPS, a, *cast(const(__m128)*)mem_addr); 
1361     }
1362     else
1363     {
1364         // x86: movlhps generated since LDC 1.9.0 -O1
1365         long2 la = cast(long2)a;
1366         la.ptr[1] = (*mem_addr).array[0];
1367         return cast(__m128)la;
1368     }
1369 }
1370 unittest
1371 {
1372     __m128 A = _mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f);
1373     __m128 B = _mm_setr_ps(5.0f, 6.0f, 7.0f, 8.0f);
1374     __m64 M = to_m64(cast(__m128i)B);
    __m128 R = _mm_loadh_pi(A, &M);
1376     float[4] correct = [1.0f, 2.0f, 5.0f, 6.0f];
1377     assert(R.array == correct);
1378 }
1379 
1380 /// Load 2 single-precision (32-bit) floating-point elements from memory into the lower 2 elements of result, 
1381 /// and copy the upper 2 elements from `a` to result. `mem_addr` does not need to be aligned on any particular boundary.
1382 __m128 _mm_loadl_pi (__m128 a, const(__m64)* mem_addr) pure @trusted
1383 {
1384     static if (DMD_with_DSIMD)
1385     {
1386         return cast(__m128) __simd(XMM.LODLPS, a, *cast(const(__m128)*)mem_addr); 
1387     }
1388     else
1389     {
        // x86: movlpd/movlps generated with all LDC -O1
1391         long2 la = cast(long2)a;
1392         la.ptr[0] = (*mem_addr).array[0];
1393         return cast(__m128)la;
1394     }
1395 }
1396 unittest
1397 {
1398     __m128 A = _mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f);
1399     __m128 B = _mm_setr_ps(5.0f, 6.0f, 7.0f, 8.0f);
1400     __m64 M = to_m64(cast(__m128i)B);
    __m128 R = _mm_loadl_pi(A, &M);
1402     float[4] correct = [5.0f, 6.0f, 3.0f, 4.0f];
1403     assert(R.array == correct);
1404 }
1405 
1406 /// Load 4 single-precision (32-bit) floating-point elements from memory in reverse order. 
1407 /// `mem_addr` must be aligned on a 16-byte boundary or a general-protection exception may be generated.
1408 __m128 _mm_loadr_ps (const(float)* mem_addr) pure @trusted // TODO shouldn't be trusted
1409 {
    __m128* aligned = cast(__m128*)mem_addr; // x86: movaps + shufps since LDC 1.0.0 -O1
1411     __m128 a = *aligned;
1412     static if (DMD_with_DSIMD)
1413     {
1414         return cast(__m128) __simd(XMM.SHUFPS, a, a, 27);
1415     }
1416     else
1417     {
1418         __m128 r;
1419         r.ptr[0] = a.array[3];
1420         r.ptr[1] = a.array[2];
1421         r.ptr[2] = a.array[1];
1422         r.ptr[3] = a.array[0];
1423         return r;
1424     }
1425 }
1426 unittest
1427 {
1428     align(16) static immutable float[4] arr = [ 1.0f, 2.0f, 3.0f, 8.0f ];
1429     __m128 A = _mm_loadr_ps(arr.ptr);
1430     float[4] correct = [ 8.0f, 3.0f, 2.0f, 1.0f ];
1431     assert(A.array == correct);
1432 }
1433 
1434 /// Load 128-bits (composed of 4 packed single-precision (32-bit) floating-point elements) from memory. 
1435 /// `mem_addr` does not need to be aligned on any particular boundary.
1436 __m128 _mm_loadu_ps(const(float)* mem_addr) pure @trusted
1437 {
1438     static if (GDC_with_SSE2)
1439     {
1440         return __builtin_ia32_loadups(mem_addr);
1441     }
1442     else version(LDC)
1443     {
1444         return loadUnaligned!(__m128)(mem_addr);
1445     }
1446     else version(DigitalMars)
1447     {
1448         static if (DMD_with_DSIMD)
1449         {
            return cast(__m128)__simd(XMM.LODUPS, *cast(const(float4)*)mem_addr);
1451         }
1452         else static if (SSESizedVectorsAreEmulated)
1453         {
            // Since this vector is emulated, it doesn't have alignment constraints
1455             // and as such we can just cast it.
1456             return *cast(__m128*)(mem_addr);
1457         }
1458         else
1459         {
1460             __m128 result;
1461             result.ptr[0] = mem_addr[0];
1462             result.ptr[1] = mem_addr[1];
1463             result.ptr[2] = mem_addr[2];
1464             result.ptr[3] = mem_addr[3];
1465             return result;
1466         }
1467     }
1468     else
1469     {
1470         __m128 result;
1471         result.ptr[0] = mem_addr[0];
1472         result.ptr[1] = mem_addr[1];
1473         result.ptr[2] = mem_addr[2];
1474         result.ptr[3] = mem_addr[3];
1475         return result;
1476     }
1477 }
1478 unittest
1479 {
1480     align(16) static immutable float[5] arr = [ 1.0f, 2.0f, 3.0f, 8.0f, 9.0f ];  // force unaligned load
1481     __m128 A = _mm_loadu_ps(&arr[1]);
1482     float[4] correct = [ 2.0f, 3.0f, 8.0f, 9.0f ];
1483     assert(A.array == correct);
1484 }
1485 
/// Load unaligned 16-bit integer from memory into the first element of result; the other elements are zeroed.
1487 __m128i _mm_loadu_si16(const(void)* mem_addr) pure @trusted
1488 {
1489     static if (DMD_with_DSIMD)
1490     {
        int r = *cast(ushort*)(mem_addr); // zero-extend: the upper bits of the 32-bit lane must be 0
1492         return cast(__m128i) __simd(XMM.LODD, *cast(__m128i*)&r);
1493     }
1494     else version(DigitalMars)
1495     {
1496         // Workaround issue: https://issues.dlang.org/show_bug.cgi?id=21672
1497         // DMD cannot handle the below code...
1498         align(16) short[8] r = [0, 0, 0, 0, 0, 0, 0, 0];
1499         r[0] = *cast(short*)(mem_addr);
1500         return *cast(int4*)(r.ptr);
1501     }
1502     else
1503     {
1504         short r = *cast(short*)(mem_addr);
1505         short8 result = [0, 0, 0, 0, 0, 0, 0, 0];
1506         result.ptr[0] = r;
1507         return cast(__m128i)result;
1508     }
1509 }
1510 unittest
1511 {
1512     short r = 13;
1513     short8 A = cast(short8) _mm_loadu_si16(&r);
1514     short[8] correct = [13, 0, 0, 0, 0, 0, 0, 0];
1515     assert(A.array == correct);
1516 }
1517 
1518 /// Load unaligned 64-bit integer from memory into the first element of result.
/// The upper 64 bits of the result are zeroed.
1520 __m128i _mm_loadu_si64(const(void)* mem_addr) pure @trusted
1521 {
1522     static if (DMD_with_DSIMD)
1523     {
1524         return cast(__m128i) __simd(XMM.LODQ, *cast(__m128i*)mem_addr);
1525     }
1526     else
1527     {
1528         long r = *cast(long*)(mem_addr);
1529         long2 result = [0, 0];
1530         result.ptr[0] = r;
1531         return cast(__m128i)result;
1532     }
1533 }
1534 unittest
1535 {
1536     long r = 446446446446;
1537     long2 A = cast(long2) _mm_loadu_si64(&r);
1538     long[2] correct = [446446446446, 0];
1539     assert(A.array == correct);
1540 }
1541 
/// Allocate `size` bytes of memory, aligned to the alignment specified in `alignment`,
1543 /// and return a pointer to the allocated memory. `_mm_free` should be used to free
1544 /// memory that is allocated with `_mm_malloc`.
1545 void* _mm_malloc(size_t size, size_t alignment) @trusted
1546 {
1547     assert(alignment != 0);
1548     size_t request = requestedSize(size, alignment);
1549     void* raw = malloc(request);
1550     if (request > 0 && raw == null) // malloc(0) can validly return anything
1551         onOutOfMemoryError();
1552     return storeRawPointerPlusInfo(raw, size, alignment); // PERF: no need to store size
1553 }
1554 
/// Conditionally store 8-bit integer elements from `a` into memory using `mask` (elements are not stored when the highest
1556 /// bit is not set in the corresponding element) and a non-temporal memory hint.
1557 void _mm_maskmove_si64 (__m64 a, __m64 mask, char* mem_addr) @trusted
1558 {
1559     // this works since mask is zero-extended
1560     return _mm_maskmoveu_si128 (to_m128i(a), to_m128i(mask), mem_addr);
1561 }
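// Illustrative check of the expected mask semantics: only bytes whose mask
// byte has its most significant bit set are stored, the rest are untouched.
unittest
{
    ubyte[8] dst = [42, 42, 42, 42, 42, 42, 42, 42];
    __m64 A = _mm_setr_pi8(1, 2, 3, 4, 5, 6, 7, 8);
    __m64 mask = _mm_setr_pi8(0, -1, 0, -1, 0, 0, 0, -1);
    _mm_maskmove_si64(A, mask, cast(char*) dst.ptr);
    ubyte[8] correct = [42, 2, 42, 4, 42, 42, 42, 8];
    assert(dst == correct);
}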
1562 
1563 deprecated("Use _mm_maskmove_si64 instead") alias _m_maskmovq = _mm_maskmove_si64;///
1564 
1565 /// Compare packed signed 16-bit integers in `a` and `b`, and return packed maximum value.
1566 __m64 _mm_max_pi16 (__m64 a, __m64 b) pure @safe
1567 {
1568     return to_m64(_mm_max_epi16(to_m128i(a), to_m128i(b)));
1569 }
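// Expected behaviour sketch: lane-wise signed 16-bit maximum.
unittest
{
    __m64 A = _mm_setr_pi16(-1, 6, -32768, 3);
    __m64 B = _mm_setr_pi16( 2, -6, -32767, 3);
    short4 R = cast(short4) _mm_max_pi16(A, B);
    short[4] correct = [2, 6, -32767, 3];
    assert(R.array == correct);
}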
1570 
1571 /// Compare packed single-precision (32-bit) floating-point elements in `a` and `b`, and return packed maximum values.
1572 __m128 _mm_max_ps(__m128 a, __m128 b) pure @safe
1573 {
1574     static if (DMD_with_DSIMD)
1575     {
1576         return cast(__m128) __simd(XMM.MAXPS, a, b);
1577     }
1578     else static if (GDC_with_SSE)
1579     {
1580         return __builtin_ia32_maxps(a, b);
1581     }
1582     else static if (LDC_with_SSE1)
1583     {
1584         return __builtin_ia32_maxps(a, b);
1585     }
1586     else
1587     {
        // ARM: Optimized into fcmgt + bsl since LDC 1.8 -O2
1589         __m128 r;
1590         r[0] = (a[0] > b[0]) ? a[0] : b[0];
1591         r[1] = (a[1] > b[1]) ? a[1] : b[1];
1592         r[2] = (a[2] > b[2]) ? a[2] : b[2];
1593         r[3] = (a[3] > b[3]) ? a[3] : b[3];
1594         return r;    
1595     }
1596 }
1597 unittest
1598 {
1599     __m128 A = _mm_setr_ps(1, 2, float.nan, 4);
1600     __m128 B = _mm_setr_ps(4, 1, 4, float.nan);
1601     __m128 M = _mm_max_ps(A, B);
1602     assert(M.array[0] == 4);
1603     assert(M.array[1] == 2);
1604     assert(M.array[2] == 4);    // in case of NaN, second operand prevails (as it seems)
1605     assert(M.array[3] != M.array[3]); // in case of NaN, second operand prevails (as it seems)
1606 }
1607 
1608 /// Compare packed unsigned 8-bit integers in `a` and `b`, and return packed maximum values.
1609 __m64 _mm_max_pu8 (__m64 a, __m64 b) pure @safe
1610 {
1611     return to_m64(_mm_max_epu8(to_m128i(a), to_m128i(b)));
1612 }
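// Expected behaviour sketch: the comparison is unsigned, so 0xFF (-1 as a
// signed byte) wins against any smaller value.
unittest
{
    __m64 A = _mm_setr_pi8(0, 1, -1,  2, 50, 0, 120, 4);
    __m64 B = _mm_setr_pi8(1, 0,  2, -1, 40, 5, 110, 4);
    byte8 R = cast(byte8) _mm_max_pu8(A, B);
    byte[8] correct = [1, 1, -1, -1, 50, 5, 120, 4];
    assert(R.array == correct);
}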
1613 
1614 /// Compare the lower single-precision (32-bit) floating-point elements in `a` and `b`, store the maximum value in the 
/// lower element of result, and copy the upper 3 packed elements from `a` to the upper elements of result.
__m128 _mm_max_ss(__m128 a, __m128 b) pure @safe
1617 {
1618     static if (DMD_with_DSIMD)
1619     {
1620         return cast(__m128) __simd(XMM.MAXSS, a, b);
1621     }
1622     else static if (GDC_with_SSE)
1623     {
1624         return __builtin_ia32_maxss(a, b);
1625     }
1626     else static if (LDC_with_SSE1)
1627     {
1628         return __builtin_ia32_maxss(a, b); 
1629     }
1630     else
1631     {  
1632         __m128 r = a;
1633         r[0] = (a[0] > b[0]) ? a[0] : b[0];
1634         return r;
1635     }
1636 }
1637 unittest
1638 {
1639     __m128 A = _mm_setr_ps(1, 2, 3, 4);
1640     __m128 B = _mm_setr_ps(4, 1, 4, 1);
1641     __m128 C = _mm_setr_ps(float.nan, 1, 4, 1);
1642     __m128 M = _mm_max_ss(A, B);
1643     assert(M.array[0] == 4);
1644     assert(M.array[1] == 2);
1645     assert(M.array[2] == 3);
1646     assert(M.array[3] == 4);
1647     M = _mm_max_ps(A, C); // in case of NaN, second operand prevails
1648     assert(M.array[0] != M.array[0]);
1649     M = _mm_max_ps(C, A); // in case of NaN, second operand prevails
1650     assert(M.array[0] == 1);
1651 }
1652 
/// Compare packed signed 16-bit integers in `a` and `b`, and return packed minimum values.
1654 __m64 _mm_min_pi16 (__m64 a, __m64 b) pure @safe
1655 {
1656     return to_m64(_mm_min_epi16(to_m128i(a), to_m128i(b)));
1657 }
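// Expected behaviour sketch: lane-wise signed 16-bit minimum.
unittest
{
    __m64 A = _mm_setr_pi16(-1, 6, -32768, 3);
    __m64 B = _mm_setr_pi16( 2, -6, -32767, 3);
    short4 R = cast(short4) _mm_min_pi16(A, B);
    short[4] correct = [-1, -6, -32768, 3];
    assert(R.array == correct);
}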
1658 
/// Compare packed single-precision (32-bit) floating-point elements in `a` and `b`, and return packed minimum values.
1660 __m128 _mm_min_ps(__m128 a, __m128 b) pure @safe
1661 {
1662     static if (DMD_with_DSIMD)
1663     {
1664         return cast(__m128) __simd(XMM.MINPS, a, b);
1665     }
1666     else static if (GDC_with_SSE)
1667     {
1668         return __builtin_ia32_minps(a, b);
1669     }
1670     else static if (LDC_with_SSE1)
1671     {
1672         // not technically needed, but better perf in debug mode
1673         return __builtin_ia32_minps(a, b);
1674     }
1675     else
1676     {
        // ARM: Optimized into fcmgt + bsl since LDC 1.8 -O2
1678         __m128 r;
1679         r[0] = (a[0] < b[0]) ? a[0] : b[0];
1680         r[1] = (a[1] < b[1]) ? a[1] : b[1];
1681         r[2] = (a[2] < b[2]) ? a[2] : b[2];
1682         r[3] = (a[3] < b[3]) ? a[3] : b[3];
1683         return r;
1684     }
1685 }
1686 unittest
1687 {
1688     __m128 A = _mm_setr_ps(1, 2, float.nan, 4);
1689     __m128 B = _mm_setr_ps(4, 1, 4, float.nan);
1690     __m128 M = _mm_min_ps(A, B);
1691     assert(M.array[0] == 1);
1692     assert(M.array[1] == 1);
1693     assert(M.array[2] == 4);    // in case of NaN, second operand prevails (as it seems)
1694     assert(M.array[3] != M.array[3]); // in case of NaN, second operand prevails (as it seems)
1695 }
1696 
1697 /// Compare packed unsigned 8-bit integers in `a` and `b`, and return packed minimum values.
1698 __m64 _mm_min_pu8 (__m64 a, __m64 b) pure @safe
1699 {
1700     return to_m64(_mm_min_epu8(to_m128i(a), to_m128i(b)));
1701 }
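// Expected behaviour sketch: the comparison is unsigned, so 0xFF (-1 as a
// signed byte) loses against any smaller value.
unittest
{
    __m64 A = _mm_setr_pi8(0, 1, -1,  2, 50, 0, 120, 4);
    __m64 B = _mm_setr_pi8(1, 0,  2, -1, 40, 5, 110, 4);
    byte8 R = cast(byte8) _mm_min_pu8(A, B);
    byte[8] correct = [0, 0, 2, 2, 40, 0, 110, 4];
    assert(R.array == correct);
}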
1702 
1703 /// Compare the lower single-precision (32-bit) floating-point elements in `a` and `b`, store the minimum value in the 
/// lower element of result, and copy the upper 3 packed elements from `a` to the upper elements of result.
1705 __m128 _mm_min_ss(__m128 a, __m128 b) pure @safe
1706 {
1707     static if (DMD_with_DSIMD)
1708     {
1709         return cast(__m128) __simd(XMM.MINSS, a, b);
1710     }
1711     else static if (GDC_with_SSE)
1712     {
1713         return __builtin_ia32_minss(a, b);
1714     }
1715     else static if (LDC_with_SSE1)
1716     {
1717         return __builtin_ia32_minss(a, b);
1718     }
1719     else
1720     {
1721         // Generates minss since LDC 1.3 -O1
1722         __m128 r = a;
1723         r[0] = (a[0] < b[0]) ? a[0] : b[0];
1724         return r;
1725     }
1726 }
1727 unittest
1728 {
1729     __m128 A = _mm_setr_ps(1, 2, 3, 4);
1730     __m128 B = _mm_setr_ps(4, 1, 4, 1);
1731     __m128 C = _mm_setr_ps(float.nan, 1, 4, 1);
1732     __m128 M = _mm_min_ss(A, B);
1733     assert(M.array[0] == 1);
1734     assert(M.array[1] == 2);
1735     assert(M.array[2] == 3);
1736     assert(M.array[3] == 4);
1737     M = _mm_min_ps(A, C); // in case of NaN, second operand prevails
1738     assert(M.array[0] != M.array[0]);
1739     M = _mm_min_ps(C, A); // in case of NaN, second operand prevails
1740     assert(M.array[0] == 1);
1741 }
1742 
1743 /// Move the lower single-precision (32-bit) floating-point element from `b` to the lower element of result, and copy 
1744 /// the upper 3 packed elements from `a` to the upper elements of result.
1745 __m128 _mm_move_ss (__m128 a, __m128 b) pure @trusted
1746 {
1747     // Workaround https://issues.dlang.org/show_bug.cgi?id=21673
1748     // inlining of this function fails.
1749     version(DigitalMars) asm nothrow @nogc pure { nop; }
1750 
1751     a.ptr[0] = b.array[0];
1752     return a;
1753 }
1754 unittest
1755 {
1756     __m128 A = _mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f);
1757     __m128 B = _mm_setr_ps(5.0f, 6.0f, 7.0f, 8.0f);
1758     __m128 R = _mm_move_ss(A, B);
1759     float[4] correct = [5.0f, 2.0f, 3.0f, 4.0f];
1760     assert(R.array == correct);
1761 }
1762 
1763 /// Move the upper 2 single-precision (32-bit) floating-point elements from `b` to the lower 2 elements of result, and 
/// copy the upper 2 elements from `a` to the upper 2 elements of result.
1765 __m128 _mm_movehl_ps (__m128 a, __m128 b) pure @trusted
1766 {
1767     // Disabled because of https://issues.dlang.org/show_bug.cgi?id=19443
1768     /*
1769     static if (DMD_with_DSIMD)
1770     {
1771         
1772         return cast(__m128) __simd(XMM.MOVHLPS, a, b);
1773     }
1774     else */
1775     {
1776         a.ptr[0] = b.array[2];
1777         a.ptr[1] = b.array[3];
1778         return a;
1779     }
1780 }
1781 unittest
1782 {
1783     __m128 A = _mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f);
1784     __m128 B = _mm_setr_ps(5.0f, 6.0f, 7.0f, 8.0f);
1785     __m128 R = _mm_movehl_ps(A, B);
1786     float[4] correct = [7.0f, 8.0f, 3.0f, 4.0f];
1787     assert(R.array == correct);
1788 }
1789 
1790 /// Move the lower 2 single-precision (32-bit) floating-point elements from `b` to the upper 2 elements of result, and 
/// copy the lower 2 elements from `a` to the lower 2 elements of result.
1792 __m128 _mm_movelh_ps (__m128 a, __m128 b) pure @trusted
1793 {    
1794     // Disabled because of https://issues.dlang.org/show_bug.cgi?id=19443
1795     /*
1796     static if (DMD_with_DSIMD)
1797     {
1798         return cast(__m128) __simd(XMM.MOVLHPS, a, b);
1799     }
1800     else
1801     */
1802     {
1803         a.ptr[2] = b.array[0];
1804         a.ptr[3] = b.array[1];
1805         return a;
1806     }    
1807 }
1808 unittest
1809 {
1810     __m128 A = _mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f);
1811     __m128 B = _mm_setr_ps(5.0f, 6.0f, 7.0f, 8.0f);
1812     __m128 R = _mm_movelh_ps(A, B);
1813     float[4] correct = [1.0f, 2.0f, 5.0f, 6.0f];
1814     assert(R.array == correct);
1815 }
1816 
1817 /// Create mask from the most significant bit of each 8-bit element in `a`.
1818 int _mm_movemask_pi8 (__m64 a) pure @safe
1819 {
1820     return _mm_movemask_epi8(to_m128i(a));
1821 }
1822 unittest
1823 {
1824     assert(0x9C == _mm_movemask_pi8(_mm_set_pi8(-1, 0, 0, -1, -1, -1, 0, 0)));
1825 }
1826 
1827 /// Set each bit of result based on the most significant bit of the corresponding packed single-precision (32-bit) 
1828 /// floating-point element in `a`.
1829 int _mm_movemask_ps (__m128 a) pure @trusted
1830 {
1831     // PERF: Not possible in D_SIMD because of https://issues.dlang.org/show_bug.cgi?id=8047
1832     static if (GDC_with_SSE)
1833     {
1834         return __builtin_ia32_movmskps(a);
1835     }
1836     else static if (LDC_with_SSE1)
1837     {
1838         return __builtin_ia32_movmskps(a);
1839     }
1840     else static if (LDC_with_ARM)
1841     {
1842         int4 ai = cast(int4)a;
1843         int4 shift31 = [31, 31, 31, 31]; 
1844         ai = ai >>> shift31;
1845         int4 shift = [0, 1, 2, 3]; 
1846         ai = ai << shift; // 4-way shift, only efficient on ARM.
1847         int r = ai.array[0] + (ai.array[1]) + (ai.array[2]) + (ai.array[3]);
1848         return r;
1849     }
1850     else
1851     {
1852         int4 ai = cast(int4)a;
1853         int r = 0;
1854         if (ai.array[0] < 0) r += 1;
1855         if (ai.array[1] < 0) r += 2;
1856         if (ai.array[2] < 0) r += 4;
1857         if (ai.array[3] < 0) r += 8;
1858         return r;
1859     }
1860 }
1861 unittest
1862 {
1863     int4 A = [-1, 0, -43, 0];
1864     assert(5 == _mm_movemask_ps(cast(float4)A));
1865 }
1866 
1867 /// Multiply packed single-precision (32-bit) floating-point elements in `a` and `b`.
1868 __m128 _mm_mul_ps(__m128 a, __m128 b) pure @safe
1869 {
1870     return a * b;
1871 }
1872 unittest
1873 {
1874     __m128 a = [1.5f, -2.0f, 3.0f, 1.0f];
1875     a = _mm_mul_ps(a, a);
1876     float[4] correct = [2.25f, 4.0f, 9.0f, 1.0f];
1877     assert(a.array == correct);
1878 }
1879 
1880 /// Multiply the lower single-precision (32-bit) floating-point element in `a` and `b`, store the result in the lower 
1881 /// element of result, and copy the upper 3 packed elements from `a` to the upper elements of result.
1882 __m128 _mm_mul_ss(__m128 a, __m128 b) pure @safe
1883 {
1884     static if (DMD_with_DSIMD)
1885         return cast(__m128) __simd(XMM.MULSS, a, b);
1886     else static if (GDC_with_SSE)
1887         return __builtin_ia32_mulss(a, b);
1888     else
1889     {
1890         a[0] *= b[0];
1891         return a;
1892     }
1893 }
1894 unittest
1895 {
1896     __m128 a = [1.5f, -2.0f, 3.0f, 1.0f];
1897     a = _mm_mul_ss(a, a);
1898     float[4] correct = [2.25f, -2.0f, 3.0f, 1.0f];
1899     assert(a.array == correct);
1900 }
1901 
1902 /// Multiply the packed unsigned 16-bit integers in `a` and `b`, producing intermediate 32-bit integers, 
1903 /// and return the high 16 bits of the intermediate integers.
1904 __m64 _mm_mulhi_pu16 (__m64 a, __m64 b) pure @safe
1905 {
1906     return to_m64(_mm_mulhi_epu16(to_m128i(a), to_m128i(b)));
1907 }
1908 unittest
1909 {
1910     __m64 A = _mm_setr_pi16(0, -16, 2, 3);
1911     __m64 B = _mm_set1_pi16(16384);
1912     short4 R = cast(short4)_mm_mulhi_pu16(A, B);
1913     short[4] correct = [0, 0x3FFC, 0, 0];
1914     assert(R.array == correct);
1915 }
1916 
1917 /// Compute the bitwise OR of packed single-precision (32-bit) floating-point elements in `a` and `b`, and 
1918 /// return the result.
1919 __m128 _mm_or_ps (__m128 a, __m128 b) pure @safe
1920 {
1921     static if (DMD_with_DSIMD)
1922         return cast(__m128)__simd(XMM.ORPS, a, b);
1923     else
1924         return cast(__m128)(cast(__m128i)a | cast(__m128i)b);
1925 }
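// Quick check of the expected bitwise OR, viewed through integer lanes.
unittest
{
    __m128i A = _mm_setr_epi32(1, 2, 4, 8);
    __m128i B = _mm_setr_epi32(2, 1, 4, 0);
    int4 R = cast(int4) _mm_or_ps(cast(__m128) A, cast(__m128) B);
    int[4] correct = [3, 3, 4, 8];
    assert(R.array == correct);
}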
1926 
1927 deprecated("Use _mm_avg_pu8 instead") alias _m_pavgb = _mm_avg_pu8;///
1928 deprecated("Use _mm_avg_pu16 instead") alias _m_pavgw = _mm_avg_pu16;///
1929 deprecated("Use _mm_extract_pi16 instead") alias _m_pextrw = _mm_extract_pi16;///
1930 deprecated("Use _mm_insert_pi16 instead") alias _m_pinsrw = _mm_insert_pi16;///
1931 deprecated("Use _mm_max_pi16 instead") alias _m_pmaxsw = _mm_max_pi16;///
1932 deprecated("Use _mm_max_pu8 instead") alias _m_pmaxub = _mm_max_pu8;///
1933 deprecated("Use _mm_min_pi16 instead") alias _m_pminsw = _mm_min_pi16;///
1934 deprecated("Use _mm_min_pu8 instead") alias _m_pminub = _mm_min_pu8;///
1935 deprecated("Use _mm_movemask_pi8 instead") alias _m_pmovmskb = _mm_movemask_pi8;///
1936 deprecated("Use _mm_mulhi_pu16 instead") alias _m_pmulhuw = _mm_mulhi_pu16;///
1937 
1938 enum _MM_HINT_T0  = 3; ///
1939 enum _MM_HINT_T1  = 2; ///
1940 enum _MM_HINT_T2  = 1; ///
1941 enum _MM_HINT_NTA = 0; ///
1942 
1943 
1944 version(LDC)
1945 {
1946     // Starting with LLVM 10, it seems llvm.prefetch has changed its name.
1947     // Was reported at: https://github.com/ldc-developers/ldc/issues/3397
1948     static if (__VERSION__ >= 2091) 
1949     {
1950         pragma(LDC_intrinsic, "llvm.prefetch.p0i8") // was "llvm.prefetch"
1951             void llvm_prefetch_fixed(void* ptr, uint rw, uint locality, uint cachetype) pure @safe;
1952     }
1953 }
1954 
1955 /// Fetch the line of data from memory that contains address `p` to a location in the 
/// cache hierarchy specified by the locality hint `locality`.
1957 ///
1958 /// Warning: `locality` is a compile-time parameter, unlike in Intel Intrinsics API.
1959 void _mm_prefetch(int locality)(const(void)* p) pure @trusted
1960 {
1961     static if (GDC_with_SSE)
1962     {
1963         return __builtin_prefetch(p, (locality & 0x4) >> 2, locality & 0x3);
1964     }
1965     else static if (DMD_with_DSIMD)
1966     {
1967         enum bool isWrite = (locality & 0x4) != 0;
1968         enum level = locality & 3;
1969         return prefetch!(isWrite, level)(p);
1970     }
1971     else version(LDC)
1972     {
1973         static if (__VERSION__ >= 2091)
1974         {
1975             // const_cast here. `llvm_prefetch` wants a mutable pointer
1976             llvm_prefetch_fixed( cast(void*)p, 0, locality, 1);
1977         }
1978         else
1979         {
1980             // const_cast here. `llvm_prefetch` wants a mutable pointer
1981             llvm_prefetch( cast(void*)p, 0, locality, 1);
1982         }
1983     }
1984     else version(D_InlineAsm_X86_64)
1985     {
1986         static if (locality == _MM_HINT_NTA)
1987         {
1988             asm pure nothrow @nogc @trusted
1989             {
1990                 mov RAX, p;
1991                 prefetchnta [RAX];
1992             }
1993         }
1994         else static if (locality == _MM_HINT_T0)
1995         {
1996             asm pure nothrow @nogc @trusted
1997             {
1998                 mov RAX, p;
1999                 prefetcht0 [RAX];
2000             }
2001         }
2002         else static if (locality == _MM_HINT_T1)
2003         {
2004             asm pure nothrow @nogc @trusted
2005             {
2006                 mov RAX, p;
2007                 prefetcht1 [RAX];
2008             }
2009         }
2010         else static if (locality == _MM_HINT_T2)
2011         {
2012             asm pure nothrow @nogc @trusted
2013             {
2014                 mov RAX, p;
2015                 prefetcht2 [RAX];
2016             }
2017         }
2018         else
2019             assert(false); // invalid locality hint
2020     }
2021     else version(D_InlineAsm_X86)
2022     {
2023         static if (locality == _MM_HINT_NTA)
2024         {
2025             asm pure nothrow @nogc @trusted
2026             {
2027                 mov EAX, p;
2028                 prefetchnta [EAX];
2029             }
2030         }
2031         else static if (locality == _MM_HINT_T0)
2032         {
2033             asm pure nothrow @nogc @trusted
2034             {
2035                 mov EAX, p;
2036                 prefetcht0 [EAX];
2037             }
2038         }
2039         else static if (locality == _MM_HINT_T1)
2040         {
2041             asm pure nothrow @nogc @trusted
2042             {
2043                 mov EAX, p;
2044                 prefetcht1 [EAX];
2045             }
2046         }
2047         else static if (locality == _MM_HINT_T2)
2048         {
2049             asm pure nothrow @nogc @trusted
2050             {
2051                 mov EAX, p;
2052                 prefetcht2 [EAX];
2053             }
2054         }
2055         else 
2056             assert(false); // invalid locality hint
2057     }
2058     else
2059     {
2060         // Generic version: do nothing. From bitter experience, 
2061         // it's unlikely you get ANY speed-up with manual prefetching.
2062         // Prefetching or not doesn't change program behaviour.
2063     }
2064 }
2065 unittest
2066 {
2067     // From Intel documentation:
2068     // "The amount of data prefetched is also processor implementation-dependent. It will, however, be a minimum of 
2069     // 32 bytes."
    ubyte[256] cacheline; // though it seems prefetching cannot generate a GP fault
2071     _mm_prefetch!_MM_HINT_T0(cacheline.ptr); 
2072     _mm_prefetch!_MM_HINT_T1(cacheline.ptr); 
2073     _mm_prefetch!_MM_HINT_T2(cacheline.ptr); 
2074     _mm_prefetch!_MM_HINT_NTA(cacheline.ptr); 
2075 }
2076 
2077 deprecated("Use _mm_sad_pu8 instead") alias _m_psadbw = _mm_sad_pu8;///
2078 deprecated("Use _mm_shuffle_pi16 instead") alias _m_pshufw = _mm_shuffle_pi16;///
2079 
2080 
/// Compute the approximate reciprocal of packed single-precision (32-bit) floating-point elements in `a`,
/// and return the results. The maximum relative error for this approximation is less than 1.5*2^-12.
2083 __m128 _mm_rcp_ps (__m128 a) pure @trusted
2084 {
2085     static if (DMD_with_DSIMD)
2086     {
2087         return cast(__m128) __simd(XMM.RCPPS, a);
2088     }
2089     else static if (GDC_with_SSE)
2090     {
2091         return __builtin_ia32_rcpps(a);
2092     }
2093     else static if (LDC_with_SSE1)
2094     {
2095         return __builtin_ia32_rcpps(a);
2096     }
2097     else
2098     {        
2099         a.ptr[0] = 1.0f / a.array[0];
2100         a.ptr[1] = 1.0f / a.array[1];
2101         a.ptr[2] = 1.0f / a.array[2];
2102         a.ptr[3] = 1.0f / a.array[3];
2103         return a;
2104     }
2105 }
2106 unittest
2107 {
2108     __m128 A = _mm_setr_ps(2.34f, -70000.0f, 0.00001f, 345.5f);
2109     __m128 groundTruth = _mm_set1_ps(1.0f) / A;
2110     __m128 result = _mm_rcp_ps(A);
2111     foreach(i; 0..4)
2112     {
2113         double relError = (cast(double)(groundTruth.array[i]) / result.array[i]) - 1;
2114         assert(abs_double(relError) < 0.00037); // 1.5*2^-12 is 0.00036621093
2115     }
2116 }
2117 
2118 /// Compute the approximate reciprocal of the lower single-precision (32-bit) floating-point element in `a`, store it 
2119 /// in the lower element of the result, and copy the upper 3 packed elements from `a` to the upper elements of result. 
2120 /// The maximum relative error for this approximation is less than 1.5*2^-12.
2121 __m128 _mm_rcp_ss (__m128 a) pure @trusted
2122 {
2123     static if (DMD_with_DSIMD)
2124     {
2125         return cast(__m128) __simd(XMM.RCPSS, a);
2126     }
2127     else static if (GDC_with_SSE)
2128     {
2129         return __builtin_ia32_rcpss(a);
2130     }
2131     else static if (LDC_with_SSE1)
2132     {
2133         return __builtin_ia32_rcpss(a);
2134     }
2135     else
2136     {
2137         a.ptr[0] = 1.0f / a.array[0];
2138         return a;
2139     }
2140 }
2141 unittest
2142 {
2143     __m128 A = _mm_setr_ps(2.34f, -70000.0f, 0.00001f, 345.5f);
2144     __m128 correct = _mm_setr_ps(1 / 2.34f, -70000.0f, 0.00001f, 345.5f);
2145     __m128 R = _mm_rcp_ss(A);
2146     double relError = (cast(double)(correct.array[0]) / R.array[0]) - 1;
2147     assert(abs_double(relError) < 0.00037); // 1.5*2^-12 is 0.00036621093
2148     assert(R.array[1] == correct.array[1]);
2149     assert(R.array[2] == correct.array[2]);
2150     assert(R.array[3] == correct.array[3]);
2151 }
2152 
2153 /// Reallocate `size` bytes of memory, aligned to the alignment specified in `alignment`, 
2154 /// and return a pointer to the newly allocated memory. 
2155 /// `_mm_free` or `alignedRealloc` with size 0 should be used to free memory that is 
2156 /// allocated with `_mm_malloc` or `_mm_realloc`.
2157 /// Previous data is preserved.
2158 void* _mm_realloc(void* aligned, size_t size, size_t alignment) nothrow @nogc // #BONUS
2159 {
2160     return alignedReallocImpl!true(aligned, size, alignment);
2161 }
2162 unittest
2163 {
2164     enum NALLOC = 8;
2165     enum size_t[8] ALIGNMENTS = [1, 2, 4, 8, 16, 32, 64, 128];
2166     
2167     void*[NALLOC] alloc;
2168 
2169     foreach(t; 0..100)
2170     {
2171         foreach(n; 0..NALLOC)
2172         {
2173             size_t alignment = ALIGNMENTS[n];
2174             size_t s = ( (n + t * 69096) & 0xffff );
2175             alloc[n] = _mm_realloc(alloc[n], s, alignment);
2176             assert(isPointerAligned(alloc[n], alignment));
2177             foreach(b; 0..s)
2178                 (cast(ubyte*)alloc[n])[b] = cast(ubyte)n;
2179         }
2180     }
2181     foreach(n; 0..NALLOC)
2182     {
2183         alloc[n] = _mm_realloc(alloc[n], 0, ALIGNMENTS[n]);
2184     }
2185 }
2186 
2187 /// Reallocate `size` bytes of memory, aligned to the alignment specified in `alignment`, 
2188 /// and return a pointer to the newly allocated memory. 
2189 /// `_mm_free` or `alignedRealloc` with size 0 should be used to free memory that is 
2190 /// allocated with `_mm_malloc` or `_mm_realloc`.
2191 /// Previous data is discarded.
2192 void* _mm_realloc_discard(void* aligned, size_t size, size_t alignment) nothrow @nogc // #BONUS
2193 {
2194     return alignedReallocImpl!false(aligned, size, alignment);
2195 }
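// Usage sketch: contents are discarded across the reallocation, but the
// returned pointer keeps the requested alignment.
unittest
{
    void* p = _mm_realloc_discard(null, 100, 16);
    assert(p !is null && isPointerAligned(p, 16));
    p = _mm_realloc_discard(p, 200, 16);
    assert(isPointerAligned(p, 16));
    _mm_free(p);
}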
2196 
2197 /// Compute the approximate reciprocal square root of packed single-precision (32-bit) floating-point elements in `a`. 
2198 /// The maximum relative error for this approximation is less than 1.5*2^-12.
2199 __m128 _mm_rsqrt_ps (__m128 a) pure @trusted
2200 {
2201     static if (DMD_with_DSIMD)
2202     {
2203         return cast(__m128) __simd(XMM.RSQRTPS, a);
2204     }
2205     else static if (GDC_with_SSE)
2206     {
2207         return __builtin_ia32_rsqrtps(a);
2208     }
2209     else static if (LDC_with_SSE1)
2210     {
2211         return __builtin_ia32_rsqrtps(a);
2212     }
2213     else version(LDC)
2214     {
2215         a[0] = 1.0f / llvm_sqrt(a[0]);
2216         a[1] = 1.0f / llvm_sqrt(a[1]);
2217         a[2] = 1.0f / llvm_sqrt(a[2]);
2218         a[3] = 1.0f / llvm_sqrt(a[3]);
2219         return a;
2220     }
2221     else
2222     {
2223         a.ptr[0] = 1.0f / sqrt(a.array[0]);
2224         a.ptr[1] = 1.0f / sqrt(a.array[1]);
2225         a.ptr[2] = 1.0f / sqrt(a.array[2]);
2226         a.ptr[3] = 1.0f / sqrt(a.array[3]);
2227         return a;
2228     }
2229 }
2230 unittest
2231 {
2232     __m128 A = _mm_setr_ps(2.34f, 70000.0f, 0.00001f, 345.5f);
2233     __m128 groundTruth = _mm_setr_ps(0.65372045f, 0.00377964473f, 316.227766f, 0.05379921937f);
2234     __m128 result = _mm_rsqrt_ps(A);
2235     foreach(i; 0..4)
2236     {
2237         double relError = (cast(double)(groundTruth.array[i]) / result.array[i]) - 1;
2238         assert(abs_double(relError) < 0.00037); // 1.5*2^-12 is 0.00036621093
2239     }
2240 }
2241 
2242 /// Compute the approximate reciprocal square root of the lower single-precision (32-bit) floating-point element in `a`,
2243 /// store the result in the lower element. Copy the upper 3 packed elements from `a` to the upper elements of result. 
2244 /// The maximum relative error for this approximation is less than 1.5*2^-12.
2245 __m128 _mm_rsqrt_ss (__m128 a) pure @trusted
2246 {   
2247     static if (DMD_with_DSIMD)
2248     {
2249         return cast(__m128) __simd(XMM.RSQRTSS, a);
2250     }
2251     else static if (GDC_with_SSE)
2252     {
2253         return __builtin_ia32_rsqrtss(a);
2254     }
2255     else static if (LDC_with_SSE1)
2256     {
2257         return __builtin_ia32_rsqrtss(a);
2258     }
2259     else version(LDC)
2260     {
2261         a[0] = 1.0f / llvm_sqrt(a[0]);
2262         return a;
2263     }
2264     else
2265     {
2266         a[0] = 1.0f / sqrt(a[0]);
2267         return a;
2268     }
2269 }
2270 unittest // this one test 4 different intrinsics: _mm_rsqrt_ss, _mm_rsqrt_ps, _mm_rcp_ps, _mm_rcp_ss
2271 {
    double maxRelativeError = 0.000245; // -72 dB; apparently more precise than the documented bound
2273     void testApproximateSSE(float number) nothrow @nogc
2274     {
2275         __m128 A = _mm_set1_ps(number);
2276 
2277         // test _mm_rcp_ps
2278         __m128 B = _mm_rcp_ps(A);
2279         foreach(i; 0..4)
2280         {
2281             double exact = 1.0f / A.array[i];
2282             double ratio = cast(double)(B.array[i]) / cast(double)(exact);
2283             assert(abs_double(ratio - 1) <= maxRelativeError);
2284         }
2285 
2286         // test _mm_rcp_ss
2287         {
2288             B = _mm_rcp_ss(A);
2289             double exact = 1.0f / A.array[0];
2290             double ratio = cast(double)(B.array[0]) / cast(double)(exact);
2291             assert(abs_double(ratio - 1) <= maxRelativeError);
2292         }
2293 
2294         // test _mm_rsqrt_ps
2295         B = _mm_rsqrt_ps(A);
2296         foreach(i; 0..4)
2297         {
2298             double exact = 1.0f / sqrt(A.array[i]);
2299             double ratio = cast(double)(B.array[i]) / cast(double)(exact);
2300             assert(abs_double(ratio - 1) <= maxRelativeError);
2301         }
2302 
2303         // test _mm_rsqrt_ss
2304         {
2305             B = _mm_rsqrt_ss(A);
2306             double exact = 1.0f / sqrt(A.array[0]);
2307             double ratio = cast(double)(B.array[0]) / cast(double)(exact);
2308             assert(abs_double(ratio - 1) <= maxRelativeError);
2309         }
2310     }
2311 
2312     testApproximateSSE(0.00001f);
2313     testApproximateSSE(1.1f);
2314     testApproximateSSE(345.0f);
2315     testApproximateSSE(2.45674864151f);
2316     testApproximateSSE(700000.0f);
2317     testApproximateSSE(10000000.0f);
2318     testApproximateSSE(27841456468.0f);
2319 }
2320 
2321 /// Compute the absolute differences of packed unsigned 8-bit integers in `a` and `b`, then horizontally sum each 
2322 /// consecutive 8 differences to produce four unsigned 16-bit integers, and pack these unsigned 16-bit integers in the 
2323 /// low 16 bits of result.
2324 __m64 _mm_sad_pu8 (__m64 a, __m64 b) pure @safe
2325 {
2326     return to_m64(_mm_sad_epu8(to_m128i(a), to_m128i(b)));
2327 }
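// Expected behaviour sketch: sum of absolute byte differences, packed into
// the low 16 bits of the result.
unittest
{
    __m64 A = _mm_setr_pi8(2, 4, 8, 0, 0, 0, 0, 0);
    __m64 B = _mm_setr_pi8(1, 2, 4, 0, 0, 0, 0, 0);
    short4 R = cast(short4) _mm_sad_pu8(A, B);
    short[4] correct = [1 + 2 + 4, 0, 0, 0];
    assert(R.array == correct);
}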
2328 
2329 /// Set the exception mask bits of the MXCSR control and status register to the value in unsigned 32-bit integer 
2330 /// `_MM_MASK_xxxx`. The exception mask may contain any of the following flags: `_MM_MASK_INVALID`, `_MM_MASK_DIV_ZERO`,
2331 /// `_MM_MASK_DENORM`, `_MM_MASK_OVERFLOW`, `_MM_MASK_UNDERFLOW`, `_MM_MASK_INEXACT`.
2332 void _MM_SET_EXCEPTION_MASK(int _MM_MASK_xxxx) @safe
2333 {
2334     // Note: unsupported on ARM
2335     _mm_setcsr((_mm_getcsr() & ~_MM_MASK_MASK) | _MM_MASK_xxxx);
2336 }
2337 
2338 /// Set the exception state bits of the MXCSR control and status register to the value in unsigned 32-bit integer 
2339 /// `_MM_EXCEPT_xxxx`. The exception state may contain any of the following flags: `_MM_EXCEPT_INVALID`, 
2340 /// `_MM_EXCEPT_DIV_ZERO`, `_MM_EXCEPT_DENORM`, `_MM_EXCEPT_OVERFLOW`, `_MM_EXCEPT_UNDERFLOW`, `_MM_EXCEPT_INEXACT`.
2341 void _MM_SET_EXCEPTION_STATE(int _MM_EXCEPT_xxxx) @safe
2342 {
2343     // Note: unsupported on ARM
2344     _mm_setcsr((_mm_getcsr() & ~_MM_EXCEPT_MASK) | _MM_EXCEPT_xxxx);
2345 }
2346 
2347 /// Set the flush zero bits of the MXCSR control and status register to the value in unsigned 32-bit integer 
2348 /// `_MM_FLUSH_xxxx`. The flush zero may contain any of the following flags: `_MM_FLUSH_ZERO_ON` or `_MM_FLUSH_ZERO_OFF`.
2349 void _MM_SET_FLUSH_ZERO_MODE(int _MM_FLUSH_xxxx) @safe
2350 {
2351     _mm_setcsr((_mm_getcsr() & ~_MM_FLUSH_ZERO_MASK) | _MM_FLUSH_xxxx);
2352 }
2353 
2354 /// Set packed single-precision (32-bit) floating-point elements with the supplied values.
2355 __m128 _mm_set_ps (float e3, float e2, float e1, float e0) pure @trusted
2356 {
2357     // Note: despite appearances, generates sensible code,
2358     //       inlines correctly and is constant folded
2359     float[4] result = [e0, e1, e2, e3];
2360     return loadUnaligned!(float4)(result.ptr);
2361 }
2362 unittest
2363 {
2364     __m128 A = _mm_set_ps(3, 2, 1, 546);
2365     float[4] correct = [546.0f, 1.0f, 2.0f, 3.0f];
2366     assert(A.array == correct);
2367 }
2368 
2369 deprecated("Use _mm_set1_ps instead") alias _mm_set_ps1 = _mm_set1_ps; ///
2370 
2371 /// Set the rounding mode bits of the MXCSR control and status register to the value in unsigned 32-bit integer 
2372 /// `_MM_ROUND_xxxx`. The rounding mode may contain any of the following flags: `_MM_ROUND_NEAREST`, `_MM_ROUND_DOWN`, 
2373 /// `_MM_ROUND_UP`, `_MM_ROUND_TOWARD_ZERO`.
2374 void _MM_SET_ROUNDING_MODE(int _MM_ROUND_xxxx) @safe
2375 {
2376     // Work-around for https://gcc.gnu.org/bugzilla/show_bug.cgi?id=98607
2377     version(GNU) asm @trusted { "" : : : "memory"; }
2378     _mm_setcsr((_mm_getcsr() & ~_MM_ROUND_MASK) | _MM_ROUND_xxxx);
2379 }
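// Round-trip sketch: change the rounding mode, then restore the previous one,
// leaving MXCSR as it was on entry.
unittest
{
    uint previousRounding = _mm_getcsr() & _MM_ROUND_MASK;
    _MM_SET_ROUNDING_MODE(_MM_ROUND_TOWARD_ZERO);
    assert((_mm_getcsr() & _MM_ROUND_MASK) == _MM_ROUND_TOWARD_ZERO);
    _MM_SET_ROUNDING_MODE(previousRounding);
}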
2380 
2381 /// Copy single-precision (32-bit) floating-point element `a` to the lower element of result, and zero the upper 3 elements.
2382 __m128 _mm_set_ss (float a) pure @trusted
2383 {
2384     static if (DMD_with_DSIMD)
2385     {
2386         return cast(__m128) __simd(XMM.LODSS, a);
2387     }
2388     else
2389     {
2390         __m128 r = _mm_setzero_ps();
2391         r.ptr[0] = a;
2392         return r;
2393     }
2394 }
2395 unittest
2396 {
2397     float[4] correct = [42.0f, 0.0f, 0.0f, 0.0f];
2398     __m128 A = _mm_set_ss(42.0f);
2399     assert(A.array == correct);
2400 }
2401 
2402 /// Broadcast single-precision (32-bit) floating-point value `a` to all elements.
2403 __m128 _mm_set1_ps (float a) pure @trusted
2404 {
2405     __m128 r = a;
2406     return r;
2407 }
2408 unittest
2409 {
2410     float[4] correct = [42.0f, 42.0f, 42.0f, 42.0f];
2411     __m128 A = _mm_set1_ps(42.0f);
2412     assert(A.array == correct);
2413 }
2414 
2415 /// Set the MXCSR control and status register with the value in unsigned 32-bit integer `controlWord`.
2416 void _mm_setcsr(uint controlWord) @trusted
2417 {
2418     static if (LDC_with_ARM)
2419     {
        // Convert from SSE to ARM control word. This is done _partially_,
        // and only supports rounding-mode and flush-to-zero changes.
2422 
2423         // "To alter some bits of a VFP system register without 
2424         // affecting other bits, use a read-modify-write procedure"
2425         uint fpscr = arm_get_fpcr();
2426         
        // Bits 23 to 22 hold the rounding mode; NEON itself does not use them.
2428         fpscr = fpscr & ~_MM_ROUND_MASK_ARM;
2429         switch(controlWord & _MM_ROUND_MASK)
2430         {
2431             default:
2432             case _MM_ROUND_NEAREST:     fpscr |= _MM_ROUND_NEAREST_ARM;     break;
2433             case _MM_ROUND_DOWN:        fpscr |= _MM_ROUND_DOWN_ARM;        break;
2434             case _MM_ROUND_UP:          fpscr |= _MM_ROUND_UP_ARM;          break;
2435             case _MM_ROUND_TOWARD_ZERO: fpscr |= _MM_ROUND_TOWARD_ZERO_ARM; break;
2436         }
2437         fpscr = fpscr & ~_MM_FLUSH_ZERO_MASK_ARM;
2438         if (controlWord & _MM_FLUSH_ZERO_MASK)
2439             fpscr |= _MM_FLUSH_ZERO_MASK_ARM;
2440         arm_set_fpcr(fpscr);
2441     }
2442     else version(GNU)
2443     {
2444         static if (GDC_with_SSE)
2445         {
2446             // Work-around for https://gcc.gnu.org/bugzilla/show_bug.cgi?id=98607
2447             version(GNU) asm @trusted { "" : : : "memory"; }
2448             __builtin_ia32_ldmxcsr(controlWord);
2449         }
2450         else version(X86)
2451         {
2452             asm pure nothrow @nogc @trusted
2453             {
2454                 "ldmxcsr %0;\n" 
2455                   : 
2456                   : "m" (controlWord)
2457                   : ;
2458             }
2459         }
2460         else
2461             static assert(false);
2462     }
2463     else version (InlineX86Asm)
2464     {
2465         asm pure nothrow @nogc @safe
2466         {
2467             ldmxcsr controlWord;
2468         }
2469     }
2470     else
2471         static assert(0, "Not yet supported");
2472 }
2473 unittest
2474 {
2475     _mm_setcsr(_mm_getcsr());
2476 }
2477 
2478 /// Set packed single-precision (32-bit) floating-point elements with the supplied values in reverse order.
2479 __m128 _mm_setr_ps (float e3, float e2, float e1, float e0) pure @trusted
2480 {
2481     float[4] result = [e3, e2, e1, e0];
2482     return loadUnaligned!(float4)(result.ptr);
2483 }
2484 unittest
2485 {
2486     __m128 A = _mm_setr_ps(3, 2, 1, 546);
2487     float[4] correct = [3.0f, 2.0f, 1.0f, 546.0f];
2488     assert(A.array == correct);
2489     assert(A.array[0] == 3.0f);
2490     assert(A.array[1] == 2.0f);
2491     assert(A.array[2] == 1.0f);
2492     assert(A.array[3] == 546.0f);
2493 }
2494 
2495 /// Return vector of type `__m128` with all elements set to zero.
2496 __m128 _mm_setzero_ps() pure @trusted
2497 {
2498     float4 r;
2499     r = 0.0f;
2500     return r;
2501 }
2502 unittest
2503 {
2504     __m128 R = _mm_setzero_ps();
2505     float[4] correct = [0.0f, 0, 0, 0];
2506     assert(R.array == correct);
2507 }
2508 
2509 /// Perform a serializing operation on all store-to-memory instructions that were issued prior 
/// to this instruction. Guarantees that every store instruction that precedes the fence in program order is
/// globally visible before any store instruction that follows the fence in program order.
2512 void _mm_sfence() @trusted
2513 {
2514     version(GNU)
2515     {
2516         static if (GDC_with_SSE)
2517         {
2518             __builtin_ia32_sfence();
2519         }
2520         else version(X86)
2521         {
2522             asm pure nothrow @nogc @trusted
2523             {
2524                 "sfence;\n" : : : ;
2525             }
2526         }
2527         else
2528             static assert(false);
2529     }
2530     else static if (LDC_with_SSE1)
2531     {
2532         __builtin_ia32_sfence();
2533     }
2534     else static if (DMD_with_asm)
2535     {
2536         asm nothrow @nogc pure @safe
2537         {
2538             sfence;
2539         }
2540     }
2541     else version(LDC)
2542     {
2543         llvm_memory_fence(); // PERF: this generates mfence instead of sfence
2544     }
2545     else
2546         static assert(false);
2547 }
2548 unittest
2549 {
2550     _mm_sfence();
2551 }
2552 
/// Shuffle 16-bit integers in `a` using the control in `imm8`.
/// Warning: the immediate shuffle value `imm8` is given at compile-time instead of runtime.
2554 __m64 _mm_shuffle_pi16(int imm8)(__m64 a) pure @safe
2555 {
2556     return cast(__m64) shufflevector!(short4, ( (imm8 >> 0) & 3 ),
2557                                               ( (imm8 >> 2) & 3 ),
2558                                               ( (imm8 >> 4) & 3 ),
2559                                               ( (imm8 >> 6) & 3 ))(cast(short4)a, cast(short4)a);
2560 }
2561 unittest
2562 {
2563     __m64 A = _mm_setr_pi16(0, 1, 2, 3);
2564     enum int SHUFFLE = _MM_SHUFFLE(0, 1, 2, 3);
2565     short4 B = cast(short4) _mm_shuffle_pi16!SHUFFLE(A);
2566     short[4] expectedB = [ 3, 2, 1, 0 ];
2567     assert(B.array == expectedB);
2568 }
2569 
/// Shuffle single-precision (32-bit) floating-point elements in `a` and `b` using the control in `imm`.
/// Warning: the immediate shuffle value `imm` is given at compile-time instead of runtime.
2571 __m128 _mm_shuffle_ps(ubyte imm)(__m128 a, __m128 b) pure @safe
2572 {
2573     return shufflevector!(__m128, imm & 3, (imm>>2) & 3, 4 + ((imm>>4) & 3), 4 + ((imm>>6) & 3) )(a, b);
2574 }
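// Selection sketch: with _MM_SHUFFLE(3, 2, 1, 0) the two lower lanes come
// from `a` and the two upper lanes from `b`, i.e. the identity selection.
unittest
{
    __m128 A = _mm_setr_ps(0.0f, 1.0f, 2.0f, 3.0f);
    __m128 B = _mm_setr_ps(4.0f, 5.0f, 6.0f, 7.0f);
    __m128 R = _mm_shuffle_ps!(_MM_SHUFFLE(3, 2, 1, 0))(A, B);
    float[4] correct = [0.0f, 1.0f, 6.0f, 7.0f];
    assert(R.array == correct);
}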
2575 
2576 /// Compute the square root of packed single-precision (32-bit) floating-point elements in `a`.
2577 __m128 _mm_sqrt_ps(__m128 a) @trusted
2578 {
2579     static if (GDC_with_SSE)
2580     {
2581         return __builtin_ia32_sqrtps(a);
2582     }
2583     else version(LDC)
2584     {
        // __builtin_ia32_sqrtps disappeared with LDC 1.11 (front-end 2.081)
2586         static if (__VERSION__ < 2081)
2587             return __builtin_ia32_sqrtps(a);
2588         else
2589         {
2590             a[0] = llvm_sqrt(a[0]);
2591             a[1] = llvm_sqrt(a[1]);
2592             a[2] = llvm_sqrt(a[2]);
2593             a[3] = llvm_sqrt(a[3]);
2594             return a;
2595         }
2596     }
2597     else
2598     {
2599         a.ptr[0] = sqrt(a.array[0]);
2600         a.ptr[1] = sqrt(a.array[1]);
2601         a.ptr[2] = sqrt(a.array[2]);
2602         a.ptr[3] = sqrt(a.array[3]);
2603         return a;
2604     }
2605 }
2606 unittest
2607 {
2608     __m128 A = _mm_sqrt_ps(_mm_set1_ps(4.0f));
2609     assert(A.array[0] == 2.0f);
2610     assert(A.array[1] == 2.0f);
2611     assert(A.array[2] == 2.0f);
2612     assert(A.array[3] == 2.0f);
2613 }
2614 
2615 /// Compute the square root of the lower single-precision (32-bit) floating-point element in `a`, store it in the lower
2616 /// element, and copy the upper 3 packed elements from `a` to the upper elements of result.
2617 __m128 _mm_sqrt_ss(__m128 a) @trusted
2618 {
2619     static if (GDC_with_SSE)
2620     {
2621         return __builtin_ia32_sqrtss(a);
2622     }
2623     else version(LDC)
2624     {
2625         a.ptr[0] = llvm_sqrt(a.array[0]);
2626         return a;
2627     }
2628     else
2629     {   
2630         a.ptr[0] = sqrt(a.array[0]);
2631         return a;
2632     }
2633 }
2634 unittest
2635 {
2636     __m128 A = _mm_sqrt_ss(_mm_set1_ps(4.0f));
2637     assert(A.array[0] == 2.0f);
2638     assert(A.array[1] == 4.0f);
2639     assert(A.array[2] == 4.0f);
2640     assert(A.array[3] == 4.0f);
2641 }
2642 
2643 /// Store 128-bits (composed of 4 packed single-precision (32-bit) floating-point elements) from `a` into memory. 
2644 /// `mem_addr` must be aligned on a 16-byte boundary or a general-protection exception may be generated.
2645 void _mm_store_ps (float* mem_addr, __m128 a) pure
2646 {
2647     __m128* aligned = cast(__m128*)mem_addr;
2648     *aligned = a;
2649 }
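// Basic round-trip check: aligned store, then compare in memory.
unittest
{
    align(16) float[4] A;
    _mm_store_ps(A.ptr, _mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f));
    float[4] correct = [1.0f, 2.0f, 3.0f, 4.0f];
    assert(A == correct);
}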
2650 
2651 deprecated("Use _mm_store1_ps instead") alias _mm_store_ps1 = _mm_store1_ps; ///
2652 
2653 /// Store the lower single-precision (32-bit) floating-point element from `a` into memory. 
2654 /// `mem_addr` does not need to be aligned on any particular boundary.
2655 void _mm_store_ss (float* mem_addr, __m128 a) pure @safe
2656 {
2657     *mem_addr = a.array[0];
2658 }
2659 unittest
2660 {
2661     float a;
2662     _mm_store_ss(&a, _mm_set_ps(3, 2, 1, 546));
2663     assert(a == 546);
2664 }
2665 
2666 /// Store the lower single-precision (32-bit) floating-point element from `a` into 4 contiguous elements in memory. 
2667 /// `mem_addr` must be aligned on a 16-byte boundary or a general-protection exception may be generated.
2668 void _mm_store1_ps(float* mem_addr, __m128 a) pure @trusted // TODO: shouldn't be trusted
2669 {
2670     __m128* aligned = cast(__m128*)mem_addr;
2671     __m128 r;
2672     r.ptr[0] = a.array[0];
2673     r.ptr[1] = a.array[0];
2674     r.ptr[2] = a.array[0];
2675     r.ptr[3] = a.array[0];
2676     *aligned = r;
2677 }
2678 unittest
2679 {
2680     align(16) float[4] A;
2681     _mm_store1_ps(A.ptr, _mm_set_ss(42.0f));
2682     float[4] correct = [42.0f, 42, 42, 42];
2683     assert(A == correct);
2684 }
2685 
2686 /// Store the upper 2 single-precision (32-bit) floating-point elements from `a` into memory.
2687 void _mm_storeh_pi(__m64* p, __m128 a) pure @trusted
2688 {
2689     long2 la = cast(long2)a;
2690     (*p).ptr[0] = la.array[1];
2691 }
2692 unittest
2693 {
2694     __m64 R = _mm_setzero_si64();
2695     long2 A = [13, 25];
2696     _mm_storeh_pi(&R, cast(__m128)A);
2697     assert(R.array[0] == 25);
2698 }
2699 
2700 /// Store the lower 2 single-precision (32-bit) floating-point elements from `a` into memory.
2701 void _mm_storel_pi(__m64* p, __m128 a) pure @trusted
2702 {
2703     long2 la = cast(long2)a;
2704     (*p).ptr[0] = la.array[0];
2705 }
2706 unittest
2707 {
2708     __m64 R = _mm_setzero_si64();
2709     long2 A = [13, 25];
2710     _mm_storel_pi(&R, cast(__m128)A);
2711     assert(R.array[0] == 13);
2712 }
2713 
2714 /// Store 4 single-precision (32-bit) floating-point elements from `a` into memory in reverse order. 
2715 /// `mem_addr` must be aligned on a 16-byte boundary or a general-protection exception may be generated.
2716 void _mm_storer_ps(float* mem_addr, __m128 a) pure @trusted // TODO should not be trusted
2717 {
2718     __m128* aligned = cast(__m128*)mem_addr;
2719     __m128 r;
2720     r.ptr[0] = a.array[3];
2721     r.ptr[1] = a.array[2];
2722     r.ptr[2] = a.array[1];
2723     r.ptr[3] = a.array[0];
2724     *aligned = r;
2725 }
2726 unittest
2727 {
2728     align(16) float[4] A;
2729     _mm_storer_ps(A.ptr, _mm_setr_ps(1.0f, 2, 3, 4));
2730     float[4] correct = [4.0f, 3.0f, 2.0f, 1.0f];
2731     assert(A == correct);
2732 }
2733 
2734 /// Store 128-bits (composed of 4 packed single-precision (32-bit) floating-point elements) from `a` into memory. 
2735 /// `mem_addr` does not need to be aligned on any particular boundary.
void _mm_storeu_ps(float* mem_addr, __m128 a) pure @trusted // TODO should not be trusted
2737 {
2738     storeUnaligned!(float4)(a, mem_addr);
2739 }
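// Quick check: the destination is deliberately misaligned by 4 bytes.
unittest
{
    align(16) float[5] arr;
    _mm_storeu_ps(&arr[1], _mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f));
    float[4] correct = [1.0f, 2.0f, 3.0f, 4.0f];
    assert(arr[1..5] == correct);
}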
2740 
2741 /// Store 64-bits of integer data from `a` into memory using a non-temporal memory hint.
2742 void _mm_stream_pi (__m64* mem_addr, __m64 a)
2743 {
2744     // BUG see `_mm_stream_ps` for an explanation why we don't implement non-temporal moves
2745     *mem_addr = a; // it's a regular move instead
2746 }
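// Behaviour sketch: the observable result is the same as a plain 64-bit store.
unittest
{
    __m64 A = to_m64(_mm_set1_epi32(42));
    __m64 R = _mm_setzero_si64();
    _mm_stream_pi(&R, A);
    assert(R.array[0] == A.array[0]);
}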
2747 
/// Store 128-bits (composed of 4 packed single-precision (32-bit) floating-point elements) from `a` into memory using
/// a non-temporal memory hint. `mem_addr` must be aligned on a 16-byte boundary or a general-protection exception may be
2750 /// generated.
2751 void _mm_stream_ps (float* mem_addr, __m128 a)
2752 {
2753     // BUG: can't implement non-temporal store with LDC inlineIR since !nontemporal
2754     // needs some IR outside this function that would say:
2755     //
2756     //  !0 = !{ i32 1 }
2757     //
2758     // It's a LLVM IR metadata description.
2759     __m128* dest = cast(__m128*)mem_addr;
2760     *dest = a; // it's a regular move instead
2761 }
2762 unittest
2763 {
2764     align(16) float[4] A;
2765     _mm_stream_ps(A.ptr, _mm_set1_ps(78.0f));
2766     assert(A[0] == 78.0f && A[1] == 78.0f && A[2] == 78.0f && A[3] == 78.0f);
2767 }
2768 
2769 /// Subtract packed single-precision (32-bit) floating-point elements in `b` from packed single-precision (32-bit) 
2770 /// floating-point elements in `a`.
2771 __m128 _mm_sub_ps(__m128 a, __m128 b) pure @safe
2772 {
2773     return a - b;
2774 }
2775 unittest
2776 {
2777     __m128 a = [1.5f, -2.0f, 3.0f, 1.0f];
2778     a = _mm_sub_ps(a, a);
2779     float[4] correct = [0.0f, 0.0f, 0.0f, 0.0f];
2780     assert(a.array == correct);
2781 }
2782 
2783 /// Subtract the lower single-precision (32-bit) floating-point element in `b` from the lower single-precision (32-bit)
/// floating-point element in `a`, store the subtraction result in the lower element of result, and copy the upper 3
/// packed elements from `a` to the upper elements of result.
2786 __m128 _mm_sub_ss(__m128 a, __m128 b) pure @safe
2787 {
2788     static if (DMD_with_DSIMD)
2789         return cast(__m128) __simd(XMM.SUBSS, a, b);
2790     else static if (GDC_with_SSE)
2791         return __builtin_ia32_subss(a, b);
2792     else
2793     {
2794         a[0] -= b[0];
2795         return a;
2796     }
2797 }
2798 unittest
2799 {
2800     __m128 a = [1.5f, -2.0f, 3.0f, 1.0f];
2801     a = _mm_sub_ss(a, a);
2802     float[4] correct = [0.0f, -2.0, 3.0f, 1.0f];
2803     assert(a.array == correct);
2804 }
2805 
2806 /// Transpose the 4x4 matrix formed by the 4 rows of single-precision (32-bit) floating-point elements in row0, row1, 
2807 /// row2, and row3, and store the transposed matrix in these vectors (row0 now contains column 0, etc.).
2808 void _MM_TRANSPOSE4_PS (ref __m128 row0, ref __m128 row1, ref __m128 row2, ref __m128 row3) pure @safe
2809 {
2810     __m128 tmp3, tmp2, tmp1, tmp0;
2811     tmp0 = _mm_unpacklo_ps(row0, row1);
2812     tmp2 = _mm_unpacklo_ps(row2, row3);
2813     tmp1 = _mm_unpackhi_ps(row0, row1);
2814     tmp3 = _mm_unpackhi_ps(row2, row3);
2815     row0 = _mm_movelh_ps(tmp0, tmp2);
2816     row1 = _mm_movehl_ps(tmp2, tmp0);
2817     row2 = _mm_movelh_ps(tmp1, tmp3);
2818     row3 = _mm_movehl_ps(tmp3, tmp1);
2819 }
2820 unittest
2821 {
2822     __m128 l0 = _mm_setr_ps(0, 1, 2, 3);
2823     __m128 l1 = _mm_setr_ps(4, 5, 6, 7);
2824     __m128 l2 = _mm_setr_ps(8, 9, 10, 11);
2825     __m128 l3 = _mm_setr_ps(12, 13, 14, 15);
2826     _MM_TRANSPOSE4_PS(l0, l1, l2, l3);
2827     float[4] r0 = [0.0f, 4, 8, 12];
2828     float[4] r1 = [1.0f, 5, 9, 13];
2829     float[4] r2 = [2.0f, 6, 10, 14];
2830     float[4] r3 = [3.0f, 7, 11, 15];
2831     assert(l0.array == r0);
2832     assert(l1.array == r1);
2833     assert(l2.array == r2);
2834     assert(l3.array == r3);
2835 }
2836 
2837 // Note: the only difference between these intrinsics is the signalling
//       behaviour on quiet NaNs. Aliasing them is technically incorrect, but
//       wanting to distinguish qNaN from sNaN and treat them differently on
//       purpose seems extremely rare.
2841 alias _mm_ucomieq_ss = _mm_comieq_ss;
2842 alias _mm_ucomige_ss = _mm_comige_ss;
2843 alias _mm_ucomigt_ss = _mm_comigt_ss;
2844 alias _mm_ucomile_ss = _mm_comile_ss;
2845 alias _mm_ucomilt_ss = _mm_comilt_ss;
2846 alias _mm_ucomineq_ss = _mm_comineq_ss;
2847 
2848 /// Return vector of type `__m128` with undefined elements.
2849 __m128 _mm_undefined_ps() pure @safe
2850 {
2851     __m128 undef = void;
2852     return undef;
2853 }
2854 
/// Unpack and interleave single-precision (32-bit) floating-point elements from the high half of `a` and `b`.
2856 __m128 _mm_unpackhi_ps (__m128 a, __m128 b) pure @trusted
2857 {
2858     version(LDC)
2859     {
        // x86: plain version generates unpckhps with LDC 1.0.0 -O1, but the shufflevector version takes 8 fewer instructions at -O0
2861         return shufflevectorLDC!(__m128, 2, 6, 3, 7)(a, b);
2862     }
2863     else
2864     {
2865         __m128 r;
2866         r.ptr[0] = a.array[2];
2867         r.ptr[1] = b.array[2];
2868         r.ptr[2] = a.array[3];
2869         r.ptr[3] = b.array[3];
2870         return r;
2871     }
2872 }
2873 unittest
2874 {
2875     __m128 A = _mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f);
2876     __m128 B = _mm_setr_ps(5.0f, 6.0f, 7.0f, 8.0f);
2877     __m128 R = _mm_unpackhi_ps(A, B);
2878     float[4] correct = [3.0f, 7.0f, 4.0f, 8.0f];
2879     assert(R.array == correct);
2880 }
2881 
2882 /// Unpack and interleave single-precision (32-bit) floating-point elements from the low half of `a` and `b`.
2883 __m128 _mm_unpacklo_ps (__m128 a, __m128 b) pure @trusted
2884 {
2885     version(LDC)
2886     {
        // x86: plain version generates unpcklps with LDC 1.0.0 -O1, but the shufflevector version takes 8 fewer instructions at -O0
2888         return shufflevectorLDC!(__m128, 0, 4, 1, 5)(a, b);
2889     }
2890     else
2891     {
2892         __m128 r;
2893         r.ptr[0] = a.array[0];
2894         r.ptr[1] = b.array[0];
2895         r.ptr[2] = a.array[1];
2896         r.ptr[3] = b.array[1];
2897         return r;
2898     }
2899 }
2900 unittest
2901 {
2902     __m128 A = _mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f);
2903     __m128 B = _mm_setr_ps(5.0f, 6.0f, 7.0f, 8.0f);
2904     __m128 R = _mm_unpacklo_ps(A, B);
2905     float[4] correct = [1.0f, 5.0f, 2.0f, 6.0f];
2906     assert(R.array == correct);
2907 }
2908 
2909 /// Compute the bitwise XOR of packed single-precision (32-bit) floating-point elements in `a` and `b`.
2910 __m128 _mm_xor_ps (__m128 a, __m128 b) pure @safe
2911 {
2912     return cast(__m128)(cast(__m128i)a ^ cast(__m128i)b);
2913 }
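// Quick check: XOR-ing a value with itself clears every bit.
unittest
{
    __m128i A = _mm_setr_epi32(1, 2, 4, 8);
    int4 R = cast(int4) _mm_xor_ps(cast(__m128) A, cast(__m128) A);
    int[4] correct = [0, 0, 0, 0];
    assert(R.array == correct);
}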
2914 
2915 
2916 private
2917 {
2918     // Returns: `true` if the pointer is suitably aligned.
2919     bool isPointerAligned(void* p, size_t alignment) pure
2920     {
2921         assert(alignment != 0);
2922         return ( cast(size_t)p & (alignment - 1) ) == 0;
2923     }
2924 
2925     // Returns: next pointer aligned with alignment bytes.
2926     void* nextAlignedPointer(void* start, size_t alignment) pure
2927     {
2928         return cast(void*)nextMultipleOf(cast(size_t)(start), alignment);
2929     }
2930 
2931     // Returns number of bytes to actually allocate when asking
2932     // for a particular alignment
2933     @nogc size_t requestedSize(size_t askedSize, size_t alignment) pure
2934     {
2935         enum size_t pointerSize = size_t.sizeof;
2936         return askedSize + alignment - 1 + pointerSize * 3;
2937     }
2938 
2939     // Store pointer given by malloc + size + alignment
2940     @nogc void* storeRawPointerPlusInfo(void* raw, size_t size, size_t alignment) pure
2941     {
2942         enum size_t pointerSize = size_t.sizeof;
2943         char* start = cast(char*)raw + pointerSize * 3;
2944         void* aligned = nextAlignedPointer(start, alignment);
2945         void** rawLocation = cast(void**)(cast(char*)aligned - pointerSize);
2946         *rawLocation = raw;
2947         size_t* sizeLocation = cast(size_t*)(cast(char*)aligned - 2 * pointerSize);
2948         *sizeLocation = size;
2949         size_t* alignmentLocation = cast(size_t*)(cast(char*)aligned - 3 * pointerSize);
2950         *alignmentLocation = alignment;
2951         assert( isPointerAligned(aligned, alignment) );
2952         return aligned;
2953     }
2954 
2955     // Returns: x, multiple of powerOfTwo, so that x >= n.
2956     @nogc size_t nextMultipleOf(size_t n, size_t powerOfTwo) pure nothrow
2957     {
2958         // check power-of-two
2959         assert( (powerOfTwo != 0) && ((powerOfTwo & (powerOfTwo - 1)) == 0));
2960 
2961         size_t mask = ~(powerOfTwo - 1);
2962         return (n + powerOfTwo - 1) & mask;
2963     }
2964 
2965     void* alignedReallocImpl(bool PreserveDataIfResized)(void* aligned, size_t size, size_t alignment)
2966     {
2967         if (aligned is null)
2968             return _mm_malloc(size, alignment);
2969 
2970         assert(alignment != 0);
2971         assert(isPointerAligned(aligned, alignment));
2972 
2973         size_t previousSize = *cast(size_t*)(cast(char*)aligned - size_t.sizeof * 2);
2974         size_t prevAlignment = *cast(size_t*)(cast(char*)aligned - size_t.sizeof * 3);
2975 
2976         // It is illegal to change the alignment across calls.
2977         assert(prevAlignment == alignment);
2978 
2979         void* raw = *cast(void**)(cast(char*)aligned - size_t.sizeof);
2980         size_t request = requestedSize(size, alignment);
2981         size_t previousRequest = requestedSize(previousSize, alignment);
2982         assert(previousRequest - request == previousSize - size);
2983 
        // Heuristic: if the requested size is within 25% to 100% of what is already allocated,
2985         //            then exit with the same pointer
2986         // PERF it seems like `realloc` should do that, not us.
2987         if ( (previousRequest < request * 4) && (request <= previousRequest) )
2988             return aligned;
2989 
2990         void* newRaw = malloc(request);
        if (request > 0 && newRaw == null) // malloc(0) can validly return anything
2992             onOutOfMemoryError();
2993 
2994         void* newAligned = storeRawPointerPlusInfo(newRaw, size, alignment);
2995 
2996         static if (PreserveDataIfResized)
2997         {
2998             size_t minSize = size < previousSize ? size : previousSize;
2999             memcpy(newAligned, aligned, minSize);
3000         }
3001 
3002         // Free previous data
3003         _mm_free(aligned);
3004         assert(isPointerAligned(newAligned, alignment));
3005         return newAligned;
3006     }
3007 }
3008 
3009 unittest
3010 {
3011     assert(nextMultipleOf(0, 4) == 0);
3012     assert(nextMultipleOf(1, 4) == 4);
3013     assert(nextMultipleOf(2, 4) == 4);
3014     assert(nextMultipleOf(3, 4) == 4);
3015     assert(nextMultipleOf(4, 4) == 4);
3016     assert(nextMultipleOf(5, 4) == 8);
3017 
3018     {
3019         void* p = _mm_malloc(23, 16);
3020         assert(p !is null);
3021         assert(((cast(size_t)p) & 0xf) == 0);
3022         _mm_free(p);
3023     }
3024 
3025     void* nullAlloc = _mm_malloc(0, 32);
3026     assert(nullAlloc != null);
3027     _mm_free(nullAlloc);
3028 }
3029 
3030 // For some reason, order of declaration is important for this one
3031 // so it is misplaced.
3032 // Note: is just another name for _mm_cvtss_si32
3033 alias _mm_cvt_ss2si = _mm_cvtss_si32;