1 /**
2 * SSE intrinsics.
3 * https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#techs=SSE
4 * 
5 * Copyright: Copyright Guillaume Piolat 2016-2020.
6 * License:   $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
7 */
8 module inteli.xmmintrin;
9 
10 public import inteli.types;
11 
12 import inteli.internals;
13 
14 import inteli.mmx;
15 import inteli.emmintrin;
16 
17 import core.stdc.stdlib: malloc, free;
18 import core.stdc.string: memcpy;
19 import core.exception: onOutOfMemoryError;
20 
21 version(D_InlineAsm_X86)
22     version = InlineX86Asm;
23 else version(D_InlineAsm_X86_64)
24     version = InlineX86Asm;
25 
26 
27 // SSE1
28 
29 nothrow @nogc:
30 
31 
32 enum int _MM_EXCEPT_INVALID    = 0x0001; /// MXCSR Exception states.
33 enum int _MM_EXCEPT_DENORM     = 0x0002; ///ditto
34 enum int _MM_EXCEPT_DIV_ZERO   = 0x0004; ///ditto
35 enum int _MM_EXCEPT_OVERFLOW   = 0x0008; ///ditto
36 enum int _MM_EXCEPT_UNDERFLOW  = 0x0010; ///ditto
37 enum int _MM_EXCEPT_INEXACT    = 0x0020; ///ditto
38 enum int _MM_EXCEPT_MASK       = 0x003f; /// MXCSR Exception states mask.
39 
40 enum int _MM_MASK_INVALID      = 0x0080; /// MXCSR Exception masks.
41 enum int _MM_MASK_DENORM       = 0x0100; ///ditto
42 enum int _MM_MASK_DIV_ZERO     = 0x0200; ///ditto
43 enum int _MM_MASK_OVERFLOW     = 0x0400; ///ditto
44 enum int _MM_MASK_UNDERFLOW    = 0x0800; ///ditto
45 enum int _MM_MASK_INEXACT      = 0x1000; ///ditto
46 enum int _MM_MASK_MASK         = 0x1f80; /// MXCSR Exception masks mask.
47 
48 enum int _MM_ROUND_NEAREST     = 0x0000; /// MXCSR Rounding mode.
49 enum int _MM_ROUND_DOWN        = 0x2000; ///ditto
50 enum int _MM_ROUND_UP          = 0x4000; ///ditto
51 enum int _MM_ROUND_TOWARD_ZERO = 0x6000; ///ditto
52 enum int _MM_ROUND_MASK        = 0x6000; /// MXCSR Rounding mode mask.
53 
54 enum int _MM_FLUSH_ZERO_MASK   = 0x8000; /// MXCSR Denormal flush to zero mask.
55 enum int _MM_FLUSH_ZERO_ON     = 0x8000; /// MXCSR Denormal flush to zero modes.
56 enum int _MM_FLUSH_ZERO_OFF    = 0x0000; ///ditto
57 
58 /// Add packed single-precision (32-bit) floating-point elements in `a` and `b`.
59 __m128 _mm_add_ps(__m128 a, __m128 b) pure @safe
60 {
61     pragma(inline, true);
62     return a + b;
63 }
64 unittest
65 {
66     __m128 a = [1, 2, 3, 4];
67     a = _mm_add_ps(a, a);
68     assert(a.array[0] == 2);
69     assert(a.array[1] == 4);
70     assert(a.array[2] == 6);
71     assert(a.array[3] == 8);
72 }
73 
74 /// Add the lower single-precision (32-bit) floating-point element 
75 /// in `a` and `b`, store the result in the lower element of result, 
76 /// and copy the upper 3 packed elements from `a` to the upper elements of result.
77 __m128 _mm_add_ss(__m128 a, __m128 b) pure @safe
78 {
79     static if (GDC_with_SSE)
80     {
81         return __builtin_ia32_addss(a, b);
82     }
83     else static if (DMD_with_DSIMD)
84     {
85         return cast(__m128) __simd(XMM.ADDSS, a, b);
86     }
87     else
88     {
89         a[0] += b[0];
90         return a;
91     }
92 }
93 unittest
94 {
95     __m128 a = [1, 2, 3, 4];
96     a = _mm_add_ss(a, a);
97     assert(a.array == [2.0f, 2, 3, 4]);
98 }
99 
100 /// Compute the bitwise AND of packed single-precision (32-bit) floating-point elements in `a` and `b`.
101 __m128 _mm_and_ps (__m128 a, __m128 b) pure @safe
102 {
103     pragma(inline, true);
104     return cast(__m128)(cast(__m128i)a & cast(__m128i)b);
105 }
106 unittest
107 {
108     float a = 4.32f;
109     float b = -78.99f;
110     int correct = (*cast(int*)(&a)) & (*cast(int*)(&b));
111     __m128 A = _mm_set_ps(a, b, a, b);
112     __m128 B = _mm_set_ps(b, a, b, a);
113     int4 R = cast(int4)( _mm_and_ps(A, B) );
114     assert(R.array[0] == correct);
115     assert(R.array[1] == correct);
116     assert(R.array[2] == correct);
117     assert(R.array[3] == correct);
118 }
119 
120 /// Compute the bitwise NOT of packed single-precision (32-bit) floating-point elements in `a` and then AND with `b`.
121 __m128 _mm_andnot_ps (__m128 a, __m128 b) pure @safe
122 {
123     static if (DMD_with_DSIMD)
124         return cast(__m128) __simd(XMM.ANDNPS, a, b);
125     else
126         return cast(__m128)( (~cast(__m128i)a) & cast(__m128i)b );
127 }
128 unittest
129 {
130     float a = 4.32f;
131     float b = -78.99f;
132     int correct  = ~(*cast(int*)(&a)) &  (*cast(int*)(&b));
133     int correct2 =  (*cast(int*)(&a)) & ~(*cast(int*)(&b));
134     __m128 A = _mm_set_ps(a, b, a, b);
135     __m128 B = _mm_set_ps(b, a, b, a);
136     int4 R = cast(int4)( _mm_andnot_ps(A, B) );
137     assert(R.array[0] == correct2);
138     assert(R.array[1] == correct);
139     assert(R.array[2] == correct2);
140     assert(R.array[3] == correct);
141 }
142 
/// Average packed unsigned 16-bit integers in `a` and `b`.
144 __m64 _mm_avg_pu16 (__m64 a, __m64 b) pure @safe
145 {
146     return to_m64(_mm_avg_epu16(to_m128i(a), to_m128i(b)));
147 }
148 
/// Average packed unsigned 8-bit integers in `a` and `b`.
150 __m64 _mm_avg_pu8 (__m64 a, __m64 b) pure @safe
151 {
152     return to_m64(_mm_avg_epu8(to_m128i(a), to_m128i(b)));
153 }
154 
155 /// Compare packed single-precision (32-bit) floating-point elements in `a` and `b` for equality.
156 __m128 _mm_cmpeq_ps (__m128 a, __m128 b) pure @safe
157 {
158     static if (DMD_with_DSIMD)
159         return cast(__m128) __simd(XMM.CMPPS, a, b, 0);
160     else
161         return cast(__m128) cmpps!(FPComparison.oeq)(a, b);
162 }
163 unittest
164 {
165     __m128 A = _mm_setr_ps(1.0f, 2.0f, 3.0f, float.nan);
166     __m128 B = _mm_setr_ps(3.0f, 2.0f, float.nan, float.nan);
167     __m128i R = cast(__m128i) _mm_cmpeq_ps(A, B);
168     int[4] correct = [0, -1, 0, 0];
169     assert(R.array == correct);
170 }
171 
172 /// Compare the lower single-precision (32-bit) floating-point elements in `a` and `b` for equality, 
173 /// and copy the upper 3 packed elements from `a` to the upper elements of result.
174 __m128 _mm_cmpeq_ss (__m128 a, __m128 b) pure @safe
175 {
176     static if (DMD_with_DSIMD)
177         return cast(__m128) __simd(XMM.CMPSS, a, b, 0);
178     else
179         return cast(__m128) cmpss!(FPComparison.oeq)(a, b);
180 }
181 unittest
182 {
183     __m128 A = _mm_setr_ps(3.0f, 0, 0, 0);
184     __m128 B = _mm_setr_ps(3.0f, float.nan, float.nan, float.nan);
185     __m128 C = _mm_setr_ps(2.0f, float.nan, float.nan, float.nan);
186     __m128 D = _mm_setr_ps(float.nan, float.nan, float.nan, float.nan);
187     __m128 E = _mm_setr_ps(4.0f, float.nan, float.nan, float.nan);
188     __m128i R1 = cast(__m128i) _mm_cmpeq_ss(A, B);
189     __m128i R2 = cast(__m128i) _mm_cmpeq_ss(A, C);
190     __m128i R3 = cast(__m128i) _mm_cmpeq_ss(A, D);
191     __m128i R4 = cast(__m128i) _mm_cmpeq_ss(A, E);
192     int[4] correct1 = [-1, 0, 0, 0];
193     int[4] correct2 = [0, 0, 0, 0];
194     int[4] correct3 = [0, 0, 0, 0];
195     int[4] correct4 = [0, 0, 0, 0];
196     assert(R1.array == correct1 && R2.array == correct2 && R3.array == correct3 && R4.array == correct4);
197 }
198 
199 /// Compare packed single-precision (32-bit) floating-point elements in `a` and `b` for greater-than-or-equal.
200 __m128 _mm_cmpge_ps (__m128 a, __m128 b) pure @safe
201 {
202     static if (DMD_with_DSIMD)
203         return cast(__m128) __simd(XMM.CMPPS, b, a, 2);
204     else
205         return cast(__m128) cmpps!(FPComparison.oge)(a, b);
206 }
207 unittest
208 {
209     __m128 A = _mm_setr_ps(1.0f, 2.0f, 3.0f, float.nan);
210     __m128 B = _mm_setr_ps(3.0f, 2.0f, 1.0f, float.nan);
211     __m128i R = cast(__m128i) _mm_cmpge_ps(A, B);
212     int[4] correct = [0, -1,-1, 0];
213     assert(R.array == correct);
214 }
215 
216 /// Compare the lower single-precision (32-bit) floating-point elements in `a` and `b` for greater-than-or-equal, 
217 /// and copy the upper 3 packed elements from `a` to the upper elements of result.
218 __m128 _mm_cmpge_ss (__m128 a, __m128 b) pure @safe
219 {
220     static if (DMD_with_DSIMD)
221     {
222         __m128 c = cast(__m128) __simd(XMM.CMPSS, b, a, 2);
223         a[0] = c[0];
224         return a;
225     }
226     else
227         return cast(__m128) cmpss!(FPComparison.oge)(a, b);
228 }
229 unittest
230 {
231     __m128 A = _mm_setr_ps(3.0f, 0, 0, 0);
232     __m128 B = _mm_setr_ps(3.0f, float.nan, float.nan, float.nan);
233     __m128 C = _mm_setr_ps(2.0f, float.nan, float.nan, float.nan);
234     __m128 D = _mm_setr_ps(float.nan, float.nan, float.nan, float.nan);
235     __m128 E = _mm_setr_ps(4.0f, float.nan, float.nan, float.nan);
236     __m128i R1 = cast(__m128i) _mm_cmpge_ss(A, B);
237     __m128i R2 = cast(__m128i) _mm_cmpge_ss(A, C);
238     __m128i R3 = cast(__m128i) _mm_cmpge_ss(A, D);
239     __m128i R4 = cast(__m128i) _mm_cmpge_ss(A, E);
240     int[4] correct1 = [-1, 0, 0, 0];
241     int[4] correct2 = [-1, 0, 0, 0];
242     int[4] correct3 = [0, 0, 0, 0];
243     int[4] correct4 = [0, 0, 0, 0];
244     assert(R1.array == correct1 && R2.array == correct2 && R3.array == correct3 && R4.array == correct4);
245 }
246 
247 /// Compare packed single-precision (32-bit) floating-point elements in `a` and `b` for greater-than.
248 __m128 _mm_cmpgt_ps (__m128 a, __m128 b) pure @safe
249 {
250     static if (DMD_with_DSIMD)
251         return cast(__m128) __simd(XMM.CMPPS, b, a, 1);
252     else
253         return cast(__m128) cmpps!(FPComparison.ogt)(a, b);
254 }
255 unittest
256 {
257     __m128 A = _mm_setr_ps(1.0f, 2.0f, 3.0f, float.nan);
258     __m128 B = _mm_setr_ps(3.0f, 2.0f, 1.0f, float.nan);
259     __m128i R = cast(__m128i) _mm_cmpgt_ps(A, B);
260     int[4] correct = [0, 0,-1, 0];
261     assert(R.array == correct);
262 }
263 
264 /// Compare the lower single-precision (32-bit) floating-point elements in `a` and `b` for greater-than, 
265 /// and copy the upper 3 packed elements from `a` to the upper elements of result.
266 __m128 _mm_cmpgt_ss (__m128 a, __m128 b) pure @safe
267 {
268     static if (DMD_with_DSIMD)
269     {
270         __m128 c = cast(__m128) __simd(XMM.CMPSS, b, a, 1);
271         a[0] = c[0];
272         return a;
273     }
274     else
275         return cast(__m128) cmpss!(FPComparison.ogt)(a, b);
276 }
277 unittest
278 {
279     __m128 A = _mm_setr_ps(3.0f, 0, 0, 0);
280     __m128 B = _mm_setr_ps(3.0f, float.nan, float.nan, float.nan);
281     __m128 C = _mm_setr_ps(2.0f, float.nan, float.nan, float.nan);
282     __m128 D = _mm_setr_ps(float.nan, float.nan, float.nan, float.nan);
283     __m128 E = _mm_setr_ps(4.0f, float.nan, float.nan, float.nan);
284     __m128i R1 = cast(__m128i) _mm_cmpgt_ss(A, B);
285     __m128i R2 = cast(__m128i) _mm_cmpgt_ss(A, C);
286     __m128i R3 = cast(__m128i) _mm_cmpgt_ss(A, D);
287     __m128i R4 = cast(__m128i) _mm_cmpgt_ss(A, E);
288     int[4] correct1 = [0, 0, 0, 0];
289     int[4] correct2 = [-1, 0, 0, 0];
290     int[4] correct3 = [0, 0, 0, 0];
291     int[4] correct4 = [0, 0, 0, 0];
292     assert(R1.array == correct1 && R2.array == correct2 && R3.array == correct3 && R4.array == correct4);
293 }
294 
295 /// Compare packed single-precision (32-bit) floating-point elements in `a` and `b` for less-than-or-equal.
296 __m128 _mm_cmple_ps (__m128 a, __m128 b) pure @safe
297 {
298     static if (DMD_with_DSIMD)
299         return cast(__m128) __simd(XMM.CMPPS, a, b, 2);
300     else
301         return cast(__m128) cmpps!(FPComparison.ole)(a, b);
302 }
303 unittest
304 {
305     __m128 A = _mm_setr_ps(1.0f, 2.0f, 3.0f, float.nan);
306     __m128 B = _mm_setr_ps(3.0f, 2.0f, 1.0f, float.nan);
307     __m128i R = cast(__m128i) _mm_cmple_ps(A, B);
308     int[4] correct = [-1, -1, 0, 0];
309     assert(R.array == correct);
310 }
311 
312 /// Compare the lower single-precision (32-bit) floating-point elements in `a` and `b` for less-than-or-equal, 
313 /// and copy the upper 3 packed elements from `a` to the upper elements of result.
314 __m128 _mm_cmple_ss (__m128 a, __m128 b) pure @safe
315 {
316     static if (DMD_with_DSIMD)
317         return cast(__m128) __simd(XMM.CMPSS, a, b, 2);
318     else
319         return cast(__m128) cmpss!(FPComparison.ole)(a, b);
320 }
321 unittest
322 {
323     __m128 A = _mm_setr_ps(3.0f, 0, 0, 0);
324     __m128 B = _mm_setr_ps(3.0f, float.nan, float.nan, float.nan);
325     __m128 C = _mm_setr_ps(2.0f, float.nan, float.nan, float.nan);
326     __m128 D = _mm_setr_ps(float.nan, float.nan, float.nan, float.nan);
327     __m128 E = _mm_setr_ps(4.0f, float.nan, float.nan, float.nan);
328     __m128i R1 = cast(__m128i) _mm_cmple_ss(A, B);
329     __m128i R2 = cast(__m128i) _mm_cmple_ss(A, C);
330     __m128i R3 = cast(__m128i) _mm_cmple_ss(A, D);
331     __m128i R4 = cast(__m128i) _mm_cmple_ss(A, E);
332     int[4] correct1 = [-1, 0, 0, 0];
333     int[4] correct2 = [0, 0, 0, 0];
334     int[4] correct3 = [0, 0, 0, 0];
335     int[4] correct4 = [-1, 0, 0, 0];
336     assert(R1.array == correct1 && R2.array == correct2 && R3.array == correct3 && R4.array == correct4);
337 }
338 
339 /// Compare packed single-precision (32-bit) floating-point elements in `a` and `b` for less-than.
340 __m128 _mm_cmplt_ps (__m128 a, __m128 b) pure @safe
341 {
342     static if (DMD_with_DSIMD)
343         return cast(__m128) __simd(XMM.CMPPS, a, b, 1);
344     else
345         return cast(__m128) cmpps!(FPComparison.olt)(a, b);
346 }
347 unittest
348 {
349     __m128 A = _mm_setr_ps(1.0f, 2.0f, 3.0f, float.nan);
350     __m128 B = _mm_setr_ps(3.0f, 2.0f, 1.0f, float.nan);
351     __m128i R = cast(__m128i) _mm_cmplt_ps(A, B);
352     int[4] correct = [-1, 0, 0, 0];
353     assert(R.array == correct);
354 }
355 
356 /// Compare the lower single-precision (32-bit) floating-point elements in `a` and `b` for less-than, 
357 /// and copy the upper 3 packed elements from `a` to the upper elements of result.
358 __m128 _mm_cmplt_ss (__m128 a, __m128 b) pure @safe
359 {
360     static if (DMD_with_DSIMD)
361         return cast(__m128) __simd(XMM.CMPSS, a, b, 1);
362     else
363         return cast(__m128) cmpss!(FPComparison.olt)(a, b);
364 }
365 unittest
366 {
367     __m128 A = _mm_setr_ps(3.0f, 0, 0, 0);
368     __m128 B = _mm_setr_ps(3.0f, float.nan, float.nan, float.nan);
369     __m128 C = _mm_setr_ps(2.0f, float.nan, float.nan, float.nan);
370     __m128 D = _mm_setr_ps(float.nan, float.nan, float.nan, float.nan);
371     __m128 E = _mm_setr_ps(4.0f, float.nan, float.nan, float.nan);
372     __m128i R1 = cast(__m128i) _mm_cmplt_ss(A, B);
373     __m128i R2 = cast(__m128i) _mm_cmplt_ss(A, C);
374     __m128i R3 = cast(__m128i) _mm_cmplt_ss(A, D);
375     __m128i R4 = cast(__m128i) _mm_cmplt_ss(A, E);
376     int[4] correct1 = [0, 0, 0, 0];
377     int[4] correct2 = [0, 0, 0, 0];
378     int[4] correct3 = [0, 0, 0, 0];
379     int[4] correct4 = [-1, 0, 0, 0];
380     assert(R1.array == correct1 && R2.array == correct2 && R3.array == correct3 && R4.array == correct4);
381 }
382 
383 /// Compare packed single-precision (32-bit) floating-point elements in `a` and `b` for not-equal.
384 __m128 _mm_cmpneq_ps (__m128 a, __m128 b) pure @safe
385 {
386     static if (DMD_with_DSIMD)
387         return cast(__m128) __simd(XMM.CMPPS, a, b, 4);
388     else
389         return cast(__m128) cmpps!(FPComparison.une)(a, b);
390 }
391 unittest
392 {
393     __m128 A = _mm_setr_ps(1.0f, 2.0f, 3.0f, float.nan);
394     __m128 B = _mm_setr_ps(3.0f, 2.0f, 1.0f, float.nan);
395     __m128i R = cast(__m128i) _mm_cmpneq_ps(A, B);
396     int[4] correct = [-1, 0, -1, -1];
397     assert(R.array == correct);
398 }
399 
400 /// Compare the lower single-precision (32-bit) floating-point elements in `a` and `b` for not-equal, 
401 /// and copy the upper 3 packed elements from `a` to the upper elements of result.
402 __m128 _mm_cmpneq_ss (__m128 a, __m128 b) pure @safe
403 {
404     static if (DMD_with_DSIMD)
405         return cast(__m128) __simd(XMM.CMPSS, a, b, 4);
406     else
407         return cast(__m128) cmpss!(FPComparison.une)(a, b);
408 }
409 unittest
410 {
411     __m128 A = _mm_setr_ps(3.0f, 0, 0, 0);
412     __m128 B = _mm_setr_ps(3.0f, float.nan, float.nan, float.nan);
413     __m128 C = _mm_setr_ps(2.0f, float.nan, float.nan, float.nan);
414     __m128 D = _mm_setr_ps(float.nan, float.nan, float.nan, float.nan);
415     __m128 E = _mm_setr_ps(4.0f, float.nan, float.nan, float.nan);
416     __m128i R1 = cast(__m128i) _mm_cmpneq_ss(A, B);
417     __m128i R2 = cast(__m128i) _mm_cmpneq_ss(A, C);
418     __m128i R3 = cast(__m128i) _mm_cmpneq_ss(A, D);
419     __m128i R4 = cast(__m128i) _mm_cmpneq_ss(A, E);
420     int[4] correct1 = [0, 0, 0, 0];
421     int[4] correct2 = [-1, 0, 0, 0];
422     int[4] correct3 = [-1, 0, 0, 0];
423     int[4] correct4 = [-1, 0, 0, 0];
424     assert(R1.array == correct1 && R2.array == correct2 && R3.array == correct3 && R4.array == correct4);
425 }
426 
427 /// Compare packed single-precision (32-bit) floating-point elements in `a` and `b` for not-greater-than-or-equal.
428 __m128 _mm_cmpnge_ps (__m128 a, __m128 b) pure @safe
429 {
430     static if (DMD_with_DSIMD)
431         return cast(__m128) __simd(XMM.CMPPS, b, a, 6);
432     else
433         return cast(__m128) cmpps!(FPComparison.ult)(a, b);
434 }
435 unittest
436 {
437     __m128 A = _mm_setr_ps(1.0f, 2.0f, 3.0f, float.nan);
438     __m128 B = _mm_setr_ps(3.0f, 2.0f, 1.0f, float.nan);
439     __m128i R = cast(__m128i) _mm_cmpnge_ps(A, B);
440     int[4] correct = [-1, 0, 0, -1];
441     assert(R.array == correct);
442 }
443 
444 /// Compare the lower single-precision (32-bit) floating-point elements in `a` and `b` for not-greater-than-or-equal, 
445 /// and copy the upper 3 packed elements from `a` to the upper elements of result.
446 __m128 _mm_cmpnge_ss (__m128 a, __m128 b) pure @safe
447 {
448     static if (DMD_with_DSIMD)
449     {
450         __m128 c = cast(__m128) __simd(XMM.CMPSS, b, a, 6);
451         a[0] = c[0];
452         return a;
453     }
454     else
455         return cast(__m128) cmpss!(FPComparison.ult)(a, b);
456 }
457 unittest
458 {
459     __m128 A = _mm_setr_ps(3.0f, 0, 0, 0);
460     __m128 B = _mm_setr_ps(3.0f, float.nan, float.nan, float.nan);
461     __m128 C = _mm_setr_ps(2.0f, float.nan, float.nan, float.nan);
462     __m128 D = _mm_setr_ps(float.nan, float.nan, float.nan, float.nan);
463     __m128 E = _mm_setr_ps(4.0f, float.nan, float.nan, float.nan);
464     __m128i R1 = cast(__m128i) _mm_cmpnge_ss(A, B);
465     __m128i R2 = cast(__m128i) _mm_cmpnge_ss(A, C);
466     __m128i R3 = cast(__m128i) _mm_cmpnge_ss(A, D);
467     __m128i R4 = cast(__m128i) _mm_cmpnge_ss(A, E);
468     int[4] correct1 = [0, 0, 0, 0];
469     int[4] correct2 = [0, 0, 0, 0];
470     int[4] correct3 = [-1, 0, 0, 0];
471     int[4] correct4 = [-1, 0, 0, 0];
472     assert(R1.array == correct1 && R2.array == correct2 && R3.array == correct3 && R4.array == correct4);
473 }
474 
475 /// Compare packed single-precision (32-bit) floating-point elements in `a` and `b` for not-greater-than.
476 __m128 _mm_cmpngt_ps (__m128 a, __m128 b) pure @safe
477 {
478     static if (DMD_with_DSIMD)
479         return cast(__m128) __simd(XMM.CMPPS, b, a, 5);
480     else
481         return cast(__m128) cmpps!(FPComparison.ule)(a, b);
482 }
483 unittest
484 {
485     __m128 A = _mm_setr_ps(1.0f, 2.0f, 3.0f, float.nan);
486     __m128 B = _mm_setr_ps(3.0f, 2.0f, 1.0f, float.nan);
487     __m128i R = cast(__m128i) _mm_cmpngt_ps(A, B);
488     int[4] correct = [-1, -1, 0, -1];
489     assert(R.array == correct);
490 }
491 
492 /// Compare the lower single-precision (32-bit) floating-point elements in `a` and `b` for not-greater-than, 
493 /// and copy the upper 3 packed elements from `a` to the upper elements of result.
494 __m128 _mm_cmpngt_ss (__m128 a, __m128 b) pure @safe
495 {
496     static if (DMD_with_DSIMD)
497     {
498         __m128 c = cast(__m128) __simd(XMM.CMPSS, b, a, 5);
499         a[0] = c[0];
500         return a;
501     }
502     else
503         return cast(__m128) cmpss!(FPComparison.ule)(a, b);
504 }
505 unittest
506 {
507     __m128 A = _mm_setr_ps(3.0f, 0, 0, 0);
508     __m128 B = _mm_setr_ps(3.0f, float.nan, float.nan, float.nan);
509     __m128 C = _mm_setr_ps(2.0f, float.nan, float.nan, float.nan);
510     __m128 D = _mm_setr_ps(float.nan, float.nan, float.nan, float.nan);
511     __m128 E = _mm_setr_ps(4.0f, float.nan, float.nan, float.nan);
512     __m128i R1 = cast(__m128i) _mm_cmpngt_ss(A, B);
513     __m128i R2 = cast(__m128i) _mm_cmpngt_ss(A, C);
514     __m128i R3 = cast(__m128i) _mm_cmpngt_ss(A, D);
515     __m128i R4 = cast(__m128i) _mm_cmpngt_ss(A, E);
516     int[4] correct1 = [-1, 0, 0, 0];
517     int[4] correct2 = [0, 0, 0, 0];
518     int[4] correct3 = [-1, 0, 0, 0];
519     int[4] correct4 = [-1, 0, 0, 0];
520     assert(R1.array == correct1 && R2.array == correct2 && R3.array == correct3 && R4.array == correct4);
521 }
522 
523 /// Compare packed single-precision (32-bit) floating-point elements in `a` and `b` for not-less-than-or-equal.
524 __m128 _mm_cmpnle_ps (__m128 a, __m128 b) pure @safe
525 {
526     static if (DMD_with_DSIMD)
527         return cast(__m128) __simd(XMM.CMPPS, a, b, 6);
528     else
529         return cast(__m128) cmpps!(FPComparison.ugt)(a, b);
530 }
531 unittest
532 {
533     __m128 A = _mm_setr_ps(1.0f, 2.0f, 3.0f, float.nan);
534     __m128 B = _mm_setr_ps(3.0f, 2.0f, 1.0f, float.nan);
535     __m128i R = cast(__m128i) _mm_cmpnle_ps(A, B);
536     int[4] correct = [0, 0, -1, -1];
537     assert(R.array == correct);
538 }
539 
540 
541 /// Compare the lower single-precision (32-bit) floating-point elements in `a` and `b` for not-less-than-or-equal, 
542 /// and copy the upper 3 packed elements from `a` to the upper elements of result.
543 __m128 _mm_cmpnle_ss (__m128 a, __m128 b) pure @safe
544 {
545     static if (DMD_with_DSIMD)
546         return cast(__m128) __simd(XMM.CMPSS, a, b, 6);
547     else
548         return cast(__m128) cmpss!(FPComparison.ugt)(a, b);
549 }
550 unittest
551 {
552     __m128 A = _mm_setr_ps(3.0f, 0, 0, 0);
553     __m128 B = _mm_setr_ps(3.0f, float.nan, float.nan, float.nan);
554     __m128 C = _mm_setr_ps(2.0f, float.nan, float.nan, float.nan);
555     __m128 D = _mm_setr_ps(float.nan, float.nan, float.nan, float.nan);
556     __m128 E = _mm_setr_ps(4.0f, float.nan, float.nan, float.nan);
557     __m128i R1 = cast(__m128i) _mm_cmpnle_ss(A, B);
558     __m128i R2 = cast(__m128i) _mm_cmpnle_ss(A, C);
559     __m128i R3 = cast(__m128i) _mm_cmpnle_ss(A, D);
560     __m128i R4 = cast(__m128i) _mm_cmpnle_ss(A, E);
561     int[4] correct1 = [0, 0, 0, 0];
562     int[4] correct2 = [-1, 0, 0, 0];
563     int[4] correct3 = [-1, 0, 0, 0];
564     int[4] correct4 = [0, 0, 0, 0];
565     assert(R1.array == correct1 && R2.array == correct2 && R3.array == correct3 && R4.array == correct4);
566 }
567 
568 /// Compare packed single-precision (32-bit) floating-point elements in `a` and `b` for not-less-than.
569 __m128 _mm_cmpnlt_ps (__m128 a, __m128 b) pure @safe
570 {
571     static if (DMD_with_DSIMD)
572         return cast(__m128) __simd(XMM.CMPPS, a, b, 5);
573     else
574         return cast(__m128) cmpps!(FPComparison.uge)(a, b);
575 }
576 unittest
577 {
578     __m128 A = _mm_setr_ps(1.0f, 2.0f, 3.0f, float.nan);
579     __m128 B = _mm_setr_ps(3.0f, 2.0f, 1.0f, float.nan);
580     __m128i R = cast(__m128i) _mm_cmpnlt_ps(A, B);
581     int[4] correct = [0, -1, -1, -1];
582     assert(R.array == correct);
583 }
584 
585 /// Compare the lower single-precision (32-bit) floating-point elements in `a` and `b` for not-less-than, 
586 /// and copy the upper 3 packed elements from `a` to the upper elements of result.
587 __m128 _mm_cmpnlt_ss (__m128 a, __m128 b) pure @safe
588 {
589     static if (DMD_with_DSIMD)
590         return cast(__m128) __simd(XMM.CMPSS, a, b, 5);
591     else
592         return cast(__m128) cmpss!(FPComparison.uge)(a, b);
593 }
594 unittest
595 {
596     __m128 A = _mm_setr_ps(3.0f, 0, 0, 0);
597     __m128 B = _mm_setr_ps(3.0f, float.nan, float.nan, float.nan);
598     __m128 C = _mm_setr_ps(2.0f, float.nan, float.nan, float.nan);
599     __m128 D = _mm_setr_ps(float.nan, float.nan, float.nan, float.nan);
600     __m128 E = _mm_setr_ps(4.0f, float.nan, float.nan, float.nan);
601     __m128i R1 = cast(__m128i) _mm_cmpnlt_ss(A, B);
602     __m128i R2 = cast(__m128i) _mm_cmpnlt_ss(A, C);
603     __m128i R3 = cast(__m128i) _mm_cmpnlt_ss(A, D);
604     __m128i R4 = cast(__m128i) _mm_cmpnlt_ss(A, E);
605     int[4] correct1 = [-1, 0, 0, 0];
606     int[4] correct2 = [-1, 0, 0, 0];
607     int[4] correct3 = [-1, 0, 0, 0];
608     int[4] correct4 = [0, 0, 0, 0];
609     assert(R1.array == correct1 && R2.array == correct2 && R3.array == correct3 && R4.array == correct4);
610 }
611 
612 /// Compare packed single-precision (32-bit) floating-point elements in `a` and `b` to see if neither is NaN.
613 __m128 _mm_cmpord_ps (__m128 a, __m128 b) pure @safe
614 {
615     static if (DMD_with_DSIMD)
616         return cast(__m128) __simd(XMM.CMPPS, a, b, 7);
617     else
618         return cast(__m128) cmpps!(FPComparison.ord)(a, b);
619 }
620 unittest
621 {
622     __m128 A = _mm_setr_ps(1.0f, 2.0f, 3.0f, float.nan);
623     __m128 B = _mm_setr_ps(3.0f, 2.0f, 1.0f, float.nan);
624     __m128i R = cast(__m128i) _mm_cmpord_ps(A, B);
625     int[4] correct = [-1, -1, -1, 0];
626     assert(R.array == correct);
627 }
628 
629 /// Compare the lower single-precision (32-bit) floating-point elements in `a` and `b` to see if neither is NaN, 
630 /// and copy the upper 3 packed elements from `a` to the upper elements of result.
631 __m128 _mm_cmpord_ss (__m128 a, __m128 b) pure @safe
632 {
633     static if (DMD_with_DSIMD)
634         return cast(__m128) __simd(XMM.CMPSS, a, b, 7);
635     else
636         return cast(__m128) cmpss!(FPComparison.ord)(a, b);
637 }
638 unittest
639 {
640     __m128 A = _mm_setr_ps(3.0f, 0, 0, 0);
641     __m128 B = _mm_setr_ps(3.0f, float.nan, float.nan, float.nan);
642     __m128 C = _mm_setr_ps(2.0f, float.nan, float.nan, float.nan);
643     __m128 D = _mm_setr_ps(float.nan, float.nan, float.nan, float.nan);
644     __m128 E = _mm_setr_ps(4.0f, float.nan, float.nan, float.nan);
645     __m128i R1 = cast(__m128i) _mm_cmpord_ss(A, B);
646     __m128i R2 = cast(__m128i) _mm_cmpord_ss(A, C);
647     __m128i R3 = cast(__m128i) _mm_cmpord_ss(A, D);
648     __m128i R4 = cast(__m128i) _mm_cmpord_ss(A, E);
649     int[4] correct1 = [-1, 0, 0, 0];
650     int[4] correct2 = [-1, 0, 0, 0];
651     int[4] correct3 = [0, 0, 0, 0];
652     int[4] correct4 = [-1, 0, 0, 0];
653     assert(R1.array == correct1 && R2.array == correct2 && R3.array == correct3 && R4.array == correct4);
654 }
655 
656 /// Compare packed single-precision (32-bit) floating-point elements in `a` and `b` to see if either is NaN.
657 __m128 _mm_cmpunord_ps (__m128 a, __m128 b) pure @safe
658 {
659     static if (DMD_with_DSIMD)
660         return cast(__m128) __simd(XMM.CMPPS, a, b, 3);
661     else
662         return cast(__m128) cmpps!(FPComparison.uno)(a, b);
663 }
664 unittest
665 {
666     __m128 A = _mm_setr_ps(1.0f, 2.0f, 3.0f, float.nan);
667     __m128 B = _mm_setr_ps(3.0f, 2.0f, 1.0f, float.nan);
668     __m128i R = cast(__m128i) _mm_cmpunord_ps(A, B);
669     int[4] correct = [0, 0, 0, -1];
670     assert(R.array == correct);
671 }
672 
/// Compare the lower single-precision (32-bit) floating-point elements in `a` and `b` to see if either is NaN,
674 /// and copy the upper 3 packed elements from `a` to the upper elements of result.
675 __m128 _mm_cmpunord_ss (__m128 a, __m128 b) pure @safe
676 {
677     static if (DMD_with_DSIMD)
678         return cast(__m128) __simd(XMM.CMPSS, a, b, 3);
679     else return cast(__m128) cmpss!(FPComparison.uno)(a, b);
680 }
681 unittest
682 {
683     __m128 A = _mm_setr_ps(3.0f, 0, 0, 0);
684     __m128 B = _mm_setr_ps(3.0f, float.nan, float.nan, float.nan);
685     __m128 C = _mm_setr_ps(2.0f, float.nan, float.nan, float.nan);
686     __m128 D = _mm_setr_ps(float.nan, float.nan, float.nan, float.nan);
687     __m128 E = _mm_setr_ps(4.0f, float.nan, float.nan, float.nan);
688     __m128i R1 = cast(__m128i) _mm_cmpunord_ss(A, B);
689     __m128i R2 = cast(__m128i) _mm_cmpunord_ss(A, C);
690     __m128i R3 = cast(__m128i) _mm_cmpunord_ss(A, D);
691     __m128i R4 = cast(__m128i) _mm_cmpunord_ss(A, E);
692     int[4] correct1 = [0, 0, 0, 0];
693     int[4] correct2 = [0, 0, 0, 0];
694     int[4] correct3 = [-1, 0, 0, 0];
695     int[4] correct4 = [0, 0, 0, 0];
696     assert(R1.array == correct1 && R2.array == correct2 && R3.array == correct3 && R4.array == correct4);
697 }
698 
699 
700 /// Compare the lower single-precision (32-bit) floating-point element in `a` and `b` for equality, 
701 /// and return the boolean result (0 or 1).
702 int _mm_comieq_ss (__m128 a, __m128 b) pure @safe
703 {
704     return a.array[0] == b.array[0];
705 }
706 unittest
707 {
708     assert(1 == _mm_comieq_ss(_mm_set_ss(78.0f), _mm_set_ss(78.0f)));
709     assert(0 == _mm_comieq_ss(_mm_set_ss(78.0f), _mm_set_ss(-78.0f)));
710     assert(0 == _mm_comieq_ss(_mm_set_ss(78.0f), _mm_set_ss(float.nan)));
711     assert(0 == _mm_comieq_ss(_mm_set_ss(float.nan), _mm_set_ss(-4.22f)));
712     assert(1 == _mm_comieq_ss(_mm_set_ss(0.0), _mm_set_ss(-0.0)));
713 }
714 
715 /// Compare the lower single-precision (32-bit) floating-point element in `a` and `b` for greater-than-or-equal, 
716 /// and return the boolean result (0 or 1).
717 int _mm_comige_ss (__m128 a, __m128 b) pure @safe
718 {
719     return a.array[0] >= b.array[0];
720 }
721 unittest
722 {
723     assert(1 == _mm_comige_ss(_mm_set_ss(78.0f), _mm_set_ss(78.0f)));
724     assert(1 == _mm_comige_ss(_mm_set_ss(78.0f), _mm_set_ss(-78.0f)));
725     assert(0 == _mm_comige_ss(_mm_set_ss(-78.0f), _mm_set_ss(78.0f)));
726     assert(0 == _mm_comige_ss(_mm_set_ss(78.0f), _mm_set_ss(float.nan)));
727     assert(0 == _mm_comige_ss(_mm_set_ss(float.nan), _mm_set_ss(-4.22f)));
728     assert(1 == _mm_comige_ss(_mm_set_ss(-0.0f), _mm_set_ss(0.0f)));
729 }
730 
731 /// Compare the lower single-precision (32-bit) floating-point element in `a` and `b` for greater-than, 
732 /// and return the boolean result (0 or 1).
733 int _mm_comigt_ss (__m128 a, __m128 b) pure @safe // comiss + seta
734 {
735     return a.array[0] > b.array[0];
736 }
737 unittest
738 {
739     assert(0 == _mm_comigt_ss(_mm_set_ss(78.0f), _mm_set_ss(78.0f)));
740     assert(1 == _mm_comigt_ss(_mm_set_ss(78.0f), _mm_set_ss(-78.0f)));
741     assert(0 == _mm_comigt_ss(_mm_set_ss(78.0f), _mm_set_ss(float.nan)));
742     assert(0 == _mm_comigt_ss(_mm_set_ss(float.nan), _mm_set_ss(-4.22f)));
743     assert(0 == _mm_comigt_ss(_mm_set_ss(0.0f), _mm_set_ss(-0.0f)));
744 }
745 
746 /// Compare the lower single-precision (32-bit) floating-point element in `a` and `b` for less-than-or-equal, 
747 /// and return the boolean result (0 or 1).
748 int _mm_comile_ss (__m128 a, __m128 b) pure @safe // comiss + setbe
749 {
750     return a.array[0] <= b.array[0];
751 }
752 unittest
753 {
754     assert(1 == _mm_comile_ss(_mm_set_ss(78.0f), _mm_set_ss(78.0f)));
755     assert(0 == _mm_comile_ss(_mm_set_ss(78.0f), _mm_set_ss(-78.0f)));
756     assert(1 == _mm_comile_ss(_mm_set_ss(-78.0f), _mm_set_ss(78.0f)));
757     assert(0 == _mm_comile_ss(_mm_set_ss(78.0f), _mm_set_ss(float.nan)));
758     assert(0 == _mm_comile_ss(_mm_set_ss(float.nan), _mm_set_ss(-4.22f)));
759     assert(1 == _mm_comile_ss(_mm_set_ss(0.0f), _mm_set_ss(-0.0f)));
760 }
761 
762 /// Compare the lower single-precision (32-bit) floating-point element in `a` and `b` for less-than, 
763 /// and return the boolean result (0 or 1).
764 int _mm_comilt_ss (__m128 a, __m128 b) pure @safe // comiss + setb
765 {
766     return a.array[0] < b.array[0];
767 }
768 unittest
769 {
770     assert(0 == _mm_comilt_ss(_mm_set_ss(78.0f), _mm_set_ss(78.0f)));
771     assert(0 == _mm_comilt_ss(_mm_set_ss(78.0f), _mm_set_ss(-78.0f)));
772     assert(1 == _mm_comilt_ss(_mm_set_ss(-78.0f), _mm_set_ss(78.0f)));
773     assert(0 == _mm_comilt_ss(_mm_set_ss(78.0f), _mm_set_ss(float.nan)));
774     assert(0 == _mm_comilt_ss(_mm_set_ss(float.nan), _mm_set_ss(-4.22f)));
775     assert(0 == _mm_comilt_ss(_mm_set_ss(-0.0f), _mm_set_ss(0.0f)));
776 }
777 
778 /// Compare the lower single-precision (32-bit) floating-point element in `a` and `b` for not-equal, 
779 /// and return the boolean result (0 or 1).
780 int _mm_comineq_ss (__m128 a, __m128 b) pure @safe // comiss + setne
781 {
782     return a.array[0] != b.array[0];
783 }
784 unittest
785 {
786     assert(0 == _mm_comineq_ss(_mm_set_ss(78.0f), _mm_set_ss(78.0f)));
787     assert(1 == _mm_comineq_ss(_mm_set_ss(78.0f), _mm_set_ss(-78.0f)));
788     assert(1 == _mm_comineq_ss(_mm_set_ss(78.0f), _mm_set_ss(float.nan)));
789     assert(1 == _mm_comineq_ss(_mm_set_ss(float.nan), _mm_set_ss(-4.22f)));
790     assert(0 == _mm_comineq_ss(_mm_set_ss(0.0f), _mm_set_ss(-0.0f)));
791 }
792 
793 /// Convert packed signed 32-bit integers in `b` to packed single-precision (32-bit) 
794 /// floating-point elements, store the results in the lower 2 elements, 
795 /// and copy the upper 2 packed elements from `a` to the upper elements of result.
796 alias _mm_cvt_pi2ps = _mm_cvtpi32_ps;
797 
/// Convert the lower 2 packed single-precision (32-bit) floating-point elements in `a` 
799 /// to packed 32-bit integers.
800 __m64 _mm_cvt_ps2pi (__m128 a) @safe
801 {
802     return to_m64(_mm_cvtps_epi32(a));
803 }
804 
/// Convert the signed 32-bit integer `x` to a single-precision (32-bit) floating-point element, 
/// store the result in the lower element, and copy the upper 3 packed elements from `v` to the 
807 /// upper elements of the result.
808 __m128 _mm_cvt_si2ss (__m128 v, int x) pure @trusted
809 {
810     v.ptr[0] = cast(float)x;
811     return v;
812 }
813 unittest
814 {
815     __m128 a = _mm_cvt_si2ss(_mm_set1_ps(0.0f), 42);
816     assert(a.array == [42f, 0, 0, 0]);
817 }
818 
819 /// Convert packed 16-bit integers in `a` to packed single-precision (32-bit) floating-point elements.
820 __m128 _mm_cvtpi16_ps (__m64 a) pure @safe
821 {
822     __m128i ma = to_m128i(a);
823     ma = _mm_unpacklo_epi16(ma, _mm_setzero_si128()); // Zero-extend to 32-bit
824     ma = _mm_srai_epi32(_mm_slli_epi32(ma, 16), 16); // Replicate sign bit
825     return _mm_cvtepi32_ps(ma);
826 }
827 unittest
828 {
829     __m64 A = _mm_setr_pi16(-1, 2, -3, 4);
830     __m128 R = _mm_cvtpi16_ps(A);
831     float[4] correct = [-1.0f, 2.0f, -3.0f, 4.0f];
832     assert(R.array == correct);
833 }
834 
835 /// Convert packed signed 32-bit integers in `b` to packed single-precision (32-bit) 
836 /// floating-point elements, store the results in the lower 2 elements, 
837 /// and copy the upper 2 packed elements from `a` to the upper elements of result.
838 __m128 _mm_cvtpi32_ps (__m128 a, __m64 b) pure @trusted
839 {
840     __m128 fb = _mm_cvtepi32_ps(to_m128i(b));
841     a.ptr[0] = fb.array[0];
842     a.ptr[1] = fb.array[1];
843     return a;
844 }
845 unittest
846 {
847     __m128 R = _mm_cvtpi32_ps(_mm_set1_ps(4.0f), _mm_setr_pi32(1, 2));
848     float[4] correct = [1.0f, 2.0f, 4.0f, 4.0f];
849     assert(R.array == correct);
850 }
851 
852 /// Convert packed signed 32-bit integers in `a` to packed single-precision (32-bit) floating-point elements, 
/// store the results in the lower 2 elements, then convert the packed signed 32-bit integers in `b` to 
/// packed single-precision (32-bit) floating-point elements, and store the results in the upper 2 elements.
855 __m128 _mm_cvtpi32x2_ps (__m64 a, __m64 b) pure @trusted
856 {
857     long2 l;
858     l.ptr[0] = a.array[0];
859     l.ptr[1] = b.array[0];
860     return _mm_cvtepi32_ps(cast(__m128i)l);
861 }
862 unittest
863 {
864     __m64 A = _mm_setr_pi32(-45, 128);
865     __m64 B = _mm_setr_pi32(0, 1000);
866     __m128 R = _mm_cvtpi32x2_ps(A, B);
867     float[4] correct = [-45.0f, 128.0f, 0.0f, 1000.0f];
868     assert(R.array == correct);
869 }
870 
871 /// Convert the lower packed 8-bit integers in `a` to packed single-precision (32-bit) floating-point elements.
872 __m128 _mm_cvtpi8_ps (__m64 a) pure @safe
873 {
874     __m128i b = to_m128i(a); 
875 
876     // Zero extend to 32-bit
877     b = _mm_unpacklo_epi8(b, _mm_setzero_si128());
878     b = _mm_unpacklo_epi16(b, _mm_setzero_si128());
879 
880     // Replicate sign bit
    b = _mm_srai_epi32(_mm_slli_epi32(b, 24), 24);
882     return _mm_cvtepi32_ps(b);
883 }
884 unittest
885 {
886     __m64 A = _mm_setr_pi8(-1, 2, -3, 4, 0, 0, 0, 0);
887     __m128 R = _mm_cvtpi8_ps(A);
888     float[4] correct = [-1.0f, 2.0f, -3.0f, 4.0f];
889     assert(R.array == correct);
890 }
891 
892 /// Convert packed single-precision (32-bit) floating-point elements in `a` to packed 16-bit integers.
893 /// Note: this intrinsic will generate 0x7FFF, rather than 0x8000, for input values between 0x7FFF and 0x7FFFFFFF.
894 __m64 _mm_cvtps_pi16 (__m128 a) @safe
895 {
    // The C++ version of this intrinsic converts to 32-bit integers, then uses packssdw,
    // which means the 16-bit integers are saturated.
898     __m128i b = _mm_cvtps_epi32(a);
899     b = _mm_packs_epi32(b, b);
900     return to_m64(b);
901 }
902 unittest
903 {
904     __m128 A = _mm_setr_ps(-1.0f, 2.0f, -33000.0f, 70000.0f);
905     short4 R = cast(short4) _mm_cvtps_pi16(A);
906     short[4] correct = [-1, 2, -32768, 32767];
907     assert(R.array == correct);
908 }
909 
910 /// Convert packed single-precision (32-bit) floating-point elements in `a` to packed 32-bit integers.
911 __m64 _mm_cvtps_pi32 (__m128 a) @safe
912 {
913     return to_m64(_mm_cvtps_epi32(a));
914 }
915 unittest
916 {
    __m128 A = _mm_setr_ps(-33000.0f, 70000.0f, -1.0f, 2.0f);
918     int2 R = cast(int2) _mm_cvtps_pi32(A);
919     int[2] correct = [-33000, 70000];
920     assert(R.array == correct);
921 }
922 
923 /// Convert packed single-precision (32-bit) floating-point elements in `a` to packed 8-bit integers, 
924 /// and store the results in lower 4 elements. 
925 /// Note: this intrinsic will generate 0x7F, rather than 0x80, for input values between 0x7F and 0x7FFFFFFF.
926 __m64 _mm_cvtps_pi8 (__m128 a) @safe
927 {
    // The C++ version of this intrinsic converts to 32-bit integers, then uses packssdw + packsswb,
    // which means the 8-bit integers are saturated.
930     __m128i b = _mm_cvtps_epi32(a);
931     b = _mm_packs_epi32(b, _mm_setzero_si128());
932     b = _mm_packs_epi16(b, _mm_setzero_si128());
933     return to_m64(b);
934 }
935 unittest
936 {
937     __m128 A = _mm_setr_ps(-1.0f, 2.0f, -129.0f, 128.0f);
938     byte8 R = cast(byte8) _mm_cvtps_pi8(A);
939     byte[8] correct = [-1, 2, -128, 127, 0, 0, 0, 0];
940     assert(R.array == correct);
941 }
942 
943 /// Convert packed unsigned 16-bit integers in `a` to packed single-precision (32-bit) floating-point elements.
944 __m128 _mm_cvtpu16_ps (__m64 a) pure @safe
945 {
946     __m128i ma = to_m128i(a);
947     ma = _mm_unpacklo_epi16(ma, _mm_setzero_si128()); // Zero-extend to 32-bit
948     return _mm_cvtepi32_ps(ma);
949 }
950 unittest
951 {
952     __m64 A = _mm_setr_pi16(-1, 2, -3, 4);
953     __m128 R = _mm_cvtpu16_ps(A);
954     float[4] correct = [65535.0f, 2.0f, 65533.0f, 4.0f];
955     assert(R.array == correct);
956 }
957 
/// Convert the lower packed unsigned 8-bit integers in `a` to packed single-precision (32-bit) floating-point elements.
959 __m128 _mm_cvtpu8_ps (__m64 a) pure @safe
960 {
961     __m128i b = to_m128i(a); 
962 
963     // Zero extend to 32-bit
964     b = _mm_unpacklo_epi8(b, _mm_setzero_si128());
965     b = _mm_unpacklo_epi16(b, _mm_setzero_si128());
966     return _mm_cvtepi32_ps(b);
967 }
968 unittest
969 {
970     __m64 A = _mm_setr_pi8(-1, 2, -3, 4, 0, 0, 0, 0);
971     __m128 R = _mm_cvtpu8_ps(A);
972     float[4] correct = [255.0f, 2.0f, 253.0f, 4.0f];
973     assert(R.array == correct);
974 }
975 
/// Convert the signed 32-bit integer `x` to a single-precision (32-bit) floating-point element, 
/// store the result in the lower element, and copy the upper 3 packed elements from `v` to the 
978 /// upper elements of result.
979 __m128 _mm_cvtsi32_ss(__m128 v, int x) pure @trusted
980 {
981     v.ptr[0] = cast(float)x;
982     return v;
983 }
984 unittest
985 {
986     __m128 a = _mm_cvtsi32_ss(_mm_set1_ps(0.0f), 42);
987     assert(a.array == [42.0f, 0, 0, 0]);
988 }
989 
990 
/// Convert the signed 64-bit integer `x` to a single-precision (32-bit) floating-point element, 
/// store the result in the lower element, and copy the upper 3 packed elements from `v` to the 
993 /// upper elements of result.
994 __m128 _mm_cvtsi64_ss(__m128 v, long x) pure @trusted
995 {
996     v.ptr[0] = cast(float)x;
997     return v;
998 }
999 unittest
1000 {
1001     __m128 a = _mm_cvtsi64_ss(_mm_set1_ps(0.0f), 42);
1002     assert(a.array == [42.0f, 0, 0, 0]);
1003 }
1004 
1005 /// Take the lower single-precision (32-bit) floating-point element of `a`.
1006 float _mm_cvtss_f32(__m128 a) pure @safe
1007 {
1008     return a.array[0];
1009 }
1010 
1011 /// Convert the lower single-precision (32-bit) floating-point element in `a` to a 32-bit integer.
1012 int _mm_cvtss_si32 (__m128 a) @safe // PERF GDC
1013 {
1014     static if (GDC_with_SSE)
1015     {
1016         return __builtin_ia32_cvtss2si(a);
1017     }
1018     else static if (LDC_with_SSE)
1019     {
1020         return __builtin_ia32_cvtss2si(a);
1021     }
1022     else static if (DMD_with_DSIMD)
1023     {
1025         __m128i r = cast(__m128i) __simd(XMM.CVTPS2DQ, a); // Note: converts 4 integers.
1026         return r.array[0];
1027     }
1028     else
1029     {
1030         return convertFloatToInt32UsingMXCSR(a.array[0]);
1031     }
1032 }
1033 unittest
1034 {
1035     assert(1 == _mm_cvtss_si32(_mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f)));
1036 }
1037 
1038 /// Convert the lower single-precision (32-bit) floating-point element in `a` to a 64-bit integer.
1039 long _mm_cvtss_si64 (__m128 a) @safe
1040 {
1041     version(LDC)
1042     {
1043         version(X86_64)
1044         {
1045             return __builtin_ia32_cvtss2si64(a);
1046         }
1047         else
1048         {
1049             // Note: In 32-bit x86, there is no way to convert from float/double to 64-bit integer
            // using SSE instructions only. So the builtin doesn't exist for this arch.
1051             return convertFloatToInt64UsingMXCSR(a.array[0]);
1052         }
1053     }
1054     else
1055     {
1056         return convertFloatToInt64UsingMXCSR(a.array[0]);
1057     }
1058 }
1059 unittest
1060 {
1061     assert(1 == _mm_cvtss_si64(_mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f)));
1062 
1063     uint savedRounding = _MM_GET_ROUNDING_MODE();
1064 
1065     _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST);
1066     assert(-86186 == _mm_cvtss_si64(_mm_set1_ps(-86186.49f)));
1067 
1068     _MM_SET_ROUNDING_MODE(_MM_ROUND_DOWN);
1069     assert(-86187 == _mm_cvtss_si64(_mm_set1_ps(-86186.1f)));
1070 
1071     _MM_SET_ROUNDING_MODE(_MM_ROUND_UP);
1072     assert(86187 == _mm_cvtss_si64(_mm_set1_ps(86186.1f)));
1073 
1074     _MM_SET_ROUNDING_MODE(_MM_ROUND_TOWARD_ZERO);
1075     assert(-86186 == _mm_cvtss_si64(_mm_set1_ps(-86186.9f)));
1076 
1077     _MM_SET_ROUNDING_MODE(savedRounding);
1078 }
1079 
1080 
1081 /// Convert the lower single-precision (32-bit) floating-point element in `a` to a 32-bit 
1082 /// integer with truncation.
1083 int _mm_cvtt_ss2si (__m128 a) pure @safe
1084 {
1085     // x86: cvttss2si always generated, even in -O0
1086     return cast(int)(a.array[0]);
1087 }
1088 alias _mm_cvttss_si32 = _mm_cvtt_ss2si; ///ditto
1089 unittest
1090 {
1091     assert(1 == _mm_cvtt_ss2si(_mm_setr_ps(1.9f, 2.0f, 3.0f, 4.0f)));
1092 }
1093 
1094 
1095 /// Convert packed single-precision (32-bit) floating-point elements in `a` to packed 32-bit 
1096 /// integers with truncation.
1097 __m64 _mm_cvtt_ps2pi (__m128 a) pure @safe
1098 {
1099     return to_m64(_mm_cvttps_epi32(a));
1100 }
1101 
1102 /// Convert the lower single-precision (32-bit) floating-point element in `a` to a 64-bit 
1103 /// integer with truncation.
1104 long _mm_cvttss_si64 (__m128 a) pure @safe
1105 {
1106     return cast(long)(a.array[0]);
1107 }
1108 unittest
1109 {
1110     assert(1 == _mm_cvttss_si64(_mm_setr_ps(1.9f, 2.0f, 3.0f, 4.0f)));
1111 }
1112 
1113 /// Divide packed single-precision (32-bit) floating-point elements in `a` by packed elements in `b`.
1114 __m128 _mm_div_ps(__m128 a, __m128 b) pure @safe
1115 {
1116     pragma(inline, true);
1117     return a / b;
1118 }
1119 unittest
1120 {
1121     __m128 a = [1.5f, -2.0f, 3.0f, 1.0f];
1122     a = _mm_div_ps(a, a);
1123     float[4] correct = [1.0f, 1.0f, 1.0f, 1.0f];
1124     assert(a.array == correct);
1125 }
1126 
1127 /// Divide the lower single-precision (32-bit) floating-point element in `a` by the lower 
1128 /// single-precision (32-bit) floating-point element in `b`, store the result in the lower 
1129 /// element of result, and copy the upper 3 packed elements from `a` to the upper elements of result.
1130 __m128 _mm_div_ss(__m128 a, __m128 b) pure @safe
1131 {
1132     static if (DMD_with_DSIMD)
1133         return cast(__m128) __simd(XMM.DIVSS, a, b);
1134     else static if (GDC_with_SSE)
1135         return __builtin_ia32_divss(a, b);
1136     else
1137     {
1138         a[0] /= b[0];
1139         return a;
1140     }
1141 }
1142 unittest
1143 {
1144     __m128 a = [1.5f, -2.0f, 3.0f, 1.0f];
1145     a = _mm_div_ss(a, a);
1146     float[4] correct = [1.0f, -2.0, 3.0f, 1.0f];
1147     assert(a.array == correct);
1148 }
1149 
1150 /// Extract a 16-bit unsigned integer from `a`, selected with `imm8`. Zero-extended.
1151 int _mm_extract_pi16 (__m64 a, int imm8)
1152 {
1153     short4 sa = cast(short4)a;
1154     return cast(ushort)(sa.array[imm8]);
1155 }
1156 unittest
1157 {
1158     __m64 A = _mm_setr_pi16(-1, 6, 0, 4);
1159     assert(_mm_extract_pi16(A, 0) == 65535);
1160     assert(_mm_extract_pi16(A, 1) == 6);
1161     assert(_mm_extract_pi16(A, 2) == 0);
1162     assert(_mm_extract_pi16(A, 3) == 4);
1163 }
1164 
1165 /// Free aligned memory that was allocated with `_mm_malloc`.
1166 void _mm_free(void * mem_addr) @trusted
1167 {
1168     // support for free(NULL)
1169     if (mem_addr is null)
1170         return;
1171 
    // Technically we don't need to store size and alignment in the chunk, but we do in case we
1173     // have to implement _mm_realloc
1174 
1175     size_t pointerSize = (void*).sizeof;
1176     void** rawLocation = cast(void**)(cast(char*)mem_addr - size_t.sizeof);
1177     size_t* alignmentLocation = cast(size_t*)(cast(char*)mem_addr - 3 * pointerSize);
1178     size_t alignment = *alignmentLocation;
1179     assert(alignment != 0);
1180     assert(isPointerAligned(mem_addr, alignment));
1181     free(*rawLocation);
1182 }
1183 
1184 /// Get the exception mask bits from the MXCSR control and status register. 
1185 /// The exception mask may contain any of the following flags: `_MM_MASK_INVALID`, 
1186 /// `_MM_MASK_DIV_ZERO`, `_MM_MASK_DENORM`, `_MM_MASK_OVERFLOW`, `_MM_MASK_UNDERFLOW`, `_MM_MASK_INEXACT`.
/// Note: won't correspond to reality on non-x86, where MXCSR is emulated.
1188 uint _MM_GET_EXCEPTION_MASK() @safe
1189 {
1190     return _mm_getcsr() & _MM_MASK_MASK;
1191 }
1192 
1193 /// Get the exception state bits from the MXCSR control and status register. 
1194 /// The exception state may contain any of the following flags: `_MM_EXCEPT_INVALID`, 
1195 /// `_MM_EXCEPT_DIV_ZERO`, `_MM_EXCEPT_DENORM`, `_MM_EXCEPT_OVERFLOW`, `_MM_EXCEPT_UNDERFLOW`, `_MM_EXCEPT_INEXACT`.
/// Note: won't correspond to reality on non-x86, where MXCSR is emulated. No exceptions are reported there.
1197 uint _MM_GET_EXCEPTION_STATE() @safe
1198 {
1199     return _mm_getcsr() & _MM_EXCEPT_MASK;
1200 }
1201 
1202 /// Get the flush zero bits from the MXCSR control and status register. 
/// The flush-to-zero mode may be either `_MM_FLUSH_ZERO_ON` or `_MM_FLUSH_ZERO_OFF`.
1204 uint _MM_GET_FLUSH_ZERO_MODE() @safe
1205 {
1206     return _mm_getcsr() & _MM_FLUSH_ZERO_MASK;
1207 }
1208 
1209 /// Get the rounding mode bits from the MXCSR control and status register. The rounding mode may 
/// contain any of the following flags: `_MM_ROUND_NEAREST`, `_MM_ROUND_DOWN`, `_MM_ROUND_UP`, `_MM_ROUND_TOWARD_ZERO`.
1211 uint _MM_GET_ROUNDING_MODE() @safe
1212 {
1213     return _mm_getcsr() & _MM_ROUND_MASK;
1214 }
1215 
1216 /// Get the unsigned 32-bit value of the MXCSR control and status register.
/// Note: this is emulated on ARM, because there is no MXCSR register there.
1218 uint _mm_getcsr() @trusted
1219 {
1220     static if (LDC_with_ARM)
1221     {
        // Note: we convert the ARM FPSCR into an x86 SSE control word.
1223         // However, only rounding mode and flush to zero are actually set.
1224         // The returned control word will have all exceptions masked, and no exception detected.
1225 
1226         uint fpscr = arm_get_fpcr();
1227 
1228         uint cw = 0; // No exception detected
1229         if (fpscr & _MM_FLUSH_ZERO_MASK_ARM)
1230         {
            // ARM has a single flush-to-zero flag,
            // which covers both x86 bits.
1233             // https://developer.arm.com/documentation/dui0473/c/neon-and-vfp-programming/the-effects-of-using-flush-to-zero-mode
1234             cw |= _MM_FLUSH_ZERO_ON;
1235             cw |= 0x40; // set "denormals are zeros"
1236         } 
        cw |= _MM_MASK_MASK; // All exceptions masked
1238 
1239         // Rounding mode
1240         switch(fpscr & _MM_ROUND_MASK_ARM)
1241         {
1242             default:
1243             case _MM_ROUND_NEAREST_ARM:     cw |= _MM_ROUND_NEAREST;     break;
1244             case _MM_ROUND_DOWN_ARM:        cw |= _MM_ROUND_DOWN;        break;
1245             case _MM_ROUND_UP_ARM:          cw |= _MM_ROUND_UP;          break;
1246             case _MM_ROUND_TOWARD_ZERO_ARM: cw |= _MM_ROUND_TOWARD_ZERO; break;
1247         }
1248         return cw;
1249     }
1250     else version(GNU)
1251     {
1252         static if (GDC_with_SSE)
1253         {
1254             return __builtin_ia32_stmxcsr();
1255         }
1256         else version(X86)
1257         {
1258             uint sseRounding = 0;
1259             asm pure nothrow @nogc @trusted
1260             {
1261                 "stmxcsr %0;\n" 
1262                   : "=m" (sseRounding)
1263                   : 
1264                   : ;
1265             }
1266             return sseRounding;
1267         }
1268         else
1269             static assert(false);
1270     }
1271     else version (InlineX86Asm)
1272     {
1273         uint controlWord;
1274         asm nothrow @nogc pure @safe
1275         {
1276             stmxcsr controlWord;
1277         }
1278         return controlWord;
1279     }
1280     else
1281         static assert(0, "Not yet supported");
1282 }
1283 unittest
1284 {
1285     uint csr = _mm_getcsr();
1286 }
1287 
/// Insert a 16-bit integer `i` inside `v` at the location specified by `imm8`.
1289 __m64 _mm_insert_pi16 (__m64 v, int i, int imm8) pure @trusted
1290 {
1291     short4 r = cast(short4)v;
1292     r.ptr[imm8 & 3] = cast(short)i;
1293     return cast(__m64)r;
1294 }
1295 unittest
1296 {
1297     __m64 A = _mm_set_pi16(3, 2, 1, 0);
1298     short4 R = cast(short4) _mm_insert_pi16(A, 42, 1 | 4);
1299     short[4] correct = [0, 42, 2, 3];
1300     assert(R.array == correct);
1301 }
1302 
1303 /// Load 128-bits (composed of 4 packed single-precision (32-bit) floating-point elements) from memory.
/// `p` must be aligned on a 16-byte boundary or a general-protection exception may be generated.
1305 __m128 _mm_load_ps(const(float)*p) pure @trusted // FUTURE shouldn't be trusted, see #62
1306 {
1307     pragma(inline, true);
1308     return *cast(__m128*)p;
1309 }
1310 unittest
1311 {
1312     static immutable align(16) float[4] correct = [1.0f, 2.0f, 3.0f, 4.0f];
1313     __m128 A = _mm_load_ps(correct.ptr);
1314     assert(A.array == correct);
1315 }
1316 
1317 /// Load a single-precision (32-bit) floating-point element from memory into all elements.
1318 __m128 _mm_load_ps1(const(float)*p) pure @trusted
1319 {
1320     return __m128(*p);
1321 }
1322 unittest
1323 {
1324     float n = 2.5f;
1325     float[4] correct = [2.5f, 2.5f, 2.5f, 2.5f];
1326     __m128 A = _mm_load_ps1(&n);
1327     assert(A.array == correct);
1328 }
1329 
/// Load a single-precision (32-bit) floating-point element from memory into the lower element of result, and zero the upper 3 
1331 /// elements. `mem_addr` does not need to be aligned on any particular boundary.
1332 __m128 _mm_load_ss (const(float)* mem_addr) pure @trusted
1333 {
1334     pragma(inline, true);
1335     static if (DMD_with_DSIMD)
1336     {
1337         return cast(__m128)__simd(XMM.LODSS, *cast(__m128*)mem_addr);
1338     }
1339     else
1340     {
        __m128 r; // PERF =void;
1342         r.ptr[0] = *mem_addr;
1343         r.ptr[1] = 0;
1344         r.ptr[2] = 0;
1345         r.ptr[3] = 0;
1346         return r;
1347     }
1348 }
1349 unittest
1350 {
1351     float n = 2.5f;
1352     float[4] correct = [2.5f, 0.0f, 0.0f, 0.0f];
1353     __m128 A = _mm_load_ss(&n);
1354     assert(A.array == correct);
1355 }
1356 
1357 /// Load a single-precision (32-bit) floating-point element from memory into all elements.
1358 alias _mm_load1_ps = _mm_load_ps1;
1359 
1360 /// Load 2 single-precision (32-bit) floating-point elements from memory into the upper 2 elements of result, 
/// and copy the lower 2 elements from `a` to result. `mem_addr` does not need to be aligned on any particular boundary.
1362 __m128 _mm_loadh_pi (__m128 a, const(__m64)* mem_addr) pure @trusted
1363 {
1364     pragma(inline, true);
1365     static if (DMD_with_DSIMD)
1366     {
1367         return cast(__m128) __simd(XMM.LODHPS, a, *cast(const(__m128)*)mem_addr); 
1368     }
1369     else
1370     {
1371         // x86: movlhps generated since LDC 1.9.0 -O1
1372         long2 la = cast(long2)a;
1373         la.ptr[1] = (*mem_addr).array[0];
1374         return cast(__m128)la;
1375     }
1376 }
1377 unittest
1378 {
1379     __m128 A = _mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f);
1380     __m128 B = _mm_setr_ps(5.0f, 6.0f, 7.0f, 8.0f);
1381     __m64 M = to_m64(cast(__m128i)B);
1382      __m128 R = _mm_loadh_pi(A, &M);
1383     float[4] correct = [1.0f, 2.0f, 5.0f, 6.0f];
1384     assert(R.array == correct);
1385 }
1386 
1387 /// Load 2 single-precision (32-bit) floating-point elements from memory into the lower 2 elements of result, 
1388 /// and copy the upper 2 elements from `a` to result. `mem_addr` does not need to be aligned on any particular boundary.
1389 __m128 _mm_loadl_pi (__m128 a, const(__m64)* mem_addr) pure @trusted
1390 {
1391     pragma(inline, true);
1392 
1393     // Disabled because of https://issues.dlang.org/show_bug.cgi?id=23046
1394     /*
1395     static if (DMD_with_DSIMD)
1396     {
1397         return cast(__m128) __simd(XMM.LODLPS, a, *cast(const(__m128)*)mem_addr); 
1398     }
1399     else */
1400     {
1401         // x86: movlpd/movlps generated with all LDC -01
1402         long2 la = cast(long2)a;
1403         la.ptr[0] = (*mem_addr).array[0];
1404         return cast(__m128)la;
1405     }
1406 }
1407 unittest
1408 {
1409     __m128 A = _mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f);
1410     __m128 B = _mm_setr_ps(5.0f, 6.0f, 7.0f, 8.0f);
1411     __m64 M = to_m64(cast(__m128i)B);
1412      __m128 R = _mm_loadl_pi(A, &M);
1413     float[4] correct = [5.0f, 6.0f, 3.0f, 4.0f];
1414     assert(R.array == correct);
1415 }
1416 
1417 /// Load 4 single-precision (32-bit) floating-point elements from memory in reverse order. 
1418 /// `mem_addr` must be aligned on a 16-byte boundary or a general-protection exception may be generated.
1419 __m128 _mm_loadr_ps (const(float)* mem_addr) pure @trusted // FUTURE shouldn't be trusted, see #62
1420 {
    __m128* aligned = cast(__m128*)mem_addr; // x86: movaps + shufps since LDC 1.0.0 -O1
1422     __m128 a = *aligned;
1423     static if (DMD_with_DSIMD)
1424     {
1425         return cast(__m128) __simd(XMM.SHUFPS, a, a, 27);
1426     }
1427     else
1428     {
1429         __m128 r; // PERF =void;
1430         r.ptr[0] = a.array[3];
1431         r.ptr[1] = a.array[2];
1432         r.ptr[2] = a.array[1];
1433         r.ptr[3] = a.array[0];
1434         return r;
1435     }
1436 }
1437 unittest
1438 {
1439     align(16) static immutable float[4] arr = [ 1.0f, 2.0f, 3.0f, 8.0f ];
1440     __m128 A = _mm_loadr_ps(arr.ptr);
1441     float[4] correct = [ 8.0f, 3.0f, 2.0f, 1.0f ];
1442     assert(A.array == correct);
1443 }
1444 
1445 /// Load 128-bits (composed of 4 packed single-precision (32-bit) floating-point elements) from memory. 
1446 /// `mem_addr` does not need to be aligned on any particular boundary.
1447 __m128 _mm_loadu_ps(const(float)* mem_addr) pure @trusted
1448 {
1449     pragma(inline, true);
1450     static if (GDC_with_SSE2)
1451     {
1452         return __builtin_ia32_loadups(mem_addr);
1453     }
1454     else version(LDC)
1455     {
1456         return loadUnaligned!(__m128)(mem_addr);
1457     }
1458     else version(DigitalMars)
1459     {
1460         static if (DMD_with_DSIMD)
1461         {
1462             return cast(__m128)__simd(XMM.LODUPS, *cast(const(float4*))mem_addr);
1463         }
1464         else static if (SSESizedVectorsAreEmulated)
1465         {
            // Since this vector is emulated, it doesn't have alignment constraints
1467             // and as such we can just cast it.
1468             return *cast(__m128*)(mem_addr);
1469         }
1470         else
1471         {
1472             __m128 result;
1473             result.ptr[0] = mem_addr[0];
1474             result.ptr[1] = mem_addr[1];
1475             result.ptr[2] = mem_addr[2];
1476             result.ptr[3] = mem_addr[3];
1477             return result;
1478         }
1479     }
1480     else
1481     {
1482         __m128 result;
1483         result.ptr[0] = mem_addr[0];
1484         result.ptr[1] = mem_addr[1];
1485         result.ptr[2] = mem_addr[2];
1486         result.ptr[3] = mem_addr[3];
1487         return result;
1488     }
1489 }
1490 unittest
1491 {
1492     align(16) static immutable float[5] arr = [ 1.0f, 2.0f, 3.0f, 8.0f, 9.0f ];  // force unaligned load
1493     __m128 A = _mm_loadu_ps(&arr[1]);
1494     float[4] correct = [ 2.0f, 3.0f, 8.0f, 9.0f ];
1495     assert(A.array == correct);
1496 }
1497 
/// Load an unaligned 16-bit integer from memory into the first element of result; the other elements are zeroed.
1499 __m128i _mm_loadu_si16(const(void)* mem_addr) pure @trusted
1500 {
1501     static if (DMD_with_DSIMD)
1502     {
1503         int r = *cast(short*)(mem_addr);
1504         return cast(__m128i) __simd(XMM.LODD, *cast(__m128i*)&r);
1505     }
1506     else version(DigitalMars)
1507     {
1508         // Workaround issue: https://issues.dlang.org/show_bug.cgi?id=21672
1509         // DMD cannot handle the below code...
1510         align(16) short[8] r = [0, 0, 0, 0, 0, 0, 0, 0];
1511         r[0] = *cast(short*)(mem_addr);
1512         return *cast(int4*)(r.ptr);
1513     }
1514     else
1515     {
1516         short r = *cast(short*)(mem_addr);
1517         short8 result = [0, 0, 0, 0, 0, 0, 0, 0];
1518         result.ptr[0] = r;
1519         return cast(__m128i)result;
1520     }
1521 }
1522 unittest
1523 {
1524     short r = 13;
1525     short8 A = cast(short8) _mm_loadu_si16(&r);
1526     short[8] correct = [13, 0, 0, 0, 0, 0, 0, 0];
1527     assert(A.array == correct);
1528 }
1529 
/// Load an unaligned 64-bit integer from memory into the first element of result.
/// The upper 64 bits are zeroed.
1532 __m128i _mm_loadu_si64(const(void)* mem_addr) pure @trusted
1533 {
1534     pragma(inline, true);
1535     static if (DMD_with_DSIMD)
1536     {
1537         return cast(__m128i) __simd(XMM.LODQ, *cast(__m128i*)mem_addr);
1538     }
1539     else
1540     {
1541         long r = *cast(long*)(mem_addr);
1542         long2 result = [0, 0];
1543         result.ptr[0] = r;
1544         return cast(__m128i)result;
1545     }
1546 }
1547 unittest
1548 {
1549     long r = 446446446446;
1550     long2 A = cast(long2) _mm_loadu_si64(&r);
1551     long[2] correct = [446446446446, 0];
1552     assert(A.array == correct);
1553 }
1554 
/// Allocate `size` bytes of memory, aligned to the alignment specified in `alignment`,
1556 /// and return a pointer to the allocated memory. `_mm_free` should be used to free
1557 /// memory that is allocated with `_mm_malloc`.
1558 void* _mm_malloc(size_t size, size_t alignment) @trusted
1559 {
1560     assert(alignment != 0);
1561     size_t request = requestedSize(size, alignment);
1562     void* raw = malloc(request);
1563     if (request > 0 && raw == null) // malloc(0) can validly return anything
1564         onOutOfMemoryError();
1565     return storeRawPointerPlusInfo(raw, size, alignment); // PERF: no need to store size
1566 }
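// A small usage sketch, assuming `_mm_free` and the module's private `isPointerAligned` helper.
unittest
{
    void* p = _mm_malloc(100, 64);
    assert(p !is null);
    assert(isPointerAligned(p, 64));
    _mm_free(p);
}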
1567 
/// Conditionally store 8-bit integer elements from `a` into memory using `mask` (elements are not stored when the 
/// highest bit is not set in the corresponding mask element) and a non-temporal memory hint.
1570 void _mm_maskmove_si64 (__m64 a, __m64 mask, char* mem_addr) @trusted
1571 {
1572     // this works since mask is zero-extended
1573     return _mm_maskmoveu_si128 (to_m128i(a), to_m128i(mask), mem_addr);
1574 }
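// A small usage sketch: only bytes whose mask byte has its most significant bit set are stored.
unittest
{
    ubyte[8] dst = [42, 42, 42, 42, 42, 42, 42, 42];
    __m64 A    = _mm_set_pi8( 8, 7, 6, 5,  4, 3,  2, 1);
    __m64 mask = _mm_set_pi8(-1, 0, 0, 0, -1, 0, -1, 0);
    _mm_maskmove_si64(A, mask, cast(char*)(dst.ptr));
    ubyte[8] correct = [42, 2, 42, 4, 42, 42, 42, 8];
    assert(dst == correct);
}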
1575 
1576 deprecated("Use _mm_maskmove_si64 instead") alias _m_maskmovq = _mm_maskmove_si64;///
1577 
/// Compare packed signed 16-bit integers in `a` and `b`, and return packed maximum values.
1579 __m64 _mm_max_pi16 (__m64 a, __m64 b) pure @safe
1580 {
1581     return to_m64(_mm_max_epi16(to_m128i(a), to_m128i(b)));
1582 }
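// A small example of signed 16-bit maximum.
unittest
{
    __m64 A = _mm_setr_pi16(-1, 6, -32768,  7);
    __m64 B = _mm_setr_pi16( 4, 5,     10, -7);
    short4 R = cast(short4) _mm_max_pi16(A, B);
    short[4] correct = [4, 6, 10, 7];
    assert(R.array == correct);
}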
1583 
1584 /// Compare packed single-precision (32-bit) floating-point elements in `a` and `b`, and return packed maximum values.
1585 __m128 _mm_max_ps(__m128 a, __m128 b) pure @safe
1586 {
1587     static if (DMD_with_DSIMD)
1588     {
1589         return cast(__m128) __simd(XMM.MAXPS, a, b);
1590     }
1591     else static if (GDC_with_SSE)
1592     {
1593         return __builtin_ia32_maxps(a, b);
1594     }
1595     else static if (LDC_with_SSE)
1596     {
1597         return __builtin_ia32_maxps(a, b);
1598     }
1599     else
1600     {
        // ARM: Optimized into fcmgt + bsl since LDC 1.8 -O2
1602         __m128 r; // PERF =void;
1603         r[0] = (a[0] > b[0]) ? a[0] : b[0];
1604         r[1] = (a[1] > b[1]) ? a[1] : b[1];
1605         r[2] = (a[2] > b[2]) ? a[2] : b[2];
1606         r[3] = (a[3] > b[3]) ? a[3] : b[3];
1607         return r;    
1608     }
1609 }
1610 unittest
1611 {
1612     __m128 A = _mm_setr_ps(1, 2, float.nan, 4);
1613     __m128 B = _mm_setr_ps(4, 1, 4, float.nan);
1614     __m128 M = _mm_max_ps(A, B);
1615     assert(M.array[0] == 4);
1616     assert(M.array[1] == 2);
1617     assert(M.array[2] == 4);    // in case of NaN, second operand prevails (as it seems)
1618     assert(M.array[3] != M.array[3]); // in case of NaN, second operand prevails (as it seems)
1619 }
1620 
1621 /// Compare packed unsigned 8-bit integers in `a` and `b`, and return packed maximum values.
1622 __m64 _mm_max_pu8 (__m64 a, __m64 b) pure @safe
1623 {
1624     return to_m64(_mm_max_epu8(to_m128i(a), to_m128i(b)));
1625 }
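// A small example of unsigned 8-bit maximum, assuming the `byte8` vector type from `inteli.types`.
// Note that -1, -10 and -128 read as 255, 246 and 128 when interpreted as unsigned.
unittest
{
    __m64 A = _mm_set_pi8( 7,   6, 5,    4, 3, 2, 1, 0);
    __m64 B = _mm_set_pi8(-1, -10, 5, -128, 3, 2, 0, 1);
    byte8 R = cast(byte8) _mm_max_pu8(A, B);
    byte[8] correct = [1, 1, 2, 3, -128, 5, -10, -1];
    assert(R.array == correct);
}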
1626 
/// Compare the lower single-precision (32-bit) floating-point elements in `a` and `b`, store the maximum value in the 
/// lower element of result, and copy the upper 3 packed elements from `a` to the upper elements of result.
__m128 _mm_max_ss(__m128 a, __m128 b) pure @safe
1630 {
1631     static if (DMD_with_DSIMD)
1632     {
1633         return cast(__m128) __simd(XMM.MAXSS, a, b);
1634     }
1635     else static if (GDC_with_SSE)
1636     {
1637         return __builtin_ia32_maxss(a, b);
1638     }
1639     else static if (LDC_with_SSE)
1640     {
1641         return __builtin_ia32_maxss(a, b); 
1642     }
1643     else
1644     {  
1645         __m128 r = a;
1646         r[0] = (a[0] > b[0]) ? a[0] : b[0];
1647         return r;
1648     }
1649 }
1650 unittest
1651 {
1652     __m128 A = _mm_setr_ps(1, 2, 3, 4);
1653     __m128 B = _mm_setr_ps(4, 1, 4, 1);
1654     __m128 C = _mm_setr_ps(float.nan, 1, 4, 1);
1655     __m128 M = _mm_max_ss(A, B);
1656     assert(M.array[0] == 4);
1657     assert(M.array[1] == 2);
1658     assert(M.array[2] == 3);
1659     assert(M.array[3] == 4);
    M = _mm_max_ss(A, C); // in case of NaN, second operand prevails
1661     assert(M.array[0] != M.array[0]);
    M = _mm_max_ss(C, A); // in case of NaN, second operand prevails
1663     assert(M.array[0] == 1);
1664 }
1665 
/// Compare packed signed 16-bit integers in `a` and `b`, and return packed minimum values.
1667 __m64 _mm_min_pi16 (__m64 a, __m64 b) pure @safe
1668 {
1669     return to_m64(_mm_min_epi16(to_m128i(a), to_m128i(b)));
1670 }
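// A small example of signed 16-bit minimum.
unittest
{
    __m64 A = _mm_setr_pi16(-1, 6, -32768,  7);
    __m64 B = _mm_setr_pi16( 4, 5,     10, -7);
    short4 R = cast(short4) _mm_min_pi16(A, B);
    short[4] correct = [-1, 5, -32768, -7];
    assert(R.array == correct);
}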
1671 
/// Compare packed single-precision (32-bit) floating-point elements in `a` and `b`, and return packed minimum values.
1673 __m128 _mm_min_ps(__m128 a, __m128 b) pure @safe
1674 {
1675     static if (DMD_with_DSIMD)
1676     {
1677         return cast(__m128) __simd(XMM.MINPS, a, b);
1678     }
1679     else static if (GDC_with_SSE)
1680     {
1681         return __builtin_ia32_minps(a, b);
1682     }
1683     else static if (LDC_with_SSE)
1684     {
1685         // not technically needed, but better perf in debug mode
1686         return __builtin_ia32_minps(a, b);
1687     }
1688     else
1689     {
        // ARM: Optimized into fcmgt + bsl since LDC 1.8 -O2
1691         __m128 r; // PERF =void;
1692         r[0] = (a[0] < b[0]) ? a[0] : b[0];
1693         r[1] = (a[1] < b[1]) ? a[1] : b[1];
1694         r[2] = (a[2] < b[2]) ? a[2] : b[2];
1695         r[3] = (a[3] < b[3]) ? a[3] : b[3];
1696         return r;
1697     }
1698 }
1699 unittest
1700 {
1701     __m128 A = _mm_setr_ps(1, 2, float.nan, 4);
1702     __m128 B = _mm_setr_ps(4, 1, 4, float.nan);
1703     __m128 M = _mm_min_ps(A, B);
1704     assert(M.array[0] == 1);
1705     assert(M.array[1] == 1);
1706     assert(M.array[2] == 4);    // in case of NaN, second operand prevails (as it seems)
1707     assert(M.array[3] != M.array[3]); // in case of NaN, second operand prevails (as it seems)
1708 }
1709 
1710 /// Compare packed unsigned 8-bit integers in `a` and `b`, and return packed minimum values.
1711 __m64 _mm_min_pu8 (__m64 a, __m64 b) pure @safe
1712 {
1713     return to_m64(_mm_min_epu8(to_m128i(a), to_m128i(b)));
1714 }
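// A small example of unsigned 8-bit minimum, assuming the `byte8` vector type from `inteli.types`.
unittest
{
    __m64 A = _mm_set_pi8( 7,   6, 5,    4, 3, 2, 1, 0);
    __m64 B = _mm_set_pi8(-1, -10, 5, -128, 3, 2, 0, 1);
    byte8 R = cast(byte8) _mm_min_pu8(A, B);
    byte[8] correct = [0, 0, 2, 3, 4, 5, 6, 7];
    assert(R.array == correct);
}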
1715 
/// Compare the lower single-precision (32-bit) floating-point elements in `a` and `b`, store the minimum value in the 
/// lower element of result, and copy the upper 3 packed elements from `a` to the upper elements of result.
1718 __m128 _mm_min_ss(__m128 a, __m128 b) pure @safe
1719 {
1720     static if (DMD_with_DSIMD)
1721     {
1722         return cast(__m128) __simd(XMM.MINSS, a, b);
1723     }
1724     else static if (GDC_with_SSE)
1725     {
1726         return __builtin_ia32_minss(a, b);
1727     }
1728     else static if (LDC_with_SSE)
1729     {
1730         return __builtin_ia32_minss(a, b);
1731     }
1732     else
1733     {
1734         // Generates minss since LDC 1.3 -O1
1735         __m128 r = a;
1736         r[0] = (a[0] < b[0]) ? a[0] : b[0];
1737         return r;
1738     }
1739 }
1740 unittest
1741 {
1742     __m128 A = _mm_setr_ps(1, 2, 3, 4);
1743     __m128 B = _mm_setr_ps(4, 1, 4, 1);
1744     __m128 C = _mm_setr_ps(float.nan, 1, 4, 1);
1745     __m128 M = _mm_min_ss(A, B);
1746     assert(M.array[0] == 1);
1747     assert(M.array[1] == 2);
1748     assert(M.array[2] == 3);
1749     assert(M.array[3] == 4);
    M = _mm_min_ss(A, C); // in case of NaN, second operand prevails
1751     assert(M.array[0] != M.array[0]);
    M = _mm_min_ss(C, A); // in case of NaN, second operand prevails
1753     assert(M.array[0] == 1);
1754 }
1755 
1756 /// Move the lower single-precision (32-bit) floating-point element from `b` to the lower element of result, and copy 
1757 /// the upper 3 packed elements from `a` to the upper elements of result.
1758 __m128 _mm_move_ss (__m128 a, __m128 b) pure @trusted
1759 {
1760     // Workaround https://issues.dlang.org/show_bug.cgi?id=21673
1761     // inlining of this function fails.
1762     version(DigitalMars) asm nothrow @nogc pure { nop; }
1763 
1764     a.ptr[0] = b.array[0];
1765     return a;
1766 }
1767 unittest
1768 {
1769     __m128 A = _mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f);
1770     __m128 B = _mm_setr_ps(5.0f, 6.0f, 7.0f, 8.0f);
1771     __m128 R = _mm_move_ss(A, B);
1772     float[4] correct = [5.0f, 2.0f, 3.0f, 4.0f];
1773     assert(R.array == correct);
1774 }
1775 
1776 /// Move the upper 2 single-precision (32-bit) floating-point elements from `b` to the lower 2 elements of result, and 
/// copy the upper 2 elements from `a` to the upper 2 elements of result.
1778 __m128 _mm_movehl_ps (__m128 a, __m128 b) pure @trusted
1779 {
1780     // PERF DMD
1781     // Disabled because of https://issues.dlang.org/show_bug.cgi?id=19443
1782     /*
1783     static if (DMD_with_DSIMD)
1784     {
1785         
1786         return cast(__m128) __simd(XMM.MOVHLPS, a, b);
1787     }
1788     else */
1789     {
1790         a.ptr[0] = b.array[2];
1791         a.ptr[1] = b.array[3];
1792         return a;
1793     }
1794 }
1795 unittest
1796 {
1797     __m128 A = _mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f);
1798     __m128 B = _mm_setr_ps(5.0f, 6.0f, 7.0f, 8.0f);
1799     __m128 R = _mm_movehl_ps(A, B);
1800     float[4] correct = [7.0f, 8.0f, 3.0f, 4.0f];
1801     assert(R.array == correct);
1802 }
1803 
1804 /// Move the lower 2 single-precision (32-bit) floating-point elements from `b` to the upper 2 elements of result, and 
/// copy the lower 2 elements from `a` to the lower 2 elements of result.
1806 __m128 _mm_movelh_ps (__m128 a, __m128 b) pure @trusted
1807 {    
1808     // PERF DMD
1809     // Disabled because of https://issues.dlang.org/show_bug.cgi?id=19443
1810     /*
1811     static if (DMD_with_DSIMD)
1812     {
1813         return cast(__m128) __simd(XMM.MOVLHPS, a, b);
1814     }
1815     else
1816     */
1817     {
1818         a.ptr[2] = b.array[0];
1819         a.ptr[3] = b.array[1];
1820         return a;
1821     }    
1822 }
1823 unittest
1824 {
1825     __m128 A = _mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f);
1826     __m128 B = _mm_setr_ps(5.0f, 6.0f, 7.0f, 8.0f);
1827     __m128 R = _mm_movelh_ps(A, B);
1828     float[4] correct = [1.0f, 2.0f, 5.0f, 6.0f];
1829     assert(R.array == correct);
1830 }
1831 
1832 /// Create mask from the most significant bit of each 8-bit element in `a`.
1833 int _mm_movemask_pi8 (__m64 a) pure @safe
1834 {
1835     return _mm_movemask_epi8(to_m128i(a));
1836 }
1837 unittest
1838 {
1839     assert(0x9C == _mm_movemask_pi8(_mm_set_pi8(-1, 0, 0, -1, -1, -1, 0, 0)));
1840 }
1841 
1842 /// Set each bit of result based on the most significant bit of the corresponding packed single-precision (32-bit) 
1843 /// floating-point element in `a`.
1844 int _mm_movemask_ps (__m128 a) pure @trusted
1845 {
1846     // PERF: Not possible in D_SIMD because of https://issues.dlang.org/show_bug.cgi?id=8047
1847     static if (GDC_with_SSE)
1848     {
1849         return __builtin_ia32_movmskps(a);
1850     }
1851     else static if (LDC_with_SSE)
1852     {
1853         return __builtin_ia32_movmskps(a);
1854     }
1855     else static if (LDC_with_ARM)
1856     {
1857         int4 ai = cast(int4)a;
1858         int4 shift31 = [31, 31, 31, 31]; 
1859         ai = ai >>> shift31;
1860         int4 shift = [0, 1, 2, 3]; 
1861         ai = ai << shift; // 4-way shift, only efficient on ARM.
1862         int r = ai.array[0] + (ai.array[1]) + (ai.array[2]) + (ai.array[3]);
1863         return r;
1864     }
1865     else
1866     {
1867         int4 ai = cast(int4)a;
1868         int r = 0;
1869         if (ai.array[0] < 0) r += 1;
1870         if (ai.array[1] < 0) r += 2;
1871         if (ai.array[2] < 0) r += 4;
1872         if (ai.array[3] < 0) r += 8;
1873         return r;
1874     }
1875 }
1876 unittest
1877 {
1878     int4 A = [-1, 0, -43, 0];
1879     assert(5 == _mm_movemask_ps(cast(float4)A));
1880 }
1881 
1882 /// Multiply packed single-precision (32-bit) floating-point elements in `a` and `b`.
1883 __m128 _mm_mul_ps(__m128 a, __m128 b) pure @safe
1884 {
1885     pragma(inline, true);
1886     return a * b;
1887 }
1888 unittest
1889 {
1890     __m128 a = [1.5f, -2.0f, 3.0f, 1.0f];
1891     a = _mm_mul_ps(a, a);
1892     float[4] correct = [2.25f, 4.0f, 9.0f, 1.0f];
1893     assert(a.array == correct);
1894 }
1895 
1896 /// Multiply the lower single-precision (32-bit) floating-point element in `a` and `b`, store the result in the lower 
1897 /// element of result, and copy the upper 3 packed elements from `a` to the upper elements of result.
1898 __m128 _mm_mul_ss(__m128 a, __m128 b) pure @safe
1899 {
1900     static if (DMD_with_DSIMD)
1901         return cast(__m128) __simd(XMM.MULSS, a, b);
1902     else static if (GDC_with_SSE)
1903         return __builtin_ia32_mulss(a, b);
1904     else
1905     {
1906         a[0] *= b[0];
1907         return a;
1908     }
1909 }
1910 unittest
1911 {
1912     __m128 a = [1.5f, -2.0f, 3.0f, 1.0f];
1913     a = _mm_mul_ss(a, a);
1914     float[4] correct = [2.25f, -2.0f, 3.0f, 1.0f];
1915     assert(a.array == correct);
1916 }
1917 
1918 /// Multiply the packed unsigned 16-bit integers in `a` and `b`, producing intermediate 32-bit integers, 
1919 /// and return the high 16 bits of the intermediate integers.
1920 __m64 _mm_mulhi_pu16 (__m64 a, __m64 b) pure @safe
1921 {
1922     return to_m64(_mm_mulhi_epu16(to_m128i(a), to_m128i(b)));
1923 }
1924 unittest
1925 {
1926     __m64 A = _mm_setr_pi16(0, -16, 2, 3);
1927     __m64 B = _mm_set1_pi16(16384);
1928     short4 R = cast(short4)_mm_mulhi_pu16(A, B);
1929     short[4] correct = [0, 0x3FFC, 0, 0];
1930     assert(R.array == correct);
1931 }
1932 
1933 /// Compute the bitwise OR of packed single-precision (32-bit) floating-point elements in `a` and `b`, and 
1934 /// return the result.
1935 __m128 _mm_or_ps (__m128 a, __m128 b) pure @safe
1936 {
1937     static if (DMD_with_DSIMD)
1938         return cast(__m128)__simd(XMM.ORPS, a, b);
1939     else
1940         return cast(__m128)(cast(__m128i)a | cast(__m128i)b);
1941 }
1942 unittest
1943 {
1944     __m128 A = cast(__m128) _mm_set1_epi32(0x80000000);
1945     __m128 B = _mm_setr_ps(4.0f, -5.0, -9.5f, float.infinity);
1946     __m128 C = _mm_or_ps(A, B);
1947     float[4] correct = [-4.0f, -5.0, -9.5f, -float.infinity];
1948     assert(C.array == correct);
1949 }
1950 
1951 deprecated("Use _mm_avg_pu8 instead") alias _m_pavgb = _mm_avg_pu8;///
1952 deprecated("Use _mm_avg_pu16 instead") alias _m_pavgw = _mm_avg_pu16;///
1953 deprecated("Use _mm_extract_pi16 instead") alias _m_pextrw = _mm_extract_pi16;///
1954 deprecated("Use _mm_insert_pi16 instead") alias _m_pinsrw = _mm_insert_pi16;///
1955 deprecated("Use _mm_max_pi16 instead") alias _m_pmaxsw = _mm_max_pi16;///
1956 deprecated("Use _mm_max_pu8 instead") alias _m_pmaxub = _mm_max_pu8;///
1957 deprecated("Use _mm_min_pi16 instead") alias _m_pminsw = _mm_min_pi16;///
1958 deprecated("Use _mm_min_pu8 instead") alias _m_pminub = _mm_min_pu8;///
1959 deprecated("Use _mm_movemask_pi8 instead") alias _m_pmovmskb = _mm_movemask_pi8;///
1960 deprecated("Use _mm_mulhi_pu16 instead") alias _m_pmulhuw = _mm_mulhi_pu16;///
1961 
1962 enum _MM_HINT_T0  = 3; ///
1963 enum _MM_HINT_T1  = 2; ///
1964 enum _MM_HINT_T2  = 1; ///
1965 enum _MM_HINT_NTA = 0; ///
1966 
1967 
1968 version(LDC)
1969 {
1970     // Starting with LLVM 10, it seems llvm.prefetch has changed its name.
1971     // Was reported at: https://github.com/ldc-developers/ldc/issues/3397
1972     static if (__VERSION__ >= 2091) 
1973     {
1974         pragma(LDC_intrinsic, "llvm.prefetch.p0i8") // was "llvm.prefetch"
1975             void llvm_prefetch_fixed(void* ptr, uint rw, uint locality, uint cachetype) pure @safe;
1976     }
1977 }
1978 
1979 /// Fetch the line of data from memory that contains address `p` to a location in the 
/// cache hierarchy specified by the locality hint `locality`.
1981 ///
1982 /// Warning: `locality` is a compile-time parameter, unlike in Intel Intrinsics API.
1983 void _mm_prefetch(int locality)(const(void)* p) pure @trusted
1984 {
1985     static if (GDC_with_SSE)
1986     {
1987         return __builtin_prefetch(p, (locality & 0x4) >> 2, locality & 0x3);
1988     }
1989     else static if (DMD_with_DSIMD)
1990     {
1991         enum bool isWrite = (locality & 0x4) != 0;
1992         enum level = locality & 3;
1993         return prefetch!(isWrite, level)(p);
1994     }
1995     else version(LDC)
1996     {
1997         static if (__VERSION__ >= 2091)
1998         {
1999             // const_cast here. `llvm_prefetch` wants a mutable pointer
2000             llvm_prefetch_fixed( cast(void*)p, 0, locality, 1);
2001         }
2002         else
2003         {
2004             // const_cast here. `llvm_prefetch` wants a mutable pointer
2005             llvm_prefetch( cast(void*)p, 0, locality, 1);
2006         }
2007     }
2008     else version(D_InlineAsm_X86_64)
2009     {
2010         static if (locality == _MM_HINT_NTA)
2011         {
2012             asm pure nothrow @nogc @trusted
2013             {
2014                 mov RAX, p;
2015                 prefetchnta [RAX];
2016             }
2017         }
2018         else static if (locality == _MM_HINT_T0)
2019         {
2020             asm pure nothrow @nogc @trusted
2021             {
2022                 mov RAX, p;
2023                 prefetcht0 [RAX];
2024             }
2025         }
2026         else static if (locality == _MM_HINT_T1)
2027         {
2028             asm pure nothrow @nogc @trusted
2029             {
2030                 mov RAX, p;
2031                 prefetcht1 [RAX];
2032             }
2033         }
2034         else static if (locality == _MM_HINT_T2)
2035         {
2036             asm pure nothrow @nogc @trusted
2037             {
2038                 mov RAX, p;
2039                 prefetcht2 [RAX];
2040             }
2041         }
2042         else
2043             assert(false); // invalid locality hint
2044     }
2045     else version(D_InlineAsm_X86)
2046     {
2047         static if (locality == _MM_HINT_NTA)
2048         {
2049             asm pure nothrow @nogc @trusted
2050             {
2051                 mov EAX, p;
2052                 prefetchnta [EAX];
2053             }
2054         }
2055         else static if (locality == _MM_HINT_T0)
2056         {
2057             asm pure nothrow @nogc @trusted
2058             {
2059                 mov EAX, p;
2060                 prefetcht0 [EAX];
2061             }
2062         }
2063         else static if (locality == _MM_HINT_T1)
2064         {
2065             asm pure nothrow @nogc @trusted
2066             {
2067                 mov EAX, p;
2068                 prefetcht1 [EAX];
2069             }
2070         }
2071         else static if (locality == _MM_HINT_T2)
2072         {
2073             asm pure nothrow @nogc @trusted
2074             {
2075                 mov EAX, p;
2076                 prefetcht2 [EAX];
2077             }
2078         }
2079         else 
2080             assert(false); // invalid locality hint
2081     }
2082     else
2083     {
2084         // Generic version: do nothing. From bitter experience, 
2085         // it's unlikely you get ANY speed-up with manual prefetching.
2086         // Prefetching or not doesn't change program behaviour.
2087     }
2088 }
2089 unittest
2090 {
2091     // From Intel documentation:
2092     // "The amount of data prefetched is also processor implementation-dependent. It will, however, be a minimum of 
2093     // 32 bytes."
2094     ubyte[256] cacheline; // though it seems it cannot generate GP fault
2095     _mm_prefetch!_MM_HINT_T0(cacheline.ptr); 
2096     _mm_prefetch!_MM_HINT_T1(cacheline.ptr); 
2097     _mm_prefetch!_MM_HINT_T2(cacheline.ptr); 
2098     _mm_prefetch!_MM_HINT_NTA(cacheline.ptr); 
2099 }
2100 
2101 deprecated("Use _mm_sad_pu8 instead") alias _m_psadbw = _mm_sad_pu8;///
2102 deprecated("Use _mm_shuffle_pi16 instead") alias _m_pshufw = _mm_shuffle_pi16;///
2103 
2104 
/// Compute the approximate reciprocal of packed single-precision (32-bit) floating-point elements in `a`, 
2106 /// and return the results. The maximum relative error for this approximation is less than 1.5*2^-12.
2107 __m128 _mm_rcp_ps (__m128 a) pure @trusted
2108 {
2109     static if (DMD_with_DSIMD)
2110     {
2111         return cast(__m128) __simd(XMM.RCPPS, a);
2112     }
2113     else static if (GDC_with_SSE)
2114     {
2115         return __builtin_ia32_rcpps(a);
2116     }
2117     else static if (LDC_with_SSE)
2118     {
2119         return __builtin_ia32_rcpps(a);
2120     }
2121     else
2122     {        
2123         a.ptr[0] = 1.0f / a.array[0];
2124         a.ptr[1] = 1.0f / a.array[1];
2125         a.ptr[2] = 1.0f / a.array[2];
2126         a.ptr[3] = 1.0f / a.array[3];
2127         return a;
2128     }
2129 }
2130 unittest
2131 {
2132     __m128 A = _mm_setr_ps(2.34f, -70000.0f, 0.00001f, 345.5f);
2133     __m128 groundTruth = _mm_set1_ps(1.0f) / A;
2134     __m128 result = _mm_rcp_ps(A);
2135     foreach(i; 0..4)
2136     {
2137         double relError = (cast(double)(groundTruth.array[i]) / result.array[i]) - 1;
2138         assert(abs_double(relError) < 0.00037); // 1.5*2^-12 is 0.00036621093
2139     }
2140 }
2141 
2142 /// Compute the approximate reciprocal of the lower single-precision (32-bit) floating-point element in `a`, store it 
2143 /// in the lower element of the result, and copy the upper 3 packed elements from `a` to the upper elements of result. 
2144 /// The maximum relative error for this approximation is less than 1.5*2^-12.
2145 __m128 _mm_rcp_ss (__m128 a) pure @trusted
2146 {
2147     // Disabled, see https://issues.dlang.org/show_bug.cgi?id=23049
2148     /*static if (DMD_with_DSIMD)
2149     {
2150         return cast(__m128) __simd(XMM.RCPSS, a);
2151     }
2152     else*/
2153     static if (GDC_with_SSE)
2154     {
2155         return __builtin_ia32_rcpss(a);
2156     }
2157     else static if (LDC_with_SSE)
2158     {
2159         return __builtin_ia32_rcpss(a);
2160     }
2161     else
2162     {
2163         a.ptr[0] = 1.0f / a.array[0];
2164         return a;
2165     }
2166 }
2167 unittest
2168 {
2169     __m128 A = _mm_setr_ps(2.34f, -70000.0f, 0.00001f, 345.5f);
2170     __m128 correct = _mm_setr_ps(1 / 2.34f, -70000.0f, 0.00001f, 345.5f);
2171     __m128 R = _mm_rcp_ss(A);
2172     double relError = (cast(double)(correct.array[0]) / R.array[0]) - 1;
2173     assert(abs_double(relError) < 0.00037); // 1.5*2^-12 is 0.00036621093
2174     assert(R.array[1] == correct.array[1]);
2175     assert(R.array[2] == correct.array[2]);
2176     assert(R.array[3] == correct.array[3]);
2177 }
2178 
2179 /// Reallocate `size` bytes of memory, aligned to the alignment specified in `alignment`, 
2180 /// and return a pointer to the newly allocated memory. 
2181 /// `_mm_free` or `alignedRealloc` with size 0 should be used to free memory that is 
2182 /// allocated with `_mm_malloc` or `_mm_realloc`.
2183 /// Previous data is preserved.
2184 void* _mm_realloc(void* aligned, size_t size, size_t alignment) nothrow @nogc // #BONUS
2185 {
2186     return alignedReallocImpl!true(aligned, size, alignment);
2187 }
2188 unittest
2189 {
2190     enum NALLOC = 8;
2191     enum size_t[8] ALIGNMENTS = [1, 2, 4, 8, 16, 32, 64, 128];
2192     
2193     void*[NALLOC] alloc;
2194 
2195     foreach(t; 0..100)
2196     {
2197         foreach(n; 0..NALLOC)
2198         {
2199             size_t alignment = ALIGNMENTS[n];
2200             size_t s = ( (n + t * 69096) & 0xffff );
2201             alloc[n] = _mm_realloc(alloc[n], s, alignment);
2202             assert(isPointerAligned(alloc[n], alignment));
2203             foreach(b; 0..s)
2204                 (cast(ubyte*)alloc[n])[b] = cast(ubyte)n;
2205         }
2206     }
2207     foreach(n; 0..NALLOC)
2208     {
2209         alloc[n] = _mm_realloc(alloc[n], 0, ALIGNMENTS[n]);
2210     }
2211 }
2212 
2213 /// Reallocate `size` bytes of memory, aligned to the alignment specified in `alignment`, 
2214 /// and return a pointer to the newly allocated memory. 
2215 /// `_mm_free` or `alignedRealloc` with size 0 should be used to free memory that is 
2216 /// allocated with `_mm_malloc` or `_mm_realloc`.
2217 /// Previous data is discarded.
2218 void* _mm_realloc_discard(void* aligned, size_t size, size_t alignment) nothrow @nogc // #BONUS
2219 {
2220     return alignedReallocImpl!false(aligned, size, alignment);
2221 }
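// A small usage sketch, assuming `_mm_free` and the module's private `isPointerAligned` helper;
// passing `null` behaves like a fresh allocation.
unittest
{
    void* p = _mm_realloc_discard(null, 100, 32);
    assert(isPointerAligned(p, 32));
    p = _mm_realloc_discard(p, 200, 32);
    assert(isPointerAligned(p, 32));
    _mm_free(p);
}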
2222 
2223 /// Compute the approximate reciprocal square root of packed single-precision (32-bit) floating-point elements in `a`. 
2224 /// The maximum relative error for this approximation is less than 1.5*2^-12.
2225 __m128 _mm_rsqrt_ps (__m128 a) pure @trusted
2226 {
2227     static if (DMD_with_DSIMD)
2228     {
2229         return cast(__m128) __simd(XMM.RSQRTPS, a);
2230     }
2231     else static if (GDC_with_SSE)
2232     {
2233         return __builtin_ia32_rsqrtps(a);
2234     }
2235     else static if (LDC_with_SSE)
2236     {
2237         return __builtin_ia32_rsqrtps(a);
2238     }
2239     else version(LDC)
2240     {
2241         a[0] = 1.0f / llvm_sqrt(a[0]);
2242         a[1] = 1.0f / llvm_sqrt(a[1]);
2243         a[2] = 1.0f / llvm_sqrt(a[2]);
2244         a[3] = 1.0f / llvm_sqrt(a[3]);
2245         return a;
2246     }
2247     else
2248     {
2249         a.ptr[0] = 1.0f / sqrt(a.array[0]);
2250         a.ptr[1] = 1.0f / sqrt(a.array[1]);
2251         a.ptr[2] = 1.0f / sqrt(a.array[2]);
2252         a.ptr[3] = 1.0f / sqrt(a.array[3]);
2253         return a;
2254     }
2255 }
2256 unittest
2257 {
2258     __m128 A = _mm_setr_ps(2.34f, 70000.0f, 0.00001f, 345.5f);
2259     __m128 groundTruth = _mm_setr_ps(0.65372045f, 0.00377964473f, 316.227766f, 0.05379921937f);
2260     __m128 result = _mm_rsqrt_ps(A);
2261     foreach(i; 0..4)
2262     {
2263         double relError = (cast(double)(groundTruth.array[i]) / result.array[i]) - 1;
2264         assert(abs_double(relError) < 0.00037); // 1.5*2^-12 is 0.00036621093
2265     }
2266 }
2267 
2268 /// Compute the approximate reciprocal square root of the lower single-precision (32-bit) floating-point element in `a`,
2269 /// store the result in the lower element. Copy the upper 3 packed elements from `a` to the upper elements of result. 
2270 /// The maximum relative error for this approximation is less than 1.5*2^-12.
2271 __m128 _mm_rsqrt_ss (__m128 a) pure @trusted
2272 {   
2273     static if (DMD_with_DSIMD)
2274     {
2275         return cast(__m128) __simd(XMM.RSQRTSS, a);
2276     }
2277     else static if (GDC_with_SSE)
2278     {
2279         return __builtin_ia32_rsqrtss(a);
2280     }
2281     else static if (LDC_with_SSE)
2282     {
2283         return __builtin_ia32_rsqrtss(a);
2284     }
2285     else version(LDC)
2286     {
2287         a[0] = 1.0f / llvm_sqrt(a[0]);
2288         return a;
2289     }
2290     else
2291     {
2292         a[0] = 1.0f / sqrt(a[0]);
2293         return a;
2294     }
2295 }
unittest // this one tests 4 different intrinsics: _mm_rsqrt_ss, _mm_rsqrt_ps, _mm_rcp_ps, _mm_rcp_ss
2297 {
    double maxRelativeError = 0.000245; // -72 dB; the hardware seems more precise than the documented bound
2299     void testApproximateSSE(float number) nothrow @nogc
2300     {
2301         __m128 A = _mm_set1_ps(number);
2302 
2303         // test _mm_rcp_ps
2304         __m128 B = _mm_rcp_ps(A);
2305         foreach(i; 0..4)
2306         {
2307             double exact = 1.0f / A.array[i];
2308             double ratio = cast(double)(B.array[i]) / cast(double)(exact);
2309             assert(abs_double(ratio - 1) <= maxRelativeError);
2310         }
2311 
2312         // test _mm_rcp_ss
2313         {
2314             B = _mm_rcp_ss(A);
2315             double exact = 1.0f / A.array[0];
2316             double ratio = cast(double)(B.array[0]) / cast(double)(exact);
2317             assert(abs_double(ratio - 1) <= maxRelativeError);
2318         }
2319 
2320         // test _mm_rsqrt_ps
2321         B = _mm_rsqrt_ps(A);
2322         foreach(i; 0..4)
2323         {
2324             double exact = 1.0f / sqrt(A.array[i]);
2325             double ratio = cast(double)(B.array[i]) / cast(double)(exact);
2326             assert(abs_double(ratio - 1) <= maxRelativeError);
2327         }
2328 
2329         // test _mm_rsqrt_ss
2330         {
2331             B = _mm_rsqrt_ss(A);
2332             double exact = 1.0f / sqrt(A.array[0]);
2333             double ratio = cast(double)(B.array[0]) / cast(double)(exact);
2334             assert(abs_double(ratio - 1) <= maxRelativeError);
2335         }
2336     }
2337 
2338     testApproximateSSE(0.00001f);
2339     testApproximateSSE(1.1f);
2340     testApproximateSSE(345.0f);
2341     testApproximateSSE(2.45674864151f);
2342     testApproximateSSE(700000.0f);
2343     testApproximateSSE(10000000.0f);
2344     testApproximateSSE(27841456468.0f);
2345 }
2346 
2347 /// Compute the absolute differences of packed unsigned 8-bit integers in `a` and `b`, then horizontally sum each 
2348 /// consecutive 8 differences to produce four unsigned 16-bit integers, and pack these unsigned 16-bit integers in the 
2349 /// low 16 bits of result.
2350 __m64 _mm_sad_pu8 (__m64 a, __m64 b) pure @safe
2351 {
2352     return to_m64(_mm_sad_epu8(to_m128i(a), to_m128i(b)));
2353 }
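// A small example: the sum of absolute differences of the 8 bytes lands in the low 16 bits of result.
unittest
{
    __m64 A = _mm_set_pi8(0, 0, 0, 0, 10, 20, 30, 40);
    __m64 B = _mm_set_pi8(0, 0, 0, 0, 12, 18, 35, 43);
    short4 R = cast(short4) _mm_sad_pu8(A, B);
    short[4] correct = [3 + 5 + 2 + 2, 0, 0, 0];
    assert(R.array == correct);
}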
2354 
2355 /// Set the exception mask bits of the MXCSR control and status register to the value in unsigned 32-bit integer 
2356 /// `_MM_MASK_xxxx`. The exception mask may contain any of the following flags: `_MM_MASK_INVALID`, `_MM_MASK_DIV_ZERO`,
2357 /// `_MM_MASK_DENORM`, `_MM_MASK_OVERFLOW`, `_MM_MASK_UNDERFLOW`, `_MM_MASK_INEXACT`.
2358 void _MM_SET_EXCEPTION_MASK(int _MM_MASK_xxxx) @safe
2359 {
2360     // Note: unsupported on ARM
2361     _mm_setcsr((_mm_getcsr() & ~_MM_MASK_MASK) | _MM_MASK_xxxx);
2362 }
2363 
2364 /// Set the exception state bits of the MXCSR control and status register to the value in unsigned 32-bit integer 
2365 /// `_MM_EXCEPT_xxxx`. The exception state may contain any of the following flags: `_MM_EXCEPT_INVALID`, 
2366 /// `_MM_EXCEPT_DIV_ZERO`, `_MM_EXCEPT_DENORM`, `_MM_EXCEPT_OVERFLOW`, `_MM_EXCEPT_UNDERFLOW`, `_MM_EXCEPT_INEXACT`.
2367 void _MM_SET_EXCEPTION_STATE(int _MM_EXCEPT_xxxx) @safe
2368 {
2369     // Note: unsupported on ARM
2370     _mm_setcsr((_mm_getcsr() & ~_MM_EXCEPT_MASK) | _MM_EXCEPT_xxxx);
2371 }
2372 
2373 /// Set the flush zero bits of the MXCSR control and status register to the value in unsigned 32-bit integer 
2374 /// `_MM_FLUSH_xxxx`. The flush zero may contain any of the following flags: `_MM_FLUSH_ZERO_ON` or `_MM_FLUSH_ZERO_OFF`.
2375 void _MM_SET_FLUSH_ZERO_MODE(int _MM_FLUSH_xxxx) @safe
2376 {
2377     _mm_setcsr((_mm_getcsr() & ~_MM_FLUSH_ZERO_MASK) | _MM_FLUSH_xxxx);
2378 }
2379 
2380 /// Set packed single-precision (32-bit) floating-point elements with the supplied values.
2381 __m128 _mm_set_ps (float e3, float e2, float e1, float e0) pure @trusted
2382 {
2383     __m128 r = void;
2384     r.ptr[0] = e0;
2385     r.ptr[1] = e1;
2386     r.ptr[2] = e2;
2387     r.ptr[3] = e3;
2388     return r;
2389 }
2390 unittest
2391 {
2392     __m128 A = _mm_set_ps(3, 2, 1, 546);
2393     float[4] correct = [546.0f, 1.0f, 2.0f, 3.0f];
2394     assert(A.array == correct);
2395 }
2396 
2397 deprecated("Use _mm_set1_ps instead") alias _mm_set_ps1 = _mm_set1_ps; ///
2398 
2399 /// Set the rounding mode bits of the MXCSR control and status register to the value in unsigned 32-bit integer 
2400 /// `_MM_ROUND_xxxx`. The rounding mode may contain any of the following flags: `_MM_ROUND_NEAREST`, `_MM_ROUND_DOWN`, 
2401 /// `_MM_ROUND_UP`, `_MM_ROUND_TOWARD_ZERO`.
2402 void _MM_SET_ROUNDING_MODE(int _MM_ROUND_xxxx) @safe
2403 {
2404     // Work-around for https://gcc.gnu.org/bugzilla/show_bug.cgi?id=98607
2405     version(GNU) asm nothrow @nogc @trusted { "" : : : "memory"; }
2406     _mm_setcsr((_mm_getcsr() & ~_MM_ROUND_MASK) | _MM_ROUND_xxxx);
2407 }
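// A minimal sketch of saving, changing, and restoring the rounding mode; it assumes `_mm_getcsr`
// reflects the rounding bits on the current target.
unittest
{
    uint saved = _mm_getcsr();
    _MM_SET_ROUNDING_MODE(_MM_ROUND_TOWARD_ZERO);
    assert((_mm_getcsr() & _MM_ROUND_MASK) == _MM_ROUND_TOWARD_ZERO);
    _mm_setcsr(saved); // restore the previous control word
}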
2408 
2409 /// Copy single-precision (32-bit) floating-point element `a` to the lower element of result, and zero the upper 3 elements.
2410 __m128 _mm_set_ss (float a) pure @trusted
2411 {
2412     static if (DMD_with_DSIMD)
2413     {
2414         return cast(__m128) __simd(XMM.LODSS, a);
2415     }
2416     else
2417     {
2418         __m128 r = _mm_setzero_ps();
2419         r.ptr[0] = a;
2420         return r;
2421     }
2422 }
2423 unittest
2424 {
2425     float[4] correct = [42.0f, 0.0f, 0.0f, 0.0f];
2426     __m128 A = _mm_set_ss(42.0f);
2427     assert(A.array == correct);
2428 }
2429 
2430 /// Broadcast single-precision (32-bit) floating-point value `a` to all elements.
2431 __m128 _mm_set1_ps (float a) pure @trusted
2432 {
2433     pragma(inline, true);
2434     __m128 r = a;
2435     return r;
2436 }
2437 unittest
2438 {
2439     float[4] correct = [42.0f, 42.0f, 42.0f, 42.0f];
2440     __m128 A = _mm_set1_ps(42.0f);
2441     assert(A.array == correct);
2442 }
2443 
2444 /// Set the MXCSR control and status register with the value in unsigned 32-bit integer `controlWord`.
2445 void _mm_setcsr(uint controlWord) @trusted
2446 {
2447     static if (LDC_with_ARM)
2448     {
        // Convert from SSE to ARM control word. This is only done partially,
        // and supports only rounding-mode and flush-to-zero changes.
2451 
2452         // "To alter some bits of a VFP system register without 
2453         // affecting other bits, use a read-modify-write procedure"
2454         uint fpscr = arm_get_fpcr();
2455         
        // Bits 23 to 22 hold the rounding mode; they are however not used by NEON
2457         fpscr = fpscr & ~_MM_ROUND_MASK_ARM;
2458         switch(controlWord & _MM_ROUND_MASK)
2459         {
2460             default:
2461             case _MM_ROUND_NEAREST:     fpscr |= _MM_ROUND_NEAREST_ARM;     break;
2462             case _MM_ROUND_DOWN:        fpscr |= _MM_ROUND_DOWN_ARM;        break;
2463             case _MM_ROUND_UP:          fpscr |= _MM_ROUND_UP_ARM;          break;
2464             case _MM_ROUND_TOWARD_ZERO: fpscr |= _MM_ROUND_TOWARD_ZERO_ARM; break;
2465         }
2466         fpscr = fpscr & ~_MM_FLUSH_ZERO_MASK_ARM;
2467         if (controlWord & _MM_FLUSH_ZERO_MASK)
2468             fpscr |= _MM_FLUSH_ZERO_MASK_ARM;
2469         arm_set_fpcr(fpscr);
2470     }
2471     else version(GNU)
2472     {
2473         static if (GDC_with_SSE)
2474         {
2475             // Work-around for https://gcc.gnu.org/bugzilla/show_bug.cgi?id=98607
2476             version(GNU) asm nothrow @nogc @trusted { "" : : : "memory"; }
2477             __builtin_ia32_ldmxcsr(controlWord);
2478         }
2479         else version(X86)
2480         {
2481             asm nothrow @nogc @trusted
2482             {
2483                 "ldmxcsr %0;\n" 
2484                   : 
2485                   : "m" (controlWord)
2486                   : ;
2487             }
2488         }
2489         else
2490             static assert(false);
2491     }
2492     else version (InlineX86Asm)
2493     {
2494         asm nothrow @nogc @safe
2495         {
2496             ldmxcsr controlWord;
2497         }
2498     }
2499     else
2500         static assert(0, "Not yet supported");
2501 }
2502 unittest
2503 {
2504     _mm_setcsr(_mm_getcsr());
2505 }
2506 
2507 /// Set packed single-precision (32-bit) floating-point elements with the supplied values in reverse order.
2508 __m128 _mm_setr_ps (float e3, float e2, float e1, float e0) pure @trusted
2509 {
2510     pragma(inline, true);
2511   
    // This small = void initialization wins a bit at all optimization levels in GDC,
    // and at -O0 in LDC.
2514     __m128 r = void;
2515     r.ptr[0] = e3;
2516     r.ptr[1] = e2;
2517     r.ptr[2] = e1;
2518     r.ptr[3] = e0;
2519     return r;
2520 }
2521 unittest
2522 {
2523     __m128 A = _mm_setr_ps(3, 2, 1, 546);
2524     float[4] correct = [3.0f, 2.0f, 1.0f, 546.0f];
2525     assert(A.array == correct);
2526 }
2527 
2528 /// Return vector of type `__m128` with all elements set to zero.
2529 __m128 _mm_setzero_ps() pure @trusted
2530 {
2531     pragma(inline, true);
2532 
2533     // Note: for all compilers, this works best in debug builds, and in DMD -O
2534     int4 r; 
2535     return cast(__m128)r;
2536 }
2537 unittest
2538 {
2539     __m128 R = _mm_setzero_ps();
2540     float[4] correct = [0.0f, 0, 0, 0];
2541     assert(R.array == correct);
2542 }
2543 
2544 /// Do a serializing operation on all store-to-memory instructions that were issued prior 
2545 /// to this instruction. Guarantees that every store instruction that precedes, in program order, 
2546 /// is globally visible before any store instruction which follows the fence in program order.
2547 void _mm_sfence() @trusted
2548 {
2549     version(GNU)
2550     {
2551         static if (GDC_with_SSE)
2552         {
2553             __builtin_ia32_sfence();
2554         }
2555         else version(X86)
2556         {
2557             asm pure nothrow @nogc @trusted
2558             {
2559                 "sfence;\n" : : : ;
2560             }
2561         }
2562         else
2563             static assert(false);
2564     }
2565     else static if (LDC_with_SSE)
2566     {
2567         __builtin_ia32_sfence();
2568     }
2569     else static if (DMD_with_asm)
2570     {
2571         // PERF: can't be inlined in DMD, probably because of that assembly.
2572         asm nothrow @nogc pure @safe
2573         {
2574             sfence;
2575         }
2576     }
2577     else static if (LDC_with_ARM64)
2578     {
2579         __builtin_arm_dmb(10); // dmb ishst
2580     }
2581     else version(LDC)
2582     {
2583         // When the architecture is unknown, generate a full memory barrier,
2584         // as the semantics of sfence do not really match those of atomics.
2585         llvm_memory_fence();
2586     }
2587     else
2588         static assert(false);
2589 }
2590 unittest
2591 {
2592     _mm_sfence();
2593 }
2594 
2595 
/// Shuffle 16-bit integers in `a` using the control in `imm8`.
/// Warning: the immediate shuffle value `imm8` is given at compile-time instead of runtime.
__m64 _mm_shuffle_pi16(int imm8)(__m64 a) pure @trusted
2597 {
2598     // PERF DMD + D_SIMD
2599     version(LDC)
2600     {
2601         return cast(__m64) shufflevectorLDC!(short4, ( (imm8 >> 0) & 3 ),
2602                                                      ( (imm8 >> 2) & 3 ),
2603                                                      ( (imm8 >> 4) & 3 ),
2604                                                      ( (imm8 >> 6) & 3 ))(cast(short4)a, cast(short4)a);
2605     }
2606     else
2607     {
2608         // GDC optimizes that correctly starting with -O2
2609         short4 sa = cast(short4)a;
2610         short4 r = void;
2611         r.ptr[0] = sa.array[ (imm8 >> 0) & 3 ];
2612         r.ptr[1] = sa.array[ (imm8 >> 2) & 3 ];
2613         r.ptr[2] = sa.array[ (imm8 >> 4) & 3 ];
2614         r.ptr[3] = sa.array[ (imm8 >> 6) & 3 ];
2615         return cast(__m64)r;
2616     }
2617 }
2618 unittest
2619 {
2620     __m64 A = _mm_setr_pi16(0, 1, 2, 3);
2621     enum int SHUFFLE = _MM_SHUFFLE(0, 1, 2, 3);
2622     short4 B = cast(short4) _mm_shuffle_pi16!SHUFFLE(A);
2623     short[4] expectedB = [ 3, 2, 1, 0 ];
2624     assert(B.array == expectedB);
2625 }
2626 
/// Shuffle single-precision (32-bit) floating-point elements in `a` and `b` using the control in `imm8`.
/// Warning: the immediate shuffle value `imm8` is given at compile-time instead of runtime.
2629 __m128 _mm_shuffle_ps(ubyte imm8)(__m128 a, __m128 b) pure @trusted
2630 {
2631     static if (GDC_with_SSE)
2632     {
2633         return __builtin_ia32_shufps(a, b, imm8);
2634     }
2635     else static if (DMD_with_DSIMD)
2636     {
2637         return cast(__m128) __simd(XMM.SHUFPS, a, b, imm8);
2638     }
2639     else version(LDC)
2640     {
2641         return shufflevectorLDC!(__m128, imm8 & 3, (imm8>>2) & 3, 
2642                                  4 + ((imm8>>4) & 3), 4 + ((imm8>>6) & 3) )(a, b);
2643     }
2644     else
2645     {
2646         float4 r = void;
2647         r.ptr[0] = a.array[ (imm8 >> 0) & 3 ];
2648         r.ptr[1] = a.array[ (imm8 >> 2) & 3 ];
2649         r.ptr[2] = b.array[ (imm8 >> 4) & 3 ];
2650         r.ptr[3] = b.array[ (imm8 >> 6) & 3 ];
2651         return r;
2652     }
2653 }
2654 unittest
2655 {
2656     __m128 A = _mm_setr_ps(0, 1, 2, 3);
2657     __m128 B = _mm_setr_ps(4, 5, 6, 7);
2658     __m128 C = _mm_shuffle_ps!0x9c(A, B);
2659     float[4] correct = [0.0f, 3, 5, 6];
2660     assert(C.array == correct);
2661 }
2662 
2663 /// Compute the square root of packed single-precision (32-bit) floating-point elements in `a`.
2664 __m128 _mm_sqrt_ps(__m128 a) @trusted
2665 {
2666     static if (GDC_with_SSE)
2667     {
2668         return __builtin_ia32_sqrtps(a);
2669     }
2670     else static if (DMD_with_DSIMD)
2671     {
2672         return cast(__m128) __simd(XMM.SQRTPS, a);
2673     }
2674     else version(LDC)
2675     {
2676         // Disappeared with LDC 1.11
2677         static if (__VERSION__ < 2081)
2678             return __builtin_ia32_sqrtps(a);
2679         else
2680         {
2681             // PERF: use llvm_sqrt on the vector, works better
2682             a[0] = llvm_sqrt(a[0]);
2683             a[1] = llvm_sqrt(a[1]);
2684             a[2] = llvm_sqrt(a[2]);
2685             a[3] = llvm_sqrt(a[3]);
2686             return a;
2687         }
2688     }
2689     else
2690     {
2691         a.ptr[0] = sqrt(a.array[0]);
2692         a.ptr[1] = sqrt(a.array[1]);
2693         a.ptr[2] = sqrt(a.array[2]);
2694         a.ptr[3] = sqrt(a.array[3]);
2695         return a;
2696     }
2697 }
2698 unittest
2699 {
2700     __m128 A = _mm_sqrt_ps(_mm_set1_ps(4.0f));
2701     assert(A.array[0] == 2.0f);
2702     assert(A.array[1] == 2.0f);
2703     assert(A.array[2] == 2.0f);
2704     assert(A.array[3] == 2.0f);
2705 }
2706 
2707 /// Compute the square root of the lower single-precision (32-bit) floating-point element in `a`, store it in the lower
2708 /// element, and copy the upper 3 packed elements from `a` to the upper elements of result.
2709 __m128 _mm_sqrt_ss(__m128 a) @trusted
2710 {
2711     static if (GDC_with_SSE)
2712     {
2713         return __builtin_ia32_sqrtss(a);
2714     }
2715     // PERF DMD
2716     // TODO: enable when https://issues.dlang.org/show_bug.cgi?id=23437 is fixed for good
2717     /*else static if (DMD_with_DSIMD)
2718     {
2719         return cast(__m128) __simd(XMM.SQRTSS, a);
2720     }*/
2721     else version(LDC)
2722     {
2723         a.ptr[0] = llvm_sqrt(a.array[0]);
2724         return a;
2725     }
2726     else
2727     {   
2728         a.ptr[0] = sqrt(a.array[0]);
2729         return a;
2730     }
2731 }
2732 unittest
2733 {
2734     __m128 A = _mm_sqrt_ss(_mm_set1_ps(4.0f));
2735     assert(A.array[0] == 2.0f);
2736     assert(A.array[1] == 4.0f);
2737     assert(A.array[2] == 4.0f);
2738     assert(A.array[3] == 4.0f);
2739 }
2740 
2741 /// Store 128-bits (composed of 4 packed single-precision (32-bit) floating-point elements) from `a` into memory. 
2742 /// `mem_addr` must be aligned on a 16-byte boundary or a general-protection exception may be generated.
2743 void _mm_store_ps (float* mem_addr, __m128 a) pure
2744 {
2745     pragma(inline, true);
2746     __m128* aligned = cast(__m128*)mem_addr;
2747     *aligned = a;
2748 }
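// A small example with a 16-byte aligned destination.
unittest
{
    align(16) float[4] A;
    _mm_store_ps(A.ptr, _mm_setr_ps(1.0f, 2, 3, 4));
    float[4] correct = [1.0f, 2, 3, 4];
    assert(A == correct);
}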
2749 
2750 deprecated("Use _mm_store1_ps instead") alias _mm_store_ps1 = _mm_store1_ps; ///
2751 
2752 /// Store the lower single-precision (32-bit) floating-point element from `a` into memory. 
2753 /// `mem_addr` does not need to be aligned on any particular boundary.
2754 void _mm_store_ss (float* mem_addr, __m128 a) pure @safe
2755 {
2756     pragma(inline, true);
2757     *mem_addr = a.array[0];
2758 }
2759 unittest
2760 {
2761     float a;
2762     _mm_store_ss(&a, _mm_set_ps(3, 2, 1, 546));
2763     assert(a == 546);
2764 }
2765 
2766 /// Store the lower single-precision (32-bit) floating-point element from `a` into 4 contiguous elements in memory. 
2767 /// `mem_addr` must be aligned on a 16-byte boundary or a general-protection exception may be generated.
2768 void _mm_store1_ps(float* mem_addr, __m128 a) pure @trusted // FUTURE: shouldn't be trusted, see #62
2769 {
2770     __m128* aligned = cast(__m128*)mem_addr;
2771     static if (DMD_with_DSIMD)
2772     {
2773         __m128 r = cast(__m128) __simd(XMM.SHUFPS, a, a, 0);
2774     }
2775     else
2776     {
2777         __m128 r; // PERF =void;
2778         r.ptr[0] = a.array[0];
2779         r.ptr[1] = a.array[0];
2780         r.ptr[2] = a.array[0];
2781         r.ptr[3] = a.array[0];
2782     }
2783     *aligned = r;
2784 }
2785 unittest
2786 {
2787     align(16) float[4] A;
2788     _mm_store1_ps(A.ptr, _mm_set_ss(42.0f));
2789     float[4] correct = [42.0f, 42, 42, 42];
2790     assert(A == correct);
2791 }
2792 
2793 /// Store the upper 2 single-precision (32-bit) floating-point elements from `a` into memory.
2794 void _mm_storeh_pi(__m64* p, __m128 a) pure @trusted
2795 {
2796     pragma(inline, true);
2797     long2 la = cast(long2)a;
2798     (*p).ptr[0] = la.array[1];
2799 }
2800 unittest
2801 {
2802     __m64 R = _mm_setzero_si64();
2803     long2 A = [13, 25];
2804     _mm_storeh_pi(&R, cast(__m128)A);
2805     assert(R.array[0] == 25);
2806 }
2807 
2808 /// Store the lower 2 single-precision (32-bit) floating-point elements from `a` into memory.
2809 void _mm_storel_pi(__m64* p, __m128 a) pure @trusted
2810 {
2811     pragma(inline, true);
2812     long2 la = cast(long2)a;
2813     (*p).ptr[0] = la.array[0];
2814 }
2815 unittest
2816 {
2817     __m64 R = _mm_setzero_si64();
2818     long2 A = [13, 25];
2819     _mm_storel_pi(&R, cast(__m128)A);
2820     assert(R.array[0] == 13);
2821 }
2822 
2823 /// Store 4 single-precision (32-bit) floating-point elements from `a` into memory in reverse order. 
2824 /// `mem_addr` must be aligned on a 16-byte boundary or a general-protection exception may be generated.
2825 void _mm_storer_ps(float* mem_addr, __m128 a) pure @trusted // FUTURE should not be trusted
2826 {
2827     __m128* aligned = cast(__m128*)mem_addr;
2828     static if (DMD_with_DSIMD)
2829     {
2830         __m128 r = cast(__m128) __simd(XMM.SHUFPS, a, a, 27);
2831     }
2832     else
2833     {
2834         __m128 r; // PERF =void;
2835         r.ptr[0] = a.array[3];
2836         r.ptr[1] = a.array[2];
2837         r.ptr[2] = a.array[1];
2838         r.ptr[3] = a.array[0];
2839     }
2840     *aligned = r;
2841 }
2842 unittest
2843 {
2844     align(16) float[4] A;
2845     _mm_storer_ps(A.ptr, _mm_setr_ps(1.0f, 2, 3, 4));
2846     float[4] correct = [4.0f, 3.0f, 2.0f, 1.0f];
2847     assert(A == correct);
2848 }
2849 
2850 /// Store 128-bits (composed of 4 packed single-precision (32-bit) floating-point elements) from `a` into memory. 
2851 /// `mem_addr` does not need to be aligned on any particular boundary.
2852 void _mm_storeu_ps(float* mem_addr, __m128 a) pure @trusted // FUTURE should not be trusted, see #62
2853 {
2854     pragma(inline, true);
2855     static if (DMD_with_DSIMD)
2856     {
2857         cast(void) __simd_sto(XMM.STOUPS, *cast(void16*)(cast(float*)mem_addr), a);
2858     }
2859     else static if (GDC_with_SSE)
2860     {
2861         __builtin_ia32_storeups(mem_addr, a); // better in -O0
2862     }
2863     else version(LDC)
2864     {
2865         storeUnaligned!(float4)(a, mem_addr);
2866     }
2867     else
2868     {
2869         mem_addr[0] = a.array[0];
2870         mem_addr[1] = a.array[1];
2871         mem_addr[2] = a.array[2];
2872         mem_addr[3] = a.array[3];
2873     }
2874 }
2875 unittest
2876 {
2877     __m128 A = _mm_setr_ps(1.0f, 2, 3, 4);
2878     align(16) float[6] R = [0.0f, 0, 0, 0, 0, 0];
2879     float[4] correct = [1.0f, 2, 3, 4];
2880     _mm_storeu_ps(&R[1], A);
2881     assert(R[1..5] == correct);
2882 }
2883 
2884 /// Store 64-bits of integer data from `a` into memory using a non-temporal memory hint.
2885 /// Note: non-temporal stores should be followed by `_mm_sfence()` for reader threads.
2886 void _mm_stream_pi (__m64* mem_addr, __m64 a) pure @trusted
2887 {
2888     _mm_stream_si64(cast(long*)mem_addr, a.array[0]);
2889 }
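// A small example storing 64 bits of integer data.
unittest
{
    __m64 R = _mm_setzero_si64();
    __m64 A = _mm_setr_pi16(1, 2, 3, 4);
    _mm_stream_pi(&R, A);
    assert(R.array[0] == A.array[0]);
}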
2890 
2891 /// Store 128-bits (composed of 4 packed single-precision (32-bit) floating-point elements) from 
/// `a` into memory using a non-temporal memory hint. `mem_addr` must be aligned on a 16-byte 
2893 /// boundary or a general-protection exception may be generated.
2894 /// Note: non-temporal stores should be followed by `_mm_sfence()` for reader threads.
2895 void _mm_stream_ps (float* mem_addr, __m128 a)
2896 {
2897     // PERF DMD D_SIMD
2898     static if (GDC_with_SSE)
2899     {
2900         return __builtin_ia32_movntps(mem_addr, a); 
2901     }
2902     else version(LDC)
2903     {
2904         enum prefix = `!0 = !{ i32 1 }`;
2905         enum ir = `
2906             store <4 x float> %1, <4 x float>* %0, align 16, !nontemporal !0
2907             ret void`;
2908         LDCInlineIREx!(prefix, ir, "", void, __m128*, float4)(cast(__m128*)mem_addr, a);
2909 
2910     }
2911     else
2912     {
2913         // Regular store instead.
2914         __m128* dest = cast(__m128*)mem_addr;
2915         *dest = a; // it's a regular move instead
2916     }
2917 }
2918 unittest
2919 {
2920     align(16) float[4] A;
2921     _mm_stream_ps(A.ptr, _mm_set1_ps(78.0f));
2922     assert(A[0] == 78.0f && A[1] == 78.0f && A[2] == 78.0f && A[3] == 78.0f);
2923 }
2924 
2925 /// Subtract packed single-precision (32-bit) floating-point elements in `b` from packed single-precision (32-bit) 
2926 /// floating-point elements in `a`.
2927 __m128 _mm_sub_ps(__m128 a, __m128 b) pure @safe
2928 {
2929     pragma(inline, true);
2930     return a - b;
2931 }
2932 unittest
2933 {
2934     __m128 a = [1.5f, -2.0f, 3.0f, 1.0f];
2935     a = _mm_sub_ps(a, a);
2936     float[4] correct = [0.0f, 0.0f, 0.0f, 0.0f];
2937     assert(a.array == correct);
2938 }
2939 
2940 /// Subtract the lower single-precision (32-bit) floating-point element in `b` from the lower single-precision (32-bit)
/// floating-point element in `a`, store the subtraction result in the lower element of result, and copy the upper 3 
/// packed elements from `a` to the upper elements of result.
2943 __m128 _mm_sub_ss(__m128 a, __m128 b) pure @safe
2944 {
2945     static if (DMD_with_DSIMD)
2946         return cast(__m128) __simd(XMM.SUBSS, a, b);
2947     else static if (GDC_with_SSE)
2948         return __builtin_ia32_subss(a, b);
2949     else
2950     {
2951         a[0] -= b[0];
2952         return a;
2953     }
2954 }
2955 unittest
2956 {
2957     __m128 a = [1.5f, -2.0f, 3.0f, 1.0f];
2958     a = _mm_sub_ss(a, a);
2959     float[4] correct = [0.0f, -2.0, 3.0f, 1.0f];
2960     assert(a.array == correct);
2961 }
2962 
2963 /// Transpose the 4x4 matrix formed by the 4 rows of single-precision (32-bit) floating-point elements in row0, row1, 
2964 /// row2, and row3, and store the transposed matrix in these vectors (row0 now contains column 0, etc.).
2965 void _MM_TRANSPOSE4_PS (ref __m128 row0, ref __m128 row1, ref __m128 row2, ref __m128 row3) pure @safe
2966 {
2967     __m128 tmp3, tmp2, tmp1, tmp0;
2968     tmp0 = _mm_unpacklo_ps(row0, row1);
2969     tmp2 = _mm_unpacklo_ps(row2, row3);
2970     tmp1 = _mm_unpackhi_ps(row0, row1);
2971     tmp3 = _mm_unpackhi_ps(row2, row3);
2972     row0 = _mm_movelh_ps(tmp0, tmp2);
2973     row1 = _mm_movehl_ps(tmp2, tmp0);
2974     row2 = _mm_movelh_ps(tmp1, tmp3);
2975     row3 = _mm_movehl_ps(tmp3, tmp1);
2976 }
2977 unittest
2978 {
2979     __m128 l0 = _mm_setr_ps(0, 1, 2, 3);
2980     __m128 l1 = _mm_setr_ps(4, 5, 6, 7);
2981     __m128 l2 = _mm_setr_ps(8, 9, 10, 11);
2982     __m128 l3 = _mm_setr_ps(12, 13, 14, 15);
2983     _MM_TRANSPOSE4_PS(l0, l1, l2, l3);
2984     float[4] r0 = [0.0f, 4, 8, 12];
2985     float[4] r1 = [1.0f, 5, 9, 13];
2986     float[4] r2 = [2.0f, 6, 10, 14];
2987     float[4] r3 = [3.0f, 7, 11, 15];
2988     assert(l0.array == r0);
2989     assert(l1.array == r1);
2990     assert(l2.array == r2);
2991     assert(l3.array == r3);
2992 }
2993 
// Note: the only difference between these `_mm_ucomi*_ss` intrinsics and the `_mm_comi*_ss`
//       ones they alias is the signalling behaviour on quiet NaNs. Aliasing them is technically
//       incorrect, but the case where you would want to differentiate between qNaN and sNaN
//       and then treat them differently on purpose seems extremely rare.
2998 alias _mm_ucomieq_ss = _mm_comieq_ss;
2999 alias _mm_ucomige_ss = _mm_comige_ss;
3000 alias _mm_ucomigt_ss = _mm_comigt_ss;
3001 alias _mm_ucomile_ss = _mm_comile_ss;
3002 alias _mm_ucomilt_ss = _mm_comilt_ss;
3003 alias _mm_ucomineq_ss = _mm_comineq_ss;
3004 
3005 /// Return vector of type `__m128` with undefined elements.
3006 __m128 _mm_undefined_ps() pure @safe
3007 {
3008     pragma(inline, true);
3009     __m128 undef = void;
3010     return undef;
3011 }
3012 
/// Unpack and interleave single-precision (32-bit) floating-point elements from the high half of `a` and `b`.
__m128 _mm_unpackhi_ps (__m128 a, __m128 b) pure @trusted
{
    // PERF GDC use intrinsic
    static if (DMD_with_DSIMD)
    {
        return cast(__m128) __simd(XMM.UNPCKHPS, a, b);
    }
    else version(LDC)
    {
        return shufflevectorLDC!(__m128, 2, 6, 3, 7)(a, b);
    }
    else
    {
        __m128 r; // PERF =void;
        r.ptr[0] = a.array[2];
        r.ptr[1] = b.array[2];
        r.ptr[2] = a.array[3];
        r.ptr[3] = b.array[3];
        return r;
    }
}
unittest
{
    __m128 A = _mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f);
    __m128 B = _mm_setr_ps(5.0f, 6.0f, 7.0f, 8.0f);
    __m128 R = _mm_unpackhi_ps(A, B);
    float[4] correct = [3.0f, 7.0f, 4.0f, 8.0f];
    assert(R.array == correct);
}

/// Unpack and interleave single-precision (32-bit) floating-point elements from the low half of `a` and `b`.
__m128 _mm_unpacklo_ps (__m128 a, __m128 b) pure @trusted
{
    // PERF GDC use intrinsic
    static if (DMD_with_DSIMD)
    {
        return cast(__m128) __simd(XMM.UNPCKLPS, a, b);
    }
    else version(LDC)
    {
        return shufflevectorLDC!(__m128, 0, 4, 1, 5)(a, b);
    }
    else
    {
        __m128 r; // PERF =void;
        r.ptr[0] = a.array[0];
        r.ptr[1] = b.array[0];
        r.ptr[2] = a.array[1];
        r.ptr[3] = b.array[1];
        return r;
    }
}
unittest
{
    __m128 A = _mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f);
    __m128 B = _mm_setr_ps(5.0f, 6.0f, 7.0f, 8.0f);
    __m128 R = _mm_unpacklo_ps(A, B);
    float[4] correct = [1.0f, 5.0f, 2.0f, 6.0f];
    assert(R.array == correct);
}

/// Compute the bitwise XOR of packed single-precision (32-bit) floating-point elements in `a` and `b`.
__m128 _mm_xor_ps (__m128 a, __m128 b) pure @safe
{
    return cast(__m128)(cast(__m128i)a ^ cast(__m128i)b);
}
unittest
{
    __m128 A = cast(__m128) _mm_set1_epi32(0x80000000);
    __m128 B = _mm_setr_ps(4.0f, -5.0, -9.5f, float.infinity);
    __m128 C = _mm_xor_ps(A, B);
    float[4] correct = [-4.0f, 5.0, 9.5f, -float.infinity];
    assert(C.array == correct);
}

private
{
    // Returns: `true` if the pointer is suitably aligned.
    bool isPointerAligned(void* p, size_t alignment) pure
    {
        assert(alignment != 0);
        return ( cast(size_t)p & (alignment - 1) ) == 0;
    }

    // Returns: next pointer aligned with alignment bytes.
    void* nextAlignedPointer(void* start, size_t alignment) pure
    {
        return cast(void*)nextMultipleOf(cast(size_t)(start), alignment);
    }

    // Returns number of bytes to actually allocate when asking
    // for a particular alignment
    @nogc size_t requestedSize(size_t askedSize, size_t alignment) pure
    {
        enum size_t pointerSize = size_t.sizeof;
        return askedSize + alignment - 1 + pointerSize * 3;
    }

    // Stores the raw pointer returned by malloc, plus the requested size and alignment,
    // just below the aligned pointer that is returned to the caller.
    @nogc void* storeRawPointerPlusInfo(void* raw, size_t size, size_t alignment) pure
    {
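        // Memory layout produced below (an explanatory note derived from this code):
        //
        //   raw ... [padding] [alignment][size][raw pointer] | user data ...
        //                                                     ^ returned `aligned`
        //
        // The three size_t slots stored just below `aligned` let the free and realloc
        // paths recover the original malloc pointer, the requested size and the
        // requested alignment.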
        enum size_t pointerSize = size_t.sizeof;
        char* start = cast(char*)raw + pointerSize * 3;
        void* aligned = nextAlignedPointer(start, alignment);
        void** rawLocation = cast(void**)(cast(char*)aligned - pointerSize);
        *rawLocation = raw;
        size_t* sizeLocation = cast(size_t*)(cast(char*)aligned - 2 * pointerSize);
        *sizeLocation = size;
        size_t* alignmentLocation = cast(size_t*)(cast(char*)aligned - 3 * pointerSize);
        *alignmentLocation = alignment;
        assert( isPointerAligned(aligned, alignment) );
        return aligned;
    }

    // Returns: x, multiple of powerOfTwo, so that x >= n.
    @nogc size_t nextMultipleOf(size_t n, size_t powerOfTwo) pure nothrow
    {
        // check power-of-two
        assert( (powerOfTwo != 0) && ((powerOfTwo & (powerOfTwo - 1)) == 0));

        size_t mask = ~(powerOfTwo - 1);
        return (n + powerOfTwo - 1) & mask;
    }

    void* alignedReallocImpl(bool PreserveDataIfResized)(void* aligned, size_t size, size_t alignment)
    {
        if (aligned is null)
            return _mm_malloc(size, alignment);

        assert(alignment != 0);
        assert(isPointerAligned(aligned, alignment));

        size_t previousSize = *cast(size_t*)(cast(char*)aligned - size_t.sizeof * 2);
        size_t prevAlignment = *cast(size_t*)(cast(char*)aligned - size_t.sizeof * 3);

        // It is illegal to change the alignment across calls.
        assert(prevAlignment == alignment);

        void* raw = *cast(void**)(cast(char*)aligned - size_t.sizeof);
        size_t request = requestedSize(size, alignment);
        size_t previousRequest = requestedSize(previousSize, alignment);
        assert(previousRequest - request == previousSize - size);

        // Heuristic: if the new request is no larger than the existing allocation
        //            and more than a quarter of it, keep the same pointer (shrink in place).
        // PERF it seems like `realloc` should do that, not us.
        if ( (previousRequest < request * 4) && (request <= previousRequest) )
            return aligned;

        void* newRaw = malloc(request);
        if (request > 0 && newRaw == null) // malloc(0) can validly return null
            onOutOfMemoryError();

        void* newAligned = storeRawPointerPlusInfo(newRaw, size, alignment);

        static if (PreserveDataIfResized)
        {
            size_t minSize = size < previousSize ? size : previousSize;
            memcpy(newAligned, aligned, minSize); // ok to use memcpy: newAligned is into new memory, always different from aligned
        }

        // Free previous data
        _mm_free(aligned);
        assert(isPointerAligned(newAligned, alignment));
        return newAligned;
    }
}

unittest
{
    assert(nextMultipleOf(0, 4) == 0);
    assert(nextMultipleOf(1, 4) == 4);
    assert(nextMultipleOf(2, 4) == 4);
    assert(nextMultipleOf(3, 4) == 4);
    assert(nextMultipleOf(4, 4) == 4);
    assert(nextMultipleOf(5, 4) == 8);

    {
        void* p = _mm_malloc(23, 16);
        assert(p !is null);
        assert(((cast(size_t)p) & 0xf) == 0);
        _mm_free(p);
    }

    void* nullAlloc = _mm_malloc(0, 32);
    assert(nullAlloc != null);
    _mm_free(nullAlloc);
}
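
// An additional sketch (an added check that calls the private `alignedReallocImpl`
// helper directly): when an allocation grows, the data-preserving variant copies the
// old bytes into a new, still-aligned block.
unittest
{
    ubyte* p = cast(ubyte*) _mm_malloc(4, 16);
    foreach (i; 0 .. 4)
        p[i] = cast(ubyte)(i + 1);

    ubyte* q = cast(ubyte*) alignedReallocImpl!true(p, 256, 16);
    assert(isPointerAligned(q, 16));
    foreach (i; 0 .. 4)
        assert(q[i] == i + 1); // previous contents preserved
    _mm_free(q);
}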

// For some reason, the order of declaration matters for this one,
// so it is placed out of order here.
// Note: this is just another name for _mm_cvtss_si32.
alias _mm_cvt_ss2si = _mm_cvtss_si32;