/**
 * MMX intrinsics.
 *
 * Copyright: Copyright Guillaume Piolat 2019-2020.
 * License:   $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
 */
module inteli.mmx;

public import inteli.types;
import inteli.internals;

import inteli.xmmintrin;
import inteli.emmintrin;

nothrow @nogc:

// Important: you don't need to call _mm_empty when using the "MMX" capabilities of
// intel-intrinsics. The intrinsics only generate the right IR; cleaning up FPU registers
// is up to the codegen. intel-intrinsics provides the semantics, not the exact instructions.

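// A minimal illustration of the point above (a sketch; any intrinsic from this module
// would do): MMX-style intrinsics may be freely followed by floating-point code,
// with no _mm_empty in between.
unittest
{
    __m64 sum = _mm_add_pi16(_mm_set1_pi16(2), _mm_set1_pi16(3));
    double d = 2.0 * 0.5; // FPU math right after packed-integer code: no cleanup needed
    assert((cast(short4)sum).array[0] == 5);
    assert(d == 1.0);
}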

/// Add packed 16-bit integers in `a` and `b`.
__m64 _mm_add_pi16 (__m64 a, __m64 b) pure @safe
{
    return cast(__m64)(cast(short4)a + cast(short4)b);
}
unittest
{
    short4 R = cast(short4) _mm_add_pi16(_mm_set1_pi16(4), _mm_set1_pi16(3));
    short[4] correct = [7, 7, 7, 7];
    assert(R.array == correct);
}

/// Add packed 32-bit integers in `a` and `b`.
__m64 _mm_add_pi32 (__m64 a, __m64 b) pure @safe
{
    return cast(__m64)(cast(int2)a + cast(int2)b);
}
unittest
{
    int2 R = cast(int2) _mm_add_pi32(_mm_set1_pi32(4), _mm_set1_pi32(3));
    int[2] correct = [7, 7];
    assert(R.array == correct);
}

/// Add packed 8-bit integers in `a` and `b`.
__m64 _mm_add_pi8 (__m64 a, __m64 b) pure @safe
{
    return cast(__m64)(cast(byte8)a + cast(byte8)b);
}
unittest
{
    byte8 R = cast(byte8) _mm_add_pi8(_mm_set1_pi8(127), _mm_set1_pi8(-128));
    byte[8] correct = [-1, -1, -1, -1, -1, -1, -1, -1];
    assert(R.array == correct);
}

/// Add packed 16-bit integers in `a` and `b` using signed saturation.
// PERF: PADDSW not generated
__m64 _mm_adds_pi16(__m64 a, __m64 b) pure @trusted
{
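    // No dedicated 64-bit code path: widen to 128-bit, reuse the SSE2 intrinsic,
    // then narrow back to 64-bit. Same pattern for the other saturated ops below.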
    return to_m64(_mm_adds_epi16(to_m128i(a), to_m128i(b)));
}
unittest
{
    short4 res = cast(short4) _mm_adds_pi16(_mm_set_pi16(3, 2, 1, 0),
                                            _mm_set_pi16(3, 2, 1, 0));
    static immutable short[4] correctResult = [0, 2, 4, 6];
    assert(res.array == correctResult);
}

/// Add packed 8-bit integers in `a` and `b` using signed saturation.
// PERF: PADDSB not generated
__m64 _mm_adds_pi8(__m64 a, __m64 b) pure @trusted
{
    return to_m64(_mm_adds_epi8(to_m128i(a), to_m128i(b)));
}
unittest
{
    byte8 res = cast(byte8) _mm_adds_pi8(_mm_set_pi8(7, 6, 5, 4, 3, 2, 1, 0),
                                         _mm_set_pi8(7, 6, 5, 4, 3, 2, 1, 0));
    static immutable byte[8] correctResult = [0, 2, 4, 6, 8, 10, 12, 14];
    assert(res.array == correctResult);
}

/// Add packed 16-bit integers in `a` and `b` using unsigned saturation.
// PERF: PADDUSW not generated
__m64 _mm_adds_pu16(__m64 a, __m64 b) pure @trusted
{
    return to_m64(_mm_adds_epu16(to_m128i(a), to_m128i(b)));
}
unittest
{
    short4 res = cast(short4) _mm_adds_pu16(_mm_set_pi16(3, 2, cast(short)65535, 0),
                                            _mm_set_pi16(3, 2, 1, 0));
    static immutable short[4] correctResult = [0, cast(short)65535, 4, 6];
    assert(res.array == correctResult);
}

/// Add packed 8-bit integers in `a` and `b` using unsigned saturation.
// PERF: PADDUSB not generated
__m64 _mm_adds_pu8(__m64 a, __m64 b) pure @trusted
{
    return to_m64(_mm_adds_epu8(to_m128i(a), to_m128i(b)));
}
unittest
{
    byte8 res = cast(byte8) _mm_adds_pu8(_mm_set_pi8(7, 6, 5, 4, 3, 2, cast(byte)255, 0),
                                         _mm_set_pi8(7, 6, 5, 4, 3, 2, 1, 0));
    static immutable byte[8] correctResult = [0, cast(byte)255, 4, 6, 8, 10, 12, 14];
    assert(res.array == correctResult);
}

/// Compute the bitwise AND of 64 bits (representing integer data) in `a` and `b`.
__m64 _mm_and_si64 (__m64 a, __m64 b) pure @safe
{
    return a & b;
}
unittest
{
    __m64 A = [7];
    __m64 B = [14];
    __m64 R = _mm_and_si64(A, B);
    assert(R.array[0] == 6);
}

/// Compute the bitwise NOT of 64 bits (representing integer data) in `a` and then AND with `b`.
__m64 _mm_andnot_si64 (__m64 a, __m64 b) pure @safe
{
    return (~a) & b;
}
unittest
{
    __m64 A = [7];
    __m64 B = [14];
    __m64 R = _mm_andnot_si64(A, B);
    assert(R.array[0] == 8);
}

/// Compare packed 16-bit integers in `a` and `b` for equality.
__m64 _mm_cmpeq_pi16 (__m64 a, __m64 b) pure @safe
{
    static if (GDC_with_MMX)
    {
        return cast(__m64) __builtin_ia32_pcmpeqw(cast(short4)a, cast(short4)b);
    }
    else
    {
        return cast(__m64) equalMask!short4(cast(short4)a, cast(short4)b);
    }
}
unittest
{
    short4   A = [-3, -2, -1,  0];
    short4   B = [ 4,  3,  2,  0];
    short[4] E = [ 0,  0,  0, -1];
    short4   R = cast(short4)(_mm_cmpeq_pi16(cast(__m64)A, cast(__m64)B));
    assert(R.array == E);
}

/// Compare packed 32-bit integers in `a` and `b` for equality.
__m64 _mm_cmpeq_pi32 (__m64 a, __m64 b) pure @safe
{
    static if (GDC_with_MMX)
    {
        return cast(__m64) __builtin_ia32_pcmpeqd(cast(int2)a, cast(int2)b);
    }
    else
    {
        return cast(__m64) equalMask!int2(cast(int2)a, cast(int2)b);
    }
}
unittest
{
    int2   A = [-3, -2];
    int2   B = [ 4, -2];
    int[2] E = [ 0, -1];
    int2   R = cast(int2)(_mm_cmpeq_pi32(cast(__m64)A, cast(__m64)B));
    assert(R.array == E);
}

/// Compare packed 8-bit integers in `a` and `b` for equality.
__m64 _mm_cmpeq_pi8 (__m64 a, __m64 b) pure @safe
{
    static if (GDC_with_MMX)
    {
        return cast(__m64) __builtin_ia32_pcmpeqb(cast(ubyte8)a, cast(ubyte8)b);
    }
    else
    {
        return cast(__m64) equalMask!byte8(cast(byte8)a, cast(byte8)b);
    }
}
unittest
{
    __m64 A = _mm_setr_pi8(1, 2, 3, 1, 2, 1, 1, 2);
    __m64 B = _mm_setr_pi8(2, 2, 1, 2, 3, 1, 2, 3);
    byte8 C = cast(byte8) _mm_cmpeq_pi8(A, B);
    byte[8] correct =     [0,-1, 0, 0, 0,-1, 0, 0];
    assert(C.array == correct);
}

/// Compare packed 16-bit integers in `a` and `b` for greater-than.
__m64 _mm_cmpgt_pi16 (__m64 a, __m64 b) pure @safe
{
    static if (GDC_with_MMX)
    {
        return cast(__m64) __builtin_ia32_pcmpgtw (cast(short4)a, cast(short4)b);
    }
    else
    {
        return cast(__m64) greaterMask!short4(cast(short4)a, cast(short4)b);
    }
}
unittest
{
    short4   A = [-3, -2, -1,  0];
    short4   B = [ 4,  3,  2, -1];
    short[4] E = [ 0,  0,  0, -1];
    short4   R = cast(short4)(_mm_cmpgt_pi16(cast(__m64)A, cast(__m64)B));
    assert(R.array == E);
}

/// Compare packed 32-bit integers in `a` and `b` for greater-than.
__m64 _mm_cmpgt_pi32 (__m64 a, __m64 b) pure @safe
{
    static if (GDC_with_MMX)
    {
        return cast(__m64) __builtin_ia32_pcmpgtd (cast(int2)a, cast(int2)b);
    }
    else
    {
        return cast(__m64) greaterMask!int2(cast(int2)a, cast(int2)b);
    }
}
unittest
{
    int2   A = [-3,  2];
    int2   B = [ 4, -2];
    int[2] E = [ 0, -1];
    int2   R = cast(int2)(_mm_cmpgt_pi32(cast(__m64)A, cast(__m64)B));
    assert(R.array == E);
}

/// Compare packed 8-bit integers in `a` and `b` for greater-than.
__m64 _mm_cmpgt_pi8 (__m64 a, __m64 b) pure @safe
{
    static if (GDC_with_MMX)
    {
        return cast(__m64) __builtin_ia32_pcmpgtb (cast(ubyte8)a, cast(ubyte8)b);
    }
    else
    {
        return cast(__m64) greaterMask!byte8(cast(byte8)a, cast(byte8)b);
    }
}
unittest
{
    __m64 A = _mm_setr_pi8(1, 2, 3, 1, 2, 1, 1, 2);
    __m64 B = _mm_setr_pi8(2, 2, 1, 2, 3, 1, 2, 3);
    byte8 C = cast(byte8) _mm_cmpgt_pi8(A, B);
    byte[8] correct =     [0, 0,-1, 0, 0, 0, 0, 0];
    assert(C.array == correct);
}

/// Copy 64-bit integer `a` to `dst`.
long _mm_cvtm64_si64 (__m64 a) pure @safe
{
    long1 la = cast(long1)a;
    return la.array[0];
}
unittest
{
    __m64 A = _mm_setr_pi32(2, 1);
    assert(_mm_cvtm64_si64(A) == 0x100000002);
}

/// Copy 32-bit integer `a` to the lower elements of `dst`, and zero the upper element of `dst`.
__m64 _mm_cvtsi32_si64 (int a) pure @trusted
{
    __m64 r = void;
    r.ptr[0] = a;
    return r;
}
unittest
{
    __m64 R = _mm_cvtsi32_si64(-1);
    assert(R.array[0] == -1);
}

/// Copy 64-bit integer `a` to `dst`.
__m64 _mm_cvtsi64_m64 (long a) pure @trusted
{
    __m64 r = void;
    r.ptr[0] = a;
    return r;
}
unittest
{
    __m64 R = _mm_cvtsi64_m64(0x123456789A);
    assert(R.array[0] == 0x123456789A);
}

/// Get the lower 32-bit integer in `a`.
int _mm_cvtsi64_si32 (__m64 a) pure @safe
{
    int2 r = cast(int2)a;
    return r.array[0];
}
unittest
{
    __m64 A = _mm_setr_pi32(-6, 5);
    int R = _mm_cvtsi64_si32(A);
    assert(R == -6);
}

/// Empty the MMX state, which marks the x87 FPU registers as available for
/// use by x87 instructions.
/// This instruction is supposed to be used at the end of all MMX technology procedures.
/// This is useless when using `intel-intrinsics`, at least with LDC and DMD.
void _mm_empty() pure @safe
{
    // do nothing, see comment on top of file
    // TODO: not sure for GDC, do something?
}
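
unittest
{
    // Merely checks that it compiles and runs; it is a no-op here.
    _mm_empty();
}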


deprecated alias _m_empty = _mm_empty; /// Deprecated intrinsics.
deprecated alias _m_from_int =  _mm_cvtsi32_si64; ///ditto
deprecated alias _m_from_int64 = _mm_cvtsi64_m64; ///ditto

/// Multiply packed 16-bit integers in `a` and `b`, producing intermediate 32-bit integers.
/// Horizontally add adjacent pairs of intermediate 32-bit integers, and store the results.
__m64 _mm_madd_pi16 (__m64 a, __m64 b) pure @safe
{
    return to_m64(_mm_madd_epi16(to_m128i(a), to_m128i(b)));
}
unittest
{
    short4 A = [-32768, -32768, 32767, 32767];
    short4 B = [-32768, -32768, 32767, 32767];
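    // Note: (-32768)*(-32768) + (-32768)*(-32768) = 2^31 wraps to int.min below,
    // matching PMADDWD, which does not saturate.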
    int2 R = cast(int2) _mm_madd_pi16(cast(__m64)A, cast(__m64)B);
    int[2] correct = [-2147483648, 2*32767*32767];
    assert(R.array == correct);
}

/// Multiply the packed 16-bit integers in `a` and `b`, producing intermediate 32-bit integers,
/// and store the high 16 bits of the intermediate integers.
__m64 _mm_mulhi_pi16 (__m64 a, __m64 b) pure @safe
{
    return to_m64(_mm_mulhi_epi16(to_m128i(a), to_m128i(b)));
}
unittest
{
    __m64 A = _mm_setr_pi16(4, 8, -16, 7);
    __m64 B = _mm_set1_pi16(16384);
    short4 R = cast(short4)_mm_mulhi_pi16(A, B);
    short[4] correct = [1, 2, -4, 1];
    assert(R.array == correct);
}

/// Multiply the packed 16-bit integers in `a` and `b`, producing intermediate 32-bit integers,
/// and store the low 16 bits of the intermediate integers.
__m64 _mm_mullo_pi16 (__m64 a, __m64 b) pure @safe
{
    return to_m64(_mm_mullo_epi16(to_m128i(a), to_m128i(b)));
}
unittest
{
    __m64 A = _mm_setr_pi16(4, 1, 16, 7);
    __m64 B = _mm_set1_pi16(16384);
    short4 R = cast(short4)_mm_mullo_pi16(A, B);
    short[4] correct = [0, 16384, 0, -16384];
    assert(R.array == correct);
}

/// Compute the bitwise OR of 64 bits in `a` and `b`.
__m64 _mm_or_si64 (__m64 a, __m64 b) pure @safe
{
    return a | b;
}
unittest
{
    __m64 A = _mm_setr_pi16(255, 1, -1, 0);
    __m64 B = _mm_set1_pi16(15);
    short4 R = cast(short4)_mm_or_si64(A, B);
    short[4] correct =     [255, 15, -1, 15];
    assert(R.array == correct);
}

/// Convert packed 16-bit integers from `a` and `b` to packed 8-bit integers using signed saturation.
__m64 _mm_packs_pi16 (__m64 a, __m64 b) pure @trusted
{
    int4 p = cast(int4) _mm_packs_epi16(to_m128i(a), to_m128i(b));
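    // In the 128-bit pack result, int 0 holds packed `a` and int 2 holds packed `b`;
    // keep those two to form the 64-bit result.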
    int2 r;
    r.ptr[0] = p.array[0];
    r.ptr[1] = p.array[2];
    return cast(__m64)r;
}
unittest
{
    __m64 A = _mm_setr_pi16(256, -129, 254, 0);
    byte8 R = cast(byte8) _mm_packs_pi16(A, A);
    byte[8] correct = [127, -128, 127, 0, 127, -128, 127, 0];
    assert(R.array == correct);
}

/// Convert packed 32-bit integers from `a` and `b` to packed 16-bit integers using signed saturation.
__m64 _mm_packs_pi32 (__m64 a, __m64 b) pure @trusted
{
    int4 p = cast(int4) _mm_packs_epi32(to_m128i(a), to_m128i(b));
    int2 r;
    r.ptr[0] = p.array[0];
    r.ptr[1] = p.array[2];
    return cast(__m64)r;
}
unittest
{
    __m64 A = _mm_setr_pi32(100000, -100000);
    short4 R = cast(short4) _mm_packs_pi32(A, A);
    short[4] correct = [32767, -32768, 32767, -32768];
    assert(R.array == correct);
}

/// Convert packed 16-bit integers from `a` and `b` to packed 8-bit integers using unsigned saturation.
__m64 _mm_packs_pu16 (__m64 a, __m64 b) pure @trusted
{
    int4 p = cast(int4) _mm_packus_epi16(to_m128i(a), to_m128i(b));
    int2 r;
    r.ptr[0] = p.array[0];
    r.ptr[1] = p.array[2];
    return cast(__m64)r;
}
unittest
{
    __m64 A = _mm_setr_pi16(256, -129, 254, 0);
    byte8 R = cast(byte8) _mm_packs_pu16(A, A);
    ubyte[8] correct = [255, 0, 254, 0, 255, 0, 254, 0];
    assert(R.array == cast(byte[8])correct);
}

deprecated alias
    _m_packssdw = _mm_packs_pi32,     /// Deprecated intrinsics.
    _m_packsswb = _mm_packs_pi16,     ///ditto
    _m_packuswb = _mm_packs_pu16,     ///ditto
    _m_paddb = _mm_add_pi8,           ///ditto
    _m_paddd = _mm_add_pi32,          ///ditto
    _m_paddsb = _mm_adds_pi8,         ///ditto
    _m_paddsw = _mm_adds_pi16,        ///ditto
    _m_paddusb = _mm_adds_pu8,        ///ditto
    _m_paddusw = _mm_adds_pu16,       ///ditto
    _m_paddw = _mm_add_pi16,          ///ditto
    _m_pand = _mm_and_si64,           ///ditto
    _m_pandn = _mm_andnot_si64,       ///ditto
    _m_pcmpeqb = _mm_cmpeq_pi8,       ///ditto
    _m_pcmpeqd = _mm_cmpeq_pi32,      ///ditto
    _m_pcmpeqw = _mm_cmpeq_pi16,      ///ditto
    _m_pcmpgtb = _mm_cmpgt_pi8,       ///ditto
    _m_pcmpgtd = _mm_cmpgt_pi32,      ///ditto
    _m_pcmpgtw = _mm_cmpgt_pi16,      ///ditto
    _m_pmaddwd = _mm_madd_pi16,       ///ditto
    _m_pmulhw = _mm_mulhi_pi16,       ///ditto
    _m_pmullw = _mm_mullo_pi16,       ///ditto
    _m_por = _mm_or_si64,             ///ditto
    _m_pslld = _mm_sll_pi32,          ///ditto
    _m_pslldi = _mm_slli_pi32,        ///ditto
    _m_psllq = _mm_sll_si64,          ///ditto
    _m_psllqi = _mm_slli_si64,        ///ditto
    _m_psllw = _mm_sll_pi16,          ///ditto
    _m_psllwi = _mm_slli_pi16,        ///ditto
    _m_psrad = _mm_sra_pi32,          ///ditto
    _m_psradi = _mm_srai_pi32,        ///ditto
    _m_psraw = _mm_sra_pi16,          ///ditto
    _m_psrawi = _mm_srai_pi16,        ///ditto
    _m_psrld = _mm_srl_pi32,          ///ditto
    _m_psrldi = _mm_srli_pi32,        ///ditto
    _m_psrlq = _mm_srl_si64,          ///ditto
    _m_psrlqi = _mm_srli_si64,        ///ditto
    _m_psrlw = _mm_srl_pi16,          ///ditto
    _m_psrlwi = _mm_srli_pi16,        ///ditto
    _m_psubb = _mm_sub_pi8,           ///ditto
    _m_psubd = _mm_sub_pi32,          ///ditto
    _m_psubsb = _mm_subs_pi8,         ///ditto
    _m_psubsw = _mm_subs_pi16,        ///ditto
    _m_psubusb = _mm_subs_pu8,        ///ditto
    _m_psubusw = _mm_subs_pu16,       ///ditto
    _m_psubw = _mm_sub_pi16,          ///ditto
    _m_punpckhbw = _mm_unpackhi_pi8,  ///ditto
    _m_punpckhdq = _mm_unpackhi_pi32, ///ditto
    _m_punpckhwd = _mm_unpackhi_pi16, ///ditto
    _m_punpcklbw = _mm_unpacklo_pi8,  ///ditto
    _m_punpckldq = _mm_unpacklo_pi32, ///ditto
    _m_punpcklwd = _mm_unpacklo_pi16, ///ditto
    _m_pxor = _mm_xor_si64;           ///ditto

/// Set packed 16-bit integers with the supplied values.
__m64 _mm_set_pi16 (short e3, short e2, short e1, short e0) pure @trusted
{
    short[4] arr = [e0, e1, e2, e3];
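    // Note the order: arguments come most-significant-element first, storage is e0 .. e3.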
    return *cast(__m64*)(arr.ptr);
}
unittest
{
    short4 R = cast(short4) _mm_set_pi16(3, 2, 1, 0);
    short[4] correct = [0, 1, 2, 3];
    assert(R.array == correct);
}

/// Set packed 32-bit integers with the supplied values.
__m64 _mm_set_pi32 (int e1, int e0) pure @trusted
{
    int[2] arr = [e0, e1];
    return *cast(__m64*)(arr.ptr);
}
unittest
{
    int2 R = cast(int2) _mm_set_pi32(1, 0);
    int[2] correct = [0, 1];
    assert(R.array == correct);
}

/// Set packed 8-bit integers with the supplied values.
__m64 _mm_set_pi8 (byte e7, byte e6, byte e5, byte e4, byte e3, byte e2, byte e1, byte e0) pure @trusted
{
    byte[8] arr = [e0, e1, e2, e3, e4, e5, e6, e7];
    return *cast(__m64*)(arr.ptr);
}
unittest
{
    byte8 R = cast(byte8) _mm_set_pi8(7, 6, 5, 4, 3, 2, 1, 0);
    byte[8] correct = [0, 1, 2, 3, 4, 5, 6, 7];
    assert(R.array == correct);
}

/// Broadcast 16-bit integer `a` to all elements.
__m64 _mm_set1_pi16 (short a) pure @trusted
{
    return cast(__m64)(short4(a));
}
unittest
{
    short4 R = cast(short4) _mm_set1_pi16(44);
    short[4] correct = [44, 44, 44, 44];
    assert(R.array == correct);
}

/// Broadcast 32-bit integer `a` to all elements.
__m64 _mm_set1_pi32 (int a) pure @trusted
{
    return cast(__m64)(int2(a));
}
unittest
{
    int2 R = cast(int2) _mm_set1_pi32(43);
    int[2] correct = [43, 43];
    assert(R.array == correct);
}

/// Broadcast 8-bit integer `a` to all elements.
__m64 _mm_set1_pi8 (byte a) pure @trusted
{
    return cast(__m64)(byte8(a));
}
unittest
{
    byte8 R = cast(byte8) _mm_set1_pi8(42);
    byte[8] correct = [42, 42, 42, 42, 42, 42, 42, 42];
    assert(R.array == correct);
}

/// Set packed 16-bit integers with the supplied values in reverse order.
__m64 _mm_setr_pi16 (short e3, short e2, short e1, short e0) pure @trusted
{
    short[4] arr = [e3, e2, e1, e0];
    return *cast(__m64*)(arr.ptr);
}
unittest
{
    short4 R = cast(short4) _mm_setr_pi16(0, 1, 2, 3);
    short[4] correct = [0, 1, 2, 3];
    assert(R.array == correct);
}

/// Set packed 32-bit integers with the supplied values in reverse order.
__m64 _mm_setr_pi32 (int e1, int e0) pure @trusted
{
    int[2] arr = [e1, e0];
    return *cast(__m64*)(arr.ptr);
}
unittest
{
    int2 R = cast(int2) _mm_setr_pi32(0, 1);
    int[2] correct = [0, 1];
    assert(R.array == correct);
}

/// Set packed 8-bit integers with the supplied values in reverse order.
__m64 _mm_setr_pi8 (byte e7, byte e6, byte e5, byte e4, byte e3, byte e2, byte e1, byte e0) pure @trusted
{
    byte[8] arr = [e7, e6, e5, e4, e3, e2, e1, e0];
    return *cast(__m64*)(arr.ptr);
}
unittest
{
    byte8 R = cast(byte8) _mm_setr_pi8(0, 1, 2, 3, 4, 5, 6, 7);
    byte[8] correct = [0, 1, 2, 3, 4, 5, 6, 7];
    assert(R.array == correct);
}

/// Return vector of type `__m64` with all elements set to zero.
__m64 _mm_setzero_si64 () pure @trusted
{
    __m64 r;
    r.ptr[0] = 0;
    return r;
}
unittest
{
    __m64 R = _mm_setzero_si64();
    assert(R.array[0] == 0);
}

/// Shift packed 16-bit integers in `a` left by `bits` while shifting in zeros.
deprecated("Use _mm_slli_pi16 instead.") __m64 _mm_sll_pi16 (__m64 a, __m64 bits) pure @safe
{
    return to_m64(_mm_sll_epi16(to_m128i(a), to_m128i(bits)));
}

/// Shift packed 32-bit integers in `a` left by `bits` while shifting in zeros.
deprecated("Use _mm_slli_pi32 instead.") __m64 _mm_sll_pi32 (__m64 a, __m64 bits) pure @safe
{
    return to_m64(_mm_sll_epi32(to_m128i(a), to_m128i(bits)));
}

/// Shift 64-bit integer `a` left by `bits` while shifting in zeros.
deprecated("Use _mm_slli_si64 instead.") __m64 _mm_sll_si64 (__m64 a, __m64 bits) pure @safe
{
    return to_m64(_mm_sll_epi64(to_m128i(a), to_m128i(bits)));
}

/// Shift packed 16-bit integers in `a` left by `imm8` while shifting in zeros.
__m64 _mm_slli_pi16 (__m64 a, int imm8) pure @safe
{
    return to_m64(_mm_slli_epi16(to_m128i(a), imm8));
}
unittest
{
    __m64 A = _mm_setr_pi16(-4, -5, 6, 7);
    short4 B = cast(short4)( _mm_slli_pi16(A, 1) );
    short[4] correct = [ -8, -10, 12, 14 ];
    assert(B.array == correct);
}

/// Shift packed 32-bit integers in `a` left by `imm8` while shifting in zeros.
__m64 _mm_slli_pi32 (__m64 a, int imm8) pure @safe
{
    return to_m64(_mm_slli_epi32(to_m128i(a), imm8));
}
unittest
{
    __m64 A = _mm_setr_pi32(-4, 5);
    int2 B = cast(int2)( _mm_slli_pi32(A, 1) );
    int[2] correct = [ -8, 10 ];
    assert(B.array == correct);
}

/// Shift 64-bit integer `a` left by `imm8` while shifting in zeros.
__m64 _mm_slli_si64 (__m64 a, int imm8) pure @safe
{
    return to_m64(_mm_slli_epi64(to_m128i(a), imm8));
}
unittest
{
    __m64 A = _mm_cvtsi64_m64(-1);
    long1 R = cast(long1)( _mm_slli_si64(A, 1) );
    long[1] correct = [ -2 ];
    assert(R.array == correct);
}

/// Shift packed 16-bit integers in `a` right by `bits` while shifting in sign bits.
deprecated("Use _mm_srai_pi16 instead.") __m64 _mm_sra_pi16 (__m64 a, __m64 bits) pure @safe
{
    return to_m64(_mm_sra_epi16(to_m128i(a), to_m128i(bits)));
}

/// Shift packed 32-bit integers in `a` right by `bits` while shifting in sign bits.
deprecated("Use _mm_srai_pi32 instead.") __m64 _mm_sra_pi32 (__m64 a, __m64 bits) pure @safe
{
    return to_m64(_mm_sra_epi32(to_m128i(a), to_m128i(bits)));
}

/// Shift packed 16-bit integers in `a` right by `imm8` while shifting in sign bits.
__m64 _mm_srai_pi16 (__m64 a, int imm8) pure @safe
{
    return to_m64(_mm_srai_epi16(to_m128i(a), imm8));
}
unittest
{
    __m64 A = _mm_setr_pi16(-4, -5, 6, 7);
    short4 B = cast(short4)( _mm_srai_pi16(A, 1) );
    short[4] correct = [ -2, -3, 3, 3 ];
    assert(B.array == correct);
}

/// Shift packed 32-bit integers in `a` right by `imm8` while shifting in sign bits.
__m64 _mm_srai_pi32 (__m64 a, int imm8) pure @safe
{
    return to_m64(_mm_srai_epi32(to_m128i(a), imm8));
}
unittest
{
    __m64 A = _mm_setr_pi32(-4, 5);
    int2 B = cast(int2)( _mm_srai_pi32(A, 1) );
    int[2] correct = [ -2, 2 ];
    assert(B.array == correct);
}

/// Shift packed 16-bit integers in `a` right by `bits` while shifting in zeros.
deprecated("Use _mm_srli_pi16 instead.") __m64 _mm_srl_pi16 (__m64 a, __m64 bits) pure @safe
{
    return to_m64(_mm_srl_epi16(to_m128i(a), to_m128i(bits)));
}

/// Shift packed 32-bit integers in `a` right by `bits` while shifting in zeros.
deprecated("Use _mm_srli_pi32 instead.") __m64 _mm_srl_pi32 (__m64 a, __m64 bits) pure @safe
{
    return to_m64(_mm_srl_epi32(to_m128i(a), to_m128i(bits)));
}

/// Shift 64-bit integer `a` right by `bits` while shifting in zeros.
deprecated("Use _mm_srli_si64 instead.") __m64 _mm_srl_si64 (__m64 a, __m64 bits) pure @safe
{
    return to_m64(_mm_srl_epi64(to_m128i(a), to_m128i(bits)));
}

/// Shift packed 16-bit integers in `a` right by `imm8` while shifting in zeros.
__m64 _mm_srli_pi16 (__m64 a, int imm8) pure @safe
{
    return to_m64(_mm_srli_epi16(to_m128i(a), imm8));
}
unittest
{
    __m64 A = _mm_setr_pi16(-4, -5, 6, 7);
    short4 B = cast(short4)( _mm_srli_pi16(A, 1) );
    short[4] correct = [ 0x7ffe, 0x7ffd, 3, 3 ];
    assert(B.array == correct);
}

/// Shift packed 32-bit integers in `a` right by `imm8` while shifting in zeros.
__m64 _mm_srli_pi32 (__m64 a, int imm8) pure @safe
{
    return to_m64(_mm_srli_epi32(to_m128i(a), imm8));
}
unittest
{
    __m64 A = _mm_setr_pi32(-4, 5);
    int2 B = cast(int2)( _mm_srli_pi32(A, 1) );
    int[2] correct = [ 0x7ffffffe, 2 ];
    assert(B.array == correct);
}

/// Shift 64-bit integer `a` right by `imm8` while shifting in zeros.
__m64 _mm_srli_si64 (__m64 a, int imm8) pure @safe
{
    return to_m64(_mm_srli_epi64(to_m128i(a), imm8));
}
unittest
{
    __m64 A = _mm_cvtsi64_m64(-1);
    long1 R = cast(long1)( _mm_srli_si64(A, 1) );
    long[1] correct = [ 0x7fff_ffff_ffff_ffff ];
    assert(R.array == correct);
}

/// Subtract packed 16-bit integers in `b` from packed 16-bit integers in `a`.
__m64 _mm_sub_pi16 (__m64 a, __m64 b) pure @safe
{
    return cast(__m64)(cast(short4)a - cast(short4)b);
}
unittest
{
    short4 R = cast(short4) _mm_sub_pi16(_mm_setr_pi16(cast(short)65534,  1, 5, -32768),
                                         _mm_setr_pi16(cast(short)65535, 16, 4, 4));
    static immutable short[4] correct =                            [ -1,-15, 1, 32764];
    assert(R.array == correct);
}

/// Subtract packed 32-bit integers in `b` from packed 32-bit integers in `a`.
__m64 _mm_sub_pi32 (__m64 a, __m64 b) pure @safe
{
    return cast(__m64)(cast(int2)a - cast(int2)b);
}
unittest
{
    int2 R = cast(int2) _mm_sub_pi32(_mm_setr_pi32( 10,   4),
                                     _mm_setr_pi32( 15, -70));
    static immutable int[2] correct =             [ -5,  74];
    assert(R.array == correct);
}

/// Subtract packed 8-bit integers in `b` from packed 8-bit integers in `a`.
__m64 _mm_sub_pi8 (__m64 a, __m64 b) pure @safe
{
    return cast(__m64)(cast(byte8)a - cast(byte8)b);
}
unittest
{
    byte8 R = cast(byte8) _mm_sub_pi8(_mm_setr_pi8(cast(byte)254, 127, 13, 12, 11, 10, 9, -128),
                                      _mm_setr_pi8(cast(byte)255, 120, 14, 42, 11, 10, 9, 8));
    static immutable byte[8] correct =                 [      -1,   7, -1,-30,  0,  0, 0, 120 ];
    assert(R.array == correct);
}

/// Subtract packed 16-bit integers in `b` from packed 16-bit integers in `a` using saturation.
__m64 _mm_subs_pi16 (__m64 a, __m64 b) pure @safe
{
    return to_m64(_mm_subs_epi16(to_m128i(a), to_m128i(b)));
}
unittest
{
    short4 R = cast(short4) _mm_subs_pi16(_mm_setr_pi16(cast(short)65534,  1, 5, -32768),
                                          _mm_setr_pi16(cast(short)65535, 16, 4, 4));
    static immutable short[4] correct =                             [ -1,-15, 1, -32768];
    assert(R.array == correct);
}

/// Subtract packed 8-bit integers in `b` from packed 8-bit integers in `a` using saturation.
__m64 _mm_subs_pi8 (__m64 a, __m64 b) pure @safe
{
    return to_m64(_mm_subs_epi8(to_m128i(a), to_m128i(b)));
}
unittest
{
    byte8 R = cast(byte8) _mm_subs_pi8(_mm_setr_pi8(cast(byte)254, 127, 13, 12, 11, 10, 9, -128),
                                       _mm_setr_pi8(cast(byte)255, 120, 14, 42, 11, 10, 9, 8));
    static immutable byte[8] correct =                 [       -1,   7, -1,-30,  0,  0, 0, -128 ];
    assert(R.array == correct);
}

/// Subtract packed unsigned 16-bit integers in `b` from packed unsigned 16-bit integers in `a`
/// using saturation.
__m64 _mm_subs_pu16 (__m64 a, __m64 b) pure @safe
{
    return to_m64(_mm_subs_epu16(to_m128i(a), to_m128i(b)));
}
unittest
{
    short4 R = cast(short4) _mm_subs_pu16(_mm_setr_pi16(cast(short)65534,  1, 5, 4),
                                          _mm_setr_pi16(cast(short)65535, 16, 4, 4));
    static immutable short[4] correct =                              [ 0,  0, 1, 0];
    assert(R.array == correct);
}

/// Subtract packed unsigned 8-bit integers in `b` from packed unsigned 8-bit integers in `a`
/// using saturation.
__m64 _mm_subs_pu8 (__m64 a, __m64 b) pure @safe
{
    return to_m64(_mm_subs_epu8(to_m128i(a), to_m128i(b)));
}
unittest
{
    byte8 R = cast(byte8) _mm_subs_pu8(_mm_setr_pi8(cast(byte)254, 127, 13, 12, 11, 10, 9, 8),
                                       _mm_setr_pi8(cast(byte)255, 120, 14, 42, 11, 10, 9, 8));
    static immutable byte[8] correct =                 [        0,   7,  0,  0,  0,  0, 0, 0 ];
    assert(R.array == correct);
}

deprecated alias _m_to_int = _mm_cvtsi64_si32;  /// Deprecated intrinsics.
deprecated alias _m_to_int64 = _mm_cvtm64_si64; ///ditto

/// Unpack and interleave 16-bit integers from the high half of `a` and `b`.
__m64 _mm_unpackhi_pi16 (__m64 a, __m64 b) pure @trusted
{
    version(LDC)
    {
        // Avoiding this shufflevector leads to bad performance on LDC.
        return cast(__m64) shufflevector!(short4, 2, 6, 3, 7)(cast(short4)a, cast(short4)b);
    }
    else
    {
        short4 ia = cast(short4)a;
        short4 ib = cast(short4)b;
        short4 r;
        r.ptr[0] = ia.array[2];
        r.ptr[1] = ib.array[2];
        r.ptr[2] = ia.array[3];
        r.ptr[3] = ib.array[3];
        return cast(__m64)r;
    }
}
unittest
{
    __m64 A = _mm_setr_pi16(4, 8, -16, 7);
    __m64 B = _mm_setr_pi16(5, 9,  -3, 10);
    short4 R = cast(short4) _mm_unpackhi_pi16(A, B);
    short[4] correct = [-16, -3, 7, 10];
    assert(R.array == correct);
}

/// Unpack and interleave 32-bit integers from the high half of `a` and `b`.
__m64 _mm_unpackhi_pi32 (__m64 a, __m64 b) pure @trusted
{
    // Generates punpckldq as far back as LDC 1.0.0 -O1
    // (yes, LLVM generates punpckldq here, to reuse SSE2 instructions)
    int2 ia = cast(int2)a;
    int2 ib = cast(int2)b;
    int2 r;
    r.ptr[0] = ia.array[1];
    r.ptr[1] = ib.array[1];
    return cast(__m64)r;
}
unittest
{
    __m64 A = _mm_setr_pi32(4, 8);
    __m64 B = _mm_setr_pi32(5, 9);
    int2 R = cast(int2) _mm_unpackhi_pi32(A, B);
    int[2] correct = [8, 9];
    assert(R.array == correct);
}

/// Unpack and interleave 8-bit integers from the high half of `a` and `b`.
__m64 _mm_unpackhi_pi8 (__m64 a, __m64 b) pure @trusted
{
    version(LDC)
    {
        return cast(__m64) shufflevector!(byte8, 4, 12, 5, 13, 6, 14, 7, 15)(cast(byte8)a, cast(byte8)b);
    }
    else
    {
        byte8 ia = cast(byte8)a;
        byte8 ib = cast(byte8)b;
        byte8 r;
        r.ptr[0] = ia.array[4];
        r.ptr[1] = ib.array[4];
        r.ptr[2] = ia.array[5];
        r.ptr[3] = ib.array[5];
        r.ptr[4] = ia.array[6];
        r.ptr[5] = ib.array[6];
        r.ptr[6] = ia.array[7];
        r.ptr[7] = ib.array[7];
        return cast(__m64)r;
    }
}
unittest
{
    __m64 A = _mm_setr_pi8( 1,  2,  3,  4,  5,  6,  7,  8);
    __m64 B = _mm_setr_pi8(-1, -2, -3, -4, -5, -6, -7, -8);
    byte8 R = cast(byte8) _mm_unpackhi_pi8(A, B);
    byte[8] correct = [5, -5, 6, -6, 7, -7, 8, -8];
    assert(R.array == correct);
}

/// Unpack and interleave 16-bit integers from the low half of `a` and `b`.
__m64 _mm_unpacklo_pi16 (__m64 a, __m64 b) pure @trusted
{
    // Generates punpcklwd since LDC 1.0.0 -O1
    short4 ia = cast(short4)a;
    short4 ib = cast(short4)b;
    short4 r;
    r.ptr[0] = ia.array[0];
    r.ptr[1] = ib.array[0];
    r.ptr[2] = ia.array[1];
    r.ptr[3] = ib.array[1];
    return cast(__m64)r;
}
unittest
{
    __m64 A = _mm_setr_pi16(4, 8, -16, 7);
    __m64 B = _mm_setr_pi16(5, 9,  -3, 10);
    short4 R = cast(short4) _mm_unpacklo_pi16(A, B);
    short[4] correct = [4, 5, 8, 9];
    assert(R.array == correct);
}

/// Unpack and interleave 32-bit integers from the low half of `a` and `b`.
__m64 _mm_unpacklo_pi32 (__m64 a, __m64 b) pure @trusted
{
    // x86: Generates punpckldq as far back as LDC 1.0.0 -O1
    // ARM: Generates zip as far back as LDC 1.8.0 -O1
    int2 ia = cast(int2)a;
    int2 ib = cast(int2)b;
    int2 r;
    r.ptr[0] = ia.array[0];
    r.ptr[1] = ib.array[0];
    return cast(__m64)r;
}
unittest
{
    __m64 A = _mm_setr_pi32(4, 8);
    __m64 B = _mm_setr_pi32(5, 9);
    int2 R = cast(int2) _mm_unpacklo_pi32(A, B);
    int[2] correct = [4, 5];
    assert(R.array == correct);
}

/// Unpack and interleave 8-bit integers from the low half of `a` and `b`.
__m64 _mm_unpacklo_pi8 (__m64 a, __m64 b) pure @trusted
{
    version(LDC)
    {
        return cast(__m64) shufflevector!(byte8, 0, 8, 1, 9, 2, 10, 3, 11)(cast(byte8)a, cast(byte8)b);
    }
    else
    {
        byte8 ia = cast(byte8)a;
        byte8 ib = cast(byte8)b;
        byte8 r;
        r.ptr[0] = ia.array[0];
        r.ptr[1] = ib.array[0];
        r.ptr[2] = ia.array[1];
        r.ptr[3] = ib.array[1];
        r.ptr[4] = ia.array[2];
        r.ptr[5] = ib.array[2];
        r.ptr[6] = ia.array[3];
        r.ptr[7] = ib.array[3];
        return cast(__m64)r;
    }
}
unittest
{
    __m64 A = _mm_setr_pi8( 1,  2,  3,  4,  5,  6,  7,  8);
    __m64 B = _mm_setr_pi8(-1, -2, -3, -4, -5, -6, -7, -8);
    byte8 R = cast(byte8) _mm_unpacklo_pi8(A, B);
    byte[8] correct = [1, -1, 2, -2, 3, -3, 4, -4];
    assert(R.array == correct);
}

/// Compute the bitwise XOR of 64 bits (representing integer data) in `a` and `b`.
__m64 _mm_xor_si64 (__m64 a, __m64 b) pure @safe
{
    return a ^ b;
}
unittest
{
    __m64 A = _mm_setr_pi16(255, 1, -1, 0);
    __m64 B = _mm_set1_pi16(15);
    short4 R = cast(short4)_mm_xor_si64(A, B);
    short[4] correct =     [240, 14, -16, 15];
    assert(R.array == correct);
}