/**
* MMX intrinsics.
*
* Copyright: Copyright Auburn Sounds 2019.
* License:   $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
* Authors:   Guillaume Piolat
* Macros:
*      GUIDE = https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=$0
*
*/
module inteli.mmx;

public import inteli.types;
import inteli.internals;

import inteli.xmmintrin;
import inteli.emmintrin;

nothrow @nogc:

// Important: you don't need to call _mm_empty when using the "MMX" capabilities of
// intel-intrinsics. The library only provides the semantics and generates the right IR;
// cleaning up the FPU registers is left to the code generator.


/// Add packed 16-bit integers in `a` and `b`.
__m64 _mm_add_pi16 (__m64 a, __m64 b)
{
    return cast(__m64)(cast(short4)a + cast(short4)b);
}
unittest
{
    short4 R = cast(short4) _mm_add_pi16(_mm_set1_pi16(4), _mm_set1_pi16(3));
    short[4] correct = [7, 7, 7, 7];
    assert(R.array == correct);
}

/// Add packed 32-bit integers in `a` and `b`.
__m64 _mm_add_pi32 (__m64 a, __m64 b)
{
    return cast(__m64)(cast(int2)a + cast(int2)b);
}
unittest
{
    int2 R = cast(int2) _mm_add_pi32(_mm_set1_pi32(4), _mm_set1_pi32(3));
    int[2] correct = [7, 7];
    assert(R.array == correct);
}

/// Add packed 8-bit integers in `a` and `b`.
__m64 _mm_add_pi8 (__m64 a, __m64 b)
{
    return cast(__m64)(cast(byte8)a + cast(byte8)b);
}
unittest
{
    byte8 R = cast(byte8) _mm_add_pi8(_mm_set1_pi8(127), _mm_set1_pi8(-128));
    byte[8] correct = [-1, -1, -1, -1, -1, -1, -1, -1];
    assert(R.array == correct);
}

/// Add packed 16-bit integers in `a` and `b` using signed saturation.
// PERF: PADDSW not generated
__m64 _mm_adds_pi16(__m64 a, __m64 b) pure @trusted
{
    return to_m64(_mm_adds_epi16(to_m128i(a), to_m128i(b)));
}
unittest
{
    short4 res = cast(short4) _mm_adds_pi16(_mm_set_pi16(3, 2, 1, 0),
                                            _mm_set_pi16(3, 2, 1, 0));
    static immutable short[4] correctResult = [0, 2, 4, 6];
    assert(res.array == correctResult);
}

/// Add packed 8-bit integers in `a` and `b` using signed saturation.
// PERF: PADDSB not generated
__m64 _mm_adds_pi8(__m64 a, __m64 b) pure @trusted
{
    return to_m64(_mm_adds_epi8(to_m128i(a), to_m128i(b)));
}
unittest
{
    byte8 res = cast(byte8) _mm_adds_pi8(_mm_set_pi8(7, 6, 5, 4, 3, 2, 1, 0),
                                         _mm_set_pi8(7, 6, 5, 4, 3, 2, 1, 0));
    static immutable byte[8] correctResult = [0, 2, 4, 6, 8, 10, 12, 14];
    assert(res.array == correctResult);
}

/// Add packed 16-bit integers in `a` and `b` using unsigned saturation.
// PERF: PADDUSW not generated
__m64 _mm_adds_pu16(__m64 a, __m64 b) pure @trusted
{
    return to_m64(_mm_adds_epu16(to_m128i(a), to_m128i(b)));
}
unittest
{
    short4 res = cast(short4) _mm_adds_pu16(_mm_set_pi16(3, 2, cast(short)65535, 0),
                                            _mm_set_pi16(3, 2, 1, 0));
    static immutable short[4] correctResult = [0, cast(short)65535, 4, 6];
    assert(res.array == correctResult);
}

/// Add packed 8-bit integers in `a` and `b` using unsigned saturation.
// PERF: PADDUSB not generated
__m64 _mm_adds_pu8(__m64 a, __m64 b) pure @trusted
{
    return to_m64(_mm_adds_epu8(to_m128i(a), to_m128i(b)));
}
unittest
{
    byte8 res = cast(byte8) _mm_adds_pu8(_mm_set_pi8(7, 6, 5, 4, 3, 2, cast(byte)255, 0),
                                         _mm_set_pi8(7, 6, 5, 4, 3, 2, 1, 0));
    static immutable byte[8] correctResult = [0, cast(byte)255, 4, 6, 8, 10, 12, 14];
    assert(res.array == correctResult);
}
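
// Illustrative example (not part of the original API): unsigned saturating adds
// are the natural fit for 8-bit pixel math, where a plain `+` would wrap around.
// Brightening clamps at 255 instead of wrapping back to 0.
unittest
{
    __m64 pixels = _mm_setr_pi8(cast(byte)200, cast(byte)250, 3, 0, 0, 0, 0, 0);
    byte8 brightened = cast(byte8) _mm_adds_pu8(pixels, _mm_set1_pi8(10));
    ubyte[8] correct = [210, 255, 13, 10, 10, 10, 10, 10];
    assert(brightened.array == cast(byte[8])correct);
}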

/// Compute the bitwise AND of 64 bits (representing integer data) in `a` and `b`.
__m64 _mm_and_si64 (__m64 a, __m64 b) pure @safe
{
    return a & b;
}
unittest
{
    __m64 A = [7];
    __m64 B = [14];
    __m64 R = _mm_and_si64(A, B);
    assert(R.array[0] == 6);
}

/// Compute the bitwise NOT of 64 bits (representing integer data) in `a`, and then AND with `b`.
__m64 _mm_andnot_si64 (__m64 a, __m64 b)
{
    return (~a) & b;
}
unittest
{
    __m64 A = [7];
    __m64 B = [14];
    __m64 R = _mm_andnot_si64(A, B);
    assert(R.array[0] == 8);
}

/// Compare packed 16-bit integers in `a` and `b` for equality.
__m64 _mm_cmpeq_pi16 (__m64 a, __m64 b) pure @safe
{
    static if (GDC_with_MMX)
    {
        return cast(__m64) __builtin_ia32_pcmpeqw(cast(short4)a, cast(short4)b);
    }
    else
    {
        return cast(__m64) equalMask!short4(cast(short4)a, cast(short4)b);
    }
}
unittest
{
    short4   A = [-3, -2, -1,  0];
    short4   B = [ 4,  3,  2,  1];
    short[4] E = [ 0,  0,  0,  0];
    short4   R = cast(short4)(_mm_cmpeq_pi16(cast(__m64)A, cast(__m64)B));
    assert(R.array == E);
}

/// Compare packed 32-bit integers in `a` and `b` for equality.
__m64 _mm_cmpeq_pi32 (__m64 a, __m64 b) pure @safe
{
    static if (GDC_with_MMX)
    {
        return cast(__m64) __builtin_ia32_pcmpeqd(cast(int2)a, cast(int2)b);
    }
    else
    {
        return cast(__m64) equalMask!int2(cast(int2)a, cast(int2)b);
    }
}
unittest
{
    int2   A = [-3, -2];
    int2   B = [ 4, -2];
    int[2] E = [ 0, -1];
    int2   R = cast(int2)(_mm_cmpeq_pi32(cast(__m64)A, cast(__m64)B));
    assert(R.array == E);
}

/// Compare packed 8-bit integers in `a` and `b` for equality.
__m64 _mm_cmpeq_pi8 (__m64 a, __m64 b) pure @safe
{
    static if (GDC_with_MMX)
    {
        return cast(__m64) __builtin_ia32_pcmpeqb(cast(byte8)a, cast(byte8)b);
    }
    else
    {
        return cast(__m64) equalMask!byte8(cast(byte8)a, cast(byte8)b);
    }
}
unittest
{
    __m64 A = _mm_setr_pi8(1, 2, 3, 1, 2, 1, 1, 2);
    __m64 B = _mm_setr_pi8(2, 2, 1, 2, 3, 1, 2, 3);
    byte8 C = cast(byte8) _mm_cmpeq_pi8(A, B);
    byte[8] correct =     [0,-1, 0, 0, 0,-1, 0, 0];
    assert(C.array == correct);
}

/// Compare packed 16-bit integers in `a` and `b` for greater-than.
__m64 _mm_cmpgt_pi16 (__m64 a, __m64 b) pure @safe
{
    static if (GDC_with_MMX)
    {
        return cast(__m64) __builtin_ia32_pcmpgtw (cast(short4)a, cast(short4)b);
    }
    else
    {
        return cast(__m64) greaterMask!short4(cast(short4)a, cast(short4)b);
    }
}
unittest
{
    short4   A = [-3, -2, -1,  0];
    short4   B = [ 4,  3,  2,  1];
    short[4] E = [ 0,  0,  0,  0];
    short4   R = cast(short4)(_mm_cmpgt_pi16(cast(__m64)A, cast(__m64)B));
    assert(R.array == E);
}

/// Compare packed 32-bit integers in `a` and `b` for greater-than.
__m64 _mm_cmpgt_pi32 (__m64 a, __m64 b) pure @safe
{
    static if (GDC_with_MMX)
    {
        return cast(__m64) __builtin_ia32_pcmpgtd (cast(int2)a, cast(int2)b);
    }
    else
    {
        return cast(__m64) greaterMask!int2(cast(int2)a, cast(int2)b);
    }
}
unittest
{
    int2   A = [-3,  2];
    int2   B = [ 4, -2];
    int[2] E = [ 0, -1];
    int2   R = cast(int2)(_mm_cmpgt_pi32(cast(__m64)A, cast(__m64)B));
    assert(R.array == E);
}

/// Compare packed 8-bit integers in `a` and `b` for greater-than.
__m64 _mm_cmpgt_pi8 (__m64 a, __m64 b) pure @safe
{
    static if (GDC_with_MMX)
    {
        return cast(__m64) __builtin_ia32_pcmpgtb (cast(byte8)a, cast(byte8)b);
    }
    else
    {
        return cast(__m64) greaterMask!byte8(cast(byte8)a, cast(byte8)b);
    }
}
unittest
{
    __m64 A = _mm_setr_pi8(1, 2, 3, 1, 2, 1, 1, 2);
    __m64 B = _mm_setr_pi8(2, 2, 1, 2, 3, 1, 2, 3);
    byte8 C = cast(byte8) _mm_cmpgt_pi8(A, B);
    byte[8] correct =     [0, 0,-1, 0, 0, 0, 0, 0];
    assert(C.array == correct);
}
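
// Illustrative example (not part of the original API): the all-ones/all-zeros
// masks returned by the comparisons combine with _mm_and_si64, _mm_andnot_si64
// and _mm_or_si64 into a branchless per-element select, here a 16-bit maximum.
unittest
{
    __m64 a = _mm_setr_pi16(1, 9, -3, 7);
    __m64 b = _mm_setr_pi16(4, 2, -8, 7);
    __m64 mask = _mm_cmpgt_pi16(a, b);               // -1 where a > b, else 0
    __m64 r = _mm_or_si64(_mm_and_si64(mask, a),     // keep a where a > b
                          _mm_andnot_si64(mask, b)); // keep b elsewhere
    short[4] correct = [4, 9, -3, 7];
    assert((cast(short4)r).array == correct);
}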

/// Copy 64-bit integer `a` to `dst`.
long _mm_cvtm64_si64 (__m64 a) pure @safe
{
    long1 la = cast(long1)a;
    return la.array[0];
}
unittest
{
    __m64 A = _mm_setr_pi32(2, 1);
    long1 lA = cast(long1)A;
    assert(lA.array[0] == 0x100000002);
}

/// Copy 32-bit integer `a` to the lower element of `dst`, and zero the upper element of `dst`.
__m64 _mm_cvtsi32_si64 (int a) pure @trusted
{
    __m64 r = void;
    r.ptr[0] = cast(uint)a; // zero-extend: the upper 32 bits of the result are cleared
    return r;
}
unittest
{
    __m64 R = _mm_cvtsi32_si64(-1);
    assert(R.array[0] == 0xFFFF_FFFF);
}

/// Copy 64-bit integer `a` to `dst`.
__m64 _mm_cvtsi64_m64 (long a) pure @trusted
{
    __m64 r = void;
    r.ptr[0] = a;
    return r;
}
unittest
{
    __m64 R = _mm_cvtsi64_m64(0x123456789A);
    assert(R.array[0] == 0x123456789A);
}

/// Get the lower 32-bit integer in `a`.
int _mm_cvtsi64_si32 (__m64 a) pure @safe
{
    int2 r = cast(int2)a;
    return r.array[0];
}
unittest
{
    __m64 A = _mm_setr_pi32(-6, 5);
    int R = _mm_cvtsi64_si32(A);
    assert(R == -6);
}

/// Empty the MMX state, which marks the x87 FPU registers as available for
/// use by x87 instructions.
/// This instruction is supposed to be used at the end of all MMX technology procedures.
/// It is useless when using `intel-intrinsics`, at least with LDC and DMD.
void _mm_empty() pure @safe
{
    // Do nothing: see the comment at the top of this file.
    // TODO: not sure for GDC, do something?
}


deprecated alias _m_empty = _mm_empty; /// Deprecated intrinsics.
deprecated alias _m_from_int =  _mm_cvtsi32_si64; ///ditto
deprecated alias _m_from_int64 = _mm_cvtsi64_m64; ///ditto

/// Multiply packed 16-bit integers in `a` and `b`, producing intermediate 32-bit integers.
/// Horizontally add adjacent pairs of intermediate 32-bit integers.
__m64 _mm_madd_pi16 (__m64 a, __m64 b) pure @safe
{
    return to_m64(_mm_madd_epi16(to_m128i(a), to_m128i(b)));
}
unittest
{
    short4 A = [-32768, -32768, 32767, 32767];
    short4 B = [-32768, -32768, 32767, 32767];
    int2 R = cast(int2) _mm_madd_pi16(cast(__m64)A, cast(__m64)B);
    int[2] correct = [-2147483648, 2*32767*32767];
    assert(R.array == correct);
}
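
// Illustrative example (not part of the original API): PMADDWD is the building
// block of 16-bit dot products; one extra 32-bit add folds the two pairwise
// sums into a single scalar.
unittest
{
    __m64 x = _mm_setr_pi16(1, 2, 3, 4);
    __m64 y = _mm_setr_pi16(5, 6, 7, 8);
    __m64 pairs = _mm_madd_pi16(x, y);                         // [1*5+2*6, 3*7+4*8]
    __m64 total = _mm_add_pi32(pairs, _mm_srli_si64(pairs, 32));
    assert(_mm_cvtsi64_si32(total) == 70);                     // 5 + 12 + 21 + 32
}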

/// Multiply the packed 16-bit integers in `a` and `b`, producing intermediate 32-bit integers,
/// and store the high 16 bits of the intermediate integers.
__m64 _mm_mulhi_pi16 (__m64 a, __m64 b) pure @safe
{
    return to_m64(_mm_mulhi_epi16(to_m128i(a), to_m128i(b)));
}
unittest
{
    __m64 A = _mm_setr_pi16(4, 8, -16, 7);
    __m64 B = _mm_set1_pi16(16384);
    short4 R = cast(short4)_mm_mulhi_pi16(A, B);
    short[4] correct = [1, 2, -4, 1];
    assert(R.array == correct);
}

/// Multiply the packed 16-bit integers in `a` and `b`, producing intermediate 32-bit integers,
/// and store the low 16 bits of the intermediate integers.
__m64 _mm_mullo_pi16 (__m64 a, __m64 b) pure @safe
{
    return to_m64(_mm_mullo_epi16(to_m128i(a), to_m128i(b)));
}
unittest
{
    __m64 A = _mm_setr_pi16(4, 1, 16, 7);
    __m64 B = _mm_set1_pi16(16384);
    short4 R = cast(short4)_mm_mullo_pi16(A, B);
    short[4] correct = [0, 16384, 0, -16384];
    assert(R.array == correct);
}
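
// Illustrative example (not part of the original API): pairing _mm_mullo_pi16
// with _mm_mulhi_pi16 and an unpack reconstructs the full signed 32-bit
// products of the low two lanes.
unittest
{
    __m64 a = _mm_setr_pi16(1000, -2000, 0, 0);
    __m64 b = _mm_setr_pi16(3000,  4000, 0, 0);
    __m64 lo = _mm_mullo_pi16(a, b);
    __m64 hi = _mm_mulhi_pi16(a, b);
    int2 prod = cast(int2) _mm_unpacklo_pi16(lo, hi); // interleave low/high halves
    int[2] correct = [3_000_000, -8_000_000];
    assert(prod.array == correct);
}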

/// Compute the bitwise OR of 64 bits in `a` and `b`.
__m64 _mm_or_si64 (__m64 a, __m64 b) pure @safe
{
    return a | b;
}
unittest
{
    __m64 A = _mm_setr_pi16(255, 1, -1, 0);
    __m64 B = _mm_set1_pi16(15);
    short4 R = cast(short4)_mm_or_si64(A, B);
    short[4] correct =     [255, 15, -1, 15];
    assert(R.array == correct);
}

/// Convert packed 16-bit integers from `a` and `b` to packed 8-bit integers using signed saturation.
__m64 _mm_packs_pi16 (__m64 a, __m64 b) pure @trusted
{
    int4 p = cast(int4) _mm_packs_epi16(to_m128i(a), to_m128i(b));
    int2 r;
    r.ptr[0] = p.array[0];
    r.ptr[1] = p.array[2];
    return cast(__m64)r;
}
unittest
{
    __m64 A = _mm_setr_pi16(256, -129, 254, 0);
    byte8 R = cast(byte8) _mm_packs_pi16(A, A);
    byte[8] correct = [127, -128, 127, 0, 127, -128, 127, 0];
    assert(R.array == correct);
}

/// Convert packed 32-bit integers from `a` and `b` to packed 16-bit integers using signed saturation.
__m64 _mm_packs_pi32 (__m64 a, __m64 b) pure @trusted
{
    int4 p = cast(int4) _mm_packs_epi32(to_m128i(a), to_m128i(b));
    int2 r;
    r.ptr[0] = p.array[0];
    r.ptr[1] = p.array[2];
    return cast(__m64)r;
}
unittest
{
    __m64 A = _mm_setr_pi32(100000, -100000);
    short4 R = cast(short4) _mm_packs_pi32(A, A);
    short[4] correct = [32767, -32768, 32767, -32768];
    assert(R.array == correct);
}

/// Convert packed 16-bit integers from `a` and `b` to packed 8-bit integers using unsigned saturation.
__m64 _mm_packs_pu16 (__m64 a, __m64 b) pure @trusted
{
    int4 p = cast(int4) _mm_packus_epi16(to_m128i(a), to_m128i(b));
    int2 r;
    r.ptr[0] = p.array[0];
    r.ptr[1] = p.array[2];
    return cast(__m64)r;
}
unittest
{
    __m64 A = _mm_setr_pi16(256, -129, 254, 0);
    byte8 R = cast(byte8) _mm_packs_pu16(A, A);
    ubyte[8] correct = [255, 0, 254, 0, 255, 0, 254, 0];
    assert(R.array == cast(byte[8])correct);
}
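
// Illustrative example (not part of the original API): chaining the two
// saturating packs narrows 32-bit values all the way down to unsigned bytes,
// clamping at each step.
unittest
{
    __m64 x = _mm_setr_pi32(100000, -7);
    __m64 w = _mm_packs_pi32(x, x);                              // to shorts, signed saturation
    byte8 r = cast(byte8) _mm_packs_pu16(w, _mm_setzero_si64()); // to ubytes, unsigned saturation
    ubyte[8] correct = [255, 0, 255, 0, 0, 0, 0, 0];
    assert(r.array == cast(byte[8])correct);
}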

deprecated alias
    _m_packssdw = _mm_packs_pi32,     /// Deprecated intrinsics.
    _m_packsswb = _mm_packs_pi16,     ///ditto
    _m_packuswb = _mm_packs_pu16,     ///ditto
    _m_paddb = _mm_add_pi8,           ///ditto
    _m_paddd = _mm_add_pi32,          ///ditto
    _m_paddsb = _mm_adds_pi8,         ///ditto
    _m_paddsw = _mm_adds_pi16,        ///ditto
    _m_paddusb = _mm_adds_pu8,        ///ditto
    _m_paddusw = _mm_adds_pu16,       ///ditto
    _m_paddw = _mm_add_pi16,          ///ditto
    _m_pand = _mm_and_si64,           ///ditto
    _m_pandn = _mm_andnot_si64,       ///ditto
    _m_pcmpeqb = _mm_cmpeq_pi8,       ///ditto
    _m_pcmpeqd = _mm_cmpeq_pi32,      ///ditto
    _m_pcmpeqw = _mm_cmpeq_pi16,      ///ditto
    _m_pcmpgtb = _mm_cmpgt_pi8,       ///ditto
    _m_pcmpgtd = _mm_cmpgt_pi32,      ///ditto
    _m_pcmpgtw = _mm_cmpgt_pi16,      ///ditto
    _m_pmaddwd = _mm_madd_pi16,       ///ditto
    _m_pmulhw = _mm_mulhi_pi16,       ///ditto
    _m_pmullw = _mm_mullo_pi16,       ///ditto
    _m_por = _mm_or_si64,             ///ditto
    _m_pslld = _mm_sll_pi32,          ///ditto
    _m_pslldi = _mm_slli_pi32,        ///ditto
    _m_psllq = _mm_sll_si64,          ///ditto
    _m_psllqi = _mm_slli_si64,        ///ditto
    _m_psllw = _mm_sll_pi16,          ///ditto
    _m_psllwi = _mm_slli_pi16,        ///ditto
    _m_psrad = _mm_sra_pi32,          ///ditto
    _m_psradi = _mm_srai_pi32,        ///ditto
    _m_psraw = _mm_sra_pi16,          ///ditto
    _m_psrawi = _mm_srai_pi16,        ///ditto
    _m_psrld = _mm_srl_pi32,          ///ditto
    _m_psrldi = _mm_srli_pi32,        ///ditto
    _m_psrlq = _mm_srl_si64,          ///ditto
    _m_psrlqi = _mm_srli_si64,        ///ditto
    _m_psrlw = _mm_srl_pi16,          ///ditto
    _m_psrlwi = _mm_srli_pi16,        ///ditto
    _m_psubb = _mm_sub_pi8,           ///ditto
    _m_psubd = _mm_sub_pi32,          ///ditto
    _m_psubsb = _mm_subs_pi8,         ///ditto
    _m_psubsw = _mm_subs_pi16,        ///ditto
    _m_psubusb = _mm_subs_pu8,        ///ditto
    _m_psubusw = _mm_subs_pu16,       ///ditto
    _m_psubw = _mm_sub_pi16,          ///ditto
    _m_punpckhbw = _mm_unpackhi_pi8,  ///ditto
    _m_punpckhdq = _mm_unpackhi_pi32, ///ditto
    _m_punpckhwd = _mm_unpackhi_pi16, ///ditto
    _m_punpcklbw = _mm_unpacklo_pi8,  ///ditto
    _m_punpckldq = _mm_unpacklo_pi32, ///ditto
    _m_punpcklwd = _mm_unpacklo_pi16, ///ditto
    _m_pxor = _mm_xor_si64;           ///ditto

/// Set packed 16-bit integers with the supplied values.
__m64 _mm_set_pi16 (short e3, short e2, short e1, short e0) pure @trusted
{
    short[4] arr = [e0, e1, e2, e3];
    return *cast(__m64*)(arr.ptr);
}
unittest
{
    short4 R = cast(short4) _mm_set_pi16(3, 2, 1, 0);
    short[4] correct = [0, 1, 2, 3];
    assert(R.array == correct);
}

/// Set packed 32-bit integers with the supplied values.
__m64 _mm_set_pi32 (int e1, int e0) pure @trusted
{
    int[2] arr = [e0, e1];
    return *cast(__m64*)(arr.ptr);
}
unittest
{
    int2 R = cast(int2) _mm_set_pi32(1, 0);
    int[2] correct = [0, 1];
    assert(R.array == correct);
}

/// Set packed 8-bit integers with the supplied values.
__m64 _mm_set_pi8 (byte e7, byte e6, byte e5, byte e4, byte e3, byte e2, byte e1, byte e0) pure @trusted
{
    byte[8] arr = [e0, e1, e2, e3, e4, e5, e6, e7];
    return *cast(__m64*)(arr.ptr);
}
unittest
{
    byte8 R = cast(byte8) _mm_set_pi8(7, 6, 5, 4, 3, 2, 1, 0);
    byte[8] correct = [0, 1, 2, 3, 4, 5, 6, 7];
    assert(R.array == correct);
}

/// Broadcast 16-bit integer `a` to all elements.
__m64 _mm_set1_pi16 (short a) pure @trusted
{
    return cast(__m64)(short4(a));
}
unittest
{
    short4 R = cast(short4) _mm_set1_pi16(44);
    short[4] correct = [44, 44, 44, 44];
    assert(R.array == correct);
}

/// Broadcast 32-bit integer `a` to all elements.
__m64 _mm_set1_pi32 (int a) pure @trusted
{
    return cast(__m64)(int2(a));
}
unittest
{
    int2 R = cast(int2) _mm_set1_pi32(43);
    int[2] correct = [43, 43];
    assert(R.array == correct);
}

/// Broadcast 8-bit integer `a` to all elements.
__m64 _mm_set1_pi8 (byte a) pure @trusted
{
    return cast(__m64)(byte8(a));
}
unittest
{
    byte8 R = cast(byte8) _mm_set1_pi8(42);
    byte[8] correct = [42, 42, 42, 42, 42, 42, 42, 42];
    assert(R.array == correct);
}

/// Set packed 16-bit integers with the supplied values in reverse order.
__m64 _mm_setr_pi16 (short e3, short e2, short e1, short e0) pure @trusted
{
    short[4] arr = [e3, e2, e1, e0];
    return *cast(__m64*)(arr.ptr);
}
unittest
{
    short4 R = cast(short4) _mm_setr_pi16(0, 1, 2, 3);
    short[4] correct = [0, 1, 2, 3];
    assert(R.array == correct);
}

/// Set packed 32-bit integers with the supplied values in reverse order.
__m64 _mm_setr_pi32 (int e1, int e0) pure @trusted
{
    int[2] arr = [e1, e0];
    return *cast(__m64*)(arr.ptr);
}
unittest
{
    int2 R = cast(int2) _mm_setr_pi32(0, 1);
    int[2] correct = [0, 1];
    assert(R.array == correct);
}

/// Set packed 8-bit integers with the supplied values in reverse order.
__m64 _mm_setr_pi8 (byte e7, byte e6, byte e5, byte e4, byte e3, byte e2, byte e1, byte e0) pure @trusted
{
    byte[8] arr = [e7, e6, e5, e4, e3, e2, e1, e0];
    return *cast(__m64*)(arr.ptr);
}
unittest
{
    byte8 R = cast(byte8) _mm_setr_pi8(0, 1, 2, 3, 4, 5, 6, 7);
    byte[8] correct = [0, 1, 2, 3, 4, 5, 6, 7];
    assert(R.array == correct);
}

/// Return vector of type `__m64` with all elements set to zero.
__m64 _mm_setzero_si64 () pure @trusted
{
    __m64 r;
    r.ptr[0] = 0;
    return r;
}
unittest
{
    __m64 R = _mm_setzero_si64();
    assert(R.array[0] == 0);
}

/// Shift packed 16-bit integers in `a` left by `bits` while shifting in zeros.
deprecated("Use _mm_slli_pi16 instead.") __m64 _mm_sll_pi16 (__m64 a, __m64 bits) pure @safe
{
    return to_m64(_mm_sll_epi16(to_m128i(a), to_m128i(bits)));
}

/// Shift packed 32-bit integers in `a` left by `bits` while shifting in zeros.
deprecated("Use _mm_slli_pi32 instead.") __m64 _mm_sll_pi32 (__m64 a, __m64 bits) pure @safe
{
    return to_m64(_mm_sll_epi32(to_m128i(a), to_m128i(bits)));
}

/// Shift 64-bit integer `a` left by `bits` while shifting in zeros.
deprecated("Use _mm_slli_si64 instead.") __m64 _mm_sll_si64 (__m64 a, __m64 bits) pure @safe
{
    return to_m64(_mm_sll_epi64(to_m128i(a), to_m128i(bits)));
}

/// Shift packed 16-bit integers in `a` left by `imm8` while shifting in zeros.
__m64 _mm_slli_pi16 (__m64 a, int imm8) pure @safe
{
    return to_m64(_mm_slli_epi16(to_m128i(a), imm8));
}
unittest
{
    __m64 A = _mm_setr_pi16(-4, -5, 6, 7);
    short4 B = cast(short4)( _mm_slli_pi16(A, 1) );
    short[4] correct = [ -8, -10, 12, 14 ];
    assert(B.array == correct);
}

/// Shift packed 32-bit integers in `a` left by `imm8` while shifting in zeros.
__m64 _mm_slli_pi32 (__m64 a, int imm8) pure @safe
{
    return to_m64(_mm_slli_epi32(to_m128i(a), imm8));
}
unittest
{
    __m64 A = _mm_setr_pi32(-4, 5);
    int2 B = cast(int2)( _mm_slli_pi32(A, 1) );
    int[2] correct = [ -8, 10 ];
    assert(B.array == correct);
}

/// Shift 64-bit integer `a` left by `imm8` while shifting in zeros.
__m64 _mm_slli_si64 (__m64 a, int imm8) pure @safe
{
    return to_m64(_mm_slli_epi64(to_m128i(a), imm8));
}
unittest
{
    __m64 A = _mm_cvtsi64_m64(-1);
    long1 R = cast(long1)( _mm_slli_si64(A, 1) );
    long[1] correct = [ -2 ];
    assert(R.array == correct);
}

/// Shift packed 16-bit integers in `a` right by `bits` while shifting in sign bits.
deprecated("Use _mm_srai_pi16 instead.") __m64 _mm_sra_pi16 (__m64 a, __m64 bits) pure @safe
{
    return to_m64(_mm_sra_epi16(to_m128i(a), to_m128i(bits)));
}

/// Shift packed 32-bit integers in `a` right by `bits` while shifting in sign bits.
deprecated("Use _mm_srai_pi32 instead.") __m64 _mm_sra_pi32 (__m64 a, __m64 bits) pure @safe
{
    return to_m64(_mm_sra_epi32(to_m128i(a), to_m128i(bits)));
}

/// Shift packed 16-bit integers in `a` right by `imm8` while shifting in sign bits.
__m64 _mm_srai_pi16 (__m64 a, int imm8) pure @safe
{
    return to_m64(_mm_srai_epi16(to_m128i(a), imm8));
}
unittest
{
    __m64 A = _mm_setr_pi16(-4, -5, 6, 7);
    short4 B = cast(short4)( _mm_srai_pi16(A, 1) );
    short[4] correct = [ -2, -3, 3, 3 ];
    assert(B.array == correct);
}

/// Shift packed 32-bit integers in `a` right by `imm8` while shifting in sign bits.
__m64 _mm_srai_pi32 (__m64 a, int imm8) pure @safe
{
    return to_m64(_mm_srai_epi32(to_m128i(a), imm8));
}
unittest
{
    __m64 A = _mm_setr_pi32(-4, 5);
    int2 B = cast(int2)( _mm_srai_pi32(A, 1) );
    int[2] correct = [ -2, 2 ];
    assert(B.array == correct);
}
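
// Illustrative example (not part of the original API): an arithmetic shift by
// 15 smears each sign bit across its 16-bit lane, giving the same all-ones or
// all-zeros masks as a compare against zero; handy for branchless abs/select.
unittest
{
    __m64 v = _mm_setr_pi16(-4, 5, -6, 7);
    short4 mask = cast(short4) _mm_srai_pi16(v, 15);
    short[4] correct = [-1, 0, -1, 0];
    assert(mask.array == correct);
}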

/// Shift packed 16-bit integers in `a` right by `bits` while shifting in zeros.
deprecated("Use _mm_srli_pi16 instead.") __m64 _mm_srl_pi16 (__m64 a, __m64 bits) pure @safe
{
    return to_m64(_mm_srl_epi16(to_m128i(a), to_m128i(bits)));
}

/// Shift packed 32-bit integers in `a` right by `bits` while shifting in zeros.
deprecated("Use _mm_srli_pi32 instead.") __m64 _mm_srl_pi32 (__m64 a, __m64 bits) pure @safe
{
    return to_m64(_mm_srl_epi32(to_m128i(a), to_m128i(bits)));
}

/// Shift 64-bit integer `a` right by `bits` while shifting in zeros.
deprecated("Use _mm_srli_si64 instead.") __m64 _mm_srl_si64 (__m64 a, __m64 bits) pure @safe
{
    return to_m64(_mm_srl_epi64(to_m128i(a), to_m128i(bits)));
}

/// Shift packed 16-bit integers in `a` right by `imm8` while shifting in zeros.
__m64 _mm_srli_pi16 (__m64 a, int imm8) pure @safe
{
    return to_m64(_mm_srli_epi16(to_m128i(a), imm8));
}
unittest
{
    __m64 A = _mm_setr_pi16(-4, -5, 6, 7);
    short4 B = cast(short4)( _mm_srli_pi16(A, 1) );
    short[4] correct = [ 0x7ffe, 0x7ffd, 3, 3 ];
    assert(B.array == correct);
}

/// Shift packed 32-bit integers in `a` right by `imm8` while shifting in zeros.
__m64 _mm_srli_pi32 (__m64 a, int imm8) pure @safe
{
    return to_m64(_mm_srli_epi32(to_m128i(a), imm8));
}
unittest
{
    __m64 A = _mm_setr_pi32(-4, 5);
    int2 B = cast(int2)( _mm_srli_pi32(A, 1) );
    int[2] correct = [ 0x7ffffffe, 2 ];
    assert(B.array == correct);
}

/// Shift 64-bit integer `a` right by `imm8` while shifting in zeros.
__m64 _mm_srli_si64 (__m64 a, int imm8) pure @safe
{
    return to_m64(_mm_srli_epi64(to_m128i(a), imm8));
}
unittest
{
    __m64 A = _mm_cvtsi64_m64(-1);
    long1 R = cast(long1)( _mm_srli_si64(A, 1) );
    long[1] correct = [ 0x7fff_ffff_ffff_ffff ];
    assert(R.array == correct);
}

/// Subtract packed 16-bit integers in `b` from packed 16-bit integers in `a`.
__m64 _mm_sub_pi16 (__m64 a, __m64 b) pure @safe
{
    return cast(__m64)(cast(short4)a - cast(short4)b);
}
unittest
{
    short4 R = cast(short4) _mm_sub_pi16(_mm_setr_pi16(cast(short)65534,  1, 5, -32768),
                                         _mm_setr_pi16(cast(short)65535, 16, 4, 4));
    static immutable short[4] correct =                            [ -1,-15, 1, 32764];
    assert(R.array == correct);
}

/// Subtract packed 32-bit integers in `b` from packed 32-bit integers in `a`.
__m64 _mm_sub_pi32 (__m64 a, __m64 b) pure @safe
{
    return cast(__m64)(cast(int2)a - cast(int2)b);
}
unittest
{
    int2 R = cast(int2) _mm_sub_pi32(_mm_setr_pi32( 10,   4),
                                     _mm_setr_pi32( 15, -70));
    static immutable int[2] correct =             [ -5,  74];
    assert(R.array == correct);
}

/// Subtract packed 8-bit integers in `b` from packed 8-bit integers in `a`.
__m64 _mm_sub_pi8 (__m64 a, __m64 b) pure @safe
{
    return cast(__m64)(cast(byte8)a - cast(byte8)b);
}
unittest
{
    byte8 R = cast(byte8) _mm_sub_pi8(_mm_setr_pi8(cast(byte)254, 127, 13, 12, 11, 10, 9, -128),
                                      _mm_setr_pi8(cast(byte)255, 120, 14, 42, 11, 10, 9, 8));
    static immutable byte[8] correct =                 [      -1,   7, -1,-30,  0,  0, 0, 120 ];
    assert(R.array == correct);
}

/// Subtract packed 16-bit integers in `b` from packed 16-bit integers in `a` using signed saturation.
__m64 _mm_subs_pi16 (__m64 a, __m64 b) pure @safe
{
    return to_m64(_mm_subs_epi16(to_m128i(a), to_m128i(b)));
}
unittest
{
    short4 R = cast(short4) _mm_subs_pi16(_mm_setr_pi16(cast(short)65534,  1, 5, -32768),
                                          _mm_setr_pi16(cast(short)65535, 16, 4, 4));
    static immutable short[4] correct =                             [ -1,-15, 1, -32768];
    assert(R.array == correct);
}

/// Subtract packed 8-bit integers in `b` from packed 8-bit integers in `a` using signed saturation.
__m64 _mm_subs_pi8 (__m64 a, __m64 b) pure @safe
{
    return to_m64(_mm_subs_epi8(to_m128i(a), to_m128i(b)));
}
unittest
{
    byte8 R = cast(byte8) _mm_subs_pi8(_mm_setr_pi8(cast(byte)254, 127, 13, 12, 11, 10, 9, -128),
                                       _mm_setr_pi8(cast(byte)255, 120, 14, 42, 11, 10, 9, 8));
    static immutable byte[8] correct =                 [       -1,   7, -1,-30,  0,  0, 0, -128 ];
    assert(R.array == correct);
}

/// Subtract packed unsigned 16-bit integers in `b` from packed unsigned 16-bit integers in `a`
/// using saturation.
__m64 _mm_subs_pu16 (__m64 a, __m64 b) pure @safe
{
    return to_m64(_mm_subs_epu16(to_m128i(a), to_m128i(b)));
}
unittest
{
    short4 R = cast(short4) _mm_subs_pu16(_mm_setr_pi16(cast(short)65534,  1, 5, 4),
                                          _mm_setr_pi16(cast(short)65535, 16, 4, 4));
    static immutable short[4] correct =                              [ 0,  0, 1, 0];
    assert(R.array == correct);
}

/// Subtract packed unsigned 8-bit integers in `b` from packed unsigned 8-bit integers in `a`
/// using saturation.
__m64 _mm_subs_pu8 (__m64 a, __m64 b) pure @safe
{
    return to_m64(_mm_subs_epu8(to_m128i(a), to_m128i(b)));
}
unittest
{
    byte8 R = cast(byte8) _mm_subs_pu8(_mm_setr_pi8(cast(byte)254, 127, 13, 12, 11, 10, 9, 8),
                                       _mm_setr_pi8(cast(byte)255, 120, 14, 42, 11, 10, 9, 8));
    static immutable byte[8] correct =                 [        0,   7,  0,  0,  0,  0, 0, 0 ];
    assert(R.array == correct);
}

deprecated alias _m_to_int = _mm_cvtsi64_si32;  /// Deprecated intrinsics.
deprecated alias _m_to_int64 = _mm_cvtm64_si64; ///ditto

/// Unpack and interleave 16-bit integers from the high half of `a` and `b`.
__m64 _mm_unpackhi_pi16 (__m64 a, __m64 b) pure @trusted
{
    version(LDC)
    {
        // avoiding this shufflevector leads to bad performance on LDC
        return cast(__m64) shufflevector!(short4, 2, 6, 3, 7)(cast(short4)a, cast(short4)b);
    }
    else
    {
        short4 ia = cast(short4)a;
        short4 ib = cast(short4)b;
        short4 r;
        r.ptr[0] = ia.array[2];
        r.ptr[1] = ib.array[2];
        r.ptr[2] = ia.array[3];
        r.ptr[3] = ib.array[3];
        return cast(__m64)r;
    }
}
unittest
{
    __m64 A = _mm_setr_pi16(4, 8, -16, 7);
    __m64 B = _mm_setr_pi16(5, 9,  -3, 10);
    short4 R = cast(short4) _mm_unpackhi_pi16(A, B);
    short[4] correct = [-16, -3, 7, 10];
    assert(R.array == correct);
}

/// Unpack and interleave 32-bit integers from the high half of `a` and `b`.
__m64 _mm_unpackhi_pi32 (__m64 a, __m64 b) pure @trusted
{
    // Generates punpckldq as far back as LDC 1.0.0 -O1
    // (Yes, LLVM does generate punpckldq to reuse SSE2 instructions)
    int2 ia = cast(int2)a;
    int2 ib = cast(int2)b;
    int2 r;
    r.ptr[0] = ia.array[1];
    r.ptr[1] = ib.array[1];
    return cast(__m64)r;
}
unittest
{
    __m64 A = _mm_setr_pi32(4, 8);
    __m64 B = _mm_setr_pi32(5, 9);
    int2 R = cast(int2) _mm_unpackhi_pi32(A, B);
    int[2] correct = [8, 9];
    assert(R.array == correct);
}

/// Unpack and interleave 8-bit integers from the high half of `a` and `b`.
__m64 _mm_unpackhi_pi8 (__m64 a, __m64 b)
{
    version(LDC)
    {
        return cast(__m64) shufflevector!(byte8, 4, 12, 5, 13, 6, 14, 7, 15)(cast(byte8)a, cast(byte8)b);
    }
    else
    {
        byte8 ia = cast(byte8)a;
        byte8 ib = cast(byte8)b;
        byte8 r;
        r.ptr[0] = ia.array[4];
        r.ptr[1] = ib.array[4];
        r.ptr[2] = ia.array[5];
        r.ptr[3] = ib.array[5];
        r.ptr[4] = ia.array[6];
        r.ptr[5] = ib.array[6];
        r.ptr[6] = ia.array[7];
        r.ptr[7] = ib.array[7];
        return cast(__m64)r;
    }
}
unittest
{
    __m64 A = _mm_setr_pi8( 1,  2,  3,  4,  5,  6,  7,  8);
    __m64 B = _mm_setr_pi8(-1, -2, -3, -4, -5, -6, -7, -8);
    byte8 R = cast(byte8) _mm_unpackhi_pi8(A, B);
    byte[8] correct = [5, -5, 6, -6, 7, -7, 8, -8];
    assert(R.array == correct);
}

/// Unpack and interleave 16-bit integers from the low half of `a` and `b`.
__m64 _mm_unpacklo_pi16 (__m64 a, __m64 b)
{
    // Generates punpcklwd since LDC 1.0.0 -O1
    short4 ia = cast(short4)a;
    short4 ib = cast(short4)b;
    short4 r;
    r.ptr[0] = ia.array[0];
    r.ptr[1] = ib.array[0];
    r.ptr[2] = ia.array[1];
    r.ptr[3] = ib.array[1];
    return cast(__m64)r;
}
unittest
{
    __m64 A = _mm_setr_pi16(4, 8, -16, 7);
    __m64 B = _mm_setr_pi16(5, 9,  -3, 10);
    short4 R = cast(short4) _mm_unpacklo_pi16(A, B);
    short[4] correct = [4, 5, 8, 9];
    assert(R.array == correct);
}

/// Unpack and interleave 32-bit integers from the low half of `a` and `b`.
__m64 _mm_unpacklo_pi32 (__m64 a, __m64 b) pure @trusted
{
    // x86: Generates punpckldq as far back as LDC 1.0.0 -O1
    // ARM: Generates zip as far back as LDC 1.8.0 -O1
    int2 ia = cast(int2)a;
    int2 ib = cast(int2)b;
    int2 r;
    r.ptr[0] = ia.array[0];
    r.ptr[1] = ib.array[0];
    return cast(__m64)r;
}
unittest
{
    __m64 A = _mm_setr_pi32(4, 8);
    __m64 B = _mm_setr_pi32(5, 9);
    int2 R = cast(int2) _mm_unpacklo_pi32(A, B);
    int[2] correct = [4, 5];
    assert(R.array == correct);
}

/// Unpack and interleave 8-bit integers from the low half of `a` and `b`.
__m64 _mm_unpacklo_pi8 (__m64 a, __m64 b)
{
    version(LDC)
    {
        return cast(__m64) shufflevector!(byte8, 0, 8, 1, 9, 2, 10, 3, 11)(cast(byte8)a, cast(byte8)b);
    }
    else
    {
        byte8 ia = cast(byte8)a;
        byte8 ib = cast(byte8)b;
        byte8 r;
        r.ptr[0] = ia.array[0];
        r.ptr[1] = ib.array[0];
        r.ptr[2] = ia.array[1];
        r.ptr[3] = ib.array[1];
        r.ptr[4] = ia.array[2];
        r.ptr[5] = ib.array[2];
        r.ptr[6] = ia.array[3];
        r.ptr[7] = ib.array[3];
        return cast(__m64)r;
    }
}
unittest
{
    __m64 A = _mm_setr_pi8( 1,  2,  3,  4,  5,  6,  7,  8);
    __m64 B = _mm_setr_pi8(-1, -2, -3, -4, -5, -6, -7, -8);
    byte8 R = cast(byte8) _mm_unpacklo_pi8(A, B);
    byte[8] correct = [1, -1, 2, -2, 3, -3, 4, -4];
    assert(R.array == correct);
}
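
// Illustrative example (not part of the original API): interleaving with a
// zero vector widens unsigned bytes to 16-bit lanes, because the zero byte
// lands in each lane's high half.
unittest
{
    __m64 bytes = _mm_setr_pi8(cast(byte)250, 2, 3, 4, 5, 6, 7, 8);
    short4 widened = cast(short4) _mm_unpacklo_pi8(bytes, _mm_setzero_si64());
    short[4] correct = [250, 2, 3, 4];
    assert(widened.array == correct);
}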

/// Compute the bitwise XOR of 64 bits (representing integer data) in `a` and `b`.
__m64 _mm_xor_si64 (__m64 a, __m64 b)
{
    return a ^ b;
}
unittest
{
    __m64 A = _mm_setr_pi16(255, 1, -1, 0);
    __m64 B = _mm_set1_pi16(15);
    short4 R = cast(short4)_mm_xor_si64(A, B);
    short[4] correct =     [240, 14, -16, 15];
    assert(R.array == correct);
}