/**
* MMX intrinsics.
* https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#techs=MMX
*
* Copyright: Copyright Guillaume Piolat 2019-2020.
* License:   $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
*/
module inteli.mmx;

public import inteli.types;
import inteli.internals;

import inteli.xmmintrin;
import inteli.emmintrin;

nothrow @nogc:

// Important: you don't need to call _mm_empty when using the "MMX" capabilities of
// intel-intrinsics. The library only provides the semantics and generates the right IR;
// cleaning up the FPU registers is left to the code generator.

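// Editor's note (illustrative sketch, not part of the API): `__m64` is a single-lane
// 64-bit vector, and the idiom used throughout this module is to reinterpret it as
// `short4`, `int2` or `byte8` with a plain cast:
unittest
{
    __m64 v = _mm_set1_pi16(42);
    short4 lanes = cast(short4) v; // same 64 bits, viewed as four shorts
    short[4] correct = [42, 42, 42, 42];
    assert(lanes.array == correct);
}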

/// Add packed 16-bit integers in `a` and `b`.
__m64 _mm_add_pi16 (__m64 a, __m64 b) pure @safe
{
    return cast(__m64)(cast(short4)a + cast(short4)b);
}
unittest
{
    short4 R = cast(short4) _mm_add_pi16(_mm_set1_pi16(4), _mm_set1_pi16(3));
    short[4] correct = [7, 7, 7, 7];
    assert(R.array == correct);
}

/// Add packed 32-bit integers in `a` and `b`.
__m64 _mm_add_pi32 (__m64 a, __m64 b) pure @safe
{
    return cast(__m64)(cast(int2)a + cast(int2)b);
}
unittest
{
    int2 R = cast(int2) _mm_add_pi32(_mm_set1_pi32(4), _mm_set1_pi32(3));
    int[2] correct = [7, 7];
    assert(R.array == correct);
}

/// Add packed 8-bit integers in `a` and `b`.
__m64 _mm_add_pi8 (__m64 a, __m64 b) pure @safe
{
    return cast(__m64)(cast(byte8)a + cast(byte8)b);
}
unittest
{
    byte8 R = cast(byte8) _mm_add_pi8(_mm_set1_pi8(127), _mm_set1_pi8(-128));
    byte[8] correct = [-1, -1, -1, -1, -1, -1, -1, -1];
    assert(R.array == correct);
}

/// Add packed 16-bit integers in `a` and `b` using signed saturation.
// PERF: PADDSW not generated
__m64 _mm_adds_pi16(__m64 a, __m64 b) pure @trusted
{
    return to_m64(_mm_adds_epi16(to_m128i(a), to_m128i(b)));
}
unittest
{
    short4 res = cast(short4) _mm_adds_pi16(_mm_set_pi16(3, 2, 1, 0),
                                            _mm_set_pi16(3, 2, 1, 0));
    static immutable short[4] correctResult = [0, 2, 4, 6];
    assert(res.array == correctResult);
}

/// Add packed 8-bit integers in `a` and `b` using signed saturation.
// PERF: PADDSB not generated
__m64 _mm_adds_pi8(__m64 a, __m64 b) pure @trusted
{
    return to_m64(_mm_adds_epi8(to_m128i(a), to_m128i(b)));
}
unittest
{
    byte8 res = cast(byte8) _mm_adds_pi8(_mm_set_pi8(7, 6, 5, 4, 3, 2, 1, 0),
                                         _mm_set_pi8(7, 6, 5, 4, 3, 2, 1, 0));
    static immutable byte[8] correctResult = [0, 2, 4, 6, 8, 10, 12, 14];
    assert(res.array == correctResult);
}

/// Add packed 16-bit integers in `a` and `b` using unsigned saturation.
// PERF: PADDUSW not generated
__m64 _mm_adds_pu16(__m64 a, __m64 b) pure @trusted
{
    return to_m64(_mm_adds_epu16(to_m128i(a), to_m128i(b)));
}
unittest
{
    short4 res = cast(short4) _mm_adds_pu16(_mm_set_pi16(3, 2, cast(short)65535, 0),
                                            _mm_set_pi16(3, 2, 1, 0));
    static immutable short[4] correctResult = [0, cast(short)65535, 4, 6];
    assert(res.array == correctResult);
}

/// Add packed 8-bit integers in `a` and `b` using unsigned saturation.
// PERF: PADDUSB not generated
__m64 _mm_adds_pu8(__m64 a, __m64 b) pure @trusted
{
    return to_m64(_mm_adds_epu8(to_m128i(a), to_m128i(b)));
}
unittest
{
    byte8 res = cast(byte8) _mm_adds_pu8(_mm_set_pi8(7, 6, 5, 4, 3, 2, cast(byte)255, 0),
                                         _mm_set_pi8(7, 6, 5, 4, 3, 2, 1, 0));
    static immutable byte[8] correctResult = [0, cast(byte)255, 4, 6, 8, 10, 12, 14];
    assert(res.array == correctResult);
}

/// Compute the bitwise AND of 64 bits (representing integer data) in `a` and `b`.
__m64 _mm_and_si64 (__m64 a, __m64 b) pure @safe
{
    return a & b;
}
unittest
{
    __m64 A = [7];
    __m64 B = [14];
    __m64 R = _mm_and_si64(A, B);
    assert(R.array[0] == 6);
}

/// Compute the bitwise NOT of 64 bits (representing integer data) in `a` and then AND with `b`.
__m64 _mm_andnot_si64 (__m64 a, __m64 b) pure @safe
{
    return (~a) & b;
}
unittest
{
    __m64 A = [7];
    __m64 B = [14];
    __m64 R = _mm_andnot_si64(A, B);
    assert(R.array[0] == 8);
}

/// Compare packed 16-bit integers in `a` and `b` for equality.
__m64 _mm_cmpeq_pi16 (__m64 a, __m64 b) pure @safe
{
    static if (GDC_with_MMX)
    {
        return cast(__m64) __builtin_ia32_pcmpeqw(cast(short4)a, cast(short4)b);
    }
    else
    {
        return cast(__m64) equalMask!short4(cast(short4)a, cast(short4)b);
    }
}
unittest
{
    short4   A = [-3, -2, -1,  0];
    short4   B = [ 4,  3,  2,  1];
    short[4] E = [ 0,  0,  0,  0];
    short4   R = cast(short4)(_mm_cmpeq_pi16(cast(__m64)A, cast(__m64)B));
    assert(R.array == E);
}

/// Compare packed 32-bit integers in `a` and `b` for equality.
__m64 _mm_cmpeq_pi32 (__m64 a, __m64 b) pure @safe
{
    static if (GDC_with_MMX)
    {
        return cast(__m64) __builtin_ia32_pcmpeqd(cast(int2)a, cast(int2)b);
    }
    else
    {
        return cast(__m64) equalMask!int2(cast(int2)a, cast(int2)b);
    }
}
unittest
{
    int2   A = [-3, -2];
    int2   B = [ 4, -2];
    int[2] E = [ 0, -1];
    int2   R = cast(int2)(_mm_cmpeq_pi32(cast(__m64)A, cast(__m64)B));
    assert(R.array == E);
}

/// Compare packed 8-bit integers in `a` and `b` for equality.
__m64 _mm_cmpeq_pi8 (__m64 a, __m64 b) pure @safe
{
    static if (GDC_with_MMX)
    {
        return cast(__m64) __builtin_ia32_pcmpeqb(cast(ubyte8)a, cast(ubyte8)b);
    }
    else
    {
        return cast(__m64) equalMask!byte8(cast(byte8)a, cast(byte8)b);
    }
}
unittest
{
    __m64 A = _mm_setr_pi8(1, 2, 3, 1, 2, 1, 1, 2);
    __m64 B = _mm_setr_pi8(2, 2, 1, 2, 3, 1, 2, 3);
    byte8 C = cast(byte8) _mm_cmpeq_pi8(A, B);
    byte[8] correct =     [0,-1, 0, 0, 0,-1, 0, 0];
    assert(C.array == correct);
}

/// Compare packed 16-bit integers in `a` and `b` for greater-than.
__m64 _mm_cmpgt_pi16 (__m64 a, __m64 b) pure @safe
{
    static if (GDC_with_MMX)
    {
        return cast(__m64) __builtin_ia32_pcmpgtw (cast(short4)a, cast(short4)b);
    }
    else
    {
        return cast(__m64) greaterMask!short4(cast(short4)a, cast(short4)b);
    }
}
unittest
{
    short4   A = [-3, -2, -1,  0];
    short4   B = [ 4,  3,  2,  1];
    short[4] E = [ 0,  0,  0,  0];
    short4   R = cast(short4)(_mm_cmpgt_pi16(cast(__m64)A, cast(__m64)B));
    assert(R.array == E);
}
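
// Editor's sketch (assumed usage pattern, not from the original source): the
// comparison intrinsics return all-ones/all-zero lane masks, so a branchless
// per-lane maximum can be built from _mm_cmpgt_pi16 and the bitwise intrinsics:
unittest
{
    __m64 a = _mm_setr_pi16(1, 9, -3, 7);
    __m64 b = _mm_setr_pi16(4, 2,  5, 7);
    __m64 mask = _mm_cmpgt_pi16(a, b);                   // -1 in lanes where a > b
    __m64 maxAB = _mm_or_si64(_mm_and_si64(mask, a),
                              _mm_andnot_si64(mask, b)); // per-lane maximum
    short4 r = cast(short4) maxAB;
    short[4] correct = [4, 9, 5, 7];
    assert(r.array == correct);
}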

/// Compare packed 32-bit integers in `a` and `b` for greater-than.
__m64 _mm_cmpgt_pi32 (__m64 a, __m64 b) pure @safe
{
    static if (GDC_with_MMX)
    {
        return cast(__m64) __builtin_ia32_pcmpgtd (cast(int2)a, cast(int2)b);
    }
    else
    {
        return cast(__m64) greaterMask!int2(cast(int2)a, cast(int2)b);
    }
}
unittest
{
    int2   A = [-3,  2];
    int2   B = [ 4, -2];
    int[2] E = [ 0, -1];
    int2   R = cast(int2)(_mm_cmpgt_pi32(cast(__m64)A, cast(__m64)B));
    assert(R.array == E);
}

/// Compare packed 8-bit integers in `a` and `b` for greater-than.
__m64 _mm_cmpgt_pi8 (__m64 a, __m64 b) pure @safe
{
    static if (GDC_with_MMX)
    {
        return cast(__m64) __builtin_ia32_pcmpgtb (cast(ubyte8)a, cast(ubyte8)b);
    }
    else
    {
        return cast(__m64) greaterMask!byte8(cast(byte8)a, cast(byte8)b);
    }
}
unittest
{
    __m64 A = _mm_setr_pi8(1, 2, 3, 1, 2, 1, 1, 2);
    __m64 B = _mm_setr_pi8(2, 2, 1, 2, 3, 1, 2, 3);
    byte8 C = cast(byte8) _mm_cmpgt_pi8(A, B);
    byte[8] correct =     [0, 0,-1, 0, 0, 0, 0, 0];
    assert(C.array == correct);
}

/// Copy 64-bit integer `a` to `dst`.
long _mm_cvtm64_si64 (__m64 a) pure @safe
{
    long1 la = cast(long1)a;
    return la.array[0];
}
unittest
{
    __m64 A = _mm_setr_pi32(2, 1);
    long R = _mm_cvtm64_si64(A);
    assert(R == 0x100000002);
}

/// Copy 32-bit integer `a` to the lower elements of `dst`, and zero the upper element of `dst`.
__m64 _mm_cvtsi32_si64 (int a) pure @trusted
{
    __m64 r = void;
    r.ptr[0] = cast(uint)a; // zero-extend, so the upper 32 bits of `dst` are cleared
    return r;
}
unittest
{
    __m64 R = _mm_cvtsi32_si64(-1);
    assert(R.array[0] == 0xFFFF_FFFF);
}

/// Copy 64-bit integer `a` to `dst`.
__m64 _mm_cvtsi64_m64 (long a) pure @trusted
{
    __m64 r = void;
    r.ptr[0] = a;
    return r;
}
unittest
{
    __m64 R = _mm_cvtsi64_m64(0x123456789A);
    assert(R.array[0] == 0x123456789A);
}

/// Get the lower 32-bit integer in `a`.
int _mm_cvtsi64_si32 (__m64 a) pure @safe
{
    int2 r = cast(int2)a;
    return r.array[0];
}
unittest
{
    __m64 A = _mm_setr_pi32(-6, 5);
    int R = _mm_cvtsi64_si32(A);
    assert(R == -6);
}

/// Empty the MMX state, which marks the x87 FPU registers as available for
/// use by x87 instructions.
/// This instruction is supposed to be used at the end of all MMX technology procedures.
/// This is useless when using `intel-intrinsics`, at least with LDC and DMD.
void _mm_empty() pure @safe
{
    // do nothing, see comment on top of file
    // TODO: not sure for GDC, do something?
}
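
// Hedged usage sketch (editor's addition): classic MMX code ends each routine
// with _mm_empty(). With intel-intrinsics the call compiles to nothing, but
// keeping it costs nothing and preserves source compatibility with C code:
unittest
{
    __m64 sum = _mm_add_pi16(_mm_set1_pi16(1), _mm_set1_pi16(2));
    _mm_empty(); // no-op here; required at the end of real MMX code
    assert((cast(short4)sum).array[0] == 3);
}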


deprecated alias _m_empty = _mm_empty; /// Deprecated intrinsics.
deprecated alias _m_from_int = _mm_cvtsi32_si64; ///ditto
deprecated alias _m_from_int64 = _mm_cvtsi64_m64; ///ditto

/// Multiply packed 16-bit integers in `a` and `b`, producing intermediate 32-bit integers.
/// Horizontally add adjacent pairs of intermediate 32-bit integers, and pack the results.
__m64 _mm_madd_pi16 (__m64 a, __m64 b) pure @safe
{
    return to_m64(_mm_madd_epi16(to_m128i(a), to_m128i(b)));
}
unittest
{
    short4 A = [-32768, -32768, 32767, 32767];
    short4 B = [-32768, -32768, 32767, 32767];
    int2 R = cast(int2) _mm_madd_pi16(cast(__m64)A, cast(__m64)B);
    int[2] correct = [-2147483648, 2*32767*32767];
    assert(R.array == correct);
}
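
// Editor's sketch (assumed usage, not from the original source): _mm_madd_pi16
// is the classic building block for small integer dot products; two horizontal
// adds of lane products, then a final scalar add:
unittest
{
    __m64 x = _mm_setr_pi16(1, 2, 3, 4);
    __m64 y = _mm_setr_pi16(5, 6, 7, 8);
    int2 p = cast(int2) _mm_madd_pi16(x, y); // [1*5 + 2*6, 3*7 + 4*8]
    int dot = p.array[0] + p.array[1];       // 17 + 53
    assert(dot == 70);
}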

/// Multiply the packed 16-bit integers in `a` and `b`, producing intermediate 32-bit integers,
/// and store the high 16 bits of the intermediate integers.
__m64 _mm_mulhi_pi16 (__m64 a, __m64 b) pure @safe
{
    return to_m64(_mm_mulhi_epi16(to_m128i(a), to_m128i(b)));
}
unittest
{
    __m64 A = _mm_setr_pi16(4, 8, -16, 7);
    __m64 B = _mm_set1_pi16(16384);
    short4 R = cast(short4)_mm_mulhi_pi16(A, B);
    short[4] correct = [1, 2, -4, 1];
    assert(R.array == correct);
}

/// Multiply the packed 16-bit integers in `a` and `b`, producing intermediate 32-bit integers,
/// and store the low 16 bits of the intermediate integers.
__m64 _mm_mullo_pi16 (__m64 a, __m64 b) pure @safe
{
    return to_m64(_mm_mullo_epi16(to_m128i(a), to_m128i(b)));
}
unittest
{
    __m64 A = _mm_setr_pi16(4, 1, 16, 7);
    __m64 B = _mm_set1_pi16(16384);
    short4 R = cast(short4)_mm_mullo_pi16(A, B);
    short[4] correct = [0, 16384, 0, -16384];
    assert(R.array == correct);
}

/// Compute the bitwise OR of 64 bits (representing integer data) in `a` and `b`.
__m64 _mm_or_si64 (__m64 a, __m64 b) pure @safe
{
    return a | b;
}
unittest
{
    __m64 A = _mm_setr_pi16(255, 1, -1, 0);
    __m64 B = _mm_set1_pi16(15);
    short4 R = cast(short4)_mm_or_si64(A, B);
    short[4] correct =     [255, 15, -1, 15];
    assert(R.array == correct);
}

/// Convert packed 16-bit integers from `a` and `b` to packed 8-bit integers using signed saturation.
__m64 _mm_packs_pi16 (__m64 a, __m64 b) pure @trusted
{
    int4 p = cast(int4) _mm_packs_epi16(to_m128i(a), to_m128i(b));
    int2 r;
    r.ptr[0] = p.array[0];
    r.ptr[1] = p.array[2];
    return cast(__m64)r;
}
unittest
{
    __m64 A = _mm_setr_pi16(256, -129, 254, 0);
    byte8 R = cast(byte8) _mm_packs_pi16(A, A);
    byte[8] correct = [127, -128, 127, 0, 127, -128, 127, 0];
    assert(R.array == correct);
}

/// Convert packed 32-bit integers from `a` and `b` to packed 16-bit integers using signed saturation.
__m64 _mm_packs_pi32 (__m64 a, __m64 b) pure @trusted
{
    int4 p = cast(int4) _mm_packs_epi32(to_m128i(a), to_m128i(b));
    int2 r;
    r.ptr[0] = p.array[0];
    r.ptr[1] = p.array[2];
    return cast(__m64)r;
}
unittest
{
    __m64 A = _mm_setr_pi32(100000, -100000);
    short4 R = cast(short4) _mm_packs_pi32(A, A);
    short[4] correct = [32767, -32768, 32767, -32768];
    assert(R.array == correct);
}

/// Convert packed 16-bit integers from `a` and `b` to packed 8-bit integers using unsigned saturation.
__m64 _mm_packs_pu16 (__m64 a, __m64 b) pure @trusted
{
    int4 p = cast(int4) _mm_packus_epi16(to_m128i(a), to_m128i(b));
    int2 r;
    r.ptr[0] = p.array[0];
    r.ptr[1] = p.array[2];
    return cast(__m64)r;
}
unittest
{
    __m64 A = _mm_setr_pi16(256, -129, 254, 0);
    byte8 R = cast(byte8) _mm_packs_pu16(A, A);
    ubyte[8] correct = [255, 0, 254, 0, 255, 0, 254, 0];
    assert(R.array == cast(byte[8])correct);
}
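
// Editor's sketch (assumed usage, not from the original source): chaining the
// pack intrinsics narrows 32-bit values down to unsigned 8-bit ones, with
// saturation applied at each step:
unittest
{
    __m64 w = _mm_packs_pi32(_mm_setr_pi32(300, -5), _mm_setr_pi32(90, 1000));
    byte8 p = cast(byte8) _mm_packs_pu16(w, w); // 300 and 1000 clip to 255, -5 to 0
    ubyte[8] correct = [255, 0, 90, 255, 255, 0, 90, 255];
    assert(p.array == cast(byte[8])correct);
}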

deprecated alias
    _m_packssdw = _mm_packs_pi32,     /// Deprecated intrinsics.
    _m_packsswb = _mm_packs_pi16,     ///ditto
    _m_packuswb = _mm_packs_pu16,     ///ditto
    _m_paddb = _mm_add_pi8,           ///ditto
    _m_paddd = _mm_add_pi32,          ///ditto
    _m_paddsb = _mm_adds_pi8,         ///ditto
    _m_paddsw = _mm_adds_pi16,        ///ditto
    _m_paddusb = _mm_adds_pu8,        ///ditto
    _m_paddusw = _mm_adds_pu16,       ///ditto
    _m_paddw = _mm_add_pi16,          ///ditto
    _m_pand = _mm_and_si64,           ///ditto
    _m_pandn = _mm_andnot_si64,       ///ditto
    _m_pcmpeqb = _mm_cmpeq_pi8,       ///ditto
    _m_pcmpeqd = _mm_cmpeq_pi32,      ///ditto
    _m_pcmpeqw = _mm_cmpeq_pi16,      ///ditto
    _m_pcmpgtb = _mm_cmpgt_pi8,       ///ditto
    _m_pcmpgtd = _mm_cmpgt_pi32,      ///ditto
    _m_pcmpgtw = _mm_cmpgt_pi16,      ///ditto
    _m_pmaddwd = _mm_madd_pi16,       ///ditto
    _m_pmulhw = _mm_mulhi_pi16,       ///ditto
    _m_pmullw = _mm_mullo_pi16,       ///ditto
    _m_por = _mm_or_si64,             ///ditto
    _m_pslld = _mm_sll_pi32,          ///ditto
    _m_pslldi = _mm_slli_pi32,        ///ditto
    _m_psllq = _mm_sll_si64,          ///ditto
    _m_psllqi = _mm_slli_si64,        ///ditto
    _m_psllw = _mm_sll_pi16,          ///ditto
    _m_psllwi = _mm_slli_pi16,        ///ditto
    _m_psrad = _mm_sra_pi32,          ///ditto
    _m_psradi = _mm_srai_pi32,        ///ditto
    _m_psraw = _mm_sra_pi16,          ///ditto
    _m_psrawi = _mm_srai_pi16,        ///ditto
    _m_psrld = _mm_srl_pi32,          ///ditto
    _m_psrldi = _mm_srli_pi32,        ///ditto
    _m_psrlq = _mm_srl_si64,          ///ditto
    _m_psrlqi = _mm_srli_si64,        ///ditto
    _m_psrlw = _mm_srl_pi16,          ///ditto
    _m_psrlwi = _mm_srli_pi16,        ///ditto
    _m_psubb = _mm_sub_pi8,           ///ditto
    _m_psubd = _mm_sub_pi32,          ///ditto
    _m_psubsb = _mm_subs_pi8,         ///ditto
    _m_psubsw = _mm_subs_pi16,        ///ditto
    _m_psubusb = _mm_subs_pu8,        ///ditto
    _m_psubusw = _mm_subs_pu16,       ///ditto
    _m_psubw = _mm_sub_pi16,          ///ditto
    _m_punpckhbw = _mm_unpackhi_pi8,  ///ditto
    _m_punpckhdq = _mm_unpackhi_pi32, ///ditto
    _m_punpckhwd = _mm_unpackhi_pi16, ///ditto
    _m_punpcklbw = _mm_unpacklo_pi8,  ///ditto
    _m_punpckldq = _mm_unpacklo_pi32, ///ditto
    _m_punpcklwd = _mm_unpacklo_pi16, ///ditto
    _m_pxor = _mm_xor_si64;           ///ditto

/// Set packed 16-bit integers with the supplied values.
__m64 _mm_set_pi16 (short e3, short e2, short e1, short e0) pure @trusted
{
    short[4] arr = [e0, e1, e2, e3];
    return *cast(__m64*)(arr.ptr);
}
unittest
{
    short4 R = cast(short4) _mm_set_pi16(3, 2, 1, 0);
    short[4] correct = [0, 1, 2, 3];
    assert(R.array == correct);
}

/// Set packed 32-bit integers with the supplied values.
__m64 _mm_set_pi32 (int e1, int e0) pure @trusted
{
    int[2] arr = [e0, e1];
    return *cast(__m64*)(arr.ptr);
}
unittest
{
    int2 R = cast(int2) _mm_set_pi32(1, 0);
    int[2] correct = [0, 1];
    assert(R.array == correct);
}

/// Set packed 8-bit integers with the supplied values.
__m64 _mm_set_pi8 (byte e7, byte e6, byte e5, byte e4, byte e3, byte e2, byte e1, byte e0) pure @trusted
{
    byte[8] arr = [e0, e1, e2, e3, e4, e5, e6, e7];
    return *cast(__m64*)(arr.ptr);
}
unittest
{
    byte8 R = cast(byte8) _mm_set_pi8(7, 6, 5, 4, 3, 2, 1, 0);
    byte[8] correct = [0, 1, 2, 3, 4, 5, 6, 7];
    assert(R.array == correct);
}

/// Broadcast 16-bit integer `a` to all elements.
__m64 _mm_set1_pi16 (short a) pure @trusted
{
    return cast(__m64)(short4(a));
}
unittest
{
    short4 R = cast(short4) _mm_set1_pi16(44);
    short[4] correct = [44, 44, 44, 44];
    assert(R.array == correct);
}

/// Broadcast 32-bit integer `a` to all elements.
__m64 _mm_set1_pi32 (int a) pure @trusted
{
    return cast(__m64)(int2(a));
}
unittest
{
    int2 R = cast(int2) _mm_set1_pi32(43);
    int[2] correct = [43, 43];
    assert(R.array == correct);
}

/// Broadcast 8-bit integer `a` to all elements.
__m64 _mm_set1_pi8 (byte a) pure @trusted
{
    return cast(__m64)(byte8(a));
}
unittest
{
    byte8 R = cast(byte8) _mm_set1_pi8(42);
    byte[8] correct = [42, 42, 42, 42, 42, 42, 42, 42];
    assert(R.array == correct);
}

/// Set packed 16-bit integers with the supplied values in reverse order.
__m64 _mm_setr_pi16 (short e3, short e2, short e1, short e0) pure @trusted
{
    short[4] arr = [e3, e2, e1, e0];
    return *cast(__m64*)(arr.ptr);
}
unittest
{
    short4 R = cast(short4) _mm_setr_pi16(0, 1, 2, 3);
    short[4] correct = [0, 1, 2, 3];
    assert(R.array == correct);
}

/// Set packed 32-bit integers with the supplied values in reverse order.
__m64 _mm_setr_pi32 (int e1, int e0) pure @trusted
{
    int[2] arr = [e1, e0];
    return *cast(__m64*)(arr.ptr);
}
unittest
{
    int2 R = cast(int2) _mm_setr_pi32(0, 1);
    int[2] correct = [0, 1];
    assert(R.array == correct);
}

/// Set packed 8-bit integers with the supplied values in reverse order.
__m64 _mm_setr_pi8 (byte e7, byte e6, byte e5, byte e4, byte e3, byte e2, byte e1, byte e0) pure @trusted
{
    byte[8] arr = [e7, e6, e5, e4, e3, e2, e1, e0];
    return *cast(__m64*)(arr.ptr);
}
unittest
{
    byte8 R = cast(byte8) _mm_setr_pi8(0, 1, 2, 3, 4, 5, 6, 7);
    byte[8] correct = [0, 1, 2, 3, 4, 5, 6, 7];
    assert(R.array == correct);
}

/// Return vector of type `__m64` with all elements set to zero.
__m64 _mm_setzero_si64 () pure @trusted
{
    __m64 r; // PERF =void;
    r.ptr[0] = 0;
    return r;
}
unittest
{
    __m64 R = _mm_setzero_si64();
    assert(R.array[0] == 0);
}

/// Shift packed 16-bit integers in `a` left by `bits` while shifting in zeros.
deprecated("Use _mm_slli_pi16 instead.") __m64 _mm_sll_pi16 (__m64 a, __m64 bits) pure @safe
{
    return to_m64(_mm_sll_epi16(to_m128i(a), to_m128i(bits)));
}

/// Shift packed 32-bit integers in `a` left by `bits` while shifting in zeros.
deprecated("Use _mm_slli_pi32 instead.") __m64 _mm_sll_pi32 (__m64 a, __m64 bits) pure @safe
{
    return to_m64(_mm_sll_epi32(to_m128i(a), to_m128i(bits)));
}

/// Shift 64-bit integer `a` left by `bits` while shifting in zeros.
deprecated("Use _mm_slli_si64 instead.") __m64 _mm_sll_si64 (__m64 a, __m64 bits) pure @safe
{
    return to_m64(_mm_sll_epi64(to_m128i(a), to_m128i(bits)));
}

/// Shift packed 16-bit integers in `a` left by `imm8` while shifting in zeros.
__m64 _mm_slli_pi16 (__m64 a, int imm8) pure @safe
{
    return to_m64(_mm_slli_epi16(to_m128i(a), imm8));
}
unittest
{
    __m64 A = _mm_setr_pi16(-4, -5, 6, 7);
    short4 B = cast(short4)( _mm_slli_pi16(A, 1) );
    short[4] correct = [ -8, -10, 12, 14 ];
    assert(B.array == correct);
}

/// Shift packed 32-bit integers in `a` left by `imm8` while shifting in zeros.
__m64 _mm_slli_pi32 (__m64 a, int imm8) pure @safe
{
    return to_m64(_mm_slli_epi32(to_m128i(a), imm8));
}
unittest
{
    __m64 A = _mm_setr_pi32(-4, 5);
    int2 B = cast(int2)( _mm_slli_pi32(A, 1) );
    int[2] correct = [ -8, 10 ];
    assert(B.array == correct);
}

/// Shift 64-bit integer `a` left by `imm8` while shifting in zeros.
__m64 _mm_slli_si64 (__m64 a, int imm8) pure @safe
{
    return to_m64(_mm_slli_epi64(to_m128i(a), imm8));
}
unittest
{
    __m64 A = _mm_cvtsi64_m64(-1);
    long1 R = cast(long1)( _mm_slli_si64(A, 1) );
    long[1] correct = [ -2 ];
    assert(R.array == correct);
}

/// Shift packed 16-bit integers in `a` right by `bits` while shifting in sign bits.
deprecated("Use _mm_srai_pi16 instead.") __m64 _mm_sra_pi16 (__m64 a, __m64 bits) pure @safe
{
    return to_m64(_mm_sra_epi16(to_m128i(a), to_m128i(bits)));
}

/// Shift packed 32-bit integers in `a` right by `bits` while shifting in sign bits.
deprecated("Use _mm_srai_pi32 instead.") __m64 _mm_sra_pi32 (__m64 a, __m64 bits) pure @safe
{
    return to_m64(_mm_sra_epi32(to_m128i(a), to_m128i(bits)));
}

/// Shift packed 16-bit integers in `a` right by `imm8` while shifting in sign bits.
__m64 _mm_srai_pi16 (__m64 a, int imm8) pure @safe
{
    return to_m64(_mm_srai_epi16(to_m128i(a), imm8));
}
unittest
{
    __m64 A = _mm_setr_pi16(-4, -5, 6, 7);
    short4 B = cast(short4)( _mm_srai_pi16(A, 1) );
    short[4] correct = [ -2, -3, 3, 3 ];
    assert(B.array == correct);
}

/// Shift packed 32-bit integers in `a` right by `imm8` while shifting in sign bits.
__m64 _mm_srai_pi32 (__m64 a, int imm8) pure @safe
{
    return to_m64(_mm_srai_epi32(to_m128i(a), imm8));
}
unittest
{
    __m64 A = _mm_setr_pi32(-4, 5);
    int2 B = cast(int2)( _mm_srai_pi32(A, 1) );
    int[2] correct = [ -2, 2 ];
    assert(B.array == correct);
}

/// Shift packed 16-bit integers in `a` right by `bits` while shifting in zeros.
deprecated("Use _mm_srli_pi16 instead.") __m64 _mm_srl_pi16 (__m64 a, __m64 bits) pure @safe
{
    return to_m64(_mm_srl_epi16(to_m128i(a), to_m128i(bits)));
}

/// Shift packed 32-bit integers in `a` right by `bits` while shifting in zeros.
deprecated("Use _mm_srli_pi32 instead.") __m64 _mm_srl_pi32 (__m64 a, __m64 bits) pure @safe
{
    return to_m64(_mm_srl_epi32(to_m128i(a), to_m128i(bits)));
}

/// Shift 64-bit integer `a` right by `bits` while shifting in zeros.
deprecated("Use _mm_srli_si64 instead.") __m64 _mm_srl_si64 (__m64 a, __m64 bits) pure @safe
{
    return to_m64(_mm_srl_epi64(to_m128i(a), to_m128i(bits)));
}

/// Shift packed 16-bit integers in `a` right by `imm8` while shifting in zeros.
__m64 _mm_srli_pi16 (__m64 a, int imm8) pure @safe
{
    return to_m64(_mm_srli_epi16(to_m128i(a), imm8));
}
unittest
{
    __m64 A = _mm_setr_pi16(-4, -5, 6, 7);
    short4 B = cast(short4)( _mm_srli_pi16(A, 1) );
    short[4] correct = [ 0x7ffe, 0x7ffd, 3, 3 ];
    assert(B.array == correct);
}

/// Shift packed 32-bit integers in `a` right by `imm8` while shifting in zeros.
__m64 _mm_srli_pi32 (__m64 a, int imm8) pure @safe
{
    return to_m64(_mm_srli_epi32(to_m128i(a), imm8));
}
unittest
{
    __m64 A = _mm_setr_pi32(-4, 5);
    int2 B = cast(int2)( _mm_srli_pi32(A, 1) );
    int[2] correct = [ 0x7ffffffe, 2 ];
    assert(B.array == correct);
}

/// Shift 64-bit integer `a` right by `imm8` while shifting in zeros.
__m64 _mm_srli_si64 (__m64 a, int imm8) pure @safe
{
    return to_m64(_mm_srli_epi64(to_m128i(a), imm8));
}
unittest
{
    __m64 A = _mm_cvtsi64_m64(-1);
    long1 R = cast(long1)( _mm_srli_si64(A, 1) );
    long[1] correct = [ 0x7fff_ffff_ffff_ffff ];
    assert(R.array == correct);
}

/// Subtract packed 16-bit integers in `b` from packed 16-bit integers in `a`.
__m64 _mm_sub_pi16 (__m64 a, __m64 b) pure @safe
{
    return cast(__m64)(cast(short4)a - cast(short4)b);
}
unittest
{
    short4 R = cast(short4) _mm_sub_pi16(_mm_setr_pi16(cast(short)65534,  1, 5, -32768),
                                         _mm_setr_pi16(cast(short)65535, 16, 4, 4));
    static immutable short[4] correct =                            [ -1,-15, 1, 32764];
    assert(R.array == correct);
}

/// Subtract packed 32-bit integers in `b` from packed 32-bit integers in `a`.
__m64 _mm_sub_pi32 (__m64 a, __m64 b) pure @safe
{
    return cast(__m64)(cast(int2)a - cast(int2)b);
}
unittest
{
    int2 R = cast(int2) _mm_sub_pi32(_mm_setr_pi32( 10,   4),
                                     _mm_setr_pi32( 15, -70));
    static immutable int[2] correct =             [ -5,  74];
    assert(R.array == correct);
}

/// Subtract packed 8-bit integers in `b` from packed 8-bit integers in `a`.
__m64 _mm_sub_pi8 (__m64 a, __m64 b) pure @safe
{
    return cast(__m64)(cast(byte8)a - cast(byte8)b);
}
unittest
{
    byte8 R = cast(byte8) _mm_sub_pi8(_mm_setr_pi8(cast(byte)254, 127, 13, 12, 11, 10, 9, -128),
                                      _mm_setr_pi8(cast(byte)255, 120, 14, 42, 11, 10, 9, 8));
    static immutable byte[8] correct =                 [      -1,   7, -1,-30,  0,  0, 0, 120 ];
    assert(R.array == correct);
}

/// Subtract packed 16-bit integers in `b` from packed 16-bit integers in `a` using saturation.
__m64 _mm_subs_pi16 (__m64 a, __m64 b) pure @safe
{
    return to_m64(_mm_subs_epi16(to_m128i(a), to_m128i(b)));
}
unittest
{
    short4 R = cast(short4) _mm_subs_pi16(_mm_setr_pi16(cast(short)65534,  1, 5, -32768),
                                          _mm_setr_pi16(cast(short)65535, 16, 4, 4));
    static immutable short[4] correct =                             [ -1,-15, 1, -32768];
    assert(R.array == correct);
}

/// Subtract packed 8-bit integers in `b` from packed 8-bit integers in `a` using saturation.
__m64 _mm_subs_pi8 (__m64 a, __m64 b) pure @safe
{
    return to_m64(_mm_subs_epi8(to_m128i(a), to_m128i(b)));
}
unittest
{
    byte8 R = cast(byte8) _mm_subs_pi8(_mm_setr_pi8(cast(byte)254, 127, 13, 12, 11, 10, 9, -128),
                                       _mm_setr_pi8(cast(byte)255, 120, 14, 42, 11, 10, 9, 8));
    static immutable byte[8] correct =                 [       -1,   7, -1,-30,  0,  0, 0, -128 ];
    assert(R.array == correct);
}

/// Subtract packed unsigned 16-bit integers in `b` from packed unsigned 16-bit integers in `a`
/// using saturation.
__m64 _mm_subs_pu16 (__m64 a, __m64 b) pure @safe
{
    return to_m64(_mm_subs_epu16(to_m128i(a), to_m128i(b)));
}
unittest
{
    short4 R = cast(short4) _mm_subs_pu16(_mm_setr_pi16(cast(short)65534,  1, 5, 4),
                                          _mm_setr_pi16(cast(short)65535, 16, 4, 4));
    static immutable short[4] correct =                              [ 0,  0, 1, 0];
    assert(R.array == correct);
}

/// Subtract packed unsigned 8-bit integers in `b` from packed unsigned 8-bit integers in `a`
/// using saturation.
__m64 _mm_subs_pu8 (__m64 a, __m64 b) pure @safe
{
    return to_m64(_mm_subs_epu8(to_m128i(a), to_m128i(b)));
}
unittest
{
    byte8 R = cast(byte8) _mm_subs_pu8(_mm_setr_pi8(cast(byte)254, 127, 13, 12, 11, 10, 9, 8),
                                       _mm_setr_pi8(cast(byte)255, 120, 14, 42, 11, 10, 9, 8));
    static immutable byte[8] correct =                 [        0,   7,  0,  0,  0,  0, 0, 0 ];
    assert(R.array == correct);
}
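
// Editor's sketch (assumed usage, not from the original source): saturating
// unsigned subtraction in both directions yields a per-lane absolute
// difference, a common trick in image processing:
unittest
{
    __m64 a = _mm_setr_pi8(10, cast(byte)200, 30, 0, 0, 0, 0, 0);
    __m64 b = _mm_setr_pi8(25, cast(byte)180, 30, 0, 0, 0, 0, 0);
    byte8 d = cast(byte8) _mm_or_si64(_mm_subs_pu8(a, b), _mm_subs_pu8(b, a));
    byte[8] correct = [15, 20, 0, 0, 0, 0, 0, 0];
    assert(d.array == correct);
}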

deprecated alias _m_to_int = _mm_cvtsi64_si32;  /// Deprecated intrinsics.
deprecated alias _m_to_int64 = _mm_cvtm64_si64; ///ditto

/// Unpack and interleave 16-bit integers from the high half of `a` and `b`.
__m64 _mm_unpackhi_pi16 (__m64 a, __m64 b) pure @trusted
{
    version(LDC)
    {
        // avoiding this shufflevector leads to bad performance on LDC
        return cast(__m64) shufflevector!(short4, 2, 6, 3, 7)(cast(short4)a, cast(short4)b);
    }
    else
    {
        short4 ia = cast(short4)a;
        short4 ib = cast(short4)b;
        short4 r;
        r.ptr[0] = ia.array[2];
        r.ptr[1] = ib.array[2];
        r.ptr[2] = ia.array[3];
        r.ptr[3] = ib.array[3];
        return cast(__m64)r;
    }
}
unittest
{
    __m64 A = _mm_setr_pi16(4, 8, -16, 7);
    __m64 B = _mm_setr_pi16(5, 9,  -3, 10);
    short4 R = cast(short4) _mm_unpackhi_pi16(A, B);
    short[4] correct = [-16, -3, 7, 10];
    assert(R.array == correct);
}

/// Unpack and interleave 32-bit integers from the high half of `a` and `b`.
__m64 _mm_unpackhi_pi32 (__m64 a, __m64 b) pure @trusted
{
    // Generates punpckldq as far back as LDC 1.0.0 -O1
    // (Yes, LLVM does generate punpckldq to reuse SSE2 instructions)
    int2 ia = cast(int2)a;
    int2 ib = cast(int2)b;
    int2 r;
    r.ptr[0] = ia.array[1];
    r.ptr[1] = ib.array[1];
    return cast(__m64)r;
}
unittest
{
    __m64 A = _mm_setr_pi32(4, 8);
    __m64 B = _mm_setr_pi32(5, 9);
    int2 R = cast(int2) _mm_unpackhi_pi32(A, B);
    int[2] correct = [8, 9];
    assert(R.array == correct);
}

/// Unpack and interleave 8-bit integers from the high half of `a` and `b`.
__m64 _mm_unpackhi_pi8 (__m64 a, __m64 b) pure @trusted
{
    version(LDC)
    {
        return cast(__m64) shufflevector!(byte8, 4, 12, 5, 13, 6, 14, 7, 15)(cast(byte8)a, cast(byte8)b);
    }
    else
    {
        byte8 ia = cast(byte8)a;
        byte8 ib = cast(byte8)b;
        byte8 r;
        r.ptr[0] = ia.array[4];
        r.ptr[1] = ib.array[4];
        r.ptr[2] = ia.array[5];
        r.ptr[3] = ib.array[5];
        r.ptr[4] = ia.array[6];
        r.ptr[5] = ib.array[6];
        r.ptr[6] = ia.array[7];
        r.ptr[7] = ib.array[7];
        return cast(__m64)r;
    }
}
unittest
{
    __m64 A = _mm_setr_pi8( 1,  2,  3,  4,  5,  6,  7,  8);
    __m64 B = _mm_setr_pi8(-1, -2, -3, -4, -5, -6, -7, -8);
    byte8 R = cast(byte8) _mm_unpackhi_pi8(A, B);
    byte[8] correct = [5, -5, 6, -6, 7, -7, 8, -8];
    assert(R.array == correct);
}

/// Unpack and interleave 16-bit integers from the low half of `a` and `b`.
__m64 _mm_unpacklo_pi16 (__m64 a, __m64 b) pure @trusted
{
    // Generates punpcklwd since LDC 1.0.0 -O1
    short4 ia = cast(short4)a;
    short4 ib = cast(short4)b;
    short4 r;
    r.ptr[0] = ia.array[0];
    r.ptr[1] = ib.array[0];
    r.ptr[2] = ia.array[1];
    r.ptr[3] = ib.array[1];
    return cast(__m64)r;
}
unittest
{
    __m64 A = _mm_setr_pi16(4, 8, -16, 7);
    __m64 B = _mm_setr_pi16(5, 9,  -3, 10);
    short4 R = cast(short4) _mm_unpacklo_pi16(A, B);
    short[4] correct = [4, 5, 8, 9];
    assert(R.array == correct);
}

/// Unpack and interleave 32-bit integers from the low half of `a` and `b`.
__m64 _mm_unpacklo_pi32 (__m64 a, __m64 b) pure @trusted
{
    // x86: Generates punpckldq as far back as LDC 1.0.0 -O1
    // ARM: Generates zip as far back as LDC 1.8.0 -O1
    int2 ia = cast(int2)a;
    int2 ib = cast(int2)b;
    int2 r;
    r.ptr[0] = ia.array[0];
    r.ptr[1] = ib.array[0];
    return cast(__m64)r;
}
unittest
{
    __m64 A = _mm_setr_pi32(4, 8);
    __m64 B = _mm_setr_pi32(5, 9);
    int2 R = cast(int2) _mm_unpacklo_pi32(A, B);
    int[2] correct = [4, 5];
    assert(R.array == correct);
}

/// Unpack and interleave 8-bit integers from the low half of `a` and `b`.
__m64 _mm_unpacklo_pi8 (__m64 a, __m64 b) pure @trusted
{
    version(LDC)
    {
        return cast(__m64) shufflevector!(byte8, 0, 8, 1, 9, 2, 10, 3, 11)(cast(byte8)a, cast(byte8)b);
    }
    else
    {
        byte8 ia = cast(byte8)a;
        byte8 ib = cast(byte8)b;
        byte8 r;
        r.ptr[0] = ia.array[0];
        r.ptr[1] = ib.array[0];
        r.ptr[2] = ia.array[1];
        r.ptr[3] = ib.array[1];
        r.ptr[4] = ia.array[2];
        r.ptr[5] = ib.array[2];
        r.ptr[6] = ia.array[3];
        r.ptr[7] = ib.array[3];
        return cast(__m64)r;
    }
}
unittest
{
    __m64 A = _mm_setr_pi8( 1,  2,  3,  4,  5,  6,  7,  8);
    __m64 B = _mm_setr_pi8(-1, -2, -3, -4, -5, -6, -7, -8);
    byte8 R = cast(byte8) _mm_unpacklo_pi8(A, B);
    byte[8] correct = [1, -1, 2, -2, 3, -3, 4, -4];
    assert(R.array == correct);
}
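
// Editor's sketch (assumed usage, not from the original source): interleaving
// with a zero vector is the classic way to zero-extend 8-bit data to 16-bit
// lanes before doing wider arithmetic:
unittest
{
    __m64 pixels = _mm_setr_pi8(10, 20, 30, 40, 0, 0, 0, 0);
    short4 widened = cast(short4) _mm_unpacklo_pi8(pixels, _mm_setzero_si64());
    short[4] correct = [10, 20, 30, 40];
    assert(widened.array == correct);
}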

/// Compute the bitwise XOR of 64 bits (representing integer data) in `a` and `b`.
__m64 _mm_xor_si64 (__m64 a, __m64 b) pure @safe
{
    return a ^ b;
}
unittest
{
    __m64 A = _mm_setr_pi16(255, 1, -1, 0);
    __m64 B = _mm_set1_pi16(15);
    short4 R = cast(short4)_mm_xor_si64(A, B);
    short[4] correct =     [240, 14, -16, 15];
    assert(R.array == correct);
}