/**
* Copyright: Copyright Auburn Sounds 2019.
* License:   $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
* Authors:   Guillaume Piolat
* Macros:
*      GUIDE = https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=$0
*
*/
module inteli.mmx;

public import inteli.types;
import inteli.internals;

import inteli.xmmintrin;
import inteli.emmintrin;

nothrow @nogc:

// Important: you don't need to call _mm_empty when using the "MMX" capabilities of
// intel-intrinsics, since it just generates the right IR; cleaning up FPU registers
// is up to the codegen. intel-intrinsics only provides the semantics.


/// Add packed 16-bit integers in `a` and `b`.
__m64 _mm_add_pi16 (__m64 a, __m64 b) pure @safe
{
    return cast(__m64)(cast(short4)a + cast(short4)b);
}
unittest
{
    short4 R = cast(short4) _mm_add_pi16(_mm_set1_pi16(4), _mm_set1_pi16(3));
    short[4] correct = [7, 7, 7, 7];
    assert(R.array == correct);
}

/// Add packed 32-bit integers in `a` and `b`.
__m64 _mm_add_pi32 (__m64 a, __m64 b) pure @safe
{
    return cast(__m64)(cast(int2)a + cast(int2)b);
}
unittest
{
    int2 R = cast(int2) _mm_add_pi32(_mm_set1_pi32(4), _mm_set1_pi32(3));
    int[2] correct = [7, 7];
    assert(R.array == correct);
}

/// Add packed 8-bit integers in `a` and `b`.
__m64 _mm_add_pi8 (__m64 a, __m64 b) pure @safe
{
    return cast(__m64)(cast(byte8)a + cast(byte8)b);
}
unittest
{
    byte8 R = cast(byte8) _mm_add_pi8(_mm_set1_pi8(127), _mm_set1_pi8(-128));
    byte[8] correct = [-1, -1, -1, -1, -1, -1, -1, -1];
    assert(R.array == correct);
}

/// Add packed 16-bit integers in `a` and `b` using signed saturation.
// PERF: PADDSW not generated
__m64 _mm_adds_pi16(__m64 a, __m64 b) pure @trusted
{
    return to_m64(_mm_adds_epi16(to_m128i(a), to_m128i(b)));
}
unittest
{
    short4 res = cast(short4) _mm_adds_pi16(_mm_set_pi16(3, 2, 1, 0),
                                            _mm_set_pi16(3, 2, 1, 0));
    static immutable short[4] correctResult = [0, 2, 4, 6];
    assert(res.array == correctResult);
}
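
// The test above never overflows; this extra check (same API, operands chosen
// to overflow) verifies that both signed bounds clamp instead of wrapping.
unittest
{
    short4 sat = cast(short4) _mm_adds_pi16(_mm_set_pi16(32767, 32766, -32768, -32767),
                                            _mm_set_pi16(    1,     2,     -1,     -2));
    static immutable short[4] correctSat = [-32768, -32768, 32767, 32767];
    assert(sat.array == correctSat);
}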

/// Add packed 8-bit integers in `a` and `b` using signed saturation.
// PERF: PADDSB not generated
__m64 _mm_adds_pi8(__m64 a, __m64 b) pure @trusted
{
    return to_m64(_mm_adds_epi8(to_m128i(a), to_m128i(b)));
}
unittest
{
    byte8 res = cast(byte8) _mm_adds_pi8(_mm_set_pi8(7, 6, 5, 4, 3, 2, 1, 0),
                                         _mm_set_pi8(7, 6, 5, 4, 3, 2, 1, 0));
    static immutable byte[8] correctResult = [0, 2, 4, 6, 8, 10, 12, 14];
    assert(res.array == correctResult);
}
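
// Same idea for bytes: a sum past 127 must clamp to 127 rather than wrap.
unittest
{
    byte8 sat = cast(byte8) _mm_adds_pi8(_mm_set1_pi8(127), _mm_set1_pi8(1));
    static immutable byte[8] correctSat = [127, 127, 127, 127, 127, 127, 127, 127];
    assert(sat.array == correctSat);
}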

/// Add packed 16-bit integers in `a` and `b` using unsigned saturation.
// PERF: PADDUSW not generated
__m64 _mm_adds_pu16(__m64 a, __m64 b) pure @trusted
{
    return to_m64(_mm_adds_epu16(to_m128i(a), to_m128i(b)));
}
unittest
{
    short4 res = cast(short4) _mm_adds_pu16(_mm_set_pi16(3, 2, cast(short)65535, 0),
                                            _mm_set_pi16(3, 2, 1, 0));
    static immutable short[4] correctResult = [0, cast(short)65535, 4, 6];
    assert(res.array == correctResult);
}

/// Add packed 8-bit integers in `a` and `b` using unsigned saturation.
// PERF: PADDUSB not generated
__m64 _mm_adds_pu8(__m64 a, __m64 b) pure @trusted
{
    return to_m64(_mm_adds_epu8(to_m128i(a), to_m128i(b)));
}
unittest
{
    byte8 res = cast(byte8) _mm_adds_pu8(_mm_set_pi8(7, 6, 5, 4, 3, 2, cast(byte)255, 0),
                                         _mm_set_pi8(7, 6, 5, 4, 3, 2, 1, 0));
    static immutable byte[8] correctResult = [0, cast(byte)255, 4, 6, 8, 10, 12, 14];
    assert(res.array == correctResult);
}

/// Compute the bitwise AND of 64 bits (representing integer data) in `a` and `b`.
__m64 _mm_and_si64 (__m64 a, __m64 b) pure @safe
{
    return a & b;
}
unittest
{
    __m64 A = [7];
    __m64 B = [14];
    __m64 R = _mm_and_si64(A, B);
    assert(R.array[0] == 6);
}

/// Compute the bitwise NOT of 64 bits (representing integer data) in `a` and then AND with `b`.
__m64 _mm_andnot_si64 (__m64 a, __m64 b) pure @safe
{
    return (~a) & b;
}
unittest
{
    __m64 A = [7];
    __m64 B = [14];
    __m64 R = _mm_andnot_si64(A, B);
    assert(R.array[0] == 8);
}

/// Compare packed 16-bit integers in `a` and `b` for equality.
__m64 _mm_cmpeq_pi16 (__m64 a, __m64 b) pure @safe
{
    static if (GDC_with_MMX)
    {
        return cast(__m64) __builtin_ia32_pcmpeqw(cast(short4)a, cast(short4)b);
    }
    else
    {
        return cast(__m64) equalMask!short4(cast(short4)a, cast(short4)b);
    }
}
unittest
{
    short4   A = [-3, -2, -1,  0];
    short4   B = [ 4,  3,  2,  1];
    short[4] E = [ 0,  0,  0,  0];
    short4   R = cast(short4)(_mm_cmpeq_pi16(cast(__m64)A, cast(__m64)B));
    assert(R.array == E);
}
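
// The test above has no equal lanes; equal lanes must come back as all ones.
unittest
{
    short4   A = [-3, -2, 1, 0];
    short4   B = [ 4, -2, 2, 0];
    short[4] E = [ 0, -1, 0, -1];
    short4   R = cast(short4)(_mm_cmpeq_pi16(cast(__m64)A, cast(__m64)B));
    assert(R.array == E);
}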

/// Compare packed 32-bit integers in `a` and `b` for equality.
__m64 _mm_cmpeq_pi32 (__m64 a, __m64 b) pure @safe
{
    static if (GDC_with_MMX)
    {
        return cast(__m64) __builtin_ia32_pcmpeqd(cast(int2)a, cast(int2)b);
    }
    else
    {
        return cast(__m64) equalMask!int2(cast(int2)a, cast(int2)b);
    }
}
unittest
{
    int2   A = [-3, -2];
    int2   B = [ 4, -2];
    int[2] E = [ 0, -1];
    int2   R = cast(int2)(_mm_cmpeq_pi32(cast(__m64)A, cast(__m64)B));
    assert(R.array == E);
}

/// Compare packed 8-bit integers in `a` and `b` for equality.
__m64 _mm_cmpeq_pi8 (__m64 a, __m64 b) pure @safe
{
    static if (GDC_with_MMX)
    {
        return cast(__m64) __builtin_ia32_pcmpeqb(cast(byte8)a, cast(byte8)b);
    }
    else
    {
        return cast(__m64) equalMask!byte8(cast(byte8)a, cast(byte8)b);
    }
}
unittest
{
    __m64 A = _mm_setr_pi8(1, 2, 3, 1, 2, 1, 1, 2);
    __m64 B = _mm_setr_pi8(2, 2, 1, 2, 3, 1, 2, 3);
    byte8 C = cast(byte8) _mm_cmpeq_pi8(A, B);
    byte[8] correct =     [0,-1, 0, 0, 0,-1, 0, 0];
    assert(C.array == correct);
}

/// Compare packed 16-bit integers in `a` and `b` for greater-than.
__m64 _mm_cmpgt_pi16 (__m64 a, __m64 b) pure @safe
{
    static if (GDC_with_MMX)
    {
        return cast(__m64) __builtin_ia32_pcmpgtw (cast(short4)a, cast(short4)b);
    }
    else
    {
        return cast(__m64) greaterMask!short4(cast(short4)a, cast(short4)b);
    }
}
unittest
{
    short4   A = [-3, -2, -1,  0];
    short4   B = [ 4,  3,  2,  1];
    short[4] E = [ 0,  0,  0,  0];
    short4   R = cast(short4)(_mm_cmpgt_pi16(cast(__m64)A, cast(__m64)B));
    assert(R.array == E);
}
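
// A complementary check where some lanes do compare greater.
unittest
{
    short4   A = [ 4,  3, 2, 1];
    short4   B = [-3, -2, 2, 3];
    short[4] E = [-1, -1, 0, 0];
    short4   R = cast(short4)(_mm_cmpgt_pi16(cast(__m64)A, cast(__m64)B));
    assert(R.array == E);
}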

/// Compare packed 32-bit integers in `a` and `b` for greater-than.
__m64 _mm_cmpgt_pi32 (__m64 a, __m64 b) pure @safe
{
    static if (GDC_with_MMX)
    {
        return cast(__m64) __builtin_ia32_pcmpgtd (cast(int2)a, cast(int2)b);
    }
    else
    {
        return cast(__m64) greaterMask!int2(cast(int2)a, cast(int2)b);
    }
}
unittest
{
    int2   A = [-3,  2];
    int2   B = [ 4, -2];
    int[2] E = [ 0, -1];
    int2   R = cast(int2)(_mm_cmpgt_pi32(cast(__m64)A, cast(__m64)B));
    assert(R.array == E);
}

/// Compare packed 8-bit integers in `a` and `b` for greater-than.
__m64 _mm_cmpgt_pi8 (__m64 a, __m64 b) pure @safe
{
    static if (GDC_with_MMX)
    {
        return cast(__m64) __builtin_ia32_pcmpgtb (cast(byte8)a, cast(byte8)b);
    }
    else
    {
        return cast(__m64) greaterMask!byte8(cast(byte8)a, cast(byte8)b);
    }
}
unittest
{
    __m64 A = _mm_setr_pi8(1, 2, 3, 1, 2, 1, 1, 2);
    __m64 B = _mm_setr_pi8(2, 2, 1, 2, 3, 1, 2, 3);
    byte8 C = cast(byte8) _mm_cmpgt_pi8(A, B);
    byte[8] correct =     [0, 0,-1, 0, 0, 0, 0, 0];
    assert(C.array == correct);
}

/// Copy 64-bit integer `a` to `dst`.
long _mm_cvtm64_si64 (__m64 a) pure @safe
{
    long1 la = cast(long1)a;
    return la.array[0];
}
unittest
{
    __m64 A = _mm_setr_pi32(2, 1);
    long1 lA = cast(long1)A;
    assert(lA.array[0] == 0x100000002);
}

/// Copy 32-bit integer `a` to the lower elements of `dst`, and zero the upper element of `dst`.
__m64 _mm_cvtsi32_si64 (int a) pure @trusted
{
    __m64 r = void;
    r.ptr[0] = a;
    return r;
}
unittest
{
    __m64 R = _mm_cvtsi32_si64(-1);
    assert(R.array[0] == -1);
}

/// Copy 64-bit integer `a` to `dst`.
__m64 _mm_cvtsi64_m64 (long a) pure @trusted
{
    __m64 r = void;
    r.ptr[0] = a;
    return r;
}
unittest
{
    __m64 R = _mm_cvtsi64_m64(0x123456789A);
    assert(R.array[0] == 0x123456789A);
}

/// Get the lower 32-bit integer in `a`.
int _mm_cvtsi64_si32 (__m64 a) pure @safe
{
    int2 r = cast(int2)a;
    return r.array[0];
}
unittest
{
    __m64 A = _mm_setr_pi32(-6, 5);
    int R = _mm_cvtsi64_si32(A);
    assert(R == -6);
}

/// Empty the MMX state, which marks the x87 FPU registers as available for
/// use by x87 instructions.
/// This instruction is supposed to be used at the end of all MMX technology procedures.
/// This is useless when using `intel-intrinsics`, at least with LDC and DMD.
void _mm_empty() pure @safe
{
    // do nothing, see comment on top of file
    // TODO: not sure for GDC, do something?
}

///ditto
alias _m_empty = _mm_empty;

alias _m_from_int = _mm_cvtsi32_si64;
alias _m_from_int64 = _mm_cvtsi64_m64;

/// Multiply packed 16-bit integers in `a` and `b`, producing intermediate 32-bit integers.
/// Horizontally add adjacent pairs of intermediate 32-bit integers.
__m64 _mm_madd_pi16 (__m64 a, __m64 b) pure @safe
{
    return to_m64(_mm_madd_epi16(to_m128i(a), to_m128i(b)));
}
unittest
{
    short4 A = [-32768, -32768, 32767, 32767];
    short4 B = [-32768, -32768, 32767, 32767];
    int2 R = cast(int2) _mm_madd_pi16(cast(__m64)A, cast(__m64)B);
    int[2] correct = [-2147483648, 2*32767*32767];
    assert(R.array == correct);
}
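
// pmaddwd is the usual kernel for small dot products: one _mm_madd_pi16, then a
// final combine of the two partial sums. A minimal sketch:
unittest
{
    __m64 x = _mm_setr_pi16(1, 2, 3, 4);
    __m64 y = _mm_setr_pi16(5, 6, 7, 8);
    int2 partial = cast(int2) _mm_madd_pi16(x, y);
    int dot = partial.array[0] + partial.array[1];
    assert(dot == 1*5 + 2*6 + 3*7 + 4*8); // 70
}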

/// Multiply the packed 16-bit integers in `a` and `b`, producing intermediate 32-bit integers,
/// and store the high 16 bits of the intermediate integers.
__m64 _mm_mulhi_pi16 (__m64 a, __m64 b) pure @safe
{
    return to_m64(_mm_mulhi_epi16(to_m128i(a), to_m128i(b)));
}
unittest
{
    __m64 A = _mm_setr_pi16(4, 8, -16, 7);
    __m64 B = _mm_set1_pi16(16384);
    short4 R = cast(short4)_mm_mulhi_pi16(A, B);
    short[4] correct = [1, 2, -4, 1];
    assert(R.array == correct);
}
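
// _mm_mulhi_pi16 pairs with _mm_mullo_pi16 (below) to recover the full 32-bit
// product of each lane; an illustrative check:
unittest
{
    __m64 A = _mm_setr_pi16(-30000, 12345, 3, 0);
    __m64 B = _mm_setr_pi16( 25000,  -100, 3, 0);
    short4 hi = cast(short4) _mm_mulhi_pi16(A, B);
    short4 lo = cast(short4) _mm_mullo_pi16(A, B);
    static immutable int[4] products = [-750_000_000, -1_234_500, 9, 0];
    foreach (i; 0 .. 4)
    {
        // high half shifted back up, low half zero-extended
        int full = (cast(int)hi.array[i] << 16) | cast(ushort)lo.array[i];
        assert(full == products[i]);
    }
}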

/// Multiply the packed 16-bit integers in `a` and `b`, producing intermediate 32-bit integers,
/// and store the low 16 bits of the intermediate integers.
__m64 _mm_mullo_pi16 (__m64 a, __m64 b) pure @safe
{
    return to_m64(_mm_mullo_epi16(to_m128i(a), to_m128i(b)));
}
unittest
{
    __m64 A = _mm_setr_pi16(4, 1, 16, 7);
    __m64 B = _mm_set1_pi16(16384);
    short4 R = cast(short4)_mm_mullo_pi16(A, B);
    short[4] correct = [0, 16384, 0, -16384];
    assert(R.array == correct);
}

/// Compute the bitwise OR of 64 bits in `a` and `b`.
__m64 _mm_or_si64 (__m64 a, __m64 b) pure @safe
{
    return a | b;
}
unittest
{
    __m64 A = _mm_setr_pi16(255, 1, -1, 0);
    __m64 B = _mm_set1_pi16(15);
    short4 R = cast(short4)_mm_or_si64(A, B);
    short[4] correct =     [255, 15, -1, 15];
    assert(R.array == correct);
}

/// Convert packed 16-bit integers from `a` and `b` to packed 8-bit integers using signed saturation.
__m64 _mm_packs_pi16 (__m64 a, __m64 b) pure @trusted
{
    int4 p = cast(int4) _mm_packs_epi16(to_m128i(a), to_m128i(b));
    int2 r;
    r.ptr[0] = p.array[0];
    r.ptr[1] = p.array[2];
    return cast(__m64)r;
}
unittest
{
    __m64 A = _mm_setr_pi16(256, -129, 254, 0);
    byte8 R = cast(byte8) _mm_packs_pi16(A, A);
    byte[8] correct = [127, -128, 127, 0, 127, -128, 127, 0];
    assert(R.array == correct);
}

/// Convert packed 32-bit integers from `a` and `b` to packed 16-bit integers using signed saturation.
__m64 _mm_packs_pi32 (__m64 a, __m64 b) pure @trusted
{
    int4 p = cast(int4) _mm_packs_epi32(to_m128i(a), to_m128i(b));
    int2 r;
    r.ptr[0] = p.array[0];
    r.ptr[1] = p.array[2];
    return cast(__m64)r;
}
unittest
{
    __m64 A = _mm_setr_pi32(100000, -100000);
    short4 R = cast(short4) _mm_packs_pi32(A, A);
    short[4] correct = [32767, -32768, 32767, -32768];
    assert(R.array == correct);
}

/// Convert packed 16-bit integers from `a` and `b` to packed 8-bit integers using unsigned saturation.
__m64 _mm_packs_pu16 (__m64 a, __m64 b) pure @trusted
{
    int4 p = cast(int4) _mm_packus_epi16(to_m128i(a), to_m128i(b));
    int2 r;
    r.ptr[0] = p.array[0];
    r.ptr[1] = p.array[2];
    return cast(__m64)r;
}
unittest
{
    __m64 A = _mm_setr_pi16(256, -129, 254, 0);
    byte8 R = cast(byte8) _mm_packs_pu16(A, A);
    ubyte[8] correct = [255, 0, 254, 0, 255, 0, 254, 0];
    assert(R.array == cast(byte[8])correct);
}

deprecated alias
    _m_packssdw = _mm_packs_pi32,
    _m_packsswb = _mm_packs_pi16,
    _m_packuswb = _mm_packs_pu16,
    _m_paddb = _mm_add_pi8,
    _m_paddd = _mm_add_pi32,
    _m_paddsb = _mm_adds_pi8,
    _m_paddsw = _mm_adds_pi16,
    _m_paddusb = _mm_adds_pu8,
    _m_paddusw = _mm_adds_pu16,
    _m_paddw = _mm_add_pi16,
    _m_pand = _mm_and_si64,
    _m_pandn = _mm_andnot_si64,
    _m_pcmpeqb = _mm_cmpeq_pi8,
    _m_pcmpeqd = _mm_cmpeq_pi32,
    _m_pcmpeqw = _mm_cmpeq_pi16,
    _m_pcmpgtb = _mm_cmpgt_pi8,
    _m_pcmpgtd = _mm_cmpgt_pi32,
    _m_pcmpgtw = _mm_cmpgt_pi16,
    _m_pmaddwd = _mm_madd_pi16,
    _m_pmulhw = _mm_mulhi_pi16,
    _m_pmullw = _mm_mullo_pi16,
    _m_por = _mm_or_si64,
    _m_pslld = _mm_sll_pi32,
    _m_pslldi = _mm_slli_pi32,
    _m_psllq = _mm_sll_si64,
    _m_psllqi = _mm_slli_si64,
    _m_psllw = _mm_sll_pi16,
    _m_psllwi = _mm_slli_pi16,
    _m_psrad = _mm_sra_pi32,
    _m_psradi = _mm_srai_pi32,
    _m_psraw = _mm_sra_pi16,
    _m_psrawi = _mm_srai_pi16,
    _m_psrld = _mm_srl_pi32,
    _m_psrldi = _mm_srli_pi32,
    _m_psrlq = _mm_srl_si64,
    _m_psrlqi = _mm_srli_si64,
    _m_psrlw = _mm_srl_pi16,
    _m_psrlwi = _mm_srli_pi16,
    _m_psubb = _mm_sub_pi8,
    _m_psubd = _mm_sub_pi32,
    _m_psubsb = _mm_subs_pi8,
    _m_psubsw = _mm_subs_pi16,
    _m_psubusb = _mm_subs_pu8,
    _m_psubusw = _mm_subs_pu16,
    _m_psubw = _mm_sub_pi16,
    _m_punpckhbw = _mm_unpackhi_pi8,
    _m_punpckhdq = _mm_unpackhi_pi32,
    _m_punpckhwd = _mm_unpackhi_pi16,
    _m_punpcklbw = _mm_unpacklo_pi8,
    _m_punpckldq = _mm_unpacklo_pi32,
    _m_punpcklwd = _mm_unpacklo_pi16,
    _m_pxor = _mm_xor_si64;

/// Set packed 16-bit integers with the supplied values.
__m64 _mm_set_pi16 (short e3, short e2, short e1, short e0) pure @trusted
{
    short[4] arr = [e0, e1, e2, e3];
    return *cast(__m64*)(arr.ptr);
}
unittest
{
    short4 R = cast(short4) _mm_set_pi16(3, 2, 1, 0);
    short[4] correct = [0, 1, 2, 3];
    assert(R.array == correct);
}

/// Set packed 32-bit integers with the supplied values.
__m64 _mm_set_pi32 (int e1, int e0) pure @trusted
{
    int[2] arr = [e0, e1];
    return *cast(__m64*)(arr.ptr);
}
unittest
{
    int2 R = cast(int2) _mm_set_pi32(1, 0);
    int[2] correct = [0, 1];
    assert(R.array == correct);
}

/// Set packed 8-bit integers with the supplied values.
__m64 _mm_set_pi8 (byte e7, byte e6, byte e5, byte e4, byte e3, byte e2, byte e1, byte e0) pure @trusted
{
    byte[8] arr = [e0, e1, e2, e3, e4, e5, e6, e7];
    return *cast(__m64*)(arr.ptr);
}
unittest
{
    byte8 R = cast(byte8) _mm_set_pi8(7, 6, 5, 4, 3, 2, 1, 0);
    byte[8] correct = [0, 1, 2, 3, 4, 5, 6, 7];
    assert(R.array == correct);
}

/// Broadcast 16-bit integer `a` to all elements.
__m64 _mm_set1_pi16 (short a) pure @trusted
{
    return cast(__m64)(short4(a));
}
unittest
{
    short4 R = cast(short4) _mm_set1_pi16(44);
    short[4] correct = [44, 44, 44, 44];
    assert(R.array == correct);
}

/// Broadcast 32-bit integer `a` to all elements.
__m64 _mm_set1_pi32 (int a) pure @trusted
{
    return cast(__m64)(int2(a));
}
unittest
{
    int2 R = cast(int2) _mm_set1_pi32(43);
    int[2] correct = [43, 43];
    assert(R.array == correct);
}

/// Broadcast 8-bit integer `a` to all elements.
__m64 _mm_set1_pi8 (byte a) pure @trusted
{
    return cast(__m64)(byte8(a));
}
unittest
{
    byte8 R = cast(byte8) _mm_set1_pi8(42);
    byte[8] correct = [42, 42, 42, 42, 42, 42, 42, 42];
    assert(R.array == correct);
}

/// Set packed 16-bit integers with the supplied values in reverse order.
__m64 _mm_setr_pi16 (short e3, short e2, short e1, short e0) pure @trusted
{
    short[4] arr = [e3, e2, e1, e0];
    return *cast(__m64*)(arr.ptr);
}
unittest
{
    short4 R = cast(short4) _mm_setr_pi16(0, 1, 2, 3);
    short[4] correct = [0, 1, 2, 3];
    assert(R.array == correct);
}

/// Set packed 32-bit integers with the supplied values in reverse order.
__m64 _mm_setr_pi32 (int e1, int e0) pure @trusted
{
    int[2] arr = [e1, e0];
    return *cast(__m64*)(arr.ptr);
}
unittest
{
    int2 R = cast(int2) _mm_setr_pi32(0, 1);
    int[2] correct = [0, 1];
    assert(R.array == correct);
}

/// Set packed 8-bit integers with the supplied values in reverse order.
__m64 _mm_setr_pi8 (byte e7, byte e6, byte e5, byte e4, byte e3, byte e2, byte e1, byte e0) pure @trusted
{
    byte[8] arr = [e7, e6, e5, e4, e3, e2, e1, e0];
    return *cast(__m64*)(arr.ptr);
}
unittest
{
    byte8 R = cast(byte8) _mm_setr_pi8(0, 1, 2, 3, 4, 5, 6, 7);
    byte[8] correct = [0, 1, 2, 3, 4, 5, 6, 7];
    assert(R.array == correct);
}

/// Return vector of type `__m64` with all elements set to zero.
__m64 _mm_setzero_si64 () pure @trusted
{
    __m64 r;
    r.ptr[0] = 0;
    return r;
}
unittest
{
    __m64 R = _mm_setzero_si64();
    assert(R.array[0] == 0);
}

/// Shift packed 16-bit integers in `a` left by `bits` while shifting in zeros.
__m64 _mm_sll_pi16 (__m64 a, __m64 bits) pure @safe
{
    return to_m64(_mm_sll_epi16(to_m128i(a), to_m128i(bits)));
}
unittest
{
    __m64 A = _mm_setr_pi16(-4, -5, 6, 7);
    short4 B = cast(short4)( _mm_sll_pi16(A, _mm_cvtsi64_m64(1)) );
    short[4] correct = [ -8, -10, 12, 14 ];
    assert(B.array == correct);
}

/// Shift packed 32-bit integers in `a` left by `bits` while shifting in zeros.
__m64 _mm_sll_pi32 (__m64 a, __m64 bits) pure @safe
{
    return to_m64(_mm_sll_epi32(to_m128i(a), to_m128i(bits)));
}
unittest
{
    __m64 A = _mm_setr_pi32(-4, 5);
    int2 B = cast(int2)( _mm_sll_pi32(A, _mm_cvtsi64_m64(1)) );
    int[2] correct = [ -8, 10 ];
    assert(B.array == correct);
}

/// Shift 64-bit integer `a` left by `bits` while shifting in zeros.
__m64 _mm_sll_si64 (__m64 a, __m64 bits) pure @safe
{
    return to_m64(_mm_sll_epi64(to_m128i(a), to_m128i(bits)));
}
unittest
{
    __m64 A = _mm_cvtsi64_m64(-1);
    long1 R = cast(long1)( _mm_sll_si64(A, _mm_cvtsi64_m64(1)) );
    long[1] correct = [ -2 ];
    assert(R.array == correct);
}

/// Shift packed 16-bit integers in `a` left by `bits` while shifting in zeros.
__m64 _mm_slli_pi16 (__m64 a, int bits) pure @safe
{
    return to_m64(_mm_slli_epi16(to_m128i(a), bits));
}
unittest
{
    __m64 A = _mm_setr_pi16(-4, -5, 6, 7);
    short4 B = cast(short4)( _mm_slli_pi16(A, 1) );
    short[4] correct = [ -8, -10, 12, 14 ];
    assert(B.array == correct);
}

/// Shift packed 32-bit integers in `a` left by `bits` while shifting in zeros.
__m64 _mm_slli_pi32 (__m64 a, int bits) pure @safe
{
    return to_m64(_mm_slli_epi32(to_m128i(a), bits));
}
unittest
{
    __m64 A = _mm_setr_pi32(-4, 5);
    int2 B = cast(int2)( _mm_slli_pi32(A, 1) );
    int[2] correct = [ -8, 10 ];
    assert(B.array == correct);
}

/// Shift 64-bit integer `a` left by `bits` while shifting in zeros.
__m64 _mm_slli_si64 (__m64 a, int bits) pure @safe
{
    return to_m64(_mm_slli_epi64(to_m128i(a), bits));
}
unittest
{
    __m64 A = _mm_cvtsi64_m64(-1);
    long1 R = cast(long1)( _mm_slli_si64(A, 1) );
    long[1] correct = [ -2 ];
    assert(R.array == correct);
}

/// Shift packed 16-bit integers in `a` right by `bits` while shifting in sign bits.
__m64 _mm_sra_pi16 (__m64 a, __m64 bits) pure @safe
{
    return to_m64(_mm_sra_epi16(to_m128i(a), to_m128i(bits)));
}
unittest
{
    __m64 A = _mm_setr_pi16(-4, -5, 6, 7);
    short4 B = cast(short4)( _mm_sra_pi16(A, _mm_cvtsi64_m64(1)) );
    short[4] correct = [ -2, -3, 3, 3 ];
    assert(B.array == correct);
}

/// Shift packed 32-bit integers in `a` right by `bits` while shifting in sign bits.
__m64 _mm_sra_pi32 (__m64 a, __m64 bits) pure @safe
{
    return to_m64(_mm_sra_epi32(to_m128i(a), to_m128i(bits)));
}
unittest
{
    __m64 A = _mm_setr_pi32(-4, 5);
    int2 B = cast(int2)( _mm_sra_pi32(A, _mm_cvtsi64_m64(1)) );
    int[2] correct = [ -2, 2 ];
    assert(B.array == correct);
}

/// Shift packed 16-bit integers in `a` right by `bits` while shifting in sign bits.
__m64 _mm_srai_pi16 (__m64 a, int bits) pure @safe
{
    return to_m64(_mm_srai_epi16(to_m128i(a), bits));
}
unittest
{
    __m64 A = _mm_setr_pi16(-4, -5, 6, 7);
    short4 B = cast(short4)( _mm_srai_pi16(A, 1) );
    short[4] correct = [ -2, -3, 3, 3 ];
    assert(B.array == correct);
}

/// Shift packed 32-bit integers in `a` right by `bits` while shifting in sign bits.
__m64 _mm_srai_pi32 (__m64 a, int bits) pure @safe
{
    return to_m64(_mm_srai_epi32(to_m128i(a), bits));
}
unittest
{
    __m64 A = _mm_setr_pi32(-4, 5);
    int2 B = cast(int2)( _mm_srai_pi32(A, 1) );
    int[2] correct = [ -2, 2 ];
    assert(B.array == correct);
}

/// Shift packed 16-bit integers in `a` right by `bits` while shifting in zeros.
__m64 _mm_srl_pi16 (__m64 a, __m64 bits) pure @safe
{
    return to_m64(_mm_srl_epi16(to_m128i(a), to_m128i(bits)));
}
unittest
{
    __m64 A = _mm_setr_pi16(-4, -5, 6, 7);
    short4 B = cast(short4)( _mm_srl_pi16(A, _mm_cvtsi64_m64(1)) );
    short[4] correct = [ 0x7ffe, 0x7ffd, 3, 3 ];
    assert(B.array == correct);
}

/// Shift packed 32-bit integers in `a` right by `bits` while shifting in zeros.
__m64 _mm_srl_pi32 (__m64 a, __m64 bits) pure @safe
{
    return to_m64(_mm_srl_epi32(to_m128i(a), to_m128i(bits)));
}
unittest
{
    __m64 A = _mm_setr_pi32(-4, 5);
    int2 B = cast(int2)( _mm_srl_pi32(A, _mm_cvtsi64_m64(1)) );
    int[2] correct = [ 0x7ffffffe, 2 ];
    assert(B.array == correct);
}

/// Shift 64-bit integer `a` right by `bits` while shifting in zeros.
__m64 _mm_srl_si64 (__m64 a, __m64 bits) pure @safe
{
    return to_m64(_mm_srl_epi64(to_m128i(a), to_m128i(bits)));
}
unittest
{
    __m64 A = _mm_cvtsi64_m64(-1);
    long1 R = cast(long1)( _mm_srl_si64(A, _mm_cvtsi64_m64(1)) );
    long[1] correct = [ 0x7fff_ffff_ffff_ffff ];
    assert(R.array == correct);
}

/// Shift packed 16-bit integers in `a` right by `bits` while shifting in zeros.
__m64 _mm_srli_pi16 (__m64 a, int bits) pure @safe
{
    return to_m64(_mm_srli_epi16(to_m128i(a), bits));
}
unittest
{
    __m64 A = _mm_setr_pi16(-4, -5, 6, 7);
    short4 B = cast(short4)( _mm_srli_pi16(A, 1) );
    short[4] correct = [ 0x7ffe, 0x7ffd, 3, 3 ];
    assert(B.array == correct);
}

/// Shift packed 32-bit integers in `a` right by `bits` while shifting in zeros.
__m64 _mm_srli_pi32 (__m64 a, int bits) pure @safe
{
    return to_m64(_mm_srli_epi32(to_m128i(a), bits));
}
unittest
{
    __m64 A = _mm_setr_pi32(-4, 5);
    int2 B = cast(int2)( _mm_srli_pi32(A, 1) );
    int[2] correct = [ 0x7ffffffe, 2 ];
    assert(B.array == correct);
}

/// Shift 64-bit integer `a` right by `bits` while shifting in zeros.
__m64 _mm_srli_si64 (__m64 a, int bits) pure @safe
{
    return to_m64(_mm_srli_epi64(to_m128i(a), bits));
}
unittest
{
    __m64 A = _mm_cvtsi64_m64(-1);
    long1 R = cast(long1)( _mm_srli_si64(A, 1) );
    long[1] correct = [ 0x7fff_ffff_ffff_ffff ];
    assert(R.array == correct);
}

/// Subtract packed 16-bit integers in `b` from packed 16-bit integers in `a`.
__m64 _mm_sub_pi16 (__m64 a, __m64 b) pure @safe
{
    return cast(__m64)(cast(short4)a - cast(short4)b);
}
unittest
{
    short4 R = cast(short4) _mm_sub_pi16(_mm_setr_pi16(cast(short)65534,  1, 5, -32768),
                                         _mm_setr_pi16(cast(short)65535, 16, 4, 4));
    static immutable short[4] correct =                            [ -1,-15, 1, 32764];
    assert(R.array == correct);
}

/// Subtract packed 32-bit integers in `b` from packed 32-bit integers in `a`.
__m64 _mm_sub_pi32 (__m64 a, __m64 b) pure @safe
{
    return cast(__m64)(cast(int2)a - cast(int2)b);
}
unittest
{
    int2 R = cast(int2) _mm_sub_pi32(_mm_setr_pi32( 10,   4),
                                     _mm_setr_pi32( 15, -70));
    static immutable int[2] correct =             [ -5,  74];
    assert(R.array == correct);
}

/// Subtract packed 8-bit integers in `b` from packed 8-bit integers in `a`.
__m64 _mm_sub_pi8 (__m64 a, __m64 b) pure @safe
{
    return cast(__m64)(cast(byte8)a - cast(byte8)b);
}
unittest
{
    byte8 R = cast(byte8) _mm_sub_pi8(_mm_setr_pi8(cast(byte)254, 127, 13, 12, 11, 10, 9, -128),
                                      _mm_setr_pi8(cast(byte)255, 120, 14, 42, 11, 10, 9, 8));
    static immutable byte[8] correct =                 [      -1,   7, -1,-30,  0,  0, 0, 120 ];
    assert(R.array == correct);
}

/// Subtract packed 16-bit integers in `b` from packed 16-bit integers in `a` using saturation.
__m64 _mm_subs_pi16 (__m64 a, __m64 b) pure @safe
{
    return to_m64(_mm_subs_epi16(to_m128i(a), to_m128i(b)));
}
unittest
{
    short4 R = cast(short4) _mm_subs_pi16(_mm_setr_pi16(cast(short)65534,  1, 5, -32768),
                                          _mm_setr_pi16(cast(short)65535, 16, 4, 4));
    static immutable short[4] correct =                             [ -1,-15, 1, -32768];
    assert(R.array == correct);
}

/// Subtract packed 8-bit integers in `b` from packed 8-bit integers in `a` using saturation.
__m64 _mm_subs_pi8 (__m64 a, __m64 b) pure @safe
{
    return to_m64(_mm_subs_epi8(to_m128i(a), to_m128i(b)));
}
unittest
{
    byte8 R = cast(byte8) _mm_subs_pi8(_mm_setr_pi8(cast(byte)254, 127, 13, 12, 11, 10, 9, -128),
                                       _mm_setr_pi8(cast(byte)255, 120, 14, 42, 11, 10, 9, 8));
    static immutable byte[8] correct =                 [       -1,   7, -1,-30,  0,  0, 0, -128 ];
    assert(R.array == correct);
}

/// Subtract packed unsigned 16-bit integers in `b` from packed unsigned 16-bit integers in `a`
/// using saturation.
__m64 _mm_subs_pu16 (__m64 a, __m64 b) pure @safe
{
    return to_m64(_mm_subs_epu16(to_m128i(a), to_m128i(b)));
}
unittest
{
    short4 R = cast(short4) _mm_subs_pu16(_mm_setr_pi16(cast(short)65534,  1, 5, 4),
                                          _mm_setr_pi16(cast(short)65535, 16, 4, 4));
    static immutable short[4] correct =                              [ 0,  0, 1, 0];
    assert(R.array == correct);
}

/// Subtract packed unsigned 8-bit integers in `b` from packed unsigned 8-bit integers in `a`
/// using saturation.
__m64 _mm_subs_pu8 (__m64 a, __m64 b) pure @safe
{
    return to_m64(_mm_subs_epu8(to_m128i(a), to_m128i(b)));
}
unittest
{
    byte8 R = cast(byte8) _mm_subs_pu8(_mm_setr_pi8(cast(byte)254, 127, 13, 12, 11, 10, 9, 8),
                                       _mm_setr_pi8(cast(byte)255, 120, 14, 42, 11, 10, 9, 8));
    static immutable byte[8] correct =                 [        0,   7,  0,  0,  0,  0, 0, 0];
    assert(R.array == correct);
}

deprecated alias _m_to_int = _mm_cvtsi64_si32;
deprecated alias _m_to_int64 = _mm_cvtm64_si64;

/// Unpack and interleave 16-bit integers from the high half of `a` and `b`.
__m64 _mm_unpackhi_pi16 (__m64 a, __m64 b) pure @trusted
{
    version(LDC)
    {
        // avoiding this shufflevector leads to bad performance on LDC
        return cast(__m64) shufflevector!(short4, 2, 6, 3, 7)(cast(short4)a, cast(short4)b);
    }
    else
    {
        short4 ia = cast(short4)a;
        short4 ib = cast(short4)b;
        short4 r;
        r.ptr[0] = ia.array[2];
        r.ptr[1] = ib.array[2];
        r.ptr[2] = ia.array[3];
        r.ptr[3] = ib.array[3];
        return cast(__m64)r;
    }
}
unittest
{
    __m64 A = _mm_setr_pi16(4, 8, -16, 7);
    __m64 B = _mm_setr_pi16(5, 9,  -3, 10);
    short4 R = cast(short4) _mm_unpackhi_pi16(A, B);
    short[4] correct = [-16, -3, 7, 10];
    assert(R.array == correct);
}

/// Unpack and interleave 32-bit integers from the high half of `a` and `b`.
__m64 _mm_unpackhi_pi32 (__m64 a, __m64 b) pure @trusted
{
    // Generate punpckldq as far back as LDC 1.0.0 -O1
    // (Yes, LLVM does generate punpckldq to reuse SSE2 instructions)
    int2 ia = cast(int2)a;
    int2 ib = cast(int2)b;
    int2 r;
    r.ptr[0] = ia.array[1];
    r.ptr[1] = ib.array[1];
    return cast(__m64)r;
}
unittest
{
    __m64 A = _mm_setr_pi32(4, 8);
    __m64 B = _mm_setr_pi32(5, 9);
    int2 R = cast(int2) _mm_unpackhi_pi32(A, B);
    int[2] correct = [8, 9];
    assert(R.array == correct);
}

/// Unpack and interleave 8-bit integers from the high half of `a` and `b`.
__m64 _mm_unpackhi_pi8 (__m64 a, __m64 b) pure @trusted
{
    version(LDC)
    {
        return cast(__m64) shufflevector!(byte8, 4, 12, 5, 13, 6, 14, 7, 15)(cast(byte8)a, cast(byte8)b);
    }
    else
    {
        byte8 ia = cast(byte8)a;
        byte8 ib = cast(byte8)b;
        byte8 r;
        r.ptr[0] = ia.array[4];
        r.ptr[1] = ib.array[4];
        r.ptr[2] = ia.array[5];
        r.ptr[3] = ib.array[5];
        r.ptr[4] = ia.array[6];
        r.ptr[5] = ib.array[6];
        r.ptr[6] = ia.array[7];
        r.ptr[7] = ib.array[7];
        return cast(__m64)r;
    }
}
unittest
{
    __m64 A = _mm_setr_pi8( 1,  2,  3,  4,  5,  6,  7,  8);
    __m64 B = _mm_setr_pi8(-1, -2, -3, -4, -5, -6, -7, -8);
    byte8 R = cast(byte8) _mm_unpackhi_pi8(A, B);
    byte[8] correct = [5, -5, 6, -6, 7, -7, 8, -8];
    assert(R.array == correct);
}

/// Unpack and interleave 16-bit integers from the low half of `a` and `b`.
__m64 _mm_unpacklo_pi16 (__m64 a, __m64 b) pure @trusted
{
    // Generates punpcklwd since LDC 1.0.0 -O1
    short4 ia = cast(short4)a;
    short4 ib = cast(short4)b;
    short4 r;
    r.ptr[0] = ia.array[0];
    r.ptr[1] = ib.array[0];
    r.ptr[2] = ia.array[1];
    r.ptr[3] = ib.array[1];
    return cast(__m64)r;
}
unittest
{
    __m64 A = _mm_setr_pi16(4, 8, -16, 7);
    __m64 B = _mm_setr_pi16(5, 9,  -3, 10);
    short4 R = cast(short4) _mm_unpacklo_pi16(A, B);
    short[4] correct = [4, 5, 8, 9];
    assert(R.array == correct);
}

/// Unpack and interleave 32-bit integers from the low half of `a` and `b`.
__m64 _mm_unpacklo_pi32 (__m64 a, __m64 b) pure @trusted
{
    // Generate punpckldq as far back as LDC 1.0.0 -O1
    int2 ia = cast(int2)a;
    int2 ib = cast(int2)b;
    int2 r;
    r.ptr[0] = ia.array[0];
    r.ptr[1] = ib.array[0];
    return cast(__m64)r;
}
unittest
{
    __m64 A = _mm_setr_pi32(4, 8);
    __m64 B = _mm_setr_pi32(5, 9);
    int2 R = cast(int2) _mm_unpacklo_pi32(A, B);
    int[2] correct = [4, 5];
    assert(R.array == correct);
}

/// Unpack and interleave 8-bit integers from the low half of `a` and `b`.
__m64 _mm_unpacklo_pi8 (__m64 a, __m64 b) pure @trusted
{
    version(LDC)
    {
        return cast(__m64) shufflevector!(byte8, 0, 8, 1, 9, 2, 10, 3, 11)(cast(byte8)a, cast(byte8)b);
    }
    else
    {
        byte8 ia = cast(byte8)a;
        byte8 ib = cast(byte8)b;
        byte8 r;
        r.ptr[0] = ia.array[0];
        r.ptr[1] = ib.array[0];
        r.ptr[2] = ia.array[1];
        r.ptr[3] = ib.array[1];
        r.ptr[4] = ia.array[2];
        r.ptr[5] = ib.array[2];
        r.ptr[6] = ia.array[3];
        r.ptr[7] = ib.array[3];
        return cast(__m64)r;
    }
}
unittest
{
    __m64 A = _mm_setr_pi8( 1,  2,  3,  4,  5,  6,  7,  8);
    __m64 B = _mm_setr_pi8(-1, -2, -3, -4, -5, -6, -7, -8);
    byte8 R = cast(byte8) _mm_unpacklo_pi8(A, B);
    byte[8] correct = [1, -1, 2, -2, 3, -3, 4, -4];
    assert(R.array == correct);
}
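
// A classic use of the low unpack: interleaving with a zero vector
// zero-extends packed bytes into 16-bit lanes (a sketch using only
// intrinsics from this module).
unittest
{
    __m64 packed = _mm_setr_pi8(cast(byte)200, cast(byte)150, 7, 0, 0, 0, 0, 0);
    short4 widened = cast(short4) _mm_unpacklo_pi8(packed, _mm_setzero_si64());
    short[4] correct = [200, 150, 7, 0];
    assert(widened.array == correct);
}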

/// Compute the bitwise XOR of 64 bits (representing integer data) in `a` and `b`.
__m64 _mm_xor_si64 (__m64 a, __m64 b) pure @safe
{
    return a ^ b;
}
unittest
{
    __m64 A = _mm_setr_pi16(255, 1, -1, 0);
    __m64 B = _mm_set1_pi16(15);
    short4 R = cast(short4)_mm_xor_si64(A, B);
    short[4] correct =     [240, 14, -16, 15];
    assert(R.array == correct);
}
