1 /**
2 * Copyright: Copyright Auburn Sounds 2019.
3 * License:   $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
4 * Authors:   Guillaume Piolat
5 */
6 module inteli.mmx;
7 
8 public import inteli.types;
9 import inteli.internals;
10 
11 import inteli.xmmintrin;
12 import inteli.emmintrin;
13 
14 nothrow @nogc:
15 
// Important: you do not need to call _mm_empty when using the MMX capabilities of
// intel-intrinsics. The library only implements the semantics of each intrinsic and
// generates the corresponding IR; cleaning up the FPU registers is the code generator's job.
19 
20 
21 /// Add packed 16-bit integers in `a` and `b`.
__m64 _mm_add_pi16 (__m64 a, __m64 b) pure @safe
23 {
24     return cast(__m64)(cast(short4)a + cast(short4)b);
25 }
26 unittest
27 {
28     short4 R = cast(short4) _mm_add_pi16(_mm_set1_pi16(4), _mm_set1_pi16(3));
29     short[4] correct = [7, 7, 7, 7];
30     assert(R.array == correct);
31 }
32 
33 /// Add packed 32-bit integers in `a` and `b`.
__m64 _mm_add_pi32 (__m64 a, __m64 b) pure @safe
35 {
36     return cast(__m64)(cast(int2)a + cast(int2)b);
37 }
38 unittest
39 {
40     int2 R = cast(int2) _mm_add_pi32(_mm_set1_pi32(4), _mm_set1_pi32(3));
41     int[2] correct = [7, 7];
42     assert(R.array == correct);
43 }
44 
45 /// Add packed 8-bit integers in `a` and `b`.
__m64 _mm_add_pi8 (__m64 a, __m64 b) pure @safe
47 {
48     return cast(__m64)(cast(byte8)a + cast(byte8)b);
49 }
50 unittest
51 {
52     byte8 R = cast(byte8) _mm_add_pi8(_mm_set1_pi8(127), _mm_set1_pi8(-128));
53     byte[8] correct = [-1, -1, -1, -1, -1, -1, -1, -1];
54     assert(R.array == correct);
55 }
56 
57 /// Add packed 16-bit integers in `a` and `b` using signed saturation.
58 // PERF: PADDSW not generated
59 __m64 _mm_adds_pi16(__m64 a, __m64 b) pure @trusted
60 {
61     return to_m64(_mm_adds_epi16(to_m128i(a), to_m128i(b)));
62 }
63 unittest
64 {
65     short4 res = cast(short4) _mm_adds_pi16(_mm_set_pi16(3, 2, 1, 0),
66                                             _mm_set_pi16(3, 2, 1, 0));
67     static immutable short[4] correctResult = [0, 2, 4, 6];
68     assert(res.array == correctResult);
69 }
70 
71 /// Add packed 8-bit integers in `a` and `b` using signed saturation.
72 // PERF: PADDSB not generated
73 __m64 _mm_adds_pi8(__m64 a, __m64 b) pure @trusted
74 {
75     return to_m64(_mm_adds_epi8(to_m128i(a), to_m128i(b)));
76 }
77 unittest
78 {
79     byte8 res = cast(byte8) _mm_adds_pi8(_mm_set_pi8(7, 6, 5, 4, 3, 2, 1, 0),
80                                          _mm_set_pi8(7, 6, 5, 4, 3, 2, 1, 0));
81     static immutable byte[8] correctResult = [0, 2, 4, 6, 8, 10, 12, 14];
82     assert(res.array == correctResult);
83 }
84 
85 /// Add packed 16-bit integers in `a` and `b` using unsigned saturation.
86 // PERF: PADDUSW not generated
87 __m64 _mm_adds_pu16(__m64 a, __m64 b) pure @trusted
88 {
89     return to_m64(_mm_adds_epu16(to_m128i(a), to_m128i(b)));
90 }
91 unittest
92 {
93     short4 res = cast(short4) _mm_adds_pu16(_mm_set_pi16(3, 2, cast(short)65535, 0),
94                                             _mm_set_pi16(3, 2, 1, 0));
95     static immutable short[4] correctResult = [0, cast(short)65535, 4, 6];
96     assert(res.array == correctResult);
97 }
98 
99 /// Add packed 8-bit integers in `a` and `b` using unsigned saturation.
100 // PERF: PADDUSB not generated
101 __m64 _mm_adds_pu8(__m64 a, __m64 b) pure @trusted
102 {
103     return to_m64(_mm_adds_epu8(to_m128i(a), to_m128i(b)));
104 }
105 unittest
106 {
107     byte8 res = cast(byte8) _mm_adds_pu8(_mm_set_pi8(7, 6, 5, 4, 3, 2, cast(byte)255, 0),
108                                          _mm_set_pi8(7, 6, 5, 4, 3, 2, 1, 0));
109     static immutable byte[8] correctResult = [0, cast(byte)255, 4, 6, 8, 10, 12, 14];
110     assert(res.array == correctResult);
111 }
112 
113 /// Compute the bitwise AND of 64 bits (representing integer data) in `a` and `b`.
114 __m64 _mm_and_si64 (__m64 a, __m64 b) pure @safe
115 {
116     return a & b;
117 }
118 unittest
119 {
120     __m64 A = [7];
121     __m64 B = [14];
122     __m64 R = _mm_and_si64(A, B);
123     assert(R[0] == 6);
124 }
125 
126 /// Compute the bitwise NOT of 64 bits (representing integer data) in `a` and then AND with `b`.
__m64 _mm_andnot_si64 (__m64 a, __m64 b) pure @safe
128 {
129     return (~a) & b;
130 }
131 unittest
132 {
133     __m64 A = [7];
134     __m64 B = [14];
135     __m64 R = _mm_andnot_si64(A, B);
136     assert(R[0] == 8);
137 }
138 
139 
/// Compare packed 16-bit integers in `a` and `b` for equality.
__m64 _mm_cmpeq_pi16 (__m64 a, __m64 b) pure @safe
141 {
142     return cast(__m64) equalMask!short4(cast(short4)a, cast(short4)b);
143 }
144 unittest
145 {
146     short4   A = [-3, -2, -1,  0];
147     short4   B = [ 4,  3,  2,  1];
148     short[4] E = [ 0,  0,  0,  0];
149     short4   R = cast(short4)(_mm_cmpeq_pi16(cast(__m64)A, cast(__m64)B));
150     assert(R.array == E);
151 }
152 
/// Compare packed 32-bit integers in `a` and `b` for equality.
__m64 _mm_cmpeq_pi32 (__m64 a, __m64 b) pure @safe
154 {
155     return cast(__m64) equalMask!int2(cast(int2)a, cast(int2)b);
156 }
157 unittest
158 {
159     int2   A = [-3, -2];
160     int2   B = [ 4, -2];
161     int[2] E = [ 0, -1];
162     int2   R = cast(int2)(_mm_cmpeq_pi32(cast(__m64)A, cast(__m64)B));
163     assert(R.array == E);
164 }
165 
/// Compare packed 8-bit integers in `a` and `b` for equality.
__m64 _mm_cmpeq_pi8 (__m64 a, __m64 b) pure @safe
167 {
168     return cast(__m64) equalMask!byte8(cast(byte8)a, cast(byte8)b);
169 }
170 unittest
171 {
172     __m64 A = _mm_setr_pi8(1, 2, 3, 1, 2, 1, 1, 2);
173     __m64 B = _mm_setr_pi8(2, 2, 1, 2, 3, 1, 2, 3);
174     byte8 C = cast(byte8) _mm_cmpeq_pi8(A, B);
175     byte[8] correct =     [0,-1, 0, 0, 0,-1, 0, 0];
176     assert(C.array == correct);
177 }
178 
/// Compare packed signed 16-bit integers in `a` and `b` for greater-than.
__m64 _mm_cmpgt_pi16 (__m64 a, __m64 b) pure @safe
180 {
181     return cast(__m64) greaterMask!short4(cast(short4)a, cast(short4)b);
182 }
183 unittest
184 {
185     short4   A = [-3, -2, -1,  0];
186     short4   B = [ 4,  3,  2,  1];
187     short[4] E = [ 0,  0,  0,  0];
188     short4   R = cast(short4)(_mm_cmpgt_pi16(cast(__m64)A, cast(__m64)B));
189     assert(R.array == E);
190 }
191 
/// Compare packed signed 32-bit integers in `a` and `b` for greater-than.
__m64 _mm_cmpgt_pi32 (__m64 a, __m64 b) pure @safe
193 {
194     return cast(__m64) greaterMask!int2(cast(int2)a, cast(int2)b);
195 }
196 unittest
197 {
198     int2   A = [-3,  2];
199     int2   B = [ 4, -2];
200     int[2] E = [ 0, -1];
201     int2   R = cast(int2)(_mm_cmpgt_pi32(cast(__m64)A, cast(__m64)B));
202     assert(R.array == E);
203 }
204 
/// Compare packed signed 8-bit integers in `a` and `b` for greater-than.
__m64 _mm_cmpgt_pi8 (__m64 a, __m64 b) pure @safe
206 {
207     return cast(__m64) greaterMask!byte8(cast(byte8)a, cast(byte8)b);
208 }
209 unittest
210 {
211     __m64 A = _mm_setr_pi8(1, 2, 3, 1, 2, 1, 1, 2);
212     __m64 B = _mm_setr_pi8(2, 2, 1, 2, 3, 1, 2, 3);
213     byte8 C = cast(byte8) _mm_cmpgt_pi8(A, B);
214     byte[8] correct =     [0, 0,-1, 0, 0, 0, 0, 0];
215     assert(C.array == correct);
216 }
217 
218 /// Copy 64-bit integer `a` to `dst`.
219 long _mm_cvtm64_si64 (__m64 a) pure @safe
220 {
221     return a[0];
222 }
223 
224 /// Copy 32-bit integer `a` to the lower elements of `dst`, and zero the upper element of `dst`.
225 __m64 _mm_cvtsi32_si64 (int a) pure @safe
226 {
227     __m64 r = void;
228     r[0] = a;
229     return r;
230 }
231 unittest
232 {
233     __m64 R = _mm_cvtsi32_si64(-1);
234     assert(R[0] == -1);
235 }
236 
237 /// Copy 64-bit integer `a` to `dst`.
238 __m64 _mm_cvtsi64_m64 (long a) pure @safe
239 {
240     __m64 r = void;
241     r[0] = a;
242     return r;
243 }
244 unittest
245 {
246     __m64 R = _mm_cvtsi64_m64(-1);
247     assert(R[0] == -1);
248 }
249 
250 /// Copy the lower 32-bit integer in `a` to `dst`.
251 int _mm_cvtsi64_si32 (__m64 a) pure @safe
252 {
253     int2 r = cast(int2)a;
254     return r[0];
255 }
256 
/// Empty the MMX state. This is a no-op in intel-intrinsics; see the note at the top of this module.
alias _m_empty = _mm_empty;

/// Empty the MMX state. This is a no-op in intel-intrinsics; see the note at the top of this module.
void _mm_empty() pure @safe
{
    // Intentionally does nothing; see the note at the top of this module.
}
263 
alias _m_from_int = _mm_cvtsi32_si64;
alias _m_from_int64 = _mm_cvtsi64_m64;
266 
/// Multiply packed 16-bit integers in `a` and `b`, producing intermediate 32-bit integers,
/// and horizontally add adjacent pairs of intermediate results.
__m64 _mm_madd_pi16 (__m64 a, __m64 b) pure @safe
268 {
269     return to_m64(_mm_madd_epi16(to_m128i(a), to_m128i(b)));
270 }
271 unittest
272 {
273     short4 A = [-32768, -32768, 32767, 32767];
274     short4 B = [-32768, -32768, 32767, 32767];
275     int2 R = cast(int2) _mm_madd_pi16(cast(__m64)A, cast(__m64)B);
276     int[2] correct = [-2147483648, 2*32767*32767];
277     assert(R.array == correct);
278 }
279 
/// Multiply packed 16-bit integers in `a` and `b`, producing intermediate 32-bit integers,
/// and return the high 16 bits of each intermediate integer.
__m64 _mm_mulhi_pi16 (__m64 a, __m64 b) pure @safe
281 {
282     return to_m64(_mm_mulhi_epi16(to_m128i(a), to_m128i(b)));
283 }
284 unittest
285 {
286     __m64 A = _mm_setr_pi16(4, 8, -16, 7);
287     __m64 B = _mm_set1_pi16(16384);
288     short4 R = cast(short4)_mm_mulhi_pi16(A, B);
289     short[4] correct = [1, 2, -4, 1];
290     assert(R.array == correct);
291 }
292 
/// Multiply packed 16-bit integers in `a` and `b`, producing intermediate 32-bit integers,
/// and return the low 16 bits of each intermediate integer.
__m64 _mm_mullo_pi16 (__m64 a, __m64 b) pure @safe
294 {
295     return to_m64(_mm_mullo_epi16(to_m128i(a), to_m128i(b)));
296 }
297 unittest
298 {
299     __m64 A = _mm_setr_pi16(4, 1, 16, 7);
300     __m64 B = _mm_set1_pi16(16384);
301     short4 R = cast(short4)_mm_mullo_pi16(A, B);
302     short[4] correct = [0, 16384, 0, -16384];
303     assert(R.array == correct);
304 }
305 
/// Compute the bitwise OR of 64 bits (representing integer data) in `a` and `b`.
__m64 _mm_or_si64 (__m64 a, __m64 b) pure @safe
307 {
308     return a | b;
309 }
310 
/// Convert packed signed 16-bit integers from `a` and `b` to packed 8-bit integers using signed saturation.
__m64 _mm_packs_pi16 (__m64 a, __m64 b) pure @safe
312 {
313     int4 p = cast(int4) _mm_packs_epi16(to_m128i(a), to_m128i(b));
314     int2 r;
315     r[0] = p[0];
316     r[1] = p[2];
317     return cast(__m64)r;
318 }
319 unittest
320 {
321     __m64 A = _mm_setr_pi16(256, -129, 254, 0);
322     byte8 R = cast(byte8) _mm_packs_pi16(A, A);
323     byte[8] correct = [127, -128, 127, 0, 127, -128, 127, 0];
324     assert(R.array == correct);
325 }
326 
/// Convert packed signed 32-bit integers from `a` and `b` to packed 16-bit integers using signed saturation.
__m64 _mm_packs_pi32 (__m64 a, __m64 b) pure @safe
328 {
329     int4 p = cast(int4) _mm_packs_epi32(to_m128i(a), to_m128i(b));
330     int2 r;
331     r[0] = p[0];
332     r[1] = p[2];
333     return cast(__m64)r;
334 }
335 unittest
336 {
337     __m64 A = _mm_setr_pi32(100000, -100000);
338     short4 R = cast(short4) _mm_packs_pi32(A, A);
339     short[4] correct = [32767, -32768, 32767, -32768];
340     assert(R.array == correct);
341 }
342 
/// Convert packed signed 16-bit integers from `a` and `b` to packed 8-bit integers using unsigned saturation.
__m64 _mm_packs_pu16 (__m64 a, __m64 b) pure @safe
344 {
345     int4 p = cast(int4) _mm_packus_epi16(to_m128i(a), to_m128i(b));
346     int2 r;
347     r[0] = p[0];
348     r[1] = p[2];
349     return cast(__m64)r;
350 }
351 unittest
352 {
353     __m64 A = _mm_setr_pi16(256, -129, 254, 0);
354     byte8 R = cast(byte8) _mm_packs_pu16(A, A);
355     ubyte[8] correct = [255, 0, 254, 0, 255, 0, 254, 0];
356     assert(R.array == cast(byte[8])correct);
357 }
358 
359 deprecated alias
360     _m_packssdw = _mm_packs_pi32,
361     _m_packsswb = _mm_packs_pi16,
362     _m_packuswb = _mm_packs_pu16,
363     _m_paddb = _mm_add_pi8,
364     _m_paddd = _mm_add_pi32,
365     _m_paddsb = _mm_adds_pi8,
366     _m_paddsw = _mm_adds_pi16,
367     _m_paddusb = _mm_adds_pu8,
368     _m_paddusw = _mm_adds_pu16,
369     _m_paddw = _mm_add_pi16,
370     _m_pand = _mm_and_si64,
371     _m_pandn = _mm_andnot_si64,
372     _m_pcmpeqb = _mm_cmpeq_pi8,
373     _m_pcmpeqd = _mm_cmpeq_pi32,
374     _m_pcmpeqw = _mm_cmpeq_pi16,
375     _m_pcmpgtb = _mm_cmpgt_pi8,
376     _m_pcmpgtd = _mm_cmpgt_pi32,
377     _m_pcmpgtw = _mm_cmpgt_pi16,
378     _m_pmaddwd = _mm_madd_pi16,
379     _m_pmulhw = _mm_mulhi_pi16,
380     _m_pmullw = _mm_mullo_pi16,
381     _m_por = _mm_or_si64,
382     _m_pslld = _mm_sll_pi32,
383     _m_pslldi = _mm_slli_pi32,
384     _m_psllq = _mm_sll_si64,
385     _m_psllqi = _mm_slli_si64,
386     _m_psllw = _mm_sll_pi16,
387     _m_psllwi = _mm_slli_pi16,
388     _m_psrad = _mm_sra_pi32,
389     _m_psradi = _mm_srai_pi32,
390     _m_psraw = _mm_sra_pi16,
391     _m_psrawi = _mm_srai_pi16,
392     _m_psrld = _mm_srl_pi32,
393     _m_psrldi = _mm_srli_pi32,
394     _m_psrlq = _mm_srl_si64,
395     _m_psrlqi = _mm_srli_si64,
396     _m_psrlw = _mm_srl_pi16,
397     _m_psrlwi = _mm_srli_pi16,
398     _m_psubb = _mm_sub_pi8,
399     _m_psubd = _mm_sub_pi32,
400     _m_psubsb = _mm_subs_pi8,
401     _m_psubsw = _mm_subs_pi16,
402     _m_psubusb = _mm_subs_pu8,
403     _m_psubusw = _mm_subs_pu16,
404     _m_psubw = _mm_sub_pi16,
405     _m_punpckhbw = _mm_unpackhi_pi8,
406     _m_punpckhdq = _mm_unpackhi_pi32,
407     _m_punpckhwd = _mm_unpackhi_pi16,
408     _m_punpcklbw = _mm_unpacklo_pi8,
409     _m_punpckldq = _mm_unpacklo_pi32,
410     _m_punpcklwd = _mm_unpacklo_pi16,
411     _m_pxor = _mm_xor_si64;
412 
/// Set packed 16-bit integers with the supplied values.
__m64 _mm_set_pi16 (short e3, short e2, short e1, short e0) pure @trusted
414 {
415     short[4] arr = [e0, e1, e2, e3];
416     return *cast(__m64*)(arr.ptr);
417 }
418 unittest
419 {
420     short4 R = cast(short4) _mm_set_pi16(3, 2, 1, 0);
421     short[4] correct = [0, 1, 2, 3];
422     assert(R.array == correct);
423 }
424 
/// Set packed 32-bit integers with the supplied values.
__m64 _mm_set_pi32 (int e1, int e0) pure @trusted
426 {
427     int[2] arr = [e0, e1];
428     return *cast(__m64*)(arr.ptr);
429 }
430 unittest
431 {
432     int2 R = cast(int2) _mm_set_pi32(1, 0);
433     int[2] correct = [0, 1];
434     assert(R.array == correct);
435 }
436 
/// Set packed 8-bit integers with the supplied values.
__m64 _mm_set_pi8 (byte e7, byte e6, byte e5, byte e4, byte e3, byte e2, byte e1, byte e0) pure @trusted
438 {
439     byte[8] arr = [e0, e1, e2, e3, e4, e5, e6, e7];
440     return *cast(__m64*)(arr.ptr);
441 }
442 unittest
443 {
444     byte8 R = cast(byte8) _mm_set_pi8(7, 6, 5, 4, 3, 2, 1, 0);
445     byte[8] correct = [0, 1, 2, 3, 4, 5, 6, 7];
446     assert(R.array == correct);
447 }
448 
/// Broadcast 16-bit integer `a` to all elements.
__m64 _mm_set1_pi16 (short a) pure @trusted
450 {
451     return cast(__m64)(short4(a));
452 }
453 unittest
454 {
455     short4 R = cast(short4) _mm_set1_pi16(44);
456     short[4] correct = [44, 44, 44, 44];
457     assert(R.array == correct);
458 }
459 
/// Broadcast 32-bit integer `a` to all elements.
__m64 _mm_set1_pi32 (int a) pure @trusted
461 {
462     return cast(__m64)(int2(a));
463 }
464 unittest
465 {
466     int2 R = cast(int2) _mm_set1_pi32(43);
467     int[2] correct = [43, 43];
468     assert(R.array == correct);
469 }
470 
/// Broadcast 8-bit integer `a` to all elements.
__m64 _mm_set1_pi8 (byte a) pure @trusted
472 {
473     return cast(__m64)(byte8(a));
474 }
475 unittest
476 {
477     byte8 R = cast(byte8) _mm_set1_pi8(42);
478     byte[8] correct = [42, 42, 42, 42, 42, 42, 42, 42];
479     assert(R.array == correct);
480 }
481 
/// Set packed 16-bit integers with the supplied values in reverse order.
__m64 _mm_setr_pi16 (short e3, short e2, short e1, short e0) pure @trusted
483 {
484     short[4] arr = [e3, e2, e1, e0];
485     return *cast(__m64*)(arr.ptr);
486 }
487 unittest
488 {
489     short4 R = cast(short4) _mm_setr_pi16(0, 1, 2, 3);
490     short[4] correct = [0, 1, 2, 3];
491     assert(R.array == correct);
492 }
493 
/// Set packed 32-bit integers with the supplied values in reverse order.
__m64 _mm_setr_pi32 (int e1, int e0) pure @trusted
495 {
496     int[2] arr = [e1, e0];
497     return *cast(__m64*)(arr.ptr);
498 }
499 unittest
500 {
501     int2 R = cast(int2) _mm_setr_pi32(0, 1);
502     int[2] correct = [0, 1];
503     assert(R.array == correct);
504 }
505 
/// Set packed 8-bit integers with the supplied values in reverse order.
__m64 _mm_setr_pi8 (byte e7, byte e6, byte e5, byte e4, byte e3, byte e2, byte e1, byte e0) pure @trusted
507 {
508     byte[8] arr = [e7, e6, e5, e4, e3, e2, e1, e0];
509     return *cast(__m64*)(arr.ptr);
510 }
511 unittest
512 {
513     byte8 R = cast(byte8) _mm_setr_pi8(0, 1, 2, 3, 4, 5, 6, 7);
514     byte[8] correct = [0, 1, 2, 3, 4, 5, 6, 7];
515     assert(R.array == correct);
516 }
517 
/// Return vector of type `__m64` with all elements set to zero.
__m64 _mm_setzero_si64 () pure @trusted
519 {
520     __m64 r;
521     r[0] = 0;
522     return r;
523 }
524 unittest
525 {
526     __m64 R = _mm_setzero_si64();
527     assert(R[0] == 0);
528 }
529 
/// Shift packed 16-bit integers in `a` left by `count` while shifting in zeros.
__m64 _mm_sll_pi16 (__m64 a, __m64 count) pure @safe
531 {
532     return to_m64(_mm_sll_epi16(to_m128i(a), to_m128i(count)));
533 }
534 
/// Shift packed 32-bit integers in `a` left by `count` while shifting in zeros.
__m64 _mm_sll_pi32 (__m64 a, __m64 count) pure @safe
536 {
537     return to_m64(_mm_sll_epi32(to_m128i(a), to_m128i(count)));
538 }
539 
/// Shift 64-bit integer `a` left by `count` while shifting in zeros.
__m64 _mm_sll_si64 (__m64 a, __m64 count) pure @safe
541 {
542     return to_m64(_mm_sll_epi64(to_m128i(a), to_m128i(count)));
543 }
544 
/// Shift packed 16-bit integers in `a` left by `imm8` while shifting in zeros.
__m64 _mm_slli_pi16 (__m64 a, int imm8) pure @safe
546 {
547     return to_m64(_mm_slli_epi16(to_m128i(a), imm8));
548 }
549 
/// Shift packed 32-bit integers in `a` left by `imm8` while shifting in zeros.
__m64 _mm_slli_pi32 (__m64 a, int imm8) pure @safe
551 {
552     return to_m64(_mm_slli_epi32(to_m128i(a), imm8));
553 }
554 
/// Shift 64-bit integer `a` left by `imm8` while shifting in zeros.
__m64 _mm_slli_si64 (__m64 a, int imm8) pure @safe
556 {
557     return to_m64(_mm_slli_epi64(to_m128i(a), imm8));
558 }
559 
/// Shift packed 16-bit integers in `a` right by `count` while shifting in sign bits.
__m64 _mm_sra_pi16 (__m64 a, __m64 count) pure @safe
561 {
562     return to_m64(_mm_sra_epi16(to_m128i(a), to_m128i(count)));
563 }
564 
/// Shift packed 32-bit integers in `a` right by `count` while shifting in sign bits.
__m64 _mm_sra_pi32 (__m64 a, __m64 count) pure @safe
566 {
567     return to_m64(_mm_sra_epi32(to_m128i(a), to_m128i(count)));
568 }
569 
/// Shift packed 16-bit integers in `a` right by `imm8` while shifting in sign bits.
__m64 _mm_srai_pi16 (__m64 a, int imm8) pure @safe
571 {
572     return to_m64(_mm_srai_epi16(to_m128i(a), imm8));
573 }
574 
/// Shift packed 32-bit integers in `a` right by `imm8` while shifting in sign bits.
__m64 _mm_srai_pi32 (__m64 a, int imm8) pure @safe
576 {
577     return to_m64(_mm_srai_epi32(to_m128i(a), imm8));
578 }
579 
/// Shift packed 16-bit integers in `a` right by `count` while shifting in zeros.
__m64 _mm_srl_pi16 (__m64 a, __m64 count) pure @safe
581 {
582     return to_m64(_mm_srl_epi16(to_m128i(a), to_m128i(count)));
583 }
584 
/// Shift packed 32-bit integers in `a` right by `count` while shifting in zeros.
__m64 _mm_srl_pi32 (__m64 a, __m64 count) pure @safe
586 {
587     return to_m64(_mm_srl_epi32(to_m128i(a), to_m128i(count)));
588 }
589 
/// Shift 64-bit integer `a` right by `count` while shifting in zeros.
__m64 _mm_srl_si64 (__m64 a, __m64 count) pure @safe
591 {
592     return to_m64(_mm_srl_epi64(to_m128i(a), to_m128i(count)));
593 }
594 
/// Shift packed 16-bit integers in `a` right by `imm8` while shifting in zeros.
__m64 _mm_srli_pi16 (__m64 a, int imm8) pure @safe
596 {
597     return to_m64(_mm_srli_epi16(to_m128i(a), imm8));
598 }
599 
/// Shift packed 32-bit integers in `a` right by `imm8` while shifting in zeros.
__m64 _mm_srli_pi32 (__m64 a, int imm8) pure @safe
601 {
602     return to_m64(_mm_srli_epi32(to_m128i(a), imm8));
603 }
604 
/// Shift 64-bit integer `a` right by `imm8` while shifting in zeros.
__m64 _mm_srli_si64 (__m64 a, int imm8) pure @safe
606 {
607     return to_m64(_mm_srli_epi64(to_m128i(a), imm8));
608 }
609 
/// Subtract packed 16-bit integers in `b` from packed 16-bit integers in `a`.
__m64 _mm_sub_pi16 (__m64 a, __m64 b) pure @safe
611 {
612     return cast(__m64)(cast(short4)a - cast(short4)b);
613 }
614 
/// Subtract packed 32-bit integers in `b` from packed 32-bit integers in `a`.
__m64 _mm_sub_pi32 (__m64 a, __m64 b) pure @safe
616 {
617     return cast(__m64)(cast(int2)a - cast(int2)b);
618 }
619 
/// Subtract packed 8-bit integers in `b` from packed 8-bit integers in `a`.
__m64 _mm_sub_pi8 (__m64 a, __m64 b) pure @safe
621 {
622     return cast(__m64)(cast(byte8)a - cast(byte8)b);
623 }
624 
/// Subtract packed 16-bit integers in `b` from packed 16-bit integers in `a` using signed saturation.
__m64 _mm_subs_pi16 (__m64 a, __m64 b) pure @safe
626 {
627     return to_m64(_mm_subs_epi16(to_m128i(a), to_m128i(b)));
628 }
629 
/// Subtract packed 8-bit integers in `b` from packed 8-bit integers in `a` using signed saturation.
__m64 _mm_subs_pi8 (__m64 a, __m64 b) pure @safe
631 {
632     return to_m64(_mm_subs_epi8(to_m128i(a), to_m128i(b)));
633 }
634 
/// Subtract packed 16-bit integers in `b` from packed 16-bit integers in `a` using unsigned saturation.
__m64 _mm_subs_pu16 (__m64 a, __m64 b) pure @safe
636 {
637     return to_m64(_mm_subs_epu16(to_m128i(a), to_m128i(b)));
638 }
639 
/// Subtract packed 8-bit integers in `b` from packed 8-bit integers in `a` using unsigned saturation.
__m64 _mm_subs_pu8 (__m64 a, __m64 b) pure @safe
641 {
642     return to_m64(_mm_subs_epu8(to_m128i(a), to_m128i(b)));
643 }
644 
645 deprecated alias _m_to_int = _mm_cvtsi64_si32;
646 deprecated alias _m_to_int64 = _mm_cvtm64_si64;
647 
/// Unpack and interleave 16-bit integers from the high half of `a` and `b`.
__m64 _mm_unpackhi_pi16 (__m64 a, __m64 b) pure @safe
649 {
650     return cast(__m64) shufflevector!(short4, 2, 6, 3, 7)(cast(short4)a, cast(short4)b);
651 }
652 
/// Unpack and interleave 32-bit integers from the high half of `a` and `b`.
__m64 _mm_unpackhi_pi32 (__m64 a, __m64 b) pure @safe
654 {
655     return cast(__m64) shufflevector!(int2, 1, 3)(cast(int2)a, cast(int2)b);
656 }
657 
/// Unpack and interleave 8-bit integers from the high half of `a` and `b`.
__m64 _mm_unpackhi_pi8 (__m64 a, __m64 b) pure @safe
659 {
660     return cast(__m64) shufflevector!(byte8, 4, 12, 5, 13, 6, 14, 7, 15)(cast(byte8)a, cast(byte8)b);
661 }
662 
/// Unpack and interleave 16-bit integers from the low half of `a` and `b`.
__m64 _mm_unpacklo_pi16 (__m64 a, __m64 b) pure @safe
664 {
665     return cast(__m64) shufflevector!(short4, 0, 4, 1, 5)(cast(short4)a, cast(short4)b);
666 }
667 
/// Unpack and interleave 32-bit integers from the low half of `a` and `b`.
__m64 _mm_unpacklo_pi32 (__m64 a, __m64 b) pure @safe
669 {
670     return cast(__m64) shufflevector!(int2, 0, 2)(cast(int2)a, cast(int2)b);
671 }
672 
/// Unpack and interleave 8-bit integers from the low half of `a` and `b`.
__m64 _mm_unpacklo_pi8 (__m64 a, __m64 b) pure @safe
674 {
675     return cast(__m64) shufflevector!(byte8, 0, 8, 1, 9, 2, 10, 3, 11)(cast(byte8)a, cast(byte8)b);
676 }
677 
/// Compute the bitwise XOR of 64 bits (representing integer data) in `a` and `b`.
__m64 _mm_xor_si64 (__m64 a, __m64 b) pure @safe
679 {
680     return a ^ b;
681 }
682