1 /**
2 * Copyright: Copyright Auburn Sounds 2019.
3 * License:   $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
4 * Authors:   Guillaume Piolat
5 */
6 module inteli.mmx;
7 
8 public import inteli.types;
9 import inteli.internals;
10 
11 nothrow @nogc:
12 
// Important: you don't need to call _mm_empty when using the "MMX" intrinsics of
// intel-intrinsics. They are lowered to portable IR, and any clean-up of the FPU
// registers is left to the code generator. intel-intrinsics only guarantees the
// semantics of each intrinsic, not the instructions it maps to (see the example
// unittest after `_mm_empty` below).
16 
17 
18 /// Add packed 16-bit integers in `a` and `b`.
__m64 _mm_add_pi16 (__m64 a, __m64 b) pure @safe
20 {
21     return cast(__m64)(cast(short4)a + cast(short4)b);
22 }
23 unittest
24 {
25     short4 R = cast(short4) _mm_add_pi16(_mm_set1_pi16(4), _mm_set1_pi16(3));
26     short[4] correct = [7, 7, 7, 7];
27     assert(R.array == correct);
28 }
29 
30 /// Add packed 32-bit integers in `a` and `b`.
__m64 _mm_add_pi32 (__m64 a, __m64 b) pure @safe
32 {
33     return cast(__m64)(cast(int2)a + cast(int2)b);
34 }
35 unittest
36 {
37     int2 R = cast(int2) _mm_add_pi32(_mm_set1_pi32(4), _mm_set1_pi32(3));
38     int[2] correct = [7, 7];
39     assert(R.array == correct);
40 }
41 
42 /// Add packed 8-bit integers in `a` and `b`.
__m64 _mm_add_pi8 (__m64 a, __m64 b) pure @safe
44 {
45     return cast(__m64)(cast(byte8)a + cast(byte8)b);
46 }
47 unittest
48 {
49     byte8 R = cast(byte8) _mm_add_pi8(_mm_set1_pi8(127), _mm_set1_pi8(-128));
50     byte[8] correct = [-1, -1, -1, -1, -1, -1, -1, -1];
51     assert(R.array == correct);
52 }
53 
54 /// Add packed 16-bit integers in `a` and `b` using signed saturation.
55 // PERF: PADDSW not generated
56 __m64 _mm_adds_pi16(__m64 a, __m64 b) pure @trusted
57 {
58     short[4] res;
59     short4 sa = cast(short4)a;
60     short4 sb = cast(short4)b;
61     foreach(i; 0..4)
62         res[i] = saturateSignedIntToSignedShort(sa.array[i] + sb.array[i]);
63     return *cast(__m64*)(res.ptr);
64 }
65 unittest
66 {
67     short4 res = cast(short4) _mm_adds_pi16(_mm_set_pi16(3, 2, 1, 0),
68                                             _mm_set_pi16(3, 2, 1, 0));
69     static immutable short[4] correctResult = [0, 2, 4, 6];
70     assert(res.array == correctResult);
71 }
72 
73 /// Add packed 8-bit integers in `a` and `b` using signed saturation.
74 // PERF: PADDSB not generated
75 __m64 _mm_adds_pi8(__m64 a, __m64 b) pure @trusted
76 {
77     byte[8] res;
78     byte8 sa = cast(byte8)a;
79     byte8 sb = cast(byte8)b;
80     foreach(i; 0..8)
81         res[i] = saturateSignedWordToSignedByte(sa.array[i] + sb.array[i]);
82     return *cast(__m64*)(res.ptr);
83 }
84 unittest
85 {
86     byte8 res = cast(byte8) _mm_adds_pi8(_mm_set_pi8(7, 6, 5, 4, 3, 2, 1, 0),
87                                          _mm_set_pi8(7, 6, 5, 4, 3, 2, 1, 0));
88     static immutable byte[8] correctResult = [0, 2, 4, 6, 8, 10, 12, 14];
89     assert(res.array == correctResult);
90 }
91 
92 /// Add packed 16-bit integers in `a` and `b` using unsigned saturation.
93 // PERF: PADDUSW not generated
94 __m64 _mm_adds_pu16(__m64 a, __m64 b) pure @trusted
95 {
96     ushort[4] res;
97     short4 sa = cast(short4)a;
98     short4 sb = cast(short4)b;
99     foreach(i; 0..4)
100         res[i] = saturateSignedIntToUnsignedShort(cast(ushort)(sa.array[i]) + cast(ushort)(sb.array[i]));
101     return *cast(__m64*)(res.ptr);
102 }
103 unittest
104 {
105     short4 res = cast(short4) _mm_adds_pu16(_mm_set_pi16(3, 2, cast(short)65535, 0),
106                                             _mm_set_pi16(3, 2, 1, 0));
107     static immutable short[4] correctResult = [0, cast(short)65535, 4, 6];
108     assert(res.array == correctResult);
109 }
110 
111 /// Add packed 8-bit integers in `a` and `b` using unsigned saturation.
112 // PERF: PADDUSB not generated
113 __m64 _mm_adds_pu8(__m64 a, __m64 b) pure @trusted
114 {
115     byte[8] res;
116     byte8 sa = cast(byte8)a;
117     byte8 sb = cast(byte8)b;
118     foreach(i; 0..8)
119         res[i] = saturateSignedWordToUnsignedByte(cast(ubyte)(sa.array[i]) + cast(ubyte)(sb.array[i]));
120     return *cast(__m64*)(res.ptr);
121 }
122 unittest
123 {
124     byte8 res = cast(byte8) _mm_adds_pu8(_mm_set_pi8(7, 6, 5, 4, 3, 2, cast(byte)255, 0),
125                                          _mm_set_pi8(7, 6, 5, 4, 3, 2, 1, 0));
126     static immutable byte[8] correctResult = [0, cast(byte)255, 4, 6, 8, 10, 12, 14];
127     assert(res.array == correctResult);
128 }
129 
130 /// Compute the bitwise AND of 64 bits (representing integer data) in `a` and `b`.
131 __m64 _mm_and_si64 (__m64 a, __m64 b) pure @safe
132 {
133     return a & b;
134 }
135 unittest
136 {
137     __m64 A = [7];
138     __m64 B = [14];
139     __m64 R = _mm_and_si64(A, B);
140     assert(R[0] == 6);
141 }
142 
143 /// Compute the bitwise NOT of 64 bits (representing integer data) in `a` and then AND with `b`.
__m64 _mm_andnot_si64 (__m64 a, __m64 b) pure @safe
145 {
146     return (~a) & b;
147 }
148 unittest
149 {
150     __m64 A = [7];
151     __m64 B = [14];
152     __m64 R = _mm_andnot_si64(A, B);
153     assert(R[0] == 8);
154 }
155 
156 
/// Compare packed 16-bit integers in `a` and `b` for equality.
__m64 _mm_cmpeq_pi16 (__m64 a, __m64 b) pure @safe
158 {
159     return cast(__m64) equalMask!short4(cast(short4)a, cast(short4)b);
160 }
161 unittest
162 {
163     short4   A = [-3, -2, -1,  0];
164     short4   B = [ 4,  3,  2,  1];
165     short[4] E = [ 0,  0,  0,  0];
166     short4   R = cast(short4)(_mm_cmpeq_pi16(cast(__m64)A, cast(__m64)B));
167     assert(R.array == E);
168 }
169 
/// Compare packed 32-bit integers in `a` and `b` for equality.
__m64 _mm_cmpeq_pi32 (__m64 a, __m64 b) pure @safe
171 {
172     return cast(__m64) equalMask!int2(cast(int2)a, cast(int2)b);
173 }
174 unittest
175 {
176     int2   A = [-3, -2];
177     int2   B = [ 4, -2];
178     int[2] E = [ 0, -1];
179     int2   R = cast(int2)(_mm_cmpeq_pi32(cast(__m64)A, cast(__m64)B));
180     assert(R.array == E);
181 }
182 
/// Compare packed 8-bit integers in `a` and `b` for equality.
__m64 _mm_cmpeq_pi8 (__m64 a, __m64 b) pure @safe
184 {
185     return cast(__m64) equalMask!byte8(cast(byte8)a, cast(byte8)b);
186 }
187 unittest
188 {
189     __m64 A = _mm_setr_pi8(1, 2, 3, 1, 2, 1, 1, 2);
190     __m64 B = _mm_setr_pi8(2, 2, 1, 2, 3, 1, 2, 3);
191     byte8 C = cast(byte8) _mm_cmpeq_pi8(A, B);
192     byte[8] correct =     [0,-1, 0, 0, 0,-1, 0, 0];
193     assert(C.array == correct);
194 }
195 
/// Compare packed signed 16-bit integers in `a` and `b` for greater-than.
__m64 _mm_cmpgt_pi16 (__m64 a, __m64 b) pure @safe
197 {
198     return cast(__m64) greaterMask!short4(cast(short4)a, cast(short4)b);
199 }
200 unittest
201 {
202     short4   A = [-3, -2, -1,  0];
203     short4   B = [ 4,  3,  2,  1];
204     short[4] E = [ 0,  0,  0,  0];
205     short4   R = cast(short4)(_mm_cmpgt_pi16(cast(__m64)A, cast(__m64)B));
206     assert(R.array == E);
207 }
208 
/// Compare packed signed 32-bit integers in `a` and `b` for greater-than.
__m64 _mm_cmpgt_pi32 (__m64 a, __m64 b) pure @safe
210 {
211     return cast(__m64) greaterMask!int2(cast(int2)a, cast(int2)b);
212 }
213 unittest
214 {
215     int2   A = [-3,  2];
216     int2   B = [ 4, -2];
217     int[2] E = [ 0, -1];
218     int2   R = cast(int2)(_mm_cmpgt_pi32(cast(__m64)A, cast(__m64)B));
219     assert(R.array == E);
220 }
221 
/// Compare packed signed 8-bit integers in `a` and `b` for greater-than.
__m64 _mm_cmpgt_pi8 (__m64 a, __m64 b) pure @safe
223 {
224     return cast(__m64) greaterMask!byte8(cast(byte8)a, cast(byte8)b);
225 }
226 unittest
227 {
228     __m64 A = _mm_setr_pi8(1, 2, 3, 1, 2, 1, 1, 2);
229     __m64 B = _mm_setr_pi8(2, 2, 1, 2, 3, 1, 2, 3);
230     byte8 C = cast(byte8) _mm_cmpgt_pi8(A, B);
231     byte[8] correct =     [0, 0,-1, 0, 0, 0, 0, 0];
232     assert(C.array == correct);
233 }
234 
235 /// Copy 64-bit integer `a` to `dst`.
236 long _mm_cvtm64_si64 (__m64 a) pure @safe
237 {
238     return a[0];
239 }
240 
241 /// Copy 32-bit integer `a` to the lower elements of `dst`, and zero the upper element of `dst`.
242 __m64 _mm_cvtsi32_si64 (int a) pure @safe
243 {
244     __m64 r = void;
245     r[0] = a;
246     return r;
247 }
248 unittest
249 {
250     __m64 R = _mm_cvtsi32_si64(-1);
251     assert(R[0] == -1);
252 }
253 
254 /// Copy 64-bit integer `a` to `dst`.
255 __m64 _mm_cvtsi64_m64 (long a) pure @safe
256 {
257     __m64 r = void;
258     r[0] = a;
259     return r;
260 }
261 unittest
262 {
263     __m64 R = _mm_cvtsi64_m64(-1);
264     assert(R[0] == -1);
265 }
266 
267 /// Copy the lower 32-bit integer in `a` to `dst`.
268 int _mm_cvtsi64_si32 (__m64 a) pure @safe
269 {
270     return cast(int)a[0];
271 }
272 
273 alias _m_empty = _mm_empty;
274 
/// Empty the MMX state. With intel-intrinsics this does nothing: any MMX
/// state clean-up is handled by the code generator (see the note at the top
/// of this module).
void _mm_empty() pure @safe
276 {
277     // do nothing, see comment on top of file
278 }
279 
alias _m_from_int = _mm_cvtsi32_si64;
281 alias _m_from_int64 = _mm_cvtsi64_m64;
282 
283 /+
pmaddwd
__m64 _mm_madd_pi16 (__m64 a, __m64 b) TODO
285 pmulhw
286 __m64 _mm_mulhi_pi16 (__m64 a, __m64 b) TODO
287 pmullw
288 __m64 _mm_mullo_pi16 (__m64 a, __m64 b) TODO
289 por
290 __m64 _mm_or_si64 (__m64 a, __m64 b) TODO
291 packsswb
292 __m64 _mm_packs_pi16 (__m64 a, __m64 b) TODO
293 packssdw
294 __m64 _mm_packs_pi32 (__m64 a, __m64 b) TODO
295 packuswb
296 __m64 _mm_packs_pu16 (__m64 a, __m64 b) TODO
297 packssdw
298 __m64 _m_packssdw (__m64 a, __m64 b) TODO
299 packsswb
300 __m64 _m_packsswb (__m64 a, __m64 b) TODO
301 packuswb
302 __m64 _m_packuswb (__m64 a, __m64 b) TODO
303 +/
304 
305 
306 deprecated alias _m_paddb = _mm_add_pi8;
307 deprecated alias _m_paddd = _mm_add_pi32;
308 
309 /+
310 paddsb
311 __m64 _m_paddsb (__m64 a, __m64 b) TODO
312 paddsw
313 __m64 _m_paddsw (__m64 a, __m64 b) TODO
314 paddusb
315 __m64 _m_paddusb (__m64 a, __m64 b) TODO
316 paddusw
317 __m64 _m_paddusw (__m64 a, __m64 b) TODO
318 +/
319 
320 deprecated alias _m_paddw = _mm_add_pi16;
321 deprecated alias _m_pand = _mm_and_si64;
322 deprecated alias _m_pandn = _mm_andnot_si64;
323 
324 /+
325 pcmpeqb
326 __m64 _m_pcmpeqb (__m64 a, __m64 b) TODO
327 pcmpeqd
328 __m64 _m_pcmpeqd (__m64 a, __m64 b) TODO
329 pcmpeqw
330 __m64 _m_pcmpeqw (__m64 a, __m64 b) TODO
331 pcmpgtb
332 __m64 _m_pcmpgtb (__m64 a, __m64 b) TODO
333 pcmpgtd
334 __m64 _m_pcmpgtd (__m64 a, __m64 b) TODO
335 pcmpgtw
336 __m64 _m_pcmpgtw (__m64 a, __m64 b) TODO
337 pmaddwd
338 __m64 _m_pmaddwd (__m64 a, __m64 b) TODO
339 pmulhw
340 __m64 _m_pmulhw (__m64 a, __m64 b) TODO
341 pmullw
342 __m64 _m_pmullw (__m64 a, __m64 b) TODO
343 por
344 __m64 _m_por (__m64 a, __m64 b) TODO
345 pslld
346 __m64 _m_pslld (__m64 a, __m64 count) TODO
347 pslld
348 __m64 _m_pslldi (__m64 a, int imm8) TODO
349 psllq
350 __m64 _m_psllq (__m64 a, __m64 count) TODO
351 psllq
352 __m64 _m_psllqi (__m64 a, int imm8) TODO
353 psllw
354 __m64 _m_psllw (__m64 a, __m64 count) TODO
355 psllw
356 __m64 _m_psllwi (__m64 a, int imm8) TODO
357 psrad
358 __m64 _m_psrad (__m64 a, __m64 count) TODO
359 psrad
360 __m64 _m_psradi (__m64 a, int imm8) TODO
361 psraw
362 __m64 _m_psraw (__m64 a, __m64 count) TODO
363 psraw
364 __m64 _m_psrawi (__m64 a, int imm8) TODO
365 psrld
366 __m64 _m_psrld (__m64 a, __m64 count) TODO
367 psrld
368 __m64 _m_psrldi (__m64 a, int imm8) TODO
369 psrlq
370 __m64 _m_psrlq (__m64 a, __m64 count) TODO
371 psrlq
372 __m64 _m_psrlqi (__m64 a, int imm8) TODO
373 psrlw
374 __m64 _m_psrlw (__m64 a, __m64 count) TODO
375 psrlw
376 __m64 _m_psrlwi (__m64 a, int imm8) TODO
377 psubb
378 __m64 _m_psubb (__m64 a, __m64 b) TODO
379 psubd
380 __m64 _m_psubd (__m64 a, __m64 b) TODO
381 psubsb
382 __m64 _m_psubsb (__m64 a, __m64 b) TODO
383 psubsw
384 __m64 _m_psubsw (__m64 a, __m64 b) TODO
385 psubusb
386 __m64 _m_psubusb (__m64 a, __m64 b) TODO
387 psubusw
388 __m64 _m_psubusw (__m64 a, __m64 b) TODO
389 psubw
390 __m64 _m_psubw (__m64 a, __m64 b) TODO
391 punpckhbw
392 __m64 _m_punpckhbw (__m64 a, __m64 b) TODO
393 punpckhdq
394 __m64 _m_punpckhdq (__m64 a, __m64 b) TODO
punpckhwd
396 __m64 _m_punpckhwd (__m64 a, __m64 b) TODO
397 punpcklbw
398 __m64 _m_punpcklbw (__m64 a, __m64 b) TODO
399 punpckldq
400 __m64 _m_punpckldq (__m64 a, __m64 b) TODO
401 punpcklwd
402 __m64 _m_punpcklwd (__m64 a, __m64 b) TODO
403 pxor
404 __m64 _m_pxor (__m64 a, __m64 b) TODO
405 +/
406 
/// Set packed 16-bit integers with the supplied values.
__m64 _mm_set_pi16 (short e3, short e2, short e1, short e0) pure @trusted
{
409     short[4] arr = [e0, e1, e2, e3];
410     return *cast(__m64*)(arr.ptr);
411 }
412 unittest
413 {
414     short4 R = cast(short4) _mm_set_pi16(3, 2, 1, 0);
415     short[4] correct = [0, 1, 2, 3];
416     assert(R.array == correct);
417 }
418 
/// Set packed 32-bit integers with the supplied values.
__m64 _mm_set_pi32 (int e1, int e0) pure @trusted
{
421     int[2] arr = [e0, e1];
422     return *cast(__m64*)(arr.ptr);
423 }
424 unittest
425 {
426     int2 R = cast(int2) _mm_set_pi32(1, 0);
427     int[2] correct = [0, 1];
428     assert(R.array == correct);
429 }
430 
/// Set packed 8-bit integers with the supplied values.
__m64 _mm_set_pi8 (byte e7, byte e6, byte e5, byte e4, byte e3, byte e2, byte e1, byte e0) pure @trusted
{
433     byte[8] arr = [e0, e1, e2, e3, e4, e5, e6, e7];
434     return *cast(__m64*)(arr.ptr);
435 }
436 unittest
437 {
438     byte8 R = cast(byte8) _mm_set_pi8(7, 6, 5, 4, 3, 2, 1, 0);
439     byte[8] correct = [0, 1, 2, 3, 4, 5, 6, 7];
440     assert(R.array == correct);
441 }
442 
/// Broadcast 16-bit integer `a` to all elements.
__m64 _mm_set1_pi16 (short a) pure @trusted
444 {
445     short[4] arr = [a, a, a, a];
446     return *cast(__m64*)(arr.ptr);
447 }
448 unittest
449 {
450     short4 R = cast(short4) _mm_set1_pi16(44);
451     short[4] correct = [44, 44, 44, 44];
452     assert(R.array == correct);
453 }
454 
/// Broadcast 32-bit integer `a` to all elements.
__m64 _mm_set1_pi32 (int a) pure @trusted
456 {
457     int[2] arr = [a, a];
458     return *cast(__m64*)(arr.ptr);
459 }
460 unittest
461 {
462     int2 R = cast(int2) _mm_set1_pi32(43);
463     int[2] correct = [43, 43];
464     assert(R.array == correct);
465 }
466 
/// Broadcast 8-bit integer `a` to all elements.
__m64 _mm_set1_pi8 (byte a) pure @trusted
468 {
469     byte[8] arr = [a, a, a, a, a, a, a, a];
470     return *cast(__m64*)(arr.ptr);
471 }
472 unittest
473 {
474     byte8 R = cast(byte8) _mm_set1_pi8(42);
475     byte[8] correct = [42, 42, 42, 42, 42, 42, 42, 42];
476     assert(R.array == correct);
477 }
478 
/// Set packed 16-bit integers with the supplied values in reverse order.
__m64 _mm_setr_pi16 (short e3, short e2, short e1, short e0) pure @trusted
{
481     short[4] arr = [e3, e2, e1, e0];
482     return *cast(__m64*)(arr.ptr);
483 }
484 unittest
485 {
486     short4 R = cast(short4) _mm_setr_pi16(0, 1, 2, 3);
487     short[4] correct = [0, 1, 2, 3];
488     assert(R.array == correct);
489 }
490 
/// Set packed 32-bit integers with the supplied values in reverse order.
__m64 _mm_setr_pi32 (int e1, int e0) pure @trusted
{
493     int[2] arr = [e1, e0];
494     return *cast(__m64*)(arr.ptr);
495 }
496 unittest
497 {
498     int2 R = cast(int2) _mm_setr_pi32(0, 1);
499     int[2] correct = [0, 1];
500     assert(R.array == correct);
501 }
502 
/// Set packed 8-bit integers with the supplied values in reverse order.
__m64 _mm_setr_pi8 (byte e7, byte e6, byte e5, byte e4, byte e3, byte e2, byte e1, byte e0) pure @trusted
{
505     byte[8] arr = [e7, e6, e5, e4, e3, e2, e1, e0];
506     return *cast(__m64*)(arr.ptr);
507 }
508 unittest
509 {
510     byte8 R = cast(byte8) _mm_setr_pi8(0, 1, 2, 3, 4, 5, 6, 7);
511     byte[8] correct = [0, 1, 2, 3, 4, 5, 6, 7];
512     assert(R.array == correct);
513 }
514 
/// Return vector of type `__m64` with all elements set to zero.
__m64 _mm_setzero_si64 () pure @trusted
516 {
517     __m64 r;
518     r[0] = 0;
519     return r;
520 }
521 unittest
522 {
523     __m64 R = _mm_setzero_si64();
524     assert(R[0] == 0);
525 }
526 
527 
528 /+
psllw
__m64 _mm_sll_pi16 (__m64 a, __m64 count) TODO
530 pslld
531 __m64 _mm_sll_pi32 (__m64 a, __m64 count) TODO
532 psllq
533 __m64 _mm_sll_si64 (__m64 a, __m64 count) TODO
534 psllw
535 __m64 _mm_slli_pi16 (__m64 a, int imm8) TODO
536 pslld
537 __m64 _mm_slli_pi32 (__m64 a, int imm8) TODO
538 psllq
539 __m64 _mm_slli_si64 (__m64 a, int imm8) TODO
psraw
541 __m64 _mm_sra_pi16 (__m64 a, __m64 count) TODO
542 psrad
543 __m64 _mm_sra_pi32 (__m64 a, __m64 count) TODO
544 psraw
545 __m64 _mm_srai_pi16 (__m64 a, int imm8) TODO
546 psrad
547 __m64 _mm_srai_pi32 (__m64 a, int imm8) TODO
548 psrlw
549 __m64 _mm_srl_pi16 (__m64 a, __m64 count) TODO
550 psrld
551 __m64 _mm_srl_pi32 (__m64 a, __m64 count) TODO
552 psrlq
553 __m64 _mm_srl_si64 (__m64 a, __m64 count) TODO
554 psrlw
555 __m64 _mm_srli_pi16 (__m64 a, int imm8) TODO
556 psrld
557 __m64 _mm_srli_pi32 (__m64 a, int imm8) TODO
558 psrlq
559 __m64 _mm_srli_si64 (__m64 a, int imm8) TODO
560 psubw
561 __m64 _mm_sub_pi16 (__m64 a, __m64 b) TODO
562 psubd
563 __m64 _mm_sub_pi32 (__m64 a, __m64 b) TODO
564 psubb
565 __m64 _mm_sub_pi8 (__m64 a, __m64 b) TODO
566 psubsw
567 __m64 _mm_subs_pi16 (__m64 a, __m64 b) TODO
568 psubsb
569 __m64 _mm_subs_pi8 (__m64 a, __m64 b) TODO
570 psubusw
571 __m64 _mm_subs_pu16 (__m64 a, __m64 b) TODO
572 psubusb
573 __m64 _mm_subs_pu8 (__m64 a, __m64 b) TODO
574 +/
575 
576 deprecated alias _m_to_int = _mm_cvtsi64_si32;
577 deprecated alias _m_to_int64 = _mm_cvtm64_si64;
578 
579 /+
punpckhwd
581 __m64 _mm_unpackhi_pi16 (__m64 a, __m64 b) TODO
582 punpckhdq
583 __m64 _mm_unpackhi_pi32 (__m64 a, __m64 b) TODO
584 punpckhbw
585 __m64 _mm_unpackhi_pi8 (__m64 a, __m64 b) TODO
586 punpcklwd
587 __m64 _mm_unpacklo_pi16 (__m64 a, __m64 b) TODO
588 punpckldq
589 __m64 _mm_unpacklo_pi32 (__m64 a, __m64 b) TODO
590 punpcklbw
591 __m64 _mm_unpacklo_pi8 (__m64 a, __m64 b) TODO
592 pxor
593 __m64 _mm_xor_si64 (__m64 a, __m64 b) TODO
594 
595 +/
596 
597 
598 /+
599 #define _m_packsswb _mm_packs_pi16
600 #define _m_packssdw _mm_packs_pi32
601 #define _m_packuswb _mm_packs_pu16
602 #define _m_punpckhbw _mm_unpackhi_pi8
603 #define _m_punpckhwd _mm_unpackhi_pi16
604 #define _m_punpckhdq _mm_unpackhi_pi32
605 #define _m_punpcklbw _mm_unpacklo_pi8
606 #define _m_punpcklwd _mm_unpacklo_pi16
607 #define _m_punpckldq _mm_unpacklo_pi32
608 
609 #define _m_paddsb _mm_adds_pi8
610 #define _m_paddsw _mm_adds_pi16
611 #define _m_paddusb _mm_adds_pu8
612 #define _m_paddusw _mm_adds_pu16
613 #define _m_psubb _mm_sub_pi8
614 #define _m_psubw _mm_sub_pi16
615 #define _m_psubd _mm_sub_pi32
616 #define _m_psubsb _mm_subs_pi8
617 #define _m_psubsw _mm_subs_pi16
618 #define _m_psubusb _mm_subs_pu8
619 #define _m_psubusw _mm_subs_pu16
620 #define _m_pmaddwd _mm_madd_pi16
621 #define _m_pmulhw _mm_mulhi_pi16
622 #define _m_pmullw _mm_mullo_pi16
623 #define _m_psllw _mm_sll_pi16
624 #define _m_psllwi _mm_slli_pi16
625 #define _m_pslld _mm_sll_pi32
626 #define _m_pslldi _mm_slli_pi32
627 #define _m_psllq _mm_sll_si64
628 #define _m_psllqi _mm_slli_si64
629 #define _m_psraw _mm_sra_pi16
630 #define _m_psrawi _mm_srai_pi16
631 #define _m_psrad _mm_sra_pi32
632 #define _m_psradi _mm_srai_pi32
633 #define _m_psrlw _mm_srl_pi16
634 #define _m_psrlwi _mm_srli_pi16
635 #define _m_psrld _mm_srl_pi32
636 #define _m_psrldi _mm_srli_pi32
637 #define _m_psrlq _mm_srl_si64
638 #define _m_psrlqi _mm_srli_si64
639 #define _m_por _mm_or_si64
640 #define _m_pxor _mm_xor_si64
641 #define _m_pcmpeqb _mm_cmpeq_pi8
642 #define _m_pcmpeqw _mm_cmpeq_pi16
643 #define _m_pcmpeqd _mm_cmpeq_pi32
644 #define _m_pcmpgtb _mm_cmpgt_pi8
645 #define _m_pcmpgtw _mm_cmpgt_pi16
646 #define _m_pcmpgtd _mm_cmpgt_pi32
647 +/