/**
* Copyright: Copyright Auburn Sounds 2019.
* License:   $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
* Authors:   Guillaume Piolat
*/
module inteli.mmx;

public import inteli.types;
import inteli.internals;

import inteli.xmmintrin;
import inteli.emmintrin;

nothrow @nogc:

// Important: you don't need to call _mm_empty when using "MMX" capabilities of intel-intrinsics,
// since it just generates the right IR and cleaning-up FPU registers is up to the codegen.
// intel-intrinsics is just semantics.


/// Add packed 16-bit integers in `a` and `b`.
__m64 _mm_add_pi16 (__m64 a, __m64 b) pure @safe
{
    return cast(__m64)(cast(short4)a + cast(short4)b);
}
unittest
{
    short4 R = cast(short4) _mm_add_pi16(_mm_set1_pi16(4), _mm_set1_pi16(3));
    short[4] correct = [7, 7, 7, 7];
    assert(R.array == correct);
}

/// Add packed 32-bit integers in `a` and `b`.
__m64 _mm_add_pi32 (__m64 a, __m64 b) pure @safe
{
    return cast(__m64)(cast(int2)a + cast(int2)b);
}
unittest
{
    int2 R = cast(int2) _mm_add_pi32(_mm_set1_pi32(4), _mm_set1_pi32(3));
    int[2] correct = [7, 7];
    assert(R.array == correct);
}

/// Add packed 8-bit integers in `a` and `b`.
__m64 _mm_add_pi8 (__m64 a, __m64 b) pure @safe
{
    return cast(__m64)(cast(byte8)a + cast(byte8)b);
}
unittest
{
    byte8 R = cast(byte8) _mm_add_pi8(_mm_set1_pi8(127), _mm_set1_pi8(-128));
    byte[8] correct = [-1, -1, -1, -1, -1, -1, -1, -1];
    assert(R.array == correct);
}

/// Add packed 16-bit integers in `a` and `b` using signed saturation.
// PERF: PADDSW not generated
__m64 _mm_adds_pi16(__m64 a, __m64 b) pure @trusted
{
    __m128i wideA = to_m128i(a);
    __m128i wideB = to_m128i(b);
    return to_m64(_mm_adds_epi16(wideA, wideB));
}
unittest
{
    __m64 x = _mm_set_pi16(3, 2, 1, 0);
    short4 sum = cast(short4) _mm_adds_pi16(x, x);
    static immutable short[4] expected = [0, 2, 4, 6];
    assert(sum.array == expected);
}

/// Add packed 8-bit integers in `a` and `b` using signed saturation.
// PERF: PADDSB not generated
__m64 _mm_adds_pi8(__m64 a, __m64 b) pure @trusted
{
    __m128i wideA = to_m128i(a);
    __m128i wideB = to_m128i(b);
    return to_m64(_mm_adds_epi8(wideA, wideB));
}
unittest
{
    __m64 x = _mm_set_pi8(7, 6, 5, 4, 3, 2, 1, 0);
    byte8 sum = cast(byte8) _mm_adds_pi8(x, x);
    static immutable byte[8] expected = [0, 2, 4, 6, 8, 10, 12, 14];
    assert(sum.array == expected);
}

/// Add packed 16-bit integers in `a` and `b` using unsigned saturation.
// PERF: PADDUSW not generated
__m64 _mm_adds_pu16(__m64 a, __m64 b) pure @trusted
{
    __m128i wideA = to_m128i(a);
    __m128i wideB = to_m128i(b);
    return to_m64(_mm_adds_epu16(wideA, wideB));
}
unittest
{
    __m64 x = _mm_set_pi16(3, 2, cast(short)65535, 0);
    __m64 y = _mm_set_pi16(3, 2, 1, 0);
    short4 sum = cast(short4) _mm_adds_pu16(x, y);
    static immutable short[4] expected = [0, cast(short)65535, 4, 6];
    assert(sum.array == expected);
}

/// Add packed 8-bit integers in `a` and `b` using unsigned saturation.
// PERF: PADDUSB not generated
__m64 _mm_adds_pu8(__m64 a, __m64 b) pure @trusted
{
    __m128i wideA = to_m128i(a);
    __m128i wideB = to_m128i(b);
    return to_m64(_mm_adds_epu8(wideA, wideB));
}
unittest
{
    __m64 x = _mm_set_pi8(7, 6, 5, 4, 3, 2, cast(byte)255, 0);
    __m64 y = _mm_set_pi8(7, 6, 5, 4, 3, 2, 1, 0);
    byte8 sum = cast(byte8) _mm_adds_pu8(x, y);
    static immutable byte[8] expected = [0, cast(byte)255, 4, 6, 8, 10, 12, 14];
    assert(sum.array == expected);
}

/// Compute the bitwise AND of 64 bits (representing integer data) in `a` and `b`.
__m64 _mm_and_si64 (__m64 a, __m64 b) pure @safe
{
    return a & b;
}
unittest
{
    __m64 A = [7];
    __m64 B = [14];
    __m64 R = _mm_and_si64(A, B);
    assert(R[0] == 6);
}

/// Compute the bitwise NOT of 64 bits (representing integer data) in `a` and then AND with `b`.
__m64 _mm_andnot_si64 (__m64 a, __m64 b) pure @safe
{
    return (~a) & b;
}
unittest
{
    __m64 A = [7];
    __m64 B = [14];
    __m64 R = _mm_andnot_si64(A, B);
    assert(R[0] == 8);
}

/// Compare packed 16-bit integers in `a` and `b` for equality.
__m64 _mm_cmpeq_pi16 (__m64 a, __m64 b) pure @safe
{
    return cast(__m64) equalMask!short4(cast(short4)a, cast(short4)b);
}
unittest
{
    short4 A = [-3, -2, -1, 0];
    short4 B = [ 4, 3, 2, 1];
    short[4] E = [ 0, 0, 0, 0];
    short4 R = cast(short4)(_mm_cmpeq_pi16(cast(__m64)A, cast(__m64)B));
    assert(R.array == E);
}

/// Compare packed 32-bit integers in `a` and `b` for equality.
__m64 _mm_cmpeq_pi32 (__m64 a, __m64 b) pure @safe
{
    return cast(__m64) equalMask!int2(cast(int2)a, cast(int2)b);
}
unittest
{
    int2 A = [-3, -2];
    int2 B = [ 4, -2];
    int[2] E = [ 0, -1];
    int2 R = cast(int2)(_mm_cmpeq_pi32(cast(__m64)A, cast(__m64)B));
    assert(R.array == E);
}

/// Compare packed 8-bit integers in `a` and `b` for equality.
__m64 _mm_cmpeq_pi8 (__m64 a, __m64 b) pure @safe
{
    return cast(__m64) equalMask!byte8(cast(byte8)a, cast(byte8)b);
}
unittest
{
    __m64 A = _mm_setr_pi8(1, 2, 3, 1, 2, 1, 1, 2);
    __m64 B = _mm_setr_pi8(2, 2, 1, 2, 3, 1, 2, 3);
    byte8 C = cast(byte8) _mm_cmpeq_pi8(A, B);
    byte[8] correct = [0,-1, 0, 0, 0,-1, 0, 0];
    assert(C.array == correct);
}

/// Compare packed signed 16-bit integers in `a` and `b` for greater-than.
__m64 _mm_cmpgt_pi16 (__m64 a, __m64 b) pure @safe
{
    return cast(__m64) greaterMask!short4(cast(short4)a, cast(short4)b);
}
unittest
{
    short4 A = [-3, -2, -1, 0];
    short4 B = [ 4, 3, 2, 1];
    short[4] E = [ 0, 0, 0, 0];
    short4 R = cast(short4)(_mm_cmpgt_pi16(cast(__m64)A, cast(__m64)B));
    assert(R.array == E);
}

/// Compare packed signed 32-bit integers in `a` and `b` for greater-than.
__m64 _mm_cmpgt_pi32 (__m64 a, __m64 b) pure @safe
{
    return cast(__m64) greaterMask!int2(cast(int2)a, cast(int2)b);
}
unittest
{
    int2 A = [-3, 2];
    int2 B = [ 4, -2];
    int[2] E = [ 0, -1];
    int2 R = cast(int2)(_mm_cmpgt_pi32(cast(__m64)A, cast(__m64)B));
    assert(R.array == E);
}

/// Compare packed signed 8-bit integers in `a` and `b` for greater-than.
__m64 _mm_cmpgt_pi8 (__m64 a, __m64 b) pure @safe
{
    return cast(__m64) greaterMask!byte8(cast(byte8)a, cast(byte8)b);
}
unittest
{
    __m64 A = _mm_setr_pi8(1, 2, 3, 1, 2, 1, 1, 2);
    __m64 B = _mm_setr_pi8(2, 2, 1, 2, 3, 1, 2, 3);
    byte8 C = cast(byte8) _mm_cmpgt_pi8(A, B);
    byte[8] correct = [0, 0,-1, 0, 0, 0, 0, 0];
    assert(C.array == correct);
}

/// Copy 64-bit integer `a` to `dst`.
long _mm_cvtm64_si64 (__m64 a) pure @safe
{
    return a[0];
}

/// Copy 32-bit integer `a` to the lower elements of `dst`, and zero the upper element of `dst`.
__m64 _mm_cvtsi32_si64 (int a) pure @safe
{
    __m64 r = void;
    // NOTE(review): assigning an `int` into the single 64-bit lane sign-extends `a` into
    // bits 63:32, whereas the Intel pseudo-code for this intrinsic (MOVD) zeroes them.
    // The unittest below pins the sign-extending behavior — confirm which semantic is intended.
    r[0] = a;
    return r;
}
unittest
{
    __m64 R = _mm_cvtsi32_si64(-1);
    assert(R[0] == -1);
}

/// Copy 64-bit integer `a` to `dst`.
__m64 _mm_cvtsi64_m64 (long a) pure @safe
{
    __m64 r = void;
    r[0] = a;
    return r;
}
unittest
{
    __m64 R = _mm_cvtsi64_m64(-1);
    assert(R[0] == -1);
}

/// Copy the lower 32-bit integer in `a` to `dst`.
int _mm_cvtsi64_si32 (__m64 a) pure @safe
{
    int2 r = cast(int2)a;
    return r[0];
}

alias _m_empty = _mm_empty;

/// Empty the MMX state. Does nothing here: see the comment on top of file.
void _mm_empty() pure @safe
{
    // do nothing, see comment on top of file
}

alias _m_from_int =  _mm_cvtsi32_si64;
alias _m_from_int64 = _mm_cvtsi64_m64;

/// Multiply packed signed 16-bit integers in `a` and `b`, producing intermediate signed
/// 32-bit integers. Horizontally add adjacent pairs of intermediate 32-bit integers.
__m64 _mm_madd_pi16 (__m64 a, __m64 b) pure @safe
{
    return to_m64(_mm_madd_epi16(to_m128i(a), to_m128i(b)));
}
unittest
{
    short4 A = [-32768, -32768, 32767, 32767];
    short4 B = [-32768, -32768, 32767, 32767];
    int2 R = cast(int2) _mm_madd_pi16(cast(__m64)A, cast(__m64)B);
    int[2] correct = [-2147483648, 2*32767*32767];
    assert(R.array == correct);
}

/// Multiply the packed signed 16-bit integers in `a` and `b`, producing intermediate 32-bit
/// integers, and store the high 16 bits of the intermediate integers.
__m64 _mm_mulhi_pi16 (__m64 a, __m64 b) pure @safe
{
    return to_m64(_mm_mulhi_epi16(to_m128i(a), to_m128i(b)));
}
unittest
{
    __m64 A = _mm_setr_pi16(4, 8, -16, 7);
    __m64 B = _mm_set1_pi16(16384);
    short4 R = cast(short4)_mm_mulhi_pi16(A, B);
    short[4] correct = [1, 2, -4, 1];
    assert(R.array == correct);
}

/// Multiply the packed 16-bit integers in `a` and `b`, producing intermediate 32-bit
/// integers, and store the low 16 bits of the intermediate integers.
__m64 _mm_mullo_pi16 (__m64 a, __m64 b) pure @safe
{
    return to_m64(_mm_mullo_epi16(to_m128i(a), to_m128i(b)));
}
unittest
{
    __m64 A = _mm_setr_pi16(4, 1, 16, 7);
    __m64 B = _mm_set1_pi16(16384);
    short4 R = cast(short4)_mm_mullo_pi16(A, B);
    short[4] correct = [0, 16384, 0, -16384];
    assert(R.array == correct);
}

/// Compute the bitwise OR of 64 bits (representing integer data) in `a` and `b`.
__m64 _mm_or_si64 (__m64 a, __m64 b) pure @safe
{
    return a | b;
}

/// Convert packed signed 16-bit integers from `a` and `b` to packed 8-bit integers
/// using signed saturation.
__m64 _mm_packs_pi16 (__m64 a, __m64 b) pure @safe
{
    int4 p = cast(int4) _mm_packs_epi16(to_m128i(a), to_m128i(b));
    int2 r;
    r[0] = p[0];
    r[1] = p[2];
    return cast(__m64)r;
}
unittest
{
    __m64 A = _mm_setr_pi16(256, -129, 254, 0);
    byte8 R = cast(byte8) _mm_packs_pi16(A, A);
    byte[8] correct = [127, -128, 127, 0, 127, -128, 127, 0];
    assert(R.array == correct);
}

/// Convert packed signed 32-bit integers from `a` and `b` to packed 16-bit integers
/// using signed saturation.
__m64 _mm_packs_pi32 (__m64 a, __m64 b) pure @safe
{
    int4 p = cast(int4) _mm_packs_epi32(to_m128i(a), to_m128i(b));
    int2 r;
    r[0] = p[0];
    r[1] = p[2];
    return cast(__m64)r;
}
unittest
{
    __m64 A = _mm_setr_pi32(100000, -100000);
    short4 R = cast(short4) _mm_packs_pi32(A, A);
    short[4] correct = [32767, -32768, 32767, -32768];
    assert(R.array == correct);
}

/// Convert packed signed 16-bit integers from `a` and `b` to packed 8-bit integers
/// using unsigned saturation.
__m64 _mm_packs_pu16 (__m64 a, __m64 b) pure @safe
{
    int4 p = cast(int4) _mm_packus_epi16(to_m128i(a), to_m128i(b));
    int2 r;
    r[0] = p[0];
    r[1] = p[2];
    return cast(__m64)r;
}
unittest
{
    __m64 A = _mm_setr_pi16(256, -129, 254, 0);
    byte8 R = cast(byte8) _mm_packs_pu16(A, A);
    ubyte[8] correct = [255, 0, 254, 0, 255, 0, 254, 0];
    assert(R.array == cast(byte[8])correct);
}

deprecated alias
    _m_packssdw = _mm_packs_pi32,
    _m_packsswb = _mm_packs_pi16,
    _m_packuswb = _mm_packs_pu16,
    _m_paddb = _mm_add_pi8,
    _m_paddd = _mm_add_pi32,
    _m_paddsb = _mm_adds_pi8,
    _m_paddsw = _mm_adds_pi16,
    _m_paddusb = _mm_adds_pu8,
    _m_paddusw = _mm_adds_pu16,
    _m_paddw = _mm_add_pi16,
    _m_pand = _mm_and_si64,
    _m_pandn = _mm_andnot_si64,
    _m_pcmpeqb = _mm_cmpeq_pi8,
    _m_pcmpeqd = _mm_cmpeq_pi32,
    _m_pcmpeqw = _mm_cmpeq_pi16,
    _m_pcmpgtb = _mm_cmpgt_pi8,
    _m_pcmpgtd = _mm_cmpgt_pi32,
    _m_pcmpgtw = _mm_cmpgt_pi16,
    _m_pmaddwd = _mm_madd_pi16,
    _m_pmulhw = _mm_mulhi_pi16,
    _m_pmullw = _mm_mullo_pi16,
    _m_por = _mm_or_si64,
    _m_pslld = _mm_sll_pi32,
    _m_pslldi = _mm_slli_pi32,
    _m_psllq = _mm_sll_si64,
    _m_psllqi = _mm_slli_si64,
    _m_psllw = _mm_sll_pi16,
    _m_psllwi = _mm_slli_pi16,
    _m_psrad = _mm_sra_pi32,
    _m_psradi = _mm_srai_pi32,
    _m_psraw = _mm_sra_pi16,
    _m_psrawi = _mm_srai_pi16,
    _m_psrld = _mm_srl_pi32,
    _m_psrldi = _mm_srli_pi32,
    _m_psrlq = _mm_srl_si64,
    _m_psrlqi = _mm_srli_si64,
    _m_psrlw = _mm_srl_pi16,
    _m_psrlwi = _mm_srli_pi16,
    _m_psubb = _mm_sub_pi8,
    _m_psubd = _mm_sub_pi32,
    _m_psubsb = _mm_subs_pi8,
    _m_psubsw = _mm_subs_pi16,
    _m_psubusb = _mm_subs_pu8,
    _m_psubusw = _mm_subs_pu16,
    _m_psubw = _mm_sub_pi16,
    _m_punpckhbw = _mm_unpackhi_pi8,
    _m_punpckhdq = _mm_unpackhi_pi32,
    _m_punpckhwd = _mm_unpackhi_pi16,
    _m_punpcklbw = _mm_unpacklo_pi8,
    _m_punpckldq = _mm_unpacklo_pi32,
    _m_punpcklwd = _mm_unpacklo_pi16,
    _m_pxor = _mm_xor_si64;

/// Set packed 16-bit integers with the supplied values (`e0` is the lowest element).
__m64 _mm_set_pi16 (short e3, short e2, short e1, short e0) pure @trusted
{
    short[4] arr = [e0, e1, e2, e3];
    return *cast(__m64*)(arr.ptr);
}
unittest
{
    short4 R = cast(short4) _mm_set_pi16(3, 2, 1, 0);
    short[4] correct = [0, 1, 2, 3];
    assert(R.array == correct);
}

/// Set packed 32-bit integers with the supplied values (`e0` is the lowest element).
__m64 _mm_set_pi32 (int e1, int e0) pure @trusted
{
    int[2] arr = [e0, e1];
    return *cast(__m64*)(arr.ptr);
}
unittest
{
    int2 R = cast(int2) _mm_set_pi32(1, 0);
    int[2] correct = [0, 1];
    assert(R.array == correct);
}

/// Set packed 8-bit integers with the supplied values (`e0` is the lowest element).
__m64 _mm_set_pi8 (byte e7, byte e6, byte e5, byte e4, byte e3, byte e2, byte e1, byte e0) pure @trusted
{
    byte[8] arr = [e0, e1, e2, e3, e4, e5, e6, e7];
    return *cast(__m64*)(arr.ptr);
}
unittest
{
    byte8 R = cast(byte8) _mm_set_pi8(7, 6, 5, 4, 3, 2, 1, 0);
    byte[8] correct = [0, 1, 2, 3, 4, 5, 6, 7];
    assert(R.array == correct);
}

/// Broadcast 16-bit integer `a` to all elements.
__m64 _mm_set1_pi16 (short a) pure @trusted
{
    return cast(__m64)(short4(a));
}
unittest
{
    short4 R = cast(short4) _mm_set1_pi16(44);
    short[4] correct = [44, 44, 44, 44];
    assert(R.array == correct);
}

/// Broadcast 32-bit integer `a` to all elements.
__m64 _mm_set1_pi32 (int a) pure @trusted
{
    return cast(__m64)(int2(a));
}
unittest
{
    int2 R = cast(int2) _mm_set1_pi32(43);
    int[2] correct = [43, 43];
    assert(R.array == correct);
}

/// Broadcast 8-bit integer `a` to all elements.
__m64 _mm_set1_pi8 (byte a) pure @trusted
{
    return cast(__m64)(byte8(a));
}
unittest
{
    byte8 R = cast(byte8) _mm_set1_pi8(42);
    byte[8] correct = [42, 42, 42, 42, 42, 42, 42, 42];
    assert(R.array == correct);
}

/// Set packed 16-bit integers with the supplied values in reverse order
/// (first argument is the lowest element).
__m64 _mm_setr_pi16 (short e3, short e2, short e1, short e0) pure @trusted
{
    short[4] arr = [e3, e2, e1, e0];
    return *cast(__m64*)(arr.ptr);
}
unittest
{
    short4 R = cast(short4) _mm_setr_pi16(0, 1, 2, 3);
    short[4] correct = [0, 1, 2, 3];
    assert(R.array == correct);
}

/// Set packed 32-bit integers with the supplied values in reverse order
/// (first argument is the lowest element).
__m64 _mm_setr_pi32 (int e1, int e0) pure @trusted
{
    int[2] arr = [e1, e0];
    return *cast(__m64*)(arr.ptr);
}
unittest
{
    int2 R = cast(int2) _mm_setr_pi32(0, 1);
    int[2] correct = [0, 1];
    assert(R.array == correct);
}

/// Set packed 8-bit integers with the supplied values in reverse order
/// (first argument is the lowest element).
__m64 _mm_setr_pi8 (byte e7, byte e6, byte e5, byte e4, byte e3, byte e2, byte e1, byte e0) pure @trusted
{
    byte[8] arr = [e7, e6, e5, e4, e3, e2, e1, e0];
    return *cast(__m64*)(arr.ptr);
}
unittest
{
    byte8 R = cast(byte8) _mm_setr_pi8(0, 1, 2, 3, 4, 5, 6, 7);
    byte[8] correct = [0, 1, 2, 3, 4, 5, 6, 7];
    assert(R.array == correct);
}

/// Return vector of type `__m64` with all elements set to zero.
__m64 _mm_setzero_si64 () pure @trusted
{
    __m64 r;
    r[0] = 0;
    return r;
}
unittest
{
    __m64 R = _mm_setzero_si64();
    assert(R[0] == 0);
}

/// Shift packed 16-bit integers in `a` left by `count` while shifting in zeros.
__m64 _mm_sll_pi16 (__m64 a, __m64 count) pure @safe
{
    return to_m64(_mm_sll_epi16(to_m128i(a), to_m128i(count)));
}

/// Shift packed 32-bit integers in `a` left by `count` while shifting in zeros.
__m64 _mm_sll_pi32 (__m64 a, __m64 count) pure @safe
{
    return to_m64(_mm_sll_epi32(to_m128i(a), to_m128i(count)));
}

/// Shift 64-bit integer `a` left by `count` while shifting in zeros.
__m64 _mm_sll_si64 (__m64 a, __m64 count) pure @safe
{
    return to_m64(_mm_sll_epi64(to_m128i(a), to_m128i(count)));
}

/// Shift packed 16-bit integers in `a` left by `imm8` while shifting in zeros.
__m64 _mm_slli_pi16 (__m64 a, int imm8) pure @safe
{
    return to_m64(_mm_slli_epi16(to_m128i(a), imm8));
}

/// Shift packed 32-bit integers in `a` left by `imm8` while shifting in zeros.
__m64 _mm_slli_pi32 (__m64 a, int imm8) pure @safe
{
    return to_m64(_mm_slli_epi32(to_m128i(a), imm8));
}

/// Shift 64-bit integer `a` left by `imm8` while shifting in zeros.
__m64 _mm_slli_si64 (__m64 a, int imm8) pure @safe
{
    return to_m64(_mm_slli_epi64(to_m128i(a), imm8));
}

/// Shift packed 16-bit integers in `a` right by `count` while shifting in sign bits.
__m64 _mm_sra_pi16 (__m64 a, __m64 count) pure @safe
{
    return to_m64(_mm_sra_epi16(to_m128i(a), to_m128i(count)));
}

/// Shift packed 32-bit integers in `a` right by `count` while shifting in sign bits.
__m64 _mm_sra_pi32 (__m64 a, __m64 count) pure @safe
{
    return to_m64(_mm_sra_epi32(to_m128i(a), to_m128i(count)));
}

/// Shift packed 16-bit integers in `a` right by `imm8` while shifting in sign bits.
__m64 _mm_srai_pi16 (__m64 a, int imm8) pure @safe
{
    return to_m64(_mm_srai_epi16(to_m128i(a), imm8));
}

/// Shift packed 32-bit integers in `a` right by `imm8` while shifting in sign bits.
__m64 _mm_srai_pi32 (__m64 a, int imm8) pure @safe
{
    return to_m64(_mm_srai_epi32(to_m128i(a), imm8));
}

/// Shift packed 16-bit integers in `a` right by `count` while shifting in zeros.
__m64 _mm_srl_pi16 (__m64 a, __m64 count) pure @safe
{
    return to_m64(_mm_srl_epi16(to_m128i(a), to_m128i(count)));
}

/// Shift packed 32-bit integers in `a` right by `count` while shifting in zeros.
__m64 _mm_srl_pi32 (__m64 a, __m64 count) pure @safe
{
    return to_m64(_mm_srl_epi32(to_m128i(a), to_m128i(count)));
}

/// Shift 64-bit integer `a` right by `count` while shifting in zeros.
__m64 _mm_srl_si64 (__m64 a, __m64 count) pure @safe
{
    return to_m64(_mm_srl_epi64(to_m128i(a), to_m128i(count)));
}

/// Shift packed 16-bit integers in `a` right by `imm8` while shifting in zeros.
__m64 _mm_srli_pi16 (__m64 a, int imm8) pure @safe
{
    return to_m64(_mm_srli_epi16(to_m128i(a), imm8));
}

/// Shift packed 32-bit integers in `a` right by `imm8` while shifting in zeros.
__m64 _mm_srli_pi32 (__m64 a, int imm8) pure @safe
{
    return to_m64(_mm_srli_epi32(to_m128i(a), imm8));
}

/// Shift 64-bit integer `a` right by `imm8` while shifting in zeros.
__m64 _mm_srli_si64 (__m64 a, int imm8) pure @safe
{
    return to_m64(_mm_srli_epi64(to_m128i(a), imm8));
}

/// Subtract packed 16-bit integers in `b` from packed 16-bit integers in `a`.
__m64 _mm_sub_pi16 (__m64 a, __m64 b) pure @safe
{
    return cast(__m64)(cast(short4)a - cast(short4)b);
}

/// Subtract packed 32-bit integers in `b` from packed 32-bit integers in `a`.
__m64 _mm_sub_pi32 (__m64 a, __m64 b) pure @safe
{
    return cast(__m64)(cast(int2)a - cast(int2)b);
}

/// Subtract packed 8-bit integers in `b` from packed 8-bit integers in `a`.
__m64 _mm_sub_pi8 (__m64 a, __m64 b) pure @safe
{
    return cast(__m64)(cast(byte8)a - cast(byte8)b);
}

/// Subtract packed 16-bit integers in `b` from packed 16-bit integers in `a`
/// using signed saturation.
__m64 _mm_subs_pi16 (__m64 a, __m64 b) pure @safe
{
    return to_m64(_mm_subs_epi16(to_m128i(a), to_m128i(b)));
}

/// Subtract packed 8-bit integers in `b` from packed 8-bit integers in `a`
/// using signed saturation.
__m64 _mm_subs_pi8 (__m64 a, __m64 b) pure @safe
{
    return to_m64(_mm_subs_epi8(to_m128i(a), to_m128i(b)));
}

/// Subtract packed unsigned 16-bit integers in `b` from packed unsigned 16-bit
/// integers in `a` using unsigned saturation.
__m64 _mm_subs_pu16 (__m64 a, __m64 b) pure @safe
{
    return to_m64(_mm_subs_epu16(to_m128i(a), to_m128i(b)));
}

/// Subtract packed unsigned 8-bit integers in `b` from packed unsigned 8-bit
/// integers in `a` using unsigned saturation.
__m64 _mm_subs_pu8 (__m64 a, __m64 b) pure @safe
{
    return to_m64(_mm_subs_epu8(to_m128i(a), to_m128i(b)));
}

deprecated alias _m_to_int = _mm_cvtsi64_si32;
deprecated alias _m_to_int64 = _mm_cvtm64_si64;

/// Unpack and interleave 16-bit integers from the high half of `a` and `b`.
__m64 _mm_unpackhi_pi16 (__m64 a, __m64 b) pure @safe
{
    return cast(__m64) shufflevector!(short4, 2, 6, 3, 7)(cast(short4)a, cast(short4)b);
}

/// Unpack and interleave 32-bit integers from the high half of `a` and `b`.
__m64 _mm_unpackhi_pi32 (__m64 a, __m64 b) pure @safe
{
    return cast(__m64) shufflevector!(int2, 1, 3)(cast(int2)a, cast(int2)b);
}

/// Unpack and interleave 8-bit integers from the high half of `a` and `b`.
__m64 _mm_unpackhi_pi8 (__m64 a, __m64 b) pure @safe
{
    return cast(__m64) shufflevector!(byte8, 4, 12, 5, 13, 6, 14, 7, 15)(cast(byte8)a, cast(byte8)b);
}

/// Unpack and interleave 16-bit integers from the low half of `a` and `b`.
__m64 _mm_unpacklo_pi16 (__m64 a, __m64 b) pure @safe
{
    return cast(__m64) shufflevector!(short4, 0, 4, 1, 5)(cast(short4)a, cast(short4)b);
}

/// Unpack and interleave 32-bit integers from the low half of `a` and `b`.
__m64 _mm_unpacklo_pi32 (__m64 a, __m64 b) pure @safe
{
    return cast(__m64) shufflevector!(int2, 0, 2)(cast(int2)a, cast(int2)b);
}

/// Unpack and interleave 8-bit integers from the low half of `a` and `b`.
__m64 _mm_unpacklo_pi8 (__m64 a, __m64 b) pure @safe
{
    return cast(__m64) shufflevector!(byte8, 0, 8, 1, 9, 2, 10, 3, 11)(cast(byte8)a, cast(byte8)b);
}

/// Compute the bitwise XOR of 64 bits (representing integer data) in `a` and `b`.
__m64 _mm_xor_si64 (__m64 a, __m64 b) pure @safe
{
    return a ^ b;
}