/**
* Copyright: Copyright Auburn Sounds 2019.
* License:   $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
* Authors:   Guillaume Piolat
*/
module inteli.mmx;

public import inteli.types;
import inteli.internals;

import inteli.xmmintrin;
import inteli.emmintrin;

nothrow @nogc:

// Important: you don't need to call _mm_empty when using the "MMX" capabilities of
// intel-intrinsics, since it just generates the right IR and cleaning up the FPU
// registers is left to the code generator. intel-intrinsics only provides the semantics.


/// Add packed 16-bit integers in `a` and `b`.
__m64 _mm_add_pi16 (__m64 a, __m64 b)
{
    return cast(__m64)(cast(short4)a + cast(short4)b);
}
unittest
{
    short4 R = cast(short4) _mm_add_pi16(_mm_set1_pi16(4), _mm_set1_pi16(3));
    short[4] correct = [7, 7, 7, 7];
    assert(R.array == correct);
}

/// Add packed 32-bit integers in `a` and `b`.
__m64 _mm_add_pi32 (__m64 a, __m64 b)
{
    return cast(__m64)(cast(int2)a + cast(int2)b);
}
unittest
{
    int2 R = cast(int2) _mm_add_pi32(_mm_set1_pi32(4), _mm_set1_pi32(3));
    int[2] correct = [7, 7];
    assert(R.array == correct);
}

/// Add packed 8-bit integers in `a` and `b`.
__m64 _mm_add_pi8 (__m64 a, __m64 b)
{
    return cast(__m64)(cast(byte8)a + cast(byte8)b);
}
unittest
{
    byte8 R = cast(byte8) _mm_add_pi8(_mm_set1_pi8(127), _mm_set1_pi8(-128));
    byte[8] correct = [-1, -1, -1, -1, -1, -1, -1, -1];
    assert(R.array == correct);
}

/// Add packed 16-bit integers in `a` and `b` using signed saturation.
// PERF: PADDSW not generated
__m64 _mm_adds_pi16(__m64 a, __m64 b) pure @trusted
{
    return to_m64(_mm_adds_epi16(to_m128i(a), to_m128i(b)));
}
unittest
{
    short4 res = cast(short4) _mm_adds_pi16(_mm_set_pi16(3, 2, 1, 0),
                                            _mm_set_pi16(3, 2, 1, 0));
    static immutable short[4] correctResult = [0, 2, 4, 6];
    assert(res.array == correctResult);
}

/// Add packed 8-bit integers in `a` and `b` using signed saturation.
// PERF: PADDSB not generated
__m64 _mm_adds_pi8(__m64 a, __m64 b) pure @trusted
{
    return to_m64(_mm_adds_epi8(to_m128i(a), to_m128i(b)));
}
unittest
{
    byte8 res = cast(byte8) _mm_adds_pi8(_mm_set_pi8(7, 6, 5, 4, 3, 2, 1, 0),
                                         _mm_set_pi8(7, 6, 5, 4, 3, 2, 1, 0));
    static immutable byte[8] correctResult = [0, 2, 4, 6, 8, 10, 12, 14];
    assert(res.array == correctResult);
}

/// Add packed 16-bit integers in `a` and `b` using unsigned saturation.
// PERF: PADDUSW not generated
__m64 _mm_adds_pu16(__m64 a, __m64 b) pure @trusted
{
    return to_m64(_mm_adds_epu16(to_m128i(a), to_m128i(b)));
}
unittest
{
    short4 res = cast(short4) _mm_adds_pu16(_mm_set_pi16(3, 2, cast(short)65535, 0),
                                            _mm_set_pi16(3, 2, 1, 0));
    static immutable short[4] correctResult = [0, cast(short)65535, 4, 6];
    assert(res.array == correctResult);
}
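
// Extra illustrative test (an addition, not from the original test suite): unlike the
// wrapping _mm_add_pi16, the signed saturating add clamps at short.max.
unittest
{
    short4 R = cast(short4) _mm_adds_pi16(_mm_set1_pi16(30000), _mm_set1_pi16(30000));
    static immutable short[4] correct = [32767, 32767, 32767, 32767];
    assert(R.array == correct);
}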

/// Add packed 8-bit integers in `a` and `b` using unsigned saturation.
// PERF: PADDUSB not generated
__m64 _mm_adds_pu8(__m64 a, __m64 b) pure @trusted
{
    return to_m64(_mm_adds_epu8(to_m128i(a), to_m128i(b)));
}
unittest
{
    byte8 res = cast(byte8) _mm_adds_pu8(_mm_set_pi8(7, 6, 5, 4, 3, 2, cast(byte)255, 0),
                                         _mm_set_pi8(7, 6, 5, 4, 3, 2, 1, 0));
    static immutable byte[8] correctResult = [0, cast(byte)255, 4, 6, 8, 10, 12, 14];
    assert(res.array == correctResult);
}

/// Compute the bitwise AND of 64 bits (representing integer data) in `a` and `b`.
__m64 _mm_and_si64 (__m64 a, __m64 b) pure @safe
{
    return a & b;
}
unittest
{
    __m64 A = [7];
    __m64 B = [14];
    __m64 R = _mm_and_si64(A, B);
    assert(R.array[0] == 6);
}

/// Compute the bitwise NOT of 64 bits (representing integer data) in `a` and then AND with `b`.
__m64 _mm_andnot_si64 (__m64 a, __m64 b)
{
    return (~a) & b;
}
unittest
{
    __m64 A = [7];
    __m64 B = [14];
    __m64 R = _mm_andnot_si64(A, B);
    assert(R.array[0] == 8);
}

/// Compare packed 16-bit integers in `a` and `b` for equality.
__m64 _mm_cmpeq_pi16 (__m64 a, __m64 b) pure @safe
{
    static if (GDC_with_MMX)
    {
        return cast(__m64) __builtin_ia32_pcmpeqw(cast(short4)a, cast(short4)b);
    }
    else
    {
        return cast(__m64) equalMask!short4(cast(short4)a, cast(short4)b);
    }
}
unittest
{
    short4   A = [-3, -2, -1, 0];
    short4   B = [ 4,  3,  2, 1];
    short[4] E = [ 0,  0,  0, 0];
    short4   R = cast(short4)(_mm_cmpeq_pi16(cast(__m64)A, cast(__m64)B));
    assert(R.array == E);
}

/// Compare packed 32-bit integers in `a` and `b` for equality.
__m64 _mm_cmpeq_pi32 (__m64 a, __m64 b) pure @safe
{
    static if (GDC_with_MMX)
    {
        return cast(__m64) __builtin_ia32_pcmpeqd(cast(int2)a, cast(int2)b);
    }
    else
    {
        return cast(__m64) equalMask!int2(cast(int2)a, cast(int2)b);
    }
}
unittest
{
    int2   A = [-3, -2];
    int2   B = [ 4, -2];
    int[2] E = [ 0, -1];
    int2   R = cast(int2)(_mm_cmpeq_pi32(cast(__m64)A, cast(__m64)B));
    assert(R.array == E);
}

/// Compare packed 8-bit integers in `a` and `b` for equality.
__m64 _mm_cmpeq_pi8 (__m64 a, __m64 b) pure @safe
{
    static if (GDC_with_MMX)
    {
        return cast(__m64) __builtin_ia32_pcmpeqb(cast(byte8)a, cast(byte8)b);
    }
    else
    {
        return cast(__m64) equalMask!byte8(cast(byte8)a, cast(byte8)b);
    }
}
unittest
{
    __m64 A = _mm_setr_pi8(1, 2, 3, 1, 2, 1, 1, 2);
    __m64 B = _mm_setr_pi8(2, 2, 1, 2, 3, 1, 2, 3);
    byte8 C = cast(byte8) _mm_cmpeq_pi8(A, B);
    byte[8] correct = [0,-1, 0, 0, 0,-1, 0, 0];
    assert(C.array == correct);
}

/// Compare packed signed 16-bit integers in `a` and `b` for greater-than.
__m64 _mm_cmpgt_pi16 (__m64 a, __m64 b) pure @safe
{
    static if (GDC_with_MMX)
    {
        return cast(__m64) __builtin_ia32_pcmpgtw (cast(short4)a, cast(short4)b);
    }
    else
    {
        return cast(__m64) greaterMask!short4(cast(short4)a, cast(short4)b);
    }
}
unittest
{
    short4   A = [-3, -2, -1, 0];
    short4   B = [ 4,  3,  2, 1];
    short[4] E = [ 0,  0,  0, 0];
    short4   R = cast(short4)(_mm_cmpgt_pi16(cast(__m64)A, cast(__m64)B));
    assert(R.array == E);
}

/// Compare packed signed 32-bit integers in `a` and `b` for greater-than.
__m64 _mm_cmpgt_pi32 (__m64 a, __m64 b) pure @safe
{
    static if (GDC_with_MMX)
    {
        return cast(__m64) __builtin_ia32_pcmpgtd (cast(int2)a, cast(int2)b);
    }
    else
    {
        return cast(__m64) greaterMask!int2(cast(int2)a, cast(int2)b);
    }
}
unittest
{
    int2   A = [-3,  2];
    int2   B = [ 4, -2];
    int[2] E = [ 0, -1];
    int2   R = cast(int2)(_mm_cmpgt_pi32(cast(__m64)A, cast(__m64)B));
    assert(R.array == E);
}
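
// Extra illustrative test (an addition, not from the original test suite): the
// greater-than comparison is signed, so -1 is not greater than 0, and matching
// lanes are filled with all ones (-1), the usual SIMD boolean mask.
unittest
{
    short4   A = [-1,  1, 0, 0];
    short4   B = [ 0,  0, 0, 0];
    short[4] E = [ 0, -1, 0, 0];
    short4   R = cast(short4)(_mm_cmpgt_pi16(cast(__m64)A, cast(__m64)B));
    assert(R.array == E);
}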

/// Compare packed signed 8-bit integers in `a` and `b` for greater-than.
__m64 _mm_cmpgt_pi8 (__m64 a, __m64 b) pure @safe
{
    static if (GDC_with_MMX)
    {
        return cast(__m64) __builtin_ia32_pcmpgtb (cast(byte8)a, cast(byte8)b);
    }
    else
    {
        return cast(__m64) greaterMask!byte8(cast(byte8)a, cast(byte8)b);
    }
}
unittest
{
    __m64 A = _mm_setr_pi8(1, 2, 3, 1, 2, 1, 1, 2);
    __m64 B = _mm_setr_pi8(2, 2, 1, 2, 3, 1, 2, 3);
    byte8 C = cast(byte8) _mm_cmpgt_pi8(A, B);
    byte[8] correct = [0, 0,-1, 0, 0, 0, 0, 0];
    assert(C.array == correct);
}

/// Copy 64-bit integer `a` to `dst`.
long _mm_cvtm64_si64 (__m64 a) pure @safe
{
    return a.array[0];
}

/// Copy 32-bit integer `a` to the lower elements of `dst`, and zero the upper element of `dst`.
__m64 _mm_cvtsi32_si64 (int a) pure @trusted
{
    __m64 r = void;
    r.ptr[0] = a;
    return r;
}
unittest
{
    __m64 R = _mm_cvtsi32_si64(-1);
    assert(R.array[0] == -1);
}

/// Copy 64-bit integer `a` to `dst`.
__m64 _mm_cvtsi64_m64 (long a) pure @trusted
{
    __m64 r = void;
    r.ptr[0] = a;
    return r;
}
unittest
{
    __m64 R = _mm_cvtsi64_m64(-1);
    assert(R.array[0] == -1);
}

/// Copy the lower 32-bit integer in `a` to `dst`.
int _mm_cvtsi64_si32 (__m64 a) pure @safe
{
    int2 r = cast(int2)a;
    return r.array[0];
}

alias _m_empty = _mm_empty;

/// Empty the MMX state. With intel-intrinsics this is always a no-op; see the note at the top of this module.
void _mm_empty() pure @safe
{
    // do nothing, see comment on top of file
}

alias _m_from_int = _mm_cvtsi32_si64;
alias _m_from_int64 = _mm_cvtsi64_m64;

/// Multiply packed 16-bit integers in `a` and `b`, producing intermediate 32-bit integers.
/// Horizontally add adjacent pairs of intermediate 32-bit integers, and pack the results.
__m64 _mm_madd_pi16 (__m64 a, __m64 b) pure @safe
{
    return to_m64(_mm_madd_epi16(to_m128i(a), to_m128i(b)));
}
unittest
{
    short4 A = [-32768, -32768, 32767, 32767];
    short4 B = [-32768, -32768, 32767, 32767];
    int2 R = cast(int2) _mm_madd_pi16(cast(__m64)A, cast(__m64)B);
    int[2] correct = [-2147483648, 2*32767*32767];
    assert(R.array == correct);
}

/// Multiply the packed 16-bit integers in `a` and `b`, producing intermediate 32-bit integers,
/// and store the high 16 bits of the intermediate integers.
__m64 _mm_mulhi_pi16 (__m64 a, __m64 b) pure @safe
{
    return to_m64(_mm_mulhi_epi16(to_m128i(a), to_m128i(b)));
}
unittest
{
    __m64 A = _mm_setr_pi16(4, 8, -16, 7);
    __m64 B = _mm_set1_pi16(16384);
    short4 R = cast(short4)_mm_mulhi_pi16(A, B);
    short[4] correct = [1, 2, -4, 1];
    assert(R.array == correct);
}

/// Multiply the packed 16-bit integers in `a` and `b`, producing intermediate 32-bit integers,
/// and store the low 16 bits of the intermediate integers.
__m64 _mm_mullo_pi16 (__m64 a, __m64 b) pure @safe
{
    return to_m64(_mm_mullo_epi16(to_m128i(a), to_m128i(b)));
}
unittest
{
    __m64 A = _mm_setr_pi16(4, 1, 16, 7);
    __m64 B = _mm_set1_pi16(16384);
    short4 R = cast(short4)_mm_mullo_pi16(A, B);
    short[4] correct = [0, 16384, 0, -16384];
    assert(R.array == correct);
}

/// Compute the bitwise OR of 64 bits (representing integer data) in `a` and `b`.
__m64 _mm_or_si64 (__m64 a, __m64 b) pure @safe
{
    return a | b;
}
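
// Extra illustrative test (an addition, not from the original test suite) for
// _mm_or_si64, mirroring the _mm_and_si64 test above.
unittest
{
    __m64 A = [7];
    __m64 B = [14];
    __m64 R = _mm_or_si64(A, B);
    assert(R.array[0] == 15);
}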

/// Convert packed signed 16-bit integers from `a` and `b` to packed 8-bit integers
/// using signed saturation.
__m64 _mm_packs_pi16 (__m64 a, __m64 b) pure @trusted
{
    int4 p = cast(int4) _mm_packs_epi16(to_m128i(a), to_m128i(b));
    int2 r;
    r.ptr[0] = p.array[0];
    r.ptr[1] = p.array[2];
    return cast(__m64)r;
}
unittest
{
    __m64 A = _mm_setr_pi16(256, -129, 254, 0);
    byte8 R = cast(byte8) _mm_packs_pi16(A, A);
    byte[8] correct = [127, -128, 127, 0, 127, -128, 127, 0];
    assert(R.array == correct);
}

/// Convert packed signed 32-bit integers from `a` and `b` to packed 16-bit integers
/// using signed saturation.
__m64 _mm_packs_pi32 (__m64 a, __m64 b) pure @trusted
{
    int4 p = cast(int4) _mm_packs_epi32(to_m128i(a), to_m128i(b));
    int2 r;
    r.ptr[0] = p.array[0];
    r.ptr[1] = p.array[2];
    return cast(__m64)r;
}
unittest
{
    __m64 A = _mm_setr_pi32(100000, -100000);
    short4 R = cast(short4) _mm_packs_pi32(A, A);
    short[4] correct = [32767, -32768, 32767, -32768];
    assert(R.array == correct);
}

/// Convert packed signed 16-bit integers from `a` and `b` to packed 8-bit integers
/// using unsigned saturation.
__m64 _mm_packs_pu16 (__m64 a, __m64 b) pure @trusted
{
    int4 p = cast(int4) _mm_packus_epi16(to_m128i(a), to_m128i(b));
    int2 r;
    r.ptr[0] = p.array[0];
    r.ptr[1] = p.array[2];
    return cast(__m64)r;
}
unittest
{
    __m64 A = _mm_setr_pi16(256, -129, 254, 0);
    byte8 R = cast(byte8) _mm_packs_pu16(A, A);
    ubyte[8] correct = [255, 0, 254, 0, 255, 0, 254, 0];
    assert(R.array == cast(byte[8])correct);
}

deprecated alias
    _m_packssdw = _mm_packs_pi32,
    _m_packsswb = _mm_packs_pi16,
    _m_packuswb = _mm_packs_pu16,
    _m_paddb = _mm_add_pi8,
    _m_paddd = _mm_add_pi32,
    _m_paddsb = _mm_adds_pi8,
    _m_paddsw = _mm_adds_pi16,
    _m_paddusb = _mm_adds_pu8,
    _m_paddusw = _mm_adds_pu16,
    _m_paddw = _mm_add_pi16,
    _m_pand = _mm_and_si64,
    _m_pandn = _mm_andnot_si64,
    _m_pcmpeqb = _mm_cmpeq_pi8,
    _m_pcmpeqd = _mm_cmpeq_pi32,
    _m_pcmpeqw = _mm_cmpeq_pi16,
    _m_pcmpgtb = _mm_cmpgt_pi8,
    _m_pcmpgtd = _mm_cmpgt_pi32,
    _m_pcmpgtw = _mm_cmpgt_pi16,
    _m_pmaddwd = _mm_madd_pi16,
    _m_pmulhw = _mm_mulhi_pi16,
    _m_pmullw = _mm_mullo_pi16,
    _m_por = _mm_or_si64,
    _m_pslld = _mm_sll_pi32,
    _m_pslldi = _mm_slli_pi32,
    _m_psllq = _mm_sll_si64,
    _m_psllqi = _mm_slli_si64,
    _m_psllw = _mm_sll_pi16,
    _m_psllwi = _mm_slli_pi16,
    _m_psrad = _mm_sra_pi32,
    _m_psradi = _mm_srai_pi32,
    _m_psraw = _mm_sra_pi16,
    _m_psrawi = _mm_srai_pi16,
    _m_psrld = _mm_srl_pi32,
    _m_psrldi = _mm_srli_pi32,
    _m_psrlq = _mm_srl_si64,
    _m_psrlqi = _mm_srli_si64,
    _m_psrlw = _mm_srl_pi16,
    _m_psrlwi = _mm_srli_pi16,
    _m_psubb = _mm_sub_pi8,
    _m_psubd = _mm_sub_pi32,
    _m_psubsb = _mm_subs_pi8,
    _m_psubsw = _mm_subs_pi16,
    _m_psubusb = _mm_subs_pu8,
    _m_psubusw = _mm_subs_pu16,
    _m_psubw = _mm_sub_pi16,
    _m_punpckhbw = _mm_unpackhi_pi8,
    _m_punpckhdq = _mm_unpackhi_pi32,
    _m_punpckhwd = _mm_unpackhi_pi16,
    _m_punpcklbw = _mm_unpacklo_pi8,
    _m_punpckldq = _mm_unpacklo_pi32,
    _m_punpcklwd = _mm_unpacklo_pi16,
    _m_pxor = _mm_xor_si64;

/// Set packed 16-bit integers with the supplied values.
__m64 _mm_set_pi16 (short e3, short e2, short e1, short e0) pure @trusted
{
    short[4] arr = [e0, e1, e2, e3];
    return *cast(__m64*)(arr.ptr);
}
unittest
{
    short4 R = cast(short4) _mm_set_pi16(3, 2, 1, 0);
    short[4] correct = [0, 1, 2, 3];
    assert(R.array == correct);
}

/// Set packed 32-bit integers with the supplied values.
__m64 _mm_set_pi32 (int e1, int e0) pure @trusted
{
    int[2] arr = [e0, e1];
    return *cast(__m64*)(arr.ptr);
}
unittest
{
    int2 R = cast(int2) _mm_set_pi32(1, 0);
    int[2] correct = [0, 1];
    assert(R.array == correct);
}

/// Set packed 8-bit integers with the supplied values.
__m64 _mm_set_pi8 (byte e7, byte e6, byte e5, byte e4, byte e3, byte e2, byte e1, byte e0) pure @trusted
{
    byte[8] arr = [e0, e1, e2, e3, e4, e5, e6, e7];
    return *cast(__m64*)(arr.ptr);
}
unittest
{
    byte8 R = cast(byte8) _mm_set_pi8(7, 6, 5, 4, 3, 2, 1, 0);
    byte[8] correct = [0, 1, 2, 3, 4, 5, 6, 7];
    assert(R.array == correct);
}

/// Broadcast 16-bit integer `a` to all elements.
__m64 _mm_set1_pi16 (short a) pure @trusted
{
    return cast(__m64)(short4(a));
}
unittest
{
    short4 R = cast(short4) _mm_set1_pi16(44);
    short[4] correct = [44, 44, 44, 44];
    assert(R.array == correct);
}

/// Broadcast 32-bit integer `a` to all elements.
__m64 _mm_set1_pi32 (int a) pure @trusted
{
    return cast(__m64)(int2(a));
}
unittest
{
    int2 R = cast(int2) _mm_set1_pi32(43);
    int[2] correct = [43, 43];
    assert(R.array == correct);
}

/// Broadcast 8-bit integer `a` to all elements.
__m64 _mm_set1_pi8 (byte a) pure @trusted
{
    return cast(__m64)(byte8(a));
}
unittest
{
    byte8 R = cast(byte8) _mm_set1_pi8(42);
    byte[8] correct = [42, 42, 42, 42, 42, 42, 42, 42];
    assert(R.array == correct);
}

/// Set packed 16-bit integers with the supplied values in reverse order.
__m64 _mm_setr_pi16 (short e3, short e2, short e1, short e0) pure @trusted
{
    short[4] arr = [e3, e2, e1, e0];
    return *cast(__m64*)(arr.ptr);
}
unittest
{
    short4 R = cast(short4) _mm_setr_pi16(0, 1, 2, 3);
    short[4] correct = [0, 1, 2, 3];
    assert(R.array == correct);
}

/// Set packed 32-bit integers with the supplied values in reverse order.
__m64 _mm_setr_pi32 (int e1, int e0) pure @trusted
{
    int[2] arr = [e1, e0];
    return *cast(__m64*)(arr.ptr);
}
unittest
{
    int2 R = cast(int2) _mm_setr_pi32(0, 1);
    int[2] correct = [0, 1];
    assert(R.array == correct);
}

/// Set packed 8-bit integers with the supplied values in reverse order.
__m64 _mm_setr_pi8 (byte e7, byte e6, byte e5, byte e4, byte e3, byte e2, byte e1, byte e0) pure @trusted
{
    byte[8] arr = [e7, e6, e5, e4, e3, e2, e1, e0];
    return *cast(__m64*)(arr.ptr);
}
unittest
{
    byte8 R = cast(byte8) _mm_setr_pi8(0, 1, 2, 3, 4, 5, 6, 7);
    byte[8] correct = [0, 1, 2, 3, 4, 5, 6, 7];
    assert(R.array == correct);
}

/// Return vector of type `__m64` with all elements set to zero.
__m64 _mm_setzero_si64 () pure @trusted
{
    __m64 r;
    r.ptr[0] = 0;
    return r;
}
unittest
{
    __m64 R = _mm_setzero_si64();
    assert(R.array[0] == 0);
}

/// Shift packed 16-bit integers in `a` left by `count` while shifting in zeros.
__m64 _mm_sll_pi16 (__m64 a, __m64 count) pure @safe
{
    return to_m64(_mm_sll_epi16(to_m128i(a), to_m128i(count)));
}

/// Shift packed 32-bit integers in `a` left by `count` while shifting in zeros.
__m64 _mm_sll_pi32 (__m64 a, __m64 count) pure @safe
{
    return to_m64(_mm_sll_epi32(to_m128i(a), to_m128i(count)));
}

/// Shift 64-bit integer `a` left by `count` while shifting in zeros.
__m64 _mm_sll_si64 (__m64 a, __m64 count) pure @safe
{
    return to_m64(_mm_sll_epi64(to_m128i(a), to_m128i(count)));
}

/// Shift packed 16-bit integers in `a` left by `imm8` while shifting in zeros.
__m64 _mm_slli_pi16 (__m64 a, int imm8) pure @safe
{
    return to_m64(_mm_slli_epi16(to_m128i(a), imm8));
}

/// Shift packed 32-bit integers in `a` left by `imm8` while shifting in zeros.
__m64 _mm_slli_pi32 (__m64 a, int imm8) pure @safe
{
    return to_m64(_mm_slli_epi32(to_m128i(a), imm8));
}

/// Shift 64-bit integer `a` left by `imm8` while shifting in zeros.
__m64 _mm_slli_si64 (__m64 a, int imm8) pure @safe
{
    return to_m64(_mm_slli_epi64(to_m128i(a), imm8));
}

/// Shift packed 16-bit integers in `a` right by `count` while shifting in sign bits.
__m64 _mm_sra_pi16 (__m64 a, __m64 count) pure @safe
{
    return to_m64(_mm_sra_epi16(to_m128i(a), to_m128i(count)));
}

/// Shift packed 32-bit integers in `a` right by `count` while shifting in sign bits.
__m64 _mm_sra_pi32 (__m64 a, __m64 count) pure @safe
{
    return to_m64(_mm_sra_epi32(to_m128i(a), to_m128i(count)));
}

/// Shift packed 16-bit integers in `a` right by `imm8` while shifting in sign bits.
__m64 _mm_srai_pi16 (__m64 a, int imm8) pure @safe
{
    return to_m64(_mm_srai_epi16(to_m128i(a), imm8));
}

/// Shift packed 32-bit integers in `a` right by `imm8` while shifting in sign bits.
__m64 _mm_srai_pi32 (__m64 a, int imm8) pure @safe
{
    return to_m64(_mm_srai_epi32(to_m128i(a), imm8));
}

/// Shift packed 16-bit integers in `a` right by `count` while shifting in zeros.
__m64 _mm_srl_pi16 (__m64 a, __m64 count) pure @safe
{
    return to_m64(_mm_srl_epi16(to_m128i(a), to_m128i(count)));
}

/// Shift packed 32-bit integers in `a` right by `count` while shifting in zeros.
__m64 _mm_srl_pi32 (__m64 a, __m64 count) pure @safe
{
    return to_m64(_mm_srl_epi32(to_m128i(a), to_m128i(count)));
}

/// Shift 64-bit integer `a` right by `count` while shifting in zeros.
__m64 _mm_srl_si64 (__m64 a, __m64 count) pure @safe
{
    return to_m64(_mm_srl_epi64(to_m128i(a), to_m128i(count)));
}

/// Shift packed 16-bit integers in `a` right by `imm8` while shifting in zeros.
__m64 _mm_srli_pi16 (__m64 a, int imm8) pure @safe
{
    return to_m64(_mm_srli_epi16(to_m128i(a), imm8));
}

/// Shift packed 32-bit integers in `a` right by `imm8` while shifting in zeros.
__m64 _mm_srli_pi32 (__m64 a, int imm8) pure @safe
{
    return to_m64(_mm_srli_epi32(to_m128i(a), imm8));
}

/// Shift 64-bit integer `a` right by `imm8` while shifting in zeros.
__m64 _mm_srli_si64 (__m64 a, int imm8) pure @safe
{
    return to_m64(_mm_srli_epi64(to_m128i(a), imm8));
}
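
// Extra illustrative tests (an addition, not from the original test suite) for the
// immediate shift intrinsics: the left shift multiplies by a power of two, and the
// arithmetic right shift preserves the sign.
unittest
{
    short4 R = cast(short4) _mm_slli_pi16(_mm_set1_pi16(3), 2);
    static immutable short[4] correct = [12, 12, 12, 12];
    assert(R.array == correct);

    int2 S = cast(int2) _mm_srai_pi32(_mm_set1_pi32(-8), 1);
    static immutable int[2] correct2 = [-4, -4];
    assert(S.array == correct2);
}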

/// Subtract packed 16-bit integers in `b` from packed 16-bit integers in `a`.
__m64 _mm_sub_pi16 (__m64 a, __m64 b) pure @safe
{
    return cast(__m64)(cast(short4)a - cast(short4)b);
}

/// Subtract packed 32-bit integers in `b` from packed 32-bit integers in `a`.
__m64 _mm_sub_pi32 (__m64 a, __m64 b) pure @safe
{
    return cast(__m64)(cast(int2)a - cast(int2)b);
}

/// Subtract packed 8-bit integers in `b` from packed 8-bit integers in `a`.
__m64 _mm_sub_pi8 (__m64 a, __m64 b) pure @safe
{
    return cast(__m64)(cast(byte8)a - cast(byte8)b);
}

/// Subtract packed 16-bit integers in `b` from packed 16-bit integers in `a` using signed saturation.
__m64 _mm_subs_pi16 (__m64 a, __m64 b) pure @safe
{
    return to_m64(_mm_subs_epi16(to_m128i(a), to_m128i(b)));
}

/// Subtract packed 8-bit integers in `b` from packed 8-bit integers in `a` using signed saturation.
__m64 _mm_subs_pi8 (__m64 a, __m64 b) pure @safe
{
    return to_m64(_mm_subs_epi8(to_m128i(a), to_m128i(b)));
}

/// Subtract packed unsigned 16-bit integers in `b` from packed unsigned 16-bit integers in `a` using unsigned saturation.
__m64 _mm_subs_pu16 (__m64 a, __m64 b) pure @safe
{
    return to_m64(_mm_subs_epu16(to_m128i(a), to_m128i(b)));
}

/// Subtract packed unsigned 8-bit integers in `b` from packed unsigned 8-bit integers in `a` using unsigned saturation.
__m64 _mm_subs_pu8 (__m64 a, __m64 b) pure @safe
{
    return to_m64(_mm_subs_epu8(to_m128i(a), to_m128i(b)));
}

deprecated alias _m_to_int = _mm_cvtsi64_si32;
deprecated alias _m_to_int64 = _mm_cvtm64_si64;

/// Unpack and interleave 16-bit integers from the high half of `a` and `b`.
__m64 _mm_unpackhi_pi16 (__m64 a, __m64 b) pure @safe
{
    return cast(__m64) shufflevector!(short4, 2, 6, 3, 7)(cast(short4)a, cast(short4)b);
}

/// Unpack and interleave 32-bit integers from the high half of `a` and `b`.
__m64 _mm_unpackhi_pi32 (__m64 a, __m64 b) pure @safe
{
    return cast(__m64) shufflevector!(int2, 1, 3)(cast(int2)a, cast(int2)b);
}

/// Unpack and interleave 8-bit integers from the high half of `a` and `b`.
__m64 _mm_unpackhi_pi8 (__m64 a, __m64 b)
{
    return cast(__m64) shufflevector!(byte8, 4, 12, 5, 13, 6, 14, 7, 15)(cast(byte8)a, cast(byte8)b);
}

/// Unpack and interleave 16-bit integers from the low half of `a` and `b`.
__m64 _mm_unpacklo_pi16 (__m64 a, __m64 b)
{
    return cast(__m64) shufflevector!(short4, 0, 4, 1, 5)(cast(short4)a, cast(short4)b);
}

/// Unpack and interleave 32-bit integers from the low half of `a` and `b`.
__m64 _mm_unpacklo_pi32 (__m64 a, __m64 b) pure @safe
{
    return cast(__m64) shufflevector!(int2, 0, 2)(cast(int2)a, cast(int2)b);
}

/// Unpack and interleave 8-bit integers from the low half of `a` and `b`.
__m64 _mm_unpacklo_pi8 (__m64 a, __m64 b)
{
    return cast(__m64) shufflevector!(byte8, 0, 8, 1, 9, 2, 10, 3, 11)(cast(byte8)a, cast(byte8)b);
}

/// Compute the bitwise XOR of 64 bits (representing integer data) in `a` and `b`.
__m64 _mm_xor_si64 (__m64 a, __m64 b)
{
    return a ^ b;
}
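
// Extra illustrative tests (an addition, not from the original test suite):
// _mm_xor_si64 mirrors the _mm_and_si64 test above, and the unpack test shows
// the low-half interleaving pattern [a0, b0, a1, b1].
unittest
{
    __m64 A = [7];
    __m64 B = [14];
    __m64 R = _mm_xor_si64(A, B);
    assert(R.array[0] == 9);

    short4 C = cast(short4) _mm_unpacklo_pi16(_mm_setr_pi16(0, 1, 2, 3),
                                              _mm_setr_pi16(4, 5, 6, 7));
    static immutable short[4] correct = [0, 4, 1, 5];
    assert(C.array == correct);
}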