/**
* Copyright: Copyright Auburn Sounds 2019.
* License:   $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
* Authors:   Guillaume Piolat
* Macros:
*      GUIDE = https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=$0
*
*/
module inteli.mmx;

public import inteli.types;
import inteli.internals;

import inteli.xmmintrin;
import inteli.emmintrin;

nothrow @nogc:

// Important: you don't need to call _mm_empty when using the "MMX" capabilities of intel-intrinsics,
// since it just generates the right IR and cleaning up FPU registers is up to the codegen.
// intel-intrinsics is just semantics.
// (An illustrative unittest at the end of this module chains a few of these intrinsics together.)


/// Add packed 16-bit integers in `a` and `b`.
__m64 _mm_add_pi16 (__m64 a, __m64 b)
{
    return cast(__m64)(cast(short4)a + cast(short4)b);
}
unittest
{
    short4 R = cast(short4) _mm_add_pi16(_mm_set1_pi16(4), _mm_set1_pi16(3));
    short[4] correct = [7, 7, 7, 7];
    assert(R.array == correct);
}

/// Add packed 32-bit integers in `a` and `b`.
__m64 _mm_add_pi32 (__m64 a, __m64 b)
{
    return cast(__m64)(cast(int2)a + cast(int2)b);
}
unittest
{
    int2 R = cast(int2) _mm_add_pi32(_mm_set1_pi32(4), _mm_set1_pi32(3));
    int[2] correct = [7, 7];
    assert(R.array == correct);
}

/// Add packed 8-bit integers in `a` and `b`.
__m64 _mm_add_pi8 (__m64 a, __m64 b)
{
    return cast(__m64)(cast(byte8)a + cast(byte8)b);
}
unittest
{
    byte8 R = cast(byte8) _mm_add_pi8(_mm_set1_pi8(127), _mm_set1_pi8(-128));
    byte[8] correct = [-1, -1, -1, -1, -1, -1, -1, -1];
    assert(R.array == correct);
}

/// Add packed 16-bit integers in `a` and `b` using signed saturation.
// PERF: PADDSW not generated
__m64 _mm_adds_pi16(__m64 a, __m64 b) pure @trusted
{
    return to_m64(_mm_adds_epi16(to_m128i(a), to_m128i(b)));
}
unittest
{
    short4 res = cast(short4) _mm_adds_pi16(_mm_set_pi16(3, 2, 1, 0),
                                            _mm_set_pi16(3, 2, 1, 0));
    static immutable short[4] correctResult = [0, 2, 4, 6];
    assert(res.array == correctResult);
}

/// Add packed 8-bit integers in `a` and `b` using signed saturation.
// PERF: PADDSB not generated
__m64 _mm_adds_pi8(__m64 a, __m64 b) pure @trusted
{
    return to_m64(_mm_adds_epi8(to_m128i(a), to_m128i(b)));
}
unittest
{
    byte8 res = cast(byte8) _mm_adds_pi8(_mm_set_pi8(7, 6, 5, 4, 3, 2, 1, 0),
                                         _mm_set_pi8(7, 6, 5, 4, 3, 2, 1, 0));
    static immutable byte[8] correctResult = [0, 2, 4, 6, 8, 10, 12, 14];
    assert(res.array == correctResult);
}

/// Add packed 16-bit integers in `a` and `b` using unsigned saturation.
// PERF: PADDUSW not generated
__m64 _mm_adds_pu16(__m64 a, __m64 b) pure @trusted
{
    return to_m64(_mm_adds_epu16(to_m128i(a), to_m128i(b)));
}
unittest
{
    short4 res = cast(short4) _mm_adds_pu16(_mm_set_pi16(3, 2, cast(short)65535, 0),
                                            _mm_set_pi16(3, 2, 1, 0));
    static immutable short[4] correctResult = [0, cast(short)65535, 4, 6];
    assert(res.array == correctResult);
}

/// Add packed 8-bit integers in `a` and `b` using unsigned saturation.
// PERF: PADDUSB not generated
__m64 _mm_adds_pu8(__m64 a, __m64 b) pure @trusted
{
    return to_m64(_mm_adds_epu8(to_m128i(a), to_m128i(b)));
}
unittest
{
    byte8 res = cast(byte8) _mm_adds_pu8(_mm_set_pi8(7, 6, 5, 4, 3, 2, cast(byte)255, 0),
                                         _mm_set_pi8(7, 6, 5, 4, 3, 2, 1, 0));
    static immutable byte[8] correctResult = [0, cast(byte)255, 4, 6, 8, 10, 12, 14];
    assert(res.array == correctResult);
}

/// Compute the bitwise AND of 64 bits (representing integer data) in `a` and `b`.
__m64 _mm_and_si64 (__m64 a, __m64 b) pure @safe
{
    return a & b;
}
unittest
{
    __m64 A = [7];
    __m64 B = [14];
    __m64 R = _mm_and_si64(A, B);
    assert(R.array[0] == 6);
}

/// Compute the bitwise NOT of 64 bits (representing integer data) in `a` and then AND with `b`.
__m64 _mm_andnot_si64 (__m64 a, __m64 b)
{
    return (~a) & b;
}
unittest
{
    __m64 A = [7];
    __m64 B = [14];
    __m64 R = _mm_andnot_si64(A, B);
    assert(R.array[0] == 8);
}

/// Compare packed 16-bit integers in `a` and `b` for equality.
__m64 _mm_cmpeq_pi16 (__m64 a, __m64 b) pure @safe
{
    static if (GDC_with_MMX)
    {
        return cast(__m64) __builtin_ia32_pcmpeqw(cast(short4)a, cast(short4)b);
    }
    else
    {
        return cast(__m64) equalMask!short4(cast(short4)a, cast(short4)b);
    }
}
unittest
{
    short4 A = [-3, -2, -1, 0];
    short4 B = [ 4,  3,  2, 1];
    short[4] E = [ 0,  0,  0, 0];
    short4 R = cast(short4)(_mm_cmpeq_pi16(cast(__m64)A, cast(__m64)B));
    assert(R.array == E);
}

/// Compare packed 32-bit integers in `a` and `b` for equality.
__m64 _mm_cmpeq_pi32 (__m64 a, __m64 b) pure @safe
{
    static if (GDC_with_MMX)
    {
        return cast(__m64) __builtin_ia32_pcmpeqd(cast(int2)a, cast(int2)b);
    }
    else
    {
        return cast(__m64) equalMask!int2(cast(int2)a, cast(int2)b);
    }
}
unittest
{
    int2 A = [-3, -2];
    int2 B = [ 4, -2];
    int[2] E = [ 0, -1];
    int2 R = cast(int2)(_mm_cmpeq_pi32(cast(__m64)A, cast(__m64)B));
    assert(R.array == E);
}

/// Compare packed 8-bit integers in `a` and `b` for equality.
__m64 _mm_cmpeq_pi8 (__m64 a, __m64 b) pure @safe
{
    static if (GDC_with_MMX)
    {
        return cast(__m64) __builtin_ia32_pcmpeqb(cast(byte8)a, cast(byte8)b);
    }
    else
    {
        return cast(__m64) equalMask!byte8(cast(byte8)a, cast(byte8)b);
    }
}
unittest
{
    __m64 A = _mm_setr_pi8(1, 2, 3, 1, 2, 1, 1, 2);
    __m64 B = _mm_setr_pi8(2, 2, 1, 2, 3, 1, 2, 3);
    byte8 C = cast(byte8) _mm_cmpeq_pi8(A, B);
    byte[8] correct = [0,-1, 0, 0, 0,-1, 0, 0];
    assert(C.array == correct);
}

/// Compare packed 16-bit integers in `a` and `b` for greater-than.
__m64 _mm_cmpgt_pi16 (__m64 a, __m64 b) pure @safe
{
    static if (GDC_with_MMX)
    {
        return cast(__m64) __builtin_ia32_pcmpgtw(cast(short4)a, cast(short4)b);
    }
    else
    {
        return cast(__m64) greaterMask!short4(cast(short4)a, cast(short4)b);
    }
}
unittest
{
    short4 A = [-3, -2, -1, 0];
    short4 B = [ 4,  3,  2, 1];
    short[4] E = [ 0,  0,  0, 0];
    short4 R = cast(short4)(_mm_cmpgt_pi16(cast(__m64)A, cast(__m64)B));
    assert(R.array == E);
}

/// Compare packed 32-bit integers in `a` and `b` for greater-than.
__m64 _mm_cmpgt_pi32 (__m64 a, __m64 b) pure @safe
{
    static if (GDC_with_MMX)
    {
        return cast(__m64) __builtin_ia32_pcmpgtd(cast(int2)a, cast(int2)b);
    }
    else
    {
        return cast(__m64) greaterMask!int2(cast(int2)a, cast(int2)b);
    }
}
unittest
{
    int2 A = [-3,  2];
    int2 B = [ 4, -2];
    int[2] E = [ 0, -1];
    int2 R = cast(int2)(_mm_cmpgt_pi32(cast(__m64)A, cast(__m64)B));
    assert(R.array == E);
}

/// Compare packed 8-bit integers in `a` and `b` for greater-than.
__m64 _mm_cmpgt_pi8 (__m64 a, __m64 b) pure @safe
{
    static if (GDC_with_MMX)
    {
        return cast(__m64) __builtin_ia32_pcmpgtb(cast(byte8)a, cast(byte8)b);
    }
    else
    {
        return cast(__m64) greaterMask!byte8(cast(byte8)a, cast(byte8)b);
    }
}
unittest
{
    __m64 A = _mm_setr_pi8(1, 2, 3, 1, 2, 1, 1, 2);
    __m64 B = _mm_setr_pi8(2, 2, 1, 2, 3, 1, 2, 3);
    byte8 C = cast(byte8) _mm_cmpgt_pi8(A, B);
    byte[8] correct = [0, 0,-1, 0, 0, 0, 0, 0];
    assert(C.array == correct);
}

/// Copy 64-bit integer `a` to `dst`.
long _mm_cvtm64_si64 (__m64 a) pure @safe
{
    long1 la = cast(long1)a;
    return la.array[0];
}
unittest
{
    __m64 A = _mm_setr_pi32(2, 1);
    long1 lA = cast(long1)A;
    assert(lA.array[0] == 0x100000002);
}

/// Copy 32-bit integer `a` to the lower element of `dst`, and zero the upper element of `dst`.
__m64 _mm_cvtsi32_si64 (int a) pure @trusted
{
    __m64 r = void;
    r.ptr[0] = a;
    return r;
}
unittest
{
    __m64 R = _mm_cvtsi32_si64(-1);
    assert(R.array[0] == -1);
}

/// Copy 64-bit integer `a` to `dst`.
__m64 _mm_cvtsi64_m64 (long a) pure @trusted
{
    __m64 r = void;
    r.ptr[0] = a;
    return r;
}
unittest
{
    __m64 R = _mm_cvtsi64_m64(0x123456789A);
    assert(R.array[0] == 0x123456789A);
}

/// Get the lower 32-bit integer in `a`.
int _mm_cvtsi64_si32 (__m64 a) pure @safe
{
    int2 r = cast(int2)a;
    return r.array[0];
}
unittest
{
    __m64 A = _mm_setr_pi32(-6, 5);
    int R = _mm_cvtsi64_si32(A);
    assert(R == -6);
}

/// Empty the MMX state, which marks the x87 FPU registers as available for
/// use by x87 instructions.
/// This instruction is supposed to be used at the end of all MMX technology procedures.
/// This is useless when using `intel-intrinsics`, at least with LDC and DMD.
void _mm_empty() pure @safe
{
    // do nothing, see comment on top of file
    // TODO: not sure for GDC, do something?
}

///ditto
alias _m_empty = _mm_empty;

alias _m_from_int = _mm_cvtsi32_si64;
alias _m_from_int64 = _mm_cvtsi64_m64;

/// Multiply packed 16-bit integers in `a` and `b`, producing intermediate 32-bit integers.
/// Horizontally add adjacent pairs of intermediate 32-bit integers.
__m64 _mm_madd_pi16 (__m64 a, __m64 b) pure @safe
{
    return to_m64(_mm_madd_epi16(to_m128i(a), to_m128i(b)));
}
unittest
{
    short4 A = [-32768, -32768, 32767, 32767];
    short4 B = [-32768, -32768, 32767, 32767];
    int2 R = cast(int2) _mm_madd_pi16(cast(__m64)A, cast(__m64)B);
    int[2] correct = [-2147483648, 2*32767*32767];
    assert(R.array == correct);
}

/// Multiply the packed 16-bit integers in `a` and `b`, producing intermediate 32-bit integers,
/// and store the high 16 bits of the intermediate integers.
__m64 _mm_mulhi_pi16 (__m64 a, __m64 b) pure @safe
{
    return to_m64(_mm_mulhi_epi16(to_m128i(a), to_m128i(b)));
}
unittest
{
    __m64 A = _mm_setr_pi16(4, 8, -16, 7);
    __m64 B = _mm_set1_pi16(16384);
    short4 R = cast(short4)_mm_mulhi_pi16(A, B);
    short[4] correct = [1, 2, -4, 1];
    assert(R.array == correct);
}

/// Multiply the packed 16-bit integers in `a` and `b`, producing intermediate 32-bit integers,
/// and store the low 16 bits of the intermediate integers.
__m64 _mm_mullo_pi16 (__m64 a, __m64 b) pure @safe
{
    return to_m64(_mm_mullo_epi16(to_m128i(a), to_m128i(b)));
}
unittest
{
    __m64 A = _mm_setr_pi16(4, 1, 16, 7);
    __m64 B = _mm_set1_pi16(16384);
    short4 R = cast(short4)_mm_mullo_pi16(A, B);
    short[4] correct = [0, 16384, 0, -16384];
    assert(R.array == correct);
}

/// Compute the bitwise OR of 64 bits in `a` and `b`.
__m64 _mm_or_si64 (__m64 a, __m64 b) pure @safe
{
    return a | b;
}
unittest
{
    __m64 A = _mm_setr_pi16(255, 1, -1, 0);
    __m64 B = _mm_set1_pi16(15);
    short4 R = cast(short4)_mm_or_si64(A, B);
    short[4] correct = [255, 15, -1, 15];
    assert(R.array == correct);
}

/// Convert packed 16-bit integers from `a` and `b` to packed 8-bit integers using signed saturation.
__m64 _mm_packs_pi16 (__m64 a, __m64 b) pure @trusted
{
    int4 p = cast(int4) _mm_packs_epi16(to_m128i(a), to_m128i(b));
    int2 r;
    r.ptr[0] = p.array[0];
    r.ptr[1] = p.array[2];
    return cast(__m64)r;
}
unittest
{
    __m64 A = _mm_setr_pi16(256, -129, 254, 0);
    byte8 R = cast(byte8) _mm_packs_pi16(A, A);
    byte[8] correct = [127, -128, 127, 0, 127, -128, 127, 0];
    assert(R.array == correct);
}

/// Convert packed 32-bit integers from `a` and `b` to packed 16-bit integers using signed saturation.
__m64 _mm_packs_pi32 (__m64 a, __m64 b) pure @trusted
{
    int4 p = cast(int4) _mm_packs_epi32(to_m128i(a), to_m128i(b));
    int2 r;
    r.ptr[0] = p.array[0];
    r.ptr[1] = p.array[2];
    return cast(__m64)r;
}
unittest
{
    __m64 A = _mm_setr_pi32(100000, -100000);
    short4 R = cast(short4) _mm_packs_pi32(A, A);
    short[4] correct = [32767, -32768, 32767, -32768];
    assert(R.array == correct);
}

/// Convert packed 16-bit integers from `a` and `b` to packed 8-bit integers using unsigned saturation.
__m64 _mm_packs_pu16 (__m64 a, __m64 b) pure @trusted
{
    int4 p = cast(int4) _mm_packus_epi16(to_m128i(a), to_m128i(b));
    int2 r;
    r.ptr[0] = p.array[0];
    r.ptr[1] = p.array[2];
    return cast(__m64)r;
}
unittest
{
    __m64 A = _mm_setr_pi16(256, -129, 254, 0);
    byte8 R = cast(byte8) _mm_packs_pu16(A, A);
    ubyte[8] correct = [255, 0, 254, 0, 255, 0, 254, 0];
    assert(R.array == cast(byte[8])correct);
}

deprecated alias
    _m_packssdw = _mm_packs_pi32,
    _m_packsswb = _mm_packs_pi16,
    _m_packuswb = _mm_packs_pu16,
    _m_paddb = _mm_add_pi8,
    _m_paddd = _mm_add_pi32,
    _m_paddsb = _mm_adds_pi8,
    _m_paddsw = _mm_adds_pi16,
    _m_paddusb = _mm_adds_pu8,
    _m_paddusw = _mm_adds_pu16,
    _m_paddw = _mm_add_pi16,
    _m_pand = _mm_and_si64,
    _m_pandn = _mm_andnot_si64,
    _m_pcmpeqb = _mm_cmpeq_pi8,
    _m_pcmpeqd = _mm_cmpeq_pi32,
    _m_pcmpeqw = _mm_cmpeq_pi16,
    _m_pcmpgtb = _mm_cmpgt_pi8,
    _m_pcmpgtd = _mm_cmpgt_pi32,
    _m_pcmpgtw = _mm_cmpgt_pi16,
    _m_pmaddwd = _mm_madd_pi16,
    _m_pmulhw = _mm_mulhi_pi16,
    _m_pmullw = _mm_mullo_pi16,
    _m_por = _mm_or_si64,
    _m_pslld = _mm_sll_pi32,
    _m_pslldi = _mm_slli_pi32,
    _m_psllq = _mm_sll_si64,
    _m_psllqi = _mm_slli_si64,
    _m_psllw = _mm_sll_pi16,
    _m_psllwi = _mm_slli_pi16,
    _m_psrad = _mm_sra_pi32,
    _m_psradi = _mm_srai_pi32,
    _m_psraw = _mm_sra_pi16,
    _m_psrawi = _mm_srai_pi16,
    _m_psrld = _mm_srl_pi32,
    _m_psrldi = _mm_srli_pi32,
    _m_psrlq = _mm_srl_si64,
    _m_psrlqi = _mm_srli_si64,
    _m_psrlw = _mm_srl_pi16,
    _m_psrlwi = _mm_srli_pi16,
    _m_psubb = _mm_sub_pi8,
    _m_psubd = _mm_sub_pi32,
    _m_psubsb = _mm_subs_pi8,
    _m_psubsw = _mm_subs_pi16,
    _m_psubusb = _mm_subs_pu8,
    _m_psubusw = _mm_subs_pu16,
    _m_psubw = _mm_sub_pi16,
    _m_punpckhbw = _mm_unpackhi_pi8,
    _m_punpckhdq = _mm_unpackhi_pi32,
    _m_punpckhwd = _mm_unpackhi_pi16,
    _m_punpcklbw = _mm_unpacklo_pi8,
    _m_punpckldq = _mm_unpacklo_pi32,
    _m_punpcklwd = _mm_unpacklo_pi16,
    _m_pxor = _mm_xor_si64;

/// Set packed 16-bit integers with the supplied values.
__m64 _mm_set_pi16 (short e3, short e2, short e1, short e0) pure @trusted
{
    short[4] arr = [e0, e1, e2, e3];
    return *cast(__m64*)(arr.ptr);
}
unittest
{
    short4 R = cast(short4) _mm_set_pi16(3, 2, 1, 0);
    short[4] correct = [0, 1, 2, 3];
    assert(R.array == correct);
}

/// Set packed 32-bit integers with the supplied values.
__m64 _mm_set_pi32 (int e1, int e0) pure @trusted
{
    int[2] arr = [e0, e1];
    return *cast(__m64*)(arr.ptr);
}
unittest
{
    int2 R = cast(int2) _mm_set_pi32(1, 0);
    int[2] correct = [0, 1];
    assert(R.array == correct);
}

/// Set packed 8-bit integers with the supplied values.
__m64 _mm_set_pi8 (byte e7, byte e6, byte e5, byte e4, byte e3, byte e2, byte e1, byte e0) pure @trusted
{
    byte[8] arr = [e0, e1, e2, e3, e4, e5, e6, e7];
    return *cast(__m64*)(arr.ptr);
}
unittest
{
    byte8 R = cast(byte8) _mm_set_pi8(7, 6, 5, 4, 3, 2, 1, 0);
    byte[8] correct = [0, 1, 2, 3, 4, 5, 6, 7];
    assert(R.array == correct);
}

/// Broadcast 16-bit integer `a` to all elements.
__m64 _mm_set1_pi16 (short a) pure @trusted
{
    return cast(__m64)(short4(a));
}
unittest
{
    short4 R = cast(short4) _mm_set1_pi16(44);
    short[4] correct = [44, 44, 44, 44];
    assert(R.array == correct);
}

/// Broadcast 32-bit integer `a` to all elements.
__m64 _mm_set1_pi32 (int a) pure @trusted
{
    return cast(__m64)(int2(a));
}
unittest
{
    int2 R = cast(int2) _mm_set1_pi32(43);
    int[2] correct = [43, 43];
    assert(R.array == correct);
}

/// Broadcast 8-bit integer `a` to all elements.
__m64 _mm_set1_pi8 (byte a) pure @trusted
{
    return cast(__m64)(byte8(a));
}
unittest
{
    byte8 R = cast(byte8) _mm_set1_pi8(42);
    byte[8] correct = [42, 42, 42, 42, 42, 42, 42, 42];
    assert(R.array == correct);
}

/// Set packed 16-bit integers with the supplied values in reverse order.
__m64 _mm_setr_pi16 (short e3, short e2, short e1, short e0) pure @trusted
{
    short[4] arr = [e3, e2, e1, e0];
    return *cast(__m64*)(arr.ptr);
}
unittest
{
    short4 R = cast(short4) _mm_setr_pi16(0, 1, 2, 3);
    short[4] correct = [0, 1, 2, 3];
    assert(R.array == correct);
}

/// Set packed 32-bit integers with the supplied values in reverse order.
__m64 _mm_setr_pi32 (int e1, int e0) pure @trusted
{
    int[2] arr = [e1, e0];
    return *cast(__m64*)(arr.ptr);
}
unittest
{
    int2 R = cast(int2) _mm_setr_pi32(0, 1);
    int[2] correct = [0, 1];
    assert(R.array == correct);
}

/// Set packed 8-bit integers with the supplied values in reverse order.
__m64 _mm_setr_pi8 (byte e7, byte e6, byte e5, byte e4, byte e3, byte e2, byte e1, byte e0) pure @trusted
{
    byte[8] arr = [e7, e6, e5, e4, e3, e2, e1, e0];
    return *cast(__m64*)(arr.ptr);
}
unittest
{
    byte8 R = cast(byte8) _mm_setr_pi8(0, 1, 2, 3, 4, 5, 6, 7);
    byte[8] correct = [0, 1, 2, 3, 4, 5, 6, 7];
    assert(R.array == correct);
}

/// Return vector of type `__m64` with all elements set to zero.
__m64 _mm_setzero_si64 () pure @trusted
{
    __m64 r;
    r.ptr[0] = 0;
    return r;
}
unittest
{
    __m64 R = _mm_setzero_si64();
    assert(R.array[0] == 0);
}

/// Shift packed 16-bit integers in `a` left by `bits` while shifting in zeros.
__m64 _mm_sll_pi16 (__m64 a, __m64 bits) pure @safe
{
    return to_m64(_mm_sll_epi16(to_m128i(a), to_m128i(bits)));
}
unittest
{
    __m64 A = _mm_setr_pi16(-4, -5, 6, 7);
    short4 B = cast(short4)( _mm_sll_pi16(A, _mm_cvtsi64_m64(1)) );
    short[4] correct = [ -8, -10, 12, 14 ];
    assert(B.array == correct);
}

/// Shift packed 32-bit integers in `a` left by `bits` while shifting in zeros.
__m64 _mm_sll_pi32 (__m64 a, __m64 bits) pure @safe
{
    return to_m64(_mm_sll_epi32(to_m128i(a), to_m128i(bits)));
}
unittest
{
    __m64 A = _mm_setr_pi32(-4, 5);
    int2 B = cast(int2)( _mm_sll_pi32(A, _mm_cvtsi64_m64(1)) );
    int[2] correct = [ -8, 10 ];
    assert(B.array == correct);
}

/// Shift 64-bit integer `a` left by `bits` while shifting in zeros.
__m64 _mm_sll_si64 (__m64 a, __m64 bits) pure @safe
{
    return to_m64(_mm_sll_epi64(to_m128i(a), to_m128i(bits)));
}
unittest
{
    __m64 A = _mm_cvtsi64_m64(-1);
    long1 R = cast(long1)( _mm_sll_si64(A, _mm_cvtsi64_m64(1)) );
    long[1] correct = [ -2 ];
    assert(R.array == correct);
}

/// Shift packed 16-bit integers in `a` left by `bits` while shifting in zeros.
__m64 _mm_slli_pi16 (__m64 a, int bits) pure @safe
{
    return to_m64(_mm_slli_epi16(to_m128i(a), bits));
}
unittest
{
    __m64 A = _mm_setr_pi16(-4, -5, 6, 7);
    short4 B = cast(short4)( _mm_slli_pi16(A, 1) );
    short[4] correct = [ -8, -10, 12, 14 ];
    assert(B.array == correct);
}

/// Shift packed 32-bit integers in `a` left by `bits` while shifting in zeros.
__m64 _mm_slli_pi32 (__m64 a, int bits) pure @safe
{
    return to_m64(_mm_slli_epi32(to_m128i(a), bits));
}
unittest
{
    __m64 A = _mm_setr_pi32(-4, 5);
    int2 B = cast(int2)( _mm_slli_pi32(A, 1) );
    int[2] correct = [ -8, 10 ];
    assert(B.array == correct);
}

/// Shift 64-bit integer `a` left by `bits` while shifting in zeros.
__m64 _mm_slli_si64 (__m64 a, int bits) pure @safe
{
    return to_m64(_mm_slli_epi64(to_m128i(a), bits));
}
unittest
{
    __m64 A = _mm_cvtsi64_m64(-1);
    long1 R = cast(long1)( _mm_slli_si64(A, 1) );
    long[1] correct = [ -2 ];
    assert(R.array == correct);
}

/// Shift packed 16-bit integers in `a` right by `bits` while shifting in sign bits.
__m64 _mm_sra_pi16 (__m64 a, __m64 bits) pure @safe
{
    return to_m64(_mm_sra_epi16(to_m128i(a), to_m128i(bits)));
}
unittest
{
    __m64 A = _mm_setr_pi16(-4, -5, 6, 7);
    short4 B = cast(short4)( _mm_sra_pi16(A, _mm_cvtsi64_m64(1)) );
    short[4] correct = [ -2, -3, 3, 3 ];
    assert(B.array == correct);
}

/// Shift packed 32-bit integers in `a` right by `bits` while shifting in sign bits.
__m64 _mm_sra_pi32 (__m64 a, __m64 bits) pure @safe
{
    return to_m64(_mm_sra_epi32(to_m128i(a), to_m128i(bits)));
}
unittest
{
    __m64 A = _mm_setr_pi32(-4, 5);
    int2 B = cast(int2)( _mm_sra_pi32(A, _mm_cvtsi64_m64(1)) );
    int[2] correct = [ -2, 2 ];
    assert(B.array == correct);
}

/// Shift packed 16-bit integers in `a` right by `bits` while shifting in sign bits.
__m64 _mm_srai_pi16 (__m64 a, int bits) pure @safe
{
    return to_m64(_mm_srai_epi16(to_m128i(a), bits));
}
unittest
{
    __m64 A = _mm_setr_pi16(-4, -5, 6, 7);
    short4 B = cast(short4)( _mm_srai_pi16(A, 1) );
    short[4] correct = [ -2, -3, 3, 3 ];
    assert(B.array == correct);
}

/// Shift packed 32-bit integers in `a` right by `bits` while shifting in sign bits.
__m64 _mm_srai_pi32 (__m64 a, int bits) pure @safe
{
    return to_m64(_mm_srai_epi32(to_m128i(a), bits));
}
unittest
{
    __m64 A = _mm_setr_pi32(-4, 5);
    int2 B = cast(int2)( _mm_srai_pi32(A, 1) );
    int[2] correct = [ -2, 2 ];
    assert(B.array == correct);
}

/// Shift packed 16-bit integers in `a` right by `bits` while shifting in zeros.
__m64 _mm_srl_pi16 (__m64 a, __m64 bits) pure @safe
{
    return to_m64(_mm_srl_epi16(to_m128i(a), to_m128i(bits)));
}
unittest
{
    __m64 A = _mm_setr_pi16(-4, -5, 6, 7);
    short4 B = cast(short4)( _mm_srl_pi16(A, _mm_cvtsi64_m64(1)) );
    short[4] correct = [ 0x7ffe, 0x7ffd, 3, 3 ];
    assert(B.array == correct);
}

/// Shift packed 32-bit integers in `a` right by `bits` while shifting in zeros.
__m64 _mm_srl_pi32 (__m64 a, __m64 bits) pure @safe
{
    return to_m64(_mm_srl_epi32(to_m128i(a), to_m128i(bits)));
}
unittest
{
    __m64 A = _mm_setr_pi32(-4, 5);
    int2 B = cast(int2)( _mm_srl_pi32(A, _mm_cvtsi64_m64(1)) );
    int[2] correct = [ 0x7ffffffe, 2 ];
    assert(B.array == correct);
}

/// Shift 64-bit integer `a` right by `bits` while shifting in zeros.
__m64 _mm_srl_si64 (__m64 a, __m64 bits) pure @safe
{
    return to_m64(_mm_srl_epi64(to_m128i(a), to_m128i(bits)));
}
unittest
{
    __m64 A = _mm_cvtsi64_m64(-1);
    long1 R = cast(long1)( _mm_srl_si64(A, _mm_cvtsi64_m64(1)) );
    long[1] correct = [ 0x7fff_ffff_ffff_ffff ];
    assert(R.array == correct);
}

/// Shift packed 16-bit integers in `a` right by `bits` while shifting in zeros.
__m64 _mm_srli_pi16 (__m64 a, int bits) pure @safe
{
    return to_m64(_mm_srli_epi16(to_m128i(a), bits));
}
unittest
{
    __m64 A = _mm_setr_pi16(-4, -5, 6, 7);
    short4 B = cast(short4)( _mm_srli_pi16(A, 1) );
    short[4] correct = [ 0x7ffe, 0x7ffd, 3, 3 ];
    assert(B.array == correct);
}

/// Shift packed 32-bit integers in `a` right by `bits` while shifting in zeros.
__m64 _mm_srli_pi32 (__m64 a, int bits) pure @safe
{
    return to_m64(_mm_srli_epi32(to_m128i(a), bits));
}
unittest
{
    __m64 A = _mm_setr_pi32(-4, 5);
    int2 B = cast(int2)( _mm_srli_pi32(A, 1) );
    int[2] correct = [ 0x7ffffffe, 2 ];
    assert(B.array == correct);
}

/// Shift 64-bit integer `a` right by `bits` while shifting in zeros.
__m64 _mm_srli_si64 (__m64 a, int bits) pure @safe
{
    return to_m64(_mm_srli_epi64(to_m128i(a), bits));
}
unittest
{
    __m64 A = _mm_cvtsi64_m64(-1);
    long1 R = cast(long1)( _mm_srli_si64(A, 1) );
    long[1] correct = [ 0x7fff_ffff_ffff_ffff ];
    assert(R.array == correct);
}

/// Subtract packed 16-bit integers in `b` from packed 16-bit integers in `a`.
__m64 _mm_sub_pi16 (__m64 a, __m64 b) pure @safe
{
    return cast(__m64)(cast(short4)a - cast(short4)b);
}
unittest
{
    short4 R = cast(short4) _mm_sub_pi16(_mm_setr_pi16(cast(short)65534,  1, 5, -32768),
                                         _mm_setr_pi16(cast(short)65535, 16, 4, 4));
    static immutable short[4] correct = [ -1,-15, 1, 32764];
    assert(R.array == correct);
}

/// Subtract packed 32-bit integers in `b` from packed 32-bit integers in `a`.
__m64 _mm_sub_pi32 (__m64 a, __m64 b) pure @safe
{
    return cast(__m64)(cast(int2)a - cast(int2)b);
}
unittest
{
    int2 R = cast(int2) _mm_sub_pi32(_mm_setr_pi32( 10, 4),
                                     _mm_setr_pi32( 15, -70));
    static immutable int[2] correct = [ -5, 74];
    assert(R.array == correct);
}

/// Subtract packed 8-bit integers in `b` from packed 8-bit integers in `a`.
__m64 _mm_sub_pi8 (__m64 a, __m64 b) pure @safe
{
    return cast(__m64)(cast(byte8)a - cast(byte8)b);
}
unittest
{
    byte8 R = cast(byte8) _mm_sub_pi8(_mm_setr_pi8(cast(byte)254, 127, 13, 12, 11, 10, 9, -128),
                                      _mm_setr_pi8(cast(byte)255, 120, 14, 42, 11, 10, 9, 8));
    static immutable byte[8] correct = [ -1, 7, -1,-30, 0, 0, 0, 120 ];
    assert(R.array == correct);
}

/// Subtract packed 16-bit integers in `b` from packed 16-bit integers in `a` using saturation.
__m64 _mm_subs_pi16 (__m64 a, __m64 b) pure @safe
{
    return to_m64(_mm_subs_epi16(to_m128i(a), to_m128i(b)));
}
unittest
{
    short4 R = cast(short4) _mm_subs_pi16(_mm_setr_pi16(cast(short)65534,  1, 5, -32768),
                                          _mm_setr_pi16(cast(short)65535, 16, 4, 4));
    static immutable short[4] correct = [ -1,-15, 1, -32768];
    assert(R.array == correct);
}

/// Subtract packed 8-bit integers in `b` from packed 8-bit integers in `a` using saturation.
__m64 _mm_subs_pi8 (__m64 a, __m64 b) pure @safe
{
    return to_m64(_mm_subs_epi8(to_m128i(a), to_m128i(b)));
}
unittest
{
    byte8 R = cast(byte8) _mm_subs_pi8(_mm_setr_pi8(cast(byte)254, 127, 13, 12, 11, 10, 9, -128),
                                       _mm_setr_pi8(cast(byte)255, 120, 14, 42, 11, 10, 9, 8));
    static immutable byte[8] correct = [ -1, 7, -1,-30, 0, 0, 0, -128 ];
    assert(R.array == correct);
}

/// Subtract packed unsigned 16-bit integers in `b` from packed unsigned 16-bit integers in `a`
/// using saturation.
__m64 _mm_subs_pu16 (__m64 a, __m64 b) pure @safe
{
    return to_m64(_mm_subs_epu16(to_m128i(a), to_m128i(b)));
}
unittest
{
    short4 R = cast(short4) _mm_subs_pu16(_mm_setr_pi16(cast(short)65534,  1, 5, 4),
                                          _mm_setr_pi16(cast(short)65535, 16, 4, 4));
    static immutable short[4] correct = [ 0, 0, 1, 0];
    assert(R.array == correct);
}

/// Subtract packed unsigned 8-bit integers in `b` from packed unsigned 8-bit integers in `a`
/// using saturation.
__m64 _mm_subs_pu8 (__m64 a, __m64 b) pure @safe
{
    return to_m64(_mm_subs_epu8(to_m128i(a), to_m128i(b)));
}
unittest
{
    byte8 R = cast(byte8) _mm_subs_pu8(_mm_setr_pi8(cast(byte)254, 127, 13, 12, 11, 10, 9, 8),
                                       _mm_setr_pi8(cast(byte)255, 120, 14, 42, 11, 10, 9, 8));
    static immutable byte[8] correct = [ 0, 7, 0, 0, 0, 0, 0, 0 ];
    assert(R.array == correct);
}

deprecated alias _m_to_int = _mm_cvtsi64_si32;
deprecated alias _m_to_int64 = _mm_cvtm64_si64;

/// Unpack and interleave 16-bit integers from the high half of `a` and `b`.
__m64 _mm_unpackhi_pi16 (__m64 a, __m64 b) pure @trusted
{
    version(LDC)
    {
        // Avoiding this shufflevector leads to bad performance on LDC.
        return cast(__m64) shufflevector!(short4, 2, 6, 3, 7)(cast(short4)a, cast(short4)b);
    }
    else
    {
        short4 ia = cast(short4)a;
        short4 ib = cast(short4)b;
        short4 r;
        r.ptr[0] = ia.array[2];
        r.ptr[1] = ib.array[2];
        r.ptr[2] = ia.array[3];
        r.ptr[3] = ib.array[3];
        return cast(__m64)r;
    }
}
unittest
{
    __m64 A = _mm_setr_pi16(4, 8, -16, 7);
    __m64 B = _mm_setr_pi16(5, 9, -3, 10);
    short4 R = cast(short4) _mm_unpackhi_pi16(A, B);
    short[4] correct = [-16, -3, 7, 10];
    assert(R.array == correct);
}

/// Unpack and interleave 32-bit integers from the high half of `a` and `b`.
__m64 _mm_unpackhi_pi32 (__m64 a, __m64 b) pure @trusted
{
    // Generates punpckldq as far back as LDC 1.0.0 -O1
    // (Yes, LLVM does generate punpckldq to reuse SSE2 instructions)
    int2 ia = cast(int2)a;
    int2 ib = cast(int2)b;
    int2 r;
    r.ptr[0] = ia.array[1];
    r.ptr[1] = ib.array[1];
    return cast(__m64)r;
}
unittest
{
    __m64 A = _mm_setr_pi32(4, 8);
    __m64 B = _mm_setr_pi32(5, 9);
    int2 R = cast(int2) _mm_unpackhi_pi32(A, B);
    int[2] correct = [8, 9];
    assert(R.array == correct);
}

/// Unpack and interleave 8-bit integers from the high half of `a` and `b`.
__m64 _mm_unpackhi_pi8 (__m64 a, __m64 b)
{
    version(LDC)
    {
        return cast(__m64) shufflevector!(byte8, 4, 12, 5, 13, 6, 14, 7, 15)(cast(byte8)a, cast(byte8)b);
    }
    else
    {
        byte8 ia = cast(byte8)a;
        byte8 ib = cast(byte8)b;
        byte8 r;
        r.ptr[0] = ia.array[4];
        r.ptr[1] = ib.array[4];
        r.ptr[2] = ia.array[5];
        r.ptr[3] = ib.array[5];
        r.ptr[4] = ia.array[6];
        r.ptr[5] = ib.array[6];
        r.ptr[6] = ia.array[7];
        r.ptr[7] = ib.array[7];
        return cast(__m64)r;
    }
}
unittest
{
    __m64 A = _mm_setr_pi8( 1,  2,  3,  4,  5,  6,  7,  8);
    __m64 B = _mm_setr_pi8(-1, -2, -3, -4, -5, -6, -7, -8);
    byte8 R = cast(byte8) _mm_unpackhi_pi8(A, B);
    byte[8] correct = [5, -5, 6, -6, 7, -7, 8, -8];
    assert(R.array == correct);
}

/// Unpack and interleave 16-bit integers from the low half of `a` and `b`.
__m64 _mm_unpacklo_pi16 (__m64 a, __m64 b)
{
    // Generates punpcklwd since LDC 1.0.0 -O1
    short4 ia = cast(short4)a;
    short4 ib = cast(short4)b;
    short4 r;
    r.ptr[0] = ia.array[0];
    r.ptr[1] = ib.array[0];
    r.ptr[2] = ia.array[1];
    r.ptr[3] = ib.array[1];
    return cast(__m64)r;
}
unittest
{
    __m64 A = _mm_setr_pi16(4, 8, -16, 7);
    __m64 B = _mm_setr_pi16(5, 9, -3, 10);
    short4 R = cast(short4) _mm_unpacklo_pi16(A, B);
    short[4] correct = [4, 5, 8, 9];
    assert(R.array == correct);
}

/// Unpack and interleave 32-bit integers from the low half of `a` and `b`.
__m64 _mm_unpacklo_pi32 (__m64 a, __m64 b) pure @trusted
{
    // Generates punpckldq as far back as LDC 1.0.0 -O1
    int2 ia = cast(int2)a;
    int2 ib = cast(int2)b;
    int2 r;
    r.ptr[0] = ia.array[0];
    r.ptr[1] = ib.array[0];
    return cast(__m64)r;
}
unittest
{
    __m64 A = _mm_setr_pi32(4, 8);
    __m64 B = _mm_setr_pi32(5, 9);
    int2 R = cast(int2) _mm_unpacklo_pi32(A, B);
    int[2] correct = [4, 5];
    assert(R.array == correct);
}

/// Unpack and interleave 8-bit integers from the low half of `a` and `b`.
__m64 _mm_unpacklo_pi8 (__m64 a, __m64 b)
{
    version(LDC)
    {
        return cast(__m64) shufflevector!(byte8, 0, 8, 1, 9, 2, 10, 3, 11)(cast(byte8)a, cast(byte8)b);
    }
    else
    {
        byte8 ia = cast(byte8)a;
        byte8 ib = cast(byte8)b;
        byte8 r;
        r.ptr[0] = ia.array[0];
        r.ptr[1] = ib.array[0];
        r.ptr[2] = ia.array[1];
        r.ptr[3] = ib.array[1];
        r.ptr[4] = ia.array[2];
        r.ptr[5] = ib.array[2];
        r.ptr[6] = ia.array[3];
        r.ptr[7] = ib.array[3];
        return cast(__m64)r;
    }
}
unittest
{
    __m64 A = _mm_setr_pi8( 1,  2,  3,  4,  5,  6,  7,  8);
    __m64 B = _mm_setr_pi8(-1, -2, -3, -4, -5, -6, -7, -8);
    byte8 R = cast(byte8) _mm_unpacklo_pi8(A, B);
    byte[8] correct = [1, -1, 2, -2, 3, -3, 4, -4];
    assert(R.array == correct);
}

/// Compute the bitwise XOR of 64 bits (representing integer data) in `a` and `b`.
__m64 _mm_xor_si64 (__m64 a, __m64 b)
{
    return a ^ b;
}
unittest
{
    __m64 A = _mm_setr_pi16(255, 1, -1, 0);
    __m64 B = _mm_set1_pi16(15);
    short4 R = cast(short4)_mm_xor_si64(A, B);
    short[4] correct = [240, 14, -16, 15];
    assert(R.array == correct);
}
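
// Illustrative example (a minimal sketch, not tied to any particular intrinsic above):
// chains a few of this module's intrinsics together. As explained at the top of the
// module, no _mm_empty() call is needed afterwards.
unittest
{
    // Element-wise average of four 16-bit samples: (a + b) >> 1
    __m64 a = _mm_setr_pi16(100, 200, 300, 400);
    __m64 b = _mm_setr_pi16( 50, 100, 150, 200);
    __m64 sum = _mm_add_pi16(a, b);        // [150, 300, 450, 600]
    __m64 avg = _mm_srai_pi16(sum, 1);     // arithmetic shift right by 1
    short[4] expected = [75, 150, 225, 300];
    assert((cast(short4)avg).array == expected);
}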