/**
* Copyright: Copyright Auburn Sounds 2019.
* License:   $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
* Authors:   Guillaume Piolat
* Macros:
*      GUIDE = https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=$0
*
*/
module inteli.mmx;

public import inteli.types;
import inteli.internals;

import inteli.xmmintrin;
import inteli.emmintrin;

nothrow @nogc:

// Important: you don't need to call _mm_empty when using "MMX" capabilities of intel-intrinsics,
// since it just generates the right IR and cleaning-up FPU registers is up to the codegen.
// intel-intrinsics is just semantics.


/// Add packed 16-bit integers in `a` and `b`.
__m64 _mm_add_pi16 (__m64 a, __m64 b)
{
    return cast(__m64)(cast(short4)a + cast(short4)b);
}
unittest
{
    short4 R = cast(short4) _mm_add_pi16(_mm_set1_pi16(4), _mm_set1_pi16(3));
    short[4] correct = [7, 7, 7, 7];
    assert(R.array == correct);
}

/// Add packed 32-bit integers in `a` and `b`.
__m64 _mm_add_pi32 (__m64 a, __m64 b)
{
    return cast(__m64)(cast(int2)a + cast(int2)b);
}
unittest
{
    int2 R = cast(int2) _mm_add_pi32(_mm_set1_pi32(4), _mm_set1_pi32(3));
    int[2] correct = [7, 7];
    assert(R.array == correct);
}

/// Add packed 8-bit integers in `a` and `b`.
__m64 _mm_add_pi8 (__m64 a, __m64 b)
{
    return cast(__m64)(cast(byte8)a + cast(byte8)b);
}
unittest
{
    byte8 R = cast(byte8) _mm_add_pi8(_mm_set1_pi8(127), _mm_set1_pi8(-128));
    byte[8] correct = [-1, -1, -1, -1, -1, -1, -1, -1];
    assert(R.array == correct);
}

/// Add packed 16-bit integers in `a` and `b` using signed saturation.
// PERF: PADDSW not generated
__m64 _mm_adds_pi16(__m64 a, __m64 b) pure @trusted
{
    return to_m64(_mm_adds_epi16(to_m128i(a), to_m128i(b)));
}
unittest
{
    short4 res = cast(short4) _mm_adds_pi16(_mm_set_pi16(3, 2, 1, 0),
                                            _mm_set_pi16(3, 2, 1, 0));
    static immutable short[4] correctResult = [0, 2, 4, 6];
    assert(res.array == correctResult);
}

/// Add packed 8-bit integers in `a` and `b` using signed saturation.
// PERF: PADDSB not generated
__m64 _mm_adds_pi8(__m64 a, __m64 b) pure @trusted
{
    return to_m64(_mm_adds_epi8(to_m128i(a), to_m128i(b)));
}
unittest
{
    byte8 res = cast(byte8) _mm_adds_pi8(_mm_set_pi8(7, 6, 5, 4, 3, 2, 1, 0),
                                         _mm_set_pi8(7, 6, 5, 4, 3, 2, 1, 0));
    static immutable byte[8] correctResult = [0, 2, 4, 6, 8, 10, 12, 14];
    assert(res.array == correctResult);
}

/// Add packed 16-bit integers in `a` and `b` using unsigned saturation.
// PERF: PADDUSW not generated
__m64 _mm_adds_pu16(__m64 a, __m64 b) pure @trusted
{
    return to_m64(_mm_adds_epu16(to_m128i(a), to_m128i(b)));
}
unittest
{
    short4 res = cast(short4) _mm_adds_pu16(_mm_set_pi16(3, 2, cast(short)65535, 0),
                                            _mm_set_pi16(3, 2, 1, 0));
    static immutable short[4] correctResult = [0, cast(short)65535, 4, 6];
    assert(res.array == correctResult);
}
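
// Added usage sketch (not part of the upstream API surface): unsigned saturating adds clamp
// at 65535 instead of wrapping, which is the behaviour you want when accumulating things
// like 16-bit histogram bins or pixel intensities.
unittest
{
    __m64 bins = _mm_setr_pi16(cast(short)65000, 1000, 0, cast(short)65535);
    __m64 incr = _mm_setr_pi16(1000, 1000, 7, 1);
    short4 r = cast(short4) _mm_adds_pu16(bins, incr);
    static immutable short[4] expected = [cast(short)65535, 2000, 7, cast(short)65535]; // first and last lanes clamp
    assert(r.array == expected);
}
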
/// Add packed 8-bit integers in `a` and `b` using unsigned saturation.
// PERF: PADDUSB not generated
__m64 _mm_adds_pu8(__m64 a, __m64 b) pure @trusted
{
    return to_m64(_mm_adds_epu8(to_m128i(a), to_m128i(b)));
}
unittest
{
    byte8 res = cast(byte8) _mm_adds_pu8(_mm_set_pi8(7, 6, 5, 4, 3, 2, cast(byte)255, 0),
                                         _mm_set_pi8(7, 6, 5, 4, 3, 2, 1, 0));
    static immutable byte[8] correctResult = [0, cast(byte)255, 4, 6, 8, 10, 12, 14];
    assert(res.array == correctResult);
}

/// Compute the bitwise AND of 64 bits (representing integer data) in `a` and `b`.
__m64 _mm_and_si64 (__m64 a, __m64 b) pure @safe
{
    return a & b;
}
unittest
{
    __m64 A = [7];
    __m64 B = [14];
    __m64 R = _mm_and_si64(A, B);
    assert(R.array[0] == 6);
}

/// Compute the bitwise NOT of 64 bits (representing integer data) in `a` and then AND with `b`.
__m64 _mm_andnot_si64 (__m64 a, __m64 b)
{
    return (~a) & b;
}
unittest
{
    __m64 A = [7];
    __m64 B = [14];
    __m64 R = _mm_andnot_si64(A, B);
    assert(R.array[0] == 8);
}

/// Compare packed 16-bit integers in `a` and `b` for equality.
__m64 _mm_cmpeq_pi16 (__m64 a, __m64 b) pure @safe
{
    static if (GDC_with_MMX)
    {
        return cast(__m64) __builtin_ia32_pcmpeqw(cast(short4)a, cast(short4)b);
    }
    else
    {
        return cast(__m64) equalMask!short4(cast(short4)a, cast(short4)b);
    }
}
unittest
{
    short4 A = [-3, -2, -1, 0];
    short4 B = [ 4,  3,  2, 1];
    short[4] E = [ 0,  0,  0, 0];
    short4 R = cast(short4)(_mm_cmpeq_pi16(cast(__m64)A, cast(__m64)B));
    assert(R.array == E);
}

/// Compare packed 32-bit integers in `a` and `b` for equality.
__m64 _mm_cmpeq_pi32 (__m64 a, __m64 b) pure @safe
{
    static if (GDC_with_MMX)
    {
        return cast(__m64) __builtin_ia32_pcmpeqd(cast(int2)a, cast(int2)b);
    }
    else
    {
        return cast(__m64) equalMask!int2(cast(int2)a, cast(int2)b);
    }
}
unittest
{
    int2 A = [-3, -2];
    int2 B = [ 4, -2];
    int[2] E = [ 0, -1];
    int2 R = cast(int2)(_mm_cmpeq_pi32(cast(__m64)A, cast(__m64)B));
    assert(R.array == E);
}

/// Compare packed 8-bit integers in `a` and `b` for equality.
__m64 _mm_cmpeq_pi8 (__m64 a, __m64 b) pure @safe
{
    static if (GDC_with_MMX)
    {
        return cast(__m64) __builtin_ia32_pcmpeqb(cast(byte8)a, cast(byte8)b);
    }
    else
    {
        return cast(__m64) equalMask!byte8(cast(byte8)a, cast(byte8)b);
    }
}
unittest
{
    __m64 A = _mm_setr_pi8(1, 2, 3, 1, 2, 1, 1, 2);
    __m64 B = _mm_setr_pi8(2, 2, 1, 2, 3, 1, 2, 3);
    byte8 C = cast(byte8) _mm_cmpeq_pi8(A, B);
    byte[8] correct = [0,-1, 0, 0, 0,-1, 0, 0];
    assert(C.array == correct);
}

/// Compare packed 16-bit integers in `a` and `b` for greater-than.
__m64 _mm_cmpgt_pi16 (__m64 a, __m64 b) pure @safe
{
    static if (GDC_with_MMX)
    {
        return cast(__m64) __builtin_ia32_pcmpgtw (cast(short4)a, cast(short4)b);
    }
    else
    {
        return cast(__m64) greaterMask!short4(cast(short4)a, cast(short4)b);
    }
}
unittest
{
    short4 A = [-3, -2, -1, 0];
    short4 B = [ 4,  3,  2, 1];
    short[4] E = [ 0,  0,  0, 0];
    short4 R = cast(short4)(_mm_cmpgt_pi16(cast(__m64)A, cast(__m64)B));
    assert(R.array == E);
}
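
// Added usage sketch (illustrative, not an intrinsic): the all-ones/all-zeros lane masks
// produced by the comparisons combine with AND/ANDNOT/OR into a branchless per-lane maximum.
unittest
{
    __m64 a = _mm_setr_pi16(-3, 40, 2,  1);
    __m64 b = _mm_setr_pi16( 4,  3, 2, -7);
    __m64 mask = _mm_cmpgt_pi16(a, b);                  // 0xFFFF where a > b, else 0
    __m64 maxv = _mm_or_si64(_mm_and_si64(mask, a),     // keep lanes of a where a > b
                             _mm_andnot_si64(mask, b)); // keep lanes of b elsewhere
    short4 r = cast(short4) maxv;
    static immutable short[4] expected = [4, 40, 2, 1];
    assert(r.array == expected);
}
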
/// Compare packed 32-bit integers in `a` and `b` for greater-than.
__m64 _mm_cmpgt_pi32 (__m64 a, __m64 b) pure @safe
{
    static if (GDC_with_MMX)
    {
        return cast(__m64) __builtin_ia32_pcmpgtd (cast(int2)a, cast(int2)b);
    }
    else
    {
        return cast(__m64) greaterMask!int2(cast(int2)a, cast(int2)b);
    }
}
unittest
{
    int2 A = [-3,  2];
    int2 B = [ 4, -2];
    int[2] E = [ 0, -1];
    int2 R = cast(int2)(_mm_cmpgt_pi32(cast(__m64)A, cast(__m64)B));
    assert(R.array == E);
}

/// Compare packed 8-bit integers in `a` and `b` for greater-than.
__m64 _mm_cmpgt_pi8 (__m64 a, __m64 b) pure @safe
{
    static if (GDC_with_MMX)
    {
        return cast(__m64) __builtin_ia32_pcmpgtb (cast(byte8)a, cast(byte8)b);
    }
    else
    {
        return cast(__m64) greaterMask!byte8(cast(byte8)a, cast(byte8)b);
    }
}
unittest
{
    __m64 A = _mm_setr_pi8(1, 2, 3, 1, 2, 1, 1, 2);
    __m64 B = _mm_setr_pi8(2, 2, 1, 2, 3, 1, 2, 3);
    byte8 C = cast(byte8) _mm_cmpgt_pi8(A, B);
    byte[8] correct = [0, 0,-1, 0, 0, 0, 0, 0];
    assert(C.array == correct);
}

/// Copy 64-bit integer `a` to `dst`.
long _mm_cvtm64_si64 (__m64 a) pure @safe
{
    long1 la = cast(long1)a;
    return la.array[0];
}
unittest
{
    __m64 A = _mm_setr_pi32(2, 1);
    long1 lA = cast(long1)A;
    assert(lA.array[0] == 0x100000002);
}

/// Copy 32-bit integer `a` to the lower elements of `dst`, and zero the upper element of `dst`.
__m64 _mm_cvtsi32_si64 (int a) pure @trusted
{
    __m64 r = void;
    r.ptr[0] = a;
    return r;
}
unittest
{
    __m64 R = _mm_cvtsi32_si64(-1);
    assert(R.array[0] == -1);
}

/// Copy 64-bit integer `a` to `dst`.
__m64 _mm_cvtsi64_m64 (long a) pure @trusted
{
    __m64 r = void;
    r.ptr[0] = a;
    return r;
}
unittest
{
    __m64 R = _mm_cvtsi64_m64(0x123456789A);
    assert(R.array[0] == 0x123456789A);
}

/// Get the lower 32-bit integer in `a`.
int _mm_cvtsi64_si32 (__m64 a) pure @safe
{
    int2 r = cast(int2)a;
    return r.array[0];
}
unittest
{
    __m64 A = _mm_setr_pi32(-6, 5);
    int R = _mm_cvtsi64_si32(A);
    assert(R == -6);
}

/// Empty the MMX state, which marks the x87 FPU registers as available for
/// use by x87 instructions.
/// This instruction is supposed to be used at the end of all MMX technology procedures.
/// This is useless when using `intel-intrinsics`, at least with LDC and DMD.
void _mm_empty() pure @safe
{
    // do nothing, see comment on top of file
    // TODO: not sure for GDC, do something?
}

///ditto
alias _m_empty = _mm_empty;

alias _m_from_int = _mm_cvtsi32_si64;
alias _m_from_int64 = _mm_cvtsi64_m64;

/// Multiply packed 16-bit integers in `a` and `b`, producing intermediate 32-bit integers.
/// Horizontally add adjacent pairs of intermediate 32-bit integers.
__m64 _mm_madd_pi16 (__m64 a, __m64 b) pure @safe
{
    return to_m64(_mm_madd_epi16(to_m128i(a), to_m128i(b)));
}
unittest
{
    short4 A = [-32768, -32768, 32767, 32767];
    short4 B = [-32768, -32768, 32767, 32767];
    int2 R = cast(int2) _mm_madd_pi16(cast(__m64)A, cast(__m64)B);
    int[2] correct = [-2147483648, 2*32767*32767];
    assert(R.array == correct);
}
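
// Added usage sketch: _mm_madd_pi16 is the usual building block for small fixed-point dot
// products; one call reduces four 16-bit lanes to two 32-bit partial sums, which can then
// be summed scalar-wise.
unittest
{
    __m64 coeffs  = _mm_setr_pi16( 1,  2,  3,  4);
    __m64 samples = _mm_setr_pi16(10, 20, 30, 40);
    int2 partial = cast(int2) _mm_madd_pi16(coeffs, samples);
    int dot = partial.array[0] + partial.array[1]; // 1*10 + 2*20 + 3*30 + 4*40
    assert(dot == 300);
}
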
/// Multiply the packed 16-bit integers in `a` and `b`, producing intermediate 32-bit integers,
/// and store the high 16 bits of the intermediate integers.
__m64 _mm_mulhi_pi16 (__m64 a, __m64 b) pure @safe
{
    return to_m64(_mm_mulhi_epi16(to_m128i(a), to_m128i(b)));
}
unittest
{
    __m64 A = _mm_setr_pi16(4, 8, -16, 7);
    __m64 B = _mm_set1_pi16(16384);
    short4 R = cast(short4)_mm_mulhi_pi16(A, B);
    short[4] correct = [1, 2, -4, 1];
    assert(R.array == correct);
}

/// Multiply the packed 16-bit integers in `a` and `b`, producing intermediate 32-bit integers,
/// and store the low 16 bits of the intermediate integers.
__m64 _mm_mullo_pi16 (__m64 a, __m64 b) pure @safe
{
    return to_m64(_mm_mullo_epi16(to_m128i(a), to_m128i(b)));
}
unittest
{
    __m64 A = _mm_setr_pi16(4, 1, 16, 7);
    __m64 B = _mm_set1_pi16(16384);
    short4 R = cast(short4)_mm_mullo_pi16(A, B);
    short[4] correct = [0, 16384, 0, -16384];
    assert(R.array == correct);
}

/// Compute the bitwise OR of 64 bits in `a` and `b`.
__m64 _mm_or_si64 (__m64 a, __m64 b) pure @safe
{
    return a | b;
}
unittest
{
    __m64 A = _mm_setr_pi16(255, 1, -1, 0);
    __m64 B = _mm_set1_pi16(15);
    short4 R = cast(short4)_mm_or_si64(A, B);
    short[4] correct = [255, 15, -1, 15];
    assert(R.array == correct);
}

/// Convert packed 16-bit integers from `a` and `b` to packed 8-bit integers using signed saturation.
__m64 _mm_packs_pi16 (__m64 a, __m64 b) pure @trusted
{
    int4 p = cast(int4) _mm_packs_epi16(to_m128i(a), to_m128i(b));
    int2 r;
    r.ptr[0] = p.array[0];
    r.ptr[1] = p.array[2];
    return cast(__m64)r;
}
unittest
{
    __m64 A = _mm_setr_pi16(256, -129, 254, 0);
    byte8 R = cast(byte8) _mm_packs_pi16(A, A);
    byte[8] correct = [127, -128, 127, 0, 127, -128, 127, 0];
    assert(R.array == correct);
}

/// Convert packed 32-bit integers from `a` and `b` to packed 16-bit integers using signed saturation.
__m64 _mm_packs_pi32 (__m64 a, __m64 b) pure @trusted
{
    int4 p = cast(int4) _mm_packs_epi32(to_m128i(a), to_m128i(b));
    int2 r;
    r.ptr[0] = p.array[0];
    r.ptr[1] = p.array[2];
    return cast(__m64)r;
}
unittest
{
    __m64 A = _mm_setr_pi32(100000, -100000);
    short4 R = cast(short4) _mm_packs_pi32(A, A);
    short[4] correct = [32767, -32768, 32767, -32768];
    assert(R.array == correct);
}
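
// Added usage sketch for _mm_mullo_pi16/_mm_mulhi_pi16 above: interleaving the low and high
// halves with _mm_unpacklo_pi16 (defined further down in this module) rebuilds the full
// 32-bit products of the first two lanes.
unittest
{
    __m64 a = _mm_setr_pi16(1000, -2000, 3, 4);
    __m64 b = _mm_setr_pi16( 500,   600, 7, 8);
    __m64 lo = _mm_mullo_pi16(a, b);
    __m64 hi = _mm_mulhi_pi16(a, b);
    int2 products = cast(int2) _mm_unpacklo_pi16(lo, hi); // 32-bit products of lanes 0 and 1
    static immutable int[2] expected = [500_000, -1_200_000];
    assert(products.array == expected);
}
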
/// Convert packed 16-bit integers from `a` and `b` to packed 8-bit integers using unsigned saturation.
__m64 _mm_packs_pu16 (__m64 a, __m64 b) pure @trusted
{
    int4 p = cast(int4) _mm_packus_epi16(to_m128i(a), to_m128i(b));
    int2 r;
    r.ptr[0] = p.array[0];
    r.ptr[1] = p.array[2];
    return cast(__m64)r;
}
unittest
{
    __m64 A = _mm_setr_pi16(256, -129, 254, 0);
    byte8 R = cast(byte8) _mm_packs_pu16(A, A);
    ubyte[8] correct = [255, 0, 254, 0, 255, 0, 254, 0];
    assert(R.array == cast(byte[8])correct);
}

deprecated alias
    _m_packssdw = _mm_packs_pi32,
    _m_packsswb = _mm_packs_pi16,
    _m_packuswb = _mm_packs_pu16,
    _m_paddb = _mm_add_pi8,
    _m_paddd = _mm_add_pi32,
    _m_paddsb = _mm_adds_pi8,
    _m_paddsw = _mm_adds_pi16,
    _m_paddusb = _mm_adds_pu8,
    _m_paddusw = _mm_adds_pu16,
    _m_paddw = _mm_add_pi16,
    _m_pand = _mm_and_si64,
    _m_pandn = _mm_andnot_si64,
    _m_pcmpeqb = _mm_cmpeq_pi8,
    _m_pcmpeqd = _mm_cmpeq_pi32,
    _m_pcmpeqw = _mm_cmpeq_pi16,
    _m_pcmpgtb = _mm_cmpgt_pi8,
    _m_pcmpgtd = _mm_cmpgt_pi32,
    _m_pcmpgtw = _mm_cmpgt_pi16,
    _m_pmaddwd = _mm_madd_pi16,
    _m_pmulhw = _mm_mulhi_pi16,
    _m_pmullw = _mm_mullo_pi16,
    _m_por = _mm_or_si64,
    _m_pslld = _mm_sll_pi32,
    _m_pslldi = _mm_slli_pi32,
    _m_psllq = _mm_sll_si64,
    _m_psllqi = _mm_slli_si64,
    _m_psllw = _mm_sll_pi16,
    _m_psllwi = _mm_slli_pi16,
    _m_psrad = _mm_sra_pi32,
    _m_psradi = _mm_srai_pi32,
    _m_psraw = _mm_sra_pi16,
    _m_psrawi = _mm_srai_pi16,
    _m_psrld = _mm_srl_pi32,
    _m_psrldi = _mm_srli_pi32,
    _m_psrlq = _mm_srl_si64,
    _m_psrlqi = _mm_srli_si64,
    _m_psrlw = _mm_srl_pi16,
    _m_psrlwi = _mm_srli_pi16,
    _m_psubb = _mm_sub_pi8,
    _m_psubd = _mm_sub_pi32,
    _m_psubsb = _mm_subs_pi8,
    _m_psubsw = _mm_subs_pi16,
    _m_psubusb = _mm_subs_pu8,
    _m_psubusw = _mm_subs_pu16,
    _m_psubw = _mm_sub_pi16,
    _m_punpckhbw = _mm_unpackhi_pi8,
    _m_punpckhdq = _mm_unpackhi_pi32,
    _m_punpckhwd = _mm_unpackhi_pi16,
    _m_punpcklbw = _mm_unpacklo_pi8,
    _m_punpckldq = _mm_unpacklo_pi32,
    _m_punpcklwd = _mm_unpacklo_pi16,
    _m_pxor = _mm_xor_si64;

/// Set packed 16-bit integers with the supplied values.
__m64 _mm_set_pi16 (short e3, short e2, short e1, short e0) pure @trusted
{
    short[4] arr = [e0, e1, e2, e3];
    return *cast(__m64*)(arr.ptr);
}
unittest
{
    short4 R = cast(short4) _mm_set_pi16(3, 2, 1, 0);
    short[4] correct = [0, 1, 2, 3];
    assert(R.array == correct);
}

/// Set packed 32-bit integers with the supplied values.
__m64 _mm_set_pi32 (int e1, int e0) pure @trusted
{
    int[2] arr = [e0, e1];
    return *cast(__m64*)(arr.ptr);
}
unittest
{
    int2 R = cast(int2) _mm_set_pi32(1, 0);
    int[2] correct = [0, 1];
    assert(R.array == correct);
}

/// Set packed 8-bit integers with the supplied values.
__m64 _mm_set_pi8 (byte e7, byte e6, byte e5, byte e4, byte e3, byte e2, byte e1, byte e0) pure @trusted
{
    byte[8] arr = [e0, e1, e2, e3, e4, e5, e6, e7];
    return *cast(__m64*)(arr.ptr);
}
unittest
{
    byte8 R = cast(byte8) _mm_set_pi8(7, 6, 5, 4, 3, 2, 1, 0);
    byte[8] correct = [0, 1, 2, 3, 4, 5, 6, 7];
    assert(R.array == correct);
}

/// Broadcast 16-bit integer `a` to all elements.
__m64 _mm_set1_pi16 (short a) pure @trusted
{
    return cast(__m64)(short4(a));
}
unittest
{
    short4 R = cast(short4) _mm_set1_pi16(44);
    short[4] correct = [44, 44, 44, 44];
    assert(R.array == correct);
}

/// Broadcast 32-bit integer `a` to all elements.
__m64 _mm_set1_pi32 (int a) pure @trusted
{
    return cast(__m64)(int2(a));
}
unittest
{
    int2 R = cast(int2) _mm_set1_pi32(43);
    int[2] correct = [43, 43];
    assert(R.array == correct);
}

/// Broadcast 8-bit integer `a` to all elements.
__m64 _mm_set1_pi8 (byte a) pure @trusted
{
    return cast(__m64)(byte8(a));
}
unittest
{
    byte8 R = cast(byte8) _mm_set1_pi8(42);
    byte[8] correct = [42, 42, 42, 42, 42, 42, 42, 42];
    assert(R.array == correct);
}

/// Set packed 16-bit integers with the supplied values in reverse order.
__m64 _mm_setr_pi16 (short e3, short e2, short e1, short e0) pure @trusted
{
    short[4] arr = [e3, e2, e1, e0];
    return *cast(__m64*)(arr.ptr);
}
unittest
{
    short4 R = cast(short4) _mm_setr_pi16(0, 1, 2, 3);
    short[4] correct = [0, 1, 2, 3];
    assert(R.array == correct);
}

/// Set packed 32-bit integers with the supplied values in reverse order.
__m64 _mm_setr_pi32 (int e1, int e0) pure @trusted
{
    int[2] arr = [e1, e0];
    return *cast(__m64*)(arr.ptr);
}
unittest
{
    int2 R = cast(int2) _mm_setr_pi32(0, 1);
    int[2] correct = [0, 1];
    assert(R.array == correct);
}

/// Set packed 8-bit integers with the supplied values in reverse order.
__m64 _mm_setr_pi8 (byte e7, byte e6, byte e5, byte e4, byte e3, byte e2, byte e1, byte e0) pure @trusted
{
    byte[8] arr = [e7, e6, e5, e4, e3, e2, e1, e0];
    return *cast(__m64*)(arr.ptr);
}
unittest
{
    byte8 R = cast(byte8) _mm_setr_pi8(0, 1, 2, 3, 4, 5, 6, 7);
    byte[8] correct = [0, 1, 2, 3, 4, 5, 6, 7];
    assert(R.array == correct);
}

/// Return vector of type `__m64` with all elements set to zero.
__m64 _mm_setzero_si64 () pure @trusted
{
    __m64 r;
    r.ptr[0] = 0;
    return r;
}
unittest
{
    __m64 R = _mm_setzero_si64();
    assert(R.array[0] == 0);
}

/// Shift packed 16-bit integers in `a` left by `bits` while shifting in zeros.
deprecated("Use _mm_slli_pi16 instead.") __m64 _mm_sll_pi16 (__m64 a, __m64 bits) pure @safe
{
    return to_m64(_mm_sll_epi16(to_m128i(a), to_m128i(bits)));
}

/// Shift packed 32-bit integers in `a` left by `bits` while shifting in zeros.
deprecated("Use _mm_slli_pi32 instead.") __m64 _mm_sll_pi32 (__m64 a, __m64 bits) pure @safe
{
    return to_m64(_mm_sll_epi32(to_m128i(a), to_m128i(bits)));
}

/// Shift 64-bit integer `a` left by `bits` while shifting in zeros.
deprecated("Use _mm_slli_si64 instead.") __m64 _mm_sll_si64 (__m64 a, __m64 bits) pure @safe
{
    return to_m64(_mm_sll_epi64(to_m128i(a), to_m128i(bits)));
}

/// Shift packed 16-bit integers in `a` left by `bits` while shifting in zeros.
__m64 _mm_slli_pi16 (__m64 a, int bits) pure @safe
{
    return to_m64(_mm_slli_epi16(to_m128i(a), bits));
}
unittest
{
    __m64 A = _mm_setr_pi16(-4, -5, 6, 7);
    short4 B = cast(short4)( _mm_slli_pi16(A, 1) );
    short[4] correct = [ -8, -10, 12, 14 ];
    assert(B.array == correct);
}

/// Shift packed 32-bit integers in `a` left by `bits` while shifting in zeros.
__m64 _mm_slli_pi32 (__m64 a, int bits) pure @safe
{
    return to_m64(_mm_slli_epi32(to_m128i(a), bits));
}
unittest
{
    __m64 A = _mm_setr_pi32(-4, 5);
    int2 B = cast(int2)( _mm_slli_pi32(A, 1) );
    int[2] correct = [ -8, 10 ];
    assert(B.array == correct);
}

/// Shift 64-bit integer `a` left by `bits` while shifting in zeros.
__m64 _mm_slli_si64 (__m64 a, int bits) pure @safe
{
    return to_m64(_mm_slli_epi64(to_m128i(a), bits));
}
unittest
{
    __m64 A = _mm_cvtsi64_m64(-1);
    long1 R = cast(long1)( _mm_slli_si64(A, 1) );
    long[1] correct = [ -2 ];
    assert(R.array == correct);
}

/// Shift packed 16-bit integers in `a` right by `bits` while shifting in sign bits.
deprecated("Use _mm_srai_pi16 instead.") __m64 _mm_sra_pi16 (__m64 a, __m64 bits) pure @safe
{
    return to_m64(_mm_sra_epi16(to_m128i(a), to_m128i(bits)));
}

/// Shift packed 32-bit integers in `a` right by `bits` while shifting in sign bits.
deprecated("Use _mm_srai_pi32 instead.") __m64 _mm_sra_pi32 (__m64 a, __m64 bits) pure @safe
{
    return to_m64(_mm_sra_epi32(to_m128i(a), to_m128i(bits)));
}

/// Shift packed 16-bit integers in `a` right by `bits` while shifting in sign bits.
__m64 _mm_srai_pi16 (__m64 a, int bits) pure @safe
{
    return to_m64(_mm_srai_epi16(to_m128i(a), bits));
}
unittest
{
    __m64 A = _mm_setr_pi16(-4, -5, 6, 7);
    short4 B = cast(short4)( _mm_srai_pi16(A, 1) );
    short[4] correct = [ -2, -3, 3, 3 ];
    assert(B.array == correct);
}

/// Shift packed 32-bit integers in `a` right by `bits` while shifting in sign bits.
__m64 _mm_srai_pi32 (__m64 a, int bits) pure @safe
{
    return to_m64(_mm_srai_epi32(to_m128i(a), bits));
}
unittest
{
    __m64 A = _mm_setr_pi32(-4, 5);
    int2 B = cast(int2)( _mm_srai_pi32(A, 1) );
    int[2] correct = [ -2, 2 ];
    assert(B.array == correct);
}

/// Shift packed 16-bit integers in `a` right by `bits` while shifting in zeros.
deprecated("Use _mm_srli_pi16 instead.") __m64 _mm_srl_pi16 (__m64 a, __m64 bits) pure @safe
{
    return to_m64(_mm_srl_epi16(to_m128i(a), to_m128i(bits)));
}

/// Shift packed 32-bit integers in `a` right by `bits` while shifting in zeros.
deprecated("Use _mm_srli_pi32 instead.") __m64 _mm_srl_pi32 (__m64 a, __m64 bits) pure @safe
{
    return to_m64(_mm_srl_epi32(to_m128i(a), to_m128i(bits)));
}

/// Shift 64-bit integer `a` right by `bits` while shifting in zeros.
deprecated("Use _mm_srli_si64 instead.") __m64 _mm_srl_si64 (__m64 a, __m64 bits) pure @safe
{
    return to_m64(_mm_srl_epi64(to_m128i(a), to_m128i(bits)));
}

/// Shift packed 16-bit integers in `a` right by `bits` while shifting in zeros.
__m64 _mm_srli_pi16 (__m64 a, int bits) pure @safe
{
    return to_m64(_mm_srli_epi16(to_m128i(a), bits));
}
unittest
{
    __m64 A = _mm_setr_pi16(-4, -5, 6, 7);
    short4 B = cast(short4)( _mm_srli_pi16(A, 1) );
    short[4] correct = [ 0x7ffe, 0x7ffd, 3, 3 ];
    assert(B.array == correct);
}
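
// Added usage sketch: _mm_srai_pi16 alone rounds toward negative infinity; adding a small
// bias to the negative lanes first gives signed division by a power of two that rounds
// toward zero, matching C's `/` operator on negative values.
unittest
{
    __m64 v = _mm_setr_pi16(-7, -8, 7, 8);
    __m64 bias = _mm_and_si64(_mm_srai_pi16(v, 15), _mm_set1_pi16(3)); // bias of 3 on negative lanes only
    short4 q = cast(short4) _mm_srai_pi16(_mm_add_pi16(v, bias), 2);   // (v + bias) >> 2, i.e. v / 4
    static immutable short[4] expected = [-1, -2, 1, 2];
    assert(q.array == expected);
}
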
/// Shift packed 32-bit integers in `a` right by `bits` while shifting in zeros.
__m64 _mm_srli_pi32 (__m64 a, int bits) pure @safe
{
    return to_m64(_mm_srli_epi32(to_m128i(a), bits));
}
unittest
{
    __m64 A = _mm_setr_pi32(-4, 5);
    int2 B = cast(int2)( _mm_srli_pi32(A, 1) );
    int[2] correct = [ 0x7ffffffe, 2 ];
    assert(B.array == correct);
}

/// Shift 64-bit integer `a` right by `bits` while shifting in zeros.
__m64 _mm_srli_si64 (__m64 a, int bits) pure @safe
{
    return to_m64(_mm_srli_epi64(to_m128i(a), bits));
}
unittest
{
    __m64 A = _mm_cvtsi64_m64(-1);
    long1 R = cast(long1)( _mm_srli_si64(A, 1) );
    long[1] correct = [ 0x7fff_ffff_ffff_ffff ];
    assert(R.array == correct);
}

/// Subtract packed 16-bit integers in `b` from packed 16-bit integers in `a`.
__m64 _mm_sub_pi16 (__m64 a, __m64 b) pure @safe
{
    return cast(__m64)(cast(short4)a - cast(short4)b);
}
unittest
{
    short4 R = cast(short4) _mm_sub_pi16(_mm_setr_pi16(cast(short)65534,  1, 5, -32768),
                                         _mm_setr_pi16(cast(short)65535, 16, 4,      4));
    static immutable short[4] correct = [ -1,-15, 1, 32764];
    assert(R.array == correct);
}

/// Subtract packed 32-bit integers in `b` from packed 32-bit integers in `a`.
__m64 _mm_sub_pi32 (__m64 a, __m64 b) pure @safe
{
    return cast(__m64)(cast(int2)a - cast(int2)b);
}
unittest
{
    int2 R = cast(int2) _mm_sub_pi32(_mm_setr_pi32( 10,   4),
                                     _mm_setr_pi32( 15, -70));
    static immutable int[2] correct = [ -5, 74];
    assert(R.array == correct);
}

/// Subtract packed 8-bit integers in `b` from packed 8-bit integers in `a`.
__m64 _mm_sub_pi8 (__m64 a, __m64 b) pure @safe
{
    return cast(__m64)(cast(byte8)a - cast(byte8)b);
}
unittest
{
    byte8 R = cast(byte8) _mm_sub_pi8(_mm_setr_pi8(cast(byte)254, 127, 13, 12, 11, 10, 9, -128),
                                      _mm_setr_pi8(cast(byte)255, 120, 14, 42, 11, 10, 9,    8));
    static immutable byte[8] correct = [ -1, 7, -1,-30, 0, 0, 0, 120 ];
    assert(R.array == correct);
}

/// Subtract packed 16-bit integers in `b` from packed 16-bit integers in `a` using saturation.
__m64 _mm_subs_pi16 (__m64 a, __m64 b) pure @safe
{
    return to_m64(_mm_subs_epi16(to_m128i(a), to_m128i(b)));
}
unittest
{
    short4 R = cast(short4) _mm_subs_pi16(_mm_setr_pi16(cast(short)65534,  1, 5, -32768),
                                          _mm_setr_pi16(cast(short)65535, 16, 4,      4));
    static immutable short[4] correct = [ -1,-15, 1, -32768];
    assert(R.array == correct);
}

/// Subtract packed 8-bit integers in `b` from packed 8-bit integers in `a` using saturation.
__m64 _mm_subs_pi8 (__m64 a, __m64 b) pure @safe
{
    return to_m64(_mm_subs_epi8(to_m128i(a), to_m128i(b)));
}
unittest
{
    byte8 R = cast(byte8) _mm_subs_pi8(_mm_setr_pi8(cast(byte)254, 127, 13, 12, 11, 10, 9, -128),
                                       _mm_setr_pi8(cast(byte)255, 120, 14, 42, 11, 10, 9,    8));
    static immutable byte[8] correct = [ -1, 7, -1,-30, 0, 0, 0, -128 ];
    assert(R.array == correct);
}

/// Subtract packed unsigned 16-bit integers in `b` from packed unsigned 16-bit integers in `a`
/// using saturation.
__m64 _mm_subs_pu16 (__m64 a, __m64 b) pure @safe
{
    return to_m64(_mm_subs_epu16(to_m128i(a), to_m128i(b)));
}
unittest
{
    short4 R = cast(short4) _mm_subs_pu16(_mm_setr_pi16(cast(short)65534,  1, 5, 4),
                                          _mm_setr_pi16(cast(short)65535, 16, 4, 4));
    static immutable short[4] correct = [ 0, 0, 1, 0];
    assert(R.array == correct);
}

/// Subtract packed unsigned 8-bit integers in `b` from packed unsigned 8-bit integers in `a`
/// using saturation.
__m64 _mm_subs_pu8 (__m64 a, __m64 b) pure @safe
{
    return to_m64(_mm_subs_epu8(to_m128i(a), to_m128i(b)));
}
unittest
{
    byte8 R = cast(byte8) _mm_subs_pu8(_mm_setr_pi8(cast(byte)254, 127, 13, 12, 11, 10, 9, 8),
                                       _mm_setr_pi8(cast(byte)255, 120, 14, 42, 11, 10, 9, 8));
    static immutable byte[8] correct = [ 0, 7, 0, 0, 0, 0, 0, 0 ];
    assert(R.array == correct);
}

deprecated alias _m_to_int = _mm_cvtsi64_si32;
deprecated alias _m_to_int64 = _mm_cvtm64_si64;

/// Unpack and interleave 16-bit integers from the high half of `a` and `b`.
__m64 _mm_unpackhi_pi16 (__m64 a, __m64 b) pure @trusted
{
    version(LDC)
    {
        // avoiding this shufflevector leads to bad performance on LDC
        return cast(__m64) shufflevector!(short4, 2, 6, 3, 7)(cast(short4)a, cast(short4)b);
    }
    else
    {
        short4 ia = cast(short4)a;
        short4 ib = cast(short4)b;
        short4 r;
        r.ptr[0] = ia.array[2];
        r.ptr[1] = ib.array[2];
        r.ptr[2] = ia.array[3];
        r.ptr[3] = ib.array[3];
        return cast(__m64)r;
    }
}
unittest
{
    __m64 A = _mm_setr_pi16(4, 8, -16,  7);
    __m64 B = _mm_setr_pi16(5, 9,  -3, 10);
    short4 R = cast(short4) _mm_unpackhi_pi16(A, B);
    short[4] correct = [-16, -3, 7, 10];
    assert(R.array == correct);
}

/// Unpack and interleave 32-bit integers from the high half of `a` and `b`.
__m64 _mm_unpackhi_pi32 (__m64 a, __m64 b) pure @trusted
{
    // Generate punpckldq as far back as LDC 1.0.0 -O1
    // (Yes, LLVM does generate punpckldq to reuse SSE2 instructions)
    int2 ia = cast(int2)a;
    int2 ib = cast(int2)b;
    int2 r;
    r.ptr[0] = ia.array[1];
    r.ptr[1] = ib.array[1];
    return cast(__m64)r;
}
unittest
{
    __m64 A = _mm_setr_pi32(4, 8);
    __m64 B = _mm_setr_pi32(5, 9);
    int2 R = cast(int2) _mm_unpackhi_pi32(A, B);
    int[2] correct = [8, 9];
    assert(R.array == correct);
}

/// Unpack and interleave 8-bit integers from the high half of `a` and `b`.
__m64 _mm_unpackhi_pi8 (__m64 a, __m64 b)
{
    version(LDC)
    {
        return cast(__m64) shufflevector!(byte8, 4, 12, 5, 13, 6, 14, 7, 15)(cast(byte8)a, cast(byte8)b);
    }
    else
    {
        byte8 ia = cast(byte8)a;
        byte8 ib = cast(byte8)b;
        byte8 r;
        r.ptr[0] = ia.array[4];
        r.ptr[1] = ib.array[4];
        r.ptr[2] = ia.array[5];
        r.ptr[3] = ib.array[5];
        r.ptr[4] = ia.array[6];
        r.ptr[5] = ib.array[6];
        r.ptr[6] = ia.array[7];
        r.ptr[7] = ib.array[7];
        return cast(__m64)r;
    }
}
unittest
{
    __m64 A = _mm_setr_pi8( 1,  2,  3,  4,  5,  6,  7,  8);
    __m64 B = _mm_setr_pi8(-1, -2, -3, -4, -5, -6, -7, -8);
    byte8 R = cast(byte8) _mm_unpackhi_pi8(A, B);
    byte[8] correct = [5, -5, 6, -6, 7, -7, 8, -8];
    assert(R.array == correct);
}
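
// Added usage sketch: interleaving with a zero vector zero-extends unsigned bytes to 16-bit
// lanes; here the upper four bytes of `a` become four shorts.
unittest
{
    __m64 a = _mm_setr_pi8(1, 2, 3, 4, 5, cast(byte)250, 7, cast(byte)200);
    short4 widened = cast(short4) _mm_unpackhi_pi8(a, _mm_setzero_si64());
    static immutable short[4] expected = [5, 250, 7, 200];
    assert(widened.array == expected);
}
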
/// Unpack and interleave 16-bit integers from the low half of `a` and `b`.
__m64 _mm_unpacklo_pi16 (__m64 a, __m64 b)
{
    // Generates punpcklwd since LDC 1.0.0 -O1
    short4 ia = cast(short4)a;
    short4 ib = cast(short4)b;
    short4 r;
    r.ptr[0] = ia.array[0];
    r.ptr[1] = ib.array[0];
    r.ptr[2] = ia.array[1];
    r.ptr[3] = ib.array[1];
    return cast(__m64)r;
}
unittest
{
    __m64 A = _mm_setr_pi16(4, 8, -16,  7);
    __m64 B = _mm_setr_pi16(5, 9,  -3, 10);
    short4 R = cast(short4) _mm_unpacklo_pi16(A, B);
    short[4] correct = [4, 5, 8, 9];
    assert(R.array == correct);
}

/// Unpack and interleave 32-bit integers from the low half of `a` and `b`.
__m64 _mm_unpacklo_pi32 (__m64 a, __m64 b) pure @trusted
{
    // x86: Generate punpckldq as far back as LDC 1.0.0 -O1
    // ARM: Generate zip as far back as LDC 1.8.0 -O1
    int2 ia = cast(int2)a;
    int2 ib = cast(int2)b;
    int2 r;
    r.ptr[0] = ia.array[0];
    r.ptr[1] = ib.array[0];
    return cast(__m64)r;
}
unittest
{
    __m64 A = _mm_setr_pi32(4, 8);
    __m64 B = _mm_setr_pi32(5, 9);
    int2 R = cast(int2) _mm_unpacklo_pi32(A, B);
    int[2] correct = [4, 5];
    assert(R.array == correct);
}

/// Unpack and interleave 8-bit integers from the low half of `a` and `b`.
__m64 _mm_unpacklo_pi8 (__m64 a, __m64 b)
{
    version(LDC)
    {
        return cast(__m64) shufflevector!(byte8, 0, 8, 1, 9, 2, 10, 3, 11)(cast(byte8)a, cast(byte8)b);
    }
    else
    {
        byte8 ia = cast(byte8)a;
        byte8 ib = cast(byte8)b;
        byte8 r;
        r.ptr[0] = ia.array[0];
        r.ptr[1] = ib.array[0];
        r.ptr[2] = ia.array[1];
        r.ptr[3] = ib.array[1];
        r.ptr[4] = ia.array[2];
        r.ptr[5] = ib.array[2];
        r.ptr[6] = ia.array[3];
        r.ptr[7] = ib.array[3];
        return cast(__m64)r;
    }
}
unittest
{
    __m64 A = _mm_setr_pi8( 1,  2,  3,  4,  5,  6,  7,  8);
    __m64 B = _mm_setr_pi8(-1, -2, -3, -4, -5, -6, -7, -8);
    byte8 R = cast(byte8) _mm_unpacklo_pi8(A, B);
    byte[8] correct = [1, -1, 2, -2, 3, -3, 4, -4];
    assert(R.array == correct);
}

/// Compute the bitwise XOR of 64 bits (representing integer data) in `a` and `b`.
__m64 _mm_xor_si64 (__m64 a, __m64 b)
{
    return a ^ b;
}
unittest
{
    __m64 A = _mm_setr_pi16(255, 1, -1, 0);
    __m64 B = _mm_set1_pi16(15);
    short4 R = cast(short4)_mm_xor_si64(A, B);
    short[4] correct = [240, 14, -16, 15];
    assert(R.array == correct);
}