/**
* MMX intrinsics.
* https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#techs=MMX
*
* Copyright: Copyright Guillaume Piolat 2019-2020.
* License:   $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
*/
module inteli.mmx;

public import inteli.types;
import inteli.internals;

import inteli.xmmintrin;
import inteli.emmintrin;

nothrow @nogc:

// Important: you don't need to call _mm_empty when using the "MMX" capabilities of intel-intrinsics,
// since it just generates the right IR; cleaning up the FPU registers is left to the codegen.
// intel-intrinsics only provides the semantics.


/// Add packed 16-bit integers in `a` and `b`.
__m64 _mm_add_pi16 (__m64 a, __m64 b)
{
    return cast(__m64)(cast(short4)a + cast(short4)b);
}
unittest
{
    short4 R = cast(short4) _mm_add_pi16(_mm_set1_pi16(4), _mm_set1_pi16(3));
    short[4] correct = [7, 7, 7, 7];
    assert(R.array == correct);
}

/// Add packed 32-bit integers in `a` and `b`.
__m64 _mm_add_pi32 (__m64 a, __m64 b)
{
    return cast(__m64)(cast(int2)a + cast(int2)b);
}
unittest
{
    int2 R = cast(int2) _mm_add_pi32(_mm_set1_pi32(4), _mm_set1_pi32(3));
    int[2] correct = [7, 7];
    assert(R.array == correct);
}

/// Add packed 8-bit integers in `a` and `b`.
__m64 _mm_add_pi8 (__m64 a, __m64 b)
{
    return cast(__m64)(cast(byte8)a + cast(byte8)b);
}
unittest
{
    byte8 R = cast(byte8) _mm_add_pi8(_mm_set1_pi8(127), _mm_set1_pi8(-128));
    byte[8] correct = [-1, -1, -1, -1, -1, -1, -1, -1];
    assert(R.array == correct);
}

/// Add packed 16-bit integers in `a` and `b` using signed saturation.
// PERF: PADDSW not generated
__m64 _mm_adds_pi16(__m64 a, __m64 b) pure @trusted
{
    return to_m64(_mm_adds_epi16(to_m128i(a), to_m128i(b)));
}
unittest
{
    short4 res = cast(short4) _mm_adds_pi16(_mm_set_pi16(3, 2, 1, 0),
                                            _mm_set_pi16(3, 2, 1, 0));
    static immutable short[4] correctResult = [0, 2, 4, 6];
    assert(res.array == correctResult);
}

/// Add packed 8-bit integers in `a` and `b` using signed saturation.
// PERF: PADDSB not generated
__m64 _mm_adds_pi8(__m64 a, __m64 b) pure @trusted
{
    return to_m64(_mm_adds_epi8(to_m128i(a), to_m128i(b)));
}
unittest
{
    byte8 res = cast(byte8) _mm_adds_pi8(_mm_set_pi8(7, 6, 5, 4, 3, 2, 1, 0),
                                         _mm_set_pi8(7, 6, 5, 4, 3, 2, 1, 0));
    static immutable byte[8] correctResult = [0, 2, 4, 6, 8, 10, 12, 14];
    assert(res.array == correctResult);
}

/// Add packed 16-bit integers in `a` and `b` using unsigned saturation.
// PERF: PADDUSW not generated
__m64 _mm_adds_pu16(__m64 a, __m64 b) pure @trusted
{
    return to_m64(_mm_adds_epu16(to_m128i(a), to_m128i(b)));
}
unittest
{
    short4 res = cast(short4) _mm_adds_pu16(_mm_set_pi16(3, 2, cast(short)65535, 0),
                                            _mm_set_pi16(3, 2, 1, 0));
    static immutable short[4] correctResult = [0, cast(short)65535, 4, 6];
    assert(res.array == correctResult);
}
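
// Note: the following unittest is an illustrative sketch rather than an Intel intrinsic test:
// it contrasts wrap-around _mm_add_pi16 with saturating _mm_adds_pi16 on the same inputs.
unittest
{
    __m64 a = _mm_set1_pi16(30000);
    __m64 b = _mm_set1_pi16(30000);
    short4 wrapped   = cast(short4) _mm_add_pi16(a, b);  // 60000 wraps around to -5536
    short4 saturated = cast(short4) _mm_adds_pi16(a, b); // clamped to short.max
    assert(wrapped.array[0]   == -5536);
    assert(saturated.array[0] == short.max);
}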

/// Add packed 8-bit integers in `a` and `b` using unsigned saturation.
// PERF: PADDUSB not generated
__m64 _mm_adds_pu8(__m64 a, __m64 b) pure @trusted
{
    return to_m64(_mm_adds_epu8(to_m128i(a), to_m128i(b)));
}
unittest
{
    byte8 res = cast(byte8) _mm_adds_pu8(_mm_set_pi8(7, 6, 5, 4, 3, 2, cast(byte)255, 0),
                                         _mm_set_pi8(7, 6, 5, 4, 3, 2, 1, 0));
    static immutable byte[8] correctResult = [0, cast(byte)255, 4, 6, 8, 10, 12, 14];
    assert(res.array == correctResult);
}

/// Compute the bitwise AND of 64 bits (representing integer data) in `a` and `b`.
__m64 _mm_and_si64 (__m64 a, __m64 b) pure @safe
{
    return a & b;
}
unittest
{
    __m64 A = [7];
    __m64 B = [14];
    __m64 R = _mm_and_si64(A, B);
    assert(R.array[0] == 6);
}

/// Compute the bitwise NOT of 64 bits (representing integer data) in `a` and then AND with `b`.
__m64 _mm_andnot_si64 (__m64 a, __m64 b)
{
    return (~a) & b;
}
unittest
{
    __m64 A = [7];
    __m64 B = [14];
    __m64 R = _mm_andnot_si64(A, B);
    assert(R.array[0] == 8);
}

/// Compare packed 16-bit integers in `a` and `b` for equality.
__m64 _mm_cmpeq_pi16 (__m64 a, __m64 b) pure @safe
{
    static if (GDC_with_MMX)
    {
        return cast(__m64) __builtin_ia32_pcmpeqw(cast(short4)a, cast(short4)b);
    }
    else
    {
        return cast(__m64) equalMask!short4(cast(short4)a, cast(short4)b);
    }
}
unittest
{
    short4   A = [-3, -2, -1, 0];
    short4   B = [ 4,  3,  2, 1];
    short[4] E = [ 0,  0,  0, 0];
    short4   R = cast(short4)(_mm_cmpeq_pi16(cast(__m64)A, cast(__m64)B));
    assert(R.array == E);
}

/// Compare packed 32-bit integers in `a` and `b` for equality.
__m64 _mm_cmpeq_pi32 (__m64 a, __m64 b) pure @safe
{
    static if (GDC_with_MMX)
    {
        return cast(__m64) __builtin_ia32_pcmpeqd(cast(int2)a, cast(int2)b);
    }
    else
    {
        return cast(__m64) equalMask!int2(cast(int2)a, cast(int2)b);
    }
}
unittest
{
    int2   A = [-3, -2];
    int2   B = [ 4, -2];
    int[2] E = [ 0, -1];
    int2   R = cast(int2)(_mm_cmpeq_pi32(cast(__m64)A, cast(__m64)B));
    assert(R.array == E);
}

/// Compare packed 8-bit integers in `a` and `b` for equality.
__m64 _mm_cmpeq_pi8 (__m64 a, __m64 b) pure @safe
{
    static if (GDC_with_MMX)
    {
        return cast(__m64) __builtin_ia32_pcmpeqb(cast(ubyte8)a, cast(ubyte8)b);
    }
    else
    {
        return cast(__m64) equalMask!byte8(cast(byte8)a, cast(byte8)b);
    }
}
unittest
{
    __m64 A = _mm_setr_pi8(1, 2, 3, 1, 2, 1, 1, 2);
    __m64 B = _mm_setr_pi8(2, 2, 1, 2, 3, 1, 2, 3);
    byte8 C = cast(byte8) _mm_cmpeq_pi8(A, B);
    byte[8] correct = [0,-1, 0, 0, 0,-1, 0, 0];
    assert(C.array == correct);
}

/// Compare packed 16-bit integers in `a` and `b` for greater-than.
__m64 _mm_cmpgt_pi16 (__m64 a, __m64 b) pure @safe
{
    static if (GDC_with_MMX)
    {
        return cast(__m64) __builtin_ia32_pcmpgtw (cast(short4)a, cast(short4)b);
    }
    else
    {
        return cast(__m64) greaterMask!short4(cast(short4)a, cast(short4)b);
    }
}
unittest
{
    short4   A = [-3, -2, -1, 0];
    short4   B = [ 4,  3,  2, 1];
    short[4] E = [ 0,  0,  0, 0];
    short4   R = cast(short4)(_mm_cmpgt_pi16(cast(__m64)A, cast(__m64)B));
    assert(R.array == E);
}

/// Compare packed 32-bit integers in `a` and `b` for greater-than.
__m64 _mm_cmpgt_pi32 (__m64 a, __m64 b) pure @safe
{
    static if (GDC_with_MMX)
    {
        return cast(__m64) __builtin_ia32_pcmpgtd (cast(int2)a, cast(int2)b);
    }
    else
    {
        return cast(__m64) greaterMask!int2(cast(int2)a, cast(int2)b);
    }
}
unittest
{
    int2   A = [-3,  2];
    int2   B = [ 4, -2];
    int[2] E = [ 0, -1];
    int2   R = cast(int2)(_mm_cmpgt_pi32(cast(__m64)A, cast(__m64)B));
    assert(R.array == E);
}

/// Compare packed 8-bit integers in `a` and `b` for greater-than.
__m64 _mm_cmpgt_pi8 (__m64 a, __m64 b) pure @safe
{
    static if (GDC_with_MMX)
    {
        return cast(__m64) __builtin_ia32_pcmpgtb (cast(ubyte8)a, cast(ubyte8)b);
    }
    else
    {
        return cast(__m64) greaterMask!byte8(cast(byte8)a, cast(byte8)b);
    }
}
unittest
{
    __m64 A = _mm_setr_pi8(1, 2, 3, 1, 2, 1, 1, 2);
    __m64 B = _mm_setr_pi8(2, 2, 1, 2, 3, 1, 2, 3);
    byte8 C = cast(byte8) _mm_cmpgt_pi8(A, B);
    byte[8] correct = [0, 0,-1, 0, 0, 0, 0, 0];
    assert(C.array == correct);
}

/// Copy 64-bit integer `a` to `dst`.
long _mm_cvtm64_si64 (__m64 a) pure @safe
{
    return a.array[0];
}
unittest
{
    __m64 A = _mm_setr_pi32(2, 1);
    assert(_mm_cvtm64_si64(A) == 0x100000002);
}

/// Copy 32-bit integer `a` to the lower elements of `dst`, and zero the upper element of `dst`.
__m64 _mm_cvtsi32_si64 (int a) pure @trusted
{
    __m64 r = void;
    r.ptr[0] = a;
    return r;
}
unittest
{
    __m64 R = _mm_cvtsi32_si64(-1);
    assert(R.array[0] == -1);
}

/// Copy 64-bit integer `a` to `dst`.
__m64 _mm_cvtsi64_m64 (long a) pure @trusted
{
    __m64 r = void;
    r.ptr[0] = a;
    return r;
}
unittest
{
    __m64 R = _mm_cvtsi64_m64(0x123456789A);
    assert(R.array[0] == 0x123456789A);
}

/// Get the lower 32-bit integer in `a`.
int _mm_cvtsi64_si32 (__m64 a) pure @safe
{
    int2 r = cast(int2)a;
    return r.array[0];
}
unittest
{
    __m64 A = _mm_setr_pi32(-6, 5);
    int R = _mm_cvtsi64_si32(A);
    assert(R == -6);
}

/// Empty the MMX state, which marks the x87 FPU registers as available for
/// use by x87 instructions.
/// This instruction is supposed to be used at the end of all MMX technology procedures.
/// This is useless when using `intel-intrinsics`, at least with LDC and DMD.
void _mm_empty() pure @safe
{
    // do nothing, see comment on top of file
    // TODO: not sure for GDC, do something?
}


deprecated alias _m_empty = _mm_empty;            /// Deprecated intrinsics.
deprecated alias _m_from_int = _mm_cvtsi32_si64;  ///ditto
deprecated alias _m_from_int64 = _mm_cvtsi64_m64; ///ditto

/// Multiply packed 16-bit integers in `a` and `b`, producing intermediate 32-bit integers.
/// Horizontally add adjacent pairs of intermediate 32-bit integers
__m64 _mm_madd_pi16 (__m64 a, __m64 b) pure @safe
{
    return to_m64(_mm_madd_epi16(to_m128i(a), to_m128i(b)));
}
unittest
{
    short4 A = [-32768, -32768, 32767, 32767];
    short4 B = [-32768, -32768, 32767, 32767];
    int2 R = cast(int2) _mm_madd_pi16(cast(__m64)A, cast(__m64)B);
    int[2] correct = [-2147483648, 2*32767*32767];
    assert(R.array == correct);
}

/// Multiply the packed 16-bit integers in `a` and `b`, producing intermediate 32-bit integers,
/// and store the high 16 bits of the intermediate integers.
__m64 _mm_mulhi_pi16 (__m64 a, __m64 b) pure @safe
{
    return to_m64(_mm_mulhi_epi16(to_m128i(a), to_m128i(b)));
}
unittest
{
    __m64 A = _mm_setr_pi16(4, 8, -16, 7);
    __m64 B = _mm_set1_pi16(16384);
    short4 R = cast(short4)_mm_mulhi_pi16(A, B);
    short[4] correct = [1, 2, -4, 1];
    assert(R.array == correct);
}

/// Multiply the packed 16-bit integers in `a` and `b`, producing intermediate 32-bit integers,
/// and store the low 16 bits of the intermediate integers.
__m64 _mm_mullo_pi16 (__m64 a, __m64 b) pure @safe
{
    return to_m64(_mm_mullo_epi16(to_m128i(a), to_m128i(b)));
}
unittest
{
    __m64 A = _mm_setr_pi16(4, 1, 16, 7);
    __m64 B = _mm_set1_pi16(16384);
    short4 R = cast(short4)_mm_mullo_pi16(A, B);
    short[4] correct = [0, 16384, 0, -16384];
    assert(R.array == correct);
}

/// Compute the bitwise OR of 64 bits in `a` and `b`.
__m64 _mm_or_si64 (__m64 a, __m64 b) pure @safe
{
    return a | b;
}
unittest
{
    __m64 A = _mm_setr_pi16(255, 1, -1, 0);
    __m64 B = _mm_set1_pi16(15);
    short4 R = cast(short4)_mm_or_si64(A, B);
    short[4] correct = [255, 15, -1, 15];
    assert(R.array == correct);
}

/// Convert packed 16-bit integers from `a` and `b` to packed 8-bit integers using signed saturation.
__m64 _mm_packs_pi16 (__m64 a, __m64 b) pure @trusted
{
    int4 p = cast(int4) _mm_packs_epi16(to_m128i(a), to_m128i(b));
    int2 r;
    r.ptr[0] = p.array[0];
    r.ptr[1] = p.array[2];
    return cast(__m64)r;
}
unittest
{
    __m64 A = _mm_setr_pi16(256, -129, 254, 0);
    byte8 R = cast(byte8) _mm_packs_pi16(A, A);
    byte[8] correct = [127, -128, 127, 0, 127, -128, 127, 0];
    assert(R.array == correct);
}

/// Convert packed 32-bit integers from `a` and `b` to packed 16-bit integers using signed saturation.
__m64 _mm_packs_pi32 (__m64 a, __m64 b) pure @trusted
{
    int4 p = cast(int4) _mm_packs_epi32(to_m128i(a), to_m128i(b));
    int2 r;
    r.ptr[0] = p.array[0];
    r.ptr[1] = p.array[2];
    return cast(__m64)r;
}
unittest
{
    __m64 A = _mm_setr_pi32(100000, -100000);
    short4 R = cast(short4) _mm_packs_pi32(A, A);
    short[4] correct = [32767, -32768, 32767, -32768];
    assert(R.array == correct);
}
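
// Note: the following unittest is an illustrative sketch, not an Intel intrinsic test:
// it chains the two signed pack intrinsics above to narrow 32-bit values down to 8-bit lanes.
unittest
{
    __m64 wide = _mm_setr_pi32(1000, -1000);
    __m64 as16 = _mm_packs_pi32(wide, wide);             // [1000, -1000, 1000, -1000]
    byte8 as8  = cast(byte8) _mm_packs_pi16(as16, as16); // each lane clamped to [-128, 127]
    byte[8] expected = [127, -128, 127, -128, 127, -128, 127, -128];
    assert(as8.array == expected);
}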

/// Convert packed 16-bit integers from `a` and `b` to packed 8-bit integers using unsigned saturation.
__m64 _mm_packs_pu16 (__m64 a, __m64 b) pure @trusted
{
    int4 p = cast(int4) _mm_packus_epi16(to_m128i(a), to_m128i(b));
    int2 r;
    r.ptr[0] = p.array[0];
    r.ptr[1] = p.array[2];
    return cast(__m64)r;
}
unittest
{
    __m64 A = _mm_setr_pi16(256, -129, 254, 0);
    byte8 R = cast(byte8) _mm_packs_pu16(A, A);
    ubyte[8] correct = [255, 0, 254, 0, 255, 0, 254, 0];
    assert(R.array == cast(byte[8])correct);
}

deprecated alias
    _m_packssdw = _mm_packs_pi32,     /// Deprecated intrinsics.
    _m_packsswb = _mm_packs_pi16,     ///ditto
    _m_packuswb = _mm_packs_pu16,     ///ditto
    _m_paddb = _mm_add_pi8,           ///ditto
    _m_paddd = _mm_add_pi32,          ///ditto
    _m_paddsb = _mm_adds_pi8,         ///ditto
    _m_paddsw = _mm_adds_pi16,        ///ditto
    _m_paddusb = _mm_adds_pu8,        ///ditto
    _m_paddusw = _mm_adds_pu16,       ///ditto
    _m_paddw = _mm_add_pi16,          ///ditto
    _m_pand = _mm_and_si64,           ///ditto
    _m_pandn = _mm_andnot_si64,       ///ditto
    _m_pcmpeqb = _mm_cmpeq_pi8,       ///ditto
    _m_pcmpeqd = _mm_cmpeq_pi32,      ///ditto
    _m_pcmpeqw = _mm_cmpeq_pi16,      ///ditto
    _m_pcmpgtb = _mm_cmpgt_pi8,       ///ditto
    _m_pcmpgtd = _mm_cmpgt_pi32,      ///ditto
    _m_pcmpgtw = _mm_cmpgt_pi16,      ///ditto
    _m_pmaddwd = _mm_madd_pi16,       ///ditto
    _m_pmulhw = _mm_mulhi_pi16,       ///ditto
    _m_pmullw = _mm_mullo_pi16,       ///ditto
    _m_por = _mm_or_si64,             ///ditto
    _m_pslld = _mm_sll_pi32,          ///ditto
    _m_pslldi = _mm_slli_pi32,        ///ditto
    _m_psllq = _mm_sll_si64,          ///ditto
    _m_psllqi = _mm_slli_si64,        ///ditto
    _m_psllw = _mm_sll_pi16,          ///ditto
    _m_psllwi = _mm_slli_pi16,        ///ditto
    _m_psrad = _mm_sra_pi32,          ///ditto
    _m_psradi = _mm_srai_pi32,        ///ditto
    _m_psraw = _mm_sra_pi16,          ///ditto
    _m_psrawi = _mm_srai_pi16,        ///ditto
    _m_psrld = _mm_srl_pi32,          ///ditto
    _m_psrldi = _mm_srli_pi32,        ///ditto
    _m_psrlq = _mm_srl_si64,          ///ditto
    _m_psrlqi = _mm_srli_si64,        ///ditto
    _m_psrlw = _mm_srl_pi16,          ///ditto
    _m_psrlwi = _mm_srli_pi16,        ///ditto
    _m_psubb = _mm_sub_pi8,           ///ditto
    _m_psubd = _mm_sub_pi32,          ///ditto
    _m_psubsb = _mm_subs_pi8,         ///ditto
    _m_psubsw = _mm_subs_pi16,        ///ditto
    _m_psubusb = _mm_subs_pu8,        ///ditto
    _m_psubusw = _mm_subs_pu16,       ///ditto
    _m_psubw = _mm_sub_pi16,          ///ditto
    _m_punpckhbw = _mm_unpackhi_pi8,  ///ditto
    _m_punpckhdq = _mm_unpackhi_pi32, ///ditto
    _m_punpckhwd = _mm_unpackhi_pi16, ///ditto
    _m_punpcklbw = _mm_unpacklo_pi8,  ///ditto
    _m_punpckldq = _mm_unpacklo_pi32, ///ditto
    _m_punpcklwd = _mm_unpacklo_pi16, ///ditto
    _m_pxor = _mm_xor_si64;           ///ditto

/// Set packed 16-bit integers with the supplied values.
__m64 _mm_set_pi16 (short e3, short e2, short e1, short e0) pure @trusted
{
    short[4] arr = [e0, e1, e2, e3];
    return *cast(__m64*)(arr.ptr);
}
unittest
{
    short4 R = cast(short4) _mm_set_pi16(3, 2, 1, 0);
    short[4] correct = [0, 1, 2, 3];
    assert(R.array == correct);
}

/// Set packed 32-bit integers with the supplied values.
__m64 _mm_set_pi32 (int e1, int e0) pure @trusted
{
    int[2] arr = [e0, e1];
    return *cast(__m64*)(arr.ptr);
}
unittest
{
    int2 R = cast(int2) _mm_set_pi32(1, 0);
    int[2] correct = [0, 1];
    assert(R.array == correct);
}

/// Set packed 8-bit integers with the supplied values.
__m64 _mm_set_pi8 (byte e7, byte e6, byte e5, byte e4, byte e3, byte e2, byte e1, byte e0) pure @trusted
{
    byte[8] arr = [e0, e1, e2, e3, e4, e5, e6, e7];
    return *cast(__m64*)(arr.ptr);
}
unittest
{
    byte8 R = cast(byte8) _mm_set_pi8(7, 6, 5, 4, 3, 2, 1, 0);
    byte[8] correct = [0, 1, 2, 3, 4, 5, 6, 7];
    assert(R.array == correct);
}

/// Broadcast 16-bit integer `a` to all elements.
__m64 _mm_set1_pi16 (short a) pure @trusted
{
    return cast(__m64)(short4(a));
}
unittest
{
    short4 R = cast(short4) _mm_set1_pi16(44);
    short[4] correct = [44, 44, 44, 44];
    assert(R.array == correct);
}

/// Broadcast 32-bit integer `a` to all elements.
__m64 _mm_set1_pi32 (int a) pure @trusted
{
    return cast(__m64)(int2(a));
}
unittest
{
    int2 R = cast(int2) _mm_set1_pi32(43);
    int[2] correct = [43, 43];
    assert(R.array == correct);
}

/// Broadcast 8-bit integer `a` to all elements.
__m64 _mm_set1_pi8 (byte a) pure @trusted
{
    return cast(__m64)(byte8(a));
}
unittest
{
    byte8 R = cast(byte8) _mm_set1_pi8(42);
    byte[8] correct = [42, 42, 42, 42, 42, 42, 42, 42];
    assert(R.array == correct);
}

/// Set packed 16-bit integers with the supplied values in reverse order.
__m64 _mm_setr_pi16 (short e3, short e2, short e1, short e0) pure @trusted
{
    short[4] arr = [e3, e2, e1, e0];
    return *cast(__m64*)(arr.ptr);
}
unittest
{
    short4 R = cast(short4) _mm_setr_pi16(0, 1, 2, 3);
    short[4] correct = [0, 1, 2, 3];
    assert(R.array == correct);
}

/// Set packed 32-bit integers with the supplied values in reverse order.
__m64 _mm_setr_pi32 (int e1, int e0) pure @trusted
{
    int[2] arr = [e1, e0];
    return *cast(__m64*)(arr.ptr);
}
unittest
{
    int2 R = cast(int2) _mm_setr_pi32(0, 1);
    int[2] correct = [0, 1];
    assert(R.array == correct);
}

/// Set packed 8-bit integers with the supplied values in reverse order.
__m64 _mm_setr_pi8 (byte e7, byte e6, byte e5, byte e4, byte e3, byte e2, byte e1, byte e0) pure @trusted
{
    byte[8] arr = [e7, e6, e5, e4, e3, e2, e1, e0];
    return *cast(__m64*)(arr.ptr);
}
unittest
{
    byte8 R = cast(byte8) _mm_setr_pi8(0, 1, 2, 3, 4, 5, 6, 7);
    byte[8] correct = [0, 1, 2, 3, 4, 5, 6, 7];
    assert(R.array == correct);
}

/// Return vector of type `__m64` with all elements set to zero.
__m64 _mm_setzero_si64 () pure @trusted
{
    __m64 r; // PERF =void;
    r.ptr[0] = 0;
    return r;
}
unittest
{
    __m64 R = _mm_setzero_si64();
    assert(R.array[0] == 0);
}

/// Shift packed 16-bit integers in `a` left by `bits` while shifting in zeros.
deprecated("Use _mm_slli_pi16 instead.") __m64 _mm_sll_pi16 (__m64 a, __m64 bits) pure @safe
{
    return to_m64(_mm_sll_epi16(to_m128i(a), to_m128i(bits)));
}

/// Shift packed 32-bit integers in `a` left by `bits` while shifting in zeros.
deprecated("Use _mm_slli_pi32 instead.") __m64 _mm_sll_pi32 (__m64 a, __m64 bits) pure @safe
{
    return to_m64(_mm_sll_epi32(to_m128i(a), to_m128i(bits)));
}

/// Shift 64-bit integer `a` left by `bits` while shifting in zeros.
deprecated("Use _mm_slli_si64 instead.") __m64 _mm_sll_si64 (__m64 a, __m64 bits) pure @safe
{
    return to_m64(_mm_sll_epi64(to_m128i(a), to_m128i(bits)));
}

/// Shift packed 16-bit integers in `a` left by `imm8` while shifting in zeros.
__m64 _mm_slli_pi16 (__m64 a, int imm8) pure @safe
{
    return to_m64(_mm_slli_epi16(to_m128i(a), imm8));
}
unittest
{
    __m64 A = _mm_setr_pi16(-4, -5, 6, 7);
    short4 B = cast(short4)( _mm_slli_pi16(A, 1) );
    short[4] correct = [ -8, -10, 12, 14 ];
    assert(B.array == correct);
}

/// Shift packed 32-bit integers in `a` left by `imm8` while shifting in zeros.
__m64 _mm_slli_pi32 (__m64 a, int imm8) pure @safe
{
    return to_m64(_mm_slli_epi32(to_m128i(a), imm8));
}
unittest
{
    __m64 A = _mm_setr_pi32(-4, 5);
    int2 B = cast(int2)( _mm_slli_pi32(A, 1) );
    int[2] correct = [ -8, 10 ];
    assert(B.array == correct);
}

/// Shift 64-bit integer `a` left by `imm8` while shifting in zeros.
__m64 _mm_slli_si64 (__m64 a, int imm8) pure @safe
{
    return to_m64(_mm_slli_epi64(to_m128i(a), imm8));
}
unittest
{
    __m64 A = _mm_cvtsi64_m64(-1);
    long1 R = cast(long1)( _mm_slli_si64(A, 1) );
    long[1] correct = [ -2 ];
    assert(R.array == correct);
}

/// Shift packed 16-bit integers in `a` right by `bits` while shifting in sign bits.
deprecated("Use _mm_srai_pi16 instead.") __m64 _mm_sra_pi16 (__m64 a, __m64 bits) pure @safe
{
    return to_m64(_mm_sra_epi16(to_m128i(a), to_m128i(bits)));
}

/// Shift packed 32-bit integers in `a` right by `bits` while shifting in sign bits.
deprecated("Use _mm_srai_pi32 instead.") __m64 _mm_sra_pi32 (__m64 a, __m64 bits) pure @safe
{
    return to_m64(_mm_sra_epi32(to_m128i(a), to_m128i(bits)));
}

/// Shift packed 16-bit integers in `a` right by `imm8` while shifting in sign bits.
__m64 _mm_srai_pi16 (__m64 a, int imm8) pure @safe
{
    return to_m64(_mm_srai_epi16(to_m128i(a), imm8));
}
unittest
{
    __m64 A = _mm_setr_pi16(-4, -5, 6, 7);
    short4 B = cast(short4)( _mm_srai_pi16(A, 1) );
    short[4] correct = [ -2, -3, 3, 3 ];
    assert(B.array == correct);
}

/// Shift packed 32-bit integers in `a` right by `imm8` while shifting in sign bits.
__m64 _mm_srai_pi32 (__m64 a, int imm8) pure @safe
{
    return to_m64(_mm_srai_epi32(to_m128i(a), imm8));
}
unittest
{
    __m64 A = _mm_setr_pi32(-4, 5);
    int2 B = cast(int2)( _mm_srai_pi32(A, 1) );
    int[2] correct = [ -2, 2 ];
    assert(B.array == correct);
}

/// Shift packed 16-bit integers in `a` right by `bits` while shifting in zeros.
deprecated("Use _mm_srli_pi16 instead.") __m64 _mm_srl_pi16 (__m64 a, __m64 bits) pure @safe
{
    return to_m64(_mm_srl_epi16(to_m128i(a), to_m128i(bits)));
}

/// Shift packed 32-bit integers in `a` right by `bits` while shifting in zeros.
deprecated("Use _mm_srli_pi32 instead.") __m64 _mm_srl_pi32 (__m64 a, __m64 bits) pure @safe
{
    return to_m64(_mm_srl_epi32(to_m128i(a), to_m128i(bits)));
}

/// Shift 64-bit integer `a` right by `bits` while shifting in zeros.
deprecated("Use _mm_srli_si64 instead.") __m64 _mm_srl_si64 (__m64 a, __m64 bits) pure @safe
{
    return to_m64(_mm_srl_epi64(to_m128i(a), to_m128i(bits)));
}

/// Shift packed 16-bit integers in `a` right by `imm8` while shifting in zeros.
__m64 _mm_srli_pi16 (__m64 a, int imm8) pure @safe
{
    return to_m64(_mm_srli_epi16(to_m128i(a), imm8));
}
unittest
{
    __m64 A = _mm_setr_pi16(-4, -5, 6, 7);
    short4 B = cast(short4)( _mm_srli_pi16(A, 1) );
    short[4] correct = [ 0x7ffe, 0x7ffd, 3, 3 ];
    assert(B.array == correct);
}

/// Shift packed 32-bit integers in `a` right by `imm8` while shifting in zeros.
__m64 _mm_srli_pi32 (__m64 a, int imm8) pure @safe
{
    return to_m64(_mm_srli_epi32(to_m128i(a), imm8));
}
unittest
{
    __m64 A = _mm_setr_pi32(-4, 5);
    int2 B = cast(int2)( _mm_srli_pi32(A, 1) );
    int[2] correct = [ 0x7ffffffe, 2 ];
    assert(B.array == correct);
}

/// Shift 64-bit integer `a` right by `imm8` while shifting in zeros.
__m64 _mm_srli_si64 (__m64 a, int imm8) pure @safe
{
    return to_m64(_mm_srli_epi64(to_m128i(a), imm8));
}
unittest
{
    __m64 A = _mm_cvtsi64_m64(-1);
    long1 R = cast(long1)( _mm_srli_si64(A, 1) );
    long[1] correct = [ 0x7fff_ffff_ffff_ffff ];
    assert(R.array == correct);
}

/// Subtract packed 16-bit integers in `b` from packed 16-bit integers in `a`.
__m64 _mm_sub_pi16 (__m64 a, __m64 b) pure @safe
{
    return cast(__m64)(cast(short4)a - cast(short4)b);
}
unittest
{
    short4 R = cast(short4) _mm_sub_pi16(_mm_setr_pi16(cast(short)65534,  1, 5, -32768),
                                         _mm_setr_pi16(cast(short)65535, 16, 4,      4));
    static immutable short[4] correct = [ -1, -15, 1, 32764 ];
    assert(R.array == correct);
}

/// Subtract packed 32-bit integers in `b` from packed 32-bit integers in `a`.
__m64 _mm_sub_pi32 (__m64 a, __m64 b) pure @safe
{
    return cast(__m64)(cast(int2)a - cast(int2)b);
}
unittest
{
    int2 R = cast(int2) _mm_sub_pi32(_mm_setr_pi32(10,   4),
                                     _mm_setr_pi32(15, -70));
    static immutable int[2] correct = [ -5, 74 ];
    assert(R.array == correct);
}

/// Subtract packed 8-bit integers in `b` from packed 8-bit integers in `a`.
__m64 _mm_sub_pi8 (__m64 a, __m64 b) pure @safe
{
    return cast(__m64)(cast(byte8)a - cast(byte8)b);
}
unittest
{
    byte8 R = cast(byte8) _mm_sub_pi8(_mm_setr_pi8(cast(byte)254, 127, 13, 12, 11, 10, 9, -128),
                                      _mm_setr_pi8(cast(byte)255, 120, 14, 42, 11, 10, 9,    8));
    static immutable byte[8] correct = [ -1, 7, -1, -30, 0, 0, 0, 120 ];
    assert(R.array == correct);
}

/// Subtract packed 16-bit integers in `b` from packed 16-bit integers in `a` using saturation.
__m64 _mm_subs_pi16 (__m64 a, __m64 b) pure @safe
{
    return to_m64(_mm_subs_epi16(to_m128i(a), to_m128i(b)));
}
unittest
{
    short4 R = cast(short4) _mm_subs_pi16(_mm_setr_pi16(cast(short)65534,  1, 5, -32768),
                                          _mm_setr_pi16(cast(short)65535, 16, 4,      4));
    static immutable short[4] correct = [ -1, -15, 1, -32768 ];
    assert(R.array == correct);
}

/// Subtract packed 8-bit integers in `b` from packed 8-bit integers in `a` using saturation.
__m64 _mm_subs_pi8 (__m64 a, __m64 b) pure @safe
{
    return to_m64(_mm_subs_epi8(to_m128i(a), to_m128i(b)));
}
unittest
{
    byte8 R = cast(byte8) _mm_subs_pi8(_mm_setr_pi8(cast(byte)254, 127, 13, 12, 11, 10, 9, -128),
                                       _mm_setr_pi8(cast(byte)255, 120, 14, 42, 11, 10, 9,    8));
    static immutable byte[8] correct = [ -1, 7, -1, -30, 0, 0, 0, -128 ];
    assert(R.array == correct);
}

/// Subtract packed unsigned 16-bit integers in `b` from packed unsigned 16-bit integers in `a`
/// using saturation.
__m64 _mm_subs_pu16 (__m64 a, __m64 b) pure @safe
{
    return to_m64(_mm_subs_epu16(to_m128i(a), to_m128i(b)));
}
unittest
{
    short4 R = cast(short4) _mm_subs_pu16(_mm_setr_pi16(cast(short)65534,  1, 5, 4),
                                          _mm_setr_pi16(cast(short)65535, 16, 4, 4));
    static immutable short[4] correct = [ 0, 0, 1, 0 ];
    assert(R.array == correct);
}

/// Subtract packed unsigned 8-bit integers in `b` from packed unsigned 8-bit integers in `a`
/// using saturation.
__m64 _mm_subs_pu8 (__m64 a, __m64 b) pure @safe
{
    return to_m64(_mm_subs_epu8(to_m128i(a), to_m128i(b)));
}
unittest
{
    byte8 R = cast(byte8) _mm_subs_pu8(_mm_setr_pi8(cast(byte)254, 127, 13, 12, 11, 10, 9, 8),
                                       _mm_setr_pi8(cast(byte)255, 120, 14, 42, 11, 10, 9, 8));
    static immutable byte[8] correct = [ 0, 7, 0, 0, 0, 0, 0, 0 ];
    assert(R.array == correct);
}

deprecated alias _m_to_int = _mm_cvtsi64_si32;  /// Deprecated intrinsics.
deprecated alias _m_to_int64 = _mm_cvtm64_si64; ///ditto

/// Unpack and interleave 16-bit integers from the high half of `a` and `b`.
__m64 _mm_unpackhi_pi16 (__m64 a, __m64 b) pure @trusted
{
    version(LDC)
    {
        // avoiding this shufflevector leads to bad performance on LDC
        return cast(__m64) shufflevector!(short4, 2, 6, 3, 7)(cast(short4)a, cast(short4)b);
    }
    else
    {
        short4 ia = cast(short4)a;
        short4 ib = cast(short4)b;
        short4 r;
        r.ptr[0] = ia.array[2];
        r.ptr[1] = ib.array[2];
        r.ptr[2] = ia.array[3];
        r.ptr[3] = ib.array[3];
        return cast(__m64)r;
    }
}
unittest
{
    __m64 A = _mm_setr_pi16(4, 8, -16, 7);
    __m64 B = _mm_setr_pi16(5, 9, -3, 10);
    short4 R = cast(short4) _mm_unpackhi_pi16(A, B);
    short[4] correct = [-16, -3, 7, 10];
    assert(R.array == correct);
}

/// Unpack and interleave 32-bit integers from the high half of `a` and `b`.
__m64 _mm_unpackhi_pi32 (__m64 a, __m64 b) pure @trusted
{
    // Generate punpckldq as far back as LDC 1.0.0 -O1
    // (Yes, LLVM does generate punpckldq to reuse SSE2 instructions)
    int2 ia = cast(int2)a;
    int2 ib = cast(int2)b;
    int2 r;
    r.ptr[0] = ia.array[1];
    r.ptr[1] = ib.array[1];
    return cast(__m64)r;
}
unittest
{
    __m64 A = _mm_setr_pi32(4, 8);
    __m64 B = _mm_setr_pi32(5, 9);
    int2 R = cast(int2) _mm_unpackhi_pi32(A, B);
    int[2] correct = [8, 9];
    assert(R.array == correct);
}

/// Unpack and interleave 8-bit integers from the high half of `a` and `b`.
__m64 _mm_unpackhi_pi8 (__m64 a, __m64 b)
{
    version(LDC)
    {
        return cast(__m64) shufflevector!(byte8, 4, 12, 5, 13, 6, 14, 7, 15)(cast(byte8)a, cast(byte8)b);
    }
    else
    {
        byte8 ia = cast(byte8)a;
        byte8 ib = cast(byte8)b;
        byte8 r;
        r.ptr[0] = ia.array[4];
        r.ptr[1] = ib.array[4];
        r.ptr[2] = ia.array[5];
        r.ptr[3] = ib.array[5];
        r.ptr[4] = ia.array[6];
        r.ptr[5] = ib.array[6];
        r.ptr[6] = ia.array[7];
        r.ptr[7] = ib.array[7];
        return cast(__m64)r;
    }
}
unittest
{
    __m64 A = _mm_setr_pi8( 1,  2,  3,  4,  5,  6,  7,  8);
    __m64 B = _mm_setr_pi8(-1, -2, -3, -4, -5, -6, -7, -8);
    byte8 R = cast(byte8) _mm_unpackhi_pi8(A, B);
    byte[8] correct = [5, -5, 6, -6, 7, -7, 8, -8];
    assert(R.array == correct);
}

/// Unpack and interleave 16-bit integers from the low half of `a` and `b`.
__m64 _mm_unpacklo_pi16 (__m64 a, __m64 b)
{
    // Generates punpcklwd since LDC 1.0.0 -O1
    short4 ia = cast(short4)a;
    short4 ib = cast(short4)b;
    short4 r;
    r.ptr[0] = ia.array[0];
    r.ptr[1] = ib.array[0];
    r.ptr[2] = ia.array[1];
    r.ptr[3] = ib.array[1];
    return cast(__m64)r;
}
unittest
{
    __m64 A = _mm_setr_pi16(4, 8, -16, 7);
    __m64 B = _mm_setr_pi16(5, 9, -3, 10);
    short4 R = cast(short4) _mm_unpacklo_pi16(A, B);
    short[4] correct = [4, 5, 8, 9];
    assert(R.array == correct);
}

/// Unpack and interleave 32-bit integers from the low half of `a` and `b`.
__m64 _mm_unpacklo_pi32 (__m64 a, __m64 b) pure @trusted
{
    // x86: Generate punpckldq as far back as LDC 1.0.0 -O1
    // ARM: Generate zip as far back as LDC 1.8.0 -O1
    int2 ia = cast(int2)a;
    int2 ib = cast(int2)b;
    int2 r;
    r.ptr[0] = ia.array[0];
    r.ptr[1] = ib.array[0];
    return cast(__m64)r;
}
unittest
{
    __m64 A = _mm_setr_pi32(4, 8);
    __m64 B = _mm_setr_pi32(5, 9);
    int2 R = cast(int2) _mm_unpacklo_pi32(A, B);
    int[2] correct = [4, 5];
    assert(R.array == correct);
}

/// Unpack and interleave 8-bit integers from the low half of `a` and `b`.
__m64 _mm_unpacklo_pi8 (__m64 a, __m64 b)
{
    version(LDC)
    {
        return cast(__m64) shufflevector!(byte8, 0, 8, 1, 9, 2, 10, 3, 11)(cast(byte8)a, cast(byte8)b);
    }
    else
    {
        byte8 ia = cast(byte8)a;
        byte8 ib = cast(byte8)b;
        byte8 r;
        r.ptr[0] = ia.array[0];
        r.ptr[1] = ib.array[0];
        r.ptr[2] = ia.array[1];
        r.ptr[3] = ib.array[1];
        r.ptr[4] = ia.array[2];
        r.ptr[5] = ib.array[2];
        r.ptr[6] = ia.array[3];
        r.ptr[7] = ib.array[3];
        return cast(__m64)r;
    }
}
unittest
{
    __m64 A = _mm_setr_pi8( 1,  2,  3,  4,  5,  6,  7,  8);
    __m64 B = _mm_setr_pi8(-1, -2, -3, -4, -5, -6, -7, -8);
    byte8 R = cast(byte8) _mm_unpacklo_pi8(A, B);
    byte[8] correct = [1, -1, 2, -2, 3, -3, 4, -4];
    assert(R.array == correct);
}

/// Compute the bitwise XOR of 64 bits (representing integer data) in `a` and `b`.
__m64 _mm_xor_si64 (__m64 a, __m64 b)
{
    return a ^ b;
}
unittest
{
    __m64 A = _mm_setr_pi16(255, 1, -1, 0);
    __m64 B = _mm_set1_pi16(15);
    short4 R = cast(short4)_mm_xor_si64(A, B);
    short[4] correct = [240, 14, -16, 15];
    assert(R.array == correct);
}
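
// Note: the following unittest is an illustrative sketch combining several intrinsics above:
// unsigned bytes are zero-extended to 16-bit lanes with the unpack intrinsics, then narrowed
// back with unsigned saturation via _mm_packs_pu16.
unittest
{
    __m64 bytes = _mm_setr_pi8(0, 1, 2, 3, 4, 5, 6, 7);
    __m64 zero  = _mm_setzero_si64();
    __m64 lo16  = _mm_unpacklo_pi8(bytes, zero); // 16-bit lanes [0, 1, 2, 3]
    __m64 hi16  = _mm_unpackhi_pi8(bytes, zero); // 16-bit lanes [4, 5, 6, 7]
    byte8 back  = cast(byte8) _mm_packs_pu16(lo16, hi16);
    byte[8] expected = [0, 1, 2, 3, 4, 5, 6, 7];
    assert(back.array == expected);
}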