/**
* MMX intrinsics.
* https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#techs=MMX
*
* Copyright: Copyright Guillaume Piolat 2019-2020.
* License:   $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
*/
module inteli.mmx;

public import inteli.types;
import inteli.internals;

import inteli.xmmintrin;
import inteli.emmintrin;

nothrow @nogc:

// Important: you don't need to call _mm_empty when using "MMX" capabilities of intel-intrinsics,
// since it just generates the right IR and cleaning-up FPU registers is up to the codegen.
// intel-intrinsics is just semantics.
// Even GDC does not seem to use mm0-mm7 registers, instead preferring xmm0-xmm7.


/// Add packed 16-bit integers in `a` and `b`.
/// This is the plain (non-saturating) add; see `_mm_adds_pi16` for the saturating variant.
__m64 _mm_add_pi16 (__m64 a, __m64 b) pure @safe
{
    // Reinterpret both operands as four 16-bit lanes and add lane-wise.
    return cast(__m64)(cast(short4)a + cast(short4)b);
}
unittest
{
    short4 R = cast(short4) _mm_add_pi16(_mm_set1_pi16(4), _mm_set1_pi16(3));
    short[4] correct = [7, 7, 7, 7];
    assert(R.array == correct);
}

/// Add packed 32-bit integers in `a` and `b`.
__m64 _mm_add_pi32 (__m64 a, __m64 b) pure @safe
{
    // Reinterpret both operands as two 32-bit lanes and add lane-wise.
    return cast(__m64)(cast(int2)a + cast(int2)b);
}
unittest
{
    int2 R = cast(int2) _mm_add_pi32(_mm_set1_pi32(4), _mm_set1_pi32(3));
    int[2] correct = [7, 7];
    assert(R.array == correct);
}

/// Add packed 8-bit integers in `a` and `b`.
/// This is the plain (non-saturating) add; see `_mm_adds_pi8` for the saturating variant.
__m64 _mm_add_pi8 (__m64 a, __m64 b) pure @safe
{
    // Reinterpret both operands as eight 8-bit lanes and add lane-wise.
    return cast(__m64)(cast(byte8)a + cast(byte8)b);
}
unittest
{
    byte8 R = cast(byte8) _mm_add_pi8(_mm_set1_pi8(127), _mm_set1_pi8(-128));
    byte[8] correct = [-1, -1, -1, -1, -1, -1, -1, -1];
    assert(R.array == correct);
}

/// Add packed 16-bit integers in `a` and `b` using signed saturation.
// PERF: PADDSW not generated
__m64 _mm_adds_pi16(__m64 a, __m64 b) pure @trusted
{
    // Delegate to the 128-bit intrinsic through the to_m128i / to_m64 helpers.
    auto wideA = to_m128i(a);
    auto wideB = to_m128i(b);
    auto wideSum = _mm_adds_epi16(wideA, wideB);
    return to_m64(wideSum);
}
unittest
{
    __m64 lhs = _mm_set_pi16(3, 2, 1, 0);
    __m64 rhs = _mm_set_pi16(3, 2, 1, 0);
    short4 sum = cast(short4) _mm_adds_pi16(lhs, rhs);
    static immutable short[4] expected = [0, 2, 4, 6];
    assert(sum.array == expected);
}

/// Saturating signed addition of the packed 8-bit integers in `a` and `b`.
// PERF: PADDSB not generated
__m64 _mm_adds_pi8(__m64 a, __m64 b) pure @trusted
{
    // Delegate to the 128-bit intrinsic through the to_m128i / to_m64 helpers.
    auto wideA = to_m128i(a);
    auto wideB = to_m128i(b);
    auto wideSum = _mm_adds_epi8(wideA, wideB);
    return to_m64(wideSum);
}
unittest
{
    __m64 lhs = _mm_set_pi8(7, 6, 5, 4, 3, 2, 1, 0);
    __m64 rhs = _mm_set_pi8(7, 6, 5, 4, 3, 2, 1, 0);
    byte8 sum = cast(byte8) _mm_adds_pi8(lhs, rhs);
    static immutable byte[8] expected = [0, 2, 4, 6, 8, 10, 12, 14];
    assert(sum.array == expected);
}

/// Saturating unsigned addition of the packed 16-bit integers in `a` and `b`.
// PERF: PADDUSW not generated
__m64 _mm_adds_pu16(__m64 a, __m64 b) pure @trusted
{
    // Delegate to the 128-bit intrinsic through the to_m128i / to_m64 helpers.
    auto wideA = to_m128i(a);
    auto wideB = to_m128i(b);
    auto wideSum = _mm_adds_epu16(wideA, wideB);
    return to_m64(wideSum);
}
unittest
{
    // Lane 1 holds 65535 + 1, which must clamp to 65535 instead of wrapping.
    __m64 lhs = _mm_set_pi16(3, 2, cast(short)65535, 0);
    __m64 rhs = _mm_set_pi16(3, 2, 1, 0);
    short4 sum = cast(short4) _mm_adds_pu16(lhs, rhs);
    static immutable short[4] expected = [0, cast(short)65535, 4, 6];
    assert(sum.array == expected);
}

/// Saturating unsigned addition of the packed 8-bit integers in `a` and `b`.
// PERF: PADDUSB not generated
__m64 _mm_adds_pu8(__m64 a, __m64 b) pure @trusted
{
    // Delegate to the 128-bit intrinsic through the to_m128i / to_m64 helpers.
    auto wideA = to_m128i(a);
    auto wideB = to_m128i(b);
    auto wideSum = _mm_adds_epu8(wideA, wideB);
    return to_m64(wideSum);
}
unittest
{
    // Lane 1 holds 255 + 1, which must clamp to 255 instead of wrapping.
    __m64 lhs = _mm_set_pi8(7, 6, 5, 4, 3, 2, cast(byte)255, 0);
    __m64 rhs = _mm_set_pi8(7, 6, 5, 4, 3, 2, 1, 0);
    byte8 sum = cast(byte8) _mm_adds_pu8(lhs, rhs);
    static immutable byte[8] expected = [0, cast(byte)255, 4, 6, 8, 10, 12, 14];
    assert(sum.array == expected);
}

/// Compute the bitwise AND of 64 bits (representing integer data) in `a` and `b`.
__m64 _mm_and_si64 (__m64 a, __m64 b) pure @safe
{
    return a & b;
}
unittest
{
    __m64 A = [7];
    __m64 B = [14];
    __m64 R = _mm_and_si64(A, B);
    assert(R.array[0] == 6);
}

/// Compute the bitwise NOT of 64 bits (representing integer data) in `a` and then AND with `b`.
/// Note the operand order: it is `a` that gets inverted, not `b`.
__m64 _mm_andnot_si64 (__m64 a, __m64 b) pure @safe
{
    return (~a) & b;
}
unittest
{
    __m64 A = [7];
    __m64 B = [14];
    __m64 R = _mm_andnot_si64(A, B);
    assert(R.array[0] == 8);
}

/// Compare packed 16-bit integers in `a` and `b` for equality.
/// Each result lane is all ones (-1) where the lanes are equal, and 0 otherwise.
__m64 _mm_cmpeq_pi16 (__m64 a, __m64 b) pure @safe
{
    static if (GDC_with_MMX)
    {
        return cast(__m64) __builtin_ia32_pcmpeqw(cast(short4)a, cast(short4)b);
    }
    else
    {
        return cast(__m64) equalMask!short4(cast(short4)a, cast(short4)b);
    }
}
unittest
{
    short4 A = [-3, -2, -1,  0];
    short4 B = [ 4,  3,  2,  1];
    short[4] E = [ 0,  0,  0,  0];
    short4 R = cast(short4)(_mm_cmpeq_pi16(cast(__m64)A, cast(__m64)B));
    assert(R.array == E);
}

/// Compare packed 32-bit integers in `a` and `b` for equality.
__m64 _mm_cmpeq_pi32 (__m64 a, __m64 b) pure @safe
{
    // Both code paths operate on the operands viewed as two 32-bit lanes.
    auto lhs = cast(int2)a;
    auto rhs = cast(int2)b;
    static if (GDC_with_MMX)
        return cast(__m64) __builtin_ia32_pcmpeqd(lhs, rhs);
    else
        return cast(__m64) equalMask!int2(lhs, rhs);
}
unittest
{
    int2 lhs = [-3, -2];
    int2 rhs = [ 4, -2];
    int[2] expected = [ 0, -1];
    int2 mask = cast(int2)(_mm_cmpeq_pi32(cast(__m64)lhs, cast(__m64)rhs));
    assert(mask.array == expected);
}

/// Compare packed 8-bit integers in `a` and `b` for equality.
/// Equal lanes yield -1 (all bits set), differing lanes yield 0.
__m64 _mm_cmpeq_pi8 (__m64 a, __m64 b) pure @safe
{
    static if (GDC_with_MMX)
    {
        // The GDC builtin is fed unsigned byte lanes.
        auto lhs = cast(ubyte8)a;
        auto rhs = cast(ubyte8)b;
        return cast(__m64) __builtin_ia32_pcmpeqb(lhs, rhs);
    }
    else
    {
        auto lhs = cast(byte8)a;
        auto rhs = cast(byte8)b;
        return cast(__m64) equalMask!byte8(lhs, rhs);
    }
}
unittest
{
    __m64 lhs = _mm_setr_pi8(1, 2, 3, 1, 2, 1, 1, 2);
    __m64 rhs = _mm_setr_pi8(2, 2, 1, 2, 3, 1, 2, 3);
    byte8 mask = cast(byte8) _mm_cmpeq_pi8(lhs, rhs);
    byte[8] expected = [0,-1, 0, 0, 0,-1, 0, 0];
    assert(mask.array == expected);
}

/// Compare packed 16-bit integers in `a` and `b` for greater-than.
/// A lane is -1 (all bits set) where `a` is greater than `b`, and 0 otherwise.
__m64 _mm_cmpgt_pi16 (__m64 a, __m64 b) pure @safe
{
    // Both code paths operate on the operands viewed as four 16-bit lanes.
    auto lhs = cast(short4)a;
    auto rhs = cast(short4)b;
    static if (GDC_with_MMX)
        return cast(__m64) __builtin_ia32_pcmpgtw (lhs, rhs);
    else
        return cast(__m64) greaterMask!short4(lhs, rhs);
}
unittest
{
    short4 lhs = [-3, -2, -1,  0];
    short4 rhs = [ 4,  3,  2,  1];
    short[4] expected = [ 0,  0,  0,  0];
    short4 mask = cast(short4)(_mm_cmpgt_pi16(cast(__m64)lhs, cast(__m64)rhs));
    assert(mask.array == expected);
}

/// Compare packed 32-bit integers in `a` and `b` for greater-than.
__m64 _mm_cmpgt_pi32 (__m64 a, __m64 b) pure @safe
{
    static if (GDC_with_MMX)
    {
        // Must be the 32-bit compare (PCMPGTD); the 16-bit PCMPGTW would compare
        // each half of a 32-bit lane independently and produce wrong masks.
        return cast(__m64) __builtin_ia32_pcmpgtd (cast(int2)a, cast(int2)b);
    }
    else
    {
        return cast(__m64) greaterMask!int2(cast(int2)a, cast(int2)b);
    }
}
unittest
{
    int2 A = [-3,  2];
    int2 B = [ 4, -2];
    int[2] E = [ 0, -1];
    int2 R = cast(int2)(_mm_cmpgt_pi32(cast(__m64)A, cast(__m64)B));
    assert(R.array == E);
}
unittest
{
    // Lane 0: 0x10000 > 1 as 32-bit values, but its low 16 bits (0) are not
    // greater than 1, so a 16-bit compare would yield 0xFFFF0000 instead of -1.
    int2 A = [0x0001_0000, 0];
    int2 B = [0x0000_0001, 0];
    int[2] E = [-1, 0];
    int2 R = cast(int2)(_mm_cmpgt_pi32(cast(__m64)A, cast(__m64)B));
    assert(R.array == E);
}

/// Compare packed 8-bit integers in `a` and `b` for greater-than.
/// A lane is -1 (all bits set) where `a` is greater than `b`, and 0 otherwise.
__m64 _mm_cmpgt_pi8 (__m64 a, __m64 b) pure @safe
{
    static if (GDC_with_MMX)
    {
        return cast(__m64) __builtin_ia32_pcmpgtb (cast(ubyte8)a, cast(ubyte8)b);
    }
    else
    {
        return cast(__m64) greaterMask!byte8(cast(byte8)a, cast(byte8)b);
    }
}
unittest
{
    __m64 A = _mm_setr_pi8(1, 2, 3, 1, 2, 1, 1, 2);
    __m64 B = _mm_setr_pi8(2, 2, 1, 2, 3, 1, 2, 3);
    byte8 C = cast(byte8) _mm_cmpgt_pi8(A, B);
    byte[8] correct = [0, 0,-1, 0, 0, 0, 0, 0];
    assert(C.array == correct);
}

/// Copy 64-bit integer `a` to `dst`.
/// Returns: the single 64-bit element of `a`.
long _mm_cvtm64_si64 (__m64 a) pure @safe
{
    return a.array[0];
}
unittest
{
    // Exercise the intrinsic itself: lane 0 is 2 (low half), lane 1 is 1 (high half).
    __m64 A = _mm_setr_pi32(2, 1);
    assert(_mm_cvtm64_si64(A) == 0x100000002);
}

/// Copy 32-bit integer `a` to the lower elements of `dst`, and zero the upper element of `dst`.
__m64 _mm_cvtsi32_si64 (int a) pure @trusted
{
    __m64 r = void;
    // Go through `uint` so the value is zero-extended to 64 bits; assigning the
    // `int` directly would sign-extend and fill the upper element with ones
    // for negative inputs, contradicting the documented behaviour.
    r.ptr[0] = cast(uint)a;
    return r;
}
unittest
{
    __m64 R = _mm_cvtsi32_si64(-1);
    assert(R.array[0] == 0xFFFF_FFFFL); // upper 32 bits must be zero
    __m64 P = _mm_cvtsi32_si64(42);
    assert(P.array[0] == 42);
}

/// Copy 64-bit integer `a` to `dst`.
__m64 _mm_cvtsi64_m64 (long a) pure @trusted
{
    // `void`-initialised because the only element is written right away.
    __m64 r = void;
    r.ptr[0] = a;
    return r;
}
unittest
{
    __m64 R = _mm_cvtsi64_m64(0x123456789A);
    assert(R.array[0] == 0x123456789A);
}

/// Get the lower 32-bit integer in `a`.
int _mm_cvtsi64_si32 (__m64 a) pure @safe
{
    // View the 64 bits as two 32-bit lanes and return lane 0.
    int2 lanes = cast(int2)a;
    return lanes.array[0];
}
unittest
{
    __m64 input = _mm_setr_pi32(-6, 5);
    int low = _mm_cvtsi64_si32(input);
    assert(low == -6);
}

/// Empty the MMX state, which marks the x87 FPU registers as available for
/// use by x87 instructions.
/// This instruction is supposed to be used at the end of all MMX technology procedures.
/// With `intel-intrinsics` it is not needed, whatever the D compiler.
void _mm_empty() pure @safe
{
    // Intentionally a no-op; the rationale is in the comment at the top of this file.
}


deprecated alias _m_empty = _mm_empty; /// Deprecated intrinsics.
deprecated alias _m_from_int = _mm_cvtsi32_si64; ///ditto
deprecated alias _m_from_int64 = _mm_cvtsi64_m64; ///ditto

/// Multiply packed 16-bit integers in `a` and `b`, producing intermediate 32-bit integers.
/// Horizontally add adjacent pairs of intermediate 32-bit integers.
__m64 _mm_madd_pi16 (__m64 a, __m64 b) pure @safe
{
    // Delegate to the 128-bit intrinsic through the to_m128i / to_m64 helpers.
    auto wideA = to_m128i(a);
    auto wideB = to_m128i(b);
    auto wideResult = _mm_madd_epi16(wideA, wideB);
    return to_m64(wideResult);
}
unittest
{
    short4 lhs = [-32768, -32768, 32767, 32767];
    short4 rhs = [-32768, -32768, 32767, 32767];
    int2 sums = cast(int2) _mm_madd_pi16(cast(__m64)lhs, cast(__m64)rhs);
    int[2] expected = [-2147483648, 2*32767*32767];
    assert(sums.array == expected);
}

/// Multiply the packed 16-bit integers in `a` and `b`, producing intermediate 32-bit integers,
/// and store the high 16 bits of the intermediate integers.
__m64 _mm_mulhi_pi16 (__m64 a, __m64 b) pure @safe
{
    // Delegate to the 128-bit intrinsic through the to_m128i / to_m64 helpers.
    auto wideA = to_m128i(a);
    auto wideB = to_m128i(b);
    auto wideProduct = _mm_mulhi_epi16(wideA, wideB);
    return to_m64(wideProduct);
}
unittest
{
    __m64 lhs = _mm_setr_pi16(4, 8, -16, 7);
    __m64 rhs = _mm_set1_pi16(16384);
    short4 high = cast(short4)_mm_mulhi_pi16(lhs, rhs);
    short[4] expected = [1, 2, -4, 1];
    assert(high.array == expected);
}

/// Multiply the packed 16-bit integers in `a` and `b`, producing intermediate 32-bit integers,
/// and store the low 16 bits of the intermediate integers.
__m64 _mm_mullo_pi16 (__m64 a, __m64 b) pure @safe
{
    // Delegate to the 128-bit intrinsic through the to_m128i / to_m64 helpers.
    auto wideA = to_m128i(a);
    auto wideB = to_m128i(b);
    auto wideProduct = _mm_mullo_epi16(wideA, wideB);
    return to_m64(wideProduct);
}
unittest
{
    __m64 lhs = _mm_setr_pi16(4, 1, 16, 7);
    __m64 rhs = _mm_set1_pi16(16384);
    short4 low = cast(short4)_mm_mullo_pi16(lhs, rhs);
    short[4] expected = [0, 16384, 0, -16384];
    assert(low.array == expected);
}

/// Compute the bitwise OR of 64 bits in `a` and `b`.
__m64 _mm_or_si64 (__m64 a, __m64 b) pure @safe
{
    __m64 merged = a | b;
    return merged;
}
unittest
{
    __m64 lhs = _mm_setr_pi16(255, 1, -1, 0);
    __m64 rhs = _mm_set1_pi16(15);
    short4 merged = cast(short4)_mm_or_si64(lhs, rhs);
    short[4] expected = [255, 15, -1, 15];
    assert(merged.array == expected);
}

/// Convert packed 16-bit integers from `a` and `b` to packed 8-bit integers using signed saturation.
__m64 _mm_packs_pi16 (__m64 a, __m64 b) pure @trusted
{
    auto wideA = to_m128i(a);
    auto wideB = to_m128i(b);
    int4 packed = cast(int4) _mm_packs_epi16(wideA, wideB);

    // Keep 32-bit lanes 0 and 2 of the 128-bit result; presumably these carry the
    // bytes packed from `a` and `b` respectively (consistent with the unittest below).
    int2 result;
    result.ptr[0] = packed.array[0];
    result.ptr[1] = packed.array[2];
    return cast(__m64)result;
}
unittest
{
    __m64 input = _mm_setr_pi16(256, -129, 254, 0);
    byte8 narrowed = cast(byte8) _mm_packs_pi16(input, input);
    byte[8] expected = [127, -128, 127, 0, 127, -128, 127, 0];
    assert(narrowed.array == expected);
}

/// Convert packed 32-bit integers from `a` and `b` to packed 16-bit integers using signed saturation.
__m64 _mm_packs_pi32 (__m64 a, __m64 b) pure @trusted
{
    auto wideA = to_m128i(a);
    auto wideB = to_m128i(b);
    int4 packed = cast(int4) _mm_packs_epi32(wideA, wideB);

    // Keep 32-bit lanes 0 and 2 of the 128-bit result; presumably these carry the
    // values packed from `a` and `b` respectively (consistent with the unittest below).
    int2 result;
    result.ptr[0] = packed.array[0];
    result.ptr[1] = packed.array[2];
    return cast(__m64)result;
}
unittest
{
    __m64 input = _mm_setr_pi32(100000, -100000);
    short4 narrowed = cast(short4) _mm_packs_pi32(input, input);
    short[4] expected = [32767, -32768, 32767, -32768];
    assert(narrowed.array == expected);
}

/// Convert packed 16-bit integers from `a` and `b` to packed 8-bit integers using unsigned saturation.
__m64 _mm_packs_pu16 (__m64 a, __m64 b) pure @trusted
{
    auto wideA = to_m128i(a);
    auto wideB = to_m128i(b);
    int4 packed = cast(int4) _mm_packus_epi16(wideA, wideB);

    // Keep 32-bit lanes 0 and 2 of the 128-bit result; presumably these carry the
    // bytes packed from `a` and `b` respectively (consistent with the unittest below).
    int2 result;
    result.ptr[0] = packed.array[0];
    result.ptr[1] = packed.array[2];
    return cast(__m64)result;
}
unittest
{
    __m64 input = _mm_setr_pi16(256, -129, 254, 0);
    byte8 narrowed = cast(byte8) _mm_packs_pu16(input, input);
    ubyte[8] expected = [255, 0, 254, 0, 255, 0, 254, 0];
    assert(narrowed.array == cast(byte[8])expected);
}

deprecated alias
    _m_packssdw = _mm_packs_pi32,     /// Deprecated intrinsics.
    _m_packsswb = _mm_packs_pi16,     ///ditto
    _m_packuswb = _mm_packs_pu16,     ///ditto
    _m_paddb = _mm_add_pi8,           ///ditto
    _m_paddd = _mm_add_pi32,          ///ditto
    _m_paddsb = _mm_adds_pi8,         ///ditto
    _m_paddsw = _mm_adds_pi16,        ///ditto
    _m_paddusb = _mm_adds_pu8,        ///ditto
    _m_paddusw = _mm_adds_pu16,       ///ditto
    _m_paddw = _mm_add_pi16,          ///ditto
    _m_pand = _mm_and_si64,           ///ditto
    _m_pandn = _mm_andnot_si64,       ///ditto
    _m_pcmpeqb = _mm_cmpeq_pi8,       ///ditto
    _m_pcmpeqd = _mm_cmpeq_pi32,      ///ditto
    _m_pcmpeqw = _mm_cmpeq_pi16,      ///ditto
    _m_pcmpgtb = _mm_cmpgt_pi8,       ///ditto
    _m_pcmpgtd = _mm_cmpgt_pi32,      ///ditto
    _m_pcmpgtw = _mm_cmpgt_pi16,      ///ditto
    _m_pmaddwd = _mm_madd_pi16,       ///ditto
    _m_pmulhw = _mm_mulhi_pi16,       ///ditto
    _m_pmullw = _mm_mullo_pi16,       ///ditto
    _m_por = _mm_or_si64,             ///ditto
    _m_pslld = _mm_sll_pi32,          ///ditto
    _m_pslldi = _mm_slli_pi32,        ///ditto
    _m_psllq = _mm_sll_si64,          ///ditto
    _m_psllqi = _mm_slli_si64,        ///ditto
    _m_psllw = _mm_sll_pi16,          ///ditto
    _m_psllwi = _mm_slli_pi16,        ///ditto
    _m_psrad = _mm_sra_pi32,          ///ditto
    _m_psradi = _mm_srai_pi32,        ///ditto
    _m_psraw = _mm_sra_pi16,          ///ditto
    _m_psrawi = _mm_srai_pi16,        ///ditto
    _m_psrld = _mm_srl_pi32,          ///ditto
    _m_psrldi = _mm_srli_pi32,        ///ditto
    _m_psrlq = _mm_srl_si64,          ///ditto
    _m_psrlqi = _mm_srli_si64,        ///ditto
    _m_psrlw = _mm_srl_pi16,          ///ditto
    _m_psrlwi = _mm_srli_pi16,        ///ditto
    _m_psubb = _mm_sub_pi8,           ///ditto
    _m_psubd = _mm_sub_pi32,          ///ditto
    _m_psubsb = _mm_subs_pi8,         ///ditto
    _m_psubsw = _mm_subs_pi16,        ///ditto
    _m_psubusb = _mm_subs_pu8,        ///ditto
    _m_psubusw = _mm_subs_pu16,       ///ditto
    _m_psubw = _mm_sub_pi16,          ///ditto
    _m_punpckhbw = _mm_unpackhi_pi8,  ///ditto
    _m_punpckhdq = _mm_unpackhi_pi32, ///ditto
    _m_punpckhwd = _mm_unpackhi_pi16, ///ditto
    _m_punpcklbw = _mm_unpacklo_pi8,  ///ditto
    _m_punpckldq = _mm_unpacklo_pi32, ///ditto
    _m_punpcklwd = _mm_unpacklo_pi16, ///ditto
    _m_pxor = _mm_xor_si64;           ///ditto

/// Set packed 16-bit integers with the supplied values.
/// `e0` lands in lane 0 (lowest), `e3` in lane 3.
__m64 _mm_set_pi16 (short e3, short e2, short e1, short e0) pure @trusted
{
    // Fill the lanes one by one rather than reinterpreting a local array.
    short4 lanes;
    lanes.ptr[0] = e0;
    lanes.ptr[1] = e1;
    lanes.ptr[2] = e2;
    lanes.ptr[3] = e3;
    return cast(__m64)lanes;
}
unittest
{
    short4 lanes = cast(short4) _mm_set_pi16(3, 2, 1, 0);
    short[4] expected = [0, 1, 2, 3];
    assert(lanes.array == expected);
}

/// Set packed 32-bit integers with the supplied values.
/// `e0` lands in lane 0 (lowest), `e1` in lane 1.
__m64 _mm_set_pi32 (int e1, int e0) pure @trusted
{
    // Fill the lanes one by one rather than reinterpreting a local array.
    int2 lanes;
    lanes.ptr[0] = e0;
    lanes.ptr[1] = e1;
    return cast(__m64)lanes;
}
unittest
{
    int2 lanes = cast(int2) _mm_set_pi32(1, 0);
    int[2] expected = [0, 1];
    assert(lanes.array == expected);
}

/// Set packed 8-bit integers with the supplied values.
/// `e0` lands in lane 0 (lowest), `e7` in lane 7.
__m64 _mm_set_pi8 (byte e7, byte e6, byte e5, byte e4, byte e3, byte e2, byte e1, byte e0) pure @trusted
{
    // Fill the lanes one by one rather than reinterpreting a local array.
    byte8 lanes;
    lanes.ptr[0] = e0;
    lanes.ptr[1] = e1;
    lanes.ptr[2] = e2;
    lanes.ptr[3] = e3;
    lanes.ptr[4] = e4;
    lanes.ptr[5] = e5;
    lanes.ptr[6] = e6;
    lanes.ptr[7] = e7;
    return cast(__m64)lanes;
}
unittest
{
    byte8 lanes = cast(byte8) _mm_set_pi8(7, 6, 5, 4, 3, 2, 1, 0);
    byte[8] expected = [0, 1, 2, 3, 4, 5, 6, 7];
    assert(lanes.array == expected);
}

/// Broadcast 16-bit integer `a` to all elements.
__m64 _mm_set1_pi16 (short a) pure @trusted
{
    auto splat = short4(a);
    return cast(__m64)splat;
}
unittest
{
    short4 lanes = cast(short4) _mm_set1_pi16(44);
    short[4] expected = [44, 44, 44, 44];
    assert(lanes.array == expected);
}

/// Broadcast 32-bit integer `a` to all elements.
__m64 _mm_set1_pi32 (int a) pure @trusted
{
    auto splat = int2(a);
    return cast(__m64)splat;
}
unittest
{
    int2 lanes = cast(int2) _mm_set1_pi32(43);
    int[2] expected = [43, 43];
    assert(lanes.array == expected);
}

/// Broadcast 8-bit integer `a` to all elements.
__m64 _mm_set1_pi8 (byte a) pure @trusted
{
    auto splat = byte8(a);
    return cast(__m64)splat;
}
unittest
{
    byte8 lanes = cast(byte8) _mm_set1_pi8(42);
    byte[8] expected = [42, 42, 42, 42, 42, 42, 42, 42];
    assert(lanes.array == expected);
}

/// Set packed 16-bit integers with the supplied values in reverse order.
/// The first argument (`e3`) lands in lane 0, the last (`e0`) in lane 3.
__m64 _mm_setr_pi16 (short e3, short e2, short e1, short e0) pure @trusted
{
    // Fill the lanes one by one rather than reinterpreting a local array.
    short4 lanes;
    lanes.ptr[0] = e3;
    lanes.ptr[1] = e2;
    lanes.ptr[2] = e1;
    lanes.ptr[3] = e0;
    return cast(__m64)lanes;
}
unittest
{
    short4 lanes = cast(short4) _mm_setr_pi16(0, 1, 2, 3);
    short[4] expected = [0, 1, 2, 3];
    assert(lanes.array == expected);
}

/// Set packed 32-bit integers with the supplied values in reverse order.
/// The first argument (`e1`) lands in lane 0, the second (`e0`) in lane 1.
__m64 _mm_setr_pi32 (int e1, int e0) pure @trusted
{
    // Fill the lanes one by one rather than reinterpreting a local array.
    int2 lanes;
    lanes.ptr[0] = e1;
    lanes.ptr[1] = e0;
    return cast(__m64)lanes;
}
unittest
{
    int2 lanes = cast(int2) _mm_setr_pi32(0, 1);
    int[2] expected = [0, 1];
    assert(lanes.array == expected);
}

/// Set packed 8-bit integers with the supplied values in reverse order.
/// The first argument (`e7`) lands in lane 0, the last (`e0`) in lane 7.
__m64 _mm_setr_pi8 (byte e7, byte e6, byte e5, byte e4, byte e3, byte e2, byte e1, byte e0) pure @trusted
{
    // Fill the lanes one by one rather than reinterpreting a local array.
    byte8 lanes;
    lanes.ptr[0] = e7;
    lanes.ptr[1] = e6;
    lanes.ptr[2] = e5;
    lanes.ptr[3] = e4;
    lanes.ptr[4] = e3;
    lanes.ptr[5] = e2;
    lanes.ptr[6] = e1;
    lanes.ptr[7] = e0;
    return cast(__m64)lanes;
}
unittest
{
    byte8 lanes = cast(byte8) _mm_setr_pi8(0, 1, 2, 3, 4, 5, 6, 7);
    byte[8] expected = [0, 1, 2, 3, 4, 5, 6, 7];
    assert(lanes.array == expected);
}

/// Return vector of type `__m64` with all elements set to zero.
__m64 _mm_setzero_si64 () pure @trusted
{
    __m64 zero; // PERF =void;
    zero.ptr[0] = 0;
    return zero;
}
unittest
{
    __m64 zero = _mm_setzero_si64();
    assert(zero.array[0] == 0);
}

/// Shift packed 16-bit integers in `a` left by `bits` while shifting in zeros.
deprecated("Use _mm_slli_pi16 instead.") __m64 _mm_sll_pi16 (__m64 a, __m64 bits) pure @safe
{
    // Delegate to the 128-bit intrinsic through the to_m128i / to_m64 helpers.
    auto wideValue = to_m128i(a);
    auto wideCount = to_m128i(bits);
    return to_m64(_mm_sll_epi16(wideValue, wideCount));
}

/// Shift packed 32-bit integers in `a` left by `bits` while shifting in zeros.
deprecated("Use _mm_slli_pi32 instead.") __m64 _mm_sll_pi32 (__m64 a, __m64 bits) pure @safe
{
    // Delegate to the 128-bit intrinsic through the to_m128i / to_m64 helpers.
    auto wideValue = to_m128i(a);
    auto wideCount = to_m128i(bits);
    return to_m64(_mm_sll_epi32(wideValue, wideCount));
}

/// Shift 64-bit integer `a` left by `bits` while shifting in zeros.
deprecated("Use _mm_slli_si64 instead.") __m64 _mm_sll_si64 (__m64 a, __m64 bits) pure @safe
{
    // Delegate to the 128-bit intrinsic through the to_m128i / to_m64 helpers.
    auto wideValue = to_m128i(a);
    auto wideCount = to_m128i(bits);
    return to_m64(_mm_sll_epi64(wideValue, wideCount));
}

/// Shift packed 16-bit integers in `a` left by `imm8` while shifting in zeros.
__m64 _mm_slli_pi16 (__m64 a, int imm8) pure @safe
{
    auto wideValue = to_m128i(a);
    auto wideShifted = _mm_slli_epi16(wideValue, imm8);
    return to_m64(wideShifted);
}
unittest
{
    __m64 input = _mm_setr_pi16(-4, -5, 6, 7);
    short4 shifted = cast(short4)( _mm_slli_pi16(input, 1) );
    short[4] expected = [ -8, -10, 12, 14 ];
    assert(shifted.array == expected);
}

/// Shift packed 32-bit integers in `a` left by `imm8` while shifting in zeros.
__m64 _mm_slli_pi32 (__m64 a, int imm8) pure @safe
{
    auto wideValue = to_m128i(a);
    auto wideShifted = _mm_slli_epi32(wideValue, imm8);
    return to_m64(wideShifted);
}
unittest
{
    __m64 input = _mm_setr_pi32(-4, 5);
    int2 shifted = cast(int2)( _mm_slli_pi32(input, 1) );
    int[2] expected = [ -8, 10 ];
    assert(shifted.array == expected);
}

/// Shift 64-bit integer `a` left by `imm8` while shifting in zeros.
__m64 _mm_slli_si64 (__m64 a, int imm8) pure @safe
{
    auto wideValue = to_m128i(a);
    auto wideShifted = _mm_slli_epi64(wideValue, imm8);
    return to_m64(wideShifted);
}
unittest
{
    __m64 input = _mm_cvtsi64_m64(-1);
    long1 shifted = cast(long1)( _mm_slli_si64(input, 1) );
    long[1] expected = [ -2 ];
    assert(shifted.array == expected);
}

/// Shift packed 16-bit integers in `a` right by `bits` while shifting in sign bits.
deprecated("Use _mm_srai_pi16 instead.") __m64 _mm_sra_pi16 (__m64 a, __m64 bits) pure @safe
{
    // Delegate to the 128-bit intrinsic through the to_m128i / to_m64 helpers.
    auto wideValue = to_m128i(a);
    auto wideCount = to_m128i(bits);
    return to_m64(_mm_sra_epi16(wideValue, wideCount));
}

/// Shift packed 32-bit integers in `a` right by `bits` while shifting in sign bits.
deprecated("Use _mm_srai_pi32 instead.") __m64 _mm_sra_pi32 (__m64 a, __m64 bits) pure @safe
{
    // Delegate to the 128-bit intrinsic through the to_m128i / to_m64 helpers.
    auto wideValue = to_m128i(a);
    auto wideCount = to_m128i(bits);
    return to_m64(_mm_sra_epi32(wideValue, wideCount));
}

/// Shift packed 16-bit integers in `a` right by `imm8` while shifting in sign bits.
__m64 _mm_srai_pi16 (__m64 a, int imm8) pure @safe
{
    auto wideValue = to_m128i(a);
    auto wideShifted = _mm_srai_epi16(wideValue, imm8);
    return to_m64(wideShifted);
}
unittest
{
    __m64 input = _mm_setr_pi16(-4, -5, 6, 7);
    short4 shifted = cast(short4)( _mm_srai_pi16(input, 1) );
    short[4] expected = [ -2, -3, 3, 3 ];
    assert(shifted.array == expected);
}

/// Shift packed 32-bit integers in `a` right by `imm8` while shifting in sign bits.
__m64 _mm_srai_pi32 (__m64 a, int imm8) pure @safe
{
    auto wideValue = to_m128i(a);
    auto wideShifted = _mm_srai_epi32(wideValue, imm8);
    return to_m64(wideShifted);
}
unittest
{
    __m64 input = _mm_setr_pi32(-4, 5);
    int2 shifted = cast(int2)( _mm_srai_pi32(input, 1) );
    int[2] expected = [ -2, 2 ];
    assert(shifted.array == expected);
}

/// Shift packed 16-bit integers in `a` right by `bits` while shifting in zeros.
deprecated("Use _mm_srli_pi16 instead.") __m64 _mm_srl_pi16 (__m64 a, __m64 bits) pure @safe
{
    // Delegate to the 128-bit intrinsic through the to_m128i / to_m64 helpers.
    auto wideValue = to_m128i(a);
    auto wideCount = to_m128i(bits);
    return to_m64(_mm_srl_epi16(wideValue, wideCount));
}

/// Shift packed 32-bit integers in `a` right by `bits` while shifting in zeros.
deprecated("Use _mm_srli_pi32 instead.") __m64 _mm_srl_pi32 (__m64 a, __m64 bits) pure @safe
{
    // Delegate to the 128-bit intrinsic through the to_m128i / to_m64 helpers.
    auto wideValue = to_m128i(a);
    auto wideCount = to_m128i(bits);
    return to_m64(_mm_srl_epi32(wideValue, wideCount));
}

/// Shift 64-bit integer `a` right by `bits` while shifting in zeros.
deprecated("Use _mm_srli_si64 instead.") __m64 _mm_srl_si64 (__m64 a, __m64 bits) pure @safe
{
    // Delegate to the 128-bit intrinsic through the to_m128i / to_m64 helpers.
    auto wideValue = to_m128i(a);
    auto wideCount = to_m128i(bits);
    return to_m64(_mm_srl_epi64(wideValue, wideCount));
}

/// Shift packed 16-bit integers in `a` right by `imm8` while shifting in zeros.
__m64 _mm_srli_pi16 (__m64 a, int imm8) pure @safe
{
    auto wideValue = to_m128i(a);
    auto wideShifted = _mm_srli_epi16(wideValue, imm8);
    return to_m64(wideShifted);
}
unittest
{
    // Negative lanes become large positive values because zeros are shifted in.
    __m64 input = _mm_setr_pi16(-4, -5, 6, 7);
    short4 shifted = cast(short4)( _mm_srli_pi16(input, 1) );
    short[4] expected = [ 0x7ffe, 0x7ffd, 3, 3 ];
    assert(shifted.array == expected);
}

/// Shift packed 32-bit integers in `a` right by `imm8` while shifting in zeros.
__m64 _mm_srli_pi32 (__m64 a, int imm8) pure @safe
{
    auto wideValue = to_m128i(a);
    auto wideShifted = _mm_srli_epi32(wideValue, imm8);
    return to_m64(wideShifted);
}
unittest
{
    __m64 input = _mm_setr_pi32(-4, 5);
    int2 shifted = cast(int2)( _mm_srli_pi32(input, 1) );
    int[2] expected = [ 0x7ffffffe, 2 ];
    assert(shifted.array == expected);
}

/// Shift 64-bit integer `a` right by `imm8` while shifting in zeros.
__m64 _mm_srli_si64 (__m64 a, int imm8) pure @safe
{
    auto wideValue = to_m128i(a);
    auto wideShifted = _mm_srli_epi64(wideValue, imm8);
    return to_m64(wideShifted);
}
unittest
{
    __m64 input = _mm_cvtsi64_m64(-1);
    long1 shifted = cast(long1)( _mm_srli_si64(input, 1) );
    long[1] expected = [ 0x7fff_ffff_ffff_ffff ];
    assert(shifted.array == expected);
}

/// Subtract packed 16-bit integers in `b` from packed 16-bit integers in `a`.
__m64 _mm_sub_pi16 (__m64 a, __m64 b) pure @safe
{
    // Lane-wise subtraction on the operands viewed as four 16-bit lanes.
    short4 minuend = cast(short4)a;
    short4 subtrahend = cast(short4)b;
    return cast(__m64)(minuend - subtrahend);
}
unittest
{
    // The last lane (-32768 - 4) wraps around to 32764: no saturation here.
    __m64 lhs = _mm_setr_pi16(cast(short)65534,  1, 5, -32768);
    __m64 rhs = _mm_setr_pi16(cast(short)65535, 16, 4,      4);
    short4 diff = cast(short4) _mm_sub_pi16(lhs, rhs);
    static immutable short[4] expected =   [    -1,-15, 1,  32764];
    assert(diff.array == expected);
}

/// Subtract packed 32-bit integers in `b` from packed 32-bit integers in `a`.
__m64 _mm_sub_pi32 (__m64 a, __m64 b) pure @safe
{
    // Lane-wise subtraction on the operands viewed as two 32-bit lanes.
    int2 minuend = cast(int2)a;
    int2 subtrahend = cast(int2)b;
    return cast(__m64)(minuend - subtrahend);
}
unittest
{
    __m64 lhs = _mm_setr_pi32( 10,   4);
    __m64 rhs = _mm_setr_pi32( 15, -70);
    int2 diff = cast(int2) _mm_sub_pi32(lhs, rhs);
    static immutable int[2] expected = [ -5,  74];
    assert(diff.array == expected);
}

/// Subtract packed 8-bit integers in `b` from packed 8-bit integers in `a`.
__m64 _mm_sub_pi8 (__m64 a, __m64 b) pure @safe
{
    // Lane-wise subtraction on the operands viewed as eight 8-bit lanes.
    byte8 minuend = cast(byte8)a;
    byte8 subtrahend = cast(byte8)b;
    return cast(__m64)(minuend - subtrahend);
}
unittest
{
    // The last lane (-128 - 8) wraps around to 120: no saturation here.
    __m64 lhs = _mm_setr_pi8(cast(byte)254, 127, 13, 12, 11, 10, 9, -128);
    __m64 rhs = _mm_setr_pi8(cast(byte)255, 120, 14, 42, 11, 10, 9,    8);
    byte8 diff = cast(byte8) _mm_sub_pi8(lhs, rhs);
    static immutable byte[8] expected =  [      -1,   7, -1,-30,  0,  0, 0,  120 ];
    assert(diff.array == expected);
}

/// Subtract packed 16-bit integers in `b` from packed 16-bit integers in `a` using saturation.
__m64 _mm_subs_pi16 (__m64 a, __m64 b) pure @safe
{
    // Delegate to the 128-bit intrinsic through the to_m128i / to_m64 helpers.
    auto wideA = to_m128i(a);
    auto wideB = to_m128i(b);
    auto wideDiff = _mm_subs_epi16(wideA, wideB);
    return to_m64(wideDiff);
}
unittest
{
    // The last lane (-32768 - 4) clamps to -32768 instead of wrapping.
    __m64 lhs = _mm_setr_pi16(cast(short)65534,  1, 5, -32768);
    __m64 rhs = _mm_setr_pi16(cast(short)65535, 16, 4,      4);
    short4 diff = cast(short4) _mm_subs_pi16(lhs, rhs);
    static immutable short[4] expected =   [    -1,-15, 1, -32768];
    assert(diff.array == expected);
}

/// Subtract packed 8-bit integers in `b` from packed 8-bit integers in `a` using saturation.
__m64 _mm_subs_pi8 (__m64 a, __m64 b) pure @safe
{
    // Delegate to the 128-bit intrinsic through the to_m128i / to_m64 helpers.
    auto wideA = to_m128i(a);
    auto wideB = to_m128i(b);
    auto wideDiff = _mm_subs_epi8(wideA, wideB);
    return to_m64(wideDiff);
}
unittest
{
    // The last lane (-128 - 8) clamps to -128 instead of wrapping.
    __m64 lhs = _mm_setr_pi8(cast(byte)254, 127, 13, 12, 11, 10, 9, -128);
    __m64 rhs = _mm_setr_pi8(cast(byte)255, 120, 14, 42, 11, 10, 9,    8);
    byte8 diff = cast(byte8) _mm_subs_pi8(lhs, rhs);
    static immutable byte[8] expected =  [      -1,   7, -1,-30,  0,  0, 0, -128 ];
    assert(diff.array == expected);
}

/// Subtract packed unsigned 16-bit integers in `b` from packed unsigned 16-bit integers in `a`
/// using saturation.
__m64 _mm_subs_pu16 (__m64 a, __m64 b) pure @safe
{
    // Delegate to the 128-bit intrinsic through the to_m128i / to_m64 helpers.
    auto wideA = to_m128i(a);
    auto wideB = to_m128i(b);
    auto wideDiff = _mm_subs_epu16(wideA, wideB);
    return to_m64(wideDiff);
}
unittest
{
    // Lanes whose unsigned difference would be negative clamp to 0.
    __m64 lhs = _mm_setr_pi16(cast(short)65534,  1, 5, 4);
    __m64 rhs = _mm_setr_pi16(cast(short)65535, 16, 4, 4);
    short4 diff = cast(short4) _mm_subs_pu16(lhs, rhs);
    static immutable short[4] expected =   [     0,  0, 1, 0];
    assert(diff.array == expected);
}

/// Subtract packed unsigned 8-bit integers in `b` from packed unsigned 8-bit integers in `a`
/// using saturation.
__m64 _mm_subs_pu8 (__m64 a, __m64 b) pure @safe
{
    // Delegate to the 128-bit intrinsic through the to_m128i / to_m64 helpers.
    auto wideA = to_m128i(a);
    auto wideB = to_m128i(b);
    auto wideDiff = _mm_subs_epu8(wideA, wideB);
    return to_m64(wideDiff);
}
unittest
{
    // Lanes whose unsigned difference would be negative clamp to 0.
    __m64 lhs = _mm_setr_pi8(cast(byte)254, 127, 13, 12, 11, 10, 9, 8);
    __m64 rhs = _mm_setr_pi8(cast(byte)255, 120, 14, 42, 11, 10, 9, 8);
    byte8 diff = cast(byte8) _mm_subs_pu8(lhs, rhs);
    static immutable byte[8] expected =  [       0,   7,  0,  0,  0,  0, 0, 0, ];
    assert(diff.array == expected);
}

deprecated alias _m_to_int = _mm_cvtsi64_si32;  /// Deprecated intrinsics.
deprecated alias _m_to_int64 = _mm_cvtm64_si64; ///ditto

/// Unpack and interleave 16-bit integers from the high half of `a` and `b`.
/// Result lanes are `[a2, b2, a3, b3]`.
__m64 _mm_unpackhi_pi16 (__m64 a, __m64 b) pure @trusted
{
    version(LDC)
    {
        // Indices 0..3 select from `a`, 4..7 from `b`.
        return cast(__m64) shufflevectorLDC!(short4, 2, 6, 3, 7)(cast(short4)a, cast(short4)b);
    }
    else
    {
        short4 srcA = cast(short4)a;
        short4 srcB = cast(short4)b;
        short4 mixed;
        mixed.ptr[0] = srcA.array[2];
        mixed.ptr[1] = srcB.array[2];
        mixed.ptr[2] = srcA.array[3];
        mixed.ptr[3] = srcB.array[3];
        return cast(__m64)mixed;
    }
}
unittest
{
    __m64 lhs = _mm_setr_pi16(4, 8, -16, 7);
    __m64 rhs = _mm_setr_pi16(5, 9,  -3, 10);
    short4 mixed = cast(short4) _mm_unpackhi_pi16(lhs, rhs);
    short[4] expected = [-16, -3, 7, 10];
    assert(mixed.array == expected);
}

/// Unpack and interleave 32-bit integers from the high half of `a` and `b`.
__m64 _mm_unpackhi_pi32 (__m64 a, __m64 b) pure @trusted
{
    // Generate punpckldq as far back as LDC 1.0.0 -O1
    // (Yes, LLVM does generate punpckldq to reuse SSE2 instructions)
    int2 ia = cast(int2)a;
    int2 ib = cast(int2)b;
    int2 r;
    // Result lanes are [a1, b1].
    r.ptr[0] = ia.array[1];
    r.ptr[1] = ib.array[1];
    return cast(__m64)r;
}
unittest
{
    __m64 A = _mm_setr_pi32(4, 8);
    __m64 B = _mm_setr_pi32(5, 9);
    int2 R = cast(int2) _mm_unpackhi_pi32(A, B);
    int[2] correct = [8, 9];
    assert(R.array == correct);
}

/// Unpack and interleave 8-bit integers from the high half of `a` and `b`.
/// Result lanes are `[a4, b4, a5, b5, a6, b6, a7, b7]`.
__m64 _mm_unpackhi_pi8 (__m64 a, __m64 b) pure @trusted
{
    version(LDC)
    {
        // Indices 0..7 select from `a`, 8..15 from `b`.
        return cast(__m64) shufflevectorLDC!(byte8, 4, 12, 5, 13, 6, 14, 7, 15)(cast(byte8)a, cast(byte8)b);
    }
    else
    {
        byte8 ia = cast(byte8)a;
        byte8 ib = cast(byte8)b;
        byte8 r;
        r.ptr[0] = ia.array[4];
        r.ptr[1] = ib.array[4];
        r.ptr[2] = ia.array[5];
        r.ptr[3] = ib.array[5];
        r.ptr[4] = ia.array[6];
        r.ptr[5] = ib.array[6];
        r.ptr[6] = ia.array[7];
        r.ptr[7] = ib.array[7];
        return cast(__m64)r;
    }
}
unittest
{
    __m64 A = _mm_setr_pi8( 1,  2,  3,  4,  5,  6,  7,  8);
    __m64 B = _mm_setr_pi8(-1, -2, -3, -4, -5, -6, -7, -8);
    byte8 R = cast(byte8) _mm_unpackhi_pi8(A, B);
    byte[8] correct = [5, -5, 6, -6, 7, -7, 8, -8];
    assert(R.array == correct);
}

/// Unpack and interleave 16-bit integers from the low half of `a` and `b`.
__m64 _mm_unpacklo_pi16 (__m64 a, __m64 b) pure @trusted
{
    // Generates punpcklwd since LDC 1.0.0 -O1
    short4 ia = cast(short4)a;
    short4 ib = cast(short4)b;
    short4 r;
    // Result lanes are [a0, b0, a1, b1].
    r.ptr[0] = ia.array[0];
    r.ptr[1] = ib.array[0];
    r.ptr[2] = ia.array[1];
    r.ptr[3] = ib.array[1];
    return cast(__m64)r;
}
unittest
{
    __m64 A = _mm_setr_pi16(4, 8, -16, 7);
    __m64 B = _mm_setr_pi16(5, 9,  -3, 10);
    short4 R = cast(short4) _mm_unpacklo_pi16(A, B);
    short[4] correct = [4, 5, 8, 9];
    assert(R.array == correct);
}

/// Unpack and interleave 32-bit integers from the low half of `a` and `b`.
/// Result lanes are `[a0, b0]`.
__m64 _mm_unpacklo_pi32 (__m64 a, __m64 b) pure @trusted
{
    // x86: Generate punpckldq as far back as LDC 1.0.0 -O1
    // ARM: Generate zip as far back as LDC 1.8.0 -O1
    int2 ia = cast(int2)a;
    int2 ib = cast(int2)b;
    int2 r;
    r.ptr[0] = ia.array[0];
    r.ptr[1] = ib.array[0];
    return cast(__m64)r;
}
unittest
{
    __m64 A = _mm_setr_pi32(4, 8);
    __m64 B = _mm_setr_pi32(5, 9);
    int2 R = cast(int2) _mm_unpacklo_pi32(A, B);
    int[2] correct = [4, 5];
    assert(R.array == correct);
}

/// Unpack and interleave 8-bit integers from the low half of `a` and `b`.
__m64 _mm_unpacklo_pi8 (__m64 a, __m64 b) pure @trusted
{
    version(LDC)
    {
        // Indices 0..7 select from `a`, 8..15 from `b`.
        return cast(__m64) shufflevectorLDC!(byte8, 0, 8, 1, 9, 2, 10, 3, 11)(cast(byte8)a, cast(byte8)b);
    }
    else
    {
        byte8 ia = cast(byte8)a;
        byte8 ib = cast(byte8)b;
        byte8 r;
        // Result lanes are [a0, b0, a1, b1, a2, b2, a3, b3].
        r.ptr[0] = ia.array[0];
        r.ptr[1] = ib.array[0];
        r.ptr[2] = ia.array[1];
        r.ptr[3] = ib.array[1];
        r.ptr[4] = ia.array[2];
        r.ptr[5] = ib.array[2];
        r.ptr[6] = ia.array[3];
        r.ptr[7] = ib.array[3];
        return cast(__m64)r;
    }
}
unittest
{
    __m64 A = _mm_setr_pi8( 1,  2,  3,  4,  5,  6,  7,  8);
    __m64 B = _mm_setr_pi8(-1, -2, -3, -4, -5, -6, -7, -8);
    byte8 R = cast(byte8) _mm_unpacklo_pi8(A, B);
    byte[8] correct = [1, -1, 2, -2, 3, -3, 4, -4];
    assert(R.array == correct);
}

/// Compute the bitwise XOR of 64 bits (representing integer data) in `a` and `b`.
__m64 _mm_xor_si64 (__m64 a, __m64 b) pure @safe
{
    return a ^ b;
}
unittest
{
    __m64 A = _mm_setr_pi16(255, 1, -1, 0);
    __m64 B = _mm_set1_pi16(15);
    short4 R = cast(short4)_mm_xor_si64(A, B);
    short[4] correct = [240, 14, -16, 15];
    assert(R.array == correct);
}