/**
* MMX intrinsics.
*
* Copyright: Copyright Guillaume Piolat 2019-2020.
* License:   $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
*/
module inteli.mmx;

public import inteli.types;
import inteli.internals;

import inteli.xmmintrin;
import inteli.emmintrin;

nothrow @nogc:

// Important: you don't need to call _mm_empty when using "MMX" capabilities of intel-intrinsics,
// since it just generates the right IR and cleaning-up FPU registers is up to the codegen.
// intel-intrinsics is just semantics.


/// Add packed 16-bit integers in `a` and `b`.
__m64 _mm_add_pi16 (__m64 a, __m64 b)
{
    return cast(__m64)(cast(short4)a + cast(short4)b);
}
unittest
{
    short4 R = cast(short4) _mm_add_pi16(_mm_set1_pi16(4), _mm_set1_pi16(3));
    short[4] correct = [7, 7, 7, 7];
    assert(R.array == correct);
}

/// Add packed 32-bit integers in `a` and `b`.
__m64 _mm_add_pi32 (__m64 a, __m64 b)
{
    return cast(__m64)(cast(int2)a + cast(int2)b);
}
unittest
{
    int2 R = cast(int2) _mm_add_pi32(_mm_set1_pi32(4), _mm_set1_pi32(3));
    int[2] correct = [7, 7];
    assert(R.array == correct);
}

/// Add packed 8-bit integers in `a` and `b`.
__m64 _mm_add_pi8 (__m64 a, __m64 b)
{
    return cast(__m64)(cast(byte8)a + cast(byte8)b);
}
unittest
{
    byte8 R = cast(byte8) _mm_add_pi8(_mm_set1_pi8(127), _mm_set1_pi8(-128));
    byte[8] correct = [-1, -1, -1, -1, -1, -1, -1, -1];
    assert(R.array == correct);
}

/// Add packed 16-bit integers in `a` and `b` using signed saturation.
// PERF: PADDSW not generated
__m64 _mm_adds_pi16(__m64 a, __m64 b) pure @trusted
{
    return to_m64(_mm_adds_epi16(to_m128i(a), to_m128i(b)));
}
unittest
{
    short4 res = cast(short4) _mm_adds_pi16(_mm_set_pi16(3, 2, 1, 0),
                                            _mm_set_pi16(3, 2, 1, 0));
    static immutable short[4] correctResult = [0, 2, 4, 6];
    assert(res.array == correctResult);
}

/// Add packed 8-bit integers in `a` and `b` using signed saturation.
// PERF: PADDSB not generated
__m64 _mm_adds_pi8(__m64 a, __m64 b) pure @trusted
{
    return to_m64(_mm_adds_epi8(to_m128i(a), to_m128i(b)));
}
unittest
{
    byte8 res = cast(byte8) _mm_adds_pi8(_mm_set_pi8(7, 6, 5, 4, 3, 2, 1, 0),
                                         _mm_set_pi8(7, 6, 5, 4, 3, 2, 1, 0));
    static immutable byte[8] correctResult = [0, 2, 4, 6, 8, 10, 12, 14];
    assert(res.array == correctResult);
}

/// Add packed 16-bit integers in `a` and `b` using unsigned saturation.
// PERF: PADDUSW not generated
__m64 _mm_adds_pu16(__m64 a, __m64 b) pure @trusted
{
    return to_m64(_mm_adds_epu16(to_m128i(a), to_m128i(b)));
}
unittest
{
    short4 res = cast(short4) _mm_adds_pu16(_mm_set_pi16(3, 2, cast(short)65535, 0),
                                            _mm_set_pi16(3, 2, 1, 0));
    static immutable short[4] correctResult = [0, cast(short)65535, 4, 6];
    assert(res.array == correctResult);
}
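
// Added illustration (not part of the Intel API): the plain packed adds above wrap around
// on overflow, while the saturating variants clamp to the representable range.
unittest
{
    __m64 big = _mm_set1_pi16(32767);
    __m64 one = _mm_set1_pi16(1);
    short4 wrapped   = cast(short4) _mm_add_pi16 (big, one); // 32767 + 1 wraps to -32768
    short4 saturated = cast(short4) _mm_adds_pi16(big, one); // 32767 + 1 clamps to 32767
    short[4] correctWrapped   = [-32768, -32768, -32768, -32768];
    short[4] correctSaturated = [ 32767,  32767,  32767,  32767];
    assert(wrapped.array == correctWrapped);
    assert(saturated.array == correctSaturated);
}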

/// Add packed 8-bit integers in `a` and `b` using unsigned saturation.
// PERF: PADDUSB not generated
__m64 _mm_adds_pu8(__m64 a, __m64 b) pure @trusted
{
    return to_m64(_mm_adds_epu8(to_m128i(a), to_m128i(b)));
}
unittest
{
    byte8 res = cast(byte8) _mm_adds_pu8(_mm_set_pi8(7, 6, 5, 4, 3, 2, cast(byte)255, 0),
                                         _mm_set_pi8(7, 6, 5, 4, 3, 2, 1, 0));
    static immutable byte[8] correctResult = [0, cast(byte)255, 4, 6, 8, 10, 12, 14];
    assert(res.array == correctResult);
}

/// Compute the bitwise AND of 64 bits (representing integer data) in `a` and `b`.
__m64 _mm_and_si64 (__m64 a, __m64 b) pure @safe
{
    return a & b;
}
unittest
{
    __m64 A = [7];
    __m64 B = [14];
    __m64 R = _mm_and_si64(A, B);
    assert(R.array[0] == 6);
}

/// Compute the bitwise NOT of 64 bits (representing integer data) in `a` and then AND with `b`.
__m64 _mm_andnot_si64 (__m64 a, __m64 b)
{
    return (~a) & b;
}
unittest
{
    __m64 A = [7];
    __m64 B = [14];
    __m64 R = _mm_andnot_si64(A, B);
    assert(R.array[0] == 8);
}

/// Compare packed 16-bit integers in `a` and `b` for equality.
__m64 _mm_cmpeq_pi16 (__m64 a, __m64 b) pure @safe
{
    static if (GDC_with_MMX)
    {
        return cast(__m64) __builtin_ia32_pcmpeqw(cast(short4)a, cast(short4)b);
    }
    else
    {
        return cast(__m64) equalMask!short4(cast(short4)a, cast(short4)b);
    }
}
unittest
{
    short4 A = [-3, -2, -1, 0];
    short4 B = [ 4,  3,  2, 1];
    short[4] E = [ 0,  0,  0, 0];
    short4 R = cast(short4)(_mm_cmpeq_pi16(cast(__m64)A, cast(__m64)B));
    assert(R.array == E);
}

/// Compare packed 32-bit integers in `a` and `b` for equality.
__m64 _mm_cmpeq_pi32 (__m64 a, __m64 b) pure @safe
{
    static if (GDC_with_MMX)
    {
        return cast(__m64) __builtin_ia32_pcmpeqd(cast(int2)a, cast(int2)b);
    }
    else
    {
        return cast(__m64) equalMask!int2(cast(int2)a, cast(int2)b);
    }
}
unittest
{
    int2 A = [-3, -2];
    int2 B = [ 4, -2];
    int[2] E = [ 0, -1];
    int2 R = cast(int2)(_mm_cmpeq_pi32(cast(__m64)A, cast(__m64)B));
    assert(R.array == E);
}

/// Compare packed 8-bit integers in `a` and `b` for equality.
__m64 _mm_cmpeq_pi8 (__m64 a, __m64 b) pure @safe
{
    static if (GDC_with_MMX)
    {
        return cast(__m64) __builtin_ia32_pcmpeqb(cast(ubyte8)a, cast(ubyte8)b);
    }
    else
    {
        return cast(__m64) equalMask!byte8(cast(byte8)a, cast(byte8)b);
    }
}
unittest
{
    __m64 A = _mm_setr_pi8(1, 2, 3, 1, 2, 1, 1, 2);
    __m64 B = _mm_setr_pi8(2, 2, 1, 2, 3, 1, 2, 3);
    byte8 C = cast(byte8) _mm_cmpeq_pi8(A, B);
    byte[8] correct = [0,-1, 0, 0, 0,-1, 0, 0];
    assert(C.array == correct);
}

/// Compare packed 16-bit integers in `a` and `b` for greater-than.
__m64 _mm_cmpgt_pi16 (__m64 a, __m64 b) pure @safe
{
    static if (GDC_with_MMX)
    {
        return cast(__m64) __builtin_ia32_pcmpgtw (cast(short4)a, cast(short4)b);
    }
    else
    {
        return cast(__m64) greaterMask!short4(cast(short4)a, cast(short4)b);
    }
}
unittest
{
    short4 A = [-3, -2, -1, 0];
    short4 B = [ 4,  3,  2, 1];
    short[4] E = [ 0,  0,  0, 0];
    short4 R = cast(short4)(_mm_cmpgt_pi16(cast(__m64)A, cast(__m64)B));
    assert(R.array == E);
}
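
// Added illustration (not part of the Intel API): the all-ones / all-zeroes masks returned
// by the comparisons are typically combined with AND/ANDNOT/OR for branchless selection,
// for example a per-lane maximum of signed 16-bit integers.
unittest
{
    __m64 a = _mm_setr_pi16(-5, 40, 7, -32768);
    __m64 b = _mm_setr_pi16( 3, 12, 7,    100);
    __m64 mask = _mm_cmpgt_pi16(a, b);                  // -1 where a > b, 0 elsewhere
    __m64 maxv = _mm_or_si64(_mm_and_si64(mask, a),     // keep a where a > b
                             _mm_andnot_si64(mask, b)); // keep b elsewhere
    short4 R = cast(short4) maxv;
    short[4] correct = [3, 40, 7, 100];
    assert(R.array == correct);
}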

/// Compare packed 32-bit integers in `a` and `b` for greater-than.
__m64 _mm_cmpgt_pi32 (__m64 a, __m64 b) pure @safe
{
    static if (GDC_with_MMX)
    {
        return cast(__m64) __builtin_ia32_pcmpgtd (cast(int2)a, cast(int2)b);
    }
    else
    {
        return cast(__m64) greaterMask!int2(cast(int2)a, cast(int2)b);
    }
}
unittest
{
    int2 A = [-3,  2];
    int2 B = [ 4, -2];
    int[2] E = [ 0, -1];
    int2 R = cast(int2)(_mm_cmpgt_pi32(cast(__m64)A, cast(__m64)B));
    assert(R.array == E);
}

/// Compare packed 8-bit integers in `a` and `b` for greater-than.
__m64 _mm_cmpgt_pi8 (__m64 a, __m64 b) pure @safe
{
    static if (GDC_with_MMX)
    {
        return cast(__m64) __builtin_ia32_pcmpgtb (cast(ubyte8)a, cast(ubyte8)b);
    }
    else
    {
        return cast(__m64) greaterMask!byte8(cast(byte8)a, cast(byte8)b);
    }
}
unittest
{
    __m64 A = _mm_setr_pi8(1, 2, 3, 1, 2, 1, 1, 2);
    __m64 B = _mm_setr_pi8(2, 2, 1, 2, 3, 1, 2, 3);
    byte8 C = cast(byte8) _mm_cmpgt_pi8(A, B);
    byte[8] correct = [0, 0,-1, 0, 0, 0, 0, 0];
    assert(C.array == correct);
}

/// Copy 64-bit integer `a` to `dst`.
long _mm_cvtm64_si64 (__m64 a) pure @safe
{
    return a.array[0];
}
unittest
{
    __m64 A = _mm_setr_pi32(2, 1);
    assert(_mm_cvtm64_si64(A) == 0x100000002);
}

/// Copy 32-bit integer `a` to the lower elements of `dst`, and zero the upper element of `dst`.
__m64 _mm_cvtsi32_si64 (int a) pure @trusted
{
    __m64 r = void;
    r.ptr[0] = a;
    return r;
}
unittest
{
    __m64 R = _mm_cvtsi32_si64(-1);
    assert(R.array[0] == -1);
}

/// Copy 64-bit integer `a` to `dst`.
__m64 _mm_cvtsi64_m64 (long a) pure @trusted
{
    __m64 r = void;
    r.ptr[0] = a;
    return r;
}
unittest
{
    __m64 R = _mm_cvtsi64_m64(0x123456789A);
    assert(R.array[0] == 0x123456789A);
}

/// Get the lower 32-bit integer in `a`.
int _mm_cvtsi64_si32 (__m64 a) pure @safe
{
    int2 r = cast(int2)a;
    return r.array[0];
}
unittest
{
    __m64 A = _mm_setr_pi32(-6, 5);
    int R = _mm_cvtsi64_si32(A);
    assert(R == -6);
}

/// Empty the MMX state, which marks the x87 FPU registers as available for
/// use by x87 instructions.
/// This instruction is supposed to be used at the end of all MMX technology procedures.
/// This is useless when using `intel-intrinsics`, at least with LDC and DMD.
void _mm_empty() pure @safe
{
    // do nothing, see comment on top of file
    // TODO: not sure for GDC, do something?
}


deprecated alias _m_empty = _mm_empty;            /// Deprecated intrinsics.
deprecated alias _m_from_int = _mm_cvtsi32_si64;  ///ditto
deprecated alias _m_from_int64 = _mm_cvtsi64_m64; ///ditto
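
// Added illustration: `_mm_cvtsi64_m64` and `_mm_cvtm64_si64` are exact inverses, so a
// 64-bit integer survives a round trip through `__m64` unchanged.
unittest
{
    long x = 0x0123_4567_89AB_CDEF;
    assert(_mm_cvtm64_si64(_mm_cvtsi64_m64(x)) == x);
}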

/// Multiply packed 16-bit integers in `a` and `b`, producing intermediate 32-bit integers.
/// Horizontally add adjacent pairs of intermediate 32-bit integers, and pack the results.
__m64 _mm_madd_pi16 (__m64 a, __m64 b) pure @safe
{
    return to_m64(_mm_madd_epi16(to_m128i(a), to_m128i(b)));
}
unittest
{
    short4 A = [-32768, -32768, 32767, 32767];
    short4 B = [-32768, -32768, 32767, 32767];
    int2 R = cast(int2) _mm_madd_pi16(cast(__m64)A, cast(__m64)B);
    int[2] correct = [-2147483648, 2*32767*32767];
    assert(R.array == correct);
}

/// Multiply the packed 16-bit integers in `a` and `b`, producing intermediate 32-bit integers,
/// and store the high 16 bits of the intermediate integers.
__m64 _mm_mulhi_pi16 (__m64 a, __m64 b) pure @safe
{
    return to_m64(_mm_mulhi_epi16(to_m128i(a), to_m128i(b)));
}
unittest
{
    __m64 A = _mm_setr_pi16(4, 8, -16, 7);
    __m64 B = _mm_set1_pi16(16384);
    short4 R = cast(short4)_mm_mulhi_pi16(A, B);
    short[4] correct = [1, 2, -4, 1];
    assert(R.array == correct);
}

/// Multiply the packed 16-bit integers in `a` and `b`, producing intermediate 32-bit integers,
/// and store the low 16 bits of the intermediate integers.
__m64 _mm_mullo_pi16 (__m64 a, __m64 b) pure @safe
{
    return to_m64(_mm_mullo_epi16(to_m128i(a), to_m128i(b)));
}
unittest
{
    __m64 A = _mm_setr_pi16(4, 1, 16, 7);
    __m64 B = _mm_set1_pi16(16384);
    short4 R = cast(short4)_mm_mullo_pi16(A, B);
    short[4] correct = [0, 16384, 0, -16384];
    assert(R.array == correct);
}

/// Compute the bitwise OR of 64 bits in `a` and `b`.
__m64 _mm_or_si64 (__m64 a, __m64 b) pure @safe
{
    return a | b;
}
unittest
{
    __m64 A = _mm_setr_pi16(255, 1, -1, 0);
    __m64 B = _mm_set1_pi16(15);
    short4 R = cast(short4)_mm_or_si64(A, B);
    short[4] correct = [255, 15, -1, 15];
    assert(R.array == correct);
}

/// Convert packed 16-bit integers from `a` and `b` to packed 8-bit integers using signed saturation.
__m64 _mm_packs_pi16 (__m64 a, __m64 b) pure @trusted
{
    int4 p = cast(int4) _mm_packs_epi16(to_m128i(a), to_m128i(b));
    int2 r;
    r.ptr[0] = p.array[0];
    r.ptr[1] = p.array[2];
    return cast(__m64)r;
}
unittest
{
    __m64 A = _mm_setr_pi16(256, -129, 254, 0);
    byte8 R = cast(byte8) _mm_packs_pi16(A, A);
    byte[8] correct = [127, -128, 127, 0, 127, -128, 127, 0];
    assert(R.array == correct);
}

/// Convert packed 32-bit integers from `a` and `b` to packed 16-bit integers using signed saturation.
__m64 _mm_packs_pi32 (__m64 a, __m64 b) pure @trusted
{
    int4 p = cast(int4) _mm_packs_epi32(to_m128i(a), to_m128i(b));
    int2 r;
    r.ptr[0] = p.array[0];
    r.ptr[1] = p.array[2];
    return cast(__m64)r;
}
unittest
{
    __m64 A = _mm_setr_pi32(100000, -100000);
    short4 R = cast(short4) _mm_packs_pi32(A, A);
    short[4] correct = [32767, -32768, 32767, -32768];
    assert(R.array == correct);
}
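
// Added illustration (not part of the Intel API): PMADDWD (`_mm_madd_pi16` above) is the usual
// building block for small dot products; summing its two 32-bit lanes gives the dot product
// of four 16-bit values.
unittest
{
    __m64 a = _mm_setr_pi16(1, 2, 3, 4);
    __m64 b = _mm_setr_pi16(5, 6, 7, 8);
    int2 p = cast(int2) _mm_madd_pi16(a, b); // [1*5 + 2*6, 3*7 + 4*8]
    assert(p.array[0] + p.array[1] == 70);
}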

/// Convert packed 16-bit integers from `a` and `b` to packed 8-bit integers using unsigned saturation.
__m64 _mm_packs_pu16 (__m64 a, __m64 b) pure @trusted
{
    int4 p = cast(int4) _mm_packus_epi16(to_m128i(a), to_m128i(b));
    int2 r;
    r.ptr[0] = p.array[0];
    r.ptr[1] = p.array[2];
    return cast(__m64)r;
}
unittest
{
    __m64 A = _mm_setr_pi16(256, -129, 254, 0);
    byte8 R = cast(byte8) _mm_packs_pu16(A, A);
    ubyte[8] correct = [255, 0, 254, 0, 255, 0, 254, 0];
    assert(R.array == cast(byte[8])correct);
}

deprecated alias
    _m_packssdw = _mm_packs_pi32,     /// Deprecated intrinsics.
    _m_packsswb = _mm_packs_pi16,     ///ditto
    _m_packuswb = _mm_packs_pu16,     ///ditto
    _m_paddb = _mm_add_pi8,           ///ditto
    _m_paddd = _mm_add_pi32,          ///ditto
    _m_paddsb = _mm_adds_pi8,         ///ditto
    _m_paddsw = _mm_adds_pi16,        ///ditto
    _m_paddusb = _mm_adds_pu8,        ///ditto
    _m_paddusw = _mm_adds_pu16,       ///ditto
    _m_paddw = _mm_add_pi16,          ///ditto
    _m_pand = _mm_and_si64,           ///ditto
    _m_pandn = _mm_andnot_si64,       ///ditto
    _m_pcmpeqb = _mm_cmpeq_pi8,       ///ditto
    _m_pcmpeqd = _mm_cmpeq_pi32,      ///ditto
    _m_pcmpeqw = _mm_cmpeq_pi16,      ///ditto
    _m_pcmpgtb = _mm_cmpgt_pi8,       ///ditto
    _m_pcmpgtd = _mm_cmpgt_pi32,      ///ditto
    _m_pcmpgtw = _mm_cmpgt_pi16,      ///ditto
    _m_pmaddwd = _mm_madd_pi16,       ///ditto
    _m_pmulhw = _mm_mulhi_pi16,       ///ditto
    _m_pmullw = _mm_mullo_pi16,       ///ditto
    _m_por = _mm_or_si64,             ///ditto
    _m_pslld = _mm_sll_pi32,          ///ditto
    _m_pslldi = _mm_slli_pi32,        ///ditto
    _m_psllq = _mm_sll_si64,          ///ditto
    _m_psllqi = _mm_slli_si64,        ///ditto
    _m_psllw = _mm_sll_pi16,          ///ditto
    _m_psllwi = _mm_slli_pi16,        ///ditto
    _m_psrad = _mm_sra_pi32,          ///ditto
    _m_psradi = _mm_srai_pi32,        ///ditto
    _m_psraw = _mm_sra_pi16,          ///ditto
    _m_psrawi = _mm_srai_pi16,        ///ditto
    _m_psrld = _mm_srl_pi32,          ///ditto
    _m_psrldi = _mm_srli_pi32,        ///ditto
    _m_psrlq = _mm_srl_si64,          ///ditto
    _m_psrlqi = _mm_srli_si64,        ///ditto
    _m_psrlw = _mm_srl_pi16,          ///ditto
    _m_psrlwi = _mm_srli_pi16,        ///ditto
    _m_psubb = _mm_sub_pi8,           ///ditto
    _m_psubd = _mm_sub_pi32,          ///ditto
    _m_psubsb = _mm_subs_pi8,         ///ditto
    _m_psubsw = _mm_subs_pi16,        ///ditto
    _m_psubusb = _mm_subs_pu8,        ///ditto
    _m_psubusw = _mm_subs_pu16,       ///ditto
    _m_psubw = _mm_sub_pi16,          ///ditto
    _m_punpckhbw = _mm_unpackhi_pi8,  ///ditto
    _m_punpckhdq = _mm_unpackhi_pi32, ///ditto
    _m_punpckhwd = _mm_unpackhi_pi16, ///ditto
    _m_punpcklbw = _mm_unpacklo_pi8,  ///ditto
    _m_punpckldq = _mm_unpacklo_pi32, ///ditto
    _m_punpcklwd = _mm_unpacklo_pi16, ///ditto
    _m_pxor = _mm_xor_si64;           ///ditto

/// Set packed 16-bit integers with the supplied values.
__m64 _mm_set_pi16 (short e3, short e2, short e1, short e0) pure @trusted
{
    short[4] arr = [e0, e1, e2, e3];
    return *cast(__m64*)(arr.ptr);
}
unittest
{
    short4 R = cast(short4) _mm_set_pi16(3, 2, 1, 0);
    short[4] correct = [0, 1, 2, 3];
    assert(R.array == correct);
}

/// Set packed 32-bit integers with the supplied values.
__m64 _mm_set_pi32 (int e1, int e0) pure @trusted
{
    int[2] arr = [e0, e1];
    return *cast(__m64*)(arr.ptr);
}
unittest
{
    int2 R = cast(int2) _mm_set_pi32(1, 0);
    int[2] correct = [0, 1];
    assert(R.array == correct);
}
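
// Added note: `_mm_set_*` takes elements from most-significant to least-significant, while
// `_mm_setr_*` takes them in memory order, so these two calls build the same vector.
unittest
{
    __m64 A = _mm_set_pi16 (3, 2, 1, 0);
    __m64 B = _mm_setr_pi16(0, 1, 2, 3);
    assert((cast(short4)A).array == (cast(short4)B).array);
}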

/// Set packed 8-bit integers with the supplied values.
__m64 _mm_set_pi8 (byte e7, byte e6, byte e5, byte e4, byte e3, byte e2, byte e1, byte e0) pure @trusted
{
    byte[8] arr = [e0, e1, e2, e3, e4, e5, e6, e7];
    return *cast(__m64*)(arr.ptr);
}
unittest
{
    byte8 R = cast(byte8) _mm_set_pi8(7, 6, 5, 4, 3, 2, 1, 0);
    byte[8] correct = [0, 1, 2, 3, 4, 5, 6, 7];
    assert(R.array == correct);
}

/// Broadcast 16-bit integer `a` to all elements.
__m64 _mm_set1_pi16 (short a) pure @trusted
{
    return cast(__m64)(short4(a));
}
unittest
{
    short4 R = cast(short4) _mm_set1_pi16(44);
    short[4] correct = [44, 44, 44, 44];
    assert(R.array == correct);
}

/// Broadcast 32-bit integer `a` to all elements.
__m64 _mm_set1_pi32 (int a) pure @trusted
{
    return cast(__m64)(int2(a));
}
unittest
{
    int2 R = cast(int2) _mm_set1_pi32(43);
    int[2] correct = [43, 43];
    assert(R.array == correct);
}

/// Broadcast 8-bit integer `a` to all elements.
__m64 _mm_set1_pi8 (byte a) pure @trusted
{
    return cast(__m64)(byte8(a));
}
unittest
{
    byte8 R = cast(byte8) _mm_set1_pi8(42);
    byte[8] correct = [42, 42, 42, 42, 42, 42, 42, 42];
    assert(R.array == correct);
}

/// Set packed 16-bit integers with the supplied values in reverse order.
__m64 _mm_setr_pi16 (short e3, short e2, short e1, short e0) pure @trusted
{
    short[4] arr = [e3, e2, e1, e0];
    return *cast(__m64*)(arr.ptr);
}
unittest
{
    short4 R = cast(short4) _mm_setr_pi16(0, 1, 2, 3);
    short[4] correct = [0, 1, 2, 3];
    assert(R.array == correct);
}

/// Set packed 32-bit integers with the supplied values in reverse order.
__m64 _mm_setr_pi32 (int e1, int e0) pure @trusted
{
    int[2] arr = [e1, e0];
    return *cast(__m64*)(arr.ptr);
}
unittest
{
    int2 R = cast(int2) _mm_setr_pi32(0, 1);
    int[2] correct = [0, 1];
    assert(R.array == correct);
}

/// Set packed 8-bit integers with the supplied values in reverse order.
__m64 _mm_setr_pi8 (byte e7, byte e6, byte e5, byte e4, byte e3, byte e2, byte e1, byte e0) pure @trusted
{
    byte[8] arr = [e7, e6, e5, e4, e3, e2, e1, e0];
    return *cast(__m64*)(arr.ptr);
}
unittest
{
    byte8 R = cast(byte8) _mm_setr_pi8(0, 1, 2, 3, 4, 5, 6, 7);
    byte[8] correct = [0, 1, 2, 3, 4, 5, 6, 7];
    assert(R.array == correct);
}

/// Return vector of type `__m64` with all elements set to zero.
__m64 _mm_setzero_si64 () pure @trusted
{
    __m64 r;
    r.ptr[0] = 0;
    return r;
}
unittest
{
    __m64 R = _mm_setzero_si64();
    assert(R.array[0] == 0);
}

/// Shift packed 16-bit integers in `a` left by `bits` while shifting in zeros.
deprecated("Use _mm_slli_pi16 instead.") __m64 _mm_sll_pi16 (__m64 a, __m64 bits) pure @safe
{
    return to_m64(_mm_sll_epi16(to_m128i(a), to_m128i(bits)));
}

/// Shift packed 32-bit integers in `a` left by `bits` while shifting in zeros.
deprecated("Use _mm_slli_pi32 instead.") __m64 _mm_sll_pi32 (__m64 a, __m64 bits) pure @safe
{
    return to_m64(_mm_sll_epi32(to_m128i(a), to_m128i(bits)));
}

/// Shift 64-bit integer `a` left by `bits` while shifting in zeros.
deprecated("Use _mm_slli_si64 instead.") __m64 _mm_sll_si64 (__m64 a, __m64 bits) pure @safe
{
    return to_m64(_mm_sll_epi64(to_m128i(a), to_m128i(bits)));
}

/// Shift packed 16-bit integers in `a` left by `imm8` while shifting in zeros.
__m64 _mm_slli_pi16 (__m64 a, int imm8) pure @safe
{
    return to_m64(_mm_slli_epi16(to_m128i(a), imm8));
}
unittest
{
    __m64 A = _mm_setr_pi16(-4, -5, 6, 7);
    short4 B = cast(short4)( _mm_slli_pi16(A, 1) );
    short[4] correct = [ -8, -10, 12, 14 ];
    assert(B.array == correct);
}

/// Shift packed 32-bit integers in `a` left by `imm8` while shifting in zeros.
__m64 _mm_slli_pi32 (__m64 a, int imm8) pure @safe
{
    return to_m64(_mm_slli_epi32(to_m128i(a), imm8));
}
unittest
{
    __m64 A = _mm_setr_pi32(-4, 5);
    int2 B = cast(int2)( _mm_slli_pi32(A, 1) );
    int[2] correct = [ -8, 10 ];
    assert(B.array == correct);
}

/// Shift 64-bit integer `a` left by `imm8` while shifting in zeros.
__m64 _mm_slli_si64 (__m64 a, int imm8) pure @safe
{
    return to_m64(_mm_slli_epi64(to_m128i(a), imm8));
}
unittest
{
    __m64 A = _mm_cvtsi64_m64(-1);
    long1 R = cast(long1)( _mm_slli_si64(A, 1) );
    long[1] correct = [ -2 ];
    assert(R.array == correct);
}

/// Shift packed 16-bit integers in `a` right by `bits` while shifting in sign bits.
deprecated("Use _mm_srai_pi16 instead.") __m64 _mm_sra_pi16 (__m64 a, __m64 bits) pure @safe
{
    return to_m64(_mm_sra_epi16(to_m128i(a), to_m128i(bits)));
}

/// Shift packed 32-bit integers in `a` right by `bits` while shifting in sign bits.
deprecated("Use _mm_srai_pi32 instead.") __m64 _mm_sra_pi32 (__m64 a, __m64 bits) pure @safe
{
    return to_m64(_mm_sra_epi32(to_m128i(a), to_m128i(bits)));
}

/// Shift packed 16-bit integers in `a` right by `imm8` while shifting in sign bits.
__m64 _mm_srai_pi16 (__m64 a, int imm8) pure @safe
{
    return to_m64(_mm_srai_epi16(to_m128i(a), imm8));
}
unittest
{
    __m64 A = _mm_setr_pi16(-4, -5, 6, 7);
    short4 B = cast(short4)( _mm_srai_pi16(A, 1) );
    short[4] correct = [ -2, -3, 3, 3 ];
    assert(B.array == correct);
}

/// Shift packed 32-bit integers in `a` right by `imm8` while shifting in sign bits.
__m64 _mm_srai_pi32 (__m64 a, int imm8) pure @safe
{
    return to_m64(_mm_srai_epi32(to_m128i(a), imm8));
}
unittest
{
    __m64 A = _mm_setr_pi32(-4, 5);
    int2 B = cast(int2)( _mm_srai_pi32(A, 1) );
    int[2] correct = [ -2, 2 ];
    assert(B.array == correct);
}

/// Shift packed 16-bit integers in `a` right by `bits` while shifting in zeros.
deprecated("Use _mm_srli_pi16 instead.") __m64 _mm_srl_pi16 (__m64 a, __m64 bits) pure @safe
{
    return to_m64(_mm_srl_epi16(to_m128i(a), to_m128i(bits)));
}

/// Shift packed 32-bit integers in `a` right by `bits` while shifting in zeros.
deprecated("Use _mm_srli_pi32 instead.") __m64 _mm_srl_pi32 (__m64 a, __m64 bits) pure @safe
{
    return to_m64(_mm_srl_epi32(to_m128i(a), to_m128i(bits)));
}

/// Shift 64-bit integer `a` right by `bits` while shifting in zeros.
deprecated("Use _mm_srli_si64 instead.") __m64 _mm_srl_si64 (__m64 a, __m64 bits) pure @safe
{
    return to_m64(_mm_srl_epi64(to_m128i(a), to_m128i(bits)));
}
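
// Added illustration (not part of the Intel API): for negative inputs, the arithmetic right
// shifts above keep the sign bit, while the logical right shifts below shift in zeroes.
unittest
{
    __m64 A = _mm_set1_pi16(-2);
    short4 arith = cast(short4) _mm_srai_pi16(A, 1); // sign-extending: -2 >> 1 == -1
    short4 logic = cast(short4) _mm_srli_pi16(A, 1); // zero-filling: 0xFFFE >> 1 == 0x7FFF
    short[4] correctArith = [-1, -1, -1, -1];
    short[4] correctLogic = [0x7fff, 0x7fff, 0x7fff, 0x7fff];
    assert(arith.array == correctArith);
    assert(logic.array == correctLogic);
}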

/// Shift packed 16-bit integers in `a` right by `imm8` while shifting in zeros.
__m64 _mm_srli_pi16 (__m64 a, int imm8) pure @safe
{
    return to_m64(_mm_srli_epi16(to_m128i(a), imm8));
}
unittest
{
    __m64 A = _mm_setr_pi16(-4, -5, 6, 7);
    short4 B = cast(short4)( _mm_srli_pi16(A, 1) );
    short[4] correct = [ 0x7ffe, 0x7ffd, 3, 3 ];
    assert(B.array == correct);
}

/// Shift packed 32-bit integers in `a` right by `imm8` while shifting in zeros.
__m64 _mm_srli_pi32 (__m64 a, int imm8) pure @safe
{
    return to_m64(_mm_srli_epi32(to_m128i(a), imm8));
}
unittest
{
    __m64 A = _mm_setr_pi32(-4, 5);
    int2 B = cast(int2)( _mm_srli_pi32(A, 1) );
    int[2] correct = [ 0x7ffffffe, 2 ];
    assert(B.array == correct);
}

/// Shift 64-bit integer `a` right by `imm8` while shifting in zeros.
__m64 _mm_srli_si64 (__m64 a, int imm8) pure @safe
{
    return to_m64(_mm_srli_epi64(to_m128i(a), imm8));
}
unittest
{
    __m64 A = _mm_cvtsi64_m64(-1);
    long1 R = cast(long1)( _mm_srli_si64(A, 1) );
    long[1] correct = [ 0x7fff_ffff_ffff_ffff ];
    assert(R.array == correct);
}

/// Subtract packed 16-bit integers in `b` from packed 16-bit integers in `a`.
__m64 _mm_sub_pi16 (__m64 a, __m64 b) pure @safe
{
    return cast(__m64)(cast(short4)a - cast(short4)b);
}
unittest
{
    short4 R = cast(short4) _mm_sub_pi16(_mm_setr_pi16(cast(short)65534, 1, 5, -32768),
                                         _mm_setr_pi16(cast(short)65535, 16, 4, 4));
    static immutable short[4] correct = [ -1, -15, 1, 32764];
    assert(R.array == correct);
}

/// Subtract packed 32-bit integers in `b` from packed 32-bit integers in `a`.
__m64 _mm_sub_pi32 (__m64 a, __m64 b) pure @safe
{
    return cast(__m64)(cast(int2)a - cast(int2)b);
}
unittest
{
    int2 R = cast(int2) _mm_sub_pi32(_mm_setr_pi32( 10, 4),
                                     _mm_setr_pi32( 15, -70));
    static immutable int[2] correct = [ -5, 74];
    assert(R.array == correct);
}

/// Subtract packed 8-bit integers in `b` from packed 8-bit integers in `a`.
__m64 _mm_sub_pi8 (__m64 a, __m64 b) pure @safe
{
    return cast(__m64)(cast(byte8)a - cast(byte8)b);
}
unittest
{
    byte8 R = cast(byte8) _mm_sub_pi8(_mm_setr_pi8(cast(byte)254, 127, 13, 12, 11, 10, 9, -128),
                                      _mm_setr_pi8(cast(byte)255, 120, 14, 42, 11, 10, 9, 8));
    static immutable byte[8] correct = [ -1, 7, -1, -30, 0, 0, 0, 120 ];
    assert(R.array == correct);
}

/// Subtract packed 16-bit integers in `b` from packed 16-bit integers in `a` using saturation.
__m64 _mm_subs_pi16 (__m64 a, __m64 b) pure @safe
{
    return to_m64(_mm_subs_epi16(to_m128i(a), to_m128i(b)));
}
unittest
{
    short4 R = cast(short4) _mm_subs_pi16(_mm_setr_pi16(cast(short)65534, 1, 5, -32768),
                                          _mm_setr_pi16(cast(short)65535, 16, 4, 4));
    static immutable short[4] correct = [ -1, -15, 1, -32768];
    assert(R.array == correct);
}

/// Subtract packed 8-bit integers in `b` from packed 8-bit integers in `a` using saturation.
__m64 _mm_subs_pi8 (__m64 a, __m64 b) pure @safe
{
    return to_m64(_mm_subs_epi8(to_m128i(a), to_m128i(b)));
}
unittest
{
    byte8 R = cast(byte8) _mm_subs_pi8(_mm_setr_pi8(cast(byte)254, 127, 13, 12, 11, 10, 9, -128),
                                       _mm_setr_pi8(cast(byte)255, 120, 14, 42, 11, 10, 9, 8));
    static immutable byte[8] correct = [ -1, 7, -1, -30, 0, 0, 0, -128 ];
    assert(R.array == correct);
}

/// Subtract packed unsigned 16-bit integers in `b` from packed unsigned 16-bit integers in `a`
/// using saturation.
__m64 _mm_subs_pu16 (__m64 a, __m64 b) pure @safe
{
    return to_m64(_mm_subs_epu16(to_m128i(a), to_m128i(b)));
}
unittest
{
    short4 R = cast(short4) _mm_subs_pu16(_mm_setr_pi16(cast(short)65534, 1, 5, 4),
                                          _mm_setr_pi16(cast(short)65535, 16, 4, 4));
    static immutable short[4] correct = [ 0, 0, 1, 0];
    assert(R.array == correct);
}

/// Subtract packed unsigned 8-bit integers in `b` from packed unsigned 8-bit integers in `a`
/// using saturation.
__m64 _mm_subs_pu8 (__m64 a, __m64 b) pure @safe
{
    return to_m64(_mm_subs_epu8(to_m128i(a), to_m128i(b)));
}
unittest
{
    byte8 R = cast(byte8) _mm_subs_pu8(_mm_setr_pi8(cast(byte)254, 127, 13, 12, 11, 10, 9, 8),
                                       _mm_setr_pi8(cast(byte)255, 120, 14, 42, 11, 10, 9, 8));
    static immutable byte[8] correct = [ 0, 7, 0, 0, 0, 0, 0, 0 ];
    assert(R.array == correct);
}

deprecated alias _m_to_int = _mm_cvtsi64_si32;  /// Deprecated intrinsics.
deprecated alias _m_to_int64 = _mm_cvtm64_si64; ///ditto

/// Unpack and interleave 16-bit integers from the high half of `a` and `b`.
__m64 _mm_unpackhi_pi16 (__m64 a, __m64 b) pure @trusted
{
    version(LDC)
    {
        // avoiding this shufflevector leads to bad performance on LDC
        return cast(__m64) shufflevector!(short4, 2, 6, 3, 7)(cast(short4)a, cast(short4)b);
    }
    else
    {
        short4 ia = cast(short4)a;
        short4 ib = cast(short4)b;
        short4 r;
        r.ptr[0] = ia.array[2];
        r.ptr[1] = ib.array[2];
        r.ptr[2] = ia.array[3];
        r.ptr[3] = ib.array[3];
        return cast(__m64)r;
    }
}
unittest
{
    __m64 A = _mm_setr_pi16(4, 8, -16, 7);
    __m64 B = _mm_setr_pi16(5, 9, -3, 10);
    short4 R = cast(short4) _mm_unpackhi_pi16(A, B);
    short[4] correct = [-16, -3, 7, 10];
    assert(R.array == correct);
}

/// Unpack and interleave 32-bit integers from the high half of `a` and `b`.
__m64 _mm_unpackhi_pi32 (__m64 a, __m64 b) pure @trusted
{
    // Generate punpckldq as far back as LDC 1.0.0 -O1
    // (Yes, LLVM does generate punpckldq to reuse SSE2 instructions)
    int2 ia = cast(int2)a;
    int2 ib = cast(int2)b;
    int2 r;
    r.ptr[0] = ia.array[1];
    r.ptr[1] = ib.array[1];
    return cast(__m64)r;
}
unittest
{
    __m64 A = _mm_setr_pi32(4, 8);
    __m64 B = _mm_setr_pi32(5, 9);
    int2 R = cast(int2) _mm_unpackhi_pi32(A, B);
    int[2] correct = [8, 9];
    assert(R.array == correct);
}

/// Unpack and interleave 8-bit integers from the high half of `a` and `b`.
__m64 _mm_unpackhi_pi8 (__m64 a, __m64 b)
{
    version(LDC)
    {
        return cast(__m64) shufflevector!(byte8, 4, 12, 5, 13, 6, 14, 7, 15)(cast(byte8)a, cast(byte8)b);
    }
    else
    {
        byte8 ia = cast(byte8)a;
        byte8 ib = cast(byte8)b;
        byte8 r;
        r.ptr[0] = ia.array[4];
        r.ptr[1] = ib.array[4];
        r.ptr[2] = ia.array[5];
        r.ptr[3] = ib.array[5];
        r.ptr[4] = ia.array[6];
        r.ptr[5] = ib.array[6];
        r.ptr[6] = ia.array[7];
        r.ptr[7] = ib.array[7];
        return cast(__m64)r;
    }
}
unittest
{
    __m64 A = _mm_setr_pi8( 1,  2,  3,  4,  5,  6,  7,  8);
    __m64 B = _mm_setr_pi8(-1, -2, -3, -4, -5, -6, -7, -8);
    byte8 R = cast(byte8) _mm_unpackhi_pi8(A, B);
    byte[8] correct = [5, -5, 6, -6, 7, -7, 8, -8];
    assert(R.array == correct);
}

/// Unpack and interleave 16-bit integers from the low half of `a` and `b`.
__m64 _mm_unpacklo_pi16 (__m64 a, __m64 b)
{
    // Generates punpcklwd since LDC 1.0.0 -O1
    short4 ia = cast(short4)a;
    short4 ib = cast(short4)b;
    short4 r;
    r.ptr[0] = ia.array[0];
    r.ptr[1] = ib.array[0];
    r.ptr[2] = ia.array[1];
    r.ptr[3] = ib.array[1];
    return cast(__m64)r;
}
unittest
{
    __m64 A = _mm_setr_pi16(4, 8, -16, 7);
    __m64 B = _mm_setr_pi16(5, 9, -3, 10);
    short4 R = cast(short4) _mm_unpacklo_pi16(A, B);
    short[4] correct = [4, 5, 8, 9];
    assert(R.array == correct);
}

/// Unpack and interleave 32-bit integers from the low half of `a` and `b`.
__m64 _mm_unpacklo_pi32 (__m64 a, __m64 b) pure @trusted
{
    // x86: Generate punpckldq as far back as LDC 1.0.0 -O1
    // ARM: Generate zip as far back as LDC 1.8.0 -O1
    int2 ia = cast(int2)a;
    int2 ib = cast(int2)b;
    int2 r;
    r.ptr[0] = ia.array[0];
    r.ptr[1] = ib.array[0];
    return cast(__m64)r;
}
unittest
{
    __m64 A = _mm_setr_pi32(4, 8);
    __m64 B = _mm_setr_pi32(5, 9);
    int2 R = cast(int2) _mm_unpacklo_pi32(A, B);
    int[2] correct = [4, 5];
    assert(R.array == correct);
}

/// Unpack and interleave 8-bit integers from the low half of `a` and `b`.
__m64 _mm_unpacklo_pi8 (__m64 a, __m64 b)
{
    version(LDC)
    {
        return cast(__m64) shufflevector!(byte8, 0, 8, 1, 9, 2, 10, 3, 11)(cast(byte8)a, cast(byte8)b);
    }
    else
    {
        byte8 ia = cast(byte8)a;
        byte8 ib = cast(byte8)b;
        byte8 r;
        r.ptr[0] = ia.array[0];
        r.ptr[1] = ib.array[0];
        r.ptr[2] = ia.array[1];
        r.ptr[3] = ib.array[1];
        r.ptr[4] = ia.array[2];
        r.ptr[5] = ib.array[2];
        r.ptr[6] = ia.array[3];
        r.ptr[7] = ib.array[3];
        return cast(__m64)r;
    }
}
unittest
{
    __m64 A = _mm_setr_pi8( 1,  2,  3,  4,  5,  6,  7,  8);
    __m64 B = _mm_setr_pi8(-1, -2, -3, -4, -5, -6, -7, -8);
    byte8 R = cast(byte8) _mm_unpacklo_pi8(A, B);
    byte[8] correct = [1, -1, 2, -2, 3, -3, 4, -4];
    assert(R.array == correct);
}

/// Compute the bitwise XOR of 64 bits (representing integer data) in `a` and `b`.
__m64 _mm_xor_si64 (__m64 a, __m64 b)
{
    return a ^ b;
}
unittest
{
    __m64 A = _mm_setr_pi16(255, 1, -1, 0);
    __m64 B = _mm_set1_pi16(15);
    short4 R = cast(short4)_mm_xor_si64(A, B);
    short[4] correct = [240, 14, -16, 15];
    assert(R.array == correct);
}
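
// Added illustration (not part of the Intel API): a typical MMX-era pipeline on a
// little-endian target: widen unsigned bytes with a zero interleave, add, halve, then pack
// back down with unsigned saturation. This is only a sketch of how the intrinsics above compose.
unittest
{
    __m64 zero = _mm_setzero_si64();
    __m64 a = _mm_setr_pi8(cast(byte)200, 60, 10, 0, 0, 0, 0, 0);
    __m64 b = _mm_setr_pi8(cast(byte)100, 20, 30, 0, 0, 0, 0, 0);
    __m64 wa = _mm_unpacklo_pi8(a, zero);                // unsigned bytes -> 16-bit lanes
    __m64 wb = _mm_unpacklo_pi8(b, zero);
    __m64 avg = _mm_srli_pi16(_mm_add_pi16(wa, wb), 1);  // (a + b) / 2 per lane
    byte8 packed = cast(byte8) _mm_packs_pu16(avg, zero);
    byte[8] correct = [cast(byte)150, 40, 20, 0, 0, 0, 0, 0];
    assert(packed.array == correct);
}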