/**
* MMX intrinsics.
*
* Copyright: Copyright Auburn Sounds 2019.
* License:   $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
* Authors:   Guillaume Piolat
* Macros:
*      GUIDE = https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=$0
*
*/
module inteli.mmx;

public import inteli.types;
import inteli.internals;

import inteli.xmmintrin;
import inteli.emmintrin;

nothrow @nogc:

// Important: you don't need to call _mm_empty when using the "MMX" capabilities of intel-intrinsics,
// since it just generates the right IR and cleaning up FPU registers is up to the codegen.
// intel-intrinsics is just semantics.


/// Add packed 16-bit integers in `a` and `b`.
__m64 _mm_add_pi16 (__m64 a, __m64 b)
{
    return cast(__m64)(cast(short4)a + cast(short4)b);
}
unittest
{
    short4 R = cast(short4) _mm_add_pi16(_mm_set1_pi16(4), _mm_set1_pi16(3));
    short[4] correct = [7, 7, 7, 7];
    assert(R.array == correct);
}

/// Add packed 32-bit integers in `a` and `b`.
__m64 _mm_add_pi32 (__m64 a, __m64 b)
{
    return cast(__m64)(cast(int2)a + cast(int2)b);
}
unittest
{
    int2 R = cast(int2) _mm_add_pi32(_mm_set1_pi32(4), _mm_set1_pi32(3));
    int[2] correct = [7, 7];
    assert(R.array == correct);
}

/// Add packed 8-bit integers in `a` and `b`.
__m64 _mm_add_pi8 (__m64 a, __m64 b)
{
    return cast(__m64)(cast(byte8)a + cast(byte8)b);
}
unittest
{
    byte8 R = cast(byte8) _mm_add_pi8(_mm_set1_pi8(127), _mm_set1_pi8(-128));
    byte[8] correct = [-1, -1, -1, -1, -1, -1, -1, -1];
    assert(R.array == correct);
}

/// Add packed 16-bit integers in `a` and `b` using signed saturation.
// PERF: PADDSW not generated
__m64 _mm_adds_pi16(__m64 a, __m64 b) pure @trusted
{
    return to_m64(_mm_adds_epi16(to_m128i(a), to_m128i(b)));
}
unittest
{
    short4 res = cast(short4) _mm_adds_pi16(_mm_set_pi16(3, 2, 1, 0),
                                            _mm_set_pi16(3, 2, 1, 0));
    static immutable short[4] correctResult = [0, 2, 4, 6];
    assert(res.array == correctResult);
}

/// Add packed 8-bit integers in `a` and `b` using signed saturation.
// PERF: PADDSB not generated
__m64 _mm_adds_pi8(__m64 a, __m64 b) pure @trusted
{
    return to_m64(_mm_adds_epi8(to_m128i(a), to_m128i(b)));
}
unittest
{
    byte8 res = cast(byte8) _mm_adds_pi8(_mm_set_pi8(7, 6, 5, 4, 3, 2, 1, 0),
                                         _mm_set_pi8(7, 6, 5, 4, 3, 2, 1, 0));
    static immutable byte[8] correctResult = [0, 2, 4, 6, 8, 10, 12, 14];
    assert(res.array == correctResult);
}

/// Add packed 16-bit integers in `a` and `b` using unsigned saturation.
// PERF: PADDUSW not generated
__m64 _mm_adds_pu16(__m64 a, __m64 b) pure @trusted
{
    return to_m64(_mm_adds_epu16(to_m128i(a), to_m128i(b)));
}
unittest
{
    short4 res = cast(short4) _mm_adds_pu16(_mm_set_pi16(3, 2, cast(short)65535, 0),
                                            _mm_set_pi16(3, 2, 1, 0));
    static immutable short[4] correctResult = [0, cast(short)65535, 4, 6];
    assert(res.array == correctResult);
}

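// Additional example (not part of the original test suite): the unittests above only exercise
// in-range sums, so this hedged sketch shows the saturating behaviour itself, i.e. results are
// clamped to the signed 16-bit range instead of wrapping around.
unittest
{
    short4 R = cast(short4) _mm_adds_pi16(_mm_set1_pi16(30000), _mm_set1_pi16(30000));
    short[4] correct = [32767, 32767, 32767, 32767]; // 60000 saturates to short.max
    assert(R.array == correct);
}
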
/// Add packed 8-bit integers in `a` and `b` using unsigned saturation.
// PERF: PADDUSB not generated
__m64 _mm_adds_pu8(__m64 a, __m64 b) pure @trusted
{
    return to_m64(_mm_adds_epu8(to_m128i(a), to_m128i(b)));
}
unittest
{
    byte8 res = cast(byte8) _mm_adds_pu8(_mm_set_pi8(7, 6, 5, 4, 3, 2, cast(byte)255, 0),
                                         _mm_set_pi8(7, 6, 5, 4, 3, 2, 1, 0));
    static immutable byte[8] correctResult = [0, cast(byte)255, 4, 6, 8, 10, 12, 14];
    assert(res.array == correctResult);
}

/// Compute the bitwise AND of 64 bits (representing integer data) in `a` and `b`.
__m64 _mm_and_si64 (__m64 a, __m64 b) pure @safe
{
    return a & b;
}
unittest
{
    __m64 A = [7];
    __m64 B = [14];
    __m64 R = _mm_and_si64(A, B);
    assert(R.array[0] == 6);
}

/// Compute the bitwise NOT of 64 bits (representing integer data) in `a` and then AND with `b`.
__m64 _mm_andnot_si64 (__m64 a, __m64 b)
{
    return (~a) & b;
}
unittest
{
    __m64 A = [7];
    __m64 B = [14];
    __m64 R = _mm_andnot_si64(A, B);
    assert(R.array[0] == 8);
}

/// Compare packed 16-bit integers in `a` and `b` for equality.
__m64 _mm_cmpeq_pi16 (__m64 a, __m64 b) pure @safe
{
    static if (GDC_with_MMX)
    {
        return cast(__m64) __builtin_ia32_pcmpeqw(cast(short4)a, cast(short4)b);
    }
    else
    {
        return cast(__m64) equalMask!short4(cast(short4)a, cast(short4)b);
    }
}
unittest
{
    short4 A = [-3, -2, -1, 0];
    short4 B = [ 4,  3,  2, 1];
    short[4] E = [ 0,  0,  0, 0];
    short4 R = cast(short4)(_mm_cmpeq_pi16(cast(__m64)A, cast(__m64)B));
    assert(R.array == E);
}

/// Compare packed 32-bit integers in `a` and `b` for equality.
__m64 _mm_cmpeq_pi32 (__m64 a, __m64 b) pure @safe
{
    static if (GDC_with_MMX)
    {
        return cast(__m64) __builtin_ia32_pcmpeqd(cast(int2)a, cast(int2)b);
    }
    else
    {
        return cast(__m64) equalMask!int2(cast(int2)a, cast(int2)b);
    }
}
unittest
{
    int2 A = [-3, -2];
    int2 B = [ 4, -2];
    int[2] E = [ 0, -1];
    int2 R = cast(int2)(_mm_cmpeq_pi32(cast(__m64)A, cast(__m64)B));
    assert(R.array == E);
}

/// Compare packed 8-bit integers in `a` and `b` for equality.
__m64 _mm_cmpeq_pi8 (__m64 a, __m64 b) pure @safe
{
    static if (GDC_with_MMX)
    {
        return cast(__m64) __builtin_ia32_pcmpeqb(cast(byte8)a, cast(byte8)b);
    }
    else
    {
        return cast(__m64) equalMask!byte8(cast(byte8)a, cast(byte8)b);
    }
}
unittest
{
    __m64 A = _mm_setr_pi8(1, 2, 3, 1, 2, 1, 1, 2);
    __m64 B = _mm_setr_pi8(2, 2, 1, 2, 3, 1, 2, 3);
    byte8 C = cast(byte8) _mm_cmpeq_pi8(A, B);
    byte[8] correct = [0,-1, 0, 0, 0,-1, 0, 0];
    assert(C.array == correct);
}

/// Compare packed 16-bit integers in `a` and `b` for greater-than.
__m64 _mm_cmpgt_pi16 (__m64 a, __m64 b) pure @safe
{
    static if (GDC_with_MMX)
    {
        return cast(__m64) __builtin_ia32_pcmpgtw (cast(short4)a, cast(short4)b);
    }
    else
    {
        return cast(__m64) greaterMask!short4(cast(short4)a, cast(short4)b);
    }
}
unittest
{
    short4 A = [-3, -2, -1, 0];
    short4 B = [ 4,  3,  2, 1];
    short[4] E = [ 0,  0,  0, 0];
    short4 R = cast(short4)(_mm_cmpgt_pi16(cast(__m64)A, cast(__m64)B));
    assert(R.array == E);
}

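// Additional example (not part of the original test suite): a hedged sketch of the classic
// compare-and-blend idiom. The all-ones/all-zeros masks returned by the comparisons combine
// with _mm_and_si64/_mm_andnot_si64/_mm_or_si64 to select elements, here a per-element max.
unittest
{
    __m64 a    = _mm_setr_pi16(1, -5,  30000, 7);
    __m64 b    = _mm_setr_pi16(2, -7, -30000, 7);
    __m64 mask = _mm_cmpgt_pi16(a, b);                    // -1 where a > b, else 0
    __m64 maxv = _mm_or_si64(_mm_and_si64(mask, a),       // take a where a > b
                             _mm_andnot_si64(mask, b));   // take b elsewhere
    short[4] correct = [2, -5, 30000, 7];
    assert((cast(short4)maxv).array == correct);
}
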
/// Compare packed 32-bit integers in `a` and `b` for greater-than.
__m64 _mm_cmpgt_pi32 (__m64 a, __m64 b) pure @safe
{
    static if (GDC_with_MMX)
    {
        return cast(__m64) __builtin_ia32_pcmpgtd (cast(int2)a, cast(int2)b);
    }
    else
    {
        return cast(__m64) greaterMask!int2(cast(int2)a, cast(int2)b);
    }
}
unittest
{
    int2 A = [-3,  2];
    int2 B = [ 4, -2];
    int[2] E = [ 0, -1];
    int2 R = cast(int2)(_mm_cmpgt_pi32(cast(__m64)A, cast(__m64)B));
    assert(R.array == E);
}

/// Compare packed 8-bit integers in `a` and `b` for greater-than.
__m64 _mm_cmpgt_pi8 (__m64 a, __m64 b) pure @safe
{
    static if (GDC_with_MMX)
    {
        return cast(__m64) __builtin_ia32_pcmpgtb (cast(byte8)a, cast(byte8)b);
    }
    else
    {
        return cast(__m64) greaterMask!byte8(cast(byte8)a, cast(byte8)b);
    }
}
unittest
{
    __m64 A = _mm_setr_pi8(1, 2, 3, 1, 2, 1, 1, 2);
    __m64 B = _mm_setr_pi8(2, 2, 1, 2, 3, 1, 2, 3);
    byte8 C = cast(byte8) _mm_cmpgt_pi8(A, B);
    byte[8] correct = [0, 0,-1, 0, 0, 0, 0, 0];
    assert(C.array == correct);
}

/// Copy 64-bit integer `a` to `dst`.
long _mm_cvtm64_si64 (__m64 a) pure @safe
{
    long1 la = cast(long1)a;
    return la.array[0];
}
unittest
{
    __m64 A = _mm_setr_pi32(2, 1);
    long1 lA = cast(long1)A;
    assert(lA.array[0] == 0x100000002);
}

/// Copy 32-bit integer `a` to the lower elements of `dst`, and zero the upper element of `dst`.
__m64 _mm_cvtsi32_si64 (int a) pure @trusted
{
    __m64 r = void;
    r.ptr[0] = a;
    return r;
}
unittest
{
    __m64 R = _mm_cvtsi32_si64(-1);
    assert(R.array[0] == -1);
}

/// Copy 64-bit integer `a` to `dst`.
__m64 _mm_cvtsi64_m64 (long a) pure @trusted
{
    __m64 r = void;
    r.ptr[0] = a;
    return r;
}
unittest
{
    __m64 R = _mm_cvtsi64_m64(0x123456789A);
    assert(R.array[0] == 0x123456789A);
}

/// Get the lower 32-bit integer in `a`.
int _mm_cvtsi64_si32 (__m64 a) pure @safe
{
    int2 r = cast(int2)a;
    return r.array[0];
}
unittest
{
    __m64 A = _mm_setr_pi32(-6, 5);
    int R = _mm_cvtsi64_si32(A);
    assert(R == -6);
}

/// Empty the MMX state, which marks the x87 FPU registers as available for
/// use by x87 instructions.
/// This instruction is supposed to be used at the end of all MMX technology procedures.
/// This is useless when using `intel-intrinsics`, at least with LDC and DMD.
void _mm_empty() pure @safe
{
    // do nothing, see comment on top of file
    // TODO: not sure for GDC, do something?
}


deprecated alias _m_empty = _mm_empty; /// Deprecated intrinsics.
deprecated alias _m_from_int = _mm_cvtsi32_si64; ///ditto
deprecated alias _m_from_int64 = _mm_cvtsi64_m64; ///ditto

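// Additional example (not part of the original test suite): a hedged illustration of the note
// at the top of this module and on `_mm_empty`: "MMX" results and ordinary floating-point code
// can be mixed freely, with no _mm_empty() call in between.
unittest
{
    __m64 S = _mm_add_pi16(_mm_set1_pi16(2), _mm_set1_pi16(3)); // "MMX" work
    double x = 21.0 * 2.0;                                      // float work right after, no _mm_empty needed
    assert(_mm_cvtsi64_si32(S) == 0x0005_0005);
    assert(x == 42.0);
}
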
/// Multiply packed 16-bit integers in `a` and `b`, producing intermediate 32-bit integers.
/// Horizontally add adjacent pairs of intermediate 32-bit integers, and pack the results.
__m64 _mm_madd_pi16 (__m64 a, __m64 b) pure @safe
{
    return to_m64(_mm_madd_epi16(to_m128i(a), to_m128i(b)));
}
unittest
{
    short4 A = [-32768, -32768, 32767, 32767];
    short4 B = [-32768, -32768, 32767, 32767];
    int2 R = cast(int2) _mm_madd_pi16(cast(__m64)A, cast(__m64)B);
    int[2] correct = [-2147483648, 2*32767*32767];
    assert(R.array == correct);
}

/// Multiply the packed 16-bit integers in `a` and `b`, producing intermediate 32-bit integers,
/// and store the high 16 bits of the intermediate integers.
__m64 _mm_mulhi_pi16 (__m64 a, __m64 b) pure @safe
{
    return to_m64(_mm_mulhi_epi16(to_m128i(a), to_m128i(b)));
}
unittest
{
    __m64 A = _mm_setr_pi16(4, 8, -16, 7);
    __m64 B = _mm_set1_pi16(16384);
    short4 R = cast(short4)_mm_mulhi_pi16(A, B);
    short[4] correct = [1, 2, -4, 1];
    assert(R.array == correct);
}

/// Multiply the packed 16-bit integers in `a` and `b`, producing intermediate 32-bit integers,
/// and store the low 16 bits of the intermediate integers.
__m64 _mm_mullo_pi16 (__m64 a, __m64 b) pure @safe
{
    return to_m64(_mm_mullo_epi16(to_m128i(a), to_m128i(b)));
}
unittest
{
    __m64 A = _mm_setr_pi16(4, 1, 16, 7);
    __m64 B = _mm_set1_pi16(16384);
    short4 R = cast(short4)_mm_mullo_pi16(A, B);
    short[4] correct = [0, 16384, 0, -16384];
    assert(R.array == correct);
}

/// Compute the bitwise OR of 64 bits in `a` and `b`.
__m64 _mm_or_si64 (__m64 a, __m64 b) pure @safe
{
    return a | b;
}
unittest
{
    __m64 A = _mm_setr_pi16(255, 1, -1, 0);
    __m64 B = _mm_set1_pi16(15);
    short4 R = cast(short4)_mm_or_si64(A, B);
    short[4] correct = [255, 15, -1, 15];
    assert(R.array == correct);
}

/// Convert packed 16-bit integers from `a` and `b` to packed 8-bit integers using signed saturation.
__m64 _mm_packs_pi16 (__m64 a, __m64 b) pure @trusted
{
    int4 p = cast(int4) _mm_packs_epi16(to_m128i(a), to_m128i(b));
    int2 r;
    r.ptr[0] = p.array[0];
    r.ptr[1] = p.array[2];
    return cast(__m64)r;
}
unittest
{
    __m64 A = _mm_setr_pi16(256, -129, 254, 0);
    byte8 R = cast(byte8) _mm_packs_pi16(A, A);
    byte[8] correct = [127, -128, 127, 0, 127, -128, 127, 0];
    assert(R.array == correct);
}

/// Convert packed 32-bit integers from `a` and `b` to packed 16-bit integers using signed saturation.
__m64 _mm_packs_pi32 (__m64 a, __m64 b) pure @trusted
{
    int4 p = cast(int4) _mm_packs_epi32(to_m128i(a), to_m128i(b));
    int2 r;
    r.ptr[0] = p.array[0];
    r.ptr[1] = p.array[2];
    return cast(__m64)r;
}
unittest
{
    __m64 A = _mm_setr_pi32(100000, -100000);
    short4 R = cast(short4) _mm_packs_pi32(A, A);
    short[4] correct = [32767, -32768, 32767, -32768];
    assert(R.array == correct);
}

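// Additional example (not part of the original test suite): a hedged sketch of the usual
// dot-product use of PMADDWD, where the two horizontal sums produced by _mm_madd_pi16 are
// then added together to reduce four 16-bit products to a single integer.
unittest
{
    __m64 a = _mm_setr_pi16(1, 2, 3, 4);
    __m64 b = _mm_setr_pi16(5, 6, 7, 8);
    int2 sums = cast(int2) _mm_madd_pi16(a, b); // [1*5 + 2*6, 3*7 + 4*8] = [17, 53]
    int dot = sums.array[0] + sums.array[1];
    assert(dot == 70);
}
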
/// Convert packed 16-bit integers from `a` and `b` to packed 8-bit integers using unsigned saturation.
__m64 _mm_packs_pu16 (__m64 a, __m64 b) pure @trusted
{
    int4 p = cast(int4) _mm_packus_epi16(to_m128i(a), to_m128i(b));
    int2 r;
    r.ptr[0] = p.array[0];
    r.ptr[1] = p.array[2];
    return cast(__m64)r;
}
unittest
{
    __m64 A = _mm_setr_pi16(256, -129, 254, 0);
    byte8 R = cast(byte8) _mm_packs_pu16(A, A);
    ubyte[8] correct = [255, 0, 254, 0, 255, 0, 254, 0];
    assert(R.array == cast(byte[8])correct);
}

deprecated alias
    _m_packssdw = _mm_packs_pi32, /// Deprecated intrinsics.
    _m_packsswb = _mm_packs_pi16, ///ditto
    _m_packuswb = _mm_packs_pu16, ///ditto
    _m_paddb = _mm_add_pi8, ///ditto
    _m_paddd = _mm_add_pi32, ///ditto
    _m_paddsb = _mm_adds_pi8, ///ditto
    _m_paddsw = _mm_adds_pi16, ///ditto
    _m_paddusb = _mm_adds_pu8, ///ditto
    _m_paddusw = _mm_adds_pu16, ///ditto
    _m_paddw = _mm_add_pi16, ///ditto
    _m_pand = _mm_and_si64, ///ditto
    _m_pandn = _mm_andnot_si64, ///ditto
    _m_pcmpeqb = _mm_cmpeq_pi8, ///ditto
    _m_pcmpeqd = _mm_cmpeq_pi32, ///ditto
    _m_pcmpeqw = _mm_cmpeq_pi16, ///ditto
    _m_pcmpgtb = _mm_cmpgt_pi8, ///ditto
    _m_pcmpgtd = _mm_cmpgt_pi32, ///ditto
    _m_pcmpgtw = _mm_cmpgt_pi16, ///ditto
    _m_pmaddwd = _mm_madd_pi16, ///ditto
    _m_pmulhw = _mm_mulhi_pi16, ///ditto
    _m_pmullw = _mm_mullo_pi16, ///ditto
    _m_por = _mm_or_si64, ///ditto
    _m_pslld = _mm_sll_pi32, ///ditto
    _m_pslldi = _mm_slli_pi32, ///ditto
    _m_psllq = _mm_sll_si64, ///ditto
    _m_psllqi = _mm_slli_si64, ///ditto
    _m_psllw = _mm_sll_pi16, ///ditto
    _m_psllwi = _mm_slli_pi16, ///ditto
    _m_psrad = _mm_sra_pi32, ///ditto
    _m_psradi = _mm_srai_pi32, ///ditto
    _m_psraw = _mm_sra_pi16, ///ditto
    _m_psrawi = _mm_srai_pi16, ///ditto
    _m_psrld = _mm_srl_pi32, ///ditto
    _m_psrldi = _mm_srli_pi32, ///ditto
    _m_psrlq = _mm_srl_si64, ///ditto
    _m_psrlqi = _mm_srli_si64, ///ditto
    _m_psrlw = _mm_srl_pi16, ///ditto
    _m_psrlwi = _mm_srli_pi16, ///ditto
    _m_psubb = _mm_sub_pi8, ///ditto
    _m_psubd = _mm_sub_pi32, ///ditto
    _m_psubsb = _mm_subs_pi8, ///ditto
    _m_psubsw = _mm_subs_pi16, ///ditto
    _m_psubusb = _mm_subs_pu8, ///ditto
    _m_psubusw = _mm_subs_pu16, ///ditto
    _m_psubw = _mm_sub_pi16, ///ditto
    _m_punpckhbw = _mm_unpackhi_pi8, ///ditto
    _m_punpckhdq = _mm_unpackhi_pi32, ///ditto
    _m_punpckhwd = _mm_unpackhi_pi16, ///ditto
    _m_punpcklbw = _mm_unpacklo_pi8, ///ditto
    _m_punpckldq = _mm_unpacklo_pi32, ///ditto
    _m_punpcklwd = _mm_unpacklo_pi16, ///ditto
    _m_pxor = _mm_xor_si64; ///ditto

/// Set packed 16-bit integers with the supplied values.
__m64 _mm_set_pi16 (short e3, short e2, short e1, short e0) pure @trusted
{
    short[4] arr = [e0, e1, e2, e3];
    return *cast(__m64*)(arr.ptr);
}
unittest
{
    short4 R = cast(short4) _mm_set_pi16(3, 2, 1, 0);
    short[4] correct = [0, 1, 2, 3];
    assert(R.array == correct);
}

/// Set packed 32-bit integers with the supplied values.
__m64 _mm_set_pi32 (int e1, int e0) pure @trusted
{
    int[2] arr = [e0, e1];
    return *cast(__m64*)(arr.ptr);
}
unittest
{
    int2 R = cast(int2) _mm_set_pi32(1, 0);
    int[2] correct = [0, 1];
    assert(R.array == correct);
}

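// Additional example (not part of the original test suite): a hedged sketch of narrowing
// 32-bit integers all the way down to unsigned bytes by chaining _mm_packs_pi32 (signed
// saturation to 16-bit) with _mm_packs_pu16 (unsigned saturation to 8-bit).
unittest
{
    __m64 A = _mm_setr_pi32(200, 300);
    __m64 W = _mm_packs_pi32(A, A);              // shorts [200, 300, 200, 300]
    byte8 R = cast(byte8) _mm_packs_pu16(W, W);  // 300 saturates to 255
    ubyte[8] correct = [200, 255, 200, 255, 200, 255, 200, 255];
    assert(R.array == cast(byte[8])correct);
}
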
/// Set packed 8-bit integers with the supplied values.
__m64 _mm_set_pi8 (byte e7, byte e6, byte e5, byte e4, byte e3, byte e2, byte e1, byte e0) pure @trusted
{
    byte[8] arr = [e0, e1, e2, e3, e4, e5, e6, e7];
    return *cast(__m64*)(arr.ptr);
}
unittest
{
    byte8 R = cast(byte8) _mm_set_pi8(7, 6, 5, 4, 3, 2, 1, 0);
    byte[8] correct = [0, 1, 2, 3, 4, 5, 6, 7];
    assert(R.array == correct);
}

/// Broadcast 16-bit integer `a` to all elements.
__m64 _mm_set1_pi16 (short a) pure @trusted
{
    return cast(__m64)(short4(a));
}
unittest
{
    short4 R = cast(short4) _mm_set1_pi16(44);
    short[4] correct = [44, 44, 44, 44];
    assert(R.array == correct);
}

/// Broadcast 32-bit integer `a` to all elements.
__m64 _mm_set1_pi32 (int a) pure @trusted
{
    return cast(__m64)(int2(a));
}
unittest
{
    int2 R = cast(int2) _mm_set1_pi32(43);
    int[2] correct = [43, 43];
    assert(R.array == correct);
}

/// Broadcast 8-bit integer `a` to all elements.
__m64 _mm_set1_pi8 (byte a) pure @trusted
{
    return cast(__m64)(byte8(a));
}
unittest
{
    byte8 R = cast(byte8) _mm_set1_pi8(42);
    byte[8] correct = [42, 42, 42, 42, 42, 42, 42, 42];
    assert(R.array == correct);
}

/// Set packed 16-bit integers with the supplied values in reverse order.
__m64 _mm_setr_pi16 (short e3, short e2, short e1, short e0) pure @trusted
{
    short[4] arr = [e3, e2, e1, e0];
    return *cast(__m64*)(arr.ptr);
}
unittest
{
    short4 R = cast(short4) _mm_setr_pi16(0, 1, 2, 3);
    short[4] correct = [0, 1, 2, 3];
    assert(R.array == correct);
}

/// Set packed 32-bit integers with the supplied values in reverse order.
__m64 _mm_setr_pi32 (int e1, int e0) pure @trusted
{
    int[2] arr = [e1, e0];
    return *cast(__m64*)(arr.ptr);
}
unittest
{
    int2 R = cast(int2) _mm_setr_pi32(0, 1);
    int[2] correct = [0, 1];
    assert(R.array == correct);
}

/// Set packed 8-bit integers with the supplied values in reverse order.
__m64 _mm_setr_pi8 (byte e7, byte e6, byte e5, byte e4, byte e3, byte e2, byte e1, byte e0) pure @trusted
{
    byte[8] arr = [e7, e6, e5, e4, e3, e2, e1, e0];
    return *cast(__m64*)(arr.ptr);
}
unittest
{
    byte8 R = cast(byte8) _mm_setr_pi8(0, 1, 2, 3, 4, 5, 6, 7);
    byte[8] correct = [0, 1, 2, 3, 4, 5, 6, 7];
    assert(R.array == correct);
}

/// Return vector of type `__m64` with all elements set to zero.
__m64 _mm_setzero_si64 () pure @trusted
{
    __m64 r;
    r.ptr[0] = 0;
    return r;
}
unittest
{
    __m64 R = _mm_setzero_si64();
    assert(R.array[0] == 0);
}

/// Shift packed 16-bit integers in `a` left by `bits` while shifting in zeros.
deprecated("Use _mm_slli_pi16 instead.") __m64 _mm_sll_pi16 (__m64 a, __m64 bits) pure @safe
{
    return to_m64(_mm_sll_epi16(to_m128i(a), to_m128i(bits)));
}

/// Shift packed 32-bit integers in `a` left by `bits` while shifting in zeros.
deprecated("Use _mm_slli_pi32 instead.") __m64 _mm_sll_pi32 (__m64 a, __m64 bits) pure @safe
{
    return to_m64(_mm_sll_epi32(to_m128i(a), to_m128i(bits)));
}

/// Shift 64-bit integer `a` left by `bits` while shifting in zeros.
deprecated("Use _mm_slli_si64 instead.") __m64 _mm_sll_si64 (__m64 a, __m64 bits) pure @safe
{
    return to_m64(_mm_sll_epi64(to_m128i(a), to_m128i(bits)));
}

/// Shift packed 16-bit integers in `a` left by `imm8` while shifting in zeros.
__m64 _mm_slli_pi16 (__m64 a, int imm8) pure @safe
{
    return to_m64(_mm_slli_epi16(to_m128i(a), imm8));
}
unittest
{
    __m64 A = _mm_setr_pi16(-4, -5, 6, 7);
    short4 B = cast(short4)( _mm_slli_pi16(A, 1) );
    short[4] correct = [ -8, -10, 12, 14 ];
    assert(B.array == correct);
}

/// Shift packed 32-bit integers in `a` left by `imm8` while shifting in zeros.
__m64 _mm_slli_pi32 (__m64 a, int imm8) pure @safe
{
    return to_m64(_mm_slli_epi32(to_m128i(a), imm8));
}
unittest
{
    __m64 A = _mm_setr_pi32(-4, 5);
    int2 B = cast(int2)( _mm_slli_pi32(A, 1) );
    int[2] correct = [ -8, 10 ];
    assert(B.array == correct);
}

/// Shift 64-bit integer `a` left by `imm8` while shifting in zeros.
__m64 _mm_slli_si64 (__m64 a, int imm8) pure @safe
{
    return to_m64(_mm_slli_epi64(to_m128i(a), imm8));
}
unittest
{
    __m64 A = _mm_cvtsi64_m64(-1);
    long1 R = cast(long1)( _mm_slli_si64(A, 1) );
    long[1] correct = [ -2 ];
    assert(R.array == correct);
}

/// Shift packed 16-bit integers in `a` right by `bits` while shifting in sign bits.
deprecated("Use _mm_srai_pi16 instead.") __m64 _mm_sra_pi16 (__m64 a, __m64 bits) pure @safe
{
    return to_m64(_mm_sra_epi16(to_m128i(a), to_m128i(bits)));
}

/// Shift packed 32-bit integers in `a` right by `bits` while shifting in sign bits.
deprecated("Use _mm_srai_pi32 instead.") __m64 _mm_sra_pi32 (__m64 a, __m64 bits) pure @safe
{
    return to_m64(_mm_sra_epi32(to_m128i(a), to_m128i(bits)));
}

/// Shift packed 16-bit integers in `a` right by `imm8` while shifting in sign bits.
__m64 _mm_srai_pi16 (__m64 a, int imm8) pure @safe
{
    return to_m64(_mm_srai_epi16(to_m128i(a), imm8));
}
unittest
{
    __m64 A = _mm_setr_pi16(-4, -5, 6, 7);
    short4 B = cast(short4)( _mm_srai_pi16(A, 1) );
    short[4] correct = [ -2, -3, 3, 3 ];
    assert(B.array == correct);
}

/// Shift packed 32-bit integers in `a` right by `imm8` while shifting in sign bits.
__m64 _mm_srai_pi32 (__m64 a, int imm8) pure @safe
{
    return to_m64(_mm_srai_epi32(to_m128i(a), imm8));
}
unittest
{
    __m64 A = _mm_setr_pi32(-4, 5);
    int2 B = cast(int2)( _mm_srai_pi32(A, 1) );
    int[2] correct = [ -2, 2 ];
    assert(B.array == correct);
}

/// Shift packed 16-bit integers in `a` right by `bits` while shifting in zeros.
deprecated("Use _mm_srli_pi16 instead.") __m64 _mm_srl_pi16 (__m64 a, __m64 bits) pure @safe
{
    return to_m64(_mm_srl_epi16(to_m128i(a), to_m128i(bits)));
}

/// Shift packed 32-bit integers in `a` right by `bits` while shifting in zeros.
deprecated("Use _mm_srli_pi32 instead.") __m64 _mm_srl_pi32 (__m64 a, __m64 bits) pure @safe
{
    return to_m64(_mm_srl_epi32(to_m128i(a), to_m128i(bits)));
}

/// Shift 64-bit integer `a` right by `bits` while shifting in zeros.
deprecated("Use _mm_srli_si64 instead.") __m64 _mm_srl_si64 (__m64 a, __m64 bits) pure @safe
{
    return to_m64(_mm_srl_epi64(to_m128i(a), to_m128i(bits)));
}

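// Additional example (not part of the original test suite): a hedged reminder that the
// arithmetic right shift rounds toward negative infinity, so it is not a drop-in replacement
// for truncating signed division by a power of two when odd negative values occur.
unittest
{
    short4 R = cast(short4) _mm_srai_pi16(_mm_setr_pi16(-5, 5, -1, 9), 1);
    short[4] correct = [-3, 2, -1, 4]; // -5/2 would be -2, but -5 >> 1 is -3
    assert(R.array == correct);
}
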
/// Shift packed 16-bit integers in `a` right by `imm8` while shifting in zeros.
__m64 _mm_srli_pi16 (__m64 a, int imm8) pure @safe
{
    return to_m64(_mm_srli_epi16(to_m128i(a), imm8));
}
unittest
{
    __m64 A = _mm_setr_pi16(-4, -5, 6, 7);
    short4 B = cast(short4)( _mm_srli_pi16(A, 1) );
    short[4] correct = [ 0x7ffe, 0x7ffd, 3, 3 ];
    assert(B.array == correct);
}

/// Shift packed 32-bit integers in `a` right by `imm8` while shifting in zeros.
__m64 _mm_srli_pi32 (__m64 a, int imm8) pure @safe
{
    return to_m64(_mm_srli_epi32(to_m128i(a), imm8));
}
unittest
{
    __m64 A = _mm_setr_pi32(-4, 5);
    int2 B = cast(int2)( _mm_srli_pi32(A, 1) );
    int[2] correct = [ 0x7ffffffe, 2 ];
    assert(B.array == correct);
}

/// Shift 64-bit integer `a` right by `imm8` while shifting in zeros.
__m64 _mm_srli_si64 (__m64 a, int imm8) pure @safe
{
    return to_m64(_mm_srli_epi64(to_m128i(a), imm8));
}
unittest
{
    __m64 A = _mm_cvtsi64_m64(-1);
    long1 R = cast(long1)( _mm_srli_si64(A, 1) );
    long[1] correct = [ 0x7fff_ffff_ffff_ffff ];
    assert(R.array == correct);
}

/// Subtract packed 16-bit integers in `b` from packed 16-bit integers in `a`.
__m64 _mm_sub_pi16 (__m64 a, __m64 b) pure @safe
{
    return cast(__m64)(cast(short4)a - cast(short4)b);
}
unittest
{
    short4 R = cast(short4) _mm_sub_pi16(_mm_setr_pi16(cast(short)65534,  1, 5, -32768),
                                         _mm_setr_pi16(cast(short)65535, 16, 4,      4));
    static immutable short[4] correct = [ -1,-15, 1, 32764];
    assert(R.array == correct);
}

/// Subtract packed 32-bit integers in `b` from packed 32-bit integers in `a`.
__m64 _mm_sub_pi32 (__m64 a, __m64 b) pure @safe
{
    return cast(__m64)(cast(int2)a - cast(int2)b);
}
unittest
{
    int2 R = cast(int2) _mm_sub_pi32(_mm_setr_pi32( 10,   4),
                                     _mm_setr_pi32( 15, -70));
    static immutable int[2] correct = [ -5, 74];
    assert(R.array == correct);
}

/// Subtract packed 8-bit integers in `b` from packed 8-bit integers in `a`.
__m64 _mm_sub_pi8 (__m64 a, __m64 b) pure @safe
{
    return cast(__m64)(cast(byte8)a - cast(byte8)b);
}
unittest
{
    byte8 R = cast(byte8) _mm_sub_pi8(_mm_setr_pi8(cast(byte)254, 127, 13, 12, 11, 10, 9, -128),
                                      _mm_setr_pi8(cast(byte)255, 120, 14, 42, 11, 10, 9,    8));
    static immutable byte[8] correct = [ -1, 7, -1,-30, 0, 0, 0, 120 ];
    assert(R.array == correct);
}

/// Subtract packed 16-bit integers in `b` from packed 16-bit integers in `a` using saturation.
__m64 _mm_subs_pi16 (__m64 a, __m64 b) pure @safe
{
    return to_m64(_mm_subs_epi16(to_m128i(a), to_m128i(b)));
}
unittest
{
    short4 R = cast(short4) _mm_subs_pi16(_mm_setr_pi16(cast(short)65534,  1, 5, -32768),
                                          _mm_setr_pi16(cast(short)65535, 16, 4,      4));
    static immutable short[4] correct = [ -1,-15, 1, -32768];
    assert(R.array == correct);
}

/// Subtract packed 8-bit integers in `b` from packed 8-bit integers in `a` using saturation.
__m64 _mm_subs_pi8 (__m64 a, __m64 b) pure @safe
{
    return to_m64(_mm_subs_epi8(to_m128i(a), to_m128i(b)));
}
unittest
{
    byte8 R = cast(byte8) _mm_subs_pi8(_mm_setr_pi8(cast(byte)254, 127, 13, 12, 11, 10, 9, -128),
                                       _mm_setr_pi8(cast(byte)255, 120, 14, 42, 11, 10, 9,    8));
    static immutable byte[8] correct = [ -1, 7, -1,-30, 0, 0, 0, -128 ];
    assert(R.array == correct);
}

/// Subtract packed unsigned 16-bit integers in `b` from packed unsigned 16-bit integers in `a`
/// using saturation.
__m64 _mm_subs_pu16 (__m64 a, __m64 b) pure @safe
{
    return to_m64(_mm_subs_epu16(to_m128i(a), to_m128i(b)));
}
unittest
{
    short4 R = cast(short4) _mm_subs_pu16(_mm_setr_pi16(cast(short)65534,  1, 5, 4),
                                          _mm_setr_pi16(cast(short)65535, 16, 4, 4));
    static immutable short[4] correct = [ 0, 0, 1, 0];
    assert(R.array == correct);
}

/// Subtract packed unsigned 8-bit integers in `b` from packed unsigned 8-bit integers in `a`
/// using saturation.
__m64 _mm_subs_pu8 (__m64 a, __m64 b) pure @safe
{
    return to_m64(_mm_subs_epu8(to_m128i(a), to_m128i(b)));
}
unittest
{
    byte8 R = cast(byte8) _mm_subs_pu8(_mm_setr_pi8(cast(byte)254, 127, 13, 12, 11, 10, 9, 8),
                                       _mm_setr_pi8(cast(byte)255, 120, 14, 42, 11, 10, 9, 8));
    static immutable byte[8] correct = [ 0, 7, 0, 0, 0, 0, 0, 0 ];
    assert(R.array == correct);
}

deprecated alias _m_to_int = _mm_cvtsi64_si32; /// Deprecated intrinsics.
deprecated alias _m_to_int64 = _mm_cvtm64_si64; ///ditto

/// Unpack and interleave 16-bit integers from the high half of `a` and `b`.
__m64 _mm_unpackhi_pi16 (__m64 a, __m64 b) pure @trusted
{
    version(LDC)
    {
        // avoiding this shufflevector leads to bad performance on LDC
        return cast(__m64) shufflevector!(short4, 2, 6, 3, 7)(cast(short4)a, cast(short4)b);
    }
    else
    {
        short4 ia = cast(short4)a;
        short4 ib = cast(short4)b;
        short4 r;
        r.ptr[0] = ia.array[2];
        r.ptr[1] = ib.array[2];
        r.ptr[2] = ia.array[3];
        r.ptr[3] = ib.array[3];
        return cast(__m64)r;
    }
}
unittest
{
    __m64 A = _mm_setr_pi16(4, 8, -16, 7);
    __m64 B = _mm_setr_pi16(5, 9,  -3, 10);
    short4 R = cast(short4) _mm_unpackhi_pi16(A, B);
    short[4] correct = [-16, -3, 7, 10];
    assert(R.array == correct);
}

/// Unpack and interleave 32-bit integers from the high half of `a` and `b`.
__m64 _mm_unpackhi_pi32 (__m64 a, __m64 b) pure @trusted
{
    // Generate punpckldq as far back as LDC 1.0.0 -O1
    // (Yes, LLVM does generate punpckldq to reuse SSE2 instructions)
    int2 ia = cast(int2)a;
    int2 ib = cast(int2)b;
    int2 r;
    r.ptr[0] = ia.array[1];
    r.ptr[1] = ib.array[1];
    return cast(__m64)r;
}
unittest
{
    __m64 A = _mm_setr_pi32(4, 8);
    __m64 B = _mm_setr_pi32(5, 9);
    int2 R = cast(int2) _mm_unpackhi_pi32(A, B);
    int[2] correct = [8, 9];
    assert(R.array == correct);
}

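// Additional example (not part of the original test suite): a hedged sketch of sign-extending
// packed 16-bit integers to 32-bit by interleaving them with their own sign mask, a common
// companion of the unpack intrinsics (the low half works the same way with _mm_unpacklo_pi16).
unittest
{
    __m64 a     = _mm_setr_pi16(7, -2, -30000, 1000);
    __m64 signs = _mm_cmpgt_pi16(_mm_setzero_si64(), a); // -1 where a is negative
    int2 hi     = cast(int2) _mm_unpackhi_pi16(a, signs);
    int[2] correct = [-30000, 1000];
    assert(hi.array == correct);
}
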
/// Unpack and interleave 8-bit integers from the high half of `a` and `b`.
__m64 _mm_unpackhi_pi8 (__m64 a, __m64 b)
{
    version(LDC)
    {
        return cast(__m64) shufflevector!(byte8, 4, 12, 5, 13, 6, 14, 7, 15)(cast(byte8)a, cast(byte8)b);
    }
    else
    {
        byte8 ia = cast(byte8)a;
        byte8 ib = cast(byte8)b;
        byte8 r;
        r.ptr[0] = ia.array[4];
        r.ptr[1] = ib.array[4];
        r.ptr[2] = ia.array[5];
        r.ptr[3] = ib.array[5];
        r.ptr[4] = ia.array[6];
        r.ptr[5] = ib.array[6];
        r.ptr[6] = ia.array[7];
        r.ptr[7] = ib.array[7];
        return cast(__m64)r;
    }
}
unittest
{
    __m64 A = _mm_setr_pi8( 1,  2,  3,  4,  5,  6,  7,  8);
    __m64 B = _mm_setr_pi8(-1, -2, -3, -4, -5, -6, -7, -8);
    byte8 R = cast(byte8) _mm_unpackhi_pi8(A, B);
    byte[8] correct = [5, -5, 6, -6, 7, -7, 8, -8];
    assert(R.array == correct);
}

/// Unpack and interleave 16-bit integers from the low half of `a` and `b`.
__m64 _mm_unpacklo_pi16 (__m64 a, __m64 b)
{
    // Generates punpcklwd since LDC 1.0.0 -O1
    short4 ia = cast(short4)a;
    short4 ib = cast(short4)b;
    short4 r;
    r.ptr[0] = ia.array[0];
    r.ptr[1] = ib.array[0];
    r.ptr[2] = ia.array[1];
    r.ptr[3] = ib.array[1];
    return cast(__m64)r;
}
unittest
{
    __m64 A = _mm_setr_pi16(4, 8, -16, 7);
    __m64 B = _mm_setr_pi16(5, 9,  -3, 10);
    short4 R = cast(short4) _mm_unpacklo_pi16(A, B);
    short[4] correct = [4, 5, 8, 9];
    assert(R.array == correct);
}

/// Unpack and interleave 32-bit integers from the low half of `a` and `b`.
__m64 _mm_unpacklo_pi32 (__m64 a, __m64 b) pure @trusted
{
    // x86: Generate punpckldq as far back as LDC 1.0.0 -O1
    // ARM: Generate zip as far back as LDC 1.8.0 -O1
    int2 ia = cast(int2)a;
    int2 ib = cast(int2)b;
    int2 r;
    r.ptr[0] = ia.array[0];
    r.ptr[1] = ib.array[0];
    return cast(__m64)r;
}
unittest
{
    __m64 A = _mm_setr_pi32(4, 8);
    __m64 B = _mm_setr_pi32(5, 9);
    int2 R = cast(int2) _mm_unpacklo_pi32(A, B);
    int[2] correct = [4, 5];
    assert(R.array == correct);
}

/// Unpack and interleave 8-bit integers from the low half of `a` and `b`.
__m64 _mm_unpacklo_pi8 (__m64 a, __m64 b)
{
    version(LDC)
    {
        return cast(__m64) shufflevector!(byte8, 0, 8, 1, 9, 2, 10, 3, 11)(cast(byte8)a, cast(byte8)b);
    }
    else
    {
        byte8 ia = cast(byte8)a;
        byte8 ib = cast(byte8)b;
        byte8 r;
        r.ptr[0] = ia.array[0];
        r.ptr[1] = ib.array[0];
        r.ptr[2] = ia.array[1];
        r.ptr[3] = ib.array[1];
        r.ptr[4] = ia.array[2];
        r.ptr[5] = ib.array[2];
        r.ptr[6] = ia.array[3];
        r.ptr[7] = ib.array[3];
        return cast(__m64)r;
    }
}
unittest
{
    __m64 A = _mm_setr_pi8( 1,  2,  3,  4,  5,  6,  7,  8);
    __m64 B = _mm_setr_pi8(-1, -2, -3, -4, -5, -6, -7, -8);
    byte8 R = cast(byte8) _mm_unpacklo_pi8(A, B);
    byte[8] correct = [1, -1, 2, -2, 3, -3, 4, -4];
    assert(R.array == correct);
}

/// Compute the bitwise XOR of 64 bits (representing integer data) in `a` and `b`.
__m64 _mm_xor_si64 (__m64 a, __m64 b)
{
    return a ^ b;
}
unittest
{
    __m64 A = _mm_setr_pi16(255, 1, -1, 0);
    __m64 B = _mm_set1_pi16(15);
    short4 R = cast(short4)_mm_xor_si64(A, B);
    short[4] correct = [240, 14, -16, 15];
    assert(R.array == correct);
}

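// Additional example (not part of the original test suite): a hedged sketch of the usual
// zero-extension idiom, widening unsigned 8-bit data to 16-bit by interleaving with a zero
// vector; _mm_unpacklo_pi8 yields the low four elements and _mm_unpackhi_pi8 the high four.
unittest
{
    __m64 A    = _mm_setr_pi8(cast(byte)200, 1, 2, 3, 4, 5, 6, cast(byte)255);
    __m64 zero = _mm_setzero_si64();
    short4 lo  = cast(short4) _mm_unpacklo_pi8(A, zero);
    short4 hi  = cast(short4) _mm_unpackhi_pi8(A, zero);
    short[4] correctLo = [200, 1, 2, 3];
    short[4] correctHi = [4, 5, 6, 255];
    assert(lo.array == correctLo);
    assert(hi.array == correctHi);
}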