/**
* Copyright: Copyright Auburn Sounds 2016-2018.
* License:   $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
* Authors:   Guillaume Piolat
*/
module inteli.emmintrin;

public import inteli.types;
public import inteli.xmmintrin; // SSE2 includes SSE1

import inteli.internals;

nothrow @nogc:

// SSE2 instructions
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#techs=SSE2

__m128i _mm_add_epi16 (__m128i a, __m128i b) pure @safe
{
    return cast(__m128i)(cast(short8)a + cast(short8)b);
}

__m128i _mm_add_epi32 (__m128i a, __m128i b) pure @safe
{
    return cast(__m128i)(cast(int4)a + cast(int4)b);
}

__m128i _mm_add_epi64 (__m128i a, __m128i b) pure @safe
{
    return cast(__m128i)(cast(long2)a + cast(long2)b);
}

__m128i _mm_add_epi8 (__m128i a, __m128i b) pure @safe
{
    return cast(__m128i)(cast(byte16)a + cast(byte16)b);
}

__m128d _mm_add_sd(__m128d a, __m128d b) pure @safe
{
    a[0] += b[0];
    return a;
}
unittest
{
    __m128d a = [1.5, -2.0];
    a = _mm_add_sd(a, a);
    assert(a.array == [3.0, -2.0]);
}


__m128d _mm_add_pd (__m128d a, __m128d b) pure @safe
{
    return a + b;
}
unittest
{
    __m128d a = [1.5, -2.0];
    a = _mm_add_pd(a, a);
    assert(a.array == [3.0, -4.0]);
}

// MMXREG: _mm_add_si64

version(LDC)
{
    alias _mm_adds_epi16 = __builtin_ia32_paddsw128;
}
else
{
    __m128i _mm_adds_epi16(__m128i a, __m128i b) pure @trusted
    {
        short[8] res;
        short8 sa = cast(short8)a;
        short8 sb = cast(short8)b;
        foreach(i; 0..8)
            res[i] = saturateSignedIntToSignedShort(sa.array[i] + sb.array[i]);
        return _mm_loadu_si128(cast(int4*)res.ptr);
    }
}
unittest
{
    short8 res = cast(short8) _mm_adds_epi16(_mm_set_epi16(7, 6, 5, 4, 3, 2, 1, 0),
                                             _mm_set_epi16(7, 6, 5, 4, 3, 2, 1, 0));
    static immutable short[8] correctResult = [0, 2, 4, 6, 8, 10, 12, 14];
    assert(res.array == correctResult);
}

version(LDC)
{
    alias _mm_adds_epi8 = __builtin_ia32_paddsb128;
}
else
{
    __m128i _mm_adds_epi8(__m128i a, __m128i b) pure @trusted
    {
        byte[16] res;
        byte16 sa = cast(byte16)a;
        byte16 sb = cast(byte16)b;
        foreach(i; 0..16)
            res[i] = saturateSignedWordToSignedByte(sa.array[i] + sb.array[i]);
        return _mm_loadu_si128(cast(int4*)res.ptr);
    }
}
unittest
{
    byte16 res = cast(byte16) _mm_adds_epi8(_mm_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0),
                                            _mm_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0));
    static immutable byte[16] correctResult = [0, 2, 4, 6, 8, 10, 12, 14,
                                               16, 18, 20, 22, 24, 26, 28, 30];
    assert(res.array == correctResult);
}

version(LDC)
{
    alias _mm_adds_epu8 = __builtin_ia32_paddusb128;
}
else
{
    __m128i _mm_adds_epu8(__m128i a, __m128i b) pure @trusted
    {
        ubyte[16] res;
        byte16 sa = cast(byte16)a;
        byte16 sb = cast(byte16)b;
        foreach(i; 0..16)
            res[i] = saturateSignedWordToUnsignedByte(cast(ubyte)(sa.array[i]) + cast(ubyte)(sb.array[i]));
        return _mm_loadu_si128(cast(int4*)res.ptr);
    }
}

version(LDC)
{
    alias _mm_adds_epu16 = __builtin_ia32_paddusw128;
}
else
{
    __m128i _mm_adds_epu16(__m128i a, __m128i b) pure @trusted
    {
        ushort[8] res;
        short8 sa = cast(short8)a;
        short8 sb = cast(short8)b;
        foreach(i; 0..8)
            res[i] = saturateSignedIntToUnsignedShort(cast(ushort)(sa.array[i]) + cast(ushort)(sb.array[i]));
        return _mm_loadu_si128(cast(int4*)res.ptr);
    }
}
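unittest
{
    // Extra check, in the same calling style as the saturating-add tests above:
    // unsigned 16-bit saturation clamps 60000 + 10000 to 65535 (0xFFFF) in every lane.
    short8 R = cast(short8) _mm_adds_epu16(_mm_set1_epi16(cast(short)60000),
                                           _mm_set1_epi16(10000));
    foreach(i; 0..8)
        assert(R.array[i] == cast(short)65535);
}
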
__m128d _mm_and_pd (__m128d a, __m128d b) pure @safe
{
    return cast(__m128d)( cast(__m128i)a & cast(__m128i)b );
}

__m128i _mm_and_si128 (__m128i a, __m128i b) pure @safe
{
    return a & b;
}

__m128d _mm_andnot_pd (__m128d a, __m128d b) pure @safe
{
    return cast(__m128d)( (~cast(__m128i)a) & cast(__m128i)b );
}

__m128i _mm_andnot_si128 (__m128i a, __m128i b) pure @safe
{
    return (~a) & b;
}

version(LDC)
{
    pragma(LDC_intrinsic, "llvm.x86.sse2.pavg.w")
    short8 _mm_avg_epu16(short8, short8) pure @safe;

    pragma(LDC_intrinsic, "llvm.x86.sse2.pavg.b")
    byte16 _mm_avg_epu8(byte16, byte16) pure @safe;
}
// TODO


// TODO: __m128i _mm_bslli_si128 (__m128i a, int imm8)
// TODO: __m128i _mm_bsrli_si128 (__m128i a, int imm8)

__m128 _mm_castpd_ps (__m128d a) pure @safe
{
    return cast(__m128)a;
}

__m128i _mm_castpd_si128 (__m128d a) pure @safe
{
    return cast(__m128i)a;
}

__m128d _mm_castps_pd (__m128 a) pure @safe
{
    return cast(__m128d)a;
}

__m128i _mm_castps_si128 (__m128 a) pure @safe
{
    return cast(__m128i)a;
}

__m128d _mm_castsi128_pd (__m128i a) pure @safe
{
    return cast(__m128d)a;
}

__m128 _mm_castsi128_ps (__m128i a) pure @safe
{
    return cast(__m128)a;
}

version(LDC)
{
    alias _mm_clflush = __builtin_ia32_clflush;
}
// TODO

version(LDC)
{
    pragma(LDC_intrinsic, "llvm.x86.sse2.cmp.pd")
    double2 __builtin_ia32_cmppd(double2, double2, byte) pure @safe;
}
// TODO

__m128i _mm_cmpeq_epi16 (__m128i a, __m128i b) pure @safe
{
    return cast(__m128i) equalMask!short8(cast(short8)a, cast(short8)b);
}

__m128i _mm_cmpeq_epi32 (__m128i a, __m128i b) pure @safe
{
    return equalMask!__m128i(a, b);
}

__m128i _mm_cmpeq_epi8 (__m128i a, __m128i b) pure @safe
{
    return cast(__m128i) equalMask!byte16(cast(byte16)a, cast(byte16)b);
}
unittest
{
    __m128i A = _mm_setr_epi8(1, 2, 3, 1, 2, 1, 1, 2, 3, 2, 1, 0, 0, 1, 2, 1);
    __m128i B = _mm_setr_epi8(2, 2, 1, 2, 3, 1, 2, 3, 2, 1, 0, 0, 1, 2, 1, 1);

    byte16 C = cast(byte16) _mm_cmpeq_epi8(A, B);
    static immutable byte[16] correct =
        [0,-1, 0, 0, 0,-1, 0, 0, 0, 0, 0,-1, 0, 0, 0, -1];

    __m128i D = _mm_cmpeq_epi8(A, B);
    assert(C.array == correct);
}
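
unittest
{
    // Extra check: equal 16-bit lanes become -1 (all bits set), all other lanes become 0.
    short8 C = cast(short8) _mm_cmpeq_epi16(_mm_set_epi16(7, 6, 5, 4, 3, 2, 1, 0),
                                            _mm_set_epi16(7, 6, 0, 4, 0, 2, 0, 0));
    static immutable short[8] correct = [-1, 0, -1, 0, -1, 0, -1, -1];
    assert(C.array == correct);
}
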
version(LDC)
{
    __m128d _mm_cmpeq_pd (__m128d a, __m128d b) pure @safe
    {
        return __builtin_ia32_cmppd(a, b, 0);
    }

    __m128d _mm_cmpeq_sd (__m128d a, __m128d b) pure @safe
    {
        return __builtin_ia32_cmpsd(a, b, 0);
    }

    __m128d _mm_cmpge_pd (__m128d a, __m128d b) pure @safe
    {
        return __builtin_ia32_cmppd(b, a, 2);
    }

    __m128d _mm_cmpge_sd (__m128d a, __m128d b) pure @safe
    {
        return __builtin_ia32_cmpsd(b, a, 2);
    }
}
// TODO


// TODO
/+__m128i _mm_cmpgt_epi16 (__m128i a, __m128i b) pure @safe
{
    return cast(__m128i)( cast(short8)a > cast(short8)b );
}

__m128i _mm_cmpgt_epi32 (__m128i a, __m128i b) pure @safe
{
    return cast(__m128i)( cast(int4)a > cast(int4)b );
}

__m128i _mm_cmpgt_epi8 (__m128i a, __m128i b) pure @safe
{
    return cast(__m128i)( cast(byte16)a > cast(byte16)b );
}+/

version(LDC)
{
    __m128d _mm_cmpgt_pd (__m128d a, __m128d b) pure @safe
    {
        return __builtin_ia32_cmppd(b, a, 1);
    }

    __m128d _mm_cmpgt_sd (__m128d a, __m128d b) pure @safe
    {
        return __builtin_ia32_cmpsd(b, a, 1);
    }

    __m128d _mm_cmple_pd (__m128d a, __m128d b) pure @safe
    {
        return __builtin_ia32_cmppd(a, b, 2);
    }

    __m128d _mm_cmple_sd (__m128d a, __m128d b) pure @safe
    {
        return __builtin_ia32_cmpsd(a, b, 2);
    }
}
// TODO

// TODO
/+__m128i _mm_cmplt_epi16 (__m128i a, __m128i b) pure @safe
{
    return cast(__m128i)( cast(short8)a < cast(short8)b );
}

__m128i _mm_cmplt_epi32 (__m128i a, __m128i b) pure @safe
{
    return cast(__m128i)( cast(int4)a < cast(int4)b );
}

__m128i _mm_cmplt_epi8 (__m128i a, __m128i b) pure @safe
{
    return cast(__m128i)( cast(byte16)a < cast(byte16)b );
}+/

version(LDC)
{
    __m128d _mm_cmplt_pd (__m128d a, __m128d b) pure @safe
    {
        return __builtin_ia32_cmppd(a, b, 1);
    }

    __m128d _mm_cmplt_sd (__m128d a, __m128d b) pure @safe
    {
        return __builtin_ia32_cmpsd(a, b, 1);
    }

    __m128d _mm_cmpneq_pd (__m128d a, __m128d b) pure @safe
    {
        return __builtin_ia32_cmppd(a, b, 4);
    }

    __m128d _mm_cmpneq_sd (__m128d a, __m128d b) pure @safe
    {
        return __builtin_ia32_cmpsd(a, b, 4);
    }

    __m128d _mm_cmpnge_pd (__m128d a, __m128d b) pure @safe
    {
        return __builtin_ia32_cmppd(b, a, 6);
    }

    __m128d _mm_cmpnge_sd (__m128d a, __m128d b) pure @safe
    {
        return __builtin_ia32_cmpsd(b, a, 6);
    }

    __m128d _mm_cmpngt_pd (__m128d a, __m128d b) pure @safe
    {
        return __builtin_ia32_cmppd(b, a, 5);
    }

    __m128d _mm_cmpngt_sd (__m128d a, __m128d b) pure @safe
    {
        return __builtin_ia32_cmpsd(b, a, 5);
    }

    __m128d _mm_cmpnle_pd (__m128d a, __m128d b) pure @safe
    {
        return __builtin_ia32_cmppd(a, b, 6);
    }

    __m128d _mm_cmpnle_sd (__m128d a, __m128d b) pure @safe
    {
        return __builtin_ia32_cmpsd(a, b, 6);
    }

    __m128d _mm_cmpnlt_pd (__m128d a, __m128d b) pure @safe
    {
        return __builtin_ia32_cmppd(a, b, 5);
    }

    __m128d _mm_cmpnlt_sd (__m128d a, __m128d b) pure @safe
    {
        return __builtin_ia32_cmpsd(a, b, 5);
    }

    __m128d _mm_cmpord_pd (__m128d a, __m128d b) pure @safe
    {
        return __builtin_ia32_cmppd(a, b, 7);
    }

    __m128d _mm_cmpord_sd (__m128d a, __m128d b) pure @safe
    {
        return __builtin_ia32_cmpsd(a, b, 7);
    }

    __m128d _mm_cmpunord_pd (__m128d a, __m128d b) pure @safe
    {
        return __builtin_ia32_cmppd(a, b, 3);
    }

    __m128d _mm_cmpunord_sd (__m128d a, __m128d b) pure @safe
    {
        return __builtin_ia32_cmpsd(a, b, 3);
    }
}
// TODO
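
version(LDC)
{
    // Small LDC-only check of the packed compare wrappers above
    // (predicate 1 is "less-than, ordered").
    unittest
    {
        long2 R = cast(long2) _mm_cmplt_pd(_mm_setr_pd(1.0, 3.0), _mm_set1_pd(2.0));
        assert(R.array[0] == -1); // 1.0 < 2.0
        assert(R.array[1] == 0);  // 3.0 is not < 2.0
    }
}
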
version(LDC)
{
    alias _mm_comieq_sd = __builtin_ia32_comisdeq;
    alias _mm_comige_sd = __builtin_ia32_comisdge;
    alias _mm_comigt_sd = __builtin_ia32_comisdgt;
    alias _mm_comile_sd = __builtin_ia32_comisdle;
    alias _mm_comilt_sd = __builtin_ia32_comisdlt;
    alias _mm_comineq_sd = __builtin_ia32_comisdneq;
}
// TODO

// TODO: alias _mm_cvtepi32_pd = __builtin_ia32_cvtdq2pd;

version(LDC)
{
    alias _mm_cvtepi32_ps = __builtin_ia32_cvtdq2ps;
    alias _mm_cvtpd_epi32 = __builtin_ia32_cvtpd2dq;
}

// MMXREG: _mm_cvtpd_pi32
version(LDC)
{
    alias _mm_cvtpd_ps = __builtin_ia32_cvtpd2ps;
    // MMXREG: _mm_cvtpi32_pd
    alias _mm_cvtps_epi32 = __builtin_ia32_cvtps2dq;
}
// TODO

// TODO: alias _mm_cvtps_pd = __builtin_ia32_cvtps2pd;

double _mm_cvtsd_f64 (__m128d a) pure @safe
{
    return extractelement!(double2, 0)(a);
}

version(LDC)
{
    alias _mm_cvtsd_si32 = __builtin_ia32_cvtsd2si;
    alias _mm_cvtsd_si64 = __builtin_ia32_cvtsd2si64;
    alias _mm_cvtsd_si64x = _mm_cvtsd_si64;
}
// TODO

version(LDC)
{
    alias _mm_cvtsd_ss = __builtin_ia32_cvtsd2ss;
}
// TODO

int _mm_cvtsi128_si32 (__m128i a) pure @safe
{
    return a[0];
}

long _mm_cvtsi128_si64 (__m128i a) pure @safe
{
    long2 la = cast(long2)a;
    return la[0];
}
alias _mm_cvtsi128_si64x = _mm_cvtsi128_si64;

version(LDC)
{
    // this LLVM intrinsic seems to still be there
    pragma(LDC_intrinsic, "llvm.x86.sse2.cvtsi2sd")
    double2 _mm_cvtsi32_sd(double2, int) pure @safe;
}
else
{
    __m128d _mm_cvtsi32_sd(__m128d v, int x) pure @safe
    {
        v[0] = cast(double)x;
        return v;
    }
}
unittest
{
    __m128d a = _mm_cvtsi32_sd(_mm_set1_pd(0.0f), 42);
    assert(a.array == [42.0, 0]);
}

__m128i _mm_cvtsi32_si128 (int a) pure @safe
{
    int4 r = [0, 0, 0, 0];
    r[0] = a;
    return r;
}

// Note: on macOS, using "llvm.x86.sse2.cvtsi642sd" was buggy
__m128d _mm_cvtsi64_sd(__m128d v, long x) pure @safe
{
    v[0] = cast(double)x;
    return v;
}
unittest
{
    __m128d a = _mm_cvtsi64_sd(_mm_set1_pd(0.0f), 42);
    assert(a.array == [42.0, 0]);
}

__m128i _mm_cvtsi64_si128 (long a) pure @safe
{
    long2 r = [0, 0];
    r[0] = a;
    return cast(__m128i)(r);
}

alias _mm_cvtsi64x_sd = _mm_cvtsi64_sd;
alias _mm_cvtsi64x_si128 = _mm_cvtsi64_si128;

version(LDC)
{
    pragma(LDC_intrinsic, "llvm.x86.sse2.cvtss2sd")
    double2 _mm_cvtss_sd(double2, float4) pure @safe;
}
else
{
    double2 _mm_cvtss_sd(double2 v, float4 x) pure @safe
    {
        v[0] = x[0];
        return v;
    }
}
unittest
{
    __m128d a = _mm_cvtss_sd(_mm_set1_pd(0.0f), _mm_set1_ps(42.0f));
    assert(a.array == [42.0, 0]);
}

version(LDC)
{
    alias _mm_cvttpd_epi32 = __builtin_ia32_cvttpd2dq;
    //MMXREG: _mm_cvttpd_pi32
    alias _mm_cvttps_epi32 = __builtin_ia32_cvttps2dq;
    alias _mm_cvttsd_si32 = __builtin_ia32_cvttsd2si;
    alias _mm_cvttsd_si64 = __builtin_ia32_cvttsd2si64;
    alias _mm_cvttsd_si64x = _mm_cvttsd_si64;
}
// TODO



__m128d _mm_div_pd(__m128d a, __m128d b) pure @safe
{
    return a / b;
}

__m128d _mm_div_sd(__m128d a, __m128d b) pure @safe
{
    a[0] /= b[0];
    return a;
}
unittest
{
    __m128d a = [2.0, 4.5];
    a = _mm_div_sd(a, a);
    assert(a.array == [1.0, 4.5]);
}

int _mm_extract_epi16(int imm8)(__m128i a) pure @safe
{
    // zero-extend the selected 16-bit word into an int
    return cast(ushort) extractelement!(short8, imm8)(cast(short8)a);
}

__m128i _mm_insert_epi16(int imm8)(__m128i a, int i) pure @safe
{
    return cast(__m128i) insertelement!(short8, imm8)(cast(short8)a, cast(short)i);
}

version(LDC)
{
    alias _mm_lfence = __builtin_ia32_lfence;
}
// TODO


__m128d _mm_load_pd (const(double) * mem_addr) pure
{
    __m128d* aligned = cast(__m128d*)mem_addr;
    return *aligned;
}

__m128d _mm_load_pd1 (const(double)* mem_addr) pure
{
    double[2] arr = [*mem_addr, *mem_addr];
    return loadUnaligned!(double2)(&arr[0]);
}
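unittest
{
    // Both lanes should receive the broadcast value.
    double x = 7.5;
    __m128d A = _mm_load_pd1(&x);
    assert(A.array == [7.5, 7.5]);
}
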
__m128d _mm_load_sd (const(double)* mem_addr) pure @safe
{
    double2 r = [0, 0];
    r[0] = *mem_addr;
    return r;
}
unittest
{
    double x = -42;
    __m128d a = _mm_load_sd(&x);
    assert(a.array == [-42.0, 0.0]);
}

__m128i _mm_load_si128 (const(__m128i)* mem_addr) pure @trusted
{
    return *mem_addr;
}

alias _mm_load1_pd = _mm_load_pd1;

__m128d _mm_loadh_pd (__m128d a, const(double)* mem_addr) pure @safe
{
    a[1] = *mem_addr;
    return a;
}

// Note: strange signature since the memory doesn't have to be aligned
__m128i _mm_loadl_epi64 (const(__m128i)* mem_addr) pure @safe
{
    auto pLong = cast(const(long)*)mem_addr;
    long2 r = [0, 0];
    r[0] = *pLong;
    return cast(__m128i)(r);
}

__m128d _mm_loadl_pd (__m128d a, const(double)* mem_addr) pure @safe
{
    a[0] = *mem_addr;
    return a;
}

__m128d _mm_loadr_pd (const(double)* mem_addr) pure @trusted
{
    __m128d a = _mm_load_pd(mem_addr);
    return shufflevector!(__m128d, 1, 0)(a, a);
}

__m128d _mm_loadu_pd (const(double)* mem_addr) pure @safe
{
    return loadUnaligned!(double2)(mem_addr);
}

__m128i _mm_loadu_si128 (const(__m128i)* mem_addr) pure @trusted
{
    return loadUnaligned!(__m128i)(cast(int*)mem_addr);
}

version(LDC)
{
    alias _mm_madd_epi16 = __builtin_ia32_pmaddwd128;

    alias _mm_maskmoveu_si128 = __builtin_ia32_maskmovdqu;

    pragma(LDC_intrinsic, "llvm.x86.sse2.pmaxs.w")
    short8 __builtin_ia32_pmaxsw128(short8, short8) pure @safe;
    alias _mm_max_epi16 = __builtin_ia32_pmaxsw128;

    pragma(LDC_intrinsic, "llvm.x86.sse2.pmaxu.b")
    byte16 __builtin_ia32_pmaxub128(byte16, byte16) pure @safe;
    alias _mm_max_epu8 = __builtin_ia32_pmaxub128;

    alias _mm_max_pd = __builtin_ia32_maxpd;
    alias _mm_max_sd = __builtin_ia32_maxsd;

    alias _mm_mfence = __builtin_ia32_mfence;

    pragma(LDC_intrinsic, "llvm.x86.sse2.pmins.w")
    short8 __builtin_ia32_pminsw128(short8, short8) pure @safe;
    alias _mm_min_epi16 = __builtin_ia32_pminsw128;

    pragma(LDC_intrinsic, "llvm.x86.sse2.pminu.b")
    byte16 __builtin_ia32_pminub128(byte16, byte16) pure @safe;
    alias _mm_min_epu8 = __builtin_ia32_pminub128;

    alias _mm_min_pd = __builtin_ia32_minpd;
    alias _mm_min_sd = __builtin_ia32_minsd;
}
// TODO
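
version(LDC)
{
    // Small LDC-only check of the 16-bit signed min/max wrappers above.
    unittest
    {
        short8 A = cast(short8) _mm_set_epi16(7, -6, 5, -4, 3, -2, 1, 0);
        short8 B = cast(short8) _mm_set_epi16(-7, 6, -5, 4, -3, 2, -1, 0);
        static immutable short[8] correctMax = [0, 1, 2, 3, 4, 5, 6, 7];
        static immutable short[8] correctMin = [0, -1, -2, -3, -4, -5, -6, -7];
        assert(_mm_max_epi16(A, B).array == correctMax);
        assert(_mm_min_epi16(A, B).array == correctMin);
    }
}
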
__m128i _mm_move_epi64 (__m128i a) pure @safe
{
    long2 result = [ 0, 0 ];
    long2 la = cast(long2) a;
    result[0] = la[0];
    return cast(__m128i)(result);
}
unittest
{
    long2 A = [13, 47];
    long2 B = cast(long2) _mm_move_epi64( cast(__m128i)A );
    long[2] correct = [13, 0];
    assert(B.array == correct);
}

__m128d _mm_move_sd (__m128d a, __m128d b) pure @safe
{
    b[1] = a[1];
    return b;
}
unittest
{
    double2 A = [13.0, 47.0];
    double2 B = [34.0, 58.0];
    double2 C = _mm_move_sd(A, B);
    double[2] correct = [34.0, 47.0];
    assert(C.array == correct);
}

version(LDC)
{
    alias _mm_movemask_epi8 = __builtin_ia32_pmovmskb128;
    alias _mm_movemask_pd = __builtin_ia32_movmskpd;
}

// MMXREG: _mm_movepi64_pi64
// MMXREG: __m128i _mm_movpi64_epi64 (__m64 a)

version(LDC)
{
    alias _mm_mul_epu32 = __builtin_ia32_pmuludq128;
}
// TODO

__m128d _mm_mul_pd(__m128d a, __m128d b) pure @safe
{
    return a * b;
}
unittest
{
    __m128d a = [-2.0, 1.5];
    a = _mm_mul_pd(a, a);
    assert(a.array == [4.0, 2.25]);
}

__m128d _mm_mul_sd(__m128d a, __m128d b) pure @safe
{
    a[0] *= b[0];
    return a;
}
unittest
{
    __m128d a = [-2.0, 1.5];
    a = _mm_mul_sd(a, a);
    assert(a.array == [4.0, 1.5]);
}


// MMXREG: _mm_mul_su32

version(LDC)
{
    alias _mm_mulhi_epi16 = __builtin_ia32_pmulhw128;
    alias _mm_mulhi_epu16 = __builtin_ia32_pmulhuw128;
}
// TODO

__m128i _mm_mullo_epi16 (__m128i a, __m128i b)
{
    return cast(__m128i)(cast(short8)a * cast(short8)b);
}

__m128d _mm_or_pd (__m128d a, __m128d b) pure @safe
{
    return cast(__m128d)( cast(__m128i)a | cast(__m128i)b );
}

__m128i _mm_or_si128 (__m128i a, __m128i b) pure @safe
{
    return a | b;
}

version(LDC)
{
    alias _mm_packs_epi32 = __builtin_ia32_packssdw128;
    alias _mm_packs_epi16 = __builtin_ia32_packsswb128;
    alias _mm_packus_epi16 = __builtin_ia32_packuswb128;
}
// TODO

version(LDC)
{
    alias _mm_pause = __builtin_ia32_pause;
}
// TODO

version(LDC)
{
    alias _mm_sad_epu8 = __builtin_ia32_psadbw128;
}
// TODO

__m128i _mm_set_epi16 (short e7, short e6, short e5, short e4, short e3, short e2, short e1, short e0) pure @trusted
{
    short[8] result = [e0, e1, e2, e3, e4, e5, e6, e7];
    return cast(__m128i) loadUnaligned!(short8)(result.ptr);
}
unittest
{
    __m128i A = _mm_set_epi16(7, 6, 5, 4, 3, 2, 1, 0);
    short8 B = cast(short8) A;
    foreach(i; 0..8)
        assert(B.array[i] == i);
}

__m128i _mm_set_epi32 (int e3, int e2, int e1, int e0) pure @trusted
{
    int[4] result = [e0, e1, e2, e3];
    return loadUnaligned!(int4)(result.ptr);
}
unittest
{
    __m128i A = _mm_set_epi32(3, 2, 1, 0);
    foreach(i; 0..4)
        assert(A.array[i] == i);
}

__m128i _mm_set_epi64x (long e1, long e0) pure @trusted
{
    long[2] result = [e0, e1];
    return cast(__m128i)( loadUnaligned!(long2)(result.ptr) );
}
unittest
{
    __m128i A = _mm_set_epi64x(1234, 5678);
    long2 B = cast(long2) A;
    assert(B.array[0] == 5678);
    assert(B.array[1] == 1234);
}

__m128i _mm_set_epi8 (byte e15, byte e14, byte e13, byte e12,
                      byte e11, byte e10, byte e9, byte e8,
                      byte e7, byte e6, byte e5, byte e4,
                      byte e3, byte e2, byte e1, byte e0) pure @trusted
{
    byte[16] result = [e0, e1, e2, e3, e4, e5, e6, e7,
                       e8, e9, e10, e11, e12, e13, e14, e15];
    return cast(__m128i)( loadUnaligned!(byte16)(result.ptr) );
}

__m128d _mm_set_pd (double e1, double e0) pure @trusted
{
    double[2] result = [e0, e1];
    return loadUnaligned!(double2)(result.ptr);
}

__m128d _mm_set_pd1 (double a) pure @trusted
{
    double[2] result = [a, a];
    return loadUnaligned!(double2)(result.ptr);
}

__m128d _mm_set_sd (double a) pure @trusted
{
    double[2] result = [a, 0];
    return loadUnaligned!(double2)(result.ptr);
}

__m128i _mm_set1_epi16 (short a) pure @trusted
{
    short[8] result = [a, a, a, a, a, a, a, a];
    return cast(__m128i)( loadUnaligned!(short8)(result.ptr) );
}

__m128i _mm_set1_epi32 (int a) pure @trusted
{
    int[4] result = [a, a, a, a];
    return loadUnaligned!(int4)(result.ptr);
}
unittest
{
    __m128 a = _mm_set1_ps(-1.0f);
    __m128 b = cast(__m128) _mm_set1_epi32(0x7fffffff);
    assert(_mm_and_ps(a, b).array == [1.0f, 1, 1, 1]);
}

__m128i _mm_set1_epi64x (long a) pure @trusted
{
    long[2] result = [a, a];
    return cast(__m128i)( loadUnaligned!(long2)(result.ptr) );
}
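unittest
{
    // Both 64-bit lanes should receive the broadcast value.
    long2 A = cast(long2) _mm_set1_epi64x(-47);
    assert(A.array[0] == -47);
    assert(A.array[1] == -47);
}
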
__m128i _mm_set1_epi8 (char a) pure @trusted
{
    byte[16] result = [a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a];
    return cast(__m128i)( loadUnaligned!(byte16)(result.ptr) );
}

alias _mm_set1_pd = _mm_set_pd1;

__m128i _mm_setr_epi16 (short e7, short e6, short e5, short e4, short e3, short e2, short e1, short e0) pure @trusted
{
    short[8] result = [e7, e6, e5, e4, e3, e2, e1, e0];
    return cast(__m128i)( loadUnaligned!(short8)(result.ptr) );
}

__m128i _mm_setr_epi32 (int e3, int e2, int e1, int e0) pure @trusted
{
    int[4] result = [e3, e2, e1, e0];
    return cast(__m128i)( loadUnaligned!(int4)(result.ptr) );
}

__m128i _mm_setr_epi64 (long e1, long e0) pure @trusted
{
    long[2] result = [e1, e0];
    return cast(__m128i)( loadUnaligned!(long2)(result.ptr) );
}

__m128i _mm_setr_epi8 (char e15, char e14, char e13, char e12,
                       char e11, char e10, char e9, char e8,
                       char e7, char e6, char e5, char e4,
                       char e3, char e2, char e1, char e0) pure @trusted
{
    byte[16] result = [e15, e14, e13, e12, e11, e10, e9, e8,
                       e7, e6, e5, e4, e3, e2, e1, e0];
    return cast(__m128i)( loadUnaligned!(byte16)(result.ptr) );
}

__m128d _mm_setr_pd (double e1, double e0) pure @trusted
{
    double[2] result = [e1, e0];
    return loadUnaligned!(double2)(result.ptr);
}

__m128d _mm_setzero_pd () pure @trusted
{
    double[2] result = [0.0, 0.0];
    return loadUnaligned!(double2)(result.ptr);
}

__m128i _mm_setzero_si128() pure @trusted
{
    int[4] result = [0, 0, 0, 0];
    return cast(__m128i)( loadUnaligned!(int4)(result.ptr) );
}

__m128i _mm_shuffle_epi32(int imm8)(__m128i a) pure @safe
{
    return shufflevector!(int4, (imm8 >> 0) & 3,
                                (imm8 >> 2) & 3,
                                (imm8 >> 4) & 3,
                                (imm8 >> 6) & 3)(a, a);
}

__m128d _mm_shuffle_pd (int imm8)(__m128d a, __m128d b) pure @safe
{
    return shufflevector!(double2, 0 + ( (imm8 >> 0) & 1 ),
                                   2 + ( (imm8 >> 1) & 1 ))(a, b);
}

__m128i _mm_shufflehi_epi16(int imm8)(__m128i a) pure @safe
{
    // keeps the low 4 words, shuffles the high 4 words
    return cast(__m128i) shufflevector!(short8, 0, 1, 2, 3,
                                        4 + ( (imm8 >> 0) & 3 ),
                                        4 + ( (imm8 >> 2) & 3 ),
                                        4 + ( (imm8 >> 4) & 3 ),
                                        4 + ( (imm8 >> 6) & 3 ))(cast(short8)a, cast(short8)a);
}

__m128i _mm_shufflelo_epi16(int imm8)(__m128i a) pure @safe
{
    // shuffles the low 4 words, keeps the high 4 words
    return cast(__m128i) shufflevector!(short8, ( (imm8 >> 0) & 3 ),
                                        ( (imm8 >> 2) & 3 ),
                                        ( (imm8 >> 4) & 3 ),
                                        ( (imm8 >> 6) & 3 ),
                                        4, 5, 6, 7)(cast(short8)a, cast(short8)a);
}

version(LDC)
{
    alias _mm_sll_epi32 = __builtin_ia32_pslld128;
    alias _mm_sll_epi64 = __builtin_ia32_psllq128;
    alias _mm_sll_epi16 = __builtin_ia32_psllw128;
    alias _mm_slli_epi32 = __builtin_ia32_pslldi128;
    alias _mm_slli_epi64 = __builtin_ia32_psllqi128;
    alias _mm_slli_epi16 = __builtin_ia32_psllwi128;
}
// TODO

__m128i _mm_slli_si128(ubyte imm8)(__m128i op) pure @safe
{
    static if (imm8 & 0xF0)
        return _mm_setzero_si128();
    else
        return cast(__m128i) shufflevector!(byte16,
                                            16 - imm8, 17 - imm8, 18 - imm8, 19 - imm8, 20 - imm8, 21 - imm8, 22 - imm8, 23 - imm8,
                                            24 - imm8, 25 - imm8, 26 - imm8, 27 - imm8, 28 - imm8, 29 - imm8, 30 - imm8, 31 - imm8)
                                           (cast(byte16)_mm_setzero_si128(), cast(byte16)op);
}
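unittest
{
    // Shifting left by 4 bytes moves each int up one lane and zeroes lane 0.
    assert(_mm_slli_si128!4(_mm_set_epi32(4, 3, 2, 1)).array == [0, 1, 2, 3]);
    // Shifts of 16 bytes or more clear the whole register.
    assert(_mm_slli_si128!16(_mm_set1_epi32(1)).array == [0, 0, 0, 0]);
}
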
version(LDC)
{
    __m128d _mm_sqrt_pd (__m128d a) pure @safe
    {
        return __builtin_ia32_sqrtpd(a);
    }
}
// TODO

version(LDC)
{
    __m128d _mm_sqrt_sd (__m128d a) pure @safe
    {
        return __builtin_ia32_sqrtsd(a);
    }
}
// TODO


version(LDC)
{
    alias _mm_sra_epi16 = __builtin_ia32_psraw128;
    alias _mm_sra_epi32 = __builtin_ia32_psrad128;
    alias _mm_srai_epi16 = __builtin_ia32_psrawi128;
    alias _mm_srai_epi32 = __builtin_ia32_psradi128;

    alias _mm_srl_epi16 = __builtin_ia32_psrlw128;
    alias _mm_srl_epi32 = __builtin_ia32_psrld128;
    alias _mm_srl_epi64 = __builtin_ia32_psrlq128;
    alias _mm_srli_epi16 = __builtin_ia32_psrlwi128;
    alias _mm_srli_epi32 = __builtin_ia32_psrldi128;
    alias _mm_srli_epi64 = __builtin_ia32_psrlqi128;
}
// TODO

__m128i _mm_srli_si128(ubyte imm8)(__m128i op) pure @safe
{
    static if (imm8 & 0xF0)
        return _mm_setzero_si128();
    else
        return cast(__m128i) shufflevector!(byte16,
                                            imm8+0, imm8+1, imm8+2, imm8+3, imm8+4, imm8+5, imm8+6, imm8+7,
                                            imm8+8, imm8+9, imm8+10, imm8+11, imm8+12, imm8+13, imm8+14, imm8+15)
                                           (cast(byte16) op, cast(byte16)_mm_setzero_si128());
}

// Note: this is a bonus intrinsic
__m128 _mm_srli_si128(ubyte imm8)(__m128 op) @safe
{
    return cast(__m128)_mm_srli_si128!imm8(cast(__m128i)op);
}
unittest
{
    // test that cast works at all
    __m128 A = cast(__m128) _mm_set1_epi32(0x3F800000);
    assert(A.array == [1.0f, 1.0f, 1.0f, 1.0f]);

    // test _mm_srli_si128 for __m128i
    assert(_mm_srli_si128!4(_mm_set_epi32(4, 3, 2, 1)).array == [2, 3, 4, 0]);
    assert(_mm_srli_si128!8(_mm_set_ps(4.0f, 3.0f, 2.0f, 1.0f)).array == [3.0f, 4.0f, 0, 0]);
}

__m128d _mm_srli_si128(ubyte imm8)(__m128d op) pure @safe
{
    return cast(__m128d) _mm_srli_si128!imm8(cast(__m128i)op);
}

void _mm_store_pd (double* mem_addr, __m128d a) pure
{
    __m128d* aligned = cast(__m128d*)mem_addr;
    *aligned = a;
}

void _mm_store_pd1 (double* mem_addr, __m128d a) pure
{
    __m128d* aligned = cast(__m128d*)mem_addr;
    *aligned = shufflevector!(double2, 0, 0)(a, a);
}

void _mm_store_sd (double* mem_addr, __m128d a) pure @safe
{
    *mem_addr = extractelement!(double2, 0)(a);
}

void _mm_store_si128 (__m128i* mem_addr, __m128i a) pure @safe
{
    *mem_addr = a;
}

alias _mm_store1_pd = _mm_store_pd1;

void _mm_storeh_pd (double* mem_addr, __m128d a) pure @safe
{
    *mem_addr = extractelement!(double2, 1)(a);
}

void _mm_storel_epi64 (__m128i* mem_addr, __m128i a) pure @safe
{
    long* dest = cast(long*)mem_addr;
    *dest = extractelement!(long2, 0)(cast(long2)a);
}

void _mm_storel_pd (double* mem_addr, __m128d a) pure @safe
{
    *mem_addr = extractelement!(double2, 0)(a);
}

void _mm_storer_pd (double* mem_addr, __m128d a) pure
{
    __m128d* aligned = cast(__m128d*)mem_addr;
    *aligned = shufflevector!(double2, 1, 0)(a, a);
}

void _mm_storeu_pd (double* mem_addr, __m128d a) pure @safe
{
    storeUnaligned!double2(a, mem_addr);
}

void _mm_storeu_si128 (__m128i* mem_addr, __m128i a) pure @safe
{
    storeUnaligned!__m128i(a, cast(int*)mem_addr);
}
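unittest
{
    // Unaligned store/load round-trip.
    double[2] buf;
    _mm_storeu_pd(buf.ptr, _mm_setr_pd(1.5, -2.0));
    assert(buf == [1.5, -2.0]);
    __m128d reloaded = _mm_loadu_pd(buf.ptr);
    assert(reloaded.array == [1.5, -2.0]);
}
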
// TODO: _mm_stream_pd
// TODO: _mm_stream_si128
// TODO: _mm_stream_si32
// TODO: _mm_stream_si64

__m128i _mm_sub_epi16(__m128i a, __m128i b) pure @safe
{
    return cast(__m128i)(cast(short8)a - cast(short8)b);
}

__m128i _mm_sub_epi32(__m128i a, __m128i b) pure @safe
{
    return cast(__m128i)(cast(int4)a - cast(int4)b);
}

__m128i _mm_sub_epi64(__m128i a, __m128i b) pure @safe
{
    return cast(__m128i)(cast(long2)a - cast(long2)b);
}

__m128i _mm_sub_epi8(__m128i a, __m128i b) pure @safe
{
    return cast(__m128i)(cast(byte16)a - cast(byte16)b);
}

__m128d _mm_sub_pd(__m128d a, __m128d b) pure @safe
{
    return a - b;
}

__m128d _mm_sub_sd(__m128d a, __m128d b) pure @safe
{
    a[0] -= b[0];
    return a;
}
unittest
{
    __m128d a = [1.5, -2.0];
    a = _mm_sub_sd(a, a);
    assert(a.array == [0.0, -2.0]);
}


// MMXREG: _mm_sub_si64

version(LDC)
{
    alias _mm_subs_epi16 = __builtin_ia32_psubsw128;
    alias _mm_subs_epi8 = __builtin_ia32_psubsb128;
    alias _mm_subs_epu16 = __builtin_ia32_psubusw128;
    alias _mm_subs_epu8 = __builtin_ia32_psubusb128;

    alias _mm_ucomieq_sd = __builtin_ia32_ucomisdeq;
    alias _mm_ucomige_sd = __builtin_ia32_ucomisdge;
    alias _mm_ucomigt_sd = __builtin_ia32_ucomisdgt;
    alias _mm_ucomile_sd = __builtin_ia32_ucomisdle;
    alias _mm_ucomilt_sd = __builtin_ia32_ucomisdlt;
    alias _mm_ucomineq_sd = __builtin_ia32_ucomisdneq;
}
// TODO

__m128d _mm_undefined_pd() pure @safe
{
    __m128d result = void;
    return result;
}

__m128i _mm_undefined_si128() pure @safe
{
    __m128i result = void;
    return result;
}

__m128i _mm_unpackhi_epi16 (__m128i a, __m128i b) pure @safe
{
    return cast(__m128i) shufflevector!(short8, 4, 12, 5, 13, 6, 14, 7, 15)
                                       (cast(short8)a, cast(short8)b);
}

__m128i _mm_unpackhi_epi32 (__m128i a, __m128i b) pure @safe
{
    return shufflevector!(int4, 2, 6, 3, 7)(cast(int4)a, cast(int4)b);
}

__m128i _mm_unpackhi_epi64 (__m128i a, __m128i b) pure @safe
{
    return cast(__m128i) shufflevector!(long2, 1, 3)(cast(long2)a, cast(long2)b);
}

__m128i _mm_unpackhi_epi8 (__m128i a, __m128i b) pure @safe
{
    return cast(__m128i)shufflevector!(byte16, 8, 24, 9, 25, 10, 26, 11, 27,
                                       12, 28, 13, 29, 14, 30, 15, 31)
                                      (cast(byte16)a, cast(byte16)b);
}

__m128d _mm_unpackhi_pd (__m128d a, __m128d b) pure @safe
{
    return shufflevector!(__m128d, 1, 3)(a, b);
}

__m128i _mm_unpacklo_epi16 (__m128i a, __m128i b) pure @safe
{
    return cast(__m128i) shufflevector!(short8, 0, 8, 1, 9, 2, 10, 3, 11)
                                       (cast(short8)a, cast(short8)b);
}

__m128i _mm_unpacklo_epi32 (__m128i a, __m128i b) pure @safe
{
    return shufflevector!(int4, 0, 4, 1, 5)
                         (cast(int4)a, cast(int4)b);
}

__m128i _mm_unpacklo_epi64 (__m128i a, __m128i b) pure @safe
{
    return cast(__m128i) shufflevector!(long2, 0, 2)
                                       (cast(long2)a, cast(long2)b);
}

__m128i _mm_unpacklo_epi8 (__m128i a, __m128i b) pure @safe
{
    return cast(__m128i) shufflevector!(byte16, 0, 16, 1, 17, 2, 18, 3, 19,
                                        4, 20, 5, 21, 6, 22, 7, 23)
                                       (cast(byte16)a, cast(byte16)b);
}

__m128d _mm_unpacklo_pd (__m128d a, __m128d b) pure @safe
{
    return shufflevector!(__m128d, 0, 2)(a, b);
}
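unittest
{
    // Low-half interleave: a0, b0, a1, b1.
    __m128i R = _mm_unpacklo_epi32(_mm_setr_epi32(0, 1, 2, 3),
                                   _mm_setr_epi32(4, 5, 6, 7));
    assert(R.array == [0, 4, 1, 5]);
}
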
__m128d _mm_xor_pd (__m128d a, __m128d b) pure @safe
{
    return cast(__m128d)(cast(__m128i)a ^ cast(__m128i)b);
}

__m128i _mm_xor_si128 (__m128i a, __m128i b) pure @safe
{
    return a ^ b;
}

unittest
{
    // distance between two points in 4D
    float distance(float[4] a, float[4] b) nothrow @nogc
    {
        __m128 va = _mm_loadu_ps(a.ptr);
        __m128 vb = _mm_loadu_ps(b.ptr);
        __m128 diffSquared = _mm_sub_ps(va, vb);
        diffSquared = _mm_mul_ps(diffSquared, diffSquared);
        __m128 sum = _mm_add_ps(diffSquared, _mm_srli_si128!8(diffSquared));
        sum = _mm_add_ps(sum, _mm_srli_si128!4(sum));
        return _mm_cvtss_f32(_mm_sqrt_ss(sum));
    }
    assert(distance([0, 2, 0, 0], [0, 0, 0, 0]) == 2);
}
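
unittest
{
    // Another small usage example: horizontal sum of a double pair,
    // built only from intrinsics defined in this module.
    __m128d v = _mm_setr_pd(3.0, 4.0);
    __m128d hi = _mm_unpackhi_pd(v, v);
    assert(_mm_cvtsd_f64(_mm_add_sd(v, hi)) == 7.0);
}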