/**
* Copyright: Copyright Auburn Sounds 2016-2018.
* License:   $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
* Authors:   Guillaume Piolat
*/

module inteli.xmmintrin;

public import inteli.types;

import inteli.internals;

// SSE1
// Note: intrinsics noted MMXREG are actually using MMX registers,
// and were not translated. These intrinsics are for instructions
// introduced with SSE1, that also work on MMX registers.

nothrow @nogc:

/// Returns the lane-wise sum `a + b`.
__m128 _mm_add_ps(__m128 a, __m128 b) pure @safe
{
    return a + b;
}

unittest
{
    __m128 a = [1, 2, 3, 4];
    a = _mm_add_ps(a, a);
    assert(a.array[0] == 2);
    assert(a.array[1] == 4);
    assert(a.array[2] == 6);
    assert(a.array[3] == 8);
}

/// Adds lane 0 of `b` to lane 0 of `a`; lanes 1..3 of `a` are returned unchanged.
__m128 _mm_add_ss(__m128 a, __m128 b) pure @safe
{
    a[0] += b[0];
    return a;
}
unittest
{
    __m128 a = [1, 2, 3, 4];
    a = _mm_add_ss(a, a);
    assert(a.array == [2.0f, 2, 3, 4]);
}

/// Bitwise AND of `a` and `b`, computed on their integer reinterpretation.
__m128 _mm_and_ps (__m128 a, __m128 b) pure @safe
{
    return cast(__m128)(cast(__m128i)a & cast(__m128i)b);
}
unittest
{
    // Note: tested in emmintrin.d
}

/// Returns `(~a) & b`. Note that this takes and returns `__m128i`.
__m128i _mm_andnot_ps (__m128i a, __m128i b) pure @safe
{
    return (~a) & b;
}


// MMXREG: _mm_avg_pu16
// MMXREG: _mm_avg_pu8

version(LDC)
{
    // Packed float compare; the third argument is the comparison predicate.
    pragma(LDC_intrinsic, "llvm.x86.sse.cmp.ps")
        __m128 __builtin_ia32_cmpps(__m128, __m128, byte) pure @safe;
}
else
{
    // unimplemented
    /*__m128 __builtin_ia32_cmpps(__m128, __m128, byte) pure @safe
    {
        assert(false, "unimplemented");
    }*/
}

// Comparison intrinsics. Predicates used below:
//   0 = EQ, 1 = LT, 2 = LE, 3 = UNORD, 4 = NEQ, 5 = NLT, 6 = NLE, 7 = ORD.
// "Greater" variants are expressed as the matching "less" predicate with the
// operands swapped.
//
// For the scalar (`_ss`) forms, lanes 1..3 of the result are those of the
// FIRST operand given to the compare. When the operands are swapped, that
// would be `b`, whereas the intrinsic must return the upper lanes of `a`;
// the swapped scalar variants therefore merge lane 0 of the comparison back
// into `a` with `_mm_move_ss`.
version(LDC)
{
    __m128 _mm_cmpeq_ps (__m128 a, __m128 b) pure @safe
    {
        return __builtin_ia32_cmpps(a, b, 0);
    }

    __m128 _mm_cmpeq_ss (__m128 a, __m128 b) pure @safe
    {
        return __builtin_ia32_cmpss(a, b, 0);
    }

    __m128 _mm_cmpge_ps (__m128 a, __m128 b) pure @safe
    {
        return __builtin_ia32_cmpps(b, a, 2); // CMPLEPS reversed
    }

    __m128 _mm_cmpge_ss (__m128 a, __m128 b) pure @safe
    {
        // CMPLESS reversed, upper lanes restored from `a`
        return _mm_move_ss(a, __builtin_ia32_cmpss(b, a, 2));
    }

    __m128 _mm_cmpgt_ps (__m128 a, __m128 b) pure @safe
    {
        return __builtin_ia32_cmpps(b, a, 1); // CMPLTPS reversed
    }

    __m128 _mm_cmpgt_ss (__m128 a, __m128 b) pure @safe
    {
        // CMPLTSS reversed, upper lanes restored from `a`
        return _mm_move_ss(a, __builtin_ia32_cmpss(b, a, 1));
    }

    __m128 _mm_cmple_ps (__m128 a, __m128 b) pure @safe
    {
        return __builtin_ia32_cmpps(a, b, 2); // CMPLEPS
    }

    __m128 _mm_cmple_ss (__m128 a, __m128 b) pure @safe
    {
        return __builtin_ia32_cmpss(a, b, 2); // CMPLESS
    }

    __m128 _mm_cmplt_ps (__m128 a, __m128 b) pure @safe
    {
        return __builtin_ia32_cmpps(a, b, 1); // CMPLTPS
    }

    __m128 _mm_cmplt_ss (__m128 a, __m128 b) pure @safe
    {
        return __builtin_ia32_cmpss(a, b, 1); // CMPLTSS
    }

    __m128 _mm_cmpneq_ps (__m128 a, __m128 b) pure @safe
    {
        return __builtin_ia32_cmpps(a, b, 4); // CMPNEQPS
    }

    __m128 _mm_cmpneq_ss (__m128 a, __m128 b) pure @safe
    {
        return __builtin_ia32_cmpss(a, b, 4); // CMPNEQSS
    }

    __m128 _mm_cmpnge_ps (__m128 a, __m128 b) pure @safe
    {
        return __builtin_ia32_cmpps(b, a, 6); // CMPNLEPS reversed
    }

    __m128 _mm_cmpnge_ss (__m128 a, __m128 b) pure @safe
    {
        // CMPNLESS reversed, upper lanes restored from `a`
        return _mm_move_ss(a, __builtin_ia32_cmpss(b, a, 6));
    }

    __m128 _mm_cmpngt_ps (__m128 a, __m128 b) pure @safe
    {
        return __builtin_ia32_cmpps(b, a, 5); // CMPNLTPS reversed
    }

    __m128 _mm_cmpngt_ss (__m128 a, __m128 b) pure @safe
    {
        // CMPNLTSS reversed, upper lanes restored from `a`
        return _mm_move_ss(a, __builtin_ia32_cmpss(b, a, 5));
    }

    __m128 _mm_cmpnle_ps (__m128 a, __m128 b) pure @safe
    {
        return __builtin_ia32_cmpps(a, b, 6); // CMPNLEPS
    }

    __m128 _mm_cmpnle_ss (__m128 a, __m128 b) pure @safe
    {
        return __builtin_ia32_cmpss(a, b, 6); // CMPNLESS
    }

    __m128 _mm_cmpnlt_ps (__m128 a, __m128 b) pure @safe
    {
        return __builtin_ia32_cmpps(a, b, 5); // CMPNLTPS
    }

    __m128 _mm_cmpnlt_ss (__m128 a, __m128 b) pure @safe
    {
        return __builtin_ia32_cmpss(a, b, 5); // CMPNLTSS
    }

    __m128 _mm_cmpord_ps (__m128 a, __m128 b) pure @safe
    {
        return __builtin_ia32_cmpps(a, b, 7); // CMPORDPS
    }

    __m128 _mm_cmpord_ss (__m128 a, __m128 b) pure @safe
    {
        return __builtin_ia32_cmpss(a, b, 7); // CMPORDSS
    }

    __m128 _mm_cmpunord_ps (__m128 a, __m128 b) pure @safe
    {
        return __builtin_ia32_cmpps(a, b, 3); // CMPUNORDPS
    }

    __m128 _mm_cmpunord_ss (__m128 a, __m128 b) pure @safe
    {
        return __builtin_ia32_cmpss(a, b, 3); // CMPUNORDSS
    }

    unittest
    {
        // The swapped scalar comparisons must keep lanes 1..3 of `a`.
        __m128 a = [1.0f, 10.0f, 20.0f, 30.0f];
        __m128 b = [2.0f, -1.0f, -2.0f, -3.0f];
        float[3] upperOfA = [10.0f, 20.0f, 30.0f];

        __m128 r = _mm_cmpge_ss(a, b);
        assert(r.array[1 .. 4] == upperOfA[]);
        r = _mm_cmpgt_ss(a, b);
        assert(r.array[1 .. 4] == upperOfA[]);
        r = _mm_cmpnge_ss(a, b);
        assert(r.array[1 .. 4] == upperOfA[]);
        r = _mm_cmpngt_ss(a, b);
        assert(r.array[1 .. 4] == upperOfA[]);
    }
}
else
{
    // TODO
}

version(LDC)
{
    alias _mm_comieq_ss = __builtin_ia32_comieq;
}
else
{
    // TODO
    /*__m128i _mm_comieq_ss(__m128, __m128) pure @safe
    {
        assert(false, "unimplemented");
    }
    */
}


version(LDC)
{
    alias _mm_comige_ss = __builtin_ia32_comige;
}
else
{
    // TODO
    /*
    __m128i _mm_comige_ss(__m128, __m128) pure @safe
    {
        assert(false, "unimplemented");
    }
    */
}


version(LDC)
{
    alias _mm_comigt_ss = __builtin_ia32_comigt;
}
else
{
    // TODO
    /*
    __m128i _mm_comigt_ss(__m128, __m128) pure @safe
    {
        assert(false, "unimplemented");
    }
    */
}


version(LDC)
{
    alias _mm_comile_ss = __builtin_ia32_comile;
}
else
{
    // TODO
    /*
    __m128i _mm_comile_ss(__m128, __m128) pure @safe
    {
        assert(false, "unimplemented");
    }
    */
}


version(LDC)
{
    alias _mm_comilt_ss = __builtin_ia32_comilt;
}
else
{
    // TODO
    /*
    __m128i _mm_comilt_ss(__m128, __m128) pure @safe
    {
        assert(false, "unimplemented");
    }
    */
}

version(LDC)
{
    alias _mm_comineq_ss = __builtin_ia32_comineq;
}
else
{
    // TODO
    /*
    __m128i _mm_comineq_ss(__m128, __m128) pure @safe
    {
        assert(false, "unimplemented");
    }
    */
}

// MMXREG: __m128 _mm_cvt_pi2ps (__m128 a, __m64 b)
// MMXREG: __m64 _mm_cvt_ps2pi (__m128 a)


/// Writes `x` converted to float into lane 0 of `v`; other lanes are unchanged.
__m128 _mm_cvt_si2ss(__m128 v, int x) pure @safe
{
    v[0] = cast(float)x;
    return v;
}
unittest
{
    __m128 a = _mm_cvt_si2ss(_mm_set1_ps(0.0f), 42);
    assert(a.array == [42f, 0, 0, 0]);
}

version(LDC)
{
    alias _mm_cvt_ss2si = __builtin_ia32_cvtss2si;
}
else
{
    // TODO
    /*
    int _mm_cvt_ss2si(__m128 v) pure @safe
    {
        assert(false, "unimplemented");
    }
    */
}

// MMXREG: __m128 _mm_cvtpi16_ps (__m64 a)
// MMXREG: __m128 _mm_cvtpi32_ps (__m128 a, __m64 b)
// MMXREG: __m128 _mm_cvtpi32x2_ps (__m64 a, __m64 b)
// MMXREG: __m128 _mm_cvtpi8_ps (__m64 a)
// MMXREG: __m64 _mm_cvtps_pi16 (__m128 a)
// MMXREG: __m64 _mm_cvtps_pi32 (__m128 a)
// MMXREG: __m64 _mm_cvtps_pi8 (__m128 a)
// MMXREG: __m128 _mm_cvtpu16_ps (__m64 a)
// MMXREG: __m128 _mm_cvtpu8_ps (__m64 a)

/// Writes `x` converted to float into lane 0 of `v`; other lanes are unchanged.
__m128 _mm_cvtsi32_ss(__m128 v, int x) pure @safe
{
    v[0] = cast(float)x;
    return v;
}
unittest
{
    __m128 a = _mm_cvtsi32_ss(_mm_set1_ps(0.0f), 42);
    assert(a.array == [42.0f, 0, 0, 0]);
}

// Note: on macOS, using "llvm.x86.sse.cvtsi642ss" was buggy
/// Writes the 64-bit `x` converted to float into lane 0 of `v`; other lanes are unchanged.
__m128 _mm_cvtsi64_ss(__m128 v, long x) pure @safe
{
    v[0] = cast(float)x;
    return v;
}
unittest
{
    __m128 a = _mm_cvtsi64_ss(_mm_set1_ps(0.0f), 42);
    assert(a.array == [42.0f, 0, 0, 0]);
}

/// Returns lane 0 of `a`.
float _mm_cvtss_f32(__m128 a) pure @safe
{
    return a[0];
}

version(LDC)
{
    alias _mm_cvtss_si32 = __builtin_ia32_cvtss2si;
}
else
{
    // TODO
}

version(LDC)
{
    alias _mm_cvtss_si64 = __builtin_ia32_cvtss2si64;
}
else
{
    // TODO
}

// MMXREG:
// __m64 _mm_cvtt_ps2pi (__m128 a)

version(LDC)
{
    alias _mm_cvtt_ss2si = __builtin_ia32_cvttss2si;
    alias _mm_cvttss_si32 = _mm_cvtt_ss2si; // it's actually the same op
}
else
{
    // TODO
}

// MMXREG: _mm_cvttps_pi32

version(LDC)
{
    alias _mm_cvttss_si64 = __builtin_ia32_cvttss2si64;
}
else
{
    // TODO
}

/// Returns the lane-wise quotient `a / b`.
__m128 _mm_div_ps(__m128 a, __m128 b) pure @safe
{
    return a / b;
}
unittest
{
    __m128 a = [1.5f, -2.0f, 3.0f, 1.0f];
    a = _mm_div_ps(a, a);
    float[4] correct = [1.0f, 1.0f, 1.0f, 1.0f];
    assert(a.array == correct);
}

/// Divides lane 0 of `a` by lane 0 of `b`; lanes 1..3 of `a` are returned unchanged.
__m128 _mm_div_ss(__m128 a, __m128 b) pure @safe
{
    a[0] /= b[0];
    return a;
}
unittest
{
    __m128 a = [1.5f, -2.0f, 3.0f, 1.0f];
    a = _mm_div_ss(a, a);
    float[4] correct = [1.0f, -2.0, 3.0f, 1.0f];
    assert(a.array == correct);
}

// MMXREG: int _mm_extract_pi16 (__m64 a, int imm8)

// TODO: unsigned int _MM_GET_EXCEPTION_MASK ()
// TODO: unsigned int _MM_GET_EXCEPTION_STATE ()
// TODO: unsigned int _MM_GET_FLUSH_ZERO_MODE ()
// TODO: unsigned int _MM_GET_ROUNDING_MODE ()
// TODO: stmxcsr
// TODO: unsigned int _mm_getcsr (void)

// MMXREG: __m64 _mm_insert_pi16 (__m64 a, int i, int imm8)

/// Loads four floats from `p` by dereferencing it as a `__m128*`
/// (so `p` must satisfy the alignment of `__m128`).
__m128 _mm_load_ps(const(float)*p) pure @trusted
{
    return *cast(__m128*)p;
}

/// Loads the single float at `p` and replicates it into all four lanes.
__m128 _mm_load_ps1(const(float)*p) pure @trusted
{
    float[4] f = [ *p, *p, *p, *p ];
    return loadUnaligned!(float4)(f.ptr);
}

/// Loads the single float at `mem_addr` into lane 0; lanes 1..3 are zero.
__m128 _mm_load_ss (const(float)* mem_addr) pure @trusted
{
    float[4] f = [ *mem_addr, 0.0f, 0.0f, 0.0f ];
    return loadUnaligned!(float4)(f.ptr);
}

alias _mm_load1_ps = _mm_load_ps1;

/// Replaces the upper 64 bits of `a` with the value at `mem_addr`.
__m128 _mm_loadh_pi (__m128 a, const(__m64)* mem_addr) pure @safe
{
    long2 la = cast(long2)a;
    la[1] = *mem_addr;
    return cast(__m128)la;
}

/// Replaces the lower 64 bits of `a` with the value at `mem_addr`.
__m128 _mm_loadl_pi (__m128 a, const(__m64)* mem_addr) pure @safe
{
    long2 la = cast(long2)a;
    la[0] = *mem_addr;
    return cast(__m128)la;
}

/// Loads four floats from `mem_addr` (dereferenced as a `__m128*`) in reversed lane order.
__m128 _mm_loadr_ps (const(float)* mem_addr) pure @trusted
{
    __m128* aligned = cast(__m128*)mem_addr;
    __m128 a = *aligned;
    return shufflevector!(__m128, 3, 2, 1, 0)(a, a);
}

/// Loads four floats from `p` using an unaligned load.
__m128 _mm_loadu_ps(float*p) pure @safe
{
    return loadUnaligned!(__m128)(p);
}

/// Loads a 16-bit integer from `mem_addr` into the lowest 16-bit lane;
/// every other lane is zero.
__m128i _mm_loadu_si16(const(void)* mem_addr)
{
    short r = *cast(const(short)*)(mem_addr);
    short8 result = [0, 0, 0, 0, 0, 0, 0, 0];
    result[0] = r;
    return cast(__m128i)result;
}
unittest
{
    short r = 13;
    short8 A = cast(short8) _mm_loadu_si16(&r);
    short[8] correct = [13, 0, 0, 0, 0, 0, 0, 0];
    assert(A.array == correct);
}

/// Loads a 64-bit integer from `mem_addr` into the lower 64 bits;
/// the upper 64 bits are zero.
__m128i _mm_loadu_si64(const(void)* mem_addr)
{
    // Read the full 64 bits: going through an `int*` would load only the
    // low 32 bits (sign-extended) and lose the upper half of the value.
    long r = *cast(const(long)*)(mem_addr);
    long2 result = [0, 0];
    result[0] = r;
    return cast(__m128i)result;
}
unittest
{
    long r = 446;
    long2 A = cast(long2) _mm_loadu_si64(&r);
    long[2] correct = [446, 0];
    assert(A.array == correct);

    // A value that does not fit in 32 bits must be loaded intact.
    long big = 0x0123_4567_89AB_CDEF;
    long2 B = cast(long2) _mm_loadu_si64(&big);
    long[2] correctBig = [0x0123_4567_89AB_CDEF, 0];
    assert(B.array == correctBig);
}

// MMXREG: _mm_maskmove_si64
// MMXREG: _m_maskmovq

// MMXREG: _mm_max_pi16
version(LDC)
{
    alias _mm_max_ps = __builtin_ia32_maxps;
}
else
{
    // TODO
}

// MMXREG: _mm_max_pu8
version(LDC)
{
    alias _mm_max_ss = __builtin_ia32_maxss;
}
else
{
    // TODO
}

// MMXREG: _mm_min_pi16
version(LDC)
{
    alias _mm_min_ps = __builtin_ia32_minps;
}
else
{
    // TODO
}

// MMXREG: _mm_min_pu8

version(LDC)
{
    alias _mm_min_ss = __builtin_ia32_minss;
}

/// Returns `[b[0], a[1], a[2], a[3]]`.
__m128 _mm_move_ss (__m128 a, __m128 b) pure @safe
{
    return shufflevector!(__m128, 4, 1, 2, 3)(a, b);
}

/// Returns `[b[2], b[3], a[2], a[3]]`: the upper half of `b` goes to the
/// lower half of the result, and the upper half of `a` stays in place.
__m128 _mm_movehl_ps (__m128 a, __m128 b) pure @safe
{
    // Shuffle indices 0..3 select from `a`, 4..7 from `b`.
    return shufflevector!(float4, 6, 7, 2, 3)(a, b);
}
unittest
{
    __m128 a = [1.0f, 2.0f, 3.0f, 4.0f];
    __m128 b = [5.0f, 6.0f, 7.0f, 8.0f];
    __m128 r = _mm_movehl_ps(a, b);
    float[4] correct = [7.0f, 8.0f, 3.0f, 4.0f];
    assert(r.array == correct);
}

/// Returns `[a[0], a[1], b[0], b[1]]`.
__m128 _mm_movelh_ps (__m128 a, __m128 b) pure @safe
{
    return shufflevector!(float4, 0, 1, 4, 5)(a, b);
}

// TODO: int _mm_movemask_pi8
version(LDC)
{
    alias _mm_movemask_ps = __builtin_ia32_movmskps;
}

/// Returns the lane-wise product `a * b`.
__m128 _mm_mul_ps(__m128 a, __m128 b) pure @safe
{
    return a * b;
}
unittest
{
    __m128 a = [1.5f, -2.0f, 3.0f, 1.0f];
    a = _mm_mul_ps(a, a);
    float[4] correct = [2.25f, 4.0f, 9.0f, 1.0f];
    assert(a.array == correct);
}

/// Multiplies lane 0 of `a` by lane 0 of `b`; lanes 1..3 of `a` are returned unchanged.
__m128 _mm_mul_ss(__m128 a, __m128 b) pure @safe
{
    a[0] *= b[0];
    return a;
}
unittest
{
    __m128 a = [1.5f, -2.0f, 3.0f, 1.0f];
    a = _mm_mul_ss(a, a);
    float[4] correct = [2.25f, -2.0f, 3.0f, 1.0f];
    assert(a.array == correct);
}

// MMXREG: _mm_mulhi_pu16

/// Bitwise OR of `a` and `b`, computed on their integer reinterpretation.
__m128 _mm_or_ps (__m128 a, __m128 b) pure @safe
{
    return cast(__m128)(cast(__m128i)a | cast(__m128i)b);
}

// MMXREG: __m64 _m_pavgb (__m64 a, __m64 b)
// MMXREG: __m64 _m_pavgw (__m64 a, __m64 b)
// MMXREG: int _m_pextrw (__m64 a, int imm8)
// MMXREG: __m64 _m_pinsrw (__m64 a, int i, int imm8)
// MMXREG: __m64 _m_pmaxsw (__m64 a, __m64 b)
// MMXREG: __m64 _m_pmaxub (__m64 a, __m64 b)
// MMXREG: __m64 _m_pminsw (__m64 a, __m64 b)
// MMXREG: __m64 _m_pminub (__m64 a, __m64 b)
// MMXREG: int _m_pmovmskb (__m64 a)

// MMXREG: __m64 _m_pmulhuw (__m64 a, __m64 b)

// Prefetch hints. The value is forwarded unchanged as the locality argument
// of `llvm_prefetch`, where 0 means "no temporal locality" and 3 means
// "keep in all cache levels". Hence NTA = 0 and T0 = 3, which are also the
// values used by the C headers.
enum _MM_HINT_NTA = 0;
enum _MM_HINT_T2 = 1;
enum _MM_HINT_T1 = 2;
enum _MM_HINT_T0 = 3;

// Note: locality must be compile-time
/// Issues a prefetch for address `p` with the given `locality` hint
/// (one of the `_MM_HINT_*` constants).
void _mm_prefetch(int locality)(void* p) pure @safe
{
    llvm_prefetch(p, 0, locality, 1);
}

// MMXREG: __m64 _m_psadbw (__m64 a, __m64 b)
// MMXREG: __m64 _m_pshufw (__m64 a, int imm8)

version(LDC)
{
    alias _mm_rcp_ps = __builtin_ia32_rcpps;
}
// TODO

version(LDC)
{
    alias _mm_rcp_ss = __builtin_ia32_rcpss;
}
// TODO

version(LDC)
{
    alias _mm_rsqrt_ps = __builtin_ia32_rsqrtps;
}
// TODO

version(LDC)
{
    alias _mm_rsqrt_ss = __builtin_ia32_rsqrtss;
}
// TODO

// TODO: _mm_sad_pu8
// TODO: void _MM_SET_EXCEPTION_MASK (unsigned int a)
// TODO: void _MM_SET_EXCEPTION_STATE (unsigned int a)
// TODO: void _MM_SET_FLUSH_ZERO_MODE (unsigned int a)

/// Builds `[e0, e1, e2, e3]`: the LAST argument goes to lane 0.
__m128 _mm_set_ps (float e3, float e2, float e1, float e0) pure @trusted
{
    float[4] result = [e0, e1, e2, e3];
    return loadUnaligned!(float4)(result.ptr);
}

alias _mm_set_ps1 = _mm_set1_ps;

// TODO: _MM_SET_ROUNDING_MODE

/// Builds `[a, 0, 0, 0]`.
__m128 _mm_set_ss (float a) pure @trusted
{
    float[4] result = [a, 0.0f, 0.0f, 0.0f];
    return loadUnaligned!(float4)(result.ptr);
}

/// Builds `[a, a, a, a]`.
__m128 _mm_set1_ps (float a) pure @trusted
{
    float[4] result = [a, a, a, a];
    return loadUnaligned!(float4)(result.ptr);
}

// TODO: _mm_setcsr

/// Builds a vector from the arguments in the order given: the FIRST argument
/// goes to lane 0.
__m128 _mm_setr_ps (float e3, float e2, float e1, float e0) pure @trusted
{
    float[4] result = [e3, e2, e1, e0];
    return loadUnaligned!(float4)(result.ptr);
}

/// Builds `[0, 0, 0, 0]`.
__m128 _mm_setzero_ps() pure @trusted
{
    float[4] result = [0.0f, 0.0f, 0.0f, 0.0f];
    return loadUnaligned!(float4)(result.ptr);
}

version(LDC)
{
    alias _mm_sfence = __builtin_ia32_sfence;
}
// TODO

// MMXREG: _mm_shuffle_pi16

// Note: the immediate shuffle value is given at compile-time instead of runtime.
/// Selects two lanes from `a` (result lanes 0 and 1) and two lanes from `b`
/// (result lanes 2 and 3). Each 2-bit field of `imm`, lowest first, is a lane index.
__m128 _mm_shuffle_ps(ubyte imm)(__m128 a, __m128 b) pure @safe
{
    // Indices 0..3 address `a`, indices 4..7 address `b`.
    enum int fromA0 = imm & 3;
    enum int fromA1 = (imm >> 2) & 3;
    enum int fromB0 = 4 + ((imm >> 4) & 3);
    enum int fromB1 = 4 + ((imm >> 6) & 3);
    return shufflevector!(__m128, fromA0, fromA1, fromB0, fromB1)(a, b);
}

version(LDC)
{
    // Disappeared with LDC 1.11
    static if (__VERSION__ < 2081)
        alias _mm_sqrt_ps = __builtin_ia32_sqrtps;
    else
    {
        /// Square root of each of the four lanes of `vec`.
        __m128 _mm_sqrt_ps(__m128 vec) pure @safe
        {
            foreach (lane; 0 .. 4)
                vec.array[lane] = llvm_sqrt(vec.array[lane]);
            return vec;
        }
    }
}
else
{
    /// Square root of each of the four lanes of `vec`.
    __m128 _mm_sqrt_ps(__m128 vec) pure @safe
    {
        import std.math: sqrt;
        foreach (lane; 0 .. 4)
            vec.array[lane] = sqrt(vec.array[lane]);
        return vec;
    }
}
unittest
{
    __m128 A = _mm_sqrt_ps(_mm_set1_ps(4.0f));
    float[4] expected = [2.0f, 2.0f, 2.0f, 2.0f];
    assert(A.array == expected);
}

version(LDC)
{
    // Disappeared with LDC 1.11
    static if (__VERSION__ < 2081)
        alias _mm_sqrt_ss = __builtin_ia32_sqrtss;
    else
    {
        /// Square root of lane 0 of `vec`; lanes 1..3 are returned unchanged.
        __m128 _mm_sqrt_ss(__m128 vec) pure @safe
        {
            vec.array[0] = llvm_sqrt(vec.array[0]);
            return vec;
        }
    }
}
else
{
    /// Square root of lane 0 of `vec`; lanes 1..3 are returned unchanged.
    __m128 _mm_sqrt_ss(__m128 vec) pure @safe
    {
        import std.math: sqrt;
        vec.array[0] = sqrt(vec.array[0]);
        return vec;
    }
}
unittest
{
    __m128 A = _mm_sqrt_ss(_mm_set1_ps(4.0f));
    float[4] expected = [2.0f, 4.0f, 4.0f, 4.0f];
    assert(A.array == expected);
}

/// Stores the four lanes of `a` at `mem_addr`, written through a `__m128*`.
void _mm_store_ps (float* mem_addr, __m128 a) pure // not safe since nothing guarantees alignment
{
    *cast(__m128*)mem_addr = a;
}

alias _mm_store_ps1 = _mm_store1_ps;

/// Stores lane 0 of `a` at `mem_addr`.
void _mm_store_ss (float* mem_addr, __m128 a) pure @safe
{
    *mem_addr = a[0];
}

/// Stores lane 0 of `a` replicated four times at `mem_addr`, written through a `__m128*`.
void _mm_store1_ps (float* mem_addr, __m128 a) pure // not safe since nothing guarantees alignment
{
    __m128 broadcast = shufflevector!(__m128, 0, 0, 0, 0)(a, a);
    *cast(__m128*)mem_addr = broadcast;
}

/// Stores the upper 64 bits of `a` at `p`.
void _mm_storeh_pi(__m64* p, __m128 a) pure @safe
{
    *p = extractelement!(long2, 1)(a);
}

/// Stores the lower 64 bits of `a` at `p`.
void _mm_storel_pi(__m64* p, __m128 a) pure @safe
{
    *p = extractelement!(long2, 0)(a);
}

/// Stores the four lanes of `a` in reversed order at `mem_addr`, written through a `__m128*`.
void _mm_storer_ps(float* mem_addr, __m128 a) pure // not safe since nothing guarantees alignment
{
    __m128 reversed = shufflevector!(__m128, 3, 2, 1, 0)(a, a);
    *cast(__m128*)mem_addr = reversed;
}

/// Stores the four lanes of `a` at `mem_addr` using an unaligned store.
void _mm_storeu_ps(float* mem_addr, __m128 a) pure @safe
{
    storeUnaligned!(float4)(a, mem_addr);
}

// TODO: _mm_stream_pi, does not seem possible
// TODO: _mm_stream_ps, does not seem possible


/// Returns the lane-wise difference `a - b`.
__m128 _mm_sub_ps(__m128 a, __m128 b) pure @safe
{
    return a - b;
}
unittest
{
    __m128 v = [1.5f, -2.0f, 3.0f, 1.0f];
    v = _mm_sub_ps(v, v);
    float[4] expected = [0.0f, 0.0f, 0.0f, 0.0f];
    assert(v.array == expected);
}

/// Subtracts lane 0 of `b` from lane 0 of `a`; lanes 1..3 of `a` are returned unchanged.
__m128 _mm_sub_ss(__m128 a, __m128 b) pure @safe
{
    a[0] = a[0] - b[0];
    return a;
}
unittest
{
    __m128 v = [1.5f, -2.0f, 3.0f, 1.0f];
    v = _mm_sub_ss(v, v);
    float[4] expected = [0.0f, -2.0, 3.0f, 1.0f];
    assert(v.array == expected);
}


/// Rewrites the four rows in place from unpack (interleave) and
/// move-half operations applied to the original rows.
void _MM_TRANSPOSE4_PS (ref __m128 row0, ref __m128 row1, ref __m128 row2, ref __m128 row3) pure @safe
{
    // Interleave rows pairwise: low halves first, then high halves.
    __m128 lo01 = _mm_unpacklo_ps(row0, row1);
    __m128 lo23 = _mm_unpacklo_ps(row2, row3);
    __m128 hi01 = _mm_unpackhi_ps(row0, row1);
    __m128 hi23 = _mm_unpackhi_ps(row2, row3);

    // Recombine the 64-bit halves into the output rows.
    row0 = _mm_movelh_ps(lo01, lo23);
    row1 = _mm_movehl_ps(lo23, lo01);
    row2 = _mm_movelh_ps(hi01, hi23);
    row3 = _mm_movehl_ps(hi23, hi01);
}

// Scalar unordered comparisons: only available as LDC builtins for now.
version(LDC)
{
    alias _mm_ucomieq_ss = __builtin_ia32_ucomieq;
    alias _mm_ucomige_ss = __builtin_ia32_ucomige;
    alias _mm_ucomigt_ss = __builtin_ia32_ucomigt;
    alias _mm_ucomile_ss = __builtin_ia32_ucomile;
    alias _mm_ucomilt_ss = __builtin_ia32_ucomilt;
    alias _mm_ucomineq_ss = __builtin_ia32_ucomineq;
}
// TODO


/// Returns a vector whose contents are deliberately left uninitialized.
__m128 _mm_undefined_ps() pure @safe
{
    __m128 garbage = void;
    return garbage;
}

/// Returns `[a[2], b[2], a[3], b[3]]`.
__m128 _mm_unpackhi_ps (__m128 a, __m128 b) pure @safe
{
    return shufflevector!(float4, 2, 6, 3, 7)(a, b);
}

/// Returns `[a[0], b[0], a[1], b[1]]`.
__m128 _mm_unpacklo_ps (__m128 a, __m128 b) pure @safe
{
    return shufflevector!(float4, 0, 4, 1, 5)(a, b);
}

/// Returns `a ^ b`. Note that this takes and returns `__m128i`.
__m128i _mm_xor_ps (__m128i a, __m128i b) pure @safe
{
    return a ^ b;
}