/**
* Copyright: Copyright Auburn Sounds 2016-2018.
* License:   $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
* Authors:   Guillaume Piolat
*/

module inteli.xmmintrin;

public import inteli.types;

import inteli.internals;

// SSE1
// Note: intrinsics marked MMXREG actually use MMX registers and were not translated.
// They correspond to instructions introduced with SSE1 that also operate on MMX registers.

nothrow @nogc:

__m128 _mm_add_ps(__m128 a, __m128 b) pure @safe
{
    return a + b;
}

unittest
{
    __m128 a = [1, 2, 3, 4];
    a = _mm_add_ps(a, a);
    assert(a.array[0] == 2);
    assert(a.array[1] == 4);
    assert(a.array[2] == 6);
    assert(a.array[3] == 8);
}

__m128 _mm_add_ss(__m128 a, __m128 b) pure @safe
{
    a[0] += b[0];
    return a;
}
unittest
{
    __m128 a = [1, 2, 3, 4];
    a = _mm_add_ss(a, a);
    assert(a.array == [2.0f, 2, 3, 4]);
}

__m128 _mm_and_ps (__m128 a, __m128 b) pure @safe
{
    return cast(__m128)(cast(__m128i)a & cast(__m128i)b);
}
unittest
{
    // Note: tested in emmintrin.d
}

__m128 _mm_andnot_ps (__m128 a, __m128 b) pure @safe
{
    return cast(__m128)( (~cast(__m128i)a) & cast(__m128i)b );
}


// MMXREG: _mm_avg_pu16
// MMXREG: _mm_avg_pu8

version(LDC)
{
    pragma(LDC_intrinsic, "llvm.x86.sse.cmp.ps")
        __m128 __builtin_ia32_cmpps(__m128, __m128, byte) pure @safe;

    pragma(LDC_intrinsic, "llvm.x86.sse.cmp.ss")
        __m128 __builtin_ia32_cmpss(__m128, __m128, byte) pure @safe;
}
else
{
    // unimplemented
    /*__m128 __builtin_ia32_cmpps(__m128, __m128, byte) pure @safe
    {
        assert(false, "unimplemented");
    }*/
}

version(LDC)
{
    __m128 _mm_cmpeq_ps (__m128 a, __m128 b) pure @safe
    {
        return __builtin_ia32_cmpps(a, b, 0);
    }

    __m128 _mm_cmpeq_ss (__m128 a, __m128 b) pure @safe
    {
        return __builtin_ia32_cmpss(a, b, 0);
    }

    __m128 _mm_cmpge_ps (__m128 a, __m128 b) pure @safe
    {
        return __builtin_ia32_cmpps(b, a, 2); // CMPLEPS reversed
    }

    __m128 _mm_cmpge_ss (__m128 a, __m128 b) pure @safe
    {
        return __builtin_ia32_cmpss(b, a, 2); // CMPLESS reversed
    }

    __m128 _mm_cmpgt_ps (__m128 a, __m128 b) pure @safe
    {
        return __builtin_ia32_cmpps(b, a, 1); // CMPLTPS reversed
    }

    __m128 _mm_cmpgt_ss (__m128 a, __m128 b) pure @safe
    {
        return __builtin_ia32_cmpss(b, a, 1); // CMPLTSS reversed
    }

    __m128 _mm_cmple_ps (__m128 a, __m128 b) pure @safe
    {
        return __builtin_ia32_cmpps(a, b, 2); // CMPLEPS
    }

    __m128 _mm_cmple_ss (__m128 a, __m128 b) pure @safe
    {
        return __builtin_ia32_cmpss(a, b, 2); // CMPLESS
    }

    __m128 _mm_cmplt_ps (__m128 a, __m128 b) pure @safe
    {
        return __builtin_ia32_cmpps(a, b, 1); // CMPLTPS
    }

    __m128 _mm_cmplt_ss (__m128 a, __m128 b) pure @safe
    {
        return __builtin_ia32_cmpss(a, b, 1); // CMPLTSS
    }

    __m128 _mm_cmpneq_ps (__m128 a, __m128 b) pure @safe
    {
        return __builtin_ia32_cmpps(a, b, 4); // CMPNEQPS
    }

    __m128 _mm_cmpneq_ss (__m128 a, __m128 b) pure @safe
    {
        return __builtin_ia32_cmpss(a, b, 4); // CMPNEQSS
    }

    __m128 _mm_cmpnge_ps (__m128 a, __m128 b) pure @safe
    {
        return __builtin_ia32_cmpps(b, a, 6); // CMPNLEPS reversed
    }

    __m128 _mm_cmpnge_ss (__m128 a, __m128 b) pure @safe
    {
        return __builtin_ia32_cmpss(b, a, 6); // CMPNLESS reversed
    }

    __m128 _mm_cmpngt_ps (__m128 a, __m128 b) pure @safe
    {
        return __builtin_ia32_cmpps(b, a, 5); // CMPNLTPS reversed
    }
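    // Added illustrative unittest (a sketch; LDC-only like the intrinsics above): packed
    // comparisons return a per-lane mask of all ones (-1 as int) or all zeroes, and the
    // "reversed" variants above are obtained by swapping operands of <, <=, !<, !<=.
    unittest
    {
        __m128 a = [1.0f, 2.0f, 3.0f, 4.0f];
        __m128 b = [4.0f, 3.0f, 2.0f, 1.0f];
        __m128i mask = cast(__m128i) _mm_cmpgt_ps(a, b);
        assert(mask.array[0] == 0);  // 1 > 4 is false
        assert(mask.array[3] == -1); // 4 > 1 is true
    }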
    __m128 _mm_cmpngt_ss (__m128 a, __m128 b) pure @safe
    {
        return __builtin_ia32_cmpss(b, a, 5); // CMPNLTSS reversed
    }

    __m128 _mm_cmpnle_ps (__m128 a, __m128 b) pure @safe
    {
        return __builtin_ia32_cmpps(a, b, 6); // CMPNLEPS
    }

    __m128 _mm_cmpnle_ss (__m128 a, __m128 b) pure @safe
    {
        return __builtin_ia32_cmpss(a, b, 6); // CMPNLESS
    }

    __m128 _mm_cmpnlt_ps (__m128 a, __m128 b) pure @safe
    {
        return __builtin_ia32_cmpps(a, b, 5); // CMPNLTPS
    }

    __m128 _mm_cmpnlt_ss (__m128 a, __m128 b) pure @safe
    {
        return __builtin_ia32_cmpss(a, b, 5); // CMPNLTSS
    }

    __m128 _mm_cmpord_ps (__m128 a, __m128 b) pure @safe
    {
        return __builtin_ia32_cmpps(a, b, 7); // CMPORDPS
    }

    __m128 _mm_cmpord_ss (__m128 a, __m128 b) pure @safe
    {
        return __builtin_ia32_cmpss(a, b, 7); // CMPORDSS
    }

    __m128 _mm_cmpunord_ps (__m128 a, __m128 b) pure @safe
    {
        return __builtin_ia32_cmpps(a, b, 3); // CMPUNORDPS
    }

    __m128 _mm_cmpunord_ss (__m128 a, __m128 b) pure @safe
    {
        return __builtin_ia32_cmpss(a, b, 3); // CMPUNORDSS
    }
}
else
{
    // TODO
}

version(LDC)
{
    alias _mm_comieq_ss = __builtin_ia32_comieq;
}
else
{
    // TODO
    /*int _mm_comieq_ss(__m128, __m128) pure @safe
    {
        assert(false, "unimplemented");
    }
    */
}


version(LDC)
{
    alias _mm_comige_ss = __builtin_ia32_comige;
}
else
{
    // TODO
    /*
    int _mm_comige_ss(__m128, __m128) pure @safe
    {
        assert(false, "unimplemented");
    }
    */
}


version(LDC)
{
    alias _mm_comigt_ss = __builtin_ia32_comigt;
}
else
{
    // TODO
    /*
    int _mm_comigt_ss(__m128, __m128) pure @safe
    {
        assert(false, "unimplemented");
    }
    */
}


version(LDC)
{
    alias _mm_comile_ss = __builtin_ia32_comile;
}
else
{
    // TODO
    /*
    int _mm_comile_ss(__m128, __m128) pure @safe
    {
        assert(false, "unimplemented");
    }
    */
}


version(LDC)
{
    alias _mm_comilt_ss = __builtin_ia32_comilt;
}
else
{
    // TODO
    /*
    int _mm_comilt_ss(__m128, __m128) pure @safe
    {
        assert(false, "unimplemented");
    }
    */
}

version(LDC)
{
    alias _mm_comineq_ss = __builtin_ia32_comineq;
}
else
{
    // TODO
    /*
    int _mm_comineq_ss(__m128, __m128) pure @safe
    {
        assert(false, "unimplemented");
    }
    */
}

// MMXREG: __m128 _mm_cvt_pi2ps (__m128 a, __m64 b)
// MMXREG: __m64 _mm_cvt_ps2pi (__m128 a)


version(LDC)
{
    pragma(LDC_intrinsic, "llvm.x86.sse.cvtsi2ss")
        float4 _mm_cvt_si2ss(float4, int) pure @safe;
}
else
{
    // TODO
    /*
    __m128 _mm_cvt_si2ss(__m128, int) pure @safe
    {
        assert(false, "unimplemented");
    }
    */
}

version(LDC)
{
    alias _mm_cvt_ss2si = __builtin_ia32_cvtss2si;
}
else
{
    // TODO
    /*
    int _mm_cvt_ss2si(__m128 v) pure @safe
    {
        assert(false, "unimplemented");
    }
    */
}
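// Added illustrative unittest (a sketch, LDC-only since the aliases are; assumes the default
// round-to-nearest rounding mode): _mm_cvt_ss2si rounds according to the current rounding
// mode, while _mm_cvtt_ss2si (defined further below) truncates toward zero.
version(LDC) unittest
{
    assert(_mm_cvt_ss2si(_mm_set1_ps(1.75f)) == 2);
    assert(_mm_cvtt_ss2si(_mm_set1_ps(1.75f)) == 1);
}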
// MMXREG: __m128 _mm_cvtpi16_ps (__m64 a)
// MMXREG: __m128 _mm_cvtpi32_ps (__m128 a, __m64 b)
// MMXREG: __m128 _mm_cvtpi32x2_ps (__m64 a, __m64 b)
// MMXREG: __m128 _mm_cvtpi8_ps (__m64 a)
// MMXREG: __m64 _mm_cvtps_pi16 (__m128 a)
// MMXREG: __m64 _mm_cvtps_pi32 (__m128 a)
// MMXREG: __m64 _mm_cvtps_pi8 (__m128 a)
// MMXREG: __m128 _mm_cvtpu16_ps (__m64 a)
// MMXREG: __m128 _mm_cvtpu8_ps (__m64 a)

version(LDC)
{
    // this LLVM intrinsic seems to still be there
    pragma(LDC_intrinsic, "llvm.x86.sse.cvtsi2ss")
        float4 _mm_cvtsi32_ss(float4, int) pure @safe;
}
else
{
    __m128 _mm_cvtsi32_ss(__m128 v, int x) pure @safe
    {
        v[0] = cast(float)x;
        return v;
    }
}
unittest
{
    __m128 a = _mm_cvtsi32_ss(_mm_set1_ps(0.0f), 42);
    assert(a.array == [42.0f, 0, 0, 0]);
}

// Note: on macOS, using "llvm.x86.sse.cvtsi642ss" was buggy
__m128 _mm_cvtsi64_ss(__m128 v, long x) pure @safe
{
    v[0] = cast(float)x;
    return v;
}
unittest
{
    __m128 a = _mm_cvtsi64_ss(_mm_set1_ps(0.0f), 42);
    assert(a.array == [42.0f, 0, 0, 0]);
}

float _mm_cvtss_f32(__m128 a) pure @safe
{
    return a[0];
}

version(LDC)
{
    alias _mm_cvtss_si32 = __builtin_ia32_cvtss2si;
}
else
{
    // TODO
}

version(LDC)
{
    alias _mm_cvtss_si64 = __builtin_ia32_cvtss2si64;
}
else
{
    // TODO
}

// MMXREG: __m64 _mm_cvtt_ps2pi (__m128 a)

version(LDC)
{
    alias _mm_cvtt_ss2si = __builtin_ia32_cvttss2si;
    alias _mm_cvttss_si32 = _mm_cvtt_ss2si; // it's actually the same op
}
else
{
    // TODO
}

// MMXREG: _mm_cvttps_pi32

version(LDC)
{
    alias _mm_cvttss_si64 = __builtin_ia32_cvttss2si64;
}
else
{
    // TODO
}

__m128 _mm_div_ps(__m128 a, __m128 b) pure @safe
{
    return a / b;
}
unittest
{
    __m128 a = [1.5f, -2.0f, 3.0f, 1.0f];
    a = _mm_div_ps(a, a);
    float[4] correct = [1.0f, 1.0f, 1.0f, 1.0f];
    assert(a.array == correct);
}

__m128 _mm_div_ss(__m128 a, __m128 b) pure @safe
{
    a[0] /= b[0];
    return a;
}
unittest
{
    __m128 a = [1.5f, -2.0f, 3.0f, 1.0f];
    a = _mm_div_ss(a, a);
    float[4] correct = [1.0f, -2.0f, 3.0f, 1.0f];
    assert(a.array == correct);
}

// MMXREG: int _mm_extract_pi16 (__m64 a, int imm8)

// TODO: unsigned int _MM_GET_EXCEPTION_MASK ()
// TODO: unsigned int _MM_GET_EXCEPTION_STATE ()
// TODO: unsigned int _MM_GET_FLUSH_ZERO_MODE ()
// TODO: unsigned int _MM_GET_ROUNDING_MODE ()
// TODO: stmxcsr
// TODO: unsigned int _mm_getcsr (void)

// MMXREG: __m64 _mm_insert_pi16 (__m64 a, int i, int imm8)

__m128 _mm_load_ps(const(float)* p) pure @trusted
{
    return *cast(__m128*)p;
}

__m128 _mm_load_ps1(const(float)* p) pure @trusted
{
    float[4] f = [ *p, *p, *p, *p ];
    return loadUnaligned!(float4)(f.ptr);
}

__m128 _mm_load_ss (const(float)* mem_addr) pure @trusted
{
    float[4] f = [ *mem_addr, 0.0f, 0.0f, 0.0f ];
    return loadUnaligned!(float4)(f.ptr);
}

alias _mm_load1_ps = _mm_load_ps1;

__m128 _mm_loadh_pi (__m128 a, const(__m64)* mem_addr) pure @safe
{
    long2 la = cast(long2)a;
    la[1] = *mem_addr;
    return cast(__m128)la;
}

__m128 _mm_loadl_pi (__m128 a, const(__m64)* mem_addr) pure @safe
{
    long2 la = cast(long2)a;
    la[0] = *mem_addr;
    return cast(__m128)la;
}

__m128 _mm_loadr_ps (const(float)* mem_addr) pure @trusted
{
    __m128* aligned = cast(__m128*)mem_addr;
    __m128 a = *aligned;
    return shufflevector!(__m128, 3, 2, 1, 0)(a, a);
}

__m128 _mm_loadu_ps(float* p) pure @safe
{
    return loadUnaligned!(__m128)(p);
}
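// Added illustrative unittest (a sketch): _mm_loadu_ps has no alignment requirement,
// and _mm_load_ss fills the three upper lanes with zero.
unittest
{
    float[4] data = [1.0f, 2.0f, 3.0f, 4.0f];
    __m128 a = _mm_loadu_ps(data.ptr);
    assert(a.array == [1.0f, 2.0f, 3.0f, 4.0f]);
    __m128 s = _mm_load_ss(data.ptr);
    assert(s.array == [1.0f, 0.0f, 0.0f, 0.0f]);
}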
// MMXREG: _mm_maskmove_si64
// MMXREG: _m_maskmovq

// MMXREG: _mm_max_pi16
version(LDC)
{
    alias _mm_max_ps = __builtin_ia32_maxps;
}
else
{
    // TODO
}

// MMXREG: _mm_max_pu8
version(LDC)
{
    alias _mm_max_ss = __builtin_ia32_maxss;
}
else
{
    // TODO
}

// MMXREG: _mm_min_pi16
version(LDC)
{
    alias _mm_min_ps = __builtin_ia32_minps;
}
else
{
    // TODO
}

// MMXREG: _mm_min_pi8

version(LDC)
{
    alias _mm_min_ss = __builtin_ia32_minss;
}

__m128 _mm_move_ss (__m128 a, __m128 b) pure @safe
{
    return shufflevector!(__m128, 4, 1, 2, 3)(a, b);
}

__m128 _mm_movehl_ps (__m128 a, __m128 b) pure @safe
{
    // lower half takes the high half of b, upper half keeps the high half of a
    return shufflevector!(float4, 6, 7, 2, 3)(a, b);
}

__m128 _mm_movelh_ps (__m128 a, __m128 b) pure @safe
{
    return shufflevector!(float4, 0, 1, 4, 5)(a, b);
}

// TODO: int _mm_movemask_pi8
version(LDC)
{
    alias _mm_movemask_ps = __builtin_ia32_movmskps;
}

__m128 _mm_mul_ps(__m128 a, __m128 b) pure @safe
{
    return a * b;
}
unittest
{
    __m128 a = [1.5f, -2.0f, 3.0f, 1.0f];
    a = _mm_mul_ps(a, a);
    float[4] correct = [2.25f, 4.0f, 9.0f, 1.0f];
    assert(a.array == correct);
}

__m128 _mm_mul_ss(__m128 a, __m128 b) pure @safe
{
    a[0] *= b[0];
    return a;
}
unittest
{
    __m128 a = [1.5f, -2.0f, 3.0f, 1.0f];
    a = _mm_mul_ss(a, a);
    float[4] correct = [2.25f, -2.0f, 3.0f, 1.0f];
    assert(a.array == correct);
}

// MMXREG: _mm_mulhi_pu16

__m128 _mm_or_ps (__m128 a, __m128 b) pure @safe
{
    return cast(__m128)(cast(__m128i)a | cast(__m128i)b);
}

// MMXREG: __m64 _m_pavgb (__m64 a, __m64 b)
// MMXREG: __m64 _m_pavgw (__m64 a, __m64 b)
// MMXREG: int _m_pextrw (__m64 a, int imm8)
// MMXREG: __m64 _m_pinsrw (__m64 a, int i, int imm8)
// MMXREG: __m64 _m_pmaxsw (__m64 a, __m64 b)
// MMXREG: __m64 _m_pmaxub (__m64 a, __m64 b)
// MMXREG: __m64 _m_pminsw (__m64 a, __m64 b)
// MMXREG: __m64 _m_pminub (__m64 a, __m64 b)
// MMXREG: int _m_pmovmskb (__m64 a)

// MMXREG: __m64 _m_pmulhuw (__m64 a, __m64 b)

enum _MM_HINT_NTA = 0;
enum _MM_HINT_T0 = 1;
enum _MM_HINT_T1 = 2;
enum _MM_HINT_T2 = 3;

// Note: locality must be a compile-time constant
void _mm_prefetch(int locality)(void* p) pure @safe
{
    llvm_prefetch(p, 0, locality, 1);
}

// MMXREG: __m64 _m_psadbw (__m64 a, __m64 b)
// MMXREG: __m64 _m_pshufw (__m64 a, int imm8)

version(LDC)
{
    alias _mm_rcp_ps = __builtin_ia32_rcpps;
}
// TODO

version(LDC)
{
    alias _mm_rcp_ss = __builtin_ia32_rcpss;
}
// TODO

version(LDC)
{
    alias _mm_rsqrt_ps = __builtin_ia32_rsqrtps;
}
// TODO

version(LDC)
{
    alias _mm_rsqrt_ss = __builtin_ia32_rsqrtss;
}
// TODO

// TODO: _mm_sad_pu8
// TODO: void _MM_SET_EXCEPTION_MASK (unsigned int a)
// TODO: void _MM_SET_EXCEPTION_STATE (unsigned int a)
// TODO: void _MM_SET_FLUSH_ZERO_MODE (unsigned int a)

__m128 _mm_set_ps (float e3, float e2, float e1, float e0) pure @trusted
{
    float[4] result = [e0, e1, e2, e3];
    return loadUnaligned!(float4)(result.ptr);
}

alias _mm_set_ps1 = _mm_set1_ps;
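// Added illustrative unittest (a sketch): _mm_set_ps takes its elements from most- to
// least-significant, so the first argument lands in the highest lane; _mm_setr_ps below
// takes them in memory order.
unittest
{
    __m128 a = _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f);
    assert(a.array == [0.0f, 1.0f, 2.0f, 3.0f]);
}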
// TODO: _MM_SET_ROUNDING_MODE

__m128 _mm_set_ss (float a) pure @trusted
{
    float[4] result = [a, 0.0f, 0.0f, 0.0f];
    return loadUnaligned!(float4)(result.ptr);
}

__m128 _mm_set1_ps (float a) pure @trusted
{
    float[4] result = [a, a, a, a];
    return loadUnaligned!(float4)(result.ptr);
}

// TODO: _mm_setcsr

__m128 _mm_setr_ps (float e3, float e2, float e1, float e0) pure @trusted
{
    float[4] result = [e3, e2, e1, e0];
    return loadUnaligned!(float4)(result.ptr);
}

__m128 _mm_setzero_ps() pure @trusted
{
    float[4] result = [0.0f, 0.0f, 0.0f, 0.0f];
    return loadUnaligned!(float4)(result.ptr);
}

version(LDC)
{
    alias _mm_sfence = __builtin_ia32_sfence;
}
// TODO

// MMXREG: _mm_shuffle_pi16

// Note: the immediate shuffle value must be given at compile time instead of runtime.
__m128 _mm_shuffle_ps(ubyte imm)(__m128 a, __m128 b) pure @safe
{
    return shufflevector!(__m128, imm & 3, (imm>>2) & 3, 4 + ((imm>>4) & 3), 4 + ((imm>>6) & 3) )(a, b);
}

version(LDC)
{
    alias _mm_sqrt_ps = __builtin_ia32_sqrtps;
}
else
{
    __m128 _mm_sqrt_ps(__m128 vec) pure @safe
    {
        import std.math: sqrt;
        vec.array[0] = sqrt(vec.array[0]);
        vec.array[1] = sqrt(vec.array[1]);
        vec.array[2] = sqrt(vec.array[2]);
        vec.array[3] = sqrt(vec.array[3]);
        return vec;
    }
}

version(LDC)
{
    alias _mm_sqrt_ss = __builtin_ia32_sqrtss;
}
else
{
    __m128 _mm_sqrt_ss(__m128 vec) pure @safe
    {
        import std.math: sqrt;
        vec.array[0] = sqrt(vec.array[0]);
        return vec;
    }
}

unittest
{
    __m128 A = _mm_sqrt_ps(_mm_set1_ps(4.0f));
    assert(A.array[0] == 2.0f);
}

void _mm_store_ps (float* mem_addr, __m128 a) pure // not safe since nothing guarantees alignment
{
    __m128* aligned = cast(__m128*)mem_addr;
    *aligned = a;
}

alias _mm_store_ps1 = _mm_store1_ps;

void _mm_store_ss (float* mem_addr, __m128 a) pure @safe
{
    *mem_addr = a[0];
}

void _mm_store1_ps (float* mem_addr, __m128 a) pure // not safe since nothing guarantees alignment
{
    __m128* aligned = cast(__m128*)mem_addr;
    *aligned = shufflevector!(__m128, 0, 0, 0, 0)(a, a);
}

void _mm_storeh_pi(__m64* p, __m128 a) pure @safe
{
    *p = extractelement!(long2, 1)(a);
}

void _mm_storel_pi(__m64* p, __m128 a) pure @safe
{
    *p = extractelement!(long2, 0)(a);
}

void _mm_storer_ps(float* mem_addr, __m128 a) pure // not safe since nothing guarantees alignment
{
    __m128* aligned = cast(__m128*)mem_addr;
    *aligned = shufflevector!(__m128, 3, 2, 1, 0)(a, a);
}

void _mm_storeu_ps(float* mem_addr, __m128 a) pure @safe
{
    storeUnaligned!(float4)(a, mem_addr);
}

// TODO: _mm_stream_pi, does not seem possible
// TODO: _mm_stream_ps, does not seem possible
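// Added illustrative unittest (a sketch): _mm_storeu_ps writes all four lanes with no
// alignment requirement, while _mm_store_ss only writes the lowest lane.
unittest
{
    float[4] buf = [0.0f, 0.0f, 0.0f, 0.0f];
    _mm_storeu_ps(buf.ptr, _mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f));
    assert(buf == [1.0f, 2.0f, 3.0f, 4.0f]);
    _mm_store_ss(buf.ptr, _mm_set1_ps(42.0f));
    assert(buf == [42.0f, 2.0f, 3.0f, 4.0f]);
}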

__m128 _mm_sub_ps(__m128 a, __m128 b) pure @safe
{
    return a - b;
}
unittest
{
    __m128 a = [1.5f, -2.0f, 3.0f, 1.0f];
    a = _mm_sub_ps(a, a);
    float[4] correct = [0.0f, 0.0f, 0.0f, 0.0f];
    assert(a.array == correct);
}

__m128 _mm_sub_ss(__m128 a, __m128 b) pure @safe
{
    a[0] -= b[0];
    return a;
}
unittest
{
    __m128 a = [1.5f, -2.0f, 3.0f, 1.0f];
    a = _mm_sub_ss(a, a);
    float[4] correct = [0.0f, -2.0f, 3.0f, 1.0f];
    assert(a.array == correct);
}


void _MM_TRANSPOSE4_PS (ref __m128 row0, ref __m128 row1, ref __m128 row2, ref __m128 row3) pure @safe
{
    __m128 tmp3, tmp2, tmp1, tmp0;
    tmp0 = _mm_unpacklo_ps(row0, row1);
    tmp2 = _mm_unpacklo_ps(row2, row3);
    tmp1 = _mm_unpackhi_ps(row0, row1);
    tmp3 = _mm_unpackhi_ps(row2, row3);
    row0 = _mm_movelh_ps(tmp0, tmp2);
    row1 = _mm_movehl_ps(tmp2, tmp0);
    row2 = _mm_movelh_ps(tmp1, tmp3);
    row3 = _mm_movehl_ps(tmp3, tmp1);
}

version(LDC)
{
    alias _mm_ucomieq_ss = __builtin_ia32_ucomieq;
}
// TODO

version(LDC)
{
    alias _mm_ucomige_ss = __builtin_ia32_ucomige;
}
// TODO

version(LDC)
{
    alias _mm_ucomigt_ss = __builtin_ia32_ucomigt;
}
// TODO

version(LDC)
{
    alias _mm_ucomile_ss = __builtin_ia32_ucomile;
}
// TODO

version(LDC)
{
    alias _mm_ucomilt_ss = __builtin_ia32_ucomilt;
}
// TODO

version(LDC)
{
    alias _mm_ucomineq_ss = __builtin_ia32_ucomineq;
}
// TODO


__m128 _mm_undefined_ps() pure @safe
{
    __m128 undef = void;
    return undef;
}

__m128 _mm_unpackhi_ps (__m128 a, __m128 b) pure @safe
{
    return shufflevector!(float4, 2, 6, 3, 7)(a, b);
}

__m128 _mm_unpacklo_ps (__m128 a, __m128 b) pure @safe
{
    return shufflevector!(float4, 0, 4, 1, 5)(a, b);
}

__m128 _mm_xor_ps (__m128 a, __m128 b) pure @safe
{
    return cast(__m128)(cast(__m128i)a ^ cast(__m128i)b);
}
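// Added illustrative unittest (a sketch) for _MM_TRANSPOSE4_PS: rows become columns.
unittest
{
    __m128 r0 = _mm_setr_ps( 1.0f,  2.0f,  3.0f,  4.0f);
    __m128 r1 = _mm_setr_ps( 5.0f,  6.0f,  7.0f,  8.0f);
    __m128 r2 = _mm_setr_ps( 9.0f, 10.0f, 11.0f, 12.0f);
    __m128 r3 = _mm_setr_ps(13.0f, 14.0f, 15.0f, 16.0f);
    _MM_TRANSPOSE4_PS(r0, r1, r2, r3);
    assert(r0.array == [1.0f, 5.0f,  9.0f, 13.0f]);
    assert(r1.array == [2.0f, 6.0f, 10.0f, 14.0f]);
    assert(r2.array == [3.0f, 7.0f, 11.0f, 15.0f]);
    assert(r3.array == [4.0f, 8.0f, 12.0f, 16.0f]);
}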