/**
* Copyright: Copyright Auburn Sounds 2016-2018.
* License:   $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
* Authors:   Guillaume Piolat
*/

module inteli.xmmintrin;

public import inteli.types;

import inteli.internals;

// SSE1
// Note: intrinsics marked MMXREG actually use MMX registers and were not translated.
// They are instructions introduced with SSE1 that also operate on MMX registers.

nothrow @nogc:

__m128 _mm_add_ps(__m128 a, __m128 b) pure @safe
{
    return a + b;
}

unittest
{
    __m128 a = [1, 2, 3, 4];
    a = _mm_add_ps(a, a);
    assert(a.array[0] == 2);
    assert(a.array[1] == 4);
    assert(a.array[2] == 6);
    assert(a.array[3] == 8);
}

__m128 _mm_add_ss(__m128 a, __m128 b) pure @safe
{
    a[0] += b[0];
    return a;
}
unittest
{
    __m128 a = [1, 2, 3, 4];
    a = _mm_add_ss(a, a);
    assert(a.array == [2.0f, 2, 3, 4]);
}

__m128 _mm_and_ps (__m128 a, __m128 b) pure @safe
{
    return cast(__m128)(cast(__m128i)a & cast(__m128i)b);
}
unittest
{
    // Note: tested in emmintrin.d
}

__m128 _mm_andnot_ps (__m128 a, __m128 b) pure @safe
{
    return cast(__m128)( (~cast(__m128i)a) & cast(__m128i)b );
}


// MMXREG: _mm_avg_pu16
// MMXREG: _mm_avg_pu8

version(LDC)
{
    pragma(LDC_intrinsic, "llvm.x86.sse.cmp.ps")
    __m128 __builtin_ia32_cmpps(__m128, __m128, byte) pure @safe;

    pragma(LDC_intrinsic, "llvm.x86.sse.cmp.ss")
    __m128 __builtin_ia32_cmpss(__m128, __m128, byte) pure @safe;
}
else
{
    // unimplemented
    /*__m128 __builtin_ia32_cmpps(__m128, __m128, byte) pure @safe
    {
        assert(false, "unimplemented");
    }*/
}

version(LDC)
{
    __m128 _mm_cmpeq_ps (__m128 a, __m128 b) pure @safe
    {
        return __builtin_ia32_cmpps(a, b, 0);
    }

    __m128 _mm_cmpeq_ss (__m128 a, __m128 b) pure @safe
    {
        return __builtin_ia32_cmpss(a, b, 0);
    }

    __m128 _mm_cmpge_ps (__m128 a, __m128 b) pure @safe
    {
        return __builtin_ia32_cmpps(b, a, 2); // CMPLEPS reversed
    }

    __m128 _mm_cmpge_ss (__m128 a, __m128 b) pure @safe
    {
        return __builtin_ia32_cmpss(b, a, 2); // CMPLESS reversed
    }

    __m128 _mm_cmpgt_ps (__m128 a, __m128 b) pure @safe
    {
        return __builtin_ia32_cmpps(b, a, 1); // CMPLTPS reversed
    }

    __m128 _mm_cmpgt_ss (__m128 a, __m128 b) pure @safe
    {
        return __builtin_ia32_cmpss(b, a, 1); // CMPLTSS reversed
    }

    __m128 _mm_cmple_ps (__m128 a, __m128 b) pure @safe
    {
        return __builtin_ia32_cmpps(a, b, 2); // CMPLEPS
    }

    __m128 _mm_cmple_ss (__m128 a, __m128 b) pure @safe
    {
        return __builtin_ia32_cmpss(a, b, 2); // CMPLESS
    }

    __m128 _mm_cmplt_ps (__m128 a, __m128 b) pure @safe
    {
        return __builtin_ia32_cmpps(a, b, 1); // CMPLTPS
    }

    __m128 _mm_cmplt_ss (__m128 a, __m128 b) pure @safe
    {
        return __builtin_ia32_cmpss(a, b, 1); // CMPLTSS
    }

    __m128 _mm_cmpneq_ps (__m128 a, __m128 b) pure @safe
    {
        return __builtin_ia32_cmpps(a, b, 4); // CMPNEQPS
    }

    __m128 _mm_cmpneq_ss (__m128 a, __m128 b) pure @safe
    {
        return __builtin_ia32_cmpss(a, b, 4); // CMPNEQSS
    }

    __m128 _mm_cmpnge_ps (__m128 a, __m128 b) pure @safe
    {
        return __builtin_ia32_cmpps(b, a, 6); // CMPNLEPS reversed
    }

    __m128 _mm_cmpnge_ss (__m128 a, __m128 b) pure @safe
    {
        return __builtin_ia32_cmpss(b, a, 6); // CMPNLESS reversed
    }

    __m128 _mm_cmpngt_ps (__m128 a, __m128 b) pure @safe
    {
        return __builtin_ia32_cmpps(b, a, 5); // CMPNLTPS reversed
    }
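    // Usage example (illustrative, not part of the original test suite): the
    // _mm_cmp*_ps intrinsics return a per-lane mask, all-ones bits where the
    // predicate holds and zero bits elsewhere, so results are best inspected
    // as integers.
    unittest
    {
        __m128 a = [1.0f, 2.0f, 3.0f, 4.0f];
        __m128 b = [4.0f, 3.0f, 2.0f, 1.0f];
        __m128i mask = cast(__m128i) _mm_cmplt_ps(a, b);
        assert(mask.array == [-1, -1, 0, 0]);
    }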
    __m128 _mm_cmpngt_ss (__m128 a, __m128 b) pure @safe
    {
        return __builtin_ia32_cmpss(b, a, 5); // CMPNLTSS reversed
    }

    __m128 _mm_cmpnle_ps (__m128 a, __m128 b) pure @safe
    {
        return __builtin_ia32_cmpps(a, b, 6); // CMPNLEPS
    }

    __m128 _mm_cmpnle_ss (__m128 a, __m128 b) pure @safe
    {
        return __builtin_ia32_cmpss(a, b, 6); // CMPNLESS
    }

    __m128 _mm_cmpnlt_ps (__m128 a, __m128 b) pure @safe
    {
        return __builtin_ia32_cmpps(a, b, 5); // CMPNLTPS
    }

    __m128 _mm_cmpnlt_ss (__m128 a, __m128 b) pure @safe
    {
        return __builtin_ia32_cmpss(a, b, 5); // CMPNLTSS
    }

    __m128 _mm_cmpord_ps (__m128 a, __m128 b) pure @safe
    {
        return __builtin_ia32_cmpps(a, b, 7); // CMPORDPS
    }

    __m128 _mm_cmpord_ss (__m128 a, __m128 b) pure @safe
    {
        return __builtin_ia32_cmpss(a, b, 7); // CMPORDSS
    }

    __m128 _mm_cmpunord_ps (__m128 a, __m128 b) pure @safe
    {
        return __builtin_ia32_cmpps(a, b, 3); // CMPUNORDPS
    }

    __m128 _mm_cmpunord_ss (__m128 a, __m128 b) pure @safe
    {
        return __builtin_ia32_cmpss(a, b, 3); // CMPUNORDSS
    }
}
else
{
    // TODO
}

version(LDC)
{
    alias _mm_comieq_ss = __builtin_ia32_comieq;
}
else
{
    // TODO
    /*__m128i _mm_comieq_ss(__m128, __m128) pure @safe
    {
        assert(false, "unimplemented");
    }
    */
}


version(LDC)
{
    alias _mm_comige_ss = __builtin_ia32_comige;
}
else
{
    // TODO
    /*
    __m128i _mm_comige_ss(__m128, __m128) pure @safe
    {
        assert(false, "unimplemented");
    }
    */
}


version(LDC)
{
    alias _mm_comigt_ss = __builtin_ia32_comigt;
}
else
{
    // TODO
    /*
    __m128i _mm_comigt_ss(__m128, __m128) pure @safe
    {
        assert(false, "unimplemented");
    }
    */
}


version(LDC)
{
    alias _mm_comile_ss = __builtin_ia32_comile;
}
else
{
    // TODO
    /*
    __m128i _mm_comile_ss(__m128, __m128) pure @safe
    {
        assert(false, "unimplemented");
    }
    */
}


version(LDC)
{
    alias _mm_comilt_ss = __builtin_ia32_comilt;
}
else
{
    // TODO
    /*
    __m128i _mm_comilt_ss(__m128, __m128) pure @safe
    {
        assert(false, "unimplemented");
    }
    */
}

version(LDC)
{
    alias _mm_comineq_ss = __builtin_ia32_comineq;
}
else
{
    // TODO
    /*
    __m128i _mm_comineq_ss(__m128, __m128) pure @safe
    {
        assert(false, "unimplemented");
    }
    */
}

// MMXREG: __m128 _mm_cvt_pi2ps (__m128 a, __m64 b)
// MMXREG: __m64 _mm_cvt_ps2pi (__m128 a)


__m128 _mm_cvt_si2ss(__m128 v, int x) pure @safe
{
    v[0] = cast(float)x;
    return v;
}
unittest
{
    __m128 a = _mm_cvt_si2ss(_mm_set1_ps(0.0f), 42);
    assert(a.array == [42f, 0, 0, 0]);
}

version(LDC)
{
    alias _mm_cvt_ss2si = __builtin_ia32_cvtss2si;
}
else
{
    // TODO
    /*
    int _mm_cvt_ss2si(__m128 v) pure @safe
    {
        assert(false, "unimplemented");
    }
    */
}
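// Usage example (illustrative, not part of the original test suite):
// _mm_cvt_ss2si converts the lowest lane to int using the current rounding
// mode (round-to-nearest by default).
version(LDC) unittest
{
    assert(_mm_cvt_ss2si(_mm_set_ss(42.0f)) == 42);
}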
// MMXREG: __m128 _mm_cvtpi16_ps (__m64 a)
// MMXREG: __m128 _mm_cvtpi32_ps (__m128 a, __m64 b)
// MMXREG: __m128 _mm_cvtpi32x2_ps (__m64 a, __m64 b)
// MMXREG: __m128 _mm_cvtpi8_ps (__m64 a)
// MMXREG: __m64 _mm_cvtps_pi16 (__m128 a)
// MMXREG: __m64 _mm_cvtps_pi32 (__m128 a)
// MMXREG: __m64 _mm_cvtps_pi8 (__m128 a)
// MMXREG: __m128 _mm_cvtpu16_ps (__m64 a)
// MMXREG: __m128 _mm_cvtpu8_ps (__m64 a)

__m128 _mm_cvtsi32_ss(__m128 v, int x) pure @safe
{
    v[0] = cast(float)x;
    return v;
}
unittest
{
    __m128 a = _mm_cvtsi32_ss(_mm_set1_ps(0.0f), 42);
    assert(a.array == [42.0f, 0, 0, 0]);
}

// Note: on macOS, using "llvm.x86.sse.cvtsi642ss" was buggy
__m128 _mm_cvtsi64_ss(__m128 v, long x) pure @safe
{
    v[0] = cast(float)x;
    return v;
}
unittest
{
    __m128 a = _mm_cvtsi64_ss(_mm_set1_ps(0.0f), 42);
    assert(a.array == [42.0f, 0, 0, 0]);
}

float _mm_cvtss_f32(__m128 a) pure @safe
{
    return a[0];
}

version(LDC)
{
    alias _mm_cvtss_si32 = __builtin_ia32_cvtss2si;
}
else
{
    // TODO
}

version(LDC)
{
    alias _mm_cvtss_si64 = __builtin_ia32_cvtss2si64;
}
else
{
    // TODO
}

// MMXREG: __m64 _mm_cvtt_ps2pi (__m128 a)

version(LDC)
{
    alias _mm_cvtt_ss2si = __builtin_ia32_cvttss2si;
    alias _mm_cvttss_si32 = _mm_cvtt_ss2si; // it's actually the same op
}
else
{
    // TODO
}

// MMXREG: _mm_cvttps_pi32

version(LDC)
{
    alias _mm_cvttss_si64 = __builtin_ia32_cvttss2si64;
}
else
{
    // TODO
}

__m128 _mm_div_ps(__m128 a, __m128 b) pure @safe
{
    return a / b;
}
unittest
{
    __m128 a = [1.5f, -2.0f, 3.0f, 1.0f];
    a = _mm_div_ps(a, a);
    float[4] correct = [1.0f, 1.0f, 1.0f, 1.0f];
    assert(a.array == correct);
}

__m128 _mm_div_ss(__m128 a, __m128 b) pure @safe
{
    a[0] /= b[0];
    return a;
}
unittest
{
    __m128 a = [1.5f, -2.0f, 3.0f, 1.0f];
    a = _mm_div_ss(a, a);
    float[4] correct = [1.0f, -2.0, 3.0f, 1.0f];
    assert(a.array == correct);
}

// MMXREG: int _mm_extract_pi16 (__m64 a, int imm8)

// TODO: unsigned int _MM_GET_EXCEPTION_MASK ()
// TODO: unsigned int _MM_GET_EXCEPTION_STATE ()
// TODO: unsigned int _MM_GET_FLUSH_ZERO_MODE ()
// TODO: unsigned int _MM_GET_ROUNDING_MODE ()
// TODO: stmxcsr
// TODO: unsigned int _mm_getcsr (void)

// MMXREG: __m64 _mm_insert_pi16 (__m64 a, int i, int imm8)

__m128 _mm_load_ps(const(float)* p) pure @trusted
{
    return *cast(__m128*)p;
}

__m128 _mm_load_ps1(const(float)* p) pure @trusted
{
    float[4] f = [ *p, *p, *p, *p ];
    return loadUnaligned!(float4)(f.ptr);
}

__m128 _mm_load_ss (const(float)* mem_addr) pure @trusted
{
    float[4] f = [ *mem_addr, 0.0f, 0.0f, 0.0f ];
    return loadUnaligned!(float4)(f.ptr);
}

alias _mm_load1_ps = _mm_load_ps1;

__m128 _mm_loadh_pi (__m128 a, const(__m64)* mem_addr) pure @safe
{
    long2 la = cast(long2)a;
    la[1] = *mem_addr;
    return cast(__m128)la;
}

__m128 _mm_loadl_pi (__m128 a, const(__m64)* mem_addr) pure @safe
{
    long2 la = cast(long2)a;
    la[0] = *mem_addr;
    return cast(__m128)la;
}

__m128 _mm_loadr_ps (const(float)* mem_addr) pure @trusted
{
    __m128* aligned = cast(__m128*)mem_addr;
    __m128 a = *aligned;
    return shufflevector!(__m128, 3, 2, 1, 0)(a, a);
}

__m128 _mm_loadu_ps(float* p) pure @safe
{
    return loadUnaligned!(__m128)(p);
}

// MMXREG: _mm_maskmove_si64
// MMXREG: _m_maskmovq

// MMXREG: _mm_max_pi16
version(LDC)
{
    alias _mm_max_ps = __builtin_ia32_maxps;
}
else
{
    // TODO
}
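// Usage example (illustrative, not part of the original test suite):
// per-lane maximum.
version(LDC) unittest
{
    __m128 a = [1.0f, 8.0f, 3.0f, 4.0f];
    __m128 b = [4.0f, 2.0f, 3.0f, 1.0f];
    __m128 m = _mm_max_ps(a, b);
    assert(m.array == [4.0f, 8.0f, 3.0f, 4.0f]);
}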
// MMXREG: _mm_max_pu8
version(LDC)
{
    alias _mm_max_ss = __builtin_ia32_maxss;
}
else
{
    // TODO
}

// MMXREG: _mm_min_pi16
version(LDC)
{
    alias _mm_min_ps = __builtin_ia32_minps;
}
else
{
    // TODO
}

// MMXREG: _mm_min_pi8

version(LDC)
{
    alias _mm_min_ss = __builtin_ia32_minss;
}

__m128 _mm_move_ss (__m128 a, __m128 b) pure @safe
{
    return shufflevector!(__m128, 4, 1, 2, 3)(a, b);
}

__m128 _mm_movehl_ps (__m128 a, __m128 b) pure @safe
{
    // dst = [b[2], b[3], a[2], a[3]] per the MOVHLPS semantics
    return shufflevector!(float4, 6, 7, 2, 3)(a, b);
}

__m128 _mm_movelh_ps (__m128 a, __m128 b) pure @safe
{
    return shufflevector!(float4, 0, 1, 4, 5)(a, b);
}

// TODO: int _mm_movemask_pi8
version(LDC)
{
    alias _mm_movemask_ps = __builtin_ia32_movmskps;
}

__m128 _mm_mul_ps(__m128 a, __m128 b) pure @safe
{
    return a * b;
}
unittest
{
    __m128 a = [1.5f, -2.0f, 3.0f, 1.0f];
    a = _mm_mul_ps(a, a);
    float[4] correct = [2.25f, 4.0f, 9.0f, 1.0f];
    assert(a.array == correct);
}

__m128 _mm_mul_ss(__m128 a, __m128 b) pure @safe
{
    a[0] *= b[0];
    return a;
}
unittest
{
    __m128 a = [1.5f, -2.0f, 3.0f, 1.0f];
    a = _mm_mul_ss(a, a);
    float[4] correct = [2.25f, -2.0f, 3.0f, 1.0f];
    assert(a.array == correct);
}

// MMXREG: _mm_mulhi_pu16

__m128 _mm_or_ps (__m128 a, __m128 b) pure @safe
{
    return cast(__m128)(cast(__m128i)a | cast(__m128i)b);
}

// MMXREG: __m64 _m_pavgb (__m64 a, __m64 b)
// MMXREG: __m64 _m_pavgw (__m64 a, __m64 b)
// MMXREG: int _m_pextrw (__m64 a, int imm8)
// MMXREG: __m64 _m_pinsrw (__m64 a, int i, int imm8)
// MMXREG: __m64 _m_pmaxsw (__m64 a, __m64 b)
// MMXREG: __m64 _m_pmaxub (__m64 a, __m64 b)
// MMXREG: __m64 _m_pminsw (__m64 a, __m64 b)
// MMXREG: __m64 _m_pminub (__m64 a, __m64 b)
// MMXREG: int _m_pmovmskb (__m64 a)

// MMXREG: __m64 _m_pmulhuw (__m64 a, __m64 b)

enum _MM_HINT_NTA = 0;
enum _MM_HINT_T0 = 1;
enum _MM_HINT_T1 = 2;
enum _MM_HINT_T2 = 3;

// Note: the locality hint must be a compile-time constant.
void _mm_prefetch(int locality)(void* p) pure @safe
{
    llvm_prefetch(p, 0, locality, 1);
}

// MMXREG: __m64 _m_psadbw (__m64 a, __m64 b)
// MMXREG: __m64 _m_pshufw (__m64 a, int imm8)

version(LDC)
{
    alias _mm_rcp_ps = __builtin_ia32_rcpps;
}
// TODO

version(LDC)
{
    alias _mm_rcp_ss = __builtin_ia32_rcpss;
}
// TODO

version(LDC)
{
    alias _mm_rsqrt_ps = __builtin_ia32_rsqrtps;
}
// TODO

version(LDC)
{
    alias _mm_rsqrt_ss = __builtin_ia32_rsqrtss;
}
// TODO

// TODO: _mm_sad_pu8
// TODO: void _MM_SET_EXCEPTION_MASK (unsigned int a)
// TODO: void _MM_SET_EXCEPTION_STATE (unsigned int a)
// TODO: void _MM_SET_FLUSH_ZERO_MODE (unsigned int a)

__m128 _mm_set_ps (float e3, float e2, float e1, float e0) pure @trusted
{
    float[4] result = [e0, e1, e2, e3];
    return loadUnaligned!(float4)(result.ptr);
}

alias _mm_set_ps1 = _mm_set1_ps;

// TODO: _MM_SET_ROUNDING_MODE

__m128 _mm_set_ss (float a) pure @trusted
{
    float[4] result = [a, 0.0f, 0.0f, 0.0f];
    return loadUnaligned!(float4)(result.ptr);
}

__m128 _mm_set1_ps (float a) pure @trusted
{
    float[4] result = [a, a, a, a];
    return loadUnaligned!(float4)(result.ptr);
}
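// Usage example (illustrative, not part of the original test suite):
// _mm_set_ps takes its arguments from the highest lane down to the lowest,
// so e0 lands in element 0 of the result.
unittest
{
    __m128 v = _mm_set_ps(4.0f, 3.0f, 2.0f, 1.0f);
    assert(v.array == [1.0f, 2.0f, 3.0f, 4.0f]);
}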
// TODO: _mm_setcsr

__m128 _mm_setr_ps (float e3, float e2, float e1, float e0) pure @trusted
{
    float[4] result = [e3, e2, e1, e0];
    return loadUnaligned!(float4)(result.ptr);
}

__m128 _mm_setzero_ps() pure @trusted
{
    float[4] result = [0.0f, 0.0f, 0.0f, 0.0f];
    return loadUnaligned!(float4)(result.ptr);
}

version(LDC)
{
    alias _mm_sfence = __builtin_ia32_sfence;
}
// TODO

// MMXREG: _mm_shuffle_pi16

// Note: the immediate shuffle value is given at compile-time instead of at runtime.
__m128 _mm_shuffle_ps(ubyte imm)(__m128 a, __m128 b) pure @safe
{
    return shufflevector!(__m128, imm & 3, (imm>>2) & 3, 4 + ((imm>>4) & 3), 4 + ((imm>>6) & 3) )(a, b);
}

version(LDC)
{
    // Disappeared with LDC 1.11
    static if (__VERSION__ < 2081)
        alias _mm_sqrt_ps = __builtin_ia32_sqrtps;
    else
    {
        __m128 _mm_sqrt_ps(__m128 vec) pure @safe
        {
            vec.array[0] = llvm_sqrt(vec.array[0]);
            vec.array[1] = llvm_sqrt(vec.array[1]);
            vec.array[2] = llvm_sqrt(vec.array[2]);
            vec.array[3] = llvm_sqrt(vec.array[3]);
            return vec;
        }
    }
}
else
{
    __m128 _mm_sqrt_ps(__m128 vec) pure @safe
    {
        import std.math: sqrt;
        vec.array[0] = sqrt(vec.array[0]);
        vec.array[1] = sqrt(vec.array[1]);
        vec.array[2] = sqrt(vec.array[2]);
        vec.array[3] = sqrt(vec.array[3]);
        return vec;
    }
}
unittest
{
    __m128 A = _mm_sqrt_ps(_mm_set1_ps(4.0f));
    assert(A.array[0] == 2.0f);
    assert(A.array[1] == 2.0f);
    assert(A.array[2] == 2.0f);
    assert(A.array[3] == 2.0f);
}

version(LDC)
{
    // Disappeared with LDC 1.11
    static if (__VERSION__ < 2081)
        alias _mm_sqrt_ss = __builtin_ia32_sqrtss;
    else
    {
        __m128 _mm_sqrt_ss(__m128 vec) pure @safe
        {
            vec.array[0] = llvm_sqrt(vec.array[0]);
            vec.array[1] = vec.array[1];
            vec.array[2] = vec.array[2];
            vec.array[3] = vec.array[3];
            return vec;
        }
    }
}
else
{
    __m128 _mm_sqrt_ss(__m128 vec) pure @safe
    {
        import std.math: sqrt;
        vec.array[0] = sqrt(vec.array[0]);
        return vec;
    }
}
unittest
{
    __m128 A = _mm_sqrt_ss(_mm_set1_ps(4.0f));
    assert(A.array[0] == 2.0f);
    assert(A.array[1] == 4.0f);
    assert(A.array[2] == 4.0f);
    assert(A.array[3] == 4.0f);
}

void _mm_store_ps (float* mem_addr, __m128 a) pure // not safe since nothing guarantees alignment
{
    __m128* aligned = cast(__m128*)mem_addr;
    *aligned = a;
}

alias _mm_store_ps1 = _mm_store1_ps;

void _mm_store_ss (float* mem_addr, __m128 a) pure @safe
{
    *mem_addr = a[0];
}

void _mm_store1_ps (float* mem_addr, __m128 a) pure // not safe since nothing guarantees alignment
{
    __m128* aligned = cast(__m128*)mem_addr;
    *aligned = shufflevector!(__m128, 0, 0, 0, 0)(a, a);
}

void _mm_storeh_pi(__m64* p, __m128 a) pure @safe
{
    *p = extractelement!(long2, 1)(a);
}

void _mm_storel_pi(__m64* p, __m128 a) pure @safe
{
    *p = extractelement!(long2, 0)(a);
}

void _mm_storer_ps(float* mem_addr, __m128 a) pure // not safe since nothing guarantees alignment
{
    __m128* aligned = cast(__m128*)mem_addr;
    *aligned = shufflevector!(__m128, 3, 2, 1, 0)(a, a);
}

void _mm_storeu_ps(float* mem_addr, __m128 a) pure @safe
{
    storeUnaligned!(float4)(a, mem_addr);
}
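// Usage example (illustrative, not part of the original test suite):
// an unaligned store followed by an unaligned load round-trips the vector
// through ordinary memory.
unittest
{
    float[4] buf;
    _mm_storeu_ps(buf.ptr, _mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f));
    __m128 v = _mm_loadu_ps(buf.ptr);
    assert(v.array == [1.0f, 2.0f, 3.0f, 4.0f]);
}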
// TODO: _mm_stream_pi, does not seem possible
// TODO: _mm_stream_ps, does not seem possible


__m128 _mm_sub_ps(__m128 a, __m128 b) pure @safe
{
    return a - b;
}
unittest
{
    __m128 a = [1.5f, -2.0f, 3.0f, 1.0f];
    a = _mm_sub_ps(a, a);
    float[4] correct = [0.0f, 0.0f, 0.0f, 0.0f];
    assert(a.array == correct);
}

__m128 _mm_sub_ss(__m128 a, __m128 b) pure @safe
{
    a[0] -= b[0];
    return a;
}
unittest
{
    __m128 a = [1.5f, -2.0f, 3.0f, 1.0f];
    a = _mm_sub_ss(a, a);
    float[4] correct = [0.0f, -2.0, 3.0f, 1.0f];
    assert(a.array == correct);
}


void _MM_TRANSPOSE4_PS (ref __m128 row0, ref __m128 row1, ref __m128 row2, ref __m128 row3) pure @safe
{
    __m128 tmp3, tmp2, tmp1, tmp0;
    tmp0 = _mm_unpacklo_ps(row0, row1);
    tmp2 = _mm_unpacklo_ps(row2, row3);
    tmp1 = _mm_unpackhi_ps(row0, row1);
    tmp3 = _mm_unpackhi_ps(row2, row3);
    row0 = _mm_movelh_ps(tmp0, tmp2);
    row1 = _mm_movehl_ps(tmp2, tmp0);
    row2 = _mm_movelh_ps(tmp1, tmp3);
    row3 = _mm_movehl_ps(tmp3, tmp1);
}

version(LDC)
{
    alias _mm_ucomieq_ss = __builtin_ia32_ucomieq;
}
// TODO

version(LDC)
{
    alias _mm_ucomige_ss = __builtin_ia32_ucomige;
}
// TODO

version(LDC)
{
    alias _mm_ucomigt_ss = __builtin_ia32_ucomigt;
}
// TODO

version(LDC)
{
    alias _mm_ucomile_ss = __builtin_ia32_ucomile;
}
// TODO

version(LDC)
{
    alias _mm_ucomilt_ss = __builtin_ia32_ucomilt;
}
// TODO

version(LDC)
{
    alias _mm_ucomineq_ss = __builtin_ia32_ucomineq;
}
// TODO


__m128 _mm_undefined_ps() pure @safe
{
    __m128 undef = void;
    return undef;
}

__m128 _mm_unpackhi_ps (__m128 a, __m128 b) pure @safe
{
    return shufflevector!(float4, 2, 6, 3, 7)(a, b);
}

__m128 _mm_unpacklo_ps (__m128 a, __m128 b) pure @safe
{
    return shufflevector!(float4, 0, 4, 1, 5)(a, b);
}

__m128 _mm_xor_ps (__m128 a, __m128 b) pure @safe
{
    return cast(__m128)(cast(__m128i)a ^ cast(__m128i)b);
}
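// Usage example (illustrative, not part of the original test suite):
// _MM_TRANSPOSE4_PS transposes a 4x4 matrix held in four rows, built from the
// unpack and movelh/movehl primitives above.
unittest
{
    __m128 r0 = _mm_setr_ps( 1.0f,  2.0f,  3.0f,  4.0f);
    __m128 r1 = _mm_setr_ps( 5.0f,  6.0f,  7.0f,  8.0f);
    __m128 r2 = _mm_setr_ps( 9.0f, 10.0f, 11.0f, 12.0f);
    __m128 r3 = _mm_setr_ps(13.0f, 14.0f, 15.0f, 16.0f);
    _MM_TRANSPOSE4_PS(r0, r1, r2, r3);
    assert(r0.array == [1.0f, 5.0f,  9.0f, 13.0f]);
    assert(r1.array == [2.0f, 6.0f, 10.0f, 14.0f]);
    assert(r2.array == [3.0f, 7.0f, 11.0f, 15.0f]);
    assert(r3.array == [4.0f, 8.0f, 12.0f, 16.0f]);
}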