/**
* SSE4.1 intrinsics.
*
* Copyright: Guillaume Piolat 2021.
*            Johan Engelen 2021.
* License:   $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
*/
module inteli.smmintrin;

// SSE4.1 instructions
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#techs=SSE4_1
// Note: this header will work whether you have SSE4.1 enabled or not.
// With LDC, use "dflags-ldc": ["-mattr=+sse4.1"] or equivalent to actively
// generate SSE4.1 instructions.

public import inteli.types;
import inteli.internals;

// smmintrin pulls in all previous instruction set intrinsics.
public import inteli.tmmintrin;

nothrow @nogc:

enum int _MM_FROUND_TO_NEAREST_INT = 0x00; /// SSE4.1 rounding modes
enum int _MM_FROUND_TO_NEG_INF     = 0x01; /// ditto
enum int _MM_FROUND_TO_POS_INF     = 0x02; /// ditto
enum int _MM_FROUND_TO_ZERO        = 0x03; /// ditto
enum int _MM_FROUND_CUR_DIRECTION  = 0x04; /// ditto
enum int _MM_FROUND_RAISE_EXC      = 0x00; /// ditto
enum int _MM_FROUND_NO_EXC         = 0x08; /// ditto

enum int _MM_FROUND_NINT      = (_MM_FROUND_RAISE_EXC | _MM_FROUND_TO_NEAREST_INT);
enum int _MM_FROUND_FLOOR     = (_MM_FROUND_RAISE_EXC | _MM_FROUND_TO_NEG_INF);
enum int _MM_FROUND_CEIL      = (_MM_FROUND_RAISE_EXC | _MM_FROUND_TO_POS_INF);
enum int _MM_FROUND_TRUNC     = (_MM_FROUND_RAISE_EXC | _MM_FROUND_TO_ZERO);
enum int _MM_FROUND_RINT      = (_MM_FROUND_RAISE_EXC | _MM_FROUND_CUR_DIRECTION);
enum int _MM_FROUND_NEARBYINT = (_MM_FROUND_NO_EXC    | _MM_FROUND_CUR_DIRECTION);

/// Blend packed 16-bit integers from `a` and `b` using control mask `imm8`, and store the results.
/// Bit `n` of `imm8` selects lane `n`: when set the lane comes from `b`, otherwise from `a`.
// Note: changed signature, GDC needs a compile-time value for imm8.
__m128i _mm_blend_epi16(int imm8)(__m128i a, __m128i b) @trusted
{
    // PERF DMD
    // PERF LDC: clang has access to __builtin_ia32_pblendw128 but we do not, for some reason.
    // Not sure how to get vblendw
    static if (GDC_with_SSE41)
    {
        return cast(__m128i) __builtin_ia32_pblendw128(cast(short8)a, cast(short8)b, imm8);
    }
    else
    {
        // Portable fallback: pick each of the 8 lanes individually.
        short8 r;
        short8 sa = cast(short8)a;
        short8 sb = cast(short8)b;
        for (int n = 0; n < 8; ++n)
        {
            r.ptr[n] = (imm8 & (1 << n)) ? sb.array[n] : sa.array[n];
        }
        return cast(__m128i)r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi16(0, 1,  2,  3,  4,  5,  6,  7);
    __m128i B = _mm_setr_epi16(8, 9, 10, 11, 12, 13, 14, 15);
    short8 C = cast(short8) _mm_blend_epi16!147(A, B); // 10010011
    short[8] correct =        [8, 9,  2,  3, 12,  5,  6, 15];
    assert(C.array == correct);
}


/// Blend packed double-precision (64-bit) floating-point elements from `a` and `b` using control
/// mask `imm8`, and store the results.
/// Bit `n` of `imm8` selects lane `n`: when set the lane comes from `b`, otherwise from `a`.
/// Only the two low bits of `imm8` are looked at.
// Note: `imm8` is a runtime argument here. GDC needs a compile-time value for the builtin's
//       immediate, so the GDC path dispatches on the two significant bits and only ever
//       hands constant immediates to the builtin.
__m128d _mm_blend_pd (__m128d a, __m128d b, const int imm8) @trusted
{
    // PERF DMD
    static if (GDC_with_SSE41)
    {
        // The operands and the result are double2 vectors; the builtin works on that type.
        switch (imm8 & 3)
        {
            case 1:
                return cast(__m128d) __builtin_ia32_blendpd(cast(double2)a, cast(double2)b, 1);
            case 2:
                return cast(__m128d) __builtin_ia32_blendpd(cast(double2)a, cast(double2)b, 2);
            case 3:
                return b; // both lanes taken from b
            default:
                return a; // no lane taken from b
        }
    }
    else
    {
        // LDC x86: blendpd since LDC 1.1 -O2, uses blendps after LDC 1.12
        double2 r;
        for (int n = 0; n < 2; ++n)
        {
            r.ptr[n] = (imm8 & (1 << n)) ? b.array[n] : a.array[n];
        }
        return cast(__m128d)r;
    }
}
unittest
{
    __m128d A = _mm_setr_pd(0, 1);
    __m128d B = _mm_setr_pd(8, 9);
    double2 C = _mm_blend_pd(A, B, 2); // 10
    double[2] correct =    [0, 9];
    assert(C.array == correct);

    // Remaining mask values, so that every dispatch path is exercised.
    double2 C0 = _mm_blend_pd(A, B, 0); // 00
    double[2] correct0 =   [0, 1];
    assert(C0.array == correct0);

    double2 C1 = _mm_blend_pd(A, B, 1); // 01
    double[2] correct1 =   [8, 1];
    assert(C1.array == correct1);

    double2 C3 = _mm_blend_pd(A, B, 3); // 11
    double[2] correct3 =   [8, 9];
    assert(C3.array == correct3);
}


/// Blend packed single-precision (32-bit) floating-point elements from a and b using control mask imm8, and store the results in dst.
// Note: changed signature, GDC needs a compile-time value for imm8.
__m128 _mm_blend_ps(int imm8)(__m128 a, __m128 b) @trusted
{
    // PERF DMD
    // The mask must fit in four bits, one per 32-bit lane.
    static assert(imm8 >= 0 && imm8 < 16);
    static if (GDC_with_SSE41)
    {
        return __builtin_ia32_blendps(a, b, imm8);
    }
    else version(LDC)
    {
        // LDC x86: generates blendps since LDC 1.1 -O2
        // arm64: pretty good, two instructions worst case
        // Shuffle indices 0..3 pick from `a`, indices 4..7 pick from `b`.
        enum int lane0 = (imm8 & 1) ? 4 : 0;
        enum int lane1 = (imm8 & 2) ? 5 : 1;
        enum int lane2 = (imm8 & 4) ? 6 : 2;
        enum int lane3 = (imm8 & 8) ? 7 : 3;
        return shufflevector!(float4, lane0, lane1, lane2, lane3)(a, b);
    }
    else
    {
        // Portable fallback: choose every lane on its own mask bit.
        __m128 blended;
        foreach (lane; 0 .. 4)
        {
            if ((imm8 >> lane) & 1)
                blended.ptr[lane] = b.array[lane];
            else
                blended.ptr[lane] = a.array[lane];
        }
        return blended;
    }
}
unittest
{
    __m128 lhs = _mm_setr_ps(0, 1,  2,  3);
    __m128 rhs = _mm_setr_ps(8, 9, 10, 11);
    float4 mixed = cast(float4) _mm_blend_ps!13(lhs, rhs); // 1101
    float[4] expected =     [8, 1, 10, 11];
    assert(mixed.array == expected);
}


/*
/// Blend packed 8-bit integers from a and b using mask, and store the results in dst.
__m128i _mm_blendv_epi8 (__m128i a, __m128i b, __m128i mask) @trusted
{
}
unittest
{
}
*/

/*
/// Blend packed double-precision (64-bit) floating-point elements from a and b using mask, and store the results in dst.
__m128d _mm_blendv_pd (__m128d a, __m128d b, __m128d mask) @trusted
{
}
unittest
{
}
*/

/*
/// Blend packed single-precision (32-bit) floating-point elements from a and b using mask, and store the results in dst.
__m128 _mm_blendv_ps (__m128 a, __m128 b, __m128 mask) @trusted
{
}
unittest
{
}
*/

/*
/// Round the packed double-precision (64-bit) floating-point elements in a up to an integer value, and store the results as packed double-precision floating-point elements in dst.
173 __m128d _mm_ceil_pd (__m128d a) @trusted 174 { 175 } 176 unittest 177 { 178 } 179 */ 180 181 /* 182 /// Round the packed single-precision (32-bit) floating-point elements in a up to an integer value, and store the results as packed single-precision floating-point elements in dst. 183 __m128 _mm_ceil_ps (__m128 a) @trusted 184 { 185 } 186 unittest 187 { 188 } 189 */ 190 191 /* 192 /// Round the lower double-precision (64-bit) floating-point element in b up to an integer value, store the result as a double-precision floating-point element in the lower element of dst, and copy the upper element from a to the upper element of dst. 193 __m128d _mm_ceil_sd (__m128d a, __m128d b) @trusted 194 { 195 } 196 unittest 197 { 198 } 199 */ 200 201 /* 202 /// Round the lower single-precision (32-bit) floating-point element in b up to an integer value, store the result as a single-precision floating-point element in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst. 203 __m128 _mm_ceil_ss (__m128 a, __m128 b) @trusted 204 { 205 } 206 unittest 207 { 208 } 209 */ 210 211 /* 212 /// Compare packed 64-bit integers in a and b for equality, and store the results in dst. 213 __m128i _mm_cmpeq_epi64 (__m128i a, __m128i b) @trusted 214 { 215 } 216 unittest 217 { 218 } 219 */ 220 221 /* 222 /// Sign extend packed 16-bit integers in a to packed 32-bit integers, and store the results in dst. 223 __m128i _mm_cvtepi16_epi32 (__m128i a) @trusted 224 { 225 } 226 unittest 227 { 228 } 229 */ 230 231 /* 232 /// Sign extend packed 16-bit integers in a to packed 64-bit integers, and store the results in dst. 233 __m128i _mm_cvtepi16_epi64 (__m128i a) @trusted 234 { 235 } 236 unittest 237 { 238 } 239 */ 240 241 /* 242 /// Sign extend packed 32-bit integers in a to packed 64-bit integers, and store the results in dst. 
243 __m128i _mm_cvtepi32_epi64 (__m128i a) @trusted 244 { 245 } 246 unittest 247 { 248 } 249 */ 250 251 /* 252 /// Sign extend packed 8-bit integers in a to packed 16-bit integers, and store the results in dst. 253 __m128i _mm_cvtepi8_epi16 (__m128i a) @trusted 254 { 255 } 256 unittest 257 { 258 } 259 */ 260 261 /* 262 /// Sign extend packed 8-bit integers in a to packed 32-bit integers, and store the results in dst. 263 __m128i _mm_cvtepi8_epi32 (__m128i a) @trusted 264 { 265 } 266 unittest 267 { 268 } 269 */ 270 271 /* 272 /// Sign extend packed 8-bit integers in the low 8 bytes of a to packed 64-bit integers, and store the results in dst. 273 __m128i _mm_cvtepi8_epi64 (__m128i a) @trusted 274 { 275 } 276 unittest 277 { 278 } 279 */ 280 281 /* 282 /// Zero extend packed unsigned 16-bit integers in a to packed 32-bit integers, and store the results in dst. 283 __m128i _mm_cvtepu16_epi32 (__m128i a) @trusted 284 { 285 } 286 unittest 287 { 288 } 289 */ 290 291 /* 292 /// Zero extend packed unsigned 16-bit integers in a to packed 64-bit integers, and store the results in dst. 293 __m128i _mm_cvtepu16_epi64 (__m128i a) @trusted 294 { 295 } 296 unittest 297 { 298 } 299 */ 300 301 /* 302 /// Zero extend packed unsigned 32-bit integers in a to packed 64-bit integers, and store the results in dst. 303 __m128i _mm_cvtepu32_epi64 (__m128i a) @trusted 304 { 305 } 306 unittest 307 { 308 } 309 */ 310 311 /* 312 /// Zero extend packed unsigned 8-bit integers in a to packed 16-bit integers, and store the results in dst. 313 __m128i _mm_cvtepu8_epi16 (__m128i a) @trusted 314 { 315 } 316 unittest 317 { 318 } 319 */ 320 321 /* 322 /// Zero extend packed unsigned 8-bit integers in a to packed 32-bit integers, and store the results in dst. 323 __m128i _mm_cvtepu8_epi32 (__m128i a) @trusted 324 { 325 } 326 unittest 327 { 328 } 329 */ 330 331 /* 332 /// Zero extend packed unsigned 8-bit integers in the low 8 bytes of a to packed 64-bit integers, and store the results in dst. 
333 __m128i _mm_cvtepu8_epi64 (__m128i a) @trusted 334 { 335 } 336 unittest 337 { 338 } 339 */ 340 341 /* 342 /// Conditionally multiply the packed double-precision (64-bit) floating-point elements in a and b using the high 4 bits in imm8, sum the four products, and conditionally store the sum in dst using the low 4 bits of imm8. 343 __m128d _mm_dp_pd (__m128d a, __m128d b, const int imm8) @trusted 344 { 345 } 346 unittest 347 { 348 } 349 */ 350 351 /* 352 /// Conditionally multiply the packed single-precision (32-bit) floating-point elements in a and b using the high 4 bits in imm8, sum the four products, and conditionally store the sum in dst using the low 4 bits of imm8. 353 __m128 _mm_dp_ps (__m128 a, __m128 b, const int imm8) @trusted 354 { 355 } 356 unittest 357 { 358 } 359 */ 360 361 /* 362 /// Extract a 32-bit integer from a, selected with imm8, and store the result in dst. 363 int _mm_extract_epi32 (__m128i a, const int imm8) @trusted 364 { 365 } 366 unittest 367 { 368 } 369 */ 370 371 /* 372 /// Extract a 64-bit integer from a, selected with imm8, and store the result in dst. 373 __int64 _mm_extract_epi64 (__m128i a, const int imm8) @trusted 374 { 375 } 376 unittest 377 { 378 } 379 */ 380 381 /* 382 /// Extract an 8-bit integer from a, selected with imm8, and store the result in the lower element of dst. 383 int _mm_extract_epi8 (__m128i a, const int imm8) @trusted 384 { 385 } 386 unittest 387 { 388 } 389 */ 390 391 /* 392 /// Extract a single-precision (32-bit) floating-point element from a, selected with imm8, and store the result in dst. 393 int _mm_extract_ps (__m128 a, const int imm8) @trusted 394 { 395 } 396 unittest 397 { 398 } 399 */ 400 401 /* 402 /// Round the packed double-precision (64-bit) floating-point elements in a down to an integer value, and store the results as packed double-precision floating-point elements in dst. 
403 __m128d _mm_floor_pd (__m128d a) @trusted 404 { 405 } 406 unittest 407 { 408 } 409 */ 410 411 /* 412 /// Round the packed single-precision (32-bit) floating-point elements in a down to an integer value, and store the results as packed single-precision floating-point elements in dst. 413 __m128 _mm_floor_ps (__m128 a) @trusted 414 { 415 } 416 unittest 417 { 418 } 419 */ 420 421 /* 422 /// Round the lower double-precision (64-bit) floating-point element in b down to an integer value, store the result as a double-precision floating-point element in the lower element of dst, and copy the upper element from a to the upper element of dst. 423 __m128d _mm_floor_sd (__m128d a, __m128d b) @trusted 424 { 425 } 426 unittest 427 { 428 } 429 */ 430 431 /* 432 /// Round the lower single-precision (32-bit) floating-point element in b down to an integer value, store the result as a single-precision floating-point element in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst. 433 __m128 _mm_floor_ss (__m128 a, __m128 b) @trusted 434 { 435 } 436 unittest 437 { 438 } 439 */ 440 441 /* 442 /// Copy a to dst, and insert the 32-bit integer i into dst at the location specified by imm8. 443 __m128i _mm_insert_epi32 (__m128i a, int i, const int imm8) @trusted 444 { 445 } 446 unittest 447 { 448 } 449 */ 450 451 /* 452 /// Copy a to dst, and insert the 64-bit integer i into dst at the location specified by imm8. 453 __m128i _mm_insert_epi64 (__m128i a, __int64 i, const int imm8) @trusted 454 { 455 } 456 unittest 457 { 458 } 459 */ 460 461 /* 462 /// Copy a to dst, and insert the lower 8-bit integer from i into dst at the location specified by imm8. 463 __m128i _mm_insert_epi8 (__m128i a, int i, const int imm8) @trusted 464 { 465 } 466 unittest 467 { 468 } 469 */ 470 471 /* 472 /// Copy a to tmp, then insert a single-precision (32-bit) floating-point element from b into tmp using the control in imm8. 
Store tmp to dst using the mask in imm8 (elements are zeroed out when the corresponding bit is set). 473 __m128 _mm_insert_ps (__m128 a, __m128 b, const int imm8) @trusted 474 { 475 } 476 unittest 477 { 478 } 479 */ 480 481 /* 482 /// Compare packed signed 32-bit integers in a and b, and store packed maximum values in dst. 483 __m128i _mm_max_epi32 (__m128i a, __m128i b) @trusted 484 { 485 } 486 unittest 487 { 488 } 489 */ 490 491 /* 492 /// Compare packed signed 8-bit integers in a and b, and store packed maximum values in dst. 493 __m128i _mm_max_epi8 (__m128i a, __m128i b) @trusted 494 { 495 } 496 unittest 497 { 498 } 499 */ 500 501 /* 502 /// Compare packed unsigned 16-bit integers in a and b, and store packed maximum values in dst. 503 __m128i _mm_max_epu16 (__m128i a, __m128i b) @trusted 504 { 505 } 506 unittest 507 { 508 } 509 */ 510 511 /* 512 /// Compare packed unsigned 32-bit integers in a and b, and store packed maximum values in dst. 513 __m128i _mm_max_epu32 (__m128i a, __m128i b) @trusted 514 { 515 } 516 unittest 517 { 518 } 519 */ 520 521 /* 522 /// Compare packed signed 32-bit integers in a and b, and store packed minimum values in dst. 523 __m128i _mm_min_epi32 (__m128i a, __m128i b) @trusted 524 { 525 } 526 unittest 527 { 528 } 529 */ 530 531 /* 532 /// Compare packed signed 8-bit integers in a and b, and store packed minimum values in dst. 533 __m128i _mm_min_epi8 (__m128i a, __m128i b) @trusted 534 { 535 } 536 unittest 537 { 538 } 539 */ 540 541 /* 542 /// Compare packed unsigned 16-bit integers in a and b, and store packed minimum values in dst. 543 __m128i _mm_min_epu16 (__m128i a, __m128i b) @trusted 544 { 545 } 546 unittest 547 { 548 } 549 */ 550 551 /* 552 /// Compare packed unsigned 32-bit integers in a and b, and store packed minimum values in dst. 
553 __m128i _mm_min_epu32 (__m128i a, __m128i b) @trusted 554 { 555 } 556 unittest 557 { 558 } 559 */ 560 561 /* 562 /// Horizontally compute the minimum amongst the packed unsigned 16-bit integers in a, store the minimum and index in dst, and zero the remaining bits in dst. 563 __m128i _mm_minpos_epu16 (__m128i a) @trusted 564 { 565 } 566 unittest 567 { 568 } 569 */ 570 571 /* 572 /// Compute the sum of absolute differences (SADs) of quadruplets of unsigned 8-bit integers in a compared to those in b, and store the 16-bit results in dst. Eight SADs are performed using one quadruplet from b and eight quadruplets from a. One quadruplet is selected from b starting at the offset specified in imm8. Eight quadruplets are formed from sequential 8-bit integers selected from a starting at the offset specified in imm8. 573 __m128i _mm_mpsadbw_epu8 (__m128i a, __m128i b, const int imm8) @trusted 574 { 575 } 576 unittest 577 { 578 } 579 */ 580 581 /* 582 /// Multiply the low signed 32-bit integers from each packed 64-bit element in a and b, and store the signed 64-bit results in dst. 583 __m128i _mm_mul_epi32 (__m128i a, __m128i b) @trusted 584 { 585 } 586 unittest 587 { 588 } 589 */ 590 591 /* 592 /// Multiply the packed 32-bit integers in a and b, producing intermediate 64-bit integers, and store the low 32 bits of the intermediate integers in dst. 593 __m128i _mm_mullo_epi32 (__m128i a, __m128i b) @trusted 594 { 595 } 596 unittest 597 { 598 } 599 */ 600 601 /* 602 /// Convert packed signed 32-bit integers from a and b to packed 16-bit integers using unsigned saturation, and store the results in dst. 603 __m128i _mm_packus_epi32 (__m128i a, __m128i b) @trusted 604 { 605 } 606 unittest 607 { 608 } 609 */ 610 611 /// Round the packed double-precision (64-bit) floating-point elements in a using the rounding parameter, and store the results as packed double-precision floating-point elements in dst. 
612 /// Rounding is done according to the rounding[3:0] parameter, which can be one of: 613 /// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions 614 /// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions 615 /// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions 616 /// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions 617 /* 618 /// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE 619 __m128d _mm_round_pd (__m128d a, int rounding) @trusted 620 { 621 } 622 unittest 623 { 624 } 625 */ 626 627 /// Round the packed single-precision (32-bit) floating-point elements in a using the rounding parameter, and store the results as packed single-precision floating-point elements in dst. 628 /// Rounding is done according to the rounding[3:0] parameter, which can be one of: 629 /// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions 630 /// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions 631 /// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions 632 /// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions 633 /* 634 /// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE 635 __m128 _mm_round_ps (__m128 a, int rounding) @trusted 636 { 637 } 638 unittest 639 { 640 } 641 */ 642 643 /// Round the lower double-precision (64-bit) floating-point element in b using the rounding parameter, store the result as a double-precision floating-point element in the lower element of dst, and copy the upper element from a to the upper element of dst. 
644 /// Rounding is done according to the rounding[3:0] parameter, which can be one of: 645 /// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions 646 /// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions 647 /// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions 648 /// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions 649 /* 650 /// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE 651 __m128d _mm_round_sd (__m128d a, __m128d b, int rounding) @trusted 652 { 653 } 654 unittest 655 { 656 } 657 */ 658 659 /// Round the lower single-precision (32-bit) floating-point element in b using the rounding parameter, store the result as a single-precision floating-point element in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst. 660 /// Rounding is done according to the rounding[3:0] parameter, which can be one of: 661 /// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions 662 /// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions 663 /// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions 664 /// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions 665 /* 666 /// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE 667 __m128 _mm_round_ss (__m128 a, __m128 b, int rounding) @trusted 668 { 669 } 670 unittest 671 { 672 } 673 */ 674 675 /* 676 /// Load 128-bits of integer data from memory into dst using a non-temporal memory hint. mem_addr must be aligned on a 16-byte boundary or a general-protection exception may be generated. 
677 __m128i _mm_stream_load_si128 (__m128i * mem_addr) @trusted 678 { 679 } 680 unittest 681 { 682 } 683 */ 684 685 /* 686 /// Compute the bitwise NOT of a and then AND with a 128-bit vector containing all 1's, and return 1 if the result is zero, otherwise return 0. 687 int _mm_test_all_ones (__m128i a) @trusted 688 { 689 } 690 unittest 691 { 692 } 693 */ 694 695 /* 696 /// Compute the bitwise AND of 128 bits (representing integer data) in a and mask, and return 1 if the result is zero, otherwise return 0. 697 int _mm_test_all_zeros (__m128i a, __m128i mask) @trusted 698 { 699 } 700 unittest 701 { 702 } 703 */ 704 705 /* 706 /// Compute the bitwise AND of 128 bits (representing integer data) in a and mask, and set ZF to 1 if the result is zero, otherwise set ZF to 0. Compute the bitwise NOT of a and then AND with mask, and set CF to 1 if the result is zero, otherwise set CF to 0. Return 1 if both the ZF and CF values are zero, otherwise return 0. 707 int _mm_test_mix_ones_zeros (__m128i a, __m128i mask) @trusted 708 { 709 } 710 unittest 711 { 712 } 713 */ 714 715 /* 716 /// Compute the bitwise AND of 128 bits (representing integer data) in a and b, and set ZF to 1 if the result is zero, otherwise set ZF to 0. Compute the bitwise NOT of a and then AND with b, and set CF to 1 if the result is zero, otherwise set CF to 0. Return the CF value. 717 int _mm_testc_si128 (__m128i a, __m128i b) @trusted 718 { 719 } 720 unittest 721 { 722 } 723 */ 724 725 /* 726 /// Compute the bitwise AND of 128 bits (representing integer data) in a and b, and set ZF to 1 if the result is zero, otherwise set ZF to 0. Compute the bitwise NOT of a and then AND with b, and set CF to 1 if the result is zero, otherwise set CF to 0. Return 1 if both the ZF and CF values are zero, otherwise return 0. 
727 int _mm_testnzc_si128 (__m128i a, __m128i b) @trusted 728 { 729 } 730 unittest 731 { 732 } 733 */ 734 735 /* 736 /// Compute the bitwise AND of 128 bits (representing integer data) in a and b, and set ZF to 1 if the result is zero, otherwise set ZF to 0. Compute the bitwise NOT of a and then AND with b, and set CF to 1 if the result is zero, otherwise set CF to 0. Return the ZF value. 737 int _mm_testz_si128 (__m128i a, __m128i b) @trusted 738 { 739 } 740 unittest 741 { 742 } 743 */ 744 745 746 // LDC intrinsics present from 1.0.0 to 747 748 /* 749 750 pragma(LDC_intrinsic, "llvm.x86.sse41.blendvpd") 751 double2 __builtin_ia32_blendvpd(double2, double2, double2) pure @safe; 752 753 pragma(LDC_intrinsic, "llvm.x86.sse41.blendvps") 754 float4 __builtin_ia32_blendvps(float4, float4, float4) pure @safe; 755 756 pragma(LDC_intrinsic, "llvm.x86.sse41.dppd") 757 double2 __builtin_ia32_dppd(double2, double2, byte) pure @safe; 758 759 pragma(LDC_intrinsic, "llvm.x86.sse41.dpps") 760 float4 __builtin_ia32_dpps(float4, float4, byte) pure @safe; 761 762 pragma(LDC_intrinsic, "llvm.x86.sse41.insertps") 763 float4 __builtin_ia32_insertps128(float4, float4, byte) pure @safe; 764 765 pragma(LDC_intrinsic, "llvm.x86.sse41.mpsadbw") 766 short8 __builtin_ia32_mpsadbw128(byte16, byte16, byte) pure @safe; 767 768 pragma(LDC_intrinsic, "llvm.x86.sse41.packusdw") 769 short8 __builtin_ia32_packusdw128(int4, int4) pure @safe; 770 771 pragma(LDC_intrinsic, "llvm.x86.sse41.pblendvb") 772 byte16 __builtin_ia32_pblendvb128(byte16, byte16, byte16) pure @safe; 773 774 pragma(LDC_intrinsic, "llvm.x86.sse41.phminposuw") 775 short8 __builtin_ia32_phminposuw128(short8) pure @safe; 776 777 778 pragma(LDC_intrinsic, "llvm.x86.sse41.ptestc") 779 int __builtin_ia32_ptestc128(long2, long2) pure @safe; 780 781 pragma(LDC_intrinsic, "llvm.x86.sse41.ptestnzc") 782 int __builtin_ia32_ptestnzc128(long2, long2) pure @safe; 783 784 pragma(LDC_intrinsic, "llvm.x86.sse41.ptestz") 785 int 
__builtin_ia32_ptestz128(long2, long2) pure @safe; 786 787 pragma(LDC_intrinsic, "llvm.x86.sse41.round.pd") 788 double2 __builtin_ia32_roundpd(double2, int) pure @safe; 789 790 pragma(LDC_intrinsic, "llvm.x86.sse41.round.ps") 791 float4 __builtin_ia32_roundps(float4, int) pure @safe; 792 793 pragma(LDC_intrinsic, "llvm.x86.sse41.round.sd") 794 double2 __builtin_ia32_roundsd(double2, double2, int) pure @safe; 795 796 pragma(LDC_intrinsic, "llvm.x86.sse41.round.ss") 797 float4 __builtin_ia32_roundss(float4, float4, int) pure @safe; 798 799 */