/**
* SSE4.1 intrinsics.
*
* Copyright: Guillaume Piolat 2021.
*            Johan Engelen 2021.
* License:   $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
*/
module inteli.smmintrin;

// SSE4.1 instructions
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#techs=SSE4_1
// Note: this header will work whether you have SSE4.1 enabled or not.
// With LDC, use "dflags-ldc": ["-mattr=+sse4.1"] or equivalent to actively
// generate SSE4.1 instructions.

public import inteli.types;
import inteli.internals;

// smmintrin pulls in all previous instruction set intrinsics.
public import inteli.tmmintrin;

nothrow @nogc:

/// SSE4.1 rounding modes: elementary flags that are OR-ed together to build
/// the `rounding` argument documented on the `_mm_round_*` intrinsics.
enum : int
{
    _MM_FROUND_TO_NEAREST_INT = 0x00, /// Round to nearest.
    _MM_FROUND_TO_NEG_INF     = 0x01, /// Round down.
    _MM_FROUND_TO_POS_INF     = 0x02, /// Round up.
    _MM_FROUND_TO_ZERO        = 0x03, /// Truncate.
    _MM_FROUND_CUR_DIRECTION  = 0x04, /// Use MXCSR.RC; see _MM_SET_ROUNDING_MODE.
    _MM_FROUND_RAISE_EXC      = 0x00, /// Exceptions are not suppressed (zero, i.e. no bit set).
    _MM_FROUND_NO_EXC         = 0x08, /// Suppress exceptions.
}

/// SSE4.1 rounding modes: ready-made combinations of an exception flag and a
/// rounding direction.
enum : int
{
    _MM_FROUND_NINT      = _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_RAISE_EXC, /// Nearest, exceptions raised.
    _MM_FROUND_FLOOR     = _MM_FROUND_TO_NEG_INF     | _MM_FROUND_RAISE_EXC, /// Down, exceptions raised.
    _MM_FROUND_CEIL      = _MM_FROUND_TO_POS_INF     | _MM_FROUND_RAISE_EXC, /// Up, exceptions raised.
    _MM_FROUND_TRUNC     = _MM_FROUND_TO_ZERO        | _MM_FROUND_RAISE_EXC, /// Truncate, exceptions raised.
    _MM_FROUND_RINT      = _MM_FROUND_CUR_DIRECTION  | _MM_FROUND_RAISE_EXC, /// Current direction, exceptions raised.
    _MM_FROUND_NEARBYINT = _MM_FROUND_CUR_DIRECTION  | _MM_FROUND_NO_EXC,    /// Current direction, exceptions suppressed.
}

/*
/// Blend packed 16-bit integers from a and b using control mask imm8, and store the results in dst.
__m128i _mm_blend_epi16 (__m128i a, __m128i b, const int imm8) @trusted
{
}
unittest
{
}
*/

/*
/// Blend packed double-precision (64-bit) floating-point elements from a and b using control mask imm8, and store the results in dst.
51 __m128d _mm_blend_pd (__m128d a, __m128d b, const int imm8) @trusted 52 { 53 } 54 unittest 55 { 56 } 57 */ 58 59 /* 60 /// Blend packed single-precision (32-bit) floating-point elements from a and b using control mask imm8, and store the results in dst. 61 __m128 _mm_blend_ps (__m128 a, __m128 b, const int imm8) @trusted 62 { 63 } 64 unittest 65 { 66 } 67 */ 68 69 /* 70 /// Blend packed 8-bit integers from a and b using mask, and store the results in dst. 71 __m128i _mm_blendv_epi8 (__m128i a, __m128i b, __m128i mask) @trusted 72 { 73 } 74 unittest 75 { 76 } 77 */ 78 79 /* 80 /// Blend packed double-precision (64-bit) floating-point elements from a and b using mask, and store the results in dst. 81 __m128d _mm_blendv_pd (__m128d a, __m128d b, __m128d mask) @trusted 82 { 83 } 84 unittest 85 { 86 } 87 */ 88 89 /* 90 /// Blend packed single-precision (32-bit) floating-point elements from a and b using mask, and store the results in dst. 91 __m128 _mm_blendv_ps (__m128 a, __m128 b, __m128 mask) @trusted 92 { 93 } 94 unittest 95 { 96 } 97 */ 98 99 /* 100 /// Round the packed double-precision (64-bit) floating-point elements in a up to an integer value, and store the results as packed double-precision floating-point elements in dst. 101 __m128d _mm_ceil_pd (__m128d a) @trusted 102 { 103 } 104 unittest 105 { 106 } 107 */ 108 109 /* 110 /// Round the packed single-precision (32-bit) floating-point elements in a up to an integer value, and store the results as packed single-precision floating-point elements in dst. 111 __m128 _mm_ceil_ps (__m128 a) @trusted 112 { 113 } 114 unittest 115 { 116 } 117 */ 118 119 /* 120 /// Round the lower double-precision (64-bit) floating-point element in b up to an integer value, store the result as a double-precision floating-point element in the lower element of dst, and copy the upper element from a to the upper element of dst. 
121 __m128d _mm_ceil_sd (__m128d a, __m128d b) @trusted 122 { 123 } 124 unittest 125 { 126 } 127 */ 128 129 /* 130 /// Round the lower single-precision (32-bit) floating-point element in b up to an integer value, store the result as a single-precision floating-point element in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst. 131 __m128 _mm_ceil_ss (__m128 a, __m128 b) @trusted 132 { 133 } 134 unittest 135 { 136 } 137 */ 138 139 /* 140 /// Compare packed 64-bit integers in a and b for equality, and store the results in dst. 141 __m128i _mm_cmpeq_epi64 (__m128i a, __m128i b) @trusted 142 { 143 } 144 unittest 145 { 146 } 147 */ 148 149 /* 150 /// Sign extend packed 16-bit integers in a to packed 32-bit integers, and store the results in dst. 151 __m128i _mm_cvtepi16_epi32 (__m128i a) @trusted 152 { 153 } 154 unittest 155 { 156 } 157 */ 158 159 /* 160 /// Sign extend packed 16-bit integers in a to packed 64-bit integers, and store the results in dst. 161 __m128i _mm_cvtepi16_epi64 (__m128i a) @trusted 162 { 163 } 164 unittest 165 { 166 } 167 */ 168 169 /* 170 /// Sign extend packed 32-bit integers in a to packed 64-bit integers, and store the results in dst. 171 __m128i _mm_cvtepi32_epi64 (__m128i a) @trusted 172 { 173 } 174 unittest 175 { 176 } 177 */ 178 179 /* 180 /// Sign extend packed 8-bit integers in a to packed 16-bit integers, and store the results in dst. 181 __m128i _mm_cvtepi8_epi16 (__m128i a) @trusted 182 { 183 } 184 unittest 185 { 186 } 187 */ 188 189 /* 190 /// Sign extend packed 8-bit integers in a to packed 32-bit integers, and store the results in dst. 191 __m128i _mm_cvtepi8_epi32 (__m128i a) @trusted 192 { 193 } 194 unittest 195 { 196 } 197 */ 198 199 /* 200 /// Sign extend packed 8-bit integers in the low 8 bytes of a to packed 64-bit integers, and store the results in dst. 
201 __m128i _mm_cvtepi8_epi64 (__m128i a) @trusted 202 { 203 } 204 unittest 205 { 206 } 207 */ 208 209 /* 210 /// Zero extend packed unsigned 16-bit integers in a to packed 32-bit integers, and store the results in dst. 211 __m128i _mm_cvtepu16_epi32 (__m128i a) @trusted 212 { 213 } 214 unittest 215 { 216 } 217 */ 218 219 /* 220 /// Zero extend packed unsigned 16-bit integers in a to packed 64-bit integers, and store the results in dst. 221 __m128i _mm_cvtepu16_epi64 (__m128i a) @trusted 222 { 223 } 224 unittest 225 { 226 } 227 */ 228 229 /* 230 /// Zero extend packed unsigned 32-bit integers in a to packed 64-bit integers, and store the results in dst. 231 __m128i _mm_cvtepu32_epi64 (__m128i a) @trusted 232 { 233 } 234 unittest 235 { 236 } 237 */ 238 239 /* 240 /// Zero extend packed unsigned 8-bit integers in a to packed 16-bit integers, and store the results in dst. 241 __m128i _mm_cvtepu8_epi16 (__m128i a) @trusted 242 { 243 } 244 unittest 245 { 246 } 247 */ 248 249 /* 250 /// Zero extend packed unsigned 8-bit integers in a to packed 32-bit integers, and store the results in dst. 251 __m128i _mm_cvtepu8_epi32 (__m128i a) @trusted 252 { 253 } 254 unittest 255 { 256 } 257 */ 258 259 /* 260 /// Zero extend packed unsigned 8-bit integers in the low 8 bytes of a to packed 64-bit integers, and store the results in dst. 261 __m128i _mm_cvtepu8_epi64 (__m128i a) @trusted 262 { 263 } 264 unittest 265 { 266 } 267 */ 268 269 /* 270 /// Conditionally multiply the packed double-precision (64-bit) floating-point elements in a and b using the high 4 bits in imm8, sum the two products, and conditionally store the sum in dst using the low 4 bits of imm8. 
271 __m128d _mm_dp_pd (__m128d a, __m128d b, const int imm8) @trusted 272 { 273 } 274 unittest 275 { 276 } 277 */ 278 279 /* 280 /// Conditionally multiply the packed single-precision (32-bit) floating-point elements in a and b using the high 4 bits in imm8, sum the four products, and conditionally store the sum in dst using the low 4 bits of imm8. 281 __m128 _mm_dp_ps (__m128 a, __m128 b, const int imm8) @trusted 282 { 283 } 284 unittest 285 { 286 } 287 */ 288 289 /* 290 /// Extract a 32-bit integer from a, selected with imm8, and store the result in dst. 291 int _mm_extract_epi32 (__m128i a, const int imm8) @trusted 292 { 293 } 294 unittest 295 { 296 } 297 */ 298 299 /* 300 /// Extract a 64-bit integer from a, selected with imm8, and store the result in dst. 301 __int64 _mm_extract_epi64 (__m128i a, const int imm8) @trusted 302 { 303 } 304 unittest 305 { 306 } 307 */ 308 309 /* 310 /// Extract an 8-bit integer from a, selected with imm8, and store the result in the lower element of dst. 311 int _mm_extract_epi8 (__m128i a, const int imm8) @trusted 312 { 313 } 314 unittest 315 { 316 } 317 */ 318 319 /* 320 /// Extract a single-precision (32-bit) floating-point element from a, selected with imm8, and store the result in dst. 321 int _mm_extract_ps (__m128 a, const int imm8) @trusted 322 { 323 } 324 unittest 325 { 326 } 327 */ 328 329 /* 330 /// Round the packed double-precision (64-bit) floating-point elements in a down to an integer value, and store the results as packed double-precision floating-point elements in dst. 331 __m128d _mm_floor_pd (__m128d a) @trusted 332 { 333 } 334 unittest 335 { 336 } 337 */ 338 339 /* 340 /// Round the packed single-precision (32-bit) floating-point elements in a down to an integer value, and store the results as packed single-precision floating-point elements in dst. 
341 __m128 _mm_floor_ps (__m128 a) @trusted 342 { 343 } 344 unittest 345 { 346 } 347 */ 348 349 /* 350 /// Round the lower double-precision (64-bit) floating-point element in b down to an integer value, store the result as a double-precision floating-point element in the lower element of dst, and copy the upper element from a to the upper element of dst. 351 __m128d _mm_floor_sd (__m128d a, __m128d b) @trusted 352 { 353 } 354 unittest 355 { 356 } 357 */ 358 359 /* 360 /// Round the lower single-precision (32-bit) floating-point element in b down to an integer value, store the result as a single-precision floating-point element in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst. 361 __m128 _mm_floor_ss (__m128 a, __m128 b) @trusted 362 { 363 } 364 unittest 365 { 366 } 367 */ 368 369 /* 370 /// Copy a to dst, and insert the 32-bit integer i into dst at the location specified by imm8. 371 __m128i _mm_insert_epi32 (__m128i a, int i, const int imm8) @trusted 372 { 373 } 374 unittest 375 { 376 } 377 */ 378 379 /* 380 /// Copy a to dst, and insert the 64-bit integer i into dst at the location specified by imm8. 381 __m128i _mm_insert_epi64 (__m128i a, __int64 i, const int imm8) @trusted 382 { 383 } 384 unittest 385 { 386 } 387 */ 388 389 /* 390 /// Copy a to dst, and insert the lower 8-bit integer from i into dst at the location specified by imm8. 391 __m128i _mm_insert_epi8 (__m128i a, int i, const int imm8) @trusted 392 { 393 } 394 unittest 395 { 396 } 397 */ 398 399 /* 400 /// Copy a to tmp, then insert a single-precision (32-bit) floating-point element from b into tmp using the control in imm8. Store tmp to dst using the mask in imm8 (elements are zeroed out when the corresponding bit is set). 401 __m128 _mm_insert_ps (__m128 a, __m128 b, const int imm8) @trusted 402 { 403 } 404 unittest 405 { 406 } 407 */ 408 409 /* 410 /// Compare packed signed 32-bit integers in a and b, and store packed maximum values in dst. 
411 __m128i _mm_max_epi32 (__m128i a, __m128i b) @trusted 412 { 413 } 414 unittest 415 { 416 } 417 */ 418 419 /* 420 /// Compare packed signed 8-bit integers in a and b, and store packed maximum values in dst. 421 __m128i _mm_max_epi8 (__m128i a, __m128i b) @trusted 422 { 423 } 424 unittest 425 { 426 } 427 */ 428 429 /* 430 /// Compare packed unsigned 16-bit integers in a and b, and store packed maximum values in dst. 431 __m128i _mm_max_epu16 (__m128i a, __m128i b) @trusted 432 { 433 } 434 unittest 435 { 436 } 437 */ 438 439 /* 440 /// Compare packed unsigned 32-bit integers in a and b, and store packed maximum values in dst. 441 __m128i _mm_max_epu32 (__m128i a, __m128i b) @trusted 442 { 443 } 444 unittest 445 { 446 } 447 */ 448 449 /* 450 /// Compare packed signed 32-bit integers in a and b, and store packed minimum values in dst. 451 __m128i _mm_min_epi32 (__m128i a, __m128i b) @trusted 452 { 453 } 454 unittest 455 { 456 } 457 */ 458 459 /* 460 /// Compare packed signed 8-bit integers in a and b, and store packed minimum values in dst. 461 __m128i _mm_min_epi8 (__m128i a, __m128i b) @trusted 462 { 463 } 464 unittest 465 { 466 } 467 */ 468 469 /* 470 /// Compare packed unsigned 16-bit integers in a and b, and store packed minimum values in dst. 471 __m128i _mm_min_epu16 (__m128i a, __m128i b) @trusted 472 { 473 } 474 unittest 475 { 476 } 477 */ 478 479 /* 480 /// Compare packed unsigned 32-bit integers in a and b, and store packed minimum values in dst. 481 __m128i _mm_min_epu32 (__m128i a, __m128i b) @trusted 482 { 483 } 484 unittest 485 { 486 } 487 */ 488 489 /* 490 /// Horizontally compute the minimum amongst the packed unsigned 16-bit integers in a, store the minimum and index in dst, and zero the remaining bits in dst. 
491 __m128i _mm_minpos_epu16 (__m128i a) @trusted 492 { 493 } 494 unittest 495 { 496 } 497 */ 498 499 /* 500 /// Compute the sum of absolute differences (SADs) of quadruplets of unsigned 8-bit integers in a compared to those in b, and store the 16-bit results in dst. Eight SADs are performed using one quadruplet from b and eight quadruplets from a. One quadruplet is selected from b starting at the offset specified in imm8. Eight quadruplets are formed from sequential 8-bit integers selected from a starting at the offset specified in imm8. 501 __m128i _mm_mpsadbw_epu8 (__m128i a, __m128i b, const int imm8) @trusted 502 { 503 } 504 unittest 505 { 506 } 507 */ 508 509 /* 510 /// Multiply the low signed 32-bit integers from each packed 64-bit element in a and b, and store the signed 64-bit results in dst. 511 __m128i _mm_mul_epi32 (__m128i a, __m128i b) @trusted 512 { 513 } 514 unittest 515 { 516 } 517 */ 518 519 /* 520 /// Multiply the packed 32-bit integers in a and b, producing intermediate 64-bit integers, and store the low 32 bits of the intermediate integers in dst. 521 __m128i _mm_mullo_epi32 (__m128i a, __m128i b) @trusted 522 { 523 } 524 unittest 525 { 526 } 527 */ 528 529 /* 530 /// Convert packed signed 32-bit integers from a and b to packed 16-bit integers using unsigned saturation, and store the results in dst. 531 __m128i _mm_packus_epi32 (__m128i a, __m128i b) @trusted 532 { 533 } 534 unittest 535 { 536 } 537 */ 538 539 /// Round the packed double-precision (64-bit) floating-point elements in a using the rounding parameter, and store the results as packed double-precision floating-point elements in dst. 
540 /// Rounding is done according to the rounding[3:0] parameter, which can be one of: 541 /// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions 542 /// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions 543 /// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions 544 /// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions 545 /* 546 /// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE 547 __m128d _mm_round_pd (__m128d a, int rounding) @trusted 548 { 549 } 550 unittest 551 { 552 } 553 */ 554 555 /// Round the packed single-precision (32-bit) floating-point elements in a using the rounding parameter, and store the results as packed single-precision floating-point elements in dst. 556 /// Rounding is done according to the rounding[3:0] parameter, which can be one of: 557 /// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions 558 /// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions 559 /// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions 560 /// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions 561 /* 562 /// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE 563 __m128 _mm_round_ps (__m128 a, int rounding) @trusted 564 { 565 } 566 unittest 567 { 568 } 569 */ 570 571 /// Round the lower double-precision (64-bit) floating-point element in b using the rounding parameter, store the result as a double-precision floating-point element in the lower element of dst, and copy the upper element from a to the upper element of dst. 
572 /// Rounding is done according to the rounding[3:0] parameter, which can be one of: 573 /// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions 574 /// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions 575 /// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions 576 /// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions 577 /* 578 /// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE 579 __m128d _mm_round_sd (__m128d a, __m128d b, int rounding) @trusted 580 { 581 } 582 unittest 583 { 584 } 585 */ 586 587 /// Round the lower single-precision (32-bit) floating-point element in b using the rounding parameter, store the result as a single-precision floating-point element in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst. 588 /// Rounding is done according to the rounding[3:0] parameter, which can be one of: 589 /// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions 590 /// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions 591 /// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions 592 /// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions 593 /* 594 /// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE 595 __m128 _mm_round_ss (__m128 a, __m128 b, int rounding) @trusted 596 { 597 } 598 unittest 599 { 600 } 601 */ 602 603 /* 604 /// Load 128-bits of integer data from memory into dst using a non-temporal memory hint. mem_addr must be aligned on a 16-byte boundary or a general-protection exception may be generated. 
605 __m128i _mm_stream_load_si128 (__m128i * mem_addr) @trusted 606 { 607 } 608 unittest 609 { 610 } 611 */ 612 613 /* 614 /// Compute the bitwise NOT of a and then AND with a 128-bit vector containing all 1's, and return 1 if the result is zero, otherwise return 0. 615 int _mm_test_all_ones (__m128i a) @trusted 616 { 617 } 618 unittest 619 { 620 } 621 */ 622 623 /* 624 /// Compute the bitwise AND of 128 bits (representing integer data) in a and mask, and return 1 if the result is zero, otherwise return 0. 625 int _mm_test_all_zeros (__m128i a, __m128i mask) @trusted 626 { 627 } 628 unittest 629 { 630 } 631 */ 632 633 /* 634 /// Compute the bitwise AND of 128 bits (representing integer data) in a and mask, and set ZF to 1 if the result is zero, otherwise set ZF to 0. Compute the bitwise NOT of a and then AND with mask, and set CF to 1 if the result is zero, otherwise set CF to 0. Return 1 if both the ZF and CF values are zero, otherwise return 0. 635 int _mm_test_mix_ones_zeros (__m128i a, __m128i mask) @trusted 636 { 637 } 638 unittest 639 { 640 } 641 */ 642 643 /* 644 /// Compute the bitwise AND of 128 bits (representing integer data) in a and b, and set ZF to 1 if the result is zero, otherwise set ZF to 0. Compute the bitwise NOT of a and then AND with b, and set CF to 1 if the result is zero, otherwise set CF to 0. Return the CF value. 645 int _mm_testc_si128 (__m128i a, __m128i b) @trusted 646 { 647 } 648 unittest 649 { 650 } 651 */ 652 653 /* 654 /// Compute the bitwise AND of 128 bits (representing integer data) in a and b, and set ZF to 1 if the result is zero, otherwise set ZF to 0. Compute the bitwise NOT of a and then AND with b, and set CF to 1 if the result is zero, otherwise set CF to 0. Return 1 if both the ZF and CF values are zero, otherwise return 0. 
655 int _mm_testnzc_si128 (__m128i a, __m128i b) @trusted 656 { 657 } 658 unittest 659 { 660 } 661 */ 662 663 /* 664 /// Compute the bitwise AND of 128 bits (representing integer data) in a and b, and set ZF to 1 if the result is zero, otherwise set ZF to 0. Compute the bitwise NOT of a and then AND with b, and set CF to 1 if the result is zero, otherwise set CF to 0. Return the ZF value. 665 int _mm_testz_si128 (__m128i a, __m128i b) @trusted 666 { 667 } 668 unittest 669 { 670 } 671 */