/**
* Copyright: Copyright Auburn Sounds 2016-2018.
* License:   $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
* Authors:   Guillaume Piolat
*/
module inteli.emmintrin;

public import inteli.types;
public import inteli.xmmintrin; // SSE2 includes SSE1

import inteli.internals;

nothrow @nogc:

// SSE2 instructions
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#techs=SSE2

/// Lane-wise `+` on `a` and `b` viewed as 8 x 16-bit integers.
__m128i _mm_add_epi16 (__m128i a, __m128i b) pure @safe
{
    return cast(__m128i)(cast(short8)a + cast(short8)b);
}

/// Lane-wise `+` on `a` and `b` viewed as 4 x 32-bit integers.
__m128i _mm_add_epi32 (__m128i a, __m128i b) pure @safe
{
    return cast(__m128i)(cast(int4)a + cast(int4)b);
}

/// Lane-wise `+` on `a` and `b` viewed as 2 x 64-bit integers.
__m128i _mm_add_epi64 (__m128i a, __m128i b) pure @safe
{
    return cast(__m128i)(cast(long2)a + cast(long2)b);
}

/// Lane-wise `+` on `a` and `b` viewed as 16 x 8-bit integers.
__m128i _mm_add_epi8 (__m128i a, __m128i b) pure @safe
{
    return cast(__m128i)(cast(byte16)a + cast(byte16)b);
}

/// Adds the lower doubles of `a` and `b`; the upper lane of `a` is returned unchanged.
__m128d _mm_add_sd(__m128d a, __m128d b) pure @safe
{
    a[0] += b[0];
    return a;
}
unittest
{
    __m128d a = [1.5, -2.0];
    a = _mm_add_sd(a, a);
    assert(a.array == [3.0, -2.0]);
}


/// Lane-wise `+` on two vectors of 2 doubles.
__m128d _mm_add_pd (__m128d a, __m128d b) pure @safe
{
    return a + b;
}
unittest
{
    __m128d a = [1.5, -2.0];
    a = _mm_add_pd(a, a);
    assert(a.array == [3.0, -4.0]);
}

// MMXREG: _mm_add_si64

version(LDC)
{
    /// LDC: direct alias of the `paddsw` builtin.
    alias _mm_adds_epi16 = __builtin_ia32_paddsw128;
}
else
{
    /// Portable fallback: adds each 16-bit lane after integer promotion and
    /// clamps the sum through `saturateSignedIntToSignedShort`.
    __m128i _mm_adds_epi16(__m128i a, __m128i b) pure @trusted
    {
        short[8] res;
        short8 sa = cast(short8)a;
        short8 sb = cast(short8)b;
        foreach(i; 0..8)
            res[i] = saturateSignedIntToSignedShort(sa.array[i] + sb.array[i]);
        return _mm_loadu_si128(cast(int4*)res.ptr);
    }
}
unittest
{
    short8 res = cast(short8) _mm_adds_epi16(_mm_set_epi16(7, 6, 5, 4, 3, 2, 1, 0),
                                             _mm_set_epi16(7, 6, 5, 4, 3, 2, 1, 0));
    static immutable short[8] correctResult = [0, 2, 4, 6, 8, 10, 12, 14];
    assert(res.array == correctResult);
}

version(LDC)
{
    /// LDC: direct alias of the `paddsb` builtin.
    alias _mm_adds_epi8 = __builtin_ia32_paddsb128;
}
else
{
    /// Portable fallback: adds each 8-bit lane after integer promotion and
    /// clamps the sum through `saturateSignedWordToSignedByte`.
    __m128i _mm_adds_epi8(__m128i a, __m128i b) pure @trusted
    {
        byte[16] res;
        byte16 sa = cast(byte16)a;
        byte16 sb = cast(byte16)b;
        foreach(i; 0..16)
            res[i] = saturateSignedWordToSignedByte(sa.array[i] + sb.array[i]);
        return _mm_loadu_si128(cast(int4*)res.ptr);
    }
}
unittest
{
    byte16 res = cast(byte16) _mm_adds_epi8(_mm_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0),
                                            _mm_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0));
    static immutable byte[16] correctResult = [0, 2, 4, 6, 8, 10, 12, 14,
                                               16, 18, 20, 22, 24, 26, 28, 30];
    assert(res.array == correctResult);
}

version(LDC)
{
    /// LDC: direct alias of the `paddusb` builtin.
    alias _mm_adds_epu8 = __builtin_ia32_paddusb128;
}
else
{
    /// Portable fallback: reinterprets each 8-bit lane as unsigned, adds, and
    /// clamps the sum through `saturateSignedWordToUnsignedByte`.
    __m128i _mm_adds_epu8(__m128i a, __m128i b) pure @trusted
    {
        ubyte[16] res;
        byte16 sa = cast(byte16)a;
        byte16 sb = cast(byte16)b;
        foreach(i; 0..16)
            res[i] = saturateSignedWordToUnsignedByte(cast(ubyte)(sa.array[i]) + cast(ubyte)(sb.array[i]));
        return _mm_loadu_si128(cast(int4*)res.ptr);
    }
}

version(LDC)
{
    /// LDC: direct alias of the `paddusw` builtin.
    alias _mm_adds_epu16 = __builtin_ia32_paddusw128;
}
else
{
    /// Portable fallback: reinterprets each 16-bit lane as unsigned, adds, and
    /// clamps the sum through `saturateSignedIntToUnsignedShort`.
    __m128i _mm_adds_epu16(__m128i a, __m128i b) pure @trusted
    {
        ushort[8] res;
        short8 sa = cast(short8)a;
        short8 sb = cast(short8)b;
        foreach(i; 0..8)
            res[i] = saturateSignedIntToUnsignedShort(cast(ushort)(sa.array[i]) + cast(ushort)(sb.array[i]));
        return _mm_loadu_si128(cast(int4*)res.ptr);
    }
}

/// Bitwise AND of the bit patterns of two double vectors.
__m128d _mm_and_pd (__m128d a, __m128d b) pure @safe
{
    return cast(__m128d)( cast(__m128i)a & cast(__m128i)b );
}

/// Bitwise AND of two integer vectors.
__m128i _mm_and_si128 (__m128i a, __m128i b) pure @safe
{
    return a & b;
}

/// Bitwise `(~a) & b` on the bit patterns of two double vectors.
__m128d _mm_andnot_pd (__m128d a, __m128d b) pure @safe
{
    return cast(__m128d)( (~cast(__m128i)a) & cast(__m128i)b );
}

/// Bitwise `(~a) & b` on two integer vectors.
__m128i _mm_andnot_si128 (__m128i a, __m128i b) pure @safe
{
    return (~a) & b;
}

version(LDC)
{
    // LDC-only: declared straight from the LLVM `pavg` intrinsics.
    pragma(LDC_intrinsic, "llvm.x86.sse2.pavg.w")
        short8 _mm_avg_epu16(short8, short8) pure @safe;

    pragma(LDC_intrinsic, "llvm.x86.sse2.pavg.b")
        byte16 _mm_avg_epu8(byte16, byte16) pure @safe;
}
// TODO


// TODO: __m128i _mm_bslli_si128 (__m128i a, int imm8)
// TODO: __m128i _mm_bsrli_si128 (__m128i a, int imm8)

/// Reinterprets a double vector as a float vector (bit cast).
__m128 _mm_castpd_ps (__m128d a) pure @safe
{
    return cast(__m128)a;
}

/// Reinterprets a double vector as an integer vector (bit cast).
__m128i _mm_castpd_si128 (__m128d a) pure @safe
{
    return cast(__m128i)a;
}

/// Reinterprets a float vector as a double vector (bit cast).
__m128d _mm_castps_pd (__m128 a) pure @safe
{
    return cast(__m128d)a;
}

/// Reinterprets a float vector as an integer vector (bit cast).
__m128i _mm_castps_si128 (__m128 a) pure @safe
{
    return cast(__m128i)a;
}

/// Reinterprets an integer vector as a double vector (bit cast).
__m128d _mm_castsi128_pd (__m128i a) pure @safe
{
    return cast(__m128d)a;
}

/// Reinterprets an integer vector as a float vector (bit cast).
__m128 _mm_castsi128_ps (__m128i a) pure @safe
{
    return cast(__m128)a;
}

version(LDC)
{
    /// LDC: direct alias of the `clflush` builtin.
    alias _mm_clflush = __builtin_ia32_clflush;
}
// TODO

version(LDC)
{
    // Packed-double compare builtin; the third argument is the comparison
    // predicate immediate used by the `_mm_cmp*_pd` wrappers below.
    pragma(LDC_intrinsic, "llvm.x86.sse2.cmp.pd")
        double2 __builtin_ia32_cmppd(double2, double2, byte) pure @safe;
}
// TODO

/// Per-lane equality mask for 8 x 16-bit integers, built with `equalMask`.
__m128i _mm_cmpeq_epi16 (__m128i a, __m128i b) pure @safe
{
    return cast(__m128i) equalMask!short8(cast(short8)a, cast(short8)b);
}

/// Per-lane equality mask for 4 x 32-bit integers, built with `equalMask`.
__m128i _mm_cmpeq_epi32 (__m128i a, __m128i b) pure @safe
{
    return equalMask!__m128i(a, b);
}

/// Per-lane equality mask for 16 x 8-bit integers, built with `equalMask`.
__m128i _mm_cmpeq_epi8 (__m128i a, __m128i b) pure @safe
{
    return cast(__m128i) equalMask!byte16(cast(byte16)a, cast(byte16)b);
}
unittest
{
    __m128i A = _mm_setr_epi8(1, 2, 3, 1, 2, 1, 1, 2, 3, 2, 1, 0, 0, 1, 2, 1);
    __m128i B = _mm_setr_epi8(2, 2, 1, 2, 3, 1, 2, 3, 2, 1, 0, 0, 1, 2, 1, 1);

    byte16 C = cast(byte16) _mm_cmpeq_epi8(A, B);
    static immutable byte[16] correct =
        [0,-1, 0, 0, 0,-1, 0, 0, 0, 0, 0,-1, 0, 0, 0, -1];

    assert(C.array == correct);
}


// Floating-point comparisons (LDC only for now).
//
// Predicate immediates passed to cmppd/cmpsd:
//   0 = EQ, 1 = LT, 2 = LE, 3 = UNORD, 4 = NEQ, 5 = NLT, 6 = NLE, 7 = ORD.
// "Greater" comparisons are expressed by swapping the operands. For the
// scalar (`_sd`) forms the upper lane of the result comes from the FIRST
// builtin operand, so after a swap it must be restored from `a`.
version(LDC)
{
    /// Packed `a == b`.
    __m128d _mm_cmpeq_pd (__m128d a, __m128d b) pure @safe
    {
        return __builtin_ia32_cmppd(a, b, 0);
    }

    /// Scalar `a[0] == b[0]`; upper lane from `a`.
    __m128d _mm_cmpeq_sd (__m128d a, __m128d b) pure @safe
    {
        return __builtin_ia32_cmpsd(a, b, 0);
    }

    /// Packed `a >= b`, computed as `b <= a`.
    __m128d _mm_cmpge_pd (__m128d a, __m128d b) pure @safe
    {
        return __builtin_ia32_cmppd(b, a, 2);
    }

    /// Scalar `a[0] >= b[0]`, computed as `b[0] <= a[0]`; upper lane from `a`.
    __m128d _mm_cmpge_sd (__m128d a, __m128d b) pure @safe
    {
        __m128d r = __builtin_ia32_cmpsd(b, a, 2);
        r[1] = a[1]; // operands were swapped: restore the upper lane from `a`
        return r;
    }
}
// TODO


// TODO
/+__m128i _mm_cmpgt_epi16 (__m128i a, __m128i b) pure @safe
{
    return cast(__m128i)( cast(short8)a > cast(short8)b );
}

__m128i _mm_cmpgt_epi32 (__m128i a, __m128i b) pure @safe
{
    return cast(__m128i)( cast(int4)a > cast(int4)b );
}

__m128i _mm_cmpgt_epi8 (__m128i a, __m128i b) pure @safe
{
    return cast(__m128i)( cast(byte16)a > cast(byte16)b );
}+/

version(LDC)
{
    /// Packed `a > b`, computed as `b < a`.
    __m128d _mm_cmpgt_pd (__m128d a, __m128d b) pure @safe
    {
        return __builtin_ia32_cmppd(b, a, 1);
    }

    /// Scalar `a[0] > b[0]`, computed as `b[0] < a[0]`; upper lane from `a`.
    __m128d _mm_cmpgt_sd (__m128d a, __m128d b) pure @safe
    {
        __m128d r = __builtin_ia32_cmpsd(b, a, 1);
        r[1] = a[1]; // operands were swapped: restore the upper lane from `a`
        return r;
    }
    unittest
    {
        __m128d a = [1.0, 10.0];
        __m128d b = [0.5, 20.0];

        // Swapped-operand scalar compares must keep the upper lane of `a`.
        __m128d gt = _mm_cmpgt_sd(a, b);
        assert((cast(long2) gt).array[0] == -1);
        assert(gt.array[1] == 10.0);

        __m128d ge = _mm_cmpge_sd(a, b);
        assert((cast(long2) ge).array[0] == -1);
        assert(ge.array[1] == 10.0);
    }

    /// Packed `a <= b`.
    __m128d _mm_cmple_pd (__m128d a, __m128d b) pure @safe
    {
        return __builtin_ia32_cmppd(a, b, 2);
    }

    /// Scalar `a[0] <= b[0]`; upper lane from `a`.
    __m128d _mm_cmple_sd (__m128d a, __m128d b) pure @safe
    {
        return __builtin_ia32_cmpsd(a, b, 2);
    }
}
// TODO

// TODO
/+__m128i _mm_cmplt_epi16 (__m128i a, __m128i b) pure @safe
{
    return cast(__m128i)( cast(short8)a < cast(short8)b );
}

__m128i _mm_cmplt_epi32 (__m128i a, __m128i b) pure @safe
{
    return cast(__m128i)( cast(int4)a < cast(int4)b );
}

__m128i _mm_cmplt_epi8 (__m128i a, __m128i b) pure @safe
{
    return cast(__m128i)( cast(byte16)a < cast(byte16)b );
}+/

version(LDC)
{
    /// Packed `a < b`.
    __m128d _mm_cmplt_pd (__m128d a, __m128d b) pure @safe
    {
        return __builtin_ia32_cmppd(a, b, 1);
    }

    /// Scalar `a[0] < b[0]`; upper lane from `a`.
    __m128d _mm_cmplt_sd (__m128d a, __m128d b) pure @safe
    {
        return __builtin_ia32_cmpsd(a, b, 1);
    }

    /// Packed `a != b`.
    __m128d _mm_cmpneq_pd (__m128d a, __m128d b) pure @safe
    {
        return __builtin_ia32_cmppd(a, b, 4);
    }

    /// Scalar `a[0] != b[0]`; upper lane from `a`.
    __m128d _mm_cmpneq_sd (__m128d a, __m128d b) pure @safe
    {
        return __builtin_ia32_cmpsd(a, b, 4);
    }

    /// Packed `!(a >= b)`, computed as `!(b <= a)`.
    __m128d _mm_cmpnge_pd (__m128d a, __m128d b) pure @safe
    {
        return __builtin_ia32_cmppd(b, a, 6);
    }

    /// Scalar `!(a[0] >= b[0])`, computed as `!(b[0] <= a[0])`; upper lane from `a`.
    __m128d _mm_cmpnge_sd (__m128d a, __m128d b) pure @safe
    {
        __m128d r = __builtin_ia32_cmpsd(b, a, 6);
        r[1] = a[1]; // operands were swapped: restore the upper lane from `a`
        return r;
    }

    /// Packed `!(a > b)`, computed as `!(b < a)`.
    __m128d _mm_cmpngt_pd (__m128d a, __m128d b) pure @safe
    {
        return __builtin_ia32_cmppd(b, a, 5);
    }

    /// Scalar `!(a[0] > b[0])`, computed as `!(b[0] < a[0])`; upper lane from `a`.
    __m128d _mm_cmpngt_sd (__m128d a, __m128d b) pure @safe
    {
        __m128d r = __builtin_ia32_cmpsd(b, a, 5);
        r[1] = a[1]; // operands were swapped: restore the upper lane from `a`
        return r;
    }
    unittest
    {
        __m128d a = [1.0, 10.0];
        __m128d b = [0.5, 20.0];

        // a[0] > b[0], so both negated predicates are false in lane 0,
        // and the upper lane must still come from `a`.
        __m128d nge = _mm_cmpnge_sd(a, b);
        assert((cast(long2) nge).array[0] == 0);
        assert(nge.array[1] == 10.0);

        __m128d ngt = _mm_cmpngt_sd(a, b);
        assert((cast(long2) ngt).array[0] == 0);
        assert(ngt.array[1] == 10.0);
    }

    /// Packed `!(a <= b)`.
    __m128d _mm_cmpnle_pd (__m128d a, __m128d b) pure @safe
    {
        return __builtin_ia32_cmppd(a, b, 6);
    }

    /// Scalar `!(a[0] <= b[0])`; upper lane from `a`.
    __m128d _mm_cmpnle_sd (__m128d a, __m128d b) pure @safe
    {
        return __builtin_ia32_cmpsd(a, b, 6);
    }

    /// Packed `!(a < b)`.
    __m128d _mm_cmpnlt_pd (__m128d a, __m128d b) pure @safe
    {
        return __builtin_ia32_cmppd(a, b, 5);
    }

    /// Scalar `!(a[0] < b[0])`; upper lane from `a`.
    __m128d _mm_cmpnlt_sd (__m128d a, __m128d b) pure @safe
    {
        return __builtin_ia32_cmpsd(a, b, 5);
    }

    /// Packed "ordered" test (predicate 7).
    __m128d _mm_cmpord_pd (__m128d a, __m128d b) pure @safe
    {
        return __builtin_ia32_cmppd(a, b, 7);
    }

    /// Scalar "ordered" test (predicate 7); upper lane from `a`.
    __m128d _mm_cmpord_sd (__m128d a, __m128d b) pure @safe
    {
        return __builtin_ia32_cmpsd(a, b, 7);
    }

    /// Packed "unordered" test (predicate 3).
    __m128d _mm_cmpunord_pd (__m128d a, __m128d b) pure @safe
    {
        return __builtin_ia32_cmppd(a, b, 3);
    }

    /// Scalar "unordered" test (predicate 3); upper lane from `a`.
    __m128d _mm_cmpunord_sd (__m128d a, __m128d b) pure @safe
    {
        return __builtin_ia32_cmpsd(a, b, 3);
    }
}
// TODO

version(LDC)
{
    // LDC-only: direct aliases of the `comisd` builtins.
    alias _mm_comieq_sd = __builtin_ia32_comisdeq;
    alias _mm_comige_sd = __builtin_ia32_comisdge;
    alias _mm_comigt_sd = __builtin_ia32_comisdgt;
    alias _mm_comile_sd = __builtin_ia32_comisdle;
    alias _mm_comilt_sd = __builtin_ia32_comisdlt;
    alias _mm_comineq_sd = __builtin_ia32_comisdneq;
}
// TODO

// TODO: alias _mm_cvtepi32_pd = __builtin_ia32_cvtdq2pd;

// PERF: replace with __builtin_convertvector when available
/// Converts each of the 4 x 32-bit integers in `a` to `float`.
__m128 _mm_cvtepi32_ps(__m128i a) pure @safe
{
    __m128 res;
    res.array[0] = cast(float)a.array[0];
    res.array[1] = cast(float)a.array[1];
    res.array[2] = cast(float)a.array[2];
    res.array[3] = cast(float)a.array[3];
    return res;
}
unittest
{
    __m128 a = _mm_cvtepi32_ps(_mm_setr_epi32(-1, 0, 1, 1000));
    assert(a.array == [-1.0f, 0.0f, 1.0f, 1000.0f]);
}


version(LDC) // TODO
{
    /// LDC: direct alias of the `cvtpd2dq` builtin.
    alias _mm_cvtpd_epi32 = __builtin_ia32_cvtpd2dq;
}

// MMXREG: _mm_cvtpd_pi32
version(LDC)
{
    /// LDC: direct alias of the `cvtpd2ps` builtin.
    alias _mm_cvtpd_ps = __builtin_ia32_cvtpd2ps;
    // MMXREG: _mm_cvtpi32_pd
    /// LDC: direct alias of the `cvtps2dq` builtin.
    alias _mm_cvtps_epi32 = __builtin_ia32_cvtps2dq;
}
// TODO

// TODO: alias _mm_cvtps_pd = __builtin_ia32_cvtps2pd;

/// Returns the lower double of `a`.
double _mm_cvtsd_f64 (__m128d a) pure @safe
{
    return extractelement!(double2, 0)(a);
}

version(LDC)
{
    // LDC-only: direct aliases of the `cvtsd2si` builtins.
    alias _mm_cvtsd_si32 = __builtin_ia32_cvtsd2si;
    alias _mm_cvtsd_si64 = __builtin_ia32_cvtsd2si64;
    alias _mm_cvtsd_si64x = _mm_cvtsd_si64;
}
// TODO

version(LDC)
{
    /// LDC: direct alias of the `cvtsd2ss` builtin.
    alias _mm_cvtsd_ss = __builtin_ia32_cvtsd2ss;
}
// TODO

/// Returns the lowest 32-bit integer of `a`.
int _mm_cvtsi128_si32 (__m128i a) pure @safe
{
    return a[0];
}

/// Returns the lower 64-bit integer of `a`.
long _mm_cvtsi128_si64 (__m128i a) pure @safe
{
    long2 la = cast(long2)a;
    return la[0];
}
alias _mm_cvtsi128_si64x = _mm_cvtsi128_si64;

/// Replaces the lower double of `v` with `x` converted to `double`.
__m128d _mm_cvtsi32_sd(__m128d v, int x) pure @safe
{
    v[0] = cast(double)x;
    return v;
}
unittest
{
    __m128d a = _mm_cvtsi32_sd(_mm_set1_pd(0.0f), 42);
    assert(a.array == [42.0, 0]);
}

/// Builds an integer vector with `a` in lane 0 and zeros elsewhere.
__m128i _mm_cvtsi32_si128 (int a) pure @safe
{
    int4 r = [0, 0, 0, 0];
    r[0] = a;
    return r;
}

// Note: on macOS, using "llvm.x86.sse2.cvtsi642sd" was buggy
/// Replaces the lower double of `v` with `x` converted to `double`.
__m128d _mm_cvtsi64_sd(__m128d v, long x) pure @safe
{
    v[0] = cast(double)x;
    return v;
}
unittest
{
    __m128d a = _mm_cvtsi64_sd(_mm_set1_pd(0.0f), 42);
    assert(a.array == [42.0, 0]);
}

/// Builds an integer vector with `a` in the lower 64 bits and zero above.
__m128i _mm_cvtsi64_si128 (long a) pure @safe
{
    long2 r = [0, 0];
    r[0] = a;
    return cast(__m128i)(r);
}

alias _mm_cvtsi64x_sd = _mm_cvtsi64_sd;
alias _mm_cvtsi64x_si128 = _mm_cvtsi64_si128;

/// Replaces the lower double of `v` with the lowest float of `x`.
double2 _mm_cvtss_sd(double2 v, float4 x) pure @safe
{
    v[0] = x[0];
    return v;
}
unittest
{
    __m128d a = _mm_cvtss_sd(_mm_set1_pd(0.0f), _mm_set1_ps(42.0f));
    assert(a.array == [42.0, 0]);
}

version(LDC)
{
    // LDC-only: direct aliases of the truncating-conversion builtins.
    alias _mm_cvttpd_epi32 = __builtin_ia32_cvttpd2dq;
    //MMXREG: _mm_cvttpd_pi32
    alias _mm_cvttps_epi32 = __builtin_ia32_cvttps2dq;
    alias _mm_cvttsd_si32 = __builtin_ia32_cvttsd2si;
    alias _mm_cvttsd_si64 = __builtin_ia32_cvttsd2si64;
    alias _mm_cvttsd_si64x = _mm_cvttsd_si64;
}
// TODO



/// Lane-wise `/` on two vectors of 2 doubles.
__m128d _mm_div_pd(__m128d a, __m128d b) pure @safe
{
    return a / b;
}
unittest
{
    __m128d a = [2.0, 4.5];
    a = _mm_div_pd(a, a);
    assert(a.array == [1.0, 1.0]);
}

/// Legacy misspelling of `_mm_div_pd`, kept so existing callers keep compiling.
/// New code should call `_mm_div_pd`.
/// NOTE(review): because this module declares its own `_mm_div_ps`, lookups
/// that go through `inteli.emmintrin` alone may not see the `__m128` overload
/// from `inteli.xmmintrin` - confirm, and consider retiring this name.
__m128d _mm_div_ps(__m128d a, __m128d b) pure @safe
{
    return _mm_div_pd(a, b);
}

/// Divides the lower double of `a` by the lower double of `b`;
/// the upper lane of `a` is returned unchanged.
__m128d _mm_div_sd(__m128d a, __m128d b) pure @safe
{
    a[0] /= b[0];
    return a;
}
unittest
{
    __m128d a = [2.0, 4.5];
    a = _mm_div_sd(a, a);
    assert(a.array == [1.0, 4.5]);
}

/// Returns 16-bit lane `imm8 & 7` of `a`, zero-extended to `int`.
int _mm_extract_epi16(int imm8)(__m128i a) pure @safe
{
    short8 lanes = cast(short8)a;
    // Go through ushort so that negative lanes are zero-extended, not sign-extended.
    return cast(ushort)(lanes.array[imm8 & 7]);
}
unittest
{
    __m128i A = _mm_setr_epi16(-1, 1, 2, 3, 4, 5, 6, 7);
    assert(_mm_extract_epi16!0(A) == 65535);
    assert(_mm_extract_epi16!5(A) == 5);
}

/// Returns a copy of `a` whose 16-bit lane `imm8 & 7` is replaced by the
/// low 16 bits of `i`.
__m128i _mm_insert_epi16(int imm8)(__m128i a, int i) pure @safe
{
    short8 lanes = cast(short8)a;
    lanes.array[imm8 & 7] = cast(short)i;
    return cast(__m128i)lanes;
}
unittest
{
    __m128i A = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
    short8 B = cast(short8) _mm_insert_epi16!3(A, 42);
    short[8] expectedB = [ 0, 1, 2, 42, 4, 5, 6, 7 ];
    assert(B.array == expectedB);
}

version(LDC)
{
    /// LDC: direct alias of the `lfence` builtin.
    alias _mm_lfence = __builtin_ia32_lfence;
}
// TODO


/// Loads 2 doubles through a `__m128d*` view of `mem_addr`
/// (the pointer is dereferenced as a whole vector).
__m128d _mm_load_pd (const(double) * mem_addr) pure
{
    __m128d* aligned = cast(__m128d*)mem_addr;
    return *aligned;
}

/// Loads one double from `mem_addr` and broadcasts it to both lanes.
__m128d _mm_load_pd1 (const(double)* mem_addr) pure
{
    double[2] arr = [*mem_addr, *mem_addr];
    return loadUnaligned!(double2)(&arr[0]);
}

/// Loads one double into the lower lane; the upper lane is zero.
__m128d _mm_load_sd (const(double)* mem_addr) pure @safe
{
    double2 r = [0, 0];
    r[0] = *mem_addr;
    return r;
}
unittest
{
    double x = -42;
    __m128d a = _mm_load_sd(&x);
    assert(a.array == [-42.0, 0.0]);
}

/// Loads a whole integer vector by dereferencing `mem_addr`.
__m128i _mm_load_si128 (const(__m128i)* mem_addr) pure @trusted
{
    return *mem_addr;
}

alias _mm_load1_pd = _mm_load_pd1;

/// Replaces the upper double of `a` with the value at `mem_addr`.
__m128d _mm_loadh_pd (__m128d a, const(double)* mem_addr) pure @safe
{
    a[1] = *mem_addr;
    return a;
}

// Note: strange signature since the memory doesn't have to be aligned
/// Loads 64 bits from `mem_addr` into the lower half; the upper half is zero.
__m128i _mm_loadl_epi64 (const(__m128i)* mem_addr) pure @safe
{
    auto pLong = cast(const(long)*)mem_addr;
    long2 r = [0, 0];
    r[0] = *pLong;
    return cast(__m128i)(r);
}

/// Replaces the lower double of `a` with the value at `mem_addr`.
__m128d _mm_loadl_pd (__m128d a, const(double)* mem_addr) pure @safe
{
    a[0] = *mem_addr;
    return a;
}

/// Loads 2 doubles with `_mm_load_pd` and returns them in swapped order.
__m128d _mm_loadr_pd (const(double)* mem_addr) pure @trusted
{
    __m128d a = _mm_load_pd(mem_addr);
    return shufflevector!(__m128d, 1, 0)(a, a);
}

/// Loads 2 doubles via `loadUnaligned`.
__m128d _mm_loadu_pd (const(double)* mem_addr) pure @safe
{
    return loadUnaligned!(double2)(mem_addr);
}

/// Loads an integer vector via `loadUnaligned`.
__m128i _mm_loadu_si128 (const(__m128i)* mem_addr) pure @trusted
{
    return loadUnaligned!(__m128i)(cast(int*)mem_addr);
}

version(LDC)
{
    // LDC-only: direct aliases / declarations of the corresponding builtins.
    alias _mm_madd_epi16 = __builtin_ia32_pmaddwd128;

    alias _mm_maskmoveu_si128 = __builtin_ia32_maskmovdqu;

    pragma(LDC_intrinsic, "llvm.x86.sse2.pmaxs.w")
        short8 __builtin_ia32_pmaxsw128(short8, short8) pure @safe;
    alias _mm_max_epi16 = __builtin_ia32_pmaxsw128;

    pragma(LDC_intrinsic, "llvm.x86.sse2.pmaxu.b")
        byte16 __builtin_ia32_pmaxub128(byte16, byte16) pure @safe;
    alias _mm_max_epu8 = __builtin_ia32_pmaxub128;

    alias _mm_max_pd = __builtin_ia32_maxpd;
    alias _mm_max_sd = __builtin_ia32_maxsd;

    alias _mm_mfence = __builtin_ia32_mfence;

    pragma(LDC_intrinsic, "llvm.x86.sse2.pmins.w")
        short8 __builtin_ia32_pminsw128(short8, short8) pure @safe;
    alias _mm_min_epi16 = __builtin_ia32_pminsw128;

    pragma(LDC_intrinsic, "llvm.x86.sse2.pminu.b")
        byte16 __builtin_ia32_pminub128(byte16, byte16) pure @safe;
    alias _mm_min_epu8 = __builtin_ia32_pminub128;

    alias _mm_min_pd = __builtin_ia32_minpd;
    alias _mm_min_sd = __builtin_ia32_minsd;
}
// TODO

/// Keeps the lower 64 bits of `a` and zeroes the upper 64 bits.
__m128i _mm_move_epi64 (__m128i a) pure @safe
{
    long2 result = [ 0, 0 ];
    long2 la = cast(long2) a;
    result[0] = la[0];
    return cast(__m128i)(result);
}
unittest
{
    long2 A = [13, 47];
    long2 B = cast(long2) _mm_move_epi64( cast(__m128i)A );
    long[2] correct = [13, 0];
    assert(B.array == correct);
}

/// Returns a vector whose lower double comes from `b` and upper double from `a`.
__m128d _mm_move_sd (__m128d a, __m128d b) pure @safe
{
    b[1] = a[1];
    return b;
}
unittest
{
    double2 A = [13.0, 47.0];
    double2 B = [34.0, 58.0];
    double2 C = _mm_move_sd(A, B);
    double[2] correct = [34.0, 47.0];
    assert(C.array == correct);
}

version(LDC)
{
    // LDC-only: direct aliases of the `movmsk` builtins.
    alias _mm_movemask_epi8 = __builtin_ia32_pmovmskb128;
    alias _mm_movemask_pd = __builtin_ia32_movmskpd;
}

// MMXREG: _mm_movepi64_pi64
// MMXREG: __m128i _mm_movpi64_epi64 (__m64 a)

// PERF: unfortunately, __builtin_ia32_pmuludq128 disappeared from LDC
// but seems there in clang
/// Multiplies 32-bit lanes 0 and 2 of `a` and `b` as 64-bit values.
/// Lanes 0 and 2 are interleaved with zeros so that each occupies the low
/// half of a 64-bit lane before the 64-bit multiply.
__m128i _mm_mul_epu32(__m128i a, __m128i b) pure @safe
{
    __m128i zero = _mm_setzero_si128();
    long2 la = cast(long2) shufflevector!(int4, 0, 4, 2, 6)(a, zero);
    long2 lb = cast(long2) shufflevector!(int4, 0, 4, 2, 6)(b, zero);
    static if (__VERSION__ >= 2076)
    {
        return cast(__m128i)(la * lb);
    }
    else
    {
        // long2 mul not supported before
        la[0] *= lb[0];
        la[1] *= lb[1];
        return cast(__m128i)(la);
    }
}
unittest
{
    __m128i A = _mm_set_epi32(0, 0xDEADBEEF, 0, 0xffffffff);
    __m128i B = _mm_set_epi32(0, 0xCAFEBABE, 0, 0xffffffff);
    __m128i C = _mm_mul_epu32(A, B);
    long2 LC = cast(long2)C;
    assert(LC.array[0] == 18446744065119617025uL);
    assert(LC.array[1] == 12723420444339690338uL);
}


/// Lane-wise `*` on two vectors of 2 doubles.
__m128d _mm_mul_pd(__m128d a, __m128d b) pure @safe
{
    return a * b;
}
unittest
{
    __m128d a = [-2.0, 1.5];
    a = _mm_mul_pd(a, a);
    assert(a.array == [4.0, 2.25]);
}

/// Multiplies the lower doubles of `a` and `b`; the upper lane of `a` is
/// returned unchanged.
__m128d _mm_mul_sd(__m128d a, __m128d b) pure @safe
{
    a[0] *= b[0];
    return a;
}
unittest
{
    __m128d a = [-2.0, 1.5];
    a = _mm_mul_sd(a, a);
    assert(a.array == [4.0, 1.5]);
}


// MMXREG: _mm_mul_su32

version(LDC)
{
    // LDC-only: direct aliases of the `pmulhw` / `pmulhuw` builtins.
    alias _mm_mulhi_epi16 = __builtin_ia32_pmulhw128;
    alias _mm_mulhi_epu16 = __builtin_ia32_pmulhuw128;
}
// TODO

/// Lane-wise `*` on `a` and `b` viewed as 8 x 16-bit integers.
__m128i _mm_mullo_epi16 (__m128i a, __m128i b) pure @safe
{
    return cast(__m128i)(cast(short8)a * cast(short8)b);
}

/// Bitwise OR of the bit patterns of two double vectors.
__m128d _mm_or_pd (__m128d a, __m128d b) pure @safe
{
    return cast(__m128d)( cast(__m128i)a | cast(__m128i)b );
}

/// Bitwise OR of two integer vectors.
__m128i _mm_or_si128 (__m128i a, __m128i b) pure @safe
{
    return a | b;
}

version(LDC)
{
    // LDC-only: direct aliases of the signed pack builtins.
    alias _mm_packs_epi32 = __builtin_ia32_packssdw128;
    alias _mm_packs_epi16 = __builtin_ia32_packsswb128;
}
version(LDC)
{
    /// LDC: direct alias of the `packuswb` builtin.
    alias _mm_packus_epi16 = __builtin_ia32_packuswb128;
}
else
{
    /// Portable fallback: clamps each 16-bit lane of `a` then `b` to 0..255
    /// and stores the results as 16 bytes (`a` in bytes 0-7, `b` in bytes 8-15).
    __m128i _mm_packus_epi16 (__m128i a, __m128i b) pure
    {
        short8 sa = cast(short8)a;
        short8 sb = cast(short8)b;
        ubyte[16] result = void;
        for (int i = 0; i < 8; ++i)
        {
            short s = sa[i];
            if (s < 0) s = 0;
            if (s > 255) s = 255;
            result[i] = cast(ubyte)s;

            s = sb[i];
            if (s < 0) s = 0;
            if (s > 255) s = 255;
            result[i+8] = cast(ubyte)s;
        }
        return cast(__m128i) loadUnaligned!(byte16)(cast(byte*)result.ptr);
    }
}
unittest
{
    __m128i A = _mm_setr_epi16(-10, 400, 0, 256, 255, 2, 1, 0);
    byte16 AA = cast(byte16) _mm_packus_epi16(A, A);
    static immutable ubyte[16] correctResult = [0, 255, 0, 255, 255, 2, 1, 0,
                                                0, 255, 0, 255, 255, 2, 1, 0];
    foreach(i; 0..16)
        assert(AA[i] == cast(byte)(correctResult[i]));
}

// TODO
version(LDC)
{
    /// LDC: direct alias of the `pause` builtin.
    alias _mm_pause = __builtin_ia32_pause;
}
// TODO

version(LDC)
{
    /// LDC: direct alias of the `psadbw` builtin.
    alias _mm_sad_epu8 = __builtin_ia32_psadbw128;
}
// TODO

/// Builds a vector of 8 x 16-bit integers; `e0` goes to the lowest lane.
__m128i _mm_set_epi16 (short e7, short e6, short e5, short e4, short e3, short e2, short e1, short e0) pure @trusted
{
    short[8] result = [e0, e1, e2, e3, e4, e5, e6, e7];
    return cast(__m128i) loadUnaligned!(short8)(result.ptr);
}
unittest
{
    __m128i A = _mm_set_epi16(7, 6, 5, 4, 3, 2, 1, 0);
    short8 B = cast(short8) A;
    foreach(i; 0..8)
        assert(B.array[i] == i);
}

/// Builds a vector of 4 x 32-bit integers; `e0` goes to the lowest lane.
__m128i _mm_set_epi32 (int e3, int e2, int e1, int e0) pure @trusted
{
    int[4] result = [e0, e1, e2, e3];
    return loadUnaligned!(int4)(result.ptr);
}
unittest
{
    __m128i A = _mm_set_epi32(3, 2, 1, 0);
    foreach(i; 0..4)
        assert(A.array[i] == i);
}

/// Builds a vector of 2 x 64-bit integers; `e0` goes to the lower lane.
__m128i _mm_set_epi64x (long e1, long e0) pure @trusted
{
    long[2] result = [e0, e1];
    return cast(__m128i)( loadUnaligned!(long2)(result.ptr) );
}
unittest
{
    __m128i A = _mm_set_epi64x(1234, 5678);
    long2 B = cast(long2) A;
    assert(B.array[0] == 5678);
    assert(B.array[1] == 1234);
}

/// Builds a vector of 16 x 8-bit integers; `e0` goes to the lowest lane.
__m128i _mm_set_epi8 (byte e15, byte e14, byte e13, byte e12,
                      byte e11, byte e10, byte e9, byte e8,
                      byte e7, byte e6, byte e5, byte e4,
                      byte e3, byte e2, byte e1, byte e0) pure @trusted
{
    byte[16] result = [e0, e1, e2, e3, e4, e5, e6, e7,
                       e8, e9, e10, e11, e12, e13, e14, e15];
    return cast(__m128i)( loadUnaligned!(byte16)(result.ptr) );
}

/// Builds a vector of 2 doubles; `e0` goes to the lower lane.
__m128d _mm_set_pd (double e1, double e0) pure @trusted
{
    double[2] result = [e0, e1];
    return loadUnaligned!(double2)(result.ptr);
}

/// Broadcasts `a` to both double lanes.
__m128d _mm_set_pd1 (double a) pure @trusted
{
    double[2] result = [a, a];
    return loadUnaligned!(double2)(result.ptr);
}

/// Builds a vector with `a` in the lower lane and 0 in the upper lane.
__m128d _mm_set_sd (double a) pure @trusted
{
    double[2] result = [a, 0];
    return loadUnaligned!(double2)(result.ptr);
}

/// Broadcasts `a` to all 8 x 16-bit lanes.
__m128i _mm_set1_epi16 (short a) pure @trusted
{
    short[8] result = [a, a, a, a, a, a, a, a];
    return cast(__m128i)( loadUnaligned!(short8)(result.ptr) );
}

/// Broadcasts `a` to all 4 x 32-bit lanes.
__m128i _mm_set1_epi32 (int a) pure @trusted
{
    int[4] result = [a, a, a, a];
    return loadUnaligned!(int4)(result.ptr);
}
unittest
{
    __m128 a = _mm_set1_ps(-1.0f);
    __m128 b = cast(__m128) _mm_set1_epi32(0x7fffffff);
    assert(_mm_and_ps(a, b).array == [1.0f, 1, 1, 1]);
}

/// Broadcasts `a` to both 64-bit lanes.
__m128i _mm_set1_epi64x (long a) pure @trusted
{
    long[2] result = [a, a];
    return cast(__m128i)( loadUnaligned!(long2)(result.ptr) );
}

/// Broadcasts `a` to all 16 x 8-bit lanes.
__m128i _mm_set1_epi8 (char a) pure @trusted
{
    byte[16] result = [a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a];
    return cast(__m128i)( loadUnaligned!(byte16)(result.ptr) );
}

alias _mm_set1_pd = _mm_set_pd1;

/// Builds a vector of 8 x 16-bit integers in argument order: the FIRST
/// argument goes to the lowest lane (parameter names notwithstanding).
__m128i _mm_setr_epi16 (short e7, short e6, short e5, short e4, short e3, short e2, short e1, short e0) pure @trusted
{
    short[8] result = [e7, e6, e5, e4, e3, e2, e1, e0];
    return cast(__m128i)( loadUnaligned!(short8)(result.ptr) );
}

/// Builds a vector of 4 x 32-bit integers in argument order: the first
/// argument goes to the lowest lane.
__m128i _mm_setr_epi32 (int e3, int e2, int e1, int e0) pure @trusted
{
    int[4] result = [e3, e2, e1, e0];
    return cast(__m128i)( loadUnaligned!(int4)(result.ptr) );
}

/// Builds a vector of 2 x 64-bit integers in argument order: the first
/// argument goes to the lower lane.
__m128i _mm_setr_epi64 (long e1, long e0) pure @trusted
{
    long[2] result = [e1, e0];
    return cast(__m128i)( loadUnaligned!(long2)(result.ptr) );
}

/// Builds a vector of 16 x 8-bit integers in argument order: the first
/// argument goes to the lowest lane.
__m128i _mm_setr_epi8 (char e15, char e14, char e13, char e12,
                       char e11, char e10, char e9,  char e8,
                       char e7,  char e6,  char e5,  char e4,
                       char e3,  char e2,  char e1,  char e0) pure @trusted
{
    byte[16] result = [e15, e14, e13, e12, e11, e10, e9, e8,
                       e7,  e6,  e5,  e4,  e3,  e2,  e1, e0];
    return cast(__m128i)( loadUnaligned!(byte16)(result.ptr) );
}

/// Builds a vector of 2 doubles in argument order: the first argument goes
/// to the lower lane.
__m128d _mm_setr_pd (double e1, double e0) pure @trusted
{
    double[2] result = [e1, e0];
    return loadUnaligned!(double2)(result.ptr);
}

/// Returns a double vector with both lanes set to 0.0.
__m128d _mm_setzero_pd () pure @trusted
{
    double[2] result = [0.0, 0.0];
    return loadUnaligned!(double2)(result.ptr);
}

/// Returns an integer vector with all lanes set to 0.
__m128i _mm_setzero_si128() pure @trusted
{
    int[4] result = [0, 0, 0, 0];
    return cast(__m128i)( loadUnaligned!(int4)(result.ptr) );
}

/// Permutes the 4 x 32-bit lanes of `a`; each 2-bit field of `imm8`
/// (lowest field first) selects the source lane for one destination lane.
__m128i _mm_shuffle_epi32(int imm8)(__m128i a) pure @safe
{
    return shufflevector!(int4, (imm8 >> 0) & 3,
                                (imm8 >> 2) & 3,
                                (imm8 >> 4) & 3,
                                (imm8 >> 6) & 3)(a, a);
}
unittest
{
    __m128i A = _mm_setr_epi32(0, 1, 2, 3);
    enum int SHUFFLE = _MM_SHUFFLE(0, 1, 2, 3);
    int4 B = cast(int4) _mm_shuffle_epi32!SHUFFLE(A);
    int[4] expectedB = [ 3, 2, 1, 0 ];
    assert(B.array == expectedB);
}

/// Single-operand shuffle of 2 doubles: bit 0 of `imm8` selects the source
/// lane for the lower result, bit 1 selects the source lane for the upper
/// result, both taken from `a`.
__m128d _mm_shuffle_pd (int imm8)(__m128d a) pure @safe
{
    return shufflevector!(double2, 0 + ( imm8 & 1 ),
                                   2 + ( (imm8 >> 1) & 1 ))(a, a);
}
unittest
{
    __m128d A = _mm_setr_pd(0.5f, 2.0f);
    enum int SHUFFLE = _MM_SHUFFLE2(1, 1);
    __m128d B = _mm_shuffle_pd!SHUFFLE(A);
    double[2] expectedB = [ 2.0f, 2.0f ];
    assert(B.array == expectedB);
}

/// Two-operand shuffle of doubles: bit 0 of `imm8` selects which lane of `a`
/// becomes the lower result, bit 1 selects which lane of `b` becomes the
/// upper result.
__m128d _mm_shuffle_pd (int imm8)(__m128d a, __m128d b) pure @safe
{
    return shufflevector!(double2, 0 + ( imm8 & 1 ),
                                   2 + ( (imm8 >> 1) & 1 ))(a, b);
}
unittest
{
    __m128d A = _mm_setr_pd(0.5, 2.0);
    __m128d B = _mm_setr_pd(4.0, 8.0);
    __m128d C = _mm_shuffle_pd!1(A, B);
    double[2] expectedC = [ 2.0, 4.0 ];
    assert(C.array == expectedC);
}

/// Permutes the upper four 16-bit lanes of `a` according to the 2-bit fields
/// of `imm8`; the lower four lanes are passed through.
__m128i _mm_shufflehi_epi16(int imm8)(__m128i a) pure @safe
{
    return cast(__m128i) shufflevector!(short8, 0, 1, 2, 3,
                                      4 + ( (imm8 >> 0) & 3 ),
                                      4 + ( (imm8 >> 2) & 3 ),
                                      4 + ( (imm8 >> 4) & 3 ),
                                      4 + ( (imm8 >> 6) & 3 ))(cast(short8)a, cast(short8)a);
}
unittest
{
    __m128i A = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
    enum int SHUFFLE = _MM_SHUFFLE(0, 1, 2, 3);
    short8 C = cast(short8) _mm_shufflehi_epi16!SHUFFLE(A);
    short[8] expectedC = [ 0, 1, 2, 3, 7, 6, 5, 4 ];
    assert(C.array == expectedC);
}

/// Permutes the lower four 16-bit lanes of `a` according to the 2-bit fields
/// of `imm8`; the upper four lanes are passed through.
__m128i _mm_shufflelo_epi16(int imm8)(__m128i a) pure @safe
{
    return cast(__m128i) shufflevector!(short8, ( (imm8 >> 0) & 3 ),
                                                ( (imm8 >> 2) & 3 ),
                                                ( (imm8 >> 4) & 3 ),
                                                ( (imm8 >> 6) & 3 ), 4, 5, 6, 7)(cast(short8)a, cast(short8)a);
}
unittest
{
    __m128i A = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
    enum int SHUFFLE = _MM_SHUFFLE(0, 1, 2, 3);
    short8 B = cast(short8) _mm_shufflelo_epi16!SHUFFLE(A);
    short[8] expectedB = [ 3, 2, 1, 0, 4, 5, 6, 7 ];
    assert(B.array == expectedB);
}

version(LDC)
{
    // LDC-only: direct aliases of the left-shift builtins.
    alias _mm_sll_epi32 = __builtin_ia32_pslld128;
    alias _mm_sll_epi64 = __builtin_ia32_psllq128;
    alias _mm_sll_epi16 = __builtin_ia32_psllw128;
    alias _mm_slli_epi32 = __builtin_ia32_pslldi128;
    alias _mm_slli_epi64 = __builtin_ia32_psllqi128;
    alias _mm_slli_epi16 = __builtin_ia32_psllwi128;
}
// TODO

/// Shifts the whole 128-bit value left by `imm8` bytes, shifting in zeros.
/// Any `imm8` with a bit set in 0xF0 yields an all-zero vector.
__m128i _mm_slli_si128(ubyte imm8)(__m128i op) pure @safe
{
    static if (imm8 & 0xF0)
        return _mm_setzero_si128();
    else
        // Indices 0..15 address the zero vector, 16..31 address `op`.
        return cast(__m128i) shufflevector!(byte16,
        16 - imm8, 17 - imm8, 18 - imm8, 19 - imm8, 20 - imm8, 21 - imm8, 22 - imm8, 23 - imm8,
        24 - imm8, 25 - imm8, 26 - imm8, 27 - imm8, 28 - imm8, 29 - imm8, 30 - imm8, 31 - imm8)
        (cast(byte16)_mm_setzero_si128(), cast(byte16)op);
}
unittest
{
    assert(_mm_slli_si128!4(_mm_set_epi32(4, 3, 2, 1)).array == [0, 1, 2, 3]);
}

version(LDC)
{
    // Disappeared with LDC 1.11
    static if (__VERSION__ < 2081)
        alias _mm_sqrt_pd = __builtin_ia32_sqrtpd;
    else
    {
        /// Square root of both double lanes, via `llvm_sqrt`.
        __m128d _mm_sqrt_pd(__m128d vec) pure @safe
        {
            vec.array[0] = llvm_sqrt(vec.array[0]);
            vec.array[1] = llvm_sqrt(vec.array[1]);
            return vec;
        }
    }
}
else
{
    /// Square root of both double lanes, via `std.math.sqrt`.
    __m128d _mm_sqrt_pd(__m128d vec) pure @safe
    {
        import std.math: sqrt;
        vec.array[0] = sqrt(vec.array[0]);
        vec.array[1] = sqrt(vec.array[1]);
        return vec;
    }
}


version(LDC)
{
    // Disappeared with LDC 1.11
    static if (__VERSION__ < 2081)
        alias _mm_sqrt_sd = __builtin_ia32_sqrtsd;
    else
    {
        /// Square root of the lower double, via `llvm_sqrt`; the upper lane
        /// is returned unchanged.
        __m128d _mm_sqrt_sd(__m128d vec) pure @safe
        {
            vec.array[0] = llvm_sqrt(vec.array[0]);
            return vec;
        }
    }
}
else
{
    /// Square root of the lower double, via `std.math.sqrt`; the upper lane
    /// is returned unchanged.
    __m128d _mm_sqrt_sd(__m128d vec) pure @safe
    {
        import std.math: sqrt;
        vec.array[0] = sqrt(vec.array[0]);
        return vec;
    }
}


version(LDC)
{
    // LDC-only: direct aliases of the right-shift builtins.
    alias _mm_sra_epi16 = __builtin_ia32_psraw128;
    alias _mm_sra_epi32 = __builtin_ia32_psrad128;
    alias _mm_srai_epi16 = __builtin_ia32_psrawi128;
    alias _mm_srai_epi32 = __builtin_ia32_psradi128;

    alias _mm_srl_epi16 = __builtin_ia32_psrlw128;
    alias _mm_srl_epi32 = __builtin_ia32_psrld128;
    alias _mm_srl_epi64 = __builtin_ia32_psrlq128;
    alias _mm_srli_epi16 = __builtin_ia32_psrlwi128;
    alias _mm_srli_epi32 = __builtin_ia32_psrldi128;
    alias _mm_srli_epi64 = __builtin_ia32_psrlqi128;
}
// TODO

/// Shifts the whole 128-bit value right by `imm8` bytes, shifting in zeros.
/// Any `imm8` with a bit set in 0xF0 yields an all-zero vector.
__m128i _mm_srli_si128(ubyte imm8)(__m128i op) pure @safe
{
    static if (imm8 & 0xF0)
        return _mm_setzero_si128();
    else
        // Indices 0..15 address `op`, 16..31 address the zero vector.
        return cast(__m128i) shufflevector!(byte16,
                                            imm8+0, imm8+1, imm8+2, imm8+3, imm8+4, imm8+5, imm8+6, imm8+7,
                                            imm8+8, imm8+9, imm8+10, imm8+11, imm8+12, imm8+13, imm8+14, imm8+15)
                                           (cast(byte16) op, cast(byte16)_mm_setzero_si128());
}

// Note: this is a bonus intrinsic
/// Byte-wise right shift of a float vector, done on its bit pattern.
__m128 _mm_srli_si128(ubyte imm8)(__m128 op) pure @safe
{
    return cast(__m128)_mm_srli_si128!imm8(cast(__m128i)op);
}
unittest
{
    // test that cast works at all
    __m128 A = cast(__m128) _mm_set1_epi32(0x3F800000);
    assert(A.array == [1.0f, 1.0f, 1.0f, 1.0f]);

    // test _mm_srli_si128 for __m128i
    assert(_mm_srli_si128!4(_mm_set_epi32(4, 3, 2, 1)).array == [2, 3, 4, 0]);
    assert(_mm_srli_si128!8(_mm_set_ps(4.0f, 3.0f, 2.0f, 1.0f)).array == [3.0f, 4.0f, 0, 0]);
}

/// Byte-wise right shift of a double vector, done on its bit pattern.
__m128d _mm_srli_si128(ubyte imm8)(__m128d op) pure @safe
{
    return cast(__m128d) _mm_srli_si128!imm8(cast(__m128i)op);
}

/// Stores 2 doubles through a `__m128d*` view of `mem_addr`
/// (the pointer is written as a whole vector).
void _mm_store_pd (double* mem_addr, __m128d a) pure
{
    __m128d* aligned = cast(__m128d*)mem_addr;
    *aligned = a;
}

/// Stores the lower double of `a` into both slots at `mem_addr`,
/// through a `__m128d*` view of the pointer.
void _mm_store_pd1 (double* mem_addr, __m128d a) pure
{
    __m128d* aligned = cast(__m128d*)mem_addr;
    *aligned = shufflevector!(double2, 0, 0)(a, a);
}

/// Stores the lower double of `a` at `mem_addr`.
void _mm_store_sd (double* mem_addr, __m128d a) pure @safe
{
    *mem_addr = extractelement!(double2, 0)(a);
}

/// Stores a whole integer vector by assigning through `mem_addr`.
void _mm_store_si128 (__m128i* mem_addr, __m128i a) pure @safe
{
    *mem_addr = a;
}

alias _mm_store1_pd = _mm_store_pd1;

/// Stores the upper double of `a` at `mem_addr`.
void _mm_storeh_pd (double* mem_addr, __m128d a) pure @safe
{
    *mem_addr = extractelement!(double2, 1)(a);
}

/// Stores the lower 64 bits of `a` at `mem_addr`.
void _mm_storel_epi64 (__m128i* mem_addr, __m128i a) pure @safe
{
    long* dest = cast(long*)mem_addr;
    *dest = extractelement!(long2, 0)(cast(long2)a);
}

/// Stores the lower double of `a` at `mem_addr`.
void _mm_storel_pd (double* mem_addr, __m128d a) pure @safe
{
    *mem_addr = extractelement!(double2, 0)(a);
}

/// Stores the 2 doubles of `a` in swapped order, through a `__m128d*` view
/// of `mem_addr`.
void _mm_storer_pd (double* mem_addr, __m128d a) pure
{
    __m128d* aligned = cast(__m128d*)mem_addr;
    *aligned = shufflevector!(double2, 1, 0)(a, a);
}

/// Stores 2 doubles via `storeUnaligned`.
void _mm_storeu_pd (double* mem_addr, __m128d a) pure @safe
{
    storeUnaligned!double2(a, mem_addr);
}

/// Stores an integer vector via `storeUnaligned`.
void _mm_storeu_si128 (__m128i* mem_addr, __m128i a) pure @safe
{
    storeUnaligned!__m128i(a, cast(int*)mem_addr);
}

// TODO: _mm_stream_pd
// TODO: _mm_stream_si128
// TODO: _mm_stream_si32
// TODO: _mm_stream_si64

/// Lane-wise `-` on `a` and `b` viewed as 8 x 16-bit integers.
__m128i _mm_sub_epi16(__m128i a, __m128i b) pure @safe
{
    return cast(__m128i)(cast(short8)a - cast(short8)b);
}

/// Lane-wise `-` on `a` and `b` viewed as 4 x 32-bit integers.
__m128i _mm_sub_epi32(__m128i a, __m128i b) pure @safe
{
    return cast(__m128i)(cast(int4)a - cast(int4)b);
}

/// Lane-wise `-` on `a` and `b` viewed as 2 x 64-bit integers.
__m128i _mm_sub_epi64(__m128i a, __m128i b) pure @safe
{
    return cast(__m128i)(cast(long2)a - cast(long2)b);
}

/// Lane-wise `-` on `a` and `b` viewed as 16 x 8-bit integers.
__m128i _mm_sub_epi8(__m128i a, __m128i b) pure @safe
{
    return cast(__m128i)(cast(byte16)a - cast(byte16)b);
}

/// Lane-wise `-` on two vectors of 2 doubles.
__m128d _mm_sub_pd(__m128d a, __m128d b) pure @safe
{
    return a - b;
}

/// Subtracts the lower double of `b` from the lower double of `a`;
/// the upper lane of `a` is returned unchanged.
__m128d _mm_sub_sd(__m128d a, __m128d b) pure @safe
{
    a[0] -= b[0];
    return a;
}
unittest
{
    __m128d a = [1.5, -2.0];
    a = _mm_sub_sd(a, a);
    assert(a.array == [0.0, -2.0]);
}


// MMXREG: _mm_sub_si64

version(LDC)
{
    // LDC-only: direct aliases of the saturating-subtract and `ucomisd` builtins.
    alias _mm_subs_epi16 = __builtin_ia32_psubsw128;
    alias _mm_subs_epi8 = __builtin_ia32_psubsb128;
    alias _mm_subs_epu16 = __builtin_ia32_psubusw128;
    alias _mm_subs_epu8 = __builtin_ia32_psubusb128;

    alias _mm_ucomieq_sd = __builtin_ia32_ucomisdeq;
    alias _mm_ucomige_sd = __builtin_ia32_ucomisdge;
    alias _mm_ucomigt_sd = __builtin_ia32_ucomisdgt;
    alias _mm_ucomile_sd = __builtin_ia32_ucomisdle;
    alias _mm_ucomilt_sd = __builtin_ia32_ucomisdlt;
    alias _mm_ucomineq_sd = __builtin_ia32_ucomisdneq;
}
// TODO

/// Returns a double vector that is deliberately left uninitialized (`= void`).
__m128d _mm_undefined_pd() pure @safe
{
    __m128d result = void;
    return result;
}
/// Returns an integer vector that is deliberately left uninitialized (`= void`).
__m128i _mm_undefined_si128() pure @safe
{
    __m128i result = void;
    return result;
}

/// Interleaves the upper four 16-bit lanes of `a` and `b`: a4, b4, a5, b5, ...
__m128i _mm_unpackhi_epi16 (__m128i a, __m128i b) pure @safe
{
    return cast(__m128i) shufflevector!(short8, 4, 12, 5, 13, 6, 14, 7, 15)
                                       (cast(short8)a, cast(short8)b);
}

/// Interleaves the upper two 32-bit lanes of `a` and `b`: a2, b2, a3, b3.
__m128i _mm_unpackhi_epi32 (__m128i a, __m128i b) pure @safe
{
    return shufflevector!(int4, 2, 6, 3, 7)(cast(int4)a, cast(int4)b);
}

/// Returns the upper 64-bit lanes of `a` and `b`: a1, b1.
__m128i _mm_unpackhi_epi64 (__m128i a, __m128i b) pure @safe
{
    return cast(__m128i) shufflevector!(long2, 1, 3)(cast(long2)a, cast(long2)b);
}

/// Interleaves the upper eight 8-bit lanes of `a` and `b`: a8, b8, a9, b9, ...
__m128i _mm_unpackhi_epi8 (__m128i a, __m128i b) pure @safe
{
    return cast(__m128i)shufflevector!(byte16, 8,  24,  9, 25, 10, 26, 11, 27,
                                               12, 28, 13, 29, 14, 30, 15, 31)
                                      (cast(byte16)a, cast(byte16)b);
}

/// Returns the upper doubles of `a` and `b`: a1, b1.
__m128d _mm_unpackhi_pd (__m128d a, __m128d b) pure @safe
{
    return shufflevector!(__m128d, 1, 3)(a, b);
}

/// Interleaves the lower four 16-bit lanes of `a` and `b`: a0, b0, a1, b1, ...
__m128i _mm_unpacklo_epi16 (__m128i a, __m128i b) pure @safe
{
    return cast(__m128i) shufflevector!(short8, 0, 8, 1, 9, 2, 10, 3, 11)
                                       (cast(short8)a, cast(short8)b);
}

/// Interleaves the lower two 32-bit lanes of `a` and `b`: a0, b0, a1, b1.
__m128i _mm_unpacklo_epi32 (__m128i a, __m128i b) pure @safe
{
    // Indices 0..3 address `a`, 4..7 address `b`; b1 is index 5.
    return shufflevector!(int4, 0, 4, 1, 5)
                         (cast(int4)a, cast(int4)b);
}
unittest
{
    __m128i A = _mm_setr_epi32(0, 1, 2, 3);
    __m128i B = _mm_setr_epi32(4, 5, 6, 7);
    int4 C = _mm_unpacklo_epi32(A, B);
    int[4] expectedC = [ 0, 4, 1, 5 ];
    assert(C.array == expectedC);
}

/// Returns the lower 64-bit lanes of `a` and `b`: a0, b0.
__m128i _mm_unpacklo_epi64 (__m128i a, __m128i b) pure @safe
{
    return cast(__m128i) shufflevector!(long2, 0, 2)
                                       (cast(long2)a, cast(long2)b);
}

/// Interleaves the lower eight 8-bit lanes of `a` and `b`: a0, b0, a1, b1, ...
__m128i _mm_unpacklo_epi8 (__m128i a, __m128i b) pure @safe
{
    return cast(__m128i) shufflevector!(byte16, 0, 16, 1, 17, 2, 18, 3, 19,
                                                4, 20, 5, 21, 6, 22, 7, 23)
                                       (cast(byte16)a, cast(byte16)b);
}

/// Returns the lower doubles of `a` and `b`: a0, b0.
__m128d _mm_unpacklo_pd (__m128d a, __m128d b) pure @safe
{
    return shufflevector!(__m128d, 0, 2)(a, b);
}

/// Bitwise XOR of the bit patterns of two double vectors.
__m128d _mm_xor_pd (__m128d a, __m128d b) pure @safe
{
    return cast(__m128d)(cast(__m128i)a ^ cast(__m128i)b);
}

/// Bitwise XOR of two integer vectors.
__m128i _mm_xor_si128 (__m128i a, __m128i b) pure @safe
{
    return a ^ b;
}

unittest
{
    // distance between two points in 4D
    float distance(float[4] a, float[4] b) nothrow @nogc
    {
        __m128 va = _mm_loadu_ps(a.ptr);
        __m128 vb = _mm_loadu_ps(b.ptr);
        __m128 diffSquared = _mm_sub_ps(va, vb);
        diffSquared = _mm_mul_ps(diffSquared, diffSquared);
        // Horizontal sum: fold the upper half onto the lower half, then lane 1 onto lane 0.
        __m128 sum = _mm_add_ps(diffSquared, _mm_srli_si128!8(diffSquared));
        sum = _mm_add_ps(sum, _mm_srli_si128!4(sum));
        return _mm_cvtss_f32(_mm_sqrt_ss(sum));
    }
    assert(distance([0, 2, 0, 0], [0, 0, 0, 0]) == 2);
}