1 /** 2 * Copyright: Copyright Auburn Sounds 2016-2018. 3 * License: $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0) 4 * Authors: Guillaume Piolat 5 */ 6 module inteli.emmintrin; 7 8 public import inteli.types; 9 public import inteli.xmmintrin; // SSE2 includes SSE1 10 11 import inteli.internals; 12 13 nothrow @nogc: 14 15 // SSE2 instructions 16 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#techs=SSE2 17 18 __m128i _mm_add_epi16 (__m128i a, __m128i b) pure @safe 19 { 20 return cast(__m128i)(cast(short8)a + cast(short8)b); 21 } 22 23 __m128i _mm_add_epi32 (__m128i a, __m128i b) pure @safe 24 { 25 return cast(__m128i)(cast(int4)a + cast(int4)b); 26 } 27 28 __m128i _mm_add_epi64 (__m128i a, __m128i b) pure @safe 29 { 30 return cast(__m128i)(cast(long2)a + cast(long2)b); 31 } 32 33 __m128i _mm_add_epi8 (__m128i a, __m128i b) pure @safe 34 { 35 return cast(__m128i)(cast(byte16)a + cast(byte16)b); 36 } 37 38 __m128d _mm_add_sd(__m128d a, __m128d b) pure @safe 39 { 40 a[0] += b[0]; 41 return a; 42 } 43 unittest 44 { 45 __m128d a = [1.5, -2.0]; 46 a = _mm_add_sd(a, a); 47 assert(a.array == [3.0, -2.0]); 48 } 49 50 51 __m128d _mm_add_pd (__m128d a, __m128d b) pure @safe 52 { 53 return a + b; 54 } 55 unittest 56 { 57 __m128d a = [1.5, -2.0]; 58 a = _mm_add_pd(a, a); 59 assert(a.array == [3.0, -4.0]); 60 } 61 62 // MMXREG: _mm_add_si64 63 64 version(LDC) 65 { 66 alias _mm_adds_epi16 = __builtin_ia32_paddsw128; 67 } 68 else 69 { 70 __m128i _mm_adds_epi16(__m128i a, __m128i b) pure @trusted 71 { 72 short[8] res; 73 short8 sa = cast(short8)a; 74 short8 sb = cast(short8)b; 75 foreach(i; 0..8) 76 res[i] = saturateSignedIntToSignedShort(sa.array[i] + sb.array[i]); 77 return _mm_loadu_si128(cast(int4*)res.ptr); 78 } 79 } 80 unittest 81 { 82 short8 res = cast(short8) _mm_adds_epi16(_mm_set_epi16(7, 6, 5, 4, 3, 2, 1, 0), 83 _mm_set_epi16(7, 6, 5, 4, 3, 2, 1, 0)); 84 static immutable short[8] correctResult = [0, 2, 4, 6, 8, 10, 12, 14]; 85 
assert(res.array == correctResult);
}

version(LDC)
{
    alias _mm_adds_epi8 = __builtin_ia32_paddsb128;
}
else
{
    __m128i _mm_adds_epi8(__m128i a, __m128i b) pure @trusted
    {
        byte[16] res;
        byte16 sa = cast(byte16)a;
        byte16 sb = cast(byte16)b;
        foreach(i; 0..16)
            res[i] = saturateSignedWordToSignedByte(sa.array[i] + sb.array[i]);
        return _mm_loadu_si128(cast(int4*)res.ptr);
    }
}
unittest
{
    byte16 res = cast(byte16) _mm_adds_epi8(_mm_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0),
                                            _mm_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0));
    static immutable byte[16] correctResult = [0, 2, 4, 6, 8, 10, 12, 14,
                                               16, 18, 20, 22, 24, 26, 28, 30];
    assert(res.array == correctResult);
}

version(LDC)
{
    alias _mm_adds_epu8 = __builtin_ia32_paddusb128;
}
else
{
    __m128i _mm_adds_epu8(__m128i a, __m128i b) pure @trusted
    {
        ubyte[16] res;
        byte16 sa = cast(byte16)a;
        byte16 sb = cast(byte16)b;
        foreach(i; 0..16)
            res[i] = saturateSignedWordToUnsignedByte(cast(ubyte)(sa.array[i]) + cast(ubyte)(sb.array[i]));
        return _mm_loadu_si128(cast(int4*)res.ptr);
    }
}
unittest // added: the unsigned saturating adds previously had no test coverage
{
    byte16 res = cast(byte16) _mm_adds_epu8(_mm_set1_epi8(cast(char)250), _mm_set1_epi8(cast(char)10));
    foreach(i; 0..16)
        assert(res.array[i] == cast(byte)255); // 250 + 10 saturates to 0xFF
}

version(LDC)
{
    alias _mm_adds_epu16 = __builtin_ia32_paddusw128;
}
else
{
    __m128i _mm_adds_epu16(__m128i a, __m128i b) pure @trusted
    {
        ushort[8] res;
        short8 sa = cast(short8)a;
        short8 sb = cast(short8)b;
        foreach(i; 0..8)
            res[i] = saturateSignedIntToUnsignedShort(cast(ushort)(sa.array[i]) + cast(ushort)(sb.array[i]));
        return _mm_loadu_si128(cast(int4*)res.ptr);
    }
}
unittest // added: saturation check for the 16-bit unsigned add
{
    short8 res = cast(short8) _mm_adds_epu16(_mm_set1_epi16(cast(short)65000), _mm_set1_epi16(1000));
    foreach(i; 0..8)
        assert(res.array[i] == cast(short)65535); // 65000 + 1000 saturates to 0xFFFF
}

__m128d _mm_and_pd (__m128d a, __m128d b) pure @safe
{
    return cast(__m128d)( cast(__m128i)a & cast(__m128i)b );
}

__m128i _mm_and_si128 (__m128i a, __m128i b) pure @safe
{
    return a & b;
}

__m128d _mm_andnot_pd (__m128d a, __m128d b) pure @safe
{
    return cast(__m128d)( (~cast(__m128i)a)
& cast(__m128i)b ); 160 } 161 162 __m128i _mm_andnot_si128 (__m128i a, __m128i b) pure @safe 163 { 164 return (~a) & b; 165 } 166 167 version(LDC) 168 { 169 pragma(LDC_intrinsic, "llvm.x86.sse2.pavg.w") 170 short8 _mm_avg_epu16(short8, short8) pure @safe; 171 172 pragma(LDC_intrinsic, "llvm.x86.sse2.pavg.b") 173 byte16 _mm_avg_epu8(byte16, byte16) pure @safe; 174 } 175 // TODO 176 177 178 // TODO: __m128i _mm_bslli_si128 (__m128i a, int imm8) 179 // TODO: __m128i _mm_bsrli_si128 (__m128i a, int imm8) 180 181 __m128 _mm_castpd_ps (__m128d a) pure @safe 182 { 183 return cast(__m128)a; 184 } 185 186 __m128i _mm_castpd_si128 (__m128d a) pure @safe 187 { 188 return cast(__m128i)a; 189 } 190 191 __m128d _mm_castps_pd (__m128 a) pure @safe 192 { 193 return cast(__m128d)a; 194 } 195 196 __m128i _mm_castps_si128 (__m128 a) pure @safe 197 { 198 return cast(__m128i)a; 199 } 200 201 __m128d _mm_castsi128_pd (__m128i a) pure @safe 202 { 203 return cast(__m128d)a; 204 } 205 206 __m128 _mm_castsi128_ps (__m128i a) pure @safe 207 { 208 return cast(__m128)a; 209 } 210 211 version(LDC) 212 { 213 alias _mm_clflush = __builtin_ia32_clflush; 214 } 215 // TODO 216 217 version(LDC) 218 { 219 // just used for "ord" intrinsics 220 pragma(LDC_intrinsic, "llvm.x86.sse2.cmp.pd") 221 double2 __builtin_ia32_cmppd(double2, double2, byte) pure @safe; 222 } 223 224 __m128i _mm_cmpeq_epi16 (__m128i a, __m128i b) pure @safe 225 { 226 return cast(__m128i) equalMask!short8(cast(short8)a, cast(short8)b); 227 } 228 unittest 229 { 230 short8 A = [-3, -2, -1, 0, 0, 1, 2, 3]; 231 short8 B = [ 4, 3, 2, 1, 0, -1, -2, -3]; 232 short[8] E = [ 0, 0, 0, 0, -1, 0, 0, 0]; 233 short8 R = cast(short8)(_mm_cmpeq_epi16(cast(__m128i)A, cast(__m128i)B)); 234 assert(R.array == E); 235 } 236 237 __m128i _mm_cmpeq_epi32 (__m128i a, __m128i b) pure @safe 238 { 239 return equalMask!__m128i(a, b); 240 } 241 unittest 242 { 243 int4 A = [-3, -2, -1, 0]; 244 int4 B = [ 4, -2, 2, 0]; 245 int[4] E = [ 0, -1, 0, -1]; 246 int4 R = 
cast(int4)(_mm_cmpeq_epi32(A, B)); // bug fix: unittest called _mm_cmpeq_epi16, not the intrinsic under test
    assert(R.array == E);
}

__m128i _mm_cmpeq_epi8 (__m128i a, __m128i b) pure @safe
{
    return cast(__m128i) equalMask!byte16(cast(byte16)a, cast(byte16)b);
}
unittest
{
    __m128i A = _mm_setr_epi8(1, 2, 3, 1, 2, 1, 1, 2, 3, 2, 1, 0, 0, 1, 2, 1);
    __m128i B = _mm_setr_epi8(2, 2, 1, 2, 3, 1, 2, 3, 2, 1, 0, 0, 1, 2, 1, 1);
    byte16 C = cast(byte16) _mm_cmpeq_epi8(A, B);
    byte[16] correct = [0,-1, 0, 0, 0,-1, 0, 0, 0, 0, 0,-1, 0, 0, 0, -1];
    assert(C.array == correct);
}


version(LDC)
{
    __m128d _mm_cmpeq_pd (__m128d a, __m128d b) pure @safe // TODO
    {
        return cast(__m128d) equalMask!double2(a, b);
    }

    __m128d _mm_cmpeq_sd (__m128d a, __m128d b) pure @safe // TODO
    {
        return __builtin_ia32_cmpsd(a, b, 0);
    }

    __m128d _mm_cmpge_pd (__m128d a, __m128d b) pure @safe // TODO
    {
        return cast(__m128d) greaterOrEqualMask!double2(a, b);
    }

    __m128d _mm_cmpge_sd (__m128d a, __m128d b) pure @safe // TODO
    {
        return __builtin_ia32_cmpsd(b, a, 2);
    }
}


__m128i _mm_cmpgt_epi16 (__m128i a, __m128i b) pure @safe
{
    return cast(__m128i)( greaterMask!short8(cast(short8)a, cast(short8)b));
}
unittest
{
    short8 A = [-3, -2, -1, 0, 0, 1, 2, 3];
    short8 B = [ 4, 3, 2, 1, 0, -1, -2, -3];
    short[8] E = [ 0, 0, 0, 0, 0, -1, -1, -1];
    short8 R = cast(short8)(_mm_cmpgt_epi16(cast(__m128i)A, cast(__m128i)B));
    assert(R.array == E);
}

__m128i _mm_cmpgt_epi32 (__m128i a, __m128i b) pure @safe
{
    return cast(__m128i)( greaterMask!int4(a, b));
}
unittest
{
    int4 A = [-3, 2, -1, 0];
    int4 B = [ 4, -2, 2, 0];
    int[4] E = [ 0, -1, 0, 0];
    int4 R = cast(int4)(_mm_cmpgt_epi32(A, B));
    assert(R.array == E);
}

__m128i _mm_cmpgt_epi8 (__m128i a, __m128i b) pure @safe
{
    return cast(__m128i)(
greaterMask!byte16(cast(byte16)a, cast(byte16)b));
}
unittest
{
    __m128i A = _mm_setr_epi8(1, 2, 3, 1, 2, 1, 1, 2, 3, 2, 1, 0, 0, 1, 2, 1);
    __m128i B = _mm_setr_epi8(2, 2, 1, 2, 3, 1, 2, 3, 2, 1, 0, 0, 1, 2, 1, 1);
    byte16 C = cast(byte16) _mm_cmpgt_epi8(A, B);
    byte[16] correct = [0, 0,-1, 0, 0, 0, 0, 0,-1,-1,-1, 0, 0, 0,-1, 0];
    assert(C.array == correct);
}

version(LDC)
{

    __m128d _mm_cmpgt_pd (__m128d a, __m128d b) pure @safe // TODO
    {
        return cast(__m128d) greaterMask!double2(a, b);
    }

    __m128d _mm_cmpgt_sd (__m128d a, __m128d b) pure @safe // TODO
    {
        return __builtin_ia32_cmpsd(b, a, 1);
    }

    __m128d _mm_cmple_pd (__m128d a, __m128d b) pure @safe // TODO
    {
        return cast(__m128d) greaterOrEqualMask!double2(b, a);
    }

    __m128d _mm_cmple_sd (__m128d a, __m128d b) pure @safe // TODO
    {
        return __builtin_ia32_cmpsd(a, b, 2);
    }
}


__m128i _mm_cmplt_epi16 (__m128i a, __m128i b) pure @safe
{
    return _mm_cmpgt_epi16(b, a);
}

__m128i _mm_cmplt_epi32 (__m128i a, __m128i b) pure @safe
{
    return _mm_cmpgt_epi32(b, a);
}

__m128i _mm_cmplt_epi8 (__m128i a, __m128i b) pure @safe
{
    return _mm_cmpgt_epi8(b, a);
}

version(LDC)
{
    __m128d _mm_cmplt_pd (__m128d a, __m128d b) pure @safe // TODO
    {
        return cast(__m128d) greaterMask!double2(b, a);
    }

    __m128d _mm_cmplt_sd (__m128d a, __m128d b) pure @safe // TODO
    {
        return __builtin_ia32_cmpsd(a, b, 1);
    }

    __m128d _mm_cmpneq_pd (__m128d a, __m128d b) pure @safe // TODO
    {
        return cast(__m128d) notEqualMask!double2(a, b);
    }

    __m128d _mm_cmpneq_sd (__m128d a, __m128d b) pure @safe // TODO
    {
        return __builtin_ia32_cmpsd(a, b, 4);
    }

    __m128d _mm_cmpnge_pd (__m128d a, __m128d b) pure @safe // TODO
    {
        return _mm_cmplt_pd(b, a);
    }

__m128d _mm_cmpnge_sd (__m128d a, __m128d b) pure @safe // TODO 397 { 398 return __builtin_ia32_cmpsd(b, a, 6); 399 } 400 401 __m128d _mm_cmpngt_pd (__m128d a, __m128d b) pure @safe // TODO 402 { 403 return _mm_cmple_pd(b, a); 404 } 405 406 __m128d _mm_cmpngt_sd (__m128d a, __m128d b) pure @safe // TODO 407 { 408 return __builtin_ia32_cmpsd(b, a, 5); 409 } 410 411 __m128d _mm_cmpnle_pd (__m128d a, __m128d b) pure @safe // TODO 412 { 413 return _mm_cmpgt_pd(b, a); 414 } 415 416 __m128d _mm_cmpnle_sd (__m128d a, __m128d b) pure @safe // TODO 417 { 418 return __builtin_ia32_cmpsd(a, b, 6); 419 } 420 421 __m128d _mm_cmpnlt_pd (__m128d a, __m128d b) pure @safe // TODO 422 { 423 return _mm_cmpge_pd(b, a); 424 } 425 426 __m128d _mm_cmpnlt_sd (__m128d a, __m128d b) pure @safe // TODO 427 { 428 return __builtin_ia32_cmpsd(a, b, 5); 429 } 430 431 __m128d _mm_cmpord_pd (__m128d a, __m128d b) pure @safe // TODO 432 { 433 return __builtin_ia32_cmppd(a, b, 7); 434 } 435 436 __m128d _mm_cmpord_sd (__m128d a, __m128d b) pure @safe // TODO 437 { 438 return __builtin_ia32_cmpsd(a, b, 7); 439 } 440 441 __m128d _mm_cmpunord_pd (__m128d a, __m128d b) pure @safe // TODO 442 { 443 return __builtin_ia32_cmppd(a, b, 3); 444 } 445 446 __m128d _mm_cmpunord_sd (__m128d a, __m128d b) pure @safe // TODO 447 { 448 return __builtin_ia32_cmpsd(a, b, 3); 449 } 450 } 451 452 version(LDC) 453 { 454 alias _mm_comieq_sd = __builtin_ia32_comisdeq; // TODO 455 alias _mm_comige_sd = __builtin_ia32_comisdge; // TODO 456 alias _mm_comigt_sd = __builtin_ia32_comisdgt; // TODO 457 alias _mm_comile_sd = __builtin_ia32_comisdle; // TODO 458 alias _mm_comilt_sd = __builtin_ia32_comisdlt; // TODO 459 alias _mm_comineq_sd = __builtin_ia32_comisdneq; // TODO 460 } 461 462 // TODO: alias _mm_cvtepi32_pd = __builtin_ia32_cvtdq2pd; 463 464 // PERF: replace with __builtin_convertvector when available 465 __m128 _mm_cvtepi32_ps(__m128i a) pure @safe 466 { 467 __m128 res; 468 res.array[0] = cast(float)a.array[0]; 469 
res.array[1] = cast(float)a.array[1]; 470 res.array[2] = cast(float)a.array[2]; 471 res.array[3] = cast(float)a.array[3]; 472 return res; 473 } 474 unittest 475 { 476 __m128 a = _mm_cvtepi32_ps(_mm_setr_epi32(-1, 0, 1, 1000)); 477 assert(a.array == [-1.0f, 0.0f, 1.0f, 1000.0f]); 478 } 479 480 481 version(LDC) // TODO 482 { 483 alias _mm_cvtpd_epi32 = __builtin_ia32_cvtpd2dq; 484 } 485 486 // MMXREG: _mm_cvtpd_pi32 487 version(LDC) 488 { 489 alias _mm_cvtpd_ps = __builtin_ia32_cvtpd2ps; 490 // MMXREG: _mm_cvtpi32_pd 491 alias _mm_cvtps_epi32 = __builtin_ia32_cvtps2dq; 492 } 493 // TODO 494 495 // TODO: alias _mm_cvtps_pd = __builtin_ia32_cvtps2pd; 496 497 double _mm_cvtsd_f64 (__m128d a) pure @safe 498 { 499 return extractelement!(double2, 0)(a); 500 } 501 502 version(LDC) 503 { 504 alias _mm_cvtsd_si32 = __builtin_ia32_cvtsd2si; 505 alias _mm_cvtsd_si64 = __builtin_ia32_cvtsd2si64; 506 alias _mm_cvtsd_si64x = _mm_cvtsd_si64; 507 } 508 // TODO 509 510 version(LDC) 511 { 512 alias _mm_cvtsd_ss = __builtin_ia32_cvtsd2ss; 513 } 514 // TODO 515 516 int _mm_cvtsi128_si32 (__m128i a) pure @safe 517 { 518 return a[0]; 519 } 520 521 long _mm_cvtsi128_si64 (__m128i a) pure @safe 522 { 523 long2 la = cast(long2)a; 524 return la[0]; 525 } 526 alias _mm_cvtsi128_si64x = _mm_cvtsi128_si64; 527 528 __m128d _mm_cvtsi32_sd(__m128d v, int x) pure @safe 529 { 530 v[0] = cast(double)x; 531 return v; 532 } 533 unittest 534 { 535 __m128d a = _mm_cvtsi32_sd(_mm_set1_pd(0.0f), 42); 536 assert(a.array == [42.0, 0]); 537 } 538 539 __m128i _mm_cvtsi32_si128 (int a) pure @safe 540 { 541 int4 r = [0, 0, 0, 0]; 542 r[0] = a; 543 return r; 544 } 545 546 // Note: on macOS, using "llvm.x86.sse2.cvtsi642sd" was buggy 547 __m128d _mm_cvtsi64_sd(__m128d v, long x) pure @safe 548 { 549 v[0] = cast(double)x; 550 return v; 551 } 552 unittest 553 { 554 __m128d a = _mm_cvtsi64_sd(_mm_set1_pd(0.0f), 42); 555 assert(a.array == [42.0, 0]); 556 } 557 558 __m128i _mm_cvtsi64_si128 (long a) pure @safe 559 { 560 
long2 r = [0, 0];
    r[0] = a;
    return cast(__m128i)(r);
}

alias _mm_cvtsi64x_sd = _mm_cvtsi64_sd;
alias _mm_cvtsi64x_si128 = _mm_cvtsi64_si128;

double2 _mm_cvtss_sd(double2 v, float4 x) pure @safe
{
    v[0] = x[0];
    return v;
}
unittest
{
    __m128d a = _mm_cvtss_sd(_mm_set1_pd(0.0f), _mm_set1_ps(42.0f));
    assert(a.array == [42.0, 0]);
}

version(LDC)
{
    alias _mm_cvttpd_epi32 = __builtin_ia32_cvttpd2dq;
    //MMXREG: _mm_cvttpd_pi32
    alias _mm_cvttps_epi32 = __builtin_ia32_cvttps2dq;
    alias _mm_cvttsd_si32 = __builtin_ia32_cvttsd2si;
    alias _mm_cvttsd_si64 = __builtin_ia32_cvttsd2si64;
    alias _mm_cvttsd_si64x = _mm_cvttsd_si64;
}
// TODO


// Bug fix: this SSE2 double-precision division was misnamed `_mm_div_ps`,
// which is the SSE1 float intrinsic already provided by inteli.xmmintrin.
__m128d _mm_div_pd(__m128d a, __m128d b) pure @safe
{
    return a / b;
}
unittest
{
    __m128d a = [6.0, 9.0];
    __m128d b = [3.0, 4.5];
    assert(_mm_div_pd(a, b).array == [2.0, 2.0]);
}

// Kept only for backward compatibility with code that used the misnamed overload.
deprecated("misnamed; use _mm_div_pd instead")
__m128d _mm_div_ps(__m128d a, __m128d b) pure @safe
{
    return a / b;
}

__m128d _mm_div_sd(__m128d a, __m128d b) pure @safe
{
    a[0] /= b[0];
    return a;
}
unittest
{
    __m128d a = [2.0, 4.5];
    a = _mm_div_sd(a, a);
    assert(a.array == [1.0, 4.5]);
}

int _mm_extract_epi16(int imm8)(__m128i a) pure @safe
{
    return shufflevector!(short8, imm8)(a);
}

__m128i _mm_insert_epi16(int imm8)(__m128i a, int i) pure @safe
{
    return insertelement!(short8, imm8)(a, i);
}

version(LDC)
{
    alias _mm_lfence = __builtin_ia32_lfence;
}
// TODO


__m128d _mm_load_pd (const(double) * mem_addr) pure
{
    __m128d* aligned = cast(__m128d*)mem_addr;
    return *aligned;
}

__m128d _mm_load_pd1 (const(double)* mem_addr) pure
{
    double[2] arr = [*mem_addr, *mem_addr];
    return loadUnaligned!(double2)(&arr[0]);
}

__m128d _mm_load_sd (const(double)* mem_addr) pure @safe
{
    double2 r = [0, 0];
    r[0] = *mem_addr;
    return r;
}
unittest
{
    double x = -42;
    __m128d a = _mm_load_sd(&x);
    assert(a.array == [-42.0, 0.0]);
}

__m128i _mm_load_si128 (const(__m128i)* mem_addr)
pure @trusted 652 { 653 return *mem_addr; 654 } 655 656 alias _mm_load1_pd = _mm_load_pd1; 657 658 __m128d _mm_loadh_pd (__m128d a, const(double)* mem_addr) pure @safe 659 { 660 a[1] = *mem_addr; 661 return a; 662 } 663 664 // Note: strange signature since the memory doesn't have to aligned 665 __m128i _mm_loadl_epi64 (const(__m128i)* mem_addr) pure @safe 666 { 667 auto pLong = cast(const(long)*)mem_addr; 668 long2 r = [0, 0]; 669 r[0] = *pLong; 670 return cast(__m128i)(r); 671 } 672 673 __m128d _mm_loadl_pd (__m128d a, const(double)* mem_addr) pure @safe 674 { 675 a[0] = *mem_addr; 676 return a; 677 } 678 679 __m128d _mm_loadr_pd (const(double)* mem_addr) pure @trusted 680 { 681 __m128d a = _mm_load_pd(mem_addr); 682 return shufflevector!(__m128d, 1, 0)(a, a); 683 } 684 685 __m128d _mm_loadu_pd (const(double)* mem_addr) pure @safe 686 { 687 return loadUnaligned!(double2)(mem_addr); 688 } 689 690 __m128i _mm_loadu_si128 (const(__m128i)* mem_addr) pure @trusted 691 { 692 return loadUnaligned!(__m128i)(cast(int*)mem_addr); 693 } 694 695 __m128i _mm_loadu_si32 (const(void)* mem_addr) pure @trusted 696 { 697 int r = *cast(int*)(mem_addr); 698 int4 result = [0, 0, 0, 0]; 699 result[0] = r; 700 return result; 701 } 702 unittest 703 { 704 int r = 42; 705 __m128i A = _mm_loadu_si32(&r); 706 int[4] correct = [42, 0, 0, 0]; 707 assert(A.array == correct); 708 } 709 710 version(LDC) 711 { 712 alias _mm_madd_epi16 = __builtin_ia32_pmaddwd128; 713 714 alias _mm_maskmoveu_si128 = __builtin_ia32_maskmovdqu; 715 716 pragma(LDC_intrinsic, "llvm.x86.sse2.pmaxs.w") 717 short8 __builtin_ia32_pmaxsw128(short8, short8) pure @safe; 718 alias _mm_max_epi16 = __builtin_ia32_pmaxsw128; 719 720 pragma(LDC_intrinsic, "llvm.x86.sse2.pmaxu.b") 721 byte16 __builtin_ia32_pmaxub128(byte16, byte16) pure @safe; 722 alias _mm_max_epu8 = __builtin_ia32_pmaxub128; 723 724 alias _mm_max_pd = __builtin_ia32_maxpd; 725 alias _mm_max_sd = __builtin_ia32_maxsd; 726 727 alias _mm_mfence = 
__builtin_ia32_mfence; 728 729 pragma(LDC_intrinsic, "llvm.x86.sse2.pmins.w") 730 short8 __builtin_ia32_pminsw128(short8, short8) pure @safe; 731 alias _mm_min_epi16 = __builtin_ia32_pminsw128; 732 733 pragma(LDC_intrinsic, "llvm.x86.sse2.pminu.b") 734 byte16 __builtin_ia32_pminub128(byte16, byte16) pure @safe; 735 alias _mm_min_epu8 = __builtin_ia32_pminub128; 736 737 alias _mm_min_pd = __builtin_ia32_minpd; 738 alias _mm_min_sd = __builtin_ia32_minsd; 739 } 740 // TODO 741 742 __m128i _mm_move_epi64 (__m128i a) pure @safe 743 { 744 long2 result = [ 0, 0 ]; 745 long2 la = cast(long2) a; 746 result[0] = la[0]; 747 return cast(__m128i)(result); 748 } 749 unittest 750 { 751 long2 A = [13, 47]; 752 long2 B = cast(long2) _mm_move_epi64( cast(__m128i)A ); 753 long[2] correct = [13, 0]; 754 assert(B.array == correct); 755 } 756 757 __m128d _mm_move_sd (__m128d a, __m128d b) pure @safe 758 { 759 b[1] = a[1]; 760 return b; 761 } 762 unittest 763 { 764 double2 A = [13.0, 47.0]; 765 double2 B = [34.0, 58.0]; 766 double2 C = _mm_move_sd(A, B); 767 double[2] correct = [34.0, 47.0]; 768 assert(C.array == correct); 769 } 770 771 version(LDC) 772 { 773 alias _mm_movemask_epi8 = __builtin_ia32_pmovmskb128; 774 alias _mm_movemask_pd = __builtin_ia32_movmskpd; 775 } 776 777 // MMXREG: _mm_movepi64_pi64 778 // MMXREG: __m128i _mm_movpi64_epi64 (__m64 a) 779 780 // PERF: unfortunately, __builtin_ia32_pmuludq128 disappeared from LDC 781 // but seems there in clang 782 __m128i _mm_mul_epu32(__m128i a, __m128i b) pure @safe 783 { 784 __m128i zero = _mm_setzero_si128(); 785 long2 la = cast(long2) shufflevector!(int4, 0, 4, 2, 6)(a, zero); 786 long2 lb = cast(long2) shufflevector!(int4, 0, 4, 2, 6)(b, zero); 787 static if (__VERSION__ >= 2076) 788 { 789 return cast(__m128i)(la * lb); 790 } 791 else 792 { 793 // long2 mul not supported before LDC 1.5 794 la[0] *= lb[0]; 795 la[1] *= lb[1]; 796 return cast(__m128i)(la); 797 } 798 } 799 unittest 800 { 801 __m128i A = _mm_set_epi32(0, 
0xDEADBEEF, 0, 0xffffffff);
    __m128i B = _mm_set_epi32(0, 0xCAFEBABE, 0, 0xffffffff);
    __m128i C = _mm_mul_epu32(A, B);
    long2 LC = cast(long2)C;
    assert(LC.array[0] == 18446744065119617025uL);
    assert(LC.array[1] == 12723420444339690338uL);
}


__m128d _mm_mul_pd(__m128d a, __m128d b) pure @safe
{
    return a * b;
}
unittest
{
    __m128d a = [-2.0, 1.5];
    a = _mm_mul_pd(a, a);
    assert(a.array == [4.0, 2.25]);
}

__m128d _mm_mul_sd(__m128d a, __m128d b) pure @safe
{
    a[0] *= b[0];
    return a;
}
unittest
{
    __m128d a = [-2.0, 1.5];
    a = _mm_mul_sd(a, a);
    assert(a.array == [4.0, 1.5]);
}


// MMXREG: _mm_mul_su32

version(LDC)
{
    alias _mm_mulhi_epi16 = __builtin_ia32_pmulhw128;
    alias _mm_mulhi_epu16 = __builtin_ia32_pmulhuw128;
}
// TODO

// Low 16 bits of each 16-bit product. Attributes added for
// consistency with the other arithmetic intrinsics in this module.
__m128i _mm_mullo_epi16 (__m128i a, __m128i b) pure @safe
{
    return cast(__m128i)(cast(short8)a * cast(short8)b);
}

__m128d _mm_or_pd (__m128d a, __m128d b) pure @safe
{
    return cast(__m128d)( cast(__m128i)a | cast(__m128i)b );
}

__m128i _mm_or_si128 (__m128i a, __m128i b) pure @safe
{
    return a | b;
}

version(LDC)
{
    alias _mm_packs_epi32 = __builtin_ia32_packssdw128;
    alias _mm_packs_epi16 = __builtin_ia32_packsswb128;
}
version(LDC)
{
    alias _mm_packus_epi16 = __builtin_ia32_packuswb128;
}
else
{
    __m128i _mm_packus_epi16 (__m128i a, __m128i b) pure
    {
        short8 sa = cast(short8)a;
        short8 sb = cast(short8)b;
        ubyte[16] result = void;
        for (int i = 0; i < 8; ++i)
        {
            // clamp each signed 16-bit lane into [0, 255]
            short s = sa[i];
            if (s < 0) s = 0;
            if (s > 255) s = 255;
            result[i] = cast(ubyte)s;

            s = sb[i];
            if (s < 0) s = 0;
            if (s > 255) s = 255;
            result[i+8] = cast(ubyte)s;
        }
        return cast(__m128i) loadUnaligned!(byte16)(cast(byte*)result.ptr);
    }
}
unittest
{
    __m128i A =
_mm_setr_epi16(-10, 400, 0, 256, 255, 2, 1, 0); 892 byte16 AA = cast(byte16) _mm_packus_epi16(A, A); 893 static immutable ubyte[16] correctResult = [0, 255, 0, 255, 255, 2, 1, 0, 894 0, 255, 0, 255, 255, 2, 1, 0]; 895 foreach(i; 0..16) 896 assert(AA[i] == cast(byte)(correctResult[i])); 897 } 898 899 // TODO 900 version(LDC) 901 { 902 alias _mm_pause = __builtin_ia32_pause; 903 } 904 // TODO 905 906 version(LDC) 907 { 908 alias _mm_sad_epu8 = __builtin_ia32_psadbw128; 909 } 910 // TODO 911 912 __m128i _mm_set_epi16 (short e7, short e6, short e5, short e4, short e3, short e2, short e1, short e0) pure @trusted 913 { 914 short[8] result = [e0, e1, e2, e3, e4, e5, e6, e7]; 915 return cast(__m128i) loadUnaligned!(short8)(result.ptr); 916 } 917 unittest 918 { 919 __m128i A = _mm_set_epi16(7, 6, 5, 4, 3, 2, 1, 0); 920 short8 B = cast(short8) A; 921 foreach(i; 0..8) 922 assert(B.array[i] == i); 923 } 924 925 __m128i _mm_set_epi32 (int e3, int e2, int e1, int e0) pure @trusted 926 { 927 int[4] result = [e0, e1, e2, e3]; 928 return loadUnaligned!(int4)(result.ptr); 929 } 930 unittest 931 { 932 __m128i A = _mm_set_epi32(3, 2, 1, 0); 933 foreach(i; 0..4) 934 assert(A.array[i] == i); 935 } 936 937 __m128i _mm_set_epi64x (long e1, long e0) pure @trusted 938 { 939 long[2] result = [e0, e1]; 940 return cast(__m128i)( loadUnaligned!(long2)(result.ptr) ); 941 } 942 unittest 943 { 944 __m128i A = _mm_set_epi64x(1234, 5678); 945 long2 B = cast(long2) A; 946 assert(B.array[0] == 5678); 947 assert(B.array[1] == 1234); 948 } 949 950 __m128i _mm_set_epi8 (byte e15, byte e14, byte e13, byte e12, 951 byte e11, byte e10, byte e9, byte e8, 952 byte e7, byte e6, byte e5, byte e4, 953 byte e3, byte e2, byte e1, byte e0) pure @trusted 954 { 955 byte[16] result = [e0, e1, e2, e3, e4, e5, e6, e7, 956 e8, e9, e10, e11, e12, e13, e14, e15]; 957 return cast(__m128i)( loadUnaligned!(byte16)(result.ptr) ); 958 } 959 960 __m128d _mm_set_pd (double e1, double e0) pure @trusted 961 { 962 double[2] result = 
[e0, e1]; 963 return loadUnaligned!(double2)(result.ptr); 964 } 965 966 __m128d _mm_set_pd1 (double a) pure @trusted 967 { 968 double[2] result = [a, a]; 969 return loadUnaligned!(double2)(result.ptr); 970 } 971 972 __m128d _mm_set_sd (double a) pure @trusted 973 { 974 double[2] result = [a, 0]; 975 return loadUnaligned!(double2)(result.ptr); 976 } 977 978 __m128i _mm_set1_epi16 (short a) pure @trusted 979 { 980 short[8] result = [a, a, a, a, a, a, a, a]; 981 return cast(__m128i)( loadUnaligned!(short8)(result.ptr) ); 982 } 983 984 __m128i _mm_set1_epi32 (int a) pure @trusted 985 { 986 int[4] result = [a, a, a, a]; 987 return loadUnaligned!(int4)(result.ptr); 988 } 989 unittest 990 { 991 __m128 a = _mm_set1_ps(-1.0f); 992 __m128 b = cast(__m128) _mm_set1_epi32(0x7fffffff); 993 assert(_mm_and_ps(a, b).array == [1.0f, 1, 1, 1]); 994 } 995 996 __m128i _mm_set1_epi64x (long a) pure @trusted 997 { 998 long[2] result = [a, a]; 999 return cast(__m128i)( loadUnaligned!(long2)(result.ptr) ); 1000 } 1001 1002 __m128i _mm_set1_epi8 (char a) pure @trusted 1003 { 1004 byte[16] result = [a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a]; 1005 return cast(__m128i)( loadUnaligned!(byte16)(result.ptr) ); 1006 } 1007 1008 alias _mm_set1_pd = _mm_set_pd1; 1009 1010 __m128i _mm_setr_epi16 (short e7, short e6, short e5, short e4, short e3, short e2, short e1, short e0) pure @trusted 1011 { 1012 short[8] result = [e7, e6, e5, e4, e3, e2, e1, e0]; 1013 return cast(__m128i)( loadUnaligned!(short8)(result.ptr) ); 1014 } 1015 1016 __m128i _mm_setr_epi32 (int e3, int e2, int e1, int e0) pure @trusted 1017 { 1018 int[4] result = [e3, e2, e1, e0]; 1019 return cast(__m128i)( loadUnaligned!(int4)(result.ptr) ); 1020 } 1021 1022 __m128i _mm_setr_epi64 (long e1, long e0) pure @trusted 1023 { 1024 long[2] result = [e1, e0]; 1025 return cast(__m128i)( loadUnaligned!(long2)(result.ptr) ); 1026 } 1027 1028 __m128i _mm_setr_epi8 (char e15, char e14, char e13, char e12, 1029 char e11, char e10, char e9, 
char e8, 1030 char e7, char e6, char e5, char e4, 1031 char e3, char e2, char e1, char e0) pure @trusted 1032 { 1033 byte[16] result = [e15, e14, e13, e12, e11, e10, e9, e8, 1034 e7, e6, e5, e4, e3, e2, e1, e0]; 1035 return cast(__m128i)( loadUnaligned!(byte16)(result.ptr) ); 1036 } 1037 1038 __m128d _mm_setr_pd (double e1, double e0) pure @trusted 1039 { 1040 double[2] result = [e1, e0]; 1041 return loadUnaligned!(double2)(result.ptr); 1042 } 1043 1044 __m128d _mm_setzero_pd () pure @trusted 1045 { 1046 double[2] result = [0.0, 0.0]; 1047 return loadUnaligned!(double2)(result.ptr); 1048 } 1049 1050 __m128i _mm_setzero_si128() pure @trusted 1051 { 1052 int[4] result = [0, 0, 0, 0]; 1053 return cast(__m128i)( loadUnaligned!(int4)(result.ptr) ); 1054 } 1055 1056 __m128i _mm_shuffle_epi32(int imm8)(__m128i a) pure @safe 1057 { 1058 return shufflevector!(int4, (imm8 >> 0) & 3, 1059 (imm8 >> 2) & 3, 1060 (imm8 >> 4) & 3, 1061 (imm8 >> 6) & 3)(a, a); 1062 } 1063 unittest 1064 { 1065 __m128i A = _mm_setr_epi32(0, 1, 2, 3); 1066 enum int SHUFFLE = _MM_SHUFFLE(0, 1, 2, 3); 1067 int4 B = cast(int4) _mm_shuffle_epi32!SHUFFLE(A); 1068 int[4] expectedB = [ 3, 2, 1, 0 ]; 1069 assert(B.array == expectedB); 1070 } 1071 1072 __m128d _mm_shuffle_pd (int imm8)(__m128d a) pure @safe 1073 { 1074 return shufflevector!(double2, 0 + ( imm8 & 1 ), 1075 2 + ( (imm8 >> 1) & 1 ))(a, a); 1076 } 1077 unittest 1078 { 1079 __m128d A = _mm_setr_pd(0.5f, 2.0f); 1080 enum int SHUFFLE = _MM_SHUFFLE2(1, 1); 1081 __m128d B = _mm_shuffle_pd!SHUFFLE(A); 1082 double[2] expectedB = [ 2.0f, 2.0f ]; 1083 assert(B.array == expectedB); 1084 } 1085 1086 __m128i _mm_shufflehi_epi16(int imm8)(__m128i a) pure @safe 1087 { 1088 return cast(__m128i) shufflevector!(short8, 0, 1, 2, 3, 1089 4 + ( (imm8 >> 0) & 3 ), 1090 4 + ( (imm8 >> 2) & 3 ), 1091 4 + ( (imm8 >> 4) & 3 ), 1092 4 + ( (imm8 >> 6) & 3 ))(cast(short8)a, cast(short8)a); 1093 } 1094 unittest 1095 { 1096 __m128i A = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7); 
1097 enum int SHUFFLE = _MM_SHUFFLE(0, 1, 2, 3); 1098 short8 C = cast(short8) _mm_shufflehi_epi16!SHUFFLE(A); 1099 short[8] expectedC = [ 0, 1, 2, 3, 7, 6, 5, 4 ]; 1100 assert(C.array == expectedC); 1101 } 1102 1103 __m128i _mm_shufflelo_epi16(int imm8)(__m128i a) pure @safe 1104 { 1105 return cast(__m128i) shufflevector!(short8, ( (imm8 >> 0) & 3 ), 1106 ( (imm8 >> 2) & 3 ), 1107 ( (imm8 >> 4) & 3 ), 1108 ( (imm8 >> 6) & 3 ), 4, 5, 6, 7)(cast(short8)a, cast(short8)a); 1109 } 1110 unittest 1111 { 1112 __m128i A = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7); 1113 enum int SHUFFLE = _MM_SHUFFLE(0, 1, 2, 3); 1114 short8 B = cast(short8) _mm_shufflelo_epi16!SHUFFLE(A); 1115 short[8] expectedB = [ 3, 2, 1, 0, 4, 5, 6, 7 ]; 1116 assert(B.array == expectedB); 1117 } 1118 1119 version(LDC) 1120 { 1121 alias _mm_sll_epi32 = __builtin_ia32_pslld128; 1122 alias _mm_sll_epi64 = __builtin_ia32_psllq128; 1123 alias _mm_sll_epi16 = __builtin_ia32_psllw128; 1124 alias _mm_slli_epi32 = __builtin_ia32_pslldi128; 1125 alias _mm_slli_epi64 = __builtin_ia32_psllqi128; 1126 alias _mm_slli_epi16 = __builtin_ia32_psllwi128; 1127 } 1128 // TODO 1129 1130 __m128i _mm_slli_si128(ubyte imm8)(__m128i op) pure @safe 1131 { 1132 static if (imm8 & 0xF0) 1133 return _mm_setzero_si128(); 1134 else 1135 return shufflevector!(byte16, 1136 16 - imm8, 17 - imm8, 18 - imm8, 19 - imm8, 20 - imm8, 21 - imm8, 22 - imm8, 23 - imm8, 1137 24 - imm8, 25 - imm8, 26 - imm8, 27 - imm8, 28 - imm8, 29 - imm8, 30 - imm8, 31 - imm8) 1138 (_mm_setzero_si128(), op); 1139 } 1140 1141 version(LDC) 1142 { 1143 // Disappeared with LDC 1.11 1144 static if (__VERSION__ < 2081) 1145 alias _mm_sqrt_pd = __builtin_ia32_sqrtpd; 1146 else 1147 { 1148 __m128d _mm_sqrt_pd(__m128d vec) pure @safe 1149 { 1150 vec.array[0] = llvm_sqrt(vec.array[0]); 1151 vec.array[1] = llvm_sqrt(vec.array[1]); 1152 return vec; 1153 } 1154 } 1155 } 1156 else 1157 { 1158 __m128d _mm_sqrt_pd(__m128d vec) pure @safe 1159 { 1160 import std.math: sqrt; 1161 
        vec.array[0] = sqrt(vec.array[0]);
        vec.array[1] = sqrt(vec.array[1]);
        return vec;
    }
}


version(LDC)
{
    // Disappeared with LDC 1.11
    static if (__VERSION__ < 2081)
        alias _mm_sqrt_sd = __builtin_ia32_sqrtsd;
    else
    {
        /// Compute sqrt of the lower double; the upper lane is copied through.
        /// NOTE(review): unlike the x86 SQRTSD instruction this takes a single
        /// operand, so the upper lane comes from `vec` itself — matches the
        /// one-argument emulation style used throughout this file.
        __m128d _mm_sqrt_sd(__m128d vec) pure @safe
        {
            vec.array[0] = llvm_sqrt(vec.array[0]);
            vec.array[1] = vec.array[1]; // upper lane unchanged
            return vec;
        }
    }
}
else
{
    /// Compute sqrt of the lower double; the upper lane is copied through.
    __m128d _mm_sqrt_sd(__m128d vec) pure @safe
    {
        import std.math: sqrt;
        vec.array[0] = sqrt(vec.array[0]);
        vec.array[1] = vec.array[1]; // upper lane unchanged
        return vec;
    }
}


version(LDC)
{
    // Arithmetic/logical shifts map directly onto LLVM's x86 builtins.
    alias _mm_sra_epi16 = __builtin_ia32_psraw128;
    alias _mm_sra_epi32 = __builtin_ia32_psrad128;
    alias _mm_srai_epi16 = __builtin_ia32_psrawi128;
    alias _mm_srai_epi32 = __builtin_ia32_psradi128;

    alias _mm_srl_epi16 = __builtin_ia32_psrlw128;
    alias _mm_srl_epi32 = __builtin_ia32_psrld128;
    alias _mm_srl_epi64 = __builtin_ia32_psrlq128;
    alias _mm_srli_epi16 = __builtin_ia32_psrlwi128;
    alias _mm_srli_epi32 = __builtin_ia32_psrldi128;
    alias _mm_srli_epi64 = __builtin_ia32_psrlqi128;
}
// TODO

/// Shift the whole 128-bit register right by `imm8` BYTES, shifting in zeroes
/// (PSRLDQ). `imm8` is a compile-time parameter, as with the real intrinsic.
__m128i _mm_srli_si128(ubyte imm8)(__m128i op) pure @safe
{
    // Shifts of 16 bytes or more clear the register entirely
    // (hardware behaves the same for any imm8 >= 16).
    static if (imm8 & 0xF0)
        return _mm_setzero_si128();
    else
        // Concatenate op with zero and pick 16 bytes starting at offset imm8.
        return cast(__m128i) shufflevector!(byte16,
                                            imm8+0, imm8+1, imm8+2, imm8+3, imm8+4, imm8+5, imm8+6, imm8+7,
                                            imm8+8, imm8+9, imm8+10, imm8+11, imm8+12, imm8+13, imm8+14, imm8+15)
                                           (cast(byte16) op, cast(byte16)_mm_setzero_si128());
}

// Note: this is a bonus intrinsic
/// Byte-shift a float vector right, by reinterpreting it as an integer vector.
__m128 _mm_srli_si128(ubyte imm8)(__m128 op) @safe
{
    return cast(__m128)_mm_srli_si128!imm8(cast(__m128i)op);
}
unittest
{
    // test that cast works at all
    __m128 A = cast(__m128) _mm_set1_epi32(0x3F800000);
    assert(A.array == [1.0f, 1.0f, 1.0f, 1.0f]);

    // test _mm_srli_si128 for both the __m128i and __m128 overloads
    assert(_mm_srli_si128!4(_mm_set_epi32(4, 3, 2, 1)).array == [2, 3, 4, 0]);
    assert(_mm_srli_si128!8(_mm_set_ps(4.0f, 3.0f, 2.0f, 1.0f)).array == [3.0f, 4.0f, 0, 0]);
}

// Note: this is a bonus intrinsic
/// Byte-shift a double vector right, by reinterpreting it as an integer vector.
__m128d _mm_srli_si128(ubyte imm8)(__m128d op) pure @safe
{
    return cast(__m128d) _mm_srli_si128!imm8(cast(__m128i)op);
}

/// Store two doubles to 16-byte-aligned memory.
/// NOTE(review): alignment is assumed, not checked — misaligned mem_addr is UB,
/// matching the real intrinsic's contract.
void _mm_store_pd (double* mem_addr, __m128d a) pure
{
    __m128d* aligned = cast(__m128d*)mem_addr;
    *aligned = a;
}

/// Store the LOWER double of `a` to both slots of aligned memory.
void _mm_store_pd1 (double* mem_addr, __m128d a) pure
{
    __m128d* aligned = cast(__m128d*)mem_addr;
    *aligned = shufflevector!(double2, 0, 0)(a, a);
}

/// Store the lower double of `a` to memory (no alignment requirement).
void _mm_store_sd (double* mem_addr, __m128d a) pure @safe
{
    *mem_addr = extractelement!(double2, 0)(a);
}

/// Store 128 bits to 16-byte-aligned memory.
void _mm_store_si128 (__m128i* mem_addr, __m128i a) pure @safe
{
    *mem_addr = a;
}

alias _mm_store1_pd = _mm_store_pd1;

/// Store the UPPER double of `a` to memory.
void _mm_storeh_pd (double* mem_addr, __m128d a) pure @safe
{
    *mem_addr = extractelement!(double2, 1)(a);
}

/// Store the lower 64 bits of integer vector `a` to memory.
void _mm_storel_epi64 (__m128i* mem_addr, __m128i a) pure @safe
{
    long* dest = cast(long*)mem_addr;
    *dest = extractelement!(long2, 0)(cast(long2)a);
}

/// Store the lower double of `a` to memory.
void _mm_storel_pd (double* mem_addr, __m128d a) pure @safe
{
    *mem_addr = extractelement!(double2, 0)(a);
}

/// Store the two doubles of `a` to aligned memory in REVERSED order.
void _mm_storer_pd (double* mem_addr, __m128d a) pure
{
    __m128d* aligned = cast(__m128d*)mem_addr;
    *aligned = shufflevector!(double2, 1, 0)(a, a);
}

/// Store two doubles to unaligned memory.
void _mm_storeu_pd (double* mem_addr, __m128d a) pure @safe
{
    storeUnaligned!double2(a, mem_addr);
}

/// Store 128 bits to unaligned memory.
void _mm_storeu_si128 (__m128i* mem_addr, __m128i a) pure @safe
{
    storeUnaligned!__m128i(a, cast(int*)mem_addr);
}

// TODO: _mm_stream_pd
// TODO: _mm_stream_si128
// TODO: _mm_stream_si32
// TODO: _mm_stream_si64
/// Subtract packed 16-bit integers in `b` from packed 16-bit integers in `a`.
__m128i _mm_sub_epi16(__m128i a, __m128i b) pure @safe
{
    return cast(__m128i)(cast(short8)a - cast(short8)b);
}

/// Subtract packed 32-bit integers in `b` from packed 32-bit integers in `a`.
__m128i _mm_sub_epi32(__m128i a, __m128i b) pure @safe
{
    return cast(__m128i)(cast(int4)a - cast(int4)b);
}

/// Subtract packed 64-bit integers in `b` from packed 64-bit integers in `a`.
__m128i _mm_sub_epi64(__m128i a, __m128i b) pure @safe
{
    return cast(__m128i)(cast(long2)a - cast(long2)b);
}

/// Subtract packed 8-bit integers in `b` from packed 8-bit integers in `a`.
__m128i _mm_sub_epi8(__m128i a, __m128i b) pure @safe
{
    return cast(__m128i)(cast(byte16)a - cast(byte16)b);
}

/// Subtract packed double-precision floats in `b` from those in `a`.
__m128d _mm_sub_pd(__m128d a, __m128d b) pure @safe
{
    return a - b;
}

/// Subtract the lower double of `b` from the lower double of `a`;
/// the upper element of `a` passes through unchanged.
__m128d _mm_sub_sd(__m128d a, __m128d b) pure @safe
{
    a[0] -= b[0];
    return a;
}
unittest
{
    __m128d a = [1.5, -2.0];
    a = _mm_sub_sd(a, a);
    assert(a.array == [0.0, -2.0]);
}


// MMXREG: _mm_sub_si64

version(LDC)
{
    // Saturating subtraction and scalar-double unordered comparisons map
    // directly onto LLVM's x86 builtins.
    alias _mm_subs_epi16 = __builtin_ia32_psubsw128;
    alias _mm_subs_epi8 = __builtin_ia32_psubsb128;
    alias _mm_subs_epu16 = __builtin_ia32_psubusw128;
    alias _mm_subs_epu8 = __builtin_ia32_psubusb128;

    alias _mm_ucomieq_sd = __builtin_ia32_ucomisdeq;
    alias _mm_ucomige_sd = __builtin_ia32_ucomisdge;
    alias _mm_ucomigt_sd = __builtin_ia32_ucomisdgt;
    alias _mm_ucomile_sd = __builtin_ia32_ucomisdle;
    alias _mm_ucomilt_sd = __builtin_ia32_ucomisdlt;
    alias _mm_ucomineq_sd = __builtin_ia32_ucomisdneq;
}
// TODO

/// Return a vector with undefined contents; callers must not rely on its value.
__m128d _mm_undefined_pd() pure @safe
{
    __m128d result = void;
    return result;
}

/// Return a vector with undefined contents; callers must not rely on its value.
__m128i _mm_undefined_si128() pure @safe
{
    __m128i result = void;
    return result;
}

/// Interleave 16-bit integers from the HIGH halves of `a` and `b`:
/// result = [a4, b4, a5, b5, a6, b6, a7, b7].
__m128i _mm_unpackhi_epi16 (__m128i a, __m128i b) pure @safe
{
    return cast(__m128i) shufflevector!(short8, 4, 12, 5, 13, 6, 14, 7, 15)
                                       (cast(short8)a, cast(short8)b);
}

/// Interleave 32-bit integers from the HIGH halves of `a` and `b`:
/// result = [a2, b2, a3, b3].
__m128i _mm_unpackhi_epi32 (__m128i a, __m128i b) pure @safe
{
    return shufflevector!(int4, 2, 6, 3, 7)(cast(int4)a, cast(int4)b);
}

/// Interleave 64-bit integers from the HIGH halves of `a` and `b`:
/// result = [a1, b1].
__m128i _mm_unpackhi_epi64 (__m128i a, __m128i b) pure @safe
{
    return cast(__m128i) shufflevector!(long2, 1, 3)(cast(long2)a, cast(long2)b);
}

/// Interleave 8-bit integers from the HIGH halves of `a` and `b`.
__m128i _mm_unpackhi_epi8 (__m128i a, __m128i b) pure @safe
{
    return cast(__m128i)shufflevector!(byte16, 8, 24, 9, 25, 10, 26, 11, 27,
                                               12, 28, 13, 29, 14, 30, 15, 31)
                                      (cast(byte16)a, cast(byte16)b);
}

/// Interleave doubles from the HIGH halves of `a` and `b`: result = [a1, b1].
__m128d _mm_unpackhi_pd (__m128d a, __m128d b) pure @safe
{
    return shufflevector!(__m128d, 1, 3)(a, b);
}

/// Interleave 16-bit integers from the LOW halves of `a` and `b`:
/// result = [a0, b0, a1, b1, a2, b2, a3, b3].
__m128i _mm_unpacklo_epi16 (__m128i a, __m128i b) pure @safe
{
    return cast(__m128i) shufflevector!(short8, 0, 8, 1, 9, 2, 10, 3, 11)
                                       (cast(short8)a, cast(short8)b);
}

/// Interleave 32-bit integers from the LOW halves of `a` and `b`:
/// result = [a0, b0, a1, b1]  (PUNPCKLDQ semantics).
__m128i _mm_unpacklo_epi32 (__m128i a, __m128i b) pure @safe
{
    // Bugfix: the third shuffle index was 6 (element 2 of b); PUNPCKLDQ
    // requires element 1 of b here, i.e. index 5.
    return shufflevector!(int4, 0, 4, 1, 5)
                         (cast(int4)a, cast(int4)b);
}
unittest
{
    int4 res = cast(int4) _mm_unpacklo_epi32(_mm_set_epi32(3, 2, 1, 0),
                                             _mm_set_epi32(7, 6, 5, 4));
    static immutable int[4] correctResult = [0, 4, 1, 5];
    assert(res.array == correctResult);
}

/// Interleave 64-bit integers from the LOW halves of `a` and `b`:
/// result = [a0, b0].
__m128i _mm_unpacklo_epi64 (__m128i a, __m128i b) pure @safe
{
    return cast(__m128i) shufflevector!(long2, 0, 2)
                                       (cast(long2)a, cast(long2)b);
}

/// Interleave 8-bit integers from the LOW halves of `a` and `b`.
__m128i _mm_unpacklo_epi8 (__m128i a, __m128i b) pure @safe
{
    return cast(__m128i) shufflevector!(byte16, 0, 16, 1, 17, 2, 18, 3, 19,
                                                4, 20, 5, 21, 6, 22, 7, 23)
                                       (cast(byte16)a, cast(byte16)b);
}

/// Interleave doubles from the LOW halves of `a` and `b`: result = [a0, b0].
__m128d _mm_unpacklo_pd (__m128d a, __m128d b) pure @safe
{
    return shufflevector!(__m128d, 0, 2)(a, b);
}

/// Bitwise XOR of two double vectors.
__m128d _mm_xor_pd (__m128d a, __m128d b) pure @safe
{
    return cast(__m128d)(cast(__m128i)a ^ cast(__m128i)b);
}

/// Bitwise XOR of two integer vectors.
__m128i _mm_xor_si128 (__m128i a, __m128i b) pure @safe
{
    return a ^ b;
}

unittest
{
    // distance between two points in 4D
    float distance(float[4] a, float[4] b) nothrow @nogc
    {
        __m128 va = _mm_loadu_ps(a.ptr);
        __m128 vb = _mm_loadu_ps(b.ptr);
        __m128 diffSquared = _mm_sub_ps(va, vb);
        diffSquared = _mm_mul_ps(diffSquared, diffSquared);
        // Horizontal sum: add the upper half onto the lower half, twice.
        __m128 sum = _mm_add_ps(diffSquared, _mm_srli_si128!8(diffSquared));
        sum = _mm_add_ps(sum, _mm_srli_si128!4(sum));
        return _mm_cvtss_f32(_mm_sqrt_ss(sum));
    }
    assert(distance([0, 2, 0, 0], [0, 0, 0, 0]) == 2);
}