/**
* Copyright: Copyright Auburn Sounds 2016-2019.
* License:   $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
* Authors:   Guillaume Piolat
*/
module inteli.xmmintrin;

public import inteli.types;

import inteli.internals;

import core.stdc.stdlib: malloc, free;
import core.exception: onOutOfMemoryError;

version(D_InlineAsm_X86)
    version = InlineX86Asm;
else version(D_InlineAsm_X86_64)
    version = InlineX86Asm;


// SSE1
// Note: intrinsics marked MMXREG actually use MMX registers and were not translated.
// These are instructions introduced with SSE1 that also operate on MMX registers.

nothrow @nogc:


enum int _MM_EXCEPT_INVALID    = 0x0001;
enum int _MM_EXCEPT_DENORM     = 0x0002;
enum int _MM_EXCEPT_DIV_ZERO   = 0x0004;
enum int _MM_EXCEPT_OVERFLOW   = 0x0008;
enum int _MM_EXCEPT_UNDERFLOW  = 0x0010;
enum int _MM_EXCEPT_INEXACT    = 0x0020;
enum int _MM_EXCEPT_MASK       = 0x003f;

enum int _MM_MASK_INVALID      = 0x0080;
enum int _MM_MASK_DENORM       = 0x0100;
enum int _MM_MASK_DIV_ZERO     = 0x0200;
enum int _MM_MASK_OVERFLOW     = 0x0400;
enum int _MM_MASK_UNDERFLOW    = 0x0800;
enum int _MM_MASK_INEXACT      = 0x1000;
enum int _MM_MASK_MASK         = 0x1f80;

enum int _MM_ROUND_NEAREST     = 0x0000;
enum int _MM_ROUND_DOWN        = 0x2000;
enum int _MM_ROUND_UP          = 0x4000;
enum int _MM_ROUND_TOWARD_ZERO = 0x6000;
enum int _MM_ROUND_MASK        = 0x6000;

enum int _MM_FLUSH_ZERO_MASK   = 0x8000;
enum int _MM_FLUSH_ZERO_ON     = 0x8000;
enum int _MM_FLUSH_ZERO_OFF    = 0x0000;

__m128 _mm_add_ps(__m128 a, __m128 b) pure @safe
{
    return a + b;
}

unittest
{
    __m128 a = [1, 2, 3, 4];
    a = _mm_add_ps(a, a);
    assert(a.array[0] == 2);
    assert(a.array[1] == 4);
    assert(a.array[2] == 6);
    assert(a.array[3] == 8);
}

__m128 _mm_add_ss(__m128 a, __m128 b) pure @safe
{
    a[0] += b[0];
    return a;
}
unittest
{
    __m128 a = [1, 2, 3, 4];
    a = _mm_add_ss(a, a);
    assert(a.array == [2.0f, 2, 3, 4]);
}

__m128 _mm_and_ps (__m128 a, __m128 b) pure @safe
{
    return cast(__m128)(cast(__m128i)a & cast(__m128i)b);
}
unittest
{
    // Note: tested in emmintrin.d
}

__m128 _mm_andnot_ps (__m128 a, __m128 b) pure @safe
{
    return cast(__m128)( (~cast(__m128i)a) & cast(__m128i)b );
}


// MMXREG: _mm_avg_pu16
// MMXREG: _mm_avg_pu8

__m128 _mm_cmpeq_ps (__m128 a, __m128 b) pure @safe
{
    return cast(__m128) cmpps!(FPComparison.oeq)(a, b);
}

__m128 _mm_cmpeq_ss (__m128 a, __m128 b) pure @safe
{
    return cast(__m128) cmpss!(FPComparison.oeq)(a, b);
}

__m128 _mm_cmpge_ps (__m128 a, __m128 b) pure @safe
{
    return cast(__m128) cmpps!(FPComparison.oge)(a, b);
}

__m128 _mm_cmpge_ss (__m128 a, __m128 b) pure @safe
{
    return cast(__m128) cmpss!(FPComparison.oge)(a, b);
}

__m128 _mm_cmpgt_ps (__m128 a, __m128 b) pure @safe
{
    return cast(__m128) cmpps!(FPComparison.ogt)(a, b);
}

__m128 _mm_cmpgt_ss (__m128 a, __m128 b) pure @safe
{
    return cast(__m128) cmpss!(FPComparison.ogt)(a, b);
}

__m128 _mm_cmple_ps (__m128 a, __m128 b) pure @safe
{
    return cast(__m128) cmpps!(FPComparison.ole)(a, b);
}

__m128 _mm_cmple_ss (__m128 a, __m128 b) pure @safe
{
    return cast(__m128) cmpss!(FPComparison.ole)(a, b);
}

__m128 _mm_cmplt_ps (__m128 a, __m128 b) pure @safe
{
    return cast(__m128) cmpps!(FPComparison.olt)(a, b);
}
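// Illustrative usage example (not from the original test suite): ordered comparisons
// such as _mm_cmplt_ps return an all-ones mask (-1 when viewed as int) for each lane
// that satisfies the predicate, and 0 otherwise; a NaN operand yields false.
unittest
{
    __m128 A = _mm_setr_ps(1.0f, 2.0f, 3.0f, float.nan);
    __m128 B = _mm_setr_ps(2.0f, 2.0f, 1.0f, 4.0f);
    int4 R = cast(int4) _mm_cmplt_ps(A, B);
    int[4] correct = [-1, 0, 0, 0];
    assert(R.array == correct);
}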
__m128 _mm_cmplt_ss (__m128 a, __m128 b) pure @safe
{
    return cast(__m128) cmpss!(FPComparison.olt)(a, b);
}

__m128 _mm_cmpneq_ps (__m128 a, __m128 b) pure @safe
{
    return cast(__m128) cmpps!(FPComparison.une)(a, b);
}

__m128 _mm_cmpneq_ss (__m128 a, __m128 b) pure @safe
{
    return cast(__m128) cmpss!(FPComparison.une)(a, b);
}

__m128 _mm_cmpnge_ps (__m128 a, __m128 b) pure @safe
{
    return cast(__m128) cmpps!(FPComparison.ult)(a, b);
}

__m128 _mm_cmpnge_ss (__m128 a, __m128 b) pure @safe
{
    return cast(__m128) cmpss!(FPComparison.ult)(a, b);
}

__m128 _mm_cmpngt_ps (__m128 a, __m128 b) pure @safe
{
    return cast(__m128) cmpps!(FPComparison.ule)(a, b);
}

__m128 _mm_cmpngt_ss (__m128 a, __m128 b) pure @safe
{
    return cast(__m128) cmpss!(FPComparison.ule)(a, b);
}

__m128 _mm_cmpnle_ps (__m128 a, __m128 b) pure @safe
{
    return cast(__m128) cmpps!(FPComparison.ugt)(a, b);
}

__m128 _mm_cmpnle_ss (__m128 a, __m128 b) pure @safe
{
    return cast(__m128) cmpss!(FPComparison.ugt)(a, b);
}

__m128 _mm_cmpnlt_ps (__m128 a, __m128 b) pure @safe
{
    return cast(__m128) cmpps!(FPComparison.uge)(a, b);
}

__m128 _mm_cmpnlt_ss (__m128 a, __m128 b) pure @safe
{
    return cast(__m128) cmpss!(FPComparison.uge)(a, b);
}

__m128 _mm_cmpord_ps (__m128 a, __m128 b) pure @safe
{
    return cast(__m128) cmpps!(FPComparison.ord)(a, b);
}

__m128 _mm_cmpord_ss (__m128 a, __m128 b) pure @safe
{
    return cast(__m128) cmpss!(FPComparison.ord)(a, b);
}

__m128 _mm_cmpunord_ps (__m128 a, __m128 b) pure @safe
{
    return cast(__m128) cmpps!(FPComparison.uno)(a, b);
}

__m128 _mm_cmpunord_ss (__m128 a, __m128 b) pure @safe
{
    return cast(__m128) cmpss!(FPComparison.uno)(a, b);
}

// Note: we've reverted clang and GCC behaviour with regards to EFLAGS.
// Some of these comparisons yield true for NaNs, others don't.

int _mm_comieq_ss (__m128 a, __m128 b) pure @safe // comiss + sete
{
    return comss!(FPComparison.ueq)(a, b); // yields true for NaN!
}

int _mm_comige_ss (__m128 a, __m128 b) pure @safe // comiss + setae
{
    return comss!(FPComparison.oge)(a, b);
}

int _mm_comigt_ss (__m128 a, __m128 b) pure @safe // comiss + seta
{
    return comss!(FPComparison.ogt)(a, b);
}

int _mm_comile_ss (__m128 a, __m128 b) pure @safe // comiss + setbe
{
    return comss!(FPComparison.ule)(a, b); // yields true for NaN!
}

int _mm_comilt_ss (__m128 a, __m128 b) pure @safe // comiss + setb
{
    return comss!(FPComparison.ult)(a, b); // yields true for NaN!
}

int _mm_comineq_ss (__m128 a, __m128 b) pure @safe // comiss + setne
{
    return comss!(FPComparison.one)(a, b);
}
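// Illustrative usage example (not from the original test suite): with a NaN operand,
// _mm_comieq_ss returns true (unordered-or-equal), while _mm_comigt_ss returns false,
// matching the comments above.
unittest
{
    __m128 A = _mm_set1_ps(float.nan);
    __m128 B = _mm_set1_ps(1.0f);
    assert(_mm_comieq_ss(A, B) != 0); // yields true for NaN
    assert(_mm_comigt_ss(A, B) == 0); // ordered comparison: false for NaN
}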
// MMXREG: __m128 _mm_cvt_pi2ps (__m128 a, __m64 b)
// MMXREG: __m64 _mm_cvt_ps2pi (__m128 a)


__m128 _mm_cvt_si2ss(__m128 v, int x) pure @safe
{
    v[0] = cast(float)x;
    return v;
}
unittest
{
    __m128 a = _mm_cvt_si2ss(_mm_set1_ps(0.0f), 42);
    assert(a.array == [42f, 0, 0, 0]);
}

// Note: is just another name for _mm_cvtss_si32
alias _mm_cvt_ss2si = _mm_cvtss_si32;


// MMXREG: __m128 _mm_cvtpi16_ps (__m64 a)
// MMXREG: __m128 _mm_cvtpi32_ps (__m128 a, __m64 b)
// MMXREG: __m128 _mm_cvtpi32x2_ps (__m64 a, __m64 b)
// MMXREG: __m128 _mm_cvtpi8_ps (__m64 a)
// MMXREG: __m64 _mm_cvtps_pi16 (__m128 a)
// MMXREG: __m64 _mm_cvtps_pi32 (__m128 a)
// MMXREG: __m64 _mm_cvtps_pi8 (__m128 a)
// MMXREG: __m128 _mm_cvtpu16_ps (__m64 a)
// MMXREG: __m128 _mm_cvtpu8_ps (__m64 a)

__m128 _mm_cvtsi32_ss(__m128 v, int x) pure @safe
{
    v[0] = cast(float)x;
    return v;
}
unittest
{
    __m128 a = _mm_cvtsi32_ss(_mm_set1_ps(0.0f), 42);
    assert(a.array == [42.0f, 0, 0, 0]);
}

// Note: on macOS, using "llvm.x86.sse.cvtsi642ss" was buggy
__m128 _mm_cvtsi64_ss(__m128 v, long x) pure @safe
{
    v[0] = cast(float)x;
    return v;
}
unittest
{
    __m128 a = _mm_cvtsi64_ss(_mm_set1_ps(0.0f), 42);
    assert(a.array == [42.0f, 0, 0, 0]);
}

float _mm_cvtss_f32(__m128 a) pure @safe
{
    return a[0];
}

version(LDC)
{
    alias _mm_cvtss_si32 = __builtin_ia32_cvtss2si;
}
else
{
    int _mm_cvtss_si32 (__m128 a) pure @safe
    {
        return convertFloatToInt32UsingMXCSR(a[0]);
    }
}
unittest
{
    assert(1 == _mm_cvtss_si32(_mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f)));
}

version(LDC)
{
    version(X86_64)
        alias _mm_cvtss_si64 = __builtin_ia32_cvtss2si64;
    else
    {
        // Note: __builtin_ia32_cvtss2si64 crashes LDC in 32-bit
        long _mm_cvtss_si64 (__m128 a) pure @safe
        {
            return convertFloatToInt64UsingMXCSR(a[0]);
        }
    }
}
else
{
    long _mm_cvtss_si64 (__m128 a) pure @safe
    {
        return convertFloatToInt64UsingMXCSR(a[0]);
    }
}
unittest
{
    assert(1 == _mm_cvtss_si64(_mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f)));

    uint savedRounding = _MM_GET_ROUNDING_MODE();

    _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST);
    assert(-86186 == _mm_cvtss_si64(_mm_set1_ps(-86186.5f)));

    _MM_SET_ROUNDING_MODE(_MM_ROUND_DOWN);
    assert(-86187 == _mm_cvtss_si64(_mm_set1_ps(-86186.1f)));

    _MM_SET_ROUNDING_MODE(_MM_ROUND_UP);
    assert(86187 == _mm_cvtss_si64(_mm_set1_ps(86186.1f)));

    _MM_SET_ROUNDING_MODE(_MM_ROUND_TOWARD_ZERO);
    assert(-86186 == _mm_cvtss_si64(_mm_set1_ps(-86186.9f)));

    _MM_SET_ROUNDING_MODE(savedRounding);
}


version(LDC)
{
    alias _mm_cvtt_ss2si = __builtin_ia32_cvttss2si;
}
else
{
    int _mm_cvtt_ss2si (__m128 a) pure @safe
    {
        return cast(int)(a[0]);
    }
}
unittest
{
    assert(1 == _mm_cvtt_ss2si(_mm_setr_ps(1.9f, 2.0f, 3.0f, 4.0f)));
}

// MMXREG: __m64 _mm_cvtt_ps2pi (__m128 a)

alias _mm_cvttss_si32 = _mm_cvtt_ss2si; // it's actually the same op
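// Illustrative usage example (not from the original test suite): _mm_cvt_ss2si rounds
// according to the current MXCSR rounding mode, whereas _mm_cvtt_ss2si always
// truncates toward zero.
unittest
{
    uint savedRounding = _MM_GET_ROUNDING_MODE();
    _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST);
    assert(2 == _mm_cvt_ss2si(_mm_set1_ps(1.9f)));  // rounded to nearest
    assert(1 == _mm_cvtt_ss2si(_mm_set1_ps(1.9f))); // truncated
    _MM_SET_ROUNDING_MODE(savedRounding);
}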
// Note: __builtin_ia32_cvttss2si64 crashes LDC when generating 32-bit x86 code.
long _mm_cvttss_si64 (__m128 a) pure @safe
{
    return cast(long)(a[0]); // Generates cvttss2si as expected
}
unittest
{
    assert(1 == _mm_cvttss_si64(_mm_setr_ps(1.9f, 2.0f, 3.0f, 4.0f)));
}

__m128 _mm_div_ps(__m128 a, __m128 b) pure @safe
{
    return a / b;
}
unittest
{
    __m128 a = [1.5f, -2.0f, 3.0f, 1.0f];
    a = _mm_div_ps(a, a);
    float[4] correct = [1.0f, 1.0f, 1.0f, 1.0f];
    assert(a.array == correct);
}

__m128 _mm_div_ss(__m128 a, __m128 b) pure @safe
{
    a[0] /= b[0];
    return a;
}
unittest
{
    __m128 a = [1.5f, -2.0f, 3.0f, 1.0f];
    a = _mm_div_ss(a, a);
    float[4] correct = [1.0f, -2.0, 3.0f, 1.0f];
    assert(a.array == correct);
}

// MMXREG: int _mm_extract_pi16 (__m64 a, int imm8)

/// Free aligned memory that was allocated with `_mm_malloc`.
void _mm_free(void * mem_addr) @trusted
{
    // support for free(NULL)
    if (mem_addr is null)
        return;

    // Technically we don't need to store size and alignment in the chunk, but we do
    // in case we have to implement _mm_realloc
    size_t pointerSize = (void*).sizeof;
    void** rawLocation = cast(void**)(cast(char*)mem_addr - size_t.sizeof);
    size_t* alignmentLocation = cast(size_t*)(cast(char*)mem_addr - 3 * pointerSize);
    size_t alignment = *alignmentLocation;
    assert(alignment != 0);
    assert(isPointerAligned(mem_addr, alignment));
    free(*rawLocation);
}

uint _MM_GET_EXCEPTION_MASK() pure @safe
{
    return _mm_getcsr() & _MM_MASK_MASK;
}

uint _MM_GET_EXCEPTION_STATE() pure @safe
{
    return _mm_getcsr() & _MM_EXCEPT_MASK;
}

uint _MM_GET_FLUSH_ZERO_MODE() pure @safe
{
    return _mm_getcsr() & _MM_FLUSH_ZERO_MASK;
}

uint _MM_GET_ROUNDING_MODE() pure @safe
{
    return _mm_getcsr() & _MM_ROUND_MASK;
}

uint _mm_getcsr() pure @safe
{
    version (InlineX86Asm)
    {
        uint controlWord;
        asm nothrow @nogc pure @safe
        {
            stmxcsr controlWord;
        }
        return controlWord;
    }
    else
        static assert(0, "Not yet supported");
}

// MMXREG: __m64 _mm_insert_pi16 (__m64 a, int i, int imm8)

__m128 _mm_load_ps(const(float)*p) pure @trusted
{
    return *cast(__m128*)p;
}

__m128 _mm_load_ps1(const(float)*p) pure @trusted
{
    float[4] f = [ *p, *p, *p, *p ];
    return loadUnaligned!(float4)(f.ptr);
}

__m128 _mm_load_ss (const(float)* mem_addr) pure @trusted
{
    float[4] f = [ *mem_addr, 0.0f, 0.0f, 0.0f ];
    return loadUnaligned!(float4)(f.ptr);
}

alias _mm_load1_ps = _mm_load_ps1;

__m128 _mm_loadh_pi (__m128 a, const(__m64)* mem_addr) pure @safe
{
    long2 la = cast(long2)a;
    la[1] = *mem_addr;
    return cast(__m128)la;
}

__m128 _mm_loadl_pi (__m128 a, const(__m64)* mem_addr) pure @safe
{
    long2 la = cast(long2)a;
    la[0] = *mem_addr;
    return cast(__m128)la;
}

__m128 _mm_loadr_ps (const(float)* mem_addr) pure @trusted
{
    __m128* aligned = cast(__m128*)mem_addr;
    __m128 a = *aligned;
    return shufflevector!(__m128, 3, 2, 1, 0)(a, a);
}

__m128 _mm_loadu_ps(const(float)*p) pure @safe
{
    return loadUnaligned!(__m128)(p);
}
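// Illustrative usage example (not from the original test suite): _mm_load_ss zeroes
// the upper three lanes, and _mm_loadr_ps loads four floats in reverse order
// (the pointer must be 16-byte aligned).
unittest
{
    align(16) float[4] A = [1.0f, 2.0f, 3.0f, 4.0f];
    __m128 s = _mm_load_ss(A.ptr);
    assert(s.array == [1.0f, 0.0f, 0.0f, 0.0f]);
    __m128 r = _mm_loadr_ps(A.ptr);
    assert(r.array == [4.0f, 3.0f, 2.0f, 1.0f]);
}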
__m128i _mm_loadu_si16(const(void)* mem_addr) pure @trusted
{
    short r = *cast(short*)(mem_addr);
    short8 result = [0, 0, 0, 0, 0, 0, 0, 0];
    result[0] = r;
    return cast(__m128i)result;
}
unittest
{
    short r = 13;
    short8 A = cast(short8) _mm_loadu_si16(&r);
    short[8] correct = [13, 0, 0, 0, 0, 0, 0, 0];
    assert(A.array == correct);
}

__m128i _mm_loadu_si64(const(void)* mem_addr) pure @trusted
{
    long r = *cast(long*)(mem_addr);
    long2 result = [0, 0];
    result[0] = r;
    return cast(__m128i)result;
}
unittest
{
    long r = 446446446446;
    long2 A = cast(long2) _mm_loadu_si64(&r);
    long[2] correct = [446446446446, 0];
    assert(A.array == correct);
}

/// Allocate `size` bytes of memory, aligned to the alignment specified in `alignment`,
/// and return a pointer to the allocated memory. `_mm_free` should be used to free
/// memory that is allocated with `_mm_malloc`.
void* _mm_malloc(size_t size, size_t alignment) @trusted
{
    assert(alignment != 0);
    size_t request = requestedSize(size, alignment);
    void* raw = malloc(request);
    if (request > 0 && raw == null) // malloc(0) can validly return anything
        onOutOfMemoryError();
    return storeRawPointerPlusInfo(raw, size, alignment); // PERF: no need to store size
}

// MMXREG: _mm_maskmove_si64
// MMXREG: _m_maskmovq

// MMXREG: _mm_max_pi16
version(LDC)
{
    alias _mm_max_ps = __builtin_ia32_maxps;
}
else
{
    __m128 _mm_max_ps(__m128 a, __m128 b) pure @safe
    {
        __m128 r;
        r[0] = (a[0] > b[0]) ? a[0] : b[0];
        r[1] = (a[1] > b[1]) ? a[1] : b[1];
        r[2] = (a[2] > b[2]) ? a[2] : b[2];
        r[3] = (a[3] > b[3]) ? a[3] : b[3];
        return r;
    }
}
unittest
{
    __m128 A = _mm_setr_ps(1, 2, float.nan, 4);
    __m128 B = _mm_setr_ps(4, 1, 4, float.nan);
    __m128 M = _mm_max_ps(A, B);
    assert(M[0] == 4);
    assert(M[1] == 2);
    assert(M[2] == 4);    // in case of NaN, second operand prevails (as it seems)
    assert(M[3] != M[3]); // in case of NaN, second operand prevails (as it seems)
}

// MMXREG: _mm_max_pu8

version(LDC)
{
    alias _mm_max_ss = __builtin_ia32_maxss;
}
else
{
    __m128 _mm_max_ss(__m128 a, __m128 b) pure @safe
    {
        __m128 r = a;
        r[0] = (a[0] > b[0]) ? a[0] : b[0];
        return r;
    }
}
unittest
{
    __m128 A = _mm_setr_ps(1, 2, 3, 4);
    __m128 B = _mm_setr_ps(4, 1, 4, 1);
    __m128 C = _mm_setr_ps(float.nan, 1, 4, 1);
    __m128 M = _mm_max_ss(A, B);
    assert(M[0] == 4);
    assert(M[1] == 2);
    assert(M[2] == 3);
    assert(M[3] == 4);
    M = _mm_max_ps(A, C); // in case of NaN, second operand prevails
    assert(M[0] != M[0]);
    M = _mm_max_ps(C, A); // in case of NaN, second operand prevails
    assert(M[0] == 1);
}

// MMXREG: _mm_min_pi16

version(LDC)
{
    alias _mm_min_ps = __builtin_ia32_minps;
}
else
{
    __m128 _mm_min_ps(__m128 a, __m128 b) pure @safe
    {
        __m128 r;
        r[0] = (a[0] < b[0]) ? a[0] : b[0];
        r[1] = (a[1] < b[1]) ? a[1] : b[1];
        r[2] = (a[2] < b[2]) ? a[2] : b[2];
        r[3] = (a[3] < b[3]) ? a[3] : b[3];
        return r;
    }
}
unittest
{
    __m128 A = _mm_setr_ps(1, 2, float.nan, 4);
    __m128 B = _mm_setr_ps(4, 1, 4, float.nan);
    __m128 M = _mm_min_ps(A, B);
    assert(M[0] == 1);
    assert(M[1] == 1);
    assert(M[2] == 4);    // in case of NaN, second operand prevails (as it seems)
    assert(M[3] != M[3]); // in case of NaN, second operand prevails (as it seems)
}

// MMXREG: _mm_min_pi8

version(LDC)
{
    alias _mm_min_ss = __builtin_ia32_minss;
}
else
{
    __m128 _mm_min_ss(__m128 a, __m128 b) pure @safe
    {
        __m128 r = a;
        r[0] = (a[0] < b[0]) ? a[0] : b[0];
        return r;
    }
}
unittest
{
    __m128 A = _mm_setr_ps(1, 2, 3, 4);
    __m128 B = _mm_setr_ps(4, 1, 4, 1);
    __m128 C = _mm_setr_ps(float.nan, 1, 4, 1);
    __m128 M = _mm_min_ss(A, B);
    assert(M[0] == 1);
    assert(M[1] == 2);
    assert(M[2] == 3);
    assert(M[3] == 4);
    M = _mm_min_ps(A, C); // in case of NaN, second operand prevails
    assert(M[0] != M[0]);
    M = _mm_min_ps(C, A); // in case of NaN, second operand prevails
    assert(M[0] == 1);
}

__m128 _mm_move_ss (__m128 a, __m128 b) pure @safe
{
    return shufflevector!(__m128, 4, 1, 2, 3)(a, b);
}

__m128 _mm_movehl_ps (__m128 a, __m128 b) pure @safe
{
    return shufflevector!(float4, 6, 7, 2, 3)(a, b);
}

__m128 _mm_movelh_ps (__m128 a, __m128 b) pure @safe
{
    return shufflevector!(float4, 0, 1, 4, 5)(a, b);
}


version(LDC)
{
    alias _mm_movemask_ps = __builtin_ia32_movmskps;
}
else
{
    int _mm_movemask_ps (__m128 a) pure @safe
    {
        int4 ai = cast(int4)a;
        int r = 0;
        if (ai[0] < 0) r += 1;
        if (ai[1] < 0) r += 2;
        if (ai[2] < 0) r += 4;
        if (ai[3] < 0) r += 8;
        return r;
    }
}
unittest
{
    int4 A = [-1, 0, -43, 0];
    assert(5 == _mm_movemask_ps(cast(float4)A));
}

__m128 _mm_mul_ps(__m128 a, __m128 b) pure @safe
{
    return a * b;
}
unittest
{
    __m128 a = [1.5f, -2.0f, 3.0f, 1.0f];
    a = _mm_mul_ps(a, a);
    float[4] correct = [2.25f, 4.0f, 9.0f, 1.0f];
    assert(a.array == correct);
}

__m128 _mm_mul_ss(__m128 a, __m128 b) pure @safe
{
    a[0] *= b[0];
    return a;
}
unittest
{
    __m128 a = [1.5f, -2.0f, 3.0f, 1.0f];
    a = _mm_mul_ss(a, a);
    float[4] correct = [2.25f, -2.0f, 3.0f, 1.0f];
    assert(a.array == correct);
}

// MMXREG: _mm_mulhi_pu16

__m128 _mm_or_ps (__m128 a, __m128 b) pure @safe
{
    return cast(__m128)(cast(__m128i)a | cast(__m128i)b);
}

// MMXREG: __m64 _m_pavgb (__m64 a, __m64 b)
// MMXREG: __m64 _m_pavgw (__m64 a, __m64 b)
// MMXREG: int _m_pextrw (__m64 a, int imm8)
// MMXREG: __m64 _m_pinsrw (__m64 a, int i, int imm8)
// MMXREG: __m64 _m_pmaxsw (__m64 a, __m64 b)
// MMXREG: __m64 _m_pmaxub (__m64 a, __m64 b)
// MMXREG: __m64 _m_pminsw (__m64 a, __m64 b)
// MMXREG: __m64 _m_pminub (__m64 a, __m64 b)
// MMXREG: int _m_pmovmskb (__m64 a)

// MMXREG: __m64 _m_pmulhuw (__m64 a, __m64 b)

enum _MM_HINT_NTA = 0;
enum _MM_HINT_T0 = 1;
enum _MM_HINT_T1 = 2;
enum _MM_HINT_T2 = 3;

// Note: the locality hint must be given at compile-time, unlike with the Intel Intrinsics API.
void _mm_prefetch(int locality)(void* p) pure @safe
{
    llvm_prefetch(p, 0, locality, 1);
}
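// Illustrative usage example (not from the original test suite): because the locality
// hint is a template argument here, it is passed with `!`, e.g. _mm_prefetch!_MM_HINT_T0.
unittest
{
    align(16) float[4] data = [1.0f, 2.0f, 3.0f, 4.0f];
    _mm_prefetch!_MM_HINT_T0(data.ptr);  // prefetch into all cache levels
    _mm_prefetch!_MM_HINT_NTA(data.ptr); // non-temporal prefetch hint
}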
// MMXREG: __m64 _m_psadbw (__m64 a, __m64 b)
// MMXREG: __m64 _m_pshufw (__m64 a, int imm8)

version(LDC)
{
    alias _mm_rcp_ps = __builtin_ia32_rcpps;
}
else
{
    __m128 _mm_rcp_ps (__m128 a) pure @safe
    {
        a[0] = 1.0f / a[0];
        a[1] = 1.0f / a[1];
        a[2] = 1.0f / a[2];
        a[3] = 1.0f / a[3];
        return a;
    }
}

version(LDC)
{
    alias _mm_rcp_ss = __builtin_ia32_rcpss;
}
else
{
    __m128 _mm_rcp_ss (__m128 a) pure @safe
    {
        a[0] = 1.0f / a[0];
        return a;
    }
}

version(LDC)
{
    alias _mm_rsqrt_ps = __builtin_ia32_rsqrtps;
}
else
{
    __m128 _mm_rsqrt_ps (__m128 a) pure @safe
    {
        a[0] = 1.0f / sqrt(a[0]);
        a[1] = 1.0f / sqrt(a[1]);
        a[2] = 1.0f / sqrt(a[2]);
        a[3] = 1.0f / sqrt(a[3]);
        return a;
    }
}

version(LDC)
{
    alias _mm_rsqrt_ss = __builtin_ia32_rsqrtss;
}
else
{
    __m128 _mm_rsqrt_ss (__m128 a) pure @safe
    {
        a[0] = 1.0f / sqrt(a[0]);
        return a;
    }
}

unittest
{
    double maxRelativeError = 0.000245; // -72 dB
    void testInvSqrt(float number)
    {
        __m128 A = _mm_set1_ps(number);

        // test _mm_rcp_ps
        __m128 B = _mm_rcp_ps(A);
        foreach(i; 0..4)
        {
            double exact = 1.0f / A[i];
            double ratio = cast(double)(B[i]) / cast(double)(exact);
            assert(fabs(ratio - 1) <= maxRelativeError);
        }

        // test _mm_rcp_ss
        {
            B = _mm_rcp_ss(A);
            double exact = 1.0f / A[0];
            double ratio = cast(double)(B[0]) / cast(double)(exact);
            assert(fabs(ratio - 1) <= maxRelativeError);
        }

        // test _mm_rsqrt_ps
        B = _mm_rsqrt_ps(A);
        foreach(i; 0..4)
        {
            double exact = 1.0f / sqrt(A[i]);
            double ratio = cast(double)(B[i]) / cast(double)(exact);
            assert(fabs(ratio - 1) <= maxRelativeError);
        }

        // test _mm_rsqrt_ss
        {
            B = _mm_rsqrt_ss(A);
            double exact = 1.0f / sqrt(A[0]);
            double ratio = cast(double)(B[0]) / cast(double)(exact);
            assert(fabs(ratio - 1) <= maxRelativeError);
        }
    }

    testInvSqrt(1.1f);
    testInvSqrt(2.45674864151f);
    testInvSqrt(27841456468.0f);
}

// MMXREG: _mm_sad_pu8

void _MM_SET_EXCEPTION_MASK(int _MM_MASK_xxxx) pure @safe
{
    _mm_setcsr((_mm_getcsr() & ~_MM_MASK_MASK) | _MM_MASK_xxxx);
}

void _MM_SET_EXCEPTION_STATE(int _MM_EXCEPT_xxxx) pure @safe
{
    _mm_setcsr((_mm_getcsr() & ~_MM_EXCEPT_MASK) | _MM_EXCEPT_xxxx);
}

void _MM_SET_FLUSH_ZERO_MODE(int _MM_FLUSH_xxxx) pure @safe
{
    _mm_setcsr((_mm_getcsr() & ~_MM_FLUSH_ZERO_MASK) | _MM_FLUSH_xxxx);
}

__m128 _mm_set_ps (float e3, float e2, float e1, float e0) pure @trusted
{
    // Note: despite appearances, generates sensible code,
    //       inlines correctly and is constant folded
    float[4] result = [e0, e1, e2, e3];
    return loadUnaligned!(float4)(result.ptr);
}

alias _mm_set_ps1 = _mm_set1_ps;

void _MM_SET_ROUNDING_MODE(int _MM_ROUND_xxxx) pure @safe
{
    _mm_setcsr((_mm_getcsr() & ~_MM_ROUND_MASK) | _MM_ROUND_xxxx);
}

__m128 _mm_set_ss (float a) pure @trusted
{
    __m128 r = _mm_setzero_ps();
    r[0] = a;
    return r;
}

__m128 _mm_set1_ps (float a) pure @trusted
{
    // Note: despite appearances, generates sensible code,
    //       inlines correctly and is constant folded
    float[4] result = [a, a, a, a];
    return loadUnaligned!(float4)(result.ptr);
}
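// Illustrative usage example (not from the original test suite): _mm_set_ps takes
// elements from e3 down to e0, so the last argument lands in lane 0, while
// _mm_set_ss fills only lane 0 and zeroes the rest.
unittest
{
    __m128 A = _mm_set_ps(4.0f, 3.0f, 2.0f, 1.0f);
    assert(A.array == [1.0f, 2.0f, 3.0f, 4.0f]);
    __m128 S = _mm_set_ss(42.0f);
    assert(S.array == [42.0f, 0.0f, 0.0f, 0.0f]);
}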
void _mm_setcsr(uint controlWord) pure @safe
{
    version (InlineX86Asm)
    {
        asm pure nothrow @nogc @safe
        {
            ldmxcsr controlWord;
        }
    }
    else
        static assert(0, "Not yet supported");
}

__m128 _mm_setr_ps (float e3, float e2, float e1, float e0) pure @trusted
{
    float[4] result = [e3, e2, e1, e0];
    return loadUnaligned!(float4)(result.ptr);
}

__m128 _mm_setzero_ps() pure @trusted
{
    // Compiles to xorps without problems
    float[4] result = [0.0f, 0.0f, 0.0f, 0.0f];
    return loadUnaligned!(float4)(result.ptr);
}

version(LDC)
{
    alias _mm_sfence = __builtin_ia32_sfence;
}
else
{
    void _mm_sfence() pure @safe
    {
        asm nothrow @nogc pure @safe
        {
            sfence;
        }
    }
}
unittest
{
    _mm_sfence();
}

// MMXREG: mm_shuffle_pi16

// Note: the immediate shuffle value is given at compile-time instead of runtime.
__m128 _mm_shuffle_ps(ubyte imm)(__m128 a, __m128 b) pure @safe
{
    return shufflevector!(__m128, imm & 3, (imm>>2) & 3, 4 + ((imm>>4) & 3), 4 + ((imm>>6) & 3) )(a, b);
}
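// Illustrative usage example (not from the original test suite): the 8-bit shuffle
// immediate is passed as a template argument. Each 2-bit field selects a lane:
// the two low fields pick from `a`, the two high fields pick from `b`.
unittest
{
    __m128 A = _mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f);
    __m128 B = _mm_setr_ps(5.0f, 6.0f, 7.0f, 8.0f);
    // imm = 0b01_00_10_11 selects a[3], a[2], b[0], b[1]
    __m128 R = _mm_shuffle_ps!0b01_00_10_11(A, B);
    assert(R.array == [4.0f, 3.0f, 5.0f, 6.0f]);
}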
version(LDC)
{
    // Disappeared with LDC 1.11
    static if (__VERSION__ < 2081)
        alias _mm_sqrt_ps = __builtin_ia32_sqrtps;
    else
    {
        __m128 _mm_sqrt_ps(__m128 vec) pure @safe
        {
            vec.array[0] = llvm_sqrt(vec.array[0]);
            vec.array[1] = llvm_sqrt(vec.array[1]);
            vec.array[2] = llvm_sqrt(vec.array[2]);
            vec.array[3] = llvm_sqrt(vec.array[3]);
            return vec;
        }
    }
}
else
{
    __m128 _mm_sqrt_ps(__m128 vec) pure @safe
    {
        vec.array[0] = sqrt(vec.array[0]);
        vec.array[1] = sqrt(vec.array[1]);
        vec.array[2] = sqrt(vec.array[2]);
        vec.array[3] = sqrt(vec.array[3]);
        return vec;
    }
}
unittest
{
    __m128 A = _mm_sqrt_ps(_mm_set1_ps(4.0f));
    assert(A.array[0] == 2.0f);
    assert(A.array[1] == 2.0f);
    assert(A.array[2] == 2.0f);
    assert(A.array[3] == 2.0f);
}

version(LDC)
{
    // Disappeared with LDC 1.11
    static if (__VERSION__ < 2081)
        alias _mm_sqrt_ss = __builtin_ia32_sqrtss;
    else
    {
        __m128 _mm_sqrt_ss(__m128 vec) pure @safe
        {
            vec.array[0] = llvm_sqrt(vec.array[0]);
            vec.array[1] = vec.array[1];
            vec.array[2] = vec.array[2];
            vec.array[3] = vec.array[3];
            return vec;
        }
    }
}
else
{
    __m128 _mm_sqrt_ss(__m128 vec) pure @safe
    {
        vec.array[0] = sqrt(vec.array[0]);
        return vec;
    }
}
unittest
{
    __m128 A = _mm_sqrt_ss(_mm_set1_ps(4.0f));
    assert(A.array[0] == 2.0f);
    assert(A.array[1] == 4.0f);
    assert(A.array[2] == 4.0f);
    assert(A.array[3] == 4.0f);
}

void _mm_store_ps (float* mem_addr, __m128 a) pure // not safe since nothing guarantees alignment
{
    __m128* aligned = cast(__m128*)mem_addr;
    *aligned = a;
}

alias _mm_store_ps1 = _mm_store1_ps;

void _mm_store_ss (float* mem_addr, __m128 a) pure @safe
{
    *mem_addr = a[0];
}

void _mm_store1_ps (float* mem_addr, __m128 a) pure // not safe since nothing guarantees alignment
{
    __m128* aligned = cast(__m128*)mem_addr;
    *aligned = shufflevector!(__m128, 0, 0, 0, 0)(a, a);
}

void _mm_storeh_pi(__m64* p, __m128 a) pure @safe
{
    *p = extractelement!(long2, 1)(a);
}

void _mm_storel_pi(__m64* p, __m128 a) pure @safe
{
    *p = extractelement!(long2, 0)(a);
}

void _mm_storer_ps(float* mem_addr, __m128 a) pure // not safe since nothing guarantees alignment
{
    __m128* aligned = cast(__m128*)mem_addr;
    *aligned = shufflevector!(__m128, 3, 2, 1, 0)(a, a);
}

void _mm_storeu_ps(float* mem_addr, __m128 a) pure @safe
{
    storeUnaligned!(float4)(a, mem_addr);
}

// MMXREG: _mm_stream_pi, does not seem possible

// BUG: can't implement non-temporal store with LDC inlineIR since !nontemporal
// needs some IR outside this function that would say:
//
//  !0 = !{ i32 1 }
//
// It's an LLVM IR metadata description.
// Regardless, non-temporal moves are really dangerous for performance...
void _mm_stream_ps (float* mem_addr, __m128 a)
{
    __m128* dest = cast(__m128*)mem_addr;
    *dest = a; // it's a regular move instead
}
unittest
{
    align(16) float[4] A;
    _mm_stream_ps(A.ptr, _mm_set1_ps(78.0f));
    assert(A[0] == 78.0f && A[1] == 78.0f && A[2] == 78.0f && A[3] == 78.0f);
}

__m128 _mm_sub_ps(__m128 a, __m128 b) pure @safe
{
    return a - b;
}
unittest
{
    __m128 a = [1.5f, -2.0f, 3.0f, 1.0f];
    a = _mm_sub_ps(a, a);
    float[4] correct = [0.0f, 0.0f, 0.0f, 0.0f];
    assert(a.array == correct);
}

__m128 _mm_sub_ss(__m128 a, __m128 b) pure @safe
{
    a[0] -= b[0];
    return a;
}
unittest
{
    __m128 a = [1.5f, -2.0f, 3.0f, 1.0f];
    a = _mm_sub_ss(a, a);
    float[4] correct = [0.0f, -2.0, 3.0f, 1.0f];
    assert(a.array == correct);
}


void _MM_TRANSPOSE4_PS (ref __m128 row0, ref __m128 row1, ref __m128 row2, ref __m128 row3) pure @safe
{
    __m128 tmp3, tmp2, tmp1, tmp0;
    tmp0 = _mm_unpacklo_ps(row0, row1);
    tmp2 = _mm_unpacklo_ps(row2, row3);
    tmp1 = _mm_unpackhi_ps(row0, row1);
    tmp3 = _mm_unpackhi_ps(row2, row3);
    row0 = _mm_movelh_ps(tmp0, tmp2);
    row1 = _mm_movehl_ps(tmp2, tmp0);
    row2 = _mm_movelh_ps(tmp1, tmp3);
    row3 = _mm_movehl_ps(tmp3, tmp1);
}
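// Illustrative usage example (not from the original test suite): _MM_TRANSPOSE4_PS
// transposes a 4x4 matrix stored as four rows, in place.
unittest
{
    __m128 R0 = _mm_setr_ps( 1.0f,  2.0f,  3.0f,  4.0f);
    __m128 R1 = _mm_setr_ps( 5.0f,  6.0f,  7.0f,  8.0f);
    __m128 R2 = _mm_setr_ps( 9.0f, 10.0f, 11.0f, 12.0f);
    __m128 R3 = _mm_setr_ps(13.0f, 14.0f, 15.0f, 16.0f);
    _MM_TRANSPOSE4_PS(R0, R1, R2, R3);
    assert(R0.array == [1.0f, 5.0f,  9.0f, 13.0f]);
    assert(R1.array == [2.0f, 6.0f, 10.0f, 14.0f]);
    assert(R2.array == [3.0f, 7.0f, 11.0f, 15.0f]);
    assert(R3.array == [4.0f, 8.0f, 12.0f, 16.0f]);
}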
// Note: the only difference between these intrinsics is the signalling
//       behaviour of quiet NaNs. This is incorrect, but the case where
//       you would want to differentiate between qNaN and sNaN and then
//       treat them differently on purpose seems extremely rare.
alias _mm_ucomieq_ss = _mm_comieq_ss;
alias _mm_ucomige_ss = _mm_comige_ss;
alias _mm_ucomigt_ss = _mm_comigt_ss;
alias _mm_ucomile_ss = _mm_comile_ss;
alias _mm_ucomilt_ss = _mm_comilt_ss;
alias _mm_ucomineq_ss = _mm_comineq_ss;


__m128 _mm_undefined_ps() pure @safe
{
    __m128 undef = void;
    return undef;
}

__m128 _mm_unpackhi_ps (__m128 a, __m128 b) pure @safe
{
    return shufflevector!(float4, 2, 6, 3, 7)(a, b);
}

__m128 _mm_unpacklo_ps (__m128 a, __m128 b) pure @safe
{
    return shufflevector!(float4, 0, 4, 1, 5)(a, b);
}

__m128 _mm_xor_ps (__m128 a, __m128 b) pure @safe
{
    return cast(__m128)(cast(__m128i)a ^ cast(__m128i)b);
}


private
{
    /// Returns: `true` if the pointer is suitably aligned.
    bool isPointerAligned(void* p, size_t alignment) pure
    {
        assert(alignment != 0);
        return ( cast(size_t)p & (alignment - 1) ) == 0;
    }

    /// Returns: next pointer aligned with alignment bytes.
    void* nextAlignedPointer(void* start, size_t alignment) pure
    {
        return cast(void*)nextMultipleOf(cast(size_t)(start), alignment);
    }

    // Returns number of bytes to actually allocate when asking
    // for a particular alignment
    @nogc size_t requestedSize(size_t askedSize, size_t alignment) pure
    {
        enum size_t pointerSize = size_t.sizeof;
        return askedSize + alignment - 1 + pointerSize * 3;
    }

    // Store the pointer given by malloc, the requested size and the alignment
    // just before the aligned area.
    @nogc void* storeRawPointerPlusInfo(void* raw, size_t size, size_t alignment) pure
    {
        enum size_t pointerSize = size_t.sizeof;
        char* start = cast(char*)raw + pointerSize * 3;
        void* aligned = nextAlignedPointer(start, alignment);
        void** rawLocation = cast(void**)(cast(char*)aligned - pointerSize);
        *rawLocation = raw;
        size_t* sizeLocation = cast(size_t*)(cast(char*)aligned - 2 * pointerSize);
        *sizeLocation = size;
        size_t* alignmentLocation = cast(size_t*)(cast(char*)aligned - 3 * pointerSize);
        *alignmentLocation = alignment;
        assert( isPointerAligned(aligned, alignment) );
        return aligned;
    }

    // Returns: x, multiple of powerOfTwo, so that x >= n.
    @nogc size_t nextMultipleOf(size_t n, size_t powerOfTwo) pure nothrow
    {
        // check power-of-two
        assert( (powerOfTwo != 0) && ((powerOfTwo & (powerOfTwo - 1)) == 0));

        size_t mask = ~(powerOfTwo - 1);
        return (n + powerOfTwo - 1) & mask;
    }
}

unittest
{
    assert(nextMultipleOf(0, 4) == 0);
    assert(nextMultipleOf(1, 4) == 4);
    assert(nextMultipleOf(2, 4) == 4);
    assert(nextMultipleOf(3, 4) == 4);
    assert(nextMultipleOf(4, 4) == 4);
    assert(nextMultipleOf(5, 4) == 8);

    {
        void* p = _mm_malloc(23, 16);
        assert(p !is null);
        assert(((cast(size_t)p) & 0xf) == 0);
        _mm_free(p);
    }

    void* nullAlloc = _mm_malloc(0, 32);
    assert(nullAlloc != null);
    _mm_free(nullAlloc);
}