/**
* Copyright: Copyright Auburn Sounds 2016-2019.
* License:   $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
* Authors:   Guillaume Piolat
*/
module inteli.xmmintrin;

public import inteli.types;

import inteli.internals;

import inteli.mmx;

import core.stdc.stdlib: malloc, free;
import core.exception: onOutOfMemoryError;

version(D_InlineAsm_X86)
    version = InlineX86Asm;
else version(D_InlineAsm_X86_64)
    version = InlineX86Asm;


// SSE1

nothrow @nogc:


enum int _MM_EXCEPT_INVALID    = 0x0001;
enum int _MM_EXCEPT_DENORM     = 0x0002;
enum int _MM_EXCEPT_DIV_ZERO   = 0x0004;
enum int _MM_EXCEPT_OVERFLOW   = 0x0008;
enum int _MM_EXCEPT_UNDERFLOW  = 0x0010;
enum int _MM_EXCEPT_INEXACT    = 0x0020;
enum int _MM_EXCEPT_MASK       = 0x003f;

enum int _MM_MASK_INVALID      = 0x0080;
enum int _MM_MASK_DENORM       = 0x0100;
enum int _MM_MASK_DIV_ZERO     = 0x0200;
enum int _MM_MASK_OVERFLOW     = 0x0400;
enum int _MM_MASK_UNDERFLOW    = 0x0800;
enum int _MM_MASK_INEXACT      = 0x1000;
enum int _MM_MASK_MASK         = 0x1f80;

enum int _MM_ROUND_NEAREST     = 0x0000;
enum int _MM_ROUND_DOWN        = 0x2000;
enum int _MM_ROUND_UP          = 0x4000;
enum int _MM_ROUND_TOWARD_ZERO = 0x6000;
enum int _MM_ROUND_MASK        = 0x6000;

enum int _MM_FLUSH_ZERO_MASK   = 0x8000;
enum int _MM_FLUSH_ZERO_ON     = 0x8000;
enum int _MM_FLUSH_ZERO_OFF    = 0x0000;

__m128 _mm_add_ps(__m128 a, __m128 b) pure @safe
{
    return a + b;
}

unittest
{
    __m128 a = [1, 2, 3, 4];
    a = _mm_add_ps(a, a);
    assert(a.array[0] == 2);
    assert(a.array[1] == 4);
    assert(a.array[2] == 6);
    assert(a.array[3] == 8);
}

__m128 _mm_add_ss(__m128 a, __m128 b) pure @safe
{
    a[0] += b[0];
    return a;
}
unittest
{
    __m128 a = [1, 2, 3, 4];
    a = _mm_add_ss(a, a);
    assert(a.array == [2.0f, 2, 3, 4]);
}

__m128 _mm_and_ps (__m128 a, __m128 b) pure @safe
{
    return cast(__m128)(cast(__m128i)a & cast(__m128i)b);
}
unittest
{
    // Note: tested in emmintrin.d
}

__m128 _mm_andnot_ps (__m128 a, __m128 b) pure @safe
{
    return cast(__m128)( (~cast(__m128i)a) & cast(__m128i)b );
}


// TODO: _mm_avg_pu16
// TODO: _mm_avg_pu8

__m128 _mm_cmpeq_ps (__m128 a, __m128 b) pure @safe
{
    return cast(__m128) cmpps!(FPComparison.oeq)(a, b);
}

__m128 _mm_cmpeq_ss (__m128 a, __m128 b) pure @safe
{
    return cast(__m128) cmpss!(FPComparison.oeq)(a, b);
}

__m128 _mm_cmpge_ps (__m128 a, __m128 b) pure @safe
{
    return cast(__m128) cmpps!(FPComparison.oge)(a, b);
}

__m128 _mm_cmpge_ss (__m128 a, __m128 b) pure @safe
{
    return cast(__m128) cmpss!(FPComparison.oge)(a, b);
}

__m128 _mm_cmpgt_ps (__m128 a, __m128 b) pure @safe
{
    return cast(__m128) cmpps!(FPComparison.ogt)(a, b);
}

__m128 _mm_cmpgt_ss (__m128 a, __m128 b) pure @safe
{
    return cast(__m128) cmpss!(FPComparison.ogt)(a, b);
}

__m128 _mm_cmple_ps (__m128 a, __m128 b) pure @safe
{
    return cast(__m128) cmpps!(FPComparison.ole)(a, b);
}

__m128 _mm_cmple_ss (__m128 a, __m128 b) pure @safe
{
    return cast(__m128) cmpss!(FPComparison.ole)(a, b);
}

__m128 _mm_cmplt_ps (__m128 a, __m128 b) pure @safe
{
    return cast(__m128) cmpps!(FPComparison.olt)(a, b);
}

__m128 _mm_cmplt_ss (__m128 a, __m128 b) pure @safe
{
    return cast(__m128) cmpss!(FPComparison.olt)(a, b);
}
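
// Illustrative example (an addition, not part of the original test suite): the packed
// comparisons above return a per-lane mask, all bits set when the predicate holds and
// all bits clear otherwise, which is why results are typically reinterpreted as integers.
unittest
{
    __m128 a = _mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f);
    __m128 b = _mm_setr_ps(4.0f, 3.0f, 2.0f, 1.0f);
    int4 mask = cast(int4) _mm_cmplt_ps(a, b); // lane-wise a < b
    assert(mask.array == [-1, -1, 0, 0]);      // -1 is the all-ones 32-bit mask
}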
__m128 _mm_cmpneq_ps (__m128 a, __m128 b) pure @safe
{
    return cast(__m128) cmpps!(FPComparison.une)(a, b);
}

__m128 _mm_cmpneq_ss (__m128 a, __m128 b) pure @safe
{
    return cast(__m128) cmpss!(FPComparison.une)(a, b);
}

__m128 _mm_cmpnge_ps (__m128 a, __m128 b) pure @safe
{
    return cast(__m128) cmpps!(FPComparison.ult)(a, b);
}

__m128 _mm_cmpnge_ss (__m128 a, __m128 b) pure @safe
{
    return cast(__m128) cmpss!(FPComparison.ult)(a, b);
}

__m128 _mm_cmpngt_ps (__m128 a, __m128 b) pure @safe
{
    return cast(__m128) cmpps!(FPComparison.ule)(a, b);
}

__m128 _mm_cmpngt_ss (__m128 a, __m128 b) pure @safe
{
    return cast(__m128) cmpss!(FPComparison.ule)(a, b);
}

__m128 _mm_cmpnle_ps (__m128 a, __m128 b) pure @safe
{
    return cast(__m128) cmpps!(FPComparison.ugt)(a, b);
}

__m128 _mm_cmpnle_ss (__m128 a, __m128 b) pure @safe
{
    return cast(__m128) cmpss!(FPComparison.ugt)(a, b);
}

__m128 _mm_cmpnlt_ps (__m128 a, __m128 b) pure @safe
{
    return cast(__m128) cmpps!(FPComparison.uge)(a, b);
}

__m128 _mm_cmpnlt_ss (__m128 a, __m128 b) pure @safe
{
    return cast(__m128) cmpss!(FPComparison.uge)(a, b);
}

__m128 _mm_cmpord_ps (__m128 a, __m128 b) pure @safe
{
    return cast(__m128) cmpps!(FPComparison.ord)(a, b);
}

__m128 _mm_cmpord_ss (__m128 a, __m128 b) pure @safe
{
    return cast(__m128) cmpss!(FPComparison.ord)(a, b);
}

__m128 _mm_cmpunord_ps (__m128 a, __m128 b) pure @safe
{
    return cast(__m128) cmpps!(FPComparison.uno)(a, b);
}

__m128 _mm_cmpunord_ss (__m128 a, __m128 b) pure @safe
{
    return cast(__m128) cmpss!(FPComparison.uno)(a, b);
}

// Note: we've reverted to clang and GCC behaviour with regard to EFLAGS.
// Some of these comparisons yield true for NaNs, others don't.

int _mm_comieq_ss (__m128 a, __m128 b) pure @safe // comiss + sete
{
    return comss!(FPComparison.ueq)(a, b); // yields true for NaN!
}

int _mm_comige_ss (__m128 a, __m128 b) pure @safe // comiss + setae
{
    return comss!(FPComparison.oge)(a, b);
}

int _mm_comigt_ss (__m128 a, __m128 b) pure @safe // comiss + seta
{
    return comss!(FPComparison.ogt)(a, b);
}

int _mm_comile_ss (__m128 a, __m128 b) pure @safe // comiss + setbe
{
    return comss!(FPComparison.ule)(a, b); // yields true for NaN!
}

int _mm_comilt_ss (__m128 a, __m128 b) pure @safe // comiss + setb
{
    return comss!(FPComparison.ult)(a, b); // yields true for NaN!
}

int _mm_comineq_ss (__m128 a, __m128 b) pure @safe // comiss + setne
{
    return comss!(FPComparison.one)(a, b);
}


// TODO: __m128 _mm_cvt_pi2ps (__m128 a, __m64 b)
// TODO: __m64 _mm_cvt_ps2pi (__m128 a)


__m128 _mm_cvt_si2ss(__m128 v, int x) pure @safe
{
    v[0] = cast(float)x;
    return v;
}
unittest
{
    __m128 a = _mm_cvt_si2ss(_mm_set1_ps(0.0f), 42);
    assert(a.array == [42f, 0, 0, 0]);
}

// Note: is just another name for _mm_cvtss_si32
alias _mm_cvt_ss2si = _mm_cvtss_si32;


// TODO: __m128 _mm_cvtpi16_ps (__m64 a)
// TODO: __m128 _mm_cvtpi32_ps (__m128 a, __m64 b)
// TODO: __m128 _mm_cvtpi32x2_ps (__m64 a, __m64 b)
// TODO: __m128 _mm_cvtpi8_ps (__m64 a)
// TODO: __m64 _mm_cvtps_pi16 (__m128 a)
// TODO: __m64 _mm_cvtps_pi32 (__m128 a)
// TODO: __m64 _mm_cvtps_pi8 (__m128 a)
// TODO: __m128 _mm_cvtpu16_ps (__m64 a)
// TODO: __m128 _mm_cvtpu8_ps (__m64 a)

__m128 _mm_cvtsi32_ss(__m128 v, int x) pure @safe
{
    v[0] = cast(float)x;
    return v;
}
unittest
{
    __m128 a = _mm_cvtsi32_ss(_mm_set1_ps(0.0f), 42);
    assert(a.array == [42.0f, 0, 0, 0]);
}

// Note: on macOS, using "llvm.x86.sse.cvtsi642ss" was buggy
__m128 _mm_cvtsi64_ss(__m128 v, long x) pure @safe
{
    v[0] = cast(float)x;
    return v;
}
unittest
{
    __m128 a = _mm_cvtsi64_ss(_mm_set1_ps(0.0f), 42);
    assert(a.array == [42.0f, 0, 0, 0]);
}

float _mm_cvtss_f32(__m128 a) pure @safe
{
    return a[0];
}

version(LDC)
{
    alias _mm_cvtss_si32 = __builtin_ia32_cvtss2si;
}
else
{
    int _mm_cvtss_si32 (__m128 a) pure @safe
    {
        return convertFloatToInt32UsingMXCSR(a[0]);
    }
}
unittest
{
    assert(1 == _mm_cvtss_si32(_mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f)));
}

version(LDC)
{
    version(X86_64)
        alias _mm_cvtss_si64 = __builtin_ia32_cvtss2si64;
    else
    {
        // Note: __builtin_ia32_cvtss2si64 crashes LDC in 32-bit
        long _mm_cvtss_si64 (__m128 a) pure @safe
        {
            return convertFloatToInt64UsingMXCSR(a[0]);
        }
    }
}
else
{
    long _mm_cvtss_si64 (__m128 a) pure @safe
    {
        return convertFloatToInt64UsingMXCSR(a[0]);
    }
}
unittest
{
    assert(1 == _mm_cvtss_si64(_mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f)));

    uint savedRounding = _MM_GET_ROUNDING_MODE();

    _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST);
    assert(-86186 == _mm_cvtss_si64(_mm_set1_ps(-86186.5f)));

    _MM_SET_ROUNDING_MODE(_MM_ROUND_DOWN);
    assert(-86187 == _mm_cvtss_si64(_mm_set1_ps(-86186.1f)));

    _MM_SET_ROUNDING_MODE(_MM_ROUND_UP);
    assert(86187 == _mm_cvtss_si64(_mm_set1_ps(86186.1f)));

    _MM_SET_ROUNDING_MODE(_MM_ROUND_TOWARD_ZERO);
    assert(-86186 == _mm_cvtss_si64(_mm_set1_ps(-86186.9f)));

    _MM_SET_ROUNDING_MODE(savedRounding);
}


version(LDC)
{
    alias _mm_cvtt_ss2si = __builtin_ia32_cvttss2si;
}
else
{
    int _mm_cvtt_ss2si (__m128 a) pure @safe
    {
        return cast(int)(a[0]);
    }
}
unittest
{
    assert(1 == _mm_cvtt_ss2si(_mm_setr_ps(1.9f, 2.0f, 3.0f, 4.0f)));
}

// TODO: __m64 _mm_cvtt_ps2pi (__m128 a)

alias _mm_cvttss_si32 = _mm_cvtt_ss2si; // it's actually the same op

// Note: __builtin_ia32_cvttss2si64 crashes LDC when generating 32-bit x86 code.
long _mm_cvttss_si64 (__m128 a) pure @safe
{
    return cast(long)(a[0]); // Generates cvttss2si as expected
}
unittest
{
    assert(1 == _mm_cvttss_si64(_mm_setr_ps(1.9f, 2.0f, 3.0f, 4.0f)));
}

__m128 _mm_div_ps(__m128 a, __m128 b) pure @safe
{
    return a / b;
}
unittest
{
    __m128 a = [1.5f, -2.0f, 3.0f, 1.0f];
    a = _mm_div_ps(a, a);
    float[4] correct = [1.0f, 1.0f, 1.0f, 1.0f];
    assert(a.array == correct);
}

__m128 _mm_div_ss(__m128 a, __m128 b) pure @safe
{
    a[0] /= b[0];
    return a;
}
unittest
{
    __m128 a = [1.5f, -2.0f, 3.0f, 1.0f];
    a = _mm_div_ss(a, a);
    float[4] correct = [1.0f, -2.0, 3.0f, 1.0f];
    assert(a.array == correct);
}

// TODO: int _mm_extract_pi16 (__m64 a, int imm8)

/// Free aligned memory that was allocated with `_mm_malloc`.
void _mm_free(void * mem_addr) @trusted
{
    // support for free(NULL)
    if (mem_addr is null)
        return;

    // Technically we don't need to store size and alignment in the chunk, but we do in case we
    // have to implement _mm_realloc

    size_t pointerSize = (void*).sizeof;
    void** rawLocation = cast(void**)(cast(char*)mem_addr - size_t.sizeof);
    size_t* alignmentLocation = cast(size_t*)(cast(char*)mem_addr - 3 * pointerSize);
    size_t alignment = *alignmentLocation;
    assert(alignment != 0);
    assert(isPointerAligned(mem_addr, alignment));
    free(*rawLocation);
}

uint _MM_GET_EXCEPTION_MASK() pure @safe
{
    return _mm_getcsr() & _MM_MASK_MASK;
}

uint _MM_GET_EXCEPTION_STATE() pure @safe
{
    return _mm_getcsr() & _MM_EXCEPT_MASK;
}

uint _MM_GET_FLUSH_ZERO_MODE() pure @safe
{
    return _mm_getcsr() & _MM_FLUSH_ZERO_MASK;
}

uint _MM_GET_ROUNDING_MODE() pure @safe
{
    return _mm_getcsr() & _MM_ROUND_MASK;
}

uint _mm_getcsr() pure @safe
{
    version (InlineX86Asm)
    {
        uint controlWord;
        asm nothrow @nogc pure @safe
        {
            stmxcsr controlWord;
        }
        return controlWord;
    }
    else
        static assert(0, "Not yet supported");
}

// TODO: __m64 _mm_insert_pi16 (__m64 a, int i, int imm8)

__m128 _mm_load_ps(const(float)*p) pure @trusted
{
    return *cast(__m128*)p;
}

__m128 _mm_load_ps1(const(float)*p) pure @trusted
{
    float[4] f = [ *p, *p, *p, *p ];
    return loadUnaligned!(float4)(f.ptr);
}

__m128 _mm_load_ss (const(float)* mem_addr) pure @trusted
{
    float[4] f = [ *mem_addr, 0.0f, 0.0f, 0.0f ];
    return loadUnaligned!(float4)(f.ptr);
}

alias _mm_load1_ps = _mm_load_ps1;

__m128 _mm_loadh_pi (__m128 a, const(__m64)* mem_addr) pure @safe
{
    long2 la = cast(long2)a;
    la[1] = (*mem_addr)[0];
    return cast(__m128)la;
}

__m128 _mm_loadl_pi (__m128 a, const(__m64)* mem_addr) pure @safe
{
    long2 la = cast(long2)a;
    la[0] = (*mem_addr)[0];
    return cast(__m128)la;
}

__m128 _mm_loadr_ps (const(float)* mem_addr) pure @trusted
{
    __m128* aligned = cast(__m128*)mem_addr;
    __m128 a = *aligned;
    return shufflevector!(__m128, 3, 2, 1, 0)(a, a);
}

__m128 _mm_loadu_ps(const(float)*p) pure @safe
{
    return loadUnaligned!(__m128)(p);
}

__m128i _mm_loadu_si16(const(void)* mem_addr) pure @trusted
{
    short r = *cast(short*)(mem_addr);
    short8 result = [0, 0, 0, 0, 0, 0, 0, 0];
    result[0] = r;
    return cast(__m128i)result;
}
unittest
{
    short r = 13;
    short8 A = cast(short8) _mm_loadu_si16(&r);
    short[8] correct = [13, 0, 0, 0, 0, 0, 0, 0];
    assert(A.array == correct);
}

__m128i _mm_loadu_si64(const(void)* mem_addr) pure @trusted
{
    long r = *cast(long*)(mem_addr);
    long2 result = [0, 0];
    result[0] = r;
    return cast(__m128i)result;
}
unittest
{
    long r = 446446446446;
    long2 A = cast(long2) _mm_loadu_si64(&r);
    long[2] correct = [446446446446, 0];
    assert(A.array == correct);
}

/// Allocate `size` bytes of memory, aligned to the alignment specified in `alignment`,
/// and return a pointer to the allocated memory. `_mm_free` should be used to free
/// memory that is allocated with `_mm_malloc`.
void* _mm_malloc(size_t size, size_t alignment) @trusted
{
    assert(alignment != 0);
    size_t request = requestedSize(size, alignment);
    void* raw = malloc(request);
    if (request > 0 && raw == null) // malloc(0) can validly return anything
        onOutOfMemoryError();
    return storeRawPointerPlusInfo(raw, size, alignment); // PERF: no need to store size
}

// TODO: _mm_maskmove_si64
// TODO: _m_maskmovq

// TODO: _mm_max_pi16
version(LDC)
{
    alias _mm_max_ps = __builtin_ia32_maxps;
}
else
{
    __m128 _mm_max_ps(__m128 a, __m128 b) pure @safe
    {
        __m128 r;
        r[0] = (a[0] > b[0]) ? a[0] : b[0];
        r[1] = (a[1] > b[1]) ? a[1] : b[1];
        r[2] = (a[2] > b[2]) ? a[2] : b[2];
        r[3] = (a[3] > b[3]) ? a[3] : b[3];
        return r;
    }
}
unittest
{
    __m128 A = _mm_setr_ps(1, 2, float.nan, 4);
    __m128 B = _mm_setr_ps(4, 1, 4, float.nan);
    __m128 M = _mm_max_ps(A, B);
    assert(M[0] == 4);
    assert(M[1] == 2);
    assert(M[2] == 4);    // in case of NaN, second operand prevails (as it seems)
    assert(M[3] != M[3]); // in case of NaN, second operand prevails (as it seems)
}

// TODO: _mm_max_pu8

version(LDC)
{
    alias _mm_max_ss = __builtin_ia32_maxss;
}
else
{
    __m128 _mm_max_ss(__m128 a, __m128 b) pure @safe
    {
        __m128 r = a;
        r[0] = (a[0] > b[0]) ? a[0] : b[0];
        return r;
    }
}
unittest
{
    __m128 A = _mm_setr_ps(1, 2, 3, 4);
    __m128 B = _mm_setr_ps(4, 1, 4, 1);
    __m128 C = _mm_setr_ps(float.nan, 1, 4, 1);
    __m128 M = _mm_max_ss(A, B);
    assert(M[0] == 4);
    assert(M[1] == 2);
    assert(M[2] == 3);
    assert(M[3] == 4);
    M = _mm_max_ps(A, C); // in case of NaN, second operand prevails
    assert(M[0] != M[0]);
    M = _mm_max_ps(C, A); // in case of NaN, second operand prevails
    assert(M[0] == 1);
}

// TODO: _mm_min_pi16

version(LDC)
{
    alias _mm_min_ps = __builtin_ia32_minps;
}
else
{
    __m128 _mm_min_ps(__m128 a, __m128 b) pure @safe
    {
        __m128 r;
        r[0] = (a[0] < b[0]) ? a[0] : b[0];
        r[1] = (a[1] < b[1]) ? a[1] : b[1];
        r[2] = (a[2] < b[2]) ? a[2] : b[2];
        r[3] = (a[3] < b[3]) ? a[3] : b[3];
        return r;
    }
}
unittest
{
    __m128 A = _mm_setr_ps(1, 2, float.nan, 4);
    __m128 B = _mm_setr_ps(4, 1, 4, float.nan);
    __m128 M = _mm_min_ps(A, B);
    assert(M[0] == 1);
    assert(M[1] == 1);
    assert(M[2] == 4);    // in case of NaN, second operand prevails (as it seems)
    assert(M[3] != M[3]); // in case of NaN, second operand prevails (as it seems)
}

// TODO: _mm_min_pi8

version(LDC)
{
    alias _mm_min_ss = __builtin_ia32_minss;
}
else
{
    __m128 _mm_min_ss(__m128 a, __m128 b) pure @safe
    {
        __m128 r = a;
        r[0] = (a[0] < b[0]) ? a[0] : b[0];
        return r;
    }
}
unittest
{
    __m128 A = _mm_setr_ps(1, 2, 3, 4);
    __m128 B = _mm_setr_ps(4, 1, 4, 1);
    __m128 C = _mm_setr_ps(float.nan, 1, 4, 1);
    __m128 M = _mm_min_ss(A, B);
    assert(M[0] == 1);
    assert(M[1] == 2);
    assert(M[2] == 3);
    assert(M[3] == 4);
    M = _mm_min_ps(A, C); // in case of NaN, second operand prevails
    assert(M[0] != M[0]);
    M = _mm_min_ps(C, A); // in case of NaN, second operand prevails
    assert(M[0] == 1);
}

__m128 _mm_move_ss (__m128 a, __m128 b) pure @safe
{
    return shufflevector!(__m128, 4, 1, 2, 3)(a, b);
}

__m128 _mm_movehl_ps (__m128 a, __m128 b) pure @safe
{
    // MOVHLPS semantics: the low half takes b's high half, the high half keeps a's high half.
    return shufflevector!(float4, 6, 7, 2, 3)(a, b);
}

__m128 _mm_movelh_ps (__m128 a, __m128 b) pure @safe
{
    return shufflevector!(float4, 0, 1, 4, 5)(a, b);
}


version(LDC)
{
    alias _mm_movemask_ps = __builtin_ia32_movmskps;
}
else
{
    int _mm_movemask_ps (__m128 a) pure @safe
    {
        int4 ai = cast(int4)a;
        int r = 0;
        if (ai[0] < 0) r += 1;
        if (ai[1] < 0) r += 2;
        if (ai[2] < 0) r += 4;
        if (ai[3] < 0) r += 8;
        return r;
    }
}
unittest
{
    int4 A = [-1, 0, -43, 0];
    assert(5 == _mm_movemask_ps(cast(float4)A));
}

__m128 _mm_mul_ps(__m128 a, __m128 b) pure @safe
{
    return a * b;
}
unittest
{
    __m128 a = [1.5f, -2.0f, 3.0f, 1.0f];
    a = _mm_mul_ps(a, a);
    float[4] correct = [2.25f, 4.0f, 9.0f, 1.0f];
    assert(a.array == correct);
}

__m128 _mm_mul_ss(__m128 a, __m128 b) pure @safe
{
    a[0] *= b[0];
    return a;
}
unittest
{
    __m128 a = [1.5f, -2.0f, 3.0f, 1.0f];
    a = _mm_mul_ss(a, a);
    float[4] correct = [2.25f, -2.0f, 3.0f, 1.0f];
    assert(a.array == correct);
}

// TODO: _mm_mulhi_pu16

__m128 _mm_or_ps (__m128 a, __m128 b) pure @safe
{
    return cast(__m128)(cast(__m128i)a | cast(__m128i)b);
}

// TODO: __m64 _m_pavgb (__m64 a, __m64 b)
// TODO: __m64 _m_pavgw (__m64 a, __m64 b)
// TODO: int _m_pextrw (__m64 a, int imm8)
// TODO: __m64 _m_pinsrw (__m64 a, int i, int imm8)
// TODO: __m64 _m_pmaxsw (__m64 a, __m64 b)
// TODO: __m64 _m_pmaxub (__m64 a, __m64 b)
// TODO: __m64 _m_pminsw (__m64 a, __m64 b)
// TODO: __m64 _m_pminub (__m64 a, __m64 b)
// TODO: int _m_pmovmskb (__m64 a)

// TODO: __m64 _m_pmulhuw (__m64 a, __m64 b)

enum _MM_HINT_NTA = 0;
enum _MM_HINT_T0 = 1;
enum _MM_HINT_T1 = 2;
enum _MM_HINT_T2 = 3;

// Note: locality must be compile-time, unlike the Intel Intrinsics API
void _mm_prefetch(int locality)(void* p) pure @safe
{
    llvm_prefetch(p, 0, locality, 1);
}

// TODO: __m64 _m_psadbw (__m64 a, __m64 b)
// TODO: __m64 _m_pshufw (__m64 a, int imm8)

version(LDC)
{
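    // Added comment: RCPPS/RSQRTPS are hardware approximations with roughly 12 bits of
    // precision, which is why the unittest further below checks a relative-error bound
    // (about -72 dB) rather than exact equality.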
    alias _mm_rcp_ps = __builtin_ia32_rcpps;
}
else
{
    __m128 _mm_rcp_ps (__m128 a) pure @safe
    {
        a[0] = 1.0f / a[0];
        a[1] = 1.0f / a[1];
        a[2] = 1.0f / a[2];
        a[3] = 1.0f / a[3];
        return a;
    }
}

version(LDC)
{
    alias _mm_rcp_ss = __builtin_ia32_rcpss;
}
else
{
    __m128 _mm_rcp_ss (__m128 a) pure @safe
    {
        a[0] = 1.0f / a[0];
        return a;
    }
}

version(LDC)
{
    alias _mm_rsqrt_ps = __builtin_ia32_rsqrtps;
}
else
{
    __m128 _mm_rsqrt_ps (__m128 a) pure @safe
    {
        a[0] = 1.0f / sqrt(a[0]);
        a[1] = 1.0f / sqrt(a[1]);
        a[2] = 1.0f / sqrt(a[2]);
        a[3] = 1.0f / sqrt(a[3]);
        return a;
    }
}

version(LDC)
{
    alias _mm_rsqrt_ss = __builtin_ia32_rsqrtss;
}
else
{
    __m128 _mm_rsqrt_ss (__m128 a) pure @safe
    {
        a[0] = 1.0f / sqrt(a[0]);
        return a;
    }
}

unittest
{
    double maxRelativeError = 0.000245; // -72 dB
    void testInvSqrt(float number)
    {
        __m128 A = _mm_set1_ps(number);

        // test _mm_rcp_ps
        __m128 B = _mm_rcp_ps(A);
        foreach(i; 0..4)
        {
            double exact = 1.0f / A[i];
            double ratio = cast(double)(B[i]) / cast(double)(exact);
            assert(fabs(ratio - 1) <= maxRelativeError);
        }

        // test _mm_rcp_ss
        {
            B = _mm_rcp_ss(A);
            double exact = 1.0f / A[0];
            double ratio = cast(double)(B[0]) / cast(double)(exact);
            assert(fabs(ratio - 1) <= maxRelativeError);
        }

        // test _mm_rsqrt_ps
        B = _mm_rsqrt_ps(A);
        foreach(i; 0..4)
        {
            double exact = 1.0f / sqrt(A[i]);
            double ratio = cast(double)(B[i]) / cast(double)(exact);
            assert(fabs(ratio - 1) <= maxRelativeError);
        }

        // test _mm_rsqrt_ss
        {
            B = _mm_rsqrt_ss(A);
            double exact = 1.0f / sqrt(A[0]);
            double ratio = cast(double)(B[0]) / cast(double)(exact);
            assert(fabs(ratio - 1) <= maxRelativeError);
        }
    }

    testInvSqrt(1.1f);
    testInvSqrt(2.45674864151f);
    testInvSqrt(27841456468.0f);
}

// TODO: _mm_sad_pu8

void _MM_SET_EXCEPTION_MASK(int _MM_MASK_xxxx) pure @safe
{
    _mm_setcsr((_mm_getcsr() & ~_MM_MASK_MASK) | _MM_MASK_xxxx);
}

void _MM_SET_EXCEPTION_STATE(int _MM_EXCEPT_xxxx) pure @safe
{
    _mm_setcsr((_mm_getcsr() & ~_MM_EXCEPT_MASK) | _MM_EXCEPT_xxxx);
}

void _MM_SET_FLUSH_ZERO_MODE(int _MM_FLUSH_xxxx) pure @safe
{
    _mm_setcsr((_mm_getcsr() & ~_MM_FLUSH_ZERO_MASK) | _MM_FLUSH_xxxx);
}

__m128 _mm_set_ps (float e3, float e2, float e1, float e0) pure @trusted
{
    // Note: despite appearances, generates sensible code,
    // inlines correctly and is constant folded
    float[4] result = [e0, e1, e2, e3];
    return loadUnaligned!(float4)(result.ptr);
}

alias _mm_set_ps1 = _mm_set1_ps;

void _MM_SET_ROUNDING_MODE(int _MM_ROUND_xxxx) pure @safe
{
    _mm_setcsr((_mm_getcsr() & ~_MM_ROUND_MASK) | _MM_ROUND_xxxx);
}

__m128 _mm_set_ss (float a) pure @trusted
{
    __m128 r = _mm_setzero_ps();
    r[0] = a;
    return r;
}

__m128 _mm_set1_ps (float a) pure @trusted
{
    // Note: despite appearances, generates sensible code,
    // inlines correctly and is constant folded
    float[4] result = [a, a, a, a];
    return loadUnaligned!(float4)(result.ptr);
}

void _mm_setcsr(uint controlWord) pure @safe
{
    version (InlineX86Asm)
    {
        asm pure nothrow @nogc @safe
        {
            ldmxcsr controlWord;
        }
    }
    else
        static assert(0, "Not yet supported");
}

__m128 _mm_setr_ps (float e3, float e2, float e1, float e0) pure @trusted
{
    float[4] result = [e3, e2, e1, e0];
    return loadUnaligned!(float4)(result.ptr);
}

__m128 _mm_setzero_ps() pure @trusted
{
    // Compiles to xorps without problems
    float[4] result = [0.0f, 0.0f, 0.0f, 0.0f];
    return loadUnaligned!(float4)(result.ptr);
}

version(LDC)
{
    alias _mm_sfence = __builtin_ia32_sfence;
}
else
{
    void _mm_sfence() pure @safe
    {
        asm nothrow @nogc pure @safe
        {
            sfence;
        }
    }
}
unittest
{
    _mm_sfence();
}

// TODO: _mm_shuffle_pi16

// Note: the immediate shuffle value is given at compile-time instead of runtime.
__m128 _mm_shuffle_ps(ubyte imm)(__m128 a, __m128 b) pure @safe
{
    return shufflevector!(__m128, imm & 3, (imm>>2) & 3, 4 + ((imm>>4) & 3), 4 + ((imm>>6) & 3) )(a, b);
}

version(LDC)
{
    // Disappeared with LDC 1.11
    static if (__VERSION__ < 2081)
        alias _mm_sqrt_ps = __builtin_ia32_sqrtps;
    else
    {
        __m128 _mm_sqrt_ps(__m128 vec) pure @safe
        {
            vec.array[0] = llvm_sqrt(vec.array[0]);
            vec.array[1] = llvm_sqrt(vec.array[1]);
            vec.array[2] = llvm_sqrt(vec.array[2]);
            vec.array[3] = llvm_sqrt(vec.array[3]);
            return vec;
        }
    }
}
else
{
    __m128 _mm_sqrt_ps(__m128 vec) pure @safe
    {
        vec.array[0] = sqrt(vec.array[0]);
        vec.array[1] = sqrt(vec.array[1]);
        vec.array[2] = sqrt(vec.array[2]);
        vec.array[3] = sqrt(vec.array[3]);
        return vec;
    }
}
unittest
{
    __m128 A = _mm_sqrt_ps(_mm_set1_ps(4.0f));
    assert(A.array[0] == 2.0f);
    assert(A.array[1] == 2.0f);
    assert(A.array[2] == 2.0f);
    assert(A.array[3] == 2.0f);
}

version(LDC)
{
    // Disappeared with LDC 1.11
    static if (__VERSION__ < 2081)
        alias _mm_sqrt_ss = __builtin_ia32_sqrtss;
    else
    {
        __m128 _mm_sqrt_ss(__m128 vec) pure @safe
        {
            vec.array[0] = llvm_sqrt(vec.array[0]);
            vec.array[1] = vec.array[1];
            vec.array[2] = vec.array[2];
            vec.array[3] = vec.array[3];
            return vec;
        }
    }
}
else
{
    __m128 _mm_sqrt_ss(__m128 vec) pure @safe
    {
        vec.array[0] = sqrt(vec.array[0]);
        return vec;
    }
}
unittest
{
    __m128 A = _mm_sqrt_ss(_mm_set1_ps(4.0f));
    assert(A.array[0] == 2.0f);
    assert(A.array[1] == 4.0f);
    assert(A.array[2] == 4.0f);
    assert(A.array[3] == 4.0f);
}

void _mm_store_ps (float* mem_addr, __m128 a) pure // not safe since nothing guarantees alignment
{
    __m128* aligned = cast(__m128*)mem_addr;
    *aligned = a;
}

alias _mm_store_ps1 = _mm_store1_ps;

void _mm_store_ss (float* mem_addr, __m128 a) pure @safe
{
    *mem_addr = a[0];
}

void _mm_store1_ps (float* mem_addr, __m128 a) pure // not safe since nothing guarantees alignment
{
    __m128* aligned = cast(__m128*)mem_addr;
    *aligned = shufflevector!(__m128, 0, 0, 0, 0)(a, a);
}

void _mm_storeh_pi(__m64* p, __m128 a) pure @safe
{
    long2 la = cast(long2)a;
    (*p)[0] = la[1];
}
unittest
{
    __m64 R = _mm_setzero_si64();
    long2 A = [13, 25];
    _mm_storeh_pi(&R, cast(__m128)A);
    assert(R[0] == 25);
}

void _mm_storel_pi(__m64* p, __m128 a) pure @safe
{
    long2 la = cast(long2)a;
    (*p)[0] = la[0];
}
unittest
{
    __m64 R = _mm_setzero_si64();
    long2 A = [13, 25];
    _mm_storel_pi(&R, cast(__m128)A);
    assert(R[0] == 13);
}

void _mm_storer_ps(float* mem_addr, __m128 a) pure // not safe since nothing guarantees alignment
{
    __m128* aligned = cast(__m128*)mem_addr;
    *aligned = shufflevector!(__m128, 3, 2, 1, 0)(a, a);
}

void _mm_storeu_ps(float* mem_addr, __m128 a) pure @safe
{
    storeUnaligned!(float4)(a, mem_addr);
}

// TODO: _mm_stream_pi, does not seem possible

// BUG: can't implement a non-temporal store with LDC inlineIR since !nontemporal
// needs some IR outside this function that would say:
//
//  !0 = !{ i32 1 }
//
// It's an LLVM IR metadata description.
// Regardless, non-temporal moves are really dangerous for performance...
void _mm_stream_ps (float* mem_addr, __m128 a)
{
    __m128* dest = cast(__m128*)mem_addr;
    *dest = a; // it's a regular move instead
}
unittest
{
    align(16) float[4] A;
    _mm_stream_ps(A.ptr, _mm_set1_ps(78.0f));
    assert(A[0] == 78.0f && A[1] == 78.0f && A[2] == 78.0f && A[3] == 78.0f);
}

__m128 _mm_sub_ps(__m128 a, __m128 b) pure @safe
{
    return a - b;
}
unittest
{
    __m128 a = [1.5f, -2.0f, 3.0f, 1.0f];
    a = _mm_sub_ps(a, a);
    float[4] correct = [0.0f, 0.0f, 0.0f, 0.0f];
    assert(a.array == correct);
}

__m128 _mm_sub_ss(__m128 a, __m128 b) pure @safe
{
    a[0] -= b[0];
    return a;
}
unittest
{
    __m128 a = [1.5f, -2.0f, 3.0f, 1.0f];
    a = _mm_sub_ss(a, a);
    float[4] correct = [0.0f, -2.0, 3.0f, 1.0f];
    assert(a.array == correct);
}


void _MM_TRANSPOSE4_PS (ref __m128 row0, ref __m128 row1, ref __m128 row2, ref __m128 row3) pure @safe
{
    __m128 tmp3, tmp2, tmp1, tmp0;
    tmp0 = _mm_unpacklo_ps(row0, row1);
    tmp2 = _mm_unpacklo_ps(row2, row3);
    tmp1 = _mm_unpackhi_ps(row0, row1);
    tmp3 = _mm_unpackhi_ps(row2, row3);
    row0 = _mm_movelh_ps(tmp0, tmp2);
    row1 = _mm_movehl_ps(tmp2, tmp0);
    row2 = _mm_movelh_ps(tmp1, tmp3);
    row3 = _mm_movehl_ps(tmp3, tmp1);
}

// Note: the only difference between the ucomi and comi intrinsics is their signalling
// behaviour on quiet NaNs. Aliasing them is not strictly correct, but wanting to
// distinguish qNaN from sNaN and treat them differently on purpose seems extremely rare.
alias _mm_ucomieq_ss = _mm_comieq_ss;
alias _mm_ucomige_ss = _mm_comige_ss;
alias _mm_ucomigt_ss = _mm_comigt_ss;
alias _mm_ucomile_ss = _mm_comile_ss;
alias _mm_ucomilt_ss = _mm_comilt_ss;
alias _mm_ucomineq_ss = _mm_comineq_ss;


__m128 _mm_undefined_ps() pure @safe
{
    __m128 undef = void;
    return undef;
}

__m128 _mm_unpackhi_ps (__m128 a, __m128 b) pure @safe
{
    return shufflevector!(float4, 2, 6, 3, 7)(a, b);
}

__m128 _mm_unpacklo_ps (__m128 a, __m128 b) pure @safe
{
    return shufflevector!(float4, 0, 4, 1, 5)(a, b);
}

__m128 _mm_xor_ps (__m128 a, __m128 b) pure @safe
{
    return cast(__m128)(cast(__m128i)a ^ cast(__m128i)b);
}


private
{
    /// Returns: `true` if the pointer is suitably aligned.
    bool isPointerAligned(void* p, size_t alignment) pure
    {
        assert(alignment != 0);
        return ( cast(size_t)p & (alignment - 1) ) == 0;
    }

    /// Returns: next pointer aligned with alignment bytes.
    void* nextAlignedPointer(void* start, size_t alignment) pure
    {
        return cast(void*)nextMultipleOf(cast(size_t)(start), alignment);
    }

    // Returns number of bytes to actually allocate when asking
    // for a particular alignment
    @nogc size_t requestedSize(size_t askedSize, size_t alignment) pure
    {
        enum size_t pointerSize = size_t.sizeof;
        return askedSize + alignment - 1 + pointerSize * 3;
    }

    // Store the pointer given by malloc, the size and the alignment
    @nogc void* storeRawPointerPlusInfo(void* raw, size_t size, size_t alignment) pure
    {
        enum size_t pointerSize = size_t.sizeof;
        char* start = cast(char*)raw + pointerSize * 3;
        void* aligned = nextAlignedPointer(start, alignment);
        void** rawLocation = cast(void**)(cast(char*)aligned - pointerSize);
        *rawLocation = raw;
        size_t* sizeLocation = cast(size_t*)(cast(char*)aligned - 2 * pointerSize);
        *sizeLocation = size;
        size_t* alignmentLocation = cast(size_t*)(cast(char*)aligned - 3 * pointerSize);
        *alignmentLocation = alignment;
        assert( isPointerAligned(aligned, alignment) );
        return aligned;
    }

    // Returns: x, multiple of powerOfTwo, so that x >= n.
    @nogc size_t nextMultipleOf(size_t n, size_t powerOfTwo) pure nothrow
    {
        // check power-of-two
        assert( (powerOfTwo != 0) && ((powerOfTwo & (powerOfTwo - 1)) == 0));

        size_t mask = ~(powerOfTwo - 1);
        return (n + powerOfTwo - 1) & mask;
    }
}

unittest
{
    assert(nextMultipleOf(0, 4) == 0);
    assert(nextMultipleOf(1, 4) == 4);
    assert(nextMultipleOf(2, 4) == 4);
    assert(nextMultipleOf(3, 4) == 4);
    assert(nextMultipleOf(4, 4) == 4);
    assert(nextMultipleOf(5, 4) == 8);

    {
        void* p = _mm_malloc(23, 16);
        assert(p !is null);
        assert(((cast(size_t)p) & 0xf) == 0);
        _mm_free(p);
    }

    void* nullAlloc = _mm_malloc(0, 32);
    assert(nullAlloc != null);
    _mm_free(nullAlloc);
}
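
// Illustrative example (an addition, not part of the original test suite): in this module
// _mm_shuffle_ps takes its immediate as a compile-time template argument, and
// _MM_TRANSPOSE4_PS transposes four rows in place. The immediate 0x4E below is an
// arbitrary value chosen for the demonstration; it selects lanes 2, 3 from `a`
// and lanes 0, 1 from `b`.
unittest
{
    __m128 a = _mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f);
    __m128 b = _mm_setr_ps(5.0f, 6.0f, 7.0f, 8.0f);

    // imm = 0b01_00_11_10: result = [a[2], a[3], b[0], b[1]]
    __m128 s = _mm_shuffle_ps!(0x4E)(a, b);
    assert(s.array == [3.0f, 4.0f, 5.0f, 6.0f]);

    // 4x4 transpose: column i of the input becomes row i of the output
    __m128 r0 = _mm_setr_ps( 1.0f,  2.0f,  3.0f,  4.0f);
    __m128 r1 = _mm_setr_ps( 5.0f,  6.0f,  7.0f,  8.0f);
    __m128 r2 = _mm_setr_ps( 9.0f, 10.0f, 11.0f, 12.0f);
    __m128 r3 = _mm_setr_ps(13.0f, 14.0f, 15.0f, 16.0f);
    _MM_TRANSPOSE4_PS(r0, r1, r2, r3);
    assert(r0.array == [1.0f, 5.0f,  9.0f, 13.0f]);
    assert(r1.array == [2.0f, 6.0f, 10.0f, 14.0f]);
    assert(r2.array == [3.0f, 7.0f, 11.0f, 15.0f]);
    assert(r3.array == [4.0f, 8.0f, 12.0f, 16.0f]);
}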