1 /** 2 * Copyright: Copyright Auburn Sounds 2016-2018, Stefanos Baziotis 2019. 3 * License: $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0) 4 * Authors: Guillaume Piolat 5 */ 6 module inteli.internals; 7 8 import inteli.types; 9 10 // The only math functions needed for intel-intrinsics 11 public import core.math: sqrt; // since it's an intrinsics 12 public import std.math: abs; // `fabs` is broken with GCC 4.9.2 on Linux 64-bit 13 14 15 version(GNU) 16 { 17 version (X86) 18 { 19 // For 32-bit x86, disable vector extensions with GDC. 20 // It just doesn't work well. 21 enum GDC_with_x86 = true; 22 enum GDC_with_MMX = false; 23 enum GDC_with_SSE = false; 24 enum GDC_with_SSE2 = false; 25 enum GDC_with_SSE3 = false; 26 enum LDC_with_ARM32 = false; 27 enum LDC_with_ARM64 = false; 28 enum LDC_with_SSE1 = false; 29 enum LDC_with_SSE2 = false; 30 enum LDC_with_SSE3 = false; 31 } 32 else version (X86_64) 33 { 34 // GDC support uses extended inline assembly: 35 // https://gcc.gnu.org/onlinedocs/gcc/Extended-Asm.html (general information and hints) 36 // https://gcc.gnu.org/onlinedocs/gcc/Simple-Constraints.html (binding variables to registers) 37 // https://gcc.gnu.org/onlinedocs/gcc/Machine-Constraints.html (x86 specific register short names) 38 39 public import core.simd; 40 41 // NOTE: These intrinsics are not available in every i386 and x86_64 CPU. 42 // For more info: https://gcc.gnu.org/onlinedocs/gcc-4.9.2/gcc/X86-Built-in-Functions.html 43 public import gcc.builtins; 44 45 enum GDC_with_x86 = true; 46 enum GDC_with_MMX = true; // We don't have a way to detect that at CT, but we assume it's there 47 enum GDC_with_SSE = true; // We don't have a way to detect that at CT, but we assume it's there 48 enum GDC_with_SSE2 = true; // We don't have a way to detect that at CT, but we assume it's there 49 enum GDC_with_SSE3 = false; // TODO: we don't have a way to detect that at CT 50 enum LDC_with_ARM32 = false; 51 enum LDC_with_ARM64 = false; 52 enum LDC_with_SSE1 = false; 53 enum LDC_with_SSE2 = false; 54 enum LDC_with_SSE3 = false; 55 } 56 else 57 { 58 enum GDC_with_x86 = false; 59 enum GDC_with_MMX = false; 60 enum GDC_with_SSE = false; 61 enum GDC_with_SSE2 = false; 62 enum GDC_with_SSE3 = false; 63 enum LDC_with_ARM32 = false; 64 enum LDC_with_ARM64 = false; 65 enum LDC_with_SSE1 = false; 66 enum LDC_with_SSE2 = false; 67 enum LDC_with_SSE3 = false; 68 } 69 } 70 else version(LDC) 71 { 72 public import core.simd; 73 public import ldc.simd; 74 public import ldc.intrinsics; 75 public import ldc.llvmasm: __asm; 76 77 // Since LDC 1.13, using the new ldc.llvmasm.__ir variants instead of inlineIR 78 static if (__VERSION__ >= 2083) 79 { 80 import ldc.llvmasm; 81 alias LDCInlineIR = __ir_pure; 82 83 // A version of inline IR with prefix/suffix didn't exist before LDC 1.13 84 alias LDCInlineIREx = __irEx_pure; 85 } 86 else 87 { 88 alias LDCInlineIR = inlineIR; 89 } 90 91 package(inteli) 92 { 93 enum GDC_with_x86 = false; 94 enum GDC_with_MMX = false; 95 enum GDC_with_SSE = false; 96 enum GDC_with_SSE2 = false; 97 enum GDC_with_SSE3 = false; 98 } 99 100 version(ARM) 101 { 102 public import ldc.gccbuiltins_arm; 103 enum LDC_with_ARM32 = true; 104 enum LDC_with_ARM64 = false; 105 enum LDC_with_SSE1 = false; 106 enum LDC_with_SSE2 = false; 107 enum LDC_with_SSE3 = false; 108 } 109 else version(AArch64) 110 { 111 //public import ldc.gccbuiltins_arm; 112 enum LDC_with_ARM32 = false; 113 enum LDC_with_ARM64 = true; 114 enum LDC_with_SSE1 = false; 115 enum LDC_with_SSE2 = false; 116 enum LDC_with_SSE3 = false; 117 } 118 else 119 { 120 public import ldc.gccbuiltins_x86; 121 enum LDC_with_ARM32 = false; 122 enum LDC_with_ARM64 = false; 123 enum LDC_with_SSE1 = __traits(targetHasFeature, "sse"); 124 enum LDC_with_SSE2 = __traits(targetHasFeature, "sse2"); 125 enum LDC_with_SSE3 = __traits(targetHasFeature, "sse3"); 126 } 127 } 128 else version(DigitalMars) 129 { 130 package(inteli) 131 { 132 enum GDC_with_x86 = false; 133 enum GDC_with_MMX = false; 134 enum GDC_with_SSE = false; 135 enum GDC_with_SSE2 = false; 136 enum GDC_with_SSE3 = false; 137 enum LDC_with_ARM32 = false; 138 enum LDC_with_ARM64 = false; 139 enum LDC_with_SSE1 = false; 140 enum LDC_with_SSE2 = false; 141 enum LDC_with_SSE3 = false; 142 } 143 } 144 else 145 { 146 static assert(false, "Unknown compiler"); 147 } 148 149 enum LDC_with_ARM = LDC_with_ARM32 | LDC_with_ARM64; // ARM32 is largely unsupported though 150 151 static if (LDC_with_ARM32) 152 { 153 package uint arm_get_fpcr() nothrow @nogc @trusted 154 { 155 return __builtin_arm_get_fpscr(); 156 } 157 158 package void arm_set_fpcr(uint cw) nothrow @nogc @trusted 159 { 160 __builtin_arm_set_fpscr(cw); 161 } 162 } 163 164 static if (LDC_with_ARM64) 165 { 166 package uint arm_get_fpcr() pure nothrow @nogc @trusted 167 { 168 return __asm!uint("mrs $0, fpcr", "=r"); 169 } 170 171 package void arm_set_fpcr(uint cw) nothrow @nogc @trusted 172 { 173 __asm!void("ldr w2, $0 \n msr fpcr, x2", "m", cw); 174 } 175 } 176 177 version(DigitalMars) 178 { 179 version(D_InlineAsm_X86) 180 enum DMD_with_asm = true; 181 else version(D_InlineAsm_X86_64) 182 enum DMD_with_asm = true; 183 else 184 enum DMD_with_asm = false; 185 186 version(D_InlineAsm_X86) 187 enum DMD_with_32bit_asm = DMD_with_asm; // sometimes you want a 32-bit DMD only solution 188 else 189 enum DMD_with_32bit_asm = false; 190 } 191 else 192 { 193 enum DMD_with_asm = false; 194 enum DMD_with_32bit_asm = false; 195 } 196 197 198 package: 199 nothrow @nogc: 200 201 202 // For internal use only, since public API deals with a x86 semantic emulation 203 enum uint _MM_ROUND_NEAREST_ARM = 0x00000000; 204 enum uint _MM_ROUND_DOWN_ARM = 0x00800000; 205 enum uint _MM_ROUND_UP_ARM = 0x00400000; 206 enum uint _MM_ROUND_TOWARD_ZERO_ARM = 0x00C00000; 207 enum uint _MM_ROUND_MASK_ARM = 0x00C00000; 208 enum uint _MM_FLUSH_ZERO_MASK_ARM = 0x01000000; 209 210 211 // 212 // <ROUNDING> 213 // 214 // Why is that there? For DMD, we cannot use rint because _MM_SET_ROUNDING_MODE 215 // doesn't change the FPU rounding mode, and isn't expected to do so. 216 // So we devised these rounding function to help having consistent rouding between 217 // LDC and DMD. It's important that DMD uses what is in MXCST to round. 218 // 219 // Note: There is no MXCSR in ARM. But there is fpscr that implements similar 220 // functionality the same. 221 // https://developer.arm.com/documentation/dui0068/b/vector-floating-point-programming/vfp-system-registers/fpscr--the-floating-point-status-and-control-register 222 // There is no 223 // We use fpscr since it's thread-local, so we can emulate those x86 conversion albeit slowly. 224 225 int convertFloatToInt32UsingMXCSR(float value) @trusted 226 { 227 int result; 228 version(GNU) 229 { 230 asm pure nothrow @nogc @trusted 231 { 232 "cvtss2si %1, %0\n": "=r"(result) : "x" (value); 233 } 234 } 235 else static if (LDC_with_ARM32) 236 { 237 result = __asm!int(`vldr s2, $1 238 vcvtr.s32.f32 s2, s2 239 vmov $0, s2`, "=r,m", value); 240 } 241 else static if (LDC_with_ARM64) 242 { 243 // Get current rounding mode. 244 uint fpscr = arm_get_fpcr(); 245 246 switch(fpscr & _MM_ROUND_MASK_ARM) 247 { 248 default: 249 case _MM_ROUND_NEAREST_ARM: 250 result = __asm!int(`ldr s2, $1 251 fcvtns $0,s2`, "=r,m", value); 252 break; 253 case _MM_ROUND_DOWN_ARM: 254 result = __asm!int(`ldr s2, $1 255 fcvtms $0,s2`, "=r,m", value); 256 break; 257 case _MM_ROUND_UP_ARM: 258 result = __asm!int(`ldr s2, $1 259 fcvtps $0,s2`, "=r,m", value); 260 break; 261 case _MM_ROUND_TOWARD_ZERO_ARM: 262 result = cast(int)value; 263 break; 264 } 265 } 266 else 267 { 268 asm pure nothrow @nogc @trusted 269 { 270 cvtss2si EAX, value; 271 mov result, EAX; 272 } 273 } 274 return result; 275 } 276 277 int convertDoubleToInt32UsingMXCSR(double value) @trusted 278 { 279 int result; 280 version(GNU) 281 { 282 asm pure nothrow @nogc @trusted 283 { 284 "cvtsd2si %1, %0\n": "=r"(result) : "x" (value); 285 } 286 } 287 else static if (LDC_with_ARM32) 288 { 289 result = __asm!int(`vldr d2, $1 290 vcvtr.s32.f64 s2, d2 291 vmov $0, s2`, "=r,m", value); 292 } 293 else static if (LDC_with_ARM64) 294 { 295 // Get current rounding mode. 296 uint fpscr = arm_get_fpcr(); 297 298 switch(fpscr & _MM_ROUND_MASK_ARM) 299 { 300 default: 301 case _MM_ROUND_NEAREST_ARM: 302 result = __asm!int(`ldr d2, $1 303 fcvtns $0,d2`, "=r,m", value); 304 break; 305 case _MM_ROUND_DOWN_ARM: 306 result = __asm!int(`ldr d2, $1 307 fcvtms $0,d2`, "=r,m", value); 308 break; 309 case _MM_ROUND_UP_ARM: 310 result = __asm!int(`ldr d2, $1 311 fcvtps $0,d2`, "=r,m", value); 312 break; 313 case _MM_ROUND_TOWARD_ZERO_ARM: 314 result = cast(int)value; 315 break; 316 } 317 } 318 else 319 { 320 asm pure nothrow @nogc @trusted 321 { 322 cvtsd2si EAX, value; 323 mov result, EAX; 324 } 325 } 326 return result; 327 } 328 329 long convertFloatToInt64UsingMXCSR(float value) @trusted 330 { 331 static if (LDC_with_ARM32) 332 { 333 // We have to resort to libc since 32-bit ARM 334 // doesn't seem to have 64-bit registers. 335 336 uint fpscr = arm_get_fpcr(); // Get current rounding mode. 337 338 // Note: converting to double precision else rounding could be different for large integers 339 double asDouble = value; 340 341 switch(fpscr & _MM_ROUND_MASK_ARM) 342 { 343 default: 344 case _MM_ROUND_NEAREST_ARM: return cast(long)(llvm_round(asDouble)); 345 case _MM_ROUND_DOWN_ARM: return cast(long)(llvm_floor(asDouble)); 346 case _MM_ROUND_UP_ARM: return cast(long)(llvm_ceil(asDouble)); 347 case _MM_ROUND_TOWARD_ZERO_ARM: return cast(long)(asDouble); 348 } 349 } 350 else static if (LDC_with_ARM64) 351 { 352 uint fpscr = arm_get_fpcr(); 353 354 switch(fpscr & _MM_ROUND_MASK_ARM) 355 { 356 default: 357 case _MM_ROUND_NEAREST_ARM: 358 return __asm!long(`ldr s2, $1 359 fcvtns $0,s2`, "=r,m", value); 360 case _MM_ROUND_DOWN_ARM: 361 return __asm!long(`ldr s2, $1 362 fcvtms $0,s2`, "=r,m", value); 363 case _MM_ROUND_UP_ARM: 364 return __asm!long(`ldr s2, $1 365 fcvtps $0,s2`, "=r,m", value); 366 case _MM_ROUND_TOWARD_ZERO_ARM: 367 return cast(long)value; 368 } 369 } 370 // 64-bit can use an SSE instruction 371 else version(D_InlineAsm_X86_64) 372 { 373 long result; 374 version(LDC) // work-around for " Data definition directives inside inline asm are not supported yet." 375 { 376 asm pure nothrow @nogc @trusted 377 { 378 movss XMM0, value; 379 cvtss2si RAX, XMM0; 380 mov result, RAX; 381 } 382 } 383 else 384 { 385 asm pure nothrow @nogc @trusted 386 { 387 movss XMM0, value; 388 db 0xf3; db 0x48; db 0x0f; db 0x2d; db 0xc0; // cvtss2si RAX, XMM0 (DMD refuses to emit) 389 mov result, RAX; 390 } 391 } 392 return result; 393 } 394 else version(D_InlineAsm_X86) 395 { 396 // In the case of 32-bit x86 there is no SSE2 way to convert FP to 64-bit int 397 // This leads to an unfortunate FPU sequence in every C++ compiler. 398 // See: https://godbolt.org/z/vZym77 399 400 // Get current MXCSR rounding 401 uint sseRounding; 402 ushort savedFPUCW; 403 ushort newFPUCW; 404 long result; 405 asm pure nothrow @nogc @trusted 406 { 407 stmxcsr sseRounding; 408 fld value; 409 fnstcw savedFPUCW; 410 mov AX, savedFPUCW; 411 and AX, 0xf3ff; // clear FPU rounding bits 412 movzx ECX, word ptr sseRounding; 413 and ECX, 0x6000; // only keep SSE rounding bits 414 shr ECX, 3; 415 or AX, CX; // make a new control word for FPU with SSE bits 416 mov newFPUCW, AX; 417 fldcw newFPUCW; 418 fistp qword ptr result; // convert, respecting MXCSR (but not other control word things) 419 fldcw savedFPUCW; 420 } 421 return result; 422 } 423 else static if (GDC_with_x86) 424 { 425 version(X86_64) // 64-bit can just use the right instruction 426 { 427 static assert(GDC_with_SSE); 428 __m128 A; 429 A.ptr[0] = value; 430 return __builtin_ia32_cvtss2si64 (A); 431 } 432 else version(X86) // 32-bit 433 { 434 // This is untested! 435 uint sseRounding; 436 ushort savedFPUCW; 437 ushort newFPUCW; 438 long result; 439 asm pure nothrow @nogc @trusted 440 { 441 "stmxcsr %1;\n" ~ 442 "fld %2;\n" ~ 443 "fnstcw %3;\n" ~ 444 "movw %3, %%ax;\n" ~ 445 "andw $0xf3ff, %%ax;\n" ~ 446 "movzwl %1, %%ecx;\n" ~ 447 "andl $0x6000, %%ecx;\n" ~ 448 "shrl $3, %%ecx;\n" ~ 449 "orw %%cx, %%ax\n" ~ 450 "movw %%ax, %4;\n" ~ 451 "fldcw %4;\n" ~ 452 "fistpll %0;\n" ~ 453 "fldcw %3;\n" 454 : "=m"(result) // %0 455 : "m" (sseRounding), 456 "f" (value), 457 "m" (savedFPUCW), 458 "m" (newFPUCW) 459 : "eax", "ecx", "st"; 460 } 461 return result; 462 } 463 else 464 static assert(false); 465 } 466 else 467 static assert(false); 468 } 469 470 471 ///ditto 472 long convertDoubleToInt64UsingMXCSR(double value) @trusted 473 { 474 static if (LDC_with_ARM32) 475 { 476 // We have to resort to libc since 32-bit ARM 477 // doesn't seem to have 64-bit registers. 478 uint fpscr = arm_get_fpcr(); // Get current rounding mode. 479 switch(fpscr & _MM_ROUND_MASK_ARM) 480 { 481 default: 482 case _MM_ROUND_NEAREST_ARM: return cast(long)(llvm_round(value)); 483 case _MM_ROUND_DOWN_ARM: return cast(long)(llvm_floor(value)); 484 case _MM_ROUND_UP_ARM: return cast(long)(llvm_ceil(value)); 485 case _MM_ROUND_TOWARD_ZERO_ARM: return cast(long)(value); 486 } 487 } 488 else static if (LDC_with_ARM64) 489 { 490 // Get current rounding mode. 491 uint fpscr = arm_get_fpcr(); 492 493 switch(fpscr & _MM_ROUND_MASK_ARM) 494 { 495 default: 496 case _MM_ROUND_NEAREST_ARM: 497 return __asm!long(`ldr d2, $1 498 fcvtns $0,d2`, "=r,m", value); 499 case _MM_ROUND_DOWN_ARM: 500 return __asm!long(`ldr d2, $1 501 fcvtms $0,d2`, "=r,m", value); 502 case _MM_ROUND_UP_ARM: 503 return __asm!long(`ldr d2, $1 504 fcvtps $0,d2`, "=r,m", value); 505 case _MM_ROUND_TOWARD_ZERO_ARM: 506 return cast(long)value; 507 } 508 } 509 // 64-bit can use an SSE instruction 510 else version(D_InlineAsm_X86_64) 511 { 512 long result; 513 version(LDC) // work-around for "Data definition directives inside inline asm are not supported yet." 514 { 515 asm pure nothrow @nogc @trusted 516 { 517 movsd XMM0, value; 518 cvtsd2si RAX, XMM0; 519 mov result, RAX; 520 } 521 } 522 else 523 { 524 asm pure nothrow @nogc @trusted 525 { 526 movsd XMM0, value; 527 db 0xf2; db 0x48; db 0x0f; db 0x2d; db 0xc0; // cvtsd2si RAX, XMM0 (DMD refuses to emit) 528 mov result, RAX; 529 } 530 } 531 return result; 532 } 533 else version(D_InlineAsm_X86) 534 { 535 // In the case of 32-bit x86 there is no SSE2 way to convert FP to 64-bit int 536 // This leads to an unfortunate FPU sequence in every C++ compiler. 537 // See: https://godbolt.org/z/vZym77 538 539 // Get current MXCSR rounding 540 uint sseRounding; 541 ushort savedFPUCW; 542 ushort newFPUCW; 543 long result; 544 asm pure nothrow @nogc @trusted 545 { 546 stmxcsr sseRounding; 547 fld value; 548 fnstcw savedFPUCW; 549 mov AX, savedFPUCW; 550 and AX, 0xf3ff; 551 movzx ECX, word ptr sseRounding; 552 and ECX, 0x6000; 553 shr ECX, 3; 554 or AX, CX; 555 mov newFPUCW, AX; 556 fldcw newFPUCW; 557 fistp result; 558 fldcw savedFPUCW; 559 } 560 return result; 561 } 562 else static if (GDC_with_x86) 563 { 564 version(X86_64) 565 { 566 static assert(GDC_with_SSE2); 567 __m128d A; 568 A.ptr[0] = value; 569 return __builtin_ia32_cvtsd2si64 (A); 570 } 571 else 572 { 573 // This is untested! 574 uint sseRounding; 575 ushort savedFPUCW; 576 ushort newFPUCW; 577 long result; 578 asm pure nothrow @nogc @trusted 579 { 580 "stmxcsr %1;\n" ~ 581 "fld %2;\n" ~ 582 "fnstcw %3;\n" ~ 583 "movw %3, %%ax;\n" ~ 584 "andw $0xf3ff, %%ax;\n" ~ 585 "movzwl %1, %%ecx;\n" ~ 586 "andl $0x6000, %%ecx;\n" ~ 587 "shrl $3, %%ecx;\n" ~ 588 "orw %%cx, %%ax\n" ~ 589 "movw %%ax, %4;\n" ~ 590 "fldcw %4;\n" ~ 591 "fistpll %0;\n" ~ 592 "fldcw %3;\n" 593 : "=m"(result) // %0 594 : "m" (sseRounding), 595 "t" (value), 596 "m" (savedFPUCW), 597 "m" (newFPUCW) 598 : "eax", "ecx", "st"; 599 } 600 return result; 601 } 602 } 603 else 604 static assert(false); 605 } 606 607 // 608 // </ROUNDING> 609 // 610 611 612 // using the Intel terminology here 613 614 byte saturateSignedWordToSignedByte(short value) pure @safe 615 { 616 if (value > 127) value = 127; 617 if (value < -128) value = -128; 618 return cast(byte) value; 619 } 620 621 ubyte saturateSignedWordToUnsignedByte(short value) pure @safe 622 { 623 if (value > 255) value = 255; 624 if (value < 0) value = 0; 625 return cast(ubyte) value; 626 } 627 628 short saturateSignedIntToSignedShort(int value) pure @safe 629 { 630 if (value > 32767) value = 32767; 631 if (value < -32768) value = -32768; 632 return cast(short) value; 633 } 634 635 ushort saturateSignedIntToUnsignedShort(int value) pure @safe 636 { 637 if (value > 65535) value = 65535; 638 if (value < 0) value = 0; 639 return cast(ushort) value; 640 } 641 642 unittest // test saturate operations 643 { 644 assert( saturateSignedWordToSignedByte(32000) == 127); 645 assert( saturateSignedWordToUnsignedByte(32000) == 255); 646 assert( saturateSignedWordToSignedByte(-4000) == -128); 647 assert( saturateSignedWordToUnsignedByte(-4000) == 0); 648 assert( saturateSignedIntToSignedShort(32768) == 32767); 649 assert( saturateSignedIntToUnsignedShort(32768) == 32768); 650 assert( saturateSignedIntToSignedShort(-32769) == -32768); 651 assert( saturateSignedIntToUnsignedShort(-32769) == 0); 652 } 653 654 version(unittest) 655 { 656 // This is just for debugging tests 657 import core.stdc.stdio: printf; 658 659 // printing vectors for implementation 660 // Note: you can override `pure` within a `debug` clause 661 662 void _mm_print_pi64(__m64 v) @trusted 663 { 664 long1 vl = cast(long1)v; 665 printf("%lld\n", vl.array[0]); 666 } 667 668 void _mm_print_pi32(__m64 v) @trusted 669 { 670 int[2] C = (cast(int2)v).array; 671 printf("%d %d\n", C[0], C[1]); 672 } 673 674 void _mm_print_pi16(__m64 v) @trusted 675 { 676 short[4] C = (cast(short4)v).array; 677 printf("%d %d %d %d\n", C[0], C[1], C[2], C[3]); 678 } 679 680 void _mm_print_pi8(__m64 v) @trusted 681 { 682 byte[8] C = (cast(byte8)v).array; 683 printf("%d %d %d %d %d %d %d %d\n", 684 C[0], C[1], C[2], C[3], C[4], C[5], C[6], C[7]); 685 } 686 687 void _mm_print_epi64(__m128i v) @trusted 688 { 689 long2 vl = cast(long2)v; 690 printf("%lld %lld\n", vl.array[0], vl.array[1]); 691 } 692 693 void _mm_print_epi32(__m128i v) @trusted 694 { 695 printf("%d %d %d %d\n", 696 v.array[0], v.array[1], v.array[2], v.array[3]); 697 } 698 699 void _mm_print_epi16(__m128i v) @trusted 700 { 701 short[8] C = (cast(short8)v).array; 702 printf("%d %d %d %d %d %d %d %d\n", 703 C[0], C[1], C[2], C[3], C[4], C[5], C[6], C[7]); 704 } 705 706 void _mm_print_epi8(__m128i v) @trusted 707 { 708 byte[16] C = (cast(byte16)v).array; 709 printf("%d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d\n", 710 C[0], C[1], C[2], C[3], C[4], C[5], C[6], C[7], C[8], C[9], C[10], C[11], C[12], C[13], C[14], C[15]); 711 } 712 713 void _mm_print_ps(__m128 v) @trusted 714 { 715 float[4] C = (cast(float4)v).array; 716 printf("%f %f %f %f\n", C[0], C[1], C[2], C[3]); 717 } 718 719 void _mm_print_pd(__m128d v) @trusted 720 { 721 double[2] C = (cast(double2)v).array; 722 printf("%f %f\n", C[0], C[1]); 723 } 724 } 725 726 727 // 728 // <FLOATING-POINT COMPARISONS> 729 // 730 // Note: `ldc.simd` cannot express all nuances of FP comparisons, so we 731 // need different IR generation. 732 733 enum FPComparison 734 { 735 oeq, // ordered and equal 736 ogt, // ordered and greater than 737 oge, // ordered and greater than or equal 738 olt, // ordered and less than 739 ole, // ordered and less than or equal 740 one, // ordered and not equal 741 ord, // ordered (no nans) 742 ueq, // unordered or equal 743 ugt, // unordered or greater than ("nle") 744 uge, // unordered or greater than or equal ("nlt") 745 ult, // unordered or less than ("nge") 746 ule, // unordered or less than or equal ("ngt") 747 une, // unordered or not equal ("neq") 748 uno, // unordered (either nans) 749 } 750 751 private static immutable string[FPComparison.max+1] FPComparisonToString = 752 [ 753 "oeq", 754 "ogt", 755 "oge", 756 "olt", 757 "ole", 758 "one", 759 "ord", 760 "ueq", 761 "ugt", 762 "uge", 763 "ult", 764 "ule", 765 "une", 766 "uno", 767 ]; 768 769 // Individual float comparison: returns -1 for true or 0 for false. 770 // Useful for DMD and testing 771 private bool compareFloat(T)(FPComparison comparison, T a, T b) pure @safe 772 { 773 import std.math; 774 bool unordered = isNaN(a) || isNaN(b); 775 final switch(comparison) with(FPComparison) 776 { 777 case oeq: return a == b; 778 case ogt: return a > b; 779 case oge: return a >= b; 780 case olt: return a < b; 781 case ole: return a <= b; 782 case one: return !unordered && (a != b); // NaN with != always yields true 783 case ord: return !unordered; 784 case ueq: return unordered || (a == b); 785 case ugt: return unordered || (a > b); 786 case uge: return unordered || (a >= b); 787 case ult: return unordered || (a < b); 788 case ule: return unordered || (a <= b); 789 case une: return (a != b); // NaN with != always yields true 790 case uno: return unordered; 791 } 792 } 793 794 version(LDC) 795 { 796 /// Provides packed float comparisons 797 package int4 cmpps(FPComparison comparison)(float4 a, float4 b) pure @safe 798 { 799 enum ir = ` 800 %cmp = fcmp `~ FPComparisonToString[comparison] ~` <4 x float> %0, %1 801 %r = sext <4 x i1> %cmp to <4 x i32> 802 ret <4 x i32> %r`; 803 804 return LDCInlineIR!(ir, int4, float4, float4)(a, b); 805 } 806 807 /// Provides packed double comparisons 808 package long2 cmppd(FPComparison comparison)(double2 a, double2 b) pure @safe 809 { 810 enum ir = ` 811 %cmp = fcmp `~ FPComparisonToString[comparison] ~` <2 x double> %0, %1 812 %r = sext <2 x i1> %cmp to <2 x i64> 813 ret <2 x i64> %r`; 814 815 return LDCInlineIR!(ir, long2, double2, double2)(a, b); 816 } 817 818 /// CMPSS-style comparisons 819 /// clang implement it through x86 intrinsics, it is possible with IR alone 820 /// but leads to less optimal code. 821 /// PERF: try to implement it with __builtin_ia32_cmpss and immediate 0 to 7. 822 /// Not that simple. 823 package float4 cmpss(FPComparison comparison)(float4 a, float4 b) pure @safe 824 { 825 /* 826 enum ubyte predicateNumber = FPComparisonToX86Predicate[comparison]; 827 enum bool invertOp = (predicateNumber & 0x80) != 0; 828 static if(invertOp) 829 return __builtin_ia32_cmpsd(b, a, predicateNumber & 0x7f); 830 else 831 return __builtin_ia32_cmpsd(a, b, predicateNumber & 0x7f); 832 */ 833 enum ir = ` 834 %cmp = fcmp `~ FPComparisonToString[comparison] ~` float %0, %1 835 %r = sext i1 %cmp to i32 836 %r2 = bitcast i32 %r to float 837 ret float %r2`; 838 839 float4 r = a; 840 r[0] = LDCInlineIR!(ir, float, float, float)(a[0], b[0]); 841 return r; 842 } 843 844 /// CMPSD-style comparisons 845 /// clang implement it through x86 intrinsics, it is possible with IR alone 846 /// but leads to less optimal code. 847 /// PERF: try to implement it with __builtin_ia32_cmpsd and immediate 0 to 7. 848 /// Not that simple. 849 package double2 cmpsd(FPComparison comparison)(double2 a, double2 b) pure @safe 850 { 851 enum ir = ` 852 %cmp = fcmp `~ FPComparisonToString[comparison] ~` double %0, %1 853 %r = sext i1 %cmp to i64 854 %r2 = bitcast i64 %r to double 855 ret double %r2`; 856 857 double2 r = a; 858 r[0] = LDCInlineIR!(ir, double, double, double)(a[0], b[0]); 859 return r; 860 } 861 862 // Note: ucomss and ucomsd are left unimplemented 863 package int comss(FPComparison comparison)(float4 a, float4 b) pure @safe 864 { 865 enum ir = ` 866 %cmp = fcmp `~ FPComparisonToString[comparison] ~` float %0, %1 867 %r = zext i1 %cmp to i32 868 ret i32 %r`; 869 870 return LDCInlineIR!(ir, int, float, float)(a[0], b[0]); 871 } 872 873 // Note: ucomss and ucomsd are left unimplemented 874 package int comsd(FPComparison comparison)(double2 a, double2 b) pure @safe 875 { 876 enum ir = ` 877 %cmp = fcmp `~ FPComparisonToString[comparison] ~` double %0, %1 878 %r = zext i1 %cmp to i32 879 ret i32 %r`; 880 881 return LDCInlineIR!(ir, int, double, double)(a[0], b[0]); 882 } 883 } 884 else 885 { 886 /// Provides packed float comparisons 887 package int4 cmpps(FPComparison comparison)(float4 a, float4 b) pure @trusted 888 { 889 int4 result; 890 foreach(i; 0..4) 891 { 892 result.ptr[i] = compareFloat!float(comparison, a.array[i], b.array[i]) ? -1 : 0; 893 } 894 return result; 895 } 896 897 /// Provides packed double comparisons 898 package long2 cmppd(FPComparison comparison)(double2 a, double2 b) pure @trusted 899 { 900 long2 result; 901 foreach(i; 0..2) 902 { 903 result.ptr[i] = compareFloat!double(comparison, a.array[i], b.array[i]) ? -1 : 0; 904 } 905 return result; 906 } 907 908 /// Provides CMPSS-style comparison 909 package float4 cmpss(FPComparison comparison)(float4 a, float4 b) pure @trusted 910 { 911 int4 result = cast(int4)a; 912 result.ptr[0] = compareFloat!float(comparison, a.array[0], b.array[0]) ? -1 : 0; 913 return cast(float4)result; 914 } 915 916 /// Provides CMPSD-style comparison 917 package double2 cmpsd(FPComparison comparison)(double2 a, double2 b) pure @trusted 918 { 919 long2 result = cast(long2)a; 920 result.ptr[0] = compareFloat!double(comparison, a.array[0], b.array[0]) ? -1 : 0; 921 return cast(double2)result; 922 } 923 924 package int comss(FPComparison comparison)(float4 a, float4 b) pure @safe 925 { 926 return compareFloat!float(comparison, a.array[0], b.array[0]) ? 1 : 0; 927 } 928 929 // Note: ucomss and ucomsd are left unimplemented 930 package int comsd(FPComparison comparison)(double2 a, double2 b) pure @safe 931 { 932 return compareFloat!double(comparison, a.array[0], b.array[0]) ? 1 : 0; 933 } 934 } 935 unittest // cmpps 936 { 937 // Check all comparison type is working 938 float4 A = [1, 3, 5, float.nan]; 939 float4 B = [2, 3, 4, 5]; 940 941 int4 result_oeq = cmpps!(FPComparison.oeq)(A, B); 942 int4 result_ogt = cmpps!(FPComparison.ogt)(A, B); 943 int4 result_oge = cmpps!(FPComparison.oge)(A, B); 944 int4 result_olt = cmpps!(FPComparison.olt)(A, B); 945 int4 result_ole = cmpps!(FPComparison.ole)(A, B); 946 int4 result_one = cmpps!(FPComparison.one)(A, B); 947 int4 result_ord = cmpps!(FPComparison.ord)(A, B); 948 int4 result_ueq = cmpps!(FPComparison.ueq)(A, B); 949 int4 result_ugt = cmpps!(FPComparison.ugt)(A, B); 950 int4 result_uge = cmpps!(FPComparison.uge)(A, B); 951 int4 result_ult = cmpps!(FPComparison.ult)(A, B); 952 int4 result_ule = cmpps!(FPComparison.ule)(A, B); 953 int4 result_une = cmpps!(FPComparison.une)(A, B); 954 int4 result_uno = cmpps!(FPComparison.uno)(A, B); 955 956 static immutable int[4] correct_oeq = [ 0,-1, 0, 0]; 957 static immutable int[4] correct_ogt = [ 0, 0,-1, 0]; 958 static immutable int[4] correct_oge = [ 0,-1,-1, 0]; 959 static immutable int[4] correct_olt = [-1, 0, 0, 0]; 960 static immutable int[4] correct_ole = [-1,-1, 0, 0]; 961 static immutable int[4] correct_one = [-1, 0,-1, 0]; 962 static immutable int[4] correct_ord = [-1,-1,-1, 0]; 963 static immutable int[4] correct_ueq = [ 0,-1, 0,-1]; 964 static immutable int[4] correct_ugt = [ 0, 0,-1,-1]; 965 static immutable int[4] correct_uge = [ 0,-1,-1,-1]; 966 static immutable int[4] correct_ult = [-1, 0, 0,-1]; 967 static immutable int[4] correct_ule = [-1,-1, 0,-1]; 968 static immutable int[4] correct_une = [-1, 0,-1,-1]; 969 static immutable int[4] correct_uno = [ 0, 0, 0,-1]; 970 971 assert(result_oeq.array == correct_oeq); 972 assert(result_ogt.array == correct_ogt); 973 assert(result_oge.array == correct_oge); 974 assert(result_olt.array == correct_olt); 975 assert(result_ole.array == correct_ole); 976 assert(result_one.array == correct_one); 977 assert(result_ord.array == correct_ord); 978 assert(result_ueq.array == correct_ueq); 979 assert(result_ugt.array == correct_ugt); 980 assert(result_uge.array == correct_uge); 981 assert(result_ult.array == correct_ult); 982 assert(result_ule.array == correct_ule); 983 assert(result_une.array == correct_une); 984 assert(result_uno.array == correct_uno); 985 } 986 unittest 987 { 988 double2 a = [1, 3]; 989 double2 b = [2, 3]; 990 long2 c = cmppd!(FPComparison.ult)(a, b); 991 static immutable long[2] correct = [cast(long)(-1), 0]; 992 assert(c.array == correct); 993 } 994 unittest // cmpss and comss 995 { 996 void testComparison(FPComparison comparison)(float4 A, float4 B) 997 { 998 float4 result = cmpss!comparison(A, B); 999 int4 iresult = cast(int4)result; 1000 int expected = compareFloat!float(comparison, A.array[0], B.array[0]) ? -1 : 0; 1001 assert(iresult.array[0] == expected); 1002 assert(result.array[1] == A.array[1]); 1003 assert(result.array[2] == A.array[2]); 1004 assert(result.array[3] == A.array[3]); 1005 1006 // check comss 1007 int comResult = comss!comparison(A, B); 1008 assert( (expected != 0) == (comResult != 0) ); 1009 } 1010 1011 // Check all comparison type is working 1012 float4 A = [1, 3, 5, 6]; 1013 float4 B = [2, 3, 4, 5]; 1014 float4 C = [float.nan, 3, 4, 5]; 1015 1016 testComparison!(FPComparison.oeq)(A, B); 1017 testComparison!(FPComparison.oeq)(A, C); 1018 testComparison!(FPComparison.ogt)(A, B); 1019 testComparison!(FPComparison.ogt)(A, C); 1020 testComparison!(FPComparison.oge)(A, B); 1021 testComparison!(FPComparison.oge)(A, C); 1022 testComparison!(FPComparison.olt)(A, B); 1023 testComparison!(FPComparison.olt)(A, C); 1024 testComparison!(FPComparison.ole)(A, B); 1025 testComparison!(FPComparison.ole)(A, C); 1026 testComparison!(FPComparison.one)(A, B); 1027 testComparison!(FPComparison.one)(A, C); 1028 testComparison!(FPComparison.ord)(A, B); 1029 testComparison!(FPComparison.ord)(A, C); 1030 testComparison!(FPComparison.ueq)(A, B); 1031 testComparison!(FPComparison.ueq)(A, C); 1032 testComparison!(FPComparison.ugt)(A, B); 1033 testComparison!(FPComparison.ugt)(A, C); 1034 testComparison!(FPComparison.uge)(A, B); 1035 testComparison!(FPComparison.uge)(A, C); 1036 testComparison!(FPComparison.ult)(A, B); 1037 testComparison!(FPComparison.ult)(A, C); 1038 testComparison!(FPComparison.ule)(A, B); 1039 testComparison!(FPComparison.ule)(A, C); 1040 testComparison!(FPComparison.une)(A, B); 1041 testComparison!(FPComparison.une)(A, C); 1042 testComparison!(FPComparison.uno)(A, B); 1043 testComparison!(FPComparison.uno)(A, C); 1044 } 1045 unittest // cmpsd and comsd 1046 { 1047 void testComparison(FPComparison comparison)(double2 A, double2 B) 1048 { 1049 double2 result = cmpsd!comparison(A, B); 1050 long2 iresult = cast(long2)result; 1051 long expected = compareFloat!double(comparison, A.array[0], B.array[0]) ? -1 : 0; 1052 assert(iresult.array[0] == expected); 1053 assert(result.array[1] == A.array[1]); 1054 1055 // check comsd 1056 int comResult = comsd!comparison(A, B); 1057 assert( (expected != 0) == (comResult != 0) ); 1058 } 1059 1060 // Check all comparison type is working 1061 double2 A = [1, 3]; 1062 double2 B = [2, 4]; 1063 double2 C = [double.nan, 5]; 1064 1065 testComparison!(FPComparison.oeq)(A, B); 1066 testComparison!(FPComparison.oeq)(A, C); 1067 testComparison!(FPComparison.ogt)(A, B); 1068 testComparison!(FPComparison.ogt)(A, C); 1069 testComparison!(FPComparison.oge)(A, B); 1070 testComparison!(FPComparison.oge)(A, C); 1071 testComparison!(FPComparison.olt)(A, B); 1072 testComparison!(FPComparison.olt)(A, C); 1073 testComparison!(FPComparison.ole)(A, B); 1074 testComparison!(FPComparison.ole)(A, C); 1075 testComparison!(FPComparison.one)(A, B); 1076 testComparison!(FPComparison.one)(A, C); 1077 testComparison!(FPComparison.ord)(A, B); 1078 testComparison!(FPComparison.ord)(A, C); 1079 testComparison!(FPComparison.ueq)(A, B); 1080 testComparison!(FPComparison.ueq)(A, C); 1081 testComparison!(FPComparison.ugt)(A, B); 1082 testComparison!(FPComparison.ugt)(A, C); 1083 testComparison!(FPComparison.uge)(A, B); 1084 testComparison!(FPComparison.uge)(A, C); 1085 testComparison!(FPComparison.ult)(A, B); 1086 testComparison!(FPComparison.ult)(A, C); 1087 testComparison!(FPComparison.ule)(A, B); 1088 testComparison!(FPComparison.ule)(A, C); 1089 testComparison!(FPComparison.une)(A, B); 1090 testComparison!(FPComparison.une)(A, C); 1091 testComparison!(FPComparison.uno)(A, B); 1092 testComparison!(FPComparison.uno)(A, C); 1093 } 1094 1095 // 1096 // </FLOATING-POINT COMPARISONS> 1097 // 1098 1099 1100 __m64 to_m64(__m128i a) pure @trusted 1101 { 1102 long2 la = cast(long2)a; 1103 long1 r; 1104 r.ptr[0] = la.array[0]; 1105 return r; 1106 } 1107 1108 __m128i to_m128i(__m64 a) pure @trusted 1109 { 1110 long2 r = [0, 0]; 1111 r.ptr[0] = a.array[0]; 1112 return cast(__m128i)r; 1113 }