/**
* Internal stuff only, do not import.
*
* Copyright: Copyright Auburn Sounds 2016-2018, Stefanos Baziotis 2019.
* License:   $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
* Authors:   Guillaume Piolat
*/
module inteli.internals;

import inteli.types;

// The only math functions needed for intel-intrinsics
public import core.math: sqrt; // since it's an intrinsics
public import std.math: abs; // `fabs` is broken with GCC 4.9.2 on Linux 64-bit


//
// Compile-time feature flags.
//
// Every compiler/target combination below defines the same set of enums
// (GDC_with_* and LDC_with_*), so the rest of the library can branch on them
// with `static if` without caring which compiler is in use.
//
version(GNU)
{
    version (X86)
    {
        // For 32-bit x86, disable vector extensions with GDC.
        // It just doesn't work well.
        enum GDC_with_x86 = true;
        enum GDC_with_MMX = false;
        enum GDC_with_SSE = false;
        enum GDC_with_SSE2 = false;
        enum GDC_with_SSE3 = false;
        enum LDC_with_ARM32 = false;
        enum LDC_with_ARM64 = false;
        enum LDC_with_SSE1 = false;
        enum LDC_with_SSE2 = false;
        enum LDC_with_SSE3 = false;
    }
    else version (X86_64)
    {
        // GDC support uses extended inline assembly:
        //   https://gcc.gnu.org/onlinedocs/gcc/Extended-Asm.html        (general information and hints)
        //   https://gcc.gnu.org/onlinedocs/gcc/Simple-Constraints.html  (binding variables to registers)
        //   https://gcc.gnu.org/onlinedocs/gcc/Machine-Constraints.html (x86 specific register short names)

        public import core.simd;

        // NOTE: These intrinsics are not available in every i386 and x86_64 CPU.
        // For more info: https://gcc.gnu.org/onlinedocs/gcc-4.9.2/gcc/X86-Built-in-Functions.html
        public import gcc.builtins;

        enum GDC_with_x86 = true;
        enum GDC_with_MMX = true;   // We don't have a way to detect that at CT, but we assume it's there
        enum GDC_with_SSE = true;   // We don't have a way to detect that at CT, but we assume it's there
        enum GDC_with_SSE2 = true;  // We don't have a way to detect that at CT, but we assume it's there
        enum GDC_with_SSE3 = false; // TODO: we don't have a way to detect that at CT
        enum LDC_with_ARM32 = false;
        enum LDC_with_ARM64 = false;
        enum LDC_with_SSE1 = false;
        enum LDC_with_SSE2 = false;
        enum LDC_with_SSE3 = false;
    }
    else
    {
        enum GDC_with_x86 = false;
        enum GDC_with_MMX = false;
        enum GDC_with_SSE = false;
        enum GDC_with_SSE2 = false;
        enum GDC_with_SSE3 = false;
        enum LDC_with_ARM32 = false;
        enum LDC_with_ARM64 = false;
        enum LDC_with_SSE1 = false;
        enum LDC_with_SSE2 = false;
        enum LDC_with_SSE3 = false;
    }
}
else version(LDC)
{
    public import core.simd;
    public import ldc.simd;
    public import ldc.intrinsics;
    public import ldc.llvmasm: __asm;

    // Since LDC 1.13, using the new ldc.llvmasm.__ir variants instead of inlineIR
    static if (__VERSION__ >= 2083)
    {
        import ldc.llvmasm;
        alias LDCInlineIR = __ir_pure;

        // A version of inline IR with prefix/suffix didn't exist before LDC 1.13
        alias LDCInlineIREx = __irEx_pure;
    }
    else
    {
        alias LDCInlineIR = inlineIR;
    }

    package(inteli)
    {
        enum GDC_with_x86 = false;
        enum GDC_with_MMX = false;
        enum GDC_with_SSE = false;
        enum GDC_with_SSE2 = false;
        enum GDC_with_SSE3 = false;
    }

    version(ARM)
    {
        public import ldc.gccbuiltins_arm;
        enum LDC_with_ARM32 = true;
        enum LDC_with_ARM64 = false;
        enum LDC_with_SSE1 = false;
        enum LDC_with_SSE2 = false;
        enum LDC_with_SSE3 = false;
    }
    else version(AArch64)
    {
        //public import ldc.gccbuiltins_arm;
        enum LDC_with_ARM32 = false;
        enum LDC_with_ARM64 = true;
        enum LDC_with_SSE1 = false;
        enum LDC_with_SSE2 = false;
        enum LDC_with_SSE3 = false;
    }
    else
    {
        public import ldc.gccbuiltins_x86;
        enum LDC_with_ARM32 = false;
        enum LDC_with_ARM64 = false;
        enum LDC_with_SSE1 = __traits(targetHasFeature, "sse");
        enum LDC_with_SSE2 = __traits(targetHasFeature, "sse2");
        enum LDC_with_SSE3 = __traits(targetHasFeature, "sse3");
    }
}
else version(DigitalMars)
{
    package(inteli)
    {
        enum GDC_with_x86 = false;
        enum GDC_with_MMX = false;
        enum GDC_with_SSE = false;
        enum GDC_with_SSE2 = false;
        enum GDC_with_SSE3 = false;
        enum LDC_with_ARM32 = false;
        enum LDC_with_ARM64 = false;
        enum LDC_with_SSE1 = false;
        enum LDC_with_SSE2 = false;
        enum LDC_with_SSE3 = false;
    }
}
else
{
    static assert(false, "Unknown compiler");
}

enum LDC_with_ARM = LDC_with_ARM32 | LDC_with_ARM64; // ARM32 is largely unsupported though

static if (LDC_with_ARM32)
{
    /// Reads the floating-point control word through `__builtin_arm_get_fpscr`.
    package uint arm_get_fpcr() nothrow @nogc @trusted
    {
        return __builtin_arm_get_fpscr();
    }

    /// Writes `cw` as the floating-point control word through `__builtin_arm_set_fpscr`.
    package void arm_set_fpcr(uint cw) nothrow @nogc @trusted
    {
        __builtin_arm_set_fpscr(cw);
    }
}

static if (LDC_with_ARM64)
{
    /// Reads the FPCR register with an `mrs` instruction.
    package uint arm_get_fpcr() pure nothrow @nogc @trusted
    {
        return __asm!uint("mrs $0, fpcr", "=r");
    }

    /// Writes `cw` to the FPCR register with an `msr` instruction.
    package void arm_set_fpcr(uint cw) nothrow @nogc @trusted
    {
        // x2 is used as a scratch register by the snippet, so it has to be
        // declared as clobbered: otherwise the compiler is free to keep a
        // live value in it across this statement.
        __asm!void("ldr w2, $0 \n msr fpcr, x2", "m,~{x2}", cw);
    }
}

version(DigitalMars)
{
    version(D_InlineAsm_X86)
        enum DMD_with_asm = true;
    else version(D_InlineAsm_X86_64)
        enum DMD_with_asm = true;
    else
        enum DMD_with_asm = false;

    version(D_InlineAsm_X86)
        enum DMD_with_32bit_asm = DMD_with_asm; // sometimes you want a 32-bit DMD only solution
    else
        enum DMD_with_32bit_asm = false;
}
else
{
    enum DMD_with_asm = false;
    enum DMD_with_32bit_asm = false;
}


package:
nothrow @nogc:


// For internal use only, since public API deals with a x86 semantic emulation
enum uint _MM_ROUND_NEAREST_ARM     = 0x00000000;
enum uint _MM_ROUND_DOWN_ARM        = 0x00800000;
enum uint _MM_ROUND_UP_ARM          = 0x00400000;
enum uint _MM_ROUND_TOWARD_ZERO_ARM = 0x00C00000;
enum uint _MM_ROUND_MASK_ARM        = 0x00C00000;
enum uint _MM_FLUSH_ZERO_MASK_ARM   = 0x01000000;


//
//  <ROUNDING>
//
//  Why is that there? For DMD, we cannot use rint because _MM_SET_ROUNDING_MODE
//  doesn't change the FPU rounding mode, and isn't expected to do so.
//  So we devised these rounding functions to help having consistent rounding between
//  LDC and DMD. It's important that DMD uses what is in MXCSR to round.
//
//  Note: There is no MXCSR in ARM. But there is fpscr that implements similar
//  functionality.
//  https://developer.arm.com/documentation/dui0068/b/vector-floating-point-programming/vfp-system-registers/fpscr--the-floating-point-status-and-control-register
//  We use fpscr since it's thread-local, so we can emulate those x86 conversion albeit slowly.

/// Converts `value` to a 32-bit integer like `cvtss2si` does, i.e. rounding
/// with the current rounding mode (MXCSR on x86, the control word returned by
/// `arm_get_fpcr` on ARM).
int convertFloatToInt32UsingMXCSR(float value) @trusted
{
    int result;
    version(GNU)
    {
        asm pure nothrow @nogc @trusted
        {
            "cvtss2si %1, %0\n": "=r"(result) : "x" (value);
        }
    }
    else static if (LDC_with_ARM32)
    {
        // s2 is a scratch register for the snippet, hence the clobber.
        result = __asm!int(`vldr s2, $1
                            vcvtr.s32.f32 s2, s2
                            vmov $0, s2`, "=r,m,~{s2}", value);
    }
    else static if (LDC_with_ARM64)
    {
        // Get current rounding mode.
        uint fpscr = arm_get_fpcr();

        // Each branch uses the conversion instruction with the matching
        // explicit rounding; s2 is scratch, hence the clobber.
        switch(fpscr & _MM_ROUND_MASK_ARM)
        {
            default:
            case _MM_ROUND_NEAREST_ARM:
                result = __asm!int(`ldr s2, $1
                                    fcvtns $0,s2`, "=r,m,~{s2}", value);
                break;
            case _MM_ROUND_DOWN_ARM:
                result = __asm!int(`ldr s2, $1
                                    fcvtms $0,s2`, "=r,m,~{s2}", value);
                break;
            case _MM_ROUND_UP_ARM:
                result = __asm!int(`ldr s2, $1
                                    fcvtps $0,s2`, "=r,m,~{s2}", value);
                break;
            case _MM_ROUND_TOWARD_ZERO_ARM:
                result = cast(int)value;
                break;
        }
    }
    else
    {
        asm pure nothrow @nogc @trusted
        {
            cvtss2si EAX, value;
            mov result, EAX;
        }
    }
    return result;
}

/// Converts `value` to a 32-bit integer like `cvtsd2si` does, i.e. rounding
/// with the current rounding mode (MXCSR on x86, the control word returned by
/// `arm_get_fpcr` on ARM).
int convertDoubleToInt32UsingMXCSR(double value) @trusted
{
    int result;
    version(GNU)
    {
        asm pure nothrow @nogc @trusted
        {
            "cvtsd2si %1, %0\n": "=r"(result) : "x" (value);
        }
    }
    else static if (LDC_with_ARM32)
    {
        // Both d2 (input) and s2 (converted value) are written by the snippet.
        result = __asm!int(`vldr d2, $1
                            vcvtr.s32.f64 s2, d2
                            vmov $0, s2`, "=r,m,~{d2},~{s2}", value);
    }
    else static if (LDC_with_ARM64)
    {
        // Get current rounding mode.
        uint fpscr = arm_get_fpcr();

        // d2 is scratch for every snippet below, hence the clobber.
        switch(fpscr & _MM_ROUND_MASK_ARM)
        {
            default:
            case _MM_ROUND_NEAREST_ARM:
                result = __asm!int(`ldr d2, $1
                                    fcvtns $0,d2`, "=r,m,~{d2}", value);
                break;
            case _MM_ROUND_DOWN_ARM:
                result = __asm!int(`ldr d2, $1
                                    fcvtms $0,d2`, "=r,m,~{d2}", value);
                break;
            case _MM_ROUND_UP_ARM:
                result = __asm!int(`ldr d2, $1
                                    fcvtps $0,d2`, "=r,m,~{d2}", value);
                break;
            case _MM_ROUND_TOWARD_ZERO_ARM:
                result = cast(int)value;
                break;
        }
    }
    else
    {
        asm pure nothrow @nogc @trusted
        {
            cvtsd2si EAX, value;
            mov result, EAX;
        }
    }
    return result;
}

/// Converts `value` to a 64-bit integer, rounding with the current rounding
/// mode (MXCSR on x86, the control word returned by `arm_get_fpcr` on ARM).
long convertFloatToInt64UsingMXCSR(float value) @trusted
{
    static if (LDC_with_ARM32)
    {
        // We have to resort to libc since 32-bit ARM
        // doesn't seem to have 64-bit registers.

        uint fpscr = arm_get_fpcr(); // Get current rounding mode.

        // Note: converting to double precision else rounding could be different for large integers
        double asDouble = value;

        switch(fpscr & _MM_ROUND_MASK_ARM)
        {
            default:
            // `llvm_round` rounds halfway cases away from zero, whereas
            // round-to-nearest in MXCSR is ties-to-even. In this branch the
            // active rounding mode is round-to-nearest, so `llvm_rint`
            // (which rounds with the current mode) gives ties-to-even.
            case _MM_ROUND_NEAREST_ARM:     return cast(long)(llvm_rint(asDouble));
            case _MM_ROUND_DOWN_ARM:        return cast(long)(llvm_floor(asDouble));
            case _MM_ROUND_UP_ARM:          return cast(long)(llvm_ceil(asDouble));
            case _MM_ROUND_TOWARD_ZERO_ARM: return cast(long)(asDouble);
        }
    }
    else static if (LDC_with_ARM64)
    {
        uint fpscr = arm_get_fpcr();

        // s2 is scratch for every snippet below, hence the clobber.
        switch(fpscr & _MM_ROUND_MASK_ARM)
        {
            default:
            case _MM_ROUND_NEAREST_ARM:
                return __asm!long(`ldr s2, $1
                                   fcvtns $0,s2`, "=r,m,~{s2}", value);
            case _MM_ROUND_DOWN_ARM:
                return __asm!long(`ldr s2, $1
                                   fcvtms $0,s2`, "=r,m,~{s2}", value);
            case _MM_ROUND_UP_ARM:
                return __asm!long(`ldr s2, $1
                                   fcvtps $0,s2`, "=r,m,~{s2}", value);
            case _MM_ROUND_TOWARD_ZERO_ARM:
                return cast(long)value;
        }
    }
    // 64-bit can use an SSE instruction
    else version(D_InlineAsm_X86_64)
    {
        long result;
        version(LDC) // work-around for " Data definition directives inside inline asm are not supported yet."
        {
            asm pure nothrow @nogc @trusted
            {
                movss XMM0, value;
                cvtss2si RAX, XMM0;
                mov result, RAX;
            }
        }
        else
        {
            asm pure nothrow @nogc @trusted
            {
                movss XMM0, value;
                db 0xf3; db 0x48; db 0x0f; db 0x2d; db 0xc0; // cvtss2si RAX, XMM0 (DMD refuses to emit)
                mov result, RAX;
            }
        }
        return result;
    }
    else version(D_InlineAsm_X86)
    {
        // In the case of 32-bit x86 there is no SSE2 way to convert FP to 64-bit int
        // This leads to an unfortunate FPU sequence in every C++ compiler.
        // See: https://godbolt.org/z/vZym77

        // Get current MXCSR rounding
        uint sseRounding;
        ushort savedFPUCW;
        ushort newFPUCW;
        long result;
        asm pure nothrow @nogc @trusted
        {
            stmxcsr sseRounding;
            fld value;
            fnstcw savedFPUCW;
            mov AX, savedFPUCW;
            and AX, 0xf3ff;          // clear FPU rounding bits
            movzx ECX, word ptr sseRounding;
            and ECX, 0x6000;         // only keep SSE rounding bits
            shr ECX, 3;
            or AX, CX;               // make a new control word for FPU with SSE bits
            mov newFPUCW, AX;
            fldcw newFPUCW;
            fistp qword ptr result;  // convert, respecting MXCSR (but not other control word things)
            fldcw savedFPUCW;
        }
        return result;
    }
    else static if (GDC_with_x86)
    {
        version(X86_64) // 64-bit can just use the right instruction
        {
            static assert(GDC_with_SSE);
            __m128 A;
            A.ptr[0] = value;
            return __builtin_ia32_cvtss2si64 (A);
        }
        else version(X86) // 32-bit
        {
            // This is untested!
            // NOTE(review): sseRounding, savedFPUCW and newFPUCW are written by
            // the asm but bound as input operands, and `value` uses the "f"
            // constraint here while the double variant uses "t". GCC extended
            // asm expects written operands to be outputs - confirm and fix
            // once this path can actually be tested.
            uint sseRounding;
            ushort savedFPUCW;
            ushort newFPUCW;
            long result;
            asm pure nothrow @nogc @trusted
            {
                "stmxcsr %1;\n" ~
                "fld %2;\n" ~
                "fnstcw %3;\n" ~
                "movw %3, %%ax;\n" ~
                "andw $0xf3ff, %%ax;\n" ~
                "movzwl %1, %%ecx;\n" ~
                "andl $0x6000, %%ecx;\n" ~
                "shrl $3, %%ecx;\n" ~
                "orw %%cx, %%ax\n" ~
                "movw %%ax, %4;\n" ~
                "fldcw %4;\n" ~
                "fistpll %0;\n" ~
                "fldcw %3;\n"
                  : "=m"(result)    // %0
                  : "m" (sseRounding),
                    "f" (value),
                    "m" (savedFPUCW),
                    "m" (newFPUCW)
                  : "eax", "ecx", "st";
            }
            return result;
        }
        else
            static assert(false);
    }
    else
        static assert(false);
}


/// Converts `value` to a 64-bit integer, rounding with the current rounding
/// mode (MXCSR on x86, the control word returned by `arm_get_fpcr` on ARM).
long convertDoubleToInt64UsingMXCSR(double value) @trusted
{
    static if (LDC_with_ARM32)
    {
        // We have to resort to libc since 32-bit ARM
        // doesn't seem to have 64-bit registers.
        uint fpscr = arm_get_fpcr(); // Get current rounding mode.
        switch(fpscr & _MM_ROUND_MASK_ARM)
        {
            default:
            // `llvm_round` rounds halfway cases away from zero, whereas
            // round-to-nearest in MXCSR is ties-to-even. In this branch the
            // active rounding mode is round-to-nearest, so `llvm_rint`
            // (which rounds with the current mode) gives ties-to-even.
            case _MM_ROUND_NEAREST_ARM:     return cast(long)(llvm_rint(value));
            case _MM_ROUND_DOWN_ARM:        return cast(long)(llvm_floor(value));
            case _MM_ROUND_UP_ARM:          return cast(long)(llvm_ceil(value));
            case _MM_ROUND_TOWARD_ZERO_ARM: return cast(long)(value);
        }
    }
    else static if (LDC_with_ARM64)
    {
        // Get current rounding mode.
        uint fpscr = arm_get_fpcr();

        // d2 is scratch for every snippet below, hence the clobber.
        switch(fpscr & _MM_ROUND_MASK_ARM)
        {
            default:
            case _MM_ROUND_NEAREST_ARM:
                return __asm!long(`ldr d2, $1
                                   fcvtns $0,d2`, "=r,m,~{d2}", value);
            case _MM_ROUND_DOWN_ARM:
                return __asm!long(`ldr d2, $1
                                   fcvtms $0,d2`, "=r,m,~{d2}", value);
            case _MM_ROUND_UP_ARM:
                return __asm!long(`ldr d2, $1
                                   fcvtps $0,d2`, "=r,m,~{d2}", value);
            case _MM_ROUND_TOWARD_ZERO_ARM:
                return cast(long)value;
        }
    }
    // 64-bit can use an SSE instruction
    else version(D_InlineAsm_X86_64)
    {
        long result;
        version(LDC) // work-around for "Data definition directives inside inline asm are not supported yet."
        {
            asm pure nothrow @nogc @trusted
            {
                movsd XMM0, value;
                cvtsd2si RAX, XMM0;
                mov result, RAX;
            }
        }
        else
        {
            asm pure nothrow @nogc @trusted
            {
                movsd XMM0, value;
                db 0xf2; db 0x48; db 0x0f; db 0x2d; db 0xc0; // cvtsd2si RAX, XMM0 (DMD refuses to emit)
                mov result, RAX;
            }
        }
        return result;
    }
    else version(D_InlineAsm_X86)
    {
        // In the case of 32-bit x86 there is no SSE2 way to convert FP to 64-bit int
        // This leads to an unfortunate FPU sequence in every C++ compiler.
        // See: https://godbolt.org/z/vZym77

        // Get current MXCSR rounding
        uint sseRounding;
        ushort savedFPUCW;
        ushort newFPUCW;
        long result;
        asm pure nothrow @nogc @trusted
        {
            stmxcsr sseRounding;
            fld value;
            fnstcw savedFPUCW;
            mov AX, savedFPUCW;
            and AX, 0xf3ff;          // clear FPU rounding bits
            movzx ECX, word ptr sseRounding;
            and ECX, 0x6000;         // only keep SSE rounding bits
            shr ECX, 3;
            or AX, CX;               // make a new control word for FPU with SSE bits
            mov newFPUCW, AX;
            fldcw newFPUCW;
            fistp qword ptr result;  // explicit 64-bit store, same as the float variant
            fldcw savedFPUCW;
        }
        return result;
    }
    else static if (GDC_with_x86)
    {
        version(X86_64)
        {
            static assert(GDC_with_SSE2);
            __m128d A;
            A.ptr[0] = value;
            return __builtin_ia32_cvtsd2si64 (A);
        }
        else
        {
            // This is untested!
            // NOTE(review): sseRounding, savedFPUCW and newFPUCW are written by
            // the asm but bound as input operands. GCC extended asm expects
            // written operands to be outputs - confirm and fix once this path
            // can actually be tested.
            uint sseRounding;
            ushort savedFPUCW;
            ushort newFPUCW;
            long result;
            asm pure nothrow @nogc @trusted
            {
                "stmxcsr %1;\n" ~
                "fld %2;\n" ~
                "fnstcw %3;\n" ~
                "movw %3, %%ax;\n" ~
                "andw $0xf3ff, %%ax;\n" ~
                "movzwl %1, %%ecx;\n" ~
                "andl $0x6000, %%ecx;\n" ~
                "shrl $3, %%ecx;\n" ~
                "orw %%cx, %%ax\n" ~
                "movw %%ax, %4;\n" ~
                "fldcw %4;\n" ~
                "fistpll %0;\n" ~
                "fldcw %3;\n"
                  : "=m"(result)    // %0
                  : "m" (sseRounding),
                    "t" (value),
                    "m" (savedFPUCW),
                    "m" (newFPUCW)
                  : "eax", "ecx", "st";
            }
            return result;
        }
    }
    else
        static assert(false);
}

//
//  </ROUNDING>
//


// using the Intel terminology here

/// Clamps a signed 16-bit value into the signed 8-bit range [-128, 127].
byte saturateSignedWordToSignedByte(short value) pure @safe
{
    if (value > 127) value = 127;
    if (value < -128) value = -128;
    return cast(byte) value;
}

/// Clamps a signed 16-bit value into the unsigned 8-bit range [0, 255].
ubyte saturateSignedWordToUnsignedByte(short value) pure @safe
{
    if (value > 255) value = 255;
    if (value < 0) value = 0;
    return cast(ubyte) value;
}

/// Clamps a signed 32-bit value into the signed 16-bit range [-32768, 32767].
short saturateSignedIntToSignedShort(int value) pure @safe
{
    if (value > 32767) value = 32767;
    if (value < -32768) value = -32768;
    return cast(short) value;
}

/// Clamps a signed 32-bit value into the unsigned 16-bit range [0, 65535].
ushort saturateSignedIntToUnsignedShort(int value) pure @safe
{
    if (value > 65535) value = 65535;
    if (value < 0) value = 0;
    return cast(ushort) value;
}

unittest // test saturate operations
{
    assert( saturateSignedWordToSignedByte(32000) == 127);
    assert( saturateSignedWordToUnsignedByte(32000) == 255);
    assert( saturateSignedWordToSignedByte(-4000) == -128);
    assert( saturateSignedWordToUnsignedByte(-4000) == 0);
    assert( saturateSignedIntToSignedShort(32768) == 32767);
    assert( saturateSignedIntToUnsignedShort(32768) == 32768);
    assert( saturateSignedIntToSignedShort(-32769) == -32768);
    assert( saturateSignedIntToUnsignedShort(-32769) == 0);
}

version(unittest)
{
    // This is just for debugging tests
    import core.stdc.stdio: printf;

    // printing vectors for implementation
    // Note: you can override `pure` within a `debug` clause

    /// Prints the single 64-bit lane of `v`.
    void _mm_print_pi64(__m64 v) @trusted
    {
        long1 vl = cast(long1)v;
        printf("%lld\n", vl.array[0]);
    }

    /// Prints `v` as two 32-bit integers.
    void _mm_print_pi32(__m64 v) @trusted
    {
        int[2] C = (cast(int2)v).array;
        printf("%d %d\n", C[0], C[1]);
    }

    /// Prints `v` as four 16-bit integers.
    void _mm_print_pi16(__m64 v) @trusted
    {
        short[4] C = (cast(short4)v).array;
        printf("%d %d %d %d\n", C[0], C[1], C[2], C[3]);
    }

    /// Prints `v` as eight 8-bit integers.
    void _mm_print_pi8(__m64 v) @trusted
    {
        byte[8] C = (cast(byte8)v).array;
        printf("%d %d %d %d %d %d %d %d\n",
        C[0], C[1], C[2], C[3], C[4], C[5], C[6], C[7]);
    }

    /// Prints `v` as two 64-bit integers.
    void _mm_print_epi64(__m128i v) @trusted
    {
        long2 vl = cast(long2)v;
        printf("%lld %lld\n", vl.array[0], vl.array[1]);
    }

    /// Prints `v` as four 32-bit integers.
    void _mm_print_epi32(__m128i v) @trusted
    {
        printf("%d %d %d %d\n",
              v.array[0], v.array[1], v.array[2], v.array[3]);
    }

    /// Prints `v` as eight 16-bit integers.
    void _mm_print_epi16(__m128i v) @trusted
    {
        short[8] C = (cast(short8)v).array;
        printf("%d %d %d %d %d %d %d %d\n",
        C[0], C[1], C[2], C[3], C[4], C[5], C[6], C[7]);
    }

    /// Prints `v` as sixteen 8-bit integers.
    void _mm_print_epi8(__m128i v) @trusted
    {
        byte[16] C = (cast(byte16)v).array;
        printf("%d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d\n",
        C[0], C[1], C[2], C[3], C[4], C[5], C[6], C[7], C[8], C[9], C[10], C[11], C[12], C[13], C[14], C[15]);
    }

    /// Prints `v` as four floats.
    void _mm_print_ps(__m128 v) @trusted
    {
        float[4] C = (cast(float4)v).array;
        printf("%f %f %f %f\n", C[0], C[1], C[2], C[3]);
    }

    /// Prints `v` as two doubles.
    void _mm_print_pd(__m128d v) @trusted
    {
        double[2] C = (cast(double2)v).array;
        printf("%f %f\n", C[0], C[1]);
    }
}


//
//  <FLOATING-POINT COMPARISONS>
//
// Note: `ldc.simd` cannot express all nuances of FP comparisons, so we
//       need different IR generation.

enum FPComparison
{
    oeq,   // ordered and equal
    ogt,   // ordered and greater than
    oge,   // ordered and greater than or equal
    olt,   // ordered and less than
    ole,   // ordered and less than or equal
    one,   // ordered and not equal
    ord,   // ordered (no nans)
    ueq,   // unordered or equal
    ugt,   // unordered or greater than ("nle")
    uge,   // unordered or greater than or equal ("nlt")
    ult,   // unordered or less than ("nge")
    ule,   // unordered or less than or equal ("ngt")
    une,   // unordered or not equal ("neq")
    uno,   // unordered (either nans)
}

// Predicate names, indexed by FPComparison value; they are spliced into the
// `fcmp` instructions of the inline IR used by the LDC code path.
private static immutable string[FPComparison.max+1] FPComparisonToString =
[
    "oeq",
    "ogt",
    "oge",
    "olt",
    "ole",
    "one",
    "ord",
    "ueq",
    "ugt",
    "uge",
    "ult",
    "ule",
    "une",
    "uno",
];

// Individual float comparison: returns `true` when the predicate holds, `false` otherwise.
// Useful for DMD and testing
private bool compareFloat(T)(FPComparison comparison, T a, T b) pure @safe
{
    import std.math;

    // A pair is "ordered" only when neither operand is a NaN.
    immutable bool ordered = !isNaN(a) && !isNaN(b);

    final switch (comparison)
    {
        case FPComparison.oeq: return a == b;
        case FPComparison.ogt: return a > b;
        case FPComparison.oge: return a >= b;
        case FPComparison.olt: return a < b;
        case FPComparison.ole: return a <= b;
        case FPComparison.one: return ordered && (a != b); // `!=` alone is true for NaN operands
        case FPComparison.ord: return ordered;
        case FPComparison.ueq: return !ordered || (a == b);
        case FPComparison.ugt: return !ordered || (a > b);
        case FPComparison.uge: return !ordered || (a >= b);
        case FPComparison.ult: return !ordered || (a < b);
        case FPComparison.ule: return !ordered || (a <= b);
        case FPComparison.une: return a != b;              // `!=` is already true for NaN operands
        case FPComparison.uno: return !ordered;
    }
}

version(LDC)
{
    /// Packed float comparison: each lane is all-ones when the predicate
    /// holds and zero otherwise.
    package int4 cmpps(FPComparison comparison)(float4 a, float4 b) pure @safe
    {
        enum string predicate = FPComparisonToString[comparison];
        enum string code = `
            %cmp = fcmp ` ~ predicate ~ ` <4 x float> %0, %1
            %r = sext <4 x i1> %cmp to <4 x i32>
            ret <4 x i32> %r`;

        return LDCInlineIR!(code, int4, float4, float4)(a, b);
    }

    /// Packed double comparison: each lane is all-ones when the predicate
    /// holds and zero otherwise.
    package long2 cmppd(FPComparison comparison)(double2 a, double2 b) pure @safe
    {
        enum string predicate = FPComparisonToString[comparison];
        enum string code = `
            %cmp = fcmp ` ~ predicate ~ ` <2 x double> %0, %1
            %r = sext <2 x i1> %cmp to <2 x i64>
            ret <2 x i64> %r`;

        return LDCInlineIR!(code, long2, double2, double2)(a, b);
    }

    /// CMPSS-style comparison: lane 0 receives the comparison mask, the other
    /// lanes are copied from `a`.
    /// clang implements it through x86 intrinsics; it is possible with IR alone
    /// but leads to less optimal code.
    /// PERF: try to implement it with __builtin_ia32_cmpss and immediate 0 to 7.
    /// Not that simple.
    package float4 cmpss(FPComparison comparison)(float4 a, float4 b) pure @safe
    {
        /*
        enum ubyte predicateNumber = FPComparisonToX86Predicate[comparison];
        enum bool invertOp = (predicateNumber & 0x80) != 0;
        static if(invertOp)
            return __builtin_ia32_cmpsd(b, a, predicateNumber & 0x7f);
        else
            return __builtin_ia32_cmpsd(a, b, predicateNumber & 0x7f);
        */
        enum string predicate = FPComparisonToString[comparison];
        enum string code = `
            %cmp = fcmp ` ~ predicate ~ ` float %0, %1
            %r = sext i1 %cmp to i32
            %r2 = bitcast i32 %r to float
            ret float %r2`;

        float4 merged = a;
        merged[0] = LDCInlineIR!(code, float, float, float)(a[0], b[0]);
        return merged;
    }

    /// CMPSD-style comparison: lane 0 receives the comparison mask, the other
    /// lane is copied from `a`.
    /// clang implements it through x86 intrinsics; it is possible with IR alone
    /// but leads to less optimal code.
    /// PERF: try to implement it with __builtin_ia32_cmpsd and immediate 0 to 7.
    /// Not that simple.
    package double2 cmpsd(FPComparison comparison)(double2 a, double2 b) pure @safe
    {
        enum string predicate = FPComparisonToString[comparison];
        enum string code = `
            %cmp = fcmp ` ~ predicate ~ ` double %0, %1
            %r = sext i1 %cmp to i64
            %r2 = bitcast i64 %r to double
            ret double %r2`;

        double2 merged = a;
        merged[0] = LDCInlineIR!(code, double, double, double)(a[0], b[0]);
        return merged;
    }

    /// Compares lane 0 of `a` and `b`; yields 1 when the predicate holds, else 0.
    // Note: ucomss and ucomsd are left unimplemented
    package int comss(FPComparison comparison)(float4 a, float4 b) pure @safe
    {
        enum string predicate = FPComparisonToString[comparison];
        enum string code = `
            %cmp = fcmp ` ~ predicate ~ ` float %0, %1
            %r = zext i1 %cmp to i32
            ret i32 %r`;

        return LDCInlineIR!(code, int, float, float)(a[0], b[0]);
    }

    /// Compares lane 0 of `a` and `b`; yields 1 when the predicate holds, else 0.
    // Note: ucomss and ucomsd are left unimplemented
    package int comsd(FPComparison comparison)(double2 a, double2 b) pure @safe
    {
        enum string predicate = FPComparisonToString[comparison];
        enum string code = `
            %cmp = fcmp ` ~ predicate ~ ` double %0, %1
            %r = zext i1 %cmp to i32
            ret i32 %r`;

        return LDCInlineIR!(code, int, double, double)(a[0], b[0]);
    }
}
else
{
    /// Packed float comparison: each lane is -1 when the predicate holds
    /// and 0 otherwise.
    package int4 cmpps(FPComparison comparison)(float4 a, float4 b) pure @trusted
    {
        int4 mask;
        for (int lane = 0; lane < 4; ++lane)
        {
            immutable bool holds = compareFloat!float(comparison, a.array[lane], b.array[lane]);
            mask.ptr[lane] = holds ? -1 : 0;
        }
        return mask;
    }

    /// Packed double comparison: each lane is -1 when the predicate holds
    /// and 0 otherwise.
    package long2 cmppd(FPComparison comparison)(double2 a, double2 b) pure @trusted
    {
        long2 mask;
        for (int lane = 0; lane < 2; ++lane)
        {
            immutable bool holds = compareFloat!double(comparison, a.array[lane], b.array[lane]);
            mask.ptr[lane] = holds ? -1 : 0;
        }
        return mask;
    }

    /// CMPSS-style comparison: lane 0 receives the comparison mask, the other
    /// lanes are copied from `a`.
    package float4 cmpss(FPComparison comparison)(float4 a, float4 b) pure @trusted
    {
        immutable bool holds = compareFloat!float(comparison, a.array[0], b.array[0]);
        int4 bits = cast(int4)a;
        bits.ptr[0] = holds ? -1 : 0;
        return cast(float4)bits;
    }

    /// CMPSD-style comparison: lane 0 receives the comparison mask, the other
    /// lane is copied from `a`.
    package double2 cmpsd(FPComparison comparison)(double2 a, double2 b) pure @trusted
    {
        immutable bool holds = compareFloat!double(comparison, a.array[0], b.array[0]);
        long2 bits = cast(long2)a;
        bits.ptr[0] = holds ? -1 : 0;
        return cast(double2)bits;
    }

    /// Compares lane 0 of `a` and `b`; yields 1 when the predicate holds, else 0.
    package int comss(FPComparison comparison)(float4 a, float4 b) pure @safe
    {
        if (compareFloat!float(comparison, a.array[0], b.array[0]))
            return 1;
        return 0;
    }

    /// Compares lane 0 of `a` and `b`; yields 1 when the predicate holds, else 0.
    // Note: ucomss and ucomsd are left unimplemented
    package int comsd(FPComparison comparison)(double2 a, double2 b) pure @safe
    {
        if (compareFloat!double(comparison, a.array[0], b.array[0]))
            return 1;
        return 0;
    }
}
unittest // cmpps
{
    // The four lanes cover: less-than, equal, greater-than and unordered (NaN).
    float4 A = [1, 3, 5, float.nan];
    float4 B = [2, 3, 4, 5];

    // Expected lane masks, indexed by FPComparison value.
    static immutable int[4][FPComparison.max + 1] expected =
    [
        [ 0,-1, 0, 0], // oeq
        [ 0, 0,-1, 0], // ogt
        [ 0,-1,-1, 0], // oge
        [-1, 0, 0, 0], // olt
        [-1,-1, 0, 0], // ole
        [-1, 0,-1, 0], // one
        [-1,-1,-1, 0], // ord
        [ 0,-1, 0,-1], // ueq
        [ 0, 0,-1,-1], // ugt
        [ 0,-1,-1,-1], // uge
        [-1, 0, 0,-1], // ult
        [-1,-1, 0,-1], // ule
        [-1, 0,-1,-1], // une
        [ 0, 0, 0,-1], // uno
    ];

    // Unrolled at compile time: one instantiation per comparison kind.
    foreach (name; __traits(allMembers, FPComparison))
    {
        enum FPComparison which = __traits(getMember, FPComparison, name);
        int4 got = cmpps!which(A, B);
        assert(got.array == expected[which]);
    }
}
unittest
{
    double2 lhs = [1, 3];
    double2 rhs = [2, 3];
    static immutable long[2] expected = [-1L, 0];
    long2 mask = cmppd!(FPComparison.ult)(lhs, rhs);
    assert(mask.array == expected);
}
unittest // cmpss and comss
{
    // Checks cmpss and comss against the scalar reference `compareFloat`.
    void check(FPComparison comparison)(float4 lhs, float4 rhs)
    {
        immutable int reference = compareFloat!float(comparison, lhs.array[0], rhs.array[0]) ? -1 : 0;

        float4 merged = cmpss!comparison(lhs, rhs);
        int4 bits = cast(int4)merged;
        assert(bits.array[0] == reference);

        // upper lanes must pass through from the first operand
        foreach (lane; 1 .. 4)
            assert(merged.array[lane] == lhs.array[lane]);

        // check comss
        immutable int flag = comss!comparison(lhs, rhs);
        assert( (reference != 0) == (flag != 0) );
    }

    // Check all comparison type is working
    float4 A = [1, 3, 5, 6];
    float4 B = [2, 3, 4, 5];
    float4 C = [float.nan, 3, 4, 5];

    // Unrolled at compile time: ordered pair first, then the NaN pair.
    foreach (name; __traits(allMembers, FPComparison))
    {
        enum FPComparison which = __traits(getMember, FPComparison, name);
        check!which(A, B);
        check!which(A, C);
    }
}
unittest // cmpsd and comsd
{
    // Checks cmpsd and comsd against the scalar reference `compareFloat`.
    void check(FPComparison comparison)(double2 lhs, double2 rhs)
    {
        immutable long reference = compareFloat!double(comparison, lhs.array[0], rhs.array[0]) ? -1 : 0;

        double2 merged = cmpsd!comparison(lhs, rhs);
        long2 bits = cast(long2)merged;
        assert(bits.array[0] == reference);

        // upper lane must pass through from the first operand
        assert(merged.array[1] == lhs.array[1]);

        // check comsd
        immutable int flag = comsd!comparison(lhs, rhs);
        assert( (reference != 0) == (flag != 0) );
    }

    // Check all comparison type is working
    double2 A = [1, 3];
    double2 B = [2, 4];
    double2 C = [double.nan, 5];

    // Unrolled at compile time: ordered pair first, then the NaN pair.
    foreach (name; __traits(allMembers, FPComparison))
    {
        enum FPComparison which = __traits(getMember, FPComparison, name);
        check!which(A, B);
        check!which(A, C);
    }
}

//
//  </FLOATING-POINT COMPARISONS>
//


/// Returns the low 64 bits of `a` as an `__m64`.
__m64 to_m64(__m128i a) pure @trusted
{
    long2 wide = cast(long2)a;
    long1 narrow;
    narrow.ptr[0] = wide.array[0];
    return narrow;
}

/// Places `a` in the low 64 bits of an `__m128i`; the high 64 bits are zero.
__m128i to_m128i(__m64 a) pure @trusted
{
    long2 wide = [0, 0];
    wide.ptr[0] = a.array[0];
    return cast(__m128i)wide;
}

// SOME NEON INTRINSICS
// Emulating some x86 intrinsics needs access to a range of ARM intrinsics.
// Not in the public API but the simde project expose it all for the user to use.
// MAYDO: create a new neon.d module, for internal use only.
// MAYDO: port them to ARM32 so that ARM32 can be as fast as ARM64.
static if (LDC_with_ARM64)
{
    // VERY USEFUL LINK
    // https://github.com/ldc-developers/llvm-project/blob/ldc-release/11.x/llvm/include/llvm/IR/IntrinsicsAArch64.td

    /// Bound to the LLVM intrinsic `llvm.aarch64.neon.addp.v8i8`.
    pragma(LDC_intrinsic, "llvm.aarch64.neon.addp.v8i8")
        byte8 vpadd_u8(byte8 a, byte8 b) pure @safe;

    /// Bitwise AND of the two vectors.
    byte8 vand_u8(byte8 a, byte8 b) pure @safe
    {
        byte8 masked = a & b;
        return masked;
    }

    /// Concatenates two 2-element vectors: `lo` fills elements 0..1,
    /// `hi` fills elements 2..3.
    int4 vcombine_s32(int2 lo, int2 hi) pure @trusted
    {
        int4 joined;
        foreach (i; 0 .. 2)
        {
            joined.ptr[i]     = lo.array[i];
            joined.ptr[i + 2] = hi.array[i];
        }
        return joined;
    }

    /// Concatenates two 8-element vectors: `lo` fills elements 0..7,
    /// `hi` fills elements 8..15.
    byte16 vcombine_s8(byte8 lo, byte8 hi) pure @trusted
    {
        byte16 joined;
        foreach (i; 0 .. 8)
        {
            joined.ptr[i]     = lo.array[i];
            joined.ptr[i + 8] = hi.array[i];
        }
        return joined;
    }

    /// Bound to `llvm.aarch64.neon.fcvtms.v4i32.v4f32`
    /// (float to signed int, rounding toward minus infinity).
    pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtms.v4i32.v4f32")
        int4 vcvtmq_s32_f32(float4 a) pure @safe;

    /// Bound to `llvm.aarch64.neon.fcvtns.v4i32.v4f32`
    /// (float to signed int, rounding to nearest, ties to even).
    pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtns.v4i32.v4f32")
        int4 vcvtnq_s32_f32(float4 a) pure @safe;

    /// Bound to `llvm.aarch64.neon.fcvtps.v4i32.v4f32`
    /// (float to signed int, rounding toward plus infinity).
    pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtps.v4i32.v4f32")
        int4 vcvtpq_s32_f32(float4 a) pure @safe;

    /// Bound to `llvm.aarch64.neon.fcvtzs.v4i32.v4f32`
    /// (float to signed int, rounding toward zero).
    pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtzs.v4i32.v4f32")
        int4 vcvtzq_s32_f32(float4 a) pure @safe;

    /// Returns elements 4..7 of `a`.
    short4 vget_high_s16(short8 a) pure @trusted
    {
        short4 upper;
        foreach (i; 0 .. 4)
            upper.ptr[i] = a.array[i + 4];
        return upper;
    }

    /// Returns elements 2..3 of `a`.
    int2 vget_high_s32(int4 a) pure @trusted
    {
        int2 upper;
        foreach (i; 0 .. 2)
            upper.ptr[i] = a.array[i + 2];
        return upper;
    }

    /// Returns elements 8..15 of `a`.
    byte8 vget_high_u8(byte16 a) pure @trusted
    {
        byte8 upper;
        foreach (i; 0 .. 8)
            upper.ptr[i] = a.array[i + 8];
        return upper;
    }

    /// Returns elements 0..3 of `a`.
    short4 vget_low_s16(short8 a) pure @trusted
    {
        short4 lower;
        foreach (i; 0 .. 4)
            lower.ptr[i] = a.array[i];
        return lower;
    }

    /// Returns elements 0..1 of `a`.
    int2 vget_low_s32(int4 a) pure @trusted
    {
        int2 lower;
        foreach (i; 0 .. 2)
            lower.ptr[i] = a.array[i];
        return lower;
    }

    /// Returns elements 0..7 of `a`.
    byte8 vget_low_u8(byte16 a) pure @trusted
    {
        byte8 lower;
        foreach (i; 0 .. 8)
            lower.ptr[i] = a.array[i];
        return lower;
    }

    /// Bound to the LLVM intrinsic `llvm.aarch64.neon.smax.v8i16`.
    pragma(LDC_intrinsic, "llvm.aarch64.neon.smax.v8i16")
        short8 vmaxq_s16(short8 a, short8 b) pure @safe;

    /// Bound to the LLVM intrinsic `llvm.aarch64.neon.smin.v8i16`.
    pragma(LDC_intrinsic, "llvm.aarch64.neon.smin.v8i16")
        short8 vminq_s16(short8 a, short8 b) pure @safe;

    /// Element-wise product of two 4 x 16-bit vectors, stored as 4 x 32-bit.
    int4 vmull_s16(short4 a, short4 b) pure @trusted
    {
        int4 products;
        foreach (i; 0 .. 4)
            products.ptr[i] = a.array[i] * b.array[i]; // operands are promoted to int before multiplying
        return products;
    }

    // LDC 1.18 started using LLVM 9, which changed the name of this builtin.
    static if (__VERSION__ < 2088)
    {
        pragma(LDC_intrinsic, "llvm.aarch64.neon.addp.v4f32")
            float4 vpaddq_f32(float4 a, float4 b) pure @safe;
    }
    else
    {
        pragma(LDC_intrinsic, "llvm.aarch64.neon.faddp.v4f32")
            float4 vpaddq_f32(float4 a, float4 b) pure @safe;
    }

    /// Bound to the LLVM intrinsic `llvm.aarch64.neon.addp.v2i32`.
    pragma(LDC_intrinsic, "llvm.aarch64.neon.addp.v2i32")
        int2 vpadd_s32(int2 a, int2 b) pure @safe;

    /// Bound to the LLVM intrinsic `llvm.aarch64.neon.addp.v16i8`.
    pragma(LDC_intrinsic, "llvm.aarch64.neon.addp.v16i8")
        byte16 vpaddq_s8(byte16 a, byte16 b) pure @safe;

    /// Bound to the LLVM intrinsic `llvm.aarch64.neon.sqxtn.v8i8`.
    pragma(LDC_intrinsic, "llvm.aarch64.neon.sqxtn.v8i8")
        byte8 vqmovn_s16(short8 a) pure @safe;

    /// Bound to the LLVM intrinsic `llvm.aarch64.neon.sqxtun.v8i8`.
    pragma(LDC_intrinsic, "llvm.aarch64.neon.sqxtun.v8i8")
        byte8 vqmovun_s16(short8 a) pure @safe;

    /// Bound to the LLVM intrinsic `llvm.aarch64.neon.urhadd.v16i8`.
    pragma(LDC_intrinsic, "llvm.aarch64.neon.urhadd.v16i8")
        byte16 vrhadd_u8(byte16 a, byte16 b) pure @safe;

    /// Bound to the LLVM intrinsic `llvm.aarch64.neon.urhadd.v8i16`.
    pragma(LDC_intrinsic, "llvm.aarch64.neon.urhadd.v8i16")
        short8 vrhadd_u16(short8 a, short8 b) pure @safe;

    /// Element-wise unsigned (logical) right shift of `a` by the counts in `b`.
    byte8 vshr_u8(byte8 a, byte8 b) pure @safe
    {
        byte8 shifted = a >>> b;
        return shifted;
    }
}