/**
* Internal stuff only, do not import.
*
* Copyright: Copyright Auburn Sounds 2016-2018, Stefanos Baziotis 2019.
* License:   $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
* Authors:   Guillaume Piolat
*/
module inteli.internals;

import inteli.types;

// The only math functions needed for intel-intrinsics
public import core.math: sqrt; // since it's an intrinsic
public import std.math: abs; // `fabs` is broken with GCC 4.9.2 on Linux 64-bit


// Compile-time capability flags.
// Every compiler/architecture branch below defines the same set of enums
// (GDC_with_* and LDC_with_*), so that the `static if` tests further down
// compile no matter which branch was taken.
version(GNU)
{
    version (X86)
    {
        // For 32-bit x86, disable vector extensions with GDC.
        // It just doesn't work well.
        enum GDC_with_x86 = true;
        enum GDC_with_MMX = false;
        enum GDC_with_SSE = false;
        enum GDC_with_SSE2 = false;
        enum GDC_with_SSE3 = false;
        enum LDC_with_ARM32 = false;
        enum LDC_with_ARM64 = false;
        enum LDC_with_SSE1 = false;
        enum LDC_with_SSE2 = false;
        enum LDC_with_SSE3 = false;
    }
    else version (X86_64)
    {
        // GDC support uses extended inline assembly:
        //   https://gcc.gnu.org/onlinedocs/gcc/Extended-Asm.html        (general information and hints)
        //   https://gcc.gnu.org/onlinedocs/gcc/Simple-Constraints.html  (binding variables to registers)
        //   https://gcc.gnu.org/onlinedocs/gcc/Machine-Constraints.html (x86 specific register short names)

        public import core.simd;

        // NOTE: These intrinsics are not available in every i386 and x86_64 CPU.
        // For more info: https://gcc.gnu.org/onlinedocs/gcc-4.9.2/gcc/X86-Built-in-Functions.html
        public import gcc.builtins;

        enum GDC_with_x86 = true;
        enum GDC_with_MMX = true; // We don't have a way to detect that at CT, but we assume it's there
        enum GDC_with_SSE = true; // We don't have a way to detect that at CT, but we assume it's there
        enum GDC_with_SSE2 = true; // We don't have a way to detect that at CT, but we assume it's there
        enum GDC_with_SSE3 = false; // TODO: we don't have a way to detect that at CT
        enum LDC_with_ARM32 = false;
        enum LDC_with_ARM64 = false;
        enum LDC_with_SSE1 = false;
        enum LDC_with_SSE2 = false;
        enum LDC_with_SSE3 = false;
    }
    else
    {
        // GDC on an architecture other than x86: every flag is off.
        enum GDC_with_x86 = false;
        enum GDC_with_MMX = false;
        enum GDC_with_SSE = false;
        enum GDC_with_SSE2 = false;
        enum GDC_with_SSE3 = false;
        enum LDC_with_ARM32 = false;
        enum LDC_with_ARM64 = false;
        enum LDC_with_SSE1 = false;
        enum LDC_with_SSE2 = false;
        enum LDC_with_SSE3 = false;
    }
}
else version(LDC)
{
    public import core.simd;
    public import ldc.simd;
    public import ldc.intrinsics;
    public import ldc.llvmasm: __asm;

    // Since LDC 1.13, using the new ldc.llvmasm.__ir variants instead of inlineIR
    static if (__VERSION__ >= 2083)
    {
        import ldc.llvmasm;
        alias LDCInlineIR = __ir_pure;

        // A version of inline IR with prefix/suffix didn't exist before LDC 1.13
        alias LDCInlineIREx = __irEx_pure;
    }
    else
    {
        // Note: no LDCInlineIREx alias is defined on this older-frontend path.
        alias LDCInlineIR = inlineIR;
    }

    package(inteli)
    {
        enum GDC_with_x86 = false;
        enum GDC_with_MMX = false;
        enum GDC_with_SSE = false;
        enum GDC_with_SSE2 = false;
        enum GDC_with_SSE3 = false;
    }

    version(ARM)
    {
        public import ldc.gccbuiltins_arm;
        enum LDC_with_ARM32 = true;
        enum LDC_with_ARM64 = false;
        enum LDC_with_SSE1 = false;
        enum LDC_with_SSE2 = false;
        enum LDC_with_SSE3 = false;
    }
    else version(AArch64)
    {
        //public import ldc.gccbuiltins_arm;
        enum LDC_with_ARM32 = false;
        enum LDC_with_ARM64 = true;
        enum LDC_with_SSE1 = false;
        enum LDC_with_SSE2 = false;
        enum LDC_with_SSE3 = false;
    }
    else
    {
        // Any other LDC target is treated as x86: SSE levels are queried
        // from the compilation target's feature set.
        public import ldc.gccbuiltins_x86;
        enum LDC_with_ARM32 = false;
        enum LDC_with_ARM64 = false;
        enum LDC_with_SSE1 = __traits(targetHasFeature, "sse");
        enum LDC_with_SSE2 = __traits(targetHasFeature, "sse2");
        enum LDC_with_SSE3 = __traits(targetHasFeature, "sse3");
    }
}
else version(DigitalMars)
{
    package(inteli)
    {
        enum GDC_with_x86 = false;
        enum GDC_with_MMX = false;
        enum GDC_with_SSE = false;
        enum GDC_with_SSE2 = false;
        enum GDC_with_SSE3 = false;
        enum LDC_with_ARM32 = false;
        enum LDC_with_ARM64 = false;
        enum LDC_with_SSE1 = false;
        enum LDC_with_SSE2 = false;
        enum LDC_with_SSE3 = false;
    }
}
else
{
    static assert(false, "Unknown compiler");
}

// True when building with LDC for either ARM flavour.
enum LDC_with_ARM = LDC_with_ARM32 | LDC_with_ARM64; // ARM32 is largely unsupported though

static if (LDC_with_ARM32)
{
    // Reads the floating-point control word through the FPSCR builtin.
    package uint arm_get_fpcr() nothrow @nogc @trusted
    {
        return __builtin_arm_get_fpscr();
    }

    // Writes `cw` as the new floating-point control word through the FPSCR builtin.
    package void arm_set_fpcr(uint cw) nothrow @nogc @trusted
    {
        __builtin_arm_set_fpscr(cw);
    }
}

static if (LDC_with_ARM64)
{
    pragma(LDC_intrinsic, "llvm.aarch64.get.fpcr")
        long __builtin_aarch64_get_fpcr() pure nothrow @nogc @safe;

    // Reads FPCR through the LLVM intrinsic and truncates the result to 32 bits.
    package uint arm_get_fpcr() pure nothrow @nogc @trusted
    {
        return cast(uint) __builtin_aarch64_get_fpcr();
    }

    // Writes `cw` into FPCR using inline assembly.
    // x2 is used as scratch: it is spilled to `save_x2` first and reloaded afterwards.
    package void arm_set_fpcr(uint cw) nothrow @nogc @trusted
    {
        // Note: there doesn't seem to be an intrinsic in LLVM to set FPCR.
        long save_x2;
        __asm!void("str x2, $1 \n" ~
                   "ldr w2, $0 \n" ~
                   "msr fpcr, x2 \n" ~
                   "ldr x2, $1 "   , "m,m", cw, &save_x2);
    }
}

// DMD_with_asm:       DMD with DMD-style inline assembly available (x86 or x86_64).
// DMD_with_32bit_asm: same, but only when the 32-bit x86 inline assembler is the one available.
version(DigitalMars)
{
    version(D_InlineAsm_X86)
        enum DMD_with_asm = true;
    else version(D_InlineAsm_X86_64)
        enum DMD_with_asm = true;
    else
        enum DMD_with_asm = false;

    version(D_InlineAsm_X86)
        enum DMD_with_32bit_asm = DMD_with_asm; // sometimes you want a 32-bit DMD only solution
    else
        enum DMD_with_32bit_asm = false;
}
else
{
    enum DMD_with_asm = false;
    enum DMD_with_32bit_asm = false;
}


package:
nothrow @nogc:


// For internal use only, since public API deals with a x86 semantic emulation
enum uint _MM_ROUND_NEAREST_ARM     = 0x00000000;
enum uint _MM_ROUND_DOWN_ARM        = 0x00800000;
enum uint _MM_ROUND_UP_ARM          = 0x00400000;
enum uint _MM_ROUND_TOWARD_ZERO_ARM = 0x00C00000;
enum uint _MM_ROUND_MASK_ARM        = 0x00C00000;
enum uint _MM_FLUSH_ZERO_MASK_ARM   = 0x01000000;


//
//  <ROUNDING>
//
//  Why is that there? For DMD, we cannot use rint because _MM_SET_ROUNDING_MODE
//  doesn't change the FPU rounding mode, and isn't expected to do so.
//  So we devised these rounding functions to help having consistent rounding between
//  LDC and DMD. It's important that DMD uses what is in MXCSR to round.
//
//  Note: There is no MXCSR in ARM. But there is fpscr (read as FPCR on AArch64 above)
//        that implements similar functionality.
//        https://developer.arm.com/documentation/dui0068/b/vector-floating-point-programming/vfp-system-registers/fpscr--the-floating-point-status-and-control-register
//        We use fpscr since it's thread-local, so we can emulate those x86 conversions, albeit slowly.

/// Converts a `float` to a 32-bit signed integer, honouring the rounding mode
/// currently selected in MXCSR (x86) or in the ARM floating-point control register.
/// Params:
///     value = the number to convert.
/// Returns: the converted integer.
int convertFloatToInt32UsingMXCSR(float value) @trusted
{
    int result;
    version(GNU)
    {
        asm pure nothrow @nogc @trusted
        {
            "cvtss2si %1, %0\n": "=r"(result) : "x" (value);
        }
    }
    else static if (LDC_with_ARM32)
    {
        // NOTE(review): s2 is used as scratch without being declared as clobbered — confirm this is safe.
        result = __asm!int(`vldr s2, $1
                            vcvtr.s32.f32 s2, s2
                            vmov $0, s2`, "=r,m", value);
    }
    else static if (LDC_with_ARM64)
    {
        // Get current rounding mode.
        uint fpscr = arm_get_fpcr();

        // One dedicated conversion instruction per rounding mode.
        switch(fpscr & _MM_ROUND_MASK_ARM)
        {
            default:
            case _MM_ROUND_NEAREST_ARM:
                result = __asm!int(`ldr s2, $1
                                    fcvtns $0,s2`, "=r,m", value);
                break;
            case _MM_ROUND_DOWN_ARM:
                result = __asm!int(`ldr s2, $1
                                    fcvtms $0,s2`, "=r,m", value);
                break;
            case _MM_ROUND_UP_ARM:
                result = __asm!int(`ldr s2, $1
                                    fcvtps $0,s2`, "=r,m", value);
                break;
            case _MM_ROUND_TOWARD_ZERO_ARM:
                result = cast(int)value;
                break;
        }
    }
    else
    {
        asm pure nothrow @nogc @trusted
        {
            cvtss2si EAX, value;
            mov result, EAX;
        }
    }
    return result;
}

/// Converts a `double` to a 32-bit signed integer, honouring the rounding mode
/// currently selected in MXCSR (x86) or in the ARM floating-point control register.
/// Params:
///     value = the number to convert.
/// Returns: the converted integer.
int convertDoubleToInt32UsingMXCSR(double value) @trusted
{
    int result;
    version(GNU)
    {
        asm pure nothrow @nogc @trusted
        {
            "cvtsd2si %1, %0\n": "=r"(result) : "x" (value);
        }
    }
    else static if (LDC_with_ARM32)
    {
        // NOTE(review): d2/s2 are used as scratch without being declared as clobbered — confirm this is safe.
        result = __asm!int(`vldr d2, $1
                            vcvtr.s32.f64 s2, d2
                            vmov $0, s2`, "=r,m", value);
    }
    else static if (LDC_with_ARM64)
    {
        // Get current rounding mode.
        uint fpscr = arm_get_fpcr();

        // One dedicated conversion instruction per rounding mode.
        switch(fpscr & _MM_ROUND_MASK_ARM)
        {
            default:
            case _MM_ROUND_NEAREST_ARM:
                result = __asm!int(`ldr d2, $1
                                    fcvtns $0,d2`, "=r,m", value);
                break;
            case _MM_ROUND_DOWN_ARM:
                result = __asm!int(`ldr d2, $1
                                    fcvtms $0,d2`, "=r,m", value);
                break;
            case _MM_ROUND_UP_ARM:
                result = __asm!int(`ldr d2, $1
                                    fcvtps $0,d2`, "=r,m", value);
                break;
            case _MM_ROUND_TOWARD_ZERO_ARM:
                result = cast(int)value;
                break;
        }
    }
    else
    {
        asm pure nothrow @nogc @trusted
        {
            cvtsd2si EAX, value;
            mov result, EAX;
        }
    }
    return result;
}

/// Converts a `float` to a 64-bit signed integer, honouring the rounding mode
/// currently selected in MXCSR (x86) or in the ARM floating-point control register.
/// Params:
///     value = the number to convert.
/// Returns: the converted integer.
long convertFloatToInt64UsingMXCSR(float value) @trusted
{
    static if (LDC_with_ARM32)
    {
        // We have to resort to libc since 32-bit ARM
        // doesn't seem to have 64-bit registers.

        uint fpscr = arm_get_fpcr(); // Get current rounding mode.

        // Note: converting to double precision else rounding could be different for large integers
        double asDouble = value;

        switch(fpscr & _MM_ROUND_MASK_ARM)
        {
            default:
            // Round-to-nearest must break ties to even, like the x86 and ARM64 paths do.
            // `llvm_round` breaks ties away from zero (2.5 -> 3), hence `llvm_rint`, which
            // follows the current (here: nearest) rounding mode.
            case _MM_ROUND_NEAREST_ARM:     return cast(long)(llvm_rint(asDouble));
            case _MM_ROUND_DOWN_ARM:        return cast(long)(llvm_floor(asDouble));
            case _MM_ROUND_UP_ARM:          return cast(long)(llvm_ceil(asDouble));
            case _MM_ROUND_TOWARD_ZERO_ARM: return cast(long)(asDouble);
        }
    }
    else static if (LDC_with_ARM64)
    {
        uint fpscr = arm_get_fpcr();

        switch(fpscr & _MM_ROUND_MASK_ARM)
        {
            default:
            case _MM_ROUND_NEAREST_ARM:
                return __asm!long(`ldr s2, $1
                                   fcvtns $0,s2`, "=r,m", value);
            case _MM_ROUND_DOWN_ARM:
                return __asm!long(`ldr s2, $1
                                   fcvtms $0,s2`, "=r,m", value);
            case _MM_ROUND_UP_ARM:
                return __asm!long(`ldr s2, $1
                                   fcvtps $0,s2`, "=r,m", value);
            case _MM_ROUND_TOWARD_ZERO_ARM:
                return cast(long)value;
        }
    }
    // 64-bit can use an SSE instruction
    else version(D_InlineAsm_X86_64)
    {
        long result;
        version(LDC) // work-around for " Data definition directives inside inline asm are not supported yet."
        {
            asm pure nothrow @nogc @trusted
            {
                movss XMM0, value;
                cvtss2si RAX, XMM0;
                mov result, RAX;
            }
        }
        else
        {
            asm pure nothrow @nogc @trusted
            {
                movss XMM0, value;
                db 0xf3; db 0x48; db 0x0f; db 0x2d; db 0xc0; // cvtss2si RAX, XMM0 (DMD refuses to emit)
                mov result, RAX;
            }
        }
        return result;
    }
    else version(D_InlineAsm_X86)
    {
        // In the case of 32-bit x86 there is no SSE2 way to convert FP to 64-bit int
        // This leads to an unfortunate FPU sequence in every C++ compiler.
        // See: https://godbolt.org/z/vZym77

        // Get current MXCSR rounding
        uint sseRounding;
        ushort savedFPUCW;
        ushort newFPUCW;
        long result;
        asm pure nothrow @nogc @trusted
        {
            stmxcsr sseRounding;
            fld value;
            fnstcw savedFPUCW;
            mov AX, savedFPUCW;
            and AX, 0xf3ff;          // clear FPU rounding bits
            movzx ECX, word ptr sseRounding;
            and ECX, 0x6000;         // only keep SSE rounding bits
            shr ECX, 3;
            or AX, CX;               // make a new control word for FPU with SSE bits
            mov newFPUCW, AX;
            fldcw newFPUCW;
            fistp qword ptr result;  // convert, respecting MXCSR (but not other control word things)
            fldcw savedFPUCW;
        }
        return result;
    }
    else static if (GDC_with_x86)
    {
        version(X86_64) // 64-bit can just use the right instruction
        {
            static assert(GDC_with_SSE);
            __m128 A;
            A.ptr[0] = value;
            return __builtin_ia32_cvtss2si64 (A);
        }
        else version(X86) // 32-bit
        {
            // This is untested!
            // NOTE(review): %1, %3 and %4 are written by the asm (stmxcsr/fnstcw/movw) but are
            // declared as inputs, and `value` uses constraint "f" here whereas the double
            // variant uses "t" — confirm against the GCC x87 operand rules before relying on it.
            uint sseRounding;
            ushort savedFPUCW;
            ushort newFPUCW;
            long result;
            asm pure nothrow @nogc @trusted
            {
                "stmxcsr %1;\n" ~
                "fld %2;\n" ~
                "fnstcw %3;\n" ~
                "movw %3, %%ax;\n" ~
                "andw $0xf3ff, %%ax;\n" ~
                "movzwl %1, %%ecx;\n" ~
                "andl $0x6000, %%ecx;\n" ~
                "shrl $3, %%ecx;\n" ~
                "orw %%cx, %%ax\n" ~
                "movw %%ax, %4;\n" ~
                "fldcw %4;\n" ~
                "fistpll %0;\n" ~
                "fldcw %3;\n"
                  : "=m"(result)    // %0
                  : "m" (sseRounding),
                    "f" (value),
                    "m" (savedFPUCW),
                    "m" (newFPUCW)
                  : "eax", "ecx", "st";
            }
            return result;
        }
        else
            static assert(false);
    }
    else
        static assert(false);
}


/// Converts a `double` to a 64-bit signed integer, honouring the rounding mode
/// currently selected in MXCSR (x86) or in the ARM floating-point control register.
/// Params:
///     value = the number to convert.
/// Returns: the converted integer.
long convertDoubleToInt64UsingMXCSR(double value) @trusted
{
    static if (LDC_with_ARM32)
    {
        // We have to resort to libc since 32-bit ARM
        // doesn't seem to have 64-bit registers.
        uint fpscr = arm_get_fpcr(); // Get current rounding mode.
        switch(fpscr & _MM_ROUND_MASK_ARM)
        {
            default:
            // Round-to-nearest must break ties to even, like the x86 and ARM64 paths do.
            // `llvm_round` breaks ties away from zero (2.5 -> 3), hence `llvm_rint`, which
            // follows the current (here: nearest) rounding mode.
            case _MM_ROUND_NEAREST_ARM:     return cast(long)(llvm_rint(value));
            case _MM_ROUND_DOWN_ARM:        return cast(long)(llvm_floor(value));
            case _MM_ROUND_UP_ARM:          return cast(long)(llvm_ceil(value));
            case _MM_ROUND_TOWARD_ZERO_ARM: return cast(long)(value);
        }
    }
    else static if (LDC_with_ARM64)
    {
        // Get current rounding mode.
        uint fpscr = arm_get_fpcr();

        switch(fpscr & _MM_ROUND_MASK_ARM)
        {
            default:
            case _MM_ROUND_NEAREST_ARM:
                return __asm!long(`ldr d2, $1
                                   fcvtns $0,d2`, "=r,m", value);
            case _MM_ROUND_DOWN_ARM:
                return __asm!long(`ldr d2, $1
                                   fcvtms $0,d2`, "=r,m", value);
            case _MM_ROUND_UP_ARM:
                return __asm!long(`ldr d2, $1
                                   fcvtps $0,d2`, "=r,m", value);
            case _MM_ROUND_TOWARD_ZERO_ARM:
                return cast(long)value;
        }
    }
    // 64-bit can use an SSE instruction
    else version(D_InlineAsm_X86_64)
    {
        long result;
        version(LDC) // work-around for "Data definition directives inside inline asm are not supported yet."
        {
            asm pure nothrow @nogc @trusted
            {
                movsd XMM0, value;
                cvtsd2si RAX, XMM0;
                mov result, RAX;
            }
        }
        else
        {
            asm pure nothrow @nogc @trusted
            {
                movsd XMM0, value;
                db 0xf2; db 0x48; db 0x0f; db 0x2d; db 0xc0; // cvtsd2si RAX, XMM0 (DMD refuses to emit)
                mov result, RAX;
            }
        }
        return result;
    }
    else version(D_InlineAsm_X86)
    {
        // In the case of 32-bit x86 there is no SSE2 way to convert FP to 64-bit int
        // This leads to an unfortunate FPU sequence in every C++ compiler.
        // See: https://godbolt.org/z/vZym77

        // Get current MXCSR rounding
        uint sseRounding;
        ushort savedFPUCW;
        ushort newFPUCW;
        long result;
        asm pure nothrow @nogc @trusted
        {
            stmxcsr sseRounding;
            fld value;
            fnstcw savedFPUCW;
            mov AX, savedFPUCW;
            and AX, 0xf3ff;          // clear FPU rounding bits
            movzx ECX, word ptr sseRounding;
            and ECX, 0x6000;         // only keep SSE rounding bits
            shr ECX, 3;
            or AX, CX;               // make a new control word for FPU with SSE bits
            mov newFPUCW, AX;
            fldcw newFPUCW;
            fistp qword ptr result;  // explicit 64-bit store, same as the float variant
            fldcw savedFPUCW;
        }
        return result;
    }
    else static if (GDC_with_x86)
    {
        version(X86_64)
        {
            static assert(GDC_with_SSE2);
            __m128d A;
            A.ptr[0] = value;
            return __builtin_ia32_cvtsd2si64 (A);
        }
        else
        {
            // This is untested!
            // NOTE(review): %1, %3 and %4 are written by the asm (stmxcsr/fnstcw/movw) but are
            // declared as inputs, and `value` uses constraint "t" here whereas the float
            // variant uses "f" — confirm against the GCC x87 operand rules before relying on it.
            uint sseRounding;
            ushort savedFPUCW;
            ushort newFPUCW;
            long result;
            asm pure nothrow @nogc @trusted
            {
                "stmxcsr %1;\n" ~
                "fld %2;\n" ~
                "fnstcw %3;\n" ~
                "movw %3, %%ax;\n" ~
                "andw $0xf3ff, %%ax;\n" ~
                "movzwl %1, %%ecx;\n" ~
                "andl $0x6000, %%ecx;\n" ~
                "shrl $3, %%ecx;\n" ~
                "orw %%cx, %%ax\n" ~
                "movw %%ax, %4;\n" ~
                "fldcw %4;\n" ~
                "fistpll %0;\n" ~
                "fldcw %3;\n"
                  : "=m"(result)    // %0
                  : "m" (sseRounding),
                    "t" (value),
                    "m" (savedFPUCW),
                    "m" (newFPUCW)
                  : "eax", "ecx", "st";
            }
            return result;
        }
    }
    else
        static assert(false);
}

//
//  </ROUNDING>
//


// using the Intel terminology here

/// Clamps a signed 16-bit value into the signed 8-bit range [-128, 127].
byte saturateSignedWordToSignedByte(short value) pure @safe
{
    if (value > 127) value = 127;
    if (value < -128) value = -128;
    return cast(byte) value;
}

/// Clamps a signed 16-bit value into the unsigned 8-bit range [0, 255].
ubyte saturateSignedWordToUnsignedByte(short value) pure @safe
{
    if (value > 255) value = 255;
    if (value < 0) value = 0;
    return cast(ubyte) value;
}

/// Clamps a signed 32-bit value into the signed 16-bit range [-32768, 32767].
short saturateSignedIntToSignedShort(int value) pure @safe
{
    if (value > 32767) value = 32767;
    if (value < -32768) value = -32768;
    return cast(short) value;
}

/// Clamps a signed 32-bit value into the unsigned 16-bit range [0, 65535].
ushort saturateSignedIntToUnsignedShort(int value) pure @safe
{
    if (value > 65535) value = 65535;
    if (value < 0) value = 0;
    return cast(ushort) value;
}

unittest // test saturate operations
{
    assert( saturateSignedWordToSignedByte(32000) == 127);
    assert( saturateSignedWordToUnsignedByte(32000) == 255);
    assert( saturateSignedWordToSignedByte(-4000) == -128);
    assert( saturateSignedWordToUnsignedByte(-4000) == 0);
    assert( saturateSignedIntToSignedShort(32768) == 32767);
    assert( saturateSignedIntToUnsignedShort(32768) == 32768);
    assert( saturateSignedIntToSignedShort(-32769) == -32768);
    assert( saturateSignedIntToUnsignedShort(-32769) == 0);
}

version(unittest)
{
    // This is just for debugging tests
    import core.stdc.stdio: printf;

    // printing vectors for implementation
    // Note: you can override `pure` within a `debug` clause

    // Each helper below prints every lane of its argument on one line, followed by a newline.

    void _mm_print_pi64(__m64 v) @trusted
    {
        long1 vl = cast(long1)v;
        printf("%lld\n", vl.array[0]);
    }

    void _mm_print_pi32(__m64 v) @trusted
    {
        int[2] C = (cast(int2)v).array;
        printf("%d %d\n", C[0], C[1]);
    }

    void _mm_print_pi16(__m64 v) @trusted
    {
        short[4] C = (cast(short4)v).array;
        printf("%d %d %d %d\n", C[0], C[1], C[2], C[3]);
    }

    void _mm_print_pi8(__m64 v) @trusted
    {
        byte[8] C = (cast(byte8)v).array;
        printf("%d %d %d %d %d %d %d %d\n",
        C[0], C[1], C[2], C[3], C[4], C[5], C[6], C[7]);
    }

    void _mm_print_epi64(__m128i v) @trusted
    {
        long2 vl = cast(long2)v;
        printf("%lld %lld\n", vl.array[0], vl.array[1]);
    }

    void _mm_print_epi32(__m128i v) @trusted
    {
        printf("%d %d %d %d\n",
              v.array[0], v.array[1], v.array[2], v.array[3]);
    }

    void _mm_print_epi16(__m128i v) @trusted
    {
        short[8] C = (cast(short8)v).array;
        printf("%d %d %d %d %d %d %d %d\n",
        C[0], C[1], C[2], C[3], C[4], C[5], C[6], C[7]);
    }

    void _mm_print_epi8(__m128i v) @trusted
    {
        byte[16] C = (cast(byte16)v).array;
        printf("%d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d\n",
        C[0], C[1], C[2], C[3], C[4], C[5], C[6], C[7], C[8], C[9], C[10], C[11], C[12], C[13], C[14], C[15]);
    }

    void _mm_print_ps(__m128 v) @trusted
    {
        float[4] C = (cast(float4)v).array;
        printf("%f %f %f %f\n", C[0], C[1], C[2], C[3]);
    }

    void _mm_print_pd(__m128d v) @trusted
    {
        double[2] C = (cast(double2)v).array;
        printf("%f %f\n", C[0], C[1]);
    }
}


//
//  <FLOATING-POINT COMPARISONS>
//
// Note: `ldc.simd` cannot express all nuances of FP comparisons, so we
//       need different IR generation.

// Comparison predicates. The member names are also the predicate strings spliced
// into the `fcmp` inline IR below (see FPComparisonToString).
enum FPComparison
{
    oeq,   // ordered and equal
    ogt,   // ordered and greater than
    oge,   // ordered and greater than or equal
    olt,   // ordered and less than
    ole,   // ordered and less than or equal
    one,   // ordered and not equal
    ord,   // ordered (no nans)
    ueq,   // unordered or equal
    ugt,   // unordered or greater than ("nle")
    uge,   // unordered or greater than or equal ("nlt")
    ult,   // unordered or less than ("nge")
    ule,   // unordered or less than or equal ("ngt")
    une,   // unordered or not equal ("neq")
    uno,   // unordered (either nans)
}

// Predicate name for each FPComparison member, indexed by the member's value.
// Must be kept in the same order as the enum above.
private static immutable string[FPComparison.max+1] FPComparisonToString =
[
    "oeq",
    "ogt",
    "oge",
    "olt",
    "ole",
    "one",
    "ord",
    "ueq",
    "ugt",
    "uge",
    "ult",
    "ule",
    "une",
    "uno",
];

// Individual float comparison: returns `true` when `comparison` holds for (a, b), else `false`.
// Callers turn that into -1/0 (cmp* functions) or 1/0 (com* functions).
// Useful for DMD and testing
private bool compareFloat(T)(FPComparison comparison, T a, T b) pure @safe
{
    import std.math;
    bool unordered = isNaN(a) || isNaN(b); // "unordered" means at least one operand is NaN
    final switch(comparison) with(FPComparison)
    {
        case oeq: return a == b;
        case ogt: return a > b;
        case oge: return a >= b;
        case olt: return a < b;
        case ole: return a <= b;
        case one: return !unordered && (a != b); // NaN with != always yields true
        case ord: return !unordered;
        case ueq: return unordered || (a == b);
        case ugt: return unordered || (a > b);
        case uge: return unordered || (a >= b);
        case ult: return unordered || (a < b);
        case ule: return unordered || (a <= b);
        case une: return (a != b); // NaN with != always yields true
        case uno: return unordered;
    }
}

version(LDC)
{
    /// Provides packed float comparisons
    /// Each lane of the result is the sign-extended `fcmp` outcome (all bits set when true, 0 when false).
    package int4 cmpps(FPComparison comparison)(float4 a, float4 b) pure @safe
    {
        enum ir = `
            %cmp = fcmp `~ FPComparisonToString[comparison] ~` <4 x float> %0, %1
            %r = sext <4 x i1> %cmp to <4 x i32>
            ret <4 x i32> %r`;

        return LDCInlineIR!(ir, int4, float4, float4)(a, b);
    }

    /// Provides packed double comparisons
    /// Each lane of the result is the sign-extended `fcmp` outcome (all bits set when true, 0 when false).
    package long2 cmppd(FPComparison comparison)(double2 a, double2 b) pure @safe
    {
        enum ir = `
            %cmp = fcmp `~ FPComparisonToString[comparison] ~` <2 x double> %0, %1
            %r = sext <2 x i1> %cmp to <2 x i64>
            ret <2 x i64> %r`;

        return LDCInlineIR!(ir, long2, double2, double2)(a, b);
    }

    /// CMPSS-style comparisons
    /// clang implement it through x86 intrinsics, it is possible with IR alone
    /// but leads to less optimal code.
    /// PERF: try to implement it with __builtin_ia32_cmpss and immediate 0 to 7.
    /// Not that simple.
    /// Only lane 0 is compared; the other lanes are copied from `a`.
    package float4 cmpss(FPComparison comparison)(float4 a, float4 b) pure @safe
    {
        /*
        enum ubyte predicateNumber = FPComparisonToX86Predicate[comparison];
        enum bool invertOp = (predicateNumber & 0x80) != 0;
        static if(invertOp)
            return __builtin_ia32_cmpsd(b, a, predicateNumber & 0x7f);
        else
            return __builtin_ia32_cmpsd(a, b, predicateNumber & 0x7f);
        */
        enum ir = `
            %cmp = fcmp `~ FPComparisonToString[comparison] ~` float %0, %1
            %r = sext i1 %cmp to i32
            %r2 = bitcast i32 %r to float
            ret float %r2`;

        float4 r = a;
        r[0] = LDCInlineIR!(ir, float, float, float)(a[0], b[0]);
        return r;
    }

    /// CMPSD-style comparisons
    /// clang implement it through x86 intrinsics, it is possible with IR alone
    /// but leads to less optimal code.
    /// PERF: try to implement it with __builtin_ia32_cmpsd and immediate 0 to 7.
    /// Not that simple.
    /// Only lane 0 is compared; the other lane is copied from `a`.
    package double2 cmpsd(FPComparison comparison)(double2 a, double2 b) pure @safe
    {
        enum ir = `
            %cmp = fcmp `~ FPComparisonToString[comparison] ~` double %0, %1
            %r = sext i1 %cmp to i64
            %r2 = bitcast i64 %r to double
            ret double %r2`;

        double2 r = a;
        r[0] = LDCInlineIR!(ir, double, double, double)(a[0], b[0]);
        return r;
    }

    // Note: ucomss and ucomsd are left unimplemented
    // Compares lane 0 only; the `fcmp` outcome is zero-extended, so the result is 1 or 0.
    package int comss(FPComparison comparison)(float4 a, float4 b) pure @safe
    {
        enum ir = `
            %cmp = fcmp `~ FPComparisonToString[comparison] ~` float %0, %1
            %r = zext i1 %cmp to i32
            ret i32 %r`;

        return LDCInlineIR!(ir, int, float, float)(a[0], b[0]);
    }

    // Note: ucomss and ucomsd are left unimplemented
    // Compares lane 0 only; the `fcmp` outcome is zero-extended, so the result is 1 or 0.
    package int comsd(FPComparison comparison)(double2 a, double2 b) pure @safe
    {
        enum ir = `
            %cmp = fcmp `~ FPComparisonToString[comparison] ~` double %0, %1
            %r = zext i1 %cmp to i32
            ret i32 %r`;

        return LDCInlineIR!(ir, int, double, double)(a[0], b[0]);
    }
}
else
{
    // Fallback for compilers other than LDC: same functions, built lane by lane on compareFloat.

    /// Provides packed float comparisons
    package int4 cmpps(FPComparison comparison)(float4 a, float4 b) pure @trusted
    {
        int4 result;
        foreach(i; 0..4)
        {
            result.ptr[i] = compareFloat!float(comparison, a.array[i], b.array[i]) ? -1 : 0;
        }
        return result;
    }

    /// Provides packed double comparisons
    package long2 cmppd(FPComparison comparison)(double2 a, double2 b) pure @trusted
    {
        long2 result;
        foreach(i; 0..2)
        {
            result.ptr[i] = compareFloat!double(comparison, a.array[i], b.array[i]) ? -1 : 0;
        }
        return result;
    }

    /// Provides CMPSS-style comparison
    /// Only lane 0 is replaced by the -1/0 mask; the other lanes keep the bits of `a`.
    package float4 cmpss(FPComparison comparison)(float4 a, float4 b) pure @trusted
    {
        int4 result = cast(int4)a;
        result.ptr[0] = compareFloat!float(comparison, a.array[0], b.array[0]) ? -1 : 0;
        return cast(float4)result;
    }

    /// Provides CMPSD-style comparison
    /// Only lane 0 is replaced by the -1/0 mask; the other lane keeps the bits of `a`.
    package double2 cmpsd(FPComparison comparison)(double2 a, double2 b) pure @trusted
    {
        long2 result = cast(long2)a;
        result.ptr[0] = compareFloat!double(comparison, a.array[0], b.array[0]) ? -1 : 0;
        return cast(double2)result;
    }

    // Compares lane 0 only and returns 1 or 0.
    package int comss(FPComparison comparison)(float4 a, float4 b) pure @safe
    {
        return compareFloat!float(comparison, a.array[0], b.array[0]) ? 1 : 0;
    }

    // Note: ucomss and ucomsd are left unimplemented
    // Compares lane 0 only and returns 1 or 0.
    package int comsd(FPComparison comparison)(double2 a, double2 b) pure @safe
    {
        return compareFloat!double(comparison, a.array[0], b.array[0]) ? 1 : 0;
    }
}
unittest // cmpps
{
    // Check all comparison type is working
    // Lanes cover: less than, equal, greater than, and NaN (unordered).
    float4 A = [1, 3, 5, float.nan];
    float4 B = [2, 3, 4, 5];

    int4 result_oeq = cmpps!(FPComparison.oeq)(A, B);
    int4 result_ogt = cmpps!(FPComparison.ogt)(A, B);
    int4 result_oge = cmpps!(FPComparison.oge)(A, B);
    int4 result_olt = cmpps!(FPComparison.olt)(A, B);
    int4 result_ole = cmpps!(FPComparison.ole)(A, B);
    int4 result_one = cmpps!(FPComparison.one)(A, B);
    int4 result_ord = cmpps!(FPComparison.ord)(A, B);
    int4 result_ueq = cmpps!(FPComparison.ueq)(A, B);
    int4 result_ugt = cmpps!(FPComparison.ugt)(A, B);
    int4 result_uge = cmpps!(FPComparison.uge)(A, B);
    int4 result_ult = cmpps!(FPComparison.ult)(A, B);
    int4 result_ule = cmpps!(FPComparison.ule)(A, B);
    int4 result_une = cmpps!(FPComparison.une)(A, B);
    int4 result_uno = cmpps!(FPComparison.uno)(A, B);

    static immutable int[4] correct_oeq    = [ 0,-1, 0, 0];
    static immutable int[4] correct_ogt    = [ 0, 0,-1, 0];
    static immutable int[4] correct_oge    = [ 0,-1,-1, 0];
    static immutable int[4] correct_olt    = [-1, 0, 0, 0];
    static immutable int[4] correct_ole    = [-1,-1, 0, 0];
    static immutable int[4] correct_one    = [-1, 0,-1, 0];
    static immutable int[4] correct_ord    = [-1,-1,-1, 0];
    static immutable int[4] correct_ueq    = [ 0,-1, 0,-1];
    static immutable int[4] correct_ugt    = [ 0, 0,-1,-1];
    static immutable int[4] correct_uge    = [ 0,-1,-1,-1];
    static immutable int[4] correct_ult    = [-1, 0, 0,-1];
    static immutable int[4] correct_ule    = [-1,-1, 0,-1];
    static immutable int[4] correct_une    = [-1, 0,-1,-1];
    static immutable int[4] correct_uno    = [ 0, 0, 0,-1];

    assert(result_oeq.array == correct_oeq);
    assert(result_ogt.array == correct_ogt);
    assert(result_oge.array == correct_oge);
    assert(result_olt.array == correct_olt);
    assert(result_ole.array == correct_ole);
    assert(result_one.array == correct_one);
    assert(result_ord.array == correct_ord);
    assert(result_ueq.array == correct_ueq);
    assert(result_ugt.array == correct_ugt);
    assert(result_uge.array == correct_uge);
    assert(result_ult.array == correct_ult);
    assert(result_ule.array == correct_ule);
    assert(result_une.array == correct_une);
    assert(result_uno.array == correct_uno);
}
unittest // cmppd
{
    double2 a = [1, 3];
    double2 b = [2, 3];
    long2 c = cmppd!(FPComparison.ult)(a, b);
    static immutable long[2] correct = [cast(long)(-1), 0];
    assert(c.array == correct);
}
unittest // cmpss and comss
{
    // Checks lane 0 against compareFloat, and that lanes 1..3 are passed through from A.
    void testComparison(FPComparison comparison)(float4 A, float4 B)
    {
        float4 result = cmpss!comparison(A, B);
        int4 iresult = cast(int4)result;
        int expected = compareFloat!float(comparison, A.array[0], B.array[0]) ? -1 : 0;
        assert(iresult.array[0] == expected);
        assert(result.array[1] == A.array[1]);
        assert(result.array[2] == A.array[2]);
        assert(result.array[3] == A.array[3]);

        // check comss
        int comResult = comss!comparison(A, B);
        assert( (expected != 0) == (comResult != 0) );
    }

    // Check all comparison type is working
    float4 A = [1, 3, 5, 6];
    float4 B = [2, 3, 4, 5];
    float4 C = [float.nan, 3, 4, 5];

    testComparison!(FPComparison.oeq)(A, B);
    testComparison!(FPComparison.oeq)(A, C);
    testComparison!(FPComparison.ogt)(A, B);
    testComparison!(FPComparison.ogt)(A, C);
    testComparison!(FPComparison.oge)(A, B);
    testComparison!(FPComparison.oge)(A, C);
    testComparison!(FPComparison.olt)(A, B);
    testComparison!(FPComparison.olt)(A, C);
    testComparison!(FPComparison.ole)(A, B);
    testComparison!(FPComparison.ole)(A, C);
    testComparison!(FPComparison.one)(A, B);
    testComparison!(FPComparison.one)(A, C);
    testComparison!(FPComparison.ord)(A, B);
    testComparison!(FPComparison.ord)(A, C);
    testComparison!(FPComparison.ueq)(A, B);
    testComparison!(FPComparison.ueq)(A, C);
    testComparison!(FPComparison.ugt)(A, B);
    testComparison!(FPComparison.ugt)(A, C);
    testComparison!(FPComparison.uge)(A, B);
    testComparison!(FPComparison.uge)(A, C);
    testComparison!(FPComparison.ult)(A, B);
    testComparison!(FPComparison.ult)(A, C);
    testComparison!(FPComparison.ule)(A, B);
    testComparison!(FPComparison.ule)(A, C);
    testComparison!(FPComparison.une)(A, B);
    testComparison!(FPComparison.une)(A, C);
    testComparison!(FPComparison.uno)(A, B);
    testComparison!(FPComparison.uno)(A, C);
}
unittest // cmpsd and comsd
{
    // Checks lane 0 against compareFloat, and that lane 1 is passed through from A.
    void testComparison(FPComparison comparison)(double2 A, double2 B)
    {
        double2 result = cmpsd!comparison(A, B);
        long2 iresult = cast(long2)result;
        long expected = compareFloat!double(comparison, A.array[0], B.array[0]) ? -1 : 0;
        assert(iresult.array[0] == expected);
        assert(result.array[1] == A.array[1]);

        // check comsd
        int comResult = comsd!comparison(A, B);
        assert( (expected != 0) == (comResult != 0) );
    }

    // Check all comparison type is working
    double2 A = [1, 3];
    double2 B = [2, 4];
    double2 C = [double.nan, 5];

    testComparison!(FPComparison.oeq)(A, B);
    testComparison!(FPComparison.oeq)(A, C);
    testComparison!(FPComparison.ogt)(A, B);
    testComparison!(FPComparison.ogt)(A, C);
    testComparison!(FPComparison.oge)(A, B);
    testComparison!(FPComparison.oge)(A, C);
    testComparison!(FPComparison.olt)(A, B);
    testComparison!(FPComparison.olt)(A, C);
    testComparison!(FPComparison.ole)(A, B);
    testComparison!(FPComparison.ole)(A, C);
    testComparison!(FPComparison.one)(A, B);
    testComparison!(FPComparison.one)(A, C);
    testComparison!(FPComparison.ord)(A, B);
    testComparison!(FPComparison.ord)(A, C);
    testComparison!(FPComparison.ueq)(A, B);
    testComparison!(FPComparison.ueq)(A, C);
    testComparison!(FPComparison.ugt)(A, B);
    testComparison!(FPComparison.ugt)(A, C);
    testComparison!(FPComparison.uge)(A, B);
    testComparison!(FPComparison.uge)(A, C);
    testComparison!(FPComparison.ult)(A, B);
    testComparison!(FPComparison.ult)(A, C);
    testComparison!(FPComparison.ule)(A, B);
    testComparison!(FPComparison.ule)(A, C);
    testComparison!(FPComparison.une)(A, B);
    testComparison!(FPComparison.une)(A, C);
    testComparison!(FPComparison.uno)(A, B);
    testComparison!(FPComparison.uno)(A, C);
}

//
//  </FLOATING-POINT COMPARISONS>
//


// Returns the low 64 bits of `a` as an __m64.
__m64 to_m64(__m128i a) pure @trusted
{
    long2 la = cast(long2)a;
    long1 r;
    r.ptr[0] = la.array[0];
    return r;
}

// Widens `a` to an __m128i: `a` goes into the low 64 bits, the high 64 bits are zero.
__m128i to_m128i(__m64 a) pure @trusted
{
    long2 r = [0, 0];
    r.ptr[0] = a.array[0];
    return cast(__m128i)r;
}

// SOME NEON INTRINSICS
// Emulating some x86 intrinsics needs access to a range of ARM intrinsics.
// Not in the public API but the simde project expose it all for the user to use.
// MAYDO: create a new neon.d module, for internal use only.
// MAYDO: port them to ARM32 so that ARM32 can be as fast as ARM64.
static if (LDC_with_ARM64)
{
    // VERY USEFUL LINK
    // https://github.com/ldc-developers/llvm-project/blob/ldc-release/11.x/llvm/include/llvm/IR/IntrinsicsAArch64.td

    pragma(LDC_intrinsic, "llvm.aarch64.neon.addp.v8i8")
        byte8 vpadd_u8(byte8 a, byte8 b) pure @safe;

    byte8 vand_u8(byte8 a, byte8 b) pure @safe
    {
        return a & b;
    }

    int4 vcombine_s32(int2 lo, int2 hi) pure @trusted
    {
        int4 r;
        r.ptr[0] = lo.array[0];
        r.ptr[1] = lo.array[1];
        r.ptr[2] = hi.array[0];
        r.ptr[3] = hi.array[1];
        return r;
    }

    byte16 vcombine_s8(byte8 lo, byte8 hi) pure @trusted
    {
        byte16 r;
        r.ptr[0]  = lo.array[0];
        r.ptr[1]  = lo.array[1];
        r.ptr[2]  = lo.array[2];
        r.ptr[3]  = lo.array[3];
        r.ptr[4]  = lo.array[4];
        r.ptr[5]  = lo.array[5];
        r.ptr[6]  = lo.array[6];
        r.ptr[7]  = lo.array[7];
        r.ptr[8]  = hi.array[0];
        r.ptr[9]  = hi.array[1];
        r.ptr[10] = hi.array[2];
        r.ptr[11] = hi.array[3];
        r.ptr[12] = hi.array[4];
        r.ptr[13] = hi.array[5];
        r.ptr[14] = hi.array[6];
        r.ptr[15] = hi.array[7];
        return r;
    }

    pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtms.v4i32.v4f32")
        int4 vcvtmq_s32_f32(float4 a) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtns.v4i32.v4f32")
        int4 vcvtnq_s32_f32(float4 a) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtps.v4i32.v4f32")
        int4 vcvtpq_s32_f32(float4 a) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtzs.v4i32.v4f32")
        int4 vcvtzq_s32_f32(float4 a) pure @safe;

    short4 vget_high_s16(short8 a) pure @trusted
    {
        short4 r;
        r.ptr[0] = a.array[4];
        r.ptr[1] = a.array[5];
        r.ptr[2] = a.array[6];
        r.ptr[3] = a.array[7];
        return r;
    }

    int2 vget_high_s32(int4 a) pure @trusted
    {
        int2 r;
        r.ptr[0] = a.array[2];
        r.ptr[1] = a.array[3];
        return r;
    }

    byte8 vget_high_u8(byte16 a) pure @trusted
    {
        byte8 r;
        r.ptr[0] = a.array[8];
        r.ptr[1] = a.array[9];
        r.ptr[2] = a.array[10];
        r.ptr[3] = a.array[11];
        r.ptr[4] = a.array[12];
        r.ptr[5] = a.array[13];
        r.ptr[6] = a.array[14];
        r.ptr[7] = a.array[15];
        return r;
    }

    short4 vget_low_s16(short8 a) pure @trusted
    {
        short4 r;
        r.ptr[0] = a.array[0];
        r.ptr[1] = a.array[1];
        r.ptr[2] = a.array[2];
        r.ptr[3] = a.array[3];
        return r;
    }

    int2 vget_low_s32(int4 a) pure @trusted
    {
        int2 r;
        r.ptr[0] = a.array[0];
        r.ptr[1] = a.array[1];
        return r;
    }

    byte8 vget_low_u8(byte16 a) pure @trusted
    {
        byte8 r;
        r.ptr[0] = a.array[0];
        r.ptr[1] = a.array[1];
        r.ptr[2] = a.array[2];
        r.ptr[3] = a.array[3];
        r.ptr[4] = a.array[4];
        r.ptr[5] = a.array[5];
        r.ptr[6] = a.array[6];
        r.ptr[7] = a.array[7];
        return r;
    }

    pragma(LDC_intrinsic, "llvm.aarch64.neon.smax.v8i16")
        short8 vmaxq_s16(short8 a, short8 b) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.smin.v8i16")
        short8 vminq_s16(short8 a, short8 b) pure @safe;

    int4 vmull_s16(short4 a, short4 b) pure @trusted
    {
        int4 r;
        r.ptr[0] = a.array[0] * b.array[0];
        r.ptr[1] = a.array[1] * b.array[1];
        r.ptr[2] = a.array[2] * b.array[2];
        r.ptr[3] = a.array[3] * b.array[3];
        return r;
    }

    static if(__VERSION__ >= 2088) // LDC
1.18 start using LLVM9 who changes the name of the builtin 1268 { 1269 pragma(LDC_intrinsic, "llvm.aarch64.neon.faddp.v4f32") 1270 float4 vpaddq_f32(float4 a, float4 b) pure @safe; 1271 } 1272 else 1273 { 1274 pragma(LDC_intrinsic, "llvm.aarch64.neon.addp.v4f32") 1275 float4 vpaddq_f32(float4 a, float4 b) pure @safe; 1276 } 1277 1278 pragma(LDC_intrinsic, "llvm.aarch64.neon.addp.v2i32") 1279 int2 vpadd_s32(int2 a, int2 b) pure @safe; 1280 1281 pragma(LDC_intrinsic, "llvm.aarch64.neon.addp.v16i8") 1282 byte16 vpaddq_s8(byte16 a, byte16 b) pure @safe; 1283 1284 pragma(LDC_intrinsic, "llvm.aarch64.neon.sqxtn.v8i8") 1285 byte8 vqmovn_s16(short8 a) pure @safe; 1286 1287 pragma(LDC_intrinsic, "llvm.aarch64.neon.sqxtun.v8i8") 1288 byte8 vqmovun_s16(short8 a) pure @safe; 1289 1290 pragma(LDC_intrinsic, "llvm.aarch64.neon.urhadd.v16i8") 1291 byte16 vrhadd_u8(byte16 a, byte16 b) pure @safe; 1292 1293 pragma(LDC_intrinsic, "llvm.aarch64.neon.urhadd.v8i16") 1294 short8 vrhadd_u16(short8 a, short8 b) pure @safe; 1295 1296 byte8 vshr_u8(byte8 a, byte8 b) pure @safe 1297 { 1298 return a >>> b; 1299 } 1300 } 1301