/**
* Internal stuff only, do not import.
*
* Copyright: Copyright Guillaume Piolat 2016-2020, Stefanos Baziotis 2019.
* License:   $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
*/
module inteli.internals;

import inteli.types;

// The only math functions needed for intel-intrinsics
public import core.math: sqrt; // since it's an intrinsic

package:
nothrow:
@nogc:


//
// Compile-time capability flags for GDC.
// Exactly one branch below defines the whole GDC_with_* family, so the rest of
// the library can use them in `static if` whatever the compiler is.
//
version(GNU)
{
    version (X86)
    {
        // For 32-bit x86, disable vector extensions with GDC.
        // It just doesn't work well.
        enum GDC_with_x86 = true;
        enum GDC_with_MMX = false;
        enum GDC_with_SSE = false;
        enum GDC_with_SSE2 = false;
        enum GDC_with_SSE3 = false;
    }
    else version (X86_64)
    {
        // GDC support uses extended inline assembly:
        //   https://gcc.gnu.org/onlinedocs/gcc/Extended-Asm.html        (general information and hints)
        //   https://gcc.gnu.org/onlinedocs/gcc/Simple-Constraints.html  (binding variables to registers)
        //   https://gcc.gnu.org/onlinedocs/gcc/Machine-Constraints.html (x86 specific register short names)

        public import core.simd;

        // NOTE: These intrinsics are not available in every i386 and x86_64 CPU.
        // For more info: https://gcc.gnu.org/onlinedocs/gcc-4.9.2/gcc/X86-Built-in-Functions.html
        public import gcc.builtins;

        enum GDC_with_x86 = true;
        enum GDC_with_MMX = true;   // We don't have a way to detect that at CT, but we assume it's there
        enum GDC_with_SSE = true;   // We don't have a way to detect that at CT, but we assume it's there
        enum GDC_with_SSE2 = true;  // We don't have a way to detect that at CT, but we assume it's there
        enum GDC_with_SSE3 = false; // TODO: we don't have a way to detect that at CT
    }
    else
    {
        enum GDC_with_x86 = false;
        enum GDC_with_MMX = false;
        enum GDC_with_SSE = false;
        enum GDC_with_SSE2 = false;
        enum GDC_with_SSE3 = false;
    }
}
else
{
    enum GDC_with_x86 = false;
    enum GDC_with_MMX = false;
    enum GDC_with_SSE = false;
    enum GDC_with_SSE2 = false;
    enum GDC_with_SSE3 = false;
}

//
// Compile-time capability flags and inline-IR aliases for LDC.
//
version(LDC)
{
    public import core.simd;
    public import ldc.simd;
    public import ldc.intrinsics;
    public import ldc.llvmasm: __asm;

    // Since LDC 1.13, using the new ldc.llvmasm.__ir variants instead of inlineIR
    static if (__VERSION__ >= 2083)
    {
        import ldc.llvmasm;
        alias LDCInlineIR = __ir_pure;

        // A version of inline IR with prefix/suffix didn't exist before LDC 1.13
        alias LDCInlineIREx = __irEx_pure;
    }
    else
    {
        alias LDCInlineIR = inlineIR;
    }

    version(ARM)
    {
        public import ldc.gccbuiltins_arm;
        enum LDC_with_ARM32 = true;
        enum LDC_with_ARM64 = false;
        enum LDC_with_SSE1 = false;
        enum LDC_with_SSE2 = false;
        enum LDC_with_SSE3 = false;
    }
    else version(AArch64)
    {
        enum LDC_with_ARM32 = false;
        enum LDC_with_ARM64 = true;
        enum LDC_with_SSE1 = false;
        enum LDC_with_SSE2 = false;
        enum LDC_with_SSE3 = false;
    }
    else
    {
        // Neither ARM nor AArch64: x86 builtins are imported and the SSE level
        // is queried from the target features.
        public import ldc.gccbuiltins_x86;
        enum LDC_with_ARM32 = false;
        enum LDC_with_ARM64 = false;
        enum LDC_with_SSE1 = __traits(targetHasFeature, "sse");
        enum LDC_with_SSE2 = __traits(targetHasFeature, "sse2");
        enum LDC_with_SSE3 = __traits(targetHasFeature, "sse3");
    }
}
else
{
    enum LDC_with_ARM32 = false;
    enum LDC_with_ARM64 = false;
    enum LDC_with_SSE1 = false;
    enum LDC_with_SSE2 = false;
    enum LDC_with_SSE3 = false;
}

// True when compiling with LDC for either ARM flavour.
enum LDC_with_ARM = LDC_with_ARM32 | LDC_with_ARM64;

//
// Compile-time capability flags for DMD.
//
version(DigitalMars)
{
    version(D_InlineAsm_X86)
        enum DMD_with_asm = true;
    else version(D_InlineAsm_X86_64)
        enum DMD_with_asm = true;
    else
        enum DMD_with_asm = false;

    version(D_InlineAsm_X86)
        enum DMD_with_32bit_asm = DMD_with_asm; // sometimes you want a 32-bit DMD only solution
    else
        enum DMD_with_32bit_asm = false;

    version (D_SIMD)
        enum DMD_with_DSIMD = !SSESizedVectorsAreEmulated;
    else
        enum DMD_with_DSIMD = false;
}
else
{
    enum DMD_with_asm = false;
    enum DMD_with_32bit_asm = false;
    enum DMD_with_DSIMD = false;
}

static if (LDC_with_ARM32)
{
    /// Reads the floating-point control word through `__builtin_arm_get_fpscr`.
    package uint arm_get_fpcr() nothrow @nogc @trusted
    {
        return __builtin_arm_get_fpscr();
    }

    /// Writes `cw` as the floating-point control word through `__builtin_arm_set_fpscr`.
    package void arm_set_fpcr(uint cw) nothrow @nogc @trusted
    {
        __builtin_arm_set_fpscr(cw);
    }
}

static if (LDC_with_ARM64)
{
    pragma(LDC_intrinsic, "llvm.aarch64.get.fpcr")
        long __builtin_aarch64_get_fpcr() pure nothrow @nogc @safe;

    /// Reads FPCR with an `mrs` instruction.
    package uint arm_get_fpcr() pure nothrow @nogc @trusted
    {
        // LLVM intrinsic "llvm.aarch64.get.fpcr" seems buggy and doesn't return FPCR
        return __asm!uint("mrs $0, fpcr", "=r");
    }

    /// Writes `cw` into FPCR with an `msr` instruction.
    package void arm_set_fpcr(uint cw) nothrow @nogc @trusted
    {
        // Note: there doesn't seem to be an intrinsic in LLVM to set FPCR.
        // x2 is used as a scratch register: it is spilled to `save_x2` first
        // and reloaded at the end of the sequence.
        long save_x2;
        __asm!void("str x2, $1 \n" ~
                   "ldr w2, $0 \n" ~
                   "msr fpcr, x2 \n" ~
                   "ldr x2, $1 "   , "m,m", cw, &save_x2);
    }
}


// For internal use only, since public API deals with a x86 semantic emulation
enum uint _MM_ROUND_NEAREST_ARM     = 0x00000000;
enum uint _MM_ROUND_DOWN_ARM        = 0x00800000;
enum uint _MM_ROUND_UP_ARM          = 0x00400000;
enum uint _MM_ROUND_TOWARD_ZERO_ARM = 0x00C00000;
enum uint _MM_ROUND_MASK_ARM        = 0x00C00000;
enum uint _MM_FLUSH_ZERO_MASK_ARM   = 0x01000000;


//
//  <ROUNDING>
//
//  Why is that there? For DMD, we cannot use rint because _MM_SET_ROUNDING_MODE
//  doesn't change the FPU rounding mode, and isn't expected to do so.
//  So we devised these rounding functions to help keep rounding consistent between
//  LDC and DMD. It's important that DMD uses what is in MXCSR to round.
//
//  Note: There is no MXCSR in ARM. But there is fpscr, which implements similar
//  functionality.
//  https://developer.arm.com/documentation/dui0068/b/vector-floating-point-programming/vfp-system-registers/fpscr--the-floating-point-status-and-control-register
//  We use fpscr since it's thread-local, so we can emulate those x86 conversions, albeit slowly.

/// Converts `value` to a 32-bit integer using the rounding mode currently set
/// in MXCSR (x86) or in the ARM floating-point control register.
int convertFloatToInt32UsingMXCSR(float value) @trusted
{
    int result;
    version(GNU)
    {
        asm pure nothrow @nogc @trusted
        {
            "cvtss2si %1, %0\n": "=r"(result) : "x" (value);
        }
    }
    else static if (LDC_with_ARM32)
    {
        // TODO: this is a bug, it won't preserve registers when optimized
        result = __asm!int(`vldr s2, $1
                            vcvtr.s32.f32 s2, s2
                            vmov $0, s2`, "=r,m", value);
    }
    else static if (LDC_with_ARM64)
    {
        // Get current rounding mode.
        uint fpscr = arm_get_fpcr();

        // Dispatch to the conversion matching the rounding bits.
        switch(fpscr & _MM_ROUND_MASK_ARM)
        {
            default:
            case _MM_ROUND_NEAREST_ARM:     result = vcvtns_s32_f32(value); break;
            case _MM_ROUND_DOWN_ARM:        result = vcvtms_s32_f32(value); break;
            case _MM_ROUND_UP_ARM:          result = vcvtps_s32_f32(value); break;
            case _MM_ROUND_TOWARD_ZERO_ARM: result = vcvts_s32_f32(value);  break;
        }
    }
    else
    {
        asm pure nothrow @nogc @trusted
        {
            cvtss2si EAX, value;
            mov result, EAX;
        }
    }
    return result;
}

/// Converts `value` to a 32-bit integer using the rounding mode currently set
/// in MXCSR (x86) or in the ARM floating-point control register.
int convertDoubleToInt32UsingMXCSR(double value) @trusted
{
    int result;
    version(GNU)
    {
        asm pure nothrow @nogc @trusted
        {
            "cvtsd2si %1, %0\n": "=r"(result) : "x" (value);
        }
    }
    else static if (LDC_with_ARM32)
    {
        // TODO: bug, doesn't preserve registers
        result = __asm!int(`vldr d2, $1
                            vcvtr.s32.f64 s2, d2
                            vmov $0, s2`, "=r,m", value);
    }
    else static if (LDC_with_ARM64)
    {
        // Get current rounding mode.
        uint fpscr = arm_get_fpcr();

        // Dispatch to the conversion matching the rounding bits.
        switch(fpscr & _MM_ROUND_MASK_ARM)
        {
            default:
            case _MM_ROUND_NEAREST_ARM:     result = vcvtns_s32_f64(value); break;
            case _MM_ROUND_DOWN_ARM:        result = vcvtms_s32_f64(value); break;
            case _MM_ROUND_UP_ARM:          result = vcvtps_s32_f64(value); break;
            case _MM_ROUND_TOWARD_ZERO_ARM: result = vcvts_s32_f64(value);  break;
        }
    }
    else
    {
        asm pure nothrow @nogc @trusted
        {
            cvtsd2si EAX, value;
            mov result, EAX;
        }
    }
    return result;
}

/// Converts `value` to a 64-bit integer using the rounding mode currently set
/// in MXCSR (x86) or in the ARM floating-point control register.
long convertFloatToInt64UsingMXCSR(float value) @trusted
{
    static if (LDC_with_ARM32)
    {
        // We have to resort to libc since 32-bit ARM
        // doesn't seem to have 64-bit registers.

        uint fpscr = arm_get_fpcr(); // Get current rounding mode.

        // Note: converting to double precision else rounding could be different for large integers
        double asDouble = value;

        switch(fpscr & _MM_ROUND_MASK_ARM)
        {
            default:
            // "Nearest" means ties-to-even. `llvm_round` rounds halfway cases away
            // from zero, so use `llvm_nearbyint`, which rounds with the current
            // mode (nearest-even whenever this case is taken).
            case _MM_ROUND_NEAREST_ARM:     return cast(long)(llvm_nearbyint(asDouble));
            case _MM_ROUND_DOWN_ARM:        return cast(long)(llvm_floor(asDouble));
            case _MM_ROUND_UP_ARM:          return cast(long)(llvm_ceil(asDouble));
            case _MM_ROUND_TOWARD_ZERO_ARM: return cast(long)(asDouble);
        }
    }
    else static if (LDC_with_ARM64)
    {
        uint fpscr = arm_get_fpcr();

        switch(fpscr & _MM_ROUND_MASK_ARM)
        {
            default:
            case _MM_ROUND_NEAREST_ARM:     return vcvtns_s64_f32(value);
            case _MM_ROUND_DOWN_ARM:        return vcvtms_s64_f32(value);
            case _MM_ROUND_UP_ARM:          return vcvtps_s64_f32(value);
            case _MM_ROUND_TOWARD_ZERO_ARM: return vcvts_s64_f32(value);
        }
    }
    // 64-bit can use an SSE instruction
    else version(D_InlineAsm_X86_64)
    {
        long result;
        version(LDC) // work-around for " Data definition directives inside inline asm are not supported yet."
        {
            asm pure nothrow @nogc @trusted
            {
                movss XMM0, value;
                cvtss2si RAX, XMM0;
                mov result, RAX;
            }
        }
        else
        {
            asm pure nothrow @nogc @trusted
            {
                movss XMM0, value;
                db 0xf3; db 0x48; db 0x0f; db 0x2d; db 0xc0; // cvtss2si RAX, XMM0 (DMD refuses to emit)
                mov result, RAX;
            }
        }
        return result;
    }
    else version(D_InlineAsm_X86)
    {
        // In the case of 32-bit x86 there is no SSE2 way to convert FP to 64-bit int
        // This leads to an unfortunate FPU sequence in every C++ compiler.
        // See: https://godbolt.org/z/vZym77

        // Get current MXCSR rounding
        uint sseRounding;
        ushort savedFPUCW;
        ushort newFPUCW;
        long result;
        asm pure nothrow @nogc @trusted
        {
            stmxcsr sseRounding;
            fld value;
            fnstcw savedFPUCW;
            mov AX, savedFPUCW;
            and AX, 0xf3ff;          // clear FPU rounding bits
            movzx ECX, word ptr sseRounding;
            and ECX, 0x6000;         // only keep SSE rounding bits
            shr ECX, 3;
            or AX, CX;               // make a new control word for FPU with SSE bits
            mov newFPUCW, AX;
            fldcw newFPUCW;
            fistp qword ptr result;  // convert, respecting MXCSR (but not other control word things)
            fldcw savedFPUCW;
        }
        return result;
    }
    else static if (GDC_with_x86)
    {
        version(X86_64) // 64-bit can just use the right instruction
        {
            static assert(GDC_with_SSE);
            __m128 A;
            A.ptr[0] = value;
            return __builtin_ia32_cvtss2si64 (A);
        }
        else version(X86) // 32-bit
        {
            // This is untested!
            // NOTE(review): `value` is bound with "f" here but with "t" in
            // convertDoubleToInt64UsingMXCSR; confirm which constraint is intended.
            uint sseRounding;
            ushort savedFPUCW;
            ushort newFPUCW;
            long result;
            asm pure nothrow @nogc @trusted
            {
                "stmxcsr %1;\n" ~
                "fld %2;\n" ~
                "fnstcw %3;\n" ~
                "movw %3, %%ax;\n" ~
                "andw $0xf3ff, %%ax;\n" ~
                "movzwl %1, %%ecx;\n" ~
                "andl $0x6000, %%ecx;\n" ~
                "shrl $3, %%ecx;\n" ~
                "orw %%cx, %%ax\n" ~
                "movw %%ax, %4;\n" ~
                "fldcw %4;\n" ~
                "fistpll %0;\n" ~
                "fldcw %3;\n"
                  : "=m"(result)    // %0
                  : "m" (sseRounding),
                    "f" (value),
                    "m" (savedFPUCW),
                    "m" (newFPUCW)
                  : "eax", "ecx", "st";
            }
            return result;
        }
        else
            static assert(false);
    }
    else
        static assert(false);
}


/// Converts `value` to a 64-bit integer using the rounding mode currently set
/// in MXCSR (x86) or in the ARM floating-point control register.
long convertDoubleToInt64UsingMXCSR(double value) @trusted
{
    static if (LDC_with_ARM32)
    {
        // We have to resort to libc since 32-bit ARM
        // doesn't seem to have 64-bit registers.
        uint fpscr = arm_get_fpcr(); // Get current rounding mode.
        switch(fpscr & _MM_ROUND_MASK_ARM)
        {
            default:
            // "Nearest" means ties-to-even. `llvm_round` rounds halfway cases away
            // from zero, so use `llvm_nearbyint`, which rounds with the current
            // mode (nearest-even whenever this case is taken).
            case _MM_ROUND_NEAREST_ARM:     return cast(long)(llvm_nearbyint(value));
            case _MM_ROUND_DOWN_ARM:        return cast(long)(llvm_floor(value));
            case _MM_ROUND_UP_ARM:          return cast(long)(llvm_ceil(value));
            case _MM_ROUND_TOWARD_ZERO_ARM: return cast(long)(value);
        }
    }
    else static if (LDC_with_ARM64)
    {
        // Get current rounding mode.
        uint fpscr = arm_get_fpcr();

        switch(fpscr & _MM_ROUND_MASK_ARM)
        {
            default:
            case _MM_ROUND_NEAREST_ARM:     return vcvtns_s64_f64(value);
            case _MM_ROUND_DOWN_ARM:        return vcvtms_s64_f64(value);
            case _MM_ROUND_UP_ARM:          return vcvtps_s64_f64(value);
            case _MM_ROUND_TOWARD_ZERO_ARM: return vcvts_s64_f64(value);
        }
    }
    // 64-bit can use an SSE instruction
    else version(D_InlineAsm_X86_64)
    {
        long result;
        version(LDC) // work-around for "Data definition directives inside inline asm are not supported yet."
        {
            asm pure nothrow @nogc @trusted
            {
                movsd XMM0, value;
                cvtsd2si RAX, XMM0;
                mov result, RAX;
            }
        }
        else
        {
            asm pure nothrow @nogc @trusted
            {
                movsd XMM0, value;
                db 0xf2; db 0x48; db 0x0f; db 0x2d; db 0xc0; // cvtsd2si RAX, XMM0 (DMD refuses to emit)
                mov result, RAX;
            }
        }
        return result;
    }
    else version(D_InlineAsm_X86)
    {
        // In the case of 32-bit x86 there is no SSE2 way to convert FP to 64-bit int
        // This leads to an unfortunate FPU sequence in every C++ compiler.
        // See: https://godbolt.org/z/vZym77

        // Get current MXCSR rounding
        uint sseRounding;
        ushort savedFPUCW;
        ushort newFPUCW;
        long result;
        asm pure nothrow @nogc @trusted
        {
            stmxcsr sseRounding;
            fld value;
            fnstcw savedFPUCW;
            mov AX, savedFPUCW;
            and AX, 0xf3ff;          // clear FPU rounding bits
            movzx ECX, word ptr sseRounding;
            and ECX, 0x6000;         // only keep SSE rounding bits
            shr ECX, 3;
            or AX, CX;               // make a new control word for FPU with SSE bits
            mov newFPUCW, AX;
            fldcw newFPUCW;
            fistp result;
            fldcw savedFPUCW;
        }
        return result;
    }
    else static if (GDC_with_x86)
    {
        version(X86_64)
        {
            static assert(GDC_with_SSE2);
            __m128d A;
            A.ptr[0] = value;
            return __builtin_ia32_cvtsd2si64 (A);
        }
        else
        {
            // This is untested!
            // NOTE(review): `value` is bound with "t" here but with "f" in
            // convertFloatToInt64UsingMXCSR; confirm which constraint is intended.
            uint sseRounding;
            ushort savedFPUCW;
            ushort newFPUCW;
            long result;
            asm pure nothrow @nogc @trusted
            {
                "stmxcsr %1;\n" ~
                "fld %2;\n" ~
                "fnstcw %3;\n" ~
                "movw %3, %%ax;\n" ~
                "andw $0xf3ff, %%ax;\n" ~
                "movzwl %1, %%ecx;\n" ~
                "andl $0x6000, %%ecx;\n" ~
                "shrl $3, %%ecx;\n" ~
                "orw %%cx, %%ax\n" ~
                "movw %%ax, %4;\n" ~
                "fldcw %4;\n" ~
                "fistpll %0;\n" ~
                "fldcw %3;\n"
                  : "=m"(result)    // %0
                  : "m" (sseRounding),
                    "t" (value),
                    "m" (savedFPUCW),
                    "m" (newFPUCW)
                  : "eax", "ecx", "st";
            }
            return result;
        }
    }
    else
        static assert(false);
}

//
//  </ROUNDING>
//


// using the Intel terminology here

/// Clamps a signed 16-bit value to the range [-128, 127].
byte saturateSignedWordToSignedByte(short value) pure @safe
{
    if (value > 127) value = 127;
    if (value < -128) value = -128;
    return cast(byte) value;
}

/// Clamps a signed 16-bit value to the range [0, 255].
ubyte saturateSignedWordToUnsignedByte(short value) pure @safe
{
    if (value > 255) value = 255;
    if (value < 0) value = 0;
    return cast(ubyte) value;
}

/// Clamps a signed 32-bit value to the range [-32768, 32767].
short saturateSignedIntToSignedShort(int value) pure @safe
{
    if (value > 32767) value = 32767;
    if (value < -32768) value = -32768;
    return cast(short) value;
}

/// Clamps a signed 32-bit value to the range [0, 65535].
ushort saturateSignedIntToUnsignedShort(int value) pure @safe
{
    if (value > 65535) value = 65535;
    if (value < 0) value = 0;
    return cast(ushort) value;
}

unittest // test saturate operations
{
    assert( saturateSignedWordToSignedByte(32000) == 127);
    assert( saturateSignedWordToUnsignedByte(32000) == 255);
    assert( saturateSignedWordToSignedByte(-4000) == -128);
    assert( saturateSignedWordToUnsignedByte(-4000) == 0);
    assert( saturateSignedIntToSignedShort(32768) == 32767);
    assert( saturateSignedIntToUnsignedShort(32768) == 32768);
    assert( saturateSignedIntToSignedShort(-32769) == -32768);
    assert( saturateSignedIntToUnsignedShort(-32769) == 0);
}

version(unittest)
{
    // This is just for debugging tests
    import core.stdc.stdio: printf;

    // printing vectors for implementation
    // Note: you can override `pure` within a `debug` clause

    /// Prints the single 64-bit lane of `v`.
    void _mm_print_pi64(__m64 v) @trusted
    {
        long1 vl = cast(long1)v;
        printf("%lld\n", vl.array[0]);
    }

    /// Prints `v` as two 32-bit integers.
    void _mm_print_pi32(__m64 v) @trusted
    {
        int[2] C = (cast(int2)v).array;
        printf("%d %d\n", C[0], C[1]);
    }

    /// Prints `v` as four 16-bit integers.
    void _mm_print_pi16(__m64 v) @trusted
    {
        short[4] C = (cast(short4)v).array;
        printf("%d %d %d %d\n", C[0], C[1], C[2], C[3]);
    }

    /// Prints `v` as eight 8-bit integers.
    void _mm_print_pi8(__m64 v) @trusted
    {
        byte[8] C = (cast(byte8)v).array;
        printf("%d %d %d %d %d %d %d %d\n",
        C[0], C[1], C[2], C[3], C[4], C[5], C[6], C[7]);
    }

    /// Prints `v` as two 64-bit integers.
    void _mm_print_epi64(__m128i v) @trusted
    {
        long2 vl = cast(long2)v;
        printf("%lld %lld\n", vl.array[0], vl.array[1]);
    }

    /// Prints `v` as four 32-bit integers.
    void _mm_print_epi32(__m128i v) @trusted
    {
        printf("%d %d %d %d\n",
              v.array[0], v.array[1], v.array[2], v.array[3]);
    }

    /// Prints `v` as eight 16-bit integers.
    void _mm_print_epi16(__m128i v) @trusted
    {
        short[8] C = (cast(short8)v).array;
        printf("%d %d %d %d %d %d %d %d\n",
        C[0], C[1], C[2], C[3], C[4], C[5], C[6], C[7]);
    }

    /// Prints `v` as sixteen 8-bit integers.
    void _mm_print_epi8(__m128i v) @trusted
    {
        byte[16] C = (cast(byte16)v).array;
        printf("%d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d\n",
        C[0], C[1], C[2], C[3], C[4], C[5], C[6], C[7], C[8], C[9], C[10], C[11], C[12], C[13], C[14], C[15]);
    }

    /// Prints `v` as four floats.
    void _mm_print_ps(__m128 v) @trusted
    {
        float[4] C = (cast(float4)v).array;
        printf("%f %f %f %f\n", C[0], C[1], C[2], C[3]);
    }

    /// Prints `v` as two doubles.
    void _mm_print_pd(__m128d v) @trusted
    {
        double[2] C = (cast(double2)v).array;
        printf("%f %f\n", C[0], C[1]);
    }
}


//
//  <FLOATING-POINT COMPARISONS>
//
// Note: `ldc.simd` cannot express all nuances of FP comparisons, so we
//       need different IR generation.

/// Floating-point comparison predicates. The member names are the same
/// spellings stored in `FPComparisonToString`.
enum FPComparison
{
    oeq,   // ordered and equal
    ogt,   // ordered and greater than
    oge,   // ordered and greater than or equal
    olt,   // ordered and less than
    ole,   // ordered and less than or equal
    one,   // ordered and not equal
    ord,   // ordered (no nans)
    ueq,   // unordered or equal
    ugt,   // unordered or greater than ("nle")
    uge,   // unordered or greater than or equal ("nlt")
    ult,   // unordered or less than ("nge")
    ule,   // unordered or less than or equal ("ngt")
    une,   // unordered or not equal ("neq")
    uno,   // unordered (either nans)
}

// Predicate spelling for each FPComparison member, indexed by the member's
// value; must be kept in the same order as the enum above.
private static immutable string[FPComparison.max+1] FPComparisonToString =
[
    "oeq",
    "ogt",
    "oge",
    "olt",
    "ole",
    "one",
    "ord",
    "ueq",
    "ugt",
    "uge",
    "ult",
    "ule",
    "une",
    "uno",
];

// Individual float comparison: yields true or false, which callers store as -1 or 0.
// Useful for DMD and testing
// Evaluates one scalar comparison and returns the outcome as a `bool`;
// the vector helpers below turn `true` into -1 and `false` into 0.
private bool compareFloat(T)(FPComparison comparison, T a, T b) pure @safe
{
    immutable bool anyNaN = isnan(a) || isnan(b);
    final switch(comparison) with(FPComparison)
    {
        // Ordered predicates
        case oeq: return a == b;
        case ogt: return a > b;
        case oge: return a >= b;
        case olt: return a < b;
        case ole: return a <= b;
        case one: return !anyNaN && (a != b); // NaN with != always yields true
        case ord: return !anyNaN;

        // Unordered predicates
        case ueq: return anyNaN || (a == b);
        case ugt: return anyNaN || (a > b);
        case uge: return anyNaN || (a >= b);
        case ult: return anyNaN || (a < b);
        case ule: return anyNaN || (a <= b);
        case une: return (a != b); // NaN with != always yields true
        case uno: return anyNaN;
    }
}

version(LDC)
{
    /// Packed float comparison: each of the 4 lanes is -1 if the predicate
    /// holds and 0 otherwise (sign-extended `fcmp` result).
    package int4 cmpps(FPComparison comparison)(float4 a, float4 b) pure @safe
    {
        enum ir = `
            %cmp = fcmp `~ FPComparisonToString[comparison] ~` <4 x float> %0, %1
            %r = sext <4 x i1> %cmp to <4 x i32>
            ret <4 x i32> %r`;

        return LDCInlineIR!(ir, int4, float4, float4)(a, b);
    }

    /// Packed double comparison: each of the 2 lanes is -1 if the predicate
    /// holds and 0 otherwise (sign-extended `fcmp` result).
    package long2 cmppd(FPComparison comparison)(double2 a, double2 b) pure @safe
    {
        enum ir = `
            %cmp = fcmp `~ FPComparisonToString[comparison] ~` <2 x double> %0, %1
            %r = sext <2 x i1> %cmp to <2 x i64>
            ret <2 x i64> %r`;

        return LDCInlineIR!(ir, long2, double2, double2)(a, b);
    }

    /// CMPSS-style comparisons: only lane 0 is compared, the other lanes
    /// are copied from `a`.
    /// clang implement it through x86 intrinsics, it is possible with IR alone
    /// but leads to less optimal code.
    /// PERF: try to implement it with __builtin_ia32_cmpss and immediate 0 to 7.
    /// Not that simple.
    package float4 cmpss(FPComparison comparison)(float4 a, float4 b) pure @safe
    {
        /*
        enum ubyte predicateNumber = FPComparisonToX86Predicate[comparison];
        enum bool invertOp = (predicateNumber & 0x80) != 0;
        static if(invertOp)
            return __builtin_ia32_cmpsd(b, a, predicateNumber & 0x7f);
        else
            return __builtin_ia32_cmpsd(a, b, predicateNumber & 0x7f);
        */
        enum ir = `
            %cmp = fcmp `~ FPComparisonToString[comparison] ~` float %0, %1
            %r = sext i1 %cmp to i32
            %r2 = bitcast i32 %r to float
            ret float %r2`;

        // Start from `a` so that lanes 1..3 pass through unchanged.
        float4 merged = a;
        merged[0] = LDCInlineIR!(ir, float, float, float)(a[0], b[0]);
        return merged;
    }

    /// CMPSD-style comparisons: only lane 0 is compared, lane 1 is copied
    /// from `a`.
    /// clang implement it through x86 intrinsics, it is possible with IR alone
    /// but leads to less optimal code.
    /// PERF: try to implement it with __builtin_ia32_cmpsd and immediate 0 to 7.
    /// Not that simple.
    package double2 cmpsd(FPComparison comparison)(double2 a, double2 b) pure @safe
    {
        enum ir = `
            %cmp = fcmp `~ FPComparisonToString[comparison] ~` double %0, %1
            %r = sext i1 %cmp to i64
            %r2 = bitcast i64 %r to double
            ret double %r2`;

        // Start from `a` so that lane 1 passes through unchanged.
        double2 merged = a;
        merged[0] = LDCInlineIR!(ir, double, double, double)(a[0], b[0]);
        return merged;
    }
}
else
{
    /// Packed float comparison, one `compareFloat` call per lane.
    package int4 cmpps(FPComparison comparison)(float4 a, float4 b) pure @trusted
    {
        int4 mask;
        for (int lane = 0; lane < 4; ++lane)
        {
            immutable bool holds = compareFloat!float(comparison, a.array[lane], b.array[lane]);
            mask.ptr[lane] = holds ? -1 : 0;
        }
        return mask;
    }

    /// Packed double comparison, one `compareFloat` call per lane.
    package long2 cmppd(FPComparison comparison)(double2 a, double2 b) pure @trusted
    {
        long2 mask;
        for (int lane = 0; lane < 2; ++lane)
        {
            immutable bool holds = compareFloat!double(comparison, a.array[lane], b.array[lane]);
            mask.ptr[lane] = holds ? -1 : 0;
        }
        return mask;
    }

    /// CMPSS-style comparison: lane 0 receives the mask, other lanes keep the bits of `a`.
    package float4 cmpss(FPComparison comparison)(float4 a, float4 b) pure @trusted
    {
        int4 bits = cast(int4)a;
        immutable bool holds = compareFloat!float(comparison, a.array[0], b.array[0]);
        bits.ptr[0] = holds ? -1 : 0;
        return cast(float4)bits;
    }

    /// CMPSD-style comparison: lane 0 receives the mask, lane 1 keeps the bits of `a`.
    package double2 cmpsd(FPComparison comparison)(double2 a, double2 b) pure @trusted
    {
        long2 bits = cast(long2)a;
        immutable bool holds = compareFloat!double(comparison, a.array[0], b.array[0]);
        bits.ptr[0] = holds ? -1 : 0;
        return cast(double2)bits;
    }
}
unittest // cmpps
{
    // Check that every comparison type works
    float4 A = [1, 3, 5, float.nan];
    float4 B = [2, 3, 4, 5];

    int4 result_oeq = cmpps!(FPComparison.oeq)(A, B);
    int4 result_ogt = cmpps!(FPComparison.ogt)(A, B);
    int4 result_oge = cmpps!(FPComparison.oge)(A, B);
    int4 result_olt = cmpps!(FPComparison.olt)(A, B);
    int4 result_ole = cmpps!(FPComparison.ole)(A, B);
    int4 result_one = cmpps!(FPComparison.one)(A, B);
    int4 result_ord = cmpps!(FPComparison.ord)(A, B);
    int4 result_ueq = cmpps!(FPComparison.ueq)(A, B);
    int4 result_ugt = cmpps!(FPComparison.ugt)(A, B);
    int4 result_uge = cmpps!(FPComparison.uge)(A, B);
    int4 result_ult = cmpps!(FPComparison.ult)(A, B);
    int4 result_ule = cmpps!(FPComparison.ule)(A, B);
    int4 result_une = cmpps!(FPComparison.une)(A, B);
    int4 result_uno = cmpps!(FPComparison.uno)(A, B);

    static immutable int[4] correct_oeq    = [ 0,-1, 0, 0];
    static immutable int[4] correct_ogt    = [ 0, 0,-1, 0];
    static immutable int[4] correct_oge    = [ 0,-1,-1, 0];
    static immutable int[4] correct_olt    = [-1, 0, 0, 0];
    static immutable int[4] correct_ole    = [-1,-1, 0, 0];
    static immutable int[4] correct_one    = [-1, 0,-1, 0];
    static immutable int[4] correct_ord    = [-1,-1,-1, 0];
    static immutable int[4] correct_ueq    = [ 0,-1, 0,-1];
    static immutable int[4] correct_ugt    = [ 0, 0,-1,-1];
    static immutable int[4] correct_uge    = [ 0,-1,-1,-1];
    static immutable int[4] correct_ult    = [-1, 0, 0,-1];
    static immutable int[4] correct_ule    = [-1,-1, 0,-1];
    static immutable int[4] correct_une    = [-1, 0,-1,-1];
    static immutable int[4] correct_uno    = [ 0, 0, 0,-1];

    assert(result_oeq.array == correct_oeq);
    assert(result_ogt.array == correct_ogt);
    assert(result_oge.array == correct_oge);
    assert(result_olt.array == correct_olt);
    assert(result_ole.array == correct_ole);
    assert(result_one.array == correct_one);
    assert(result_ord.array == correct_ord);
    assert(result_ueq.array == correct_ueq);
    assert(result_ugt.array == correct_ugt);
    assert(result_uge.array == correct_uge);
    assert(result_ult.array == correct_ult);
    assert(result_ule.array == correct_ule);
    assert(result_une.array == correct_une);
    assert(result_uno.array == correct_uno);
}
unittest
{
    double2 a = [1, 3];
    double2 b = [2, 3];
    long2 c = cmppd!(FPComparison.ult)(a, b);
    static immutable long[2] correct = [cast(long)(-1), 0];
    assert(c.array == correct);
}
unittest // cmpss
{
    // Lane 0 must match the scalar reference; lanes 1..3 must come from A.
    void testComparison(FPComparison comparison)(float4 A, float4 B)
    {
        float4 result = cmpss!comparison(A, B);
        int4 iresult = cast(int4)result;
        int expected = compareFloat!float(comparison, A.array[0], B.array[0]) ? -1 : 0;
        assert(iresult.array[0] == expected);
        assert(result.array[1] == A.array[1]);
        assert(result.array[2] == A.array[2]);
        assert(result.array[3] == A.array[3]);
    }

    // Check that every comparison type works
    float4 A = [1, 3, 5, 6];
    float4 B = [2, 3, 4, 5];
    float4 C = [float.nan, 3, 4, 5];

    testComparison!(FPComparison.oeq)(A, B);
    testComparison!(FPComparison.oeq)(A, C);
    testComparison!(FPComparison.ogt)(A, B);
    testComparison!(FPComparison.ogt)(A, C);
    testComparison!(FPComparison.oge)(A, B);
    testComparison!(FPComparison.oge)(A, C);
    testComparison!(FPComparison.olt)(A, B);
    testComparison!(FPComparison.olt)(A, C);
    testComparison!(FPComparison.ole)(A, B);
    testComparison!(FPComparison.ole)(A, C);
    testComparison!(FPComparison.one)(A, B);
    testComparison!(FPComparison.one)(A, C);
    testComparison!(FPComparison.ord)(A, B);
    testComparison!(FPComparison.ord)(A, C);
    testComparison!(FPComparison.ueq)(A, B);
    testComparison!(FPComparison.ueq)(A, C);
    testComparison!(FPComparison.ugt)(A, B);
    testComparison!(FPComparison.ugt)(A, C);
    testComparison!(FPComparison.uge)(A, B);
    testComparison!(FPComparison.uge)(A, C);
    testComparison!(FPComparison.ult)(A, B);
    testComparison!(FPComparison.ult)(A, C);
    testComparison!(FPComparison.ule)(A, B);
    testComparison!(FPComparison.ule)(A, C);
    testComparison!(FPComparison.une)(A, B);
    testComparison!(FPComparison.une)(A, C);
    testComparison!(FPComparison.uno)(A, B);
    testComparison!(FPComparison.uno)(A, C);
}
unittest // cmpsd
{
    // Lane 0 must match the scalar reference; lane 1 must come from A.
    void testComparison(FPComparison comparison)(double2 A, double2 B)
    {
        double2 result = cmpsd!comparison(A, B);
        long2 iresult = cast(long2)result;
        long expected = compareFloat!double(comparison, A.array[0], B.array[0]) ? -1 : 0;
        assert(iresult.array[0] == expected);
        assert(result.array[1] == A.array[1]);
    }

    // Check that every comparison type works
    double2 A = [1, 3];
    double2 B = [2, 4];
    double2 C = [double.nan, 5];

    testComparison!(FPComparison.oeq)(A, B);
    testComparison!(FPComparison.oeq)(A, C);
    testComparison!(FPComparison.ogt)(A, B);
    testComparison!(FPComparison.ogt)(A, C);
    testComparison!(FPComparison.oge)(A, B);
    testComparison!(FPComparison.oge)(A, C);
    testComparison!(FPComparison.olt)(A, B);
    testComparison!(FPComparison.olt)(A, C);
    testComparison!(FPComparison.ole)(A, B);
    testComparison!(FPComparison.ole)(A, C);
    testComparison!(FPComparison.one)(A, B);
    testComparison!(FPComparison.one)(A, C);
    testComparison!(FPComparison.ord)(A, B);
    testComparison!(FPComparison.ord)(A, C);
    testComparison!(FPComparison.ueq)(A, B);
    testComparison!(FPComparison.ueq)(A, C);
    testComparison!(FPComparison.ugt)(A, B);
    testComparison!(FPComparison.ugt)(A, C);
    testComparison!(FPComparison.uge)(A, B);
    testComparison!(FPComparison.uge)(A, C);
    testComparison!(FPComparison.ult)(A, B);
    testComparison!(FPComparison.ult)(A, C);
    testComparison!(FPComparison.ule)(A, B);
    testComparison!(FPComparison.ule)(A, C);
    testComparison!(FPComparison.une)(A, B);
    testComparison!(FPComparison.une)(A, C);
    testComparison!(FPComparison.uno)(A, B);
    testComparison!(FPComparison.uno)(A, C);
}

//
//  </FLOATING-POINT COMPARISONS>
//


/// Returns the low 64-bit lane of `a` as an `__m64`.
__m64 to_m64(__m128i a) pure @trusted
{
    long2 wide = cast(long2)a;
    long1 low = wide.array[0];
    return low;
}

/// Widens `a` to 128 bits: lane 0 holds the value of `a`, lane 1 is zero.
__m128i to_m128i(__m64 a) pure @trusted
{
  /* Not sufficient to avoid https://issues.dlang.org/show_bug.cgi?id=21474

    version(DigitalMars) // Workaround for https://issues.dlang.org/show_bug.cgi?id=21474
    {
        long2 r = a.array[0];
        r.ptr[1] = 0;
        return cast(int4)r;
    }
    else */
    {
        long2 wide = [0, 0];
        wide.ptr[0] = a.array[0];
        return cast(__m128i)wide;
    }
}

// SOME NEON INTRINSICS
// Emulating some x86 intrinsics needs access to a range of ARM intrinsics.
// Not in the public API but the simde project expose it all for the user to use.
// MAYDO: create a new neon.d module, for internal use only.
// MAYDO: port them to ARM32 so that ARM32 can be as fast as ARM64.
static if (LDC_with_ARM64)
{
    // VERY USEFUL LINK
    // https://github.com/ldc-developers/llvm-project/blob/ldc-release/11.x/llvm/include/llvm/IR/IntrinsicsAArch64.td

    pragma(LDC_intrinsic, "llvm.aarch64.neon.uabd.v16i8")
        byte16 vabdq_u8(byte16 a, byte16 b) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.addp.v8i8")
        byte8 vpadd_u8(byte8 a, byte8 b) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.uaddlp.v8i16.v16i8")
        short8 vpaddlq_u8 (byte16 a) pure @safe;

    /// Lane-wise bitwise AND.
    byte8 vand_u8(byte8 a, byte8 b) pure @safe
    {
        return a & b;
    }

    /// Concatenates two 4-lane vectors: `lo` fills lanes 0..3, `hi` lanes 4..7.
    short8 vcombine_s16(short4 lo, short4 hi) pure @trusted
    {
        short8 joined;
        foreach(lane; 0..4)
        {
            joined.ptr[lane]     = lo.array[lane];
            joined.ptr[lane + 4] = hi.array[lane];
        }
        return joined;
    }

    /// Concatenates two 2-lane vectors: `lo` fills lanes 0..1, `hi` lanes 2..3.
    int4 vcombine_s32(int2 lo, int2 hi) pure @trusted
    {
        int4 joined;
        foreach(lane; 0..2)
        {
            joined.ptr[lane]     = lo.array[lane];
            joined.ptr[lane + 2] = hi.array[lane];
        }
        return joined;
    }

    /// Concatenates two 8-lane vectors: `lo` fills lanes 0..7, `hi` lanes 8..15.
    byte16 vcombine_s8(byte8 lo, byte8 hi) pure @trusted
    {
        byte16 joined;
        foreach(lane; 0..8)
        {
            joined.ptr[lane]     = lo.array[lane];
            joined.ptr[lane + 8] = hi.array[lane];
        }
        return joined;
    }


    // float4 => int4

    pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtms.v4i32.v4f32")
        int4 vcvtmq_s32_f32(float4 a) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtns.v4i32.v4f32")
        int4 vcvtnq_s32_f32(float4 a) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtps.v4i32.v4f32")
        int4 vcvtpq_s32_f32(float4 a) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtzs.v4i32.v4f32")
        int4 vcvtzq_s32_f32(float4 a) pure @safe;


    // double2 => long2

    pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtms.v2i64.v2f64")
        long2 vcvtmq_s64_f64(double2 a) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtns.v2i64.v2f64")
        long2 vcvtnq_s64_f64(double2 a) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtps.v2i64.v2f64")
        long2 vcvtpq_s64_f64(double2 a) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtzs.v2i64.v2f64")
        long2 vcvtzq_s64_f64(double2 a) pure @safe;


    // float => int

    pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtms.i32.f32")
        int vcvtms_s32_f32(float a) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtns.i32.f32")
        int vcvtns_s32_f32(float a) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtps.i32.f32")
        int vcvtps_s32_f32(float a) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtzs.i32.f32")
        int vcvts_s32_f32(float a) pure @safe;


    // double => int

    pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtms.i32.f64")
        int vcvtms_s32_f64(double a) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtns.i32.f64")
        int vcvtns_s32_f64(double a) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtps.i32.f64")
        int vcvtps_s32_f64(double a) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtzs.i32.f64")
        int vcvts_s32_f64(double a) pure @safe;


    // float => long

    pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtms.i64.f32")
        long vcvtms_s64_f32(float a) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtns.i64.f32")
        long vcvtns_s64_f32(float a) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtps.i64.f32")
        long vcvtps_s64_f32(float a) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtzs.i64.f32")
        long vcvts_s64_f32(float a) pure @safe;


    // double => long

    pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtms.i64.f64")
        long vcvtms_s64_f64(double a) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtns.i64.f64")
        long vcvtns_s64_f64(double a) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtps.i64.f64")
        long vcvtps_s64_f64(double a) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtzs.i64.f64")
        long vcvts_s64_f64(double a) pure @safe;

    /// Returns lanes 4..7 of `a`.
    short4 vget_high_s16(short8 a) pure @trusted
    {
        short4 upper;
        foreach(lane; 0..4)
            upper.ptr[lane] = a.array[lane + 4];
        return upper;
    }

    /// Returns lanes 2..3 of `a`.
    int2 vget_high_s32(int4 a) pure @trusted
    {
        int2 upper;
        foreach(lane; 0..2)
            upper.ptr[lane] = a.array[lane + 2];
        return upper;
    }

    /// Returns lanes 8..15 of `a`.
    byte8 vget_high_u8(byte16 a) pure @trusted
    {
        byte8 upper;
        foreach(lane; 0..8)
            upper.ptr[lane] = a.array[lane + 8];
        return upper;
    }

    short4 vget_low_s16(short8 a) pure @trusted
    {
        short4 r;
        r.ptr[0] = a.array[0];
        r.ptr[1] = a.array[1];
        r.ptr[2] = a.array[2];
        r.ptr[3] = a.array[3];
        return r;
1224 } 1225 1226 int2 vget_low_s32(int4 a) pure @trusted 1227 { 1228 int2 r; 1229 r.ptr[0] = a.array[0]; 1230 r.ptr[1] = a.array[1]; 1231 return r; 1232 } 1233 1234 byte8 vget_low_u8(byte16 a) pure @trusted 1235 { 1236 byte8 r; 1237 r.ptr[0] = a.array[0]; 1238 r.ptr[1] = a.array[1]; 1239 r.ptr[2] = a.array[2]; 1240 r.ptr[3] = a.array[3]; 1241 r.ptr[4] = a.array[4]; 1242 r.ptr[5] = a.array[5]; 1243 r.ptr[6] = a.array[6]; 1244 r.ptr[7] = a.array[7]; 1245 return r; 1246 } 1247 1248 pragma(LDC_intrinsic, "llvm.aarch64.neon.smax.v8i16") 1249 short8 vmaxq_s16(short8 a, short8 b) pure @safe; 1250 1251 pragma(LDC_intrinsic, "llvm.aarch64.neon.smin.v8i16") 1252 short8 vminq_s16(short8 a, short8 b) pure @safe; 1253 1254 int4 vmull_s16(short4 a, short4 b) pure @trusted 1255 { 1256 int4 r; 1257 r.ptr[0] = a.array[0] * b.array[0]; 1258 r.ptr[1] = a.array[1] * b.array[1]; 1259 r.ptr[2] = a.array[2] * b.array[2]; 1260 r.ptr[3] = a.array[3] * b.array[3]; 1261 return r; 1262 } 1263 1264 static if(__VERSION__ >= 2088) // LDC 1.18 start using LLVM9 who changes the name of the builtin 1265 { 1266 pragma(LDC_intrinsic, "llvm.aarch64.neon.faddp.v4f32") 1267 float4 vpaddq_f32(float4 a, float4 b) pure @safe; 1268 } 1269 else 1270 { 1271 pragma(LDC_intrinsic, "llvm.aarch64.neon.addp.v4f32") 1272 float4 vpaddq_f32(float4 a, float4 b) pure @safe; 1273 } 1274 1275 pragma(LDC_intrinsic, "llvm.aarch64.neon.addp.v2i32") 1276 int2 vpadd_s32(int2 a, int2 b) pure @safe; 1277 1278 pragma(LDC_intrinsic, "llvm.aarch64.neon.addp.v16i8") 1279 byte16 vpaddq_s8(byte16 a, byte16 b) pure @safe; 1280 1281 pragma(LDC_intrinsic, "llvm.aarch64.neon.sqxtn.v8i8") 1282 byte8 vqmovn_s16(short8 a) pure @safe; 1283 1284 pragma(LDC_intrinsic, "llvm.aarch64.neon.sqxtn.v4i16") 1285 short4 vqmovn_s32(int4 a) pure @safe; 1286 1287 pragma(LDC_intrinsic, "llvm.aarch64.neon.sqxtun.v8i8") 1288 byte8 vqmovun_s16(short8 a) pure @safe; 1289 1290 pragma(LDC_intrinsic, "llvm.aarch64.neon.urhadd.v16i8") 1291 byte16 vrhadd_u8(byte16 
a, byte16 b) pure @safe; 1292 1293 pragma(LDC_intrinsic, "llvm.aarch64.neon.urhadd.v8i16") 1294 short8 vrhadd_u16(short8 a, short8 b) pure @safe; 1295 1296 byte8 vshr_u8(byte8 a, byte8 b) pure @safe 1297 { 1298 return a >>> b; 1299 } 1300 } 1301 1302 version(unittest) 1303 { 1304 double abs_double(double x) @trusted 1305 { 1306 version(LDC) 1307 return llvm_fabs(x); 1308 else 1309 { 1310 long uf = *cast(long*)(&x); 1311 uf &= 0x7fffffff_ffffffff; 1312 return *cast(double*)(&uf); 1313 } 1314 } 1315 } 1316 1317 // needed because in olg GDC from travis, core.stdc.math.isnan isn't pure 1318 1319 bool isnan(float x) pure @trusted 1320 { 1321 uint u = *cast(uint*)(&x); 1322 bool result = ((u & 0x7F800000) == 0x7F800000) && (u & 0x007FFFFF); 1323 return result; 1324 } 1325 unittest 1326 { 1327 float x = float.nan; 1328 assert(isnan(x)); 1329 1330 x = 0; 1331 assert(!isnan(x)); 1332 1333 x = float.infinity; 1334 assert(!isnan(x)); 1335 } 1336 1337 bool isnan(double x) pure @trusted 1338 { 1339 ulong u = *cast(ulong*)(&x); 1340 return ((u & 0x7FF00000_00000000) == 0x7FF00000_00000000) && (u & 0x000FFFFF_FFFFFFFF); 1341 } 1342 unittest 1343 { 1344 double x = double.nan; 1345 assert(isnan(x)); 1346 1347 x = 0; 1348 assert(!isnan(x)); 1349 1350 x = double.infinity; 1351 assert(!isnan(x)); 1352 }