/**
* Internal stuff only, do not import.
*
* Copyright: Copyright Guillaume Piolat 2016-2020, Stefanos Baziotis 2019.
* License:   $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
*/
module inteli.internals;

import inteli.types;

// The only math functions needed for intel-intrinsics
public import core.math: sqrt; // since it's an intrinsic

package:
nothrow:
@nogc:


// Compile-time capability flags for GDC. Every `GDC_with_*` enum is defined in
// every configuration so that callers can test them with `static if`.
version(GNU)
{
    version (X86)
    {
        // For 32-bit x86, disable vector extensions with GDC.
        // It just doesn't work well.
        enum GDC_with_x86 = true;
        enum GDC_with_MMX = false;
        enum GDC_with_SSE = false;
        enum GDC_with_SSE2 = false;
        enum GDC_with_SSE3 = false;
        enum GDC_with_SSSE3 = false;
        enum GDC_with_SSE41 = false;
        enum GDC_with_SSE42 = false;
        enum GDC_with_SHA = false;
        enum GDC_with_BMI2 = false;
    }
    else version (X86_64)
    {
        // GDC support uses extended inline assembly:
        //   https://gcc.gnu.org/onlinedocs/gcc/Extended-Asm.html        (general information and hints)
        //   https://gcc.gnu.org/onlinedocs/gcc/Simple-Constraints.html  (binding variables to registers)
        //   https://gcc.gnu.org/onlinedocs/gcc/Machine-Constraints.html (x86 specific register short names)

        public import core.simd;

        // NOTE: These intrinsics are not available in every i386 and x86_64 CPU.
        // For more info: https://gcc.gnu.org/onlinedocs/gcc-4.9.2/gcc/X86-Built-in-Functions.html
        public import gcc.builtins;

        enum GDC_with_x86 = true;
        enum GDC_with_MMX = true;  // We don't have a way to detect that at CT, but we assume it's there
        enum GDC_with_SSE = true;  // We don't have a way to detect that at CT, but we assume it's there
        enum GDC_with_SSE2 = true; // We don't have a way to detect that at CT, but we assume it's there

        enum GDC_with_SSE3 = false;  // TODO: we don't have a way to detect that at CT
        enum GDC_with_SSSE3 = false; // TODO: we don't have a way to detect that at CT
        enum GDC_with_SSE41 = false; // TODO: we don't have a way to detect that at CT
        enum GDC_with_SSE42 = false; // TODO: we don't have a way to detect that at CT
        enum GDC_with_SHA = false;
        enum GDC_with_BMI2 = false;
    }
    else
    {
        enum GDC_with_x86 = false;
        enum GDC_with_MMX = false;
        enum GDC_with_SSE = false;
        enum GDC_with_SSE2 = false;
        enum GDC_with_SSE3 = false;
        enum GDC_with_SSSE3 = false;
        enum GDC_with_SSE41 = false;
        enum GDC_with_SSE42 = false;
        enum GDC_with_SHA = false;
        enum GDC_with_BMI2 = false;
    }
}
else
{
    enum GDC_with_x86 = false;
    enum GDC_with_MMX = false;
    enum GDC_with_SSE = false;
    enum GDC_with_SSE2 = false;
    enum GDC_with_SSE3 = false;
    enum GDC_with_SSSE3 = false;
    enum GDC_with_SSE41 = false;
    enum GDC_with_SSE42 = false;
    enum GDC_with_SHA = false;
    enum GDC_with_BMI2 = false;
}

// Compile-time capability flags for LDC. Every `LDC_with_*` enum is defined in
// every configuration so that callers can test them with `static if`.
version(LDC)
{
    public import core.simd;
    public import ldc.simd;
    public import ldc.intrinsics;
    public import ldc.llvmasm: __asm;

    // Since LDC 1.13, using the new ldc.llvmasm.__ir variants instead of inlineIR
    static if (__VERSION__ >= 2083)
    {
        import ldc.llvmasm;
        alias LDCInlineIR = __ir_pure;

        // A version of inline IR with prefix/suffix didn't exist before LDC 1.13
        alias LDCInlineIREx = __irEx_pure;
    }
    else
    {
        alias LDCInlineIR = inlineIR;
    }

    version(ARM)
    {
        public import ldc.gccbuiltins_arm;
        enum LDC_with_ARM32 = true;
        enum LDC_with_ARM64 = false;
        enum LDC_with_SSE1 = false;
        enum LDC_with_SSE2 = false;
        enum LDC_with_SSE3 = false;
        enum LDC_with_SSSE3 = false;
        enum LDC_with_SSE41 = false;
        enum LDC_with_SSE42 = false;
        enum LDC_with_AVX = false;
        enum LDC_with_AVX2 = false;
        enum LDC_with_SHA = false;
        enum LDC_with_BMI2 = false;
    }
    else version(AArch64)
    {
        enum LDC_with_ARM32 = false;
        enum LDC_with_ARM64 = true;
        enum LDC_with_SSE1 = false;
        enum LDC_with_SSE2 = false;
        enum LDC_with_SSE3 = false;
        enum LDC_with_SSSE3 = false;
        enum LDC_with_SSE41 = false;
        enum LDC_with_SSE42 = false;
        enum LDC_with_AVX = false;
        enum LDC_with_AVX2 = false;
        enum LDC_with_SHA = false;
        enum LDC_with_BMI2 = false;
    }
    else
    {
        // Any other LDC target is treated as x86: features are queried from the target.
        public import ldc.gccbuiltins_x86;
        enum LDC_with_ARM32 = false;
        enum LDC_with_ARM64 = false;
        enum LDC_with_SSE1 = __traits(targetHasFeature, "sse");
        enum LDC_with_SSE2 = __traits(targetHasFeature, "sse2");
        enum LDC_with_SSE3 = __traits(targetHasFeature, "sse3");
        enum LDC_with_SSSE3 = __traits(targetHasFeature, "ssse3");
        enum LDC_with_SSE41 = __traits(targetHasFeature, "sse4.1");
        enum LDC_with_SSE42 = __traits(targetHasFeature, "sse4.2");
        enum LDC_with_AVX = __traits(targetHasFeature, "avx");
        enum LDC_with_AVX2 = __traits(targetHasFeature, "avx2");
        enum LDC_with_SHA = __traits(targetHasFeature, "sha");
        enum LDC_with_BMI2 = __traits(targetHasFeature, "bmi2");
    }
}
else
{
    enum LDC_with_ARM32 = false;
    enum LDC_with_ARM64 = false;
    enum LDC_with_SSE1 = false;
    enum LDC_with_SSE2 = false;
    enum LDC_with_SSE3 = false;
    enum LDC_with_SSSE3 = false;
    enum LDC_with_SSE41 = false;
    enum LDC_with_SSE42 = false;
    enum LDC_with_AVX = false;
    enum LDC_with_AVX2 = false;
    enum LDC_with_SHA = false;
    enum LDC_with_BMI2 = false;
}

// Non-zero when LDC targets either 32-bit or 64-bit ARM.
enum LDC_with_ARM = LDC_with_ARM32 | LDC_with_ARM64;

// Compile-time capability flags for DMD.
version(DigitalMars)
{
    version(D_InlineAsm_X86)
        enum DMD_with_asm = true;
    else version(D_InlineAsm_X86_64)
        enum DMD_with_asm = true;
    else
        enum DMD_with_asm = false;

    version(D_InlineAsm_X86)
        enum DMD_with_32bit_asm = DMD_with_asm; // sometimes you want a 32-bit DMD only solution
    else
        enum DMD_with_32bit_asm = false;

    version (D_SIMD)
        enum DMD_with_DSIMD = !SSESizedVectorsAreEmulated;
    else
        enum DMD_with_DSIMD = false;
}
else
{
    enum DMD_with_asm = false;
    enum DMD_with_32bit_asm = false;
    enum DMD_with_DSIMD = false;
}

static if (LDC_with_ARM32)
{
    /// Reads the floating-point control register (FPSCR on 32-bit ARM).
    package uint arm_get_fpcr() nothrow @nogc @trusted
    {
        return __builtin_arm_get_fpscr();
    }

    /// Writes `cw` to the floating-point control register (FPSCR on 32-bit ARM).
    package void arm_set_fpcr(uint cw) nothrow @nogc @trusted
    {
        __builtin_arm_set_fpscr(cw);
    }
}

static if (LDC_with_ARM64)
{
    // Declared but not used by arm_get_fpcr below, see the comment there.
    pragma(LDC_intrinsic, "llvm.aarch64.get.fpcr")
        long __builtin_aarch64_get_fpcr() pure nothrow @nogc @safe;

    /// Reads the floating-point control register (FPCR on AArch64).
    package uint arm_get_fpcr() pure nothrow @nogc @trusted
    {
        // LLVM intrinsic "llvm.aarch64.get.fpcr" seems buggy and doesn't return FPCR
        return __asm!uint("mrs $0, fpcr", "=r");
    }

    /// Writes `cw` to the floating-point control register (FPCR on AArch64).
    package void arm_set_fpcr(uint cw) nothrow @nogc @trusted
    {
        // Note: there doesn't seem to be an intrinsic in LLVM to set FPCR.
        // x2 is used as a scratch register; it is saved to `save_x2` first and
        // restored afterwards, since no clobber is declared.
        long save_x2;
        __asm!void("str x2, $1 \n" ~
                   "ldr w2, $0 \n" ~
                   "msr fpcr, x2 \n" ~
                   "ldr x2, $1 "   , "m,m", cw, &save_x2);
    }
}


// For internal use only, since public API deals with a x86 semantic emulation
enum uint _MM_ROUND_NEAREST_ARM     = 0x00000000;
enum uint _MM_ROUND_DOWN_ARM        = 0x00800000;
enum uint _MM_ROUND_UP_ARM          = 0x00400000;
enum uint _MM_ROUND_TOWARD_ZERO_ARM = 0x00C00000;
enum uint _MM_ROUND_MASK_ARM        = 0x00C00000;
enum uint _MM_FLUSH_ZERO_MASK_ARM   = 0x01000000;


//
//  <ROUNDING>
//
//  Why is that there? For DMD, we cannot use rint because _MM_SET_ROUNDING_MODE
//  doesn't change the FPU rounding mode, and isn't expected to do so.
//  So we devised these rounding functions to help having consistent rounding between
//  LDC and DMD. It's important that DMD uses what is in MXCSR to round.
//
//  Note: There is no MXCSR in ARM. But there is fpcr/fpscr that implements similar
//  functionality.
//  https://developer.arm.com/documentation/dui0068/b/vector-floating-point-programming/vfp-system-registers/fpscr--the-floating-point-status-and-control-register
//  We use fpcr/fpscr since it's thread-local, so we can emulate those x86 conversion albeit slowly.

/// Converts a `float` to a 32-bit integer using the current rounding mode
/// (MXCSR on x86, fpcr/fpscr on ARM).
int convertFloatToInt32UsingMXCSR(float value) @trusted
{
    int result;
    version(GNU)
    {
        asm pure nothrow @nogc @trusted
        {
            "cvtss2si %1, %0\n": "=r"(result) : "x" (value);
        }
    }
    else static if (LDC_with_ARM32)
    {
        // TODO: this is a bug, it won't preserve registers when optimized
        result = __asm!int(`vldr s2, $1
                            vcvtr.s32.f32 s2, s2
                            vmov $0, s2`, "=r,m", value);
    }
    else static if (LDC_with_ARM64)
    {
        // Get current rounding mode.
        uint fpscr = arm_get_fpcr();

        // Dispatch to the conversion helper matching that rounding mode.
        switch(fpscr & _MM_ROUND_MASK_ARM)
        {
            default:
            case _MM_ROUND_NEAREST_ARM:     result = vcvtns_s32_f32(value); break;
            case _MM_ROUND_DOWN_ARM:        result = vcvtms_s32_f32(value); break;
            case _MM_ROUND_UP_ARM:          result = vcvtps_s32_f32(value); break;
            case _MM_ROUND_TOWARD_ZERO_ARM: result = vcvts_s32_f32(value);  break;
        }
    }
    else
    {
        asm pure nothrow @nogc @trusted
        {
            cvtss2si EAX, value;
            mov result, EAX;
        }
    }
    return result;
}

/// Converts a `double` to a 32-bit integer using the current rounding mode
/// (MXCSR on x86, fpcr/fpscr on ARM).
int convertDoubleToInt32UsingMXCSR(double value) @trusted
{
    int result;
    version(GNU)
    {
        asm pure nothrow @nogc @trusted
        {
            "cvtsd2si %1, %0\n": "=r"(result) : "x" (value);
        }
    }
    else static if (LDC_with_ARM32)
    {
        // TODO: bug, doesn't preserve registers
        result = __asm!int(`vldr d2, $1
                            vcvtr.s32.f64 s2, d2
                            vmov $0, s2`, "=r,m", value);
    }
    else static if (LDC_with_ARM64)
    {
        // Get current rounding mode.
        uint fpscr = arm_get_fpcr();

        // Dispatch to the conversion helper matching that rounding mode.
        switch(fpscr & _MM_ROUND_MASK_ARM)
        {
            default:
            case _MM_ROUND_NEAREST_ARM:     result = vcvtns_s32_f64(value); break;
            case _MM_ROUND_DOWN_ARM:        result = vcvtms_s32_f64(value); break;
            case _MM_ROUND_UP_ARM:          result = vcvtps_s32_f64(value); break;
            case _MM_ROUND_TOWARD_ZERO_ARM: result = vcvts_s32_f64(value);  break;
        }
    }
    else
    {
        asm pure nothrow @nogc @trusted
        {
            cvtsd2si EAX, value;
            mov result, EAX;
        }
    }
    return result;
}

/// Converts a `float` to a 64-bit integer using the current rounding mode
/// (MXCSR on x86, fpcr/fpscr on ARM).
long convertFloatToInt64UsingMXCSR(float value) @trusted
{
    static if (LDC_with_ARM32)
    {
        // We have to resort to libc since 32-bit ARM
        // doesn't seem to have 64-bit registers.

        uint fpscr = arm_get_fpcr(); // Get current rounding mode.

        // Note: converting to double precision else rounding could be different for large integers
        double asDouble = value;

        switch(fpscr & _MM_ROUND_MASK_ARM)
        {
            default:
            // Round-to-nearest must send ties to the even integer, like CVTSS2SI and
            // the ARM64 path. `llvm_round` sends ties away from zero (2.5 => 3), so
            // `llvm_nearbyint` is used instead: this branch is only reached when the
            // current rounding mode is "nearest", where it rounds ties to even.
            case _MM_ROUND_NEAREST_ARM:     return cast(long)(llvm_nearbyint(asDouble));
            case _MM_ROUND_DOWN_ARM:        return cast(long)(llvm_floor(asDouble));
            case _MM_ROUND_UP_ARM:          return cast(long)(llvm_ceil(asDouble));
            case _MM_ROUND_TOWARD_ZERO_ARM: return cast(long)(asDouble);
        }
    }
    else static if (LDC_with_ARM64)
    {
        // Get current rounding mode.
        uint fpscr = arm_get_fpcr();

        switch(fpscr & _MM_ROUND_MASK_ARM)
        {
            default:
            case _MM_ROUND_NEAREST_ARM:     return vcvtns_s64_f32(value);
            case _MM_ROUND_DOWN_ARM:        return vcvtms_s64_f32(value);
            case _MM_ROUND_UP_ARM:          return vcvtps_s64_f32(value);
            case _MM_ROUND_TOWARD_ZERO_ARM: return vcvts_s64_f32(value);
        }
    }
    // 64-bit can use an SSE instruction
    else version(D_InlineAsm_X86_64)
    {
        long result;
        version(LDC) // work-around for " Data definition directives inside inline asm are not supported yet."
        {
            asm pure nothrow @nogc @trusted
            {
                movss XMM0, value;
                cvtss2si RAX, XMM0;
                mov result, RAX;
            }
        }
        else
        {
            asm pure nothrow @nogc @trusted
            {
                movss XMM0, value;
                db 0xf3; db 0x48; db 0x0f; db 0x2d; db 0xc0; // cvtss2si RAX, XMM0 (DMD refuses to emit)
                mov result, RAX;
            }
        }
        return result;
    }
    else version(D_InlineAsm_X86)
    {
        // In the case of 32-bit x86 there is no SSE2 way to convert FP to 64-bit int
        // This leads to an unfortunate FPU sequence in every C++ compiler.
        // See: https://godbolt.org/z/vZym77

        // Get current MXCSR rounding
        uint sseRounding;
        ushort savedFPUCW;
        ushort newFPUCW;
        long result;
        asm pure nothrow @nogc @trusted
        {
            stmxcsr sseRounding;
            fld value;
            fnstcw savedFPUCW;
            mov AX, savedFPUCW;
            and AX, 0xf3ff;          // clear FPU rounding bits
            movzx ECX, word ptr sseRounding;
            and ECX, 0x6000;         // only keep SSE rounding bits
            shr ECX, 3;
            or AX, CX;               // make a new control word for FPU with SSE bits
            mov newFPUCW, AX;
            fldcw newFPUCW;
            fistp qword ptr result;  // convert, respecting MXCSR (but not other control word things)
            fldcw savedFPUCW;
        }
        return result;
    }
    else static if (GDC_with_x86)
    {
        version(X86_64) // 64-bit can just use the right instruction
        {
            static assert(GDC_with_SSE);
            __m128 A;
            A.ptr[0] = value;
            return __builtin_ia32_cvtss2si64 (A);
        }
        else version(X86) // 32-bit
        {
            // This is untested!
            // NOTE(review): `value` is bound with constraint "f" here but with "t" in
            // convertDoubleToInt64UsingMXCSR; one of the two is presumably unintended,
            // confirm against the GCC x87 constraint rules before relying on this path.
            uint sseRounding;
            ushort savedFPUCW;
            ushort newFPUCW;
            long result;
            asm pure nothrow @nogc @trusted
            {
                "stmxcsr %1;\n" ~
                "fld %2;\n" ~
                "fnstcw %3;\n" ~
                "movw %3, %%ax;\n" ~
                "andw $0xf3ff, %%ax;\n" ~
                "movzwl %1, %%ecx;\n" ~
                "andl $0x6000, %%ecx;\n" ~
                "shrl $3, %%ecx;\n" ~
                "orw %%cx, %%ax\n" ~
                "movw %%ax, %4;\n" ~
                "fldcw %4;\n" ~
                "fistpll %0;\n" ~
                "fldcw %3;\n"
                  : "=m"(result)    // %0
                  : "m" (sseRounding),
                    "f" (value),
                    "m" (savedFPUCW),
                    "m" (newFPUCW)
                  : "eax", "ecx", "st";
            }
            return result;
        }
        else
            static assert(false);
    }
    else
        static assert(false);
}


/// Converts a `double` to a 64-bit integer using the current rounding mode
/// (MXCSR on x86, fpcr/fpscr on ARM).
long convertDoubleToInt64UsingMXCSR(double value) @trusted
{
    static if (LDC_with_ARM32)
    {
        // We have to resort to libc since 32-bit ARM
        // doesn't seem to have 64-bit registers.
        uint fpscr = arm_get_fpcr(); // Get current rounding mode.
        switch(fpscr & _MM_ROUND_MASK_ARM)
        {
            default:
            // Round-to-nearest must send ties to the even integer, like CVTSD2SI and
            // the ARM64 path. `llvm_round` sends ties away from zero (2.5 => 3), so
            // `llvm_nearbyint` is used instead: this branch is only reached when the
            // current rounding mode is "nearest", where it rounds ties to even.
            case _MM_ROUND_NEAREST_ARM:     return cast(long)(llvm_nearbyint(value));
            case _MM_ROUND_DOWN_ARM:        return cast(long)(llvm_floor(value));
            case _MM_ROUND_UP_ARM:          return cast(long)(llvm_ceil(value));
            case _MM_ROUND_TOWARD_ZERO_ARM: return cast(long)(value);
        }
    }
    else static if (LDC_with_ARM64)
    {
        // Get current rounding mode.
        uint fpscr = arm_get_fpcr();

        switch(fpscr & _MM_ROUND_MASK_ARM)
        {
            default:
            case _MM_ROUND_NEAREST_ARM:     return vcvtns_s64_f64(value);
            case _MM_ROUND_DOWN_ARM:        return vcvtms_s64_f64(value);
            case _MM_ROUND_UP_ARM:          return vcvtps_s64_f64(value);
            case _MM_ROUND_TOWARD_ZERO_ARM: return vcvts_s64_f64(value);
        }
    }
    // 64-bit can use an SSE instruction
    else version(D_InlineAsm_X86_64)
    {
        long result;
        version(LDC) // work-around for "Data definition directives inside inline asm are not supported yet."
        {
            asm pure nothrow @nogc @trusted
            {
                movsd XMM0, value;
                cvtsd2si RAX, XMM0;
                mov result, RAX;
            }
        }
        else
        {
            asm pure nothrow @nogc @trusted
            {
                movsd XMM0, value;
                db 0xf2; db 0x48; db 0x0f; db 0x2d; db 0xc0; // cvtsd2si RAX, XMM0 (DMD refuses to emit)
                mov result, RAX;
            }
        }
        return result;
    }
    else version(D_InlineAsm_X86)
    {
        // In the case of 32-bit x86 there is no SSE2 way to convert FP to 64-bit int
        // This leads to an unfortunate FPU sequence in every C++ compiler.
        // See: https://godbolt.org/z/vZym77

        // Get current MXCSR rounding
        uint sseRounding;
        ushort savedFPUCW;
        ushort newFPUCW;
        long result;
        asm pure nothrow @nogc @trusted
        {
            stmxcsr sseRounding;
            fld value;
            fnstcw savedFPUCW;
            mov AX, savedFPUCW;
            and AX, 0xf3ff;          // clear FPU rounding bits
            movzx ECX, word ptr sseRounding;
            and ECX, 0x6000;         // only keep SSE rounding bits
            shr ECX, 3;
            or AX, CX;               // make a new control word for FPU with SSE bits
            mov newFPUCW, AX;
            fldcw newFPUCW;
            fistp result;            // convert using the temporary control word
            fldcw savedFPUCW;
        }
        return result;
    }
    else static if (GDC_with_x86)
    {
        version(X86_64)
        {
            static assert(GDC_with_SSE2);
            __m128d A;
            A.ptr[0] = value;
            return __builtin_ia32_cvtsd2si64 (A);
        }
        else
        {
            // This is untested!
            // NOTE(review): `value` is bound with constraint "t" here but with "f" in
            // convertFloatToInt64UsingMXCSR; one of the two is presumably unintended,
            // confirm against the GCC x87 constraint rules before relying on this path.
            uint sseRounding;
            ushort savedFPUCW;
            ushort newFPUCW;
            long result;
            asm pure nothrow @nogc @trusted
            {
                "stmxcsr %1;\n" ~
                "fld %2;\n" ~
                "fnstcw %3;\n" ~
                "movw %3, %%ax;\n" ~
                "andw $0xf3ff, %%ax;\n" ~
                "movzwl %1, %%ecx;\n" ~
                "andl $0x6000, %%ecx;\n" ~
                "shrl $3, %%ecx;\n" ~
                "orw %%cx, %%ax\n" ~
                "movw %%ax, %4;\n" ~
                "fldcw %4;\n" ~
                "fistpll %0;\n" ~
                "fldcw %3;\n"
                  : "=m"(result)    // %0
                  : "m" (sseRounding),
                    "t" (value),
                    "m" (savedFPUCW),
                    "m" (newFPUCW)
                  : "eax", "ecx", "st";
            }
            return result;
        }
    }
    else
        static assert(false);
}

//
//  </ROUNDING>
//


// using the Intel terminology here

/// Clamps a signed 16-bit value into the signed 8-bit range [-128, 127].
byte saturateSignedWordToSignedByte(short value) pure @safe
{
    if (value > 127) value = 127;
    if (value < -128) value = -128;
    return cast(byte) value;
}

/// Clamps a signed 16-bit value into the unsigned 8-bit range [0, 255].
ubyte saturateSignedWordToUnsignedByte(short value) pure @safe
{
    if (value > 255) value = 255;
    if (value < 0) value = 0;
    return cast(ubyte) value;
}

/// Clamps a signed 32-bit value into the signed 16-bit range [-32768, 32767].
short saturateSignedIntToSignedShort(int value) pure @safe
{
    if (value > 32767) value = 32767;
    if (value < -32768) value = -32768;
    return cast(short) value;
}

/// Clamps a signed 32-bit value into the unsigned 16-bit range [0, 65535].
ushort saturateSignedIntToUnsignedShort(int value) pure @safe
{
    if (value > 65535) value = 65535;
    if (value < 0) value = 0;
    return cast(ushort) value;
}

unittest // test saturate operations
{
    assert( saturateSignedWordToSignedByte(32000) == 127);
    assert( saturateSignedWordToUnsignedByte(32000) == 255);
    assert( saturateSignedWordToSignedByte(-4000) == -128);
    assert( saturateSignedWordToUnsignedByte(-4000) == 0);
    assert( saturateSignedIntToSignedShort(32768) == 32767);
    assert( saturateSignedIntToUnsignedShort(32768) == 32768);
    assert( saturateSignedIntToSignedShort(-32769) == -32768);
    assert( saturateSignedIntToUnsignedShort(-32769) == 0);
}

version(unittest)
{
    // This is just for debugging tests
    import core.stdc.stdio: printf;

    // printing vectors for implementation
    // Note: you can override `pure` within a `debug` clause

    /// Prints the vector as one 64-bit integer.
    void _mm_print_pi64(__m64 v) @trusted
    {
        long1 vl = cast(long1)v;
        printf("%lld\n", vl.array[0]);
    }

    /// Prints the vector as two 32-bit integers.
    void _mm_print_pi32(__m64 v) @trusted
    {
        int[2] C = (cast(int2)v).array;
        printf("%d %d\n", C[0], C[1]);
    }

    /// Prints the vector as four 16-bit integers.
    void _mm_print_pi16(__m64 v) @trusted
    {
        short[4] C = (cast(short4)v).array;
        printf("%d %d %d %d\n", C[0], C[1], C[2], C[3]);
    }

    /// Prints the vector as eight 8-bit integers.
    void _mm_print_pi8(__m64 v) @trusted
    {
        byte[8] C = (cast(byte8)v).array;
        printf("%d %d %d %d %d %d %d %d\n",
        C[0], C[1], C[2], C[3], C[4], C[5], C[6], C[7]);
    }

    /// Prints the vector as two 64-bit integers.
    void _mm_print_epi64(__m128i v) @trusted
    {
        long2 vl = cast(long2)v;
        printf("%lld %lld\n", vl.array[0], vl.array[1]);
    }

    /// Prints the vector as four 32-bit integers.
    void _mm_print_epi32(__m128i v) @trusted
    {
        printf("%d %d %d %d\n",
              v.array[0], v.array[1], v.array[2], v.array[3]);
    }

    /// Prints the vector as eight 16-bit integers.
    void _mm_print_epi16(__m128i v) @trusted
    {
        short[8] C = (cast(short8)v).array;
        printf("%d %d %d %d %d %d %d %d\n",
        C[0], C[1], C[2], C[3], C[4], C[5], C[6], C[7]);
    }

    /// Prints the vector as sixteen 8-bit integers.
    void _mm_print_epi8(__m128i v) @trusted
    {
        byte[16] C = (cast(byte16)v).array;
        printf("%d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d\n",
        C[0], C[1], C[2], C[3], C[4], C[5], C[6], C[7], C[8], C[9], C[10], C[11], C[12], C[13], C[14], C[15]);
    }

    /// Prints the vector as four floats.
    void _mm_print_ps(__m128 v) @trusted
    {
        float[4] C = (cast(float4)v).array;
        printf("%f %f %f %f\n", C[0], C[1], C[2], C[3]);
    }

    /// Prints the vector as two doubles.
    void _mm_print_pd(__m128d v) @trusted
    {
        double[2] C = (cast(double2)v).array;
        printf("%f %f\n", C[0], C[1]);
    }
}


//
//  <FLOATING-POINT COMPARISONS>
//
// Note: `ldc.simd` cannot express all nuances of FP comparisons, so we
//       need different IR generation.

/// Floating-point comparison predicates. The member names are also the
/// strings stored, in the same order, in `FPComparisonToString`.
enum FPComparison
{
    oeq,   // ordered and equal
    ogt,   // ordered and greater than
    oge,   // ordered and greater than or equal
    olt,   // ordered and less than
    ole,   // ordered and less than or equal
    one,   // ordered and not equal
    ord,   // ordered (no nans)
    ueq,   // unordered or equal
    ugt,   // unordered or greater than ("nle")
    uge,   // unordered or greater than or equal ("nlt")
    ult,   // unordered or less than ("nge")
    ule,   // unordered or less than or equal ("ngt")
    une,   // unordered or not equal ("neq")
    uno,   // unordered (either nans)
}

// Predicate names, indexed by FPComparison value; must stay in the same order as the enum.
private static immutable string[FPComparison.max+1] FPComparisonToString =
[
    "oeq",
    "ogt",
    "oge",
    "olt",
    "ole",
    "one",
    "ord",
    "ueq",
    "ugt",
    "uge",
    "ult",
    "ule",
    "une",
    "uno",
];

// Individual float comparison: returns `true` or `false` (vector callers map it to -1 or 0).
// Useful for DMD and testing
/// Evaluates one scalar floating-point comparison.
/// Params:
///     comparison = predicate to evaluate
///     a = left operand
///     b = right operand
/// Returns: `true` if the predicate holds. "Unordered" means that `a` or `b` is NaN.
private bool compareFloat(T)(FPComparison comparison, T a, T b) pure @safe
{
    bool unordered = isnan(a) || isnan(b);
    final switch(comparison) with(FPComparison)
    {
        case oeq: return a == b;
        case ogt: return a > b;
        case oge: return a >= b;
        case olt: return a < b;
        case ole: return a <= b;
        case one: return !unordered && (a != b); // NaN with != always yields true
        case ord: return !unordered;
        case ueq: return unordered || (a == b);
        case ugt: return unordered || (a > b);
        case uge: return unordered || (a >= b);
        case ult: return unordered || (a < b);
        case ule: return unordered || (a <= b);
        case une: return (a != b); // NaN with != always yields true
        case uno: return unordered;
    }
}

version(LDC)
{
    /// Provides packed float comparisons
    /// Each result lane is -1 (all bits set) when the predicate holds, else 0.
    package int4 cmpps(FPComparison comparison)(float4 a, float4 b) pure @safe
    {
        enum ir = `
            %cmp = fcmp `~ FPComparisonToString[comparison] ~` <4 x float> %0, %1
            %r = sext <4 x i1> %cmp to <4 x i32>
            ret <4 x i32> %r`;

        return LDCInlineIR!(ir, int4, float4, float4)(a, b);
    }

    /// Provides packed double comparisons
    /// Each result lane is -1 (all bits set) when the predicate holds, else 0.
    package long2 cmppd(FPComparison comparison)(double2 a, double2 b) pure @safe
    {
        enum ir = `
            %cmp = fcmp `~ FPComparisonToString[comparison] ~` <2 x double> %0, %1
            %r = sext <2 x i1> %cmp to <2 x i64>
            ret <2 x i64> %r`;

        return LDCInlineIR!(ir, long2, double2, double2)(a, b);
    }

    /// CMPSS-style comparisons
    /// Only lane 0 is compared; the other lanes are copied from `a`.
    /// clang implement it through x86 intrinsics, it is possible with IR alone
    /// but leads to less optimal code.
    /// PERF: try to implement it with __builtin_ia32_cmpss and immediate 0 to 7.
    /// Not that simple.
    package float4 cmpss(FPComparison comparison)(float4 a, float4 b) pure @safe
    {
        /*
        enum ubyte predicateNumber = FPComparisonToX86Predicate[comparison];
        enum bool invertOp = (predicateNumber & 0x80) != 0;
        static if(invertOp)
            return __builtin_ia32_cmpss(b, a, predicateNumber & 0x7f);
        else
            return __builtin_ia32_cmpss(a, b, predicateNumber & 0x7f);
        */
        enum ir = `
            %cmp = fcmp `~ FPComparisonToString[comparison] ~` float %0, %1
            %r = sext i1 %cmp to i32
            %r2 = bitcast i32 %r to float
            ret float %r2`;

        float4 r = a;
        r[0] = LDCInlineIR!(ir, float, float, float)(a[0], b[0]);
        return r;
    }

    /// CMPSD-style comparisons
    /// Only lane 0 is compared; the other lane is copied from `a`.
    /// clang implement it through x86 intrinsics, it is possible with IR alone
    /// but leads to less optimal code.
    /// PERF: try to implement it with __builtin_ia32_cmpsd and immediate 0 to 7.
    /// Not that simple.
    package double2 cmpsd(FPComparison comparison)(double2 a, double2 b) pure @safe
    {
        enum ir = `
            %cmp = fcmp `~ FPComparisonToString[comparison] ~` double %0, %1
            %r = sext i1 %cmp to i64
            %r2 = bitcast i64 %r to double
            ret double %r2`;

        double2 r = a;
        r[0] = LDCInlineIR!(ir, double, double, double)(a[0], b[0]);
        return r;
    }
}
else
{
    /// Provides packed float comparisons
    /// Each result lane is -1 (all bits set) when the predicate holds, else 0.
    package int4 cmpps(FPComparison comparison)(float4 a, float4 b) pure @trusted
    {
        int4 result;
        foreach(i; 0..4)
        {
            result.ptr[i] = compareFloat!float(comparison, a.array[i], b.array[i]) ? -1 : 0;
        }
        return result;
    }

    /// Provides packed double comparisons
    /// Each result lane is -1 (all bits set) when the predicate holds, else 0.
    package long2 cmppd(FPComparison comparison)(double2 a, double2 b) pure @trusted
    {
        long2 result;
        foreach(i; 0..2)
        {
            result.ptr[i] = compareFloat!double(comparison, a.array[i], b.array[i]) ? -1 : 0;
        }
        return result;
    }

    /// Provides CMPSS-style comparison
    /// Only lane 0 is compared; the other lanes are copied from `a`.
    package float4 cmpss(FPComparison comparison)(float4 a, float4 b) pure @trusted
    {
        int4 result = cast(int4)a;
        result.ptr[0] = compareFloat!float(comparison, a.array[0], b.array[0]) ? -1 : 0;
        return cast(float4)result;
    }

    /// Provides CMPSD-style comparison
    /// Only lane 0 is compared; the other lane is copied from `a`.
    package double2 cmpsd(FPComparison comparison)(double2 a, double2 b) pure @trusted
    {
        long2 result = cast(long2)a;
        result.ptr[0] = compareFloat!double(comparison, a.array[0], b.array[0]) ? -1 : 0;
        return cast(double2)result;
    }
}
unittest // cmpps
{
    // Check all comparison type is working
    float4 A = [1, 3, 5, float.nan];
    float4 B = [2, 3, 4, 5];

    int4 result_oeq = cmpps!(FPComparison.oeq)(A, B);
    int4 result_ogt = cmpps!(FPComparison.ogt)(A, B);
    int4 result_oge = cmpps!(FPComparison.oge)(A, B);
    int4 result_olt = cmpps!(FPComparison.olt)(A, B);
    int4 result_ole = cmpps!(FPComparison.ole)(A, B);
    int4 result_one = cmpps!(FPComparison.one)(A, B);
    int4 result_ord = cmpps!(FPComparison.ord)(A, B);
    int4 result_ueq = cmpps!(FPComparison.ueq)(A, B);
    int4 result_ugt = cmpps!(FPComparison.ugt)(A, B);
    int4 result_uge = cmpps!(FPComparison.uge)(A, B);
    int4 result_ult = cmpps!(FPComparison.ult)(A, B);
    int4 result_ule = cmpps!(FPComparison.ule)(A, B);
    int4 result_une = cmpps!(FPComparison.une)(A, B);
    int4 result_uno = cmpps!(FPComparison.uno)(A, B);

    static immutable int[4] correct_oeq    = [ 0,-1, 0, 0];
    static immutable int[4] correct_ogt    = [ 0, 0,-1, 0];
    static immutable int[4] correct_oge    = [ 0,-1,-1, 0];
    static immutable int[4] correct_olt    = [-1, 0, 0, 0];
    static immutable int[4] correct_ole    = [-1,-1, 0, 0];
    static immutable int[4] correct_one    = [-1, 0,-1, 0];
    static immutable int[4] correct_ord    = [-1,-1,-1, 0];
    static immutable int[4] correct_ueq    = [ 0,-1, 0,-1];
    static immutable int[4] correct_ugt    = [ 0, 0,-1,-1];
    static immutable int[4] correct_uge    = [ 0,-1,-1,-1];
    static immutable int[4] correct_ult    = [-1, 0, 0,-1];
    static immutable int[4] correct_ule    = [-1,-1, 0,-1];
    static immutable int[4] correct_une    = [-1, 0,-1,-1];
    static immutable int[4] correct_uno    = [ 0, 0, 0,-1];

    assert(result_oeq.array == correct_oeq);
    assert(result_ogt.array == correct_ogt);
    assert(result_oge.array == correct_oge);
    assert(result_olt.array == correct_olt);
    assert(result_ole.array == correct_ole);
    assert(result_one.array == correct_one);
    assert(result_ord.array == correct_ord);
    assert(result_ueq.array == correct_ueq);
    assert(result_ugt.array == correct_ugt);
    assert(result_uge.array == correct_uge);
    assert(result_ult.array == correct_ult);
    assert(result_ule.array == correct_ule);
    assert(result_une.array == correct_une);
    assert(result_uno.array == correct_uno);
}
unittest
{
    double2 a = [1, 3];
    double2 b = [2, 3];
    long2 c = cmppd!(FPComparison.ult)(a, b);
    static immutable long[2] correct = [cast(long)(-1), 0];
    assert(c.array == correct);
}
unittest // cmpss
{
    // Checks lane 0 against the scalar reference and that lanes 1..3 come from A.
    void testComparison(FPComparison comparison)(float4 A, float4 B)
    {
        float4 result = cmpss!comparison(A, B);
        int4 iresult = cast(int4)result;
        int expected = compareFloat!float(comparison, A.array[0], B.array[0]) ? -1 : 0;
        assert(iresult.array[0] == expected);
        assert(result.array[1] == A.array[1]);
        assert(result.array[2] == A.array[2]);
        assert(result.array[3] == A.array[3]);
    }

    // Check all comparison type is working
    float4 A = [1, 3, 5, 6];
    float4 B = [2, 3, 4, 5];
    float4 C = [float.nan, 3, 4, 5];

    testComparison!(FPComparison.oeq)(A, B);
    testComparison!(FPComparison.oeq)(A, C);
    testComparison!(FPComparison.ogt)(A, B);
    testComparison!(FPComparison.ogt)(A, C);
    testComparison!(FPComparison.oge)(A, B);
    testComparison!(FPComparison.oge)(A, C);
    testComparison!(FPComparison.olt)(A, B);
    testComparison!(FPComparison.olt)(A, C);
    testComparison!(FPComparison.ole)(A, B);
    testComparison!(FPComparison.ole)(A, C);
    testComparison!(FPComparison.one)(A, B);
    testComparison!(FPComparison.one)(A, C);
    testComparison!(FPComparison.ord)(A, B);
    testComparison!(FPComparison.ord)(A, C);
    testComparison!(FPComparison.ueq)(A, B);
    testComparison!(FPComparison.ueq)(A, C);
    testComparison!(FPComparison.ugt)(A, B);
    testComparison!(FPComparison.ugt)(A, C);
    testComparison!(FPComparison.uge)(A, B);
    testComparison!(FPComparison.uge)(A, C);
    testComparison!(FPComparison.ult)(A, B);
    testComparison!(FPComparison.ult)(A, C);
    testComparison!(FPComparison.ule)(A, B);
    testComparison!(FPComparison.ule)(A, C);
    testComparison!(FPComparison.une)(A, B);
    testComparison!(FPComparison.une)(A, C);
    testComparison!(FPComparison.uno)(A, B);
    testComparison!(FPComparison.uno)(A, C);
}
unittest // cmpsd
{
    // Checks lane 0 against the scalar reference and that lane 1 comes from A.
    void testComparison(FPComparison comparison)(double2 A, double2 B)
    {
        double2 result = cmpsd!comparison(A, B);
        long2 iresult = cast(long2)result;
        long expected = compareFloat!double(comparison, A.array[0], B.array[0]) ? -1 : 0;
        assert(iresult.array[0] == expected);
        assert(result.array[1] == A.array[1]);
    }

    // Check all comparison type is working
    double2 A = [1, 3];
    double2 B = [2, 4];
    double2 C = [double.nan, 5];

    testComparison!(FPComparison.oeq)(A, B);
    testComparison!(FPComparison.oeq)(A, C);
    testComparison!(FPComparison.ogt)(A, B);
    testComparison!(FPComparison.ogt)(A, C);
    testComparison!(FPComparison.oge)(A, B);
    testComparison!(FPComparison.oge)(A, C);
    testComparison!(FPComparison.olt)(A, B);
    testComparison!(FPComparison.olt)(A, C);
    testComparison!(FPComparison.ole)(A, B);
    testComparison!(FPComparison.ole)(A, C);
    testComparison!(FPComparison.one)(A, B);
    testComparison!(FPComparison.one)(A, C);
    testComparison!(FPComparison.ord)(A, B);
    testComparison!(FPComparison.ord)(A, C);
    testComparison!(FPComparison.ueq)(A, B);
    testComparison!(FPComparison.ueq)(A, C);
    testComparison!(FPComparison.ugt)(A, B);
    testComparison!(FPComparison.ugt)(A, C);
    testComparison!(FPComparison.uge)(A, B);
    testComparison!(FPComparison.uge)(A, C);
    testComparison!(FPComparison.ult)(A, B);
    testComparison!(FPComparison.ult)(A, C);
    testComparison!(FPComparison.ule)(A, B);
    testComparison!(FPComparison.ule)(A, C);
    testComparison!(FPComparison.une)(A, B);
    testComparison!(FPComparison.une)(A, C);
    testComparison!(FPComparison.uno)(A, B);
    testComparison!(FPComparison.uno)(A, C);
}

//
//  </FLOATING-POINT COMPARISONS>
//


/// Returns the low 64 bits of `a` as an `__m64`.
__m64 to_m64(__m128i a) pure @trusted
{
    long2 la = cast(long2)a;
    long1 r = la.array[0];
    return r;
}

/// Returns an `__m128i` whose low 64 bits are `a` and whose high 64 bits are zero.
__m128i to_m128i(__m64 a) pure @trusted
{
  /* Not sufficient to avoid https://issues.dlang.org/show_bug.cgi?id=21474

    version(DigitalMars) // Workaround for https://issues.dlang.org/show_bug.cgi?id=21474
    {
        long2 r = a.array[0];
        r.ptr[1] = 0;
        return cast(int4)r;
    }
    else */
    {
        long2 r = [0, 0];
        r.ptr[0] = a.array[0];
        return cast(__m128i)r;
    }
}

// ADDITIONAL x86 INTRINSICS
// Absent from ldc.gccbuiltins_x86 for some reason, but needed.
// https://github.com/ldc-developers/llvm-project/blob/ldc-release/12.x/llvm/include/llvm/IR/IntrinsicsX86.td
static if (LDC_with_SSE41)
{
    pragma(LDC_intrinsic, "llvm.x86.sse41.pblendvb")
        byte16 __builtin_ia32_pblendvb(byte16, byte16, byte16) pure @safe;
}

// SOME NEON INTRINSICS
// Emulating some x86 intrinsics needs access to a range of ARM intrinsics.
// Not in the public API but the simde project expose it all for the user to use.
// MAYDO: create a new neon.d module, for internal use only.
// MAYDO: port them to ARM32 so that ARM32 can be as fast as ARM64.
static if (LDC_with_ARM64)
{
    // VERY USEFUL LINK
    // https://github.com/ldc-developers/llvm-project/blob/ldc-release/11.x/llvm/include/llvm/IR/IntrinsicsAArch64.td
    // Also: https://developer.arm.com/architectures/instruction-sets/intrinsics/

    pragma(LDC_intrinsic, "llvm.aarch64.neon.uabd.v16i8")
        byte16 vabdq_u8(byte16 a, byte16 b) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.abs.v8i16")
        short8 vabsq_s16(short8 a) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.abs.v4i32")
        int4 vabsq_s32(int4 a) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.abs.v16i8")
        byte16 vabsq_s8(byte16 a) pure @safe;

    /// Bitwise AND of two 8-byte vectors.
    byte8 vand_u8(byte8 a, byte8 b) pure @safe
    {
        return a & b;
    }

    /// Bitwise AND of two 2 x 64-bit vectors.
    // `pure @safe` added for consistency with the sibling helpers, so that this
    // one can also be called from `pure @safe` code.
    long2 vandq_s64(long2 a, long2 b) pure @safe
    {
        return a & b;
    }

    /// Bit clear: returns `a & ~b`.
    long2 vbicq_s64(long2 a, long2 b) pure @safe
    {
        return a & ~b;
    }

    /// Bitwise select: takes bits of `b` where `a` is set, bits of `c` elsewhere.
    int4 vbslq_s32(int4 a, int4 b, int4 c) pure @safe
    {
        return c ^ ((c ^ b) & a);
    }

    /// Bitwise select: takes bits of `b` where `a` is set, bits of `c` elsewhere.
    byte16 vbslq_s8(byte16 a, byte16 b, byte16 c) pure @safe
    {
        return c ^ ((c ^ b) & a);
    }

    /// Bitwise select: takes bits of `b` where `a` is set, bits of `c` elsewhere.
    long2 vbslq_s64(long2 a, long2 b, long2 c) pure @safe
    {
        return c ^ ((c ^ b) & a);
    }

    /// Concatenates two 4 x 16-bit vectors: `lo` fills lanes 0..3, `hi` lanes 4..7.
    short8 vcombine_s16(short4 lo, short4 hi) pure @trusted
    {
        short8 r;
        r.ptr[0]  = lo.array[0];
        r.ptr[1]  = lo.array[1];
        r.ptr[2]  = lo.array[2];
        r.ptr[3]  = lo.array[3];
        r.ptr[4]  = hi.array[0];
        r.ptr[5]  = hi.array[1];
        r.ptr[6]  = hi.array[2];
        r.ptr[7]  = hi.array[3];
        return r;
    }

    /// Concatenates two 2 x 32-bit vectors: `lo` fills lanes 0..1, `hi` lanes 2..3.
    int4 vcombine_s32(int2 lo, int2 hi) pure @trusted
    {
        int4 r;
        r.ptr[0] = lo.array[0];
        r.ptr[1] = lo.array[1];
        r.ptr[2] = hi.array[0];
        r.ptr[3] = hi.array[1];
        return r;
    }

    /// Concatenates two 8 x 8-bit vectors: `lo` fills lanes 0..7, `hi` lanes 8..15.
    byte16 vcombine_s8(byte8 lo, byte8 hi) pure @trusted
    {
        byte16 r;
        r.ptr[0]  = lo.array[0];
        r.ptr[1]  = lo.array[1];
        r.ptr[2]  = lo.array[2];
        r.ptr[3]  = lo.array[3];
        r.ptr[4]  = lo.array[4];
        r.ptr[5]  = lo.array[5];
        r.ptr[6]  = lo.array[6];
        r.ptr[7]  = lo.array[7];
        r.ptr[8]  = hi.array[0];
        r.ptr[9]  = hi.array[1];
        r.ptr[10] = hi.array[2];
        r.ptr[11] = hi.array[3];
        r.ptr[12] = hi.array[4];
        r.ptr[13] = hi.array[5];
        r.ptr[14] = hi.array[6];
        r.ptr[15] = hi.array[7];
        return r;
    }

    /// Concatenates two 4 x 16-bit vectors: `lo` fills lanes 0..3, `hi` lanes 4..7.
    short8 vcombine_u16(short4 lo, short4 hi) pure @trusted
    {
        short8 r;
        r.ptr[0]  = lo.array[0];
        r.ptr[1]  = lo.array[1];
        r.ptr[2]  = lo.array[2];
        r.ptr[3]  = lo.array[3];
        r.ptr[4]  = hi.array[0];
        r.ptr[5]  = hi.array[1];
        r.ptr[6]  = hi.array[2];
        r.ptr[7]  = hi.array[3];
        return r;
    }


    // float4 => int4

    pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtms.v4i32.v4f32")
        int4 vcvtmq_s32_f32(float4 a) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtns.v4i32.v4f32")
        int4 vcvtnq_s32_f32(float4 a) pure @safe;

    pragma(LDC_intrinsic,
"llvm.aarch64.neon.fcvtps.v4i32.v4f32") 1216 int4 vcvtpq_s32_f32(float4 a) pure @safe; 1217 1218 pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtzs.v4i32.v4f32") 1219 int4 vcvtzq_s32_f32(float4 a) pure @safe; 1220 1221 1222 // double2 => long2 1223 1224 pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtms.v2i64.v2f64") 1225 long2 vcvtmq_s64_f64(double2 a) pure @safe; 1226 1227 pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtns.v2i64.v2f64") 1228 long2 vcvtnq_s64_f64(double2 a) pure @safe; 1229 1230 pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtps.v2i64.v2f64") 1231 long2 vcvtpq_s64_f64(double2 a) pure @safe; 1232 1233 pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtzs.v2i64.v2f64") 1234 long2 vcvtzq_s64_f64(double2 a) pure @safe; 1235 1236 pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtms.i32.f32") 1237 int vcvtms_s32_f32(float a) pure @safe; 1238 1239 pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtns.i32.f32") 1240 int vcvtns_s32_f32(float a) pure @safe; 1241 1242 pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtps.i32.f32") 1243 int vcvtps_s32_f32(float a) pure @safe; 1244 1245 pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtzs.i32.f32") 1246 int vcvts_s32_f32(float a) pure @safe; 1247 1248 pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtms.i32.f64") 1249 int vcvtms_s32_f64(double a) pure @safe; 1250 1251 pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtns.i32.f64") 1252 int vcvtns_s32_f64(double a) pure @safe; 1253 1254 pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtps.i32.f64") 1255 int vcvtps_s32_f64(double a) pure @safe; 1256 1257 pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtzs.i32.f64") 1258 int vcvts_s32_f64(double a) pure @safe; 1259 1260 pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtms.i64.f32") 1261 long vcvtms_s64_f32(float a) pure @safe; 1262 1263 pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtns.i64.f32") 1264 long vcvtns_s64_f32(float a) pure @safe; 1265 1266 pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtps.i64.f32") 1267 long vcvtps_s64_f32(float a) pure @safe; 1268 1269 
pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtzs.i64.f32") 1270 long vcvts_s64_f32(float a) pure @safe; 1271 1272 pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtms.i64.f64") 1273 long vcvtms_s64_f64(double a) pure @safe; 1274 1275 pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtns.i64.f64") 1276 long vcvtns_s64_f64(double a) pure @safe; 1277 1278 pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtps.i64.f64") 1279 long vcvtps_s64_f64(double a) pure @safe; // Note: technically should be named vcvtpd_s64_f64 1280 1281 pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtzs.i64.f64") 1282 long vcvts_s64_f64(double a) pure @safe; 1283 1284 long2 vdupq_n_s64(long value) pure @safe 1285 { 1286 long2 r; 1287 r = value; 1288 return r; 1289 } 1290 1291 short4 vget_high_s16(short8 a) pure @trusted 1292 { 1293 short4 r; 1294 r.ptr[0] = a.array[4]; 1295 r.ptr[1] = a.array[5]; 1296 r.ptr[2] = a.array[6]; 1297 r.ptr[3] = a.array[7]; 1298 return r; 1299 } 1300 1301 int2 vget_high_s32(int4 a) pure @trusted 1302 { 1303 int2 r; 1304 r.ptr[0] = a.array[2]; 1305 r.ptr[1] = a.array[3]; 1306 return r; 1307 } 1308 1309 byte8 vget_high_u8(byte16 a) pure @trusted 1310 { 1311 byte8 r; 1312 r.ptr[0] = a.array[8]; 1313 r.ptr[1] = a.array[9]; 1314 r.ptr[2] = a.array[10]; 1315 r.ptr[3] = a.array[11]; 1316 r.ptr[4] = a.array[12]; 1317 r.ptr[5] = a.array[13]; 1318 r.ptr[6] = a.array[14]; 1319 r.ptr[7] = a.array[15]; 1320 return r; 1321 } 1322 1323 short4 vget_low_s16(short8 a) pure @trusted 1324 { 1325 short4 r; 1326 r.ptr[0] = a.array[0]; 1327 r.ptr[1] = a.array[1]; 1328 r.ptr[2] = a.array[2]; 1329 r.ptr[3] = a.array[3]; 1330 return r; 1331 } 1332 1333 int2 vget_low_s32(int4 a) pure @trusted 1334 { 1335 int2 r; 1336 r.ptr[0] = a.array[0]; 1337 r.ptr[1] = a.array[1]; 1338 return r; 1339 } 1340 1341 byte8 vget_low_u8(byte16 a) pure @trusted 1342 { 1343 byte8 r; 1344 r.ptr[0] = a.array[0]; 1345 r.ptr[1] = a.array[1]; 1346 r.ptr[2] = a.array[2]; 1347 r.ptr[3] = a.array[3]; 1348 r.ptr[4] = a.array[4]; 1349 r.ptr[5] = 
a.array[5]; 1350 r.ptr[6] = a.array[6]; 1351 r.ptr[7] = a.array[7]; 1352 return r; 1353 } 1354 1355 long vgetq_lane_s64(long2 v, const int lane) pure @safe 1356 { 1357 return v.array[lane]; 1358 } 1359 1360 pragma(LDC_intrinsic, "llvm.aarch64.neon.smax.v8i16") 1361 short8 vmaxq_s16(short8 a, short8 b) pure @safe; 1362 1363 int4 vmaxq_s32(int4 a, int4 b) 1364 { 1365 int4 r; 1366 r[0] = a[0] >= b[0] ? a[0] : b[0]; 1367 r[1] = a[1] >= b[1] ? a[1] : b[1]; 1368 r[2] = a[2] >= b[2] ? a[2] : b[2]; 1369 r[3] = a[3] >= b[3] ? a[3] : b[3]; 1370 return r; 1371 } 1372 1373 pragma(LDC_intrinsic, "llvm.aarch64.neon.smin.v8i16") 1374 short8 vminq_s16(short8 a, short8 b) pure @safe; 1375 1376 int2 vmovn_s64(long2 a) pure @trusted 1377 { 1378 int2 r; 1379 r.ptr[0] = cast(int)(a.array[0]); 1380 r.ptr[1] = cast(int)(a.array[1]); 1381 return r; 1382 } 1383 1384 int4 vmull_s16(short4 a, short4 b) pure @trusted 1385 { 1386 int4 r; 1387 r.ptr[0] = a.array[0] * b.array[0]; 1388 r.ptr[1] = a.array[1] * b.array[1]; 1389 r.ptr[2] = a.array[2] * b.array[2]; 1390 r.ptr[3] = a.array[3] * b.array[3]; 1391 return r; 1392 } 1393 1394 pragma(LDC_intrinsic, "llvm.aarch64.neon.smull.v2i64") 1395 long2 vmull_s32(int2 a, int2 b) pure @safe; 1396 1397 pragma(LDC_intrinsic, "llvm.aarch64.neon.addp.v4i16") 1398 short4 vpadd_s16(short4 a, short4 b) pure @safe; 1399 1400 pragma(LDC_intrinsic, "llvm.aarch64.neon.addp.v2i32") 1401 int2 vpadd_s32(int2 a, int2 b) pure @safe; 1402 1403 pragma(LDC_intrinsic, "llvm.aarch64.neon.addp.v8i8") 1404 byte8 vpadd_u8(byte8 a, byte8 b) pure @safe; 1405 1406 pragma(LDC_intrinsic, "llvm.aarch64.neon.uaddlp.v8i16.v16i8") 1407 short8 vpaddlq_u8 (byte16 a) pure @safe; 1408 1409 static if(__VERSION__ >= 2088) // LDC 1.18 start using LLVM9 who changes the name of the builtin 1410 { 1411 pragma(LDC_intrinsic, "llvm.aarch64.neon.faddp.v4f32") 1412 float4 vpaddq_f32(float4 a, float4 b) pure @safe; 1413 } 1414 else 1415 { 1416 pragma(LDC_intrinsic, "llvm.aarch64.neon.addp.v4f32") 
1417 float4 vpaddq_f32(float4 a, float4 b) pure @safe; 1418 } 1419 1420 pragma(LDC_intrinsic, "llvm.aarch64.neon.addp.v8i16") 1421 short8 vpaddq_s16(short8 a, short8 b) pure @safe; 1422 1423 pragma(LDC_intrinsic, "llvm.aarch64.neon.addp.v16i8") 1424 byte16 vpaddq_s8(byte16 a, byte16 b) pure @safe; 1425 1426 pragma(LDC_intrinsic, "llvm.aarch64.neon.addp.v4i32") 1427 int4 vpaddq_s32(int4 a, int4 b) pure @safe; 1428 1429 pragma(LDC_intrinsic, "llvm.aarch64.neon.sqadd.v4i16") 1430 short4 vqadd_s16(short4 a, short4 b) pure @safe; 1431 1432 pragma(LDC_intrinsic, "llvm.aarch64.neon.sqadd.v8i16") 1433 short8 vqaddq_s16(short8 a, short8 b) pure @safe; 1434 1435 pragma(LDC_intrinsic, "llvm.aarch64.neon.sqxtn.v8i8") 1436 byte8 vqmovn_s16(short8 a) pure @safe; 1437 1438 pragma(LDC_intrinsic, "llvm.aarch64.neon.sqxtn.v4i16") 1439 short4 vqmovn_s32(int4 a) pure @safe; 1440 1441 pragma(LDC_intrinsic, "llvm.aarch64.neon.uqxtn.v4i16") 1442 short4 vqmovn_u32(int4 a) pure @safe; 1443 1444 pragma(LDC_intrinsic, "llvm.aarch64.neon.sqxtun.v8i8") 1445 byte8 vqmovun_s16(short8 a) pure @safe; 1446 1447 pragma(LDC_intrinsic, "llvm.aarch64.neon.sqsub.v4i16") 1448 short4 vqsub_s16(short4 a, short4 b) pure @safe; 1449 1450 pragma(LDC_intrinsic, "llvm.aarch64.neon.sqsub.v8i16") 1451 short8 vqsubq_s16(short8 a, short8 b) pure @safe; 1452 1453 pragma(LDC_intrinsic, "llvm.aarch64.neon.tbl1.v16i8") 1454 byte16 vqtbl1q_s8(byte16 t, byte16 idx) pure @safe; 1455 1456 pragma(LDC_intrinsic, "llvm.aarch64.neon.urhadd.v16i8") 1457 byte16 vrhadd_u8(byte16 a, byte16 b) pure @safe; 1458 1459 pragma(LDC_intrinsic, "llvm.aarch64.neon.urhadd.v8i16") 1460 short8 vrhadd_u16(short8 a, short8 b) pure @safe; 1461 1462 pragma(LDC_intrinsic, "llvm.aarch64.neon.rshrn.v4i16") 1463 short4 vrshrn_n_s32(int4 a, int n) pure @safe; 1464 1465 byte8 vshr_u8(byte8 a, byte8 b) pure @safe 1466 { 1467 return a >>> b; 1468 } 1469 1470 byte16 vshrq_n_s8(byte16 a, byte r) pure @safe 1471 { 1472 a = a >> byte16(cast(byte)r); 1473 
return a; 1474 } 1475 1476 pragma(LDC_intrinsic, "llvm.aarch64.neon.tbl1.v8i8") 1477 byte8 vtbl1_s8(byte16 t, byte8 idx) pure @safe; 1478 } 1479 1480 version(unittest) 1481 { 1482 double abs_double(double x) @trusted 1483 { 1484 version(LDC) 1485 return llvm_fabs(x); 1486 else 1487 { 1488 long uf = *cast(long*)(&x); 1489 uf &= 0x7fffffff_ffffffff; 1490 return *cast(double*)(&uf); 1491 } 1492 } 1493 } 1494 1495 // needed because in old GDC from travis, core.stdc.math.isnan isn't pure 1496 1497 bool isnan(float x) pure @trusted 1498 { 1499 uint u = *cast(uint*)(&x); 1500 bool result = ((u & 0x7F800000) == 0x7F800000) && (u & 0x007FFFFF); 1501 return result; 1502 } 1503 unittest 1504 { 1505 float x = float.nan; 1506 assert(isnan(x)); 1507 1508 x = 0; 1509 assert(!isnan(x)); 1510 1511 x = float.infinity; 1512 assert(!isnan(x)); 1513 } 1514 1515 bool isnan(double x) pure @trusted 1516 { 1517 ulong u = *cast(ulong*)(&x); 1518 return ((u & 0x7FF00000_00000000) == 0x7FF00000_00000000) && (u & 0x000FFFFF_FFFFFFFF); 1519 } 1520 unittest 1521 { 1522 double x = double.nan; 1523 assert(isnan(x)); 1524 1525 x = 0; 1526 assert(!isnan(x)); 1527 1528 x = double.infinity; 1529 assert(!isnan(x)); 1530 }