/**
 * Internal stuff only, do not import.
 *
 * Copyright: Copyright Guillaume Piolat 2016-2020, Stefanos Baziotis 2019.
 * License:   $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
 */
module inteli.internals;

import inteli.types;

// The only math function needed for intel-intrinsics
public import core.math: sqrt; // since it's an intrinsic

package:
nothrow:
@nogc:


version(GNU)
{
    version (X86)
    {
        // For 32-bit x86, disable vector extensions with GDC.
        // It just doesn't work well.
        enum GDC_with_x86 = true;
        enum GDC_with_MMX = false;
        enum GDC_with_SSE = false;
        enum GDC_with_SSE2 = false;
        enum GDC_with_SSE3 = false;
        enum GDC_with_SSSE3 = false;
        enum GDC_with_SSE41 = false;
        enum GDC_with_SSE42 = false;
        enum GDC_with_AVX = false;
        enum GDC_with_AVX2 = false;
        enum GDC_with_SHA = false;
        enum GDC_with_BMI2 = false;
    }
    else version (X86_64)
    {
        // GDC support uses extended inline assembly:
        //   https://gcc.gnu.org/onlinedocs/gcc/Extended-Asm.html        (general information and hints)
        //   https://gcc.gnu.org/onlinedocs/gcc/Simple-Constraints.html  (binding variables to registers)
        //   https://gcc.gnu.org/onlinedocs/gcc/Machine-Constraints.html (x86 specific register short names)

        public import core.simd: byte16, short8, int4, float4, double2;

        // NOTE: These intrinsics are not available in every i386 and x86_64 CPU.
        // For more info: https://gcc.gnu.org/onlinedocs/gcc-4.9.2/gcc/X86-Built-in-Functions.html
        public import gcc.builtins;

        enum GDC_with_x86 = true;
        enum GDC_with_MMX = true;  // We don't have a way to detect that at CT, but we assume it's there
        enum GDC_with_SSE = true;  // We don't have a way to detect that at CT, but we assume it's there
        enum GDC_with_SSE2 = true; // We don't have a way to detect that at CT, but we assume it's there

        enum GDC_with_SSE3 = false;  // TODO: we don't have a way to detect that at CT
        enum GDC_with_SSSE3 = false; // TODO: we don't have a way to detect that at CT
        enum GDC_with_SSE41 = false; // TODO: we don't have a way to detect that at CT
        enum GDC_with_SSE42 = false; // TODO: we don't have a way to detect that at CT
        enum GDC_with_AVX = false;   // TODO: we don't have a way to detect that at CT
        enum GDC_with_AVX2 = false;  // TODO: we don't have a way to detect that at CT
        enum GDC_with_SHA = false;
        enum GDC_with_BMI2 = false;
    }
    else
    {
        enum GDC_with_x86 = false;
        enum GDC_with_MMX = false;
        enum GDC_with_SSE = false;
        enum GDC_with_SSE2 = false;
        enum GDC_with_SSE3 = false;
        enum GDC_with_SSSE3 = false;
        enum GDC_with_SSE41 = false;
        enum GDC_with_SSE42 = false;
        enum GDC_with_AVX = false;
        enum GDC_with_AVX2 = false;
        enum GDC_with_SHA = false;
        enum GDC_with_BMI2 = false;
    }
}
else
{
    enum GDC_with_x86 = false;
    enum GDC_with_MMX = false;
    enum GDC_with_SSE = false;
    enum GDC_with_SSE2 = false;
    enum GDC_with_SSE3 = false;
    enum GDC_with_SSSE3 = false;
    enum GDC_with_SSE41 = false;
    enum GDC_with_SSE42 = false;
    enum GDC_with_AVX = false;
    enum GDC_with_AVX2 = false;
    enum GDC_with_SHA = false;
    enum GDC_with_BMI2 = false;
}

version(LDC)
{
    public import core.simd;
    public import ldc.simd;
    public import ldc.intrinsics;
    public import ldc.llvmasm: __asm;

    // Since LDC 1.13, we use the new ldc.llvmasm.__ir variants instead of inlineIR
    static if (__VERSION__ >= 2083)
    {
        import ldc.llvmasm;
        alias LDCInlineIR = __ir_pure;

        // A version of inline IR with prefix/suffix didn't exist before LDC 1.13
        alias LDCInlineIREx = __irEx_pure;
    }
    else
    {
        alias LDCInlineIR = inlineIR;
    }

    version(ARM)
    {
        public import ldc.gccbuiltins_arm;
        enum LDC_with_ARM32 = true;
        enum LDC_with_ARM64 = false;
        enum LDC_with_ARM64_CRC = false;
        enum LDC_with_SSE1 = false;
        enum LDC_with_SSE2 = false;
        enum LDC_with_SSE3 = false;
        enum LDC_with_SSSE3 = false;
        enum LDC_with_SSE41 = false;
        enum LDC_with_SSE42 = false;
        enum LDC_with_AVX = false;
        enum LDC_with_AVX2 = false;
        enum LDC_with_SHA = false;
        enum LDC_with_BMI2 = false;
    }
    else version(AArch64)
    {
        enum LDC_with_ARM32 = false;
        enum LDC_with_ARM64 = true; // implies "has Neon"
        enum LDC_with_ARM64_CRC = __traits(targetHasFeature, "crc");
        enum LDC_with_SSE1 = false;
        enum LDC_with_SSE2 = false;
        enum LDC_with_SSE3 = false;
        enum LDC_with_SSSE3 = false;
        enum LDC_with_SSE41 = false;
        enum LDC_with_SSE42 = false;
        enum LDC_with_AVX = false;
        enum LDC_with_AVX2 = false;
        enum LDC_with_SHA = false;
        enum LDC_with_BMI2 = false;
    }
    else
    {
        public import ldc.gccbuiltins_x86;
        enum LDC_with_ARM32 = false;
        enum LDC_with_ARM64 = false;
        enum LDC_with_ARM64_CRC = false;
        enum LDC_with_SSE1 = __traits(targetHasFeature, "sse");
        enum LDC_with_SSE2 = __traits(targetHasFeature, "sse2");
        enum LDC_with_SSE3 = __traits(targetHasFeature, "sse3");
        enum LDC_with_SSSE3 = __traits(targetHasFeature, "ssse3");
        enum LDC_with_SSE41 = __traits(targetHasFeature, "sse4.1");
        enum LDC_with_SSE42 = __traits(targetHasFeature, "sse4.2");
        enum LDC_with_AVX = __traits(targetHasFeature, "avx");
        enum LDC_with_AVX2 = __traits(targetHasFeature, "avx2");
        enum LDC_with_SHA = __traits(targetHasFeature, "sha");
        enum LDC_with_BMI2 = __traits(targetHasFeature, "bmi2");
    }
}
else
{
    enum LDC_with_ARM32 = false;
    enum LDC_with_ARM64 = false;
    enum LDC_with_ARM64_CRC = false;
    enum LDC_with_SSE1 = false;
    enum LDC_with_SSE2 = false;
    enum LDC_with_SSE3 = false;
    enum LDC_with_SSSE3 = false;
    enum LDC_with_SSE41 = false;
    enum LDC_with_SSE42 = false;
    enum LDC_with_AVX = false;
    enum LDC_with_AVX2 = false;
    enum LDC_with_SHA = false;
    enum LDC_with_BMI2 = false;
}

enum LDC_with_ARM = LDC_with_ARM32 | LDC_with_ARM64;

version(DigitalMars)
{
    version(D_InlineAsm_X86)
        enum DMD_with_asm = true;
    else version(D_InlineAsm_X86_64)
        enum DMD_with_asm = true;
    else
        enum DMD_with_asm = false;

    version(D_InlineAsm_X86)
        enum DMD_with_32bit_asm = DMD_with_asm; // sometimes you want a 32-bit DMD only solution
    else
        enum DMD_with_32bit_asm = false;

    version (D_SIMD)
        enum DMD_with_DSIMD = !SSESizedVectorsAreEmulated;
    else
        enum DMD_with_DSIMD = false;
}
else
{
    enum DMD_with_asm = false;
    enum DMD_with_32bit_asm = false;
    enum DMD_with_DSIMD = false;
}

static if (LDC_with_ARM32)
{
    package uint arm_get_fpcr() nothrow @nogc @trusted
    {
        return __builtin_arm_get_fpscr();
    }

    package void arm_set_fpcr(uint cw) nothrow @nogc @trusted
    {
        __builtin_arm_set_fpscr(cw);
    }
}

static if (LDC_with_ARM64)
{
    pragma(LDC_intrinsic, "llvm.aarch64.get.fpcr")
    long __builtin_aarch64_get_fpcr() pure nothrow @nogc @safe;

    package uint arm_get_fpcr() pure nothrow @nogc @trusted
    {
        // LLVM intrinsic "llvm.aarch64.get.fpcr" seems buggy and doesn't return FPCR
        return __asm!uint("mrs $0, fpcr", "=r");
    }

    package void arm_set_fpcr(uint cw) nothrow @nogc @trusted
    {
        // Note: there doesn't seem to be an intrinsic in LLVM to set FPCR.
        long save_x2;
        __asm!void("str x2, $1 \n" ~
                   "ldr w2, $0 \n" ~
                   "msr fpcr, x2 \n" ~
                   "ldr x2, $1 ", "m,m", cw, &save_x2);
    }
}


// For internal use only, since the public API deals with x86 semantic emulation
enum uint _MM_ROUND_NEAREST_ARM     = 0x00000000;
enum uint _MM_ROUND_DOWN_ARM        = 0x00800000;
enum uint _MM_ROUND_UP_ARM          = 0x00400000;
enum uint _MM_ROUND_TOWARD_ZERO_ARM = 0x00C00000;
enum uint _MM_ROUND_MASK_ARM        = 0x00C00000;
enum uint _MM_FLUSH_ZERO_MASK_ARM   = 0x01000000;


//
// <ROUNDING>
//
// Why is this here? For DMD, we cannot use rint because _MM_SET_ROUNDING_MODE
// doesn't change the FPU rounding mode, and isn't expected to do so.
// So we devised these rounding functions to provide consistent rounding between
// LDC and DMD. It's important that DMD uses whatever is in MXCSR to round.
//
// Note: there is no MXCSR in ARM, but fpcr/fpscr implements similar
// functionality.
// https://developer.arm.com/documentation/dui0068/b/vector-floating-point-programming/vfp-system-registers/fpscr--the-floating-point-status-and-control-register
// We use fpcr/fpscr since it's thread-local, so we can emulate those x86 conversions, albeit slowly.

int convertFloatToInt32UsingMXCSR(float value) @trusted
{
    int result;
    version(GNU)
    {
        asm pure nothrow @nogc @trusted
        {
            "cvtss2si %1, %0\n": "=r"(result) : "x" (value);
        }
    }
    else static if (LDC_with_ARM32)
    {
        // TODO: this is a bug, it won't preserve registers when optimized
        result = __asm!int(`vldr s2, $1
                            vcvtr.s32.f32 s2, s2
                            vmov $0, s2`, "=r,m", value);
    }
    else static if (LDC_with_ARM64)
    {
        // Get current rounding mode.
        uint fpscr = arm_get_fpcr();

        switch(fpscr & _MM_ROUND_MASK_ARM)
        {
            default:
            case _MM_ROUND_NEAREST_ARM:     result = vcvtns_s32_f32(value); break;
            case _MM_ROUND_DOWN_ARM:        result = vcvtms_s32_f32(value); break;
            case _MM_ROUND_UP_ARM:          result = vcvtps_s32_f32(value); break;
            case _MM_ROUND_TOWARD_ZERO_ARM: result = vcvts_s32_f32(value);  break;
        }
    }
    else
    {
        asm pure nothrow @nogc @trusted
        {
            cvtss2si EAX, value;
            mov result, EAX;
        }
    }
    return result;
}

int convertDoubleToInt32UsingMXCSR(double value) @trusted
{
    int result;
    version(GNU)
    {
        asm pure nothrow @nogc @trusted
        {
            "cvtsd2si %1, %0\n": "=r"(result) : "x" (value);
        }
    }
    else static if (LDC_with_ARM32)
    {
        // TODO: bug, doesn't preserve registers
        result = __asm!int(`vldr d2, $1
                            vcvtr.s32.f64 s2, d2
                            vmov $0, s2`, "=r,m", value);
    }
    else static if (LDC_with_ARM64)
    {
        // Get current rounding mode.
        uint fpscr = arm_get_fpcr();

        switch(fpscr & _MM_ROUND_MASK_ARM)
        {
            default:
            case _MM_ROUND_NEAREST_ARM:     result = vcvtns_s32_f64(value); break;
            case _MM_ROUND_DOWN_ARM:        result = vcvtms_s32_f64(value); break;
            case _MM_ROUND_UP_ARM:          result = vcvtps_s32_f64(value); break;
            case _MM_ROUND_TOWARD_ZERO_ARM: result = vcvts_s32_f64(value);  break;
        }
    }
    else
    {
        asm pure nothrow @nogc @trusted
        {
            cvtsd2si EAX, value;
            mov result, EAX;
        }
    }
    return result;
}

long convertFloatToInt64UsingMXCSR(float value) @trusted
{
    static if (LDC_with_ARM32)
    {
        // We have to resort to libc since 32-bit ARM
        // doesn't seem to have 64-bit registers.

        uint fpscr = arm_get_fpcr(); // Get current rounding mode.

        // Note: convert to double precision, else rounding could differ for large integers
        double asDouble = value;

        switch(fpscr & _MM_ROUND_MASK_ARM)
        {
            default:
            case _MM_ROUND_NEAREST_ARM:     return cast(long)(llvm_round(asDouble));
            case _MM_ROUND_DOWN_ARM:        return cast(long)(llvm_floor(asDouble));
            case _MM_ROUND_UP_ARM:          return cast(long)(llvm_ceil(asDouble));
            case _MM_ROUND_TOWARD_ZERO_ARM: return cast(long)(asDouble);
        }
    }
    else static if (LDC_with_ARM64)
    {
        uint fpscr = arm_get_fpcr();

        switch(fpscr & _MM_ROUND_MASK_ARM)
        {
            default:
            case _MM_ROUND_NEAREST_ARM:     return vcvtns_s64_f32(value);
            case _MM_ROUND_DOWN_ARM:        return vcvtms_s64_f32(value);
            case _MM_ROUND_UP_ARM:          return vcvtps_s64_f32(value);
            case _MM_ROUND_TOWARD_ZERO_ARM: return vcvts_s64_f32(value);
        }
    }
    // 64-bit can use an SSE instruction
    else version(D_InlineAsm_X86_64)
    {
        long result;
        version(LDC) // work-around for "Data definition directives inside inline asm are not supported yet."
        {
            asm pure nothrow @nogc @trusted
            {
                movss XMM0, value;
                cvtss2si RAX, XMM0;
                mov result, RAX;
            }
        }
        else
        {
            asm pure nothrow @nogc @trusted
            {
                movss XMM0, value;
                db 0xf3; db 0x48; db 0x0f; db 0x2d; db 0xc0; // cvtss2si RAX, XMM0 (DMD refuses to emit)
                mov result, RAX;
            }
        }
        return result;
    }
    else version(D_InlineAsm_X86)
    {
        // In the case of 32-bit x86 there is no SSE2 way to convert FP to 64-bit int.
        // This leads to an unfortunate FPU sequence in every C++ compiler.
        // See: https://godbolt.org/z/vZym77

        // Get current MXCSR rounding
        uint sseRounding;
        ushort savedFPUCW;
        ushort newFPUCW;
        long result;
        asm pure nothrow @nogc @trusted
        {
            stmxcsr sseRounding;
            fld value;
            fnstcw savedFPUCW;
            mov AX, savedFPUCW;
            and AX, 0xf3ff;                  // clear FPU rounding bits
            movzx ECX, word ptr sseRounding;
            and ECX, 0x6000;                 // only keep SSE rounding bits
            shr ECX, 3;
            or AX, CX;                       // make a new control word for FPU with SSE bits
            mov newFPUCW, AX;
            fldcw newFPUCW;
            fistp qword ptr result;          // convert, respecting MXCSR (but not other control word things)
            fldcw savedFPUCW;
        }
        return result;
    }
    else static if (GDC_with_x86)
    {
        version(X86_64) // 64-bit can just use the right instruction
        {
            static assert(GDC_with_SSE);
            __m128 A;
            A.ptr[0] = value;
            return __builtin_ia32_cvtss2si64 (A);
        }
        else version(X86) // 32-bit
        {
            // This is untested!
            uint sseRounding;
            ushort savedFPUCW;
            ushort newFPUCW;
            long result;
            asm pure nothrow @nogc @trusted
            {
                "stmxcsr %1;\n" ~
                "fld %2;\n" ~
                "fnstcw %3;\n" ~
                "movw %3, %%ax;\n" ~
                "andw $0xf3ff, %%ax;\n" ~
                "movzwl %1, %%ecx;\n" ~
                "andl $0x6000, %%ecx;\n" ~
                "shrl $3, %%ecx;\n" ~
                "orw %%cx, %%ax\n" ~
                "movw %%ax, %4;\n" ~
                "fldcw %4;\n" ~
                "fistpll %0;\n" ~
                "fldcw %3;\n"
                  : "=m"(result)    // %0
                  : "m" (sseRounding),
                    "f" (value),
                    "m" (savedFPUCW),
                    "m" (newFPUCW)
                  : "eax", "ecx", "st";
            }
            return result;
        }
        else
            static assert(false);
    }
    else
        static assert(false);
}


///ditto
long convertDoubleToInt64UsingMXCSR(double value) @trusted
{
    static if (LDC_with_ARM32)
    {
        // We have to resort to libc since 32-bit ARM
        // doesn't seem to have 64-bit registers.
        uint fpscr = arm_get_fpcr(); // Get current rounding mode.
        switch(fpscr & _MM_ROUND_MASK_ARM)
        {
            default:
            case _MM_ROUND_NEAREST_ARM:     return cast(long)(llvm_round(value));
            case _MM_ROUND_DOWN_ARM:        return cast(long)(llvm_floor(value));
            case _MM_ROUND_UP_ARM:          return cast(long)(llvm_ceil(value));
            case _MM_ROUND_TOWARD_ZERO_ARM: return cast(long)(value);
        }
    }
    else static if (LDC_with_ARM64)
    {
        // Get current rounding mode.
        uint fpscr = arm_get_fpcr();

        switch(fpscr & _MM_ROUND_MASK_ARM)
        {
            default:
            case _MM_ROUND_NEAREST_ARM:     return vcvtns_s64_f64(value);
            case _MM_ROUND_DOWN_ARM:        return vcvtms_s64_f64(value);
            case _MM_ROUND_UP_ARM:          return vcvtps_s64_f64(value);
            case _MM_ROUND_TOWARD_ZERO_ARM: return vcvts_s64_f64(value);
        }
    }
    // 64-bit can use an SSE instruction
    else version(D_InlineAsm_X86_64)
    {
        long result;
        version(LDC) // work-around for "Data definition directives inside inline asm are not supported yet."
        {
            asm pure nothrow @nogc @trusted
            {
                movsd XMM0, value;
                cvtsd2si RAX, XMM0;
                mov result, RAX;
            }
        }
        else
        {
            asm pure nothrow @nogc @trusted
            {
                movsd XMM0, value;
                db 0xf2; db 0x48; db 0x0f; db 0x2d; db 0xc0; // cvtsd2si RAX, XMM0 (DMD refuses to emit)
                mov result, RAX;
            }
        }
        return result;
    }
    else version(D_InlineAsm_X86)
    {
        // In the case of 32-bit x86 there is no SSE2 way to convert FP to 64-bit int.
        // This leads to an unfortunate FPU sequence in every C++ compiler.
        // See: https://godbolt.org/z/vZym77

        // Get current MXCSR rounding
        uint sseRounding;
        ushort savedFPUCW;
        ushort newFPUCW;
        long result;
        asm pure nothrow @nogc @trusted
        {
            stmxcsr sseRounding;
            fld value;
            fnstcw savedFPUCW;
            mov AX, savedFPUCW;
            and AX, 0xf3ff;
            movzx ECX, word ptr sseRounding;
            and ECX, 0x6000;
            shr ECX, 3;
            or AX, CX;
            mov newFPUCW, AX;
            fldcw newFPUCW;
            fistp result;
            fldcw savedFPUCW;
        }
        return result;
    }
    else static if (GDC_with_x86)
    {
        version(X86_64)
        {
            static assert(GDC_with_SSE2);
            __m128d A;
            A.ptr[0] = value;
            return __builtin_ia32_cvtsd2si64 (A);
        }
        else
        {
            // This is untested!
            uint sseRounding;
            ushort savedFPUCW;
            ushort newFPUCW;
            long result;
            asm pure nothrow @nogc @trusted
            {
                "stmxcsr %1;\n" ~
                "fld %2;\n" ~
                "fnstcw %3;\n" ~
                "movw %3, %%ax;\n" ~
                "andw $0xf3ff, %%ax;\n" ~
                "movzwl %1, %%ecx;\n" ~
                "andl $0x6000, %%ecx;\n" ~
                "shrl $3, %%ecx;\n" ~
                "orw %%cx, %%ax\n" ~
                "movw %%ax, %4;\n" ~
                "fldcw %4;\n" ~
                "fistpll %0;\n" ~
                "fldcw %3;\n"
                  : "=m"(result)    // %0
                  : "m" (sseRounding),
                    "t" (value),
                    "m" (savedFPUCW),
                    "m" (newFPUCW)
                  : "eax", "ecx", "st";
            }
            return result;
        }
    }
    else
        static assert(false);
}
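
// Illustrative sanity checks for the converters above; a minimal sketch that
// assumes the ambient MXCSR/FPCR rounding mode is still the default
// round-to-nearest-even (as it is for a fresh thread).
unittest
{
    assert(convertFloatToInt32UsingMXCSR(2.5f) == 2);   // ties round to even
    assert(convertFloatToInt32UsingMXCSR(3.5f) == 4);
    assert(convertDoubleToInt32UsingMXCSR(-2.5) == -2);
    assert(convertDoubleToInt64UsingMXCSR(3.0) == 3);   // exact, independent of rounding mode
}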

//
// </ROUNDING>
//


// using the Intel terminology here

byte saturateSignedWordToSignedByte(short value) pure @safe
{
    if (value > 127) value = 127;
    if (value < -128) value = -128;
    return cast(byte) value;
}

ubyte saturateSignedWordToUnsignedByte(short value) pure @safe
{
    if (value > 255) value = 255;
    if (value < 0) value = 0;
    return cast(ubyte) value;
}

short saturateSignedIntToSignedShort(int value) pure @safe
{
    if (value > 32767) value = 32767;
    if (value < -32768) value = -32768;
    return cast(short) value;
}

ushort saturateSignedIntToUnsignedShort(int value) pure @safe
{
    if (value > 65535) value = 65535;
    if (value < 0) value = 0;
    return cast(ushort) value;
}

unittest // test saturate operations
{
    assert( saturateSignedWordToSignedByte(32000) == 127);
    assert( saturateSignedWordToUnsignedByte(32000) == 255);
    assert( saturateSignedWordToSignedByte(-4000) == -128);
    assert( saturateSignedWordToUnsignedByte(-4000) == 0);
    assert( saturateSignedIntToSignedShort(32768) == 32767);
    assert( saturateSignedIntToUnsignedShort(32768) == 32768);
    assert( saturateSignedIntToSignedShort(-32769) == -32768);
    assert( saturateSignedIntToUnsignedShort(-32769) == 0);
}

version(unittest)
{
    // This is just for debugging tests
    import core.stdc.stdio: printf;

    // printing vectors for implementation
    // Note: you can override `pure` within a `debug` clause

    void _mm_print_pi64(__m64 v) @trusted
    {
        long1 vl = cast(long1)v;
        printf("%lld\n", vl.array[0]);
    }

    void _mm_print_pi32(__m64 v) @trusted
    {
        int[2] C = (cast(int2)v).array;
        printf("%d %d\n", C[0], C[1]);
    }

    void _mm_print_pi16(__m64 v) @trusted
    {
        short[4] C = (cast(short4)v).array;
        printf("%d %d %d %d\n", C[0], C[1], C[2], C[3]);
    }

    void _mm_print_pi8(__m64 v) @trusted
    {
        byte[8] C = (cast(byte8)v).array;
        printf("%d %d %d %d %d %d %d %d\n",
               C[0], C[1], C[2], C[3], C[4], C[5], C[6], C[7]);
    }

    void _mm_print_epi64(__m128i v) @trusted
    {
        long2 vl = cast(long2)v;
        printf("%lld %lld\n", vl.array[0], vl.array[1]);
    }

    void _mm_print_epi32(__m128i v) @trusted
    {
        printf("%d %d %d %d\n",
               v.array[0], v.array[1], v.array[2], v.array[3]);
    }

    void _mm_print_epi16(__m128i v) @trusted
    {
        short[8] C = (cast(short8)v).array;
        printf("%d %d %d %d %d %d %d %d\n",
               C[0], C[1], C[2], C[3], C[4], C[5], C[6], C[7]);
    }

    void _mm_print_epi8(__m128i v) @trusted
    {
        byte[16] C = (cast(byte16)v).array;
        printf("%d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d\n",
               C[0], C[1], C[2], C[3], C[4], C[5], C[6], C[7], C[8], C[9], C[10], C[11], C[12], C[13], C[14], C[15]);
    }

    void _mm_print_ps(__m128 v) @trusted
    {
        float[4] C = (cast(float4)v).array;
        printf("%f %f %f %f\n", C[0], C[1], C[2], C[3]);
    }

    void _mm_print_pd(__m128d v) @trusted
    {
        double[2] C = (cast(double2)v).array;
        printf("%f %f\n", C[0], C[1]);
    }
}


//
// <FLOATING-POINT COMPARISONS>
//
// Note: `ldc.simd` cannot express all nuances of FP comparisons, so we
// need different IR generation.

enum FPComparison
{
    oeq, // ordered and equal
    ogt, // ordered and greater than
    oge, // ordered and greater than or equal
    olt, // ordered and less than
    ole, // ordered and less than or equal
    one, // ordered and not equal
    ord, // ordered (no nans)
    ueq, // unordered or equal
    ugt, // unordered or greater than ("nle")
    uge, // unordered or greater than or equal ("nlt")
    ult, // unordered or less than ("nge")
    ule, // unordered or less than or equal ("ngt")
    une, // unordered or not equal ("neq")
    uno, // unordered (either nans)
}

private static immutable string[FPComparison.max+1] FPComparisonToString =
[
    "oeq",
    "ogt",
    "oge",
    "olt",
    "ole",
    "one",
    "ord",
    "ueq",
    "ugt",
    "uge",
    "ult",
    "ule",
    "une",
    "uno",
];

// Individual float comparison: returns true or false.
// Useful for DMD and testing.
private bool compareFloat(T)(FPComparison comparison, T a, T b) pure @safe
{
    bool unordered = isnan(a) || isnan(b);
    final switch(comparison) with(FPComparison)
    {
        case oeq: return a == b;
        case ogt: return a > b;
        case oge: return a >= b;
        case olt: return a < b;
        case ole: return a <= b;
        case one: return !unordered && (a != b); // NaN with != always yields true
        case ord: return !unordered;
        case ueq: return unordered || (a == b);
        case ugt: return unordered || (a > b);
        case uge: return unordered || (a >= b);
        case ult: return unordered || (a < b);
        case ule: return unordered || (a <= b);
        case une: return (a != b); // NaN with != always yields true
        case uno: return unordered;
    }
}
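
// Quick illustration of the ordered/unordered split above: `one` is false
// whenever a NaN is involved, while `une` is then true. Just a sketch of the
// semantics, complementing the packed-comparison tests below.
unittest
{
    assert(!compareFloat!float(FPComparison.one, float.nan, 1.0f));
    assert( compareFloat!float(FPComparison.une, float.nan, 1.0f));
    assert( compareFloat!float(FPComparison.one, 2.0f, 1.0f));
    assert( compareFloat!float(FPComparison.ord, 2.0f, 1.0f));
}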

version(LDC)
{
    /// Provides packed float comparisons
    package int4 cmpps(FPComparison comparison)(float4 a, float4 b) pure @safe
    {
        enum ir = `
            %cmp = fcmp `~ FPComparisonToString[comparison] ~` <4 x float> %0, %1
            %r = sext <4 x i1> %cmp to <4 x i32>
            ret <4 x i32> %r`;

        return LDCInlineIR!(ir, int4, float4, float4)(a, b);
    }

    /// Provides packed double comparisons
    package long2 cmppd(FPComparison comparison)(double2 a, double2 b) pure @safe
    {
        enum ir = `
            %cmp = fcmp `~ FPComparisonToString[comparison] ~` <2 x double> %0, %1
            %r = sext <2 x i1> %cmp to <2 x i64>
            ret <2 x i64> %r`;

        return LDCInlineIR!(ir, long2, double2, double2)(a, b);
    }

    /// CMPSS-style comparisons.
    /// clang implements it through x86 intrinsics; it is possible with IR alone,
    /// but that leads to less optimal code.
    /// PERF: try to implement it with __builtin_ia32_cmpss and immediate 0 to 7.
    ///       Not that simple.
    package float4 cmpss(FPComparison comparison)(float4 a, float4 b) pure @safe
    {
        /*
        enum ubyte predicateNumber = FPComparisonToX86Predicate[comparison];
        enum bool invertOp = (predicateNumber & 0x80) != 0;
        static if(invertOp)
            return __builtin_ia32_cmpsd(b, a, predicateNumber & 0x7f);
        else
            return __builtin_ia32_cmpsd(a, b, predicateNumber & 0x7f);
        */
        enum ir = `
            %cmp = fcmp `~ FPComparisonToString[comparison] ~` float %0, %1
            %r = sext i1 %cmp to i32
            %r2 = bitcast i32 %r to float
            ret float %r2`;

        float4 r = a;
        r[0] = LDCInlineIR!(ir, float, float, float)(a[0], b[0]);
        return r;
    }

    /// CMPSD-style comparisons.
    /// clang implements it through x86 intrinsics; it is possible with IR alone,
    /// but that leads to less optimal code.
    /// PERF: try to implement it with __builtin_ia32_cmpsd and immediate 0 to 7.
    ///       Not that simple.
    package double2 cmpsd(FPComparison comparison)(double2 a, double2 b) pure @safe
    {
        enum ir = `
            %cmp = fcmp `~ FPComparisonToString[comparison] ~` double %0, %1
            %r = sext i1 %cmp to i64
            %r2 = bitcast i64 %r to double
            ret double %r2`;

        double2 r = a;
        r[0] = LDCInlineIR!(ir, double, double, double)(a[0], b[0]);
        return r;
    }
}
else
{
    /// Provides packed float comparisons
    package int4 cmpps(FPComparison comparison)(float4 a, float4 b) pure @trusted
    {
        int4 result;
        foreach(i; 0..4)
        {
            result.ptr[i] = compareFloat!float(comparison, a.array[i], b.array[i]) ? -1 : 0;
        }
        return result;
    }

    /// Provides packed double comparisons
    package long2 cmppd(FPComparison comparison)(double2 a, double2 b) pure @trusted
    {
        long2 result;
        foreach(i; 0..2)
        {
            result.ptr[i] = compareFloat!double(comparison, a.array[i], b.array[i]) ? -1 : 0;
        }
        return result;
    }

    /// Provides CMPSS-style comparison
    package float4 cmpss(FPComparison comparison)(float4 a, float4 b) pure @trusted
    {
        int4 result = cast(int4)a;
        result.ptr[0] = compareFloat!float(comparison, a.array[0], b.array[0]) ? -1 : 0;
        return cast(float4)result;
    }

    /// Provides CMPSD-style comparison
    package double2 cmpsd(FPComparison comparison)(double2 a, double2 b) pure @trusted
    {
        long2 result = cast(long2)a;
        result.ptr[0] = compareFloat!double(comparison, a.array[0], b.array[0]) ? -1 : 0;
        return cast(double2)result;
    }
}
unittest // cmpps
{
    // Check that all comparison types are working
    float4 A = [1, 3, 5, float.nan];
    float4 B = [2, 3, 4, 5];

    int4 result_oeq = cmpps!(FPComparison.oeq)(A, B);
    int4 result_ogt = cmpps!(FPComparison.ogt)(A, B);
    int4 result_oge = cmpps!(FPComparison.oge)(A, B);
    int4 result_olt = cmpps!(FPComparison.olt)(A, B);
    int4 result_ole = cmpps!(FPComparison.ole)(A, B);
    int4 result_one = cmpps!(FPComparison.one)(A, B);
    int4 result_ord = cmpps!(FPComparison.ord)(A, B);
    int4 result_ueq = cmpps!(FPComparison.ueq)(A, B);
    int4 result_ugt = cmpps!(FPComparison.ugt)(A, B);
    int4 result_uge = cmpps!(FPComparison.uge)(A, B);
    int4 result_ult = cmpps!(FPComparison.ult)(A, B);
    int4 result_ule = cmpps!(FPComparison.ule)(A, B);
    int4 result_une = cmpps!(FPComparison.une)(A, B);
    int4 result_uno = cmpps!(FPComparison.uno)(A, B);

    static immutable int[4] correct_oeq = [ 0,-1, 0, 0];
    static immutable int[4] correct_ogt = [ 0, 0,-1, 0];
    static immutable int[4] correct_oge = [ 0,-1,-1, 0];
    static immutable int[4] correct_olt = [-1, 0, 0, 0];
    static immutable int[4] correct_ole = [-1,-1, 0, 0];
    static immutable int[4] correct_one = [-1, 0,-1, 0];
    static immutable int[4] correct_ord = [-1,-1,-1, 0];
    static immutable int[4] correct_ueq = [ 0,-1, 0,-1];
    static immutable int[4] correct_ugt = [ 0, 0,-1,-1];
    static immutable int[4] correct_uge = [ 0,-1,-1,-1];
    static immutable int[4] correct_ult = [-1, 0, 0,-1];
    static immutable int[4] correct_ule = [-1,-1, 0,-1];
    static immutable int[4] correct_une = [-1, 0,-1,-1];
    static immutable int[4] correct_uno = [ 0, 0, 0,-1];

    assert(result_oeq.array == correct_oeq);
    assert(result_ogt.array == correct_ogt);
    assert(result_oge.array == correct_oge);
    assert(result_olt.array == correct_olt);
    assert(result_ole.array == correct_ole);
    assert(result_one.array == correct_one);
    assert(result_ord.array == correct_ord);
    assert(result_ueq.array == correct_ueq);
    assert(result_ugt.array == correct_ugt);
    assert(result_uge.array == correct_uge);
    assert(result_ult.array == correct_ult);
    assert(result_ule.array == correct_ule);
    assert(result_une.array == correct_une);
    assert(result_uno.array == correct_uno);
}
unittest
{
    double2 a = [1, 3];
    double2 b = [2, 3];
    long2 c = cmppd!(FPComparison.ult)(a, b);
    static immutable long[2] correct = [cast(long)(-1), 0];
    assert(c.array == correct);
}
unittest // cmpss
{
    void testComparison(FPComparison comparison)(float4 A, float4 B)
    {
        float4 result = cmpss!comparison(A, B);
        int4 iresult = cast(int4)result;
        int expected = compareFloat!float(comparison, A.array[0], B.array[0]) ? -1 : 0;
        assert(iresult.array[0] == expected);
        assert(result.array[1] == A.array[1]);
        assert(result.array[2] == A.array[2]);
        assert(result.array[3] == A.array[3]);
    }

    // Check that all comparison types are working
    float4 A = [1, 3, 5, 6];
    float4 B = [2, 3, 4, 5];
    float4 C = [float.nan, 3, 4, 5];

    testComparison!(FPComparison.oeq)(A, B);
    testComparison!(FPComparison.oeq)(A, C);
    testComparison!(FPComparison.ogt)(A, B);
    testComparison!(FPComparison.ogt)(A, C);
    testComparison!(FPComparison.oge)(A, B);
    testComparison!(FPComparison.oge)(A, C);
    testComparison!(FPComparison.olt)(A, B);
    testComparison!(FPComparison.olt)(A, C);
    testComparison!(FPComparison.ole)(A, B);
    testComparison!(FPComparison.ole)(A, C);
    testComparison!(FPComparison.one)(A, B);
    testComparison!(FPComparison.one)(A, C);
    testComparison!(FPComparison.ord)(A, B);
    testComparison!(FPComparison.ord)(A, C);
    testComparison!(FPComparison.ueq)(A, B);
    testComparison!(FPComparison.ueq)(A, C);
    testComparison!(FPComparison.ugt)(A, B);
    testComparison!(FPComparison.ugt)(A, C);
    testComparison!(FPComparison.uge)(A, B);
    testComparison!(FPComparison.uge)(A, C);
    testComparison!(FPComparison.ult)(A, B);
    testComparison!(FPComparison.ult)(A, C);
    testComparison!(FPComparison.ule)(A, B);
    testComparison!(FPComparison.ule)(A, C);
    testComparison!(FPComparison.une)(A, B);
    testComparison!(FPComparison.une)(A, C);
    testComparison!(FPComparison.uno)(A, B);
    testComparison!(FPComparison.uno)(A, C);
}
unittest // cmpsd
{
    void testComparison(FPComparison comparison)(double2 A, double2 B)
    {
        double2 result = cmpsd!comparison(A, B);
        long2 iresult = cast(long2)result;
        long expected = compareFloat!double(comparison, A.array[0], B.array[0]) ? -1 : 0;
        assert(iresult.array[0] == expected);
        assert(result.array[1] == A.array[1]);
    }

    // Check that all comparison types are working
    double2 A = [1, 3];
    double2 B = [2, 4];
    double2 C = [double.nan, 5];

    testComparison!(FPComparison.oeq)(A, B);
    testComparison!(FPComparison.oeq)(A, C);
    testComparison!(FPComparison.ogt)(A, B);
    testComparison!(FPComparison.ogt)(A, C);
    testComparison!(FPComparison.oge)(A, B);
    testComparison!(FPComparison.oge)(A, C);
    testComparison!(FPComparison.olt)(A, B);
    testComparison!(FPComparison.olt)(A, C);
    testComparison!(FPComparison.ole)(A, B);
    testComparison!(FPComparison.ole)(A, C);
    testComparison!(FPComparison.one)(A, B);
    testComparison!(FPComparison.one)(A, C);
    testComparison!(FPComparison.ord)(A, B);
    testComparison!(FPComparison.ord)(A, C);
    testComparison!(FPComparison.ueq)(A, B);
    testComparison!(FPComparison.ueq)(A, C);
    testComparison!(FPComparison.ugt)(A, B);
    testComparison!(FPComparison.ugt)(A, C);
    testComparison!(FPComparison.uge)(A, B);
    testComparison!(FPComparison.uge)(A, C);
    testComparison!(FPComparison.ult)(A, B);
    testComparison!(FPComparison.ult)(A, C);
    testComparison!(FPComparison.ule)(A, B);
    testComparison!(FPComparison.ule)(A, C);
    testComparison!(FPComparison.une)(A, B);
    testComparison!(FPComparison.une)(A, C);
    testComparison!(FPComparison.uno)(A, B);
    testComparison!(FPComparison.uno)(A, C);
}

//
// </FLOATING-POINT COMPARISONS>
//


__m64 to_m64(__m128i a) pure @trusted
{
    long2 la = cast(long2)a;
    long1 r = la.array[0];
    return r;
}

__m128i to_m128i(__m64 a) pure @trusted
{
    /* Not sufficient to avoid https://issues.dlang.org/show_bug.cgi?id=21474

    version(DigitalMars) // Workaround for https://issues.dlang.org/show_bug.cgi?id=21474
    {
        long2 r = a.array[0];
        r.ptr[1] = 0;
        return cast(int4)r;
    }
    else */
    {
        long2 r = [0, 0];
        r.ptr[0] = a.array[0];
        return cast(__m128i)r;
    }
}
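
// Round-trip sketch for the two helpers above: to_m64 keeps the low 64 bits,
// to_m128i zero-extends them. Assumes vector array-literal initialization, as
// used elsewhere in this module.
unittest
{
    long2 two = [0x11111111_22222222, 0x33333333_44444444];
    __m64 lo = to_m64(cast(__m128i) two);
    assert(lo.array[0] == 0x11111111_22222222);

    long2 back = cast(long2) to_m128i(lo);
    assert(back.array[0] == 0x11111111_22222222);
    assert(back.array[1] == 0); // upper half is zeroed
}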

// ADDITIONAL x86 INTRINSICS
// Absent from ldc.gccbuiltins_x86 for some reason, but needed.
// https://github.com/ldc-developers/llvm-project/blob/ldc-release/12.x/llvm/include/llvm/IR/IntrinsicsX86.td
static if (LDC_with_SSE41)
{
    pragma(LDC_intrinsic, "llvm.x86.sse41.pblendvb")
    byte16 __builtin_ia32_pblendvb(byte16, byte16, byte16) pure @safe;
}

// SOME NEON INTRINSICS
// Emulating some x86 intrinsics needs access to a range of ARM intrinsics.
// Not in the public API, but the simde project exposes it all for the user to use.
// MAYDO: create a new neon.d module, for internal use only.
// MAYDO: port them to ARM32 so that ARM32 can be as fast as ARM64.
static if (LDC_with_ARM64)
{
    // VERY USEFUL LINK
    // https://github.com/ldc-developers/llvm-project/blob/ldc-release/11.x/llvm/include/llvm/IR/IntrinsicsAArch64.td
    // Also: https://developer.arm.com/architectures/instruction-sets/intrinsics/

    pragma(LDC_intrinsic, "llvm.aarch64.crc32cb")
    uint __crc32cb(uint a, uint b) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.crc32ch")
    uint __crc32ch(uint a, uint b) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.crc32cw")
    uint __crc32cw(uint a, uint b) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.crc32cx")
    uint __crc32cd(uint a, ulong b) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.uabd.v16i8")
    byte16 vabdq_u8(byte16 a, byte16 b) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.abs.v8i16")
    short8 vabsq_s16(short8 a) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.abs.v4i32")
    int4 vabsq_s32(int4 a) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.abs.v16i8")
    byte16 vabsq_s8(byte16 a) pure @safe;

    byte8 vand_u8(byte8 a, byte8 b) pure @safe
    {
        return a & b;
    }

    long2 vandq_s64(long2 a, long2 b)
    {
        return a & b;
    }

    long2 vbicq_s64(long2 a, long2 b) pure @safe
    {
        return a & ~b;
    }

    // Bitwise select: for each bit, take the bit of b where a is 1, else the bit of c.
    int4 vbslq_s32(int4 a, int4 b, int4 c) pure @safe
    {
        return c ^ ((c ^ b) & a);
    }

    byte16 vbslq_s8(byte16 a, byte16 b, byte16 c) pure @safe
    {
        return c ^ ((c ^ b) & a);
    }

    long2 vbslq_s64(long2 a, long2 b, long2 c) pure @safe
    {
        return c ^ ((c ^ b) & a);
    }

    short8 vcombine_s16(short4 lo, short4 hi) pure @trusted
    {
        short8 r;
        r.ptr[0] = lo.array[0];
        r.ptr[1] = lo.array[1];
        r.ptr[2] = lo.array[2];
        r.ptr[3] = lo.array[3];
        r.ptr[4] = hi.array[0];
        r.ptr[5] = hi.array[1];
        r.ptr[6] = hi.array[2];
        r.ptr[7] = hi.array[3];
        return r;
    }

    int4 vcombine_s32(int2 lo, int2 hi) pure @trusted
    {
        int4 r;
        r.ptr[0] = lo.array[0];
        r.ptr[1] = lo.array[1];
        r.ptr[2] = hi.array[0];
        r.ptr[3] = hi.array[1];
        return r;
    }

    byte16 vcombine_s8(byte8 lo, byte8 hi) pure @trusted
    {
        byte16 r;
        r.ptr[0]  = lo.array[0];
        r.ptr[1]  = lo.array[1];
        r.ptr[2]  = lo.array[2];
        r.ptr[3]  = lo.array[3];
        r.ptr[4]  = lo.array[4];
        r.ptr[5]  = lo.array[5];
        r.ptr[6]  = lo.array[6];
        r.ptr[7]  = lo.array[7];
        r.ptr[8]  = hi.array[0];
        r.ptr[9]  = hi.array[1];
        r.ptr[10] = hi.array[2];
        r.ptr[11] = hi.array[3];
        r.ptr[12] = hi.array[4];
        r.ptr[13] = hi.array[5];
        r.ptr[14] = hi.array[6];
        r.ptr[15] = hi.array[7];
        return r;
    }

    short8 vcombine_u16(short4 lo, short4 hi) pure @trusted
    {
        short8 r;
        r.ptr[0] = lo.array[0];
        r.ptr[1] = lo.array[1];
        r.ptr[2] = lo.array[2];
        r.ptr[3] = lo.array[3];
        r.ptr[4] = hi.array[0];
        r.ptr[5] = hi.array[1];
        r.ptr[6] = hi.array[2];
        r.ptr[7] = hi.array[3];
        return r;
    }


    // float4 => int4

    pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtms.v4i32.v4f32")
    int4 vcvtmq_s32_f32(float4 a) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtns.v4i32.v4f32")
    int4 vcvtnq_s32_f32(float4 a) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtps.v4i32.v4f32")
    int4 vcvtpq_s32_f32(float4 a) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtzs.v4i32.v4f32")
    int4 vcvtzq_s32_f32(float4 a) pure @safe;


    // double2 => long2

    pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtms.v2i64.v2f64")
    long2 vcvtmq_s64_f64(double2 a) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtns.v2i64.v2f64")
    long2 vcvtnq_s64_f64(double2 a) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtps.v2i64.v2f64")
    long2 vcvtpq_s64_f64(double2 a) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtzs.v2i64.v2f64")
    long2 vcvtzq_s64_f64(double2 a) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtms.i32.f32")
    int vcvtms_s32_f32(float a) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtns.i32.f32")
    int vcvtns_s32_f32(float a) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtps.i32.f32")
    int vcvtps_s32_f32(float a) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtzs.i32.f32")
    int vcvts_s32_f32(float a) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtms.i32.f64")
    int vcvtms_s32_f64(double a) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtns.i32.f64")
    int vcvtns_s32_f64(double a) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtps.i32.f64")
    int vcvtps_s32_f64(double a) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtzs.i32.f64")
    int vcvts_s32_f64(double a) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtms.i64.f32")
    long vcvtms_s64_f32(float a) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtns.i64.f32")
    long vcvtns_s64_f32(float a) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtps.i64.f32")
    long vcvtps_s64_f32(float a) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtzs.i64.f32")
    long vcvts_s64_f32(float a) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtms.i64.f64")
    long vcvtms_s64_f64(double a) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtns.i64.f64")
    long vcvtns_s64_f64(double a) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtps.i64.f64")
    long vcvtps_s64_f64(double a) pure @safe; // Note: technically should be named vcvtpd_s64_f64

    pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtzs.i64.f64")
    long vcvts_s64_f64(double a) pure @safe;

    long2 vdupq_n_s64(long value) pure @safe
    {
        long2 r;
        r = value;
        return r;
    }

    short4 vget_high_s16(short8 a) pure @trusted
    {
        short4 r;
        r.ptr[0] = a.array[4];
        r.ptr[1] = a.array[5];
        r.ptr[2] = a.array[6];
        r.ptr[3] = a.array[7];
        return r;
    }

    int2 vget_high_s32(int4 a) pure @trusted
    {
        int2 r;
        r.ptr[0] = a.array[2];
        r.ptr[1] = a.array[3];
        return r;
    }

    byte8 vget_high_u8(byte16 a) pure @trusted
    {
        byte8 r;
        r.ptr[0] = a.array[8];
        r.ptr[1] = a.array[9];
        r.ptr[2] = a.array[10];
        r.ptr[3] = a.array[11];
        r.ptr[4] = a.array[12];
        r.ptr[5] = a.array[13];
        r.ptr[6] = a.array[14];
        r.ptr[7] = a.array[15];
        return r;
    }

    short4 vget_low_s16(short8 a) pure @trusted
    {
        short4 r;
        r.ptr[0] = a.array[0];
        r.ptr[1] = a.array[1];
        r.ptr[2] = a.array[2];
        r.ptr[3] = a.array[3];
        return r;
    }

    int2 vget_low_s32(int4 a) pure @trusted
    {
        int2 r;
        r.ptr[0] = a.array[0];
        r.ptr[1] = a.array[1];
        return r;
    }

    byte8 vget_low_u8(byte16 a) pure @trusted
    {
        byte8 r;
        r.ptr[0] = a.array[0];
        r.ptr[1] = a.array[1];
        r.ptr[2] = a.array[2];
        r.ptr[3] = a.array[3];
        r.ptr[4] = a.array[4];
        r.ptr[5] = a.array[5];
        r.ptr[6] = a.array[6];
        r.ptr[7] = a.array[7];
        return r;
    }

    long vgetq_lane_s64(long2 v, const int lane) pure @safe
    {
        return v.array[lane];
    }

    pragma(LDC_intrinsic, "llvm.aarch64.neon.smax.v8i16")
    short8 vmaxq_s16(short8 a, short8 b) pure @safe;

    int4 vmaxq_s32(int4 a, int4 b)
    {
        int4 r;
        r[0] = a[0] >= b[0] ? a[0] : b[0];
        r[1] = a[1] >= b[1] ? a[1] : b[1];
        r[2] = a[2] >= b[2] ? a[2] : b[2];
        r[3] = a[3] >= b[3] ? a[3] : b[3];
        return r;
    }

    pragma(LDC_intrinsic, "llvm.aarch64.neon.smin.v8i16")
    short8 vminq_s16(short8 a, short8 b) pure @safe;

    int2 vmovn_s64(long2 a) pure @trusted
    {
        int2 r;
        r.ptr[0] = cast(int)(a.array[0]);
        r.ptr[1] = cast(int)(a.array[1]);
        return r;
    }

    int4 vmull_s16(short4 a, short4 b) pure @trusted
    {
        int4 r;
        r.ptr[0] = a.array[0] * b.array[0];
        r.ptr[1] = a.array[1] * b.array[1];
        r.ptr[2] = a.array[2] * b.array[2];
        r.ptr[3] = a.array[3] * b.array[3];
        return r;
    }

    pragma(LDC_intrinsic, "llvm.aarch64.neon.smull.v2i64")
    long2 vmull_s32(int2 a, int2 b) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.addp.v4i16")
    short4 vpadd_s16(short4 a, short4 b) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.addp.v2i32")
    int2 vpadd_s32(int2 a, int2 b) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.addp.v8i8")
    byte8 vpadd_u8(byte8 a, byte8 b) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.uaddlp.v8i16.v16i8")
    short8 vpaddlq_u8 (byte16 a) pure @safe;

    static if(__VERSION__ >= 2088) // LDC 1.18 starts using LLVM 9, which changes the name of the builtin
    {
        pragma(LDC_intrinsic, "llvm.aarch64.neon.faddp.v4f32")
        float4 vpaddq_f32(float4 a, float4 b) pure @safe;
    }
    else
    {
        pragma(LDC_intrinsic, "llvm.aarch64.neon.addp.v4f32")
        float4 vpaddq_f32(float4 a, float4 b) pure @safe;
    }

    pragma(LDC_intrinsic, "llvm.aarch64.neon.addp.v8i16")
    short8 vpaddq_s16(short8 a, short8 b) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.addp.v16i8")
    byte16 vpaddq_s8(byte16 a, byte16 b) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.addp.v4i32")
    int4 vpaddq_s32(int4 a, int4 b) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.sqadd.v4i16")
    short4 vqadd_s16(short4 a, short4 b) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.sqadd.v8i16")
    short8 vqaddq_s16(short8 a, short8 b) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.sqxtn.v8i8")
    byte8 vqmovn_s16(short8 a) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.sqxtn.v4i16")
    short4 vqmovn_s32(int4 a) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.uqxtn.v4i16")
    short4 vqmovn_u32(int4 a) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.sqxtun.v8i8")
    byte8 vqmovun_s16(short8 a) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.sqsub.v4i16")
    short4 vqsub_s16(short4 a, short4 b) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.sqsub.v8i16")
    short8 vqsubq_s16(short8 a, short8 b) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.tbl1.v16i8")
    byte16 vqtbl1q_s8(byte16 t, byte16 idx) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.urhadd.v16i8")
    byte16 vrhadd_u8(byte16 a, byte16 b) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.urhadd.v8i16")
    short8 vrhadd_u16(short8 a, short8 b) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.rshrn.v4i16")
    short4 vrshrn_n_s32(int4 a, int n) pure @safe;

    byte8 vshr_u8(byte8 a, byte8 b) pure @safe
    {
        return a >>> b;
    }

    byte16 vshrq_n_s8(byte16 a, byte r) pure @safe
    {
        a = a >> byte16(cast(byte)r);
        return a;
    }

    pragma(LDC_intrinsic, "llvm.aarch64.neon.tbl1.v8i8")
    byte8 vtbl1_s8(byte16 t, byte8 idx) pure @safe;
}

version(unittest)
{
    double abs_double(double x) @trusted
    {
        version(LDC)
            return llvm_fabs(x);
        else
        {
            long uf = *cast(long*)(&x);
            uf &= 0x7fffffff_ffffffff;
            return *cast(double*)(&uf);
        }
    }
}

// needed because in the old GDC from Travis, core.stdc.math.isnan isn't pure

bool isnan(float x) pure @trusted
{
    uint u = *cast(uint*)(&x);
    bool result = ((u & 0x7F800000) == 0x7F800000) && (u & 0x007FFFFF);
    return result;
}
unittest
{
    float x = float.nan;
    assert(isnan(x));

    x = 0;
    assert(!isnan(x));

    x = float.infinity;
    assert(!isnan(x));
}

bool isnan(double x) pure @trusted
{
    ulong u = *cast(ulong*)(&x);
    return ((u & 0x7FF00000_00000000) == 0x7FF00000_00000000) && (u & 0x000FFFFF_FFFFFFFF);
}
unittest
{
    double x = double.nan;
    assert(isnan(x));

    x = 0;
    assert(!isnan(x));

    x = double.infinity;
    assert(!isnan(x));
}
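
// Minimal check for the abs_double test helper defined in the version(unittest)
// block above; just a sketch to exercise both the LDC and bit-twiddling paths.
unittest
{
    assert(abs_double(-1.5) == 1.5);
    assert(abs_double(2.0) == 2.0);
}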