/**
* Internal stuff only, do not import.
*
* Copyright: Copyright Guillaume Piolat 2016-2020, Stefanos Baziotis 2019.
* License: $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
*/
module inteli.internals;

import inteli.types;

// The only math functions needed for intel-intrinsics
public import core.math: sqrt; // since it's an intrinsics

package:
nothrow:
@nogc:


// GDC_with_XXX: compile-time capability flags for the GDC compiler.
// They are always defined (all false outside GDC), so the rest of the
// library can `static if` on them unconditionally.
version(GNU)
{
    version (X86)
    {
        // For 32-bit x86, disable vector extensions with GDC.
        // It just doesn't work well.
        enum GDC_with_x86 = true;
        enum GDC_with_MMX = false;
        enum GDC_with_SSE = false;
        enum GDC_with_SSE2 = false;
        enum GDC_with_SSE3 = false;
        enum GDC_with_SSSE3 = false;
        enum GDC_with_SSE41 = false;
        enum GDC_with_SSE42 = false;
        enum GDC_with_SHA = false;
    }
    else version (X86_64)
    {
        // GDC support uses extended inline assembly:
        //   https://gcc.gnu.org/onlinedocs/gcc/Extended-Asm.html            (general information and hints)
        //   https://gcc.gnu.org/onlinedocs/gcc/Simple-Constraints.html      (binding variables to registers)
        //   https://gcc.gnu.org/onlinedocs/gcc/Machine-Constraints.html     (x86 specific register short names)

        public import core.simd;

        // NOTE: These intrinsics are not available in every i386 and x86_64 CPU.
        // For more info: https://gcc.gnu.org/onlinedocs/gcc-4.9.2/gcc/X86-Built-in-Functions.html
        public import gcc.builtins;

        enum GDC_with_x86 = true;
        enum GDC_with_MMX = true;  // We don't have a way to detect that at CT, but we assume it's there
        enum GDC_with_SSE = true;  // We don't have a way to detect that at CT, but we assume it's there
        enum GDC_with_SSE2 = true; // We don't have a way to detect that at CT, but we assume it's there

        enum GDC_with_SSE3 = false;  // TODO: we don't have a way to detect that at CT
        enum GDC_with_SSSE3 = false; // TODO: we don't have a way to detect that at CT
        enum GDC_with_SSE41 = false; // TODO: we don't have a way to detect that at CT
        enum GDC_with_SSE42 = false; // TODO: we don't have a way to detect that at CT
        enum GDC_with_SHA = false;
    }
    else
    {
        // GDC targeting a non-x86 architecture: no x86 extension available.
        enum GDC_with_x86 = false;
        enum GDC_with_MMX = false;
        enum GDC_with_SSE = false;
        enum GDC_with_SSE2 = false;
        enum GDC_with_SSE3 = false;
        enum GDC_with_SSSE3 = false;
        enum GDC_with_SSE41 = false;
        enum GDC_with_SSE42 = false;
        enum GDC_with_SHA = false;
    }
}
else
{
    // Not GDC at all.
    enum GDC_with_x86 = false;
    enum GDC_with_MMX = false;
    enum GDC_with_SSE = false;
    enum GDC_with_SSE2 = false;
    enum GDC_with_SSE3 = false;
    enum GDC_with_SSSE3 = false;
    enum GDC_with_SSE41 = false;
    enum GDC_with_SSE42 = false;
    enum GDC_with_SHA = false;
}

// LDC_with_XXX: same idea for LDC. On x86 targets the flags reflect the real
// target features through __traits(targetHasFeature, ...); on ARM they select
// the ARM32/ARM64 code paths instead.
version(LDC)
{
    public import core.simd;
    public import ldc.simd;
    public import ldc.intrinsics;
    public import ldc.llvmasm: __asm;

    // Since LDC 1.13, using the new ldc.llvmasm.__ir variants instead of inlineIR
    static if (__VERSION__ >= 2083)
    {
        import ldc.llvmasm;
        alias LDCInlineIR = __ir_pure;

        // A version of inline IR with prefix/suffix didn't exist before LDC 1.13
        alias LDCInlineIREx = __irEx_pure;
    }
    else
    {
        alias LDCInlineIR = inlineIR;
    }

    version(ARM)
    {
        public import ldc.gccbuiltins_arm;
        enum LDC_with_ARM32 = true;
        enum LDC_with_ARM64 = false;
        enum LDC_with_SSE1 = false;
        enum LDC_with_SSE2 = false;
        enum LDC_with_SSE3 = false;
        enum LDC_with_SSSE3 = false;
        enum LDC_with_SSE41 = false;
        enum LDC_with_SSE42 = false;
        enum LDC_with_AVX = false;
        enum LDC_with_AVX2 = false;
        enum LDC_with_SHA = false;
    }
    else version(AArch64)
    {
        enum LDC_with_ARM32 = false;
        enum LDC_with_ARM64 = true;
        enum LDC_with_SSE1 = false;
        enum LDC_with_SSE2 = false;
        enum LDC_with_SSE3 = false;
        enum LDC_with_SSSE3 = false;
        enum LDC_with_SSE41 = false;
        enum LDC_with_SSE42 = false;
        enum LDC_with_AVX = false;
        enum LDC_with_AVX2 = false;
        enum LDC_with_SHA = false;
    }
    else
    {
        public import ldc.gccbuiltins_x86;
        enum LDC_with_ARM32 = false;
        enum LDC_with_ARM64 = false;
        enum LDC_with_SSE1 = __traits(targetHasFeature, "sse");
        enum LDC_with_SSE2 = __traits(targetHasFeature, "sse2");
        enum LDC_with_SSE3 = __traits(targetHasFeature, "sse3");
        enum LDC_with_SSSE3 = __traits(targetHasFeature, "ssse3");
        enum LDC_with_SSE41 = __traits(targetHasFeature, "sse4.1");
        enum LDC_with_SSE42 = __traits(targetHasFeature, "sse4.2");
        enum LDC_with_AVX = __traits(targetHasFeature, "avx");
        enum LDC_with_AVX2 = __traits(targetHasFeature, "avx2");
        enum LDC_with_SHA = __traits(targetHasFeature, "sha");
    }
}
else
{
    // Not LDC.
    enum LDC_with_ARM32 = false;
    enum LDC_with_ARM64 = false;
    enum LDC_with_SSE1 = false;
    enum LDC_with_SSE2 = false;
    enum LDC_with_SSE3 = false;
    enum LDC_with_SSSE3 = false;
    enum LDC_with_SSE41 = false;
    enum LDC_with_SSE42 = false;
    enum LDC_with_AVX = false;
    enum LDC_with_AVX2 = false;
    enum LDC_with_SHA = false;
}

// True for LDC on any ARM target, 32-bit or 64-bit.
enum LDC_with_ARM = LDC_with_ARM32 | LDC_with_ARM64;

// DMD_with_XXX: inline-assembly availability flags for DMD.
version(DigitalMars)
{
    version(D_InlineAsm_X86)
        enum DMD_with_asm = true;
    else version(D_InlineAsm_X86_64)
        enum DMD_with_asm = true;
    else
        enum DMD_with_asm = false;

    version(D_InlineAsm_X86)
        enum DMD_with_32bit_asm = DMD_with_asm; // sometimes you want a 32-bit DMD only solution
    else
        enum DMD_with_32bit_asm = false;

    version (D_SIMD)
        enum DMD_with_DSIMD = !SSESizedVectorsAreEmulated;
    else
        enum DMD_with_DSIMD = false;
}
else
{
    enum DMD_with_asm = false;
    enum DMD_with_32bit_asm = false;
    enum DMD_with_DSIMD = false;
}

// Accessors for the ARM floating-point control/status register, whose
// rounding-mode field is used below to emulate x86 MXCSR-driven rounding.
static if (LDC_with_ARM32)
{
    /// Reads the ARM32 FPSCR register.
    package uint arm_get_fpcr() nothrow @nogc @trusted
    {
        return __builtin_arm_get_fpscr();
    }

    /// Writes the ARM32 FPSCR register.
    package void arm_set_fpcr(uint cw) nothrow @nogc @trusted
    {
        __builtin_arm_set_fpscr(cw);
    }
}

static if (LDC_with_ARM64)
{
    pragma(LDC_intrinsic, "llvm.aarch64.get.fpcr")
    long __builtin_aarch64_get_fpcr() pure nothrow @nogc @safe;

    /// Reads the AArch64 FPCR register (via inline asm, see note below).
    package uint arm_get_fpcr() pure nothrow @nogc @trusted
    {
        // LLVM intrinsic "llvm.aarch64.get.fpcr" seems buggy and doesn't return FPCR
        return __asm!uint("mrs $0, fpcr", "=r");
    }

    /// Writes the AArch64 FPCR register, preserving x2 around the transfer.
    package void arm_set_fpcr(uint cw) nothrow @nogc @trusted
    {
        // Note: there doesn't seem to be an intrinsic in LLVM to set FPCR.
        long save_x2;
        __asm!void("str x2, $1 \n" ~
                   "ldr w2, $0 \n" ~
                   "msr fpcr, x2 \n" ~
                   "ldr x2, $1 " , "m,m", cw, &save_x2);
    }
}


// Bit masks/values for the rounding-mode field of the ARM fpcr/fpscr register
// (compared against `arm_get_fpcr() & _MM_ROUND_MASK_ARM` below).
// For internal use only, since public API deals with a x86 semantic emulation
enum uint _MM_ROUND_NEAREST_ARM     = 0x00000000;
enum uint _MM_ROUND_DOWN_ARM        = 0x00800000;
enum uint _MM_ROUND_UP_ARM          = 0x00400000;
enum uint _MM_ROUND_TOWARD_ZERO_ARM = 0x00C00000;
enum uint _MM_ROUND_MASK_ARM        = 0x00C00000;
enum uint _MM_FLUSH_ZERO_MASK_ARM   = 0x01000000;


//
// <ROUNDING>
//
// Why is that there? For DMD, we cannot use rint because _MM_SET_ROUNDING_MODE
// doesn't change the FPU rounding mode, and isn't expected to do so.
// So we devised these rounding function to help having consistent rounding between
// LDC and DMD. It's important that DMD uses what is in MXCSR to round.
//
// Note: There is no MXCSR in ARM. But there is fpcr/fpscr that implements similar
// functionality.
// https://developer.arm.com/documentation/dui0068/b/vector-floating-point-programming/vfp-system-registers/fpscr--the-floating-point-status-and-control-register
// We use fpcr/fpscr since it's thread-local, so we can emulate those x86 conversion albeit slowly.

/// Converts `value` to `int`, rounding according to the current MXCSR rounding
/// mode on x86 (like the CVTSS2SI instruction does), or according to the
/// fpcr/fpscr rounding field on ARM.
int convertFloatToInt32UsingMXCSR(float value) @trusted
{
    int result;
    version(GNU)
    {
        asm pure nothrow @nogc @trusted
        {
            "cvtss2si %1, %0\n": "=r"(result) : "x" (value);
        }
    }
    else static if (LDC_with_ARM32)
    {
        // TODO: this is a bug, it won't preserve registers when optimized
        result = __asm!int(`vldr s2, $1
                            vcvtr.s32.f32 s2, s2
                            vmov $0, s2`, "=r,m", value);
    }
    else static if (LDC_with_ARM64)
    {
        // Get current rounding mode.
        uint fpscr = arm_get_fpcr();

        // Pick the NEON convert intrinsic matching that rounding mode
        // (same mapping as the llvm_round/llvm_floor/llvm_ceil one below).
        switch(fpscr & _MM_ROUND_MASK_ARM)
        {
            default:
            case _MM_ROUND_NEAREST_ARM:     result = vcvtns_s32_f32(value); break;
            case _MM_ROUND_DOWN_ARM:        result = vcvtms_s32_f32(value); break;
            case _MM_ROUND_UP_ARM:          result = vcvtps_s32_f32(value); break;
            case _MM_ROUND_TOWARD_ZERO_ARM: result = vcvts_s32_f32(value); break;
        }
    }
    else
    {
        asm pure nothrow @nogc @trusted
        {
            cvtss2si EAX, value;
            mov result, EAX;
        }
    }
    return result;
}

/// Converts `value` to `int`, rounding according to the current MXCSR rounding
/// mode (like the x86 CVTSD2SI instruction does).
int convertDoubleToInt32UsingMXCSR(double value) @trusted
{
    int result;
    version(GNU)
    {
        asm pure nothrow @nogc @trusted
        {
            "cvtsd2si %1, %0\n": "=r"(result) : "x" (value);
        }
    }
    else static if (LDC_with_ARM32)
    {
        // TODO: bug, doesn't preserve registers
        result = __asm!int(`vldr d2, $1
                            vcvtr.s32.f64 s2, d2
                            vmov $0, s2`, "=r,m", value);
    }
    else static if (LDC_with_ARM64)
    {
        // Get current rounding mode.
        uint fpscr = arm_get_fpcr();

        switch(fpscr & _MM_ROUND_MASK_ARM)
        {
            default:
            case _MM_ROUND_NEAREST_ARM:     result = vcvtns_s32_f64(value); break;
            case _MM_ROUND_DOWN_ARM:        result = vcvtms_s32_f64(value); break;
            case _MM_ROUND_UP_ARM:          result = vcvtps_s32_f64(value); break;
            case _MM_ROUND_TOWARD_ZERO_ARM: result = vcvts_s32_f64(value); break;
        }
    }
    else
    {
        asm pure nothrow @nogc @trusted
        {
            cvtsd2si EAX, value;
            mov result, EAX;
        }
    }
    return result;
}

/// Converts `value` to `long`, rounding according to the current MXCSR rounding
/// mode (64-bit CVTSS2SI semantics).
long convertFloatToInt64UsingMXCSR(float value) @trusted
{
    static if (LDC_with_ARM32)
    {
        // We have to resort to libc since 32-bit ARM
        // doesn't seem to have 64-bit registers.

        uint fpscr = arm_get_fpcr(); // Get current rounding mode.

        // Note: converting to double precision else rounding could be different for large integers
        double asDouble = value;

        switch(fpscr & _MM_ROUND_MASK_ARM)
        {
            default:
            case _MM_ROUND_NEAREST_ARM:     return cast(long)(llvm_round(asDouble));
            case _MM_ROUND_DOWN_ARM:        return cast(long)(llvm_floor(asDouble));
            case _MM_ROUND_UP_ARM:          return cast(long)(llvm_ceil(asDouble));
            case _MM_ROUND_TOWARD_ZERO_ARM: return cast(long)(asDouble);
        }
    }
    else static if (LDC_with_ARM64)
    {
        uint fpscr = arm_get_fpcr();

        switch(fpscr & _MM_ROUND_MASK_ARM)
        {
            default:
            case _MM_ROUND_NEAREST_ARM:     return vcvtns_s64_f32(value);
            case _MM_ROUND_DOWN_ARM:        return vcvtms_s64_f32(value);
            case _MM_ROUND_UP_ARM:          return vcvtps_s64_f32(value);
            case _MM_ROUND_TOWARD_ZERO_ARM: return vcvts_s64_f32(value);
        }
    }
    // 64-bit can use an SSE instruction
    else version(D_InlineAsm_X86_64)
    {
        long result;
        version(LDC) // work-around for " Data definition directives inside inline asm are not supported yet."
        {
            asm pure nothrow @nogc @trusted
            {
                movss XMM0, value;
                cvtss2si RAX, XMM0;
                mov result, RAX;
            }
        }
        else
        {
            asm pure nothrow @nogc @trusted
            {
                movss XMM0, value;
                db 0xf3; db 0x48; db 0x0f; db 0x2d; db 0xc0; // cvtss2si RAX, XMM0 (DMD refuses to emit)
                mov result, RAX;
            }
        }
        return result;
    }
    else version(D_InlineAsm_X86)
    {
        // In the case of 32-bit x86 there is no SSE2 way to convert FP to 64-bit int
        // This leads to an unfortunate FPU sequence in every C++ compiler.
        // See: https://godbolt.org/z/vZym77

        // Get current MXCSR rounding
        uint sseRounding;
        ushort savedFPUCW;
        ushort newFPUCW;
        long result;
        asm pure nothrow @nogc @trusted
        {
            stmxcsr sseRounding;
            fld value;
            fnstcw savedFPUCW;
            mov AX, savedFPUCW;
            and AX, 0xf3ff;                  // clear FPU rounding bits
            movzx ECX, word ptr sseRounding;
            and ECX, 0x6000;                 // only keep SSE rounding bits
            shr ECX, 3;
            or AX, CX;                       // make a new control word for FPU with SSE bits
            mov newFPUCW, AX;
            fldcw newFPUCW;
            fistp qword ptr result;          // convert, respecting MXCSR (but not other control word things)
            fldcw savedFPUCW;
        }
        return result;
    }
    else static if (GDC_with_x86)
    {
        version(X86_64) // 64-bit can just use the right instruction
        {
            static assert(GDC_with_SSE);
            __m128 A;
            A.ptr[0] = value;
            return __builtin_ia32_cvtss2si64 (A);
        }
        else version(X86) // 32-bit
        {
            // This is untested!
            // Same FPU control-word dance as the DMD D_InlineAsm_X86 path above,
            // expressed as GCC extended inline assembly.
            uint sseRounding;
            ushort savedFPUCW;
            ushort newFPUCW;
            long result;
            asm pure nothrow @nogc @trusted
            {
                "stmxcsr %1;\n" ~
                "fld %2;\n" ~
                "fnstcw %3;\n" ~
                "movw %3, %%ax;\n" ~
                "andw $0xf3ff, %%ax;\n" ~
                "movzwl %1, %%ecx;\n" ~
                "andl $0x6000, %%ecx;\n" ~
                "shrl $3, %%ecx;\n" ~
                "orw %%cx, %%ax\n" ~
                "movw %%ax, %4;\n" ~
                "fldcw %4;\n" ~
                "fistpll %0;\n" ~
                "fldcw %3;\n"
                : "=m"(result) // %0
                : "m" (sseRounding),
                  "f" (value),
                  "m" (savedFPUCW),
                  "m" (newFPUCW)
                : "eax", "ecx", "st";
            }
            return result;
        }
        else
            static assert(false);
    }
    else
        static assert(false);
}


///ditto
long convertDoubleToInt64UsingMXCSR(double value) @trusted
{
    static if (LDC_with_ARM32)
    {
        // We have to resort to libc since 32-bit ARM
        // doesn't seem to have 64-bit registers.
        uint fpscr = arm_get_fpcr(); // Get current rounding mode.

        switch(fpscr & _MM_ROUND_MASK_ARM)
        {
            default:
            case _MM_ROUND_NEAREST_ARM:     return cast(long)(llvm_round(value));
            case _MM_ROUND_DOWN_ARM:        return cast(long)(llvm_floor(value));
            case _MM_ROUND_UP_ARM:          return cast(long)(llvm_ceil(value));
            case _MM_ROUND_TOWARD_ZERO_ARM: return cast(long)(value);
        }
    }
    else static if (LDC_with_ARM64)
    {
        // Get current rounding mode.
        uint fpscr = arm_get_fpcr();

        switch(fpscr & _MM_ROUND_MASK_ARM)
        {
            default:
            case _MM_ROUND_NEAREST_ARM:     return vcvtns_s64_f64(value);
            case _MM_ROUND_DOWN_ARM:        return vcvtms_s64_f64(value);
            case _MM_ROUND_UP_ARM:          return vcvtps_s64_f64(value);
            case _MM_ROUND_TOWARD_ZERO_ARM: return vcvts_s64_f64(value);
        }
    }
    // 64-bit can use an SSE instruction
    else version(D_InlineAsm_X86_64)
    {
        long result;
        version(LDC) // work-around for "Data definition directives inside inline asm are not supported yet."
        {
            asm pure nothrow @nogc @trusted
            {
                movsd XMM0, value;
                cvtsd2si RAX, XMM0;
                mov result, RAX;
            }
        }
        else
        {
            asm pure nothrow @nogc @trusted
            {
                movsd XMM0, value;
                db 0xf2; db 0x48; db 0x0f; db 0x2d; db 0xc0; // cvtsd2si RAX, XMM0 (DMD refuses to emit)
                mov result, RAX;
            }
        }
        return result;
    }
    else version(D_InlineAsm_X86)
    {
        // In the case of 32-bit x86 there is no SSE2 way to convert FP to 64-bit int
        // This leads to an unfortunate FPU sequence in every C++ compiler.
        // See: https://godbolt.org/z/vZym77

        // Get current MXCSR rounding
        uint sseRounding;
        ushort savedFPUCW;
        ushort newFPUCW;
        long result;
        asm pure nothrow @nogc @trusted
        {
            stmxcsr sseRounding;
            fld value;
            fnstcw savedFPUCW;
            mov AX, savedFPUCW;
            and AX, 0xf3ff;                  // clear FPU rounding bits
            movzx ECX, word ptr sseRounding;
            and ECX, 0x6000;                 // only keep SSE rounding bits
            shr ECX, 3;
            or AX, CX;                       // make a new control word for FPU with SSE bits
            mov newFPUCW, AX;
            fldcw newFPUCW;
            fistp result;                    // convert, respecting MXCSR
            fldcw savedFPUCW;
        }
        return result;
    }
    else static if (GDC_with_x86)
    {
        version(X86_64)
        {
            static assert(GDC_with_SSE2);
            __m128d A;
            A.ptr[0] = value;
            return __builtin_ia32_cvtsd2si64 (A);
        }
        else
        {
            // This is untested!
            // Same FPU control-word dance as the DMD D_InlineAsm_X86 path above.
            uint sseRounding;
            ushort savedFPUCW;
            ushort newFPUCW;
            long result;
            asm pure nothrow @nogc @trusted
            {
                "stmxcsr %1;\n" ~
                "fld %2;\n" ~
                "fnstcw %3;\n" ~
                "movw %3, %%ax;\n" ~
                "andw $0xf3ff, %%ax;\n" ~
                "movzwl %1, %%ecx;\n" ~
                "andl $0x6000, %%ecx;\n" ~
                "shrl $3, %%ecx;\n" ~
                "orw %%cx, %%ax\n" ~
                "movw %%ax, %4;\n" ~
                "fldcw %4;\n" ~
                "fistpll %0;\n" ~
                "fldcw %3;\n"
                : "=m"(result) // %0
                : "m" (sseRounding),
                  "t" (value),
                  "m" (savedFPUCW),
                  "m" (newFPUCW)
                : "eax", "ecx", "st";
            }
            return result;
        }
    }
    else
        static assert(false);
}

//
// </ROUNDING>
//


// using the Intel terminology here

/// Clamps a signed 16-bit value into the signed 8-bit range [-128, 127].
byte saturateSignedWordToSignedByte(short value) pure @safe
{
    if (value > 127) value = 127;
    if (value < -128) value = -128;
    return cast(byte) value;
}

/// Clamps a signed 16-bit value into the unsigned 8-bit range [0, 255].
ubyte saturateSignedWordToUnsignedByte(short value) pure @safe
{
    if (value > 255) value = 255;
    if (value < 0) value = 0;
    return cast(ubyte) value;
}

/// Clamps a signed 32-bit value into the signed 16-bit range [-32768, 32767].
short saturateSignedIntToSignedShort(int value) pure @safe
{
    if (value > 32767) value = 32767;
    if (value < -32768) value = -32768;
    return cast(short) value;
}

/// Clamps a signed 32-bit value into the unsigned 16-bit range [0, 65535].
ushort saturateSignedIntToUnsignedShort(int value) pure @safe
{
    if (value > 65535) value = 65535;
    if (value < 0) value = 0;
    return cast(ushort) value;
}

unittest // test saturate operations
{
    assert( saturateSignedWordToSignedByte(32000) == 127);
    assert( saturateSignedWordToUnsignedByte(32000) == 255);
    assert( saturateSignedWordToSignedByte(-4000) == -128);
    assert( saturateSignedWordToUnsignedByte(-4000) == 0);
    assert( saturateSignedIntToSignedShort(32768) == 32767);
    assert( saturateSignedIntToUnsignedShort(32768) == 32768);
    assert( saturateSignedIntToSignedShort(-32769) == -32768);
    assert( saturateSignedIntToUnsignedShort(-32769) == 0);
}

version(unittest)
{
    // This is just for debugging tests
    import core.stdc.stdio: printf;

    // printing vectors for implementation
    // Note: you can override `pure` within a `debug` clause

    /// Prints a __m64 as one 64-bit integer.
    void _mm_print_pi64(__m64 v) @trusted
    {
        long1 vl = cast(long1)v;
        printf("%lld\n", vl.array[0]);
    }

    /// Prints a __m64 as two 32-bit integers.
    void _mm_print_pi32(__m64 v) @trusted
    {
        int[2] C = (cast(int2)v).array;
        printf("%d %d\n", C[0], C[1]);
    }

    /// Prints a __m64 as four 16-bit integers.
    void _mm_print_pi16(__m64 v) @trusted
    {
        short[4] C = (cast(short4)v).array;
        printf("%d %d %d %d\n", C[0], C[1], C[2], C[3]);
    }

    /// Prints a __m64 as eight 8-bit integers.
    void _mm_print_pi8(__m64 v) @trusted
    {
        byte[8] C = (cast(byte8)v).array;
        printf("%d %d %d %d %d %d %d %d\n",
               C[0], C[1], C[2], C[3], C[4], C[5], C[6], C[7]);
    }

    /// Prints a __m128i as two 64-bit integers.
    void _mm_print_epi64(__m128i v) @trusted
    {
        long2 vl = cast(long2)v;
        printf("%lld %lld\n", vl.array[0], vl.array[1]);
    }

    /// Prints a __m128i as four 32-bit integers.
    void _mm_print_epi32(__m128i v) @trusted
    {
        printf("%d %d %d %d\n",
               v.array[0], v.array[1], v.array[2], v.array[3]);
    }

    /// Prints a __m128i as eight 16-bit integers.
    void _mm_print_epi16(__m128i v) @trusted
    {
        short[8] C = (cast(short8)v).array;
        printf("%d %d %d %d %d %d %d %d\n",
               C[0], C[1], C[2], C[3], C[4], C[5], C[6], C[7]);
    }

    /// Prints a __m128i as sixteen 8-bit integers.
    void _mm_print_epi8(__m128i v) @trusted
    {
        byte[16] C = (cast(byte16)v).array;
        printf("%d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d\n",
               C[0], C[1], C[2], C[3], C[4], C[5], C[6], C[7], C[8], C[9], C[10], C[11], C[12], C[13], C[14], C[15]);
    }

    /// Prints a __m128 as four floats.
    void _mm_print_ps(__m128 v) @trusted
    {
        float[4] C = (cast(float4)v).array;
        printf("%f %f %f %f\n", C[0], C[1], C[2], C[3]);
    }

    /// Prints a __m128d as two doubles.
    void _mm_print_pd(__m128d v) @trusted
    {
        double[2] C = (cast(double2)v).array;
        printf("%f %f\n", C[0], C[1]);
    }
}


//
// <FLOATING-POINT COMPARISONS>
//
// Note: `ldc.simd` cannot express all nuances of FP comparisons, so we
// need different IR generation.

/// FP comparison predicates, named after the LLVM `fcmp` predicates.
/// Used both to build inline IR (LDC) and for scalar emulation (other compilers).
enum FPComparison
{
    oeq, // ordered and equal
    ogt, // ordered and greater than
    oge, // ordered and greater than or equal
    olt, // ordered and less than
    ole, // ordered and less than or equal
    one, // ordered and not equal
    ord, // ordered (no nans)
    ueq, // unordered or equal
    ugt, // unordered or greater than ("nle")
    uge, // unordered or greater than or equal ("nlt")
    ult, // unordered or less than ("nge")
    ule, // unordered or less than or equal ("ngt")
    une, // unordered or not equal ("neq")
    uno, // unordered (either nans)
}

// Maps each FPComparison to its `fcmp` predicate name, for IR string building.
private static immutable string[FPComparison.max+1] FPComparisonToString =
[
    "oeq",
    "ogt",
    "oge",
    "olt",
    "ole",
    "one",
    "ord",
    "ueq",
    "ugt",
    "uge",
    "ult",
    "ule",
    "une",
    "uno",
];

// Individual float comparison: returns -1 for true or 0 for false.
// Useful for DMD and testing
private bool compareFloat(T)(FPComparison comparison, T a, T b) pure @safe
{
    // A comparison is "unordered" when at least one operand is NaN.
    bool unordered = isnan(a) || isnan(b);
    final switch(comparison) with(FPComparison)
    {
        case oeq: return a == b;
        case ogt: return a > b;
        case oge: return a >= b;
        case olt: return a < b;
        case ole: return a <= b;
        case one: return !unordered && (a != b); // NaN with != always yields true
        case ord: return !unordered;
        case ueq: return unordered || (a == b);
        case ugt: return unordered || (a > b);
        case uge: return unordered || (a >= b);
        case ult: return unordered || (a < b);
        case ule: return unordered || (a <= b);
        case une: return (a != b); // NaN with != always yields true
        case uno: return unordered;
    }
}

version(LDC)
{
    /// Provides packed float comparisons
    package int4 cmpps(FPComparison comparison)(float4 a, float4 b) pure @safe
    {
        // fcmp yields <4 x i1>; sign-extend to the all-ones/all-zeroes
        // per-lane mask that the x86 CMPPS instruction produces.
        enum ir = `
            %cmp = fcmp `~ FPComparisonToString[comparison] ~` <4 x float> %0, %1
            %r = sext <4 x i1> %cmp to <4 x i32>
            ret <4 x i32> %r`;

        return LDCInlineIR!(ir, int4, float4, float4)(a, b);
    }

    /// Provides packed double comparisons
    package long2 cmppd(FPComparison comparison)(double2 a, double2 b) pure @safe
    {
        enum ir = `
            %cmp = fcmp `~ FPComparisonToString[comparison] ~` <2 x double> %0, %1
            %r = sext <2 x i1> %cmp to <2 x i64>
            ret <2 x i64> %r`;

        return LDCInlineIR!(ir, long2, double2, double2)(a, b);
    }

    /// CMPSS-style comparisons
    /// clang implement it through x86 intrinsics, it is possible with IR alone
    /// but leads to less optimal code.
    /// PERF: try to implement it with __builtin_ia32_cmpss and immediate 0 to 7.
    /// Not that simple.
815 package float4 cmpss(FPComparison comparison)(float4 a, float4 b) pure @safe 816 { 817 /* 818 enum ubyte predicateNumber = FPComparisonToX86Predicate[comparison]; 819 enum bool invertOp = (predicateNumber & 0x80) != 0; 820 static if(invertOp) 821 return __builtin_ia32_cmpsd(b, a, predicateNumber & 0x7f); 822 else 823 return __builtin_ia32_cmpsd(a, b, predicateNumber & 0x7f); 824 */ 825 enum ir = ` 826 %cmp = fcmp `~ FPComparisonToString[comparison] ~` float %0, %1 827 %r = sext i1 %cmp to i32 828 %r2 = bitcast i32 %r to float 829 ret float %r2`; 830 831 float4 r = a; 832 r[0] = LDCInlineIR!(ir, float, float, float)(a[0], b[0]); 833 return r; 834 } 835 836 /// CMPSD-style comparisons 837 /// clang implement it through x86 intrinsics, it is possible with IR alone 838 /// but leads to less optimal code. 839 /// PERF: try to implement it with __builtin_ia32_cmpsd and immediate 0 to 7. 840 /// Not that simple. 841 package double2 cmpsd(FPComparison comparison)(double2 a, double2 b) pure @safe 842 { 843 enum ir = ` 844 %cmp = fcmp `~ FPComparisonToString[comparison] ~` double %0, %1 845 %r = sext i1 %cmp to i64 846 %r2 = bitcast i64 %r to double 847 ret double %r2`; 848 849 double2 r = a; 850 r[0] = LDCInlineIR!(ir, double, double, double)(a[0], b[0]); 851 return r; 852 } 853 } 854 else 855 { 856 /// Provides packed float comparisons 857 package int4 cmpps(FPComparison comparison)(float4 a, float4 b) pure @trusted 858 { 859 int4 result; 860 foreach(i; 0..4) 861 { 862 result.ptr[i] = compareFloat!float(comparison, a.array[i], b.array[i]) ? -1 : 0; 863 } 864 return result; 865 } 866 867 /// Provides packed double comparisons 868 package long2 cmppd(FPComparison comparison)(double2 a, double2 b) pure @trusted 869 { 870 long2 result; 871 foreach(i; 0..2) 872 { 873 result.ptr[i] = compareFloat!double(comparison, a.array[i], b.array[i]) ? 
-1 : 0; 874 } 875 return result; 876 } 877 878 /// Provides CMPSS-style comparison 879 package float4 cmpss(FPComparison comparison)(float4 a, float4 b) pure @trusted 880 { 881 int4 result = cast(int4)a; 882 result.ptr[0] = compareFloat!float(comparison, a.array[0], b.array[0]) ? -1 : 0; 883 return cast(float4)result; 884 } 885 886 /// Provides CMPSD-style comparison 887 package double2 cmpsd(FPComparison comparison)(double2 a, double2 b) pure @trusted 888 { 889 long2 result = cast(long2)a; 890 result.ptr[0] = compareFloat!double(comparison, a.array[0], b.array[0]) ? -1 : 0; 891 return cast(double2)result; 892 } 893 } 894 unittest // cmpps 895 { 896 // Check all comparison type is working 897 float4 A = [1, 3, 5, float.nan]; 898 float4 B = [2, 3, 4, 5]; 899 900 int4 result_oeq = cmpps!(FPComparison.oeq)(A, B); 901 int4 result_ogt = cmpps!(FPComparison.ogt)(A, B); 902 int4 result_oge = cmpps!(FPComparison.oge)(A, B); 903 int4 result_olt = cmpps!(FPComparison.olt)(A, B); 904 int4 result_ole = cmpps!(FPComparison.ole)(A, B); 905 int4 result_one = cmpps!(FPComparison.one)(A, B); 906 int4 result_ord = cmpps!(FPComparison.ord)(A, B); 907 int4 result_ueq = cmpps!(FPComparison.ueq)(A, B); 908 int4 result_ugt = cmpps!(FPComparison.ugt)(A, B); 909 int4 result_uge = cmpps!(FPComparison.uge)(A, B); 910 int4 result_ult = cmpps!(FPComparison.ult)(A, B); 911 int4 result_ule = cmpps!(FPComparison.ule)(A, B); 912 int4 result_une = cmpps!(FPComparison.une)(A, B); 913 int4 result_uno = cmpps!(FPComparison.uno)(A, B); 914 915 static immutable int[4] correct_oeq = [ 0,-1, 0, 0]; 916 static immutable int[4] correct_ogt = [ 0, 0,-1, 0]; 917 static immutable int[4] correct_oge = [ 0,-1,-1, 0]; 918 static immutable int[4] correct_olt = [-1, 0, 0, 0]; 919 static immutable int[4] correct_ole = [-1,-1, 0, 0]; 920 static immutable int[4] correct_one = [-1, 0,-1, 0]; 921 static immutable int[4] correct_ord = [-1,-1,-1, 0]; 922 static immutable int[4] correct_ueq = [ 0,-1, 0,-1]; 923 static 
immutable int[4] correct_ugt = [ 0, 0,-1,-1]; 924 static immutable int[4] correct_uge = [ 0,-1,-1,-1]; 925 static immutable int[4] correct_ult = [-1, 0, 0,-1]; 926 static immutable int[4] correct_ule = [-1,-1, 0,-1]; 927 static immutable int[4] correct_une = [-1, 0,-1,-1]; 928 static immutable int[4] correct_uno = [ 0, 0, 0,-1]; 929 930 assert(result_oeq.array == correct_oeq); 931 assert(result_ogt.array == correct_ogt); 932 assert(result_oge.array == correct_oge); 933 assert(result_olt.array == correct_olt); 934 assert(result_ole.array == correct_ole); 935 assert(result_one.array == correct_one); 936 assert(result_ord.array == correct_ord); 937 assert(result_ueq.array == correct_ueq); 938 assert(result_ugt.array == correct_ugt); 939 assert(result_uge.array == correct_uge); 940 assert(result_ult.array == correct_ult); 941 assert(result_ule.array == correct_ule); 942 assert(result_une.array == correct_une); 943 assert(result_uno.array == correct_uno); 944 } 945 unittest 946 { 947 double2 a = [1, 3]; 948 double2 b = [2, 3]; 949 long2 c = cmppd!(FPComparison.ult)(a, b); 950 static immutable long[2] correct = [cast(long)(-1), 0]; 951 assert(c.array == correct); 952 } 953 unittest // cmpss 954 { 955 void testComparison(FPComparison comparison)(float4 A, float4 B) 956 { 957 float4 result = cmpss!comparison(A, B); 958 int4 iresult = cast(int4)result; 959 int expected = compareFloat!float(comparison, A.array[0], B.array[0]) ? 
-1 : 0; 960 assert(iresult.array[0] == expected); 961 assert(result.array[1] == A.array[1]); 962 assert(result.array[2] == A.array[2]); 963 assert(result.array[3] == A.array[3]); 964 } 965 966 // Check all comparison type is working 967 float4 A = [1, 3, 5, 6]; 968 float4 B = [2, 3, 4, 5]; 969 float4 C = [float.nan, 3, 4, 5]; 970 971 testComparison!(FPComparison.oeq)(A, B); 972 testComparison!(FPComparison.oeq)(A, C); 973 testComparison!(FPComparison.ogt)(A, B); 974 testComparison!(FPComparison.ogt)(A, C); 975 testComparison!(FPComparison.oge)(A, B); 976 testComparison!(FPComparison.oge)(A, C); 977 testComparison!(FPComparison.olt)(A, B); 978 testComparison!(FPComparison.olt)(A, C); 979 testComparison!(FPComparison.ole)(A, B); 980 testComparison!(FPComparison.ole)(A, C); 981 testComparison!(FPComparison.one)(A, B); 982 testComparison!(FPComparison.one)(A, C); 983 testComparison!(FPComparison.ord)(A, B); 984 testComparison!(FPComparison.ord)(A, C); 985 testComparison!(FPComparison.ueq)(A, B); 986 testComparison!(FPComparison.ueq)(A, C); 987 testComparison!(FPComparison.ugt)(A, B); 988 testComparison!(FPComparison.ugt)(A, C); 989 testComparison!(FPComparison.uge)(A, B); 990 testComparison!(FPComparison.uge)(A, C); 991 testComparison!(FPComparison.ult)(A, B); 992 testComparison!(FPComparison.ult)(A, C); 993 testComparison!(FPComparison.ule)(A, B); 994 testComparison!(FPComparison.ule)(A, C); 995 testComparison!(FPComparison.une)(A, B); 996 testComparison!(FPComparison.une)(A, C); 997 testComparison!(FPComparison.uno)(A, B); 998 testComparison!(FPComparison.uno)(A, C); 999 } 1000 unittest // cmpsd 1001 { 1002 void testComparison(FPComparison comparison)(double2 A, double2 B) 1003 { 1004 double2 result = cmpsd!comparison(A, B); 1005 long2 iresult = cast(long2)result; 1006 long expected = compareFloat!double(comparison, A.array[0], B.array[0]) ? 
-1 : 0;
        assert(iresult.array[0] == expected);
        assert(result.array[1] == A.array[1]);
    }

    // Check all comparison type is working
    double2 A = [1, 3];
    double2 B = [2, 4];
    double2 C = [double.nan, 5];

    testComparison!(FPComparison.oeq)(A, B);
    testComparison!(FPComparison.oeq)(A, C);
    testComparison!(FPComparison.ogt)(A, B);
    testComparison!(FPComparison.ogt)(A, C);
    testComparison!(FPComparison.oge)(A, B);
    testComparison!(FPComparison.oge)(A, C);
    testComparison!(FPComparison.olt)(A, B);
    testComparison!(FPComparison.olt)(A, C);
    testComparison!(FPComparison.ole)(A, B);
    testComparison!(FPComparison.ole)(A, C);
    testComparison!(FPComparison.one)(A, B);
    testComparison!(FPComparison.one)(A, C);
    testComparison!(FPComparison.ord)(A, B);
    testComparison!(FPComparison.ord)(A, C);
    testComparison!(FPComparison.ueq)(A, B);
    testComparison!(FPComparison.ueq)(A, C);
    testComparison!(FPComparison.ugt)(A, B);
    testComparison!(FPComparison.ugt)(A, C);
    testComparison!(FPComparison.uge)(A, B);
    testComparison!(FPComparison.uge)(A, C);
    testComparison!(FPComparison.ult)(A, B);
    testComparison!(FPComparison.ult)(A, C);
    testComparison!(FPComparison.ule)(A, B);
    testComparison!(FPComparison.ule)(A, C);
    testComparison!(FPComparison.une)(A, B);
    testComparison!(FPComparison.une)(A, C);
    testComparison!(FPComparison.uno)(A, B);
    testComparison!(FPComparison.uno)(A, C);
}

//
// </FLOATING-POINT COMPARISONS>
//


// Truncate a __m128i to its low 64 bits, returned as a __m64.
__m64 to_m64(__m128i a) pure @trusted
{
    long2 la = cast(long2)a;
    long1 r = la.array[0];
    return r;
}

// Zero-extend a __m64 to a __m128i: low 64 bits copied, high 64 bits cleared.
__m128i to_m128i(__m64 a) pure @trusted
{
    /* Not sufficient to avoid https://issues.dlang.org/show_bug.cgi?id=21474

    version(DigitalMars) // Workaround for https://issues.dlang.org/show_bug.cgi?id=21474
    {
        long2 r = a.array[0];
        r.ptr[1] = 0;
        return cast(int4)r;
    }
    else */
    {
        // Initialize both lanes to zero first, then write the low lane,
        // so the upper 64 bits are guaranteed cleared.
        long2 r = [0, 0];
        r.ptr[0] = a.array[0];
        return cast(__m128i)r;
    }
}

// ADDITIONAL x86 INTRINSICS
// Absent from ldc.gccbuiltins_x86 for some reason, but needed.
// https://github.com/ldc-developers/llvm-project/blob/ldc-release/12.x/llvm/include/llvm/IR/IntrinsicsX86.td
static if (LDC_with_SSE41)
{
    // Byte-wise variable blend (PBLENDVB), bound directly to the LLVM x86 intrinsic.
    pragma(LDC_intrinsic, "llvm.x86.sse41.pblendvb")
    byte16 __builtin_ia32_pblendvb(byte16, byte16, byte16) pure @safe;
}

// SOME NEON INTRINSICS
// Emulating some x86 intrinsics needs access to a range of ARM intrinsics.
// Not in the public API but the simde project expose it all for the user to use.
// MAYDO: create a new neon.d module, for internal use only.
// MAYDO: port them to ARM32 so that ARM32 can be as fast as ARM64.
static if (LDC_with_ARM64)
{
    // VERY USEFUL LINK
    // https://github.com/ldc-developers/llvm-project/blob/ldc-release/11.x/llvm/include/llvm/IR/IntrinsicsAArch64.td
    // Also: https://developer.arm.com/architectures/instruction-sets/intrinsics/

    // Unsigned absolute difference, 16 x u8.
    pragma(LDC_intrinsic, "llvm.aarch64.neon.uabd.v16i8")
    byte16 vabdq_u8(byte16 a, byte16 b) pure @safe;

    // Lane-wise absolute value.
    pragma(LDC_intrinsic, "llvm.aarch64.neon.abs.v8i16")
    short8 vabsq_s16(short8 a) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.abs.v4i32")
    int4 vabsq_s32(int4 a) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.abs.v16i8")
    byte16 vabsq_s8(byte16 a) pure @safe;

    // Bitwise AND, implemented directly with D vector operators.
    byte8 vand_u8(byte8 a, byte8 b) pure @safe
    {
        return a & b;
    }

    // NOTE(review): unlike the sibling helpers, this one is not marked
    // `pure @safe` — looks like an oversight; confirm before changing.
    long2 vandq_s64(long2 a, long2 b)
    {
        return a & b;
    }

    // Bit clear: a AND (NOT b).
    long2 vbicq_s64(long2 a, long2 b) pure @safe
    {
        return a & ~b;
    }

    // Bitwise select: for each bit, take b where a is 1, c where a is 0.
    // (c ^ ((c ^ b) & a)) == (b & a) | (c & ~a), with one fewer operation.
    int4 vbslq_s32(int4 a, int4 b, int4 c) pure @safe
    {
        return c ^ ((c ^ b) & a);
    }

    // Bitwise select, byte lanes (same formula as vbslq_s32).
    byte16 vbslq_s8(byte16 a, byte16 b, byte16 c) pure @safe
    {
        return c ^ ((c ^ b) & a);
    }

    // Bitwise select, 64-bit lanes (same formula as vbslq_s32).
    long2 vbslq_s64(long2 a, long2 b, long2 c) pure @safe
    {
        return c ^ ((c ^ b) & a);
    }

    // Concatenate two 4 x i16 halves into one 8 x i16 vector (lo first).
    short8 vcombine_s16(short4 lo, short4 hi) pure @trusted
    {
        short8 r;
        r.ptr[0] = lo.array[0];
        r.ptr[1] = lo.array[1];
        r.ptr[2] = lo.array[2];
        r.ptr[3] = lo.array[3];
        r.ptr[4] = hi.array[0];
        r.ptr[5] = hi.array[1];
        r.ptr[6] = hi.array[2];
        r.ptr[7] = hi.array[3];
        return r;
    }

    // Concatenate two 2 x i32 halves into one 4 x i32 vector (lo first).
    int4 vcombine_s32(int2 lo, int2 hi) pure @trusted
    {
        int4 r;
        r.ptr[0] = lo.array[0];
        r.ptr[1] = lo.array[1];
        r.ptr[2] = hi.array[0];
        r.ptr[3] = hi.array[1];
        return r;
    }

    // Concatenate two 8 x i8 halves into one 16 x i8 vector (lo first).
    byte16 vcombine_s8(byte8 lo, byte8 hi) pure @trusted
    {
        byte16 r;
        r.ptr[0] = lo.array[0];
        r.ptr[1] = lo.array[1];
        r.ptr[2] = lo.array[2];
        r.ptr[3] = lo.array[3];
        r.ptr[4] = lo.array[4];
        r.ptr[5] = lo.array[5];
        r.ptr[6] = lo.array[6];
        r.ptr[7] = lo.array[7];
        r.ptr[8] = hi.array[0];
        r.ptr[9] = hi.array[1];
        r.ptr[10] = hi.array[2];
        r.ptr[11] = hi.array[3];
        r.ptr[12] = hi.array[4];
        r.ptr[13] = hi.array[5];
        r.ptr[14] = hi.array[6];
        r.ptr[15] = hi.array[7];
        return r;
    }

    // Same as vcombine_s16 (bitwise concatenation; signedness is irrelevant here).
    short8 vcombine_u16(short4 lo, short4 hi) pure @trusted
    {
        short8 r;
        r.ptr[0] = lo.array[0];
        r.ptr[1] = lo.array[1];
        r.ptr[2] = lo.array[2];
        r.ptr[3] = lo.array[3];
        r.ptr[4] = hi.array[0];
        r.ptr[5] = hi.array[1];
        r.ptr[6] = hi.array[2];
        r.ptr[7] = hi.array[3];
        return r;
    }


    // float4 => int4

    // fcvtm* = round toward minus infinity, fcvtn* = round to nearest,
    // fcvtp* = round toward plus infinity, fcvtz* = round toward zero.

    pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtms.v4i32.v4f32")
    int4 vcvtmq_s32_f32(float4 a) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtns.v4i32.v4f32")
    int4 vcvtnq_s32_f32(float4 a) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtps.v4i32.v4f32")
    int4 vcvtpq_s32_f32(float4 a) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtzs.v4i32.v4f32")
    int4 vcvtzq_s32_f32(float4 a) pure @safe;


    // double2 => long2

    pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtms.v2i64.v2f64")
    long2 vcvtmq_s64_f64(double2 a) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtns.v2i64.v2f64")
    long2 vcvtnq_s64_f64(double2 a) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtps.v2i64.v2f64")
    long2 vcvtpq_s64_f64(double2 a) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtzs.v2i64.v2f64")
    long2 vcvtzq_s64_f64(double2 a) pure @safe;

    // Scalar float/double => int/long conversions, same rounding-mode naming.

    pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtms.i32.f32")
    int vcvtms_s32_f32(float a) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtns.i32.f32")
    int vcvtns_s32_f32(float a) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtps.i32.f32")
    int vcvtps_s32_f32(float a) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtzs.i32.f32")
    int vcvts_s32_f32(float a) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtms.i32.f64")
    int vcvtms_s32_f64(double a) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtns.i32.f64")
    int vcvtns_s32_f64(double a) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtps.i32.f64")
    int vcvtps_s32_f64(double a) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtzs.i32.f64")
    int vcvts_s32_f64(double a) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtms.i64.f32")
    long vcvtms_s64_f32(float a) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtns.i64.f32")
    long vcvtns_s64_f32(float a) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtps.i64.f32")
    long vcvtps_s64_f32(float a) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtzs.i64.f32")
    long vcvts_s64_f32(float a) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtms.i64.f64")
    long vcvtms_s64_f64(double a) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtns.i64.f64")
    long vcvtns_s64_f64(double a) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtps.i64.f64")
    long vcvtps_s64_f64(double a) pure @safe; // Note: technically should be named vcvtpd_s64_f64

    pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtzs.i64.f64")
    long vcvts_s64_f64(double a) pure @safe;

    // Broadcast a scalar into both 64-bit lanes.
    long2 vdupq_n_s64(long value) pure @safe
    {
        long2 r;
        r = value;
        return r;
    }

    // Extract the upper 4 lanes of an 8 x i16 vector.
    short4 vget_high_s16(short8 a) pure @trusted
    {
        short4 r;
        r.ptr[0] = a.array[4];
        r.ptr[1] = a.array[5];
        r.ptr[2] = a.array[6];
        r.ptr[3] = a.array[7];
        return r;
    }

    // Extract the upper 2 lanes of a 4 x i32 vector.
    int2 vget_high_s32(int4 a) pure @trusted
    {
        int2 r;
        r.ptr[0] = a.array[2];
        r.ptr[1] = a.array[3];
        return r;
    }

    // Extract the upper 8 lanes of a 16 x i8 vector.
    byte8 vget_high_u8(byte16 a) pure @trusted
    {
        byte8 r;
        r.ptr[0] = a.array[8];
        r.ptr[1] = a.array[9];
        r.ptr[2] = a.array[10];
        r.ptr[3] = a.array[11];
        r.ptr[4] = a.array[12];
        r.ptr[5] = a.array[13];
        r.ptr[6] = a.array[14];
        r.ptr[7] = a.array[15];
        return r;
    }

    // Extract the lower 4 lanes of an 8 x i16 vector.
    short4 vget_low_s16(short8 a) pure @trusted
    {
        short4 r;
        r.ptr[0] = a.array[0];
        r.ptr[1] = a.array[1];
        r.ptr[2] = a.array[2];
        r.ptr[3] = a.array[3];
        return r;
    }

    // Extract the lower 2 lanes of a 4 x i32 vector.
    int2 vget_low_s32(int4 a) pure @trusted
    {
        int2 r;
        r.ptr[0] = a.array[0];
        r.ptr[1] = a.array[1];
        return r;
    }

    // Extract the lower 8 lanes of a 16 x i8 vector.
    byte8 vget_low_u8(byte16 a) pure @trusted
    {
        byte8 r;
        r.ptr[0] = a.array[0];
        r.ptr[1] = a.array[1];
        r.ptr[2] = a.array[2];
        r.ptr[3] = a.array[3];
        r.ptr[4] = a.array[4];
        r.ptr[5] = a.array[5];
        r.ptr[6] = a.array[6];
        r.ptr[7] = a.array[7];
        return r;
    }

    // Read one 64-bit lane. `lane` is not bounds-checked beyond array indexing.
    long vgetq_lane_s64(long2 v, const int lane) pure @safe
    {
        return v.array[lane];
    }

    // Lane-wise signed maximum, 8 x i16.
    pragma(LDC_intrinsic, "llvm.aarch64.neon.smax.v8i16")
    short8 vmaxq_s16(short8 a, short8 b) pure @safe;

    // Lane-wise signed maximum, 4 x i32 (scalar fallback, not an intrinsic).
    // NOTE(review): not marked `pure @safe` unlike the sibling helpers — confirm.
    int4 vmaxq_s32(int4 a, int4 b)
    {
        int4 r;
        r[0] = a[0] >= b[0] ? a[0] : b[0];
        r[1] = a[1] >= b[1] ? a[1] : b[1];
        r[2] = a[2] >= b[2] ? a[2] : b[2];
        r[3] = a[3] >= b[3] ? a[3] : b[3];
        return r;
    }

    // Lane-wise signed minimum, 8 x i16.
    pragma(LDC_intrinsic, "llvm.aarch64.neon.smin.v8i16")
    short8 vminq_s16(short8 a, short8 b) pure @safe;

    // Narrow each 64-bit lane to 32 bits by truncation.
    int2 vmovn_s64(long2 a) pure @trusted
    {
        int2 r;
        r.ptr[0] = cast(int)(a.array[0]);
        r.ptr[1] = cast(int)(a.array[1]);
        return r;
    }

    // Widening multiply: i16 x i16 products stored in i32 lanes.
    int4 vmull_s16(short4 a, short4 b) pure @trusted
    {
        int4 r;
        r.ptr[0] = a.array[0] * b.array[0];
        r.ptr[1] = a.array[1] * b.array[1];
        r.ptr[2] = a.array[2] * b.array[2];
        r.ptr[3] = a.array[3] * b.array[3];
        return r;
    }

    // Widening multiply: i32 x i32 -> i64.
    pragma(LDC_intrinsic, "llvm.aarch64.neon.smull.v2i64")
    long2 vmull_s32(int2 a, int2 b) pure @safe;

    // Pairwise add family (addp / uaddlp).
    pragma(LDC_intrinsic, "llvm.aarch64.neon.addp.v4i16")
    short4 vpadd_s16(short4 a, short4 b) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.addp.v2i32")
    int2 vpadd_s32(int2 a, int2 b) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.addp.v8i8")
    byte8 vpadd_u8(byte8 a, byte8 b) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.uaddlp.v8i16.v16i8")
    short8 vpaddlq_u8 (byte16 a) pure @safe;

    static if(__VERSION__ >= 2088) // LDC 1.18 start using LLVM9 who changes the name of the builtin
    {
        pragma(LDC_intrinsic, "llvm.aarch64.neon.faddp.v4f32")
        float4 vpaddq_f32(float4 a, float4 b) pure @safe;
    }
    else
    {
        pragma(LDC_intrinsic, "llvm.aarch64.neon.addp.v4f32")
        float4 vpaddq_f32(float4 a, float4 b) pure @safe;
    }

    pragma(LDC_intrinsic, "llvm.aarch64.neon.addp.v8i16")
    short8 vpaddq_s16(short8 a, short8 b) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.addp.v16i8")
    byte16 vpaddq_s8(byte16 a, byte16 b) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.addp.v4i32")
    int4 vpaddq_s32(int4 a, int4 b) pure @safe;

    // Saturating add / narrow / subtract family.
    pragma(LDC_intrinsic, "llvm.aarch64.neon.sqadd.v4i16")
    short4 vqadd_s16(short4 a, short4 b) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.sqadd.v8i16")
    short8 vqaddq_s16(short8 a, short8 b) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.sqxtn.v8i8")
    byte8 vqmovn_s16(short8 a) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.sqxtn.v4i16")
    short4 vqmovn_s32(int4 a) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.uqxtn.v4i16")
    short4 vqmovn_u32(int4 a) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.sqxtun.v8i8")
    byte8 vqmovun_s16(short8 a) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.sqsub.v4i16")
    short4 vqsub_s16(short4 a, short4 b) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.sqsub.v8i16")
    short8 vqsubq_s16(short8 a, short8 b) pure @safe;

    // Table lookup (TBL) with a single 16-byte table.
    pragma(LDC_intrinsic, "llvm.aarch64.neon.tbl1.v16i8")
    byte16 vqtbl1q_s8(byte16 t, byte16 idx) pure @safe;

    // Unsigned rounding halving add.
    pragma(LDC_intrinsic, "llvm.aarch64.neon.urhadd.v16i8")
    byte16 vrhadd_u8(byte16 a, byte16 b) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.urhadd.v8i16")
    short8 vrhadd_u16(short8 a, short8 b) pure @safe;

    // Rounding shift right and narrow.
    pragma(LDC_intrinsic, "llvm.aarch64.neon.rshrn.v4i16")
    short4 vrshrn_n_s32(int4 a, int n) pure @safe;

    // Lane-wise logical (unsigned) shift right.
    byte8 vshr_u8(byte8 a, byte8 b) pure @safe
    {
        return a >>> b;
    }

    // Lane-wise arithmetic shift right by the scalar amount r.
    byte16 vshrq_n_s8(byte16 a, byte r) pure @safe
    {
        a = a >> byte16(cast(byte)r);
        return a;
    }

    // Table lookup (TBL) returning the low 8 bytes.
    pragma(LDC_intrinsic, "llvm.aarch64.neon.tbl1.v8i8")
    byte8 vtbl1_s8(byte16 t, byte8 idx) pure @safe;
}

version(unittest)
{
    // Portable fabs for tests: clear the IEEE-754 sign bit through bit-punning
    // (the non-LDC path), or use the LLVM intrinsic on LDC.
    double abs_double(double x) @trusted
    {
        version(LDC)
            return llvm_fabs(x);
        else
        {
            long uf = *cast(long*)(&x);
            uf &= 0x7fffffff_ffffffff;
            return *cast(double*)(&uf);
        }
    }
}

// needed because in old GDC from travis, core.stdc.math.isnan isn't pure

// NaN test for float: exponent bits all ones AND a non-zero mantissa
// (an all-ones exponent with zero mantissa is infinity, not NaN).
bool isnan(float x) pure @trusted
{
    uint u = *cast(uint*)(&x);
    bool result = ((u & 0x7F800000) == 0x7F800000) && (u & 0x007FFFFF);
    return result;
}
unittest
{
    float x = float.nan;
    assert(isnan(x));

    x = 0;
    assert(!isnan(x));

    x = float.infinity;
    assert(!isnan(x));
}

// NaN test for double: same exponent/mantissa decomposition as the float version.
bool isnan(double x) pure @trusted
{
    ulong u = *cast(ulong*)(&x);
    return ((u & 0x7FF00000_00000000) == 0x7FF00000_00000000) && (u & 0x000FFFFF_FFFFFFFF);
}
unittest
{
    double x = double.nan;
    assert(isnan(x));

    x = 0;
    assert(!isnan(x));

    x = double.infinity;
    assert(!isnan(x));
}