/**
* Internal stuff only, do not import.
*
* Copyright: Copyright Guillaume Piolat 2016-2020, Stefanos Baziotis 2019.
* License:   $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
*/
module inteli.internals;

import inteli.types;

// The only math functions needed for intel-intrinsics
public import core.math: sqrt; // since it's an intrinsics
public import std.math: abs; // `fabs` is broken with GCC 4.9.2 on Linux 64-bit

package:
nothrow:
@nogc:


// Compile-time capability flags for GDC. Every branch defines the same five
// enums so that the rest of the library can `static if` on them unconditionally.
version(GNU)
{
    version (X86)
    {
        // For 32-bit x86, disable vector extensions with GDC.
        // It just doesn't work well.
        enum GDC_with_x86 = true;
        enum GDC_with_MMX = false;
        enum GDC_with_SSE = false;
        enum GDC_with_SSE2 = false;
        enum GDC_with_SSE3 = false;
    }
    else version (X86_64)
    {
        // GDC support uses extended inline assembly:
        //   https://gcc.gnu.org/onlinedocs/gcc/Extended-Asm.html        (general information and hints)
        //   https://gcc.gnu.org/onlinedocs/gcc/Simple-Constraints.html  (binding variables to registers)
        //   https://gcc.gnu.org/onlinedocs/gcc/Machine-Constraints.html (x86 specific register short names)

        public import core.simd;

        // NOTE: These intrinsics are not available in every i386 and x86_64 CPU.
        // For more info: https://gcc.gnu.org/onlinedocs/gcc-4.9.2/gcc/X86-Built-in-Functions.html
        public import gcc.builtins;

        enum GDC_with_x86 = true;
        enum GDC_with_MMX = true;   // We don't have a way to detect that at CT, but we assume it's there
        enum GDC_with_SSE = true;   // We don't have a way to detect that at CT, but we assume it's there
        enum GDC_with_SSE2 = true;  // We don't have a way to detect that at CT, but we assume it's there
        enum GDC_with_SSE3 = false; // TODO: we don't have a way to detect that at CT
    }
    else
    {
        enum GDC_with_x86 = false;
        enum GDC_with_MMX = false;
        enum GDC_with_SSE = false;
        enum GDC_with_SSE2 = false;
        enum GDC_with_SSE3 = false;
    }
}
else
{
    enum GDC_with_x86 = false;
    enum GDC_with_MMX = false;
    enum GDC_with_SSE = false;
    enum GDC_with_SSE2 = false;
    enum GDC_with_SSE3 = false;
}

// Compile-time capability flags for LDC, plus the imports that give access to
// LLVM intrinsics, inline IR and inline assembly.
version(LDC)
{
    public import core.simd;
    public import ldc.simd;
    public import ldc.intrinsics;
    public import ldc.llvmasm: __asm;

    // Since LDC 1.13, using the new ldc.llvmasm.__ir variants instead of inlineIR
    static if (__VERSION__ >= 2083)
    {
        import ldc.llvmasm;
        alias LDCInlineIR = __ir_pure;

        // A version of inline IR with prefix/suffix didn't exist before LDC 1.13
        alias LDCInlineIREx = __irEx_pure;
    }
    else
    {
        alias LDCInlineIR = inlineIR;
    }

    version(ARM)
    {
        public import ldc.gccbuiltins_arm;
        enum LDC_with_ARM32 = true;
        enum LDC_with_ARM64 = false;
        enum LDC_with_SSE1 = false;
        enum LDC_with_SSE2 = false;
        enum LDC_with_SSE3 = false;
    }
    else version(AArch64)
    {
        enum LDC_with_ARM32 = false;
        enum LDC_with_ARM64 = true;
        enum LDC_with_SSE1 = false;
        enum LDC_with_SSE2 = false;
        enum LDC_with_SSE3 = false;
    }
    else
    {
        public import ldc.gccbuiltins_x86;
        enum LDC_with_ARM32 = false;
        enum LDC_with_ARM64 = false;
        enum LDC_with_SSE1 = __traits(targetHasFeature, "sse");
        enum LDC_with_SSE2 = __traits(targetHasFeature, "sse2");
        enum LDC_with_SSE3 = __traits(targetHasFeature, "sse3");
    }
}
else
{
    enum LDC_with_ARM32 = false;
    enum LDC_with_ARM64 = false;
    enum LDC_with_SSE1 = false;
    enum LDC_with_SSE2 = false;
    enum LDC_with_SSE3 = false;
}

// Non-zero when LDC targets either flavour of ARM.
enum LDC_with_ARM = LDC_with_ARM32 | LDC_with_ARM64;

// Compile-time capability flags for DMD.
version(DigitalMars)
{
    version(D_InlineAsm_X86)
        enum DMD_with_asm = true;
    else version(D_InlineAsm_X86_64)
        enum DMD_with_asm = true;
    else
        enum DMD_with_asm = false;

    version(D_InlineAsm_X86)
        enum DMD_with_32bit_asm = DMD_with_asm; // sometimes you want a 32-bit DMD only solution
    else
        enum DMD_with_32bit_asm = false;

    version (D_SIMD)
        enum DMD_with_DSIMD = !SSESizedVectorsAreEmulated;
    else
        enum DMD_with_DSIMD = false;
}
else
{
    enum DMD_with_asm = false;
    enum DMD_with_32bit_asm = false;
    enum DMD_with_DSIMD = false;
}

static if (LDC_with_ARM32)
{
    /// Reads the floating-point control word through `__builtin_arm_get_fpscr`.
    package uint arm_get_fpcr() nothrow @nogc @trusted
    {
        return __builtin_arm_get_fpscr();
    }

    /// Writes `cw` to the floating-point control word through `__builtin_arm_set_fpscr`.
    package void arm_set_fpcr(uint cw) nothrow @nogc @trusted
    {
        __builtin_arm_set_fpscr(cw);
    }
}

static if (LDC_with_ARM64)
{
    // Declared for reference; `arm_get_fpcr` below reads the register with
    // inline assembly instead (see the comment in its body).
    pragma(LDC_intrinsic, "llvm.aarch64.get.fpcr")
        long __builtin_aarch64_get_fpcr() pure nothrow @nogc @safe;

    /// Reads FPCR with an `mrs` instruction.
    package uint arm_get_fpcr() pure nothrow @nogc @trusted
    {
        // LLVM intrinsic "llvm.aarch64.get.fpcr" seems buggy and doesn't return FPCR
        return __asm!uint("mrs $0, fpcr", "=r");
    }

    /// Writes `cw` to FPCR with an `msr` instruction.
    package void arm_set_fpcr(uint cw) nothrow @nogc @trusted
    {
        // Note: there doesn't seem to be an intrinsic in LLVM to set FPCR.
        // x2 is used as scratch: it is spilled to `save_x2` first and
        // reloaded afterwards, so the sequence leaves x2 unchanged.
        long save_x2;
        __asm!void("str x2, $1 \n" ~
                   "ldr w2, $0 \n" ~
                   "msr fpcr, x2 \n" ~
                   "ldr x2, $1 "   , "m,m", cw, &save_x2);
    }
}


// For internal use only, since public API deals with a x86 semantic emulation.
// These are masks/values applied to the word returned by `arm_get_fpcr()`.
enum uint _MM_ROUND_NEAREST_ARM     = 0x00000000;
enum uint _MM_ROUND_DOWN_ARM        = 0x00800000;
enum uint _MM_ROUND_UP_ARM          = 0x00400000;
enum uint _MM_ROUND_TOWARD_ZERO_ARM = 0x00C00000;
enum uint _MM_ROUND_MASK_ARM        = 0x00C00000;
enum uint _MM_FLUSH_ZERO_MASK_ARM   = 0x01000000;


//
//  <ROUNDING>
//
//  Why is that there? For DMD, we cannot use rint because _MM_SET_ROUNDING_MODE
//  doesn't change the FPU rounding mode, and isn't expected to do so.
//  So we devised these rounding functions to help having consistent rounding between
//  LDC and DMD. It's important that DMD uses what is in MXCSR to round.
//
//  Note: There is no MXCSR in ARM. But there is fpscr, which implements similar
//  functionality.
//  https://developer.arm.com/documentation/dui0068/b/vector-floating-point-programming/vfp-system-registers/fpscr--the-floating-point-status-and-control-register
//  We use fpscr since it's thread-local, so we can emulate those x86 conversion albeit slowly.

/// Converts `value` to a 32-bit integer using the rounding mode currently
/// selected in MXCSR (x86) or in the ARM floating-point control register.
int convertFloatToInt32UsingMXCSR(float value) @trusted
{
    int result;
    version(GNU)
    {
        asm pure nothrow @nogc @trusted
        {
            "cvtss2si %1, %0\n": "=r"(result) : "x" (value);
        }
    }
    else static if (LDC_with_ARM32)
    {
        // s2 is used as scratch, so it is listed as a clobber: without it the
        // optimizer is free to keep a live value in s2 across this statement.
        result = __asm!int(`vldr s2, $1
                            vcvtr.s32.f32 s2, s2
                            vmov $0, s2`, "=r,m,~{s2}", value);
    }
    else static if (LDC_with_ARM64)
    {
        // Get current rounding mode.
        uint fpscr = arm_get_fpcr();

        switch(fpscr & _MM_ROUND_MASK_ARM)
        {
            default:
            case _MM_ROUND_NEAREST_ARM:     result = vcvtns_s32_f32(value);     break;
            case _MM_ROUND_DOWN_ARM:        result = vcvtms_s32_f32(value);     break;
            case _MM_ROUND_UP_ARM:          result = vcvtps_s32_f32(value);     break;
            case _MM_ROUND_TOWARD_ZERO_ARM: result = vcvts_s32_f32(value);      break;
        }
    }
    else
    {
        asm pure nothrow @nogc @trusted
        {
            cvtss2si EAX, value;
            mov result, EAX;
        }
    }
    return result;
}

/// Converts `value` to a 32-bit integer using the rounding mode currently
/// selected in MXCSR (x86) or in the ARM floating-point control register.
int convertDoubleToInt32UsingMXCSR(double value) @trusted
{
    int result;
    version(GNU)
    {
        asm pure nothrow @nogc @trusted
        {
            "cvtsd2si %1, %0\n": "=r"(result) : "x" (value);
        }
    }
    else static if (LDC_with_ARM32)
    {
        // Both scratch registers written by the sequence (d2 by `vldr`, s2 by
        // `vcvtr`) are declared as clobbers so the optimizer preserves them.
        result = __asm!int(`vldr d2, $1
                            vcvtr.s32.f64 s2, d2
                            vmov $0, s2`, "=r,m,~{s2},~{d2}", value);
    }
    else static if (LDC_with_ARM64)
    {
        // Get current rounding mode.
        uint fpscr = arm_get_fpcr();

        switch(fpscr & _MM_ROUND_MASK_ARM)
        {
            default:
            case _MM_ROUND_NEAREST_ARM:     result = vcvtns_s32_f64(value);     break;
            case _MM_ROUND_DOWN_ARM:        result = vcvtms_s32_f64(value);     break;
            case _MM_ROUND_UP_ARM:          result = vcvtps_s32_f64(value);     break;
            case _MM_ROUND_TOWARD_ZERO_ARM: result = vcvts_s32_f64(value);      break;
        }
    }
    else
    {
        asm pure nothrow @nogc @trusted
        {
            cvtsd2si EAX, value;
            mov result, EAX;
        }
    }
    return result;
}

/// Converts `value` to a 64-bit integer using the rounding mode currently
/// selected in MXCSR (x86) or in the ARM floating-point control register.
long convertFloatToInt64UsingMXCSR(float value) @trusted
{
    static if (LDC_with_ARM32)
    {
        // We have to resort to libc since 32-bit ARM
        // doesn't seem to have 64-bit registers.

        uint fpscr = arm_get_fpcr(); // Get current rounding mode.

        // Note: converting to double precision else rounding could be different for large integers
        double asDouble = value;

        switch(fpscr & _MM_ROUND_MASK_ARM)
        {
            default:
            // `llvm_round` rounds halfway cases away from zero, whereas the x86
            // conversion being emulated rounds them to even. In this branch the
            // current rounding mode is round-to-nearest, so `llvm_rint` gives
            // the ties-to-even result.
            case _MM_ROUND_NEAREST_ARM:     return cast(long)(llvm_rint(asDouble));
            case _MM_ROUND_DOWN_ARM:        return cast(long)(llvm_floor(asDouble));
            case _MM_ROUND_UP_ARM:          return cast(long)(llvm_ceil(asDouble));
            case _MM_ROUND_TOWARD_ZERO_ARM: return cast(long)(asDouble);
        }
    }
    else static if (LDC_with_ARM64)
    {
        uint fpscr = arm_get_fpcr();

        switch(fpscr & _MM_ROUND_MASK_ARM)
        {
            default:
            case _MM_ROUND_NEAREST_ARM:     return vcvtns_s64_f32(value);
            case _MM_ROUND_DOWN_ARM:        return vcvtms_s64_f32(value);
            case _MM_ROUND_UP_ARM:          return vcvtps_s64_f32(value);
            case _MM_ROUND_TOWARD_ZERO_ARM: return vcvts_s64_f32(value);
        }
    }
    // 64-bit can use an SSE instruction
    else version(D_InlineAsm_X86_64)
    {
        long result;
        version(LDC) // work-around for " Data definition directives inside inline asm are not supported yet."
        {
            asm pure nothrow @nogc @trusted
            {
                movss XMM0, value;
                cvtss2si RAX, XMM0;
                mov result, RAX;
            }
        }
        else
        {
            asm pure nothrow @nogc @trusted
            {
                movss XMM0, value;
                db 0xf3; db 0x48; db 0x0f; db 0x2d; db 0xc0; // cvtss2si RAX, XMM0 (DMD refuses to emit)
                mov result, RAX;
            }
        }
        return result;
    }
    else version(D_InlineAsm_X86)
    {
        // In the case of 32-bit x86 there is no SSE2 way to convert FP to 64-bit int
        // This leads to an unfortunate FPU sequence in every C++ compiler.
        // See: https://godbolt.org/z/vZym77

        // Get current MXCSR rounding
        uint sseRounding;
        ushort savedFPUCW;
        ushort newFPUCW;
        long result;
        asm pure nothrow @nogc @trusted
        {
            stmxcsr sseRounding;
            fld value;
            fnstcw savedFPUCW;
            mov AX, savedFPUCW;
            and AX, 0xf3ff;          // clear FPU rounding bits
            movzx ECX, word ptr sseRounding;
            and ECX, 0x6000;         // only keep SSE rounding bits
            shr ECX, 3;
            or AX, CX;               // make a new control word for FPU with SSE bits
            mov newFPUCW, AX;
            fldcw newFPUCW;
            fistp qword ptr result;  // convert, respecting MXCSR (but not other control word things)
            fldcw savedFPUCW;
        }
        return result;
    }
    else static if (GDC_with_x86)
    {
        version(X86_64) // 64-bit can just use the right instruction
        {
            static assert(GDC_with_SSE);
            __m128 A;
            A.ptr[0] = value;
            return __builtin_ia32_cvtss2si64 (A);
        }
        else version(X86) // 32-bit
        {
            // This is untested!
            // NOTE(review): `value` is bound with the "f" constraint here but
            // with "t" in convertDoubleToInt64UsingMXCSR; one of the two is
            // presumably unintended - confirm on a 32-bit GDC target.
            uint sseRounding;
            ushort savedFPUCW;
            ushort newFPUCW;
            long result;
            asm pure nothrow @nogc @trusted
            {
                "stmxcsr %1;\n" ~
                "fld %2;\n" ~
                "fnstcw %3;\n" ~
                "movw %3, %%ax;\n" ~
                "andw $0xf3ff, %%ax;\n" ~
                "movzwl %1, %%ecx;\n" ~
                "andl $0x6000, %%ecx;\n" ~
                "shrl $3, %%ecx;\n" ~
                "orw %%cx, %%ax\n" ~
                "movw %%ax, %4;\n" ~
                "fldcw %4;\n" ~
                "fistpll %0;\n" ~
                "fldcw %3;\n"
                  : "=m"(result)    // %0
                  : "m" (sseRounding),
                    "f" (value),
                    "m" (savedFPUCW),
                    "m" (newFPUCW)
                  : "eax", "ecx", "st";
            }
            return result;
        }
        else
            static assert(false);
    }
    else
        static assert(false);
}


/// Converts `value` to a 64-bit integer using the rounding mode currently
/// selected in MXCSR (x86) or in the ARM floating-point control register.
long convertDoubleToInt64UsingMXCSR(double value) @trusted
{
    static if (LDC_with_ARM32)
    {
        // We have to resort to libc since 32-bit ARM
        // doesn't seem to have 64-bit registers.
        uint fpscr = arm_get_fpcr(); // Get current rounding mode.
        switch(fpscr & _MM_ROUND_MASK_ARM)
        {
            default:
            // `llvm_rint` rather than `llvm_round`: halfway cases must go to
            // even, not away from zero (see convertFloatToInt64UsingMXCSR).
            case _MM_ROUND_NEAREST_ARM:     return cast(long)(llvm_rint(value));
            case _MM_ROUND_DOWN_ARM:        return cast(long)(llvm_floor(value));
            case _MM_ROUND_UP_ARM:          return cast(long)(llvm_ceil(value));
            case _MM_ROUND_TOWARD_ZERO_ARM: return cast(long)(value);
        }
    }
    else static if (LDC_with_ARM64)
    {
        // Get current rounding mode.
        uint fpscr = arm_get_fpcr();

        switch(fpscr & _MM_ROUND_MASK_ARM)
        {
            default:
            case _MM_ROUND_NEAREST_ARM:     return vcvtns_s64_f64(value);
            case _MM_ROUND_DOWN_ARM:        return vcvtms_s64_f64(value);
            case _MM_ROUND_UP_ARM:          return vcvtps_s64_f64(value);
            case _MM_ROUND_TOWARD_ZERO_ARM: return vcvts_s64_f64(value);
        }
    }
    // 64-bit can use an SSE instruction
    else version(D_InlineAsm_X86_64)
    {
        long result;
        version(LDC) // work-around for "Data definition directives inside inline asm are not supported yet."
        {
            asm pure nothrow @nogc @trusted
            {
                movsd XMM0, value;
                cvtsd2si RAX, XMM0;
                mov result, RAX;
            }
        }
        else
        {
            asm pure nothrow @nogc @trusted
            {
                movsd XMM0, value;
                db 0xf2; db 0x48; db 0x0f; db 0x2d; db 0xc0; // cvtsd2si RAX, XMM0 (DMD refuses to emit)
                mov result, RAX;
            }
        }
        return result;
    }
    else version(D_InlineAsm_X86)
    {
        // In the case of 32-bit x86 there is no SSE2 way to convert FP to 64-bit int
        // This leads to an unfortunate FPU sequence in every C++ compiler.
        // See: https://godbolt.org/z/vZym77

        // Get current MXCSR rounding
        uint sseRounding;
        ushort savedFPUCW;
        ushort newFPUCW;
        long result;
        asm pure nothrow @nogc @trusted
        {
            stmxcsr sseRounding;
            fld value;
            fnstcw savedFPUCW;
            mov AX, savedFPUCW;
            and AX, 0xf3ff;          // clear FPU rounding bits
            movzx ECX, word ptr sseRounding;
            and ECX, 0x6000;         // only keep SSE rounding bits
            shr ECX, 3;
            or AX, CX;               // make a new control word for FPU with SSE bits
            mov newFPUCW, AX;
            fldcw newFPUCW;
            fistp qword ptr result;  // explicit 64-bit store, same form as the float version
            fldcw savedFPUCW;
        }
        return result;
    }
    else static if (GDC_with_x86)
    {
        version(X86_64)
        {
            static assert(GDC_with_SSE2);
            __m128d A;
            A.ptr[0] = value;
            return __builtin_ia32_cvtsd2si64 (A);
        }
        else
        {
            // This is untested!
            // NOTE(review): `value` is bound with the "t" constraint here but
            // with "f" in convertFloatToInt64UsingMXCSR; one of the two is
            // presumably unintended - confirm on a 32-bit GDC target.
            uint sseRounding;
            ushort savedFPUCW;
            ushort newFPUCW;
            long result;
            asm pure nothrow @nogc @trusted
            {
                "stmxcsr %1;\n" ~
                "fld %2;\n" ~
                "fnstcw %3;\n" ~
                "movw %3, %%ax;\n" ~
                "andw $0xf3ff, %%ax;\n" ~
                "movzwl %1, %%ecx;\n" ~
                "andl $0x6000, %%ecx;\n" ~
                "shrl $3, %%ecx;\n" ~
                "orw %%cx, %%ax\n" ~
                "movw %%ax, %4;\n" ~
                "fldcw %4;\n" ~
                "fistpll %0;\n" ~
                "fldcw %3;\n"
                  : "=m"(result)    // %0
                  : "m" (sseRounding),
                    "t" (value),
                    "m" (savedFPUCW),
                    "m" (newFPUCW)
                  : "eax", "ecx", "st";
            }
            return result;
        }
    }
    else
        static assert(false);
}

//
//  </ROUNDING>
//


// using the Intel terminology here

/// Clamps a signed 16-bit value into the range [-128, 127].
byte saturateSignedWordToSignedByte(short value) pure @safe
{
    if (value > 127) value = 127;
    if (value < -128) value = -128;
    return cast(byte) value;
}

/// Clamps a signed 16-bit value into the range [0, 255].
ubyte saturateSignedWordToUnsignedByte(short value) pure @safe
{
    if (value > 255) value = 255;
    if (value < 0) value = 0;
    return cast(ubyte) value;
}

/// Clamps a signed 32-bit value into the range [-32768, 32767].
short saturateSignedIntToSignedShort(int value) pure @safe
{
    if (value > 32767) value = 32767;
    if (value < -32768) value = -32768;
    return cast(short) value;
}

/// Clamps a signed 32-bit value into the range [0, 65535].
ushort saturateSignedIntToUnsignedShort(int value) pure @safe
{
    if (value > 65535) value = 65535;
    if (value < 0) value = 0;
    return cast(ushort) value;
}

unittest // test saturate operations
{
    assert( saturateSignedWordToSignedByte(32000) == 127);
    assert( saturateSignedWordToUnsignedByte(32000) == 255);
    assert( saturateSignedWordToSignedByte(-4000) == -128);
    assert( saturateSignedWordToUnsignedByte(-4000) == 0);
    assert( saturateSignedIntToSignedShort(32768) == 32767);
    assert( saturateSignedIntToUnsignedShort(32768) == 32768);
    assert( saturateSignedIntToSignedShort(-32769) == -32768);
    assert( saturateSignedIntToUnsignedShort(-32769) == 0);
}

version(unittest)
{
    // This is just for debugging tests
    import core.stdc.stdio: printf;

    // printing vectors for implementation
    // Note: you can override `pure` within a `debug` clause

    /// Prints the single 64-bit lane of `v`.
    void _mm_print_pi64(__m64 v) @trusted
    {
        long1 vl = cast(long1)v;
        printf("%lld\n", vl.array[0]);
    }

    /// Prints `v` as two 32-bit lanes.
    void _mm_print_pi32(__m64 v) @trusted
    {
        int[2] C = (cast(int2)v).array;
        printf("%d %d\n", C[0], C[1]);
    }

    /// Prints `v` as four 16-bit lanes.
    void _mm_print_pi16(__m64 v) @trusted
    {
        short[4] C = (cast(short4)v).array;
        printf("%d %d %d %d\n", C[0], C[1], C[2], C[3]);
    }

    /// Prints `v` as eight 8-bit lanes.
    void _mm_print_pi8(__m64 v) @trusted
    {
        byte[8] C = (cast(byte8)v).array;
        printf("%d %d %d %d %d %d %d %d\n",
        C[0], C[1], C[2], C[3], C[4], C[5], C[6], C[7]);
    }

    /// Prints `v` as two 64-bit lanes.
    void _mm_print_epi64(__m128i v) @trusted
    {
        long2 vl = cast(long2)v;
        printf("%lld %lld\n", vl.array[0], vl.array[1]);
    }

    /// Prints `v` as four 32-bit lanes.
    void _mm_print_epi32(__m128i v) @trusted
    {
        printf("%d %d %d %d\n",
              v.array[0], v.array[1], v.array[2], v.array[3]);
    }

    /// Prints `v` as eight 16-bit lanes.
    void _mm_print_epi16(__m128i v) @trusted
    {
        short[8] C = (cast(short8)v).array;
        printf("%d %d %d %d %d %d %d %d\n",
        C[0], C[1], C[2], C[3], C[4], C[5], C[6], C[7]);
    }

    /// Prints `v` as sixteen 8-bit lanes.
    void _mm_print_epi8(__m128i v) @trusted
    {
        byte[16] C = (cast(byte16)v).array;
        printf("%d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d\n",
        C[0], C[1], C[2], C[3], C[4], C[5], C[6], C[7], C[8], C[9], C[10], C[11], C[12], C[13], C[14], C[15]);
    }

    /// Prints `v` as four float lanes.
    void _mm_print_ps(__m128 v) @trusted
    {
        float[4] C = (cast(float4)v).array;
        printf("%f %f %f %f\n", C[0], C[1], C[2], C[3]);
    }

    /// Prints `v` as two double lanes.
    void _mm_print_pd(__m128d v) @trusted
    {
        double[2] C = (cast(double2)v).array;
        printf("%f %f\n", C[0], C[1]);
    }
}


//
//  <FLOATING-POINT COMPARISONS>
//
// Note: `ldc.simd` cannot express all nuances of FP comparisons, so we
//       need different IR generation.

/// Floating-point comparison predicates. The member names are the ones
/// spliced after `fcmp` in the inline IR (see `FPComparisonToString`).
enum FPComparison
{
    oeq,   // ordered and equal
    ogt,   // ordered and greater than
    oge,   // ordered and greater than or equal
    olt,   // ordered and less than
    ole,   // ordered and less than or equal
    one,   // ordered and not equal
    ord,   // ordered (no nans)
    ueq,   // unordered or equal
    ugt,   // unordered or greater than ("nle")
    uge,   // unordered or greater than or equal ("nlt")
    ult,   // unordered or less than ("nge")
    ule,   // unordered or less than or equal ("ngt")
    une,   // unordered or not equal ("neq")
    uno,   // unordered (either nans)
}

// Predicate spelling for each `FPComparison` member, indexed by the member's value.
private static immutable string[FPComparison.max+1] FPComparisonToString =
[
    "oeq",
    "ogt",
    "oge",
    "olt",
    "ole",
    "one",
    "ord",
    "ueq",
    "ugt",
    "uge",
    "ult",
    "ule",
    "une",
    "uno",
];

// Individual float comparison: returns a bool (callers turn it into a -1/0 or 1/0 result).
// Useful for DMD and testing.
// The result is a plain `bool`; the emulated cmpXX/comXX helpers below turn it
// into -1/0 (lane mask) or 1/0 (scalar) themselves.
private bool compareFloat(T)(FPComparison comparison, T a, T b) pure @safe
{
    import std.math: isNaN;

    // "Unordered" means at least one operand is a NaN.
    immutable bool hasNaN = isNaN(a) || isNaN(b);

    bool verdict;
    final switch(comparison)
    {
        case FPComparison.oeq: verdict = (a == b);             break;
        case FPComparison.ogt: verdict = (a > b);              break;
        case FPComparison.oge: verdict = (a >= b);             break;
        case FPComparison.olt: verdict = (a < b);              break;
        case FPComparison.ole: verdict = (a <= b);             break;
        case FPComparison.one: verdict = !hasNaN && (a != b);  break; // NaN with != always yields true
        case FPComparison.ord: verdict = !hasNaN;              break;
        case FPComparison.ueq: verdict = hasNaN || (a == b);   break;
        case FPComparison.ugt: verdict = hasNaN || (a > b);    break;
        case FPComparison.uge: verdict = hasNaN || (a >= b);   break;
        case FPComparison.ult: verdict = hasNaN || (a < b);    break;
        case FPComparison.ule: verdict = hasNaN || (a <= b);   break;
        case FPComparison.une: verdict = (a != b);             break; // NaN with != always yields true
        case FPComparison.uno: verdict = hasNaN;               break;
    }
    return verdict;
}

version(LDC)
{
    /// Packed float comparison: each lane of the result is all-ones (-1) when
    /// the predicate holds for that lane, else 0.
    package int4 cmpps(FPComparison comparison)(float4 a, float4 b) pure @safe
    {
        enum string code = `
            %cmp = fcmp `~ FPComparisonToString[comparison] ~` <4 x float> %0, %1
            %r = sext <4 x i1> %cmp to <4 x i32>
            ret <4 x i32> %r`;

        return LDCInlineIR!(code, int4, float4, float4)(a, b);
    }

    /// Packed double comparison: each lane of the result is all-ones (-1) when
    /// the predicate holds for that lane, else 0.
    package long2 cmppd(FPComparison comparison)(double2 a, double2 b) pure @safe
    {
        enum string code = `
            %cmp = fcmp `~ FPComparisonToString[comparison] ~` <2 x double> %0, %1
            %r = sext <2 x i1> %cmp to <2 x i64>
            ret <2 x i64> %r`;

        return LDCInlineIR!(code, long2, double2, double2)(a, b);
    }

    /// CMPSS-style comparisons
    /// clang implement it through x86 intrinsics, it is possible with IR alone
    /// but leads to less optimal code.
    /// PERF: try to implement it with __builtin_ia32_cmpss and immediate 0 to 7.
    /// Not that simple.
    package float4 cmpss(FPComparison comparison)(float4 a, float4 b) pure @safe
    {
        /*
        Sketch of the builtin-based variant (not enabled; the predicate table it
        refers to is not defined in this part of the file):

        enum ubyte predicateNumber = FPComparisonToX86Predicate[comparison];
        enum bool invertOp = (predicateNumber & 0x80) != 0;
        static if(invertOp)
            return __builtin_ia32_cmpss(b, a, predicateNumber & 0x7f);
        else
            return __builtin_ia32_cmpss(a, b, predicateNumber & 0x7f);
        */
        enum string code = `
            %cmp = fcmp `~ FPComparisonToString[comparison] ~` float %0, %1
            %r = sext i1 %cmp to i32
            %r2 = bitcast i32 %r to float
            ret float %r2`;

        // Only lane 0 is compared; the other lanes are copied from `a`.
        float4 merged = a;
        merged[0] = LDCInlineIR!(code, float, float, float)(a[0], b[0]);
        return merged;
    }

    /// CMPSD-style comparisons
    /// clang implement it through x86 intrinsics, it is possible with IR alone
    /// but leads to less optimal code.
    /// PERF: try to implement it with __builtin_ia32_cmpsd and immediate 0 to 7.
    /// Not that simple.
    package double2 cmpsd(FPComparison comparison)(double2 a, double2 b) pure @safe
    {
        enum string code = `
            %cmp = fcmp `~ FPComparisonToString[comparison] ~` double %0, %1
            %r = sext i1 %cmp to i64
            %r2 = bitcast i64 %r to double
            ret double %r2`;

        // Only lane 0 is compared; the other lane is copied from `a`.
        double2 merged = a;
        merged[0] = LDCInlineIR!(code, double, double, double)(a[0], b[0]);
        return merged;
    }

    /// Scalar float comparison of lane 0: returns 1 when the predicate holds, else 0.
    // Note: ucomss and ucomsd are left unimplemented
    package int comss(FPComparison comparison)(float4 a, float4 b) pure @safe
    {
        enum string code = `
            %cmp = fcmp `~ FPComparisonToString[comparison] ~` float %0, %1
            %r = zext i1 %cmp to i32
            ret i32 %r`;

        return LDCInlineIR!(code, int, float, float)(a[0], b[0]);
    }

    /// Scalar double comparison of lane 0: returns 1 when the predicate holds, else 0.
    // Note: ucomss and ucomsd are left unimplemented
    package int comsd(FPComparison comparison)(double2 a, double2 b) pure @safe
    {
        enum string code = `
            %cmp = fcmp `~ FPComparisonToString[comparison] ~` double %0, %1
            %r = zext i1 %cmp to i32
            ret i32 %r`;

        return LDCInlineIR!(code, int, double, double)(a[0], b[0]);
    }
}
else
{
    /// Packed float comparison: each lane of the result is -1 when the
    /// predicate holds for that lane, else 0.
    package int4 cmpps(FPComparison comparison)(float4 a, float4 b) pure @trusted
    {
        int4 mask;
        for (int lane = 0; lane < 4; ++lane)
        {
            immutable bool hit = compareFloat!float(comparison, a.array[lane], b.array[lane]);
            mask.ptr[lane] = hit ? -1 : 0;
        }
        return mask;
    }

    /// Packed double comparison: each lane of the result is -1 when the
    /// predicate holds for that lane, else 0.
    package long2 cmppd(FPComparison comparison)(double2 a, double2 b) pure @trusted
    {
        long2 mask;
        for (int lane = 0; lane < 2; ++lane)
        {
            immutable bool hit = compareFloat!double(comparison, a.array[lane], b.array[lane]);
            mask.ptr[lane] = hit ? -1 : 0;
        }
        return mask;
    }

    /// CMPSS-style comparison: lane 0 receives the -1/0 mask, the other lanes
    /// keep the bits of `a`.
    package float4 cmpss(FPComparison comparison)(float4 a, float4 b) pure @trusted
    {
        immutable bool hit = compareFloat!float(comparison, a.array[0], b.array[0]);
        int4 bits = cast(int4)a;
        bits.ptr[0] = hit ? -1 : 0;
        return cast(float4)bits;
    }

    /// CMPSD-style comparison: lane 0 receives the -1/0 mask, the other lane
    /// keeps the bits of `a`.
    package double2 cmpsd(FPComparison comparison)(double2 a, double2 b) pure @trusted
    {
        immutable bool hit = compareFloat!double(comparison, a.array[0], b.array[0]);
        long2 bits = cast(long2)a;
        bits.ptr[0] = hit ? -1 : 0;
        return cast(double2)bits;
    }

    /// Scalar float comparison of lane 0: returns 1 when the predicate holds, else 0.
    package int comss(FPComparison comparison)(float4 a, float4 b) pure @safe
    {
        if (compareFloat!float(comparison, a.array[0], b.array[0]))
            return 1;
        return 0;
    }

    /// Scalar double comparison of lane 0: returns 1 when the predicate holds, else 0.
    // Note: ucomss and ucomsd are left unimplemented
    package int comsd(FPComparison comparison)(double2 a, double2 b) pure @safe
    {
        if (compareFloat!double(comparison, a.array[0], b.array[0]))
            return 1;
        return 0;
    }
}
unittest // cmpps
{
    // Check all comparison type is working
    float4 A = [1, 3, 5, float.nan];
    float4 B = [2, 3, 4, 5];

    // Expected lane masks, one row per FPComparison member (indexed by its value).
    static immutable int[4][FPComparison.max + 1] expected =
    [
        [ 0,-1, 0, 0], // oeq
        [ 0, 0,-1, 0], // ogt
        [ 0,-1,-1, 0], // oge
        [-1, 0, 0, 0], // olt
        [-1,-1, 0, 0], // ole
        [-1, 0,-1, 0], // one
        [-1,-1,-1, 0], // ord
        [ 0,-1, 0,-1], // ueq
        [ 0, 0,-1,-1], // ugt
        [ 0,-1,-1,-1], // uge
        [-1, 0, 0,-1], // ult
        [-1,-1, 0,-1], // ule
        [-1, 0,-1,-1], // une
        [ 0, 0, 0,-1], // uno
    ];

    // Unrolled at compile time: one cmpps instantiation per predicate.
    foreach (memberName; __traits(allMembers, FPComparison))
    {
        enum FPComparison predicate = __traits(getMember, FPComparison, memberName);
        int4 got = cmpps!predicate(A, B);
        assert(got.array == expected[predicate]);
    }
}
unittest
{
    double2 a = [1, 3];
    double2 b = [2, 3];
    long2 c = cmppd!(FPComparison.ult)(a, b);
    static immutable long[2] correct = [cast(long)(-1), 0];
    assert(c.array == correct);
}
unittest // cmpss and comss
{
    // Checks cmpss/comss against the scalar reference `compareFloat`.
    void testComparison(FPComparison comparison)(float4 A, float4 B)
    {
        int expected = compareFloat!float(comparison, A.array[0], B.array[0]) ? -1 : 0;

        float4 result = cmpss!comparison(A, B);
        int4 iresult = cast(int4)result;
        assert(iresult.array[0] == expected);
        // upper lanes must come through untouched from the first operand
        foreach (lane; 1 .. 4)
            assert(result.array[lane] == A.array[lane]);

        // check comss
        int comResult = comss!comparison(A, B);
        assert( (expected != 0) == (comResult != 0) );
    }

    // Check all comparison type is working
    float4 A = [1, 3, 5, 6];
    float4 B = [2, 3, 4, 5];
    float4 C = [float.nan, 3, 4, 5];

    // Unrolled at compile time: every predicate, ordered then unordered operands.
    foreach (memberName; __traits(allMembers, FPComparison))
    {
        enum FPComparison predicate = __traits(getMember, FPComparison, memberName);
        testComparison!predicate(A, B);
        testComparison!predicate(A, C);
    }
}
unittest // cmpsd and comsd
{
    // Checks cmpsd/comsd against the scalar reference `compareFloat`.
    void testComparison(FPComparison comparison)(double2 A, double2 B)
    {
        long expected = compareFloat!double(comparison, A.array[0], B.array[0]) ? -1 : 0;

        double2 result = cmpsd!comparison(A, B);
        long2 iresult = cast(long2)result;
        assert(iresult.array[0] == expected);
        // upper lane must come through untouched from the first operand
        assert(result.array[1] == A.array[1]);

        // check comsd
        int comResult = comsd!comparison(A, B);
        assert( (expected != 0) == (comResult != 0) );
    }

    // Check all comparison type is working
    double2 A = [1, 3];
    double2 B = [2, 4];
    double2 C = [double.nan, 5];

    // Unrolled at compile time: every predicate, ordered then unordered operands.
    foreach (memberName; __traits(allMembers, FPComparison))
    {
        enum FPComparison predicate = __traits(getMember, FPComparison, memberName);
        testComparison!predicate(A, B);
        testComparison!predicate(A, C);
    }
}

//
//  </FLOATING-POINT COMPARISONS>
//


/// Returns the low 64-bit lane of `a` as an `__m64`.
__m64 to_m64(__m128i a) pure @trusted
{
    long2 asLongs = cast(long2)a;
    long1 low = asLongs.array[0];
    return low;
}

/// Widens `a` to 128 bits: lane 0 receives the value, lane 1 is zero.
__m128i to_m128i(__m64 a) pure @trusted
{
    /* Not sufficient to avoid https://issues.dlang.org/show_bug.cgi?id=21474

    version(DigitalMars) // Workaround for https://issues.dlang.org/show_bug.cgi?id=21474
    {
        long2 r = a.array[0];
        r.ptr[1] = 0;
        return cast(int4)r;
    }
    else */
    long2 widened = [0, 0];
    widened.ptr[0] = a.array[0];
    return cast(__m128i)widened;
}

// SOME NEON INTRINSICS
// Emulating some x86 intrinsics needs access to a range of ARM intrinsics.
// Not in the public API but the simde project expose it all for the user to use.
// MAYDO: create a new neon.d module, for internal use only.
// MAYDO: port them to ARM32 so that ARM32 can be as fast as ARM64.
static if (LDC_with_ARM64)
{
    // VERY USEFUL LINK
    // https://github.com/ldc-developers/llvm-project/blob/ldc-release/11.x/llvm/include/llvm/IR/IntrinsicsAArch64.td
    //
    // Naming follows the ARM NEON intrinsics. Functions declared with
    // `pragma(LDC_intrinsic, ...)` bind directly to the named LLVM intrinsic;
    // the others are small lane-by-lane helpers written in plain D.

    // Pairwise add (ADDP) on 8 x i8.
    pragma(LDC_intrinsic, "llvm.aarch64.neon.addp.v8i8")
    byte8 vpadd_u8(byte8 a, byte8 b) pure @safe;

    /// Lane-wise bitwise AND.
    byte8 vand_u8(byte8 a, byte8 b) pure @safe
    {
        byte8 masked = a & b;
        return masked;
    }

    /// Concatenates two 4-lane vectors: `lo` fills lanes 0..3, `hi` fills lanes 4..7.
    short8 vcombine_s16(short4 lo, short4 hi) pure @trusted
    {
        short8 combined;
        foreach (lane; 0 .. 4)
        {
            combined.ptr[lane]     = lo.array[lane];
            combined.ptr[lane + 4] = hi.array[lane];
        }
        return combined;
    }

    /// Concatenates two 2-lane vectors: `lo` fills lanes 0..1, `hi` fills lanes 2..3.
    int4 vcombine_s32(int2 lo, int2 hi) pure @trusted
    {
        int4 combined;
        foreach (lane; 0 .. 2)
        {
            combined.ptr[lane]     = lo.array[lane];
            combined.ptr[lane + 2] = hi.array[lane];
        }
        return combined;
    }

    /// Concatenates two 8-lane vectors: `lo` fills lanes 0..7, `hi` fills lanes 8..15.
    byte16 vcombine_s8(byte8 lo, byte8 hi) pure @trusted
    {
        byte16 combined;
        foreach (lane; 0 .. 8)
        {
            combined.ptr[lane]     = lo.array[lane];
            combined.ptr[lane + 8] = hi.array[lane];
        }
        return combined;
    }

    // Float -> signed integer conversions (FCVT*S family).
    // The letter after "fcvt" in the LLVM intrinsic name selects the rounding
    // mode per the ARM ISA: m = toward -infinity, n = to nearest (ties to even),
    // p = toward +infinity, z = toward zero.

    // 4 x f32 -> 4 x i32

    pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtms.v4i32.v4f32")
    int4 vcvtmq_s32_f32(float4 a) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtns.v4i32.v4f32")
    int4 vcvtnq_s32_f32(float4 a) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtps.v4i32.v4f32")
    int4 vcvtpq_s32_f32(float4 a) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtzs.v4i32.v4f32")
    int4 vcvtzq_s32_f32(float4 a) pure @safe;

    // scalar f32 -> i32

    pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtms.i32.f32")
    int vcvtms_s32_f32(float a) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtns.i32.f32")
    int vcvtns_s32_f32(float a) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtps.i32.f32")
    int vcvtps_s32_f32(float a) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtzs.i32.f32")
    int vcvts_s32_f32(float a) pure @safe;

    // scalar f64 -> i32

    pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtms.i32.f64")
    int vcvtms_s32_f64(double a) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtns.i32.f64")
    int vcvtns_s32_f64(double a) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtps.i32.f64")
    int vcvtps_s32_f64(double a) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtzs.i32.f64")
    int vcvts_s32_f64(double a) pure @safe;

    // scalar f32 -> i64

    pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtms.i64.f32")
    long vcvtms_s64_f32(float a) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtns.i64.f32")
    long vcvtns_s64_f32(float a) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtps.i64.f32")
    long vcvtps_s64_f32(float a) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtzs.i64.f32")
    long vcvts_s64_f32(float a) pure @safe;

    // scalar f64 -> i64

    pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtms.i64.f64")
    long vcvtms_s64_f64(double a) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtns.i64.f64")
    long vcvtns_s64_f64(double a) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtps.i64.f64")
    long vcvtps_s64_f64(double a) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtzs.i64.f64")
    long vcvts_s64_f64(double a) pure @safe;

    /// Returns lanes 4..7 of `a`.
    short4 vget_high_s16(short8 a) pure @trusted
    {
        short4 upper;
        foreach (lane; 0 .. 4)
            upper.ptr[lane] = a.array[lane + 4];
        return upper;
    }

    /// Returns lanes 2..3 of `a`.
    int2 vget_high_s32(int4 a) pure @trusted
    {
        int2 upper;
        foreach (lane; 0 .. 2)
            upper.ptr[lane] = a.array[lane + 2];
        return upper;
    }

    /// Returns lanes 8..15 of `a`.
    byte8 vget_high_u8(byte16 a) pure @trusted
    {
        byte8 upper;
        foreach (lane; 0 .. 8)
            upper.ptr[lane] = a.array[lane + 8];
        return upper;
    }

    /// Returns lanes 0..3 of `a`.
    short4 vget_low_s16(short8 a) pure @trusted
    {
        short4 lower;
        foreach (lane; 0 .. 4)
            lower.ptr[lane] = a.array[lane];
        return lower;
    }

    /// Returns lanes 0..1 of `a`.
    int2 vget_low_s32(int4 a) pure @trusted
    {
        int2 lower;
        foreach (lane; 0 .. 2)
            lower.ptr[lane] = a.array[lane];
        return lower;
    }

    /// Returns lanes 0..7 of `a`.
    byte8 vget_low_u8(byte16 a) pure @trusted
    {
        byte8 lower;
        foreach (lane; 0 .. 8)
            lower.ptr[lane] = a.array[lane];
        return lower;
    }

    // Signed maximum (SMAX) on 8 x i16.
    pragma(LDC_intrinsic, "llvm.aarch64.neon.smax.v8i16")
    short8 vmaxq_s16(short8 a, short8 b) pure @safe;

    // Signed minimum (SMIN) on 8 x i16.
    pragma(LDC_intrinsic, "llvm.aarch64.neon.smin.v8i16")
    short8 vminq_s16(short8 a, short8 b) pure @safe;

    /// Widening multiply: each 16-bit lane product is stored in a 32-bit lane.
    int4 vmull_s16(short4 a, short4 b) pure @trusted
    {
        int4 products;
        foreach (lane; 0 .. 4)
            products.ptr[lane] = a.array[lane] * b.array[lane]; // operands promote to int before the multiply
        return products;
    }

    // Pairwise add on 4 x f32. LDC 1.18 started using LLVM 9, which changed
    // the name of the builtin, so the binding depends on the front-end version.
    static if (__VERSION__ < 2088)
    {
        pragma(LDC_intrinsic, "llvm.aarch64.neon.addp.v4f32")
        float4 vpaddq_f32(float4 a, float4 b) pure @safe;
    }
    else
    {
        pragma(LDC_intrinsic, "llvm.aarch64.neon.faddp.v4f32")
        float4 vpaddq_f32(float4 a, float4 b) pure @safe;
    }

    // Pairwise add (ADDP) on 2 x i32.
    pragma(LDC_intrinsic, "llvm.aarch64.neon.addp.v2i32")
    int2 vpadd_s32(int2 a, int2 b) pure @safe;

    // Pairwise add (ADDP) on 16 x i8.
    pragma(LDC_intrinsic, "llvm.aarch64.neon.addp.v16i8")
    byte16 vpaddq_s8(byte16 a, byte16 b) pure @safe;

    // Signed saturating narrow (SQXTN), 8 x i16 -> 8 x i8.
    pragma(LDC_intrinsic, "llvm.aarch64.neon.sqxtn.v8i8")
    byte8 vqmovn_s16(short8 a) pure @safe;

    // Signed saturating narrow (SQXTN), 4 x i32 -> 4 x i16.
    pragma(LDC_intrinsic, "llvm.aarch64.neon.sqxtn.v4i16")
    short4 vqmovn_s32(int4 a) pure @safe;

    // Signed-to-unsigned saturating narrow (SQXTUN), 8 x i16 -> 8 x i8.
    pragma(LDC_intrinsic, "llvm.aarch64.neon.sqxtun.v8i8")
    byte8 vqmovun_s16(short8 a) pure @safe;

    // Unsigned rounding halving add (URHADD) on 16 x i8.
    pragma(LDC_intrinsic, "llvm.aarch64.neon.urhadd.v16i8")
    byte16 vrhadd_u8(byte16 a, byte16 b) pure @safe;

    // Unsigned rounding halving add (URHADD) on 8 x i16.
    pragma(LDC_intrinsic, "llvm.aarch64.neon.urhadd.v8i16")
    short8 vrhadd_u16(short8 a, short8 b) pure @safe;

    /// Lane-wise unsigned (logical) right shift of `a` by the counts in `b`.
    byte8 vshr_u8(byte8 a, byte8 b) pure @safe
    {
        byte8 shifted = a >>> b;
        return shifted;
    }
}