/**
* Internal stuff only, do not import.
*
* Copyright: Copyright Auburn Sounds 2016-2018, Stefanos Baziotis 2019.
* License:   $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
* Authors:   Guillaume Piolat
*/
module inteli.internals;

import inteli.types;

// The only math functions needed for intel-intrinsics
public import core.math: sqrt; // since it's an intrinsic
public import std.math: abs; // `fabs` is broken with GCC 4.9.2 on Linux 64-bit


// Compile-time capability flags.
// Every compiler branch below defines the same set of `GDC_with_*` and
// `LDC_with_*` enums, so code further down can test any of them with
// `static if` whichever compiler is in use.
version(GNU)
{
    version (X86)
    {
        // For 32-bit x86, disable vector extensions with GDC.
        // It just doesn't work well.
        enum GDC_with_x86 = true;
        enum GDC_with_MMX = false;
        enum GDC_with_SSE = false;
        enum GDC_with_SSE2 = false;
        enum GDC_with_SSE3 = false;
        enum LDC_with_ARM32 = false;
        enum LDC_with_ARM64 = false;
        enum LDC_with_SSE1 = false;
        enum LDC_with_SSE2 = false;
        enum LDC_with_SSE3 = false;
    }
    else version (X86_64)
    {
        // GDC support uses extended inline assembly:
        //   https://gcc.gnu.org/onlinedocs/gcc/Extended-Asm.html        (general information and hints)
        //   https://gcc.gnu.org/onlinedocs/gcc/Simple-Constraints.html  (binding variables to registers)
        //   https://gcc.gnu.org/onlinedocs/gcc/Machine-Constraints.html (x86 specific register short names)

        public import core.simd;

        // NOTE: These intrinsics are not available in every i386 and x86_64 CPU.
        // For more info: https://gcc.gnu.org/onlinedocs/gcc-4.9.2/gcc/X86-Built-in-Functions.html
        public import gcc.builtins;

        enum GDC_with_x86 = true;
        enum GDC_with_MMX = true; // We don't have a way to detect that at CT, but we assume it's there
        enum GDC_with_SSE = true; // We don't have a way to detect that at CT, but we assume it's there
        enum GDC_with_SSE2 = true; // We don't have a way to detect that at CT, but we assume it's there
        enum GDC_with_SSE3 = false; // TODO: we don't have a way to detect that at CT
        enum LDC_with_ARM32 = false;
        enum LDC_with_ARM64 = false;
        enum LDC_with_SSE1 = false;
        enum LDC_with_SSE2 = false;
        enum LDC_with_SSE3 = false;
    }
    else
    {
        // GDC on any other target: nothing is enabled.
        enum GDC_with_x86 = false;
        enum GDC_with_MMX = false;
        enum GDC_with_SSE = false;
        enum GDC_with_SSE2 = false;
        enum GDC_with_SSE3 = false;
        enum LDC_with_ARM32 = false;
        enum LDC_with_ARM64 = false;
        enum LDC_with_SSE1 = false;
        enum LDC_with_SSE2 = false;
        enum LDC_with_SSE3 = false;
    }
}
else version(LDC)
{
    public import core.simd;
    public import ldc.simd;
    public import ldc.intrinsics;
    public import ldc.llvmasm: __asm;

    // Since LDC 1.13, using the new ldc.llvmasm.__ir variants instead of inlineIR
    static if (__VERSION__ >= 2083)
    {
        import ldc.llvmasm;
        alias LDCInlineIR = __ir_pure;

        // A version of inline IR with prefix/suffix didn't exist before LDC 1.13
        alias LDCInlineIREx = __irEx_pure;
    }
    else
    {
        // Older front-ends: only the plain variant is aliased here,
        // `LDCInlineIREx` is not defined in this branch.
        alias LDCInlineIR = inlineIR;
    }

    package(inteli)
    {
        enum GDC_with_x86 = false;
        enum GDC_with_MMX = false;
        enum GDC_with_SSE = false;
        enum GDC_with_SSE2 = false;
        enum GDC_with_SSE3 = false;
    }

    version(ARM)
    {
        public import ldc.gccbuiltins_arm;
        enum LDC_with_ARM32 = true;
        enum LDC_with_ARM64 = false;
        enum LDC_with_SSE1 = false;
        enum LDC_with_SSE2 = false;
        enum LDC_with_SSE3 = false;
    }
    else version(AArch64)
    {
        //public import ldc.gccbuiltins_arm;
        enum LDC_with_ARM32 = false;
        enum LDC_with_ARM64 = true;
        enum LDC_with_SSE1 = false;
        enum LDC_with_SSE2 = false;
        enum LDC_with_SSE3 = false;
    }
    else
    {
        // Any other LDC target is treated as x86: SSE levels are queried
        // from the target features the compiler was invoked with.
        public import ldc.gccbuiltins_x86;
        enum LDC_with_ARM32 = false;
        enum LDC_with_ARM64 = false;
        enum LDC_with_SSE1 = __traits(targetHasFeature, "sse");
        enum LDC_with_SSE2 = __traits(targetHasFeature, "sse2");
        enum LDC_with_SSE3 = __traits(targetHasFeature, "sse3");
    }
}
else version(DigitalMars)
{
    package(inteli)
    {
        enum GDC_with_x86 = false;
        enum GDC_with_MMX = false;
        enum GDC_with_SSE = false;
        enum GDC_with_SSE2 = false;
        enum GDC_with_SSE3 = false;
        enum LDC_with_ARM32 = false;
        enum LDC_with_ARM64 = false;
        enum LDC_with_SSE1 = false;
        enum LDC_with_SSE2 = false;
        enum LDC_with_SSE3 = false;
    }
}
else
{
    static assert(false, "Unknown compiler");
}

// True when compiling with LDC for either ARM flavour.
enum LDC_with_ARM = LDC_with_ARM32 | LDC_with_ARM64; // ARM32 is largely unsupported though

// 32-bit ARM: the floating-point control word is accessed through the
// `__builtin_arm_{get,set}_fpscr` builtins.
static if (LDC_with_ARM32)
{
    /// Returns the current value of the floating-point control register.
    package uint arm_get_fpcr() nothrow @nogc @trusted
    {
        return __builtin_arm_get_fpscr();
    }

    /// Writes `cw` to the floating-point control register.
    package void arm_set_fpcr(uint cw) nothrow @nogc @trusted
    {
        __builtin_arm_set_fpscr(cw);
    }
}

// 64-bit ARM: FPCR is accessed with inline assembly (`mrs` / `msr`).
static if (LDC_with_ARM64)
{
    // Declared for reference; `arm_get_fpcr` below does not call it (see the comment there).
    pragma(LDC_intrinsic, "llvm.aarch64.get.fpcr")
        long __builtin_aarch64_get_fpcr() pure nothrow @nogc @safe;

    /// Returns the current value of FPCR.
    package uint arm_get_fpcr() pure nothrow @nogc @trusted
    {
        // LLVM intrinsic "llvm.aarch64.get.fpcr" seems buggy and doesn't return FPCR
        return __asm!uint("mrs $0, fpcr", "=r");
    }

    /// Writes `cw` to FPCR.
    package void arm_set_fpcr(uint cw) nothrow @nogc @trusted
    {
        // Note: there doesn't seem to be an intrinsic in LLVM to set FPCR.
        // x2 is used as scratch: it is stored to memory first and reloaded
        // after the `msr`, since the asm string declares no clobbers.
        long save_x2;
        __asm!void("str x2, $1 \n" ~
                   "ldr w2, $0 \n" ~
                   "msr fpcr, x2 \n" ~
                   "ldr x2, $1 "   , "m,m", cw, &save_x2);
    }
}

// `DMD_with_asm`: true only for DMD with its x86 or x86_64 inline assembler.
// `DMD_with_32bit_asm`: true only for DMD with the 32-bit x86 inline assembler.
version(DigitalMars)
{
    version(D_InlineAsm_X86)
        enum DMD_with_asm = true;
    else version(D_InlineAsm_X86_64)
        enum DMD_with_asm = true;
    else
        enum DMD_with_asm = false;

    version(D_InlineAsm_X86)
        enum DMD_with_32bit_asm = DMD_with_asm; // sometimes you want a 32-bit DMD only solution
    else
        enum DMD_with_32bit_asm = false;
}
else
{
    enum DMD_with_asm = false;
    enum DMD_with_32bit_asm = false;
}


// Everything below this point is `package`, `nothrow` and `@nogc`.
package:
nothrow @nogc:


// For internal use only, since public API deals with a x86 semantic emulation.
// These are masks and values for the word returned by `arm_get_fpcr()`:
// the rounding field occupies bits 22 and 23, flush-to-zero is bit 24.
enum uint _MM_ROUND_NEAREST_ARM     = 0x00000000;
enum uint _MM_ROUND_DOWN_ARM        = 0x00800000;
enum uint _MM_ROUND_UP_ARM          = 0x00400000;
enum uint _MM_ROUND_TOWARD_ZERO_ARM = 0x00C00000;
enum uint _MM_ROUND_MASK_ARM        = 0x00C00000;
enum uint _MM_FLUSH_ZERO_MASK_ARM   = 0x01000000;


//
//  <ROUNDING>
//
//  Why is that there? For DMD, we cannot use rint because _MM_SET_ROUNDING_MODE
//  doesn't change the FPU rounding mode, and isn't expected to do so.
//  So we devised these rounding functions to help having consistent rounding between
//  LDC and DMD. It's important that DMD uses what is in MXCSR to round.
//
//  Note: There is no MXCSR on ARM, but there is FPSCR, which implements similar
//  functionality.
//  https://developer.arm.com/documentation/dui0068/b/vector-floating-point-programming/vfp-system-registers/fpscr--the-floating-point-status-and-control-register
//  We use FPSCR since it's thread-local, so we can emulate those x86 conversions, albeit slowly.

/// Converts `value` to a 32-bit signed integer, rounding according to the
/// rounding mode currently set in MXCSR (x86) or in the ARM floating-point
/// control register, i.e. with the semantics of `cvtss2si`.
int convertFloatToInt32UsingMXCSR(float value) @trusted
{
    int result;
    version(GNU)
    {
        asm pure nothrow @nogc @trusted
        {
            "cvtss2si %1, %0\n": "=r"(result) : "x" (value);
        }
    }
    else static if (LDC_with_ARM32)
    {
        // `vcvtr` converts using the rounding mode held in FPSCR.
        // TODO: this is a bug, it won't preserve registers when optimized
        result = __asm!int(`vldr s2, $1
                            vcvtr.s32.f32 s2, s2
                            vmov $0, s2`, "=r,m", value);
    }
    else static if (LDC_with_ARM64)
    {
        // Get current rounding mode.
        uint fpscr = arm_get_fpcr();

        // Pick the conversion whose fixed rounding matches the current mode.
        switch(fpscr & _MM_ROUND_MASK_ARM)
        {
            default:
            case _MM_ROUND_NEAREST_ARM:     result = vcvtns_s32_f32(value); break;
            case _MM_ROUND_DOWN_ARM:        result = vcvtms_s32_f32(value); break;
            case _MM_ROUND_UP_ARM:          result = vcvtps_s32_f32(value); break;
            case _MM_ROUND_TOWARD_ZERO_ARM: result = vcvts_s32_f32(value);  break;
        }
    }
    else
    {
        asm pure nothrow @nogc @trusted
        {
            cvtss2si EAX, value;
            mov result, EAX;
        }
    }
    return result;
}

/// Converts `value` to a 32-bit signed integer, rounding according to the
/// rounding mode currently set in MXCSR (x86) or in the ARM floating-point
/// control register, i.e. with the semantics of `cvtsd2si`.
int convertDoubleToInt32UsingMXCSR(double value) @trusted
{
    int result;
    version(GNU)
    {
        asm pure nothrow @nogc @trusted
        {
            "cvtsd2si %1, %0\n": "=r"(result) : "x" (value);
        }
    }
    else static if (LDC_with_ARM32)
    {
        // `vcvtr` converts using the rounding mode held in FPSCR.
        // TODO: bug, doesn't preserve registers
        result = __asm!int(`vldr d2, $1
                            vcvtr.s32.f64 s2, d2
                            vmov $0, s2`, "=r,m", value);
    }
    else static if (LDC_with_ARM64)
    {
        // Get current rounding mode.
        uint fpscr = arm_get_fpcr();

        // Pick the conversion whose fixed rounding matches the current mode.
        switch(fpscr & _MM_ROUND_MASK_ARM)
        {
            default:
            case _MM_ROUND_NEAREST_ARM:     result = vcvtns_s32_f64(value); break;
            case _MM_ROUND_DOWN_ARM:        result = vcvtms_s32_f64(value); break;
            case _MM_ROUND_UP_ARM:          result = vcvtps_s32_f64(value); break;
            case _MM_ROUND_TOWARD_ZERO_ARM: result = vcvts_s32_f64(value);  break;
        }
    }
    else
    {
        asm pure nothrow @nogc @trusted
        {
            cvtsd2si EAX, value;
            mov result, EAX;
        }
    }
    return result;
}

/// Converts `value` to a 64-bit signed integer, rounding according to the
/// rounding mode currently set in MXCSR (x86) or in the ARM floating-point
/// control register, i.e. with the semantics of the 64-bit `cvtss2si`.
long convertFloatToInt64UsingMXCSR(float value) @trusted
{
    static if (LDC_with_ARM32)
    {
        // We have to resort to libc since 32-bit ARM
        // doesn't seem to have 64-bit registers.

        uint fpscr = arm_get_fpcr(); // Get current rounding mode.

        // Note: converting to double precision else rounding could be different for large integers
        double asDouble = value;

        switch(fpscr & _MM_ROUND_MASK_ARM)
        {
            default:
            // Round-to-nearest must break ties to even, like `cvtss2si` and like the
            // `vcvtr`-based 32-bit conversions above. `llvm_round` breaks ties away
            // from zero (2.5 -> 3), so `llvm_rint` is used: this branch is only
            // reached when the rounding mode is round-to-nearest.
            case _MM_ROUND_NEAREST_ARM:     return cast(long)(llvm_rint(asDouble));
            case _MM_ROUND_DOWN_ARM:        return cast(long)(llvm_floor(asDouble));
            case _MM_ROUND_UP_ARM:          return cast(long)(llvm_ceil(asDouble));
            case _MM_ROUND_TOWARD_ZERO_ARM: return cast(long)(asDouble);
        }
    }
    else static if (LDC_with_ARM64)
    {
        // Get current rounding mode.
        uint fpscr = arm_get_fpcr();

        switch(fpscr & _MM_ROUND_MASK_ARM)
        {
            default:
            case _MM_ROUND_NEAREST_ARM:     return vcvtns_s64_f32(value);
            case _MM_ROUND_DOWN_ARM:        return vcvtms_s64_f32(value);
            case _MM_ROUND_UP_ARM:          return vcvtps_s64_f32(value);
            case _MM_ROUND_TOWARD_ZERO_ARM: return vcvts_s64_f32(value);
        }
    }
    // 64-bit can use an SSE instruction
    else version(D_InlineAsm_X86_64)
    {
        long result;
        version(LDC) // work-around for " Data definition directives inside inline asm are not supported yet."
        {
            asm pure nothrow @nogc @trusted
            {
                movss XMM0, value;
                cvtss2si RAX, XMM0;
                mov result, RAX;
            }
        }
        else
        {
            asm pure nothrow @nogc @trusted
            {
                movss XMM0, value;
                db 0xf3; db 0x48; db 0x0f; db 0x2d; db 0xc0; // cvtss2si RAX, XMM0 (DMD refuses to emit)
                mov result, RAX;
            }
        }
        return result;
    }
    else version(D_InlineAsm_X86)
    {
        // In the case of 32-bit x86 there is no SSE2 way to convert FP to 64-bit int
        // This leads to an unfortunate FPU sequence in every C++ compiler.
        // See: https://godbolt.org/z/vZym77

        // Get current MXCSR rounding
        uint sseRounding;
        ushort savedFPUCW;
        ushort newFPUCW;
        long result;
        asm pure nothrow @nogc @trusted
        {
            stmxcsr sseRounding;
            fld value;
            fnstcw savedFPUCW;
            mov AX, savedFPUCW;
            and AX, 0xf3ff;          // clear FPU rounding bits
            movzx ECX, word ptr sseRounding;
            and ECX, 0x6000;         // only keep SSE rounding bits
            shr ECX, 3;
            or AX, CX;               // make a new control word for FPU with SSE bits
            mov newFPUCW, AX;
            fldcw newFPUCW;
            fistp qword ptr result;  // convert, respecting MXCSR (but not other control word things)
            fldcw savedFPUCW;
        }
        return result;
    }
    else static if (GDC_with_x86)
    {
        version(X86_64) // 64-bit can just use the right instruction
        {
            static assert(GDC_with_SSE);
            __m128 A;
            A.ptr[0] = value;
            return __builtin_ia32_cvtss2si64 (A);
        }
        else version(X86) // 32-bit
        {
            // This is untested!
            // NOTE(review): `value` is bound with constraint "f" here but with "t" in
            // convertDoubleToInt64UsingMXCSR; left as-is since neither was tested.
            uint sseRounding;
            ushort savedFPUCW;
            ushort newFPUCW;
            long result;
            asm pure nothrow @nogc @trusted
            {
                "stmxcsr %1;\n" ~
                "fld %2;\n" ~
                "fnstcw %3;\n" ~
                "movw %3, %%ax;\n" ~
                "andw $0xf3ff, %%ax;\n" ~
                "movzwl %1, %%ecx;\n" ~
                "andl $0x6000, %%ecx;\n" ~
                "shrl $3, %%ecx;\n" ~
                "orw %%cx, %%ax\n" ~
                "movw %%ax, %4;\n" ~
                "fldcw %4;\n" ~
                "fistpll %0;\n" ~
                "fldcw %3;\n"
                  : "=m"(result)    // %0
                  : "m" (sseRounding),
                    "f" (value),
                    "m" (savedFPUCW),
                    "m" (newFPUCW)
                  : "eax", "ecx", "st";
            }
            return result;
        }
        else
            static assert(false);
    }
    else
        static assert(false);
}


///ditto
long convertDoubleToInt64UsingMXCSR(double value) @trusted
{
    static if (LDC_with_ARM32)
    {
        // We have to resort to libc since 32-bit ARM
        // doesn't seem to have 64-bit registers.
        uint fpscr = arm_get_fpcr(); // Get current rounding mode.
        switch(fpscr & _MM_ROUND_MASK_ARM)
        {
            default:
            // Round-to-nearest must break ties to even, like `cvtsd2si` and like the
            // `vcvtr`-based 32-bit conversions above. `llvm_round` breaks ties away
            // from zero (2.5 -> 3), so `llvm_rint` is used: this branch is only
            // reached when the rounding mode is round-to-nearest.
            case _MM_ROUND_NEAREST_ARM:     return cast(long)(llvm_rint(value));
            case _MM_ROUND_DOWN_ARM:        return cast(long)(llvm_floor(value));
            case _MM_ROUND_UP_ARM:          return cast(long)(llvm_ceil(value));
            case _MM_ROUND_TOWARD_ZERO_ARM: return cast(long)(value);
        }
    }
    else static if (LDC_with_ARM64)
    {
        // Get current rounding mode.
        uint fpscr = arm_get_fpcr();

        switch(fpscr & _MM_ROUND_MASK_ARM)
        {
            default:
            case _MM_ROUND_NEAREST_ARM:     return vcvtns_s64_f64(value);
            case _MM_ROUND_DOWN_ARM:        return vcvtms_s64_f64(value);
            case _MM_ROUND_UP_ARM:          return vcvtps_s64_f64(value);
            case _MM_ROUND_TOWARD_ZERO_ARM: return vcvts_s64_f64(value);
        }
    }
    // 64-bit can use an SSE instruction
    else version(D_InlineAsm_X86_64)
    {
        long result;
        version(LDC) // work-around for "Data definition directives inside inline asm are not supported yet."
        {
            asm pure nothrow @nogc @trusted
            {
                movsd XMM0, value;
                cvtsd2si RAX, XMM0;
                mov result, RAX;
            }
        }
        else
        {
            asm pure nothrow @nogc @trusted
            {
                movsd XMM0, value;
                db 0xf2; db 0x48; db 0x0f; db 0x2d; db 0xc0; // cvtsd2si RAX, XMM0 (DMD refuses to emit)
                mov result, RAX;
            }
        }
        return result;
    }
    else version(D_InlineAsm_X86)
    {
        // In the case of 32-bit x86 there is no SSE2 way to convert FP to 64-bit int
        // This leads to an unfortunate FPU sequence in every C++ compiler.
        // See: https://godbolt.org/z/vZym77

        // Get current MXCSR rounding
        uint sseRounding;
        ushort savedFPUCW;
        ushort newFPUCW;
        long result;
        asm pure nothrow @nogc @trusted
        {
            stmxcsr sseRounding;
            fld value;
            fnstcw savedFPUCW;
            mov AX, savedFPUCW;
            and AX, 0xf3ff;          // clear FPU rounding bits
            movzx ECX, word ptr sseRounding;
            and ECX, 0x6000;         // only keep SSE rounding bits
            shr ECX, 3;
            or AX, CX;               // make a new control word for FPU with SSE bits
            mov newFPUCW, AX;
            fldcw newFPUCW;
            fistp qword ptr result;  // explicit 64-bit store, same as the float version
            fldcw savedFPUCW;
        }
        return result;
    }
    else static if (GDC_with_x86)
    {
        version(X86_64)
        {
            static assert(GDC_with_SSE2);
            __m128d A;
            A.ptr[0] = value;
            return __builtin_ia32_cvtsd2si64 (A);
        }
        else
        {
            // This is untested!
            // NOTE(review): `value` is bound with constraint "t" here but with "f" in
            // convertFloatToInt64UsingMXCSR; left as-is since neither was tested.
            uint sseRounding;
            ushort savedFPUCW;
            ushort newFPUCW;
            long result;
            asm pure nothrow @nogc @trusted
            {
                "stmxcsr %1;\n" ~
                "fld %2;\n" ~
                "fnstcw %3;\n" ~
                "movw %3, %%ax;\n" ~
                "andw $0xf3ff, %%ax;\n" ~
                "movzwl %1, %%ecx;\n" ~
                "andl $0x6000, %%ecx;\n" ~
                "shrl $3, %%ecx;\n" ~
                "orw %%cx, %%ax\n" ~
                "movw %%ax, %4;\n" ~
                "fldcw %4;\n" ~
                "fistpll %0;\n" ~
                "fldcw %3;\n"
                  : "=m"(result)    // %0
                  : "m" (sseRounding),
                    "t" (value),
                    "m" (savedFPUCW),
                    "m" (newFPUCW)
                  : "eax", "ecx", "st";
            }
            return result;
        }
    }
    else
        static assert(false);
}

//
//  </ROUNDING>
//


// using the Intel terminology here

/// Clamps a signed 16-bit value to the range of a signed 8-bit value [-128, 127].
byte saturateSignedWordToSignedByte(short value) pure @safe
{
    if (value > 127) value = 127;
    if (value < -128) value = -128;
    return cast(byte) value;
}

/// Clamps a signed 16-bit value to the range of an unsigned 8-bit value [0, 255].
ubyte saturateSignedWordToUnsignedByte(short value) pure @safe
{
    if (value > 255) value = 255;
    if (value < 0) value = 0;
    return cast(ubyte) value;
}

/// Clamps a signed 32-bit value to the range of a signed 16-bit value [-32768, 32767].
short saturateSignedIntToSignedShort(int value) pure @safe
{
    if (value > 32767) value = 32767;
    if (value < -32768) value = -32768;
    return cast(short) value;
}

/// Clamps a signed 32-bit value to the range of an unsigned 16-bit value [0, 65535].
ushort saturateSignedIntToUnsignedShort(int value) pure @safe
{
    if (value > 65535) value = 65535;
    if (value < 0) value = 0;
    return cast(ushort) value;
}

unittest // test saturate operations
{
    assert( saturateSignedWordToSignedByte(32000) == 127);
    assert( saturateSignedWordToUnsignedByte(32000) == 255);
    assert( saturateSignedWordToSignedByte(-4000) == -128);
    assert( saturateSignedWordToUnsignedByte(-4000) == 0);
    assert( saturateSignedIntToSignedShort(32768) == 32767);
    assert( saturateSignedIntToUnsignedShort(32768) == 32768);
    assert( saturateSignedIntToSignedShort(-32769) == -32768);
    assert( saturateSignedIntToUnsignedShort(-32769) == 0);

    // Values exactly on a boundary, or inside the range, pass through unchanged.
    assert( saturateSignedWordToSignedByte(127) == 127);
    assert( saturateSignedWordToSignedByte(-128) == -128);
    assert( saturateSignedWordToUnsignedByte(255) == 255);
    assert( saturateSignedWordToUnsignedByte(0) == 0);
    assert( saturateSignedIntToSignedShort(32767) == 32767);
    assert( saturateSignedIntToSignedShort(-32768) == -32768);
    assert( saturateSignedIntToUnsignedShort(65535) == 65535);
    assert( saturateSignedIntToUnsignedShort(65536) == 65535);
    assert( saturateSignedIntToUnsignedShort(-1) == 0);
}

version(unittest)
{
    // Debug-only helpers: dump the lanes of a vector to stdout while
    // working on the unit tests.
    import core.stdc.stdio: printf;

    // printing vectors for implementation
    // Note: you can override `pure` within a `debug` clause

    /// Prints the single 64-bit lane of `v`.
    void _mm_print_pi64(__m64 v) @trusted
    {
        printf("%lld\n", (cast(long1)v).array[0]);
    }

    /// Prints `v` as two 32-bit integers.
    void _mm_print_pi32(__m64 v) @trusted
    {
        int2 lanes = cast(int2)v;
        printf("%d %d\n", lanes.array[0], lanes.array[1]);
    }

    /// Prints `v` as four 16-bit integers.
    void _mm_print_pi16(__m64 v) @trusted
    {
        short4 lanes = cast(short4)v;
        printf("%d %d %d %d\n",
               lanes.array[0], lanes.array[1], lanes.array[2], lanes.array[3]);
    }

    /// Prints `v` as eight 8-bit integers.
    void _mm_print_pi8(__m64 v) @trusted
    {
        byte8 lanes = cast(byte8)v;
        printf("%d %d %d %d %d %d %d %d\n",
               lanes.array[0], lanes.array[1], lanes.array[2], lanes.array[3],
               lanes.array[4], lanes.array[5], lanes.array[6], lanes.array[7]);
    }

    /// Prints `v` as two 64-bit integers.
    void _mm_print_epi64(__m128i v) @trusted
    {
        long[2] lanes = (cast(long2)v).array;
        printf("%lld %lld\n", lanes[0], lanes[1]);
    }

    /// Prints `v` as four 32-bit integers.
    void _mm_print_epi32(__m128i v) @trusted
    {
        int[4] lanes = v.array;
        printf("%d %d %d %d\n",
               lanes[0], lanes[1], lanes[2], lanes[3]);
    }

    /// Prints `v` as eight 16-bit integers.
    void _mm_print_epi16(__m128i v) @trusted
    {
        short8 lanes = cast(short8)v;
        printf("%d %d %d %d %d %d %d %d\n",
               lanes.array[0], lanes.array[1], lanes.array[2], lanes.array[3],
               lanes.array[4], lanes.array[5], lanes.array[6], lanes.array[7]);
    }

    /// Prints `v` as sixteen 8-bit integers.
    void _mm_print_epi8(__m128i v) @trusted
    {
        byte16 lanes = cast(byte16)v;
        printf("%d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d\n",
               lanes.array[0],  lanes.array[1],  lanes.array[2],  lanes.array[3],
               lanes.array[4],  lanes.array[5],  lanes.array[6],  lanes.array[7],
               lanes.array[8],  lanes.array[9],  lanes.array[10], lanes.array[11],
               lanes.array[12], lanes.array[13], lanes.array[14], lanes.array[15]);
    }

    /// Prints `v` as four single-precision floats.
    void _mm_print_ps(__m128 v) @trusted
    {
        float4 lanes = cast(float4)v;
        printf("%f %f %f %f\n",
               lanes.array[0], lanes.array[1], lanes.array[2], lanes.array[3]);
    }

    /// Prints `v` as two double-precision floats.
    void _mm_print_pd(__m128d v) @trusted
    {
        double2 lanes = cast(double2)v;
        printf("%f %f\n", lanes.array[0], lanes.array[1]);
    }
}


//
//  <FLOATING-POINT COMPARISONS>
//
// Note: `ldc.simd` cannot express all nuances of FP comparisons, so we
//       need
//       different IR generation.

/// The comparison predicates. The member names are spliced verbatim into
/// `fcmp` instructions by the LDC code paths below.
enum FPComparison
{
    oeq,   // ordered and equal
    ogt,   // ordered and greater than
    oge,   // ordered and greater than or equal
    olt,   // ordered and less than
    ole,   // ordered and less than or equal
    one,   // ordered and not equal
    ord,   // ordered (no nans)
    ueq,   // unordered or equal
    ugt,   // unordered or greater than ("nle")
    uge,   // unordered or greater than or equal ("nlt")
    ult,   // unordered or less than ("nge")
    ule,   // unordered or less than or equal ("ngt")
    une,   // unordered or not equal ("neq")
    uno,   // unordered (either nans)
}

// Predicate spelling for each `FPComparison` member; indexed by the enum
// value, so the order here must stay the same as in the enum.
private static immutable string[FPComparison.max+1] FPComparisonToString =
[
    "oeq",
    "ogt",
    "oge",
    "olt",
    "ole",
    "one",
    "ord",
    "ueq",
    "ugt",
    "uge",
    "ult",
    "ule",
    "une",
    "uno",
];

// Individual float comparison: returns `true` or `false`.
// The callers below turn that into -1/0 (cmpps, cmppd, cmpss, cmpsd) or 1/0 (comss, comsd).
// Useful for DMD and testing
private bool compareFloat(T)(FPComparison comparison, T a, T b) pure @safe
{
    import std.math;
    bool unordered = isNaN(a) || isNaN(b);
    final switch(comparison) with(FPComparison)
    {
        case oeq: return a == b;
        case ogt: return a > b;
        case oge: return a >= b;
        case olt: return a < b;
        case ole: return a <= b;
        case one: return !unordered && (a != b); // NaN with != always yields true
        case ord: return !unordered;
        case ueq: return unordered || (a == b);
        case ugt: return unordered || (a > b);
        case uge: return unordered || (a >= b);
        case ult: return unordered || (a < b);
        case ule: return unordered || (a <= b);
        case une: return (a != b); // NaN with != always yields true
        case uno: return unordered;
    }
}

version(LDC)
{
    /// Provides packed float comparisons
    /// Each lane is the `fcmp` result sign-extended to 32 bits: -1 when true, 0 when false.
    package int4 cmpps(FPComparison comparison)(float4 a, float4 b) pure @safe
    {
        enum ir = `
            %cmp = fcmp `~ FPComparisonToString[comparison] ~` <4 x float> %0, %1
            %r = sext <4 x i1> %cmp to <4 x i32>
            ret <4 x i32> %r`;

        return LDCInlineIR!(ir, int4, float4, float4)(a, b);
    }

    /// Provides packed double comparisons
    /// Each lane is the `fcmp` result sign-extended to 64 bits: -1 when true, 0 when false.
    package long2 cmppd(FPComparison comparison)(double2 a, double2 b) pure @safe
    {
        enum ir = `
            %cmp = fcmp `~ FPComparisonToString[comparison] ~` <2 x double> %0, %1
            %r = sext <2 x i1> %cmp to <2 x i64>
            ret <2 x i64> %r`;

        return LDCInlineIR!(ir, long2, double2, double2)(a, b);
    }

    /// CMPSS-style comparisons
    /// clang implement it through x86 intrinsics, it is possible with IR alone
    /// but leads to less optimal code.
    /// PERF: try to implement it with __builtin_ia32_cmpss and immediate 0 to 7.
    /// Not that simple.
    /// Only lane 0 is compared; lanes 1 to 3 of the result are copied from `a`.
    package float4 cmpss(FPComparison comparison)(float4 a, float4 b) pure @safe
    {
        // Abandoned sketch of the builtin-based version.
        // NOTE(review): it names `__builtin_ia32_cmpsd`; for this float variant
        // `__builtin_ia32_cmpss` was presumably intended - confirm before reviving.
        /*
        enum ubyte predicateNumber = FPComparisonToX86Predicate[comparison];
        enum bool invertOp = (predicateNumber & 0x80) != 0;
        static if(invertOp)
            return __builtin_ia32_cmpsd(b, a, predicateNumber & 0x7f);
        else
            return __builtin_ia32_cmpsd(a, b, predicateNumber & 0x7f);
        */
        enum ir = `
            %cmp = fcmp `~ FPComparisonToString[comparison] ~` float %0, %1
            %r = sext i1 %cmp to i32
            %r2 = bitcast i32 %r to float
            ret float %r2`;

        float4 r = a;
        r[0] = LDCInlineIR!(ir, float, float, float)(a[0], b[0]);
        return r;
    }

    /// CMPSD-style comparisons
    /// clang implement it through x86 intrinsics, it is possible with IR alone
    /// but leads to less optimal code.
    /// PERF: try to implement it with __builtin_ia32_cmpsd and immediate 0 to 7.
    /// Not that simple.
    /// Only lane 0 is compared; lane 1 of the result is copied from `a`.
    package double2 cmpsd(FPComparison comparison)(double2 a, double2 b) pure @safe
    {
        enum ir = `
            %cmp = fcmp `~ FPComparisonToString[comparison] ~` double %0, %1
            %r = sext i1 %cmp to i64
            %r2 = bitcast i64 %r to double
            ret double %r2`;

        double2 r = a;
        r[0] = LDCInlineIR!(ir, double, double, double)(a[0], b[0]);
        return r;
    }

    // Compares lane 0 of `a` and `b`; the i1 result is zero-extended, giving 1 or 0.
    // Note: ucomss and ucomsd are left unimplemented
    package int comss(FPComparison comparison)(float4 a, float4 b) pure @safe
    {
        enum ir = `
            %cmp = fcmp `~ FPComparisonToString[comparison] ~` float %0, %1
            %r = zext i1 %cmp to i32
            ret i32 %r`;

        return LDCInlineIR!(ir, int, float, float)(a[0], b[0]);
    }

    // Compares lane 0 of `a` and `b`; the i1 result is zero-extended, giving 1 or 0.
    // Note: ucomss and ucomsd are left unimplemented
    package int comsd(FPComparison comparison)(double2 a, double2 b) pure @safe
    {
        enum ir = `
            %cmp = fcmp `~ FPComparisonToString[comparison] ~` double %0, %1
            %r = zext i1 %cmp to i32
            ret i32 %r`;

        return LDCInlineIR!(ir, int, double, double)(a[0], b[0]);
    }
}
else
{
    // Fallback for compilers other than LDC: the same six functions,
    // implemented lane by lane on top of `compareFloat`.

    /// Provides packed float comparisons
    package int4 cmpps(FPComparison comparison)(float4 a, float4 b) pure @trusted
    {
        int4 result;
        foreach(i; 0..4)
        {
            result.ptr[i] = compareFloat!float(comparison, a.array[i], b.array[i]) ? -1 : 0;
        }
        return result;
    }

    /// Provides packed double comparisons
    package long2 cmppd(FPComparison comparison)(double2 a, double2 b) pure @trusted
    {
        long2 result;
        foreach(i; 0..2)
        {
            result.ptr[i] = compareFloat!double(comparison, a.array[i], b.array[i]) ? -1 : 0;
        }
        return result;
    }

    /// Provides CMPSS-style comparison
    /// Only lane 0 is replaced by the comparison mask; the other lanes keep the bits of `a`.
    package float4 cmpss(FPComparison comparison)(float4 a, float4 b) pure @trusted
    {
        int4 result = cast(int4)a;
        result.ptr[0] = compareFloat!float(comparison, a.array[0], b.array[0]) ? -1 : 0;
        return cast(float4)result;
    }

    /// Provides CMPSD-style comparison
    /// Only lane 0 is replaced by the comparison mask; lane 1 keeps the bits of `a`.
    package double2 cmpsd(FPComparison comparison)(double2 a, double2 b) pure @trusted
    {
        long2 result = cast(long2)a;
        result.ptr[0] = compareFloat!double(comparison, a.array[0], b.array[0]) ? -1 : 0;
        return cast(double2)result;
    }

    // Compares lane 0 of `a` and `b`, returning 1 or 0.
    package int comss(FPComparison comparison)(float4 a, float4 b) pure @safe
    {
        return compareFloat!float(comparison, a.array[0], b.array[0]) ? 1 : 0;
    }

    // Compares lane 0 of `a` and `b`, returning 1 or 0.
    // Note: ucomss and ucomsd are left unimplemented
    package int comsd(FPComparison comparison)(double2 a, double2 b) pure @safe
    {
        return compareFloat!double(comparison, a.array[0], b.array[0]) ? 1 : 0;
    }
}
unittest // cmpps
{
    // Check that every comparison type works.
    // The four lanes cover: less than, equal, greater than, and unordered (NaN).
    float4 A = [1, 3, 5, float.nan];
    float4 B = [2, 3, 4, 5];

    int4 result_oeq = cmpps!(FPComparison.oeq)(A, B);
    int4 result_ogt = cmpps!(FPComparison.ogt)(A, B);
    int4 result_oge = cmpps!(FPComparison.oge)(A, B);
    int4 result_olt = cmpps!(FPComparison.olt)(A, B);
    int4 result_ole = cmpps!(FPComparison.ole)(A, B);
    int4 result_one = cmpps!(FPComparison.one)(A, B);
    int4 result_ord = cmpps!(FPComparison.ord)(A, B);
    int4 result_ueq = cmpps!(FPComparison.ueq)(A, B);
    int4 result_ugt = cmpps!(FPComparison.ugt)(A, B);
    int4 result_uge = cmpps!(FPComparison.uge)(A, B);
    int4 result_ult = cmpps!(FPComparison.ult)(A, B);
    int4 result_ule = cmpps!(FPComparison.ule)(A, B);
    int4 result_une = cmpps!(FPComparison.une)(A, B);
    int4 result_uno = cmpps!(FPComparison.uno)(A, B);

    static immutable int[4] correct_oeq    = [ 0,-1, 0, 0];
    static immutable int[4] correct_ogt    = [ 0, 0,-1, 0];
    static immutable int[4] correct_oge    = [ 0,-1,-1, 0];
    static immutable int[4] correct_olt    = [-1, 0, 0, 0];
    static immutable int[4] correct_ole    = [-1,-1, 0, 0];
    static immutable int[4] correct_one    = [-1, 0,-1, 0];
    static immutable int[4] correct_ord    = [-1,-1,-1, 0];
    static immutable int[4] correct_ueq    = [ 0,-1, 0,-1];
    static immutable int[4] correct_ugt    = [ 0, 0,-1,-1];
    static immutable int[4] correct_uge    = [ 0,-1,-1,-1];
    static immutable int[4] correct_ult    = [-1, 0, 0,-1];
    static immutable int[4] correct_ule    = [-1,-1, 0,-1];
    static immutable int[4] correct_une    = [-1, 0,-1,-1];
    static immutable int[4] correct_uno    = [ 0, 0, 0,-1];

    assert(result_oeq.array == correct_oeq);
    assert(result_ogt.array == correct_ogt);
    assert(result_oge.array == correct_oge);
    assert(result_olt.array == correct_olt);
    assert(result_ole.array == correct_ole);
    assert(result_one.array == correct_one);
    assert(result_ord.array == correct_ord);
    assert(result_ueq.array == correct_ueq);
    assert(result_ugt.array == correct_ugt);
    assert(result_uge.array == correct_uge);
    assert(result_ult.array == correct_ult);
    assert(result_ule.array == correct_ule);
    assert(result_une.array == correct_une);
    assert(result_uno.array == correct_uno);
}
unittest // cmppd
{
    double2 a = [1, 3];
    double2 b = [2, 3];
    long2 c = cmppd!(FPComparison.ult)(a, b);
    static immutable long[2] correct = [cast(long)(-1), 0];
    assert(c.array == correct);
}
unittest // cmpss and comss
{
    // Checks cmpss and comss against `compareFloat` for one predicate.
    void testComparison(FPComparison comparison)(float4 A, float4 B)
    {
        float4 result = cmpss!comparison(A, B);
        int4 iresult = cast(int4)result;
        int expected = compareFloat!float(comparison, A.array[0], B.array[0]) ? -1 : 0;
        assert(iresult.array[0] == expected);
        // the upper lanes must come from A unchanged
        assert(result.array[1] == A.array[1]);
        assert(result.array[2] == A.array[2]);
        assert(result.array[3] == A.array[3]);

        // check comss
        int comResult = comss!comparison(A, B);
        assert( (expected != 0) == (comResult != 0) );
    }

    // Check that every comparison type works, with an ordered (B)
    // and an unordered (C) right-hand side.
    float4 A = [1, 3, 5, 6];
    float4 B = [2, 3, 4, 5];
    float4 C = [float.nan, 3, 4, 5];

    testComparison!(FPComparison.oeq)(A, B);
    testComparison!(FPComparison.oeq)(A, C);
    testComparison!(FPComparison.ogt)(A, B);
    testComparison!(FPComparison.ogt)(A, C);
    testComparison!(FPComparison.oge)(A, B);
    testComparison!(FPComparison.oge)(A, C);
    testComparison!(FPComparison.olt)(A, B);
    testComparison!(FPComparison.olt)(A, C);
    testComparison!(FPComparison.ole)(A, B);
    testComparison!(FPComparison.ole)(A, C);
    testComparison!(FPComparison.one)(A, B);
    testComparison!(FPComparison.one)(A, C);
    testComparison!(FPComparison.ord)(A, B);
    testComparison!(FPComparison.ord)(A, C);
    testComparison!(FPComparison.ueq)(A, B);
    testComparison!(FPComparison.ueq)(A, C);
    testComparison!(FPComparison.ugt)(A, B);
    testComparison!(FPComparison.ugt)(A, C);
    testComparison!(FPComparison.uge)(A, B);
    testComparison!(FPComparison.uge)(A, C);
    testComparison!(FPComparison.ult)(A, B);
    testComparison!(FPComparison.ult)(A, C);
    testComparison!(FPComparison.ule)(A, B);
    testComparison!(FPComparison.ule)(A, C);
    testComparison!(FPComparison.une)(A, B);
    testComparison!(FPComparison.une)(A, C);
    testComparison!(FPComparison.uno)(A, B);
    testComparison!(FPComparison.uno)(A, C);
}
unittest // cmpsd and comsd
{
    // Checks cmpsd and comsd against `compareFloat` for one predicate.
    void testComparison(FPComparison comparison)(double2 A, double2 B)
    {
        double2 result = cmpsd!comparison(A, B);
        long2 iresult = cast(long2)result;
        long expected = compareFloat!double(comparison, A.array[0], B.array[0]) ? -1 : 0;
        assert(iresult.array[0] == expected);
        // the upper lane must come from A unchanged
        assert(result.array[1] == A.array[1]);

        // check comsd
        int comResult = comsd!comparison(A, B);
        assert( (expected != 0) == (comResult != 0) );
    }

    // Check that every comparison type works, with an ordered (B)
    // and an unordered (C) right-hand side.
    double2 A = [1, 3];
    double2 B = [2, 4];
    double2 C = [double.nan, 5];

    testComparison!(FPComparison.oeq)(A, B);
    testComparison!(FPComparison.oeq)(A, C);
    testComparison!(FPComparison.ogt)(A, B);
    testComparison!(FPComparison.ogt)(A, C);
    testComparison!(FPComparison.oge)(A, B);
    testComparison!(FPComparison.oge)(A, C);
    testComparison!(FPComparison.olt)(A, B);
    testComparison!(FPComparison.olt)(A, C);
    testComparison!(FPComparison.ole)(A, B);
    testComparison!(FPComparison.ole)(A, C);
    testComparison!(FPComparison.one)(A, B);
    testComparison!(FPComparison.one)(A, C);
    testComparison!(FPComparison.ord)(A, B);
    testComparison!(FPComparison.ord)(A, C);
    testComparison!(FPComparison.ueq)(A, B);
    testComparison!(FPComparison.ueq)(A, C);
    testComparison!(FPComparison.ugt)(A, B);
    testComparison!(FPComparison.ugt)(A, C);
    testComparison!(FPComparison.uge)(A, B);
    testComparison!(FPComparison.uge)(A, C);
    testComparison!(FPComparison.ult)(A, B);
    testComparison!(FPComparison.ult)(A, C);
    testComparison!(FPComparison.ule)(A, B);
    testComparison!(FPComparison.ule)(A, C);
    testComparison!(FPComparison.une)(A, B);
    testComparison!(FPComparison.une)(A, C);
    testComparison!(FPComparison.uno)(A, B);
    testComparison!(FPComparison.uno)(A, C);
}

//
//  </FLOATING-POINT COMPARISONS>
//


/// Returns the low 64 bits (lane 0 when viewed as two longs) of `a` as an `__m64`.
__m64 to_m64(__m128i a) pure @trusted
{
    long2 la = cast(long2)a;
    long1 r;
    r.ptr[0] = la.array[0];
    return r;
}

/// Widens `a` to an `__m128i`: `a` goes into the low 64 bits, the high 64 bits are zero.
__m128i to_m128i(__m64 a) pure @trusted
{
    long2 r = [0, 0];
    r.ptr[0] = a.array[0];
    return cast(__m128i)r;
}

// SOME NEON INTRINSICS
// Emulating some x86 intrinsics needs access to a range of ARM intrinsics.
// Not in the public API but the simde project expose it all for the user to use.
// MAYDO: create a new neon.d module, for internal use only.
// MAYDO: port them to ARM32 so that ARM32 can be as fast as ARM64.
static if (LDC_with_ARM64)
{
    // VERY USEFUL LINK
    // https://github.com/ldc-developers/llvm-project/blob/ldc-release/11.x/llvm/include/llvm/IR/IntrinsicsAArch64.td

    // Bound to the LLVM intrinsic "llvm.aarch64.neon.addp.v8i8".
    pragma(LDC_intrinsic, "llvm.aarch64.neon.addp.v8i8")
        byte8 vpadd_u8(byte8 a, byte8 b) pure @safe;

    // Lane-wise bitwise AND.
    byte8 vand_u8(byte8 a, byte8 b) pure @safe
    {
        return a & b;
    }

    // Concatenates two 2-lane vectors: `lo` fills lanes 0-1, `hi` fills lanes 2-3.
    int4 vcombine_s32(int2 lo, int2 hi) pure @trusted
    {
        int4 r;
        r.ptr[0] = lo.array[0];
        r.ptr[1] = lo.array[1];
        r.ptr[2] = hi.array[0];
        r.ptr[3] = hi.array[1];
        return r;
    }

    // Concatenates two 8-lane vectors: `lo` fills lanes 0-7, `hi` fills lanes 8-15.
    byte16 vcombine_s8(byte8 lo, byte8 hi) pure @trusted
    {
        byte16 r;
        r.ptr[0] = lo.array[0];
        r.ptr[1] = lo.array[1];
        r.ptr[2] = lo.array[2];
        r.ptr[3] = lo.array[3];
        r.ptr[4] = lo.array[4];
        r.ptr[5] = lo.array[5];
        r.ptr[6] = lo.array[6];
        r.ptr[7] = lo.array[7];
        r.ptr[8] = hi.array[0];
        r.ptr[9] = hi.array[1];
        r.ptr[10] = hi.array[2];
        r.ptr[11] = hi.array[3];
        r.ptr[12] = hi.array[4];
        r.ptr[13] = hi.array[5];
        r.ptr[14] = hi.array[6];
        r.ptr[15] = hi.array[7];
        return r;
    }

    // Bound to the LLVM intrinsic "llvm.aarch64.neon.fcvtms.v4i32.v4f32".
    pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtms.v4i32.v4f32")
        int4 vcvtmq_s32_f32(float4 a) pure @safe;

    // Bound to the LLVM intrinsic "llvm.aarch64.neon.fcvtns.v4i32.v4f32".
    pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtns.v4i32.v4f32")
        int4 vcvtnq_s32_f32(float4 a) pure @safe;

    // Bound to the LLVM intrinsic "llvm.aarch64.neon.fcvtps.v4i32.v4f32".
    pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtps.v4i32.v4f32")
        int4 vcvtpq_s32_f32(float4 a) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtzs.v4i32.v4f32")
int4 vcvtzq_s32_f32(float4 a) pure @safe; 1153 1154 pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtms.i32.f32") 1155 int vcvtms_s32_f32(float a) pure @safe; 1156 1157 pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtns.i32.f32") 1158 int vcvtns_s32_f32(float a) pure @safe; 1159 1160 pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtps.i32.f32") 1161 int vcvtps_s32_f32(float a) pure @safe; 1162 1163 pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtzs.i32.f32") 1164 int vcvts_s32_f32(float a) pure @safe; 1165 1166 pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtms.i32.f64") 1167 int vcvtms_s32_f64(double a) pure @safe; 1168 1169 pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtns.i32.f64") 1170 int vcvtns_s32_f64(double a) pure @safe; 1171 1172 pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtps.i32.f64") 1173 int vcvtps_s32_f64(double a) pure @safe; 1174 1175 pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtzs.i32.f64") 1176 int vcvts_s32_f64(double a) pure @safe; 1177 1178 pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtms.i64.f32") 1179 long vcvtms_s64_f32(float a) pure @safe; 1180 1181 pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtns.i64.f32") 1182 long vcvtns_s64_f32(float a) pure @safe; 1183 1184 pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtps.i64.f32") 1185 long vcvtps_s64_f32(float a) pure @safe; 1186 1187 pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtzs.i64.f32") 1188 long vcvts_s64_f32(float a) pure @safe; 1189 1190 pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtms.i64.f64") 1191 long vcvtms_s64_f64(double a) pure @safe; 1192 1193 pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtns.i64.f64") 1194 long vcvtns_s64_f64(double a) pure @safe; 1195 1196 pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtps.i64.f64") 1197 long vcvtps_s64_f64(double a) pure @safe; 1198 1199 pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtzs.i64.f64") 1200 long vcvts_s64_f64(double a) pure @safe; 1201 1202 short4 vget_high_s16(short8 a) pure @trusted 1203 { 1204 short4 r; 1205 r.ptr[0] = a.array[4]; 1206 r.ptr[1] = a.array[5]; 1207 
r.ptr[2] = a.array[6]; 1208 r.ptr[3] = a.array[7]; 1209 return r; 1210 } 1211 1212 int2 vget_high_s32(int4 a) pure @trusted 1213 { 1214 int2 r; 1215 r.ptr[0] = a.array[2]; 1216 r.ptr[1] = a.array[3]; 1217 return r; 1218 } 1219 1220 byte8 vget_high_u8(byte16 a) pure @trusted 1221 { 1222 byte8 r; 1223 r.ptr[0] = a.array[8]; 1224 r.ptr[1] = a.array[9]; 1225 r.ptr[2] = a.array[10]; 1226 r.ptr[3] = a.array[11]; 1227 r.ptr[4] = a.array[12]; 1228 r.ptr[5] = a.array[13]; 1229 r.ptr[6] = a.array[14]; 1230 r.ptr[7] = a.array[15]; 1231 return r; 1232 } 1233 1234 short4 vget_low_s16(short8 a) pure @trusted 1235 { 1236 short4 r; 1237 r.ptr[0] = a.array[0]; 1238 r.ptr[1] = a.array[1]; 1239 r.ptr[2] = a.array[2]; 1240 r.ptr[3] = a.array[3]; 1241 return r; 1242 } 1243 1244 int2 vget_low_s32(int4 a) pure @trusted 1245 { 1246 int2 r; 1247 r.ptr[0] = a.array[0]; 1248 r.ptr[1] = a.array[1]; 1249 return r; 1250 } 1251 1252 byte8 vget_low_u8(byte16 a) pure @trusted 1253 { 1254 byte8 r; 1255 r.ptr[0] = a.array[0]; 1256 r.ptr[1] = a.array[1]; 1257 r.ptr[2] = a.array[2]; 1258 r.ptr[3] = a.array[3]; 1259 r.ptr[4] = a.array[4]; 1260 r.ptr[5] = a.array[5]; 1261 r.ptr[6] = a.array[6]; 1262 r.ptr[7] = a.array[7]; 1263 return r; 1264 } 1265 1266 pragma(LDC_intrinsic, "llvm.aarch64.neon.smax.v8i16") 1267 short8 vmaxq_s16(short8 a, short8 b) pure @safe; 1268 1269 pragma(LDC_intrinsic, "llvm.aarch64.neon.smin.v8i16") 1270 short8 vminq_s16(short8 a, short8 b) pure @safe; 1271 1272 int4 vmull_s16(short4 a, short4 b) pure @trusted 1273 { 1274 int4 r; 1275 r.ptr[0] = a.array[0] * b.array[0]; 1276 r.ptr[1] = a.array[1] * b.array[1]; 1277 r.ptr[2] = a.array[2] * b.array[2]; 1278 r.ptr[3] = a.array[3] * b.array[3]; 1279 return r; 1280 } 1281 1282 static if(__VERSION__ >= 2088) // LDC 1.18 start using LLVM9 who changes the name of the builtin 1283 { 1284 pragma(LDC_intrinsic, "llvm.aarch64.neon.faddp.v4f32") 1285 float4 vpaddq_f32(float4 a, float4 b) pure @safe; 1286 } 1287 else 1288 { 1289 
pragma(LDC_intrinsic, "llvm.aarch64.neon.addp.v4f32") 1290 float4 vpaddq_f32(float4 a, float4 b) pure @safe; 1291 } 1292 1293 pragma(LDC_intrinsic, "llvm.aarch64.neon.addp.v2i32") 1294 int2 vpadd_s32(int2 a, int2 b) pure @safe; 1295 1296 pragma(LDC_intrinsic, "llvm.aarch64.neon.addp.v16i8") 1297 byte16 vpaddq_s8(byte16 a, byte16 b) pure @safe; 1298 1299 pragma(LDC_intrinsic, "llvm.aarch64.neon.sqxtn.v8i8") 1300 byte8 vqmovn_s16(short8 a) pure @safe; 1301 1302 pragma(LDC_intrinsic, "llvm.aarch64.neon.sqxtun.v8i8") 1303 byte8 vqmovun_s16(short8 a) pure @safe; 1304 1305 pragma(LDC_intrinsic, "llvm.aarch64.neon.urhadd.v16i8") 1306 byte16 vrhadd_u8(byte16 a, byte16 b) pure @safe; 1307 1308 pragma(LDC_intrinsic, "llvm.aarch64.neon.urhadd.v8i16") 1309 short8 vrhadd_u16(short8 a, short8 b) pure @safe; 1310 1311 byte8 vshr_u8(byte8 a, byte8 b) pure @safe 1312 { 1313 return a >>> b; 1314 } 1315 } 1316