/**
* Internal stuff only, do not import.
*
* Copyright: Copyright Guillaume Piolat 2016-2020, Stefanos Baziotis 2019.
* License:   $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
*/
module inteli.internals;

import inteli.types;

// The only math functions needed for intel-intrinsics
public import core.math: sqrt; // since it's an intrinsic

package:
nothrow:
@nogc:


// Compile-time capability flags for GDC.
// Every branch below defines the same set of `GDC_with_*` enums, so a
// `static if (GDC_with_xxx)` test always resolves whatever the compiler/target.
version(GNU)
{
    version (X86)
    {
        // For 32-bit x86, disable vector extensions with GDC.
        // It just doesn't work well.
        enum GDC_with_x86 = true;
        enum GDC_with_MMX = false;
        enum GDC_with_SSE = false;
        enum GDC_with_SSE2 = false;
        enum GDC_with_SSE3 = false;
        enum GDC_with_SSSE3 = false;
        enum GDC_with_SHA = false;
    }
    else version (X86_64)
    {
        // GDC support uses extended inline assembly:
        //   https://gcc.gnu.org/onlinedocs/gcc/Extended-Asm.html        (general information and hints)
        //   https://gcc.gnu.org/onlinedocs/gcc/Simple-Constraints.html  (binding variables to registers)
        //   https://gcc.gnu.org/onlinedocs/gcc/Machine-Constraints.html (x86 specific register short names)

        public import core.simd;

        // NOTE: These intrinsics are not available in every i386 and x86_64 CPU.
        // For more info: https://gcc.gnu.org/onlinedocs/gcc-4.9.2/gcc/X86-Built-in-Functions.html
        public import gcc.builtins;

        enum GDC_with_x86 = true;
        enum GDC_with_MMX = true;    // We don't have a way to detect that at CT, but we assume it's there
        enum GDC_with_SSE = true;    // We don't have a way to detect that at CT, but we assume it's there
        enum GDC_with_SSE2 = true;   // We don't have a way to detect that at CT, but we assume it's there
        enum GDC_with_SSE3 = false;  // TODO: we don't have a way to detect that at CT
        enum GDC_with_SSSE3 = false; // TODO: we don't have a way to detect that at CT
        enum GDC_with_SHA = false;
    }
    else
    {
        enum GDC_with_x86 = false;
        enum GDC_with_MMX = false;
        enum GDC_with_SSE = false;
        enum GDC_with_SSE2 = false;
        enum GDC_with_SSE3 = false;
        enum GDC_with_SSSE3 = false;
        enum GDC_with_SHA = false;
    }
}
else
{
    enum GDC_with_x86 = false;
    enum GDC_with_MMX = false;
    enum GDC_with_SSE = false;
    enum GDC_with_SSE2 = false;
    enum GDC_with_SSE3 = false;
    enum GDC_with_SSSE3 = false;
    enum GDC_with_SHA = false;
}

// Compile-time capability flags for LDC.
// As for GDC, every branch defines the full set of `LDC_with_*` enums.
version(LDC)
{
    public import core.simd;
    public import ldc.simd;
    public import ldc.intrinsics;
    public import ldc.llvmasm: __asm;

    // Since LDC 1.13, using the new ldc.llvmasm.__ir variants instead of inlineIR
    static if (__VERSION__ >= 2083)
    {
        import ldc.llvmasm;
        alias LDCInlineIR = __ir_pure;

        // A version of inline IR with prefix/suffix didn't exist before LDC 1.13
        alias LDCInlineIREx = __irEx_pure;
    }
    else
    {
        // Note: `LDCInlineIREx` is deliberately left undefined on this path.
        alias LDCInlineIR = inlineIR;
    }

    version(ARM)
    {
        public import ldc.gccbuiltins_arm;
        enum LDC_with_ARM32 = true;
        enum LDC_with_ARM64 = false;
        enum LDC_with_SSE1 = false;
        enum LDC_with_SSE2 = false;
        enum LDC_with_SSE3 = false;
        enum LDC_with_SSSE3 = false;
        enum LDC_with_SSE41 = false;
        enum LDC_with_SSE42 = false;
        enum LDC_with_AVX = false;
        enum LDC_with_AVX2 = false;
        enum LDC_with_SHA = false;
    }
    else version(AArch64)
    {
        enum LDC_with_ARM32 = false;
        enum LDC_with_ARM64 = true;
        enum LDC_with_SSE1 = false;
        enum LDC_with_SSE2 = false;
        enum LDC_with_SSE3 = false;
        enum LDC_with_SSSE3 = false;
        enum LDC_with_SSE41 = false;
        enum LDC_with_SSE42 = false;
        enum LDC_with_AVX = false;
        enum LDC_with_AVX2 = false;
        enum LDC_with_SHA = false;
    }
    else
    {
        // Any other LDC target is treated as x86: the feature flags are
        // queried from the target description at compile time.
        public import ldc.gccbuiltins_x86;
        enum LDC_with_ARM32 = false;
        enum LDC_with_ARM64 = false;
        enum LDC_with_SSE1 = __traits(targetHasFeature, "sse");
        enum LDC_with_SSE2 = __traits(targetHasFeature, "sse2");
        enum LDC_with_SSE3 = __traits(targetHasFeature, "sse3");
        enum LDC_with_SSSE3 = __traits(targetHasFeature, "ssse3");
        enum LDC_with_SSE41 = __traits(targetHasFeature, "sse4.1");
        enum LDC_with_SSE42 = __traits(targetHasFeature, "sse4.2");
        enum LDC_with_AVX = __traits(targetHasFeature, "avx");
        enum LDC_with_AVX2 = __traits(targetHasFeature, "avx2");
        enum LDC_with_SHA = __traits(targetHasFeature, "sha");
    }
}
else
{
    enum LDC_with_ARM32 = false;
    enum LDC_with_ARM64 = false;
    enum LDC_with_SSE1 = false;
    enum LDC_with_SSE2 = false;
    enum LDC_with_SSE3 = false;
    enum LDC_with_SSSE3 = false;
    enum LDC_with_SSE41 = false;
    enum LDC_with_SSE42 = false;
    enum LDC_with_AVX = false;
    enum LDC_with_AVX2 = false;
    enum LDC_with_SHA = false;
}

// True when compiling with LDC for either ARM flavour.
enum LDC_with_ARM = LDC_with_ARM32 | LDC_with_ARM64;

// Compile-time capability flags for DMD.
version(DigitalMars)
{
    version(D_InlineAsm_X86)
        enum DMD_with_asm = true;
    else version(D_InlineAsm_X86_64)
        enum DMD_with_asm = true;
    else
        enum DMD_with_asm = false;

    version(D_InlineAsm_X86)
        enum DMD_with_32bit_asm = DMD_with_asm; // sometimes you want a 32-bit DMD only solution
    else
        enum DMD_with_32bit_asm = false;

    version (D_SIMD)
        enum DMD_with_DSIMD = !SSESizedVectorsAreEmulated;
    else
        enum DMD_with_DSIMD = false;
}
else
{
    enum DMD_with_asm = false;
    enum DMD_with_32bit_asm = false;
    enum DMD_with_DSIMD = false;
}

static if (LDC_with_ARM32)
{
    /// Returns the value produced by `__builtin_arm_get_fpscr` (32-bit ARM).
    package uint arm_get_fpcr() nothrow @nogc @trusted
    {
        return __builtin_arm_get_fpscr();
    }

    /// Passes `cw` to `__builtin_arm_set_fpscr` (32-bit ARM).
    package void arm_set_fpcr(uint cw) nothrow @nogc @trusted
    {
        __builtin_arm_set_fpscr(cw);
    }
}

static if (LDC_with_ARM64)
{
    // Declared, but not used by `arm_get_fpcr` below (see the comment there).
    pragma(LDC_intrinsic, "llvm.aarch64.get.fpcr")
        long __builtin_aarch64_get_fpcr() pure nothrow @nogc @safe;

    /// Reads FPCR with an `mrs` instruction and returns it as a `uint`.
    package uint arm_get_fpcr() pure nothrow @nogc @trusted
    {
        // LLVM intrinsic "llvm.aarch64.get.fpcr" seems buggy and doesn't return FPCR
        return __asm!uint("mrs $0, fpcr", "=r");
    }

    /// Writes `cw` to FPCR.
    package void arm_set_fpcr(uint cw) nothrow @nogc @trusted
    {
        // Note: there doesn't seem to be an intrinsic in LLVM to set FPCR.
        // x2 is used as a scratch register: it is spilled to `save_x2`,
        // loaded with `cw`, written to FPCR, then restored.
        long save_x2;
        __asm!void("str x2, $1 \n" ~
                   "ldr w2, $0 \n" ~
                   "msr fpcr, x2 \n" ~
                   "ldr x2, $1 "   , "m,m", cw, &save_x2);
    }
}


// For internal use only, since public API deals with a x86 semantic emulation
// These masks are applied to the value returned by `arm_get_fpcr()`.
enum uint _MM_ROUND_NEAREST_ARM     = 0x00000000;
enum uint _MM_ROUND_DOWN_ARM        = 0x00800000;
enum uint _MM_ROUND_UP_ARM          = 0x00400000;
enum uint _MM_ROUND_TOWARD_ZERO_ARM = 0x00C00000;
enum uint _MM_ROUND_MASK_ARM        = 0x00C00000;
enum uint _MM_FLUSH_ZERO_MASK_ARM   = 0x01000000;


//
//  <ROUNDING>
//
//  Why is that there? For DMD, we cannot use rint because _MM_SET_ROUNDING_MODE
//  doesn't change the FPU rounding mode, and isn't expected to do so.
//  So we devised these rounding functions to help having consistent rounding between
//  LDC and DMD. It's important that DMD uses what is in MXCSR to round.
//
//  Note: There is no MXCSR in ARM. But there is fpcr/fpscr that implements similar
//  functionality.
//  https://developer.arm.com/documentation/dui0068/b/vector-floating-point-programming/vfp-system-registers/fpscr--the-floating-point-status-and-control-register
//  We use fpcr/fpscr since it's thread-local, so we can emulate those x86 conversion albeit slowly.

/// Converts `value` to a 32-bit integer.
/// On x86 this is done by a `cvtss2si` instruction; on ARM64 the rounding
/// bits read from `arm_get_fpcr()` select which conversion is used.
int convertFloatToInt32UsingMXCSR(float value) @trusted
{
    int result;
    version(GNU)
    {
        asm pure nothrow @nogc @trusted
        {
            "cvtss2si %1, %0\n": "=r"(result) : "x" (value);
        }
    }
    else static if (LDC_with_ARM32)
    {
        // TODO: this is a bug, it won't preserve registers when optimized
        result = __asm!int(`vldr s2, $1
                            vcvtr.s32.f32 s2, s2
                            vmov $0, s2`, "=r,m", value);
    }
    else static if (LDC_with_ARM64)
    {
        // Get current rounding mode.
        uint fpscr = arm_get_fpcr();

        // An unrecognised value falls through to round-to-nearest.
        switch(fpscr & _MM_ROUND_MASK_ARM)
        {
            default:
            case _MM_ROUND_NEAREST_ARM:     result = vcvtns_s32_f32(value); break;
            case _MM_ROUND_DOWN_ARM:        result = vcvtms_s32_f32(value); break;
            case _MM_ROUND_UP_ARM:          result = vcvtps_s32_f32(value); break;
            case _MM_ROUND_TOWARD_ZERO_ARM: result = vcvts_s32_f32(value);  break;
        }
    }
    else
    {
        asm pure nothrow @nogc @trusted
        {
            cvtss2si EAX, value;
            mov result, EAX;
        }
    }
    return result;
}

/// Converts `value` to a 32-bit integer.
/// Double-precision counterpart of `convertFloatToInt32UsingMXCSR`
/// (`cvtsd2si` on x86, FPCR-selected conversion on ARM64).
int convertDoubleToInt32UsingMXCSR(double value) @trusted
{
    int result;
    version(GNU)
    {
        asm pure nothrow @nogc @trusted
        {
            "cvtsd2si %1, %0\n": "=r"(result) : "x" (value);
        }
    }
    else static if (LDC_with_ARM32)
    {
        // TODO: bug, doesn't preserve registers
        result = __asm!int(`vldr d2, $1
                            vcvtr.s32.f64 s2, d2
                            vmov $0, s2`, "=r,m", value);
    }
    else static if (LDC_with_ARM64)
    {
        // Get current rounding mode.
        uint fpscr = arm_get_fpcr();

        // An unrecognised value falls through to round-to-nearest.
        switch(fpscr & _MM_ROUND_MASK_ARM)
        {
            default:
            case _MM_ROUND_NEAREST_ARM:     result = vcvtns_s32_f64(value); break;
            case _MM_ROUND_DOWN_ARM:        result = vcvtms_s32_f64(value); break;
            case _MM_ROUND_UP_ARM:          result = vcvtps_s32_f64(value); break;
            case _MM_ROUND_TOWARD_ZERO_ARM: result = vcvts_s32_f64(value);  break;
        }
    }
    else
    {
        asm pure nothrow @nogc @trusted
        {
            cvtsd2si EAX, value;
            mov result, EAX;
        }
    }
    return result;
}

/// Converts `value` to a 64-bit integer.
/// The implementation is selected at compile time: ARM32 and ARM64 dispatch on
/// the rounding bits from `arm_get_fpcr()`, 64-bit x86 uses `cvtss2si`, and
/// 32-bit x86 goes through the x87 FPU after copying the MXCSR rounding bits
/// into the FPU control word.
long convertFloatToInt64UsingMXCSR(float value) @trusted
{
    static if (LDC_with_ARM32)
    {
        // We have to resort to libc since 32-bit ARM
        // doesn't seem to have 64-bit registers.

        uint fpscr = arm_get_fpcr(); // Get current rounding mode.

        // Note: converting to double precision else rounding could be different for large integers
        double asDouble = value;

        // NOTE(review): `llvm_round` presumably rounds halfway cases away from
        // zero, while x86 round-to-nearest is ties-to-even, so values such as
        // 2.5 may differ from the x86 result -- confirm before relying on it.
        switch(fpscr & _MM_ROUND_MASK_ARM)
        {
            default:
            case _MM_ROUND_NEAREST_ARM:     return cast(long)(llvm_round(asDouble));
            case _MM_ROUND_DOWN_ARM:        return cast(long)(llvm_floor(asDouble));
            case _MM_ROUND_UP_ARM:          return cast(long)(llvm_ceil(asDouble));
            case _MM_ROUND_TOWARD_ZERO_ARM: return cast(long)(asDouble);
        }
    }
    else static if (LDC_with_ARM64)
    {
        uint fpscr = arm_get_fpcr();

        switch(fpscr & _MM_ROUND_MASK_ARM)
        {
            default:
            case _MM_ROUND_NEAREST_ARM:     return vcvtns_s64_f32(value);
            case _MM_ROUND_DOWN_ARM:        return vcvtms_s64_f32(value);
            case _MM_ROUND_UP_ARM:          return vcvtps_s64_f32(value);
            case _MM_ROUND_TOWARD_ZERO_ARM: return vcvts_s64_f32(value);
        }
    }
    // 64-bit can use an SSE instruction
    else version(D_InlineAsm_X86_64)
    {
        long result;
        version(LDC) // work-around for " Data definition directives inside inline asm are not supported yet."
        {
            asm pure nothrow @nogc @trusted
            {
                movss XMM0, value;
                cvtss2si RAX, XMM0;
                mov result, RAX;
            }
        }
        else
        {
            asm pure nothrow @nogc @trusted
            {
                movss XMM0, value;
                db 0xf3; db 0x48; db 0x0f; db 0x2d; db 0xc0; // cvtss2si RAX, XMM0 (DMD refuses to emit)
                mov result, RAX;
            }
        }
        return result;
    }
    else version(D_InlineAsm_X86)
    {
        // In the case of 32-bit x86 there is no SSE2 way to convert FP to 64-bit int
        // This leads to an unfortunate FPU sequence in every C++ compiler.
        // See: https://godbolt.org/z/vZym77

        // Get current MXCSR rounding
        uint sseRounding;
        ushort savedFPUCW;
        ushort newFPUCW;
        long result;
        asm pure nothrow @nogc @trusted
        {
            stmxcsr sseRounding;
            fld value;
            fnstcw savedFPUCW;
            mov AX, savedFPUCW;
            and AX, 0xf3ff;          // clear FPU rounding bits
            movzx ECX, word ptr sseRounding;
            and ECX, 0x6000;         // only keep SSE rounding bits
            shr ECX, 3;
            or AX, CX;               // make a new control word for FPU with SSE bits
            mov newFPUCW, AX;
            fldcw newFPUCW;
            fistp qword ptr result;  // convert, respecting MXCSR (but not other control word things)
            fldcw savedFPUCW;
        }
        return result;
    }
    else static if (GDC_with_x86)
    {
        version(X86_64) // 64-bit can just use the right instruction
        {
            static assert(GDC_with_SSE);
            __m128 A;
            A.ptr[0] = value;
            return __builtin_ia32_cvtss2si64 (A);
        }
        else version(X86) // 32-bit
        {
            // This is untested!
            // NOTE(review): the input constraint for `value` is "f" here but
            // "t" in convertDoubleToInt64UsingMXCSR -- confirm which is intended.
            uint sseRounding;
            ushort savedFPUCW;
            ushort newFPUCW;
            long result;
            asm pure nothrow @nogc @trusted
            {
                "stmxcsr %1;\n" ~
                "fld %2;\n" ~
                "fnstcw %3;\n" ~
                "movw %3, %%ax;\n" ~
                "andw $0xf3ff, %%ax;\n" ~
                "movzwl %1, %%ecx;\n" ~
                "andl $0x6000, %%ecx;\n" ~
                "shrl $3, %%ecx;\n" ~
                "orw %%cx, %%ax\n" ~
                "movw %%ax, %4;\n" ~
                "fldcw %4;\n" ~
                "fistpll %0;\n" ~
                "fldcw %3;\n"
                  : "=m"(result)    // %0
                  : "m" (sseRounding),
                    "f" (value),
                    "m" (savedFPUCW),
                    "m" (newFPUCW)
                  : "eax", "ecx", "st";
            }
            return result;
        }
        else
            static assert(false);
    }
    else
        static assert(false);
}


/// Converts `value` to a 64-bit integer.
/// Double-precision counterpart of `convertFloatToInt64UsingMXCSR`, with the
/// same compile-time selection of implementation.
long convertDoubleToInt64UsingMXCSR(double value) @trusted
{
    static if (LDC_with_ARM32)
    {
        // We have to resort to libc since 32-bit ARM
        // doesn't seem to have 64-bit registers.
        uint fpscr = arm_get_fpcr(); // Get current rounding mode.

        // NOTE(review): `llvm_round` presumably rounds halfway cases away from
        // zero, while x86 round-to-nearest is ties-to-even -- confirm.
        switch(fpscr & _MM_ROUND_MASK_ARM)
        {
            default:
            case _MM_ROUND_NEAREST_ARM:     return cast(long)(llvm_round(value));
            case _MM_ROUND_DOWN_ARM:        return cast(long)(llvm_floor(value));
            case _MM_ROUND_UP_ARM:          return cast(long)(llvm_ceil(value));
            case _MM_ROUND_TOWARD_ZERO_ARM: return cast(long)(value);
        }
    }
    else static if (LDC_with_ARM64)
    {
        // Get current rounding mode.
        uint fpscr = arm_get_fpcr();

        switch(fpscr & _MM_ROUND_MASK_ARM)
        {
            default:
            case _MM_ROUND_NEAREST_ARM:     return vcvtns_s64_f64(value);
            case _MM_ROUND_DOWN_ARM:        return vcvtms_s64_f64(value);
            case _MM_ROUND_UP_ARM:          return vcvtps_s64_f64(value);
            case _MM_ROUND_TOWARD_ZERO_ARM: return vcvts_s64_f64(value);
        }
    }
    // 64-bit can use an SSE instruction
    else version(D_InlineAsm_X86_64)
    {
        long result;
        version(LDC) // work-around for "Data definition directives inside inline asm are not supported yet."
        {
            asm pure nothrow @nogc @trusted
            {
                movsd XMM0, value;
                cvtsd2si RAX, XMM0;
                mov result, RAX;
            }
        }
        else
        {
            asm pure nothrow @nogc @trusted
            {
                movsd XMM0, value;
                db 0xf2; db 0x48; db 0x0f; db 0x2d; db 0xc0; // cvtsd2si RAX, XMM0 (DMD refuses to emit)
                mov result, RAX;
            }
        }
        return result;
    }
    else version(D_InlineAsm_X86)
    {
        // In the case of 32-bit x86 there is no SSE2 way to convert FP to 64-bit int
        // This leads to an unfortunate FPU sequence in every C++ compiler.
        // See: https://godbolt.org/z/vZym77

        // Get current MXCSR rounding
        uint sseRounding;
        ushort savedFPUCW;
        ushort newFPUCW;
        long result;
        asm pure nothrow @nogc @trusted
        {
            stmxcsr sseRounding;
            fld value;
            fnstcw savedFPUCW;
            mov AX, savedFPUCW;
            and AX, 0xf3ff;      // clear FPU rounding bits
            movzx ECX, word ptr sseRounding;
            and ECX, 0x6000;     // only keep SSE rounding bits
            shr ECX, 3;
            or AX, CX;           // make a new control word for FPU with SSE bits
            mov newFPUCW, AX;
            fldcw newFPUCW;
            fistp result;
            fldcw savedFPUCW;
        }
        return result;
    }
    else static if (GDC_with_x86)
    {
        version(X86_64)
        {
            static assert(GDC_with_SSE2);
            __m128d A;
            A.ptr[0] = value;
            return __builtin_ia32_cvtsd2si64 (A);
        }
        else
        {
            // This is untested!
            // NOTE(review): the input constraint for `value` is "t" here but
            // "f" in convertFloatToInt64UsingMXCSR -- confirm which is intended.
            uint sseRounding;
            ushort savedFPUCW;
            ushort newFPUCW;
            long result;
            asm pure nothrow @nogc @trusted
            {
                "stmxcsr %1;\n" ~
                "fld %2;\n" ~
                "fnstcw %3;\n" ~
                "movw %3, %%ax;\n" ~
                "andw $0xf3ff, %%ax;\n" ~
                "movzwl %1, %%ecx;\n" ~
                "andl $0x6000, %%ecx;\n" ~
                "shrl $3, %%ecx;\n" ~
                "orw %%cx, %%ax\n" ~
                "movw %%ax, %4;\n" ~
                "fldcw %4;\n" ~
                "fistpll %0;\n" ~
                "fldcw %3;\n"
                  : "=m"(result)    // %0
                  : "m" (sseRounding),
                    "t" (value),
                    "m" (savedFPUCW),
                    "m" (newFPUCW)
                  : "eax", "ecx", "st";
            }
            return result;
        }
    }
    else
        static assert(false);
}

//
//  </ROUNDING>
//


// using the Intel terminology here

/// Clamps a signed 16-bit value to the range [-128, 127].
byte saturateSignedWordToSignedByte(short value) pure @safe
{
    if (value > 127) value = 127;
    if (value < -128) value = -128;
    return cast(byte) value;
}

/// Clamps a signed 16-bit value to the range [0, 255].
ubyte saturateSignedWordToUnsignedByte(short value) pure @safe
{
    if (value > 255) value = 255;
    if (value < 0) value = 0;
    return cast(ubyte) value;
}

/// Clamps a signed 32-bit value to the range [-32768, 32767].
short saturateSignedIntToSignedShort(int value) pure @safe
{
    if (value > 32767) value = 32767;
    if (value < -32768) value = -32768;
    return cast(short) value;
}

/// Clamps a signed 32-bit value to the range [0, 65535].
ushort saturateSignedIntToUnsignedShort(int value) pure @safe
{
    if (value > 65535) value = 65535;
    if (value < 0) value = 0;
    return cast(ushort) value;
}

unittest // test saturate operations
{
    assert( saturateSignedWordToSignedByte(32000) == 127);
    assert( saturateSignedWordToUnsignedByte(32000) == 255);
    assert( saturateSignedWordToSignedByte(-4000) == -128);
    assert( saturateSignedWordToUnsignedByte(-4000) == 0);
    assert( saturateSignedIntToSignedShort(32768) == 32767);
    assert( saturateSignedIntToUnsignedShort(32768) == 32768);
    assert( saturateSignedIntToSignedShort(-32769) == -32768);
    assert( saturateSignedIntToUnsignedShort(-32769) == 0);
}

version(unittest)
{
    // This is just for debugging tests
    import core.stdc.stdio: printf;

    // printing vectors for implementation
    // Note: you can override `pure` within a `debug` clause

    /// Prints the single 64-bit lane of `v` as a signed integer.
    void _mm_print_pi64(__m64 v) @trusted
    {
        long1 vl = cast(long1)v;
        printf("%lld\n", vl.array[0]);
    }

    /// Prints `v` as two signed 32-bit integers.
    void _mm_print_pi32(__m64 v) @trusted
    {
        int[2] C = (cast(int2)v).array;
        printf("%d %d\n", C[0], C[1]);
    }

    /// Prints `v` as four signed 16-bit integers.
    void _mm_print_pi16(__m64 v) @trusted
    {
        short[4] C = (cast(short4)v).array;
        printf("%d %d %d %d\n", C[0], C[1], C[2], C[3]);
    }

    /// Prints `v` as eight signed 8-bit integers.
    void _mm_print_pi8(__m64 v) @trusted
    {
        byte[8] C = (cast(byte8)v).array;
        printf("%d %d %d %d %d %d %d %d\n",
        C[0], C[1], C[2], C[3], C[4], C[5], C[6], C[7]);
    }

    /// Prints `v` as two signed 64-bit integers.
    void _mm_print_epi64(__m128i v) @trusted
    {
        long2 vl = cast(long2)v;
        printf("%lld %lld\n", vl.array[0], vl.array[1]);
    }

    /// Prints the four elements of `v` with `%d`.
    void _mm_print_epi32(__m128i v) @trusted
    {
        printf("%d %d %d %d\n",
              v.array[0], v.array[1], v.array[2], v.array[3]);
    }

    /// Prints `v` as eight signed 16-bit integers.
    void _mm_print_epi16(__m128i v) @trusted
    {
        short[8] C = (cast(short8)v).array;
        printf("%d %d %d %d %d %d %d %d\n",
        C[0], C[1], C[2], C[3], C[4], C[5], C[6], C[7]);
    }

    /// Prints `v` as sixteen signed 8-bit integers.
    void _mm_print_epi8(__m128i v) @trusted
    {
        byte[16] C = (cast(byte16)v).array;
        printf("%d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d\n",
        C[0], C[1], C[2], C[3], C[4], C[5], C[6], C[7], C[8], C[9], C[10], C[11], C[12], C[13], C[14], C[15]);
    }

    /// Prints `v` as four floats.
    void _mm_print_ps(__m128 v) @trusted
    {
        float[4] C = (cast(float4)v).array;
        printf("%f %f %f %f\n", C[0], C[1], C[2], C[3]);
    }

    /// Prints `v` as two doubles.
    void _mm_print_pd(__m128d v) @trusted
    {
        double[2] C = (cast(double2)v).array;
        printf("%f %f\n", C[0], C[1]);
    }
}


//
//  <FLOATING-POINT COMPARISONS>
//
// Note: `ldc.simd` cannot express all nuances of FP comparisons, so we
//       need
//       different IR generation.

/// Floating-point comparison predicates. The member names match the
/// predicate keywords spliced into the LLVM `fcmp` instructions below.
enum FPComparison
{
    oeq,   // ordered and equal
    ogt,   // ordered and greater than
    oge,   // ordered and greater than or equal
    olt,   // ordered and less than
    ole,   // ordered and less than or equal
    one,   // ordered and not equal
    ord,   // ordered (no nans)
    ueq,   // unordered or equal
    ugt,   // unordered or greater than ("nle")
    uge,   // unordered or greater than or equal ("nlt")
    ult,   // unordered or less than ("nge")
    ule,   // unordered or less than or equal ("ngt")
    une,   // unordered or not equal ("neq")
    uno,   // unordered (either nans)
}

// Predicate keyword for each FPComparison member, indexed by the member value.
// The order must stay in sync with the enum above.
private static immutable string[FPComparison.max+1] FPComparisonToString =
[
    "oeq",
    "ogt",
    "oge",
    "olt",
    "ole",
    "one",
    "ord",
    "ueq",
    "ugt",
    "uge",
    "ult",
    "ule",
    "une",
    "uno",
];

// Individual float comparison: returns `true` or `false` for the given predicate
// (the vector wrappers below turn that into -1 or 0).
// Useful for DMD and testing
// Note: `isnan` is not defined in this part of the file; it has to be provided elsewhere.
private bool compareFloat(T)(FPComparison comparison, T a, T b) pure @safe
{
    bool unordered = isnan(a) || isnan(b);
    final switch(comparison) with(FPComparison)
    {
        case oeq: return a == b;
        case ogt: return a > b;
        case oge: return a >= b;
        case olt: return a < b;
        case ole: return a <= b;
        case one: return !unordered && (a != b); // NaN with != always yields true
        case ord: return !unordered;
        case ueq: return unordered || (a == b);
        case ugt: return unordered || (a > b);
        case uge: return unordered || (a >= b);
        case ult: return unordered || (a < b);
        case ule: return unordered || (a <= b);
        case une: return (a != b); // NaN with != always yields true
        case uno: return unordered;
    }
}

version(LDC)
{
    /// Provides packed float comparisons
    /// Each lane of the result is the sign-extended 1-bit outcome (-1 or 0).
    package int4 cmpps(FPComparison comparison)(float4 a, float4 b) pure @safe
    {
        enum ir = `
            %cmp = fcmp `~ FPComparisonToString[comparison] ~` <4 x float> %0, %1
            %r = sext <4 x i1> %cmp to <4 x i32>
            ret <4 x i32> %r`;

        return LDCInlineIR!(ir, int4, float4, float4)(a, b);
    }

    /// Provides packed double comparisons
    /// Each lane of the result is the sign-extended 1-bit outcome (-1 or 0).
    package long2 cmppd(FPComparison comparison)(double2 a, double2 b) pure @safe
    {
        enum ir = `
            %cmp = fcmp `~ FPComparisonToString[comparison] ~` <2 x double> %0, %1
            %r = sext <2 x i1> %cmp to <2 x i64>
            ret <2 x i64> %r`;

        return LDCInlineIR!(ir, long2, double2, double2)(a, b);
    }

    /// CMPSS-style comparisons
    /// clang implement it through x86 intrinsics, it is possible with IR alone
    /// but leads to less optimal code.
    /// PERF: try to implement it with __builtin_ia32_cmpss and immediate 0 to 7.
    /// Not that simple.
    /// Only element 0 is compared; the other elements are copied from `a`.
    package float4 cmpss(FPComparison comparison)(float4 a, float4 b) pure @safe
    {
        /*
        enum ubyte predicateNumber = FPComparisonToX86Predicate[comparison];
        enum bool invertOp = (predicateNumber & 0x80) != 0;
        static if(invertOp)
            return __builtin_ia32_cmpsd(b, a, predicateNumber & 0x7f);
        else
            return __builtin_ia32_cmpsd(a, b, predicateNumber & 0x7f);
        */
        enum ir = `
            %cmp = fcmp `~ FPComparisonToString[comparison] ~` float %0, %1
            %r = sext i1 %cmp to i32
            %r2 = bitcast i32 %r to float
            ret float %r2`;

        float4 r = a;
        r[0] = LDCInlineIR!(ir, float, float, float)(a[0], b[0]);
        return r;
    }

    /// CMPSD-style comparisons
    /// clang implement it through x86 intrinsics, it is possible with IR alone
    /// but leads to less optimal code.
    /// PERF: try to implement it with __builtin_ia32_cmpsd and immediate 0 to 7.
    /// Not that simple.
    /// Only element 0 is compared; element 1 is copied from `a`.
    package double2 cmpsd(FPComparison comparison)(double2 a, double2 b) pure @safe
    {
        enum ir = `
            %cmp = fcmp `~ FPComparisonToString[comparison] ~` double %0, %1
            %r = sext i1 %cmp to i64
            %r2 = bitcast i64 %r to double
            ret double %r2`;

        double2 r = a;
        r[0] = LDCInlineIR!(ir, double, double, double)(a[0], b[0]);
        return r;
    }
}
else
{
    /// Provides packed float comparisons
    /// Each lane of the result is -1 when the predicate holds, 0 otherwise.
    package int4 cmpps(FPComparison comparison)(float4 a, float4 b) pure @trusted
    {
        int4 result;
        foreach(i; 0..4)
        {
            result.ptr[i] = compareFloat!float(comparison, a.array[i], b.array[i]) ? -1 : 0;
        }
        return result;
    }

    /// Provides packed double comparisons
    /// Each lane of the result is -1 when the predicate holds, 0 otherwise.
    package long2 cmppd(FPComparison comparison)(double2 a, double2 b) pure @trusted
    {
        long2 result;
        foreach(i; 0..2)
        {
            result.ptr[i] = compareFloat!double(comparison, a.array[i], b.array[i]) ? -1 : 0;
        }
        return result;
    }

    /// Provides CMPSS-style comparison
    /// Only element 0 is compared; the other elements are copied from `a`.
    package float4 cmpss(FPComparison comparison)(float4 a, float4 b) pure @trusted
    {
        int4 result = cast(int4)a;
        result.ptr[0] = compareFloat!float(comparison, a.array[0], b.array[0]) ? -1 : 0;
        return cast(float4)result;
    }

    /// Provides CMPSD-style comparison
    /// Only element 0 is compared; element 1 is copied from `a`.
    package double2 cmpsd(FPComparison comparison)(double2 a, double2 b) pure @trusted
    {
        long2 result = cast(long2)a;
        result.ptr[0] = compareFloat!double(comparison, a.array[0], b.array[0]) ? -1 : 0;
        return cast(double2)result;
    }
}
unittest // cmpps
{
    // Check all comparison type is working
    float4 A = [1, 3, 5, float.nan];
    float4 B = [2, 3, 4, 5];

    int4 result_oeq = cmpps!(FPComparison.oeq)(A, B);
    int4 result_ogt = cmpps!(FPComparison.ogt)(A, B);
    int4 result_oge = cmpps!(FPComparison.oge)(A, B);
    int4 result_olt = cmpps!(FPComparison.olt)(A, B);
    int4 result_ole = cmpps!(FPComparison.ole)(A, B);
    int4 result_one = cmpps!(FPComparison.one)(A, B);
    int4 result_ord = cmpps!(FPComparison.ord)(A, B);
    int4 result_ueq = cmpps!(FPComparison.ueq)(A, B);
    int4 result_ugt = cmpps!(FPComparison.ugt)(A, B);
    int4 result_uge = cmpps!(FPComparison.uge)(A, B);
    int4 result_ult = cmpps!(FPComparison.ult)(A, B);
    int4 result_ule = cmpps!(FPComparison.ule)(A, B);
    int4 result_une = cmpps!(FPComparison.une)(A, B);
    int4 result_uno = cmpps!(FPComparison.uno)(A, B);

    static immutable int[4] correct_oeq    = [ 0,-1, 0, 0];
    static immutable int[4] correct_ogt    = [ 0, 0,-1, 0];
    static immutable int[4] correct_oge    = [ 0,-1,-1, 0];
    static immutable int[4] correct_olt    = [-1, 0, 0, 0];
    static immutable int[4] correct_ole    = [-1,-1, 0, 0];
    static immutable int[4] correct_one    = [-1, 0,-1, 0];
    static immutable int[4] correct_ord    = [-1,-1,-1, 0];
    static immutable int[4] correct_ueq    = [ 0,-1, 0,-1];
    static immutable int[4] correct_ugt    = [ 0, 0,-1,-1];
    static immutable int[4] correct_uge    = [ 0,-1,-1,-1];
    static immutable int[4] correct_ult    = [-1, 0, 0,-1];
    static immutable int[4] correct_ule    = [-1,-1, 0,-1];
    static immutable int[4] correct_une    = [-1, 0,-1,-1];
    static immutable int[4] correct_uno    = [ 0, 0, 0,-1];

    assert(result_oeq.array == correct_oeq);
    assert(result_ogt.array == correct_ogt);
    assert(result_oge.array == correct_oge);
    assert(result_olt.array == correct_olt);
    assert(result_ole.array == correct_ole);
    assert(result_one.array == correct_one);
    assert(result_ord.array == correct_ord);
    assert(result_ueq.array == correct_ueq);
    assert(result_ugt.array == correct_ugt);
    assert(result_uge.array == correct_uge);
    assert(result_ult.array == correct_ult);
    assert(result_ule.array == correct_ule);
    assert(result_une.array == correct_une);
    assert(result_uno.array == correct_uno);
}
unittest // cmppd
{
    double2 a = [1, 3];
    double2 b = [2, 3];
    long2 c = cmppd!(FPComparison.ult)(a, b);
    static immutable long[2] correct = [cast(long)(-1), 0];
    assert(c.array == correct);
}
unittest // cmpss
{
    // Checks element 0 against the scalar reference and that the upper
    // elements are passed through from A.
    void testComparison(FPComparison comparison)(float4 A, float4 B)
    {
        float4 result = cmpss!comparison(A, B);
        int4 iresult = cast(int4)result;
        int expected = compareFloat!float(comparison, A.array[0], B.array[0]) ? -1 : 0;
        assert(iresult.array[0] == expected);
        assert(result.array[1] == A.array[1]);
        assert(result.array[2] == A.array[2]);
        assert(result.array[3] == A.array[3]);
    }

    // Check all comparison type is working
    float4 A = [1, 3, 5, 6];
    float4 B = [2, 3, 4, 5];
    float4 C = [float.nan, 3, 4, 5];

    testComparison!(FPComparison.oeq)(A, B);
    testComparison!(FPComparison.oeq)(A, C);
    testComparison!(FPComparison.ogt)(A, B);
    testComparison!(FPComparison.ogt)(A, C);
    testComparison!(FPComparison.oge)(A, B);
    testComparison!(FPComparison.oge)(A, C);
    testComparison!(FPComparison.olt)(A, B);
    testComparison!(FPComparison.olt)(A, C);
    testComparison!(FPComparison.ole)(A, B);
    testComparison!(FPComparison.ole)(A, C);
    testComparison!(FPComparison.one)(A, B);
    testComparison!(FPComparison.one)(A, C);
    testComparison!(FPComparison.ord)(A, B);
    testComparison!(FPComparison.ord)(A, C);
    testComparison!(FPComparison.ueq)(A, B);
    testComparison!(FPComparison.ueq)(A, C);
    testComparison!(FPComparison.ugt)(A, B);
    testComparison!(FPComparison.ugt)(A, C);
    testComparison!(FPComparison.uge)(A, B);
    testComparison!(FPComparison.uge)(A, C);
    testComparison!(FPComparison.ult)(A, B);
    testComparison!(FPComparison.ult)(A, C);
    testComparison!(FPComparison.ule)(A, B);
    testComparison!(FPComparison.ule)(A, C);
    testComparison!(FPComparison.une)(A, B);
    testComparison!(FPComparison.une)(A, C);
    testComparison!(FPComparison.uno)(A, B);
    testComparison!(FPComparison.uno)(A, C);
}
unittest // cmpsd
{
    // Checks element 0 against the scalar reference and that element 1
    // is passed through from A.
    void testComparison(FPComparison comparison)(double2 A, double2 B)
    {
        double2 result = cmpsd!comparison(A, B);
        long2 iresult = cast(long2)result;
        long expected = compareFloat!double(comparison, A.array[0], B.array[0]) ? -1 : 0;
        assert(iresult.array[0] == expected);
        assert(result.array[1] == A.array[1]);
    }

    // Check all comparison type is working
    double2 A = [1, 3];
    double2 B = [2, 4];
    double2 C = [double.nan, 5];

    testComparison!(FPComparison.oeq)(A, B);
    testComparison!(FPComparison.oeq)(A, C);
    testComparison!(FPComparison.ogt)(A, B);
    testComparison!(FPComparison.ogt)(A, C);
    testComparison!(FPComparison.oge)(A, B);
    testComparison!(FPComparison.oge)(A, C);
    testComparison!(FPComparison.olt)(A, B);
    testComparison!(FPComparison.olt)(A, C);
    testComparison!(FPComparison.ole)(A, B);
    testComparison!(FPComparison.ole)(A, C);
    testComparison!(FPComparison.one)(A, B);
    testComparison!(FPComparison.one)(A, C);
    testComparison!(FPComparison.ord)(A, B);
    testComparison!(FPComparison.ord)(A, C);
    testComparison!(FPComparison.ueq)(A, B);
    testComparison!(FPComparison.ueq)(A, C);
    testComparison!(FPComparison.ugt)(A, B);
    testComparison!(FPComparison.ugt)(A, C);
    testComparison!(FPComparison.uge)(A, B);
    testComparison!(FPComparison.uge)(A, C);
    testComparison!(FPComparison.ult)(A, B);
    testComparison!(FPComparison.ult)(A, C);
    testComparison!(FPComparison.ule)(A, B);
    testComparison!(FPComparison.ule)(A, C);
    testComparison!(FPComparison.une)(A, B);
    testComparison!(FPComparison.une)(A, C);
    testComparison!(FPComparison.uno)(A, B);
    testComparison!(FPComparison.uno)(A, C);
}

//
//  </FLOATING-POINT COMPARISONS>
//


/// Returns the low 64 bits of `a` as a `__m64`.
__m64 to_m64(__m128i a) pure @trusted
{
    long2 la = cast(long2)a;
    long1 r = la.array[0];
    return r;
}

/// Widens `a` to 128 bits: the low 64 bits come from `a`, the high 64 bits are zero.
__m128i to_m128i(__m64 a) pure @trusted
{
  /* Not sufficient to avoid https://issues.dlang.org/show_bug.cgi?id=21474

    version(DigitalMars) // Workaround for https://issues.dlang.org/show_bug.cgi?id=21474
    {
        long2 r = a.array[0];
        r.ptr[1] = 0;
        return cast(int4)r;
    }
    else */
    {
        long2 r = [0, 0];
        r.ptr[0] = a.array[0];
        return cast(__m128i)r;
    }
}

// SOME NEON INTRINSICS
// Emulating some x86 intrinsics needs access to a range of ARM intrinsics.
// Not in the public API but the simde project expose it all for the user to use.
// MAYDO: create a new neon.d module, for internal use only.
// MAYDO: port them to ARM32 so that ARM32 can be as fast as ARM64.
static if (LDC_with_ARM64)
{
    // VERY USEFUL LINK
    // https://github.com/ldc-developers/llvm-project/blob/ldc-release/11.x/llvm/include/llvm/IR/IntrinsicsAArch64.td
    // Also: https://developer.arm.com/architectures/instruction-sets/intrinsics/
    //
    // Declarations preceded by `pragma(LDC_intrinsic, "...")` have no body: each
    // one is bound to the LLVM intrinsic named in the pragma string. The others
    // are plain D helpers written lane by lane. Names follow the ARM NEON
    // intrinsic they stand for; see the links above for exact semantics.


    // Absolute difference / absolute value

    pragma(LDC_intrinsic, "llvm.aarch64.neon.uabd.v16i8")
        byte16 vabdq_u8(byte16 a, byte16 b) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.abs.v8i16")
        short8 vabsq_s16(short8 a) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.abs.v4i32")
        int4 vabsq_s32(int4 a) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.abs.v16i8")
        byte16 vabsq_s8(byte16 a) pure @safe;


    // Pairwise additions (integer)

    pragma(LDC_intrinsic, "llvm.aarch64.neon.addp.v8i8")
        byte8 vpadd_u8(byte8 a, byte8 b) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.uaddlp.v8i16.v16i8")
        short8 vpaddlq_u8(byte16 a) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.addp.v4i16")
        short4 vpadd_s16(short4 a, short4 b) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.addp.v8i16")
        short8 vpaddq_s16(short8 a, short8 b) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.addp.v2i32")
        int2 vpadd_s32(int2 a, int2 b) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.addp.v4i32")
        int4 vpaddq_s32(int4 a, int4 b) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.addp.v16i8")
        byte16 vpaddq_s8(byte16 a, byte16 b) pure @safe;


    /// Lane-wise bitwise AND of two 8-lane byte vectors.
    byte8 vand_u8(byte8 a, byte8 b) pure @safe
    {
        return a & b;
    }


    // Concatenation of two 64-bit vectors into one 128-bit vector

    /// Returns a vector whose lanes 0..3 are `lo` and lanes 4..7 are `hi`.
    short8 vcombine_s16(short4 lo, short4 hi) pure @trusted
    {
        short8 r;
        r.ptr[0]  = lo.array[0];
        r.ptr[1]  = lo.array[1];
        r.ptr[2]  = lo.array[2];
        r.ptr[3]  = lo.array[3];
        r.ptr[4]  = hi.array[0];
        r.ptr[5]  = hi.array[1];
        r.ptr[6]  = hi.array[2];
        r.ptr[7]  = hi.array[3];
        return r;
    }

    /// Returns a vector whose lanes 0..1 are `lo` and lanes 2..3 are `hi`.
    int4 vcombine_s32(int2 lo, int2 hi) pure @trusted
    {
        int4 r;
        r.ptr[0] = lo.array[0];
        r.ptr[1] = lo.array[1];
        r.ptr[2] = hi.array[0];
        r.ptr[3] = hi.array[1];
        return r;
    }

    /// Returns a vector whose lanes 0..7 are `lo` and lanes 8..15 are `hi`.
    byte16 vcombine_s8(byte8 lo, byte8 hi) pure @trusted
    {
        byte16 r;
        r.ptr[0]  = lo.array[0];
        r.ptr[1]  = lo.array[1];
        r.ptr[2]  = lo.array[2];
        r.ptr[3]  = lo.array[3];
        r.ptr[4]  = lo.array[4];
        r.ptr[5]  = lo.array[5];
        r.ptr[6]  = lo.array[6];
        r.ptr[7]  = lo.array[7];
        r.ptr[8]  = hi.array[0];
        r.ptr[9]  = hi.array[1];
        r.ptr[10] = hi.array[2];
        r.ptr[11] = hi.array[3];
        r.ptr[12] = hi.array[4];
        r.ptr[13] = hi.array[5];
        r.ptr[14] = hi.array[6];
        r.ptr[15] = hi.array[7];
        return r;
    }


    // Floating-point to signed integer conversions.
    // The bound instruction (fcvtms / fcvtns / fcvtps / fcvtzs) selects the
    // rounding behaviour; refer to the ARM documentation linked above.

    // float4 => int4

    pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtms.v4i32.v4f32")
        int4 vcvtmq_s32_f32(float4 a) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtns.v4i32.v4f32")
        int4 vcvtnq_s32_f32(float4 a) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtps.v4i32.v4f32")
        int4 vcvtpq_s32_f32(float4 a) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtzs.v4i32.v4f32")
        int4 vcvtzq_s32_f32(float4 a) pure @safe;


    // double2 => long2

    pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtms.v2i64.v2f64")
        long2 vcvtmq_s64_f64(double2 a) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtns.v2i64.v2f64")
        long2 vcvtnq_s64_f64(double2 a) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtps.v2i64.v2f64")
        long2 vcvtpq_s64_f64(double2 a) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtzs.v2i64.v2f64")
        long2 vcvtzq_s64_f64(double2 a) pure @safe;


    // float => int

    pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtms.i32.f32")
        int vcvtms_s32_f32(float a) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtns.i32.f32")
        int vcvtns_s32_f32(float a) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtps.i32.f32")
        int vcvtps_s32_f32(float a) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtzs.i32.f32")
        int vcvts_s32_f32(float a) pure @safe;


    // double => int

    pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtms.i32.f64")
        int vcvtms_s32_f64(double a) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtns.i32.f64")
        int vcvtns_s32_f64(double a) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtps.i32.f64")
        int vcvtps_s32_f64(double a) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtzs.i32.f64")
        int vcvts_s32_f64(double a) pure @safe;


    // float => long

    pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtms.i64.f32")
        long vcvtms_s64_f32(float a) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtns.i64.f32")
        long vcvtns_s64_f32(float a) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtps.i64.f32")
        long vcvtps_s64_f32(float a) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtzs.i64.f32")
        long vcvts_s64_f32(float a) pure @safe;


    // double => long

    pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtms.i64.f64")
        long vcvtms_s64_f64(double a) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtns.i64.f64")
        long vcvtns_s64_f64(double a) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtps.i64.f64")
        long vcvtps_s64_f64(double a) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtzs.i64.f64")
        long vcvts_s64_f64(double a) pure @safe;


    // Extraction of the upper or lower half of a 128-bit vector

    /// Returns lanes 4..7 of `a`.
    short4 vget_high_s16(short8 a) pure @trusted
    {
        short4 r;
        r.ptr[0] = a.array[4];
        r.ptr[1] = a.array[5];
        r.ptr[2] = a.array[6];
        r.ptr[3] = a.array[7];
        return r;
    }

    /// Returns lanes 2..3 of `a`.
    int2 vget_high_s32(int4 a) pure @trusted
    {
        int2 r;
        r.ptr[0] = a.array[2];
        r.ptr[1] = a.array[3];
        return r;
    }

    /// Returns lanes 8..15 of `a`.
    byte8 vget_high_u8(byte16 a) pure @trusted
    {
        byte8 r;
        r.ptr[0] = a.array[8];
        r.ptr[1] = a.array[9];
        r.ptr[2] = a.array[10];
        r.ptr[3] = a.array[11];
        r.ptr[4] = a.array[12];
        r.ptr[5] = a.array[13];
        r.ptr[6] = a.array[14];
        r.ptr[7] = a.array[15];
        return r;
    }

    /// Returns lanes 0..3 of `a`.
    short4 vget_low_s16(short8 a) pure @trusted
    {
        short4 r;
        r.ptr[0] = a.array[0];
        r.ptr[1] = a.array[1];
        r.ptr[2] = a.array[2];
        r.ptr[3] = a.array[3];
        return r;
    }

    /// Returns lanes 0..1 of `a`.
    int2 vget_low_s32(int4 a) pure @trusted
    {
        int2 r;
        r.ptr[0] = a.array[0];
        r.ptr[1] = a.array[1];
        return r;
    }

    /// Returns lanes 0..7 of `a`.
    byte8 vget_low_u8(byte16 a) pure @trusted
    {
        byte8 r;
        r.ptr[0] = a.array[0];
        r.ptr[1] = a.array[1];
        r.ptr[2] = a.array[2];
        r.ptr[3] = a.array[3];
        r.ptr[4] = a.array[4];
        r.ptr[5] = a.array[5];
        r.ptr[6] = a.array[6];
        r.ptr[7] = a.array[7];
        return r;
    }


    // Min / max

    pragma(LDC_intrinsic, "llvm.aarch64.neon.smax.v8i16")
        short8 vmaxq_s16(short8 a, short8 b) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.smin.v8i16")
        short8 vminq_s16(short8 a, short8 b) pure @safe;


    /// Lane-wise widening multiply: each 16-bit lane pair is multiplied and
    /// stored as a 32-bit lane. D promotes the `short` operands to `int`
    /// before multiplying, so the product is not truncated to 16 bits.
    int4 vmull_s16(short4 a, short4 b) pure @trusted
    {
        int4 r;
        r.ptr[0] = a.array[0] * b.array[0];
        r.ptr[1] = a.array[1] * b.array[1];
        r.ptr[2] = a.array[2] * b.array[2];
        r.ptr[3] = a.array[3] * b.array[3];
        return r;
    }


    // Pairwise addition (float). The builtin name depends on the LLVM version.

    static if(__VERSION__ >= 2088) // LDC 1.18 started using LLVM 9, which changed the name of the builtin
    {
        pragma(LDC_intrinsic, "llvm.aarch64.neon.faddp.v4f32")
            float4 vpaddq_f32(float4 a, float4 b) pure @safe;
    }
    else
    {
        pragma(LDC_intrinsic, "llvm.aarch64.neon.addp.v4f32")
            float4 vpaddq_f32(float4 a, float4 b) pure @safe;
    }


    // Narrowing moves

    pragma(LDC_intrinsic, "llvm.aarch64.neon.sqxtn.v8i8")
        byte8 vqmovn_s16(short8 a) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.sqxtn.v4i16")
        short4 vqmovn_s32(int4 a) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.sqxtun.v8i8")
        byte8 vqmovun_s16(short8 a) pure @safe;


    // Table lookups

    pragma(LDC_intrinsic, "llvm.aarch64.neon.tbl1.v16i8")
        byte16 vqtbl1q_s8(byte16 t, byte16 idx) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.tbl1.v8i8")
        byte8 vtbl1_s8(byte16 t, byte8 idx) pure @safe;


    // Halving additions

    pragma(LDC_intrinsic, "llvm.aarch64.neon.urhadd.v16i8")
        byte16 vrhadd_u8(byte16 a, byte16 b) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.urhadd.v8i16")
        short8 vrhadd_u16(short8 a, short8 b) pure @safe;


    /// Lane-wise unsigned (logical) right shift of `a` by the per-lane counts in `b`.
    byte8 vshr_u8(byte8 a, byte8 b) pure @safe
    {
        return a >>> b;
    }
}

version(unittest)
{
    /// Absolute value of a double, for use in unit tests only.
    /// LDC forwards to `llvm_fabs`; other compilers clear the sign bit
    /// (bit 63) of the value's bit pattern.
    double abs_double(double x) @trusted
    {
        version(LDC)
            return llvm_fabs(x);
        else
        {
            long uf = *cast(long*)(&x);
            uf &= 0x7fffffff_ffffffff;
            return *cast(double*)(&uf);
        }
    }
}

// Needed because in the old GDC used on Travis, core.stdc.math.isnan isn't pure.

/// Returns `true` if `x` is a NaN: all 8 exponent bits set (0x7F800000)
/// and a non-zero 23-bit mantissa. Infinity has a zero mantissa and is
/// therefore not reported as NaN.
bool isnan(float x) pure @trusted
{
    uint u = *cast(uint*)(&x);
    bool result = ((u & 0x7F800000) == 0x7F800000) && (u & 0x007FFFFF);
    return result;
}
unittest
{
    float x = float.nan;
    assert(isnan(x));

    x = 0;
    assert(!isnan(x));

    x = float.infinity;
    assert(!isnan(x));
}

/// Returns `true` if `x` is a NaN: all 11 exponent bits set
/// (0x7FF00000_00000000) and a non-zero 52-bit mantissa. Infinity has a
/// zero mantissa and is therefore not reported as NaN.
bool isnan(double x) pure @trusted
{
    ulong u = *cast(ulong*)(&x);
    return ((u & 0x7FF00000_00000000) == 0x7FF00000_00000000) && (u & 0x000FFFFF_FFFFFFFF);
}
unittest
{
    double x = double.nan;
    assert(isnan(x));

    x = 0;
    assert(!isnan(x));

    x = double.infinity;
    assert(!isnan(x));
}