1 /** 2 * Internal stuff only, do not import. 3 * 4 * Copyright: Copyright Guillaume Piolat 2016-2020, Stefanos Baziotis 2019. 5 * License: $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0) 6 */ 7 module inteli.internals; 8 9 import inteli.types; 10 11 // The only math functions needed for intel-intrinsics 12 public import core.math: sqrt; // since it's an intrinsics 13 14 package: 15 nothrow: 16 @nogc: 17 18 19 version(GNU) 20 { 21 version (X86) 22 { 23 // For 32-bit x86, disable vector extensions with GDC. 24 // It just doesn't work well. 25 enum GDC_with_x86 = true; 26 enum GDC_with_MMX = false; 27 enum GDC_with_SSE = false; 28 enum GDC_with_SSE2 = false; 29 enum GDC_with_SSE3 = false; 30 enum GDC_with_SSSE3 = false; 31 enum GDC_with_SSE41 = false; 32 enum GDC_with_SSE42 = false; 33 enum GDC_with_AVX = false; 34 enum GDC_with_AVX2 = false; 35 enum GDC_with_SHA = false; 36 enum GDC_with_BMI2 = false; 37 } 38 else version (X86_64) 39 { 40 // GDC support uses extended inline assembly: 41 // https://gcc.gnu.org/onlinedocs/gcc/Extended-Asm.html (general information and hints) 42 // https://gcc.gnu.org/onlinedocs/gcc/Simple-Constraints.html (binding variables to registers) 43 // https://gcc.gnu.org/onlinedocs/gcc/Machine-Constraints.html (x86 specific register short names) 44 45 public import core.simd: byte16, short8, int4, float4, double2; 46 47 // NOTE: These intrinsics are not available in every i386 and x86_64 CPU. 48 // For more info: https://gcc.gnu.org/onlinedocs/gcc-4.9.2/gcc/X86-Built-in-Functions.html 49 public import gcc.builtins; 50 51 // TODO: SSE and SSE2 should be truly optional instead, in the future, if we 52 // want to support other archs with GDC 53 54 enum GDC_with_x86 = true; 55 enum GDC_with_MMX = true; // We don't have a way to detect that at CT, but we assume it's there 56 enum GDC_with_SSE = true; // We don't have a way to detect that at CT, but we assume it's there 57 enum GDC_with_SSE2 = true; // We don't have a way to detect that at CT, but we assume it's there 58 59 static if (__VERSION__ >= 2100) // Starting at GDC 12.1 60 { 61 enum GDC_with_SSE3 = __traits(compiles, __builtin_ia32_haddps); 62 enum GDC_with_SSSE3 = __traits(compiles, __builtin_ia32_pmulhrsw128); 63 enum GDC_with_SSE41 = __traits(compiles, __builtin_ia32_dpps); 64 enum GDC_with_SSE42 = __traits(compiles, __builtin_ia32_pcmpgtq); 65 enum GDC_with_AVX = __traits(compiles, __builtin_ia32_vbroadcastf128_pd256); 66 enum GDC_with_AVX2 = __traits(compiles, __builtin_ia32_gathersiv2df); 67 } 68 else 69 { 70 // Before GCC 11.3, no reliable way to detect instruction sets. 71 // We start above detection at GCC 12, with DMDFE 2.100, which 72 // is more conservative. 73 enum GDC_with_SSE3 = false; 74 enum GDC_with_SSSE3 = false; 75 enum GDC_with_SSE41 = false; 76 enum GDC_with_SSE42 = false; 77 enum GDC_with_AVX = false; 78 enum GDC_with_AVX2 = false; 79 } 80 81 enum GDC_with_SHA = false; // TODO: detect that 82 enum GDC_with_BMI2 = false; // TODO: detect that 83 } 84 else 85 { 86 enum GDC_with_x86 = false; 87 enum GDC_with_MMX = false; 88 enum GDC_with_SSE = false; 89 enum GDC_with_SSE2 = false; 90 enum GDC_with_SSE3 = false; 91 enum GDC_with_SSSE3 = false; 92 enum GDC_with_SSE41 = false; 93 enum GDC_with_SSE42 = false; 94 enum GDC_with_AVX = false; 95 enum GDC_with_AVX2 = false; 96 enum GDC_with_SHA = false; 97 enum GDC_with_BMI2 = false; 98 } 99 } 100 else 101 { 102 enum GDC_with_x86 = false; 103 enum GDC_with_MMX = false; 104 enum GDC_with_SSE = false; 105 enum GDC_with_SSE2 = false; 106 enum GDC_with_SSE3 = false; 107 enum GDC_with_SSSE3 = false; 108 enum GDC_with_SSE41 = false; 109 enum GDC_with_SSE42 = false; 110 enum GDC_with_AVX = false; 111 enum GDC_with_AVX2 = false; 112 enum GDC_with_SHA = false; 113 enum GDC_with_BMI2 = false; 114 } 115 116 version(LDC) 117 { 118 public import core.simd; 119 public import ldc.simd; 120 public import ldc.intrinsics; 121 public import ldc.llvmasm: __asm; 122 123 // Since LDC 1.13, using the new ldc.llvmasm.__ir variants instead of inlineIR 124 static if (__VERSION__ >= 2083) 125 { 126 import ldc.llvmasm; 127 alias LDCInlineIR = __ir_pure; 128 129 // A version of inline IR with prefix/suffix didn't exist before LDC 1.13 130 alias LDCInlineIREx = __irEx_pure; 131 } 132 else 133 { 134 alias LDCInlineIR = inlineIR; 135 } 136 137 version(ARM) 138 { 139 public import ldc.gccbuiltins_arm; 140 enum LDC_with_ARM32 = true; 141 enum LDC_with_ARM64 = false; 142 enum LDC_with_ARM64_CRC = false; 143 enum LDC_with_SSE1 = false; 144 enum LDC_with_SSE2 = false; 145 enum LDC_with_SSE3 = false; 146 enum LDC_with_SSSE3 = false; 147 enum LDC_with_SSE41 = false; 148 enum LDC_with_SSE42 = false; 149 enum LDC_with_AVX = false; 150 enum LDC_with_AVX2 = false; 151 enum LDC_with_SHA = false; 152 enum LDC_with_BMI2 = false; 153 } 154 else version(AArch64) 155 { 156 public import ldc.gccbuiltins_aarch64; 157 enum LDC_with_ARM32 = false; 158 enum LDC_with_ARM64 = true; // implies "has Neon" 159 enum LDC_with_ARM64_CRC = __traits(targetHasFeature, "crc"); 160 enum LDC_with_SSE1 = false; 161 enum LDC_with_SSE2 = false; 162 enum LDC_with_SSE3 = false; 163 enum LDC_with_SSSE3 = false; 164 enum LDC_with_SSE41 = false; 165 enum LDC_with_SSE42 = false; 166 enum LDC_with_AVX = false; 167 enum LDC_with_AVX2 = false; 168 enum LDC_with_SHA = false; 169 enum LDC_with_BMI2 = false; 170 } 171 else 172 { 173 public import ldc.gccbuiltins_x86; 174 enum LDC_with_ARM32 = false; 175 enum LDC_with_ARM64 = false; 176 enum LDC_with_ARM64_CRC = false; 177 enum LDC_with_SSE1 = __traits(targetHasFeature, "sse"); 178 enum LDC_with_SSE2 = __traits(targetHasFeature, "sse2"); 179 enum LDC_with_SSE3 = __traits(targetHasFeature, "sse3"); 180 enum LDC_with_SSSE3 = __traits(targetHasFeature, "ssse3"); 181 enum LDC_with_SSE41 = __traits(targetHasFeature, "sse4.1"); 182 enum LDC_with_SSE42 = __traits(targetHasFeature, "sse4.2"); 183 enum LDC_with_AVX = __traits(targetHasFeature, "avx"); 184 enum LDC_with_AVX2 = __traits(targetHasFeature, "avx2"); 185 enum LDC_with_SHA = __traits(targetHasFeature, "sha"); 186 enum LDC_with_BMI2 = __traits(targetHasFeature, "bmi2"); 187 } 188 } 189 else 190 { 191 enum LDC_with_ARM32 = false; 192 enum LDC_with_ARM64 = false; 193 enum LDC_with_ARM64_CRC = false; 194 enum LDC_with_SSE1 = false; 195 enum LDC_with_SSE2 = false; 196 enum LDC_with_SSE3 = false; 197 enum LDC_with_SSSE3 = false; 198 enum LDC_with_SSE41 = false; 199 enum LDC_with_SSE42 = false; 200 enum LDC_with_AVX = false; 201 enum LDC_with_AVX2 = false; 202 enum LDC_with_SHA = false; 203 enum LDC_with_BMI2 = false; 204 } 205 206 enum LDC_with_ARM = LDC_with_ARM32 | LDC_with_ARM64; 207 208 version(DigitalMars) 209 { 210 version(D_InlineAsm_X86) 211 enum DMD_with_asm = true; 212 else version(D_InlineAsm_X86_64) 213 enum DMD_with_asm = true; 214 else 215 enum DMD_with_asm = false; 216 217 version(D_InlineAsm_X86) 218 enum DMD_with_32bit_asm = DMD_with_asm; // sometimes you want a 32-bit DMD only solution 219 else 220 enum DMD_with_32bit_asm = false; 221 222 version (D_SIMD) 223 enum DMD_with_DSIMD = !SSESizedVectorsAreEmulated; 224 else 225 enum DMD_with_DSIMD = false; 226 } 227 else 228 { 229 enum DMD_with_asm = false; 230 enum DMD_with_32bit_asm = false; 231 enum DMD_with_DSIMD = false; 232 } 233 234 static if (LDC_with_ARM32) 235 { 236 package uint arm_get_fpcr() nothrow @nogc @trusted 237 { 238 return __builtin_arm_get_fpscr(); 239 } 240 241 package void arm_set_fpcr(uint cw) nothrow @nogc @trusted 242 { 243 __builtin_arm_set_fpscr(cw); 244 } 245 } 246 247 static if (LDC_with_ARM64) 248 { 249 pragma(LDC_intrinsic, "llvm.aarch64.get.fpcr") 250 long __builtin_aarch64_get_fpcr() pure nothrow @nogc @safe; 251 252 package uint arm_get_fpcr() pure nothrow @nogc @trusted 253 { 254 // LLVM intrinsic "llvm.aarch64.get.fpcr" seems buggy and doesn't return FPCR 255 return __asm!uint("mrs $0, fpcr", "=r"); 256 } 257 258 package void arm_set_fpcr(uint cw) nothrow @nogc @trusted 259 { 260 // Note: there doesn't seem to be an intrinsic in LLVM to set FPCR. 261 long save_x2; 262 __asm!void("str x2, $1 \n" ~ 263 "ldr w2, $0 \n" ~ 264 "msr fpcr, x2 \n" ~ 265 "ldr x2, $1 " , "m,m", cw, &save_x2); 266 } 267 } 268 269 270 // For internal use only, since public API deals with a x86 semantic emulation 271 enum uint _MM_ROUND_NEAREST_ARM = 0x00000000; 272 enum uint _MM_ROUND_DOWN_ARM = 0x00800000; 273 enum uint _MM_ROUND_UP_ARM = 0x00400000; 274 enum uint _MM_ROUND_TOWARD_ZERO_ARM = 0x00C00000; 275 enum uint _MM_ROUND_MASK_ARM = 0x00C00000; 276 enum uint _MM_FLUSH_ZERO_MASK_ARM = 0x01000000; 277 278 279 // 280 // <ROUNDING> 281 // 282 // Why is that there? For DMD, we cannot use rint because _MM_SET_ROUNDING_MODE 283 // doesn't change the FPU rounding mode, and isn't expected to do so. 284 // So we devised these rounding function to help having consistent rounding between 285 // LDC and DMD. It's important that DMD uses whatever is in MXCSR to round. 286 // 287 // Note: There is no MXCSR in ARM. But there is fpcr/fpscr that implements similar 288 // functionality. 289 // https://developer.arm.com/documentation/dui0068/b/vector-floating-point-programming/vfp-system-registers/fpscr--the-floating-point-status-and-control-register 290 // We use fpcr/fpscr since it's thread-local, so we can emulate those x86 conversion albeit slowly. 291 292 int convertFloatToInt32UsingMXCSR(float value) @trusted 293 { 294 int result; 295 version(GNU) 296 { 297 asm pure nothrow @nogc @trusted 298 { 299 "cvtss2si %1, %0\n": "=r"(result) : "x" (value); 300 } 301 } 302 else static if (LDC_with_ARM32) 303 { 304 // TODO: this is a bug, it won't preserve registers when optimized 305 result = __asm!int(`vldr s2, $1 306 vcvtr.s32.f32 s2, s2 307 vmov $0, s2`, "=r,m", value); 308 } 309 else static if (LDC_with_ARM64) 310 { 311 // Get current rounding mode. 312 uint fpscr = arm_get_fpcr(); 313 314 switch(fpscr & _MM_ROUND_MASK_ARM) 315 { 316 default: 317 case _MM_ROUND_NEAREST_ARM: result = vcvtns_s32_f32(value); break; 318 case _MM_ROUND_DOWN_ARM: result = vcvtms_s32_f32(value); break; 319 case _MM_ROUND_UP_ARM: result = vcvtps_s32_f32(value); break; 320 case _MM_ROUND_TOWARD_ZERO_ARM: result = vcvts_s32_f32(value); break; 321 } 322 } 323 else 324 { 325 asm pure nothrow @nogc @trusted 326 { 327 cvtss2si EAX, value; 328 mov result, EAX; 329 } 330 } 331 return result; 332 } 333 334 int convertDoubleToInt32UsingMXCSR(double value) @trusted 335 { 336 int result; 337 version(GNU) 338 { 339 asm pure nothrow @nogc @trusted 340 { 341 "cvtsd2si %1, %0\n": "=r"(result) : "x" (value); 342 } 343 } 344 else static if (LDC_with_ARM32) 345 { 346 // TODO: bug, doesn't preserve registers 347 result = __asm!int(`vldr d2, $1 348 vcvtr.s32.f64 s2, d2 349 vmov $0, s2`, "=r,m", value); 350 } 351 else static if (LDC_with_ARM64) 352 { 353 // Get current rounding mode. 354 uint fpscr = arm_get_fpcr(); 355 356 switch(fpscr & _MM_ROUND_MASK_ARM) 357 { 358 default: 359 case _MM_ROUND_NEAREST_ARM: result = vcvtns_s32_f64(value); break; 360 case _MM_ROUND_DOWN_ARM: result = vcvtms_s32_f64(value); break; 361 case _MM_ROUND_UP_ARM: result = vcvtps_s32_f64(value); break; 362 case _MM_ROUND_TOWARD_ZERO_ARM: result = vcvts_s32_f64(value); break; 363 } 364 } 365 else 366 { 367 asm pure nothrow @nogc @trusted 368 { 369 cvtsd2si EAX, value; 370 mov result, EAX; 371 } 372 } 373 return result; 374 } 375 376 long convertFloatToInt64UsingMXCSR(float value) @trusted 377 { 378 static if (LDC_with_ARM32) 379 { 380 // We have to resort to libc since 32-bit ARM 381 // doesn't seem to have 64-bit registers. 382 383 uint fpscr = arm_get_fpcr(); // Get current rounding mode. 384 385 // Note: converting to double precision else rounding could be different for large integers 386 double asDouble = value; 387 388 switch(fpscr & _MM_ROUND_MASK_ARM) 389 { 390 default: 391 case _MM_ROUND_NEAREST_ARM: return cast(long)(llvm_round(asDouble)); 392 case _MM_ROUND_DOWN_ARM: return cast(long)(llvm_floor(asDouble)); 393 case _MM_ROUND_UP_ARM: return cast(long)(llvm_ceil(asDouble)); 394 case _MM_ROUND_TOWARD_ZERO_ARM: return cast(long)(asDouble); 395 } 396 } 397 else static if (LDC_with_ARM64) 398 { 399 uint fpscr = arm_get_fpcr(); 400 401 switch(fpscr & _MM_ROUND_MASK_ARM) 402 { 403 default: 404 case _MM_ROUND_NEAREST_ARM: return vcvtns_s64_f32(value); 405 case _MM_ROUND_DOWN_ARM: return vcvtms_s64_f32(value); 406 case _MM_ROUND_UP_ARM: return vcvtps_s64_f32(value); 407 case _MM_ROUND_TOWARD_ZERO_ARM: return vcvts_s64_f32(value); 408 } 409 } 410 // 64-bit can use an SSE instruction 411 else version(D_InlineAsm_X86_64) 412 { 413 long result; 414 version(LDC) // work-around for " Data definition directives inside inline asm are not supported yet." 415 { 416 asm pure nothrow @nogc @trusted 417 { 418 movss XMM0, value; 419 cvtss2si RAX, XMM0; 420 mov result, RAX; 421 } 422 } 423 else 424 { 425 asm pure nothrow @nogc @trusted 426 { 427 movss XMM0, value; 428 db 0xf3; db 0x48; db 0x0f; db 0x2d; db 0xc0; // cvtss2si RAX, XMM0 (DMD refuses to emit) 429 mov result, RAX; 430 } 431 } 432 return result; 433 } 434 else version(D_InlineAsm_X86) 435 { 436 // In the case of 32-bit x86 there is no SSE2 way to convert FP to 64-bit int 437 // This leads to an unfortunate FPU sequence in every C++ compiler. 438 // See: https://godbolt.org/z/vZym77 439 440 // Get current MXCSR rounding 441 uint sseRounding; 442 ushort savedFPUCW; 443 ushort newFPUCW; 444 long result; 445 asm pure nothrow @nogc @trusted 446 { 447 stmxcsr sseRounding; 448 fld value; 449 fnstcw savedFPUCW; 450 mov AX, savedFPUCW; 451 and AX, 0xf3ff; // clear FPU rounding bits 452 movzx ECX, word ptr sseRounding; 453 and ECX, 0x6000; // only keep SSE rounding bits 454 shr ECX, 3; 455 or AX, CX; // make a new control word for FPU with SSE bits 456 mov newFPUCW, AX; 457 fldcw newFPUCW; 458 fistp qword ptr result; // convert, respecting MXCSR (but not other control word things) 459 fldcw savedFPUCW; 460 } 461 return result; 462 } 463 else static if (GDC_with_x86) 464 { 465 version(X86_64) // 64-bit can just use the right instruction 466 { 467 static assert(GDC_with_SSE); 468 __m128 A; 469 A.ptr[0] = value; 470 return __builtin_ia32_cvtss2si64 (A); 471 } 472 else version(X86) // 32-bit 473 { 474 // This is untested! 475 uint sseRounding; 476 ushort savedFPUCW; 477 ushort newFPUCW; 478 long result; 479 asm pure nothrow @nogc @trusted 480 { 481 "stmxcsr %1;\n" ~ 482 "fld %2;\n" ~ 483 "fnstcw %3;\n" ~ 484 "movw %3, %%ax;\n" ~ 485 "andw $0xf3ff, %%ax;\n" ~ 486 "movzwl %1, %%ecx;\n" ~ 487 "andl $0x6000, %%ecx;\n" ~ 488 "shrl $3, %%ecx;\n" ~ 489 "orw %%cx, %%ax\n" ~ 490 "movw %%ax, %4;\n" ~ 491 "fldcw %4;\n" ~ 492 "fistpll %0;\n" ~ 493 "fldcw %3;\n" 494 : "=m"(result) // %0 495 : "m" (sseRounding), 496 "f" (value), 497 "m" (savedFPUCW), 498 "m" (newFPUCW) 499 : "eax", "ecx", "st"; 500 } 501 return result; 502 } 503 else 504 static assert(false); 505 } 506 else 507 static assert(false); 508 } 509 510 511 ///ditto 512 long convertDoubleToInt64UsingMXCSR(double value) @trusted 513 { 514 static if (LDC_with_ARM32) 515 { 516 // We have to resort to libc since 32-bit ARM 517 // doesn't seem to have 64-bit registers. 518 uint fpscr = arm_get_fpcr(); // Get current rounding mode. 519 switch(fpscr & _MM_ROUND_MASK_ARM) 520 { 521 default: 522 case _MM_ROUND_NEAREST_ARM: return cast(long)(llvm_round(value)); 523 case _MM_ROUND_DOWN_ARM: return cast(long)(llvm_floor(value)); 524 case _MM_ROUND_UP_ARM: return cast(long)(llvm_ceil(value)); 525 case _MM_ROUND_TOWARD_ZERO_ARM: return cast(long)(value); 526 } 527 } 528 else static if (LDC_with_ARM64) 529 { 530 // Get current rounding mode. 531 uint fpscr = arm_get_fpcr(); 532 533 switch(fpscr & _MM_ROUND_MASK_ARM) 534 { 535 default: 536 case _MM_ROUND_NEAREST_ARM: return vcvtns_s64_f64(value); 537 case _MM_ROUND_DOWN_ARM: return vcvtms_s64_f64(value); 538 case _MM_ROUND_UP_ARM: return vcvtps_s64_f64(value); 539 case _MM_ROUND_TOWARD_ZERO_ARM: return vcvts_s64_f64(value); 540 } 541 } 542 // 64-bit can use an SSE instruction 543 else version(D_InlineAsm_X86_64) 544 { 545 long result; 546 version(LDC) // work-around for "Data definition directives inside inline asm are not supported yet." 547 { 548 asm pure nothrow @nogc @trusted 549 { 550 movsd XMM0, value; 551 cvtsd2si RAX, XMM0; 552 mov result, RAX; 553 } 554 } 555 else 556 { 557 asm pure nothrow @nogc @trusted 558 { 559 movsd XMM0, value; 560 db 0xf2; db 0x48; db 0x0f; db 0x2d; db 0xc0; // cvtsd2si RAX, XMM0 (DMD refuses to emit) 561 mov result, RAX; 562 } 563 } 564 return result; 565 } 566 else version(D_InlineAsm_X86) 567 { 568 // In the case of 32-bit x86 there is no SSE2 way to convert FP to 64-bit int 569 // This leads to an unfortunate FPU sequence in every C++ compiler. 570 // See: https://godbolt.org/z/vZym77 571 572 // Get current MXCSR rounding 573 uint sseRounding; 574 ushort savedFPUCW; 575 ushort newFPUCW; 576 long result; 577 asm pure nothrow @nogc @trusted 578 { 579 stmxcsr sseRounding; 580 fld value; 581 fnstcw savedFPUCW; 582 mov AX, savedFPUCW; 583 and AX, 0xf3ff; 584 movzx ECX, word ptr sseRounding; 585 and ECX, 0x6000; 586 shr ECX, 3; 587 or AX, CX; 588 mov newFPUCW, AX; 589 fldcw newFPUCW; 590 fistp result; 591 fldcw savedFPUCW; 592 } 593 return result; 594 } 595 else static if (GDC_with_x86) 596 { 597 version(X86_64) 598 { 599 static assert(GDC_with_SSE2); 600 __m128d A; 601 A.ptr[0] = value; 602 return __builtin_ia32_cvtsd2si64 (A); 603 } 604 else 605 { 606 // This is untested! 607 uint sseRounding; 608 ushort savedFPUCW; 609 ushort newFPUCW; 610 long result; 611 asm pure nothrow @nogc @trusted 612 { 613 "stmxcsr %1;\n" ~ 614 "fld %2;\n" ~ 615 "fnstcw %3;\n" ~ 616 "movw %3, %%ax;\n" ~ 617 "andw $0xf3ff, %%ax;\n" ~ 618 "movzwl %1, %%ecx;\n" ~ 619 "andl $0x6000, %%ecx;\n" ~ 620 "shrl $3, %%ecx;\n" ~ 621 "orw %%cx, %%ax\n" ~ 622 "movw %%ax, %4;\n" ~ 623 "fldcw %4;\n" ~ 624 "fistpll %0;\n" ~ 625 "fldcw %3;\n" 626 : "=m"(result) // %0 627 : "m" (sseRounding), 628 "t" (value), 629 "m" (savedFPUCW), 630 "m" (newFPUCW) 631 : "eax", "ecx", "st"; 632 } 633 return result; 634 } 635 } 636 else 637 static assert(false); 638 } 639 640 // 641 // </ROUNDING> 642 // 643 644 645 // using the Intel terminology here 646 647 byte saturateSignedWordToSignedByte(short value) pure @safe 648 { 649 if (value > 127) value = 127; 650 if (value < -128) value = -128; 651 return cast(byte) value; 652 } 653 654 ubyte saturateSignedWordToUnsignedByte(short value) pure @safe 655 { 656 if (value > 255) value = 255; 657 if (value < 0) value = 0; 658 return cast(ubyte) value; 659 } 660 661 short saturateSignedIntToSignedShort(int value) pure @safe 662 { 663 if (value > 32767) value = 32767; 664 if (value < -32768) value = -32768; 665 return cast(short) value; 666 } 667 668 ushort saturateSignedIntToUnsignedShort(int value) pure @safe 669 { 670 if (value > 65535) value = 65535; 671 if (value < 0) value = 0; 672 return cast(ushort) value; 673 } 674 675 unittest // test saturate operations 676 { 677 assert( saturateSignedWordToSignedByte(32000) == 127); 678 assert( saturateSignedWordToUnsignedByte(32000) == 255); 679 assert( saturateSignedWordToSignedByte(-4000) == -128); 680 assert( saturateSignedWordToUnsignedByte(-4000) == 0); 681 assert( saturateSignedIntToSignedShort(32768) == 32767); 682 assert( saturateSignedIntToUnsignedShort(32768) == 32768); 683 assert( saturateSignedIntToSignedShort(-32769) == -32768); 684 assert( saturateSignedIntToUnsignedShort(-32769) == 0); 685 } 686 687 version(unittest) 688 { 689 // This is just for debugging tests 690 import core.stdc.stdio: printf; 691 692 // printing vectors for implementation 693 // Note: you can override `pure` within a `debug` clause 694 695 void _mm_print_pi64(__m64 v) @trusted 696 { 697 long1 vl = cast(long1)v; 698 printf("%lld\n", vl.array[0]); 699 } 700 701 void _mm_print_pi32(__m64 v) @trusted 702 { 703 int[2] C = (cast(int2)v).array; 704 printf("%d %d\n", C[0], C[1]); 705 } 706 707 void _mm_print_pi16(__m64 v) @trusted 708 { 709 short[4] C = (cast(short4)v).array; 710 printf("%d %d %d %d\n", C[0], C[1], C[2], C[3]); 711 } 712 713 void _mm_print_pi8(__m64 v) @trusted 714 { 715 byte[8] C = (cast(byte8)v).array; 716 printf("%d %d %d %d %d %d %d %d\n", 717 C[0], C[1], C[2], C[3], C[4], C[5], C[6], C[7]); 718 } 719 720 void _mm_print_epi64(__m128i v) @trusted 721 { 722 long2 vl = cast(long2)v; 723 printf("%lld %lld\n", vl.array[0], vl.array[1]); 724 } 725 726 void _mm_print_epi32(__m128i v) @trusted 727 { 728 printf("%d %d %d %d\n", 729 v.array[0], v.array[1], v.array[2], v.array[3]); 730 } 731 732 void _mm_print_epi16(__m128i v) @trusted 733 { 734 short[8] C = (cast(short8)v).array; 735 printf("%d %d %d %d %d %d %d %d\n", 736 C[0], C[1], C[2], C[3], C[4], C[5], C[6], C[7]); 737 } 738 739 void _mm_print_epi8(__m128i v) @trusted 740 { 741 byte[16] C = (cast(byte16)v).array; 742 printf("%d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d\n", 743 C[0], C[1], C[2], C[3], C[4], C[5], C[6], C[7], C[8], C[9], C[10], C[11], C[12], C[13], C[14], C[15]); 744 } 745 746 void _mm_print_ps(__m128 v) @trusted 747 { 748 // %g because %f can conceal very small numbers and prints zero instead 749 float[4] C = (cast(float4)v).array; 750 printf("%g %g %g %g\n", C[0], C[1], C[2], C[3]); 751 } 752 753 void _mm_print_pd(__m128d v) @trusted 754 { 755 double[2] C = (cast(double2)v).array; 756 printf("%f %f\n", C[0], C[1]); 757 } 758 759 void _mm256_print_pd(__m256d v) @trusted 760 { 761 // %g because %f can conceal very small numbers and prints zero instead 762 printf("%g %g %g %g\n", v.array[0], v.array[1], v.array[2], v.array[3]); 763 } 764 765 void _mm256_print_epi64(__m256i v) @trusted 766 { 767 long4 vl = cast(long4)v; 768 printf("%lld %lld %lld %lld\n", vl.array[0], vl.array[1], vl.array[2], vl.array[3]); 769 } 770 } 771 772 773 // 774 // <FLOATING-POINT COMPARISONS> 775 // 776 // Note: `ldc.simd` cannot express all nuances of FP comparisons, so we 777 // need different IR generation. 778 779 enum FPComparison 780 { 781 oeq, // ordered and equal 782 ogt, // ordered and greater than 783 oge, // ordered and greater than or equal 784 olt, // ordered and less than 785 ole, // ordered and less than or equal 786 one, // ordered and not equal 787 ord, // ordered (no nans) 788 ueq, // unordered or equal 789 ugt, // unordered or greater than ("nle") 790 uge, // unordered or greater than or equal ("nlt") 791 ult, // unordered or less than ("nge") 792 ule, // unordered or less than or equal ("ngt") 793 une, // unordered or not equal ("neq") 794 uno, // unordered (either nans) 795 } 796 797 private static immutable string[FPComparison.max+1] FPComparisonToString = 798 [ 799 "oeq", 800 "ogt", 801 "oge", 802 "olt", 803 "ole", 804 "one", 805 "ord", 806 "ueq", 807 "ugt", 808 "uge", 809 "ult", 810 "ule", 811 "une", 812 "uno", 813 ]; 814 815 // Individual float comparison: returns -1 for true or 0 for false. 816 // Useful for DMD and testing 817 private bool compareFloat(T)(FPComparison comparison, T a, T b) pure @safe 818 { 819 bool unordered = isnan(a) || isnan(b); 820 final switch(comparison) with(FPComparison) 821 { 822 case oeq: return a == b; 823 case ogt: return a > b; 824 case oge: return a >= b; 825 case olt: return a < b; 826 case ole: return a <= b; 827 case one: return !unordered && (a != b); // NaN with != always yields true 828 case ord: return !unordered; 829 case ueq: return unordered || (a == b); 830 case ugt: return unordered || (a > b); 831 case uge: return unordered || (a >= b); 832 case ult: return unordered || (a < b); 833 case ule: return unordered || (a <= b); 834 case une: return (a != b); // NaN with != always yields true 835 case uno: return unordered; 836 } 837 } 838 839 version(LDC) 840 { 841 /// Provides packed float comparisons 842 package int4 cmpps(FPComparison comparison)(float4 a, float4 b) pure @safe 843 { 844 enum ir = ` 845 %cmp = fcmp `~ FPComparisonToString[comparison] ~` <4 x float> %0, %1 846 %r = sext <4 x i1> %cmp to <4 x i32> 847 ret <4 x i32> %r`; 848 849 return LDCInlineIR!(ir, int4, float4, float4)(a, b); 850 } 851 852 /// Provides packed double comparisons 853 package long2 cmppd(FPComparison comparison)(double2 a, double2 b) pure @safe 854 { 855 enum ir = ` 856 %cmp = fcmp `~ FPComparisonToString[comparison] ~` <2 x double> %0, %1 857 %r = sext <2 x i1> %cmp to <2 x i64> 858 ret <2 x i64> %r`; 859 860 return LDCInlineIR!(ir, long2, double2, double2)(a, b); 861 } 862 863 /// CMPSS-style comparisons 864 /// clang implement it through x86 intrinsics, it is possible with IR alone 865 /// but leads to less optimal code. 866 /// PERF: try to implement it with __builtin_ia32_cmpss and immediate 0 to 7. 867 /// Not that simple. 868 package float4 cmpss(FPComparison comparison)(float4 a, float4 b) pure @safe 869 { 870 /* 871 enum ubyte predicateNumber = FPComparisonToX86Predicate[comparison]; 872 enum bool invertOp = (predicateNumber & 0x80) != 0; 873 static if(invertOp) 874 return __builtin_ia32_cmpsd(b, a, predicateNumber & 0x7f); 875 else 876 return __builtin_ia32_cmpsd(a, b, predicateNumber & 0x7f); 877 */ 878 enum ir = ` 879 %cmp = fcmp `~ FPComparisonToString[comparison] ~` float %0, %1 880 %r = sext i1 %cmp to i32 881 %r2 = bitcast i32 %r to float 882 ret float %r2`; 883 884 float4 r = a; 885 r[0] = LDCInlineIR!(ir, float, float, float)(a[0], b[0]); 886 return r; 887 } 888 889 /// CMPSD-style comparisons 890 /// clang implement it through x86 intrinsics, it is possible with IR alone 891 /// but leads to less optimal code. 892 /// PERF: try to implement it with __builtin_ia32_cmpsd and immediate 0 to 7. 893 /// Not that simple. 894 package double2 cmpsd(FPComparison comparison)(double2 a, double2 b) pure @safe 895 { 896 enum ir = ` 897 %cmp = fcmp `~ FPComparisonToString[comparison] ~` double %0, %1 898 %r = sext i1 %cmp to i64 899 %r2 = bitcast i64 %r to double 900 ret double %r2`; 901 902 double2 r = a; 903 r[0] = LDCInlineIR!(ir, double, double, double)(a[0], b[0]); 904 return r; 905 } 906 } 907 else 908 { 909 /// Provides packed float comparisons 910 package int4 cmpps(FPComparison comparison)(float4 a, float4 b) pure @trusted 911 { 912 int4 result; 913 foreach(i; 0..4) 914 { 915 result.ptr[i] = compareFloat!float(comparison, a.array[i], b.array[i]) ? -1 : 0; 916 } 917 return result; 918 } 919 920 /// Provides packed double comparisons 921 package long2 cmppd(FPComparison comparison)(double2 a, double2 b) pure @trusted 922 { 923 long2 result; 924 foreach(i; 0..2) 925 { 926 result.ptr[i] = compareFloat!double(comparison, a.array[i], b.array[i]) ? -1 : 0; 927 } 928 return result; 929 } 930 931 /// Provides CMPSS-style comparison 932 package float4 cmpss(FPComparison comparison)(float4 a, float4 b) pure @trusted 933 { 934 int4 result = cast(int4)a; 935 result.ptr[0] = compareFloat!float(comparison, a.array[0], b.array[0]) ? -1 : 0; 936 return cast(float4)result; 937 } 938 939 /// Provides CMPSD-style comparison 940 package double2 cmpsd(FPComparison comparison)(double2 a, double2 b) pure @trusted 941 { 942 long2 result = cast(long2)a; 943 result.ptr[0] = compareFloat!double(comparison, a.array[0], b.array[0]) ? -1 : 0; 944 return cast(double2)result; 945 } 946 } 947 unittest // cmpps 948 { 949 // Check all comparison type is working 950 float4 A = [1, 3, 5, float.nan]; 951 float4 B = [2, 3, 4, 5]; 952 953 int4 result_oeq = cmpps!(FPComparison.oeq)(A, B); 954 int4 result_ogt = cmpps!(FPComparison.ogt)(A, B); 955 int4 result_oge = cmpps!(FPComparison.oge)(A, B); 956 int4 result_olt = cmpps!(FPComparison.olt)(A, B); 957 int4 result_ole = cmpps!(FPComparison.ole)(A, B); 958 int4 result_one = cmpps!(FPComparison.one)(A, B); 959 int4 result_ord = cmpps!(FPComparison.ord)(A, B); 960 int4 result_ueq = cmpps!(FPComparison.ueq)(A, B); 961 int4 result_ugt = cmpps!(FPComparison.ugt)(A, B); 962 int4 result_uge = cmpps!(FPComparison.uge)(A, B); 963 int4 result_ult = cmpps!(FPComparison.ult)(A, B); 964 int4 result_ule = cmpps!(FPComparison.ule)(A, B); 965 int4 result_une = cmpps!(FPComparison.une)(A, B); 966 int4 result_uno = cmpps!(FPComparison.uno)(A, B); 967 968 static immutable int[4] correct_oeq = [ 0,-1, 0, 0]; 969 static immutable int[4] correct_ogt = [ 0, 0,-1, 0]; 970 static immutable int[4] correct_oge = [ 0,-1,-1, 0]; 971 static immutable int[4] correct_olt = [-1, 0, 0, 0]; 972 static immutable int[4] correct_ole = [-1,-1, 0, 0]; 973 static immutable int[4] correct_one = [-1, 0,-1, 0]; 974 static immutable int[4] correct_ord = [-1,-1,-1, 0]; 975 static immutable int[4] correct_ueq = [ 0,-1, 0,-1]; 976 static immutable int[4] correct_ugt = [ 0, 0,-1,-1]; 977 static immutable int[4] correct_uge = [ 0,-1,-1,-1]; 978 static immutable int[4] correct_ult = [-1, 0, 0,-1]; 979 static immutable int[4] correct_ule = [-1,-1, 0,-1]; 980 static immutable int[4] correct_une = [-1, 0,-1,-1]; 981 static immutable int[4] correct_uno = [ 0, 0, 0,-1]; 982 983 assert(result_oeq.array == correct_oeq); 984 assert(result_ogt.array == correct_ogt); 985 assert(result_oge.array == correct_oge); 986 assert(result_olt.array == correct_olt); 987 assert(result_ole.array == correct_ole); 988 assert(result_one.array == correct_one); 989 assert(result_ord.array == correct_ord); 990 assert(result_ueq.array == correct_ueq); 991 assert(result_ugt.array == correct_ugt); 992 assert(result_uge.array == correct_uge); 993 assert(result_ult.array == correct_ult); 994 assert(result_ule.array == correct_ule); 995 assert(result_une.array == correct_une); 996 assert(result_uno.array == correct_uno); 997 } 998 unittest 999 { 1000 double2 a = [1, 3]; 1001 double2 b = [2, 3]; 1002 long2 c = cmppd!(FPComparison.ult)(a, b); 1003 static immutable long[2] correct = [cast(long)(-1), 0]; 1004 assert(c.array == correct); 1005 } 1006 unittest // cmpss 1007 { 1008 void testComparison(FPComparison comparison)(float4 A, float4 B) 1009 { 1010 float4 result = cmpss!comparison(A, B); 1011 int4 iresult = cast(int4)result; 1012 int expected = compareFloat!float(comparison, A.array[0], B.array[0]) ? -1 : 0; 1013 assert(iresult.array[0] == expected); 1014 assert(result.array[1] == A.array[1]); 1015 assert(result.array[2] == A.array[2]); 1016 assert(result.array[3] == A.array[3]); 1017 } 1018 1019 // Check all comparison type is working 1020 float4 A = [1, 3, 5, 6]; 1021 float4 B = [2, 3, 4, 5]; 1022 float4 C = [float.nan, 3, 4, 5]; 1023 1024 testComparison!(FPComparison.oeq)(A, B); 1025 testComparison!(FPComparison.oeq)(A, C); 1026 testComparison!(FPComparison.ogt)(A, B); 1027 testComparison!(FPComparison.ogt)(A, C); 1028 testComparison!(FPComparison.oge)(A, B); 1029 testComparison!(FPComparison.oge)(A, C); 1030 testComparison!(FPComparison.olt)(A, B); 1031 testComparison!(FPComparison.olt)(A, C); 1032 testComparison!(FPComparison.ole)(A, B); 1033 testComparison!(FPComparison.ole)(A, C); 1034 testComparison!(FPComparison.one)(A, B); 1035 testComparison!(FPComparison.one)(A, C); 1036 testComparison!(FPComparison.ord)(A, B); 1037 testComparison!(FPComparison.ord)(A, C); 1038 testComparison!(FPComparison.ueq)(A, B); 1039 testComparison!(FPComparison.ueq)(A, C); 1040 testComparison!(FPComparison.ugt)(A, B); 1041 testComparison!(FPComparison.ugt)(A, C); 1042 testComparison!(FPComparison.uge)(A, B); 1043 testComparison!(FPComparison.uge)(A, C); 1044 testComparison!(FPComparison.ult)(A, B); 1045 testComparison!(FPComparison.ult)(A, C); 1046 testComparison!(FPComparison.ule)(A, B); 1047 testComparison!(FPComparison.ule)(A, C); 1048 testComparison!(FPComparison.une)(A, B); 1049 testComparison!(FPComparison.une)(A, C); 1050 testComparison!(FPComparison.uno)(A, B); 1051 testComparison!(FPComparison.uno)(A, C); 1052 } 1053 unittest // cmpsd 1054 { 1055 void testComparison(FPComparison comparison)(double2 A, double2 B) 1056 { 1057 double2 result = cmpsd!comparison(A, B); 1058 long2 iresult = cast(long2)result; 1059 long expected = compareFloat!double(comparison, A.array[0], B.array[0]) ? -1 : 0; 1060 assert(iresult.array[0] == expected); 1061 assert(result.array[1] == A.array[1]); 1062 } 1063 1064 // Check all comparison type is working 1065 double2 A = [1, 3]; 1066 double2 B = [2, 4]; 1067 double2 C = [double.nan, 5]; 1068 1069 testComparison!(FPComparison.oeq)(A, B); 1070 testComparison!(FPComparison.oeq)(A, C); 1071 testComparison!(FPComparison.ogt)(A, B); 1072 testComparison!(FPComparison.ogt)(A, C); 1073 testComparison!(FPComparison.oge)(A, B); 1074 testComparison!(FPComparison.oge)(A, C); 1075 testComparison!(FPComparison.olt)(A, B); 1076 testComparison!(FPComparison.olt)(A, C); 1077 testComparison!(FPComparison.ole)(A, B); 1078 testComparison!(FPComparison.ole)(A, C); 1079 testComparison!(FPComparison.one)(A, B); 1080 testComparison!(FPComparison.one)(A, C); 1081 testComparison!(FPComparison.ord)(A, B); 1082 testComparison!(FPComparison.ord)(A, C); 1083 testComparison!(FPComparison.ueq)(A, B); 1084 testComparison!(FPComparison.ueq)(A, C); 1085 testComparison!(FPComparison.ugt)(A, B); 1086 testComparison!(FPComparison.ugt)(A, C); 1087 testComparison!(FPComparison.uge)(A, B); 1088 testComparison!(FPComparison.uge)(A, C); 1089 testComparison!(FPComparison.ult)(A, B); 1090 testComparison!(FPComparison.ult)(A, C); 1091 testComparison!(FPComparison.ule)(A, B); 1092 testComparison!(FPComparison.ule)(A, C); 1093 testComparison!(FPComparison.une)(A, B); 1094 testComparison!(FPComparison.une)(A, C); 1095 testComparison!(FPComparison.uno)(A, B); 1096 testComparison!(FPComparison.uno)(A, C); 1097 } 1098 1099 // 1100 // </FLOATING-POINT COMPARISONS> 1101 // 1102 1103 1104 __m64 to_m64(__m128i a) pure @trusted 1105 { 1106 long2 la = cast(long2)a; 1107 long1 r = la.array[0]; 1108 return r; 1109 } 1110 1111 __m128i to_m128i(__m64 a) pure @trusted 1112 { 1113 /* Not sufficient to avoid https://issues.dlang.org/show_bug.cgi?id=21474 1114 1115 version(DigitalMars) // Workaround for https://issues.dlang.org/show_bug.cgi?id=21474 1116 { 1117 long2 r = a.array[0]; 1118 r.ptr[1] = 0; 1119 return cast(int4)r; 1120 } 1121 else */ 1122 { 1123 long2 r = [0, 0]; 1124 r.ptr[0] = a.array[0]; 1125 return cast(__m128i)r; 1126 } 1127 } 1128 1129 // ADDITIONAL x86 INTRINSICS 1130 // Absent from ldc.gccbuiltins_x86 for some reason, but needed. 1131 // https://github.com/ldc-developers/llvm-project/blob/ldc-release/12.x/llvm/include/llvm/IR/IntrinsicsX86.td 1132 static if (LDC_with_SSE41) 1133 { 1134 pragma(LDC_intrinsic, "llvm.x86.sse41.pblendvb") 1135 byte16 __builtin_ia32_pblendvb(byte16, byte16, byte16) pure @safe; 1136 } 1137 1138 // SOME NEON INTRINSICS 1139 // Emulating some x86 intrinsics needs access to a range of ARM intrinsics. 1140 // Not in the public API but the simde project expose it all for the user to use. 1141 // MAYDO: create a new neon.d module, for internal use only. 1142 // MAYDO: port them to ARM32 so that ARM32 can be as fast as ARM64. 1143 static if (LDC_with_ARM64) 1144 { 1145 // VERY USEFUL LINK 1146 // https://github.com/ldc-developers/llvm-project/blob/ldc-release/11.x/llvm/include/llvm/IR/IntrinsicsAArch64.td 1147 // Also: https://developer.arm.com/architectures/instruction-sets/intrinsics/ 1148 1149 pragma(LDC_intrinsic, "llvm.aarch64.crc32cb") 1150 uint __crc32cb(uint a, uint b) pure @safe; 1151 1152 pragma(LDC_intrinsic, "llvm.aarch64.crc32ch") 1153 uint __crc32ch(uint a, uint b) pure @safe; 1154 1155 pragma(LDC_intrinsic, "llvm.aarch64.crc32cw") 1156 uint __crc32cw(uint a, uint b) pure @safe; 1157 1158 pragma(LDC_intrinsic, "llvm.aarch64.crc32cx") 1159 uint __crc32cd(uint a, ulong b) pure @safe; 1160 1161 //pragma(LDC_intrinsic, "llvm.aarch64.dmb") 1162 // uint __dmb(int a) @safe; // didn't found a name in intrinsic list 1163 1164 pragma(LDC_intrinsic, "llvm.aarch64.neon.uabd.v16i8") 1165 byte16 vabdq_u8(byte16 a, byte16 b) pure @safe; 1166 1167 pragma(LDC_intrinsic, "llvm.aarch64.neon.abs.v8i16") 1168 short8 vabsq_s16(short8 a) pure @safe; 1169 1170 pragma(LDC_intrinsic, "llvm.aarch64.neon.abs.v4i32") 1171 int4 vabsq_s32(int4 a) pure @safe; 1172 1173 pragma(LDC_intrinsic, "llvm.aarch64.neon.abs.v16i8") 1174 byte16 vabsq_s8(byte16 a) pure @safe; 1175 1176 byte8 vand_u8(byte8 a, byte8 b) pure @safe 1177 { 1178 return a & b; 1179 } 1180 1181 long2 vandq_s64(long2 a, long2 b) 1182 { 1183 return a & b; 1184 } 1185 1186 long2 vbicq_s64(long2 a, long2 b) pure @safe 1187 { 1188 return a & ~b; 1189 } 1190 1191 int4 vbslq_s32(int4 a, int4 b, int4 c) pure @safe 1192 { 1193 return c ^ ((c ^ b) & a); 1194 } 1195 1196 byte16 vbslq_s8(byte16 a, byte16 b, byte16 c) pure @safe 1197 { 1198 return c ^ ((c ^ b) & a); 1199 } 1200 1201 long2 vbslq_s64(long2 a, long2 b, long2 c) pure @safe 1202 { 1203 return c ^ ((c ^ b) & a); 1204 } 1205 1206 short8 vcombine_s16(short4 lo, short4 hi) pure @trusted 1207 { 1208 short8 r; 1209 r.ptr[0] = lo.array[0]; 1210 r.ptr[1] = lo.array[1]; 1211 r.ptr[2] = lo.array[2]; 1212 r.ptr[3] = lo.array[3]; 1213 r.ptr[4] = hi.array[0]; 1214 r.ptr[5] = hi.array[1]; 1215 r.ptr[6] = hi.array[2]; 1216 r.ptr[7] = hi.array[3]; 1217 return r; 1218 } 1219 1220 int4 vcombine_s32(int2 lo, int2 hi) pure @trusted 1221 { 1222 int4 r; 1223 r.ptr[0] = lo.array[0]; 1224 r.ptr[1] = lo.array[1]; 1225 r.ptr[2] = hi.array[0]; 1226 r.ptr[3] = hi.array[1]; 1227 return r; 1228 } 1229 1230 byte16 vcombine_s8(byte8 lo, byte8 hi) pure @trusted 1231 { 1232 byte16 r; 1233 r.ptr[0] = lo.array[0]; 1234 r.ptr[1] = lo.array[1]; 1235 r.ptr[2] = lo.array[2]; 1236 r.ptr[3] = lo.array[3]; 1237 r.ptr[4] = lo.array[4]; 1238 r.ptr[5] = lo.array[5]; 1239 r.ptr[6] = lo.array[6]; 1240 r.ptr[7] = lo.array[7]; 1241 r.ptr[8] = hi.array[0]; 1242 r.ptr[9] = hi.array[1]; 1243 r.ptr[10] = hi.array[2]; 1244 r.ptr[11] = hi.array[3]; 1245 r.ptr[12] = hi.array[4]; 1246 r.ptr[13] = hi.array[5]; 1247 r.ptr[14] = hi.array[6]; 1248 r.ptr[15] = hi.array[7]; 1249 return r; 1250 } 1251 1252 short8 vcombine_u16(short4 lo, short4 hi) pure @trusted 1253 { 1254 short8 r; 1255 r.ptr[0] = lo.array[0]; 1256 r.ptr[1] = lo.array[1]; 1257 r.ptr[2] = lo.array[2]; 1258 r.ptr[3] = lo.array[3]; 1259 r.ptr[4] = hi.array[0]; 1260 r.ptr[5] = hi.array[1]; 1261 r.ptr[6] = hi.array[2]; 1262 r.ptr[7] = hi.array[3]; 1263 return r; 1264 } 1265 1266 1267 // float4 => int4 1268 1269 pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtms.v4i32.v4f32") 1270 int4 vcvtmq_s32_f32(float4 a) pure @safe; 1271 1272 pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtns.v4i32.v4f32") 1273 int4 vcvtnq_s32_f32(float4 a) pure @safe; 1274 1275 pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtps.v4i32.v4f32") 1276 int4 vcvtpq_s32_f32(float4 a) pure @safe; 1277 1278 pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtzs.v4i32.v4f32") 1279 int4 vcvtzq_s32_f32(float4 a) pure @safe; 1280 1281 1282 // double2 => long2 1283 1284 pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtms.v2i64.v2f64") 1285 long2 vcvtmq_s64_f64(double2 a) pure @safe; 1286 1287 pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtns.v2i64.v2f64") 1288 long2 vcvtnq_s64_f64(double2 a) pure @safe; 1289 1290 pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtps.v2i64.v2f64") 1291 long2 vcvtpq_s64_f64(double2 a) pure @safe; 1292 1293 pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtzs.v2i64.v2f64") 1294 long2 vcvtzq_s64_f64(double2 a) pure @safe; 1295 1296 pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtms.i32.f32") 1297 int vcvtms_s32_f32(float a) pure @safe; 1298 1299 pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtns.i32.f32") 1300 int vcvtns_s32_f32(float a) pure @safe; 1301 1302 pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtps.i32.f32") 1303 int vcvtps_s32_f32(float a) pure @safe; 1304 1305 pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtzs.i32.f32") 1306 int vcvts_s32_f32(float a) pure @safe; 1307 1308 pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtms.i32.f64") 1309 int vcvtms_s32_f64(double a) pure @safe; 1310 1311 pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtns.i32.f64") 1312 int vcvtns_s32_f64(double a) pure @safe; 1313 1314 pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtps.i32.f64") 1315 int vcvtps_s32_f64(double a) pure @safe; 1316 1317 pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtzs.i32.f64") 1318 int vcvts_s32_f64(double a) pure @safe; 1319 1320 pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtms.i64.f32") 1321 long vcvtms_s64_f32(float a) pure @safe; 1322 1323 pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtns.i64.f32") 1324 long vcvtns_s64_f32(float a) pure @safe; 1325 1326 pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtps.i64.f32") 1327 long vcvtps_s64_f32(float a) pure @safe; 1328 1329 pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtzs.i64.f32") 1330 long vcvts_s64_f32(float a) pure @safe; 1331 1332 pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtms.i64.f64") 1333 long vcvtms_s64_f64(double a) pure @safe; 1334 1335 pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtns.i64.f64") 1336 long vcvtns_s64_f64(double a) pure @safe; 1337 1338 pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtps.i64.f64") 1339 long vcvtps_s64_f64(double a) pure @safe; // Note: technically should be named vcvtpd_s64_f64 1340 1341 pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtzs.i64.f64") 1342 long vcvts_s64_f64(double a) pure @safe; 1343 1344 long2 vdupq_n_s64(long value) pure @safe 1345 { 1346 long2 r; 1347 r = value; 1348 return r; 1349 } 1350 1351 short4 vget_high_s16(short8 a) pure @trusted 1352 { 1353 short4 r; 1354 r.ptr[0] = a.array[4]; 1355 r.ptr[1] = a.array[5]; 1356 r.ptr[2] = a.array[6]; 1357 r.ptr[3] = a.array[7]; 1358 return r; 1359 } 1360 1361 int2 vget_high_s32(int4 a) pure @trusted 1362 { 1363 int2 r; 1364 r.ptr[0] = a.array[2]; 1365 r.ptr[1] = a.array[3]; 1366 return r; 1367 } 1368 1369 byte8 vget_high_u8(byte16 a) pure @trusted 1370 { 1371 byte8 r; 1372 r.ptr[0] = a.array[8]; 1373 r.ptr[1] = a.array[9]; 1374 r.ptr[2] = a.array[10]; 1375 r.ptr[3] = a.array[11]; 1376 r.ptr[4] = a.array[12]; 1377 r.ptr[5] = a.array[13]; 1378 r.ptr[6] = a.array[14]; 1379 r.ptr[7] = a.array[15]; 1380 return r; 1381 } 1382 1383 short4 vget_low_s16(short8 a) pure @trusted 1384 { 1385 short4 r; 1386 r.ptr[0] = a.array[0]; 1387 r.ptr[1] = a.array[1]; 1388 r.ptr[2] = a.array[2]; 1389 r.ptr[3] = a.array[3]; 1390 return r; 1391 } 1392 1393 int2 vget_low_s32(int4 a) pure @trusted 1394 { 1395 int2 r; 1396 r.ptr[0] = a.array[0]; 1397 r.ptr[1] = a.array[1]; 1398 return r; 1399 } 1400 1401 byte8 vget_low_u8(byte16 a) pure @trusted 1402 { 1403 byte8 r; 1404 r.ptr[0] = a.array[0]; 1405 r.ptr[1] = a.array[1]; 1406 r.ptr[2] = a.array[2]; 1407 r.ptr[3] = a.array[3]; 1408 r.ptr[4] = a.array[4]; 1409 r.ptr[5] = a.array[5]; 1410 r.ptr[6] = a.array[6]; 1411 r.ptr[7] = a.array[7]; 1412 return r; 1413 } 1414 1415 long vgetq_lane_s64(long2 v, const int lane) pure @safe 1416 { 1417 return v.array[lane]; 1418 } 1419 1420 pragma(LDC_intrinsic, "llvm.aarch64.neon.smax.v8i16") 1421 short8 vmaxq_s16(short8 a, short8 b) pure @safe; 1422 1423 int4 vmaxq_s32(int4 a, int4 b) 1424 { 1425 int4 r; 1426 r[0] = a[0] >= b[0] ? a[0] : b[0]; 1427 r[1] = a[1] >= b[1] ? a[1] : b[1]; 1428 r[2] = a[2] >= b[2] ? a[2] : b[2]; 1429 r[3] = a[3] >= b[3] ? a[3] : b[3]; 1430 return r; 1431 } 1432 1433 pragma(LDC_intrinsic, "llvm.aarch64.neon.smin.v8i16") 1434 short8 vminq_s16(short8 a, short8 b) pure @safe; 1435 1436 int2 vmovn_s64(long2 a) pure @trusted 1437 { 1438 int2 r; 1439 r.ptr[0] = cast(int)(a.array[0]); 1440 r.ptr[1] = cast(int)(a.array[1]); 1441 return r; 1442 } 1443 1444 int4 vmull_s16(short4 a, short4 b) pure @trusted 1445 { 1446 int4 r; 1447 r.ptr[0] = a.array[0] * b.array[0]; 1448 r.ptr[1] = a.array[1] * b.array[1]; 1449 r.ptr[2] = a.array[2] * b.array[2]; 1450 r.ptr[3] = a.array[3] * b.array[3]; 1451 return r; 1452 } 1453 1454 pragma(LDC_intrinsic, "llvm.aarch64.neon.smull.v2i64") 1455 long2 vmull_s32(int2 a, int2 b) pure @safe; 1456 1457 pragma(LDC_intrinsic, "llvm.aarch64.neon.addp.v4i16") 1458 short4 vpadd_s16(short4 a, short4 b) pure @safe; 1459 1460 pragma(LDC_intrinsic, "llvm.aarch64.neon.addp.v2i32") 1461 int2 vpadd_s32(int2 a, int2 b) pure @safe; 1462 1463 pragma(LDC_intrinsic, "llvm.aarch64.neon.addp.v8i8") 1464 byte8 vpadd_u8(byte8 a, byte8 b) pure @safe; 1465 1466 pragma(LDC_intrinsic, "llvm.aarch64.neon.uaddlp.v8i16.v16i8") 1467 short8 vpaddlq_u8 (byte16 a) pure @safe; 1468 1469 static if(__VERSION__ >= 2088) // LDC 1.18 start using LLVM9 who changes the name of the builtin 1470 { 1471 pragma(LDC_intrinsic, "llvm.aarch64.neon.faddp.v4f32") 1472 float4 vpaddq_f32(float4 a, float4 b) pure @safe; 1473 } 1474 else 1475 { 1476 pragma(LDC_intrinsic, "llvm.aarch64.neon.addp.v4f32") 1477 float4 vpaddq_f32(float4 a, float4 b) pure @safe; 1478 } 1479 1480 pragma(LDC_intrinsic, "llvm.aarch64.neon.addp.v8i16") 1481 short8 vpaddq_s16(short8 a, short8 b) pure @safe; 1482 1483 pragma(LDC_intrinsic, "llvm.aarch64.neon.addp.v16i8") 1484 byte16 vpaddq_s8(byte16 a, byte16 b) pure @safe; 1485 1486 pragma(LDC_intrinsic, "llvm.aarch64.neon.addp.v4i32") 1487 int4 vpaddq_s32(int4 a, int4 b) pure @safe; 1488 1489 pragma(LDC_intrinsic, "llvm.aarch64.neon.sqadd.v4i16") 1490 short4 vqadd_s16(short4 a, short4 b) pure @safe; 1491 1492 pragma(LDC_intrinsic, "llvm.aarch64.neon.sqadd.v8i16") 1493 short8 vqaddq_s16(short8 a, short8 b) pure @safe; 1494 1495 pragma(LDC_intrinsic, "llvm.aarch64.neon.sqxtn.v8i8") 1496 byte8 vqmovn_s16(short8 a) pure @safe; 1497 1498 pragma(LDC_intrinsic, "llvm.aarch64.neon.sqxtn.v4i16") 1499 short4 vqmovn_s32(int4 a) pure @safe; 1500 1501 pragma(LDC_intrinsic, "llvm.aarch64.neon.uqxtn.v4i16") 1502 short4 vqmovn_u32(int4 a) pure @safe; 1503 1504 pragma(LDC_intrinsic, "llvm.aarch64.neon.sqxtun.v8i8") 1505 byte8 vqmovun_s16(short8 a) pure @safe; 1506 1507 pragma(LDC_intrinsic, "llvm.aarch64.neon.sqsub.v4i16") 1508 short4 vqsub_s16(short4 a, short4 b) pure @safe; 1509 1510 pragma(LDC_intrinsic, "llvm.aarch64.neon.sqsub.v8i16") 1511 short8 vqsubq_s16(short8 a, short8 b) pure @safe; 1512 1513 pragma(LDC_intrinsic, "llvm.aarch64.neon.tbl1.v16i8") 1514 byte16 vqtbl1q_s8(byte16 t, byte16 idx) pure @safe; 1515 1516 pragma(LDC_intrinsic, "llvm.aarch64.neon.urhadd.v16i8") 1517 byte16 vrhadd_u8(byte16 a, byte16 b) pure @safe; 1518 1519 pragma(LDC_intrinsic, "llvm.aarch64.neon.urhadd.v8i16") 1520 short8 vrhadd_u16(short8 a, short8 b) pure @safe; 1521 1522 pragma(LDC_intrinsic, "llvm.aarch64.neon.rshrn.v4i16") 1523 short4 vrshrn_n_s32(int4 a, int n) pure @safe; 1524 1525 byte8 vshr_u8(byte8 a, byte8 b) pure @safe 1526 { 1527 return a >>> b; 1528 } 1529 1530 byte16 vshrq_n_s8(byte16 a, byte r) pure @safe 1531 { 1532 a = a >> byte16(cast(byte)r); 1533 return a; 1534 } 1535 1536 pragma(LDC_intrinsic, "llvm.aarch64.neon.tbl1.v8i8") 1537 byte8 vtbl1_s8(byte16 t, byte8 idx) pure @safe; 1538 } 1539 1540 version(unittest) 1541 { 1542 double abs_double(double x) @trusted 1543 { 1544 version(LDC) 1545 return llvm_fabs(x); 1546 else 1547 { 1548 long uf = *cast(long*)(&x); 1549 uf &= 0x7fffffff_ffffffff; 1550 return *cast(double*)(&uf); 1551 } 1552 } 1553 } 1554 1555 // needed because in old GDC from travis, core.stdc.math.isnan isn't pure 1556 1557 bool isnan(float x) pure @trusted 1558 { 1559 uint u = *cast(uint*)(&x); 1560 bool result = ((u & 0x7F800000) == 0x7F800000) && (u & 0x007FFFFF); 1561 return result; 1562 } 1563 unittest 1564 { 1565 float x = float.nan; 1566 assert(isnan(x)); 1567 1568 x = 0; 1569 assert(!isnan(x)); 1570 1571 x = float.infinity; 1572 assert(!isnan(x)); 1573 } 1574 1575 bool isnan(double x) pure @trusted 1576 { 1577 ulong u = *cast(ulong*)(&x); 1578 return ((u & 0x7FF00000_00000000) == 0x7FF00000_00000000) && (u & 0x000FFFFF_FFFFFFFF); 1579 } 1580 unittest 1581 { 1582 double x = double.nan; 1583 assert(isnan(x)); 1584 1585 x = 0; 1586 assert(!isnan(x)); 1587 1588 x = double.infinity; 1589 assert(!isnan(x)); 1590 }