/**
* Internal stuff only, do not import.
*
* Copyright: Copyright Guillaume Piolat 2016-2020, Stefanos Baziotis 2019.
* License: $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
*/
module inteli.internals;

import inteli.types;

// The only math functions needed for intel-intrinsics
public import core.math: sqrt; // since it's an intrinsics

// Everything below is package-visible, never throws, and never uses the GC.
package:
nothrow:
@nogc:


// The GDC_with_XXX enums tell, at compile time, which x86 instruction sets
// can be assumed when building with GDC. They are all false on other compilers.
version(GNU)
{
    version (X86)
    {
        // For 32-bit x86, disable vector extensions with GDC.
        // It just doesn't work well.
        enum GDC_with_x86 = true;
        enum GDC_with_MMX = false;
        enum GDC_with_SSE = false;
        enum GDC_with_SSE2 = false;
        enum GDC_with_SSE3 = false;
        enum GDC_with_SSSE3 = false;
        enum GDC_with_SSE41 = false;
        enum GDC_with_SSE42 = false;
        enum GDC_with_AVX = false;
        enum GDC_with_AVX2 = false;
        enum GDC_with_SHA = false;
        enum GDC_with_BMI2 = false;
    }
    else version (X86_64)
    {
        // GDC support uses extended inline assembly:
        //   https://gcc.gnu.org/onlinedocs/gcc/Extended-Asm.html (general information and hints)
        //   https://gcc.gnu.org/onlinedocs/gcc/Simple-Constraints.html (binding variables to registers)
        //   https://gcc.gnu.org/onlinedocs/gcc/Machine-Constraints.html (x86 specific register short names)

        public import core.simd: byte16, short8, int4, float4, double2;

        // NOTE: These intrinsics are not available in every i386 and x86_64 CPU.
        // For more info: https://gcc.gnu.org/onlinedocs/gcc-4.9.2/gcc/X86-Built-in-Functions.html
        public import gcc.builtins;

        // TODO: SSE and SSE2 should be truly optional instead, in the future, if we
        //       want to support other archs with GDC

        enum GDC_with_x86 = true;
        enum GDC_with_MMX = true;  // We don't have a way to detect that at CT, but we assume it's there
        enum GDC_with_SSE = true;  // We don't have a way to detect that at CT, but we assume it's there
        enum GDC_with_SSE2 = true; // We don't have a way to detect that at CT, but we assume it's there

        static if (__VERSION__ >= 2100) // Starting at GDC 12.1
        {
            // Each instruction set is detected by probing one of its builtins.
            enum GDC_with_SSE3 = __traits(compiles, __builtin_ia32_haddps);
            enum GDC_with_SSSE3 = __traits(compiles, __builtin_ia32_pmulhrsw128);
            enum GDC_with_SSE41 = __traits(compiles, __builtin_ia32_dpps);
            enum GDC_with_SSE42 = __traits(compiles, __builtin_ia32_pcmpgtq);
            enum GDC_with_AVX = __traits(compiles, __builtin_ia32_vbroadcastf128_pd256);
            enum GDC_with_AVX2 = __traits(compiles, __builtin_ia32_gathersiv2df);
            enum GDC_with_BMI2 = __traits(compiles, __builtin_ia32_pext_si);
        }
        else
        {
            // Before GCC 11.3, no reliable way to detect instruction sets.
            // We start above detection at GCC 12, with DMDFE 2.100, which
            // is more conservative.
            enum GDC_with_SSE3 = false;
            enum GDC_with_SSSE3 = false;
            enum GDC_with_SSE41 = false;
            enum GDC_with_SSE42 = false;
            enum GDC_with_AVX = false;
            enum GDC_with_AVX2 = false;
            enum GDC_with_BMI2 = false;
        }

        enum GDC_with_SHA = false; // TODO: detect that
    }
    else
    {
        // GDC targeting a non-x86 architecture: no x86 instruction set at all.
        enum GDC_with_x86 = false;
        enum GDC_with_MMX = false;
        enum GDC_with_SSE = false;
        enum GDC_with_SSE2 = false;
        enum GDC_with_SSE3 = false;
        enum GDC_with_SSSE3 = false;
        enum GDC_with_SSE41 = false;
        enum GDC_with_SSE42 = false;
        enum GDC_with_AVX = false;
        enum GDC_with_AVX2 = false;
        enum GDC_with_SHA = false;
        enum GDC_with_BMI2 = false;
    }
}
else
{
    // Not GDC: all GDC_with_XXX capability flags are false.
    enum GDC_with_x86 = false;
    enum GDC_with_MMX = false;
    enum GDC_with_SSE = false;
    enum GDC_with_SSE2 = false;
    enum GDC_with_SSE3 = false;
    enum GDC_with_SSSE3 = false;
    enum GDC_with_SSE41 = false;
    enum GDC_with_SSE42 = false;
    enum GDC_with_AVX = false;
    enum GDC_with_AVX2 = false;
    enum GDC_with_SHA = false;
    enum GDC_with_BMI2 = false;
}

// The LDC_with_XXX enums play the same role for LDC: which target features
// (x86 or ARM) can be assumed at compile time. All false on other compilers.
version(LDC)
{
    public import core.simd;
    public import ldc.simd;
    public import ldc.intrinsics;
    public import ldc.llvmasm: __asm;

    // Since LDC 1.13, using the new ldc.llvmasm.__ir variants instead of inlineIR
    static if (__VERSION__ >= 2083)
    {
        import ldc.llvmasm;
        alias LDCInlineIR = __ir_pure;

        // A version of inline IR with prefix/suffix didn't exist before LDC 1.13
        alias LDCInlineIREx = __irEx_pure;
    }
    else
    {
        alias LDCInlineIR = inlineIR;
    }

    version(ARM)
    {
        public import ldc.gccbuiltins_arm;
        enum LDC_with_ARM32 = true;
        enum LDC_with_ARM64 = false;
        enum LDC_with_ARM64_CRC = false;
        enum LDC_with_SSE = false;
        enum LDC_with_SSE2 = false;
        enum LDC_with_SSE3 = false;
        enum LDC_with_SSSE3 = false;
        enum LDC_with_SSE41 = false;
        enum LDC_with_SSE42 = false;
        enum LDC_with_AVX = false;
        enum LDC_with_AVX2 = false;
        enum LDC_with_SHA = false;
        enum LDC_with_BMI2 = false;
    }
    else version(AArch64)
    {
        public import ldc.gccbuiltins_aarch64;
        enum LDC_with_ARM32 = false;
        enum LDC_with_ARM64 = true; // implies "has Neon"
        enum LDC_with_ARM64_CRC = __traits(targetHasFeature, "crc");
        enum LDC_with_SSE = false;
        enum LDC_with_SSE2 = false;
        enum LDC_with_SSE3 = false;
        enum LDC_with_SSSE3 = false;
        enum LDC_with_SSE41 = false;
        enum LDC_with_SSE42 = false;
        enum LDC_with_AVX = false;
        enum LDC_with_AVX2 = false;
        enum LDC_with_SHA = false;
        enum LDC_with_BMI2 = false;
    }
    else
    {
        // x86/x86_64 target: query the LLVM target features directly.
        public import ldc.gccbuiltins_x86;
        enum LDC_with_ARM32 = false;
        enum LDC_with_ARM64 = false;
        enum LDC_with_ARM64_CRC = false;
        enum LDC_with_SSE = __traits(targetHasFeature, "sse");
        enum LDC_with_SSE2 = __traits(targetHasFeature, "sse2");
        enum LDC_with_SSE3 = __traits(targetHasFeature, "sse3");
        enum LDC_with_SSSE3 = __traits(targetHasFeature, "ssse3");
        enum LDC_with_SSE41 = __traits(targetHasFeature, "sse4.1");
        enum LDC_with_SSE42 = __traits(targetHasFeature, "sse4.2");
        enum LDC_with_AVX = __traits(targetHasFeature, "avx");
        enum LDC_with_AVX2 = __traits(targetHasFeature, "avx2");
        enum LDC_with_SHA = __traits(targetHasFeature, "sha");
        enum LDC_with_BMI2 = __traits(targetHasFeature, "bmi2");
    }
}
else
{
    // Not LDC: all LDC_with_XXX capability flags are false.
    enum LDC_with_ARM32 = false;
    enum LDC_with_ARM64 = false;
    enum LDC_with_ARM64_CRC = false;
    enum LDC_with_SSE = false;
    enum LDC_with_SSE2 = false;
    enum LDC_with_SSE3 = false;
    enum LDC_with_SSSE3 = false;
    enum LDC_with_SSE41 = false;
    enum LDC_with_SSE42 = false;
    enum LDC_with_AVX = false;
    enum LDC_with_AVX2 = false;
    enum LDC_with_SHA = false;
    enum LDC_with_BMI2 = false;
}

// True when LDC targets any ARM architecture, 32-bit or 64-bit.
enum LDC_with_ARM = LDC_with_ARM32 | LDC_with_ARM64;

version(DigitalMars)
{
version(D_InlineAsm_X86) 213 enum DMD_with_asm = true; 214 else version(D_InlineAsm_X86_64) 215 enum DMD_with_asm = true; 216 else 217 enum DMD_with_asm = false; 218 219 version(D_InlineAsm_X86) 220 enum DMD_with_32bit_asm = DMD_with_asm; // sometimes you want a 32-bit DMD only solution 221 else 222 enum DMD_with_32bit_asm = false; 223 224 version (D_SIMD) 225 { 226 enum DMD_with_DSIMD = !SSESizedVectorsAreEmulated; 227 228 // Going further, does DMD has SSE4.1 through -mcpu? 229 static if (DMD_with_DSIMD) 230 enum bool DMD_with_DSIMD_and_SSE41 = __traits(compiles, int4(0) * int4(0)); 231 else 232 enum bool DMD_with_DSIMD_and_SSE41 = false; 233 234 // No DMD way to detect those instruction sets => pessimize 235 // would be cool to have a way to detect support for this at CT 236 enum DMD_with_DSIMD_and_SSE3 = DMD_with_DSIMD_and_SSE41; 237 enum DMD_with_DSIMD_and_SSSE3 = DMD_with_DSIMD_and_SSE41; 238 239 version(D_AVX) 240 enum DMD_with_DSIMD_and_AVX = true; 241 else 242 enum DMD_with_DSIMD_and_AVX = false; 243 244 version(D_AVX2) 245 enum DMD_with_DSIMD_and_AVX2 = true; 246 else 247 enum DMD_with_DSIMD_and_AVX2 = false; 248 249 enum DMD_with_DSIMD_and_SSE42 = DMD_with_DSIMD_and_AVX; 250 } 251 else 252 { 253 enum DMD_with_DSIMD = false; 254 enum DMD_with_DSIMD_and_SSE3 = false; 255 enum DMD_with_DSIMD_and_SSSE3 = false; 256 enum DMD_with_DSIMD_and_SSE41 = false; 257 enum DMD_with_DSIMD_and_SSE42 = false; 258 enum DMD_with_DSIMD_and_AVX = false; 259 enum DMD_with_DSIMD_and_AVX2 = false; 260 } 261 } 262 else 263 { 264 enum DMD_with_asm = false; 265 enum DMD_with_32bit_asm = false; 266 enum DMD_with_DSIMD = false; 267 enum DMD_with_DSIMD_and_SSE3 = false; 268 enum DMD_with_DSIMD_and_SSSE3 = false; 269 enum DMD_with_DSIMD_and_SSE41 = false; 270 enum DMD_with_DSIMD_and_SSE42 = false; 271 enum DMD_with_DSIMD_and_AVX = false; 272 enum DMD_with_DSIMD_and_AVX2 = false; 273 } 274 275 276 // Sometimes, can be helpful to merge builtin code, however keep in mind that 277 // LDC 
and GDC builtins often subtly diverse, wrt. unsigned vs signed vectors, 278 // return types, purity... test it in Godbolt! this is safer with float and double intrinsics. 279 enum GDC_or_LDC_with_SSE = GDC_with_SSE || LDC_with_SSE; 280 enum GDC_or_LDC_with_SSE2 = GDC_with_SSE2 || LDC_with_SSE2; 281 enum GDC_or_LDC_with_SSE3 = GDC_with_SSE3 || LDC_with_SSE3; 282 283 enum GDC_or_LDC_with_AVX = GDC_with_AVX || LDC_with_AVX; 284 enum GDC_or_LDC_with_AVX2 = GDC_with_AVX2 || LDC_with_AVX2; 285 enum GDC_or_LDC_with_SHA = GDC_with_SHA || LDC_with_SHA; 286 enum GDC_or_LDC_with_BMI2 = GDC_with_BMI2 || LDC_with_BMI2; 287 288 289 static if (LDC_with_ARM32) 290 { 291 package uint arm_get_fpcr() nothrow @nogc @trusted 292 { 293 return __builtin_arm_get_fpscr(); 294 } 295 296 package void arm_set_fpcr(uint cw) nothrow @nogc @trusted 297 { 298 __builtin_arm_set_fpscr(cw); 299 } 300 } 301 302 static if (LDC_with_ARM64) 303 { 304 pragma(LDC_intrinsic, "llvm.aarch64.get.fpcr") 305 long __builtin_aarch64_get_fpcr() pure nothrow @nogc @safe; 306 307 package uint arm_get_fpcr() pure nothrow @nogc @trusted 308 { 309 // LLVM intrinsic "llvm.aarch64.get.fpcr" seems buggy and doesn't return FPCR 310 return __asm!uint("mrs $0, fpcr", "=r"); 311 } 312 313 package void arm_set_fpcr(uint cw) nothrow @nogc @trusted 314 { 315 // Note: there doesn't seem to be an intrinsic in LLVM to set FPCR. 316 long save_x2; 317 __asm!void("str x2, $1 \n" ~ 318 "ldr w2, $0 \n" ~ 319 "msr fpcr, x2 \n" ~ 320 "ldr x2, $1 " , "m,m", cw, &save_x2); 321 } 322 } 323 324 325 // For internal use only, since public API deals with a x86 semantic emulation 326 enum uint _MM_ROUND_NEAREST_ARM = 0x00000000; 327 enum uint _MM_ROUND_DOWN_ARM = 0x00800000; 328 enum uint _MM_ROUND_UP_ARM = 0x00400000; 329 enum uint _MM_ROUND_TOWARD_ZERO_ARM = 0x00C00000; 330 enum uint _MM_ROUND_MASK_ARM = 0x00C00000; 331 enum uint _MM_FLUSH_ZERO_MASK_ARM = 0x01000000; 332 333 334 // 335 // <ROUNDING> 336 // 337 // Why is that there? 
// For DMD, we cannot use rint because _MM_SET_ROUNDING_MODE
// doesn't change the FPU rounding mode, and isn't expected to do so.
// So we devised these rounding function to help having consistent rounding between
// LDC and DMD. It's important that DMD uses whatever is in MXCSR to round.
//
// Note: There is no MXCSR in ARM. But there is fpcr/fpscr that implements similar
// functionality.
// https://developer.arm.com/documentation/dui0068/b/vector-floating-point-programming/vfp-system-registers/fpscr--the-floating-point-status-and-control-register
// We use fpcr/fpscr since it's thread-local, so we can emulate those x86 conversion albeit slowly.

/// Converts `value` to a 32-bit integer, rounding with the mode currently
/// held in MXCSR (x86) or FPCR/FPSCR (ARM).
int convertFloatToInt32UsingMXCSR(float value) @trusted
{
    int result;
    version(GNU)
    {
        asm pure nothrow @nogc @trusted
        {
            "cvtss2si %1, %0\n": "=r"(result) : "x" (value);
        }
    }
    else static if (LDC_with_ARM32)
    {
        // vcvtr (with the 'r') converts using the current FPSCR rounding mode.
        result = __asm!int(`vldr s2, $1
                            vcvtr.s32.f32 s2, s2
                            vmov $0, s2`, "=r,m,~{s2}", value);
    }
    else static if (LDC_with_ARM64)
    {
        // Get current rounding mode.
        uint fpscr = arm_get_fpcr();

        // Pick the conversion intrinsic matching the current rounding mode.
        switch(fpscr & _MM_ROUND_MASK_ARM)
        {
            default:
            case _MM_ROUND_NEAREST_ARM:     result = vcvtns_s32_f32(value); break;
            case _MM_ROUND_DOWN_ARM:        result = vcvtms_s32_f32(value); break;
            case _MM_ROUND_UP_ARM:          result = vcvtps_s32_f32(value); break;
            case _MM_ROUND_TOWARD_ZERO_ARM: result = vcvts_s32_f32(value); break;
        }
    }
    else
    {
        asm pure nothrow @nogc @trusted
        {
            cvtss2si EAX, value;
            mov result, EAX;
        }
    }
    return result;
}

/// Converts `value` to a 32-bit integer, rounding with the mode currently
/// held in MXCSR (x86) or FPCR/FPSCR (ARM).
int convertDoubleToInt32UsingMXCSR(double value) @trusted
{
    int result;
    version(GNU)
    {
        asm pure nothrow @nogc @trusted
        {
            "cvtsd2si %1, %0\n": "=r"(result) : "x" (value);
        }
    }
    else static if (LDC_with_ARM32)
    {
        // vcvtr (with the 'r') converts using the current FPSCR rounding mode.
        result = __asm!int(`vldr d2, $1
                            vcvtr.s32.f64 s2, d2
                            vmov $0, s2`, "=r,m,~{s2},~{d2}", value);
    }
    else static if (LDC_with_ARM64)
    {
        // Get current rounding mode.
        uint fpscr = arm_get_fpcr();

        switch(fpscr & _MM_ROUND_MASK_ARM)
        {
            default:
            case _MM_ROUND_NEAREST_ARM:     result = vcvtns_s32_f64(value); break;
            case _MM_ROUND_DOWN_ARM:        result = vcvtms_s32_f64(value); break;
            case _MM_ROUND_UP_ARM:          result = vcvtps_s32_f64(value); break;
            case _MM_ROUND_TOWARD_ZERO_ARM: result = vcvts_s32_f64(value); break;
        }
    }
    else
    {
        asm pure nothrow @nogc @trusted
        {
            cvtsd2si EAX, value;
            mov result, EAX;
        }
    }
    return result;
}

/// Converts `value` to a 64-bit integer, rounding with the mode currently
/// held in MXCSR (x86) or FPCR/FPSCR (ARM).
long convertFloatToInt64UsingMXCSR(float value) @trusted
{
    static if (LDC_with_ARM32)
    {
        // We have to resort to libc since 32-bit ARM
        // doesn't seem to have 64-bit registers.

        uint fpscr = arm_get_fpcr(); // Get current rounding mode.

        // Note: converting to double precision else rounding could be different for large integers
        double asDouble = value;

        switch(fpscr & _MM_ROUND_MASK_ARM)
        {
            default:
            case _MM_ROUND_NEAREST_ARM:     return cast(long)(llvm_round(asDouble));
            case _MM_ROUND_DOWN_ARM:        return cast(long)(llvm_floor(asDouble));
            case _MM_ROUND_UP_ARM:          return cast(long)(llvm_ceil(asDouble));
            case _MM_ROUND_TOWARD_ZERO_ARM: return cast(long)(asDouble);
        }
    }
    else static if (LDC_with_ARM64)
    {
        uint fpscr = arm_get_fpcr();

        switch(fpscr & _MM_ROUND_MASK_ARM)
        {
            default:
            case _MM_ROUND_NEAREST_ARM:     return vcvtns_s64_f32(value);
            case _MM_ROUND_DOWN_ARM:        return vcvtms_s64_f32(value);
            case _MM_ROUND_UP_ARM:          return vcvtps_s64_f32(value);
            case _MM_ROUND_TOWARD_ZERO_ARM: return vcvts_s64_f32(value);
        }
    }
    // 64-bit can use an SSE instruction
    else version(D_InlineAsm_X86_64)
    {
        long result;
        version(LDC) // work-around for " Data definition directives inside inline asm are not supported yet."
        {
            asm pure nothrow @nogc @trusted
            {
                movss XMM0, value;
                cvtss2si RAX, XMM0;
                mov result, RAX;
            }
        }
        else
        {
            asm pure nothrow @nogc @trusted
            {
                movss XMM0, value;
                db 0xf3; db 0x48; db 0x0f; db 0x2d; db 0xc0; // cvtss2si RAX, XMM0 (DMD refuses to emit)
                mov result, RAX;
            }
        }
        return result;
    }
    else version(D_InlineAsm_X86)
    {
        // In the case of 32-bit x86 there is no SSE2 way to convert FP to 64-bit int
        // This leads to an unfortunate FPU sequence in every C++ compiler.
        // See: https://godbolt.org/z/vZym77

        // Get current MXCSR rounding
        uint sseRounding;
        ushort savedFPUCW;
        ushort newFPUCW;
        long result;
        asm pure nothrow @nogc @trusted
        {
            stmxcsr sseRounding;
            fld value;
            fnstcw savedFPUCW;
            mov AX, savedFPUCW;
            and AX, 0xf3ff; // clear FPU rounding bits
            movzx ECX, word ptr sseRounding;
            and ECX, 0x6000; // only keep SSE rounding bits
            shr ECX, 3;
            or AX, CX; // make a new control word for FPU with SSE bits
            mov newFPUCW, AX;
            fldcw newFPUCW;
            fistp qword ptr result; // convert, respecting MXCSR (but not other control word things)
            fldcw savedFPUCW;
        }
        return result;
    }
    else static if (GDC_with_x86)
    {
        version(X86_64) // 64-bit can just use the right instruction
        {
            static assert(GDC_with_SSE);
            __m128 A;
            A.ptr[0] = value;
            return __builtin_ia32_cvtss2si64 (A);
        }
        else version(X86) // 32-bit
        {
            // This is untested!
            uint sseRounding;
            ushort savedFPUCW;
            ushort newFPUCW;
            long result;
            asm pure nothrow @nogc @trusted
            {
                "stmxcsr %1;\n" ~
                "fld %2;\n" ~
                "fnstcw %3;\n" ~
                "movw %3, %%ax;\n" ~
                "andw $0xf3ff, %%ax;\n" ~
                "movzwl %1, %%ecx;\n" ~
                "andl $0x6000, %%ecx;\n" ~
                "shrl $3, %%ecx;\n" ~
                "orw %%cx, %%ax\n" ~
                "movw %%ax, %4;\n" ~
                "fldcw %4;\n" ~
                "fistpll %0;\n" ~
                "fldcw %3;\n"
                  : "=m"(result) // %0
                  : "m" (sseRounding),
                    "f" (value),
                    "m" (savedFPUCW),
                    "m" (newFPUCW)
                  : "eax", "ecx", "st";
            }
            return result;
        }
        else
            static assert(false);
    }
    else
        static assert(false);
}


/// Converts `value` to a 64-bit integer, rounding with the mode currently
/// held in MXCSR (x86) or FPCR/FPSCR (ARM).
long convertDoubleToInt64UsingMXCSR(double value) @trusted
{
    static if (LDC_with_ARM32)
    {
        // We have to resort to libc since 32-bit ARM
        // doesn't seem to have 64-bit registers.
        uint fpscr = arm_get_fpcr(); // Get current rounding mode.
        switch(fpscr & _MM_ROUND_MASK_ARM)
        {
            default:
            case _MM_ROUND_NEAREST_ARM:     return cast(long)(llvm_round(value));
            case _MM_ROUND_DOWN_ARM:        return cast(long)(llvm_floor(value));
            case _MM_ROUND_UP_ARM:          return cast(long)(llvm_ceil(value));
            case _MM_ROUND_TOWARD_ZERO_ARM: return cast(long)(value);
        }
    }
    else static if (LDC_with_ARM64)
    {
        // Get current rounding mode.
        uint fpscr = arm_get_fpcr();

        switch(fpscr & _MM_ROUND_MASK_ARM)
        {
            default:
            case _MM_ROUND_NEAREST_ARM:     return vcvtns_s64_f64(value);
            case _MM_ROUND_DOWN_ARM:        return vcvtms_s64_f64(value);
            case _MM_ROUND_UP_ARM:          return vcvtps_s64_f64(value);
            case _MM_ROUND_TOWARD_ZERO_ARM: return vcvts_s64_f64(value);
        }
    }
    // 64-bit can use an SSE instruction
    else version(D_InlineAsm_X86_64)
    {
        long result;
        version(LDC) // work-around for "Data definition directives inside inline asm are not supported yet."
        {
            asm pure nothrow @nogc @trusted
            {
                movsd XMM0, value;
                cvtsd2si RAX, XMM0;
                mov result, RAX;
            }
        }
        else
        {
            asm pure nothrow @nogc @trusted
            {
                movsd XMM0, value;
                db 0xf2; db 0x48; db 0x0f; db 0x2d; db 0xc0; // cvtsd2si RAX, XMM0 (DMD refuses to emit)
                mov result, RAX;
            }
        }
        return result;
    }
    else version(D_InlineAsm_X86)
    {
        // In the case of 32-bit x86 there is no SSE2 way to convert FP to 64-bit int
        // This leads to an unfortunate FPU sequence in every C++ compiler.
623 // See: https://godbolt.org/z/vZym77 624 625 // Get current MXCSR rounding 626 uint sseRounding; 627 ushort savedFPUCW; 628 ushort newFPUCW; 629 long result; 630 asm pure nothrow @nogc @trusted 631 { 632 stmxcsr sseRounding; 633 fld value; 634 fnstcw savedFPUCW; 635 mov AX, savedFPUCW; 636 and AX, 0xf3ff; 637 movzx ECX, word ptr sseRounding; 638 and ECX, 0x6000; 639 shr ECX, 3; 640 or AX, CX; 641 mov newFPUCW, AX; 642 fldcw newFPUCW; 643 fistp result; 644 fldcw savedFPUCW; 645 } 646 return result; 647 } 648 else static if (GDC_with_x86) 649 { 650 version(X86_64) 651 { 652 static assert(GDC_with_SSE2); 653 __m128d A; 654 A.ptr[0] = value; 655 return __builtin_ia32_cvtsd2si64 (A); 656 } 657 else 658 { 659 // This is untested! 660 uint sseRounding; 661 ushort savedFPUCW; 662 ushort newFPUCW; 663 long result; 664 asm pure nothrow @nogc @trusted 665 { 666 "stmxcsr %1;\n" ~ 667 "fld %2;\n" ~ 668 "fnstcw %3;\n" ~ 669 "movw %3, %%ax;\n" ~ 670 "andw $0xf3ff, %%ax;\n" ~ 671 "movzwl %1, %%ecx;\n" ~ 672 "andl $0x6000, %%ecx;\n" ~ 673 "shrl $3, %%ecx;\n" ~ 674 "orw %%cx, %%ax\n" ~ 675 "movw %%ax, %4;\n" ~ 676 "fldcw %4;\n" ~ 677 "fistpll %0;\n" ~ 678 "fldcw %3;\n" 679 : "=m"(result) // %0 680 : "m" (sseRounding), 681 "t" (value), 682 "m" (savedFPUCW), 683 "m" (newFPUCW) 684 : "eax", "ecx", "st"; 685 } 686 return result; 687 } 688 } 689 else 690 static assert(false); 691 } 692 693 // 694 // </ROUNDING> 695 // 696 697 698 // using the Intel terminology here 699 700 byte saturateSignedWordToSignedByte(short value) pure @safe 701 { 702 if (value > 127) value = 127; 703 if (value < -128) value = -128; 704 return cast(byte) value; 705 } 706 707 ubyte saturateSignedWordToUnsignedByte(short value) pure @safe 708 { 709 if (value > 255) value = 255; 710 if (value < 0) value = 0; 711 return cast(ubyte) value; 712 } 713 714 short saturateSignedIntToSignedShort(int value) pure @safe 715 { 716 if (value > 32767) value = 32767; 717 if (value < -32768) value = -32768; 718 return 
cast(short) value; 719 } 720 721 ushort saturateSignedIntToUnsignedShort(int value) pure @safe 722 { 723 if (value > 65535) value = 65535; 724 if (value < 0) value = 0; 725 return cast(ushort) value; 726 } 727 728 unittest // test saturate operations 729 { 730 assert( saturateSignedWordToSignedByte(32000) == 127); 731 assert( saturateSignedWordToUnsignedByte(32000) == 255); 732 assert( saturateSignedWordToSignedByte(-4000) == -128); 733 assert( saturateSignedWordToUnsignedByte(-4000) == 0); 734 assert( saturateSignedIntToSignedShort(32768) == 32767); 735 assert( saturateSignedIntToUnsignedShort(32768) == 32768); 736 assert( saturateSignedIntToSignedShort(-32769) == -32768); 737 assert( saturateSignedIntToUnsignedShort(-32769) == 0); 738 } 739 740 version(unittest) 741 { 742 // This is just for debugging tests 743 import core.stdc.stdio: printf; 744 745 // printing vectors for implementation 746 // Note: you can override `pure` within a `debug` clause 747 748 void _mm_print_pi64(__m64 v) @trusted 749 { 750 long1 vl = cast(long1)v; 751 printf("%lld\n", vl.array[0]); 752 } 753 754 void _mm_print_pi32(__m64 v) @trusted 755 { 756 int[2] C = (cast(int2)v).array; 757 printf("%d %d\n", C[0], C[1]); 758 } 759 760 void _mm_print_pi16(__m64 v) @trusted 761 { 762 short[4] C = (cast(short4)v).array; 763 printf("%d %d %d %d\n", C[0], C[1], C[2], C[3]); 764 } 765 766 void _mm_print_pi8(__m64 v) @trusted 767 { 768 byte[8] C = (cast(byte8)v).array; 769 printf("%d %d %d %d %d %d %d %d\n", 770 C[0], C[1], C[2], C[3], C[4], C[5], C[6], C[7]); 771 } 772 773 void _mm_print_epi64(__m128i v) @trusted 774 { 775 long2 vl = cast(long2)v; 776 printf("%lld %lld\n", vl.array[0], vl.array[1]); 777 } 778 779 void _mm_print_epi32(__m128i v) @trusted 780 { 781 printf("%d %d %d %d\n", 782 v.array[0], v.array[1], v.array[2], v.array[3]); 783 } 784 785 void _mm_print_epi16(__m128i v) @trusted 786 { 787 short[8] C = (cast(short8)v).array; 788 printf("%d %d %d %d %d %d %d %d\n", 789 C[0], C[1], C[2], 
C[3], C[4], C[5], C[6], C[7]); 790 } 791 792 void _mm_print_epi8(__m128i v) @trusted 793 { 794 byte[16] C = (cast(byte16)v).array; 795 printf("%d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d\n", 796 C[0], C[1], C[2], C[3], C[4], C[5], C[6], C[7], C[8], C[9], C[10], C[11], C[12], C[13], C[14], C[15]); 797 } 798 799 void _mm_print_ps(__m128 v) @trusted 800 { 801 // %g because %f can conceal very small numbers and prints zero instead 802 float[4] C = (cast(float4)v).array; 803 printf("%g %g %g %g\n", C[0], C[1], C[2], C[3]); 804 } 805 806 void _mm_print_pd(__m128d v) @trusted 807 { 808 double[2] C = (cast(double2)v).array; 809 printf("%f %f\n", C[0], C[1]); 810 } 811 812 void _mm256_print_pd(__m256d v) @trusted 813 { 814 // %g because %f can conceal very small numbers and prints zero instead 815 printf("%g %g %g %g\n", v.array[0], v.array[1], v.array[2], v.array[3]); 816 } 817 818 void _mm256_print_ps(__m256 v) @trusted 819 { 820 // %g because %f can conceal very small numbers and prints zero instead 821 printf("%g %g %g %g %g %g %g %g\n", 822 v.array[0], v.array[1], v.array[2], v.array[3], 823 v.array[4], v.array[5], v.array[6], v.array[7]); 824 } 825 826 void _mm256_print_epi32(__m256i v) @trusted 827 { 828 int8 vl = cast(int8)v; 829 printf("%d %d %d %d %d %d %d %d\n", vl.array[0], vl.array[1], vl.array[2], vl.array[3], 830 vl.array[4], vl.array[5], vl.array[6], vl.array[7]); 831 } 832 833 void _mm256_print_epi64(__m256i v) @trusted 834 { 835 long4 vl = cast(long4)v; 836 printf("%lld %lld %lld %lld\n", vl.array[0], vl.array[1], vl.array[2], vl.array[3]); 837 } 838 } 839 840 841 // 842 // <FLOATING-POINT COMPARISONS> 843 // 844 // Note: `ldc.simd` cannot express all nuances of FP comparisons, so we 845 // need different IR generation. 

/// The LLVM fcmp predicates; used to instantiate the comparison templates below.
enum FPComparison
{
    oeq, // ordered and equal
    ogt, // ordered and greater than
    oge, // ordered and greater than or equal
    olt, // ordered and less than
    ole, // ordered and less than or equal
    one, // ordered and not equal
    ord, // ordered (no nans)
    ueq, // unordered or equal
    ugt, // unordered or greater than ("nle")
    uge, // unordered or greater than or equal ("nlt")
    ult, // unordered or less than ("nge")
    ule, // unordered or less than or equal ("ngt")
    une, // unordered or not equal ("neq")
    uno, // unordered (either nans)
}

private static immutable string[FPComparison.max+1] FPComparisonToString =
[
    "oeq",
    "ogt",
    "oge",
    "olt",
    "ole",
    "one",
    "ord",
    "ueq",
    "ugt",
    "uge",
    "ult",
    "ule",
    "une",
    "uno",
];

// Individual float comparison: returns -1 for true or 0 for false.
// Useful for DMD and testing
// NOTE(review): relies on `isnan`, which is defined elsewhere in this module.
private bool compareFloat(T)(FPComparison comparison, T a, T b) pure @safe
{
    bool unordered = isnan(a) || isnan(b);
    final switch(comparison) with(FPComparison)
    {
        case oeq: return a == b;
        case ogt: return a > b;
        case oge: return a >= b;
        case olt: return a < b;
        case ole: return a <= b;
        case one: return !unordered && (a != b); // NaN with != always yields true
        case ord: return !unordered;
        case ueq: return unordered || (a == b);
        case ugt: return unordered || (a > b);
        case uge: return unordered || (a >= b);
        case ult: return unordered || (a < b);
        case ule: return unordered || (a <= b);
        case une: return (a != b); // NaN with != always yields true
        case uno: return unordered;
    }
}

version(LDC)
{
    /// Provides packed float comparisons
    package int4 cmpps(FPComparison comparison)(float4 a, float4 b) pure @safe
    {
        enum ir = `
            %cmp = fcmp `~ FPComparisonToString[comparison] ~` <4 x float> %0, %1
            %r = sext <4 x i1> %cmp to <4 x i32>
            ret <4 x i32> %r`;

        return LDCInlineIR!(ir, int4, float4, float4)(a, b);
    }

    /// Provides packed double comparisons
    package long2 cmppd(FPComparison comparison)(double2 a, double2 b) pure @safe
    {
        enum ir = `
            %cmp = fcmp `~ FPComparisonToString[comparison] ~` <2 x double> %0, %1
            %r = sext <2 x i1> %cmp to <2 x i64>
            ret <2 x i64> %r`;

        return LDCInlineIR!(ir, long2, double2, double2)(a, b);
    }

    /// CMPSS-style comparisons
    /// clang implement it through x86 intrinsics, it is possible with IR alone
    /// but leads to less optimal code.
    /// PERF: try to implement it with __builtin_ia32_cmpss and immediate 0 to 7.
    /// Not that simple.
    package float4 cmpss(FPComparison comparison)(float4 a, float4 b) pure @safe
    {
        /*
        enum ubyte predicateNumber = FPComparisonToX86Predicate[comparison];
        enum bool invertOp = (predicateNumber & 0x80) != 0;
        static if(invertOp)
            return __builtin_ia32_cmpsd(b, a, predicateNumber & 0x7f);
        else
            return __builtin_ia32_cmpsd(a, b, predicateNumber & 0x7f);
        */
        enum ir = `
            %cmp = fcmp `~ FPComparisonToString[comparison] ~` float %0, %1
            %r = sext i1 %cmp to i32
            %r2 = bitcast i32 %r to float
            ret float %r2`;

        float4 r = a;
        r[0] = LDCInlineIR!(ir, float, float, float)(a[0], b[0]);
        return r;
    }

    /// CMPSD-style comparisons
    /// clang implement it through x86 intrinsics, it is possible with IR alone
    /// but leads to less optimal code.
    /// PERF: try to implement it with __builtin_ia32_cmpsd and immediate 0 to 7.
    /// Not that simple.
    package double2 cmpsd(FPComparison comparison)(double2 a, double2 b) pure @safe
    {
        enum ir = `
            %cmp = fcmp `~ FPComparisonToString[comparison] ~` double %0, %1
            %r = sext i1 %cmp to i64
            %r2 = bitcast i64 %r to double
            ret double %r2`;

        double2 r = a;
        r[0] = LDCInlineIR!(ir, double, double, double)(a[0], b[0]);
        return r;
    }
}
else
{
    // Non-LDC compilers: scalar emulation through compareFloat.

    /// Provides packed float comparisons
    package int4 cmpps(FPComparison comparison)(float4 a, float4 b) pure @trusted
    {
        int4 result;
        foreach(i; 0..4)
        {
            result.ptr[i] = compareFloat!float(comparison, a.array[i], b.array[i]) ? -1 : 0;
        }
        return result;
    }

    /// Provides packed double comparisons
    package long2 cmppd(FPComparison comparison)(double2 a, double2 b) pure @trusted
    {
        long2 result;
        foreach(i; 0..2)
        {
            result.ptr[i] = compareFloat!double(comparison, a.array[i], b.array[i]) ? -1 : 0;
        }
        return result;
    }

    /// Provides CMPSS-style comparison
    package float4 cmpss(FPComparison comparison)(float4 a, float4 b) pure @trusted
    {
        int4 result = cast(int4)a;
        result.ptr[0] = compareFloat!float(comparison, a.array[0], b.array[0]) ? -1 : 0;
        return cast(float4)result;
    }

    /// Provides CMPSD-style comparison
    package double2 cmpsd(FPComparison comparison)(double2 a, double2 b) pure @trusted
    {
        long2 result = cast(long2)a;
        result.ptr[0] = compareFloat!double(comparison, a.array[0], b.array[0]) ? -1 : 0;
        return cast(double2)result;
    }
}
unittest // cmpps
{
    // Check all comparison type is working
    float4 A = [1, 3, 5, float.nan];
    float4 B = [2, 3, 4, 5];

    int4 result_oeq = cmpps!(FPComparison.oeq)(A, B);
    int4 result_ogt = cmpps!(FPComparison.ogt)(A, B);
    int4 result_oge = cmpps!(FPComparison.oge)(A, B);
    int4 result_olt = cmpps!(FPComparison.olt)(A, B);
    int4 result_ole = cmpps!(FPComparison.ole)(A, B);
    int4 result_one = cmpps!(FPComparison.one)(A, B);
    int4 result_ord = cmpps!(FPComparison.ord)(A, B);
    int4 result_ueq = cmpps!(FPComparison.ueq)(A, B);
    int4 result_ugt = cmpps!(FPComparison.ugt)(A, B);
    int4 result_uge = cmpps!(FPComparison.uge)(A, B);
    int4 result_ult = cmpps!(FPComparison.ult)(A, B);
    int4 result_ule = cmpps!(FPComparison.ule)(A, B);
    int4 result_une = cmpps!(FPComparison.une)(A, B);
    int4 result_uno = cmpps!(FPComparison.uno)(A, B);

    static immutable int[4] correct_oeq = [ 0,-1, 0, 0];
    static immutable int[4] correct_ogt = [ 0, 0,-1, 0];
    static immutable int[4] correct_oge = [ 0,-1,-1, 0];
    static immutable int[4] correct_olt = [-1, 0, 0, 0];
    static immutable int[4] correct_ole = [-1,-1, 0, 0];
    static immutable int[4] correct_one = [-1, 0,-1, 0];
    static immutable int[4] correct_ord = [-1,-1,-1, 0];
    static immutable int[4] correct_ueq = [ 0,-1, 0,-1];
    static immutable int[4] correct_ugt = [ 0, 0,-1,-1];
    static immutable int[4] correct_uge = [ 0,-1,-1,-1];
    static immutable int[4] correct_ult = [-1, 0, 0,-1];
    static immutable int[4] correct_ule = [-1,-1, 0,-1];
    static immutable int[4] correct_une = [-1, 0,-1,-1];
    static immutable int[4] correct_uno = [ 0, 0, 0,-1];

    assert(result_oeq.array == correct_oeq);
    assert(result_ogt.array == correct_ogt);
    assert(result_oge.array == correct_oge);
    assert(result_olt.array == correct_olt);
    assert(result_ole.array == correct_ole);
    assert(result_one.array == correct_one);
    assert(result_ord.array == correct_ord);
    assert(result_ueq.array == correct_ueq);
    assert(result_ugt.array == correct_ugt);
    assert(result_uge.array == correct_uge);
    assert(result_ult.array == correct_ult);
    assert(result_ule.array == correct_ule);
    assert(result_une.array == correct_une);
    assert(result_uno.array == correct_uno);
}
unittest
{
    double2 a = [1, 3];
    double2 b = [2, 3];
    long2 c = cmppd!(FPComparison.ult)(a, b);
    static immutable long[2] correct = [cast(long)(-1), 0];
    assert(c.array == correct);
}
unittest // cmpss
{
    void testComparison(FPComparison comparison)(float4 A, float4 B)
    {
        float4 result = cmpss!comparison(A, B);
        int4 iresult = cast(int4)result;
        int expected = compareFloat!float(comparison, A.array[0], B.array[0]) ? -1 : 0;
        assert(iresult.array[0] == expected);
        assert(result.array[1] == A.array[1]);
        assert(result.array[2] == A.array[2]);
        assert(result.array[3] == A.array[3]);
    }

    // Check all comparison type is working
    float4 A = [1, 3, 5, 6];
    float4 B = [2, 3, 4, 5];
    float4 C = [float.nan, 3, 4, 5];

    testComparison!(FPComparison.oeq)(A, B);
    testComparison!(FPComparison.oeq)(A, C);
    testComparison!(FPComparison.ogt)(A, B);
    testComparison!(FPComparison.ogt)(A, C);
    testComparison!(FPComparison.oge)(A, B);
    testComparison!(FPComparison.oge)(A, C);
    testComparison!(FPComparison.olt)(A, B);
    testComparison!(FPComparison.olt)(A, C);
    testComparison!(FPComparison.ole)(A, B);
    testComparison!(FPComparison.ole)(A, C);
    testComparison!(FPComparison.one)(A, B);
    testComparison!(FPComparison.one)(A, C);
    testComparison!(FPComparison.ord)(A, B);
    testComparison!(FPComparison.ord)(A, C);
    testComparison!(FPComparison.ueq)(A, B);
    testComparison!(FPComparison.ueq)(A, C);
    testComparison!(FPComparison.ugt)(A, B);
    testComparison!(FPComparison.ugt)(A, C);
    testComparison!(FPComparison.uge)(A, B);
    testComparison!(FPComparison.uge)(A, C);
    testComparison!(FPComparison.ult)(A, B);
    testComparison!(FPComparison.ult)(A, C);
    testComparison!(FPComparison.ule)(A, B);
    testComparison!(FPComparison.ule)(A, C);
    testComparison!(FPComparison.une)(A, B);
    testComparison!(FPComparison.une)(A, C);
    testComparison!(FPComparison.uno)(A, B);
    testComparison!(FPComparison.uno)(A, C);
}
unittest // cmpsd
{
    void testComparison(FPComparison comparison)(double2 A, double2 B)
    {
        double2 result = cmpsd!comparison(A, B);
        long2 iresult = cast(long2)result;
        long expected = compareFloat!double(comparison, A.array[0], B.array[0]) ? -1 : 0;
        assert(iresult.array[0] == expected);
        assert(result.array[1] == A.array[1]);
    }

    // Check all comparison type is working
    double2 A = [1, 3];
    double2 B = [2, 4];
    double2 C = [double.nan, 5];

    testComparison!(FPComparison.oeq)(A, B);
    testComparison!(FPComparison.oeq)(A, C);
    testComparison!(FPComparison.ogt)(A, B);
    testComparison!(FPComparison.ogt)(A, C);
    testComparison!(FPComparison.oge)(A, B);
    testComparison!(FPComparison.oge)(A, C);
    testComparison!(FPComparison.olt)(A, B);
    testComparison!(FPComparison.olt)(A, C);
    testComparison!(FPComparison.ole)(A, B);
    testComparison!(FPComparison.ole)(A, C);
    testComparison!(FPComparison.one)(A, B);
    testComparison!(FPComparison.one)(A, C);
    testComparison!(FPComparison.ord)(A, B);
    testComparison!(FPComparison.ord)(A, C);
    testComparison!(FPComparison.ueq)(A, B);
    testComparison!(FPComparison.ueq)(A, C);
    testComparison!(FPComparison.ugt)(A, B);
    testComparison!(FPComparison.ugt)(A, C);
    testComparison!(FPComparison.uge)(A, B);
    testComparison!(FPComparison.uge)(A, C);
    testComparison!(FPComparison.ult)(A, B);
    testComparison!(FPComparison.ult)(A, C);
    testComparison!(FPComparison.ule)(A, B);
    testComparison!(FPComparison.ule)(A, C);
    testComparison!(FPComparison.une)(A, B);
    testComparison!(FPComparison.une)(A, C);
    testComparison!(FPComparison.uno)(A, B);
    testComparison!(FPComparison.uno)(A, C);
}

//
// </FLOATING-POINT COMPARISONS>
//


/// Keeps the low 64 bits of a 128-bit vector.
__m64 to_m64(__m128i a) pure @trusted
{
    long2 la = cast(long2)a;
    long1 r = la.array[0];
    return r;
}

/// Zero-extends a 64-bit vector into the low half of a 128-bit vector.
__m128i to_m128i(__m64 a) pure @trusted
{
    /* Not sufficient to avoid https://issues.dlang.org/show_bug.cgi?id=21474

    version(DigitalMars) // Workaround for https://issues.dlang.org/show_bug.cgi?id=21474
    {
        long2 r = a.array[0];
        r.ptr[1] = 0;
        return cast(int4)r;
    }
    else */
    {
        long2 r = [0, 0];
        r.ptr[0] = a.array[0];
        return cast(__m128i)r;
    }
}

// ADDITIONAL x86 INTRINSICS
// Absent from ldc.gccbuiltins_x86 for some reason, but needed.
// https://github.com/ldc-developers/llvm-project/blob/ldc-release/12.x/llvm/include/llvm/IR/IntrinsicsX86.td
static if (LDC_with_SSE41)
{
    pragma(LDC_intrinsic, "llvm.x86.sse41.pblendvb")
    byte16 __builtin_ia32_pblendvb(byte16, byte16, byte16) pure @safe;
}

// SOME NEON INTRINSICS
// Emulating some x86 intrinsics needs access to a range of ARM intrinsics.
// Not in the public API but the simde project expose it all for the user to use.
// MAYDO: create a new neon.d module, for internal use only.
// MAYDO: port them to ARM32 so that ARM32 can be as fast as ARM64.
static if (LDC_with_ARM64)
{
    // AArch64 NEON intrinsic bindings and small wrappers used to emulate x86
    // intrinsics on ARM64 with LDC.
    // VERY USEFUL LINK
    // https://github.com/ldc-developers/llvm-project/blob/ldc-release/11.x/llvm/include/llvm/IR/IntrinsicsAArch64.td
    // Also: https://developer.arm.com/architectures/instruction-sets/intrinsics/

    // Scalar CRC32C (Castagnoli) accumulation: byte, halfword, word variants.
    pragma(LDC_intrinsic, "llvm.aarch64.crc32cb")
    uint __crc32cb(uint a, uint b) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.crc32ch")
    uint __crc32ch(uint a, uint b) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.crc32cw")
    uint __crc32cw(uint a, uint b) pure @safe;

    // Bound to crc32cx, which takes a 64-bit data operand, hence `ulong b`.
    pragma(LDC_intrinsic, "llvm.aarch64.crc32cx")
    uint __crc32cd(uint a, ulong b) pure @safe;

    //pragma(LDC_intrinsic, "llvm.aarch64.dmb")
    //    uint __dmb(int a) @safe; // didn't found a name in intrinsic list

    // Unsigned absolute difference, 16 x u8.
    pragma(LDC_intrinsic, "llvm.aarch64.neon.uabd.v16i8")
    byte16 vabdq_u8(byte16 a, byte16 b) pure @safe;

    // Signed absolute value, per element.
    pragma(LDC_intrinsic, "llvm.aarch64.neon.abs.v8i16")
    short8 vabsq_s16(short8 a) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.abs.v4i32")
    int4 vabsq_s32(int4 a) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.abs.v16i8")
    byte16 vabsq_s8(byte16 a) pure @safe;

    /// Bitwise AND, 8 x u8.
    byte8 vand_u8(byte8 a, byte8 b) pure @safe
    {
        return a & b;
    }

    /// Bitwise AND, 2 x s64.
    /// Fix: was missing `pure @safe`, unlike every sibling wrapper in this
    /// section; the body is a pure, safe vector expression.
    long2 vandq_s64(long2 a, long2 b) pure @safe
    {
        return a & b;
    }

    /// Bit clear: a AND NOT b, 2 x s64.
    long2 vbicq_s64(long2 a, long2 b) pure @safe
    {
        return a & ~b;
    }

    /// Bitwise select: picks bits of b where a is set, bits of c elsewhere.
    int4 vbslq_s32(int4 a, int4 b, int4 c) pure @safe
    {
        return c ^ ((c ^ b) & a);
    }

    /// Bitwise select (byte lanes), same formula as vbslq_s32.
    byte16 vbslq_s8(byte16 a, byte16 b, byte16 c) pure @safe
    {
        return c ^ ((c ^ b) & a);
    }

    /// Bitwise select (64-bit lanes), same formula as vbslq_s32.
    long2 vbslq_s64(long2 a, long2 b, long2 c) pure @safe
    {
        return c ^ ((c ^ b) & a);
    }

    /// Concatenates two 64-bit halves into one 128-bit vector (lo then hi).
    short8 vcombine_s16(short4 lo, short4 hi) pure @trusted
    {
        short8 r;
        r.ptr[0] = lo.array[0];
        r.ptr[1] = lo.array[1];
        r.ptr[2] = lo.array[2];
        r.ptr[3] = lo.array[3];
        r.ptr[4] = hi.array[0];
        r.ptr[5] = hi.array[1];
        r.ptr[6] = hi.array[2];
        r.ptr[7] = hi.array[3];
        return r;
    }

    /// Concatenates two 2 x s32 halves into one 4 x s32 vector.
    int4 vcombine_s32(int2 lo, int2 hi) pure @trusted
    {
        int4 r;
        r.ptr[0] = lo.array[0];
        r.ptr[1] = lo.array[1];
        r.ptr[2] = hi.array[0];
        r.ptr[3] = hi.array[1];
        return r;
    }

    /// Concatenates two 8 x s8 halves into one 16 x s8 vector.
    byte16 vcombine_s8(byte8 lo, byte8 hi) pure @trusted
    {
        byte16 r;
        r.ptr[0] = lo.array[0];
        r.ptr[1] = lo.array[1];
        r.ptr[2] = lo.array[2];
        r.ptr[3] = lo.array[3];
        r.ptr[4] = lo.array[4];
        r.ptr[5] = lo.array[5];
        r.ptr[6] = lo.array[6];
        r.ptr[7] = lo.array[7];
        r.ptr[8] = hi.array[0];
        r.ptr[9] = hi.array[1];
        r.ptr[10] = hi.array[2];
        r.ptr[11] = hi.array[3];
        r.ptr[12] = hi.array[4];
        r.ptr[13] = hi.array[5];
        r.ptr[14] = hi.array[6];
        r.ptr[15] = hi.array[7];
        return r;
    }

    /// Same as vcombine_s16 (bit pattern is identical for signed/unsigned).
    short8 vcombine_u16(short4 lo, short4 hi) pure @trusted
    {
        short8 r;
        r.ptr[0] = lo.array[0];
        r.ptr[1] = lo.array[1];
        r.ptr[2] = lo.array[2];
        r.ptr[3] = lo.array[3];
        r.ptr[4] = hi.array[0];
        r.ptr[5] = hi.array[1];
        r.ptr[6] = hi.array[2];
        r.ptr[7] = hi.array[3];
        return r;
    }


    // float4 => int4 conversions with explicit rounding:
    // fcvtms = toward -inf, fcvtns = to nearest, fcvtps = toward +inf,
    // fcvtzs = toward zero.

    pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtms.v4i32.v4f32")
    int4 vcvtmq_s32_f32(float4 a) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtns.v4i32.v4f32")
    int4 vcvtnq_s32_f32(float4 a) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtps.v4i32.v4f32")
    int4 vcvtpq_s32_f32(float4 a) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtzs.v4i32.v4f32")
    int4 vcvtzq_s32_f32(float4 a) pure @safe;


    // double2 => long2, same rounding-mode naming scheme.

    pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtms.v2i64.v2f64")
    long2 vcvtmq_s64_f64(double2 a) pure @safe;

pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtns.v2i64.v2f64") 1356 long2 vcvtnq_s64_f64(double2 a) pure @safe; 1357 1358 pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtps.v2i64.v2f64") 1359 long2 vcvtpq_s64_f64(double2 a) pure @safe; 1360 1361 pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtzs.v2i64.v2f64") 1362 long2 vcvtzq_s64_f64(double2 a) pure @safe; 1363 1364 pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtms.i32.f32") 1365 int vcvtms_s32_f32(float a) pure @safe; 1366 1367 pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtns.i32.f32") 1368 int vcvtns_s32_f32(float a) pure @safe; 1369 1370 pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtps.i32.f32") 1371 int vcvtps_s32_f32(float a) pure @safe; 1372 1373 pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtzs.i32.f32") 1374 int vcvts_s32_f32(float a) pure @safe; 1375 1376 pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtms.i32.f64") 1377 int vcvtms_s32_f64(double a) pure @safe; 1378 1379 pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtns.i32.f64") 1380 int vcvtns_s32_f64(double a) pure @safe; 1381 1382 pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtps.i32.f64") 1383 int vcvtps_s32_f64(double a) pure @safe; 1384 1385 pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtzs.i32.f64") 1386 int vcvts_s32_f64(double a) pure @safe; 1387 1388 pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtms.i64.f32") 1389 long vcvtms_s64_f32(float a) pure @safe; 1390 1391 pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtns.i64.f32") 1392 long vcvtns_s64_f32(float a) pure @safe; 1393 1394 pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtps.i64.f32") 1395 long vcvtps_s64_f32(float a) pure @safe; 1396 1397 pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtzs.i64.f32") 1398 long vcvts_s64_f32(float a) pure @safe; 1399 1400 pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtms.i64.f64") 1401 long vcvtms_s64_f64(double a) pure @safe; 1402 1403 pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtns.i64.f64") 1404 long vcvtns_s64_f64(double a) pure @safe; 1405 1406 pragma(LDC_intrinsic, 
"llvm.aarch64.neon.fcvtps.i64.f64") 1407 long vcvtps_s64_f64(double a) pure @safe; // Note: technically should be named vcvtpd_s64_f64 1408 1409 pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtzs.i64.f64") 1410 long vcvts_s64_f64(double a) pure @safe; 1411 1412 long2 vdupq_n_s64(long value) pure @safe 1413 { 1414 long2 r; 1415 r = value; 1416 return r; 1417 } 1418 1419 short4 vget_high_s16(short8 a) pure @trusted 1420 { 1421 short4 r; 1422 r.ptr[0] = a.array[4]; 1423 r.ptr[1] = a.array[5]; 1424 r.ptr[2] = a.array[6]; 1425 r.ptr[3] = a.array[7]; 1426 return r; 1427 } 1428 1429 int2 vget_high_s32(int4 a) pure @trusted 1430 { 1431 int2 r; 1432 r.ptr[0] = a.array[2]; 1433 r.ptr[1] = a.array[3]; 1434 return r; 1435 } 1436 1437 byte8 vget_high_u8(byte16 a) pure @trusted 1438 { 1439 byte8 r; 1440 r.ptr[0] = a.array[8]; 1441 r.ptr[1] = a.array[9]; 1442 r.ptr[2] = a.array[10]; 1443 r.ptr[3] = a.array[11]; 1444 r.ptr[4] = a.array[12]; 1445 r.ptr[5] = a.array[13]; 1446 r.ptr[6] = a.array[14]; 1447 r.ptr[7] = a.array[15]; 1448 return r; 1449 } 1450 1451 short4 vget_low_s16(short8 a) pure @trusted 1452 { 1453 short4 r; 1454 r.ptr[0] = a.array[0]; 1455 r.ptr[1] = a.array[1]; 1456 r.ptr[2] = a.array[2]; 1457 r.ptr[3] = a.array[3]; 1458 return r; 1459 } 1460 1461 int2 vget_low_s32(int4 a) pure @trusted 1462 { 1463 int2 r; 1464 r.ptr[0] = a.array[0]; 1465 r.ptr[1] = a.array[1]; 1466 return r; 1467 } 1468 1469 byte8 vget_low_u8(byte16 a) pure @trusted 1470 { 1471 byte8 r; 1472 r.ptr[0] = a.array[0]; 1473 r.ptr[1] = a.array[1]; 1474 r.ptr[2] = a.array[2]; 1475 r.ptr[3] = a.array[3]; 1476 r.ptr[4] = a.array[4]; 1477 r.ptr[5] = a.array[5]; 1478 r.ptr[6] = a.array[6]; 1479 r.ptr[7] = a.array[7]; 1480 return r; 1481 } 1482 1483 long vgetq_lane_s64(long2 v, const int lane) pure @safe 1484 { 1485 return v.array[lane]; 1486 } 1487 1488 pragma(LDC_intrinsic, "llvm.aarch64.neon.smax.v8i16") 1489 short8 vmaxq_s16(short8 a, short8 b) pure @safe; 1490 1491 int4 vmaxq_s32(int4 a, int4 b) 1492 { 1493 
int4 r; 1494 r[0] = a[0] >= b[0] ? a[0] : b[0]; 1495 r[1] = a[1] >= b[1] ? a[1] : b[1]; 1496 r[2] = a[2] >= b[2] ? a[2] : b[2]; 1497 r[3] = a[3] >= b[3] ? a[3] : b[3]; 1498 return r; 1499 } 1500 1501 pragma(LDC_intrinsic, "llvm.aarch64.neon.smin.v8i16") 1502 short8 vminq_s16(short8 a, short8 b) pure @safe; 1503 1504 int2 vmovn_s64(long2 a) pure @trusted 1505 { 1506 int2 r; 1507 r.ptr[0] = cast(int)(a.array[0]); 1508 r.ptr[1] = cast(int)(a.array[1]); 1509 return r; 1510 } 1511 1512 int4 vmull_s16(short4 a, short4 b) pure @trusted 1513 { 1514 int4 r; 1515 r.ptr[0] = a.array[0] * b.array[0]; 1516 r.ptr[1] = a.array[1] * b.array[1]; 1517 r.ptr[2] = a.array[2] * b.array[2]; 1518 r.ptr[3] = a.array[3] * b.array[3]; 1519 return r; 1520 } 1521 1522 pragma(LDC_intrinsic, "llvm.aarch64.neon.smull.v2i64") 1523 long2 vmull_s32(int2 a, int2 b) pure @safe; 1524 1525 pragma(LDC_intrinsic, "llvm.aarch64.neon.addp.v4i16") 1526 short4 vpadd_s16(short4 a, short4 b) pure @safe; 1527 1528 pragma(LDC_intrinsic, "llvm.aarch64.neon.addp.v2i32") 1529 int2 vpadd_s32(int2 a, int2 b) pure @safe; 1530 1531 pragma(LDC_intrinsic, "llvm.aarch64.neon.addp.v8i8") 1532 byte8 vpadd_u8(byte8 a, byte8 b) pure @safe; 1533 1534 pragma(LDC_intrinsic, "llvm.aarch64.neon.uaddlp.v8i16.v16i8") 1535 short8 vpaddlq_u8 (byte16 a) pure @safe; 1536 1537 static if(__VERSION__ >= 2088) // LDC 1.18 start using LLVM9 who changes the name of the builtin 1538 { 1539 pragma(LDC_intrinsic, "llvm.aarch64.neon.faddp.v4f32") 1540 float4 vpaddq_f32(float4 a, float4 b) pure @safe; 1541 } 1542 else 1543 { 1544 pragma(LDC_intrinsic, "llvm.aarch64.neon.addp.v4f32") 1545 float4 vpaddq_f32(float4 a, float4 b) pure @safe; 1546 } 1547 1548 pragma(LDC_intrinsic, "llvm.aarch64.neon.addp.v8i16") 1549 short8 vpaddq_s16(short8 a, short8 b) pure @safe; 1550 1551 pragma(LDC_intrinsic, "llvm.aarch64.neon.addp.v16i8") 1552 byte16 vpaddq_s8(byte16 a, byte16 b) pure @safe; 1553 1554 pragma(LDC_intrinsic, "llvm.aarch64.neon.addp.v4i32") 1555 
int4 vpaddq_s32(int4 a, int4 b) pure @safe; 1556 1557 pragma(LDC_intrinsic, "llvm.aarch64.neon.sqadd.v4i16") 1558 short4 vqadd_s16(short4 a, short4 b) pure @safe; 1559 1560 pragma(LDC_intrinsic, "llvm.aarch64.neon.sqadd.v8i16") 1561 short8 vqaddq_s16(short8 a, short8 b) pure @safe; 1562 1563 pragma(LDC_intrinsic, "llvm.aarch64.neon.sqxtn.v8i8") 1564 byte8 vqmovn_s16(short8 a) pure @safe; 1565 1566 pragma(LDC_intrinsic, "llvm.aarch64.neon.sqxtn.v4i16") 1567 short4 vqmovn_s32(int4 a) pure @safe; 1568 1569 pragma(LDC_intrinsic, "llvm.aarch64.neon.uqxtn.v4i16") 1570 short4 vqmovn_u32(int4 a) pure @safe; 1571 1572 pragma(LDC_intrinsic, "llvm.aarch64.neon.sqxtun.v8i8") 1573 byte8 vqmovun_s16(short8 a) pure @safe; 1574 1575 pragma(LDC_intrinsic, "llvm.aarch64.neon.sqsub.v4i16") 1576 short4 vqsub_s16(short4 a, short4 b) pure @safe; 1577 1578 pragma(LDC_intrinsic, "llvm.aarch64.neon.sqsub.v8i16") 1579 short8 vqsubq_s16(short8 a, short8 b) pure @safe; 1580 1581 pragma(LDC_intrinsic, "llvm.aarch64.neon.tbl1.v16i8") 1582 byte16 vqtbl1q_s8(byte16 t, byte16 idx) pure @safe; 1583 1584 pragma(LDC_intrinsic, "llvm.aarch64.neon.urhadd.v16i8") 1585 byte16 vrhadd_u8(byte16 a, byte16 b) pure @safe; 1586 1587 pragma(LDC_intrinsic, "llvm.aarch64.neon.urhadd.v8i16") 1588 short8 vrhadd_u16(short8 a, short8 b) pure @safe; 1589 1590 pragma(LDC_intrinsic, "llvm.aarch64.neon.rshrn.v4i16") 1591 short4 vrshrn_n_s32(int4 a, int n) pure @safe; 1592 1593 byte8 vshr_u8(byte8 a, byte8 b) pure @safe 1594 { 1595 return a >>> b; 1596 } 1597 1598 byte16 vshrq_n_s8(byte16 a, byte r) pure @safe 1599 { 1600 a = a >> byte16(cast(byte)r); 1601 return a; 1602 } 1603 1604 pragma(LDC_intrinsic, "llvm.aarch64.neon.tbl1.v8i8") 1605 byte8 vtbl1_s8(byte16 t, byte8 idx) pure @safe; 1606 } 1607 1608 version(unittest) 1609 { 1610 double abs_double(double x) @trusted 1611 { 1612 version(LDC) 1613 return llvm_fabs(x); 1614 else 1615 { 1616 long uf = *cast(long*)(&x); 1617 uf &= 0x7fffffff_ffffffff; 1618 return 
*cast(double*)(&uf);
        }
    }
}

// needed because in old GDC from travis, core.stdc.math.isnan isn't pure

/// Pure, bit-level NaN test for `float`: exponent all ones, mantissa non-zero.
bool isnan(float x) pure @trusted
{
    uint bits = *cast(uint*)(&x);
    bool exponentAllOnes = (bits & 0x7F800000) == 0x7F800000;
    bool mantissaNonZero = (bits & 0x007FFFFF) != 0;
    return exponentAllOnes && mantissaNonZero;
}
unittest
{
    // NaN is detected; zero and infinity are not NaN.
    assert(isnan(float.nan));
    assert(!isnan(0.0f));
    assert(!isnan(float.infinity));
}

/// Pure, bit-level NaN test for `double`, with 64-bit masks.
bool isnan(double x) pure @trusted
{
    ulong bits = *cast(ulong*)(&x);
    bool exponentAllOnes = (bits & 0x7FF00000_00000000) == 0x7FF00000_00000000;
    bool mantissaNonZero = (bits & 0x000FFFFF_FFFFFFFF) != 0;
    return exponentAllOnes && mantissaNonZero;
}
unittest
{
    // NaN is detected; zero and infinity are not NaN.
    assert(isnan(double.nan));
    assert(!isnan(0.0));
    assert(!isnan(double.infinity));
}