/**
* Internal stuff only, do not import.
*
* Copyright: Copyright Guillaume Piolat 2016-2020, Stefanos Baziotis 2019.
* License:   $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
*/
module inteli.internals;

import inteli.types;

// The only math functions needed for intel-intrinsics
public import core.math: sqrt; // since it's an intrinsics

package:
nothrow:
@nogc:


//
// Compile-time capability flags for GDC.
// Every branch defines the same set of `GDC_with_*` enums so that the rest of
// the library can test them with `static if` regardless of compiler/target.
//
version(GNU)
{
    version (X86)
    {
        // For 32-bit x86, disable vector extensions with GDC.
        // It just doesn't work well.
        enum GDC_with_x86 = true;
        enum GDC_with_MMX = false;
        enum GDC_with_SSE = false;
        enum GDC_with_SSE2 = false;
        enum GDC_with_SSE3 = false;
        enum GDC_with_SSSE3 = false;
        enum GDC_with_SSE41 = false;
        enum GDC_with_SSE42 = false;
        enum GDC_with_AVX = false;
        enum GDC_with_AVX2 = false;
        enum GDC_with_SHA = false;
        enum GDC_with_BMI2 = false;
    }
    else version (X86_64)
    {
        // GDC support uses extended inline assembly:
        //   https://gcc.gnu.org/onlinedocs/gcc/Extended-Asm.html        (general information and hints)
        //   https://gcc.gnu.org/onlinedocs/gcc/Simple-Constraints.html  (binding variables to registers)
        //   https://gcc.gnu.org/onlinedocs/gcc/Machine-Constraints.html (x86 specific register short names)

        public import core.simd: byte16, short8, int4, float4, double2;

        // NOTE: These intrinsics are not available in every i386 and x86_64 CPU.
        // For more info: https://gcc.gnu.org/onlinedocs/gcc-4.9.2/gcc/X86-Built-in-Functions.html
        public import gcc.builtins;

        // TODO: SSE and SSE2 should be truly optional instead, in the future, if we
        //       want to support other archs with GDC

        enum GDC_with_x86 = true;
        enum GDC_with_MMX = true; // We don't have a way to detect that at CT, but we assume it's there
        enum GDC_with_SSE = true; // We don't have a way to detect that at CT, but we assume it's there
        enum GDC_with_SSE2 = true; // We don't have a way to detect that at CT, but we assume it's there

        static if (__VERSION__ >= 2100) // Starting at GDC 12.1
        {
            // A builtin only compiles when the matching instruction set is
            // enabled, so one representative builtin per set is probed.
            enum GDC_with_SSE3 = __traits(compiles, __builtin_ia32_haddps);
            enum GDC_with_SSSE3 = __traits(compiles, __builtin_ia32_pmulhrsw128);
            enum GDC_with_SSE41 = __traits(compiles, __builtin_ia32_dpps);
            enum GDC_with_SSE42 = __traits(compiles, __builtin_ia32_pcmpgtq);
            enum GDC_with_AVX = __traits(compiles, __builtin_ia32_vbroadcastf128_pd256);
            enum GDC_with_AVX2 = __traits(compiles, __builtin_ia32_gathersiv2df);
            enum GDC_with_BMI2 = __traits(compiles, __builtin_ia32_pext_si);

        }
        else
        {
            // Before GCC 11.3, no reliable way to detect instruction sets.
            // We start above detection at GCC 12, with DMDFE 2.100, which
            // is more conservative.
            enum GDC_with_SSE3 = false;
            enum GDC_with_SSSE3 = false;
            enum GDC_with_SSE41 = false;
            enum GDC_with_SSE42 = false;
            enum GDC_with_AVX = false;
            enum GDC_with_AVX2 = false;
            enum GDC_with_BMI2 = false;
        }

        enum GDC_with_SHA = false; // TODO: detect that
    }
    else
    {
        enum GDC_with_x86 = false;
        enum GDC_with_MMX = false;
        enum GDC_with_SSE = false;
        enum GDC_with_SSE2 = false;
        enum GDC_with_SSE3 = false;
        enum GDC_with_SSSE3 = false;
        enum GDC_with_SSE41 = false;
        enum GDC_with_SSE42 = false;
        enum GDC_with_AVX = false;
        enum GDC_with_AVX2 = false;
        enum GDC_with_SHA = false;
        enum GDC_with_BMI2 = false;
    }
}
else
{
    enum GDC_with_x86 = false;
    enum GDC_with_MMX = false;
    enum GDC_with_SSE = false;
    enum GDC_with_SSE2 = false;
    enum GDC_with_SSE3 = false;
    enum GDC_with_SSSE3 = false;
    enum GDC_with_SSE41 = false;
    enum GDC_with_SSE42 = false;
    enum GDC_with_AVX = false;
    enum GDC_with_AVX2 = false;
    enum GDC_with_SHA = false;
    enum GDC_with_BMI2 = false;
}

//
// Compile-time capability flags for LDC.
// Every branch defines the same set of `LDC_with_*` enums.
//
version(LDC)
{
    public import core.simd;
    public import ldc.simd;
    public import ldc.intrinsics;
    public import ldc.llvmasm: __asm;

    // Since LDC 1.13, using the new ldc.llvmasm.__ir variants instead of inlineIR
    static if (__VERSION__ >= 2083)
    {
        import ldc.llvmasm;
        alias LDCInlineIR = __ir_pure;

        // A version of inline IR with prefix/suffix didn't exist before LDC 1.13
        alias LDCInlineIREx = __irEx_pure;
    }
    else
    {
        alias LDCInlineIR = inlineIR;
    }

    version(ARM)
    {
        public import ldc.gccbuiltins_arm;
        enum LDC_with_ARM32 = true;
        enum LDC_with_ARM64 = false;
        enum LDC_with_ARM64_CRC = false;
        enum LDC_with_SSE = false;
        enum LDC_with_SSE2 = false;
        enum LDC_with_SSE3 = false;
        enum LDC_with_SSSE3 = false;
        enum LDC_with_SSE41 = false;
        enum LDC_with_SSE42 = false;
        enum LDC_with_AVX = false;
        enum LDC_with_AVX2 = false;
        enum LDC_with_SHA = false;
        enum LDC_with_BMI2 = false;
    }
    else version(AArch64)
    {
        public import ldc.gccbuiltins_aarch64;
        enum LDC_with_ARM32 = false;
        enum LDC_with_ARM64 = true; // implies "has Neon"
        enum LDC_with_ARM64_CRC = __traits(targetHasFeature, "crc");
        enum LDC_with_SSE = false;
        enum LDC_with_SSE2 = false;
        enum LDC_with_SSE3 = false;
        enum LDC_with_SSSE3 = false;
        enum LDC_with_SSE41 = false;
        enum LDC_with_SSE42 = false;
        enum LDC_with_AVX = false;
        enum LDC_with_AVX2 = false;
        enum LDC_with_SHA = false;
        enum LDC_with_BMI2 = false;
    }
    else
    {
        public import ldc.gccbuiltins_x86;
        enum LDC_with_ARM32 = false;
        enum LDC_with_ARM64 = false;
        enum LDC_with_ARM64_CRC = false;
        enum LDC_with_SSE = __traits(targetHasFeature, "sse");
        enum LDC_with_SSE2 = __traits(targetHasFeature, "sse2");
        enum LDC_with_SSE3 = __traits(targetHasFeature, "sse3");
        enum LDC_with_SSSE3 = __traits(targetHasFeature, "ssse3");
        enum LDC_with_SSE41 = __traits(targetHasFeature, "sse4.1");
        enum LDC_with_SSE42 = __traits(targetHasFeature, "sse4.2");
        enum LDC_with_AVX = __traits(targetHasFeature, "avx");
        enum LDC_with_AVX2 = __traits(targetHasFeature, "avx2");
        enum LDC_with_SHA = __traits(targetHasFeature, "sha");
        enum LDC_with_BMI2 = __traits(targetHasFeature, "bmi2");
    }
}
else
{
    enum LDC_with_ARM32 = false;
    enum LDC_with_ARM64 = false;
    enum LDC_with_ARM64_CRC = false;
    enum LDC_with_SSE = false;
    enum LDC_with_SSE2 = false;
    enum LDC_with_SSE3 = false;
    enum LDC_with_SSSE3 = false;
    enum LDC_with_SSE41 = false;
    enum LDC_with_SSE42 = false;
    enum LDC_with_AVX = false;
    enum LDC_with_AVX2 = false;
    enum LDC_with_SHA = false;
    enum LDC_with_BMI2 = false;
}

// True for LDC on either ARM flavour. Written with `||` like the
// `GDC_or_LDC_*` combinations below.
enum LDC_with_ARM = LDC_with_ARM32 || LDC_with_ARM64;

//
// Compile-time capability flags for DMD.
// Every branch defines the same set of `DMD_with_*` enums.
//
version(DigitalMars)
{
    version(D_InlineAsm_X86)
        enum DMD_with_asm = true;
    else version(D_InlineAsm_X86_64)
        enum DMD_with_asm = true;
    else
        enum DMD_with_asm = false;

    version(D_InlineAsm_X86)
        enum DMD_with_32bit_asm = DMD_with_asm; // sometimes you want a 32-bit DMD only solution
    else
        enum DMD_with_32bit_asm = false;

    version (D_SIMD)
    {
        // NOTE(review): SSESizedVectorsAreEmulated is not defined in this part
        // of the file; presumably it comes from inteli.types - confirm.
        enum DMD_with_DSIMD = !SSESizedVectorsAreEmulated;

        // Going further, does DMD has SSE4.1 through -mcpu?
        static if (DMD_with_DSIMD)
            enum bool DMD_with_DSIMD_and_SSE41 = __traits(compiles, int4(0) * int4(0));
        else
            enum bool DMD_with_DSIMD_and_SSE41 = false;

        // No DMD way to detect those instruction sets => pessimize
        // would be cool to have a way to detect support for this at CT
        enum DMD_with_DSIMD_and_SSE3 = DMD_with_DSIMD_and_SSE41;
        enum DMD_with_DSIMD_and_SSSE3 = DMD_with_DSIMD_and_SSE41;

        version(D_AVX)
            enum DMD_with_DSIMD_and_AVX = true;
        else
            enum DMD_with_DSIMD_and_AVX = false;

        version(D_AVX2)
            enum DMD_with_DSIMD_and_AVX2 = true;
        else
            enum DMD_with_DSIMD_and_AVX2 = false;

        // SSE4.2 is only assumed when AVX is enabled.
        enum DMD_with_DSIMD_and_SSE42 = DMD_with_DSIMD_and_AVX;
    }
    else
    {
        enum DMD_with_DSIMD = false;
        enum DMD_with_DSIMD_and_SSE3 = false;
        enum DMD_with_DSIMD_and_SSSE3 = false;
        enum DMD_with_DSIMD_and_SSE41 = false;
        enum DMD_with_DSIMD_and_SSE42 = false;
        enum DMD_with_DSIMD_and_AVX = false;
        enum DMD_with_DSIMD_and_AVX2 = false;
    }
}
else
{
    enum DMD_with_asm = false;
    enum DMD_with_32bit_asm = false;
    enum DMD_with_DSIMD = false;
    enum DMD_with_DSIMD_and_SSE3 = false;
    enum DMD_with_DSIMD_and_SSSE3 = false;
    enum DMD_with_DSIMD_and_SSE41 = false;
    enum DMD_with_DSIMD_and_SSE42 = false;
    enum DMD_with_DSIMD_and_AVX = false;
    enum DMD_with_DSIMD_and_AVX2 = false;
}


// Sometimes, it can be helpful to merge builtin code; however keep in mind that
// LDC and GDC builtins often subtly differ, wrt. unsigned vs signed vectors,
// return types, purity... test it in Godbolt! this is safer with float and double intrinsics.
enum GDC_or_LDC_with_SSE = GDC_with_SSE || LDC_with_SSE;
enum GDC_or_LDC_with_SSE2 = GDC_with_SSE2 || LDC_with_SSE2;
enum GDC_or_LDC_with_SSE3 = GDC_with_SSE3 || LDC_with_SSE3;
enum GDC_or_LDC_with_SSE41 = GDC_with_SSE41 || LDC_with_SSE41;
enum GDC_or_LDC_with_SSE42 = GDC_with_SSE42 || LDC_with_SSE42;

enum GDC_or_LDC_with_AVX = GDC_with_AVX || LDC_with_AVX;
enum GDC_or_LDC_with_AVX2 = GDC_with_AVX2 || LDC_with_AVX2;
enum GDC_or_LDC_with_SHA = GDC_with_SHA || LDC_with_SHA;
enum GDC_or_LDC_with_BMI2 = GDC_with_BMI2 || LDC_with_BMI2;


static if (LDC_with_ARM32)
{
    /// Reads the 32-bit ARM FPSCR register through the LDC builtin.
    package uint arm_get_fpcr() nothrow @nogc @trusted
    {
        return __builtin_arm_get_fpscr();
    }

    /// Writes `cw` to the 32-bit ARM FPSCR register through the LDC builtin.
    package void arm_set_fpcr(uint cw) nothrow @nogc @trusted
    {
        __builtin_arm_set_fpscr(cw);
    }
}

static if (LDC_with_ARM64)
{
    pragma(LDC_intrinsic, "llvm.aarch64.get.fpcr")
        long __builtin_aarch64_get_fpcr() pure nothrow @nogc @safe;

    /// Reads the AArch64 FPCR register with an `mrs` instruction.
    package uint arm_get_fpcr() pure nothrow @nogc @trusted
    {
        // LLVM intrinsic "llvm.aarch64.get.fpcr" seems buggy and doesn't return FPCR
        return __asm!uint("mrs $0, fpcr", "=r");
    }

    /// Writes `cw` to the AArch64 FPCR register with an `msr` instruction.
    package void arm_set_fpcr(uint cw) nothrow @nogc @trusted
    {
        // Note: there doesn't seem to be an intrinsic in LLVM to set FPCR.
        // x2 is used as scratch: it is spilled to `save_x2`, loaded with `cw`,
        // moved to FPCR, then restored.
        long save_x2;
        __asm!void("str x2, $1 \n" ~
                   "ldr w2, $0 \n" ~
                   "msr fpcr, x2 \n" ~
                   "ldr x2, $1 "   , "m,m", cw, &save_x2);
    }
}


// For internal use only, since public API deals with a x86 semantic emulation
enum uint _MM_ROUND_NEAREST_ARM = 0x00000000;
enum uint _MM_ROUND_DOWN_ARM = 0x00800000;
enum uint _MM_ROUND_UP_ARM = 0x00400000;
enum uint _MM_ROUND_TOWARD_ZERO_ARM = 0x00C00000;
enum uint _MM_ROUND_MASK_ARM = 0x00C00000;
enum uint _MM_FLUSH_ZERO_MASK_ARM = 0x01000000;


//
//  <ROUNDING>
//
//  Why is that there? For DMD, we cannot use rint because _MM_SET_ROUNDING_MODE
//  doesn't change the FPU rounding mode, and isn't expected to do so.
//  So we devised these rounding functions to help having consistent rounding between
//  LDC and DMD. It's important that DMD uses whatever is in MXCSR to round.
//
//  Note: There is no MXCSR in ARM. But there is fpcr/fpscr that implements similar
//  functionality.
//  https://developer.arm.com/documentation/dui0068/b/vector-floating-point-programming/vfp-system-registers/fpscr--the-floating-point-status-and-control-register
//  We use fpcr/fpscr since it's thread-local, so we can emulate those x86 conversion albeit slowly.

/// Converts `value` to a 32-bit integer using the rounding mode currently
/// selected in MXCSR (x86) or in fpcr/fpscr (ARM).
int convertFloatToInt32UsingMXCSR(float value) @trusted
{
    int result;
    version(GNU)
    {
        asm pure nothrow @nogc @trusted
        {
            "cvtss2si %1, %0\n": "=r"(result) : "x" (value);
        }
    }
    else static if (LDC_with_ARM32)
    {
        result = __asm!int(`vldr s2, $1
                            vcvtr.s32.f32 s2, s2
                            vmov $0, s2`, "=r,m,~{s2}", value);
    }
    else static if (LDC_with_ARM64)
    {
        // Get current rounding mode.
        uint fpscr = arm_get_fpcr();

        // Dispatch to the conversion that matches the rounding bits.
        switch(fpscr & _MM_ROUND_MASK_ARM)
        {
            default:
            case _MM_ROUND_NEAREST_ARM:     result = vcvtns_s32_f32(value);     break;
            case _MM_ROUND_DOWN_ARM:        result = vcvtms_s32_f32(value);     break;
            case _MM_ROUND_UP_ARM:          result = vcvtps_s32_f32(value);     break;
            case _MM_ROUND_TOWARD_ZERO_ARM: result = vcvts_s32_f32(value);      break;
        }
    }
    else
    {
        asm pure nothrow @nogc @trusted
        {
            cvtss2si EAX, value;
            mov result, EAX;
        }
    }
    return result;
}

/// Converts `value` to a 32-bit integer using the rounding mode currently
/// selected in MXCSR (x86) or in fpcr/fpscr (ARM).
int convertDoubleToInt32UsingMXCSR(double value) @trusted
{
    int result;
    version(GNU)
    {
        asm pure nothrow @nogc @trusted
        {
            "cvtsd2si %1, %0\n": "=r"(result) : "x" (value);
        }
    }
    else static if (LDC_with_ARM32)
    {
        result = __asm!int(`vldr d2, $1
                            vcvtr.s32.f64 s2, d2
                            vmov $0, s2`, "=r,m,~{s2},~{d2}", value);
    }
    else static if (LDC_with_ARM64)
    {
        // Get current rounding mode.
        uint fpscr = arm_get_fpcr();

        // Dispatch to the conversion that matches the rounding bits.
        switch(fpscr & _MM_ROUND_MASK_ARM)
        {
            default:
            case _MM_ROUND_NEAREST_ARM:     result = vcvtns_s32_f64(value);     break;
            case _MM_ROUND_DOWN_ARM:        result = vcvtms_s32_f64(value);     break;
            case _MM_ROUND_UP_ARM:          result = vcvtps_s32_f64(value);     break;
            case _MM_ROUND_TOWARD_ZERO_ARM: result = vcvts_s32_f64(value);      break;
        }
    }
    else
    {
        asm pure nothrow @nogc @trusted
        {
            cvtsd2si EAX, value;
            mov result, EAX;
        }
    }
    return result;
}

/// Converts `value` to a 64-bit integer using the rounding mode currently
/// selected in MXCSR (x86) or in fpcr/fpscr (ARM).
long convertFloatToInt64UsingMXCSR(float value) @trusted
{
    static if (LDC_with_ARM32)
    {
        // We have to resort to libc since 32-bit ARM
        // doesn't seem to have 64-bit registers.

        uint fpscr = arm_get_fpcr(); // Get current rounding mode.

        // Note: converting to double precision else rounding could be different for large integers
        double asDouble = value;

        switch(fpscr & _MM_ROUND_MASK_ARM)
        {
            default:
            // llvm_rint follows the current rounding mode, which is
            // round-to-nearest in this branch, so ties go to even as with the
            // x86 instruction. llvm_round would round ties away from zero.
            case _MM_ROUND_NEAREST_ARM:     return cast(long)(llvm_rint(asDouble));
            case _MM_ROUND_DOWN_ARM:        return cast(long)(llvm_floor(asDouble));
            case _MM_ROUND_UP_ARM:          return cast(long)(llvm_ceil(asDouble));
            case _MM_ROUND_TOWARD_ZERO_ARM: return cast(long)(asDouble);
        }
    }
    else static if (LDC_with_ARM64)
    {
        uint fpscr = arm_get_fpcr();

        switch(fpscr & _MM_ROUND_MASK_ARM)
        {
            default:
            case _MM_ROUND_NEAREST_ARM:     return vcvtns_s64_f32(value);
            case _MM_ROUND_DOWN_ARM:        return vcvtms_s64_f32(value);
            case _MM_ROUND_UP_ARM:          return vcvtps_s64_f32(value);
            case _MM_ROUND_TOWARD_ZERO_ARM: return vcvts_s64_f32(value);
        }
    }
    // 64-bit can use an SSE instruction
    else version(D_InlineAsm_X86_64)
    {
        long result;
        version(LDC) // work-around for " Data definition directives inside inline asm are not supported yet."
        {
            asm pure nothrow @nogc @trusted
            {
                movss XMM0, value;
                cvtss2si RAX, XMM0;
                mov result, RAX;
            }
        }
        else
        {
            asm pure nothrow @nogc @trusted
            {
                movss XMM0, value;
                db 0xf3; db 0x48; db 0x0f; db 0x2d; db 0xc0; // cvtss2si RAX, XMM0 (DMD refuses to emit)
                mov result, RAX;
            }
        }
        return result;
    }
    else version(D_InlineAsm_X86)
    {
        // In the case of 32-bit x86 there is no SSE2 way to convert FP to 64-bit int
        // This leads to an unfortunate FPU sequence in every C++ compiler.
        // See: https://godbolt.org/z/vZym77

        // Get current MXCSR rounding
        uint sseRounding;
        ushort savedFPUCW;
        ushort newFPUCW;
        long result;
        asm pure nothrow @nogc @trusted
        {
            stmxcsr sseRounding;
            fld value;
            fnstcw savedFPUCW;
            mov AX, savedFPUCW;
            and AX, 0xf3ff;          // clear FPU rounding bits
            movzx ECX, word ptr sseRounding;
            and ECX, 0x6000;         // only keep SSE rounding bits
            shr ECX, 3;              // 0x6000 >> 3 == 0x0C00, the bits cleared above
            or AX, CX;               // make a new control word for FPU with SSE bits
            mov newFPUCW, AX;
            fldcw newFPUCW;
            fistp qword ptr result;  // convert, respecting MXCSR (but not other control word things)
            fldcw savedFPUCW;
        }
        return result;
    }
    else static if (GDC_with_x86)
    {
        version(X86_64) // 64-bit can just use the right instruction
        {
            static assert(GDC_with_SSE);
            __m128 A;
            A.ptr[0] = value;
            return __builtin_ia32_cvtss2si64 (A);
        }
        else version(X86) // 32-bit
        {
            // This is untested!
            // Same control-word juggling as the D_InlineAsm_X86 branch above,
            // written as GCC extended asm.
            uint sseRounding;
            ushort savedFPUCW;
            ushort newFPUCW;
            long result;
            asm pure nothrow @nogc @trusted
            {
                "stmxcsr %1;\n" ~
                "fld %2;\n" ~
                "fnstcw %3;\n" ~
                "movw %3, %%ax;\n" ~
                "andw $0xf3ff, %%ax;\n" ~
                "movzwl %1, %%ecx;\n" ~
                "andl $0x6000, %%ecx;\n" ~
                "shrl $3, %%ecx;\n" ~
                "orw %%cx, %%ax\n" ~
                "movw %%ax, %4;\n" ~
                "fldcw %4;\n" ~
                "fistpll %0;\n" ~
                "fldcw %3;\n"
                  : "=m"(result)    // %0
                  : "m" (sseRounding),
                    "f" (value),
                    "m" (savedFPUCW),
                    "m" (newFPUCW)
                  : "eax", "ecx", "st";
            }
            return result;
        }
        else
            static assert(false);
    }
    else
        static assert(false);
}


/// Converts `value` to a 64-bit integer using the rounding mode currently
/// selected in MXCSR (x86) or in fpcr/fpscr (ARM).
long convertDoubleToInt64UsingMXCSR(double value) @trusted
{
    static if (LDC_with_ARM32)
    {
        // We have to resort to libc since 32-bit ARM
        // doesn't seem to have 64-bit registers.
        uint fpscr = arm_get_fpcr(); // Get current rounding mode.
        switch(fpscr & _MM_ROUND_MASK_ARM)
        {
            default:
            // llvm_rint follows the current rounding mode, which is
            // round-to-nearest in this branch, so ties go to even as with the
            // x86 instruction. llvm_round would round ties away from zero.
            case _MM_ROUND_NEAREST_ARM:     return cast(long)(llvm_rint(value));
            case _MM_ROUND_DOWN_ARM:        return cast(long)(llvm_floor(value));
            case _MM_ROUND_UP_ARM:          return cast(long)(llvm_ceil(value));
            case _MM_ROUND_TOWARD_ZERO_ARM: return cast(long)(value);
        }
    }
    else static if (LDC_with_ARM64)
    {
        // Get current rounding mode.
        uint fpscr = arm_get_fpcr();

        switch(fpscr & _MM_ROUND_MASK_ARM)
        {
            default:
            case _MM_ROUND_NEAREST_ARM:     return vcvtns_s64_f64(value);
            case _MM_ROUND_DOWN_ARM:        return vcvtms_s64_f64(value);
            case _MM_ROUND_UP_ARM:          return vcvtps_s64_f64(value);
            case _MM_ROUND_TOWARD_ZERO_ARM: return vcvts_s64_f64(value);
        }
    }
    // 64-bit can use an SSE instruction
    else version(D_InlineAsm_X86_64)
    {
        long result;
        version(LDC) // work-around for "Data definition directives inside inline asm are not supported yet."
        {
            asm pure nothrow @nogc @trusted
            {
                movsd XMM0, value;
                cvtsd2si RAX, XMM0;
                mov result, RAX;
            }
        }
        else
        {
            asm pure nothrow @nogc @trusted
            {
                movsd XMM0, value;
                db 0xf2; db 0x48; db 0x0f; db 0x2d; db 0xc0; // cvtsd2si RAX, XMM0 (DMD refuses to emit)
                mov result, RAX;
            }
        }
        return result;
    }
    else version(D_InlineAsm_X86)
    {
        // In the case of 32-bit x86 there is no SSE2 way to convert FP to 64-bit int
        // This leads to an unfortunate FPU sequence in every C++ compiler.
        // See: https://godbolt.org/z/vZym77

        // Get current MXCSR rounding
        uint sseRounding;
        ushort savedFPUCW;
        ushort newFPUCW;
        long result;
        asm pure nothrow @nogc @trusted
        {
            stmxcsr sseRounding;
            fld value;
            fnstcw savedFPUCW;
            mov AX, savedFPUCW;
            and AX, 0xf3ff;      // clear FPU rounding bits
            movzx ECX, word ptr sseRounding;
            and ECX, 0x6000;     // only keep SSE rounding bits
            shr ECX, 3;          // 0x6000 >> 3 == 0x0C00, the bits cleared above
            or AX, CX;           // make a new control word for FPU with SSE bits
            mov newFPUCW, AX;
            fldcw newFPUCW;
            fistp result;
            fldcw savedFPUCW;
        }
        return result;
    }
    else static if (GDC_with_x86)
    {
        version(X86_64)
        {
            static assert(GDC_with_SSE2);
            __m128d A;
            A.ptr[0] = value;
            return __builtin_ia32_cvtsd2si64 (A);
        }
        else
        {
            // This is untested!
            // Same control-word juggling as the D_InlineAsm_X86 branch above,
            // written as GCC extended asm.
            uint sseRounding;
            ushort savedFPUCW;
            ushort newFPUCW;
            long result;
            asm pure nothrow @nogc @trusted
            {
                "stmxcsr %1;\n" ~
                "fld %2;\n" ~
                "fnstcw %3;\n" ~
                "movw %3, %%ax;\n" ~
                "andw $0xf3ff, %%ax;\n" ~
                "movzwl %1, %%ecx;\n" ~
                "andl $0x6000, %%ecx;\n" ~
                "shrl $3, %%ecx;\n" ~
                "orw %%cx, %%ax\n" ~
                "movw %%ax, %4;\n" ~
                "fldcw %4;\n" ~
                "fistpll %0;\n" ~
                "fldcw %3;\n"
                  : "=m"(result)    // %0
                  : "m" (sseRounding),
                    "t" (value),
                    "m" (savedFPUCW),
                    "m" (newFPUCW)
                  : "eax", "ecx", "st";
            }
            return result;
        }
    }
    else
        static assert(false);
}

//
//  </ROUNDING>
//


// using the Intel terminology here

/// Clamps a signed 16-bit value to the signed 8-bit range [-128, 127].
byte saturateSignedWordToSignedByte(short value) pure @safe
{
    if (value > 127) value = 127;
    if (value < -128) value = -128;
    return cast(byte) value;
}

/// Clamps a signed 16-bit value to the unsigned 8-bit range [0, 255].
ubyte saturateSignedWordToUnsignedByte(short value) pure @safe
{
    if (value > 255) value = 255;
    if (value < 0) value = 0;
    return cast(ubyte) value;
}

/// Clamps a signed 32-bit value to the signed 16-bit range [-32768, 32767].
short saturateSignedIntToSignedShort(int value) pure @safe
{
    if (value > 32767) value = 32767;
    if (value < -32768) value = -32768;
    return cast(short) value;
}

/// Clamps a signed 32-bit value to the unsigned 16-bit range [0, 65535].
ushort saturateSignedIntToUnsignedShort(int value) pure @safe
{
    if (value > 65535) value = 65535;
    if (value < 0) value = 0;
    return cast(ushort) value;
}

unittest // test saturate operations
{
    assert( saturateSignedWordToSignedByte(32000) == 127);
    assert( saturateSignedWordToUnsignedByte(32000) == 255);
    assert( saturateSignedWordToSignedByte(-4000) == -128);
    assert( saturateSignedWordToUnsignedByte(-4000) == 0);
    assert( saturateSignedIntToSignedShort(32768) == 32767);
    assert( saturateSignedIntToUnsignedShort(32768) == 32768);
    assert( saturateSignedIntToSignedShort(-32769) == -32768);
    assert( saturateSignedIntToUnsignedShort(-32769) == 0);
}

version(unittest)
{
    // This is just for debugging tests
    import core.stdc.stdio: printf;

    // printing vectors for implementation
    // Note: you can override `pure` within a `debug` clause

    /// Prints the single 64-bit lane of `v`.
    void _mm_print_pi64(__m64 v) @trusted
    {
        long1 vl = cast(long1)v;
        printf("%lld\n", vl.array[0]);
    }

    /// Prints `v` as two 32-bit integers.
    void _mm_print_pi32(__m64 v) @trusted
    {
        int[2] C = (cast(int2)v).array;
        printf("%d %d\n", C[0], C[1]);
    }

    /// Prints `v` as four 16-bit integers.
    void _mm_print_pi16(__m64 v) @trusted
    {
        short[4] C = (cast(short4)v).array;
        printf("%d %d %d %d\n", C[0], C[1], C[2], C[3]);
    }

    /// Prints `v` as eight 8-bit integers.
    void _mm_print_pi8(__m64 v) @trusted
    {
        byte[8] C = (cast(byte8)v).array;
        printf("%d %d %d %d %d %d %d %d\n",
        C[0], C[1], C[2], C[3], C[4], C[5], C[6], C[7]);
    }

    /// Prints `v` as two 64-bit integers.
    void _mm_print_epi64(__m128i v) @trusted
    {
        long2 vl = cast(long2)v;
        printf("%lld %lld\n", vl.array[0], vl.array[1]);
    }

    /// Prints `v` as four 32-bit integers.
    void _mm_print_epi32(__m128i v) @trusted
    {
        printf("%d %d %d %d\n",
              v.array[0], v.array[1], v.array[2], v.array[3]);
    }

    /// Prints `v` as eight 16-bit integers.
    void _mm_print_epi16(__m128i v) @trusted
    {
        short[8] C = (cast(short8)v).array;
        printf("%d %d %d %d %d %d %d %d\n",
        C[0], C[1], C[2], C[3], C[4], C[5], C[6], C[7]);
    }

    /// Prints `v` as sixteen 8-bit integers.
    void _mm_print_epi8(__m128i v) @trusted
    {
        byte[16] C = (cast(byte16)v).array;
        printf("%d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d\n",
        C[0], C[1], C[2], C[3], C[4], C[5], C[6], C[7], C[8], C[9], C[10], C[11], C[12], C[13], C[14], C[15]);
    }

    /// Prints `v` as four floats.
    void _mm_print_ps(__m128 v) @trusted
    {
        // %g because %f can conceal very small numbers and prints zero instead
        float[4] C = (cast(float4)v).array;
        printf("%g %g %g %g\n", C[0], C[1], C[2], C[3]);
    }

    /// Prints `v` as two doubles.
    void _mm_print_pd(__m128d v) @trusted
    {
        double[2] C = (cast(double2)v).array;
        printf("%f %f\n", C[0], C[1]);
    }

    /// Prints `v` as four doubles.
    void _mm256_print_pd(__m256d v) @trusted
    {
        // %g because %f can conceal very small numbers and prints zero instead
        printf("%g %g %g %g\n", v.array[0], v.array[1], v.array[2], v.array[3]);
    }

    /// Prints `v` as eight floats.
    void _mm256_print_ps(__m256 v) @trusted
    {
        // %g because %f can conceal very small numbers and prints zero instead
        printf("%g %g %g %g %g %g %g %g\n",
            v.array[0], v.array[1], v.array[2], v.array[3],
            v.array[4], v.array[5], v.array[6], v.array[7]);
    }

    /// Prints `v` as eight 32-bit integers.
    void _mm256_print_epi32(__m256i v) @trusted
    {
        int8 vl = cast(int8)v;
        printf("%d %d %d %d %d %d %d %d\n", vl.array[0], vl.array[1], vl.array[2], vl.array[3],
                                            vl.array[4], vl.array[5], vl.array[6], vl.array[7]);
    }

    /// Prints `v` as four 64-bit integers.
    void _mm256_print_epi64(__m256i v) @trusted
    {
        long4 vl = cast(long4)v;
        printf("%lld %lld %lld %lld\n", vl.array[0], vl.array[1], vl.array[2], vl.array[3]);
    }
}


//
//  <FLOATING-POINT COMPARISONS>
//
// Note: `ldc.simd` cannot express all nuances of FP comparisons, so we
//       need different IR generation.
/// Floating-point comparison predicates. The member order is significant:
/// it is used to index `FPComparisonToString`.
enum FPComparison
{
    false_,// never true
    oeq,   // ordered, equal
    ogt,   // ordered, greater than
    oge,   // ordered, greater than or equal
    olt,   // ordered, less than
    ole,   // ordered, less than or equal
    one,   // ordered, not equal
    ord,   // ordered (neither operand is NaN)
    ueq,   // unordered or equal
    ugt,   // unordered or greater than ("nle")
    uge,   // unordered or greater than or equal ("nlt")
    ult,   // unordered or less than ("nge")
    ule,   // unordered or less than or equal ("ngt")
    une,   // unordered or not equal ("neq")
    uno,   // unordered (at least one operand is NaN)
    true_, // always true
}

// Predicate spelling for each FPComparison member, in declaration order.
private static immutable string[FPComparison.max+1] FPComparisonToString =
[
    "false",
    "oeq",
    "ogt",
    "oge",
    "olt",
    "ole",
    "one",
    "ord",
    "ueq",
    "ugt",
    "uge",
    "ult",
    "ule",
    "une",
    "uno",
    "true"
];

/// Translates an AVX comparison immediate into an `FPComparison`.
/// Only the low four bits of `imm8` are used, so the signalling and
/// non-signalling variants of a predicate map to the same value.
FPComparison mapAVXFPComparison(int imm8) pure @safe
{
    alias P = FPComparison;

    // Indexed by the low nibble of the immediate.
    static immutable P[16] lowNibbleToPredicate =
    [
        P.oeq,    // _CMP_EQ_OQ
        P.olt,    // _CMP_LT_OS
        P.ole,    // _CMP_LE_OS
        P.uno,    // _CMP_UNORD_Q
        P.une,    // _CMP_NEQ_UQ // TODO does it mean not-equal OR unordered?
        P.uge,    // _CMP_NLT_US
        P.ugt,    // _CMP_NLE_US
        P.ord,    // _CMP_ORD_Q
        P.ueq,    // _CMP_EQ_UQ
        P.ult,    // _CMP_NGE_US
        P.ule,    // _CMP_NGT_US
        P.false_, // _CMP_FALSE_OQ
        P.one,    // _CMP_NEQ_OQ
        P.oge,    // _CMP_GE_OS
        P.ogt,    // _CMP_GT_OS
        P.true_   // _CMP_TRUE_UQ
    ];

    const int slot = imm8 & 0x0f; // note: signalling NaN information is mixed up
    return lowNibbleToPredicate[slot];
}

// Individual float comparison: evaluates one predicate on a pair of scalars
// (the packed fallbacks turn `true` into -1 and `false` into 0).
// Useful for DMD and testing
// Returns whether predicate `comparison` holds for the scalar pair (a, b).
// `unordered` is true when either operand is NaN; the "u*" predicates accept
// that case while the "o*" predicates reject it.
private bool compareFloat(T)(FPComparison comparison, T a, T b) pure @safe
{
    bool unordered = isnan(a) || isnan(b);
    final switch(comparison) with(FPComparison)
    {
        case false_: return false;
        case oeq: return a == b;
        case ogt: return a > b;
        case oge: return a >= b;
        case olt: return a < b;
        case ole: return a <= b;
        case one: return !unordered && (a != b); // NaN with != always yields true
        case ord: return !unordered;
        case ueq: return unordered || (a == b);
        case ugt: return unordered || (a > b);
        case uge: return unordered || (a >= b);
        case ult: return unordered || (a < b);
        case ule: return unordered || (a <= b);
        case une: return (a != b); // NaN with != always yields true
        case uno: return unordered;
        case true_: return true;
    }
}

version(LDC)
{
    /// Provides packed float comparisons
    /// Emits an LLVM `fcmp` with the chosen predicate and sign-extends each
    /// i1 lane to i32.
    package int4 cmpps(FPComparison comparison)(float4 a, float4 b) pure @safe
    {
        enum ir = `
            %cmp = fcmp `~ FPComparisonToString[comparison] ~` <4 x float> %0, %1
            %r = sext <4 x i1> %cmp to <4 x i32>
            ret <4 x i32> %r`;

        return LDCInlineIR!(ir, int4, float4, float4)(a, b);
    }

    ///ditto
    package int8 cmpps256(FPComparison comparison)(float8 a, float8 b) pure @safe
    {
        enum ir = `
            %cmp = fcmp `~ FPComparisonToString[comparison] ~` <8 x float> %0, %1
            %r = sext <8 x i1> %cmp to <8 x i32>
            ret <8 x i32> %r`;
        return LDCInlineIR!(ir, int8, float8, float8)(a, b);
    }

    /// Provides packed double comparisons
    /// Emits an LLVM `fcmp` with the chosen predicate and sign-extends each
    /// i1 lane to i64.
    package long2 cmppd(FPComparison comparison)(double2 a, double2 b) pure @safe
    {
        enum ir = `
            %cmp = fcmp `~ FPComparisonToString[comparison] ~` <2 x double> %0, %1
            %r = sext <2 x i1> %cmp to <2 x i64>
            ret <2 x i64> %r`;

        return LDCInlineIR!(ir, long2, double2, double2)(a, b);
    }

    ///ditto
    package long4 cmppd256(FPComparison comparison)(double4 a, double4 b) pure @safe
    {
        enum ir = `
            %cmp = fcmp `~ FPComparisonToString[comparison] ~` <4 x double> %0, %1
            %r = sext <4 x i1> %cmp to <4 x i64>
            ret <4 x i64> %r`;
        return LDCInlineIR!(ir, long4, double4, double4)(a, b);
    }

    /// CMPSS-style comparisons
    /// clang implement it through x86 intrinsics, it is possible with IR alone
    /// but leads to less optimal code.
    /// PERF: try to implement it with __builtin_ia32_cmpss and immediate 0 to 7.
    /// Not that simple.
    /// Only lane 0 is compared; lanes 1..3 of the result are copied from `a`.
    package float4 cmpss(FPComparison comparison)(float4 a, float4 b) pure @safe
    {
        // NOTE(review): the disabled draft below calls __builtin_ia32_cmpsd;
        // presumably __builtin_ia32_cmpss was intended - confirm before enabling.
        /*
        enum ubyte predicateNumber = FPComparisonToX86Predicate[comparison];
        enum bool invertOp = (predicateNumber & 0x80) != 0;
        static if(invertOp)
            return __builtin_ia32_cmpsd(b, a, predicateNumber & 0x7f);
        else
            return __builtin_ia32_cmpsd(a, b, predicateNumber & 0x7f);
        */
        enum ir = `
            %cmp = fcmp `~ FPComparisonToString[comparison] ~` float %0, %1
            %r = sext i1 %cmp to i32
            %r2 = bitcast i32 %r to float
            ret float %r2`;

        float4 r = a;
        r[0] = LDCInlineIR!(ir, float, float, float)(a[0], b[0]);
        return r;
    }

    /// CMPSD-style comparisons
    /// clang implement it through x86 intrinsics, it is possible with IR alone
    /// but leads to less optimal code.
    /// PERF: try to implement it with __builtin_ia32_cmpsd and immediate 0 to 7.
    /// Not that simple.
    /// Only lane 0 is compared; lane 1 of the result is copied from `a`.
    package double2 cmpsd(FPComparison comparison)(double2 a, double2 b) pure @safe
    {
        enum ir = `
            %cmp = fcmp `~ FPComparisonToString[comparison] ~` double %0, %1
            %r = sext i1 %cmp to i64
            %r2 = bitcast i64 %r to double
            ret double %r2`;

        double2 r = a;
        r[0] = LDCInlineIR!(ir, double, double, double)(a[0], b[0]);
        return r;
    }
}
else
{
    // Fallbacks for compilers other than LDC: each lane is evaluated with
    // compareFloat and stored as -1 (true) or 0 (false).

    /// Provides packed float comparisons
    package int4 cmpps(FPComparison comparison)(float4 a, float4 b) pure @trusted
    {
        int4 result;
        foreach(i; 0..4)
        {
            result.ptr[i] = compareFloat!float(comparison, a.array[i], b.array[i]) ? -1 : 0;
        }
        return result;
    }
    ///ditto
    package int8 cmpps256(FPComparison comparison)(float8 a, float8 b) pure @trusted
    {
        int8 result;
        foreach(i; 0..8)
        {
            result.ptr[i] = compareFloat!float(comparison, a.array[i], b.array[i]) ? -1 : 0;
        }
        return result;
    }

    /// Provides packed double comparisons
    package long2 cmppd(FPComparison comparison)(double2 a, double2 b) pure @trusted
    {
        long2 result;
        foreach(i; 0..2)
        {
            result.ptr[i] = compareFloat!double(comparison, a.array[i], b.array[i]) ? -1 : 0;
        }
        return result;
    }
    ///ditto
    package long4 cmppd256(FPComparison comparison)(double4 a, double4 b) pure @trusted
    {
        long4 result;
        foreach(i; 0..4)
        {
            result.ptr[i] = compareFloat!double(comparison, a.array[i], b.array[i]) ? -1 : 0;
        }
        return result;
    }

    /// Provides CMPSS-style comparison
    /// Only lane 0 is compared; the other lanes keep the bits of `a`.
    package float4 cmpss(FPComparison comparison)(float4 a, float4 b) pure @trusted
    {
        int4 result = cast(int4)a;
        result.ptr[0] = compareFloat!float(comparison, a.array[0], b.array[0]) ? -1 : 0;
        return cast(float4)result;
    }

    /// Provides CMPSD-style comparison
    /// Only lane 0 is compared; the other lane keeps the bits of `a`.
    package double2 cmpsd(FPComparison comparison)(double2 a, double2 b) pure @trusted
    {
        long2 result = cast(long2)a;
        result.ptr[0] = compareFloat!double(comparison, a.array[0], b.array[0]) ? -1 : 0;
        return cast(double2)result;
    }
}
unittest // cmpps
{
    // Check that every comparison type works.
    // The lanes cover less-than, equal, greater-than and NaN-vs-number.
    float4 A = [1, 3, 5, float.nan];
    float4 B = [2, 3, 4, 5];

    int4 result_oeq = cmpps!(FPComparison.oeq)(A, B);
    int4 result_ogt = cmpps!(FPComparison.ogt)(A, B);
    int4 result_oge = cmpps!(FPComparison.oge)(A, B);
    int4 result_olt = cmpps!(FPComparison.olt)(A, B);
    int4 result_ole = cmpps!(FPComparison.ole)(A, B);
    int4 result_one = cmpps!(FPComparison.one)(A, B);
    int4 result_ord = cmpps!(FPComparison.ord)(A, B);
    int4 result_ueq = cmpps!(FPComparison.ueq)(A, B);
    int4 result_ugt = cmpps!(FPComparison.ugt)(A, B);
    int4 result_uge = cmpps!(FPComparison.uge)(A, B);
    int4 result_ult = cmpps!(FPComparison.ult)(A, B);
    int4 result_ule = cmpps!(FPComparison.ule)(A, B);
    int4 result_une = cmpps!(FPComparison.une)(A, B);
    int4 result_uno = cmpps!(FPComparison.uno)(A, B);

    static immutable int[4] correct_oeq = [ 0,-1, 0, 0];
    static immutable int[4] correct_ogt = [ 0, 0,-1, 0];
    static immutable int[4] correct_oge = [ 0,-1,-1, 0];
    static immutable int[4] correct_olt = [-1, 0, 0, 0];
    static immutable int[4] correct_ole = [-1,-1, 0, 0];
    static immutable int[4] correct_one = [-1, 0,-1, 0];
    static immutable int[4] correct_ord = [-1,-1,-1, 0];
    static immutable int[4] correct_ueq = [ 0,-1, 0,-1];
    static immutable int[4] correct_ugt = [ 0, 0,-1,-1];
    static immutable int[4] correct_uge = [ 0,-1,-1,-1];
    static immutable int[4] correct_ult = [-1, 0, 0,-1];
    static immutable int[4] correct_ule = [-1,-1, 0,-1];
    static immutable int[4] correct_une = [-1, 0,-1,-1];
    static immutable int[4] correct_uno = [ 0, 0, 0,-1];

    assert(result_oeq.array == correct_oeq);
    assert(result_ogt.array == correct_ogt);
    assert(result_oge.array == correct_oge);
    assert(result_olt.array == correct_olt);
    assert(result_ole.array == correct_ole);
    assert(result_one.array == correct_one);
    assert(result_ord.array == correct_ord);
    assert(result_ueq.array == correct_ueq);
    assert(result_ugt.array == correct_ugt);
    assert(result_uge.array == correct_uge);
    assert(result_ult.array == correct_ult);
    assert(result_ule.array == correct_ule);
    assert(result_une.array == correct_une);
    assert(result_uno.array == correct_uno);
}
unittest
{
    double2 a = [1, 3];
    double2 b = [2, 3];
    long2 c = cmppd!(FPComparison.ult)(a, b);
    static immutable long[2] correct = [cast(long)(-1), 0];
    assert(c.array == correct);
}
unittest // cmpss
{
    // Checks lane 0 against compareFloat and that lanes 1..3 come from A.
    void testComparison(FPComparison comparison)(float4 A, float4 B)
    {
        float4 result = cmpss!comparison(A, B);
        int4 iresult = cast(int4)result;
        int expected = compareFloat!float(comparison, A.array[0], B.array[0]) ? -1 : 0;
        assert(iresult.array[0] == expected);
        assert(result.array[1] == A.array[1]);
        assert(result.array[2] == A.array[2]);
        assert(result.array[3] == A.array[3]);
    }

    // Check that every comparison type works, with and without a NaN in lane 0.
    float4 A = [1, 3, 5, 6];
    float4 B = [2, 3, 4, 5];
    float4 C = [float.nan, 3, 4, 5];

    testComparison!(FPComparison.oeq)(A, B);
    testComparison!(FPComparison.oeq)(A, C);
    testComparison!(FPComparison.ogt)(A, B);
    testComparison!(FPComparison.ogt)(A, C);
    testComparison!(FPComparison.oge)(A, B);
    testComparison!(FPComparison.oge)(A, C);
    testComparison!(FPComparison.olt)(A, B);
    testComparison!(FPComparison.olt)(A, C);
    testComparison!(FPComparison.ole)(A, B);
    testComparison!(FPComparison.ole)(A, C);
    testComparison!(FPComparison.one)(A, B);
    testComparison!(FPComparison.one)(A, C);
    testComparison!(FPComparison.ord)(A, B);
    testComparison!(FPComparison.ord)(A, C);
    testComparison!(FPComparison.ueq)(A, B);
    testComparison!(FPComparison.ueq)(A, C);
    testComparison!(FPComparison.ugt)(A, B);
    testComparison!(FPComparison.ugt)(A, C);
    testComparison!(FPComparison.uge)(A, B);
    testComparison!(FPComparison.uge)(A, C);
    testComparison!(FPComparison.ult)(A, B);
    testComparison!(FPComparison.ult)(A, C);
    testComparison!(FPComparison.ule)(A, B);
    testComparison!(FPComparison.ule)(A, C);
    testComparison!(FPComparison.une)(A, B);
    testComparison!(FPComparison.une)(A, C);
    testComparison!(FPComparison.uno)(A, B);
    testComparison!(FPComparison.uno)(A, C);
}
unittest // cmpsd
{
    void testComparison(FPComparison comparison)(double2 A, double2 B)
    {
        double2 result = cmpsd!comparison(A, B);
        long2 iresult = cast(long2)result;
        long expected = compareFloat!double(comparison, A.array[0], B.array[0]) ?
-1 : 0; 1203 assert(iresult.array[0] == expected); 1204 assert(result.array[1] == A.array[1]); 1205 } 1206 1207 // Check all comparison type is working 1208 double2 A = [1, 3]; 1209 double2 B = [2, 4]; 1210 double2 C = [double.nan, 5]; 1211 1212 testComparison!(FPComparison.oeq)(A, B); 1213 testComparison!(FPComparison.oeq)(A, C); 1214 testComparison!(FPComparison.ogt)(A, B); 1215 testComparison!(FPComparison.ogt)(A, C); 1216 testComparison!(FPComparison.oge)(A, B); 1217 testComparison!(FPComparison.oge)(A, C); 1218 testComparison!(FPComparison.olt)(A, B); 1219 testComparison!(FPComparison.olt)(A, C); 1220 testComparison!(FPComparison.ole)(A, B); 1221 testComparison!(FPComparison.ole)(A, C); 1222 testComparison!(FPComparison.one)(A, B); 1223 testComparison!(FPComparison.one)(A, C); 1224 testComparison!(FPComparison.ord)(A, B); 1225 testComparison!(FPComparison.ord)(A, C); 1226 testComparison!(FPComparison.ueq)(A, B); 1227 testComparison!(FPComparison.ueq)(A, C); 1228 testComparison!(FPComparison.ugt)(A, B); 1229 testComparison!(FPComparison.ugt)(A, C); 1230 testComparison!(FPComparison.uge)(A, B); 1231 testComparison!(FPComparison.uge)(A, C); 1232 testComparison!(FPComparison.ult)(A, B); 1233 testComparison!(FPComparison.ult)(A, C); 1234 testComparison!(FPComparison.ule)(A, B); 1235 testComparison!(FPComparison.ule)(A, C); 1236 testComparison!(FPComparison.une)(A, B); 1237 testComparison!(FPComparison.une)(A, C); 1238 testComparison!(FPComparison.uno)(A, B); 1239 testComparison!(FPComparison.uno)(A, C); 1240 } 1241 1242 // 1243 // </FLOATING-POINT COMPARISONS> 1244 // 1245 1246 1247 __m64 to_m64(__m128i a) pure @trusted 1248 { 1249 long2 la = cast(long2)a; 1250 long1 r = la.array[0]; 1251 return r; 1252 } 1253 1254 __m128i to_m128i(__m64 a) pure @trusted 1255 { 1256 /* Not sufficient to avoid https://issues.dlang.org/show_bug.cgi?id=21474 1257 1258 version(DigitalMars) // Workaround for https://issues.dlang.org/show_bug.cgi?id=21474 1259 { 1260 long2 r = 
a.array[0]; 1261 r.ptr[1] = 0; 1262 return cast(int4)r; 1263 } 1264 else */ 1265 { 1266 long2 r = [0, 0]; 1267 r.ptr[0] = a.array[0]; 1268 return cast(__m128i)r; 1269 } 1270 } 1271 1272 // ADDITIONAL x86 INTRINSICS 1273 // Absent from ldc.gccbuiltins_x86 for some reason, but needed. 1274 // https://github.com/ldc-developers/llvm-project/blob/ldc-release/12.x/llvm/include/llvm/IR/IntrinsicsX86.td 1275 static if (LDC_with_SSE41) 1276 { 1277 pragma(LDC_intrinsic, "llvm.x86.sse41.pblendvb") 1278 byte16 __builtin_ia32_pblendvb(byte16, byte16, byte16) pure @safe; 1279 } 1280 1281 // SOME NEON INTRINSICS 1282 // Emulating some x86 intrinsics needs access to a range of ARM intrinsics. 1283 // Not in the public API but the simde project expose it all for the user to use. 1284 // MAYDO: create a new neon.d module, for internal use only. 1285 // MAYDO: port them to ARM32 so that ARM32 can be as fast as ARM64. 1286 static if (LDC_with_ARM64) 1287 { 1288 // VERY USEFUL LINK 1289 // https://github.com/ldc-developers/llvm-project/blob/ldc-release/11.x/llvm/include/llvm/IR/IntrinsicsAArch64.td 1290 // Also: https://developer.arm.com/architectures/instruction-sets/intrinsics/ 1291 1292 pragma(LDC_intrinsic, "llvm.aarch64.crc32cb") 1293 uint __crc32cb(uint a, uint b) pure @safe; 1294 1295 pragma(LDC_intrinsic, "llvm.aarch64.crc32ch") 1296 uint __crc32ch(uint a, uint b) pure @safe; 1297 1298 pragma(LDC_intrinsic, "llvm.aarch64.crc32cw") 1299 uint __crc32cw(uint a, uint b) pure @safe; 1300 1301 pragma(LDC_intrinsic, "llvm.aarch64.crc32cx") 1302 uint __crc32cd(uint a, ulong b) pure @safe; 1303 1304 //pragma(LDC_intrinsic, "llvm.aarch64.dmb") 1305 // uint __dmb(int a) @safe; // didn't found a name in intrinsic list 1306 1307 pragma(LDC_intrinsic, "llvm.aarch64.neon.uabd.v16i8") 1308 byte16 vabdq_u8(byte16 a, byte16 b) pure @safe; 1309 1310 pragma(LDC_intrinsic, "llvm.aarch64.neon.abs.v8i16") 1311 short8 vabsq_s16(short8 a) pure @safe; 1312 1313 pragma(LDC_intrinsic, 
"llvm.aarch64.neon.abs.v4i32") 1314 int4 vabsq_s32(int4 a) pure @safe; 1315 1316 pragma(LDC_intrinsic, "llvm.aarch64.neon.abs.v16i8") 1317 byte16 vabsq_s8(byte16 a) pure @safe; 1318 1319 byte8 vand_u8(byte8 a, byte8 b) pure @safe 1320 { 1321 return a & b; 1322 } 1323 1324 long2 vandq_s64(long2 a, long2 b) 1325 { 1326 return a & b; 1327 } 1328 1329 long2 vbicq_s64(long2 a, long2 b) pure @safe 1330 { 1331 return a & ~b; 1332 } 1333 1334 int4 vbslq_s32(int4 a, int4 b, int4 c) pure @safe 1335 { 1336 return c ^ ((c ^ b) & a); 1337 } 1338 1339 byte16 vbslq_s8(byte16 a, byte16 b, byte16 c) pure @safe 1340 { 1341 return c ^ ((c ^ b) & a); 1342 } 1343 1344 long2 vbslq_s64(long2 a, long2 b, long2 c) pure @safe 1345 { 1346 return c ^ ((c ^ b) & a); 1347 } 1348 1349 short8 vcombine_s16(short4 lo, short4 hi) pure @trusted 1350 { 1351 short8 r; 1352 r.ptr[0] = lo.array[0]; 1353 r.ptr[1] = lo.array[1]; 1354 r.ptr[2] = lo.array[2]; 1355 r.ptr[3] = lo.array[3]; 1356 r.ptr[4] = hi.array[0]; 1357 r.ptr[5] = hi.array[1]; 1358 r.ptr[6] = hi.array[2]; 1359 r.ptr[7] = hi.array[3]; 1360 return r; 1361 } 1362 1363 int4 vcombine_s32(int2 lo, int2 hi) pure @trusted 1364 { 1365 int4 r; 1366 r.ptr[0] = lo.array[0]; 1367 r.ptr[1] = lo.array[1]; 1368 r.ptr[2] = hi.array[0]; 1369 r.ptr[3] = hi.array[1]; 1370 return r; 1371 } 1372 1373 byte16 vcombine_s8(byte8 lo, byte8 hi) pure @trusted 1374 { 1375 byte16 r; 1376 r.ptr[0] = lo.array[0]; 1377 r.ptr[1] = lo.array[1]; 1378 r.ptr[2] = lo.array[2]; 1379 r.ptr[3] = lo.array[3]; 1380 r.ptr[4] = lo.array[4]; 1381 r.ptr[5] = lo.array[5]; 1382 r.ptr[6] = lo.array[6]; 1383 r.ptr[7] = lo.array[7]; 1384 r.ptr[8] = hi.array[0]; 1385 r.ptr[9] = hi.array[1]; 1386 r.ptr[10] = hi.array[2]; 1387 r.ptr[11] = hi.array[3]; 1388 r.ptr[12] = hi.array[4]; 1389 r.ptr[13] = hi.array[5]; 1390 r.ptr[14] = hi.array[6]; 1391 r.ptr[15] = hi.array[7]; 1392 return r; 1393 } 1394 1395 short8 vcombine_u16(short4 lo, short4 hi) pure @trusted 1396 { 1397 short8 r; 1398 r.ptr[0] = 
lo.array[0]; 1399 r.ptr[1] = lo.array[1]; 1400 r.ptr[2] = lo.array[2]; 1401 r.ptr[3] = lo.array[3]; 1402 r.ptr[4] = hi.array[0]; 1403 r.ptr[5] = hi.array[1]; 1404 r.ptr[6] = hi.array[2]; 1405 r.ptr[7] = hi.array[3]; 1406 return r; 1407 } 1408 1409 1410 // float4 => int4 1411 1412 pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtms.v4i32.v4f32") 1413 int4 vcvtmq_s32_f32(float4 a) pure @safe; 1414 1415 pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtns.v4i32.v4f32") 1416 int4 vcvtnq_s32_f32(float4 a) pure @safe; 1417 1418 pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtps.v4i32.v4f32") 1419 int4 vcvtpq_s32_f32(float4 a) pure @safe; 1420 1421 pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtzs.v4i32.v4f32") 1422 int4 vcvtzq_s32_f32(float4 a) pure @safe; 1423 1424 1425 // double2 => long2 1426 1427 pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtms.v2i64.v2f64") 1428 long2 vcvtmq_s64_f64(double2 a) pure @safe; 1429 1430 pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtns.v2i64.v2f64") 1431 long2 vcvtnq_s64_f64(double2 a) pure @safe; 1432 1433 pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtps.v2i64.v2f64") 1434 long2 vcvtpq_s64_f64(double2 a) pure @safe; 1435 1436 pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtzs.v2i64.v2f64") 1437 long2 vcvtzq_s64_f64(double2 a) pure @safe; 1438 1439 pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtms.i32.f32") 1440 int vcvtms_s32_f32(float a) pure @safe; 1441 1442 pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtns.i32.f32") 1443 int vcvtns_s32_f32(float a) pure @safe; 1444 1445 pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtps.i32.f32") 1446 int vcvtps_s32_f32(float a) pure @safe; 1447 1448 pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtzs.i32.f32") 1449 int vcvts_s32_f32(float a) pure @safe; 1450 1451 pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtms.i32.f64") 1452 int vcvtms_s32_f64(double a) pure @safe; 1453 1454 pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtns.i32.f64") 1455 int vcvtns_s32_f64(double a) pure @safe; 1456 1457 pragma(LDC_intrinsic, 
"llvm.aarch64.neon.fcvtps.i32.f64") 1458 int vcvtps_s32_f64(double a) pure @safe; 1459 1460 pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtzs.i32.f64") 1461 int vcvts_s32_f64(double a) pure @safe; 1462 1463 pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtms.i64.f32") 1464 long vcvtms_s64_f32(float a) pure @safe; 1465 1466 pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtns.i64.f32") 1467 long vcvtns_s64_f32(float a) pure @safe; 1468 1469 pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtps.i64.f32") 1470 long vcvtps_s64_f32(float a) pure @safe; 1471 1472 pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtzs.i64.f32") 1473 long vcvts_s64_f32(float a) pure @safe; 1474 1475 pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtms.i64.f64") 1476 long vcvtms_s64_f64(double a) pure @safe; 1477 1478 pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtns.i64.f64") 1479 long vcvtns_s64_f64(double a) pure @safe; 1480 1481 pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtps.i64.f64") 1482 long vcvtps_s64_f64(double a) pure @safe; // Note: technically should be named vcvtpd_s64_f64 1483 1484 pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtzs.i64.f64") 1485 long vcvts_s64_f64(double a) pure @safe; 1486 1487 long2 vdupq_n_s64(long value) pure @safe 1488 { 1489 long2 r; 1490 r = value; 1491 return r; 1492 } 1493 1494 short4 vget_high_s16(short8 a) pure @trusted 1495 { 1496 short4 r; 1497 r.ptr[0] = a.array[4]; 1498 r.ptr[1] = a.array[5]; 1499 r.ptr[2] = a.array[6]; 1500 r.ptr[3] = a.array[7]; 1501 return r; 1502 } 1503 1504 int2 vget_high_s32(int4 a) pure @trusted 1505 { 1506 int2 r; 1507 r.ptr[0] = a.array[2]; 1508 r.ptr[1] = a.array[3]; 1509 return r; 1510 } 1511 1512 byte8 vget_high_u8(byte16 a) pure @trusted 1513 { 1514 byte8 r; 1515 r.ptr[0] = a.array[8]; 1516 r.ptr[1] = a.array[9]; 1517 r.ptr[2] = a.array[10]; 1518 r.ptr[3] = a.array[11]; 1519 r.ptr[4] = a.array[12]; 1520 r.ptr[5] = a.array[13]; 1521 r.ptr[6] = a.array[14]; 1522 r.ptr[7] = a.array[15]; 1523 return r; 1524 } 1525 1526 short4 vget_low_s16(short8 a) 
pure @trusted 1527 { 1528 short4 r; 1529 r.ptr[0] = a.array[0]; 1530 r.ptr[1] = a.array[1]; 1531 r.ptr[2] = a.array[2]; 1532 r.ptr[3] = a.array[3]; 1533 return r; 1534 } 1535 1536 int2 vget_low_s32(int4 a) pure @trusted 1537 { 1538 int2 r; 1539 r.ptr[0] = a.array[0]; 1540 r.ptr[1] = a.array[1]; 1541 return r; 1542 } 1543 1544 byte8 vget_low_u8(byte16 a) pure @trusted 1545 { 1546 byte8 r; 1547 r.ptr[0] = a.array[0]; 1548 r.ptr[1] = a.array[1]; 1549 r.ptr[2] = a.array[2]; 1550 r.ptr[3] = a.array[3]; 1551 r.ptr[4] = a.array[4]; 1552 r.ptr[5] = a.array[5]; 1553 r.ptr[6] = a.array[6]; 1554 r.ptr[7] = a.array[7]; 1555 return r; 1556 } 1557 1558 long vgetq_lane_s64(long2 v, const int lane) pure @safe 1559 { 1560 return v.array[lane]; 1561 } 1562 1563 pragma(LDC_intrinsic, "llvm.aarch64.neon.smax.v8i16") 1564 short8 vmaxq_s16(short8 a, short8 b) pure @safe; 1565 1566 int4 vmaxq_s32(int4 a, int4 b) 1567 { 1568 int4 r; 1569 r[0] = a[0] >= b[0] ? a[0] : b[0]; 1570 r[1] = a[1] >= b[1] ? a[1] : b[1]; 1571 r[2] = a[2] >= b[2] ? a[2] : b[2]; 1572 r[3] = a[3] >= b[3] ? 
a[3] : b[3]; 1573 return r; 1574 } 1575 1576 pragma(LDC_intrinsic, "llvm.aarch64.neon.smin.v8i16") 1577 short8 vminq_s16(short8 a, short8 b) pure @safe; 1578 1579 int2 vmovn_s64(long2 a) pure @trusted 1580 { 1581 int2 r; 1582 r.ptr[0] = cast(int)(a.array[0]); 1583 r.ptr[1] = cast(int)(a.array[1]); 1584 return r; 1585 } 1586 1587 int4 vmull_s16(short4 a, short4 b) pure @trusted 1588 { 1589 int4 r; 1590 r.ptr[0] = a.array[0] * b.array[0]; 1591 r.ptr[1] = a.array[1] * b.array[1]; 1592 r.ptr[2] = a.array[2] * b.array[2]; 1593 r.ptr[3] = a.array[3] * b.array[3]; 1594 return r; 1595 } 1596 1597 pragma(LDC_intrinsic, "llvm.aarch64.neon.smull.v2i64") 1598 long2 vmull_s32(int2 a, int2 b) pure @safe; 1599 1600 pragma(LDC_intrinsic, "llvm.aarch64.neon.addp.v4i16") 1601 short4 vpadd_s16(short4 a, short4 b) pure @safe; 1602 1603 pragma(LDC_intrinsic, "llvm.aarch64.neon.addp.v2i32") 1604 int2 vpadd_s32(int2 a, int2 b) pure @safe; 1605 1606 pragma(LDC_intrinsic, "llvm.aarch64.neon.addp.v8i8") 1607 byte8 vpadd_u8(byte8 a, byte8 b) pure @safe; 1608 1609 pragma(LDC_intrinsic, "llvm.aarch64.neon.uaddlp.v8i16.v16i8") 1610 short8 vpaddlq_u8 (byte16 a) pure @safe; 1611 1612 static if(__VERSION__ >= 2088) // LDC 1.18 start using LLVM9 who changes the name of the builtin 1613 { 1614 pragma(LDC_intrinsic, "llvm.aarch64.neon.faddp.v4f32") 1615 float4 vpaddq_f32(float4 a, float4 b) pure @safe; 1616 } 1617 else 1618 { 1619 pragma(LDC_intrinsic, "llvm.aarch64.neon.addp.v4f32") 1620 float4 vpaddq_f32(float4 a, float4 b) pure @safe; 1621 } 1622 1623 pragma(LDC_intrinsic, "llvm.aarch64.neon.addp.v8i16") 1624 short8 vpaddq_s16(short8 a, short8 b) pure @safe; 1625 1626 pragma(LDC_intrinsic, "llvm.aarch64.neon.addp.v16i8") 1627 byte16 vpaddq_s8(byte16 a, byte16 b) pure @safe; 1628 1629 pragma(LDC_intrinsic, "llvm.aarch64.neon.addp.v4i32") 1630 int4 vpaddq_s32(int4 a, int4 b) pure @safe; 1631 1632 pragma(LDC_intrinsic, "llvm.aarch64.neon.sqadd.v4i16") 1633 short4 vqadd_s16(short4 a, short4 b) pure 
@safe; 1634 1635 pragma(LDC_intrinsic, "llvm.aarch64.neon.sqadd.v8i16") 1636 short8 vqaddq_s16(short8 a, short8 b) pure @safe; 1637 1638 pragma(LDC_intrinsic, "llvm.aarch64.neon.sqxtn.v8i8") 1639 byte8 vqmovn_s16(short8 a) pure @safe; 1640 1641 pragma(LDC_intrinsic, "llvm.aarch64.neon.sqxtn.v4i16") 1642 short4 vqmovn_s32(int4 a) pure @safe; 1643 1644 pragma(LDC_intrinsic, "llvm.aarch64.neon.uqxtn.v4i16") 1645 short4 vqmovn_u32(int4 a) pure @safe; 1646 1647 pragma(LDC_intrinsic, "llvm.aarch64.neon.sqxtun.v8i8") 1648 byte8 vqmovun_s16(short8 a) pure @safe; 1649 1650 pragma(LDC_intrinsic, "llvm.aarch64.neon.sqsub.v4i16") 1651 short4 vqsub_s16(short4 a, short4 b) pure @safe; 1652 1653 pragma(LDC_intrinsic, "llvm.aarch64.neon.sqsub.v8i16") 1654 short8 vqsubq_s16(short8 a, short8 b) pure @safe; 1655 1656 pragma(LDC_intrinsic, "llvm.aarch64.neon.tbl1.v16i8") 1657 byte16 vqtbl1q_s8(byte16 t, byte16 idx) pure @safe; 1658 1659 pragma(LDC_intrinsic, "llvm.aarch64.neon.urhadd.v16i8") 1660 byte16 vrhadd_u8(byte16 a, byte16 b) pure @safe; 1661 1662 pragma(LDC_intrinsic, "llvm.aarch64.neon.urhadd.v8i16") 1663 short8 vrhadd_u16(short8 a, short8 b) pure @safe; 1664 1665 pragma(LDC_intrinsic, "llvm.aarch64.neon.rshrn.v4i16") 1666 short4 vrshrn_n_s32(int4 a, int n) pure @safe; 1667 1668 byte8 vshr_u8(byte8 a, byte8 b) pure @safe 1669 { 1670 return a >>> b; 1671 } 1672 1673 byte16 vshrq_n_s8(byte16 a, byte r) pure @safe 1674 { 1675 a = a >> byte16(cast(byte)r); 1676 return a; 1677 } 1678 1679 pragma(LDC_intrinsic, "llvm.aarch64.neon.tbl1.v8i8") 1680 byte8 vtbl1_s8(byte16 t, byte8 idx) pure @safe; 1681 } 1682 1683 version(unittest) 1684 { 1685 double abs_double(double x) @trusted 1686 { 1687 version(LDC) 1688 return llvm_fabs(x); 1689 else 1690 { 1691 long uf = *cast(long*)(&x); 1692 uf &= 0x7fffffff_ffffffff; 1693 return *cast(double*)(&uf); 1694 } 1695 } 1696 } 1697 1698 // needed because in old GDC from travis, core.stdc.math.isnan isn't pure 1699 1700 bool isnan(float x) pure 
@trusted 1701 { 1702 uint u = *cast(uint*)(&x); 1703 bool result = ((u & 0x7F800000) == 0x7F800000) && (u & 0x007FFFFF); 1704 return result; 1705 } 1706 unittest 1707 { 1708 float x = float.nan; 1709 assert(isnan(x)); 1710 1711 x = 0; 1712 assert(!isnan(x)); 1713 1714 x = float.infinity; 1715 assert(!isnan(x)); 1716 } 1717 1718 bool isnan(double x) pure @trusted 1719 { 1720 ulong u = *cast(ulong*)(&x); 1721 return ((u & 0x7FF00000_00000000) == 0x7FF00000_00000000) && (u & 0x000FFFFF_FFFFFFFF); 1722 } 1723 unittest 1724 { 1725 double x = double.nan; 1726 assert(isnan(x)); 1727 1728 x = 0; 1729 assert(!isnan(x)); 1730 1731 x = double.infinity; 1732 assert(!isnan(x)); 1733 }