/**
* Internal stuff only, do not import.
*
* Copyright: Copyright Guillaume Piolat 2016-2020, Stefanos Baziotis 2019.
* License: $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
*/
module inteli.internals;

import inteli.types;

// The only math functions needed for intel-intrinsics
public import core.math: sqrt; // since it's an intrinsics

package:
nothrow:
@nogc:


// ---------------------------------------------------------------------------
// Compile-time capability flags.
//
// The rest of the library switches on these `GDC_with_*` / `LDC_with_*` /
// `DMD_with_*` enums to pick between compiler builtins, inline asm, LLVM IR,
// and portable fallbacks. Exactly one compiler branch below is taken; the
// other branches define every flag as `false` so all names always exist.
// ---------------------------------------------------------------------------

version(GNU)
{
    version (X86)
    {
        // For 32-bit x86, disable vector extensions with GDC.
        // It just doesn't work well.
        enum GDC_with_x86 = true;
        enum GDC_with_MMX = false;
        enum GDC_with_SSE = false;
        enum GDC_with_SSE2 = false;
        enum GDC_with_SSE3 = false;
        enum GDC_with_SSSE3 = false;
        enum GDC_with_SSE41 = false;
        enum GDC_with_SSE42 = false;
        enum GDC_with_AVX = false;
        enum GDC_with_AVX2 = false;
        enum GDC_with_SHA = false;
        enum GDC_with_BMI2 = false;
    }
    else version (X86_64)
    {
        // GDC support uses extended inline assembly:
        //   https://gcc.gnu.org/onlinedocs/gcc/Extended-Asm.html (general information and hints)
        //   https://gcc.gnu.org/onlinedocs/gcc/Simple-Constraints.html (binding variables to registers)
        //   https://gcc.gnu.org/onlinedocs/gcc/Machine-Constraints.html (x86 specific register short names)

        public import core.simd: byte16, short8, int4, float4, double2;

        // NOTE: These intrinsics are not available in every i386 and x86_64 CPU.
        // For more info: https://gcc.gnu.org/onlinedocs/gcc-4.9.2/gcc/X86-Built-in-Functions.html
        public import gcc.builtins;

        // TODO: SSE and SSE2 should be truly optional instead, in the future, if we
        //       want to support other archs with GDC

        enum GDC_with_x86 = true;
        enum GDC_with_MMX = true;  // We don't have a way to detect that at CT, but we assume it's there
        enum GDC_with_SSE = true;  // We don't have a way to detect that at CT, but we assume it's there
        enum GDC_with_SSE2 = true; // We don't have a way to detect that at CT, but we assume it's there

        static if (__VERSION__ >= 2100) // Starting at GDC 12.1
        {
            // Probe for each instruction set by checking whether a builtin
            // representative of that set compiles.
            enum GDC_with_SSE3 = __traits(compiles, __builtin_ia32_haddps);
            enum GDC_with_SSSE3 = __traits(compiles, __builtin_ia32_pmulhrsw128);
            enum GDC_with_SSE41 = __traits(compiles, __builtin_ia32_dpps);
            enum GDC_with_SSE42 = __traits(compiles, __builtin_ia32_pcmpgtq);
            enum GDC_with_AVX = __traits(compiles, __builtin_ia32_vbroadcastf128_pd256);
            enum GDC_with_AVX2 = __traits(compiles, __builtin_ia32_gathersiv2df);
            enum GDC_with_BMI2 = __traits(compiles, __builtin_ia32_pext_si);

        }
        else
        {
            // Before GCC 11.3, no reliable way to detect instruction sets.
            // We start above detection at GCC 12, with DMDFE 2.100, which
            // is more conservative.
            enum GDC_with_SSE3 = false;
            enum GDC_with_SSSE3 = false;
            enum GDC_with_SSE41 = false;
            enum GDC_with_SSE42 = false;
            enum GDC_with_AVX = false;
            enum GDC_with_AVX2 = false;
            enum GDC_with_BMI2 = false;
        }

        enum GDC_with_SHA = false; // TODO: detect that
    }
    else
    {
        // GDC on a non-x86 architecture: no x86 capability at all.
        enum GDC_with_x86 = false;
        enum GDC_with_MMX = false;
        enum GDC_with_SSE = false;
        enum GDC_with_SSE2 = false;
        enum GDC_with_SSE3 = false;
        enum GDC_with_SSSE3 = false;
        enum GDC_with_SSE41 = false;
        enum GDC_with_SSE42 = false;
        enum GDC_with_AVX = false;
        enum GDC_with_AVX2 = false;
        enum GDC_with_SHA = false;
        enum GDC_with_BMI2 = false;
    }
}
else
{
    // Not GDC: all GDC flags are false.
    enum GDC_with_x86 = false;
    enum GDC_with_MMX = false;
    enum GDC_with_SSE = false;
    enum GDC_with_SSE2 = false;
    enum GDC_with_SSE3 = false;
    enum GDC_with_SSSE3 = false;
    enum GDC_with_SSE41 = false;
    enum GDC_with_SSE42 = false;
    enum GDC_with_AVX = false;
    enum GDC_with_AVX2 = false;
    enum GDC_with_SHA = false;
    enum GDC_with_BMI2 = false;
}

version(LDC)
{
    public import core.simd;
    public import ldc.simd;
    public import ldc.intrinsics;
    public import ldc.llvmasm: __asm;

    // Since LDC 1.13, using the new ldc.llvmasm.__ir variants instead of inlineIR
    static if (__VERSION__ >= 2083)
    {
        import ldc.llvmasm;
        alias LDCInlineIR = __ir_pure;

        // A version of inline IR with prefix/suffix didn't exist before LDC 1.13
        alias LDCInlineIREx = __irEx_pure;

        enum bool LDC_with_InlineIREx = true;
    }
    else
    {
        alias LDCInlineIR = inlineIR;

        enum bool LDC_with_InlineIREx = false;
    }

    version(ARM)
    {
        // 32-bit ARM: only the ARM32 flag is set; no x86 instruction sets.
        public import ldc.gccbuiltins_arm;
        enum LDC_with_ARM32 = true;
        enum LDC_with_ARM64 = false;
        enum LDC_with_ARM64_CRC = false;
        enum LDC_with_SSE = false;
        enum LDC_with_SSE2 = false;
        enum LDC_with_SSE3 = false;
        enum LDC_with_SSSE3 = false;
        enum LDC_with_SSE41 = false;
        enum LDC_with_SSE42 = false;
        enum LDC_with_CRC32 = false;
        enum LDC_with_AVX = false;
        enum LDC_with_AVX2 = false;
        enum LDC_with_SHA = false;
        enum LDC_with_BMI2 = false;
    }
    else version(AArch64)
    {
        public import ldc.gccbuiltins_aarch64;
        enum LDC_with_ARM32 = false;
        enum LDC_with_ARM64 = true; // implies "has Neon"
        enum LDC_with_ARM64_CRC = __traits(targetHasFeature, "crc");
        enum LDC_with_SSE = false;
        enum LDC_with_SSE2 = false;
        enum LDC_with_SSE3 = false;
        enum LDC_with_SSSE3 = false;
        enum LDC_with_SSE41 = false;
        enum LDC_with_SSE42 = false;
        enum LDC_with_CRC32 = false;
        enum LDC_with_AVX = false;
        enum LDC_with_AVX2 = false;
        enum LDC_with_SHA = false;
        enum LDC_with_BMI2 = false;
    }
    else
    {
        // x86 / x86_64 target: query the LLVM target features directly.
        public import ldc.gccbuiltins_x86;
        enum LDC_with_ARM32 = false;
        enum LDC_with_ARM64 = false;
        enum LDC_with_ARM64_CRC = false;
        enum LDC_with_SSE = __traits(targetHasFeature, "sse");
        enum LDC_with_SSE2 = __traits(targetHasFeature, "sse2");
        enum LDC_with_SSE3 = __traits(targetHasFeature, "sse3");
        enum LDC_with_SSSE3 = __traits(targetHasFeature, "ssse3");
        enum LDC_with_SSE41 = __traits(targetHasFeature, "sse4.1");
        enum LDC_with_SSE42 = __traits(targetHasFeature, "sse4.2");

        // Since LDC 1.30, crc32 is a separate (and sufficient) attribute from sse4.2
        // As of Jan 2023, GDC doesn't make that distinction, -msse4.2 includes -mcrc32 for GDC.
        static if (__VERSION__ >= 2100)
        {
            enum LDC_with_CRC32 = __traits(targetHasFeature, "crc32");
        }
        else
        {
            enum LDC_with_CRC32 = __traits(targetHasFeature, "sse4.2"); // crc32 used to be included in sse4.2
        }

        enum LDC_with_AVX = __traits(targetHasFeature, "avx");
        enum LDC_with_AVX2 = __traits(targetHasFeature, "avx2");
        enum LDC_with_SHA = __traits(targetHasFeature, "sha");
        enum LDC_with_BMI2 = __traits(targetHasFeature, "bmi2");
    }
}
else
{
    // Not LDC: all LDC flags are false.
    enum LDC_with_ARM32 = false;
    enum LDC_with_ARM64 = false;
    enum LDC_with_ARM64_CRC = false;
    enum LDC_with_SSE = false;
    enum LDC_with_SSE2 = false;
    enum LDC_with_SSE3 = false;
    enum LDC_with_SSSE3 = false;
    enum LDC_with_SSE41 = false;
    enum LDC_with_SSE42 = false;
    enum LDC_with_CRC32 = false;
    enum LDC_with_AVX = false;
    enum LDC_with_AVX2 = false;
    enum LDC_with_SHA = false;
    enum LDC_with_BMI2 = false;
    enum LDC_with_InlineIREx = false;
}

// True when targeting any ARM flavour with LDC (32-bit or 64-bit).
enum LDC_with_ARM = LDC_with_ARM32 | LDC_with_ARM64;

version(DigitalMars)
{
    version(D_InlineAsm_X86)
        enum DMD_with_asm = true;
    else version(D_InlineAsm_X86_64)
        enum DMD_with_asm = true;
    else
        enum DMD_with_asm = false;

    version(D_InlineAsm_X86)
        enum DMD_with_32bit_asm = DMD_with_asm; // sometimes you want a 32-bit DMD only solution
    else
        enum DMD_with_32bit_asm = false;

    version (D_SIMD)
    {
        // NOTE: `SSESizedVectorsAreEmulated` is declared elsewhere in this module.
        enum DMD_with_DSIMD = !SSESizedVectorsAreEmulated;

        // Going further, does DMD has SSE4.1 through -mcpu?
        static if (DMD_with_DSIMD)
            enum bool DMD_with_DSIMD_and_SSE41 = __traits(compiles, int4(0) * int4(0));
        else
            enum bool DMD_with_DSIMD_and_SSE41 = false;

        // No DMD way to detect those instruction sets => pessimize
        // would be cool to have a way to detect support for this at CT
        enum DMD_with_DSIMD_and_SSE3 = DMD_with_DSIMD_and_SSE41;
        enum DMD_with_DSIMD_and_SSSE3 = DMD_with_DSIMD_and_SSE41;

        version(D_AVX)
            enum DMD_with_DSIMD_and_AVX = true;
        else
            enum DMD_with_DSIMD_and_AVX = false;

        version(D_AVX2)
            enum DMD_with_DSIMD_and_AVX2 = true;
        else
            enum DMD_with_DSIMD_and_AVX2 = false;

        enum DMD_with_DSIMD_and_SSE42 = DMD_with_DSIMD_and_AVX;
    }
    else
    {
        enum DMD_with_DSIMD = false;
        enum DMD_with_DSIMD_and_SSE3 = false;
        enum DMD_with_DSIMD_and_SSSE3 = false;
        enum DMD_with_DSIMD_and_SSE41 = false;
        enum DMD_with_DSIMD_and_SSE42 = false;
        enum DMD_with_DSIMD_and_AVX = false;
        enum DMD_with_DSIMD_and_AVX2 = false;
    }
}
else
{
    // Not DMD: all DMD flags are false.
    enum DMD_with_asm = false;
    enum DMD_with_32bit_asm = false;
    enum DMD_with_DSIMD = false;
    enum DMD_with_DSIMD_and_SSE3 = false;
    enum DMD_with_DSIMD_and_SSSE3 = false;
    enum DMD_with_DSIMD_and_SSE41 = false;
    enum DMD_with_DSIMD_and_SSE42 = false;
    enum DMD_with_DSIMD_and_AVX = false;
    enum DMD_with_DSIMD_and_AVX2 = false;
}


// Sometimes, can be helpful to merge builtin code, however keep in mind that
// LDC and GDC builtins often subtly diverse, wrt. unsigned vs signed vectors,
// return types, purity... test it in Godbolt! this is safer with float and double intrinsics.
// "Either compiler has it" convenience flags, for code paths shared by
// GDC and LDC builtins (see caveat in the comment above).
enum GDC_or_LDC_with_SSE   = GDC_with_SSE   || LDC_with_SSE;
enum GDC_or_LDC_with_SSE2  = GDC_with_SSE2  || LDC_with_SSE2;
enum GDC_or_LDC_with_SSE3  = GDC_with_SSE3  || LDC_with_SSE3;
enum GDC_or_LDC_with_SSE41 = GDC_with_SSE41 || LDC_with_SSE41;
enum GDC_or_LDC_with_SSE42 = GDC_with_SSE42 || LDC_with_SSE42;

enum GDC_or_LDC_with_AVX  = GDC_with_AVX  || LDC_with_AVX;
enum GDC_or_LDC_with_AVX2 = GDC_with_AVX2 || LDC_with_AVX2;
enum GDC_or_LDC_with_SHA  = GDC_with_SHA  || LDC_with_SHA;
enum GDC_or_LDC_with_BMI2 = GDC_with_BMI2 || LDC_with_BMI2;


static if (LDC_with_ARM32)
{
    // Read the ARM32 floating-point status/control register (FPSCR).
    package uint arm_get_fpcr() nothrow @nogc @trusted
    {
        return __builtin_arm_get_fpscr();
    }

    // Write the ARM32 floating-point status/control register (FPSCR).
    package void arm_set_fpcr(uint cw) nothrow @nogc @trusted
    {
        __builtin_arm_set_fpscr(cw);
    }
}

static if (LDC_with_ARM64)
{
    pragma(LDC_intrinsic, "llvm.aarch64.get.fpcr")
    long __builtin_aarch64_get_fpcr() pure nothrow @nogc @safe;

    // Read the AArch64 floating-point control register (FPCR).
    package uint arm_get_fpcr() pure nothrow @nogc @trusted
    {
        // LLVM intrinsic "llvm.aarch64.get.fpcr" seems buggy and doesn't return FPCR
        return __asm!uint("mrs $0, fpcr", "=r");
    }

    // Write the AArch64 floating-point control register (FPCR).
    package void arm_set_fpcr(uint cw) nothrow @nogc @trusted
    {
        // Note: there doesn't seem to be an intrinsic in LLVM to set FPCR.
        // x2 is manually saved/restored around the msr, since the "m,m"
        // constraints don't declare a register clobber.
        long save_x2;
        __asm!void("str x2, $1 \n" ~
                   "ldr w2, $0 \n" ~
                   "msr fpcr, x2 \n" ~
                   "ldr x2, $1 " , "m,m", cw, &save_x2);
    }
}


// For internal use only, since public API deals with a x86 semantic emulation
// (rounding-mode bits as they appear in the ARM fpcr/fpscr registers).
enum uint _MM_ROUND_NEAREST_ARM     = 0x00000000;
enum uint _MM_ROUND_DOWN_ARM        = 0x00800000;
enum uint _MM_ROUND_UP_ARM          = 0x00400000;
enum uint _MM_ROUND_TOWARD_ZERO_ARM = 0x00C00000;
enum uint _MM_ROUND_MASK_ARM        = 0x00C00000;
enum uint _MM_FLUSH_ZERO_MASK_ARM   = 0x01000000;


//
// <ROUNDING>
//
// Why is that there?
// For DMD, we cannot use rint because _MM_SET_ROUNDING_MODE
// doesn't change the FPU rounding mode, and isn't expected to do so.
// So we devised these rounding function to help having consistent rounding between
// LDC and DMD. It's important that DMD uses whatever is in MXCSR to round.
//
// Note: There is no MXCSR in ARM. But there is fpcr/fpscr that implements similar
// functionality.
// https://developer.arm.com/documentation/dui0068/b/vector-floating-point-programming/vfp-system-registers/fpscr--the-floating-point-status-and-control-register
// We use fpcr/fpscr since it's thread-local, so we can emulate those x86 conversion albeit slowly.

/// Converts `value` to `int`, rounding according to the current MXCSR rounding
/// mode (or its fpcr/fpscr emulation on ARM).
int convertFloatToInt32UsingMXCSR(float value) @trusted
{
    int result;
    version(GNU)
    {
        asm pure nothrow @nogc @trusted
        {
            "cvtss2si %1, %0\n": "=r"(result) : "x" (value);
        }
    }
    else static if (LDC_with_ARM32)
    {
        // vcvtr rounds using the current FPSCR rounding mode.
        result = __asm!int(`vldr s2, $1
                            vcvtr.s32.f32 s2, s2
                            vmov $0, s2`, "=r,m,~{s2}", value);
    }
    else static if (LDC_with_ARM64)
    {
        // Get current rounding mode, then pick the matching AArch64 convert.
        uint fpscr = arm_get_fpcr();

        switch(fpscr & _MM_ROUND_MASK_ARM)
        {
            default:
            case _MM_ROUND_NEAREST_ARM:     result = vcvtns_s32_f32(value); break;
            case _MM_ROUND_DOWN_ARM:        result = vcvtms_s32_f32(value); break;
            case _MM_ROUND_UP_ARM:          result = vcvtps_s32_f32(value); break;
            case _MM_ROUND_TOWARD_ZERO_ARM: result = vcvts_s32_f32(value); break;
        }
    }
    else
    {
        // DMD x86/x86_64: cvtss2si honours MXCSR.
        asm pure nothrow @nogc @trusted
        {
            cvtss2si EAX, value;
            mov result, EAX;
        }
    }
    return result;
}

/// Converts `value` to `int`, rounding according to the current MXCSR rounding
/// mode (or its fpcr/fpscr emulation on ARM).
int convertDoubleToInt32UsingMXCSR(double value) @trusted
{
    int result;
    version(GNU)
    {
        asm pure nothrow @nogc @trusted
        {
            "cvtsd2si %1, %0\n": "=r"(result) : "x" (value);
        }
    }
    else static if (LDC_with_ARM32)
    {
        // vcvtr rounds using the current FPSCR rounding mode.
        result = __asm!int(`vldr d2, $1
                            vcvtr.s32.f64 s2, d2
                            vmov $0, s2`, "=r,m,~{s2},~{d2}", value);
    }
    else static if (LDC_with_ARM64)
    {
        // Get current rounding mode, then pick the matching AArch64 convert.
        uint fpscr = arm_get_fpcr();

        switch(fpscr & _MM_ROUND_MASK_ARM)
        {
            default:
            case _MM_ROUND_NEAREST_ARM:     result = vcvtns_s32_f64(value); break;
            case _MM_ROUND_DOWN_ARM:        result = vcvtms_s32_f64(value); break;
            case _MM_ROUND_UP_ARM:          result = vcvtps_s32_f64(value); break;
            case _MM_ROUND_TOWARD_ZERO_ARM: result = vcvts_s32_f64(value); break;
        }
    }
    else
    {
        // DMD x86/x86_64: cvtsd2si honours MXCSR.
        asm pure nothrow @nogc @trusted
        {
            cvtsd2si EAX, value;
            mov result, EAX;
        }
    }
    return result;
}

/// Converts `value` to `long`, rounding according to the current MXCSR rounding
/// mode (or its fpcr/fpscr emulation on ARM).
long convertFloatToInt64UsingMXCSR(float value) @trusted
{
    static if (LDC_with_ARM32)
    {
        // We have to resort to libc since 32-bit ARM
        // doesn't seem to have 64-bit registers.

        uint fpscr = arm_get_fpcr(); // Get current rounding mode.

        // Note: converting to double precision else rounding could be different for large integers
        double asDouble = value;

        switch(fpscr & _MM_ROUND_MASK_ARM)
        {
            default:
            case _MM_ROUND_NEAREST_ARM:     return cast(long)(llvm_round(asDouble));
            case _MM_ROUND_DOWN_ARM:        return cast(long)(llvm_floor(asDouble));
            case _MM_ROUND_UP_ARM:          return cast(long)(llvm_ceil(asDouble));
            case _MM_ROUND_TOWARD_ZERO_ARM: return cast(long)(asDouble);
        }
    }
    else static if (LDC_with_ARM64)
    {
        uint fpscr = arm_get_fpcr();

        switch(fpscr & _MM_ROUND_MASK_ARM)
        {
            default:
            case _MM_ROUND_NEAREST_ARM:     return vcvtns_s64_f32(value);
            case _MM_ROUND_DOWN_ARM:        return vcvtms_s64_f32(value);
            case _MM_ROUND_UP_ARM:          return vcvtps_s64_f32(value);
            case _MM_ROUND_TOWARD_ZERO_ARM: return vcvts_s64_f32(value);
        }
    }
    // 64-bit can use an SSE instruction
    else version(D_InlineAsm_X86_64)
    {
        long result;
        version(LDC) // work-around for " Data definition directives inside inline asm are not supported yet."
        {
            asm pure nothrow @nogc @trusted
            {
                movss XMM0, value;
                cvtss2si RAX, XMM0;
                mov result, RAX;
            }
        }
        else
        {
            asm pure nothrow @nogc @trusted
            {
                movss XMM0, value;
                db 0xf3; db 0x48; db 0x0f; db 0x2d; db 0xc0; // cvtss2si RAX, XMM0 (DMD refuses to emit)
                mov result, RAX;
            }
        }
        return result;
    }
    else version(D_InlineAsm_X86)
    {
        // In the case of 32-bit x86 there is no SSE2 way to convert FP to 64-bit int
        // This leads to an unfortunate FPU sequence in every C++ compiler.
        // See: https://godbolt.org/z/vZym77

        // Get current MXCSR rounding, transplant its rounding bits into the
        // x87 control word, do an x87 fistp, then restore the control word.
        uint sseRounding;
        ushort savedFPUCW;
        ushort newFPUCW;
        long result;
        asm pure nothrow @nogc @trusted
        {
            stmxcsr sseRounding;
            fld value;
            fnstcw savedFPUCW;
            mov AX, savedFPUCW;
            and AX, 0xf3ff; // clear FPU rounding bits
            movzx ECX, word ptr sseRounding;
            and ECX, 0x6000; // only keep SSE rounding bits
            shr ECX, 3;
            or AX, CX; // make a new control word for FPU with SSE bits
            mov newFPUCW, AX;
            fldcw newFPUCW;
            fistp qword ptr result; // convert, respecting MXCSR (but not other control word things)
            fldcw savedFPUCW;
        }
        return result;
    }
    else static if (GDC_with_x86)
    {
        version(X86_64) // 64-bit can just use the right instruction
        {
            static assert(GDC_with_SSE);
            __m128 A;
            A.ptr[0] = value;
            return __builtin_ia32_cvtss2si64 (A);
        }
        else version(X86) // 32-bit
        {
            // This is untested!
            // NOTE(review): this branch uses the "f" input constraint while the
            // double-precision sibling below uses "t" (top of x87 stack) — confirm
            // which is intended if this path is ever enabled.
            uint sseRounding;
            ushort savedFPUCW;
            ushort newFPUCW;
            long result;
            asm pure nothrow @nogc @trusted
            {
                "stmxcsr %1;\n" ~
                "fld %2;\n" ~
                "fnstcw %3;\n" ~
                "movw %3, %%ax;\n" ~
                "andw $0xf3ff, %%ax;\n" ~
                "movzwl %1, %%ecx;\n" ~
                "andl $0x6000, %%ecx;\n" ~
                "shrl $3, %%ecx;\n" ~
                "orw %%cx, %%ax\n" ~
                "movw %%ax, %4;\n" ~
                "fldcw %4;\n" ~
                "fistpll %0;\n" ~
                "fldcw %3;\n"
                : "=m"(result) // %0
                : "m" (sseRounding),
                  "f" (value),
                  "m" (savedFPUCW),
                  "m" (newFPUCW)
                : "eax", "ecx", "st";
            }
            return result;
        }
        else
            static assert(false);
    }
    else
        static assert(false);
}


///ditto
long convertDoubleToInt64UsingMXCSR(double value) @trusted
{
    static if (LDC_with_ARM32)
    {
        // We have to resort to libc since 32-bit ARM
        // doesn't seem to have 64-bit registers.
        uint fpscr = arm_get_fpcr(); // Get current rounding mode.

        switch(fpscr & _MM_ROUND_MASK_ARM)
        {
            default:
            case _MM_ROUND_NEAREST_ARM:     return cast(long)(llvm_round(value));
            case _MM_ROUND_DOWN_ARM:        return cast(long)(llvm_floor(value));
            case _MM_ROUND_UP_ARM:          return cast(long)(llvm_ceil(value));
            case _MM_ROUND_TOWARD_ZERO_ARM: return cast(long)(value);
        }
    }
    else static if (LDC_with_ARM64)
    {
        // Get current rounding mode.
        uint fpscr = arm_get_fpcr();

        switch(fpscr & _MM_ROUND_MASK_ARM)
        {
            default:
            case _MM_ROUND_NEAREST_ARM:     return vcvtns_s64_f64(value);
            case _MM_ROUND_DOWN_ARM:        return vcvtms_s64_f64(value);
            case _MM_ROUND_UP_ARM:          return vcvtps_s64_f64(value);
            case _MM_ROUND_TOWARD_ZERO_ARM: return vcvts_s64_f64(value);
        }
    }
    // 64-bit can use an SSE instruction
    else version(D_InlineAsm_X86_64)
    {
        long result;
        version(LDC) // work-around for "Data definition directives inside inline asm are not supported yet."
        {
            asm pure nothrow @nogc @trusted
            {
                movsd XMM0, value;
                cvtsd2si RAX, XMM0;
                mov result, RAX;
            }
        }
        else
        {
            asm pure nothrow @nogc @trusted
            {
                movsd XMM0, value;
                db 0xf2; db 0x48; db 0x0f; db 0x2d; db 0xc0; // cvtsd2si RAX, XMM0 (DMD refuses to emit)
                mov result, RAX;
            }
        }
        return result;
    }
    else version(D_InlineAsm_X86)
    {
        // In the case of 32-bit x86 there is no SSE2 way to convert FP to 64-bit int
        // This leads to an unfortunate FPU sequence in every C++ compiler.
        // See: https://godbolt.org/z/vZym77

        // Get current MXCSR rounding (same control-word transplant as the
        // float version above).
        uint sseRounding;
        ushort savedFPUCW;
        ushort newFPUCW;
        long result;
        asm pure nothrow @nogc @trusted
        {
            stmxcsr sseRounding;
            fld value;
            fnstcw savedFPUCW;
            mov AX, savedFPUCW;
            and AX, 0xf3ff;
            movzx ECX, word ptr sseRounding;
            and ECX, 0x6000;
            shr ECX, 3;
            or AX, CX;
            mov newFPUCW, AX;
            fldcw newFPUCW;
            fistp result;
            fldcw savedFPUCW;
        }
        return result;
    }
    else static if (GDC_with_x86)
    {
        version(X86_64)
        {
            static assert(GDC_with_SSE2);
            __m128d A;
            A.ptr[0] = value;
            return __builtin_ia32_cvtsd2si64 (A);
        }
        else
        {
            // This is untested!
            uint sseRounding;
            ushort savedFPUCW;
            ushort newFPUCW;
            long result;
            asm pure nothrow @nogc @trusted
            {
                "stmxcsr %1;\n" ~
                "fld %2;\n" ~
                "fnstcw %3;\n" ~
                "movw %3, %%ax;\n" ~
                "andw $0xf3ff, %%ax;\n" ~
                "movzwl %1, %%ecx;\n" ~
                "andl $0x6000, %%ecx;\n" ~
                "shrl $3, %%ecx;\n" ~
                "orw %%cx, %%ax\n" ~
                "movw %%ax, %4;\n" ~
                "fldcw %4;\n" ~
                "fistpll %0;\n" ~
                "fldcw %3;\n"
                : "=m"(result) // %0
                : "m" (sseRounding),
                  "t" (value),
                  "m" (savedFPUCW),
                  "m" (newFPUCW)
                : "eax", "ecx", "st";
            }
            return result;
        }
    }
    else
        static assert(false);
}

//
// </ROUNDING>
//


// using the Intel terminology here

/// Clamps a 16-bit signed value into signed-byte range [-128, 127].
byte saturateSignedWordToSignedByte(short value) pure @safe
{
    if (value > 127) value = 127;
    if (value < -128) value = -128;
    return cast(byte) value;
}

/// Clamps a 16-bit signed value into unsigned-byte range [0, 255].
ubyte saturateSignedWordToUnsignedByte(short value) pure @safe
{
    if (value > 255) value = 255;
    if (value < 0) value = 0;
    return cast(ubyte) value;
}

/// Clamps a 32-bit signed value into signed-short range [-32768, 32767].
short saturateSignedIntToSignedShort(int value) pure @safe
{
    if (value > 32767) value = 32767;
    if (value < -32768) value = -32768;
    return cast(short) value;
}

/// Clamps a 32-bit signed value into unsigned-short range [0, 65535].
ushort saturateSignedIntToUnsignedShort(int value) pure @safe
{
    if (value > 65535) value = 65535;
    if (value < 0) value = 0;
    return cast(ushort) value;
}

unittest // test saturate operations
{
    assert( saturateSignedWordToSignedByte(32000) == 127);
    assert( saturateSignedWordToUnsignedByte(32000) == 255);
    assert( saturateSignedWordToSignedByte(-4000) == -128);
    assert( saturateSignedWordToUnsignedByte(-4000) == 0);
    assert( saturateSignedIntToSignedShort(32768) == 32767);
    assert( saturateSignedIntToUnsignedShort(32768) == 32768);
    assert( saturateSignedIntToSignedShort(-32769) == -32768);
    assert( saturateSignedIntToUnsignedShort(-32769) == 0);
}

version(unittest)
{
    // This is just for debugging tests
    import core.stdc.stdio: printf;

    // printing vectors for implementation
    // Note: you can override `pure` within a `debug` clause

    /// Prints a __m64 as one 64-bit integer.
    void _mm_print_pi64(__m64 v) @trusted
    {
        long1 vl = cast(long1)v;
        printf("%lld\n", vl.array[0]);
    }

    /// Prints a __m64 as two 32-bit integers.
    void _mm_print_pi32(__m64 v) @trusted
    {
        int[2] C = (cast(int2)v).array;
        printf("%d %d\n", C[0], C[1]);
    }

    /// Prints a __m64 as four 16-bit integers.
    void _mm_print_pi16(__m64 v) @trusted
    {
        short[4] C = (cast(short4)v).array;
        printf("%d %d %d %d\n", C[0], C[1], C[2], C[3]);
    }

    /// Prints a __m64 as eight 8-bit integers.
    void _mm_print_pi8(__m64 v) @trusted
    {
        byte[8] C = (cast(byte8)v).array;
        printf("%d %d %d %d %d %d %d %d\n",
               C[0], C[1], C[2], C[3], C[4], C[5], C[6], C[7]);
    }

    /// Prints a __m128i as two 64-bit integers.
    void _mm_print_epi64(__m128i v) @trusted
    {
        long2 vl = cast(long2)v;
        printf("%lld %lld\n", vl.array[0], vl.array[1]);
    }

    /// Prints a __m128i as four 32-bit integers.
    void _mm_print_epi32(__m128i v) @trusted
    {
        printf("%d %d %d %d\n",
               v.array[0], v.array[1], v.array[2], v.array[3]);
    }

    /// Prints a __m128i as eight 16-bit integers.
    void _mm_print_epi16(__m128i v) @trusted
    {
        short[8] C = (cast(short8)v).array;
        printf("%d %d %d %d %d %d %d %d\n",
               C[0], C[1], C[2], C[3], C[4], C[5], C[6], C[7]);
    }

    /// Prints a __m128i as sixteen 8-bit integers.
    void _mm_print_epi8(__m128i v) @trusted
    {
        byte[16] C = (cast(byte16)v).array;
        printf("%d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d\n",
               C[0], C[1], C[2], C[3], C[4], C[5], C[6], C[7], C[8], C[9], C[10], C[11], C[12], C[13], C[14], C[15]);
    }

    /// Prints a __m128 as four floats.
    void _mm_print_ps(__m128 v) @trusted
    {
        // %g because %f can conceal very small numbers and prints zero instead
        float[4] C = (cast(float4)v).array;
        printf("%g %g %g %g\n", C[0], C[1], C[2], C[3]);
    }

    /// Prints a __m128d as two doubles.
    void _mm_print_pd(__m128d v) @trusted
    {
        double[2] C = (cast(double2)v).array;
        printf("%f %f\n", C[0], C[1]);
    }

    /// Prints a __m256d as four doubles.
    void _mm256_print_pd(__m256d v) @trusted
    {
        // %g because %f can conceal very small numbers and prints zero instead
        printf("%g %g %g %g\n", v.array[0], v.array[1], v.array[2], v.array[3]);
    }

    /// Prints a __m256 as eight floats.
    void _mm256_print_ps(__m256 v) @trusted
    {
        // %g because %f can conceal very small numbers and prints zero instead
        printf("%g %g %g %g %g %g %g %g\n",
               v.array[0], v.array[1], v.array[2], v.array[3],
               v.array[4], v.array[5], v.array[6], v.array[7]);
    }

    /// Prints a __m256i as sixteen 16-bit integers.
    void _mm256_print_epi16(__m256i v) @trusted
    {
        short16 vl = cast(short16)v;
        printf("%d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d\n",
               vl.array[0], vl.array[1], vl.array[2], vl.array[3],
               vl.array[4], vl.array[5], vl.array[6], vl.array[7],
               vl.array[8], vl.array[9], vl.array[10], vl.array[11],
               vl.array[12], vl.array[13], vl.array[14], vl.array[15]);
    }

    /// Prints a __m256i as eight 32-bit integers.
    void _mm256_print_epi32(__m256i v) @trusted
    {
        int8 vl = cast(int8)v;
        printf("%d %d %d %d %d %d %d %d\n", vl.array[0], vl.array[1], vl.array[2], vl.array[3],
               vl.array[4], vl.array[5], vl.array[6], vl.array[7]);
    }

    /// Prints a __m256i as four 64-bit integers.
    void _mm256_print_epi64(__m256i v) @trusted
    {
        long4 vl = cast(long4)v;
        printf("%lld %lld %lld %lld\n", vl.array[0], vl.array[1], vl.array[2], vl.array[3]);
    }
}
//
// <FLOATING-POINT COMPARISONS>
//
// Note: `ldc.simd` cannot express all nuances of FP comparisons, so we
// need different IR generation.

/// FP comparison predicates, named after LLVM's `fcmp` condition codes.
enum FPComparison
{
    false_,// always false
    oeq,   // ordered and equal
    ogt,   // ordered and greater than
    oge,   // ordered and greater than or equal
    olt,   // ordered and less than
    ole,   // ordered and less than or equal
    one,   // ordered and not equal
    ord,   // ordered (no nans)
    ueq,   // unordered or equal
    ugt,   // unordered or greater than ("nle")
    uge,   // unordered or greater than or equal ("nlt")
    ult,   // unordered or less than ("nge")
    ule,   // unordered or less than or equal ("ngt")
    une,   // unordered or not equal ("neq")
    uno,   // unordered (either nans)
    true_, // always true
}

// `fcmp` condition-code strings, indexed by FPComparison, used to splice
// the predicate into inline LLVM IR below.
private static immutable string[FPComparison.max+1] FPComparisonToString =
[
    "false",
    "oeq",
    "ogt",
    "oge",
    "olt",
    "ole",
    "one",
    "ord",
    "ueq",
    "ugt",
    "uge",
    "ult",
    "ule",
    "une",
    "uno",
    "true"
];

// AVX FP comparison to FPComparison
FPComparison mapAVXFPComparison(int imm8) pure @safe
{
    // Always map on non-signalling
    static immutable FPComparison[16] mapping =
    [
        FPComparison.oeq,   // _CMP_EQ_OQ
        FPComparison.olt,   // _CMP_LT_OS
        FPComparison.ole,   // _CMP_LE_OS
        FPComparison.uno,   // _CMP_UNORD_Q
        FPComparison.une,   // _CMP_NEQ_UQ // TODO does it mean net-equal OR unordered?
        FPComparison.uge,   // _CMP_NLT_US
        FPComparison.ugt,   // _CMP_NLE_US
        FPComparison.ord,   // _CMP_ORD_Q
        FPComparison.ueq,   // _CMP_EQ_UQ
        FPComparison.ult,   // _CMP_NGE_US
        FPComparison.ule,   // _CMP_NGT_US
        FPComparison.false_,// _CMP_FALSE_OQ
        FPComparison.one,   // _CMP_NEQ_OQ
        FPComparison.oge,   // _CMP_GE_OS
        FPComparison.ogt,   // _CMP_GT_OS
        FPComparison.true_  // _CMP_TRUE_UQ
    ];

    return mapping[imm8 & 0x0f]; // note: signalling NaN information is mixed up
}

// Individual float comparison: returns -1 for true or 0 for false.
// Useful for DMD and testing
private bool compareFloat(T)(FPComparison comparison, T a, T b) pure @safe
{
    bool unordered = isnan(a) || isnan(b);
    final switch(comparison) with(FPComparison)
    {
        case false_: return false;
        case oeq: return a == b;
        case ogt: return a > b;
        case oge: return a >= b;
        case olt: return a < b;
        case ole: return a <= b;
        case one: return !unordered && (a != b); // NaN with != always yields true
        case ord: return !unordered;
        case ueq: return unordered || (a == b);
        case ugt: return unordered || (a > b);
        case uge: return unordered || (a >= b);
        case ult: return unordered || (a < b);
        case ule: return unordered || (a <= b);
        case une: return (a != b); // NaN with != always yields true
        case uno: return unordered;
        case true_: return true;
    }
}

version(LDC)
{
    /// Provides packed float comparisons
    package int4 cmpps(FPComparison comparison)(float4 a, float4 b) pure @safe
    {
        enum ir = `
            %cmp = fcmp `~ FPComparisonToString[comparison] ~` <4 x float> %0, %1
            %r = sext <4 x i1> %cmp to <4 x i32>
            ret <4 x i32> %r`;

        return LDCInlineIR!(ir, int4, float4, float4)(a, b);
    }

    ///ditto
    package int8 cmpps256(FPComparison comparison)(float8 a, float8 b) pure @safe
    {
        enum ir = `
            %cmp = fcmp `~ FPComparisonToString[comparison] ~` <8 x float> %0, %1
            %r = sext <8 x i1> %cmp to <8 x i32>
            ret <8 x i32> %r`;
        return LDCInlineIR!(ir, int8, float8, float8)(a, b);
    }

    /// Provides packed double comparisons
    package long2 cmppd(FPComparison comparison)(double2 a, double2 b) pure @safe
    {
        enum ir = `
            %cmp = fcmp `~ FPComparisonToString[comparison] ~` <2 x double> %0, %1
            %r = sext <2 x i1> %cmp to <2 x i64>
            ret <2 x i64> %r`;

        return LDCInlineIR!(ir, long2, double2, double2)(a, b);
    }

    ///ditto
    package long4 cmppd256(FPComparison comparison)(double4 a, double4 b) pure @safe
    {
        enum ir = `
            %cmp = fcmp `~ FPComparisonToString[comparison] ~` <4 x double> %0, %1
            %r = sext <4 x i1> %cmp to <4 x i64>
            ret <4 x i64> %r`;
        return LDCInlineIR!(ir, long4, double4, double4)(a, b);
    }

    /// CMPSS-style comparisons
    /// clang implement it through x86 intrinsics, it is possible with IR alone
    /// but leads to less optimal code.
    /// PERF: try to implement it with __builtin_ia32_cmpss and immediate 0 to 7.
    /// Not that simple.
    package float4 cmpss(FPComparison comparison)(float4 a, float4 b) pure @safe
    {
        /*
        enum ubyte predicateNumber = FPComparisonToX86Predicate[comparison];
        enum bool invertOp = (predicateNumber & 0x80) != 0;
        static if(invertOp)
            return __builtin_ia32_cmpsd(b, a, predicateNumber & 0x7f);
        else
            return __builtin_ia32_cmpsd(a, b, predicateNumber & 0x7f);
        */
        enum ir = `
            %cmp = fcmp `~ FPComparisonToString[comparison] ~` float %0, %1
            %r = sext i1 %cmp to i32
            %r2 = bitcast i32 %r to float
            ret float %r2`;

        float4 r = a;
        r[0] = LDCInlineIR!(ir, float, float, float)(a[0], b[0]);
        return r;
    }

    /// CMPSD-style comparisons
    /// clang implement it through x86 intrinsics, it is possible with IR alone
    /// but leads to less optimal code.
    /// PERF: try to implement it with __builtin_ia32_cmpsd and immediate 0 to 7.
    /// Not that simple.
    package double2 cmpsd(FPComparison comparison)(double2 a, double2 b) pure @safe
    {
        enum ir = `
            %cmp = fcmp `~ FPComparisonToString[comparison] ~` double %0, %1
            %r = sext i1 %cmp to i64
            %r2 = bitcast i64 %r to double
            ret double %r2`;

        double2 r = a;
        r[0] = LDCInlineIR!(ir, double, double, double)(a[0], b[0]);
        return r;
    }
}
else
{
    // Scalar fallbacks for non-LDC compilers, lane by lane via compareFloat.

    /// Provides packed float comparisons
    package int4 cmpps(FPComparison comparison)(float4 a, float4 b) pure @trusted
    {
        int4 result;
        foreach(i; 0..4)
        {
            result.ptr[i] = compareFloat!float(comparison, a.array[i], b.array[i]) ? -1 : 0;
        }
        return result;
    }
    ///ditto
    package int8 cmpps256(FPComparison comparison)(float8 a, float8 b) pure @trusted
    {
        int8 result;
        foreach(i; 0..8)
        {
            result.ptr[i] = compareFloat!float(comparison, a.array[i], b.array[i]) ? -1 : 0;
        }
        return result;
    }

    /// Provides packed double comparisons
    package long2 cmppd(FPComparison comparison)(double2 a, double2 b) pure @trusted
    {
        long2 result;
        foreach(i; 0..2)
        {
            result.ptr[i] = compareFloat!double(comparison, a.array[i], b.array[i]) ? -1 : 0;
        }
        return result;
    }
    ///ditto
    package long4 cmppd256(FPComparison comparison)(double4 a, double4 b) pure @trusted
    {
        long4 result;
        foreach(i; 0..4)
        {
            result.ptr[i] = compareFloat!double(comparison, a.array[i], b.array[i]) ? -1 : 0;
        }
        return result;
    }

    /// Provides CMPSS-style comparison
    package float4 cmpss(FPComparison comparison)(float4 a, float4 b) pure @trusted
    {
        int4 result = cast(int4)a;
        result.ptr[0] = compareFloat!float(comparison, a.array[0], b.array[0]) ? -1 : 0;
        return cast(float4)result;
    }

    /// Provides CMPSD-style comparison
    package double2 cmpsd(FPComparison comparison)(double2 a, double2 b) pure @trusted
    {
        long2 result = cast(long2)a;
        result.ptr[0] = compareFloat!double(comparison, a.array[0], b.array[0]) ? -1 : 0;
        return cast(double2)result;
    }
}
unittest // cmpps
{
    // Check all comparison type is working
    float4 A = [1, 3, 5, float.nan];
    float4 B = [2, 3, 4, 5];

    int4 result_oeq = cmpps!(FPComparison.oeq)(A, B);
    int4 result_ogt = cmpps!(FPComparison.ogt)(A, B);
    int4 result_oge = cmpps!(FPComparison.oge)(A, B);
    int4 result_olt = cmpps!(FPComparison.olt)(A, B);
    int4 result_ole = cmpps!(FPComparison.ole)(A, B);
    int4 result_one = cmpps!(FPComparison.one)(A, B);
    int4 result_ord = cmpps!(FPComparison.ord)(A, B);
    int4 result_ueq = cmpps!(FPComparison.ueq)(A, B);
    int4 result_ugt = cmpps!(FPComparison.ugt)(A, B);
    int4 result_uge = cmpps!(FPComparison.uge)(A, B);
    int4 result_ult = cmpps!(FPComparison.ult)(A, B);
    int4 result_ule = cmpps!(FPComparison.ule)(A, B);
    int4 result_une = cmpps!(FPComparison.une)(A, B);
    int4 result_uno = cmpps!(FPComparison.uno)(A, B);

    static immutable int[4] correct_oeq = [ 0,-1, 0, 0];
    static immutable int[4] correct_ogt = [ 0, 0,-1, 0];
    static immutable int[4] correct_oge = [ 0,-1,-1, 0];
    static immutable int[4] correct_olt = [-1, 0, 0, 0];
    static immutable int[4] correct_ole = [-1,-1, 0, 0];
    static immutable int[4] correct_one = [-1, 0,-1, 0];
    static immutable int[4] correct_ord = [-1,-1,-1, 0];
    static immutable int[4] correct_ueq = [ 0,-1, 0,-1];
    static immutable int[4] correct_ugt = [ 0, 0,-1,-1];
    static immutable int[4] correct_uge = [ 0,-1,-1,-1];
    static immutable int[4] correct_ult = [-1, 0, 0,-1];
    static immutable int[4] correct_ule = [-1,-1, 0,-1];
    static immutable int[4] correct_une = [-1, 0,-1,-1];
    static immutable int[4] correct_uno = [ 0, 0, 0,-1];

    assert(result_oeq.array == correct_oeq);
    assert(result_ogt.array == correct_ogt);
    assert(result_oge.array == correct_oge);
    assert(result_olt.array == correct_olt);
    assert(result_ole.array == correct_ole);
    assert(result_one.array == correct_one);
    assert(result_ord.array == correct_ord);
    assert(result_ueq.array == correct_ueq);
    assert(result_ugt.array == correct_ugt);
    assert(result_uge.array == correct_uge);
    assert(result_ult.array == correct_ult);
    assert(result_ule.array == correct_ule);
    assert(result_une.array == correct_une);
    assert(result_uno.array == correct_uno);
}
unittest
{
    double2 a = [1, 3];
    double2 b = [2, 3];
    long2 c = cmppd!(FPComparison.ult)(a, b);
    static immutable long[2] correct = [cast(long)(-1), 0];
    assert(c.array == correct);
}
unittest // cmpss
{
    void testComparison(FPComparison comparison)(float4 A, float4 B)
    {
        float4 result = cmpss!comparison(A, B);
        int4 iresult = cast(int4)result;
        int expected = compareFloat!float(comparison, A.array[0], B.array[0]) ?
-1 : 0; 1186 assert(iresult.array[0] == expected); 1187 assert(result.array[1] == A.array[1]); 1188 assert(result.array[2] == A.array[2]); 1189 assert(result.array[3] == A.array[3]); 1190 } 1191 1192 // Check all comparison type is working 1193 float4 A = [1, 3, 5, 6]; 1194 float4 B = [2, 3, 4, 5]; 1195 float4 C = [float.nan, 3, 4, 5]; 1196 1197 testComparison!(FPComparison.oeq)(A, B); 1198 testComparison!(FPComparison.oeq)(A, C); 1199 testComparison!(FPComparison.ogt)(A, B); 1200 testComparison!(FPComparison.ogt)(A, C); 1201 testComparison!(FPComparison.oge)(A, B); 1202 testComparison!(FPComparison.oge)(A, C); 1203 testComparison!(FPComparison.olt)(A, B); 1204 testComparison!(FPComparison.olt)(A, C); 1205 testComparison!(FPComparison.ole)(A, B); 1206 testComparison!(FPComparison.ole)(A, C); 1207 testComparison!(FPComparison.one)(A, B); 1208 testComparison!(FPComparison.one)(A, C); 1209 testComparison!(FPComparison.ord)(A, B); 1210 testComparison!(FPComparison.ord)(A, C); 1211 testComparison!(FPComparison.ueq)(A, B); 1212 testComparison!(FPComparison.ueq)(A, C); 1213 testComparison!(FPComparison.ugt)(A, B); 1214 testComparison!(FPComparison.ugt)(A, C); 1215 testComparison!(FPComparison.uge)(A, B); 1216 testComparison!(FPComparison.uge)(A, C); 1217 testComparison!(FPComparison.ult)(A, B); 1218 testComparison!(FPComparison.ult)(A, C); 1219 testComparison!(FPComparison.ule)(A, B); 1220 testComparison!(FPComparison.ule)(A, C); 1221 testComparison!(FPComparison.une)(A, B); 1222 testComparison!(FPComparison.une)(A, C); 1223 testComparison!(FPComparison.uno)(A, B); 1224 testComparison!(FPComparison.uno)(A, C); 1225 } 1226 unittest // cmpsd 1227 { 1228 void testComparison(FPComparison comparison)(double2 A, double2 B) 1229 { 1230 double2 result = cmpsd!comparison(A, B); 1231 long2 iresult = cast(long2)result; 1232 long expected = compareFloat!double(comparison, A.array[0], B.array[0]) ? 
-1 : 0; 1233 assert(iresult.array[0] == expected); 1234 assert(result.array[1] == A.array[1]); 1235 } 1236 1237 // Check all comparison type is working 1238 double2 A = [1, 3]; 1239 double2 B = [2, 4]; 1240 double2 C = [double.nan, 5]; 1241 1242 testComparison!(FPComparison.oeq)(A, B); 1243 testComparison!(FPComparison.oeq)(A, C); 1244 testComparison!(FPComparison.ogt)(A, B); 1245 testComparison!(FPComparison.ogt)(A, C); 1246 testComparison!(FPComparison.oge)(A, B); 1247 testComparison!(FPComparison.oge)(A, C); 1248 testComparison!(FPComparison.olt)(A, B); 1249 testComparison!(FPComparison.olt)(A, C); 1250 testComparison!(FPComparison.ole)(A, B); 1251 testComparison!(FPComparison.ole)(A, C); 1252 testComparison!(FPComparison.one)(A, B); 1253 testComparison!(FPComparison.one)(A, C); 1254 testComparison!(FPComparison.ord)(A, B); 1255 testComparison!(FPComparison.ord)(A, C); 1256 testComparison!(FPComparison.ueq)(A, B); 1257 testComparison!(FPComparison.ueq)(A, C); 1258 testComparison!(FPComparison.ugt)(A, B); 1259 testComparison!(FPComparison.ugt)(A, C); 1260 testComparison!(FPComparison.uge)(A, B); 1261 testComparison!(FPComparison.uge)(A, C); 1262 testComparison!(FPComparison.ult)(A, B); 1263 testComparison!(FPComparison.ult)(A, C); 1264 testComparison!(FPComparison.ule)(A, B); 1265 testComparison!(FPComparison.ule)(A, C); 1266 testComparison!(FPComparison.une)(A, B); 1267 testComparison!(FPComparison.une)(A, C); 1268 testComparison!(FPComparison.uno)(A, B); 1269 testComparison!(FPComparison.uno)(A, C); 1270 } 1271 1272 // 1273 // </FLOATING-POINT COMPARISONS> 1274 // 1275 1276 1277 __m64 to_m64(__m128i a) pure @trusted 1278 { 1279 long2 la = cast(long2)a; 1280 long1 r = la.array[0]; 1281 return r; 1282 } 1283 1284 __m128i to_m128i(__m64 a) pure @trusted 1285 { 1286 /* Not sufficient to avoid https://issues.dlang.org/show_bug.cgi?id=21474 1287 1288 version(DigitalMars) // Workaround for https://issues.dlang.org/show_bug.cgi?id=21474 1289 { 1290 long2 r = 
a.array[0]; 1291 r.ptr[1] = 0; 1292 return cast(int4)r; 1293 } 1294 else */ 1295 { 1296 long2 r = [0, 0]; 1297 r.ptr[0] = a.array[0]; 1298 return cast(__m128i)r; 1299 } 1300 } 1301 1302 1303 // ADDITIONAL LLVM INTRINSICS 1304 // Basically LDC didn't add them yet 1305 version(LDC) 1306 { 1307 static if (__VERSION__ >= 2097) // LDC 1.27+ 1308 { 1309 pragma(LDC_intrinsic, "llvm.abs.i#") 1310 T inteli_llvm_abs(T)(T val, bool attrib); 1311 } 1312 1313 static if (__VERSION__ >= 2092) // LDC 1.22+ 1314 { 1315 pragma(LDC_intrinsic, "llvm.sadd.sat.i#") 1316 T inteli_llvm_adds(T)(T a, T b) pure @safe; 1317 pragma(LDC_intrinsic, "llvm.ssub.sat.i#") 1318 T inteli_llvm_subs(T)(T a, T b) pure @safe; 1319 pragma(LDC_intrinsic, "llvm.uadd.sat.i#") 1320 T inteli_llvm_addus(T)(T a, T b) pure @safe; 1321 pragma(LDC_intrinsic, "llvm.usub.sat.i#") 1322 T inteli_llvm_subus(T)(T a, T b) pure @safe; 1323 } 1324 } 1325 1326 // ADDITIONAL x86 INTRINSICS 1327 // Absent from ldc.gccbuiltins_x86 for some reason, but needed. 1328 // https://github.com/ldc-developers/llvm-project/blob/ldc-release/12.x/llvm/include/llvm/IR/IntrinsicsX86.td 1329 static if (LDC_with_SSE41) 1330 { 1331 pragma(LDC_intrinsic, "llvm.x86.sse41.pblendvb") 1332 byte16 __builtin_ia32_pblendvb(byte16, byte16, byte16) pure @safe; 1333 } 1334 1335 // SOME NEON INTRINSICS 1336 // Emulating some x86 intrinsics needs access to a range of ARM intrinsics. 1337 // Not in the public API but the simde project expose it all for the user to use. 1338 // MAYDO: create a new neon.d module, for internal use only. 1339 // MAYDO: port them to ARM32 so that ARM32 can be as fast as ARM64. 
static if (LDC_with_ARM64)
{
    // VERY USEFUL LINK
    // https://github.com/ldc-developers/llvm-project/blob/ldc-release/11.x/llvm/include/llvm/IR/IntrinsicsAArch64.td
    // Also: https://developer.arm.com/architectures/instruction-sets/intrinsics/

    // CRC32C (Castagnoli) accumulation on byte/halfword/word/doubleword inputs.
    pragma(LDC_intrinsic, "llvm.aarch64.crc32cb")
    uint __crc32cb(uint a, uint b) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.crc32ch")
    uint __crc32ch(uint a, uint b) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.crc32cw")
    uint __crc32cw(uint a, uint b) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.crc32cx")
    uint __crc32cd(uint a, ulong b) pure @safe;

    //pragma(LDC_intrinsic, "llvm.aarch64.dmb")
    //    uint __dmb(int a) @safe; // didn't found a name in intrinsic list

    // Unsigned absolute difference, per 8-bit lane.
    pragma(LDC_intrinsic, "llvm.aarch64.neon.uabd.v16i8")
    byte16 vabdq_u8(byte16 a, byte16 b) pure @safe;

    // Lane-wise absolute value.
    pragma(LDC_intrinsic, "llvm.aarch64.neon.abs.v8i16")
    short8 vabsq_s16(short8 a) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.abs.v4i32")
    int4 vabsq_s32(int4 a) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.abs.v16i8")
    byte16 vabsq_s8(byte16 a) pure @safe;

    /// Bitwise AND.
    byte8 vand_u8(byte8 a, byte8 b) pure @safe
    {
        return a & b;
    }

    /// Bitwise AND.
    long2 vandq_s64(long2 a, long2 b)
    {
        return a & b;
    }

    /// Bit clear: a AND NOT b.
    long2 vbicq_s64(long2 a, long2 b) pure @safe
    {
        return a & ~b;
    }

    /// Bitwise select: picks bits from `b` where `a` has a 1 bit,
    /// bits from `c` where `a` has a 0 bit.
    /// (c ^ ((c ^ b) & a) == (b & a) | (c & ~a), with fewer operations.)
    int4 vbslq_s32(int4 a, int4 b, int4 c) pure @safe
    {
        return c ^ ((c ^ b) & a);
    }

    ///ditto
    byte16 vbslq_s8(byte16 a, byte16 b, byte16 c) pure @safe
    {
        return c ^ ((c ^ b) & a);
    }

    ///ditto
    long2 vbslq_s64(long2 a, long2 b, long2 c) pure @safe
    {
        return c ^ ((c ^ b) & a);
    }

    /// Concatenates two 64-bit halves into one 128-bit vector (lo first).
    short8 vcombine_s16(short4 lo, short4 hi) pure @trusted
    {
        short8 r;
        r.ptr[0] = lo.array[0];
        r.ptr[1] = lo.array[1];
        r.ptr[2] = lo.array[2];
        r.ptr[3] = lo.array[3];
        r.ptr[4] = hi.array[0];
        r.ptr[5] = hi.array[1];
        r.ptr[6] = hi.array[2];
        r.ptr[7] = hi.array[3];
        return r;
    }

    ///ditto
    int4 vcombine_s32(int2 lo, int2 hi) pure @trusted
    {
        int4 r;
        r.ptr[0] = lo.array[0];
        r.ptr[1] = lo.array[1];
        r.ptr[2] = hi.array[0];
        r.ptr[3] = hi.array[1];
        return r;
    }

    ///ditto
    byte16 vcombine_s8(byte8 lo, byte8 hi) pure @trusted
    {
        byte16 r;
        r.ptr[0] = lo.array[0];
        r.ptr[1] = lo.array[1];
        r.ptr[2] = lo.array[2];
        r.ptr[3] = lo.array[3];
        r.ptr[4] = lo.array[4];
        r.ptr[5] = lo.array[5];
        r.ptr[6] = lo.array[6];
        r.ptr[7] = lo.array[7];
        r.ptr[8] = hi.array[0];
        r.ptr[9] = hi.array[1];
        r.ptr[10] = hi.array[2];
        r.ptr[11] = hi.array[3];
        r.ptr[12] = hi.array[4];
        r.ptr[13] = hi.array[5];
        r.ptr[14] = hi.array[6];
        r.ptr[15] = hi.array[7];
        return r;
    }

    ///ditto
    short8 vcombine_u16(short4 lo, short4 hi) pure @trusted
    {
        short8 r;
        r.ptr[0] = lo.array[0];
        r.ptr[1] = lo.array[1];
        r.ptr[2] = lo.array[2];
        r.ptr[3] = lo.array[3];
        r.ptr[4] = hi.array[0];
        r.ptr[5] = hi.array[1];
        r.ptr[6] = hi.array[2];
        r.ptr[7] = hi.array[3];
        return r;
    }


    // float4 => int4
    // fcvt{m,n,p,z}s = convert with round toward -inf / nearest-even / +inf / zero.

    pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtms.v4i32.v4f32")
    int4 vcvtmq_s32_f32(float4 a) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtns.v4i32.v4f32")
    int4 vcvtnq_s32_f32(float4 a) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtps.v4i32.v4f32")
    int4 vcvtpq_s32_f32(float4 a) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtzs.v4i32.v4f32")
    int4 vcvtzq_s32_f32(float4 a) pure @safe;


    // double2 => long2

    pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtms.v2i64.v2f64")
    long2 vcvtmq_s64_f64(double2 a) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtns.v2i64.v2f64")
    long2 vcvtnq_s64_f64(double2 a) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtps.v2i64.v2f64")
    long2 vcvtpq_s64_f64(double2 a) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtzs.v2i64.v2f64")
    long2 vcvtzq_s64_f64(double2 a) pure @safe;

    // Scalar float/double => int/long conversions with explicit rounding:
    // fcvt{m,n,p,z}s = round toward -inf / nearest-even / +inf / zero.

    pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtms.i32.f32")
    int vcvtms_s32_f32(float a) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtns.i32.f32")
    int vcvtns_s32_f32(float a) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtps.i32.f32")
    int vcvtps_s32_f32(float a) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtzs.i32.f32")
    int vcvts_s32_f32(float a) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtms.i32.f64")
    int vcvtms_s32_f64(double a) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtns.i32.f64")
    int vcvtns_s32_f64(double a) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtps.i32.f64")
    int vcvtps_s32_f64(double a) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtzs.i32.f64")
    int vcvts_s32_f64(double a) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtms.i64.f32")
    long vcvtms_s64_f32(float a) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtns.i64.f32")
    long vcvtns_s64_f32(float a) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtps.i64.f32")
    long vcvtps_s64_f32(float a) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtzs.i64.f32")
    long vcvts_s64_f32(float a) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtms.i64.f64")
    long vcvtms_s64_f64(double a) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtns.i64.f64")
    long vcvtns_s64_f64(double a) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtps.i64.f64")
    long vcvtps_s64_f64(double a) pure @safe; // Note: technically should be named vcvtpd_s64_f64

    pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtzs.i64.f64")
    long vcvts_s64_f64(double a) pure @safe;

    /// Broadcasts `value` into both 64-bit lanes.
    long2 vdupq_n_s64(long value) pure @safe
    {
        long2 r;
        r = value;
        return r;
    }

    /// Extracts the upper half (lanes 4..7) of a 128-bit vector.
    short4 vget_high_s16(short8 a) pure @trusted
    {
        short4 r;
        r.ptr[0] = a.array[4];
        r.ptr[1] = a.array[5];
        r.ptr[2] = a.array[6];
        r.ptr[3] = a.array[7];
        return r;
    }

    /// Extracts the upper half (lanes 2..3) of a 128-bit vector.
    int2 vget_high_s32(int4 a) pure @trusted
    {
        int2 r;
        r.ptr[0] = a.array[2];
        r.ptr[1] = a.array[3];
        return r;
    }

    /// Extracts the upper half (lanes 8..15) of a 128-bit vector.
    byte8 vget_high_u8(byte16 a) pure @trusted
    {
        byte8 r;
        r.ptr[0] = a.array[8];
        r.ptr[1] = a.array[9];
        r.ptr[2] = a.array[10];
        r.ptr[3] = a.array[11];
        r.ptr[4] = a.array[12];
        r.ptr[5] = a.array[13];
        r.ptr[6] = a.array[14];
        r.ptr[7] = a.array[15];
        return r;
    }

    /// Extracts the lower half (lanes 0..3) of a 128-bit vector.
    short4 vget_low_s16(short8 a) pure @trusted
    {
        short4 r;
        r.ptr[0] = a.array[0];
        r.ptr[1] = a.array[1];
        r.ptr[2] = a.array[2];
        r.ptr[3] = a.array[3];
        return r;
    }

    /// Extracts the lower half (lanes 0..1) of a 128-bit vector.
    int2 vget_low_s32(int4 a) pure @trusted
    {
        int2 r;
        r.ptr[0] = a.array[0];
        r.ptr[1] = a.array[1];
        return r;
    }

    /// Extracts the lower half (lanes 0..7) of a 128-bit vector.
    byte8 vget_low_u8(byte16 a) pure @trusted
    {
        byte8 r;
        r.ptr[0] = a.array[0];
        r.ptr[1] = a.array[1];
        r.ptr[2] = a.array[2];
        r.ptr[3] = a.array[3];
        r.ptr[4] = a.array[4];
        r.ptr[5] = a.array[5];
        r.ptr[6] = a.array[6];
        r.ptr[7] = a.array[7];
        return r;
    }

    /// Reads one 64-bit lane; `lane` must be 0 or 1 (unchecked here,
    /// bounds are enforced by the vector's .array access rules).
    long vgetq_lane_s64(long2 v, const int lane) pure @safe
    {
        return v.array[lane];
    }

    // Lane-wise signed max/min.
    pragma(LDC_intrinsic, "llvm.aarch64.neon.smax.v8i16")
    short8 vmaxq_s16(short8 a, short8 b) pure @safe;

    /// Lane-wise signed maximum of 32-bit lanes.
    int4 vmaxq_s32(int4 a, int4 b)
    {
        int4 r;
        r[0] = a[0] >= b[0] ? a[0] : b[0];
        r[1] = a[1] >= b[1] ? a[1] : b[1];
        r[2] = a[2] >= b[2] ? a[2] : b[2];
        r[3] = a[3] >= b[3] ? a[3] : b[3];
        return r;
    }

    pragma(LDC_intrinsic, "llvm.aarch64.neon.smin.v8i16")
    short8 vminq_s16(short8 a, short8 b) pure @safe;

    /// Narrows each 64-bit lane to 32 bits by truncation.
    int2 vmovn_s64(long2 a) pure @trusted
    {
        int2 r;
        r.ptr[0] = cast(int)(a.array[0]);
        r.ptr[1] = cast(int)(a.array[1]);
        return r;
    }

    /// Widening multiply: each short*short product is computed in 32-bit.
    int4 vmull_s16(short4 a, short4 b) pure @trusted
    {
        int4 r;
        r.ptr[0] = a.array[0] * b.array[0];
        r.ptr[1] = a.array[1] * b.array[1];
        r.ptr[2] = a.array[2] * b.array[2];
        r.ptr[3] = a.array[3] * b.array[3];
        return r;
    }

    pragma(LDC_intrinsic, "llvm.aarch64.neon.smull.v2i64")
    long2 vmull_s32(int2 a, int2 b) pure @safe;

    // addp = pairwise add of adjacent lanes.
    pragma(LDC_intrinsic, "llvm.aarch64.neon.addp.v4i16")
    short4 vpadd_s16(short4 a, short4 b) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.addp.v2i32")
    int2 vpadd_s32(int2 a, int2 b) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.addp.v8i8")
    byte8 vpadd_u8(byte8 a, byte8 b) pure @safe;

    // uaddlp = pairwise add with widening (u8 pairs -> u16 sums).
    pragma(LDC_intrinsic, "llvm.aarch64.neon.uaddlp.v8i16.v16i8")
    short8 vpaddlq_u8 (byte16 a) pure @safe;

    static if(__VERSION__ >= 2088) // LDC 1.18 start using LLVM9 who changes the name of the builtin
    {
        pragma(LDC_intrinsic, "llvm.aarch64.neon.faddp.v4f32")
        float4 vpaddq_f32(float4 a, float4 b) pure @safe;
    }
    else
    {
        pragma(LDC_intrinsic, "llvm.aarch64.neon.addp.v4f32")
        float4 vpaddq_f32(float4 a, float4 b) pure @safe;
    }

    pragma(LDC_intrinsic, "llvm.aarch64.neon.addp.v8i16")
    short8 vpaddq_s16(short8 a, short8 b) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.addp.v16i8")
    byte16 vpaddq_s8(byte16 a, byte16 b) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.addp.v4i32")
    int4 vpaddq_s32(int4 a, int4 b) pure @safe;

    // sqadd/sqsub = signed saturating add/subtract.
    pragma(LDC_intrinsic, "llvm.aarch64.neon.sqadd.v4i16")
    short4 vqadd_s16(short4 a, short4 b) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.sqadd.v8i16")
    short8 vqaddq_s16(short8 a, short8 b) pure @safe;

    // sqxtn/uqxtn/sqxtun = saturating narrowing conversions.
    pragma(LDC_intrinsic, "llvm.aarch64.neon.sqxtn.v8i8")
    byte8 vqmovn_s16(short8 a) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.sqxtn.v4i16")
    short4 vqmovn_s32(int4 a) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.uqxtn.v4i16")
    short4 vqmovn_u32(int4 a) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.sqxtun.v8i8")
    byte8 vqmovun_s16(short8 a) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.sqsub.v4i16")
    short4 vqsub_s16(short4 a, short4 b) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.sqsub.v8i16")
    short8 vqsubq_s16(short8 a, short8 b) pure @safe;

    // tbl1 = table lookup of bytes of `t` indexed by `idx`.
    pragma(LDC_intrinsic, "llvm.aarch64.neon.tbl1.v16i8")
    byte16 vqtbl1q_s8(byte16 t, byte16 idx) pure @safe;

    // urhadd = unsigned rounding halving add.
    // NOTE(review): these two take 128-bit operands despite the non-q names.
    pragma(LDC_intrinsic, "llvm.aarch64.neon.urhadd.v16i8")
    byte16 vrhadd_u8(byte16 a, byte16 b) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.urhadd.v8i16")
    short8 vrhadd_u16(short8 a, short8 b) pure @safe;

    // rshrn = rounding shift right + narrow.
    pragma(LDC_intrinsic, "llvm.aarch64.neon.rshrn.v4i16")
    short4 vrshrn_n_s32(int4 a, int n) pure @safe;

    /// Lane-wise logical (unsigned) shift right.
    byte8 vshr_u8(byte8 a, byte8 b) pure @safe
    {
        return a >>> b;
    }

    /// Lane-wise arithmetic shift right by the scalar amount `r`.
    byte16 vshrq_n_s8(byte16 a, byte r) pure @safe
    {
        a = a >> byte16(cast(byte)r);
        return a;
    }

    pragma(LDC_intrinsic, "llvm.aarch64.neon.tbl1.v8i8")
    byte8 vtbl1_s8(byte16 t, byte8 idx) pure @safe;
}

version(unittest)
{
    /// Absolute value of a double, by clearing the IEEE-754 sign bit
    /// (avoids depending on a pure math.fabs in old compilers).
    double abs_double(double x) @trusted
    {
        version(LDC)
            return llvm_fabs(x);
        else
        {
            long uf = *cast(long*)(&x);
            uf &= 0x7fffffff_ffffffff;
            return *cast(double*)(&uf);
        }
    }
}

// needed because in old GDC from travis, core.stdc.math.isnan isn't pure

/// True iff `x` is a NaN: exponent bits all ones AND mantissa non-zero.
bool isnan(float x) pure @trusted
{
    uint u = *cast(uint*)(&x);
    bool result = ((u & 0x7F800000) == 0x7F800000) && (u & 0x007FFFFF);
    return result;
}
unittest
{
    float x = float.nan;
    assert(isnan(x));

    x = 0;
    assert(!isnan(x));

    x = float.infinity;
    assert(!isnan(x));
}

/// True iff `x` is a NaN: exponent bits all ones AND mantissa non-zero.
bool isnan(double x) pure @trusted
{
    ulong u = *cast(ulong*)(&x);
    return ((u & 0x7FF00000_00000000) == 0x7FF00000_00000000) && (u & 0x000FFFFF_FFFFFFFF);
}
unittest
{
    double x = double.nan;
    assert(isnan(x));

    x = 0;
    assert(!isnan(x));

    x = double.infinity;
    assert(!isnan(x));
}