/**
 * Internal stuff only, do not import.
 *
 * Copyright: Copyright Guillaume Piolat 2016-2025, Stefanos Baziotis 2019.
 *            cet 2024.
 *            Copyright Kitsunebi Games 2025.
 * License:   $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
 */
module inteli.internals;

import inteli.types;

package:
nothrow:
@nogc:

// nurt compatibility
// When built against numem's no-runtime hooks, route libc-style allocation
// through nu_* equivalents; otherwise use the plain C runtime.
version(USE_NURT)
{
    import numem.core.hooks : nu_malloc, nu_free, nu_memcpy;
    public import core.internal.exception : onOutOfMemoryError;
    alias malloc = nu_malloc;
    alias free = nu_free;
    alias memcpy = nu_memcpy;
}
else
{
    public import core.stdc.stdlib: malloc, free;
    public import core.stdc.string: memcpy;
    public import core.exception: onOutOfMemoryError;
}

// The only math functions needed for intel-intrinsics
public import core.math: sqrt;
public import core.bitop: bsf, bsr;


/// Helps portability with yet unsupported platforms.
/// Emits a compile-time warning naming the calling intrinsic, which then does nothing.
void __warn_noop(string fname = __FUNCTION__)()
{
    pragma(msg, "Warning: ", fname, " is currently not supported, it will become a NO-OP!");
}
///ditto
/// Same as `__warn_noop`, but for intrinsics with a non-void result; returns `rval`
/// (defaults to `RetT.init`) so callers still get a value.
RetT __warn_noop_ret(RetT, string fname = __FUNCTION__)(RetT rval = RetT.init)
    if (!is(RetT == void))
{
    pragma(msg, "Warning: ", fname, " is currently not supported, it will become a NO-OP!");
    return rval;
}


// Compile-time capability flags. For each compiler (GDC / LDC / DMD) a full set of
// GDC_with_* / LDC_with_* / DMD_with_* enums is defined, false on other compilers,
// so later code can `static if` on them uniformly.
version(GNU)
{
    version (X86)
    {
        // For 32-bit x86, disable vector extensions with GDC.
        // It just doesn't work well.
        enum GDC_with_x86 = true;
        enum GDC_with_MMX = false;
        enum GDC_with_SSE = false;
        enum GDC_with_SSE2 = false;
        enum GDC_with_SSE3 = false;
        enum GDC_with_SSSE3 = false;
        enum GDC_with_SSE41 = false;
        enum GDC_with_SSE42 = false;
        enum GDC_with_AVX = false;
        enum GDC_with_F16C = false;
        enum GDC_with_AVX2 = false;
        enum GDC_with_SHA = false;
        enum GDC_with_BMI2 = false;
    }
    else version (X86_64)
    {
        // GDC support uses extended inline assembly:
        //   https://gcc.gnu.org/onlinedocs/gcc/Extended-Asm.html (general information and hints)
        //   https://gcc.gnu.org/onlinedocs/gcc/Simple-Constraints.html (binding variables to registers)
        //   https://gcc.gnu.org/onlinedocs/gcc/Machine-Constraints.html (x86 specific register short names)

        public import core.simd: byte16, short8, int4, float4, double2;

        // NOTE: These intrinsics are not available in every i386 and x86_64 CPU.
        // For more info: https://gcc.gnu.org/onlinedocs/gcc-4.9.2/gcc/X86-Built-in-Functions.html
        public import gcc.builtins;

        // TODO: SSE and SSE2 should be truly optional instead, in the future, if we
        // want to support other archs with GDC

        enum GDC_with_x86 = true;
        enum GDC_with_MMX = true;  // We don't have a way to detect that at CT, but we assume it's there
        enum GDC_with_SSE = true;  // We don't have a way to detect that at CT, but we assume it's there
        enum GDC_with_SSE2 = true; // We don't have a way to detect that at CT, but we assume it's there

        // Starting at GDC 12.1, detect capabilities with __traits(compiles).
        // Before GCC 11.3, no reliable way to detect instruction sets.
        // We start above detection at GCC 12, with DMDFE 2.100, which
        // is more conservative.
        static if (__VERSION__ >= 2100)
        {
            // Each probe checks whether a representative builtin of that ISA
            // extension is visible, which tracks the -m flags the user passed.
            enum GDC_with_SSE3 = __traits(compiles, __builtin_ia32_haddps);
            enum GDC_with_SSSE3 = __traits(compiles, __builtin_ia32_pmulhrsw128);
            enum GDC_with_SSE41 = __traits(compiles, __builtin_ia32_dpps);
            enum GDC_with_SSE42 = __traits(compiles, __builtin_ia32_pcmpgtq);
            enum GDC_with_AVX = __traits(compiles, __builtin_ia32_vbroadcastf128_pd256);
            enum GDC_with_F16C = __traits(compiles, __builtin_ia32_vcvtps2ph);
            enum GDC_with_AVX2 = __traits(compiles, __builtin_ia32_gathersiv2df);
            enum GDC_with_SHA = __traits(compiles, __builtin_ia32_sha256msg1);
            enum GDC_with_BMI2 = __traits(compiles, __builtin_ia32_pext_si);
        }
        else
        {
            // Too-old GDC: pessimize, assume nothing beyond SSE2.
            enum GDC_with_SSE3 = false;
            enum GDC_with_SSSE3 = false;
            enum GDC_with_SSE41 = false;
            enum GDC_with_SSE42 = false;
            enum GDC_with_AVX = false;
            enum GDC_with_F16C = false;
            enum GDC_with_AVX2 = false;
            enum GDC_with_SHA = false;
            enum GDC_with_BMI2 = false;
        }
    }
    else
    {
        // GDC on a non-x86 architecture.
        enum GDC_with_x86 = false;
        enum GDC_with_MMX = false;
        enum GDC_with_SSE = false;
        enum GDC_with_SSE2 = false;
        enum GDC_with_SSE3 = false;
        enum GDC_with_SSSE3 = false;
        enum GDC_with_SSE41 = false;
        enum GDC_with_SSE42 = false;
        enum GDC_with_AVX = false;
        enum GDC_with_F16C = false;
        enum GDC_with_AVX2 = false;
        enum GDC_with_SHA = false;
        enum GDC_with_BMI2 = false;
    }
}
else
{
    // Not GDC: every GDC_with_* flag is false.
    enum GDC_with_x86 = false;
    enum GDC_with_MMX = false;
    enum GDC_with_SSE = false;
    enum GDC_with_SSE2 = false;
    enum GDC_with_SSE3 = false;
    enum GDC_with_SSSE3 = false;
    enum GDC_with_SSE41 = false;
    enum GDC_with_SSE42 = false;
    enum GDC_with_AVX = false;
    enum GDC_with_F16C = false;
    enum GDC_with_AVX2 = false;
    enum GDC_with_SHA = false;
    enum GDC_with_BMI2 = false;
}

version(LDC)
{
    public import core.simd;
    public import ldc.simd;
    public import ldc.intrinsics;
    public import ldc.llvmasm: __asm;

    // true when targeting either 32-bit or 64-bit x86
    version (X86)
        private enum bool some_x86 = true;
    else version (X86_64)
        private enum bool some_x86 = true;
    else
        private enum bool some_x86 = false;

    // Since LDC 1.13, using the new ldc.llvmasm.__ir variants instead of inlineIR
    static if (__VERSION__ >= 2083)
    {
        import ldc.llvmasm;
        alias LDCInlineIR = __ir_pure;

        // A version of inline IR with prefix/suffix didn't exist before LDC 1.13
        alias LDCInlineIREx = __irEx_pure;

        enum bool LDC_with_InlineIREx = true;
    }
    else
    {
        alias LDCInlineIR = inlineIR;
        enum bool LDC_with_InlineIREx = false;
    }

    // This is used to disable LDC feature that are expensive at compile time:
    // everything that relies on inline LLVM IR.
    version(D_Optimized)
    {
        enum bool LDC_with_optimizations = true;
    }
    else
    {
        static if (__VERSION__ < 2101)
        {
            // See Issue #136, D_Optimized only appeared in DMDFE 2.101.
            // Relying on this had terrible consequences.
            enum bool LDC_with_optimizations = true;
        }
        else
            enum bool LDC_with_optimizations = false;
    }

    version(ARM)
    {
        public import ldc.gccbuiltins_arm;

        enum LDC_with_ARM32 = true;
        enum LDC_with_ARM64 = false;
        enum LDC_with_ARM64_CRC = false;
        enum LDC_with_SSE = false;
        enum LDC_with_SSE2 = false;
        enum LDC_with_SSE3 = false;
        enum LDC_with_SSSE3 = false;
        enum LDC_with_SSE41 = false;
        enum LDC_with_SSE42 = false;
        enum LDC_with_CRC32 = false;
        enum LDC_with_AVX = false;
        enum LDC_with_F16C = false;
        enum LDC_with_AVX2 = false;
        enum LDC_with_SHA = false;
        enum LDC_with_BMI2 = false;
    }
    else version(AArch64)
    {
        public import ldc.gccbuiltins_aarch64;
        enum LDC_with_ARM32 = false;
        enum LDC_with_ARM64 = true; // implies "has Neon"
        enum LDC_with_ARM64_CRC = __traits(targetHasFeature, "crc");
        enum LDC_with_SSE = false;
        enum LDC_with_SSE2 = false;
        enum LDC_with_SSE3 = false;
        enum LDC_with_SSSE3 = false;
        enum LDC_with_SSE41 = false;
        enum LDC_with_SSE42 = false;
        enum LDC_with_CRC32 = false;
        enum LDC_with_AVX = false;
        enum LDC_with_F16C = false;
        enum LDC_with_AVX2 = false;
        enum LDC_with_SHA = false;
        enum LDC_with_BMI2 = false;
    }
    else static if (some_x86)
    {
        public import ldc.gccbuiltins_x86;

        // Workaround LDC 1.32.0 having NO builtins at all.
        // See LDC issue 4347 https://github.com/ldc-developers/ldc/issues/4347
        enum LDC_with_ia32_builtins = __traits(compiles, __builtin_ia32_clflush); // This one must be available in all of LDC history.

        static if (!LDC_with_ia32_builtins)
        {
            // in case our __builtin_ia32_clflush workaround breaks
            pragma(msg, "Warning: LDC v1.32.0 has no SIMD builtins. intel-intrinsics will use slow path. Please avoid LDC 1.32.0");
        }

        enum LDC_with_ARM32 = false;
        enum LDC_with_ARM64 = false;
        enum LDC_with_ARM64_CRC = false;
        // Every x86 flag is gated on LDC_with_ia32_builtins so that the broken
        // LDC 1.32.0 release falls back to the slow generic path.
        enum LDC_with_SSE = __traits(targetHasFeature, "sse") && LDC_with_ia32_builtins;
        enum LDC_with_SSE2 = __traits(targetHasFeature, "sse2") && LDC_with_ia32_builtins;
        enum LDC_with_SSE3 = __traits(targetHasFeature, "sse3") && LDC_with_ia32_builtins;
        enum LDC_with_SSSE3 = __traits(targetHasFeature, "ssse3") && LDC_with_ia32_builtins;
        enum LDC_with_SSE41 = __traits(targetHasFeature, "sse4.1") && LDC_with_ia32_builtins;
        enum LDC_with_SSE42 = __traits(targetHasFeature, "sse4.2") && LDC_with_ia32_builtins;

        // Since LDC 1.30, crc32 is a separate (and sufficient) attribute from sse4.2
        // As of Jan 2023, GDC doesn't make that distinction, -msse4.2 includes -mcrc32 for GDC.
        static if (__VERSION__ >= 2100)
        {
            enum LDC_with_CRC32 = __traits(targetHasFeature, "crc32") && LDC_with_ia32_builtins;
        }
        else
        {
            enum LDC_with_CRC32 = __traits(targetHasFeature, "sse4.2") && LDC_with_ia32_builtins; // crc32 used to be included in sse4.2
        }

        enum LDC_with_AVX = __traits(targetHasFeature, "avx") && LDC_with_ia32_builtins;
        enum LDC_with_F16C = __traits(targetHasFeature, "f16c") && LDC_with_ia32_builtins;
        enum LDC_with_AVX2 = __traits(targetHasFeature, "avx2") && LDC_with_ia32_builtins;
        enum LDC_with_SHA = __traits(targetHasFeature, "sha") && LDC_with_ia32_builtins;
        enum LDC_with_BMI2 = __traits(targetHasFeature, "bmi2") && LDC_with_ia32_builtins;
    }
    else
    {
        // LDC on a non-x86, non-ARM architecture.
        enum LDC_with_ARM32 = false;
        enum LDC_with_ARM64 = false;
        enum LDC_with_ARM64_CRC = false;
        enum LDC_with_SSE = false;
        enum LDC_with_SSE2 = false;
        enum LDC_with_SSE3 = false;
        enum LDC_with_SSSE3 = false;
        enum LDC_with_SSE41 = false;
        enum LDC_with_SSE42 = false;
        enum LDC_with_CRC32 = false;
        enum LDC_with_AVX = false;
        enum LDC_with_F16C = false;
        enum LDC_with_AVX2 = false;
        enum LDC_with_SHA = false;
        enum LDC_with_BMI2 = false;
    }

    // Should we use inline x86 assembly with DMD syntax, in LDC?
    version(D_InlineAsm_X86)
    {
        enum LDC_with_32b_x86_asm = LDC_with_SSE2; // if no SSE support, disable the x86 asm code path
        enum LDC_with_64b_x86_asm = false;
    }
    else version(D_InlineAsm_X86_64)
    {
        enum LDC_with_32b_x86_asm = false;
        enum LDC_with_64b_x86_asm = LDC_with_SSE2;
    }
    else
    {
        enum LDC_with_32b_x86_asm = false;
        enum LDC_with_64b_x86_asm = false;
    }
}
else
{
    // Not LDC: every LDC_with_* flag is false.
    enum LDC_with_ARM32 = false;
    enum LDC_with_ARM64 = false;
    enum LDC_with_ARM64_CRC = false;
    enum LDC_with_SSE = false;
    enum LDC_with_SSE2 = false;
    enum LDC_with_SSE3 = false;
    enum LDC_with_SSSE3 = false;
    enum LDC_with_SSE41 = false;
    enum LDC_with_SSE42 = false;
    enum LDC_with_CRC32 = false;
    enum LDC_with_AVX = false;
    enum LDC_with_F16C = false;
    enum LDC_with_AVX2 = false;
    enum LDC_with_SHA = false;
    enum LDC_with_BMI2 = false;

    enum LDC_with_InlineIREx = false;
    enum bool LDC_with_optimizations = false;
    enum bool LDC_with_32b_x86_asm = false;
    enum bool LDC_with_64b_x86_asm = false;
}
enum LDC_with_x86_asm = LDC_with_32b_x86_asm || LDC_with_64b_x86_asm;


enum LDC_with_ARM = LDC_with_ARM32 | LDC_with_ARM64;

version(DigitalMars)
{
    version(D_InlineAsm_X86)
        enum DMD_with_asm = true;
    else version(D_InlineAsm_X86_64)
        enum DMD_with_asm = true;
    else
        enum DMD_with_asm = false;

    version(D_InlineAsm_X86)
        enum DMD_with_32bit_asm = DMD_with_asm; // sometimes you want a 32-bit DMD only solution
    else
        enum DMD_with_32bit_asm = false;

    version (D_SIMD)
    {
        enum DMD_with_DSIMD = !SSESizedVectorsAreEmulated;

        // Going further, does DMD has SSE4.1 through -mcpu?
        static if (DMD_with_DSIMD)
            enum bool DMD_with_DSIMD_and_SSE41 = __traits(compiles, int4(0) * int4(0));
        else
            enum bool DMD_with_DSIMD_and_SSE41 = false;

        // No DMD way to detect those instruction sets => pessimize
        // would be cool to have a way to detect support for this at CT
        enum DMD_with_DSIMD_and_SSE3 = DMD_with_DSIMD_and_SSE41;
        enum DMD_with_DSIMD_and_SSSE3 = DMD_with_DSIMD_and_SSE41;

        version(D_AVX)
            enum DMD_with_DSIMD_and_AVX = true;
        else
            enum DMD_with_DSIMD_and_AVX = false;

        version(D_AVX2)
            enum DMD_with_DSIMD_and_AVX2 = true;
        else
            enum DMD_with_DSIMD_and_AVX2 = false;

        enum DMD_with_DSIMD_and_SSE42 = DMD_with_DSIMD_and_AVX;
    }
    else
    {
        enum DMD_with_DSIMD = false;
        enum DMD_with_DSIMD_and_SSE3 = false;
        enum DMD_with_DSIMD_and_SSSE3 = false;
        enum DMD_with_DSIMD_and_SSE41 = false;
        enum DMD_with_DSIMD_and_SSE42 = false;
        enum DMD_with_DSIMD_and_AVX = false;
        enum DMD_with_DSIMD_and_AVX2 = false;
    }
}
else
{
    // Not DMD: every DMD_with_* flag is false.
    enum DMD_with_asm = false;
    enum DMD_with_32bit_asm = false;
    enum DMD_with_DSIMD = false;
    enum DMD_with_DSIMD_and_SSE3 = false;
    enum DMD_with_DSIMD_and_SSSE3 = false;
    enum DMD_with_DSIMD_and_SSE41 = false;
    enum DMD_with_DSIMD_and_SSE42 = false;
    enum DMD_with_DSIMD_and_AVX = false;
    enum DMD_with_DSIMD_and_AVX2 = false;
}


// Sometimes, can be helpful to merge builtin code, however keep in mind that
// LDC and GDC builtins often subtly diverge, wrt. unsigned vs signed vectors,
// return types, purity... test it in Godbolt! this is safer with float and double intrinsics.
// Combined "either compiler has it" flags, used where GDC and LDC builtin
// code paths can be shared. See the caveat comment above.
enum GDC_or_LDC_with_SSE = GDC_with_SSE || LDC_with_SSE;
enum GDC_or_LDC_with_SSE2 = GDC_with_SSE2 || LDC_with_SSE2;
enum GDC_or_LDC_with_SSE3 = GDC_with_SSE3 || LDC_with_SSE3;
enum GDC_or_LDC_with_SSE41 = GDC_with_SSE41 || LDC_with_SSE41;
enum GDC_or_LDC_with_SSE42 = GDC_with_SSE42 || LDC_with_SSE42;

enum GDC_or_LDC_with_AVX = GDC_with_AVX || LDC_with_AVX;
enum GDC_or_LDC_with_F16C = GDC_with_F16C || LDC_with_F16C;
enum GDC_or_LDC_with_AVX2 = GDC_with_AVX2 || LDC_with_AVX2;
enum GDC_or_LDC_with_SHA = GDC_with_SHA || LDC_with_SHA;
enum GDC_or_LDC_with_BMI2 = GDC_with_BMI2 || LDC_with_BMI2;

static if (__VERSION__ >= 2102)
{
    enum SIMD_COMPARISON_MASKS_8B = !MMXSizedVectorsAreEmulated; // can do < <= => > == with builtin 8 bytes __vectors.
    enum SIMD_COMPARISON_MASKS_16B = !SSESizedVectorsAreEmulated; // can do < <= => > == with builtin 16 bytes __vectors.
    enum SIMD_COMPARISON_MASKS_32B = !AVXSizedVectorsAreEmulated; // can do < <= => > == with builtin 32 bytes __vectors.
}
else
{
    enum SIMD_COMPARISON_MASKS_8B = false;
    enum SIMD_COMPARISON_MASKS_16B = false;
    enum SIMD_COMPARISON_MASKS_32B = false;
}


static if (LDC_with_ARM32)
{
    // 32-bit ARM: read/write the FPSCR via GCC-style builtins.
    package uint arm_get_fpcr() nothrow @nogc @trusted
    {
        return __builtin_arm_get_fpscr();
    }

    package void arm_set_fpcr(uint cw) nothrow @nogc @trusted
    {
        __builtin_arm_set_fpscr(cw);
    }
}

static if (LDC_with_ARM64)
{
    pragma(LDC_intrinsic, "llvm.aarch64.get.fpcr")
    long __builtin_aarch64_get_fpcr() pure nothrow @nogc @safe;

    // 64-bit ARM: read FPCR directly with mrs.
    package uint arm_get_fpcr() pure nothrow @nogc @trusted
    {
        // LLVM intrinsic "llvm.aarch64.get.fpcr" seems buggy and doesn't return FPCR
        return __asm!uint("mrs $0, fpcr", "=r");
    }

    package void arm_set_fpcr(uint cw) nothrow @nogc @trusted
    {
        // Note: there doesn't seem to be an intrinsic in LLVM to set FPCR.
        // x2 is manually saved/restored around the msr since the constraint
        // string doesn't declare it clobbered.
        long save_x2;
        __asm!void("str x2, $1 \n" ~
                   "ldr w2, $0 \n" ~
                   "msr fpcr, x2 \n" ~
                   "ldr x2, $1 " , "m,m", cw, &save_x2);
    }
}


// For internal use only, since public API deals with a x86 semantic emulation
// These are the RMode bits (22-23) and FZ bit (24) of the ARM FPCR/FPSCR.
enum uint _MM_ROUND_NEAREST_ARM     = 0x00000000;
enum uint _MM_ROUND_DOWN_ARM        = 0x00800000;
enum uint _MM_ROUND_UP_ARM          = 0x00400000;
enum uint _MM_ROUND_TOWARD_ZERO_ARM = 0x00C00000;
enum uint _MM_ROUND_MASK_ARM        = 0x00C00000;
enum uint _MM_FLUSH_ZERO_MASK_ARM   = 0x01000000;


//
// <ROUNDING>
//
// Why is that there? For DMD, we cannot use rint because _MM_SET_ROUNDING_MODE
// doesn't change the FPU rounding mode, and isn't expected to do so.
// So we devised these rounding function to help having consistent rounding between
// LDC and DMD. It's important that DMD uses whatever is in MXCSR to round.
//
// Note: There is no MXCSR in ARM. But there is fpcr/fpscr that implements similar
// functionality.
// https://developer.arm.com/documentation/dui0068/b/vector-floating-point-programming/vfp-system-registers/fpscr--the-floating-point-status-and-control-register
// We use fpcr/fpscr since it's thread-local, so we can emulate those x86 conversion albeit slowly.

/// Converts `value` to int, rounding according to the current MXCSR rounding
/// mode (or its fpcr/fpscr equivalent on ARM).
int convertFloatToInt32UsingMXCSR(float value) @trusted
{
    int result;
    version(GNU)
    {
        version(X86)
        {
            asm pure nothrow @nogc @trusted
            {
                "cvtss2si %1, %0\n": "=r"(result) : "x" (value);
            }
        }
        else version(X86_64)
        {
            asm pure nothrow @nogc @trusted
            {
                "cvtss2si %1, %0\n": "=r"(result) : "x" (value);
            }
        }
        else
        {
            // BUG: this is truncation instead of MXCSR
            result = cast(int)value;
        }
    }
    else static if (LDC_with_ARM32)
    {
        // vcvtr rounds using the current FPSCR rounding mode.
        result = __asm!int(`vldr s2, $1
                            vcvtr.s32.f32 s2, s2
                            vmov $0, s2`, "=r,m,~{s2}", value);
    }
    else static if (LDC_with_ARM64)
    {
        // Get current rounding mode.
        uint fpscr = arm_get_fpcr();

        // Pick the AArch64 convert intrinsic matching the x86 rounding mode.
        switch(fpscr & _MM_ROUND_MASK_ARM)
        {
            default:
            case _MM_ROUND_NEAREST_ARM:     result = vcvtns_s32_f32(value); break;
            case _MM_ROUND_DOWN_ARM:        result = vcvtms_s32_f32(value); break;
            case _MM_ROUND_UP_ARM:          result = vcvtps_s32_f32(value); break;
            case _MM_ROUND_TOWARD_ZERO_ARM: result = vcvts_s32_f32(value); break;
        }
    }
    else
    {
        // DMD x86/x86_64: cvtss2si honors MXCSR by definition.
        asm pure nothrow @nogc @trusted
        {
            cvtss2si EAX, value;
            mov result, EAX;
        }
    }
    return result;
}

/// Converts `value` to int, rounding according to the current MXCSR rounding
/// mode (or its fpcr/fpscr equivalent on ARM).
int convertDoubleToInt32UsingMXCSR(double value) @trusted
{
    int result;
    version(GNU)
    {
        version(X86)
        {
            asm pure nothrow @nogc @trusted
            {
                "cvtsd2si %1, %0\n": "=r"(result) : "x" (value);
            }
        }
        else version(X86_64)
        {
            asm pure nothrow @nogc @trusted
            {
                "cvtsd2si %1, %0\n": "=r"(result) : "x" (value);
            }
        }
        else
        {
            // BUG: this is truncation instead of MXCSR
            result = cast(int)value;
        }
    }
    else static if (LDC_with_ARM32)
    {
        // vcvtr rounds using the current FPSCR rounding mode.
        result = __asm!int(`vldr d2, $1
                            vcvtr.s32.f64 s2, d2
                            vmov $0, s2`, "=r,m,~{s2},~{d2}", value);
    }
    else static if (LDC_with_ARM64)
    {
        // Get current rounding mode.
        uint fpscr = arm_get_fpcr();

        switch(fpscr & _MM_ROUND_MASK_ARM)
        {
            default:
            case _MM_ROUND_NEAREST_ARM:     result = vcvtns_s32_f64(value); break;
            case _MM_ROUND_DOWN_ARM:        result = vcvtms_s32_f64(value); break;
            case _MM_ROUND_UP_ARM:          result = vcvtps_s32_f64(value); break;
            case _MM_ROUND_TOWARD_ZERO_ARM: result = vcvts_s32_f64(value); break;
        }
    }
    else
    {
        // DMD x86/x86_64: cvtsd2si honors MXCSR by definition.
        asm pure nothrow @nogc @trusted
        {
            cvtsd2si EAX, value;
            mov result, EAX;
        }
    }
    return result;
}

/// Converts `value` to long, rounding according to the current MXCSR rounding
/// mode (or its fpcr/fpscr equivalent on ARM).
long convertFloatToInt64UsingMXCSR(float value) @trusted
{
    static if (LDC_with_ARM32)
    {
        // We have to resort to libc since 32-bit ARM
        // doesn't seem to have 64-bit registers.

        uint fpscr = arm_get_fpcr(); // Get current rounding mode.

        // Note: converting to double precision else rounding could be different for large integers
        double asDouble = value;

        switch(fpscr & _MM_ROUND_MASK_ARM)
        {
            default:
            case _MM_ROUND_NEAREST_ARM:     return cast(long)(llvm_round(asDouble));
            case _MM_ROUND_DOWN_ARM:        return cast(long)(llvm_floor(asDouble));
            case _MM_ROUND_UP_ARM:          return cast(long)(llvm_ceil(asDouble));
            case _MM_ROUND_TOWARD_ZERO_ARM: return cast(long)(asDouble);
        }
    }
    else static if (LDC_with_ARM64)
    {
        uint fpscr = arm_get_fpcr();

        switch(fpscr & _MM_ROUND_MASK_ARM)
        {
            default:
            case _MM_ROUND_NEAREST_ARM:     return vcvtns_s64_f32(value);
            case _MM_ROUND_DOWN_ARM:        return vcvtms_s64_f32(value);
            case _MM_ROUND_UP_ARM:          return vcvtps_s64_f32(value);
            case _MM_ROUND_TOWARD_ZERO_ARM: return vcvts_s64_f32(value);
        }
    }
    // 64-bit can use an SSE instruction
    else version(D_InlineAsm_X86_64)
    {
        long result;
        version(LDC) // work-around for " Data definition directives inside inline asm are not supported yet."
        {
            asm pure nothrow @nogc @trusted
            {
                movss XMM0, value;
                cvtss2si RAX, XMM0;
                mov result, RAX;
            }
        }
        else
        {
            asm pure nothrow @nogc @trusted
            {
                movss XMM0, value;
                db 0xf3; db 0x48; db 0x0f; db 0x2d; db 0xc0; // cvtss2si RAX, XMM0 (DMD refuses to emit)
                mov result, RAX;
            }
        }
        return result;
    }
    else version(D_InlineAsm_X86)
    {
        // In the case of 32-bit x86 there is no SSE2 way to convert FP to 64-bit int
        // This leads to an unfortunate FPU sequence in every C++ compiler.
        // See: https://godbolt.org/z/vZym77

        // Get current MXCSR rounding
        uint sseRounding;
        ushort savedFPUCW;
        ushort newFPUCW;
        long result;
        asm pure nothrow @nogc @trusted
        {
            stmxcsr sseRounding;
            fld value;
            fnstcw savedFPUCW;
            mov AX, savedFPUCW;
            and AX, 0xf3ff; // clear FPU rounding bits
            movzx ECX, word ptr sseRounding;
            and ECX, 0x6000; // only keep SSE rounding bits
            shr ECX, 3;
            or AX, CX; // make a new control word for FPU with SSE bits
            mov newFPUCW, AX;
            fldcw newFPUCW;
            fistp qword ptr result; // convert, respecting MXCSR (but not other control word things)
            fldcw savedFPUCW;
        }
        return result;
    }
    else static if (GDC_with_x86)
    {
        version(X86_64) // 64-bit can just use the right instruction
        {
            static assert(GDC_with_SSE);
            __m128 A;
            A.ptr[0] = value;
            return __builtin_ia32_cvtss2si64 (A);
        }
        else version(X86) // 32-bit
        {
            // This is untested!
            // Same FPU control-word dance as the DMD D_InlineAsm_X86 path above,
            // expressed as GCC extended asm.
            uint sseRounding;
            ushort savedFPUCW;
            ushort newFPUCW;
            long result;
            asm pure nothrow @nogc @trusted
            {
                "stmxcsr %1;\n" ~
                "fld %2;\n" ~
                "fnstcw %3;\n" ~
                "movw %3, %%ax;\n" ~
                "andw $0xf3ff, %%ax;\n" ~
                "movzwl %1, %%ecx;\n" ~
                "andl $0x6000, %%ecx;\n" ~
                "shrl $3, %%ecx;\n" ~
                "orw %%cx, %%ax\n" ~
                "movw %%ax, %4;\n" ~
                "fldcw %4;\n" ~
                "fistpll %0;\n" ~
                "fldcw %3;\n"
                : "=m"(result) // %0
                : "m" (sseRounding),
                  "f" (value),
                  "m" (savedFPUCW),
                  "m" (newFPUCW)
                : "eax", "ecx", "st";
            }
            return result;
        }
        else
            static assert(false);
    }
    else
    {
        // BUG
        // This is a last result and wrong, typically
        // for GDC architectures we don't yet support
        return cast(long)value;
    }
}


///ditto
long convertDoubleToInt64UsingMXCSR(double value) @trusted
{
    static if (LDC_with_ARM32)
    {
        // We have to resort to libc since 32-bit ARM
        // doesn't seem to have 64-bit registers.
        uint fpscr = arm_get_fpcr(); // Get current rounding mode.
        switch(fpscr & _MM_ROUND_MASK_ARM)
        {
            default:
            case _MM_ROUND_NEAREST_ARM:     return cast(long)(llvm_round(value));
            case _MM_ROUND_DOWN_ARM:        return cast(long)(llvm_floor(value));
            case _MM_ROUND_UP_ARM:          return cast(long)(llvm_ceil(value));
            case _MM_ROUND_TOWARD_ZERO_ARM: return cast(long)(value);
        }
    }
    else static if (LDC_with_ARM64)
    {
        // Get current rounding mode.
        uint fpscr = arm_get_fpcr();

        switch(fpscr & _MM_ROUND_MASK_ARM)
        {
            default:
            case _MM_ROUND_NEAREST_ARM:     return vcvtns_s64_f64(value);
            case _MM_ROUND_DOWN_ARM:        return vcvtms_s64_f64(value);
            case _MM_ROUND_UP_ARM:          return vcvtps_s64_f64(value);
            case _MM_ROUND_TOWARD_ZERO_ARM: return vcvts_s64_f64(value);
        }
    }
    // 64-bit can use an SSE instruction
    else version(D_InlineAsm_X86_64)
    {
        long result;
        version(LDC) // work-around for "Data definition directives inside inline asm are not supported yet."
        {
            asm pure nothrow @nogc @trusted
            {
                movsd XMM0, value;
                cvtsd2si RAX, XMM0;
                mov result, RAX;
            }
        }
        else
        {
            asm pure nothrow @nogc @trusted
            {
                movsd XMM0, value;
                db 0xf2; db 0x48; db 0x0f; db 0x2d; db 0xc0; // cvtsd2si RAX, XMM0 (DMD refuses to emit)
                mov result, RAX;
            }
        }
        return result;
    }
    else version(D_InlineAsm_X86)
    {
        // In the case of 32-bit x86 there is no SSE2 way to convert FP to 64-bit int
        // This leads to an unfortunate FPU sequence in every C++ compiler.
        // See: https://godbolt.org/z/vZym77

        // Get current MXCSR rounding
        uint sseRounding;
        ushort savedFPUCW;
        ushort newFPUCW;
        long result;
        asm pure nothrow @nogc @trusted
        {
            stmxcsr sseRounding;
            fld value;
            fnstcw savedFPUCW;
            mov AX, savedFPUCW;
            and AX, 0xf3ff;
            movzx ECX, word ptr sseRounding;
            and ECX, 0x6000;
            shr ECX, 3;
            or AX, CX;
            mov newFPUCW, AX;
            fldcw newFPUCW;
            // NOTE(review): the float variant writes `fistp qword ptr result`;
            // DMD should infer qword from the long operand here — confirm.
            fistp result;
            fldcw savedFPUCW;
        }
        return result;
    }
    else static if (GDC_with_x86)
    {
        version(X86_64)
        {
            static assert(GDC_with_SSE2);
            __m128d A;
            A.ptr[0] = value;
            return __builtin_ia32_cvtsd2si64 (A);
        }
        else
        {
            // This is untested!
            // Same FPU control-word dance as the DMD D_InlineAsm_X86 path above,
            // expressed as GCC extended asm ("t" = top of x87 stack).
            uint sseRounding;
            ushort savedFPUCW;
            ushort newFPUCW;
            long result;
            asm pure nothrow @nogc @trusted
            {
                "stmxcsr %1;\n" ~
                "fld %2;\n" ~
                "fnstcw %3;\n" ~
                "movw %3, %%ax;\n" ~
                "andw $0xf3ff, %%ax;\n" ~
                "movzwl %1, %%ecx;\n" ~
                "andl $0x6000, %%ecx;\n" ~
                "shrl $3, %%ecx;\n" ~
                "orw %%cx, %%ax\n" ~
                "movw %%ax, %4;\n" ~
                "fldcw %4;\n" ~
                "fistpll %0;\n" ~
                "fldcw %3;\n"
                : "=m"(result) // %0
                : "m" (sseRounding),
                  "t" (value),
                  "m" (savedFPUCW),
                  "m" (newFPUCW)
                : "eax", "ecx", "st";
            }
            return result;
        }
    }
    else
    {
        // BUG
        // This is a last result and wrong, typically
        // for GDC architectures we don't yet support
        return cast(long)value;
    }
}

//
// </ROUNDING>
//


// using the Intel terminology here

/// Clamps a 16-bit value to the signed 8-bit range [-128, 127].
byte saturateSignedWordToSignedByte(short value) pure @safe
{
    if (value > 127) value = 127;
    if (value < -128) value = -128;
    return cast(byte) value;
}

/// Clamps a 16-bit value to the unsigned 8-bit range [0, 255].
ubyte saturateSignedWordToUnsignedByte(short value) pure @safe
{
    if (value > 255) value = 255;
    if (value < 0) value = 0;
    return cast(ubyte) value;
}

/// Clamps a 32-bit value to the signed 16-bit range [-32768, 32767].
short saturateSignedIntToSignedShort(int value) pure @safe
{
    if (value > 32767) value = 32767;
    if (value < -32768) value = -32768;
    return cast(short) value;
}

/// Clamps a 32-bit value to the unsigned 16-bit range [0, 65535].
ushort saturateSignedIntToUnsignedShort(int value) pure @safe
{
    if (value > 65535) value = 65535;
    if (value < 0) value = 0;
    return cast(ushort) value;
}

unittest // test saturate operations
{
    assert( saturateSignedWordToSignedByte(32000) == 127);
    assert( saturateSignedWordToUnsignedByte(32000) == 255);
    assert( saturateSignedWordToSignedByte(-4000) == -128);
    assert( saturateSignedWordToUnsignedByte(-4000) == 0);
    assert( saturateSignedIntToSignedShort(32768) == 32767);
    assert( saturateSignedIntToUnsignedShort(32768) == 32768);
    assert( saturateSignedIntToSignedShort(-32769) == -32768);
    assert( saturateSignedIntToUnsignedShort(-32769) == 0);
}

version(unittest)
{
    // This is just for debugging tests
    import core.stdc.stdio: printf;

    // printing vectors for implementation
    // Note: you can override `pure` within a `debug` clause

    /// Prints a __m64 as one 64-bit signed lane.
    void _mm_print_pi64(__m64 v) @trusted
    {
        long1 vl = cast(long1)v;
        printf("%lld\n", vl.array[0]);
    }

    /// Prints a __m64 as two 32-bit signed lanes.
    void _mm_print_pi32(__m64 v) @trusted
    {
        int[2] C = (cast(int2)v).array;
        printf("%d %d\n", C[0], C[1]);
    }

    /// Prints a __m64 as four 16-bit signed lanes.
    void _mm_print_pi16(__m64 v) @trusted
    {
        short[4] C = (cast(short4)v).array;
        printf("%d %d %d %d\n", C[0], C[1], C[2], C[3]);
    }

    /// Prints a __m64 as eight 8-bit signed lanes.
    void _mm_print_pi8(__m64 v) @trusted
    {
        byte[8] C = (cast(byte8)v).array;
        printf("%d %d %d %d %d %d %d %d\n",
               C[0], C[1], C[2], C[3], C[4], C[5], C[6], C[7]);
    }

    /// Prints a __m128i as two 64-bit signed lanes.
    void _mm_print_epi64(__m128i v) @trusted
    {
        long2 vl = cast(long2)v;
        printf("%lld %lld\n", vl.array[0], vl.array[1]);
    }

    /// Prints a __m128i as four 32-bit signed lanes.
    void _mm_print_epi32(__m128i v) @trusted
    {
        printf("%d %d %d %d\n",
               v.array[0], v.array[1], v.array[2], v.array[3]);
    }

    /// Prints a short8 as eight 16-bit signed lanes.
    void _mm_print_short8(short8 v) @trusted
    {
        printf("%d %d %d %d %d %d %d %d\n",
               v.array[0], v.array[1], v.array[2], v.array[3],
               v.array[4], v.array[5], v.array[6], v.array[7]);
    }

    /// Prints a __m128i as eight 16-bit signed lanes.
    void _mm_print_epi16(__m128i v) @trusted
    {
        short[8] C = (cast(short8)v).array;
        printf("%d %d %d %d %d %d %d %d\n",
               C[0], C[1], C[2], C[3], C[4], C[5], C[6], C[7]);
    }

    /// Prints a __m128i as sixteen 8-bit signed lanes.
    void _mm_print_epi8(__m128i v) @trusted
    {
        byte[16] C = (cast(byte16)v).array;
        printf("%d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d\n",
               C[0], C[1], C[2], C[3], C[4], C[5], C[6], C[7], C[8], C[9], C[10], C[11], C[12], C[13], C[14], C[15]);
    }

    /// Prints a __m128 as four float lanes.
    void _mm_print_ps(__m128 v) @trusted
    {
        // %g because %f can conceal very small numbers and prints zero instead
        float[4] C = (cast(float4)v).array;
        printf("%g %g %g %g\n", C[0], C[1], C[2], C[3]);
    }

    /// Prints a __m128d as two double lanes.
    void _mm_print_pd(__m128d v) @trusted
    {
        double[2] C = (cast(double2)v).array;
        printf("%f %f\n", C[0], C[1]);
    }

    /// Prints a __m256d as four double lanes.
    void _mm256_print_pd(__m256d v) @trusted
    {
        // %g because %f can conceal very small numbers and prints zero instead
        printf("%g %g %g %g\n", v.array[0], v.array[1], v.array[2], v.array[3]);
    }

    /// Prints a __m256 as eight float lanes.
    void _mm256_print_ps(__m256 v) @trusted
    {
        // %g because %f can conceal very small numbers and prints zero instead
        printf("%g %g %g %g %g %g %g %g\n",
               v.array[0], v.array[1], v.array[2], v.array[3],
               v.array[4], v.array[5], v.array[6], v.array[7]);
    }

    /// Prints a __m256i as sixteen 16-bit signed lanes.
    void _mm256_print_epi16(__m256i v) @trusted
    {
        short16 vl = cast(short16)v;
        printf("%d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d\n",
               vl.array[0], vl.array[1], vl.array[2], vl.array[3],
               vl.array[4], vl.array[5], vl.array[6], vl.array[7],
               vl.array[8], vl.array[9], vl.array[10], vl.array[11],
               vl.array[12], vl.array[13], vl.array[14], vl.array[15]);
    }

    /// Prints a __m256i as eight 32-bit signed lanes.
    void _mm256_print_epi32(__m256i v) @trusted
    {
        int8 vl = cast(int8)v;
        printf("%d %d %d %d %d %d %d %d\n", vl.array[0], vl.array[1], vl.array[2], vl.array[3],
               vl.array[4], vl.array[5], vl.array[6], vl.array[7]);
    }

    /// Prints a __m256i as four 64-bit signed lanes.
    void _mm256_print_epi64(__m256i v) @trusted
    {
        long4 vl = cast(long4)v;
        printf("%lld %lld %lld %lld\n", vl.array[0], vl.array[1], vl.array[2], vl.array[3]);
    }

    /// Prints a __m256i as thirty-two 8-bit signed lanes.
    void _mm256_print_epi8(__m256i v) @trusted
    {
        byte[32] C = (cast(byte32)v).array;
        printf("%d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d\n",
               C[0], C[1], C[2], C[3], C[4], C[5], C[6], C[7],
               C[8], C[9], C[10], C[11], C[12], C[13], C[14], C[15],
               C[16], C[17], C[18], C[19], C[20], C[21], C[22], C[23],
               C[24], C[25], C[26], C[27], C[28], C[29], C[30], C[31]);

    }
}


//
// <FLOATING-POINT COMPARISONS>
//
// Note: `ldc.simd` cannot express all nuances of FP comparisons, so we
// need different IR generation.

// A floating-point comparison predicate, modeled on LLVM `fcmp` condition
// codes. Prefix "o" = ordered (true only if neither operand is NaN),
// prefix "u" = unordered (also true if at least one operand is NaN).
enum FPComparison
{
    false_,// always false
    oeq, // ordered and equal
    ogt, // ordered and greater than
    oge, // ordered and greater than or equal
    olt, // ordered and less than
    ole, // ordered and less than or equal
    one, // ordered and not equal
    ord, // ordered (no nans)
    ueq, // unordered or equal
    ugt, // unordered or greater than ("nle")
    uge, // unordered or greater than or equal ("nlt")
    ult, // unordered or less than ("nge")
    ule, // unordered or less than or equal ("ngt")
    une, // unordered or not equal ("neq")
    uno, // unordered (either nans)
    true_, // always true
}

// Spelling of each FPComparison as an LLVM `fcmp` condition code.
// Order must match the enum above; used to build inline IR strings below.
private static immutable string[FPComparison.max+1] FPComparisonToString =
[
    "false",
    "oeq",
    "ogt",
    "oge",
    "olt",
    "ole",
    "one",
    "ord",
    "ueq",
    "ugt",
    "uge",
    "ult",
    "ule",
    "une",
    "uno",
    "true"
];

// Maps an AVX `_mm_cmp_*`-style immediate (its low 4 bits) to FPComparison.
FPComparison mapAVXFPComparison(int imm8) pure @safe
{
    // Always map on non-signalling
    static immutable FPComparison[16] mapping =
    [
        FPComparison.oeq, // _CMP_EQ_OQ
        FPComparison.olt, // _CMP_LT_OS
        FPComparison.ole, // _CMP_LE_OS
        FPComparison.uno, // _CMP_UNORD_Q
        FPComparison.une, // _CMP_NEQ_UQ (not-equal OR unordered)
        FPComparison.uge, // _CMP_NLT_US
        FPComparison.ugt, // _CMP_NLE_US
        FPComparison.ord, // _CMP_ORD_Q
        FPComparison.ueq, // _CMP_EQ_UQ
        FPComparison.ult, // _CMP_NGE_US
        FPComparison.ule, // _CMP_NGT_US
        FPComparison.false_,// _CMP_FALSE_OQ
        FPComparison.one, // _CMP_NEQ_OQ
        FPComparison.oge, // _CMP_GE_OS
        FPComparison.ogt, // _CMP_GT_OS
        FPComparison.true_ // _CMP_TRUE_UQ
    ];

    return mapping[imm8 & 0x0f]; // note: signalling NaN information is mixed up
}

// Individual scalar comparison: returns true/false for one predicate applied
// to (a, b). Callers turn the bool into a -1/0 lane mask. Useful for DMD and
// for testing the IR-based paths.
private bool compareFloat(T)(FPComparison comparison, T a, T b) pure @safe
{
    bool unordered = isnan(a) || isnan(b);
    final switch(comparison) with(FPComparison)
    {
        case false_: return false;
        case oeq: return a == b;
        case ogt: return a > b;
        case oge: return a >= b;
        case olt: return a < b;
        case ole: return a <= b;
        case one: return !unordered && (a != b); // NaN with != always yields true
        case ord: return !unordered;
        case ueq: return unordered || (a == b);
        case ugt: return unordered || (a > b);
        case uge: return unordered || (a >= b);
        case ult: return unordered || (a < b);
        case ule: return unordered || (a <= b);
        case une: return (a != b); // NaN with != always yields true
        case uno: return unordered;
        case true_: return true;
    }
}

static if (LDC_with_optimizations) // this saves time for bigger projects, since LDCInlineIR gets more expensive there.
{
    /// Provides packed float comparisons.
    /// Returns a -1/0 lane mask (all-ones where the predicate holds).
    package int4 cmpps(FPComparison comparison)(float4 a, float4 b) pure @safe
    {
        enum ir = `
            %cmp = fcmp `~ FPComparisonToString[comparison] ~` <4 x float> %0, %1
            %r = sext <4 x i1> %cmp to <4 x i32>
            ret <4 x i32> %r`;

        return LDCInlineIR!(ir, int4, float4, float4)(a, b);
    }

    ///ditto
    package int8 cmpps256(FPComparison comparison)(float8 a, float8 b) pure @safe
    {
        enum ir = `
            %cmp = fcmp `~ FPComparisonToString[comparison] ~` <8 x float> %0, %1
            %r = sext <8 x i1> %cmp to <8 x i32>
            ret <8 x i32> %r`;
        return LDCInlineIR!(ir, int8, float8, float8)(a, b);
    }

    /// Provides packed double comparisons.
    /// Returns a -1/0 lane mask (all-ones where the predicate holds).
    package long2 cmppd(FPComparison comparison)(double2 a, double2 b) pure @safe
    {
        enum ir = `
            %cmp = fcmp `~ FPComparisonToString[comparison] ~` <2 x double> %0, %1
            %r = sext <2 x i1> %cmp to <2 x i64>
            ret <2 x i64> %r`;

        return LDCInlineIR!(ir, long2, double2, double2)(a, b);
    }

    ///ditto
    package long4 cmppd256(FPComparison comparison)(double4 a, double4 b) pure @safe
    {
        enum ir = `
            %cmp = fcmp `~ FPComparisonToString[comparison] ~` <4 x double> %0, %1
            %r = sext <4 x i1> %cmp to <4 x i64>
            ret <4 x i64> %r`;
        return LDCInlineIR!(ir, long4, double4, double4)(a, b);
    }

    /// CMPSS-style comparisons: compares only lane 0, passes lanes 1-3 of `a` through.
    /// clang implements it through x86 intrinsics; it is possible with IR alone
    /// but leads to less optimal code.
    /// PERF: try to implement it with __builtin_ia32_cmpss and immediate 0 to 7.
    /// Not that simple.
    package float4 cmpss(FPComparison comparison)(float4 a, float4 b) pure @safe
    {
        /*
        enum ubyte predicateNumber = FPComparisonToX86Predicate[comparison];
        enum bool invertOp = (predicateNumber & 0x80) != 0;
        static if(invertOp)
            return __builtin_ia32_cmpsd(b, a, predicateNumber & 0x7f);
        else
            return __builtin_ia32_cmpsd(a, b, predicateNumber & 0x7f);
        */
        enum ir = `
            %cmp = fcmp `~ FPComparisonToString[comparison] ~` float %0, %1
            %r = sext i1 %cmp to i32
            %r2 = bitcast i32 %r to float
            ret float %r2`;

        float4 r = a;
        r[0] = LDCInlineIR!(ir, float, float, float)(a[0], b[0]);
        return r;
    }

    /// CMPSD-style comparisons: compares only lane 0, passes lane 1 of `a` through.
    /// clang implements it through x86 intrinsics; it is possible with IR alone
    /// but leads to less optimal code.
    /// PERF: try to implement it with __builtin_ia32_cmpsd and immediate 0 to 7.
    /// Not that simple.
    package double2 cmpsd(FPComparison comparison)(double2 a, double2 b) pure @safe
    {
        enum ir = `
            %cmp = fcmp `~ FPComparisonToString[comparison] ~` double %0, %1
            %r = sext i1 %cmp to i64
            %r2 = bitcast i64 %r to double
            ret double %r2`;

        double2 r = a;
        r[0] = LDCInlineIR!(ir, double, double, double)(a[0], b[0]);
        return r;
    }
}
else
{
    // Portable fallback: per-lane scalar comparison via compareFloat.

    /// Provides packed float comparisons
    package int4 cmpps(FPComparison comparison)(float4 a, float4 b) pure @trusted
    {
        int4 result;
        foreach(i; 0..4)
        {
            result.ptr[i] = compareFloat!float(comparison, a.array[i], b.array[i]) ? -1 : 0;
        }
        return result;
    }
    ///ditto
    package int8 cmpps256(FPComparison comparison)(float8 a, float8 b) pure @trusted
    {
        int8 result;
        foreach(i; 0..8)
        {
            result.ptr[i] = compareFloat!float(comparison, a.array[i], b.array[i]) ? -1 : 0;
        }
        return result;
    }

    /// Provides packed double comparisons
    package long2 cmppd(FPComparison comparison)(double2 a, double2 b) pure @trusted
    {
        long2 result;
        foreach(i; 0..2)
        {
            result.ptr[i] = compareFloat!double(comparison, a.array[i], b.array[i]) ? -1 : 0;
        }
        return result;
    }
    ///ditto
    package long4 cmppd256(FPComparison comparison)(double4 a, double4 b) pure @trusted
    {
        long4 result;
        foreach(i; 0..4)
        {
            result.ptr[i] = compareFloat!double(comparison, a.array[i], b.array[i]) ? -1 : 0;
        }
        return result;
    }

    /// Provides CMPSS-style comparison (lane 0 only, upper lanes of `a` preserved)
    package float4 cmpss(FPComparison comparison)(float4 a, float4 b) pure @trusted
    {
        int4 result = cast(int4)a;
        result.ptr[0] = compareFloat!float(comparison, a.array[0], b.array[0]) ? -1 : 0;
        return cast(float4)result;
    }

    /// Provides CMPSD-style comparison (lane 0 only, upper lane of `a` preserved)
    package double2 cmpsd(FPComparison comparison)(double2 a, double2 b) pure @trusted
    {
        long2 result = cast(long2)a;
        result.ptr[0] = compareFloat!double(comparison, a.array[0], b.array[0]) ? -1 : 0;
        return cast(double2)result;
    }
}
unittest // cmpps
{
    // Check all comparison type is working
    float4 A = [1, 3, 5, float.nan];
    float4 B = [2, 3, 4, 5];

    int4 result_oeq = cmpps!(FPComparison.oeq)(A, B);
    int4 result_ogt = cmpps!(FPComparison.ogt)(A, B);
    int4 result_oge = cmpps!(FPComparison.oge)(A, B);
    int4 result_olt = cmpps!(FPComparison.olt)(A, B);
    int4 result_ole = cmpps!(FPComparison.ole)(A, B);
    int4 result_one = cmpps!(FPComparison.one)(A, B);
    int4 result_ord = cmpps!(FPComparison.ord)(A, B);
    int4 result_ueq = cmpps!(FPComparison.ueq)(A, B);
    int4 result_ugt = cmpps!(FPComparison.ugt)(A, B);
    int4 result_uge = cmpps!(FPComparison.uge)(A, B);
    int4 result_ult = cmpps!(FPComparison.ult)(A, B);
    int4 result_ule = cmpps!(FPComparison.ule)(A, B);
    int4 result_une = cmpps!(FPComparison.une)(A, B);
    int4 result_uno = cmpps!(FPComparison.uno)(A, B);

    static immutable int[4] correct_oeq = [ 0,-1, 0, 0];
    static immutable int[4] correct_ogt = [ 0, 0,-1, 0];
    static immutable int[4] correct_oge = [ 0,-1,-1, 0];
    static immutable int[4] correct_olt = [-1, 0, 0, 0];
    static immutable int[4] correct_ole = [-1,-1, 0, 0];
    static immutable int[4] correct_one = [-1, 0,-1, 0];
    static immutable int[4] correct_ord = [-1,-1,-1, 0];
    static immutable int[4] correct_ueq = [ 0,-1, 0,-1];
    static immutable int[4] correct_ugt = [ 0, 0,-1,-1];
    static immutable int[4] correct_uge = [ 0,-1,-1,-1];
    static immutable int[4] correct_ult = [-1, 0, 0,-1];
    static immutable int[4] correct_ule = [-1,-1, 0,-1];
    static immutable int[4] correct_une = [-1, 0,-1,-1];
    static immutable int[4] correct_uno = [ 0, 0, 0,-1];

    assert(result_oeq.array == correct_oeq);
    assert(result_ogt.array == correct_ogt);
    assert(result_oge.array == correct_oge);
    assert(result_olt.array == correct_olt);

    assert(result_ole.array == correct_ole);
    assert(result_one.array == correct_one);
    assert(result_ord.array == correct_ord);
    assert(result_ueq.array == correct_ueq);
    assert(result_ugt.array == correct_ugt);
    assert(result_uge.array == correct_uge);
    assert(result_ult.array == correct_ult);
    assert(result_ule.array == correct_ule);
    assert(result_une.array == correct_une);
    assert(result_uno.array == correct_uno);
}
unittest // cmppd
{
    double2 a = [1, 3];
    double2 b = [2, 3];
    long2 c = cmppd!(FPComparison.ult)(a, b);
    static immutable long[2] correct = [cast(long)(-1), 0];
    assert(c.array == correct);
}
unittest // cmpss
{
    // Checks lane 0 against the scalar reference (compareFloat) and that
    // lanes 1-3 of `A` pass through untouched.
    void testComparison(FPComparison comparison)(float4 A, float4 B)
    {
        float4 result = cmpss!comparison(A, B);
        int4 iresult = cast(int4)result;
        int expected = compareFloat!float(comparison, A.array[0], B.array[0]) ? -1 : 0;
        assert(iresult.array[0] == expected);
        assert(result.array[1] == A.array[1]);
        assert(result.array[2] == A.array[2]);
        assert(result.array[3] == A.array[3]);
    }

    // Check all comparison type is working
    float4 A = [1, 3, 5, 6];
    float4 B = [2, 3, 4, 5];
    float4 C = [float.nan, 3, 4, 5];

    testComparison!(FPComparison.oeq)(A, B);
    testComparison!(FPComparison.oeq)(A, C);
    testComparison!(FPComparison.ogt)(A, B);
    testComparison!(FPComparison.ogt)(A, C);
    testComparison!(FPComparison.oge)(A, B);
    testComparison!(FPComparison.oge)(A, C);
    testComparison!(FPComparison.olt)(A, B);
    testComparison!(FPComparison.olt)(A, C);
    testComparison!(FPComparison.ole)(A, B);
    testComparison!(FPComparison.ole)(A, C);
    testComparison!(FPComparison.one)(A, B);
    testComparison!(FPComparison.one)(A, C);
    testComparison!(FPComparison.ord)(A, B);
    testComparison!(FPComparison.ord)(A, C);
    testComparison!(FPComparison.ueq)(A, B);
    testComparison!(FPComparison.ueq)(A, C);
    testComparison!(FPComparison.ugt)(A, B);
    testComparison!(FPComparison.ugt)(A, C);
    testComparison!(FPComparison.uge)(A, B);
    testComparison!(FPComparison.uge)(A, C);
    testComparison!(FPComparison.ult)(A, B);
    testComparison!(FPComparison.ult)(A, C);
    testComparison!(FPComparison.ule)(A, B);
    testComparison!(FPComparison.ule)(A, C);
    testComparison!(FPComparison.une)(A, B);
    testComparison!(FPComparison.une)(A, C);
    testComparison!(FPComparison.uno)(A, B);
    testComparison!(FPComparison.uno)(A, C);
}
unittest // cmpsd
{
    // Checks lane 0 against the scalar reference (compareFloat) and that
    // lane 1 of `A` passes through untouched.
    void testComparison(FPComparison comparison)(double2 A, double2 B)
    {
        double2 result = cmpsd!comparison(A, B);
        long2 iresult = cast(long2)result;
        long expected = compareFloat!double(comparison, A.array[0], B.array[0]) ? -1 : 0;
        assert(iresult.array[0] == expected);
        assert(result.array[1] == A.array[1]);
    }

    // Check all comparison type is working
    double2 A = [1, 3];
    double2 B = [2, 4];
    double2 C = [double.nan, 5];

    testComparison!(FPComparison.oeq)(A, B);
    testComparison!(FPComparison.oeq)(A, C);
    testComparison!(FPComparison.ogt)(A, B);
    testComparison!(FPComparison.ogt)(A, C);
    testComparison!(FPComparison.oge)(A, B);
    testComparison!(FPComparison.oge)(A, C);
    testComparison!(FPComparison.olt)(A, B);
    testComparison!(FPComparison.olt)(A, C);
    testComparison!(FPComparison.ole)(A, B);
    testComparison!(FPComparison.ole)(A, C);
    testComparison!(FPComparison.one)(A, B);
    testComparison!(FPComparison.one)(A, C);
    testComparison!(FPComparison.ord)(A, B);
    testComparison!(FPComparison.ord)(A, C);
    testComparison!(FPComparison.ueq)(A, B);
    testComparison!(FPComparison.ueq)(A, C);
    testComparison!(FPComparison.ugt)(A, B);
    testComparison!(FPComparison.ugt)(A, C);
    testComparison!(FPComparison.uge)(A, B);

    testComparison!(FPComparison.uge)(A, C);
    testComparison!(FPComparison.ult)(A, B);
    testComparison!(FPComparison.ult)(A, C);
    testComparison!(FPComparison.ule)(A, B);
    testComparison!(FPComparison.ule)(A, C);
    testComparison!(FPComparison.une)(A, B);
    testComparison!(FPComparison.une)(A, C);
    testComparison!(FPComparison.uno)(A, B);
    testComparison!(FPComparison.uno)(A, C);
}

//
// </FLOATING-POINT COMPARISONS>
//


/// Truncates a 128-bit vector to its low 64 bits, as an `__m64`.
__m64 to_m64(__m128i a) pure @trusted
{
    long2 la = cast(long2)a;
    long1 r = la.array[0];
    return r;
}

/// Zero-extends an `__m64` into the low half of an `__m128i`.
__m128i to_m128i(__m64 a) pure @trusted
{
    /* Not sufficient to avoid https://issues.dlang.org/show_bug.cgi?id=21474

    version(DigitalMars) // Workaround for https://issues.dlang.org/show_bug.cgi?id=21474
    {
        long2 r = a.array[0];
        r.ptr[1] = 0;
        return cast(int4)r;
    }
    else */
    {
        long2 r = [0, 0];
        r.ptr[0] = a.array[0];
        return cast(__m128i)r;
    }
}


// ADDITIONAL LLVM INTRINSICS
// Basically LDC didn't add them yet
version(LDC)
{
    static if (__VERSION__ >= 2097) // LDC 1.27+
    {
        // llvm.abs: integer absolute value; `attrib` selects poison-on-INT_MIN.
        pragma(LDC_intrinsic, "llvm.abs.i#")
            T inteli_llvm_abs(T)(T val, bool attrib);
    }

    static if (__VERSION__ >= 2092) // LDC 1.22+
    {
        // Saturating integer add/sub, signed and unsigned variants.
        pragma(LDC_intrinsic, "llvm.sadd.sat.i#")
            T inteli_llvm_adds(T)(T a, T b) pure @safe;
        pragma(LDC_intrinsic, "llvm.ssub.sat.i#")
            T inteli_llvm_subs(T)(T a, T b) pure @safe;
        pragma(LDC_intrinsic, "llvm.uadd.sat.i#")
            T inteli_llvm_addus(T)(T a, T b) pure @safe;
        pragma(LDC_intrinsic, "llvm.usub.sat.i#")
            T inteli_llvm_subus(T)(T a, T b) pure @safe;

        enum LDC_with_saturated_intrinsics = true;
    }
    else
        enum LDC_with_saturated_intrinsics = false;
}
else
    enum LDC_with_saturated_intrinsics = false;

// ADDITIONAL x86 INTRINSICS
// Absent from ldc.gccbuiltins_x86 for some reason, but needed.
// https://github.com/ldc-developers/llvm-project/blob/ldc-release/12.x/llvm/include/llvm/IR/IntrinsicsX86.td
static if (LDC_with_SSE41)
{
    pragma(LDC_intrinsic, "llvm.x86.sse41.pblendvb")
        byte16 __builtin_ia32_pblendvb(byte16, byte16, byte16) pure @safe;
}

// SOME NEON INTRINSICS
// Emulating some x86 intrinsics needs access to a range of ARM intrinsics.
// Not in the public API but the simde project exposes it all for the user to use.
// MAYDO: create a new neon.d module, for internal use only.
// MAYDO: port them to ARM32 so that ARM32 can be as fast as ARM64.
static if (LDC_with_ARM64)
{
    // VERY USEFUL LINK
    // https://github.com/ldc-developers/llvm-project/blob/ldc-release/11.x/llvm/include/llvm/IR/IntrinsicsAArch64.td
    // Also: https://developer.arm.com/architectures/instruction-sets/intrinsics/

    // Note: it is helpful to verify, in case of complex sequence of intrinsics, that the result is actually false.
    // Some intrinsics have trouble when inlined inside another, such as vmovl_low_s32. In this case, it's better to use builtins
    // from backend to have an inlining that still matches the instruction.
1553 1554 pragma(LDC_intrinsic, "llvm.aarch64.crc32cb") 1555 uint __crc32cb(uint a, uint b) pure @safe; 1556 1557 pragma(LDC_intrinsic, "llvm.aarch64.crc32ch") 1558 uint __crc32ch(uint a, uint b) pure @safe; 1559 1560 pragma(LDC_intrinsic, "llvm.aarch64.crc32cw") 1561 uint __crc32cw(uint a, uint b) pure @safe; 1562 1563 pragma(LDC_intrinsic, "llvm.aarch64.crc32cx") 1564 uint __crc32cd(uint a, ulong b) pure @safe; 1565 1566 //pragma(LDC_intrinsic, "llvm.aarch64.dmb") 1567 // uint __dmb(int a) @safe; // didn't found a name in intrinsic list 1568 1569 pragma(LDC_intrinsic, "llvm.aarch64.neon.uabd.v16i8") 1570 byte16 vabdq_u8(byte16 a, byte16 b) pure @safe; 1571 1572 pragma(LDC_intrinsic, "llvm.aarch64.neon.abs.v8i16") 1573 short8 vabsq_s16(short8 a) pure @safe; 1574 1575 pragma(LDC_intrinsic, "llvm.aarch64.neon.abs.v4i32") 1576 int4 vabsq_s32(int4 a) pure @safe; 1577 1578 pragma(LDC_intrinsic, "llvm.aarch64.neon.abs.v16i8") 1579 byte16 vabsq_s8(byte16 a) pure @safe; 1580 1581 byte8 vand_u8(byte8 a, byte8 b) pure @safe 1582 { 1583 return a & b; 1584 } 1585 1586 long2 vandq_s64(long2 a, long2 b) 1587 { 1588 return a & b; 1589 } 1590 1591 long2 vbicq_s64(long2 a, long2 b) pure @safe 1592 { 1593 return a & ~b; 1594 } 1595 1596 int4 vbslq_s32(int4 a, int4 b, int4 c) pure @safe 1597 { 1598 return c ^ ((c ^ b) & a); 1599 } 1600 1601 byte16 vbslq_s8(byte16 a, byte16 b, byte16 c) pure @safe 1602 { 1603 return c ^ ((c ^ b) & a); 1604 } 1605 1606 long2 vbslq_s64(long2 a, long2 b, long2 c) pure @safe 1607 { 1608 return c ^ ((c ^ b) & a); 1609 } 1610 1611 short8 vcombine_s16(short4 lo, short4 hi) pure @trusted 1612 { 1613 short8 r; 1614 r.ptr[0] = lo.array[0]; 1615 r.ptr[1] = lo.array[1]; 1616 r.ptr[2] = lo.array[2]; 1617 r.ptr[3] = lo.array[3]; 1618 r.ptr[4] = hi.array[0]; 1619 r.ptr[5] = hi.array[1]; 1620 r.ptr[6] = hi.array[2]; 1621 r.ptr[7] = hi.array[3]; 1622 return r; 1623 } 1624 1625 int4 vcombine_s32(int2 lo, int2 hi) pure @trusted 1626 { 1627 int4 r; 1628 r.ptr[0] = 
lo.array[0]; 1629 r.ptr[1] = lo.array[1]; 1630 r.ptr[2] = hi.array[0]; 1631 r.ptr[3] = hi.array[1]; 1632 return r; 1633 } 1634 1635 byte16 vcombine_s8(byte8 lo, byte8 hi) pure @trusted 1636 { 1637 byte16 r; 1638 r.ptr[0] = lo.array[0]; 1639 r.ptr[1] = lo.array[1]; 1640 r.ptr[2] = lo.array[2]; 1641 r.ptr[3] = lo.array[3]; 1642 r.ptr[4] = lo.array[4]; 1643 r.ptr[5] = lo.array[5]; 1644 r.ptr[6] = lo.array[6]; 1645 r.ptr[7] = lo.array[7]; 1646 r.ptr[8] = hi.array[0]; 1647 r.ptr[9] = hi.array[1]; 1648 r.ptr[10] = hi.array[2]; 1649 r.ptr[11] = hi.array[3]; 1650 r.ptr[12] = hi.array[4]; 1651 r.ptr[13] = hi.array[5]; 1652 r.ptr[14] = hi.array[6]; 1653 r.ptr[15] = hi.array[7]; 1654 return r; 1655 } 1656 1657 short8 vcombine_u16(short4 lo, short4 hi) pure @trusted 1658 { 1659 short8 r; 1660 r.ptr[0] = lo.array[0]; 1661 r.ptr[1] = lo.array[1]; 1662 r.ptr[2] = lo.array[2]; 1663 r.ptr[3] = lo.array[3]; 1664 r.ptr[4] = hi.array[0]; 1665 r.ptr[5] = hi.array[1]; 1666 r.ptr[6] = hi.array[2]; 1667 r.ptr[7] = hi.array[3]; 1668 return r; 1669 } 1670 1671 1672 // float4 => int4 1673 1674 pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtms.v4i32.v4f32") 1675 int4 vcvtmq_s32_f32(float4 a) pure @safe; 1676 1677 pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtns.v4i32.v4f32") 1678 int4 vcvtnq_s32_f32(float4 a) pure @safe; 1679 1680 pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtps.v4i32.v4f32") 1681 int4 vcvtpq_s32_f32(float4 a) pure @safe; 1682 1683 pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtzs.v4i32.v4f32") 1684 int4 vcvtzq_s32_f32(float4 a) pure @safe; 1685 1686 1687 // float4 <=> half4 1688 1689 pragma(LDC_intrinsic, "llvm.aarch64.neon.vcvtfp2hf") 1690 short4 vcvt_f16_f32(float4 a) pure @safe; 1691 1692 pragma(LDC_intrinsic, "llvm.aarch64.neon.vcvthf2fp") 1693 float4 vcvt_f32_f16(short4 a) pure @safe; 1694 1695 // double2 => long2 1696 1697 pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtms.v2i64.v2f64") 1698 long2 vcvtmq_s64_f64(double2 a) pure @safe; 1699 1700 pragma(LDC_intrinsic, 
"llvm.aarch64.neon.fcvtns.v2i64.v2f64") 1701 long2 vcvtnq_s64_f64(double2 a) pure @safe; 1702 1703 pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtps.v2i64.v2f64") 1704 long2 vcvtpq_s64_f64(double2 a) pure @safe; 1705 1706 pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtzs.v2i64.v2f64") 1707 long2 vcvtzq_s64_f64(double2 a) pure @safe; 1708 1709 pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtms.i32.f32") 1710 int vcvtms_s32_f32(float a) pure @safe; 1711 1712 pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtns.i32.f32") 1713 int vcvtns_s32_f32(float a) pure @safe; 1714 1715 pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtps.i32.f32") 1716 int vcvtps_s32_f32(float a) pure @safe; 1717 1718 pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtzs.i32.f32") 1719 int vcvts_s32_f32(float a) pure @safe; 1720 1721 pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtms.i32.f64") 1722 int vcvtms_s32_f64(double a) pure @safe; 1723 1724 pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtns.i32.f64") 1725 int vcvtns_s32_f64(double a) pure @safe; 1726 1727 pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtps.i32.f64") 1728 int vcvtps_s32_f64(double a) pure @safe; 1729 1730 pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtzs.i32.f64") 1731 int vcvts_s32_f64(double a) pure @safe; 1732 1733 pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtms.i64.f32") 1734 long vcvtms_s64_f32(float a) pure @safe; 1735 1736 pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtns.i64.f32") 1737 long vcvtns_s64_f32(float a) pure @safe; 1738 1739 pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtps.i64.f32") 1740 long vcvtps_s64_f32(float a) pure @safe; 1741 1742 pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtzs.i64.f32") 1743 long vcvts_s64_f32(float a) pure @safe; 1744 1745 pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtms.i64.f64") 1746 long vcvtms_s64_f64(double a) pure @safe; 1747 1748 pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtns.i64.f64") 1749 long vcvtns_s64_f64(double a) pure @safe; 1750 1751 pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtps.i64.f64") 1752 long 
vcvtps_s64_f64(double a) pure @safe; // Note: technically should be named vcvtpd_s64_f64 1753 1754 pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtzs.i64.f64") 1755 long vcvts_s64_f64(double a) pure @safe; 1756 1757 long2 vdupq_n_s64(long value) pure @safe 1758 { 1759 long2 r; 1760 r = value; 1761 return r; 1762 } 1763 1764 short4 vget_high_s16(short8 a) pure @trusted 1765 { 1766 short4 r; 1767 r.ptr[0] = a.array[4]; 1768 r.ptr[1] = a.array[5]; 1769 r.ptr[2] = a.array[6]; 1770 r.ptr[3] = a.array[7]; 1771 return r; 1772 } 1773 1774 int2 vget_high_s32(int4 a) pure @trusted 1775 { 1776 int2 r; 1777 r.ptr[0] = a.array[2]; 1778 r.ptr[1] = a.array[3]; 1779 return r; 1780 } 1781 1782 byte8 vget_high_u8(byte16 a) pure @trusted 1783 { 1784 byte8 r; 1785 r.ptr[0] = a.array[8]; 1786 r.ptr[1] = a.array[9]; 1787 r.ptr[2] = a.array[10]; 1788 r.ptr[3] = a.array[11]; 1789 r.ptr[4] = a.array[12]; 1790 r.ptr[5] = a.array[13]; 1791 r.ptr[6] = a.array[14]; 1792 r.ptr[7] = a.array[15]; 1793 return r; 1794 } 1795 1796 short4 vget_low_s16(short8 a) pure @trusted 1797 { 1798 short4 r; 1799 r.ptr[0] = a.array[0]; 1800 r.ptr[1] = a.array[1]; 1801 r.ptr[2] = a.array[2]; 1802 r.ptr[3] = a.array[3]; 1803 return r; 1804 } 1805 1806 int2 vget_low_s32(int4 a) pure @trusted 1807 { 1808 int2 r; 1809 r.ptr[0] = a.array[0]; 1810 r.ptr[1] = a.array[1]; 1811 return r; 1812 } 1813 1814 byte8 vget_low_u8(byte16 a) pure @trusted 1815 { 1816 byte8 r; 1817 r.ptr[0] = a.array[0]; 1818 r.ptr[1] = a.array[1]; 1819 r.ptr[2] = a.array[2]; 1820 r.ptr[3] = a.array[3]; 1821 r.ptr[4] = a.array[4]; 1822 r.ptr[5] = a.array[5]; 1823 r.ptr[6] = a.array[6]; 1824 r.ptr[7] = a.array[7]; 1825 return r; 1826 } 1827 1828 long vgetq_lane_s64(long2 v, const int lane) pure @safe 1829 { 1830 return v.array[lane]; 1831 } 1832 1833 pragma(LDC_intrinsic, "llvm.aarch64.neon.smax.v8i16") 1834 short8 vmaxq_s16(short8 a, short8 b) pure @safe; 1835 1836 int4 vmaxq_s32(int4 a, int4 b) pure @safe 1837 { 1838 int4 r; 1839 r[0] = a[0] >= b[0] ? 
a[0] : b[0]; 1840 r[1] = a[1] >= b[1] ? a[1] : b[1]; 1841 r[2] = a[2] >= b[2] ? a[2] : b[2]; 1842 r[3] = a[3] >= b[3] ? a[3] : b[3]; 1843 return r; 1844 } 1845 1846 pragma(LDC_intrinsic, "llvm.aarch64.neon.smin.v8i16") 1847 short8 vminq_s16(short8 a, short8 b) pure @safe; 1848 1849 int4 vmovl_u16(short4 a) pure @trusted 1850 { 1851 int4 r; 1852 r.ptr[0] = cast(ushort)a.array[0]; 1853 r.ptr[1] = cast(ushort)a.array[1]; 1854 r.ptr[2] = cast(ushort)a.array[2]; 1855 r.ptr[3] = cast(ushort)a.array[3]; 1856 return r; 1857 } 1858 1859 int2 vmovn_s64(long2 a) pure @trusted 1860 { 1861 int2 r; 1862 r.ptr[0] = cast(int)(a.array[0]); 1863 r.ptr[1] = cast(int)(a.array[1]); 1864 return r; 1865 } 1866 1867 int4 vmull_s16(short4 a, short4 b) pure @trusted 1868 { 1869 int4 r; 1870 r.ptr[0] = a.array[0] * b.array[0]; 1871 r.ptr[1] = a.array[1] * b.array[1]; 1872 r.ptr[2] = a.array[2] * b.array[2]; 1873 r.ptr[3] = a.array[3] * b.array[3]; 1874 return r; 1875 } 1876 1877 pragma(LDC_intrinsic, "llvm.aarch64.neon.smull.v2i64") 1878 long2 vmull_s32(int2 a, int2 b) pure @safe; 1879 1880 pragma(LDC_intrinsic, "llvm.aarch64.neon.addp.v4i16") 1881 short4 vpadd_s16(short4 a, short4 b) pure @safe; 1882 1883 pragma(LDC_intrinsic, "llvm.aarch64.neon.addp.v2i32") 1884 int2 vpadd_s32(int2 a, int2 b) pure @safe; 1885 1886 pragma(LDC_intrinsic, "llvm.aarch64.neon.addp.v8i8") 1887 byte8 vpadd_u8(byte8 a, byte8 b) pure @safe; 1888 1889 pragma(LDC_intrinsic, "llvm.aarch64.neon.uaddlp.v8i16.v16i8") 1890 short8 vpaddlq_u8 (byte16 a) pure @safe; 1891 1892 static if(__VERSION__ >= 2088) // LDC 1.18 start using LLVM9 who changes the name of the builtin 1893 { 1894 pragma(LDC_intrinsic, "llvm.aarch64.neon.faddp.v4f32") 1895 float4 vpaddq_f32(float4 a, float4 b) pure @safe; 1896 } 1897 else 1898 { 1899 pragma(LDC_intrinsic, "llvm.aarch64.neon.addp.v4f32") 1900 float4 vpaddq_f32(float4 a, float4 b) pure @safe; 1901 } 1902 1903 pragma(LDC_intrinsic, "llvm.aarch64.neon.addp.v8i16") 1904 short8 vpaddq_s16(short8 
a, short8 b) pure @safe; 1905 1906 pragma(LDC_intrinsic, "llvm.aarch64.neon.addp.v16i8") 1907 byte16 vpaddq_s8(byte16 a, byte16 b) pure @safe; 1908 1909 pragma(LDC_intrinsic, "llvm.aarch64.neon.addp.v4i32") 1910 int4 vpaddq_s32(int4 a, int4 b) pure @safe; 1911 1912 pragma(LDC_intrinsic, "llvm.aarch64.neon.sqadd.v4i16") 1913 short4 vqadd_s16(short4 a, short4 b) pure @safe; 1914 1915 pragma(LDC_intrinsic, "llvm.aarch64.neon.sqadd.v8i16") 1916 short8 vqaddq_s16(short8 a, short8 b) pure @safe; 1917 1918 pragma(LDC_intrinsic, "llvm.aarch64.neon.sqxtn.v8i8") 1919 byte8 vqmovn_s16(short8 a) pure @safe; 1920 1921 pragma(LDC_intrinsic, "llvm.aarch64.neon.sqxtn.v4i16") 1922 short4 vqmovn_s32(int4 a) pure @safe; 1923 1924 pragma(LDC_intrinsic, "llvm.aarch64.neon.uqxtn.v4i16") 1925 short4 vqmovn_u32(int4 a) pure @safe; 1926 1927 pragma(LDC_intrinsic, "llvm.aarch64.neon.sqxtun.v8i8") 1928 byte8 vqmovun_s16(short8 a) pure @safe; 1929 1930 pragma(LDC_intrinsic, "llvm.aarch64.neon.sqsub.v4i16") 1931 short4 vqsub_s16(short4 a, short4 b) pure @safe; 1932 1933 pragma(LDC_intrinsic, "llvm.aarch64.neon.sqsub.v8i16") 1934 short8 vqsubq_s16(short8 a, short8 b) pure @safe; 1935 1936 pragma(LDC_intrinsic, "llvm.aarch64.neon.tbl1.v16i8") 1937 byte16 vqtbl1q_s8(byte16 t, byte16 idx) pure @safe; 1938 1939 pragma(LDC_intrinsic, "llvm.aarch64.neon.urhadd.v16i8") 1940 byte16 vrhadd_u8(byte16 a, byte16 b) pure @safe; 1941 1942 pragma(LDC_intrinsic, "llvm.aarch64.neon.urhadd.v8i16") 1943 short8 vrhadd_u16(short8 a, short8 b) pure @safe; 1944 1945 pragma(LDC_intrinsic, "llvm.aarch64.neon.rshrn.v4i16") 1946 short4 vrshrn_n_s32(int4 a, int n) pure @safe; 1947 1948 byte8 vshr_u8(byte8 a, byte8 b) pure @safe 1949 { 1950 return a >>> b; 1951 } 1952 1953 byte16 vshrq_n_s8(byte16 a, byte r) pure @safe 1954 { 1955 a = a >> byte16(cast(byte)r); 1956 return a; 1957 } 1958 1959 pragma(LDC_intrinsic, "llvm.aarch64.neon.tbl1.v8i8") 1960 byte8 vtbl1_s8(byte16 t, byte8 idx) pure @safe; 1961 } 1962 1963 
version(unittest) 1964 { 1965 double abs_double(double x) @trusted 1966 { 1967 version(LDC) 1968 return llvm_fabs(x); 1969 else 1970 { 1971 long uf = *cast(long*)(&x); 1972 uf &= 0x7fffffff_ffffffff; 1973 return *cast(double*)(&uf); 1974 } 1975 } 1976 } 1977 1978 // needed because in old GDC from travis, core.stdc.math.isnan isn't pure 1979 1980 bool isnan(float x) pure @trusted 1981 { 1982 uint u = *cast(uint*)(&x); 1983 bool result = ((u & 0x7F800000) == 0x7F800000) && (u & 0x007FFFFF); 1984 return result; 1985 } 1986 unittest 1987 { 1988 float x = float.nan; 1989 assert(isnan(x)); 1990 1991 x = 0; 1992 assert(!isnan(x)); 1993 1994 x = float.infinity; 1995 assert(!isnan(x)); 1996 } 1997 1998 bool isnan(double x) pure @trusted 1999 { 2000 ulong u = *cast(ulong*)(&x); 2001 return ((u & 0x7FF00000_00000000) == 0x7FF00000_00000000) && (u & 0x000FFFFF_FFFFFFFF); 2002 } 2003 unittest 2004 { 2005 double x = double.nan; 2006 assert(isnan(x)); 2007 2008 x = 0; 2009 assert(!isnan(x)); 2010 2011 x = double.infinity; 2012 assert(!isnan(x)); 2013 }