/**
* Copyright: Copyright Auburn Sounds 2016-2018.
* License:   $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
* Authors:   Guillaume Piolat
*/
module inteli.internals;

import inteli.types;

// The only math functions needed for intel-intrinsics
public import core.math: fabs, sqrt; // since they are intrinsics

version(LDC)
{
    public import core.simd;
    public import ldc.simd;
    public import ldc.gccbuiltins_x86;
    public import ldc.intrinsics;
    public import ldc.llvmasm: __asm;

    // Since LDC 1.13, using the new ldc.llvmasm.__ir variants instead of inlineIR
    static if (__VERSION__ >= 2083)
    {
        import ldc.llvmasm;
        alias LDCInlineIR = __ir_pure;
    }
    else
    {
        alias LDCInlineIR = inlineIR;
    }
}



package:
nothrow @nogc:


//
//  <ROUNDING>
//
//  Why is this here? For DMD, we cannot use rint because _MM_SET_ROUNDING_MODE
//  doesn't change the FPU rounding mode, and isn't expected to do so.
//  So we devised these rounding functions to help having consistent rounding between
//  LDC and DMD. It's important that DMD uses what is in MXCSR to round.
//


/// Converts `value` to a 32-bit integer with the `cvtss2si` instruction,
/// so that the rounding follows what is currently set in MXCSR.
int convertFloatToInt32UsingMXCSR(float value) pure @safe
{
    int result;
    asm pure nothrow @nogc @trusted
    {
        cvtss2si EAX, value;
        mov result, EAX;
    }
    return result;
}

/// Converts `value` to a 32-bit integer with the `cvtsd2si` instruction,
/// so that the rounding follows what is currently set in MXCSR.
int convertDoubleToInt32UsingMXCSR(double value) pure @safe
{
    int result;
    asm pure nothrow @nogc @trusted
    {
        cvtsd2si EAX, value;
        mov result, EAX;
    }
    return result;
}

/// Converts `value` to a 64-bit integer, rounding with the mode held in MXCSR.
/// On x86_64 this is a single `cvtss2si` with a 64-bit destination; on 32-bit x86
/// the x87 FPU performs the conversion after its control word has been given
/// the rounding bits read from MXCSR.
long convertFloatToInt64UsingMXCSR(float value) pure @safe
{
    // 64-bit can use an SSE instruction
    version(D_InlineAsm_X86_64)
    {
        long result;
        version(LDC) // work-around for " Data definition directives inside inline asm are not supported yet."
        {
            asm pure nothrow @nogc @trusted
            {
                movss XMM0, value;
                cvtss2si RAX, XMM0;
                mov result, RAX;
            }
        }
        else
        {
            asm pure nothrow @nogc @trusted
            {
                movss XMM0, value;
                db 0xf3; db 0x48; db 0x0f; db 0x2d; db 0xc0; // cvtss2si RAX, XMM0 (DMD refuses to emit)
                mov result, RAX;
            }
        }
        return result;
    }
    else version(D_InlineAsm_X86)
    {
        // In the case of 32-bit x86 there is no SSE2 way to convert FP to 64-bit int
        // This leads to an unfortunate FPU sequence in every C++ compiler.
        // See: https://godbolt.org/z/vZym77

        // Get current MXCSR rounding
        uint sseRounding;
        ushort savedFPUCW;
        ushort newFPUCW;
        long result;
        asm pure nothrow @nogc @trusted
        {
            stmxcsr sseRounding;
            fld value;
            fnstcw savedFPUCW;
            mov AX, savedFPUCW;
            and AX, 0xf3ff;          // clear FPU rounding bits
            movzx ECX, word ptr sseRounding;
            and ECX, 0x6000;         // only keep SSE rounding bits
            shr ECX, 3;              // 0x6000 >> 3 == 0x0C00, the bits cleared in AX above
            or AX, CX;               // make a new control word for FPU with SSE bits
            mov newFPUCW, AX;
            fldcw newFPUCW;
            fistp qword ptr result;  // convert, respecting MXCSR (but not other control word things)
            fldcw savedFPUCW;        // put the caller's FPU control word back
        }
        return result;
    }
    else
        static assert(false);
}

/// Converts `value` to a 64-bit integer, rounding with the mode held in MXCSR.
/// Same strategy as `convertFloatToInt64UsingMXCSR`: `cvtsd2si` on x86_64,
/// x87 `fistp` with a patched control word on 32-bit x86.
long convertDoubleToInt64UsingMXCSR(double value) pure @safe
{
    // 64-bit can use an SSE instruction
    version(D_InlineAsm_X86_64)
    {
        long result;
        version(LDC) // work-around for "Data definition directives inside inline asm are not supported yet."
        {
            asm pure nothrow @nogc @trusted
            {
                movsd XMM0, value;
                cvtsd2si RAX, XMM0;
                mov result, RAX;
            }
        }
        else
        {
            asm pure nothrow @nogc @trusted
            {
                movsd XMM0, value;
                db 0xf2; db 0x48; db 0x0f; db 0x2d; db 0xc0; // cvtsd2si RAX, XMM0 (DMD refuses to emit)
                mov result, RAX;
            }
        }
        return result;
    }
    else version(D_InlineAsm_X86)
    {
        // In the case of 32-bit x86 there is no SSE2 way to convert FP to 64-bit int
        // This leads to an unfortunate FPU sequence in every C++ compiler.
        // See: https://godbolt.org/z/vZym77

        // Get current MXCSR rounding
        uint sseRounding;
        ushort savedFPUCW;
        ushort newFPUCW;
        long result;
        asm pure nothrow @nogc @trusted
        {
            stmxcsr sseRounding;
            fld value;
            fnstcw savedFPUCW;
            mov AX, savedFPUCW;
            and AX, 0xf3ff;          // clear FPU rounding bits
            movzx ECX, word ptr sseRounding;
            and ECX, 0x6000;         // only keep SSE rounding bits
            shr ECX, 3;              // 0x6000 >> 3 == 0x0C00, the bits cleared in AX above
            or AX, CX;               // make a new control word for FPU with SSE bits
            mov newFPUCW, AX;
            fldcw newFPUCW;
            fistp qword ptr result;  // explicit 64-bit store, same form as the float variant
            fldcw savedFPUCW;        // put the caller's FPU control word back
        }
        return result;
    }
    else
        static assert(false);
}


//
//  </ROUNDING>
//


// using the Intel terminology here

/// Clamps a signed 16-bit value to [-128, 127] and returns it as a `byte`.
byte saturateSignedWordToSignedByte(short value) pure @safe
{
    if (value > 127) value = 127;
    if (value < -128) value = -128;
    return cast(byte) value;
}

/// Clamps a signed 16-bit value to [0, 255] and returns it as a `ubyte`.
ubyte saturateSignedWordToUnsignedByte(short value) pure @safe
{
    if (value > 255) value = 255;
    if (value < 0) value = 0;
    return cast(ubyte) value;
}

/// Clamps a signed 32-bit value to [-32768, 32767] and returns it as a `short`.
short saturateSignedIntToSignedShort(int value) pure @safe
{
    if (value > 32767) value = 32767;
    if (value < -32768) value = -32768;
    return cast(short) value;
}

/// Clamps a signed 32-bit value to [0, 65535] and returns it as a `ushort`.
ushort saturateSignedIntToUnsignedShort(int value) pure @safe
{
    if (value > 65535) value = 65535;
    if (value < 0) value = 0;
    return cast(ushort) value;
}

unittest // test saturate operations
{
    assert( saturateSignedWordToSignedByte(32000) == 127);
    assert( saturateSignedWordToUnsignedByte(32000) == 255);
    assert( saturateSignedWordToSignedByte(-4000) == -128);
    assert( saturateSignedWordToUnsignedByte(-4000) == 0);
    assert( saturateSignedIntToSignedShort(32768) == 32767);
    assert( saturateSignedIntToUnsignedShort(32768) == 32768);
    assert( saturateSignedIntToSignedShort(-32769) == -32768);
    assert( saturateSignedIntToUnsignedShort(-32769) == 0);
}

version(unittest)
{
    // This is just for debugging tests
    import core.stdc.stdio: printf;

    // printing vectors for implementation
    // Note: you can override `pure` within a `debug` clause

    /// Prints the two 32-bit lanes of `v` to stdout.
    void _mm_print_pi32(__m64 v) @trusted
    {
        int2 C = cast(int2)v;
        printf("%d %d\n", C[0], C[1]);
    }

    /// Prints the four 16-bit lanes of `v` to stdout.
    void _mm_print_pi16(__m64 v) @trusted
    {
        short4 C = cast(short4)v;
        printf("%d %d %d %d\n", C[0], C[1], C[2], C[3]);
    }

    /// Prints the eight 8-bit lanes of `v` to stdout.
    void _mm_print_pi8(__m64 v) @trusted
    {
        byte8 C = cast(byte8)v;
        printf("%d %d %d %d %d %d %d %d\n",
        C[0], C[1], C[2], C[3], C[4], C[5], C[6], C[7]);
    }

    /// Prints the four 32-bit lanes of `v` to stdout.
    void _mm_print_epi32(__m128i v) @trusted
    {
        printf("%d %d %d %d\n",
              v[0], v[1], v[2], v[3]);
    }

    /// Prints the eight 16-bit lanes of `v` to stdout.
    void _mm_print_epi16(__m128i v) @trusted
    {
        short8 C = cast(short8)v;
        printf("%d %d %d %d %d %d %d %d\n",
        C[0], C[1], C[2], C[3], C[4], C[5], C[6], C[7]);
    }

    /// Prints the sixteen 8-bit lanes of `v` to stdout.
    void _mm_print_epi8(__m128i v) @trusted
    {
        byte16 C = cast(byte16)v;
        printf("%d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d\n",
        C[0], C[1], C[2], C[3], C[4], C[5], C[6], C[7], C[8], C[9], C[10], C[11], C[12], C[13], C[14], C[15]);
    }

    /// Prints the four float lanes of `v` to stdout.
    void _mm_print_ps(__m128 v) @trusted
    {
        float4 C = cast(float4)v;
        printf("%f %f %f %f\n", C[0], C[1], C[2], C[3]);
    }

    /// Prints the two double lanes of `v` to stdout.
    void _mm_print_pd(__m128d v) @trusted
    {
        double2 C = cast(double2)v;
        printf("%f %f\n", C[0], C[1]);
    }
}


//
// <FLOATING-POINT COMPARISONS>
//
// Note: `ldc.simd` cannot express all nuances of FP comparisons, so we
//       need different IR generation.

/// Floating-point comparison predicates.
/// "Ordered" predicates are false as soon as one operand is NaN,
/// "unordered" predicates are true as soon as one operand is NaN.
/// The member names are spliced verbatim into `fcmp` IR by the LDC code paths
/// of this module (through `FPComparisonToString`), so they must not be renamed.
enum FPComparison
{
    oeq,   // ordered and equal
    ogt,   // ordered and greater than
    oge,   // ordered and greater than or equal
    olt,   // ordered and less than
    ole,   // ordered and less than or equal
    one,   // ordered and not equal
    ord,   // ordered (no nans)
    ueq,   // unordered or equal
    ugt,   // unordered or greater than ("nle")
    uge,   // unordered or greater than or equal ("nlt")
    ult,   // unordered or less than ("nge")
    ule,   // unordered or less than or equal ("ngt")
    une,   // unordered or not equal ("neq")
    uno,   // unordered (either nans)
}

/// Textual name of each `FPComparison` member, indexed by the member's value.
/// Built from the enum itself (in declaration order) so that the table can never
/// get out of sync with the enum when a predicate is added or reordered.
private static immutable string[FPComparison.max+1] FPComparisonToString =
    [ __traits(allMembers, FPComparison) ];

// Individual float comparison: returns `true` or `false` (the callers turn that into -1/0 or 1/0).
// Useful for DMD and testing.
// Note: the result is a plain `bool`. The non-LDC wrappers below convert it
// to a -1/0 lane value (cmpps, cmppd, cmpss, cmpsd) or to 1/0 (comss, comsd).
private bool compareFloat(T)(FPComparison comparison, T a, T b) pure @safe
{
    import std.math : isNaN; // only isNaN is needed from std.math
    bool unordered = isNaN(a) || isNaN(b);
    final switch(comparison) with(FPComparison)
    {
        case oeq: return a == b;
        case ogt: return a > b;
        case oge: return a >= b;
        case olt: return a < b;
        case ole: return a <= b;
        case one: return !unordered && (a != b); // NaN with != always yields true
        case ord: return !unordered;
        case ueq: return unordered || (a == b);
        case ugt: return unordered || (a > b);
        case uge: return unordered || (a >= b);
        case ult: return unordered || (a < b);
        case ule: return unordered || (a <= b);
        case une: return (a != b); // NaN with != always yields true
        case uno: return unordered;
    }
}

version(LDC)
{
    /// Provides packed float comparisons.
    /// Each of the 4 result lanes is the sign-extended `fcmp` bit: -1 if true, 0 if false.
    package int4 cmpps(FPComparison comparison)(float4 a, float4 b) pure @safe
    {
        enum ir = `
            %cmp = fcmp `~ FPComparisonToString[comparison] ~` <4 x float> %0, %1
            %r = sext <4 x i1> %cmp to <4 x i32>
            ret <4 x i32> %r`;

        return LDCInlineIR!(ir, int4, float4, float4)(a, b);
    }

    /// Provides packed double comparisons.
    /// Each of the 2 result lanes is the sign-extended `fcmp` bit: -1 if true, 0 if false.
    package long2 cmppd(FPComparison comparison)(double2 a, double2 b) pure @safe
    {
        enum ir = `
            %cmp = fcmp `~ FPComparisonToString[comparison] ~` <2 x double> %0, %1
            %r = sext <2 x i1> %cmp to <2 x i64>
            ret <2 x i64> %r`;

        return LDCInlineIR!(ir, long2, double2, double2)(a, b);
    }

    /// CMPSS-style comparisons
    /// clang implement it through x86 intrinsics, it is possible with IR alone
    /// but leads to less optimal code.
    /// PERF: try to implement it with __builtin_ia32_cmpss and immediate 0 to 7.
    /// Not that simple.
    /// Only lane 0 is compared; its bits become all-ones or all-zeroes.
    /// Lanes 1 to 3 are copied from `a`.
    package float4 cmpss(FPComparison comparison)(float4 a, float4 b) pure @safe
    {
        /*
        // Sketch of the builtin-based version (FPComparisonToX86Predicate does not exist yet):
        enum ubyte predicateNumber = FPComparisonToX86Predicate[comparison];
        enum bool invertOp = (predicateNumber & 0x80) != 0;
        static if(invertOp)
            return __builtin_ia32_cmpss(b, a, predicateNumber & 0x7f);
        else
            return __builtin_ia32_cmpss(a, b, predicateNumber & 0x7f);
        */
        enum ir = `
            %cmp = fcmp `~ FPComparisonToString[comparison] ~` float %0, %1
            %r = sext i1 %cmp to i32
            %r2 = bitcast i32 %r to float
            ret float %r2`;

        float4 r = a;
        r[0] = LDCInlineIR!(ir, float, float, float)(a[0], b[0]);
        return r;
    }

    /// CMPSD-style comparisons
    /// clang implement it through x86 intrinsics, it is possible with IR alone
    /// but leads to less optimal code.
    /// PERF: try to implement it with __builtin_ia32_cmpsd and immediate 0 to 7.
    /// Not that simple.
    /// Only lane 0 is compared; its bits become all-ones or all-zeroes.
    /// Lane 1 is copied from `a`.
    package double2 cmpsd(FPComparison comparison)(double2 a, double2 b) pure @safe
    {
        enum ir = `
            %cmp = fcmp `~ FPComparisonToString[comparison] ~` double %0, %1
            %r = sext i1 %cmp to i64
            %r2 = bitcast i64 %r to double
            ret double %r2`;

        double2 r = a;
        r[0] = LDCInlineIR!(ir, double, double, double)(a[0], b[0]);
        return r;
    }

    /// Compares lane 0 of `a` and `b`; returns 1 if the predicate holds, else 0.
    // Note: ucomss and ucomsd are left unimplemented
    package int comss(FPComparison comparison)(float4 a, float4 b) pure @safe
    {
        enum ir = `
            %cmp = fcmp `~ FPComparisonToString[comparison] ~` float %0, %1
            %r = zext i1 %cmp to i32
            ret i32 %r`;

        return LDCInlineIR!(ir, int, float, float)(a[0], b[0]);
    }

    /// Compares lane 0 of `a` and `b`; returns 1 if the predicate holds, else 0.
    // Note: ucomss and ucomsd are left unimplemented
    package int comsd(FPComparison comparison)(double2 a, double2 b) pure @safe
    {
        enum ir = `
            %cmp = fcmp `~ FPComparisonToString[comparison] ~` double %0, %1
            %r = zext i1 %cmp to i32
            ret i32 %r`;

        return LDCInlineIR!(ir, int, double, double)(a[0], b[0]);
    }
}
else
{
    /// Provides packed float comparisons.
    /// Each of the 4 result lanes is -1 if the predicate holds for that lane, else 0.
    package int4 cmpps(FPComparison comparison)(float4 a, float4 b) pure @safe
    {
        int4 result;
        foreach(i; 0..4)
        {
            result[i] = compareFloat!float(comparison, a[i], b[i]) ? -1 : 0;
        }
        return result;
    }

    /// Provides packed double comparisons.
    /// Each of the 2 result lanes is -1 if the predicate holds for that lane, else 0.
    package long2 cmppd(FPComparison comparison)(double2 a, double2 b) pure @safe
    {
        long2 result;
        foreach(i; 0..2)
        {
            result[i] = compareFloat!double(comparison, a[i], b[i]) ? -1 : 0;
        }
        return result;
    }

    /// Provides CMPSS-style comparison.
    /// Lane 0 receives the -1/0 mask (reinterpreted as float bits), other lanes come from `a`.
    package float4 cmpss(FPComparison comparison)(float4 a, float4 b) pure @safe
    {
        int4 result = cast(int4)a;
        result[0] = compareFloat!float(comparison, a[0], b[0]) ? -1 : 0;
        return cast(float4)result;
    }

    /// Provides CMPSD-style comparison.
    /// Lane 0 receives the -1/0 mask (reinterpreted as double bits), lane 1 comes from `a`.
    package double2 cmpsd(FPComparison comparison)(double2 a, double2 b) pure @safe
    {
        long2 result = cast(long2)a;
        result[0] = compareFloat!double(comparison, a[0], b[0]) ? -1 : 0;
        return cast(double2)result;
    }

    /// Compares lane 0 of `a` and `b`; returns 1 if the predicate holds, else 0.
    // Note: ucomss and ucomsd are left unimplemented
    package int comss(FPComparison comparison)(float4 a, float4 b) pure @safe
    {
        return compareFloat!float(comparison, a[0], b[0]) ? 1 : 0;
    }

    /// Compares lane 0 of `a` and `b`; returns 1 if the predicate holds, else 0.
    // Note: ucomss and ucomsd are left unimplemented
    package int comsd(FPComparison comparison)(double2 a, double2 b) pure @safe
    {
        return compareFloat!double(comparison, a[0], b[0]) ? 1 : 0;
    }
}
unittest // cmpps
{
    // Check all comparison type is working
    float4 A = [1, 3, 5, float.nan];
    float4 B = [2, 3, 4, 5];

    int4 result_oeq = cmpps!(FPComparison.oeq)(A, B);
    int4 result_ogt = cmpps!(FPComparison.ogt)(A, B);
    int4 result_oge = cmpps!(FPComparison.oge)(A, B);
    int4 result_olt = cmpps!(FPComparison.olt)(A, B);
    int4 result_ole = cmpps!(FPComparison.ole)(A, B);
    int4 result_one = cmpps!(FPComparison.one)(A, B);
    int4 result_ord = cmpps!(FPComparison.ord)(A, B);
    int4 result_ueq = cmpps!(FPComparison.ueq)(A, B);
    int4 result_ugt = cmpps!(FPComparison.ugt)(A, B);
    int4 result_uge = cmpps!(FPComparison.uge)(A, B);
    int4 result_ult = cmpps!(FPComparison.ult)(A, B);
    int4 result_ule = cmpps!(FPComparison.ule)(A, B);
    int4 result_une = cmpps!(FPComparison.une)(A, B);
    int4 result_uno = cmpps!(FPComparison.uno)(A, B);

    static immutable int[4] correct_oeq    = [ 0,-1, 0, 0];
    static immutable int[4] correct_ogt    = [ 0, 0,-1, 0];
    static immutable int[4] correct_oge    = [ 0,-1,-1, 0];
    static immutable int[4] correct_olt    = [-1, 0, 0, 0];
    static immutable int[4] correct_ole    = [-1,-1, 0, 0];
    static immutable int[4] correct_one    = [-1, 0,-1, 0];
    static immutable int[4] correct_ord    = [-1,-1,-1, 0];
    static immutable int[4] correct_ueq    = [ 0,-1, 0,-1];
    static immutable int[4] correct_ugt    = [ 0, 0,-1,-1];
    static immutable int[4] correct_uge    = [ 0,-1,-1,-1];
    static immutable int[4] correct_ult    = [-1, 0, 0,-1];
    static immutable int[4] correct_ule    = [-1,-1, 0,-1];
    static immutable int[4] correct_une    = [-1, 0,-1,-1];
    static immutable int[4] correct_uno    = [ 0, 0, 0,-1];

    assert(result_oeq.array == correct_oeq);
    assert(result_ogt.array == correct_ogt);
    assert(result_oge.array == correct_oge);
    assert(result_olt.array == correct_olt);
    assert(result_ole.array == correct_ole);
    assert(result_one.array == correct_one);
    assert(result_ord.array == correct_ord);
    assert(result_ueq.array == correct_ueq);
    assert(result_ugt.array == correct_ugt);
    assert(result_uge.array == correct_uge);
    assert(result_ult.array == correct_ult);
    assert(result_ule.array == correct_ule);
    assert(result_une.array == correct_une);
    assert(result_uno.array == correct_uno);
}
unittest
{
    double2 a = [1, 3];
    double2 b = [2, 3];
    long2 c = cmppd!(FPComparison.ult)(a, b);
    static immutable long[2] correct = [cast(long)(-1), 0];
    assert(c.array == correct);
}
unittest // cmpss and comss
{
    // Checks cmpss and comss against the scalar reference compareFloat.
    void testComparison(FPComparison comparison)(float4 A, float4 B)
    {
        float4 result = cmpss!comparison(A, B);
        int4 iresult = cast(int4)result;
        int expected = compareFloat!float(comparison, A[0], B[0]) ? -1 : 0;
        assert(iresult[0] == expected);
        assert(result[1] == A[1]);
        assert(result[2] == A[2]);
        assert(result[3] == A[3]);

        // check comss
        int comResult = comss!comparison(A, B);
        assert( (expected != 0) == (comResult != 0) );
    }

    // Check all comparison type is working
    float4 A = [1, 3, 5, 6];
    float4 B = [2, 3, 4, 5];
    float4 C = [float.nan, 3, 4, 5];

    testComparison!(FPComparison.oeq)(A, B);
    testComparison!(FPComparison.oeq)(A, C);
    testComparison!(FPComparison.ogt)(A, B);
    testComparison!(FPComparison.ogt)(A, C);
    testComparison!(FPComparison.oge)(A, B);
    testComparison!(FPComparison.oge)(A, C);
    testComparison!(FPComparison.olt)(A, B);
    testComparison!(FPComparison.olt)(A, C);
    testComparison!(FPComparison.ole)(A, B);
    testComparison!(FPComparison.ole)(A, C);
    testComparison!(FPComparison.one)(A, B);
    testComparison!(FPComparison.one)(A, C);
    testComparison!(FPComparison.ord)(A, B);
    testComparison!(FPComparison.ord)(A, C);
    testComparison!(FPComparison.ueq)(A, B);
    testComparison!(FPComparison.ueq)(A, C);
    testComparison!(FPComparison.ugt)(A, B);
    testComparison!(FPComparison.ugt)(A, C);
    testComparison!(FPComparison.uge)(A, B);
    testComparison!(FPComparison.uge)(A, C);
    testComparison!(FPComparison.ult)(A, B);
    testComparison!(FPComparison.ult)(A, C);
    testComparison!(FPComparison.ule)(A, B);
    testComparison!(FPComparison.ule)(A, C);
    testComparison!(FPComparison.une)(A, B);
    testComparison!(FPComparison.une)(A, C);
    testComparison!(FPComparison.uno)(A, B);
    testComparison!(FPComparison.uno)(A, C);
}
unittest // cmpsd and comsd
{
    // Checks cmpsd and comsd against the scalar reference compareFloat.
    void testComparison(FPComparison comparison)(double2 A, double2 B)
    {
        double2 result = cmpsd!comparison(A, B);
        long2 iresult = cast(long2)result;
        long expected = compareFloat!double(comparison, A[0], B[0]) ? -1 : 0;
        assert(iresult[0] == expected);
        assert(result[1] == A[1]);

        // check comsd
        int comResult = comsd!comparison(A, B);
        assert( (expected != 0) == (comResult != 0) );
    }

    // Check all comparison type is working
    double2 A = [1, 3];
    double2 B = [2, 4];
    double2 C = [double.nan, 5];

    testComparison!(FPComparison.oeq)(A, B);
    testComparison!(FPComparison.oeq)(A, C);
    testComparison!(FPComparison.ogt)(A, B);
    testComparison!(FPComparison.ogt)(A, C);
    testComparison!(FPComparison.oge)(A, B);
    testComparison!(FPComparison.oge)(A, C);
    testComparison!(FPComparison.olt)(A, B);
    testComparison!(FPComparison.olt)(A, C);
    testComparison!(FPComparison.ole)(A, B);
    testComparison!(FPComparison.ole)(A, C);
    testComparison!(FPComparison.one)(A, B);
    testComparison!(FPComparison.one)(A, C);
    testComparison!(FPComparison.ord)(A, B);
    testComparison!(FPComparison.ord)(A, C);
    testComparison!(FPComparison.ueq)(A, B);
    testComparison!(FPComparison.ueq)(A, C);
    testComparison!(FPComparison.ugt)(A, B);
    testComparison!(FPComparison.ugt)(A, C);
    testComparison!(FPComparison.uge)(A, B);
    testComparison!(FPComparison.uge)(A, C);
    testComparison!(FPComparison.ult)(A, B);
    testComparison!(FPComparison.ult)(A, C);
    testComparison!(FPComparison.ule)(A, B);
    testComparison!(FPComparison.ule)(A, C);
    testComparison!(FPComparison.une)(A, B);
    testComparison!(FPComparison.une)(A, C);
    testComparison!(FPComparison.uno)(A, B);
    testComparison!(FPComparison.uno)(A, C);
}

//
// </FLOATING-POINT COMPARISONS>
//