/**
* Copyright: Copyright Auburn Sounds 2016-2018.
* License:   $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
* Authors:   Guillaume Piolat
*/
module inteli.internals;

import inteli.types;

version(unittest)
    import core.stdc.stdio;

// The only math functions needed for intel-intrinsics
public import core.math: fabs, sqrt; // since they are intrinsics

version(LDC)
{
    public import core.simd;
    public import ldc.simd;
    public import ldc.gccbuiltins_x86;
    public import ldc.intrinsics;

    // Since LDC 1.13, using the new ldc.llvmasm.__ir variants instead of inlineIR
    static if (__VERSION__ >= 2083)
    {
        import ldc.llvmasm;
        alias LDCInlineIR = __ir_pure;
    }
    else
    {
        alias LDCInlineIR = inlineIR;
    }
}



package:
nothrow @nogc:


//
//  <ROUNDING>
//
//  Why is this here? For DMD, we cannot use rint because _MM_SET_ROUNDING_MODE
//  doesn't change the FPU rounding mode, and isn't expected to do so.
//  So these rounding functions were devised to get consistent rounding between
//  LDC and DMD. It's important that DMD uses what is in MXCSR to round.
//


/// Converts a `float` to a 32-bit integer with the `cvtss2si` instruction,
/// so the rounding applied is the one currently selected in MXCSR.
int convertFloatToInt32UsingMXCSR(float value) pure @safe
{
    int rounded;
    asm pure nothrow @nogc @trusted
    {
        cvtss2si EAX, value;
        mov rounded, EAX;
    }
    return rounded;
}

/// Converts a `double` to a 32-bit integer with the `cvtsd2si` instruction,
/// so the rounding applied is the one currently selected in MXCSR.
int convertDoubleToInt32UsingMXCSR(double value) pure @safe
{
    int rounded;
    asm pure nothrow @nogc @trusted
    {
        cvtsd2si EAX, value;
        mov rounded, EAX;
    }
    return rounded;
}

/// Converts a `float` to a 64-bit integer, rounding as selected in MXCSR.
/// On x86_64 this is a single `cvtss2si`; on 32-bit x86 the x87 FPU is used
/// with its rounding-control bits temporarily replaced by the MXCSR ones.
long convertFloatToInt64UsingMXCSR(float value) pure @safe
{
    version(D_InlineAsm_X86_64)
    {
        // 64-bit targets have an SSE instruction for this.
        long converted;
        version(LDC) // work-around for " Data definition directives inside inline asm are not supported yet."
        {
            asm pure nothrow @nogc @trusted
            {
                movss XMM0, value;
                cvtss2si RAX, XMM0;
                mov converted, RAX;
            }
        }
        else
        {
            asm pure nothrow @nogc @trusted
            {
                movss XMM0, value;
                db 0xf3; db 0x48; db 0x0f; db 0x2d; db 0xc0; // cvtss2si RAX, XMM0 (DMD refuses to emit)
                mov converted, RAX;
            }
        }
        return converted;
    }
    else version(D_InlineAsm_X86)
    {
        // On 32-bit x86 there is no SSE2 way to convert FP to a 64-bit int.
        // This leads to an unfortunate FPU sequence in every C++ compiler.
        // See: https://godbolt.org/z/vZym77

        uint mxcsrBits;     // current MXCSR, read to get the SSE rounding mode
        ushort originalCW;  // FPU control word on entry, restored on exit
        ushort roundingCW;  // FPU control word carrying the SSE rounding mode
        long converted;
        asm pure nothrow @nogc @trusted
        {
            stmxcsr mxcsrBits;
            fld value;
            fnstcw originalCW;
            mov AX, originalCW;
            and AX, 0xf3ff;                  // clear FPU rounding bits
            movzx ECX, word ptr mxcsrBits;
            and ECX, 0x6000;                 // only keep SSE rounding bits
            shr ECX, 3;                      // move them onto the FPU rounding bits position
            or AX, CX;                       // make a new control word for FPU with SSE bits
            mov roundingCW, AX;
            fldcw roundingCW;
            fistp qword ptr converted;       // convert, respecting MXCSR (but not other control word things)
            fldcw originalCW;
        }
        return converted;
    }
    else
        static assert(false);
}

/// Converts a `double` to a 64-bit integer, rounding as selected in MXCSR.
/// On x86_64 this is a single `cvtsd2si`; on 32-bit x86 the x87 FPU is used
/// with its rounding-control bits temporarily replaced by the MXCSR ones.
long convertDoubleToInt64UsingMXCSR(double value) pure @safe
{
    version(D_InlineAsm_X86_64)
    {
        // 64-bit targets have an SSE instruction for this.
        long converted;
        version(LDC) // work-around for "Data definition directives inside inline asm are not supported yet."
        {
            asm pure nothrow @nogc @trusted
            {
                movsd XMM0, value;
                cvtsd2si RAX, XMM0;
                mov converted, RAX;
            }
        }
        else
        {
            asm pure nothrow @nogc @trusted
            {
                movsd XMM0, value;
                db 0xf2; db 0x48; db 0x0f; db 0x2d; db 0xc0; // cvtsd2si RAX, XMM0 (DMD refuses to emit)
                mov converted, RAX;
            }
        }
        return converted;
    }
    else version(D_InlineAsm_X86)
    {
        // On 32-bit x86 there is no SSE2 way to convert FP to a 64-bit int.
        // This leads to an unfortunate FPU sequence in every C++ compiler.
        // See: https://godbolt.org/z/vZym77

        uint mxcsrBits;     // current MXCSR, read to get the SSE rounding mode
        ushort originalCW;  // FPU control word on entry, restored on exit
        ushort roundingCW;  // FPU control word carrying the SSE rounding mode
        long converted;
        asm pure nothrow @nogc @trusted
        {
            stmxcsr mxcsrBits;
            fld value;
            fnstcw originalCW;
            mov AX, originalCW;
            and AX, 0xf3ff;                  // clear FPU rounding bits
            movzx ECX, word ptr mxcsrBits;
            and ECX, 0x6000;                 // only keep SSE rounding bits
            shr ECX, 3;                      // move them onto the FPU rounding bits position
            or AX, CX;                       // make a new control word for FPU with SSE bits
            mov roundingCW, AX;
            fldcw roundingCW;
            fistp converted;                 // NOTE(review): no explicit `qword ptr` here, unlike the float variant
            fldcw originalCW;
        }
        return converted;
    }
    else
        static assert(false);
}


//
//  </ROUNDING>
//


// using the Intel terminology here

/// Clamps a signed 16-bit value into the signed 8-bit range [-128, 127].
byte saturateSignedWordToSignedByte(short value) pure @safe
{
    if (value > byte.max)
        return byte.max;
    if (value < byte.min)
        return byte.min;
    return cast(byte) value;
}

/// Clamps a signed 16-bit value into the unsigned 8-bit range [0, 255].
ubyte saturateSignedWordToUnsignedByte(short value) pure @safe
{
    if (value > ubyte.max)
        return ubyte.max;
    if (value < 0)
        return 0;
    return cast(ubyte) value;
}

/// Clamps a signed 32-bit value into the signed 16-bit range [-32768, 32767].
short saturateSignedIntToSignedShort(int value) pure @safe
{
    if (value > short.max)
        return short.max;
    if (value < short.min)
        return short.min;
    return cast(short) value;
}

/// Clamps a signed 32-bit value into the unsigned 16-bit range [0, 65535].
ushort saturateSignedIntToUnsignedShort(int value) pure @safe
{
    if (value > ushort.max)
        return ushort.max;
    if (value < 0)
        return 0;
    return cast(ushort) value;
}

unittest // test saturate operations
{
    // word -> byte
    assert( saturateSignedWordToSignedByte(32000) == 127);
    assert( saturateSignedWordToUnsignedByte(32000) == 255);
    assert( saturateSignedWordToSignedByte(-4000) == -128);
    assert( saturateSignedWordToUnsignedByte(-4000) == 0);

    // int -> short
    assert( saturateSignedIntToSignedShort(32768) == 32767);
    assert( saturateSignedIntToUnsignedShort(32768) == 32768);
    assert( saturateSignedIntToSignedShort(-32769) == -32768);
    assert( saturateSignedIntToUnsignedShort(-32769) == 0);
}

version(unittest)
{
    // Vector printing helpers, handy while implementing intrinsics.
    // Note: you can override `pure` within a `debug` clause

    /// Prints the four 32-bit lanes of `v` on one line.
    void _mm_print_epi32(__m128i v) @trusted
    {
        printf("%d %d %d %d\n", v[0], v[1], v[2], v[3]);
    }

    /// Prints `v` reinterpreted as eight 16-bit lanes on one line.
    void _mm_print_epi16(__m128i v) @trusted
    {
        short8 lanes = cast(short8)v;
        printf("%d %d %d %d %d %d %d %d\n",
               lanes[0], lanes[1], lanes[2], lanes[3],
               lanes[4], lanes[5], lanes[6], lanes[7]);
    }

    /// Prints `v` reinterpreted as sixteen 8-bit lanes on one line.
    void _mm_print_epi8(__m128i v) @trusted
    {
        byte16 lanes = cast(byte16)v;
        printf("%d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d\n",
               lanes[0], lanes[1], lanes[2],  lanes[3],  lanes[4],  lanes[5],  lanes[6],  lanes[7],
               lanes[8], lanes[9], lanes[10], lanes[11], lanes[12], lanes[13], lanes[14], lanes[15]);
    }
}


//
//  <FLOATING-POINT COMPARISONS>
//
// Note: `ldc.simd` cannot express all nuances of FP comparisons, so we
//       need different IR generation.

/// Floating-point comparison predicates. The member names are the LLVM
/// `fcmp` condition codes; the order must match `FPComparisonToString`.
enum FPComparison
{
    oeq,   // ordered and equal
    ogt,   // ordered and greater than
    oge,   // ordered and greater than or equal
    olt,   // ordered and less than
    ole,   // ordered and less than or equal
    one,   // ordered and not equal
    ord,   // ordered (no nans)
    ueq,   // unordered or equal
    ugt,   // unordered or greater than ("nle")
    uge,   // unordered or greater than or equal ("nlt")
    ult,   // unordered or less than ("nge")
    ule,   // unordered or less than or equal ("ngt")
    une,   // unordered or not equal ("neq")
    uno,   // unordered (either nans)
}

/// Spelling of each `FPComparison` member, indexed by the member's value.
/// Spliced into the inline IR text of the LDC implementations below.
private static immutable string[FPComparison.max+1] FPComparisonToString =
[
    "oeq",
    "ogt",
    "oge",
    "olt",
    "ole",
    "one",
    "ord",
    "ueq",
    "ugt",
    "uge",
    "ult",
    "ule",
    "une",
    "uno",
];

/// Scalar reference comparison: returns `true` when `a <comparison> b` holds.
/// Callers turn the result into a -1/0 lane mask or a 1/0 integer themselves.
///
/// This must NOT be restricted to `version(unittest)`: the non-LDC
/// implementations of cmpps/cmppd/cmpss/cmpsd/comss/comsd below call it in
/// every build. Being a template, it costs nothing when never instantiated.
private bool compareFloat(T)(FPComparison comparison, T a, T b) pure @safe
{
    import std.math : isNaN;
    immutable bool unordered = isNaN(a) || isNaN(b);
    final switch(comparison) with(FPComparison)
    {
        // Ordered predicates: built-in comparisons are already false on NaN...
        case oeq: return a == b;
        case ogt: return a > b;
        case oge: return a >= b;
        case olt: return a < b;
        case ole: return a <= b;
        case one: return !unordered && (a != b); // ...except != which yields true on NaN
        case ord: return !unordered;

        // Unordered predicates: true as soon as one operand is NaN.
        case ueq: return unordered || (a == b);
        case ugt: return unordered || (a > b);
        case uge: return unordered || (a >= b);
        case ult: return unordered || (a < b);
        case ule: return unordered || (a <= b);
        case une: return (a != b); // NaN with != always yields true
        case uno: return unordered;
    }
}

version(LDC)
{
    /// Provides packed float comparisons.
    /// Each lane of the result is -1 (all bits set) when true, else 0.
    package int4 cmpps(FPComparison comparison)(float4 a, float4 b) pure @safe
    {
        enum ir = `
            %cmp = fcmp `~ FPComparisonToString[comparison] ~` <4 x float> %0, %1
            %r = sext <4 x i1> %cmp to <4 x i32>
            ret <4 x i32> %r`;

        return LDCInlineIR!(ir, int4, float4, float4)(a, b);
    }

    /// Provides packed double comparisons.
    /// Each lane of the result is -1 (all bits set) when true, else 0.
    package long2 cmppd(FPComparison comparison)(double2 a, double2 b) pure @safe
    {
        enum ir = `
            %cmp = fcmp `~ FPComparisonToString[comparison] ~` <2 x double> %0, %1
            %r = sext <2 x i1> %cmp to <2 x i64>
            ret <2 x i64> %r`;

        return LDCInlineIR!(ir, long2, double2, double2)(a, b);
    }

    /// CMPSS-style comparisons: lane 0 receives the all-ones/zero mask of
    /// comparing `a[0]` with `b[0]`, the other lanes are copied from `a`.
    /// clang implement it through x86 intrinsics, it is possible with IR alone
    /// but leads to less optimal code.
    /// PERF: try to implement it with __builtin_ia32_cmpss and immediate 0 to 7.
    /// Not that simple.
    package float4 cmpss(FPComparison comparison)(float4 a, float4 b) pure @safe
    {
        /*
        Sketch of the intrinsic-based version (FPComparisonToX86Predicate is
        not defined in this file):

        enum ubyte predicateNumber = FPComparisonToX86Predicate[comparison];
        enum bool invertOp = (predicateNumber & 0x80) != 0;
        static if(invertOp)
            return __builtin_ia32_cmpss(b, a, predicateNumber & 0x7f);
        else
            return __builtin_ia32_cmpss(a, b, predicateNumber & 0x7f);
        */
        enum ir = `
            %cmp = fcmp `~ FPComparisonToString[comparison] ~` float %0, %1
            %r = sext i1 %cmp to i32
            %r2 = bitcast i32 %r to float
            ret float %r2`;

        float4 r = a;
        r[0] = LDCInlineIR!(ir, float, float, float)(a[0], b[0]);
        return r;
    }

    /// CMPSD-style comparisons: lane 0 receives the all-ones/zero mask of
    /// comparing `a[0]` with `b[0]`, lane 1 is copied from `a`.
    /// clang implement it through x86 intrinsics, it is possible with IR alone
    /// but leads to less optimal code.
    /// PERF: try to implement it with __builtin_ia32_cmpsd and immediate 0 to 7.
    /// Not that simple.
    package double2 cmpsd(FPComparison comparison)(double2 a, double2 b) pure @safe
    {
        enum ir = `
            %cmp = fcmp `~ FPComparisonToString[comparison] ~` double %0, %1
            %r = sext i1 %cmp to i64
            %r2 = bitcast i64 %r to double
            ret double %r2`;

        double2 r = a;
        r[0] = LDCInlineIR!(ir, double, double, double)(a[0], b[0]);
        return r;
    }

    /// Compares `a[0]` with `b[0]` and returns 1 when true, else 0.
    // Note: ucomss and ucomsd are left unimplemented
    package int comss(FPComparison comparison)(float4 a, float4 b) pure @safe
    {
        enum ir = `
            %cmp = fcmp `~ FPComparisonToString[comparison] ~` float %0, %1
            %r = zext i1 %cmp to i32
            ret i32 %r`;

        return LDCInlineIR!(ir, int, float, float)(a[0], b[0]);
    }

    /// Compares `a[0]` with `b[0]` and returns 1 when true, else 0.
    // Note: ucomss and ucomsd are left unimplemented
    package int comsd(FPComparison comparison)(double2 a, double2 b) pure @safe
    {
        enum ir = `
            %cmp = fcmp `~ FPComparisonToString[comparison] ~` double %0, %1
            %r = zext i1 %cmp to i32
            ret i32 %r`;

        return LDCInlineIR!(ir, int, double, double)(a[0], b[0]);
    }
}
else
{
    /// Provides packed float comparisons.
    /// Each lane of the result is -1 (all bits set) when true, else 0.
    package int4 cmpps(FPComparison comparison)(float4 a, float4 b) pure @safe
    {
        int4 result;
        foreach(i; 0..4)
        {
            result[i] = compareFloat!float(comparison, a[i], b[i]) ? -1 : 0;
        }
        return result;
    }

    /// Provides packed double comparisons.
    /// Each lane of the result is -1 (all bits set) when true, else 0.
    package long2 cmppd(FPComparison comparison)(double2 a, double2 b) pure @safe
    {
        long2 result;
        foreach(i; 0..2)
        {
            result[i] = compareFloat!double(comparison, a[i], b[i]) ? -1 : 0;
        }
        return result;
    }

    /// Provides CMPSS-style comparison: lane 0 receives the mask, the other
    /// lanes keep the bits of `a`.
    package float4 cmpss(FPComparison comparison)(float4 a, float4 b) pure @safe
    {
        int4 result = cast(int4)a;
        result[0] = compareFloat!float(comparison, a[0], b[0]) ? -1 : 0;
        return cast(float4)result;
    }

    /// Provides CMPSD-style comparison: lane 0 receives the mask, lane 1
    /// keeps the bits of `a`.
    package double2 cmpsd(FPComparison comparison)(double2 a, double2 b) pure @safe
    {
        long2 result = cast(long2)a;
        result[0] = compareFloat!double(comparison, a[0], b[0]) ? -1 : 0;
        return cast(double2)result;
    }

    /// Compares `a[0]` with `b[0]` and returns 1 when true, else 0.
    // Note: ucomss and ucomsd are left unimplemented
    package int comss(FPComparison comparison)(float4 a, float4 b) pure @safe
    {
        return compareFloat!float(comparison, a[0], b[0]) ? 1 : 0;
    }

    /// Compares `a[0]` with `b[0]` and returns 1 when true, else 0.
    // Note: ucomss and ucomsd are left unimplemented
    package int comsd(FPComparison comparison)(double2 a, double2 b) pure @safe
    {
        return compareFloat!double(comparison, a[0], b[0]) ? 1 : 0;
    }
}
unittest // cmpps
{
    import std.traits : EnumMembers;

    // Check all comparison type is working
    float4 A = [1, 3, 5, float.nan];
    float4 B = [2, 3, 4, 5];

    // Expected lane masks, one row per FPComparison member (same order as the enum).
    static immutable int[4][FPComparison.max+1] correct =
    [
        [ 0,-1, 0, 0], // oeq
        [ 0, 0,-1, 0], // ogt
        [ 0,-1,-1, 0], // oge
        [-1, 0, 0, 0], // olt
        [-1,-1, 0, 0], // ole
        [-1, 0,-1, 0], // one
        [-1,-1,-1, 0], // ord
        [ 0,-1, 0,-1], // ueq
        [ 0, 0,-1,-1], // ugt
        [ 0,-1,-1,-1], // uge
        [-1, 0, 0,-1], // ult
        [-1,-1, 0,-1], // ule
        [-1, 0,-1,-1], // une
        [ 0, 0, 0,-1], // uno
    ];

    // Unrolled at compile time: `cmp` is usable as a template argument.
    foreach(cmp; EnumMembers!FPComparison)
    {
        int4 result = cmpps!cmp(A, B);
        assert(result.array == correct[cmp]);
    }
}
unittest
{
    double2 a = [1, 3];
    double2 b = [2, 3];
    long2 c = cmppd!(FPComparison.ult)(a, b);
    static immutable long[2] correct = [cast(long)(-1), 0];
    assert(c.array == correct);
}
unittest // cmpss and comss
{
    import std.traits : EnumMembers;

    void testComparison(FPComparison comparison)(float4 A, float4 B)
    {
        float4 result = cmpss!comparison(A, B);
        int4 iresult = cast(int4)result;
        int expected = compareFloat!float(comparison, A[0], B[0]) ? -1 : 0;
        assert(iresult[0] == expected);
        assert(result[1] == A[1]);
        assert(result[2] == A[2]);
        assert(result[3] == A[3]);

        // check comss
        int comResult = comss!comparison(A, B);
        assert( (expected != 0) == (comResult != 0) );
    }

    // Check all comparison type is working
    float4 A = [1, 3, 5, 6];
    float4 B = [2, 3, 4, 5];
    float4 C = [float.nan, 3, 4, 5];

    // Every predicate, against an ordered operand then a NaN operand.
    foreach(cmp; EnumMembers!FPComparison)
    {
        testComparison!cmp(A, B);
        testComparison!cmp(A, C);
    }
}
unittest // cmpsd and comsd
{
    import std.traits : EnumMembers;

    void testComparison(FPComparison comparison)(double2 A, double2 B)
    {
        double2 result = cmpsd!comparison(A, B);
        long2 iresult = cast(long2)result;
        long expected = compareFloat!double(comparison, A[0], B[0]) ? -1 : 0;
        assert(iresult[0] == expected);
        assert(result[1] == A[1]);

        // check comsd
        int comResult = comsd!comparison(A, B);
        assert( (expected != 0) == (comResult != 0) );
    }

    // Check all comparison type is working
    double2 A = [1, 3];
    double2 B = [2, 4];
    double2 C = [double.nan, 5];

    // Every predicate, against an ordered operand then a NaN operand.
    foreach(cmp; EnumMembers!FPComparison)
    {
        testComparison!cmp(A, B);
        testComparison!cmp(A, C);
    }
}

//
//  </FLOATING-POINT COMPARISONS>
//