/**
* Copyright: Copyright Auburn Sounds 2016-2018, Stefanos Baziotis 2019.
* License:   $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
* Authors:   Guillaume Piolat
*/
module inteli.internals;

import inteli.types;

// The only math functions needed for intel-intrinsics
public import core.math: sqrt; // since it's an intrinsics
public import std.math: abs; // `fabs` is broken with GCC 4.9.2 on Linux 64-bit


// Compile-time capability flags.
// Every compiler branch below defines the same five `GDC_with_*` enums, so code
// elsewhere can test them with `static if` whatever the compiler is.
version(GNU)
{
    version (X86)
    {
        // For 32-bit x86, disable vector extensions with GDC.
        // It just doesn't work well.
        enum GDC_with_x86 = true;
        enum GDC_with_MMX = false;
        enum GDC_with_SSE = false;
        enum GDC_with_SSE2 = false;
        enum GDC_with_SSE3 = false;
    }
    else version (X86_64)
    {
        // GDC support uses extended inline assembly:
        //   https://gcc.gnu.org/onlinedocs/gcc/Extended-Asm.html        (general information and hints)
        //   https://gcc.gnu.org/onlinedocs/gcc/Simple-Constraints.html  (binding variables to registers)
        //   https://gcc.gnu.org/onlinedocs/gcc/Machine-Constraints.html (x86 specific register short names)

        public import core.simd;

        // NOTE: These intrinsics are not available in every i386 and x86_64 CPU.
        // For more info: https://gcc.gnu.org/onlinedocs/gcc-4.9.2/gcc/X86-Built-in-Functions.html
        public import gcc.builtins;

        enum GDC_with_x86 = true;
        enum GDC_with_MMX = true;   // We don't have a way to detect that at CT, but we assume it's there
        enum GDC_with_SSE = true;   // We don't have a way to detect that at CT, but we assume it's there
        enum GDC_with_SSE2 = true;  // We don't have a way to detect that at CT, but we assume it's there
        enum GDC_with_SSE3 = false; // TODO: we don't have a way to detect that at CT
    }
    else
    {
        // GDC on a non-x86 target: nothing is available.
        enum GDC_with_x86 = false;
        enum GDC_with_MMX = false;
        enum GDC_with_SSE = false;
        enum GDC_with_SSE2 = false;
        enum GDC_with_SSE3 = false;
    }
}
else version(LDC)
{
    public import core.simd;
    public import ldc.simd;
    public import ldc.gccbuiltins_x86;
    public import ldc.intrinsics;
    public import ldc.llvmasm: __asm;

    // Since LDC 1.13, using the new ldc.llvmasm.__ir variants instead of inlineIR
    static if (__VERSION__ >= 2083)
    {
        import ldc.llvmasm;
        alias LDCInlineIR = __ir_pure;

        // A version of inline IR with prefix/suffix didn't exist before LDC 1.13
        alias LDCInlineIREx = __irEx_pure;
    }
    else
    {
        alias LDCInlineIR = inlineIR;
    }

    // Not GDC: every GDC flag is off.
    package(inteli)
    {
        enum GDC_with_x86 = false;
        enum GDC_with_MMX = false;
        enum GDC_with_SSE = false;
        enum GDC_with_SSE2 = false;
        enum GDC_with_SSE3 = false;
    }
}
else version(DigitalMars)
{
    // Not GDC: every GDC flag is off.
    package(inteli)
    {
        enum GDC_with_x86 = false;
        enum GDC_with_MMX = false;
        enum GDC_with_SSE = false;
        enum GDC_with_SSE2 = false;
        enum GDC_with_SSE3 = false;
    }
}
else
{
    static assert(false, "Unknown compiler");
}

// DMD inline-assembler availability.
//   DMD_with_asm       : DMD with either the 32-bit or the 64-bit inline assembler.
//   DMD_with_32bit_asm : DMD with the 32-bit inline assembler only
//                        (sometimes you want a 32-bit DMD only solution).
version(DigitalMars)
{
    version(D_InlineAsm_X86)
    {
        enum DMD_with_asm = true;
        enum DMD_with_32bit_asm = true;
    }
    else version(D_InlineAsm_X86_64)
    {
        enum DMD_with_asm = true;
        enum DMD_with_32bit_asm = false;
    }
    else
    {
        enum DMD_with_asm = false;
        enum DMD_with_32bit_asm = false;
    }
}
else
{
    enum DMD_with_asm = false;
    enum DMD_with_32bit_asm = false;
}




package:
nothrow @nogc:


//
//  <ROUNDING>
//
//  Why is that there? For DMD, we cannot use rint because _MM_SET_ROUNDING_MODE
//  doesn't change the FPU rounding mode, and isn't expected to do so.
//  So we devised these rounding functions to help having consistent rounding between
//  LDC and DMD. It's important that DMD uses what is in MXCSR to round.
//


/// Converts a `float` to `int` by executing `cvtss2si`.
/// GDC goes through extended inline asm; every other compiler uses the
/// DMD-style inline assembler.
int convertFloatToInt32UsingMXCSR(float value) pure @safe
{
    int rounded;
    version(GNU)
    {
        asm pure nothrow @nogc @trusted
        {
            "cvtss2si %1, %0\n": "=r"(rounded) : "x" (value);
        }
    }
    else
    {
        asm pure nothrow @nogc @trusted
        {
            cvtss2si EAX, value;
            mov rounded, EAX;
        }
    }
    return rounded;
}

/// Converts a `double` to `int` by executing `cvtsd2si`.
/// Same structure as the `float` variant above.
int convertDoubleToInt32UsingMXCSR(double value) pure @safe
{
    int rounded;
    version(GNU)
    {
        asm pure nothrow @nogc @trusted
        {
            "cvtsd2si %1, %0\n": "=r"(rounded) : "x" (value);
        }
    }
    else
    {
        asm pure nothrow @nogc @trusted
        {
            cvtsd2si EAX, value;
            mov rounded, EAX;
        }
    }
    return rounded;
}

/// Converts a `float` to `long`.
///
/// Three code paths, selected at compile time:
///   - 64-bit DMD-style inline asm: a single `cvtss2si` into RAX;
///   - 32-bit DMD-style inline asm: an x87 sequence that temporarily copies the
///     MXCSR rounding bits into the FPU control word;
///   - GDC: a builtin on x86_64, extended inline asm on 32-bit x86.
/// Any other configuration fails to compile (`static assert(false)`).
long convertFloatToInt64UsingMXCSR(float value) pure @safe
{
    // 64-bit can use an SSE instruction
    version(D_InlineAsm_X86_64)
    {
        long result;
        version(LDC) // work-around for " Data definition directives inside inline asm are not supported yet."
        {
            asm pure nothrow @nogc @trusted
            {
                movss XMM0, value;
                cvtss2si RAX, XMM0;
                mov result, RAX;
            }
        }
        else
        {
            asm pure nothrow @nogc @trusted
            {
                movss XMM0, value;
                db 0xf3; db 0x48; db 0x0f; db 0x2d; db 0xc0; // cvtss2si RAX, XMM0 (DMD refuses to emit)
                mov result, RAX;
            }
        }
        return result;
    }
    else version(D_InlineAsm_X86)
    {
        // In the case of 32-bit x86 there is no SSE2 way to convert FP to 64-bit int
        // This leads to an unfortunate FPU sequence in every C++ compiler.
        // See: https://godbolt.org/z/vZym77

        // Get current MXCSR rounding
        uint sseRounding;
        ushort savedFPUCW;
        ushort newFPUCW;
        long result;
        asm pure nothrow @nogc @trusted
        {
            stmxcsr sseRounding;
            fld value;
            fnstcw savedFPUCW;
            mov AX, savedFPUCW;
            and AX, 0xf3ff;          // clear FPU rounding bits
            movzx ECX, word ptr sseRounding;
            and ECX, 0x6000;         // only keep SSE rounding bits
            shr ECX, 3;
            or AX, CX;               // make a new control word for FPU with SSE bits
            mov newFPUCW, AX;
            fldcw newFPUCW;
            fistp qword ptr result;  // convert, respecting MXCSR (but not other control word things)
            fldcw savedFPUCW;        // put the original FPU control word back
        }
        return result;
    }
    else static if (GDC_with_x86)
    {
        version(X86_64) // 64-bit can just use the right instruction
        {
            static assert(GDC_with_SSE);
            __m128 A;
            A.ptr[0] = value; // only lane 0 is written before the call
            return __builtin_ia32_cvtss2si64 (A);
        }
        else version(X86) // 32-bit
        {
            // This is untested!
            // NOTE(review): `value` is bound with the "f" constraint here while the
            // `double` variant of this function uses "t" -- confirm which is intended.
            // NOTE(review): operands %1, %3 and %4 are stored to by the template but
            // are declared as inputs -- confirm against the GCC extended-asm rules.
            uint sseRounding;
            ushort savedFPUCW;
            ushort newFPUCW;
            long result;
            asm pure nothrow @nogc @trusted
            {
                "stmxcsr %1;\n" ~
                "fld %2;\n" ~
                "fnstcw %3;\n" ~
                "movw %3, %%ax;\n" ~
                "andw $0xf3ff, %%ax;\n" ~
                "movzwl %1, %%ecx;\n" ~
                "andl $0x6000, %%ecx;\n" ~
                "shrl $3, %%ecx;\n" ~
                "orw %%cx, %%ax\n" ~
                "movw %%ax, %4;\n" ~
                "fldcw %4;\n" ~
                "fistpll %0;\n" ~
                "fldcw %3;\n"
                  : "=m"(result)    // %0
                  : "m" (sseRounding),
                    "f" (value),
                    "m" (savedFPUCW),
                    "m" (newFPUCW)
                  : "eax", "ecx", "st";
            }
            return result;
        }
        else
            static assert(false);
    }
    else
        static assert(false);
}

///ditto
long convertDoubleToInt64UsingMXCSR(double value) pure @safe
{
    // 64-bit can use an SSE instruction
    version(D_InlineAsm_X86_64)
    {
        long result;
        version(LDC) // work-around for "Data definition directives inside inline asm are not supported yet."
        {
            asm pure nothrow @nogc @trusted
            {
                movsd XMM0, value;
                cvtsd2si RAX, XMM0;
                mov result, RAX;
            }
        }
        else
        {
            asm pure nothrow @nogc @trusted
            {
                movsd XMM0, value;
                db 0xf2; db 0x48; db 0x0f; db 0x2d; db 0xc0; // cvtsd2si RAX, XMM0 (DMD refuses to emit)
                mov result, RAX;
            }
        }
        return result;
    }
    else version(D_InlineAsm_X86)
    {
        // In the case of 32-bit x86 there is no SSE2 way to convert FP to 64-bit int
        // This leads to an unfortunate FPU sequence in every C++ compiler.
        // See: https://godbolt.org/z/vZym77

        // Get current MXCSR rounding
        uint sseRounding;
        ushort savedFPUCW;
        ushort newFPUCW;
        long result;
        asm pure nothrow @nogc @trusted
        {
            stmxcsr sseRounding;
            fld value;
            fnstcw savedFPUCW;
            mov AX, savedFPUCW;
            and AX, 0xf3ff;          // clear FPU rounding bits
            movzx ECX, word ptr sseRounding;
            and ECX, 0x6000;         // only keep SSE rounding bits
            shr ECX, 3;
            or AX, CX;               // make a new control word for FPU with SSE bits
            mov newFPUCW, AX;
            fldcw newFPUCW;
            fistp result;
            fldcw savedFPUCW;        // put the original FPU control word back
        }
        return result;
    }
    else static if (GDC_with_x86)
    {
        version(X86_64)
        {
            static assert(GDC_with_SSE2);
            __m128d A;
            A.ptr[0] = value; // only lane 0 is written before the call
            return __builtin_ia32_cvtsd2si64 (A);
        }
        else
        {
            // This is untested!
            // NOTE(review): `value` is bound with the "t" constraint here while the
            // `float` variant of this function uses "f" -- confirm which is intended.
            // NOTE(review): operands %1, %3 and %4 are stored to by the template but
            // are declared as inputs -- confirm against the GCC extended-asm rules.
            uint sseRounding;
            ushort savedFPUCW;
            ushort newFPUCW;
            long result;
            asm pure nothrow @nogc @trusted
            {
                "stmxcsr %1;\n" ~
                "fld %2;\n" ~
                "fnstcw %3;\n" ~
                "movw %3, %%ax;\n" ~
                "andw $0xf3ff, %%ax;\n" ~
                "movzwl %1, %%ecx;\n" ~
                "andl $0x6000, %%ecx;\n" ~
                "shrl $3, %%ecx;\n" ~
                "orw %%cx, %%ax\n" ~
                "movw %%ax, %4;\n" ~
                "fldcw %4;\n" ~
                "fistpll %0;\n" ~
                "fldcw %3;\n"
                  : "=m"(result)    // %0
                  : "m" (sseRounding),
                    "t" (value),
                    "m" (savedFPUCW),
                    "m" (newFPUCW)
                  : "eax", "ecx", "st";
            }
            return result;
        }
    }
    else
        static assert(false);
}


//
//  </ROUNDING>
//


// using the Intel terminology here

/// Clamps a signed 16-bit value to the `byte` range [-128, 127].
byte saturateSignedWordToSignedByte(short value) pure @safe
{
    if (value > byte.max) return byte.max;
    if (value < byte.min) return byte.min;
    return cast(byte) value;
}

/// Clamps a signed 16-bit value to the `ubyte` range [0, 255].
ubyte saturateSignedWordToUnsignedByte(short value) pure @safe
{
    if (value > ubyte.max) return ubyte.max;
    if (value < 0) return 0;
    return cast(ubyte) value;
}

/// Clamps a signed 32-bit value to the `short` range [-32768, 32767].
short saturateSignedIntToSignedShort(int value) pure @safe
{
    if (value > short.max) return short.max;
    if (value < short.min) return short.min;
    return cast(short) value;
}

/// Clamps a signed 32-bit value to the `ushort` range [0, 65535].
ushort saturateSignedIntToUnsignedShort(int value) pure @safe
{
    if (value > ushort.max) return ushort.max;
    if (value < 0) return 0;
    return cast(ushort) value;
}

unittest // test saturate operations
{
    // above the upper bound
    assert( saturateSignedWordToSignedByte(32000) == 127);
    assert( saturateSignedWordToUnsignedByte(32000) == 255);
    // below the lower bound
    assert( saturateSignedWordToSignedByte(-4000) == -128);
    assert( saturateSignedWordToUnsignedByte(-4000) == 0);
    // one past short.max: saturates when signed, fits when unsigned
    assert( saturateSignedIntToSignedShort(32768) == 32767);
    assert( saturateSignedIntToUnsignedShort(32768) == 32768);
    // one below short.min
    assert( saturateSignedIntToSignedShort(-32769) == -32768);
    assert( saturateSignedIntToUnsignedShort(-32769) == 0);
}

version(unittest)
{
    // This is just for debugging tests
    import core.stdc.stdio: printf;

    // printing vectors for implementation
    // Note: you can override `pure` within a `debug` clause

    /// Prints the two 32-bit lanes of an `__m64`.
    void _mm_print_pi32(__m64 v) @trusted
    {
        int[2] lanes = (cast(int2)v).array;
        printf("%d %d\n", lanes[0], lanes[1]);
    }

    /// Prints the four 16-bit lanes of an `__m64`.
    void _mm_print_pi16(__m64 v) @trusted
    {
        short[4] lanes = (cast(short4)v).array;
        printf("%d %d %d %d\n", lanes[0], lanes[1], lanes[2], lanes[3]);
    }

    /// Prints the eight 8-bit lanes of an `__m64`.
    void _mm_print_pi8(__m64 v) @trusted
    {
        byte[8] lanes = (cast(byte8)v).array;
        printf("%d %d %d %d %d %d %d %d\n",
               lanes[0], lanes[1], lanes[2], lanes[3],
               lanes[4], lanes[5], lanes[6], lanes[7]);
    }

    /// Prints the four 32-bit lanes of an `__m128i`.
    void _mm_print_epi32(__m128i v) @trusted
    {
        printf("%d %d %d %d\n",
               v.array[0], v.array[1], v.array[2], v.array[3]);
    }

    /// Prints the eight 16-bit lanes of an `__m128i`.
    void _mm_print_epi16(__m128i v) @trusted
    {
        short[8] lanes = (cast(short8)v).array;
        printf("%d %d %d %d %d %d %d %d\n",
               lanes[0], lanes[1], lanes[2], lanes[3],
               lanes[4], lanes[5], lanes[6], lanes[7]);
    }

    /// Prints the sixteen 8-bit lanes of an `__m128i`.
    void _mm_print_epi8(__m128i v) @trusted
    {
        byte[16] lanes = (cast(byte16)v).array;
        printf("%d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d\n",
               lanes[0], lanes[1], lanes[2], lanes[3],
               lanes[4], lanes[5], lanes[6], lanes[7],
               lanes[8], lanes[9], lanes[10], lanes[11],
               lanes[12], lanes[13], lanes[14], lanes[15]);
    }

    /// Prints the four `float` lanes of an `__m128`.
    void _mm_print_ps(__m128 v) @trusted
    {
        float[4] lanes = (cast(float4)v).array;
        printf("%f %f %f %f\n", lanes[0], lanes[1], lanes[2], lanes[3]);
    }

    /// Prints the two `double` lanes of an `__m128d`.
    void _mm_print_pd(__m128d v) @trusted
    {
        double[2] lanes = (cast(double2)v).array;
        printf("%f %f\n", lanes[0], lanes[1]);
    }
}


//
//  <FLOATING-POINT COMPARISONS>
//
// Note: `ldc.simd` cannot express all nuances of FP comparisons, so we
//       need different IR generation.

/// Floating-point comparison predicates.
/// "Ordered" predicates are false when either operand is NaN;
/// "unordered" predicates are true when either operand is NaN.
enum FPComparison
{
    oeq,   // ordered and equal
    ogt,   // ordered and greater than
    oge,   // ordered and greater than or equal
    olt,   // ordered and less than
    ole,   // ordered and less than or equal
    one,   // ordered and not equal
    ord,   // ordered (no nans)
    ueq,   // unordered or equal
    ugt,   // unordered or greater than ("nle")
    uge,   // unordered or greater than or equal ("nlt")
    ult,   // unordered or less than ("nge")
    ule,   // unordered or less than or equal ("ngt")
    une,   // unordered or not equal ("neq")
    uno,   // unordered (either nans)
}

// Spelling of each predicate, indexed by its `FPComparison` value.
// The entries must stay in the same order as the enum members above.
private static immutable string[FPComparison.max+1] FPComparisonToString =
[
    "oeq",
    "ogt",
    "oge",
    "olt",
    "ole",
    "one",
    "ord",
    "ueq",
    "ugt",
    "uge",
    "ult",
    "ule",
    "une",
    "uno",
];

// Individual float comparison: returns `true` when the predicate holds, `false` otherwise.
// Useful for DMD and testing.
// The result is a plain `bool`; callers expand it themselves into the
// -1/0 lane mask (cmpXX) or the 1/0 flag (comXX).
private bool compareFloat(T)(FPComparison comparison, T a, T b) pure @safe
{
    import std.math : isNaN;

    // A comparison is "unordered" as soon as one operand is NaN.
    immutable bool hasNaN = isNaN(a) || isNaN(b);

    final switch (comparison) with (FPComparison)
    {
        // Ordered predicates: the plain operators already yield false on NaN,
        // except `!=`, which has to be masked explicitly.
        case oeq: return a == b;
        case ogt: return a > b;
        case oge: return a >= b;
        case olt: return a < b;
        case ole: return a <= b;
        case one: return (a != b) && !hasNaN; // NaN with != always yields true
        case ord: return !hasNaN;

        // Unordered predicates: true as soon as a NaN is involved.
        case ueq: return hasNaN || (a == b);
        case ugt: return hasNaN || (a > b);
        case uge: return hasNaN || (a >= b);
        case ult: return hasNaN || (a < b);
        case ule: return hasNaN || (a <= b);
        case une: return (a != b); // NaN with != always yields true
        case uno: return hasNaN;
    }
}

version(LDC)
{
    /// Provides packed float comparisons.
    /// Each lane of the result is the sign-extended `fcmp` outcome for that lane.
    package int4 cmpps(FPComparison comparison)(float4 a, float4 b) pure @safe
    {
        enum ir = `
            %cmp = fcmp `~ FPComparisonToString[comparison] ~` <4 x float> %0, %1
            %r = sext <4 x i1> %cmp to <4 x i32>
            ret <4 x i32> %r`;

        return LDCInlineIR!(ir, int4, float4, float4)(a, b);
    }

    /// Provides packed double comparisons.
    /// Each lane of the result is the sign-extended `fcmp` outcome for that lane.
    package long2 cmppd(FPComparison comparison)(double2 a, double2 b) pure @safe
    {
        enum ir = `
            %cmp = fcmp `~ FPComparisonToString[comparison] ~` <2 x double> %0, %1
            %r = sext <2 x i1> %cmp to <2 x i64>
            ret <2 x i64> %r`;

        return LDCInlineIR!(ir, long2, double2, double2)(a, b);
    }

    /// CMPSS-style comparisons: lane 0 receives the comparison mask, the other
    /// lanes are copied from `a`.
    /// clang implements it through x86 intrinsics; it is possible with IR alone
    /// but leads to less optimal code.
    /// PERF: try to implement it with __builtin_ia32_cmpss and immediate 0 to 7.
    /// Not that simple.
    package float4 cmpss(FPComparison comparison)(float4 a, float4 b) pure @safe
    {
        /*
        NOTE(review): disabled sketch kept from the original. It names the `cmpsd`
        builtin although this is the float variant, and `FPComparisonToX86Predicate`
        is not defined in the code visible here.

        enum ubyte predicateNumber = FPComparisonToX86Predicate[comparison];
        enum bool invertOp = (predicateNumber & 0x80) != 0;
        static if(invertOp)
            return __builtin_ia32_cmpsd(b, a, predicateNumber & 0x7f);
        else
            return __builtin_ia32_cmpsd(a, b, predicateNumber & 0x7f);
        */
        enum ir = `
            %cmp = fcmp `~ FPComparisonToString[comparison] ~` float %0, %1
            %r = sext i1 %cmp to i32
            %r2 = bitcast i32 %r to float
            ret float %r2`;

        float4 result = a;
        result[0] = LDCInlineIR!(ir, float, float, float)(a[0], b[0]);
        return result;
    }

    /// CMPSD-style comparisons: lane 0 receives the comparison mask, lane 1 is
    /// copied from `a`.
    /// clang implements it through x86 intrinsics; it is possible with IR alone
    /// but leads to less optimal code.
    /// PERF: try to implement it with __builtin_ia32_cmpsd and immediate 0 to 7.
    /// Not that simple.
    package double2 cmpsd(FPComparison comparison)(double2 a, double2 b) pure @safe
    {
        enum ir = `
            %cmp = fcmp `~ FPComparisonToString[comparison] ~` double %0, %1
            %r = sext i1 %cmp to i64
            %r2 = bitcast i64 %r to double
            ret double %r2`;

        double2 result = a;
        result[0] = LDCInlineIR!(ir, double, double, double)(a[0], b[0]);
        return result;
    }

    /// Compares lane 0 of `a` and `b`; returns 1 when the predicate holds, else 0.
    // Note: ucomss and ucomsd are left unimplemented
    package int comss(FPComparison comparison)(float4 a, float4 b) pure @safe
    {
        enum ir = `
            %cmp = fcmp `~ FPComparisonToString[comparison] ~` float %0, %1
            %r = zext i1 %cmp to i32
            ret i32 %r`;

        return LDCInlineIR!(ir, int, float, float)(a[0], b[0]);
    }

    /// Compares lane 0 of `a` and `b`; returns 1 when the predicate holds, else 0.
    // Note: ucomss and ucomsd are left unimplemented
    package int comsd(FPComparison comparison)(double2 a, double2 b) pure @safe
    {
        enum ir = `
            %cmp = fcmp `~ FPComparisonToString[comparison] ~` double %0, %1
            %r = zext i1 %cmp to i32
            ret i32 %r`;

        return LDCInlineIR!(ir, int, double, double)(a[0], b[0]);
    }
}
else
{
    // Scalar fallbacks built on `compareFloat`, one lane at a time.

    /// Provides packed float comparisons
    package int4 cmpps(FPComparison comparison)(float4 a, float4 b) pure @trusted
    {
        int4 mask;
        foreach (lane; 0..4)
        {
            immutable bool holds = compareFloat!float(comparison, a.array[lane], b.array[lane]);
            mask.ptr[lane] = holds ? -1 : 0;
        }
        return mask;
    }

    /// Provides packed double comparisons
    package long2 cmppd(FPComparison comparison)(double2 a, double2 b) pure @trusted
    {
        long2 mask;
        foreach (lane; 0..2)
        {
            immutable bool holds = compareFloat!double(comparison, a.array[lane], b.array[lane]);
            mask.ptr[lane] = holds ? -1 : 0;
        }
        return mask;
    }

    /// Provides CMPSS-style comparison: lane 0 becomes the mask, the other lanes
    /// keep the bits of `a`.
    package float4 cmpss(FPComparison comparison)(float4 a, float4 b) pure @trusted
    {
        int4 bits = cast(int4)a;
        immutable bool holds = compareFloat!float(comparison, a.array[0], b.array[0]);
        bits.ptr[0] = holds ? -1 : 0;
        return cast(float4)bits;
    }

    /// Provides CMPSD-style comparison: lane 0 becomes the mask, lane 1 keeps
    /// the bits of `a`.
    package double2 cmpsd(FPComparison comparison)(double2 a, double2 b) pure @trusted
    {
        long2 bits = cast(long2)a;
        immutable bool holds = compareFloat!double(comparison, a.array[0], b.array[0]);
        bits.ptr[0] = holds ? -1 : 0;
        return cast(double2)bits;
    }

    /// Compares lane 0 of `a` and `b`; returns 1 when the predicate holds, else 0.
    package int comss(FPComparison comparison)(float4 a, float4 b) pure @safe
    {
        immutable bool holds = compareFloat!float(comparison, a.array[0], b.array[0]);
        return holds ? 1 : 0;
    }

    /// Compares lane 0 of `a` and `b`; returns 1 when the predicate holds, else 0.
    // Note: ucomss and ucomsd are left unimplemented
    package int comsd(FPComparison comparison)(double2 a, double2 b) pure @safe
    {
        immutable bool holds = compareFloat!double(comparison, a.array[0], b.array[0]);
        return holds ? 1 : 0;
    }
}
unittest // cmpps
{
    // Check all comparison type is working.
    // Lanes: less-than, equal, greater-than, NaN on the left.
    float4 A = [1, 3, 5, float.nan];
    float4 B = [2, 3, 4, 5];

    // Expected masks, one table per predicate (static data: this test is @nogc).
    static immutable int[4] correct_oeq = [ 0,-1, 0, 0];
    static immutable int[4] correct_ogt = [ 0, 0,-1, 0];
    static immutable int[4] correct_oge = [ 0,-1,-1, 0];
    static immutable int[4] correct_olt = [-1, 0, 0, 0];
    static immutable int[4] correct_ole = [-1,-1, 0, 0];
    static immutable int[4] correct_one = [-1, 0,-1, 0];
    static immutable int[4] correct_ord = [-1,-1,-1, 0];
    static immutable int[4] correct_ueq = [ 0,-1, 0,-1];
    static immutable int[4] correct_ugt = [ 0, 0,-1,-1];
    static immutable int[4] correct_uge = [ 0,-1,-1,-1];
    static immutable int[4] correct_ult = [-1, 0, 0,-1];
    static immutable int[4] correct_ule = [-1,-1, 0,-1];
    static immutable int[4] correct_une = [-1, 0,-1,-1];
    static immutable int[4] correct_uno = [ 0, 0, 0,-1];

    int4 result_oeq = cmpps!(FPComparison.oeq)(A, B);
    int4 result_ogt = cmpps!(FPComparison.ogt)(A, B);
    int4 result_oge = cmpps!(FPComparison.oge)(A, B);
    int4 result_olt = cmpps!(FPComparison.olt)(A, B);
    int4 result_ole = cmpps!(FPComparison.ole)(A, B);
    int4 result_one = cmpps!(FPComparison.one)(A, B);
    int4 result_ord = cmpps!(FPComparison.ord)(A, B);
    int4 result_ueq = cmpps!(FPComparison.ueq)(A, B);
    int4 result_ugt = cmpps!(FPComparison.ugt)(A, B);
    int4 result_uge = cmpps!(FPComparison.uge)(A, B);
    int4 result_ult = cmpps!(FPComparison.ult)(A, B);
    int4 result_ule = cmpps!(FPComparison.ule)(A, B);
    int4 result_une = cmpps!(FPComparison.une)(A, B);
    int4 result_uno = cmpps!(FPComparison.uno)(A, B);

    assert(result_oeq.array == correct_oeq);
    assert(result_ogt.array == correct_ogt);
    assert(result_oge.array == correct_oge);
    assert(result_olt.array == correct_olt);
    assert(result_ole.array == correct_ole);
    assert(result_one.array == correct_one);
    assert(result_ord.array == correct_ord);
    assert(result_ueq.array == correct_ueq);
    assert(result_ugt.array == correct_ugt);
    assert(result_uge.array == correct_uge);
    assert(result_ult.array == correct_ult);
    assert(result_ule.array == correct_ule);
    assert(result_une.array == correct_une);
    assert(result_uno.array == correct_uno);
}
unittest // cmppd
{
    double2 a = [1, 3];
    double2 b = [2, 3];
    long2 mask = cmppd!(FPComparison.ult)(a, b);
    // 1 < 2 holds, 3 < 3 does not
    static immutable long[2] correct = [cast(long)(-1), 0];
    assert(mask.array == correct);
}
unittest // cmpss and comss
{
    // Checks cmpss and comss against the scalar reference `compareFloat`.
    void testComparison(FPComparison comparison)(float4 A, float4 B)
    {
        int expected = compareFloat!float(comparison, A.array[0], B.array[0]) ? -1 : 0;

        float4 result = cmpss!comparison(A, B);
        int4 iresult = cast(int4)result;
        assert(iresult.array[0] == expected);

        // upper lanes must come through unchanged from the first operand
        assert(result.array[1] == A.array[1]);
        assert(result.array[2] == A.array[2]);
        assert(result.array[3] == A.array[3]);

        // check comss
        int comResult = comss!comparison(A, B);
        assert( (expected != 0) == (comResult != 0) );
    }

    // Check all comparison type is working
    float4 A = [1, 3, 5, 6];
    float4 B = [2, 3, 4, 5];
    float4 C = [float.nan, 3, 4, 5];

    // Each predicate is run against an ordered pair (A, B) and a NaN pair (A, C).
    testComparison!(FPComparison.oeq)(A, B);
    testComparison!(FPComparison.oeq)(A, C);
    testComparison!(FPComparison.ogt)(A, B);
    testComparison!(FPComparison.ogt)(A, C);
    testComparison!(FPComparison.oge)(A, B);
    testComparison!(FPComparison.oge)(A, C);
    testComparison!(FPComparison.olt)(A, B);
    testComparison!(FPComparison.olt)(A, C);
    testComparison!(FPComparison.ole)(A, B);
    testComparison!(FPComparison.ole)(A, C);
    testComparison!(FPComparison.one)(A, B);
    testComparison!(FPComparison.one)(A, C);
    testComparison!(FPComparison.ord)(A, B);
    testComparison!(FPComparison.ord)(A, C);
    testComparison!(FPComparison.ueq)(A, B);
    testComparison!(FPComparison.ueq)(A, C);
    testComparison!(FPComparison.ugt)(A, B);
    testComparison!(FPComparison.ugt)(A, C);
    testComparison!(FPComparison.uge)(A, B);
    testComparison!(FPComparison.uge)(A, C);
    testComparison!(FPComparison.ult)(A, B);
    testComparison!(FPComparison.ult)(A, C);
    testComparison!(FPComparison.ule)(A, B);
    testComparison!(FPComparison.ule)(A, C);
    testComparison!(FPComparison.une)(A, B);
    testComparison!(FPComparison.une)(A, C);
    testComparison!(FPComparison.uno)(A, B);
    testComparison!(FPComparison.uno)(A, C);
}
unittest // cmpsd and comsd
{
    // Checks cmpsd and comsd against the scalar reference `compareFloat`.
    void testComparison(FPComparison comparison)(double2 A, double2 B)
    {
        long expected = compareFloat!double(comparison, A.array[0], B.array[0]) ? -1 : 0;

        double2 result = cmpsd!comparison(A, B);
        long2 iresult = cast(long2)result;
        assert(iresult.array[0] == expected);

        // upper lane must come through unchanged from the first operand
        assert(result.array[1] == A.array[1]);

        // check comsd
        int comResult = comsd!comparison(A, B);
        assert( (expected != 0) == (comResult != 0) );
    }

    // Check all comparison type is working
    double2 A = [1, 3];
    double2 B = [2, 4];
    double2 C = [double.nan, 5];

    // Each predicate is run against an ordered pair (A, B) and a NaN pair (A, C).
    testComparison!(FPComparison.oeq)(A, B);
    testComparison!(FPComparison.oeq)(A, C);
    testComparison!(FPComparison.ogt)(A, B);
    testComparison!(FPComparison.ogt)(A, C);
    testComparison!(FPComparison.oge)(A, B);
    testComparison!(FPComparison.oge)(A, C);
    testComparison!(FPComparison.olt)(A, B);
    testComparison!(FPComparison.olt)(A, C);
    testComparison!(FPComparison.ole)(A, B);
    testComparison!(FPComparison.ole)(A, C);
    testComparison!(FPComparison.one)(A, B);
    testComparison!(FPComparison.one)(A, C);
    testComparison!(FPComparison.ord)(A, B);
    testComparison!(FPComparison.ord)(A, C);
    testComparison!(FPComparison.ueq)(A, B);
    testComparison!(FPComparison.ueq)(A, C);
    testComparison!(FPComparison.ugt)(A, B);
    testComparison!(FPComparison.ugt)(A, C);
    testComparison!(FPComparison.uge)(A, B);
    testComparison!(FPComparison.uge)(A, C);
    testComparison!(FPComparison.ult)(A, B);
    testComparison!(FPComparison.ult)(A, C);
    testComparison!(FPComparison.ule)(A, B);
    testComparison!(FPComparison.ule)(A, C);
    testComparison!(FPComparison.une)(A, B);
    testComparison!(FPComparison.une)(A, C);
    testComparison!(FPComparison.uno)(A, B);
    testComparison!(FPComparison.uno)(A, C);
}

//
//  </FLOATING-POINT COMPARISONS>
//


/// Returns the low 64 bits of `a` as an `__m64`.
__m64 to_m64(__m128i a) pure @trusted
{
    long2 wide = cast(long2)a;
    long1 low;
    low.ptr[0] = wide.array[0];
    return low;
}

/// Widens `a` to 128 bits: `a` goes into the low 64-bit lane, the high lane is zero.
__m128i to_m128i(__m64 a) pure @trusted
{
    long2 wide = [0, 0];
    wide.ptr[0] = a.array[0];
    return cast(__m128i)wide;
}