/**
* Copyright: Copyright Auburn Sounds 2016-2018, Stefanos Baziotis 2019.
* License:   $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
* Authors:   Guillaume Piolat
*/
module inteli.internals;

import inteli.types;

// The only math functions needed for intel-intrinsics
public import core.math: sqrt; // since it's an intrinsic
public import std.math: abs; // `fabs` is broken with GCC 4.9.2 on Linux 64-bit


// Compile-time capability flags. Every compiler branch below defines the same
// five `GDC_with_*` enums so that the rest of the code can use `static if`.
version(GNU)
{
    version (X86)
    {
        // For 32-bit x86, disable vector extensions with GDC.
        // It just doesn't work well.
        enum GDC_with_x86 = true;
        enum GDC_with_MMX = false;
        enum GDC_with_SSE = false;
        enum GDC_with_SSE2 = false;
        enum GDC_with_SSE3 = false;
    }
    else version (X86_64)
    {
        // GDC support uses extended inline assembly:
        //   https://gcc.gnu.org/onlinedocs/gcc/Extended-Asm.html        (general information and hints)
        //   https://gcc.gnu.org/onlinedocs/gcc/Simple-Constraints.html  (binding variables to registers)
        //   https://gcc.gnu.org/onlinedocs/gcc/Machine-Constraints.html (x86 specific register short names)

        public import core.simd;

        // NOTE: These intrinsics are not available in every i386 and x86_64 CPU.
        // For more info: https://gcc.gnu.org/onlinedocs/gcc-4.9.2/gcc/X86-Built-in-Functions.html
        public import gcc.builtins;

        enum GDC_with_x86 = true;
        enum GDC_with_MMX = true;   // We don't have a way to detect that at CT, but we assume it's there
        enum GDC_with_SSE = true;   // We don't have a way to detect that at CT, but we assume it's there
        enum GDC_with_SSE2 = true;  // We don't have a way to detect that at CT, but we assume it's there
        enum GDC_with_SSE3 = false; // TODO: we don't have a way to detect that at CT
    }
    else
    {
        enum GDC_with_x86 = false;
        enum GDC_with_MMX = false;
        enum GDC_with_SSE = false;
        enum GDC_with_SSE2 = false;
        enum GDC_with_SSE3 = false;
    }
}
else version(LDC)
{
    public import core.simd;
    public import ldc.simd;
    public import ldc.gccbuiltins_x86;
    public import ldc.intrinsics;
    public import ldc.llvmasm: __asm;

    // Since LDC 1.13, using the new ldc.llvmasm.__ir variants instead of inlineIR
    static if (__VERSION__ >= 2083)
    {
        import ldc.llvmasm;
        alias LDCInlineIR = __ir_pure;

        // A version of inline IR with prefix/suffix didn't exist before LDC 1.13
        alias LDCInlineIREx = __irEx_pure;
    }
    else
    {
        // Note: no `LDCInlineIREx` alias is defined on this older path.
        alias LDCInlineIR = inlineIR;
    }

    package(inteli)
    {
        enum GDC_with_x86 = false;
        enum GDC_with_MMX = false;
        enum GDC_with_SSE = false;
        enum GDC_with_SSE2 = false;
        enum GDC_with_SSE3 = false;
    }
}
else version(DigitalMars)
{
    package(inteli)
    {
        enum GDC_with_x86 = false;
        enum GDC_with_MMX = false;
        enum GDC_with_SSE = false;
        enum GDC_with_SSE2 = false;
        enum GDC_with_SSE3 = false;
    }
}
else
{
    static assert(false, "Unknown compiler");
}

// `DMD_with_asm`: true only for DMD when DMD-style inline assembly is available.
// `DMD_with_32bit_asm`: same, restricted to the 32-bit x86 flavour.
version(DigitalMars)
{
    version(D_InlineAsm_X86)
        enum DMD_with_asm = true;
    else version(D_InlineAsm_X86_64)
        enum DMD_with_asm = true;
    else
        enum DMD_with_asm = false;

    version(D_InlineAsm_X86)
        enum DMD_with_32bit_asm = DMD_with_asm; // sometimes you want a 32-bit DMD only solution
    else
        enum DMD_with_32bit_asm = false;
}
else
{
    enum DMD_with_asm = false;
    enum DMD_with_32bit_asm = false;
}




package:
nothrow @nogc:


//
//  <ROUNDING>
//
//  Why is that there? For DMD, we cannot use rint because _MM_SET_ROUNDING_MODE
//  doesn't change the FPU rounding mode, and isn't expected to do so.
//  So we devised these rounding functions to help having consistent rounding between
//  LDC and DMD. It's important that DMD uses what is in MXCSR to round.
//


/// Converts `value` to a 32-bit integer with the `cvtss2si` instruction,
/// so that the conversion follows the rounding mode held in MXCSR.
/// Params:
///     value = single-precision input.
/// Returns: the converted 32-bit integer.
int convertFloatToInt32UsingMXCSR(float value) pure @safe
{
    int result;
    version(GNU)
    {
        // GCC extended asm: output in a general register, input in an SSE register.
        asm pure nothrow @nogc @trusted
        {
            "cvtss2si %1, %0\n": "=r"(result) : "x" (value);
        }
    }
    else
    {
        // DMD-style inline asm.
        asm pure nothrow @nogc @trusted
        {
            cvtss2si EAX, value;
            mov result, EAX;
        }
    }
    return result;
}

/// Converts `value` to a 32-bit integer with the `cvtsd2si` instruction,
/// so that the conversion follows the rounding mode held in MXCSR.
/// Params:
///     value = double-precision input.
/// Returns: the converted 32-bit integer.
int convertDoubleToInt32UsingMXCSR(double value) pure @safe
{
    int result;
    version(GNU)
    {
        // GCC extended asm: output in a general register, input in an SSE register.
        asm pure nothrow @nogc @trusted
        {
            "cvtsd2si %1, %0\n": "=r"(result) : "x" (value);
        }
    }
    else
    {
        // DMD-style inline asm.
        asm pure nothrow @nogc @trusted
        {
            cvtsd2si EAX, value;
            mov result, EAX;
        }
    }
    return result;
}

/// Converts `value` to a 64-bit integer following the rounding mode held in MXCSR.
///
/// With 64-bit inline asm (or GDC on x86_64) the 64-bit form of `cvtss2si` is used.
/// On 32-bit x86 the x87 FPU does the conversion: its control word is temporarily
/// rewritten with the MXCSR rounding bits, then restored.
/// Params:
///     value = single-precision input.
/// Returns: the converted 64-bit integer.
long convertFloatToInt64UsingMXCSR(float value) pure @safe
{
    // 64-bit can use an SSE instruction
    version(D_InlineAsm_X86_64)
    {
        long result;
        version(LDC) // work-around for " Data definition directives inside inline asm are not supported yet."
        {
            asm pure nothrow @nogc @trusted
            {
                movss XMM0, value;
                cvtss2si RAX, XMM0;
                mov result, RAX;
            }
        }
        else
        {
            asm pure nothrow @nogc @trusted
            {
                movss XMM0, value;
                db 0xf3; db 0x48; db 0x0f; db 0x2d; db 0xc0; // cvtss2si RAX, XMM0 (DMD refuses to emit)
                mov result, RAX;
            }
        }
        return result;
    }
    else version(D_InlineAsm_X86)
    {
        // In the case of 32-bit x86 there is no SSE2 way to convert FP to 64-bit int
        // This leads to an unfortunate FPU sequence in every C++ compiler.
        // See: https://godbolt.org/z/vZym77

        // Get current MXCSR rounding
        uint sseRounding;
        ushort savedFPUCW;
        ushort newFPUCW;
        long result;
        asm pure nothrow @nogc @trusted
        {
            stmxcsr sseRounding;
            fld value;
            fnstcw savedFPUCW;
            mov AX, savedFPUCW;
            and AX, 0xf3ff;          // clear FPU rounding bits
            movzx ECX, word ptr sseRounding;
            and ECX, 0x6000;         // only keep SSE rounding bits
            shr ECX, 3;              // move them to the position of the FPU rounding bits
            or AX, CX;               // make a new control word for FPU with SSE bits
            mov newFPUCW, AX;
            fldcw newFPUCW;
            fistp qword ptr result;  // convert, respecting MXCSR (but not other control word things)
            fldcw savedFPUCW;        // restore the caller's FPU control word
        }
        return result;
    }
    else static if (GDC_with_x86)
    {
        version(X86_64) // 64-bit can just use the right instruction
        {
            static assert(GDC_with_SSE);
            __m128 A;
            A.ptr[0] = value;
            return __builtin_ia32_cvtss2si64 (A);
        }
        else version(X86) // 32-bit
        {
            // This is untested!
            // NOTE(review): operands %1, %3 and %4 are written by the asm (stmxcsr / fnstcw / movw)
            // although they are listed as inputs, and `value` uses the "f" constraint here while
            // convertDoubleToInt64UsingMXCSR uses "t" - confirm on a real 32-bit GDC before relying on it.
            uint sseRounding;
            ushort savedFPUCW;
            ushort newFPUCW;
            long result;
            asm pure nothrow @nogc @trusted
            {
                "stmxcsr %1;\n" ~
                "fld %2;\n" ~
                "fnstcw %3;\n" ~
                "movw %3, %%ax;\n" ~
                "andw $0xf3ff, %%ax;\n" ~
                "movzwl %1, %%ecx;\n" ~
                "andl $0x6000, %%ecx;\n" ~
                "shrl $3, %%ecx;\n" ~
                "orw %%cx, %%ax\n" ~
                "movw %%ax, %4;\n" ~
                "fldcw %4;\n" ~
                "fistpll %0;\n" ~
                "fldcw %3;\n"
                  : "=m"(result)    // %0
                  : "m" (sseRounding),
                    "f" (value),
                    "m" (savedFPUCW),
                    "m" (newFPUCW)
                  : "eax", "ecx", "st";
            }
            return result;
        }
        else
            static assert(false, "convertFloatToInt64UsingMXCSR: GDC target is neither X86 nor X86_64");
    }
    else
        static assert(false, "convertFloatToInt64UsingMXCSR: no implementation for this compiler/architecture");
}

/// Converts `value` to a 64-bit integer following the rounding mode held in MXCSR.
///
/// With 64-bit inline asm (or GDC on x86_64) the 64-bit form of `cvtsd2si` is used.
/// On 32-bit x86 the x87 FPU does the conversion: its control word is temporarily
/// rewritten with the MXCSR rounding bits, then restored.
/// Params:
///     value = double-precision input.
/// Returns: the converted 64-bit integer.
long convertDoubleToInt64UsingMXCSR(double value) pure @safe
{
    // 64-bit can use an SSE instruction
    version(D_InlineAsm_X86_64)
    {
        long result;
        version(LDC) // work-around for "Data definition directives inside inline asm are not supported yet."
        {
            asm pure nothrow @nogc @trusted
            {
                movsd XMM0, value;
                cvtsd2si RAX, XMM0;
                mov result, RAX;
            }
        }
        else
        {
            asm pure nothrow @nogc @trusted
            {
                movsd XMM0, value;
                db 0xf2; db 0x48; db 0x0f; db 0x2d; db 0xc0; // cvtsd2si RAX, XMM0 (DMD refuses to emit)
                mov result, RAX;
            }
        }
        return result;
    }
    else version(D_InlineAsm_X86)
    {
        // In the case of 32-bit x86 there is no SSE2 way to convert FP to 64-bit int
        // This leads to an unfortunate FPU sequence in every C++ compiler.
        // See: https://godbolt.org/z/vZym77

        // Get current MXCSR rounding
        uint sseRounding;
        ushort savedFPUCW;
        ushort newFPUCW;
        long result;
        asm pure nothrow @nogc @trusted
        {
            stmxcsr sseRounding;
            fld value;
            fnstcw savedFPUCW;
            mov AX, savedFPUCW;
            and AX, 0xf3ff;          // clear FPU rounding bits
            movzx ECX, word ptr sseRounding;
            and ECX, 0x6000;         // only keep SSE rounding bits
            shr ECX, 3;              // move them to the position of the FPU rounding bits
            or AX, CX;               // make a new control word for FPU with SSE bits
            mov newFPUCW, AX;
            fldcw newFPUCW;
            fistp qword ptr result;  // explicit 64-bit store, same form as the float variant
            fldcw savedFPUCW;        // restore the caller's FPU control word
        }
        return result;
    }
    else static if (GDC_with_x86)
    {
        version(X86_64)
        {
            static assert(GDC_with_SSE2);
            __m128d A;
            A.ptr[0] = value;
            return __builtin_ia32_cvtsd2si64 (A);
        }
        else
        {
            // This is untested!
            // NOTE(review): operands %1, %3 and %4 are written by the asm (stmxcsr / fnstcw / movw)
            // although they are listed as inputs, and `value` uses the "t" constraint here while
            // convertFloatToInt64UsingMXCSR uses "f" - confirm on a real 32-bit GDC before relying on it.
            uint sseRounding;
            ushort savedFPUCW;
            ushort newFPUCW;
            long result;
            asm pure nothrow @nogc @trusted
            {
                "stmxcsr %1;\n" ~
                "fld %2;\n" ~
                "fnstcw %3;\n" ~
                "movw %3, %%ax;\n" ~
                "andw $0xf3ff, %%ax;\n" ~
                "movzwl %1, %%ecx;\n" ~
                "andl $0x6000, %%ecx;\n" ~
                "shrl $3, %%ecx;\n" ~
                "orw %%cx, %%ax\n" ~
                "movw %%ax, %4;\n" ~
                "fldcw %4;\n" ~
                "fistpll %0;\n" ~
                "fldcw %3;\n"
                  : "=m"(result)    // %0
                  : "m" (sseRounding),
                    "t" (value),
                    "m" (savedFPUCW),
                    "m" (newFPUCW)
                  : "eax", "ecx", "st";
            }
            return result;
        }
    }
    else
        static assert(false, "convertDoubleToInt64UsingMXCSR: no implementation for this compiler/architecture");
}


//
//  </ROUNDING>
//


// using the Intel terminology here

/// Clamps a signed 16-bit value to the signed 8-bit range [-128, 127].
byte saturateSignedWordToSignedByte(short value) pure @safe
{
    if (value > 127) value = 127;
    if (value < -128) value = -128;
    return cast(byte) value;
}

/// Clamps a signed 16-bit value to the unsigned 8-bit range [0, 255].
ubyte saturateSignedWordToUnsignedByte(short value) pure @safe
{
    if (value > 255) value = 255;
    if (value < 0) value = 0;
    return cast(ubyte) value;
}

/// Clamps a signed 32-bit value to the signed 16-bit range [-32768, 32767].
short saturateSignedIntToSignedShort(int value) pure @safe
{
    if (value > 32767) value = 32767;
    if (value < -32768) value = -32768;
    return cast(short) value;
}

/// Clamps a signed 32-bit value to the unsigned 16-bit range [0, 65535].
ushort saturateSignedIntToUnsignedShort(int value) pure @safe
{
    if (value > 65535) value = 65535;
    if (value < 0) value = 0;
    return cast(ushort) value;
}

unittest // test saturate operations
{
    assert( saturateSignedWordToSignedByte(32000) == 127);
    assert( saturateSignedWordToUnsignedByte(32000) == 255);
    assert( saturateSignedWordToSignedByte(-4000) == -128);
    assert( saturateSignedWordToUnsignedByte(-4000) == 0);
    assert( saturateSignedIntToSignedShort(32768) == 32767);
    assert( saturateSignedIntToUnsignedShort(32768) == 32768);
    assert( saturateSignedIntToSignedShort(-32769) == -32768);
    assert( saturateSignedIntToUnsignedShort(-32769) == 0);
}

version(unittest)
{
    // This is just for debugging tests
    import core.stdc.stdio: printf;

    // printing vectors for implementation
    // Note: you can override `pure` within a `debug` clause

    /// Prints the single 64-bit lane of `v`.
    void _mm_print_pi64(__m64 v) @trusted
    {
        long1 vl = cast(long1)v;
        printf("%lld\n", vl.array[0]);
    }

    /// Prints `v` as two 32-bit integers.
    void _mm_print_pi32(__m64 v) @trusted
    {
        int[2] C = (cast(int2)v).array;
        printf("%d %d\n", C[0], C[1]);
    }

    /// Prints `v` as four 16-bit integers.
    void _mm_print_pi16(__m64 v) @trusted
    {
        short[4] C = (cast(short4)v).array;
        printf("%d %d %d %d\n", C[0], C[1], C[2], C[3]);
    }

    /// Prints `v` as eight 8-bit integers.
    void _mm_print_pi8(__m64 v) @trusted
    {
        byte[8] C = (cast(byte8)v).array;
        printf("%d %d %d %d %d %d %d %d\n",
        C[0], C[1], C[2], C[3], C[4], C[5], C[6], C[7]);
    }

    /// Prints `v` as two 64-bit integers.
    void _mm_print_epi64(__m128i v) @trusted
    {
        long2 vl = cast(long2)v;
        printf("%lld %lld\n", vl.array[0], vl.array[1]);
    }

    /// Prints `v` as four 32-bit integers.
    void _mm_print_epi32(__m128i v) @trusted
    {
        printf("%d %d %d %d\n",
              v.array[0], v.array[1], v.array[2], v.array[3]);
    }

    /// Prints `v` as eight 16-bit integers.
    void _mm_print_epi16(__m128i v) @trusted
    {
        short[8] C = (cast(short8)v).array;
        printf("%d %d %d %d %d %d %d %d\n",
        C[0], C[1], C[2], C[3], C[4], C[5], C[6], C[7]);
    }

    /// Prints `v` as sixteen 8-bit integers.
    void _mm_print_epi8(__m128i v) @trusted
    {
        byte[16] C = (cast(byte16)v).array;
        printf("%d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d\n",
        C[0], C[1], C[2], C[3], C[4], C[5], C[6], C[7], C[8], C[9], C[10], C[11], C[12], C[13], C[14], C[15]);
    }

    /// Prints `v` as four floats.
    void _mm_print_ps(__m128 v) @trusted
    {
        float[4] C = (cast(float4)v).array;
        printf("%f %f %f %f\n", C[0], C[1], C[2], C[3]);
    }

    /// Prints `v` as two doubles.
    void _mm_print_pd(__m128d v) @trusted
    {
        double[2] C = (cast(double2)v).array;
        printf("%f %f\n", C[0], C[1]);
    }
}


//
//  <FLOATING-POINT COMPARISONS>
//
// Note: `ldc.simd` cannot express all nuances of FP comparisons, so we
//       need different IR generation.

/// Floating-point comparison predicates.
/// The member order must stay in sync with `FPComparisonToString`, which is indexed by it.
enum FPComparison
{
    oeq,   // ordered and equal
    ogt,   // ordered and greater than
    oge,   // ordered and greater than or equal
    olt,   // ordered and less than
    ole,   // ordered and less than or equal
    one,   // ordered and not equal
    ord,   // ordered (no nans)
    ueq,   // unordered or equal
    ugt,   // unordered or greater than ("nle")
    uge,   // unordered or greater than or equal ("nlt")
    ult,   // unordered or less than ("nge")
    ule,   // unordered or less than or equal ("ngt")
    une,   // unordered or not equal ("neq")
    uno,   // unordered (either nans)
}

// Textual predicate name for each `FPComparison` member, indexed by the enum value.
private static immutable string[FPComparison.max+1] FPComparisonToString =
[
    "oeq",
    "ogt",
    "oge",
    "olt",
    "ole",
    "one",
    "ord",
    "ueq",
    "ugt",
    "uge",
    "ult",
    "ule",
    "une",
    "uno",
];

// Individual float comparison: returns true when the predicate holds, false otherwise.
// Useful for DMD and testing. The function itself yields a `bool`; the -1 / 0 lane
// masks (cmpps, cmppd, cmpss, cmpsd) and the 1 / 0 results (comss, comsd) are built by the callers.
//
// Params:
//     comparison = predicate to evaluate.
//     a, b       = operands; "unordered" means that at least one of them is NaN.
private bool compareFloat(T)(FPComparison comparison, T a, T b) pure @safe
{
    import std.math : isNaN; // only `isNaN` is needed here
    bool unordered = isNaN(a) || isNaN(b);
    final switch(comparison) with(FPComparison)
    {
        // The plain relational operators are already false when an operand is NaN.
        case oeq: return a == b;
        case ogt: return a > b;
        case oge: return a >= b;
        case olt: return a < b;
        case ole: return a <= b;
        case one: return !unordered && (a != b); // NaN with != always yields true
        case ord: return !unordered;
        case ueq: return unordered || (a == b);
        case ugt: return unordered || (a > b);
        case uge: return unordered || (a >= b);
        case ult: return unordered || (a < b);
        case ule: return unordered || (a <= b);
        case une: return (a != b); // NaN with != always yields true
        case uno: return unordered;
    }
}

version(LDC)
{
    /// Provides packed float comparisons
    /// Each lane of the result is the sign-extended `fcmp` outcome (all ones or zero).
    package int4 cmpps(FPComparison comparison)(float4 a, float4 b) pure @safe
    {
        enum ir = `
            %cmp = fcmp `~ FPComparisonToString[comparison] ~` <4 x float> %0, %1
            %r = sext <4 x i1> %cmp to <4 x i32>
            ret <4 x i32> %r`;

        return LDCInlineIR!(ir, int4, float4, float4)(a, b);
    }

    /// Provides packed double comparisons
    /// Each lane of the result is the sign-extended `fcmp` outcome (all ones or zero).
    package long2 cmppd(FPComparison comparison)(double2 a, double2 b) pure @safe
    {
        enum ir = `
            %cmp = fcmp `~ FPComparisonToString[comparison] ~` <2 x double> %0, %1
            %r = sext <2 x i1> %cmp to <2 x i64>
            ret <2 x i64> %r`;

        return LDCInlineIR!(ir, long2, double2, double2)(a, b);
    }

    /// CMPSS-style comparisons
    /// clang implement it through x86 intrinsics, it is possible with IR alone
    /// but leads to less optimal code.
    /// PERF: try to implement it with __builtin_ia32_cmpss and immediate 0 to 7.
    /// Not that simple.
    /// Only lane 0 is compared; the other lanes are copied from `a`.
    package float4 cmpss(FPComparison comparison)(float4 a, float4 b) pure @safe
    {
        /*
        // Sketch of the intrinsic-based variant (not compiled; `FPComparisonToX86Predicate`
        // is not defined in the visible code):
        enum ubyte predicateNumber = FPComparisonToX86Predicate[comparison];
        enum bool invertOp = (predicateNumber & 0x80) != 0;
        static if(invertOp)
            return __builtin_ia32_cmpss(b, a, predicateNumber & 0x7f);
        else
            return __builtin_ia32_cmpss(a, b, predicateNumber & 0x7f);
        */
        enum ir = `
            %cmp = fcmp `~ FPComparisonToString[comparison] ~` float %0, %1
            %r = sext i1 %cmp to i32
            %r2 = bitcast i32 %r to float
            ret float %r2`;

        float4 r = a;
        r[0] = LDCInlineIR!(ir, float, float, float)(a[0], b[0]);
        return r;
    }

    /// CMPSD-style comparisons
    /// clang implement it through x86 intrinsics, it is possible with IR alone
    /// but leads to less optimal code.
    /// PERF: try to implement it with __builtin_ia32_cmpsd and immediate 0 to 7.
    /// Not that simple.
    /// Only lane 0 is compared; the other lane is copied from `a`.
    package double2 cmpsd(FPComparison comparison)(double2 a, double2 b) pure @safe
    {
        enum ir = `
            %cmp = fcmp `~ FPComparisonToString[comparison] ~` double %0, %1
            %r = sext i1 %cmp to i64
            %r2 = bitcast i64 %r to double
            ret double %r2`;

        double2 r = a;
        r[0] = LDCInlineIR!(ir, double, double, double)(a[0], b[0]);
        return r;
    }

    /// Compares lane 0 of `a` and `b`; returns 1 when the predicate holds, else 0.
    // Note: ucomss and ucomsd are left unimplemented
    package int comss(FPComparison comparison)(float4 a, float4 b) pure @safe
    {
        enum ir = `
            %cmp = fcmp `~ FPComparisonToString[comparison] ~` float %0, %1
            %r = zext i1 %cmp to i32
            ret i32 %r`;

        return LDCInlineIR!(ir, int, float, float)(a[0], b[0]);
    }

    /// Compares lane 0 of `a` and `b`; returns 1 when the predicate holds, else 0.
    // Note: ucomss and ucomsd are left unimplemented
    package int comsd(FPComparison comparison)(double2 a, double2 b) pure @safe
    {
        enum ir = `
            %cmp = fcmp `~ FPComparisonToString[comparison] ~` double %0, %1
            %r = zext i1 %cmp to i32
            ret i32 %r`;

        return LDCInlineIR!(ir, int, double, double)(a[0], b[0]);
    }
}
else
{
    /// Provides packed float comparisons
    /// Each lane of the result is -1 when the predicate holds for that lane, else 0.
    package int4 cmpps(FPComparison comparison)(float4 a, float4 b) pure @trusted
    {
        int4 result;
        foreach(i; 0..4)
        {
            result.ptr[i] = compareFloat!float(comparison, a.array[i], b.array[i]) ? -1 : 0;
        }
        return result;
    }

    /// Provides packed double comparisons
    /// Each lane of the result is -1 when the predicate holds for that lane, else 0.
    package long2 cmppd(FPComparison comparison)(double2 a, double2 b) pure @trusted
    {
        long2 result;
        foreach(i; 0..2)
        {
            result.ptr[i] = compareFloat!double(comparison, a.array[i], b.array[i]) ? -1 : 0;
        }
        return result;
    }

    /// Provides CMPSS-style comparison
    /// Only lane 0 is compared; the other lanes keep the bits of `a`.
    package float4 cmpss(FPComparison comparison)(float4 a, float4 b) pure @trusted
    {
        int4 result = cast(int4)a;
        result.ptr[0] = compareFloat!float(comparison, a.array[0], b.array[0]) ? -1 : 0;
        return cast(float4)result;
    }

    /// Provides CMPSD-style comparison
    /// Only lane 0 is compared; the other lane keeps the bits of `a`.
    package double2 cmpsd(FPComparison comparison)(double2 a, double2 b) pure @trusted
    {
        long2 result = cast(long2)a;
        result.ptr[0] = compareFloat!double(comparison, a.array[0], b.array[0]) ? -1 : 0;
        return cast(double2)result;
    }

    /// Compares lane 0 of `a` and `b`; returns 1 when the predicate holds, else 0.
    // Note: ucomss and ucomsd are left unimplemented
    package int comss(FPComparison comparison)(float4 a, float4 b) pure @safe
    {
        return compareFloat!float(comparison, a.array[0], b.array[0]) ? 1 : 0;
    }

    /// Compares lane 0 of `a` and `b`; returns 1 when the predicate holds, else 0.
    // Note: ucomss and ucomsd are left unimplemented
    package int comsd(FPComparison comparison)(double2 a, double2 b) pure @safe
    {
        return compareFloat!double(comparison, a.array[0], b.array[0]) ? 1 : 0;
    }
}
unittest // cmpps
{
    // Check that every comparison type works (the last lane holds a NaN on the left side)
    float4 A = [1, 3, 5, float.nan];
    float4 B = [2, 3, 4, 5];

    int4 result_oeq = cmpps!(FPComparison.oeq)(A, B);
    int4 result_ogt = cmpps!(FPComparison.ogt)(A, B);
    int4 result_oge = cmpps!(FPComparison.oge)(A, B);
    int4 result_olt = cmpps!(FPComparison.olt)(A, B);
    int4 result_ole = cmpps!(FPComparison.ole)(A, B);
    int4 result_one = cmpps!(FPComparison.one)(A, B);
    int4 result_ord = cmpps!(FPComparison.ord)(A, B);
    int4 result_ueq = cmpps!(FPComparison.ueq)(A, B);
    int4 result_ugt = cmpps!(FPComparison.ugt)(A, B);
    int4 result_uge = cmpps!(FPComparison.uge)(A, B);
    int4 result_ult = cmpps!(FPComparison.ult)(A, B);
    int4 result_ule = cmpps!(FPComparison.ule)(A, B);
    int4 result_une = cmpps!(FPComparison.une)(A, B);
    int4 result_uno = cmpps!(FPComparison.uno)(A, B);

    static immutable int[4] correct_oeq    = [ 0,-1, 0, 0];
    static immutable int[4] correct_ogt    = [ 0, 0,-1, 0];
    static immutable int[4] correct_oge    = [ 0,-1,-1, 0];
    static immutable int[4] correct_olt    = [-1, 0, 0, 0];
    static immutable int[4] correct_ole    = [-1,-1, 0, 0];
    static immutable int[4] correct_one    = [-1, 0,-1, 0];
    static immutable int[4] correct_ord    = [-1,-1,-1, 0];
    static immutable int[4] correct_ueq    = [ 0,-1, 0,-1];
    static immutable int[4] correct_ugt    = [ 0, 0,-1,-1];
    static immutable int[4] correct_uge    = [ 0,-1,-1,-1];
    static immutable int[4] correct_ult    = [-1, 0, 0,-1];
    static immutable int[4] correct_ule    = [-1,-1, 0,-1];
    static immutable int[4] correct_une    = [-1, 0,-1,-1];
    static immutable int[4] correct_uno    = [ 0, 0, 0,-1];

    assert(result_oeq.array == correct_oeq);
    assert(result_ogt.array == correct_ogt);
    assert(result_oge.array == correct_oge);
    assert(result_olt.array == correct_olt);
    assert(result_ole.array == correct_ole);
    assert(result_one.array == correct_one);
    assert(result_ord.array == correct_ord);
    assert(result_ueq.array == correct_ueq);
    assert(result_ugt.array == correct_ugt);
    assert(result_uge.array == correct_uge);
    assert(result_ult.array == correct_ult);
    assert(result_ule.array == correct_ule);
    assert(result_une.array == correct_une);
    assert(result_uno.array == correct_uno);
}
unittest // cmppd
{
    double2 a = [1, 3];
    double2 b = [2, 3];
    long2 c = cmppd!(FPComparison.ult)(a, b);
    static immutable long[2] correct = [cast(long)(-1), 0];
    assert(c.array == correct);
}
unittest // cmpss and comss
{
    // Checks lane 0 against compareFloat, that the other lanes come from A,
    // and that comss agrees on truthiness.
    void testComparison(FPComparison comparison)(float4 A, float4 B)
    {
        float4 result = cmpss!comparison(A, B);
        int4 iresult = cast(int4)result;
        int expected = compareFloat!float(comparison, A.array[0], B.array[0]) ? -1 : 0;
        assert(iresult.array[0] == expected);
        assert(result.array[1] == A.array[1]);
        assert(result.array[2] == A.array[2]);
        assert(result.array[3] == A.array[3]);

        // check comss
        int comResult = comss!comparison(A, B);
        assert( (expected != 0) == (comResult != 0) );
    }

    // Check that every comparison type works, with and without a NaN operand
    float4 A = [1, 3, 5, 6];
    float4 B = [2, 3, 4, 5];
    float4 C = [float.nan, 3, 4, 5];

    testComparison!(FPComparison.oeq)(A, B);
    testComparison!(FPComparison.oeq)(A, C);
    testComparison!(FPComparison.ogt)(A, B);
    testComparison!(FPComparison.ogt)(A, C);
    testComparison!(FPComparison.oge)(A, B);
    testComparison!(FPComparison.oge)(A, C);
    testComparison!(FPComparison.olt)(A, B);
    testComparison!(FPComparison.olt)(A, C);
    testComparison!(FPComparison.ole)(A, B);
    testComparison!(FPComparison.ole)(A, C);
    testComparison!(FPComparison.one)(A, B);
    testComparison!(FPComparison.one)(A, C);
    testComparison!(FPComparison.ord)(A, B);
    testComparison!(FPComparison.ord)(A, C);
    testComparison!(FPComparison.ueq)(A, B);
    testComparison!(FPComparison.ueq)(A, C);
    testComparison!(FPComparison.ugt)(A, B);
    testComparison!(FPComparison.ugt)(A, C);
    testComparison!(FPComparison.uge)(A, B);
    testComparison!(FPComparison.uge)(A, C);
    testComparison!(FPComparison.ult)(A, B);
    testComparison!(FPComparison.ult)(A, C);
    testComparison!(FPComparison.ule)(A, B);
    testComparison!(FPComparison.ule)(A, C);
    testComparison!(FPComparison.une)(A, B);
    testComparison!(FPComparison.une)(A, C);
    testComparison!(FPComparison.uno)(A, B);
    testComparison!(FPComparison.uno)(A, C);
}
unittest // cmpsd and comsd
{
    // Checks lane 0 against compareFloat, that lane 1 comes from A,
    // and that comsd agrees on truthiness.
    void testComparison(FPComparison comparison)(double2 A, double2 B)
    {
        double2 result = cmpsd!comparison(A, B);
        long2 iresult = cast(long2)result;
        long expected = compareFloat!double(comparison, A.array[0], B.array[0]) ? -1 : 0;
        assert(iresult.array[0] == expected);
        assert(result.array[1] == A.array[1]);

        // check comsd
        int comResult = comsd!comparison(A, B);
        assert( (expected != 0) == (comResult != 0) );
    }

    // Check that every comparison type works, with and without a NaN operand
    double2 A = [1, 3];
    double2 B = [2, 4];
    double2 C = [double.nan, 5];

    testComparison!(FPComparison.oeq)(A, B);
    testComparison!(FPComparison.oeq)(A, C);
    testComparison!(FPComparison.ogt)(A, B);
    testComparison!(FPComparison.ogt)(A, C);
    testComparison!(FPComparison.oge)(A, B);
    testComparison!(FPComparison.oge)(A, C);
    testComparison!(FPComparison.olt)(A, B);
    testComparison!(FPComparison.olt)(A, C);
    testComparison!(FPComparison.ole)(A, B);
    testComparison!(FPComparison.ole)(A, C);
    testComparison!(FPComparison.one)(A, B);
    testComparison!(FPComparison.one)(A, C);
    testComparison!(FPComparison.ord)(A, B);
    testComparison!(FPComparison.ord)(A, C);
    testComparison!(FPComparison.ueq)(A, B);
    testComparison!(FPComparison.ueq)(A, C);
    testComparison!(FPComparison.ugt)(A, B);
    testComparison!(FPComparison.ugt)(A, C);
    testComparison!(FPComparison.uge)(A, B);
    testComparison!(FPComparison.uge)(A, C);
    testComparison!(FPComparison.ult)(A, B);
    testComparison!(FPComparison.ult)(A, C);
    testComparison!(FPComparison.ule)(A, B);
    testComparison!(FPComparison.ule)(A, C);
    testComparison!(FPComparison.une)(A, B);
    testComparison!(FPComparison.une)(A, C);
    testComparison!(FPComparison.uno)(A, B);
    testComparison!(FPComparison.uno)(A, C);
}

//
//  </FLOATING-POINT COMPARISONS>
//


/// Returns the low 64-bit lane of `a` as a `__m64`.
__m64 to_m64(__m128i a) pure @trusted
{
    long2 la = cast(long2)a;
    long1 r;
    r.ptr[0] = la.array[0];
    return r;
}

/// Widens `a` to a `__m128i`: the low 64-bit lane holds `a`, the high lane is zero.
__m128i to_m128i(__m64 a) pure @trusted
{
    long2 r = [0, 0];
    r.ptr[0] = a.array[0];
    return cast(__m128i)r;
}