1 /**
2 * Copyright: Copyright Auburn Sounds 2016-2018, Stefanos Baziotis 2019.
3 * License:   $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
4 * Authors:   Guillaume Piolat
5 */
6 module inteli.internals;
7 
8 import inteli.types;
9 
10 // The only math functions needed for intel-intrinsics
11 public import core.math: sqrt; // since it's an intrinsics
12 public import std.math: abs; // `fabs` is broken with GCC 4.9.2 on Linux 64-bit
13 
14 
// Per-compiler setup.
//
// Every supported compiler branch below defines the same five compile-time
// flags (GDC_with_x86, GDC_with_MMX, GDC_with_SSE, GDC_with_SSE2,
// GDC_with_SSE3) so the rest of the library can test them with `static if`
// regardless of which compiler is in use. They are only ever `true` under GDC.
// An unrecognised compiler is rejected at compile time.
version(GNU)
{
    version (X86)
    {
        // For 32-bit x86, disable vector extensions with GDC.
        // It just doesn't work well.
        enum GDC_with_x86 = true;
        enum GDC_with_MMX = false;
        enum GDC_with_SSE = false;
        enum GDC_with_SSE2 = false;
        enum GDC_with_SSE3 = false;
    }
    else version (X86_64)
    {
        // GDC support uses extended inline assembly:
        //   https://gcc.gnu.org/onlinedocs/gcc/Extended-Asm.html        (general information and hints)
        //   https://gcc.gnu.org/onlinedocs/gcc/Simple-Constraints.html  (binding variables to registers)
        //   https://gcc.gnu.org/onlinedocs/gcc/Machine-Constraints.html (x86 specific register short names)

        public import core.simd;

        // NOTE: These intrinsics are not available in every i386 and x86_64 CPU.
        // For more info: https://gcc.gnu.org/onlinedocs/gcc-4.9.2/gcc/X86-Built-in-Functions.html
        public import gcc.builtins;

        enum GDC_with_x86 = true;
        enum GDC_with_MMX = true; // We don't have a way to detect that at CT, but we assume it's there
        enum GDC_with_SSE = true; // We don't have a way to detect that at CT, but we assume it's there
        enum GDC_with_SSE2 = true; // We don't have a way to detect that at CT, but we assume it's there
        enum GDC_with_SSE3 = false; // TODO: we don't have a way to detect that at CT
    }
    else
    {
        // GDC targeting something other than x86: no x86 feature is available.
        enum GDC_with_x86 = false;
        enum GDC_with_MMX = false;
        enum GDC_with_SSE = false;
        enum GDC_with_SSE2 = false;
        enum GDC_with_SSE3 = false;
    }
}
else version(LDC)
{
    public import core.simd;
    public import ldc.simd;
    public import ldc.gccbuiltins_x86;
    public import ldc.intrinsics;
    public import ldc.llvmasm: __asm;

    // Since LDC 1.13, using the new ldc.llvmasm.__ir variants instead of inlineIR
    static if (__VERSION__ >= 2083)
    {
         import ldc.llvmasm;
         alias LDCInlineIR = __ir_pure;

         // A version of inline IR with prefix/suffix didn't exist before LDC 1.13
         // (so LDCInlineIREx is only declared in this branch).
         alias LDCInlineIREx = __irEx_pure;
    }
    else
    {
        alias LDCInlineIR = inlineIR;
    }

    // Not GDC: all GDC_with_* flags are off.
    package(inteli)
    {
        enum GDC_with_x86 = false;
        enum GDC_with_MMX = false;
        enum GDC_with_SSE = false;
        enum GDC_with_SSE2 = false;
        enum GDC_with_SSE3 = false;
    }
}
else version(DigitalMars)
{
    // Not GDC: all GDC_with_* flags are off.
    package(inteli)
    {
        enum GDC_with_x86 = false;
        enum GDC_with_MMX = false;
        enum GDC_with_SSE = false;
        enum GDC_with_SSE2 = false;
        enum GDC_with_SSE3 = false;
    }
}
else
{
    static assert(false, "Unknown compiler");
}
101 
// Compile-time flags describing DMD's inline assembler support.
//   DMD_with_asm       : true under DMD when either the 32-bit or the 64-bit
//                        D inline assembler is available.
//   DMD_with_32bit_asm : true under DMD only when the 32-bit inline assembler
//                        is the one available.
// Both are always false for every other compiler.
version(DigitalMars)
{
    version(D_InlineAsm_X86)
        enum DMD_with_asm = true;
    else version(D_InlineAsm_X86_64)
        enum DMD_with_asm = true;
    else
        enum DMD_with_asm = false;

    version(D_InlineAsm_X86)
        enum DMD_with_32bit_asm = DMD_with_asm; // sometimes you want a 32-bit DMD only solution
    else
        enum DMD_with_32bit_asm = false;
}
else
{
    enum DMD_with_asm = false;
    enum DMD_with_32bit_asm = false;
}
121 
122 
123 
124 
125 package:
126 nothrow @nogc:
127 
128 
129 //
130 //  <ROUNDING>
131 //
//  Why is that there? For DMD, we cannot use rint because _MM_SET_ROUNDING_MODE
//  doesn't change the FPU rounding mode, and isn't expected to do so.
//  So we devised these rounding functions to help having consistent rounding between
//  LDC and DMD. It's important that DMD uses what is in MXCSR to round.
136 //
137 
138 
/// Converts a `float` to a 32-bit integer with the `cvtss2si` instruction, so
/// that the rounding applied is the one currently configured in MXCSR (see the
/// <ROUNDING> section comment above for why this matters).
/// Params:
///     value = the single-precision value to convert
/// Returns: the converted 32-bit integer.
int convertFloatToInt32UsingMXCSR(float value) pure @safe
{
    int result;
    version(GNU)
    {
        // GCC extended asm: %0 is `result` in a general-purpose register ("=r"),
        // %1 is `value` in an SSE register ("x").
        asm pure nothrow @nogc @trusted
        {
            "cvtss2si %1, %0\n": "=r"(result) : "x" (value);
        }
    }
    else
    {
        // D inline assembler: convert into EAX, then store to the local.
        asm pure nothrow @nogc @trusted
        {
            cvtss2si EAX, value;
            mov result, EAX;
        }
    }
    return result;
}
159 
/// Converts a `double` to a 32-bit integer with the `cvtsd2si` instruction, so
/// that the rounding applied is the one currently configured in MXCSR (see the
/// <ROUNDING> section comment above for why this matters).
/// Params:
///     value = the double-precision value to convert
/// Returns: the converted 32-bit integer.
int convertDoubleToInt32UsingMXCSR(double value) pure @safe
{
    int result;
    version(GNU)
    {
        // GCC extended asm: %0 is `result` in a general-purpose register ("=r"),
        // %1 is `value` in an SSE register ("x").
        asm pure nothrow @nogc @trusted
        {
            "cvtsd2si %1, %0\n": "=r"(result) : "x" (value);
        }
    }
    else
    {
        // D inline assembler: convert into EAX, then store to the local.
        asm pure nothrow @nogc @trusted
        {
            cvtsd2si EAX, value;
            mov result, EAX;
        }
    }
    return result;
}
180 
/// Converts a `float` to a 64-bit integer, rounding according to the rounding
/// mode currently set in MXCSR.
///
/// Four code paths exist, selected at compile time:
///  - D inline asm, 64-bit: uses the 64-bit form of `cvtss2si` directly.
///  - D inline asm, 32-bit: goes through the x87 FPU after temporarily copying
///    the MXCSR rounding bits into the FPU control word.
///  - GDC, 64-bit: uses the `__builtin_ia32_cvtss2si64` builtin.
///  - GDC, 32-bit: same x87 sequence as above in extended-asm form (marked
///    untested by the original author).
/// Any other configuration is rejected with `static assert(false)`.
///
/// Params:
///     value = the single-precision value to convert
/// Returns: the converted 64-bit integer.
long convertFloatToInt64UsingMXCSR(float value) pure @safe
{
    // 64-bit can use an SSE instruction
    version(D_InlineAsm_X86_64)
    {
        long result;
        version(LDC) // work-around for " Data definition directives inside inline asm are not supported yet."
        {
            asm pure nothrow @nogc @trusted
            {
                movss XMM0, value;
                cvtss2si RAX, XMM0;
                mov result, RAX;
            }
        }
        else
        {
            asm pure nothrow @nogc @trusted
            {
                movss XMM0, value;
                // Hand-encoded instruction bytes, see trailing comment.
                db 0xf3; db 0x48; db 0x0f; db 0x2d; db 0xc0; // cvtss2si RAX, XMM0 (DMD refuses to emit)
                mov result, RAX;
            }
        }
        return result;
    }
    else version(D_InlineAsm_X86)
    {
        // In the case of 32-bit x86 there is no SSE2 way to convert FP to 64-bit int
        // This leads to an unfortunate FPU sequence in every C++ compiler.
        // See: https://godbolt.org/z/vZym77

        // Get current MXCSR rounding
        uint sseRounding;
        ushort savedFPUCW;
        ushort newFPUCW;
        long result;
        asm pure nothrow @nogc @trusted
        {
            stmxcsr sseRounding;     // read MXCSR
            fld value;               // push the value on the FPU stack
            fnstcw savedFPUCW;       // remember the FPU control word so it can be restored
            mov AX, savedFPUCW;
            and AX, 0xf3ff;          // clear FPU rounding bits
            movzx ECX, word ptr sseRounding;
            and ECX, 0x6000;         // only keep SSE rounding bits
            shr ECX, 3;              // 0x6000 >> 3 == 0x0c00, the bits cleared by the 0xf3ff mask above
            or AX, CX;               // make a new control word for FPU with SSE bits
            mov newFPUCW, AX;
            fldcw newFPUCW;
            fistp qword ptr result;            // convert, respecting MXCSR (but not other control word things)
            fldcw savedFPUCW;        // restore the original FPU control word
        }
        return result;
    }
    else static if (GDC_with_x86)
    {
        version(X86_64) // 64-bit can just use the right instruction
        {
            static assert(GDC_with_SSE);
            __m128 A;
            A.ptr[0] = value; // only lane 0 is set; the other lanes are left default-initialised
            return __builtin_ia32_cvtss2si64 (A);
        }
        else version(X86) // 32-bit
        {
            // This is untested!
            // Same sequence as the D inline asm 32-bit path, in GCC extended asm syntax.
            // NOTE(review): sseRounding, savedFPUCW and newFPUCW are written by this asm
            // (stmxcsr / fnstcw / movw) yet are listed as inputs — confirm the constraints.
            // NOTE(review): `value` is bound with "f" here while
            // convertDoubleToInt64UsingMXCSR uses "t" — confirm which one is intended.
            uint sseRounding;
            ushort savedFPUCW;
            ushort newFPUCW;
            long result;
            asm pure nothrow @nogc @trusted
            {
                "stmxcsr %1;\n" ~
                "fld %2;\n" ~
                "fnstcw %3;\n" ~
                "movw %3, %%ax;\n" ~
                "andw $0xf3ff, %%ax;\n" ~
                "movzwl %1, %%ecx;\n" ~
                "andl $0x6000, %%ecx;\n" ~
                "shrl $3, %%ecx;\n" ~
                "orw %%cx, %%ax\n" ~
                "movw %%ax, %4;\n" ~
                "fldcw %4;\n" ~
                "fistpll %0;\n" ~
                "fldcw %3;\n"
                  : "=m"(result)    // %0
                  : "m" (sseRounding),  // %1
                    "f" (value),        // %2
                    "m" (savedFPUCW),   // %3
                    "m" (newFPUCW)      // %4
                  : "eax", "ecx", "st";
            }
            return result;
        }
        else
            static assert(false);
    }
    else
        static assert(false);
}
282 
/// Converts a `double` to a 64-bit integer, rounding according to the rounding
/// mode currently set in MXCSR.
///
/// Four code paths exist, selected at compile time:
///  - D inline asm, 64-bit: uses the 64-bit form of `cvtsd2si` directly.
///  - D inline asm, 32-bit: goes through the x87 FPU after temporarily copying
///    the MXCSR rounding bits into the FPU control word.
///  - GDC, 64-bit: uses the `__builtin_ia32_cvtsd2si64` builtin.
///  - GDC, any other x86 target: same x87 sequence in extended-asm form (marked
///    untested by the original author).
/// Any other configuration is rejected with `static assert(false)`.
///
/// Params:
///     value = the double-precision value to convert
/// Returns: the converted 64-bit integer.
long convertDoubleToInt64UsingMXCSR(double value) pure @safe
{
    // 64-bit can use an SSE instruction
    version(D_InlineAsm_X86_64)
    {
        long result;
        version(LDC) // work-around for "Data definition directives inside inline asm are not supported yet."
        {
            asm pure nothrow @nogc @trusted
            {
                movsd XMM0, value;
                cvtsd2si RAX, XMM0;
                mov result, RAX;
            }
        }
        else
        {
            asm pure nothrow @nogc @trusted
            {
                movsd XMM0, value;
                // Hand-encoded instruction bytes, see trailing comment.
                db 0xf2; db 0x48; db 0x0f; db 0x2d; db 0xc0; // cvtsd2si RAX, XMM0 (DMD refuses to emit)
                mov result, RAX;
            }
        }
        return result;
    }
    else version(D_InlineAsm_X86)
    {
        // In the case of 32-bit x86 there is no SSE2 way to convert FP to 64-bit int
        // This leads to an unfortunate FPU sequence in every C++ compiler.
        // See: https://godbolt.org/z/vZym77

        // Get current MXCSR rounding
        uint sseRounding;
        ushort savedFPUCW;
        ushort newFPUCW;
        long result;
        asm pure nothrow @nogc @trusted
        {
            stmxcsr sseRounding;     // read MXCSR
            fld value;               // push the value on the FPU stack
            fnstcw savedFPUCW;       // remember the FPU control word so it can be restored
            mov AX, savedFPUCW;
            and AX, 0xf3ff;          // clear FPU rounding bits
            movzx ECX, word ptr sseRounding;
            and ECX, 0x6000;         // only keep SSE rounding bits
            shr ECX, 3;              // 0x6000 >> 3 == 0x0c00, the bits cleared by the 0xf3ff mask above
            or AX, CX;               // make a new control word for FPU with SSE bits
            mov newFPUCW, AX;
            fldcw newFPUCW;
            // NOTE(review): convertFloatToInt64UsingMXCSR writes `fistp qword ptr result`;
            // here the operand size is left to be deduced from `result` — confirm equivalent.
            fistp result;
            fldcw savedFPUCW;        // restore the original FPU control word
        }
        return result;
    }
    else static if (GDC_with_x86)
    {
        version(X86_64)
        {
            static assert(GDC_with_SSE2);
            __m128d A;
            A.ptr[0] = value; // only lane 0 is set; the other lane is left default-initialised
            return __builtin_ia32_cvtsd2si64 (A);
        }
        else
        {
            // This is untested!
            // Same sequence as the D inline asm 32-bit path, in GCC extended asm syntax.
            // NOTE(review): sseRounding, savedFPUCW and newFPUCW are written by this asm
            // (stmxcsr / fnstcw / movw) yet are listed as inputs — confirm the constraints.
            // NOTE(review): `value` is bound with "t" here while
            // convertFloatToInt64UsingMXCSR uses "f" — confirm which one is intended.
            uint sseRounding;
            ushort savedFPUCW;
            ushort newFPUCW;
            long result;
            asm pure nothrow @nogc @trusted
            {
                "stmxcsr %1;\n" ~
                "fld %2;\n" ~
                "fnstcw %3;\n" ~
                "movw %3, %%ax;\n" ~
                "andw $0xf3ff, %%ax;\n" ~
                "movzwl %1, %%ecx;\n" ~
                "andl $0x6000, %%ecx;\n" ~
                "shrl $3, %%ecx;\n" ~
                "orw %%cx, %%ax\n" ~
                "movw %%ax, %4;\n" ~
                "fldcw %4;\n" ~
                "fistpll %0;\n" ~
                "fldcw %3;\n"
                  : "=m"(result)    // %0
                  : "m" (sseRounding),  // %1
                    "t" (value),        // %2
                    "m" (savedFPUCW),   // %3
                    "m" (newFPUCW)      // %4
                  : "eax", "ecx", "st";
            }
            return result;
        }
    }
    else
        static assert(false);
}
383 
384 
385 //
386 //  </ROUNDING>
387 //
388 
389 
390 // using the Intel terminology here
391 
/// Clamps a signed 16-bit value into the signed 8-bit range [-128, 127].
/// Params:
///     value = signed word to narrow
/// Returns: `value` if it already fits in a `byte`, otherwise the nearest bound.
byte saturateSignedWordToSignedByte(short value) pure @safe
{
    if (value >= byte.max)
        return byte.max;
    if (value <= byte.min)
        return byte.min;
    return cast(byte) value;
}
398 
/// Clamps a signed 16-bit value into the unsigned 8-bit range [0, 255].
/// Params:
///     value = signed word to narrow
/// Returns: `value` if it already fits in a `ubyte`, otherwise the nearest bound.
ubyte saturateSignedWordToUnsignedByte(short value) pure @safe
{
    if (value <= ubyte.min)
        return ubyte.min;
    if (value >= ubyte.max)
        return ubyte.max;
    return cast(ubyte) value;
}
405 
/// Clamps a signed 32-bit value into the signed 16-bit range [-32768, 32767].
/// Params:
///     value = signed integer to narrow
/// Returns: `value` if it already fits in a `short`, otherwise the nearest bound.
short saturateSignedIntToSignedShort(int value) pure @safe
{
    const int upper = short.max;
    const int lower = short.min;
    const int clamped = value > upper ? upper
                      : value < lower ? lower
                      : value;
    return cast(short) clamped;
}
412 
/// Clamps a signed 32-bit value into the unsigned 16-bit range [0, 65535].
/// Params:
///     value = signed integer to narrow
/// Returns: `value` if it already fits in a `ushort`, otherwise the nearest bound.
ushort saturateSignedIntToUnsignedShort(int value) pure @safe
{
    const int upper = ushort.max;
    const int lower = ushort.min;
    const int clamped = value > upper ? upper
                      : value < lower ? lower
                      : value;
    return cast(ushort) clamped;
}
419 
unittest // test saturate operations
{
    // signed word -> signed byte: overflow on both sides
    assert( saturateSignedWordToSignedByte(32000) == 127);
    assert( saturateSignedWordToSignedByte(-4000) == -128);

    // signed word -> unsigned byte: overflow on both sides
    assert( saturateSignedWordToUnsignedByte(32000) == 255);
    assert( saturateSignedWordToUnsignedByte(-4000) == 0);

    // signed int -> signed short: one past each bound
    assert( saturateSignedIntToSignedShort(32768) == 32767);
    assert( saturateSignedIntToSignedShort(-32769) == -32768);

    // signed int -> unsigned short: 32768 fits, negatives clamp to zero
    assert( saturateSignedIntToUnsignedShort(32768) == 32768);
    assert( saturateSignedIntToUnsignedShort(-32769) == 0);
}
431 
version(unittest)
{
    // Debugging aids for tests: each helper writes the lanes of one vector
    // to stdout on a single line, lane 0 first.
    import core.stdc.stdio: printf;

    // Note: you can override `pure` within a `debug` clause

    /// Prints the single 64-bit lane of `v` as a signed integer.
    void _mm_print_pi64(__m64 v) @trusted
    {
        long1 lanes = cast(long1)v;
        printf("%lld\n", lanes.array[0]);
    }

    /// Prints `v` as two signed 32-bit lanes.
    void _mm_print_pi32(__m64 v) @trusted
    {
        int[2] lane = (cast(int2)v).array;
        printf("%d %d\n", lane[0], lane[1]);
    }

    /// Prints `v` as four signed 16-bit lanes.
    void _mm_print_pi16(__m64 v) @trusted
    {
        short[4] lane = (cast(short4)v).array;
        printf("%d %d %d %d\n", lane[0], lane[1], lane[2], lane[3]);
    }

    /// Prints `v` as eight signed 8-bit lanes.
    void _mm_print_pi8(__m64 v) @trusted
    {
        byte[8] lane = (cast(byte8)v).array;
        printf("%d %d %d %d %d %d %d %d\n",
               lane[0], lane[1], lane[2], lane[3],
               lane[4], lane[5], lane[6], lane[7]);
    }

    /// Prints `v` as two signed 64-bit lanes.
    void _mm_print_epi64(__m128i v) @trusted
    {
        long2 lanes = cast(long2)v;
        printf("%lld %lld\n", lanes.array[0], lanes.array[1]);
    }

    /// Prints `v` as four signed 32-bit lanes.
    void _mm_print_epi32(__m128i v) @trusted
    {
        int[4] lane = v.array;
        printf("%d %d %d %d\n", lane[0], lane[1], lane[2], lane[3]);
    }

    /// Prints `v` as eight signed 16-bit lanes.
    void _mm_print_epi16(__m128i v) @trusted
    {
        short[8] lane = (cast(short8)v).array;
        printf("%d %d %d %d %d %d %d %d\n",
               lane[0], lane[1], lane[2], lane[3],
               lane[4], lane[5], lane[6], lane[7]);
    }

    /// Prints `v` as sixteen signed 8-bit lanes.
    void _mm_print_epi8(__m128i v) @trusted
    {
        byte[16] lane = (cast(byte16)v).array;
        printf("%d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d\n",
               lane[0], lane[1], lane[2],  lane[3],  lane[4],  lane[5],  lane[6],  lane[7],
               lane[8], lane[9], lane[10], lane[11], lane[12], lane[13], lane[14], lane[15]);
    }

    /// Prints `v` as four single-precision lanes.
    void _mm_print_ps(__m128 v) @trusted
    {
        float[4] lane = (cast(float4)v).array;
        printf("%f %f %f %f\n", lane[0], lane[1], lane[2], lane[3]);
    }

    /// Prints `v` as two double-precision lanes.
    void _mm_print_pd(__m128d v) @trusted
    {
        double[2] lane = (cast(double2)v).array;
        printf("%f %f\n", lane[0], lane[1]);
    }
}
503 
504 
505 //
506 //  <FLOATING-POINT COMPARISONS>
507 //
508 // Note: `ldc.simd` cannot express all nuances of FP comparisons, so we
509 //       need different IR generation.
510 
/// Floating-point comparison predicates used by the cmpps/cmppd/cmpss/cmpsd/
/// comss/comsd helpers in this module.
///
/// "Ordered" predicates (o*) are false when either operand is NaN;
/// "unordered" predicates (u*) are true when either operand is NaN.
/// The member order matters: `FPComparisonToString` is indexed by these values.
enum FPComparison
{
    oeq,   // ordered and equal
    ogt,   // ordered and greater than
    oge,   // ordered and greater than or equal
    olt,   // ordered and less than
    ole,   // ordered and less than or equal
    one,   // ordered and not equal
    ord,   // ordered (no nans)
    ueq,   // unordered or equal
    ugt,   // unordered or greater than ("nle")
    uge,   // unordered or greater than or equal ("nlt")
    ult,   // unordered or less than ("nge")
    ule,   // unordered or less than or equal ("ngt")
    une,   // unordered or not equal ("neq")
    uno,   // unordered (either nans)
}
528 
// Textual name of each FPComparison member, indexed by the enum value.
// The LDC code paths in this file splice these names into `fcmp` inline-IR
// instructions, so entries must stay in exactly the same order as the enum.
private static immutable string[FPComparison.max+1] FPComparisonToString =
[
    "oeq",
    "ogt",
    "oge",
    "olt",
    "ole",
    "one",
    "ord",
    "ueq",
    "ugt",
    "uge",
    "ult",
    "ule",
    "une",
    "uno",
];
546 
// Scalar reference implementation of a single floating-point comparison.
// Returns `true` when `comparison` holds for (a, b) and `false` otherwise;
// callers map that boolean to the integer mask they need.
// Useful for DMD and testing
private bool compareFloat(T)(FPComparison comparison, T a, T b) pure @safe
{
    import std.math : isNaN;

    // A pair is "ordered" when neither operand is NaN.
    immutable bool ordered = !isNaN(a) && !isNaN(b);

    final switch (comparison)
    {
        // Ordered predicates: the built-in relational operators are already
        // false whenever a NaN is involved...
        case FPComparison.oeq: return a == b;
        case FPComparison.ogt: return a > b;
        case FPComparison.oge: return a >= b;
        case FPComparison.olt: return a < b;
        case FPComparison.ole: return a <= b;
        // ...except `!=`, which yields true with NaN and must be masked.
        case FPComparison.one: return ordered && (a != b);
        case FPComparison.ord: return ordered;

        // Unordered predicates: true as soon as a NaN is involved.
        case FPComparison.ueq: return !ordered || (a == b);
        case FPComparison.ugt: return !ordered || (a > b);
        case FPComparison.uge: return !ordered || (a >= b);
        case FPComparison.ult: return !ordered || (a < b);
        case FPComparison.ule: return !ordered || (a <= b);
        case FPComparison.une: return a != b; // NaN with != always yields true
        case FPComparison.uno: return !ordered;
    }
}
571 
572 version(LDC)
573 {
    /// Provides packed float comparisons.
    /// Emits an LLVM `fcmp` with the predicate named by `comparison` on the two
    /// 4 x float operands, then sign-extends each 1-bit result to 32 bits, so a
    /// lane is -1 (all bits set) where the predicate holds and 0 elsewhere.
    package int4 cmpps(FPComparison comparison)(float4 a, float4 b) pure @safe
    {
        // The predicate name is spliced into the IR text at compile time.
        enum ir = `
            %cmp = fcmp `~ FPComparisonToString[comparison] ~` <4 x float> %0, %1
            %r = sext <4 x i1> %cmp to <4 x i32>
            ret <4 x i32> %r`;

        return LDCInlineIR!(ir, int4, float4, float4)(a, b);
    }
584 
    /// Provides packed double comparisons.
    /// Emits an LLVM `fcmp` with the predicate named by `comparison` on the two
    /// 2 x double operands, then sign-extends each 1-bit result to 64 bits, so a
    /// lane is -1 (all bits set) where the predicate holds and 0 elsewhere.
    package long2 cmppd(FPComparison comparison)(double2 a, double2 b) pure @safe
    {
        // The predicate name is spliced into the IR text at compile time.
        enum ir = `
            %cmp = fcmp `~ FPComparisonToString[comparison] ~` <2 x double> %0, %1
            %r = sext <2 x i1> %cmp to <2 x i64>
            ret <2 x i64> %r`;

        return LDCInlineIR!(ir, long2, double2, double2)(a, b);
    }
595 
    /// CMPSS-style comparisons
    /// clang implement it through x86 intrinsics, it is possible with IR alone
    /// but leads to less optimal code.
    /// PERF: try to implement it with __builtin_ia32_cmpss and immediate 0 to 7.
    /// Not that simple.
    ///
    /// Compares only lane 0 of `a` and `b`. Lane 0 of the result holds the
    /// comparison mask (all bits set when the predicate holds, zero otherwise)
    /// reinterpreted as a float; lanes 1..3 are copied from `a`.
    package float4 cmpss(FPComparison comparison)(float4 a, float4 b) pure @safe
    {
        // Disabled sketch of the builtin-based implementation mentioned in the PERF note.
        // NOTE(review): it calls __builtin_ia32_cmpsd (the double variant) and relies on
        // a FPComparisonToX86Predicate table that is not visible here — presumably it
        // would need __builtin_ia32_cmpss before being enabled; confirm.
        /*
        enum ubyte predicateNumber = FPComparisonToX86Predicate[comparison];
        enum bool invertOp = (predicateNumber & 0x80) != 0;
        static if(invertOp)
            return __builtin_ia32_cmpsd(b, a, predicateNumber & 0x7f);
        else
            return __builtin_ia32_cmpsd(a, b, predicateNumber & 0x7f);
        */
        // Scalar fcmp, sign-extended to 32 bits and bit-cast back to float.
        enum ir = `
            %cmp = fcmp `~ FPComparisonToString[comparison] ~` float %0, %1
            %r = sext i1 %cmp to i32
            %r2 = bitcast i32 %r to float
            ret float %r2`;

        float4 r = a;
        r[0] = LDCInlineIR!(ir, float, float, float)(a[0], b[0]);
        return r;
    }
621 
    /// CMPSD-style comparisons
    /// clang implement it through x86 intrinsics, it is possible with IR alone
    /// but leads to less optimal code.
    /// PERF: try to implement it with __builtin_ia32_cmpsd and immediate 0 to 7.
    /// Not that simple.
    ///
    /// Compares only lane 0 of `a` and `b`. Lane 0 of the result holds the
    /// comparison mask (all bits set when the predicate holds, zero otherwise)
    /// reinterpreted as a double; lane 1 is copied from `a`.
    package double2 cmpsd(FPComparison comparison)(double2 a, double2 b) pure @safe
    {
        // Scalar fcmp, sign-extended to 64 bits and bit-cast back to double.
        enum ir = `
            %cmp = fcmp `~ FPComparisonToString[comparison] ~` double %0, %1
            %r = sext i1 %cmp to i64
            %r2 = bitcast i64 %r to double
            ret double %r2`;

        double2 r = a;
        r[0] = LDCInlineIR!(ir, double, double, double)(a[0], b[0]);
        return r;
    }
639 
    // Note: ucomss and ucomsd are left unimplemented
    /// Compares lane 0 of `a` and `b` with the given predicate.
    /// Returns: 1 when the predicate holds, 0 otherwise (the 1-bit `fcmp`
    /// result is zero-extended, not sign-extended).
    package int comss(FPComparison comparison)(float4 a, float4 b) pure @safe
    {
        enum ir = `
            %cmp = fcmp `~ FPComparisonToString[comparison] ~` float %0, %1
            %r = zext i1 %cmp to i32
            ret i32 %r`;

        return LDCInlineIR!(ir, int, float, float)(a[0], b[0]);
    }
650 
    // Note: ucomss and ucomsd are left unimplemented
    /// Compares lane 0 of `a` and `b` with the given predicate.
    /// Returns: 1 when the predicate holds, 0 otherwise (the 1-bit `fcmp`
    /// result is zero-extended, not sign-extended).
    package int comsd(FPComparison comparison)(double2 a, double2 b) pure @safe
    {
        enum ir = `
            %cmp = fcmp `~ FPComparisonToString[comparison] ~` double %0, %1
            %r = zext i1 %cmp to i32
            ret i32 %r`;

        return LDCInlineIR!(ir, int, double, double)(a[0], b[0]);
    }
661 }
662 else
663 {
664     /// Provides packed float comparisons
665     package int4 cmpps(FPComparison comparison)(float4 a, float4 b) pure @trusted
666     {
667         int4 result;
668         foreach(i; 0..4)
669         {
670             result.ptr[i] = compareFloat!float(comparison, a.array[i], b.array[i]) ? -1 : 0;
671         }
672         return result;
673     }
674 
675     /// Provides packed double comparisons
676     package long2 cmppd(FPComparison comparison)(double2 a, double2 b) pure @trusted
677     {
678         long2 result;
679         foreach(i; 0..2)
680         {
681             result.ptr[i] = compareFloat!double(comparison, a.array[i], b.array[i]) ? -1 : 0;
682         }
683         return result;
684     }
685 
686     /// Provides CMPSS-style comparison
687     package float4 cmpss(FPComparison comparison)(float4 a, float4 b) pure @trusted
688     {
689         int4 result = cast(int4)a;
690         result.ptr[0] = compareFloat!float(comparison, a.array[0], b.array[0]) ? -1 : 0;
691         return cast(float4)result;
692     }
693 
694     /// Provides CMPSD-style comparison
695     package double2 cmpsd(FPComparison comparison)(double2 a, double2 b) pure @trusted
696     {
697         long2 result = cast(long2)a;
698         result.ptr[0] = compareFloat!double(comparison, a.array[0], b.array[0]) ? -1 : 0;
699         return cast(double2)result;
700     }
701 
702     package int comss(FPComparison comparison)(float4 a, float4 b) pure @safe
703     {
704         return compareFloat!float(comparison, a.array[0], b.array[0]) ? 1 : 0;
705     }
706 
707     // Note: ucomss and ucomsd are left unimplemented
708     package int comsd(FPComparison comparison)(double2 a, double2 b) pure @safe
709     {
710         return compareFloat!double(comparison, a.array[0], b.array[0]) ? 1 : 0;
711     }
712 }
unittest // cmpps
{
    // Runs one predicate over both operands and checks all four mask lanes.
    void expectMask(FPComparison predicate)(float4 lhs, float4 rhs, int[4] expected)
    {
        int4 mask = cmpps!predicate(lhs, rhs);
        assert(mask.array == expected);
    }

    // Lanes 0..2 cover less-than, equal and greater-than;
    // lane 3 has a NaN on the left-hand side to exercise ordered vs unordered.
    float4 A = [1, 3, 5, float.nan];
    float4 B = [2, 3, 4, 5];

    // Check all comparison type is working
    expectMask!(FPComparison.oeq)(A, B, [ 0,-1, 0, 0]);
    expectMask!(FPComparison.ogt)(A, B, [ 0, 0,-1, 0]);
    expectMask!(FPComparison.oge)(A, B, [ 0,-1,-1, 0]);
    expectMask!(FPComparison.olt)(A, B, [-1, 0, 0, 0]);
    expectMask!(FPComparison.ole)(A, B, [-1,-1, 0, 0]);
    expectMask!(FPComparison.one)(A, B, [-1, 0,-1, 0]);
    expectMask!(FPComparison.ord)(A, B, [-1,-1,-1, 0]);
    expectMask!(FPComparison.ueq)(A, B, [ 0,-1, 0,-1]);
    expectMask!(FPComparison.ugt)(A, B, [ 0, 0,-1,-1]);
    expectMask!(FPComparison.uge)(A, B, [ 0,-1,-1,-1]);
    expectMask!(FPComparison.ult)(A, B, [-1, 0, 0,-1]);
    expectMask!(FPComparison.ule)(A, B, [-1,-1, 0,-1]);
    expectMask!(FPComparison.une)(A, B, [-1, 0,-1,-1]);
    expectMask!(FPComparison.uno)(A, B, [ 0, 0, 0,-1]);
}
unittest // cmppd
{
    double2 lhs = [1, 3];
    double2 rhs = [2, 3];

    // 1 < 2 holds, 3 < 3 does not; no NaN is involved.
    static immutable long[2] expected = [cast(long)(-1), 0];

    long2 mask = cmppd!(FPComparison.ult)(lhs, rhs);
    assert(mask.array == expected);
}
unittest // cmpss and comss
{
    // Checks cmpss and comss for one predicate against the scalar
    // reference `compareFloat`.
    void verify(FPComparison predicate)(float4 lhs, float4 rhs)
    {
        immutable bool holds = compareFloat!float(predicate, lhs.array[0], rhs.array[0]);
        immutable int wantedMask = holds ? -1 : 0;

        // cmpss: lane 0 is the mask, the other lanes come from the first operand
        float4 compared = cmpss!predicate(lhs, rhs);
        int4 comparedBits = cast(int4)compared;
        assert(comparedBits.array[0] == wantedMask);
        foreach (lane; 1 .. 4)
            assert(compared.array[lane] == lhs.array[lane]);

        // check comss
        immutable int comResult = comss!predicate(lhs, rhs);
        assert(holds == (comResult != 0));
    }

    float4 A = [1, 3, 5, 6];
    float4 B = [2, 3, 4, 5];
    float4 C = [float.nan, 3, 4, 5];

    // Check all comparison type is working: every predicate is exercised once
    // with ordinary operands and once with a NaN in lane 0 of the second one.
    foreach (name; __traits(allMembers, FPComparison))
    {
        verify!(__traits(getMember, FPComparison, name))(A, B);
        verify!(__traits(getMember, FPComparison, name))(A, C);
    }
}
unittest // cmpsd and comsd
{
    // Checks cmpsd and comsd for one predicate against the scalar
    // reference `compareFloat`.
    void verify(FPComparison predicate)(double2 lhs, double2 rhs)
    {
        immutable bool holds = compareFloat!double(predicate, lhs.array[0], rhs.array[0]);
        immutable long wantedMask = holds ? -1 : 0;

        // cmpsd: lane 0 is the mask, lane 1 comes from the first operand
        double2 compared = cmpsd!predicate(lhs, rhs);
        long2 comparedBits = cast(long2)compared;
        assert(comparedBits.array[0] == wantedMask);
        assert(compared.array[1] == lhs.array[1]);

        // check comsd
        immutable int comResult = comsd!predicate(lhs, rhs);
        assert(holds == (comResult != 0));
    }

    double2 A = [1, 3];
    double2 B = [2, 4];
    double2 C = [double.nan, 5];

    // Check all comparison type is working: every predicate is exercised once
    // with ordinary operands and once with a NaN in lane 0 of the second one.
    foreach (name; __traits(allMembers, FPComparison))
    {
        verify!(__traits(getMember, FPComparison, name))(A, B);
        verify!(__traits(getMember, FPComparison, name))(A, C);
    }
}
872 
873 //
874 //  </FLOATING-POINT COMPARISONS>
875 //
876 
877 
/// Narrows a 128-bit integer vector to 64 bits by keeping its low 64-bit lane.
__m64 to_m64(__m128i a) pure @trusted
{
    long2 wide = cast(long2)a;   // view the input as two 64-bit lanes
    long1 low;
    low.ptr[0] = wide.array[0];  // lane 1 is dropped
    return low;
}
885 
/// Widens a 64-bit vector to 128 bits: the input becomes the low 64-bit lane
/// and the high lane is zero.
__m128i to_m128i(__m64 a) pure @trusted
{
    long2 wide = [0, 0];         // high lane stays zero
    wide.ptr[0] = a.array[0];
    return cast(__m128i)wide;
}