1 /**
2 * Copyright: Copyright Auburn Sounds 2016-2018, Stefanos Baziotis 2019.
3 * License:   $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
4 * Authors:   Guillaume Piolat
5 */
6 module inteli.internals;
7 
8 import inteli.types;
9 
10 // The only math functions needed for intel-intrinsics
11 public import core.math: sqrt; // since it's an intrinsics
12 public import std.math: abs; // `fabs` is broken with GCC 4.9.2 on Linux 64-bit
13 
14 
// Compile-time capability flags.
// Every supported compiler branch below defines the same five `GDC_with_*`
// enums, so code such as `static if (GDC_with_x86)` can reference them no
// matter which compiler is used. They are only ever `true` under GDC.
// NOTE(review): the GNU-branch enums are not wrapped in `package(inteli)`
// like the LDC and DMD ones are -- confirm whether that is intentional.
version(GNU)
{
    version (X86)
    {
        // For 32-bit x86, disable vector extensions with GDC. 
        // It just doesn't work well.
        enum GDC_with_x86 = true;
        enum GDC_with_MMX = false;
        enum GDC_with_SSE = false;
        enum GDC_with_SSE2 = false;
        enum GDC_with_SSE3 = false;
    }
    else version (X86_64)
    {
        // GDC support uses extended inline assembly:
        //   https://gcc.gnu.org/onlinedocs/gcc/Extended-Asm.html        (general information and hints)
        //   https://gcc.gnu.org/onlinedocs/gcc/Simple-Constraints.html  (binding variables to registers)
        //   https://gcc.gnu.org/onlinedocs/gcc/Machine-Constraints.html (x86 specific register short names)

        public import core.simd;

        // NOTE: These intrinsics are not available in every i386 and x86_64 CPU.
        // For more info: https://gcc.gnu.org/onlinedocs/gcc-4.9.2/gcc/X86-Built-in-Functions.html 
        public import gcc.builtins;
                
        enum GDC_with_x86 = true;
        enum GDC_with_MMX = true; // We don't have a way to detect that at CT, but we assume it's there
        enum GDC_with_SSE = true; // We don't have a way to detect that at CT, but we assume it's there
        enum GDC_with_SSE2 = true; // We don't have a way to detect that at CT, but we assume it's there
        enum GDC_with_SSE3 = false; // TODO: we don't have a way to detect that at CT
    }
    else
    {
        // GDC targeting something other than x86: nothing is available.
        enum GDC_with_x86 = false;
        enum GDC_with_MMX = false;
        enum GDC_with_SSE = false;
        enum GDC_with_SSE2 = false;
        enum GDC_with_SSE3 = false;
    }
}
else version(LDC)
{
    public import core.simd;
    public import ldc.simd;
    public import ldc.gccbuiltins_x86;
    public import ldc.intrinsics;
    public import ldc.llvmasm: __asm;

    // Since LDC 1.13, using the new ldc.llvmasm.__ir variants instead of inlineIR
    static if (__VERSION__ >= 2083)
    {
         import ldc.llvmasm;
         alias LDCInlineIR = __ir_pure;

         // A version of inline IR with prefix/suffix didn't exist before LDC 1.13
         alias LDCInlineIREx = __irEx_pure; 
    }
    else
    {
        // Older front-ends: only the plain inline IR entry point is aliased,
        // `LDCInlineIREx` is not defined in this branch.
        alias LDCInlineIR = inlineIR;
    }
    
    package(inteli)
    {
        // Not GDC: all GDC-specific paths are disabled.
        enum GDC_with_x86 = false;
        enum GDC_with_MMX = false;
        enum GDC_with_SSE = false;
        enum GDC_with_SSE2 = false;
        enum GDC_with_SSE3 = false;
    }
}
else version(DigitalMars)
{
    package(inteli)
    {
        // Not GDC: all GDC-specific paths are disabled.
        enum GDC_with_x86 = false;
        enum GDC_with_MMX = false;
        enum GDC_with_SSE = false;
        enum GDC_with_SSE2 = false;
        enum GDC_with_SSE3 = false;
    }
}
else
{
    // Any other compiler is rejected at compile time.
    static assert(false, "Unknown compiler");
}
101 
// Inline-assembler availability flags for DMD.
//   DMD_with_asm       : DMD with either the 32-bit or the 64-bit inline assembler.
//   DMD_with_32bit_asm : DMD with the 32-bit inline assembler only.
// Both flags are `false` for every other compiler.
version(DigitalMars)
{
    version(D_InlineAsm_X86)
    {
        enum DMD_with_asm = true;
        enum DMD_with_32bit_asm = true; // sometimes you want a 32-bit DMD only solution
    }
    else version(D_InlineAsm_X86_64)
    {
        enum DMD_with_asm = true;
        enum DMD_with_32bit_asm = false;
    }
    else
    {
        enum DMD_with_asm = false;
        enum DMD_with_32bit_asm = false;
    }
}
else
{
    enum DMD_with_asm = false;
    enum DMD_with_32bit_asm = false;
}
121 
122 
123 
124 
125 package:
126 nothrow @nogc:
127 
128 
129 //
130 //  <ROUNDING>
131 //
//  Why is that there? For DMD, we cannot use rint because _MM_SET_ROUNDING_MODE
//  doesn't change the FPU rounding mode, and isn't expected to do so.
//  So we devised these rounding functions to help having consistent rounding between 
//  LDC and DMD. It's important that DMD uses what is in MXCSR to round.
136 //
137 
138 
/// Converts a `float` to a 32-bit integer with the `cvtss2si` instruction,
/// i.e. using whatever rounding mode is currently set in MXCSR.
/// Params:
///     value = the number to convert
/// Returns: the integer produced by `cvtss2si`.
int convertFloatToInt32UsingMXCSR(float value) pure @safe
{
    version(GNU)
    {
        // GCC extended asm: input in an SSE register, output in a general register.
        int converted;
        asm pure nothrow @nogc @trusted
        {
            "cvtss2si %1, %0\n": "=r"(converted) : "x" (value);
        }
        return converted;
    }
    else
    {
        // DMD-style inline asm: convert through EAX, then store to the local.
        int converted;
        asm pure nothrow @nogc @trusted
        {
            cvtss2si EAX, value;
            mov converted, EAX;
        }
        return converted;
    }
}
159 
/// Converts a `double` to a 32-bit integer with the `cvtsd2si` instruction,
/// i.e. using whatever rounding mode is currently set in MXCSR.
/// Params:
///     value = the number to convert
/// Returns: the integer produced by `cvtsd2si`.
int convertDoubleToInt32UsingMXCSR(double value) pure @safe
{
    version(GNU)
    {
        // GCC extended asm: input in an SSE register, output in a general register.
        int converted;
        asm pure nothrow @nogc @trusted
        {
            "cvtsd2si %1, %0\n": "=r"(converted) : "x" (value);
        }
        return converted;
    }
    else
    {
        // DMD-style inline asm: convert through EAX, then store to the local.
        int converted;
        asm pure nothrow @nogc @trusted
        {
            cvtsd2si EAX, value;
            mov converted, EAX;
        }
        return converted;
    }
}
180 
/// Converts a `float` to a 64-bit integer, honouring the rounding mode that is
/// currently set in MXCSR.
///
/// Four implementations are selected at compile time:
///  - D inline asm, x86_64: `cvtss2si` into RAX.
///  - D inline asm, 32-bit x86: x87 `fistp`, with the FPU control word
///    temporarily patched to carry the MXCSR rounding bits.
///  - GDC, x86_64: `__builtin_ia32_cvtss2si64`.
///  - GDC, 32-bit x86: the same x87 sequence in GCC extended asm (untested).
/// Any other configuration is rejected with `static assert(false)`.
///
/// Params:
///     value = the number to convert
/// Returns: the converted 64-bit integer.
long convertFloatToInt64UsingMXCSR(float value) pure @safe
{
    // 64-bit can use an SSE instruction
    version(D_InlineAsm_X86_64)
    {
        long result;
        version(LDC) // work-around for " Data definition directives inside inline asm are not supported yet."
        {
            asm pure nothrow @nogc @trusted
            {
                movss XMM0, value;
                cvtss2si RAX, XMM0;
                mov result, RAX;
            }
        }
        else
        {
            asm pure nothrow @nogc @trusted
            {
                movss XMM0, value;
                // Raw encoding: F3 prefix, REX.W, opcode 0F 2D, ModRM C0.
                db 0xf3; db 0x48; db 0x0f; db 0x2d; db 0xc0; // cvtss2si RAX, XMM0 (DMD refuses to emit)
                mov result, RAX;
            }
        }
        return result;
    }
    else version(D_InlineAsm_X86)
    {
        // In the case of 32-bit x86 there is no SSE2 way to convert FP to 64-bit int
        // This leads to an unfortunate FPU sequence in every C++ compiler.
        // See: https://godbolt.org/z/vZym77

        // Get current MXCSR rounding
        uint sseRounding;
        ushort savedFPUCW;
        ushort newFPUCW;
        long result;
        asm pure nothrow @nogc @trusted
        {
            stmxcsr sseRounding;     // read MXCSR
            fld value;               // push the value on the x87 stack
            fnstcw savedFPUCW;       // remember the current FPU control word
            mov AX, savedFPUCW;
            and AX, 0xf3ff;          // clear FPU rounding bits
            movzx ECX, word ptr sseRounding;
            and ECX, 0x6000;         // only keep SSE rounding bits
            shr ECX, 3;              // move them from mask 0x6000 to mask 0x0c00, the bits cleared above
            or AX, CX;               // make a new control word for FPU with SSE bits
            mov newFPUCW, AX;
            fldcw newFPUCW;
            fistp qword ptr result;            // convert, respecting MXCSR (but not other control word things)
            fldcw savedFPUCW;        // restore the original FPU control word
        }
        return result;
    }
    else static if (GDC_with_x86)
    {
        version(X86_64) // 64-bit can just use the right instruction
        {
            static assert(GDC_with_SSE);
            // Only lane 0 of A is written; the builtin is given the whole vector.
            __m128 A;
            A.ptr[0] = value;
            return __builtin_ia32_cvtss2si64 (A);
        }
        else version(X86) // 32-bit
        {
            // This is untested!
            // Same x87 sequence as the D_InlineAsm_X86 branch, in AT&T syntax.
            // NOTE(review): `value` uses the "f" constraint here, while
            // convertDoubleToInt64UsingMXCSR uses "t" -- confirm which is intended.
            // NOTE(review): %1 (sseRounding), %3 (savedFPUCW) and %4 (newFPUCW)
            // are stored to by the asm but are declared as inputs -- confirm
            // the constraints before relying on this path.
            uint sseRounding;
            ushort savedFPUCW;
            ushort newFPUCW;
            long result;
            asm pure nothrow @nogc @trusted
            {
                "stmxcsr %1;\n" ~
                "fld %2;\n" ~
                "fnstcw %3;\n" ~
                "movw %3, %%ax;\n" ~
                "andw $0xf3ff, %%ax;\n" ~
                "movzwl %1, %%ecx;\n" ~
                "andl $0x6000, %%ecx;\n" ~
                "shrl $3, %%ecx;\n" ~
                "orw %%cx, %%ax\n" ~
                "movw %%ax, %4;\n" ~
                "fldcw %4;\n" ~
                "fistpll %0;\n" ~
                "fldcw %3;\n" 
                  : "=m"(result)    // %0
                  : "m" (sseRounding),
                    "f" (value),
                    "m" (savedFPUCW),
                    "m" (newFPUCW) 
                  : "eax", "ecx", "st";
            }
            return result;
        }
        else
            static assert(false);
    }
    else
        static assert(false);
}
282 
/// Converts a `double` to a 64-bit integer, honouring the rounding mode that
/// is currently set in MXCSR.
///
/// Four implementations are selected at compile time:
///  - D inline asm, x86_64: `cvtsd2si` into RAX.
///  - D inline asm, 32-bit x86: x87 `fistp`, with the FPU control word
///    temporarily patched to carry the MXCSR rounding bits.
///  - GDC, x86_64: `__builtin_ia32_cvtsd2si64`.
///  - GDC, 32-bit x86: the same x87 sequence in GCC extended asm (untested).
/// Any other configuration is rejected with `static assert(false)`.
///
/// Params:
///     value = the number to convert
/// Returns: the converted 64-bit integer.
long convertDoubleToInt64UsingMXCSR(double value) pure @safe
{
    // 64-bit can use an SSE instruction
    version(D_InlineAsm_X86_64)
    {
        long result;
        version(LDC) // work-around for "Data definition directives inside inline asm are not supported yet."
        {
            asm pure nothrow @nogc @trusted
            {
                movsd XMM0, value;
                cvtsd2si RAX, XMM0;
                mov result, RAX;
            }
        }
        else
        {
            asm pure nothrow @nogc @trusted
            {
                movsd XMM0, value;
                // Raw encoding: F2 prefix, REX.W, opcode 0F 2D, ModRM C0.
                db 0xf2; db 0x48; db 0x0f; db 0x2d; db 0xc0; // cvtsd2si RAX, XMM0 (DMD refuses to emit)
                mov result, RAX;
            }
        }
        return result;
    }
    else version(D_InlineAsm_X86)
    {
        // In the case of 32-bit x86 there is no SSE2 way to convert FP to 64-bit int
        // This leads to an unfortunate FPU sequence in every C++ compiler.
        // See: https://godbolt.org/z/vZym77

        // Get current MXCSR rounding
        uint sseRounding;
        ushort savedFPUCW;
        ushort newFPUCW;
        long result;
        asm pure nothrow @nogc @trusted
        {
            stmxcsr sseRounding;     // read MXCSR
            fld value;               // push the value on the x87 stack
            fnstcw savedFPUCW;       // remember the current FPU control word
            mov AX, savedFPUCW;
            and AX, 0xf3ff;          // clear FPU rounding bits
            movzx ECX, word ptr sseRounding;
            and ECX, 0x6000;         // only keep SSE rounding bits
            shr ECX, 3;              // move them from mask 0x6000 to mask 0x0c00, the bits cleared above
            or AX, CX;               // make a new control word for FPU with SSE bits
            mov newFPUCW, AX;
            fldcw newFPUCW;
            // Explicit 64-bit store, written the same way as in
            // convertFloatToInt64UsingMXCSR.
            fistp qword ptr result;  // convert, respecting MXCSR (but not other control word things)
            fldcw savedFPUCW;        // restore the original FPU control word
        }
        return result;
    }
    else static if (GDC_with_x86)
    {
        version(X86_64)
        {
            static assert(GDC_with_SSE2);
            // Only lane 0 of A is written; the builtin is given the whole vector.
            __m128d A;
            A.ptr[0] = value;
            return __builtin_ia32_cvtsd2si64 (A);
        }
        else version(X86) // 32-bit
        {
            // This is untested!
            // Same x87 sequence as the D_InlineAsm_X86 branch, in AT&T syntax.
            // NOTE(review): `value` uses the "t" constraint here, while
            // convertFloatToInt64UsingMXCSR uses "f" -- confirm which is intended.
            // NOTE(review): %1 (sseRounding), %3 (savedFPUCW) and %4 (newFPUCW)
            // are stored to by the asm but are declared as inputs -- confirm
            // the constraints before relying on this path.
            uint sseRounding;
            ushort savedFPUCW;
            ushort newFPUCW;
            long result;
            asm pure nothrow @nogc @trusted
            {
                "stmxcsr %1;\n" ~
                "fld %2;\n" ~
                "fnstcw %3;\n" ~
                "movw %3, %%ax;\n" ~
                "andw $0xf3ff, %%ax;\n" ~
                "movzwl %1, %%ecx;\n" ~
                "andl $0x6000, %%ecx;\n" ~
                "shrl $3, %%ecx;\n" ~
                "orw %%cx, %%ax\n" ~
                "movw %%ax, %4;\n" ~
                "fldcw %4;\n" ~
                "fistpll %0;\n" ~
                "fldcw %3;\n"
                  : "=m"(result)    // %0
                  : "m" (sseRounding),
                    "t" (value),
                    "m" (savedFPUCW),
                    "m" (newFPUCW)
                  : "eax", "ecx", "st";
            }
            return result;
        }
        else
            static assert(false);
    }
    else
        static assert(false);
}
383 
384 
385 //
386 //  </ROUNDING>
387 //
388 
389 
390 // using the Intel terminology here
391 
/// Clamps a signed 16-bit value into the signed 8-bit range [-128, 127].
/// Params:
///     value = the 16-bit input
/// Returns: `value` if it already fits in a `byte`, otherwise the nearest bound.
byte saturateSignedWordToSignedByte(short value) pure @safe
{
    if (value > byte.max)
        return byte.max;
    if (value < byte.min)
        return byte.min;
    return cast(byte) value;
}
398 
/// Clamps a signed 16-bit value into the unsigned 8-bit range [0, 255].
/// Params:
///     value = the 16-bit input
/// Returns: `value` if it already fits in a `ubyte`, otherwise the nearest bound.
ubyte saturateSignedWordToUnsignedByte(short value) pure @safe
{
    if (value < ubyte.min)
        return ubyte.min;
    if (value > ubyte.max)
        return ubyte.max;
    return cast(ubyte) value;
}
405 
/// Clamps a signed 32-bit value into the signed 16-bit range [-32768, 32767].
/// Params:
///     value = the 32-bit input
/// Returns: `value` if it already fits in a `short`, otherwise the nearest bound.
short saturateSignedIntToSignedShort(int value) pure @safe
{
    return (value > short.max) ? short.max
         : (value < short.min) ? short.min
         : cast(short) value;
}
412 
/// Clamps a signed 32-bit value into the unsigned 16-bit range [0, 65535].
/// Params:
///     value = the 32-bit input
/// Returns: `value` if it already fits in a `ushort`, otherwise the nearest bound.
ushort saturateSignedIntToUnsignedShort(int value) pure @safe
{
    enum int lowest = ushort.min;
    enum int highest = ushort.max;
    immutable int clamped = (value < lowest) ? lowest
                          : (value > highest) ? highest
                          : value;
    return cast(ushort) clamped;
}
419 
unittest // test saturate operations
{
    // 16-bit -> signed 8-bit
    assert(saturateSignedWordToSignedByte(32000) == 127);
    assert(saturateSignedWordToSignedByte(-4000) == -128);

    // 16-bit -> unsigned 8-bit
    assert(saturateSignedWordToUnsignedByte(32000) == 255);
    assert(saturateSignedWordToUnsignedByte(-4000) == 0);

    // 32-bit -> signed 16-bit
    assert(saturateSignedIntToSignedShort(32768) == 32767);
    assert(saturateSignedIntToSignedShort(-32769) == -32768);

    // 32-bit -> unsigned 16-bit (32768 is in range, so it passes through)
    assert(saturateSignedIntToUnsignedShort(32768) == 32768);
    assert(saturateSignedIntToUnsignedShort(-32769) == 0);
}
431 
version(unittest)
{
    // This is just for debugging tests
    import core.stdc.stdio: printf;

    // Helpers that dump the lanes of a vector to stdout, one vector per line.
    // Note: you can override `pure` within a `debug` clause

    /// Prints a `__m64` as two 32-bit integers.
    void _mm_print_pi32(__m64 v) @trusted
    {
        int[2] lanes = (cast(int2)v).array;
        printf("%d %d\n", lanes[0], lanes[1]);
    }

    /// Prints a `__m64` as four 16-bit integers.
    void _mm_print_pi16(__m64 v) @trusted
    {
        short[4] lanes = (cast(short4)v).array;
        printf("%d %d %d %d\n", lanes[0], lanes[1], lanes[2], lanes[3]);
    }

    /// Prints a `__m64` as eight 8-bit integers.
    void _mm_print_pi8(__m64 v) @trusted
    {
        byte[8] lanes = (cast(byte8)v).array;
        printf("%d %d %d %d %d %d %d %d\n",
               lanes[0], lanes[1], lanes[2], lanes[3],
               lanes[4], lanes[5], lanes[6], lanes[7]);
    }

    /// Prints the first four elements of `v.array` with `%d`.
    void _mm_print_epi32(__m128i v) @trusted
    {
        auto lanes = v.array;
        printf("%d %d %d %d\n",
               lanes[0], lanes[1], lanes[2], lanes[3]);
    }

    /// Prints a `__m128i` as eight 16-bit integers.
    void _mm_print_epi16(__m128i v) @trusted
    {
        short[8] lanes = (cast(short8)v).array;
        printf("%d %d %d %d %d %d %d %d\n",
               lanes[0], lanes[1], lanes[2], lanes[3],
               lanes[4], lanes[5], lanes[6], lanes[7]);
    }

    /// Prints a `__m128i` as sixteen 8-bit integers.
    void _mm_print_epi8(__m128i v) @trusted
    {
        byte[16] lanes = (cast(byte16)v).array;
        printf("%d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d\n",
               lanes[0], lanes[1], lanes[2], lanes[3],
               lanes[4], lanes[5], lanes[6], lanes[7],
               lanes[8], lanes[9], lanes[10], lanes[11],
               lanes[12], lanes[13], lanes[14], lanes[15]);
    }

    /// Prints a `__m128` as four floats.
    void _mm_print_ps(__m128 v) @trusted
    {
        float[4] lanes = (cast(float4)v).array;
        printf("%f %f %f %f\n", lanes[0], lanes[1], lanes[2], lanes[3]);
    }

    /// Prints a `__m128d` as two doubles.
    void _mm_print_pd(__m128d v) @trusted
    {
        double[2] lanes = (cast(double2)v).array;
        printf("%f %f\n", lanes[0], lanes[1]);
    }
}
491 
492 
493 //
494 //  <FLOATING-POINT COMPARISONS>
495 //
496 // Note: `ldc.simd` cannot express all nuances of FP comparisons, so we
497 //       need different IR generation.
498 
/// Floating-point comparison predicates.
/// "Ordered" predicates (o*) are false when either operand is NaN;
/// "unordered" predicates (u*) are true when either operand is NaN.
/// The declaration order matters: `FPComparisonToString` is indexed by these values.
enum FPComparison
{
    oeq,   // ordered and equal
    ogt,   // ordered and greater than
    oge,   // ordered and greater than or equal
    olt,   // ordered and less than
    ole,   // ordered and less than or equal
    one,   // ordered and not equal
    ord,   // ordered (no nans)
    ueq,   // unordered or equal
    ugt,   // unordered or greater than ("nle")
    uge,   // unordered or greater than or equal ("nlt")
    ult,   // unordered or less than ("nge")
    ule,   // unordered or less than or equal ("ngt")
    une,   // unordered or not equal ("neq")
    uno,   // unordered (either nans)
}
516 
/// Predicate mnemonic for each `FPComparison` member, indexed by the enum value.
/// The LDC code paths splice these strings into `fcmp` instructions, so the
/// entries must stay in the same order as the enum declaration.
private static immutable string[FPComparison.max+1] FPComparisonToString =
[
    "oeq",
    "ogt",
    "oge",
    "olt",
    "ole",
    "one",
    "ord",
    "ueq",
    "ugt",
    "uge",
    "ult",
    "ule",
    "une",
    "uno",
];
534 
// Individual float comparison: returns `true` when the predicate holds and
// `false` otherwise (callers turn that into -1/0 or 1/0 as needed).
// Useful for DMD and testing
private bool compareFloat(T)(FPComparison comparison, T a, T b) pure @safe
{
    import std.math;

    // A comparison is "ordered" when neither operand is NaN.
    immutable bool ordered = !(isNaN(a) || isNaN(b));

    final switch(comparison)
    {
        // Ordered predicates: the built-in operators already yield false on NaN,
        // except `!=`, which needs the explicit guard.
        case FPComparison.oeq: return a == b;
        case FPComparison.ogt: return a > b;
        case FPComparison.oge: return a >= b;
        case FPComparison.olt: return a < b;
        case FPComparison.ole: return a <= b;
        case FPComparison.one: return ordered && (a != b); // NaN with != always yields true
        case FPComparison.ord: return ordered;

        // Unordered predicates: true as soon as one operand is NaN.
        case FPComparison.ueq: return !ordered || (a == b);
        case FPComparison.ugt: return !ordered || (a > b);
        case FPComparison.uge: return !ordered || (a >= b);
        case FPComparison.ult: return !ordered || (a < b);
        case FPComparison.ule: return !ordered || (a <= b);
        case FPComparison.une: return (a != b); // NaN with != always yields true
        case FPComparison.uno: return !ordered;
    }
}
559 
560 version(LDC)
561 {
    /// Provides packed float comparisons.
    /// Emits an `fcmp` with the predicate named by `comparison` on the two
    /// <4 x float> operands, then sign-extends each i1 result to i32.
    /// Params:
    ///     comparison = predicate, chosen at compile time
    ///     a = left-hand operands
    ///     b = right-hand operands
    /// Returns: the four sign-extended comparison results.
    package int4 cmpps(FPComparison comparison)(float4 a, float4 b) pure @safe
    {
        // The predicate mnemonic is spliced into the IR text at compile time.
        enum ir = `
            %cmp = fcmp `~ FPComparisonToString[comparison] ~` <4 x float> %0, %1
            %r = sext <4 x i1> %cmp to <4 x i32>
            ret <4 x i32> %r`;

        return LDCInlineIR!(ir, int4, float4, float4)(a, b);
    }
572 
    /// Provides packed double comparisons.
    /// Emits an `fcmp` with the predicate named by `comparison` on the two
    /// <2 x double> operands, then sign-extends each i1 result to i64.
    /// Params:
    ///     comparison = predicate, chosen at compile time
    ///     a = left-hand operands
    ///     b = right-hand operands
    /// Returns: the two sign-extended comparison results.
    package long2 cmppd(FPComparison comparison)(double2 a, double2 b) pure @safe
    {
        // The predicate mnemonic is spliced into the IR text at compile time.
        enum ir = `
            %cmp = fcmp `~ FPComparisonToString[comparison] ~` <2 x double> %0, %1
            %r = sext <2 x i1> %cmp to <2 x i64>
            ret <2 x i64> %r`;

        return LDCInlineIR!(ir, long2, double2, double2)(a, b);
    }
583 
    /// CMPSS-style comparisons
    /// clang implement it through x86 intrinsics, it is possible with IR alone
    /// but leads to less optimal code.
    /// PERF: try to implement it with __builtin_ia32_cmpss and immediate 0 to 7. 
    /// Not that simple.
    ///
    /// Only element 0 of `a` and `b` is compared. The i1 result is
    /// sign-extended to i32 and bitcast to float, then written into element 0
    /// of a copy of `a`; the other elements of `a` are returned as they are.
    package float4 cmpss(FPComparison comparison)(float4 a, float4 b) pure @safe
    {
        // Disabled draft of the builtin-based version mentioned in the PERF note.
        /*
        enum ubyte predicateNumber = FPComparisonToX86Predicate[comparison];
        enum bool invertOp = (predicateNumber & 0x80) != 0;
        static if(invertOp)
            return __builtin_ia32_cmpsd(b, a, predicateNumber & 0x7f);
        else
            return __builtin_ia32_cmpsd(a, b, predicateNumber & 0x7f);
        */
        // Scalar IR: the predicate mnemonic is spliced in at compile time.
        enum ir = `
            %cmp = fcmp `~ FPComparisonToString[comparison] ~` float %0, %1
            %r = sext i1 %cmp to i32
            %r2 = bitcast i32 %r to float
            ret float %r2`;

        float4 r = a;
        r[0] = LDCInlineIR!(ir, float, float, float)(a[0], b[0]);
        return r;
    }
609 
    /// CMPSD-style comparisons
    /// clang implement it through x86 intrinsics, it is possible with IR alone
    /// but leads to less optimal code.
    /// PERF: try to implement it with __builtin_ia32_cmpsd and immediate 0 to 7. 
    /// Not that simple.    
    ///
    /// Only element 0 of `a` and `b` is compared. The i1 result is
    /// sign-extended to i64 and bitcast to double, then written into element 0
    /// of a copy of `a`; element 1 of `a` is returned as it is.
    package double2 cmpsd(FPComparison comparison)(double2 a, double2 b) pure @safe
    {
        // Scalar IR: the predicate mnemonic is spliced in at compile time.
        enum ir = `
            %cmp = fcmp `~ FPComparisonToString[comparison] ~` double %0, %1
            %r = sext i1 %cmp to i64
            %r2 = bitcast i64 %r to double
            ret double %r2`;

        double2 r = a;
        r[0] = LDCInlineIR!(ir, double, double, double)(a[0], b[0]);
        return r;
    }
627 
    // Note: ucomss and ucomsd are left unimplemented
    /// Scalar float comparison of element 0 of `a` and `b`.
    /// The i1 result of `fcmp` is zero-extended to i32.
    /// Returns: the zero-extended comparison result.
    package int comss(FPComparison comparison)(float4 a, float4 b) pure @safe
    {
        // The predicate mnemonic is spliced into the IR text at compile time.
        enum ir = `
            %cmp = fcmp `~ FPComparisonToString[comparison] ~` float %0, %1
            %r = zext i1 %cmp to i32
            ret i32 %r`;

        return LDCInlineIR!(ir, int, float, float)(a[0], b[0]);
    }
638 
    // Note: ucomss and ucomsd are left unimplemented
    /// Scalar double comparison of element 0 of `a` and `b`.
    /// The i1 result of `fcmp` is zero-extended to i32.
    /// Returns: the zero-extended comparison result.
    package int comsd(FPComparison comparison)(double2 a, double2 b) pure @safe
    {
        // The predicate mnemonic is spliced into the IR text at compile time.
        enum ir = `
            %cmp = fcmp `~ FPComparisonToString[comparison] ~` double %0, %1
            %r = zext i1 %cmp to i32
            ret i32 %r`;

        return LDCInlineIR!(ir, int, double, double)(a[0], b[0]);
    }
649 }
650 else
651 {
652     /// Provides packed float comparisons
653     package int4 cmpps(FPComparison comparison)(float4 a, float4 b) pure @trusted
654     {
655         int4 result;
656         foreach(i; 0..4)
657         {
658             result.ptr[i] = compareFloat!float(comparison, a.array[i], b.array[i]) ? -1 : 0;
659         }
660         return result;
661     }
662 
663     /// Provides packed double comparisons
664     package long2 cmppd(FPComparison comparison)(double2 a, double2 b) pure @trusted
665     {
666         long2 result;
667         foreach(i; 0..2)
668         {
669             result.ptr[i] = compareFloat!double(comparison, a.array[i], b.array[i]) ? -1 : 0;
670         }
671         return result;
672     }
673 
674     /// Provides CMPSS-style comparison
675     package float4 cmpss(FPComparison comparison)(float4 a, float4 b) pure @trusted
676     {
677         int4 result = cast(int4)a;
678         result.ptr[0] = compareFloat!float(comparison, a.array[0], b.array[0]) ? -1 : 0;
679         return cast(float4)result;
680     }
681 
682     /// Provides CMPSD-style comparison
683     package double2 cmpsd(FPComparison comparison)(double2 a, double2 b) pure @trusted
684     {
685         long2 result = cast(long2)a;
686         result.ptr[0] = compareFloat!double(comparison, a.array[0], b.array[0]) ? -1 : 0;
687         return cast(double2)result;
688     }
689 
690     package int comss(FPComparison comparison)(float4 a, float4 b) pure @safe
691     {
692         return compareFloat!float(comparison, a.array[0], b.array[0]) ? 1 : 0;
693     }
694 
695     // Note: ucomss and ucomsd are left unimplemented
696     package int comsd(FPComparison comparison)(double2 a, double2 b) pure @safe
697     {
698         return compareFloat!double(comparison, a.array[0], b.array[0]) ? 1 : 0;
699     }
700 }
unittest // cmpps
{
    // Check that every comparison type is working.
    // Lane 0: a < b, lane 1: a == b, lane 2: a > b, lane 3: a is NaN.
    float4 A = [1, 3, 5, float.nan];
    float4 B = [2, 3, 4, 5];

    // Ordered predicates: the NaN lane is always 0.
    static immutable int[4] expect_oeq = [ 0,-1, 0, 0];
    static immutable int[4] expect_ogt = [ 0, 0,-1, 0];
    static immutable int[4] expect_oge = [ 0,-1,-1, 0];
    static immutable int[4] expect_olt = [-1, 0, 0, 0];
    static immutable int[4] expect_ole = [-1,-1, 0, 0];
    static immutable int[4] expect_one = [-1, 0,-1, 0];
    static immutable int[4] expect_ord = [-1,-1,-1, 0];

    // Unordered predicates: the NaN lane is always -1.
    static immutable int[4] expect_ueq = [ 0,-1, 0,-1];
    static immutable int[4] expect_ugt = [ 0, 0,-1,-1];
    static immutable int[4] expect_uge = [ 0,-1,-1,-1];
    static immutable int[4] expect_ult = [-1, 0, 0,-1];
    static immutable int[4] expect_ule = [-1,-1, 0,-1];
    static immutable int[4] expect_une = [-1, 0,-1,-1];
    static immutable int[4] expect_uno = [ 0, 0, 0,-1];

    assert(cmpps!(FPComparison.oeq)(A, B).array == expect_oeq);
    assert(cmpps!(FPComparison.ogt)(A, B).array == expect_ogt);
    assert(cmpps!(FPComparison.oge)(A, B).array == expect_oge);
    assert(cmpps!(FPComparison.olt)(A, B).array == expect_olt);
    assert(cmpps!(FPComparison.ole)(A, B).array == expect_ole);
    assert(cmpps!(FPComparison.one)(A, B).array == expect_one);
    assert(cmpps!(FPComparison.ord)(A, B).array == expect_ord);
    assert(cmpps!(FPComparison.ueq)(A, B).array == expect_ueq);
    assert(cmpps!(FPComparison.ugt)(A, B).array == expect_ugt);
    assert(cmpps!(FPComparison.uge)(A, B).array == expect_uge);
    assert(cmpps!(FPComparison.ult)(A, B).array == expect_ult);
    assert(cmpps!(FPComparison.ule)(A, B).array == expect_ule);
    assert(cmpps!(FPComparison.une)(A, B).array == expect_une);
    assert(cmpps!(FPComparison.uno)(A, B).array == expect_uno);
}
unittest // cmppd
{
    // Lane 0: 1 < 2 so "ult" holds (-1); lane 1: 3 == 3 so it does not (0).
    double2 lhs = [1, 3];
    double2 rhs = [2, 3];
    static immutable long[2] expected = [cast(long)(-1), 0];

    long2 mask = cmppd!(FPComparison.ult)(lhs, rhs);
    assert(mask.array == expected);
}
unittest // cmpss and comss
{
    // Checks cmpss and comss against compareFloat for one predicate.
    void testComparison(FPComparison comparison)(float4 A, float4 B)
    {
        int expected = compareFloat!float(comparison, A.array[0], B.array[0]) ? -1 : 0;

        // cmpss: lane 0 holds the mask, the other lanes are copied from A
        float4 result = cmpss!comparison(A, B);
        int4 iresult = cast(int4)result;
        assert(iresult.array[0] == expected);
        assert(result.array[1] == A.array[1]);
        assert(result.array[2] == A.array[2]);
        assert(result.array[3] == A.array[3]);

        // check comss
        int comResult = comss!comparison(A, B);
        assert( (expected != 0) == (comResult != 0) );
    }

    // Check that every comparison type is working
    float4 A = [1, 3, 5, 6];
    float4 B = [2, 3, 4, 5];
    float4 C = [float.nan, 3, 4, 5];

    // Unrolled at compile time, in declaration order of FPComparison:
    // each predicate is tested against an ordered pair, then a NaN pair.
    foreach (memberName; __traits(allMembers, FPComparison))
    {
        enum FPComparison predicate = __traits(getMember, FPComparison, memberName);
        testComparison!predicate(A, B);
        testComparison!predicate(A, C);
    }
}
unittest // cmpsd and comsd
{
    // Checks cmpsd and comsd against compareFloat for one predicate.
    void testComparison(FPComparison comparison)(double2 A, double2 B)
    {
        long expected = compareFloat!double(comparison, A.array[0], B.array[0]) ? -1 : 0;

        // cmpsd: lane 0 holds the mask, lane 1 is copied from A
        double2 result = cmpsd!comparison(A, B);
        long2 iresult = cast(long2)result;
        assert(iresult.array[0] == expected);
        assert(result.array[1] == A.array[1]);

        // check comsd
        int comResult = comsd!comparison(A, B);
        assert( (expected != 0) == (comResult != 0) );
    }

    // Check that every comparison type is working
    double2 A = [1, 3];
    double2 B = [2, 4];
    double2 C = [double.nan, 5];

    // Unrolled at compile time, in declaration order of FPComparison:
    // each predicate is tested against an ordered pair, then a NaN pair.
    foreach (memberName; __traits(allMembers, FPComparison))
    {
        enum FPComparison predicate = __traits(getMember, FPComparison, memberName);
        testComparison!predicate(A, B);
        testComparison!predicate(A, C);
    }
}
860 
861 //
862 //  </FLOATING-POINT COMPARISONS>
863 //
864 
865 
/// Narrows a 128-bit integer vector to `__m64` by keeping its low 64 bits.
/// Params:
///     a = the 128-bit source
/// Returns: a `__m64` whose single 64-bit element is element 0 of `a` viewed as `long2`.
__m64 to_m64(__m128i a) pure @trusted
{
    long1 low;
    low.ptr[0] = (cast(long2)a).array[0];
    return low;
}
873 
/// Widens a `__m64` to a 128-bit integer vector.
/// Params:
///     a = the 64-bit source
/// Returns: a `__m128i` whose low 64 bits are element 0 of `a` and whose high 64 bits are zero.
__m128i to_m128i(__m64 a) pure @trusted
{
    // Start from all zeroes so the upper half stays cleared.
    long2 widened = [0, 0];
    widened.ptr[0] = a.array[0];
    return cast(__m128i) widened;
}