inteli.internals source code

1 /**
2 * Copyright: Copyright Auburn Sounds 2016-2018.
3 * License:   $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
4 * Authors:   Guillaume Piolat
5 */
6 module inteli.internals;
7 
8 import inteli.types;
9 
10 // The only math functions needed for intel-intrinsics
11 public import core.math: fabs, sqrt; // since they are intrinsics
12 
13 version(LDC)
14 {
15     public import core.simd;
16     public import ldc.simd;
17     public import ldc.gccbuiltins_x86;
18     public import ldc.intrinsics;
19     public import ldc.llvmasm: __asm;
20 
21     // Since LDC 1.13, using the new ldc.llvmasm.__ir variants instead of inlineIR
22     static if (__VERSION__ >= 2083)
23     {
24          import ldc.llvmasm;
25          alias LDCInlineIR = __ir_pure;
26 
27          // A version of inline IR with prefix/suffix didn't exist before LDC 1.13
28          alias LDCInlineIREx = __irEx_pure; 
29     }
30     else
31     {
32         alias LDCInlineIR = inlineIR;
33     }
34 }
35 
36 
37 
38 package:
39 nothrow @nogc:
40 
41 
42 //
43 //  <ROUNDING>
44 //
45 //  Why is that there? For DMD, we cannot use rint because _MM_SET_ROUNDING_MODE
46 //  doesn't change the FPU rounding mode, and isn't expected to do so.
//  So we devised these rounding functions to help having consistent rounding between
//  LDC and DMD. It's important that DMD uses what is in MXCSR to round.
49 //
50 
51 
/// Converts a `float` to a 32-bit signed integer with the SSE `cvtss2si`
/// instruction, so that the rounding applied is the one currently selected
/// in MXCSR rather than the x87 FPU rounding mode.
/// Params:
///     value = the single-precision value to convert.
/// Returns: the integer left in EAX by `cvtss2si`.
int convertFloatToInt32UsingMXCSR(float value) pure @safe
{
    int result;
    asm pure nothrow @nogc @trusted
    {
        cvtss2si EAX, value;   // scalar float -> int32, result in EAX
        mov result, EAX;       // copy EAX out to the D local
    }
    return result;
}
62 
/// Converts a `double` to a 32-bit signed integer with the SSE2 `cvtsd2si`
/// instruction, so that the rounding applied is the one currently selected
/// in MXCSR rather than the x87 FPU rounding mode.
/// Params:
///     value = the double-precision value to convert.
/// Returns: the integer left in EAX by `cvtsd2si`.
int convertDoubleToInt32UsingMXCSR(double value) pure @safe
{
    int result;
    asm pure nothrow @nogc @trusted
    {
        cvtsd2si EAX, value;   // scalar double -> int32, result in EAX
        mov result, EAX;       // copy EAX out to the D local
    }
    return result;
}
73 
/// Converts a `float` to a 64-bit signed integer, rounding according to the
/// rounding-control bits currently held in MXCSR.
///
/// With 64-bit inline asm, a single `cvtss2si RAX, XMM0` does the job. With
/// 32-bit inline asm, the x87 FPU performs the conversion after its control
/// word has temporarily been given the MXCSR rounding bits.
/// Params:
///     value = the single-precision value to convert.
/// Returns: the converted 64-bit integer.
long convertFloatToInt64UsingMXCSR(float value) pure @safe
{
    // 64-bit can use an SSE instruction
    version(D_InlineAsm_X86_64)
    {
        long result;
        version(LDC) // work-around for " Data definition directives inside inline asm are not supported yet."
        {
            asm pure nothrow @nogc @trusted
            {
                movss XMM0, value;
                cvtss2si RAX, XMM0;
                mov result, RAX;
            }
        }
        else
        {
            asm pure nothrow @nogc @trusted
            {
                movss XMM0, value;
                // The instruction is emitted as raw bytes here.
                db 0xf3; db 0x48; db 0x0f; db 0x2d; db 0xc0; // cvtss2si RAX, XMM0 (DMD refuses to emit)
                mov result, RAX;
            }
        }
        return result;
    }
    else version(D_InlineAsm_X86)
    {
        // In the case of 32-bit x86 there is no SSE2 way to convert FP to 64-bit int
        // This leads to an unfortunate FPU sequence in every C++ compiler.
        // See: https://godbolt.org/z/vZym77

        // Get current MXCSR rounding
        uint sseRounding;     // receives the whole MXCSR register
        ushort savedFPUCW;    // original x87 control word, restored at the end
        ushort newFPUCW;      // control word carrying the MXCSR rounding bits
        long result;
        asm pure nothrow @nogc @trusted
        {
            stmxcsr sseRounding;
            fld value;                         // push the operand on the x87 stack
            fnstcw savedFPUCW;
            mov AX, savedFPUCW;
            and AX, 0xf3ff;          // clear FPU rounding bits
            movzx ECX, word ptr sseRounding;
            and ECX, 0x6000;         // only keep SSE rounding bits
            shr ECX, 3;              // move them from bits 13-14 down to bits 10-11
            or AX, CX;               // make a new control word for FPU with SSE bits
            mov newFPUCW, AX;
            fldcw newFPUCW;
            fistp qword ptr result;            // convert, respecting MXCSR (but not other control word things)
            fldcw savedFPUCW;                  // put the original control word back
        }
        return result;
    }
    else
        static assert(false);
}
132 
/// Converts a `double` to a 64-bit signed integer, rounding according to the
/// rounding-control bits currently held in MXCSR.
///
/// With 64-bit inline asm, a single `cvtsd2si RAX, XMM0` does the job. With
/// 32-bit inline asm, the x87 FPU performs the conversion after its control
/// word has temporarily been given the MXCSR rounding bits.
/// Params:
///     value = the double-precision value to convert.
/// Returns: the converted 64-bit integer.
long convertDoubleToInt64UsingMXCSR(double value) pure @safe
{
    // 64-bit can use an SSE instruction
    version(D_InlineAsm_X86_64)
    {
        long result;
        version(LDC) // work-around for "Data definition directives inside inline asm are not supported yet."
        {
            asm pure nothrow @nogc @trusted
            {
                movsd XMM0, value;
                cvtsd2si RAX, XMM0;
                mov result, RAX;
            }
        }
        else
        {
            asm pure nothrow @nogc @trusted
            {
                movsd XMM0, value;
                // The instruction is emitted as raw bytes here.
                db 0xf2; db 0x48; db 0x0f; db 0x2d; db 0xc0; // cvtsd2si RAX, XMM0 (DMD refuses to emit)
                mov result, RAX;
            }
        }
        return result;
    }
    else version(D_InlineAsm_X86)
    {
        // In the case of 32-bit x86 there is no SSE2 way to convert FP to 64-bit int
        // This leads to an unfortunate FPU sequence in every C++ compiler.
        // See: https://godbolt.org/z/vZym77

        // Get current MXCSR rounding
        uint sseRounding;     // receives the whole MXCSR register
        ushort savedFPUCW;    // original x87 control word, restored at the end
        ushort newFPUCW;      // control word carrying the MXCSR rounding bits
        long result;
        asm pure nothrow @nogc @trusted
        {
            stmxcsr sseRounding;
            fld value;                         // push the operand on the x87 stack
            fnstcw savedFPUCW;
            mov AX, savedFPUCW;
            and AX, 0xf3ff;          // clear FPU rounding bits
            movzx ECX, word ptr sseRounding;
            and ECX, 0x6000;         // only keep SSE rounding bits
            shr ECX, 3;              // move them from bits 13-14 down to bits 10-11
            or AX, CX;               // make a new control word for FPU with SSE bits
            mov newFPUCW, AX;
            fldcw newFPUCW;
            // Explicit 64-bit operand size, matching convertFloatToInt64UsingMXCSR.
            fistp qword ptr result;            // convert, respecting MXCSR (but not other control word things)
            fldcw savedFPUCW;                  // put the original control word back
        }
        return result;
    }
    else
        static assert(false);
}
192 
193 
194 //
195 //  </ROUNDING>
196 //
197 
198 
199 // using the Intel terminology here
200 
/// Clamps a signed 16-bit value into the signed 8-bit range [-128, 127].
byte saturateSignedWordToSignedByte(short value) pure @safe
{
    if (value > byte.max)
        return byte.max;
    if (value < byte.min)
        return byte.min;
    return cast(byte) value;
}
207 
/// Clamps a signed 16-bit value into the unsigned 8-bit range [0, 255].
ubyte saturateSignedWordToUnsignedByte(short value) pure @safe
{
    if (value > ubyte.max)
        return ubyte.max;
    if (value < 0)
        return 0;
    return cast(ubyte) value;
}
214 
/// Clamps a signed 32-bit value into the signed 16-bit range [-32768, 32767].
short saturateSignedIntToSignedShort(int value) pure @safe
{
    if (value > short.max)
        return short.max;
    if (value < short.min)
        return short.min;
    return cast(short) value;
}
221 
/// Clamps a signed 32-bit value into the unsigned 16-bit range [0, 65535].
ushort saturateSignedIntToUnsignedShort(int value) pure @safe
{
    if (value > ushort.max)
        return ushort.max;
    if (value < 0)
        return 0;
    return cast(ushort) value;
}
228 
unittest // test saturate operations
{
    // 16-bit -> signed 8-bit
    assert(saturateSignedWordToSignedByte(32000) == 127);
    assert(saturateSignedWordToSignedByte(-4000) == -128);

    // 16-bit -> unsigned 8-bit
    assert(saturateSignedWordToUnsignedByte(32000) == 255);
    assert(saturateSignedWordToUnsignedByte(-4000) == 0);

    // 32-bit -> signed 16-bit
    assert(saturateSignedIntToSignedShort(32768) == 32767);
    assert(saturateSignedIntToSignedShort(-32769) == -32768);

    // 32-bit -> unsigned 16-bit (32768 is in range and passes through)
    assert(saturateSignedIntToUnsignedShort(32768) == 32768);
    assert(saturateSignedIntToUnsignedShort(-32769) == 0);
}
240 
version(unittest)
{
    // Debugging aids for the tests: each helper prints the lanes of a vector
    // on one line of stdout through C `printf`.
    import core.stdc.stdio: printf;

    // Note: you can override `pure` within a `debug` clause

    /// Prints `v` as two 32-bit integers.
    void _mm_print_pi32(__m64 v) @trusted
    {
        int2 lanes = cast(int2)v;
        printf("%d %d\n", lanes[0], lanes[1]);
    }

    /// Prints `v` as four 16-bit integers.
    void _mm_print_pi16(__m64 v) @trusted
    {
        short4 lanes = cast(short4)v;
        printf("%d %d %d %d\n", lanes[0], lanes[1], lanes[2], lanes[3]);
    }

    /// Prints `v` as eight 8-bit integers.
    void _mm_print_pi8(__m64 v) @trusted
    {
        byte8 lanes = cast(byte8)v;
        printf("%d %d %d %d %d %d %d %d\n",
               lanes[0], lanes[1], lanes[2], lanes[3],
               lanes[4], lanes[5], lanes[6], lanes[7]);
    }

    /// Prints the four elements of `v`, indexed directly.
    void _mm_print_epi32(__m128i v) @trusted
    {
        printf("%d %d %d %d\n", v[0], v[1], v[2], v[3]);
    }

    /// Prints `v` as eight 16-bit integers.
    void _mm_print_epi16(__m128i v) @trusted
    {
        short8 lanes = cast(short8)v;
        printf("%d %d %d %d %d %d %d %d\n",
               lanes[0], lanes[1], lanes[2], lanes[3],
               lanes[4], lanes[5], lanes[6], lanes[7]);
    }

    /// Prints `v` as sixteen 8-bit integers.
    void _mm_print_epi8(__m128i v) @trusted
    {
        byte16 lanes = cast(byte16)v;
        printf("%d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d\n",
               lanes[0], lanes[1], lanes[2], lanes[3],
               lanes[4], lanes[5], lanes[6], lanes[7],
               lanes[8], lanes[9], lanes[10], lanes[11],
               lanes[12], lanes[13], lanes[14], lanes[15]);
    }

    /// Prints `v` as four floats.
    void _mm_print_ps(__m128 v) @trusted
    {
        float4 lanes = cast(float4)v;
        printf("%f %f %f %f\n", lanes[0], lanes[1], lanes[2], lanes[3]);
    }

    /// Prints `v` as two doubles.
    void _mm_print_pd(__m128d v) @trusted
    {
        double2 lanes = cast(double2)v;
        printf("%f %f\n", lanes[0], lanes[1]);
    }
}
300 
301 
302 //
303 //  <FLOATING-POINT COMPARISONS>
304 //
305 // Note: `ldc.simd` cannot express all nuances of FP comparisons, so we
306 //       need different IR generation.
307 
/// Floating-point comparison predicates. "Ordered" predicates require that
/// neither operand is NaN; "unordered" predicates also hold when either
/// operand is NaN.
enum FPComparison
{
    oeq,   // ordered and equal
    ogt,   // ordered and greater than
    oge,   // ordered and greater than or equal
    olt,   // ordered and less than
    ole,   // ordered and less than or equal
    one,   // ordered and not equal
    ord,   // ordered (no nans)
    ueq,   // unordered or equal
    ugt,   // unordered or greater than ("nle")
    uge,   // unordered or greater than or equal ("nlt")
    ult,   // unordered or less than ("nge")
    ule,   // unordered or less than or equal ("ngt")
    une,   // unordered or not equal ("neq")
    uno,   // unordered (either nans)
}

// Spelling of each predicate as spliced after `fcmp` in the inline IR below,
// indexed by the FPComparison value. Every spelling is the enum member's own
// name, so the table is built from the member list in declaration order.
private static immutable string[FPComparison.max+1] FPComparisonToString =
    [ __traits(allMembers, FPComparison) ];
343 
// Scalar reference comparison: evaluates `comparison` on one pair of values
// and returns the outcome as a bool (callers map it to -1/0 or 1/0 themselves).
// Useful for DMD and testing
private bool compareFloat(T)(FPComparison comparison, T a, T b) pure @safe
{
    import std.math: isNaN;

    if (isNaN(a) || isNaN(b))
    {
        // At least one NaN: every "ordered" predicate fails and every
        // "unordered or ..." predicate holds, whatever the other operand is.
        final switch(comparison) with(FPComparison)
        {
            case oeq, ogt, oge, olt, ole, one, ord:
                return false;
            case ueq, ugt, uge, ult, ule, une, uno:
                return true;
        }
    }

    // No NaN involved: ordered and unordered flavours agree on the relation.
    final switch(comparison) with(FPComparison)
    {
        case oeq, ueq: return a == b;
        case ogt, ugt: return a > b;
        case oge, uge: return a >= b;
        case olt, ult: return a < b;
        case ole, ule: return a <= b;
        case one, une: return a != b;
        case ord:      return true;
        case uno:      return false;
    }
}
368 
369 version(LDC)
370 {
    /// Provides packed float comparisons.
    /// The predicate name is spliced into an LLVM `fcmp` on `<4 x float>`,
    /// and the `<4 x i1>` outcome is sign-extended to `<4 x i32>`.
    /// Params:
    ///     comparison = predicate to apply (template argument).
    ///     a = left-hand operands.
    ///     b = right-hand operands.
    /// Returns: the sign-extended comparison result, one 32-bit lane per pair.
    package int4 cmpps(FPComparison comparison)(float4 a, float4 b) pure @safe
    {
        enum ir = `
            %cmp = fcmp `~ FPComparisonToString[comparison] ~` <4 x float> %0, %1
            %r = sext <4 x i1> %cmp to <4 x i32>
            ret <4 x i32> %r`;

        return LDCInlineIR!(ir, int4, float4, float4)(a, b);
    }
381 
    /// Provides packed double comparisons.
    /// The predicate name is spliced into an LLVM `fcmp` on `<2 x double>`,
    /// and the `<2 x i1>` outcome is sign-extended to `<2 x i64>`.
    /// Params:
    ///     comparison = predicate to apply (template argument).
    ///     a = left-hand operands.
    ///     b = right-hand operands.
    /// Returns: the sign-extended comparison result, one 64-bit lane per pair.
    package long2 cmppd(FPComparison comparison)(double2 a, double2 b) pure @safe
    {
        enum ir = `
            %cmp = fcmp `~ FPComparisonToString[comparison] ~` <2 x double> %0, %1
            %r = sext <2 x i1> %cmp to <2 x i64>
            ret <2 x i64> %r`;

        return LDCInlineIR!(ir, long2, double2, double2)(a, b);
    }
392 
    /// CMPSS-style comparisons
    /// clang implement it through x86 intrinsics, it is possible with IR alone
    /// but leads to less optimal code.
    /// PERF: try to implement it with __builtin_ia32_cmpss and immediate 0 to 7.
    /// Not that simple.
    ///
    /// Only element 0 of `a` and `b` is compared. The result is a copy of `a`
    /// whose element 0 is replaced by the bit pattern of the sign-extended
    /// comparison outcome.
    package float4 cmpss(FPComparison comparison)(float4 a, float4 b) pure @safe
    {
        /*
        Sketch of the builtin-based alternative mentioned in the PERF note
        (not compiled; FPComparisonToX86Predicate is not defined in this view):

        enum ubyte predicateNumber = FPComparisonToX86Predicate[comparison];
        enum bool invertOp = (predicateNumber & 0x80) != 0;
        static if(invertOp)
            return __builtin_ia32_cmpss(b, a, predicateNumber & 0x7f);
        else
            return __builtin_ia32_cmpss(a, b, predicateNumber & 0x7f);
        */
        enum ir = `
            %cmp = fcmp `~ FPComparisonToString[comparison] ~` float %0, %1
            %r = sext i1 %cmp to i32
            %r2 = bitcast i32 %r to float
            ret float %r2`;

        float4 r = a;   // elements 1..3 are carried over from `a`
        r[0] = LDCInlineIR!(ir, float, float, float)(a[0], b[0]);
        return r;
    }
418 
    /// CMPSD-style comparisons
    /// clang implement it through x86 intrinsics, it is possible with IR alone
    /// but leads to less optimal code.
    /// PERF: try to implement it with __builtin_ia32_cmpsd and immediate 0 to 7.
    /// Not that simple.
    ///
    /// Only element 0 of `a` and `b` is compared. The result is a copy of `a`
    /// whose element 0 is replaced by the bit pattern of the sign-extended
    /// comparison outcome.
    package double2 cmpsd(FPComparison comparison)(double2 a, double2 b) pure @safe
    {
        enum ir = `
            %cmp = fcmp `~ FPComparisonToString[comparison] ~` double %0, %1
            %r = sext i1 %cmp to i64
            %r2 = bitcast i64 %r to double
            ret double %r2`;

        double2 r = a;   // element 1 is carried over from `a`
        r[0] = LDCInlineIR!(ir, double, double, double)(a[0], b[0]);
        return r;
    }
436 
    // Note: ucomss and ucomsd are left unimplemented
    /// Compares element 0 of `a` and `b` with the given predicate.
    /// Returns: the `i1` outcome zero-extended to an `int`.
    package int comss(FPComparison comparison)(float4 a, float4 b) pure @safe
    {
        enum ir = `
            %cmp = fcmp `~ FPComparisonToString[comparison] ~` float %0, %1
            %r = zext i1 %cmp to i32
            ret i32 %r`;

        return LDCInlineIR!(ir, int, float, float)(a[0], b[0]);
    }
447 
    // Note: ucomss and ucomsd are left unimplemented
    /// Compares element 0 of `a` and `b` with the given predicate.
    /// Returns: the `i1` outcome zero-extended to an `int`.
    package int comsd(FPComparison comparison)(double2 a, double2 b) pure @safe
    {
        enum ir = `
            %cmp = fcmp `~ FPComparisonToString[comparison] ~` double %0, %1
            %r = zext i1 %cmp to i32
            ret i32 %r`;

        return LDCInlineIR!(ir, int, double, double)(a[0], b[0]);
    }
458 }
459 else
460 {
461     /// Provides packed float comparisons
462     package int4 cmpps(FPComparison comparison)(float4 a, float4 b) pure @safe
463     {
464         int4 result;
465         foreach(i; 0..4)
466         {
467             result[i] = compareFloat!float(comparison, a[i], b[i]) ? -1 : 0;
468         }
469         return result;
470     }
471 
472     /// Provides packed double comparisons
473     package long2 cmppd(FPComparison comparison)(double2 a, double2 b) pure @safe
474     {
475         long2 result;
476         foreach(i; 0..2)
477         {
478             result[i] = compareFloat!double(comparison, a[i], b[i]) ? -1 : 0;
479         }
480         return result;
481     }
482 
483     /// Provides CMPSS-style comparison
484     package float4 cmpss(FPComparison comparison)(float4 a, float4 b) pure @safe
485     {
486         int4 result = cast(int4)a;
487         result[0] = compareFloat!float(comparison, a[0], b[0]) ? -1 : 0;
488         return cast(float4)result;
489     }
490 
491     /// Provides CMPSD-style comparison
492     package double2 cmpsd(FPComparison comparison)(double2 a, double2 b) pure @safe
493     {
494         long2 result = cast(long2)a;
495         result[0] = compareFloat!double(comparison, a[0], b[0]) ? -1 : 0;
496         return cast(double2)result;
497     }
498 
499     package int comss(FPComparison comparison)(float4 a, float4 b) pure @safe
500     {
501         return compareFloat!float(comparison, a[0], b[0]) ? 1 : 0;
502     }
503 
504     // Note: ucomss and ucomsd are left unimplemented
505     package int comsd(FPComparison comparison)(double2 a, double2 b) pure @safe
506     {
507         return compareFloat!double(comparison, a[0], b[0]) ? 1 : 0;
508     }
509 }
unittest // cmpps
{
    // Applies one predicate to (lhs, rhs) and checks each of the four lanes.
    void expectLanes(FPComparison cmp)(float4 lhs, float4 rhs,
                                       int e0, int e1, int e2, int e3)
    {
        int4 mask = cmpps!cmp(lhs, rhs);
        assert(mask.array[0] == e0);
        assert(mask.array[1] == e1);
        assert(mask.array[2] == e2);
        assert(mask.array[3] == e3);
    }

    // Lanes cover: less-than, equal, greater-than, and a NaN operand.
    float4 A = [1, 3, 5, float.nan];
    float4 B = [2, 3, 4, 5];

    // Check all comparison type is working
    expectLanes!(FPComparison.oeq)(A, B,  0, -1,  0,  0);
    expectLanes!(FPComparison.ogt)(A, B,  0,  0, -1,  0);
    expectLanes!(FPComparison.oge)(A, B,  0, -1, -1,  0);
    expectLanes!(FPComparison.olt)(A, B, -1,  0,  0,  0);
    expectLanes!(FPComparison.ole)(A, B, -1, -1,  0,  0);
    expectLanes!(FPComparison.one)(A, B, -1,  0, -1,  0);
    expectLanes!(FPComparison.ord)(A, B, -1, -1, -1,  0);
    expectLanes!(FPComparison.ueq)(A, B,  0, -1,  0, -1);
    expectLanes!(FPComparison.ugt)(A, B,  0,  0, -1, -1);
    expectLanes!(FPComparison.uge)(A, B,  0, -1, -1, -1);
    expectLanes!(FPComparison.ult)(A, B, -1,  0,  0, -1);
    expectLanes!(FPComparison.ule)(A, B, -1, -1,  0, -1);
    expectLanes!(FPComparison.une)(A, B, -1,  0, -1, -1);
    expectLanes!(FPComparison.uno)(A, B,  0,  0,  0, -1);
}
unittest // cmppd
{
    double2 lhs = [1, 3];
    double2 rhs = [2, 3];
    long2 mask = cmppd!(FPComparison.ult)(lhs, rhs);

    // 1 < 2 holds, 3 < 3 does not
    assert(mask.array[0] == -1L);
    assert(mask.array[1] == 0L);
}
unittest // cmpss and comss
{
    // Checks cmpss and comss against the scalar reference `compareFloat`.
    void testComparison(FPComparison comparison)(float4 A, float4 B)
    {
        const int expected = compareFloat!float(comparison, A[0], B[0]) ? -1 : 0;

        // cmpss: element 0 carries the mask, the others come straight from A
        float4 result = cmpss!comparison(A, B);
        int4 iresult = cast(int4)result;
        assert(iresult[0] == expected);
        assert(result[1] == A[1]);
        assert(result[2] == A[2]);
        assert(result[3] == A[3]);

        // check comss
        const int comResult = comss!comparison(A, B);
        assert( (expected != 0) == (comResult != 0) );
    }

    float4 A = [1, 3, 5, 6];
    float4 B = [2, 3, 4, 5];
    float4 C = [float.nan, 3, 4, 5];

    // Check all comparison type is working: every predicate, in declaration
    // order, first against an ordinary operand and then against a NaN.
    foreach (memberName; __traits(allMembers, FPComparison))
    {
        enum FPComparison cmp = __traits(getMember, FPComparison, memberName);
        testComparison!cmp(A, B);
        testComparison!cmp(A, C);
    }
}
unittest // cmpsd and comsd
{
    // Checks cmpsd and comsd against the scalar reference `compareFloat`.
    void testComparison(FPComparison comparison)(double2 A, double2 B)
    {
        const long expected = compareFloat!double(comparison, A[0], B[0]) ? -1 : 0;

        // cmpsd: element 0 carries the mask, element 1 comes straight from A
        double2 result = cmpsd!comparison(A, B);
        long2 iresult = cast(long2)result;
        assert(iresult[0] == expected);
        assert(result[1] == A[1]);

        // check comsd
        const int comResult = comsd!comparison(A, B);
        assert( (expected != 0) == (comResult != 0) );
    }

    double2 A = [1, 3];
    double2 B = [2, 4];
    double2 C = [double.nan, 5];

    // Check all comparison type is working: every predicate, in declaration
    // order, first against an ordinary operand and then against a NaN.
    foreach (memberName; __traits(allMembers, FPComparison))
    {
        enum FPComparison cmp = __traits(getMember, FPComparison, memberName);
        testComparison!cmp(A, B);
        testComparison!cmp(A, C);
    }
}
669 
670 //
671 //  </FLOATING-POINT COMPARISONS>
672 //
673 
674 
/// Narrows a 128-bit integer vector to an `__m64` by keeping its first
/// 64-bit element (element 0 of the `long2` view).
__m64 to_m64(__m128i a) pure @safe
{
    long2 halves = cast(long2)a;
    long1 low;
    low[0] = halves[0];
    return low;
}
682 
/// Widens an `__m64` to a 128-bit integer vector: 64-bit element 0 receives
/// `a[0]`, element 1 is zero.
__m128i to_m128i(__m64 a) pure @safe
{
    long2 widened = [0, 0];
    widened[0] = a[0];
    return cast(__m128i)widened;
}