1 /**
2 * Copyright: Copyright Auburn Sounds 2016-2018, Stefanos Baziotis 2019.
3 * License:   $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
4 * Authors:   Guillaume Piolat
5 */
6 module inteli.internals;
7 
8 import inteli.types;
9 
10 // The only math functions needed for intel-intrinsics
11 public import core.math: sqrt; // since it's an intrinsics
12 public import std.math: abs; // `fabs` is broken with GCC 4.9.2 on Linux 64-bit
13 
14 
15 version(GNU)
16 {
17     version (X86)
18     {
19         // For 32-bit x86, disable vector extensions with GDC. 
20         // It just doesn't work well.
21         enum GDC_with_x86 = true;
22         enum GDC_with_MMX = false;
23         enum GDC_with_SSE = false;
24         enum GDC_with_SSE2 = false;
25         enum GDC_with_SSE3 = false;
26         enum LDC_with_ARM32 = false;
27         enum LDC_with_ARM64 = false;
28         enum LDC_with_SSE1 = false;
29         enum LDC_with_SSE2 = false;
30         enum LDC_with_SSE3 = false;
31     }
32     else version (X86_64)
33     {
34         // GDC support uses extended inline assembly:
35         //   https://gcc.gnu.org/onlinedocs/gcc/Extended-Asm.html        (general information and hints)
36         //   https://gcc.gnu.org/onlinedocs/gcc/Simple-Constraints.html  (binding variables to registers)
37         //   https://gcc.gnu.org/onlinedocs/gcc/Machine-Constraints.html (x86 specific register short names)
38 
39         public import core.simd;
40 
41         // NOTE: These intrinsics are not available in every i386 and x86_64 CPU.
42         // For more info: https://gcc.gnu.org/onlinedocs/gcc-4.9.2/gcc/X86-Built-in-Functions.html 
43         public import gcc.builtins;
44                 
45         enum GDC_with_x86 = true;
46         enum GDC_with_MMX = true; // We don't have a way to detect that at CT, but we assume it's there
47         enum GDC_with_SSE = true; // We don't have a way to detect that at CT, but we assume it's there
48         enum GDC_with_SSE2 = true; // We don't have a way to detect that at CT, but we assume it's there
49         enum GDC_with_SSE3 = false; // TODO: we don't have a way to detect that at CT
50         enum LDC_with_ARM32 = false;
51         enum LDC_with_ARM64 = false;
52         enum LDC_with_SSE1 = false;
53         enum LDC_with_SSE2 = false;
54         enum LDC_with_SSE3 = false;
55     }
56     else
57     {
58         enum GDC_with_x86 = false;
59         enum GDC_with_MMX = false;
60         enum GDC_with_SSE = false;
61         enum GDC_with_SSE2 = false;
62         enum GDC_with_SSE3 = false;
63         enum LDC_with_ARM32 = false;
64         enum LDC_with_ARM64 = false;
65         enum LDC_with_SSE1 = false;
66         enum LDC_with_SSE2 = false;
67         enum LDC_with_SSE3 = false;
68     }
69 }
70 else version(LDC)
71 {
72     public import core.simd;
73     public import ldc.simd;
74     public import ldc.intrinsics;
75     public import ldc.llvmasm: __asm;
76 
77     // Since LDC 1.13, using the new ldc.llvmasm.__ir variants instead of inlineIR
78     static if (__VERSION__ >= 2083)
79     {
80          import ldc.llvmasm;
81          alias LDCInlineIR = __ir_pure;
82 
83          // A version of inline IR with prefix/suffix didn't exist before LDC 1.13
84          alias LDCInlineIREx = __irEx_pure; 
85     }
86     else
87     {
88         alias LDCInlineIR = inlineIR;
89     }
90     
91     package(inteli)
92     {
93         enum GDC_with_x86 = false;
94         enum GDC_with_MMX = false;
95         enum GDC_with_SSE = false;
96         enum GDC_with_SSE2 = false;
97         enum GDC_with_SSE3 = false;
98     }
99 
100     version(ARM)
101     {
102         public import ldc.gccbuiltins_arm;
103         enum LDC_with_ARM32 = true;
104         enum LDC_with_ARM64 = false;
105         enum LDC_with_SSE1 = false;
106         enum LDC_with_SSE2 = false;
107         enum LDC_with_SSE3 = false;
108     }
109     else version(AArch64)
110     {
111         //public import ldc.gccbuiltins_arm;
112         enum LDC_with_ARM32 = false;
113         enum LDC_with_ARM64 = true;
114         enum LDC_with_SSE1 = false;
115         enum LDC_with_SSE2 = false;
116         enum LDC_with_SSE3 = false;
117     }
118     else
119     {
120         public import ldc.gccbuiltins_x86;
121         enum LDC_with_ARM32 = false;
122         enum LDC_with_ARM64 = false;
123         enum LDC_with_SSE1 = __traits(targetHasFeature, "sse");
124         enum LDC_with_SSE2 = __traits(targetHasFeature, "sse2");
125         enum LDC_with_SSE3 = __traits(targetHasFeature, "sse3");
126     }
127 }
128 else version(DigitalMars)
129 {
130     package(inteli)
131     {
132         enum GDC_with_x86 = false;
133         enum GDC_with_MMX = false;
134         enum GDC_with_SSE = false;
135         enum GDC_with_SSE2 = false;
136         enum GDC_with_SSE3 = false;
137         enum LDC_with_ARM32 = false;
138         enum LDC_with_ARM64 = false;
139         enum LDC_with_SSE1 = false;
140         enum LDC_with_SSE2 = false;
141         enum LDC_with_SSE3 = false;
142     }
143 }
144 else
145 {
146     static assert(false, "Unknown compiler");
147 }
148 
149 enum LDC_with_ARM = LDC_with_ARM32 | LDC_with_ARM64; // ARM32 is largely unsupported though
150 
151 static if (LDC_with_ARM32)
152 {
153     package uint arm_get_fpcr() nothrow @nogc @trusted
154     {
155         return __builtin_arm_get_fpscr();
156     }
157 
158     package void arm_set_fpcr(uint cw) nothrow @nogc @trusted
159     {
160         __builtin_arm_set_fpscr(cw);
161     }
162 }
163 
164 static if (LDC_with_ARM64)
165 {
166     package uint arm_get_fpcr() pure nothrow @nogc @trusted
167     {
168         return __asm!uint("mrs $0, fpcr", "=r");
169     }
170 
171     package void arm_set_fpcr(uint cw) nothrow @nogc @trusted
172     {
173         __asm!void("ldr w2, $0 \n msr fpcr, x2", "m", cw);
174     }
175 }
176 
177 version(DigitalMars)
178 {
179     version(D_InlineAsm_X86)
180         enum DMD_with_asm = true;
181     else version(D_InlineAsm_X86_64)
182         enum DMD_with_asm = true;
183     else
184         enum DMD_with_asm = false;
185 
186     version(D_InlineAsm_X86)
187         enum DMD_with_32bit_asm = DMD_with_asm; // sometimes you want a 32-bit DMD only solution
188     else
189         enum DMD_with_32bit_asm = false;
190 }
191 else
192 {
193     enum DMD_with_asm = false;
194     enum DMD_with_32bit_asm = false;
195 }
196 
197 
198 package:
199 nothrow @nogc:
200 
201 
202 // For internal use only, since public API deals with a x86 semantic emulation
203 enum uint _MM_ROUND_NEAREST_ARM     = 0x00000000;
204 enum uint _MM_ROUND_DOWN_ARM        = 0x00800000;
205 enum uint _MM_ROUND_UP_ARM          = 0x00400000;
206 enum uint _MM_ROUND_TOWARD_ZERO_ARM = 0x00C00000;
207 enum uint _MM_ROUND_MASK_ARM        = 0x00C00000;
208 enum uint _MM_FLUSH_ZERO_MASK_ARM = 0x01000000;
209 
210 
211 //
212 //  <ROUNDING>
213 //
214 //  Why is that there? For DMD, we cannot use rint because _MM_SET_ROUNDING_MODE
215 //  doesn't change the FPU rounding mode, and isn't expected to do so.
216 //  So we devised these rounding function to help having consistent rouding between 
217 //  LDC and DMD. It's important that DMD uses what is in MXCST to round.
218 //
219 //  Note: There is no MXCSR in ARM. But there is fpscr that implements similar 
220 //  functionality the same.
221 //  https://developer.arm.com/documentation/dui0068/b/vector-floating-point-programming/vfp-system-registers/fpscr--the-floating-point-status-and-control-register
222 //  There is no
223 //  We use fpscr since it's thread-local, so we can emulate those x86 conversion albeit slowly.
224 
225 int convertFloatToInt32UsingMXCSR(float value) @trusted
226 {
227     int result;
228     version(GNU)
229     {
230         asm pure nothrow @nogc @trusted
231         {
232             "cvtss2si %1, %0\n": "=r"(result) : "x" (value);
233         }
234     }
235     else static if (LDC_with_ARM32)
236     {
237         result = __asm!int(`vldr s2, $1
238                             vcvtr.s32.f32 s2, s2
239                             vmov $0, s2`, "=r,m", value);
240     }
241     else static if (LDC_with_ARM64)
242     {
243         // Get current rounding mode.
244         uint fpscr = arm_get_fpcr();
245 
246         switch(fpscr & _MM_ROUND_MASK_ARM)
247         {
248             default:
249             case _MM_ROUND_NEAREST_ARM:
250                 result = __asm!int(`ldr s2, $1
251                                     fcvtns $0,s2`, "=r,m", value);
252                 break;
253             case _MM_ROUND_DOWN_ARM:
254                 result = __asm!int(`ldr s2, $1
255                                     fcvtms $0,s2`, "=r,m", value);
256                 break;
257             case _MM_ROUND_UP_ARM:
258                 result = __asm!int(`ldr s2, $1
259                                     fcvtps $0,s2`, "=r,m", value);
260                 break;
261             case _MM_ROUND_TOWARD_ZERO_ARM:
262                 result = cast(int)value;
263                 break;
264         }
265     }
266     else
267     {        
268         asm pure nothrow @nogc @trusted
269         {
270             cvtss2si EAX, value;
271             mov result, EAX;
272         }
273     }
274     return result;
275 }
276 
277 int convertDoubleToInt32UsingMXCSR(double value) @trusted
278 {
279     int result;
280     version(GNU)
281     {
282         asm pure nothrow @nogc @trusted
283         {
284             "cvtsd2si %1, %0\n": "=r"(result) : "x" (value);
285         }
286     }
287     else static if (LDC_with_ARM32)
288     {
289         result = __asm!int(`vldr d2, $1
290                             vcvtr.s32.f64 s2, d2
291                             vmov $0, s2`, "=r,m", value);
292     }
293     else static if (LDC_with_ARM64)
294     {
295         // Get current rounding mode.
296         uint fpscr = arm_get_fpcr();
297 
298         switch(fpscr & _MM_ROUND_MASK_ARM)
299         {
300             default:
301             case _MM_ROUND_NEAREST_ARM:
302                 result = __asm!int(`ldr d2, $1
303                                     fcvtns $0,d2`, "=r,m", value);
304                 break;
305             case _MM_ROUND_DOWN_ARM:
306                 result = __asm!int(`ldr d2, $1
307                                     fcvtms $0,d2`, "=r,m", value);
308                 break;
309             case _MM_ROUND_UP_ARM:
310                 result = __asm!int(`ldr d2, $1
311                                     fcvtps $0,d2`, "=r,m", value);
312                 break;
313             case _MM_ROUND_TOWARD_ZERO_ARM:
314                 result = cast(int)value;
315                 break;
316         }
317     }
318     else
319     {
320         asm pure nothrow @nogc @trusted
321         {
322             cvtsd2si EAX, value;
323             mov result, EAX;
324         }
325     }
326     return result;
327 }
328 
329 long convertFloatToInt64UsingMXCSR(float value) @trusted
330 {
331     static if (LDC_with_ARM32)
332     {
333         // We have to resort to libc since 32-bit ARM 
334         // doesn't seem to have 64-bit registers.
335         
336         uint fpscr = arm_get_fpcr(); // Get current rounding mode.
337 
338         // Note: converting to double precision else rounding could be different for large integers
339         double asDouble = value; 
340 
341         switch(fpscr & _MM_ROUND_MASK_ARM)
342         {
343             default:
344             case _MM_ROUND_NEAREST_ARM:     return cast(long)(llvm_round(asDouble));
345             case _MM_ROUND_DOWN_ARM:        return cast(long)(llvm_floor(asDouble));
346             case _MM_ROUND_UP_ARM:          return cast(long)(llvm_ceil(asDouble));
347             case _MM_ROUND_TOWARD_ZERO_ARM: return cast(long)(asDouble);
348         }
349     }
350     else static if (LDC_with_ARM64)
351     {
352         uint fpscr = arm_get_fpcr();
353 
354         switch(fpscr & _MM_ROUND_MASK_ARM)
355         {
356             default:
357             case _MM_ROUND_NEAREST_ARM:
358                 return __asm!long(`ldr s2, $1
359                                    fcvtns $0,s2`, "=r,m", value);
360             case _MM_ROUND_DOWN_ARM:
361                 return __asm!long(`ldr s2, $1
362                                    fcvtms $0,s2`, "=r,m", value);
363             case _MM_ROUND_UP_ARM:
364                 return __asm!long(`ldr s2, $1
365                                    fcvtps $0,s2`, "=r,m", value);
366             case _MM_ROUND_TOWARD_ZERO_ARM:
367                 return cast(long)value;
368         }
369     }
370     // 64-bit can use an SSE instruction
371     else version(D_InlineAsm_X86_64)
372     {
373         long result;
374         version(LDC) // work-around for " Data definition directives inside inline asm are not supported yet."
375         {
376             asm pure nothrow @nogc @trusted
377             {
378                 movss XMM0, value;
379                 cvtss2si RAX, XMM0;
380                 mov result, RAX;
381             }
382         }
383         else
384         {
385             asm pure nothrow @nogc @trusted
386             {
387                 movss XMM0, value;
388                 db 0xf3; db 0x48; db 0x0f; db 0x2d; db 0xc0; // cvtss2si RAX, XMM0 (DMD refuses to emit)
389                 mov result, RAX;
390             }
391         }
392         return result;
393     }
394     else version(D_InlineAsm_X86)
395     {
396         // In the case of 32-bit x86 there is no SSE2 way to convert FP to 64-bit int
397         // This leads to an unfortunate FPU sequence in every C++ compiler.
398         // See: https://godbolt.org/z/vZym77
399 
400         // Get current MXCSR rounding
401         uint sseRounding;
402         ushort savedFPUCW;
403         ushort newFPUCW;
404         long result;
405         asm pure nothrow @nogc @trusted
406         {
407             stmxcsr sseRounding;
408             fld value;
409             fnstcw savedFPUCW;
410             mov AX, savedFPUCW;
411             and AX, 0xf3ff;          // clear FPU rounding bits
412             movzx ECX, word ptr sseRounding;
413             and ECX, 0x6000;         // only keep SSE rounding bits
414             shr ECX, 3;
415             or AX, CX;               // make a new control word for FPU with SSE bits
416             mov newFPUCW, AX;
417             fldcw newFPUCW;
418             fistp qword ptr result;            // convert, respecting MXCSR (but not other control word things)
419             fldcw savedFPUCW;
420         }
421         return result;
422     }
423     else static if (GDC_with_x86)
424     {
425         version(X86_64) // 64-bit can just use the right instruction
426         {
427             static assert(GDC_with_SSE);
428             __m128 A;
429             A.ptr[0] = value;
430             return __builtin_ia32_cvtss2si64 (A);
431         }
432         else version(X86) // 32-bit
433         {
434             // This is untested!
435             uint sseRounding;
436             ushort savedFPUCW;
437             ushort newFPUCW;
438             long result;
439             asm pure nothrow @nogc @trusted
440             {
441                 "stmxcsr %1;\n" ~
442                 "fld %2;\n" ~
443                 "fnstcw %3;\n" ~
444                 "movw %3, %%ax;\n" ~
445                 "andw $0xf3ff, %%ax;\n" ~
446                 "movzwl %1, %%ecx;\n" ~
447                 "andl $0x6000, %%ecx;\n" ~
448                 "shrl $3, %%ecx;\n" ~
449                 "orw %%cx, %%ax\n" ~
450                 "movw %%ax, %4;\n" ~
451                 "fldcw %4;\n" ~
452                 "fistpll %0;\n" ~
453                 "fldcw %3;\n" 
454                   : "=m"(result)    // %0
455                   : "m" (sseRounding),
456                     "f" (value),
457                     "m" (savedFPUCW),
458                     "m" (newFPUCW) 
459                   : "eax", "ecx", "st";
460             }
461             return result;
462         }
463         else
464             static assert(false);
465     }
466     else
467         static assert(false);
468 }
469 
470 
471 ///ditto
472 long convertDoubleToInt64UsingMXCSR(double value) @trusted
473 {
474     static if (LDC_with_ARM32)
475     {
476         // We have to resort to libc since 32-bit ARM 
477         // doesn't seem to have 64-bit registers.
478         uint fpscr = arm_get_fpcr(); // Get current rounding mode.
479         switch(fpscr & _MM_ROUND_MASK_ARM)
480         {
481             default:
482             case _MM_ROUND_NEAREST_ARM:     return cast(long)(llvm_round(value));
483             case _MM_ROUND_DOWN_ARM:        return cast(long)(llvm_floor(value));
484             case _MM_ROUND_UP_ARM:          return cast(long)(llvm_ceil(value));
485             case _MM_ROUND_TOWARD_ZERO_ARM: return cast(long)(value);
486         }
487     }
488     else static if (LDC_with_ARM64)
489     {
490         // Get current rounding mode.
491         uint fpscr = arm_get_fpcr();
492 
493         switch(fpscr & _MM_ROUND_MASK_ARM)
494         {
495             default:
496             case _MM_ROUND_NEAREST_ARM:
497                 return __asm!long(`ldr d2, $1
498                                    fcvtns $0,d2`, "=r,m", value);
499             case _MM_ROUND_DOWN_ARM:
500                 return __asm!long(`ldr d2, $1
501                                    fcvtms $0,d2`, "=r,m", value);
502             case _MM_ROUND_UP_ARM:
503                 return __asm!long(`ldr d2, $1
504                                    fcvtps $0,d2`, "=r,m", value);
505             case _MM_ROUND_TOWARD_ZERO_ARM:
506                 return cast(long)value;
507         }
508     }
509     // 64-bit can use an SSE instruction
510     else version(D_InlineAsm_X86_64)
511     {
512         long result;
513         version(LDC) // work-around for "Data definition directives inside inline asm are not supported yet."
514         {
515             asm pure nothrow @nogc @trusted
516             {
517                 movsd XMM0, value;
518                 cvtsd2si RAX, XMM0;
519                 mov result, RAX;
520             }
521         }
522         else
523         {
524             asm pure nothrow @nogc @trusted
525             {
526                 movsd XMM0, value;
527                 db 0xf2; db 0x48; db 0x0f; db 0x2d; db 0xc0; // cvtsd2si RAX, XMM0 (DMD refuses to emit)
528                 mov result, RAX;
529             }
530         }
531         return result;
532     }
533     else version(D_InlineAsm_X86)
534     {
535         // In the case of 32-bit x86 there is no SSE2 way to convert FP to 64-bit int
536         // This leads to an unfortunate FPU sequence in every C++ compiler.
537         // See: https://godbolt.org/z/vZym77
538 
539         // Get current MXCSR rounding
540         uint sseRounding;
541         ushort savedFPUCW;
542         ushort newFPUCW;
543         long result;
544         asm pure nothrow @nogc @trusted
545         {
546             stmxcsr sseRounding;
547             fld value;
548             fnstcw savedFPUCW;
549             mov AX, savedFPUCW;
550             and AX, 0xf3ff;
551             movzx ECX, word ptr sseRounding;
552             and ECX, 0x6000;
553             shr ECX, 3;
554             or AX, CX;
555             mov newFPUCW, AX;
556             fldcw newFPUCW;
557             fistp result;
558             fldcw savedFPUCW;
559         }
560         return result;
561     }
562     else static if (GDC_with_x86)
563     {
564         version(X86_64)
565         {
566             static assert(GDC_with_SSE2);
567             __m128d A;
568             A.ptr[0] = value;
569             return __builtin_ia32_cvtsd2si64 (A);
570         }
571         else
572         {
573             // This is untested!
574             uint sseRounding;
575             ushort savedFPUCW;
576             ushort newFPUCW;
577             long result;
578             asm pure nothrow @nogc @trusted
579             {
580                 "stmxcsr %1;\n" ~
581                 "fld %2;\n" ~
582                 "fnstcw %3;\n" ~
583                 "movw %3, %%ax;\n" ~
584                 "andw $0xf3ff, %%ax;\n" ~
585                 "movzwl %1, %%ecx;\n" ~
586                 "andl $0x6000, %%ecx;\n" ~
587                 "shrl $3, %%ecx;\n" ~
588                 "orw %%cx, %%ax\n" ~
589                 "movw %%ax, %4;\n" ~
590                 "fldcw %4;\n" ~
591                 "fistpll %0;\n" ~
592                 "fldcw %3;\n"         
593                   : "=m"(result)    // %0
594                   : "m" (sseRounding),
595                     "t" (value),
596                     "m" (savedFPUCW),
597                     "m" (newFPUCW) 
598                   : "eax", "ecx", "st";
599             }
600             return result;
601         }
602     }
603     else
604         static assert(false);
605 }
606 
607 //
608 //  </ROUNDING>
609 //
610 
611 
612 // using the Intel terminology here
613 
614 byte saturateSignedWordToSignedByte(short value) pure @safe
615 {
616     if (value > 127) value = 127;
617     if (value < -128) value = -128;
618     return cast(byte) value;
619 }
620 
621 ubyte saturateSignedWordToUnsignedByte(short value) pure @safe
622 {
623     if (value > 255) value = 255;
624     if (value < 0) value = 0;
625     return cast(ubyte) value;
626 }
627 
628 short saturateSignedIntToSignedShort(int value) pure @safe
629 {
630     if (value > 32767) value = 32767;
631     if (value < -32768) value = -32768;
632     return cast(short) value;
633 }
634 
635 ushort saturateSignedIntToUnsignedShort(int value) pure @safe
636 {
637     if (value > 65535) value = 65535;
638     if (value < 0) value = 0;
639     return cast(ushort) value;
640 }
641 
642 unittest // test saturate operations
643 {
644     assert( saturateSignedWordToSignedByte(32000) == 127);
645     assert( saturateSignedWordToUnsignedByte(32000) == 255);
646     assert( saturateSignedWordToSignedByte(-4000) == -128);
647     assert( saturateSignedWordToUnsignedByte(-4000) == 0);
648     assert( saturateSignedIntToSignedShort(32768) == 32767);
649     assert( saturateSignedIntToUnsignedShort(32768) == 32768);
650     assert( saturateSignedIntToSignedShort(-32769) == -32768);
651     assert( saturateSignedIntToUnsignedShort(-32769) == 0);
652 }
653 
654 version(unittest)
655 {
656     // This is just for debugging tests
657     import core.stdc.stdio: printf;
658 
659     // printing vectors for implementation
660     // Note: you can override `pure` within a `debug` clause
661 
662     void _mm_print_pi64(__m64 v) @trusted
663     {
664         long1 vl = cast(long1)v;
665         printf("%lld\n", vl.array[0]);
666     }
667 
668     void _mm_print_pi32(__m64 v) @trusted
669     {
670         int[2] C = (cast(int2)v).array;
671         printf("%d %d\n", C[0], C[1]);
672     }
673 
674     void _mm_print_pi16(__m64 v) @trusted
675     {
676         short[4] C = (cast(short4)v).array;
677         printf("%d %d %d %d\n", C[0], C[1], C[2], C[3]);
678     }
679 
680     void _mm_print_pi8(__m64 v) @trusted
681     {
682         byte[8] C = (cast(byte8)v).array;
683         printf("%d %d %d %d %d %d %d %d\n",
684         C[0], C[1], C[2], C[3], C[4], C[5], C[6], C[7]);
685     }
686 
687     void _mm_print_epi64(__m128i v) @trusted
688     {
689         long2 vl = cast(long2)v;
690         printf("%lld %lld\n", vl.array[0], vl.array[1]);
691     }
692 
693     void _mm_print_epi32(__m128i v) @trusted
694     {
695         printf("%d %d %d %d\n",
696               v.array[0], v.array[1], v.array[2], v.array[3]);
697     }  
698 
699     void _mm_print_epi16(__m128i v) @trusted
700     {
701         short[8] C = (cast(short8)v).array;
702         printf("%d %d %d %d %d %d %d %d\n",
703         C[0], C[1], C[2], C[3], C[4], C[5], C[6], C[7]);
704     }
705 
706     void _mm_print_epi8(__m128i v) @trusted
707     {
708         byte[16] C = (cast(byte16)v).array;
709         printf("%d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d\n",
710         C[0], C[1], C[2], C[3], C[4], C[5], C[6], C[7], C[8], C[9], C[10], C[11], C[12], C[13], C[14], C[15]);
711     }
712 
713     void _mm_print_ps(__m128 v) @trusted
714     {
715         float[4] C = (cast(float4)v).array;
716         printf("%f %f %f %f\n", C[0], C[1], C[2], C[3]);
717     }
718 
719     void _mm_print_pd(__m128d v) @trusted
720     {
721         double[2] C = (cast(double2)v).array;
722         printf("%f %f\n", C[0], C[1]);
723     }    
724 }
725 
726 
727 //
728 //  <FLOATING-POINT COMPARISONS>
729 //
730 // Note: `ldc.simd` cannot express all nuances of FP comparisons, so we
731 //       need different IR generation.
732 
733 enum FPComparison
734 {
735     oeq,   // ordered and equal
736     ogt,   // ordered and greater than
737     oge,   // ordered and greater than or equal
738     olt,   // ordered and less than
739     ole,   // ordered and less than or equal
740     one,   // ordered and not equal
741     ord,   // ordered (no nans)
742     ueq,   // unordered or equal
743     ugt,   // unordered or greater than ("nle")
744     uge,   // unordered or greater than or equal ("nlt")
745     ult,   // unordered or less than ("nge")
746     ule,   // unordered or less than or equal ("ngt")
747     une,   // unordered or not equal ("neq")
748     uno,   // unordered (either nans)
749 }
750 
751 private static immutable string[FPComparison.max+1] FPComparisonToString =
752 [
753     "oeq",
754     "ogt",
755     "oge",
756     "olt",
757     "ole",
758     "one",
759     "ord",
760     "ueq",
761     "ugt",
762     "uge",
763     "ult",
764     "ule",
765     "une",
766     "uno",
767 ];
768 
769 // Individual float comparison: returns -1 for true or 0 for false.
770 // Useful for DMD and testing
771 private bool compareFloat(T)(FPComparison comparison, T a, T b) pure @safe
772 {
773     import std.math;
774     bool unordered = isNaN(a) || isNaN(b);
775     final switch(comparison) with(FPComparison)
776     {
777         case oeq: return a == b;
778         case ogt: return a > b;
779         case oge: return a >= b;
780         case olt: return a < b;
781         case ole: return a <= b;
782         case one: return !unordered && (a != b); // NaN with != always yields true
783         case ord: return !unordered; 
784         case ueq: return unordered || (a == b);
785         case ugt: return unordered || (a > b);
786         case uge: return unordered || (a >= b);
787         case ult: return unordered || (a < b);
788         case ule: return unordered || (a <= b);
789         case une: return (a != b); // NaN with != always yields true
790         case uno: return unordered;
791     }
792 }
793 
794 version(LDC)
795 {
796     /// Provides packed float comparisons
797     package int4 cmpps(FPComparison comparison)(float4 a, float4 b) pure @safe
798     {
799         enum ir = `
800             %cmp = fcmp `~ FPComparisonToString[comparison] ~` <4 x float> %0, %1
801             %r = sext <4 x i1> %cmp to <4 x i32>
802             ret <4 x i32> %r`;
803 
804         return LDCInlineIR!(ir, int4, float4, float4)(a, b);
805     }
806 
807     /// Provides packed double comparisons
808     package long2 cmppd(FPComparison comparison)(double2 a, double2 b) pure @safe
809     {
810         enum ir = `
811             %cmp = fcmp `~ FPComparisonToString[comparison] ~` <2 x double> %0, %1
812             %r = sext <2 x i1> %cmp to <2 x i64>
813             ret <2 x i64> %r`;
814 
815         return LDCInlineIR!(ir, long2, double2, double2)(a, b);
816     }
817 
818     /// CMPSS-style comparisons
819     /// clang implement it through x86 intrinsics, it is possible with IR alone
820     /// but leads to less optimal code.
821     /// PERF: try to implement it with __builtin_ia32_cmpss and immediate 0 to 7. 
822     /// Not that simple.
823     package float4 cmpss(FPComparison comparison)(float4 a, float4 b) pure @safe
824     {
825         /*
826         enum ubyte predicateNumber = FPComparisonToX86Predicate[comparison];
827         enum bool invertOp = (predicateNumber & 0x80) != 0;
828         static if(invertOp)
829             return __builtin_ia32_cmpsd(b, a, predicateNumber & 0x7f);
830         else
831             return __builtin_ia32_cmpsd(a, b, predicateNumber & 0x7f);
832         */
833         enum ir = `
834             %cmp = fcmp `~ FPComparisonToString[comparison] ~` float %0, %1
835             %r = sext i1 %cmp to i32
836             %r2 = bitcast i32 %r to float
837             ret float %r2`;
838 
839         float4 r = a;
840         r[0] = LDCInlineIR!(ir, float, float, float)(a[0], b[0]);
841         return r;
842     }
843 
844     /// CMPSD-style comparisons
845     /// clang implement it through x86 intrinsics, it is possible with IR alone
846     /// but leads to less optimal code.
847     /// PERF: try to implement it with __builtin_ia32_cmpsd and immediate 0 to 7. 
848     /// Not that simple.    
849     package double2 cmpsd(FPComparison comparison)(double2 a, double2 b) pure @safe
850     {
851         enum ir = `
852             %cmp = fcmp `~ FPComparisonToString[comparison] ~` double %0, %1
853             %r = sext i1 %cmp to i64
854             %r2 = bitcast i64 %r to double
855             ret double %r2`;
856 
857         double2 r = a;
858         r[0] = LDCInlineIR!(ir, double, double, double)(a[0], b[0]);
859         return r;
860     }
861 
862     // Note: ucomss and ucomsd are left unimplemented
863     package int comss(FPComparison comparison)(float4 a, float4 b) pure @safe
864     {
865         enum ir = `
866             %cmp = fcmp `~ FPComparisonToString[comparison] ~` float %0, %1
867             %r = zext i1 %cmp to i32
868             ret i32 %r`;
869 
870         return LDCInlineIR!(ir, int, float, float)(a[0], b[0]);
871     }
872 
873     // Note: ucomss and ucomsd are left unimplemented
874     package int comsd(FPComparison comparison)(double2 a, double2 b) pure @safe
875     {
876         enum ir = `
877             %cmp = fcmp `~ FPComparisonToString[comparison] ~` double %0, %1
878             %r = zext i1 %cmp to i32
879             ret i32 %r`;
880 
881         return LDCInlineIR!(ir, int, double, double)(a[0], b[0]);
882     }
883 }
884 else
885 {
886     /// Provides packed float comparisons
887     package int4 cmpps(FPComparison comparison)(float4 a, float4 b) pure @trusted
888     {
889         int4 result;
890         foreach(i; 0..4)
891         {
892             result.ptr[i] = compareFloat!float(comparison, a.array[i], b.array[i]) ? -1 : 0;
893         }
894         return result;
895     }
896 
897     /// Provides packed double comparisons
898     package long2 cmppd(FPComparison comparison)(double2 a, double2 b) pure @trusted
899     {
900         long2 result;
901         foreach(i; 0..2)
902         {
903             result.ptr[i] = compareFloat!double(comparison, a.array[i], b.array[i]) ? -1 : 0;
904         }
905         return result;
906     }
907 
908     /// Provides CMPSS-style comparison
909     package float4 cmpss(FPComparison comparison)(float4 a, float4 b) pure @trusted
910     {
911         int4 result = cast(int4)a;
912         result.ptr[0] = compareFloat!float(comparison, a.array[0], b.array[0]) ? -1 : 0;
913         return cast(float4)result;
914     }
915 
916     /// Provides CMPSD-style comparison
917     package double2 cmpsd(FPComparison comparison)(double2 a, double2 b) pure @trusted
918     {
919         long2 result = cast(long2)a;
920         result.ptr[0] = compareFloat!double(comparison, a.array[0], b.array[0]) ? -1 : 0;
921         return cast(double2)result;
922     }
923 
924     package int comss(FPComparison comparison)(float4 a, float4 b) pure @safe
925     {
926         return compareFloat!float(comparison, a.array[0], b.array[0]) ? 1 : 0;
927     }
928 
929     // Note: ucomss and ucomsd are left unimplemented
930     package int comsd(FPComparison comparison)(double2 a, double2 b) pure @safe
931     {
932         return compareFloat!double(comparison, a.array[0], b.array[0]) ? 1 : 0;
933     }
934 }
935 unittest // cmpps
936 {
937     // Check all comparison type is working
938     float4 A = [1, 3, 5, float.nan];
939     float4 B = [2, 3, 4, 5];
940 
941     int4 result_oeq = cmpps!(FPComparison.oeq)(A, B);
942     int4 result_ogt = cmpps!(FPComparison.ogt)(A, B);
943     int4 result_oge = cmpps!(FPComparison.oge)(A, B);
944     int4 result_olt = cmpps!(FPComparison.olt)(A, B);
945     int4 result_ole = cmpps!(FPComparison.ole)(A, B);
946     int4 result_one = cmpps!(FPComparison.one)(A, B);
947     int4 result_ord = cmpps!(FPComparison.ord)(A, B);
948     int4 result_ueq = cmpps!(FPComparison.ueq)(A, B);
949     int4 result_ugt = cmpps!(FPComparison.ugt)(A, B);
950     int4 result_uge = cmpps!(FPComparison.uge)(A, B);
951     int4 result_ult = cmpps!(FPComparison.ult)(A, B);
952     int4 result_ule = cmpps!(FPComparison.ule)(A, B);
953     int4 result_une = cmpps!(FPComparison.une)(A, B);
954     int4 result_uno = cmpps!(FPComparison.uno)(A, B);
955 
956     static immutable int[4] correct_oeq    = [ 0,-1, 0, 0];
957     static immutable int[4] correct_ogt    = [ 0, 0,-1, 0];
958     static immutable int[4] correct_oge    = [ 0,-1,-1, 0];
959     static immutable int[4] correct_olt    = [-1, 0, 0, 0];
960     static immutable int[4] correct_ole    = [-1,-1, 0, 0];
961     static immutable int[4] correct_one    = [-1, 0,-1, 0];
962     static immutable int[4] correct_ord    = [-1,-1,-1, 0];
963     static immutable int[4] correct_ueq    = [ 0,-1, 0,-1];
964     static immutable int[4] correct_ugt    = [ 0, 0,-1,-1];
965     static immutable int[4] correct_uge    = [ 0,-1,-1,-1];
966     static immutable int[4] correct_ult    = [-1, 0, 0,-1];
967     static immutable int[4] correct_ule    = [-1,-1, 0,-1];
968     static immutable int[4] correct_une    = [-1, 0,-1,-1];
969     static immutable int[4] correct_uno    = [ 0, 0, 0,-1];
970 
971     assert(result_oeq.array == correct_oeq);
972     assert(result_ogt.array == correct_ogt);
973     assert(result_oge.array == correct_oge);
974     assert(result_olt.array == correct_olt);
975     assert(result_ole.array == correct_ole);
976     assert(result_one.array == correct_one);
977     assert(result_ord.array == correct_ord);
978     assert(result_ueq.array == correct_ueq);
979     assert(result_ugt.array == correct_ugt);
980     assert(result_uge.array == correct_uge);
981     assert(result_ult.array == correct_ult);
982     assert(result_ule.array == correct_ule);
983     assert(result_une.array == correct_une);
984     assert(result_uno.array == correct_uno);
985 }
986 unittest
987 {
988     double2 a = [1, 3];
989     double2 b = [2, 3];
990     long2 c = cmppd!(FPComparison.ult)(a, b);
991     static immutable long[2] correct = [cast(long)(-1), 0];
992     assert(c.array == correct);
993 }
994 unittest // cmpss and comss
995 {
996     void testComparison(FPComparison comparison)(float4 A, float4 B)
997     {
998         float4 result = cmpss!comparison(A, B);
999         int4 iresult = cast(int4)result;
1000         int expected = compareFloat!float(comparison, A.array[0], B.array[0]) ? -1 : 0;
1001         assert(iresult.array[0] == expected);
1002         assert(result.array[1] == A.array[1]);
1003         assert(result.array[2] == A.array[2]);
1004         assert(result.array[3] == A.array[3]);
1005 
1006         // check comss
1007         int comResult = comss!comparison(A, B);
1008         assert( (expected != 0) == (comResult != 0) );
1009     }
1010 
1011     // Check all comparison type is working
1012     float4 A = [1, 3, 5, 6];
1013     float4 B = [2, 3, 4, 5];
1014     float4 C = [float.nan, 3, 4, 5];
1015 
1016     testComparison!(FPComparison.oeq)(A, B);
1017     testComparison!(FPComparison.oeq)(A, C);
1018     testComparison!(FPComparison.ogt)(A, B);
1019     testComparison!(FPComparison.ogt)(A, C);
1020     testComparison!(FPComparison.oge)(A, B);
1021     testComparison!(FPComparison.oge)(A, C);
1022     testComparison!(FPComparison.olt)(A, B);
1023     testComparison!(FPComparison.olt)(A, C);
1024     testComparison!(FPComparison.ole)(A, B);
1025     testComparison!(FPComparison.ole)(A, C);
1026     testComparison!(FPComparison.one)(A, B);
1027     testComparison!(FPComparison.one)(A, C);
1028     testComparison!(FPComparison.ord)(A, B);
1029     testComparison!(FPComparison.ord)(A, C);
1030     testComparison!(FPComparison.ueq)(A, B);
1031     testComparison!(FPComparison.ueq)(A, C);
1032     testComparison!(FPComparison.ugt)(A, B);
1033     testComparison!(FPComparison.ugt)(A, C);
1034     testComparison!(FPComparison.uge)(A, B);
1035     testComparison!(FPComparison.uge)(A, C);
1036     testComparison!(FPComparison.ult)(A, B);
1037     testComparison!(FPComparison.ult)(A, C);
1038     testComparison!(FPComparison.ule)(A, B);
1039     testComparison!(FPComparison.ule)(A, C);
1040     testComparison!(FPComparison.une)(A, B);
1041     testComparison!(FPComparison.une)(A, C);
1042     testComparison!(FPComparison.uno)(A, B);
1043     testComparison!(FPComparison.uno)(A, C);
1044 }
1045 unittest // cmpsd and comsd
1046 {
1047     void testComparison(FPComparison comparison)(double2 A, double2 B)
1048     {
1049         double2 result = cmpsd!comparison(A, B);
1050         long2 iresult = cast(long2)result;
1051         long expected = compareFloat!double(comparison, A.array[0], B.array[0]) ? -1 : 0;
1052         assert(iresult.array[0] == expected);
1053         assert(result.array[1] == A.array[1]);
1054 
1055         // check comsd
1056         int comResult = comsd!comparison(A, B);
1057         assert( (expected != 0) == (comResult != 0) );
1058     }
1059 
1060     // Check all comparison type is working
1061     double2 A = [1, 3];
1062     double2 B = [2, 4];
1063     double2 C = [double.nan, 5];
1064 
1065     testComparison!(FPComparison.oeq)(A, B);
1066     testComparison!(FPComparison.oeq)(A, C);
1067     testComparison!(FPComparison.ogt)(A, B);
1068     testComparison!(FPComparison.ogt)(A, C);
1069     testComparison!(FPComparison.oge)(A, B);
1070     testComparison!(FPComparison.oge)(A, C);
1071     testComparison!(FPComparison.olt)(A, B);
1072     testComparison!(FPComparison.olt)(A, C);
1073     testComparison!(FPComparison.ole)(A, B);
1074     testComparison!(FPComparison.ole)(A, C);
1075     testComparison!(FPComparison.one)(A, B);
1076     testComparison!(FPComparison.one)(A, C);
1077     testComparison!(FPComparison.ord)(A, B);
1078     testComparison!(FPComparison.ord)(A, C);
1079     testComparison!(FPComparison.ueq)(A, B);
1080     testComparison!(FPComparison.ueq)(A, C);
1081     testComparison!(FPComparison.ugt)(A, B);
1082     testComparison!(FPComparison.ugt)(A, C);
1083     testComparison!(FPComparison.uge)(A, B);
1084     testComparison!(FPComparison.uge)(A, C);
1085     testComparison!(FPComparison.ult)(A, B);
1086     testComparison!(FPComparison.ult)(A, C);
1087     testComparison!(FPComparison.ule)(A, B);
1088     testComparison!(FPComparison.ule)(A, C);
1089     testComparison!(FPComparison.une)(A, B);
1090     testComparison!(FPComparison.une)(A, C);
1091     testComparison!(FPComparison.uno)(A, B);
1092     testComparison!(FPComparison.uno)(A, C);
1093 }
1094 
1095 //
1096 //  </FLOATING-POINT COMPARISONS>
1097 //
1098 
1099 
1100 __m64 to_m64(__m128i a) pure @trusted
1101 {
1102     long2 la = cast(long2)a;
1103     long1 r;
1104     r.ptr[0] = la.array[0];
1105     return r;
1106 }
1107 
1108 __m128i to_m128i(__m64 a) pure @trusted
1109 {
1110     long2 r = [0, 0];
1111     r.ptr[0] = a.array[0];
1112     return cast(__m128i)r;
1113 }