1 /**
2 * Internal stuff only, do not import.
3 *
4 * Copyright: Copyright Guillaume Piolat 2016-2020, Stefanos Baziotis 2019.
5 * License:   $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
6 */
7 module inteli.internals;
8 
9 import inteli.types;
10 
11 // The only math functions needed for intel-intrinsics
12 public import core.math: sqrt; // since it's an intrinsics
13 
14 package:
15 nothrow:
16 @nogc:
17 
18 
// Compile-time feature flags for GDC (version(GNU)).
// Every GDC_with_* enum is declared in every branch below, so the rest of the
// code can test any of them with `static if` whatever the compiler or target.
version(GNU)
{
    version (X86)
    {
        // For 32-bit x86, disable vector extensions with GDC. 
        // It just doesn't work well.
        enum GDC_with_x86 = true;
        enum GDC_with_MMX = false;
        enum GDC_with_SSE = false;
        enum GDC_with_SSE2 = false;
        enum GDC_with_SSE3 = false;
        enum GDC_with_SSSE3 = false;
        enum GDC_with_SSE41 = false;
        enum GDC_with_SSE42 = false;
        enum GDC_with_AVX = false;
        enum GDC_with_AVX2 = false;
        enum GDC_with_SHA = false;
        enum GDC_with_BMI2 = false;
    }
    else version (X86_64)
    {
        // GDC support uses extended inline assembly:
        //   https://gcc.gnu.org/onlinedocs/gcc/Extended-Asm.html        (general information and hints)
        //   https://gcc.gnu.org/onlinedocs/gcc/Simple-Constraints.html  (binding variables to registers)
        //   https://gcc.gnu.org/onlinedocs/gcc/Machine-Constraints.html (x86 specific register short names)

        public import core.simd: byte16, short8, int4, float4, double2;

        // NOTE: These intrinsics are not available in every i386 and x86_64 CPU.
        // For more info: https://gcc.gnu.org/onlinedocs/gcc-4.9.2/gcc/X86-Built-in-Functions.html 
        public import gcc.builtins;

        // TODO: SSE and SSE2 should be truly optional instead, in the future, if we 
        // want to support other archs with GDC

        enum GDC_with_x86 = true;
        enum GDC_with_MMX = true; // We don't have a way to detect that at CT, but we assume it's there
        enum GDC_with_SSE = true; // We don't have a way to detect that at CT, but we assume it's there
        enum GDC_with_SSE2 = true; // We don't have a way to detect that at CT, but we assume it's there

        static if (__VERSION__ >= 2100) // Starting at GDC 12.1
        {
            // Each flag is true when the named builtin (made visible by the
            // gcc.builtins import above) compiles with the current target options.
            enum GDC_with_SSE3 = __traits(compiles, __builtin_ia32_haddps);
            enum GDC_with_SSSE3 = __traits(compiles, __builtin_ia32_pmulhrsw128);
            enum GDC_with_SSE41 = __traits(compiles, __builtin_ia32_dpps);
            enum GDC_with_SSE42 = __traits(compiles, __builtin_ia32_pcmpgtq);
            enum GDC_with_AVX = __traits(compiles, __builtin_ia32_vbroadcastf128_pd256);
            enum GDC_with_AVX2 = __traits(compiles, __builtin_ia32_gathersiv2df);
            enum GDC_with_BMI2 = __traits(compiles, __builtin_ia32_pext_si);

        }
        else
        {
            // Before GCC 11.3, no reliable way to detect instruction sets.
            // We start above detection at GCC 12, with DMDFE 2.100, which
            // is more conservative.
            enum GDC_with_SSE3 = false;
            enum GDC_with_SSSE3 = false;
            enum GDC_with_SSE41 = false;
            enum GDC_with_SSE42 = false;
            enum GDC_with_AVX = false;
            enum GDC_with_AVX2 = false;
            enum GDC_with_BMI2 = false;
        }

        // Not probed in either front-end version branch above: always reported as absent.
        enum GDC_with_SHA = false; // TODO: detect that
    }
    else
    {
        // GDC targeting something that is neither X86 nor X86_64: nothing is available.
        enum GDC_with_x86 = false;
        enum GDC_with_MMX = false;
        enum GDC_with_SSE = false;
        enum GDC_with_SSE2 = false;
        enum GDC_with_SSE3 = false;
        enum GDC_with_SSSE3 = false;
        enum GDC_with_SSE41 = false;
        enum GDC_with_SSE42 = false;
        enum GDC_with_AVX = false;
        enum GDC_with_AVX2 = false;
        enum GDC_with_SHA = false;
        enum GDC_with_BMI2 = false;
    }
}
else
{
    // Not compiling with GDC: every GDC flag is false.
    enum GDC_with_x86 = false;
    enum GDC_with_MMX = false;
    enum GDC_with_SSE = false;
    enum GDC_with_SSE2 = false;
    enum GDC_with_SSE3 = false;
    enum GDC_with_SSSE3 = false;
    enum GDC_with_SSE41 = false;
    enum GDC_with_SSE42 = false;
    enum GDC_with_AVX = false;
    enum GDC_with_AVX2 = false;
    enum GDC_with_SHA = false;
    enum GDC_with_BMI2 = false;
}
117 
// Compile-time feature flags for LDC.
// Every LDC_with_* enum is declared in every branch below, so the rest of the
// code can test any of them with `static if` whatever the compiler or target.
version(LDC)
{
    public import core.simd;
    public import ldc.simd;
    public import ldc.intrinsics;
    public import ldc.llvmasm: __asm;

    // Since LDC 1.13, using the new ldc.llvmasm.__ir variants instead of inlineIR
    static if (__VERSION__ >= 2083)
    {
         import ldc.llvmasm;
         alias LDCInlineIR = __ir_pure;

         // A version of inline IR with prefix/suffix didn't exist before LDC 1.13
         alias LDCInlineIREx = __irEx_pure; 
    }
    else
    {
        // Note: LDCInlineIREx is not declared in this branch.
        alias LDCInlineIR = inlineIR;
    }

    version(ARM)
    {
        // 32-bit ARM target: all x86 flags are false.
        public import ldc.gccbuiltins_arm;
        enum LDC_with_ARM32 = true;
        enum LDC_with_ARM64 = false;
        enum LDC_with_ARM64_CRC = false;
        enum LDC_with_SSE = false;
        enum LDC_with_SSE2 = false;
        enum LDC_with_SSE3 = false;
        enum LDC_with_SSSE3 = false;
        enum LDC_with_SSE41 = false;
        enum LDC_with_SSE42 = false;
        enum LDC_with_AVX = false;
        enum LDC_with_AVX2 = false;
        enum LDC_with_SHA = false;
        enum LDC_with_BMI2 = false;
    }
    else version(AArch64)
    {
        // 64-bit ARM target: all x86 flags are false, CRC is queried from the target.
        public import ldc.gccbuiltins_aarch64;
        enum LDC_with_ARM32 = false;
        enum LDC_with_ARM64 = true; // implies "has Neon"
        enum LDC_with_ARM64_CRC = __traits(targetHasFeature, "crc");
        enum LDC_with_SSE = false;
        enum LDC_with_SSE2 = false;
        enum LDC_with_SSE3 = false;
        enum LDC_with_SSSE3 = false;
        enum LDC_with_SSE41 = false;
        enum LDC_with_SSE42 = false;
        enum LDC_with_AVX = false;
        enum LDC_with_AVX2 = false;
        enum LDC_with_SHA = false;
        enum LDC_with_BMI2 = false;
    }
    else
    {
        // Any other LDC target is handled as x86: the x86 builtins are imported
        // and each instruction set is queried from the target features.
        public import ldc.gccbuiltins_x86;
        enum LDC_with_ARM32 = false;
        enum LDC_with_ARM64 = false;
        enum LDC_with_ARM64_CRC = false;
        enum LDC_with_SSE = __traits(targetHasFeature, "sse");
        enum LDC_with_SSE2 = __traits(targetHasFeature, "sse2");
        enum LDC_with_SSE3 = __traits(targetHasFeature, "sse3");
        enum LDC_with_SSSE3 = __traits(targetHasFeature, "ssse3");
        enum LDC_with_SSE41 = __traits(targetHasFeature, "sse4.1");
        enum LDC_with_SSE42 = __traits(targetHasFeature, "sse4.2");
        enum LDC_with_AVX = __traits(targetHasFeature, "avx");
        enum LDC_with_AVX2 = __traits(targetHasFeature, "avx2");
        enum LDC_with_SHA = __traits(targetHasFeature, "sha");
        enum LDC_with_BMI2 = __traits(targetHasFeature, "bmi2");
    }
}
else
{
    // Not compiling with LDC: every LDC flag is false.
    enum LDC_with_ARM32 = false;
    enum LDC_with_ARM64 = false;
    enum LDC_with_ARM64_CRC = false;
    enum LDC_with_SSE = false;
    enum LDC_with_SSE2 = false;
    enum LDC_with_SSE3 = false;
    enum LDC_with_SSSE3 = false;
    enum LDC_with_SSE41 = false;
    enum LDC_with_SSE42 = false;
    enum LDC_with_AVX = false;
    enum LDC_with_AVX2 = false;
    enum LDC_with_SHA = false;
    enum LDC_with_BMI2 = false;
}

// True when LDC targets either 32-bit or 64-bit ARM.
enum LDC_with_ARM = LDC_with_ARM32 | LDC_with_ARM64;
209 
// Compile-time feature flags for DMD (version(DigitalMars)).
// Every DMD_with_* enum is declared in every branch below, so the rest of the
// code can test any of them with `static if` whatever the compiler or target.
version(DigitalMars)
{
    // Is DMD-style inline assembly available, in 32-bit or 64-bit flavour?
    version(D_InlineAsm_X86)
        enum DMD_with_asm = true;
    else version(D_InlineAsm_X86_64)
        enum DMD_with_asm = true;
    else
        enum DMD_with_asm = false;

    version(D_InlineAsm_X86)
        enum DMD_with_32bit_asm = DMD_with_asm; // sometimes you want a 32-bit DMD only solution
    else
        enum DMD_with_32bit_asm = false;

    version (D_SIMD)
    {
        // SSESizedVectorsAreEmulated is not declared in this module
        // (presumably it comes from inteli.types - confirm).
        enum DMD_with_DSIMD = !SSESizedVectorsAreEmulated;

        // Going further, does DMD has SSE4.1 through -mcpu?
        // Probed by checking whether an int4 multiplication compiles.
        static if (DMD_with_DSIMD)
            enum bool DMD_with_DSIMD_and_SSE41 = __traits(compiles, int4(0) * int4(0));
        else
            enum bool DMD_with_DSIMD_and_SSE41 = false;

        // No DMD way to detect those instruction sets => pessimize
        // would be cool to have a way to detect support for this at CT
        enum DMD_with_DSIMD_and_SSE3  = DMD_with_DSIMD_and_SSE41; 
        enum DMD_with_DSIMD_and_SSSE3 = DMD_with_DSIMD_and_SSE41;

        version(D_AVX)
            enum DMD_with_DSIMD_and_AVX   = true;
        else
            enum DMD_with_DSIMD_and_AVX   = false;

        version(D_AVX2)
            enum DMD_with_DSIMD_and_AVX2  = true;
        else
            enum DMD_with_DSIMD_and_AVX2  = false;

        // SSE4.2 is only reported when AVX is reported.
        enum DMD_with_DSIMD_and_SSE42 = DMD_with_DSIMD_and_AVX;
    }
    else
    {
        // DMD without D_SIMD: no native vector support at all.
        enum DMD_with_DSIMD = false;
        enum DMD_with_DSIMD_and_SSE3  = false;
        enum DMD_with_DSIMD_and_SSSE3 = false;
        enum DMD_with_DSIMD_and_SSE41 = false;
        enum DMD_with_DSIMD_and_SSE42 = false;
        enum DMD_with_DSIMD_and_AVX   = false;
        enum DMD_with_DSIMD_and_AVX2  = false;
    }
}
else
{
    // Not compiling with DMD: every DMD flag is false.
    enum DMD_with_asm = false;
    enum DMD_with_32bit_asm = false;
    enum DMD_with_DSIMD = false;
    enum DMD_with_DSIMD_and_SSE3  = false;
    enum DMD_with_DSIMD_and_SSSE3 = false;
    enum DMD_with_DSIMD_and_SSE41 = false;
    enum DMD_with_DSIMD_and_SSE42 = false;
    enum DMD_with_DSIMD_and_AVX   = false;
    enum DMD_with_DSIMD_and_AVX2  = false;
}
274 
275 
// Sometimes it can be helpful to share one code path between GDC and LDC builtins.
// However, keep in mind that LDC and GDC builtins often subtly diverge with respect
// to unsigned vs signed vectors, return types, purity... Test it in Godbolt!
// This is safer with float and double intrinsics.
// Each flag below is true when either compiler reports the instruction set.
enum GDC_or_LDC_with_SSE  = GDC_with_SSE  || LDC_with_SSE;
enum GDC_or_LDC_with_SSE2 = GDC_with_SSE2 || LDC_with_SSE2;
enum GDC_or_LDC_with_SSE3 = GDC_with_SSE3 || LDC_with_SSE3;
enum GDC_or_LDC_with_SSE41 = GDC_with_SSE41 || LDC_with_SSE41;
enum GDC_or_LDC_with_SSE42 = GDC_with_SSE42 || LDC_with_SSE42;

enum GDC_or_LDC_with_AVX  = GDC_with_AVX  || LDC_with_AVX;
enum GDC_or_LDC_with_AVX2 = GDC_with_AVX2 || LDC_with_AVX2;
enum GDC_or_LDC_with_SHA  = GDC_with_SHA  || LDC_with_SHA;
enum GDC_or_LDC_with_BMI2 = GDC_with_BMI2 || LDC_with_BMI2;
289 
290 
// 32-bit ARM (LDC only): accessors for the floating-point control word.
// They keep the "fpcr" name used by the 64-bit ARM versions, but forward to
// the FPSCR builtins.
static if (LDC_with_ARM32)
{
    /// Returns the value given by `__builtin_arm_get_fpscr`.
    package uint arm_get_fpcr() nothrow @nogc @trusted
    {
        return __builtin_arm_get_fpscr();
    }

    /// Passes `cw` to `__builtin_arm_set_fpscr`.
    package void arm_set_fpcr(uint cw) nothrow @nogc @trusted
    {
        __builtin_arm_set_fpscr(cw);
    }
}
303 
// 64-bit ARM (LDC only): accessors for the FPCR register, done with inline assembly.
static if (LDC_with_ARM64)
{
    // Declaration of the LLVM intrinsic. It is not called in the code below,
    // which reads the register with `mrs` instead (see the comment in arm_get_fpcr).
    pragma(LDC_intrinsic, "llvm.aarch64.get.fpcr")
        long __builtin_aarch64_get_fpcr() pure nothrow @nogc @safe;

    /// Returns the content of FPCR, read with a `mrs` instruction.
    package uint arm_get_fpcr() pure nothrow @nogc @trusted
    {
        // LLVM intrinsic "llvm.aarch64.get.fpcr" seems buggy and doesn't return FPCR
        return __asm!uint("mrs $0, fpcr", "=r");
    }

    /// Writes `cw` to FPCR.
    package void arm_set_fpcr(uint cw) nothrow @nogc @trusted
    {
        // Note: there doesn't seem to be an intrinsic in LLVM to set FPCR.
        // The sequence uses x2 as a scratch register: it stores x2 to `save_x2`,
        // loads `cw` into w2, moves x2 into FPCR, then reloads x2 from `save_x2`.
        long save_x2;
        __asm!void("str x2, $1 \n" ~
                   "ldr w2, $0 \n" ~
                   "msr fpcr, x2 \n" ~
                   "ldr x2, $1 "   , "m,m", cw, &save_x2);
    }
}
325 
326 
// For internal use only, since public API deals with a x86 semantic emulation
// These values are compared against the word returned by arm_get_fpcr(),
// after masking it with _MM_ROUND_MASK_ARM.
enum uint _MM_ROUND_NEAREST_ARM     = 0x00000000; // bits 22 and 23 clear
enum uint _MM_ROUND_DOWN_ARM        = 0x00800000; // bit 23 set
enum uint _MM_ROUND_UP_ARM          = 0x00400000; // bit 22 set
enum uint _MM_ROUND_TOWARD_ZERO_ARM = 0x00C00000; // bits 22 and 23 set
enum uint _MM_ROUND_MASK_ARM        = 0x00C00000; // selects bits 22 and 23
enum uint _MM_FLUSH_ZERO_MASK_ARM = 0x01000000;   // bit 24
334 
335 
336 //
337 //  <ROUNDING>
338 //
//  Why is this here? For DMD, we cannot use rint because _MM_SET_ROUNDING_MODE
//  doesn't change the FPU rounding mode, and isn't expected to do so.
//  So we devised these rounding functions to help keep rounding consistent between
//  LDC and DMD. It's important that DMD uses whatever is in MXCSR to round.
//
//  Note: There is no MXCSR on ARM, but fpcr/fpscr implements similar
//  functionality.
//  https://developer.arm.com/documentation/dui0068/b/vector-floating-point-programming/vfp-system-registers/fpscr--the-floating-point-status-and-control-register
//  We use fpcr/fpscr since it's thread-local, so we can emulate those x86 conversions, albeit slowly.
348 
/// Converts a `float` to a 32-bit signed integer with the rounding mode that is
/// currently selected (MXCSR on x86; the fpscr/fpcr rounding bits on ARM).
int convertFloatToInt32UsingMXCSR(float value) @trusted
{
    version(GNU)
    {
        // GDC: extended inline assembly, input bound to an SSE register.
        int converted;
        asm pure nothrow @nogc @trusted
        {
            "cvtss2si %1, %0\n": "=r"(converted) : "x" (value);
        }
        return converted;
    }
    else static if (LDC_with_ARM32)
    {
        // Load into s2, convert in place, then move the integer result out.
        return __asm!int(`vldr s2, $1
                            vcvtr.s32.f32 s2, s2
                            vmov $0, s2`, "=r,m,~{s2}", value);
    }
    else static if (LDC_with_ARM64)
    {
        // Pick the conversion that matches the current rounding-mode bits.
        switch (arm_get_fpcr() & _MM_ROUND_MASK_ARM)
        {
            case _MM_ROUND_DOWN_ARM:        return vcvtms_s32_f32(value);
            case _MM_ROUND_UP_ARM:          return vcvtps_s32_f32(value);
            case _MM_ROUND_TOWARD_ZERO_ARM: return vcvts_s32_f32(value);
            case _MM_ROUND_NEAREST_ARM:
            default:                        return vcvtns_s32_f32(value);
        }
    }
    else
    {
        // DMD-style inline assembly.
        int converted;
        asm pure nothrow @nogc @trusted
        {
            cvtss2si EAX, value;
            mov converted, EAX;
        }
        return converted;
    }
}
389 
/// Converts a `double` to a 32-bit signed integer with the rounding mode that is
/// currently selected (MXCSR on x86; the fpscr/fpcr rounding bits on ARM).
int convertDoubleToInt32UsingMXCSR(double value) @trusted
{
    version(GNU)
    {
        // GDC: extended inline assembly, input bound to an SSE register.
        int converted;
        asm pure nothrow @nogc @trusted
        {
            "cvtsd2si %1, %0\n": "=r"(converted) : "x" (value);
        }
        return converted;
    }
    else static if (LDC_with_ARM32)
    {
        // Load into d2, convert into s2, then move the integer result out.
        return __asm!int(`vldr d2, $1
                            vcvtr.s32.f64 s2, d2
                            vmov $0, s2`, "=r,m,~{s2},~{d2}", value);
    }
    else static if (LDC_with_ARM64)
    {
        // Pick the conversion that matches the current rounding-mode bits.
        switch (arm_get_fpcr() & _MM_ROUND_MASK_ARM)
        {
            case _MM_ROUND_DOWN_ARM:        return vcvtms_s32_f64(value);
            case _MM_ROUND_UP_ARM:          return vcvtps_s32_f64(value);
            case _MM_ROUND_TOWARD_ZERO_ARM: return vcvts_s32_f64(value);
            case _MM_ROUND_NEAREST_ARM:
            default:                        return vcvtns_s32_f64(value);
        }
    }
    else
    {
        // DMD-style inline assembly.
        int converted;
        asm pure nothrow @nogc @trusted
        {
            cvtsd2si EAX, value;
            mov converted, EAX;
        }
        return converted;
    }
}
430 
/**
 * Converts a `float` to a 64-bit signed integer with the rounding mode that is
 * currently selected (MXCSR on x86; the fpscr/fpcr rounding bits on ARM).
 *
 * Params:
 *     value = the single-precision number to convert.
 * Returns: `value` rounded to an integer according to the current rounding mode.
 */
long convertFloatToInt64UsingMXCSR(float value) @trusted
{
    static if (LDC_with_ARM32)
    {
        // We have to resort to libc since 32-bit ARM 
        // doesn't seem to have 64-bit registers.
        
        uint fpscr = arm_get_fpcr(); // Get current rounding mode.

        // Note: converting to double precision else rounding could be different for large integers
        double asDouble = value; 

        switch(fpscr & _MM_ROUND_MASK_ARM)
        {
            default:
            // x86 "round to nearest" sends halfway cases to the even integer (2.5 => 2).
            // llvm_round would send them away from zero (2.5 => 3), so llvm_rint is
            // used instead: it rounds with the current mode, known to be "nearest" here.
            case _MM_ROUND_NEAREST_ARM:     return cast(long)(llvm_rint(asDouble));
            case _MM_ROUND_DOWN_ARM:        return cast(long)(llvm_floor(asDouble));
            case _MM_ROUND_UP_ARM:          return cast(long)(llvm_ceil(asDouble));
            case _MM_ROUND_TOWARD_ZERO_ARM: return cast(long)(asDouble);
        }
    }
    else static if (LDC_with_ARM64)
    {
        // Pick the conversion that matches the current rounding-mode bits.
        uint fpscr = arm_get_fpcr();

        switch(fpscr & _MM_ROUND_MASK_ARM)
        {
            default:
            case _MM_ROUND_NEAREST_ARM:     return vcvtns_s64_f32(value);
            case _MM_ROUND_DOWN_ARM:        return vcvtms_s64_f32(value);
            case _MM_ROUND_UP_ARM:          return vcvtps_s64_f32(value);
            case _MM_ROUND_TOWARD_ZERO_ARM: return vcvts_s64_f32(value);
        }
    }
    // 64-bit can use an SSE instruction
    else version(D_InlineAsm_X86_64)
    {
        long result;
        version(LDC) // work-around for " Data definition directives inside inline asm are not supported yet."
        {
            asm pure nothrow @nogc @trusted
            {
                movss XMM0, value;
                cvtss2si RAX, XMM0;
                mov result, RAX;
            }
        }
        else
        {
            asm pure nothrow @nogc @trusted
            {
                movss XMM0, value;
                db 0xf3; db 0x48; db 0x0f; db 0x2d; db 0xc0; // cvtss2si RAX, XMM0 (DMD refuses to emit)
                mov result, RAX;
            }
        }
        return result;
    }
    else version(D_InlineAsm_X86)
    {
        // In the case of 32-bit x86 there is no SSE2 way to convert FP to 64-bit int
        // This leads to an unfortunate FPU sequence in every C++ compiler.
        // See: https://godbolt.org/z/vZym77

        // Get current MXCSR rounding
        uint sseRounding;
        ushort savedFPUCW;
        ushort newFPUCW;
        long result;
        asm pure nothrow @nogc @trusted
        {
            stmxcsr sseRounding;
            fld value;
            fnstcw savedFPUCW;
            mov AX, savedFPUCW;
            and AX, 0xf3ff;          // clear FPU rounding bits
            movzx ECX, word ptr sseRounding;
            and ECX, 0x6000;         // only keep SSE rounding bits
            shr ECX, 3;
            or AX, CX;               // make a new control word for FPU with SSE bits
            mov newFPUCW, AX;
            fldcw newFPUCW;
            fistp qword ptr result;            // convert, respecting MXCSR (but not other control word things)
            fldcw savedFPUCW;
        }
        return result;
    }
    else static if (GDC_with_x86)
    {
        version(X86_64) // 64-bit can just use the right instruction
        {
            static assert(GDC_with_SSE);
            __m128 A;
            A.ptr[0] = value;
            return __builtin_ia32_cvtss2si64 (A);
        }
        else version(X86) // 32-bit
        {
            // This is untested!
            // Same x87 sequence as the D_InlineAsm_X86 branch, in GCC extended asm syntax.
            // NOTE(review): `value` is bound with the "f" constraint here, whereas
            // convertDoubleToInt64UsingMXCSR uses "t" - confirm which one is intended.
            uint sseRounding;
            ushort savedFPUCW;
            ushort newFPUCW;
            long result;
            asm pure nothrow @nogc @trusted
            {
                "stmxcsr %1;\n" ~
                "fld %2;\n" ~
                "fnstcw %3;\n" ~
                "movw %3, %%ax;\n" ~
                "andw $0xf3ff, %%ax;\n" ~
                "movzwl %1, %%ecx;\n" ~
                "andl $0x6000, %%ecx;\n" ~
                "shrl $3, %%ecx;\n" ~
                "orw %%cx, %%ax\n" ~
                "movw %%ax, %4;\n" ~
                "fldcw %4;\n" ~
                "fistpll %0;\n" ~
                "fldcw %3;\n" 
                  : "=m"(result)    // %0
                  : "m" (sseRounding),
                    "f" (value),
                    "m" (savedFPUCW),
                    "m" (newFPUCW) 
                  : "eax", "ecx", "st";
            }
            return result;
        }
        else
            static assert(false);
    }
    else
        static assert(false);
}
564 
565 
/**
 * Converts a `double` to a 64-bit signed integer with the rounding mode that is
 * currently selected (MXCSR on x86; the fpscr/fpcr rounding bits on ARM).
 *
 * Params:
 *     value = the double-precision number to convert.
 * Returns: `value` rounded to an integer according to the current rounding mode.
 */
long convertDoubleToInt64UsingMXCSR(double value) @trusted
{
    static if (LDC_with_ARM32)
    {
        // We have to resort to libc since 32-bit ARM 
        // doesn't seem to have 64-bit registers.
        uint fpscr = arm_get_fpcr(); // Get current rounding mode.
        switch(fpscr & _MM_ROUND_MASK_ARM)
        {
            default:
            // x86 "round to nearest" sends halfway cases to the even integer (2.5 => 2).
            // llvm_round would send them away from zero (2.5 => 3), so llvm_rint is
            // used instead: it rounds with the current mode, known to be "nearest" here.
            case _MM_ROUND_NEAREST_ARM:     return cast(long)(llvm_rint(value));
            case _MM_ROUND_DOWN_ARM:        return cast(long)(llvm_floor(value));
            case _MM_ROUND_UP_ARM:          return cast(long)(llvm_ceil(value));
            case _MM_ROUND_TOWARD_ZERO_ARM: return cast(long)(value);
        }
    }
    else static if (LDC_with_ARM64)
    {
        // Get current rounding mode.
        uint fpscr = arm_get_fpcr();

        switch(fpscr & _MM_ROUND_MASK_ARM)
        {
            default:
            case _MM_ROUND_NEAREST_ARM:     return vcvtns_s64_f64(value);
            case _MM_ROUND_DOWN_ARM:        return vcvtms_s64_f64(value);
            case _MM_ROUND_UP_ARM:          return vcvtps_s64_f64(value);
            case _MM_ROUND_TOWARD_ZERO_ARM: return vcvts_s64_f64(value);
        }
    }
    // 64-bit can use an SSE instruction
    else version(D_InlineAsm_X86_64)
    {
        long result;
        version(LDC) // work-around for "Data definition directives inside inline asm are not supported yet."
        {
            asm pure nothrow @nogc @trusted
            {
                movsd XMM0, value;
                cvtsd2si RAX, XMM0;
                mov result, RAX;
            }
        }
        else
        {
            asm pure nothrow @nogc @trusted
            {
                movsd XMM0, value;
                db 0xf2; db 0x48; db 0x0f; db 0x2d; db 0xc0; // cvtsd2si RAX, XMM0 (DMD refuses to emit)
                mov result, RAX;
            }
        }
        return result;
    }
    else version(D_InlineAsm_X86)
    {
        // In the case of 32-bit x86 there is no SSE2 way to convert FP to 64-bit int
        // This leads to an unfortunate FPU sequence in every C++ compiler.
        // See: https://godbolt.org/z/vZym77

        // Get current MXCSR rounding
        uint sseRounding;
        ushort savedFPUCW;
        ushort newFPUCW;
        long result;
        asm pure nothrow @nogc @trusted
        {
            stmxcsr sseRounding;
            fld value;
            fnstcw savedFPUCW;
            mov AX, savedFPUCW;
            and AX, 0xf3ff;          // clear FPU rounding bits
            movzx ECX, word ptr sseRounding;
            and ECX, 0x6000;         // only keep SSE rounding bits
            shr ECX, 3;
            or AX, CX;               // make a new control word for FPU with SSE bits
            mov newFPUCW, AX;
            fldcw newFPUCW;
            fistp result;            // convert with the temporary control word
            fldcw savedFPUCW;        // restore the original FPU control word
        }
        return result;
    }
    else static if (GDC_with_x86)
    {
        version(X86_64)
        {
            // 64-bit can just use the right instruction
            static assert(GDC_with_SSE2);
            __m128d A;
            A.ptr[0] = value;
            return __builtin_ia32_cvtsd2si64 (A);
        }
        else
        {
            // This is untested!
            // Same x87 sequence as the D_InlineAsm_X86 branch, in GCC extended asm syntax.
            // NOTE(review): `value` is bound with the "t" constraint here, whereas
            // convertFloatToInt64UsingMXCSR uses "f" - confirm which one is intended.
            uint sseRounding;
            ushort savedFPUCW;
            ushort newFPUCW;
            long result;
            asm pure nothrow @nogc @trusted
            {
                "stmxcsr %1;\n" ~
                "fld %2;\n" ~
                "fnstcw %3;\n" ~
                "movw %3, %%ax;\n" ~
                "andw $0xf3ff, %%ax;\n" ~
                "movzwl %1, %%ecx;\n" ~
                "andl $0x6000, %%ecx;\n" ~
                "shrl $3, %%ecx;\n" ~
                "orw %%cx, %%ax\n" ~
                "movw %%ax, %4;\n" ~
                "fldcw %4;\n" ~
                "fistpll %0;\n" ~
                "fldcw %3;\n"         
                  : "=m"(result)    // %0
                  : "m" (sseRounding),
                    "t" (value),
                    "m" (savedFPUCW),
                    "m" (newFPUCW) 
                  : "eax", "ecx", "st";
            }
            return result;
        }
    }
    else
        static assert(false);
}
694 
695 //
696 //  </ROUNDING>
697 //
698 
699 
700 // using the Intel terminology here
701 
/// Clamps a signed 16-bit value to the signed 8-bit range [-128, 127].
byte saturateSignedWordToSignedByte(short value) pure @safe
{
    if (value < byte.min)
        return byte.min;
    if (value > byte.max)
        return byte.max;
    return cast(byte) value;
}
708 
/// Clamps a signed 16-bit value to the unsigned 8-bit range [0, 255].
ubyte saturateSignedWordToUnsignedByte(short value) pure @safe
{
    const int clamped = (value < 0)   ? 0
                      : (value > 255) ? 255
                      : value;
    return cast(ubyte) clamped;
}
715 
/// Clamps a signed 32-bit value to the signed 16-bit range [-32768, 32767].
short saturateSignedIntToSignedShort(int value) pure @safe
{
    if (value < short.min)
        return short.min;
    if (value > short.max)
        return short.max;
    return cast(short) value;
}
722 
/// Clamps a signed 32-bit value to the unsigned 16-bit range [0, 65535].
ushort saturateSignedIntToUnsignedShort(int value) pure @safe
{
    const int clamped = (value < 0)     ? 0
                      : (value > 65535) ? 65535
                      : value;
    return cast(ushort) clamped;
}
729 
unittest // test saturate operations
{
    // 16-bit => signed 8-bit
    assert(saturateSignedWordToSignedByte(32000) == 127);
    assert(saturateSignedWordToSignedByte(-4000) == -128);

    // 16-bit => unsigned 8-bit
    assert(saturateSignedWordToUnsignedByte(32000) == 255);
    assert(saturateSignedWordToUnsignedByte(-4000) == 0);

    // 32-bit => signed 16-bit
    assert(saturateSignedIntToSignedShort(32768) == 32767);
    assert(saturateSignedIntToSignedShort(-32769) == -32768);

    // 32-bit => unsigned 16-bit
    assert(saturateSignedIntToUnsignedShort(32768) == 32768);
    assert(saturateSignedIntToUnsignedShort(-32769) == 0);
}
741 
version(unittest)
{
    // This is just for debugging tests.
    // Each helper prints every lane of a vector on one line of stdout.
    import core.stdc.stdio: printf;

    // printing vectors for implementation
    // Note: you can override `pure` within a `debug` clause

    /// Prints `v` as one 64-bit integer.
    void _mm_print_pi64(__m64 v) @trusted
    {
        long1 vl = cast(long1)v;
        printf("%lld\n", vl.array[0]);
    }

    /// Prints `v` as two 32-bit integers.
    void _mm_print_pi32(__m64 v) @trusted
    {
        int[2] C = (cast(int2)v).array;
        printf("%d %d\n", C[0], C[1]);
    }

    /// Prints `v` as four 16-bit integers.
    void _mm_print_pi16(__m64 v) @trusted
    {
        short[4] C = (cast(short4)v).array;
        printf("%d %d %d %d\n", C[0], C[1], C[2], C[3]);
    }

    /// Prints `v` as eight 8-bit integers.
    void _mm_print_pi8(__m64 v) @trusted
    {
        byte[8] C = (cast(byte8)v).array;
        printf("%d %d %d %d %d %d %d %d\n",
        C[0], C[1], C[2], C[3], C[4], C[5], C[6], C[7]);
    }

    /// Prints `v` as two 64-bit integers.
    void _mm_print_epi64(__m128i v) @trusted
    {
        long2 vl = cast(long2)v;
        printf("%lld %lld\n", vl.array[0], vl.array[1]);
    }

    /// Prints `v` as four 32-bit integers.
    void _mm_print_epi32(__m128i v) @trusted
    {
        printf("%d %d %d %d\n",
              v.array[0], v.array[1], v.array[2], v.array[3]);
    }  

    /// Prints `v` as eight 16-bit integers.
    void _mm_print_epi16(__m128i v) @trusted
    {
        short[8] C = (cast(short8)v).array;
        printf("%d %d %d %d %d %d %d %d\n",
        C[0], C[1], C[2], C[3], C[4], C[5], C[6], C[7]);
    }

    /// Prints `v` as sixteen 8-bit integers.
    void _mm_print_epi8(__m128i v) @trusted
    {
        byte[16] C = (cast(byte16)v).array;
        printf("%d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d\n",
        C[0], C[1], C[2], C[3], C[4], C[5], C[6], C[7], C[8], C[9], C[10], C[11], C[12], C[13], C[14], C[15]);
    }

    /// Prints `v` as four single-precision numbers.
    void _mm_print_ps(__m128 v) @trusted
    {
        // %g because %f can conceal very small numbers and prints zero instead
        float[4] C = (cast(float4)v).array;
        printf("%g %g %g %g\n", C[0], C[1], C[2], C[3]);
    }

    /// Prints `v` as two double-precision numbers.
    void _mm_print_pd(__m128d v) @trusted
    {
        // %g because %f can conceal very small numbers and prints zero instead
        // (same format as the other floating-point printers in this block).
        double[2] C = (cast(double2)v).array;
        printf("%g %g\n", C[0], C[1]);
    }

    /// Prints `v` as four double-precision numbers.
    void _mm256_print_pd(__m256d v) @trusted
    {
        // %g because %f can conceal very small numbers and prints zero instead
        printf("%g %g %g %g\n", v.array[0], v.array[1], v.array[2], v.array[3]); 
    }

    /// Prints `v` as eight single-precision numbers.
    void _mm256_print_ps(__m256 v) @trusted
    {
        // %g because %f can conceal very small numbers and prints zero instead
        printf("%g %g %g %g %g %g %g %g\n", 
            v.array[0], v.array[1], v.array[2], v.array[3],
            v.array[4], v.array[5], v.array[6], v.array[7]); 
    }

    /// Prints `v` as eight 32-bit integers.
    void _mm256_print_epi32(__m256i v) @trusted
    {
        int8 vl = cast(int8)v;
        printf("%d %d %d %d %d %d %d %d\n", vl.array[0], vl.array[1], vl.array[2], vl.array[3],
                                            vl.array[4], vl.array[5], vl.array[6], vl.array[7]);
    }

    /// Prints `v` as four 64-bit integers.
    void _mm256_print_epi64(__m256i v) @trusted
    {
        long4 vl = cast(long4)v;
        printf("%lld %lld %lld %lld\n", vl.array[0], vl.array[1], vl.array[2], vl.array[3]);
    }
}
841 
842 
843 //
844 //  <FLOATING-POINT COMPARISONS>
845 //
846 // Note: `ldc.simd` cannot express all nuances of FP comparisons, so we
847 //       need different IR generation.
848 
/// Floating-point comparison predicates.
/// "Ordered" predicates are false as soon as one operand is NaN; "unordered"
/// predicates are true in that case.
/// The declaration order matters: the numeric value of each member is used as an
/// index into the FPComparisonToString table.
enum FPComparison
{
    false_,// always false
    oeq,   // ordered and equal
    ogt,   // ordered and greater than
    oge,   // ordered and greater than or equal
    olt,   // ordered and less than
    ole,   // ordered and less than or equal
    one,   // ordered and not equal
    ord,   // ordered (no nans)
    ueq,   // unordered or equal
    ugt,   // unordered or greater than ("nle")
    uge,   // unordered or greater than or equal ("nlt")
    ult,   // unordered or less than ("nge")
    ule,   // unordered or less than or equal ("ngt")
    une,   // unordered or not equal ("neq")
    uno,   // unordered (either nans)
    true_, // always true
}
868 
// Predicate name of each FPComparison member, indexed by the member's value
// (same order as the enum declaration). The LDC `cmpps` template splices these
// strings after `fcmp` in the IR text it builds.
private static immutable string[FPComparison.max+1] FPComparisonToString =
[
    "false",
    "oeq",
    "ogt",
    "oge",
    "olt",
    "ole",
    "one",
    "ord",
    "ueq",
    "ugt",
    "uge",
    "ult",
    "ule",
    "une",
    "uno",
    "true"
];
888 
// Translates the predicate immediate of an AVX comparison into a FPComparison.
FPComparison mapAVXFPComparison(int imm8) pure @safe
{
    // Always map on non-signalling: only the low 4 bits of `imm8` are looked at,
    // so signalling NaN information is mixed up.
    switch (imm8 & 0x0f)
    {
        case 0x00: return FPComparison.oeq;    // _CMP_EQ_OQ
        case 0x01: return FPComparison.olt;    // _CMP_LT_OS
        case 0x02: return FPComparison.ole;    // _CMP_LE_OS
        case 0x03: return FPComparison.uno;    // _CMP_UNORD_Q
        case 0x04: return FPComparison.une;    // _CMP_NEQ_UQ
        case 0x05: return FPComparison.uge;    // _CMP_NLT_US
        case 0x06: return FPComparison.ugt;    // _CMP_NLE_US
        case 0x07: return FPComparison.ord;    // _CMP_ORD_Q
        case 0x08: return FPComparison.ueq;    // _CMP_EQ_UQ
        case 0x09: return FPComparison.ult;    // _CMP_NGE_US
        case 0x0a: return FPComparison.ule;    // _CMP_NGT_US
        case 0x0b: return FPComparison.false_; // _CMP_FALSE_OQ
        case 0x0c: return FPComparison.one;    // _CMP_NEQ_OQ
        case 0x0d: return FPComparison.oge;    // _CMP_GE_OS
        case 0x0e: return FPComparison.ogt;    // _CMP_GT_OS
        default:   return FPComparison.true_;  // 0x0f, _CMP_TRUE_UQ
    }
}
915 
// Individual float comparison: yields `true` when `comparison` holds for
// (a, b) and `false` otherwise. Callers turn that into a -1 / 0 lane mask.
// Useful for DMD and testing
private bool compareFloat(T)(FPComparison comparison, T a, T b) pure @safe
{
    immutable bool eitherIsNaN = isnan(a) || isnan(b);

    final switch(comparison)
    {
        // Constant predicates.
        case FPComparison.false_: return false;
        case FPComparison.true_:  return true;

        // Ordered predicates. The relational operators are already false
        // when an operand is NaN, so only `one` needs the explicit NaN test.
        case FPComparison.oeq: return a == b;
        case FPComparison.ogt: return a > b;
        case FPComparison.oge: return a >= b;
        case FPComparison.olt: return a < b;
        case FPComparison.ole: return a <= b;
        case FPComparison.one: return (a != b) && !eitherIsNaN; // NaN with != always yields true
        case FPComparison.ord: return !eitherIsNaN;

        // Unordered predicates: hold as soon as one operand is NaN.
        case FPComparison.ueq: return eitherIsNaN || (a == b);
        case FPComparison.ugt: return eitherIsNaN || (a > b);
        case FPComparison.uge: return eitherIsNaN || (a >= b);
        case FPComparison.ult: return eitherIsNaN || (a < b);
        case FPComparison.ule: return eitherIsNaN || (a <= b);
        case FPComparison.une: return a != b; // NaN with != always yields true
        case FPComparison.uno: return eitherIsNaN;
    }
}
941 
version(LDC)
{
    /// Packed comparison of four floats through inline IR.
    /// Each result lane is all ones (-1) when `comparison` holds, 0 otherwise.
    package int4 cmpps(FPComparison comparison)(float4 a, float4 b) pure @safe
    {
        enum string predicate = FPComparisonToString[comparison];
        enum code = `
            %cmp = fcmp ` ~ predicate ~ ` <4 x float> %0, %1
            %r = sext <4 x i1> %cmp to <4 x i32>
            ret <4 x i32> %r`;

        return LDCInlineIR!(code, int4, float4, float4)(a, b);
    }

    /// Same as `cmpps`, for eight floats.
    package int8 cmpps256(FPComparison comparison)(float8 a, float8 b) pure @safe
    {
        enum string predicate = FPComparisonToString[comparison];
        enum code = `
            %cmp = fcmp ` ~ predicate ~ ` <8 x float> %0, %1
            %r = sext <8 x i1> %cmp to <8 x i32>
            ret <8 x i32> %r`;

        return LDCInlineIR!(code, int8, float8, float8)(a, b);
    }

    /// Packed comparison of two doubles through inline IR.
    /// Each result lane is all ones (-1) when `comparison` holds, 0 otherwise.
    package long2 cmppd(FPComparison comparison)(double2 a, double2 b) pure @safe
    {
        enum string predicate = FPComparisonToString[comparison];
        enum code = `
            %cmp = fcmp ` ~ predicate ~ ` <2 x double> %0, %1
            %r = sext <2 x i1> %cmp to <2 x i64>
            ret <2 x i64> %r`;

        return LDCInlineIR!(code, long2, double2, double2)(a, b);
    }

    /// Same as `cmppd`, for four doubles.
    package long4 cmppd256(FPComparison comparison)(double4 a, double4 b) pure @safe
    {
        enum string predicate = FPComparisonToString[comparison];
        enum code = `
            %cmp = fcmp ` ~ predicate ~ ` <4 x double> %0, %1
            %r = sext <4 x i1> %cmp to <4 x i64>
            ret <4 x i64> %r`;

        return LDCInlineIR!(code, long4, double4, double4)(a, b);
    }

    /// CMPSS-style comparison: lane 0 receives the comparison mask of
    /// a[0] and b[0], reinterpreted as a float; lanes 1 to 3 are copied from `a`.
    /// clang implements it through x86 intrinsics; it is possible with IR alone
    /// but leads to less optimal code.
    /// PERF: try to implement it with __builtin_ia32_cmpss and immediate 0 to 7.
    /// Not that simple.
    package float4 cmpss(FPComparison comparison)(float4 a, float4 b) pure @safe
    {
        /*
        enum ubyte predicateNumber = FPComparisonToX86Predicate[comparison];
        enum bool invertOp = (predicateNumber & 0x80) != 0;
        static if(invertOp)
            return __builtin_ia32_cmpsd(b, a, predicateNumber & 0x7f);
        else
            return __builtin_ia32_cmpsd(a, b, predicateNumber & 0x7f);
        */
        enum string predicate = FPComparisonToString[comparison];
        enum code = `
            %cmp = fcmp ` ~ predicate ~ ` float %0, %1
            %r = sext i1 %cmp to i32
            %r2 = bitcast i32 %r to float
            ret float %r2`;

        // Start from `a` so that the upper lanes pass through unchanged.
        float4 res = a;
        res[0] = LDCInlineIR!(code, float, float, float)(a[0], b[0]);
        return res;
    }

    /// CMPSD-style comparison: lane 0 receives the comparison mask of
    /// a[0] and b[0], reinterpreted as a double; lane 1 is copied from `a`.
    /// clang implements it through x86 intrinsics; it is possible with IR alone
    /// but leads to less optimal code.
    /// PERF: try to implement it with __builtin_ia32_cmpsd and immediate 0 to 7.
    /// Not that simple.
    package double2 cmpsd(FPComparison comparison)(double2 a, double2 b) pure @safe
    {
        enum string predicate = FPComparisonToString[comparison];
        enum code = `
            %cmp = fcmp ` ~ predicate ~ ` double %0, %1
            %r = sext i1 %cmp to i64
            %r2 = bitcast i64 %r to double
            ret double %r2`;

        // Start from `a` so that the upper lane passes through unchanged.
        double2 res = a;
        res[0] = LDCInlineIR!(code, double, double, double)(a[0], b[0]);
        return res;
    }
}
else
{
    /// Packed comparison of four floats, one lane at a time through `compareFloat`.
    /// Each result lane is -1 when `comparison` holds, 0 otherwise.
    package int4 cmpps(FPComparison comparison)(float4 a, float4 b) pure @trusted
    {
        int4 mask;
        for (int lane = 0; lane < 4; ++lane)
        {
            immutable bool holds = compareFloat!float(comparison, a.array[lane], b.array[lane]);
            mask.ptr[lane] = holds ? -1 : 0;
        }
        return mask;
    }

    /// Same as `cmpps`, for eight floats.
    package int8 cmpps256(FPComparison comparison)(float8 a, float8 b) pure @trusted
    {
        int8 mask;
        for (int lane = 0; lane < 8; ++lane)
        {
            immutable bool holds = compareFloat!float(comparison, a.array[lane], b.array[lane]);
            mask.ptr[lane] = holds ? -1 : 0;
        }
        return mask;
    }

    /// Packed comparison of two doubles, one lane at a time through `compareFloat`.
    /// Each result lane is -1 when `comparison` holds, 0 otherwise.
    package long2 cmppd(FPComparison comparison)(double2 a, double2 b) pure @trusted
    {
        long2 mask;
        for (int lane = 0; lane < 2; ++lane)
        {
            immutable bool holds = compareFloat!double(comparison, a.array[lane], b.array[lane]);
            mask.ptr[lane] = holds ? -1 : 0;
        }
        return mask;
    }

    /// Same as `cmppd`, for four doubles.
    package long4 cmppd256(FPComparison comparison)(double4 a, double4 b) pure @trusted
    {
        long4 mask;
        for (int lane = 0; lane < 4; ++lane)
        {
            immutable bool holds = compareFloat!double(comparison, a.array[lane], b.array[lane]);
            mask.ptr[lane] = holds ? -1 : 0;
        }
        return mask;
    }

    /// CMPSS-style comparison: only lane 0 is compared, its 32 bits become
    /// the -1 / 0 mask; lanes 1 to 3 keep the bits of `a`.
    package float4 cmpss(FPComparison comparison)(float4 a, float4 b) pure @trusted
    {
        immutable bool holds = compareFloat!float(comparison, a.array[0], b.array[0]);
        int4 bits = cast(int4)a;
        bits.ptr[0] = holds ? -1 : 0;
        return cast(float4)bits;
    }

    /// CMPSD-style comparison: only lane 0 is compared, its 64 bits become
    /// the -1 / 0 mask; lane 1 keeps the bits of `a`.
    package double2 cmpsd(FPComparison comparison)(double2 a, double2 b) pure @trusted
    {
        immutable bool holds = compareFloat!double(comparison, a.array[0], b.array[0]);
        long2 bits = cast(long2)a;
        bits.ptr[0] = holds ? -1 : 0;
        return cast(double2)bits;
    }
}
unittest // cmpps
{
    // Runs one predicate over (lhs, rhs) and compares the four lane masks.
    static void check(FPComparison comparison)(float4 lhs, float4 rhs, const int[4] expected)
    {
        int4 got = cmpps!comparison(lhs, rhs);
        assert(got.array == expected);
    }

    // Check all comparison types are working.
    // Lane 3 holds a NaN so that ordered and unordered predicates differ.
    float4 A = [1, 3, 5, float.nan];
    float4 B = [2, 3, 4, 5];

    static immutable int[4] correct_oeq    = [ 0,-1, 0, 0];
    static immutable int[4] correct_ogt    = [ 0, 0,-1, 0];
    static immutable int[4] correct_oge    = [ 0,-1,-1, 0];
    static immutable int[4] correct_olt    = [-1, 0, 0, 0];
    static immutable int[4] correct_ole    = [-1,-1, 0, 0];
    static immutable int[4] correct_one    = [-1, 0,-1, 0];
    static immutable int[4] correct_ord    = [-1,-1,-1, 0];
    static immutable int[4] correct_ueq    = [ 0,-1, 0,-1];
    static immutable int[4] correct_ugt    = [ 0, 0,-1,-1];
    static immutable int[4] correct_uge    = [ 0,-1,-1,-1];
    static immutable int[4] correct_ult    = [-1, 0, 0,-1];
    static immutable int[4] correct_ule    = [-1,-1, 0,-1];
    static immutable int[4] correct_une    = [-1, 0,-1,-1];
    static immutable int[4] correct_uno    = [ 0, 0, 0,-1];

    check!(FPComparison.oeq)(A, B, correct_oeq);
    check!(FPComparison.ogt)(A, B, correct_ogt);
    check!(FPComparison.oge)(A, B, correct_oge);
    check!(FPComparison.olt)(A, B, correct_olt);
    check!(FPComparison.ole)(A, B, correct_ole);
    check!(FPComparison.one)(A, B, correct_one);
    check!(FPComparison.ord)(A, B, correct_ord);
    check!(FPComparison.ueq)(A, B, correct_ueq);
    check!(FPComparison.ugt)(A, B, correct_ugt);
    check!(FPComparison.uge)(A, B, correct_uge);
    check!(FPComparison.ult)(A, B, correct_ult);
    check!(FPComparison.ule)(A, B, correct_ule);
    check!(FPComparison.une)(A, B, correct_une);
    check!(FPComparison.uno)(A, B, correct_uno);
}
unittest // cmppd
{
    // ult on ordered inputs: lane 0 holds (1 < 2), lane 1 does not (3 < 3).
    static immutable long[2] expected = [-1L, 0];

    double2 lhs = [1, 3];
    double2 rhs = [2, 3];
    long2 mask = cmppd!(FPComparison.ult)(lhs, rhs);
    assert(mask.array == expected);
}
unittest // cmpss
{
    // Lane 0 must match the scalar reference comparison,
    // lanes 1 to 3 must be those of the first operand.
    static void checkOne(FPComparison comparison)(float4 lhs, float4 rhs)
    {
        float4 got = cmpss!comparison(lhs, rhs);
        int4 gotBits = cast(int4)got;
        immutable int wanted = compareFloat!float(comparison, lhs.array[0], rhs.array[0]) ? -1 : 0;
        assert(gotBits.array[0] == wanted);
        foreach (lane; 1 .. 4)
            assert(got.array[lane] == lhs.array[lane]);
    }

    // Exercises one predicate against an ordered operand, then against a NaN one.
    static void testComparison(FPComparison comparison)(float4 lhs, float4 ordered, float4 withNaN)
    {
        checkOne!comparison(lhs, ordered);
        checkOne!comparison(lhs, withNaN);
    }

    // Check all comparison types are working
    float4 A = [1, 3, 5, 6];
    float4 B = [2, 3, 4, 5];
    float4 C = [float.nan, 3, 4, 5];

    testComparison!(FPComparison.oeq)(A, B, C);
    testComparison!(FPComparison.ogt)(A, B, C);
    testComparison!(FPComparison.oge)(A, B, C);
    testComparison!(FPComparison.olt)(A, B, C);
    testComparison!(FPComparison.ole)(A, B, C);
    testComparison!(FPComparison.one)(A, B, C);
    testComparison!(FPComparison.ord)(A, B, C);
    testComparison!(FPComparison.ueq)(A, B, C);
    testComparison!(FPComparison.ugt)(A, B, C);
    testComparison!(FPComparison.uge)(A, B, C);
    testComparison!(FPComparison.ult)(A, B, C);
    testComparison!(FPComparison.ule)(A, B, C);
    testComparison!(FPComparison.une)(A, B, C);
    testComparison!(FPComparison.uno)(A, B, C);
}
unittest // cmpsd
{
    // Lane 0 must match the scalar reference comparison,
    // lane 1 must be that of the first operand.
    static void checkOne(FPComparison comparison)(double2 lhs, double2 rhs)
    {
        double2 got = cmpsd!comparison(lhs, rhs);
        long2 gotBits = cast(long2)got;
        immutable long wanted = compareFloat!double(comparison, lhs.array[0], rhs.array[0]) ? -1 : 0;
        assert(gotBits.array[0] == wanted);
        assert(got.array[1] == lhs.array[1]);
    }

    // Exercises one predicate against an ordered operand, then against a NaN one.
    static void testComparison(FPComparison comparison)(double2 lhs, double2 ordered, double2 withNaN)
    {
        checkOne!comparison(lhs, ordered);
        checkOne!comparison(lhs, withNaN);
    }

    // Check all comparison types are working
    double2 A = [1, 3];
    double2 B = [2, 4];
    double2 C = [double.nan, 5];

    testComparison!(FPComparison.oeq)(A, B, C);
    testComparison!(FPComparison.ogt)(A, B, C);
    testComparison!(FPComparison.oge)(A, B, C);
    testComparison!(FPComparison.olt)(A, B, C);
    testComparison!(FPComparison.ole)(A, B, C);
    testComparison!(FPComparison.one)(A, B, C);
    testComparison!(FPComparison.ord)(A, B, C);
    testComparison!(FPComparison.ueq)(A, B, C);
    testComparison!(FPComparison.ugt)(A, B, C);
    testComparison!(FPComparison.uge)(A, B, C);
    testComparison!(FPComparison.ult)(A, B, C);
    testComparison!(FPComparison.ule)(A, B, C);
    testComparison!(FPComparison.une)(A, B, C);
    testComparison!(FPComparison.uno)(A, B, C);
}
1241 
1242 //
1243 //  </FLOATING-POINT COMPARISONS>
1244 //
1245 
1246 
/// Returns the low 64 bits of `a` as a `__m64`; the high 64 bits are dropped.
__m64 to_m64(__m128i a) pure @trusted
{
    long2 halves = cast(long2)a;
    long1 low = halves.array[0];
    return low;
}
1253 
/// Widens a `__m64` into a `__m128i`: the low 64-bit lane receives the
/// content of `a`, the high 64-bit lane is zero.
__m128i to_m128i(__m64 a) pure @trusted
{
  /* Not sufficient to avoid https://issues.dlang.org/show_bug.cgi?id=21474 
    
    version(DigitalMars) // Workaround for https://issues.dlang.org/show_bug.cgi?id=21474 
    {
        long2 r = a.array[0];
        r.ptr[1] = 0;
        return cast(int4)r;
    }
    else */
    {
        // Start from an all-zero vector so that the high lane is well defined,
        // then overwrite lane 0 only.
        long2 r = [0, 0];
        r.ptr[0] = a.array[0];
        return cast(__m128i)r;
    }
}
1271 
1272 // ADDITIONAL x86 INTRINSICS
1273 // Absent from ldc.gccbuiltins_x86 for some reason, but needed.
1274 // https://github.com/ldc-developers/llvm-project/blob/ldc-release/12.x/llvm/include/llvm/IR/IntrinsicsX86.td
static if (LDC_with_SSE41)
{
    // Binds the LLVM intrinsic `llvm.x86.sse41.pblendvb` under the GCC-style
    // builtin name: three byte16 operands in, one byte16 out.
    // Only declared when LDC_with_SSE41 is true.
    pragma(LDC_intrinsic, "llvm.x86.sse41.pblendvb")
        byte16 __builtin_ia32_pblendvb(byte16, byte16, byte16) pure @safe;
}
1280 
1281 // SOME NEON INTRINSICS
1282 // Emulating some x86 intrinsics needs access to a range of ARM intrinsics.
// They are not part of the public API here, although the simde project exposes them all to its users.
1284 // MAYDO: create a new neon.d module, for internal use only.
1285 // MAYDO: port them to ARM32 so that ARM32 can be as fast as ARM64.
static if (LDC_with_ARM64)
{
    // VERY USEFUL LINK
    // https://github.com/ldc-developers/llvm-project/blob/ldc-release/11.x/llvm/include/llvm/IR/IntrinsicsAArch64.td
    // Also: https://developer.arm.com/architectures/instruction-sets/intrinsics/
    //
    // Conventions in this block:
    // - `pragma(LDC_intrinsic, "...")` declarations bind a D name straight to the
    //   named LLVM intrinsic; they have no D body.
    // - Functions with a body are plain D and are all `pure` and `@safe`/`@trusted`
    //   so that they can be called from the `pure @safe` intrinsics built on them.

    // Bindings to the llvm.aarch64.crc32c* intrinsics.

    pragma(LDC_intrinsic, "llvm.aarch64.crc32cb")
        uint __crc32cb(uint a, uint b) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.crc32ch")
        uint __crc32ch(uint a, uint b) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.crc32cw")
        uint __crc32cw(uint a, uint b) pure @safe;

    // Note: the D name ends in "d" but the LLVM intrinsic is "crc32cx";
    // the second operand is 64-bit.
    pragma(LDC_intrinsic, "llvm.aarch64.crc32cx")
        uint __crc32cd(uint a, ulong b) pure @safe;

    //pragma(LDC_intrinsic, "llvm.aarch64.dmb")
    //    uint __dmb(int a) @safe; // didn't find a name in intrinsic list

    pragma(LDC_intrinsic, "llvm.aarch64.neon.uabd.v16i8")
        byte16 vabdq_u8(byte16 a, byte16 b) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.abs.v8i16")
        short8 vabsq_s16(short8 a) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.abs.v4i32")
        int4 vabsq_s32(int4 a) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.abs.v16i8")
        byte16 vabsq_s8(byte16 a) pure @safe;

    /// Bitwise AND of `a` and `b`.
    byte8 vand_u8(byte8 a, byte8 b) pure @safe
    {
        return a & b;
    }

    /// Bitwise AND of `a` and `b`.
    long2 vandq_s64(long2 a, long2 b) pure @safe
    {
        return a & b;
    }

    /// Bit clear: `a` AND (NOT `b`).
    long2 vbicq_s64(long2 a, long2 b) pure @safe
    {
        return a & ~b;
    }

    /// Bitwise select: for each bit, takes the bit of `b` where `a` has a 1
    /// and the bit of `c` where `a` has a 0.
    int4 vbslq_s32(int4 a, int4 b, int4 c) pure @safe
    {
        return c ^ ((c ^ b) & a);
    }

    ///ditto
    byte16 vbslq_s8(byte16 a, byte16 b, byte16 c) pure @safe
    {
        return c ^ ((c ^ b) & a);
    }

    ///ditto
    long2 vbslq_s64(long2 a, long2 b, long2 c) pure @safe
    {
        return c ^ ((c ^ b) & a);
    }

    /// Concatenates two 4-lane vectors: `lo` fills lanes 0-3, `hi` fills lanes 4-7.
    short8 vcombine_s16(short4 lo, short4 hi) pure @trusted
    {
        short8 r;
        r.ptr[0]  = lo.array[0];
        r.ptr[1]  = lo.array[1];
        r.ptr[2]  = lo.array[2];
        r.ptr[3]  = lo.array[3];
        r.ptr[4]  = hi.array[0];
        r.ptr[5]  = hi.array[1];
        r.ptr[6]  = hi.array[2];
        r.ptr[7]  = hi.array[3];
        return r;
    }

    /// Concatenates two 2-lane vectors: `lo` fills lanes 0-1, `hi` fills lanes 2-3.
    int4 vcombine_s32(int2 lo, int2 hi) pure @trusted
    {
        int4 r;
        r.ptr[0] = lo.array[0];
        r.ptr[1] = lo.array[1];
        r.ptr[2] = hi.array[0];
        r.ptr[3] = hi.array[1];
        return r;
    }

    /// Concatenates two 8-lane vectors: `lo` fills lanes 0-7, `hi` fills lanes 8-15.
    byte16 vcombine_s8(byte8 lo, byte8 hi) pure @trusted
    {
        byte16 r;
        r.ptr[0]  = lo.array[0];
        r.ptr[1]  = lo.array[1];
        r.ptr[2]  = lo.array[2];
        r.ptr[3]  = lo.array[3];
        r.ptr[4]  = lo.array[4];
        r.ptr[5]  = lo.array[5];
        r.ptr[6]  = lo.array[6];
        r.ptr[7]  = lo.array[7];
        r.ptr[8]  = hi.array[0];
        r.ptr[9]  = hi.array[1];
        r.ptr[10] = hi.array[2];
        r.ptr[11] = hi.array[3];
        r.ptr[12] = hi.array[4];
        r.ptr[13] = hi.array[5];
        r.ptr[14] = hi.array[6];
        r.ptr[15] = hi.array[7];
        return r;
    }

    /// Concatenates two 4-lane vectors: `lo` fills lanes 0-3, `hi` fills lanes 4-7.
    short8 vcombine_u16(short4 lo, short4 hi) pure @trusted
    {
        short8 r;
        r.ptr[0]  = lo.array[0];
        r.ptr[1]  = lo.array[1];
        r.ptr[2]  = lo.array[2];
        r.ptr[3]  = lo.array[3];
        r.ptr[4]  = hi.array[0];
        r.ptr[5]  = hi.array[1];
        r.ptr[6]  = hi.array[2];
        r.ptr[7]  = hi.array[3];
        return r;
    }


    // Float to integer conversions. The letter after "fcvt" in the LLVM name
    // (m, n, p, z) selects the rounding behaviour of the underlying
    // instruction; see the ARM intrinsics reference linked at the top of this block.

    // float4 => int4

    pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtms.v4i32.v4f32")
        int4 vcvtmq_s32_f32(float4 a) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtns.v4i32.v4f32")
        int4 vcvtnq_s32_f32(float4 a) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtps.v4i32.v4f32")
        int4 vcvtpq_s32_f32(float4 a) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtzs.v4i32.v4f32")
        int4 vcvtzq_s32_f32(float4 a) pure @safe;


    // double2 => long2

    pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtms.v2i64.v2f64")
        long2 vcvtmq_s64_f64(double2 a) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtns.v2i64.v2f64")
        long2 vcvtnq_s64_f64(double2 a) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtps.v2i64.v2f64")
        long2 vcvtpq_s64_f64(double2 a) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtzs.v2i64.v2f64")
        long2 vcvtzq_s64_f64(double2 a) pure @safe;

    // float => int

    pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtms.i32.f32")
        int vcvtms_s32_f32(float a) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtns.i32.f32")
        int vcvtns_s32_f32(float a) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtps.i32.f32")
        int vcvtps_s32_f32(float a) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtzs.i32.f32")
        int vcvts_s32_f32(float a) pure @safe;

    // double => int

    pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtms.i32.f64")
        int vcvtms_s32_f64(double a) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtns.i32.f64")
        int vcvtns_s32_f64(double a) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtps.i32.f64")
        int vcvtps_s32_f64(double a) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtzs.i32.f64")
        int vcvts_s32_f64(double a) pure @safe;

    // float => long

    pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtms.i64.f32")
        long vcvtms_s64_f32(float a) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtns.i64.f32")
        long vcvtns_s64_f32(float a) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtps.i64.f32")
        long vcvtps_s64_f32(float a) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtzs.i64.f32")
        long vcvts_s64_f32(float a) pure @safe;

    // double => long

    pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtms.i64.f64")
        long vcvtms_s64_f64(double a) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtns.i64.f64")
        long vcvtns_s64_f64(double a) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtps.i64.f64")
        long vcvtps_s64_f64(double a) pure @safe; // Note: technically should be named vcvtpd_s64_f64

    pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtzs.i64.f64")
        long vcvts_s64_f64(double a) pure @safe;

    /// Returns a vector with `value` in both lanes.
    long2 vdupq_n_s64(long value) pure @safe
    {
        long2 r;
        r = value;
        return r;
    }

    /// Returns lanes 4-7 of `a`.
    short4 vget_high_s16(short8 a) pure @trusted
    {
        short4 r;
        r.ptr[0] = a.array[4];
        r.ptr[1] = a.array[5];
        r.ptr[2] = a.array[6];
        r.ptr[3] = a.array[7];
        return r;
    }

    /// Returns lanes 2-3 of `a`.
    int2 vget_high_s32(int4 a) pure @trusted
    {
        int2 r;
        r.ptr[0] = a.array[2];
        r.ptr[1] = a.array[3];
        return r;
    }

    /// Returns lanes 8-15 of `a`.
    byte8 vget_high_u8(byte16 a) pure @trusted
    {
        byte8 r;
        r.ptr[0] = a.array[8];
        r.ptr[1] = a.array[9];
        r.ptr[2] = a.array[10];
        r.ptr[3] = a.array[11];
        r.ptr[4] = a.array[12];
        r.ptr[5] = a.array[13];
        r.ptr[6] = a.array[14];
        r.ptr[7] = a.array[15];
        return r;
    }

    /// Returns lanes 0-3 of `a`.
    short4 vget_low_s16(short8 a) pure @trusted
    {
        short4 r;
        r.ptr[0] = a.array[0];
        r.ptr[1] = a.array[1];
        r.ptr[2] = a.array[2];
        r.ptr[3] = a.array[3];
        return r;
    }

    /// Returns lanes 0-1 of `a`.
    int2 vget_low_s32(int4 a) pure @trusted
    {
        int2 r;
        r.ptr[0] = a.array[0];
        r.ptr[1] = a.array[1];
        return r;
    }

    /// Returns lanes 0-7 of `a`.
    byte8 vget_low_u8(byte16 a) pure @trusted
    {
        byte8 r;
        r.ptr[0] = a.array[0];
        r.ptr[1] = a.array[1];
        r.ptr[2] = a.array[2];
        r.ptr[3] = a.array[3];
        r.ptr[4] = a.array[4];
        r.ptr[5] = a.array[5];
        r.ptr[6] = a.array[6];
        r.ptr[7] = a.array[7];
        return r;
    }

    /// Returns lane `lane` of `v`. `lane` indexes a 2-element array,
    /// so it must be 0 or 1.
    long vgetq_lane_s64(long2 v, const int lane) pure @safe
    {
        return v.array[lane];
    }

    pragma(LDC_intrinsic, "llvm.aarch64.neon.smax.v8i16")
        short8 vmaxq_s16(short8 a, short8 b) pure @safe;

    /// Lane-wise signed maximum of `a` and `b`; `a` wins on ties.
    int4 vmaxq_s32(int4 a, int4 b) pure @trusted
    {
        int4 r;
        r.ptr[0] = a.array[0] >= b.array[0] ? a.array[0] : b.array[0];
        r.ptr[1] = a.array[1] >= b.array[1] ? a.array[1] : b.array[1];
        r.ptr[2] = a.array[2] >= b.array[2] ? a.array[2] : b.array[2];
        r.ptr[3] = a.array[3] >= b.array[3] ? a.array[3] : b.array[3];
        return r;
    }

    pragma(LDC_intrinsic, "llvm.aarch64.neon.smin.v8i16")
        short8 vminq_s16(short8 a, short8 b) pure @safe;

    /// Narrows each 64-bit lane to 32 bits by truncation (plain `cast(int)`).
    int2 vmovn_s64(long2 a) pure @trusted
    {
        int2 r;
        r.ptr[0] = cast(int)(a.array[0]);
        r.ptr[1] = cast(int)(a.array[1]);
        return r;
    }

    /// Widening multiply: each 16-bit lane pair is multiplied as `int`,
    /// giving four 32-bit products.
    int4 vmull_s16(short4 a, short4 b) pure @trusted
    {
        int4 r;
        r.ptr[0] = a.array[0] * b.array[0];
        r.ptr[1] = a.array[1] * b.array[1];
        r.ptr[2] = a.array[2] * b.array[2];
        r.ptr[3] = a.array[3] * b.array[3];
        return r;
    }

    pragma(LDC_intrinsic, "llvm.aarch64.neon.smull.v2i64")
        long2 vmull_s32(int2 a, int2 b) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.addp.v4i16")
        short4 vpadd_s16(short4 a, short4 b) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.addp.v2i32")
        int2 vpadd_s32(int2 a, int2 b) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.addp.v8i8")
        byte8 vpadd_u8(byte8 a, byte8 b) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.uaddlp.v8i16.v16i8")
        short8 vpaddlq_u8 (byte16 a) pure @safe;

    static if(__VERSION__ >= 2088) // LDC 1.18 starts using LLVM9, which changes the name of the builtin
    {
        pragma(LDC_intrinsic, "llvm.aarch64.neon.faddp.v4f32")
            float4 vpaddq_f32(float4 a, float4 b) pure @safe;
    }
    else
    {
        pragma(LDC_intrinsic, "llvm.aarch64.neon.addp.v4f32")
            float4 vpaddq_f32(float4 a, float4 b) pure @safe;
    }

    pragma(LDC_intrinsic, "llvm.aarch64.neon.addp.v8i16")
        short8 vpaddq_s16(short8 a, short8 b) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.addp.v16i8")
        byte16 vpaddq_s8(byte16 a, byte16 b) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.addp.v4i32")
        int4 vpaddq_s32(int4 a, int4 b) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.sqadd.v4i16")
        short4 vqadd_s16(short4 a, short4 b) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.sqadd.v8i16")
        short8 vqaddq_s16(short8 a, short8 b) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.sqxtn.v8i8")
        byte8 vqmovn_s16(short8 a) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.sqxtn.v4i16")
        short4 vqmovn_s32(int4 a) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.uqxtn.v4i16")
        short4 vqmovn_u32(int4 a) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.sqxtun.v8i8")
        byte8 vqmovun_s16(short8 a) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.sqsub.v4i16")
        short4 vqsub_s16(short4 a, short4 b) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.sqsub.v8i16")
        short8 vqsubq_s16(short8 a, short8 b) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.tbl1.v16i8")
        byte16 vqtbl1q_s8(byte16 t, byte16 idx) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.urhadd.v16i8")
        byte16 vrhadd_u8(byte16 a, byte16 b) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.urhadd.v8i16")
        short8 vrhadd_u16(short8 a, short8 b) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.rshrn.v4i16")
        short4 vrshrn_n_s32(int4 a, int n) pure @safe;

    /// Lane-wise `>>>` of `a` by the per-lane counts held in `b`.
    byte8 vshr_u8(byte8 a, byte8 b) pure @safe
    {
        return a >>> b;
    }

    /// Lane-wise `>>` of `a` by `r`, the same count for every lane.
    byte16 vshrq_n_s8(byte16 a, byte r) pure @safe
    {
        return a >> byte16(r);
    }

    pragma(LDC_intrinsic, "llvm.aarch64.neon.tbl1.v8i8")
        byte8 vtbl1_s8(byte16 t, byte8 idx) pure @safe;
}
1682 
version(unittest)
{
    /// Absolute value of `x`; only compiled into unittest builds.
    double abs_double(double x) @trusted
    {
        version(LDC)
        {
            return llvm_fabs(x);
        }
        else
        {
            // Clear the top bit (the sign) of the 64-bit representation.
            enum long allButSign = 0x7fffffff_ffffffff;
            long bits = *cast(long*)(&x);
            bits = bits & allButSign;
            return *cast(double*)(&bits);
        }
    }
}
1697 
1698 // needed because in old GDC from travis, core.stdc.math.isnan isn't pure
1699 
/// Returns `true` when `x` is a NaN, judged from its 32-bit pattern:
/// exponent field all ones and a non-zero mantissa.
bool isnan(float x) pure @trusted
{
    enum uint exponentMask = 0x7F800000;
    enum uint mantissaMask = 0x007FFFFF;

    immutable uint bits = *cast(uint*)(&x);

    // An all-ones exponent with a zero mantissa is an infinity, not a NaN.
    if ((bits & exponentMask) != exponentMask)
        return false;
    return (bits & mantissaMask) != 0;
}
unittest // isnan(float)
{
    // A NaN is reported as such...
    float notANumber = float.nan;
    assert(isnan(notANumber));

    // ...while zero and infinity are not.
    float zero = 0;
    assert(!isnan(zero));

    float inf = float.infinity;
    assert(!isnan(inf));
}
1717 
/// Returns `true` when `x` is a NaN, judged from its 64-bit pattern:
/// exponent field all ones and a non-zero mantissa.
bool isnan(double x) pure @trusted
{
    enum ulong exponentMask = 0x7FF00000_00000000;
    enum ulong mantissaMask = 0x000FFFFF_FFFFFFFF;

    immutable ulong bits = *cast(ulong*)(&x);

    // An all-ones exponent with a zero mantissa is an infinity, not a NaN.
    immutable bool exponentAllOnes = (bits & exponentMask) == exponentMask;
    immutable bool mantissaNonZero = (bits & mantissaMask) != 0;
    return exponentAllOnes && mantissaNonZero;
}
unittest // isnan(double)
{
    // A NaN is reported as such...
    double notANumber = double.nan;
    assert(isnan(notANumber));

    // ...while zero and infinity are not.
    double zero = 0;
    assert(!isnan(zero));

    double inf = double.infinity;
    assert(!isnan(inf));
}