1 /**
2 * Internal stuff only, do not import.
3 *
4 * Copyright: Copyright Guillaume Piolat 2016-2020, Stefanos Baziotis 2019.
5 * License:   $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
6 */
7 module inteli.internals;
8 
9 import inteli.types;
10 
11 // The only math functions needed for intel-intrinsics
public import core.math: sqrt; // since it's an intrinsic
13 
14 package:
15 nothrow:
16 @nogc:
17 
18 
19 version(GNU)
20 {
21     version (X86)
22     {
23         // For 32-bit x86, disable vector extensions with GDC. 
24         // It just doesn't work well.
25         enum GDC_with_x86 = true;
26         enum GDC_with_MMX = false;
27         enum GDC_with_SSE = false;
28         enum GDC_with_SSE2 = false;
29         enum GDC_with_SSE3 = false;
30         enum GDC_with_SSSE3 = false;
31         enum GDC_with_SSE41 = false;
32         enum GDC_with_SSE42 = false;
33         enum GDC_with_AVX = false;
34         enum GDC_with_AVX2 = false;
35         enum GDC_with_SHA = false;
36         enum GDC_with_BMI2 = false;
37     }
38     else version (X86_64)
39     {
40         // GDC support uses extended inline assembly:
41         //   https://gcc.gnu.org/onlinedocs/gcc/Extended-Asm.html        (general information and hints)
42         //   https://gcc.gnu.org/onlinedocs/gcc/Simple-Constraints.html  (binding variables to registers)
43         //   https://gcc.gnu.org/onlinedocs/gcc/Machine-Constraints.html (x86 specific register short names)
44 
45         public import core.simd: byte16, short8, int4, float4, double2;
46 
47         // NOTE: These intrinsics are not available in every i386 and x86_64 CPU.
48         // For more info: https://gcc.gnu.org/onlinedocs/gcc-4.9.2/gcc/X86-Built-in-Functions.html 
49         public import gcc.builtins;
50 
        // TODO: SSE and SSE2 should become truly optional in the future,
        // if we want to support other archs with GDC.
53 
54         enum GDC_with_x86 = true;
55         enum GDC_with_MMX = true; // We don't have a way to detect that at CT, but we assume it's there
56         enum GDC_with_SSE = true; // We don't have a way to detect that at CT, but we assume it's there
57         enum GDC_with_SSE2 = true; // We don't have a way to detect that at CT, but we assume it's there
58 
59         static if (__VERSION__ >= 2100) // Starting at GDC 12.1
60         {
61             enum GDC_with_SSE3 = __traits(compiles, __builtin_ia32_haddps);
62             enum GDC_with_SSSE3 = __traits(compiles, __builtin_ia32_pmulhrsw128);
63             enum GDC_with_SSE41 = __traits(compiles, __builtin_ia32_dpps);
64             enum GDC_with_SSE42 = __traits(compiles, __builtin_ia32_pcmpgtq);
65             enum GDC_with_AVX = __traits(compiles, __builtin_ia32_vbroadcastf128_pd256);
66             enum GDC_with_AVX2 = __traits(compiles, __builtin_ia32_gathersiv2df);
67             enum GDC_with_BMI2 = __traits(compiles, __builtin_ia32_pext_si);
68 
69         }
70         else
71         {
            // Before GCC 11.3, there was no reliable way to detect instruction sets.
            // We only start the above detection at GDC 12.1 (DMDFE 2.100), which
            // is more conservative.
75             enum GDC_with_SSE3 = false;
76             enum GDC_with_SSSE3 = false;
77             enum GDC_with_SSE41 = false;
78             enum GDC_with_SSE42 = false;
79             enum GDC_with_AVX = false;
80             enum GDC_with_AVX2 = false;
81             enum GDC_with_BMI2 = false;
82         }
83 
84         enum GDC_with_SHA = false; // TODO: detect that
85     }
86     else
87     {
88         enum GDC_with_x86 = false;
89         enum GDC_with_MMX = false;
90         enum GDC_with_SSE = false;
91         enum GDC_with_SSE2 = false;
92         enum GDC_with_SSE3 = false;
93         enum GDC_with_SSSE3 = false;
94         enum GDC_with_SSE41 = false;
95         enum GDC_with_SSE42 = false;
96         enum GDC_with_AVX = false;
97         enum GDC_with_AVX2 = false;
98         enum GDC_with_SHA = false;
99         enum GDC_with_BMI2 = false;
100     }
101 }
102 else
103 {
104     enum GDC_with_x86 = false;
105     enum GDC_with_MMX = false;
106     enum GDC_with_SSE = false;
107     enum GDC_with_SSE2 = false;
108     enum GDC_with_SSE3 = false;
109     enum GDC_with_SSSE3 = false;
110     enum GDC_with_SSE41 = false;
111     enum GDC_with_SSE42 = false;
112     enum GDC_with_AVX = false;
113     enum GDC_with_AVX2 = false;
114     enum GDC_with_SHA = false;
115     enum GDC_with_BMI2 = false;
116 }
117 
118 version(LDC)
119 {
120     public import core.simd;
121     public import ldc.simd;
122     public import ldc.intrinsics;
123     public import ldc.llvmasm: __asm;
124 
    // Since LDC 1.13, use the new ldc.llvmasm.__ir variants instead of inlineIR.
126     static if (__VERSION__ >= 2083)
127     {
128         import ldc.llvmasm;
129         alias LDCInlineIR = __ir_pure;
130 
131         // A version of inline IR with prefix/suffix didn't exist before LDC 1.13
132         alias LDCInlineIREx = __irEx_pure; 
133 
134         enum bool LDC_with_InlineIREx = true;
135     }
136     else
137     {
138         alias LDCInlineIR = inlineIR;
139 
140         enum bool LDC_with_InlineIREx = false;
141     }
142 
143     version(ARM)
144     {
145         public import ldc.gccbuiltins_arm;
146         enum LDC_with_ARM32 = true;
147         enum LDC_with_ARM64 = false;
148         enum LDC_with_ARM64_CRC = false;
149         enum LDC_with_SSE = false;
150         enum LDC_with_SSE2 = false;
151         enum LDC_with_SSE3 = false;
152         enum LDC_with_SSSE3 = false;
153         enum LDC_with_SSE41 = false;
154         enum LDC_with_SSE42 = false;
155         enum LDC_with_CRC32 = false;
156         enum LDC_with_AVX = false;
157         enum LDC_with_AVX2 = false;
158         enum LDC_with_SHA = false;
159         enum LDC_with_BMI2 = false;
160     }
161     else version(AArch64)
162     {
163         public import ldc.gccbuiltins_aarch64;
164         enum LDC_with_ARM32 = false;
165         enum LDC_with_ARM64 = true; // implies "has Neon"
166         enum LDC_with_ARM64_CRC = __traits(targetHasFeature, "crc");
167         enum LDC_with_SSE = false;
168         enum LDC_with_SSE2 = false;
169         enum LDC_with_SSE3 = false;
170         enum LDC_with_SSSE3 = false;
171         enum LDC_with_SSE41 = false;
172         enum LDC_with_SSE42 = false;
173         enum LDC_with_CRC32 = false;
174         enum LDC_with_AVX = false;
175         enum LDC_with_AVX2 = false;
176         enum LDC_with_SHA = false;
177         enum LDC_with_BMI2 = false;
178     }
179     else
180     {
181         public import ldc.gccbuiltins_x86;
182         enum LDC_with_ARM32 = false;
183         enum LDC_with_ARM64 = false;
184         enum LDC_with_ARM64_CRC = false;
185         enum LDC_with_SSE = __traits(targetHasFeature, "sse");
186         enum LDC_with_SSE2 = __traits(targetHasFeature, "sse2");
187         enum LDC_with_SSE3 = __traits(targetHasFeature, "sse3");
188         enum LDC_with_SSSE3 = __traits(targetHasFeature, "ssse3");
189         enum LDC_with_SSE41 = __traits(targetHasFeature, "sse4.1");
190         enum LDC_with_SSE42 = __traits(targetHasFeature, "sse4.2");
191 
192         // Since LDC 1.30, crc32 is a separate (and sufficient) attribute from sse4.2
193         // As of Jan 2023, GDC doesn't make that distinction, -msse4.2 includes -mcrc32 for GDC.
194         static if (__VERSION__ >= 2100)
195         {
196             enum LDC_with_CRC32 = __traits(targetHasFeature, "crc32");
197         }
198         else
199         {
200             enum LDC_with_CRC32 = __traits(targetHasFeature, "sse4.2"); // crc32 used to be included in sse4.2
201         }
202 
203         enum LDC_with_AVX = __traits(targetHasFeature, "avx");
204         enum LDC_with_AVX2 = __traits(targetHasFeature, "avx2");
205         enum LDC_with_SHA = __traits(targetHasFeature, "sha");
206         enum LDC_with_BMI2 = __traits(targetHasFeature, "bmi2");
207     }
208 }
209 else
210 {
211     enum LDC_with_ARM32 = false;
212     enum LDC_with_ARM64 = false;
213     enum LDC_with_ARM64_CRC = false;
214     enum LDC_with_SSE = false;
215     enum LDC_with_SSE2 = false;
216     enum LDC_with_SSE3 = false;
217     enum LDC_with_SSSE3 = false;
218     enum LDC_with_SSE41 = false;
219     enum LDC_with_SSE42 = false;
220     enum LDC_with_CRC32 = false;
221     enum LDC_with_AVX = false;
222     enum LDC_with_AVX2 = false;
223     enum LDC_with_SHA = false;
224     enum LDC_with_BMI2 = false;
225     enum LDC_with_InlineIREx = false;
226 }
227 
228 enum LDC_with_ARM = LDC_with_ARM32 | LDC_with_ARM64;
229 
230 version(DigitalMars)
231 {
232     version(D_InlineAsm_X86)
233         enum DMD_with_asm = true;
234     else version(D_InlineAsm_X86_64)
235         enum DMD_with_asm = true;
236     else
237         enum DMD_with_asm = false;
238 
239     version(D_InlineAsm_X86)
240         enum DMD_with_32bit_asm = DMD_with_asm; // sometimes you want a 32-bit DMD only solution
241     else
242         enum DMD_with_32bit_asm = false;
243 
244     version (D_SIMD)
245     {
246         enum DMD_with_DSIMD = !SSESizedVectorsAreEmulated;
247 
        // Going further, does DMD have SSE4.1 through -mcpu?
        // (integer vector multiply needs SSE4.1's pmulld, hence the probe below)
249         static if (DMD_with_DSIMD)
250             enum bool DMD_with_DSIMD_and_SSE41 = __traits(compiles, int4(0) * int4(0));
251         else
252             enum bool DMD_with_DSIMD_and_SSE41 = false;
253 
        // There is no DMD way to detect those instruction sets => pessimize.
        // It would be nice to have a way to detect support for them at CT.
256         enum DMD_with_DSIMD_and_SSE3  = DMD_with_DSIMD_and_SSE41; 
257         enum DMD_with_DSIMD_and_SSSE3 = DMD_with_DSIMD_and_SSE41;
258 
259         version(D_AVX)
260             enum DMD_with_DSIMD_and_AVX   = true;
261         else
262             enum DMD_with_DSIMD_and_AVX   = false;
263 
264         version(D_AVX2)
265             enum DMD_with_DSIMD_and_AVX2  = true;
266         else
267             enum DMD_with_DSIMD_and_AVX2  = false;
268 
269         enum DMD_with_DSIMD_and_SSE42 = DMD_with_DSIMD_and_AVX;
270     }
271     else
272     {
273         enum DMD_with_DSIMD = false;
274         enum DMD_with_DSIMD_and_SSE3  = false;
275         enum DMD_with_DSIMD_and_SSSE3 = false;
276         enum DMD_with_DSIMD_and_SSE41 = false;
277         enum DMD_with_DSIMD_and_SSE42 = false;
278         enum DMD_with_DSIMD_and_AVX   = false;
279         enum DMD_with_DSIMD_and_AVX2  = false;
280     }
281 }
282 else
283 {
284     enum DMD_with_asm = false;
285     enum DMD_with_32bit_asm = false;
286     enum DMD_with_DSIMD = false;
287     enum DMD_with_DSIMD_and_SSE3  = false;
288     enum DMD_with_DSIMD_and_SSSE3 = false;
289     enum DMD_with_DSIMD_and_SSE41 = false;
290     enum DMD_with_DSIMD_and_SSE42 = false;
291     enum DMD_with_DSIMD_and_AVX   = false;
292     enum DMD_with_DSIMD_and_AVX2  = false;
293 }
294 
295 
// Sometimes it can be helpful to merge builtin code; however, keep in mind that
// LDC and GDC builtins often subtly diverge, wrt. unsigned vs signed vectors,
// return types, purity... Test it in Godbolt! This is safer with float and double intrinsics.
299 enum GDC_or_LDC_with_SSE  = GDC_with_SSE  || LDC_with_SSE;
300 enum GDC_or_LDC_with_SSE2 = GDC_with_SSE2 || LDC_with_SSE2;
301 enum GDC_or_LDC_with_SSE3 = GDC_with_SSE3 || LDC_with_SSE3;
302 enum GDC_or_LDC_with_SSE41 = GDC_with_SSE41 || LDC_with_SSE41;
303 enum GDC_or_LDC_with_SSE42 = GDC_with_SSE42 || LDC_with_SSE42;
304 
305 enum GDC_or_LDC_with_AVX  = GDC_with_AVX  || LDC_with_AVX;
306 enum GDC_or_LDC_with_AVX2 = GDC_with_AVX2 || LDC_with_AVX2;
307 enum GDC_or_LDC_with_SHA  = GDC_with_SHA  || LDC_with_SHA;
308 enum GDC_or_LDC_with_BMI2 = GDC_with_BMI2 || LDC_with_BMI2;
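
// A minimal usage sketch for these merged flags (illustrative only: `__some_builtin`
// is a hypothetical name, and real builtins may differ in signature or element
// signedness between GDC and LDC, as noted above):
//
//     static if (GDC_or_LDC_with_SSE2)
//         return cast(__m128i) __some_builtin(cast(short8)a, cast(short8)b);
//     else
//     {
//         // portable fallback
//     }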
309 
310 
311 static if (LDC_with_ARM32)
312 {
313     package uint arm_get_fpcr() nothrow @nogc @trusted
314     {
315         return __builtin_arm_get_fpscr();
316     }
317 
318     package void arm_set_fpcr(uint cw) nothrow @nogc @trusted
319     {
320         __builtin_arm_set_fpscr(cw);
321     }
322 }
323 
324 static if (LDC_with_ARM64)
325 {
326     pragma(LDC_intrinsic, "llvm.aarch64.get.fpcr")
327         long __builtin_aarch64_get_fpcr() pure nothrow @nogc @safe;
328 
329     package uint arm_get_fpcr() pure nothrow @nogc @trusted
330     {
331         // LLVM intrinsic "llvm.aarch64.get.fpcr" seems buggy and doesn't return FPCR
332         return __asm!uint("mrs $0, fpcr", "=r");
333     }
334 
335     package void arm_set_fpcr(uint cw) nothrow @nogc @trusted
336     {
337         // Note: there doesn't seem to be an intrinsic in LLVM to set FPCR.
338         long save_x2;
339         __asm!void("str x2, $1 \n" ~
340                    "ldr w2, $0 \n" ~
341                    "msr fpcr, x2 \n" ~
342                    "ldr x2, $1 "   , "m,m", cw, &save_x2);
343     }
344 }
345 
346 
// For internal use only, since the public API deals with x86 semantics emulation
348 enum uint _MM_ROUND_NEAREST_ARM     = 0x00000000;
349 enum uint _MM_ROUND_DOWN_ARM        = 0x00800000;
350 enum uint _MM_ROUND_UP_ARM          = 0x00400000;
351 enum uint _MM_ROUND_TOWARD_ZERO_ARM = 0x00C00000;
352 enum uint _MM_ROUND_MASK_ARM        = 0x00C00000;
353 enum uint _MM_FLUSH_ZERO_MASK_ARM = 0x01000000;
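
// A minimal sketch of forcing a rounding mode on ARM with the masks above
// (assuming the arm_get_fpcr/arm_set_fpcr helpers defined earlier in this module):
//
//     static if (LDC_with_ARM64)
//     {
//         uint fpcr = arm_get_fpcr();
//         fpcr = (fpcr & ~_MM_ROUND_MASK_ARM) | _MM_ROUND_TOWARD_ZERO_ARM;
//         arm_set_fpcr(fpcr); // subsequent conversions now truncate toward zero
//     }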
354 
355 
356 //
357 //  <ROUNDING>
358 //
//  Why is this here? For DMD, we cannot use rint because _MM_SET_ROUNDING_MODE
//  doesn't change the FPU rounding mode, and isn't expected to do so.
//  So we devised these rounding functions to help achieve consistent rounding between
//  LDC and DMD. It's important that DMD rounds with whatever is in MXCSR.
//
//  Note: there is no MXCSR on ARM, but fpcr/fpscr implements similar
//  functionality.
//  https://developer.arm.com/documentation/dui0068/b/vector-floating-point-programming/vfp-system-registers/fpscr--the-floating-point-status-and-control-register
//  We use fpcr/fpscr since it's thread-local, so we can emulate those x86 conversions, albeit slowly.
368 
369 int convertFloatToInt32UsingMXCSR(float value) @trusted
370 {
371     int result;
372     version(GNU)
373     {
374         asm pure nothrow @nogc @trusted
375         {
376             "cvtss2si %1, %0\n": "=r"(result) : "x" (value);
377         }
378     }
379     else static if (LDC_with_ARM32)
380     {
381         result = __asm!int(`vldr s2, $1
382                             vcvtr.s32.f32 s2, s2
383                             vmov $0, s2`, "=r,m,~{s2}", value);
384     }
385     else static if (LDC_with_ARM64)
386     {
387         // Get current rounding mode.
388         uint fpscr = arm_get_fpcr();
389 
390         switch(fpscr & _MM_ROUND_MASK_ARM)
391         {
392             default:
393             case _MM_ROUND_NEAREST_ARM:     result = vcvtns_s32_f32(value); break;
394             case _MM_ROUND_DOWN_ARM:        result = vcvtms_s32_f32(value); break;
395             case _MM_ROUND_UP_ARM:          result = vcvtps_s32_f32(value); break;
396             case _MM_ROUND_TOWARD_ZERO_ARM: result = vcvts_s32_f32(value);  break;
397         }
398     }
399     else
400     {
401         asm pure nothrow @nogc @trusted
402         {
403             cvtss2si EAX, value;
404             mov result, EAX;
405         }
406     }
407     return result;
408 }
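
// Quick sanity check of the above (assumes the default round-to-nearest mode is
// active when this unittest runs; a different MXCSR/FPCR mode would change results).
unittest
{
    // Round-to-nearest-even sends halfway cases to the even integer,
    // unlike cast(int) which truncates toward zero.
    assert(convertFloatToInt32UsingMXCSR(2.5f) == 2);
    assert(convertFloatToInt32UsingMXCSR(3.5f) == 4);
    assert(convertFloatToInt32UsingMXCSR(-2.5f) == -2);
    assert(cast(int)(3.5f) == 3); // truncation, for contrast
}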
409 
410 int convertDoubleToInt32UsingMXCSR(double value) @trusted
411 {
412     int result;
413     version(GNU)
414     {
415         asm pure nothrow @nogc @trusted
416         {
417             "cvtsd2si %1, %0\n": "=r"(result) : "x" (value);
418         }
419     }
420     else static if (LDC_with_ARM32)
421     {
422         result = __asm!int(`vldr d2, $1
423                             vcvtr.s32.f64 s2, d2
424                             vmov $0, s2`, "=r,m,~{s2},~{d2}", value);
425     }
426     else static if (LDC_with_ARM64)
427     {
428         // Get current rounding mode.
429         uint fpscr = arm_get_fpcr();
430 
431         switch(fpscr & _MM_ROUND_MASK_ARM)
432         {
433             default:
434             case _MM_ROUND_NEAREST_ARM:     result = vcvtns_s32_f64(value); break;
435             case _MM_ROUND_DOWN_ARM:        result = vcvtms_s32_f64(value); break;
436             case _MM_ROUND_UP_ARM:          result = vcvtps_s32_f64(value); break;
437             case _MM_ROUND_TOWARD_ZERO_ARM: result = vcvts_s32_f64(value);  break;
438         }
439     }
440     else
441     {
442         asm pure nothrow @nogc @trusted
443         {
444             cvtsd2si EAX, value;
445             mov result, EAX;
446         }
447     }
448     return result;
449 }
450 
451 long convertFloatToInt64UsingMXCSR(float value) @trusted
452 {
453     static if (LDC_with_ARM32)
454     {
455         // We have to resort to libc since 32-bit ARM 
456         // doesn't seem to have 64-bit registers.
457         
458         uint fpscr = arm_get_fpcr(); // Get current rounding mode.
459 
        // Note: convert to double precision first, else rounding could differ for large integers
461         double asDouble = value; 
462 
463         switch(fpscr & _MM_ROUND_MASK_ARM)
464         {
465             default:
466             case _MM_ROUND_NEAREST_ARM:     return cast(long)(llvm_round(asDouble));
467             case _MM_ROUND_DOWN_ARM:        return cast(long)(llvm_floor(asDouble));
468             case _MM_ROUND_UP_ARM:          return cast(long)(llvm_ceil(asDouble));
469             case _MM_ROUND_TOWARD_ZERO_ARM: return cast(long)(asDouble);
470         }
471     }
472     else static if (LDC_with_ARM64)
473     {
474         uint fpscr = arm_get_fpcr();
475 
476         switch(fpscr & _MM_ROUND_MASK_ARM)
477         {
478             default:
479             case _MM_ROUND_NEAREST_ARM:     return vcvtns_s64_f32(value);
480             case _MM_ROUND_DOWN_ARM:        return vcvtms_s64_f32(value);
481             case _MM_ROUND_UP_ARM:          return vcvtps_s64_f32(value);
482             case _MM_ROUND_TOWARD_ZERO_ARM: return vcvts_s64_f32(value);
483         }
484     }
485     // 64-bit can use an SSE instruction
486     else version(D_InlineAsm_X86_64)
487     {
488         long result;
        version(LDC) // work-around for "Data definition directives inside inline asm are not supported yet."
490         {
491             asm pure nothrow @nogc @trusted
492             {
493                 movss XMM0, value;
494                 cvtss2si RAX, XMM0;
495                 mov result, RAX;
496             }
497         }
498         else
499         {
500             asm pure nothrow @nogc @trusted
501             {
502                 movss XMM0, value;
503                 db 0xf3; db 0x48; db 0x0f; db 0x2d; db 0xc0; // cvtss2si RAX, XMM0 (DMD refuses to emit)
504                 mov result, RAX;
505             }
506         }
507         return result;
508     }
509     else version(D_InlineAsm_X86)
510     {
511         // In the case of 32-bit x86 there is no SSE2 way to convert FP to 64-bit int
512         // This leads to an unfortunate FPU sequence in every C++ compiler.
513         // See: https://godbolt.org/z/vZym77
514 
515         // Get current MXCSR rounding
516         uint sseRounding;
517         ushort savedFPUCW;
518         ushort newFPUCW;
519         long result;
520         asm pure nothrow @nogc @trusted
521         {
522             stmxcsr sseRounding;
523             fld value;
524             fnstcw savedFPUCW;
525             mov AX, savedFPUCW;
526             and AX, 0xf3ff;          // clear FPU rounding bits
527             movzx ECX, word ptr sseRounding;
528             and ECX, 0x6000;         // only keep SSE rounding bits
529             shr ECX, 3;
530             or AX, CX;               // make a new control word for FPU with SSE bits
531             mov newFPUCW, AX;
532             fldcw newFPUCW;
533             fistp qword ptr result;            // convert, respecting MXCSR (but not other control word things)
534             fldcw savedFPUCW;
535         }
536         return result;
537     }
538     else static if (GDC_with_x86)
539     {
540         version(X86_64) // 64-bit can just use the right instruction
541         {
542             static assert(GDC_with_SSE);
543             __m128 A;
544             A.ptr[0] = value;
545             return __builtin_ia32_cvtss2si64 (A);
546         }
547         else version(X86) // 32-bit
548         {
549             // This is untested!
550             uint sseRounding;
551             ushort savedFPUCW;
552             ushort newFPUCW;
553             long result;
554             asm pure nothrow @nogc @trusted
555             {
556                 "stmxcsr %1;\n" ~
557                 "fld %2;\n" ~
558                 "fnstcw %3;\n" ~
559                 "movw %3, %%ax;\n" ~
560                 "andw $0xf3ff, %%ax;\n" ~
561                 "movzwl %1, %%ecx;\n" ~
562                 "andl $0x6000, %%ecx;\n" ~
563                 "shrl $3, %%ecx;\n" ~
564                 "orw %%cx, %%ax\n" ~
565                 "movw %%ax, %4;\n" ~
566                 "fldcw %4;\n" ~
567                 "fistpll %0;\n" ~
568                 "fldcw %3;\n" 
569                   : "=m"(result)    // %0
570                   : "m" (sseRounding),
571                     "f" (value),
572                     "m" (savedFPUCW),
573                     "m" (newFPUCW) 
574                   : "eax", "ecx", "st";
575             }
576             return result;
577         }
578         else
579             static assert(false);
580     }
581     else
582         static assert(false);
583 }
584 
585 
586 ///ditto
587 long convertDoubleToInt64UsingMXCSR(double value) @trusted
588 {
589     static if (LDC_with_ARM32)
590     {
591         // We have to resort to libc since 32-bit ARM 
592         // doesn't seem to have 64-bit registers.
593         uint fpscr = arm_get_fpcr(); // Get current rounding mode.
594         switch(fpscr & _MM_ROUND_MASK_ARM)
595         {
596             default:
597             case _MM_ROUND_NEAREST_ARM:     return cast(long)(llvm_round(value));
598             case _MM_ROUND_DOWN_ARM:        return cast(long)(llvm_floor(value));
599             case _MM_ROUND_UP_ARM:          return cast(long)(llvm_ceil(value));
600             case _MM_ROUND_TOWARD_ZERO_ARM: return cast(long)(value);
601         }
602     }
603     else static if (LDC_with_ARM64)
604     {
605         // Get current rounding mode.
606         uint fpscr = arm_get_fpcr();
607 
608         switch(fpscr & _MM_ROUND_MASK_ARM)
609         {
610             default:
611             case _MM_ROUND_NEAREST_ARM:     return vcvtns_s64_f64(value);
612             case _MM_ROUND_DOWN_ARM:        return vcvtms_s64_f64(value);
613             case _MM_ROUND_UP_ARM:          return vcvtps_s64_f64(value);
614             case _MM_ROUND_TOWARD_ZERO_ARM: return vcvts_s64_f64(value);
615         }
616     }
617     // 64-bit can use an SSE instruction
618     else version(D_InlineAsm_X86_64)
619     {
620         long result;
621         version(LDC) // work-around for "Data definition directives inside inline asm are not supported yet."
622         {
623             asm pure nothrow @nogc @trusted
624             {
625                 movsd XMM0, value;
626                 cvtsd2si RAX, XMM0;
627                 mov result, RAX;
628             }
629         }
630         else
631         {
632             asm pure nothrow @nogc @trusted
633             {
634                 movsd XMM0, value;
635                 db 0xf2; db 0x48; db 0x0f; db 0x2d; db 0xc0; // cvtsd2si RAX, XMM0 (DMD refuses to emit)
636                 mov result, RAX;
637             }
638         }
639         return result;
640     }
641     else version(D_InlineAsm_X86)
642     {
643         // In the case of 32-bit x86 there is no SSE2 way to convert FP to 64-bit int
644         // This leads to an unfortunate FPU sequence in every C++ compiler.
645         // See: https://godbolt.org/z/vZym77
646 
647         // Get current MXCSR rounding
648         uint sseRounding;
649         ushort savedFPUCW;
650         ushort newFPUCW;
651         long result;
652         asm pure nothrow @nogc @trusted
653         {
654             stmxcsr sseRounding;
655             fld value;
656             fnstcw savedFPUCW;
657             mov AX, savedFPUCW;
658             and AX, 0xf3ff;
659             movzx ECX, word ptr sseRounding;
660             and ECX, 0x6000;
661             shr ECX, 3;
662             or AX, CX;
663             mov newFPUCW, AX;
664             fldcw newFPUCW;
665             fistp result;
666             fldcw savedFPUCW;
667         }
668         return result;
669     }
670     else static if (GDC_with_x86)
671     {
672         version(X86_64)
673         {
674             static assert(GDC_with_SSE2);
675             __m128d A;
676             A.ptr[0] = value;
677             return __builtin_ia32_cvtsd2si64 (A);
678         }
679         else
680         {
681             // This is untested!
682             uint sseRounding;
683             ushort savedFPUCW;
684             ushort newFPUCW;
685             long result;
686             asm pure nothrow @nogc @trusted
687             {
688                 "stmxcsr %1;\n" ~
689                 "fld %2;\n" ~
690                 "fnstcw %3;\n" ~
691                 "movw %3, %%ax;\n" ~
692                 "andw $0xf3ff, %%ax;\n" ~
693                 "movzwl %1, %%ecx;\n" ~
694                 "andl $0x6000, %%ecx;\n" ~
695                 "shrl $3, %%ecx;\n" ~
696                 "orw %%cx, %%ax\n" ~
697                 "movw %%ax, %4;\n" ~
698                 "fldcw %4;\n" ~
699                 "fistpll %0;\n" ~
700                 "fldcw %3;\n"         
701                   : "=m"(result)    // %0
702                   : "m" (sseRounding),
703                     "t" (value),
704                     "m" (savedFPUCW),
705                     "m" (newFPUCW) 
706                   : "eax", "ecx", "st";
707             }
708             return result;
709         }
710     }
711     else
712         static assert(false);
713 }
714 
715 //
716 //  </ROUNDING>
717 //
718 
719 
// Saturation helpers, using the Intel terminology ("word" = 16 bits, "byte" = 8 bits)
721 
722 byte saturateSignedWordToSignedByte(short value) pure @safe
723 {
724     if (value > 127) value = 127;
725     if (value < -128) value = -128;
726     return cast(byte) value;
727 }
728 
729 ubyte saturateSignedWordToUnsignedByte(short value) pure @safe
730 {
731     if (value > 255) value = 255;
732     if (value < 0) value = 0;
733     return cast(ubyte) value;
734 }
735 
736 short saturateSignedIntToSignedShort(int value) pure @safe
737 {
738     if (value > 32767) value = 32767;
739     if (value < -32768) value = -32768;
740     return cast(short) value;
741 }
742 
743 ushort saturateSignedIntToUnsignedShort(int value) pure @safe
744 {
745     if (value > 65535) value = 65535;
746     if (value < 0) value = 0;
747     return cast(ushort) value;
748 }
749 
750 unittest // test saturate operations
751 {
752     assert( saturateSignedWordToSignedByte(32000) == 127);
753     assert( saturateSignedWordToUnsignedByte(32000) == 255);
754     assert( saturateSignedWordToSignedByte(-4000) == -128);
755     assert( saturateSignedWordToUnsignedByte(-4000) == 0);
756     assert( saturateSignedIntToSignedShort(32768) == 32767);
757     assert( saturateSignedIntToUnsignedShort(32768) == 32768);
758     assert( saturateSignedIntToSignedShort(-32769) == -32768);
759     assert( saturateSignedIntToUnsignedShort(-32769) == 0);
760 }
761 
762 version(unittest)
763 {
764     // This is just for debugging tests
765     import core.stdc.stdio: printf;
766 
    // Helpers to print vectors while implementing intrinsics.
    // Note: you can override `pure` within a `debug` clause
769 
770     void _mm_print_pi64(__m64 v) @trusted
771     {
772         long1 vl = cast(long1)v;
773         printf("%lld\n", vl.array[0]);
774     }
775 
776     void _mm_print_pi32(__m64 v) @trusted
777     {
778         int[2] C = (cast(int2)v).array;
779         printf("%d %d\n", C[0], C[1]);
780     }
781 
782     void _mm_print_pi16(__m64 v) @trusted
783     {
784         short[4] C = (cast(short4)v).array;
785         printf("%d %d %d %d\n", C[0], C[1], C[2], C[3]);
786     }
787 
788     void _mm_print_pi8(__m64 v) @trusted
789     {
790         byte[8] C = (cast(byte8)v).array;
791         printf("%d %d %d %d %d %d %d %d\n",
792         C[0], C[1], C[2], C[3], C[4], C[5], C[6], C[7]);
793     }
794 
795     void _mm_print_epi64(__m128i v) @trusted
796     {
797         long2 vl = cast(long2)v;
798         printf("%lld %lld\n", vl.array[0], vl.array[1]);
799     }
800 
801     void _mm_print_epi32(__m128i v) @trusted
802     {
803         printf("%d %d %d %d\n",
804               v.array[0], v.array[1], v.array[2], v.array[3]);
805     }  
806 
807     void _mm_print_epi16(__m128i v) @trusted
808     {
809         short[8] C = (cast(short8)v).array;
810         printf("%d %d %d %d %d %d %d %d\n",
811         C[0], C[1], C[2], C[3], C[4], C[5], C[6], C[7]);
812     }
813 
814     void _mm_print_epi8(__m128i v) @trusted
815     {
816         byte[16] C = (cast(byte16)v).array;
817         printf("%d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d\n",
818         C[0], C[1], C[2], C[3], C[4], C[5], C[6], C[7], C[8], C[9], C[10], C[11], C[12], C[13], C[14], C[15]);
819     }
820 
821     void _mm_print_ps(__m128 v) @trusted
822     {
823         // %g because %f can conceal very small numbers and prints zero instead
824         float[4] C = (cast(float4)v).array;
825         printf("%g %g %g %g\n", C[0], C[1], C[2], C[3]);
826     }
827 
828     void _mm_print_pd(__m128d v) @trusted
829     {
830         double[2] C = (cast(double2)v).array;
831         printf("%f %f\n", C[0], C[1]);
832     }
833 
834     void _mm256_print_pd(__m256d v) @trusted
835     {
836         // %g because %f can conceal very small numbers and prints zero instead
837         printf("%g %g %g %g\n", v.array[0], v.array[1], v.array[2], v.array[3]); 
838     }
839 
840     void _mm256_print_ps(__m256 v) @trusted
841     {
842         // %g because %f can conceal very small numbers and prints zero instead
843         printf("%g %g %g %g %g %g %g %g\n", 
844             v.array[0], v.array[1], v.array[2], v.array[3],
845             v.array[4], v.array[5], v.array[6], v.array[7]); 
846     }
847 
848     void _mm256_print_epi16(__m256i v) @trusted
849     {
850         short16 vl = cast(short16)v;
851         printf("%d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d\n", 
852                vl.array[0], vl.array[1], vl.array[2], vl.array[3],
853                vl.array[4], vl.array[5], vl.array[6], vl.array[7],
854                vl.array[8], vl.array[9], vl.array[10], vl.array[11],
855                vl.array[12], vl.array[13], vl.array[14], vl.array[15]);
856     }
857 
858     void _mm256_print_epi32(__m256i v) @trusted
859     {
860         int8 vl = cast(int8)v;
861         printf("%d %d %d %d %d %d %d %d\n", vl.array[0], vl.array[1], vl.array[2], vl.array[3],
862                                             vl.array[4], vl.array[5], vl.array[6], vl.array[7]);
863     }
864 
865     void _mm256_print_epi64(__m256i v) @trusted
866     {
867         long4 vl = cast(long4)v;
868         printf("%lld %lld %lld %lld\n", vl.array[0], vl.array[1], vl.array[2], vl.array[3]);
869     }
870 }
871 
872 
873 //
874 //  <FLOATING-POINT COMPARISONS>
875 //
876 // Note: `ldc.simd` cannot express all nuances of FP comparisons, so we
877 //       need different IR generation.
878 
879 enum FPComparison
880 {
881     false_,// always false
882     oeq,   // ordered and equal
883     ogt,   // ordered and greater than
884     oge,   // ordered and greater than or equal
885     olt,   // ordered and less than
886     ole,   // ordered and less than or equal
887     one,   // ordered and not equal
888     ord,   // ordered (no nans)
889     ueq,   // unordered or equal
890     ugt,   // unordered or greater than ("nle")
891     uge,   // unordered or greater than or equal ("nlt")
892     ult,   // unordered or less than ("nge")
893     ule,   // unordered or less than or equal ("ngt")
894     une,   // unordered or not equal ("neq")
895     uno,   // unordered (either nans)
896     true_, // always true
897 }
898 
899 private static immutable string[FPComparison.max+1] FPComparisonToString =
900 [
901     "false",
902     "oeq",
903     "ogt",
904     "oge",
905     "olt",
906     "ole",
907     "one",
908     "ord",
909     "ueq",
910     "ugt",
911     "uge",
912     "ult",
913     "ule",
914     "une",
915     "uno",
916     "true"
917 ];
918 
// Maps an AVX comparison immediate (imm8) to FPComparison
920 FPComparison mapAVXFPComparison(int imm8) pure @safe
921 {
    // Always map to the non-signalling comparison variant
923     static immutable FPComparison[16] mapping =
924     [
925         FPComparison.oeq, // _CMP_EQ_OQ
926         FPComparison.olt, // _CMP_LT_OS
927         FPComparison.ole, // _CMP_LE_OS
928         FPComparison.uno, // _CMP_UNORD_Q
        FPComparison.une, // _CMP_NEQ_UQ // TODO: does it mean not-equal OR unordered?
930         FPComparison.uge, // _CMP_NLT_US
931         FPComparison.ugt, // _CMP_NLE_US
932         FPComparison.ord, // _CMP_ORD_Q
933         FPComparison.ueq,   // _CMP_EQ_UQ  
934         FPComparison.ult,   // _CMP_NGE_US 
935         FPComparison.ule,   // _CMP_NGT_US 
936         FPComparison.false_,// _CMP_FALSE_OQ
937         FPComparison.one,   // _CMP_NEQ_OQ
938         FPComparison.oge,   // _CMP_GE_OS
939         FPComparison.ogt,   // _CMP_GT_OS
940         FPComparison.true_  // _CMP_TRUE_UQ
941     ];
942 
943     return mapping[imm8 & 0x0f]; // note: signalling NaN information is mixed up
944 }
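
// Spot-check the mapping above; the immediates are the _CMP_* constants noted in the table.
unittest
{
    assert(mapAVXFPComparison(0)  == FPComparison.oeq);   // _CMP_EQ_OQ
    assert(mapAVXFPComparison(1)  == FPComparison.olt);   // _CMP_LT_OS
    assert(mapAVXFPComparison(3)  == FPComparison.uno);   // _CMP_UNORD_Q
    assert(mapAVXFPComparison(15) == FPComparison.true_); // _CMP_TRUE_UQ
}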
945 
// Individual scalar comparison: returns true or false.
// Useful for DMD and for testing.
948 private bool compareFloat(T)(FPComparison comparison, T a, T b) pure @safe
949 {
950     bool unordered = isnan(a) || isnan(b);
951     final switch(comparison) with(FPComparison)
952     {
953         case false_: return false;
954         case oeq: return a == b;
955         case ogt: return a > b;
956         case oge: return a >= b;
957         case olt: return a < b;
958         case ole: return a <= b;
959         case one: return !unordered && (a != b); // NaN with != always yields true
960         case ord: return !unordered; 
961         case ueq: return unordered || (a == b);
962         case ugt: return unordered || (a > b);
963         case uge: return unordered || (a >= b);
964         case ult: return unordered || (a < b);
965         case ule: return unordered || (a <= b);
966         case une: return (a != b); // NaN with != always yields true
967         case uno: return unordered;
968         case true_: return true;
969     }
970 }
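
unittest // compareFloat: ordered vs unordered behaviour with a NaN operand
{
    assert(!compareFloat!float(FPComparison.oeq, float.nan, 1.0f));
    assert(!compareFloat!float(FPComparison.one, float.nan, 1.0f));
    assert(!compareFloat!float(FPComparison.ord, float.nan, 1.0f));
    assert( compareFloat!float(FPComparison.ueq, float.nan, 1.0f));
    assert( compareFloat!float(FPComparison.une, float.nan, 1.0f));
    assert( compareFloat!float(FPComparison.uno, float.nan, 1.0f));
}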
971 
972 version(LDC)
973 {
974     /// Provides packed float comparisons
975     package int4 cmpps(FPComparison comparison)(float4 a, float4 b) pure @safe
976     {
977         enum ir = `
978             %cmp = fcmp `~ FPComparisonToString[comparison] ~` <4 x float> %0, %1
979             %r = sext <4 x i1> %cmp to <4 x i32>
980             ret <4 x i32> %r`;
981 
982         return LDCInlineIR!(ir, int4, float4, float4)(a, b);
983     }
984 
985     ///ditto
986     package int8 cmpps256(FPComparison comparison)(float8 a, float8 b) pure @safe
987     {
988         enum ir = `
989             %cmp = fcmp `~ FPComparisonToString[comparison] ~` <8 x float> %0, %1
990             %r = sext <8 x i1> %cmp to <8 x i32>
991             ret <8 x i32> %r`;
992         return LDCInlineIR!(ir, int8, float8, float8)(a, b);
993     }
994 
995     /// Provides packed double comparisons
996     package long2 cmppd(FPComparison comparison)(double2 a, double2 b) pure @safe
997     {
998         enum ir = `
999             %cmp = fcmp `~ FPComparisonToString[comparison] ~` <2 x double> %0, %1
1000             %r = sext <2 x i1> %cmp to <2 x i64>
1001             ret <2 x i64> %r`;
1002 
1003         return LDCInlineIR!(ir, long2, double2, double2)(a, b);
1004     }
1005 
1006     ///ditto 
1007     package long4 cmppd256(FPComparison comparison)(double4 a, double4 b) pure @safe
1008     {
1009         enum ir = `
1010             %cmp = fcmp `~ FPComparisonToString[comparison] ~` <4 x double> %0, %1
1011             %r = sext <4 x i1> %cmp to <4 x i64>
1012             ret <4 x i64> %r`;
1013         return LDCInlineIR!(ir, long4, double4, double4)(a, b);
1014     }
1015 
    /// CMPSS-style comparisons
    /// clang implements it through x86 intrinsics; it is possible with IR alone
    /// but leads to less optimal code.
    /// PERF: try to implement it with __builtin_ia32_cmpss and an immediate 0 to 7.
    /// Not that simple.
1021     package float4 cmpss(FPComparison comparison)(float4 a, float4 b) pure @safe
1022     {
1023         /*
1024         enum ubyte predicateNumber = FPComparisonToX86Predicate[comparison];
1025         enum bool invertOp = (predicateNumber & 0x80) != 0;
1026         static if(invertOp)
1027             return __builtin_ia32_cmpsd(b, a, predicateNumber & 0x7f);
1028         else
1029             return __builtin_ia32_cmpsd(a, b, predicateNumber & 0x7f);
1030         */
1031         enum ir = `
1032             %cmp = fcmp `~ FPComparisonToString[comparison] ~` float %0, %1
1033             %r = sext i1 %cmp to i32
1034             %r2 = bitcast i32 %r to float
1035             ret float %r2`;
1036 
1037         float4 r = a;
1038         r[0] = LDCInlineIR!(ir, float, float, float)(a[0], b[0]);
1039         return r;
1040     }
1041 
    /// CMPSD-style comparisons
    /// clang implements it through x86 intrinsics; it is possible with IR alone
    /// but leads to less optimal code.
    /// PERF: try to implement it with __builtin_ia32_cmpsd and an immediate 0 to 7.
    /// Not that simple.
1047     package double2 cmpsd(FPComparison comparison)(double2 a, double2 b) pure @safe
1048     {
1049         enum ir = `
1050             %cmp = fcmp `~ FPComparisonToString[comparison] ~` double %0, %1
1051             %r = sext i1 %cmp to i64
1052             %r2 = bitcast i64 %r to double
1053             ret double %r2`;
1054 
1055         double2 r = a;
1056         r[0] = LDCInlineIR!(ir, double, double, double)(a[0], b[0]);
1057         return r;
1058     }
1059 }
1060 else
1061 {
1062     /// Provides packed float comparisons
1063     package int4 cmpps(FPComparison comparison)(float4 a, float4 b) pure @trusted
1064     {
1065         int4 result;
1066         foreach(i; 0..4)
1067         {
1068             result.ptr[i] = compareFloat!float(comparison, a.array[i], b.array[i]) ? -1 : 0;
1069         }
1070         return result;
1071     }
1072     ///ditto
1073     package int8 cmpps256(FPComparison comparison)(float8 a, float8 b) pure @trusted
1074     {
1075         int8 result;
1076         foreach(i; 0..8)
1077         {
1078             result.ptr[i] = compareFloat!float(comparison, a.array[i], b.array[i]) ? -1 : 0;
1079         }
1080         return result;
1081     }
1082 
1083     /// Provides packed double comparisons
1084     package long2 cmppd(FPComparison comparison)(double2 a, double2 b) pure @trusted
1085     {
1086         long2 result;
1087         foreach(i; 0..2)
1088         {
1089             result.ptr[i] = compareFloat!double(comparison, a.array[i], b.array[i]) ? -1 : 0;
1090         }
1091         return result;
1092     }
1093     ///ditto
1094     package long4 cmppd256(FPComparison comparison)(double4 a, double4 b) pure @trusted
1095     {
1096         long4 result;
1097         foreach(i; 0..4)
1098         {
1099             result.ptr[i] = compareFloat!double(comparison, a.array[i], b.array[i]) ? -1 : 0;
1100         }
1101         return result;
1102     }
1103 
1104     /// Provides CMPSS-style comparison
1105     package float4 cmpss(FPComparison comparison)(float4 a, float4 b) pure @trusted
1106     {
1107         int4 result = cast(int4)a;
1108         result.ptr[0] = compareFloat!float(comparison, a.array[0], b.array[0]) ? -1 : 0;
1109         return cast(float4)result;
1110     }
1111 
1112     /// Provides CMPSD-style comparison
1113     package double2 cmpsd(FPComparison comparison)(double2 a, double2 b) pure @trusted
1114     {
1115         long2 result = cast(long2)a;
1116         result.ptr[0] = compareFloat!double(comparison, a.array[0], b.array[0]) ? -1 : 0;
1117         return cast(double2)result;
1118     }
1119 }
1120 unittest // cmpps
1121 {
    // Check that all comparison types are working
1123     float4 A = [1, 3, 5, float.nan];
1124     float4 B = [2, 3, 4, 5];
1125 
1126     int4 result_oeq = cmpps!(FPComparison.oeq)(A, B);
1127     int4 result_ogt = cmpps!(FPComparison.ogt)(A, B);
1128     int4 result_oge = cmpps!(FPComparison.oge)(A, B);
1129     int4 result_olt = cmpps!(FPComparison.olt)(A, B);
1130     int4 result_ole = cmpps!(FPComparison.ole)(A, B);
1131     int4 result_one = cmpps!(FPComparison.one)(A, B);
1132     int4 result_ord = cmpps!(FPComparison.ord)(A, B);
1133     int4 result_ueq = cmpps!(FPComparison.ueq)(A, B);
1134     int4 result_ugt = cmpps!(FPComparison.ugt)(A, B);
1135     int4 result_uge = cmpps!(FPComparison.uge)(A, B);
1136     int4 result_ult = cmpps!(FPComparison.ult)(A, B);
1137     int4 result_ule = cmpps!(FPComparison.ule)(A, B);
1138     int4 result_une = cmpps!(FPComparison.une)(A, B);
1139     int4 result_uno = cmpps!(FPComparison.uno)(A, B);
1140 
1141     static immutable int[4] correct_oeq    = [ 0,-1, 0, 0];
1142     static immutable int[4] correct_ogt    = [ 0, 0,-1, 0];
1143     static immutable int[4] correct_oge    = [ 0,-1,-1, 0];
1144     static immutable int[4] correct_olt    = [-1, 0, 0, 0];
1145     static immutable int[4] correct_ole    = [-1,-1, 0, 0];
1146     static immutable int[4] correct_one    = [-1, 0,-1, 0];
1147     static immutable int[4] correct_ord    = [-1,-1,-1, 0];
1148     static immutable int[4] correct_ueq    = [ 0,-1, 0,-1];
1149     static immutable int[4] correct_ugt    = [ 0, 0,-1,-1];
1150     static immutable int[4] correct_uge    = [ 0,-1,-1,-1];
1151     static immutable int[4] correct_ult    = [-1, 0, 0,-1];
1152     static immutable int[4] correct_ule    = [-1,-1, 0,-1];
1153     static immutable int[4] correct_une    = [-1, 0,-1,-1];
1154     static immutable int[4] correct_uno    = [ 0, 0, 0,-1];
1155 
1156     assert(result_oeq.array == correct_oeq);
1157     assert(result_ogt.array == correct_ogt);
1158     assert(result_oge.array == correct_oge);
1159     assert(result_olt.array == correct_olt);
1160     assert(result_ole.array == correct_ole);
1161     assert(result_one.array == correct_one);
1162     assert(result_ord.array == correct_ord);
1163     assert(result_ueq.array == correct_ueq);
1164     assert(result_ugt.array == correct_ugt);
1165     assert(result_uge.array == correct_uge);
1166     assert(result_ult.array == correct_ult);
1167     assert(result_ule.array == correct_ule);
1168     assert(result_une.array == correct_une);
1169     assert(result_uno.array == correct_uno);
1170 }
1171 unittest
1172 {
1173     double2 a = [1, 3];
1174     double2 b = [2, 3];
1175     long2 c = cmppd!(FPComparison.ult)(a, b);
1176     static immutable long[2] correct = [cast(long)(-1), 0];
1177     assert(c.array == correct);
1178 }
1179 unittest // cmpss
1180 {
1181     void testComparison(FPComparison comparison)(float4 A, float4 B)
1182     {
1183         float4 result = cmpss!comparison(A, B);
1184         int4 iresult = cast(int4)result;
1185         int expected = compareFloat!float(comparison, A.array[0], B.array[0]) ? -1 : 0;
1186         assert(iresult.array[0] == expected);
1187         assert(result.array[1] == A.array[1]);
1188         assert(result.array[2] == A.array[2]);
1189         assert(result.array[3] == A.array[3]);
1190     }
1191 
    // Check that all comparison types are working
1193     float4 A = [1, 3, 5, 6];
1194     float4 B = [2, 3, 4, 5];
1195     float4 C = [float.nan, 3, 4, 5];
1196 
1197     testComparison!(FPComparison.oeq)(A, B);
1198     testComparison!(FPComparison.oeq)(A, C);
1199     testComparison!(FPComparison.ogt)(A, B);
1200     testComparison!(FPComparison.ogt)(A, C);
1201     testComparison!(FPComparison.oge)(A, B);
1202     testComparison!(FPComparison.oge)(A, C);
1203     testComparison!(FPComparison.olt)(A, B);
1204     testComparison!(FPComparison.olt)(A, C);
1205     testComparison!(FPComparison.ole)(A, B);
1206     testComparison!(FPComparison.ole)(A, C);
1207     testComparison!(FPComparison.one)(A, B);
1208     testComparison!(FPComparison.one)(A, C);
1209     testComparison!(FPComparison.ord)(A, B);
1210     testComparison!(FPComparison.ord)(A, C);
1211     testComparison!(FPComparison.ueq)(A, B);
1212     testComparison!(FPComparison.ueq)(A, C);
1213     testComparison!(FPComparison.ugt)(A, B);
1214     testComparison!(FPComparison.ugt)(A, C);
1215     testComparison!(FPComparison.uge)(A, B);
1216     testComparison!(FPComparison.uge)(A, C);
1217     testComparison!(FPComparison.ult)(A, B);
1218     testComparison!(FPComparison.ult)(A, C);
1219     testComparison!(FPComparison.ule)(A, B);
1220     testComparison!(FPComparison.ule)(A, C);
1221     testComparison!(FPComparison.une)(A, B);
1222     testComparison!(FPComparison.une)(A, C);
1223     testComparison!(FPComparison.uno)(A, B);
1224     testComparison!(FPComparison.uno)(A, C);
1225 }
1226 unittest // cmpsd
1227 {
1228     void testComparison(FPComparison comparison)(double2 A, double2 B)
1229     {
1230         double2 result = cmpsd!comparison(A, B);
1231         long2 iresult = cast(long2)result;
1232         long expected = compareFloat!double(comparison, A.array[0], B.array[0]) ? -1 : 0;
1233         assert(iresult.array[0] == expected);
1234         assert(result.array[1] == A.array[1]);
1235     }
1236 
    // Check that all comparison types are working
1238     double2 A = [1, 3];
1239     double2 B = [2, 4];
1240     double2 C = [double.nan, 5];
1241 
1242     testComparison!(FPComparison.oeq)(A, B);
1243     testComparison!(FPComparison.oeq)(A, C);
1244     testComparison!(FPComparison.ogt)(A, B);
1245     testComparison!(FPComparison.ogt)(A, C);
1246     testComparison!(FPComparison.oge)(A, B);
1247     testComparison!(FPComparison.oge)(A, C);
1248     testComparison!(FPComparison.olt)(A, B);
1249     testComparison!(FPComparison.olt)(A, C);
1250     testComparison!(FPComparison.ole)(A, B);
1251     testComparison!(FPComparison.ole)(A, C);
1252     testComparison!(FPComparison.one)(A, B);
1253     testComparison!(FPComparison.one)(A, C);
1254     testComparison!(FPComparison.ord)(A, B);
1255     testComparison!(FPComparison.ord)(A, C);
1256     testComparison!(FPComparison.ueq)(A, B);
1257     testComparison!(FPComparison.ueq)(A, C);
1258     testComparison!(FPComparison.ugt)(A, B);
1259     testComparison!(FPComparison.ugt)(A, C);
1260     testComparison!(FPComparison.uge)(A, B);
1261     testComparison!(FPComparison.uge)(A, C);
1262     testComparison!(FPComparison.ult)(A, B);
1263     testComparison!(FPComparison.ult)(A, C);
1264     testComparison!(FPComparison.ule)(A, B);
1265     testComparison!(FPComparison.ule)(A, C);
1266     testComparison!(FPComparison.une)(A, B);
1267     testComparison!(FPComparison.une)(A, C);
1268     testComparison!(FPComparison.uno)(A, B);
1269     testComparison!(FPComparison.uno)(A, C);
1270 }
1271 
1272 //
1273 //  </FLOATING-POINT COMPARISONS>
1274 //
1275 
1276 
1277 __m64 to_m64(__m128i a) pure @trusted
1278 {
1279     long2 la = cast(long2)a;
1280     long1 r = la.array[0];
1281     return r;
1282 }
1283 
1284 __m128i to_m128i(__m64 a) pure @trusted
1285 {
1286   /* Not sufficient to avoid https://issues.dlang.org/show_bug.cgi?id=21474 
1287     
1288     version(DigitalMars) // Workaround for https://issues.dlang.org/show_bug.cgi?id=21474 
1289     {
1290         long2 r = a.array[0];
1291         r.ptr[1] = 0;
1292         return cast(int4)r;
1293     }
1294     else */
1295     {
1296         long2 r = [0, 0];
1297         r.ptr[0] = a.array[0];
1298         return cast(__m128i)r;
1299     }
1300 }
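
// Round-trip check: to_m64 keeps the low 64 bits, and to_m128i zeroes the upper 64 bits.
unittest
{
    long2 la;
    la.ptr[0] = 0x0123_4567_89AB_CDEF;
    la.ptr[1] = -1;
    __m128i B = to_m128i(to_m64(cast(__m128i) la));
    long2 lb = cast(long2) B;
    assert(lb.array[0] == 0x0123_4567_89AB_CDEF);
    assert(lb.array[1] == 0);
}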
1301 
1302 
1303 // ADDITIONAL LLVM INTRINSICS
1304 // Basically LDC didn't add them yet
1305 version(LDC)
1306 {
1307     static if (__VERSION__ >= 2097) // LDC 1.27+
1308     {
1309         pragma(LDC_intrinsic, "llvm.abs.i#")
1310             T inteli_llvm_abs(T)(T val, bool attrib);
1311     }
1312 
1313     static if (__VERSION__ >= 2092) // LDC 1.22+
1314     {
1315         pragma(LDC_intrinsic, "llvm.sadd.sat.i#")
1316             T inteli_llvm_adds(T)(T a, T b) pure @safe;
1317         pragma(LDC_intrinsic, "llvm.ssub.sat.i#")
1318             T inteli_llvm_subs(T)(T a, T b) pure @safe;
1319         pragma(LDC_intrinsic, "llvm.uadd.sat.i#")
1320             T inteli_llvm_addus(T)(T a, T b) pure @safe;
1321         pragma(LDC_intrinsic, "llvm.usub.sat.i#")
1322             T inteli_llvm_subus(T)(T a, T b) pure @safe;
1323     }
1324 }
1325 
1326 // ADDITIONAL x86 INTRINSICS
1327 // Absent from ldc.gccbuiltins_x86 for some reason, but needed.
1328 // https://github.com/ldc-developers/llvm-project/blob/ldc-release/12.x/llvm/include/llvm/IR/IntrinsicsX86.td
1329 static if (LDC_with_SSE41)
1330 {
1331     pragma(LDC_intrinsic, "llvm.x86.sse41.pblendvb")
1332         byte16 __builtin_ia32_pblendvb(byte16, byte16, byte16) pure @safe;
1333 }
1334 
1335 // SOME NEON INTRINSICS
1336 // Emulating some x86 intrinsics needs access to a range of ARM intrinsics.
// Not in the public API, but the simde project exposes them all for the user to use.
1338 // MAYDO: create a new neon.d module, for internal use only.
1339 // MAYDO: port them to ARM32 so that ARM32 can be as fast as ARM64.
1340 static if (LDC_with_ARM64)
1341 {
1342     // VERY USEFUL LINK
1343     // https://github.com/ldc-developers/llvm-project/blob/ldc-release/11.x/llvm/include/llvm/IR/IntrinsicsAArch64.td
1344     // Also: https://developer.arm.com/architectures/instruction-sets/intrinsics/
1345 
1346     pragma(LDC_intrinsic, "llvm.aarch64.crc32cb")
1347         uint __crc32cb(uint a, uint b) pure @safe;
1348 
1349     pragma(LDC_intrinsic, "llvm.aarch64.crc32ch")
1350         uint __crc32ch(uint a, uint b) pure @safe;
1351 
1352     pragma(LDC_intrinsic, "llvm.aarch64.crc32cw")
1353         uint __crc32cw(uint a, uint b) pure @safe;
1354 
1355     pragma(LDC_intrinsic, "llvm.aarch64.crc32cx")
1356         uint __crc32cd(uint a, ulong b) pure @safe;
1357 
1358     //pragma(LDC_intrinsic, "llvm.aarch64.dmb")
    //    uint __dmb(int a) @safe; // didn't find a name in the intrinsic list
1360 
1361     pragma(LDC_intrinsic, "llvm.aarch64.neon.uabd.v16i8")
1362         byte16 vabdq_u8(byte16 a, byte16 b) pure @safe;
1363 
1364     pragma(LDC_intrinsic, "llvm.aarch64.neon.abs.v8i16")
1365         short8 vabsq_s16(short8 a) pure @safe;
1366 
1367     pragma(LDC_intrinsic, "llvm.aarch64.neon.abs.v4i32")
1368         int4 vabsq_s32(int4 a) pure @safe;
1369 
1370     pragma(LDC_intrinsic, "llvm.aarch64.neon.abs.v16i8")
1371         byte16 vabsq_s8(byte16 a) pure @safe;
1372 
1373     byte8 vand_u8(byte8 a, byte8 b) pure @safe
1374     {
1375         return a & b;
1376     }
1377 
1378     long2 vandq_s64(long2 a, long2 b)
1379     {
1380         return a & b;
1381     }
1382 
1383     long2 vbicq_s64(long2 a, long2 b) pure @safe
1384     {
1385         return a & ~b;
1386     }
1387 
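    // The vbslq_* helpers below are bitwise selects: each result bit comes from b
    // where the mask a is 1 and from c where a is 0, i.e. r = (a & b) | (~a & c),
    // written as c ^ ((c ^ b) & a) to use one fewer operation.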
1388     int4 vbslq_s32(int4 a, int4 b, int4 c) pure @safe
1389     {
1390         return c ^ ((c ^ b) & a);
1391     }
1392 
1393     byte16 vbslq_s8(byte16 a, byte16 b, byte16 c) pure @safe
1394     {
1395         return c ^ ((c ^ b) & a);
1396     }
1397 
1398     long2 vbslq_s64(long2 a, long2 b, long2 c) pure @safe
1399     {
1400         return c ^ ((c ^ b) & a);
1401     }
1402 
1403     short8 vcombine_s16(short4 lo, short4 hi) pure @trusted
1404     {
1405         short8 r;
1406         r.ptr[0]  = lo.array[0];
1407         r.ptr[1]  = lo.array[1];
1408         r.ptr[2]  = lo.array[2];
1409         r.ptr[3]  = lo.array[3];
1410         r.ptr[4]  = hi.array[0];
1411         r.ptr[5]  = hi.array[1];
1412         r.ptr[6]  = hi.array[2];
1413         r.ptr[7]  = hi.array[3];
1414         return r;
1415     }
1416 
1417     int4 vcombine_s32(int2 lo, int2 hi) pure @trusted
1418     {
1419         int4 r;
1420         r.ptr[0] = lo.array[0];
1421         r.ptr[1] = lo.array[1];
1422         r.ptr[2] = hi.array[0];
1423         r.ptr[3] = hi.array[1];
1424         return r;
1425     }
1426 
1427     byte16 vcombine_s8(byte8 lo, byte8 hi) pure @trusted
1428     {
1429         byte16 r;
1430         r.ptr[0]  = lo.array[0];
1431         r.ptr[1]  = lo.array[1];
1432         r.ptr[2]  = lo.array[2];
1433         r.ptr[3]  = lo.array[3];
1434         r.ptr[4]  = lo.array[4];
1435         r.ptr[5]  = lo.array[5];
1436         r.ptr[6]  = lo.array[6];
1437         r.ptr[7]  = lo.array[7];
1438         r.ptr[8]  = hi.array[0];
1439         r.ptr[9]  = hi.array[1];
1440         r.ptr[10] = hi.array[2];
1441         r.ptr[11] = hi.array[3];
1442         r.ptr[12] = hi.array[4];
1443         r.ptr[13] = hi.array[5];
1444         r.ptr[14] = hi.array[6];
1445         r.ptr[15] = hi.array[7];
1446         return r;
1447     }
1448 
1449     short8 vcombine_u16(short4 lo, short4 hi) pure @trusted
1450     {
1451         short8 r;
1452         r.ptr[0]  = lo.array[0];
1453         r.ptr[1]  = lo.array[1];
1454         r.ptr[2]  = lo.array[2];
1455         r.ptr[3]  = lo.array[3];
1456         r.ptr[4]  = hi.array[0];
1457         r.ptr[5]  = hi.array[1];
1458         r.ptr[6]  = hi.array[2];
1459         r.ptr[7]  = hi.array[3];
1460         return r;
1461     }
1462 
1463 
1464     // float4 => int4
1465 
1466     pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtms.v4i32.v4f32")
1467         int4 vcvtmq_s32_f32(float4 a) pure @safe;
1468 
1469     pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtns.v4i32.v4f32")
1470         int4 vcvtnq_s32_f32(float4 a) pure @safe;
1471 
1472     pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtps.v4i32.v4f32")
1473         int4 vcvtpq_s32_f32(float4 a) pure @safe;
1474 
1475     pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtzs.v4i32.v4f32")
1476         int4 vcvtzq_s32_f32(float4 a) pure @safe;
1477 
1478 
1479     // double2 => long2
1480 
1481     pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtms.v2i64.v2f64")
1482         long2 vcvtmq_s64_f64(double2 a) pure @safe;
1483 
1484     pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtns.v2i64.v2f64")
1485         long2 vcvtnq_s64_f64(double2 a) pure @safe;
1486 
1487     pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtps.v2i64.v2f64")
1488         long2 vcvtpq_s64_f64(double2 a) pure @safe;
1489 
1490     pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtzs.v2i64.v2f64")
1491         long2 vcvtzq_s64_f64(double2 a) pure @safe;
1492 
    pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtms.i32.f32")
        int vcvtms_s32_f32(float a) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtns.i32.f32")
        int vcvtns_s32_f32(float a) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtps.i32.f32")
        int vcvtps_s32_f32(float a) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtzs.i32.f32")
        int vcvts_s32_f32(float a) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtms.i32.f64")
        int vcvtms_s32_f64(double a) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtns.i32.f64")
        int vcvtns_s32_f64(double a) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtps.i32.f64")
        int vcvtps_s32_f64(double a) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtzs.i32.f64")
        int vcvts_s32_f64(double a) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtms.i64.f32")
        long vcvtms_s64_f32(float a) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtns.i64.f32")
        long vcvtns_s64_f32(float a) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtps.i64.f32")
        long vcvtps_s64_f32(float a) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtzs.i64.f32")
        long vcvts_s64_f32(float a) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtms.i64.f64")
        long vcvtms_s64_f64(double a) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtns.i64.f64")
        long vcvtns_s64_f64(double a) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtps.i64.f64")
        long vcvtps_s64_f64(double a) pure @safe; // Note: technically should be named vcvtpd_s64_f64

    pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtzs.i64.f64")
        long vcvts_s64_f64(double a) pure @safe;
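
    // Hedged sketch (illustrative only): the scalar conversions above differ only in
    // rounding mode: vcvtm* rounds toward -infinity, vcvtn* to nearest (ties to even),
    // vcvtp* toward +infinity, and the fcvtzs-based vcvts_* variants truncate toward zero.
    unittest
    {
        assert(vcvtms_s32_f32( 2.5f) ==  2); // toward -infinity
        assert(vcvtns_s32_f32( 2.5f) ==  2); // to nearest, ties to even
        assert(vcvtps_s32_f32( 2.5f) ==  3); // toward +infinity
        assert(vcvts_s32_f32 (-2.5f) == -2); // toward zero
    }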

    long2 vdupq_n_s64(long value) pure @safe
    {
        long2 r;
        r = value;
        return r;
    }

    short4 vget_high_s16(short8 a) pure @trusted
    {
        short4 r;
        r.ptr[0] = a.array[4];
        r.ptr[1] = a.array[5];
        r.ptr[2] = a.array[6];
        r.ptr[3] = a.array[7];
        return r;
    }

    int2 vget_high_s32(int4 a) pure @trusted
    {
        int2 r;
        r.ptr[0] = a.array[2];
        r.ptr[1] = a.array[3];
        return r;
    }

    byte8 vget_high_u8(byte16 a) pure @trusted
    {
        byte8 r;
        r.ptr[0] = a.array[8];
        r.ptr[1] = a.array[9];
        r.ptr[2] = a.array[10];
        r.ptr[3] = a.array[11];
        r.ptr[4] = a.array[12];
        r.ptr[5] = a.array[13];
        r.ptr[6] = a.array[14];
        r.ptr[7] = a.array[15];
        return r;
    }

    short4 vget_low_s16(short8 a) pure @trusted
    {
        short4 r;
        r.ptr[0] = a.array[0];
        r.ptr[1] = a.array[1];
        r.ptr[2] = a.array[2];
        r.ptr[3] = a.array[3];
        return r;
    }

    int2 vget_low_s32(int4 a) pure @trusted
    {
        int2 r;
        r.ptr[0] = a.array[0];
        r.ptr[1] = a.array[1];
        return r;
    }

    byte8 vget_low_u8(byte16 a) pure @trusted
    {
        byte8 r;
        r.ptr[0] = a.array[0];
        r.ptr[1] = a.array[1];
        r.ptr[2] = a.array[2];
        r.ptr[3] = a.array[3];
        r.ptr[4] = a.array[4];
        r.ptr[5] = a.array[5];
        r.ptr[6] = a.array[6];
        r.ptr[7] = a.array[7];
        return r;
    }
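
    // Hedged sketch (illustrative only): vget_low_* / vget_high_* split a 128-bit
    // vector into 64-bit halves and vcombine_* reassembles them, so the
    // round-trip below is the identity.
    unittest
    {
        short8 a = [0, 1, 2, 3, 4, 5, 6, 7];
        short8 b = vcombine_u16(vget_low_s16(a), vget_high_s16(a));
        assert(b.array == a.array);
    }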

    long vgetq_lane_s64(long2 v, const int lane) pure @safe
    {
        return v.array[lane];
    }

    pragma(LDC_intrinsic, "llvm.aarch64.neon.smax.v8i16")
        short8 vmaxq_s16(short8 a, short8 b) pure @safe;

    int4 vmaxq_s32(int4 a, int4 b) pure @trusted
    {
        int4 r;
        r.ptr[0] = a.array[0] >= b.array[0] ? a.array[0] : b.array[0];
        r.ptr[1] = a.array[1] >= b.array[1] ? a.array[1] : b.array[1];
        r.ptr[2] = a.array[2] >= b.array[2] ? a.array[2] : b.array[2];
        r.ptr[3] = a.array[3] >= b.array[3] ? a.array[3] : b.array[3];
        return r;
    }

    pragma(LDC_intrinsic, "llvm.aarch64.neon.smin.v8i16")
        short8 vminq_s16(short8 a, short8 b) pure @safe;

    int2 vmovn_s64(long2 a) pure @trusted
    {
        int2 r;
        r.ptr[0] = cast(int)(a.array[0]);
        r.ptr[1] = cast(int)(a.array[1]);
        return r;
    }

    int4 vmull_s16(short4 a, short4 b) pure @trusted
    {
        int4 r;
        r.ptr[0] = a.array[0] * b.array[0];
        r.ptr[1] = a.array[1] * b.array[1];
        r.ptr[2] = a.array[2] * b.array[2];
        r.ptr[3] = a.array[3] * b.array[3];
        return r;
    }
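
    // Hedged sketch (illustrative only): vmull_s16 is a widening multiply; each
    // short*short product is kept as a full 32-bit result and cannot wrap in 16 bits.
    unittest
    {
        short4 a = [short.max, -2, 3, 0];
        short4 b = [short.max,  2, 3, 1];
        int4 p = vmull_s16(a, b);
        assert(p.array[0] == 32767 * 32767);
        assert(p.array[1] == -4);
    }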

    pragma(LDC_intrinsic, "llvm.aarch64.neon.smull.v2i64")
        long2 vmull_s32(int2 a, int2 b) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.addp.v4i16")
        short4 vpadd_s16(short4 a, short4 b) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.addp.v2i32")
        int2 vpadd_s32(int2 a, int2 b) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.addp.v8i8")
        byte8 vpadd_u8(byte8 a, byte8 b) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.uaddlp.v8i16.v16i8")
        short8 vpaddlq_u8 (byte16 a) pure @safe;

    static if(__VERSION__ >= 2088) // LDC 1.18 (DMDFE 2.088) started using LLVM 9, which renamed this builtin
    {
        pragma(LDC_intrinsic, "llvm.aarch64.neon.faddp.v4f32")
            float4 vpaddq_f32(float4 a, float4 b) pure @safe;
    }
    else
    {
        pragma(LDC_intrinsic, "llvm.aarch64.neon.addp.v4f32")
            float4 vpaddq_f32(float4 a, float4 b) pure @safe;
    }

    pragma(LDC_intrinsic, "llvm.aarch64.neon.addp.v8i16")
        short8 vpaddq_s16(short8 a, short8 b) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.addp.v16i8")
        byte16 vpaddq_s8(byte16 a, byte16 b) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.addp.v4i32")
        int4 vpaddq_s32(int4 a, int4 b) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.sqadd.v4i16")
        short4 vqadd_s16(short4 a, short4 b) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.sqadd.v8i16")
        short8 vqaddq_s16(short8 a, short8 b) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.sqxtn.v8i8")
        byte8 vqmovn_s16(short8 a) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.sqxtn.v4i16")
        short4 vqmovn_s32(int4 a) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.uqxtn.v4i16")
        short4 vqmovn_u32(int4 a) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.sqxtun.v8i8")
        byte8 vqmovun_s16(short8 a) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.sqsub.v4i16")
        short4 vqsub_s16(short4 a, short4 b) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.sqsub.v8i16")
        short8 vqsubq_s16(short8 a, short8 b) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.tbl1.v16i8")
        byte16 vqtbl1q_s8(byte16 t, byte16 idx) pure @safe;
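
    // Hedged sketch (illustrative only): vqtbl1q_s8 is a byte-wise table lookup;
    // any index outside 0..15 selects zero, which is what makes it usable for
    // emulating pshufb-style shuffles.
    unittest
    {
        byte16 table = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15];
        byte16 index = [15, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 64];
        byte16 r = vqtbl1q_s8(table, index);
        assert(r.array[0] == 15); // in-range index selects the table byte
        assert(r.array[15] == 0); // out-of-range index yields zero
    }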

    pragma(LDC_intrinsic, "llvm.aarch64.neon.urhadd.v16i8")
        byte16 vrhadd_u8(byte16 a, byte16 b) pure @safe; // Note: 128-bit, technically should be named vrhaddq_u8

    pragma(LDC_intrinsic, "llvm.aarch64.neon.urhadd.v8i16")
        short8 vrhadd_u16(short8 a, short8 b) pure @safe; // Note: 128-bit, technically should be named vrhaddq_u16

    pragma(LDC_intrinsic, "llvm.aarch64.neon.rshrn.v4i16")
        short4 vrshrn_n_s32(int4 a, int n) pure @safe;

    byte8 vshr_u8(byte8 a, byte8 b) pure @safe
    {
        return a >>> b;
    }

    byte16 vshrq_n_s8(byte16 a, byte r) pure @safe
    {
        a = a >> byte16(cast(byte)r);
        return a;
    }

    pragma(LDC_intrinsic, "llvm.aarch64.neon.tbl1.v8i8")
        byte8 vtbl1_s8(byte16 t, byte8 idx) pure @safe;
}

version(unittest)
{
    double abs_double(double x) @trusted
    {
        version(LDC)
            return llvm_fabs(x);
        else
        {
            // Clear the sign bit to get |x|, without relying on libm.
            long uf = *cast(long*)(&x);
            uf &= 0x7fffffff_ffffffff;
            return *cast(double*)(&uf);
        }
    }
}
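
// Hedged sketch (illustrative only): quick check of the sign-bit-clearing
// fallback inside abs_double above.
unittest
{
    assert(abs_double(-3.5) == 3.5);
    assert(abs_double( 3.5) == 3.5);
}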

// Needed because core.stdc.math.isnan isn't pure in the old GDC used on Travis CI.

bool isnan(float x) pure @trusted
{
    uint u = *cast(uint*)(&x);
    bool result = ((u & 0x7F800000) == 0x7F800000) && (u & 0x007FFFFF);
    return result;
}
unittest
{
    float x = float.nan;
    assert(isnan(x));

    x = 0;
    assert(!isnan(x));

    x = float.infinity;
    assert(!isnan(x));
}

bool isnan(double x) pure @trusted
{
    ulong u = *cast(ulong*)(&x);
    return ((u & 0x7FF00000_00000000) == 0x7FF00000_00000000) && (u & 0x000FFFFF_FFFFFFFF);
}
unittest
{
    double x = double.nan;
    assert(isnan(x));

    x = 0;
    assert(!isnan(x));

    x = double.infinity;
    assert(!isnan(x));
}