1 /**
2 * Internal stuff only, do not import.
3 *
4 * Copyright: Copyright Guillaume Piolat 2016-2020, Stefanos Baziotis 2019.
5 * License:   $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
6 */
7 module inteli.internals;
8 
9 import inteli.types;
10 
11 // The only math functions needed for intel-intrinsics
12 public import core.math: sqrt; // since it's an intrinsics
13 
14 package:
15 nothrow:
16 @nogc:
17 
18 
// Compile-time feature flags for GDC.
// Every `GDC_with_*` enum is declared in every branch (including the non-GDC one),
// so code such as `static if (GDC_with_x86)` compiles whatever the compiler/target is.
version(GNU)
{
    version (X86)
    {
        // For 32-bit x86, disable vector extensions with GDC.
        // It just doesn't work well.
        // Note: GDC_with_x86 stays `true` here; only the vector feature flags are off.
        enum GDC_with_x86 = true;
        enum GDC_with_MMX = false;
        enum GDC_with_SSE = false;
        enum GDC_with_SSE2 = false;
        enum GDC_with_SSE3 = false;
        enum GDC_with_SSSE3 = false;
        enum GDC_with_SSE41 = false;
        enum GDC_with_SSE42 = false;
        enum GDC_with_SHA = false;
    }
    else version (X86_64)
    {
        // GDC support uses extended inline assembly:
        //   https://gcc.gnu.org/onlinedocs/gcc/Extended-Asm.html        (general information and hints)
        //   https://gcc.gnu.org/onlinedocs/gcc/Simple-Constraints.html  (binding variables to registers)
        //   https://gcc.gnu.org/onlinedocs/gcc/Machine-Constraints.html (x86 specific register short names)

        public import core.simd;

        // NOTE: These intrinsics are not available in every i386 and x86_64 CPU.
        // For more info: https://gcc.gnu.org/onlinedocs/gcc-4.9.2/gcc/X86-Built-in-Functions.html
        public import gcc.builtins;

        enum GDC_with_x86 = true;
        enum GDC_with_MMX = true; // We don't have a way to detect that at CT, but we assume it's there
        enum GDC_with_SSE = true; // We don't have a way to detect that at CT, but we assume it's there
        enum GDC_with_SSE2 = true; // We don't have a way to detect that at CT, but we assume it's there

        // Everything past SSE2 is conservatively reported as unavailable.
        enum GDC_with_SSE3 = false; // TODO: we don't have a way to detect that at CT
        enum GDC_with_SSSE3 = false; // TODO: we don't have a way to detect that at CT
        enum GDC_with_SSE41 = false; // TODO: we don't have a way to detect that at CT
        enum GDC_with_SSE42 = false; // TODO: we don't have a way to detect that at CT
        enum GDC_with_SHA = false;
    }
    else
    {
        // GDC targeting something other than x86/x86_64: no x86 features at all.
        enum GDC_with_x86 = false;
        enum GDC_with_MMX = false;
        enum GDC_with_SSE = false;
        enum GDC_with_SSE2 = false;
        enum GDC_with_SSE3 = false;
        enum GDC_with_SSSE3 = false;
        enum GDC_with_SSE41 = false;
        enum GDC_with_SSE42 = false;
        enum GDC_with_SHA = false;
    }
}
else
{
    // Not GDC: all GDC flags are false.
    enum GDC_with_x86 = false;
    enum GDC_with_MMX = false;
    enum GDC_with_SSE = false;
    enum GDC_with_SSE2 = false;
    enum GDC_with_SSE3 = false;
    enum GDC_with_SSSE3 = false;
    enum GDC_with_SSE41 = false;
    enum GDC_with_SSE42 = false;
    enum GDC_with_SHA = false;
}
84 
// Compile-time feature flags and imports for LDC.
// Every `LDC_with_*` enum is declared in every branch (including the non-LDC one),
// so `static if (LDC_with_...)` tests compile whatever the compiler/target is.
version(LDC)
{
    public import core.simd;
    public import ldc.simd;
    public import ldc.intrinsics;
    public import ldc.llvmasm: __asm;

    // Since LDC 1.13, using the new ldc.llvmasm.__ir variants instead of inlineIR
    static if (__VERSION__ >= 2083)
    {
         import ldc.llvmasm;
         alias LDCInlineIR = __ir_pure;

         // A version of inline IR with prefix/suffix didn't exist before LDC 1.13
         alias LDCInlineIREx = __irEx_pure;
    }
    else
    {
        // Note: `LDCInlineIREx` is NOT defined in this branch.
        alias LDCInlineIR = inlineIR;
    }

    version(ARM)
    {
        // 32-bit ARM: no x86 feature is reported.
        public import ldc.gccbuiltins_arm;
        enum LDC_with_ARM32 = true;
        enum LDC_with_ARM64 = false;
        enum LDC_with_SSE1 = false;
        enum LDC_with_SSE2 = false;
        enum LDC_with_SSE3 = false;
        enum LDC_with_SSSE3 = false;
        enum LDC_with_SSE41 = false;
        enum LDC_with_SSE42 = false;
        enum LDC_with_AVX = false;
        enum LDC_with_AVX2 = false;
        enum LDC_with_SHA = false;
    }
    else version(AArch64)
    {
        // 64-bit ARM: no x86 feature is reported, and no builtins module is imported here.
        enum LDC_with_ARM32 = false;
        enum LDC_with_ARM64 = true;
        enum LDC_with_SSE1 = false;
        enum LDC_with_SSE2 = false;
        enum LDC_with_SSE3 = false;
        enum LDC_with_SSSE3 = false;
        enum LDC_with_SSE41 = false;
        enum LDC_with_SSE42 = false;
        enum LDC_with_AVX = false;
        enum LDC_with_AVX2 = false;
        enum LDC_with_SHA = false;
    }
    else
    {
        // Any other LDC target is treated as x86: each flag is queried from the
        // compiler's target description at compile time.
        public import ldc.gccbuiltins_x86;
        enum LDC_with_ARM32 = false;
        enum LDC_with_ARM64 = false;
        enum LDC_with_SSE1 = __traits(targetHasFeature, "sse");
        enum LDC_with_SSE2 = __traits(targetHasFeature, "sse2");
        enum LDC_with_SSE3 = __traits(targetHasFeature, "sse3");
        enum LDC_with_SSSE3 = __traits(targetHasFeature, "ssse3");
        enum LDC_with_SSE41 = __traits(targetHasFeature, "sse4.1");
        enum LDC_with_SSE42 = __traits(targetHasFeature, "sse4.2");
        enum LDC_with_AVX = __traits(targetHasFeature, "avx");
        enum LDC_with_AVX2 = __traits(targetHasFeature, "avx2");
        enum LDC_with_SHA = __traits(targetHasFeature, "sha");
    }
}
else
{
    // Not LDC: all LDC flags are false.
    enum LDC_with_ARM32 = false;
    enum LDC_with_ARM64 = false;
    enum LDC_with_SSE1 = false;
    enum LDC_with_SSE2 = false;
    enum LDC_with_SSE3 = false;
    enum LDC_with_SSSE3 = false;
    enum LDC_with_SSE41 = false;
    enum LDC_with_SSE42 = false;
    enum LDC_with_AVX = false;
    enum LDC_with_AVX2 = false;
    enum LDC_with_SHA = false;
}

// Set when LDC targets either flavour of ARM (32-bit or 64-bit).
enum LDC_with_ARM = LDC_with_ARM32 | LDC_with_ARM64;
167 
// Compile-time feature flags for DMD. All three `DMD_with_*` enums are declared
// in both branches, so they can be tested with `static if` under any compiler.
version(DigitalMars)
{
    // True when DMD-style inline assembly is available, in 32-bit or 64-bit flavour.
    version(D_InlineAsm_X86)
        enum DMD_with_asm = true;
    else version(D_InlineAsm_X86_64)
        enum DMD_with_asm = true;
    else
        enum DMD_with_asm = false;

    version(D_InlineAsm_X86)
        enum DMD_with_32bit_asm = DMD_with_asm; // sometimes you want a 32-bit DMD only solution
    else
        enum DMD_with_32bit_asm = false;

    // True only when D_SIMD is set and the SSE-sized vector types are not emulated.
    // NOTE(review): `SSESizedVectorsAreEmulated` is not declared in this file;
    // presumably it comes from `inteli.types` -- confirm.
    version (D_SIMD)
        enum DMD_with_DSIMD = !SSESizedVectorsAreEmulated;
    else
        enum DMD_with_DSIMD = false;
}
else
{
    // Not DMD: all DMD flags are false.
    enum DMD_with_asm = false;
    enum DMD_with_32bit_asm = false;
    enum DMD_with_DSIMD = false;
}
193 
// Control-word accessors for LDC on 32-bit ARM.
// They carry the same names as the AArch64 accessors, but forward to the
// FPSCR builtins (`__builtin_arm_get_fpscr` / `__builtin_arm_set_fpscr`).
static if (LDC_with_ARM32)
{
    /// Returns the value produced by `__builtin_arm_get_fpscr()`.
    package uint arm_get_fpcr() nothrow @nogc @trusted
    {
        return __builtin_arm_get_fpscr();
    }

    /// Passes `cw` to `__builtin_arm_set_fpscr()`.
    package void arm_set_fpcr(uint cw) nothrow @nogc @trusted
    {
        __builtin_arm_set_fpscr(cw);
    }
}
206 
// Control-word accessors for LDC on AArch64, implemented with inline assembly.
static if (LDC_with_ARM64)
{
    // Declaration of the LLVM intrinsic. It is not called by the accessors below
    // (see the comment in arm_get_fpcr).
    pragma(LDC_intrinsic, "llvm.aarch64.get.fpcr")
        long __builtin_aarch64_get_fpcr() pure nothrow @nogc @safe;

    /// Reads FPCR with an `mrs` instruction and returns it as a `uint`.
    package uint arm_get_fpcr() pure nothrow @nogc @trusted
    {
        // LLVM intrinsic "llvm.aarch64.get.fpcr" seems buggy and doesn't return FPCR
        return __asm!uint("mrs $0, fpcr", "=r");
    }

    /// Writes `cw` to FPCR with an `msr` instruction.
    package void arm_set_fpcr(uint cw) nothrow @nogc @trusted
    {
        // Note: there doesn't seem to be an intrinsic in LLVM to set FPCR.
        // x2 is used as a scratch register: it is spilled to memory operand $1,
        // loaded with `cw` from memory operand $0, written to FPCR, then reloaded from $1.
        // NOTE(review): no clobber is declared in the constraint string; the sequence
        // relies on the manual save/restore of x2 -- confirm this is sufficient.
        long save_x2;
        __asm!void("str x2, $1 \n" ~
                   "ldr w2, $0 \n" ~
                   "msr fpcr, x2 \n" ~
                   "ldr x2, $1 "   , "m,m", cw, &save_x2);
    }
}
228 
229 
// For internal use only, since the public API deals with an emulation of x86 semantics.
// The rounding constants are compared against `arm_get_fpcr() & _MM_ROUND_MASK_ARM`
// by the conversion helpers in this module.
enum uint _MM_ROUND_NEAREST_ARM     = 0x00000000;
enum uint _MM_ROUND_DOWN_ARM        = 0x00800000;
enum uint _MM_ROUND_UP_ARM          = 0x00400000;
enum uint _MM_ROUND_TOWARD_ZERO_ARM = 0x00C00000;
enum uint _MM_ROUND_MASK_ARM        = 0x00C00000; // union of the rounding-mode bits above
enum uint _MM_FLUSH_ZERO_MASK_ARM = 0x01000000;
237 
238 
239 //
240 //  <ROUNDING>
241 //
//  Why is that there? For DMD, we cannot use rint because _MM_SET_ROUNDING_MODE
//  doesn't change the FPU rounding mode, and isn't expected to do so.
//  So we devised these rounding functions to help keep rounding consistent between
//  LDC and DMD. It's important that DMD uses what is in MXCSR to round.
//
//  Note: There is no MXCSR on ARM, but fpcr/fpscr implements similar
//  functionality.
//  https://developer.arm.com/documentation/dui0068/b/vector-floating-point-programming/vfp-system-registers/fpscr--the-floating-point-status-and-control-register
//  We use fpcr/fpscr since it's thread-local, so we can emulate those x86 conversions, albeit slowly.
251 
/// Converts a `float` to a 32-bit signed integer, rounding according to the
/// rounding mode currently selected in MXCSR (x86) or in FPSCR/FPCR (ARM).
///
/// Params:
///     value = the single-precision value to convert.
/// Returns: the converted integer.
int convertFloatToInt32UsingMXCSR(float value) @trusted
{
    int result;
    version(GNU)
    {
        // GDC extended asm: `value` is bound to an SSE register, `result` to a GPR.
        asm pure nothrow @nogc @trusted
        {
            "cvtss2si %1, %0\n": "=r"(result) : "x" (value);
        }
    }
    else static if (LDC_with_ARM32)
    {
        // The sequence uses s2 as a scratch register, so s2 is declared as clobbered;
        // otherwise the optimizer is free to keep a live value in it across the asm.
        result = __asm!int(`vldr s2, $1
                            vcvtr.s32.f32 s2, s2
                            vmov $0, s2`, "=r,m,~{s2}", value);
    }
    else static if (LDC_with_ARM64)
    {
        // Get current rounding mode.
        uint fpscr = arm_get_fpcr();

        // Pick the conversion that matches the rounding-mode bits.
        switch(fpscr & _MM_ROUND_MASK_ARM)
        {
            default:
            case _MM_ROUND_NEAREST_ARM:     result = vcvtns_s32_f32(value); break;
            case _MM_ROUND_DOWN_ARM:        result = vcvtms_s32_f32(value); break;
            case _MM_ROUND_UP_ARM:          result = vcvtps_s32_f32(value); break;
            case _MM_ROUND_TOWARD_ZERO_ARM: result = vcvts_s32_f32(value);  break;
        }
    }
    else
    {
        // DMD-style inline asm (x86 / x86_64).
        asm pure nothrow @nogc @trusted
        {
            cvtss2si EAX, value;
            mov result, EAX;
        }
    }
    return result;
}
293 
/// Converts a `double` to a 32-bit signed integer, rounding according to the
/// rounding mode currently selected in MXCSR (x86) or in FPSCR/FPCR (ARM).
///
/// Params:
///     value = the double-precision value to convert.
/// Returns: the converted integer.
int convertDoubleToInt32UsingMXCSR(double value) @trusted
{
    int result;
    version(GNU)
    {
        // GDC extended asm: `value` is bound to an SSE register, `result` to a GPR.
        asm pure nothrow @nogc @trusted
        {
            "cvtsd2si %1, %0\n": "=r"(result) : "x" (value);
        }
    }
    else static if (LDC_with_ARM32)
    {
        // The sequence uses d2 and s2 as scratch registers, so both are declared as
        // clobbered; otherwise the optimizer is free to keep live values in them.
        result = __asm!int(`vldr d2, $1
                            vcvtr.s32.f64 s2, d2
                            vmov $0, s2`, "=r,m,~{s2},~{d2}", value);
    }
    else static if (LDC_with_ARM64)
    {
        // Get current rounding mode.
        uint fpscr = arm_get_fpcr();

        // Pick the conversion that matches the rounding-mode bits.
        switch(fpscr & _MM_ROUND_MASK_ARM)
        {
            default:
            case _MM_ROUND_NEAREST_ARM:     result = vcvtns_s32_f64(value); break;
            case _MM_ROUND_DOWN_ARM:        result = vcvtms_s32_f64(value); break;
            case _MM_ROUND_UP_ARM:          result = vcvtps_s32_f64(value); break;
            case _MM_ROUND_TOWARD_ZERO_ARM: result = vcvts_s32_f64(value);  break;
        }
    }
    else
    {
        // DMD-style inline asm (x86 / x86_64).
        asm pure nothrow @nogc @trusted
        {
            cvtsd2si EAX, value;
            mov result, EAX;
        }
    }
    return result;
}
335 
/// Converts a `float` to a 64-bit signed integer, rounding according to the
/// rounding mode currently selected in MXCSR (x86) or in FPSCR/FPCR (ARM).
///
/// Params:
///     value = the single-precision value to convert.
/// Returns: the converted integer.
long convertFloatToInt64UsingMXCSR(float value) @trusted
{
    static if (LDC_with_ARM32)
    {
        // We have to resort to libc since 32-bit ARM 
        // doesn't seem to have 64-bit registers.
        
        uint fpscr = arm_get_fpcr(); // Get current rounding mode.

        // Note: converting to double precision else rounding could be different for large integers
        double asDouble = value; 

        switch(fpscr & _MM_ROUND_MASK_ARM)
        {
            default:
            // Round-to-nearest must break ties to even, like cvtss2si and the AArch64
            // vcvtns_* path. llvm_round breaks ties away from zero (2.5 -> 3), so
            // llvm_nearbyint is used instead; in this branch the active rounding mode
            // is round-to-nearest, hence ties go to even (2.5 -> 2).
            case _MM_ROUND_NEAREST_ARM:     return cast(long)(llvm_nearbyint(asDouble));
            case _MM_ROUND_DOWN_ARM:        return cast(long)(llvm_floor(asDouble));
            case _MM_ROUND_UP_ARM:          return cast(long)(llvm_ceil(asDouble));
            case _MM_ROUND_TOWARD_ZERO_ARM: return cast(long)(asDouble);
        }
    }
    else static if (LDC_with_ARM64)
    {
        // Get current rounding mode and pick the matching conversion.
        uint fpscr = arm_get_fpcr();

        switch(fpscr & _MM_ROUND_MASK_ARM)
        {
            default:
            case _MM_ROUND_NEAREST_ARM:     return vcvtns_s64_f32(value);
            case _MM_ROUND_DOWN_ARM:        return vcvtms_s64_f32(value);
            case _MM_ROUND_UP_ARM:          return vcvtps_s64_f32(value);
            case _MM_ROUND_TOWARD_ZERO_ARM: return vcvts_s64_f32(value);
        }
    }
    // 64-bit can use an SSE instruction
    else version(D_InlineAsm_X86_64)
    {
        long result;
        version(LDC) // work-around for " Data definition directives inside inline asm are not supported yet."
        {
            asm pure nothrow @nogc @trusted
            {
                movss XMM0, value;
                cvtss2si RAX, XMM0;
                mov result, RAX;
            }
        }
        else
        {
            asm pure nothrow @nogc @trusted
            {
                movss XMM0, value;
                db 0xf3; db 0x48; db 0x0f; db 0x2d; db 0xc0; // cvtss2si RAX, XMM0 (DMD refuses to emit)
                mov result, RAX;
            }
        }
        return result;
    }
    else version(D_InlineAsm_X86)
    {
        // In the case of 32-bit x86 there is no SSE2 way to convert FP to 64-bit int
        // This leads to an unfortunate FPU sequence in every C++ compiler.
        // See: https://godbolt.org/z/vZym77

        // Get current MXCSR rounding
        uint sseRounding;
        ushort savedFPUCW;
        ushort newFPUCW;
        long result;
        asm pure nothrow @nogc @trusted
        {
            stmxcsr sseRounding;
            fld value;
            fnstcw savedFPUCW;
            mov AX, savedFPUCW;
            and AX, 0xf3ff;          // clear FPU rounding bits
            movzx ECX, word ptr sseRounding;
            and ECX, 0x6000;         // only keep SSE rounding bits
            shr ECX, 3;
            or AX, CX;               // make a new control word for FPU with SSE bits
            mov newFPUCW, AX;
            fldcw newFPUCW;
            fistp qword ptr result;            // convert, respecting MXCSR (but not other control word things)
            fldcw savedFPUCW;
        }
        return result;
    }
    else static if (GDC_with_x86)
    {
        version(X86_64) // 64-bit can just use the right instruction
        {
            static assert(GDC_with_SSE);
            __m128 A;
            A.ptr[0] = value;
            return __builtin_ia32_cvtss2si64 (A);
        }
        else version(X86) // 32-bit
        {
            // This is untested!
            // Same x87 sequence as the DMD 32-bit path, in GCC extended asm syntax.
            // NOTE(review): `value` is bound with the "f" constraint here, whereas
            // convertDoubleToInt64UsingMXCSR uses "t" for the same sequence -- confirm
            // which one is intended before relying on this path.
            uint sseRounding;
            ushort savedFPUCW;
            ushort newFPUCW;
            long result;
            asm pure nothrow @nogc @trusted
            {
                "stmxcsr %1;\n" ~
                "fld %2;\n" ~
                "fnstcw %3;\n" ~
                "movw %3, %%ax;\n" ~
                "andw $0xf3ff, %%ax;\n" ~
                "movzwl %1, %%ecx;\n" ~
                "andl $0x6000, %%ecx;\n" ~
                "shrl $3, %%ecx;\n" ~
                "orw %%cx, %%ax\n" ~
                "movw %%ax, %4;\n" ~
                "fldcw %4;\n" ~
                "fistpll %0;\n" ~
                "fldcw %3;\n" 
                  : "=m"(result)    // %0
                  : "m" (sseRounding),
                    "f" (value),
                    "m" (savedFPUCW),
                    "m" (newFPUCW) 
                  : "eax", "ecx", "st";
            }
            return result;
        }
        else
            static assert(false);
    }
    else
        static assert(false);
}
469 
470 
/// Converts a `double` to a 64-bit signed integer, rounding according to the
/// rounding mode currently selected in MXCSR (x86) or in FPSCR/FPCR (ARM).
///
/// Params:
///     value = the double-precision value to convert.
/// Returns: the converted integer.
long convertDoubleToInt64UsingMXCSR(double value) @trusted
{
    static if (LDC_with_ARM32)
    {
        // We have to resort to libc since 32-bit ARM 
        // doesn't seem to have 64-bit registers.
        uint fpscr = arm_get_fpcr(); // Get current rounding mode.
        switch(fpscr & _MM_ROUND_MASK_ARM)
        {
            default:
            // Round-to-nearest must break ties to even, like cvtsd2si and the AArch64
            // vcvtns_* path. llvm_round breaks ties away from zero (2.5 -> 3), so
            // llvm_nearbyint is used instead; in this branch the active rounding mode
            // is round-to-nearest, hence ties go to even (2.5 -> 2).
            case _MM_ROUND_NEAREST_ARM:     return cast(long)(llvm_nearbyint(value));
            case _MM_ROUND_DOWN_ARM:        return cast(long)(llvm_floor(value));
            case _MM_ROUND_UP_ARM:          return cast(long)(llvm_ceil(value));
            case _MM_ROUND_TOWARD_ZERO_ARM: return cast(long)(value);
        }
    }
    else static if (LDC_with_ARM64)
    {
        // Get current rounding mode.
        uint fpscr = arm_get_fpcr();

        // Pick the conversion that matches the rounding-mode bits.
        switch(fpscr & _MM_ROUND_MASK_ARM)
        {
            default:
            case _MM_ROUND_NEAREST_ARM:     return vcvtns_s64_f64(value);
            case _MM_ROUND_DOWN_ARM:        return vcvtms_s64_f64(value);
            case _MM_ROUND_UP_ARM:          return vcvtps_s64_f64(value);
            case _MM_ROUND_TOWARD_ZERO_ARM: return vcvts_s64_f64(value);
        }
    }
    // 64-bit can use an SSE instruction
    else version(D_InlineAsm_X86_64)
    {
        long result;
        version(LDC) // work-around for "Data definition directives inside inline asm are not supported yet."
        {
            asm pure nothrow @nogc @trusted
            {
                movsd XMM0, value;
                cvtsd2si RAX, XMM0;
                mov result, RAX;
            }
        }
        else
        {
            asm pure nothrow @nogc @trusted
            {
                movsd XMM0, value;
                db 0xf2; db 0x48; db 0x0f; db 0x2d; db 0xc0; // cvtsd2si RAX, XMM0 (DMD refuses to emit)
                mov result, RAX;
            }
        }
        return result;
    }
    else version(D_InlineAsm_X86)
    {
        // In the case of 32-bit x86 there is no SSE2 way to convert FP to 64-bit int
        // This leads to an unfortunate FPU sequence in every C++ compiler.
        // See: https://godbolt.org/z/vZym77

        // Get current MXCSR rounding
        uint sseRounding;
        ushort savedFPUCW;
        ushort newFPUCW;
        long result;
        asm pure nothrow @nogc @trusted
        {
            stmxcsr sseRounding;
            fld value;
            fnstcw savedFPUCW;
            mov AX, savedFPUCW;
            and AX, 0xf3ff;          // clear FPU rounding bits
            movzx ECX, word ptr sseRounding;
            and ECX, 0x6000;         // only keep SSE rounding bits
            shr ECX, 3;
            or AX, CX;               // make a new control word for FPU with SSE bits
            mov newFPUCW, AX;
            fldcw newFPUCW;
            fistp result;            // convert using the rounding bits copied from MXCSR
            fldcw savedFPUCW;        // restore the original FPU control word
        }
        return result;
    }
    else static if (GDC_with_x86)
    {
        version(X86_64)
        {
            // 64-bit can just use the right instruction
            static assert(GDC_with_SSE2);
            __m128d A;
            A.ptr[0] = value;
            return __builtin_ia32_cvtsd2si64 (A);
        }
        else
        {
            // This is untested!
            // Same x87 sequence as the DMD 32-bit path, in GCC extended asm syntax.
            // NOTE(review): `value` is bound with the "t" constraint here, whereas
            // convertFloatToInt64UsingMXCSR uses "f" for the same sequence -- confirm
            // which one is intended before relying on this path.
            uint sseRounding;
            ushort savedFPUCW;
            ushort newFPUCW;
            long result;
            asm pure nothrow @nogc @trusted
            {
                "stmxcsr %1;\n" ~
                "fld %2;\n" ~
                "fnstcw %3;\n" ~
                "movw %3, %%ax;\n" ~
                "andw $0xf3ff, %%ax;\n" ~
                "movzwl %1, %%ecx;\n" ~
                "andl $0x6000, %%ecx;\n" ~
                "shrl $3, %%ecx;\n" ~
                "orw %%cx, %%ax\n" ~
                "movw %%ax, %4;\n" ~
                "fldcw %4;\n" ~
                "fistpll %0;\n" ~
                "fldcw %3;\n"         
                  : "=m"(result)    // %0
                  : "m" (sseRounding),
                    "t" (value),
                    "m" (savedFPUCW),
                    "m" (newFPUCW) 
                  : "eax", "ecx", "st";
            }
            return result;
        }
    }
    else
        static assert(false);
}
599 
600 //
601 //  </ROUNDING>
602 //
603 
604 
605 // using the Intel terminology here
606 
/// Clamps a signed 16-bit value to the signed 8-bit range [-128, 127].
byte saturateSignedWordToSignedByte(short value) pure @safe
{
    enum short lowest  = byte.min;
    enum short highest = byte.max;

    if (value > highest)
        return byte.max;
    if (value < lowest)
        return byte.min;
    return cast(byte) value;
}
613 
/// Clamps a signed 16-bit value to the unsigned 8-bit range [0, 255].
ubyte saturateSignedWordToUnsignedByte(short value) pure @safe
{
    if (value < 0)
        return ubyte.min;
    if (value > ubyte.max)
        return ubyte.max;
    return cast(ubyte) value;
}
620 
/// Clamps a signed 32-bit value to the signed 16-bit range [-32768, 32767].
short saturateSignedIntToSignedShort(int value) pure @safe
{
    if (value > short.max)
        return short.max;
    if (value < short.min)
        return short.min;
    return cast(short) value;
}
627 
/// Clamps a signed 32-bit value to the unsigned 16-bit range [0, 65535].
ushort saturateSignedIntToUnsignedShort(int value) pure @safe
{
    if (value < 0)
        return ushort.min;
    if (value > ushort.max)
        return ushort.max;
    return cast(ushort) value;
}
634 
unittest // test saturate operations
{
    // signed word -> signed byte
    assert(saturateSignedWordToSignedByte(32000) == 127);
    assert(saturateSignedWordToSignedByte(-4000) == -128);

    // signed word -> unsigned byte
    assert(saturateSignedWordToUnsignedByte(32000) == 255);
    assert(saturateSignedWordToUnsignedByte(-4000) == 0);

    // signed int -> signed short
    assert(saturateSignedIntToSignedShort(32768) == 32767);
    assert(saturateSignedIntToSignedShort(-32769) == -32768);

    // signed int -> unsigned short (32768 is in range and must pass through unchanged)
    assert(saturateSignedIntToUnsignedShort(32768) == 32768);
    assert(saturateSignedIntToUnsignedShort(-32769) == 0);
}
646 
version(unittest)
{
    // Debugging aids for tests: each helper prints the lanes of a vector on one line.
    import core.stdc.stdio: printf;

    // Note: you can override `pure` within a `debug` clause

    /// Prints `v` as one 64-bit integer.
    void _mm_print_pi64(__m64 v) @trusted
    {
        long1 lanes = cast(long1)v;
        printf("%lld\n", lanes.array[0]);
    }

    /// Prints `v` as two 32-bit integers.
    void _mm_print_pi32(__m64 v) @trusted
    {
        int2 lanes = cast(int2)v;
        printf("%d %d\n", lanes.array[0], lanes.array[1]);
    }

    /// Prints `v` as four 16-bit integers.
    void _mm_print_pi16(__m64 v) @trusted
    {
        short4 lanes = cast(short4)v;
        short[4] s = lanes.array;
        printf("%d %d %d %d\n", s[0], s[1], s[2], s[3]);
    }

    /// Prints `v` as eight 8-bit integers.
    void _mm_print_pi8(__m64 v) @trusted
    {
        byte8 lanes = cast(byte8)v;
        byte[8] b = lanes.array;
        printf("%d %d %d %d %d %d %d %d\n",
               b[0], b[1], b[2], b[3],
               b[4], b[5], b[6], b[7]);
    }

    /// Prints `v` as two 64-bit integers.
    void _mm_print_epi64(__m128i v) @trusted
    {
        long2 lanes = cast(long2)v;
        long[2] q = lanes.array;
        printf("%lld %lld\n", q[0], q[1]);
    }

    /// Prints `v` as four 32-bit integers.
    void _mm_print_epi32(__m128i v) @trusted
    {
        int[4] d = v.array;
        printf("%d %d %d %d\n", d[0], d[1], d[2], d[3]);
    }

    /// Prints `v` as eight 16-bit integers.
    void _mm_print_epi16(__m128i v) @trusted
    {
        short8 lanes = cast(short8)v;
        short[8] s = lanes.array;
        printf("%d %d %d %d %d %d %d %d\n",
               s[0], s[1], s[2], s[3],
               s[4], s[5], s[6], s[7]);
    }

    /// Prints `v` as sixteen 8-bit integers.
    void _mm_print_epi8(__m128i v) @trusted
    {
        byte16 lanes = cast(byte16)v;
        byte[16] b = lanes.array;
        printf("%d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d\n",
               b[0], b[1], b[2],  b[3],  b[4],  b[5],  b[6],  b[7],
               b[8], b[9], b[10], b[11], b[12], b[13], b[14], b[15]);
    }

    /// Prints `v` as four single-precision floats.
    void _mm_print_ps(__m128 v) @trusted
    {
        float4 lanes = cast(float4)v;
        float[4] f = lanes.array;
        printf("%f %f %f %f\n", f[0], f[1], f[2], f[3]);
    }

    /// Prints `v` as two double-precision floats.
    void _mm_print_pd(__m128d v) @trusted
    {
        double2 lanes = cast(double2)v;
        double[2] f = lanes.array;
        printf("%f %f\n", f[0], f[1]);
    }
}
718 
719 
720 //
721 //  <FLOATING-POINT COMPARISONS>
722 //
723 // Note: `ldc.simd` cannot express all nuances of FP comparisons, so we
724 //       need different IR generation.
725 
/// Floating-point comparison predicates.
/// "Ordered" predicates are false when either operand is NaN;
/// "unordered" predicates are true when either operand is NaN.
enum FPComparison
{
    // ordered predicates
    oeq = 0,   // ordered and equal
    ogt = 1,   // ordered and greater than
    oge = 2,   // ordered and greater than or equal
    olt = 3,   // ordered and less than
    ole = 4,   // ordered and less than or equal
    one = 5,   // ordered and not equal
    ord = 6,   // ordered (no nans)

    // unordered predicates
    ueq = 7,   // unordered or equal
    ugt = 8,   // unordered or greater than ("nle")
    uge = 9,   // unordered or greater than or equal ("nlt")
    ult = 10,  // unordered or less than ("nge")
    ule = 11,  // unordered or less than or equal ("ngt")
    une = 12,  // unordered or not equal ("neq")
    uno = 13,  // unordered (either nans)
}
743 
// Predicate mnemonic for each `FPComparison` member, indexed by the member's value.
// The entries are spliced into the `fcmp` instructions of the inline IR templates.
private static immutable string[FPComparison.max+1] FPComparisonToString =
[
    // ordered predicates
    "oeq", "ogt", "oge", "olt", "ole", "one", "ord",
    // unordered predicates
    "ueq", "ugt", "uge", "ult", "ule", "une", "uno",
];
761 
// Scalar floating-point comparison: returns `true` when `comparison` holds for (a, b).
// Callers map the boolean to an all-ones (-1) or all-zeros (0) lane.
// Useful for DMD and testing
private bool compareFloat(T)(FPComparison comparison, T a, T b) pure @safe
{
    const bool eitherIsNaN = isnan(a) || isnan(b);

    final switch(comparison)
    {
        // Ordered predicates: the built-in operators already yield false on NaN,
        // except `!=`, which yields true and therefore needs the explicit guard.
        case FPComparison.oeq: return a == b;
        case FPComparison.ogt: return a > b;
        case FPComparison.oge: return a >= b;
        case FPComparison.olt: return a < b;
        case FPComparison.ole: return a <= b;
        case FPComparison.one: return eitherIsNaN ? false : (a != b);
        case FPComparison.ord: return !eitherIsNaN;

        // Unordered predicates: true as soon as one operand is NaN.
        case FPComparison.ueq: return eitherIsNaN ? true : (a == b);
        case FPComparison.ugt: return eitherIsNaN ? true : (a > b);
        case FPComparison.uge: return eitherIsNaN ? true : (a >= b);
        case FPComparison.ult: return eitherIsNaN ? true : (a < b);
        case FPComparison.ule: return eitherIsNaN ? true : (a <= b);
        case FPComparison.une: return a != b; // `!=` is already true on NaN
        case FPComparison.uno: return eitherIsNaN;
    }
}
785 
version(LDC)
{
    /// Packed `float` comparison through inline LLVM IR.
    /// Each lane of the result is -1 where `comparison` holds and 0 elsewhere.
    package int4 cmpps(FPComparison comparison)(float4 a, float4 b) pure @safe
    {
        enum llvmIR = `
            %cmp = fcmp `~ FPComparisonToString[comparison] ~` <4 x float> %0, %1
            %r = sext <4 x i1> %cmp to <4 x i32>
            ret <4 x i32> %r`;

        return LDCInlineIR!(llvmIR, int4, float4, float4)(a, b);
    }

    /// Packed `double` comparison through inline LLVM IR.
    /// Each lane of the result is -1 where `comparison` holds and 0 elsewhere.
    package long2 cmppd(FPComparison comparison)(double2 a, double2 b) pure @safe
    {
        enum llvmIR = `
            %cmp = fcmp `~ FPComparisonToString[comparison] ~` <2 x double> %0, %1
            %r = sext <2 x i1> %cmp to <2 x i64>
            ret <2 x i64> %r`;

        return LDCInlineIR!(llvmIR, long2, double2, double2)(a, b);
    }

    /// CMPSS-style comparison: lane 0 receives the comparison mask (as a bit
    /// pattern stored in a float), the other lanes are copied from `a`.
    /// clang implements it through x86 intrinsics; it is possible with IR alone
    /// but leads to less optimal code.
    /// PERF: try to implement it with __builtin_ia32_cmpss and immediate 0 to 7.
    /// Not that simple.
    package float4 cmpss(FPComparison comparison)(float4 a, float4 b) pure @safe
    {
        enum llvmIR = `
            %cmp = fcmp `~ FPComparisonToString[comparison] ~` float %0, %1
            %r = sext i1 %cmp to i32
            %r2 = bitcast i32 %r to float
            ret float %r2`;

        const float lane0 = LDCInlineIR!(llvmIR, float, float, float)(a[0], b[0]);
        float4 result = a;
        result[0] = lane0;
        return result;
    }

    /// CMPSD-style comparison: lane 0 receives the comparison mask (as a bit
    /// pattern stored in a double), the other lane is copied from `a`.
    /// clang implements it through x86 intrinsics; it is possible with IR alone
    /// but leads to less optimal code.
    /// PERF: try to implement it with __builtin_ia32_cmpsd and immediate 0 to 7.
    /// Not that simple.
    package double2 cmpsd(FPComparison comparison)(double2 a, double2 b) pure @safe
    {
        enum llvmIR = `
            %cmp = fcmp `~ FPComparisonToString[comparison] ~` double %0, %1
            %r = sext i1 %cmp to i64
            %r2 = bitcast i64 %r to double
            ret double %r2`;

        const double lane0 = LDCInlineIR!(llvmIR, double, double, double)(a[0], b[0]);
        double2 result = a;
        result[0] = lane0;
        return result;
    }
}
else
{
    /// Packed `float` comparison, evaluated lane by lane with `compareFloat`.
    /// Each lane of the result is -1 where `comparison` holds and 0 elsewhere.
    package int4 cmpps(FPComparison comparison)(float4 a, float4 b) pure @trusted
    {
        int4 mask;
        for (int lane = 0; lane < 4; ++lane)
        {
            const bool holds = compareFloat!float(comparison, a.array[lane], b.array[lane]);
            mask.ptr[lane] = holds ? -1 : 0;
        }
        return mask;
    }

    /// Packed `double` comparison, evaluated lane by lane with `compareFloat`.
    /// Each lane of the result is -1 where `comparison` holds and 0 elsewhere.
    package long2 cmppd(FPComparison comparison)(double2 a, double2 b) pure @trusted
    {
        long2 mask;
        for (int lane = 0; lane < 2; ++lane)
        {
            const bool holds = compareFloat!double(comparison, a.array[lane], b.array[lane]);
            mask.ptr[lane] = holds ? -1 : 0;
        }
        return mask;
    }

    /// CMPSS-style comparison: lane 0 receives the comparison mask,
    /// the other lanes keep the bits of `a`.
    package float4 cmpss(FPComparison comparison)(float4 a, float4 b) pure @trusted
    {
        const bool holds = compareFloat!float(comparison, a.array[0], b.array[0]);
        int4 bits = cast(int4)a;
        bits.ptr[0] = holds ? -1 : 0;
        return cast(float4)bits;
    }

    /// CMPSD-style comparison: lane 0 receives the comparison mask,
    /// the other lane keeps the bits of `a`.
    package double2 cmpsd(FPComparison comparison)(double2 a, double2 b) pure @trusted
    {
        const bool holds = compareFloat!double(comparison, a.array[0], b.array[0]);
        long2 bits = cast(long2)a;
        bits.ptr[0] = holds ? -1 : 0;
        return cast(double2)bits;
    }
}
unittest // cmpps
{
    // Exercise every predicate on lanes covering: less, equal, greater, NaN.
    float4 A = [1, 3, 5, float.nan];
    float4 B = [2, 3, 4, 5];

    // Expected masks, one table per predicate.
    static immutable int[4] expect_oeq = [ 0,-1, 0, 0];
    static immutable int[4] expect_ogt = [ 0, 0,-1, 0];
    static immutable int[4] expect_oge = [ 0,-1,-1, 0];
    static immutable int[4] expect_olt = [-1, 0, 0, 0];
    static immutable int[4] expect_ole = [-1,-1, 0, 0];
    static immutable int[4] expect_one = [-1, 0,-1, 0];
    static immutable int[4] expect_ord = [-1,-1,-1, 0];
    static immutable int[4] expect_ueq = [ 0,-1, 0,-1];
    static immutable int[4] expect_ugt = [ 0, 0,-1,-1];
    static immutable int[4] expect_uge = [ 0,-1,-1,-1];
    static immutable int[4] expect_ult = [-1, 0, 0,-1];
    static immutable int[4] expect_ule = [-1,-1, 0,-1];
    static immutable int[4] expect_une = [-1, 0,-1,-1];
    static immutable int[4] expect_uno = [ 0, 0, 0,-1];

    // Ordered predicates
    { int4 got = cmpps!(FPComparison.oeq)(A, B); assert(got.array == expect_oeq); }
    { int4 got = cmpps!(FPComparison.ogt)(A, B); assert(got.array == expect_ogt); }
    { int4 got = cmpps!(FPComparison.oge)(A, B); assert(got.array == expect_oge); }
    { int4 got = cmpps!(FPComparison.olt)(A, B); assert(got.array == expect_olt); }
    { int4 got = cmpps!(FPComparison.ole)(A, B); assert(got.array == expect_ole); }
    { int4 got = cmpps!(FPComparison.one)(A, B); assert(got.array == expect_one); }
    { int4 got = cmpps!(FPComparison.ord)(A, B); assert(got.array == expect_ord); }

    // Unordered predicates
    { int4 got = cmpps!(FPComparison.ueq)(A, B); assert(got.array == expect_ueq); }
    { int4 got = cmpps!(FPComparison.ugt)(A, B); assert(got.array == expect_ugt); }
    { int4 got = cmpps!(FPComparison.uge)(A, B); assert(got.array == expect_uge); }
    { int4 got = cmpps!(FPComparison.ult)(A, B); assert(got.array == expect_ult); }
    { int4 got = cmpps!(FPComparison.ule)(A, B); assert(got.array == expect_ule); }
    { int4 got = cmpps!(FPComparison.une)(A, B); assert(got.array == expect_une); }
    { int4 got = cmpps!(FPComparison.uno)(A, B); assert(got.array == expect_uno); }
}
unittest // cmppd
{
    // Lane 0: 1 < 2 so `ult` holds; lane 1: 3 < 3 is false and neither operand is NaN.
    double2 lhs = [1, 3];
    double2 rhs = [2, 3];
    static immutable long[2] expected = [cast(long)(-1), 0];
    long2 mask = cmppd!(FPComparison.ult)(lhs, rhs);
    assert(mask.array == expected);
}
unittest // cmpss
{
    // Checks a single cmpss call: lane 0 must carry the mask computed by
    // compareFloat, lanes 1..3 must be the ones of the first operand.
    void checkOne(FPComparison comparison)(float4 lhs, float4 rhs)
    {
        float4 result = cmpss!comparison(lhs, rhs);
        int4 bits = cast(int4)result;
        int wanted = compareFloat!float(comparison, lhs.array[0], rhs.array[0]) ? -1 : 0;
        assert(bits.array[0] == wanted);
        foreach(lane; 1..4)
            assert(result.array[lane] == lhs.array[lane]);
    }

    // Runs the same predicate against an ordered, then an unordered (NaN) right operand.
    void checkPair(FPComparison comparison)(float4 lhs, float4 ordered, float4 unordered)
    {
        checkOne!comparison(lhs, ordered);
        checkOne!comparison(lhs, unordered);
    }

    // Every comparison predicate is exercised below.
    float4 A = [1, 3, 5, 6];
    float4 B = [2, 3, 4, 5];
    float4 C = [float.nan, 3, 4, 5];

    checkPair!(FPComparison.oeq)(A, B, C);
    checkPair!(FPComparison.ogt)(A, B, C);
    checkPair!(FPComparison.oge)(A, B, C);
    checkPair!(FPComparison.olt)(A, B, C);
    checkPair!(FPComparison.ole)(A, B, C);
    checkPair!(FPComparison.one)(A, B, C);
    checkPair!(FPComparison.ord)(A, B, C);
    checkPair!(FPComparison.ueq)(A, B, C);
    checkPair!(FPComparison.ugt)(A, B, C);
    checkPair!(FPComparison.uge)(A, B, C);
    checkPair!(FPComparison.ult)(A, B, C);
    checkPair!(FPComparison.ule)(A, B, C);
    checkPair!(FPComparison.une)(A, B, C);
    checkPair!(FPComparison.uno)(A, B, C);
}
unittest // cmpsd
{
    // Checks a single cmpsd call: lane 0 must carry the mask computed by
    // compareFloat, lane 1 must be the one of the first operand.
    void checkOne(FPComparison comparison)(double2 lhs, double2 rhs)
    {
        double2 result = cmpsd!comparison(lhs, rhs);
        long2 bits = cast(long2)result;
        long wanted = compareFloat!double(comparison, lhs.array[0], rhs.array[0]) ? -1 : 0;
        assert(bits.array[0] == wanted);
        assert(result.array[1] == lhs.array[1]);
    }

    // Runs the same predicate against an ordered, then an unordered (NaN) right operand.
    void checkPair(FPComparison comparison)(double2 lhs, double2 ordered, double2 unordered)
    {
        checkOne!comparison(lhs, ordered);
        checkOne!comparison(lhs, unordered);
    }

    // Every comparison predicate is exercised below.
    double2 A = [1, 3];
    double2 B = [2, 4];
    double2 C = [double.nan, 5];

    checkPair!(FPComparison.oeq)(A, B, C);
    checkPair!(FPComparison.ogt)(A, B, C);
    checkPair!(FPComparison.oge)(A, B, C);
    checkPair!(FPComparison.olt)(A, B, C);
    checkPair!(FPComparison.ole)(A, B, C);
    checkPair!(FPComparison.one)(A, B, C);
    checkPair!(FPComparison.ord)(A, B, C);
    checkPair!(FPComparison.ueq)(A, B, C);
    checkPair!(FPComparison.ugt)(A, B, C);
    checkPair!(FPComparison.uge)(A, B, C);
    checkPair!(FPComparison.ult)(A, B, C);
    checkPair!(FPComparison.ule)(A, B, C);
    checkPair!(FPComparison.une)(A, B, C);
    checkPair!(FPComparison.uno)(A, B, C);
}
1045 
1046 //
1047 //  </FLOATING-POINT COMPARISONS>
1048 //
1049 
1050 
/// Narrows a 128-bit integer vector to `__m64` by keeping its low 64 bits
/// (lane 0 of the `long2` view); the upper 64 bits are dropped.
__m64 to_m64(__m128i a) pure @trusted
{
    long2 asLongs = cast(long2)a;
    long1 lowHalf = asLongs.array[0];
    return lowHalf;
}
1057 
/// Widens a `__m64` to `__m128i`: the 64 input bits land in the low half
/// (lane 0 of the `long2` view) and the high half is zero.
__m128i to_m128i(__m64 a) pure @trusted
{
    // NOTE: a DigitalMars-only variant (`long2 r = a.array[0]; r.ptr[1] = 0;`)
    // was tried as a workaround for https://issues.dlang.org/show_bug.cgi?id=21474
    // but was not sufficient to avoid that bug, so a single path is used everywhere.
    long2 widened = [0, 0];
    widened.ptr[0] = a.array[0];
    return cast(__m128i)widened;
}
1075 
1076 // ADDITIONAL x86 INTRINSICS
1077 // Absent from ldc.gccbuiltins_x86 for some reason, but needed.
1078 // https://github.com/ldc-developers/llvm-project/blob/ldc-release/12.x/llvm/include/llvm/IR/IntrinsicsX86.td
static if (LDC_with_SSE41)
{
    // Binds the LLVM intrinsic `llvm.x86.sse41.pblendvb` to the GCC-style builtin name.
    // Takes three byte16 operands and returns a byte16; only declared when
    // LDC_with_SSE41 is true.
    // NOTE(review): presumably the third operand is the per-byte selection mask,
    // as in the PBLENDVB instruction — confirm against the LLVM definition.
    pragma(LDC_intrinsic, "llvm.x86.sse41.pblendvb")
        byte16 __builtin_ia32_pblendvb(byte16, byte16, byte16) pure @safe;
}
1084 
1085 // SOME NEON INTRINSICS
1086 // Emulating some x86 intrinsics needs access to a range of ARM intrinsics.
// They are not part of the public API, although the simde project exposes all of them to its users.
1088 // MAYDO: create a new neon.d module, for internal use only.
1089 // MAYDO: port them to ARM32 so that ARM32 can be as fast as ARM64.
1090 static if (LDC_with_ARM64)
1091 {
1092     // VERY USEFUL LINK
1093     // https://github.com/ldc-developers/llvm-project/blob/ldc-release/11.x/llvm/include/llvm/IR/IntrinsicsAArch64.td
1094     // Also: https://developer.arm.com/architectures/instruction-sets/intrinsics/
1095 
    // Bound to LLVM `aarch64.neon.uabd` on 16 x i8.
    // NOTE(review): presumably the unsigned absolute difference of each byte lane — confirm.
    pragma(LDC_intrinsic, "llvm.aarch64.neon.uabd.v16i8")
        byte16 vabdq_u8(byte16 a, byte16 b) pure @safe;

    // Bound to LLVM `aarch64.neon.abs` on 8 x i16 (per-lane absolute value, going by the intrinsic name).
    pragma(LDC_intrinsic, "llvm.aarch64.neon.abs.v8i16")
        short8 vabsq_s16(short8 a) pure @safe;

    // Bound to LLVM `aarch64.neon.abs` on 4 x i32.
    pragma(LDC_intrinsic, "llvm.aarch64.neon.abs.v4i32")
        int4 vabsq_s32(int4 a) pure @safe;

    // Bound to LLVM `aarch64.neon.abs` on 16 x i8.
    pragma(LDC_intrinsic, "llvm.aarch64.neon.abs.v16i8")
        byte16 vabsq_s8(byte16 a) pure @safe;
1107 
1108     byte8 vand_u8(byte8 a, byte8 b) pure @safe
1109     {
1110         return a & b;
1111     }
1112 
1113     long2 vandq_s64(long2 a, long2 b)
1114     {
1115         return a & b;
1116     }
1117 
1118     long2 vbicq_s64(long2 a, long2 b) pure @safe
1119     {
1120         return a & ~b;
1121     }
1122 
1123     int4 vbslq_s32(int4 a, int4 b, int4 c) pure @safe
1124     {
1125         return c ^ ((c ^ b) & a);
1126     }
1127 
1128     byte16 vbslq_s8(byte16 a, byte16 b, byte16 c) pure @safe
1129     {
1130         return c ^ ((c ^ b) & a);
1131     }
1132 
1133     long2 vbslq_s64(long2 a, long2 b, long2 c) pure @safe
1134     {
1135         return c ^ ((c ^ b) & a);
1136     }
1137 
1138     short8 vcombine_s16(short4 lo, short4 hi) pure @trusted
1139     {
1140         short8 r;
1141         r.ptr[0]  = lo.array[0];
1142         r.ptr[1]  = lo.array[1];
1143         r.ptr[2]  = lo.array[2];
1144         r.ptr[3]  = lo.array[3];
1145         r.ptr[4]  = hi.array[0];
1146         r.ptr[5]  = hi.array[1];
1147         r.ptr[6]  = hi.array[2];
1148         r.ptr[7]  = hi.array[3];
1149         return r;
1150     }
1151 
1152     int4 vcombine_s32(int2 lo, int2 hi) pure @trusted
1153     {
1154         int4 r;
1155         r.ptr[0] = lo.array[0];
1156         r.ptr[1] = lo.array[1];
1157         r.ptr[2] = hi.array[0];
1158         r.ptr[3] = hi.array[1];
1159         return r;
1160     }
1161 
1162     byte16 vcombine_s8(byte8 lo, byte8 hi) pure @trusted
1163     {
1164         byte16 r;
1165         r.ptr[0]  = lo.array[0];
1166         r.ptr[1]  = lo.array[1];
1167         r.ptr[2]  = lo.array[2];
1168         r.ptr[3]  = lo.array[3];
1169         r.ptr[4]  = lo.array[4];
1170         r.ptr[5]  = lo.array[5];
1171         r.ptr[6]  = lo.array[6];
1172         r.ptr[7]  = lo.array[7];
1173         r.ptr[8]  = hi.array[0];
1174         r.ptr[9]  = hi.array[1];
1175         r.ptr[10] = hi.array[2];
1176         r.ptr[11] = hi.array[3];
1177         r.ptr[12] = hi.array[4];
1178         r.ptr[13] = hi.array[5];
1179         r.ptr[14] = hi.array[6];
1180         r.ptr[15] = hi.array[7];
1181         return r;
1182     }
1183 
1184     short8 vcombine_u16(short4 lo, short4 hi) pure @trusted
1185     {
1186         short8 r;
1187         r.ptr[0]  = lo.array[0];
1188         r.ptr[1]  = lo.array[1];
1189         r.ptr[2]  = lo.array[2];
1190         r.ptr[3]  = lo.array[3];
1191         r.ptr[4]  = hi.array[0];
1192         r.ptr[5]  = hi.array[1];
1193         r.ptr[6]  = hi.array[2];
1194         r.ptr[7]  = hi.array[3];
1195         return r;
1196     }
1197 
1198 
1199     // float4 => int4
1200 
    // Four float4 => int4 conversions, differing only in the LLVM intrinsic they bind
    // to (fcvtms / fcvtns / fcvtps / fcvtzs).
    // NOTE(review): per the ARM FCVT{M,N,P,Z}S mnemonics these presumably round toward
    // -infinity, to nearest (ties to even), toward +infinity and toward zero — confirm.

    pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtms.v4i32.v4f32")
        int4 vcvtmq_s32_f32(float4 a) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtns.v4i32.v4f32")
        int4 vcvtnq_s32_f32(float4 a) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtps.v4i32.v4f32")
        int4 vcvtpq_s32_f32(float4 a) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtzs.v4i32.v4f32")
        int4 vcvtzq_s32_f32(float4 a) pure @safe;
1212 
1213 
1214     // double2 => long2
1215 
    // Four double2 => long2 conversions, differing only in the LLVM intrinsic they bind
    // to (fcvtms / fcvtns / fcvtps / fcvtzs).
    // NOTE(review): per the ARM FCVT{M,N,P,Z}S mnemonics these presumably round toward
    // -infinity, to nearest (ties to even), toward +infinity and toward zero — confirm.

    pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtms.v2i64.v2f64")
        long2 vcvtmq_s64_f64(double2 a) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtns.v2i64.v2f64")
        long2 vcvtnq_s64_f64(double2 a) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtps.v2i64.v2f64")
        long2 vcvtpq_s64_f64(double2 a) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtzs.v2i64.v2f64")
        long2 vcvtzq_s64_f64(double2 a) pure @safe;
1227 
    // Scalar conversions. Each group of four binds fcvtms / fcvtns / fcvtps / fcvtzs for
    // one (result type, source type) pair; the pair is encoded in the intrinsic suffix
    // (e.g. `.i32.f32`) and mirrored in the D function name (`_s32_f32`).
    // NOTE(review): the rounding behaviour of each mnemonic is not visible here;
    // presumably floor / nearest / ceil / truncate — confirm against the ARM reference.

    // float => int
    pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtms.i32.f32")
        int vcvtms_s32_f32(float a) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtns.i32.f32")
        int vcvtns_s32_f32(float a) pure @safe;    

    pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtps.i32.f32")
        int vcvtps_s32_f32(float a) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtzs.i32.f32")
        int vcvts_s32_f32(float a) pure @safe;
     
    // double => int
    pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtms.i32.f64")
        int vcvtms_s32_f64(double a) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtns.i32.f64")
        int vcvtns_s32_f64(double a) pure @safe;    

    pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtps.i32.f64")
        int vcvtps_s32_f64(double a) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtzs.i32.f64")
        int vcvts_s32_f64(double a) pure @safe;

    // float => long
    pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtms.i64.f32")
        long vcvtms_s64_f32(float a) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtns.i64.f32")
        long vcvtns_s64_f32(float a) pure @safe;    

    pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtps.i64.f32")
        long vcvtps_s64_f32(float a) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtzs.i64.f32")
        long vcvts_s64_f32(float a) pure @safe;

    // double => long
    pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtms.i64.f64")
        long vcvtms_s64_f64(double a) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtns.i64.f64")
        long vcvtns_s64_f64(double a) pure @safe;    

    pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtps.i64.f64")
        long vcvtps_s64_f64(double a) pure @safe; // Note: technically should be named vcvtpd_s64_f64

    pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtzs.i64.f64")
        long vcvts_s64_f64(double a) pure @safe;
1275 
1276     long2 vdupq_n_s64(long value) pure @safe
1277     {
1278         long2 r;
1279         r = value;
1280         return r;
1281     }
1282 
1283     short4 vget_high_s16(short8 a) pure @trusted
1284     {
1285         short4 r;
1286         r.ptr[0] = a.array[4];
1287         r.ptr[1] = a.array[5];
1288         r.ptr[2] = a.array[6];
1289         r.ptr[3] = a.array[7];
1290         return r;
1291     }
1292 
1293     int2 vget_high_s32(int4 a) pure @trusted
1294     {
1295         int2 r;
1296         r.ptr[0] = a.array[2];
1297         r.ptr[1] = a.array[3];
1298         return r;
1299     }
1300 
1301     byte8 vget_high_u8(byte16 a) pure @trusted
1302     {
1303         byte8 r;
1304         r.ptr[0] = a.array[8];
1305         r.ptr[1] = a.array[9];
1306         r.ptr[2] = a.array[10];
1307         r.ptr[3] = a.array[11];
1308         r.ptr[4] = a.array[12];
1309         r.ptr[5] = a.array[13];
1310         r.ptr[6] = a.array[14];
1311         r.ptr[7] = a.array[15];
1312         return r;
1313     }
1314 
1315     short4 vget_low_s16(short8 a) pure @trusted
1316     {
1317         short4 r;
1318         r.ptr[0] = a.array[0];
1319         r.ptr[1] = a.array[1];
1320         r.ptr[2] = a.array[2];
1321         r.ptr[3] = a.array[3];
1322         return r;
1323     } 
1324 
1325     int2 vget_low_s32(int4 a) pure @trusted
1326     {
1327         int2 r;
1328         r.ptr[0] = a.array[0];
1329         r.ptr[1] = a.array[1];
1330         return r;
1331     }
1332 
1333     byte8 vget_low_u8(byte16 a) pure @trusted
1334     {
1335         byte8 r;
1336         r.ptr[0] = a.array[0];
1337         r.ptr[1] = a.array[1];
1338         r.ptr[2] = a.array[2];
1339         r.ptr[3] = a.array[3];
1340         r.ptr[4] = a.array[4];
1341         r.ptr[5] = a.array[5];
1342         r.ptr[6] = a.array[6];
1343         r.ptr[7] = a.array[7];
1344         return r;
1345     }
1346 
1347     long vgetq_lane_s64(long2 v, const int lane) pure @safe
1348     {
1349         return v.array[lane];
1350     }
1351 
    // Bound to LLVM `aarch64.neon.smax` on 8 x i16.
    // NOTE(review): presumably the per-lane signed maximum of `a` and `b` — confirm.
    pragma(LDC_intrinsic, "llvm.aarch64.neon.smax.v8i16")
        short8 vmaxq_s16(short8 a, short8 b) pure @safe;
1354 
1355     int4 vmaxq_s32(int4 a, int4 b)
1356     {
1357         int4 r;
1358         r[0] = a[0] >= b[0] ? a[0] : b[0];
1359         r[1] = a[1] >= b[1] ? a[1] : b[1];
1360         r[2] = a[2] >= b[2] ? a[2] : b[2];
1361         r[3] = a[3] >= b[3] ? a[3] : b[3];
1362         return r;
1363     }
1364 
    // Bound to LLVM `aarch64.neon.smin` on 8 x i16.
    // NOTE(review): presumably the per-lane signed minimum of `a` and `b` — confirm.
    pragma(LDC_intrinsic, "llvm.aarch64.neon.smin.v8i16")
        short8 vminq_s16(short8 a, short8 b) pure @safe;
1367 
1368     int2 vmovn_s64(long2 a) pure @trusted
1369     {
1370         int2 r;
1371         r.ptr[0] = cast(int)(a.array[0]);
1372         r.ptr[1] = cast(int)(a.array[1]);
1373         return r;
1374     }        
1375 
1376     int4 vmull_s16(short4 a, short4 b) pure @trusted
1377     {
1378         int4 r;
1379         r.ptr[0] = a.array[0] * b.array[0];
1380         r.ptr[1] = a.array[1] * b.array[1];
1381         r.ptr[2] = a.array[2] * b.array[2];
1382         r.ptr[3] = a.array[3] * b.array[3];
1383         return r;
1384     }
1385 
    // Bound to LLVM `aarch64.neon.smull` producing 2 x i64 from two int2 operands.
    // NOTE(review): presumably a signed widening multiply — confirm.
    pragma(LDC_intrinsic, "llvm.aarch64.neon.smull.v2i64")
        long2 vmull_s32(int2 a, int2 b) pure @safe;

    // The next three declarations bind LLVM `aarch64.neon.addp` on 64-bit vectors.
    // NOTE(review): presumably pairwise addition of adjacent lanes — confirm.
    pragma(LDC_intrinsic, "llvm.aarch64.neon.addp.v4i16")
        short4 vpadd_s16(short4 a, short4 b) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.addp.v2i32")
        int2 vpadd_s32(int2 a, int2 b) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.addp.v8i8")
        byte8 vpadd_u8(byte8 a, byte8 b) pure @safe;

    // Bound to LLVM `aarch64.neon.uaddlp` from 16 x i8 to 8 x i16.
    // NOTE(review): presumably unsigned pairwise add with widening — confirm.
    pragma(LDC_intrinsic, "llvm.aarch64.neon.uaddlp.v8i16.v16i8")
        short8 vpaddlq_u8 (byte16 a) pure @safe;
1400 
    // vpaddq_f32 binds to a different LLVM intrinsic name depending on the front-end
    // version: LDC 1.18 started using LLVM 9, which renamed the builtin from
    // `addp.v4f32` to `faddp.v4f32`. Both branches declare the same D signature.
    static if(__VERSION__ >= 2088) // LDC 1.18 and later (LLVM 9+): new intrinsic name
    {
        pragma(LDC_intrinsic, "llvm.aarch64.neon.faddp.v4f32")
            float4 vpaddq_f32(float4 a, float4 b) pure @safe;
    }
    else
    {
        // Older LDC: pre-LLVM 9 intrinsic name.
        pragma(LDC_intrinsic, "llvm.aarch64.neon.addp.v4f32")
            float4 vpaddq_f32(float4 a, float4 b) pure @safe;
    }
1411     
    // `addp` on 128-bit integer vectors (short8, byte16, int4).
    // NOTE(review): presumably pairwise addition of adjacent lanes — confirm.
    pragma(LDC_intrinsic, "llvm.aarch64.neon.addp.v8i16")
        short8 vpaddq_s16(short8 a, short8 b) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.addp.v16i8")
        byte16 vpaddq_s8(byte16 a, byte16 b) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.addp.v4i32")
        int4 vpaddq_s32(int4 a, int4 b) pure @safe;

    // `sqadd` on short4 / short8.
    // NOTE(review): presumably signed saturating addition — confirm.
    pragma(LDC_intrinsic, "llvm.aarch64.neon.sqadd.v4i16")
        short4 vqadd_s16(short4 a, short4 b) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.sqadd.v8i16")
        short8 vqaddq_s16(short8 a, short8 b) pure @safe;

    // `sqxtn` / `uqxtn` / `sqxtun`: each takes a vector and returns one with the same
    // lane count and half the lane width.
    // NOTE(review): presumably saturating narrowing (signed, unsigned, signed-to-unsigned) — confirm.
    pragma(LDC_intrinsic, "llvm.aarch64.neon.sqxtn.v8i8")
        byte8 vqmovn_s16(short8 a) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.sqxtn.v4i16")
        short4 vqmovn_s32(int4 a) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.uqxtn.v4i16")
        short4 vqmovn_u32(int4 a) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.sqxtun.v8i8")
        byte8 vqmovun_s16(short8 a) pure @safe;

    // `sqsub` on short4 / short8.
    // NOTE(review): presumably signed saturating subtraction — confirm.
    pragma(LDC_intrinsic, "llvm.aarch64.neon.sqsub.v4i16")
        short4 vqsub_s16(short4 a, short4 b) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.sqsub.v8i16")
        short8 vqsubq_s16(short8 a, short8 b) pure @safe;

    // `tbl1` with a 16-byte table `t` and 16 byte indices `idx`.
    // NOTE(review): presumably a byte table lookup — confirm.
    pragma(LDC_intrinsic, "llvm.aarch64.neon.tbl1.v16i8")
        byte16 vqtbl1q_s8(byte16 t, byte16 idx) pure @safe;

    // `urhadd` on byte16 / short8.
    // NOTE(review): both declarations take 128-bit vectors although their names lack
    // the `q` used by the other 128-bit helpers here (would presumably be
    // vrhaddq_u8 / vrhaddq_u16) — names kept as-is because callers depend on them.
    pragma(LDC_intrinsic, "llvm.aarch64.neon.urhadd.v16i8")
        byte16 vrhadd_u8(byte16 a, byte16 b) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.urhadd.v8i16")
        short8 vrhadd_u16(short8 a, short8 b) pure @safe;

    // `rshrn` from int4 to short4 with a shift amount `n`.
    // NOTE(review): presumably a rounding shift right with narrowing; `n` likely has
    // to be a compile-time constant for the backend — confirm.
    pragma(LDC_intrinsic, "llvm.aarch64.neon.rshrn.v4i16")
        short4 vrshrn_n_s32(int4 a, int n) pure @safe;        
1456 
1457     byte8 vshr_u8(byte8 a, byte8 b) pure @safe
1458     {
1459         return a >>> b;
1460     }
1461 
1462     byte16 vshrq_n_s8(byte16 a, byte r) pure @safe
1463     { 
1464         a = a >> byte16(cast(byte)r);
1465         return a;
1466     }
1467 
    // Bound to LLVM `aarch64.neon.tbl1` with a 16-byte table `t` and 8 byte indices `idx`,
    // returning 8 bytes.
    // NOTE(review): presumably a byte table lookup — confirm.
    pragma(LDC_intrinsic, "llvm.aarch64.neon.tbl1.v8i8")
        byte8 vtbl1_s8(byte16 t, byte8 idx) pure @safe;
1470 }
1471 
version(unittest)
{
    /// Absolute value of a double, only available in unittest builds.
    /// With LDC this forwards to `llvm_fabs`; other compilers clear the sign
    /// bit (bit 63) of the value's 64-bit representation.
    double abs_double(double x) @trusted
    {
        version(LDC)
        {
            return llvm_fabs(x);
        }
        else
        {
            long bits = *cast(long*)(&x);
            bits = bits & 0x7fffffff_ffffffff; // everything except the sign bit
            return *cast(double*)(&bits);
        }
    }
}
1486 
1487 // needed because in old GDC from travis, core.stdc.math.isnan isn't pure
1488 
/// Returns true when `x` is a NaN, by inspecting its 32-bit pattern:
/// all 8 exponent bits set and a non-zero 23-bit mantissa.
bool isnan(float x) pure @trusted
{
    immutable uint bits = *cast(uint*)(&x);
    immutable bool exponentAllOnes = (bits & 0x7F800000) == 0x7F800000;
    immutable bool mantissaNonZero = (bits & 0x007FFFFF) != 0;
    return exponentAllOnes && mantissaNonZero;
}
unittest
{
    // NaN is detected; zero and infinity are not NaN.
    float value = float.nan;
    assert(isnan(value));

    value = 0;
    assert(!isnan(value));

    value = float.infinity;
    assert(!isnan(value));
}
1506 
/// Returns true when `x` is a NaN, by inspecting its 64-bit pattern:
/// all 11 exponent bits set and a non-zero 52-bit mantissa.
bool isnan(double x) pure @trusted
{
    immutable ulong bits = *cast(ulong*)(&x);
    immutable bool exponentAllOnes = (bits & 0x7FF00000_00000000) == 0x7FF00000_00000000;
    immutable bool mantissaNonZero = (bits & 0x000FFFFF_FFFFFFFF) != 0;
    return exponentAllOnes && mantissaNonZero;
}
unittest
{
    // NaN is detected; zero and infinity are not NaN.
    double value = double.nan;
    assert(isnan(value));

    value = 0;
    assert(!isnan(value));

    value = double.infinity;
    assert(!isnan(value));
}