1 /**
2 * Internal stuff only, do not import.
3 *
4 * Copyright: Copyright Guillaume Piolat 2016-2020, Stefanos Baziotis 2019.
5 * License:   $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
6 */
7 module inteli.internals;
8 
9 import inteli.types;
10 
11 // The only math functions needed for intel-intrinsics
12 public import core.math: sqrt; // since it's an intrinsics
13 
14 package:
15 nothrow:
16 @nogc:
17 
18 
// Compile-time feature flags for GDC.
// Every flag is defined in every branch, so the rest of the package can
// test any of them with a plain `static if`, whatever the compiler or target.
version(GNU)
{
    version (X86)
    {
        // For 32-bit x86, vector extensions are disabled with GDC.
        // It just doesn't work well.
        enum bool GDC_with_x86   = true;
        enum bool GDC_with_MMX   = false;
        enum bool GDC_with_SSE   = false;
        enum bool GDC_with_SSE2  = false;
        enum bool GDC_with_SSE3  = false;
        enum bool GDC_with_SSSE3 = false;
        enum bool GDC_with_SSE41 = false;
        enum bool GDC_with_SSE42 = false;
        enum bool GDC_with_SHA   = false;
        enum bool GDC_with_BMI2  = false;
    }
    else version (X86_64)
    {
        // GDC support uses extended inline assembly:
        //   https://gcc.gnu.org/onlinedocs/gcc/Extended-Asm.html        (general information and hints)
        //   https://gcc.gnu.org/onlinedocs/gcc/Simple-Constraints.html  (binding variables to registers)
        //   https://gcc.gnu.org/onlinedocs/gcc/Machine-Constraints.html (x86 specific register short names)

        public import core.simd;

        // NOTE: These intrinsics are not available in every i386 and x86_64 CPU.
        // For more info: https://gcc.gnu.org/onlinedocs/gcc-4.9.2/gcc/X86-Built-in-Functions.html
        public import gcc.builtins;

        enum bool GDC_with_x86   = true;

        // No way to detect these at compile-time, but we assume they are there.
        enum bool GDC_with_MMX   = true;
        enum bool GDC_with_SSE   = true;
        enum bool GDC_with_SSE2  = true;

        // TODO: no way to detect these at compile-time, so they stay disabled.
        enum bool GDC_with_SSE3  = false;
        enum bool GDC_with_SSSE3 = false;
        enum bool GDC_with_SSE41 = false;
        enum bool GDC_with_SSE42 = false;

        enum bool GDC_with_SHA   = false;
        enum bool GDC_with_BMI2  = false;
    }
    else
    {
        // GDC targeting something that is neither X86 nor X86_64.
        enum bool GDC_with_x86   = false;
        enum bool GDC_with_MMX   = false;
        enum bool GDC_with_SSE   = false;
        enum bool GDC_with_SSE2  = false;
        enum bool GDC_with_SSE3  = false;
        enum bool GDC_with_SSSE3 = false;
        enum bool GDC_with_SSE41 = false;
        enum bool GDC_with_SSE42 = false;
        enum bool GDC_with_SHA   = false;
        enum bool GDC_with_BMI2  = false;
    }
}
else
{
    // Not GDC: none of the GDC-specific paths may be taken.
    enum bool GDC_with_x86   = false;
    enum bool GDC_with_MMX   = false;
    enum bool GDC_with_SSE   = false;
    enum bool GDC_with_SSE2  = false;
    enum bool GDC_with_SSE3  = false;
    enum bool GDC_with_SSSE3 = false;
    enum bool GDC_with_SSE41 = false;
    enum bool GDC_with_SSE42 = false;
    enum bool GDC_with_SHA   = false;
    enum bool GDC_with_BMI2  = false;
}
88 
// Compile-time feature flags and imports for LDC.
// As with the GDC flags, every `LDC_with_*` flag exists in every branch.
version(LDC)
{
    public import core.simd;
    public import ldc.simd;
    public import ldc.intrinsics;
    public import ldc.llvmasm: __asm;

    // Since LDC 1.13, using the new ldc.llvmasm.__ir variants instead of inlineIR
    static if (__VERSION__ >= 2083)
    {
        import ldc.llvmasm;
        alias LDCInlineIR = __ir_pure;

        // A version of inline IR with prefix/suffix didn't exist before LDC 1.13
        alias LDCInlineIREx = __irEx_pure;
    }
    else
    {
        alias LDCInlineIR = inlineIR;
    }

    version(ARM)
    {
        // 32-bit ARM target.
        public import ldc.gccbuiltins_arm;
        enum bool LDC_with_ARM32 = true;
        enum bool LDC_with_ARM64 = false;
        enum bool LDC_with_SSE1  = false;
        enum bool LDC_with_SSE2  = false;
        enum bool LDC_with_SSE3  = false;
        enum bool LDC_with_SSSE3 = false;
        enum bool LDC_with_SSE41 = false;
        enum bool LDC_with_SSE42 = false;
        enum bool LDC_with_AVX   = false;
        enum bool LDC_with_AVX2  = false;
        enum bool LDC_with_SHA   = false;
        enum bool LDC_with_BMI2  = false;
    }
    else version(AArch64)
    {
        // 64-bit ARM target.
        enum bool LDC_with_ARM32 = false;
        enum bool LDC_with_ARM64 = true;
        enum bool LDC_with_SSE1  = false;
        enum bool LDC_with_SSE2  = false;
        enum bool LDC_with_SSE3  = false;
        enum bool LDC_with_SSSE3 = false;
        enum bool LDC_with_SSE41 = false;
        enum bool LDC_with_SSE42 = false;
        enum bool LDC_with_AVX   = false;
        enum bool LDC_with_AVX2  = false;
        enum bool LDC_with_SHA   = false;
        enum bool LDC_with_BMI2  = false;
    }
    else
    {
        // Any other target: the x86 builtins are imported and each
        // instruction set is queried from the compilation target.
        public import ldc.gccbuiltins_x86;
        enum bool LDC_with_ARM32 = false;
        enum bool LDC_with_ARM64 = false;
        enum bool LDC_with_SSE1  = __traits(targetHasFeature, "sse");
        enum bool LDC_with_SSE2  = __traits(targetHasFeature, "sse2");
        enum bool LDC_with_SSE3  = __traits(targetHasFeature, "sse3");
        enum bool LDC_with_SSSE3 = __traits(targetHasFeature, "ssse3");
        enum bool LDC_with_SSE41 = __traits(targetHasFeature, "sse4.1");
        enum bool LDC_with_SSE42 = __traits(targetHasFeature, "sse4.2");
        enum bool LDC_with_AVX   = __traits(targetHasFeature, "avx");
        enum bool LDC_with_AVX2  = __traits(targetHasFeature, "avx2");
        enum bool LDC_with_SHA   = __traits(targetHasFeature, "sha");
        enum bool LDC_with_BMI2  = __traits(targetHasFeature, "bmi2");
    }
}
else
{
    // Not LDC: none of the LDC-specific paths may be taken.
    enum bool LDC_with_ARM32 = false;
    enum bool LDC_with_ARM64 = false;
    enum bool LDC_with_SSE1  = false;
    enum bool LDC_with_SSE2  = false;
    enum bool LDC_with_SSE3  = false;
    enum bool LDC_with_SSSE3 = false;
    enum bool LDC_with_SSE41 = false;
    enum bool LDC_with_SSE42 = false;
    enum bool LDC_with_AVX   = false;
    enum bool LDC_with_AVX2  = false;
    enum bool LDC_with_SHA   = false;
    enum bool LDC_with_BMI2  = false;
}
173 
// True when compiling with LDC for any ARM target (32-bit or 64-bit).
// Logical `||` keeps this flag a `bool` like all the other feature flags
// (bitwise `|` on two bools produces an `int`).
enum bool LDC_with_ARM = LDC_with_ARM32 || LDC_with_ARM64;
175 
// Compile-time feature flags for DMD.
version(DigitalMars)
{
    // DMD_with_asm: some flavour of the DMD inline assembler is available.
    // DMD_with_32bit_asm: specifically the 32-bit one
    // (sometimes you want a 32-bit DMD only solution).
    version(D_InlineAsm_X86)
    {
        enum bool DMD_with_asm       = true;
        enum bool DMD_with_32bit_asm = true;
    }
    else version(D_InlineAsm_X86_64)
    {
        enum bool DMD_with_asm       = true;
        enum bool DMD_with_32bit_asm = false;
    }
    else
    {
        enum bool DMD_with_asm       = false;
        enum bool DMD_with_32bit_asm = false;
    }

    // D_SIMD only counts when the SSE-sized vector types are not emulated.
    version (D_SIMD)
        enum bool DMD_with_DSIMD = !SSESizedVectorsAreEmulated;
    else
        enum bool DMD_with_DSIMD = false;
}
else
{
    // Not DMD: none of the DMD-specific paths may be taken.
    enum bool DMD_with_asm       = false;
    enum bool DMD_with_32bit_asm = false;
    enum bool DMD_with_DSIMD     = false;
}
201 
// Floating-point control register access for 32-bit ARM, through the
// FPSCR builtins.
static if (LDC_with_ARM32)
{
    /// Returns the value given by `__builtin_arm_get_fpscr`.
    package uint arm_get_fpcr() nothrow @nogc @trusted
    {
        const uint controlWord = __builtin_arm_get_fpscr();
        return controlWord;
    }

    /// Passes `cw` to `__builtin_arm_set_fpscr`.
    package void arm_set_fpcr(uint cw) nothrow @nogc @trusted
    {
        __builtin_arm_set_fpscr(cw);
    }
}
214 
// Floating-point control register (FPCR) access for AArch64.
static if (LDC_with_ARM64)
{
    // Declaration of the LLVM intrinsic; arm_get_fpcr below does not call it
    // and uses inline assembly instead.
    pragma(LDC_intrinsic, "llvm.aarch64.get.fpcr")
        long __builtin_aarch64_get_fpcr() pure nothrow @nogc @safe;

    /// Reads FPCR with an `mrs` instruction.
    package uint arm_get_fpcr() pure nothrow @nogc @trusted
    {
        // LLVM intrinsic "llvm.aarch64.get.fpcr" seems buggy and doesn't return FPCR
        const uint controlWord = __asm!uint("mrs $0, fpcr", "=r");
        return controlWord;
    }

    /// Writes `cw` into FPCR with an `msr` instruction.
    package void arm_set_fpcr(uint cw) nothrow @nogc @trusted
    {
        // Note: there doesn't seem to be an intrinsic in LLVM to set FPCR.
        // x2 is used as scratch: it is stored to memory first, and reloaded
        // once FPCR has been written.
        long x2Backup;
        __asm!void("str x2, $1 \n" ~
                   "ldr w2, $0 \n" ~
                   "msr fpcr, x2 \n" ~
                   "ldr x2, $1 "   , "m,m", cw, &x2Backup);
    }
}
236 
237 
// For internal use only, since public API deals with a x86 semantic emulation.
// The rounding mode is a 2-bit field starting at bit 22 of the ARM control
// word; the flush-to-zero flag is bit 24.
enum uint _MM_ROUND_NEAREST_ARM     = 0u << 22;
enum uint _MM_ROUND_UP_ARM          = 1u << 22;
enum uint _MM_ROUND_DOWN_ARM        = 2u << 22;
enum uint _MM_ROUND_TOWARD_ZERO_ARM = 3u << 22;
enum uint _MM_ROUND_MASK_ARM        = 3u << 22;
enum uint _MM_FLUSH_ZERO_MASK_ARM   = 1u << 24;
245 
246 
247 //
248 //  <ROUNDING>
249 //
//  Why is that there? For DMD, we cannot use rint because _MM_SET_ROUNDING_MODE
//  doesn't change the FPU rounding mode, and isn't expected to do so.
//  So we devised these rounding functions to help keep rounding consistent between
//  LDC and DMD. It's important that DMD uses what is in MXCSR to round.
//
//  Note: There is no MXCSR on ARM, but there is fpcr/fpscr, which implements similar
//  functionality.
//  https://developer.arm.com/documentation/dui0068/b/vector-floating-point-programming/vfp-system-registers/fpscr--the-floating-point-status-and-control-register
//  We use fpcr/fpscr since it's thread-local, so we can emulate those x86 conversions, albeit slowly.
259 
/// Converts a `float` to a 32-bit signed integer, rounding with the mode
/// currently selected in MXCSR (`cvtss2si` semantics). On ARM the rounding
/// mode is taken from fpcr/fpscr instead.
/// Params:
///     value = the number to convert.
/// Returns: `value` rounded to an `int`.
int convertFloatToInt32UsingMXCSR(float value) @trusted
{
    int result;
    version(GNU)
    {
        asm pure nothrow @nogc @trusted
        {
            "cvtss2si %1, %0\n": "=r"(result) : "x" (value);
        }
    }
    else static if (LDC_with_ARM32)
    {
        // `vcvtr` converts using the rounding mode held in FPSCR.
        // s2 is used as a scratch register, so it is listed as clobbered:
        // without that the optimizer may keep a live value in it.
        result = __asm!int(`vldr s2, $1
                            vcvtr.s32.f32 s2, s2
                            vmov $0, s2`, "=r,m,~{s2}", value);
    }
    else static if (LDC_with_ARM64)
    {
        // Get current rounding mode.
        uint fpscr = arm_get_fpcr();

        // Dispatch to the conversion matching that rounding mode.
        switch(fpscr & _MM_ROUND_MASK_ARM)
        {
            default:
            case _MM_ROUND_NEAREST_ARM:     result = vcvtns_s32_f32(value); break;
            case _MM_ROUND_DOWN_ARM:        result = vcvtms_s32_f32(value); break;
            case _MM_ROUND_UP_ARM:          result = vcvtps_s32_f32(value); break;
            case _MM_ROUND_TOWARD_ZERO_ARM: result = vcvts_s32_f32(value);  break;
        }
    }
    else
    {
        // x86 inline assembler: the instruction itself honours MXCSR.
        asm pure nothrow @nogc @trusted
        {
            cvtss2si EAX, value;
            mov result, EAX;
        }
    }
    return result;
}
301 
/// Converts a `double` to a 32-bit signed integer, rounding with the mode
/// currently selected in MXCSR (`cvtsd2si` semantics). On ARM the rounding
/// mode is taken from fpcr/fpscr instead.
/// Params:
///     value = the number to convert.
/// Returns: `value` rounded to an `int`.
int convertDoubleToInt32UsingMXCSR(double value) @trusted
{
    int result;
    version(GNU)
    {
        asm pure nothrow @nogc @trusted
        {
            "cvtsd2si %1, %0\n": "=r"(result) : "x" (value);
        }
    }
    else static if (LDC_with_ARM32)
    {
        // `vcvtr` converts using the rounding mode held in FPSCR.
        // Both d2 (input) and s2 (output) are used as scratch registers, so
        // both are listed as clobbered: without that the optimizer may keep
        // live values in them.
        result = __asm!int(`vldr d2, $1
                            vcvtr.s32.f64 s2, d2
                            vmov $0, s2`, "=r,m,~{s2},~{d2}", value);
    }
    else static if (LDC_with_ARM64)
    {
        // Get current rounding mode.
        uint fpscr = arm_get_fpcr();

        // Dispatch to the conversion matching that rounding mode.
        switch(fpscr & _MM_ROUND_MASK_ARM)
        {
            default:
            case _MM_ROUND_NEAREST_ARM:     result = vcvtns_s32_f64(value); break;
            case _MM_ROUND_DOWN_ARM:        result = vcvtms_s32_f64(value); break;
            case _MM_ROUND_UP_ARM:          result = vcvtps_s32_f64(value); break;
            case _MM_ROUND_TOWARD_ZERO_ARM: result = vcvts_s32_f64(value);  break;
        }
    }
    else
    {
        // x86 inline assembler: the instruction itself honours MXCSR.
        asm pure nothrow @nogc @trusted
        {
            cvtsd2si EAX, value;
            mov result, EAX;
        }
    }
    return result;
}
343 
/// Converts a `float` to a 64-bit signed integer, rounding with the mode
/// currently selected in MXCSR (64-bit `cvtss2si` semantics). On ARM the
/// rounding mode is taken from fpcr/fpscr instead.
/// Params:
///     value = the number to convert.
/// Returns: `value` rounded to a `long`.
long convertFloatToInt64UsingMXCSR(float value) @trusted
{
    static if (LDC_with_ARM32)
    {
        // We have to resort to libc since 32-bit ARM 
        // doesn't seem to have 64-bit registers.
        
        uint fpscr = arm_get_fpcr(); // Get current rounding mode.

        // Note: converting to double precision else rounding could be different for large integers
        double asDouble = value; 

        switch(fpscr & _MM_ROUND_MASK_ARM)
        {
            default:
            // Round-to-nearest must send halfway cases to the even integer
            // (2.5 -> 2), like x86 does. `llvm_round` sends them away from
            // zero (2.5 -> 3); `llvm_rint` rounds with the current mode,
            // which is round-to-nearest in this branch.
            case _MM_ROUND_NEAREST_ARM:     return cast(long)(llvm_rint(asDouble));
            case _MM_ROUND_DOWN_ARM:        return cast(long)(llvm_floor(asDouble));
            case _MM_ROUND_UP_ARM:          return cast(long)(llvm_ceil(asDouble));
            case _MM_ROUND_TOWARD_ZERO_ARM: return cast(long)(asDouble);
        }
    }
    else static if (LDC_with_ARM64)
    {
        uint fpscr = arm_get_fpcr();

        // Dispatch to the conversion matching the current rounding mode.
        switch(fpscr & _MM_ROUND_MASK_ARM)
        {
            default:
            case _MM_ROUND_NEAREST_ARM:     return vcvtns_s64_f32(value);
            case _MM_ROUND_DOWN_ARM:        return vcvtms_s64_f32(value);
            case _MM_ROUND_UP_ARM:          return vcvtps_s64_f32(value);
            case _MM_ROUND_TOWARD_ZERO_ARM: return vcvts_s64_f32(value);
        }
    }
    // 64-bit can use an SSE instruction
    else version(D_InlineAsm_X86_64)
    {
        long result;
        version(LDC) // work-around for " Data definition directives inside inline asm are not supported yet."
        {
            asm pure nothrow @nogc @trusted
            {
                movss XMM0, value;
                cvtss2si RAX, XMM0;
                mov result, RAX;
            }
        }
        else
        {
            asm pure nothrow @nogc @trusted
            {
                movss XMM0, value;
                db 0xf3; db 0x48; db 0x0f; db 0x2d; db 0xc0; // cvtss2si RAX, XMM0 (DMD refuses to emit)
                mov result, RAX;
            }
        }
        return result;
    }
    else version(D_InlineAsm_X86)
    {
        // In the case of 32-bit x86 there is no SSE2 way to convert FP to 64-bit int
        // This leads to an unfortunate FPU sequence in every C++ compiler.
        // See: https://godbolt.org/z/vZym77

        // Get current MXCSR rounding
        uint sseRounding;
        ushort savedFPUCW;
        ushort newFPUCW;
        long result;
        asm pure nothrow @nogc @trusted
        {
            stmxcsr sseRounding;
            fld value;
            fnstcw savedFPUCW;
            mov AX, savedFPUCW;
            and AX, 0xf3ff;          // clear FPU rounding bits
            movzx ECX, word ptr sseRounding;
            and ECX, 0x6000;         // only keep SSE rounding bits
            shr ECX, 3;
            or AX, CX;               // make a new control word for FPU with SSE bits
            mov newFPUCW, AX;
            fldcw newFPUCW;
            fistp qword ptr result;            // convert, respecting MXCSR (but not other control word things)
            fldcw savedFPUCW;        // restore the caller's FPU control word
        }
        return result;
    }
    else static if (GDC_with_x86)
    {
        version(X86_64) // 64-bit can just use the right instruction
        {
            static assert(GDC_with_SSE);
            __m128 A;
            A.ptr[0] = value;
            return __builtin_ia32_cvtss2si64 (A);
        }
        else version(X86) // 32-bit
        {
            // This is untested!
            // Same FPU sequence as the DMD 32-bit path above, in GCC syntax.
            // NOTE(review): operands %1, %3 and %4 are written by the
            // assembly but declared as inputs, and `value` uses the "f"
            // constraint here while the double variant uses "t" - confirm
            // before relying on this path.
            uint sseRounding;
            ushort savedFPUCW;
            ushort newFPUCW;
            long result;
            asm pure nothrow @nogc @trusted
            {
                "stmxcsr %1;\n" ~
                "fld %2;\n" ~
                "fnstcw %3;\n" ~
                "movw %3, %%ax;\n" ~
                "andw $0xf3ff, %%ax;\n" ~
                "movzwl %1, %%ecx;\n" ~
                "andl $0x6000, %%ecx;\n" ~
                "shrl $3, %%ecx;\n" ~
                "orw %%cx, %%ax\n" ~
                "movw %%ax, %4;\n" ~
                "fldcw %4;\n" ~
                "fistpll %0;\n" ~
                "fldcw %3;\n" 
                  : "=m"(result)    // %0
                  : "m" (sseRounding),
                    "f" (value),
                    "m" (savedFPUCW),
                    "m" (newFPUCW) 
                  : "eax", "ecx", "st";
            }
            return result;
        }
        else
            static assert(false);
    }
    else
        static assert(false);
}
477 
478 
/// Converts a `double` to a 64-bit signed integer, rounding with the mode
/// currently selected in MXCSR (64-bit `cvtsd2si` semantics). On ARM the
/// rounding mode is taken from fpcr/fpscr instead.
/// Params:
///     value = the number to convert.
/// Returns: `value` rounded to a `long`.
long convertDoubleToInt64UsingMXCSR(double value) @trusted
{
    static if (LDC_with_ARM32)
    {
        // We have to resort to libc since 32-bit ARM 
        // doesn't seem to have 64-bit registers.
        uint fpscr = arm_get_fpcr(); // Get current rounding mode.
        switch(fpscr & _MM_ROUND_MASK_ARM)
        {
            default:
            // Round-to-nearest must send halfway cases to the even integer
            // (2.5 -> 2), like x86 does. `llvm_round` sends them away from
            // zero (2.5 -> 3); `llvm_rint` rounds with the current mode,
            // which is round-to-nearest in this branch.
            case _MM_ROUND_NEAREST_ARM:     return cast(long)(llvm_rint(value));
            case _MM_ROUND_DOWN_ARM:        return cast(long)(llvm_floor(value));
            case _MM_ROUND_UP_ARM:          return cast(long)(llvm_ceil(value));
            case _MM_ROUND_TOWARD_ZERO_ARM: return cast(long)(value);
        }
    }
    else static if (LDC_with_ARM64)
    {
        // Get current rounding mode.
        uint fpscr = arm_get_fpcr();

        // Dispatch to the conversion matching that rounding mode.
        switch(fpscr & _MM_ROUND_MASK_ARM)
        {
            default:
            case _MM_ROUND_NEAREST_ARM:     return vcvtns_s64_f64(value);
            case _MM_ROUND_DOWN_ARM:        return vcvtms_s64_f64(value);
            case _MM_ROUND_UP_ARM:          return vcvtps_s64_f64(value);
            case _MM_ROUND_TOWARD_ZERO_ARM: return vcvts_s64_f64(value);
        }
    }
    // 64-bit can use an SSE instruction
    else version(D_InlineAsm_X86_64)
    {
        long result;
        version(LDC) // work-around for "Data definition directives inside inline asm are not supported yet."
        {
            asm pure nothrow @nogc @trusted
            {
                movsd XMM0, value;
                cvtsd2si RAX, XMM0;
                mov result, RAX;
            }
        }
        else
        {
            asm pure nothrow @nogc @trusted
            {
                movsd XMM0, value;
                db 0xf2; db 0x48; db 0x0f; db 0x2d; db 0xc0; // cvtsd2si RAX, XMM0 (DMD refuses to emit)
                mov result, RAX;
            }
        }
        return result;
    }
    else version(D_InlineAsm_X86)
    {
        // In the case of 32-bit x86 there is no SSE2 way to convert FP to 64-bit int
        // This leads to an unfortunate FPU sequence in every C++ compiler.
        // See: https://godbolt.org/z/vZym77

        // Get current MXCSR rounding
        uint sseRounding;
        ushort savedFPUCW;
        ushort newFPUCW;
        long result;
        asm pure nothrow @nogc @trusted
        {
            stmxcsr sseRounding;
            fld value;
            fnstcw savedFPUCW;
            mov AX, savedFPUCW;
            and AX, 0xf3ff;          // clear FPU rounding bits
            movzx ECX, word ptr sseRounding;
            and ECX, 0x6000;         // only keep SSE rounding bits
            shr ECX, 3;
            or AX, CX;               // make a new control word for FPU with SSE bits
            mov newFPUCW, AX;
            fldcw newFPUCW;
            fistp qword ptr result;  // explicit 64-bit store, as in the float variant
            fldcw savedFPUCW;        // restore the caller's FPU control word
        }
        return result;
    }
    else static if (GDC_with_x86)
    {
        version(X86_64)
        {
            static assert(GDC_with_SSE2);
            __m128d A;
            A.ptr[0] = value;
            return __builtin_ia32_cvtsd2si64 (A);
        }
        else
        {
            // This is untested!
            // Same FPU sequence as the DMD 32-bit path above, in GCC syntax.
            // NOTE(review): operands %1, %3 and %4 are written by the
            // assembly but declared as inputs - confirm before relying on
            // this path.
            uint sseRounding;
            ushort savedFPUCW;
            ushort newFPUCW;
            long result;
            asm pure nothrow @nogc @trusted
            {
                "stmxcsr %1;\n" ~
                "fld %2;\n" ~
                "fnstcw %3;\n" ~
                "movw %3, %%ax;\n" ~
                "andw $0xf3ff, %%ax;\n" ~
                "movzwl %1, %%ecx;\n" ~
                "andl $0x6000, %%ecx;\n" ~
                "shrl $3, %%ecx;\n" ~
                "orw %%cx, %%ax\n" ~
                "movw %%ax, %4;\n" ~
                "fldcw %4;\n" ~
                "fistpll %0;\n" ~
                "fldcw %3;\n"         
                  : "=m"(result)    // %0
                  : "m" (sseRounding),
                    "t" (value),
                    "m" (savedFPUCW),
                    "m" (newFPUCW) 
                  : "eax", "ecx", "st";
            }
            return result;
        }
    }
    else
        static assert(false);
}
607 
608 //
609 //  </ROUNDING>
610 //
611 
612 
613 // using the Intel terminology here
614 
/// Narrows a signed 16-bit value to a signed 8-bit one, clamping it to
/// the [-128, 127] range first.
byte saturateSignedWordToSignedByte(short value) pure @safe
{
    if (value >= byte.max)
        return byte.max;
    if (value <= byte.min)
        return byte.min;
    return cast(byte) value;
}
621 
/// Narrows a signed 16-bit value to an unsigned 8-bit one, clamping it to
/// the [0, 255] range first.
ubyte saturateSignedWordToUnsignedByte(short value) pure @safe
{
    if (value < 0)
        return 0;
    if (value > ubyte.max)
        return ubyte.max;
    return cast(ubyte) value;
}
628 
/// Narrows a signed 32-bit value to a signed 16-bit one, clamping it to
/// the [-32768, 32767] range first.
short saturateSignedIntToSignedShort(int value) pure @safe
{
    const int clamped = value > short.max ? short.max
                      : value < short.min ? short.min
                      : value;
    return cast(short) clamped;
}
635 
/// Narrows a signed 32-bit value to an unsigned 16-bit one, clamping it to
/// the [0, 65535] range first.
ushort saturateSignedIntToUnsignedShort(int value) pure @safe
{
    enum int lowest  = ushort.min;
    enum int highest = ushort.max;
    const int clamped = value < lowest ? lowest : (value > highest ? highest : value);
    return cast(ushort) clamped;
}
642 
unittest // test saturate operations
{
    // 16-bit to signed 8-bit: clamps on both sides
    assert(saturateSignedWordToSignedByte(-4000) == byte.min);
    assert(saturateSignedWordToSignedByte(32000) == byte.max);

    // 16-bit to unsigned 8-bit: clamps on both sides
    assert(saturateSignedWordToUnsignedByte(-4000) == ubyte.min);
    assert(saturateSignedWordToUnsignedByte(32000) == ubyte.max);

    // 32-bit to signed 16-bit: one past each bound gets clamped
    assert(saturateSignedIntToSignedShort(-32769) == short.min);
    assert(saturateSignedIntToSignedShort(32768) == short.max);

    // 32-bit to unsigned 16-bit: negatives clamp to zero, 32768 fits unchanged
    assert(saturateSignedIntToUnsignedShort(-32769) == ushort.min);
    assert(saturateSignedIntToUnsignedShort(32768) == 32768);
}
654 
version(unittest)
{
    // Debugging aids for the tests: each helper prints the lanes of a
    // vector on one line of standard output.
    import core.stdc.stdio: printf;

    // Note: you can override `pure` within a `debug` clause

    /// Prints a `__m64` as one 64-bit integer.
    void _mm_print_pi64(__m64 v) @trusted
    {
        long[1] lanes = (cast(long1)v).array;
        printf("%lld\n", lanes[0]);
    }

    /// Prints a `__m64` as two 32-bit integers.
    void _mm_print_pi32(__m64 v) @trusted
    {
        int[2] lanes = (cast(int2)v).array;
        printf("%d %d\n", lanes[0], lanes[1]);
    }

    /// Prints a `__m64` as four 16-bit integers.
    void _mm_print_pi16(__m64 v) @trusted
    {
        short[4] lanes = (cast(short4)v).array;
        printf("%d %d %d %d\n", lanes[0], lanes[1], lanes[2], lanes[3]);
    }

    /// Prints a `__m64` as eight 8-bit integers.
    void _mm_print_pi8(__m64 v) @trusted
    {
        byte[8] lanes = (cast(byte8)v).array;
        printf("%d %d %d %d %d %d %d %d\n",
               lanes[0], lanes[1], lanes[2], lanes[3],
               lanes[4], lanes[5], lanes[6], lanes[7]);
    }

    /// Prints a `__m128i` as two 64-bit integers.
    void _mm_print_epi64(__m128i v) @trusted
    {
        long[2] lanes = (cast(long2)v).array;
        printf("%lld %lld\n", lanes[0], lanes[1]);
    }

    /// Prints a `__m128i` as four 32-bit integers.
    void _mm_print_epi32(__m128i v) @trusted
    {
        int[4] lanes = v.array;
        printf("%d %d %d %d\n",
               lanes[0], lanes[1], lanes[2], lanes[3]);
    }

    /// Prints a `__m128i` as eight 16-bit integers.
    void _mm_print_epi16(__m128i v) @trusted
    {
        short[8] lanes = (cast(short8)v).array;
        printf("%d %d %d %d %d %d %d %d\n",
               lanes[0], lanes[1], lanes[2], lanes[3],
               lanes[4], lanes[5], lanes[6], lanes[7]);
    }

    /// Prints a `__m128i` as sixteen 8-bit integers.
    void _mm_print_epi8(__m128i v) @trusted
    {
        byte[16] lanes = (cast(byte16)v).array;
        printf("%d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d\n",
               lanes[0], lanes[1], lanes[2],  lanes[3],  lanes[4],  lanes[5],  lanes[6],  lanes[7],
               lanes[8], lanes[9], lanes[10], lanes[11], lanes[12], lanes[13], lanes[14], lanes[15]);
    }

    /// Prints a `__m128` as four floats.
    void _mm_print_ps(__m128 v) @trusted
    {
        float[4] lanes = (cast(float4)v).array;
        printf("%f %f %f %f\n", lanes[0], lanes[1], lanes[2], lanes[3]);
    }

    /// Prints a `__m128d` as two doubles.
    void _mm_print_pd(__m128d v) @trusted
    {
        double[2] lanes = (cast(double2)v).array;
        printf("%f %f\n", lanes[0], lanes[1]);
    }
}
726 
727 
728 //
729 //  <FLOATING-POINT COMPARISONS>
730 //
731 // Note: `ldc.simd` cannot express all nuances of FP comparisons, so we
732 //       need different IR generation.
733 
/// Floating-point comparison predicates.
/// "Ordered" predicates are false as soon as one operand is NaN,
/// "unordered" predicates are true as soon as one operand is NaN.
enum FPComparison
{
    // Ordered predicates
    oeq,   // equal
    ogt,   // greater than
    oge,   // greater than or equal
    olt,   // less than
    ole,   // less than or equal
    one,   // not equal
    ord,   // ordered (no nans)

    // Unordered predicates
    ueq,   // equal
    ugt,   // greater than ("nle")
    uge,   // greater than or equal ("nlt")
    ult,   // less than ("nge")
    ule,   // less than or equal ("ngt")
    une,   // not equal ("neq")
    uno,   // unordered (either nans)
}

// Predicate spelling for each FPComparison member, indexed by the member's
// value. Must stay in the same order as the enum above.
private static immutable string[FPComparison.max+1] FPComparisonToString =
[
    "oeq", "ogt", "oge", "olt", "ole", "one", "ord",
    "ueq", "ugt", "uge", "ult", "ule", "une", "uno",
];
769 
// Individual float comparison: returns `true` when `comparison` holds
// between `a` and `b`, `false` otherwise.
// Useful for DMD and testing
private bool compareFloat(T)(FPComparison comparison, T a, T b) pure @safe
{
    // "Unordered" means at least one operand is NaN.
    const bool eitherIsNaN = isnan(a) || isnan(b);

    with (FPComparison) final switch (comparison)
    {
        // Ordered predicates: the plain operators are already false on NaN,
        // except `!=` which is always true on NaN and needs the extra check.
        case oeq: return a == b;
        case ogt: return a > b;
        case oge: return a >= b;
        case olt: return a < b;
        case ole: return a <= b;
        case one: return !eitherIsNaN && !(a == b);
        case ord: return !eitherIsNaN;

        // Unordered predicates: true on NaN, else the plain comparison.
        case ueq: return eitherIsNaN || (a == b);
        case ugt: return eitherIsNaN || (a > b);
        case uge: return eitherIsNaN || (a >= b);
        case ult: return eitherIsNaN || (a < b);
        case ule: return eitherIsNaN || (a <= b);
        case une: return !(a == b); // same as `!=`, always true on NaN
        case uno: return eitherIsNaN;
    }
}
793 
794 version(LDC)
795 {
796     /// Provides packed float comparisons
797     package int4 cmpps(FPComparison comparison)(float4 a, float4 b) pure @safe
798     {
799         enum ir = `
800             %cmp = fcmp `~ FPComparisonToString[comparison] ~` <4 x float> %0, %1
801             %r = sext <4 x i1> %cmp to <4 x i32>
802             ret <4 x i32> %r`;
803 
804         return LDCInlineIR!(ir, int4, float4, float4)(a, b);
805     }
806 
807     /// Provides packed double comparisons
808     package long2 cmppd(FPComparison comparison)(double2 a, double2 b) pure @safe
809     {
810         enum ir = `
811             %cmp = fcmp `~ FPComparisonToString[comparison] ~` <2 x double> %0, %1
812             %r = sext <2 x i1> %cmp to <2 x i64>
813             ret <2 x i64> %r`;
814 
815         return LDCInlineIR!(ir, long2, double2, double2)(a, b);
816     }
817 
818     /// CMPSS-style comparisons
819     /// clang implement it through x86 intrinsics, it is possible with IR alone
820     /// but leads to less optimal code.
821     /// PERF: try to implement it with __builtin_ia32_cmpss and immediate 0 to 7. 
822     /// Not that simple.
823     package float4 cmpss(FPComparison comparison)(float4 a, float4 b) pure @safe
824     {
825         /*
826         enum ubyte predicateNumber = FPComparisonToX86Predicate[comparison];
827         enum bool invertOp = (predicateNumber & 0x80) != 0;
828         static if(invertOp)
829             return __builtin_ia32_cmpsd(b, a, predicateNumber & 0x7f);
830         else
831             return __builtin_ia32_cmpsd(a, b, predicateNumber & 0x7f);
832         */
833         enum ir = `
834             %cmp = fcmp `~ FPComparisonToString[comparison] ~` float %0, %1
835             %r = sext i1 %cmp to i32
836             %r2 = bitcast i32 %r to float
837             ret float %r2`;
838 
839         float4 r = a;
840         r[0] = LDCInlineIR!(ir, float, float, float)(a[0], b[0]);
841         return r;
842     }
843 
844     /// CMPSD-style comparisons
845     /// clang implement it through x86 intrinsics, it is possible with IR alone
846     /// but leads to less optimal code.
847     /// PERF: try to implement it with __builtin_ia32_cmpsd and immediate 0 to 7. 
848     /// Not that simple.    
849     package double2 cmpsd(FPComparison comparison)(double2 a, double2 b) pure @safe
850     {
851         enum ir = `
852             %cmp = fcmp `~ FPComparisonToString[comparison] ~` double %0, %1
853             %r = sext i1 %cmp to i64
854             %r2 = bitcast i64 %r to double
855             ret double %r2`;
856 
857         double2 r = a;
858         r[0] = LDCInlineIR!(ir, double, double, double)(a[0], b[0]);
859         return r;
860     }
861 }
862 else
863 {
864     /// Provides packed float comparisons
865     package int4 cmpps(FPComparison comparison)(float4 a, float4 b) pure @trusted
866     {
867         int4 result;
868         foreach(i; 0..4)
869         {
870             result.ptr[i] = compareFloat!float(comparison, a.array[i], b.array[i]) ? -1 : 0;
871         }
872         return result;
873     }
874 
875     /// Provides packed double comparisons
876     package long2 cmppd(FPComparison comparison)(double2 a, double2 b) pure @trusted
877     {
878         long2 result;
879         foreach(i; 0..2)
880         {
881             result.ptr[i] = compareFloat!double(comparison, a.array[i], b.array[i]) ? -1 : 0;
882         }
883         return result;
884     }
885 
886     /// Provides CMPSS-style comparison
887     package float4 cmpss(FPComparison comparison)(float4 a, float4 b) pure @trusted
888     {
889         int4 result = cast(int4)a;
890         result.ptr[0] = compareFloat!float(comparison, a.array[0], b.array[0]) ? -1 : 0;
891         return cast(float4)result;
892     }
893 
894     /// Provides CMPSD-style comparison
895     package double2 cmpsd(FPComparison comparison)(double2 a, double2 b) pure @trusted
896     {
897         long2 result = cast(long2)a;
898         result.ptr[0] = compareFloat!double(comparison, a.array[0], b.array[0]) ? -1 : 0;
899         return cast(double2)result;
900     }
901 }
unittest // cmpps
{
    // Evaluate one predicate on both operands and compare every lane with the expected mask.
    static void verify(FPComparison predicate)(float4 lhs, float4 rhs, const int[4] expected)
    {
        int4 mask = cmpps!predicate(lhs, rhs);
        assert(mask.array == expected);
    }

    // Lanes cover, in order: less-than, equal, greater-than, unordered (NaN on the left).
    float4 A = [1, 3, 5, float.nan];
    float4 B = [2, 3, 4, 5];

    // Expected masks for every comparison type (-1 = true, 0 = false).
    static immutable int[4] correct_oeq    = [ 0,-1, 0, 0];
    static immutable int[4] correct_ogt    = [ 0, 0,-1, 0];
    static immutable int[4] correct_oge    = [ 0,-1,-1, 0];
    static immutable int[4] correct_olt    = [-1, 0, 0, 0];
    static immutable int[4] correct_ole    = [-1,-1, 0, 0];
    static immutable int[4] correct_one    = [-1, 0,-1, 0];
    static immutable int[4] correct_ord    = [-1,-1,-1, 0];
    static immutable int[4] correct_ueq    = [ 0,-1, 0,-1];
    static immutable int[4] correct_ugt    = [ 0, 0,-1,-1];
    static immutable int[4] correct_uge    = [ 0,-1,-1,-1];
    static immutable int[4] correct_ult    = [-1, 0, 0,-1];
    static immutable int[4] correct_ule    = [-1,-1, 0,-1];
    static immutable int[4] correct_une    = [-1, 0,-1,-1];
    static immutable int[4] correct_uno    = [ 0, 0, 0,-1];

    // Ordered predicates
    verify!(FPComparison.oeq)(A, B, correct_oeq);
    verify!(FPComparison.ogt)(A, B, correct_ogt);
    verify!(FPComparison.oge)(A, B, correct_oge);
    verify!(FPComparison.olt)(A, B, correct_olt);
    verify!(FPComparison.ole)(A, B, correct_ole);
    verify!(FPComparison.one)(A, B, correct_one);
    verify!(FPComparison.ord)(A, B, correct_ord);

    // Unordered predicates
    verify!(FPComparison.ueq)(A, B, correct_ueq);
    verify!(FPComparison.ugt)(A, B, correct_ugt);
    verify!(FPComparison.uge)(A, B, correct_uge);
    verify!(FPComparison.ult)(A, B, correct_ult);
    verify!(FPComparison.ule)(A, B, correct_ule);
    verify!(FPComparison.une)(A, B, correct_une);
    verify!(FPComparison.uno)(A, B, correct_uno);
}
unittest // cmppd
{
    // Lane 0: 1 < 2 so `ult` holds; lane 1: 3 == 3 so it does not.
    static immutable long[2] expected = [-1L, 0];
    double2 lhs = [1, 3];
    double2 rhs = [2, 3];
    long2 mask = cmppd!(FPComparison.ult)(lhs, rhs);
    assert(mask.array == expected);
}
unittest // cmpss
{
    // Lane 0 must hold the mask matching compareFloat; lanes 1..3 must equal those of the first operand.
    static void checkOne(FPComparison comparison)(float4 lhs, float4 rhs)
    {
        float4 got = cmpss!comparison(lhs, rhs);
        int4 gotBits = cast(int4)got;
        int wanted = 0;
        if (compareFloat!float(comparison, lhs.array[0], rhs.array[0]))
            wanted = -1;
        assert(gotBits.array[0] == wanted);
        foreach (lane; 1 .. 4)
            assert(got.array[lane] == lhs.array[lane]);
    }

    // Run one predicate first against an ordered operand, then against one with NaN in lane 0.
    static void checkBoth(FPComparison comparison)(float4 lhs, float4 ordered, float4 unordered)
    {
        checkOne!comparison(lhs, ordered);
        checkOne!comparison(lhs, unordered);
    }

    // Check that every comparison type works
    float4 A = [1, 3, 5, 6];
    float4 B = [2, 3, 4, 5];
    float4 C = [float.nan, 3, 4, 5];

    checkBoth!(FPComparison.oeq)(A, B, C);
    checkBoth!(FPComparison.ogt)(A, B, C);
    checkBoth!(FPComparison.oge)(A, B, C);
    checkBoth!(FPComparison.olt)(A, B, C);
    checkBoth!(FPComparison.ole)(A, B, C);
    checkBoth!(FPComparison.one)(A, B, C);
    checkBoth!(FPComparison.ord)(A, B, C);
    checkBoth!(FPComparison.ueq)(A, B, C);
    checkBoth!(FPComparison.ugt)(A, B, C);
    checkBoth!(FPComparison.uge)(A, B, C);
    checkBoth!(FPComparison.ult)(A, B, C);
    checkBoth!(FPComparison.ule)(A, B, C);
    checkBoth!(FPComparison.une)(A, B, C);
    checkBoth!(FPComparison.uno)(A, B, C);
}
unittest // cmpsd
{
    // Lane 0 must hold the mask matching compareFloat; lane 1 must equal that of the first operand.
    static void checkOne(FPComparison comparison)(double2 lhs, double2 rhs)
    {
        double2 got = cmpsd!comparison(lhs, rhs);
        long2 gotBits = cast(long2)got;
        long wanted = 0;
        if (compareFloat!double(comparison, lhs.array[0], rhs.array[0]))
            wanted = -1;
        assert(gotBits.array[0] == wanted);
        assert(got.array[1] == lhs.array[1]);
    }

    // Run one predicate first against an ordered operand, then against one with NaN in lane 0.
    static void checkBoth(FPComparison comparison)(double2 lhs, double2 ordered, double2 unordered)
    {
        checkOne!comparison(lhs, ordered);
        checkOne!comparison(lhs, unordered);
    }

    // Check that every comparison type works
    double2 A = [1, 3];
    double2 B = [2, 4];
    double2 C = [double.nan, 5];

    checkBoth!(FPComparison.oeq)(A, B, C);
    checkBoth!(FPComparison.ogt)(A, B, C);
    checkBoth!(FPComparison.oge)(A, B, C);
    checkBoth!(FPComparison.olt)(A, B, C);
    checkBoth!(FPComparison.ole)(A, B, C);
    checkBoth!(FPComparison.one)(A, B, C);
    checkBoth!(FPComparison.ord)(A, B, C);
    checkBoth!(FPComparison.ueq)(A, B, C);
    checkBoth!(FPComparison.ugt)(A, B, C);
    checkBoth!(FPComparison.uge)(A, B, C);
    checkBoth!(FPComparison.ult)(A, B, C);
    checkBoth!(FPComparison.ule)(A, B, C);
    checkBoth!(FPComparison.une)(A, B, C);
    checkBoth!(FPComparison.uno)(A, B, C);
}
1053 
1054 //
1055 //  </FLOATING-POINT COMPARISONS>
1056 //
1057 
1058 
/// Narrows a 128-bit integer vector to 64 bits by keeping its low 64-bit lane.
__m64 to_m64(__m128i a) pure @trusted
{
    long2 asLongs = cast(long2)a;
    long1 lowLane = asLongs.array[0];
    return lowLane;
}
1065 
/// Widens a 64-bit vector to 128 bits: the low 64-bit lane receives `a`, the high lane is zero.
__m128i to_m128i(__m64 a) pure @trusted
{
    // A DigitalMars-only variant (initialize a long2 from a.array[0], then zero lane 1) was
    // tried as a workaround for https://issues.dlang.org/show_bug.cgi?id=21474 but was not
    // sufficient to avoid it, so all compilers share the code below.
    long2 widened = [0, 0];
    widened.ptr[0] = a.array[0];
    return cast(__m128i)widened;
}
1083 
1084 // ADDITIONAL x86 INTRINSICS
1085 // Absent from ldc.gccbuiltins_x86 for some reason, but needed.
1086 // https://github.com/ldc-developers/llvm-project/blob/ldc-release/12.x/llvm/include/llvm/IR/IntrinsicsX86.td
static if (LDC_with_SSE41)
{
    // Declares the LLVM intrinsic `llvm.x86.sse41.pblendvb` under a GCC-style builtin name.
    // It takes three byte16 operands and returns a byte16; the role of each operand is not
    // visible here, see LLVM's IntrinsicsX86.td for the definition.
    pragma(LDC_intrinsic, "llvm.x86.sse41.pblendvb")
        byte16 __builtin_ia32_pblendvb(byte16, byte16, byte16) pure @safe;
}
1092 
1093 // SOME NEON INTRINSICS
1094 // Emulating some x86 intrinsics needs access to a range of ARM intrinsics.
// They are not part of the public API, although the simde project exposes all of them to its users.
1096 // MAYDO: create a new neon.d module, for internal use only.
1097 // MAYDO: port them to ARM32 so that ARM32 can be as fast as ARM64.
1098 static if (LDC_with_ARM64)
1099 {
1100     // VERY USEFUL LINK
1101     // https://github.com/ldc-developers/llvm-project/blob/ldc-release/11.x/llvm/include/llvm/IR/IntrinsicsAArch64.td
1102     // Also: https://developer.arm.com/architectures/instruction-sets/intrinsics/
1103 
    // Binding to LLVM `uabd` on 16 x i8 (UABD: unsigned absolute difference).
    pragma(LDC_intrinsic, "llvm.aarch64.neon.uabd.v16i8")
        byte16 vabdq_u8(byte16 a, byte16 b) pure @safe;

    // Binding to LLVM `abs` on 8 x i16 (absolute value per lane).
    pragma(LDC_intrinsic, "llvm.aarch64.neon.abs.v8i16")
        short8 vabsq_s16(short8 a) pure @safe;

    // Binding to LLVM `abs` on 4 x i32.
    pragma(LDC_intrinsic, "llvm.aarch64.neon.abs.v4i32")
        int4 vabsq_s32(int4 a) pure @safe;

    // Binding to LLVM `abs` on 16 x i8.
    pragma(LDC_intrinsic, "llvm.aarch64.neon.abs.v16i8")
        byte16 vabsq_s8(byte16 a) pure @safe;
1115 
1116     byte8 vand_u8(byte8 a, byte8 b) pure @safe
1117     {
1118         return a & b;
1119     }
1120 
1121     long2 vandq_s64(long2 a, long2 b)
1122     {
1123         return a & b;
1124     }
1125 
1126     long2 vbicq_s64(long2 a, long2 b) pure @safe
1127     {
1128         return a & ~b;
1129     }
1130 
1131     int4 vbslq_s32(int4 a, int4 b, int4 c) pure @safe
1132     {
1133         return c ^ ((c ^ b) & a);
1134     }
1135 
1136     byte16 vbslq_s8(byte16 a, byte16 b, byte16 c) pure @safe
1137     {
1138         return c ^ ((c ^ b) & a);
1139     }
1140 
1141     long2 vbslq_s64(long2 a, long2 b, long2 c) pure @safe
1142     {
1143         return c ^ ((c ^ b) & a);
1144     }
1145 
1146     short8 vcombine_s16(short4 lo, short4 hi) pure @trusted
1147     {
1148         short8 r;
1149         r.ptr[0]  = lo.array[0];
1150         r.ptr[1]  = lo.array[1];
1151         r.ptr[2]  = lo.array[2];
1152         r.ptr[3]  = lo.array[3];
1153         r.ptr[4]  = hi.array[0];
1154         r.ptr[5]  = hi.array[1];
1155         r.ptr[6]  = hi.array[2];
1156         r.ptr[7]  = hi.array[3];
1157         return r;
1158     }
1159 
1160     int4 vcombine_s32(int2 lo, int2 hi) pure @trusted
1161     {
1162         int4 r;
1163         r.ptr[0] = lo.array[0];
1164         r.ptr[1] = lo.array[1];
1165         r.ptr[2] = hi.array[0];
1166         r.ptr[3] = hi.array[1];
1167         return r;
1168     }
1169 
1170     byte16 vcombine_s8(byte8 lo, byte8 hi) pure @trusted
1171     {
1172         byte16 r;
1173         r.ptr[0]  = lo.array[0];
1174         r.ptr[1]  = lo.array[1];
1175         r.ptr[2]  = lo.array[2];
1176         r.ptr[3]  = lo.array[3];
1177         r.ptr[4]  = lo.array[4];
1178         r.ptr[5]  = lo.array[5];
1179         r.ptr[6]  = lo.array[6];
1180         r.ptr[7]  = lo.array[7];
1181         r.ptr[8]  = hi.array[0];
1182         r.ptr[9]  = hi.array[1];
1183         r.ptr[10] = hi.array[2];
1184         r.ptr[11] = hi.array[3];
1185         r.ptr[12] = hi.array[4];
1186         r.ptr[13] = hi.array[5];
1187         r.ptr[14] = hi.array[6];
1188         r.ptr[15] = hi.array[7];
1189         return r;
1190     }
1191 
1192     short8 vcombine_u16(short4 lo, short4 hi) pure @trusted
1193     {
1194         short8 r;
1195         r.ptr[0]  = lo.array[0];
1196         r.ptr[1]  = lo.array[1];
1197         r.ptr[2]  = lo.array[2];
1198         r.ptr[3]  = lo.array[3];
1199         r.ptr[4]  = hi.array[0];
1200         r.ptr[5]  = hi.array[1];
1201         r.ptr[6]  = hi.array[2];
1202         r.ptr[7]  = hi.array[3];
1203         return r;
1204     }
1205 
1206 
1207     // float4 => int4
1208 
    // The four bindings below differ only in the AArch64 conversion instruction they map to.

    // FCVTMS: convert to signed integer, rounding toward minus infinity.
    pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtms.v4i32.v4f32")
        int4 vcvtmq_s32_f32(float4 a) pure @safe;

    // FCVTNS: convert to signed integer, rounding to nearest with ties to even.
    pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtns.v4i32.v4f32")
        int4 vcvtnq_s32_f32(float4 a) pure @safe;

    // FCVTPS: convert to signed integer, rounding toward plus infinity.
    pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtps.v4i32.v4f32")
        int4 vcvtpq_s32_f32(float4 a) pure @safe;

    // FCVTZS: convert to signed integer, rounding toward zero.
    pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtzs.v4i32.v4f32")
        int4 vcvtzq_s32_f32(float4 a) pure @safe;
1220 
1221 
1222     // double2 => long2
1223 
    // FCVTMS on 2 x f64: rounding toward minus infinity.
    pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtms.v2i64.v2f64")
        long2 vcvtmq_s64_f64(double2 a) pure @safe;

    // FCVTNS on 2 x f64: rounding to nearest with ties to even.
    pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtns.v2i64.v2f64")
        long2 vcvtnq_s64_f64(double2 a) pure @safe;

    // FCVTPS on 2 x f64: rounding toward plus infinity.
    pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtps.v2i64.v2f64")
        long2 vcvtpq_s64_f64(double2 a) pure @safe;

    // FCVTZS on 2 x f64: rounding toward zero.
    pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtzs.v2i64.v2f64")
        long2 vcvtzq_s64_f64(double2 a) pure @safe;
1235 
    // Scalar float => int conversions.

    // FCVTMS: rounding toward minus infinity.
    pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtms.i32.f32")
        int vcvtms_s32_f32(float a) pure @safe;

    // FCVTNS: rounding to nearest with ties to even.
    pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtns.i32.f32")
        int vcvtns_s32_f32(float a) pure @safe;    

    // FCVTPS: rounding toward plus infinity.
    pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtps.i32.f32")
        int vcvtps_s32_f32(float a) pure @safe;

    // FCVTZS: rounding toward zero.
    pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtzs.i32.f32")
        int vcvts_s32_f32(float a) pure @safe;
1247      
    // Scalar double => int conversions.

    // FCVTMS: rounding toward minus infinity.
    pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtms.i32.f64")
        int vcvtms_s32_f64(double a) pure @safe;

    // FCVTNS: rounding to nearest with ties to even.
    pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtns.i32.f64")
        int vcvtns_s32_f64(double a) pure @safe;    

    // FCVTPS: rounding toward plus infinity.
    pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtps.i32.f64")
        int vcvtps_s32_f64(double a) pure @safe;

    // FCVTZS: rounding toward zero.
    pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtzs.i32.f64")
        int vcvts_s32_f64(double a) pure @safe;
1259 
    // Scalar float => long conversions.

    // FCVTMS: rounding toward minus infinity.
    pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtms.i64.f32")
        long vcvtms_s64_f32(float a) pure @safe;

    // FCVTNS: rounding to nearest with ties to even.
    pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtns.i64.f32")
        long vcvtns_s64_f32(float a) pure @safe;    

    // FCVTPS: rounding toward plus infinity.
    pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtps.i64.f32")
        long vcvtps_s64_f32(float a) pure @safe;

    // FCVTZS: rounding toward zero.
    pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtzs.i64.f32")
        long vcvts_s64_f32(float a) pure @safe;
1271 
    // Scalar double => long conversions.

    // FCVTMS: rounding toward minus infinity.
    pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtms.i64.f64")
        long vcvtms_s64_f64(double a) pure @safe;

    // FCVTNS: rounding to nearest with ties to even.
    pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtns.i64.f64")
        long vcvtns_s64_f64(double a) pure @safe;    

    // FCVTPS: rounding toward plus infinity.
    pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtps.i64.f64")
        long vcvtps_s64_f64(double a) pure @safe; // Note: technically should be named vcvtpd_s64_f64

    // FCVTZS: rounding toward zero.
    pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtzs.i64.f64")
        long vcvts_s64_f64(double a) pure @safe;
1283 
1284     long2 vdupq_n_s64(long value) pure @safe
1285     {
1286         long2 r;
1287         r = value;
1288         return r;
1289     }
1290 
1291     short4 vget_high_s16(short8 a) pure @trusted
1292     {
1293         short4 r;
1294         r.ptr[0] = a.array[4];
1295         r.ptr[1] = a.array[5];
1296         r.ptr[2] = a.array[6];
1297         r.ptr[3] = a.array[7];
1298         return r;
1299     }
1300 
1301     int2 vget_high_s32(int4 a) pure @trusted
1302     {
1303         int2 r;
1304         r.ptr[0] = a.array[2];
1305         r.ptr[1] = a.array[3];
1306         return r;
1307     }
1308 
1309     byte8 vget_high_u8(byte16 a) pure @trusted
1310     {
1311         byte8 r;
1312         r.ptr[0] = a.array[8];
1313         r.ptr[1] = a.array[9];
1314         r.ptr[2] = a.array[10];
1315         r.ptr[3] = a.array[11];
1316         r.ptr[4] = a.array[12];
1317         r.ptr[5] = a.array[13];
1318         r.ptr[6] = a.array[14];
1319         r.ptr[7] = a.array[15];
1320         return r;
1321     }
1322 
1323     short4 vget_low_s16(short8 a) pure @trusted
1324     {
1325         short4 r;
1326         r.ptr[0] = a.array[0];
1327         r.ptr[1] = a.array[1];
1328         r.ptr[2] = a.array[2];
1329         r.ptr[3] = a.array[3];
1330         return r;
1331     } 
1332 
1333     int2 vget_low_s32(int4 a) pure @trusted
1334     {
1335         int2 r;
1336         r.ptr[0] = a.array[0];
1337         r.ptr[1] = a.array[1];
1338         return r;
1339     }
1340 
1341     byte8 vget_low_u8(byte16 a) pure @trusted
1342     {
1343         byte8 r;
1344         r.ptr[0] = a.array[0];
1345         r.ptr[1] = a.array[1];
1346         r.ptr[2] = a.array[2];
1347         r.ptr[3] = a.array[3];
1348         r.ptr[4] = a.array[4];
1349         r.ptr[5] = a.array[5];
1350         r.ptr[6] = a.array[6];
1351         r.ptr[7] = a.array[7];
1352         return r;
1353     }
1354 
1355     long vgetq_lane_s64(long2 v, const int lane) pure @safe
1356     {
1357         return v.array[lane];
1358     }
1359 
    // Binding to LLVM `smax` on 8 x i16 (signed maximum per lane).
    pragma(LDC_intrinsic, "llvm.aarch64.neon.smax.v8i16")
        short8 vmaxq_s16(short8 a, short8 b) pure @safe;
1362 
1363     int4 vmaxq_s32(int4 a, int4 b)
1364     {
1365         int4 r;
1366         r[0] = a[0] >= b[0] ? a[0] : b[0];
1367         r[1] = a[1] >= b[1] ? a[1] : b[1];
1368         r[2] = a[2] >= b[2] ? a[2] : b[2];
1369         r[3] = a[3] >= b[3] ? a[3] : b[3];
1370         return r;
1371     }
1372 
    // Binding to LLVM `smin` on 8 x i16 (signed minimum per lane).
    pragma(LDC_intrinsic, "llvm.aarch64.neon.smin.v8i16")
        short8 vminq_s16(short8 a, short8 b) pure @safe;
1375 
1376     int2 vmovn_s64(long2 a) pure @trusted
1377     {
1378         int2 r;
1379         r.ptr[0] = cast(int)(a.array[0]);
1380         r.ptr[1] = cast(int)(a.array[1]);
1381         return r;
1382     }        
1383 
1384     int4 vmull_s16(short4 a, short4 b) pure @trusted
1385     {
1386         int4 r;
1387         r.ptr[0] = a.array[0] * b.array[0];
1388         r.ptr[1] = a.array[1] * b.array[1];
1389         r.ptr[2] = a.array[2] * b.array[2];
1390         r.ptr[3] = a.array[3] * b.array[3];
1391         return r;
1392     }
1393 
    // Binding to LLVM `smull` producing 2 x i64 (SMULL: signed multiply long).
    pragma(LDC_intrinsic, "llvm.aarch64.neon.smull.v2i64")
        long2 vmull_s32(int2 a, int2 b) pure @safe;

    // Bindings to LLVM `addp` (ADDP: add pairwise) on 64-bit vectors.
    pragma(LDC_intrinsic, "llvm.aarch64.neon.addp.v4i16")
        short4 vpadd_s16(short4 a, short4 b) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.addp.v2i32")
        int2 vpadd_s32(int2 a, int2 b) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.addp.v8i8")
        byte8 vpadd_u8(byte8 a, byte8 b) pure @safe;

    // Binding to LLVM `uaddlp` from 16 x i8 to 8 x i16 (UADDLP: unsigned add long pairwise).
    pragma(LDC_intrinsic, "llvm.aarch64.neon.uaddlp.v8i16.v16i8")
        short8 vpaddlq_u8 (byte16 a) pure @safe;
1408 
    // Pairwise add of float4. The LLVM intrinsic name depends on the compiler version.
    static if(__VERSION__ >= 2088) // LDC 1.18 started using LLVM 9, which changed the name of the builtin
    {
        pragma(LDC_intrinsic, "llvm.aarch64.neon.faddp.v4f32")
            float4 vpaddq_f32(float4 a, float4 b) pure @safe;
    }
    else
    {
        // Older front-ends: the intrinsic is still named `addp`.
        pragma(LDC_intrinsic, "llvm.aarch64.neon.addp.v4f32")
            float4 vpaddq_f32(float4 a, float4 b) pure @safe;
    }
1419     
    // Bindings to LLVM `addp` (ADDP: add pairwise) on 128-bit integer vectors.
    pragma(LDC_intrinsic, "llvm.aarch64.neon.addp.v8i16")
        short8 vpaddq_s16(short8 a, short8 b) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.addp.v16i8")
        byte16 vpaddq_s8(byte16 a, byte16 b) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.addp.v4i32")
        int4 vpaddq_s32(int4 a, int4 b) pure @safe;
1428 
    // SQADD: signed saturating add.
    pragma(LDC_intrinsic, "llvm.aarch64.neon.sqadd.v4i16")
        short4 vqadd_s16(short4 a, short4 b) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.sqadd.v8i16")
        short8 vqaddq_s16(short8 a, short8 b) pure @safe;

    // SQXTN: signed saturating extract narrow (each lane is halved in width).
    pragma(LDC_intrinsic, "llvm.aarch64.neon.sqxtn.v8i8")
        byte8 vqmovn_s16(short8 a) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.sqxtn.v4i16")
        short4 vqmovn_s32(int4 a) pure @safe;

    // UQXTN: unsigned saturating extract narrow.
    pragma(LDC_intrinsic, "llvm.aarch64.neon.uqxtn.v4i16")
        short4 vqmovn_u32(int4 a) pure @safe;

    // SQXTUN: signed saturating extract unsigned narrow.
    pragma(LDC_intrinsic, "llvm.aarch64.neon.sqxtun.v8i8")
        byte8 vqmovun_s16(short8 a) pure @safe;

    // SQSUB: signed saturating subtract.
    pragma(LDC_intrinsic, "llvm.aarch64.neon.sqsub.v4i16")
        short4 vqsub_s16(short4 a, short4 b) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.sqsub.v8i16")
        short8 vqsubq_s16(short8 a, short8 b) pure @safe;
1452 
    // TBL (one table register): byte table lookup, 16 indices into a 16-byte table.
    pragma(LDC_intrinsic, "llvm.aarch64.neon.tbl1.v16i8")
        byte16 vqtbl1q_s8(byte16 t, byte16 idx) pure @safe;

    // URHADD: unsigned rounding halving add.
    // NOTE(review): both bindings below operate on 128-bit vectors although their names lack
    // the `q` suffix that ARM uses for 128-bit variants; confirm against callers before renaming.
    pragma(LDC_intrinsic, "llvm.aarch64.neon.urhadd.v16i8")
        byte16 vrhadd_u8(byte16 a, byte16 b) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.urhadd.v8i16")
        short8 vrhadd_u16(short8 a, short8 b) pure @safe;

    // RSHRN: rounding shift right narrow, from 4 x i32 to 4 x i16, by `n` bits.
    pragma(LDC_intrinsic, "llvm.aarch64.neon.rshrn.v4i16")
        short4 vrshrn_n_s32(int4 a, int n) pure @safe;        
1464 
1465     byte8 vshr_u8(byte8 a, byte8 b) pure @safe
1466     {
1467         return a >>> b;
1468     }
1469 
1470     byte16 vshrq_n_s8(byte16 a, byte r) pure @safe
1471     { 
1472         a = a >> byte16(cast(byte)r);
1473         return a;
1474     }
1475 
    // TBL (one table register), 64-bit result: 8 byte indices `idx` into the 16-byte table `t`.
    pragma(LDC_intrinsic, "llvm.aarch64.neon.tbl1.v8i8")
        byte8 vtbl1_s8(byte16 t, byte8 idx) pure @safe;
1478 }
1479 
version(unittest)
{
    /// Absolute value of a double; only compiled for unit tests.
    double abs_double(double x) @trusted
    {
        version(LDC)
        {
            return llvm_fabs(x);
        }
        else
        {
            // Clear the sign bit (bit 63) of the 64-bit representation.
            enum long SIGN_CLEARED = 0x7fffffff_ffffffff;
            long bits = *cast(long*)(&x);
            long magnitude = bits & SIGN_CLEARED;
            return *cast(double*)(&magnitude);
        }
    }
}
1494 
1495 // needed because in old GDC from travis, core.stdc.math.isnan isn't pure
1496 
/// Returns: true if `x` is a NaN, i.e. its exponent bits are all ones and its mantissa is non-zero.
bool isnan(float x) pure @trusted
{
    enum uint EXPONENT_BITS = 0x7F800000;
    enum uint MANTISSA_BITS = 0x007FFFFF;

    immutable uint bits = *cast(uint*)(&x);
    if ((bits & EXPONENT_BITS) != EXPONENT_BITS)
        return false; // finite number
    return (bits & MANTISSA_BITS) != 0; // zero mantissa would be an infinity
}
unittest // isnan(float)
{
    // NaN is detected; zero and infinity are not.
    assert(isnan(float.nan));
    assert(!isnan(0.0f));
    assert(!isnan(float.infinity));
}
1514 
/// Returns: true if `x` is a NaN, i.e. its exponent bits are all ones and its mantissa is non-zero.
bool isnan(double x) pure @trusted
{
    enum ulong EXPONENT_BITS = 0x7FF00000_00000000;
    enum ulong MANTISSA_BITS = 0x000FFFFF_FFFFFFFF;

    immutable ulong bits = *cast(ulong*)(&x);
    if ((bits & EXPONENT_BITS) != EXPONENT_BITS)
        return false; // finite number
    return (bits & MANTISSA_BITS) != 0; // zero mantissa would be an infinity
}
unittest // isnan(double)
{
    // NaN is detected; zero and infinity are not.
    assert(isnan(double.nan));
    assert(!isnan(0.0));
    assert(!isnan(double.infinity));
}