inteli.internals source code

1 /**
2 * Internal stuff only, do not import.
3 *
4 * Copyright: Copyright Guillaume Piolat 2016-2020, Stefanos Baziotis 2019.
5 * License:   $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
6 */
7 module inteli.internals;
8 
9 import inteli.types;
10 
11 // The only math functions needed for intel-intrinsics
12 public import core.math: sqrt; // since it's an intrinsics
13 
14 package:
15 nothrow:
16 @nogc:
17 
18 
// Compile-time capability flags for the GDC compiler.
// Every flag is always defined, so the rest of the library can test
// them with `static if` whatever the compiler or target is.
version(GNU)
{
    version (X86_64)
    {
        // GDC support uses extended inline assembly:
        //   https://gcc.gnu.org/onlinedocs/gcc/Extended-Asm.html        (general information and hints)
        //   https://gcc.gnu.org/onlinedocs/gcc/Simple-Constraints.html  (binding variables to registers)
        //   https://gcc.gnu.org/onlinedocs/gcc/Machine-Constraints.html (x86 specific register short names)

        public import core.simd;

        // NOTE: These intrinsics are not available in every i386 and x86_64 CPU.
        // For more info: https://gcc.gnu.org/onlinedocs/gcc-4.9.2/gcc/X86-Built-in-Functions.html 
        public import gcc.builtins;

        // None of the following can be detected at compile-time with GDC:
        // MMX, SSE and SSE2 are assumed to be there, SSE3 is assumed absent.
        enum bool GDC_with_x86  = true;
        enum bool GDC_with_MMX  = true;
        enum bool GDC_with_SSE  = true;
        enum bool GDC_with_SSE2 = true;
        enum bool GDC_with_SSE3 = false; // TODO: we don't have a way to detect that at CT
    }
    else version (X86)
    {
        // For 32-bit x86, disable vector extensions with GDC. 
        // It just doesn't work well.
        enum bool GDC_with_x86  = true;
        enum bool GDC_with_MMX  = false;
        enum bool GDC_with_SSE  = false;
        enum bool GDC_with_SSE2 = false;
        enum bool GDC_with_SSE3 = false;
    }
    else
    {
        // GDC targeting something that is not x86.
        enum bool GDC_with_x86  = false;
        enum bool GDC_with_MMX  = false;
        enum bool GDC_with_SSE  = false;
        enum bool GDC_with_SSE2 = false;
        enum bool GDC_with_SSE3 = false;
    }
}
else
{
    // Not GDC at all.
    enum bool GDC_with_x86  = false;
    enum bool GDC_with_MMX  = false;
    enum bool GDC_with_SSE  = false;
    enum bool GDC_with_SSE2 = false;
    enum bool GDC_with_SSE3 = false;
}
67 
// Compile-time capability flags and inline-IR aliases for the LDC compiler.
// Every `LDC_with_*` flag is always defined, so it can be tested with
// `static if` whatever the compiler or target is.
version(LDC)
{
    public import core.simd;
    public import ldc.simd;
    public import ldc.intrinsics;
    public import ldc.llvmasm: __asm;

    static if (__VERSION__ < 2083)
    {
        // Before LDC 1.13 only `inlineIR` exists; there is no variant
        // taking a prefix/suffix, hence no `LDCInlineIREx` here.
        alias LDCInlineIR = inlineIR;
    }
    else
    {
        // Since LDC 1.13, using the new ldc.llvmasm.__ir variants instead of inlineIR
        import ldc.llvmasm;
        alias LDCInlineIR = __ir_pure;

        // A version of inline IR with prefix/suffix didn't exist before LDC 1.13
        alias LDCInlineIREx = __irEx_pure;
    }

    version(AArch64)
    {
        enum bool LDC_with_ARM32 = false;
        enum bool LDC_with_ARM64 = true;
        enum bool LDC_with_SSE1  = false;
        enum bool LDC_with_SSE2  = false;
        enum bool LDC_with_SSE3  = false;
    }
    else version(ARM)
    {
        public import ldc.gccbuiltins_arm;
        enum bool LDC_with_ARM32 = true;
        enum bool LDC_with_ARM64 = false;
        enum bool LDC_with_SSE1  = false;
        enum bool LDC_with_SSE2  = false;
        enum bool LDC_with_SSE3  = false;
    }
    else
    {
        // Any other target is treated as x86: the SSE level is whatever
        // the compilation target reports.
        public import ldc.gccbuiltins_x86;
        enum bool LDC_with_ARM32 = false;
        enum bool LDC_with_ARM64 = false;
        enum bool LDC_with_SSE1  = __traits(targetHasFeature, "sse");
        enum bool LDC_with_SSE2  = __traits(targetHasFeature, "sse2");
        enum bool LDC_with_SSE3  = __traits(targetHasFeature, "sse3");
    }
}
else
{
    // Not LDC at all.
    enum bool LDC_with_ARM32 = false;
    enum bool LDC_with_ARM64 = false;
    enum bool LDC_with_SSE1  = false;
    enum bool LDC_with_SSE2  = false;
    enum bool LDC_with_SSE3  = false;
}
124 
/// True when compiling with LDC for either 32-bit or 64-bit ARM.
/// Uses the logical `||` so the flag is a `bool` like every other
/// capability flag (bitwise `|` on two bools is integer-promoted).
enum bool LDC_with_ARM = LDC_with_ARM32 || LDC_with_ARM64;
126 
// Compile-time capability flags for the DMD compiler.
// Every `DMD_with_*` flag is always defined, so it can be tested with
// `static if` whatever the compiler or target is.
version(DigitalMars)
{
    version(D_InlineAsm_X86)
    {
        enum bool DMD_with_asm       = true;
        enum bool DMD_with_32bit_asm = true; // sometimes you want a 32-bit DMD only solution
    }
    else version(D_InlineAsm_X86_64)
    {
        enum bool DMD_with_asm       = true;
        enum bool DMD_with_32bit_asm = false;
    }
    else
    {
        enum bool DMD_with_asm       = false;
        enum bool DMD_with_32bit_asm = false;
    }

    // D_SIMD alone is not enough: the SSE-sized vector types must also
    // not be emulated.
    version (D_SIMD)
        enum bool DMD_with_DSIMD = !SSESizedVectorsAreEmulated;
    else
        enum bool DMD_with_DSIMD = false;
}
else
{
    // Not DMD at all.
    enum bool DMD_with_asm       = false;
    enum bool DMD_with_32bit_asm = false;
    enum bool DMD_with_DSIMD     = false;
}
152 
// 32-bit ARM: accessors for the floating-point control word, built on the
// `__builtin_arm_get_fpscr` / `__builtin_arm_set_fpscr` builtins.
// They carry the same names as the AArch64 accessors so that callers
// don't need to distinguish the two architectures.
static if (LDC_with_ARM32)
{
    /// Returns the value read through `__builtin_arm_get_fpscr`.
    package uint arm_get_fpcr() nothrow @nogc @trusted
    {
        uint controlWord = __builtin_arm_get_fpscr();
        return controlWord;
    }

    /// Writes `cw` through `__builtin_arm_set_fpscr`.
    package void arm_set_fpcr(uint cw) nothrow @nogc @trusted
    {
        __builtin_arm_set_fpscr(cw);
    }
}
165 
// AArch64: accessors for the FPCR register, implemented with inline assembly.
static if (LDC_with_ARM64)
{
    // Declared, but not used by the accessors below (see arm_get_fpcr).
    pragma(LDC_intrinsic, "llvm.aarch64.get.fpcr")
        long __builtin_aarch64_get_fpcr() pure nothrow @nogc @safe;

    /// Reads FPCR with a `mrs` instruction.
    package uint arm_get_fpcr() pure nothrow @nogc @trusted
    {
        // LLVM intrinsic "llvm.aarch64.get.fpcr" seems buggy and doesn't return FPCR
        uint controlWord = __asm!uint("mrs $0, fpcr", "=r");
        return controlWord;
    }

    /// Writes `cw` into FPCR with a `msr` instruction.
    package void arm_set_fpcr(uint cw) nothrow @nogc @trusted
    {
        // Note: there doesn't seem to be an intrinsic in LLVM to set FPCR.
        // x2 is borrowed to carry the value: it is spilled to `spillSlot`
        // first and reloaded from it once FPCR has been written.
        enum string writeFpcr =
            "str x2, $1 \n" ~
            "ldr w2, $0 \n" ~
            "msr fpcr, x2 \n" ~
            "ldr x2, $1 ";
        long spillSlot;
        __asm!void(writeFpcr, "m,m", cw, &spillSlot);
    }
}
187 
188 
// ARM-side values of the rounding-mode and flush-to-zero fields.
// For internal use only, since public API deals with a x86 semantic emulation
enum : uint
{
    // The rounding mode occupies the two bits selected by _MM_ROUND_MASK_ARM.
    _MM_ROUND_NEAREST_ARM     = 0x00000000,
    _MM_ROUND_DOWN_ARM        = 0x00800000,
    _MM_ROUND_UP_ARM          = 0x00400000,
    _MM_ROUND_TOWARD_ZERO_ARM = 0x00C00000,
    _MM_ROUND_MASK_ARM        = 0x00C00000,

    // Flush-to-zero is a single separate bit.
    _MM_FLUSH_ZERO_MASK_ARM   = 0x01000000,
}
196 
197 
//
//  <ROUNDING>
//
//  Why is this here? For DMD, we cannot use rint because _MM_SET_ROUNDING_MODE
//  doesn't change the FPU rounding mode, and isn't expected to do so.
//  So we devised these rounding functions to help having consistent rounding between 
//  LDC and DMD. It's important that DMD uses what is in MXCSR to round.
//
//  Note: There is no MXCSR on ARM, but there is FPSCR, which implements similar 
//  functionality.
//  https://developer.arm.com/documentation/dui0068/b/vector-floating-point-programming/vfp-system-registers/fpscr--the-floating-point-status-and-control-register
//  We use FPSCR since it's thread-local, so we can emulate those x86 conversions, albeit slowly.
211 
/// Converts a `float` to a 32-bit signed integer.
/// On x86 this is a `cvtss2si` instruction; on ARM the equivalent
/// conversion is selected so that the rounding follows the rounding mode
/// held in the floating-point control register.
/// Params:
///     value = the number to convert.
/// Returns: `value` converted to `int`.
int convertFloatToInt32UsingMXCSR(float value) @trusted
{
    int result;
    version(GNU)
    {
        asm pure nothrow @nogc @trusted
        {
            "cvtss2si %1, %0\n": "=r"(result) : "x" (value);
        }
    }
    else static if (LDC_with_ARM32)
    {
        // s2 is used as a scratch register: it must be declared as
        // clobbered, else the optimizer may keep a live value in it.
        result = __asm!int(`vldr s2, $1
                            vcvtr.s32.f32 s2, s2
                            vmov $0, s2`, "=r,m,~{s2}", value);
    }
    else static if (LDC_with_ARM64)
    {
        // Get current rounding mode.
        uint fpscr = arm_get_fpcr();

        // Pick the conversion that matches the rounding mode.
        switch(fpscr & _MM_ROUND_MASK_ARM)
        {
            default:
            case _MM_ROUND_NEAREST_ARM:     result = vcvtns_s32_f32(value); break;
            case _MM_ROUND_DOWN_ARM:        result = vcvtms_s32_f32(value); break;
            case _MM_ROUND_UP_ARM:          result = vcvtps_s32_f32(value); break;
            case _MM_ROUND_TOWARD_ZERO_ARM: result = vcvts_s32_f32(value);  break;
        }
    }
    else
    {
        asm pure nothrow @nogc @trusted
        {
            cvtss2si EAX, value;
            mov result, EAX;
        }
    }
    return result;
}
253 
/// Converts a `double` to a 32-bit signed integer.
/// On x86 this is a `cvtsd2si` instruction; on ARM the equivalent
/// conversion is selected so that the rounding follows the rounding mode
/// held in the floating-point control register.
/// Params:
///     value = the number to convert.
/// Returns: `value` converted to `int`.
int convertDoubleToInt32UsingMXCSR(double value) @trusted
{
    int result;
    version(GNU)
    {
        asm pure nothrow @nogc @trusted
        {
            "cvtsd2si %1, %0\n": "=r"(result) : "x" (value);
        }
    }
    else static if (LDC_with_ARM32)
    {
        // d2 and s2 are used as scratch registers: they must be declared
        // as clobbered, else the optimizer may keep live values in them.
        result = __asm!int(`vldr d2, $1
                            vcvtr.s32.f64 s2, d2
                            vmov $0, s2`, "=r,m,~{s2},~{d2}", value);
    }
    else static if (LDC_with_ARM64)
    {
        // Get current rounding mode.
        uint fpscr = arm_get_fpcr();

        // Pick the conversion that matches the rounding mode.
        switch(fpscr & _MM_ROUND_MASK_ARM)
        {
            default:
            case _MM_ROUND_NEAREST_ARM:     result = vcvtns_s32_f64(value); break;
            case _MM_ROUND_DOWN_ARM:        result = vcvtms_s32_f64(value); break;
            case _MM_ROUND_UP_ARM:          result = vcvtps_s32_f64(value); break;
            case _MM_ROUND_TOWARD_ZERO_ARM: result = vcvts_s32_f64(value);  break;
        }
    }
    else
    {
        asm pure nothrow @nogc @trusted
        {
            cvtsd2si EAX, value;
            mov result, EAX;
        }
    }
    return result;
}
295 
/// Converts a `float` to a 64-bit signed integer, rounding according to
/// the rounding mode held in MXCSR (x86) or in the floating-point control
/// register (ARM).
/// Params:
///     value = the number to convert.
/// Returns: `value` converted to `long`.
long convertFloatToInt64UsingMXCSR(float value) @trusted
{
    static if (LDC_with_ARM32)
    {
        // We have to resort to libc since 32-bit ARM 
        // doesn't seem to have 64-bit registers.
        
        uint fpscr = arm_get_fpcr(); // Get current rounding mode.

        // Note: converting to double precision else rounding could be different for large integers
        double asDouble = value; 

        switch(fpscr & _MM_ROUND_MASK_ARM)
        {
            default:
            // llvm_rint and not llvm_round: llvm_round rounds halfway cases
            // away from zero, while round-to-nearest rounds them to even.
            // In this branch the current rounding mode is round-to-nearest,
            // which llvm_rint honours.
            case _MM_ROUND_NEAREST_ARM:     return cast(long)(llvm_rint(asDouble));
            case _MM_ROUND_DOWN_ARM:        return cast(long)(llvm_floor(asDouble));
            case _MM_ROUND_UP_ARM:          return cast(long)(llvm_ceil(asDouble));
            case _MM_ROUND_TOWARD_ZERO_ARM: return cast(long)(asDouble);
        }
    }
    else static if (LDC_with_ARM64)
    {
        // Get current rounding mode, then pick the matching conversion.
        uint fpscr = arm_get_fpcr();

        switch(fpscr & _MM_ROUND_MASK_ARM)
        {
            default:
            case _MM_ROUND_NEAREST_ARM:     return vcvtns_s64_f32(value);
            case _MM_ROUND_DOWN_ARM:        return vcvtms_s64_f32(value);
            case _MM_ROUND_UP_ARM:          return vcvtps_s64_f32(value);
            case _MM_ROUND_TOWARD_ZERO_ARM: return vcvts_s64_f32(value);
        }
    }
    // 64-bit can use an SSE instruction
    else version(D_InlineAsm_X86_64)
    {
        long result;
        version(LDC) // work-around for " Data definition directives inside inline asm are not supported yet."
        {
            asm pure nothrow @nogc @trusted
            {
                movss XMM0, value;
                cvtss2si RAX, XMM0;
                mov result, RAX;
            }
        }
        else
        {
            asm pure nothrow @nogc @trusted
            {
                movss XMM0, value;
                db 0xf3; db 0x48; db 0x0f; db 0x2d; db 0xc0; // cvtss2si RAX, XMM0 (DMD refuses to emit)
                mov result, RAX;
            }
        }
        return result;
    }
    else version(D_InlineAsm_X86)
    {
        // In the case of 32-bit x86 there is no SSE2 way to convert FP to 64-bit int
        // This leads to an unfortunate FPU sequence in every C++ compiler.
        // See: https://godbolt.org/z/vZym77

        // Get current MXCSR rounding
        uint sseRounding;
        ushort savedFPUCW;
        ushort newFPUCW;
        long result;
        asm pure nothrow @nogc @trusted
        {
            stmxcsr sseRounding;
            fld value;
            fnstcw savedFPUCW;
            mov AX, savedFPUCW;
            and AX, 0xf3ff;          // clear FPU rounding bits
            movzx ECX, word ptr sseRounding;
            and ECX, 0x6000;         // only keep SSE rounding bits
            shr ECX, 3;
            or AX, CX;               // make a new control word for FPU with SSE bits
            mov newFPUCW, AX;
            fldcw newFPUCW;
            fistp qword ptr result;            // convert, respecting MXCSR (but not other control word things)
            fldcw savedFPUCW;
        }
        return result;
    }
    else static if (GDC_with_x86)
    {
        version(X86_64) // 64-bit can just use the right instruction
        {
            static assert(GDC_with_SSE);
            __m128 A;
            A.ptr[0] = value;
            return __builtin_ia32_cvtss2si64 (A);
        }
        else version(X86) // 32-bit
        {
            // This is untested!
            // Same sequence as the D_InlineAsm_X86 path above, in GCC syntax.
            uint sseRounding;
            ushort savedFPUCW;
            ushort newFPUCW;
            long result;
            asm pure nothrow @nogc @trusted
            {
                "stmxcsr %1;\n" ~
                "fld %2;\n" ~
                "fnstcw %3;\n" ~
                "movw %3, %%ax;\n" ~
                "andw $0xf3ff, %%ax;\n" ~
                "movzwl %1, %%ecx;\n" ~
                "andl $0x6000, %%ecx;\n" ~
                "shrl $3, %%ecx;\n" ~
                "orw %%cx, %%ax\n" ~
                "movw %%ax, %4;\n" ~
                "fldcw %4;\n" ~
                "fistpll %0;\n" ~
                "fldcw %3;\n" 
                  : "=m"(result)    // %0
                  : "m" (sseRounding),
                    "f" (value),
                    "m" (savedFPUCW),
                    "m" (newFPUCW) 
                  : "eax", "ecx", "st";
            }
            return result;
        }
        else
            static assert(false);
    }
    else
        static assert(false);
}
429 
430 
/// Converts a `double` to a 64-bit signed integer, rounding according to
/// the rounding mode held in MXCSR (x86) or in the floating-point control
/// register (ARM).
/// Params:
///     value = the number to convert.
/// Returns: `value` converted to `long`.
long convertDoubleToInt64UsingMXCSR(double value) @trusted
{
    static if (LDC_with_ARM32)
    {
        // We have to resort to libc since 32-bit ARM 
        // doesn't seem to have 64-bit registers.
        uint fpscr = arm_get_fpcr(); // Get current rounding mode.
        switch(fpscr & _MM_ROUND_MASK_ARM)
        {
            default:
            // llvm_rint and not llvm_round: llvm_round rounds halfway cases
            // away from zero, while round-to-nearest rounds them to even.
            // In this branch the current rounding mode is round-to-nearest,
            // which llvm_rint honours.
            case _MM_ROUND_NEAREST_ARM:     return cast(long)(llvm_rint(value));
            case _MM_ROUND_DOWN_ARM:        return cast(long)(llvm_floor(value));
            case _MM_ROUND_UP_ARM:          return cast(long)(llvm_ceil(value));
            case _MM_ROUND_TOWARD_ZERO_ARM: return cast(long)(value);
        }
    }
    else static if (LDC_with_ARM64)
    {
        // Get current rounding mode.
        uint fpscr = arm_get_fpcr();

        // Pick the conversion that matches the rounding mode.
        switch(fpscr & _MM_ROUND_MASK_ARM)
        {
            default:
            case _MM_ROUND_NEAREST_ARM:     return vcvtns_s64_f64(value);
            case _MM_ROUND_DOWN_ARM:        return vcvtms_s64_f64(value);
            case _MM_ROUND_UP_ARM:          return vcvtps_s64_f64(value);
            case _MM_ROUND_TOWARD_ZERO_ARM: return vcvts_s64_f64(value);
        }
    }
    // 64-bit can use an SSE instruction
    else version(D_InlineAsm_X86_64)
    {
        long result;
        version(LDC) // work-around for "Data definition directives inside inline asm are not supported yet."
        {
            asm pure nothrow @nogc @trusted
            {
                movsd XMM0, value;
                cvtsd2si RAX, XMM0;
                mov result, RAX;
            }
        }
        else
        {
            asm pure nothrow @nogc @trusted
            {
                movsd XMM0, value;
                db 0xf2; db 0x48; db 0x0f; db 0x2d; db 0xc0; // cvtsd2si RAX, XMM0 (DMD refuses to emit)
                mov result, RAX;
            }
        }
        return result;
    }
    else version(D_InlineAsm_X86)
    {
        // In the case of 32-bit x86 there is no SSE2 way to convert FP to 64-bit int
        // This leads to an unfortunate FPU sequence in every C++ compiler.
        // See: https://godbolt.org/z/vZym77

        // Get current MXCSR rounding
        uint sseRounding;
        ushort savedFPUCW;
        ushort newFPUCW;
        long result;
        asm pure nothrow @nogc @trusted
        {
            stmxcsr sseRounding;
            fld value;
            fnstcw savedFPUCW;
            mov AX, savedFPUCW;
            and AX, 0xf3ff;          // clear FPU rounding bits
            movzx ECX, word ptr sseRounding;
            and ECX, 0x6000;         // only keep SSE rounding bits
            shr ECX, 3;
            or AX, CX;               // make a new control word for FPU with SSE bits
            mov newFPUCW, AX;
            fldcw newFPUCW;
            fistp result;
            fldcw savedFPUCW;
        }
        return result;
    }
    else static if (GDC_with_x86)
    {
        version(X86_64)
        {
            static assert(GDC_with_SSE2);
            __m128d A;
            A.ptr[0] = value;
            return __builtin_ia32_cvtsd2si64 (A);
        }
        else
        {
            // This is untested!
            // Same sequence as the D_InlineAsm_X86 path above, in GCC syntax.
            uint sseRounding;
            ushort savedFPUCW;
            ushort newFPUCW;
            long result;
            asm pure nothrow @nogc @trusted
            {
                "stmxcsr %1;\n" ~
                "fld %2;\n" ~
                "fnstcw %3;\n" ~
                "movw %3, %%ax;\n" ~
                "andw $0xf3ff, %%ax;\n" ~
                "movzwl %1, %%ecx;\n" ~
                "andl $0x6000, %%ecx;\n" ~
                "shrl $3, %%ecx;\n" ~
                "orw %%cx, %%ax\n" ~
                "movw %%ax, %4;\n" ~
                "fldcw %4;\n" ~
                "fistpll %0;\n" ~
                "fldcw %3;\n"         
                  : "=m"(result)    // %0
                  : "m" (sseRounding),
                    "t" (value),
                    "m" (savedFPUCW),
                    "m" (newFPUCW) 
                  : "eax", "ecx", "st";
            }
            return result;
        }
    }
    else
        static assert(false);
}
559 
560 //
561 //  </ROUNDING>
562 //
563 
564 
565 // using the Intel terminology here
566 
/// Clamps a signed 16-bit value into the range of a signed 8-bit
/// integer, [-128, 127], and returns it as a `byte`.
byte saturateSignedWordToSignedByte(short value) pure @safe
{
    enum short lowest  = byte.min;
    enum short highest = byte.max;
    const short clamped = (value < lowest)  ? lowest
                        : (value > highest) ? highest
                        : value;
    return cast(byte) clamped;
}
573 
/// Clamps a signed 16-bit value into the range of an unsigned 8-bit
/// integer, [0, 255], and returns it as a `ubyte`.
ubyte saturateSignedWordToUnsignedByte(short value) pure @safe
{
    enum short lowest  = ubyte.min;
    enum short highest = ubyte.max;
    const short clamped = (value < lowest)  ? lowest
                        : (value > highest) ? highest
                        : value;
    return cast(ubyte) clamped;
}
580 
/// Clamps a signed 32-bit value into the range of a signed 16-bit
/// integer, [-32768, 32767], and returns it as a `short`.
short saturateSignedIntToSignedShort(int value) pure @safe
{
    enum int lowest  = short.min;
    enum int highest = short.max;
    const int clamped = (value < lowest)  ? lowest
                      : (value > highest) ? highest
                      : value;
    return cast(short) clamped;
}
587 
/// Clamps a signed 32-bit value into the range of an unsigned 16-bit
/// integer, [0, 65535], and returns it as a `ushort`.
ushort saturateSignedIntToUnsignedShort(int value) pure @safe
{
    enum int lowest  = ushort.min;
    enum int highest = ushort.max;
    const int clamped = (value < lowest)  ? lowest
                      : (value > highest) ? highest
                      : value;
    return cast(ushort) clamped;
}
594 
unittest // test saturate operations
{
    // signed 16-bit -> signed 8-bit
    assert(saturateSignedWordToSignedByte(32000) == 127);
    assert(saturateSignedWordToSignedByte(-4000) == -128);

    // signed 16-bit -> unsigned 8-bit
    assert(saturateSignedWordToUnsignedByte(32000) == 255);
    assert(saturateSignedWordToUnsignedByte(-4000) == 0);

    // signed 32-bit -> signed 16-bit
    assert(saturateSignedIntToSignedShort(32768) == 32767);
    assert(saturateSignedIntToSignedShort(-32769) == -32768);

    // signed 32-bit -> unsigned 16-bit
    assert(saturateSignedIntToUnsignedShort(32768) == 32768);
    assert(saturateSignedIntToUnsignedShort(-32769) == 0);
}
606 
version(unittest)
{
    // Helpers that print the lanes of a vector on stdout.
    // This is just for debugging tests.
    import core.stdc.stdio: printf;

    // Note: you can override `pure` within a `debug` clause

    /// Prints `v` as one 64-bit signed integer.
    void _mm_print_pi64(__m64 v) @trusted
    {
        long[1] lanes = (cast(long1)v).array;
        printf("%lld\n", lanes[0]);
    }

    /// Prints `v` as two 32-bit signed integers.
    void _mm_print_pi32(__m64 v) @trusted
    {
        int2 vi = cast(int2)v;
        printf("%d %d\n", vi.array[0], vi.array[1]);
    }

    /// Prints `v` as four 16-bit signed integers.
    void _mm_print_pi16(__m64 v) @trusted
    {
        short4 vs = cast(short4)v;
        printf("%d %d %d %d\n",
               vs.array[0], vs.array[1], vs.array[2], vs.array[3]);
    }

    /// Prints `v` as eight 8-bit signed integers.
    void _mm_print_pi8(__m64 v) @trusted
    {
        byte8 vb = cast(byte8)v;
        printf("%d %d %d %d %d %d %d %d\n",
               vb.array[0], vb.array[1], vb.array[2], vb.array[3],
               vb.array[4], vb.array[5], vb.array[6], vb.array[7]);
    }

    /// Prints `v` as two 64-bit signed integers.
    void _mm_print_epi64(__m128i v) @trusted
    {
        long[2] lanes = (cast(long2)v).array;
        printf("%lld %lld\n", lanes[0], lanes[1]);
    }

    /// Prints `v` as four 32-bit signed integers.
    void _mm_print_epi32(__m128i v) @trusted
    {
        int[4] lanes = v.array;
        printf("%d %d %d %d\n",
               lanes[0], lanes[1], lanes[2], lanes[3]);
    }

    /// Prints `v` as eight 16-bit signed integers.
    void _mm_print_epi16(__m128i v) @trusted
    {
        short8 vs = cast(short8)v;
        printf("%d %d %d %d %d %d %d %d\n",
               vs.array[0], vs.array[1], vs.array[2], vs.array[3],
               vs.array[4], vs.array[5], vs.array[6], vs.array[7]);
    }

    /// Prints `v` as sixteen 8-bit signed integers.
    void _mm_print_epi8(__m128i v) @trusted
    {
        byte16 vb = cast(byte16)v;
        printf("%d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d\n",
               vb.array[0],  vb.array[1],  vb.array[2],  vb.array[3],
               vb.array[4],  vb.array[5],  vb.array[6],  vb.array[7],
               vb.array[8],  vb.array[9],  vb.array[10], vb.array[11],
               vb.array[12], vb.array[13], vb.array[14], vb.array[15]);
    }

    /// Prints `v` as four single-precision floats.
    void _mm_print_ps(__m128 v) @trusted
    {
        float4 vf = cast(float4)v;
        printf("%f %f %f %f\n",
               vf.array[0], vf.array[1], vf.array[2], vf.array[3]);
    }

    /// Prints `v` as two double-precision floats.
    void _mm_print_pd(__m128d v) @trusted
    {
        double2 vd = cast(double2)v;
        printf("%f %f\n", vd.array[0], vd.array[1]);
    }
}
678 
679 
680 //
681 //  <FLOATING-POINT COMPARISONS>
682 //
683 // Note: `ldc.simd` cannot express all nuances of FP comparisons, so we
684 //       need different IR generation.
685 
/// The floating-point comparison predicates.
/// "Ordered" predicates require that neither operand is NaN;
/// "unordered" predicates also hold when either operand is NaN.
/// The member names are spliced into `fcmp` IR instructions, and their
/// order must match `FPComparisonToString`.
enum FPComparison : int
{
    oeq = 0, /// ordered and equal
    ogt,     /// ordered and greater than
    oge,     /// ordered and greater than or equal
    olt,     /// ordered and less than
    ole,     /// ordered and less than or equal
    one,     /// ordered and not equal
    ord,     /// ordered (no nans)
    ueq,     /// unordered or equal
    ugt,     /// unordered or greater than ("nle")
    uge,     /// unordered or greater than or equal ("nlt")
    ult,     /// unordered or less than ("nge")
    ule,     /// unordered or less than or equal ("ngt")
    une,     /// unordered or not equal ("neq")
    uno,     /// unordered (either nans)
}
703 
// Maps each FPComparison member to its name, indexed by the member's value.
// The table is built from the enum itself: the names are the member
// identifiers, in declaration order ("oeq", "ogt", ... "uno").
private static immutable string[FPComparison.max+1] FPComparisonToString =
    [ __traits(allMembers, FPComparison) ];
721 
// Individual float comparison: returns `true` when `comparison` holds
// for the pair (a, b), `false` otherwise.
// Useful for DMD and testing
private bool compareFloat(T)(FPComparison comparison, T a, T b) pure @safe
{
    if (isnan(a) || isnan(b))
    {
        // Unordered operands: the outcome only depends on the predicate
        // family. Every ordered predicate fails, every unordered one holds.
        final switch(comparison) with(FPComparison)
        {
            case oeq, ogt, oge, olt, ole, one, ord: return false;
            case ueq, ugt, uge, ult, ule, une, uno: return true;
        }
    }

    // Ordered operands: both families reduce to the plain relation.
    final switch(comparison) with(FPComparison)
    {
        case oeq, ueq: return a == b;
        case ogt, ugt: return a > b;
        case oge, uge: return a >= b;
        case olt, ult: return a < b;
        case ole, ule: return a <= b;
        case one, une: return a != b;
        case ord:      return true;
        case uno:      return false;
    }
}
745 
version(LDC)
{
    /// Packed float comparison: each of the 4 lanes of the result is -1
    /// (all bits set) when `comparison` holds for that lane, else 0.
    package int4 cmpps(FPComparison comparison)(float4 a, float4 b) pure @safe
    {
        enum predicate = FPComparisonToString[comparison];
        enum ir = `
            %cmp = fcmp `~ predicate ~` <4 x float> %0, %1
            %r = sext <4 x i1> %cmp to <4 x i32>
            ret <4 x i32> %r`;

        return LDCInlineIR!(ir, int4, float4, float4)(a, b);
    }

    /// Packed double comparison: each of the 2 lanes of the result is -1
    /// (all bits set) when `comparison` holds for that lane, else 0.
    package long2 cmppd(FPComparison comparison)(double2 a, double2 b) pure @safe
    {
        enum predicate = FPComparisonToString[comparison];
        enum ir = `
            %cmp = fcmp `~ predicate ~` <2 x double> %0, %1
            %r = sext <2 x i1> %cmp to <2 x i64>
            ret <2 x i64> %r`;

        return LDCInlineIR!(ir, long2, double2, double2)(a, b);
    }

    /// CMPSS-style comparison: the lowest lane holds the comparison mask
    /// of a[0] and b[0] (all bits set, or zero), the 3 upper lanes are
    /// copied from `a`.
    /// clang implement it through x86 intrinsics, it is possible with IR alone
    /// but leads to less optimal code.
    /// PERF: try to implement it with __builtin_ia32_cmpss and immediate 0 to 7. 
    /// Not that simple.
    package float4 cmpss(FPComparison comparison)(float4 a, float4 b) pure @safe
    {
        enum predicate = FPComparisonToString[comparison];
        enum ir = `
            %cmp = fcmp `~ predicate ~` float %0, %1
            %r = sext i1 %cmp to i32
            %r2 = bitcast i32 %r to float
            ret float %r2`;

        const float lowestLane = LDCInlineIR!(ir, float, float, float)(a[0], b[0]);
        a[0] = lowestLane; // `a` is a by-value copy, upper lanes stay as they are
        return a;
    }

    /// CMPSD-style comparison: the lowest lane holds the comparison mask
    /// of a[0] and b[0] (all bits set, or zero), the upper lane is
    /// copied from `a`.
    /// clang implement it through x86 intrinsics, it is possible with IR alone
    /// but leads to less optimal code.
    /// PERF: try to implement it with __builtin_ia32_cmpsd and immediate 0 to 7. 
    /// Not that simple.
    package double2 cmpsd(FPComparison comparison)(double2 a, double2 b) pure @safe
    {
        enum predicate = FPComparisonToString[comparison];
        enum ir = `
            %cmp = fcmp `~ predicate ~` double %0, %1
            %r = sext i1 %cmp to i64
            %r2 = bitcast i64 %r to double
            ret double %r2`;

        const double lowestLane = LDCInlineIR!(ir, double, double, double)(a[0], b[0]);
        a[0] = lowestLane; // `a` is a by-value copy, upper lane stays as it is
        return a;
    }
}
else
{
    /// Packed float comparison: each of the 4 lanes of the result is -1
    /// (all bits set) when `comparison` holds for that lane, else 0.
    package int4 cmpps(FPComparison comparison)(float4 a, float4 b) pure @trusted
    {
        int4 mask;
        for (int lane = 0; lane < 4; ++lane)
        {
            const bool holds = compareFloat!float(comparison, a.array[lane], b.array[lane]);
            mask.ptr[lane] = holds ? -1 : 0;
        }
        return mask;
    }

    /// Packed double comparison: each of the 2 lanes of the result is -1
    /// (all bits set) when `comparison` holds for that lane, else 0.
    package long2 cmppd(FPComparison comparison)(double2 a, double2 b) pure @trusted
    {
        long2 mask;
        for (int lane = 0; lane < 2; ++lane)
        {
            const bool holds = compareFloat!double(comparison, a.array[lane], b.array[lane]);
            mask.ptr[lane] = holds ? -1L : 0L;
        }
        return mask;
    }

    /// CMPSS-style comparison: the lowest lane holds the comparison mask
    /// of a[0] and b[0] (all bits set, or zero), the 3 upper lanes are
    /// copied from `a`.
    package float4 cmpss(FPComparison comparison)(float4 a, float4 b) pure @trusted
    {
        const bool holds = compareFloat!float(comparison, a.array[0], b.array[0]);
        int4 bits = cast(int4)a;
        bits.ptr[0] = holds ? -1 : 0;
        return cast(float4)bits;
    }

    /// CMPSD-style comparison: the lowest lane holds the comparison mask
    /// of a[0] and b[0] (all bits set, or zero), the upper lane is
    /// copied from `a`.
    package double2 cmpsd(FPComparison comparison)(double2 a, double2 b) pure @trusted
    {
        const bool holds = compareFloat!double(comparison, a.array[0], b.array[0]);
        long2 bits = cast(long2)a;
        bits.ptr[0] = holds ? -1L : 0L;
        return cast(double2)bits;
    }
}
unittest // cmpps
{
    // Check that every comparison type is working.
    // Lanes exercise: less-than, equal, greater-than, and a NaN operand.
    float4 A = [1, 3, 5, float.nan];
    float4 B = [2, 3, 4, 5];

    // Expected masks, indexed by the FPComparison value.
    static immutable int[4][FPComparison.max+1] expected =
    [
        [ 0,-1, 0, 0], // oeq
        [ 0, 0,-1, 0], // ogt
        [ 0,-1,-1, 0], // oge
        [-1, 0, 0, 0], // olt
        [-1,-1, 0, 0], // ole
        [-1, 0,-1, 0], // one
        [-1,-1,-1, 0], // ord
        [ 0,-1, 0,-1], // ueq
        [ 0, 0,-1,-1], // ugt
        [ 0,-1,-1,-1], // uge
        [-1, 0, 0,-1], // ult
        [-1,-1, 0,-1], // ule
        [-1, 0,-1,-1], // une
        [ 0, 0, 0,-1], // uno
    ];

    void check(FPComparison cmp)(float4 lhs, float4 rhs)
    {
        int4 mask = cmpps!cmp(lhs, rhs);
        assert(mask.array == expected[cmp]);
    }

    check!(FPComparison.oeq)(A, B);
    check!(FPComparison.ogt)(A, B);
    check!(FPComparison.oge)(A, B);
    check!(FPComparison.olt)(A, B);
    check!(FPComparison.ole)(A, B);
    check!(FPComparison.one)(A, B);
    check!(FPComparison.ord)(A, B);
    check!(FPComparison.ueq)(A, B);
    check!(FPComparison.ugt)(A, B);
    check!(FPComparison.uge)(A, B);
    check!(FPComparison.ult)(A, B);
    check!(FPComparison.ule)(A, B);
    check!(FPComparison.une)(A, B);
    check!(FPComparison.uno)(A, B);
}
unittest // cmppd
{
    // 1 < 2 holds in lane 0, 3 < 3 doesn't in lane 1.
    double2 lhs = [1, 3];
    double2 rhs = [2, 3];
    static immutable long[2] expected = [-1L, 0L];

    long2 mask = cmppd!(FPComparison.ult)(lhs, rhs);
    assert(mask.array == expected);
}
unittest // cmpss
{
    // Checks one predicate on one pair of operands:
    //  - lane 0 of the result, viewed as an int, is -1 when the reference
    //    `compareFloat` says the predicate holds for the lane-0 inputs, 0 otherwise;
    //  - lanes 1 to 3 of the result are equal to those of the first operand.
    void verify(FPComparison pred)(float4 lhs, float4 rhs)
    {
        float4 got = cmpss!pred(lhs, rhs);
        int4 gotBits = cast(int4)got;
        int wanted = compareFloat!float(pred, lhs.array[0], rhs.array[0]) ? -1 : 0;
        assert(gotBits.array[0] == wanted);
        foreach (lane; 1 .. 4)
            assert(got.array[lane] == lhs.array[lane]);
    }

    float4 A = [1, 3, 5, 6];
    float4 B = [2, 3, 4, 5];         // ordered lane 0
    float4 C = [float.nan, 3, 4, 5]; // NaN in lane 0

    // Runs a predicate first against the ordered operand, then against the NaN one.
    void verifyBoth(FPComparison pred)()
    {
        verify!pred(A, B);
        verify!pred(A, C);
    }

    // Check that every comparison type is working.
    verifyBoth!(FPComparison.oeq)();
    verifyBoth!(FPComparison.ogt)();
    verifyBoth!(FPComparison.oge)();
    verifyBoth!(FPComparison.olt)();
    verifyBoth!(FPComparison.ole)();
    verifyBoth!(FPComparison.one)();
    verifyBoth!(FPComparison.ord)();
    verifyBoth!(FPComparison.ueq)();
    verifyBoth!(FPComparison.ugt)();
    verifyBoth!(FPComparison.uge)();
    verifyBoth!(FPComparison.ult)();
    verifyBoth!(FPComparison.ule)();
    verifyBoth!(FPComparison.une)();
    verifyBoth!(FPComparison.uno)();
}
unittest // cmpsd
{
    // Checks one predicate on one pair of operands:
    //  - lane 0 of the result, viewed as a long, is -1 when the reference
    //    `compareFloat` says the predicate holds for the lane-0 inputs, 0 otherwise;
    //  - lane 1 of the result is equal to lane 1 of the first operand.
    void verify(FPComparison pred)(double2 lhs, double2 rhs)
    {
        double2 got = cmpsd!pred(lhs, rhs);
        long2 gotBits = cast(long2)got;
        long wanted = compareFloat!double(pred, lhs.array[0], rhs.array[0]) ? -1 : 0;
        assert(gotBits.array[0] == wanted);
        assert(got.array[1] == lhs.array[1]);
    }

    double2 A = [1, 3];
    double2 B = [2, 4];          // ordered lane 0
    double2 C = [double.nan, 5]; // NaN in lane 0

    // Runs a predicate first against the ordered operand, then against the NaN one.
    void verifyBoth(FPComparison pred)()
    {
        verify!pred(A, B);
        verify!pred(A, C);
    }

    // Check that every comparison type is working.
    verifyBoth!(FPComparison.oeq)();
    verifyBoth!(FPComparison.ogt)();
    verifyBoth!(FPComparison.oge)();
    verifyBoth!(FPComparison.olt)();
    verifyBoth!(FPComparison.ole)();
    verifyBoth!(FPComparison.one)();
    verifyBoth!(FPComparison.ord)();
    verifyBoth!(FPComparison.ueq)();
    verifyBoth!(FPComparison.ugt)();
    verifyBoth!(FPComparison.uge)();
    verifyBoth!(FPComparison.ult)();
    verifyBoth!(FPComparison.ule)();
    verifyBoth!(FPComparison.une)();
    verifyBoth!(FPComparison.uno)();
}
1005 
1006 //
1007 //  </FLOATING-POINT COMPARISONS>
1008 //
1009 
1010 
/// Returns the low 64 bits of `a` (lane 0 when viewed as two longs) as an `__m64`.
__m64 to_m64(__m128i a) pure @trusted
{
    long2 asLongs = cast(long2)a;
    long1 lowHalf = asLongs.array[0];
    return lowHalf;
}
1017 
/// Widens `a` to 128 bits: the 64-bit payload goes in the low lane,
/// the high lane is zero.
__m128i to_m128i(__m64 a) pure @trusted
{
    // A DigitalMars-specific variant (initialise a long2 from a.array[0], then
    // zero lane 1) used to be tried here as a workaround, but it was not
    // sufficient to avoid https://issues.dlang.org/show_bug.cgi?id=21474
    // so a single code path is used for every compiler.
    long2 widened = [0, 0];
    widened.ptr[0] = a.array[0];
    return cast(__m128i)widened;
}
1035 
1036 // SOME NEON INTRINSICS
1037 // Emulating some x86 intrinsics needs access to a range of ARM intrinsics.
// Not in the public API, but the simde project exposes them all for the user to use.
1039 // MAYDO: create a new neon.d module, for internal use only.
1040 // MAYDO: port them to ARM32 so that ARM32 can be as fast as ARM64.
static if (LDC_with_ARM64)
{
    // Internal bindings named after the ARM NEON C intrinsics.
    //
    // Declarations preceded by pragma(LDC_intrinsic, "...") have no body: LDC binds
    // them to the LLVM intrinsic whose name is given in the pragma. The remaining
    // functions are implemented in plain D, lane by lane.
    //
    // NOTE(review): the short glosses on the bound instructions below follow the
    // ARM mnemonics (see the ARM ISA reference); nothing in this file checks them.
    //
    // VERY USEFUL LINK
    // https://github.com/ldc-developers/llvm-project/blob/ldc-release/11.x/llvm/include/llvm/IR/IntrinsicsAArch64.td

    // UABD on 16 bytes.
    pragma(LDC_intrinsic, "llvm.aarch64.neon.uabd.v16i8")
    byte16 vabdq_u8(byte16 a, byte16 b) pure @safe;

    // ADDP on 8 bytes.
    pragma(LDC_intrinsic, "llvm.aarch64.neon.addp.v8i8")
    byte8 vpadd_u8(byte8 a, byte8 b) pure @safe;

    // UADDLP: 16 bytes in, 8 shorts out.
    pragma(LDC_intrinsic, "llvm.aarch64.neon.uaddlp.v8i16.v16i8")
    short8 vpaddlq_u8 (byte16 a) pure @safe;

    /// Lane-wise bitwise AND of two 8-byte vectors.
    byte8 vand_u8(byte8 a, byte8 b) pure @safe
    {
        byte8 masked = a & b;
        return masked;
    }

    /// Concatenates two 4-lane short vectors: `lo` fills lanes 0..3, `hi` lanes 4..7.
    short8 vcombine_s16(short4 lo, short4 hi) pure @trusted
    {
        short8 joined;
        foreach (lane; 0 .. 4)
        {
            joined.ptr[lane]     = lo.array[lane];
            joined.ptr[lane + 4] = hi.array[lane];
        }
        return joined;
    }

    /// Concatenates two 2-lane int vectors: `lo` fills lanes 0..1, `hi` lanes 2..3.
    int4 vcombine_s32(int2 lo, int2 hi) pure @trusted
    {
        int4 joined;
        foreach (lane; 0 .. 2)
        {
            joined.ptr[lane]     = lo.array[lane];
            joined.ptr[lane + 2] = hi.array[lane];
        }
        return joined;
    }

    /// Concatenates two 8-lane byte vectors: `lo` fills lanes 0..7, `hi` lanes 8..15.
    byte16 vcombine_s8(byte8 lo, byte8 hi) pure @trusted
    {
        byte16 joined;
        foreach (lane; 0 .. 8)
        {
            joined.ptr[lane]     = lo.array[lane];
            joined.ptr[lane + 8] = hi.array[lane];
        }
        return joined;
    }


    // Float to signed integer conversions. The letter after "fcvt" in the LLVM name
    // selects the rounding (per the ARM mnemonics: m = toward -infinity,
    // n = to nearest, p = toward +infinity, z = toward zero).

    // float4 => int4

    pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtms.v4i32.v4f32")
    int4 vcvtmq_s32_f32(float4 a) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtns.v4i32.v4f32")
    int4 vcvtnq_s32_f32(float4 a) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtps.v4i32.v4f32")
    int4 vcvtpq_s32_f32(float4 a) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtzs.v4i32.v4f32")
    int4 vcvtzq_s32_f32(float4 a) pure @safe;


    // double2 => long2

    pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtms.v2i64.v2f64")
    long2 vcvtmq_s64_f64(double2 a) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtns.v2i64.v2f64")
    long2 vcvtnq_s64_f64(double2 a) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtps.v2i64.v2f64")
    long2 vcvtpq_s64_f64(double2 a) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtzs.v2i64.v2f64")
    long2 vcvtzq_s64_f64(double2 a) pure @safe;


    // float => int

    pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtms.i32.f32")
    int vcvtms_s32_f32(float a) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtns.i32.f32")
    int vcvtns_s32_f32(float a) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtps.i32.f32")
    int vcvtps_s32_f32(float a) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtzs.i32.f32")
    int vcvts_s32_f32(float a) pure @safe;


    // double => int

    pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtms.i32.f64")
    int vcvtms_s32_f64(double a) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtns.i32.f64")
    int vcvtns_s32_f64(double a) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtps.i32.f64")
    int vcvtps_s32_f64(double a) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtzs.i32.f64")
    int vcvts_s32_f64(double a) pure @safe;


    // float => long

    pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtms.i64.f32")
    long vcvtms_s64_f32(float a) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtns.i64.f32")
    long vcvtns_s64_f32(float a) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtps.i64.f32")
    long vcvtps_s64_f32(float a) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtzs.i64.f32")
    long vcvts_s64_f32(float a) pure @safe;


    // double => long

    pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtms.i64.f64")
    long vcvtms_s64_f64(double a) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtns.i64.f64")
    long vcvtns_s64_f64(double a) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtps.i64.f64")
    long vcvtps_s64_f64(double a) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtzs.i64.f64")
    long vcvts_s64_f64(double a) pure @safe;


    /// Returns lanes 4..7 of `a`.
    short4 vget_high_s16(short8 a) pure @trusted
    {
        short4 upper;
        foreach (lane; 0 .. 4)
            upper.ptr[lane] = a.array[lane + 4];
        return upper;
    }

    /// Returns lanes 2..3 of `a`.
    int2 vget_high_s32(int4 a) pure @trusted
    {
        int2 upper;
        foreach (lane; 0 .. 2)
            upper.ptr[lane] = a.array[lane + 2];
        return upper;
    }

    /// Returns lanes 8..15 of `a`.
    byte8 vget_high_u8(byte16 a) pure @trusted
    {
        byte8 upper;
        foreach (lane; 0 .. 8)
            upper.ptr[lane] = a.array[lane + 8];
        return upper;
    }

    /// Returns lanes 0..3 of `a`.
    short4 vget_low_s16(short8 a) pure @trusted
    {
        short4 lower;
        foreach (lane; 0 .. 4)
            lower.ptr[lane] = a.array[lane];
        return lower;
    }

    /// Returns lanes 0..1 of `a`.
    int2 vget_low_s32(int4 a) pure @trusted
    {
        int2 lower;
        foreach (lane; 0 .. 2)
            lower.ptr[lane] = a.array[lane];
        return lower;
    }

    /// Returns lanes 0..7 of `a`.
    byte8 vget_low_u8(byte16 a) pure @trusted
    {
        byte8 lower;
        foreach (lane; 0 .. 8)
            lower.ptr[lane] = a.array[lane];
        return lower;
    }

    // SMAX on 8 shorts.
    pragma(LDC_intrinsic, "llvm.aarch64.neon.smax.v8i16")
    short8 vmaxq_s16(short8 a, short8 b) pure @safe;

    // SMIN on 8 shorts.
    pragma(LDC_intrinsic, "llvm.aarch64.neon.smin.v8i16")
    short8 vminq_s16(short8 a, short8 b) pure @safe;

    /// Lane-wise product of two 4-lane short vectors, each product stored as an int.
    int4 vmull_s16(short4 a, short4 b) pure @trusted
    {
        int4 products;
        foreach (lane; 0 .. 4)
        {
            // The short operands are promoted to int before the multiplication.
            products.ptr[lane] = a.array[lane] * b.array[lane];
        }
        return products;
    }

    // The LLVM name bound to vpaddq_f32 depends on the front-end version:
    // LDC 1.18 started using LLVM 9, which changed the name of the builtin.
    static if (__VERSION__ >= 2088)
    {
        pragma(LDC_intrinsic, "llvm.aarch64.neon.faddp.v4f32")
        float4 vpaddq_f32(float4 a, float4 b) pure @safe;
    }
    else
    {
        pragma(LDC_intrinsic, "llvm.aarch64.neon.addp.v4f32")
        float4 vpaddq_f32(float4 a, float4 b) pure @safe;
    }

    // ADDP on 2 ints.
    pragma(LDC_intrinsic, "llvm.aarch64.neon.addp.v2i32")
    int2 vpadd_s32(int2 a, int2 b) pure @safe;

    // ADDP on 16 bytes.
    pragma(LDC_intrinsic, "llvm.aarch64.neon.addp.v16i8")
    byte16 vpaddq_s8(byte16 a, byte16 b) pure @safe;

    // SQXTN: 8 shorts in, 8 bytes out.
    pragma(LDC_intrinsic, "llvm.aarch64.neon.sqxtn.v8i8")
    byte8 vqmovn_s16(short8 a) pure @safe;

    // SQXTN: 4 ints in, 4 shorts out.
    pragma(LDC_intrinsic, "llvm.aarch64.neon.sqxtn.v4i16")
    short4 vqmovn_s32(int4 a) pure @safe;

    // SQXTUN: 8 shorts in, 8 bytes out.
    pragma(LDC_intrinsic, "llvm.aarch64.neon.sqxtun.v8i8")
    byte8 vqmovun_s16(short8 a) pure @safe;

    // URHADD on 16 bytes.
    pragma(LDC_intrinsic, "llvm.aarch64.neon.urhadd.v16i8")
    byte16 vrhadd_u8(byte16 a, byte16 b) pure @safe;

    // URHADD on 8 shorts.
    pragma(LDC_intrinsic, "llvm.aarch64.neon.urhadd.v8i16")
    short8 vrhadd_u16(short8 a, short8 b) pure @safe;

    /// Lane-wise `>>>` of `a` by the per-lane counts in `b`.
    byte8 vshr_u8(byte8 a, byte8 b) pure @safe
    {
        byte8 shifted = a >>> b;
        return shifted;
    }
}
1301 
version(unittest)
{
    /// Absolute value of `x`; only compiled in unittest builds.
    double abs_double(double x) @trusted
    {
        version(LDC)
        {
            return llvm_fabs(x);
        }
        else
        {
            // Reinterpret the double as a 64-bit integer, clear the top (sign) bit,
            // then reinterpret the result back as a double.
            long bits = *cast(long*)(&x);
            bits &= 0x7fffffff_ffffffff;
            return *cast(double*)(&bits);
        }
    }
}
1316 
// needed because in old GDC from travis, core.stdc.math.isnan isn't pure
1318 
/// Returns `true` when `x` is a NaN, judged from its 32-bit pattern:
/// all exponent bits set and at least one mantissa bit set.
bool isnan(float x) pure @trusted
{
    enum uint exponentMask = 0x7F800000;
    enum uint mantissaMask = 0x007FFFFF;

    immutable uint bits = *cast(uint*)(&x);
    immutable bool exponentAllOnes = (bits & exponentMask) == exponentMask;
    immutable bool mantissaNonZero = (bits & mantissaMask) != 0;
    return exponentAllOnes && mantissaNonZero;
}
unittest
{
    float notANumber = float.nan;
    float zero = 0;
    float inf = float.infinity;

    assert(isnan(notANumber));
    assert(!isnan(zero));
    assert(!isnan(inf)); // all exponent bits set but mantissa is zero
}
1336 
/// Returns `true` when `x` is a NaN, judged from its 64-bit pattern:
/// all exponent bits set and at least one mantissa bit set.
bool isnan(double x) pure @trusted
{
    enum ulong exponentMask = 0x7FF00000_00000000;
    enum ulong mantissaMask = 0x000FFFFF_FFFFFFFF;

    immutable ulong bits = *cast(ulong*)(&x);
    immutable bool exponentAllOnes = (bits & exponentMask) == exponentMask;
    immutable bool mantissaNonZero = (bits & mantissaMask) != 0;
    return exponentAllOnes && mantissaNonZero;
}
unittest
{
    double notANumber = double.nan;
    double zero = 0;
    double inf = double.infinity;

    assert(isnan(notANumber));
    assert(!isnan(zero));
    assert(!isnan(inf)); // all exponent bits set but mantissa is zero
}