inteli.internals source code

1 /**
2 * Internal stuff only, do not import.
3 *
4 * Copyright: Copyright Guillaume Piolat 2016-2020, Stefanos Baziotis 2019.
5 * License:   $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
6 */
7 module inteli.internals;
8 
9 import inteli.types;
10 
11 // The only math functions needed for intel-intrinsics
12 public import core.math: sqrt; // since it's an intrinsics
13 public import std.math: abs; // `fabs` is broken with GCC 4.9.2 on Linux 64-bit
14 
15 package:
16 nothrow:
17 @nogc:
18 
19 
// Compile-time feature flags for the GDC (GNU) compiler.
// Every path through this block defines the same five GDC_with_* enums, so the
// rest of the library can test them with `static if` regardless of the
// compiler or the target architecture.
version(GNU)
{
    version (X86)
    {
        // For 32-bit x86, disable vector extensions with GDC. 
        // It just doesn't work well.
        enum GDC_with_x86 = true;
        enum GDC_with_MMX = false;
        enum GDC_with_SSE = false;
        enum GDC_with_SSE2 = false;
        enum GDC_with_SSE3 = false;
    }
    else version (X86_64)
    {
        // GDC support uses extended inline assembly:
        //   https://gcc.gnu.org/onlinedocs/gcc/Extended-Asm.html        (general information and hints)
        //   https://gcc.gnu.org/onlinedocs/gcc/Simple-Constraints.html  (binding variables to registers)
        //   https://gcc.gnu.org/onlinedocs/gcc/Machine-Constraints.html (x86 specific register short names)

        public import core.simd;

        // NOTE: These intrinsics are not available in every i386 and x86_64 CPU.
        // For more info: https://gcc.gnu.org/onlinedocs/gcc-4.9.2/gcc/X86-Built-in-Functions.html 
        public import gcc.builtins;
                
        // These are assumptions, not detections: the flags are hard-coded here.
        enum GDC_with_x86 = true;
        enum GDC_with_MMX = true; // We don't have a way to detect that at CT, but we assume it's there
        enum GDC_with_SSE = true; // We don't have a way to detect that at CT, but we assume it's there
        enum GDC_with_SSE2 = true; // We don't have a way to detect that at CT, but we assume it's there
        enum GDC_with_SSE3 = false; // TODO: we don't have a way to detect that at CT
    }
    else
    {
        // GDC targeting something other than x86 / x86_64: everything is off.
        enum GDC_with_x86 = false;
        enum GDC_with_MMX = false;
        enum GDC_with_SSE = false;
        enum GDC_with_SSE2 = false;
        enum GDC_with_SSE3 = false;
    }
}
else
{
    // Not GDC: all GDC-specific flags are false.
    enum GDC_with_x86 = false;
    enum GDC_with_MMX = false;
    enum GDC_with_SSE = false;
    enum GDC_with_SSE2 = false;
    enum GDC_with_SSE3 = false;
}
68 
// Compile-time configuration for the LDC compiler: re-exports the LDC SIMD /
// intrinsics / inline-asm modules, selects the inline IR entry point, and
// defines the LDC_with_* flags. The `else` branch defines the same flags as
// `false`, so they always exist.
version(LDC)
{
    public import core.simd;
    public import ldc.simd;
    public import ldc.intrinsics;
    public import ldc.llvmasm: __asm;

    // Since LDC 1.13, using the new ldc.llvmasm.__ir variants instead of inlineIR
    static if (__VERSION__ >= 2083)
    {
         import ldc.llvmasm;
         alias LDCInlineIR = __ir_pure;

         // A version of inline IR with prefix/suffix didn't exist before LDC 1.13
         alias LDCInlineIREx = __irEx_pure; 
    }
    else
    {
        // Older front-end: only the plain variant exists, LDCInlineIREx is
        // not defined on this path.
        alias LDCInlineIR = inlineIR;
    }

    version(ARM)
    {
        // 32-bit ARM: GCC-style ARM builtins are made available.
        public import ldc.gccbuiltins_arm;
        enum LDC_with_ARM32 = true;
        enum LDC_with_ARM64 = false;
        enum LDC_with_SSE1 = false;
        enum LDC_with_SSE2 = false;
        enum LDC_with_SSE3 = false;
    }
    else version(AArch64)
    {
        // 64-bit ARM: no builtins module is imported here.
        enum LDC_with_ARM32 = false;
        enum LDC_with_ARM64 = true;
        enum LDC_with_SSE1 = false;
        enum LDC_with_SSE2 = false;
        enum LDC_with_SSE3 = false;
    }
    else
    {
        // Any other target is treated as x86: the SSE flags come from the
        // features enabled for the compilation target.
        public import ldc.gccbuiltins_x86;
        enum LDC_with_ARM32 = false;
        enum LDC_with_ARM64 = false;
        enum LDC_with_SSE1 = __traits(targetHasFeature, "sse");
        enum LDC_with_SSE2 = __traits(targetHasFeature, "sse2");
        enum LDC_with_SSE3 = __traits(targetHasFeature, "sse3");
    }
}
else
{
    // Not LDC: all LDC-specific flags are false.
    enum LDC_with_ARM32 = false;
    enum LDC_with_ARM64 = false;
    enum LDC_with_SSE1 = false;
    enum LDC_with_SSE2 = false;
    enum LDC_with_SSE3 = false;
}
125 
/// True when compiling with LDC for either ARM flavour (32-bit ARM or AArch64).
/// The logical `||` keeps this constant a `bool` like the other feature flags;
/// a bitwise `|` on two `bool`s yields an integer-typed constant instead.
enum bool LDC_with_ARM = LDC_with_ARM32 || LDC_with_ARM64;
127 
// Compile-time feature flags for the DMD (DigitalMars) compiler. The `else`
// branch defines the same three flags as `false` for other compilers.
version(DigitalMars)
{
    // DMD_with_asm: DMD-style inline assembly is available, 32-bit or 64-bit.
    version(D_InlineAsm_X86)
        enum DMD_with_asm = true;
    else version(D_InlineAsm_X86_64)
        enum DMD_with_asm = true;
    else
        enum DMD_with_asm = false;

    // DMD_with_32bit_asm: true only for the 32-bit x86 inline assembler.
    version(D_InlineAsm_X86)
        enum DMD_with_32bit_asm = DMD_with_asm; // sometimes you want a 32-bit DMD only solution
    else
        enum DMD_with_32bit_asm = false;

    // DMD_with_DSIMD: D_SIMD is defined and SSESizedVectorsAreEmulated is false.
    // NOTE(review): SSESizedVectorsAreEmulated is not declared in this module,
    // presumably it comes from inteli.types - confirm.
    version (D_SIMD)
        enum DMD_with_DSIMD = !SSESizedVectorsAreEmulated;
    else
        enum DMD_with_DSIMD = false;
}
else
{
    enum DMD_with_asm = false;
    enum DMD_with_32bit_asm = false;
    enum DMD_with_DSIMD = false;
}
153 
// 32-bit ARM accessors for the floating-point status/control register.
// They are named "fpcr" like their AArch64 counterparts, but call the FPSCR
// builtins.
static if (LDC_with_ARM32)
{
    /// Returns the value given by `__builtin_arm_get_fpscr`.
    package uint arm_get_fpcr() nothrow @nogc @trusted
    {
        return __builtin_arm_get_fpscr();
    }

    /// Passes `cw` to `__builtin_arm_set_fpscr`.
    package void arm_set_fpcr(uint cw) nothrow @nogc @trusted
    {
        __builtin_arm_set_fpscr(cw);
    }
}
166 
// AArch64 accessors for the FPCR register, implemented with inline assembly.
static if (LDC_with_ARM64)
{
    // Declaration of the LLVM intrinsic. It is not called in this block:
    // arm_get_fpcr below reads the register with `mrs` instead.
    pragma(LDC_intrinsic, "llvm.aarch64.get.fpcr")
        long __builtin_aarch64_get_fpcr() pure nothrow @nogc @safe;

    /// Reads FPCR with an `mrs` instruction and returns it as a `uint`.
    package uint arm_get_fpcr() pure nothrow @nogc @trusted
    {
        // LLVM intrinsic "llvm.aarch64.get.fpcr" seems buggy and doesn't return FPCR
        return __asm!uint("mrs $0, fpcr", "=r");
    }

    /// Writes `cw` to FPCR with an `msr` instruction.
    package void arm_set_fpcr(uint cw) nothrow @nogc @trusted
    {
        // Note: there doesn't seem to be an intrinsic in LLVM to set FPCR.
        // x2 is used as a scratch register: it is spilled to `save_x2` first,
        // loaded with `cw`, moved to FPCR, then restored from `save_x2`.
        // NOTE(review): x2 and memory are not listed as clobbers in the
        // constraint string - confirm this is safe under optimization.
        long save_x2;
        __asm!void("str x2, $1 \n" ~
                   "ldr w2, $0 \n" ~
                   "msr fpcr, x2 \n" ~
                   "ldr x2, $1 "   , "m,m", cw, &save_x2);
    }
}
188 
189 
190 // For internal use only, since public API deals with a x86 semantic emulation
// Values of the rounding-mode field of the ARM control word as used by this
// module: the field is the two bits selected by _MM_ROUND_MASK_ARM
// (0x00C00000, i.e. bits 22 and 23).
enum uint _MM_ROUND_NEAREST_ARM     = 0x00000000;
enum uint _MM_ROUND_DOWN_ARM        = 0x00800000;
enum uint _MM_ROUND_UP_ARM          = 0x00400000;
enum uint _MM_ROUND_TOWARD_ZERO_ARM = 0x00C00000;
enum uint _MM_ROUND_MASK_ARM        = 0x00C00000;
// Single-bit mask (bit 24) for the flush-to-zero setting.
enum uint _MM_FLUSH_ZERO_MASK_ARM = 0x01000000;
197 
198 
199 //
200 //  <ROUNDING>
201 //
//  Why is this here? For DMD, we cannot use rint because _MM_SET_ROUNDING_MODE
//  doesn't change the FPU rounding mode, and isn't expected to do so.
//  So we devised these rounding functions to help keep rounding consistent between
//  LDC and DMD. It's important that DMD uses what is in MXCSR to round.
//
//  Note: There is no MXCSR on ARM, but there is FPSCR, which implements similar
//  functionality.
//  https://developer.arm.com/documentation/dui0068/b/vector-floating-point-programming/vfp-system-registers/fpscr--the-floating-point-status-and-control-register
//  We use FPSCR since it's thread-local, so we can emulate those x86 conversions, albeit slowly.
212 
/**
 * Converts a `float` to a 32-bit signed integer the way x86 `cvtss2si` does,
 * i.e. rounding with the rounding mode currently selected in MXCSR. On ARM the
 * rounding mode is taken from the ARM floating-point control register instead.
 *
 * Params:
 *     value = the single-precision value to convert.
 * Returns: `value` converted to `int` with the current rounding mode.
 */
int convertFloatToInt32UsingMXCSR(float value) @trusted
{
    int result;
    version(GNU)
    {
        // GDC extended asm: %0 is `result` (general register), %1 is `value` (SSE register).
        asm pure nothrow @nogc @trusted
        {
            "cvtss2si %1, %0\n": "=r"(result) : "x" (value);
        }
    }
    else static if (LDC_with_ARM32)
    {
        // vcvtr converts using the rounding mode held in FPSCR.
        // s2 is used as a scratch register, so it is declared as clobbered
        // (`~{s2}`): without it the optimizer may keep a live value in s2
        // across this statement and see it silently overwritten.
        result = __asm!int(`vldr s2, $1
                            vcvtr.s32.f32 s2, s2
                            vmov $0, s2`, "=r,m,~{s2}", value);
    }
    else static if (LDC_with_ARM64)
    {
        // Get current rounding mode.
        uint fpscr = arm_get_fpcr();

        // Dispatch to the conversion that implements the selected rounding mode.
        switch(fpscr & _MM_ROUND_MASK_ARM)
        {
            default:
            case _MM_ROUND_NEAREST_ARM:     result = vcvtns_s32_f32(value); break;
            case _MM_ROUND_DOWN_ARM:        result = vcvtms_s32_f32(value); break;
            case _MM_ROUND_UP_ARM:          result = vcvtps_s32_f32(value); break;
            case _MM_ROUND_TOWARD_ZERO_ARM: result = vcvts_s32_f32(value);  break;
        }
    }
    else
    {
        // DMD-style inline assembly (x86 / x86_64).
        asm pure nothrow @nogc @trusted
        {
            cvtss2si EAX, value;
            mov result, EAX;
        }
    }
    return result;
}
254 
/**
 * Converts a `double` to a 32-bit signed integer the way x86 `cvtsd2si` does,
 * i.e. rounding with the rounding mode currently selected in MXCSR. On ARM the
 * rounding mode is taken from the ARM floating-point control register instead.
 *
 * Params:
 *     value = the double-precision value to convert.
 * Returns: `value` converted to `int` with the current rounding mode.
 */
int convertDoubleToInt32UsingMXCSR(double value) @trusted
{
    int result;
    version(GNU)
    {
        // GDC extended asm: %0 is `result` (general register), %1 is `value` (SSE register).
        asm pure nothrow @nogc @trusted
        {
            "cvtsd2si %1, %0\n": "=r"(result) : "x" (value);
        }
    }
    else static if (LDC_with_ARM32)
    {
        // vcvtr converts using the rounding mode held in FPSCR.
        // d2 (input) and s2 (output) are used as scratch registers, so both
        // are declared as clobbered: without that the optimizer may keep live
        // values in them across this statement and see them overwritten.
        result = __asm!int(`vldr d2, $1
                            vcvtr.s32.f64 s2, d2
                            vmov $0, s2`, "=r,m,~{s2},~{d2}", value);
    }
    else static if (LDC_with_ARM64)
    {
        // Get current rounding mode.
        uint fpscr = arm_get_fpcr();

        // Dispatch to the conversion that implements the selected rounding mode.
        switch(fpscr & _MM_ROUND_MASK_ARM)
        {
            default:
            case _MM_ROUND_NEAREST_ARM:     result = vcvtns_s32_f64(value); break;
            case _MM_ROUND_DOWN_ARM:        result = vcvtms_s32_f64(value); break;
            case _MM_ROUND_UP_ARM:          result = vcvtps_s32_f64(value); break;
            case _MM_ROUND_TOWARD_ZERO_ARM: result = vcvts_s32_f64(value);  break;
        }
    }
    else
    {
        // DMD-style inline assembly (x86 / x86_64).
        asm pure nothrow @nogc @trusted
        {
            cvtsd2si EAX, value;
            mov result, EAX;
        }
    }
    return result;
}
296 
/**
 * Converts a `float` to a 64-bit signed integer the way 64-bit x86 `cvtss2si`
 * does, i.e. rounding with the rounding mode currently selected in MXCSR.
 * On ARM the rounding mode is read from the ARM floating-point control
 * register; on 32-bit x86 the x87 FPU is temporarily switched to the MXCSR
 * rounding mode.
 *
 * Params:
 *     value = the single-precision value to convert.
 * Returns: `value` converted to `long` with the current rounding mode.
 */
long convertFloatToInt64UsingMXCSR(float value) @trusted
{
    static if (LDC_with_ARM32)
    {
        // We have to resort to libc since 32-bit ARM 
        // doesn't seem to have 64-bit registers.
        
        uint fpscr = arm_get_fpcr(); // Get current rounding mode.

        // Note: converting to double precision else rounding could be different for large integers
        double asDouble = value; 

        switch(fpscr & _MM_ROUND_MASK_ARM)
        {
            default:
            // Round-to-nearest must break ties to even, like cvtss2si does.
            // `llvm_round` rounds halfway cases away from zero (2.5 -> 3), so
            // `llvm_rint` is used: the FP environment is in round-to-nearest
            // mode whenever this case is taken, which gives ties-to-even.
            case _MM_ROUND_NEAREST_ARM:     return cast(long)(llvm_rint(asDouble));
            case _MM_ROUND_DOWN_ARM:        return cast(long)(llvm_floor(asDouble));
            case _MM_ROUND_UP_ARM:          return cast(long)(llvm_ceil(asDouble));
            case _MM_ROUND_TOWARD_ZERO_ARM: return cast(long)(asDouble);
        }
    }
    else static if (LDC_with_ARM64)
    {
        // Get current rounding mode, then use the matching conversion.
        uint fpscr = arm_get_fpcr();

        switch(fpscr & _MM_ROUND_MASK_ARM)
        {
            default:
            case _MM_ROUND_NEAREST_ARM:     return vcvtns_s64_f32(value);
            case _MM_ROUND_DOWN_ARM:        return vcvtms_s64_f32(value);
            case _MM_ROUND_UP_ARM:          return vcvtps_s64_f32(value);
            case _MM_ROUND_TOWARD_ZERO_ARM: return vcvts_s64_f32(value);
        }
    }
    // 64-bit can use an SSE instruction
    else version(D_InlineAsm_X86_64)
    {
        long result;
        version(LDC) // work-around for " Data definition directives inside inline asm are not supported yet."
        {
            asm pure nothrow @nogc @trusted
            {
                movss XMM0, value;
                cvtss2si RAX, XMM0;
                mov result, RAX;
            }
        }
        else
        {
            asm pure nothrow @nogc @trusted
            {
                movss XMM0, value;
                db 0xf3; db 0x48; db 0x0f; db 0x2d; db 0xc0; // cvtss2si RAX, XMM0 (DMD refuses to emit)
                mov result, RAX;
            }
        }
        return result;
    }
    else version(D_InlineAsm_X86)
    {
        // In the case of 32-bit x86 there is no SSE2 way to convert FP to 64-bit int
        // This leads to an unfortunate FPU sequence in every C++ compiler.
        // See: https://godbolt.org/z/vZym77

        // Get current MXCSR rounding
        uint sseRounding;
        ushort savedFPUCW;
        ushort newFPUCW;
        long result;
        asm pure nothrow @nogc @trusted
        {
            stmxcsr sseRounding;
            fld value;
            fnstcw savedFPUCW;
            mov AX, savedFPUCW;
            and AX, 0xf3ff;          // clear FPU rounding bits
            movzx ECX, word ptr sseRounding;
            and ECX, 0x6000;         // only keep SSE rounding bits
            shr ECX, 3;
            or AX, CX;               // make a new control word for FPU with SSE bits
            mov newFPUCW, AX;
            fldcw newFPUCW;
            fistp qword ptr result;            // convert, respecting MXCSR (but not other control word things)
            fldcw savedFPUCW;
        }
        return result;
    }
    else static if (GDC_with_x86)
    {
        version(X86_64) // 64-bit can just use the right instruction
        {
            static assert(GDC_with_SSE);
            __m128 A;
            A.ptr[0] = value;
            return __builtin_ia32_cvtss2si64 (A);
        }
        else version(X86) // 32-bit
        {
            // This is untested!
            // Same x87 sequence as the DMD 32-bit path above, in GDC extended asm.
            // NOTE(review): `value` is bound with the "f" constraint here while
            // convertDoubleToInt64UsingMXCSR uses "t" - confirm which is intended.
            uint sseRounding;
            ushort savedFPUCW;
            ushort newFPUCW;
            long result;
            asm pure nothrow @nogc @trusted
            {
                "stmxcsr %1;\n" ~
                "fld %2;\n" ~
                "fnstcw %3;\n" ~
                "movw %3, %%ax;\n" ~
                "andw $0xf3ff, %%ax;\n" ~
                "movzwl %1, %%ecx;\n" ~
                "andl $0x6000, %%ecx;\n" ~
                "shrl $3, %%ecx;\n" ~
                "orw %%cx, %%ax\n" ~
                "movw %%ax, %4;\n" ~
                "fldcw %4;\n" ~
                "fistpll %0;\n" ~
                "fldcw %3;\n" 
                  : "=m"(result)    // %0
                  : "m" (sseRounding),
                    "f" (value),
                    "m" (savedFPUCW),
                    "m" (newFPUCW) 
                  : "eax", "ecx", "st";
            }
            return result;
        }
        else
            static assert(false);
    }
    else
        static assert(false);
}
430 
431 
/**
 * Converts a `double` to a 64-bit signed integer the way 64-bit x86 `cvtsd2si`
 * does, i.e. rounding with the rounding mode currently selected in MXCSR.
 * On ARM the rounding mode is read from the ARM floating-point control
 * register; on 32-bit x86 the x87 FPU is temporarily switched to the MXCSR
 * rounding mode.
 *
 * Params:
 *     value = the double-precision value to convert.
 * Returns: `value` converted to `long` with the current rounding mode.
 */
long convertDoubleToInt64UsingMXCSR(double value) @trusted
{
    static if (LDC_with_ARM32)
    {
        // We have to resort to libc since 32-bit ARM 
        // doesn't seem to have 64-bit registers.
        uint fpscr = arm_get_fpcr(); // Get current rounding mode.
        switch(fpscr & _MM_ROUND_MASK_ARM)
        {
            default:
            // Round-to-nearest must break ties to even, like cvtsd2si does.
            // `llvm_round` rounds halfway cases away from zero (2.5 -> 3), so
            // `llvm_rint` is used: the FP environment is in round-to-nearest
            // mode whenever this case is taken, which gives ties-to-even.
            case _MM_ROUND_NEAREST_ARM:     return cast(long)(llvm_rint(value));
            case _MM_ROUND_DOWN_ARM:        return cast(long)(llvm_floor(value));
            case _MM_ROUND_UP_ARM:          return cast(long)(llvm_ceil(value));
            case _MM_ROUND_TOWARD_ZERO_ARM: return cast(long)(value);
        }
    }
    else static if (LDC_with_ARM64)
    {
        // Get current rounding mode.
        uint fpscr = arm_get_fpcr();

        switch(fpscr & _MM_ROUND_MASK_ARM)
        {
            default:
            case _MM_ROUND_NEAREST_ARM:     return vcvtns_s64_f64(value);
            case _MM_ROUND_DOWN_ARM:        return vcvtms_s64_f64(value);
            case _MM_ROUND_UP_ARM:          return vcvtps_s64_f64(value);
            case _MM_ROUND_TOWARD_ZERO_ARM: return vcvts_s64_f64(value);
        }
    }
    // 64-bit can use an SSE instruction
    else version(D_InlineAsm_X86_64)
    {
        long result;
        version(LDC) // work-around for "Data definition directives inside inline asm are not supported yet."
        {
            asm pure nothrow @nogc @trusted
            {
                movsd XMM0, value;
                cvtsd2si RAX, XMM0;
                mov result, RAX;
            }
        }
        else
        {
            asm pure nothrow @nogc @trusted
            {
                movsd XMM0, value;
                db 0xf2; db 0x48; db 0x0f; db 0x2d; db 0xc0; // cvtsd2si RAX, XMM0 (DMD refuses to emit)
                mov result, RAX;
            }
        }
        return result;
    }
    else version(D_InlineAsm_X86)
    {
        // In the case of 32-bit x86 there is no SSE2 way to convert FP to 64-bit int
        // This leads to an unfortunate FPU sequence in every C++ compiler.
        // See: https://godbolt.org/z/vZym77

        // Get current MXCSR rounding
        uint sseRounding;
        ushort savedFPUCW;
        ushort newFPUCW;
        long result;
        asm pure nothrow @nogc @trusted
        {
            stmxcsr sseRounding;
            fld value;
            fnstcw savedFPUCW;
            mov AX, savedFPUCW;
            and AX, 0xf3ff;          // clear FPU rounding bits
            movzx ECX, word ptr sseRounding;
            and ECX, 0x6000;         // only keep SSE rounding bits
            shr ECX, 3;
            or AX, CX;               // make a new control word for FPU with SSE bits
            mov newFPUCW, AX;
            fldcw newFPUCW;
            fistp result;            // convert with the FPU set to the MXCSR rounding mode
            fldcw savedFPUCW;        // restore the caller's FPU control word
        }
        return result;
    }
    else static if (GDC_with_x86)
    {
        version(X86_64)
        {
            // 64-bit can just use the right instruction
            static assert(GDC_with_SSE2);
            __m128d A;
            A.ptr[0] = value;
            return __builtin_ia32_cvtsd2si64 (A);
        }
        else
        {
            // This is untested!
            // Same x87 sequence as the DMD 32-bit path above, in GDC extended asm.
            uint sseRounding;
            ushort savedFPUCW;
            ushort newFPUCW;
            long result;
            asm pure nothrow @nogc @trusted
            {
                "stmxcsr %1;\n" ~
                "fld %2;\n" ~
                "fnstcw %3;\n" ~
                "movw %3, %%ax;\n" ~
                "andw $0xf3ff, %%ax;\n" ~
                "movzwl %1, %%ecx;\n" ~
                "andl $0x6000, %%ecx;\n" ~
                "shrl $3, %%ecx;\n" ~
                "orw %%cx, %%ax\n" ~
                "movw %%ax, %4;\n" ~
                "fldcw %4;\n" ~
                "fistpll %0;\n" ~
                "fldcw %3;\n"         
                  : "=m"(result)    // %0
                  : "m" (sseRounding),
                    "t" (value),
                    "m" (savedFPUCW),
                    "m" (newFPUCW) 
                  : "eax", "ecx", "st";
            }
            return result;
        }
    }
    else
        static assert(false);
}
560 
561 //
562 //  </ROUNDING>
563 //
564 
565 
566 // using the Intel terminology here
567 
/// Clamps a signed 16-bit value into the signed 8-bit range [-128, 127].
/// Values already inside the range are returned unchanged.
byte saturateSignedWordToSignedByte(short value) pure @safe
{
    if (value > byte.max)
        return byte.max;
    if (value < byte.min)
        return byte.min;
    return cast(byte) value;
}
574 
/// Clamps a signed 16-bit value into the unsigned 8-bit range [0, 255].
/// Negative inputs give 0, inputs above 255 give 255.
ubyte saturateSignedWordToUnsignedByte(short value) pure @safe
{
    if (value > ubyte.max)
        return ubyte.max;
    if (value < 0)
        return 0;
    return cast(ubyte) value;
}
581 
/// Clamps a signed 32-bit value into the signed 16-bit range [-32768, 32767].
/// Values already inside the range are returned unchanged.
short saturateSignedIntToSignedShort(int value) pure @safe
{
    if (value > short.max)
        return short.max;
    if (value < short.min)
        return short.min;
    return cast(short) value;
}
588 
/// Clamps a signed 32-bit value into the unsigned 16-bit range [0, 65535].
/// Negative inputs give 0, inputs above 65535 give 65535.
ushort saturateSignedIntToUnsignedShort(int value) pure @safe
{
    if (value > ushort.max)
        return ushort.max;
    if (value < 0)
        return 0;
    return cast(ushort) value;
}
595 
unittest // saturation helpers clamp at both ends of the destination range
{
    // signed word -> signed byte
    assert(saturateSignedWordToSignedByte(-4000) == -128);
    assert(saturateSignedWordToSignedByte(32000) == 127);

    // signed word -> unsigned byte
    assert(saturateSignedWordToUnsignedByte(-4000) == 0);
    assert(saturateSignedWordToUnsignedByte(32000) == 255);

    // signed int -> signed short
    assert(saturateSignedIntToSignedShort(-32769) == -32768);
    assert(saturateSignedIntToSignedShort(32768) == 32767);

    // signed int -> unsigned short (32768 is in range and must pass through)
    assert(saturateSignedIntToUnsignedShort(-32769) == 0);
    assert(saturateSignedIntToUnsignedShort(32768) == 32768);
}
607 
version(unittest)
{
    // Debugging helpers, only compiled for unittest builds: each one prints
    // the lanes of a vector on a single line using C `printf`.
    import core.stdc.stdio: printf;

    // Note: you can override `pure` within a `debug` clause

    /// Prints the single 64-bit lane of an `__m64`.
    void _mm_print_pi64(__m64 v) @trusted
    {
        long1 asLong = cast(long1)v;
        printf("%lld\n", asLong.array[0]);
    }

    /// Prints an `__m64` as two 32-bit signed integers.
    void _mm_print_pi32(__m64 v) @trusted
    {
        int[2] lanes = (cast(int2)v).array;
        printf("%d %d\n", lanes[0], lanes[1]);
    }

    /// Prints an `__m64` as four 16-bit signed integers.
    void _mm_print_pi16(__m64 v) @trusted
    {
        short[4] lanes = (cast(short4)v).array;
        printf("%d %d %d %d\n", lanes[0], lanes[1], lanes[2], lanes[3]);
    }

    /// Prints an `__m64` as eight 8-bit signed integers.
    void _mm_print_pi8(__m64 v) @trusted
    {
        byte[8] lanes = (cast(byte8)v).array;
        printf("%d %d %d %d %d %d %d %d\n",
               lanes[0], lanes[1], lanes[2], lanes[3],
               lanes[4], lanes[5], lanes[6], lanes[7]);
    }

    /// Prints an `__m128i` as two 64-bit signed integers.
    void _mm_print_epi64(__m128i v) @trusted
    {
        long2 asLong = cast(long2)v;
        printf("%lld %lld\n", asLong.array[0], asLong.array[1]);
    }

    /// Prints an `__m128i` as four 32-bit signed integers.
    void _mm_print_epi32(__m128i v) @trusted
    {
        printf("%d %d %d %d\n",
               v.array[0],
               v.array[1],
               v.array[2],
               v.array[3]);
    }

    /// Prints an `__m128i` as eight 16-bit signed integers.
    void _mm_print_epi16(__m128i v) @trusted
    {
        short[8] lanes = (cast(short8)v).array;
        printf("%d %d %d %d %d %d %d %d\n",
               lanes[0], lanes[1], lanes[2], lanes[3],
               lanes[4], lanes[5], lanes[6], lanes[7]);
    }

    /// Prints an `__m128i` as sixteen 8-bit signed integers.
    void _mm_print_epi8(__m128i v) @trusted
    {
        byte[16] lanes = (cast(byte16)v).array;
        printf("%d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d\n",
               lanes[0], lanes[1], lanes[2], lanes[3],
               lanes[4], lanes[5], lanes[6], lanes[7],
               lanes[8], lanes[9], lanes[10], lanes[11],
               lanes[12], lanes[13], lanes[14], lanes[15]);
    }

    /// Prints an `__m128` as four floats.
    void _mm_print_ps(__m128 v) @trusted
    {
        float[4] lanes = (cast(float4)v).array;
        printf("%f %f %f %f\n", lanes[0], lanes[1], lanes[2], lanes[3]);
    }

    /// Prints an `__m128d` as two doubles.
    void _mm_print_pd(__m128d v) @trusted
    {
        double[2] lanes = (cast(double2)v).array;
        printf("%f %f\n", lanes[0], lanes[1]);
    }
}
679 
680 
681 //
682 //  <FLOATING-POINT COMPARISONS>
683 //
684 // Note: `ldc.simd` cannot express all nuances of FP comparisons, so we
685 //       need different IR generation.
686 
/// Floating-point comparison predicates. The member names are the predicate
/// spellings inserted into the `fcmp` inline IR by the LDC comparison helpers.
/// "Ordered" predicates are false when either operand is NaN, "unordered"
/// predicates are true when either operand is NaN.
/// The declaration order matters: FPComparisonToString is indexed by these values.
enum FPComparison
{
    oeq,   // ordered and equal
    ogt,   // ordered and greater than
    oge,   // ordered and greater than or equal
    olt,   // ordered and less than
    ole,   // ordered and less than or equal
    one,   // ordered and not equal
    ord,   // ordered (no nans)
    ueq,   // unordered or equal
    ugt,   // unordered or greater than ("nle")
    uge,   // unordered or greater than or equal ("nlt")
    ult,   // unordered or less than ("nge")
    ule,   // unordered or less than or equal ("ngt")
    une,   // unordered or not equal ("neq")
    uno,   // unordered (either nans)
}
704 
/// Spelling of each FPComparison member, indexed by the member's value.
/// Each entry is keyed by its enum member so that the table cannot silently
/// drift out of step with the enum declaration order.
private static immutable string[FPComparison.max+1] FPComparisonToString =
[
    FPComparison.oeq: "oeq",
    FPComparison.ogt: "ogt",
    FPComparison.oge: "oge",
    FPComparison.olt: "olt",
    FPComparison.ole: "ole",
    FPComparison.one: "one",
    FPComparison.ord: "ord",
    FPComparison.ueq: "ueq",
    FPComparison.ugt: "ugt",
    FPComparison.uge: "uge",
    FPComparison.ult: "ult",
    FPComparison.ule: "ule",
    FPComparison.une: "une",
    FPComparison.uno: "uno",
];
722 
// Individual floating-point comparison: returns `true` when the predicate
// `comparison` holds for (a, b) and `false` otherwise. Callers turn that
// boolean into the mask or flag value they need.
// Useful for DMD and testing
private bool compareFloat(T)(FPComparison comparison, T a, T b) pure @safe
{
    import std.math : isNaN;

    // A comparison is "unordered" as soon as one operand is NaN.
    immutable bool hasNaN = isNaN(a) || isNaN(b);
    immutable bool ordered = !hasNaN;

    final switch (comparison)
    {
        // Ordered predicates: the built-in operators already yield false on NaN,
        // except `!=` which yields true and therefore needs the explicit check.
        case FPComparison.oeq: return a == b;
        case FPComparison.ogt: return a > b;
        case FPComparison.oge: return a >= b;
        case FPComparison.olt: return a < b;
        case FPComparison.ole: return a <= b;
        case FPComparison.one: return ordered && (a != b);
        case FPComparison.ord: return ordered;

        // Unordered predicates: true when a NaN is involved.
        case FPComparison.ueq: return hasNaN || (a == b);
        case FPComparison.ugt: return hasNaN || (a > b);
        case FPComparison.uge: return hasNaN || (a >= b);
        case FPComparison.ult: return hasNaN || (a < b);
        case FPComparison.ule: return hasNaN || (a <= b);
        case FPComparison.une: return a != b; // `!=` is already true with a NaN
        case FPComparison.uno: return hasNaN;
    }
}
747 
748 version(LDC)
749 {
    /// Provides packed float comparisons.
    /// Emits an `fcmp` with the predicate named by `comparison` on the four
    /// lanes, then sign-extends each 1-bit result to 32 bits, so a lane is
    /// all ones (-1) when the predicate holds and 0 otherwise.
    package int4 cmpps(FPComparison comparison)(float4 a, float4 b) pure @safe
    {
        enum ir = `
            %cmp = fcmp `~ FPComparisonToString[comparison] ~` <4 x float> %0, %1
            %r = sext <4 x i1> %cmp to <4 x i32>
            ret <4 x i32> %r`;

        return LDCInlineIR!(ir, int4, float4, float4)(a, b);
    }
760 
    /// Provides packed double comparisons.
    /// Emits an `fcmp` with the predicate named by `comparison` on the two
    /// lanes, then sign-extends each 1-bit result to 64 bits, so a lane is
    /// all ones (-1) when the predicate holds and 0 otherwise.
    package long2 cmppd(FPComparison comparison)(double2 a, double2 b) pure @safe
    {
        enum ir = `
            %cmp = fcmp `~ FPComparisonToString[comparison] ~` <2 x double> %0, %1
            %r = sext <2 x i1> %cmp to <2 x i64>
            ret <2 x i64> %r`;

        return LDCInlineIR!(ir, long2, double2, double2)(a, b);
    }
771 
    /// CMPSS-style comparisons: compares lane 0 of `a` and `b`; the result is
    /// `a` with lane 0 replaced by an all-ones or all-zeroes bit pattern.
    /// clang implements it through x86 intrinsics; it is possible with IR alone
    /// but leads to less optimal code.
    /// PERF: try to implement it with __builtin_ia32_cmpss and immediate 0 to 7. 
    /// Not that simple.
    package float4 cmpss(FPComparison comparison)(float4 a, float4 b) pure @safe
    {
        /*
        Disabled draft of the builtin-based implementation (kept for reference):

        enum ubyte predicateNumber = FPComparisonToX86Predicate[comparison];
        enum bool invertOp = (predicateNumber & 0x80) != 0;
        static if(invertOp)
            return __builtin_ia32_cmpsd(b, a, predicateNumber & 0x7f);
        else
            return __builtin_ia32_cmpsd(a, b, predicateNumber & 0x7f);
        */
        // Scalar fcmp, sign-extended to 32 bits and reinterpreted as a float.
        enum ir = `
            %cmp = fcmp `~ FPComparisonToString[comparison] ~` float %0, %1
            %r = sext i1 %cmp to i32
            %r2 = bitcast i32 %r to float
            ret float %r2`;

        // Upper lanes are copied from `a`; only lane 0 receives the mask.
        float4 r = a;
        r[0] = LDCInlineIR!(ir, float, float, float)(a[0], b[0]);
        return r;
    }
797 
    /// CMPSD-style comparisons: compares lane 0 of `a` and `b`; the result is
    /// `a` with lane 0 replaced by an all-ones or all-zeroes bit pattern.
    /// clang implements it through x86 intrinsics; it is possible with IR alone
    /// but leads to less optimal code.
    /// PERF: try to implement it with __builtin_ia32_cmpsd and immediate 0 to 7. 
    /// Not that simple.    
    package double2 cmpsd(FPComparison comparison)(double2 a, double2 b) pure @safe
    {
        // Scalar fcmp, sign-extended to 64 bits and reinterpreted as a double.
        enum ir = `
            %cmp = fcmp `~ FPComparisonToString[comparison] ~` double %0, %1
            %r = sext i1 %cmp to i64
            %r2 = bitcast i64 %r to double
            ret double %r2`;

        // The upper lane is copied from `a`; only lane 0 receives the mask.
        double2 r = a;
        r[0] = LDCInlineIR!(ir, double, double, double)(a[0], b[0]);
        return r;
    }
815 
    /// COMISS-style comparison: compares lane 0 of `a` and `b` and returns
    /// 1 when the predicate `comparison` holds, 0 otherwise (the 1-bit result
    /// of `fcmp` is zero-extended).
    // Note: ucomss and ucomsd are left unimplemented
    package int comss(FPComparison comparison)(float4 a, float4 b) pure @safe
    {
        enum ir = `
            %cmp = fcmp `~ FPComparisonToString[comparison] ~` float %0, %1
            %r = zext i1 %cmp to i32
            ret i32 %r`;

        return LDCInlineIR!(ir, int, float, float)(a[0], b[0]);
    }
826 
    /// COMISD-style comparison: compares lane 0 of `a` and `b` and returns
    /// 1 when the predicate `comparison` holds, 0 otherwise (the 1-bit result
    /// of `fcmp` is zero-extended).
    // Note: ucomss and ucomsd are left unimplemented
    package int comsd(FPComparison comparison)(double2 a, double2 b) pure @safe
    {
        enum ir = `
            %cmp = fcmp `~ FPComparisonToString[comparison] ~` double %0, %1
            %r = zext i1 %cmp to i32
            ret i32 %r`;

        return LDCInlineIR!(ir, int, double, double)(a[0], b[0]);
    }
837 }
838 else
839 {
840     /// Provides packed float comparisons
841     package int4 cmpps(FPComparison comparison)(float4 a, float4 b) pure @trusted
842     {
843         int4 result;
844         foreach(i; 0..4)
845         {
846             result.ptr[i] = compareFloat!float(comparison, a.array[i], b.array[i]) ? -1 : 0;
847         }
848         return result;
849     }
850 
851     /// Provides packed double comparisons
852     package long2 cmppd(FPComparison comparison)(double2 a, double2 b) pure @trusted
853     {
854         long2 result;
855         foreach(i; 0..2)
856         {
857             result.ptr[i] = compareFloat!double(comparison, a.array[i], b.array[i]) ? -1 : 0;
858         }
859         return result;
860     }
861 
862     /// Provides CMPSS-style comparison
863     package float4 cmpss(FPComparison comparison)(float4 a, float4 b) pure @trusted
864     {
865         int4 result = cast(int4)a;
866         result.ptr[0] = compareFloat!float(comparison, a.array[0], b.array[0]) ? -1 : 0;
867         return cast(float4)result;
868     }
869 
870     /// Provides CMPSD-style comparison
871     package double2 cmpsd(FPComparison comparison)(double2 a, double2 b) pure @trusted
872     {
873         long2 result = cast(long2)a;
874         result.ptr[0] = compareFloat!double(comparison, a.array[0], b.array[0]) ? -1 : 0;
875         return cast(double2)result;
876     }
877 
878     package int comss(FPComparison comparison)(float4 a, float4 b) pure @safe
879     {
880         return compareFloat!float(comparison, a.array[0], b.array[0]) ? 1 : 0;
881     }
882 
883     // Note: ucomss and ucomsd are left unimplemented
884     package int comsd(FPComparison comparison)(double2 a, double2 b) pure @safe
885     {
886         return compareFloat!double(comparison, a.array[0], b.array[0]) ? 1 : 0;
887     }
888 }
unittest // cmpps
{
    // Runs one predicate over (x, y) and checks every lane of the resulting mask.
    static void expectMask(FPComparison op)(float4 x, float4 y,
                                           int lane0, int lane1, int lane2, int lane3)
    {
        int4 mask = cmpps!op(x, y);
        assert(mask.array[0] == lane0);
        assert(mask.array[1] == lane1);
        assert(mask.array[2] == lane2);
        assert(mask.array[3] == lane3);
    }

    // Check that every comparison type works.
    // The lanes cover "less", "equal", "greater" and "unordered" (NaN) in that order.
    float4 lhs = [1, 3, 5, float.nan];
    float4 rhs = [2, 3, 4, 5];

    // Ordered predicates: the NaN lane is always 0.
    expectMask!(FPComparison.oeq)(lhs, rhs,  0, -1,  0,  0);
    expectMask!(FPComparison.ogt)(lhs, rhs,  0,  0, -1,  0);
    expectMask!(FPComparison.oge)(lhs, rhs,  0, -1, -1,  0);
    expectMask!(FPComparison.olt)(lhs, rhs, -1,  0,  0,  0);
    expectMask!(FPComparison.ole)(lhs, rhs, -1, -1,  0,  0);
    expectMask!(FPComparison.one)(lhs, rhs, -1,  0, -1,  0);
    expectMask!(FPComparison.ord)(lhs, rhs, -1, -1, -1,  0);

    // Unordered predicates: the NaN lane is always -1.
    expectMask!(FPComparison.ueq)(lhs, rhs,  0, -1,  0, -1);
    expectMask!(FPComparison.ugt)(lhs, rhs,  0,  0, -1, -1);
    expectMask!(FPComparison.uge)(lhs, rhs,  0, -1, -1, -1);
    expectMask!(FPComparison.ult)(lhs, rhs, -1,  0,  0, -1);
    expectMask!(FPComparison.ule)(lhs, rhs, -1, -1,  0, -1);
    expectMask!(FPComparison.une)(lhs, rhs, -1,  0, -1, -1);
    expectMask!(FPComparison.uno)(lhs, rhs,  0,  0,  0, -1);
}
unittest // cmppd
{
    // With the `ult` predicate: lane 0 holds (1 < 2), lane 1 does not
    // (3 vs 3, and neither value is NaN).
    double2 lhs = [1, 3];
    double2 rhs = [2, 3];
    static immutable long[2] expected = [-1L, 0L];

    long2 mask = cmppd!(FPComparison.ult)(lhs, rhs);
    assert(mask.array == expected);
}
unittest // cmpss and comss
{
    // Validates cmpss and comss for one predicate on one pair of operands,
    // using compareFloat on lane 0 as the reference answer.
    void verify(FPComparison comparison)(float4 lhs, float4 rhs)
    {
        int wantMask = compareFloat!float(comparison, lhs.array[0], rhs.array[0]) ? -1 : 0;

        // cmpss: lane 0 is the all-ones / all-zeroes mask.
        float4 got = cmpss!comparison(lhs, rhs);
        int4 gotBits = cast(int4)got;
        assert(gotBits.array[0] == wantMask);

        // Lanes 1..3 must be carried over from the first operand.
        foreach (lane; 1 .. 4)
            assert(got.array[lane] == lhs.array[lane]);

        // comss: same verdict, expressed as non-zero / zero.
        int flag = comss!comparison(lhs, rhs);
        assert( (wantMask != 0) == (flag != 0) );
    }

    // Runs one predicate against an ordinary operand, then against one whose
    // lane 0 is NaN.
    void verifyBoth(FPComparison comparison)(float4 lhs, float4 ordered, float4 unordered)
    {
        verify!comparison(lhs, ordered);
        verify!comparison(lhs, unordered);
    }

    float4 A = [1, 3, 5, 6];
    float4 B = [2, 3, 4, 5];
    float4 C = [float.nan, 3, 4, 5];

    // Ordered predicates.
    verifyBoth!(FPComparison.oeq)(A, B, C);
    verifyBoth!(FPComparison.ogt)(A, B, C);
    verifyBoth!(FPComparison.oge)(A, B, C);
    verifyBoth!(FPComparison.olt)(A, B, C);
    verifyBoth!(FPComparison.ole)(A, B, C);
    verifyBoth!(FPComparison.one)(A, B, C);
    verifyBoth!(FPComparison.ord)(A, B, C);

    // Unordered predicates.
    verifyBoth!(FPComparison.ueq)(A, B, C);
    verifyBoth!(FPComparison.ugt)(A, B, C);
    verifyBoth!(FPComparison.uge)(A, B, C);
    verifyBoth!(FPComparison.ult)(A, B, C);
    verifyBoth!(FPComparison.ule)(A, B, C);
    verifyBoth!(FPComparison.une)(A, B, C);
    verifyBoth!(FPComparison.uno)(A, B, C);
}
unittest // cmpsd and comsd
{
    // Validates cmpsd and comsd for one predicate on one pair of operands,
    // using compareFloat on lane 0 as the reference answer.
    void verify(FPComparison comparison)(double2 lhs, double2 rhs)
    {
        long wantMask = compareFloat!double(comparison, lhs.array[0], rhs.array[0]) ? -1 : 0;

        // cmpsd: lane 0 is the mask, lane 1 is carried over from the first operand.
        double2 got = cmpsd!comparison(lhs, rhs);
        long2 gotBits = cast(long2)got;
        assert(gotBits.array[0] == wantMask);
        assert(got.array[1] == lhs.array[1]);

        // comsd: same verdict, expressed as non-zero / zero.
        int flag = comsd!comparison(lhs, rhs);
        assert( (wantMask != 0) == (flag != 0) );
    }

    // Runs one predicate against an ordinary operand, then against one whose
    // lane 0 is NaN.
    void verifyBoth(FPComparison comparison)(double2 lhs, double2 ordered, double2 unordered)
    {
        verify!comparison(lhs, ordered);
        verify!comparison(lhs, unordered);
    }

    double2 A = [1, 3];
    double2 B = [2, 4];
    double2 C = [double.nan, 5];

    // Ordered predicates.
    verifyBoth!(FPComparison.oeq)(A, B, C);
    verifyBoth!(FPComparison.ogt)(A, B, C);
    verifyBoth!(FPComparison.oge)(A, B, C);
    verifyBoth!(FPComparison.olt)(A, B, C);
    verifyBoth!(FPComparison.ole)(A, B, C);
    verifyBoth!(FPComparison.one)(A, B, C);
    verifyBoth!(FPComparison.ord)(A, B, C);

    // Unordered predicates.
    verifyBoth!(FPComparison.ueq)(A, B, C);
    verifyBoth!(FPComparison.ugt)(A, B, C);
    verifyBoth!(FPComparison.uge)(A, B, C);
    verifyBoth!(FPComparison.ult)(A, B, C);
    verifyBoth!(FPComparison.ule)(A, B, C);
    verifyBoth!(FPComparison.une)(A, B, C);
    verifyBoth!(FPComparison.uno)(A, B, C);
}
1048 
1049 //
1050 //  </FLOATING-POINT COMPARISONS>
1051 //
1052 
1053 
/// Extracts the low 64 bits of a 128-bit integer vector as an `__m64`.
/// The upper 64 bits of `a` are discarded.
__m64 to_m64(__m128i a) pure @trusted
{
    long2 halves = cast(long2)a;   // view the input as two 64-bit lanes
    long1 low = halves.array[0];   // keep only the first lane
    return low;
}
1060 
/// Widens an `__m64` to a 128-bit integer vector: the value goes into the
/// low 64 bits and the high 64 bits are zero.
__m128i to_m128i(__m64 a) pure @trusted
{
    // A DigitalMars-only path was tried here as a workaround for
    // https://issues.dlang.org/show_bug.cgi?id=21474, but it was not
    // sufficient to avoid the bug, so it stays disabled:
    //
    //     long2 r = a.array[0];
    //     r.ptr[1] = 0;
    //     return cast(int4)r;

    long2 widened = [0, 0];        // both lanes start at zero
    widened.ptr[0] = a.array[0];   // low lane receives the 64-bit payload
    return cast(__m128i)widened;
}
1078 
1079 // SOME NEON INTRINSICS
1080 // Emulating some x86 intrinsics needs access to a range of ARM intrinsics.
// Not in the public API, but the simde project exposes all of them to its users.
1082 // MAYDO: create a new neon.d module, for internal use only.
1083 // MAYDO: port them to ARM32 so that ARM32 can be as fast as ARM64.
1084 static if (LDC_with_ARM64)
1085 {
1086     // VERY USEFUL LINK
1087     // https://github.com/ldc-developers/llvm-project/blob/ldc-release/11.x/llvm/include/llvm/IR/IntrinsicsAArch64.td
1088 
1089     pragma(LDC_intrinsic, "llvm.aarch64.neon.addp.v8i8")
1090         byte8 vpadd_u8(byte8 a, byte8 b) pure @safe;
1091 
1092     byte8 vand_u8(byte8 a, byte8 b) pure @safe
1093     {
1094         return a & b;
1095     }
1096 
1097     short8 vcombine_s16(short4 lo, short4 hi) pure @trusted
1098     {
1099         short8 r;
1100         r.ptr[0]  = lo.array[0];
1101         r.ptr[1]  = lo.array[1];
1102         r.ptr[2]  = lo.array[2];
1103         r.ptr[3]  = lo.array[3];
1104         r.ptr[4]  = hi.array[0];
1105         r.ptr[5]  = hi.array[1];
1106         r.ptr[6]  = hi.array[2];
1107         r.ptr[7]  = hi.array[3];
1108         return r;
1109     }
1110 
1111     int4 vcombine_s32(int2 lo, int2 hi) pure @trusted
1112     {
1113         int4 r;
1114         r.ptr[0] = lo.array[0];
1115         r.ptr[1] = lo.array[1];
1116         r.ptr[2] = hi.array[0];
1117         r.ptr[3] = hi.array[1];
1118         return r;
1119     }
1120 
1121     byte16 vcombine_s8(byte8 lo, byte8 hi) pure @trusted
1122     {
1123         byte16 r;
1124         r.ptr[0]  = lo.array[0];
1125         r.ptr[1]  = lo.array[1];
1126         r.ptr[2]  = lo.array[2];
1127         r.ptr[3]  = lo.array[3];
1128         r.ptr[4]  = lo.array[4];
1129         r.ptr[5]  = lo.array[5];
1130         r.ptr[6]  = lo.array[6];
1131         r.ptr[7]  = lo.array[7];
1132         r.ptr[8]  = hi.array[0];
1133         r.ptr[9]  = hi.array[1];
1134         r.ptr[10] = hi.array[2];
1135         r.ptr[11] = hi.array[3];
1136         r.ptr[12] = hi.array[4];
1137         r.ptr[13] = hi.array[5];
1138         r.ptr[14] = hi.array[6];
1139         r.ptr[15] = hi.array[7];
1140         return r;
1141     }
1142 
1143     pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtms.v4i32.v4f32")
1144         int4 vcvtmq_s32_f32(float4 a) pure @safe;
1145 
1146     pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtns.v4i32.v4f32")
1147         int4 vcvtnq_s32_f32(float4 a) pure @safe;
1148 
1149     pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtps.v4i32.v4f32")
1150         int4 vcvtpq_s32_f32(float4 a) pure @safe;
1151 
1152     pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtzs.v4i32.v4f32")
1153         int4 vcvtzq_s32_f32(float4 a) pure @safe;
1154 
1155     pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtms.i32.f32")
1156         int vcvtms_s32_f32(float a) pure @safe;
1157 
1158     pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtns.i32.f32")
1159         int vcvtns_s32_f32(float a) pure @safe;    
1160 
1161     pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtps.i32.f32")
1162         int vcvtps_s32_f32(float a) pure @safe;
1163 
1164     pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtzs.i32.f32")
1165         int vcvts_s32_f32(float a) pure @safe;
1166      
1167     pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtms.i32.f64")
1168         int vcvtms_s32_f64(double a) pure @safe;
1169 
1170     pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtns.i32.f64")
1171         int vcvtns_s32_f64(double a) pure @safe;    
1172 
1173     pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtps.i32.f64")
1174         int vcvtps_s32_f64(double a) pure @safe;
1175 
1176     pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtzs.i32.f64")
1177         int vcvts_s32_f64(double a) pure @safe;
1178 
1179     pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtms.i64.f32")
1180         long vcvtms_s64_f32(float a) pure @safe;
1181 
1182     pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtns.i64.f32")
1183         long vcvtns_s64_f32(float a) pure @safe;    
1184 
1185     pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtps.i64.f32")
1186         long vcvtps_s64_f32(float a) pure @safe;
1187 
1188     pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtzs.i64.f32")
1189         long vcvts_s64_f32(float a) pure @safe;
1190 
1191     pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtms.i64.f64")
1192         long vcvtms_s64_f64(double a) pure @safe;
1193 
1194     pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtns.i64.f64")
1195         long vcvtns_s64_f64(double a) pure @safe;    
1196 
1197     pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtps.i64.f64")
1198         long vcvtps_s64_f64(double a) pure @safe;
1199 
1200     pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtzs.i64.f64")
1201         long vcvts_s64_f64(double a) pure @safe;
1202 
1203     short4 vget_high_s16(short8 a) pure @trusted
1204     {
1205         short4 r;
1206         r.ptr[0] = a.array[4];
1207         r.ptr[1] = a.array[5];
1208         r.ptr[2] = a.array[6];
1209         r.ptr[3] = a.array[7];
1210         return r;
1211     }
1212 
1213     int2 vget_high_s32(int4 a) pure @trusted
1214     {
1215         int2 r;
1216         r.ptr[0] = a.array[2];
1217         r.ptr[1] = a.array[3];
1218         return r;
1219     }
1220 
1221     byte8 vget_high_u8(byte16 a) pure @trusted
1222     {
1223         byte8 r;
1224         r.ptr[0] = a.array[8];
1225         r.ptr[1] = a.array[9];
1226         r.ptr[2] = a.array[10];
1227         r.ptr[3] = a.array[11];
1228         r.ptr[4] = a.array[12];
1229         r.ptr[5] = a.array[13];
1230         r.ptr[6] = a.array[14];
1231         r.ptr[7] = a.array[15];
1232         return r;
1233     }
1234 
1235     short4 vget_low_s16(short8 a) pure @trusted
1236     {
1237         short4 r;
1238         r.ptr[0] = a.array[0];
1239         r.ptr[1] = a.array[1];
1240         r.ptr[2] = a.array[2];
1241         r.ptr[3] = a.array[3];
1242         return r;
1243     } 
1244 
1245     int2 vget_low_s32(int4 a) pure @trusted
1246     {
1247         int2 r;
1248         r.ptr[0] = a.array[0];
1249         r.ptr[1] = a.array[1];
1250         return r;
1251     }
1252 
1253     byte8 vget_low_u8(byte16 a) pure @trusted
1254     {
1255         byte8 r;
1256         r.ptr[0] = a.array[0];
1257         r.ptr[1] = a.array[1];
1258         r.ptr[2] = a.array[2];
1259         r.ptr[3] = a.array[3];
1260         r.ptr[4] = a.array[4];
1261         r.ptr[5] = a.array[5];
1262         r.ptr[6] = a.array[6];
1263         r.ptr[7] = a.array[7];
1264         return r;
1265     }
1266 
1267     pragma(LDC_intrinsic, "llvm.aarch64.neon.smax.v8i16")
1268         short8 vmaxq_s16(short8 a, short8 b) pure @safe;
1269 
1270     pragma(LDC_intrinsic, "llvm.aarch64.neon.smin.v8i16")
1271         short8 vminq_s16(short8 a, short8 b) pure @safe;
1272 
1273     int4 vmull_s16(short4 a, short4 b) pure @trusted
1274     {
1275         int4 r;
1276         r.ptr[0] = a.array[0] * b.array[0];
1277         r.ptr[1] = a.array[1] * b.array[1];
1278         r.ptr[2] = a.array[2] * b.array[2];
1279         r.ptr[3] = a.array[3] * b.array[3];
1280         return r;
1281     }
1282 
1283     static if(__VERSION__ >= 2088) // LDC 1.18 start using LLVM9 who changes the name of the builtin
1284     {
1285         pragma(LDC_intrinsic, "llvm.aarch64.neon.faddp.v4f32")
1286             float4 vpaddq_f32(float4 a, float4 b) pure @safe;
1287     }
1288     else
1289     {
1290         pragma(LDC_intrinsic, "llvm.aarch64.neon.addp.v4f32")
1291             float4 vpaddq_f32(float4 a, float4 b) pure @safe;
1292     }
1293 
1294     pragma(LDC_intrinsic, "llvm.aarch64.neon.addp.v2i32")
1295         int2 vpadd_s32(int2 a, int2 b) pure @safe;
1296 
1297     pragma(LDC_intrinsic, "llvm.aarch64.neon.addp.v16i8")
1298         byte16 vpaddq_s8(byte16 a, byte16 b) pure @safe;
1299 
1300     pragma(LDC_intrinsic, "llvm.aarch64.neon.sqxtn.v8i8")
1301         byte8 vqmovn_s16(short8 a) pure @safe;
1302 
1303      pragma(LDC_intrinsic, "llvm.aarch64.neon.sqxtn.v4i16")
1304         short4 vqmovn_s32(int4 a) pure @safe;
1305 
1306     pragma(LDC_intrinsic, "llvm.aarch64.neon.sqxtun.v8i8")
1307         byte8 vqmovun_s16(short8 a) pure @safe;
1308 
1309     pragma(LDC_intrinsic, "llvm.aarch64.neon.urhadd.v16i8")
1310         byte16 vrhadd_u8(byte16 a, byte16 b) pure @safe;
1311 
1312     pragma(LDC_intrinsic, "llvm.aarch64.neon.urhadd.v8i16")
1313         short8 vrhadd_u16(short8 a, short8 b) pure @safe;
1314 
1315     byte8 vshr_u8(byte8 a, byte8 b) pure @safe
1316     {
1317         return a >>> b;
1318     }
1319 }
1320