inteli.internals source code

1 /**
2 * Internal stuff only, do not import.
3 *
4 * Copyright: Copyright Auburn Sounds 2016-2018, Stefanos Baziotis 2019.
5 * License:   $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
6 * Authors:   Guillaume Piolat
7 */
8 module inteli.internals;
9 
10 import inteli.types;
11 
12 // The only math functions needed for intel-intrinsics
13 public import core.math: sqrt; // since it's an intrinsics
14 public import std.math: abs; // `fabs` is broken with GCC 4.9.2 on Linux 64-bit
15 
16 
// Compile-time capability flags.
// Exactly one of the compiler branches below is compiled in. Every branch defines
// the same set of GDC_with_* and LDC_with_* enums, so the rest of the package can
// test any of them with `static if` whatever the compiler is.
version(GNU)
{
    version (X86)
    {
        // For 32-bit x86, disable vector extensions with GDC.
        // It just doesn't work well.
        enum GDC_with_x86 = true;
        enum GDC_with_MMX = false;
        enum GDC_with_SSE = false;
        enum GDC_with_SSE2 = false;
        enum GDC_with_SSE3 = false;
        enum LDC_with_ARM32 = false;
        enum LDC_with_ARM64 = false;
        enum LDC_with_SSE1 = false;
        enum LDC_with_SSE2 = false;
        enum LDC_with_SSE3 = false;
    }
    else version (X86_64)
    {
        // GDC support uses extended inline assembly:
        //   https://gcc.gnu.org/onlinedocs/gcc/Extended-Asm.html        (general information and hints)
        //   https://gcc.gnu.org/onlinedocs/gcc/Simple-Constraints.html  (binding variables to registers)
        //   https://gcc.gnu.org/onlinedocs/gcc/Machine-Constraints.html (x86 specific register short names)

        public import core.simd;

        // NOTE: These intrinsics are not available in every i386 and x86_64 CPU.
        // For more info: https://gcc.gnu.org/onlinedocs/gcc-4.9.2/gcc/X86-Built-in-Functions.html
        public import gcc.builtins;

        enum GDC_with_x86 = true;
        enum GDC_with_MMX = true; // We don't have a way to detect that at CT, but we assume it's there
        enum GDC_with_SSE = true; // We don't have a way to detect that at CT, but we assume it's there
        enum GDC_with_SSE2 = true; // We don't have a way to detect that at CT, but we assume it's there
        enum GDC_with_SSE3 = false; // TODO: we don't have a way to detect that at CT
        enum LDC_with_ARM32 = false;
        enum LDC_with_ARM64 = false;
        enum LDC_with_SSE1 = false;
        enum LDC_with_SSE2 = false;
        enum LDC_with_SSE3 = false;
    }
    else
    {
        // GDC on any other architecture: every flag is off.
        enum GDC_with_x86 = false;
        enum GDC_with_MMX = false;
        enum GDC_with_SSE = false;
        enum GDC_with_SSE2 = false;
        enum GDC_with_SSE3 = false;
        enum LDC_with_ARM32 = false;
        enum LDC_with_ARM64 = false;
        enum LDC_with_SSE1 = false;
        enum LDC_with_SSE2 = false;
        enum LDC_with_SSE3 = false;
    }
}
else version(LDC)
{
    public import core.simd;
    public import ldc.simd;
    public import ldc.intrinsics;
    public import ldc.llvmasm: __asm;

    // Since LDC 1.13, using the new ldc.llvmasm.__ir variants instead of inlineIR
    static if (__VERSION__ >= 2083)
    {
         import ldc.llvmasm;
         alias LDCInlineIR = __ir_pure;

         // A version of inline IR with prefix/suffix didn't exist before LDC 1.13
         alias LDCInlineIREx = __irEx_pure;
    }
    else
    {
        // Older front-ends: only LDCInlineIR is available, LDCInlineIREx is not defined here.
        alias LDCInlineIR = inlineIR;
    }

    // The GDC flags are all false under LDC; they are only visible inside the package.
    package(inteli)
    {
        enum GDC_with_x86 = false;
        enum GDC_with_MMX = false;
        enum GDC_with_SSE = false;
        enum GDC_with_SSE2 = false;
        enum GDC_with_SSE3 = false;
    }

    version(ARM)
    {
        // 32-bit ARM target.
        public import ldc.gccbuiltins_arm;
        enum LDC_with_ARM32 = true;
        enum LDC_with_ARM64 = false;
        enum LDC_with_SSE1 = false;
        enum LDC_with_SSE2 = false;
        enum LDC_with_SSE3 = false;
    }
    else version(AArch64)
    {
        // 64-bit ARM target; note the builtin import is intentionally left disabled.
        //public import ldc.gccbuiltins_arm;
        enum LDC_with_ARM32 = false;
        enum LDC_with_ARM64 = true;
        enum LDC_with_SSE1 = false;
        enum LDC_with_SSE2 = false;
        enum LDC_with_SSE3 = false;
    }
    else
    {
        // Any other LDC target is treated as x86: the SSE levels are queried from
        // the features enabled for the compilation target.
        public import ldc.gccbuiltins_x86;
        enum LDC_with_ARM32 = false;
        enum LDC_with_ARM64 = false;
        enum LDC_with_SSE1 = __traits(targetHasFeature, "sse");
        enum LDC_with_SSE2 = __traits(targetHasFeature, "sse2");
        enum LDC_with_SSE3 = __traits(targetHasFeature, "sse3");
    }
}
else version(DigitalMars)
{
    // DMD: none of the GDC/LDC specific paths apply.
    package(inteli)
    {
        enum GDC_with_x86 = false;
        enum GDC_with_MMX = false;
        enum GDC_with_SSE = false;
        enum GDC_with_SSE2 = false;
        enum GDC_with_SSE3 = false;
        enum LDC_with_ARM32 = false;
        enum LDC_with_ARM64 = false;
        enum LDC_with_SSE1 = false;
        enum LDC_with_SSE2 = false;
        enum LDC_with_SSE3 = false;
    }
}
else
{
    // Fail the build early rather than silently compiling without any flag defined.
    static assert(false, "Unknown compiler");
}
150 
// True when compiling with LDC for either ARM flavour.
enum LDC_with_ARM = LDC_with_ARM32 | LDC_with_ARM64; // ARM32 is largely unsupported though

static if (LDC_with_ARM32)
{
    /// Reads the floating-point control word through `__builtin_arm_get_fpscr`.
    /// Named `fpcr` so that the 32-bit and 64-bit ARM paths share one function name.
    package uint arm_get_fpcr() nothrow @nogc @trusted
    {
        return __builtin_arm_get_fpscr();
    }

    /// Writes `cw` as the new floating-point control word through
    /// `__builtin_arm_set_fpscr`.
    package void arm_set_fpcr(uint cw) nothrow @nogc @trusted
    {
        __builtin_arm_set_fpscr(cw);
    }
}
165 
static if (LDC_with_ARM64)
{
    // Declaration of the LLVM intrinsic. It is not called by the functions in this
    // block; see the note in arm_get_fpcr below.
    pragma(LDC_intrinsic, "llvm.aarch64.get.fpcr")
        long __builtin_aarch64_get_fpcr() pure nothrow @nogc @safe;

    /// Returns the current FPCR value, read with an `mrs` instruction.
    package uint arm_get_fpcr() pure nothrow @nogc @trusted
    {
        // LLVM intrinsic "llvm.aarch64.get.fpcr" seems buggy and doesn't return FPCR
        return __asm!uint("mrs $0, fpcr", "=r");
    }

    /// Writes `cw` into FPCR.
    package void arm_set_fpcr(uint cw) nothrow @nogc @trusted
    {
        // Note: there doesn't seem to be an intrinsic in LLVM to set FPCR.
        // The sequence below borrows register x2: it is spilled to `save_x2`,
        // loaded with `cw`, moved into FPCR, then restored from `save_x2`.
        long save_x2;
        __asm!void("str x2, $1 \n" ~
                   "ldr w2, $0 \n" ~
                   "msr fpcr, x2 \n" ~
                   "ldr x2, $1 "   , "m,m", cw, &save_x2);
    }
}
187 
// Inline-assembler availability flags for DMD.
//   DMD_with_asm       : DMD with either the 32-bit or the 64-bit inline assembler.
//   DMD_with_32bit_asm : DMD with the 32-bit inline assembler only
//                        (sometimes you want a 32-bit DMD only solution).
// Both are false for every other compiler.
version(DigitalMars)
{
    version(D_InlineAsm_X86)
    {
        enum DMD_with_asm = true;
        enum DMD_with_32bit_asm = true;
    }
    else version(D_InlineAsm_X86_64)
    {
        enum DMD_with_asm = true;
        enum DMD_with_32bit_asm = false;
    }
    else
    {
        enum DMD_with_asm = false;
        enum DMD_with_32bit_asm = false;
    }
}
else
{
    enum DMD_with_asm = false;
    enum DMD_with_32bit_asm = false;
}
207 
208 
209 package:
210 nothrow @nogc:
211 
212 
// For internal use only, since public API deals with a x86 semantic emulation.
// These are bit patterns for the ARM floating-point control word as returned by
// arm_get_fpcr(): the rounding mode lives in the two bits selected by
// _MM_ROUND_MASK_ARM (bits 22-23), flush-to-zero is bit 24.
enum uint _MM_ROUND_NEAREST_ARM     = 0x00000000; // both rounding bits clear
enum uint _MM_ROUND_DOWN_ARM        = 0x00800000; // bit 23 only
enum uint _MM_ROUND_UP_ARM          = 0x00400000; // bit 22 only
enum uint _MM_ROUND_TOWARD_ZERO_ARM = 0x00C00000; // bits 22 and 23
enum uint _MM_ROUND_MASK_ARM        = 0x00C00000; // isolates the rounding-mode bits
enum uint _MM_FLUSH_ZERO_MASK_ARM = 0x01000000;   // bit 24
220 
221 
//
//  <ROUNDING>
//
//  Why is that there? For DMD, we cannot use rint because _MM_SET_ROUNDING_MODE
//  doesn't change the FPU rounding mode, and isn't expected to do so.
//  So we devised these rounding functions to help having consistent rounding between
//  LDC and DMD. It's important that DMD uses what is in MXCSR to round.
//
//  Note: There is no MXCSR in ARM. But there is fpscr (fpcr on AArch64), which
//  implements similar functionality.
//  https://developer.arm.com/documentation/dui0068/b/vector-floating-point-programming/vfp-system-registers/fpscr--the-floating-point-status-and-control-register
//  We use fpscr since it's thread-local, so we can emulate those x86 conversions, albeit slowly.
235 
/// Converts `value` to a 32-bit signed integer, emulating the x86 `cvtss2si`
/// instruction (conversion driven by the rounding mode held in MXCSR).
/// Each compiler/target combination gets its own implementation below.
/// Params:
///     value = the single-precision number to convert
/// Returns: the converted integer
int convertFloatToInt32UsingMXCSR(float value) @trusted
{
    int result;
    version(GNU)
    {
        // GDC: extended inline assembly. `value` is the input operand (%1, "x"
        // constraint) and `result` the output operand (%0, "=r" constraint).
        asm pure nothrow @nogc @trusted
        {
            "cvtss2si %1, %0\n": "=r"(result) : "x" (value);
        }
    }
    else static if (LDC_with_ARM32)
    {
        // Loads `value` into s2, converts in place with vcvtr, then moves s2 to the result.
        // NOTE(review): vcvtr is expected to honour the FPSCR rounding mode -- confirm
        // against the ARM documentation.
        // TODO: this is a bug, it won't preserve registers when optimized
        result = __asm!int(`vldr s2, $1
                            vcvtr.s32.f32 s2, s2
                            vmov $0, s2`, "=r,m", value);
    }
    else static if (LDC_with_ARM64)
    {
        // Get current rounding mode.
        // (The local is called `fpscr` but holds whatever arm_get_fpcr() returns.)
        uint fpscr = arm_get_fpcr();

        // Pick the conversion intrinsic that matches the rounding bits.
        // `default:` shares the round-to-nearest case.
        switch(fpscr & _MM_ROUND_MASK_ARM)
        {
            default:
            case _MM_ROUND_NEAREST_ARM:     result = vcvtns_s32_f32(value); break;
            case _MM_ROUND_DOWN_ARM:        result = vcvtms_s32_f32(value); break;
            case _MM_ROUND_UP_ARM:          result = vcvtps_s32_f32(value); break;
            case _MM_ROUND_TOWARD_ZERO_ARM: result = vcvts_s32_f32(value);  break;
        }
    }
    else
    {
        // Remaining compilers: D inline assembler, going through EAX.
        asm pure nothrow @nogc @trusted
        {
            cvtss2si EAX, value;
            mov result, EAX;
        }
    }
    return result;
}
277 
/// Converts `value` to a 32-bit signed integer, emulating the x86 `cvtsd2si`
/// instruction (conversion driven by the rounding mode held in MXCSR).
/// Each compiler/target combination gets its own implementation below.
/// Params:
///     value = the double-precision number to convert
/// Returns: the converted integer
int convertDoubleToInt32UsingMXCSR(double value) @trusted
{
    int result;
    version(GNU)
    {
        // GDC: extended inline assembly. `value` is the input operand (%1, "x"
        // constraint) and `result` the output operand (%0, "=r" constraint).
        asm pure nothrow @nogc @trusted
        {
            "cvtsd2si %1, %0\n": "=r"(result) : "x" (value);
        }
    }
    else static if (LDC_with_ARM32)
    {
        // Loads `value` into d2, converts into s2 with vcvtr, then moves s2 to the result.
        // NOTE(review): vcvtr is expected to honour the FPSCR rounding mode -- confirm
        // against the ARM documentation.
        // TODO: bug, doesn't preserve registers
        result = __asm!int(`vldr d2, $1
                            vcvtr.s32.f64 s2, d2
                            vmov $0, s2`, "=r,m", value);
    }
    else static if (LDC_with_ARM64)
    {
        // Get current rounding mode.
        // (The local is called `fpscr` but holds whatever arm_get_fpcr() returns.)
        uint fpscr = arm_get_fpcr();

        // Pick the conversion intrinsic that matches the rounding bits.
        // `default:` shares the round-to-nearest case.
        switch(fpscr & _MM_ROUND_MASK_ARM)
        {
            default:
            case _MM_ROUND_NEAREST_ARM:     result = vcvtns_s32_f64(value); break;
            case _MM_ROUND_DOWN_ARM:        result = vcvtms_s32_f64(value); break;
            case _MM_ROUND_UP_ARM:          result = vcvtps_s32_f64(value); break;
            case _MM_ROUND_TOWARD_ZERO_ARM: result = vcvts_s32_f64(value);  break;
        }
    }
    else
    {
        // Remaining compilers: D inline assembler, going through EAX.
        asm pure nothrow @nogc @trusted
        {
            cvtsd2si EAX, value;
            mov result, EAX;
        }
    }
    return result;
}
319 
/// Converts `value` to a 64-bit signed integer, emulating the x86 `cvtss2si`
/// instruction with a 64-bit destination: the rounding mode is the one currently
/// selected in MXCSR on x86, or in the ARM floating-point control word on ARM.
/// Params:
///     value = the single-precision number to convert
/// Returns: the converted integer
long convertFloatToInt64UsingMXCSR(float value) @trusted
{
    static if (LDC_with_ARM32)
    {
        // We have to resort to libc since 32-bit ARM
        // doesn't seem to have 64-bit registers.

        uint fpscr = arm_get_fpcr(); // Get current rounding mode.

        // Note: converting to double precision else rounding could be different for large integers
        double asDouble = value;

        switch(fpscr & _MM_ROUND_MASK_ARM)
        {
            default:
            // Round-to-nearest has to break ties to even, like cvtss2si and like the
            // vcvtn* conversions used on AArch64. llvm_round breaks ties away from
            // zero (2.5 -> 3), so llvm_rint is used instead: it rounds with the
            // current rounding mode, which is known to be "nearest" in this case.
            case _MM_ROUND_NEAREST_ARM:     return cast(long)(llvm_rint(asDouble));
            case _MM_ROUND_DOWN_ARM:        return cast(long)(llvm_floor(asDouble));
            case _MM_ROUND_UP_ARM:          return cast(long)(llvm_ceil(asDouble));
            case _MM_ROUND_TOWARD_ZERO_ARM: return cast(long)(asDouble);
        }
    }
    else static if (LDC_with_ARM64)
    {
        // Get current rounding mode and pick the matching conversion intrinsic.
        uint fpscr = arm_get_fpcr();

        switch(fpscr & _MM_ROUND_MASK_ARM)
        {
            default:
            case _MM_ROUND_NEAREST_ARM:     return vcvtns_s64_f32(value);
            case _MM_ROUND_DOWN_ARM:        return vcvtms_s64_f32(value);
            case _MM_ROUND_UP_ARM:          return vcvtps_s64_f32(value);
            case _MM_ROUND_TOWARD_ZERO_ARM: return vcvts_s64_f32(value);
        }
    }
    // 64-bit can use an SSE instruction
    else version(D_InlineAsm_X86_64)
    {
        long result;
        version(LDC) // work-around for " Data definition directives inside inline asm are not supported yet."
        {
            asm pure nothrow @nogc @trusted
            {
                movss XMM0, value;
                cvtss2si RAX, XMM0;
                mov result, RAX;
            }
        }
        else
        {
            asm pure nothrow @nogc @trusted
            {
                movss XMM0, value;
                db 0xf3; db 0x48; db 0x0f; db 0x2d; db 0xc0; // cvtss2si RAX, XMM0 (DMD refuses to emit)
                mov result, RAX;
            }
        }
        return result;
    }
    else version(D_InlineAsm_X86)
    {
        // In the case of 32-bit x86 there is no SSE2 way to convert FP to 64-bit int
        // This leads to an unfortunate FPU sequence in every C++ compiler.
        // See: https://godbolt.org/z/vZym77

        // Get current MXCSR rounding
        uint sseRounding;
        ushort savedFPUCW;
        ushort newFPUCW;
        long result;
        asm pure nothrow @nogc @trusted
        {
            stmxcsr sseRounding;
            fld value;
            fnstcw savedFPUCW;
            mov AX, savedFPUCW;
            and AX, 0xf3ff;          // clear FPU rounding bits
            movzx ECX, word ptr sseRounding;
            and ECX, 0x6000;         // only keep SSE rounding bits
            shr ECX, 3;
            or AX, CX;               // make a new control word for FPU with SSE bits
            mov newFPUCW, AX;
            fldcw newFPUCW;
            fistp qword ptr result;            // convert, respecting MXCSR (but not other control word things)
            fldcw savedFPUCW;
        }
        return result;
    }
    else static if (GDC_with_x86)
    {
        version(X86_64) // 64-bit can just use the right instruction
        {
            static assert(GDC_with_SSE);
            __m128 A;
            A.ptr[0] = value;
            return __builtin_ia32_cvtss2si64 (A);
        }
        else version(X86) // 32-bit
        {
            // Same x87 sequence as the D_InlineAsm_X86 branch, in GCC extended-asm syntax.
            // This is untested!
            uint sseRounding;
            ushort savedFPUCW;
            ushort newFPUCW;
            long result;
            asm pure nothrow @nogc @trusted
            {
                "stmxcsr %1;\n" ~
                "fld %2;\n" ~
                "fnstcw %3;\n" ~
                "movw %3, %%ax;\n" ~
                "andw $0xf3ff, %%ax;\n" ~
                "movzwl %1, %%ecx;\n" ~
                "andl $0x6000, %%ecx;\n" ~
                "shrl $3, %%ecx;\n" ~
                "orw %%cx, %%ax\n" ~
                "movw %%ax, %4;\n" ~
                "fldcw %4;\n" ~
                "fistpll %0;\n" ~
                "fldcw %3;\n" 
                  : "=m"(result)    // %0
                  : "m" (sseRounding),
                    "f" (value),
                    "m" (savedFPUCW),
                    "m" (newFPUCW) 
                  : "eax", "ecx", "st";
            }
            return result;
        }
        else
            static assert(false);
    }
    else
        static assert(false);
}
453 
454 
/// Converts `value` to a 64-bit signed integer, emulating the x86 `cvtsd2si`
/// instruction with a 64-bit destination: the rounding mode is the one currently
/// selected in MXCSR on x86, or in the ARM floating-point control word on ARM.
/// Params:
///     value = the double-precision number to convert
/// Returns: the converted integer
long convertDoubleToInt64UsingMXCSR(double value) @trusted
{
    static if (LDC_with_ARM32)
    {
        // We have to resort to libc since 32-bit ARM
        // doesn't seem to have 64-bit registers.
        uint fpscr = arm_get_fpcr(); // Get current rounding mode.
        switch(fpscr & _MM_ROUND_MASK_ARM)
        {
            default:
            // Round-to-nearest has to break ties to even, like cvtsd2si and like the
            // vcvtn* conversions used on AArch64. llvm_round breaks ties away from
            // zero (2.5 -> 3), so llvm_rint is used instead: it rounds with the
            // current rounding mode, which is known to be "nearest" in this case.
            case _MM_ROUND_NEAREST_ARM:     return cast(long)(llvm_rint(value));
            case _MM_ROUND_DOWN_ARM:        return cast(long)(llvm_floor(value));
            case _MM_ROUND_UP_ARM:          return cast(long)(llvm_ceil(value));
            case _MM_ROUND_TOWARD_ZERO_ARM: return cast(long)(value);
        }
    }
    else static if (LDC_with_ARM64)
    {
        // Get current rounding mode.
        uint fpscr = arm_get_fpcr();

        switch(fpscr & _MM_ROUND_MASK_ARM)
        {
            default:
            case _MM_ROUND_NEAREST_ARM:     return vcvtns_s64_f64(value);
            case _MM_ROUND_DOWN_ARM:        return vcvtms_s64_f64(value);
            case _MM_ROUND_UP_ARM:          return vcvtps_s64_f64(value);
            case _MM_ROUND_TOWARD_ZERO_ARM: return vcvts_s64_f64(value);
        }
    }
    // 64-bit can use an SSE instruction
    else version(D_InlineAsm_X86_64)
    {
        long result;
        version(LDC) // work-around for "Data definition directives inside inline asm are not supported yet."
        {
            asm pure nothrow @nogc @trusted
            {
                movsd XMM0, value;
                cvtsd2si RAX, XMM0;
                mov result, RAX;
            }
        }
        else
        {
            asm pure nothrow @nogc @trusted
            {
                movsd XMM0, value;
                db 0xf2; db 0x48; db 0x0f; db 0x2d; db 0xc0; // cvtsd2si RAX, XMM0 (DMD refuses to emit)
                mov result, RAX;
            }
        }
        return result;
    }
    else version(D_InlineAsm_X86)
    {
        // In the case of 32-bit x86 there is no SSE2 way to convert FP to 64-bit int
        // This leads to an unfortunate FPU sequence in every C++ compiler.
        // See: https://godbolt.org/z/vZym77

        // Get current MXCSR rounding
        uint sseRounding;
        ushort savedFPUCW;
        ushort newFPUCW;
        long result;
        asm pure nothrow @nogc @trusted
        {
            stmxcsr sseRounding;
            fld value;
            fnstcw savedFPUCW;
            mov AX, savedFPUCW;
            and AX, 0xf3ff;          // clear FPU rounding bits
            movzx ECX, word ptr sseRounding;
            and ECX, 0x6000;         // only keep SSE rounding bits
            shr ECX, 3;
            or AX, CX;               // make a new control word for FPU with SSE bits
            mov newFPUCW, AX;
            fldcw newFPUCW;
            fistp result;            // convert, respecting MXCSR (but not other control word things)
            fldcw savedFPUCW;
        }
        return result;
    }
    else static if (GDC_with_x86)
    {
        version(X86_64)
        {
            static assert(GDC_with_SSE2);
            __m128d A;
            A.ptr[0] = value;
            return __builtin_ia32_cvtsd2si64 (A);
        }
        else
        {
            // Same x87 sequence as the D_InlineAsm_X86 branch, in GCC extended-asm syntax.
            // This is untested!
            uint sseRounding;
            ushort savedFPUCW;
            ushort newFPUCW;
            long result;
            asm pure nothrow @nogc @trusted
            {
                "stmxcsr %1;\n" ~
                "fld %2;\n" ~
                "fnstcw %3;\n" ~
                "movw %3, %%ax;\n" ~
                "andw $0xf3ff, %%ax;\n" ~
                "movzwl %1, %%ecx;\n" ~
                "andl $0x6000, %%ecx;\n" ~
                "shrl $3, %%ecx;\n" ~
                "orw %%cx, %%ax\n" ~
                "movw %%ax, %4;\n" ~
                "fldcw %4;\n" ~
                "fistpll %0;\n" ~
                "fldcw %3;\n"         
                  : "=m"(result)    // %0
                  : "m" (sseRounding),
                    "t" (value),
                    "m" (savedFPUCW),
                    "m" (newFPUCW) 
                  : "eax", "ecx", "st";
            }
            return result;
        }
    }
    else
        static assert(false);
}
583 
584 //
585 //  </ROUNDING>
586 //
587 
588 
589 // using the Intel terminology here
590 
/// Narrows a signed 16-bit value to a signed 8-bit value with saturation:
/// inputs below -128 yield -128, inputs above 127 yield 127.
byte saturateSignedWordToSignedByte(short value) pure @safe
{
    if (value < byte.min)
        return byte.min;
    if (value > byte.max)
        return byte.max;
    return cast(byte) value;
}
597 
/// Narrows a signed 16-bit value to an unsigned 8-bit value with saturation:
/// negative inputs yield 0, inputs above 255 yield 255.
ubyte saturateSignedWordToUnsignedByte(short value) pure @safe
{
    return cast(ubyte)(value < 0 ? 0 : (value > ubyte.max ? ubyte.max : value));
}
604 
/// Narrows a signed 32-bit value to a signed 16-bit value with saturation:
/// inputs below -32768 yield -32768, inputs above 32767 yield 32767.
short saturateSignedIntToSignedShort(int value) pure @safe
{
    if (value < short.min)
        return short.min;
    if (value > short.max)
        return short.max;
    return cast(short) value;
}
611 
/// Narrows a signed 32-bit value to an unsigned 16-bit value with saturation:
/// negative inputs yield 0, inputs above 65535 yield 65535.
ushort saturateSignedIntToUnsignedShort(int value) pure @safe
{
    return cast(ushort)(value < 0 ? 0 : (value > ushort.max ? ushort.max : value));
}
618 
unittest // test saturate operations
{
    // signed 16-bit -> signed 8-bit
    assert( saturateSignedWordToSignedByte(-4000) == -128);
    assert( saturateSignedWordToSignedByte(32000) == 127);

    // signed 16-bit -> unsigned 8-bit
    assert( saturateSignedWordToUnsignedByte(-4000) == 0);
    assert( saturateSignedWordToUnsignedByte(32000) == 255);

    // signed 32-bit -> signed 16-bit
    assert( saturateSignedIntToSignedShort(-32769) == -32768);
    assert( saturateSignedIntToSignedShort(32768) == 32767);

    // signed 32-bit -> unsigned 16-bit
    assert( saturateSignedIntToUnsignedShort(-32769) == 0);
    assert( saturateSignedIntToUnsignedShort(32768) == 32768);
}
630 
// Debug printers, only compiled in unittest builds. Each one reinterprets the
// vector as lanes of a given type and prints all lanes on one line with printf.
version(unittest)
{
    // This is just for debugging tests
    import core.stdc.stdio: printf;

    // printing vectors for implementation
    // Note: you can override `pure` within a `debug` clause

    /// Prints a 64-bit vector as one 64-bit signed integer.
    void _mm_print_pi64(__m64 v) @trusted
    {
        long1 vl = cast(long1)v;
        printf("%lld\n", vl.array[0]);
    }

    /// Prints a 64-bit vector as two 32-bit signed integers.
    void _mm_print_pi32(__m64 v) @trusted
    {
        int[2] C = (cast(int2)v).array;
        printf("%d %d\n", C[0], C[1]);
    }

    /// Prints a 64-bit vector as four 16-bit signed integers.
    void _mm_print_pi16(__m64 v) @trusted
    {
        short[4] C = (cast(short4)v).array;
        printf("%d %d %d %d\n", C[0], C[1], C[2], C[3]);
    }

    /// Prints a 64-bit vector as eight 8-bit signed integers.
    void _mm_print_pi8(__m64 v) @trusted
    {
        byte[8] C = (cast(byte8)v).array;
        printf("%d %d %d %d %d %d %d %d\n",
        C[0], C[1], C[2], C[3], C[4], C[5], C[6], C[7]);
    }

    /// Prints a 128-bit vector as two 64-bit signed integers.
    void _mm_print_epi64(__m128i v) @trusted
    {
        long2 vl = cast(long2)v;
        printf("%lld %lld\n", vl.array[0], vl.array[1]);
    }

    /// Prints the four elements of a 128-bit integer vector with `%d`.
    void _mm_print_epi32(__m128i v) @trusted
    {
        printf("%d %d %d %d\n",
              v.array[0], v.array[1], v.array[2], v.array[3]);
    }  

    /// Prints a 128-bit vector as eight 16-bit signed integers.
    void _mm_print_epi16(__m128i v) @trusted
    {
        short[8] C = (cast(short8)v).array;
        printf("%d %d %d %d %d %d %d %d\n",
        C[0], C[1], C[2], C[3], C[4], C[5], C[6], C[7]);
    }

    /// Prints a 128-bit vector as sixteen 8-bit signed integers.
    void _mm_print_epi8(__m128i v) @trusted
    {
        byte[16] C = (cast(byte16)v).array;
        printf("%d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d\n",
        C[0], C[1], C[2], C[3], C[4], C[5], C[6], C[7], C[8], C[9], C[10], C[11], C[12], C[13], C[14], C[15]);
    }

    /// Prints a 128-bit vector as four single-precision floats.
    void _mm_print_ps(__m128 v) @trusted
    {
        float[4] C = (cast(float4)v).array;
        printf("%f %f %f %f\n", C[0], C[1], C[2], C[3]);
    }

    /// Prints a 128-bit vector as two double-precision floats.
    void _mm_print_pd(__m128d v) @trusted
    {
        double[2] C = (cast(double2)v).array;
        printf("%f %f\n", C[0], C[1]);
    }    
}
702 
703 
704 //
705 //  <FLOATING-POINT COMPARISONS>
706 //
707 // Note: `ldc.simd` cannot express all nuances of FP comparisons, so we
708 //       need different IR generation.
709 
/// Floating-point comparison predicates. The member names are spelled exactly
/// like the LLVM `fcmp` condition codes, because they are pasted into inline IR.
enum FPComparison
{
    oeq,   // ordered and equal
    ogt,   // ordered and greater than
    oge,   // ordered and greater than or equal
    olt,   // ordered and less than
    ole,   // ordered and less than or equal
    one,   // ordered and not equal
    ord,   // ordered (no nans)
    ueq,   // unordered or equal
    ugt,   // unordered or greater than ("nle")
    uge,   // unordered or greater than or equal ("nlt")
    ult,   // unordered or less than ("nge")
    ule,   // unordered or less than or equal ("ngt")
    une,   // unordered or not equal ("neq")
    uno,   // unordered (either nans)
}

// Predicate -> condition-code text, indexed by the enum value.
// Every entry is the member's own identifier, so the table is built from the
// member list (declaration order) instead of being typed out a second time.
private static immutable string[FPComparison.max+1] FPComparisonToString =
    [ __traits(allMembers, FPComparison) ];
745 
// Scalar floating-point comparison: evaluates the predicate `comparison` on
// `a` and `b` and returns whether it holds (callers turn the bool into the
// lane value they need).
// Useful for DMD and testing
private bool compareFloat(T)(FPComparison comparison, T a, T b) pure @safe
{
    import std.math : isNaN;

    immutable bool hasNaN = isNaN(a) || isNaN(b);
    immutable bool ordered = !hasNaN;

    final switch(comparison) with(FPComparison)
    {
        // Ordered predicates: the built-in operators are already false on NaN.
        case oeq: return a == b;
        case ogt: return a > b;
        case oge: return a >= b;
        case olt: return a < b;
        case ole: return a <= b;
        case one: return ordered && !(a == b); // `!=` alone would be true on NaN
        case ord: return ordered;

        // Unordered predicates: true on NaN, i.e. the negation of the opposite
        // ordered comparison.
        case ueq: return hasNaN || a == b;
        case ugt: return !(a <= b);
        case uge: return !(a < b);
        case ult: return !(a >= b);
        case ule: return !(a > b);
        case une: return !(a == b);
        case uno: return hasNaN;
    }
}
770 
771 version(LDC)
772 {
    /// Provides packed float comparisons.
    /// The predicate name is spliced into an `fcmp` on `<4 x float>`; the
    /// `<4 x i1>` result is sign-extended, so each lane of the returned vector is
    /// -1 where the predicate holds and 0 where it does not.
    package int4 cmpps(FPComparison comparison)(float4 a, float4 b) pure @safe
    {
        enum ir = `
            %cmp = fcmp `~ FPComparisonToString[comparison] ~` <4 x float> %0, %1
            %r = sext <4 x i1> %cmp to <4 x i32>
            ret <4 x i32> %r`;

        return LDCInlineIR!(ir, int4, float4, float4)(a, b);
    }
783 
    /// Provides packed double comparisons.
    /// The predicate name is spliced into an `fcmp` on `<2 x double>`; the
    /// `<2 x i1>` result is sign-extended, so each lane of the returned vector is
    /// -1 where the predicate holds and 0 where it does not.
    package long2 cmppd(FPComparison comparison)(double2 a, double2 b) pure @safe
    {
        enum ir = `
            %cmp = fcmp `~ FPComparisonToString[comparison] ~` <2 x double> %0, %1
            %r = sext <2 x i1> %cmp to <2 x i64>
            ret <2 x i64> %r`;

        return LDCInlineIR!(ir, long2, double2, double2)(a, b);
    }
794 
    /// CMPSS-style comparisons
    /// clang implement it through x86 intrinsics, it is possible with IR alone
    /// but leads to less optimal code.
    /// PERF: try to implement it with __builtin_ia32_cmpss and immediate 0 to 7.
    /// Not that simple.
    ///
    /// Only lane 0 is compared: lane 0 of the result is the sign-extended
    /// comparison outcome bit-cast to float (all bits set or all bits clear),
    /// the other lanes are copied from `a`.
    package float4 cmpss(FPComparison comparison)(float4 a, float4 b) pure @safe
    {
        // Disabled sketch of the intrinsic-based variant mentioned above; it relies on a
        // FPComparisonToX86Predicate table that is not defined in the visible code.
        /*
        enum ubyte predicateNumber = FPComparisonToX86Predicate[comparison];
        enum bool invertOp = (predicateNumber & 0x80) != 0;
        static if(invertOp)
            return __builtin_ia32_cmpsd(b, a, predicateNumber & 0x7f);
        else
            return __builtin_ia32_cmpsd(a, b, predicateNumber & 0x7f);
        */
        enum ir = `
            %cmp = fcmp `~ FPComparisonToString[comparison] ~` float %0, %1
            %r = sext i1 %cmp to i32
            %r2 = bitcast i32 %r to float
            ret float %r2`;

        float4 r = a;
        r[0] = LDCInlineIR!(ir, float, float, float)(a[0], b[0]);
        return r;
    }
820 
    /// CMPSD-style comparisons
    /// clang implement it through x86 intrinsics, it is possible with IR alone
    /// but leads to less optimal code.
    /// PERF: try to implement it with __builtin_ia32_cmpsd and immediate 0 to 7.
    /// Not that simple.
    ///
    /// Only lane 0 is compared: lane 0 of the result is the sign-extended
    /// comparison outcome bit-cast to double (all bits set or all bits clear),
    /// the other lane is copied from `a`.
    package double2 cmpsd(FPComparison comparison)(double2 a, double2 b) pure @safe
    {
        enum ir = `
            %cmp = fcmp `~ FPComparisonToString[comparison] ~` double %0, %1
            %r = sext i1 %cmp to i64
            %r2 = bitcast i64 %r to double
            ret double %r2`;

        double2 r = a;
        r[0] = LDCInlineIR!(ir, double, double, double)(a[0], b[0]);
        return r;
    }
838 
    // Compares lane 0 of `a` and `b` with the given predicate and returns the
    // zero-extended outcome: 1 when the predicate holds, 0 otherwise.
    // Note: ucomss and ucomsd are left unimplemented
    package int comss(FPComparison comparison)(float4 a, float4 b) pure @safe
    {
        enum ir = `
            %cmp = fcmp `~ FPComparisonToString[comparison] ~` float %0, %1
            %r = zext i1 %cmp to i32
            ret i32 %r`;

        return LDCInlineIR!(ir, int, float, float)(a[0], b[0]);
    }
849 
    // Compares lane 0 of `a` and `b` with the given predicate and returns the
    // zero-extended outcome: 1 when the predicate holds, 0 otherwise.
    // Note: ucomss and ucomsd are left unimplemented
    package int comsd(FPComparison comparison)(double2 a, double2 b) pure @safe
    {
        enum ir = `
            %cmp = fcmp `~ FPComparisonToString[comparison] ~` double %0, %1
            %r = zext i1 %cmp to i32
            ret i32 %r`;

        return LDCInlineIR!(ir, int, double, double)(a[0], b[0]);
    }
860 }
861 else
862 {
863     /// Provides packed float comparisons
864     package int4 cmpps(FPComparison comparison)(float4 a, float4 b) pure @trusted
865     {
866         int4 result;
867         foreach(i; 0..4)
868         {
869             result.ptr[i] = compareFloat!float(comparison, a.array[i], b.array[i]) ? -1 : 0;
870         }
871         return result;
872     }
873 
874     /// Provides packed double comparisons
875     package long2 cmppd(FPComparison comparison)(double2 a, double2 b) pure @trusted
876     {
877         long2 result;
878         foreach(i; 0..2)
879         {
880             result.ptr[i] = compareFloat!double(comparison, a.array[i], b.array[i]) ? -1 : 0;
881         }
882         return result;
883     }
884 
885     /// Provides CMPSS-style comparison
886     package float4 cmpss(FPComparison comparison)(float4 a, float4 b) pure @trusted
887     {
888         int4 result = cast(int4)a;
889         result.ptr[0] = compareFloat!float(comparison, a.array[0], b.array[0]) ? -1 : 0;
890         return cast(float4)result;
891     }
892 
893     /// Provides CMPSD-style comparison
894     package double2 cmpsd(FPComparison comparison)(double2 a, double2 b) pure @trusted
895     {
896         long2 result = cast(long2)a;
897         result.ptr[0] = compareFloat!double(comparison, a.array[0], b.array[0]) ? -1 : 0;
898         return cast(double2)result;
899     }
900 
901     package int comss(FPComparison comparison)(float4 a, float4 b) pure @safe
902     {
903         return compareFloat!float(comparison, a.array[0], b.array[0]) ? 1 : 0;
904     }
905 
906     // Note: ucomss and ucomsd are left unimplemented
907     package int comsd(FPComparison comparison)(double2 a, double2 b) pure @safe
908     {
909         return compareFloat!double(comparison, a.array[0], b.array[0]) ? 1 : 0;
910     }
911 }
unittest // cmpps
{
    // Applies one predicate to whole vectors and compares the resulting lane masks.
    void expectMask(FPComparison predicate)(float4 lhs, float4 rhs, int[4] wanted)
    {
        int4 mask = cmpps!predicate(lhs, rhs);
        assert(mask.array == wanted);
    }

    // Lane by lane, A versus B is: less, equal, greater, unordered (NaN operand).
    float4 A = [1, 3, 5, float.nan];
    float4 B = [2, 3, 4, 5];

    // Ordered predicates: the NaN lane never matches.
    expectMask!(FPComparison.oeq)(A, B, [ 0,-1, 0, 0]);
    expectMask!(FPComparison.ogt)(A, B, [ 0, 0,-1, 0]);
    expectMask!(FPComparison.oge)(A, B, [ 0,-1,-1, 0]);
    expectMask!(FPComparison.olt)(A, B, [-1, 0, 0, 0]);
    expectMask!(FPComparison.ole)(A, B, [-1,-1, 0, 0]);
    expectMask!(FPComparison.one)(A, B, [-1, 0,-1, 0]);
    expectMask!(FPComparison.ord)(A, B, [-1,-1,-1, 0]);

    // Unordered predicates: the NaN lane always matches.
    expectMask!(FPComparison.ueq)(A, B, [ 0,-1, 0,-1]);
    expectMask!(FPComparison.ugt)(A, B, [ 0, 0,-1,-1]);
    expectMask!(FPComparison.uge)(A, B, [ 0,-1,-1,-1]);
    expectMask!(FPComparison.ult)(A, B, [-1, 0, 0,-1]);
    expectMask!(FPComparison.ule)(A, B, [-1,-1, 0,-1]);
    expectMask!(FPComparison.une)(A, B, [-1, 0,-1,-1]);
    expectMask!(FPComparison.uno)(A, B, [ 0, 0, 0,-1]);
}
unittest // cmppd
{
    // Lane 0: 1 < 2 holds, so the mask is all ones. Lane 1: 3 < 3 does not hold.
    double2 lhs = [1, 3];
    double2 rhs = [2, 3];
    static immutable long[2] wanted = [cast(long)(-1), 0];

    long2 mask = cmppd!(FPComparison.ult)(lhs, rhs);
    assert(mask.array == wanted);
}
unittest // cmpss and comss
{
    // Checks cmpss and comss for one predicate on one pair of operands.
    void checkOne(FPComparison pred)(float4 lhs, float4 rhs)
    {
        // Reference answer for lane 0, expressed as an all-ones / all-zeros mask.
        int wanted = compareFloat!float(pred, lhs.array[0], rhs.array[0]) ? -1 : 0;

        // cmpss: lane 0 holds the mask, lanes 1 to 3 come from the first operand.
        float4 masked = cmpss!pred(lhs, rhs);
        int4 maskedBits = cast(int4)masked;
        assert(maskedBits.array[0] == wanted);
        assert(masked.array[1] == lhs.array[1]);
        assert(masked.array[2] == lhs.array[2]);
        assert(masked.array[3] == lhs.array[3]);

        // comss: must agree with the mask on whether the predicate holds.
        int flag = comss!pred(lhs, rhs);
        assert( (wanted != 0) == (flag != 0) );
    }

    // Checks one predicate against an ordinary second operand, then against
    // a second operand whose lane 0 is NaN.
    void checkPredicate(FPComparison pred)(float4 lhs, float4 ordinary, float4 withNaN)
    {
        checkOne!pred(lhs, ordinary);
        checkOne!pred(lhs, withNaN);
    }

    float4 A = [1, 3, 5, 6];
    float4 B = [2, 3, 4, 5];
    float4 C = [float.nan, 3, 4, 5];

    // Every comparison type is exercised, ordered ones first.
    checkPredicate!(FPComparison.oeq)(A, B, C);
    checkPredicate!(FPComparison.ogt)(A, B, C);
    checkPredicate!(FPComparison.oge)(A, B, C);
    checkPredicate!(FPComparison.olt)(A, B, C);
    checkPredicate!(FPComparison.ole)(A, B, C);
    checkPredicate!(FPComparison.one)(A, B, C);
    checkPredicate!(FPComparison.ord)(A, B, C);
    checkPredicate!(FPComparison.ueq)(A, B, C);
    checkPredicate!(FPComparison.ugt)(A, B, C);
    checkPredicate!(FPComparison.uge)(A, B, C);
    checkPredicate!(FPComparison.ult)(A, B, C);
    checkPredicate!(FPComparison.ule)(A, B, C);
    checkPredicate!(FPComparison.une)(A, B, C);
    checkPredicate!(FPComparison.uno)(A, B, C);
}
unittest // cmpsd and comsd
{
    // Checks cmpsd and comsd for one predicate on one pair of operands.
    void checkOne(FPComparison pred)(double2 lhs, double2 rhs)
    {
        // Reference answer for lane 0, expressed as an all-ones / all-zeros mask.
        long wanted = compareFloat!double(pred, lhs.array[0], rhs.array[0]) ? -1 : 0;

        // cmpsd: lane 0 holds the mask, lane 1 comes from the first operand.
        double2 masked = cmpsd!pred(lhs, rhs);
        long2 maskedBits = cast(long2)masked;
        assert(maskedBits.array[0] == wanted);
        assert(masked.array[1] == lhs.array[1]);

        // comsd: must agree with the mask on whether the predicate holds.
        int flag = comsd!pred(lhs, rhs);
        assert( (wanted != 0) == (flag != 0) );
    }

    // Checks one predicate against an ordinary second operand, then against
    // a second operand whose lane 0 is NaN.
    void checkPredicate(FPComparison pred)(double2 lhs, double2 ordinary, double2 withNaN)
    {
        checkOne!pred(lhs, ordinary);
        checkOne!pred(lhs, withNaN);
    }

    double2 A = [1, 3];
    double2 B = [2, 4];
    double2 C = [double.nan, 5];

    // Every comparison type is exercised, ordered ones first.
    checkPredicate!(FPComparison.oeq)(A, B, C);
    checkPredicate!(FPComparison.ogt)(A, B, C);
    checkPredicate!(FPComparison.oge)(A, B, C);
    checkPredicate!(FPComparison.olt)(A, B, C);
    checkPredicate!(FPComparison.ole)(A, B, C);
    checkPredicate!(FPComparison.one)(A, B, C);
    checkPredicate!(FPComparison.ord)(A, B, C);
    checkPredicate!(FPComparison.ueq)(A, B, C);
    checkPredicate!(FPComparison.ugt)(A, B, C);
    checkPredicate!(FPComparison.uge)(A, B, C);
    checkPredicate!(FPComparison.ult)(A, B, C);
    checkPredicate!(FPComparison.ule)(A, B, C);
    checkPredicate!(FPComparison.une)(A, B, C);
    checkPredicate!(FPComparison.uno)(A, B, C);
}
1071 
1072 //
1073 //  </FLOATING-POINT COMPARISONS>
1074 //
1075 
1076 
/// Narrows a 128-bit integer vector to `__m64` by keeping its low 64 bits
/// (lane 0 of the `long2` view of `a`); the upper 64 bits are discarded.
__m64 to_m64(__m128i a) pure @trusted
{
    long2 wide = cast(long2)a;
    long1 low;
    low.ptr[0] = wide.array[0];
    return low;
}
1084 
/// Widens an `__m64` to a 128-bit integer vector: the 64 bits of `a` land in
/// the low half and the high half is zero.
__m128i to_m128i(__m64 a) pure @trusted
{
    long2 wide = [0, 0];        // upper lane stays zero
    wide.ptr[0] = a.array[0];
    return cast(__m128i)wide;
}
1091 
1092 // SOME NEON INTRINSICS
1093 // Emulating some x86 intrinsics needs access to a range of ARM intrinsics.
// They are not part of the public API, although the simde project exposes all of them to its users.
1095 // MAYDO: create a new neon.d module, for internal use only.
1096 // MAYDO: port them to ARM32 so that ARM32 can be as fast as ARM64.
1097 static if (LDC_with_ARM64)
1098 {
1099     // VERY USEFUL LINK
1100     // https://github.com/ldc-developers/llvm-project/blob/ldc-release/11.x/llvm/include/llvm/IR/IntrinsicsAArch64.td
1101 
1102     pragma(LDC_intrinsic, "llvm.aarch64.neon.addp.v8i8")
1103         byte8 vpadd_u8(byte8 a, byte8 b) pure @safe;
1104 
1105     byte8 vand_u8(byte8 a, byte8 b) pure @safe
1106     {
1107         return a & b;
1108     }
1109 
1110     int4 vcombine_s32(int2 lo, int2 hi) pure @trusted
1111     {
1112         int4 r;
1113         r.ptr[0] = lo.array[0];
1114         r.ptr[1] = lo.array[1];
1115         r.ptr[2] = hi.array[0];
1116         r.ptr[3] = hi.array[1];
1117         return r;
1118     }
1119 
1120     byte16 vcombine_s8(byte8 lo, byte8 hi) pure @trusted
1121     {
1122         byte16 r;
1123         r.ptr[0]  = lo.array[0];
1124         r.ptr[1]  = lo.array[1];
1125         r.ptr[2]  = lo.array[2];
1126         r.ptr[3]  = lo.array[3];
1127         r.ptr[4]  = lo.array[4];
1128         r.ptr[5]  = lo.array[5];
1129         r.ptr[6]  = lo.array[6];
1130         r.ptr[7]  = lo.array[7];
1131         r.ptr[8]  = hi.array[0];
1132         r.ptr[9]  = hi.array[1];
1133         r.ptr[10] = hi.array[2];
1134         r.ptr[11] = hi.array[3];
1135         r.ptr[12] = hi.array[4];
1136         r.ptr[13] = hi.array[5];
1137         r.ptr[14] = hi.array[6];
1138         r.ptr[15] = hi.array[7];
1139         return r;
1140     }
1141 
1142     pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtms.v4i32.v4f32")
1143         int4 vcvtmq_s32_f32(float4 a) pure @safe;
1144 
1145     pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtns.v4i32.v4f32")
1146         int4 vcvtnq_s32_f32(float4 a) pure @safe;
1147 
1148     pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtps.v4i32.v4f32")
1149         int4 vcvtpq_s32_f32(float4 a) pure @safe;
1150 
1151     pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtzs.v4i32.v4f32")
1152         int4 vcvtzq_s32_f32(float4 a) pure @safe;
1153 
1154     pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtms.i32.f32")
1155         int vcvtms_s32_f32(float a) pure @safe;
1156 
1157     pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtns.i32.f32")
1158         int vcvtns_s32_f32(float a) pure @safe;    
1159 
1160     pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtps.i32.f32")
1161         int vcvtps_s32_f32(float a) pure @safe;
1162 
1163     pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtzs.i32.f32")
1164         int vcvts_s32_f32(float a) pure @safe;
1165      
1166     pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtms.i32.f64")
1167         int vcvtms_s32_f64(double a) pure @safe;
1168 
1169     pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtns.i32.f64")
1170         int vcvtns_s32_f64(double a) pure @safe;    
1171 
1172     pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtps.i32.f64")
1173         int vcvtps_s32_f64(double a) pure @safe;
1174 
1175     pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtzs.i32.f64")
1176         int vcvts_s32_f64(double a) pure @safe;
1177 
1178     pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtms.i64.f32")
1179         long vcvtms_s64_f32(float a) pure @safe;
1180 
1181     pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtns.i64.f32")
1182         long vcvtns_s64_f32(float a) pure @safe;    
1183 
1184     pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtps.i64.f32")
1185         long vcvtps_s64_f32(float a) pure @safe;
1186 
1187     pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtzs.i64.f32")
1188         long vcvts_s64_f32(float a) pure @safe;
1189 
1190     pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtms.i64.f64")
1191         long vcvtms_s64_f64(double a) pure @safe;
1192 
1193     pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtns.i64.f64")
1194         long vcvtns_s64_f64(double a) pure @safe;    
1195 
1196     pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtps.i64.f64")
1197         long vcvtps_s64_f64(double a) pure @safe;
1198 
1199     pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtzs.i64.f64")
1200         long vcvts_s64_f64(double a) pure @safe;
1201 
1202     short4 vget_high_s16(short8 a) pure @trusted
1203     {
1204         short4 r;
1205         r.ptr[0] = a.array[4];
1206         r.ptr[1] = a.array[5];
1207         r.ptr[2] = a.array[6];
1208         r.ptr[3] = a.array[7];
1209         return r;
1210     }
1211 
1212     int2 vget_high_s32(int4 a) pure @trusted
1213     {
1214         int2 r;
1215         r.ptr[0] = a.array[2];
1216         r.ptr[1] = a.array[3];
1217         return r;
1218     }
1219 
1220     byte8 vget_high_u8(byte16 a) pure @trusted
1221     {
1222         byte8 r;
1223         r.ptr[0] = a.array[8];
1224         r.ptr[1] = a.array[9];
1225         r.ptr[2] = a.array[10];
1226         r.ptr[3] = a.array[11];
1227         r.ptr[4] = a.array[12];
1228         r.ptr[5] = a.array[13];
1229         r.ptr[6] = a.array[14];
1230         r.ptr[7] = a.array[15];
1231         return r;
1232     }
1233 
1234     short4 vget_low_s16(short8 a) pure @trusted
1235     {
1236         short4 r;
1237         r.ptr[0] = a.array[0];
1238         r.ptr[1] = a.array[1];
1239         r.ptr[2] = a.array[2];
1240         r.ptr[3] = a.array[3];
1241         return r;
1242     } 
1243 
1244     int2 vget_low_s32(int4 a) pure @trusted
1245     {
1246         int2 r;
1247         r.ptr[0] = a.array[0];
1248         r.ptr[1] = a.array[1];
1249         return r;
1250     }
1251 
1252     byte8 vget_low_u8(byte16 a) pure @trusted
1253     {
1254         byte8 r;
1255         r.ptr[0] = a.array[0];
1256         r.ptr[1] = a.array[1];
1257         r.ptr[2] = a.array[2];
1258         r.ptr[3] = a.array[3];
1259         r.ptr[4] = a.array[4];
1260         r.ptr[5] = a.array[5];
1261         r.ptr[6] = a.array[6];
1262         r.ptr[7] = a.array[7];
1263         return r;
1264     }
1265 
1266     pragma(LDC_intrinsic, "llvm.aarch64.neon.smax.v8i16")
1267         short8 vmaxq_s16(short8 a, short8 b) pure @safe;
1268 
1269     pragma(LDC_intrinsic, "llvm.aarch64.neon.smin.v8i16")
1270         short8 vminq_s16(short8 a, short8 b) pure @safe;
1271 
1272     int4 vmull_s16(short4 a, short4 b) pure @trusted
1273     {
1274         int4 r;
1275         r.ptr[0] = a.array[0] * b.array[0];
1276         r.ptr[1] = a.array[1] * b.array[1];
1277         r.ptr[2] = a.array[2] * b.array[2];
1278         r.ptr[3] = a.array[3] * b.array[3];
1279         return r;
1280     }
1281 
1282     static if(__VERSION__ >= 2088) // LDC 1.18 start using LLVM9 who changes the name of the builtin
1283     {
1284         pragma(LDC_intrinsic, "llvm.aarch64.neon.faddp.v4f32")
1285             float4 vpaddq_f32(float4 a, float4 b) pure @safe;
1286     }
1287     else
1288     {
1289         pragma(LDC_intrinsic, "llvm.aarch64.neon.addp.v4f32")
1290             float4 vpaddq_f32(float4 a, float4 b) pure @safe;
1291     }
1292 
1293     pragma(LDC_intrinsic, "llvm.aarch64.neon.addp.v2i32")
1294         int2 vpadd_s32(int2 a, int2 b) pure @safe;
1295 
1296     pragma(LDC_intrinsic, "llvm.aarch64.neon.addp.v16i8")
1297         byte16 vpaddq_s8(byte16 a, byte16 b) pure @safe;
1298 
1299     pragma(LDC_intrinsic, "llvm.aarch64.neon.sqxtn.v8i8")
1300         byte8 vqmovn_s16(short8 a) pure @safe;
1301 
1302     pragma(LDC_intrinsic, "llvm.aarch64.neon.sqxtun.v8i8")
1303         byte8 vqmovun_s16(short8 a) pure @safe;
1304 
1305     pragma(LDC_intrinsic, "llvm.aarch64.neon.urhadd.v16i8")
1306         byte16 vrhadd_u8(byte16 a, byte16 b) pure @safe;
1307 
1308     pragma(LDC_intrinsic, "llvm.aarch64.neon.urhadd.v8i16")
1309         short8 vrhadd_u16(short8 a, short8 b) pure @safe;
1310 
1311     byte8 vshr_u8(byte8 a, byte8 b) pure @safe
1312     {
1313         return a >>> b;
1314     }
1315 }
1316