1 /**
2 * Internal stuff only, do not import.
3 *
4 * Copyright: Copyright Auburn Sounds 2016-2018, Stefanos Baziotis 2019.
5 * License:   $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
6 * Authors:   Guillaume Piolat
7 */
8 module inteli.internals;
9 
10 import inteli.types;
11 
12 // The only math functions needed for intel-intrinsics
13 public import core.math: sqrt; // since it's an intrinsics
14 public import std.math: abs; // `fabs` is broken with GCC 4.9.2 on Linux 64-bit
15 
16 
// Compile-time capability flags.
//
// Every compiler branch below defines the complete set of `GDC_with_*` and
// `LDC_with_*` enums, so that code elsewhere can test any of them with
// `static if` regardless of which compiler is in use. Flags that do not apply
// to the current compiler/target are simply `false`.
version(GNU)
{
    version (X86)
    {
        // For 32-bit x86, disable vector extensions with GDC. 
        // It just doesn't work well.
        enum GDC_with_x86 = true;
        enum GDC_with_MMX = false;
        enum GDC_with_SSE = false;
        enum GDC_with_SSE2 = false;
        enum GDC_with_SSE3 = false;
        enum LDC_with_ARM32 = false;
        enum LDC_with_ARM64 = false;
        enum LDC_with_SSE1 = false;
        enum LDC_with_SSE2 = false;
        enum LDC_with_SSE3 = false;
    }
    else version (X86_64)
    {
        // GDC support uses extended inline assembly:
        //   https://gcc.gnu.org/onlinedocs/gcc/Extended-Asm.html        (general information and hints)
        //   https://gcc.gnu.org/onlinedocs/gcc/Simple-Constraints.html  (binding variables to registers)
        //   https://gcc.gnu.org/onlinedocs/gcc/Machine-Constraints.html (x86 specific register short names)

        public import core.simd;

        // NOTE: These intrinsics are not available in every i386 and x86_64 CPU.
        // For more info: https://gcc.gnu.org/onlinedocs/gcc-4.9.2/gcc/X86-Built-in-Functions.html 
        public import gcc.builtins;
                
        enum GDC_with_x86 = true;
        enum GDC_with_MMX = true; // We don't have a way to detect that at CT, but we assume it's there
        enum GDC_with_SSE = true; // We don't have a way to detect that at CT, but we assume it's there
        enum GDC_with_SSE2 = true; // We don't have a way to detect that at CT, but we assume it's there
        enum GDC_with_SSE3 = false; // TODO: we don't have a way to detect that at CT
        enum LDC_with_ARM32 = false;
        enum LDC_with_ARM64 = false;
        enum LDC_with_SSE1 = false;
        enum LDC_with_SSE2 = false;
        enum LDC_with_SSE3 = false;
    }
    else
    {
        // GDC on a non-x86 target: no capability is advertised.
        enum GDC_with_x86 = false;
        enum GDC_with_MMX = false;
        enum GDC_with_SSE = false;
        enum GDC_with_SSE2 = false;
        enum GDC_with_SSE3 = false;
        enum LDC_with_ARM32 = false;
        enum LDC_with_ARM64 = false;
        enum LDC_with_SSE1 = false;
        enum LDC_with_SSE2 = false;
        enum LDC_with_SSE3 = false;
    }
}
else version(LDC)
{
    public import core.simd;
    public import ldc.simd;
    public import ldc.intrinsics;
    public import ldc.llvmasm: __asm;

    // Since LDC 1.13, using the new ldc.llvmasm.__ir variants instead of inlineIR
    static if (__VERSION__ >= 2083)
    {
         import ldc.llvmasm;
         alias LDCInlineIR = __ir_pure;

         // A version of inline IR with prefix/suffix didn't exist before LDC 1.13
         alias LDCInlineIREx = __irEx_pure; 
    }
    else
    {
        // Older front-ends: only the plain inline-IR entry point is aliased,
        // `LDCInlineIREx` is not defined in this branch.
        alias LDCInlineIR = inlineIR;
    }
    
    // The GDC flags are all false under LDC.
    package(inteli)
    {
        enum GDC_with_x86 = false;
        enum GDC_with_MMX = false;
        enum GDC_with_SSE = false;
        enum GDC_with_SSE2 = false;
        enum GDC_with_SSE3 = false;
    }

    version(ARM)
    {
        // 32-bit ARM: the GCC-style ARM builtins are imported.
        public import ldc.gccbuiltins_arm;
        enum LDC_with_ARM32 = true;
        enum LDC_with_ARM64 = false;
        enum LDC_with_SSE1 = false;
        enum LDC_with_SSE2 = false;
        enum LDC_with_SSE3 = false;
    }
    else version(AArch64)
    {
        // 64-bit ARM: no builtin module is imported here (import left commented out).
        //public import ldc.gccbuiltins_arm;
        enum LDC_with_ARM32 = false;
        enum LDC_with_ARM64 = true;
        enum LDC_with_SSE1 = false;
        enum LDC_with_SSE2 = false;
        enum LDC_with_SSE3 = false;
    }
    else
    {
        // Any other LDC target is treated as x86: the SSE levels are queried
        // from the target features selected at compile time.
        public import ldc.gccbuiltins_x86;
        enum LDC_with_ARM32 = false;
        enum LDC_with_ARM64 = false;
        enum LDC_with_SSE1 = __traits(targetHasFeature, "sse");
        enum LDC_with_SSE2 = __traits(targetHasFeature, "sse2");
        enum LDC_with_SSE3 = __traits(targetHasFeature, "sse3");
    }
}
else version(DigitalMars)
{
    // DMD: none of the GDC/LDC specific capabilities exist.
    package(inteli)
    {
        enum GDC_with_x86 = false;
        enum GDC_with_MMX = false;
        enum GDC_with_SSE = false;
        enum GDC_with_SSE2 = false;
        enum GDC_with_SSE3 = false;
        enum LDC_with_ARM32 = false;
        enum LDC_with_ARM64 = false;
        enum LDC_with_SSE1 = false;
        enum LDC_with_SSE2 = false;
        enum LDC_with_SSE3 = false;
    }
}
else
{
    // Fail the build early on a compiler none of the branches above knows.
    static assert(false, "Unknown compiler");
}
150 
// True when LDC targets either 32-bit or 64-bit ARM.
enum LDC_with_ARM = LDC_with_ARM32 | LDC_with_ARM64; // ARM32 is largely unsupported though

// 32-bit ARM accessors for the floating-point control word.
// They carry the same names as the ARM64 versions so callers can use
// `arm_get_fpcr` / `arm_set_fpcr` on both targets.
static if (LDC_with_ARM32)
{
    /// Returns the value given by the `__builtin_arm_get_fpscr` builtin.
    package uint arm_get_fpcr() nothrow @nogc @trusted
    {
        return __builtin_arm_get_fpscr();
    }

    /// Passes `cw` to the `__builtin_arm_set_fpscr` builtin.
    package void arm_set_fpcr(uint cw) nothrow @nogc @trusted
    {
        __builtin_arm_set_fpscr(cw);
    }
}
165 
// 64-bit ARM accessors for the floating-point control register (FPCR).
static if (LDC_with_ARM64)
{
    // Binding to the LLVM intrinsic that reads FPCR; it yields a 64-bit value.
    pragma(LDC_intrinsic, "llvm.aarch64.get.fpcr")
        long __builtin_aarch64_get_fpcr() pure nothrow @nogc @safe;

    /// Returns the low 32 bits of the value read through the FPCR intrinsic.
    // NOTE(review): declared `pure` although it reads a control register --
    // presumably so that `pure` callers can use it; confirm this is intended.
    package uint arm_get_fpcr() pure nothrow @nogc @trusted
    {
        return cast(uint) __builtin_aarch64_get_fpcr();
    }

    /// Writes `cw` into FPCR using inline assembly.
    package void arm_set_fpcr(uint cw) nothrow @nogc @trusted
    {
        // Note: there doesn't seem to be an intrinsic in LLVM to set FPCR.
        // x2 is used as a temporary: it is spilled to `save_x2` first and
        // reloaded from there after the `msr`.
        long save_x2;
        __asm!void("str x2, $1 \n" ~
                   "ldr w2, $0 \n" ~
                   "msr fpcr, x2 \n" ~
                   "ldr x2, $1 "   , "m,m", cw, &save_x2);
    }
}
186 
// Inline-assembler availability flags for DMD.
//   DMD_with_asm       : DMD with its x86 or x86_64 inline assembler.
//   DMD_with_32bit_asm : DMD with the 32-bit x86 inline assembler only
//                        (sometimes you want a 32-bit DMD only solution).
// Both are false for every other compiler.
version(DigitalMars)
{
    version(D_InlineAsm_X86)
    {
        enum DMD_with_asm = true;
        enum DMD_with_32bit_asm = true;
    }
    else version(D_InlineAsm_X86_64)
    {
        enum DMD_with_asm = true;
        enum DMD_with_32bit_asm = false;
    }
    else
    {
        enum DMD_with_asm = false;
        enum DMD_with_32bit_asm = false;
    }
}
else
{
    enum DMD_with_asm = false;
    enum DMD_with_32bit_asm = false;
}
206 
207 
208 package:
209 nothrow @nogc:
210 
211 
// ARM-side encodings of rounding mode and flush-to-zero.
// Internal only: the public API exposes the x86 values and emulates x86 semantics.
// The rounding mode is a 2-bit field starting at bit 22; flush-to-zero is bit 24.
enum uint _MM_ROUND_NEAREST_ARM     = 0u << 22; // 0x00000000
enum uint _MM_ROUND_DOWN_ARM        = 2u << 22; // 0x00800000
enum uint _MM_ROUND_UP_ARM          = 1u << 22; // 0x00400000
enum uint _MM_ROUND_TOWARD_ZERO_ARM = 3u << 22; // 0x00C00000
enum uint _MM_ROUND_MASK_ARM        = 3u << 22; // 0x00C00000, covers the whole field
enum uint _MM_FLUSH_ZERO_MASK_ARM   = 1u << 24; // 0x01000000
219 
220 
221 //
222 //  <ROUNDING>
223 //
//  Why is this here? For DMD, we cannot use rint because _MM_SET_ROUNDING_MODE
//  doesn't change the FPU rounding mode, and isn't expected to do so.
//  So we devised these rounding functions to get consistent rounding between
//  LDC and DMD. It's important that DMD uses what is in MXCSR to round.
//
//  Note: There is no MXCSR on ARM, but FPSCR (FPCR on AArch64) provides
//  similar functionality.
//  https://developer.arm.com/documentation/dui0068/b/vector-floating-point-programming/vfp-system-registers/fpscr--the-floating-point-status-and-control-register
//  We use FPSCR since it's thread-local, so we can emulate those x86 conversions, albeit slowly.
234 
/// Converts `value` to a 32-bit signed integer using the rounding mode that is
/// currently selected: MXCSR on x86, the FPSCR/FPCR rounding field on ARM.
///
/// Params:
///     value = the single-precision number to convert
/// Returns: the rounded integer.
int convertFloatToInt32UsingMXCSR(float value) @trusted
{
    int result;
    version(GNU)
    {
        // GDC extended asm: "x" binds `value` to an SSE register.
        asm pure nothrow @nogc @trusted
        {
            "cvtss2si %1, %0\n": "=r"(result) : "x" (value);
        }
    }
    else static if (LDC_with_ARM32)
    {
        // s2 is used as a scratch register by this sequence, so it has to be
        // listed as clobbered; otherwise the optimizer may keep a live value in it.
        result = __asm!int(`vldr s2, $1
                            vcvtr.s32.f32 s2, s2
                            vmov $0, s2`, "=r,m,~{s2}", value);
    }
    else static if (LDC_with_ARM64)
    {
        // Get current rounding mode.
        uint fpscr = arm_get_fpcr();

        // One conversion instruction per rounding mode. Each sequence loads
        // the operand into s2, which is therefore declared as clobbered.
        switch(fpscr & _MM_ROUND_MASK_ARM)
        {
            default:
            case _MM_ROUND_NEAREST_ARM:
                result = __asm!int(`ldr s2, $1
                                    fcvtns $0,s2`, "=r,m,~{s2}", value);
                break;
            case _MM_ROUND_DOWN_ARM:
                result = __asm!int(`ldr s2, $1
                                    fcvtms $0,s2`, "=r,m,~{s2}", value);
                break;
            case _MM_ROUND_UP_ARM:
                result = __asm!int(`ldr s2, $1
                                    fcvtps $0,s2`, "=r,m,~{s2}", value);
                break;
            case _MM_ROUND_TOWARD_ZERO_ARM:
                // A plain D cast already truncates toward zero.
                result = cast(int)value;
                break;
        }
    }
    else
    {        
        // DMD-style inline asm (also accepted by LDC on x86).
        asm pure nothrow @nogc @trusted
        {
            cvtss2si EAX, value;
            mov result, EAX;
        }
    }
    return result;
}
286 
/// Converts `value` to a 32-bit signed integer using the rounding mode that is
/// currently selected: MXCSR on x86, the FPSCR/FPCR rounding field on ARM.
///
/// Params:
///     value = the double-precision number to convert
/// Returns: the rounded integer.
int convertDoubleToInt32UsingMXCSR(double value) @trusted
{
    int result;
    version(GNU)
    {
        // GDC extended asm: "x" binds `value` to an SSE register.
        asm pure nothrow @nogc @trusted
        {
            "cvtsd2si %1, %0\n": "=r"(result) : "x" (value);
        }
    }
    else static if (LDC_with_ARM32)
    {
        // Both d2 (operand) and s2 (result) are written by this sequence, so
        // both are listed as clobbered.
        result = __asm!int(`vldr d2, $1
                            vcvtr.s32.f64 s2, d2
                            vmov $0, s2`, "=r,m,~{s2},~{d2}", value);
    }
    else static if (LDC_with_ARM64)
    {
        // Get current rounding mode.
        uint fpscr = arm_get_fpcr();

        // One conversion instruction per rounding mode. Each sequence loads
        // the operand into d2, which is therefore declared as clobbered.
        switch(fpscr & _MM_ROUND_MASK_ARM)
        {
            default:
            case _MM_ROUND_NEAREST_ARM:
                result = __asm!int(`ldr d2, $1
                                    fcvtns $0,d2`, "=r,m,~{d2}", value);
                break;
            case _MM_ROUND_DOWN_ARM:
                result = __asm!int(`ldr d2, $1
                                    fcvtms $0,d2`, "=r,m,~{d2}", value);
                break;
            case _MM_ROUND_UP_ARM:
                result = __asm!int(`ldr d2, $1
                                    fcvtps $0,d2`, "=r,m,~{d2}", value);
                break;
            case _MM_ROUND_TOWARD_ZERO_ARM:
                // A plain D cast already truncates toward zero.
                result = cast(int)value;
                break;
        }
    }
    else
    {
        // DMD-style inline asm (also accepted by LDC on x86).
        asm pure nothrow @nogc @trusted
        {
            cvtsd2si EAX, value;
            mov result, EAX;
        }
    }
    return result;
}
338 
/// Converts `value` to a 64-bit signed integer using the rounding mode that is
/// currently selected: MXCSR on x86, the FPSCR/FPCR rounding field on ARM.
///
/// Params:
///     value = the single-precision number to convert
/// Returns: the rounded integer.
long convertFloatToInt64UsingMXCSR(float value) @trusted
{
    static if (LDC_with_ARM32)
    {
        // We have to resort to libc since 32-bit ARM 
        // doesn't seem to have 64-bit registers.
        
        uint fpscr = arm_get_fpcr(); // Get current rounding mode.

        // Note: converting to double precision else rounding could be different for large integers
        double asDouble = value; 

        switch(fpscr & _MM_ROUND_MASK_ARM)
        {
            default:
            // llvm_rint and not llvm_round: llvm.round rounds halfway cases
            // away from zero, whereas round-to-nearest here (like MXCSR and the
            // ARM64 `fcvtns` path) rounds them to even. In this branch the
            // current mode is "nearest", which is what llvm.rint follows.
            case _MM_ROUND_NEAREST_ARM:     return cast(long)(llvm_rint(asDouble));
            case _MM_ROUND_DOWN_ARM:        return cast(long)(llvm_floor(asDouble));
            case _MM_ROUND_UP_ARM:          return cast(long)(llvm_ceil(asDouble));
            case _MM_ROUND_TOWARD_ZERO_ARM: return cast(long)(asDouble);
        }
    }
    else static if (LDC_with_ARM64)
    {
        uint fpscr = arm_get_fpcr();

        // One conversion instruction per rounding mode. Each sequence loads
        // the operand into s2, which is therefore declared as clobbered.
        switch(fpscr & _MM_ROUND_MASK_ARM)
        {
            default:
            case _MM_ROUND_NEAREST_ARM:
                return __asm!long(`ldr s2, $1
                                   fcvtns $0,s2`, "=r,m,~{s2}", value);
            case _MM_ROUND_DOWN_ARM:
                return __asm!long(`ldr s2, $1
                                   fcvtms $0,s2`, "=r,m,~{s2}", value);
            case _MM_ROUND_UP_ARM:
                return __asm!long(`ldr s2, $1
                                   fcvtps $0,s2`, "=r,m,~{s2}", value);
            case _MM_ROUND_TOWARD_ZERO_ARM:
                return cast(long)value;
        }
    }
    // 64-bit can use an SSE instruction
    else version(D_InlineAsm_X86_64)
    {
        long result;
        version(LDC) // work-around for " Data definition directives inside inline asm are not supported yet."
        {
            asm pure nothrow @nogc @trusted
            {
                movss XMM0, value;
                cvtss2si RAX, XMM0;
                mov result, RAX;
            }
        }
        else
        {
            asm pure nothrow @nogc @trusted
            {
                movss XMM0, value;
                db 0xf3; db 0x48; db 0x0f; db 0x2d; db 0xc0; // cvtss2si RAX, XMM0 (DMD refuses to emit)
                mov result, RAX;
            }
        }
        return result;
    }
    else version(D_InlineAsm_X86)
    {
        // In the case of 32-bit x86 there is no SSE2 way to convert FP to 64-bit int
        // This leads to an unfortunate FPU sequence in every C++ compiler.
        // See: https://godbolt.org/z/vZym77

        // Get current MXCSR rounding
        uint sseRounding;
        ushort savedFPUCW;
        ushort newFPUCW;
        long result;
        asm pure nothrow @nogc @trusted
        {
            stmxcsr sseRounding;
            fld value;
            fnstcw savedFPUCW;
            mov AX, savedFPUCW;
            and AX, 0xf3ff;          // clear FPU rounding bits
            movzx ECX, word ptr sseRounding;
            and ECX, 0x6000;         // only keep SSE rounding bits
            shr ECX, 3;
            or AX, CX;               // make a new control word for FPU with SSE bits
            mov newFPUCW, AX;
            fldcw newFPUCW;
            fistp qword ptr result;            // convert, respecting MXCSR (but not other control word things)
            fldcw savedFPUCW;
        }
        return result;
    }
    else static if (GDC_with_x86)
    {
        version(X86_64) // 64-bit can just use the right instruction
        {
            static assert(GDC_with_SSE);
            __m128 A;
            A.ptr[0] = value;
            return __builtin_ia32_cvtss2si64 (A);
        }
        else version(X86) // 32-bit
        {
            // This is untested!
            // Same x87 sequence as the DMD 32-bit path, in GCC extended-asm syntax.
            // NOTE(review): `value` is bound with "f" here while the double
            // variant uses "t", and the temporaries written by the asm are
            // declared as inputs -- confirm before relying on this path.
            uint sseRounding;
            ushort savedFPUCW;
            ushort newFPUCW;
            long result;
            asm pure nothrow @nogc @trusted
            {
                "stmxcsr %1;\n" ~
                "fld %2;\n" ~
                "fnstcw %3;\n" ~
                "movw %3, %%ax;\n" ~
                "andw $0xf3ff, %%ax;\n" ~
                "movzwl %1, %%ecx;\n" ~
                "andl $0x6000, %%ecx;\n" ~
                "shrl $3, %%ecx;\n" ~
                "orw %%cx, %%ax\n" ~
                "movw %%ax, %4;\n" ~
                "fldcw %4;\n" ~
                "fistpll %0;\n" ~
                "fldcw %3;\n" 
                  : "=m"(result)    // %0
                  : "m" (sseRounding),
                    "f" (value),
                    "m" (savedFPUCW),
                    "m" (newFPUCW) 
                  : "eax", "ecx", "st";
            }
            return result;
        }
        else
            static assert(false);
    }
    else
        static assert(false);
}
479 
480 
/// Converts `value` to a 64-bit signed integer using the rounding mode that is
/// currently selected: MXCSR on x86, the FPSCR/FPCR rounding field on ARM.
///
/// Params:
///     value = the double-precision number to convert
/// Returns: the rounded integer.
long convertDoubleToInt64UsingMXCSR(double value) @trusted
{
    static if (LDC_with_ARM32)
    {
        // We have to resort to libc since 32-bit ARM 
        // doesn't seem to have 64-bit registers.
        uint fpscr = arm_get_fpcr(); // Get current rounding mode.
        switch(fpscr & _MM_ROUND_MASK_ARM)
        {
            default:
            // llvm_rint and not llvm_round: llvm.round rounds halfway cases
            // away from zero, whereas round-to-nearest here (like MXCSR and the
            // ARM64 `fcvtns` path) rounds them to even. In this branch the
            // current mode is "nearest", which is what llvm.rint follows.
            case _MM_ROUND_NEAREST_ARM:     return cast(long)(llvm_rint(value));
            case _MM_ROUND_DOWN_ARM:        return cast(long)(llvm_floor(value));
            case _MM_ROUND_UP_ARM:          return cast(long)(llvm_ceil(value));
            case _MM_ROUND_TOWARD_ZERO_ARM: return cast(long)(value);
        }
    }
    else static if (LDC_with_ARM64)
    {
        // Get current rounding mode.
        uint fpscr = arm_get_fpcr();

        // One conversion instruction per rounding mode. Each sequence loads
        // the operand into d2, which is therefore declared as clobbered.
        switch(fpscr & _MM_ROUND_MASK_ARM)
        {
            default:
            case _MM_ROUND_NEAREST_ARM:
                return __asm!long(`ldr d2, $1
                                   fcvtns $0,d2`, "=r,m,~{d2}", value);
            case _MM_ROUND_DOWN_ARM:
                return __asm!long(`ldr d2, $1
                                   fcvtms $0,d2`, "=r,m,~{d2}", value);
            case _MM_ROUND_UP_ARM:
                return __asm!long(`ldr d2, $1
                                   fcvtps $0,d2`, "=r,m,~{d2}", value);
            case _MM_ROUND_TOWARD_ZERO_ARM:
                return cast(long)value;
        }
    }
    // 64-bit can use an SSE instruction
    else version(D_InlineAsm_X86_64)
    {
        long result;
        version(LDC) // work-around for "Data definition directives inside inline asm are not supported yet."
        {
            asm pure nothrow @nogc @trusted
            {
                movsd XMM0, value;
                cvtsd2si RAX, XMM0;
                mov result, RAX;
            }
        }
        else
        {
            asm pure nothrow @nogc @trusted
            {
                movsd XMM0, value;
                db 0xf2; db 0x48; db 0x0f; db 0x2d; db 0xc0; // cvtsd2si RAX, XMM0 (DMD refuses to emit)
                mov result, RAX;
            }
        }
        return result;
    }
    else version(D_InlineAsm_X86)
    {
        // In the case of 32-bit x86 there is no SSE2 way to convert FP to 64-bit int
        // This leads to an unfortunate FPU sequence in every C++ compiler.
        // See: https://godbolt.org/z/vZym77

        // Get current MXCSR rounding
        uint sseRounding;
        ushort savedFPUCW;
        ushort newFPUCW;
        long result;
        asm pure nothrow @nogc @trusted
        {
            stmxcsr sseRounding;
            fld value;
            fnstcw savedFPUCW;
            mov AX, savedFPUCW;
            and AX, 0xf3ff;          // clear FPU rounding bits
            movzx ECX, word ptr sseRounding;
            and ECX, 0x6000;         // only keep SSE rounding bits
            shr ECX, 3;
            or AX, CX;               // make a new control word for FPU with SSE bits
            mov newFPUCW, AX;
            fldcw newFPUCW;
            fistp qword ptr result;  // explicit 64-bit store, same as the float variant
            fldcw savedFPUCW;
        }
        return result;
    }
    else static if (GDC_with_x86)
    {
        version(X86_64)
        {
            static assert(GDC_with_SSE2);
            __m128d A;
            A.ptr[0] = value;
            return __builtin_ia32_cvtsd2si64 (A);
        }
        else
        {
            // This is untested!
            // Same x87 sequence as the DMD 32-bit path, in GCC extended-asm syntax.
            // NOTE(review): `value` is bound with "t" here while the float
            // variant uses "f", and the temporaries written by the asm are
            // declared as inputs -- confirm before relying on this path.
            uint sseRounding;
            ushort savedFPUCW;
            ushort newFPUCW;
            long result;
            asm pure nothrow @nogc @trusted
            {
                "stmxcsr %1;\n" ~
                "fld %2;\n" ~
                "fnstcw %3;\n" ~
                "movw %3, %%ax;\n" ~
                "andw $0xf3ff, %%ax;\n" ~
                "movzwl %1, %%ecx;\n" ~
                "andl $0x6000, %%ecx;\n" ~
                "shrl $3, %%ecx;\n" ~
                "orw %%cx, %%ax\n" ~
                "movw %%ax, %4;\n" ~
                "fldcw %4;\n" ~
                "fistpll %0;\n" ~
                "fldcw %3;\n"         
                  : "=m"(result)    // %0
                  : "m" (sseRounding),
                    "t" (value),
                    "m" (savedFPUCW),
                    "m" (newFPUCW) 
                  : "eax", "ecx", "st";
            }
            return result;
        }
    }
    else
        static assert(false);
}
616 
617 //
618 //  </ROUNDING>
619 //
620 
621 
622 // using the Intel terminology here
623 
/// Narrows a signed 16-bit value to a signed 8-bit value, clamping it to
/// the range -128 .. 127 instead of wrapping.
byte saturateSignedWordToSignedByte(short value) pure @safe
{
    if (value >= byte.max)
        return byte.max;
    if (value <= byte.min)
        return byte.min;
    return cast(byte) value;
}
630 
/// Narrows a signed 16-bit value to an unsigned 8-bit value, clamping it to
/// the range 0 .. 255 instead of wrapping.
ubyte saturateSignedWordToUnsignedByte(short value) pure @safe
{
    immutable int clamped = (value < 0) ? 0
                          : (value > ubyte.max) ? ubyte.max
                          : value;
    return cast(ubyte) clamped;
}
637 
/// Narrows a signed 32-bit value to a signed 16-bit value, clamping it to
/// the range -32768 .. 32767 instead of wrapping.
short saturateSignedIntToSignedShort(int value) pure @safe
{
    immutable int bounded = (value > short.max) ? short.max
                          : (value < short.min) ? short.min
                          : value;
    return cast(short) bounded;
}
644 
/// Narrows a signed 32-bit value to an unsigned 16-bit value, clamping it to
/// the range 0 .. 65535 instead of wrapping.
ushort saturateSignedIntToUnsignedShort(int value) pure @safe
{
    if (value <= 0)
        return 0;
    return (value >= ushort.max) ? ushort.max : cast(ushort) value;
}
651 
unittest // test saturate operations
{
    // 16-bit -> signed 8-bit
    assert(saturateSignedWordToSignedByte(32000) == 127);
    assert(saturateSignedWordToSignedByte(-4000) == -128);

    // 16-bit -> unsigned 8-bit
    assert(saturateSignedWordToUnsignedByte(32000) == 255);
    assert(saturateSignedWordToUnsignedByte(-4000) == 0);

    // 32-bit -> signed 16-bit
    assert(saturateSignedIntToSignedShort(32768) == 32767);
    assert(saturateSignedIntToSignedShort(-32769) == -32768);

    // 32-bit -> unsigned 16-bit (32768 is in range and must pass through)
    assert(saturateSignedIntToUnsignedShort(32768) == 32768);
    assert(saturateSignedIntToUnsignedShort(-32769) == 0);
}
663 
version(unittest)
{
    // Debugging aids for the test-suite: each helper reinterprets a vector
    // with the lane type named in its suffix and prints the lanes on one line.
    import core.stdc.stdio: printf;

    // Note: you can override `pure` within a `debug` clause

    /// Prints the single 64-bit lane of `v`.
    void _mm_print_pi64(__m64 v) @trusted
    {
        printf("%lld\n", (cast(long1)v).array[0]);
    }

    /// Prints `v` as two 32-bit lanes.
    void _mm_print_pi32(__m64 v) @trusted
    {
        int2 lanes = cast(int2)v;
        printf("%d %d\n", lanes.array[0], lanes.array[1]);
    }

    /// Prints `v` as four 16-bit lanes.
    void _mm_print_pi16(__m64 v) @trusted
    {
        short4 lanes = cast(short4)v;
        short[4] s = lanes.array;
        printf("%d %d %d %d\n", s[0], s[1], s[2], s[3]);
    }

    /// Prints `v` as eight 8-bit lanes.
    void _mm_print_pi8(__m64 v) @trusted
    {
        byte8 lanes = cast(byte8)v;
        byte[8] b = lanes.array;
        printf("%d %d %d %d %d %d %d %d\n",
               b[0], b[1], b[2], b[3], b[4], b[5], b[6], b[7]);
    }

    /// Prints `v` as two 64-bit lanes.
    void _mm_print_epi64(__m128i v) @trusted
    {
        long2 lanes = cast(long2)v;
        long[2] q = lanes.array;
        printf("%lld %lld\n", q[0], q[1]);
    }

    /// Prints `v` as four 32-bit lanes.
    void _mm_print_epi32(__m128i v) @trusted
    {
        printf("%d %d %d %d\n",
               v.array[0], v.array[1], v.array[2], v.array[3]);
    }

    /// Prints `v` as eight 16-bit lanes.
    void _mm_print_epi16(__m128i v) @trusted
    {
        short8 lanes = cast(short8)v;
        short[8] s = lanes.array;
        printf("%d %d %d %d %d %d %d %d\n",
               s[0], s[1], s[2], s[3], s[4], s[5], s[6], s[7]);
    }

    /// Prints `v` as sixteen 8-bit lanes.
    void _mm_print_epi8(__m128i v) @trusted
    {
        byte16 lanes = cast(byte16)v;
        byte[16] b = lanes.array;
        printf("%d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d\n",
               b[0], b[1], b[2], b[3], b[4], b[5], b[6], b[7],
               b[8], b[9], b[10], b[11], b[12], b[13], b[14], b[15]);
    }

    /// Prints `v` as four single-precision lanes.
    void _mm_print_ps(__m128 v) @trusted
    {
        float4 lanes = cast(float4)v;
        float[4] f = lanes.array;
        printf("%f %f %f %f\n", f[0], f[1], f[2], f[3]);
    }

    /// Prints `v` as two double-precision lanes.
    void _mm_print_pd(__m128d v) @trusted
    {
        double2 lanes = cast(double2)v;
        double[2] d = lanes.array;
        printf("%f %f\n", d[0], d[1]);
    }
}
735 
736 
737 //
738 //  <FLOATING-POINT COMPARISONS>
739 //
740 // Note: `ldc.simd` cannot express all nuances of FP comparisons, so we
741 //       need different IR generation.
742 
/// Floating-point comparison predicates. The member names are used verbatim
/// as the predicate of the `fcmp` instruction in the inline IR built from
/// `FPComparisonToString`. "Ordered" means neither operand is NaN.
enum FPComparison
{
    oeq,   // ordered and equal
    ogt,   // ordered and greater than
    oge,   // ordered and greater than or equal
    olt,   // ordered and less than
    ole,   // ordered and less than or equal
    one,   // ordered and not equal
    ord,   // ordered (no nans)
    ueq,   // unordered or equal
    ugt,   // unordered or greater than ("nle")
    uge,   // unordered or greater than or equal ("nlt")
    ult,   // unordered or less than ("nge")
    ule,   // unordered or less than or equal ("ngt")
    une,   // unordered or not equal ("neq")
    uno,   // unordered (either nans)
}

// Predicate name for each FPComparison value, indexed by the enum value.
// Generated from the member names in declaration order, i.e.
// "oeq", "ogt", "oge", "olt", "ole", "one", "ord",
// "ueq", "ugt", "uge", "ult", "ule", "une", "uno".
private static immutable string[FPComparison.max+1] FPComparisonToString =
    [ __traits(allMembers, FPComparison) ];
778 
// Scalar evaluation of a single comparison predicate.
// Returns `true` when `comparison` holds for the pair (a, b) and `false`
// otherwise; callers map that to the lane mask they need.
// Useful for DMD and testing
private bool compareFloat(T)(FPComparison comparison, T a, T b) pure @safe
{
    import std.math : isNaN;

    // "Ordered" means that neither operand is a NaN.
    immutable bool ordered = !isNaN(a) && !isNaN(b);

    final switch(comparison) with(FPComparison)
    {
        // Ordered predicates: the plain D operators are already false
        // whenever a NaN is involved, except `!=`.
        case oeq: return a == b;
        case ogt: return a > b;
        case oge: return a >= b;
        case olt: return a < b;
        case ole: return a <= b;
        case one: return ordered && (a != b); // `!=` alone is true with a NaN
        case ord: return ordered;

        // Unordered predicates: true as soon as one operand is a NaN.
        case ueq: return !ordered || (a == b);
        case ugt: return !ordered || (a > b);
        case uge: return !ordered || (a >= b);
        case ult: return !ordered || (a < b);
        case ule: return !ordered || (a <= b);
        case une: return (a != b);            // `!=` is already true with a NaN
        case uno: return !ordered;
    }
}
803 
// Comparison primitives shared by the intrinsics.
//  - LDC: emitted as LLVM IR `fcmp` with the requested predicate.
//  - other compilers: evaluated lane by lane with `compareFloat`.
// cmpps/cmppd/cmpss/cmpsd produce all-ones (-1) for true and 0 for false;
// comss/comsd produce 1 for true and 0 for false.
version(LDC)
{
    /// Packed single-precision comparison: one 32-bit mask per lane.
    package int4 cmpps(FPComparison comparison)(float4 a, float4 b) pure @safe
    {
        enum code = `
            %cmp = fcmp `~ FPComparisonToString[comparison] ~` <4 x float> %0, %1
            %r = sext <4 x i1> %cmp to <4 x i32>
            ret <4 x i32> %r`;

        return LDCInlineIR!(code, int4, float4, float4)(a, b);
    }

    /// Packed double-precision comparison: one 64-bit mask per lane.
    package long2 cmppd(FPComparison comparison)(double2 a, double2 b) pure @safe
    {
        enum code = `
            %cmp = fcmp `~ FPComparisonToString[comparison] ~` <2 x double> %0, %1
            %r = sext <2 x i1> %cmp to <2 x i64>
            ret <2 x i64> %r`;

        return LDCInlineIR!(code, long2, double2, double2)(a, b);
    }

    /// CMPSS-style comparison: lane 0 receives the mask, the other lanes are
    /// copied from `a`.
    /// clang implements it through x86 intrinsics; it is possible with IR alone
    /// but leads to less optimal code.
    /// PERF: try to implement it with __builtin_ia32_cmpss and immediate 0 to 7
    /// (swapping the operands for predicates that need it). Not that simple.
    package float4 cmpss(FPComparison comparison)(float4 a, float4 b) pure @safe
    {
        enum code = `
            %cmp = fcmp `~ FPComparisonToString[comparison] ~` float %0, %1
            %r = sext i1 %cmp to i32
            %r2 = bitcast i32 %r to float
            ret float %r2`;

        float4 merged = a;
        merged[0] = LDCInlineIR!(code, float, float, float)(a[0], b[0]);
        return merged;
    }

    /// CMPSD-style comparison: lane 0 receives the mask, the other lane is
    /// copied from `a`.
    /// clang implements it through x86 intrinsics; it is possible with IR alone
    /// but leads to less optimal code.
    /// PERF: try to implement it with __builtin_ia32_cmpsd and immediate 0 to 7. 
    /// Not that simple.
    package double2 cmpsd(FPComparison comparison)(double2 a, double2 b) pure @safe
    {
        enum code = `
            %cmp = fcmp `~ FPComparisonToString[comparison] ~` double %0, %1
            %r = sext i1 %cmp to i64
            %r2 = bitcast i64 %r to double
            ret double %r2`;

        double2 merged = a;
        merged[0] = LDCInlineIR!(code, double, double, double)(a[0], b[0]);
        return merged;
    }

    /// Compares lane 0 of `a` and `b`; yields 1 when the predicate holds, else 0.
    // Note: ucomss and ucomsd are left unimplemented
    package int comss(FPComparison comparison)(float4 a, float4 b) pure @safe
    {
        enum code = `
            %cmp = fcmp `~ FPComparisonToString[comparison] ~` float %0, %1
            %r = zext i1 %cmp to i32
            ret i32 %r`;

        return LDCInlineIR!(code, int, float, float)(a[0], b[0]);
    }

    /// Compares lane 0 of `a` and `b`; yields 1 when the predicate holds, else 0.
    // Note: ucomss and ucomsd are left unimplemented
    package int comsd(FPComparison comparison)(double2 a, double2 b) pure @safe
    {
        enum code = `
            %cmp = fcmp `~ FPComparisonToString[comparison] ~` double %0, %1
            %r = zext i1 %cmp to i32
            ret i32 %r`;

        return LDCInlineIR!(code, int, double, double)(a[0], b[0]);
    }
}
else
{
    /// Packed single-precision comparison: one 32-bit mask per lane.
    package int4 cmpps(FPComparison comparison)(float4 a, float4 b) pure @trusted
    {
        int4 mask;
        for (int lane = 0; lane < 4; ++lane)
        {
            immutable bool holds = compareFloat!float(comparison, a.array[lane], b.array[lane]);
            mask.ptr[lane] = holds ? -1 : 0;
        }
        return mask;
    }

    /// Packed double-precision comparison: one 64-bit mask per lane.
    package long2 cmppd(FPComparison comparison)(double2 a, double2 b) pure @trusted
    {
        long2 mask;
        for (int lane = 0; lane < 2; ++lane)
        {
            immutable bool holds = compareFloat!double(comparison, a.array[lane], b.array[lane]);
            mask.ptr[lane] = holds ? -1 : 0;
        }
        return mask;
    }

    /// CMPSS-style comparison: lane 0 receives the mask, the other lanes are
    /// copied from `a`.
    package float4 cmpss(FPComparison comparison)(float4 a, float4 b) pure @trusted
    {
        immutable bool holds = compareFloat!float(comparison, a.array[0], b.array[0]);
        int4 bits = cast(int4)a;
        bits.ptr[0] = holds ? -1 : 0;
        return cast(float4)bits;
    }

    /// CMPSD-style comparison: lane 0 receives the mask, the other lane is
    /// copied from `a`.
    package double2 cmpsd(FPComparison comparison)(double2 a, double2 b) pure @trusted
    {
        immutable bool holds = compareFloat!double(comparison, a.array[0], b.array[0]);
        long2 bits = cast(long2)a;
        bits.ptr[0] = holds ? -1 : 0;
        return cast(double2)bits;
    }

    /// Compares lane 0 of `a` and `b`; yields 1 when the predicate holds, else 0.
    package int comss(FPComparison comparison)(float4 a, float4 b) pure @safe
    {
        if (compareFloat!float(comparison, a.array[0], b.array[0]))
            return 1;
        return 0;
    }

    /// Compares lane 0 of `a` and `b`; yields 1 when the predicate holds, else 0.
    // Note: ucomss and ucomsd are left unimplemented
    package int comsd(FPComparison comparison)(double2 a, double2 b) pure @safe
    {
        if (compareFloat!double(comparison, a.array[0], b.array[0]))
            return 1;
        return 0;
    }
}
unittest // cmpps
{
    // Check that every comparison type works.
    // The four lanes cover the four possible relations between A and B:
    //   lane 0: 1 vs 2   (less than)
    //   lane 1: 3 vs 3   (equal)
    //   lane 2: 5 vs 4   (greater than)
    //   lane 3: NaN vs 5 (unordered)
    float4 A = [1, 3, 5, float.nan];
    float4 B = [2, 3, 4, 5];

    // Ordered predicates (o*) followed by unordered predicates (u*).
    int4 result_oeq = cmpps!(FPComparison.oeq)(A, B);
    int4 result_ogt = cmpps!(FPComparison.ogt)(A, B);
    int4 result_oge = cmpps!(FPComparison.oge)(A, B);
    int4 result_olt = cmpps!(FPComparison.olt)(A, B);
    int4 result_ole = cmpps!(FPComparison.ole)(A, B);
    int4 result_one = cmpps!(FPComparison.one)(A, B);
    int4 result_ord = cmpps!(FPComparison.ord)(A, B);
    int4 result_ueq = cmpps!(FPComparison.ueq)(A, B);
    int4 result_ugt = cmpps!(FPComparison.ugt)(A, B);
    int4 result_uge = cmpps!(FPComparison.uge)(A, B);
    int4 result_ult = cmpps!(FPComparison.ult)(A, B);
    int4 result_ule = cmpps!(FPComparison.ule)(A, B);
    int4 result_une = cmpps!(FPComparison.une)(A, B);
    int4 result_uno = cmpps!(FPComparison.uno)(A, B);

    // Expected masks: -1 where the predicate holds, 0 where it does not.
    // In the NaN lane (lane 3) every ordered predicate expects 0 and every
    // unordered predicate expects -1.
    static immutable int[4] correct_oeq    = [ 0,-1, 0, 0];
    static immutable int[4] correct_ogt    = [ 0, 0,-1, 0];
    static immutable int[4] correct_oge    = [ 0,-1,-1, 0];
    static immutable int[4] correct_olt    = [-1, 0, 0, 0];
    static immutable int[4] correct_ole    = [-1,-1, 0, 0];
    static immutable int[4] correct_one    = [-1, 0,-1, 0];
    static immutable int[4] correct_ord    = [-1,-1,-1, 0];
    static immutable int[4] correct_ueq    = [ 0,-1, 0,-1];
    static immutable int[4] correct_ugt    = [ 0, 0,-1,-1];
    static immutable int[4] correct_uge    = [ 0,-1,-1,-1];
    static immutable int[4] correct_ult    = [-1, 0, 0,-1];
    static immutable int[4] correct_ule    = [-1,-1, 0,-1];
    static immutable int[4] correct_une    = [-1, 0,-1,-1];
    static immutable int[4] correct_uno    = [ 0, 0, 0,-1];

    assert(result_oeq.array == correct_oeq);
    assert(result_ogt.array == correct_ogt);
    assert(result_oge.array == correct_oge);
    assert(result_olt.array == correct_olt);
    assert(result_ole.array == correct_ole);
    assert(result_one.array == correct_one);
    assert(result_ord.array == correct_ord);
    assert(result_ueq.array == correct_ueq);
    assert(result_ugt.array == correct_ugt);
    assert(result_uge.array == correct_uge);
    assert(result_ult.array == correct_ult);
    assert(result_ule.array == correct_ule);
    assert(result_une.array == correct_une);
    assert(result_uno.array == correct_uno);
}
unittest // cmppd
{
    // Lane 0: 1 < 2 holds, so the mask is -1. Lane 1: 3 < 3 does not, so it is 0.
    double2 lhs = [1, 3];
    double2 rhs = [2, 3];
    static immutable long[2] expected = [cast(long)(-1), 0];

    long2 mask = cmppd!(FPComparison.ult)(lhs, rhs);
    assert(mask.array == expected);
}
unittest // cmpss and comss
{
    // Checks one predicate on one pair of inputs:
    //  - lane 0 of the cmpss result must be -1 or 0, in agreement with the
    //    scalar reference `compareFloat`,
    //  - lanes 1 to 3 must be passed through unchanged from A,
    //  - comss must return non-zero exactly when the predicate holds.
    void testComparison(FPComparison comparison)(float4 A, float4 B)
    {
        float4 result = cmpss!comparison(A, B);
        int4 iresult = cast(int4)result;
        int expected = compareFloat!float(comparison, A.array[0], B.array[0]) ? -1 : 0;
        assert(iresult.array[0] == expected);
        assert(result.array[1] == A.array[1]);
        assert(result.array[2] == A.array[2]);
        assert(result.array[3] == A.array[3]);

        // check comss
        int comResult = comss!comparison(A, B);
        assert( (expected != 0) == (comResult != 0) );
    }

    // Check that every comparison type works.
    // (A, B) compares two ordered values in lane 0 (1 vs 2);
    // (A, C) compares against NaN in lane 0 (the unordered case).
    float4 A = [1, 3, 5, 6];
    float4 B = [2, 3, 4, 5];
    float4 C = [float.nan, 3, 4, 5];

    testComparison!(FPComparison.oeq)(A, B);
    testComparison!(FPComparison.oeq)(A, C);
    testComparison!(FPComparison.ogt)(A, B);
    testComparison!(FPComparison.ogt)(A, C);
    testComparison!(FPComparison.oge)(A, B);
    testComparison!(FPComparison.oge)(A, C);
    testComparison!(FPComparison.olt)(A, B);
    testComparison!(FPComparison.olt)(A, C);
    testComparison!(FPComparison.ole)(A, B);
    testComparison!(FPComparison.ole)(A, C);
    testComparison!(FPComparison.one)(A, B);
    testComparison!(FPComparison.one)(A, C);
    testComparison!(FPComparison.ord)(A, B);
    testComparison!(FPComparison.ord)(A, C);
    testComparison!(FPComparison.ueq)(A, B);
    testComparison!(FPComparison.ueq)(A, C);
    testComparison!(FPComparison.ugt)(A, B);
    testComparison!(FPComparison.ugt)(A, C);
    testComparison!(FPComparison.uge)(A, B);
    testComparison!(FPComparison.uge)(A, C);
    testComparison!(FPComparison.ult)(A, B);
    testComparison!(FPComparison.ult)(A, C);
    testComparison!(FPComparison.ule)(A, B);
    testComparison!(FPComparison.ule)(A, C);
    testComparison!(FPComparison.une)(A, B);
    testComparison!(FPComparison.une)(A, C);
    testComparison!(FPComparison.uno)(A, B);
    testComparison!(FPComparison.uno)(A, C);
}
unittest // cmpsd and comsd
{
    // Checks one predicate on one pair of inputs:
    //  - lane 0 of the cmpsd result must be -1 or 0, in agreement with the
    //    scalar reference `compareFloat`,
    //  - lane 1 must be passed through unchanged from A,
    //  - comsd must return non-zero exactly when the predicate holds.
    void testComparison(FPComparison comparison)(double2 A, double2 B)
    {
        double2 result = cmpsd!comparison(A, B);
        long2 iresult = cast(long2)result;
        long expected = compareFloat!double(comparison, A.array[0], B.array[0]) ? -1 : 0;
        assert(iresult.array[0] == expected);
        assert(result.array[1] == A.array[1]);

        // check comsd
        int comResult = comsd!comparison(A, B);
        assert( (expected != 0) == (comResult != 0) );
    }

    // Check that every comparison type works.
    // (A, B) compares two ordered values in lane 0 (1 vs 2);
    // (A, C) compares against NaN in lane 0 (the unordered case).
    double2 A = [1, 3];
    double2 B = [2, 4];
    double2 C = [double.nan, 5];

    testComparison!(FPComparison.oeq)(A, B);
    testComparison!(FPComparison.oeq)(A, C);
    testComparison!(FPComparison.ogt)(A, B);
    testComparison!(FPComparison.ogt)(A, C);
    testComparison!(FPComparison.oge)(A, B);
    testComparison!(FPComparison.oge)(A, C);
    testComparison!(FPComparison.olt)(A, B);
    testComparison!(FPComparison.olt)(A, C);
    testComparison!(FPComparison.ole)(A, B);
    testComparison!(FPComparison.ole)(A, C);
    testComparison!(FPComparison.one)(A, B);
    testComparison!(FPComparison.one)(A, C);
    testComparison!(FPComparison.ord)(A, B);
    testComparison!(FPComparison.ord)(A, C);
    testComparison!(FPComparison.ueq)(A, B);
    testComparison!(FPComparison.ueq)(A, C);
    testComparison!(FPComparison.ugt)(A, B);
    testComparison!(FPComparison.ugt)(A, C);
    testComparison!(FPComparison.uge)(A, B);
    testComparison!(FPComparison.uge)(A, C);
    testComparison!(FPComparison.ult)(A, B);
    testComparison!(FPComparison.ult)(A, C);
    testComparison!(FPComparison.ule)(A, B);
    testComparison!(FPComparison.ule)(A, C);
    testComparison!(FPComparison.une)(A, B);
    testComparison!(FPComparison.une)(A, C);
    testComparison!(FPComparison.uno)(A, B);
    testComparison!(FPComparison.uno)(A, C);
}
1104 
1105 //
1106 //  </FLOATING-POINT COMPARISONS>
1107 //
1108 
1109 
/// Narrows a 128-bit integer vector to 64 bits by keeping only its low
/// 64-bit lane. The upper lane of `a` is discarded.
__m64 to_m64(__m128i a) pure @trusted
{
    long2 asLongs = cast(long2) a;

    long1 low;
    low.ptr[0] = asLongs.array[0];
    return low;
}
1117 
/// Widens a 64-bit vector to 128 bits. The value of `a` lands in the low
/// 64-bit lane and the high lane is zero.
__m128i to_m128i(__m64 a) pure @trusted
{
    long2 widened = [0, 0];
    widened.ptr[0] = a.array[0];

    __m128i result = cast(__m128i) widened;
    return result;
}
1124 
1125 // SOME NEON INTRINSICS
1126 // Emulating some x86 intrinsics needs access to a range of ARM intrinsics.
// They are not part of the public API, although the simde project exposes them all to its users.
1128 // MAYDO: create a new neon.d module, for internal use only.
1129 // MAYDO: port them to ARM32 so that ARM32 can be as fast as ARM64.
1130 static if (LDC_with_ARM64)
1131 {
1132     // VERY USEFUL LINK
1133     // https://github.com/ldc-developers/llvm-project/blob/ldc-release/11.x/llvm/include/llvm/IR/IntrinsicsAArch64.td
1134 
    /// Binding to the LLVM AArch64 NEON `addp` intrinsic on 8 x i8 lanes
    /// (ADDP is the pairwise-add instruction).
    pragma(LDC_intrinsic, "llvm.aarch64.neon.addp.v8i8")
        byte8 vpadd_u8(byte8 a, byte8 b) pure @safe;
1137 
1138     byte8 vand_u8(byte8 a, byte8 b) pure @safe
1139     {
1140         return a & b;
1141     }
1142 
1143     int4 vcombine_s32(int2 lo, int2 hi) pure @trusted
1144     {
1145         int4 r;
1146         r.ptr[0] = lo.array[0];
1147         r.ptr[1] = lo.array[1];
1148         r.ptr[2] = hi.array[0];
1149         r.ptr[3] = hi.array[1];
1150         return r;
1151     }
1152 
1153     byte16 vcombine_s8(byte8 lo, byte8 hi) pure @trusted
1154     {
1155         byte16 r;
1156         r.ptr[0]  = lo.array[0];
1157         r.ptr[1]  = lo.array[1];
1158         r.ptr[2]  = lo.array[2];
1159         r.ptr[3]  = lo.array[3];
1160         r.ptr[4]  = lo.array[4];
1161         r.ptr[5]  = lo.array[5];
1162         r.ptr[6]  = lo.array[6];
1163         r.ptr[7]  = lo.array[7];
1164         r.ptr[8]  = hi.array[0];
1165         r.ptr[9]  = hi.array[1];
1166         r.ptr[10] = hi.array[2];
1167         r.ptr[11] = hi.array[3];
1168         r.ptr[12] = hi.array[4];
1169         r.ptr[13] = hi.array[5];
1170         r.ptr[14] = hi.array[6];
1171         r.ptr[15] = hi.array[7];
1172         return r;
1173     }
1174 
    // float4 -> int4 conversions. The four bindings differ only in the
    // AArch64 instruction they map to, which selects the rounding mode.

    /// Binding to `fcvtms` (FCVTMS rounds toward minus infinity).
    pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtms.v4i32.v4f32")
        int4 vcvtmq_s32_f32(float4 a) pure @safe;

    /// Binding to `fcvtns` (FCVTNS rounds to nearest, ties to even).
    pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtns.v4i32.v4f32")
        int4 vcvtnq_s32_f32(float4 a) pure @safe;

    /// Binding to `fcvtps` (FCVTPS rounds toward plus infinity).
    pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtps.v4i32.v4f32")
        int4 vcvtpq_s32_f32(float4 a) pure @safe;

    /// Binding to `fcvtzs` (FCVTZS rounds toward zero).
    pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtzs.v4i32.v4f32")
        int4 vcvtzq_s32_f32(float4 a) pure @safe;
1186 
1187     short4 vget_high_s16(short8 a) pure @trusted
1188     {
1189         short4 r;
1190         r.ptr[0] = a.array[4];
1191         r.ptr[1] = a.array[5];
1192         r.ptr[2] = a.array[6];
1193         r.ptr[3] = a.array[7];
1194         return r;
1195     }
1196 
1197     int2 vget_high_s32(int4 a) pure @trusted
1198     {
1199         int2 r;
1200         r.ptr[0] = a.array[2];
1201         r.ptr[1] = a.array[3];
1202         return r;
1203     }
1204 
1205     byte8 vget_high_u8(byte16 a) pure @trusted
1206     {
1207         byte8 r;
1208         r.ptr[0] = a.array[8];
1209         r.ptr[1] = a.array[9];
1210         r.ptr[2] = a.array[10];
1211         r.ptr[3] = a.array[11];
1212         r.ptr[4] = a.array[12];
1213         r.ptr[5] = a.array[13];
1214         r.ptr[6] = a.array[14];
1215         r.ptr[7] = a.array[15];
1216         return r;
1217     }
1218 
1219     short4 vget_low_s16(short8 a) pure @trusted
1220     {
1221         short4 r;
1222         r.ptr[0] = a.array[0];
1223         r.ptr[1] = a.array[1];
1224         r.ptr[2] = a.array[2];
1225         r.ptr[3] = a.array[3];
1226         return r;
1227     } 
1228 
1229     int2 vget_low_s32(int4 a) pure @trusted
1230     {
1231         int2 r;
1232         r.ptr[0] = a.array[0];
1233         r.ptr[1] = a.array[1];
1234         return r;
1235     }
1236 
1237     byte8 vget_low_u8(byte16 a) pure @trusted
1238     {
1239         byte8 r;
1240         r.ptr[0] = a.array[0];
1241         r.ptr[1] = a.array[1];
1242         r.ptr[2] = a.array[2];
1243         r.ptr[3] = a.array[3];
1244         r.ptr[4] = a.array[4];
1245         r.ptr[5] = a.array[5];
1246         r.ptr[6] = a.array[6];
1247         r.ptr[7] = a.array[7];
1248         return r;
1249     }
1250 
    /// Binding to the LLVM AArch64 NEON `smax` intrinsic on 8 x i16 lanes
    /// (SMAX is the signed per-lane maximum).
    pragma(LDC_intrinsic, "llvm.aarch64.neon.smax.v8i16")
        short8 vmaxq_s16(short8 a, short8 b) pure @safe;

    /// Binding to the LLVM AArch64 NEON `smin` intrinsic on 8 x i16 lanes
    /// (SMIN is the signed per-lane minimum).
    pragma(LDC_intrinsic, "llvm.aarch64.neon.smin.v8i16")
        short8 vminq_s16(short8 a, short8 b) pure @safe;
1256 
1257     int4 vmull_s16(short4 a, short4 b) pure @trusted
1258     {
1259         int4 r;
1260         r.ptr[0] = a.array[0] * b.array[0];
1261         r.ptr[1] = a.array[1] * b.array[1];
1262         r.ptr[2] = a.array[2] * b.array[2];
1263         r.ptr[3] = a.array[3] * b.array[3];
1264         return r;
1265     }
1266 
    // Binding to the pairwise-add intrinsic on 4 x f32 lanes. The intrinsic
    // name depends on the compiler front-end version, so both spellings are
    // declared under the same D name.
    static if(__VERSION__ >= 2088) // LDC 1.18 started using LLVM 9, which changed the name of the builtin
    {
        pragma(LDC_intrinsic, "llvm.aarch64.neon.faddp.v4f32")
            float4 vpaddq_f32(float4 a, float4 b) pure @safe;
    }
    else
    {
        // Older spelling of the same builtin.
        pragma(LDC_intrinsic, "llvm.aarch64.neon.addp.v4f32")
            float4 vpaddq_f32(float4 a, float4 b) pure @safe;
    }
1277 
    /// Binding to `addp` (pairwise add) on 2 x i32 lanes.
    pragma(LDC_intrinsic, "llvm.aarch64.neon.addp.v2i32")
        int2 vpadd_s32(int2 a, int2 b) pure @safe;

    /// Binding to `addp` (pairwise add) on 16 x i8 lanes.
    pragma(LDC_intrinsic, "llvm.aarch64.neon.addp.v16i8")
        byte16 vpaddq_s8(byte16 a, byte16 b) pure @safe;

    /// Binding to `sqxtn` (SQXTN: signed saturating extract narrow),
    /// 8 x i16 -> 8 x i8.
    pragma(LDC_intrinsic, "llvm.aarch64.neon.sqxtn.v8i8")
        byte8 vqmovn_s16(short8 a) pure @safe;

    /// Binding to `sqxtun` (SQXTUN: signed saturating extract unsigned
    /// narrow), 8 x i16 -> 8 x i8.
    pragma(LDC_intrinsic, "llvm.aarch64.neon.sqxtun.v8i8")
        byte8 vqmovun_s16(short8 a) pure @safe;

    /// Binding to `urhadd` (URHADD: unsigned rounding halving add) on
    /// 16 x i8 lanes.
    /// NOTE(review): operates on a full 128-bit vector although the name has
    /// no `q` suffix; the ACLE name for this width is `vrhaddq_u8`.
    pragma(LDC_intrinsic, "llvm.aarch64.neon.urhadd.v16i8")
        byte16 vrhadd_u8(byte16 a, byte16 b) pure @safe;

    /// Binding to `urhadd` (URHADD: unsigned rounding halving add) on
    /// 8 x i16 lanes.
    /// NOTE(review): operates on a full 128-bit vector although the name has
    /// no `q` suffix; the ACLE name for this width is `vrhaddq_u16`.
    pragma(LDC_intrinsic, "llvm.aarch64.neon.urhadd.v8i16")
        short8 vrhadd_u16(short8 a, short8 b) pure @safe;
1295 
1296     byte8 vshr_u8(byte8 a, byte8 b) pure @safe
1297     {
1298         return a >>> b;
1299     }
1300 }
1301