inteli.internals source code

1 /**
2 * Internal stuff only, do not import.
3 *
4 * Copyright: Copyright Auburn Sounds 2016-2018, Stefanos Baziotis 2019.
5 * License:   $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
6 * Authors:   Guillaume Piolat
7 */
8 module inteli.internals;
9 
10 import inteli.types;
11 
12 // The only math functions needed for intel-intrinsics
13 public import core.math: sqrt; // since it's an intrinsics
14 public import std.math: abs; // `fabs` is broken with GCC 4.9.2 on Linux 64-bit
15 
16 
// GDC: the GDC_with_* flags are hard-coded per target architecture because they
// cannot be detected at compile time here. Every LDC_with_* flag is false.
17 version(GNU)
18 {
19     version (X86)
20     {
21         // For 32-bit x86, disable vector extensions with GDC. 
22         // It just doesn't work well.
23         enum GDC_with_x86 = true;
24         enum GDC_with_MMX = false;
25         enum GDC_with_SSE = false;
26         enum GDC_with_SSE2 = false;
27         enum GDC_with_SSE3 = false;
28         enum LDC_with_ARM32 = false;
29         enum LDC_with_ARM64 = false;
30         enum LDC_with_SSE1 = false;
31         enum LDC_with_SSE2 = false;
32         enum LDC_with_SSE3 = false;
33     }
34     else version (X86_64)
35     {
36         // GDC support uses extended inline assembly:
37         //   https://gcc.gnu.org/onlinedocs/gcc/Extended-Asm.html        (general information and hints)
38         //   https://gcc.gnu.org/onlinedocs/gcc/Simple-Constraints.html  (binding variables to registers)
39         //   https://gcc.gnu.org/onlinedocs/gcc/Machine-Constraints.html (x86 specific register short names)
40 
41         public import core.simd;
42 
43         // NOTE: These intrinsics are not available in every i386 and x86_64 CPU.
44         // For more info: https://gcc.gnu.org/onlinedocs/gcc-4.9.2/gcc/X86-Built-in-Functions.html 
45         public import gcc.builtins;
46                 
        // MMX, SSE and SSE2 are assumed present on x86_64; SSE3 stays off until it can be detected.
47         enum GDC_with_x86 = true;
48         enum GDC_with_MMX = true; // We don't have a way to detect that at CT, but we assume it's there
49         enum GDC_with_SSE = true; // We don't have a way to detect that at CT, but we assume it's there
50         enum GDC_with_SSE2 = true; // We don't have a way to detect that at CT, but we assume it's there
51         enum GDC_with_SSE3 = false; // TODO: we don't have a way to detect that at CT
52         enum LDC_with_ARM32 = false;
53         enum LDC_with_ARM64 = false;
54         enum LDC_with_SSE1 = false;
55         enum LDC_with_SSE2 = false;
56         enum LDC_with_SSE3 = false;
57     }
58     else
59     {
        // Any other GDC target: every feature flag is off.
60         enum GDC_with_x86 = false;
61         enum GDC_with_MMX = false;
62         enum GDC_with_SSE = false;
63         enum GDC_with_SSE2 = false;
64         enum GDC_with_SSE3 = false;
65         enum LDC_with_ARM32 = false;
66         enum LDC_with_ARM64 = false;
67         enum LDC_with_SSE1 = false;
68         enum LDC_with_SSE2 = false;
69         enum LDC_with_SSE3 = false;
70     }
71 }
72 else version(LDC)
73 {
    // LDC: publicly re-export the SIMD, intrinsics and inline-asm modules
    // so that modules importing this one can use them.
74     public import core.simd;
75     public import ldc.simd;
76     public import ldc.intrinsics;
77     public import ldc.llvmasm: __asm;
78 
79     // Since LDC 1.13, using the new ldc.llvmasm.__ir variants instead of inlineIR
80     static if (__VERSION__ >= 2083)
81     {
82          import ldc.llvmasm;
83          alias LDCInlineIR = __ir_pure;
84 
85          // A version of inline IR with prefix/suffix didn't exist before LDC 1.13
86          alias LDCInlineIREx = __irEx_pure; 
87     }
88     else
89     {
        // Older front-ends: only LDCInlineIR is defined (no LDCInlineIREx alias).
90         alias LDCInlineIR = inlineIR;
91     }
92     
    // GDC-specific flags are always false under LDC.
93     package(inteli)
94     {
95         enum GDC_with_x86 = false;
96         enum GDC_with_MMX = false;
97         enum GDC_with_SSE = false;
98         enum GDC_with_SSE2 = false;
99         enum GDC_with_SSE3 = false;
100     }
101 
    // Target selection: exactly one of the three branches below is compiled.
102     version(ARM)
103     {
104         public import ldc.gccbuiltins_arm;
105         enum LDC_with_ARM32 = true;
106         enum LDC_with_ARM64 = false;
107         enum LDC_with_SSE1 = false;
108         enum LDC_with_SSE2 = false;
109         enum LDC_with_SSE3 = false;
110     }
111     else version(AArch64)
112     {
113         //public import ldc.gccbuiltins_arm;
114         enum LDC_with_ARM32 = false;
115         enum LDC_with_ARM64 = true;
116         enum LDC_with_SSE1 = false;
117         enum LDC_with_SSE2 = false;
118         enum LDC_with_SSE3 = false;
119     }
120     else
121     {
        // Neither ARM nor AArch64: the x86 builtins are imported and the SSE
        // levels are queried from the compilation target's feature set.
122         public import ldc.gccbuiltins_x86;
123         enum LDC_with_ARM32 = false;
124         enum LDC_with_ARM64 = false;
125         enum LDC_with_SSE1 = __traits(targetHasFeature, "sse");
126         enum LDC_with_SSE2 = __traits(targetHasFeature, "sse2");
127         enum LDC_with_SSE3 = __traits(targetHasFeature, "sse3");
128     }
129 }
130 else version(DigitalMars)
131 {
    // DMD: none of the GDC or LDC feature flags apply, so all of them are false.
132     package(inteli)
133     {
134         enum GDC_with_x86 = false;
135         enum GDC_with_MMX = false;
136         enum GDC_with_SSE = false;
137         enum GDC_with_SSE2 = false;
138         enum GDC_with_SSE3 = false;
139         enum LDC_with_ARM32 = false;
140         enum LDC_with_ARM64 = false;
141         enum LDC_with_SSE1 = false;
142         enum LDC_with_SSE2 = false;
143         enum LDC_with_SSE3 = false;
144     }
145 }
146 else
147 {
    // Only GDC, LDC and DMD are handled; any other compiler is a hard compile error.
148     static assert(false, "Unknown compiler");
149 }
150 
/// True when compiling with LDC for either 32-bit or 64-bit ARM.
enum LDC_with_ARM = LDC_with_ARM32 | LDC_with_ARM64; // ARM32 is largely unsupported though

static if (LDC_with_ARM32)
{
    /// Returns the control word read through `__builtin_arm_get_fpscr`.
    /// Shares its name with the 64-bit variant so callers need a single spelling.
    package uint arm_get_fpcr() nothrow @nogc @trusted
    {
        return __builtin_arm_get_fpscr();
    }

    /// Writes `cw` through `__builtin_arm_set_fpscr`.
    package void arm_set_fpcr(uint cw) nothrow @nogc @trusted
    {
        __builtin_arm_set_fpscr(cw);
    }
}

static if (LDC_with_ARM64)
{
    /// Returns the content of the `fpcr` system register (read with `mrs`).
    package uint arm_get_fpcr() pure nothrow @nogc @trusted
    {
        return __asm!uint("mrs $0, fpcr", "=r");
    }

    /// Writes `cw` to the `fpcr` system register.
    package void arm_set_fpcr(uint cw) nothrow @nogc @trusted
    {
        // The sequence loads `cw` into w2/x2 before `msr`, so x2 must be declared as
        // clobbered; otherwise the compiler may keep a live value in that register.
        __asm!void("ldr w2, $0 \n msr fpcr, x2", "m,~{x2}", cw);
    }
}
178 
// Availability of the DMD inline assembler.
//   DMD_with_asm       : DMD with either the 32-bit or the 64-bit x86 inline assembler.
//   DMD_with_32bit_asm : DMD with the 32-bit x86 inline assembler only.
// Both are false for every compiler other than DMD.
version(DigitalMars)
{
    version(D_InlineAsm_X86)
    {
        enum DMD_with_asm = true;
        enum DMD_with_32bit_asm = true; // sometimes you want a 32-bit DMD only solution
    }
    else version(D_InlineAsm_X86_64)
    {
        enum DMD_with_asm = true;
        enum DMD_with_32bit_asm = false;
    }
    else
    {
        enum DMD_with_asm = false;
        enum DMD_with_32bit_asm = false;
    }
}
else
{
    enum DMD_with_asm = false;
    enum DMD_with_32bit_asm = false;
}
198 
199 
200 package:
201 nothrow @nogc:
202 
203 
204 // For internal use only, since public API deals with a x86 semantic emulation
// Bit patterns applied to the value returned by arm_get_fpcr().
// The rounding-mode field is bits 22-23 (mask 0x00C00000); the four _MM_ROUND_*_ARM
// values are the possible contents of that field.
205 enum uint _MM_ROUND_NEAREST_ARM     = 0x00000000;
206 enum uint _MM_ROUND_DOWN_ARM        = 0x00800000;
207 enum uint _MM_ROUND_UP_ARM          = 0x00400000;
208 enum uint _MM_ROUND_TOWARD_ZERO_ARM = 0x00C00000;
209 enum uint _MM_ROUND_MASK_ARM        = 0x00C00000;
// Flush-to-zero control: bit 24.
210 enum uint _MM_FLUSH_ZERO_MASK_ARM = 0x01000000;
211 
212 
213 //
214 //  <ROUNDING>
215 //
216 //  Why is this here? For DMD, we cannot use rint because _MM_SET_ROUNDING_MODE
217 //  doesn't change the FPU rounding mode, and isn't expected to do so.
218 //  So we devised these rounding functions to get consistent rounding between
219 //  LDC and DMD. It's important that DMD uses what is in MXCSR to round.
220 //
221 //  Note: There is no MXCSR on ARM, but there is fpscr (fpcr on AArch64), which
222 //  implements similar functionality.
223 //  https://developer.arm.com/documentation/dui0068/b/vector-floating-point-programming/vfp-system-registers/fpscr--the-floating-point-status-and-control-register
224 //
225 //  We use fpscr since it's thread-local, so we can emulate those x86 conversions, albeit slowly.
226 
/// Converts `value` to a 32-bit integer using the rounding mode currently selected
/// in MXCSR on x86, or in the ARM floating-point control register when built with
/// LDC for ARM.
///
/// Params:
///     value = single-precision number to convert
/// Returns: the converted integer.
int convertFloatToInt32UsingMXCSR(float value) @trusted
{
    int result;
    version(GNU)
    {
        asm pure nothrow @nogc @trusted
        {
            "cvtss2si %1, %0\n": "=r"(result) : "x" (value);
        }
    }
    else static if (LDC_with_ARM32)
    {
        // s2 is used as a scratch register by the sequence, so it is declared as
        // clobbered; otherwise the compiler may keep a live value in it.
        result = __asm!int(`vldr s2, $1
                            vcvtr.s32.f32 s2, s2
                            vmov $0, s2`, "=r,m,~{s2}", value);
    }
    else static if (LDC_with_ARM64)
    {
        // Get current rounding mode.
        uint fpscr = arm_get_fpcr();

        // One conversion instruction per rounding mode. Each sequence uses s2 as
        // scratch, hence the clobber.
        switch(fpscr & _MM_ROUND_MASK_ARM)
        {
            default:
            case _MM_ROUND_NEAREST_ARM:
                result = __asm!int(`ldr s2, $1
                                    fcvtns $0,s2`, "=r,m,~{s2}", value);
                break;
            case _MM_ROUND_DOWN_ARM:
                result = __asm!int(`ldr s2, $1
                                    fcvtms $0,s2`, "=r,m,~{s2}", value);
                break;
            case _MM_ROUND_UP_ARM:
                result = __asm!int(`ldr s2, $1
                                    fcvtps $0,s2`, "=r,m,~{s2}", value);
                break;
            case _MM_ROUND_TOWARD_ZERO_ARM:
                // A D cast truncates, which is the requested mode.
                result = cast(int)value;
                break;
        }
    }
    else
    {        
        asm pure nothrow @nogc @trusted
        {
            cvtss2si EAX, value;
            mov result, EAX;
        }
    }
    return result;
}
278 
/// Converts `value` to a 32-bit integer using the rounding mode currently selected
/// in MXCSR on x86, or in the ARM floating-point control register when built with
/// LDC for ARM.
///
/// Params:
///     value = double-precision number to convert
/// Returns: the converted integer.
int convertDoubleToInt32UsingMXCSR(double value) @trusted
{
    int result;
    version(GNU)
    {
        asm pure nothrow @nogc @trusted
        {
            "cvtsd2si %1, %0\n": "=r"(result) : "x" (value);
        }
    }
    else static if (LDC_with_ARM32)
    {
        // d2 (input) and s2 (conversion result) are both overwritten by the
        // sequence, so both are declared as clobbered.
        result = __asm!int(`vldr d2, $1
                            vcvtr.s32.f64 s2, d2
                            vmov $0, s2`, "=r,m,~{s2},~{d2}", value);
    }
    else static if (LDC_with_ARM64)
    {
        // Get current rounding mode.
        uint fpscr = arm_get_fpcr();

        // One conversion instruction per rounding mode. Each sequence uses d2 as
        // scratch, hence the clobber.
        switch(fpscr & _MM_ROUND_MASK_ARM)
        {
            default:
            case _MM_ROUND_NEAREST_ARM:
                result = __asm!int(`ldr d2, $1
                                    fcvtns $0,d2`, "=r,m,~{d2}", value);
                break;
            case _MM_ROUND_DOWN_ARM:
                result = __asm!int(`ldr d2, $1
                                    fcvtms $0,d2`, "=r,m,~{d2}", value);
                break;
            case _MM_ROUND_UP_ARM:
                result = __asm!int(`ldr d2, $1
                                    fcvtps $0,d2`, "=r,m,~{d2}", value);
                break;
            case _MM_ROUND_TOWARD_ZERO_ARM:
                // A D cast truncates, which is the requested mode.
                result = cast(int)value;
                break;
        }
    }
    else
    {
        asm pure nothrow @nogc @trusted
        {
            cvtsd2si EAX, value;
            mov result, EAX;
        }
    }
    return result;
}
330 
/// Converts `value` to a 64-bit integer using the rounding mode currently selected
/// in MXCSR on x86, or in the ARM floating-point control register when built with
/// LDC for ARM.
///
/// Params:
///     value = single-precision number to convert
/// Returns: the converted integer.
long convertFloatToInt64UsingMXCSR(float value) @trusted
{
    static if (LDC_with_ARM32)
    {
        // We have to resort to libc since 32-bit ARM 
        // doesn't seem to have 64-bit registers.
        
        uint fpscr = arm_get_fpcr(); // Get current rounding mode.

        // Note: converting to double precision else rounding could be different for large integers
        double asDouble = value; 

        switch(fpscr & _MM_ROUND_MASK_ARM)
        {
            default:
            // llvm_rint rounds with the current mode, which is round-to-nearest in this
            // branch, so ties go to even like cvtss2si (llvm_round sends ties away from zero).
            case _MM_ROUND_NEAREST_ARM:     return cast(long)(llvm_rint(asDouble));
            case _MM_ROUND_DOWN_ARM:        return cast(long)(llvm_floor(asDouble));
            case _MM_ROUND_UP_ARM:          return cast(long)(llvm_ceil(asDouble));
            case _MM_ROUND_TOWARD_ZERO_ARM: return cast(long)(asDouble);
        }
    }
    else static if (LDC_with_ARM64)
    {
        uint fpscr = arm_get_fpcr();

        // One conversion instruction per rounding mode. Each sequence uses s2 as
        // scratch, hence the clobber.
        switch(fpscr & _MM_ROUND_MASK_ARM)
        {
            default:
            case _MM_ROUND_NEAREST_ARM:
                return __asm!long(`ldr s2, $1
                                   fcvtns $0,s2`, "=r,m,~{s2}", value);
            case _MM_ROUND_DOWN_ARM:
                return __asm!long(`ldr s2, $1
                                   fcvtms $0,s2`, "=r,m,~{s2}", value);
            case _MM_ROUND_UP_ARM:
                return __asm!long(`ldr s2, $1
                                   fcvtps $0,s2`, "=r,m,~{s2}", value);
            case _MM_ROUND_TOWARD_ZERO_ARM:
                return cast(long)value;
        }
    }
    // 64-bit can use an SSE instruction
    else version(D_InlineAsm_X86_64)
    {
        long result;
        version(LDC) // work-around for " Data definition directives inside inline asm are not supported yet."
        {
            asm pure nothrow @nogc @trusted
            {
                movss XMM0, value;
                cvtss2si RAX, XMM0;
                mov result, RAX;
            }
        }
        else
        {
            asm pure nothrow @nogc @trusted
            {
                movss XMM0, value;
                db 0xf3; db 0x48; db 0x0f; db 0x2d; db 0xc0; // cvtss2si RAX, XMM0 (DMD refuses to emit)
                mov result, RAX;
            }
        }
        return result;
    }
    else version(D_InlineAsm_X86)
    {
        // In the case of 32-bit x86 there is no SSE2 way to convert FP to 64-bit int
        // This leads to an unfortunate FPU sequence in every C++ compiler.
        // See: https://godbolt.org/z/vZym77

        // Get current MXCSR rounding
        uint sseRounding;
        ushort savedFPUCW;
        ushort newFPUCW;
        long result;
        asm pure nothrow @nogc @trusted
        {
            stmxcsr sseRounding;
            fld value;
            fnstcw savedFPUCW;
            mov AX, savedFPUCW;
            and AX, 0xf3ff;          // clear FPU rounding bits
            movzx ECX, word ptr sseRounding;
            and ECX, 0x6000;         // only keep SSE rounding bits
            shr ECX, 3;
            or AX, CX;               // make a new control word for FPU with SSE bits
            mov newFPUCW, AX;
            fldcw newFPUCW;
            fistp qword ptr result;            // convert, respecting MXCSR (but not other control word things)
            fldcw savedFPUCW;
        }
        return result;
    }
    else static if (GDC_with_x86)
    {
        version(X86_64) // 64-bit can just use the right instruction
        {
            static assert(GDC_with_SSE);
            __m128 A;
            A.ptr[0] = value;
            return __builtin_ia32_cvtss2si64 (A);
        }
        else version(X86) // 32-bit
        {
            // This is untested!
            // Same x87 sequence as the DMD 32-bit branch, in GCC extended-asm syntax.
            uint sseRounding;
            ushort savedFPUCW;
            ushort newFPUCW;
            long result;
            asm pure nothrow @nogc @trusted
            {
                "stmxcsr %1;\n" ~
                "fld %2;\n" ~
                "fnstcw %3;\n" ~
                "movw %3, %%ax;\n" ~
                "andw $0xf3ff, %%ax;\n" ~
                "movzwl %1, %%ecx;\n" ~
                "andl $0x6000, %%ecx;\n" ~
                "shrl $3, %%ecx;\n" ~
                "orw %%cx, %%ax\n" ~
                "movw %%ax, %4;\n" ~
                "fldcw %4;\n" ~
                "fistpll %0;\n" ~
                "fldcw %3;\n" 
                  : "=m"(result)    // %0
                  : "m" (sseRounding),
                    "f" (value),
                    "m" (savedFPUCW),
                    "m" (newFPUCW) 
                  : "eax", "ecx", "st";
            }
            return result;
        }
        else
            static assert(false);
    }
    else
        static assert(false);
}
471 
472 
/// Converts `value` to a 64-bit integer using the rounding mode currently selected
/// in MXCSR on x86, or in the ARM floating-point control register when built with
/// LDC for ARM.
///
/// Params:
///     value = double-precision number to convert
/// Returns: the converted integer.
long convertDoubleToInt64UsingMXCSR(double value) @trusted
{
    static if (LDC_with_ARM32)
    {
        // We have to resort to libc since 32-bit ARM 
        // doesn't seem to have 64-bit registers.
        uint fpscr = arm_get_fpcr(); // Get current rounding mode.
        switch(fpscr & _MM_ROUND_MASK_ARM)
        {
            default:
            // llvm_rint rounds with the current mode, which is round-to-nearest in this
            // branch, so ties go to even like cvtsd2si (llvm_round sends ties away from zero).
            case _MM_ROUND_NEAREST_ARM:     return cast(long)(llvm_rint(value));
            case _MM_ROUND_DOWN_ARM:        return cast(long)(llvm_floor(value));
            case _MM_ROUND_UP_ARM:          return cast(long)(llvm_ceil(value));
            case _MM_ROUND_TOWARD_ZERO_ARM: return cast(long)(value);
        }
    }
    else static if (LDC_with_ARM64)
    {
        // Get current rounding mode.
        uint fpscr = arm_get_fpcr();

        // One conversion instruction per rounding mode. Each sequence uses d2 as
        // scratch, hence the clobber.
        switch(fpscr & _MM_ROUND_MASK_ARM)
        {
            default:
            case _MM_ROUND_NEAREST_ARM:
                return __asm!long(`ldr d2, $1
                                   fcvtns $0,d2`, "=r,m,~{d2}", value);
            case _MM_ROUND_DOWN_ARM:
                return __asm!long(`ldr d2, $1
                                   fcvtms $0,d2`, "=r,m,~{d2}", value);
            case _MM_ROUND_UP_ARM:
                return __asm!long(`ldr d2, $1
                                   fcvtps $0,d2`, "=r,m,~{d2}", value);
            case _MM_ROUND_TOWARD_ZERO_ARM:
                return cast(long)value;
        }
    }
    // 64-bit can use an SSE instruction
    else version(D_InlineAsm_X86_64)
    {
        long result;
        version(LDC) // work-around for "Data definition directives inside inline asm are not supported yet."
        {
            asm pure nothrow @nogc @trusted
            {
                movsd XMM0, value;
                cvtsd2si RAX, XMM0;
                mov result, RAX;
            }
        }
        else
        {
            asm pure nothrow @nogc @trusted
            {
                movsd XMM0, value;
                db 0xf2; db 0x48; db 0x0f; db 0x2d; db 0xc0; // cvtsd2si RAX, XMM0 (DMD refuses to emit)
                mov result, RAX;
            }
        }
        return result;
    }
    else version(D_InlineAsm_X86)
    {
        // In the case of 32-bit x86 there is no SSE2 way to convert FP to 64-bit int
        // This leads to an unfortunate FPU sequence in every C++ compiler.
        // See: https://godbolt.org/z/vZym77

        // Get current MXCSR rounding
        uint sseRounding;
        ushort savedFPUCW;
        ushort newFPUCW;
        long result;
        asm pure nothrow @nogc @trusted
        {
            stmxcsr sseRounding;
            fld value;
            fnstcw savedFPUCW;
            mov AX, savedFPUCW;
            and AX, 0xf3ff;          // clear FPU rounding bits
            movzx ECX, word ptr sseRounding;
            and ECX, 0x6000;         // only keep SSE rounding bits
            shr ECX, 3;
            or AX, CX;               // make a new control word for FPU with SSE bits
            mov newFPUCW, AX;
            fldcw newFPUCW;
            fistp qword ptr result;  // explicit 64-bit store, same form as the float variant
            fldcw savedFPUCW;
        }
        return result;
    }
    else static if (GDC_with_x86)
    {
        version(X86_64)
        {
            static assert(GDC_with_SSE2);
            __m128d A;
            A.ptr[0] = value;
            return __builtin_ia32_cvtsd2si64 (A);
        }
        else
        {
            // This is untested!
            // Same x87 sequence as the DMD 32-bit branch, in GCC extended-asm syntax.
            uint sseRounding;
            ushort savedFPUCW;
            ushort newFPUCW;
            long result;
            asm pure nothrow @nogc @trusted
            {
                "stmxcsr %1;\n" ~
                "fld %2;\n" ~
                "fnstcw %3;\n" ~
                "movw %3, %%ax;\n" ~
                "andw $0xf3ff, %%ax;\n" ~
                "movzwl %1, %%ecx;\n" ~
                "andl $0x6000, %%ecx;\n" ~
                "shrl $3, %%ecx;\n" ~
                "orw %%cx, %%ax\n" ~
                "movw %%ax, %4;\n" ~
                "fldcw %4;\n" ~
                "fistpll %0;\n" ~
                "fldcw %3;\n"         
                  : "=m"(result)    // %0
                  : "m" (sseRounding),
                    "t" (value),
                    "m" (savedFPUCW),
                    "m" (newFPUCW) 
                  : "eax", "ecx", "st";
            }
            return result;
        }
    }
    else
        static assert(false);
}
608 
609 //
610 //  </ROUNDING>
611 //
612 
613 
614 // using the Intel terminology here
615 
/// Clamps a signed 16-bit value into the signed 8-bit range [-128, 127].
byte saturateSignedWordToSignedByte(short value) pure @safe
{
    if (value >= byte.max)
        return byte.max;
    if (value <= byte.min)
        return byte.min;
    return cast(byte) value;
}
622 
/// Clamps a signed 16-bit value into the unsigned 8-bit range [0, 255].
ubyte saturateSignedWordToUnsignedByte(short value) pure @safe
{
    if (value <= ubyte.min)
        return ubyte.min;
    if (value >= ubyte.max)
        return ubyte.max;
    return cast(ubyte) value;
}
629 
/// Clamps a signed 32-bit value into the signed 16-bit range [-32768, 32767].
short saturateSignedIntToSignedShort(int value) pure @safe
{
    if (value >= short.max)
        return short.max;
    if (value <= short.min)
        return short.min;
    return cast(short) value;
}
636 
/// Clamps a signed 32-bit value into the unsigned 16-bit range [0, 65535].
ushort saturateSignedIntToUnsignedShort(int value) pure @safe
{
    if (value <= ushort.min)
        return ushort.min;
    if (value >= ushort.max)
        return ushort.max;
    return cast(ushort) value;
}
643 
unittest // test saturate operations
{
    // 16-bit input, signed 8-bit output
    assert(saturateSignedWordToSignedByte(-4000) == -128);
    assert(saturateSignedWordToSignedByte(32000) == 127);

    // 16-bit input, unsigned 8-bit output
    assert(saturateSignedWordToUnsignedByte(-4000) == 0);
    assert(saturateSignedWordToUnsignedByte(32000) == 255);

    // 32-bit input, signed 16-bit output
    assert(saturateSignedIntToSignedShort(-32769) == -32768);
    assert(saturateSignedIntToSignedShort(32768) == 32767);

    // 32-bit input, unsigned 16-bit output (32768 is in range and must pass through)
    assert(saturateSignedIntToUnsignedShort(-32769) == 0);
    assert(saturateSignedIntToUnsignedShort(32768) == 32768);
}
655 
version(unittest)
{
    // Debug-only helpers: each one prints the lanes of a vector on one line of stdout.
    import core.stdc.stdio: printf;

    // printing vectors for implementation
    // Note: you can override `pure` within a `debug` clause

    /// Prints the single 64-bit lane of `v`.
    void _mm_print_pi64(__m64 v) @trusted
    {
        long[1] lanes = (cast(long1)v).array;
        printf("%lld\n", lanes[0]);
    }

    /// Prints `v` as two 32-bit integers.
    void _mm_print_pi32(__m64 v) @trusted
    {
        int2 lanes = cast(int2)v;
        printf("%d %d\n", lanes.array[0], lanes.array[1]);
    }

    /// Prints `v` as four 16-bit integers.
    void _mm_print_pi16(__m64 v) @trusted
    {
        short4 lanes = cast(short4)v;
        printf("%d %d %d %d\n",
               lanes.array[0], lanes.array[1], lanes.array[2], lanes.array[3]);
    }

    /// Prints `v` as eight 8-bit integers.
    void _mm_print_pi8(__m64 v) @trusted
    {
        byte8 lanes = cast(byte8)v;
        printf("%d %d %d %d %d %d %d %d\n",
               lanes.array[0], lanes.array[1], lanes.array[2], lanes.array[3],
               lanes.array[4], lanes.array[5], lanes.array[6], lanes.array[7]);
    }

    /// Prints `v` as two 64-bit integers.
    void _mm_print_epi64(__m128i v) @trusted
    {
        long[2] lanes = (cast(long2)v).array;
        printf("%lld %lld\n", lanes[0], lanes[1]);
    }

    /// Prints `v` as four 32-bit integers.
    void _mm_print_epi32(__m128i v) @trusted
    {
        int[4] lanes = v.array;
        printf("%d %d %d %d\n", lanes[0], lanes[1], lanes[2], lanes[3]);
    }

    /// Prints `v` as eight 16-bit integers.
    void _mm_print_epi16(__m128i v) @trusted
    {
        short8 lanes = cast(short8)v;
        printf("%d %d %d %d %d %d %d %d\n",
               lanes.array[0], lanes.array[1], lanes.array[2], lanes.array[3],
               lanes.array[4], lanes.array[5], lanes.array[6], lanes.array[7]);
    }

    /// Prints `v` as sixteen 8-bit integers.
    void _mm_print_epi8(__m128i v) @trusted
    {
        byte16 lanes = cast(byte16)v;
        printf("%d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d\n",
               lanes.array[0], lanes.array[1], lanes.array[2], lanes.array[3],
               lanes.array[4], lanes.array[5], lanes.array[6], lanes.array[7],
               lanes.array[8], lanes.array[9], lanes.array[10], lanes.array[11],
               lanes.array[12], lanes.array[13], lanes.array[14], lanes.array[15]);
    }

    /// Prints `v` as four single-precision floats.
    void _mm_print_ps(__m128 v) @trusted
    {
        float4 lanes = cast(float4)v;
        printf("%f %f %f %f\n",
               lanes.array[0], lanes.array[1], lanes.array[2], lanes.array[3]);
    }

    /// Prints `v` as two double-precision floats.
    void _mm_print_pd(__m128d v) @trusted
    {
        double2 lanes = cast(double2)v;
        printf("%f %f\n", lanes.array[0], lanes.array[1]);
    }
}
727 
728 
729 //
730 //  <FLOATING-POINT COMPARISONS>
731 //
732 // Note: `ldc.simd` cannot express all nuances of FP comparisons, so we
733 //       need different IR generation.
734 
/// Floating-point comparison predicates. "Ordered" predicates are false when either
/// operand is NaN; "unordered" predicates are true when either operand is NaN.
/// The member names double as the predicate spelling pasted after `fcmp` in inline IR.
enum FPComparison
{
    oeq,   // ordered and equal
    ogt,   // ordered and greater than
    oge,   // ordered and greater than or equal
    olt,   // ordered and less than
    ole,   // ordered and less than or equal
    one,   // ordered and not equal
    ord,   // ordered (no nans)
    ueq,   // unordered or equal
    ugt,   // unordered or greater than ("nle")
    uge,   // unordered or greater than or equal ("nlt")
    ult,   // unordered or less than ("nge")
    ule,   // unordered or less than or equal ("ngt")
    une,   // unordered or not equal ("neq")
    uno,   // unordered (either nans)
}

// Predicate name for each FPComparison value, indexed by the enum value.
// Built from the member names themselves, in declaration order.
private static immutable string[FPComparison.max+1] FPComparisonToString =
    [__traits(allMembers, FPComparison)];
770 
// Scalar float comparison: evaluates one FPComparison predicate on a pair of values
// and returns the outcome as a bool (callers turn it into a -1/0 or 1/0 mask).
// Useful for DMD and testing
private bool compareFloat(T)(FPComparison comparison, T a, T b) pure @safe
{
    import std.math : isNaN;

    immutable bool eitherIsNaN = isNaN(a) || isNaN(b);

    final switch(comparison) with(FPComparison)
    {
        // Ordered predicates: the built-in relational operators are already false on NaN.
        case oeq: return a == b;
        case ogt: return a > b;
        case oge: return a >= b;
        case olt: return a < b;
        case ole: return a <= b;
        case one: return (a < b) || (a > b); // strictly different, never true on NaN
        case ord: return !eitherIsNaN;

        // Unordered predicates: negating the opposite ordered test makes NaN yield true.
        case ueq: return eitherIsNaN || (a == b);
        case ugt: return !(a <= b);
        case uge: return !(a < b);
        case ult: return !(a >= b);
        case ule: return !(a > b);
        case une: return (a != b); // NaN with != always yields true
        case uno: return eitherIsNaN;
    }
}
795 
// LDC implementation: each comparison is emitted as inline LLVM IR built around an
// `fcmp` instruction whose predicate name comes from FPComparisonToString.
796 version(LDC)
797 {
798     /// Provides packed float comparisons
    /// Each of the 4 lanes is compared and the i1 result sign-extended to i32
    /// (-1 when the predicate holds, 0 otherwise).
799     package int4 cmpps(FPComparison comparison)(float4 a, float4 b) pure @safe
800     {
801         enum ir = `
802             %cmp = fcmp `~ FPComparisonToString[comparison] ~` <4 x float> %0, %1
803             %r = sext <4 x i1> %cmp to <4 x i32>
804             ret <4 x i32> %r`;
805 
806         return LDCInlineIR!(ir, int4, float4, float4)(a, b);
807     }
808 
809     /// Provides packed double comparisons
    /// Each of the 2 lanes is compared and the i1 result sign-extended to i64
    /// (-1 when the predicate holds, 0 otherwise).
810     package long2 cmppd(FPComparison comparison)(double2 a, double2 b) pure @safe
811     {
812         enum ir = `
813             %cmp = fcmp `~ FPComparisonToString[comparison] ~` <2 x double> %0, %1
814             %r = sext <2 x i1> %cmp to <2 x i64>
815             ret <2 x i64> %r`;
816 
817         return LDCInlineIR!(ir, long2, double2, double2)(a, b);
818     }
819 
820     /// CMPSS-style comparisons
821     /// clang implement it through x86 intrinsics, it is possible with IR alone
822     /// but leads to less optimal code.
823     /// PERF: try to implement it with __builtin_ia32_cmpss and immediate 0 to 7. 
824     /// Not that simple.
    /// Only lane 0 is compared; its all-ones/all-zeros result is written to lane 0 of
    /// a copy of `a`, so the upper lanes of the result are those of `a`.
825     package float4 cmpss(FPComparison comparison)(float4 a, float4 b) pure @safe
826     {
        // Disabled experiment kept for reference (it names the cmpsd builtin, not cmpss).
827         /*
828         enum ubyte predicateNumber = FPComparisonToX86Predicate[comparison];
829         enum bool invertOp = (predicateNumber & 0x80) != 0;
830         static if(invertOp)
831             return __builtin_ia32_cmpsd(b, a, predicateNumber & 0x7f);
832         else
833             return __builtin_ia32_cmpsd(a, b, predicateNumber & 0x7f);
834         */
835         enum ir = `
836             %cmp = fcmp `~ FPComparisonToString[comparison] ~` float %0, %1
837             %r = sext i1 %cmp to i32
838             %r2 = bitcast i32 %r to float
839             ret float %r2`;
840 
841         float4 r = a;
842         r[0] = LDCInlineIR!(ir, float, float, float)(a[0], b[0]);
843         return r;
844     }
845 
846     /// CMPSD-style comparisons
847     /// clang implement it through x86 intrinsics, it is possible with IR alone
848     /// but leads to less optimal code.
849     /// PERF: try to implement it with __builtin_ia32_cmpsd and immediate 0 to 7. 
850     /// Not that simple.    
    /// Only lane 0 is compared; its all-ones/all-zeros result is written to lane 0 of
    /// a copy of `a`, so the upper lane of the result is that of `a`.
851     package double2 cmpsd(FPComparison comparison)(double2 a, double2 b) pure @safe
852     {
853         enum ir = `
854             %cmp = fcmp `~ FPComparisonToString[comparison] ~` double %0, %1
855             %r = sext i1 %cmp to i64
856             %r2 = bitcast i64 %r to double
857             ret double %r2`;
858 
859         double2 r = a;
860         r[0] = LDCInlineIR!(ir, double, double, double)(a[0], b[0]);
861         return r;
862     }
863 
864     // Note: ucomss and ucomsd are left unimplemented
    // Compares lane 0 of `a` and `b`; the i1 result is zero-extended, so this returns 1 or 0.
865     package int comss(FPComparison comparison)(float4 a, float4 b) pure @safe
866     {
867         enum ir = `
868             %cmp = fcmp `~ FPComparisonToString[comparison] ~` float %0, %1
869             %r = zext i1 %cmp to i32
870             ret i32 %r`;
871 
872         return LDCInlineIR!(ir, int, float, float)(a[0], b[0]);
873     }
874 
875     // Note: ucomss and ucomsd are left unimplemented
    // Compares lane 0 of `a` and `b`; the i1 result is zero-extended, so this returns 1 or 0.
876     package int comsd(FPComparison comparison)(double2 a, double2 b) pure @safe
877     {
878         enum ir = `
879             %cmp = fcmp `~ FPComparisonToString[comparison] ~` double %0, %1
880             %r = zext i1 %cmp to i32
881             ret i32 %r`;
882 
883         return LDCInlineIR!(ir, int, double, double)(a[0], b[0]);
884     }
885 }
else
{
    // Portable fallback for compilers other than LDC: every comparison goes through
    // the scalar compareFloat helper, lane by lane.

    /// Provides packed float comparisons
    /// Each lane of the result is -1 when the predicate holds for that lane, else 0.
    package int4 cmpps(FPComparison comparison)(float4 a, float4 b) pure @trusted
    {
        int4 mask;
        for (int lane = 0; lane < 4; ++lane)
        {
            immutable bool holds = compareFloat!float(comparison, a.array[lane], b.array[lane]);
            mask.ptr[lane] = holds ? -1 : 0;
        }
        return mask;
    }

    /// Provides packed double comparisons
    /// Each lane of the result is -1 when the predicate holds for that lane, else 0.
    package long2 cmppd(FPComparison comparison)(double2 a, double2 b) pure @trusted
    {
        long2 mask;
        for (int lane = 0; lane < 2; ++lane)
        {
            immutable bool holds = compareFloat!double(comparison, a.array[lane], b.array[lane]);
            mask.ptr[lane] = holds ? -1 : 0;
        }
        return mask;
    }

    /// Provides CMPSS-style comparison
    /// Lane 0 receives the all-ones/all-zeros mask; the upper lanes are copied from `a`.
    package float4 cmpss(FPComparison comparison)(float4 a, float4 b) pure @trusted
    {
        immutable bool holds = compareFloat!float(comparison, a.array[0], b.array[0]);
        int4 bits = cast(int4)a;
        if (holds)
            bits.ptr[0] = -1;
        else
            bits.ptr[0] = 0;
        return cast(float4)bits;
    }

    /// Provides CMPSD-style comparison
    /// Lane 0 receives the all-ones/all-zeros mask; the upper lane is copied from `a`.
    package double2 cmpsd(FPComparison comparison)(double2 a, double2 b) pure @trusted
    {
        immutable bool holds = compareFloat!double(comparison, a.array[0], b.array[0]);
        long2 bits = cast(long2)a;
        if (holds)
            bits.ptr[0] = -1;
        else
            bits.ptr[0] = 0;
        return cast(double2)bits;
    }

    /// Compares lane 0 of `a` and `b`; returns 1 when the predicate holds, else 0.
    package int comss(FPComparison comparison)(float4 a, float4 b) pure @safe
    {
        if (compareFloat!float(comparison, a.array[0], b.array[0]))
            return 1;
        return 0;
    }

    // Note: ucomss and ucomsd are left unimplemented
    /// Compares lane 0 of `a` and `b`; returns 1 when the predicate holds, else 0.
    package int comsd(FPComparison comparison)(double2 a, double2 b) pure @safe
    {
        if (compareFloat!double(comparison, a.array[0], b.array[0]))
            return 1;
        return 0;
    }
}
unittest // cmpps
{
    // Runs `cmpps` for one predicate and checks all four lanes of the mask.
    void check(FPComparison comparison)(float4 lhs, float4 rhs, int m0, int m1, int m2, int m3)
    {
        int4 mask = cmpps!comparison(lhs, rhs);
        immutable int[4] correct = [m0, m1, m2, m3];
        assert(mask.array == correct);
    }

    // Check that every comparison type is working.
    // Lanes 0..2 are less / equal / greater; lane 3 compares NaN with a number.
    float4 A = [1, 3, 5, float.nan];
    float4 B = [2, 3, 4, 5];

    check!(FPComparison.oeq)(A, B,  0,-1, 0, 0);
    check!(FPComparison.ogt)(A, B,  0, 0,-1, 0);
    check!(FPComparison.oge)(A, B,  0,-1,-1, 0);
    check!(FPComparison.olt)(A, B, -1, 0, 0, 0);
    check!(FPComparison.ole)(A, B, -1,-1, 0, 0);
    check!(FPComparison.one)(A, B, -1, 0,-1, 0);
    check!(FPComparison.ord)(A, B, -1,-1,-1, 0);
    check!(FPComparison.ueq)(A, B,  0,-1, 0,-1);
    check!(FPComparison.ugt)(A, B,  0, 0,-1,-1);
    check!(FPComparison.uge)(A, B,  0,-1,-1,-1);
    check!(FPComparison.ult)(A, B, -1, 0, 0,-1);
    check!(FPComparison.ule)(A, B, -1,-1, 0,-1);
    check!(FPComparison.une)(A, B, -1, 0,-1,-1);
    check!(FPComparison.uno)(A, B,  0, 0, 0,-1);
}
unittest // cmppd
{
    double2 lhs = [1, 3];
    double2 rhs = [2, 3];
    long2 mask = cmppd!(FPComparison.ult)(lhs, rhs);

    // Lane 0 (1 vs 2) must be all ones, lane 1 (3 vs 3) must be zero.
    assert(mask.array[0] == cast(long)(-1));
    assert(mask.array[1] == 0);
}
unittest // cmpss and comss
{
    // For one predicate: cmpss must put the mask in lane 0 and copy lanes 1..3
    // from the first operand; comss must be non-zero exactly when the mask is set.
    void testComparison(FPComparison comparison)(float4 A, float4 B)
    {
        immutable bool holds = compareFloat!float(comparison, A.array[0], B.array[0]);
        immutable int expected = holds ? -1 : 0;

        float4 result = cmpss!comparison(A, B);
        int4 iresult = cast(int4)result;
        assert(iresult.array[0] == expected);
        foreach(lane; 1..4)
        {
            assert(result.array[lane] == A.array[lane]);
        }

        // check comss
        int comResult = comss!comparison(A, B);
        assert( (comResult != 0) == holds );
    }

    // Runs one predicate against both second operands, in that order.
    void testBoth(FPComparison comparison)(float4 lhs, float4 first, float4 second)
    {
        testComparison!comparison(lhs, first);
        testComparison!comparison(lhs, second);
    }

    // Check that every comparison type is working.
    // B has a regular number in lane 0, C has NaN there.
    float4 A = [1, 3, 5, 6];
    float4 B = [2, 3, 4, 5];
    float4 C = [float.nan, 3, 4, 5];

    testBoth!(FPComparison.oeq)(A, B, C);
    testBoth!(FPComparison.ogt)(A, B, C);
    testBoth!(FPComparison.oge)(A, B, C);
    testBoth!(FPComparison.olt)(A, B, C);
    testBoth!(FPComparison.ole)(A, B, C);
    testBoth!(FPComparison.one)(A, B, C);
    testBoth!(FPComparison.ord)(A, B, C);
    testBoth!(FPComparison.ueq)(A, B, C);
    testBoth!(FPComparison.ugt)(A, B, C);
    testBoth!(FPComparison.uge)(A, B, C);
    testBoth!(FPComparison.ult)(A, B, C);
    testBoth!(FPComparison.ule)(A, B, C);
    testBoth!(FPComparison.une)(A, B, C);
    testBoth!(FPComparison.uno)(A, B, C);
}
unittest // cmpsd and comsd
{
    // For one predicate: cmpsd must put the mask in lane 0 and copy lane 1
    // from the first operand; comsd must be non-zero exactly when the mask is set.
    void testComparison(FPComparison comparison)(double2 A, double2 B)
    {
        immutable bool holds = compareFloat!double(comparison, A.array[0], B.array[0]);
        immutable long expected = holds ? -1 : 0;

        double2 result = cmpsd!comparison(A, B);
        long2 iresult = cast(long2)result;
        assert(iresult.array[0] == expected);
        assert(result.array[1] == A.array[1]);

        // check comsd
        int comResult = comsd!comparison(A, B);
        assert( (comResult != 0) == holds );
    }

    // Runs one predicate against both second operands, in that order.
    void testBoth(FPComparison comparison)(double2 lhs, double2 first, double2 second)
    {
        testComparison!comparison(lhs, first);
        testComparison!comparison(lhs, second);
    }

    // Check that every comparison type is working.
    // B has a regular number in lane 0, C has NaN there.
    double2 A = [1, 3];
    double2 B = [2, 4];
    double2 C = [double.nan, 5];

    testBoth!(FPComparison.oeq)(A, B, C);
    testBoth!(FPComparison.ogt)(A, B, C);
    testBoth!(FPComparison.oge)(A, B, C);
    testBoth!(FPComparison.olt)(A, B, C);
    testBoth!(FPComparison.ole)(A, B, C);
    testBoth!(FPComparison.one)(A, B, C);
    testBoth!(FPComparison.ord)(A, B, C);
    testBoth!(FPComparison.ueq)(A, B, C);
    testBoth!(FPComparison.ugt)(A, B, C);
    testBoth!(FPComparison.uge)(A, B, C);
    testBoth!(FPComparison.ult)(A, B, C);
    testBoth!(FPComparison.ule)(A, B, C);
    testBoth!(FPComparison.une)(A, B, C);
    testBoth!(FPComparison.uno)(A, B, C);
}
1096 
1097 //
1098 //  </FLOATING-POINT COMPARISONS>
1099 //
1100 
1101 
/// Narrows a 128-bit integer vector to 64 bits: the result's single lane is
/// lane 0 of `a` viewed as two 64-bit integers.
__m64 to_m64(__m128i a) pure @trusted
{
    long2 wide = cast(long2)a;
    long1 narrow;
    narrow.ptr[0] = wide.array[0];
    return narrow;
}
1109 
/// Widens a 64-bit vector to 128 bits: the 64-bit lane 0 receives the value
/// of `a`, the 64-bit lane 1 is zero.
__m128i to_m128i(__m64 a) pure @trusted
{
    long2 wide = [0, 0];
    wide.ptr[0] = a.array[0];
    return cast(__m128i)wide;
}
1116 
// SOME NEON INTRINSICS
// Emulating some x86 intrinsics requires access to a range of ARM intrinsics.
// They are not part of the public API, although the simde project exposes all of them to its users.
// MAYDO: create a new neon.d module, for internal use only.
// MAYDO: port them to ARM32 so that ARM32 can be as fast as ARM64.
1122 static if (LDC_with_ARM64)
1123 {
1124     // VERY USEFUL LINK
1125     // https://github.com/ldc-developers/llvm-project/blob/ldc-release/11.x/llvm/include/llvm/IR/IntrinsicsAArch64.td
1126 
1127     pragma(LDC_intrinsic, "llvm.aarch64.neon.addp.v8i8")
1128         byte8 vpadd_u8(byte8 a, byte8 b) pure @safe;
1129 
1130     byte8 vand_u8(byte8 a, byte8 b) pure @safe
1131     {
1132         return a & b;
1133     }
1134 
1135     int4 vcombine_s32(int2 lo, int2 hi) pure @trusted
1136     {
1137         int4 r;
1138         r.ptr[0] = lo.array[0];
1139         r.ptr[1] = lo.array[1];
1140         r.ptr[2] = hi.array[0];
1141         r.ptr[3] = hi.array[1];
1142         return r;
1143     }
1144 
1145     byte16 vcombine_s8(byte8 lo, byte8 hi) pure @trusted
1146     {
1147         byte16 r;
1148         r.ptr[0]  = lo.array[0];
1149         r.ptr[1]  = lo.array[1];
1150         r.ptr[2]  = lo.array[2];
1151         r.ptr[3]  = lo.array[3];
1152         r.ptr[4]  = lo.array[4];
1153         r.ptr[5]  = lo.array[5];
1154         r.ptr[6]  = lo.array[6];
1155         r.ptr[7]  = lo.array[7];
1156         r.ptr[8]  = hi.array[0];
1157         r.ptr[9]  = hi.array[1];
1158         r.ptr[10] = hi.array[2];
1159         r.ptr[11] = hi.array[3];
1160         r.ptr[12] = hi.array[4];
1161         r.ptr[13] = hi.array[5];
1162         r.ptr[14] = hi.array[6];
1163         r.ptr[15] = hi.array[7];
1164         return r;
1165     }
1166 
1167     pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtms.v4i32.v4f32")
1168         int4 vcvtmq_s32_f32(float4 a) pure @safe;
1169 
1170     pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtns.v4i32.v4f32")
1171         int4 vcvtnq_s32_f32(float4 a) pure @safe;
1172 
1173     pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtps.v4i32.v4f32")
1174         int4 vcvtpq_s32_f32(float4 a) pure @safe;
1175 
1176     pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtzs.v4i32.v4f32")
1177         int4 vcvtzq_s32_f32(float4 a) pure @safe;
1178 
1179     short4 vget_high_s16(short8 a) pure @trusted
1180     {
1181         short4 r;
1182         r.ptr[0] = a.array[4];
1183         r.ptr[1] = a.array[5];
1184         r.ptr[2] = a.array[6];
1185         r.ptr[3] = a.array[7];
1186         return r;
1187     }
1188 
1189     int2 vget_high_s32(int4 a) pure @trusted
1190     {
1191         int2 r;
1192         r.ptr[0] = a.array[2];
1193         r.ptr[1] = a.array[3];
1194         return r;
1195     }
1196 
1197     byte8 vget_high_u8(byte16 a) pure @trusted
1198     {
1199         byte8 r;
1200         r.ptr[0] = a.array[8];
1201         r.ptr[1] = a.array[9];
1202         r.ptr[2] = a.array[10];
1203         r.ptr[3] = a.array[11];
1204         r.ptr[4] = a.array[12];
1205         r.ptr[5] = a.array[13];
1206         r.ptr[6] = a.array[14];
1207         r.ptr[7] = a.array[15];
1208         return r;
1209     }
1210 
1211     short4 vget_low_s16(short8 a) pure @trusted
1212     {
1213         short4 r;
1214         r.ptr[0] = a.array[0];
1215         r.ptr[1] = a.array[1];
1216         r.ptr[2] = a.array[2];
1217         r.ptr[3] = a.array[3];
1218         return r;
1219     } 
1220 
1221     int2 vget_low_s32(int4 a) pure @trusted
1222     {
1223         int2 r;
1224         r.ptr[0] = a.array[0];
1225         r.ptr[1] = a.array[1];
1226         return r;
1227     }
1228 
1229     byte8 vget_low_u8(byte16 a) pure @trusted
1230     {
1231         byte8 r;
1232         r.ptr[0] = a.array[0];
1233         r.ptr[1] = a.array[1];
1234         r.ptr[2] = a.array[2];
1235         r.ptr[3] = a.array[3];
1236         r.ptr[4] = a.array[4];
1237         r.ptr[5] = a.array[5];
1238         r.ptr[6] = a.array[6];
1239         r.ptr[7] = a.array[7];
1240         return r;
1241     }
1242 
1243     pragma(LDC_intrinsic, "llvm.aarch64.neon.smax.v8i16")
1244         short8 vmaxq_s16(short8 a, short8 b) pure @safe;
1245 
1246     pragma(LDC_intrinsic, "llvm.aarch64.neon.smin.v8i16")
1247         short8 vminq_s16(short8 a, short8 b) pure @safe;
1248 
1249     int4 vmull_s16(short4 a, short4 b) pure @trusted
1250     {
1251         int4 r;
1252         r.ptr[0] = a.array[0] * b.array[0];
1253         r.ptr[1] = a.array[1] * b.array[1];
1254         r.ptr[2] = a.array[2] * b.array[2];
1255         r.ptr[3] = a.array[3] * b.array[3];
1256         return r;
1257     }
1258 
1259     static if(__VERSION__ >= 2088) // LDC 1.18 start using LLVM9 who changes the name of the builtin
1260     {
1261         pragma(LDC_intrinsic, "llvm.aarch64.neon.faddp.v4f32")
1262             float4 vpaddq_f32(float4 a, float4 b) pure @safe;
1263     }
1264     else
1265     {
1266         pragma(LDC_intrinsic, "llvm.aarch64.neon.addp.v4f32")
1267             float4 vpaddq_f32(float4 a, float4 b) pure @safe;
1268     }
1269 
1270     pragma(LDC_intrinsic, "llvm.aarch64.neon.addp.v2i32")
1271         int2 vpadd_s32(int2 a, int2 b) pure @safe;
1272 
1273     pragma(LDC_intrinsic, "llvm.aarch64.neon.addp.v16i8")
1274         byte16 vpaddq_s8(byte16 a, byte16 b) pure @safe;
1275 
1276     pragma(LDC_intrinsic, "llvm.aarch64.neon.sqxtn.v8i8")
1277         byte8 vqmovn_s16(short8 a) pure @safe;
1278 
1279     pragma(LDC_intrinsic, "llvm.aarch64.neon.sqxtun.v8i8")
1280         byte8 vqmovun_s16(short8 a) pure @safe;
1281 
1282     pragma(LDC_intrinsic, "llvm.aarch64.neon.urhadd.v16i8")
1283         byte16 vrhadd_u8(byte16 a, byte16 b) pure @safe;
1284 
1285     pragma(LDC_intrinsic, "llvm.aarch64.neon.urhadd.v8i16")
1286         short8 vrhadd_u16(short8 a, short8 b) pure @safe;
1287 
1288     byte8 vshr_u8(byte8 a, byte8 b) pure @safe
1289     {
1290         return a >>> b;
1291     }
1292 }
1293